From ba9393ea71ff8007908fe0f37860e85fdde1cab4 Mon Sep 17 00:00:00 2001
From: David Butcher <dbutcher@arm.com>
Date: Thu, 1 Mar 2012 12:06:36 +0000
Subject: [PATCH] Initial Release

---
 .gitattributes                            |    2 +
 .gitignore                                |   22 +
 Android.mk                                |   97 ++
 LICENSE                                   |  202 ++++
 Makefile                                  |   43 +
 NOTICE                                    |    4 +
 README.txt                                |   42 +
 ReleaseNote.txt                           |  151 +++
 USAGE.txt                                 |  100 ++
 cleanall.sh                               |   38 +
 doxy.conf                                 | 1673 +++++++++++++++++++++++++++++
 getlog.sh                                 |   94 ++
 headers/NE10_random.h                     |  201 ++++
 headers/NE10header.s                      |   27 +
 headers/factor.h                          |  724 +++++++++++++
 headers/macros.h                          |  223 ++++
 headers/unit_test_abs_operation_x.h       |  224 ++++
 headers/unit_test_common.h                |  179 +++
 headers/unit_test_len_operation_x.h       |  226 ++++
 headers/unit_test_mla_operation_x.h       |  242 +++++
 headers/unit_test_mlac_operation_x.h      |  267 +++++
 headers/unit_test_normalize_operation_x.h |  227 ++++
 headers/unit_test_setc_operation_x.h      |  245 +++++
 headers/unit_test_x_operation_x.h         |  233 ++++
 headers/unit_test_xc_operation_x.h        |  254 +++++
 headers/versionheader.h                   |   31 +
 headers/versionheader.s                   |   33 +
 inc/NE10.h                                |  495 +++++++++
 inc/NE10_asm.h                            |  204 ++++
 inc/NE10_c.h                              |  202 ++++
 inc/NE10_neon.h                           |  204 ++++
 inc/NE10_types.h                          |   94 ++
 nightly.pl                                |   90 ++
 projectfile                               |   15 +
 removetabs.sh                             |   55 +
 review.sh                                 |   45 +
 runperf.sh                                |   66 ++
 source/NE10_abs.asm.s                     |   61 ++
 source/NE10_abs.c                         |   65 ++
 source/NE10_abs.neon.s                    |  419 ++++++++
 source/NE10_abs_test.c                    |   73 ++
 source/NE10_add.asm.s                     |   61 ++
 source/NE10_add.c                         |   32 +
 source/NE10_add.neon.c                    |   35 +
 source/NE10_add_test.c                    |   45 +
 source/NE10_addc.asm.s                    |  234 ++++
 source/NE10_addc.c                        |   62 ++
 source/NE10_addc.neon.c                   |   68 ++
 source/NE10_addc_test.c                   |   57 +
 source/NE10_div.asm.s                     |   61 ++
 source/NE10_div.c                         |   32 +
 source/NE10_div.neon.c                    |   46 +
 source/NE10_div_test.c                    |   45 +
 source/NE10_divc.asm.s                    |  233 ++++
 source/NE10_divc.c                        |   62 ++
 source/NE10_divc.neon.c                   |  116 ++
 source/NE10_divc_test.c                   |   57 +
 source/NE10_len.asm.s                     |  139 +++
 source/NE10_len.c                         |   56 +
 source/NE10_len.neon.s                    |  354 ++++++
 source/NE10_len_test.c                    |   53 +
 source/NE10_mla.asm.s                     |   67 ++
 source/NE10_mla.c                         |   32 +
 source/NE10_mla.neon.c                    |   35 +
 source/NE10_mla_test.c                    |   45 +
 source/NE10_mlac.asm.s                    |  259 +++++
 source/NE10_mlac.c                        |   62 ++
 source/NE10_mlac.neon.c                   |   68 ++
 source/NE10_mlac_test.c                   |   57 +
 source/NE10_mul.asm.s                     |   61 ++
 source/NE10_mul.c                         |   62 ++
 source/NE10_mul.neon.s                    |  470 ++++++++
 source/NE10_mul_test.c                    |   73 ++
 source/NE10_mulc.asm.s                    |  233 ++++
 source/NE10_mulc.c                        |   62 ++
 source/NE10_mulc.neon.c                   |   68 ++
 source/NE10_mulc_test.c                   |   57 +
 source/NE10_normalize.asm.s               |  149 +++
 source/NE10_normalize.c                   |   74 ++
 source/NE10_normalize.neon.s              |  397 +++++++
 source/NE10_normalize_test.c              |   66 ++
 source/NE10_rsbc.asm.s                    |  234 ++++
 source/NE10_rsbc.c                        |   62 ++
 source/NE10_rsbc.neon.c                   |   68 ++
 source/NE10_rsbc_test.c                   |   75 ++
 source/NE10_setc.asm.s                    |  178 +++
 source/NE10_setc.c                        |   62 ++
 source/NE10_setc.neon.c                   |   64 ++
 source/NE10_setc_test.c                   |   75 ++
 source/NE10_sub.asm.s                     |   61 ++
 source/NE10_sub.c                         |   32 +
 source/NE10_sub.neon.c                    |   35 +
 source/NE10_sub_test.c                    |   51 +
 source/NE10_subc.asm.s                    |  233 ++++
 source/NE10_subc.c                        |   62 ++
 source/NE10_subc.neon.c                   |   68 ++
 source/NE10_subc_test.c                   |   75 ++
 97 files changed, 13572 insertions(+)
 create mode 100644 .gitattributes
 create mode 100644 .gitignore
 create mode 100644 Android.mk
 create mode 100644 LICENSE
 create mode 100644 Makefile
 create mode 100644 NOTICE
 create mode 100644 README.txt
 create mode 100644 ReleaseNote.txt
 create mode 100644 USAGE.txt
 create mode 100755 cleanall.sh
 create mode 100644 doxy.conf
 create mode 100755 getlog.sh
 create mode 100644 headers/NE10_random.h
 create mode 100644 headers/NE10header.s
 create mode 100644 headers/factor.h
 create mode 100644 headers/macros.h
 create mode 100644 headers/unit_test_abs_operation_x.h
 create mode 100644 headers/unit_test_common.h
 create mode 100644 headers/unit_test_len_operation_x.h
 create mode 100644 headers/unit_test_mla_operation_x.h
 create mode 100644 headers/unit_test_mlac_operation_x.h
 create mode 100644 headers/unit_test_normalize_operation_x.h
 create mode 100644 headers/unit_test_setc_operation_x.h
 create mode 100644 headers/unit_test_x_operation_x.h
 create mode 100644 headers/unit_test_xc_operation_x.h
 create mode 100644 headers/versionheader.h
 create mode 100644 headers/versionheader.s
 create mode 100644 inc/NE10.h
 create mode 100644 inc/NE10_asm.h
 create mode 100644 inc/NE10_c.h
 create mode 100644 inc/NE10_neon.h
 create mode 100644 inc/NE10_types.h
 create mode 100755 nightly.pl
 create mode 100644 projectfile
 create mode 100755 removetabs.sh
 create mode 100755 review.sh
 create mode 100755 runperf.sh
 create mode 100644 source/NE10_abs.asm.s
 create mode 100644 source/NE10_abs.c
 create mode 100644 source/NE10_abs.neon.s
 create mode 100644 source/NE10_abs_test.c
 create mode 100644 source/NE10_add.asm.s
 create mode 100644 source/NE10_add.c
 create mode 100644 source/NE10_add.neon.c
 create mode 100644 source/NE10_add_test.c
 create mode 100644 source/NE10_addc.asm.s
 create mode 100644 source/NE10_addc.c
 create mode 100644 source/NE10_addc.neon.c
 create mode 100644 source/NE10_addc_test.c
 create mode 100644 source/NE10_div.asm.s
 create mode 100644 source/NE10_div.c
 create mode 100644 source/NE10_div.neon.c
 create mode 100644 source/NE10_div_test.c
 create mode 100644 source/NE10_divc.asm.s
 create mode 100644 source/NE10_divc.c
 create mode 100644 source/NE10_divc.neon.c
 create mode 100644 source/NE10_divc_test.c
 create mode 100644 source/NE10_len.asm.s
 create mode 100644 source/NE10_len.c
 create mode 100644 source/NE10_len.neon.s
 create mode 100644 source/NE10_len_test.c
 create mode 100644 source/NE10_mla.asm.s
 create mode 100644 source/NE10_mla.c
 create mode 100644 source/NE10_mla.neon.c
 create mode 100644 source/NE10_mla_test.c
 create mode 100644 source/NE10_mlac.asm.s
 create mode 100644 source/NE10_mlac.c
 create mode 100644 source/NE10_mlac.neon.c
 create mode 100644 source/NE10_mlac_test.c
 create mode 100644 source/NE10_mul.asm.s
 create mode 100644 source/NE10_mul.c
 create mode 100644 source/NE10_mul.neon.s
 create mode 100644 source/NE10_mul_test.c
 create mode 100644 source/NE10_mulc.asm.s
 create mode 100644 source/NE10_mulc.c
 create mode 100644 source/NE10_mulc.neon.c
 create mode 100644 source/NE10_mulc_test.c
 create mode 100644 source/NE10_normalize.asm.s
 create mode 100644 source/NE10_normalize.c
 create mode 100644 source/NE10_normalize.neon.s
 create mode 100644 source/NE10_normalize_test.c
 create mode 100644 source/NE10_rsbc.asm.s
 create mode 100644 source/NE10_rsbc.c
 create mode 100644 source/NE10_rsbc.neon.c
 create mode 100644 source/NE10_rsbc_test.c
 create mode 100644 source/NE10_setc.asm.s
 create mode 100644 source/NE10_setc.c
 create mode 100644 source/NE10_setc.neon.c
 create mode 100644 source/NE10_setc_test.c
 create mode 100644 source/NE10_sub.asm.s
 create mode 100644 source/NE10_sub.c
 create mode 100644 source/NE10_sub.neon.c
 create mode 100644 source/NE10_sub_test.c
 create mode 100644 source/NE10_subc.asm.s
 create mode 100644 source/NE10_subc.c
 create mode 100644 source/NE10_subc.neon.c
 create mode 100644 source/NE10_subc_test.c

diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..59befaa
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,2 @@
+temp/* -diff
+
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..2a4fc68
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,22 @@
+# ignore executables
+*.ex
+
+# ignore binary/compiled files
+*.[oa]
+
+# ignore temporary files
+*~
+
+# ignore vim files
+.*.swp
+
+# ignore output text files
+testlog.txt
+res_*.txt
+
+# Release files
+release_*
+release_*/*
+NE10_*.tgz
+docs/*
+doc/*
diff --git a/Android.mk b/Android.mk
new file mode 100644
index 0000000..a252399
--- /dev/null
+++ b/Android.mk
@@ -0,0 +1,97 @@
+LOCAL_PATH:= $(call my-dir)
+
+ne10_neon_source := \
+    source/NE10_setc.neon.c \
+    source/NE10_mulc.neon.c \
+    source/NE10_mla.neon.c \
+    source/NE10_subc.neon.c \
+    source/NE10_addc.neon.c \
+    source/NE10_normalize.neon.s \
+    source/NE10_mlac.neon.c \
+    source/NE10_abs.neon.s \
+    source/NE10_div.neon.c \
+    source/NE10_add.neon.c \
+    source/NE10_divc.neon.c \
+    source/NE10_mul.neon.s \
+    source/NE10_len.neon.s \
+    source/NE10_sub.neon.c \
+    source/NE10_rsbc.neon.c \
+
+ne10_source_files := \
+    source/NE10_subc.c \
+    source/NE10_add.asm.s \
+    source/NE10_rsbc.asm.s \
+    source/NE10_addc.c \
+    source/NE10_setc.c \
+    source/NE10_subc.asm.s \
+    source/NE10_rsbc.c \
+    source/NE10_mla.asm.s \
+    source/NE10_mlac.c \
+    source/NE10_setc.asm.s \
+    source/NE10_mul.asm.s \
+    source/NE10_addc.asm.s \
+    source/NE10_mul.c \
+    source/NE10_mulc.c \
+    source/NE10_mulc.asm.s \
+    source/NE10_mla.c \
+    source/NE10_mlac.asm.s \
+    source/NE10_div.asm.s \
+    source/NE10_div.c \
+    source/NE10_normalize.asm.s \
+    source/NE10_len.c \
+    source/NE10_len.asm.s \
+    source/NE10_abs.asm.s \
+    source/NE10_sub.c \
+    source/NE10_abs.c \
+    source/NE10_add.c \
+    source/NE10_divc.asm.s \
+    source/NE10_divc.c \
+    source/NE10_sub.asm.s \
+    source/NE10_normalize.c 
+
+include $(CLEAR_VARS)
+
+LOCAL_C_INCLUDES :=     $(LOCAL_PATH)/headers/ \
+                        $(LOCAL_PATH)/inc 
+
+LOCAL_SRC_FILES :=  \
+    $(ne10_source_files)
+
+ifeq ($(ARCH_ARM_HAVE_NEON),true)
+LOCAL_SRC_FILES += $(ne10_neon_source)
+endif
+
+LOCAL_CFLAGS := -D_ARM_ASSEM_
+
+LOCAL_ARM_MODE := arm
+
+LOCAL_MODULE_TAGS := eng
+LOCAL_MODULE := libne10
+
+include $(BUILD_STATIC_LIBRARY)
+
+include $(CLEAR_VARS)
+
+LOCAL_CPP_EXTENSION := .cc
+
+LOCAL_CFLAGS := -D_ARM_ASSEM_
+
+LOCAL_ARM_MODE := arm
+
+LOCAL_C_INCLUDES :=     $(LOCAL_PATH)/headers/ \
+                        $(LOCAL_PATH)/inc 
+
+LOCAL_SRC_FILES :=  \
+    $(ne10_source_files)
+
+ifeq ($(ARCH_ARM_HAVE_NEON),true)
+LOCAL_SRC_FILES += $(ne10_neon_source)
+endif
+
+LOCAL_MODULE_TAGS := eng
+LOCAL_MODULE := libne10
+
+include $(BUILD_SHARED_LIBRARY)
+
+
+
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..57bc88a
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,202 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..4629871
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,43 @@
+
+C_TOOL    = gcc
+EXE_TOOL    = gcc
+ASM_TOOL    = as
+
+#BJ_FLAGS   = -mthumb-interwork -march=armv7-a -mcpu=cortex-a9 -mfpu=vfp3
+ARM_FLAGS   = -mthumb-interwork -march=armv7-a -mcpu=cortex-a9
+C_FLAGS     = -lm -lrt -I./inc/
+#DEBUG_FLAGS = -gstabs
+OPTIMIZE_FLAGS = -O3
+# -save-temps -O3
+
+LDFLAGS+=-L.  -L/usr/local/lib -L/client/lib -L/lib/arm-linux-gnueabi
+LDFLAGS+=-lm
+
+#TARGET_ARCH = stdc
+
+.PHONY: all clean
+
+all : NE10_addc.test_r.ex
+
+clean:
+	./cleanall.sh
+
+%.test_r.ex : %.asm_r.o %.c_r.o %.neon_r.o ./source/%_test.c  ./inc/NE10.h
+		$(EXE_TOOL) $(OPTIMIZE_FLAGS) $(ARM_FLAGS) $^ -o $@ $(C_FLAGS) -L/lib/arm-linux-gnueabi
+ 
+%.c_r.o : ./source/%.c ./inc/NE10.h
+		$(C_TOOL) $(OPTIMIZE_FLAGS) $(ARM_FLAGS) -mfpu=vfp3 -c $< -o $@ $(C_FLAGS) -L/lib/arm-linux-gnueabi 
+
+%.asm_r.o : ./source/%.asm.s
+		$(ASM_TOOL) $(ARM_FLAGS) -mfpu=vfp3 $< -o $@
+
+# Either use the C version or use the Assembly version for compiling the NEON routines
+
+# Rules for the Assembly version
+%.neon_r.o : ./source/%.neon.s
+		$(ASM_TOOL) $(ARM_FLAGS) -mfpu=neon $< -o $@
+
+# Rules for the C version
+%.neon_r.o : ./source/%.neon.c ./inc/NE10.h
+		$(C_TOOL) $(OPTIMIZE_FLAGS) $(ARM_FLAGS) -mfpu=neon -c $< -o $@ $(C_FLAGS)
+
diff --git a/NOTICE b/NOTICE
new file mode 100644
index 0000000..f092387
--- /dev/null
+++ b/NOTICE
@@ -0,0 +1,4 @@
+NE10 Library
+Copyright 2011-12 ARM Limited
+
+This product was produced by ARM Limited.
diff --git a/README.txt b/README.txt
new file mode 100644
index 0000000..25afad4
--- /dev/null
+++ b/README.txt
@@ -0,0 +1,42 @@
+Copyright 2011-12 ARM Limited
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+
+NE10
+====
+
+The objective of this library is to provide a set of common, useful functions
+that have been heavily optimised for ARM, provide consistent, well-tested
+behaviour, and can be easily incorporated into applications.
+
+The primary API provided is C. The primary OS targeted is Android, although
+the library is tested with Linaro Linux.
+
+The design is intended to be available both as a simple 'drop and go' pre-built
+library and as a set of modular functions that can be incorporated in a
+'pick and mix' form where binary size might be an issue.
+
+Future releases are intended to expand on the functions provided and possibly
+the supported languages (C++ being near the top of that list).
+
+Licensed under the Apache License, Version 2.0
+
+(See LICENSE for details)
+
+Usage
+=====
+
+See USAGE.txt file
+
+
diff --git a/ReleaseNote.txt b/ReleaseNote.txt
new file mode 100644
index 0000000..d10501c
--- /dev/null
+++ b/ReleaseNote.txt
@@ -0,0 +1,151 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : ReleaseNote.txt
+ */
+NE10 SIMD LIBRARY - Release Note
+LAST UPDATED ON: 22 / DEC / 2011
+
+
+
+========
+Contents
+========
+
+   1. Preface
+         1-a. License
+         1-b. Product status
+   2. Release details
+         2-a. Product release status
+         2-b. Functions included
+         2-c. Test cases and results
+   3. Installation
+         3-a. Requirements
+         3-b. Alternative Approach
+
+
+
+==========
+1. Preface
+==========
+
+1-a. License details
+--------------------
+NE10 is an open source project, free software released under the Apache License,
+Version 2.0 (ALv2). It is free, and the 'source code' behind the program is
+available. See the file LICENSE for the full text of the ALv2.
+
+
+1-b. Product status
+-------------------
+This is the first publicly available version of NE10. This open source project
+is actively under development and more functions as well as improved versions of
+the available functions will be contributed to the source code.
+
+
+
+==================
+2. Release details
+==================
+
+2-a. Product release status
+---------------------------
+Version 1.0
+
+2-b. Functions included
+---------------------------
+NE10 is a software library that provides Linux and Android support for Single
+Instruction Multiple Data (SIMD) functionality. In this release, a number of
+mathematical functions (mainly vector and scalar operations) have been
+implemented for the ARM v7 instruction set architecture as well as ARM NEON
+SIMD architecture extensions.
+
+This library has been developed and tested on the following processors:
+
+  1) ARM Cortex-A9 with NEON extension
+  2) ARM Cortex-A8 with NEON extension
+
+The following is a list of currently available functions.
+
+  a) Vector-Constant Arithmetic
+
+   addc_float, addc_vec2f, addc_vec3f, addc_vec4f,
+   subc_float, subc_vec2f, subc_vec3f, subc_vec4f,
+   rsbc_float, rsbc_vec2f, rsbc_vec3f, rsbc_vec4f,
+   mulc_float, mulc_vec2f, mulc_vec3f, mulc_vec4f,
+   divc_float, divc_vec2f, divc_vec3f, divc_vec4f,
+   setc_float, setc_vec2f, setc_vec3f, setc_vec4f,
+   mlac_float, mlac_vec2f, mlac_vec3f, mlac_vec4f
+
+  b) Arithmetic functions over arrays of cst values:
+
+   add_float, sub_float, mul_float, div_float, mla_float, abs_float
+
+  c) Operations on Vectors:
+
+   len_vec2f, len_vec3f, len_vec4f
+   normalize_vec2f, normalize_vec3f, normalize_vec4f
+
+
+2-c. Test cases and results
+---------------------------
+The provided functions are categorized according to the operations that they
+perform.  Functions in each of these categories accept different types of input
+data. Each set is accompanied with a unit test. These unit tests are provided
+as part of this library and can be used to verify and benchmark these functions
+on a target platform.
+
+
+
+===============
+3. Installation
+===============
+
+3-a. Requirements
+-----------------
+This release has been built and tested on the following host environments:
+
+ 1) ARM Versatile Express / Linux linaro 2.6.38-1003
+ 2) BeagleBoard RevC / Linux linaro-developer 3.1.0-4
+ 3) Android AOSP Emulator / Android Open Source Project Toolchain
+
+
+The source code has been successfully built with the following toolchains:
+
+ 1) GCC v4.6.1
+ 2) Prebuilt GCC toolchain provided with ICS release of Android
+
+To build natively on ARM, run:
+
+    ./nightly.pl
+
+This will build and run a set of tests.
+
+To build as part of the Android Open Source Project, copy the release
+directory into 'external' within the source directories and build as
+normal.  This will install the libne10.so library into system/lib on the
+final Android OS image, where other applications will be able to access it in
+a similar way to other shared libraries.
+
+
+3-b. Alternative Approach
+-------------------------
+While not supported, the functions within this library can be taken and
+incorporated (licensing conflicts permitting) within other projects as is.
+Details of how to do this are too project specific to detail here.
+
+
diff --git a/USAGE.txt b/USAGE.txt
new file mode 100644
index 0000000..8ae6741
--- /dev/null
+++ b/USAGE.txt
@@ -0,0 +1,100 @@
+    /*
+     *  Copyright 2011-12 ARM Limited
+     *
+     *  Licensed under the Apache License, Version 2.0 (the "License");
+     *  you may not use this file except in compliance with the License.
+     *  You may obtain a copy of the License at
+     *
+     *      http://www.apache.org/licenses/LICENSE-2.0
+     *
+     *  Unless required by applicable law or agreed to in writing, software
+     *  distributed under the License is distributed on an "AS IS" BASIS,
+     *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     *  See the License for the specific language governing permissions and
+     *  limitations under the License.
+     */
+
+
+#Contents
+
+#NE10 Usage
+##Using NE10
+
+#NE10 Usage
+
+This file explains how to use the NE10 library.
+
+#Contents
+##Using NE10
+###General Notes
+###C Bindings
+###Future Bindings
+
+#Using NE10
+
+NE10 is implemented in a mix of C, intrinsics and assembler; however, all
+functions are exposed as C.  It can be used as a shared or static library and
+individual functions can be safely excluded from a build to reduce final
+binary size.
+
+##General Notes
+
+The type checking is relaxed, to enable compatibility with any pre-existing or
+prevailing system of types a project might have.  The debug version of the
+library will check that the ranges passed in a call conform to the API
+limitations.  The production version avoids these checks for performance reasons.
+
+It is assumed that the ranges of input arrays to be processed do not overlap.
+Clean handling of overlapping arrays is not designed for or tested.  It is
+possible for source and destination pointers to be the same, or for you to pass
+in pointers inside the same array *as long as the regions indicated by
+pointer+length do not overlap*.  Incorrect usage will typically result in an assert
+in debug builds and variable and inaccurate results in production builds.
+
+##C Bindings
+
+The C bindings (available in inc/NE10.h) aim for a balance between ease of use
+and efficiency of execution.  They are intended to be usable in C and C++ code
+or, in theory, in any other language with a well constructed mechanism for
+calling out to C code.
+
+The calls themselves are listed in inc/NE10.h.  However, depending on your
+circumstances - for example, if you know that you will only be executing code
+on platforms with NEON available - you could use the inc/NE10_neon.h include
+file and access those functions directly.
+
+Usage of all the functions is generally consistent, and function-specific
+differences are documented in the header, but here is an example as a taste:
+
+    arm_vec3f_t *destination;
+    arm_vec3f_t *source1;
+    arm_vec3f_t *source2;
+    int feedback;
+
+    /* Fill your arrays with interesting vector data.. */
+      ...
+    
+    /* Normalize the vectors in source1, returning the result in place */
+    feedback = normalize_vec3f(source1, source1);
+    if (feedback == <error code>) {
+        printf("Bad Thing happened normalizing!\n");
+    }
+    /* Multiply source1 by source2, returning the result in destination */
+    feedback = mul_vec3f(destination, source1, source2);
+    if (feedback == <error code>) {
+        printf("Bad Thing happened multiplying!\n");
+    }
+
+While the functions all return an integer value to indicate success or failure,
+in practice almost none of the functions currently implemented can 'fail' in that
+way; future functions, however, may.  Returning a status from every function
+allows for a more consistent interface across the API in the future.
+
+##Future Bindings
+
+We hope to add C++ bindings at a later date, based on feedback on the most
+appropriate way to provide that sort of API.  Other languages will be
+considered, however the priority will be to improve the scope and performance
+of functions provided under the existing bindings.
+
+
diff --git a/cleanall.sh b/cleanall.sh
new file mode 100755
index 0000000..fb72c51
--- /dev/null
+++ b/cleanall.sh
@@ -0,0 +1,38 @@
+#!/bin/sh
+#
+#  Copyright 2011-12 ARM Limited
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+#
+# NE10 Library : cleanall.sh
+#
+# Deletes build outputs, test logs, editor swap files and packaged releases
+# from the current directory; unless CLS=0, it then clears and lists it.
+#
+
+PRODUCT_NAME=NE10
+
+# -f: unmatched patterns are not errors; -exec: no word splitting on names.
+rm -f -- *.ex *.a *.o res_*.txt .*.swp .exp.tmp testlog.txt
+find . -maxdepth 1 -type d -name "${PRODUCT_NAME}_*" -exec rm -rf {} +
+find . -maxdepth 1 -type f -name "${PRODUCT_NAME}_*.tgz" -exec rm -f {} +
+if [ "$CLS" != "0" ]; then
+ clear
+ echo
+ ls -la --color=auto
+ echo
+fi
+echo
+
diff --git a/doxy.conf b/doxy.conf
new file mode 100644
index 0000000..90f4cd9
--- /dev/null
+++ b/doxy.conf
@@ -0,0 +1,1673 @@
+# Doxyfile 1.7.3
+
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project.
+#
+# All text after a hash (#) is considered a comment and will be ignored.
+# The format is:
+#       TAG = value [value, ...]
+# For lists items can also be appended using:
+#       TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (" ").
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+
+# This tag specifies the encoding used for all characters in the config file
+# that follow. The default is UTF-8 which is also the encoding used for all
+# text before the first occurrence of this tag. Doxygen uses libiconv (or the
+# iconv built into libc) for the transcoding. See
+# http://www.gnu.org/software/libiconv for the list of possible encodings.
+
+DOXYFILE_ENCODING      = UTF-8
+
+# The PROJECT_NAME tag is a single word (or a sequence of words surrounded
+# by quotes) that should identify the project.
+
+PROJECT_NAME           = "NE10 SIMD LIBRARY"
+
+# The PROJECT_NUMBER tag can be used to enter a project or revision number.
+# This could be handy for archiving the generated documentation or
+# if some version control system is used.
+
+PROJECT_NUMBER         =
+
+# Using the PROJECT_BRIEF tag one can provide an optional one line description for a project that appears at the top of each page and should give viewer a quick idea about the purpose of the project. Keep the description short.
+
+PROJECT_BRIEF          = "SIMD FUNCTION LIBRARY FOR LINUX AND ANDROID"
+
+# With the PROJECT_LOGO tag one can specify a logo or icon that is
+# included in the documentation. The maximum height of the logo should not
+# exceed 55 pixels and the maximum width should not exceed 200 pixels.
+# Doxygen will copy the logo to the output directory.
+
+PROJECT_LOGO           = NE10
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
+# base path where the generated documentation will be put.
+# If a relative path is entered, it will be relative to the location
+# where doxygen was started. If left blank the current directory will be used.
+
+OUTPUT_DIRECTORY       = ./docs
+
+# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create
+# 4096 sub-directories (in 2 levels) under the output directory of each output
+# format and will distribute the generated files over these directories.
+# Enabling this option can be useful when feeding doxygen a huge amount of
+# source files, where putting all generated files in the same directory would
+# otherwise cause performance problems for the file system.
+
+CREATE_SUBDIRS         = NO
+
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all constant output in the proper language.
+# The default language is English, other supported languages are:
+# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional,
+# Croatian, Czech, Danish, Dutch, Esperanto, Farsi, Finnish, French, German,
+# Greek, Hungarian, Italian, Japanese, Japanese-en (Japanese with English
+# messages), Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian,
+# Polish, Portuguese, Romanian, Russian, Serbian, Serbian-Cyrillic, Slovak,
+# Slovene, Spanish, Swedish, Ukrainian, and Vietnamese.
+
+OUTPUT_LANGUAGE        = English
+
+# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will
+# include brief member descriptions after the members that are listed in
+# the file and class documentation (similar to JavaDoc).
+# Set to NO to disable this.
+
+BRIEF_MEMBER_DESC      = YES
+
+# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend
+# the brief description of a member or function before the detailed description.
+# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
+# brief descriptions will be completely suppressed.
+
+REPEAT_BRIEF           = YES
+
+# This tag implements a quasi-intelligent brief description abbreviator
+# that is used to form the text in various listings. Each string
+# in this list, if found as the leading text of the brief description, will be
+# stripped from the text and the result after processing the whole list, is
+# used as the annotated text. Otherwise, the brief description is used as-is.
+# If left blank, the following values are used ("$name" is automatically
+# replaced with the name of the entity): "The $name class" "The $name widget"
+# "The $name file" "is" "provides" "specifies" "contains"
+# "represents" "a" "an" "the"
+
+ABBREVIATE_BRIEF       =
+
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
+# Doxygen will generate a detailed section even if there is only a brief
+# description.
+
+ALWAYS_DETAILED_SEC    = NO
+
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
+# inherited members of a class in the documentation of that class as if those
+# members were ordinary class members. Constructors, destructors and assignment
+# operators of the base classes will not be shown.
+
+INLINE_INHERITED_MEMB  = NO
+
+# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full
+# path before files name in the file list and in the header files. If set
+# to NO the shortest path that makes the file name unique will be used.
+
+FULL_PATH_NAMES        = YES
+
+# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag
+# can be used to strip a user-defined part of the path. Stripping is
+# only done if one of the specified strings matches the left-hand part of
+# the path. The tag can be used to show relative paths in the file list.
+# If left blank the directory from which doxygen is run is used as the
+# path to strip.
+
+STRIP_FROM_PATH        =
+
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of
+# the path mentioned in the documentation of a class, which tells
+# the reader which header file to include in order to use a class.
+# If left blank only the name of the header file containing the class
+# definition is used. Otherwise one should specify the include paths that
+# are normally passed to the compiler using the -I flag.
+
+STRIP_FROM_INC_PATH    =
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter
+# (but less readable) file names. This can be useful if your file system
+# doesn't support long names like on DOS, Mac, or CD-ROM.
+
+SHORT_NAMES            = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen
+# will interpret the first line (until the first dot) of a JavaDoc-style
+# comment as the brief description. If set to NO, the JavaDoc
+# comments will behave just like regular Qt-style comments
+# (thus requiring an explicit @brief command for a brief description.)
+
+JAVADOC_AUTOBRIEF      = NO
+
+# If the QT_AUTOBRIEF tag is set to YES then Doxygen will
+# interpret the first line (until the first dot) of a Qt-style
+# comment as the brief description. If set to NO, the comments
+# will behave just like regular Qt-style comments (thus requiring
+# an explicit \brief command for a brief description.)
+
+QT_AUTOBRIEF           = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen
+# treat a multi-line C++ special comment block (i.e. a block of //! or ///
+# comments) as a brief description. This used to be the default behaviour.
+# The new default is to treat a multi-line C++ comment block as a detailed
+# description. Set this tag to YES if you prefer the old behaviour instead.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented
+# member inherits the documentation from any documented member that it
+# re-implements.
+
+INHERIT_DOCS           = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce
+# a new page for each member. If set to NO, the documentation of a member will
+# be part of the file/class/namespace that contains it.
+
+SEPARATE_MEMBER_PAGES  = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab.
+# Doxygen uses this value to replace tabs by spaces in code fragments.
+
+TAB_SIZE               = 8
+
+# This tag can be used to specify a number of aliases that acts
+# as commands in the documentation. An alias has the form "name=value".
+# For example adding "sideeffect=\par Side Effects:\n" will allow you to
+# put the command \sideeffect (or @sideeffect) in the documentation, which
+# will result in a user-defined paragraph with heading "Side Effects:".
+# You can put \n's in the value part of an alias to insert newlines.
+
+ALIASES                =
+
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C
+# sources only. Doxygen will then generate output that is more tailored for C.
+# For instance, some of the names that are used will be different. The list
+# of all members will be omitted, etc.
+
+OPTIMIZE_OUTPUT_FOR_C  = NO
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java
+# sources only. Doxygen will then generate output that is more tailored for
+# Java. For instance, namespaces will be presented as packages, qualified
+# scopes will look different, etc.
+
+OPTIMIZE_OUTPUT_JAVA   = NO
+
+# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
+# sources only. Doxygen will then generate output that is more tailored for
+# Fortran.
+
+OPTIMIZE_FOR_FORTRAN   = NO
+
+# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
+# sources. Doxygen will then generate output that is tailored for
+# VHDL.
+
+OPTIMIZE_OUTPUT_VHDL   = NO
+
+# Doxygen selects the parser to use depending on the extension of the files it
+# parses. With this tag you can assign which parser to use for a given extension.
+# Doxygen has a built-in mapping, but you can override or extend it using this
+# tag. The format is ext=language, where ext is a file extension, and language
+# is one of the parsers supported by doxygen: IDL, Java, Javascript, CSharp, C,
+# C++, D, PHP, Objective-C, Python, Fortran, VHDL, C, C++. For instance to make
+# doxygen treat .inc files as Fortran files (default is PHP), and .f files as C
+# (default is Fortran), use: inc=Fortran f=C. Note that for custom extensions
+# you also need to set FILE_PATTERNS otherwise the files are not read by doxygen.
+
+EXTENSION_MAPPING      =
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
+# to include (a tag file for) the STL sources as input, then you should
+# set this tag to YES in order to let doxygen match functions declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string); v.s.
+# func(std::string) {}). This also makes the inheritance and collaboration
+# diagrams that involve STL classes more complete and accurate.
+
+BUILTIN_STL_SUPPORT    = NO
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
+# enable parsing support.
+
+CPP_CLI_SUPPORT        = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only.
+# Doxygen will parse them like normal C++ but will assume all classes use public
+# instead of private inheritance when no explicit protection keyword is present.
+
+SIP_SUPPORT            = NO
+
+# For Microsoft's IDL there are propget and propput attributes to indicate getter
+# and setter methods for a property. Setting this option to YES (the default)
+# will make doxygen replace the get and set methods by a property in the
+# documentation. This will only work if the methods are indeed getting or
+# setting a simple type. If this is not the case, or you want to show the
+# methods anyway, you should set this option to NO.
+
+IDL_PROPERTY_SUPPORT   = YES
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
+# tag is set to YES, then doxygen will reuse the documentation of the first
+# member in the group (if any) for the other members of the group. By default
+# all members of a group must be documented explicitly.
+
+DISTRIBUTE_GROUP_DOC   = NO
+
+# Set the SUBGROUPING tag to YES (the default) to allow class member groups of
+# the same type (for instance a group of public functions) to be put as a
+# subgroup of that type (e.g. under the Public Functions section). Set it to
+# NO to prevent subgrouping. Alternatively, this can be done per class using
+# the \nosubgrouping command.
+
+SUBGROUPING            = YES
+
+# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum
+# is documented as struct, union, or enum with the name of the typedef. So
+# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
+# with name TypeT. When disabled the typedef will appear as a member of a file,
+# namespace, or class. And the struct will be named TypeS. This can typically
+# be useful for C code in case the coding convention dictates that all compound
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+
+TYPEDEF_HIDES_STRUCT   = NO
+
+# The SYMBOL_CACHE_SIZE determines the size of the internal cache used to
+# determine which symbols to keep in memory and which to flush to disk.
+# When the cache is full, less often used symbols will be written to disk.
+# For small to medium size projects (<1000 input files) the default value is
+# probably good enough. For larger projects a too small cache size can cause
+# doxygen to be busy swapping symbols to and from disk most of the time
+# causing a significant performance penalty.
+# If the system has enough physical memory increasing the cache will improve the
+# performance by keeping more symbols in memory. Note that the value works on
+# a logarithmic scale so increasing the size by one will roughly double the
+# memory usage. The cache size is given by this formula:
+# 2^(16+SYMBOL_CACHE_SIZE). The valid range is 0..9, the default is 0,
+# corresponding to a cache size of 2^16 = 65536 symbols
+
+SYMBOL_CACHE_SIZE      = 0
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in
+# documentation are documented, even if no documentation was available.
+# Private class members and static file members will be hidden unless
+# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES
+
+EXTRACT_ALL            = NO
+
+# If the EXTRACT_PRIVATE tag is set to YES all private members of a class
+# will be included in the documentation.
+
+EXTRACT_PRIVATE        = NO
+
+# If the EXTRACT_STATIC tag is set to YES all static members of a file
+# will be included in the documentation.
+
+EXTRACT_STATIC         = NO
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs)
+# defined locally in source files will be included in the documentation.
+# If set to NO only classes defined in header files are included.
+
+EXTRACT_LOCAL_CLASSES  = YES
+
+# This flag is only useful for Objective-C code. When set to YES local
+# methods, which are defined in the implementation section but not in
+# the interface are included in the documentation.
+# If set to NO (the default) only methods in the interface are included.
+
+EXTRACT_LOCAL_METHODS  = NO
+
+# If this flag is set to YES, the members of anonymous namespaces will be
+# extracted and appear in the documentation as a namespace called
+# 'anonymous_namespace{file}', where file will be replaced with the base
+# name of the file that contains the anonymous namespace. By default
+# anonymous namespaces are hidden.
+
+EXTRACT_ANON_NSPACES   = NO
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all
+# undocumented members of documented classes, files or namespaces.
+# If set to NO (the default) these members will be included in the
+# various overviews, but no documentation section is generated.
+# This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_MEMBERS     = NO
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all
+# undocumented classes that are normally visible in the class hierarchy.
+# If set to NO (the default) these classes will be included in the various
+# overviews. This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_CLASSES     = NO
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all
+# friend (class|struct|union) declarations.
+# If set to NO (the default) these declarations will be included in the
+# documentation.
+
+HIDE_FRIEND_COMPOUNDS  = NO
+
+# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any
+# documentation blocks found inside the body of a function.
+# If set to NO (the default) these blocks will be appended to the
+# function's detailed documentation block.
+
+HIDE_IN_BODY_DOCS      = NO
+
+# The INTERNAL_DOCS tag determines if documentation
+# that is typed after a \internal command is included. If the tag is set
+# to NO (the default) then the documentation will be excluded.
+# Set it to YES to include the internal documentation.
+
+INTERNAL_DOCS          = NO
+
+# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate
+# file names in lower-case letters. If set to YES upper-case letters are also
+# allowed. This is useful if you have classes or files whose names only differ
+# in case and if your file system supports case sensitive file names. Windows
+# and Mac users are advised to set this option to NO.
+
+CASE_SENSE_NAMES       = YES
+
+# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen
+# will show members with their full class and namespace scopes in the
+# documentation. If set to YES the scope will be hidden.
+
+HIDE_SCOPE_NAMES       = NO
+
+# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen
+# will put a list of the files that are included by a file in the documentation
+# of that file.
+
+SHOW_INCLUDE_FILES     = YES
+
+# If the FORCE_LOCAL_INCLUDES tag is set to YES then Doxygen
+# will list include files with double quotes in the documentation
+# rather than with sharp brackets.
+
+FORCE_LOCAL_INCLUDES   = NO
+
+# If the INLINE_INFO tag is set to YES (the default) then a tag [inline]
+# is inserted in the documentation for inline members.
+
+INLINE_INFO            = YES
+
+# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen
+# will sort the (detailed) documentation of file and class members
+# alphabetically by member name. If set to NO the members will appear in
+# declaration order.
+
+SORT_MEMBER_DOCS       = YES
+
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the
+# brief documentation of file, namespace and class members alphabetically
+# by member name. If set to NO (the default) the members will appear in
+# declaration order.
+
+SORT_BRIEF_DOCS        = NO
+
+# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen
+# will sort the (brief and detailed) documentation of class members so that
+# constructors and destructors are listed first. If set to NO (the default)
+# the constructors will appear in the respective orders defined by
+# SORT_MEMBER_DOCS and SORT_BRIEF_DOCS.
+# This tag will be ignored for brief docs if SORT_BRIEF_DOCS is set to NO
+# and ignored for detailed docs if SORT_MEMBER_DOCS is set to NO.
+
+SORT_MEMBERS_CTORS_1ST = NO
+
+# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the
+# hierarchy of group names into alphabetical order. If set to NO (the default)
+# the group names will appear in their defined order.
+
+SORT_GROUP_NAMES       = NO
+
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be
+# sorted by fully-qualified names, including namespaces. If set to
+# NO (the default), the class list will be sorted only by class name,
+# not including the namespace part.
+# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
+# Note: This option applies only to the class list, not to the
+# alphabetical list.
+
+SORT_BY_SCOPE_NAME     = NO
+
+# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper type resolution of all parameters of a function it will reject a
+# match between the prototype and the implementation of a member function even if there is only one candidate or it is obvious which candidate to choose by doing a simple string match. By disabling STRICT_PROTO_MATCHING doxygen
+# will still accept a match between prototype and implementation in such cases.
+
+STRICT_PROTO_MATCHING  = NO
+
+# The GENERATE_TESTLIST tag can be used to enable (YES) or
+# disable (NO) the test list. This list is created by putting \test
+# commands in the documentation.
+
+GENERATE_TESTLIST      = YES
+
+# The GENERATE_BUGLIST tag can be used to enable (YES) or
+# disable (NO) the bug list. This list is created by putting \bug
+# commands in the documentation.
+
+GENERATE_BUGLIST       = YES
+
+# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or
+# disable (NO) the deprecated list. This list is created by putting
+# \deprecated commands in the documentation.
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional
+# documentation sections, marked by \if sectionname ... \endif.
+
+ENABLED_SECTIONS       =
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines
+# the initial value of a variable or macro consists of for it to appear in
+# the documentation. If the initializer consists of more lines than specified
+# here it will be hidden. Use a value of 0 to hide initializers completely.
+# The appearance of the initializer of individual variables and macros in the
+# documentation can be controlled using \showinitializer or \hideinitializer
+# command in the documentation regardless of this setting.
+
+MAX_INITIALIZER_LINES  = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated
+# at the bottom of the documentation of classes and structs. If set to YES the
+# list will mention the files that were used to generate the documentation.
+
+SHOW_USED_FILES        = YES
+
+# If the sources in your project are distributed over multiple directories
+# then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy
+# in the documentation. The default is NO.
+
+SHOW_DIRECTORIES       = NO
+
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page.
+# This will remove the Files entry from the Quick Index and from the
+# Folder Tree View (if specified). The default is YES.
+
+SHOW_FILES             = YES
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the
+# Namespaces page.
+# This will remove the Namespaces entry from the Quick Index
+# and from the Folder Tree View (if specified). The default is YES.
+
+SHOW_NAMESPACES        = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from
+# the version control system). Doxygen will invoke the program by executing (via
+# popen()) the command <command> <input-file>, where <command> is the value of
+# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file
+# provided by doxygen. Whatever the program writes to standard output
+# is used as the file version. See the manual for examples.
+
+FILE_VERSION_FILTER    =
+
+# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
+# by doxygen. The layout file controls the global structure of the generated
+# output files in an output format independent way. To create the layout file
+# that represents doxygen's defaults, run doxygen with the -l option.
+# You can optionally specify a file name after the option, if omitted
+# DoxygenLayout.xml will be used as the name of the layout file.
+
+LAYOUT_FILE            =
+
+#---------------------------------------------------------------------------
+# configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated
+# by doxygen. Possible values are YES and NO. If left blank NO is used.
+
+QUIET                  = NO
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated by doxygen. Possible values are YES and NO. If left blank
+# NO is used.
+
+WARNINGS               = YES
+
+# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings
+# for undocumented members. If EXTRACT_ALL is set to YES then this flag will
+# automatically be disabled.
+
+WARN_IF_UNDOCUMENTED   = YES
+
+# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for
+# potential errors in the documentation, such as not documenting some
+# parameters in a documented function, or documenting parameters that
+# don't exist or using markup commands wrongly.
+
+WARN_IF_DOC_ERROR      = YES
+
+# The WARN_NO_PARAMDOC option can be enabled to get warnings for
+# functions that are documented, but have no documentation for their parameters
+# or return value. If set to NO (the default) doxygen will only warn about
+# wrong or incomplete parameter documentation, but not about the absence of
+# documentation.
+
+WARN_NO_PARAMDOC       = NO
+
+# The WARN_FORMAT tag determines the format of the warning messages that
+# doxygen can produce. The string should contain the $file, $line, and $text
+# tags, which will be replaced by the file and line number from which the
+# warning originated and the warning text. Optionally the format may contain
+# $version, which will be replaced by the version of the file (if it could
+# be obtained via FILE_VERSION_FILTER)
+
+WARN_FORMAT            = "$file:$line: $text"
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning
+# and error messages should be written. If left blank the output is written
+# to stderr.
+
+WARN_LOGFILE           =
+
+#---------------------------------------------------------------------------
+# configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag can be used to specify the files and/or directories that contain
+# documented source files. You may enter file names like "myfile.cpp" or
+# directories like "/usr/src/myproject". Separate the files or directories
+# with spaces.
+
+INPUT                  = ./inc/
+
+# This tag can be used to specify the character encoding of the source files
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is
+# also the default input encoding. Doxygen uses libiconv (or the iconv built
+# into libc) for the transcoding. See http://www.gnu.org/software/libiconv for
+# the list of possible encodings.
+
+INPUT_ENCODING         = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank the following patterns are tested:
+# *.c *.cc *.cxx *.cpp *.c++ *.d *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh
+# *.hxx *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.dox *.py
+# *.f90 *.f *.for *.vhd *.vhdl
+
+FILE_PATTERNS          = NE10.h
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories
+# should be searched for input files as well. Possible values are YES and NO.
+# If left blank NO is used.
+
+RECURSIVE              = YES
+
+# The EXCLUDE tag can be used to specify files and/or directories that should
+# be excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+
+EXCLUDE                = .git
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix file system feature) are excluded
+# from the input.
+
+EXCLUDE_SYMLINKS       = NO
+
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories. Note that the wildcards are matched
+# against the file with absolute path, so to exclude all test directories
+# for example use the pattern */test/*
+
+EXCLUDE_PATTERNS       =
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
+# (namespaces, classes, functions, etc.) that should be excluded from the
+# output. The symbol name can be a fully qualified name, a word, or if the
+# wildcard * is used, a substring. Examples: ANamespace, AClass,
+# AClass::ANamespace, ANamespace::*Test
+
+EXCLUDE_SYMBOLS        =
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or
+# directories that contain example code fragments that are included (see
+# the \include command).
+
+EXAMPLE_PATH           =
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank all files are included.
+
+EXAMPLE_PATTERNS       =
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude
+# commands irrespective of the value of the RECURSIVE tag.
+# Possible values are YES and NO. If left blank NO is used.
+
+EXAMPLE_RECURSIVE      = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or
+# directories that contain images that are included in the documentation (see
+# the \image command).
+
+IMAGE_PATH             =
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command <filter> <input-file>, where <filter>
+# is the value of the INPUT_FILTER tag, and <input-file> is the name of an
+# input file. Doxygen will then use the output that the filter program writes
+# to standard output.
+# If FILTER_PATTERNS is specified, this tag will be
+# ignored.
+
+INPUT_FILTER           =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis.
+# Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match.
+# The filters are a list of the form:
+# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further
+# info on how filters are used. If FILTER_PATTERNS is empty or if
+# none of the patterns match the file name, INPUT_FILTER is applied.
+
+FILTER_PATTERNS        =
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER) will be used to filter the input files when producing source
+# files to browse (i.e. when SOURCE_BROWSER is set to YES).
+
+FILTER_SOURCE_FILES    = NO
+
+# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
+# pattern. A pattern will override the setting for FILTER_PATTERNS (if any)
+# and it is also possible to disable source filtering for a specific pattern
+# using *.ext= (so without naming a filter). This option only has effect when
+# FILTER_SOURCE_FILES is enabled.
+
+FILTER_SOURCE_PATTERNS =
+
+#---------------------------------------------------------------------------
+# configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will
+# be generated. Documented entities will be cross-referenced with these sources.
+# Note: To get rid of all source code in the generated output, make sure also
+# VERBATIM_HEADERS is set to NO.
+
+SOURCE_BROWSER         = NO
+
+# Setting the INLINE_SOURCES tag to YES will include the body
+# of functions and classes directly in the documentation.
+
+INLINE_SOURCES         = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct
+# doxygen to hide any special comment blocks from generated source code
+# fragments. Normal C and C++ comments will always remain visible.
+
+STRIP_CODE_COMMENTS    = YES
+
+# If the REFERENCED_BY_RELATION tag is set to YES
+# then for each documented function all documented
+# functions referencing it will be listed.
+
+REFERENCED_BY_RELATION = NO
+
+# If the REFERENCES_RELATION tag is set to YES
+# then for each documented function all documented entities
+# called/used by that function will be listed.
+
+REFERENCES_RELATION    = NO
+
+# If the REFERENCES_LINK_SOURCE tag is set to YES (the default)
+# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from
+# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will
+# link to the source code.
+# Otherwise they will link to the documentation.
+
+REFERENCES_LINK_SOURCE = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code
+# will point to the HTML generated by the htags(1) tool instead of doxygen
+# built-in source browser. The htags tool is part of GNU's global source
+# tagging system (see http://www.gnu.org/software/global/global.html). You
+# will need version 4.8.6 or higher.
+
+USE_HTAGS              = NO
+
+# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen
+# will generate a verbatim copy of the header file for each class for
+# which an include is specified. Set to NO to disable this.
+
+VERBATIM_HEADERS       = YES
+
+#---------------------------------------------------------------------------
+# configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index
+# of all compounds will be generated. Enable this if the project
+# contains a lot of classes, structs, unions or interfaces.
+
+ALPHABETICAL_INDEX     = YES
+
+# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then
+# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns
+# in which this list will be split (can be a number in the range [1..20])
+
+COLS_IN_ALPHA_INDEX    = 5
+
+# In case all classes in a project start with a common prefix, all
+# classes will be put under the same header in the alphabetical index.
+# The IGNORE_PREFIX tag can be used to specify one or more prefixes that
+# should be ignored while generating the index headers.
+
+IGNORE_PREFIX          =
+
+#---------------------------------------------------------------------------
+# configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES (the default) Doxygen will
+# generate HTML output.
+
+GENERATE_HTML          = YES
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `html' will be used as the default path.
+
+HTML_OUTPUT            = html
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for
+# each generated HTML page (for example: .htm,.php,.asp). If it is left blank
+# doxygen will generate files with .html extension.
+
+HTML_FILE_EXTENSION    = .html
+
+# The HTML_HEADER tag can be used to specify a personal HTML header for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard header.
+
+HTML_HEADER            =
+
+# The HTML_FOOTER tag can be used to specify a personal HTML footer for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard footer.
+
+HTML_FOOTER            = copyright_notice
+
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading
+# style sheet that is used by each HTML page. It can be used to
+# fine-tune the look of the HTML output. If the tag is left blank doxygen
+# will generate a default style sheet. Note that doxygen will try to copy
+# the style sheet file to the HTML output directory, so don't put your own
+# stylesheet in the HTML output directory as well, or it will be erased!
+
+HTML_STYLESHEET        =
+
+# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output.
+# Doxygen will adjust the colors in the stylesheet and background images
+# according to this color. Hue is specified as an angle on a colorwheel,
+# see http://en.wikipedia.org/wiki/Hue for more information.
+# For instance the value 0 represents red, 60 is yellow, 120 is green,
+# 180 is cyan, 240 is blue, 300 purple, and 360 is red again.
+# The allowed range is 0 to 359.
+
+HTML_COLORSTYLE_HUE    = 220
+
+# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of
+# the colors in the HTML output. For a value of 0 the output will use
+# grayscales only. A value of 255 will produce the most vivid colors.
+
+HTML_COLORSTYLE_SAT    = 100
+
+# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to
+# the luminance component of the colors in the HTML output. Values below
+# 100 gradually make the output lighter, whereas values above 100 make
+# the output darker. The value divided by 100 is the actual gamma applied,
+# so 80 represents a gamma of 0.8, The value 220 represents a gamma of 2.2,
+# and 100 does not change the gamma.
+
+HTML_COLORSTYLE_GAMMA  = 80
+
+# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
+# page will contain the date and time when the page was generated. Setting
+# this to NO can help when comparing the output of multiple runs.
+
+HTML_TIMESTAMP         = YES
+
+# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes,
+# files or namespaces will be aligned in HTML using tables. If set to
+# NO a bullet list will be used.
+
+HTML_ALIGN_MEMBERS     = YES
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
+# documentation will contain sections that can be hidden and shown after the
+# page has loaded. For this to work a browser that supports
+# JavaScript and DHTML is required (for instance Mozilla 1.0+, Firefox,
+# Netscape 6.0+, Internet explorer 5.0+, Konqueror, or Safari).
+
+HTML_DYNAMIC_SECTIONS  = NO
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files
+# will be generated that can be used as input for Apple's Xcode 3
+# integrated development environment, introduced with OSX 10.5 (Leopard).
+# To create a documentation set, doxygen will generate a Makefile in the
+# HTML output directory. Running make will produce the docset in that
+# directory and running "make install" will install the docset in
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find
+# it at startup.
+# See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html
+# for more information.
+
+GENERATE_DOCSET        = NO
+
+# When GENERATE_DOCSET tag is set to YES, this tag determines the name of the
+# feed. A documentation feed provides an umbrella under which multiple
+# documentation sets from a single provider (such as a company or product suite)
+# can be grouped.
+
+DOCSET_FEEDNAME        = "Doxygen generated docs"
+
+# When GENERATE_DOCSET tag is set to YES, this tag specifies a string that
+# should uniquely identify the documentation set bundle. This should be a
+# reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen
+# will append .docset to the name.
+
+DOCSET_BUNDLE_ID       = org.doxygen.Project
+
+# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify
+# the documentation publisher. This should be a reverse domain-name style
+# string, e.g. com.mycompany.MyDocSet.documentation.
+
+DOCSET_PUBLISHER_ID    = org.doxygen.Publisher
+
+# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
+
+DOCSET_PUBLISHER_NAME  = Publisher
+
+# If the GENERATE_HTMLHELP tag is set to YES, additional index files
+# will be generated that can be used as input for tools like the
+# Microsoft HTML help workshop to generate a compiled HTML help file (.chm)
+# of the generated HTML documentation.
+
+GENERATE_HTMLHELP      = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can
+# be used to specify the file name of the resulting .chm file. You
+# can add a path in front of the file if the result should not be
+# written to the html output directory.
+
+CHM_FILE               =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can
+# be used to specify the location (absolute path including file name) of
+# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run
+# the HTML help compiler on the generated index.hhp.
+
+HHC_LOCATION           =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag
+# controls if a separate .chi index file is generated (YES) or that
+# it should be included in the master .chm file (NO).
+
+GENERATE_CHI           = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING
+# is used to encode HtmlHelp index (hhk), content (hhc) and project file
+# content.
+
+CHM_INDEX_ENCODING     =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag
+# controls whether a binary table of contents is generated (YES) or a
+# normal table of contents (NO) in the .chm file.
+
+BINARY_TOC             = NO
+
+# The TOC_EXPAND flag can be set to YES to add extra items for group members
+# to the contents of the HTML help documentation and to the tree view.
+
+TOC_EXPAND             = NO
+
+# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
+# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated
+# that can be used as input for Qt's qhelpgenerator to generate a
+# Qt Compressed Help (.qch) of the generated HTML documentation.
+
+GENERATE_QHP           = NO
+
+# If the QHG_LOCATION tag is specified, the QCH_FILE tag can
+# be used to specify the file name of the resulting .qch file.
+# The path specified is relative to the HTML output folder.
+
+QCH_FILE               =
+
+# The QHP_NAMESPACE tag specifies the namespace to use when generating
+# Qt Help Project output. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#namespace
+
+QHP_NAMESPACE          = org.doxygen.Project
+
+# The QHP_VIRTUAL_FOLDER tag specifies the virtual folder to use when generating
+# Qt Help Project output. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#virtual-folders
+
+QHP_VIRTUAL_FOLDER     = doc
+
+# If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to
+# add. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#custom-filters
+
+QHP_CUST_FILTER_NAME   =
+
+# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
+# custom filter to add. For more information please see
+# <a href="http://doc.trolltech.com/qthelpproject.html#custom-filters">
+# Qt Help Project / Custom Filters</a>.
+
+QHP_CUST_FILTER_ATTRS  =
+
+# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
+# project's
+# filter section matches.
+# <a href="http://doc.trolltech.com/qthelpproject.html#filter-attributes">
+# Qt Help Project / Filter Attributes</a>.
+
+QHP_SECT_FILTER_ATTRS  =
+
+# If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can
+# be used to specify the location of Qt's qhelpgenerator.
+# If non-empty doxygen will try to run qhelpgenerator on the generated
+# .qhp file.
+
+QHG_LOCATION           =
+
+# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files
+#  will be generated, which together with the HTML files, form an Eclipse help
+# plugin. To install this plugin and make it available under the help contents
+# menu in Eclipse, the contents of the directory containing the HTML and XML
+# files needs to be copied into the plugins directory of eclipse. The name of
+# the directory within the plugins directory should be the same as
+# the ECLIPSE_DOC_ID value. After copying Eclipse needs to be restarted before
+# the help appears.
+
+GENERATE_ECLIPSEHELP   = NO
+
+# A unique identifier for the eclipse help plugin. When installing the plugin
+# the directory name containing the HTML and XML files should also have
+# this name.
+
+ECLIPSE_DOC_ID         = org.doxygen.Project
+
+# The DISABLE_INDEX tag can be used to turn on/off the condensed index at
+# top of each HTML page. The value NO (the default) enables the index and
+# the value YES disables it.
+
+DISABLE_INDEX          = NO
+
+# This tag can be used to set the number of enum values (range [0,1..20])
+# that doxygen will group on one line in the generated HTML documentation.
+# Note that a value of 0 will completely suppress the enum values from appearing in the overview section.
+
+ENUM_VALUES_PER_LINE   = 4
+
+# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
+# structure should be generated to display hierarchical information.
+# If the tag value is set to YES, a side panel will be generated
+# containing a tree-like index structure (just like the one that
+# is generated for HTML Help). For this to work a browser that supports
+# JavaScript, DHTML, CSS and frames is required (i.e. any modern browser).
+# Windows users are probably better off using the HTML help feature.
+
+GENERATE_TREEVIEW      = NO
+
+# By enabling USE_INLINE_TREES, doxygen will generate the Groups, Directories,
+# and Class Hierarchy pages using a tree view instead of an ordered list.
+
+USE_INLINE_TREES       = NO
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be
+# used to set the initial width (in pixels) of the frame in which the tree
+# is shown.
+
+TREEVIEW_WIDTH         = 250
+
+# When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open
+# links to external symbols imported via tag files in a separate window.
+
+EXT_LINKS_IN_WINDOW    = NO
+
+# Use this tag to change the font size of Latex formulas included
+# as images in the HTML documentation. The default is 10. Note that
+# when you change the font size after a successful doxygen run you need
+# to manually remove any form_*.png images from the HTML output directory
+# to force them to be regenerated.
+
+FORMULA_FONTSIZE       = 10
+
+# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
+# generated for formulas are transparent PNGs. Transparent PNGs are
+# not supported properly for IE 6.0, but are supported on all modern browsers.
+# Note that when changing this option you need to delete any form_*.png files
+# in the HTML output before the changes have effect.
+
+FORMULA_TRANSPARENT    = YES
+
+# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax
+# (see http://www.mathjax.org) which uses client side Javascript for the
+# rendering instead of using prerendered bitmaps. Use this if you do not
+# have LaTeX installed or if you want the formulas to look prettier in the HTML
+# output. When enabled you also need to install MathJax separately and
+# configure the path to it using the MATHJAX_RELPATH option.
+
+USE_MATHJAX            = NO
+
+# When MathJax is enabled you need to specify the location relative to the
+# HTML output directory using the MATHJAX_RELPATH option. The destination
+# directory should contain the MathJax.js script. For instance, if the mathjax
+# directory is located at the same level as the HTML output directory, then
+# MATHJAX_RELPATH should be ../mathjax. The default value points to the mathjax.org site, so you can quickly see the result without installing
+# MathJax, but it is strongly recommended to install a local copy of MathJax
+# before deployment.
+
+MATHJAX_RELPATH        = http://www.mathjax.org/mathjax
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box
+# for the HTML output. The underlying search engine uses javascript
+# and DHTML and should work on any modern browser. Note that when using
+# HTML help (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets
+# (GENERATE_DOCSET) there is already a search function so this one should
+# typically be disabled. For large projects the javascript based search engine
+# can be slow, then enabling SERVER_BASED_SEARCH may provide a better solution.
+
+SEARCHENGINE           = YES
+
+# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
+# implemented using a PHP enabled web server instead of at the web client
+# using Javascript. Doxygen will generate the search PHP script and index
+# file to put on the web server. The advantage of the server
+# based approach is that it scales better to large projects and allows
+# full text search. The disadvantages are that it is more difficult to setup
+# and does not have live searching capabilities.
+
+SERVER_BASED_SEARCH    = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will
+# generate Latex output.
+
+GENERATE_LATEX         = NO
+
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `latex' will be used as the default path.
+
+LATEX_OUTPUT           = latex
+
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
+# invoked. If left blank `latex' will be used as the default command name.
+# Note that when enabling USE_PDFLATEX this option is only used for
+# generating bitmaps for formulas in the HTML output, but not in the
+# Makefile that is written to the output directory.
+
+LATEX_CMD_NAME         = latex
+
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to
+# generate index for LaTeX. If left blank `makeindex' will be used as the
+# default command name.
+
+MAKEINDEX_CMD_NAME     = makeindex
+
+# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact
+# LaTeX documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_LATEX          = NO
+
+# The PAPER_TYPE tag can be used to set the paper type that is used
+# by the printer. Possible values are: a4, letter, legal and
+# executive. If left blank a4wide will be used.
+
+PAPER_TYPE             = a4
+
+# The EXTRA_PACKAGES tag can be used to specify one or more names of LaTeX
+# packages that should be included in the LaTeX output.
+
+EXTRA_PACKAGES         =
+
+# The LATEX_HEADER tag can be used to specify a personal LaTeX header for
+# the generated latex document. The header should contain everything until
+# the first chapter. If it is left blank doxygen will generate a
+# standard header. Notice: only use this tag if you know what you are doing!
+
+LATEX_HEADER           =
+
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated
+# is prepared for conversion to pdf (using ps2pdf). The pdf file will
+# contain links (just like the HTML output) instead of page references
+# This makes the output suitable for online browsing using a pdf viewer.
+
+PDF_HYPERLINKS         = YES
+
+# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of
+# plain latex in the generated Makefile. Set this option to YES to get a
+# higher quality PDF documentation.
+
+USE_PDFLATEX           = YES
+
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode
+# command to the generated LaTeX files. This will instruct LaTeX to keep
+# running if errors occur, instead of asking the user for help.
+# This option is also used when generating formulas in HTML.
+
+LATEX_BATCHMODE        = NO
+
+# If LATEX_HIDE_INDICES is set to YES then doxygen will not
+# include the index chapters (such as File Index, Compound Index, etc.)
+# in the output.
+
+LATEX_HIDE_INDICES     = NO
+
+# If LATEX_SOURCE_CODE is set to YES then doxygen will include
+# source code with syntax highlighting in the LaTeX output.
+# Note that which sources are shown also depends on other settings
+# such as SOURCE_BROWSER.
+
+LATEX_SOURCE_CODE      = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the RTF output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output
+# The RTF output is optimized for Word 97 and may not look very pretty with
+# other RTF readers or editors.
+
+GENERATE_RTF           = NO
+
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `rtf' will be used as the default path.
+
+RTF_OUTPUT             = rtf
+
+# If the COMPACT_RTF tag is set to YES Doxygen generates more compact
+# RTF documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_RTF            = NO
+
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated
+# will contain hyperlink fields. The RTF file will
+# contain links (just like the HTML output) instead of page references.
+# This makes the output suitable for online browsing using WORD or other
+# programs which support those fields.
+# Note: wordpad (write) and others do not support links.
+
+RTF_HYPERLINKS         = NO
+
+# Load stylesheet definitions from file. Syntax is similar to doxygen's
+# config file, i.e. a series of assignments. You only have to provide
+# replacements, missing definitions are set to their default value.
+
+RTF_STYLESHEET_FILE    =
+
+# Set optional variables used in the generation of an rtf document.
+# Syntax is similar to doxygen's config file.
+
+RTF_EXTENSIONS_FILE    =
+
+#---------------------------------------------------------------------------
+# configuration options related to the man page output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_MAN tag is set to YES (the default) Doxygen will
+# generate man pages
+
+GENERATE_MAN           = YES
+
+# The MAN_OUTPUT tag is used to specify where the man pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `man' will be used as the default path.
+
+MAN_OUTPUT             = man
+
+# The MAN_EXTENSION tag determines the extension that is added to
+# the generated man pages (default is the subroutine's section .3)
+
+MAN_EXTENSION          = .3
+
+# If the MAN_LINKS tag is set to YES and Doxygen generates man output,
+# then it will generate one additional man file for each entity
+# documented in the real man page(s). These additional files
+# only source the real man page, but without them the man command
+# would be unable to find the correct page. The default is NO.
+
+MAN_LINKS              = YES
+
+#---------------------------------------------------------------------------
+# configuration options related to the XML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_XML tag is set to YES Doxygen will
+# generate an XML file that captures the structure of
+# the code including all documentation.
+
+GENERATE_XML           = YES
+
+# The XML_OUTPUT tag is used to specify where the XML pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `xml' will be used as the default path.
+
+XML_OUTPUT             = xml
+
+# The XML_SCHEMA tag can be used to specify an XML schema,
+# which can be used by a validating XML parser to check the
+# syntax of the XML files.
+
+XML_SCHEMA             =
+
+# The XML_DTD tag can be used to specify an XML DTD,
+# which can be used by a validating XML parser to check the
+# syntax of the XML files.
+
+XML_DTD                =
+
+# If the XML_PROGRAMLISTING tag is set to YES Doxygen will
+# dump the program listings (including syntax highlighting
+# and cross-referencing information) to the XML output. Note that
+# enabling this will significantly increase the size of the XML output.
+
+XML_PROGRAMLISTING     = YES
+
+#---------------------------------------------------------------------------
+# configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will
+# generate an AutoGen Definitions (see autogen.sf.net) file
+# that captures the structure of the code including all
+# documentation. Note that this feature is still experimental
+# and incomplete at the moment.
+
+GENERATE_AUTOGEN_DEF   = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_PERLMOD tag is set to YES Doxygen will
+# generate a Perl module file that captures the structure of
+# the code including all documentation. Note that this
+# feature is still experimental and incomplete at the
+# moment.
+
+GENERATE_PERLMOD       = NO
+
+# If the PERLMOD_LATEX tag is set to YES Doxygen will generate
+# the necessary Makefile rules, Perl scripts and LaTeX code to be able
+# to generate PDF and DVI output from the Perl module output.
+
+PERLMOD_LATEX          = NO
+
+# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be
+# nicely formatted so it can be parsed by a human reader.
+# This is useful
+# if you want to understand what is going on.
+# On the other hand, if this
+# tag is set to NO the size of the Perl module output will be much smaller
+# and Perl will parse it just the same.
+
+PERLMOD_PRETTY         = YES
+
+# The names of the make variables in the generated doxyrules.make file
+# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX.
+# This is useful so different doxyrules.make files included by the same
+# Makefile don't overwrite each other's variables.
+
+PERLMOD_MAKEVAR_PREFIX =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor
+#---------------------------------------------------------------------------
+
+# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will
+# evaluate all C-preprocessor directives found in the sources and include
+# files.
+
+ENABLE_PREPROCESSING   = YES
+
+# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro
+# names in the source code. If set to NO (the default) only conditional
+# compilation will be performed. Macro expansion can be done in a controlled
+# way by setting EXPAND_ONLY_PREDEF to YES.
+
+MACRO_EXPANSION        = NO
+
+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES
+# then the macro expansion is limited to the macros specified with the
+# PREDEFINED and EXPAND_AS_DEFINED tags.
+
+EXPAND_ONLY_PREDEF     = NO
+
+# If the SEARCH_INCLUDES tag is set to YES (the default) the include files
+# in the INCLUDE_PATH (see below) will be searched if a #include is found.
+
+SEARCH_INCLUDES        = YES
+
+# The INCLUDE_PATH tag can be used to specify one or more directories that
+# contain include files that are not input files but should be processed by
+# the preprocessor.
+
+INCLUDE_PATH           =
+
+# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
+# patterns (like *.h and *.hpp) to filter out the header-files in the
+# directories. If left blank, the patterns specified with FILE_PATTERNS will
+# be used.
+
+INCLUDE_FILE_PATTERNS  =
+
+# The PREDEFINED tag can be used to specify one or more macro names that
+# are defined before the preprocessor is started (similar to the -D option of
+# gcc). The argument of the tag is a list of macros of the form: name
+# or name=definition (no spaces). If the definition and the = are
+# omitted =1 is assumed. To prevent a macro definition from being
+# undefined via #undef or recursively expanded use the := operator
+# instead of the = operator.
+
+PREDEFINED             =
+
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then
+# this tag can be used to specify a list of macro names that should be expanded.
+# The macro definition that is found in the sources will be used.
+# Use the PREDEFINED tag if you want to use a different macro definition that overrules the definition found in the source code.
+
+EXPAND_AS_DEFINED      =
+
+# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then
+# doxygen's preprocessor will remove all references to function-like macros
+# that are alone on a line, have an all uppercase name, and do not end with a
+# semicolon, because these will confuse the parser if not removed.
+
+SKIP_FUNCTION_MACROS   = YES
+
+#---------------------------------------------------------------------------
+# Configuration options related to external references
+#---------------------------------------------------------------------------
+
+# The TAGFILES option can be used to specify one or more tagfiles.
+# Optionally an initial location of the external documentation
+# can be added for each tagfile. The format of a tag file without
+# this location is as follows:
+#
+# TAGFILES = file1 file2 ...
+# Adding location for the tag files is done as follows:
+#
+# TAGFILES = file1=loc1 "file2 = loc2" ...
+# where "loc1" and "loc2" can be relative or absolute paths or
+# URLs. If a location is present for each tag, the installdox tool
+# does not have to be run to correct the links.
+# Note that each tag file must have a unique name
+# (where the name does NOT include the path)
+# If a tag file is not located in the directory in which doxygen
+# is run, you must also specify the path to the tagfile here.
+
+TAGFILES               =
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create
+# a tag file that is based on the input files it reads.
+
+GENERATE_TAGFILE       =
+
+# If the ALLEXTERNALS tag is set to YES all external classes will be listed
+# in the class index. If set to NO only the inherited external classes
+# will be listed.
+
+ALLEXTERNALS           = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed
+# in the modules index. If set to NO, only the current project's groups will
+# be listed.
+
+EXTERNAL_GROUPS        = YES
+
+# The PERL_PATH should be the absolute path and name of the perl script
+# interpreter (i.e. the result of `which perl').
+
+PERL_PATH              = /usr/bin/perl
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+
+# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will
+# generate an inheritance diagram (in HTML, RTF and LaTeX) for classes with base
+# or super classes. Setting the tag to NO turns the diagrams off. Note that
+# this option also works with HAVE_DOT disabled, but it is recommended to
+# install and use dot, since it yields more powerful graphs.
+
+CLASS_DIAGRAMS         = YES
+
+# You can define message sequence charts within doxygen comments using the \msc
+# command. Doxygen will then run the mscgen tool (see
+# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the
+# documentation. The MSCGEN_PATH tag allows you to specify the directory where
+# the mscgen tool resides. If left empty the tool is assumed to be found in the
+# default search path.
+
+MSCGEN_PATH            =
+
+# If set to YES, the inheritance and collaboration graphs will hide
+# inheritance and usage relations if the target is undocumented
+# or is not a class.
+
+HIDE_UNDOC_RELATIONS   = YES
+
+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
+# available from the path. This tool is part of Graphviz, a graph visualization
+# toolkit from AT&T and Lucent Bell Labs. The other options in this section
+# have no effect if this option is set to NO (the default)
+
+HAVE_DOT               = NO
+
+# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is
+# allowed to run in parallel. When set to 0 (the default) doxygen will
+# base this on the number of processors available in the system. You can set it
+# explicitly to a value larger than 0 to get control over the balance
+# between CPU load and processing speed.
+
+DOT_NUM_THREADS        = 0
+
+# By default doxygen will write a font called Helvetica to the output
+# directory and reference it in all dot files that doxygen generates.
+# When you want a differently looking font you can specify the font name
+# using DOT_FONTNAME. You need to make sure dot is able to find the font,
+# which can be done by putting it in a standard location or by setting the
+# DOTFONTPATH environment variable or by setting DOT_FONTPATH to the directory
+# containing the font.
+
+DOT_FONTNAME           = Helvetica
+
+# The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs.
+# The default size is 10pt.
+
+DOT_FONTSIZE           = 10
+
+# By default doxygen will tell dot to use the output directory to look for the
+# FreeSans.ttf font (which doxygen will put there itself). If you specify a
+# different font using DOT_FONTNAME you can set the path where dot
+# can find it using this tag.
+
+DOT_FONTPATH           =
+
+# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect inheritance relations. Setting this tag to YES will force the
+# CLASS_DIAGRAMS tag to NO.
+
+CLASS_GRAPH            = YES
+
+# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect implementation dependencies (inheritance, containment, and
+# class references variables) of the class with other documented classes.
+
+COLLABORATION_GRAPH    = YES
+
+# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for groups, showing the direct group dependencies.
+
+GROUP_GRAPHS           = YES
+
+# If the UML_LOOK tag is set to YES doxygen will generate inheritance and
+# collaboration diagrams in a style similar to the OMG's Unified Modeling
+# Language.
+
+UML_LOOK               = NO
+
+# If set to YES, the inheritance and collaboration graphs will show the
+# relations between templates and their instances.
+
+TEMPLATE_RELATIONS     = NO
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT
+# tags are set to YES then doxygen will generate a graph for each documented
+# file showing the direct and indirect include dependencies of the file with
+# other documented files.
+
+INCLUDE_GRAPH          = YES
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and
+# HAVE_DOT tags are set to YES then doxygen will generate a graph for each
+# documented header file showing the documented files that directly or
+# indirectly include this file.
+
+INCLUDED_BY_GRAPH      = YES
+
+# If the CALL_GRAPH and HAVE_DOT options are set to YES then
+# doxygen will generate a call dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable call graphs
+# for selected functions only using the \callgraph command.
+
+CALL_GRAPH             = NO
+
+# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then
+# doxygen will generate a caller dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable caller
+# graphs for selected functions only using the \callergraph command.
+
+CALLER_GRAPH           = NO
+
+# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen
+# will generate a graphical hierarchy of all classes instead of a textual one.
+
+GRAPHICAL_HIERARCHY    = YES
+
+# If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES
+# then doxygen will show the dependencies a directory has on other directories
+# in a graphical way. The dependency relations are determined by the #include
+# relations between the files in the directories.
+
+DIRECTORY_GRAPH        = YES
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
+# generated by dot. Possible values are png, jpg, gif or svg.
+# If left blank png will be used.
+
+DOT_IMAGE_FORMAT       = png
+
+# The tag DOT_PATH can be used to specify the path where the dot tool can be
+# found. If left blank, it is assumed the dot tool can be found in the path.
+
+DOT_PATH               =
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that
+# contain dot files that are included in the documentation (see the
+# \dotfile command).
+
+DOTFILE_DIRS           =
+
+# The MSCFILE_DIRS tag can be used to specify one or more directories that
+# contain msc files that are included in the documentation (see the
+# \mscfile command).
+
+MSCFILE_DIRS           =
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of
+# nodes that will be shown in the graph. If the number of nodes in a graph
+# becomes larger than this value, doxygen will truncate the graph, which is
+# visualized by representing a node as a red box. Note that if the
+# number of direct children of the root node in a graph is already larger than
+# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note
+# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+
+DOT_GRAPH_MAX_NODES    = 50
+
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the
+# graphs generated by dot. A depth value of 3 means that only nodes reachable
+# from the root by following a path via at most 3 edges will be shown. Nodes
+# that lie further from the root node will be omitted. Note that setting this
+# option to 1 or 2 may greatly reduce the computation time needed for large
+# code bases. Also note that the size of a graph can be further restricted by
+# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+
+MAX_DOT_GRAPH_DEPTH    = 0
+
+# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
+# background. This is disabled by default, because dot on Windows does not
+# seem to support this out of the box. Warning: Depending on the platform used,
+# enabling this option may lead to badly anti-aliased labels on the edges of
+# a graph (i.e. they become hard to read).
+
+DOT_TRANSPARENT        = NO
+
+# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
+# files in one run (i.e. multiple -o and -T options on the command line). This
+# makes dot run faster, but since only newer versions of dot (>1.8.10)
+# support this, this feature is disabled by default.
+
+DOT_MULTI_TARGETS      = YES
+
+# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will
+# generate a legend page explaining the meaning of the various boxes and
+# arrows in the dot generated graphs.
+
+GENERATE_LEGEND        = YES
+
+# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will
+# remove the intermediate dot files that are used to generate
+# the various graphs.
+
+DOT_CLEANUP            = YES
diff --git a/getlog.sh b/getlog.sh
new file mode 100755
index 0000000..72e46d0
--- /dev/null
+++ b/getlog.sh
@@ -0,0 +1,94 @@
+#!/bin/sh
+#
+#  Copyright 2011-12 ARM Limited
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+#
+# NE10 Library : getlog.sh
+#
+
+echo "NE10 NIGHTLY BUILD SCRIPT"
+echo "(C) 2011, ARM Ltd."
+date
+# NOTE: printf, "=" and "command -v" are used instead of "echo -e", "==" and "which" so that any POSIX /bin/sh can run this
+echo
+echo
+printf '\033[4m%s\033[0m\n' "SYSTEM:"
+uname -a
+cat /proc/cpuinfo
+# every tool below is mandatory: the script stops with exit status 1 at the first one that is missing
+echo
+echo
+printf '\033[4m%s\033[0m\n' "INSTALLED TOOLS:"
+echo "git:"
+if ! command -v git >/dev/null 2>&1; then
+ echo "fatal: 'git' is not installed on this system" 1>&2
+ exit 1
+fi
+git --version | paste -s -d ';' -
+echo
+echo "gcc:"
+if ! command -v gcc >/dev/null 2>&1; then
+ echo "fatal: 'gcc' is not installed on this system" 1>&2
+ exit 1
+fi
+gcc --version | paste -s -d ';' -
+echo
+echo "as:"
+if ! command -v as >/dev/null 2>&1; then
+ echo "fatal: 'as' is not installed on this system" 1>&2
+ exit 1
+fi
+as --version | paste -s -d ';' -
+echo
+echo "ar:"
+if ! command -v ar >/dev/null 2>&1; then
+ echo "fatal: 'ar' is not installed on this system" 1>&2
+ exit 1
+fi
+ar --version | paste -s -d ';' -
+echo
+echo
+echo "perl:"
+if ! command -v perl >/dev/null 2>&1; then
+ echo "fatal: 'perl' is not installed on this system" 1>&2
+ exit 1
+fi
+perl --version | paste -s -d ';' -
+
+echo
+echo
+printf '\033[4m%s\033[0m\n' "CURRENT 'git' CONFIGURATION:"
+git config -l
+
+echo
+echo
+printf '\033[4m%s\033[0m\n' "CURRENT USER AND PATH:"
+echo "$(whoami) @ $(pwd)"
+
+echo
+echo
+printf '\033[4m%s\033[0m\n' "ENVIRONMENT VARIABLES:"
+echo
+echo "PATH =  $PATH"
+echo
+echo "LD_LIBRARY_PATH =  $LD_LIBRARY_PATH"
+
+# the most recent commit (message and diff) identifies the exact source state that was built
+echo
+echo
+printf '\033[4m%s\033[0m\n' "CURRENT GIT/SOURCE STATUS:"
+git show
+
diff --git a/headers/NE10_random.h b/headers/NE10_random.h
new file mode 100644
index 0000000..9253312
--- /dev/null
+++ b/headers/NE10_random.h
@@ -0,0 +1,201 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : headers/NE10_random.h
+ */
+#ifndef NE10_RANDOM
+#define NE10_RANDOM
+
+#include <assert.h>
+#include <float.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+// Please look at http://en.wikipedia.org/wiki/Linear_congruential_generator
+// According to this page, these values are the ones used in "glibc"
+
+  // the modulus (2^32) is implicit: uint32_t arithmetic wraps around. NOTE(review): _A/_C are reserved-style names
+  static const uint32_t _A     = 1103515245L; // a, must be 0 < _A < 2^32; static keeps this header safe to include from several files
+  static const uint32_t _C     = 12345L; // c, must be 0 < _C < 2^32
+  // X(n-1), i.e. the seed or the previous value, is kept per generator in NE10_rng_t
+
+// state of one linear congruential generator; separate instances can carry different seeds and states
+typedef struct
+{
+  // these are used as internal values, please do not change them directly
+  uint32_t _private_m_A              ;// multiplier a in X(n) = a * X(n-1) + c
+  uint32_t _private_m_C              ;// increment c in X(n) = a * X(n-1) + c
+  uint32_t _private_m_X_NM1          ;// X(n-1): the seed at first, afterwards the value produced last
+} NE10_rng_t;
+
+typedef struct
+{
+  // these are used as internal values, please do not change them directly
+  // three separately seeded generators: [0] for the fraction bits, [1] for the exponent, [2] for the sign bit
+  NE10_rng_t _private_m_rngs[3];
+} NE10_float_rng_t;
+
+
+// generic functions: they work on a generator instance supplied by the caller
+void NE10_rng_init_g(NE10_rng_t *rng, uint32_t seed)
+{
+  assert( rng != NULL );
+  rng->_private_m_X_NM1 = seed; // the seed is the first value fed into the recurrence
+  rng->_private_m_C = _C;
+  rng->_private_m_A = _A;
+}
+
+uint32_t NE10_rng_next_g(NE10_rng_t *rng)
+{
+  assert( rng != NULL );
+  // one LCG step X(n) = a * X(n-1) + c; no explicit modulo, the 32-bit unsigned type wraps by itself
+  const uint32_t next = rng->_private_m_A * rng->_private_m_X_NM1 + rng->_private_m_C;
+  return ( rng->_private_m_X_NM1 = next ); // remember the new value as the state, then hand it out
+}
+
+const uint32_t NE10_rng_max_g(NE10_rng_t *rng)
+{
+  return UINT32_MAX; // 2^32 - 1; the rng argument is not consulted
+}
+
+
+
+// the same functions using one shared generator instead of a caller-supplied one
+static NE10_rng_t __NE10_rng; // the shared generator; being static, each source file including this header has its own copy
+
+void NE10_rng_init(uint32_t seed) // (re)starts the shared generator from seed
+{
+   NE10_rng_init_g( &__NE10_rng, seed );
+}
+
+uint32_t NE10_rng_next(void) // advances the shared generator and returns its new value; (void) makes this a checked prototype
+{
+   return NE10_rng_next_g( &__NE10_rng );
+}
+
+const uint32_t NE10_rng_max(void) // upper bound of NE10_rng_next(); (void) makes this a checked prototype
+{
+  return NE10_rng_max_g(NULL); // NULL is fine here: NE10_rng_max_g does not read its argument
+}
+
+
+
+// a random number generator that generates IEEE 754 float numbers
+
+// NAN_OR_INF is the exponent field with all 8 bits set, the IEEE 754 pattern shared by NaNs and infinities
+#define NAN_OR_INF (0xFF << 23)
+#define IS_NAN_OR_INF(x) ( ( ((x) & NAN_OR_INF) == NAN_OR_INF ) ? 1 : 0 )
+// EXPONENT_MASK keeps the sign and fraction bits and clears the exponent, so IS_SUBNORMAL is 1 for subnormals and for +/-0
+#define EXPONENT_MASK 0x807FFFFF
+#define IS_SUBNORMAL(x) ( ( ((x) & EXPONENT_MASK) == (x) ) ? 1 : 0 ) /* note: x is evaluated twice */
+
+void NE10_float_rng_init_g(NE10_float_rng_t* float_rng, uint32_t seed)
+{
+   // [0] drives the fraction, [1] the exponent and [2] the sign; each gets its own seed from a scratch generator
+   NE10_rng_t seeder;
+   unsigned int idx;
+   NE10_rng_init_g( &seeder, seed );
+   for ( idx = 0; idx < 3; idx++ )
+   {
+      NE10_rng_init_g( &float_rng->_private_m_rngs[idx], NE10_rng_next_g( &seeder ) );
+   }
+}
+
+float NE10_float_rng_next_g(NE10_float_rng_t* float_rng)
+{
+   uint32_t fraction, exponent, sign, bits;
+   float value;
+
+   do
+   {
+      // draw one raw 32-bit number from each of the three generators
+      fraction = NE10_rng_next_g( &float_rng->_private_m_rngs[0] );
+      exponent = NE10_rng_next_g( &float_rng->_private_m_rngs[1] );
+      sign     = NE10_rng_next_g( &float_rng->_private_m_rngs[2] );
+
+      // keep only the needed bits of each draw and shift them to their position in the float layout
+      fraction =   ( fraction >> 9  ) & 0x7FFFFF        ; // upper 23 bits -> bits 0..22
+      exponent = ( ( exponent >> 24 ) & 0x0000FF ) << 23; // upper  8 bits -> bits 23..30
+      sign     = ( ( sign     >> 16 ) & 0x000001 ) << 31; // bit 16        -> bit 31
+
+      // combine the three fields into one 32-bit pattern
+      bits = fraction | exponent | sign;
+      // patterns that are NaN/Inf, or that have an all-zero exponent, are thrown away and redrawn
+   } while ( IS_NAN_OR_INF(bits) || IS_SUBNORMAL(bits) );
+
+   memcpy( &value, &bits, sizeof(float) ); // reinterpret the bit pattern as a float
+   return value;
+}
+
+float NE10_float_rng_max_g(NE10_float_rng_t* float_rng) // upper bound for generated floats; float_rng is not read
+{
+  return FLT_MAX;
+}
+
+
+// the same functions using one shared float generator instead of a caller-supplied one
+
+static NE10_float_rng_t __NE10_float_rng; // shared float generator state (a struct, one copy per including source file)
+
+void NE10_float_rng_init(uint32_t seed) // (re)seeds the shared float generator
+{
+  NE10_float_rng_init_g( &__NE10_float_rng , seed );
+}
+
+float NE10_float_rng_next(void) // next float from the shared generator; (void) makes this a checked prototype
+{
+   return NE10_float_rng_next_g( &__NE10_float_rng );
+}
+
+float NE10_float_rng_max(void) // upper bound of NE10_float_rng_next(); (void) makes this a checked prototype
+{
+  return NE10_float_rng_max_g(NULL); // NULL is fine here: NE10_float_rng_max_g does not read its argument
+}
+
+// the same as above functions except the range of values are limited
+
+#define IS_TOO_SMALL(f) ( ( (f) < 1.0e-6 ) ? 1 : 0 ) /* plain comparison, so every negative value counts as too small */
+#define   IS_TOO_BIG(f) ( ( (f) > 1.0e12 ) ? 1 : 0 )
+
+static NE10_float_rng_t __NE10_float_rng_limit; // state of the range-limited generator (a struct, one copy per including source file)
+
+void NE10_float_rng_limit_init(uint32_t seed) // (re)seeds the range-limited generator
+{
+   NE10_float_rng_init_g( &__NE10_float_rng_limit , seed );
+}
+
+float NE10_float_rng_limit_next(void)
+{
+   float ret = 0.0f;
+   // redraw until 1.0e-6 <= ret <= 1.0e12; negative draws fail the IS_TOO_SMALL test and are discarded as well
+   do
+   {
+      ret = NE10_float_rng_next_g( &__NE10_float_rng_limit );
+   } while ( IS_TOO_SMALL(ret) || IS_TOO_BIG(ret) );
+
+   return ret;
+}
+
+float NE10_float_rng_limit_max(void) // NOTE(review): reports FLT_MAX although NE10_float_rng_limit_next() never exceeds 1.0e12 - confirm intent
+{
+  return NE10_float_rng_max_g(NULL); // NULL is fine here: NE10_float_rng_max_g does not read its argument
+}
+
+#endif // NE10_RANDOM
+
diff --git a/headers/NE10header.s b/headers/NE10header.s
new file mode 100644
index 0000000..589f605
--- /dev/null
+++ b/headers/NE10header.s
@@ -0,0 +1,27 @@
+@
+@  Copyright 2011-12 ARM Limited
+@
+@  Licensed under the Apache License, Version 2.0 (the "License");
+@  you may not use this file except in compliance with the License.
+@  You may obtain a copy of the License at
+@
+@      http://www.apache.org/licenses/LICENSE-2.0
+@
+@  Unless required by applicable law or agreed to in writing, software
+@  distributed under the License is distributed on an "AS IS" BASIS,
+@  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@  See the License for the specific language governing permissions and
+@  limitations under the License.
+@
+
+@
+@ NE10 Library : headers/NE10header.s
+@
+
+.include "headers/versionheader.s"
+
+@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+@ constant values that are used across the library
+@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        .equ      NE10_OK,        0     @ presumably the "success" status value - confirm against the C headers
+        .equ      NE10_ERR,      -1     @ presumably the "failure" status value - confirm against the C headers
diff --git a/headers/factor.h b/headers/factor.h
new file mode 100644
index 0000000..046f148
--- /dev/null
+++ b/headers/factor.h
@@ -0,0 +1,724 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : headers/factor.h
+ */
+
+// Type-building macros: FLOAT32_2x3 expands to a brace initialiser of six values (used below for float32x2x3_t)
+#define FLOAT32_2x3(x1,y1,x2,y2,x3,y3) \
+    { x1,y1,x2,y2,x3,y3 }
+
+// Unit tests use this macro to index into their function table.
+// "opc" stands for the operation's code (which function) and "imp" for the implementation
+// (which implementation of the function); both are 1-based, IMPL_COUNT comes from the including file
+#define FTBL_IDX(opc, imp) ((((opc)-1)*(IMPL_COUNT))+((imp)-1))
+
+// MEASURE(res, code): runs "code" once and stores the time it took, in seconds, in "res". It is used in the unit tests.
+// The expansion site has to provide before, after and lapsed (with tv_sec/tv_usec members) as well as zone.
+#define MEASURE(res, code) \
+   { \
+    gettimeofday (&before, &zone); /* time stamp just before the measured code */ \
+      code \
+    gettimeofday (&after, &zone); /* time stamp right after the measured code */ \
+    if (before.tv_usec > after.tv_usec) /* borrow one second so that the microsecond difference cannot go negative */ \
+    { \
+      after.tv_usec += 1000000; \
+      after.tv_sec--; \
+    } \
+    lapsed.tv_usec = after.tv_usec - before.tv_usec; \
+    lapsed.tv_sec  = after.tv_sec  - before.tv_sec; \
+    res = lapsed.tv_sec + ((double)lapsed.tv_usec / 1000000.0); /* whole seconds plus the fractional part */ \
+   }
+
+// There are several categories of functions that share common code:
+
+// Different groups of functions take different number of inputs
+//
+// Group 1 = Functions that take a dst, a src, and a cst ("DstSrcCst" for short)
+// Group 2 = Those that take a dst, an acc, a src, and a cst ("DstAccSrcCst" for short)
+// Group 3 = The ones that take a dst, and a cst only ("DstCst" for short)
+//
+// Group 4 = These take a dst, and two src inputs, src1 and src2 ("DstSrc1Src2")
+// Group 5 = These take a dst, an acc, and two src inputs ("DstAccSrc1Src2")
+// Group 6 = These take a dst, and a src ("DstSrc")
+//
+
+// The naming convention used in the following macros is as follows:
+//   SNAPP_<A>_OPERATION_<T>_<I>
+//   where
+//   <A> Stands for the title of the operation (add, mul, etc) followed by its type (C = const as in addc).
+//       The letter X - if used - means any such operation.
+//   <T> Indicates the type of the operation (float, vec2, etc.)
+//       The letter X - if used - means any type.
+//   <I> This indicates the implementation (it can be C, ASM, or NEON).
+
+// A few macros that assert that the count-element ranges behind two pointers do not
+//  partially overlap; pointers that compare equal are accepted without any check
+#define NE10_CHECKPOINTER_DstSrcCst_OPERATION \
+   if ( dst < src ) \
+    { assert ( dst + count <= src ); } /* dst comes first: its range has to end at or before src */ \
+   else if ( dst > src ) \
+    { assert ( src + count <= dst ); } /* src comes first: its range has to end at or before dst */
+
+#define NE10_CHECKPOINTER_DstSrc_OPERATION NE10_CHECKPOINTER_DstSrcCst_OPERATION /* same dst/src check */
+
+#define NE10_CHECKPOINTER_3POINTER_OPERATION(arg1, arg2, arg3) /* checks all three pointer pairs; count comes from the expansion site */ \
+   if ( arg1 < arg2 ) /* pair arg1 / arg2 */ \
+    { assert ( arg1 + count <= arg2 ); } \
+   else if ( arg1 > arg2 ) \
+    { assert ( arg2 + count <= arg1 ); } \
+   if ( arg1 < arg3 ) /* pair arg1 / arg3 */ \
+    { assert ( arg1 + count <= arg3 ); } \
+   else if ( arg1 > arg3 ) \
+    { assert ( arg3 + count <= arg1 ); } \
+   if ( arg3 < arg2 ) /* pair arg3 / arg2 */ \
+    { assert ( arg3 + count <= arg2 ); } \
+   else if ( arg3 > arg2 ) \
+    { assert ( arg2 + count <= arg3 ); }
+
+#define NE10_CHECKPOINTER_4POINTER_OPERATION(arg1, arg2, arg3, arg4) /* the three-pointer check plus every pair that involves arg4 */ \
+   NE10_CHECKPOINTER_3POINTER_OPERATION(arg1, arg2, arg3) \
+   if ( arg1 < arg4 ) /* pair arg1 / arg4 */ \
+    { assert ( arg1 + count <= arg4 ); } \
+   else if ( arg1 > arg4 ) \
+    { assert ( arg4 + count <= arg1 ); } \
+   if ( arg2 < arg4 ) /* pair arg2 / arg4 */ \
+    { assert ( arg2 + count <= arg4 ); } \
+   else if ( arg2 > arg4 ) \
+    { assert ( arg4 + count <= arg2 ); } \
+   if ( arg4 < arg3 ) /* pair arg4 / arg3 */ \
+    { assert ( arg4 + count <= arg3 ); } \
+   else if ( arg4 > arg3 ) \
+    { assert ( arg3 + count <= arg4 ); }
+
+
+
+#define NE10_CHECKPOINTER_DstAccSrcCst_OPERATION { /* dst, acc and src are checked against each other */ \
+   NE10_CHECKPOINTER_3POINTER_OPERATION(dst, acc, src); }
+
+#define NE10_CHECKPOINTER_DstCst_OPERATION  {} /* expands to an empty block: no check is performed */
+
+#define NE10_CHECKPOINTER_DstSrc1Src2_OPERATION { /* dst, src1 and src2 are checked against each other */ \
+   NE10_CHECKPOINTER_3POINTER_OPERATION(dst, src1, src2); }
+
+#define NE10_CHECKPOINTER_DstAccSrc1Src2_OPERATION { /* dst, acc, src1 and src2 are checked against each other */ \
+   NE10_CHECKPOINTER_4POINTER_OPERATION(dst, acc, src1, src2); }
+
+// These macros generalise implementation of the functions.
+
+// Macros used in C implementations: a complete function body that runs loopCode for itr = 0 .. count-1 and returns res
+#define NE10_TEMPLATE_XC_OPERATION_X_C(checkPointer, loopCode) { \
+   arm_result_t res = NE10_OK; /* returned as is unless loopCode assigns to it */ \
+   unsigned int itr = 0; /* index of the current item, visible to loopCode */ \
+   checkPointer; /* one of the NE10_CHECKPOINTER_* macros is expected here */ \
+   for ( itr = 0; itr < count; itr++ ) \
+   { loopCode ; /* this loop iterates through each and every item one at a time */ \
+   } \
+   return res; \
+  }
+
+// macros used in the NEON implementations
+
+// Main Loop = The loop where the number of items to be processed is exactly the
+//              number that we can process in a single iteration.
+//
+// Secondary Loop = The loop that follows a Main Loop to fill in the entries that
+//                   did not fit into the Main Loop. This is needed when the number of
+//                   input items is not a multiple of the number of items that we
+//                   process in every iteration of the Main Loop.
+
+
+/****************************************************
+ *                                                  *
+ *  The "DstSrcCst" group of functions              *
+ *                                                  *
+ ****************************************************/
+
+///// - FLOAT - /////
+
+#define NE10_DstSrcCst_MAINLOOP_FLOAT_NEON(loopCode) { /* one main-loop pass; loopCode has to compute n_dst from n_src */ \
+     /* load 4 values into n_src, which the enclosing OPERATION macro declares */ \
+     n_src = vld1q_f32( (float32_t*)src ); \
+     src += 4; /* move to the next 4 float items; 4*float */ \
+     loopCode; /* the actual operation is placed here... */ /* The main loop iterates through four float values each time */ \
+     vst1q_f32 ( (float32_t*)dst , n_dst ); /* store the four results held in n_dst back */ \
+     dst += 4; /* move to the next items; 4*float */ \
+    }
+
+#define NE10_DstSrcCst_SECONDLOOP_FLOAT_NEON(loopCode) { /* handles exactly one leftover float per expansion */ \
+      float32x2_t n_tmp_src = { 0.0f , 0.0f }; /* temporary storage to be used with NEON load/store intrinsics */ \
+      float32x2_t n_tmp_cst = { cst, cst }; /* temporary constant value for use in the main NEON operation */ \
+      n_tmp_src = vld1_lane_f32 ( (float32_t*)src, n_tmp_src, 0); /* load one float into lane 0, lane 1 stays 0.0f */ \
+      loopCode; /* the actual operation is placed here ... */ /* exceptional cases where the count is not a multiple of 4 */ \
+      vst1_lane_f32( (float32_t*)dst, n_tmp_src, 0); /* lane 0 of n_tmp_src is stored, so loopCode has to leave its result there */ \
+      /* move to the next item in the stream */ \
+      src++; \
+      dst++; \
+     }
+
+#define NE10_DstSrcCst_OPERATION_FLOAT_NEON(checkPointer, loopCode1, loopCode2) { /* complete function body; loopCode1 = main loop, loopCode2 = leftovers */ \
+   arm_result_t res = NE10_OK; \
+   float32x4_t n_src; /* filled by the main loop's load */ \
+   float32x4_t n_dst; /* stored by the main loop after loopCode1 ran */ \
+   checkPointer; \
+   int dif = 0; \
+   dif = count % 4; /* either 0 or one of 1,2,3; in the latter cases the second path is taken */ \
+   for (; count > dif; count -= 4) { /* note: count itself is decremented, 4 items per pass */ \
+     loopCode1; \
+    } \
+   if ( 0 != dif ) { \
+    unsigned int idx; \
+    for ( idx = 0 ; idx < dif; idx++ ) { /* one pass per leftover item */ \
+      loopCode2; \
+     } \
+    } \
+   return res; \
+  }
+
+///// - VEC2F - /////
+
+#define NE10_DstSrcCst_MAINLOOP_VEC2F_NEON(loopCode) { /* one main-loop pass; loopCode has to compute n_dst from n_src */ \
+     n_src = vld1q_f32( (float32_t*)src ); /* load two vectors (four floats) */ \
+     src += 2; /* move to the next two vectors */ \
+     loopCode; /* actual operation */ /* The main loop iterates through two 2D vectors each time */ \
+     vst1q_f32 ( (float32_t*)dst , n_dst ); /* store back */ \
+     dst += 2; /* move to the next 2 vectors */ \
+    }
+
+#define NE10_DstSrcCst_SECONDLOOP_VEC2F_NEON(loopCode) { /* handles one leftover 2D vector; src and dst are not advanced */ \
+     float32x2_t n_tmp_src; \
+     float32x2_t n_tmp_cst = { cst->x, cst->y }; /* the constant vector, for use by loopCode */ \
+     n_tmp_src = vld1_f32( (float32_t*)src  ); /* load the two floats of one vector */ \
+     loopCode; /* exceptional cases where the count isn't a multiple of 2 */ \
+     vst1_f32( (float32_t*)dst, n_tmp_src); /* n_tmp_src is stored, so loopCode has to leave its result there */ \
+    }
+
+#define NE10_DstSrcCst_OPERATION_VEC2F_NEON(checkPointer, loopCode1, loopCode2) { /* complete function body; loopCode1 = main loop, loopCode2 = leftover */ \
+   arm_result_t res = NE10_OK; \
+   float32x4_t n_cst = { cst->x, cst->y, cst->x, cst->y }; /* the constant vector repeated twice, matching two vectors per register */ \
+   float32x4_t n_src; \
+   float32x4_t n_dst; \
+   checkPointer; \
+   int dif = count % 2; /* 0, or 1 when a single vector is left over */ \
+   for (; count > dif; count -= 2) { /* note: count itself is decremented, 2 vectors per pass */ \
+    loopCode1; \
+   } \
+   if ( 0 != dif ) { /* at most one leftover vector, so no loop is needed */ \
+    loopCode2; \
+   } \
+   return res; \
+  }
+
+///// - VEC3F - /////
+
+#define NE10_DstSrcCst_MAINLOOP_VEC3F_NEON(loopCode) { /* one main-loop pass; loopCode has to compute n_dst1..3 from n_src1..3 */ \
+     n_src1 = vld1q_f32( (float32_t*)src ); \
+     src = ((void*)src)+(4*sizeof(arm_float_t)); /* byte-wise step of four arm_float_t; arithmetic on void* is a GNU C extension */ \
+     n_src2 = vld1q_f32( (float32_t*)src ); \
+     src = ((void*)src)+(4*sizeof(arm_float_t)); \
+     n_src3 = vld1q_f32( (float32_t*)src ); \
+     src = ((void*)src)+(4*sizeof(arm_float_t)); \
+     loopCode; /* three registers of four floats each = 12 floats, i.e. four 3D vectors per pass */ \
+     vst1q_f32 ( (float32_t*)dst , n_dst1 ); \
+     dst = ((void*)dst)+(4*sizeof(arm_float_t)); \
+     vst1q_f32 ( (float32_t*)dst , n_dst2 ); \
+     dst = ((void*)dst)+(4*sizeof(arm_float_t)); \
+     vst1q_f32 ( (float32_t*)dst , n_dst3 ); \
+     dst = ((void*)dst)+(4*sizeof(arm_float_t)); \
+  }
+
+#define NE10_DstSrcCst_SECONDLOOP_VEC3F_NEON(loopCode) { /* tail step: one leftover 3D vector per pass */ \
+      float32x2x3_t n_tmp_src = FLOAT32_2x3( \
+        0.0f, 0.0f, 0.0f , 0.0f, 0.0f , 0.0f); \
+      float32x2x3_t n_tmp_cst = FLOAT32_2x3( \
+        cst->x, 0, cst->y, 0, cst->z, 0); \
+      n_tmp_src = vld3_lane_f32 ( (float32_t*)src, n_tmp_src, 0); /* x, y and z each go to lane 0 of one of the three registers */ \
+      loopCode; /* exceptional cases where the count isn't a multiple of 4; the result must end up in n_tmp_src */ \
+      vst3_lane_f32( (float32_t*)dst, n_tmp_src, 0); /* write lane 0 of the three registers back as x, y, z */ \
+      src++; \
+      dst++; \
+     }
+
+#define NE10_DstSrcCst_OPERATION_VEC3F_NEON(checkPointer, loopCode1, loopCode2) { /* function-body skeleton: NEON pass over 3D vectors with a constant operand */ \
+   arm_result_t res = NE10_OK; \
+   float32x4_t n_cst1 = { cst->x, cst->y, cst->z, cst->x }; /* the three registers together hold x,y,z repeated four times, */ \
+   float32x4_t n_cst2 = { cst->y, cst->z, cst->x, cst->y }; /* matching the layout of four packed 3D vectors               */ \
+   float32x4_t n_cst3 = { cst->z, cst->x, cst->y, cst->z }; \
+    float32x4_t n_src1, n_src2, n_src3; \
+   float32x4_t n_dst1, n_dst2, n_dst3; \
+   checkPointer; /* caller-supplied pointer check */ \
+   int dif = count % 4;  /* number of vectors left after the groups of four */ \
+   for (; count > dif; count -= 4) { /* four vectors per pass */ \
+    loopCode1; \
+  } \
+  if ( 0 != dif ) { \
+    unsigned int idx; \
+    for ( idx = 0 ; idx < dif; idx++ ) { /* one leftover vector per pass */ \
+      loopCode2; \
+     } \
+    } \
+   return res; \
+  }
+
+///// - VEC4F - /////
+
+/* Note that for the VEC4* types, we do not need a second loop as the number
+    of input items is always a multiple of four. */
+
+#define NE10_DstSrcCst_MAINLOOP_VEC4F_NEON(loopCode) { /* one loop step: a single 4D vector */ \
+     n_src = vld1q_f32( (float32_t*)src ); /* load the four components */ \
+     src ++; /* next source vector */ \
+     loopCode; /* caller-supplied operation; n_dst is what gets stored next */ \
+     vst1q_f32 ( (float32_t*)dst , n_dst );  /* The main loop iterates through one 4D vector each time */ \
+     dst ++; /* next destination vector */ \
+   }
+
+#define NE10_DstSrcCst_OPERATION_VEC4F_NEON(checkPointer, loopCode) { /* function-body skeleton: NEON pass over 4D vectors with a constant operand */ \
+   arm_result_t res = NE10_OK; \
+   float32x4_t n_cst = { cst->x, cst->y, cst->z, cst->w }; /* the constant vector fills one q register exactly */ \
+   float32x4_t n_src; \
+   float32x4_t n_dst; \
+   checkPointer; /* caller-supplied pointer check */ \
+   for (; count != 0; count --) { /* one vector per pass, so there is no remainder to handle */ \
+     loopCode; \
+    } \
+   return res; \
+  }
+
+/****************************************************
+ *                                                  *
+ *  The "DstAccSrcCst" group of functions           *
+ *                                                  *
+ ****************************************************/
+
+///// - FLOAT - /////
+
+#define NE10_DstAccSrcCst_MAINLOOP_FLOAT_NEON(loopCode) { \
+     /* load 4 accumulator values and 4 source values */ \
+     n_acc = vld1q_f32( (float32_t*)acc ); \
+     n_src = vld1q_f32( (float32_t*)src ); \
+     acc += 4; /* move to the next 4 float items; 4*float */ \
+     src += 4; \
+     loopCode; /* the actual operation is placed here... */ /* The main loop iterates through four float values each time */ \
+     vst1q_f32 ( (float32_t*)dst , n_dst ); /* store the results back */ \
+     dst += 4; /* move to the next items; 4*float */ \
+    }
+
+#define NE10_DstAccSrcCst_SECONDLOOP_FLOAT_NEON(loopCode) { \
+      float32x2_t n_tmp_acc = { 0.0f , 0.0f }; /* temporary storage to be used with NEON load/store intrinsics */ \
+      float32x2_t n_tmp_src = { 0.0f , 0.0f }; /* temporary storage; lane 0 of this register is what gets stored to dst */ \
+      float32x2_t n_tmp_cst = { cst, cst }; /* temporary constant value for use in the main NEON operation */ \
+      n_tmp_acc = vld1_lane_f32 ( (float32_t*)acc, n_tmp_acc, 0); /* load one accumulator value into lane 0 */ \
+      n_tmp_src = vld1_lane_f32 ( (float32_t*)src, n_tmp_src, 0); /* load one source value into lane 0 */ \
+      loopCode; /* the actual operation is placed here ... */ /* exceptional cases where the count is not a multiple of 4 */ \
+      vst1_lane_f32( (float32_t*)dst, n_tmp_src, 0); /* store the lane back into the memory */ \
+      /* move to the next item in the stream */ \
+      acc++; \
+      src++; \
+      dst++; \
+     }
+
+#define NE10_DstAccSrcCst_OPERATION_FLOAT_NEON    NE10_DstSrcCst_OPERATION_FLOAT_NEON /* same skeleton as the non-accumulating variant */
+
+///// - VEC2F - /////
+
+#define NE10_DstAccSrcCst_MAINLOOP_VEC2F_NEON(loopCode) { /* one main-loop step: two packed 2D vectors from acc and from src */ \
+     n_acc = vld1q_f32( (float32_t*)acc ); /* load two accumulator vectors (four floats) */ \
+     n_src = vld1q_f32( (float32_t*)src ); /* load two source vectors (four floats) */ \
+     acc += 2; /* move to the next two vectors */ \
+     src += 2; \
+     loopCode; /* actual operation */ /* The main loop iterates through two 2D vectors each time */ \
+     vst1q_f32 ( (float32_t*)dst , n_dst ); /* store back */ \
+     dst += 2; /* move to the next 2 vectors */ \
+    }
+
+#define NE10_DstAccSrcCst_SECONDLOOP_VEC2F_NEON(loopCode) { /* tail step for the single 2D vector left over when the count is odd */ \
+     float32x2_t n_tmp_acc; /* leftover accumulator vector */ \
+     float32x2_t n_tmp_src; /* leftover source vector; also the register that is stored to dst */ \
+     float32x2_t n_tmp_cst = { cst->x, cst->y }; /* 64-bit copy of the constant vector, available to loopCode */ \
+     n_tmp_acc = vld1_f32( (float32_t*)acc  ); \
+     n_tmp_src = vld1_f32( (float32_t*)src  ); \
+     loopCode; /* exceptional cases where the count isn't a multiple of 2; the result must end up in n_tmp_src */ \
+     vst1_f32( (float32_t*)dst, n_tmp_src); /* acc, src and dst are not advanced here */ \
+    }
+
+#define NE10_DstAccSrcCst_OPERATION_VEC2F_NEON    NE10_DstSrcCst_OPERATION_VEC2F_NEON /* same skeleton as the non-accumulating variant */
+
+///// - VEC3F - /////
+
+#define NE10_DstAccSrcCst_MAINLOOP_VEC3F_NEON(loopCode) { /* one main-loop step: twelve floats (four packed 3D vectors) from acc and from src */ \
+     n_acc1 = vld1q_f32( (float32_t*)acc ); /* Load accumulator values */ \
+     acc = ((void*)acc)+(4*sizeof(arm_float_t)); /* byte-wise advance past the four values just loaded (void* arithmetic is a GNU C extension) */ \
+     n_acc2 = vld1q_f32( (float32_t*)acc ); \
+     acc = ((void*)acc)+(4*sizeof(arm_float_t)); \
+     n_acc3 = vld1q_f32( (float32_t*)acc ); \
+     acc = ((void*)acc)+(4*sizeof(arm_float_t)); \
+     n_src1 = vld1q_f32( (float32_t*)src ); /* Load source values */ \
+     src = ((void*)src)+(4*sizeof(arm_float_t)); \
+     n_src2 = vld1q_f32( (float32_t*)src ); \
+     src = ((void*)src)+(4*sizeof(arm_float_t)); \
+     n_src3 = vld1q_f32( (float32_t*)src ); \
+     src = ((void*)src)+(4*sizeof(arm_float_t)); \
+     loopCode; /* The main loop iterates through four 3D vectors each time; the result must be left in n_dst1, n_dst2 and n_dst3 */ \
+     vst1q_f32 ( (float32_t*)dst , n_dst1 ); /* Store the results back into the memory */ \
+     dst = ((void*)dst)+(4*sizeof(arm_float_t)); \
+     vst1q_f32 ( (float32_t*)dst , n_dst2 ); \
+     dst = ((void*)dst)+(4*sizeof(arm_float_t)); \
+     vst1q_f32 ( (float32_t*)dst , n_dst3 ); \
+     dst = ((void*)dst)+(4*sizeof(arm_float_t)); \
+  }
+
+#define NE10_DstAccSrcCst_SECONDLOOP_VEC3F_NEON(loopCode) { /* tail step: one leftover 3D vector per pass */ \
+      float32x2x3_t n_tmp_acc = FLOAT32_2x3( \
+         0.0f, 0.0f, \
+         0.0f, 0.0f, \
+         0.0f, 0.0f  \
+      ); \
+      float32x2x3_t n_tmp_src = FLOAT32_2x3( \
+        0.0f, 0.0f, \
+        0.0f, 0.0f, \
+        0.0f, 0.0f  \
+      ); \
+      float32x2x3_t n_tmp_cst = FLOAT32_2x3( \
+         cst->x, 0, \
+         cst->y, 0, \
+         cst->z, 0 \
+      ); \
+      n_tmp_acc = vld3_lane_f32 ( (float32_t*)acc, n_tmp_acc, 0); /* x, y and z each go to lane 0 of one of the three registers */ \
+      n_tmp_src = vld3_lane_f32 ( (float32_t*)src, n_tmp_src, 0); \
+      loopCode; /* exceptional cases where the count isn't a multiple of 4; the result must end up in n_tmp_src */ \
+      vst3_lane_f32( (float32_t*)dst, n_tmp_src, 0); /* write lane 0 of the three registers back as x, y, z */ \
+      acc++; \
+      src++; \
+      dst++; \
+  }
+
+#define NE10_DstAccSrcCst_OPERATION_VEC3F_NEON    NE10_DstSrcCst_OPERATION_VEC3F_NEON /* same skeleton as the non-accumulating variant */
+
+///// - VEC4F - /////
+
+#define NE10_DstAccSrcCst_MAINLOOP_VEC4F_NEON(loopCode) { /* one loop step: a single 4D vector from acc and from src */ \
+     n_acc = vld1q_f32( (float32_t*)acc ); \
+     n_src = vld1q_f32( (float32_t*)src ); \
+     acc ++; /* next accumulator vector */ \
+     src ++; /* next source vector */ \
+     loopCode; /* caller-supplied operation; n_dst is what gets stored next */ \
+     vst1q_f32 ( (float32_t*)dst , n_dst );  /* The main loop iterates through one 4D vector each time */ \
+     dst ++; \
+  }
+
+#define NE10_DstAccSrcCst_OPERATION_VEC4F_NEON    NE10_DstSrcCst_OPERATION_VEC4F_NEON /* same skeleton as the non-accumulating variant */
+
+/****************************************************
+ *                                                  *
+ *  The "DstCst" group of functions                 *
+ *                                                  *
+ ****************************************************/
+
+///// - FLOAT - /////
+
+#define NE10_DstCst_MAINLOOP_FLOAT_NEON(loopCode) { \
+     /* nothing is loaded here: this group has no source operand, only the constant in n_cst */ \
+     loopCode; /* the actual operation is placed here... */ /* The main loop iterates through four float values each time */ \
+     vst1q_f32 ( (float32_t*)dst , n_cst ); /* store the four constant lanes */ \
+     dst += 4; /* move to the next items; 4*float */ \
+    }
+
+#define NE10_DstCst_SECONDLOOP_FLOAT_NEON(loopCode) { \
+      float32x2_t n_tmp_cst = { cst, cst }; /* temporary constant value; lane 0 of it is what gets stored */ \
+      loopCode; /* the actual operation is placed here ... */ /* exceptional cases where the count is not a multiple of 4 */ \
+      vst1_lane_f32( (float32_t*)dst, n_tmp_cst, 0); /* store the lane back into the memory */ \
+      /* move to the next item in the stream */ \
+      dst++; \
+     }
+
+#define NE10_DstCst_OPERATION_FLOAT_NEON(checkPointer, loopCode1, loopCode2) { /* function-body skeleton: NEON fill of a float array; n_cst must be declared by the enclosing macro */ \
+   arm_result_t res = NE10_OK; \
+   checkPointer; /* caller-supplied pointer check */ \
+   int dif = 0; \
+   dif = count % 4; /* either 0 or one of 1,2,3; in the latter cases the second path is taken */ \
+   for (; count > dif; count -= 4) { /* four floats per pass */ \
+     loopCode1; \
+    } \
+   if ( 0 != dif ) { \
+    unsigned int idx; \
+    for ( idx = 0 ; idx < dif; idx++ ) { /* one leftover float per pass */ \
+      loopCode2; \
+     } \
+    } \
+   return res; \
+  }
+
+///// - VEC2F - /////
+
+
+#define NE10_DstCst_MAINLOOP_VEC2F_NEON(loopCode) { /* one main-loop step: writes two packed 2D vectors */ \
+     loopCode; /* actual operation */ /* The main loop iterates through two 2D vectors each time */ \
+     vst1q_f32 ( (float32_t*)dst , n_cst ); /* store the constant register (four floats) */ \
+     dst += 2; /* move to the next 2 vectors */ \
+    }
+
+#define NE10_DstCst_SECONDLOOP_VEC2F_NEON(loopCode) { /* tail step for the single 2D vector left over when the count is odd */ \
+     float32x2_t n_tmp_cst = { cst->x, cst->y }; /* 64-bit copy of the constant vector; this is what gets stored */ \
+     loopCode; /* exceptional cases where the count isn't a multiple of 2 */ \
+     vst1_f32( (float32_t*)dst, n_tmp_cst); /* dst is not advanced here */ \
+    }
+
+#define NE10_DstCst_OPERATION_VEC2F_NEON(checkPointer, loopCode1, loopCode2) { /* function-body skeleton: NEON fill of a 2D-vector array */ \
+   arm_result_t res = NE10_OK; \
+   float32x4_t n_cst = { cst->x, cst->y, cst->x, cst->y }; /* constant repeated twice: two packed 2D vectors */ \
+   checkPointer; /* caller-supplied pointer check */ \
+   int dif = count % 2; /* 1 when one vector is left after the pairs, otherwise 0 */ \
+   for (; count > dif; count -= 2) { /* two vectors per pass */ \
+    loopCode1; \
+   } \
+   if ( 0 != dif ) { /* odd count: handle the final vector once */ \
+    loopCode2; \
+   } \
+   return res; \
+  }
+
+///// - VEC3F - /////
+
+#define NE10_DstCst_MAINLOOP_VEC3F_NEON(loopCode) { /* one main-loop step: three 4-float stores = four packed 3D vectors */ \
+     loopCode; /* The main loop iterates through four 3D vectors each time */ \
+     vst1q_f32 ( (float32_t*)dst , n_cst1 ); \
+     dst = ((void*)dst)+(4*sizeof(arm_float_t)); /* byte-wise advance past the four values just stored (void* arithmetic is a GNU C extension) */ \
+     vst1q_f32 ( (float32_t*)dst , n_cst2 ); \
+     dst = ((void*)dst)+(4*sizeof(arm_float_t)); \
+     vst1q_f32 ( (float32_t*)dst , n_cst3 ); \
+     dst = ((void*)dst)+(4*sizeof(arm_float_t)); \
+  }
+
+#define NE10_DstCst_SECONDLOOP_VEC3F_NEON(loopCode) { /* tail step: one leftover 3D vector per pass */ \
+      float32x2x3_t n_tmp_cst = FLOAT32_2x3( \
+        cst->x, 0, \
+        cst->y, 0, \
+        cst->z, 0 \
+      ); \
+      loopCode; /* exceptional cases where the count isn't a multiple of 4 */ \
+      vst3_lane_f32( (float32_t*)dst, n_tmp_cst, 0); /* write lane 0 of the three registers as x, y, z */ \
+      dst++; \
+     }
+
+#define NE10_DstCst_OPERATION_VEC3F_NEON(checkPointer, loopCode1, loopCode2) { /* function-body skeleton: NEON fill of a 3D-vector array */ \
+   arm_result_t res = NE10_OK; \
+   float32x4_t n_cst1 = { cst->x, cst->y, cst->z, cst->x }; /* the three registers together hold x,y,z repeated four times, */ \
+   float32x4_t n_cst2 = { cst->y, cst->z, cst->x, cst->y }; /* matching the layout of four packed 3D vectors               */ \
+   float32x4_t n_cst3 = { cst->z, cst->x, cst->y, cst->z }; \
+   checkPointer; /* caller-supplied pointer check */ \
+   int dif = count % 4;  /* number of vectors left after the groups of four */ \
+   for (; count > dif; count -= 4) { /* four vectors per pass */ \
+    loopCode1; \
+  } \
+  if ( 0 != dif ) { \
+    unsigned int idx; \
+    for ( idx = 0 ; idx < dif; idx++ ) { /* one leftover vector per pass */ \
+      loopCode2; \
+     } \
+    } \
+   return res; \
+  }
+
+///// - VEC4F - /////
+
+#define NE10_DstCst_MAINLOOP_VEC4F_NEON(loopCode) { /* one loop step: writes a single 4D vector */ \
+     loopCode; \
+     vst1q_f32 ( (float32_t*)dst , n_cst );  /* The main loop iterates through one 4D vector each time */ \
+     dst ++; /* next destination vector */ \
+   }
+
+#define NE10_DstCst_OPERATION_VEC4F_NEON(checkPointer, loopCode) { /* function-body skeleton: NEON fill of a 4D-vector array */ \
+   arm_result_t res = NE10_OK; \
+   float32x4_t n_cst = { cst->x, cst->y, cst->z, cst->w }; /* the constant vector fills one q register exactly */ \
+   checkPointer; /* caller-supplied pointer check */ \
+   for (; count != 0; count --) { /* one vector per pass, so there is no remainder to handle */ \
+     loopCode; \
+    } \
+   return res; \
+  }
+
+/****************************************************
+ *                                                  *
+ *  The "DstSrc1Src2" group of functions            *
+ *                                                  *
+ ****************************************************/
+
+///// - FLOAT - /////
+
+#define NE10_DstSrc1Src2_MAINLOOP_FLOAT_NEON(loopCode) { \
+     /* load 4 values from each of the two sources */ \
+     n_src = vld1q_f32( (float32_t*)src1 ); \
+     src1 += 4; /* move to the next 4 float items; 4*float */ \
+     n_src2 = vld1q_f32( (float32_t*)src2 ); \
+     src2 += 4; /* move to the next 4 float items; 4*float */ \
+     loopCode; /* the actual operation is placed here... */ /* The main loop iterates through four float values each time */ \
+     vst1q_f32 ( (float32_t*)dst , n_dst ); /* store the results back */ \
+     dst += 4; /* move to the next items; 4*float */ \
+    }
+
+#define NE10_DstSrc1Src2_SECONDLOOP_FLOAT_NEON(loopCode) { /* tail step: one leftover float from each source per pass */ \
+      float32x2_t n_tmp_src = { 0.0f , 0.0f }; /* first operand; lane 0 of this register is what gets stored to dst */ \
+      float32x2_t n_tmp_src2 = { 0.0f , 0.0f }; /* second operand */ \
+      n_tmp_src = vld1_lane_f32 ( (float32_t*)src1, n_tmp_src, 0); /* load *src1 into lane 0; lane 1 keeps its zero */ \
+      n_tmp_src2 = vld1_lane_f32 ( (float32_t*)src2, n_tmp_src2, 0); /* load *src2 into lane 0 of its own register, not a copy of n_tmp_src */ \
+      loopCode; /* the actual operation is placed here ... */ /* exceptional cases where the count is not a multiple of 4 */ \
+      vst1_lane_f32( (float32_t*)dst, n_tmp_src, 0); /* store the lane back into the memory */ \
+      /* move to the next item in each stream */ \
+      src1++; \
+      src2++; \
+      dst++; \
+     }
+
+#define NE10_DstSrc1Src2_OPERATION_FLOAT_NEON NE10_DstSrcCst_OPERATION_FLOAT_NEON /* reuses the DstSrcCst float skeleton */
+
+/****************************************************
+ *                                                  *
+ *  The "DstAccSrc1Src2" group of functions         *
+ *                                                  *
+ ****************************************************/
+
+///// - FLOAT - /////
+
+#define NE10_DstAccSrc1Src2_MAINLOOP_FLOAT_NEON(loopCode) { \
+     /* load 4 values from the accumulator and from each of the two sources */ \
+     n_acc = vld1q_f32( (float32_t*)acc ); \
+     n_src = vld1q_f32( (float32_t*)src1 ); \
+     n_src2 = vld1q_f32( (float32_t*)src2 ); \
+     acc += 4; /* move to the next 4 float items; 4*float */ \
+     src1 += 4; \
+     src2 += 4; \
+     loopCode; /* the actual operation is placed here... */ /* The main loop iterates through four float values each time */ \
+     vst1q_f32 ( (float32_t*)dst , n_dst ); /* store the results back */ \
+     dst += 4; /* move to the next items; 4*float */ \
+    }
+
+#define NE10_DstAccSrc1Src2_SECONDLOOP_FLOAT_NEON(loopCode) { \
+      float32x2_t n_tmp_acc = { 0.0f , 0.0f }; /* temporary storage to be used with NEON load/store intrinsics */ \
+      float32x2_t n_tmp_src = { 0.0f , 0.0f }; /* lane 0 of this register is what gets stored to dst */ \
+      float32x2_t n_tmp_src2 = { 0.0f, 0.0f }; \
+      n_tmp_acc = vld1_lane_f32 ( (float32_t*)acc, n_tmp_acc, 0); /* load one accumulator value into lane 0 */ \
+      n_tmp_src = vld1_lane_f32 ( (float32_t*)src1, n_tmp_src, 0); /* load one value of the first source into lane 0 */ \
+      n_tmp_src2 = vld1_lane_f32 ( (float32_t*)src2, n_tmp_src2, 0); /* load one value of the second source into lane 0 */ \
+      loopCode; /* the actual operation is placed here ... */ /* exceptional cases where the count is not a multiple of 4 */ \
+      vst1_lane_f32( (float32_t*)dst, n_tmp_src, 0); /* store the lane back into the memory */ \
+      /* move to the next item in the stream */ \
+      acc++; \
+      src1++; \
+      src2++; \
+      dst++; \
+     }
+
+#define NE10_DstAccSrc1Src2_OPERATION_FLOAT_NEON NE10_DstAccSrcCst_OPERATION_FLOAT_NEON /* which is itself an alias of the DstSrcCst float skeleton */
+
+/****************************************************
+ *                                                  *
+ *  The "DstSrc" group of functions                 *
+ *                                                  *
+ ****************************************************/
+
+///// - FLOAT - /////
+
+#define NE10_DstSrc_MAINLOOP_FLOAT_NEON NE10_DstSrcCst_MAINLOOP_FLOAT_NEON /* float operations without a constant reuse the DstSrcCst pieces */
+
+#define NE10_DstSrc_SECONDLOOP_FLOAT_NEON NE10_DstSrcCst_SECONDLOOP_FLOAT_NEON /* the reused tail macro is defined earlier in this header */
+
+#define NE10_DstSrc_OPERATION_FLOAT_NEON NE10_DstSrcCst_OPERATION_FLOAT_NEON /* same skeleton; see NE10_ABS_OPERATION_FLOAT_NEON for a user */
+
+///// - VEC2F - /////
+
+#define NE10_DstSrc_MAINLOOP_VEC2F_NEON(loopCode) { /* one main-loop step: two 2D vectors, loaded de-interleaved */ \
+     n_src = vld2_f32( (float32_t*)src ); /* load two vectors: val[0] receives both x components, val[1] both y components */ \
+     src += 2; /* move to the next two vectors */ \
+     loopCode; /* actual operation */ /* The main loop iterates through two 2D vectors each time */ \
+     /* store the results and increment the destination pointer within the loopCode */ \
+    }
+
+#define NE10_DstSrc_SECONDLOOP_VEC2F_NEON(loopCode) { /* tail step: a thin wrapper, loopCode does the load, the work and the store */ \
+     loopCode; /* exceptional cases where the count isn't a multiple of 2 */ \
+     /* store the results within the loopCode */ \
+    }
+
+#define NE10_DstSrc_OPERATION_VEC2F_NEON(checkPointer, loopCode1, loopCode2) { /* function-body skeleton: NEON pass over 2D vectors, no constant operand */ \
+   arm_result_t res = NE10_OK; \
+   float32x2x2_t n_src; /* de-interleaved pair of vectors written by the main-loop macro */ \
+   float32x2_t n_dst; /* declared for use by the caller's loop code */ \
+   checkPointer; /* caller-supplied pointer check */ \
+   int dif = count % 2; /* 1 when one vector is left after the pairs, otherwise 0 */ \
+   for (; count > dif; count -= 2) { /* two vectors per pass */ \
+    loopCode1; \
+   } \
+   if ( 0 != dif ) { /* odd count: handle the final vector once */ \
+    loopCode2; \
+   } \
+   return res; \
+  }
+
+///// - VEC3F - /////
+
+#define NE10_DstSrc_MAINLOOP_VEC3F_NEON(loopCode) { /* one main-loop step: four 3D vectors, loaded de-interleaved */ \
+     n_src = vld3q_f32( (float32_t*)src ); /* twelve floats: val[0] receives the x components, val[1] the y, val[2] the z */ \
+     src = ((void*)src)+(12*sizeof(arm_float_t)); /* byte-wise advance past the twelve values (void* arithmetic is a GNU C extension) */ \
+     loopCode; /* The main loop iterates through four 3D vectors each time */ \
+     /* store the results and increment the destination pointer within the loopCode */ \
+  }
+
+#define NE10_DstSrc_SECONDLOOP_VEC3F_NEON(loopCode) { /* tail step: a thin wrapper, loopCode does the load, the work and the store */ \
+      loopCode; /* exceptional cases where the count isn't a multiple of 4 */ \
+      /* store the results within the loopCode */ \
+     }
+
+#define NE10_DstSrc_OPERATION_VEC3F_NEON(checkPointer, loopCode1, loopCode2) { /* function-body skeleton: NEON pass over 3D vectors, no constant operand */ \
+   arm_result_t res = NE10_OK; \
+   float32x4x3_t n_src; /* de-interleaved group of four vectors written by the main-loop macro */ \
+   float32x4_t n_dst; /* declared for use by the caller's loop code */ \
+   checkPointer; /* caller-supplied pointer check */ \
+   int dif = count % 4; /* number of vectors left after the groups of four */ \
+   for (; count > dif; count -= 4) { /* four vectors per pass */ \
+    loopCode1; \
+   } \
+   if ( 0 != dif ) { \
+     unsigned int idx; \
+     for ( idx = 0 ; idx < dif; idx++ ) { /* one leftover vector per pass */ \
+       loopCode2; \
+      } \
+     } \
+    return res; \
+   }
+
+///// - VEC4F - /////
+
+/* Note that for the VEC4* types, we do not need a second loop as the number
+    of input items is always a multiple of four. */
+
+#define NE10_DstSrc_MAINLOOP_VEC4F_NEON(loopCode) { /* one loop step: a single 4D vector */ \
+     n_src = vld1q_f32( (float32_t*)src ); /* load the four components */ \
+     src ++; /* next source vector */ \
+     loopCode; \
+     /* store the results and increment the destination pointer within the loopCode */ \
+   }
+
+#define NE10_DstSrc_OPERATION_VEC4F_NEON(checkPointer, loopCode) { /* function-body skeleton: NEON pass over 4D vectors, no constant operand */ \
+   arm_result_t res = NE10_OK; \
+   float32x4_t n_src; /* written by the main-loop macro */ \
+   checkPointer; /* caller-supplied pointer check */ \
+   for (; count != 0; count --) { /* one vector per pass, so there is no remainder to handle */ \
+     loopCode; \
+    } \
+   return res; \
+  }
+
diff --git a/headers/macros.h b/headers/macros.h
new file mode 100644
index 0000000..a8ae6d3
--- /dev/null
+++ b/headers/macros.h
@@ -0,0 +1,223 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : headers/macros.h
+ */
+
+#include "factor.h"
+
+// Macros used in actual implementations
+
+///// The "DstSrcCst" group of functions - FLOAT /////
+
+#define NE10_XC_OPERATION_X_C(loopCode) { /* C version of a dst/src/constant operation */ \
+   NE10_TEMPLATE_XC_OPERATION_X_C( \
+      NE10_CHECKPOINTER_DstSrcCst_OPERATION; , /* pointer check for this argument layout */ \
+      loopCode); /* caller-supplied per-item code */ \
+  }
+
+#define NE10_XC_OPERATION_FLOAT_NEON(loopCode1, loopCode2) { /* NEON version of a float dst/src/constant operation */ \
+   float32x4_t n_cst = { cst, cst, cst, cst }; /* the scalar constant copied into all four lanes */ \
+   NE10_DstSrcCst_OPERATION_FLOAT_NEON(  \
+    NE10_CHECKPOINTER_DstSrcCst_OPERATION; , \
+    NE10_DstSrcCst_MAINLOOP_FLOAT_NEON(loopCode1); , /* loopCode1: four floats per pass */ \
+    NE10_DstSrcCst_SECONDLOOP_FLOAT_NEON(loopCode2); /* loopCode2: one leftover float per pass */ \
+   ); \
+  }
+
+#define NE10_XC_OPERATION_VEC2F_NEON(loopCode1, loopCode2) { /* NEON version of a 2D-vector dst/src/constant operation */ \
+   NE10_DstSrcCst_OPERATION_VEC2F_NEON(  \
+    NE10_CHECKPOINTER_DstSrcCst_OPERATION; , \
+    NE10_DstSrcCst_MAINLOOP_VEC2F_NEON(loopCode1); , /* loopCode1: two vectors per pass */ \
+    NE10_DstSrcCst_SECONDLOOP_VEC2F_NEON(loopCode2); /* loopCode2: the single leftover vector */ \
+   ); \
+  }
+
+/* NEON version of a 3D-vector dst/src/constant operation. This macro uses interleaving to boost the performance */
+#define NE10_XC_OPERATION_VEC3F_NEON(loopCode1, loopCode2) { \
+   NE10_DstSrcCst_OPERATION_VEC3F_NEON(  \
+    NE10_CHECKPOINTER_DstSrcCst_OPERATION; , \
+    NE10_DstSrcCst_MAINLOOP_VEC3F_NEON(loopCode1); , /* loopCode1: four vectors (twelve floats) per pass */ \
+    NE10_DstSrcCst_SECONDLOOP_VEC3F_NEON(loopCode2); /* loopCode2: one leftover vector per pass */ \
+   ); \
+  }
+
+#define NE10_XC_OPERATION_VEC4F_NEON(loopCode) { /* NEON version of a 4D-vector dst/src/constant operation */ \
+   NE10_DstSrcCst_OPERATION_VEC4F_NEON( \
+    NE10_CHECKPOINTER_DstSrcCst_OPERATION; , \
+    NE10_DstSrcCst_MAINLOOP_VEC4F_NEON(loopCode); /* one vector per pass; no tail loop is needed */ \
+   ); \
+  }
+
+///// The "DstAccSrcCst" group of functions - FLOAT //////
+
+#define NE10_MLAC_OPERATION_X_C(loopCode) { /* C version of a dst/acc/src/constant operation */ \
+   NE10_TEMPLATE_XC_OPERATION_X_C( \
+    NE10_CHECKPOINTER_DstAccSrcCst_OPERATION; , /* pointer check for this argument layout */ \
+    loopCode); /* caller-supplied per-item code */ \
+ }
+
+#define NE10_MLAC_OPERATION_FLOAT_NEON(loopCode1, loopCode2) { /* NEON version of a float dst/acc/src/constant operation */ \
+   float32x4_t n_acc; /* accumulator register loaded by the main-loop macro */ \
+   float32x4_t n_cst = { cst, cst, cst, cst }; /* the scalar constant copied into all four lanes */ \
+   NE10_DstAccSrcCst_OPERATION_FLOAT_NEON(  \
+    NE10_CHECKPOINTER_DstAccSrcCst_OPERATION; , \
+    NE10_DstAccSrcCst_MAINLOOP_FLOAT_NEON(loopCode1); , \
+    NE10_DstAccSrcCst_SECONDLOOP_FLOAT_NEON(loopCode2); \
+   ); \
+  }
+
+#define NE10_MLAC_OPERATION_VEC2F_NEON(loopCode1, loopCode2) { /* NEON version of a 2D-vector dst/acc/src/constant operation */ \
+   float32x4_t n_acc; /* accumulator register loaded by the main-loop macro */ \
+   NE10_DstAccSrcCst_OPERATION_VEC2F_NEON(  \
+    NE10_CHECKPOINTER_DstAccSrcCst_OPERATION; , \
+    NE10_DstAccSrcCst_MAINLOOP_VEC2F_NEON(loopCode1); , \
+    NE10_DstAccSrcCst_SECONDLOOP_VEC2F_NEON(loopCode2); \
+   ); \
+  }
+
+#define NE10_MLAC_OPERATION_VEC3F_NEON(loopCode1, loopCode2) { /* NEON version of a 3D-vector dst/acc/src/constant operation */ \
+   float32x4_t n_acc1, n_acc2, n_acc3; /* accumulator registers loaded by the main-loop macro */ \
+   NE10_DstAccSrcCst_OPERATION_VEC3F_NEON(  \
+    NE10_CHECKPOINTER_DstAccSrcCst_OPERATION; , \
+    NE10_DstAccSrcCst_MAINLOOP_VEC3F_NEON(loopCode1); , \
+    NE10_DstAccSrcCst_SECONDLOOP_VEC3F_NEON(loopCode2); \
+   ); \
+  }
+
+#define NE10_MLAC_OPERATION_VEC4F_NEON(loopCode) { /* NEON version of a 4D-vector dst/acc/src/constant operation */ \
+   float32x4_t n_acc; /* accumulator register loaded by the main-loop macro */ \
+   NE10_DstAccSrcCst_OPERATION_VEC4F_NEON( \
+    NE10_CHECKPOINTER_DstAccSrcCst_OPERATION; , \
+    NE10_DstAccSrcCst_MAINLOOP_VEC4F_NEON(loopCode); \
+   ); \
+  }
+
+///// The "DstCst" group of functions - FLOAT /////
+
+#define NE10_SETC_OPERATION_X_C(loopCode) { /* C version of a dst/constant operation */ \
+   NE10_TEMPLATE_XC_OPERATION_X_C( \
+    NE10_CHECKPOINTER_DstCst_OPERATION; , /* pointer check for this argument layout */ \
+    loopCode); /* caller-supplied per-item code */ \
+  }
+
+#define NE10_SETC_OPERATION_FLOAT_NEON(loopCode1, loopCode2) { /* NEON version of a float dst/constant operation */ \
+   float32x4_t n_cst = { cst, cst, cst, cst }; /* the scalar constant copied into all four lanes; stored by the main-loop macro */ \
+   NE10_DstCst_OPERATION_FLOAT_NEON(  \
+    NE10_CHECKPOINTER_DstCst_OPERATION; , \
+    NE10_DstCst_MAINLOOP_FLOAT_NEON(loopCode1); , \
+    NE10_DstCst_SECONDLOOP_FLOAT_NEON(loopCode2); \
+   ); \
+  }
+
+#define NE10_SETC_OPERATION_VEC2F_NEON(loopCode1, loopCode2) { /* NEON version of a 2D-vector dst/constant operation */ \
+   NE10_DstCst_OPERATION_VEC2F_NEON(  \
+    NE10_CHECKPOINTER_DstCst_OPERATION; , \
+    NE10_DstCst_MAINLOOP_VEC2F_NEON(loopCode1); , /* loopCode1: two vectors per pass */ \
+    NE10_DstCst_SECONDLOOP_VEC2F_NEON(loopCode2); /* loopCode2: the single leftover vector */ \
+   ); \
+  }
+
+/* NEON version of a 3D-vector dst/constant operation. This macro uses interleaving to boost the performance */
+#define NE10_SETC_OPERATION_VEC3F_NEON(loopCode1, loopCode2) { \
+   NE10_DstCst_OPERATION_VEC3F_NEON(  \
+    NE10_CHECKPOINTER_DstCst_OPERATION; , \
+    NE10_DstCst_MAINLOOP_VEC3F_NEON(loopCode1); , /* loopCode1: four vectors (twelve floats) per pass */ \
+    NE10_DstCst_SECONDLOOP_VEC3F_NEON(loopCode2); /* loopCode2: one leftover vector per pass */ \
+   ); \
+  }
+
+#define NE10_SETC_OPERATION_VEC4F_NEON(loopCode) { /* NEON version of a 4D-vector dst/constant operation */ \
+   NE10_DstCst_OPERATION_VEC4F_NEON( \
+    NE10_CHECKPOINTER_DstCst_OPERATION; , \
+    NE10_DstCst_MAINLOOP_VEC4F_NEON(loopCode); /* one vector per pass; no tail loop is needed */ \
+   ); \
+  }
+
+///// The "DstSrc1Src2" group of functions //////
+
+#define NE10_X_OPERATION_FLOAT_C(loopCode) { /* C version of a dst/src1/src2 float operation */ \
+   NE10_TEMPLATE_XC_OPERATION_X_C( \
+      NE10_CHECKPOINTER_DstSrc1Src2_OPERATION; , /* pointer check for this argument layout */ \
+      loopCode); /* caller-supplied per-item code */ \
+  }
+
+#define NE10_X_OPERATION_FLOAT_NEON(loopCode1, loopCode2) { /* NEON version of a dst/src1/src2 float operation */ \
+   float32x4_t n_src2; /* second-source register loaded by the main-loop macro */ \
+   NE10_DstSrc1Src2_OPERATION_FLOAT_NEON(  \
+    NE10_CHECKPOINTER_DstSrc1Src2_OPERATION; , \
+    NE10_DstSrc1Src2_MAINLOOP_FLOAT_NEON(loopCode1); , /* loopCode1: four floats per pass */ \
+    NE10_DstSrc1Src2_SECONDLOOP_FLOAT_NEON(loopCode2); /* loopCode2: one leftover float per pass */ \
+   ); \
+  }
+
+///// The "DstSrc" group of functions //////
+
+#define NE10_ABS_OPERATION_X_C(loopCode) { /* C version of a dst/src operation (no constant operand) */ \
+   NE10_TEMPLATE_XC_OPERATION_X_C( \
+    NE10_CHECKPOINTER_DstSrc_OPERATION, /* pointer check for this argument layout */ \
+    loopCode); /* caller-supplied per-item code */ \
+  }
+
+#define NE10_ABS_OPERATION_FLOAT_C NE10_ABS_OPERATION_X_C /* the float case needs nothing beyond the generic C body */
+
+#define NE10_ABS_OPERATION_FLOAT_NEON(loopCode1, loopCode2) { /* NEON version of a float dst/src operation */ \
+   arm_float_t cst = 0.0f; /* this is used to compare the values against. */ \
+   float32x4_t n_cst = { cst, cst, cst, cst }; /* zero in all four lanes; the reused DstSrcCst macros expect cst and n_cst to exist */ \
+   NE10_DstSrc_OPERATION_FLOAT_NEON(  \
+    NE10_CHECKPOINTER_DstSrc_OPERATION; , \
+    NE10_DstSrc_MAINLOOP_FLOAT_NEON(loopCode1); , \
+    NE10_DstSrc_SECONDLOOP_FLOAT_NEON(loopCode2); \
+   ); \
+  }
+
+#define NE10_LEN_OPERATION_X_C NE10_ABS_OPERATION_X_C /* the C length functions use the same dst/src body as abs */
+
+#define NE10_LEN_OPERATION_VEC2F_NEON(loopCode1, loopCode2) { /* NEON length of 2D vectors, built on the DstSrc skeleton */ \
+    NE10_DstSrc_OPERATION_VEC2F_NEON( \
+     NE10_CHECKPOINTER_DstSrcCst_OPERATION, /* NOTE(review): the abs macros pass NE10_CHECKPOINTER_DstSrc_OPERATION here - confirm which check is intended */ \
+     NE10_DstSrc_MAINLOOP_VEC2F_NEON(loopCode1), /* loopCode1 must also store the results and advance dst */ \
+     NE10_DstSrc_SECONDLOOP_VEC2F_NEON(loopCode2) \
+     ); \
+  }
+
+#define NE10_LEN_OPERATION_VEC3F_NEON(loopCode1, loopCode2) { /* NEON length of 3D vectors, built on the DstSrc skeleton */ \
+    NE10_DstSrc_OPERATION_VEC3F_NEON( \
+     NE10_CHECKPOINTER_DstSrcCst_OPERATION, /* NOTE(review): the abs macros pass NE10_CHECKPOINTER_DstSrc_OPERATION here - confirm which check is intended */ \
+     NE10_DstSrc_MAINLOOP_VEC3F_NEON(loopCode1), /* loopCode1 must also store the results and advance dst */ \
+     NE10_DstSrc_SECONDLOOP_VEC3F_NEON(loopCode2) \
+     ); \
+  }
+
+#define NE10_LEN_OPERATION_VEC4F_NEON(loopCode) { /* NEON length of 4D vectors, built on the DstSrc skeleton */ \
+    NE10_DstSrc_OPERATION_VEC4F_NEON( \
+     NE10_CHECKPOINTER_DstSrcCst_OPERATION, /* NOTE(review): the abs macros pass NE10_CHECKPOINTER_DstSrc_OPERATION here - confirm which check is intended */ \
+     NE10_DstSrc_MAINLOOP_VEC4F_NEON(loopCode) /* loopCode must also store the results and advance dst */ \
+     ); \
+  }
+
+///// The "DstAccSrc1Src2" group of functions //////
+
+#define NE10_MLA_OPERATION_FLOAT_NEON(loopCode1, loopCode2) { /* NEON version of a dst/acc/src1/src2 float operation */ \
+   float32x4_t n_acc; /* accumulator register loaded by the main-loop macro */ \
+   float32x4_t n_src2; /* second-source register loaded by the main-loop macro */ \
+   NE10_DstAccSrc1Src2_OPERATION_FLOAT_NEON(  \
+    NE10_CHECKPOINTER_DstAccSrc1Src2_OPERATION; , \
+    NE10_DstAccSrc1Src2_MAINLOOP_FLOAT_NEON(loopCode1); , /* loopCode1: four floats per pass */ \
+    NE10_DstAccSrc1Src2_SECONDLOOP_FLOAT_NEON(loopCode2); /* loopCode2: one leftover float per pass */ \
+   ); \
+  }
diff --git a/headers/unit_test_abs_operation_x.h b/headers/unit_test_abs_operation_x.h
new file mode 100644
index 0000000..93d511c
--- /dev/null
+++ b/headers/unit_test_abs_operation_x.h
@@ -0,0 +1,224 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : headers/unit_test_abs_operation_x.h
+ */
+
+#include "./unit_test_common.h"
+
+// Signature of the functions exercised by this header: a destination, a source and an item count (no constant operand)
+typedef arm_result_t (*arm_func_3args_t)(void * dst, void * src, unsigned int count);
+arm_func_3args_t ftbl[ OP_COUNT * IMPL_COUNT ]; // function table, indexed with FTBL_IDX(opcode, impl)
+
+
+// this function is implemented in the unit test source files
+// it is meant to initialise the function table defined above.
+extern void init_ftbl();
+
+
+unsigned int i = 0;   // loop iterator
+unsigned int max = 0; // number of iterations in each function
+int opcode = -1; // the operation which will be tested (a single unit can have any number of operations/functions)
+int impl   = -1; // selects which particular implementation of the chosen operation must run
+int mute   = 0;   // 0 == print output;   1 == do not print anything;
+
+struct timeval  before, after, lapsed, dummy;
+double dt_test_overhead = 0.0;
+double dt_test_sample = 0.0;
+double elapsed = 0.0;
+struct timezone zone;
+
+// there is a max of "4" components in a vec
+#define MAX_VEC_COMPONENTS 4
+
+arm_float_t * guarded_src = NULL;      // start of the source allocation, guard bytes included
+arm_float_t * guarded_dst[IMPL_COUNT]; // start of each destination allocation, guard bytes included
+
+arm_float_t * thesrc = NULL;      // usable source data inside guarded_src
+arm_float_t * thedst[IMPL_COUNT]; // output from different implementations are stored in separate arrays for verification
+int done_init = 0;                // set to 1 once test_operation() has allocated the buffers above
+
+arm_result_t test_operation()
+{
+  // Runs ftbl[ FTBL_IDX(opcode, impl) ] once under the guard check, once with a zero count, then "max" times for timing.
+  const unsigned int fixed_length = ARRLEN * sizeof(arm_float_t) * MAX_VEC_COMPONENTS; // payload bytes per array
+  // first call only: allocate the guarded buffers and fill the source with random data
+  if ( 0 == done_init )
+  {
+    guarded_src = (arm_float_t*) malloc( (2*ARRAY_GUARD_LEN) + fixed_length ); // 16 extra bytes at the beginning and 16 extra bytes at the end
+    if ( NULL == guarded_src ) { fprintf ( stderr, "\t FATAL ERROR: Could not allocate the source array. \n" ); exit( NE10_ERR ); }
+    GUARD_ARRAY( guarded_src, (2*ARRAY_GUARD_LEN) + fixed_length );
+    thesrc = (arm_float_t*) ( (void*)guarded_src + 16); // the usable data starts after the leading guard bytes
+    FILL_FLOAT_ARRAY( thesrc, ARRLEN * MAX_VEC_COMPONENTS ); // random initialization
+    for ( i = 0; i<IMPL_COUNT; i++ )
+    {
+      guarded_dst[i] = (arm_float_t*) malloc( (2*ARRAY_GUARD_LEN) + fixed_length ); // same layout as the source array
+      if ( NULL == guarded_dst[i] ) { fprintf ( stderr, "\t FATAL ERROR: Could not allocate a destination array. \n" ); exit( NE10_ERR ); }
+      GUARD_ARRAY( guarded_dst[i], (2*ARRAY_GUARD_LEN) + fixed_length );
+      thedst[i] = (arm_float_t*) ( (void*)guarded_dst[i] + 16);
+    }
+    done_init = 1;
+  }
+
+  // sample run: a single timed call, followed by a check that no guard bytes were overwritten
+  MEASURE( dt_test_sample,
+    ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl-1 ] , thesrc, ARRLEN );
+  );
+  if ( ! CHECK_ARRAY_GUARD(guarded_dst[ impl -1 ], (2*ARRAY_GUARD_LEN) + fixed_length) ||
+       ! CHECK_ARRAY_GUARD(guarded_src, (2*ARRAY_GUARD_LEN) + fixed_length) )
+  {
+                fprintf ( stderr, "\t FATAL ERROR: Operation number %d implementation [%d] has failed the guard test. \n", opcode, impl );
+                exit( NE10_ERR );
+  }
+
+  // this call makes sure that passing zero as the length won't cause segmentation faults
+  ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl-1 ] , thesrc, 0 );
+
+          MEASURE( elapsed,
+            for ( i = 0; i < max; i++  )
+            {
+               // call the function
+               ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl -1 ] , thesrc, ARRLEN );
+            }
+           );
+
+  if ( !mute )
+       printf( "%02.8f;%013.3f\n", elapsed - dt_test_overhead,
+                              ( 1.0f * max * ARRLEN / ( elapsed - dt_test_overhead ))  );
+
+ return NE10_OK;
+}
+
+arm_result_t run_test( int argc, char **argv ) // parses the command line, then benchmarks one implementation or cross-checks all of them
+{
+   if ( argc == 2 ) // requesting the number of available operations/routines in this unit
+   {
+      opcode = atoi ( argv[1] ); // get the command being requested, 0 = return the number of functions in this unit
+      if ( opcode == 0 ) return OP_COUNT;
+      exit( NE10_ERR );
+   } else if ( argc == 4 ) // requesting a particular implementation of one of the operations
+   {
+      opcode = atoi ( argv[1] );
+      if ( opcode <= 0 ) exit( NE10_ERR );
+      impl   = atoi ( argv[2] );
+      if ( impl   < 0 ) exit( NE10_ERR ); // impl == 0 means run all and compare the results to verify they produce identical outputs
+      if ( atoi ( argv[3] ) <= 0 ) exit( NE10_ERR ); // test the signed value: "max" is unsigned, so a negative count would otherwise wrap around
+      max = atoi ( argv[3] );
+   } else exit( NE10_ERR );
+
+   // initialize the table with NULL
+   memset( ftbl, 0, sizeof(ftbl));
+
+   // manually initialize the functions which have actual implementations
+   init_ftbl(); // this function is implemented in the unit test source file
+
+  if ( opcode <= 0 || opcode > OP_COUNT
+       || impl < 0 || impl > IMPL_COUNT )
+  {
+      fprintf ( stderr, "\t WARNING: Operation number %d and/or implementaion number %d are not acceptable values. \n", opcode, impl );
+      exit( NE10_ERR );
+  }
+
+  if ( impl == 0 ) // run all implementations and verify
+  {
+      // first, make sure all of the implementations do exist
+      for ( i = FTBL_IDX(opcode, 1); i <= FTBL_IDX(opcode, IMPL_COUNT); i++ )
+      {
+        if ( NULL == ftbl[i] )
+        {
+                fprintf ( stderr, "\t WARNING: One or more implementations of operation number %d were not found. \n", opcode );
+                exit( NE10_ERR );
+        }
+      }
+
+      // try all the implementations here..
+             mute = 1; // do not print anything
+
+             // opcode remains the same but we iterate through different implementations here..
+             for ( impl= 1; impl <= IMPL_COUNT; impl ++ )
+             {
+                 test_operation();
+             }
+
+             // now verify
+             int warns = 0;
+             int item_width = opcode; // there's no easy way to guess the actual number of an item's components but using the opcode (1=float, 2=vec2, ...)
+             arm_float_t * _output = (arm_float_t*) malloc( IMPL_COUNT * sizeof(arm_float_t) * item_width ); // one item for each implementation, c, asm, neon...
+             if ( NULL == _output ) { fprintf ( stderr, "\t FATAL ERROR: Could not allocate the comparison buffer. \n" ); exit( NE10_ERR ); }
+             for ( i = 0; i < ARRLEN; i++ )
+             {
+                 for ( impl= 1; impl <= IMPL_COUNT; impl ++ )
+                 {
+                     memcpy ( &_output[  (impl-1) * item_width  ], &thedst[ impl-1 ][ i * item_width ],  sizeof(arm_float_t) * item_width  );
+                 }
+
+                 int pos = 0;
+                 for ( impl = 2; impl <= IMPL_COUNT; impl ++ ) // compare the output from the 2nd, 3rd, 4th, etc. to the first one so start at 2
+                 {
+                     for ( pos = 0; pos < item_width; pos++ ) // compare corresponding components of the items
+                     {
+                         assert ( _output[ ((1-1)*item_width)+pos ] == _output[ ((1-1)*item_width)+pos ] ); // check for not-a-number
+                         assert ( _output[ ((impl-1)*item_width)+pos ] == _output[ ((impl-1)*item_width)+pos ] );  // check for not-a-number
+
+                         if ( ! EQUALS_FLOAT( _output[ ((1-1)*item_width)+pos ] , _output[ ((impl-1)*item_width)+pos ] , ACCEPTABLE_ERROR ) )
+                         { fprintf( stderr, "\t\t WARNING: In opcode [%d], implementation [1] != implemenation [%d] on item [%u -> %d]\n",
+                                    opcode, impl, i, pos+1 );
+                             warns++; }
+
+                         // stop once ACCEPTABLE_WARNS mismatches have been reported
+                         if ( warns >= ACCEPTABLE_WARNS )
+                         {    fprintf ( stderr, "\t WARNING: One or more mismatching values were found. \n" );
+                              exit( NE10_ERR );
+                         }
+                     }
+                 }
+             }
+             free( _output ); _output = (arm_float_t *) NULL;
+
+             if ( warns >= ACCEPTABLE_WARNS ) // defensive: the loop above already exits when this limit is reached
+             {
+               fprintf ( stderr, "\t WARNING: One or more mismatching values were found. \n" );
+               exit( NE10_ERR );
+             }
+             // Success: fall through to the shared cleanup below, which frees the test
+             // buffers and returns NE10_OK (returning from here used to leak them).
+  }
+  else // run a particular implementation
+  {
+      if ( NULL == ftbl[ FTBL_IDX(opcode, impl) ] ) // never call through an empty slot of the function table
+      { fprintf ( stderr, "\t WARNING: Implementation number %d of operation number %d was not found. \n", impl, opcode ); exit( NE10_ERR ); }
+      if ( !mute ) printf( "opcode=%d;impl=%d;%d;%u;", opcode, impl, ARRLEN, max );
+      // get the overhead of an empty loop with the same iteration count
+      MEASURE( dt_test_overhead,
+               for ( i = 0 ; i < max; i++ )
+               {
+               }
+              );
+       test_operation();
+  }
+
+  // free any allocated memory and drop the stale pointers, so that a later call
+  // to test_operation() allocates fresh buffers instead of touching freed ones
+  free( guarded_src ); guarded_src = NULL; thesrc = NULL;
+  for ( i = 0; i<IMPL_COUNT; i++ )
+  {
+    free( guarded_dst[i] ); guarded_dst[i] = NULL; thedst[i] = NULL;
+  }
+  done_init = 0;
+
+  return NE10_OK;
+}
diff --git a/headers/unit_test_common.h b/headers/unit_test_common.h
new file mode 100644
index 0000000..4648170
--- /dev/null
+++ b/headers/unit_test_common.h
@@ -0,0 +1,179 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : headers/unit_test_common.h
+ */
+
+#ifndef __UNIT_TEST_COMMON__
+#define __UNIT_TEST_COMMON__
+
+// Make sure the following values are defined before including this header file:
+// 1- length of the data arrays
+//     #define ARRLEN
+// 2- number of the operations in a given unit
+//     #define OP_COUNT
+// 3- number of the different implementations of each of the functions (C, ASM, NEON, ...)
+//     #define IMPL_COUNT
+#ifndef ARRLEN
+ #error Please define ARRLEN
+#endif // ARRLEN
+#ifndef OP_COUNT
+ #error Please define OP_COUNT
+#endif // OP_COUNT
+#ifndef IMPL_COUNT
+ #error Please define IMPL_COUNT
+#endif // IMPL_COUNT
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <math.h>
+
+#include <sys/types.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+
+#include "../headers/macros.h"
+#include "NE10.h"
+#include "../headers/NE10_random.h"
+
+// length of the test data arrays
+// A number that is not divisible by 2, 3 or 4, so that all the
+//  execution paths are tested; the larger the number, the more
+//  random values are stored in the arrays and passed in
+//  as the input stream.
+// 2^11 + 3 = 2051, it is not divisible by 2, 3, or 4
+#define TEST_ARRLEN 2051
+
+// NAN_OR_INF is to check whether the value is a NAN or an INF
+#define NAN_OR_INF (0xFF << 23)
+// SIGNBIT_MASK keeps every bit but the sign bit; EXPONENT_MASK keeps every bit but the 8 exponent bits
+#define SIGNBIT_MASK  0x7FFFFFFF
+#define EXPONENT_MASK 0x807FFFFF
+
+// What's the acceptable error between the integer representations of two float values
+#define ACCEPTABLE_ERROR 2
+
+// What's the acceptable number of warnings in a test
+#define ACCEPTABLE_WARNS 10
+
+// Fills arr[0..count-1] with values taken from NE10_float_rng_next() after re-seeding the generator with time(NULL).
+static inline void FILL_FLOAT_ARRAY( float *arr, unsigned int count ) // 'static': a plain C99 'inline' in a header emits no linkable definition
+{
+    unsigned int i = 0;
+
+    sleep ( 1 ); // presumably so that time(NULL) below differs from the seed of a previous fill - TODO confirm
+
+    NE10_float_rng_init( time(NULL) );
+    for ( i = 0; i < count; i++ )
+    {
+      arr[i] = NE10_float_rng_next();
+    }
+}
+
+// Fills arr[0..count-1] with values taken from NE10_float_rng_limit_next() after re-seeding that generator with time(NULL).
+static inline void FILL_FLOAT_ARRAY_LIMIT( float *arr, unsigned int count ) // 'static': a plain C99 'inline' in a header emits no linkable definition
+{
+    unsigned int i = 0;
+
+    sleep ( 1 ); // presumably so that time(NULL) below differs from the seed of a previous fill - TODO confirm
+
+    NE10_float_rng_limit_init( time(NULL) );
+    for ( i = 0; i < count; i++ )
+    {
+      arr[i] = NE10_float_rng_limit_next();
+    }
+}
+
+// Returns 1 (TRUE) when fa and fb are identical or their integer representations are within 'err' of each other, 0 (FALSE) otherwise.
+// Non-identical INF/NAN operands are rejected, subnormals are treated as zero, and a HINT is printed to stderr on a mismatch.
+static inline int EQUALS_FLOAT( float fa, float fb , unsigned int err )
+{
+  union
+  {
+    int          vi;
+    float        vf;
+  } conv1, conv2;
+
+  if ( fa == fb ) return 1; // if identical, then return TRUE
+
+  conv1.vf = fa;
+  conv2.vf = fb;
+
+  if ( (conv1.vi & NAN_OR_INF) == NAN_OR_INF ) { return 0; } // INF or NAN, unacceptable return FALSE
+  if ( (conv2.vi & NAN_OR_INF) == NAN_OR_INF ) { return 0; } // INF or NAN, unacceptable return FALSE
+
+  int cut1 = conv1.vi & SIGNBIT_MASK; // drop the sign bit - i.e. the left most bit
+  int cut2 = conv2.vi & SIGNBIT_MASK;
+
+  if ( (cut1 & EXPONENT_MASK) == cut1 ) { cut1 = 0; } // zero out subnormal float values
+  if ( (cut2 & EXPONENT_MASK) == cut2 ) { cut2 = 0; } // zero out subnormal float values
+
+  // the magnitudes alone cannot tell x from -x, so two non-zero values of opposite sign are always a mismatch
+  int opposite = ( cut1 != 0 && cut2 != 0 && ( (conv1.vi ^ conv2.vi) < 0 ) );
+
+  if ( opposite || (unsigned int) abs( cut1 - cut2 ) > err ) // distance between the integer representations
+  {  // then we have an unacceptable error
+     unsigned int ui1, ui2;
+
+     memcpy( &ui1,  &fa, sizeof(float) );
+     memcpy( &ui2,  &fb, sizeof(float) );
+     fprintf( stderr, "HINT: %e (0x%04X) != %e (0x%04X) ", fa, ui1, fb, ui2 );
+
+     return 0;
+  }
+  return 1; // acceptable, return TRUE
+}
+
+char ARRAY_GUARD_SIG[] = {      0x66, 0xAB, 0xCD, 0xAB, // byte pattern that GUARD_ARRAY copies to both ends of an array
+                                0xCD, 0xAB, 0xCD, 0xAB,
+                                0xCD, 0xAB, 0xCD, 0xAB,
+                                0xCD, 0xAB, 0xCD, 0x99 };
+#define ARRAY_GUARD_LEN 16 // number of bytes in ARRAY_GUARD_SIG
+
+// Writes the ARRAY_GUARD_LEN byte signature over the first and the last ARRAY_GUARD_LEN bytes of 'array'; 'array_length' is the total size in bytes.
+static inline int GUARD_ARRAY( void* array, unsigned int array_length )
+{
+  char* the_array = (char*) array;
+  if ( NULL == array || array_length < (2*ARRAY_GUARD_LEN) ) return 0; // FALSE, nothing written: no array, or no room for the two guards
+  memcpy( the_array, ARRAY_GUARD_SIG, ARRAY_GUARD_LEN );
+  memcpy( &the_array[array_length-ARRAY_GUARD_LEN], ARRAY_GUARD_SIG, ARRAY_GUARD_LEN );
+  return 1;
+}
+
+// Returns 1 (TRUE) when both the leading and the trailing ARRAY_GUARD_LEN bytes of 'array' still hold the guard signature,
+// and returns 0 (FALSE), after reporting on stderr, otherwise. 'array_length' is the total size in bytes, guards included.
+static inline int CHECK_ARRAY_GUARD( void* array, unsigned int array_length )
+{
+  char* the_array = (char*) array;
+  // memcmp, not strncmp: the guards are raw bytes, not NUL-terminated strings
+  if ( memcmp(the_array, ARRAY_GUARD_SIG, ARRAY_GUARD_LEN) ) {
+      fprintf( stderr, " ERROR: Array guard signature is wrong. \n" );
+      return 0; // Match not found, return FALSE
+  }
+  if ( memcmp(&the_array[array_length-ARRAY_GUARD_LEN], ARRAY_GUARD_SIG, ARRAY_GUARD_LEN)  ) {
+      fprintf( stderr, " ERROR: Array guard signature is wrong. \n" );
+      return 0; // Match not found, return FALSE
+  }
+  return 1;
+}
+
+#endif // __UNIT_TEST_COMMON__
+
diff --git a/headers/unit_test_len_operation_x.h b/headers/unit_test_len_operation_x.h
new file mode 100644
index 0000000..ebc8b13
--- /dev/null
+++ b/headers/unit_test_len_operation_x.h
@@ -0,0 +1,226 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : headers/unit_test_len_operation_x.h
+ */
+
+#include "./unit_test_common.h"
+
+// This function signature applies the operations with the format "*c_*_*" (e.g. 'add'c_'float'_'neon')
+typedef arm_result_t (*arm_func_3args_t)(arm_float_t * dst, void * src, unsigned int count);
+arm_func_3args_t ftbl[ OP_COUNT * IMPL_COUNT ];
+// ftbl is cleared by run_test() and then read through FTBL_IDX(opcode, impl)
+
+// this function is implemented in the unit test source files
+// it is meant to initialise the function table defined above.
+extern void init_ftbl();
+
+// state shared between run_test() and test_operation()
+unsigned int i = 0;   // loop iterator
+unsigned int max = 0; // number of iterations in each function
+int opcode = -1; // the operation which will be tested (a single unit can have any number of operations/functions)
+int impl   = -1; // selects which particular implementation of the chosen operation must run
+int mute   = 0;   // 0 == print output;   1 == do not print anything;
+// before/after/lapsed/dummy/zone are not referenced directly in this header - presumably used by MEASURE (macros.h)
+struct timeval  before, after, lapsed, dummy;
+double dt_test_overhead = 0.0; // MEASURE result for an empty loop of 'max' iterations (see run_test)
+double dt_test_sample = 0.0; // MEASURE result for the single sample call
+double elapsed = 0.0; // MEASURE result for the loop of 'max' calls
+struct timezone zone;
+
+// there is a max of "4" components in a vec
+#define MAX_VEC_COMPONENTS 4
+// guarded_* are the malloc'ed buffers including their guard bytes; the* point at the payload just past the leading guard
+arm_float_t * guarded_src = NULL;
+arm_float_t * guarded_dst[IMPL_COUNT];
+
+arm_float_t * thesrc = NULL;
+arm_float_t * thedst[IMPL_COUNT]; // output from different implementations are stored in separate arrays for verification
+int done_init = 0; // set to 1 once test_operation() has allocated the buffers
+
+arm_result_t test_operation() // runs ftbl[ FTBL_IDX(opcode, impl) ] under MEASURE; returns NE10_OK, exits with NE10_ERR on a guard or allocation failure
+{
+  const unsigned int fixed_length = ARRLEN * sizeof(arm_float_t) * MAX_VEC_COMPONENTS;
+  const unsigned int guarded_length = (2*ARRAY_GUARD_LEN) + fixed_length; // the payload plus one guard at each end
+  // initialize if not done so
+  if ( 0 == done_init )
+  {
+    guarded_src = (arm_float_t*) malloc( guarded_length );
+    if ( NULL == guarded_src || ! GUARD_ARRAY( guarded_src, guarded_length ) ) { fprintf ( stderr, "\t FATAL ERROR: Could not allocate the test arrays. \n" ); exit( NE10_ERR ); }
+    thesrc = (arm_float_t*) ( (char*)guarded_src + ARRAY_GUARD_LEN ); // skip the leading guard; char* because arithmetic on void* is not standard C
+    FILL_FLOAT_ARRAY_LIMIT( thesrc, ARRLEN * MAX_VEC_COMPONENTS ); // random initialization
+
+    for ( i = 0; i<IMPL_COUNT; i++ )
+    {
+      guarded_dst[i] = (arm_float_t*) malloc( guarded_length );
+      if ( NULL == guarded_dst[i] || ! GUARD_ARRAY( guarded_dst[i], guarded_length ) ) { fprintf ( stderr, "\t FATAL ERROR: Could not allocate the test arrays. \n" ); exit( NE10_ERR ); }
+      thedst[i] = (arm_float_t*) ( (char*)guarded_dst[i] + ARRAY_GUARD_LEN );
+    }
+
+    done_init = 1;
+  }
+
+  // sample run
+  MEASURE( dt_test_sample,
+    ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl-1 ] , thesrc, ARRLEN );
+  );
+  if ( ! CHECK_ARRAY_GUARD(guarded_dst[ impl -1 ], guarded_length) ||
+       ! CHECK_ARRAY_GUARD(guarded_src, guarded_length) )
+  {
+                fprintf ( stderr, "\t FATAL ERROR: Operation number %d implementation [%d] has failed the guard test. \n", opcode, impl );
+                exit( NE10_ERR );
+  }
+
+  // this test to make sure passing zero as the length won't cause segmentation faults
+  ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl-1 ] , thesrc, 0 );
+
+  // actual test
+          MEASURE( elapsed,
+            for ( i = 0; i < max; i++  )
+            {
+               // call the function
+               ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl -1 ] , thesrc, ARRLEN );
+            }
+           );
+
+  if ( !mute )
+       printf( "%02.8f;%013.3f\n", elapsed - dt_test_overhead,
+                              ( 1.0f * max * ARRLEN / ( elapsed - dt_test_overhead ))  );
+
+ return NE10_OK;
+}
+
+arm_result_t run_test( int argc, char **argv ) // argv: "0" to query OP_COUNT, or "<opcode> <impl> <iterations>" where impl 0 runs and compares all implementations
+{
+   if ( argc == 2 ) // requesting the number of available operations/routines in this unit
+   {
+      opcode = atoi ( argv[1] ); // get the command being requested, 0 = return the number of functions in this unit
+      if ( opcode == 0 ) return OP_COUNT;
+      exit( NE10_ERR );
+   } else if ( argc == 4 ) // requesting a particular implementation of one of the operations
+   {
+      opcode = atoi ( argv[1] );
+      if ( opcode <= 0 ) exit( NE10_ERR );
+      impl   = atoi ( argv[2] );
+      if ( impl   < 0 ) exit( NE10_ERR ); // impl == 0 means run all and compare the results to verify they produce identical outputs
+      if ( atoi ( argv[3] ) <= 0 ) exit( NE10_ERR ); // check the sign before storing: 'max' is unsigned, so a negative count would wrap to a huge one
+      max = (unsigned int) atoi ( argv[3] );
+   } else exit( NE10_ERR );
+
+   // initialize the table with NULL
+   memset( ftbl, 0, sizeof(ftbl));
+
+   // manually initialize the functions which have actual implementations
+   init_ftbl(); // this function is implemented in the unit test source files
+
+  if ( opcode <= 0 || opcode > OP_COUNT
+       || impl < 0 || impl > IMPL_COUNT )
+  {
+      fprintf ( stderr, "\t WARNING: Operation number %d and/or implementaion number %d are not acceptable values. \n", opcode, impl );
+      exit( NE10_ERR );
+  }
+
+  if ( impl == 0 ) // run all implementations and verify
+  {
+      // first, make sure all of the implementations do exist
+      for ( i = FTBL_IDX(opcode, 1); i <= FTBL_IDX(opcode, IMPL_COUNT); i++ )
+      {
+        if ( NULL == ftbl[i] )
+        {
+                fprintf ( stderr, "\t WARNING: One or more implementations of operation number %d were not found. \n", opcode );
+                exit( NE10_ERR );
+        }
+      }
+
+      // try all the implementations here..
+             mute = 1; // do not print anything
+
+             // opcode remains the same but we iterate through different implementations here..
+             for ( impl= 1; impl <= IMPL_COUNT; impl ++ )
+             {
+                 test_operation();
+             }
+
+             // now verify
+             int warns = 0;
+             int item_width = 1; // LEN() is always a scalar
+             arm_float_t * _output = (arm_float_t*) malloc( IMPL_COUNT * sizeof(arm_float_t) * item_width ); // one item for each implementation, c, asm, neon...
+             if ( NULL == _output ) { fprintf ( stderr, "\t FATAL ERROR: Could not allocate the verification buffer. \n" ); exit( NE10_ERR ); }
+
+             for ( i = 0; i < ARRLEN; i++ )
+             {
+                 for ( impl= 1; impl <= IMPL_COUNT; impl ++ )
+                 {
+                     memcpy ( &_output[  (impl-1) * item_width  ], &thedst[ impl-1 ][ i * item_width ],  sizeof(arm_float_t) * item_width  );
+                 }
+
+                 int pos = 0;
+                 for ( impl = 2; impl <= IMPL_COUNT; impl ++ ) // compare the output from the 2nd, 3rd, 4th, etc. to the first one so start at 2
+                 {
+                     for ( pos = 0; pos < item_width; pos++ ) // compare corresponding components of the items
+                     {
+                         assert ( _output[ ((1-1)*item_width)+pos ] == _output[ ((1-1)*item_width)+pos ] ); // check for not-a-number
+                         assert ( _output[ ((impl-1)*item_width)+pos ] == _output[ ((impl-1)*item_width)+pos ] );  // check for not-a-number
+
+                         if ( ! EQUALS_FLOAT( _output[ ((1-1)*item_width)+pos ] , _output[ ((impl-1)*item_width)+pos ] , 0xFF ) ) // accept larger errors as we're doing a single step
+                         { fprintf( stderr, "\t\t WARNING: In opcode [%d], implementation [1] != implemenation [%d] on item [%u -> %d]\n",
+                                    opcode, impl, i, pos+1 );
+                             warns++; }
+
+                         // stop after ACCEPTABLE_WARNS warnings
+                         if ( warns >= ACCEPTABLE_WARNS )
+                         {    fprintf ( stderr, "\t WARNING: One or more mismatching values were found. \n" );
+                              exit( NE10_ERR );
+                         }
+                     }
+                 }
+             }
+             free( _output ); _output = (arm_float_t *) NULL;
+
+             if ( warns >= ACCEPTABLE_WARNS ) // defensive: the loop above already exits once this limit is reached
+             {
+               fprintf ( stderr, "\t WARNING: One or more mismatching values were found. \n" );
+               exit( NE10_ERR );
+             }
+             // fewer than ACCEPTABLE_WARNS mismatches: fall through to the common clean-up below
+             // so that the guarded arrays are released before NE10_OK is returned
+  }
+  else // run a particular implementation
+  {
+      if ( !mute ) printf( "opcode=%d;impl=%d;%d;%u;", opcode, impl, ARRLEN, max );
+
+      // get the overhead
+      MEASURE( dt_test_overhead,
+               for ( i = 0 ; i < max; i++ )
+               {
+               }
+              );
+
+       test_operation();
+  }
+
+
+
+  // free any allocated memory...
+  free( guarded_src );
+  for ( i = 0; i<IMPL_COUNT; i++ )
+  {
+    free( guarded_dst[i] );
+  }
+
+  return NE10_OK;
+}
diff --git a/headers/unit_test_mla_operation_x.h b/headers/unit_test_mla_operation_x.h
new file mode 100644
index 0000000..77bf46a
--- /dev/null
+++ b/headers/unit_test_mla_operation_x.h
@@ -0,0 +1,242 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : headers/unit_test_mla_operation_x.h
+ */
+
+#include "./unit_test_common.h"
+
+// This function signature applies the operations with the format "*c_*_*" (e.g. 'add'c_'float'_'neon')
+typedef arm_result_t (*arm_func_5args_t)(void * dst, void * acc, void * src1, void * src2, unsigned int count);
+arm_func_5args_t ftbl[ OP_COUNT * IMPL_COUNT ];
+// ftbl is cleared by run_test() and then read through FTBL_IDX(opcode, impl)
+
+// this function is implemented in the unit test source files
+// it is meant to initialise the function table defined above.
+extern void init_ftbl();
+
+// state shared between run_test() and test_operation()
+unsigned int i = 0;   // loop iterator
+unsigned int max = 0; // number of iterations in each function
+int opcode = -1; // the operation which will be tested (a single unit can have any number of operations/functions)
+int impl   = -1; // selects which particular implementation of the chosen operation must run
+int mute   = 0;   // 0 == print output;   1 == do not print anything;
+// before/after/lapsed/dummy/zone are not referenced directly in this header - presumably used by MEASURE (macros.h)
+struct timeval  before, after, lapsed, dummy;
+double dt_test_overhead = 0.0; // MEASURE result for an empty loop of 'max' iterations (see run_test)
+double dt_test_sample = 0.0; // MEASURE result for the single sample call
+double elapsed = 0.0; // MEASURE result for the loop of 'max' calls
+struct timezone zone;
+
+// there is a max of "4" components in a vec
+#define MAX_VEC_COMPONENTS 4
+// guarded_* are the malloc'ed buffers including their guard bytes; the* point at the payload just past the leading guard
+arm_float_t * guarded_acc = NULL;
+arm_float_t * guarded_src1 = NULL;
+arm_float_t * guarded_src2 = NULL;
+arm_float_t * guarded_dst[IMPL_COUNT];
+
+arm_float_t * theacc = NULL;
+arm_float_t * thesrc1 = NULL;
+arm_float_t * thesrc2 = NULL;
+arm_float_t * thedst[IMPL_COUNT]; // output from different implementations are stored in separate arrays for verification
+int done_init = 0; // set to 1 once test_operation() has allocated the buffers
+
+arm_result_t test_operation() // runs ftbl[ FTBL_IDX(opcode, impl) ] under MEASURE; returns NE10_OK, exits with NE10_ERR on a guard or allocation failure
+{
+  const unsigned int fixed_length = ARRLEN * sizeof(arm_float_t) * MAX_VEC_COMPONENTS;
+  const unsigned int guarded_length = (2*ARRAY_GUARD_LEN) + fixed_length; // the payload plus one guard at each end
+  // initialize if not done so
+  if ( 0 == done_init )
+  {
+    guarded_acc = (arm_float_t*) malloc( guarded_length );
+    if ( NULL == guarded_acc || ! GUARD_ARRAY( guarded_acc, guarded_length ) ) { fprintf ( stderr, "\t FATAL ERROR: Could not allocate the test arrays. \n" ); exit( NE10_ERR ); }
+    theacc = (arm_float_t*) ( (char*)guarded_acc + ARRAY_GUARD_LEN ); // skip the leading guard; char* because arithmetic on void* is not standard C
+    FILL_FLOAT_ARRAY( theacc, ARRLEN * MAX_VEC_COMPONENTS ); // random initialization
+
+    guarded_src1 = (arm_float_t*) malloc( guarded_length );
+    if ( NULL == guarded_src1 || ! GUARD_ARRAY( guarded_src1, guarded_length ) ) { fprintf ( stderr, "\t FATAL ERROR: Could not allocate the test arrays. \n" ); exit( NE10_ERR ); }
+    thesrc1 = (arm_float_t*) ( (char*)guarded_src1 + ARRAY_GUARD_LEN );
+    FILL_FLOAT_ARRAY( thesrc1, ARRLEN * MAX_VEC_COMPONENTS ); // random initialization
+
+    guarded_src2 = (arm_float_t*) malloc( guarded_length );
+    if ( NULL == guarded_src2 || ! GUARD_ARRAY( guarded_src2, guarded_length ) ) { fprintf ( stderr, "\t FATAL ERROR: Could not allocate the test arrays. \n" ); exit( NE10_ERR ); }
+    thesrc2 = (arm_float_t*) ( (char*)guarded_src2 + ARRAY_GUARD_LEN );
+    FILL_FLOAT_ARRAY( thesrc2, ARRLEN * MAX_VEC_COMPONENTS ); // random initialization
+
+    for ( i = 0; i<IMPL_COUNT; i++ )
+    {
+      guarded_dst[i] = (arm_float_t*) malloc( guarded_length );
+      if ( NULL == guarded_dst[i] || ! GUARD_ARRAY( guarded_dst[i], guarded_length ) ) { fprintf ( stderr, "\t FATAL ERROR: Could not allocate the test arrays. \n" ); exit( NE10_ERR ); }
+      thedst[i] = (arm_float_t*) ( (char*)guarded_dst[i] + ARRAY_GUARD_LEN );
+    }
+
+    done_init = 1;
+  }
+
+  // sample run
+  MEASURE( dt_test_sample,
+    ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl-1 ] , theacc, thesrc1, thesrc2, ARRLEN );
+  );
+  if ( ! CHECK_ARRAY_GUARD(guarded_dst[ impl -1 ], guarded_length) ||
+       ! CHECK_ARRAY_GUARD(guarded_acc, guarded_length) ||
+       ! CHECK_ARRAY_GUARD(guarded_src1, guarded_length) ||
+       ! CHECK_ARRAY_GUARD(guarded_src2, guarded_length) )
+  {
+                fprintf ( stderr, "\t FATAL ERROR: Operation number %d implementation [%d] has failed the guard test. \n", opcode, impl );
+                exit( NE10_ERR );
+  }
+
+  // this test to make sure passing zero as the length won't cause segmentation faults
+  ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl-1 ] , theacc, thesrc1, thesrc2, 0 );
+  // actual test
+          MEASURE( elapsed,
+            for ( i = 0; i < max; i++  )
+            {
+               // call the function
+               ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl -1 ] , theacc, thesrc1, thesrc2, ARRLEN );
+            }
+           );
+
+  if ( !mute )
+       printf( "%02.8f;%013.3f\n", elapsed - dt_test_overhead,
+                              ( 1.0f * max * ARRLEN / ( elapsed - dt_test_overhead ))  );
+
+ return NE10_OK;
+}
+
+arm_result_t run_test( int argc, char **argv ) // argv: "0" to query OP_COUNT, or "<opcode> <impl> <iterations>" where impl 0 runs and compares all implementations
+{
+   if ( argc == 2 ) // requesting the number of available operations/routines in this unit
+   {
+      opcode = atoi ( argv[1] ); // get the command being requested, 0 = return the number of functions in this unit
+      if ( opcode == 0 ) return OP_COUNT;
+      exit( NE10_ERR );
+   } else if ( argc == 4 ) // requesting a particular implementation of one of the operations
+   {
+      opcode = atoi ( argv[1] );
+      if ( opcode <= 0 ) exit( NE10_ERR );
+      impl   = atoi ( argv[2] );
+      if ( impl   < 0 ) exit( NE10_ERR ); // impl == 0 means run all and compare the results to verify they produce identical outputs
+      if ( atoi ( argv[3] ) <= 0 ) exit( NE10_ERR ); // check the sign before storing: 'max' is unsigned, so a negative count would wrap to a huge one
+      max = (unsigned int) atoi ( argv[3] );
+   } else exit( NE10_ERR );
+
+   // initialize the table with NULL
+   memset( ftbl, 0, sizeof(ftbl));
+
+   // manually initialize the functions which have actual implementations
+   init_ftbl(); // this function is implemented in the unit test source file
+
+  if ( opcode <= 0 || opcode > OP_COUNT
+       || impl < 0 || impl > IMPL_COUNT )
+  {
+      fprintf ( stderr, "\t WARNING: Operation number %d and/or implementaion number %d are not acceptable values. \n", opcode, impl );
+      exit( NE10_ERR );
+  }
+
+  if ( impl == 0 ) // run all implementations and verify
+  {
+      // first, make sure all of the implementations do exist
+      for ( i = FTBL_IDX(opcode, 1); i <= FTBL_IDX(opcode, IMPL_COUNT); i++ )
+      {
+        if ( NULL == ftbl[i] )
+        {
+                fprintf ( stderr, "\t WARNING: One or more implementations of operation number %d were not found. \n", opcode );
+                exit( NE10_ERR );
+        }
+      }
+
+      // try all the implementations here..
+             mute = 1; // do not print anything
+
+             // opcode remains the same but we iterate through different implementations here..
+             for ( impl= 1; impl <= IMPL_COUNT; impl ++ )
+             {
+                 test_operation();
+             }
+
+             // now verify
+             int warns = 0;
+             int item_width = opcode; // there's no easy way to guess the actual number of an item's components but using the opcode (1=float, 2=vec2, ...)
+             arm_float_t * _output = (arm_float_t*) malloc( IMPL_COUNT * sizeof(arm_float_t) * item_width ); // one item for each implementation, c, asm, neon...
+             if ( NULL == _output ) { fprintf ( stderr, "\t FATAL ERROR: Could not allocate the verification buffer. \n" ); exit( NE10_ERR ); }
+             for ( i = 0; i < ARRLEN; i++ )
+             {
+                 for ( impl= 1; impl <= IMPL_COUNT; impl ++ )
+                 {
+                     memcpy ( &_output[  (impl-1) * item_width  ], &thedst[ impl-1 ][ i * item_width ],  sizeof(arm_float_t) * item_width  );
+                 }
+
+                 int pos = 0;
+                 for ( impl = 2; impl <= IMPL_COUNT; impl ++ ) // compare the output from the 2nd, 3rd, 4th, etc. to the first one so start at 2
+                 {
+                     for ( pos = 0; pos < item_width; pos++ ) // compare corresponding components of the items
+                     {
+                         assert ( _output[ ((1-1)*item_width)+pos ] == _output[ ((1-1)*item_width)+pos ] ); // check for not-a-number
+                         assert ( _output[ ((impl-1)*item_width)+pos ] == _output[ ((impl-1)*item_width)+pos ] );  // check for not-a-number
+
+                         if ( ! EQUALS_FLOAT( _output[ ((1-1)*item_width)+pos ] , _output[ ((impl-1)*item_width)+pos ], ACCEPTABLE_ERROR ) )
+                         { fprintf( stderr, "\t\t WARNING: In opcode [%d], implementation [1] != implemenation [%d] on item [%u -> %d]\n",
+                                    opcode, impl, i, pos+1 );
+                             warns++; }
+
+                         // stop after ACCEPTABLE_WARNS warnings
+                         if ( warns >= ACCEPTABLE_WARNS )
+                         {    fprintf ( stderr, "\t WARNING: One or more mismatching values were found. \n" );
+                              exit( NE10_ERR );
+                         }
+                     }
+                 }
+             }
+             free( _output ); _output = (arm_float_t *) NULL;
+
+             if ( warns >= ACCEPTABLE_WARNS ) // defensive: the loop above already exits once this limit is reached
+             {
+               fprintf ( stderr, "\t WARNING: One or more mismatching values were found. \n" );
+               exit( NE10_ERR );
+             }
+             // fewer than ACCEPTABLE_WARNS mismatches: fall through to the common clean-up below
+             // so that the guarded arrays are released before NE10_OK is returned
+  }
+  else // run a particular implementation
+  {
+      if ( !mute ) printf( "opcode=%d;impl=%d;%d;%u;", opcode, impl, ARRLEN, max );
+
+      // get the overhead
+      MEASURE( dt_test_overhead,
+               for ( i = 0 ; i < max; i++ )
+               {
+               }
+              );
+
+       test_operation();
+  }
+
+
+
+  // free any allocated memory...
+  free( guarded_acc );
+  free( guarded_src1 );
+  free( guarded_src2 );
+  for ( i = 0; i<IMPL_COUNT; i++ )
+  {
+    free( guarded_dst[i] );
+  }
+
+  return NE10_OK;
+}
diff --git a/headers/unit_test_mlac_operation_x.h b/headers/unit_test_mlac_operation_x.h
new file mode 100644
index 0000000..71333b8
--- /dev/null
+++ b/headers/unit_test_mlac_operation_x.h
@@ -0,0 +1,267 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : headers/unit_test_mlac_operation_x.h
+ */
+
+#include "./unit_test_common.h"
+
+// This function signature applies the operations with the format "*c_*_*" (e.g. 'add'c_'float'_'neon')
+typedef arm_result_t (*arm_func_5args_t)(void * dst, void * acc, void * src, const void * cst, unsigned int count);
+arm_func_5args_t ftbl[ OP_COUNT * IMPL_COUNT ];
+// ftbl is cleared by run_test() and then read through FTBL_IDX(opcode, impl)
+
+// this function is implemented in the unit test source files
+// it is meant to initialise the function table defined above.
+extern void init_ftbl();
+
+// state shared between run_test() and test_operation()
+unsigned int i = 0;   // loop iterator
+unsigned int max = 0; // number of iterations in each function
+int opcode = -1; // the operation which will be tested (a single unit can have any number of operations/functions)
+int impl   = -1; // selects which particular implementation of the chosen operation must run
+int mute   = 0;   // 0 == print output;   1 == do not print anything;
+// before/after/lapsed/dummy/zone are not referenced directly in this header - presumably used by MEASURE (macros.h)
+struct timeval  before, after, lapsed, dummy;
+double dt_test_overhead = 0.0; // subtracted from 'elapsed' when test_operation() prints its timing
+double dt_test_sample = 0.0; // MEASURE result for the single sample call
+double elapsed = 0.0; // MEASURE result for the loop of 'max' calls
+struct timezone zone;
+
+// there is a max of "4" components in a vec
+#define MAX_VEC_COMPONENTS 4
+// guarded_* are the malloc'ed buffers including their guard bytes; the* point at the payload just past the leading guard
+arm_float_t * guarded_cst = NULL;
+arm_float_t * guarded_acc = NULL;
+arm_float_t * guarded_src = NULL;
+arm_float_t * guarded_dst[IMPL_COUNT];
+
+arm_float_t * thecst = NULL;
+arm_float_t * theacc = NULL;
+arm_float_t * thesrc = NULL;
+arm_float_t * thedst[IMPL_COUNT]; // output from different implementations are stored in separate arrays for verification
+int done_init = 0; // set to 1 once test_operation() has allocated the buffers
+
+arm_result_t test_operation() // runs ftbl[ FTBL_IDX(opcode, impl) ] under MEASURE; returns NE10_OK, exits with NE10_ERR on a guard or allocation failure
+{
+  const unsigned int fixed_length = ARRLEN * sizeof(arm_float_t) * MAX_VEC_COMPONENTS;
+  const unsigned int guarded_length = (2*ARRAY_GUARD_LEN) + fixed_length, guarded_cst_length = (2*ARRAY_GUARD_LEN) + sizeof(arm_float_t) * MAX_VEC_COMPONENTS; // payload plus one guard at each end
+  // initialize if not done so
+  if ( 0 == done_init )
+  {
+    guarded_cst = (arm_float_t*) malloc( guarded_cst_length );
+    if ( NULL == guarded_cst || ! GUARD_ARRAY( guarded_cst, guarded_cst_length ) ) { fprintf ( stderr, "\t FATAL ERROR: Could not allocate the test arrays. \n" ); exit( NE10_ERR ); }
+    thecst = (arm_float_t*) ( (char*)guarded_cst + ARRAY_GUARD_LEN ); // skip the leading guard; char* because arithmetic on void* is not standard C
+    thecst[0] = (arm_float_t) 1.4f;
+    thecst[1] = (arm_float_t) 6.2f;
+    thecst[2] = (arm_float_t) 3.3f;
+    thecst[3] = (arm_float_t) 2.5f;
+
+    guarded_acc = (arm_float_t*) malloc( guarded_length );
+    if ( NULL == guarded_acc || ! GUARD_ARRAY( guarded_acc, guarded_length ) ) { fprintf ( stderr, "\t FATAL ERROR: Could not allocate the test arrays. \n" ); exit( NE10_ERR ); }
+    theacc = (arm_float_t*) ( (char*)guarded_acc + ARRAY_GUARD_LEN );
+    FILL_FLOAT_ARRAY( theacc, ARRLEN * MAX_VEC_COMPONENTS ); // random initialization
+
+    guarded_src = (arm_float_t*) malloc( guarded_length );
+    if ( NULL == guarded_src || ! GUARD_ARRAY( guarded_src, guarded_length ) ) { fprintf ( stderr, "\t FATAL ERROR: Could not allocate the test arrays. \n" ); exit( NE10_ERR ); }
+    thesrc = (arm_float_t*) ( (char*)guarded_src + ARRAY_GUARD_LEN );
+    FILL_FLOAT_ARRAY( thesrc, ARRLEN * MAX_VEC_COMPONENTS ); // random initialization
+
+    for ( i = 0; i<IMPL_COUNT; i++ )
+    {
+      guarded_dst[i] = (arm_float_t*) malloc( guarded_length );
+      if ( NULL == guarded_dst[i] || ! GUARD_ARRAY( guarded_dst[i], guarded_length ) ) { fprintf ( stderr, "\t FATAL ERROR: Could not allocate the test arrays. \n" ); exit( NE10_ERR ); }
+      thedst[i] = (arm_float_t*) ( (char*)guarded_dst[i] + ARRAY_GUARD_LEN );
+    }
+
+    done_init = 1;
+  }
+
+  // sample run
+  MEASURE( dt_test_sample,
+    ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl-1 ] , theacc, thesrc, thecst, ARRLEN );
+  );
+  if ( ! CHECK_ARRAY_GUARD(guarded_dst[ impl -1 ], guarded_length) ||
+       ! CHECK_ARRAY_GUARD(guarded_acc, guarded_length) ||
+       ! CHECK_ARRAY_GUARD(guarded_src, guarded_length) ||
+       ! CHECK_ARRAY_GUARD(guarded_cst, guarded_cst_length) )
+  {
+                fprintf ( stderr, "\t FATAL ERROR: Operation number %d implementation [%d] has failed the guard test. \n", opcode, impl );
+                exit( NE10_ERR );
+  }
+
+  // this test to make sure passing zero as the length won't cause segmentation faults
+  ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl-1 ] , theacc, thesrc, thecst, 0 );
+
+  // actual test
+  if ( 1 == opcode )
+  {  // in this case the const argument is not a pointer but an actual float value
+     union fp_bitwise {
+           arm_float_t _f;
+           unsigned int _i;
+      } _icst;
+
+          _icst._f = thecst[0]; // the bits of the float are passed in the pointer-typed 'cst' argument below
+
+          MEASURE( elapsed,
+            for ( i = 0; i < max; i++  )
+            {
+               // call the function
+               ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl -1 ] , theacc, thesrc, (void*)_icst._i, ARRLEN );
+            }
+           );
+  }
+  else
+  {
+          MEASURE( elapsed,
+            for ( i = 0; i < max; i++  )
+            {
+               // call the function
+               ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl -1 ] , theacc, thesrc, thecst, ARRLEN );
+            }
+           );
+  }
+
+  if ( !mute )
+       printf( "%02.8f;%013.3f\n", elapsed - dt_test_overhead,
+                              ( 1.0f * max * ARRLEN / ( elapsed - dt_test_overhead ))  );
+
+ return NE10_OK;
+}
+
+// Command-line driver for the unit tests built on this header:
+//   argv = { prog, "0" }                -> returns OP_COUNT, the number of operations in the unit
+//   argv = { prog, opcode, impl, max }  -> impl == 0 runs every implementation and cross-checks their
+//                                          outputs; impl >= 1 times that one implementation
+// Any other argument list, or an out-of-range value, ends the process with exit( NE10_ERR ).
+arm_result_t run_test( int argc, char **argv )
+{
+   if ( argc == 2 ) // requesting the number of available operations/routines in this unit
+   {
+      opcode = atoi ( argv[1] ); // get the command being requested, 0 = return the number of functions in this unit
+      if ( opcode == 0 ) return OP_COUNT;
+      exit( NE10_ERR );
+   }
+   else if ( argc == 4 ) // requesting a particular implementation of one of the operations
+   {
+      opcode = atoi ( argv[1] );
+      if ( opcode <= 0 ) exit( NE10_ERR );
+      impl   = atoi ( argv[2] );
+      if ( impl   < 0 ) exit( NE10_ERR ); // impl == 0 means run all and compare the results to verify they produce identical outputs
+      // range-check while the value is still signed: if "max" is an unsigned type (it is declared outside this function) a negative count would wrap and pass a "<= 0" test
+      int iterations = atoi ( argv[3] );
+      if ( iterations <= 0 ) exit( NE10_ERR );
+      max = (unsigned int) iterations;
+   }
+   else exit( NE10_ERR );
+
+   // initialize the table with NULL
+   memset( ftbl, 0, sizeof(ftbl));
+
+   // manually initialize the functions which have actual implementations
+   init_ftbl(); // this function is implemented in the unit test source file
+
+   if ( opcode <= 0 || opcode > OP_COUNT
+        || impl < 0 || impl > IMPL_COUNT )
+   {
+      fprintf ( stderr, "\t WARNING: Operation number %d and/or implementaion number %d are not acceptable values. \n", opcode, impl );
+      exit( NE10_ERR );
+   }
+
+   if ( impl == 0 ) // run all implementations and verify
+   {
+      // first, make sure all of the implementations do exist
+      for ( i = FTBL_IDX(opcode, 1); i <= FTBL_IDX(opcode, IMPL_COUNT); i++ )
+      {
+         if ( NULL == ftbl[i] )
+         {
+            fprintf ( stderr, "\t WARNING: One or more implementations of operation number %d were not found. \n", opcode );
+            exit( NE10_ERR );
+         }
+      }
+
+      // run every implementation once, silently, so that each one fills its own thedst[] array
+      mute = 1;
+      for ( impl = 1; impl <= IMPL_COUNT; impl++ ) { test_operation(); }
+
+      // now verify: _output holds item "i" of every implementation side by side (c, asm, neon...)
+      int warns = 0;
+      int pos = 0;
+      int item_width = opcode; // there's no easy way to guess the actual number of an item's components but using the opcode (1=float, 2=vec2, ...)
+      arm_float_t * _output = (arm_float_t*) malloc( IMPL_COUNT * sizeof(arm_float_t) * item_width );
+      if ( NULL == _output ) exit( NE10_ERR ); // nothing can be verified without the scratch buffer
+
+      for ( i = 0; ( i < ARRLEN ) && ( warns < ACCEPTABLE_WARNS ); i++ )
+      {
+         for ( impl = 1; impl <= IMPL_COUNT; impl++ )
+         {
+            memcpy ( &_output[ (impl-1) * item_width ], &thedst[ impl-1 ][ i * item_width ], sizeof(arm_float_t) * item_width );
+         }
+
+         // compare the output from the 2nd, 3rd, 4th, etc. to the first one so start at 2
+         for ( impl = 2; ( impl <= IMPL_COUNT ) && ( warns < ACCEPTABLE_WARNS ); impl++ )
+         {
+            for ( pos = 0; ( pos < item_width ) && ( warns < ACCEPTABLE_WARNS ); pos++ ) // compare corresponding components of the items
+            {
+               assert ( _output[ pos ] == _output[ pos ] ); // check for not-a-number
+               assert ( _output[ ((impl-1)*item_width)+pos ] == _output[ ((impl-1)*item_width)+pos ] ); // check for not-a-number
+
+               if ( ! EQUALS_FLOAT( _output[ pos ] , _output[ ((impl-1)*item_width)+pos ], ACCEPTABLE_ERROR ) )
+               {
+                  fprintf( stderr, "\t\t WARNING: In opcode [%d], implementation [1] != implemenation [%d] on item [%d -> %d]\n",
+                           opcode, impl, i, pos+1 );
+                  warns++;
+               }
+            }
+         }
+      }
+      free( _output ); _output = (arm_float_t *) NULL;
+
+      // the loops above stop as soon as ACCEPTABLE_WARNS mismatches have been reported; that fails the run
+      if ( warns >= ACCEPTABLE_WARNS )
+      {
+         fprintf ( stderr, "\t WARNING: One or more mismatching values were found. \n" );
+         exit( NE10_ERR );
+      }
+   }
+   else // run a particular implementation
+   {
+      if ( NULL == ftbl[ FTBL_IDX(opcode, impl) ] ) // slots without an implementation are still NULL from the memset above
+      {
+         fprintf ( stderr, "\t WARNING: One or more implementations of operation number %d were not found. \n", opcode );
+         exit( NE10_ERR );
+      }
+
+      if ( !mute ) printf( "opcode=%d;impl=%d;%d;%d;", opcode, impl, ARRLEN, max );
+      // get the overhead: time an empty loop with the same number of iterations
+      MEASURE( dt_test_overhead,
+               for ( i = 0 ; i < max; i++ )
+               {
+               }
+              );
+
+      test_operation();
+   }
+
+   // free any allocated memory... (both branches above end up here)
+   free( guarded_cst );
+   free( guarded_src );
+   free( guarded_acc );
+   for ( i = 0; i<IMPL_COUNT; i++ ) { free( guarded_dst[i] ); }
+
+   return NE10_OK;
+}
diff --git a/headers/unit_test_normalize_operation_x.h b/headers/unit_test_normalize_operation_x.h
new file mode 100644
index 0000000..59b9e36
--- /dev/null
+++ b/headers/unit_test_normalize_operation_x.h
@@ -0,0 +1,227 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : headers/unit_test_normalize_operation_x.h
+ */
+
+#include "./unit_test_common.h"
+
+// Signature of every routine stored in the table below: ( destination, source, number of items )
+typedef arm_result_t (*arm_func_3args_t)(void * dst, void * src, unsigned int count);
+arm_func_3args_t ftbl[ OP_COUNT * IMPL_COUNT ]; // one slot per ( operation, implementation ) pair, addressed with FTBL_IDX(opcode, impl)
+
+
+// this function is implemented in the unit test source files
+// it is meant to initialise the function table defined above.
+extern void init_ftbl();
+
+
+unsigned int i = 0;   // loop iterator, shared by run_test() and test_operation()
+unsigned int max = 0; // number of timed iterations; run_test() takes it from the third command-line argument
+int opcode = -1; // the operation which will be tested (a single unit can have any number of operations/functions)
+int impl   = -1; // selects which particular implementation of the chosen operation must run
+int mute   = 0;   // 0 == print output;   1 == do not print anything (set by run_test() while it verifies all implementations)
+
+struct timeval  before, after, lapsed, dummy; // NOTE(review): not referenced directly in this header; presumably scratch storage for MEASURE - confirm
+double dt_test_overhead = 0.0; // time measured for an empty loop of "max" iterations; subtracted from "elapsed" when reporting
+double dt_test_sample = 0.0;
+double elapsed = 0.0; // time measured around the calls made by test_operation()
+struct timezone zone; // NOTE(review): not referenced directly in this header; presumably used by MEASURE - confirm
+
+// there is a max of "4" components in a vec; every buffer is sized for that worst case
+#define MAX_VEC_COMPONENTS 4
+
+arm_float_t * guarded_src = NULL; // raw allocation: the input data plus guard bytes on both sides
+arm_float_t * guarded_dst[IMPL_COUNT]; // raw allocations for the outputs, one per implementation
+
+arm_float_t * thesrc = NULL; // start of the usable data inside guarded_src
+arm_float_t * thedst[IMPL_COUNT]; // output from different implementations are stored in separate arrays for verification
+int done_init = 0; // becomes 1 once test_operation() has allocated and filled the buffers
+
+// Exercises implementation "impl" of operation "opcode": one guard-checked sample call, one zero-length call,
+// then "max" timed calls. Buffers are allocated on the first call only. Returns NE10_OK, or exits with NE10_ERR.
+arm_result_t test_operation()
+{
+  const unsigned int fixed_length = ARRLEN * sizeof(arm_float_t) * MAX_VEC_COMPONENTS;
+  const unsigned int guarded_length = (2*ARRAY_GUARD_LEN) + fixed_length; // payload plus ARRAY_GUARD_LEN extra bytes at each end
+
+  // initialize if not done so
+  if ( 0 == done_init )
+  {
+    guarded_src = (arm_float_t*) malloc( guarded_length );
+    if ( NULL == guarded_src ) exit( NE10_ERR ); // bail out before the missing buffer is touched
+    GUARD_ARRAY( guarded_src, guarded_length );
+    thesrc = (arm_float_t*) ( (char*)guarded_src + ARRAY_GUARD_LEN ); // skip the leading guard; char* keeps the arithmetic standard C
+    FILL_FLOAT_ARRAY_LIMIT( thesrc, ARRLEN * MAX_VEC_COMPONENTS ); // random initialization
+    for ( i = 0; i<IMPL_COUNT; i++ )
+    {
+      guarded_dst[i] = (arm_float_t*) malloc( guarded_length );
+      if ( NULL == guarded_dst[i] ) exit( NE10_ERR );
+      GUARD_ARRAY( guarded_dst[i], guarded_length );
+      thedst[i] = (arm_float_t*) ( (char*)guarded_dst[i] + ARRAY_GUARD_LEN );
+    }
+    done_init = 1;
+  }
+
+  // sample run; its timing goes to dt_test_sample so that "elapsed" only ever holds the timed loop below
+  MEASURE( dt_test_sample,
+      ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl -1 ] , thesrc, ARRLEN );
+    );
+  if ( ! CHECK_ARRAY_GUARD(guarded_dst[ impl -1 ], guarded_length) ||
+       ! CHECK_ARRAY_GUARD(guarded_src, guarded_length) )
+  {
+    fprintf ( stderr, "\t FATAL ERROR: Operation number %d implementation [%d] has failed the guard test. \n", opcode, impl );
+    exit( NE10_ERR );
+  }
+
+  // this test to make sure passing zero as the length won't cause segmentation faults
+  ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl -1 ] , thesrc, 0 );
+
+  // actual test
+  MEASURE( elapsed,
+    for ( i = 0; i < max; i++  )
+    {
+      ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl -1 ] , thesrc, ARRLEN );
+    }
+  );
+
+  if ( !mute ) // report the overhead-corrected time and the resulting items-per-time-unit figure
+       printf( "%02.8f;%013.3f\n", elapsed - dt_test_overhead,
+                              ( 1.0f * max * ARRLEN / ( elapsed - dt_test_overhead ))  );
+  return NE10_OK;
+}
+
+// Command-line driver for the unit tests built on this header:
+//   argv = { prog, "0" }                -> returns OP_COUNT, the number of operations in the unit
+//   argv = { prog, opcode, impl, max }  -> impl == 0 runs every implementation and cross-checks their
+//                                          outputs; impl >= 1 times that one implementation
+// Any other argument list, or an out-of-range value, ends the process with exit( NE10_ERR ).
+arm_result_t run_test( int argc, char **argv )
+{
+   if ( argc == 2 ) // requesting the number of available operations/routines in this unit
+   {
+      opcode = atoi ( argv[1] ); // get the command being requested, 0 = return the number of functions in this unit
+      if ( opcode == 0 ) return OP_COUNT;
+      exit( NE10_ERR );
+   }
+   else if ( argc == 4 ) // requesting a particular implementation of one of the operations
+   {
+      opcode = atoi ( argv[1] );
+      if ( opcode <= 0 ) exit( NE10_ERR );
+      impl   = atoi ( argv[2] );
+      if ( impl   < 0 ) exit( NE10_ERR ); // impl == 0 means run all and compare the results to verify they produce identical outputs
+      // range-check while the value is still signed: stored straight into the unsigned "max", a negative count would wrap and pass a "<= 0" test
+      int iterations = atoi ( argv[3] );
+      if ( iterations <= 0 ) exit( NE10_ERR );
+      max = (unsigned int) iterations;
+   }
+   else exit( NE10_ERR );
+
+   // initialize the table with NULL
+   memset( ftbl, 0, sizeof(ftbl));
+
+   // manually initialize the functions which have actual implementations
+   init_ftbl(); // this function is implemented in the unit test source files
+
+   if ( opcode <= 0 || opcode > OP_COUNT
+        || impl < 0 || impl > IMPL_COUNT )
+   {
+      fprintf ( stderr, "\t WARNING: Operation number %d and/or implementaion number %d are not acceptable values. \n", opcode, impl );
+      exit( NE10_ERR );
+   }
+
+   if ( impl == 0 ) // run all implementations and verify
+   {
+      // first, make sure all of the implementations do exist
+      for ( i = FTBL_IDX(opcode, 1); i <= FTBL_IDX(opcode, IMPL_COUNT); i++ )
+      {
+         if ( NULL == ftbl[i] )
+         {
+            fprintf ( stderr, "\t WARNING: One or more implementations of operation number %d were not found. \n", opcode );
+            exit( NE10_ERR );
+         }
+      }
+
+      // run every implementation once, silently, so that each one fills its own thedst[] array
+      mute = 1;
+      for ( impl = 1; impl <= IMPL_COUNT; impl++ ) { test_operation(); }
+
+      // now verify: _output holds item "i" of every implementation side by side (c, asm, neon...)
+      int warns = 0;
+      int pos = 0;
+      int item_width = opcode+1; // 1=vec2, 2=vec3, 3=vec4
+      arm_float_t * _output = (arm_float_t*) malloc( IMPL_COUNT * sizeof(arm_float_t) * item_width );
+      if ( NULL == _output ) exit( NE10_ERR ); // nothing can be verified without the scratch buffer
+
+      for ( i = 0; ( i < ARRLEN ) && ( warns < ACCEPTABLE_WARNS ); i++ )
+      {
+         for ( impl = 1; impl <= IMPL_COUNT; impl++ )
+         {
+            memcpy ( &_output[ (impl-1) * item_width ], &thedst[ impl-1 ][ i * item_width ], sizeof(arm_float_t) * item_width );
+         }
+
+         // compare the output from the 2nd, 3rd, 4th, etc. to the first one so start at 2
+         for ( impl = 2; ( impl <= IMPL_COUNT ) && ( warns < ACCEPTABLE_WARNS ); impl++ )
+         {
+            for ( pos = 0; ( pos < item_width ) && ( warns < ACCEPTABLE_WARNS ); pos++ ) // compare corresponding components of the items
+            {
+               assert ( _output[ pos ] == _output[ pos ] ); // check for not-a-number
+               assert ( _output[ ((impl-1)*item_width)+pos ] == _output[ ((impl-1)*item_width)+pos ] ); // check for not-a-number
+
+               if ( ! EQUALS_FLOAT( _output[ pos ] , _output[ ((impl-1)*item_width)+pos ] , 0xFF ) ) // accept larger errors as we're doing a single step
+               {
+                  fprintf( stderr, "\t\t WARNING: In opcode [%d], implementation [1] != implemenation [%d] on item [%d -> %d]\n",
+                           opcode, impl, i, pos+1 );
+                  warns++;
+               }
+            }
+         }
+      }
+      free( _output ); _output = (arm_float_t *) NULL;
+
+      // the loops above stop as soon as ACCEPTABLE_WARNS mismatches have been reported; that fails the run
+      if ( warns >= ACCEPTABLE_WARNS )
+      {
+         fprintf ( stderr, "\t WARNING: One or more mismatching values were found. \n" );
+         exit( NE10_ERR );
+      }
+   }
+   else // run a particular implementation
+   {
+      if ( NULL == ftbl[ FTBL_IDX(opcode, impl) ] ) // slots without an implementation are still NULL from the memset above
+      {
+         fprintf ( stderr, "\t WARNING: One or more implementations of operation number %d were not found. \n", opcode );
+         exit( NE10_ERR );
+      }
+
+      if ( !mute ) printf( "opcode=%d;impl=%d;%d;%d;", opcode, impl, ARRLEN, max );
+      // get the overhead: time an empty loop with the same number of iterations
+      MEASURE( dt_test_overhead,
+               for ( i = 0 ; i < max; i++ )
+               {
+               }
+              );
+
+      test_operation();
+   }
+
+   // free any allocated memory... (both branches above end up here)
+   free( guarded_src );
+   for ( i = 0; i<IMPL_COUNT; i++ ) { free( guarded_dst[i] ); }
+
+   return NE10_OK;
+}
diff --git a/headers/unit_test_setc_operation_x.h b/headers/unit_test_setc_operation_x.h
new file mode 100644
index 0000000..85d6519
--- /dev/null
+++ b/headers/unit_test_setc_operation_x.h
@@ -0,0 +1,245 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : headers/unit_test_setc_operation_x.h
+ */
+
+#include "./unit_test_common.h"
+
+// Signature of every routine stored in the table below: ( destination, constant, number of items ); for opcode 1 test_operation() passes a float's bit pattern in "cst" instead of a pointer
+typedef arm_result_t (*arm_func_3args_t)(void * dst, const void * cst, unsigned int count);
+arm_func_3args_t ftbl[ OP_COUNT * IMPL_COUNT ]; // one slot per ( operation, implementation ) pair, addressed with FTBL_IDX(opcode, impl)
+
+
+// this function is implemented in the unit test source files
+// it is meant to initialise the function table defined above.
+extern void init_ftbl();
+
+
+unsigned int i = 0;   // loop iterator, shared by run_test() and test_operation()
+unsigned int max = 0; // number of timed iterations; run_test() takes it from the third command-line argument
+int opcode = -1; // the operation which will be tested (a single unit can have any number of operations/functions)
+int impl   = -1; // selects which particular implementation of the chosen operation must run
+int mute   = 0;   // 0 == print output;   1 == do not print anything (set by run_test() while it verifies all implementations)
+
+struct timeval  before, after, lapsed, dummy; // NOTE(review): not referenced directly in this header; presumably scratch storage for MEASURE - confirm
+double dt_test_overhead = 0.0; // time measured for an empty loop of "max" iterations; subtracted from "elapsed" when reporting
+double dt_test_sample = 0.0; // time measured around the single sample call in test_operation()
+double elapsed = 0.0; // time measured around the "max" timed calls in test_operation()
+struct timezone zone; // NOTE(review): not referenced directly in this header; presumably used by MEASURE - confirm
+
+// there is a max of "4" components in a vec; every buffer is sized for that worst case
+#define MAX_VEC_COMPONENTS 4
+
+arm_float_t * guarded_cst = NULL; // raw allocation: MAX_VEC_COMPONENTS floats plus guard bytes on both sides
+arm_float_t * guarded_dst[IMPL_COUNT]; // raw allocations for the outputs, one per implementation
+
+arm_float_t * thecst = NULL; // start of the usable data inside guarded_cst
+arm_float_t * thedst[IMPL_COUNT]; // output from different implementations are stored in separate arrays for verification
+int done_init = 0; // becomes 1 once test_operation() has allocated and filled the buffers
+
+// Exercises implementation "impl" of operation "opcode": one guard-checked sample call, one zero-length call,
+// then "max" timed calls. Buffers are allocated on the first call only. Returns NE10_OK, or exits with NE10_ERR.
+arm_result_t test_operation()
+{
+  const unsigned int fixed_length = ARRLEN * sizeof(arm_float_t) * MAX_VEC_COMPONENTS;
+  const unsigned int dst_guarded_length = (2*ARRAY_GUARD_LEN) + fixed_length; // an output array plus its two guard bands
+  const unsigned int cst_guarded_length = (2*ARRAY_GUARD_LEN) + sizeof(arm_float_t) * MAX_VEC_COMPONENTS; // the constant plus its two guard bands
+  union fp_bitwise {
+        arm_float_t _f;
+        unsigned int _i;
+  } _icst;
+  const void * cst_arg = NULL; // what is actually handed to the routine as its "cst" parameter
+
+  // initialize if not done so
+  if ( 0 == done_init )
+  {
+    guarded_cst = (arm_float_t*) malloc( cst_guarded_length );
+    if ( NULL == guarded_cst ) exit( NE10_ERR ); // bail out before the missing buffer is touched
+    GUARD_ARRAY( guarded_cst, cst_guarded_length );
+    thecst = (arm_float_t*) ( (char*)guarded_cst + ARRAY_GUARD_LEN ); // skip the leading guard; char* keeps the arithmetic standard C
+    FILL_FLOAT_ARRAY( thecst, MAX_VEC_COMPONENTS ); // random initialization
+
+    for ( i = 0; i<IMPL_COUNT; i++ )
+    {
+      guarded_dst[i] = (arm_float_t*) malloc( dst_guarded_length );
+      if ( NULL == guarded_dst[i] ) exit( NE10_ERR );
+      GUARD_ARRAY( guarded_dst[i], dst_guarded_length );
+      thedst[i] = (arm_float_t*) ( (char*)guarded_dst[i] + ARRAY_GUARD_LEN );
+    }
+    done_init = 1;
+  }
+
+  if ( 1 == opcode )
+  {  // in this case the const argument is not a pointer but an actual float value,
+     // so its bit pattern is what travels in the pointer-typed parameter (for every call below)
+     _icst._f = thecst[0];
+     cst_arg = (void*)_icst._i;
+  }
+  else
+  {
+     cst_arg = thecst;
+  }
+
+  // sample run
+  MEASURE( dt_test_sample,
+    ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl-1 ] , cst_arg, ARRLEN );
+  );
+  if ( ! CHECK_ARRAY_GUARD(guarded_dst[ impl -1 ], dst_guarded_length) ||
+       ! CHECK_ARRAY_GUARD(guarded_cst, cst_guarded_length) )
+  {
+    fprintf ( stderr, "\t FATAL ERROR: Operation number %d implementation [%d] has failed the guard test. \n", opcode, impl );
+    exit( NE10_ERR );
+  }
+
+  // this test to make sure passing zero as the length won't cause segmentation faults
+  ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl-1 ] , cst_arg, 0 );
+
+  // actual test
+  MEASURE( elapsed,
+    for ( i = 0; i < max; i++  )
+    {
+      ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl -1 ] , cst_arg, ARRLEN );
+    }
+  );
+
+  if ( !mute ) // report the overhead-corrected time and the resulting items-per-time-unit figure
+       printf( "%02.8f;%013.3f\n", elapsed - dt_test_overhead,
+                              ( 1.0f * max * ARRLEN / ( elapsed - dt_test_overhead ))  );
+
+  return NE10_OK;
+}
+
+// Command-line driver for the unit tests built on this header:
+//   argv = { prog, "0" }                -> returns OP_COUNT, the number of operations in the unit
+//   argv = { prog, opcode, impl, max }  -> impl == 0 runs every implementation and cross-checks their
+//                                          outputs; impl >= 1 times that one implementation
+// Any other argument list, or an out-of-range value, ends the process with exit( NE10_ERR ).
+arm_result_t run_test( int argc, char **argv )
+{
+   if ( argc == 2 ) // requesting the number of available operations/routines in this unit
+   {
+      opcode = atoi ( argv[1] ); // get the command being requested, 0 = return the number of functions in this unit
+      if ( opcode == 0 ) return OP_COUNT;
+      exit( NE10_ERR );
+   }
+   else if ( argc == 4 ) // requesting a particular implementation of one of the operations
+   {
+      opcode = atoi ( argv[1] );
+      if ( opcode <= 0 ) exit( NE10_ERR );
+      impl   = atoi ( argv[2] );
+      if ( impl   < 0 ) exit( NE10_ERR ); // impl == 0 means run all and compare the results to verify they produce identical outputs
+      // range-check while the value is still signed: stored straight into the unsigned "max", a negative count would wrap and pass a "<= 0" test
+      int iterations = atoi ( argv[3] );
+      if ( iterations <= 0 ) exit( NE10_ERR );
+      max = (unsigned int) iterations;
+   }
+   else exit( NE10_ERR );
+
+   // initialize the table with NULL
+   memset( ftbl, 0, sizeof(ftbl));
+
+   // manually initialize the functions which have actual implementations
+   init_ftbl(); // this function is implemented in the unit test source file
+
+   if ( opcode <= 0 || opcode > OP_COUNT
+        || impl < 0 || impl > IMPL_COUNT )
+   {
+      fprintf ( stderr, "\t WARNING: Operation number %d and/or implementaion number %d are not acceptable values. \n", opcode, impl );
+      exit( NE10_ERR );
+   }
+
+   if ( impl == 0 ) // run all implementations and verify
+   {
+      // first, make sure all of the implementations do exist
+      for ( i = FTBL_IDX(opcode, 1); i <= FTBL_IDX(opcode, IMPL_COUNT); i++ )
+      {
+         if ( NULL == ftbl[i] )
+         {
+            fprintf ( stderr, "\t WARNING: One or more implementations of operation number %d were not found. \n", opcode );
+            exit( NE10_ERR );
+         }
+      }
+
+      // run every implementation once, silently, so that each one fills its own thedst[] array
+      mute = 1;
+      for ( impl = 1; impl <= IMPL_COUNT; impl++ ) { test_operation(); }
+
+      // now verify: _output holds item "i" of every implementation side by side (c, asm, neon...)
+      int warns = 0;
+      int pos = 0;
+      int item_width = opcode; // there's no easy way to guess the actual number of an item's components but using the opcode (1=float, 2=vec2, ...)
+      arm_float_t * _output = (arm_float_t*) malloc( IMPL_COUNT * sizeof(arm_float_t) * item_width );
+      if ( NULL == _output ) exit( NE10_ERR ); // nothing can be verified without the scratch buffer
+
+      for ( i = 0; ( i < ARRLEN ) && ( warns < ACCEPTABLE_WARNS ); i++ )
+      {
+         for ( impl = 1; impl <= IMPL_COUNT; impl++ )
+         {
+            memcpy ( &_output[ (impl-1) * item_width ], &thedst[ impl-1 ][ i * item_width ], sizeof(arm_float_t) * item_width );
+         }
+
+         // compare the output from the 2nd, 3rd, 4th, etc. to the first one so start at 2
+         for ( impl = 2; ( impl <= IMPL_COUNT ) && ( warns < ACCEPTABLE_WARNS ); impl++ )
+         {
+            for ( pos = 0; ( pos < item_width ) && ( warns < ACCEPTABLE_WARNS ); pos++ ) // compare corresponding components of the items
+            {
+               assert ( _output[ pos ] == _output[ pos ] ); // check for not-a-number
+               assert ( _output[ ((impl-1)*item_width)+pos ] == _output[ ((impl-1)*item_width)+pos ] ); // check for not-a-number
+
+               if ( ! EQUALS_FLOAT( _output[ pos ] , _output[ ((impl-1)*item_width)+pos ], ACCEPTABLE_ERROR ) )
+               {
+                  fprintf( stderr, "\t\t WARNING: In opcode [%d], implementation [1] != implemenation [%d] on item [%d -> %d]\n",
+                           opcode, impl, i, pos+1 );
+                  warns++;
+               }
+            }
+         }
+      }
+      free( _output ); _output = (arm_float_t *) NULL;
+
+      // the loops above stop as soon as ACCEPTABLE_WARNS mismatches have been reported; that fails the run
+      if ( warns >= ACCEPTABLE_WARNS )
+      {
+         fprintf ( stderr, "\t WARNING: One or more mismatching values were found. \n" );
+         exit( NE10_ERR );
+      }
+   }
+   else // run a particular implementation
+   {
+      if ( NULL == ftbl[ FTBL_IDX(opcode, impl) ] ) // slots without an implementation are still NULL from the memset above
+      {
+         fprintf ( stderr, "\t WARNING: One or more implementations of operation number %d were not found. \n", opcode );
+         exit( NE10_ERR );
+      }
+
+      if ( !mute ) printf( "opcode=%d;impl=%d;%d;%d;", opcode, impl, ARRLEN, max );
+      // get the overhead: time an empty loop with the same number of iterations
+      MEASURE( dt_test_overhead,
+               for ( i = 0 ; i < max; i++ )
+               {
+               }
+              );
+      test_operation();
+   }
+
+   // free any allocated memory... (both branches above end up here)
+   free( guarded_cst );
+   for ( i = 0; i<IMPL_COUNT; i++ ) { free( guarded_dst[i] ); }
+
+   return NE10_OK;
+}
diff --git a/headers/unit_test_x_operation_x.h b/headers/unit_test_x_operation_x.h
new file mode 100644
index 0000000..7c6aa46
--- /dev/null
+++ b/headers/unit_test_x_operation_x.h
@@ -0,0 +1,233 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : headers/unit_test_x_operation_x.h
+ */
+
+#include "./unit_test_common.h"
+
+// Signature of every routine stored in the table below: ( destination, first source, second source, number of items )
+typedef arm_result_t (*arm_func_4args_t)(void * dst, void * src1, void * src2, unsigned int count);
+arm_func_4args_t ftbl[ OP_COUNT * IMPL_COUNT ]; // one slot per ( operation, implementation ) pair, addressed with FTBL_IDX(opcode, impl)
+
+
+// this function is implemented in the unit test source files
+// it is meant to initialise the function table defined above.
+extern void init_ftbl();
+
+
+unsigned int i = 0;   // loop iterator, shared by run_test() and test_operation()
+unsigned int max = 0; // number of timed iterations; run_test() takes it from the third command-line argument
+int opcode = -1; // the operation which will be tested (a single unit can have any number of operations/functions)
+int impl   = -1; // selects which particular implementation of the chosen operation must run
+int mute   = 0;   // 0 == print output;   1 == do not print anything (set by run_test() while it verifies all implementations)
+
+struct timeval  before, after, lapsed, dummy; // NOTE(review): not referenced directly in this header; presumably scratch storage for MEASURE - confirm
+double dt_test_overhead = 0.0; // time measured for an empty loop of "max" iterations; subtracted from "elapsed" when reporting
+double dt_test_sample = 0.0; // time measured around the single sample call in test_operation()
+double elapsed = 0.0; // time measured around the "max" timed calls in test_operation()
+struct timezone zone; // NOTE(review): not referenced directly in this header; presumably used by MEASURE - confirm
+
+// there is a max of "4" components in a vec; every buffer is sized for that worst case
+#define MAX_VEC_COMPONENTS 4
+
+arm_float_t * guarded_src1 = NULL; // raw allocation: the first input plus guard bytes on both sides
+arm_float_t * guarded_src2 = NULL; // raw allocation: the second input plus guard bytes on both sides
+arm_float_t * guarded_dst[IMPL_COUNT]; // raw allocations for the outputs, one per implementation
+
+arm_float_t * thesrc1 = NULL; // start of the usable data inside guarded_src1
+arm_float_t * thesrc2 = NULL; // start of the usable data inside guarded_src2
+arm_float_t * thedst[IMPL_COUNT]; // output from different implementations are stored in separate arrays for verification
+int done_init = 0; // becomes 1 once test_operation() has allocated and filled the buffers
+
+// Runs implementation "impl" of operation "opcode": a guard-checked sample call, a zero-length call, then "max" timed calls; returns NE10_OK or exits with NE10_ERR.
+arm_result_t test_operation()
+{
+  const unsigned int fixed_length = ARRLEN * sizeof(arm_float_t) * MAX_VEC_COMPONENTS;
+  const unsigned int guarded_length = (2*ARRAY_GUARD_LEN) + fixed_length; // payload plus ARRAY_GUARD_LEN extra bytes at each end
+
+  // initialize if not done so
+  if ( 0 == done_init )
+  {
+    guarded_src1 = (arm_float_t*) malloc( guarded_length );
+    if ( NULL == guarded_src1 ) exit( NE10_ERR ); // bail out before the missing buffer is touched
+    GUARD_ARRAY( guarded_src1, guarded_length );
+    thesrc1 = (arm_float_t*) ( (char*)guarded_src1 + ARRAY_GUARD_LEN ); // skip the leading guard; char* keeps the arithmetic standard C
+    FILL_FLOAT_ARRAY( thesrc1, ARRLEN * MAX_VEC_COMPONENTS ); // random initialization
+
+    guarded_src2 = (arm_float_t*) malloc( guarded_length );
+    if ( NULL == guarded_src2 ) exit( NE10_ERR );
+    GUARD_ARRAY( guarded_src2, guarded_length );
+    thesrc2 = (arm_float_t*) ( (char*)guarded_src2 + ARRAY_GUARD_LEN );
+    FILL_FLOAT_ARRAY( thesrc2, ARRLEN * MAX_VEC_COMPONENTS ); // random initialization
+    for ( i = 0; i<IMPL_COUNT; i++ )
+    {
+      guarded_dst[i] = (arm_float_t*) malloc( guarded_length );
+      if ( NULL == guarded_dst[i] ) exit( NE10_ERR );
+      GUARD_ARRAY( guarded_dst[i], guarded_length );
+      thedst[i] = (arm_float_t*) ( (char*)guarded_dst[i] + ARRAY_GUARD_LEN );
+    }
+    done_init = 1;
+  }
+
+  // sample run
+  MEASURE( dt_test_sample,
+    ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl-1 ] , thesrc1, thesrc2, ARRLEN );
+  );
+  if ( ! CHECK_ARRAY_GUARD(guarded_dst[ impl -1 ], guarded_length) ||
+       ! CHECK_ARRAY_GUARD(guarded_src1, guarded_length) ||
+       ! CHECK_ARRAY_GUARD(guarded_src2, guarded_length) )
+  {
+    fprintf ( stderr, "\t FATAL ERROR: Operation number %d implementation [%d] has failed the guard test. \n", opcode, impl );
+    exit( NE10_ERR );
+  }
+
+  // this test to make sure passing zero as the length won't cause segmentation faults
+  ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl-1 ] , thesrc1, thesrc2, 0 );
+
+  MEASURE( elapsed,
+    for ( i = 0; i < max; i++  )
+    {
+      ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl -1 ] , thesrc1, thesrc2, ARRLEN );
+    }
+  );
+  if ( !mute ) // report the overhead-corrected time and the resulting items-per-time-unit figure
+       printf( "%02.8f;%013.3f\n", elapsed - dt_test_overhead,
+                              ( 1.0f * max * ARRLEN / ( elapsed - dt_test_overhead ))  );
+  return NE10_OK;
+}
+
+// Command-line driver for the unit tests built on this header:
+//   argv = { prog, "0" }                -> returns OP_COUNT, the number of operations in the unit
+//   argv = { prog, opcode, impl, max }  -> impl == 0 runs every implementation and cross-checks their
+//                                          outputs; impl >= 1 times that one implementation
+// Any other argument list, or an out-of-range value, ends the process with exit( NE10_ERR ).
+arm_result_t run_test( int argc, char **argv )
+{
+   if ( argc == 2 ) // requesting the number of available operations/routines in this unit
+   {
+      opcode = atoi ( argv[1] ); // get the command being requested, 0 = return the number of functions in this unit
+      if ( opcode == 0 ) return OP_COUNT;
+      exit( NE10_ERR );
+   }
+   else if ( argc == 4 ) // requesting a particular implementation of one of the operations
+   {
+      opcode = atoi ( argv[1] );
+      if ( opcode <= 0 ) exit( NE10_ERR );
+      impl   = atoi ( argv[2] );
+      if ( impl   < 0 ) exit( NE10_ERR ); // impl == 0 means run all and compare the results to verify they produce identical outputs
+      // range-check while the value is still signed: stored straight into the unsigned "max", a negative count would wrap and pass a "<= 0" test
+      int iterations = atoi ( argv[3] );
+      if ( iterations <= 0 ) exit( NE10_ERR );
+      max = (unsigned int) iterations;
+   }
+   else exit( NE10_ERR );
+
+   // initialize the table with NULL
+   memset( ftbl, 0, sizeof(ftbl));
+
+   // manually initialize the functions which have actual implementations
+   init_ftbl(); // this function is implemented in the unit test source file
+
+   if ( opcode <= 0 || opcode > OP_COUNT
+        || impl < 0 || impl > IMPL_COUNT )
+   {
+      fprintf ( stderr, "\t WARNING: Operation number %d and/or implementaion number %d are not acceptable values. \n", opcode, impl );
+      exit( NE10_ERR );
+   }
+
+   if ( impl == 0 ) // run all implementations and verify
+   {
+      // first, make sure all of the implementations do exist
+      for ( i = FTBL_IDX(opcode, 1); i <= FTBL_IDX(opcode, IMPL_COUNT); i++ )
+      {
+         if ( NULL == ftbl[i] )
+         {
+            fprintf ( stderr, "\t WARNING: One or more implementations of operation number %d were not found. \n", opcode );
+            exit( NE10_ERR );
+         }
+      }
+
+      // run every implementation once, silently, so that each one fills its own thedst[] array
+      mute = 1;
+      for ( impl = 1; impl <= IMPL_COUNT; impl++ ) { test_operation(); }
+
+      // now verify: _output holds item "i" of every implementation side by side (c, asm, neon...)
+      int warns = 0;
+      int pos = 0;
+      int item_width = opcode; // there's no easy way to guess the actual number of an item's components but using the opcode (1=float, 2=vec2, ...)
+      arm_float_t * _output = (arm_float_t*) malloc( IMPL_COUNT * sizeof(arm_float_t) * item_width );
+      if ( NULL == _output ) exit( NE10_ERR ); // nothing can be verified without the scratch buffer
+
+      for ( i = 0; ( i < ARRLEN ) && ( warns < ACCEPTABLE_WARNS ); i++ )
+      {
+         for ( impl = 1; impl <= IMPL_COUNT; impl++ )
+         {
+            memcpy ( &_output[ (impl-1) * item_width ], &thedst[ impl-1 ][ i * item_width ], sizeof(arm_float_t) * item_width );
+         }
+
+         // compare the output from the 2nd, 3rd, 4th, etc. to the first one so start at 2
+         for ( impl = 2; ( impl <= IMPL_COUNT ) && ( warns < ACCEPTABLE_WARNS ); impl++ )
+         {
+            for ( pos = 0; ( pos < item_width ) && ( warns < ACCEPTABLE_WARNS ); pos++ ) // compare corresponding components of the items
+            {
+               assert ( _output[ pos ] == _output[ pos ] ); // check for not-a-number
+               assert ( _output[ ((impl-1)*item_width)+pos ] == _output[ ((impl-1)*item_width)+pos ] ); // check for not-a-number
+
+               if ( ! EQUALS_FLOAT( _output[ pos ] , _output[ ((impl-1)*item_width)+pos ], ACCEPTABLE_ERROR ) )
+               {
+                  fprintf( stderr, "\t\t WARNING: In opcode [%d], implementation [1] != implemenation [%d] on item [%d -> %d]\n",
+                           opcode, impl, i, pos+1 );
+                  warns++;
+               }
+            }
+         }
+      }
+      free( _output ); _output = (arm_float_t *) NULL;
+
+      // the loops above stop as soon as ACCEPTABLE_WARNS mismatches have been reported; that fails the run
+      if ( warns >= ACCEPTABLE_WARNS )
+      {
+         fprintf ( stderr, "\t WARNING: One or more mismatching values were found. \n" );
+         exit( NE10_ERR );
+      }
+   }
+   else // run a particular implementation
+   {
+      if ( NULL == ftbl[ FTBL_IDX(opcode, impl) ] ) // slots without an implementation are still NULL from the memset above
+      {
+         fprintf ( stderr, "\t WARNING: One or more implementations of operation number %d were not found. \n", opcode );
+         exit( NE10_ERR );
+      }
+
+      if ( !mute ) printf( "opcode=%d;impl=%d;%d;%d;", opcode, impl, ARRLEN, max );
+      // get the overhead: time an empty loop with the same number of iterations
+      MEASURE( dt_test_overhead,
+               for ( i = 0 ; i < max; i++ )
+               {
+               }
+              );
+      test_operation();
+   }
+
+   // free any allocated memory... (both branches above end up here)
+   free( guarded_src1 );
+   free( guarded_src2 );
+   for ( i = 0; i<IMPL_COUNT; i++ ) { free( guarded_dst[i] ); }
+
+   return NE10_OK;
+}
diff --git a/headers/unit_test_xc_operation_x.h b/headers/unit_test_xc_operation_x.h
new file mode 100644
index 0000000..6436e21
--- /dev/null
+++ b/headers/unit_test_xc_operation_x.h
@@ -0,0 +1,254 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : headers/unit_test_xc_operation_x.h
+ */
+
+#include "./unit_test_common.h"
+
+// This function signature applies the operations with the format "*c_*_*" (e.g. 'add'c_'float'_'neon')
+typedef arm_result_t (*arm_func_4args_t)(void * dst, void * src, const void * cst, unsigned int count);
+arm_func_4args_t ftbl[ OP_COUNT * IMPL_COUNT ]; // one slot per (operation, implementation) pair; indexed through FTBL_IDX
+
+
+// this function is implemented in the unit test source files
+// it is meant to initialise the function table defined above.
+extern void init_ftbl();
+
+
+unsigned int i = 0;   // loop iterator
+unsigned int max = 0; // number of iterations in each function
+int opcode = -1; // the operation which will be tested (a single unit can have any number of operations/functions)
+int impl   = -1; // selects which particular implementation of the chosen operation must run
+int mute   = 0;   // 0 == print output;   1 == do not print anything;
+
+struct timeval  before, after, lapsed, dummy; // NOTE(review): not referenced directly in this header; presumably used by the MEASURE macro -- confirm
+double dt_test_overhead = 0.0; // set through MEASURE around an empty loop of `max` iterations; subtracted from `elapsed` when reporting
+double dt_test_sample = 0.0;   // set through MEASURE around the single sample call made before the guard check
+double elapsed = 0.0;          // set through MEASURE around the `max` timed calls
+struct timezone zone; // NOTE(review): not referenced directly in this header; presumably used by the MEASURE macro -- confirm
+
+// there is a max of "4" components in a vec
+#define MAX_VEC_COMPONENTS 4
+
+arm_float_t * guarded_cst = NULL; // raw allocation for the constant operand, including the guard bytes at both ends
+arm_float_t * guarded_src = NULL; // raw allocation for the source array, including the guard bytes at both ends
+arm_float_t * guarded_dst[IMPL_COUNT]; // raw allocations for the per-implementation outputs, including the guard bytes
+
+arm_float_t * thecst = NULL; // points just past the leading guard of guarded_cst
+arm_float_t * thesrc = NULL; // points just past the leading guard of guarded_src
+arm_float_t * thedst[IMPL_COUNT]; // output from different implementations are stored in separate arrays for verification
+int done_init = 0; // becomes 1 once test_operation() has allocated and filled the buffers above
+
+/*
+ * Times ftbl[ FTBL_IDX(opcode, impl) ] on the shared, guarded test buffers (allocated on the first call).
+ * Returns NE10_OK; exits with NE10_ERR if an allocation fails or the array guard check fails.
+ */
+arm_result_t test_operation()
+{
+  const unsigned int fixed_length = ARRLEN * sizeof(arm_float_t) * MAX_VEC_COMPONENTS;
+  // initialize if not done so
+  if ( 0 == done_init )
+  {
+    guarded_cst = (arm_float_t*) malloc( (2*ARRAY_GUARD_LEN) + sizeof(arm_float_t) * MAX_VEC_COMPONENTS );
+    if ( NULL == guarded_cst ) { fprintf ( stderr, "\t FATAL ERROR: Out of memory. \n" ); exit( NE10_ERR ); }
+    GUARD_ARRAY( guarded_cst, (2*ARRAY_GUARD_LEN) + sizeof(arm_float_t) * MAX_VEC_COMPONENTS );
+    thecst = (arm_float_t*) ( (char*)guarded_cst + ARRAY_GUARD_LEN ); // usable data starts right after the leading guard
+    FILL_FLOAT_ARRAY( thecst, MAX_VEC_COMPONENTS ); // random initialization
+
+    guarded_src = (arm_float_t*) malloc( (2*ARRAY_GUARD_LEN) + fixed_length ); // ARRAY_GUARD_LEN extra bytes at the beginning and at the end
+    if ( NULL == guarded_src ) { fprintf ( stderr, "\t FATAL ERROR: Out of memory. \n" ); exit( NE10_ERR ); }
+    GUARD_ARRAY( guarded_src, (2*ARRAY_GUARD_LEN) + fixed_length );
+    thesrc = (arm_float_t*) ( (char*)guarded_src + ARRAY_GUARD_LEN );
+    FILL_FLOAT_ARRAY( thesrc, ARRLEN * MAX_VEC_COMPONENTS ); // random initialization
+
+    for ( i = 0; i<IMPL_COUNT; i++ )
+    {
+      guarded_dst[i] = (arm_float_t*) malloc( (2*ARRAY_GUARD_LEN) + fixed_length ); // ARRAY_GUARD_LEN extra bytes at the beginning and at the end
+      if ( NULL == guarded_dst[i] ) { fprintf ( stderr, "\t FATAL ERROR: Out of memory. \n" ); exit( NE10_ERR ); }
+      GUARD_ARRAY( guarded_dst[i], (2*ARRAY_GUARD_LEN) + fixed_length );
+      thedst[i] = (arm_float_t*) ( (char*)guarded_dst[i] + ARRAY_GUARD_LEN );
+    }
+    done_init = 1;
+  }
+
+  // sample run
+  MEASURE( dt_test_sample,
+    ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl-1 ] , thesrc, thecst, ARRLEN );
+  );
+  if ( ! CHECK_ARRAY_GUARD(guarded_dst[ impl -1 ], (2*ARRAY_GUARD_LEN) + fixed_length) ||
+       ! CHECK_ARRAY_GUARD(guarded_src, (2*ARRAY_GUARD_LEN) + fixed_length) ||
+       ! CHECK_ARRAY_GUARD(guarded_cst, (2*ARRAY_GUARD_LEN) + sizeof(arm_float_t) * MAX_VEC_COMPONENTS) )
+  {
+                fprintf ( stderr, "\t FATAL ERROR: Operation number %d implementation [%d] has failed the guard test. \n", opcode, impl );
+                exit( NE10_ERR );
+  }
+
+  // this test to make sure passing zero as the length won't cause segmentation faults
+  ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl-1 ] , thesrc, thecst, 0 );
+
+  // actual test
+  if ( 1 == opcode )
+  {  // in this case the const argument is not a pointer but an actual float value
+     union fp_bitwise {
+           arm_float_t _f;
+           unsigned int _i;
+      } _icst;
+          _icst._f = thecst[0]; // the float's bit pattern is handed over in the pointer-typed argument slot
+          MEASURE( elapsed,
+            for ( i = 0; i < max; i++  )
+            {
+               ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl -1 ] , thesrc, (void*)_icst._i, ARRLEN );
+            }
+           );
+  }
+  else
+  {
+          MEASURE( elapsed,
+            for ( i = 0; i < max; i++  )
+            {
+               ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl -1 ] , thesrc, thecst, ARRLEN );
+            }
+           );
+  }
+
+  if ( !mute )
+       printf( "%02.8f;%013.3f\n", elapsed - dt_test_overhead,
+                              ( 1.0f * max * ARRLEN / ( elapsed - dt_test_overhead ))  );
+ return NE10_OK;
+}
+
+/*
+ * Test driver. With one argument (which must be 0) it returns OP_COUNT. With three arguments
+ * (opcode, impl, iterations) it times one implementation, or, when impl is 0, runs all
+ * implementations and compares their outputs. Invalid input or a failed check exits with NE10_ERR.
+ */
+arm_result_t run_test( int argc, char **argv )
+{
+   if ( argc == 2 ) // requesting the number of available operations/routines in this unit
+   {
+      opcode = atoi ( argv[1] ); // get the command being requested, 0 = return the number of functions in this unit
+      if ( opcode == 0 ) return OP_COUNT;
+      exit( NE10_ERR );
+   } else if ( argc == 4 ) // requesting a particular implementation of one of the operations
+   {
+      opcode = atoi ( argv[1] );
+      if ( opcode <= 0 ) exit( NE10_ERR );
+      impl   = atoi ( argv[2] );
+      if ( impl   < 0 ) exit( NE10_ERR ); // impl == 0 means run all and compare the results to verify they produce identical outputs
+      int iterations = atoi ( argv[3] ); // validate as a signed value: max is unsigned, so a negative count would wrap
+      if ( iterations <= 0 ) exit( NE10_ERR );
+      max = (unsigned int) iterations;
+   } else exit( NE10_ERR );
+
+   // initialize the table with NULL
+   memset( ftbl, 0, sizeof(ftbl));
+
+   // manually initialize the functions which have actual implementations
+   init_ftbl(); // this function is implemented in the unit test file
+
+  if ( opcode <= 0 || opcode > OP_COUNT
+       || impl < 0 || impl > IMPL_COUNT )
+  {
+      fprintf ( stderr, "\t WARNING: Operation number %d and/or implementaion number %d are not acceptable values. \n", opcode, impl );
+      exit( NE10_ERR );
+  }
+
+  if ( impl == 0 ) // run all implementations and verify
+  {
+      // first, make sure all of the implementations do exist
+      for ( i = FTBL_IDX(opcode, 1); i <= FTBL_IDX(opcode, IMPL_COUNT); i++ )
+      {
+        if ( NULL == ftbl[i] )
+        {
+                fprintf ( stderr, "\t WARNING: One or more implementations of operation number %d were not found. \n", opcode );
+                exit( NE10_ERR );
+        }
+      }
+
+      // try all the implementations here..
+             mute = 1; // do not print anything
+             // opcode remains the same but we iterate through different implementations here..
+             for ( impl= 1; impl <= IMPL_COUNT; impl ++ )
+             {
+                 test_operation();
+             }
+
+             // now verify
+             arm_float_t * _output = NULL; // [ IMPL_COUNT * MAX_VEC_COMPONENTS ]; // one for each implementation, c, asm, neon...
+             int warns = 0;
+             int item_width = opcode; // there's no easy way to guess the actual number of an item's components but using the opcode (1=float, 2=vec2, ...)
+             _output = (arm_float_t*) malloc( IMPL_COUNT * sizeof(arm_float_t) * item_width );
+             if ( NULL == _output ) { fprintf ( stderr, "\t FATAL ERROR: Out of memory. \n" ); exit( NE10_ERR ); }
+             for ( i = 0; i < ARRLEN; i++ )
+             {
+                 for ( impl= 1; impl <= IMPL_COUNT; impl ++ )
+                 {
+                     memcpy ( &_output[  (impl-1) * item_width  ], &thedst[ impl-1 ][ i * item_width ],  sizeof(arm_float_t) * item_width  );
+                 }
+
+                 int pos = 0;
+                 for ( impl = 2; impl <= IMPL_COUNT; impl ++ ) // compare the output from the 2nd, 3rd, 4th, etc. to the first one so start at 2
+                 {
+                     for ( pos = 0; pos < item_width; pos++ ) // compare corresponding components of the items
+                     {
+                         assert ( _output[ ((1-1)*item_width)+pos ] == _output[ ((1-1)*item_width)+pos ] ); // check for not-a-number
+                         assert ( _output[ ((impl-1)*item_width)+pos ] == _output[ ((impl-1)*item_width)+pos ] );  // check for not-a-number
+                         if ( ! EQUALS_FLOAT( _output[ ((1-1)*item_width)+pos ] , _output[ ((impl-1)*item_width)+pos ], ACCEPTABLE_ERROR ) )
+                         { fprintf( stderr, "\t\t WARNING: In opcode [%d], implementation [1] != implemenation [%d] on item [%u -> %d]\n",
+                                    opcode, impl, i, pos+1 );
+                             warns++; }
+                         // stop once ACCEPTABLE_WARNS mismatches have been reported
+                         if ( warns >= ACCEPTABLE_WARNS )
+                         {    fprintf ( stderr, "\t WARNING: One or more mismatching values were found. \n" );
+                              exit( NE10_ERR );
+                         }
+                     }
+                 }
+             }
+             free( _output ); _output = (arm_float_t *) NULL;
+
+             if ( warns >= ACCEPTABLE_WARNS )
+             {
+               fprintf ( stderr, "\t WARNING: One or more mismatching values were found. \n" );
+               exit( NE10_ERR );
+             }
+             // verification passed: fall through so the guarded buffers are released below before returning NE10_OK
+  }
+  else // run a particular implementation
+  {
+      if ( NULL == ftbl[ FTBL_IDX(opcode, impl) ] ) { fprintf ( stderr, "\t WARNING: Implementation number %d of operation number %d was not found. \n", impl, opcode ); exit( NE10_ERR ); }
+      if ( !mute ) printf( "opcode=%d;impl=%d;%d;%d;", opcode, impl, ARRLEN, max );
+      // get the overhead
+      MEASURE( dt_test_overhead,
+               for ( i = 0 ; i < max; i++ )
+               {
+               }
+              );
+       test_operation();
+  }
+
+  // free any allocated memory...
+  free( guarded_cst );
+  free( guarded_src );
+  for ( i = 0; i<IMPL_COUNT; i++ )
+  {
+    free( guarded_dst[i] );
+  }
+
+  return NE10_OK;
+}
diff --git a/headers/versionheader.h b/headers/versionheader.h
new file mode 100644
index 0000000..1e57811
--- /dev/null
+++ b/headers/versionheader.h
@@ -0,0 +1,31 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : headers/versionheader.h
+ */
+
+/////////////////////////////////////////////////////////
+// version information
+/////////////////////////////////////////////////////////
+
+#define VERSION_MAJOR      0
+#define VERSION_MINOR      0
+#define VERSION_REVISION   10
+
+#define PAHSE              1 // NOTE(review): looks like a misspelling of PHASE (headers/versionheader.s defines PHASE) -- confirm before renaming
+#define COPYRIGHT_YEAR     2011
+#define COPYRIGHT_HOLDER   "ARM Ltd."
diff --git a/headers/versionheader.s b/headers/versionheader.s
new file mode 100644
index 0000000..17a6fde
--- /dev/null
+++ b/headers/versionheader.s
@@ -0,0 +1,33 @@
+@
+@  Copyright 2011-12 ARM Limited
+@
+@  Licensed under the Apache License, Version 2.0 (the "License");
+@  you may not use this file except in compliance with the License.
+@  You may obtain a copy of the License at
+@
+@      http://www.apache.org/licenses/LICENSE-2.0
+@
+@  Unless required by applicable law or agreed to in writing, software
+@  distributed under the License is distributed on an "AS IS" BASIS,
+@  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@  See the License for the specific language governing permissions and
+@  limitations under the License.
+@
+
+@
+@ NE10 Library : headers/versionheader.s
+@
+
+@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+@ version information
+@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        .equ      VERSION_MAJOR,    0
+        .equ      VERSION_MINOR,    0
+        .equ      VERSION_REVISION, 10
+
+        .equ      PHASE,            1      @ NOTE(review): headers/versionheader.h spells this constant PAHSE
+        .equ      COPYRIGHT_YEAR,   2011
+
+COPYRIGHT_HOLDER:
+        .asciz                      "ARM Ltd."   @ zero-terminated string stored at the COPYRIGHT_HOLDER label
diff --git a/inc/NE10.h b/inc/NE10.h
new file mode 100644
index 0000000..9effa19
--- /dev/null
+++ b/inc/NE10.h
@@ -0,0 +1,495 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : inc/NE10.h
+ */
+
+/*! \file NE10.h
+    \brief All NE10 routines declarations.
+
+    The routines that are provided by this library are all declared in this header file.
+ */
+
+#include "../headers/versionheader.h"
+#include <NE10_types.h>
+#include <NE10_c.h>
+#include <NE10_asm.h>
+#include <NE10_neon.h>
+
+#ifndef NE10_H
+#define NE10_H
+
+///////////////////////////
+// function prototypes:
+///////////////////////////
+
+
+// ## Vector-Constant Arithmetic ##
+
+/*!
+    Adds a constant scalar value to all the elements of an input array and stores the results in an output array.
+    @param[out] dst   Pointer to the destination array
+    @param[in]  src   Pointer to the source array
+    @param[in]  cst   The constant scalar added to the input values
+    @param[in]  count The number of items in the input array
+ */
+extern arm_result_t (*addc_float)(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count);
+/*!
+    Adds a constant 2D vector to all of the vectors in an input array and stores the results in an output array.
+    @param[out] dst   Pointer to the destination array
+    @param[in]  src   Pointer to the source array
+    @param[in]  cst   Pointer to the 2D vector added to the input values
+    @param[in]  count The number of items in the input array
+ */
+extern arm_result_t (*addc_vec2f)(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count);
+/*!
+    Adds a constant 3D vector to all of the vectors in an input array and stores the results in an output array.
+    @param[out] dst   Pointer to the destination array
+    @param[in]  src   Pointer to the source array
+    @param[in]  cst   Pointer to the 3D vector added to the input values
+    @param[in]  count The number of items in the input array
+ */
+extern arm_result_t (*addc_vec3f)(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count);
+/*!
+    Adds a constant 4D vector to all of the vectors in an input array and stores the results in an output array.
+    @param[out] dst   Pointer to the destination array
+    @param[in]  src   Pointer to the source array
+    @param[in]  cst   Pointer to the 4D vector added to the input values
+    @param[in]  count The number of items in the input array
+ */
+extern arm_result_t (*addc_vec4f)(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count);
+
+
+
+/*!
+    Subtracts a constant scalar from all the elements of an input array and stores the results in an output array.
+    @param[out] dst   Pointer to the destination array
+    @param[in]  src   Pointer to the source array
+    @param[in]  cst   The constant scalar subtracted from the input values
+    @param[in]  count The number of items in the input array
+ */
+extern arm_result_t (*subc_float)(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count);
+/*!
+    Subtracts a constant 2D vector from all of the vectors in an input array and stores the results in an output array.
+    @param[out] dst   Pointer to the destination array
+    @param[in]  src   Pointer to the source array
+    @param[in]  cst   Pointer to the 2D vector subtracted from the input values
+    @param[in]  count The number of items in the input array
+ */
+extern arm_result_t (*subc_vec2f)(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count);
+/*!
+    Subtracts a constant 3D vector from all of the vectors in an input array and stores the results in an output array.
+    @param[out] dst   Pointer to the destination array
+    @param[in]  src   Pointer to the source array
+    @param[in]  cst   Pointer to the 3D vector subtracted from the input values
+    @param[in]  count The number of items in the input array
+ */
+extern arm_result_t (*subc_vec3f)(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count);
+/*!
+    Subtracts a constant 4D vector from all of the vectors in an input array and stores the results in an output array.
+    @param[out] dst   Pointer to the destination array
+    @param[in]  src   Pointer to the source array
+    @param[in]  cst   Pointer to the 4D vector subtracted from the input values
+    @param[in]  count The number of items in the input array
+ */
+extern arm_result_t (*subc_vec4f)(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count);
+
+
+
+/*!
+    Subtracts the elements of an input array from a constant scalar and stores the results in an output array.
+    @param[out] dst   Pointer to the destination array
+    @param[in]  src   Pointer to the source array
+    @param[in]  cst   The constant scalar to subtract the input values from
+    @param[in]  count The number of items in the input array
+ */
+extern arm_result_t (*rsbc_float)(arm_float_t * dst, arm_float_t *src, const arm_float_t cst, unsigned int count);
+/*!
+    Subtracts the vectors in an input array from a constant 2D vector and stores the results in an output array.
+    @param[out] dst   Pointer to the destination array
+    @param[in]  src   Pointer to the source array
+    @param[in]  cst   Pointer to the 2D vector to subtract the input values from
+    @param[in]  count The number of items in the input array
+ */
+extern arm_result_t (*rsbc_vec2f)(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count);
+/*!
+    Subtracts the vectors in an input array from a constant 3D vector and stores the results in an output array.
+    @param[out] dst   Pointer to the destination array
+    @param[in]  src   Pointer to the source array
+    @param[in]  cst   Pointer to the 3D vector to subtract the input values from
+    @param[in]  count The number of items in the input array
+ */
+extern arm_result_t (*rsbc_vec3f)(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count);
+/*!
+    Subtracts the vectors in an input array from a constant 4D vector and stores the results in an output array.
+    @param[out] dst   Pointer to the destination array
+    @param[in]  src   Pointer to the source array
+    @param[in]  cst   Pointer to the 4D vector to subtract the input values from
+    @param[in]  count The number of items in the input array
+ */
+extern arm_result_t (*rsbc_vec4f)(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count);
+
+
+
+/*!
+    Multiplies the elements of an input array by a constant scalar and stores the results in an output array.
+    @param[out] dst   Pointer to the destination array
+    @param[in]  src   Pointer to the source array
+    @param[in]  cst   The constant scalar to multiply the input values with
+    @param[in]  count The number of items in the input array
+ */
+extern arm_result_t (*mulc_float)(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count);
+/*!
+    Multiplies the components of 2D vectors in an input array by the components of a constant 2D vector and stores the results in an output array.
+    @param[out] dst   Pointer to the destination array
+    @param[in]  src   Pointer to the source array
+    @param[in]  cst   Pointer to the 2D vector to multiply the input values with
+    @param[in]  count The number of items in the input array
+ */
+extern arm_result_t (*mulc_vec2f)(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count);
+/*!
+    Multiplies the components of 3D vectors in an input array by the components of a constant 3D vector and stores the results in an output array.
+    @param[out] dst   Pointer to the destination array
+    @param[in]  src   Pointer to the source array
+    @param[in]  cst   Pointer to the 3D vector to multiply the input values with
+    @param[in]  count The number of items in the input array
+ */
+extern arm_result_t (*mulc_vec3f)(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count);
+/*!
+    Multiplies the components of 4D vectors in an input array by the components of a constant 4D vector and stores the results in an output array.
+    @param[out] dst   Pointer to the destination array
+    @param[in]  src   Pointer to the source array
+    @param[in]  cst   Pointer to the 4D vector to multiply the input values with
+    @param[in]  count The number of items in the input array
+ */
+extern arm_result_t (*mulc_vec4f)(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count);
+
+
+
+/*!
+    Divides the elements of an input array by a constant scalar and stores the results in an output array.
+    @param[out] dst   Pointer to the destination array
+    @param[in]  src   Pointer to the source array
+    @param[in]  cst   The constant scalar to divide the input values by
+    @param[in]  count The number of items in the input array
+ */
+extern arm_result_t (*divc_float)(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count);
+/*!
+    Divides the components of 2D vectors in an input array with the components of a constant 2D vector and stores the results in an output array.
+    @param[out] dst   Pointer to the destination array
+    @param[in]  src   Pointer to the source array
+    @param[in]  cst   Pointer to the 2D vector to divide the input values by
+    @param[in]  count The number of items in the input array
+ */
+extern arm_result_t (*divc_vec2f)(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count);
+/*!
+    Divides the components of 3D vectors in an input array with the components of a constant 3D vector and stores the results in an output array.
+    @param[out] dst   Pointer to the destination array
+    @param[in]  src   Pointer to the source array
+    @param[in]  cst   Pointer to the 3D vector to divide the input values by
+    @param[in]  count The number of items in the input array
+ */
+extern arm_result_t (*divc_vec3f)(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count);
+/*!
+    Divides the components of 4D vectors in an input array with the components of a constant 4D vector and stores the results in an output array.
+    @param[out] dst   Pointer to the destination array
+    @param[in]  src   Pointer to the source array
+    @param[in]  cst   Pointer to the 4D vector to divide the input values by
+    @param[in]  count The number of items in the input array
+ */
+extern arm_result_t (*divc_vec4f)(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count);
+
+
+
+/*!
+    Sets the elements of an input array to a constant scalar and stores the results in an output array.
+    @param[out] dst   Pointer to the destination array
+    @param[in]  cst   The constant scalar to set the input values to
+    @param[in]  count The number of items in the input array
+ */
+extern arm_result_t (*setc_float)(arm_float_t * dst, const arm_float_t cst, unsigned int count);
+/*!
+    Sets the components of 2D vectors in an input array to the components of a constant 2D vector and stores the results in an output array.
+    @param[out] dst   Pointer to the destination array
+    @param[in]  cst   Pointer to the 2D vector to set the input values to
+    @param[in]  count The number of items in the input array
+ */
+extern arm_result_t (*setc_vec2f)(arm_vec2f_t * dst, const arm_vec2f_t * cst, unsigned int count);
+/*!
+    Sets the components of 3D vectors in an input array to the components of a constant 3D vector and stores the results in an output array.
+    @param[out] dst   Pointer to the destination array
+    @param[in]  cst   Pointer to the 3D vector to set the input values to
+    @param[in]  count The number of items in the input array
+ */
+extern arm_result_t (*setc_vec3f)(arm_vec3f_t * dst, const arm_vec3f_t * cst, unsigned int count);
+/*!
+    Sets the components of 4D vectors in an input array to the components of a constant 4D vector and stores the results in an output array.
+    @param[out] dst   Pointer to the destination array
+    @param[in]  cst   Pointer to the 4D vector to set the input values to
+    @param[in]  count The number of items in the input array
+ */
+extern arm_result_t (*setc_vec4f)(arm_vec4f_t * dst, const arm_vec4f_t * cst, unsigned int count);
+
+
+
+/*!
+    Multiplies each entry in the source array (src) by cst, then adds the result to
+     the corresponding item of the accumulation array (acc), and stores the result in the destination array.
+    @param[out] dst   Pointer to the destination array
+    @param[in]  acc   The corresponding element is added to the result of the multiplication
+    @param[in]  src   Pointer to the source array
+    @param[in]  cst   The constant scalar to multiply the input elements with
+    @param[in]  count The number of items in the input array
+ */
+extern arm_result_t (*mlac_float)(arm_float_t * dst, arm_float_t * acc, arm_float_t * src, const arm_float_t cst, unsigned int count);
+/*!
+   Multiplies each entry in the source array (src) by the 2D vector cst, then adds the result to
+     the corresponding item of the accumulation array (acc), and stores the result in the destination array.
+    @param[out] dst   Pointer to the destination array
+    @param[in]  acc   The corresponding element is added to the result of the multiplication
+    @param[in]  src   Pointer to the source array
+    @param[in]  cst   Pointer to the 2D vector to multiply the input vectors with
+    @param[in]  count The number of items in the input array
+ */
+extern arm_result_t (*mlac_vec2f)(arm_vec2f_t * dst, arm_vec2f_t * acc, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count);
+/*!
+   Multiplies each entry in the source array (src) by the 3D vector cst, then adds the result to
+     the corresponding item of the accumulation array (acc), and stores the result in the destination array.
+    @param[out] dst   Pointer to the destination array
+    @param[in]  acc   The corresponding element is added to the result of the multiplication
+    @param[in]  src   Pointer to the source array
+    @param[in]  cst   Pointer to the 3D vector to multiply the input vectors with
+    @param[in]  count The number of items in the input array
+ */
+extern arm_result_t (*mlac_vec3f)(arm_vec3f_t * dst, arm_vec3f_t * acc, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count);
+/*!
+   Multiplies each entry in the source array (src) by the 4D vector cst, then adds the result to
+     the corresponding item of the accumulation array (acc), and stores the result in the destination array.
+    @param[out] dst   Pointer to the destination array
+    @param[in]  acc   The corresponding element is added to the result of the multiplication
+    @param[in]  src   Pointer to the source array
+    @param[in]  cst   Pointer to the 4D vector to multiply the input vectors with
+    @param[in]  count The number of items in the input array
+ */
+extern arm_result_t (*mlac_vec4f)(arm_vec4f_t * dst, arm_vec4f_t * acc, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count);
+
+
+
+// ## Arithmetic functions over arrays of cst values ##
+
+/*!
+    Adds the elements of src1 to the elements of src2 and stores the results in the dst.
+    @param[out] dst   Pointer to the destination array
+    @param[in]  src1  The first array to use as the input array
+    @param[in]  src2  The second array to use as the input array
+    @param[in]  count The number of items in the two input arrays
+ */
+extern arm_result_t (*add_float)(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count);
+/*!
+    Subtracts the elements of src2 from the elements of src1 and stores the results in the dst.
+    @param[out] dst   Pointer to the destination array
+    @param[in]  src1  The first array to use as the input array
+    @param[in]  src2  The second array to use as the input array
+    @param[in]  count The number of items in the two input arrays
+ */
+extern arm_result_t (*sub_float)(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count);
+/*!
+    Multiplies the elements of src1 by the elements of src2 and stores the results in the dst.
+    @param[out] dst   Pointer to the destination array
+    @param[in]  src1  The first array to use as the input array
+    @param[in]  src2  The second array to use as the input array
+    @param[in]  count The number of items in the two input arrays
+ */
+extern arm_result_t (*mul_float)(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count);
+/*!
+    Divides the elements of src1 by the elements of src2 and stores the results in the dst.
+    @param[out] dst   Pointer to the destination array
+    @param[in]  src1  The first array to use as the input array
+    @param[in]  src2  The second array to use as the input array
+    @param[in]  count The number of items in the two input arrays
+ */
+extern arm_result_t (*div_float)(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count);
+/*!
+    Performs a multiply and accumulate operation using the corresponding elements in acc, src1, and src2.
+    @param[out] dst   Pointer to the destination array
+    @param[in]  acc   These elements are added to the result of the multiplication operation
+    @param[in]  src1  The first array to use as the input array
+    @param[in]  src2  The second array to use as the input array
+    @param[in]  count The number of items in the two input arrays
+ */
+extern arm_result_t (*mla_float)(arm_float_t * dst, arm_float_t * acc, arm_float_t * src1, arm_float_t * src2, unsigned int count);
+/*!
+    Calculates the absolute value of each element in the source array and stores the result in the corresponding entry of the destination array.
+    @param[out] dst   Pointer to the destination array
+    @param[in]  src   Pointer to the source array
+    @param[in]  count The number of items in the input array
+ */
+extern arm_result_t (*abs_float)(arm_float_t * dst, arm_float_t * src, unsigned int count);
+
+
+
+// ## Operations on Vectors ##
+/*!
+    Returns length of 2D vectors in corresponding elements of the output array.
+    @param[out] dst   Pointer to the destination array
+    @param[in]  src   Pointer to the source array
+    @param[in]  count The number of items in the input array
+ */
+extern arm_result_t (*len_vec2f)(arm_float_t * dst, arm_vec2f_t * src, unsigned int count);
+/*!
+    Returns length of 3D vectors in corresponding elements of the output array.
+    @param[out] dst   Pointer to the destination array
+    @param[in]  src   Pointer to the source array
+    @param[in]  count The number of items in the input array
+ */
+extern arm_result_t (*len_vec3f)(arm_float_t * dst, arm_vec3f_t * src, unsigned int count);
+/*!
+    Calculates the length of each 4D vector in the source array and stores the result in the corresponding element of the destination array.
+    @param[out] dst   Pointer to the destination array
+    @param[in]  src   Pointer to the source array
+    @param[in]  count The number of items in the input array
+ */
+extern arm_result_t (*len_vec4f)(arm_float_t * dst, arm_vec4f_t * src, unsigned int count);
+
+
+
+/*!
+    Normalizes 2D vectors of the input array and stores them in the corresponding elements of the output array.
+    @param[out] dst   Pointer to the destination array
+    @param[in]  src   Pointer to the source array
+    @param[in]  count The number of items in the input array
+ */
+extern arm_result_t (*normalize_vec2f)(arm_vec2f_t * dst, arm_vec2f_t * src, unsigned int count);
+/*!
+    Normalizes 3D vectors of the input array and stores them in the corresponding elements of the output array.
+    @param[out] dst   Pointer to the destination array
+    @param[in]  src   Pointer to the source array
+    @param[in]  count The number of items in the input array
+ */
+extern arm_result_t (*normalize_vec3f)(arm_vec3f_t * dst, arm_vec3f_t * src, unsigned int count);
+/*!
+    Normalizes 4D vectors of the input array and stores them in the corresponding elements of the output array.
+    @param[out] dst   Pointer to the destination array
+    @param[in]  src   Pointer to the source array
+    @param[in]  count The number of items in the input array
+ */
+extern arm_result_t (*normalize_vec4f)(arm_vec4f_t * dst, arm_vec4f_t * src, unsigned int count);
+
+
+
+extern arm_result_t (*abs_vec2f)(arm_vec2f_t * dst, arm_vec2f_t * src, unsigned int count);
+extern arm_result_t (*abs_vec3f)(arm_vec3f_t * dst, arm_vec3f_t * src, unsigned int count);
+extern arm_result_t (*abs_vec4f)(arm_vec4f_t * dst, arm_vec4f_t * src, unsigned int count);
+
+
+
+// ## SIMD Component-wise Arithmetic on Two Vectors ##
+extern arm_result_t (*vmul_vec2f)(arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
+extern arm_result_t (*vmul_vec3f)(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
+extern arm_result_t (*vmul_vec4f)(arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
+
+
+
+extern arm_result_t (*vdiv_vec2f)(arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
+extern arm_result_t (*vdiv_vec3f)(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
+extern arm_result_t (*vdiv_vec4f)(arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
+
+
+
+extern arm_result_t (*vmla_vec2f)(arm_vec2f_t * acc, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
+extern arm_result_t (*vmla_vec3f)(arm_vec3f_t * acc, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
+extern arm_result_t (*vmla_vec4f)(arm_vec4f_t * acc, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
+
+
+
+// ## Vector-Vector Algebra ##
+extern arm_result_t (*add_vec2f)(arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
+extern arm_result_t (*add_vec3f)(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
+extern arm_result_t (*add_vec4f)(arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
+
+
+
+extern arm_result_t (*sub_vec2f)(arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
+extern arm_result_t (*sub_vec3f)(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
+extern arm_result_t (*sub_vec4f)(arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
+
+
+
+extern arm_result_t (*dot_vec2f)(arm_float_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
+extern arm_result_t (*dot_vec3f)(arm_float_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
+extern arm_result_t (*dot_vec4f)(arm_float_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
+
+
+
+extern arm_result_t (*cross_vec3f)(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
+
+
+
+// ## Matrix-Constant Arithmetic ##
+
+// arm_mat4x4f_t
+extern arm_result_t (*add_mat4x4f)(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count);
+extern arm_result_t (*sub_mat4x4f)(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count);
+extern arm_result_t (*mul_mat4x4f)(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count);
+extern arm_result_t (*div_mat4x4f)(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count);
+extern arm_result_t (*set_mat4x4f)(arm_mat4x4f_t * dst, const arm_float_t cst, unsigned int count);
+
+extern arm_result_t (*add_mat3x3f)(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count);
+extern arm_result_t (*sub_mat3x3f)(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count);
+extern arm_result_t (*mul_mat3x3f)(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count);
+extern arm_result_t (*div_mat3x3f)(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count);
+extern arm_result_t (*set_mat3x3f)(arm_mat3x3f_t * dst, const arm_float_t cst, unsigned int count);
+
+extern arm_result_t (*add_mat2x2f)(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count);
+extern arm_result_t (*sub_mat2x2f)(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count);
+extern arm_result_t (*mul_mat2x2f)(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count);
+extern arm_result_t (*div_mat2x2f)(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count);
+extern arm_result_t (*set_mat2x2f)(arm_mat2x2f_t * dst, const arm_float_t cst, unsigned int count);
+
+
+
+// ## Operations on Matrices ##
+extern arm_result_t (*invert_mat4x4f)(arm_mat4x4f_t * dst, arm_mat4x4f_t * src, unsigned int count);
+extern arm_result_t (*det_mat4x4f)(arm_mat4x4f_t * dst, arm_mat4x4f_t * src, unsigned int count);
+extern arm_result_t (*trans_mat4x4f)(arm_mat4x4f_t * dst, arm_mat4x4f_t * src, unsigned int count);
+extern arm_result_t (*identity_mat4x4f)(arm_mat4x4f_t * dst, unsigned int count);
+
+extern arm_result_t (*invert_mat3x3f)(arm_mat3x3f_t * dst, arm_mat3x3f_t * src, unsigned int count);
+extern arm_result_t (*det_mat3x3f)(arm_mat3x3f_t * dst, arm_mat3x3f_t * src, unsigned int count);
+extern arm_result_t (*trans_mat3x3f)(arm_mat3x3f_t * dst, arm_mat3x3f_t * src, unsigned int count);
+extern arm_result_t (*identity_mat3x3f)(arm_mat3x3f_t * dst, unsigned int count);
+
+extern arm_result_t (*invert_mat2x2f)(arm_mat2x2f_t * dst, arm_mat2x2f_t * src, unsigned int count);
+extern arm_result_t (*det_mat2x2f)(arm_mat2x2f_t * dst, arm_mat2x2f_t * src, unsigned int count);
+extern arm_result_t (*trans_mat2x2f)(arm_mat2x2f_t * dst, arm_mat2x2f_t * src, unsigned int count);
+extern arm_result_t (*identity_mat2x2f)(arm_mat2x2f_t * dst, unsigned int count);
+
+
+
+// ## Matrix-Vector Algebra ##
+extern arm_result_t (*trans_mat4x4f_vec4f)(arm_vec4f_t * dst, arm_mat4x4f_t * mat, arm_vec4f_t * vec, unsigned int count);
+extern arm_result_t (*trans_mat3x3f_vec4f)(arm_vec4f_t * dst, arm_mat3x3f_t * mat, arm_vec4f_t * vec, unsigned int count);
+extern arm_result_t (*trans_mat2x2f_vec4f)(arm_vec4f_t * dst, arm_mat2x2f_t * mat, arm_vec4f_t * vec, unsigned int count);
+
+
+
+// ## Matrix-Matrix Algebra ##
+extern arm_result_t (*multrans_mat4x4f)(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count);
+extern arm_result_t (*multrans_mat3x3f)(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count);
+extern arm_result_t (*multrans_mat2x2f)(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count);
+
+#endif
diff --git a/inc/NE10_asm.h b/inc/NE10_asm.h
new file mode 100644
index 0000000..ae1ef16
--- /dev/null
+++ b/inc/NE10_asm.h
@@ -0,0 +1,204 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : inc/NE10_asm.h
+ */
+
+#include "../headers/versionheader.h"
+
+#ifndef NE10_ASM_H
+#define NE10_ASM_H
+
+
+///////////////////////////
+// function prototypes:
+///////////////////////////
+
+// ## Vector-Constant Arithmetic ##
+
+extern arm_result_t addc_float_asm(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count);
+extern arm_result_t addc_vec2f_asm(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count);
+extern arm_result_t addc_vec3f_asm(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count);
+extern arm_result_t addc_vec4f_asm(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count);
+
+
+
+extern arm_result_t subc_float_asm(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count); // subtract cst from the element(s)
+extern arm_result_t subc_vec2f_asm(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count); // subtract cst from the element(s)
+extern arm_result_t subc_vec3f_asm(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count); // subtract cst from the element(s)
+extern arm_result_t subc_vec4f_asm(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count); // subtract cst from the element(s)
+
+
+
+extern arm_result_t rsbc_float_asm(arm_float_t * dst, arm_float_t *src, const arm_float_t cst, unsigned int count); // subtract element(s) from a cst
+extern arm_result_t rsbc_vec2f_asm(arm_vec2f_t * dst, arm_vec2f_t *src, const arm_vec2f_t * cst, unsigned int count); // subtract element(s) from a cst
+extern arm_result_t rsbc_vec3f_asm(arm_vec3f_t * dst, arm_vec3f_t *src, const arm_vec3f_t * cst, unsigned int count); // subtract element(s) from a cst
+extern arm_result_t rsbc_vec4f_asm(arm_vec4f_t * dst, arm_vec4f_t *src, const arm_vec4f_t * cst, unsigned int count); // subtract element(s) from a cst
+
+
+
+extern arm_result_t mulc_float_asm(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count);
+extern arm_result_t mulc_vec2f_asm(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count);
+extern arm_result_t mulc_vec3f_asm(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count);
+extern arm_result_t mulc_vec4f_asm(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count);
+
+
+
+extern arm_result_t divc_float_asm(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count);
+extern arm_result_t divc_vec2f_asm(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count);
+extern arm_result_t divc_vec3f_asm(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count);
+extern arm_result_t divc_vec4f_asm(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count);
+
+
+
+extern arm_result_t setc_float_asm(arm_float_t * dst, const arm_float_t cst, unsigned int count);
+extern arm_result_t setc_vec2f_asm(arm_vec2f_t * dst, const arm_vec2f_t * cst, unsigned int count);
+extern arm_result_t setc_vec3f_asm(arm_vec3f_t * dst, const arm_vec3f_t * cst, unsigned int count);
+extern arm_result_t setc_vec4f_asm(arm_vec4f_t * dst, const arm_vec4f_t * cst, unsigned int count);
+
+
+
+extern arm_result_t mlac_float_asm(arm_float_t * dst, arm_float_t * acc, arm_float_t * src, const arm_float_t cst, unsigned int count);
+extern arm_result_t mlac_vec2f_asm(arm_vec2f_t * dst, arm_vec2f_t * acc, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count);
+extern arm_result_t mlac_vec3f_asm(arm_vec3f_t * dst, arm_vec3f_t * acc, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count);
+extern arm_result_t mlac_vec4f_asm(arm_vec4f_t * dst, arm_vec4f_t * acc, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count);
+
+
+
+// ## Arithmetic functions over arrays of float values ##
+extern arm_result_t add_float_asm(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count);
+extern arm_result_t sub_float_asm(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count);
+extern arm_result_t mul_float_asm(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count);
+extern arm_result_t div_float_asm(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count);
+extern arm_result_t mla_float_asm(arm_float_t * dst, arm_float_t * acc, arm_float_t * src1, arm_float_t * src2, unsigned int count);
+extern arm_result_t abs_float_asm(arm_float_t * dst, arm_float_t * src, unsigned int count);
+
+// ## Operations on Vectors ##
+extern arm_result_t len_vec2f_asm(arm_float_t * dst, arm_vec2f_t * src, unsigned int count);
+extern arm_result_t len_vec3f_asm(arm_float_t * dst, arm_vec3f_t * src, unsigned int count);
+extern arm_result_t len_vec4f_asm(arm_float_t * dst, arm_vec4f_t * src, unsigned int count);
+
+
+
+extern arm_result_t normalize_vec2f_asm(arm_vec2f_t * dst, arm_vec2f_t * src, unsigned int count);
+extern arm_result_t normalize_vec3f_asm(arm_vec3f_t * dst, arm_vec3f_t * src, unsigned int count);
+extern arm_result_t normalize_vec4f_asm(arm_vec4f_t * dst, arm_vec4f_t * src, unsigned int count);
+
+
+
+extern arm_result_t abs_vec2f_asm(arm_vec2f_t * dst, arm_vec2f_t * src, unsigned int count);
+extern arm_result_t abs_vec3f_asm(arm_vec3f_t * dst, arm_vec3f_t * src, unsigned int count);
+extern arm_result_t abs_vec4f_asm(arm_vec4f_t * dst, arm_vec4f_t * src, unsigned int count);
+
+
+
+// ## SIMD Component-wise Arithmetic on Two Vectors ##
+extern arm_result_t vmul_vec2f_asm(arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
+extern arm_result_t vmul_vec3f_asm(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
+extern arm_result_t vmul_vec4f_asm(arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
+
+
+
+extern arm_result_t vdiv_vec2f_asm(arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
+extern arm_result_t vdiv_vec3f_asm(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
+extern arm_result_t vdiv_vec4f_asm(arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
+
+
+
+extern arm_result_t vmla_vec2f_asm(arm_vec2f_t * acc, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
+extern arm_result_t vmla_vec3f_asm(arm_vec3f_t * acc, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
+extern arm_result_t vmla_vec4f_asm(arm_vec4f_t * acc, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
+
+
+
+// ## Vector-Vector Algebra ##
+extern arm_result_t add_vec2f_asm(arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
+extern arm_result_t add_vec3f_asm(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
+extern arm_result_t add_vec4f_asm(arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
+
+
+
+extern arm_result_t sub_vec2f_asm(arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
+extern arm_result_t sub_vec3f_asm(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
+extern arm_result_t sub_vec4f_asm(arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
+
+
+
+extern arm_result_t dot_vec2f_asm(arm_float_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
+extern arm_result_t dot_vec3f_asm(arm_float_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
+extern arm_result_t dot_vec4f_asm(arm_float_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
+
+
+
+extern arm_result_t cross_vec3f_asm(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
+
+
+
+// ## Matrix-Constant Arithmetic ##
+
+// arm_mat4x4f_t
+extern arm_result_t add_mat4x4f_asm(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count);
+extern arm_result_t sub_mat4x4f_asm(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count);
+extern arm_result_t mul_mat4x4f_asm(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count);
+extern arm_result_t div_mat4x4f_asm(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count);
+extern arm_result_t set_mat4x4f_asm(arm_mat4x4f_t * dst, const arm_float_t cst, unsigned int count);
+
+extern arm_result_t add_mat3x3f_asm(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count);
+extern arm_result_t sub_mat3x3f_asm(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count);
+extern arm_result_t mul_mat3x3f_asm(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count);
+extern arm_result_t div_mat3x3f_asm(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count);
+extern arm_result_t set_mat3x3f_asm(arm_mat3x3f_t * dst, const arm_float_t cst, unsigned int count);
+
+extern arm_result_t add_mat2x2f_asm(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count);
+extern arm_result_t sub_mat2x2f_asm(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count);
+extern arm_result_t mul_mat2x2f_asm(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count);
+extern arm_result_t div_mat2x2f_asm(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count);
+extern arm_result_t set_mat2x2f_asm(arm_mat2x2f_t * dst, const arm_float_t cst, unsigned int count);
+
+
+
+// ## Operations on Matrices ##
+extern arm_result_t invert_mat4x4f_asm(arm_mat4x4f_t * dst, arm_mat4x4f_t * src, unsigned int count);
+extern arm_result_t det_mat4x4f_asm(arm_mat4x4f_t * dst, arm_mat4x4f_t * src, unsigned int count);
+extern arm_result_t trans_mat4x4f_asm(arm_mat4x4f_t * dst, arm_mat4x4f_t * src, unsigned int count);
+extern arm_result_t identity_mat4x4f_asm(arm_mat4x4f_t * dst, unsigned int count);
+
+extern arm_result_t invert_mat3x3f_asm(arm_mat3x3f_t * dst, arm_mat3x3f_t * src, unsigned int count);
+extern arm_result_t det_mat3x3f_asm(arm_mat3x3f_t * dst, arm_mat3x3f_t * src, unsigned int count);
+extern arm_result_t trans_mat3x3f_asm(arm_mat3x3f_t * dst, arm_mat3x3f_t * src, unsigned int count);
+extern arm_result_t identity_mat3x3f_asm(arm_mat3x3f_t * dst, unsigned int count);
+
+extern arm_result_t invert_mat2x2f_asm(arm_mat2x2f_t * dst, arm_mat2x2f_t * src, unsigned int count);
+extern arm_result_t det_mat2x2f_asm(arm_mat2x2f_t * dst, arm_mat2x2f_t * src, unsigned int count);
+extern arm_result_t trans_mat2x2f_asm(arm_mat2x2f_t * dst, arm_mat2x2f_t * src, unsigned int count);
+extern arm_result_t identity_mat2x2f_asm(arm_mat2x2f_t * dst, unsigned int count);
+
+
+
+// ## Matrix-Vector Algebra ##
+extern arm_result_t trans_mat4x4f_vec4f_asm(arm_vec4f_t * dst, arm_mat4x4f_t * mat, arm_vec4f_t * vec, unsigned int count);
+extern arm_result_t trans_mat3x3f_vec4f_asm(arm_vec4f_t * dst, arm_mat3x3f_t * mat, arm_vec4f_t * vec, unsigned int count);
+extern arm_result_t trans_mat2x2f_vec4f_asm(arm_vec4f_t * dst, arm_mat2x2f_t * mat, arm_vec4f_t * vec, unsigned int count);
+
+
+
+// ## Matrix-Matrix Algebra ##
+extern arm_result_t multrans_mat4x4f_asm(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count);
+extern arm_result_t multrans_mat3x3f_asm(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count);
+extern arm_result_t multrans_mat2x2f_asm(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count);
+
+#endif
diff --git a/inc/NE10_c.h b/inc/NE10_c.h
new file mode 100644
index 0000000..2c68fa8
--- /dev/null
+++ b/inc/NE10_c.h
@@ -0,0 +1,202 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : inc/NE10_c.h
+ */
+
+#include "../headers/versionheader.h"
+#include <NE10_types.h>
+
+#ifndef NE10_C_H
+#define NE10_C_H
+///////////////////////////
+// function prototypes:
+///////////////////////////
+
+
+// ## Vector-Constant Arithmetic ##
+
+extern arm_result_t addc_float_c(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count);
+extern arm_result_t addc_vec2f_c(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count);
+extern arm_result_t addc_vec3f_c(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count);
+extern arm_result_t addc_vec4f_c(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count);
+
+
+
+extern arm_result_t subc_float_c(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count); // subtract cst from the element(s)
+extern arm_result_t subc_vec2f_c(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count); // subtract cst from the element(s)
+extern arm_result_t subc_vec3f_c(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count); // subtract cst from the element(s)
+extern arm_result_t subc_vec4f_c(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count); // subtract cst from the element(s)
+
+
+
+extern arm_result_t rsbc_float_c(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count); // subtract element(s) from a cst
+extern arm_result_t rsbc_vec2f_c(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count); // subtract element(s) from a cst
+extern arm_result_t rsbc_vec3f_c(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count); // subtract element(s) from a cst
+extern arm_result_t rsbc_vec4f_c(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count); // subtract element(s) from a cst
+
+
+
+extern arm_result_t mulc_float_c(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count);
+extern arm_result_t mulc_vec2f_c(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count);
+extern arm_result_t mulc_vec3f_c(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count);
+extern arm_result_t mulc_vec4f_c(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count);
+
+
+
+extern arm_result_t divc_float_c(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count);
+extern arm_result_t divc_vec2f_c(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count);
+extern arm_result_t divc_vec3f_c(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count);
+extern arm_result_t divc_vec4f_c(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count);
+
+
+
+extern arm_result_t setc_float_c(arm_float_t * dst, const arm_float_t cst, unsigned int count);
+extern arm_result_t setc_vec2f_c(arm_vec2f_t * dst, const arm_vec2f_t * cst, unsigned int count);
+extern arm_result_t setc_vec3f_c(arm_vec3f_t * dst, const arm_vec3f_t * cst, unsigned int count);
+extern arm_result_t setc_vec4f_c(arm_vec4f_t * dst, const arm_vec4f_t * cst, unsigned int count);
+
+
+
+extern arm_result_t mlac_float_c(arm_float_t * dst, arm_float_t * acc, arm_float_t * src, const arm_float_t cst, unsigned int count);
+extern arm_result_t mlac_vec2f_c(arm_vec2f_t * dst, arm_vec2f_t * acc, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count);
+extern arm_result_t mlac_vec3f_c(arm_vec3f_t * dst, arm_vec3f_t * acc, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count);
+extern arm_result_t mlac_vec4f_c(arm_vec4f_t * dst, arm_vec4f_t * acc, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count);
+
+// ## Arithmetic functions over arrays of float values ##
+extern arm_result_t add_float_c(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count);
+extern arm_result_t sub_float_c(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count);
+extern arm_result_t mul_float_c(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count);
+extern arm_result_t div_float_c(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count);
+extern arm_result_t mla_float_c(arm_float_t * dst, arm_float_t * acc, arm_float_t * src1, arm_float_t * src2, unsigned int count);
+extern arm_result_t abs_float_c(arm_float_t * dst, arm_float_t * src, unsigned int count);
+
+// ## Operations on Vectors ##
+extern arm_result_t len_vec2f_c(arm_float_t * dst, arm_vec2f_t * src, unsigned int count);
+extern arm_result_t len_vec3f_c(arm_float_t * dst, arm_vec3f_t * src, unsigned int count);
+extern arm_result_t len_vec4f_c(arm_float_t * dst, arm_vec4f_t * src, unsigned int count);
+
+
+
+extern arm_result_t normalize_vec2f_c(arm_vec2f_t * dst, arm_vec2f_t * src, unsigned int count);
+extern arm_result_t normalize_vec3f_c(arm_vec3f_t * dst, arm_vec3f_t * src, unsigned int count);
+extern arm_result_t normalize_vec4f_c(arm_vec4f_t * dst, arm_vec4f_t * src, unsigned int count);
+
+
+
+extern arm_result_t abs_vec2f_c(arm_vec2f_t * dst, arm_vec2f_t * src, unsigned int count);
+extern arm_result_t abs_vec3f_c(arm_vec3f_t * dst, arm_vec3f_t * src, unsigned int count);
+extern arm_result_t abs_vec4f_c(arm_vec4f_t * dst, arm_vec4f_t * src, unsigned int count);
+
+
+
+// ## SIMD Component-wise Arithmetic on Two Vectors ##
+extern arm_result_t vmul_vec2f_c(arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
+extern arm_result_t vmul_vec3f_c(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
+extern arm_result_t vmul_vec4f_c(arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
+
+
+
+extern arm_result_t vdiv_vec2f_c(arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
+extern arm_result_t vdiv_vec3f_c(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
+extern arm_result_t vdiv_vec4f_c(arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
+
+
+
+extern arm_result_t vmla_vec2f_c(arm_vec2f_t * acc, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
+extern arm_result_t vmla_vec3f_c(arm_vec3f_t * acc, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
+extern arm_result_t vmla_vec4f_c(arm_vec4f_t * acc, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
+
+
+
+// ## Vector-Vector Algebra ##
+extern arm_result_t add_vec2f_c(arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
+extern arm_result_t add_vec3f_c(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
+extern arm_result_t add_vec4f_c(arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
+
+
+
+extern arm_result_t sub_vec2f_c(arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
+extern arm_result_t sub_vec3f_c(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
+extern arm_result_t sub_vec4f_c(arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
+
+
+
+extern arm_result_t dot_vec2f_c(arm_float_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
+extern arm_result_t dot_vec3f_c(arm_float_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
+extern arm_result_t dot_vec4f_c(arm_float_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
+
+
+
+extern arm_result_t cross_vec3f_c(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
+
+
+
+// ## Matrix-Constant Arithmetic ##
+
+// arm_mat4x4f_t
+extern arm_result_t add_mat4x4f_c(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count);
+extern arm_result_t sub_mat4x4f_c(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count);
+extern arm_result_t mul_mat4x4f_c(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count);
+extern arm_result_t div_mat4x4f_c(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count);
+extern arm_result_t set_mat4x4f_c(arm_mat4x4f_t * dst, const arm_float_t cst, unsigned int count);
+
+extern arm_result_t add_mat3x3f_c(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count);
+extern arm_result_t sub_mat3x3f_c(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count);
+extern arm_result_t mul_mat3x3f_c(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count);
+extern arm_result_t div_mat3x3f_c(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count);
+extern arm_result_t set_mat3x3f_c(arm_mat3x3f_t * dst, const arm_float_t cst, unsigned int count);
+
+extern arm_result_t add_mat2x2f_c(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count);
+extern arm_result_t sub_mat2x2f_c(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count);
+extern arm_result_t mul_mat2x2f_c(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count);
+extern arm_result_t div_mat2x2f_c(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count);
+extern arm_result_t set_mat2x2f_c(arm_mat2x2f_t * dst, const arm_float_t cst, unsigned int count);
+
+
+
+// ## Operations on Matrices ##
+extern arm_result_t invert_mat4x4f_c(arm_mat4x4f_t * dst, arm_mat4x4f_t * src, unsigned int count);
+extern arm_result_t det_mat4x4f_c(arm_mat4x4f_t * dst, arm_mat4x4f_t * src, unsigned int count);
+extern arm_result_t trans_mat4x4f_c(arm_mat4x4f_t * dst, arm_mat4x4f_t * src, unsigned int count);
+extern arm_result_t identity_mat4x4f_c(arm_mat4x4f_t * dst, unsigned int count);
+
+extern arm_result_t invert_mat3x3f_c(arm_mat3x3f_t * dst, arm_mat3x3f_t * src, unsigned int count);
+extern arm_result_t det_mat3x3f_c(arm_mat3x3f_t * dst, arm_mat3x3f_t * src, unsigned int count);
+extern arm_result_t trans_mat3x3f_c(arm_mat3x3f_t * dst, arm_mat3x3f_t * src, unsigned int count);
+extern arm_result_t identity_mat3x3f_c(arm_mat3x3f_t * dst, unsigned int count);
+
+extern arm_result_t invert_mat2x2f_c(arm_mat2x2f_t * dst, arm_mat2x2f_t * src, unsigned int count);
+extern arm_result_t det_mat2x2f_c(arm_mat2x2f_t * dst, arm_mat2x2f_t * src, unsigned int count);
+extern arm_result_t trans_mat2x2f_c(arm_mat2x2f_t * dst, arm_mat2x2f_t * src, unsigned int count);
+extern arm_result_t identity_mat2x2f_c(arm_mat2x2f_t * dst, unsigned int count);
+
+
+
+// ## Matrix-Vector Algebra ##
+extern arm_result_t trans_mat4x4f_vec4f_c(arm_vec4f_t * dst, arm_mat4x4f_t * mat, arm_vec4f_t * vec, unsigned int count);
+extern arm_result_t trans_mat3x3f_vec4f_c(arm_vec4f_t * dst, arm_mat3x3f_t * mat, arm_vec4f_t * vec, unsigned int count);
+extern arm_result_t trans_mat2x2f_vec4f_c(arm_vec4f_t * dst, arm_mat2x2f_t * mat, arm_vec4f_t * vec, unsigned int count);
+
+
+
+// ## Matrix-Matrix Algebra ##
+extern arm_result_t multrans_mat4x4f_c(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count);
+extern arm_result_t multrans_mat3x3f_c(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count);
+extern arm_result_t multrans_mat2x2f_c(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count);
+
+#endif
diff --git a/inc/NE10_neon.h b/inc/NE10_neon.h
new file mode 100644
index 0000000..411a659
--- /dev/null
+++ b/inc/NE10_neon.h
@@ -0,0 +1,204 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : inc/NE10_neon.h
+ */
+
+#include "../headers/versionheader.h"
+
+#ifndef NE10_NEON_H
+#define NE10_NEON_H
+
+///////////////////////////
+// function prototypes:
+///////////////////////////
+
+
+// ## Vector-Constant Arithmetic ##
+
+extern arm_result_t addc_float_neon(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count);
+extern arm_result_t addc_vec2f_neon(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count);
+extern arm_result_t addc_vec3f_neon(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count);
+extern arm_result_t addc_vec4f_neon(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count);
+
+
+
+extern arm_result_t subc_float_neon(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count); // subtract cst from the element(s)
+extern arm_result_t subc_vec2f_neon(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count); // subtract cst from the element(s)
+extern arm_result_t subc_vec3f_neon(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count); // subtract cst from the element(s)
+extern arm_result_t subc_vec4f_neon(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count); // subtract cst from the element(s)
+
+
+
+extern arm_result_t rsbc_float_neon(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count); // subtract element(s) from a cst
+extern arm_result_t rsbc_vec2f_neon(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count); // subtract element(s) from a cst
+extern arm_result_t rsbc_vec3f_neon(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count); // subtract element(s) from a cst
+extern arm_result_t rsbc_vec4f_neon(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count); // subtract element(s) from a cst
+
+
+
+extern arm_result_t mulc_float_neon(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count);
+extern arm_result_t mulc_vec2f_neon(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count);
+extern arm_result_t mulc_vec3f_neon(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count);
+extern arm_result_t mulc_vec4f_neon(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count);
+
+
+
+extern arm_result_t divc_float_neon(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count);
+extern arm_result_t divc_vec2f_neon(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count);
+extern arm_result_t divc_vec3f_neon(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count);
+extern arm_result_t divc_vec4f_neon(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count);
+
+
+
+extern arm_result_t setc_float_neon(arm_float_t * dst, const arm_float_t cst, unsigned int count);
+extern arm_result_t setc_vec2f_neon(arm_vec2f_t * dst, const arm_vec2f_t * cst, unsigned int count);
+extern arm_result_t setc_vec3f_neon(arm_vec3f_t * dst, const arm_vec3f_t * cst, unsigned int count);
+extern arm_result_t setc_vec4f_neon(arm_vec4f_t * dst, const arm_vec4f_t * cst, unsigned int count);
+
+
+
+extern arm_result_t mlac_float_neon(arm_float_t * dst, arm_float_t * acc, arm_float_t * src, const arm_float_t cst, unsigned int count);
+extern arm_result_t mlac_vec2f_neon(arm_vec2f_t * dst, arm_vec2f_t * acc, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count);
+extern arm_result_t mlac_vec3f_neon(arm_vec3f_t * dst, arm_vec3f_t * acc, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count);
+extern arm_result_t mlac_vec4f_neon(arm_vec4f_t * dst, arm_vec4f_t * acc, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count);
+
+
+
+// ## Arithmetic functions over arrays of float values ##
+extern arm_result_t add_float_neon(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count);
+extern arm_result_t sub_float_neon(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count);
+extern arm_result_t mul_float_neon(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count);
+extern arm_result_t div_float_neon(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count);
+extern arm_result_t mla_float_neon(arm_float_t * dst, arm_float_t * acc, arm_float_t * src1, arm_float_t * src2, unsigned int count);
+extern arm_result_t abs_float_neon(arm_float_t * dst, arm_float_t * src, unsigned int count);
+
+// ## Operations on Vectors ##
+extern arm_result_t len_vec2f_neon(arm_float_t * dst, arm_vec2f_t * src, unsigned int count);
+extern arm_result_t len_vec3f_neon(arm_float_t * dst, arm_vec3f_t * src, unsigned int count);
+extern arm_result_t len_vec4f_neon(arm_float_t * dst, arm_vec4f_t * src, unsigned int count);
+
+
+
+extern arm_result_t normalize_vec2f_neon(arm_vec2f_t * dst, arm_vec2f_t * src, unsigned int count);
+extern arm_result_t normalize_vec3f_neon(arm_vec3f_t * dst, arm_vec3f_t * src, unsigned int count);
+extern arm_result_t normalize_vec4f_neon(arm_vec4f_t * dst, arm_vec4f_t * src, unsigned int count);
+
+
+
+extern arm_result_t abs_vec2f_neon(arm_vec2f_t * dst, arm_vec2f_t * src, unsigned int count);
+extern arm_result_t abs_vec3f_neon(arm_vec3f_t * dst, arm_vec3f_t * src, unsigned int count);
+extern arm_result_t abs_vec4f_neon(arm_vec4f_t * dst, arm_vec4f_t * src, unsigned int count);
+
+
+
+// ## SIMD Component-wise Arithmetic on Two Vectors ##
+extern arm_result_t vmul_vec2f_neon(arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
+extern arm_result_t vmul_vec3f_neon(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
+extern arm_result_t vmul_vec4f_neon(arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
+
+
+
+extern arm_result_t vdiv_vec2f_neon(arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
+extern arm_result_t vdiv_vec3f_neon(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
+extern arm_result_t vdiv_vec4f_neon(arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
+
+
+
+extern arm_result_t vmla_vec2f_neon(arm_vec2f_t * acc, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
+extern arm_result_t vmla_vec3f_neon(arm_vec3f_t * acc, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
+extern arm_result_t vmla_vec4f_neon(arm_vec4f_t * acc, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
+
+
+
+// ## Vector-Vector Algebra ##
+extern arm_result_t add_vec2f_neon(arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
+extern arm_result_t add_vec3f_neon(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
+extern arm_result_t add_vec4f_neon(arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
+
+
+
+extern arm_result_t sub_vec2f_neon(arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
+extern arm_result_t sub_vec3f_neon(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
+extern arm_result_t sub_vec4f_neon(arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
+
+
+
+extern arm_result_t dot_vec2f_neon(arm_float_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
+extern arm_result_t dot_vec3f_neon(arm_float_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
+extern arm_result_t dot_vec4f_neon(arm_float_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
+
+
+
+extern arm_result_t cross_vec3f_neon(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
+
+
+
+// ## Matrix Arithmetic (two-matrix operations and set-to-constant) ##
+
+// arm_mat4x4f_t
+extern arm_result_t add_mat4x4f_neon(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count);
+extern arm_result_t sub_mat4x4f_neon(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count);
+extern arm_result_t mul_mat4x4f_neon(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count);
+extern arm_result_t div_mat4x4f_neon(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count);
+extern arm_result_t set_mat4x4f_neon(arm_mat4x4f_t * dst, const arm_float_t cst, unsigned int count);
+
+extern arm_result_t add_mat3x3f_neon(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count);
+extern arm_result_t sub_mat3x3f_neon(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count);
+extern arm_result_t mul_mat3x3f_neon(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count);
+extern arm_result_t div_mat3x3f_neon(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count);
+extern arm_result_t set_mat3x3f_neon(arm_mat3x3f_t * dst, const arm_float_t cst, unsigned int count);
+
+extern arm_result_t add_mat2x2f_neon(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count);
+extern arm_result_t sub_mat2x2f_neon(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count);
+extern arm_result_t mul_mat2x2f_neon(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count);
+extern arm_result_t div_mat2x2f_neon(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count);
+extern arm_result_t set_mat2x2f_neon(arm_mat2x2f_t * dst, const arm_float_t cst, unsigned int count);
+
+
+
+// ## Operations on Matrices ##
+extern arm_result_t invert_mat4x4f_neon(arm_mat4x4f_t * dst, arm_mat4x4f_t * src, unsigned int count);
+extern arm_result_t det_mat4x4f_neon(arm_mat4x4f_t * dst, arm_mat4x4f_t * src, unsigned int count);
+extern arm_result_t trans_mat4x4f_neon(arm_mat4x4f_t * dst, arm_mat4x4f_t * src, unsigned int count);
+extern arm_result_t identity_mat4x4f_neon(arm_mat4x4f_t * dst, unsigned int count);
+
+extern arm_result_t invert_mat3x3f_neon(arm_mat3x3f_t * dst, arm_mat3x3f_t * src, unsigned int count);
+extern arm_result_t det_mat3x3f_neon(arm_mat3x3f_t * dst, arm_mat3x3f_t * src, unsigned int count);
+extern arm_result_t trans_mat3x3f_neon(arm_mat3x3f_t * dst, arm_mat3x3f_t * src, unsigned int count);
+extern arm_result_t identity_mat3x3f_neon(arm_mat3x3f_t * dst, unsigned int count);
+
+extern arm_result_t invert_mat2x2f_neon(arm_mat2x2f_t * dst, arm_mat2x2f_t * src, unsigned int count);
+extern arm_result_t det_mat2x2f_neon(arm_mat2x2f_t * dst, arm_mat2x2f_t * src, unsigned int count);
+extern arm_result_t trans_mat2x2f_neon(arm_mat2x2f_t * dst, arm_mat2x2f_t * src, unsigned int count);
+extern arm_result_t identity_mat2x2f_neon(arm_mat2x2f_t * dst, unsigned int count);
+
+
+
+// ## Matrix-Vector Algebra ##
+extern arm_result_t trans_mat4x4f_vec4f_neon(arm_vec4f_t * dst, arm_mat4x4f_t * mat, arm_vec4f_t * vec, unsigned int count);
+extern arm_result_t trans_mat3x3f_vec4f_neon(arm_vec4f_t * dst, arm_mat3x3f_t * mat, arm_vec4f_t * vec, unsigned int count);
+extern arm_result_t trans_mat2x2f_vec4f_neon(arm_vec4f_t * dst, arm_mat2x2f_t * mat, arm_vec4f_t * vec, unsigned int count);
+
+
+
+// ## Matrix-Matrix Algebra ##
+extern arm_result_t multrans_mat4x4f_neon(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count);
+extern arm_result_t multrans_mat3x3f_neon(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count);
+extern arm_result_t multrans_mat2x2f_neon(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count);
+
+#endif
diff --git a/inc/NE10_types.h b/inc/NE10_types.h
new file mode 100644
index 0000000..0996b7a
--- /dev/null
+++ b/inc/NE10_types.h
@@ -0,0 +1,94 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : inc/NE10_types.h
+ */
+
+/** NE10 defines a number of types for use in its function signatures.
+ *  The types are defined within this header file.
+ */
+
+#ifndef NE10_TYPES_H
+#define NE10_TYPES_H
+
+/////////////////////////////////////////////////////////
+// constant values that are used across the library
+/////////////////////////////////////////////////////////
+#define NE10_OK 0
+#define NE10_ERR -1
+
+/////////////////////////////////////////////////////////
+// some external definitions to be exposed to the users
+/////////////////////////////////////////////////////////
+typedef float arm_float_t;      // a single float value
+typedef int   arm_result_t;     // resulting [error-]code
+
+typedef struct
+{
+        float x; // first component
+        float y; // second component
+} arm_vec2f_t; // a 2-tuple of float values
+
+typedef struct
+{
+        float x; // first component
+        float y; // second component
+        float z; // third component
+} arm_vec3f_t; // a 3-tuple of float values
+
+typedef struct
+{
+        float x; // first component
+        float y; // second component
+        float z; // third component
+        float w; // fourth component
+} arm_vec4f_t; // a 4-tuple of float values
+
+
+typedef struct { float r1; float r2; } arm_mat_row2f; // two floats; the type of each cN member of arm_mat2x2f_t
+
+typedef struct
+{
+        arm_mat_row2f c1; // first group of two floats (r1, r2)
+        arm_mat_row2f c2; // second group of two floats (r1, r2)
+        // NOTE(review): the cN/rK names suggest column-major storage (cN = column N, rK = row K) although the member type is called "row" - confirm against the matrix routines
+} arm_mat2x2f_t;     // a 2x2 matrix
+
+
+typedef struct { float r1; float r2; float r3; }  arm_mat_row3f; // three floats; the type of each cN member of arm_mat3x3f_t
+
+typedef struct
+{
+        arm_mat_row3f c1; // first group of three floats (r1, r2, r3)
+        arm_mat_row3f c2; // second group of three floats (r1, r2, r3)
+        arm_mat_row3f c3; // third group of three floats (r1, r2, r3)
+        // NOTE(review): the cN/rK names suggest column-major storage (cN = column N, rK = row K) although the member type is called "row" - confirm against the matrix routines
+} arm_mat3x3f_t;     // a 3x3 matrix
+
+
+typedef struct { float r1; float r2; float r3; float r4; } arm_mat_row4f; // four floats; the type of each cN member of arm_mat4x4f_t
+
+typedef struct
+{
+        arm_mat_row4f c1; // first group of four floats (r1..r4)
+        arm_mat_row4f c2; // second group of four floats (r1..r4)
+        arm_mat_row4f c3; // third group of four floats (r1..r4)
+        arm_mat_row4f c4; // fourth group of four floats (r1..r4)
+        // NOTE(review): the cN/rK names suggest column-major storage (cN = column N, rK = row K) although the member type is called "row" - confirm against the matrix routines
+} arm_mat4x4f_t;     // a 4x4 matrix
+
+#endif
diff --git a/nightly.pl b/nightly.pl
new file mode 100755
index 0000000..ee05abf
--- /dev/null
+++ b/nightly.pl
@@ -0,0 +1,90 @@
+#!/usr/bin/env perl
+#
+#  Copyright 2011-12 ARM Limited
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+#
+# NE10 Library : nightly.pl
+#
+
+use warnings;
+use strict;
+
+# other variables
+my $iterations = 2000; # how many iterations each test must go through ?
+my $files_list=`cat projectfile | tr '\n' ';'`; #`find . -maxdepth 1 -type f -iname *.prj | grep './'`; # units to be built are listed in "projectfile"
+my @files = grep { length } split( /;/, $files_list ); # units to be built; empty entries (blank lines in "projectfile") are dropped
+my @built; # list of successfully built units
+my @failed; # list of units that failed to build
+my @warn; # list of units that did build but returned with an error message or too many warnings
+
+my $units_count = 0;
+my $units_succeeded = 0;
+my $success_percentage = 0;
+
+# build every listed unit with "make NE10_<unit>.test_r.ex" and sort it into @built or @failed
+foreach my $fl (@files) {
+    $units_count ++;
+    #print "<".$fl.">\n"; # debug print
+    my $make_cmd = "make NE10_$fl.test_r.ex";
+    system ( $make_cmd );
+    if ( $? != 0 )
+    {
+      # failed to build (any non-zero wait status from make)
+      push(@failed, $fl);
+    }
+    else
+    {
+       # built successfully...
+       push(@built, $fl);
+       $units_succeeded ++;
+    }
+
+}
+
+$success_percentage = $units_count ? 100 * $units_succeeded / $units_count : 0; # no units at all would otherwise be a division by zero
+
+
+#get a test log to be stored in the "test_index_tbl"
+system ( "./getlog.sh > ./testlog.txt" );
+my $platform = `echo \$NE10PLATFORM`;
+my $syslog = `cat ./testlog.txt`;
+my $testlog = ""; # this will keep the summary text that will be stored in the database
+my $ACCEPTABLE_WARNS = 10; # note: this is defined in unit_test_common.h
+
+# try and run perf on the successfully built units
+# ($? is a wait status: -1 = could not launch, low 7 bits = terminating signal, $? >> 8 = exit code)
+foreach my $success (@built)
+{
+    my $perf_cmd = "./runperf.sh NE10_$success $iterations";
+    system ( $perf_cmd );
+    if ( ($? < 0) || ($? & 127) || (($? >> 8) > $ACCEPTABLE_WARNS) )
+    {
+      # could not launch, died from a signal, or exit code above the accepted number of warnings
+      push(@warn, $success);
+    }
+}
+
+
+ # loop through successfully built units
+ foreach my $unit (@built)
+ {
+  # read all STDOUT text files (test outputs) and STDERR files (perflogs) of this unit; the strings are not used any further yet
+  my $stdout_str = `cat res_std_NE10_${unit}_*_$iterations.txt`;
+  my $stderr_str = `cat res_err_NE10_${unit}_*_$iterations.txt`;
+ }
+
+ # print out a summary of this run; nothing assembles $testlog yet, so print an empty string rather than undef (a warning under "use warnings")
+ print ( defined($testlog) ? $testlog : "" );
diff --git a/projectfile b/projectfile
new file mode 100644
index 0000000..a63bec3
--- /dev/null
+++ b/projectfile
@@ -0,0 +1,15 @@
+addc
+subc
+rsbc
+mulc
+divc
+mlac
+setc
+add
+sub
+mul
+div
+mla
+abs
+len
+normalize
diff --git a/removetabs.sh b/removetabs.sh
new file mode 100755
index 0000000..b1cb022
--- /dev/null
+++ b/removetabs.sh
@@ -0,0 +1,55 @@
+#!/bin/sh
+#
+#  Copyright 2011-12 ARM Limited
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+#
+# NE10 Library : removetabs.sh
+#
+#!/bin/bash
+
+# This script removes tab characters in files and replaces them with
+#  the right number of spaces. It also removes trailing whitespaces.
+
+# remove trailing whitespaces
+LSw=`grep -lsri --exclude="Makefile" --exclude-dir=".git" '\s$' .`;
+for flw in $LSw
+do
+    echo "HAS SPACES: " $flw; # just to see a list of the files that include unwanted tabs
+    perms=`stat -c '%a' $flw`;
+    sed 's/[ \t]*$//gi' $flw > .exp.tmp;
+    sync;
+    # rename the file to the original file
+    mv .exp.tmp $flw;
+    chmod $perms $flw;
+    sync;
+done
+
+# remove tabs
+chtab=$'\t'; # only works in bash but not in sh
+LSt=`grep -lrsi --exclude="Makefile" --exclude-dir=".git" "$chtab" .`;
+for flt in $LSt
+do
+    echo "HAS TABS: " $flt; # just to see a list of the files that include unwanted tabs
+    perms=`stat -c '%a' $flt`;
+    # remove tabs
+    expand $flt > .exp.tmp;
+    sync;
+    # rename the file to the original file
+    mv .exp.tmp $flt;
+    chmod $perms $flt;
+    sync;
+done
+
diff --git a/review.sh b/review.sh
new file mode 100755
index 0000000..7ccb021
--- /dev/null
+++ b/review.sh
@@ -0,0 +1,45 @@
+#!/bin/sh
+#
+#  Copyright 2011-12 ARM Limited
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+#
+# NE10 Library : review.sh
+#
+
+BRANCH=$1                 # branch to review, expected in the form dev/<user>/<label>
+BASE=${2-"master"}        # parent branch; "master" when the second argument is omitted
+
+if [ "$BRANCH" = "" ]; then
+  echo "Usage: review.sh <branch to review> [parent branch]"
+  exit 1                  # a usage error must not look like success to the caller
+fi
+
+# split dev/<user>/<label> into <label> and <user>; both come back empty when the name does not match
+LABEL=`echo "$BRANCH" | perl -pe '$_ =~ /dev\/([a-zA-Z0-9]+)\/(.+)/;$_=$2'`
+GLUSER=`echo "$BRANCH" | perl -pe '$_ =~ /dev\/([a-zA-Z0-9]+)\/(.+)/;$_=$1'`
+if [ -z "$LABEL" ] || [ -z "$GLUSER" ]; then
+  echo "review.sh: branch must be named dev/<user>/<label>" >&2; exit 1
+fi
+NEWBRANCH="staging/$GLUSER/$LABEL"
+
+echo "Pushing $BRANCH from $BASE for review as $NEWBRANCH"
+
+# create the staging branch from BASE, publish it, rebase it onto BRANCH, publish again; stop at the first git failure
+git branch "$NEWBRANCH" "$BASE" &&
+  git push origin "$NEWBRANCH" &&
+  git checkout "$NEWBRANCH" &&
+  git rebase "$BRANCH" && git push origin "$NEWBRANCH"
+
diff --git a/runperf.sh b/runperf.sh
new file mode 100755
index 0000000..a554d55
--- /dev/null
+++ b/runperf.sh
@@ -0,0 +1,66 @@
+#!/bin/sh
+#
+#  Copyright 2011-12 ARM Limited
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+#
+# NE10 Library : runperf.sh
+#
+#stty echo
+#clear
+# NOTE: the following string comparisons differ between BASH and SH
+if [ ! -n "$1" ]; then exit; fi
+if [ ! -n "$2" ]; then exit; fi
+if [ ! -e "./$1.test_r.ex" ]; then exit; fi
+./$1.test_r.ex 0
+OP_COUNT=$?
+IMPL_COUNT=3
+ITERATIONS=$2
+PERF_CMD="perf stat -e cycles,instructions,cache-references,cache-misses,branches,branch-misses,bus-cycles,cpu-clock,task-clock,faults,minor-faults,major-faults,context-switches,migrations,alignment-faults,emulation-faults -x,"
+#stty echo
+rm res_*_$1_*.txt
+# One pass per operation index 1..OP_COUNT.
+for op in $(seq $OP_COUNT)
+do
+  # Implementation index 0 goes first; its output is left on the console.
+  ./$1.test_r.ex $op 0 $ITERATIONS
+  RET=$?
+  if [ "$RET" -eq "10" ]; then
+     # Status 10 ends the whole script, right after the usual report.
+     echo " SEND MAIL ~~ ERROR: Unit [$1] operation [$op] has returned with error code $RET...";
+     exit $RET;
+  elif [ "$RET" -ne "0" ]; then
+     # Any other failure is reported only: one mismatching operation does not
+     # mean the others fail too, so the individual implementations still run.
+     echo " SEND MAIL ~~ ERROR: Unit [$1] operation [$op] has returned with error code $RET...";
+  fi
+  for impl in $(seq $IMPL_COUNT)
+  do
+     # Silent trial run: only its exit status matters.
+     ./$1.test_r.ex $op $impl $ITERATIONS 1>/dev/null 2>/dev/null
+     RET=$?
+     if [ "$RET" -ne "0" ]; then
+        echo "ERROR;./$1.test_r.ex $op $impl $ITERATIONS $RET"
+        exit $RET;
+     fi
+     # Each result file begins with its own name; the perf-wrapped run then
+     # appends its stdout and stderr to the matching file.
+     STDOUT_FILE="res_std_"$1_$op"_"$impl"_"$ITERATIONS".txt";
+     STDERR_FILE="res_err_"$1_$op"_"$impl"_"$ITERATIONS".txt";
+     echo "$STDOUT_FILE" > $STDOUT_FILE;
+     echo "$STDERR_FILE" > $STDERR_FILE;
+     $PERF_CMD ./$1.test_r.ex $op $impl $ITERATIONS 1>>$STDOUT_FILE 2>>$STDERR_FILE;
+  done
+done
diff --git a/source/NE10_abs.asm.s b/source/NE10_abs.asm.s
new file mode 100644
index 0000000..43a79ee
--- /dev/null
+++ b/source/NE10_abs.asm.s
@@ -0,0 +1,61 @@
+@
+@  Copyright 2011-12 ARM Limited
+@
+@  Licensed under the Apache License, Version 2.0 (the "License");
+@  you may not use this file except in compliance with the License.
+@  You may obtain a copy of the License at
+@
+@      http://www.apache.org/licenses/LICENSE-2.0
+@
+@  Unless required by applicable law or agreed to in writing, software
+@  distributed under the License is distributed on an "AS IS" BASIS,
+@  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@  See the License for the specific language governing permissions and
+@  limitations under the License.
+@
+
+@
+@ NE10 Library : source/NE10_abs.asm.s
+@
+
+        .text
+        .syntax   unified
+
+.include "headers/NE10header.s"
+
+        .balign   4
+        .global   abs_float_asm
+        .thumb
+        .thumb_func
+
+abs_float_asm:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ arm_result_t abs_float_asm(arm_float_t * dst,
+        @                 arm_float_t * src,
+        @                 unsigned int count)
+        @
+        @  r0: *dst, advanced by 4 bytes per item
+        @  r1: *src, advanced by 4 bytes per item
+        @  r2: int count
+        @
+        @  r2: loop counter (number of items still to process)
+        @
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        cbz     r2, .LoopEndFloat       @ count == 0: nothing to process
+        @ no zero constant is set up here any more: vabs.f32 takes a single
+        @ operand, and the value once placed in s2 was never read
+
+.LoopBeginFloat:
+        vldr      s1, [r1]                @ Load s1 = src[i]
+        add       r1, r1, #4              @ move to the next item
+        vabs.f32  s1, s1                  @ get the absolute value; s1 = abs(s1)
+        vstr      s1, [r0]                @ Store it back into the main memory; dst[i] = s1
+        add       r0, r0, #4              @ move to the next entry
+        subs      r2, r2, #1              @ one item fewer left; sets the flags for the branch
+        bne        .LoopBeginFloat        @ Continue while items remain (r2 != 0)
+
+.LoopEndFloat:
+        mov     r0, NE10_OK             @ Return NE10_OK
+        bx      lr
diff --git a/source/NE10_abs.c b/source/NE10_abs.c
new file mode 100644
index 0000000..6e76378
--- /dev/null
+++ b/source/NE10_abs.c
@@ -0,0 +1,65 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : source/NE10_abs.c
+ */
+
+#include "NE10.h"
+#include "../headers/macros.h"
+
+#include <assert.h>
+
+#include <math.h>
+/* Stores |src[itr]| in dst[itr]. NOTE(review): the loop over itr and the return value presumably come from NE10_ABS_OPERATION_X_C (headers/macros.h, not visible here) - confirm. */
+arm_result_t abs_float_c(arm_float_t * dst, arm_float_t * src, unsigned int count)
+{
+  NE10_ABS_OPERATION_X_C
+  (
+    dst[itr] = fabsf( src[itr] ); /* fabsf stays in single precision; fabs would round-trip through double */
+  );
+}
+/* Stores the component-wise absolute value of src[itr] in dst[itr]. NOTE(review): the loop over itr and the return value presumably come from NE10_ABS_OPERATION_X_C (headers/macros.h, not visible here) - confirm. */
+arm_result_t abs_vec2f_c(arm_vec2f_t * dst, arm_vec2f_t * src, unsigned int count)
+{
+  NE10_ABS_OPERATION_X_C
+  (
+    dst[ itr ].x = fabsf( src[ itr ].x ); /* fabsf stays in single precision; fabs would round-trip through double */
+    dst[ itr ].y = fabsf( src[ itr ].y );
+  );
+}
+/* Stores the component-wise absolute value of src[itr] in dst[itr]. NOTE(review): the loop over itr and the return value presumably come from NE10_ABS_OPERATION_X_C (headers/macros.h, not visible here) - confirm. */
+arm_result_t abs_vec3f_c(arm_vec3f_t * dst, arm_vec3f_t * src, unsigned int count)
+{
+  NE10_ABS_OPERATION_X_C
+  (
+    dst[ itr ].x = fabsf( src[ itr ].x ); /* fabsf stays in single precision; fabs would round-trip through double */
+    dst[ itr ].y = fabsf( src[ itr ].y );
+    dst[ itr ].z = fabsf( src[ itr ].z );
+  );
+}
+/* Stores the component-wise absolute value of src[itr] in dst[itr]. NOTE(review): the loop over itr and the return value presumably come from NE10_ABS_OPERATION_X_C (headers/macros.h, not visible here) - confirm. */
+arm_result_t abs_vec4f_c(arm_vec4f_t * dst, arm_vec4f_t * src, unsigned int count)
+{
+  NE10_ABS_OPERATION_X_C
+  (
+    dst[ itr ].x = fabsf( src[ itr ].x ); /* fabsf stays in single precision; fabs would round-trip through double */
+    dst[ itr ].y = fabsf( src[ itr ].y );
+    dst[ itr ].z = fabsf( src[ itr ].z );
+    dst[ itr ].w = fabsf( src[ itr ].w );
+  );
+}
+
diff --git a/source/NE10_abs.neon.s b/source/NE10_abs.neon.s
new file mode 100644
index 0000000..d8628bd
--- /dev/null
+++ b/source/NE10_abs.neon.s
@@ -0,0 +1,419 @@
+@
+@  Copyright 2011-12 ARM Limited
+@
+@  Licensed under the Apache License, Version 2.0 (the "License");
+@  you may not use this file except in compliance with the License.
+@  You may obtain a copy of the License at
+@
+@      http://www.apache.org/licenses/LICENSE-2.0
+@
+@  Unless required by applicable law or agreed to in writing, software
+@  distributed under the License is distributed on an "AS IS" BASIS,
+@  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@  See the License for the specific language governing permissions and
+@  limitations under the License.
+@
+
+@
+@ NE10 Library : source/NE10_abs.neon.s
+@
+
+        .text
+        .syntax   unified
+
+.include "headers/NE10header.s"
+
+
+
+
+        .balign   4
+        .global   abs_float_neon
+        .thumb
+        .thumb_func
+
+abs_float_neon:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ arm_result_t abs_float_neon(arm_float_t * dst,
+        @                 arm_float_t * src,
+        @                 unsigned int count);
+        @
+        @  Stores the absolute value of each of the count floats at src into
+        @  dst and returns 0.
+        @
+        @  r0: *dst & the current dst entry's address
+        @  r1: *src & current src entry's address
+        @  r2: int count & the number of floats the main loop still has to
+        @                   load, in chunks of 4 floats
+        @  r3: the number of items that are left to be processed at the end of
+        @                   the input array
+        @
+        @  NEON registers used: q0 (input) and q3 (result).
+        @  A chunk is only loaded after r2 has shown that it exists, so no
+        @  access goes beyond count elements of src or dst.
+        @
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        and               r3, r2, #3          @ r3 = count % 4;
+        sub               r2, r2, r3          @ r2 = count - r3; this is what the main loop processes
+
+        cbz               r2, .L_check_float  @ fewer than 4 floats: tail loop only
+
+        @ load the 1st set of values
+          vld1.32         {q0}, [r1]!
+          subs            r2, r2, #4
+
+        @ absolute values of the 1st set
+          vabs.f32        q3, q0         @ q3 = abs( q0 )
+
+        @ With exactly one set there is nothing more to load: fetching a 2nd
+        @ set unconditionally would read past the end of src and later store
+        @ past the end of dst.
+          ble             .L_mainloopend_float
+
+.L_mainloop_float:
+        @ store the result for the previous set
+          vst1.32         {d6,d7}, [r0]!
+
+        @ load the next set of values
+          vld1.32         {q0}, [r1]!
+          subs            r2, r2, #4
+
+        @ absolute values of the set just loaded; vabs leaves the flags
+        @ set by subs untouched
+          vabs.f32        q3, q0         @ q3 = abs( q0 )
+
+          bgt             .L_mainloop_float   @ loop while at least another 4 floats are left to load
+
+.L_mainloopend_float:
+        @ store the result for the last set
+          vst1.32         {d6,d7}, [r0]!
+
+.L_check_float:
+     @ check if anything left to process at the end of the input array
+        cmp               r3, #0
+        ble               .L_return_float
+
+.L_secondloop_float:
+     @ process the last few items left in the input array, one float at a time
+        vld1.f32          d0[0], [r1]!           @ d0[0] = next float; lane 1 is never stored
+
+        subs              r3, r3, #1             @ sets the flags tested by bgt below
+
+        @ absolute values
+        vabs.f32          d0, d0
+
+        vst1.32           {d0[0]}, [r0]!
+
+        bgt               .L_secondloop_float
+
+.L_return_float:
+     @ return
+        mov               r0, #0
+        bx                lr
+
+
+
+
+        .balign   4
+        .global   abs_vec2f_neon
+        .thumb
+        .thumb_func
+
+abs_vec2f_neon:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ arm_result_t abs_vec2f_neon(arm_vec2f_t * dst,
+        @                 arm_vec2f_t * src,
+        @                 unsigned int count);
+        @
+        @  Stores the component-wise absolute value of each of the count 2D
+        @  vectors at src into dst and returns 0.
+        @
+        @  r0: *dst & the current dst entry's address
+        @  r1: *src & current src entry's address
+        @  r2: int count & the number of vectors the main loop still has to
+        @                   load, in chunks of 4 vectors
+        @  r3: the number of items that are left to be processed at the end of
+        @                   the input array
+        @
+        @  NEON registers used: q0-q1 (input) and q2-q3 (results); q4-q7 are
+        @  avoided because the AAPCS requires a callee to preserve them.
+        @  A chunk is only loaded after r2 has shown that it exists, so no
+        @  access goes beyond count elements of src or dst.
+        @
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        and               r3, r2, #3          @ r3 = count % 4;
+        sub               r2, r2, r3          @ r2 = count - r3; this is what the main loop processes
+
+        cbz               r2, .L_check_vec2   @ fewer than 4 vectors: tail loop only
+
+        @ load the 1st set of values
+          vld2.32         {q0-q1}, [r1]!
+          subs            r2, r2, #4
+
+        @ absolute values of the 1st set
+          vabs.f32        q2, q0         @ q2 = abs( q0 )
+          vabs.f32        q3, q1         @ q3 = abs( q1 )
+
+        @ With exactly one set there is nothing more to load: fetching a 2nd
+        @ set unconditionally would read past the end of src and later store
+        @ past the end of dst.
+          ble             .L_mainloopend_vec2
+
+.L_mainloop_vec2:
+        @ store the result for the previous set
+          vst2.32         {d4,d5,d6,d7}, [r0]!
+
+        @ load the next set of values
+          vld2.32         {q0-q1}, [r1]!
+          subs            r2, r2, #4
+
+        @ absolute values of the set just loaded; vabs leaves the flags
+        @ set by subs untouched
+          vabs.f32        q2, q0         @ q2 = abs( q0 )
+          vabs.f32        q3, q1         @ q3 = abs( q1 )
+
+          bgt             .L_mainloop_vec2    @ loop while at least another 4 vectors (8 floats) are left to load
+
+.L_mainloopend_vec2:
+        @ store the result for the last set
+          vst2.32         {d4,d5,d6,d7}, [r0]!
+
+.L_check_vec2:
+     @ check if anything left to process at the end of the input array
+        cmp               r3, #0
+        ble               .L_return_vec2
+
+.L_secondloop_vec2:
+     @ process the last few items left in the input array, one vector at a time
+        vld1.f32          d0, [r1]!           @ Fill in d0 = { V.x, V.y };
+
+        subs              r3, r3, #1          @ sets the flags tested by bgt below
+
+        @ absolute values
+        vabs.f32          d0, d0
+
+        vst1.32           {d0}, [r0]!
+
+        bgt               .L_secondloop_vec2
+
+.L_return_vec2:
+     @ return
+        mov               r0, #0
+        bx                lr
+
+
+
+
+        .align  2
+        .global abs_vec3f_neon
+        .thumb
+        .thumb_func
+abs_vec3f_neon:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ arm_result_t abs_vec3f_neon(arm_vec3f_t * dst,
+        @                 arm_vec3f_t * src,
+        @                 unsigned int count);
+        @
+        @  Stores the component-wise absolute value of each of the count 3D
+        @  vectors at src into dst and returns 0.
+        @
+        @  r0: *dst & the current dst entry's address
+        @  r1: *src & current src entry's address
+        @  r2: int count & the number of vectors the main loop still has to
+        @                   load, in chunks of 4 vectors
+        @  r3: the number of items that are left to be processed at the end of
+        @                   the input array
+        @
+        @  NEON registers used: q0-q2 (input) and q8-q10 (results); q4-q7 are
+        @  avoided because the AAPCS requires a callee to preserve them.
+        @  A chunk is only loaded after r2 has shown that it exists, so no
+        @  access goes beyond count elements of src or dst.
+        @
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        and               r3, r2, #3          @ r3 = count % 4;
+        sub               r2, r2, r3          @ r2 = count - r3; this is what the main loop processes
+
+        cmp               r2, #0
+        beq               .L_check_vec3       @ fewer than 4 vectors: tail loop only
+
+        @ load the 1st set of values; the two vld3 de-interleave 4 vectors
+        @ into q0 = x, q1 = y, q2 = z
+          vld3.32         {d0, d2, d4}, [r1]!
+          vld3.32         {d1, d3, d5}, [r1]!
+          subs            r2, r2, #4
+
+        @ absolute values of the 1st set
+          vabs.f32        q8, q0
+          vabs.f32        q9, q1
+          vabs.f32        q10, q2
+
+        @ With exactly one set there is nothing more to load: fetching a 2nd
+        @ set unconditionally would read past the end of src and later store
+        @ past the end of dst.
+          ble             .L_mainloopend_vec3
+
+.L_mainloop_vec3:
+        @ store the result for the previous set
+          vst3.32         {d16, d18, d20}, [r0]!
+          vst3.32         {d17, d19, d21}, [r0]!
+
+        @ load the next set of values
+          vld3.32         {d0, d2, d4}, [r1]!
+          vld3.32         {d1, d3, d5}, [r1]!
+          subs            r2, r2, #4
+
+        @ absolute values of the set just loaded; vabs leaves the flags
+        @ set by subs untouched
+          vabs.f32        q8, q0
+          vabs.f32        q9, q1
+          vabs.f32        q10, q2
+
+          bgt             .L_mainloop_vec3    @ loop while at least another 4 vectors (12 floats) are left to load
+
+.L_mainloopend_vec3:
+        @ store the result for the last set
+          vst3.32         {d16, d18, d20}, [r0]!
+          vst3.32         {d17, d19, d21}, [r0]!
+
+.L_check_vec3:
+     @ check if anything left to process at the end of the input array
+        cmp               r3, #0
+        ble               .L_return_vec3
+
+.L_secondloop_vec3:
+     @ process the last few items left in the input array, one vector at a time
+        vld3.f32          {d0[0], d2[0], d4[0]}, [r1]!     @ The values are loaded like so:
+                                                           @      d0 = { V.x, - };
+                                                           @      d2 = { V.y, - };
+                                                           @      d4 = { V.z, - };
+        subs              r3, r3, #1
+
+        @ absolute values of d0, d2 and d4, the registers written by the
+        @ load above and read by the store below (d1 does not hold any
+        @ component of this vector)
+        vabs.f32          d0, d0
+        vabs.f32          d2, d2
+        vabs.f32          d4, d4
+
+        vst3.32           {d0[0], d2[0], d4[0]}, [r0]!
+
+        bgt               .L_secondloop_vec3
+
+.L_return_vec3:
+     @ return
+        mov               r0, #0
+        bx                lr
+
+
+
+
+        .align  2
+        .global abs_vec4f_neon
+        .thumb
+        .thumb_func
+abs_vec4f_neon:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ arm_result_t abs_vec4f_neon(arm_vec4f_t * dst,
+        @                 arm_vec4f_t * src,
+        @                 unsigned int count);
+        @
+        @  Stores the component-wise absolute value of each of the count 4D
+        @  vectors at src into dst and returns 0.
+        @
+        @  r0: *dst & the current dst entry's address
+        @  r1: *src & current src entry's address
+        @  r2: int count & the number of vectors the main loop still has to
+        @                   load, in chunks of 4 vectors
+        @  r3: the number of items that are left to be processed at the end of
+        @                   the input array
+        @
+        @  NEON registers used: q0-q3 (input) and q10-q13 (results), all of
+        @  which a callee may overwrite under the AAPCS.
+        @  A chunk is only loaded after r2 has shown that it exists, so no
+        @  access goes beyond count elements of src or dst.
+        @
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        and               r3, r2, #3          @ r3 = count % 4;
+        sub               r2, r2, r3          @ r2 = count - r3; this is what the main loop processes
+
+        cmp               r2, #0
+        beq               .L_check_vec4       @ fewer than 4 vectors: tail loop only
+
+        @ load the 1st set of values; the two vld4 de-interleave 4 vectors
+        @ into q0 = x, q1 = y, q2 = z, q3 = w
+          vld4.32         {d0, d2, d4, d6}, [r1]!
+          vld4.32         {d1, d3, d5, d7}, [r1]!
+          subs            r2, r2, #4
+
+        @ absolute values of the 1st set
+          vabs.f32        q10, q0
+          vabs.f32        q11, q1
+          vabs.f32        q12, q2
+          vabs.f32        q13, q3
+
+        @ With exactly one set there is nothing more to load: fetching a 2nd
+        @ set unconditionally would read past the end of src and later store
+        @ past the end of dst.
+          ble             .L_mainloopend_vec4
+
+.L_mainloop_vec4:
+        @ store the result for the previous set
+          vst4.32         {d20, d22, d24, d26}, [r0]!
+          vst4.32         {d21, d23, d25, d27}, [r0]!
+
+        @ load the next set of values
+          vld4.32         {d0, d2, d4, d6}, [r1]!
+          vld4.32         {d1, d3, d5, d7}, [r1]!
+          subs            r2, r2, #4
+
+        @ absolute values of the set just loaded; vabs leaves the flags
+        @ set by subs untouched
+          vabs.f32        q10, q0
+          vabs.f32        q11, q1
+          vabs.f32        q12, q2
+          vabs.f32        q13, q3
+
+          bgt             .L_mainloop_vec4    @ loop while at least another 4 vectors (16 floats) are left to load
+
+.L_mainloopend_vec4:
+        @ store the result for the last set
+          vst4.32         {d20, d22, d24, d26}, [r0]!
+          vst4.32         {d21, d23, d25, d27}, [r0]!
+
+.L_check_vec4:
+     @ check if anything left to process at the end of the input array
+        cmp               r3, #0
+        ble               .L_return_vec4
+
+.L_secondloop_vec4:
+     @ process the last few items left in the input array, one vector at a time
+        vld4.f32          {d0[0], d2[0], d4[0], d6[0]}, [r1]!     @ The values are loaded like so:
+                                                                  @      d0 = { V.x, - };
+                                                                  @      d2 = { V.y, - };
+                                                                  @      d4 = { V.z, - };
+                                                                  @      d6 = { V.w, - };
+        subs              r3, r3, #1
+
+        @ absolute values of d0, d2, d4 and d6, the registers written by the
+        @ load above and read by the store below (d1 and d3 do not hold any
+        @ component of this vector)
+        vabs.f32          d0, d0
+        vabs.f32          d2, d2
+        vabs.f32          d4, d4
+        vabs.f32          d6, d6
+
+        vst4.32          {d0[0], d2[0], d4[0], d6[0]}, [r0]!
+
+        bgt               .L_secondloop_vec4
+
+.L_return_vec4:
+     @ return
+        mov               r0, #0
+        bx                lr
diff --git a/source/NE10_abs_test.c b/source/NE10_abs_test.c
new file mode 100644
index 0000000..e5f0d7b
--- /dev/null
+++ b/source/NE10_abs_test.c
@@ -0,0 +1,73 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : source/NE10_abs_test.c
+ */
+
+//Make sure the following are defined before including "unit_test.h"
+
+// length of the data arrays
+#define ARRLEN TEST_ARRLEN
+// number of the operations in a given unit
+#define OP_COUNT 4
+// number of the different implementations of each of the functions (C, ASM, NEON, ...)
+#define IMPL_COUNT 3
+
+
+#include "../headers/unit_test_abs_operation_x.h"
+
+/* Implementations under test. Every routine takes ( dst, src, count ) and returns an */
+/* arm_result_t. The assembly (abs_*_asm) versions haven't been implemented, so only the */
+/* C and NEON routines are declared; functions have external linkage without `extern`. */
+
+arm_result_t abs_float_c   (arm_float_t * dst, arm_float_t * src, unsigned int count);
+arm_result_t abs_float_neon(arm_float_t * dst, arm_float_t * src, unsigned int count);
+
+arm_result_t abs_vec2f_c   (arm_vec2f_t * dst, arm_vec2f_t * src, unsigned int count);
+arm_result_t abs_vec2f_neon(arm_vec2f_t * dst, arm_vec2f_t * src, unsigned int count);
+
+arm_result_t abs_vec3f_c   (arm_vec3f_t * dst, arm_vec3f_t * src, unsigned int count);
+arm_result_t abs_vec3f_neon(arm_vec3f_t * dst, arm_vec3f_t * src, unsigned int count);
+
+arm_result_t abs_vec4f_c   (arm_vec4f_t * dst, arm_vec4f_t * src, unsigned int count);
+arm_result_t abs_vec4f_neon(arm_vec4f_t * dst, arm_vec4f_t * src, unsigned int count);
+
+/* Populate the global function table ftbl with the abs implementations under test. */
+/* Each operation (float, vec2f, vec3f, vec4f) owns IMPL_COUNT consecutive slots in the */
+/* order C, assembly, NEON. There is no assembly abs yet, so the C routine fills that slot. */
+void init_ftbl()
+{
+   const arm_func_3args_t impls[] = {
+      /* C version,                     assembly stand-in (C version),   NEON version */
+      (arm_func_3args_t) abs_float_c, (arm_func_3args_t) abs_float_c, (arm_func_3args_t) abs_float_neon,
+      (arm_func_3args_t) abs_vec2f_c, (arm_func_3args_t) abs_vec2f_c, (arm_func_3args_t) abs_vec2f_neon,
+      (arm_func_3args_t) abs_vec3f_c, (arm_func_3args_t) abs_vec3f_c, (arm_func_3args_t) abs_vec3f_neon,
+      (arm_func_3args_t) abs_vec4f_c, (arm_func_3args_t) abs_vec4f_c, (arm_func_3args_t) abs_vec4f_neon,
+   };
+   unsigned int slot;
+
+   // copy the table entry by entry; slot runs over all OP_COUNT * IMPL_COUNT (= 12) entries
+   for ( slot = 0; slot < sizeof impls / sizeof impls[0]; slot++ )
+   {
+      ftbl[ slot ] = impls[ slot ];
+   }
+}
+
+/* Entry point of the abs test binary: passes the command line to run_test() and returns its result. */
+arm_result_t main( int argc, char **argv ) {
+   return run_test( argc, argv ); // run_test is not defined in this file; it comes in via the unit-test header
+}
diff --git a/source/NE10_add.asm.s b/source/NE10_add.asm.s
new file mode 100644
index 0000000..fe4a04e
--- /dev/null
+++ b/source/NE10_add.asm.s
@@ -0,0 +1,61 @@
+@
+@  Copyright 2011-12 ARM Limited
+@
+@  Licensed under the Apache License, Version 2.0 (the "License");
+@  you may not use this file except in compliance with the License.
+@  You may obtain a copy of the License at
+@
+@      http://www.apache.org/licenses/LICENSE-2.0
+@
+@  Unless required by applicable law or agreed to in writing, software
+@  distributed under the License is distributed on an "AS IS" BASIS,
+@  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@  See the License for the specific language governing permissions and
+@  limitations under the License.
+@
+
+@
+@ NE10 Library : source/NE10_add.asm.s
+@
+
+        .text
+        .syntax   unified
+
+.include "headers/NE10header.s"
+
+        .balign   4
+        .global   add_float_asm
+        .thumb
+        .thumb_func
+
+add_float_asm:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ arm_result_t add_float_asm(arm_float_t * dst,
+        @                 arm_float_t * src1, arm_float_t * src2,
+        @                 unsigned int count)
+        @
+        @  Stores src1[i] + src2[i] into dst[i] for count floats, one float
+        @  per pass through the loop, and returns NE10_OK.
+        @
+        @  r0: *dst  & the current dst entry's address
+        @  r1: *src1 & the current src1 entry's address
+        @  r2: *src2 & the current src2 entry's address
+        @  r3: int count & the number of entries still to be processed
+        @
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        cbz     r3, .LoopEndFloat         @ count == 0: nothing to add
+
+.LoopBeginFloat:
+        vldmia    r1!, {s1}               @ s1 = *src1, then step src1 to the next entry
+        vldmia    r2!, {s2}               @ s2 = *src2, then step src2 to the next entry
+        @ both operands are loaded before anything is written to dst
+        vadd.f32  s10, s1, s2             @ s10 = src1[i] + src2[i]
+        vstmia    r0!, {s10}              @ *dst = s10, then step dst to the next entry
+        subs      r3, r3, #1              @ one entry fewer to go
+        bne        .LoopBeginFloat        @ Continue until the counter reaches zero
+
+.LoopEndFloat:
+        mov     r0, NE10_OK             @ Return NE10_OK
+        bx      lr
diff --git a/source/NE10_add.c b/source/NE10_add.c
new file mode 100644
index 0000000..3ec84a1
--- /dev/null
+++ b/source/NE10_add.c
@@ -0,0 +1,32 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : source/NE10_add.c
+ */
+
+#include "NE10.h"
+#include "../headers/macros.h"
+
+#include <assert.h>
+
+/* Float array sum, dst[itr] = src1[itr] + src2[itr]; NE10_X_OPERATION_FLOAT_C wraps the statement. */
+arm_result_t add_float_c(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count)
+{
+  NE10_X_OPERATION_FLOAT_C( /* defined outside this file; itr is not declared here, so the macro must provide it */
+    dst[ itr ] = src1[ itr ] + src2[ itr ];
+  );
+}
diff --git a/source/NE10_add.neon.c b/source/NE10_add.neon.c
new file mode 100644
index 0000000..d2d2d93
--- /dev/null
+++ b/source/NE10_add.neon.c
@@ -0,0 +1,35 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : source/NE10_add.neon.c
+ */
+
+#include "NE10.h"
+#include "../headers/macros.h"
+
+#include <assert.h>
+#include <arm_neon.h>
+
+/* Float array sum using NEON intrinsics; NE10_X_OPERATION_FLOAT_NEON receives the two statements below. */
+arm_result_t add_float_neon(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count)
+{
+    NE10_X_OPERATION_FLOAT_NEON( /* defined outside this file; the n_* names are not declared here, so the macro must provide them */
+        n_dst = vaddq_f32( n_src , n_src2 );              /* 1st argument: add on 128-bit (4 x float) vectors */
+        ,
+        n_tmp_src = vadd_f32( n_tmp_src, n_tmp_src2 );    /* 2nd argument: add on 64-bit (2 x float) vectors */
+    );
+}
diff --git a/source/NE10_add_test.c b/source/NE10_add_test.c
new file mode 100644
index 0000000..592242d
--- /dev/null
+++ b/source/NE10_add_test.c
@@ -0,0 +1,45 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : source/NE10_add_test.c
+ */
+
+//Make sure the following are defined before including "unit_test.h"
+
+// length of the data arrays
+#define ARRLEN TEST_ARRLEN
+// number of the operations in a given unit
+#define OP_COUNT 1
+// number of the different implementations of each of the functions (C, ASM, NEON, ...)
+#define IMPL_COUNT 3
+
+
+#include "../headers/unit_test_x_operation_x.h"
+
+/* Fill the global function table ftbl with the add_float implementations, in the order C, assembly, NEON. */
+void init_ftbl()
+{
+   const arm_func_4args_t impls[] = {
+      (arm_func_4args_t) add_float_c, (arm_func_4args_t) add_float_asm, (arm_func_4args_t) add_float_neon };
+   unsigned int slot;
+   for ( slot = 0; slot < sizeof impls / sizeof impls[0]; slot++ ) { ftbl[ slot ] = impls[ slot ]; }
+}
+
+/* Entry point of the add test binary: passes the command line to run_test() and returns its result. */
+arm_result_t main( int argc, char **argv ) {
+   return run_test( argc, argv ); // run_test is not defined in this file; it comes in via the unit-test header
+}
diff --git a/source/NE10_addc.asm.s b/source/NE10_addc.asm.s
new file mode 100644
index 0000000..8ebdd1c
--- /dev/null
+++ b/source/NE10_addc.asm.s
@@ -0,0 +1,234 @@
+@
+@  Copyright 2011-12 ARM Limited
+@
+@  Licensed under the Apache License, Version 2.0 (the "License");
+@  you may not use this file except in compliance with the License.
+@  You may obtain a copy of the License at
+@
+@      http://www.apache.org/licenses/LICENSE-2.0
+@
+@  Unless required by applicable law or agreed to in writing, software
+@  distributed under the License is distributed on an "AS IS" BASIS,
+@  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@  See the License for the specific language governing permissions and
+@  limitations under the License.
+@
+
+@
+@ NE10 Library : source/NE10_addc.asm.s
+@
+
+        .text
+        .syntax   unified
+
+.include "headers/NE10header.s"
+
+        .balign   4
+        .global   addc_float_asm
+        .thumb
+        .thumb_func
+
+addc_float_asm:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ arm_result_t addc_float_asm(arm_float_t * dst,
+        @                 arm_float_t * src, const arm_float_t cst,
+        @                 unsigned int count)
+        @
+        @  r0: *dst
+        @  r1: *src
+        @  r2: cst by value (the float's bits in a core register; this assumes a soft-float calling convention)
+        @  r3: int count, counted down to zero as the loop counter
+        @
+        @  r5: current item's offset in both src[] and dst[]
+        @  r6: current source item's address made of base(r1)+offset(r5)
+        @  r7: current destination item's address made of base(r0)+offset(r5)
+        @  s3: cst, moved out of r2 once before the loop
+        @
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        push    {r4, r5, r6, r7}
+        cbz     r3, .LoopEndFloat
+        mov     r5, #0
+        vmov    s3, r2                    @ cst does not change inside the loop: fetch it once
+
+.LoopBeginFloat:
+        add       r6, r1, r5              @ Get current source item's address in memory
+        vldr      s1, [r6, #0]            @ Load src[i]
+        vadd.f32  s10, s1, s3             @ s10 = src[i] + cst
+        add       r7, r0, r5              @ Get current destination item's address in memory
+        vstr      s10, [r7, #0]           @ Store the result back into the main memory
+        add       r5, r5, #4              @ increase the offset by 1*sizeof(float)
+        subs      r3, r3, #1              @ one item fewer to go
+        bne        .LoopBeginFloat        @ Continue until the counter reaches zero
+
+.LoopEndFloat:
+        mov     r0, NE10_OK             @ Return NE10_OK
+        pop     {r4, r5, r6, r7}
+        bx      lr
+
+
+
+
+        .balign   4
+        .global   addc_vec2f_asm
+        .thumb
+        .thumb_func
+
+addc_vec2f_asm:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ arm_result_t addc_vec2f_asm(arm_vec2f_t * dst,
+        @                 arm_vec2f_t * src, const arm_vec2f_t * cst,
+        @                 unsigned int count)
+        @
+        @  r0: *dst
+        @  r1: *src
+        @  r2: *cst
+        @  r3: int count, counted down to zero as the loop counter
+        @
+        @  r5: byte offset of the current item in both src[] and dst[]
+        @  r6: address of the current source item, base(r1)+offset(r5)
+        @  r7: address of the current destination item, base(r0)+offset(r5)
+        @
+        @  r4-r7 are saved on entry and restored before returning.
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        push    {r4, r5, r6, r7}
+        mov     r5, #0                    @ start at offset 0
+        cbz     r3, .LoopEndVec2F         @ count == 0: nothing to add
+
+.LoopBeginVec2F:
+        @ cst is read again on every pass; all loads come before the stores
+        add       r6, r1, r5              @ r6 = address of src[i]
+        add       r7, r0, r5              @ r7 = address of dst[i]
+        vldr      s3, [r2, #0]            @ s3 = cst->x
+        vldr      s1, [r6, #0]            @ s1 = src[i].x
+        vadd.f32  s10, s1, s3             @ s10 = src[i].x + cst->x
+        vldr      s4, [r2, #4]            @ s4 = cst->y
+        vldr      s2, [r6, #4]            @ s2 = src[i].y
+        vadd.f32  s11, s2, s4             @ s11 = src[i].y + cst->y
+        vstr      s10, [r7, #0]           @ dst[i].x = s10
+        vstr      s11, [r7, #4]           @ dst[i].y = s11
+        add       r5, r5, #8              @ advance the offset by 2*sizeof(float)
+        subs      r3, r3, #1              @ one item fewer to go
+        bne        .LoopBeginVec2F        @ Continue until the counter reaches zero
+
+.LoopEndVec2F:
+        mov     r0, NE10_OK             @ Return NE10_OK
+        pop     {r4, r5, r6, r7}
+        bx      lr
+
+
+
+
+        .balign   4
+        .global   addc_vec3f_asm
+        .thumb
+        .thumb_func
+
+addc_vec3f_asm:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ arm_result_t addc_vec3f_asm(arm_vec3f_t * dst,
+        @                 arm_vec3f_t * src, const arm_vec3f_t * cst,
+        @                 unsigned int count)
+        @
+        @  r0: *dst
+        @  r1: *src
+        @  r2: *cst
+        @  r3: int count, counted down to zero as the loop counter
+        @
+        @  r5: byte offset of the current item in both src[] and dst[]
+        @  r6: address of the current source item, base(r1)+offset(r5)
+        @  r7: address of the current destination item, base(r0)+offset(r5)
+        @
+        @  r4-r7 are saved on entry and restored before returning.
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        push    {r4, r5, r6, r7}
+        mov     r5, #0                    @ start at offset 0
+        cbz     r3, .LoopEndVec3F         @ count == 0: nothing to add
+
+.LoopBeginVec3F:
+        add       r6, r1, r5              @ r6 = address of src[i]
+        add       r7, r0, r5              @ r7 = address of dst[i]
+        vldr      s4, [r2, #0]            @ s4 = cst->x (cst is read again on every pass)
+        vldr      s1, [r6, #0]            @ s1 = src[i].x
+        vadd.f32  s10, s1, s4             @ s10 = src[i].x + cst->x
+        vldr      s5, [r2, #4]            @ s5 = cst->y
+        vldr      s2, [r6, #4]            @ s2 = src[i].y
+        vadd.f32  s11, s2, s5             @ s11 = src[i].y + cst->y
+        vldr      s6, [r2, #8]            @ s6 = cst->z
+        vldr      s3, [r6, #8]            @ s3 = src[i].z
+        vadd.f32  s12, s3, s6             @ s12 = src[i].z + cst->z
+        vstr      s10, [r7, #0]           @ dst[i].x = s10 (stores only start after the last load)
+        vstr      s11, [r7, #4]           @ dst[i].y = s11
+        vstr      s12, [r7, #8]           @ dst[i].z = s12
+        add       r5, r5, #12             @ advance the offset by 3*sizeof(float)
+        subs      r3, r3, #1              @ one item fewer to go
+        bne        .LoopBeginVec3F        @ Continue until the counter reaches zero
+
+.LoopEndVec3F:
+        mov     r0, NE10_OK             @ Return NE10_OK
+        pop     {r4, r5, r6, r7}
+        bx      lr
+
+
+
+
+        .balign   4
+        .global   addc_vec4f_asm
+        .thumb
+        .thumb_func
+
+addc_vec4f_asm:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ arm_result_t addc_vec4f_asm(arm_vec4f_t * dst,
+        @                 arm_vec4f_t * src, const arm_vec4f_t * cst,
+        @                 unsigned int count)
+        @
+        @  r0: *dst
+        @  r1: *src
+        @  r2: *cst
+        @  r3: int count, counted down to zero as the loop counter
+        @
+        @  r5: byte offset of the current item in both src[] and dst[]
+        @  r6: address of the current source item, base(r1)+offset(r5)
+        @  r7: address of the current destination item, base(r0)+offset(r5)
+        @
+        @  r4-r7 are saved on entry and restored before returning.
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        push    {r4, r5, r6, r7}
+        mov     r5, #0                    @ start at offset 0
+        cbz     r3, .LoopEndVec4F         @ count == 0: nothing to add
+
+.LoopBeginVec4F:
+        add       r6, r1, r5              @ r6 = address of src[i]
+        add       r7, r0, r5              @ r7 = address of dst[i]
+        vldr      s5, [r2, #0]            @ s5 = cst->x (cst is read again on every pass)
+        vldr      s1, [r6, #0]            @ s1 = src[i].x
+        vadd.f32  s10, s1, s5             @ s10 = src[i].x + cst->x
+        vldr      s6, [r2, #4]            @ s6 = cst->y
+        vldr      s2, [r6, #4]            @ s2 = src[i].y
+        vadd.f32  s11, s2, s6             @ s11 = src[i].y + cst->y
+        vldr      s7, [r2, #8]            @ s7 = cst->z
+        vldr      s3, [r6, #8]            @ s3 = src[i].z
+        vadd.f32  s12, s3, s7             @ s12 = src[i].z + cst->z
+        vldr      s8, [r2, #12]           @ s8 = cst->w
+        vldr      s4, [r6, #12]           @ s4 = src[i].w
+        vadd.f32  s13, s4, s8             @ s13 = src[i].w + cst->w
+        vstr      s10, [r7, #0]           @ dst[i].x = s10 (stores only start after the last load)
+        vstr      s11, [r7, #4]           @ dst[i].y = s11
+        vstr      s12, [r7, #8]           @ dst[i].z = s12
+        vstr      s13, [r7, #12]          @ dst[i].w = s13
+        add       r5, r5, #16             @ advance the offset by 4*sizeof(float)
+        subs      r3, r3, #1              @ one item fewer to go
+        bne        .LoopBeginVec4F        @ Continue until the counter reaches zero
+
+.LoopEndVec4F:
+        mov     r0, NE10_OK             @ Return NE10_OK
+        pop     {r4, r5, r6, r7}
+        bx      lr
diff --git a/source/NE10_addc.c b/source/NE10_addc.c
new file mode 100644
index 0000000..f913422
--- /dev/null
+++ b/source/NE10_addc.c
@@ -0,0 +1,62 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : source/NE10_addc.c
+ */
+
+#include "NE10.h"
+#include "../headers/macros.h"
+
+#include <assert.h>
+
+arm_result_t addc_float_c(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count)
+{ /* Plain C variant of addc for scalars: the per-element statement below stores src + cst into dst. */
+  NE10_XC_OPERATION_X_C /* from macros.h (not visible here); presumably supplies the loop over count, the index itr and the return value */
+  (
+    dst[ itr ] = src[ itr ] + cst; /* cst is passed by value in the scalar flavour */
+  );
+}
+
+arm_result_t addc_vec2f_c(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count)
+{ /* Plain C variant of addc for 2D vectors: component-wise src + *cst is stored into dst. */
+  NE10_XC_OPERATION_X_C /* from macros.h (not visible here); presumably supplies the loop over count, the index itr and the return value */
+  (
+    dst[ itr ].x = src[ itr ].x + cst->x; /* cst is read through the pointer for every component */
+    dst[ itr ].y = src[ itr ].y + cst->y;
+  );
+}
+
+arm_result_t addc_vec3f_c(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count)
+{ /* Plain C variant of addc for 3D vectors: component-wise src + *cst is stored into dst. */
+  NE10_XC_OPERATION_X_C /* from macros.h (not visible here); presumably supplies the loop over count, the index itr and the return value */
+  (
+    dst[ itr ].x = src[ itr ].x + cst->x; /* cst is read through the pointer for every component */
+    dst[ itr ].y = src[ itr ].y + cst->y;
+    dst[ itr ].z = src[ itr ].z + cst->z;
+  );
+}
+
+arm_result_t addc_vec4f_c(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count)
+{ /* Plain C variant of addc for 4D vectors: component-wise src + *cst is stored into dst. */
+  NE10_XC_OPERATION_X_C /* from macros.h (not visible here); presumably supplies the loop over count, the index itr and the return value */
+  (
+    dst[ itr ].x = src[ itr ].x + cst->x; /* cst is read through the pointer for every component */
+    dst[ itr ].y = src[ itr ].y + cst->y;
+    dst[ itr ].z = src[ itr ].z + cst->z;
+    dst[ itr ].w = src[ itr ].w + cst->w;
+  );
+}
diff --git a/source/NE10_addc.neon.c b/source/NE10_addc.neon.c
new file mode 100644
index 0000000..77692a2
--- /dev/null
+++ b/source/NE10_addc.neon.c
@@ -0,0 +1,68 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : source/NE10_addc.neon.c
+ */
+
+#include "NE10.h"
+#include "../headers/macros.h"
+
+#include <assert.h>
+#include <arm_neon.h>
+
+
+arm_result_t addc_float_neon(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count)
+{ /* NEON-intrinsics variant of addc for scalars; the n_* names are not declared here and presumably come from the macro. */
+    NE10_XC_OPERATION_FLOAT_NEON /* from macros.h (not visible here); takes two code fragments separated by the lone comma below */
+    (
+        n_dst = vaddq_f32( n_src , n_cst ); /* quad-register add: four float lanes per operation */
+        ,
+        n_tmp_src = vadd_f32( n_tmp_src, n_tmp_cst ); /* two-lane add, result kept in n_tmp_src; presumably covers elements left after the four-wide pass - TODO confirm in macros.h */
+    );
+}
+
+arm_result_t addc_vec2f_neon(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count)
+{ /* NEON-intrinsics variant of addc for 2D vectors; the n_* names are not declared here and presumably come from the macro. */
+    NE10_XC_OPERATION_VEC2F_NEON /* from macros.h (not visible here); takes two code fragments separated by the lone comma below */
+    (
+       n_dst = vaddq_f32( n_src , n_cst ); /* quad-register add: four float lanes, presumably two vec2f items at once - TODO confirm */
+       ,
+       n_tmp_src = vadd_f32( n_tmp_src, n_tmp_cst ); /* two-lane add, result kept in n_tmp_src; presumably a single leftover vec2f */
+    );
+}
+
+arm_result_t addc_vec3f_neon(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count)
+{ /* NEON-intrinsics variant of addc for 3D vectors; the n_* names are not declared here and presumably come from the macro. */
+    NE10_XC_OPERATION_VEC3F_NEON /* from macros.h (not visible here); takes two code fragments separated by the lone comma below */
+    (
+        n_dst1 = vaddq_f32( n_src1 , n_cst1 ); /* three quad-register adds = twelve floats, */
+        n_dst2 = vaddq_f32( n_src2 , n_cst2 ); /* presumably four vec3f items per pass;     */
+        n_dst3 = vaddq_f32( n_src3 , n_cst3 ); /* n_cst1..3 layout is set up by the macro - TODO confirm */
+        ,
+        n_tmp_src.val[0] = vadd_f32( n_tmp_src.val[0], n_tmp_cst.val[0] );  /* the X lane */
+        n_tmp_src.val[1] = vadd_f32( n_tmp_src.val[1], n_tmp_cst.val[1] );  /* the Y lane */
+        n_tmp_src.val[2] = vadd_f32( n_tmp_src.val[2], n_tmp_cst.val[2] );  /* the Z lane */
+     );
+}
+
+arm_result_t addc_vec4f_neon(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count)
+{ /* NEON-intrinsics variant of addc for 4D vectors; the n_* names are not declared here and presumably come from the macro. */
+    NE10_XC_OPERATION_VEC4F_NEON /* from macros.h (not visible here); unlike the other flavours it takes a single code fragment */
+    (
+        n_dst = vaddq_f32( n_src , n_cst ); /* quad-register add: four float lanes, presumably exactly one vec4f - TODO confirm */
+    );
+}
diff --git a/source/NE10_addc_test.c b/source/NE10_addc_test.c
new file mode 100644
index 0000000..ccedae2
--- /dev/null
+++ b/source/NE10_addc_test.c
@@ -0,0 +1,57 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : source/NE10_addc_test.c
+ */
+
+//Make sure the following are defined before including the unit-test header ("../headers/unit_test_xc_operation_x.h") below
+
+// length of the data arrays
+#define ARRLEN TEST_ARRLEN
+// number of the operations in a given unit
+#define OP_COUNT 4
+// number of the different implementations of each of the functions (C, ASM, NEON, ...)
+#define IMPL_COUNT 3
+
+
+#include "../headers/unit_test_xc_operation_x.h"
+
+void init_ftbl()
+{
+   // Implementations that actually exist, listed in ftbl slot order:
+   //  one row per operation, holding its C, ASM and NEON variants.
+   const arm_func_4args_t impls[] =
+   {
+      (arm_func_4args_t) addc_float_c, (arm_func_4args_t) addc_float_asm, (arm_func_4args_t) addc_float_neon,
+      (arm_func_4args_t) addc_vec2f_c, (arm_func_4args_t) addc_vec2f_asm, (arm_func_4args_t) addc_vec2f_neon,
+      (arm_func_4args_t) addc_vec3f_c, (arm_func_4args_t) addc_vec3f_asm, (arm_func_4args_t) addc_vec3f_neon,
+      (arm_func_4args_t) addc_vec4f_c, (arm_func_4args_t) addc_vec4f_asm, (arm_func_4args_t) addc_vec4f_neon
+   };
+   const unsigned int impl_total = sizeof( impls ) / sizeof( impls[ 0 ] );
+   unsigned int slot;
+
+   // copy the table into the global function table, slot by slot
+   for ( slot = 0; slot < impl_total; slot++ )
+   {
+      ftbl[ slot ] = impls[ slot ];
+   }
+}
+
+arm_result_t main( int argc, char **argv )
+{ // NOTE(review): main conventionally returns int; confirm arm_result_t is int-compatible
+   return run_test( argc, argv ); // run_test is not defined in this file; presumably it comes from the unit-test header included above
+}
diff --git a/source/NE10_div.asm.s b/source/NE10_div.asm.s
new file mode 100644
index 0000000..cd92054
--- /dev/null
+++ b/source/NE10_div.asm.s
@@ -0,0 +1,61 @@
+@
+@  Copyright 2011-12 ARM Limited
+@
+@  Licensed under the Apache License, Version 2.0 (the "License");
+@  you may not use this file except in compliance with the License.
+@  You may obtain a copy of the License at
+@
+@      http://www.apache.org/licenses/LICENSE-2.0
+@
+@  Unless required by applicable law or agreed to in writing, software
+@  distributed under the License is distributed on an "AS IS" BASIS,
+@  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@  See the License for the specific language governing permissions and
+@  limitations under the License.
+@
+
+@
+@ NE10 Library : source/NE10_div.asm.s
+@
+
+        .text
+        .syntax   unified
+
+.include "headers/NE10header.s"
+
+        .balign   4
+        .global   div_float_asm
+        .thumb
+        .thumb_func
+
+div_float_asm:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ arm_result_t div_float(arm_float_t * dst,
+        @                 arm_float_t * src1, arm_float_t * src2,
+        @                 unsigned int count)
+        @
+        @  r0: *dst  - used directly as the current dst entry's address (advanced by 4 per item)
+        @  r1: *src1 - used directly as the current src1 entry's address (advanced by 4 per item)
+        @  r2: *src2 - used directly as the current src2 entry's address (advanced by 4 per item)
+        @  r3: int count
+        @
+        @  r3: loop counter, counted down to zero (s1, s2 and s10 are used as scratch)
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        cbz     r3, .LoopEndFloat       @ Nothing to do when count is zero
+
+.LoopBeginFloat:
+        vldr      s1, [r1]                @ Load s1 = src1[i]
+        add       r1, r1, #4              @ move to the next src1 entry
+        vldr      s2, [r2]                @ Load s2 = src2[i]
+        add       r2, r2, #4              @ move to the next src2 entry
+        vdiv.f32  s10, s1, s2             @ s10 = src1[i] / src2[i]
+        vstr      s10, [r0]               @ Store the result into dst[i]
+        add       r0, r0, #4              @ move to the next dst entry
+        subs      r3, r3, #1              @ one fewer item left; updates the flags tested below
+        bne        .LoopBeginFloat        @ Repeat until the counter reaches zero
+
+.LoopEndFloat:
+        mov     r0, NE10_OK             @ Return NE10_OK
+        bx      lr
diff --git a/source/NE10_div.c b/source/NE10_div.c
new file mode 100644
index 0000000..1348de0
--- /dev/null
+++ b/source/NE10_div.c
@@ -0,0 +1,32 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : source/NE10_div.c
+ */
+
+#include "NE10.h"
+#include "../headers/macros.h"
+
+#include <assert.h>
+
+arm_result_t div_float_c(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count)
+{ /* Plain C variant of div: the per-element statement below stores src1 / src2 into dst. */
+  NE10_X_OPERATION_FLOAT_C /* from macros.h (not visible here); presumably supplies the loop over count, the index itr and the return value */
+  (
+    dst[ itr ] = src1[ itr ] / src2[ itr ]; /* no zero check on the divisor is made here */
+  );
+}
diff --git a/source/NE10_div.neon.c b/source/NE10_div.neon.c
new file mode 100644
index 0000000..df90a6a
--- /dev/null
+++ b/source/NE10_div.neon.c
@@ -0,0 +1,46 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : source/NE10_div.neon.c
+ */
+
+#include "NE10.h"
+#include "../headers/macros.h"
+
+#include <assert.h>
+#include <arm_neon.h>
+
+ #include <stdio.h>
+ #include <stdlib.h>
+
+arm_result_t div_float_neon(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count)
+{ /* NEON-intrinsics variant of div: multiplies by a refined reciprocal estimate of src2; NOTE(review): results may differ slightly from the C '/' - confirm test tolerance. */
+    NE10_X_OPERATION_FLOAT_NEON /* from macros.h (not visible here); takes two code fragments separated by the lone comma below */
+    (
+      /* four-lane fragment: approximate 1 / n_src2, then multiply (n_* names presumably come from the macro) */
+      float32x4_t rec = vrecpeq_f32( n_src2 ); /* initial reciprocal estimate */
+      rec = vmulq_f32(vrecpsq_f32(n_src2, rec), rec); /* Newton-Raphson step: rec = rec * (2 - n_src2 * rec) */
+      rec = vmulq_f32(vrecpsq_f32(n_src2, rec), rec); /* second refinement step */
+      n_dst = vmulq_f32( n_src , rec ); /* n_dst = n_src * (1 / n_src2) */
+      ,
+      /* two-lane fragment: same scheme on 64-bit vectors; the quotient overwrites n_tmp_src */
+      float32x2_t rec = vrecpe_f32( n_tmp_src2 ); /* initial reciprocal estimate */
+      rec = vmul_f32(vrecps_f32(n_tmp_src2, rec), rec); /* Newton-Raphson step: rec = rec * (2 - n_tmp_src2 * rec) */
+      rec = vmul_f32(vrecps_f32(n_tmp_src2, rec), rec); /* second refinement step */
+      n_tmp_src = vmul_f32( n_tmp_src, rec ); /* n_tmp_src = n_tmp_src * (1 / n_tmp_src2) */
+    );
+}
diff --git a/source/NE10_div_test.c b/source/NE10_div_test.c
new file mode 100644
index 0000000..8527f97
--- /dev/null
+++ b/source/NE10_div_test.c
@@ -0,0 +1,45 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : source/NE10_div_test.c
+ */
+
+//Make sure the following are defined before including the unit-test header ("../headers/unit_test_x_operation_x.h") below
+
+// length of the data arrays
+#define ARRLEN TEST_ARRLEN
+// number of the operations in a given unit
+#define OP_COUNT 1
+// number of the different implementations of each of the functions (C, ASM, NEON, ...)
+#define IMPL_COUNT 3
+
+
+#include "../headers/unit_test_x_operation_x.h"
+
+void init_ftbl()
+{
+   // Implementations that actually exist, in ftbl slot order: C, ASM, NEON.
+   const arm_func_4args_t impls[] = { (arm_func_4args_t) div_float_c, (arm_func_4args_t) div_float_asm, (arm_func_4args_t) div_float_neon };
+   unsigned int slot;
+   // copy the table into the global function table, slot by slot
+   for ( slot = 0; slot < sizeof( impls ) / sizeof( impls[ 0 ] ); slot++ ) { ftbl[ slot ] = impls[ slot ]; }
+}
+
+arm_result_t main( int argc, char **argv )
+{ // NOTE(review): main conventionally returns int; confirm arm_result_t is int-compatible
+   return run_test( argc, argv ); // run_test is not defined in this file; presumably it comes from the unit-test header included above
+}
diff --git a/source/NE10_divc.asm.s b/source/NE10_divc.asm.s
new file mode 100644
index 0000000..5f2ea3b
--- /dev/null
+++ b/source/NE10_divc.asm.s
@@ -0,0 +1,233 @@
+@
+@  Copyright 2011-12 ARM Limited
+@
+@  Licensed under the Apache License, Version 2.0 (the "License");
+@  you may not use this file except in compliance with the License.
+@  You may obtain a copy of the License at
+@
+@      http://www.apache.org/licenses/LICENSE-2.0
+@
+@  Unless required by applicable law or agreed to in writing, software
+@  distributed under the License is distributed on an "AS IS" BASIS,
+@  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@  See the License for the specific language governing permissions and
+@  limitations under the License.
+@
+
+@
+@ NE10 Library : source/NE10_divc.asm.s
+@
+
+        .text
+        .syntax   unified
+
+.include "headers/NE10header.s"
+
+        .balign   4
+        .global   divc_float_asm
+        .thumb
+        .thumb_func
+
+divc_float_asm:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ arm_result_t divc_float(arm_float_t * dst,
+        @                 arm_float_t * src, const arm_float_t cst,
+        @                 unsigned int count)
+        @
+        @  r0: *dst
+        @  r1: *src
+        @  r2: cst (the float's bit pattern, passed by value in a core register)
+        @  r3: int count
+        @
+        @  r3: loop counter, counted down to zero
+        @  r5: current item's offset in both src[] and dst[]
+        @  r6: current source item's address made of base(r1)+offset(r5)
+        @  r7: current destination item's address made of base(r0)+offset(r5)
+        @  s3: cst, moved in from r2 once before the loop (it never changes inside it)
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        push    {r4, r5, r6, r7}
+        cbz     r3, .LoopEndFloat       @ Nothing to do when count is zero
+        mov     r5, #0
+        vmov      s3, r2                  @ Get cst into register s3 (loop-invariant, so done only once)
+
+.LoopBeginFloat:
+        add       r6, r1, r5              @ Get current source item's address in memory
+        vldr      s1, [r6, #0]            @ Load src[i]
+        vdiv.f32  s10, s1, s3             @ s10 = src[i] / cst
+        add       r7, r0, r5              @ Get current destination item's address in memory
+        vstr      s10, [r7, #0]           @ Store the result back into the main memory
+        add       r5, r5, #4              @ increase the offset by 1*sizeof(float)
+        subs      r3, r3, #1              @ one fewer item left; updates the flags tested below
+        bne        .LoopBeginFloat        @ Repeat until the counter reaches zero
+
+.LoopEndFloat:
+        mov     r0, NE10_OK             @ Return NE10_OK
+        pop     {r4, r5, r6, r7}
+        bx      lr
+
+
+
+
+        .balign   4
+        .global   divc_vec2f_asm
+        .thumb
+        .thumb_func
+
+divc_vec2f_asm:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ arm_result_t divc_vec2f(arm_vec2f_t * dst,
+        @                 arm_vec2f_t * src, const arm_vec2f_t * cst,
+        @                 unsigned int count)
+        @
+        @  r0: *dst
+        @  r1: *src
+        @  r2: *cst
+        @  r3: int count
+        @
+        @  r3: loop counter, counted down to zero
+        @  r5: current item's offset in both src[] and dst[]
+        @  r6: current source item's address made of base(r1)+offset(r5)
+        @  r7: current destination item's address made of base(r0)+offset(r5)
+        @  r4: saved and restored together with r5-r7 but not otherwise used here
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        push    {r4, r5, r6, r7}
+        cbz     r3, .LoopEndVec2F       @ Nothing to do when count is zero
+        mov     r5, #0
+
+.LoopBeginVec2F:
+        add       r6, r1, r5              @ Get current source item's address in memory
+        vldr      s1, [r6, #0]            @ Load src[i].x and src[i].y
+        vldr      s2, [r6, #4]
+        vldr      s3, [r2, #0]            @ Load cst->x and cst->y (re-read from memory on every iteration)
+        vldr      s4, [r2, #4]
+        vdiv.f32  s10, s1, s3             @ s10 = src[i].x / cst->x
+        vdiv.f32  s11, s2, s4             @ s11 = src[i].y / cst->y
+        add       r7, r0, r5              @ Get current destination item's address in memory
+        vstr      s10, [r7, #0]           @ Store the results back into the main memory
+        vstr      s11, [r7, #4]
+        add       r5, r5, #8              @ increase the offset by 2*sizeof(float) @@ (for x and y)
+        subs      r3, r3, #1              @ one fewer item left; updates the flags tested below
+        bne        .LoopBeginVec2F        @ Repeat until the counter reaches zero
+
+.LoopEndVec2F:
+        mov     r0, NE10_OK             @ Return NE10_OK
+        pop     {r4, r5, r6, r7}
+        bx      lr
+
+
+
+
+        .balign   4
+        .global   divc_vec3f_asm
+        .thumb
+        .thumb_func
+
+divc_vec3f_asm:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ arm_result_t divc_vec3f(arm_vec3f_t * dst,
+        @                 arm_vec3f_t * src, const arm_vec3f_t * cst,
+        @                 unsigned int count)
+        @
+        @  r0: *dst
+        @  r1: *src
+        @  r2: *cst
+        @  r3: int count
+        @
+        @  r3: loop counter, counted down to zero
+        @  r5: current item's offset in both src[] and dst[]
+        @  r6: current source item's address made of base(r1)+offset(r5)
+        @  r7: current destination item's address made of base(r0)+offset(r5)
+        @  r4: saved and restored together with r5-r7 but not otherwise used here
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        push    {r4, r5, r6, r7}
+        cbz     r3, .LoopEndVec3F       @ Nothing to do when count is zero
+        mov     r5, #0
+
+.LoopBeginVec3F:
+        add       r6, r1, r5              @ Get current source item's address in memory
+        vldr      s1, [r6, #0]            @ Load src[i].x, src[i].y , and src[i].z
+        vldr      s2, [r6, #4]
+        vldr      s3, [r6, #8]
+        vldr      s4, [r2, #0]            @ Load cst->x, cst->y, and cst->z (re-read from memory on every iteration)
+        vldr      s5, [r2, #4]
+        vldr      s6, [r2, #8]
+        vdiv.f32  s10, s1, s4             @ s10 = src[i].x / cst->x
+        vdiv.f32  s11, s2, s5             @ s11 = src[i].y / cst->y
+        vdiv.f32  s12, s3, s6             @ s12 = src[i].z / cst->z
+        add       r7, r0, r5              @ Get current destination item's address in memory
+        vstr      s10, [r7, #0]           @ Store the results back into the main memory
+        vstr      s11, [r7, #4]
+        vstr      s12, [r7, #8]
+        add       r5, r5, #12             @ increase the offset by 3*sizeof(float) @@ (for x, y, and z)
+        subs      r3, r3, #1              @ one fewer item left; updates the flags tested below
+        bne        .LoopBeginVec3F        @ Repeat until the counter reaches zero
+
+.LoopEndVec3F:
+        mov     r0, NE10_OK             @ Return NE10_OK
+        pop     {r4, r5, r6, r7}
+        bx      lr
+
+
+
+
+        .balign   4
+        .global   divc_vec4f_asm
+        .thumb
+        .thumb_func
+
+divc_vec4f_asm:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ arm_result_t divc_vec4f(arm_vec4f_t * dst,
+        @                 arm_vec4f_t * src, const arm_vec4f_t * cst,
+        @                 unsigned int count)
+        @
+        @  r0: *dst
+        @  r1: *src
+        @  r2: *cst
+        @  r3: int count
+        @
+        @  r3: loop counter, counted down to zero
+        @  r5: current item's offset in both src[] and dst[]
+        @  r6: current source item's address made of base(r1)+offset(r5)
+        @  r7: current destination item's address made of base(r0)+offset(r5)
+        @  r4: saved and restored together with r5-r7 but not otherwise used here
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        push    {r4, r5, r6, r7}
+        cbz     r3, .LoopEndVec4F       @ Nothing to do when count is zero
+        mov     r5, #0
+
+.LoopBeginVec4F:
+        add       r6, r1, r5              @ Get current source item's address in memory
+        vldr      s1, [r6, #0]            @ Load src[i].x, src[i].y , src[i].z, and w
+        vldr      s2, [r6, #4]
+        vldr      s3, [r6, #8]
+        vldr      s4, [r6, #12]
+        vldr      s5, [r2, #0]            @ Load cst->x, cst->y, cst->z, and w (re-read from memory on every iteration)
+        vldr      s6, [r2, #4]
+        vldr      s7, [r2, #8]
+        vldr      s8, [r2, #12]
+        vdiv.f32  s10, s1, s5             @ s10 = src[i].x / cst->x
+        vdiv.f32  s11, s2, s6             @ s11 = src[i].y / cst->y
+        vdiv.f32  s12, s3, s7             @ s12 = src[i].z / cst->z
+        vdiv.f32  s13, s4, s8             @ s13 = src[i].w / cst->w
+        add       r7, r0, r5              @ Get current destination item's address in memory
+        vstr      s10, [r7, #0]           @ Store the results back into the main memory
+        vstr      s11, [r7, #4]
+        vstr      s12, [r7, #8]
+        vstr      s13, [r7, #12]
+        add       r5, r5, #16             @ increase the offset by 4*sizeof(float) @@ (for x, y, z, and w)
+        subs      r3, r3, #1              @ one fewer item left; updates the flags tested below
+        bne        .LoopBeginVec4F        @ Repeat until the counter reaches zero
+
+.LoopEndVec4F:
+        mov     r0, NE10_OK             @ Return NE10_OK
+        pop     {r4, r5, r6, r7}
+        bx      lr
diff --git a/source/NE10_divc.c b/source/NE10_divc.c
new file mode 100644
index 0000000..cec0b48
--- /dev/null
+++ b/source/NE10_divc.c
@@ -0,0 +1,62 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : source/NE10_divc.c
+ */
+
+#include "NE10.h"
+#include "../headers/macros.h"
+
+#include <assert.h>
+
+arm_result_t divc_float_c(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count)
+{ /* Plain C variant of divc for scalars: the per-element statement below stores src / cst into dst. */
+  NE10_XC_OPERATION_X_C /* from macros.h (not visible here); presumably supplies the loop over count, the index itr and the return value */
+  (
+    dst[ itr ] = src[ itr ] / cst; /* cst is passed by value; no zero check is made here */
+  );
+}
+
+arm_result_t divc_vec2f_c(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count)
+{ /* Plain C variant of divc for 2D vectors: component-wise src / *cst is stored into dst. */
+  NE10_XC_OPERATION_X_C /* from macros.h (not visible here); presumably supplies the loop over count, the index itr and the return value */
+  (
+    dst[ itr ].x = src[ itr ].x / cst->x; /* cst is read through the pointer; no zero check is made here */
+    dst[ itr ].y = src[ itr ].y / cst->y;
+  );
+}
+
+arm_result_t divc_vec3f_c(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count)
+{ /* Plain C variant of divc for 3D vectors: component-wise src / *cst is stored into dst. */
+  NE10_XC_OPERATION_X_C /* from macros.h (not visible here); presumably supplies the loop over count, the index itr and the return value */
+  (
+    dst[ itr ].x = src[ itr ].x / cst->x; /* cst is read through the pointer; no zero check is made here */
+    dst[ itr ].y = src[ itr ].y / cst->y;
+    dst[ itr ].z = src[ itr ].z / cst->z;
+  );
+}
+
+arm_result_t divc_vec4f_c(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count)
+{ /* Plain C variant of divc for 4D vectors: component-wise src / *cst is stored into dst. */
+  NE10_XC_OPERATION_X_C /* from macros.h (not visible here); presumably supplies the loop over count, the index itr and the return value */
+  (
+    dst[ itr ].x = src[ itr ].x / cst->x; /* cst is read through the pointer; no zero check is made here */
+    dst[ itr ].y = src[ itr ].y / cst->y;
+    dst[ itr ].z = src[ itr ].z / cst->z;
+    dst[ itr ].w = src[ itr ].w / cst->w;
+  );
+}
diff --git a/source/NE10_divc.neon.c b/source/NE10_divc.neon.c
new file mode 100644
index 0000000..cc86ff5
--- /dev/null
+++ b/source/NE10_divc.neon.c
@@ -0,0 +1,116 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : source/NE10_divc.neon.c
+ */
+
+#include "NE10.h"
+#include "../headers/macros.h"
+
+#include <assert.h>
+#include <arm_neon.h>
+
+   #include <stdio.h>
+   #include <stdlib.h>
+
+arm_result_t divc_float_neon(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count)
+{
+    /* NEON-intrinsics variant of divc for scalars: multiplies src by a refined reciprocal estimate of cst. */
+    /* NOTE(review): results may differ slightly from the C '/' operator - confirm the test tolerance.       */
+    NE10_XC_OPERATION_FLOAT_NEON /* from macros.h (not visible here); takes two code fragments separated by the lone comma below */
+    (
+         /* four-lane fragment: approximate 1 / n_cst, then multiply (n_* names presumably come from the macro) */
+         float32x4_t rec = vrecpeq_f32( n_cst ); /* initial reciprocal estimate */
+         rec = vmulq_f32(vrecpsq_f32(n_cst, rec), rec); /* Newton-Raphson step: rec = rec * (2 - n_cst * rec) */
+         rec = vmulq_f32(vrecpsq_f32(n_cst, rec), rec); /* second refinement step */
+         n_dst = vmulq_f32( n_src , rec ); /* n_dst = n_src * (1 / n_cst) */
+        ,
+         /* two-lane fragment: same scheme on 64-bit vectors; the quotient overwrites n_tmp_src */
+         float32x2_t rec = vrecpe_f32( n_tmp_cst ); /* initial reciprocal estimate */
+         rec = vmul_f32(vrecps_f32(n_tmp_cst, rec), rec); /* Newton-Raphson step: rec = rec * (2 - n_tmp_cst * rec) */
+         rec = vmul_f32(vrecps_f32(n_tmp_cst, rec), rec); /* second refinement step */
+         n_tmp_src = vmul_f32( n_tmp_src, rec ); /* n_tmp_src = n_tmp_src * (1 / n_tmp_cst) */
+    );
+}
+
+arm_result_t divc_vec2f_neon(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count)
+{ /* NEON-intrinsics variant of divc for 2D vectors: multiplies by a refined reciprocal estimate of cst; NOTE(review): may differ slightly from the C '/' - confirm test tolerance. */
+    NE10_XC_OPERATION_VEC2F_NEON /* from macros.h (not visible here); takes two code fragments separated by the lone comma below */
+    (
+         /* four-lane fragment: approximate 1 / n_cst, then multiply (n_* names presumably come from the macro) */
+         float32x4_t rec = vrecpeq_f32( n_cst ); /* initial reciprocal estimate */
+         rec = vmulq_f32(vrecpsq_f32(n_cst, rec), rec); /* Newton-Raphson step: rec = rec * (2 - n_cst * rec) */
+         rec = vmulq_f32(vrecpsq_f32(n_cst, rec), rec); /* second refinement step */
+         n_dst = vmulq_f32( n_src , rec ); /* n_dst = n_src * (1 / n_cst) */
+        ,
+         /* two-lane fragment: same scheme on 64-bit vectors; the quotient overwrites n_tmp_src */
+         float32x2_t rec = vrecpe_f32( n_tmp_cst ); /* initial reciprocal estimate */
+         rec = vmul_f32(vrecps_f32(n_tmp_cst, rec), rec); /* Newton-Raphson step: rec = rec * (2 - n_tmp_cst * rec) */
+         rec = vmul_f32(vrecps_f32(n_tmp_cst, rec), rec); /* second refinement step */
+         n_tmp_src = vmul_f32( n_tmp_src, rec ); /* n_tmp_src = n_tmp_src * (1 / n_tmp_cst) */
+    );
+}
+
+arm_result_t divc_vec3f_neon(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count)
+{ /* NEON-intrinsics variant of divc for 3D vectors: multiplies by refined reciprocal estimates of cst; NOTE(review): may differ slightly from the C '/' - confirm test tolerance. */
+    NE10_XC_OPERATION_VEC3F_NEON /* from macros.h (not visible here); takes two code fragments separated by the lone comma below */
+    (
+         /* four-lane fragment: three divisions, one per quad register (n_* names presumably come from the macro) */
+         float32x4_t rec = vrecpeq_f32( n_cst1 ); /* initial reciprocal estimate of n_cst1 */
+         rec = vmulq_f32(vrecpsq_f32(n_cst1, rec), rec); /* Newton-Raphson step: rec = rec * (2 - n_cst1 * rec) */
+         rec = vmulq_f32(vrecpsq_f32(n_cst1, rec), rec); /* second refinement step */
+         n_dst1 = vmulq_f32( n_src1 , rec ); /* n_dst1 = n_src1 * (1 / n_cst1) */
+
+         rec = vrecpeq_f32( n_cst2 ); /* rec is reused for the second register */
+         rec = vmulq_f32(vrecpsq_f32(n_cst2, rec), rec);
+         rec = vmulq_f32(vrecpsq_f32(n_cst2, rec), rec);
+         n_dst2 = vmulq_f32( n_src2 , rec ); /* n_dst2 = n_src2 * (1 / n_cst2) */
+
+         rec = vrecpeq_f32( n_cst3 ); /* rec is reused for the third register */
+         rec = vmulq_f32(vrecpsq_f32(n_cst3, rec), rec);
+         rec = vmulq_f32(vrecpsq_f32(n_cst3, rec), rec);
+         n_dst3 = vmulq_f32( n_src3 , rec ); /* n_dst3 = n_src3 * (1 / n_cst3) */
+        ,
+         /* two-lane fragment: three divisions, one per .val[] entry; the quotients overwrite n_tmp_src */
+         float32x2_t rec = vrecpe_f32( n_tmp_cst.val[0] ); /* initial reciprocal estimate for .val[0] */
+         rec = vmul_f32(vrecps_f32(n_tmp_cst.val[0], rec), rec); /* Newton-Raphson step: rec = rec * (2 - cst * rec) */
+         rec = vmul_f32(vrecps_f32(n_tmp_cst.val[0], rec), rec); /* second refinement step */
+         n_tmp_src.val[0] = vmul_f32( n_tmp_src.val[0] , rec );
+
+         rec = vrecpe_f32( n_tmp_cst.val[1] ); /* rec is reused for .val[1] */
+         rec = vmul_f32(vrecps_f32(n_tmp_cst.val[1], rec), rec);
+         rec = vmul_f32(vrecps_f32(n_tmp_cst.val[1], rec), rec);
+         n_tmp_src.val[1] = vmul_f32( n_tmp_src.val[1] , rec );
+
+         rec = vrecpe_f32( n_tmp_cst.val[2] ); /* rec is reused for .val[2] */
+         rec = vmul_f32(vrecps_f32(n_tmp_cst.val[2], rec), rec);
+         rec = vmul_f32(vrecps_f32(n_tmp_cst.val[2], rec), rec);
+         n_tmp_src.val[2] = vmul_f32( n_tmp_src.val[2] , rec );
+     );
+}
+
+arm_result_t divc_vec4f_neon(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count)
+{ /* NEON-intrinsics variant of divc for 4D vectors: multiplies by a refined reciprocal estimate of cst; NOTE(review): may differ slightly from the C '/' - confirm test tolerance. */
+    NE10_XC_OPERATION_VEC4F_NEON /* from macros.h (not visible here); unlike the other flavours it takes a single code fragment */
+    (
+         /* approximate 1 / n_cst, then multiply (n_* names presumably come from the macro) */
+         float32x4_t rec = vrecpeq_f32( n_cst ); /* initial reciprocal estimate */
+         rec = vmulq_f32(vrecpsq_f32(n_cst, rec), rec); /* Newton-Raphson step: rec = rec * (2 - n_cst * rec) */
+         rec = vmulq_f32(vrecpsq_f32(n_cst, rec), rec); /* second refinement step */
+         n_dst = vmulq_f32( n_src , rec ); /* n_dst = n_src * (1 / n_cst) */
+    );
+}
diff --git a/source/NE10_divc_test.c b/source/NE10_divc_test.c
new file mode 100644
index 0000000..52a8a58
--- /dev/null
+++ b/source/NE10_divc_test.c
@@ -0,0 +1,57 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : source/NE10_divc_test.c
+ */
+
+//Make sure the following are defined before including the unit-test header ("../headers/unit_test_xc_operation_x.h") below
+
+// length of the data arrays
+#define ARRLEN TEST_ARRLEN
+// number of the operations in a given unit
+#define OP_COUNT 4
+// number of the different implementations of each of the functions (C, ASM, NEON, ...)
+#define IMPL_COUNT 3
+
+
+#include "../headers/unit_test_xc_operation_x.h"
+
+void init_ftbl()
+{
+   // Implementations that actually exist, listed in ftbl slot order:
+   //  one row per operation, holding its C, ASM and NEON variants.
+   const arm_func_4args_t impls[] =
+   {
+      (arm_func_4args_t) divc_float_c, (arm_func_4args_t) divc_float_asm, (arm_func_4args_t) divc_float_neon,
+      (arm_func_4args_t) divc_vec2f_c, (arm_func_4args_t) divc_vec2f_asm, (arm_func_4args_t) divc_vec2f_neon,
+      (arm_func_4args_t) divc_vec3f_c, (arm_func_4args_t) divc_vec3f_asm, (arm_func_4args_t) divc_vec3f_neon,
+      (arm_func_4args_t) divc_vec4f_c, (arm_func_4args_t) divc_vec4f_asm, (arm_func_4args_t) divc_vec4f_neon
+   };
+   const unsigned int impl_total = sizeof( impls ) / sizeof( impls[ 0 ] );
+   unsigned int slot;
+
+   // copy the table into the global function table, slot by slot
+   for ( slot = 0; slot < impl_total; slot++ )
+   {
+      ftbl[ slot ] = impls[ slot ];
+   }
+}
+
+arm_result_t main( int argc, char **argv )
+{ // NOTE(review): main conventionally returns int; confirm arm_result_t is int-compatible
+   return run_test( argc, argv ); // run_test is not defined in this file; presumably it comes from the unit-test header included above
+}
diff --git a/source/NE10_len.asm.s b/source/NE10_len.asm.s
new file mode 100644
index 0000000..4ac5e3b
--- /dev/null
+++ b/source/NE10_len.asm.s
@@ -0,0 +1,139 @@
+@
+@  Copyright 2011-12 ARM Limited
+@
+@  Licensed under the Apache License, Version 2.0 (the "License");
+@  you may not use this file except in compliance with the License.
+@  You may obtain a copy of the License at
+@
+@      http://www.apache.org/licenses/LICENSE-2.0
+@
+@  Unless required by applicable law or agreed to in writing, software
+@  distributed under the License is distributed on an "AS IS" BASIS,
+@  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@  See the License for the specific language governing permissions and
+@  limitations under the License.
+@
+
+@
+@ NE10 Library : source/NE10_len.asm.s
+@
+
+        .text
+        .syntax   unified
+
+.include "headers/NE10header.s"
+
+        .balign   4
+        .global   len_vec2f_asm
+        .thumb
+        .thumb_func
+
+len_vec2f_asm:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ arm_result_t len_vec2f(arm_float_t * dst,
+        @                 arm_vec2f_t * src, unsigned int count)
+        @
+        @  r0: *dst and current destination item's address
+        @  r1: *src and current source item's address
+        @  r2: int count
+        @
+        @  r2: loop counter (counts down; the arrays are walked from the last item to the first)
+        @
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        cbz        r2, .LoopEndVec2F         @ nothing to do when count == 0
+        add        r0, r0, r2, lsl #2        @ r0 = r0 + count * 4 (one past the last dst entry)
+        add        r1, r1, r2, lsl #3        @ r1 = r1 + count * 8 (one past the last src entry)
+
+.LoopBeginVec2F:
+        vldmdb     r1!, {s10-s11}            @ step src back by 8 bytes, then load s10 = x, s11 = y
+        vmul.f32   s14, s10, s10             @ s14 = x*x
+        vmla.f32   s14, s11, s11             @ s14 = x*x + y*y
+        vsqrt.f32  s15, s14                  @ s15 = sqrt( s14 )
+        vstmdb     r0!, {s15}                @ step dst back by 4 bytes, then store s15 into that entry
+        subs       r2, r2, #1                @ decrement the loop counter
+        bne        .LoopBeginVec2F           @ loop while r2 is non-zero
+.LoopEndVec2F:
+        mov        r0, NE10_OK             @ Return NE10_OK
+        bx         lr
+
+
+
+
+        .balign   4
+        .global   len_vec3f_asm
+        .thumb
+        .thumb_func
+
+len_vec3f_asm:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ arm_result_t len_vec3f(arm_float_t * dst,
+        @                 arm_vec3f_t * src, unsigned int count)
+        @
+        @  r0: *dst and current destination item's address
+        @  r1: *src and current source item's address
+        @  r2: int count
+        @
+        @  r2: loop counter (counts down; the arrays are walked from the last item to the first)
+        @
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        cbz        r2, .LoopEndVec3F         @ nothing to do when count == 0
+        add        r0, r0, r2, lsl #2        @ r0 = r0 + count * 4 (one past the last dst entry)
+        add        r1, r1, r2, lsl #3        @ r1 = r1 + count * 8 ...
+        add        r1, r1, r2, lsl #2        @ ... + count * 4, i.e. r1 = r1 + count * 12 (one past the last src entry)
+
+.LoopBeginVec3F:
+        vldmdb     r1!, {s10-s12}            @ step src back by 12 bytes, then load s10 = x, s11 = y, s12 = z
+        vmul.f32   s14, s10, s10             @ s14 = x*x
+        vmla.f32   s14, s11, s11             @ s14 = x*x + y*y
+        vmla.f32   s14, s12, s12             @ s14 = x*x + y*y + z*z
+        vsqrt.f32  s15, s14                  @ s15 = sqrt( s14 )
+        vstmdb     r0!, {s15}                @ step dst back by 4 bytes, then store s15 into that entry
+        subs       r2, r2, #1                @ decrement the loop counter
+        bne        .LoopBeginVec3F           @ loop while r2 is non-zero
+.LoopEndVec3F:
+        mov        r0, NE10_OK             @ Return NE10_OK
+        bx         lr
+
+
+
+
+        .balign   4
+        .global   len_vec4f_asm
+        .thumb
+        .thumb_func
+
+len_vec4f_asm:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ arm_result_t len_vec4f(arm_float_t * dst,
+        @                 arm_vec4f_t * src, unsigned int count)
+        @
+        @  r0: *dst and current destination item's address
+        @  r1: *src and current source item's address
+        @  r2: int count
+        @
+        @  r2: loop counter (counts down; the arrays are walked from the last item to the first)
+        @
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        cbz        r2, .LoopEndVec4F         @ nothing to do when count == 0
+        add        r0, r0, r2, lsl #2        @ r0 = r0 + count * 4 (one past the last dst entry)
+        add        r1, r1, r2, lsl #4        @ r1 = r1 + count * 16 (one past the last src entry)
+
+.LoopBeginVec4F:
+        vldmdb     r1!, {s10-s13}            @ step src back by 16 bytes, then load s10 = x, s11 = y, s12 = z, s13 = w
+        vmul.f32   s14, s10, s10             @ s14 = x*x
+        vmla.f32   s14, s11, s11             @ s14 = x*x + y*y
+        vmla.f32   s14, s12, s12             @ s14 = x*x + y*y + z*z
+        vmla.f32   s14, s13, s13             @ s14 = x*x + y*y + z*z + w*w
+        vsqrt.f32  s15, s14                  @ s15 = sqrt( s14 )
+        vstmdb     r0!, {s15}                @ step dst back by 4 bytes, then store s15 into that entry
+        subs       r2, r2, #1                @ decrement the loop counter
+        bne        .LoopBeginVec4F           @ loop while r2 is non-zero
+.LoopEndVec4F:
+        mov        r0, NE10_OK             @ Return NE10_OK
+        bx         lr
diff --git a/source/NE10_len.c b/source/NE10_len.c
new file mode 100644
index 0000000..42bf5c3
--- /dev/null
+++ b/source/NE10_len.c
@@ -0,0 +1,56 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : source/NE10_len.c
+ */
+
+#include "NE10.h"
+#include "../headers/macros.h"
+
+#include <assert.h>
+
+#include <math.h>
+
+arm_result_t len_vec2f_c(arm_float_t * dst, arm_vec2f_t * src, unsigned int count)
+{
+  NE10_LEN_OPERATION_X_C /* itr is not declared here; presumably the macro (macros.h) loops itr over count -- confirm */
+  (
+    dst[ itr ] = sqrt( src[ itr ].x * src[ itr ].x +   /* dst = sqrt( x^2 + y^2 ) */
+                       src[ itr ].y * src[ itr ].y ) ;
+  );
+}
+
+arm_result_t len_vec3f_c(arm_float_t * dst, arm_vec3f_t * src, unsigned int count)
+{
+  NE10_LEN_OPERATION_X_C /* itr is not declared here; presumably the macro (macros.h) loops itr over count -- confirm */
+  (
+    dst[ itr ] = sqrt( src[ itr ].x * src[ itr ].x +   /* dst = sqrt( x^2 + y^2 + z^2 ) */
+                       src[ itr ].y * src[ itr ].y +
+                       src[ itr ].z * src[ itr ].z );
+  );
+}
+
+arm_result_t len_vec4f_c(arm_float_t * dst, arm_vec4f_t * src, unsigned int count)
+{
+  NE10_LEN_OPERATION_X_C /* itr is not declared here; presumably the macro (macros.h) loops itr over count -- confirm */
+  (
+    dst[ itr ] = sqrt( src[ itr ].x * src[ itr ].x +   /* dst = sqrt( x^2 + y^2 + z^2 + w^2 ) */
+                       src[ itr ].y * src[ itr ].y +
+                       src[ itr ].z * src[ itr ].z +
+                       src[ itr ].w * src[ itr ].w );
+  );
+}
diff --git a/source/NE10_len.neon.s b/source/NE10_len.neon.s
new file mode 100644
index 0000000..6fde8b3
--- /dev/null
+++ b/source/NE10_len.neon.s
@@ -0,0 +1,354 @@
+@
+@  Copyright 2011-12 ARM Limited
+@
+@  Licensed under the Apache License, Version 2.0 (the "License");
+@  you may not use this file except in compliance with the License.
+@  You may obtain a copy of the License at
+@
+@      http://www.apache.org/licenses/LICENSE-2.0
+@
+@  Unless required by applicable law or agreed to in writing, software
+@  distributed under the License is distributed on an "AS IS" BASIS,
+@  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@  See the License for the specific language governing permissions and
+@  limitations under the License.
+@
+
+@
+@ NE10 Library : source/NE10_len.neon.s
+@
+
+        .text
+        .syntax   unified
+
+.include "headers/NE10header.s"
+
+
+
+
+        .balign   4
+        .global   len_vec2f_neon
+        .thumb
+        .thumb_func
+
+len_vec2f_neon:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ arm_result_t len_vec2f(arm_float_t * dst,
+        @                 arm_vec2f_t * src,
+        @                 unsigned int count);
+        @
+        @  r0: *dst & the current dst entry's address
+        @  r1: *src & current src entry's address
+        @  r2: int count & the number of items in the input array that can be
+        @                   processed in chunks of 4 vectors
+        @  r3: the number of items that are left to be processed at the end of
+        @                   the input array
+        @  q8: scratch; q4-q7 (d8-d15) are callee-saved under the AAPCS and are not touched
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        and               r3, r2, #3          @ r3 = count % 4;
+        sub               r2, r2, r3          @ r2 = count - r3; the multiple-of-4 part handled by the main loop
+        cbz               r2, .L_check_vec2   @ fewer than 4 vectors: go straight to the tail loop
+
+
+        @ load values for the first iteration
+          vld2.32         {q0-q1}, [r1]!      @ de-interleave 4 vectors: q0 = x0..x3, q1 = y0..y3
+          subs            r2, r2, #4
+
+        @ calculate sum of square of the components
+          vmul.f32        q2, q0, q0
+          vmla.f32        q2, q1, q1
+
+          ble             .L_mainloopend_vec2 @ uses the flags of the subs above; that was the only chunk
+
+.L_mainloop_vec2:
+
+       @ load the next set of values
+        vld2.32           {q0-q1}, [r1]!
+        subs              r2, r2, #4
+
+        @ sqrt(s) of the previous chunk as s * rsqrt(s): estimate + one Newton-Raphson step
+          vrsqrte.f32     q3, q2              @ q3 = estimate e of 1/sqrt(s)
+          vmul.f32        q8, q2, q3          @ q8 = s * e
+          vrsqrts.f32     q8, q8, q3          @ q8 = (3 - s * e * e) / 2
+          vmul.f32        q8, q3, q8          @ q8 = refined 1/sqrt(s)
+
+          vmul.f32        q2, q2, q8          @ q2 = s * (1/sqrt(s)); NOTE(review): s == 0 looks like 0 * inf = NaN -- confirm
+
+          vst1.32         {q2}, [r0]!
+
+        @ calculate sum of square of the components
+
+        vmul.f32          q2, q0, q0
+        vmla.f32          q2, q1, q1
+
+        bgt               .L_mainloop_vec2              @ loop while r2 > 0 (flags from the subs above), i.e. another 4 vectors (8 floats) were loaded
+
+.L_mainloopend_vec2:
+        @ the last iteration for this call
+
+        @ get SQRT of the last chunk (same sequence as in the main loop)
+          vrsqrte.f32     q3, q2
+          vmul.f32        q8, q2, q3
+          vrsqrts.f32     q8, q8, q3
+          vmul.f32        q8, q3, q8
+
+          vmul.f32        q2, q2, q8
+
+          vst1.32         {q2}, [r0]!
+
+.L_check_vec2:
+     @ check if anything left to process at the end of the input array
+        cmp               r3, #0
+        ble               .L_return_vec2
+
+.L_secondloop_vec2:
+     @ process the last few items left in the input array, one vector per pass
+        vld1.f32          d0, [r1]!           @ Fill in d0 = { V.x, V.y };
+
+        subs              r3, r3, #1
+
+        vmul.f32          d0, d0, d0          @  d0= { V.x^2, V.y^2 };
+        vpadd.f32         d0, d0, d0          @  d0= { V.x^2 + V.y^2, V.x^2 + V.y^2 } (pairwise add)
+
+        @ get SQRT of the vector
+        vrsqrte.f32       d2, d0
+        vmul.f32          d1, d0, d2
+        vrsqrts.f32       d1, d1, d2
+        vmul.f32          d1, d2, d1
+
+        vmul.f32          d0, d0, d1
+
+        vst1.32           d0[0], [r0]!
+
+        bgt               .L_secondloop_vec2  @ loop while r3 > 0 (flags from the subs above)
+
+.L_return_vec2:
+     @ return 0
+        mov               r0, #0
+        bx                lr
+
+
+
+
+        .align  2
+        .global len_vec3f_neon
+        .thumb
+        .thumb_func
+len_vec3f_neon:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ arm_result_t len_vec3f(arm_float_t * dst,
+        @                 arm_vec3f_t * src,
+        @                 unsigned int count);
+        @
+        @  r0: *dst & the current dst entry's address
+        @  r1: *src & current src entry's address
+        @  r2: int count & the number of items in the input array that can be
+        @                   processed in chunks of 4 vectors
+        @  r3: the number of items that are left to be processed at the end of
+        @                   the input array
+        @  q8, q9: scratch; q4-q7 (d8-d15) are callee-saved under the AAPCS and are not touched
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        and               r3, r2, #3          @ r3 = count % 4;
+        sub               r2, r2, r3          @ r2 = count - r3; the multiple-of-4 part handled by the main loop
+        cbz               r2, .L_check_vec3   @ fewer than 4 vectors: go straight to the tail loop
+
+
+        @ load values for the first iteration
+          vld3.32         {d0, d2, d4}, [r1]! @ vectors 0-1: x -> d0, y -> d2, z -> d4
+          vld3.32         {d1, d3, d5}, [r1]! @ vectors 2-3, so q0 = x0..x3, q1 = y0..y3, q2 = z0..z3
+          subs            r2, r2, #4
+
+        @ calculate sum of square of the components
+          vmul.f32        q9, q0, q0
+          vmla.f32        q9, q1, q1
+          vmla.f32        q9, q2, q2
+
+          ble             .L_mainloopend_vec3 @ uses the flags of the subs above; that was the only chunk
+
+.L_mainloop_vec3:
+       @ load the next set of values
+        vld3.32           {d0,d2,d4}, [r1]!
+        vld3.32           {d1,d3,d5}, [r1]!
+        subs              r2, r2, #4
+
+        @ sqrt(s) of the previous chunk as s * rsqrt(s): estimate + one Newton-Raphson step
+          vrsqrte.f32     q3, q9              @ q3 = estimate e of 1/sqrt(s)
+          vmul.f32        q8, q9, q3          @ q8 = s * e
+          vrsqrts.f32     q8, q8, q3          @ q8 = (3 - s * e * e) / 2
+          vmul.f32        q8, q3, q8          @ q8 = refined 1/sqrt(s)
+
+          vmul.f32        q9, q9, q8          @ q9 = s * (1/sqrt(s)); NOTE(review): s == 0 looks like 0 * inf = NaN -- confirm
+
+          vst1.32         {q9}, [r0]!
+
+        @ calculate sum of square of the components
+        vmul.f32          q9, q0, q0
+        vmla.f32          q9, q1, q1
+        vmla.f32          q9, q2, q2
+
+        bgt               .L_mainloop_vec3             @ loop while r2 > 0 (flags from the subs above), i.e. another 4 vectors (12 floats) were loaded
+
+.L_mainloopend_vec3:
+        @ the last iteration for this call
+
+        @ get SQRT of the last chunk (same sequence as in the main loop)
+          vrsqrte.f32     q3, q9
+          vmul.f32        q8, q9, q3
+          vrsqrts.f32     q8, q8, q3
+          vmul.f32        q8, q3, q8
+
+          vmul.f32        q9, q9, q8
+
+          vst1.32         {q9}, [r0]!
+
+.L_check_vec3:
+     @ check if anything left to process at the end of the input array
+        cmp               r3, #0
+        ble               .L_return_vec3
+
+.L_secondloop_vec3:
+     @ process the last few items left in the input array, one vector per pass
+        vld3.f32          {d0[0], d2[0], d4[0]}, [r1]!     @ The values are loaded like so:
+                                                           @      q0 = { V.x, -, -, - };
+                                                           @      q1 = { V.y, -, -, - };
+                                                           @      q2 = { V.z, -, -, - };
+        subs              r3, r3, #1
+
+        vmul.f32          q0, q0, q0          @  V.x^2
+        vmla.f32          q0, q1, q1          @  V.x^2 + V.y^2
+        vmla.f32          q0, q2, q2          @  V.x^2 + V.y^2 + V.z^2
+
+        @ get SQRT of the vector (only lane 0 is meaningful and stored)
+        vrsqrte.f32       q2, q0
+        vmul.f32          q1, q0, q2
+        vrsqrts.f32       q1, q1, q2
+        vmul.f32          q1, q2, q1
+
+        vmul.f32          q0, q0, q1
+
+        vst1.32           d0[0], [r0]!
+
+        bgt               .L_secondloop_vec3  @ loop while r3 > 0 (flags from the subs above)
+
+.L_return_vec3:
+     @ return 0
+        mov               r0, #0
+        bx                lr
+
+
+
+
+        .align  2
+        .global len_vec4f_neon
+        .thumb
+        .thumb_func
+len_vec4f_neon:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ arm_result_t len_vec4f(arm_float_t * dst,
+        @                 arm_vec4f_t * src,
+        @                 unsigned int count);
+        @
+        @  r0: *dst & the current dst entry's address
+        @  r1: *src & current src entry's address
+        @  r2: int count & the number of items in the input array that can be
+        @                   processed in chunks of 4 vectors
+        @  r3: the number of items that are left to be processed at the end of
+        @                   the input array
+        @  q8-q10: scratch; q4-q7 (d8-d15) are callee-saved under the AAPCS and are not touched
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        and               r3, r2, #3          @ r3 = count % 4;
+        sub               r2, r2, r3          @ r2 = count - r3; the multiple-of-4 part handled by the main loop
+        cbz               r2, .L_check_vec4   @ fewer than 4 vectors: go straight to the tail loop
+
+
+        @ load values for the first iteration
+          vld4.32         {d0, d2, d4, d6}, [r1]! @ vectors 0-1: x -> d0, y -> d2, z -> d4, w -> d6
+          vld4.32         {d1, d3, d5, d7}, [r1]! @ vectors 2-3, so q0 = x0..x3, q1 = y0..y3, q2 = z0..z3, q3 = w0..w3
+          subs            r2, r2, #4
+
+        @ calculate sum of square of the components
+          vmul.f32        q9, q0, q0
+          vmla.f32        q9, q1, q1
+          vmla.f32        q9, q2, q2
+          vmla.f32        q9, q3, q3
+
+          ble             .L_mainloopend_vec4 @ uses the flags of the subs above; that was the only chunk
+
+.L_mainloop_vec4:
+       @ load the next set of values
+        vld4.32         {d0, d2, d4, d6}, [r1]!
+        vld4.32         {d1, d3, d5, d7}, [r1]!
+        subs              r2, r2, #4
+
+        @ sqrt(s) of the previous chunk as s * rsqrt(s): estimate + one Newton-Raphson step
+          vrsqrte.f32     q10, q9             @ q10 = estimate e of 1/sqrt(s)
+          vmul.f32        q8, q9, q10         @ q8 = s * e
+          vrsqrts.f32     q8, q8, q10         @ q8 = (3 - s * e * e) / 2
+          vmul.f32        q8, q10, q8         @ q8 = refined 1/sqrt(s)
+
+          vmul.f32        q9, q9, q8          @ q9 = s * (1/sqrt(s)); NOTE(review): s == 0 looks like 0 * inf = NaN -- confirm
+
+          vst1.32         {q9}, [r0]!
+
+        @ calculate sum of square of the components
+        vmul.f32        q9, q0, q0
+        vmla.f32        q9, q1, q1
+        vmla.f32        q9, q2, q2
+        vmla.f32        q9, q3, q3
+
+        bgt               .L_mainloop_vec4             @ loop while r2 > 0 (flags from the subs above), i.e. another 4 vectors (16 floats) were loaded
+
+.L_mainloopend_vec4:
+        @ the last iteration for this call
+
+        @ get SQRT of the last chunk (same sequence as in the main loop)
+          vrsqrte.f32     q10, q9
+          vmul.f32        q8, q9, q10
+          vrsqrts.f32     q8, q8, q10
+          vmul.f32        q8, q10, q8
+
+          vmul.f32        q9, q9, q8
+
+          vst1.32         {q9}, [r0]!
+
+.L_check_vec4:
+     @ check if anything left to process at the end of the input array
+        cmp               r3, #0
+        ble               .L_return_vec4
+
+.L_secondloop_vec4:
+     @ process the last few items left in the input array, one vector per pass
+        vld4.f32          {d0[0], d2[0], d4[0], d6[0]}, [r1]!     @ The values are loaded like so:
+                                                                  @      q0 = { V.x, -, -, - };
+                                                                  @      q1 = { V.y, -, -, - };
+                                                                  @      q2 = { V.z, -, -, - }; q3 = { V.w, -, -, - };
+        subs              r3, r3, #1
+
+        vmul.f32          q0, q0, q0          @  V.x^2
+        vmla.f32          q0, q1, q1          @  V.x^2 + V.y^2
+        vmla.f32          q0, q2, q2          @  V.x^2 + V.y^2 + V.z^2
+        vmla.f32          q0, q3, q3          @  V.x^2 + V.y^2 + V.z^2 + V.w^2
+
+        @ get SQRT of the vector (only lane 0 is meaningful and stored)
+        vrsqrte.f32       q2, q0
+        vmul.f32          q1, q0, q2
+        vrsqrts.f32       q1, q1, q2
+        vmul.f32          q1, q2, q1
+
+        vmul.f32          q0, q0, q1
+
+        vst1.32           d0[0], [r0]!
+
+        bgt               .L_secondloop_vec4  @ loop while r3 > 0 (flags from the subs above)
+
+.L_return_vec4:
+     @ return 0
+        mov               r0, #0
+        bx                lr
+
diff --git a/source/NE10_len_test.c b/source/NE10_len_test.c
new file mode 100644
index 0000000..e499969
--- /dev/null
+++ b/source/NE10_len_test.c
@@ -0,0 +1,53 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : source/NE10_len_test.c
+ */
+
+// The following must be defined before including the unit-test header below.
+
+// length of the data arrays (TEST_ARRLEN is not defined in this file; presumably supplied by the build -- confirm)
+#define ARRLEN TEST_ARRLEN
+// number of the operations in this unit (vec2f, vec3f, vec4f)
+#define OP_COUNT 3
+// number of the different implementations of each of the functions (C, ASM, NEON)
+#define IMPL_COUNT 3
+
+
+#include "../headers/unit_test_len_operation_x.h"
+
+void init_ftbl()
+{
+   // Fill the global function table: one group per operation (vec2f, vec3f,
+   //  vec4f), each group holding the C, ASM and NEON variants in that order.
+   ftbl[ 0] = (arm_func_3args_t) len_vec2f_c;
+   ftbl[ 1] = (arm_func_3args_t) len_vec2f_asm;
+   ftbl[ 2] = (arm_func_3args_t) len_vec2f_neon;
+
+   ftbl[ 3] = (arm_func_3args_t) len_vec3f_c;
+   ftbl[ 4] = (arm_func_3args_t) len_vec3f_asm;
+   ftbl[ 5] = (arm_func_3args_t) len_vec3f_neon;
+
+   ftbl[ 6] = (arm_func_3args_t) len_vec4f_c;
+   ftbl[ 7] = (arm_func_3args_t) len_vec4f_asm;
+   ftbl[ 8] = (arm_func_3args_t) len_vec4f_neon;
+}
+
+arm_result_t main( int argc, char **argv )
+{
+   return run_test( argc, argv ); // test driver; not defined in this file, presumably supplied by the unit-test header included above
+}
diff --git a/source/NE10_mla.asm.s b/source/NE10_mla.asm.s
new file mode 100644
index 0000000..0d39389
--- /dev/null
+++ b/source/NE10_mla.asm.s
@@ -0,0 +1,67 @@
+@
+@  Copyright 2011-12 ARM Limited
+@
+@  Licensed under the Apache License, Version 2.0 (the "License");
+@  you may not use this file except in compliance with the License.
+@  You may obtain a copy of the License at
+@
+@      http://www.apache.org/licenses/LICENSE-2.0
+@
+@  Unless required by applicable law or agreed to in writing, software
+@  distributed under the License is distributed on an "AS IS" BASIS,
+@  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@  See the License for the specific language governing permissions and
+@  limitations under the License.
+@
+
+@
+@ NE10 Library : source/NE10_mla.asm.s
+@
+
+        .text
+        .syntax   unified
+
+.include "headers/NE10header.s"
+
+        .balign   4
+        .global   mla_float_asm
+        .thumb
+        .thumb_func
+
+mla_float_asm:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ arm_result_t mla_float(arm_float_t * dst, arm_float_t * acc,
+        @                 arm_float_t * src1, const arm_float_t * src2,
+        @                 unsigned int count)
+        @
+        @  r0: *dst & current dst entry's address (advanced by 4 each iteration)
+        @  r1: *acc & current acc entry's address (advanced by 4 each iteration)
+        @  r2: *src1 & current src1 entry's address (advanced by 4 each iteration)
+        @  r3: *src2 & current src2 entry's address (advanced by 4 each iteration)
+        @  r4: int count (fifth argument, read from the stack)
+        @
+        @  r4: loop counter (counts down to zero)
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        push    {r4}
+        ldr     r4, [r13, #4]             @ r4 = count: the word just above the r4 slot pushed on the line before ( r13 is the stack pointer )
+        cbz     r4, .LoopEndFloat         @ nothing to do when count == 0
+
+.LoopBeginFloat:
+        vldr      s10, [r1]               @ Load s10 = acc[i]
+        vldr      s1, [r2]                @ Load s1 = src1[i]
+        vldr      s2, [r3]                @ Load s2 = src2[i]
+        add       r1, r1, #4              @ move to the next acc entry
+        add       r2, r2, #4              @ move to the next src1 entry
+        add       r3, r3, #4              @ next entry in src2
+        vmla.f32  s10, s1, s2             @ s10 = acc[i] + (src1[i] * src2[i])
+        vstr      s10, [r0]               @ Store the result into dst[i]
+        add       r0, r0, #4              @ next entry in the dst
+        subs      r4, r4, #1              @ one fewer item left to process
+        bne        .LoopBeginFloat        @ loop while r4 is non-zero
+
+.LoopEndFloat:
+        mov     r0, NE10_OK             @ Return NE10_OK
+        pop     {r4}
+        bx      lr
diff --git a/source/NE10_mla.c b/source/NE10_mla.c
new file mode 100644
index 0000000..8842ecd
--- /dev/null
+++ b/source/NE10_mla.c
@@ -0,0 +1,32 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : source/NE10_mla.c
+ */
+
+#include "NE10.h"
+#include "../headers/macros.h"
+
+#include <assert.h>
+
+arm_result_t mla_float_c(arm_float_t * dst, arm_float_t * acc, arm_float_t * src1, arm_float_t * src2, unsigned int count)
+{
+  NE10_X_OPERATION_FLOAT_C /* itr is not declared here; presumably the macro (macros.h) loops itr over count -- confirm */
+  (
+    dst[ itr ] = acc[ itr ] + (src1[ itr ] * src2[ itr ]); /* element-wise multiply-accumulate */
+  );
+}
diff --git a/source/NE10_mla.neon.c b/source/NE10_mla.neon.c
new file mode 100644
index 0000000..b975623
--- /dev/null
+++ b/source/NE10_mla.neon.c
@@ -0,0 +1,35 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : source/NE10_mla.neon.c
+ */
+
+#include "NE10.h"
+#include "../headers/macros.h"
+
+#include <assert.h>
+#include <arm_neon.h>
+
+arm_result_t mla_float_neon(arm_float_t * dst, arm_float_t * acc, arm_float_t * src1, arm_float_t * src2, unsigned int count)
+{
+    NE10_MLA_OPERATION_FLOAT_NEON /* takes two code snippets; the n_* names are not declared here, presumably by the macro (macros.h) -- confirm */
+    (
+        n_dst = vmlaq_f32( n_acc, n_src, n_src2 ); /* 4-lane form: n_acc + n_src * n_src2 */
+        ,
+        n_tmp_src = vmla_f32( n_tmp_acc, n_tmp_src, n_tmp_src2 ); /* 2-lane form; NOTE(review): the result is assigned to n_tmp_src, not a dst variable -- confirm the macro stores n_tmp_src */
+    );
+}
diff --git a/source/NE10_mla_test.c b/source/NE10_mla_test.c
new file mode 100644
index 0000000..9b1ac59
--- /dev/null
+++ b/source/NE10_mla_test.c
@@ -0,0 +1,45 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : source/NE10_mla_test.c
+ */
+
+// The following must be defined before including the unit-test header below.
+
+// length of the data arrays (TEST_ARRLEN is not defined in this file; presumably supplied by the build -- confirm)
+#define ARRLEN TEST_ARRLEN
+// number of the operations in this unit (float only)
+#define OP_COUNT 1
+// number of the different implementations of each of the functions (C, ASM, NEON)
+#define IMPL_COUNT 3
+
+
+#include "../headers/unit_test_mla_operation_x.h"
+
+void init_ftbl()
+{
+   // Fill the global function table with the single mla operation (float),
+   //  in the order C, ASM, NEON.
+   ftbl[ 0] = (arm_func_5args_t) mla_float_c;
+   ftbl[ 1] = (arm_func_5args_t) mla_float_asm;
+   ftbl[ 2] = (arm_func_5args_t) mla_float_neon;
+}
+
+arm_result_t main( int argc, char **argv )
+{
+   return run_test( argc, argv ); // test driver; not defined in this file, presumably supplied by the unit-test header included above
+}
diff --git a/source/NE10_mlac.asm.s b/source/NE10_mlac.asm.s
new file mode 100644
index 0000000..79d0104
--- /dev/null
+++ b/source/NE10_mlac.asm.s
@@ -0,0 +1,259 @@
+@
+@  Copyright 2011-12 ARM Limited
+@
+@  Licensed under the Apache License, Version 2.0 (the "License");
+@  you may not use this file except in compliance with the License.
+@  You may obtain a copy of the License at
+@
+@      http://www.apache.org/licenses/LICENSE-2.0
+@
+@  Unless required by applicable law or agreed to in writing, software
+@  distributed under the License is distributed on an "AS IS" BASIS,
+@  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@  See the License for the specific language governing permissions and
+@  limitations under the License.
+@
+
+@
+@ NE10 Library : source/NE10_mlac.asm.s
+@
+
+        .text
+        .syntax   unified
+
+.include "headers/NE10header.s"
+
+        .balign   4
+        .global   mlac_float_asm
+        .thumb
+        .thumb_func
+
+mlac_float_asm:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ arm_result_t mlac_float(arm_float_t * dst, arm_float_t * acc,
+        @                 arm_float_t * src, const arm_float_t cst,
+        @                 unsigned int count)
+        @
+        @  r0: *dst
+        @  r1: *acc
+        @  r2: *src
+        @  r3: cst (the float value itself, moved into s3 below)
+        @  [sp]: count (5th argument, read from the stack into r4)
+        @
+        @  r4: loop counter
+        @  r5: current item's offset in acc[], src[], and dst[]
+        @  r6: current accumulator item's address made of base(r1)+offset(r5)
+        @  r7: current source item's address made of base(r2)+offset(r5)
+        @  r8: current destination item's address made of base(r0)+offset(r5)
+        @
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        push    {r4, r5, r6, r7, r8}
+        ldr     r4, [r13, #20]             @ r4 = count ( 5th argument: it sits just above the 5 words / 20 bytes pushed above; r13 is sp )
+        cbz     r4, .LoopEndFloat          @ nothing to do when count == 0
+        mov     r5, #0
+        vmov      s3, r3                  @ Get cst into register s3 once: it does not change inside the loop
+
+.LoopBeginFloat:
+        add       r6, r1, r5              @ Get current accumulator item's address in memory
+        vldr      s10, [r6, #0]           @ Load acc[i]
+        add       r7, r2, r5              @ Get current source item's address in memory
+        vldr      s2, [r7, #0]            @ Load src[i]
+        vmla.f32  s10, s2, s3             @ s10 = acc[i] + ( src[i] * cst )
+        add       r8, r0, r5              @ Get current destination item's address in memory
+        vstr      s10, [r8, #0]           @ Store the result back into the main memory
+        add       r5, r5, #4              @ increase the offset by 1*sizeof(float)
+        subs      r4, r4, #1              @ count down the remaining items
+        bne        .LoopBeginFloat        @ Continue while the down-counter in r4 is non-zero
+
+.LoopEndFloat:
+        mov     r0, NE10_OK             @ Return NE10_OK
+        pop     {r4, r5, r6, r7, r8}
+        bx      lr
+
+
+
+
+        .balign   4
+        .global   mlac_vec2f_asm
+        .thumb
+        .thumb_func
+
+mlac_vec2f_asm:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ arm_result_t mlac_vec2f(arm_vec2f_t * dst, arm_vec2f_t * acc,
+        @                 arm_vec2f_t * src, const arm_vec2f_t * cst,
+        @                 unsigned int count)
+        @
+        @  r0: *dst
+        @  r1: *acc
+        @  r2: *src
+        @  r3: *cst
+        @  [sp]: count (5th argument, read from the stack into r4)
+        @
+        @  r4: loop counter
+        @  r5: current item's offset in acc[], src[], and dst[]
+        @  r6: current accumulator item's address made of base(r1)+offset(r5)
+        @  r7: current source item's address made of base(r2)+offset(r5)
+        @  r8: current destination item's address made of base(r0)+offset(r5)
+        @
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        push    {r4, r5, r6, r7, r8}
+        ldr     r4, [r13, #20]             @ r4 = count ( 5th argument: it sits just above the 5 words / 20 bytes pushed above; r13 is sp )
+        cbz     r4, .LoopEndVec2F
+        mov     r5, #0
+
+.LoopBeginVec2F:
+        add       r6, r1, r5              @ Get current accumulator item's address in memory
+        vldr      s10, [r6, #0]           @ Load acc[i].x and acc[i].y
+        vldr      s11, [r6, #4]
+        add       r7, r2, r5              @ Get current source item's address in memory
+        vldr      s1, [r7, #0]            @ Load src[i].x and src[i].y
+        vldr      s2, [r7, #4]
+        vldr      s3, [r3, #0]            @ Load cst->x and cst->y (re-read from memory on every iteration)
+        vldr      s4, [r3, #4]
+        vmla.f32  s10, s1, s3             @ s10 = acc[i].x + ( src[i].x * cst->x )
+        vmla.f32  s11, s2, s4             @ s11 = acc[i].y + ( src[i].y * cst->y )
+        add       r8, r0, r5              @ Get current destination item's address in memory
+        vstr      s10, [r8, #0]           @ Store the results back into the main memory
+        vstr      s11, [r8, #4]
+        add       r5, r5, #8              @ increase the offset by 2*sizeof(float) @@ (for x and y)
+        subs      r4, r4, #1              @ count down the remaining items
+        bne        .LoopBeginVec2F        @ Continue while the down-counter in r4 is non-zero
+
+.LoopEndVec2F:
+        mov     r0, NE10_OK             @ Return NE10_OK
+        pop     {r4, r5, r6, r7, r8}
+        bx      lr
+
+
+
+
+        .balign   4
+        .global   mlac_vec3f_asm
+        .thumb
+        .thumb_func
+
+mlac_vec3f_asm:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ arm_result_t mlac_vec3f(arm_vec3f_t * dst, arm_vec3f_t * acc,
+        @                 arm_vec3f_t * src, const arm_vec3f_t * cst,
+        @                 unsigned int count)
+        @
+        @  r0: *dst
+        @  r1: *acc
+        @  r2: *src
+        @  r3: *cst
+        @  [sp]: count (5th argument, read from the stack into r4)
+        @
+        @  r4: loop counter
+        @  r5: current item's offset in acc[], src[], and dst[]
+        @  r6: current accumulator item's address made of base(r1)+offset(r5)
+        @  r7: current source item's address made of base(r2)+offset(r5)
+        @  r8: current destination item's address made of base(r0)+offset(r5)
+        @
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        push    {r4, r5, r6, r7, r8}
+        ldr     r4, [r13, #20]             @ r4 = count ( 5th argument: it sits just above the 5 words / 20 bytes pushed above; r13 is sp )
+        cbz     r4, .LoopEndVec3F
+        mov     r5, #0
+
+.LoopBeginVec3F:
+        add       r6, r1, r5              @ Get current accumulator item's address in memory
+        vldr      s10, [r6, #0]           @ Load acc[i].x, acc[i].y , and acc[i].z
+        vldr      s11, [r6, #4]
+        vldr      s12, [r6, #8]
+        add       r7, r2, r5              @ Get current source item's address in memory
+        vldr      s1, [r7, #0]            @ Load src[i].x, src[i].y , and src[i].z
+        vldr      s2, [r7, #4]
+        vldr      s3, [r7, #8]
+        vldr      s4, [r3, #0]            @ Load cst->x, cst->y, and cst->z (re-read from memory on every iteration)
+        vldr      s5, [r3, #4]
+        vldr      s6, [r3, #8]
+        vmla.f32  s10, s1, s4             @ s10 = acc[i].x + ( src[i].x * cst->x )
+        vmla.f32  s11, s2, s5             @  same for 'y'
+        vmla.f32  s12, s3, s6             @  same for 'z'
+        add       r8, r0, r5              @ Get current destination item's address in memory
+        vstr      s10, [r8, #0]           @ Store the results back into the main memory
+        vstr      s11, [r8, #4]
+        vstr      s12, [r8, #8]
+        add       r5, r5, #12             @ increase the offset by 3*sizeof(float) @@ (for x, y, and z)
+        subs      r4, r4, #1              @ count down the remaining items
+        bne        .LoopBeginVec3F        @ Continue while the down-counter in r4 is non-zero
+
+.LoopEndVec3F:
+        mov     r0, NE10_OK             @ Return NE10_OK
+        pop     {r4, r5, r6, r7, r8}
+        bx      lr
+
+
+
+
+        .balign   4
+        .global   mlac_vec4f_asm
+        .thumb
+        .thumb_func
+
+mlac_vec4f_asm:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ arm_result_t mlac_vec4f(arm_vec4f_t * dst, arm_vec4f_t * acc,
+        @                 arm_vec4f_t * src, const arm_vec4f_t * cst,
+        @                 unsigned int count)
+        @
+        @  r0: *dst
+        @  r1: *acc
+        @  r2: *src
+        @  r3: *cst
+        @  [sp]: count (5th argument, read from the stack into r4)
+        @
+        @  r4: loop counter
+        @  r5: current item's offset in acc[], src[], and dst[]
+        @  r6: current accumulator item's address made of base(r1)+offset(r5)
+        @  r7: current source item's address made of base(r2)+offset(r5)
+        @  r8: current destination item's address made of base(r0)+offset(r5)
+        @
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        push    {r4, r5, r6, r7, r8}
+        ldr     r4, [r13, #20]             @ r4 = count ( 5th argument: it sits just above the 5 words / 20 bytes pushed above; r13 is sp )
+        cbz     r4, .LoopEndVec4F
+        mov     r5, #0
+
+.LoopBeginVec4F:
+        add       r6, r1, r5              @ Get current accumulator item's address in memory
+        vldr      s10, [r6, #0]           @ Load acc[i].x, acc[i].y , acc[i].z, and w
+        vldr      s11, [r6, #4]
+        vldr      s12, [r6, #8]
+        vldr      s13, [r6, #12]
+        add       r7, r2, r5              @ Get current source item's address in memory
+        vldr      s1, [r7, #0]            @ Load src[i].x, src[i].y , src[i].z, and w
+        vldr      s2, [r7, #4]
+        vldr      s3, [r7, #8]
+        vldr      s4, [r7, #12]
+        vldr      s5, [r3, #0]            @ Load cst->x, cst->y, cst->z, and w (re-read from memory on every iteration)
+        vldr      s6, [r3, #4]
+        vldr      s7, [r3, #8]
+        vldr      s8, [r3, #12]
+        vmla.f32  s10, s1, s5             @ s10 = acc[i].x + ( src[i].x * cst->x )
+        vmla.f32  s11, s2, s6             @  same for 'y'
+        vmla.f32  s12, s3, s7             @  same for 'z'
+        vmla.f32  s13, s4, s8             @  same for 'w'
+        add       r8, r0, r5              @ Get current destination item's address in memory
+        vstr      s10, [r8, #0]           @ Store the results back into the main memory
+        vstr      s11, [r8, #4]
+        vstr      s12, [r8, #8]
+        vstr      s13, [r8, #12]
+        add       r5, r5, #16             @ increase the offset by 4*sizeof(float) @@ (for x, y, z, and w)
+        subs      r4, r4, #1              @ count down the remaining items
+        bne        .LoopBeginVec4F        @ Continue while the down-counter in r4 is non-zero
+
+.LoopEndVec4F:
+        mov     r0, NE10_OK             @ Return NE10_OK
+        pop     {r4, r5, r6, r7, r8}
+        bx      lr
diff --git a/source/NE10_mlac.c b/source/NE10_mlac.c
new file mode 100644
index 0000000..f719b05
--- /dev/null
+++ b/source/NE10_mlac.c
@@ -0,0 +1,62 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : source/NE10_mlac.c
+ */
+
+#include "NE10.h"
+#include "../headers/macros.h"
+
+#include <assert.h>
+
+arm_result_t mlac_float_c(arm_float_t * dst, arm_float_t * acc, arm_float_t * src, const arm_float_t cst, unsigned int count)
+{ /* plain C implementation: multiply-accumulate of a float array with a scalar constant */
+  NE10_MLAC_OPERATION_X_C /* NOTE(review): defined outside this file (presumably headers/macros.h); it has to supply `itr` and the return value */
+  (
+    dst[ itr ] = acc[ itr ] + (src[ itr ] * cst); /* element `itr`: dst = acc + src * cst */
+  );
+}
+
+arm_result_t mlac_vec2f_c(arm_vec2f_t * dst, arm_vec2f_t * acc, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count)
+{ /* plain C implementation: component-wise multiply-accumulate of 2D vectors with one constant vector */
+  NE10_MLAC_OPERATION_X_C /* NOTE(review): defined outside this file (presumably headers/macros.h); it has to supply `itr` and the return value */
+  (
+    dst[ itr ].x = acc[ itr ].x + (src[ itr ].x * cst->x); /* each component is scaled by the matching component of *cst */
+    dst[ itr ].y = acc[ itr ].y + (src[ itr ].y * cst->y);
+  );
+}
+
+arm_result_t mlac_vec3f_c(arm_vec3f_t * dst, arm_vec3f_t * acc, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count)
+{ /* plain C implementation: component-wise multiply-accumulate of 3D vectors with one constant vector */
+  NE10_MLAC_OPERATION_X_C /* NOTE(review): defined outside this file (presumably headers/macros.h); it has to supply `itr` and the return value */
+  (
+    dst[ itr ].x = acc[ itr ].x + (src[ itr ].x * cst->x); /* each component is scaled by the matching component of *cst */
+    dst[ itr ].y = acc[ itr ].y + (src[ itr ].y * cst->y);
+    dst[ itr ].z = acc[ itr ].z + (src[ itr ].z * cst->z);
+  );
+}
+
+arm_result_t mlac_vec4f_c(arm_vec4f_t * dst, arm_vec4f_t * acc, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count)
+{ /* plain C implementation: component-wise multiply-accumulate of 4D vectors with one constant vector */
+  NE10_MLAC_OPERATION_X_C /* NOTE(review): defined outside this file (presumably headers/macros.h); it has to supply `itr` and the return value */
+  (
+    dst[ itr ].x = acc[ itr ].x + (src[ itr ].x * cst->x); /* each component is scaled by the matching component of *cst */
+    dst[ itr ].y = acc[ itr ].y + (src[ itr ].y * cst->y);
+    dst[ itr ].z = acc[ itr ].z + (src[ itr ].z * cst->z);
+    dst[ itr ].w = acc[ itr ].w + (src[ itr ].w * cst->w);
+  );
+}
diff --git a/source/NE10_mlac.neon.c b/source/NE10_mlac.neon.c
new file mode 100644
index 0000000..800857e
--- /dev/null
+++ b/source/NE10_mlac.neon.c
@@ -0,0 +1,68 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : source/NE10_mlac.neon.c
+ */
+
+#include "NE10.h"
+#include "../headers/macros.h"
+
+#include <assert.h>
+#include <arm_neon.h>
+
+
+arm_result_t mlac_float_neon(arm_float_t * dst, arm_float_t * acc, arm_float_t * src, const arm_float_t cst, unsigned int count)
+{   /* NEON-intrinsics implementation of the float multiply-accumulate-with-constant operation */
+    NE10_MLAC_OPERATION_FLOAT_NEON   /* NOTE(review): macro is not defined in this file (presumably headers/macros.h); it has to declare the n_* names used below */
+    (
+        n_dst = vmlaq_f32( n_acc, n_src, n_cst );   /* 1st macro argument: 128-bit form, n_acc + n_src * n_cst on four float lanes */
+        ,
+        n_tmp_src = vmla_f32( n_tmp_acc, n_tmp_src, n_tmp_cst );   /* 2nd macro argument: 64-bit form of the same operation; the result overwrites n_tmp_src */
+    );
+}
+
+arm_result_t mlac_vec2f_neon(arm_vec2f_t * dst, arm_vec2f_t * acc, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count)
+{   /* NEON-intrinsics implementation of the 2D-vector multiply-accumulate-with-constant operation */
+    NE10_MLAC_OPERATION_VEC2F_NEON   /* NOTE(review): macro is not defined in this file (presumably headers/macros.h); it has to declare the n_* names used below */
+    (
+       n_dst = vmlaq_f32( n_acc, n_src , n_cst );   /* 1st macro argument: 128-bit form, n_acc + n_src * n_cst on four float lanes */
+       ,
+       n_tmp_src = vmla_f32( n_tmp_acc, n_tmp_src, n_tmp_cst );   /* 2nd macro argument: 64-bit form of the same operation; the result overwrites n_tmp_src */
+    );
+}
+
+arm_result_t mlac_vec3f_neon(arm_vec3f_t * dst, arm_vec3f_t * acc, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count)
+{   /* NEON-intrinsics implementation of the 3D-vector multiply-accumulate-with-constant operation */
+    NE10_MLAC_OPERATION_VEC3F_NEON   /* NOTE(review): macro is not defined in this file (presumably headers/macros.h); it has to declare the n_* names used below */
+    (
+        n_dst1 = vmlaq_f32( n_acc1, n_src1 , n_cst1 );   /* 1st macro argument: three independent 128-bit multiply-accumulates */
+        n_dst2 = vmlaq_f32( n_acc2, n_src2 , n_cst2 );
+        n_dst3 = vmlaq_f32( n_acc3, n_src3 , n_cst3 );
+        ,
+        n_tmp_src.val[0] = vmla_f32( n_tmp_acc.val[0], n_tmp_src.val[0], n_tmp_cst.val[0] );  /* the X lane (2nd macro argument: 64-bit form, results overwrite n_tmp_src) */
+        n_tmp_src.val[1] = vmla_f32( n_tmp_acc.val[1], n_tmp_src.val[1], n_tmp_cst.val[1] );  /* the Y lane */
+        n_tmp_src.val[2] = vmla_f32( n_tmp_acc.val[2], n_tmp_src.val[2], n_tmp_cst.val[2] );  /* the Z lane */
+     );
+}
+
+arm_result_t mlac_vec4f_neon(arm_vec4f_t * dst, arm_vec4f_t * acc, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count)
+{   /* NEON-intrinsics implementation of the 4D-vector multiply-accumulate-with-constant operation */
+    NE10_MLAC_OPERATION_VEC4F_NEON   /* NOTE(review): macro is not defined in this file (presumably headers/macros.h); unlike the other variants here it is given a single argument */
+    (
+        n_dst = vmlaq_f32( n_acc, n_src , n_cst );   /* n_acc + n_src * n_cst on four float lanes */
+    );
+}
diff --git a/source/NE10_mlac_test.c b/source/NE10_mlac_test.c
new file mode 100644
index 0000000..7c93700
--- /dev/null
+++ b/source/NE10_mlac_test.c
@@ -0,0 +1,57 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : source/NE10_mlac_test.c
+ */
+
+// The three macros below are defined before "unit_test_mlac_operation_x.h" is included further down
+
+// length of the data arrays (TEST_ARRLEN itself is not defined in this file)
+#define ARRLEN TEST_ARRLEN
+// number of the operations in this unit (init_ftbl() below registers four: float, vec2f, vec3f and vec4f)
+#define OP_COUNT 4
+// number of the different implementations of each operation (init_ftbl() below registers C, ASM and NEON)
+#define IMPL_COUNT 3
+
+
+#include "../headers/unit_test_mlac_operation_x.h"
+
+void init_ftbl()
+{
+   // Fill the global function table `ftbl` by hand. Each operation owns three
+   //  consecutive slots, in the order C, ASM, NEON.
+   ftbl[ 0] = (arm_func_5args_t) mlac_float_c;      // slots 0..2: float
+   ftbl[ 1] = (arm_func_5args_t) mlac_float_asm;
+   ftbl[ 2] = (arm_func_5args_t) mlac_float_neon;
+
+   ftbl[ 3] = (arm_func_5args_t) mlac_vec2f_c;      // slots 3..5: vec2f
+   ftbl[ 4] = (arm_func_5args_t) mlac_vec2f_asm;
+   ftbl[ 5] = (arm_func_5args_t) mlac_vec2f_neon;
+
+   ftbl[ 6] = (arm_func_5args_t) mlac_vec3f_c;      // slots 6..8: vec3f
+   ftbl[ 7] = (arm_func_5args_t) mlac_vec3f_asm;
+   ftbl[ 8] = (arm_func_5args_t) mlac_vec3f_neon;
+
+   ftbl[ 9] = (arm_func_5args_t) mlac_vec4f_c;      // slots 9..11: vec4f
+   ftbl[10] = (arm_func_5args_t) mlac_vec4f_asm;
+   ftbl[11] = (arm_func_5args_t) mlac_vec4f_neon;
+}
+
+arm_result_t main( int argc, char **argv )
+{
+   return run_test( argc, argv ); // hand the command line to the test driver; run_test is not defined in this file (presumably it comes from the unit-test header included above)
+}
diff --git a/source/NE10_mul.asm.s b/source/NE10_mul.asm.s
new file mode 100644
index 0000000..47070b8
--- /dev/null
+++ b/source/NE10_mul.asm.s
@@ -0,0 +1,61 @@
+@
+@  Copyright 2011-12 ARM Limited
+@
+@  Licensed under the Apache License, Version 2.0 (the "License");
+@  you may not use this file except in compliance with the License.
+@  You may obtain a copy of the License at
+@
+@      http://www.apache.org/licenses/LICENSE-2.0
+@
+@  Unless required by applicable law or agreed to in writing, software
+@  distributed under the License is distributed on an "AS IS" BASIS,
+@  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@  See the License for the specific language governing permissions and
+@  limitations under the License.
+@
+
+@
+@ NE10 Library : source/NE10_mul.asm.s
+@
+
+        .text
+        .syntax   unified
+
+.include "headers/NE10header.s"
+
+        .balign   4
+        .global   mul_float_asm
+        .thumb
+        .thumb_func
+
+mul_float_asm:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ arm_result_t mul_float(arm_float_t * dst,
+        @                 arm_float_t * src1, arm_float_t * src2,
+        @                 unsigned int count)
+        @
+        @  r0: *dst & current dst entry's address - advanced by 4 bytes per item
+        @  r1: *src1 & current src1 entry's address - advanced by 4 bytes per item
+        @  r2: *src2 & current src2 entry's address - advanced by 4 bytes per item
+        @  r3: int count
+        @
+        @  r3: loop counter
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        cbz     r3, .LoopEndFloat
+
+.LoopBeginFloat:
+        vldr      s1, [r1]                @ Load s1 = src1[i]
+        add       r1, r1, #4              @ move to the next entry
+        vldr      s2, [r2]                @ Load s2 = src2[i]
+        add       r2, r2, #4              @ next entry
+        vmul.f32  s10, s1, s2             @ s10 = src1[i] * src2[i]
+        vstr      s10, [r0]               @ Store the result back into the main memory
+        add       r0, r0, #4              @ next entry in the dst
+        subs      r3, r3, #1              @ count down the remaining items
+        bne        .LoopBeginFloat        @ Continue while the down-counter in r3 is non-zero
+
+.LoopEndFloat:
+        mov     r0, NE10_OK             @ Return NE10_OK
+        bx      lr
diff --git a/source/NE10_mul.c b/source/NE10_mul.c
new file mode 100644
index 0000000..cc0fe58
--- /dev/null
+++ b/source/NE10_mul.c
@@ -0,0 +1,62 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : source/NE10_mul.c
+ */
+
+#include "NE10.h"
+#include "../headers/macros.h"
+
+#include <assert.h>
+
+arm_result_t mul_float_c(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count)
+{ /* plain C implementation: element-wise product of two float arrays */
+  NE10_X_OPERATION_FLOAT_C /* NOTE(review): defined outside this file (presumably headers/macros.h); it has to supply `itr` and the return value */
+  (
+    dst[ itr ] = src1[ itr ] * src2[ itr ]; /* element `itr`: dst = src1 * src2 */
+  );
+}
+
+arm_result_t vmul_vec2f_c(arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count)
+{ /* plain C implementation: component-wise product of two arrays of 2D vectors */
+  NE10_X_OPERATION_FLOAT_C /* NOTE(review): defined outside this file (presumably headers/macros.h); it has to supply `itr` and the return value */
+  (
+    dst[ itr ].x = src1[ itr ].x * src2[ itr ].x; /* x with x, y with y: not a dot or cross product */
+    dst[ itr ].y = src1[ itr ].y * src2[ itr ].y;
+  );
+}
+
+arm_result_t vmul_vec3f_c(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count)
+{ /* plain C implementation: component-wise product of two arrays of 3D vectors */
+  NE10_X_OPERATION_FLOAT_C /* NOTE(review): defined outside this file (presumably headers/macros.h); it has to supply `itr` and the return value */
+  (
+    dst[ itr ].x = src1[ itr ].x * src2[ itr ].x; /* x with x, y with y, z with z: not a dot or cross product */
+    dst[ itr ].y = src1[ itr ].y * src2[ itr ].y;
+    dst[ itr ].z = src1[ itr ].z * src2[ itr ].z;
+  );
+}
+
+arm_result_t vmul_vec4f_c(arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count)
+{ /* plain C implementation: component-wise product of two arrays of 4D vectors */
+  NE10_X_OPERATION_FLOAT_C /* NOTE(review): defined outside this file (presumably headers/macros.h); it has to supply `itr` and the return value */
+  (
+    dst[ itr ].x = src1[ itr ].x * src2[ itr ].x; /* each component is multiplied by the matching component of src2 */
+    dst[ itr ].y = src1[ itr ].y * src2[ itr ].y;
+    dst[ itr ].z = src1[ itr ].z * src2[ itr ].z;
+    dst[ itr ].w = src1[ itr ].w * src2[ itr ].w;
+  );
+}
diff --git a/source/NE10_mul.neon.s b/source/NE10_mul.neon.s
new file mode 100644
index 0000000..3d5080d
--- /dev/null
+++ b/source/NE10_mul.neon.s
@@ -0,0 +1,470 @@
+@
+@  Copyright 2011-12 ARM Limited
+@
+@  Licensed under the Apache License, Version 2.0 (the "License");
+@  you may not use this file except in compliance with the License.
+@  You may obtain a copy of the License at
+@
+@      http://www.apache.org/licenses/LICENSE-2.0
+@
+@  Unless required by applicable law or agreed to in writing, software
+@  distributed under the License is distributed on an "AS IS" BASIS,
+@  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@  See the License for the specific language governing permissions and
+@  limitations under the License.
+@
+
+@
+@ NE10 Library : source/NE10_mul.neon.s
+@
+
+        .text
+        .syntax   unified
+
+.include "headers/NE10header.s"
+
+
+
+
+        .balign   4
+        .global   mul_float_neon
+        .thumb
+        .thumb_func
+
+mul_float_neon:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ arm_result_t mul_float(arm_float_t * dst,
+        @                 arm_float_t * src1,
+        @                 arm_float_t * src2,
+        @                 unsigned int count)
+        @
+        @  r0: *dst & current dst entry's address
+        @  r1: *src1 & current src1 entry's address
+        @  r2: *src2 & current src2 entry's address
+        @  r3: int count & the number of items in the input array that can be
+        @                   processed in chunks of 4 floats
+        @
+        @  r4:  the number of items that are left to be processed at the end of
+        @                   the input array
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        push              {r4}
+        and               r4, r3, #3          @ r4 = count % 4; these items are handled one by one at the end
+        sub               r3, r3, r4          @ r3 = count - r4; these items are handled in chunks of 4
+
+        cbz               r3, .L_check_float
+
+        @ load the 1st set of values
+          vld1.32         {q0}, [r1]!
+          vld1.32         {q1}, [r2]!
+          subs            r3, r3, #4
+
+        @ calculate values for the 1st set
+          vmul.f32        q3, q0, q1         @ q3 = q0 * q1
+
+        @ The loads run one set ahead of the stores (software pipelining), so a
+        @ further set may only be fetched while whole chunks remain (r3 > 0).
+        @ Fetching a 2nd set unconditionally would, for count = 4..7, read 16
+        @ bytes past both inputs and store 16 bytes past the end of dst.
+        @ NEON vmul/vld1/vst1 do not touch the APSR flags, so ble/bgt below
+        @ always test the result of the most recent subs.
+          ble             .L_mainloopend_float
+
+.L_mainloop_float:
+        @ load the next (e.g. 2nd) set of values
+          vld1.32         {q0}, [r1]!
+          vld1.32         {q1}, [r2]!
+          subs            r3, r3, #4
+
+        @ store the result for the previous (e.g. 1st) set
+          vst1.32         {d6,d7}, [r0]!
+
+        @ calculate values for the set just loaded (e.g. 2nd)
+          vmul.f32        q3, q0, q1         @ q3 = q0 * q1
+
+        bgt             .L_mainloop_float             @ loop while r3 > 0, i.e. while at least another 4 floats remain
+
+.L_mainloopend_float:
+        @ the last iteration for this call
+        @ every loaded set has been multiplied by now; only the product of the
+        @ last set is still held in q3
+
+        @ store the result for the last set
+          vst1.32         {d6,d7}, [r0]!
+
+        @ r0, r1 and r2 now point just past the items processed in chunks of 4
+
+
+.L_check_float:
+     @ check if anything left to process at the end of the input array
+        cmp               r4, #0
+        ble               .L_return_float
+
+.L_secondloop_float:
+     @ process the last few items left in the input array
+        vld1.f32          d0[0], [r1]!           @ Fill in d0[0]
+        vld1.f32          d1[0], [r2]!           @ Fill in d1[0]
+
+
+        subs              r4, r4, #1
+
+        @ values (only lane 0 of the product is stored below)
+        vmul.f32          d0, d0, d1
+
+        vst1.32           {d0[0]}, [r0]!
+
+        bgt               .L_secondloop_float
+
+.L_return_float:
+     @ return
+        pop               {r4}
+        mov               r0, #0
+        bx                lr
+
+
+
+
+        .balign   4
+        .global   vmul_vec2f_neon
+        .thumb
+        .thumb_func
+
+vmul_vec2f_neon:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ arm_result_t vmul_vec2f(arm_vec2f_t * dst,
+        @                 arm_vec2f_t * src1,
+        @                 arm_vec2f_t * src2,
+        @                 unsigned int count)
+        @
+        @  r0: *dst & current dst entry's address
+        @  r1: *src1 & current src1 entry's address
+        @  r2: *src2 & current src2 entry's address
+        @  r3: int count & the number of items in the input array that can be
+        @                   processed in chunks of 4 vectors
+        @
+        @  r4:  the number of items that are left to be processed at the end of
+        @                   the input array
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        push              {r4}
+        and               r4, r3, #3          @ r4 = count % 4; these vectors are handled one by one at the end
+        sub               r3, r3, r4          @ r3 = count - r4; these vectors are handled in chunks of 4
+
+        cbz               r3, .L_check_vec2
+
+        @ load the 1st set of values
+          vld2.32         {q0-q1}, [r1]!
+          vld2.32         {q2-q3}, [r2]!
+          subs            r3, r3, #4
+
+        @ calculate values for the 1st set
+          vmul.f32        q8, q0, q2         @ products go to q8/q9: q8-q15 are caller-saved under the AAPCS,
+          vmul.f32        q9, q1, q3         @ whereas q4-q7 (d8-d15) would have to be preserved for the caller
+
+        @ The loads run one set ahead of the stores (software pipelining), so a
+        @ further set may only be fetched while whole chunks remain (r3 > 0).
+        @ Fetching a 2nd set unconditionally would, for count = 4..7, read 32
+        @ bytes past both inputs and store 32 bytes past the end of dst.
+        @ NEON vmul/vld2/vst2 do not touch the APSR flags, so ble/bgt below
+        @ always test the result of the most recent subs.
+          ble             .L_mainloopend_vec2
+
+.L_mainloop_vec2:
+        @ load the next (e.g. 2nd) set of values
+          vld2.32         {q0-q1}, [r1]!
+          vld2.32         {q2-q3}, [r2]!
+          subs            r3, r3, #4
+
+        @ store the result for the previous (e.g. 1st) set
+          vst2.32         {d16,d17,d18,d19}, [r0]!
+
+        @ calculate values for the set just loaded (e.g. 2nd)
+          vmul.f32        q8, q0, q2
+          vmul.f32        q9, q1, q3
+
+        bgt             .L_mainloop_vec2             @ loop while r3 > 0, i.e. while at least another 4 vectors (8 floats) remain
+
+.L_mainloopend_vec2:
+        @ the last iteration for this call
+        @ every loaded set has been multiplied by now; only the products of
+        @ the last set are still held in q8/q9
+
+        @ store the result for the last set
+          vst2.32         {d16,d17,d18,d19}, [r0]!
+
+        @ r0, r1 and r2 now point just past the vectors processed in chunks
+        @ of 4, which is where the leftover loop below has to start
+
+.L_check_vec2:
+     @ check if anything left to process at the end of the input array
+        cmp               r4, #0
+        ble               .L_return_vec2
+
+.L_secondloop_vec2:
+     @ process the last few items left in the input array
+        vld1.f32          d0, [r1]!
+        vld1.f32          d1, [r2]!
+
+        subs              r4, r4, #1
+
+        @ calculate values
+        vmul.f32          d0, d0, d1
+
+        vst1.32           {d0}, [r0]!
+
+        bgt               .L_secondloop_vec2
+
+.L_return_vec2:
+     @ return
+        pop               {r4}
+        mov               r0, #0
+        bx                lr
+
+
+
+
+        .align  2
+        .global vmul_vec3f_neon
+        .thumb
+        .thumb_func
+vmul_vec3f_neon:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ arm_result_t mul_float(arm_vec3f_t * dst,
+        @                 arm_vec3f_t * src1,
+        @                 arm_vec3f_t * src2,
+        @                 unsigned int count)
+        @
+        @  r0: *dst & current dst entry's address
+        @  r1: *src1 & current src1 entry's address
+        @  r2: *src2 & current src2 entry's address
+        @  r3: int count & the number of items in the input array that can be
+        @                   processed in chunks of 4 vectors
+        @
+        @  r4:  the number of items that are left to be processed at the end of
+        @                   the input array
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        push              {r4}
+        and               r4, r3, #3          @ r3 = count % 4;
+        sub               r3, r3, r4          @ count = count - r3; This is what's left to be processed after this loop
+
+        cmp               r3, #0
+        beq               .L_check_vec3
+
+        @ load the 1st set of values
+          vld3.32         {d0, d2, d4}, [r1]!
+          vld3.32         {d1, d3, d5}, [r1]!
+          vld3.32         {d6, d8, d10}, [r2]!
+          vld3.32         {d7, d9, d11}, [r2]!
+          subs            r3, r3, #4
+
+        @ calculate values for the 1st set
+          vmul.f32        q10, q0, q3
+          vmul.f32        q11, q1, q4
+          vmul.f32        q12, q2, q5
+
+        @ load the 2nd set of values
+          vld3.32         {d0, d2, d4}, [r1]!
+          vld3.32         {d1, d3, d5}, [r1]!
+          vld3.32         {d6, d8, d10}, [r2]!
+          vld3.32         {d7, d9, d11}, [r2]!
+          subs            r3, r3, #4
+
+          ble             .L_mainloopend_vec3
+
+.L_mainloop_vec3:
+        @ store the result for the 1st/next (e.g. 3rd) set
+          vst3.32         {d20, d22, d24}, [r0]!
+          vst3.32         {d21, d23, d25}, [r0]!
+
+        @ calculate values for the 2nd/next (e.g. 3rd) set
+          vmul.f32        q10, q0, q3
+          vmul.f32        q11, q1, q4
+          vmul.f32        q12, q2, q5
+
+       @ load the next (e.g. 3rd) set of values
+          vld3.32         {d0, d2, d4}, [r1]!
+          vld3.32         {d1, d3, d5}, [r1]!
+          vld3.32         {d6, d8, d10}, [r2]!
+          vld3.32         {d7, d9, d11}, [r2]!
+          subs            r3, r3, #4
+
+        bgt               .L_mainloop_vec3             @ loop if r2 is > r3, if we have at least another 4 vectors (12 floats) to process
+
+.L_mainloopend_vec3:
+        @ the last iteration for this call
+        @ store the result for the set of values before the last one (e.g 2nd set)
+          vst3.32         {d20, d22, d24}, [r0]!
+          vst3.32         {d21, d23, d25}, [r0]!
+
+        @ calculate values for the last (e.g. 3rd) set
+          vmul.f32        q10, q0, q3
+          vmul.f32        q11, q1, q4
+          vmul.f32        q12, q2, q5
+
+        @ store the result for the last (e.g. 3rd) set
+          vst3.32         {d20, d22, d24}, [r0]!
+          vst3.32         {d21, d23, d25}, [r0]!
+
+.L_check_vec3:
+     @ check if anything left to process at the end of the input array
+        cmp               r4, #0
+        ble               .L_return_vec3
+
+.L_secondloop_vec3:
+     @ process the last few items left in the input array
+        vld3.f32          {d0[0], d2[0], d4[0]}, [r1]!     @ The values are loaded like so:
+                                                           @      q0 = { V1.x, -, -, - };
+                                                           @      q1 = { V1.y, -, -, - };
+                                                           @      q2 = { V1.z, -, -, - };
+        vld3.f32          {d1[0], d3[0], d5[0]}, [r2]!     @ The values are loaded like so:
+                                                           @      q0 = { V1.x, -, V2.x, - };
+                                                           @      q1 = { V1.y, -, V2.y, - };
+                                                           @      q2 = { V1.z, -, V2.z, - };
+
+        subs              r4, r4, #1
+
+        @ calculate values for
+        vmul.f32          d0, d0, d1
+        vmul.f32          d2, d2, d3
+        vmul.f32          d4, d4, d5
+
+        vst3.32           {d0[0], d2[0], d4[0]}, [r0]!
+
+        bgt               .L_secondloop_vec3
+
+.L_return_vec3:
+     @ return
+        pop               {r4}
+        mov               r0, #0
+        bx                lr
+
+
+
+
+        .align  2
+        .global vmul_vec4f_neon
+        .thumb
+        .thumb_func
+vmul_vec4f_neon:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ arm_result_t vmul_vec4f(arm_vec4f_t * dst,
+        @                 arm_vec4f_t * src1,
+        @                 arm_vec4f_t * src2,
+        @                 unsigned int count)
+        @
+        @  r0: *dst & current dst entry's address
+        @  r1: *src1 & current src1 entry's address
+        @  r2: *src2 & current src2 entry's address
+        @  r3: int count & the number of items in the input array that can be
+        @                   processed in chunks of 4 vectors
+        @  r4: the number of items (count % 4) left for the one-at-a-time tail loop
+        @
+        @  q0-q3: src1, q8-q11: src2, q12-q15: products (callee-saved d8-d15 stay untouched)
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        push              {r4}
+        and               r4, r3, #3          @ r4 = count % 4;
+        sub               r3, r3, r4          @ r3 = count - r4; the part that is processed 4 vectors at a time
+
+        cmp               r3, #0
+        beq               .L_check_vec4
+
+        @ load the 1st set of values
+          vld4.32         {d0, d2, d4, d6}, [r1]!
+          vld4.32         {d1, d3, d5, d7}, [r1]!
+          vld4.32         {d16, d18, d20, d22}, [r2]!
+          vld4.32         {d17, d19, d21, d23}, [r2]!
+
+        @ calculate values for the 1st set
+          vmul.f32        q12, q0, q8
+          vmul.f32        q13, q1, q9
+          vmul.f32        q14, q2, q10
+          vmul.f32        q15, q3, q11
+
+        @ a single group of 4 has no 2nd set: loading one would read (and later store) past the end of the arrays
+          subs            r3, r3, #4
+          beq             .L_storelast_vec4
+
+        @ load the 2nd set of values
+          vld4.32         {d0, d2, d4, d6}, [r1]!
+          vld4.32         {d1, d3, d5, d7}, [r1]!
+          vld4.32         {d16, d18, d20, d22}, [r2]!
+          vld4.32         {d17, d19, d21, d23}, [r2]!
+          subs            r3, r3, #4
+          ble             .L_mainloopend_vec4
+
+.L_mainloop_vec4:
+        @ store the result for the 1st/next (e.g. 3rd) set
+          vst4.32         {d24, d26, d28, d30}, [r0]!
+          vst4.32         {d25, d27, d29, d31}, [r0]!
+
+        @ calculate values for the 2nd/next (e.g. 3rd) set
+          vmul.f32        q12, q0, q8
+          vmul.f32        q13, q1, q9
+          vmul.f32        q14, q2, q10
+          vmul.f32        q15, q3, q11
+
+       @ load the next (e.g. 3rd) set of values
+          vld4.32         {d0, d2, d4, d6}, [r1]!
+          vld4.32         {d1, d3, d5, d7}, [r1]!
+          vld4.32         {d16, d18, d20, d22}, [r2]!
+          vld4.32         {d17, d19, d21, d23}, [r2]!
+          subs            r3, r3, #4
+        bgt               .L_mainloop_vec4             @ loop while at least another 4 vectors (16 floats) are left to load
+
+.L_mainloopend_vec4:
+        @ the last iteration for this call: store the result for the set before the last one (e.g 2nd set)
+          vst4.32         {d24, d26, d28, d30}, [r0]!
+          vst4.32         {d25, d27, d29, d31}, [r0]!
+
+        @ calculate values for the last (e.g. 3rd) set
+          vmul.f32        q12, q0, q8
+          vmul.f32        q13, q1, q9
+          vmul.f32        q14, q2, q10
+          vmul.f32        q15, q3, q11
+
+.L_storelast_vec4:
+        @ store the result for the last (e.g. 3rd) set
+          vst4.32         {d24, d26, d28, d30}, [r0]!
+          vst4.32         {d25, d27, d29, d31}, [r0]!
+
+.L_check_vec4:
+     @ check if anything left to process at the end of the input array
+        cmp               r4, #0
+        ble               .L_return_vec4
+
+.L_secondloop_vec4:
+     @ process the last few items left in the input array
+        vld4.f32          {d0[0], d2[0], d4[0], d6[0]}, [r1]!     @ The values are loaded like so:
+                                                                  @      q0 = { V1.x, -, -, - };
+                                                                  @      q1 = { V1.y, -, -, - };
+                                                                  @      q2 = { V1.z, -, -, - };
+                                                                  @      q3 = { V1.w, -, -, - };
+        vld4.f32          {d1[0], d3[0], d5[0], d7[0]}, [r2]!     @ The values are loaded like so:
+                                                                  @      q0 = { V1.x, -, V2.x, - };
+                                                                  @      q1 = { V1.y, -, V2.y, - };
+                                                                  @      q2 = { V1.z, -, V2.z, - };
+                                                                  @      q3 = { V1.w, -, V2.w, - };
+
+        subs              r4, r4, #1                              @ the flags set here are consumed by the bgt below
+
+        @ calculate values (only lane 0 of each result is stored)
+        vmul.f32          d0, d0, d1
+        vmul.f32          d2, d2, d3
+        vmul.f32          d4, d4, d5
+        vmul.f32          d6, d6, d7
+
+        vst4.32          {d0[0], d2[0], d4[0], d6[0]}, [r0]!
+
+        bgt               .L_secondloop_vec4
+
+.L_return_vec4:
+     @ return 0 in r0
+        pop               {r4}
+        mov               r0, #0
+        bx                lr
diff --git a/source/NE10_mul_test.c b/source/NE10_mul_test.c
new file mode 100644
index 0000000..ec8452b
--- /dev/null
+++ b/source/NE10_mul_test.c
@@ -0,0 +1,73 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : source/NE10_mul_test.c
+ */
+
+// The following must be defined before the unit-test header is included below
+
+// length of the data arrays
+#define ARRLEN TEST_ARRLEN
+// number of the operations in this unit (float, vec2f, vec3f and vec4f multiply)
+#define OP_COUNT 4
+// number of the different implementations of each of the functions (C, ASM, NEON, ...)
+#define IMPL_COUNT 3
+
+
+#include "../headers/unit_test_x_operation_x.h"
+
+extern arm_result_t mul_float_c   (arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count);
+//extern arm_result_t mul_float_asm (arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count); // the assembly versions haven't been implemented; these are for future use
+extern arm_result_t mul_float_neon(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count);
+
+extern arm_result_t vmul_vec2f_c   (arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
+//extern arm_result_t vmul_vec2f_asm (arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
+extern arm_result_t vmul_vec2f_neon(arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
+
+extern arm_result_t vmul_vec3f_c   (arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
+//extern arm_result_t vmul_vec3f_asm (arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
+extern arm_result_t vmul_vec3f_neon(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
+
+extern arm_result_t vmul_vec4f_c   (arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
+//extern arm_result_t vmul_vec4f_asm (arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
+extern arm_result_t vmul_vec4f_neon(arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
+
+void init_ftbl()
+{
+   // Fill the global function table: three consecutive slots per operation,
+   //  in the order C, ASM, NEON. mul has no assembly version, so the C one fills that slot.
+   ftbl[ 0] = (arm_func_4args_t) mul_float_c;
+   ftbl[ 1] = (arm_func_4args_t) mul_float_c; // using the c version in place of the assembly version
+   ftbl[ 2] = (arm_func_4args_t) mul_float_neon;
+
+   ftbl[ 3] = (arm_func_4args_t) vmul_vec2f_c;
+   ftbl[ 4] = (arm_func_4args_t) vmul_vec2f_c; // using the c version in place of the assembly version
+   ftbl[ 5] = (arm_func_4args_t) vmul_vec2f_neon;
+
+   ftbl[ 6] = (arm_func_4args_t) vmul_vec3f_c;
+   ftbl[ 7] = (arm_func_4args_t) vmul_vec3f_c; // using the c version in place of the assembly version
+   ftbl[ 8] = (arm_func_4args_t) vmul_vec3f_neon;
+
+   ftbl[ 9] = (arm_func_4args_t) vmul_vec4f_c;
+   ftbl[10] = (arm_func_4args_t) vmul_vec4f_c; // using the c version in place of the assembly version
+   ftbl[11] = (arm_func_4args_t) vmul_vec4f_neon;
+}
+
+arm_result_t main( int argc, char **argv )
+{
+   return run_test( argc, argv ); // run_test() is not defined in this file; presumably it comes from the unit-test header included above
+}
diff --git a/source/NE10_mulc.asm.s b/source/NE10_mulc.asm.s
new file mode 100644
index 0000000..43eaa1c
--- /dev/null
+++ b/source/NE10_mulc.asm.s
@@ -0,0 +1,233 @@
+@
+@  Copyright 2011-12 ARM Limited
+@
+@  Licensed under the Apache License, Version 2.0 (the "License");
+@  you may not use this file except in compliance with the License.
+@  You may obtain a copy of the License at
+@
+@      http://www.apache.org/licenses/LICENSE-2.0
+@
+@  Unless required by applicable law or agreed to in writing, software
+@  distributed under the License is distributed on an "AS IS" BASIS,
+@  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@  See the License for the specific language governing permissions and
+@  limitations under the License.
+@
+
+@
+@ NE10 Library : source/NE10_mulc.asm.s
+@
+
+        .text
+        .syntax   unified
+
+.include "headers/NE10header.s"
+
+        .balign   4
+        .global   mulc_float_asm
+        .thumb
+        .thumb_func
+
+mulc_float_asm:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ arm_result_t mulc_float(arm_float_t * dst,
+        @                 arm_float_t * src, const arm_float_t cst,
+        @                 unsigned int count)
+        @
+        @  r0: *dst
+        @  r1: *src
+        @  r2: cst (the float's bits, passed by value in a core register)
+        @  r3: int count
+        @
+        @  r3: loop counter (number of items still to process)
+        @  r5: current item's offset in both src[] and dst[]
+        @  r6: current source item's address made of base(r1)+offset(r5)
+        @  r7: current destination item's address made of base(r0)+offset(r5)
+        @
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        push    {r4, r5, r6, r7}        @ r4 is saved and restored but not otherwise used in this routine
+        cbz     r3, .LoopEndFloat       @ nothing to do when count == 0
+        mov     r5, #0
+
+.LoopBeginFloat:
+        add       r6, r1, r5              @ Get current source item's address in memory
+        vldr      s1, [r6, #0]            @ Load src[i]
+        vmov      s3, r2                  @ Get cst into register s3 (repeated on every iteration)
+        vmul.f32  s10, s1, s3             @ s10 = src[i] * cst
+        add       r7, r0, r5              @ Get current destination item's address in memory
+        vstr      s10, [r7, #0]           @ Store the result back into the main memory
+        add       r5, r5, #4              @ increase the offset by 1*sizeof(float)
+        subs      r3, r3, #1              @ one fewer item left to process
+        bne        .LoopBeginFloat        @ Continue while items remain (r3 != 0)
+
+.LoopEndFloat:
+        mov     r0, NE10_OK             @ Return NE10_OK
+        pop     {r4, r5, r6, r7}
+        bx      lr
+
+
+
+
+        .balign   4
+        .global   mulc_vec2f_asm
+        .thumb
+        .thumb_func
+
+mulc_vec2f_asm:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ arm_result_t mulc_vec2f(arm_vec2f_t * dst,
+        @                 arm_vec2f_t * src, const arm_vec2f_t * cst,
+        @                 unsigned int count)
+        @
+        @  r0: *dst
+        @  r1: *src
+        @  r2: *cst
+        @  r3: int count
+        @
+        @  r3: loop counter (number of items still to process)
+        @  r5: current item's offset in both src[] and dst[]
+        @  r6: current source item's address made of base(r1)+offset(r5)
+        @  r7: current destination item's address made of base(r0)+offset(r5)
+        @
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        push    {r4, r5, r6, r7}        @ r4 is saved and restored but not otherwise used in this routine
+        cbz     r3, .LoopEndVec2F       @ nothing to do when count == 0
+        mov     r5, #0
+
+.LoopBeginVec2F:
+        add       r6, r1, r5              @ Get current source item's address in memory
+        vldr      s1, [r6, #0]            @ Load src[i].x and src[i].y
+        vldr      s2, [r6, #4]
+        vldr      s3, [r2, #0]            @ Load cst->x and cst->y (re-read from memory on every iteration)
+        vldr      s4, [r2, #4]
+        vmul.f32  s10, s1, s3             @ s10 = src[i].x * cst->x
+        vmul.f32  s11, s2, s4             @ s11 = src[i].y * cst->y
+        add       r7, r0, r5              @ Get current destination item's address in memory
+        vstr      s10, [r7, #0]           @ Store the results back into the main memory
+        vstr      s11, [r7, #4]
+        add       r5, r5, #8              @ increase the offset by 2*sizeof(float) @@ (for x and y)
+        subs      r3, r3, #1              @ one fewer item left to process
+        bne        .LoopBeginVec2F        @ Continue while items remain (r3 != 0)
+
+.LoopEndVec2F:
+        mov     r0, NE10_OK             @ Return NE10_OK
+        pop     {r4, r5, r6, r7}
+        bx      lr
+
+
+
+
+        .balign   4
+        .global   mulc_vec3f_asm
+        .thumb
+        .thumb_func
+
+mulc_vec3f_asm:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ arm_result_t mulc_vec3f(arm_vec3f_t * dst,
+        @                 arm_vec3f_t * src, const arm_vec3f_t * cst,
+        @                 unsigned int count)
+        @
+        @  r0: *dst
+        @  r1: *src
+        @  r2: *cst
+        @  r3: int count
+        @
+        @  r3: loop counter (number of items still to process)
+        @  r5: current item's offset in both src[] and dst[]
+        @  r6: current source item's address made of base(r1)+offset(r5)
+        @  r7: current destination item's address made of base(r0)+offset(r5)
+        @
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        push    {r4, r5, r6, r7}        @ r4 is saved and restored but not otherwise used in this routine
+        cbz     r3, .LoopEndVec3F       @ nothing to do when count == 0
+        mov     r5, #0
+
+.LoopBeginVec3F:
+        add       r6, r1, r5              @ Get current source item's address in memory
+        vldr      s1, [r6, #0]            @ Load src[i].x, src[i].y , and src[i].z
+        vldr      s2, [r6, #4]
+        vldr      s3, [r6, #8]
+        vldr      s4, [r2, #0]            @ Load cst->x, cst->y, and cst->z (re-read from memory on every iteration)
+        vldr      s5, [r2, #4]
+        vldr      s6, [r2, #8]
+        vmul.f32  s10, s1, s4             @ s10 = src[i].x * cst->x
+        vmul.f32  s11, s2, s5             @ s11 = src[i].y * cst->y
+        vmul.f32  s12, s3, s6             @ s12 = src[i].z * cst->z
+        add       r7, r0, r5              @ Get current destination item's address in memory
+        vstr      s10, [r7, #0]           @ Store the results back into the main memory
+        vstr      s11, [r7, #4]
+        vstr      s12, [r7, #8]
+        add       r5, r5, #12             @ increase the offset by 3*sizeof(float) @@ (for x, y, and z)
+        subs      r3, r3, #1              @ one fewer item left to process
+        bne        .LoopBeginVec3F        @ Continue while items remain (r3 != 0)
+
+.LoopEndVec3F:
+        mov     r0, NE10_OK             @ Return NE10_OK
+        pop     {r4, r5, r6, r7}
+        bx      lr
+
+
+
+
+        .balign   4
+        .global   mulc_vec4f_asm
+        .thumb
+        .thumb_func
+
+mulc_vec4f_asm:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ arm_result_t mulc_vec4f(arm_vec4f_t * dst,
+        @                 arm_vec4f_t * src, const arm_vec4f_t * cst,
+        @                 unsigned int count)
+        @
+        @  r0: *dst
+        @  r1: *src
+        @  r2: *cst
+        @  r3: int count
+        @
+        @  r3: loop counter (number of items still to process)
+        @  r5: current item's offset in both src[] and dst[]
+        @  r6: current source item's address made of base(r1)+offset(r5)
+        @  r7: current destination item's address made of base(r0)+offset(r5)
+        @
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        push    {r4, r5, r6, r7}        @ r4 is saved and restored but not otherwise used in this routine
+        cbz     r3, .LoopEndVec4F       @ nothing to do when count == 0
+        mov     r5, #0
+
+.LoopBeginVec4F:
+        add       r6, r1, r5              @ Get current source item's address in memory
+        vldr      s1, [r6, #0]            @ Load src[i].x, src[i].y , src[i].z, and src[i].w
+        vldr      s2, [r6, #4]
+        vldr      s3, [r6, #8]
+        vldr      s4, [r6, #12]
+        vldr      s5, [r2, #0]            @ Load cst->x, cst->y, cst->z, and cst->w (re-read from memory on every iteration)
+        vldr      s6, [r2, #4]
+        vldr      s7, [r2, #8]
+        vldr      s8, [r2, #12]
+        vmul.f32  s10, s1, s5             @ s10 = src[i].x * cst->x
+        vmul.f32  s11, s2, s6             @ s11 = src[i].y * cst->y
+        vmul.f32  s12, s3, s7             @ s12 = src[i].z * cst->z
+        vmul.f32  s13, s4, s8             @ s13 = src[i].w * cst->w
+        add       r7, r0, r5              @ Get current destination item's address in memory
+        vstr      s10, [r7, #0]           @ Store the results back into the main memory
+        vstr      s11, [r7, #4]
+        vstr      s12, [r7, #8]
+        vstr      s13, [r7, #12]
+        add       r5, r5, #16             @ increase the offset by 4*sizeof(float) @@ (for x, y, z, and w)
+        subs      r3, r3, #1              @ one fewer item left to process
+        bne        .LoopBeginVec4F        @ Continue while items remain (r3 != 0)
+
+.LoopEndVec4F:
+        mov     r0, NE10_OK             @ Return NE10_OK
+        pop     {r4, r5, r6, r7}
+        bx      lr
diff --git a/source/NE10_mulc.c b/source/NE10_mulc.c
new file mode 100644
index 0000000..d855c02
--- /dev/null
+++ b/source/NE10_mulc.c
@@ -0,0 +1,62 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : source/NE10_mulc.c
+ */
+
+#include "NE10.h"
+#include "../headers/macros.h"
+
+#include <assert.h>
+
+arm_result_t mulc_float_c(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count)
+{
+  NE10_XC_OPERATION_X_C // macro defined outside this file; presumably runs the statement below for every itr below count -- confirm
+  (
+    dst[ itr ] = src[ itr ] * cst; // multiply one element by the scalar constant
+  );
+}
+
+arm_result_t mulc_vec2f_c(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count)
+{
+  NE10_XC_OPERATION_X_C // macro defined outside this file; presumably runs the statements below for every itr below count -- confirm
+  (
+    dst[ itr ].x = src[ itr ].x * cst->x; // component-wise product with the constant vector
+    dst[ itr ].y = src[ itr ].y * cst->y;
+  );
+}
+
+arm_result_t mulc_vec3f_c(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count)
+{
+  NE10_XC_OPERATION_X_C // macro defined outside this file; presumably runs the statements below for every itr below count -- confirm
+  (
+    dst[ itr ].x = src[ itr ].x * cst->x; // component-wise product with the constant vector
+    dst[ itr ].y = src[ itr ].y * cst->y;
+    dst[ itr ].z = src[ itr ].z * cst->z;
+  );
+}
+
+arm_result_t mulc_vec4f_c(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count)
+{
+  NE10_XC_OPERATION_X_C // macro defined outside this file; presumably runs the statements below for every itr below count -- confirm
+  (
+    dst[ itr ].x = src[ itr ].x * cst->x; // component-wise product with the constant vector
+    dst[ itr ].y = src[ itr ].y * cst->y;
+    dst[ itr ].z = src[ itr ].z * cst->z;
+    dst[ itr ].w = src[ itr ].w * cst->w;
+  );
+}
diff --git a/source/NE10_mulc.neon.c b/source/NE10_mulc.neon.c
new file mode 100644
index 0000000..e0bbac7
--- /dev/null
+++ b/source/NE10_mulc.neon.c
@@ -0,0 +1,68 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : source/NE10_mulc.neon.c
+ */
+
+#include "NE10.h"
+#include "../headers/macros.h"
+
+#include <assert.h>
+#include <arm_neon.h>
+
+
+arm_result_t mulc_float_neon(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count)
+{
+    NE10_XC_OPERATION_FLOAT_NEON // macro defined outside this file; presumably 1st argument = main-loop body, 2nd = leftover-items body -- confirm
+    (
+        n_dst = vmulq_f32( n_src , n_cst ); // four-lane multiply
+        ,
+        n_tmp_src = vmul_f32( n_tmp_src, n_tmp_cst ); // two-lane multiply; the product overwrites n_tmp_src
+    );
+}
+
+arm_result_t mulc_vec2f_neon(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count)
+{
+    NE10_XC_OPERATION_VEC2F_NEON // macro defined outside this file; presumably 1st argument = main-loop body, 2nd = leftover-items body -- confirm
+    (
+       n_dst = vmulq_f32( n_src , n_cst ); // four-lane multiply
+       ,
+       n_tmp_src = vmul_f32( n_tmp_src, n_tmp_cst ); // two-lane multiply; the product overwrites n_tmp_src
+    );
+}
+
+arm_result_t mulc_vec3f_neon(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count)
+{
+    NE10_XC_OPERATION_VEC3F_NEON // macro defined outside this file; presumably 1st argument = main-loop body, 2nd = leftover-items body -- confirm
+    (
+        n_dst1 = vmulq_f32( n_src1 , n_cst1 ); // three four-lane multiplies per main-loop step
+        n_dst2 = vmulq_f32( n_src2 , n_cst2 );
+        n_dst3 = vmulq_f32( n_src3 , n_cst3 );
+        ,
+        n_tmp_src.val[0] = vmul_f32( n_tmp_src.val[0], n_tmp_cst.val[0] ); // two-lane multiplies; the products overwrite n_tmp_src
+        n_tmp_src.val[1] = vmul_f32( n_tmp_src.val[1], n_tmp_cst.val[1] );
+        n_tmp_src.val[2] = vmul_f32( n_tmp_src.val[2], n_tmp_cst.val[2] );
+     );
+}
+
+arm_result_t mulc_vec4f_neon(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count)
+{
+    NE10_XC_OPERATION_VEC4F_NEON // macro defined outside this file; only one body is passed here, unlike the float/vec2f/vec3f variants
+    (
+        n_dst = vmulq_f32( n_src , n_cst ); // four-lane multiply
+    );
+}
diff --git a/source/NE10_mulc_test.c b/source/NE10_mulc_test.c
new file mode 100644
index 0000000..df2f36e
--- /dev/null
+++ b/source/NE10_mulc_test.c
@@ -0,0 +1,57 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : source/NE10_mulc_test.c
+ */
+
+// The following must be defined before the unit-test header is included below
+
+// length of the data arrays
+#define ARRLEN TEST_ARRLEN
+// number of the operations in this unit (float, vec2f, vec3f and vec4f multiply-by-constant)
+#define OP_COUNT 4
+// number of the different implementations of each of the functions (C, ASM, NEON, ...)
+#define IMPL_COUNT 3
+
+
+#include "../headers/unit_test_xc_operation_x.h"
+
+void init_ftbl()
+{
+   // Fill the global function table: three consecutive slots per operation,
+   //  in the order C, ASM, NEON.
+   ftbl[ 0] = (arm_func_4args_t) mulc_float_c;
+   ftbl[ 1] = (arm_func_4args_t) mulc_float_asm;
+   ftbl[ 2] = (arm_func_4args_t) mulc_float_neon;
+
+   ftbl[ 3] = (arm_func_4args_t) mulc_vec2f_c;
+   ftbl[ 4] = (arm_func_4args_t) mulc_vec2f_asm;
+   ftbl[ 5] = (arm_func_4args_t) mulc_vec2f_neon;
+
+   ftbl[ 6] = (arm_func_4args_t) mulc_vec3f_c;
+   ftbl[ 7] = (arm_func_4args_t) mulc_vec3f_asm;
+   ftbl[ 8] = (arm_func_4args_t) mulc_vec3f_neon;
+
+   ftbl[ 9] = (arm_func_4args_t) mulc_vec4f_c;
+   ftbl[10] = (arm_func_4args_t) mulc_vec4f_asm;
+   ftbl[11] = (arm_func_4args_t) mulc_vec4f_neon;
+}
+
+arm_result_t main( int argc, char **argv )
+{
+   return run_test( argc, argv ); // run_test() is not defined in this file; presumably it comes from the unit-test header included above
+}
diff --git a/source/NE10_normalize.asm.s b/source/NE10_normalize.asm.s
new file mode 100644
index 0000000..13b0752
--- /dev/null
+++ b/source/NE10_normalize.asm.s
@@ -0,0 +1,149 @@
+@
+@  Copyright 2011-12 ARM Limited
+@
+@  Licensed under the Apache License, Version 2.0 (the "License");
+@  you may not use this file except in compliance with the License.
+@  You may obtain a copy of the License at
+@
+@      http://www.apache.org/licenses/LICENSE-2.0
+@
+@  Unless required by applicable law or agreed to in writing, software
+@  distributed under the License is distributed on an "AS IS" BASIS,
+@  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@  See the License for the specific language governing permissions and
+@  limitations under the License.
+@
+
+@
+@ NE10 Library : source/NE10_normalize.asm.s
+@
+
+        .text
+        .syntax   unified
+
+.include "headers/NE10header.s"
+
+        .balign   4
+        .global   normalize_vec2f_asm
+        .thumb
+        .thumb_func
+
+normalize_vec2f_asm:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ arm_result_t normalize_vec2f(arm_vec2f_t * dst,
+        @                 arm_vec2f_t * src, unsigned int count)
+        @
+        @  r0: *dst and current destination item's address
+        @  r1: *src and current source item's address
+        @  r2: int count
+        @  The arrays are walked from the last item back to the first.
+        @  r2: loop counter (number of items still to process)
+        @  Note: there is no guard for a zero-length vector (the divisions then divide by 0).
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        cbz        r2, .LoopEndVec2F         @ nothing to do when count == 0
+        add        r0, r0, r2, lsl #3        @ r0 = r0 + count * 8
+        add        r1, r1, r2, lsl #3        @ r1 = r1 + count * 8
+
+.LoopBeginVec2F:
+        vldmdb     r1!, {s10-s11}            @ step r1 back one item, then load s10 = x and s11 = y
+        vmul.f32   s14, s10, s10             @ s14 = x*x
+        vmla.f32   s14, s11, s11             @ s14 = x*x + y*y
+        vsqrt.f32  s15, s14                  @ s15 = sqrt( s14 )
+        vdiv.f32   s10, s10, s15             @ s10 = x / length
+        vdiv.f32   s11, s11, s15             @ s11 = y / length
+        vstmdb     r0!, {s10-s11}            @ step r0 back one item, then store the results
+        subs       r2, r2, #1                @ decrement the loop counter
+        bne        .LoopBeginVec2F           @ loop while r2 is not zero
+.LoopEndVec2F:
+        mov        r0, NE10_OK             @ Return NE10_OK
+        bx         lr
+
+
+
+
+        .balign   4
+        .global   normalize_vec3f_asm
+        .thumb
+        .thumb_func
+
+normalize_vec3f_asm:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ arm_result_t normalize_vec3f(arm_vec3f_t * dst,
+        @                 arm_vec3f_t * src, unsigned int count)
+        @
+        @  r0: *dst and current destination item's address
+        @  r1: *src and current source item's address
+        @  r2: int count
+        @  The arrays are walked from the last item back to the first.
+        @  r2: loop counter (number of items still to process)
+        @  Note: there is no guard for a zero-length vector (the divisions then divide by 0).
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        cbz        r2, .LoopEndVec3F         @ nothing to do when count == 0
+        add        r0, r0, r2, lsl #3        @ r0 = r0 + count * 8 ...
+        add        r0, r0, r2, lsl #2        @ ... + count * 4, i.e. r0 = r0 + count * 12
+        add        r1, r1, r2, lsl #3        @ r1 = r1 + count * 8 ...
+        add        r1, r1, r2, lsl #2        @ ... + count * 4, i.e. r1 = r1 + count * 12
+
+.LoopBeginVec3F:
+        vldmdb     r1!, {s10-s12}            @ step r1 back one item, then load s10 = x, s11 = y and s12 = z
+        vmul.f32   s14, s10, s10             @ s14 = x*x
+        vmla.f32   s14, s11, s11             @ s14 = x*x + y*y
+        vmla.f32   s14, s12, s12             @ s14 = x*x + y*y + z*z
+        vsqrt.f32  s15, s14                  @ s15 = sqrt( s14 )
+        vdiv.f32   s10, s10, s15             @ s10 = x / length
+        vdiv.f32   s11, s11, s15             @ s11 = y / length
+        vdiv.f32   s12, s12, s15             @ s12 = z / length
+        vstmdb     r0!, {s10-s12}            @ step r0 back one item, then store the results
+        subs       r2, r2, #1                @ decrement the loop counter
+        bne        .LoopBeginVec3F           @ loop while r2 is not zero
+.LoopEndVec3F:
+        mov        r0, NE10_OK             @ Return NE10_OK
+        bx         lr
+
+
+
+
+        .balign   4
+        .global   normalize_vec4f_asm
+        .thumb
+        .thumb_func
+
+normalize_vec4f_asm:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ arm_result_t normalize_vec4f(arm_vec4f_t * dst,
+        @                 arm_vec4f_t * src, unsigned int count)
+        @
+        @  r0: *dst and current destination item's address
+        @  r1: *src and current source item's address
+        @  r2: int count
+        @  The arrays are walked from the last item back to the first.
+        @  r2: loop counter (number of items still to process)
+        @  Note: there is no guard for a zero-length vector (the divisions then divide by 0).
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        cbz        r2, .LoopEndVec4F         @ nothing to do when count == 0
+        add        r0, r0, r2, lsl #4        @ r0 = r0 + count * 16
+        add        r1, r1, r2, lsl #4        @ r1 = r1 + count * 16
+
+.LoopBeginVec4F:
+        vldmdb     r1!, {s10-s13}            @ step r1 back one item, then load s10 = x, s11 = y, s12 = z and s13 = w
+        vmul.f32   s14, s10, s10             @ s14 = x*x
+        vmla.f32   s14, s11, s11             @ s14 = x*x + y*y
+        vmla.f32   s14, s12, s12             @ s14 = x*x + y*y + z*z
+        vmla.f32   s14, s13, s13             @ s14 = x*x + y*y + z*z + w*w
+        vsqrt.f32  s15, s14                  @ s15 = sqrt( s14 )
+        vdiv.f32   s10, s10, s15             @ s10 = x / length
+        vdiv.f32   s11, s11, s15             @ s11 = y / length
+        vdiv.f32   s12, s12, s15             @ s12 = z / length
+        vdiv.f32   s13, s13, s15             @ s13 = w / length
+        vstmdb     r0!, {s10-s13}            @ step r0 back one item, then store the results
+        subs       r2, r2, #1                @ decrement the loop counter
+        bne        .LoopBeginVec4F           @ loop while r2 is not zero
+.LoopEndVec4F:
+        mov        r0, NE10_OK             @ Return NE10_OK
+        bx         lr
diff --git a/source/NE10_normalize.c b/source/NE10_normalize.c
new file mode 100644
index 0000000..24164a5
--- /dev/null
+++ b/source/NE10_normalize.c
@@ -0,0 +1,74 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : source/NE10_normalize.c
+ */
+
+#include "NE10.h"
+#include "../headers/macros.h"
+
+#include <assert.h>
+
+#include <math.h>
+
+arm_result_t normalize_vec2f_c(arm_vec2f_t * dst, arm_vec2f_t * src, unsigned int count)
+{
+  /* dst[itr] = src[itr] / |src[itr]|; a zero-length source vector is not guarded. */
+  const arm_vec2f_t * in;  /* source vector of the current iteration */
+  float len;               /* Euclidean length of *in */
+
+  NE10_LEN_OPERATION_X_C(
+    in = src + itr;
+    len = sqrt( in->x * in->x + in->y * in->y );
+    dst[ itr ].x = in->x / len;
+    dst[ itr ].y = in->y / len;
+  );
+}
+
+arm_result_t normalize_vec3f_c(arm_vec3f_t * dst, arm_vec3f_t * src, unsigned int count)
+{
+  /* dst[itr] = src[itr] / |src[itr]|; a zero-length source vector is not guarded. */
+  /* `itr` is not declared here; presumably the macro expansion provides it. */
+  const arm_vec3f_t * in;  /* source vector of the current iteration */
+  float len;               /* Euclidean length of *in */
+
+  NE10_LEN_OPERATION_X_C(
+    in = src + itr;
+    len = sqrt( in->x * in->x + in->y * in->y + in->z * in->z );
+    dst[ itr ].x = in->x / len;
+    dst[ itr ].y = in->y / len;
+    dst[ itr ].z = in->z / len;
+  );
+}
+
+arm_result_t normalize_vec4f_c(arm_vec4f_t * dst, arm_vec4f_t * src, unsigned int count)
+{
+  /* dst[itr] = src[itr] / |src[itr]|; a zero-length source vector is not guarded. */
+  /* `itr` is not declared here; presumably the macro expansion provides it. */
+  const arm_vec4f_t * in;  /* source vector of the current iteration */
+  float len;               /* Euclidean length of *in */
+
+  NE10_LEN_OPERATION_X_C(
+    in = src + itr;
+    len = sqrt( in->x * in->x + in->y * in->y +
+                in->z * in->z + in->w * in->w );
+    dst[ itr ].x = in->x / len;
+    dst[ itr ].y = in->y / len;
+    dst[ itr ].z = in->z / len;
+    dst[ itr ].w = in->w / len;
+  );
+}
diff --git a/source/NE10_normalize.neon.s b/source/NE10_normalize.neon.s
new file mode 100644
index 0000000..adfdd56
--- /dev/null
+++ b/source/NE10_normalize.neon.s
@@ -0,0 +1,397 @@
+@
+@  Copyright 2011-12 ARM Limited
+@
+@  Licensed under the Apache License, Version 2.0 (the "License");
+@  you may not use this file except in compliance with the License.
+@  You may obtain a copy of the License at
+@
+@      http://www.apache.org/licenses/LICENSE-2.0
+@
+@  Unless required by applicable law or agreed to in writing, software
+@  distributed under the License is distributed on an "AS IS" BASIS,
+@  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@  See the License for the specific language governing permissions and
+@  limitations under the License.
+@
+
+@
+@ NE10 Library : source/NE10_normalize.neon.s
+@
+
+        .text
+        .syntax   unified
+
+.include "headers/NE10header.s"
+
+
+
+
+        .balign   4
+        .global   normalize_vec2f_neon
+        .thumb
+        .thumb_func
+
+normalize_vec2f_neon:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @ Normalizes 'count' 2D vectors: dst[i] = src[i] * (1 / length(src[i]))
+        @ arm_result_t normalize_vec2f(arm_vec2f_t * dst,
+        @                 arm_vec2f_t * src,
+        @                 unsigned int count);
+        @
+        @  r0: *dst & the current dst entry's address
+        @  r1: *src & current src entry's address
+        @  r2: int count & the number of items in the input array that can be
+        @                   processed in chunks of 4 vectors
+        @  r3: the number of items that are left to be processed at the end of
+        @                   the input array
+        @  NEON scratch: q0-q3, q8-q11 only; q4-q7 are callee-saved (AAPCS) and must not be clobbered
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        and               r3, r2, #3          @ r3 = count % 4;
+        sub               r2, r2, r3          @ count = count - r3; This is what the main loop processes, r3 items are left for the second loop
+
+        cbz               r2, .L_check_vec2
+
+        @ load values for the first iteration (de-interleaved: q0 = four x values, q1 = four y values)
+          vld2.32         {q0-q1}, [r1]!
+          subs            r2, r2, #4
+
+        @ calculate sum of square of the components
+          vmul.f32        q2, q0, q0
+          vmla.f32        q2, q1, q1
+
+          ble             .L_mainloopend_vec2
+
+.L_mainloop_vec2:
+       @ keep the previous set in q10/q11 and load the next set of values
+        vmov.f32          q10, q0
+        vmov.f32          q11, q1
+        vld2.32           {q0-q1}, [r1]!
+        subs              r2, r2, #4
+
+        @ get reciprocal SQRT of the last vector while loading a new vector (estimate + one Newton-Raphson step)
+          vrsqrte.f32     q8, q2
+          vmul.f32        q9, q2, q8
+          vrsqrts.f32     q9, q9, q8
+          vmul.f32        q9, q8, q9
+
+        @ normalize the components
+          vmul.f32        q8, q10, q9         @ q8 = previous x values * (1 / length)
+          vmul.f32        q9, q11, q9         @ q9 = previous y values * (1 / length)
+
+          vst2.32         {d16,d17,d18,d19}, [r0]!
+
+        @ calculate sum of square of the components
+        vmul.f32        q2, q0, q0
+        vmla.f32        q2, q1, q1
+
+        bgt             .L_mainloop_vec2             @ flags come from the subs above: loop while at least another 4 vectors (8 floats) remain to be loaded
+
+.L_mainloopend_vec2:
+        @ the last iteration for this call
+        @ get reciprocal SQRT of the last vector
+          vrsqrte.f32     q8, q2
+          vmul.f32        q9, q2, q8
+          vrsqrts.f32     q9, q9, q8
+          vmul.f32        q9, q8, q9
+
+        @ normalize the components
+          vmul.f32        q8, q0, q9          @ q8 = x values * (1 / length)
+          vmul.f32        q9, q1, q9          @ q9 = y values * (1 / length)
+
+        vst2.32         {d16,d17,d18,d19}, [r0]!
+
+.L_check_vec2:
+     @ check if anything left to process at the end of the input array
+        cmp               r3, #0
+        ble               .L_return_vec2
+
+.L_secondloop_vec2:
+     @ process the last few items left in the input array
+        vld1.f32          d0, [r1]!           @ Fill in d0 = { V.x, V.y };
+
+        subs              r3, r3, #1
+
+        @ calculate sum of square of the components
+        vmul.f32          d1, d0, d0          @  d1= { V.x^2, V.y^2 };
+        vpadd.f32         d3, d1, d1          @  d3= { V.x^2 + (V.y^2), V.y^2 + (V.x^2) };
+
+
+        @ get reciprocal SQRT of the last vector
+        vrsqrte.f32       d2, d3
+        vmul.f32          d1, d3, d2
+        vrsqrts.f32       d1, d1, d2
+        vmul.f32          d1, d2, d1
+
+        @ normalize the components
+        vmul.f32          d0, d0, d1
+
+        vst1.32           {d0}, [r0]!
+
+        bgt               .L_secondloop_vec2
+
+.L_return_vec2:
+     @ return
+        mov               r0, #0
+        bx                lr
+
+
+
+
+        .align  2
+        .global normalize_vec3f_neon
+        .thumb
+        .thumb_func
+normalize_vec3f_neon:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @ Normalizes 'count' 3D vectors: dst[i] = src[i] * (1 / length(src[i]))
+        @ arm_result_t normalize_vec3f(arm_vec3f_t * dst,
+        @                 arm_vec3f_t * src,
+        @                 unsigned int count);
+        @
+        @  r0: *dst & the current dst entry's address
+        @  r1: *src & current src entry's address
+        @  r2: int count & the number of items in the input array that can be
+        @                   processed in chunks of 4 vectors
+        @  r3: the number of items that are left to be processed at the end of
+        @                   the input array
+        @  NEON scratch: q0-q3, q8, q10-q15 only; q4-q7 are callee-saved (AAPCS) and must not be clobbered
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        and               r3, r2, #3          @ r3 = count % 4;
+        sub               r2, r2, r3          @ count = count - r3; This is what the main loop processes, r3 items are left for the second loop
+
+        cmp               r2, #0
+        beq               .L_check_vec3
+
+        @ load values for the first iteration (de-interleaved: q0 = four x, q1 = four y, q2 = four z values)
+          vld3.32         {d0, d2, d4}, [r1]!
+          vld3.32         {d1, d3, d5}, [r1]!
+          subs            r2, r2, #4
+
+        @ calculate sum of square of the components
+          vmul.f32        q3, q0, q0
+          vmla.f32        q3, q1, q1
+          vmla.f32        q3, q2, q2
+
+
+          ble             .L_mainloopend_vec3
+
+.L_mainloop_vec3:
+       @ keep the previous set in q10-q12 and load the next set of values
+        vmov.f32          q10, q0
+        vmov.f32          q11, q1
+        vmov.f32          q12, q2
+
+        vld3.32           {d0,d2,d4}, [r1]!
+        vld3.32           {d1,d3,d5}, [r1]!
+        subs              r2, r2, #4
+
+        @ get reciprocal SQRT of the last vector while loading a new vector (estimate + one Newton-Raphson step)
+          vrsqrte.f32     q13, q3
+          vmul.f32        q8, q3, q13
+          vrsqrts.f32     q8, q8, q13
+          vmul.f32        q8, q13, q8
+
+        @ normalize the components
+          vmul.f32        q13, q10, q8
+          vmul.f32        q14, q11, q8
+          vmul.f32        q15, q12, q8
+
+          vst3.32         {d26, d28, d30}, [r0]!
+          vst3.32         {d27, d29, d31}, [r0]!
+
+        @ calculate sum of square of the components
+        vmul.f32          q3, q0, q0
+        vmla.f32          q3, q1, q1
+        vmla.f32          q3, q2, q2
+
+        bgt               .L_mainloop_vec3             @ flags come from the subs above: loop while at least another 4 vectors (12 floats) remain to be loaded
+
+.L_mainloopend_vec3:
+        @ the last iteration for this call
+        @ get reciprocal SQRT of the last vector
+          vrsqrte.f32     q13, q3
+          vmul.f32        q8, q3, q13
+          vrsqrts.f32     q8, q8, q13
+          vmul.f32        q8, q13, q8
+
+        @ normalize the components
+          vmul.f32        q13, q0, q8
+          vmul.f32        q14, q1, q8
+          vmul.f32        q15, q2, q8
+
+          vst3.32         {d26, d28, d30}, [r0]!
+          vst3.32         {d27, d29, d31}, [r0]!
+
+.L_check_vec3:
+     @ check if anything left to process at the end of the input array
+        cmp               r3, #0
+        ble               .L_return_vec3
+
+.L_secondloop_vec3:
+     @ process the last few items left in the input array
+        vld3.f32          {d0[0], d2[0], d4[0]}, [r1]!     @ Only lane 0 of each register is loaded:
+                                                           @      q0 = { V.x, -, -, - };
+                                                           @      q1 = { V.y, -, -, - };
+                                                           @      q2 = { V.z, -, -, - };
+        subs              r3, r3, #1
+
+        @ calculate sum of square of the components
+        vmul.f32          q3, q0, q0          @  V.x^2
+        vmla.f32          q3, q1, q1          @  V.x^2 + V.y^2
+        vmla.f32          q3, q2, q2          @  V.x^2 + V.y^2 + V.z^2
+
+
+        @ get reciprocal SQRT of the last vector
+        vrsqrte.f32     q13, q3
+        vmul.f32        q8, q3, q13
+        vrsqrts.f32     q8, q8, q13
+        vmul.f32        q8, q13, q8
+
+        @ normalize the components
+        vmul.f32          q0, q0, q8
+        vmul.f32          q1, q1, q8
+        vmul.f32          q2, q2, q8
+
+        vst3.32           {d0[0], d2[0], d4[0]}, [r0]!
+
+        bgt               .L_secondloop_vec3
+
+.L_return_vec3:
+     @ return
+        mov               r0, #0
+        bx                lr
+
+
+
+
+        .align  2
+        .global normalize_vec4f_neon
+        .thumb
+        .thumb_func
+normalize_vec4f_neon:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @ Normalizes 'count' 4D vectors: dst[i] = src[i] * (1 / length(src[i]))
+        @ arm_result_t normalize_vec4f(arm_vec4f_t * dst,
+        @                 arm_vec4f_t * src,
+        @                 unsigned int count);
+        @
+        @  r0: *dst & the current dst entry's address
+        @  r1: *src & current src entry's address
+        @  r2: int count & the number of items in the input array that can be
+        @                   processed in chunks of 4 vectors
+        @  r3: the number of items that are left to be processed at the end of
+        @                   the input array
+        @  NEON scratch: q0-q3, q8-q14 only; q4-q7 are callee-saved (AAPCS) and must not be clobbered
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        and               r3, r2, #3          @ r3 = count % 4;
+        sub               r2, r2, r3          @ count = count - r3; This is what the main loop processes, r3 items are left for the second loop
+
+        cmp               r2, #0
+        beq               .L_check_vec4
+
+        @ load values for the first iteration (de-interleaved: q0 = four x, q1 = four y, q2 = four z, q3 = four w values)
+          vld4.32         {d0, d2, d4, d6}, [r1]!
+          vld4.32         {d1, d3, d5, d7}, [r1]!
+          subs            r2, r2, #4
+
+        @ calculate sum of square of the components
+          vmul.f32        q9, q0, q0
+          vmla.f32        q9, q1, q1
+          vmla.f32        q9, q2, q2
+          vmla.f32        q9, q3, q3
+
+          ble             .L_mainloopend_vec4
+
+.L_mainloop_vec4:
+       @ keep the previous set in q10-q13 and load the next set of values
+        vmov              q10, q0
+        vmov              q11, q1
+        vmov              q12, q2
+        vmov              q13, q3
+
+        vld4.32           {d0, d2, d4, d6}, [r1]!
+        vld4.32           {d1, d3, d5, d7}, [r1]!
+        subs              r2, r2, #4
+
+        @ get reciprocal SQRT of the last vector while loading a new vector (estimate + one Newton-Raphson step)
+          vrsqrte.f32     q14, q9
+          vmul.f32        q8, q9, q14
+          vrsqrts.f32     q8, q8, q14
+          vmul.f32        q8, q14, q8
+
+        @ normalize the components
+          vmul.f32        q10, q10, q8
+          vmul.f32        q11, q11, q8
+          vmul.f32        q12, q12, q8
+          vmul.f32        q13, q13, q8
+
+          vst4.32         {d20, d22, d24, d26}, [r0]!
+          vst4.32         {d21, d23, d25, d27}, [r0]!
+
+        @ calculate sum of square of the components
+        vmul.f32        q9, q0, q0
+        vmla.f32        q9, q1, q1
+        vmla.f32        q9, q2, q2
+        vmla.f32        q9, q3, q3
+
+        bgt               .L_mainloop_vec4             @ flags come from the subs above: loop while at least another 4 vectors (16 floats) remain to be loaded
+
+.L_mainloopend_vec4:
+        @ the last iteration for this call
+        @ get reciprocal SQRT of the last vector
+          vrsqrte.f32     q14, q9
+          vmul.f32        q8, q9, q14
+          vrsqrts.f32     q8, q8, q14
+          vmul.f32        q8, q14, q8
+
+        @ normalize the components
+          vmul.f32        q0, q0, q8
+          vmul.f32        q1, q1, q8
+          vmul.f32        q2, q2, q8
+          vmul.f32        q3, q3, q8
+
+          vst4.32         {d0, d2, d4, d6}, [r0]!
+          vst4.32         {d1, d3, d5, d7}, [r0]!
+
+.L_check_vec4:
+     @ check if anything left to process at the end of the input array
+        cmp               r3, #0
+        ble               .L_return_vec4
+
+.L_secondloop_vec4:
+     @ process the last few items left in the input array
+        vld4.f32          {d0[0], d2[0], d4[0], d6[0]}, [r1]!     @ Only lane 0 of each register is loaded:
+                                                                  @      q0 = { V.x, -, -, - };  q1 = { V.y, -, -, - };
+                                                                  @      q2 = { V.z, -, -, - };  q3 = { V.w, -, -, - };
+                                                                  @      ( - : leftover lane, computed on but never stored )
+        subs              r3, r3, #1
+
+        @ calculate sum of square of the components
+        vmul.f32          q8, q0, q0          @  V.x^2
+        vmla.f32          q8, q1, q1          @  V.x^2 + V.y^2
+        vmla.f32          q8, q2, q2          @  V.x^2 + V.y^2 + V.z^2
+        vmla.f32          q8, q3, q3          @  V.x^2 + V.y^2 + V.z^2 + V.w^2
+
+        @ get reciprocal SQRT of the last vector
+        vrsqrte.f32       q9, q8
+        vmul.f32          q14, q8, q9
+        vrsqrts.f32       q14, q14, q9
+        vmul.f32          q14, q9, q14
+
+        @ normalize the components
+        vmul.f32          q0, q0, q14
+        vmul.f32          q1, q1, q14
+        vmul.f32          q2, q2, q14
+        vmul.f32          q3, q3, q14
+
+        vst4.32          {d0[0], d2[0], d4[0], d6[0]}, [r0]!     @ store lane 0 of each register: x, y, z, w
+
+        bgt               .L_secondloop_vec4
+
+.L_return_vec4:
+     @ return
+        mov               r0, #0
+        bx                lr
diff --git a/source/NE10_normalize_test.c b/source/NE10_normalize_test.c
new file mode 100644
index 0000000..5420c15
--- /dev/null
+++ b/source/NE10_normalize_test.c
@@ -0,0 +1,66 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : source/NE10_normalize_test.c
+ */
+
+//Make sure the following are defined before including the unit-test header below
+
+// length of the data arrays
+#define ARRLEN TEST_ARRLEN
+// number of the operations in a given unit
+#define OP_COUNT 3
+// number of the different implementations of each of the functions (C, ASM, NEON, ...)
+#define IMPL_COUNT 3
+
+
+#include "../headers/unit_test_normalize_operation_x.h"
+
+
+extern arm_result_t normalize_vec2f_c   (arm_vec2f_t * dst, arm_vec2f_t * src, unsigned int count);
+extern arm_result_t normalize_vec2f_asm (arm_vec2f_t * dst, arm_vec2f_t * src, unsigned int count);
+extern arm_result_t normalize_vec2f_neon(arm_vec2f_t * dst, arm_vec2f_t * src, unsigned int count);
+
+extern arm_result_t normalize_vec3f_c   (arm_vec3f_t * dst, arm_vec3f_t * src, unsigned int count);
+extern arm_result_t normalize_vec3f_asm (arm_vec3f_t * dst, arm_vec3f_t * src, unsigned int count);
+extern arm_result_t normalize_vec3f_neon(arm_vec3f_t * dst, arm_vec3f_t * src, unsigned int count);
+
+extern arm_result_t normalize_vec4f_c   (arm_vec4f_t * dst, arm_vec4f_t * src, unsigned int count);
+extern arm_result_t normalize_vec4f_asm (arm_vec4f_t * dst, arm_vec4f_t * src, unsigned int count);
+extern arm_result_t normalize_vec4f_neon(arm_vec4f_t * dst, arm_vec4f_t * src, unsigned int count);
+
+void init_ftbl()
+{
+   // Copy the implementations into the global function table. One row per
+   //  operation (vec2f, vec3f, vec4f); columns are the C, ASM and NEON versions.
+   static const arm_func_3args_t impls[] =
+   {
+      (arm_func_3args_t) normalize_vec2f_c, (arm_func_3args_t) normalize_vec2f_asm, (arm_func_3args_t) normalize_vec2f_neon,
+      (arm_func_3args_t) normalize_vec3f_c, (arm_func_3args_t) normalize_vec3f_asm, (arm_func_3args_t) normalize_vec3f_neon,
+      (arm_func_3args_t) normalize_vec4f_c, (arm_func_3args_t) normalize_vec4f_asm, (arm_func_3args_t) normalize_vec4f_neon,
+   };
+   unsigned int slot;
+   for ( slot = 0; slot < sizeof( impls ) / sizeof( impls[ 0 ] ); slot++ )
+   {
+      ftbl[ slot ] = impls[ slot ];
+   }
+}
+
+arm_result_t main( int argc, char **argv )
+{
+   return run_test( argc, argv ); // run_test() is not defined in this file; presumably it comes from the unit-test header included above
+}
diff --git a/source/NE10_rsbc.asm.s b/source/NE10_rsbc.asm.s
new file mode 100644
index 0000000..437f54b
--- /dev/null
+++ b/source/NE10_rsbc.asm.s
@@ -0,0 +1,234 @@
+@
+@  Copyright 2011-12 ARM Limited
+@
+@  Licensed under the Apache License, Version 2.0 (the "License");
+@  you may not use this file except in compliance with the License.
+@  You may obtain a copy of the License at
+@
+@      http://www.apache.org/licenses/LICENSE-2.0
+@
+@  Unless required by applicable law or agreed to in writing, software
+@  distributed under the License is distributed on an "AS IS" BASIS,
+@  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@  See the License for the specific language governing permissions and
+@  limitations under the License.
+@
+
+@
+@ NE10 Library : source/NE10_rsbc.asm.s
+@
+
+        .text
+        .syntax   unified
+
+.include "headers/NE10header.s"
+
+        .balign   4
+        .global   rsbc_float_asm
+        .thumb
+        .thumb_func
+
+rsbc_float_asm:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @ Reverse subtraction by a constant: dst[i] = cst - src[i] for 'count' floats
+        @ arm_result_t rsbc_float(arm_float_t * dst,
+        @                 arm_float_t * src, const arm_float_t cst,
+        @                 unsigned int count)
+        @
+        @  r0: *dst
+        @  r1: *src
+        @  r2: cst (the float value itself, held in a core register)
+        @  r3: int count
+        @
+        @  r3: loop counter
+        @  r5: current item's offset in both src[] and dst[]
+        @  r6: current source item's address made of base(r1)+offset(r5)
+        @  r7: current destination item's address made of base(r0)+offset(r5)
+        @
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        push    {r4, r5, r6, r7}        @ r5-r7 are used below; r4 is saved too although it is not used here
+        cbz     r3, .LoopEndFloat       @ nothing to do when count == 0
+        mov     r5, #0
+
+.LoopBeginFloat:
+        add       r6, r1, r5              @ Get current source item's address in memory
+        vldr      s1, [r6, #0]            @ Load src[i]
+        vmov      s3, r2                  @ Get cst into register s3
+        vsub.f32  s10, s3, s1             @ s10 = cst - src[i]
+        add       r7, r0, r5              @ Get current destination item's address in memory
+        vstr      s10, [r7, #0]           @ Store the result back into the main memory
+        add       r5, r5, #4              @ increase the offset by 1*sizeof(float)
+        subs      r3, r3, #1              @ decrement the loop counter
+        bne        .LoopBeginFloat        @ Continue until the counter in r3 reaches zero
+
+.LoopEndFloat:
+        mov     r0, NE10_OK             @ Return NE10_OK
+        pop     {r4, r5, r6, r7}
+        bx      lr
+
+
+
+
+        .balign   4
+        .global   rsbc_vec2f_asm
+        .thumb
+        .thumb_func
+
+rsbc_vec2f_asm:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @ Reverse subtraction by a constant vector: dst[i] = *cst - src[i] for 'count' 2D vectors
+        @ arm_result_t rsbc_vec2f(arm_vec2f_t * dst,
+        @                 arm_vec2f_t * src, const arm_vec2f_t * cst,
+        @                 unsigned int count)
+        @
+        @  r0: *dst
+        @  r1: *src
+        @  r2: *cst
+        @  r3: int count
+        @
+        @  r3: loop counter
+        @  r5: current item's offset in both src[] and dst[]
+        @  r6: current source item's address made of base(r1)+offset(r5)
+        @  r7: current destination item's address made of base(r0)+offset(r5)
+        @
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        push    {r4, r5, r6, r7}        @ r5-r7 are used below; r4 is saved too although it is not used here
+        cbz     r3, .LoopEndVec2F       @ nothing to do when count == 0
+        mov     r5, #0
+
+.LoopBeginVec2F:
+        add       r6, r1, r5              @ Get current source item's address in memory
+        vldr      s1, [r6, #0]            @ Load src[i].x and src[i].y
+        vldr      s2, [r6, #4]
+        vldr      s3, [r2, #0]            @ Load cst->x and cst->y (reloaded on every iteration)
+        vldr      s4, [r2, #4]
+        vsub.f32  s10, s3, s1             @ s10 = cst->x - src[i].x
+        vsub.f32  s11, s4, s2             @ s11 = cst->y - src[i].y
+        add       r7, r0, r5              @ Get current destination item's address in memory
+        vstr      s10, [r7, #0]           @ Store the results back into the main memory
+        vstr      s11, [r7, #4]
+        add       r5, r5, #8              @ increase the offset by 2*sizeof(float) @@ (for x and y)
+        subs      r3, r3, #1              @ decrement the loop counter
+        bne        .LoopBeginVec2F        @ Continue until the counter in r3 reaches zero
+
+.LoopEndVec2F:
+        mov     r0, NE10_OK             @ Return NE10_OK
+        pop     {r4, r5, r6, r7}
+        bx      lr
+
+
+
+
+        .balign   4
+        .global   rsbc_vec3f_asm
+        .thumb
+        .thumb_func
+
+rsbc_vec3f_asm:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @ Reverse subtraction by a constant vector: dst[i] = *cst - src[i] for 'count' 3D vectors
+        @ arm_result_t rsbc_vec3f(arm_vec3f_t * dst,
+        @                 arm_vec3f_t * src, const arm_vec3f_t * cst,
+        @                 unsigned int count)
+        @
+        @  r0: *dst
+        @  r1: *src
+        @  r2: *cst
+        @  r3: int count
+        @
+        @  r3: loop counter
+        @  r5: current item's offset in both src[] and dst[]
+        @  r6: current source item's address made of base(r1)+offset(r5)
+        @  r7: current destination item's address made of base(r0)+offset(r5)
+        @
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        push    {r4, r5, r6, r7}        @ r5-r7 are used below; r4 is saved too although it is not used here
+        cbz     r3, .LoopEndVec3F       @ nothing to do when count == 0
+        mov     r5, #0
+
+.LoopBeginVec3F:
+
+        add       r6, r1, r5              @ Get current source item's address in memory
+        vldr      s1, [r6, #0]            @ Load src[i].x, src[i].y , and src[i].z
+        vldr      s2, [r6, #4]
+        vldr      s3, [r6, #8]
+        vldr      s4, [r2, #0]            @ Load cst->x, cst->y, and cst->z (reloaded on every iteration)
+        vldr      s5, [r2, #4]
+        vldr      s6, [r2, #8]
+        vsub.f32  s10, s4, s1             @ s10 = cst->x - src[i].x
+        vsub.f32  s11, s5, s2             @ s11 = cst->y - src[i].y
+        vsub.f32  s12, s6, s3             @ s12 = cst->z - src[i].z
+        add       r7, r0, r5              @ Get current destination item's address in memory
+        vstr      s10, [r7, #0]           @ Store the results back into the main memory
+        vstr      s11, [r7, #4]
+        vstr      s12, [r7, #8]
+        add       r5, r5, #12             @ increase the offset by 3*sizeof(float) @@ (for x, y, and z)
+        subs      r3, r3, #1              @ decrement the loop counter
+        bne        .LoopBeginVec3F        @ Continue until the counter in r3 reaches zero
+
+.LoopEndVec3F:
+        mov     r0, NE10_OK             @ Return NE10_OK
+        pop     {r4, r5, r6, r7}
+        bx      lr
+
+
+
+
+        .balign   4
+        .global   rsbc_vec4f_asm
+        .thumb
+        .thumb_func
+
+rsbc_vec4f_asm:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @ Reverse subtraction by a constant vector: dst[i] = *cst - src[i] for 'count' 4D vectors
+        @ arm_result_t rsbc_vec4f(arm_vec4f_t * dst,
+        @                 arm_vec4f_t * src, const arm_vec4f_t * cst,
+        @                 unsigned int count)
+        @
+        @  r0: *dst
+        @  r1: *src
+        @  r2: *cst
+        @  r3: int count
+        @
+        @  r3: loop counter
+        @  r5: current item's offset in both src[] and dst[]
+        @  r6: current source item's address made of base(r1)+offset(r5)
+        @  r7: current destination item's address made of base(r0)+offset(r5)
+        @
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        push    {r4, r5, r6, r7}        @ r5-r7 are used below; r4 is saved too although it is not used here
+        cbz     r3, .LoopEndVec4F       @ nothing to do when count == 0
+        mov     r5, #0
+
+.LoopBeginVec4F:
+        add       r6, r1, r5              @ Get current source item's address in memory
+        vldr      s1, [r6, #0]            @ Load src[i].x, src[i].y , src[i].z, and w
+        vldr      s2, [r6, #4]
+        vldr      s3, [r6, #8]
+        vldr      s4, [r6, #12]
+        vldr      s5, [r2, #0]            @ Load cst->x, cst->y, cst->z, and w (reloaded on every iteration)
+        vldr      s6, [r2, #4]
+        vldr      s7, [r2, #8]
+        vldr      s8, [r2, #12]
+        vsub.f32  s10, s5, s1             @ s10 = cst->x - src[i].x
+        vsub.f32  s11, s6, s2             @ s11 = cst->y - src[i].y
+        vsub.f32  s12, s7, s3             @ s12 = cst->z - src[i].z
+        vsub.f32  s13, s8, s4             @ s13 = cst->w - src[i].w
+        add       r7, r0, r5              @ Get current destination item's address in memory
+        vstr      s10, [r7, #0]           @ Store the results back into the main memory
+        vstr      s11, [r7, #4]
+        vstr      s12, [r7, #8]
+        vstr      s13, [r7, #12]
+        add       r5, r5, #16             @ increase the offset by 4*sizeof(float) @@ (for x, y, z, and w)
+        subs      r3, r3, #1              @ decrement the loop counter
+        bne        .LoopBeginVec4F        @ Continue until the counter in r3 reaches zero
+
+.LoopEndVec4F:
+        mov     r0, NE10_OK             @ Return NE10_OK
+        pop     {r4, r5, r6, r7}
+        bx      lr
diff --git a/source/NE10_rsbc.c b/source/NE10_rsbc.c
new file mode 100644
index 0000000..1379585
--- /dev/null
+++ b/source/NE10_rsbc.c
@@ -0,0 +1,62 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : source/NE10_rsbc.c
+ */
+
+#include "NE10.h"
+#include "../headers/macros.h"
+
+#include <assert.h>
+
+arm_result_t rsbc_float_c(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count)
+{
+  /* Reverse subtraction by a scalar: dst[itr] = cst - src[itr]; `itr` is presumably supplied by the macro. */
+  NE10_XC_OPERATION_X_C(
+    *( dst + itr ) = cst - *( src + itr );
+  );
+}
+
+arm_result_t rsbc_vec2f_c(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count)
+{
+  /* Component-wise reverse subtraction: dst[itr] = *cst - src[itr]; `itr` is presumably supplied by the macro. */
+  NE10_XC_OPERATION_X_C(
+    ( dst + itr )->x = cst->x - ( src + itr )->x;
+    ( dst + itr )->y = cst->y - ( src + itr )->y;
+  );
+}
+
+arm_result_t rsbc_vec3f_c(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count)
+{
+  /* Component-wise reverse subtraction: dst[itr] = *cst - src[itr]; `itr` is presumably supplied by the macro. */
+  NE10_XC_OPERATION_X_C(
+    ( dst + itr )->x = cst->x - ( src + itr )->x;
+    ( dst + itr )->y = cst->y - ( src + itr )->y;
+    ( dst + itr )->z = cst->z - ( src + itr )->z;
+  );
+}
+
+arm_result_t rsbc_vec4f_c(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count)
+{
+  /* Component-wise reverse subtraction: dst[itr] = *cst - src[itr]; `itr` is presumably supplied by the macro. */
+  NE10_XC_OPERATION_X_C(
+    ( dst + itr )->x = cst->x - ( src + itr )->x;
+    ( dst + itr )->y = cst->y - ( src + itr )->y;
+    ( dst + itr )->z = cst->z - ( src + itr )->z;
+    ( dst + itr )->w = cst->w - ( src + itr )->w;
+  );
+}
diff --git a/source/NE10_rsbc.neon.c b/source/NE10_rsbc.neon.c
new file mode 100644
index 0000000..b8fa234
--- /dev/null
+++ b/source/NE10_rsbc.neon.c
@@ -0,0 +1,68 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : source/NE10_rsbc.neon.c
+ */
+
+#include "NE10.h"
+#include "../headers/macros.h"
+
+#include <assert.h>
+#include <arm_neon.h>
+
+
+arm_result_t rsbc_float_neon(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count)
+{
+    NE10_XC_OPERATION_FLOAT_NEON /* reverse subtraction (cst - src); the n_* names are not declared here, presumably the macro binds them */
+    (
+        n_dst = vsubq_f32( n_cst, n_src ); /* 1st macro argument: 4-lane (128-bit) subtraction, presumably the main loop body */
+        ,
+        n_tmp_src = vsub_f32( n_tmp_cst, n_tmp_src ); /* 2nd macro argument: 2-lane (64-bit) subtraction written back into n_tmp_src, presumably for leftover items */
+    );
+}
+
+arm_result_t rsbc_vec2f_neon(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count)
+{
+    NE10_XC_OPERATION_VEC2F_NEON /* reverse subtraction (cst - src); the n_* names are not declared here, presumably the macro binds them */
+    (
+       n_dst = vsubq_f32( n_cst, n_src ); /* 1st macro argument: 4-lane (128-bit) subtraction, presumably the main loop body */
+       ,
+       n_tmp_src = vsub_f32( n_tmp_cst, n_tmp_src ); /* 2nd macro argument: 2-lane (64-bit) subtraction written back into n_tmp_src, presumably for leftover items */
+    );
+}
+
+arm_result_t rsbc_vec3f_neon(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count)
+{
+    NE10_XC_OPERATION_VEC3F_NEON /* reverse subtraction (cst - src); the n_* names are not declared here, presumably the macro binds them */
+    (
+        n_dst1 = vsubq_f32( n_cst1, n_src1 ); /* 1st macro argument: three 4-lane (128-bit) subtractions, presumably the main loop body */
+        n_dst2 = vsubq_f32( n_cst2, n_src2 );
+        n_dst3 = vsubq_f32( n_cst3, n_src3 );
+        ,
+        n_tmp_src.val[0] = vsub_f32( n_tmp_cst.val[0], n_tmp_src.val[0] ); /* 2nd macro argument: three 2-lane (64-bit) subtractions written back into n_tmp_src */
+        n_tmp_src.val[1] = vsub_f32( n_tmp_cst.val[1], n_tmp_src.val[1] );
+        n_tmp_src.val[2] = vsub_f32( n_tmp_cst.val[2], n_tmp_src.val[2] );
+     );
+}
+
+arm_result_t rsbc_vec4f_neon(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count)
+{
+    NE10_XC_OPERATION_VEC4F_NEON /* reverse subtraction (cst - src); the n_* names are not declared here, presumably the macro binds them */
+    (
+        n_dst = vsubq_f32( n_cst, n_src ); /* single macro argument: one 4-lane (128-bit) subtraction */
+    );
+}
diff --git a/source/NE10_rsbc_test.c b/source/NE10_rsbc_test.c
new file mode 100644
index 0000000..d8ba2b2
--- /dev/null
+++ b/source/NE10_rsbc_test.c
@@ -0,0 +1,75 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : source/NE10_rsbc_test.c
+ */
+
+//Make sure the following are defined before including the unit-test header below
+
+// length of the data arrays
+#define ARRLEN TEST_ARRLEN
+// number of the operations in a given unit
+#define OP_COUNT 4
+// number of the different implementations of each of the functions (C, ASM, NEON, ...)
+#define IMPL_COUNT 3
+
+
+#include "../headers/unit_test_xc_operation_x.h"
+
+
+extern arm_result_t rsbc_float_c    (arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count);
+extern arm_result_t rsbc_float_asm  (arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count);
+extern arm_result_t rsbc_float_neon (arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count);
+
+extern arm_result_t rsbc_vec2f_c   (arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count);
+extern arm_result_t rsbc_vec2f_asm (arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count);
+extern arm_result_t rsbc_vec2f_neon(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count);
+
+extern arm_result_t rsbc_vec3f_c   (arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count);
+extern arm_result_t rsbc_vec3f_asm (arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count);
+extern arm_result_t rsbc_vec3f_neon(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count);
+
+extern arm_result_t rsbc_vec4f_c   (arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count);
+extern arm_result_t rsbc_vec4f_asm (arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count);
+extern arm_result_t rsbc_vec4f_neon(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count);
+
+
+void init_ftbl()
+{
+   // Fill the global function table slot by slot: C, ASM, then NEON for each operation.
+   unsigned int slot = 0;
+   ftbl[slot++] = (arm_func_4args_t) rsbc_float_c;
+   ftbl[slot++] = (arm_func_4args_t) rsbc_float_asm;
+   ftbl[slot++] = (arm_func_4args_t) rsbc_float_neon;
+
+   ftbl[slot++] = (arm_func_4args_t) rsbc_vec2f_c;
+   ftbl[slot++] = (arm_func_4args_t) rsbc_vec2f_asm;
+   ftbl[slot++] = (arm_func_4args_t) rsbc_vec2f_neon;
+
+   ftbl[slot++] = (arm_func_4args_t) rsbc_vec3f_c;
+   ftbl[slot++] = (arm_func_4args_t) rsbc_vec3f_asm;
+   ftbl[slot++] = (arm_func_4args_t) rsbc_vec3f_neon;
+
+   ftbl[slot++] = (arm_func_4args_t) rsbc_vec4f_c;
+   ftbl[slot++] = (arm_func_4args_t) rsbc_vec4f_asm;
+   ftbl[slot++] = (arm_func_4args_t) rsbc_vec4f_neon;
+}
+
+int main( int argc, char **argv ) // hosted C requires main to return int, not a project typedef
+{
+   return run_test( argc, argv ); // NOTE(review): run_test is presumably supplied by the unit-test header included above
+}
diff --git a/source/NE10_setc.asm.s b/source/NE10_setc.asm.s
new file mode 100644
index 0000000..fa8f695
--- /dev/null
+++ b/source/NE10_setc.asm.s
@@ -0,0 +1,178 @@
+@
+@  Copyright 2011-12 ARM Limited
+@
+@  Licensed under the Apache License, Version 2.0 (the "License");
+@  you may not use this file except in compliance with the License.
+@  You may obtain a copy of the License at
+@
+@      http://www.apache.org/licenses/LICENSE-2.0
+@
+@  Unless required by applicable law or agreed to in writing, software
+@  distributed under the License is distributed on an "AS IS" BASIS,
+@  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@  See the License for the specific language governing permissions and
+@  limitations under the License.
+@
+
+@
+@ NE10 Library : source/NE10_setc.asm.s
+@
+
+        .text
+        .syntax   unified
+
+.include "headers/NE10header.s"
+
+        .balign   4
+        .global   setc_float_asm
+        .thumb
+        .thumb_func
+
+setc_float_asm:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ arm_result_t setc_float(arm_float_t * dst,
+        @                 const arm_float_t cst,
+        @                 unsigned int count)
+        @
+        @  r0: *dst, post-incremented by 4 bytes per store
+        @  r1: cst (32-bit value stored unchanged into every element)
+        @  r2: unsigned int count
+        @
+        @  r2: loop counter (counts down to zero)
+        @
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        cbz     r2, .LoopEndFloat       @ Nothing to store when count == 0
+
+.LoopBeginFloat:
+        str       r1, [r0], #4            @ *dst = cst, then advance dst by 4 bytes
+        subs      r2, r2, #1              @ One fewer element left (sets the flags for bne)
+        bne        .LoopBeginFloat        @ Repeat until the counter reaches zero
+
+.LoopEndFloat:
+        mov     r0, NE10_OK             @ Return NE10_OK
+        bx      lr
+
+
+
+
+        .balign   4
+        .global   setc_vec2f_asm
+        .thumb
+        .thumb_func
+
+setc_vec2f_asm:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ arm_result_t setc_vec2f(arm_vec2f_t * dst,
+        @                 const arm_vec2f_t * cst,
+        @                 unsigned int count)
+        @
+        @  r0: *dst, post-incremented by 4 bytes per store
+        @  r1: *cst
+        @  r2: unsigned int count
+        @
+        @  r2: loop counter (counts down to zero)
+        @
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        push    {r4, r5}                @ Save the registers that will hold cst->x and cst->y
+        cbz     r2, .LoopEndVec2F       @ Nothing to store when count == 0
+        ldr       r4, [r1, #0]            @ Load cst->x into r4
+        ldr       r5, [r1, #4]            @ Load cst->y into r5
+
+.LoopBeginVec2F:
+        str       r4, [r0], #4            @ Store cst->x, then advance dst by 4 bytes
+        str       r5, [r0], #4            @ Store cst->y, then advance dst by 4 bytes
+        subs      r2, r2, #1              @ One fewer element left (sets the flags for bne)
+        bne        .LoopBeginVec2F        @ Repeat until the counter reaches zero
+
+.LoopEndVec2F:
+        mov     r0, NE10_OK             @ Return NE10_OK
+        pop     {r4, r5}                @ Restore r4/r5 (also reached on the count == 0 path)
+        bx      lr
+
+
+
+
+        .balign   4
+        .global   setc_vec3f_asm
+        .thumb
+        .thumb_func
+
+setc_vec3f_asm:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ arm_result_t setc_vec3f(arm_vec3f_t * dst,
+        @                 const arm_vec3f_t * cst,
+        @                 unsigned int count)
+        @
+        @  r0: *dst, post-incremented by 4 bytes per store
+        @  r1: *cst
+        @  r2: unsigned int count
+        @
+        @  r2: loop counter (counts down to zero)
+        @
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        push    {r4, r5, r6}            @ Save the registers that will hold cst->x, cst->y and cst->z
+        cbz     r2, .LoopEndVec3F       @ Nothing to store when count == 0
+        ldr       r4, [r1, #0]            @ Load cst->x into r4
+        ldr       r5, [r1, #4]            @ Load cst->y into r5
+        ldr       r6, [r1, #8]            @ Load cst->z into r6
+
+.LoopBeginVec3F:
+        str       r4, [r0], #4            @ Store cst->x, then advance dst by 4 bytes
+        str       r5, [r0], #4            @ Store cst->y, then advance dst by 4 bytes
+        str       r6, [r0], #4            @ Store cst->z, then advance dst by 4 bytes
+        subs      r2, r2, #1              @ One fewer element left (sets the flags for bne)
+        bne        .LoopBeginVec3F        @ Repeat until the counter reaches zero
+
+.LoopEndVec3F:
+        mov     r0, NE10_OK             @ Return NE10_OK
+        pop     {r4, r5, r6}            @ Restore r4-r6 (also reached on the count == 0 path)
+        bx      lr
+
+
+
+
+        .balign   4
+        .global   setc_vec4f_asm
+        .thumb
+        .thumb_func
+
+setc_vec4f_asm:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ arm_result_t setc_vec4f(arm_vec4f_t * dst,
+        @                 const arm_vec4f_t * cst,
+        @                 unsigned int count)
+        @
+        @  r0: *dst, post-incremented by 4 bytes per store
+        @  r1: *cst
+        @  r2: unsigned int count
+        @
+        @  r2: loop counter (counts down to zero)
+        @
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        push    {r4, r5, r6, r7}        @ Save the registers that will hold the four cst components
+        cbz     r2, .LoopEndVec4F       @ Nothing to store when count == 0
+        ldr       r4, [r1, #0]            @ Load cst->x into r4
+        ldr       r5, [r1, #4]            @ Load cst->y into r5
+        ldr       r6, [r1, #8]            @ Load cst->z into r6
+        ldr       r7, [r1, #12]           @ Load cst->w into r7
+
+.LoopBeginVec4F:
+        str       r4, [r0], #4            @ Store cst->x, then advance dst by 4 bytes
+        str       r5, [r0], #4            @ Store cst->y, then advance dst by 4 bytes
+        str       r6, [r0], #4            @ Store cst->z, then advance dst by 4 bytes
+        str       r7, [r0], #4            @ Store cst->w, then advance dst by 4 bytes
+        subs      r2, r2, #1              @ One fewer element left (sets the flags for bne)
+        bne        .LoopBeginVec4F        @ Repeat until the counter reaches zero
+
+.LoopEndVec4F:
+        mov     r0, NE10_OK             @ Return NE10_OK
+        pop     {r4, r5, r6, r7}        @ Restore r4-r7 (also reached on the count == 0 path)
+        bx      lr
diff --git a/source/NE10_setc.c b/source/NE10_setc.c
new file mode 100644
index 0000000..473675d
--- /dev/null
+++ b/source/NE10_setc.c
@@ -0,0 +1,62 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : source/NE10_setc.c
+ */
+
+#include "NE10.h"
+#include "../headers/macros.h"
+
+#include <assert.h>
+
+arm_result_t setc_float_c(arm_float_t * dst, const arm_float_t cst, unsigned int count)
+{ // Plain-C setc for floats. NOTE(review): itr, the loop over count and the return value presumably come from the macro - confirm in headers/macros.h.
+  NE10_SETC_OPERATION_X_C
+  (
+    dst[itr] = cst; // macro argument: store the scalar constant into element itr
+  );
+}
+
+arm_result_t setc_vec2f_c(arm_vec2f_t * dst, const arm_vec2f_t * cst, unsigned int count)
+{ // Plain-C setc for 2D vectors. NOTE(review): itr, the loop over count and the return value presumably come from the macro - confirm in headers/macros.h.
+  NE10_SETC_OPERATION_X_C
+  (
+    dst[itr].x = cst->x; // macro argument: copy x and y of *cst into element itr
+    dst[itr].y = cst->y;
+  );
+}
+
+arm_result_t setc_vec3f_c(arm_vec3f_t * dst, const arm_vec3f_t * cst, unsigned int count)
+{ // Plain-C setc for 3D vectors. NOTE(review): itr, the loop over count and the return value presumably come from the macro - confirm in headers/macros.h.
+  NE10_SETC_OPERATION_X_C
+  (
+    dst[itr].x = cst->x; // macro argument: copy x, y and z of *cst into element itr
+    dst[itr].y = cst->y;
+    dst[itr].z = cst->z;
+  );
+}
+
+arm_result_t setc_vec4f_c(arm_vec4f_t * dst, const arm_vec4f_t * cst, unsigned int count)
+{ // Plain-C setc for 4D vectors. NOTE(review): itr, the loop over count and the return value presumably come from the macro - confirm in headers/macros.h.
+  NE10_SETC_OPERATION_X_C
+  (
+    dst[itr].x = cst->x; // macro argument: copy x, y, z and w of *cst into element itr
+    dst[itr].y = cst->y;
+    dst[itr].z = cst->z;
+    dst[itr].w = cst->w;
+  );
+}
diff --git a/source/NE10_setc.neon.c b/source/NE10_setc.neon.c
new file mode 100644
index 0000000..d1ac583
--- /dev/null
+++ b/source/NE10_setc.neon.c
@@ -0,0 +1,64 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : source/NE10_setc.neon.c
+ */
+
+#include "NE10.h"
+#include "../headers/macros.h"
+
+#include <assert.h>
+#include <arm_neon.h>
+
+
+arm_result_t setc_float_neon(arm_float_t * dst, const arm_float_t cst, unsigned int count)
+{   // NEON setc for floats. NOTE(review): the vector setup, stores and return value presumably come from the macro - confirm in headers/macros.h.
+    NE10_SETC_OPERATION_FLOAT_NEON
+    (
+        ;// first macro argument: empty statement, the constant is used unmodified
+        ,
+        ;// second macro argument: empty statement, n_tmp_cst is likewise left unmodified
+    );
+}
+
+arm_result_t setc_vec2f_neon(arm_vec2f_t * dst, const arm_vec2f_t * cst, unsigned int count)
+{   // NEON setc for 2D vectors. NOTE(review): the vector setup, stores and return value presumably come from the macro - confirm in headers/macros.h.
+    NE10_SETC_OPERATION_VEC2F_NEON
+    (
+        ;// first macro argument: empty statement, the constant is used unmodified
+        ,
+        ;// second macro argument: empty statement, n_tmp_cst is likewise left unmodified
+    );
+}
+
+arm_result_t setc_vec3f_neon(arm_vec3f_t * dst, const arm_vec3f_t * cst, unsigned int count)
+{   // NEON setc for 3D vectors. NOTE(review): the vector setup, stores and return value presumably come from the macro - confirm in headers/macros.h.
+    NE10_SETC_OPERATION_VEC3F_NEON
+    (
+        ;// first macro argument: empty statement, cst1, cst2 and cst3 are used unmodified
+        ,
+        ;// second macro argument: empty statement, n_tmp_cst.val[0], .val[1] and .val[2] are used unmodified
+     );
+}
+
+arm_result_t setc_vec4f_neon(arm_vec4f_t * dst, const arm_vec4f_t * cst, unsigned int count)
+{   // NEON setc for 4D vectors. NOTE(review): the vector setup, stores and return value presumably come from the macro - confirm in headers/macros.h.
+    NE10_SETC_OPERATION_VEC4F_NEON
+    (
+        ;// only macro argument: empty statement, n_cst is used unmodified
+    );
+}
diff --git a/source/NE10_setc_test.c b/source/NE10_setc_test.c
new file mode 100644
index 0000000..74698d1
--- /dev/null
+++ b/source/NE10_setc_test.c
@@ -0,0 +1,75 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : source/NE10_setc_test.c
+ */
+
+//Make sure the following are defined before including the unit-test header below
+
+// length of the data arrays
+#define ARRLEN TEST_ARRLEN
+// number of the operations in a given unit
+#define OP_COUNT 4
+// number of the different implementations of each of the functions (C, ASM, NEON, ...)
+#define IMPL_COUNT 3
+
+
+#include "../headers/unit_test_setc_operation_x.h"
+
+
+extern arm_result_t setc_float_c    (arm_float_t * dst, const arm_float_t cst, unsigned int count);
+extern arm_result_t setc_float_asm  (arm_float_t * dst, const arm_float_t cst, unsigned int count);
+extern arm_result_t setc_float_neon (arm_float_t * dst, const arm_float_t cst, unsigned int count);
+
+extern arm_result_t setc_vec2f_c   (arm_vec2f_t * dst, const arm_vec2f_t * cst, unsigned int count);
+extern arm_result_t setc_vec2f_asm (arm_vec2f_t * dst, const arm_vec2f_t * cst, unsigned int count);
+extern arm_result_t setc_vec2f_neon(arm_vec2f_t * dst, const arm_vec2f_t * cst, unsigned int count);
+
+extern arm_result_t setc_vec3f_c   (arm_vec3f_t * dst, const arm_vec3f_t * cst, unsigned int count);
+extern arm_result_t setc_vec3f_asm (arm_vec3f_t * dst, const arm_vec3f_t * cst, unsigned int count);
+extern arm_result_t setc_vec3f_neon(arm_vec3f_t * dst, const arm_vec3f_t * cst, unsigned int count);
+
+extern arm_result_t setc_vec4f_c   (arm_vec4f_t * dst, const arm_vec4f_t * cst, unsigned int count);
+extern arm_result_t setc_vec4f_asm (arm_vec4f_t * dst, const arm_vec4f_t * cst, unsigned int count);
+extern arm_result_t setc_vec4f_neon(arm_vec4f_t * dst, const arm_vec4f_t * cst, unsigned int count);
+
+
+void init_ftbl()
+{
+   // Fill the global function table slot by slot: C, ASM, then NEON for each operation.
+   unsigned int slot = 0;
+   ftbl[slot++] = (arm_func_3args_t) setc_float_c;
+   ftbl[slot++] = (arm_func_3args_t) setc_float_asm;
+   ftbl[slot++] = (arm_func_3args_t) setc_float_neon;
+
+   ftbl[slot++] = (arm_func_3args_t) setc_vec2f_c;
+   ftbl[slot++] = (arm_func_3args_t) setc_vec2f_asm;
+   ftbl[slot++] = (arm_func_3args_t) setc_vec2f_neon;
+
+   ftbl[slot++] = (arm_func_3args_t) setc_vec3f_c;
+   ftbl[slot++] = (arm_func_3args_t) setc_vec3f_asm;
+   ftbl[slot++] = (arm_func_3args_t) setc_vec3f_neon;
+
+   ftbl[slot++] = (arm_func_3args_t) setc_vec4f_c;
+   ftbl[slot++] = (arm_func_3args_t) setc_vec4f_asm;
+   ftbl[slot++] = (arm_func_3args_t) setc_vec4f_neon;
+}
+
+int main( int argc, char **argv ) // hosted C requires main to return int, not a project typedef
+{
+   return run_test( argc, argv ); // NOTE(review): run_test is presumably supplied by the unit-test header included above
+}
diff --git a/source/NE10_sub.asm.s b/source/NE10_sub.asm.s
new file mode 100644
index 0000000..2fd119b
--- /dev/null
+++ b/source/NE10_sub.asm.s
@@ -0,0 +1,61 @@
+@
+@  Copyright 2011-12 ARM Limited
+@
+@  Licensed under the Apache License, Version 2.0 (the "License");
+@  you may not use this file except in compliance with the License.
+@  You may obtain a copy of the License at
+@
+@      http://www.apache.org/licenses/LICENSE-2.0
+@
+@  Unless required by applicable law or agreed to in writing, software
+@  distributed under the License is distributed on an "AS IS" BASIS,
+@  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@  See the License for the specific language governing permissions and
+@  limitations under the License.
+@
+
+@
+@ NE10 Library : source/NE10_sub.asm.s
+@
+
+        .text
+        .syntax   unified
+
+.include "headers/NE10header.s"
+
+        .balign   4
+        .global   sub_float_asm
+        .thumb
+        .thumb_func
+
+sub_float_asm:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ arm_result_t sub_float(arm_float_t * dst,
+        @                 arm_float_t * src1, arm_float_t * src2,
+        @                 unsigned int count)
+        @
+        @  r0: *dst  - advanced by 4 bytes after every store
+        @  r1: *src1 - advanced by 4 bytes after every load
+        @  r2: *src2 - advanced by 4 bytes after every load
+        @  r3: unsigned int count
+        @
+        @  r3: loop counter (counts down to zero)
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        cbz     r3, .LoopEndFloat       @ Nothing to compute when count == 0
+
+.LoopBeginFloat:
+        vldr      s1, [r1]                @ Load s1 = src1[i]
+        add       r1, r1, #4              @ Advance src1 to the next float
+        vldr      s2, [r2]                @ Load s2 = src2[i]
+        add       r2, r2, #4              @ Advance src2 to the next float
+        vsub.f32  s10, s1, s2             @ s10 = src1[i] - src2[i]
+        vstr      s10, [r0]               @ dst[i] = s10
+        add       r0, r0, #4              @ Advance dst to the next float
+        subs      r3, r3, #1              @ One fewer element left (sets the flags for bne)
+        bne        .LoopBeginFloat        @ Repeat until the counter reaches zero
+
+.LoopEndFloat:
+        mov     r0, NE10_OK             @ Return NE10_OK
+        bx      lr
diff --git a/source/NE10_sub.c b/source/NE10_sub.c
new file mode 100644
index 0000000..ad3131d
--- /dev/null
+++ b/source/NE10_sub.c
@@ -0,0 +1,32 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : source/NE10_sub.c
+ */
+
+#include "NE10.h"
+#include "../headers/macros.h"
+
+#include <assert.h>
+
+arm_result_t sub_float_c(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count)
+{ // Plain-C element-wise subtraction. NOTE(review): itr, the loop over count and the return value presumably come from the macro - confirm in headers/macros.h.
+  NE10_X_OPERATION_FLOAT_C
+  (
+    dst[ itr ] = src1[ itr ] - src2[ itr ]; // macro argument: element itr of dst is src1 minus src2
+  );
+}
diff --git a/source/NE10_sub.neon.c b/source/NE10_sub.neon.c
new file mode 100644
index 0000000..52d29ff
--- /dev/null
+++ b/source/NE10_sub.neon.c
@@ -0,0 +1,35 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : source/NE10_sub.neon.c
+ */
+
+#include "NE10.h"
+#include "../headers/macros.h"
+
+#include <assert.h>
+#include <arm_neon.h>
+
+arm_result_t sub_float_neon(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count)
+{   // NEON element-wise subtraction (src1 - src2). NOTE(review): the loop, loads/stores and return value presumably come from the macro - confirm in headers/macros.h.
+    NE10_X_OPERATION_FLOAT_NEON
+    (
+        n_dst = vsubq_f32( n_src , n_src2 ); // first macro argument: n_dst = n_src - n_src2 on a four-float (q) vector
+        ,
+        n_tmp_src = vsub_f32( n_tmp_src, n_tmp_src2 ); // second macro argument: same subtraction on a two-float vector, written back over n_tmp_src
+    );
+}
diff --git a/source/NE10_sub_test.c b/source/NE10_sub_test.c
new file mode 100644
index 0000000..a1c6d86
--- /dev/null
+++ b/source/NE10_sub_test.c
@@ -0,0 +1,51 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : source/NE10_sub_test.c
+ */
+
+//Make sure the following are defined before including the unit-test header below
+
+// length of the data arrays
+#define ARRLEN TEST_ARRLEN
+// number of the operations in a given unit
+#define OP_COUNT 1
+// number of the different implementations of each of the functions (C, ASM, NEON, ...)
+#define IMPL_COUNT 3
+
+
+#include "../headers/unit_test_x_operation_x.h"
+
+
+extern arm_result_t sub_float_c    (arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count);
+extern arm_result_t sub_float_asm  (arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count);
+extern arm_result_t sub_float_neon (arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count);
+
+
+void init_ftbl()
+{
+   // Fill the global function table slot by slot: C, ASM, then NEON.
+   unsigned int slot = 0;
+   ftbl[slot++] = (arm_func_4args_t) sub_float_c;
+   ftbl[slot++] = (arm_func_4args_t) sub_float_asm;
+   ftbl[slot++] = (arm_func_4args_t) sub_float_neon;
+}
+
+int main( int argc, char **argv ) // hosted C requires main to return int, not a project typedef
+{
+   return run_test( argc, argv ); // NOTE(review): run_test is presumably supplied by the unit-test header included above
+}
diff --git a/source/NE10_subc.asm.s b/source/NE10_subc.asm.s
new file mode 100644
index 0000000..408cd87
--- /dev/null
+++ b/source/NE10_subc.asm.s
@@ -0,0 +1,233 @@
+@
+@  Copyright 2011-12 ARM Limited
+@
+@  Licensed under the Apache License, Version 2.0 (the "License");
+@  you may not use this file except in compliance with the License.
+@  You may obtain a copy of the License at
+@
+@      http://www.apache.org/licenses/LICENSE-2.0
+@
+@  Unless required by applicable law or agreed to in writing, software
+@  distributed under the License is distributed on an "AS IS" BASIS,
+@  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@  See the License for the specific language governing permissions and
+@  limitations under the License.
+@
+
+@
+@ NE10 Library : source/NE10_subc.asm.s
+@
+
+        .text
+        .syntax   unified
+
+.include "headers/NE10header.s"
+
+        .balign   4
+        .global   subc_float_asm
+        .thumb
+        .thumb_func
+
+subc_float_asm:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ arm_result_t subc_float(arm_float_t * dst,
+        @                 arm_float_t * src, const arm_float_t cst,
+        @                 unsigned int count)
+        @
+        @  r0: *dst
+        @  r1: *src
+        @  r2: cst (32-bit value, moved into s3 inside the loop)
+        @  r3: unsigned int count
+        @
+        @  r3: loop counter (counts down to zero)
+        @  r5: current item's byte offset in both src[] and dst[]
+        @  r6: current source item's address made of base(r1)+offset(r5)
+        @  r7: current destination item's address made of base(r0)+offset(r5)
+        @
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        push    {r4, r5, r6, r7}        @ Save scratch registers (r4 is saved but not used below)
+        cbz     r3, .LoopEndFloat       @ Nothing to compute when count == 0
+        mov     r5, #0                  @ Start at byte offset 0
+
+.LoopBeginFloat:
+        add       r6, r1, r5              @ Get current source item's address in memory
+        vldr      s1, [r6, #0]            @ Load src[i]
+        vmov      s3, r2                  @ Copy cst from r2 into s3 (repeated every iteration)
+        vsub.f32  s10, s1, s3             @ s10 = src[i] - cst
+        add       r7, r0, r5              @ Get current destination item's address in memory
+        vstr      s10, [r7, #0]           @ dst[i] = s10
+        add       r5, r5, #4              @ Increase the offset by 1*sizeof(float)
+        subs      r3, r3, #1              @ One fewer element left (sets the flags for bne)
+        bne        .LoopBeginFloat        @ Repeat until the counter reaches zero
+
+.LoopEndFloat:
+        mov     r0, NE10_OK             @ Return NE10_OK
+        pop     {r4, r5, r6, r7}        @ Restore the saved registers (also on the count == 0 path)
+        bx      lr
+
+
+
+
+        .balign   4
+        .global   subc_vec2f_asm
+        .thumb
+        .thumb_func
+
+subc_vec2f_asm:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ arm_result_t subc_vec2f(arm_vec2f_t * dst,
+        @                 arm_vec2f_t * src, const arm_vec2f_t * cst,
+        @                 unsigned int count)
+        @
+        @  r0: *dst
+        @  r1: *src
+        @  r2: *cst
+        @  r3: unsigned int count
+        @
+        @  r3: loop counter (counts down to zero)
+        @  r5: current item's byte offset in both src[] and dst[]
+        @  r6: current source item's address made of base(r1)+offset(r5)
+        @  r7: current destination item's address made of base(r0)+offset(r5)
+        @
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        push    {r4, r5, r6, r7}        @ Save scratch registers (r4 is saved but not used below)
+        cbz     r3, .LoopEndVec2F       @ Nothing to compute when count == 0
+        mov     r5, #0                  @ Start at byte offset 0
+
+.LoopBeginVec2F:
+        add       r6, r1, r5              @ Get current source item's address in memory
+        vldr      s1, [r6, #0]            @ Load src[i].x and src[i].y
+        vldr      s2, [r6, #4]
+        vldr      s3, [r2, #0]            @ Load cst->x and cst->y (reloaded every iteration)
+        vldr      s4, [r2, #4]
+        vsub.f32  s10, s1, s3             @ s10 = src[i].x - cst->x
+        vsub.f32  s11, s2, s4             @ s11 = src[i].y - cst->y
+        add       r7, r0, r5              @ Get current destination item's address in memory
+        vstr      s10, [r7, #0]           @ Store the results into dst[i].x and dst[i].y
+        vstr      s11, [r7, #4]
+        add       r5, r5, #8              @ Increase the offset by 2*sizeof(float) (x and y)
+        subs      r3, r3, #1              @ One fewer element left (sets the flags for bne)
+        bne        .LoopBeginVec2F        @ Repeat until the counter reaches zero
+
+.LoopEndVec2F:
+        mov     r0, NE10_OK             @ Return NE10_OK
+        pop     {r4, r5, r6, r7}        @ Restore the saved registers (also on the count == 0 path)
+        bx      lr
+
+
+
+
+        .balign   4
+        .global   subc_vec3f_asm
+        .thumb
+        .thumb_func
+
+subc_vec3f_asm:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ arm_result_t subc_vec3f(arm_vec3f_t * dst,
+        @                 arm_vec3f_t * src, const arm_vec3f_t * cst,
+        @                 unsigned int count)
+        @
+        @  r0: *dst
+        @  r1: *src
+        @  r2: *cst
+        @  r3: unsigned int count
+        @
+        @  r3: loop counter (counts down to zero)
+        @  r5: current item's byte offset in both src[] and dst[]
+        @  r6: current source item's address made of base(r1)+offset(r5)
+        @  r7: current destination item's address made of base(r0)+offset(r5)
+        @
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        push    {r4, r5, r6, r7}        @ Save scratch registers (r4 is saved but not used below)
+        cbz     r3, .LoopEndVec3F       @ Nothing to compute when count == 0
+        mov     r5, #0                  @ Start at byte offset 0
+
+.LoopBeginVec3F:
+        add       r6, r1, r5              @ Get current source item's address in memory
+        vldr      s1, [r6, #0]            @ Load src[i].x, src[i].y , and src[i].z
+        vldr      s2, [r6, #4]
+        vldr      s3, [r6, #8]
+        vldr      s4, [r2, #0]            @ Load cst->x, cst->y, and cst->z (reloaded every iteration)
+        vldr      s5, [r2, #4]
+        vldr      s6, [r2, #8]
+        vsub.f32  s10, s1, s4             @ s10 = src[i].x - cst->x
+        vsub.f32  s11, s2, s5             @ s11 = src[i].y - cst->y
+        vsub.f32  s12, s3, s6             @ s12 = src[i].z - cst->z
+        add       r7, r0, r5              @ Get current destination item's address in memory
+        vstr      s10, [r7, #0]           @ Store the results into dst[i].x, .y and .z
+        vstr      s11, [r7, #4]
+        vstr      s12, [r7, #8]
+        add       r5, r5, #12             @ Increase the offset by 3*sizeof(float) (x, y, and z)
+        subs      r3, r3, #1              @ One fewer element left (sets the flags for bne)
+        bne        .LoopBeginVec3F        @ Repeat until the counter reaches zero
+
+.LoopEndVec3F:
+        mov     r0, NE10_OK             @ Return NE10_OK
+        pop     {r4, r5, r6, r7}        @ Restore the saved registers (also on the count == 0 path)
+        bx      lr
+
+
+
+
+        .balign   4
+        .global   subc_vec4f_asm
+        .thumb
+        .thumb_func
+
+subc_vec4f_asm:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ arm_result_t subc_vec4f(arm_vec4f_t * dst,
+        @                 arm_vec4f_t * src, const arm_vec4f_t * cst,
+        @                 unsigned int count)
+        @
+        @  r0: *dst
+        @  r1: *src
+        @  r2: *cst
+        @  r3: unsigned int count
+        @
+        @  r3: loop counter (counts down to zero)
+        @  r5: current item's byte offset in both src[] and dst[]
+        @  r6: current source item's address made of base(r1)+offset(r5)
+        @  r7: current destination item's address made of base(r0)+offset(r5)
+        @
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        push    {r4, r5, r6, r7}        @ Save scratch registers (r4 is saved but not used below)
+        cbz     r3, .LoopEndVec4F       @ Nothing to compute when count == 0
+        mov     r5, #0                  @ Start at byte offset 0
+
+.LoopBeginVec4F:
+        add       r6, r1, r5              @ Get current source item's address in memory
+        vldr      s1, [r6, #0]            @ Load src[i].x, src[i].y , src[i].z, and src[i].w
+        vldr      s2, [r6, #4]
+        vldr      s3, [r6, #8]
+        vldr      s4, [r6, #12]
+        vldr      s5, [r2, #0]            @ Load cst->x, cst->y, cst->z, and cst->w (reloaded every iteration)
+        vldr      s6, [r2, #4]
+        vldr      s7, [r2, #8]
+        vldr      s8, [r2, #12]
+        vsub.f32  s10, s1, s5             @ s10 = src[i].x - cst->x
+        vsub.f32  s11, s2, s6             @ s11 = src[i].y - cst->y
+        vsub.f32  s12, s3, s7             @ s12 = src[i].z - cst->z
+        vsub.f32  s13, s4, s8             @ s13 = src[i].w - cst->w
+        add       r7, r0, r5              @ Get current destination item's address in memory
+        vstr      s10, [r7, #0]           @ Store the results into dst[i].x, .y, .z and .w
+        vstr      s11, [r7, #4]
+        vstr      s12, [r7, #8]
+        vstr      s13, [r7, #12]
+        add       r5, r5, #16             @ Increase the offset by 4*sizeof(float) (x, y, z, and w)
+        subs      r3, r3, #1              @ One fewer element left (sets the flags for bne)
+        bne        .LoopBeginVec4F        @ Repeat until the counter reaches zero
+
+.LoopEndVec4F:
+        mov     r0, NE10_OK             @ Return NE10_OK
+        pop     {r4, r5, r6, r7}        @ Restore the saved registers (also on the count == 0 path)
+        bx      lr
diff --git a/source/NE10_subc.c b/source/NE10_subc.c
new file mode 100644
index 0000000..25a46be
--- /dev/null
+++ b/source/NE10_subc.c
@@ -0,0 +1,62 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : source/NE10_subc.c
+ */
+
+#include "NE10.h"
+#include "../headers/macros.h"
+
+#include <assert.h>
+
+arm_result_t subc_float_c(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count)
+{
+  /* dst = src - cst per element; `itr` and the return are not written here, presumably the macro supplies them. */
+  NE10_XC_OPERATION_X_C(
+    dst[itr] = src[itr] - cst;
+  );
+}
+
+arm_result_t subc_vec2f_c(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count)
+{
+  /* Subtract *cst from src component by component (x, y); `itr` is presumably declared by the macro. */
+  NE10_XC_OPERATION_X_C(
+    dst[itr].x = src[itr].x - cst->x;
+    dst[itr].y = src[itr].y - cst->y;
+  );
+}
+
+arm_result_t subc_vec3f_c(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count)
+{
+  /* Subtract *cst from src component by component (x, y, z); `itr` is presumably declared by the macro. */
+  NE10_XC_OPERATION_X_C(
+    dst[itr].x = src[itr].x - cst->x;
+    dst[itr].y = src[itr].y - cst->y;
+    dst[itr].z = src[itr].z - cst->z;
+  );
+}
+
+arm_result_t subc_vec4f_c(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count)
+{
+  /* Subtract *cst from src component by component (x, y, z, w); `itr` is presumably declared by the macro. */
+  NE10_XC_OPERATION_X_C(
+    dst[itr].x = src[itr].x - cst->x;
+    dst[itr].y = src[itr].y - cst->y;
+    dst[itr].z = src[itr].z - cst->z;
+    dst[itr].w = src[itr].w - cst->w;
+  );
+}
diff --git a/source/NE10_subc.neon.c b/source/NE10_subc.neon.c
new file mode 100644
index 0000000..a88d26f
--- /dev/null
+++ b/source/NE10_subc.neon.c
@@ -0,0 +1,68 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : source/NE10_subc.neon.c
+ */
+
+#include "NE10.h"
+#include "../headers/macros.h"
+
+#include <assert.h>
+#include <arm_neon.h>
+
+
+arm_result_t subc_float_neon(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count)
+{   /* NEON subtract-constant for floats. NOTE(review): n_src, n_cst, n_dst and n_tmp_* are not declared here; presumably the macro provides them. */
+    NE10_XC_OPERATION_FLOAT_NEON
+    (
+        n_dst = vsubq_f32( n_src , n_cst );   /* first code argument: 4-lane (float32x4_t) subtract */
+        ,   /* top-level comma: separates the two code arguments passed to the macro */
+        n_tmp_src = vsub_f32( n_tmp_src, n_tmp_cst );   /* second code argument: 2-lane subtract, result written back into n_tmp_src */
+    );
+}
+
+arm_result_t subc_vec2f_neon(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count)
+{   /* NEON subtract-constant for 2D vectors. NOTE(review): n_src, n_cst, n_dst and n_tmp_* are not declared here; presumably the macro provides them. */
+    NE10_XC_OPERATION_VEC2F_NEON
+    (
+       n_dst = vsubq_f32( n_src , n_cst );   /* first code argument: 4-lane (float32x4_t) subtract */
+       ,   /* top-level comma: separates the two code arguments passed to the macro */
+       n_tmp_src = vsub_f32( n_tmp_src, n_tmp_cst );   /* second code argument: 2-lane subtract, result written back into n_tmp_src */
+    );
+}
+
+arm_result_t subc_vec3f_neon(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count)
+{   /* NEON subtract-constant for 3D vectors. NOTE(review): the n_* variables are not declared here; presumably the macro provides them. */
+    NE10_XC_OPERATION_VEC3F_NEON
+    (
+        n_dst1 = vsubq_f32( n_src1 , n_cst1 );   /* first code argument: three independent 4-lane subtracts */
+        n_dst2 = vsubq_f32( n_src2 , n_cst2 );
+        n_dst3 = vsubq_f32( n_src3 , n_cst3 );
+        ,   /* top-level comma: separates the two code arguments passed to the macro */
+        n_tmp_src.val[0] = vsub_f32( n_tmp_src.val[0], n_tmp_cst.val[0] );   /* second code argument: 2-lane subtracts on val[0..2], in place */
+        n_tmp_src.val[1] = vsub_f32( n_tmp_src.val[1], n_tmp_cst.val[1] );
+        n_tmp_src.val[2] = vsub_f32( n_tmp_src.val[2], n_tmp_cst.val[2] );
+     );
+}
+
+arm_result_t subc_vec4f_neon(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count)
+{   /* NEON subtract-constant for 4D vectors. NOTE(review): n_src, n_cst and n_dst are not declared here; presumably the macro provides them. */
+    NE10_XC_OPERATION_VEC4F_NEON
+    (
+        n_dst = vsubq_f32( n_src , n_cst );   /* the only code argument: one 4-lane (float32x4_t) subtract */
+    );
+}
diff --git a/source/NE10_subc_test.c b/source/NE10_subc_test.c
new file mode 100644
index 0000000..d45067b
--- /dev/null
+++ b/source/NE10_subc_test.c
@@ -0,0 +1,75 @@
+/*
+ *  Copyright 2011-12 ARM Limited
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * NE10 Library : source/NE10_subc_test.c
+ */
+
+// The macros below must be defined before "../headers/unit_test_xc_operation_x.h" is included further down.
+
+// length of the data arrays (TEST_ARRLEN is not defined in this file; presumably supplied from outside - TODO confirm)
+#define ARRLEN TEST_ARRLEN
+// number of operations tested by this unit (here: the float, vec2f, vec3f and vec4f variants of subc)
+#define OP_COUNT 4
+// number of different implementations of each operation (here: C, ASM and NEON)
+#define IMPL_COUNT 3
+
+
+#include "../headers/unit_test_xc_operation_x.h"
+
+
+extern arm_result_t subc_float_c    (arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count); // float variants: cst is passed by value
+extern arm_result_t subc_float_asm  (arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count);
+extern arm_result_t subc_float_neon (arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count);
+
+extern arm_result_t subc_vec2f_c   (arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count); // vec2f variants: cst is passed by pointer
+extern arm_result_t subc_vec2f_asm (arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count);
+extern arm_result_t subc_vec2f_neon(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count);
+
+extern arm_result_t subc_vec3f_c   (arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count); // vec3f variants: cst is passed by pointer
+extern arm_result_t subc_vec3f_asm (arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count);
+extern arm_result_t subc_vec3f_neon(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count);
+
+extern arm_result_t subc_vec4f_c   (arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count); // vec4f variants: cst is passed by pointer
+extern arm_result_t subc_vec4f_asm (arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count);
+extern arm_result_t subc_vec4f_neon(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count);
+
+
+void init_ftbl(void)
+{
+   // Fill the global function table: three consecutive slots (C, ASM, NEON - cf. IMPL_COUNT) per operation.
+   //  NOTE(review): ftbl and arm_func_4args_t are not declared in this file; presumably they come from the unit-test header.
+   ftbl[ 0] = (arm_func_4args_t) subc_float_c;     // float variants take cst by value, the vec variants by pointer;
+   ftbl[ 1] = (arm_func_4args_t) subc_float_asm;   //  the cast to one common pointer type hides that difference
+   ftbl[ 2] = (arm_func_4args_t) subc_float_neon;
+
+   ftbl[ 3] = (arm_func_4args_t) subc_vec2f_c;     // 2D vectors
+   ftbl[ 4] = (arm_func_4args_t) subc_vec2f_asm;
+   ftbl[ 5] = (arm_func_4args_t) subc_vec2f_neon;
+
+   ftbl[ 6] = (arm_func_4args_t) subc_vec3f_c;     // 3D vectors
+   ftbl[ 7] = (arm_func_4args_t) subc_vec3f_asm;
+   ftbl[ 8] = (arm_func_4args_t) subc_vec3f_neon;
+
+   ftbl[ 9] = (arm_func_4args_t) subc_vec4f_c;     // 4D vectors
+   ftbl[10] = (arm_func_4args_t) subc_vec4f_asm;
+   ftbl[11] = (arm_func_4args_t) subc_vec4f_neon;
+}
+
+int main( int argc, char **argv ) // test entry point; declared with plain int, the return type the C standard specifies for main
+{
+   return run_test( argc, argv ); // not defined in this file; expected to come from "../headers/unit_test_xc_operation_x.h"
+}
-- 
2.7.4