+++ /dev/null
-LOCAL_PATH:= $(call my-dir)
-
-ne10_neon_source := \
- source/NE10_abs.neon.s \
- source/NE10_addc.neon.c \
- source/NE10_addmat.neon.c \
- source/NE10_add.neon.s \
- source/NE10_cross.neon.s \
- source/NE10_detmat.neon.s \
- source/NE10_divc.neon.c \
- source/NE10_div.neon.s \
- source/NE10_dot.neon.s \
- source/NE10_identitymat.neon.s \
- source/NE10_invmat.neon.s \
- source/NE10_len.neon.s \
- source/NE10_mla.neon.s \
- source/NE10_mlac.neon.c \
- source/NE10_mulcmatvec.neon.s \
- source/NE10_mulc.neon.c \
- source/NE10_mulmat.neon.s \
- source/NE10_mul.neon.c \
- source/NE10_normalize.neon.s \
- source/NE10_rsbc.neon.c \
- source/NE10_setc.neon.c \
- source/NE10_subc.neon.c \
- source/NE10_submat.neon.c \
- source/NE10_sub.neon.s \
- source/NE10_transmat.neon.s \
-
-ne10_source_files := \
- source/NE10_abs.asm.s \
- source/NE10_addc.asm.s \
- source/NE10_addmat.asm.s \
- source/NE10_add.asm.s \
- source/NE10_cross.asm.s \
- source/NE10_detmat.asm.s \
- source/NE10_divc.asm.s \
- source/NE10_div.asm.s \
- source/NE10_dot.asm.s \
- source/NE10_identitymat.asm.s \
- source/NE10_invmat.asm.s \
- source/NE10_len.asm.s \
- source/NE10_mla.asm.s \
- source/NE10_mlac.asm.s \
- source/NE10_mulcmatvec.asm.s \
- source/NE10_mulc.asm.s \
- source/NE10_mulmat.asm.s \
- source/NE10_mul.asm.s \
- source/NE10_normalize.asm.s \
- source/NE10_rsbc.asm.s \
- source/NE10_setc.asm.s \
- source/NE10_subc.asm.s \
- source/NE10_submat.asm.s \
- source/NE10_sub.asm.s \
- source/NE10_transmat.asm.s \
- source/NE10_abs.c \
- source/NE10_addc.c \
- source/NE10_addmat.c \
- source/NE10_add.c \
- source/NE10_cross.c \
- source/NE10_detmat.c \
- source/NE10_divc.c \
- source/NE10_div.c \
- source/NE10_dot.c \
- source/NE10_identitymat.c \
- source/NE10_invmat.c \
- source/NE10_len.c \
- source/NE10_mla.c \
- source/NE10_mlac.c \
- source/NE10_mulcmatvec.c \
- source/NE10_mulc.c \
- source/NE10_mulmat.c \
- source/NE10_mul.c \
- source/NE10_normalize.c \
- source/NE10_rsbc.c \
- source/NE10_setc.c \
- source/NE10_subc.c \
- source/NE10_submat.c \
- source/NE10_sub.c \
- source/NE10_transmat.c \
-
-include $(CLEAR_VARS)
-
-LOCAL_C_INCLUDES := $(LOCAL_PATH)/headers/ \
- $(LOCAL_PATH)/inc
-
-LOCAL_SRC_FILES := \
- $(ne10_source_files)
-
-ifeq ($(ARCH_ARM_HAVE_NEON),true)
-LOCAL_SRC_FILES += $(ne10_neon_source)
-endif
-
-LOCAL_CFLAGS := -D_ARM_ASSEM_
-
-LOCAL_ARM_MODE := arm
-
-LOCAL_MODULE_TAGS := eng
-LOCAL_MODULE := libne10
-
-include $(BUILD_STATIC_LIBRARY)
-
-include $(CLEAR_VARS)
-
-LOCAL_CPP_EXTENSION := .cc
-
-LOCAL_CFLAGS := -D_ARM_ASSEM_
-
-LOCAL_ARM_MODE := arm
-
-LOCAL_C_INCLUDES := $(LOCAL_PATH)/headers/ \
- $(LOCAL_PATH)/inc
-
-LOCAL_SRC_FILES := \
- $(ne10_source_files)
-
-ifeq ($(ARCH_ARM_HAVE_NEON),true)
-LOCAL_SRC_FILES += $(ne10_neon_source)
-endif
-
-LOCAL_MODULE_TAGS := eng
-LOCAL_MODULE := libne10
-
-include $(BUILD_SHARED_LIBRARY)
-
-
-
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : CMakeBuilding.txt
- */
-
-=========================BUILDING METHOD=================================
-
-NE10 uses CMake to describe the build in a platform independent manner.
-
-First download and install cmake from cmake.org.
-In Ubuntu, you can install cmake by "sudo apt-get install cmake"
-
----------------------------NATIVE-COMPILING------------------------------
-For Unix platforms, say the following on a terminal: (Replace $NE10PATH with the directory where this file is located.)
- cd $NE10PATH
- mkdir build && cd build
- cmake ..
- make
-Then the libNE10.a is placed in ./source/ and a test program "NE10_test_static" is placed in ./test/. you can run it.
-You might want to add -DNE10_BUILD_SHARED=ON to the cmake call to generate the dynamic library and test program "NE10_test_dynamic".
-
----------------------------CROSS-COMPILING------------------------------
-For cross-compiling, the process is in the following:
- cd $NE10PATH
-
-Open the config.cmake and change the compiler toolchain to yourself.My toolchain is Linaro GCC 4.6.
-In Ubuntu 11.10 you can install it by "sudo apt-get install gcc-arm-linux-gnueabi".
- set( CMAKE_C_COMPILER arm-linux-gnueabi-gcc )
- set( CMAKE_CXX_COMPILER arm-linux-gnueabi-g++ )
- set( CMAKE_ASM_COMPILER arm-linux-gnueabi-as )
-
-Now you can run cmake to process and generate makefile.
- mkdir build && cd build
- cmake -DCMAKE_TOOLCHAIN_FILE=../config.cmake ..
- make
-
-Then the libNE10.a is placed in ./source/ and a test program "NE10_test_static" is placed in ./test/. you can copy these to the target and run it.
-You might want to add -DNE10_BUILD_SHARED=ON to the cmake call to generate the dynamic library and test program "NE10_test_dynamic".
-
-Note:
-When you run NE10_test_dynamic on the target, you might meet the error:
- "NE10_test_dynamic: error while loading shared libraries: libNE10_shared.so.10: cannot open shared object file: No such file or directory"
-You can run the following command:
- export LD_LIBRARY_PATH=$NE10PATH/build/source
-
---------------------------------END--------------------------------------
option(NE10_BUILD_STATIC "Build NE10 static libraries" ON)
option(NE10_BUILD_EXAMPLES "Build NE10 examples" ON)
+# Unit-test options.
+option(NE10_BUILD_UNIT_TEST "Build NE10 unit test" OFF)
+if (NE10_BUILD_UNIT_TEST)
+ # Choose the test type (smoke, regression or performance); enable exactly one of the three options below.
+ option(NE10_SMOKE_TEST "Run smoke test" OFF)
+ option(NE10_REGRESSION_TEST "Run regression test" OFF)
+ option(NE10_PERFORMANCE_TEST "Run performance test" OFF)
+
+ option(NE10_DEBUG_TRACE "Print debug trace" OFF)
+endif()
+
+# Select which functional modules are compiled into NE10.
+option(NE10_ENABLE_MATH "Build math functionalities to NE10" ON)
+
set(NE10_VERSION 10)
# set complile flags for ARM.
set( CMAKE_ASM_FLAGS "-mthumb-interwork -march=armv7-a -mcpu=cortex-a9 -mfpu=neon" )
# The NE10 library.
-add_subdirectory(source)
+add_subdirectory(modules)
if(NE10_BUILD_EXAMPLES)
- add_subdirectory(test)
+ add_subdirectory(samples)
endif()
+if(NE10_BUILD_UNIT_TEST)
+ add_subdirectory(test)
+endif()
+++ /dev/null
- Apache License
- Version 2.0, January 2004
- http://www.apache.org/licenses/
-
- TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
- 1. Definitions.
-
- "License" shall mean the terms and conditions for use, reproduction,
- and distribution as defined by Sections 1 through 9 of this document.
-
- "Licensor" shall mean the copyright owner or entity authorized by
- the copyright owner that is granting the License.
-
- "Legal Entity" shall mean the union of the acting entity and all
- other entities that control, are controlled by, or are under common
- control with that entity. For the purposes of this definition,
- "control" means (i) the power, direct or indirect, to cause the
- direction or management of such entity, whether by contract or
- otherwise, or (ii) ownership of fifty percent (50%) or more of the
- outstanding shares, or (iii) beneficial ownership of such entity.
-
- "You" (or "Your") shall mean an individual or Legal Entity
- exercising permissions granted by this License.
-
- "Source" form shall mean the preferred form for making modifications,
- including but not limited to software source code, documentation
- source, and configuration files.
-
- "Object" form shall mean any form resulting from mechanical
- transformation or translation of a Source form, including but
- not limited to compiled object code, generated documentation,
- and conversions to other media types.
-
- "Work" shall mean the work of authorship, whether in Source or
- Object form, made available under the License, as indicated by a
- copyright notice that is included in or attached to the work
- (an example is provided in the Appendix below).
-
- "Derivative Works" shall mean any work, whether in Source or Object
- form, that is based on (or derived from) the Work and for which the
- editorial revisions, annotations, elaborations, or other modifications
- represent, as a whole, an original work of authorship. For the purposes
- of this License, Derivative Works shall not include works that remain
- separable from, or merely link (or bind by name) to the interfaces of,
- the Work and Derivative Works thereof.
-
- "Contribution" shall mean any work of authorship, including
- the original version of the Work and any modifications or additions
- to that Work or Derivative Works thereof, that is intentionally
- submitted to Licensor for inclusion in the Work by the copyright owner
- or by an individual or Legal Entity authorized to submit on behalf of
- the copyright owner. For the purposes of this definition, "submitted"
- means any form of electronic, verbal, or written communication sent
- to the Licensor or its representatives, including but not limited to
- communication on electronic mailing lists, source code control systems,
- and issue tracking systems that are managed by, or on behalf of, the
- Licensor for the purpose of discussing and improving the Work, but
- excluding communication that is conspicuously marked or otherwise
- designated in writing by the copyright owner as "Not a Contribution."
-
- "Contributor" shall mean Licensor and any individual or Legal Entity
- on behalf of whom a Contribution has been received by Licensor and
- subsequently incorporated within the Work.
-
- 2. Grant of Copyright License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- copyright license to reproduce, prepare Derivative Works of,
- publicly display, publicly perform, sublicense, and distribute the
- Work and such Derivative Works in Source or Object form.
-
- 3. Grant of Patent License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- (except as stated in this section) patent license to make, have made,
- use, offer to sell, sell, import, and otherwise transfer the Work,
- where such license applies only to those patent claims licensable
- by such Contributor that are necessarily infringed by their
- Contribution(s) alone or by combination of their Contribution(s)
- with the Work to which such Contribution(s) was submitted. If You
- institute patent litigation against any entity (including a
- cross-claim or counterclaim in a lawsuit) alleging that the Work
- or a Contribution incorporated within the Work constitutes direct
- or contributory patent infringement, then any patent licenses
- granted to You under this License for that Work shall terminate
- as of the date such litigation is filed.
-
- 4. Redistribution. You may reproduce and distribute copies of the
- Work or Derivative Works thereof in any medium, with or without
- modifications, and in Source or Object form, provided that You
- meet the following conditions:
-
- (a) You must give any other recipients of the Work or
- Derivative Works a copy of this License; and
-
- (b) You must cause any modified files to carry prominent notices
- stating that You changed the files; and
-
- (c) You must retain, in the Source form of any Derivative Works
- that You distribute, all copyright, patent, trademark, and
- attribution notices from the Source form of the Work,
- excluding those notices that do not pertain to any part of
- the Derivative Works; and
-
- (d) If the Work includes a "NOTICE" text file as part of its
- distribution, then any Derivative Works that You distribute must
- include a readable copy of the attribution notices contained
- within such NOTICE file, excluding those notices that do not
- pertain to any part of the Derivative Works, in at least one
- of the following places: within a NOTICE text file distributed
- as part of the Derivative Works; within the Source form or
- documentation, if provided along with the Derivative Works; or,
- within a display generated by the Derivative Works, if and
- wherever such third-party notices normally appear. The contents
- of the NOTICE file are for informational purposes only and
- do not modify the License. You may add Your own attribution
- notices within Derivative Works that You distribute, alongside
- or as an addendum to the NOTICE text from the Work, provided
- that such additional attribution notices cannot be construed
- as modifying the License.
-
- You may add Your own copyright statement to Your modifications and
- may provide additional or different license terms and conditions
- for use, reproduction, or distribution of Your modifications, or
- for any such Derivative Works as a whole, provided Your use,
- reproduction, and distribution of the Work otherwise complies with
- the conditions stated in this License.
-
- 5. Submission of Contributions. Unless You explicitly state otherwise,
- any Contribution intentionally submitted for inclusion in the Work
- by You to the Licensor shall be under the terms and conditions of
- this License, without any additional terms or conditions.
- Notwithstanding the above, nothing herein shall supersede or modify
- the terms of any separate license agreement you may have executed
- with Licensor regarding such Contributions.
-
- 6. Trademarks. This License does not grant permission to use the trade
- names, trademarks, service marks, or product names of the Licensor,
- except as required for reasonable and customary use in describing the
- origin of the Work and reproducing the content of the NOTICE file.
-
- 7. Disclaimer of Warranty. Unless required by applicable law or
- agreed to in writing, Licensor provides the Work (and each
- Contributor provides its Contributions) on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
- implied, including, without limitation, any warranties or conditions
- of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
- PARTICULAR PURPOSE. You are solely responsible for determining the
- appropriateness of using or redistributing the Work and assume any
- risks associated with Your exercise of permissions under this License.
-
- 8. Limitation of Liability. In no event and under no legal theory,
- whether in tort (including negligence), contract, or otherwise,
- unless required by applicable law (such as deliberate and grossly
- negligent acts) or agreed to in writing, shall any Contributor be
- liable to You for damages, including any direct, indirect, special,
- incidental, or consequential damages of any character arising as a
- result of this License or out of the use or inability to use the
- Work (including but not limited to damages for loss of goodwill,
- work stoppage, computer failure or malfunction, or any and all
- other commercial damages or losses), even if such Contributor
- has been advised of the possibility of such damages.
-
- 9. Accepting Warranty or Additional Liability. While redistributing
- the Work or Derivative Works thereof, You may choose to offer,
- and charge a fee for, acceptance of support, warranty, indemnity,
- or other liability obligations and/or rights consistent with this
- License. However, in accepting such obligations, You may act only
- on Your own behalf and on Your sole responsibility, not on behalf
- of any other Contributor, and only if You agree to indemnify,
- defend, and hold each Contributor harmless for any liability
- incurred by, or claims asserted against, such Contributor by reason
- of your accepting any such warranty or additional liability.
-
- END OF TERMS AND CONDITIONS
-
- APPENDIX: How to apply the Apache License to your work.
-
- To apply the Apache License to your work, attach the following
- boilerplate notice, with the fields enclosed by brackets "[]"
- replaced with your own identifying information. (Don't include
- the brackets!) The text should be enclosed in the appropriate
- comment syntax for the file format. We also recommend that a
- file or class name and description of purpose be included on the
- same "printed page" as the copyright notice for easier
- identification within third-party archives.
-
- Copyright [yyyy] [name of copyright owner]
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-
+++ /dev/null
-#
-# Copyright 2011-12 ARM Limited
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-C_TOOL = gcc
-EXE_TOOL = gcc
-ASM_TOOL = as
-
-#BJ_FLAGS = -mthumb-interwork -march=armv7-a -mcpu=cortex-a9 -mfpu=vfp3
-ARM_FLAGS = -mthumb-interwork -march=armv7-a -mcpu=cortex-a9
-C_FLAGS = -lm -lrt -I./inc/
-#DEBUG_FLAGS = -gstabs
-OPTIMIZE_FLAGS = -O3
-# -save-temps -O3
-
-LDFLAGS+=-L. -L/usr/local/lib -L/client/lib -L/lib/arm-linux-gnueabi
-LDFLAGS+=-lm
-
-ALLFILES = \
- NE10_addc.c_r.o NE10_subc.c_r.o NE10_rsbc.c_r.o \
- NE10_mulc.c_r.o NE10_divc.c_r.o NE10_mlac.c_r.o \
- NE10_setc.c_r.o NE10_add.c_r.o NE10_sub.c_r.o \
- NE10_mul.c_r.o NE10_div.c_r.o NE10_mla.c_r.o \
- NE10_abs.c_r.o NE10_len.c_r.o NE10_normalize.c_r.o \
- NE10_addc.neon_r.o NE10_subc.neon_r.o NE10_rsbc.neon_r.o \
- NE10_mulc.neon_r.o NE10_divc.neon_r.o NE10_mlac.neon_r.o \
- NE10_setc.neon_r.o NE10_add.neon_r.o NE10_sub.neon_r.o \
- NE10_mul.neon_r.o NE10_div.neon_r.o NE10_mla.neon_r.o \
- NE10_abs.neon_r.o NE10_len.neon_r.o NE10_normalize.neon_r.o \
- NE10_dot.c_r.o NE10_dot.neon_r.o NE10_cross.c_r.o \
- NE10_cross.neon_r.o NE10_addmat.c_r.o NE10_addmat.neon_r.o \
- NE10_submat.c_r.o NE10_submat.neon_r.o NE10_mulmat.c_r.o \
- NE10_mulmat.neon_r.o NE10_mulcmatvec.c_r.o NE10_mulcmatvec.neon_r.o \
- NE10_detmat.c_r.o NE10_detmat.neon_r.o NE10_invmat.c_r.o \
- NE10_invmat.neon_r.o NE10_transmat.c_r.o NE10_transmat.neon_r.o \
- NE10_identitymat.c_r.o NE10_identitymat.neon_r.o
-
-#TARGET_ARCH = stdc
-
-.PHONY: all clean
-
-all: NE10_test_static.ex NE10_test_dynamic.ex
-
-clean:
- ./cleanall.sh
-
-NE10_test_static.ex : libNE10.a NE10_init.h NE10_test.c
- $(EXE_TOOL) $(OPTIMIZE_FLAGS) $(ARM_FLAGS) ./NE10_init.c ./NE10_test.c -o $@ -l:libNE10.a $(C_FLAGS) -L/lib/arm-linux-gnueabi
-
-NE10_test_dynamic.ex : libNE10.so NE10_init.h NE10_test.c
- $(EXE_TOOL) $(OPTIMIZE_FLAGS) $(ARM_FLAGS) ./NE10_init.c ./NE10_test.c -o $@ -l:libNE10.so $(C_FLAGS) -L/lib/arm-linux-gnueabi
-
-libNE10.a : $(ALLFILES) NE10_init.h NE10_init.c
- ar rcs libNE10.a $(ALLFILES)
-
-libNE10.so : $(ALLFILES) NE10_init.h NE10_init.c
- gcc -shared -o $@ $(C_FLAGS) $(ALLFILES)
-
-%mat.test_r.ex : %.asm_r.o %.c_r.o %.neon_r.o %mat.c_r.o %mat.neon_r.o ./source/%mat_test.c ./inc/NE10.h
- $(EXE_TOOL) $(OPTIMIZE_FLAGS) $(ARM_FLAGS) $^ -o $@ $(C_FLAGS) -L/lib/arm-linux-gnueabi
-
-%.test_r.ex : %.asm_r.o %.c_r.o %.neon_r.o ./source/%_test.c ./inc/NE10.h
- $(EXE_TOOL) $(OPTIMIZE_FLAGS) $(ARM_FLAGS) $^ -o $@ $(C_FLAGS) -L/lib/arm-linux-gnueabi
-
-%.c_r.o : ./source/%.c ./inc/NE10.h
- $(C_TOOL) $(OPTIMIZE_FLAGS) $(ARM_FLAGS) -mfpu=vfp3 -c $< -o $@ $(C_FLAGS) -L/lib/arm-linux-gnueabi
-
-%.asm_r.o : ./source/%.asm.s
- $(ASM_TOOL) $(ARM_FLAGS) -mfpu=vfp3 $< -o $@
-
-# Either use the C version or use the Assembly version for compiling the NEON routines
-
-# Rules for the Assembly version
-%.neon_r.o : ./source/%.neon.s
- $(ASM_TOOL) $(ARM_FLAGS) -mfpu=neon $< -o $@
-
-# Rules for the C version
-%.neon_r.o : ./source/%.neon.c ./inc/NE10.h
- $(C_TOOL) $(OPTIMIZE_FLAGS) $(ARM_FLAGS) -mfpu=neon -c $< -o $@ $(C_FLAGS)
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "NE10.h"
-
-#include <stdio.h>
-
-#define CPUINFO_BUFFER_SIZE (1024*4)
-
-// This local variable indicates whether or not the running platform supports ARM NEON
-arm_result_t is_NEON_available = NE10_ERR;
-
-arm_result_t NE10_HasNEON()
-{
- return is_NEON_available;
-}
-
-arm_result_t NE10_init()
-{
- FILE* infofile = NULL; // To open the file /proc/cpuinfo
- char cpuinfo[CPUINFO_BUFFER_SIZE]; // The buffer to read in the string
- size_t bytes = 0; // Numbers of bytes read from the file
- int i = 0; // Temporary loop counter
-
- memset( cpuinfo, 0, CPUINFO_BUFFER_SIZE );
- infofile = fopen( "/proc/cpuinfo", "r" );
- bytes = fread( cpuinfo, 1, sizeof(cpuinfo), infofile );
- fclose( infofile );
-
- if( 0 == bytes || CPUINFO_BUFFER_SIZE == bytes )
- {
- fprintf( stderr, "ERROR: Couldn't read the file \"/proc/cpuinfo\". NE10_init() failed.\n");
- return NE10_ERR;
- }
-
- while( '\0' != cpuinfo[i] ) cpuinfo[i++] = (char)tolower(cpuinfo[i]);
-
- if ( 0 != strstr(cpuinfo, "neon") )
- {
- is_NEON_available = NE10_OK;
- }
-
- if ( NE10_OK == NE10_HasNEON() )
- {
- addc_float = addc_float_neon;
- addc_vec2f = addc_vec2f_neon;
- addc_vec3f = addc_vec3f_neon;
- addc_vec4f = addc_vec4f_neon;
- subc_float = subc_float_neon;
- subc_vec2f = subc_vec2f_neon;
- subc_vec3f = subc_vec3f_neon;
- subc_vec4f = subc_vec4f_neon;
- rsbc_float = rsbc_float_neon;
- rsbc_vec2f = rsbc_vec2f_neon;
- rsbc_vec3f = rsbc_vec3f_neon;
- rsbc_vec4f = rsbc_vec4f_neon;
- mulc_float = mulc_float_neon;
- mulc_vec2f = mulc_vec2f_neon;
- mulc_vec3f = mulc_vec3f_neon;
- mulc_vec4f = mulc_vec4f_neon;
- divc_float = divc_float_neon;
- divc_vec2f = divc_vec2f_neon;
- divc_vec3f = divc_vec3f_neon;
- divc_vec4f = divc_vec4f_neon;
- setc_float = setc_float_neon;
- setc_vec2f = setc_vec2f_neon;
- setc_vec3f = setc_vec3f_neon;
- setc_vec4f = setc_vec4f_neon;
- mlac_float = mlac_float_neon;
- mlac_vec2f = mlac_vec2f_neon;
- mlac_vec3f = mlac_vec3f_neon;
- mlac_vec4f = mlac_vec4f_neon;
- add_float = add_float_neon;
- sub_float = sub_float_neon;
- mul_float = mul_float_neon;
- div_float = div_float_neon;
- mla_float = mla_float_neon;
- abs_float = abs_float_neon;
- len_vec2f = len_vec2f_neon;
- len_vec3f = len_vec3f_neon;
- len_vec4f = len_vec4f_neon;
- normalize_vec2f = normalize_vec2f_neon;
- normalize_vec3f = normalize_vec3f_neon;
- normalize_vec4f = normalize_vec4f_neon;
-
- abs_vec2f = abs_vec2f_neon;
- abs_vec3f = abs_vec3f_neon;
- abs_vec4f = abs_vec4f_neon;
- vmul_vec2f = vmul_vec2f_neon;
- vmul_vec3f = vmul_vec3f_neon;
- vmul_vec4f = vmul_vec4f_neon;
- vdiv_vec2f = vdiv_vec2f_neon;
- vdiv_vec3f = vdiv_vec3f_neon;
- vdiv_vec4f = vdiv_vec4f_neon;
- vmla_vec2f = vmla_vec2f_neon;
- vmla_vec3f = vmla_vec3f_neon;
- vmla_vec4f = vmla_vec4f_neon;
- add_vec2f = add_vec2f_neon;
- add_vec3f = add_vec3f_neon;
- add_vec4f = add_vec4f_neon;
- sub_vec2f = sub_vec2f_neon;
- sub_vec3f = sub_vec3f_neon;
- sub_vec4f = sub_vec4f_neon;
- dot_vec2f = dot_vec2f_neon;
- dot_vec3f = dot_vec3f_neon;
- dot_vec4f = dot_vec4f_neon;
- cross_vec3f = cross_vec3f_neon;
-
- addmat_2x2f = addmat_2x2f_neon;
- addmat_3x3f = addmat_3x3f_neon;
- addmat_4x4f = addmat_4x4f_neon;
- submat_2x2f = submat_2x2f_neon;
- submat_3x3f = submat_3x3f_neon;
- submat_4x4f = submat_4x4f_neon;
- mulmat_2x2f = mulmat_2x2f_neon;
- mulmat_3x3f = mulmat_3x3f_neon;
- mulmat_4x4f = mulmat_4x4f_neon;
- mulcmatvec_cm2x2f_v2f = mulcmatvec_cm2x2f_v2f_neon;
- mulcmatvec_cm3x3f_v3f = mulcmatvec_cm3x3f_v3f_neon;
- mulcmatvec_cm4x4f_v4f = mulcmatvec_cm4x4f_v4f_neon;
- detmat_2x2f = detmat_2x2f_neon;
- detmat_3x3f = detmat_3x3f_neon;
- detmat_4x4f = detmat_4x4f_neon;
- invmat_2x2f = invmat_2x2f_neon;
- invmat_3x3f = invmat_3x3f_neon;
- invmat_4x4f = invmat_4x4f_neon;
- transmat_4x4f = transmat_4x4f_neon;
- identitymat_4x4f = identitymat_4x4f_neon;
- transmat_3x3f = transmat_3x3f_neon;
- identitymat_3x3f = identitymat_3x3f_neon;
- transmat_2x2f = transmat_2x2f_neon;
- identitymat_2x2f = identitymat_2x2f_neon;
- }
- else
- {
- addc_float = addc_float_c;
- addc_vec2f = addc_vec2f_c;
- addc_vec3f = addc_vec3f_c;
- addc_vec4f = addc_vec4f_c;
- subc_float = subc_float_c;
- subc_vec2f = subc_vec2f_c;
- subc_vec3f = subc_vec3f_c;
- subc_vec4f = subc_vec4f_c;
- rsbc_float = rsbc_float_c;
- rsbc_vec2f = rsbc_vec2f_c;
- rsbc_vec3f = rsbc_vec3f_c;
- rsbc_vec4f = rsbc_vec4f_c;
- mulc_float = mulc_float_c;
- mulc_vec2f = mulc_vec2f_c;
- mulc_vec3f = mulc_vec3f_c;
- mulc_vec4f = mulc_vec4f_c;
- divc_float = divc_float_c;
- divc_vec2f = divc_vec2f_c;
- divc_vec3f = divc_vec3f_c;
- divc_vec4f = divc_vec4f_c;
- setc_float = setc_float_c;
- setc_vec2f = setc_vec2f_c;
- setc_vec3f = setc_vec3f_c;
- setc_vec4f = setc_vec4f_c;
- mlac_float = mlac_float_c;
- mlac_vec2f = mlac_vec2f_c;
- mlac_vec3f = mlac_vec3f_c;
- mlac_vec4f = mlac_vec4f_c;
- add_float = add_float_c;
- sub_float = sub_float_c;
- mul_float = mul_float_c;
- div_float = div_float_c;
- mla_float = mla_float_c;
- abs_float = abs_float_c;
- len_vec2f = len_vec2f_c;
- len_vec3f = len_vec3f_c;
- len_vec4f = len_vec4f_c;
- normalize_vec2f = normalize_vec2f_c;
- normalize_vec3f = normalize_vec3f_c;
- normalize_vec4f = normalize_vec4f_c;
-
- abs_vec2f = abs_vec2f_c;
- abs_vec3f = abs_vec3f_c;
- abs_vec4f = abs_vec4f_c;
- vmul_vec2f = vmul_vec2f_c;
- vmul_vec3f = vmul_vec3f_c;
- vmul_vec4f = vmul_vec4f_c;
- vdiv_vec2f = vdiv_vec2f_c;
- vdiv_vec3f = vdiv_vec3f_c;
- vdiv_vec4f = vdiv_vec4f_c;
- vmla_vec2f = vmla_vec2f_c;
- vmla_vec3f = vmla_vec3f_c;
- vmla_vec4f = vmla_vec4f_c;
- add_vec2f = add_vec2f_c;
- add_vec3f = add_vec3f_c;
- add_vec4f = add_vec4f_c;
- sub_vec2f = sub_vec2f_c;
- sub_vec3f = sub_vec3f_c;
- sub_vec4f = sub_vec4f_c;
- dot_vec2f = dot_vec2f_c;
- dot_vec3f = dot_vec3f_c;
- dot_vec4f = dot_vec4f_c;
- cross_vec3f = cross_vec3f_c;
-
- addmat_2x2f = addmat_2x2f_c;
- addmat_3x3f = addmat_3x3f_c;
- addmat_4x4f = addmat_4x4f_c;
- submat_2x2f = submat_2x2f_c;
- submat_3x3f = submat_3x3f_c;
- submat_4x4f = submat_4x4f_c;
- mulmat_2x2f = mulmat_2x2f_c;
- mulmat_3x3f = mulmat_3x3f_c;
- mulmat_4x4f = mulmat_4x4f_c;
- mulcmatvec_cm2x2f_v2f = mulcmatvec_cm2x2f_v2f_c;
- mulcmatvec_cm3x3f_v3f = mulcmatvec_cm3x3f_v3f_c;
- mulcmatvec_cm4x4f_v4f = mulcmatvec_cm4x4f_v4f_c;
- detmat_2x2f = detmat_2x2f_c;
- detmat_3x3f = detmat_3x3f_c;
- detmat_4x4f = detmat_4x4f_c;
- invmat_2x2f = invmat_2x2f_c;
- invmat_3x3f = invmat_3x3f_c;
- invmat_4x4f = invmat_4x4f_c;
- transmat_4x4f = transmat_4x4f_c;
- identitymat_4x4f = identitymat_4x4f_c;
- transmat_3x3f = transmat_3x3f_c;
- identitymat_3x3f = identitymat_3x3f_c;
- transmat_2x2f = transmat_2x2f_c;
- identitymat_2x2f = identitymat_2x2f_c;
- }
-}
-
-// These are actual definitions of our function pointers that are declared in inc/NE10.h
-arm_result_t (*addc_float)(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count);
-arm_result_t (*addc_vec2f)(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count);
-arm_result_t (*addc_vec3f)(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count);
-arm_result_t (*addc_vec4f)(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count);
-arm_result_t (*subc_float)(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count);
-arm_result_t (*subc_vec2f)(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count);
-arm_result_t (*subc_vec3f)(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count);
-arm_result_t (*subc_vec4f)(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count);
-arm_result_t (*rsbc_float)(arm_float_t * dst, arm_float_t *src, const arm_float_t cst, unsigned int count);
-arm_result_t (*rsbc_vec2f)(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count);
-arm_result_t (*rsbc_vec3f)(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count);
-arm_result_t (*rsbc_vec4f)(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count);
-arm_result_t (*mulc_float)(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count);
-arm_result_t (*mulc_vec2f)(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count);
-arm_result_t (*mulc_vec3f)(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count);
-arm_result_t (*mulc_vec4f)(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count);
-arm_result_t (*divc_float)(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count);
-arm_result_t (*divc_vec2f)(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count);
-arm_result_t (*divc_vec3f)(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count);
-arm_result_t (*divc_vec4f)(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count);
-arm_result_t (*setc_float)(arm_float_t * dst, const arm_float_t cst, unsigned int count);
-arm_result_t (*setc_vec2f)(arm_vec2f_t * dst, const arm_vec2f_t * cst, unsigned int count);
-arm_result_t (*setc_vec3f)(arm_vec3f_t * dst, const arm_vec3f_t * cst, unsigned int count);
-arm_result_t (*setc_vec4f)(arm_vec4f_t * dst, const arm_vec4f_t * cst, unsigned int count);
-arm_result_t (*mlac_float)(arm_float_t * dst, arm_float_t * acc, arm_float_t * src, const arm_float_t cst, unsigned int count);
-arm_result_t (*mlac_vec2f)(arm_vec2f_t * dst, arm_vec2f_t * acc, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count);
-arm_result_t (*mlac_vec3f)(arm_vec3f_t * dst, arm_vec3f_t * acc, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count);
-arm_result_t (*mlac_vec4f)(arm_vec4f_t * dst, arm_vec4f_t * acc, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count);
-arm_result_t (*add_float)(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count);
-arm_result_t (*sub_float)(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count);
-arm_result_t (*mul_float)(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count);
-arm_result_t (*div_float)(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count);
-arm_result_t (*mla_float)(arm_float_t * dst, arm_float_t * acc, arm_float_t * src1, arm_float_t * src2, unsigned int count);
-arm_result_t (*abs_float)(arm_float_t * dst, arm_float_t * src, unsigned int count);
-arm_result_t (*len_vec2f)(arm_float_t * dst, arm_vec2f_t * src, unsigned int count);
-arm_result_t (*len_vec3f)(arm_float_t * dst, arm_vec3f_t * src, unsigned int count);
-arm_result_t (*len_vec4f)(arm_float_t * dst, arm_vec4f_t * src, unsigned int count);
-arm_result_t (*normalize_vec2f)(arm_vec2f_t * dst, arm_vec2f_t * src, unsigned int count);
-arm_result_t (*normalize_vec3f)(arm_vec3f_t * dst, arm_vec3f_t * src, unsigned int count);
-arm_result_t (*normalize_vec4f)(arm_vec4f_t * dst, arm_vec4f_t * src, unsigned int count);
-
-arm_result_t (*abs_vec2f)(arm_vec2f_t * dst, arm_vec2f_t * src, unsigned int count);
-arm_result_t (*abs_vec3f)(arm_vec3f_t * dst, arm_vec3f_t * src, unsigned int count);
-arm_result_t (*abs_vec4f)(arm_vec4f_t * dst, arm_vec4f_t * src, unsigned int count);
-arm_result_t (*vmul_vec2f)(arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
-arm_result_t (*vmul_vec3f)(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
-arm_result_t (*vmul_vec4f)(arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
-arm_result_t (*vdiv_vec2f)(arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
-arm_result_t (*vdiv_vec3f)(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
-arm_result_t (*vdiv_vec4f)(arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
-arm_result_t (*vmla_vec2f)(arm_vec2f_t * acc, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
-arm_result_t (*vmla_vec3f)(arm_vec3f_t * acc, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
-arm_result_t (*vmla_vec4f)(arm_vec4f_t * acc, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
-arm_result_t (*add_vec2f)(arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
-arm_result_t (*add_vec3f)(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
-arm_result_t (*add_vec4f)(arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
-arm_result_t (*sub_vec2f)(arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
-arm_result_t (*sub_vec3f)(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
-arm_result_t (*sub_vec4f)(arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
-arm_result_t (*dot_vec2f)(arm_float_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
-arm_result_t (*dot_vec3f)(arm_float_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
-arm_result_t (*dot_vec4f)(arm_float_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
-arm_result_t (*cross_vec3f)(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
-
-arm_result_t (*addmat_2x2f)(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count);
-arm_result_t (*addmat_3x3f)(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count);
-arm_result_t (*addmat_4x4f)(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count);
-arm_result_t (*submat_2x2f)(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count);
-arm_result_t (*submat_3x3f)(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count);
-arm_result_t (*submat_4x4f)(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count);
-arm_result_t (*mulmat_2x2f)(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count);
-arm_result_t (*mulmat_3x3f)(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count);
-arm_result_t (*mulmat_4x4f)(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count);
-arm_result_t (*mulcmatvec_cm4x4f_v4f)(arm_vec4f_t * dst, const arm_mat4x4f_t * cst, arm_vec4f_t * src, unsigned int count);
-arm_result_t (*mulcmatvec_cm3x3f_v3f)(arm_vec3f_t * dst, const arm_mat3x3f_t * cst, arm_vec3f_t * src, unsigned int count);
-arm_result_t (*mulcmatvec_cm2x2f_v2f)(arm_vec2f_t * dst, const arm_mat2x2f_t * cst, arm_vec2f_t * src, unsigned int count);
-arm_result_t (*detmat_4x4f)(arm_float_t * dst, arm_mat4x4f_t * src, unsigned int count);
-arm_result_t (*detmat_3x3f)(arm_float_t * dst, arm_mat3x3f_t * src, unsigned int count);
-arm_result_t (*detmat_2x2f)(arm_float_t * dst, arm_mat2x2f_t * src, unsigned int count);
-arm_result_t (*invmat_4x4f)(arm_mat4x4f_t * dst, arm_mat4x4f_t * src, unsigned int count);
-arm_result_t (*invmat_3x3f)(arm_mat3x3f_t * dst, arm_mat3x3f_t * src, unsigned int count);
-arm_result_t (*invmat_2x2f)(arm_mat2x2f_t * dst, arm_mat2x2f_t * src, unsigned int count);
-arm_result_t (*transmat_4x4f)(arm_mat4x4f_t * dst, arm_mat4x4f_t * src, unsigned int count);
-arm_result_t (*identitymat_4x4f)(arm_mat4x4f_t * dst, unsigned int count);
-arm_result_t (*transmat_3x3f)(arm_mat3x3f_t * dst, arm_mat3x3f_t * src, unsigned int count);
-arm_result_t (*identitymat_3x3f)(arm_mat3x3f_t * dst, unsigned int count);
-arm_result_t (*transmat_2x2f)(arm_mat2x2f_t * dst, arm_mat2x2f_t * src, unsigned int count);
-arm_result_t (*identitymat_2x2f)(arm_mat2x2f_t * dst, unsigned int count);
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <NE10.h>
-
-#ifndef NE10_init_H
-#define NE10_init_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*!
- This routine returns NE10_OK if the running platform supports NEON, otherwise it returns NE10_ERR
- */
-extern arm_result_t NE10_HasNEON();
-
-/*!
- This routine initializes all the function pointers defined in "NE10.h" with pointers to ARM NEON or ARM VFP implementations.
- */
-extern arm_result_t NE10_init();
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "./inc/NE10.h"
-#include "./NE10_init.h"
-
-// This test code shows you how you can statically embed NE10 in your code
-
-void main()
-{
- printf ( "Going to initialze NE10...\n" );
-
- NE10_init();
-
- printf ( "NE10 has been initialized.\n" );
-
-}
-
+++ /dev/null
-NE10 Library
-Copyright 2011-12 ARM Limited
-
-This product was produced by ARM Limited.
(See LICENSE for details)
+Build
+=====
+
+See CMakeBuilding.txt file in the "doc" folder
+
Usage
=====
-See USAGE.txt file
+See USAGE.txt file in the "doc" folder
+
+Code formatter
+=====
+See Formatter.txt file in the "doc" folder
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : ReleaseNote.txt
- */
-NE10 SIMD LIBRARY - Release Note
-LAST UPDATED ON: 10 / APR / 2012
-
-========
-Contents
-========
-
- 1. Preface
- 1-a. License
- 1-b. Product status
- 2. Release details
- 2-a. Product release status
- 2-b. Functions included
- 2-c. Test cases and results
- 3. Installation
- 3-a. Requirements
- 3-b. Alternative Approach
- 4. Changelog
- 4-a. r1.0_beta
-
-
-==========
-1. Preface
-==========
-
-1-a. License details
---------------------
-NE10 is an open source project released under the Apache License,
-Version 2.0 (ALv2). See the file LICENSE for the full text of the ALv2.
-
-
-1-b. Product status
--------------------
-This is the first publicly available version of NE10. This open source project
-is actively under development and more functions as well as improved versions of
-the available functions will be contributed to the source code.
-
-
-==================
-2. Release details
-==================
-
-2-a. Product release status
----------------------------
-Version 1.0 beta
-
-The set of functions planned for this release are in place. However some issues
-remain where their intended behaviour diverges from the planned specification:
-
-In the release version, unless impractical and explicitly stated, all functions
-will operate correctly when the output area of the result is the same as one
-of the input areas. (ie. where the src1 or src2 parameter == the dst parameter)
-
-In this beta release that behaviour cannot be assumed.
-
-2-b. Functions included
----------------------------
-NE10 is a software library that provides Linux and Android support for Single
-Instruction Multiple Data (SIMD) functionality. In this release, a number of
-mathematical functions (mainly vector and scalar operations) have been
-implemented for the ARM v7 instruction set architecture as well as ARM NEON
-SIMD architecture extensions.
-
-This library has been developed and tested on the following processors:
-
- 1) ARM Cortex-A9 with NEON extension
- 2) ARM Cortex-A8 with NEON extension
-
-The following is a list of currently available functions.
-
- a) Vector-Constant Arithmetic
-
- addc_float, addc_vec2f, addc_vec3f, addc_vec4f,
- subc_float, subc_vec2f, subc_vec3f, subc_vec4f,
- rsbc_float, rsbc_vec2f, rsbc_vec3f, rsbc_vec4f,
- mulc_float, mulc_vec2f, mulc_vec3f, mulc_vec4f,
- divc_float, divc_vec2f, divc_vec3f, divc_vec4f,
- setc_float, setc_vec2f, setc_vec3f, setc_vec4f,
- mlac_float, mlac_vec2f, mlac_vec3f, mlac_vec4f
-
- b) Arithmetic functions over arrays of cst values:
-
- add_float, sub_float, mul_float, div_float, mla_float, abs_float
-
- c) Operations on Vectors:
-
- abs_vec2f, abs_vec3f, abs_vec4f,
- addc_vec2f, addc_vec3f, addc_vec4f,
- add_vec2f, add_vec3f, add_vec4f,
- divc_vec2f, divc_vec3f, divc_vec4f,
- dot_vec2f, dot_vec3f, dot_vec4f
- len_vec2f, len_vec3f, len_vec4f,
- mlac_vec2f, mlac_vec3f, mlac_vec4f,
- mulc_vec2f, mulc_vec3f, mulc_vec4f,
- normalize_vec2f, normalize_vec3f, normalize_vec4f,
- rsbc_vec2f, rsbc_vec3f, rsbc_vec4f,
- setc_vec2f, setc_vec3f, setc_vec4f,
- subc_vec2f, subc_vec3f, subc_vec4f,
- sub_vec2f, sub_vec3f, sub_vec4f,
- vdiv_vec2f, vdiv_vec3f, vdiv_vec4f,
- vmla_vec2f, vmla_vec3f, vmla_vec4f,
- vmul_vec2f, vmul_vec3f, vmul_vec4f,
- cross_vec3f
-
- d) Matrix operations:
-
- addmat_2x2f, addmat_3x3f, addmat_4x4f,
- detmat_2x2f, detmat_3x3f, detmat_4x4f,
- divmat_2x2f, divmat_3x3f, divmat_4x4f,
- identitymat_2x2f, identitymat_3x3f, identitymat_4x4f,
- invmat_2x2f, invmat_3x3f, invmat_4x4f,
- mulcmatvec_2x2f, mulcmatvec_3x3f, mulcmatvec_4x4f,
- mulmat_2x2f, mulmat_3x3f, mulmat_4x4f,
- multrans_mat2x2f, multrans_mat3x3f, multrans_mat4x4f,
- setmat_2x2f, setmat_3x3f, setmat_4x4f,
- submat_2x2f, submat_3x3f, submat_4x4f,
- transmat_2x2f, transmat_3x3f, transmat_4x4f,
-
-2-c. Test cases and results
----------------------------
-The provided functions are categorized according to the operations that they
-perform. Functions in each of these categories accept different types of input
-data. Each set is accompanied with a unit test. These unit tests are provided
-as part of this library and can be used to verify and benchmark these functions
-on a target platform.
-
-===============
-3. Installation
-===============
-
-3-a. Requirements
------------------
-This release has been built and tested on the following host environments:
-
- 1) ARM Versatile Express / Linux linaro 2.6.38-1003
- 2) BeagleBoard RevC / Linux linaro-developer 3.1.0-4
- 3) Android AOSP Emulator / Android Open Source Project Toolchain
-
-
-The source code has been successfully built with the following toolchains:
-
- 1) Linaro GCC v4.6.1 ( https://launchpad.net/gcc-linaro/4.6 )
- 2) Prebuilt GCC toolchain provided with ICS release of ASOP
-
-
-3-b. Native Building
---------------------
-
-Native building (building directly on an ARM platform) is supported via
-
- make
-
-This will build a libne10.a and libne10.so in the local directory along with
-some test binaries.
-
- ./nightly.pl
-
-Will build and run a set of tests
-
-3-c. Android Building
-
-To build as part of the Android Open Source Project, copy the release
-directory into 'external' within the source directories and build as
-normal. This will install the libne10.so library into system/lib on the
-final Android OS image, where other applications will be able to access it in
-a similar way to other shared libraries. You will need to build with
-TARGET_ARCH_VARIANT=armv7-a-neon defined to enable NEON support.
-
-3-d. Alternative Approach
--------------------------
-While not supported, the functions within this library can be taken and
-incorporated (licensing conflicts permitting) within other projects as is.
-Details of how to do this are too project specific to detail here.
-
-============
-4. Changelog
-============
-
-4-a. r1.0_beta
-
- * Updated AOSP Makefile, cleaned native Makefile
- * Adding new files to the AOSP build
- * Made the default makefile a little more readable
- * New functions: Matrix transpose and identity matrix routines.
- * New functions: Matrix inversion routines.
- * New functions: Matrix determinant routines.
- * New functions: Matrix-vector multiplication routines.
- * New functions: Matrix multiplication routines.
- * New functions: Matrix addition and subtraction.
- * New functions: Cross product routine.
- * New functions: Dot product routines.
- * New functions: Vectorized mla routines.
- * New functions: Vectorized division routines.
- * New functions: Vectorized abs routine.
- * New functions: Vector-sub routines.
- * New functions: Vector-add routines.
- * Added the disclaimer:
- Each function is implemented in C, ARM Assembly and NEON code as a
- basis for comparison. Assembly versions, while efficient, are not
- intended as best-practice examples.
- * Added CMake to implement cross-platform build system
- * Added support for C++
+++ /dev/null
- /*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-
-#Contents
-
-#NE10 Usage
-##Using NE10
-
-#NE10 Usage
-
-This file explains use of NE10 library.
-
-#Contents
-##Using NE10
-###General Notes
-###C Bindings
-###Future Bindings
-
-#Using NE10
-
-NE10 is implemented in a mix of C, intrinsics and assembler, however all
-functions are exposed as C. It can be used as a shared or static library and
-individual functions can be safely excluded from a build to reduce final
-binary size.
-
-##General Notes
-
-The type checking is relaxed, to enable compatiblity with any pre-existing or
-prevailing system of types a project might have. The debug version of the
-library will check the ranges passed in a call conform to the API limitations.
-The production version avoids these checks for performance reasons.
-
-It is assumed that the ranges of input arrays to be processed do not overlap.
-Clean handling of overlapping arrays is not designed for or tested. It is
-possible for source and destination pointers to be the same, or for you to pass
-in pointers inside the same array *as long as the regions indicated by
-pointer+length do not overlap*. Incorrect usage will typically result in an assert
-in debug builds and variable and inaccurate results in production builds.
-
-##C Bindings
-
-The C bindings (available in inc/NE10.h) aim for a balance between simple to
-use and efficient from a execution perspective. They are intended to be usable
-in C and C++ code, or in theory, in any other language with a well constructed
-mechanism for calling out to C code.
-
-The calls themselves are listed in inc/NE10.h, however depending on your
-circumstances - for example knowing that you are only going to be executing
-code on platforms with NEON available, then you could use the inc/NE10_neon.h
-include file and access those functions directly.
-
-Usage of all the functions is generally consistent, and function specific
-differences documented in the header, but here is an example as a taste:
-
- arm_vec3f_t *destination;
- arm_vec3f_t *source1;
- arm_vec3f_t *source2;
- int feedback;
-
- /* Fill your arrays with interesting vector data.. */
- ...
-
- /* Normalize the vectors in source1, returning the result in place */
- feedback = normalize_vec3f(source1, source1);
- if (feedback = <check error code>) {
- printf("Bad Thing happened normalizing!\n");
- }
- /* Multiply source1 by source2, returning the result in destination */
- feedback = mul_vec3f(destination, source1, source2);
- if (feedback = <check error code>) {
- printf("Bad Thing happened multiplying!\n");
- }
-
-While the functions all return an integer value to indicate success or failure,
-in practice almost none of the functions currently implemented can 'fail' in that
-way, however future functions may. This is to allow for a more consistent interface
-across the API in the future.
-
-##Future Bindings
-
-We hope to to add C++ bindings at a later date, based on feedback on the most
-appropriate way to provide that sort of API. Other languages will be
-considered, however the priority will be to improve the scope and performance
-of functions provided under the existing bindings.
-
-
--- /dev/null
+LOCAL_PATH:= $(call my-dir)
+
+# NEON implementations (.neon.s / .neon.c) of the math module.
+# These are appended to LOCAL_SRC_FILES only when ARCH_ARM_HAVE_NEON is "true"
+# (see the ifeq blocks in the module definitions below).
+ne10_neon_source := \
+ modules/math/NE10_abs.neon.s \
+ modules/math/NE10_addc.neon.c \
+ modules/math/NE10_addmat.neon.c \
+ modules/math/NE10_add.neon.s \
+ modules/math/NE10_cross.neon.s \
+ modules/math/NE10_detmat.neon.s \
+ modules/math/NE10_divc.neon.c \
+ modules/math/NE10_div.neon.s \
+ modules/math/NE10_dot.neon.s \
+ modules/math/NE10_identitymat.neon.s \
+ modules/math/NE10_invmat.neon.s \
+ modules/math/NE10_len.neon.s \
+ modules/math/NE10_mla.neon.s \
+ modules/math/NE10_mlac.neon.c \
+ modules/math/NE10_mulcmatvec.neon.s \
+ modules/math/NE10_mulc.neon.c \
+ modules/math/NE10_mulmat.neon.s \
+ modules/math/NE10_mul.neon.c \
+ modules/math/NE10_normalize.neon.s \
+ modules/math/NE10_rsbc.neon.c \
+ modules/math/NE10_setc.neon.c \
+ modules/math/NE10_subc.neon.c \
+ modules/math/NE10_submat.neon.c \
+ modules/math/NE10_sub.neon.s \
+ modules/math/NE10_transmat.neon.s \
+
+# Sources that are always built: the .asm.s and plain .c implementation of every
+# math routine. Both library modules below start their LOCAL_SRC_FILES from this list.
+ne10_source_files := \
+ modules/math/NE10_abs.asm.s \
+ modules/math/NE10_addc.asm.s \
+ modules/math/NE10_addmat.asm.s \
+ modules/math/NE10_add.asm.s \
+ modules/math/NE10_cross.asm.s \
+ modules/math/NE10_detmat.asm.s \
+ modules/math/NE10_divc.asm.s \
+ modules/math/NE10_div.asm.s \
+ modules/math/NE10_dot.asm.s \
+ modules/math/NE10_identitymat.asm.s \
+ modules/math/NE10_invmat.asm.s \
+ modules/math/NE10_len.asm.s \
+ modules/math/NE10_mla.asm.s \
+ modules/math/NE10_mlac.asm.s \
+ modules/math/NE10_mulcmatvec.asm.s \
+ modules/math/NE10_mulc.asm.s \
+ modules/math/NE10_mulmat.asm.s \
+ modules/math/NE10_mul.asm.s \
+ modules/math/NE10_normalize.asm.s \
+ modules/math/NE10_rsbc.asm.s \
+ modules/math/NE10_setc.asm.s \
+ modules/math/NE10_subc.asm.s \
+ modules/math/NE10_submat.asm.s \
+ modules/math/NE10_sub.asm.s \
+ modules/math/NE10_transmat.asm.s \
+ modules/math/NE10_abs.c \
+ modules/math/NE10_addc.c \
+ modules/math/NE10_addmat.c \
+ modules/math/NE10_add.c \
+ modules/math/NE10_cross.c \
+ modules/math/NE10_detmat.c \
+ modules/math/NE10_divc.c \
+ modules/math/NE10_div.c \
+ modules/math/NE10_dot.c \
+ modules/math/NE10_identitymat.c \
+ modules/math/NE10_invmat.c \
+ modules/math/NE10_len.c \
+ modules/math/NE10_mla.c \
+ modules/math/NE10_mlac.c \
+ modules/math/NE10_mulcmatvec.c \
+ modules/math/NE10_mulc.c \
+ modules/math/NE10_mulmat.c \
+ modules/math/NE10_mul.c \
+ modules/math/NE10_normalize.c \
+ modules/math/NE10_rsbc.c \
+ modules/math/NE10_setc.c \
+ modules/math/NE10_subc.c \
+ modules/math/NE10_submat.c \
+ modules/math/NE10_sub.c \
+ modules/math/NE10_transmat.c \
+
+# ---- Module 1: libne10 as a static library ----
+# Start from a clean set of LOCAL_* variables.
+include $(CLEAR_VARS)
+
+# Header search path: the shared "common" helpers and the public "inc" headers.
+LOCAL_C_INCLUDES := $(LOCAL_PATH)/common/ \
+ $(LOCAL_PATH)/inc
+
+# The always-built sources come first ...
+LOCAL_SRC_FILES := \
+ $(ne10_source_files)
+
+# ... and the NEON sources are added only when the target reports NEON support.
+ifeq ($(ARCH_ARM_HAVE_NEON),true)
+LOCAL_SRC_FILES += $(ne10_neon_source)
+endif
+
+# Defines the _ARM_ASSEM_ preprocessor symbol for every compiled source.
+LOCAL_CFLAGS := -D_ARM_ASSEM_
+
+# Requests ARM (rather than Thumb) code generation; see the Android.mk documentation.
+LOCAL_ARM_MODE := arm
+
+LOCAL_MODULE_TAGS := eng
+LOCAL_MODULE := libne10
+
+include $(BUILD_STATIC_LIBRARY)
+
+# ---- Module 2: libne10 as a shared library ----
+# Same sources, flags and include paths as the static module above; only the final
+# include (BUILD_SHARED_LIBRARY) differs.
+include $(CLEAR_VARS)
+
+# NOTE(review): none of the sources listed in this file end in .cc, so this setting
+# looks unused - confirm before relying on it.
+LOCAL_CPP_EXTENSION := .cc
+
+# Defines the _ARM_ASSEM_ preprocessor symbol for every compiled source.
+LOCAL_CFLAGS := -D_ARM_ASSEM_
+
+# Requests ARM (rather than Thumb) code generation; see the Android.mk documentation.
+LOCAL_ARM_MODE := arm
+
+# Header search path: the shared "common" helpers and the public "inc" headers.
+LOCAL_C_INCLUDES := $(LOCAL_PATH)/common/ \
+ $(LOCAL_PATH)/inc
+
+LOCAL_SRC_FILES := \
+ $(ne10_source_files)
+
+# NEON sources are added only when the target reports NEON support.
+ifeq ($(ARCH_ARM_HAVE_NEON),true)
+LOCAL_SRC_FILES += $(ne10_neon_source)
+endif
+
+LOCAL_MODULE_TAGS := eng
+LOCAL_MODULE := libne10
+
+include $(BUILD_SHARED_LIBRARY)
+
+
+
+++ /dev/null
-#!/bin/sh
-#
-# Copyright 2011-12 ARM Limited
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-#
-# NE10 Library : cleanall.sh
-#
-
-PRODUCT_NAME=NE10
-
-rm *.ex *.a *.o *.so
-rm res_*.txt
-rm .*.swp
-rm .exp.tmp
-rm testlog.txt
-for dir in `find * -maxdepth 0 -type d -name "${PRODUCT_NAME}_*"`; do rm -rf $dir; done;
-rm -rf ./java
-for fl in `find * -maxdepth 0 -type f -name "${PRODUCT_NAME}_*.tgz"`; do rm -rf $fl; done;
-if [ "$CLS" != "0" ]; then
- clear
- echo
- ls -la --color=auto
- echo
-fi
-echo
-
--- /dev/null
+@
+@ Copyright 2011-12 ARM Limited
+@
+@ Licensed under the Apache License, Version 2.0 (the "License");
+@ you may not use this file except in compliance with the License.
+@ You may obtain a copy of the License at
+@
+@ http://www.apache.org/licenses/LICENSE-2.0
+@
+@ Unless required by applicable law or agreed to in writing, software
+@ distributed under the License is distributed on an "AS IS" BASIS,
+@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ See the License for the specific language governing permissions and
+@ limitations under the License.
+@
+
+@
+@ NE10 Library : common/NE10header.s
+@
+
+.include "versionheader.s"
+
+@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+@ constant values that are used across the library
+@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ .equ NE10_OK, 0
+ .equ NE10_ERR, -1
--- /dev/null
+/*
+ * Copyright 2011-12 ARM Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NE10 Library : common/factor.h
+ */
+
+// Typebuilding MACROs
+// - Slight difference between toolchain versions on intrinsics
+// FLOAT32_2x3(x1,y1, x2,y2, x3,y3)
+//   Expands to a brace initialiser holding three {x, y} pairs wrapped in one extra
+//   brace level. Further down in this header it initialises float32x2x3_t values.
+#define FLOAT32_2x3(x1,y1,x2,y2,x3,y3) \
+ {{ \
+ {x1, y1}, {x2,y2}, {x3,y3} \
+ }}
+
+// Unit tests use this macro to index into their function table.
+// "opc" stands for the operation's code (which function),
+// and "imp" stands for the implementation (which implementation of that function).
+// One is subtracted from each, so FTBL_IDX(1, 1) is index 0.
+// IMPL_COUNT is not defined in this header; it must be visible wherever the macro is expanded.
+// Both arguments are parenthesised so that an expression argument (a shift, a
+// conditional, ...) is evaluated as a unit before the subtraction is applied.
+#define FTBL_IDX(opc, imp) (((opc)-1)*IMPL_COUNT+((imp)-1))
+
+// This macro helps measure the performance of the code passed to it through the "code" argument
+// It is used in the unit tests
+// MEASURE(res, code)
+//   Pastes `code` between two gettimeofday() calls and stores the elapsed time,
+//   in seconds (as a double), in `res`.
+//   The macro declares nothing itself: `before`, `after`, `lapsed` (each with
+//   tv_sec / tv_usec members) and `zone` must already exist in the caller's scope.
+//   `code` is inserted verbatim with no ';' appended, so it must be a complete statement.
+#define MEASURE(res, code) \
+ { \
+ gettimeofday (&before, &zone); \
+ code \
+ gettimeofday (&after, &zone); \
+ if (before.tv_usec > after.tv_usec) \
+ { \
+ /* borrow one second so the microsecond subtraction below cannot go negative */ \
+ after.tv_usec += 1000000; \
+ after.tv_sec--; \
+ } \
+ lapsed.tv_usec = after.tv_usec - before.tv_usec; \
+ lapsed.tv_sec = after.tv_sec - before.tv_sec; \
+ res = lapsed.tv_sec + ((double)lapsed.tv_usec / 1000000.0); \
+ }
+
+// There are several categories of functions that share common code:
+
+// Different groups of functions take different number of inputs
+//
+// Group 1 = Functions that take a dst, a src, and a cst ("DstSrcCst" for short)
+// Group 2 = Those that take a dst, an acc, a src, and a cst ("DstAccSrcCst" for short)
+// Group 3 = The ones that take a dst, and a cst only ("DstCst" for short)
+//
+// Group 4 = These take a dst, and two src inputs, src1 and src2 ("DstSrc1Src2")
+// Group 5 = These take a dst, an acc, and two src inputs ("DstAccSrc1Src2")
+// Group 6 = These take a dst, and a src ("DstSrc")
+//
+
+// The naming convention used in the following macros is as follows:
+// NE10_<A>_OPERATION_<T>_<I>
+// where
+// <A> Stands for the title of the operation (add, mul, etc) followed by its type (C = const as in addc).
+// The letter X - if used - means any such operation.
+// <T> Indicates the type of the operation (float, vec2, etc.)
+// The letter X - if used - means any type.
+// <I> This indicates the implementation (it can be C, ASM, or NEON).
+
+// A few macros to check pointers and their address range to make sure there's
+// no unwanted overlap between any two of them
+// Overlap check for functions that read `src` and write `dst`; `dst`, `src` and
+// `count` are taken from the enclosing function.
+// When the two pointers differ, asserts that the lower one plus `count` elements does
+// not run past the higher one. dst == src takes neither branch, so in-place use passes.
+#define NE10_CHECKPOINTER_DstSrcCst_OPERATION \
+ if ( dst < src ) \
+ { assert ( dst + count <= src ); } \
+ else if ( dst > src ) \
+ { assert ( src + count <= dst ); }
+
+// "DstSrc" functions use exactly the same check as the "DstSrcCst" ones.
+#define NE10_CHECKPOINTER_DstSrc_OPERATION NE10_CHECKPOINTER_DstSrcCst_OPERATION
+
+// Pairwise overlap check for three array pointers, each treated as spanning `count`
+// elements (`count` comes from the enclosing function).
+// For every pair that is not equal, asserts that the lower range ends at or before the
+// point where the higher one begins; equal pointers are accepted without any assertion.
+// The expansion is a sequence of if / else-if statements with no enclosing braces.
+#define NE10_CHECKPOINTER_3POINTER_OPERATION(arg1, arg2, arg3) \
+ /* arg1 against arg2 */ \
+ if ( arg1 < arg2 ) \
+ { assert ( arg1 + count <= arg2 ); } \
+ else if ( arg1 > arg2 ) \
+ { assert ( arg2 + count <= arg1 ); } \
+ /* arg1 against arg3 */ \
+ if ( arg1 < arg3 ) \
+ { assert ( arg1 + count <= arg3 ); } \
+ else if ( arg1 > arg3 ) \
+ { assert ( arg3 + count <= arg1 ); } \
+ /* arg3 against arg2 */ \
+ if ( arg3 < arg2 ) \
+ { assert ( arg3 + count <= arg2 ); } \
+ else if ( arg3 > arg2 ) \
+ { assert ( arg2 + count <= arg3 ); }
+
+// Pairwise overlap check for four array pointers: first the three-pointer check on
+// arg1..arg3, then arg4 against each of arg1, arg2 and arg3.
+// As in the three-pointer macro, equal pointers pass and `count` is taken from the
+// enclosing function.
+#define NE10_CHECKPOINTER_4POINTER_OPERATION(arg1, arg2, arg3, arg4) \
+ NE10_CHECKPOINTER_3POINTER_OPERATION(arg1, arg2, arg3) \
+ /* arg1 against arg4 */ \
+ if ( arg1 < arg4 ) \
+ { assert ( arg1 + count <= arg4 ); } \
+ else if ( arg1 > arg4 ) \
+ { assert ( arg4 + count <= arg1 ); } \
+ /* arg2 against arg4 */ \
+ if ( arg2 < arg4 ) \
+ { assert ( arg2 + count <= arg4 ); } \
+ else if ( arg2 > arg4 ) \
+ { assert ( arg4 + count <= arg2 ); } \
+ /* arg4 against arg3 */ \
+ if ( arg4 < arg3 ) \
+ { assert ( arg4 + count <= arg3 ); } \
+ else if ( arg4 > arg3 ) \
+ { assert ( arg3 + count <= arg4 ); }
+
+
+
+// Per-group pointer checks. Each one wraps the generic multi-pointer check in braces
+// and passes it the parameter names that group of functions uses.
+
+// dst / acc / src: three-pointer check.
+#define NE10_CHECKPOINTER_DstAccSrcCst_OPERATION { \
+ NE10_CHECKPOINTER_3POINTER_OPERATION(dst, acc, src); }
+
+// dst and a constant only: expands to an empty block, i.e. no check is performed.
+#define NE10_CHECKPOINTER_DstCst_OPERATION {}
+
+// dst / src1 / src2: three-pointer check.
+#define NE10_CHECKPOINTER_DstSrc1Src2_OPERATION { \
+ NE10_CHECKPOINTER_3POINTER_OPERATION(dst, src1, src2); }
+
+// dst / acc / src1 / src2: four-pointer check.
+#define NE10_CHECKPOINTER_DstAccSrc1Src2_OPERATION { \
+ NE10_CHECKPOINTER_4POINTER_OPERATION(dst, acc, src1, src2); }
+
+// These macros generalise implementation of the functions.
+
+// Macros used in C implementations
+// Function-body template for the C implementations.
+//   checkPointer - statement(s) run once before the loop (one of the
+//                  NE10_CHECKPOINTER_* macros)
+//   loopCode     - code run once per element; it can use the loop index `itr`
+// The expansion is a complete braced body: it declares `res` (initialised to NE10_OK)
+// and `itr`, loops `itr` from 0 up to `count` (taken from the enclosing function's
+// parameters) and ends with `return res;`.
+#define NE10_TEMPLATE_XC_OPERATION_X_C(checkPointer, loopCode) { \
+ ne10_result_t res = NE10_OK; \
+ unsigned int itr = 0; \
+ checkPointer; \
+ for ( itr = 0; itr < count; itr++ ) \
+ { loopCode ; /* this loop iterates through each and every float item one at a time */ \
+ } \
+ return res; \
+ }
+
+// macros used in the NEON implementations
+
+// Main Loop = The loop where the number of items to be processed is exactly the
+// number that we can process in a single iteration.
+//
+// Secondary Loop = The loop that follows a Main Loop to fill in the entries that
+// did not fit into the Main Loop. This is needed when the number of
+// input items is not a multiple of the number of items that we
+// process in every iteration of the Main Loop.
+
+
+/****************************************************
+ * *
+ * The "DstSrcCst" group of functions *
+ * *
+ ****************************************************/
+
+///// - FLOAT - /////
+
+// One main-loop step for float arrays: load four floats from `src` into `n_src`, run
+// `loopCode`, then store `n_dst` to `dst`, advancing both pointers by four.
+// Nothing in this macro writes `n_dst`, so `loopCode` has to produce it.
+// `n_src` / `n_dst` are not declared here; NE10_DstSrcCst_OPERATION_FLOAT_NEON declares them.
+#define NE10_DstSrcCst_MAINLOOP_FLOAT_NEON(loopCode) { \
+ /* load 4 values */ \
+ n_src = vld1q_f32( (float32_t*)src ); \
+ src += 4; /* move to the next 4 float items; 4*float */ \
+ loopCode; /* the actual operation is placed here... */ /* The main loop iterates through four float values each time */ \
+ vst1q_f32 ( (float32_t*)dst , n_dst ); /* store the results back */ \
+ dst += 4; /* move to the next items; 4*float */ \
+ }
+
+// One tail step for float arrays: handles a single leftover element.
+// Loads one float from `src` into lane 0 of `n_tmp_src`, runs `loopCode`, then stores
+// lane 0 of `n_tmp_src` to `dst` - so `loopCode` must leave its result in `n_tmp_src`.
+// `n_tmp_cst` holds the scalar `cst` in both lanes for `loopCode` to use.
+#define NE10_DstSrcCst_SECONDLOOP_FLOAT_NEON(loopCode) { \
+ float32x2_t n_tmp_src = { 0.0f , 0.0f }; /* temporary storage to be used with NEON load/store intrinsics */ \
+ float32x2_t n_tmp_cst = { cst, cst }; /* temporary constant value for use in the main NEON operation */ \
+ n_tmp_src = vld1_lane_f32 ( (float32_t*)src, n_tmp_src, 0); /* load into the first lane of d0 */ \
+ loopCode; /* the actual operation is placed here ... */ /* exceptional cases where the count is not a multiple of 4 */ \
+ vst1_lane_f32( (float32_t*)dst, n_tmp_src, 0); /* store the lane back into the memory */ \
+ /* move to the next item in the stream */ \
+ src++; \
+ dst++; \
+ }
+
+// Function-body template for the NEON float "DstSrcCst" functions.
+//   checkPointer - pointer-overlap check, run once
+//   loopCode1    - main-loop step, run while more than `dif` items remain (4 items per pass)
+//   loopCode2    - tail step, run `dif` times, where dif = count % 4
+// Declares `n_src` / `n_dst` for the loop steps and ends with `return res;`.
+// Note that `count` itself is decremented by the main loop.
+#define NE10_DstSrcCst_OPERATION_FLOAT_NEON(checkPointer, loopCode1, loopCode2) { \
+ ne10_result_t res = NE10_OK; \
+ float32x4_t n_src; \
+ float32x4_t n_dst; \
+ checkPointer; \
+ int dif = 0; \
+ dif = count % 4; /* either 0 or one of 1,2,3; in the latter cases the second path is taken */ \
+ for (; count > dif; count -= 4) { \
+ loopCode1; \
+ } \
+ if ( 0 != dif ) { \
+ unsigned int idx; \
+ for ( idx = 0 ; idx < dif; idx++ ) { \
+ loopCode2; \
+ } \
+ } \
+ return res; \
+ }
+
+///// - VEC2F - /////
+
+// One main-loop step for 2D vectors: load four floats (two vectors) from `src` into
+// `n_src`, run `loopCode`, then store `n_dst` to `dst`; both pointers advance by two.
+// Nothing in this macro writes `n_dst`, so `loopCode` has to produce it.
+// `n_src` / `n_dst` are declared by NE10_DstSrcCst_OPERATION_VEC2F_NEON.
+#define NE10_DstSrcCst_MAINLOOP_VEC2F_NEON(loopCode) { \
+ n_src = vld1q_f32( (float32_t*)src ); /* load two vectors */ \
+ src += 2; /* move to the next two vectors */ \
+ loopCode; /* actual operation */ /* The main loop iterates through two 2D vectors each time */ \
+ vst1q_f32 ( (float32_t*)dst , n_dst ); /* store back */ \
+ dst += 2; /* move to the next 2 vectors */ \
+ }
+
+// Tail step for 2D vectors: handles the single leftover vector.
+// Loads two floats from `src` into `n_tmp_src`, runs `loopCode`, then stores
+// `n_tmp_src` to `dst` - so `loopCode` must leave its result in `n_tmp_src`.
+// `n_tmp_cst` holds { cst->x, cst->y } for `loopCode` to use.
+// The pointers are not advanced here; the driver macro runs this step at most once.
+#define NE10_DstSrcCst_SECONDLOOP_VEC2F_NEON(loopCode) { \
+ float32x2_t n_tmp_src; \
+ float32x2_t n_tmp_cst = { cst->x, cst->y }; \
+ n_tmp_src = vld1_f32( (float32_t*)src ); \
+ loopCode; /* exceptional cases where the count isn't a multiple of 2 */ \
+ vst1_f32( (float32_t*)dst, n_tmp_src); \
+ }
+
+// Function-body template for the NEON 2D-vector "DstSrcCst" functions.
+//   checkPointer - pointer-overlap check, run once
+//   loopCode1    - main-loop step, run while more than `dif` vectors remain (2 per pass)
+//   loopCode2    - tail step, run once when count is odd (dif = count % 2)
+// Declares `n_cst` as { x, y, x, y } of the constant, plus `n_src` / `n_dst`, for the
+// loop steps, and ends with `return res;`. `count` itself is decremented by the main loop.
+#define NE10_DstSrcCst_OPERATION_VEC2F_NEON(checkPointer, loopCode1, loopCode2) { \
+ ne10_result_t res = NE10_OK; \
+ float32x4_t n_cst = { cst->x, cst->y, cst->x, cst->y }; \
+ float32x4_t n_src; \
+ float32x4_t n_dst; \
+ checkPointer; \
+ int dif = count % 2; \
+ for (; count > dif; count -= 2) { \
+ loopCode1; \
+ } \
+ if ( 0 != dif ) { \
+ loopCode2; \
+ } \
+ return res; \
+ }
+
+///// - VEC3F - /////
+
+/* Main-loop step for the DstSrcCst VEC3F kernels: loads 12 consecutive floats from src
+   into n_src1..n_src3, runs loopCode, then stores n_dst1..n_dst3 to dst.
+   src and dst are each advanced by 12 ne10_float32_t elements in total. */
+#define NE10_DstSrcCst_MAINLOOP_VEC3F_NEON(loopCode) { \
+ n_src1 = vld1q_f32( (float32_t*)src ); \
+ src = ((void*)src)+(4*sizeof(ne10_float32_t)); /* byte-wise advance via void* arithmetic */ \
+ n_src2 = vld1q_f32( (float32_t*)src ); \
+ src = ((void*)src)+(4*sizeof(ne10_float32_t)); \
+ n_src3 = vld1q_f32( (float32_t*)src ); \
+ src = ((void*)src)+(4*sizeof(ne10_float32_t)); \
+ loopCode; /* The main loop iterates through four 3D vectors (12 floats in three float32x4_t values) each time */ \
+ vst1q_f32 ( (float32_t*)dst , n_dst1 ); \
+ dst = ((void*)dst)+(4*sizeof(ne10_float32_t)); \
+ vst1q_f32 ( (float32_t*)dst , n_dst2 ); \
+ dst = ((void*)dst)+(4*sizeof(ne10_float32_t)); \
+ vst1q_f32 ( (float32_t*)dst , n_dst3 ); \
+ dst = ((void*)dst)+(4*sizeof(ne10_float32_t)); \
+ }
+
+/* Tail step for the DstSrcCst VEC3F kernels: handles one leftover 3D vector.
+   loopCode is expected to leave its result in n_tmp_src, which is what gets stored. */
+#define NE10_DstSrcCst_SECONDLOOP_VEC3F_NEON(loopCode) { \
+ float32x2x3_t n_tmp_src = FLOAT32_2x3( \
+ 0.0f, 0.0f, 0.0f , 0.0f, 0.0f , 0.0f); \
+ float32x2x3_t n_tmp_cst = FLOAT32_2x3( \
+ cst->x, 0, cst->y, 0, cst->z, 0); \
+ n_tmp_src = vld3_lane_f32 ( (float32_t*)src, n_tmp_src, 0); /* load one vector into lane 0 */ \
+ loopCode; /* exceptional cases where the count isn't a multiple of 4 (see count % 4 in the OPERATION macro) */ \
+ vst3_lane_f32( (float32_t*)dst, n_tmp_src, 0); /* store lane 0 back to memory */ \
+ src++; \
+ dst++; \
+ }
+
+#define NE10_DstSrcCst_OPERATION_VEC3F_NEON(checkPointer, loopCode1, loopCode2) { \
+ ne10_result_t res = NE10_OK; \
+ float32x4_t n_cst1 = { cst->x, cst->y, cst->z, cst->x }; \
+ float32x4_t n_cst2 = { cst->y, cst->z, cst->x, cst->y }; \
+ float32x4_t n_cst3 = { cst->z, cst->x, cst->y, cst->z }; \
+ float32x4_t n_src1, n_src2, n_src3; \
+ float32x4_t n_dst1, n_dst2, n_dst3; \
+ checkPointer; \
+ int dif = count % 4; \
+ for (; count > dif; count -= 4) { \
+ loopCode1; \
+ } \
+ if ( 0 != dif ) { \
+ unsigned int idx; \
+ for ( idx = 0 ; idx < dif; idx++ ) { \
+ loopCode2; \
+ } \
+ } \
+ return res; \
+ }
+
+///// - VEC4F - /////
+
+/* Note that for the VEC4* types, we do not need a second loop as the number
+ of input items is always a multiple of four. */
+
+#define NE10_DstSrcCst_MAINLOOP_VEC4F_NEON(loopCode) { \
+ n_src = vld1q_f32( (float32_t*)src ); \
+ src ++; \
+ loopCode; \
+ vst1q_f32 ( (float32_t*)dst , n_dst ); /* The main loop iterates through one 4D vector each time */ \
+ dst ++; \
+ }
+
+#define NE10_DstSrcCst_OPERATION_VEC4F_NEON(checkPointer, loopCode) { \
+ ne10_result_t res = NE10_OK; \
+ float32x4_t n_cst = { cst->x, cst->y, cst->z, cst->w }; \
+ float32x4_t n_src; \
+ float32x4_t n_dst; \
+ checkPointer; \
+ for (; count != 0; count --) { \
+ loopCode; \
+ } \
+ return res; \
+ }
+
+/****************************************************
+ * *
+ * The "DstAccSrcCst" group of functions *
+ * *
+ ****************************************************/
+
+///// - FLOAT - /////
+
+/* Main-loop step for the DstAccSrcCst FLOAT kernels: loads four floats from acc and from src,
+   runs loopCode, and stores n_dst to dst. All three pointers advance by four elements. */
+#define NE10_DstAccSrcCst_MAINLOOP_FLOAT_NEON(loopCode) { \
+ /* load 4 accumulator values and 4 source values */ \
+ n_acc = vld1q_f32( (float32_t*)acc ); \
+ n_src = vld1q_f32( (float32_t*)src ); \
+ acc += 4; /* move to the next 4 float items; 4*float */ \
+ src += 4; \
+ loopCode; /* the actual operation is placed here... */ /* The main loop iterates through four float values each time */ \
+ vst1q_f32 ( (float32_t*)dst , n_dst ); /* store the results back */ \
+ dst += 4; /* move to the next items; 4*float */ \
+ }
+
+/* Tail step for the DstAccSrcCst FLOAT kernels: processes a single leftover float.
+   loopCode is expected to leave its result in lane 0 of n_tmp_src, which is what gets stored. */
+#define NE10_DstAccSrcCst_SECONDLOOP_FLOAT_NEON(loopCode) { \
+ float32x2_t n_tmp_acc = { 0.0f , 0.0f }; /* temporary storage to be used with NEON load/store intrinsics */ \
+ float32x2_t n_tmp_src = { 0.0f , 0.0f }; /* temporary storage to be used with NEON load/store intrinsics */ \
+ float32x2_t n_tmp_cst = { cst, cst }; /* temporary constant value for use in the main NEON operation */ \
+ n_tmp_acc = vld1_lane_f32 ( (float32_t*)acc, n_tmp_acc, 0); /* load one accumulator value into lane 0 */ \
+ n_tmp_src = vld1_lane_f32 ( (float32_t*)src, n_tmp_src, 0); /* load one source value into lane 0 */ \
+ loopCode; /* the actual operation is placed here ... */ /* exceptional cases where the count is not a multiple of 4 */ \
+ vst1_lane_f32( (float32_t*)dst, n_tmp_src, 0); /* store the lane back into the memory */ \
+ /* move to the next item in the stream */ \
+ acc++; \
+ src++; \
+ dst++; \
+ }
+
+#define NE10_DstAccSrcCst_OPERATION_FLOAT_NEON NE10_DstSrcCst_OPERATION_FLOAT_NEON
+
+///// - VEC2F - /////
+
+#define NE10_DstAccSrcCst_MAINLOOP_VEC2F_NEON(loopCode) { \
+ n_acc = vld1q_f32( (float32_t*)acc ); /* load two vectors */ \
+ n_src = vld1q_f32( (float32_t*)src ); /* load two vectors */ \
+ acc += 2; /* move to the next two vectors */ \
+ src += 2; \
+ loopCode; /* actual operation */ /* The main loop iterates through two 2D vectors each time */ \
+ vst1q_f32 ( (float32_t*)dst , n_dst ); /* store back */ \
+ dst += 2; /* move to the next 2 vectors */ \
+ }
+
+#define NE10_DstAccSrcCst_SECONDLOOP_VEC2F_NEON(loopCode) { \
+ float32x2_t n_tmp_acc; \
+ float32x2_t n_tmp_src; \
+ float32x2_t n_tmp_cst = { cst->x, cst->y }; \
+ n_tmp_acc = vld1_f32( (float32_t*)acc ); \
+ n_tmp_src = vld1_f32( (float32_t*)src ); \
+ loopCode; /* exceptional cases where the count isn't a multiple of 2 */ \
+ vst1_f32( (float32_t*)dst, n_tmp_src); \
+ }
+
+#define NE10_DstAccSrcCst_OPERATION_VEC2F_NEON NE10_DstSrcCst_OPERATION_VEC2F_NEON
+
+///// - VEC3F - /////
+
+/* Main-loop step for the DstAccSrcCst VEC3F kernels: loads 12 floats from acc and 12 from src,
+   runs loopCode, then stores n_dst1..n_dst3 (12 floats) to dst. */
+#define NE10_DstAccSrcCst_MAINLOOP_VEC3F_NEON(loopCode) { \
+ n_acc1 = vld1q_f32( (float32_t*)acc ); /* Load accumulator values */ \
+ acc = ((void*)acc)+(4*sizeof(ne10_float32_t)); /* byte-wise advance via void* arithmetic */ \
+ n_acc2 = vld1q_f32( (float32_t*)acc ); \
+ acc = ((void*)acc)+(4*sizeof(ne10_float32_t)); \
+ n_acc3 = vld1q_f32( (float32_t*)acc ); \
+ acc = ((void*)acc)+(4*sizeof(ne10_float32_t)); \
+ n_src1 = vld1q_f32( (float32_t*)src ); /* Load source values */ \
+ src = ((void*)src)+(4*sizeof(ne10_float32_t)); \
+ n_src2 = vld1q_f32( (float32_t*)src ); \
+ src = ((void*)src)+(4*sizeof(ne10_float32_t)); \
+ n_src3 = vld1q_f32( (float32_t*)src ); \
+ src = ((void*)src)+(4*sizeof(ne10_float32_t)); \
+ loopCode; /* The main loop iterates through four 3D vectors (12 floats in three float32x4_t values) each time */ \
+ vst1q_f32 ( (float32_t*)dst , n_dst1 ); /* Store the results back into the memory */ \
+ dst = ((void*)dst)+(4*sizeof(ne10_float32_t)); \
+ vst1q_f32 ( (float32_t*)dst , n_dst2 ); \
+ dst = ((void*)dst)+(4*sizeof(ne10_float32_t)); \
+ vst1q_f32 ( (float32_t*)dst , n_dst3 ); \
+ dst = ((void*)dst)+(4*sizeof(ne10_float32_t)); \
+ }
+
+/* Tail step for the DstAccSrcCst VEC3F kernels: handles one leftover 3D vector.
+   loopCode is expected to leave its result in n_tmp_src, which is what gets stored. */
+#define NE10_DstAccSrcCst_SECONDLOOP_VEC3F_NEON(loopCode) { \
+ float32x2x3_t n_tmp_acc = FLOAT32_2x3( \
+ 0.0f, 0.0f, \
+ 0.0f, 0.0f, \
+ 0.0f, 0.0f \
+ ); \
+ float32x2x3_t n_tmp_src = FLOAT32_2x3( \
+ 0.0f, 0.0f, \
+ 0.0f, 0.0f, \
+ 0.0f, 0.0f \
+ ); \
+ float32x2x3_t n_tmp_cst = FLOAT32_2x3( \
+ cst->x, 0, \
+ cst->y, 0, \
+ cst->z, 0 \
+ ); \
+ n_tmp_acc = vld3_lane_f32 ( (float32_t*)acc, n_tmp_acc, 0); /* load one accumulator vector into lane 0 */ \
+ n_tmp_src = vld3_lane_f32 ( (float32_t*)src, n_tmp_src, 0); /* load one source vector into lane 0 */ \
+ loopCode; /* exceptional cases where the count isn't a multiple of 4 (see count % 4 in the OPERATION macro) */ \
+ vst3_lane_f32( (float32_t*)dst, n_tmp_src, 0); /* store lane 0 back to memory */ \
+ acc++; \
+ src++; \
+ dst++; \
+ }
+
+#define NE10_DstAccSrcCst_OPERATION_VEC3F_NEON NE10_DstSrcCst_OPERATION_VEC3F_NEON
+
+///// - VEC4F - /////
+
+#define NE10_DstAccSrcCst_MAINLOOP_VEC4F_NEON(loopCode) { \
+ n_acc = vld1q_f32( (float32_t*)acc ); \
+ n_src = vld1q_f32( (float32_t*)src ); \
+ acc ++; \
+ src ++; \
+ loopCode; \
+ vst1q_f32 ( (float32_t*)dst , n_dst ); /* The main loop iterates through one 4D vector each time */ \
+ dst ++; \
+ }
+
+#define NE10_DstAccSrcCst_OPERATION_VEC4F_NEON NE10_DstSrcCst_OPERATION_VEC4F_NEON
+
+/****************************************************
+ * *
+ * The "DstCst" group of functions *
+ * *
+ ****************************************************/
+
+///// - FLOAT - /////
+
+/* Main-loop step for the DstCst FLOAT kernels: runs loopCode, then writes the four-float
+   vector n_cst to dst and advances dst by four elements. */
+#define NE10_DstCst_MAINLOOP_FLOAT_NEON(loopCode) { \
+ /* nothing is loaded here: there is no source stream, only the constant vector n_cst */ \
+ loopCode; /* the actual operation is placed here... */ /* The main loop iterates through four float values each time */ \
+ vst1q_f32 ( (float32_t*)dst , n_cst ); /* store the results back */ \
+ dst += 4; /* move to the next items; 4*float */ \
+ }
+
+/* Tail step for the DstCst FLOAT kernels: runs loopCode, then writes lane 0 of n_tmp_cst
+   (a single float) to dst and advances dst by one element. */
+#define NE10_DstCst_SECONDLOOP_FLOAT_NEON(loopCode) { \
+ float32x2_t n_tmp_cst = { cst, cst }; /* temporary constant value for use in the main NEON operation */ \
+ loopCode; /* the actual operation is placed here ... */ /* exceptional cases where the count is not a multiple of 4 */ \
+ vst1_lane_f32( (float32_t*)dst, n_tmp_cst, 0); /* store the lane back into the memory */ \
+ /* move to the next item in the stream */ \
+ dst++; \
+ }
+
+#define NE10_DstCst_OPERATION_FLOAT_NEON(checkPointer, loopCode1, loopCode2) { \
+ ne10_result_t res = NE10_OK; \
+ checkPointer; \
+ int dif = 0; \
+ dif = count % 4; /* either 0 or one of 1,2,3; in the latter cases the second path is taken */ \
+ for (; count > dif; count -= 4) { \
+ loopCode1; \
+ } \
+ if ( 0 != dif ) { \
+ unsigned int idx; \
+ for ( idx = 0 ; idx < dif; idx++ ) { \
+ loopCode2; \
+ } \
+ } \
+ return res; \
+ }
+
+///// - VEC2F - /////
+
+
+#define NE10_DstCst_MAINLOOP_VEC2F_NEON(loopCode) { \
+ loopCode; /* actual operation */ /* The main loop iterates through two 2D vectors each time */ \
+ vst1q_f32 ( (float32_t*)dst , n_cst ); /* store back */ \
+ dst += 2; /* move to the next 2 vectors */ \
+ }
+
+#define NE10_DstCst_SECONDLOOP_VEC2F_NEON(loopCode) { \
+ float32x2_t n_tmp_cst = { cst->x, cst->y }; \
+ loopCode; /* exceptional cases where the count isn't a multiple of 2 */ \
+ vst1_f32( (float32_t*)dst, n_tmp_cst); \
+ }
+
+#define NE10_DstCst_OPERATION_VEC2F_NEON(checkPointer, loopCode1, loopCode2) { \
+ ne10_result_t res = NE10_OK; \
+ float32x4_t n_cst = { cst->x, cst->y, cst->x, cst->y }; \
+ checkPointer; \
+ int dif = count % 2; \
+ for (; count > dif; count -= 2) { \
+ loopCode1; \
+ } \
+ if ( 0 != dif ) { \
+ loopCode2; \
+ } \
+ return res; \
+ }
+
+///// - VEC3F - /////
+
+/* Main-loop step for the DstCst VEC3F kernels: runs loopCode, then writes n_cst1..n_cst3
+   (12 floats) to dst, advancing dst by 12 ne10_float32_t elements in total. */
+#define NE10_DstCst_MAINLOOP_VEC3F_NEON(loopCode) { \
+ loopCode; /* The main loop iterates through four 3D vectors (12 floats in three float32x4_t values) each time */ \
+ vst1q_f32 ( (float32_t*)dst , n_cst1 ); \
+ dst = ((void*)dst)+(4*sizeof(ne10_float32_t)); /* byte-wise advance via void* arithmetic */ \
+ vst1q_f32 ( (float32_t*)dst , n_cst2 ); \
+ dst = ((void*)dst)+(4*sizeof(ne10_float32_t)); \
+ vst1q_f32 ( (float32_t*)dst , n_cst3 ); \
+ dst = ((void*)dst)+(4*sizeof(ne10_float32_t)); \
+ }
+
+/* Tail step for the DstCst VEC3F kernels: runs loopCode, then writes lane 0 of n_tmp_cst
+   (one 3D vector) to dst and advances dst by one element. */
+#define NE10_DstCst_SECONDLOOP_VEC3F_NEON(loopCode) { \
+ float32x2x3_t n_tmp_cst = FLOAT32_2x3( \
+ cst->x, 0, \
+ cst->y, 0, \
+ cst->z, 0 \
+ ); \
+ loopCode; /* exceptional cases where the count isn't a multiple of 4 (see count % 4 in the OPERATION macro) */ \
+ vst3_lane_f32( (float32_t*)dst, n_tmp_cst, 0); /* store lane 0 back to memory */ \
+ dst++; \
+ }
+
+#define NE10_DstCst_OPERATION_VEC3F_NEON(checkPointer, loopCode1, loopCode2) { \
+ ne10_result_t res = NE10_OK; \
+ float32x4_t n_cst1 = { cst->x, cst->y, cst->z, cst->x }; \
+ float32x4_t n_cst2 = { cst->y, cst->z, cst->x, cst->y }; \
+ float32x4_t n_cst3 = { cst->z, cst->x, cst->y, cst->z }; \
+ checkPointer; \
+ int dif = count % 4; \
+ for (; count > dif; count -= 4) { \
+ loopCode1; \
+ } \
+ if ( 0 != dif ) { \
+ unsigned int idx; \
+ for ( idx = 0 ; idx < dif; idx++ ) { \
+ loopCode2; \
+ } \
+ } \
+ return res; \
+ }
+
+///// - VEC4F - /////
+
+#define NE10_DstCst_MAINLOOP_VEC4F_NEON(loopCode) { \
+ loopCode; \
+ vst1q_f32 ( (float32_t*)dst , n_cst ); /* The main loop iterates through one 4D vector each time */ \
+ dst ++; \
+ }
+
+#define NE10_DstCst_OPERATION_VEC4F_NEON(checkPointer, loopCode) { \
+ ne10_result_t res = NE10_OK; \
+ float32x4_t n_cst = { cst->x, cst->y, cst->z, cst->w }; \
+ checkPointer; \
+ for (; count != 0; count --) { \
+ loopCode; \
+ } \
+ return res; \
+ }
+
+/****************************************************
+ * *
+ * The "DstSrc1Src2" group of functions *
+ * *
+ ****************************************************/
+
+///// - FLOAT - /////
+
+#define NE10_DstSrc1Src2_MAINLOOP_FLOAT_NEON(loopCode) { \
+ /* load 4 values */ \
+ n_src = vld1q_f32( (float32_t*)src1 ); \
+ src1 += 4; /* move to the next 4 float items; 4*float */ \
+ n_src2 = vld1q_f32( (float32_t*)src2 ); \
+ src2 += 4; /* move to the next 4 float items; 4*float */ \
+ loopCode; /* the actual operation is placed here... */ /* The main loop iterates through four float values each time */ \
+ vst1q_f32 ( (float32_t*)dst , n_dst ); /* store the results back */ \
+ dst += 4; /* move to the next items; 4*float */ \
+ }
+
+/* Tail step for the DstSrc1Src2 FLOAT kernels: processes a single leftover float.
+   One value is loaded from src1 and one from src2 into lane 0 of their own temporaries;
+   loopCode is expected to leave its result in lane 0 of n_tmp_src, which is what gets stored. */
+#define NE10_DstSrc1Src2_SECONDLOOP_FLOAT_NEON(loopCode) { \
+ float32x2_t n_tmp_src = { 0.0f , 0.0f }; /* temporary storage to be used with NEON load/store intrinsics */ \
+ float32x2_t n_tmp_src2 = { 0.0f , 0.0f }; \
+ n_tmp_src = vld1_lane_f32 ( (float32_t*)src1, n_tmp_src, 0); /* load one src1 value into lane 0 */ \
+ n_tmp_src2 = vld1_lane_f32 ( (float32_t*)src2, n_tmp_src2, 0); /* load one src2 value into lane 0; base vector is n_tmp_src2 itself so lane 1 never depends on src1 */ \
+ loopCode; /* the actual operation is placed here ... */ /* exceptional cases where the count is not a multiple of 4 */ \
+ vst1_lane_f32( (float32_t*)dst, n_tmp_src, 0); /* store the lane back into the memory */ \
+ /* move to the next item in the stream */ \
+ src1++; \
+ src2++; \
+ dst++; \
+ }
+
+#define NE10_DstSrc1Src2_OPERATION_FLOAT_NEON NE10_DstSrcCst_OPERATION_FLOAT_NEON
+
+/****************************************************
+ * *
+ * The "DstAccSrc1Src2" group of functions *
+ * *
+ ****************************************************/
+
+///// - FLOAT - /////
+
+/* Main-loop step for the DstAccSrc1Src2 FLOAT kernels: loads four floats each from acc, src1
+   and src2, runs loopCode, and stores n_dst to dst. All four pointers advance by four elements. */
+#define NE10_DstAccSrc1Src2_MAINLOOP_FLOAT_NEON(loopCode) { \
+ /* load 4 values from each of the three input streams */ \
+ n_acc = vld1q_f32( (float32_t*)acc ); \
+ n_src = vld1q_f32( (float32_t*)src1 ); \
+ n_src2 = vld1q_f32( (float32_t*)src2 ); \
+ acc += 4; /* move to the next 4 float items; 4*float */ \
+ src1 += 4; \
+ src2 += 4; \
+ loopCode; /* the actual operation is placed here... */ /* The main loop iterates through four float values each time */ \
+ vst1q_f32 ( (float32_t*)dst , n_dst ); /* store the results back */ \
+ dst += 4; /* move to the next items; 4*float */ \
+ }
+
+/* Tail step for the DstAccSrc1Src2 FLOAT kernels: processes a single leftover float.
+   loopCode is expected to leave its result in lane 0 of n_tmp_src, which is what gets stored. */
+#define NE10_DstAccSrc1Src2_SECONDLOOP_FLOAT_NEON(loopCode) { \
+ float32x2_t n_tmp_acc = { 0.0f , 0.0f }; /* temporary storage to be used with NEON load/store intrinsics */ \
+ float32x2_t n_tmp_src = { 0.0f , 0.0f }; \
+ float32x2_t n_tmp_src2 = { 0.0f, 0.0f }; \
+ n_tmp_acc = vld1_lane_f32 ( (float32_t*)acc, n_tmp_acc, 0); /* load one accumulator value into lane 0 */ \
+ n_tmp_src = vld1_lane_f32 ( (float32_t*)src1, n_tmp_src, 0); /* load one src1 value into lane 0 */ \
+ n_tmp_src2 = vld1_lane_f32 ( (float32_t*)src2, n_tmp_src2, 0); /* load one src2 value into lane 0 */ \
+ loopCode; /* the actual operation is placed here ... */ /* exceptional cases where the count is not a multiple of 4 */ \
+ vst1_lane_f32( (float32_t*)dst, n_tmp_src, 0); /* store the lane back into the memory */ \
+ /* move to the next item in the stream */ \
+ acc++; \
+ src1++; \
+ src2++; \
+ dst++; \
+ }
+
+#define NE10_DstAccSrc1Src2_OPERATION_FLOAT_NEON NE10_DstAccSrcCst_OPERATION_FLOAT_NEON
+
+/****************************************************
+ * *
+ * The "DstSrc" group of functions *
+ * *
+ ****************************************************/
+
+///// - FLOAT - /////
+
+#define NE10_DstSrc_MAINLOOP_FLOAT_NEON NE10_DstSrcCst_MAINLOOP_FLOAT_NEON
+
+#define NE10_DstSrc_SECONDLOOP_FLOAT_NEON NE10_DstSrcCst_SECONDLOOP_FLOAT_NEON
+
+#define NE10_DstSrc_OPERATION_FLOAT_NEON NE10_DstSrcCst_OPERATION_FLOAT_NEON
+
+///// - VEC2F - /////
+
+#define NE10_DstSrc_MAINLOOP_VEC2F_NEON(loopCode) { \
+ n_src = vld2_f32( (float32_t*)src ); /* load two vectors */ \
+ src += 2; /* move to the next two vectors */ \
+ loopCode; /* actual operation */ /* The main loop iterates through two 2D vectors each time */ \
+ /* store the results and increment the destination pointer within the loopCode */ \
+ }
+
+#define NE10_DstSrc_SECONDLOOP_VEC2F_NEON(loopCode) { \
+ loopCode; /* exceptional cases where the count isn't a multiple of 2 */ \
+ /* store the results within the loopCode */ \
+ }
+
+#define NE10_DstSrc_OPERATION_VEC2F_NEON(checkPointer, loopCode1, loopCode2) { \
+ ne10_result_t res = NE10_OK; \
+ float32x2x2_t n_src; \
+ float32x2_t n_dst; \
+ checkPointer; \
+ int dif = count % 2; \
+ for (; count > dif; count -= 2) { \
+ loopCode1; \
+ } \
+ if ( 0 != dif ) { \
+ loopCode2; \
+ } \
+ return res; \
+ }
+
+///// - VEC3F - /////
+
+#define NE10_DstSrc_MAINLOOP_VEC3F_NEON(loopCode) { \
+ n_src = vld3q_f32( (float32_t*)src ); \
+ src = ((void*)src)+(12*sizeof(ne10_float32_t)); \
+ loopCode; /* The main loop iterates through four 3D vectors each time */ \
+ /* store the results and increment the destination pointer within the loopCode */ \
+ }
+
+#define NE10_DstSrc_SECONDLOOP_VEC3F_NEON(loopCode) { \
+ loopCode; /* exceptional cases where the count isn't a multiple of 4 */ \
+ /* store the results within the loopCode */ \
+ }
+
+#define NE10_DstSrc_OPERATION_VEC3F_NEON(checkPointer, loopCode1, loopCode2) { \
+ ne10_result_t res = NE10_OK; \
+ float32x4x3_t n_src; \
+ float32x4_t n_dst; \
+ checkPointer; \
+ int dif = count % 4; \
+ for (; count > dif; count -= 4) { \
+ loopCode1; \
+ } \
+ if ( 0 != dif ) { \
+ unsigned int idx; \
+ for ( idx = 0 ; idx < dif; idx++ ) { \
+ loopCode2; \
+ } \
+ } \
+ return res; \
+ }
+
+///// - VEC4F - /////
+
+/* Note that for the VEC4* types, we do not need a second loop as the number
+ of input items is always a multiple of four. */
+
+#define NE10_DstSrc_MAINLOOP_VEC4F_NEON(loopCode) { \
+ n_src = vld1q_f32( (float32_t*)src ); \
+ src ++; \
+ loopCode; \
+ /* store the results and increment the destination pointer within the loopCode */ \
+ }
+
+#define NE10_DstSrc_OPERATION_VEC4F_NEON(checkPointer, loopCode) { \
+ ne10_result_t res = NE10_OK; \
+ float32x4_t n_src; \
+ checkPointer; \
+ for (; count != 0; count --) { \
+ loopCode; \
+ } \
+ return res; \
+ }
+
--- /dev/null
+/*
+ * Copyright 2011-12 ARM Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NE10 Library : common/macros.h
+ */
+
+#include "factor.h"
+
+// Macros used in actual implementations
+
+///// The "DstSrcCst" group of functions - FLOAT /////
+
+#define NE10_XC_OPERATION_X_C(loopCode) { \
+ NE10_TEMPLATE_XC_OPERATION_X_C( \
+ NE10_CHECKPOINTER_DstSrcCst_OPERATION; , \
+ loopCode); \
+ }
+
+#define NE10_XC_OPERATION_FLOAT_NEON(loopCode1, loopCode2) { \
+ float32x4_t n_cst = { cst, cst, cst, cst }; \
+ NE10_DstSrcCst_OPERATION_FLOAT_NEON( \
+ NE10_CHECKPOINTER_DstSrcCst_OPERATION; , \
+ NE10_DstSrcCst_MAINLOOP_FLOAT_NEON(loopCode1); , \
+ NE10_DstSrcCst_SECONDLOOP_FLOAT_NEON(loopCode2); \
+ ); \
+ }
+
+#define NE10_XC_OPERATION_VEC2F_NEON(loopCode1, loopCode2) { \
+ NE10_DstSrcCst_OPERATION_VEC2F_NEON( \
+ NE10_CHECKPOINTER_DstSrcCst_OPERATION; , \
+ NE10_DstSrcCst_MAINLOOP_VEC2F_NEON(loopCode1); , \
+ NE10_DstSrcCst_SECONDLOOP_VEC2F_NEON(loopCode2); \
+ ); \
+ }
+
+/* This macro uses interleaving to boost the performance */
+#define NE10_XC_OPERATION_VEC3F_NEON(loopCode1, loopCode2) { \
+ NE10_DstSrcCst_OPERATION_VEC3F_NEON( \
+ NE10_CHECKPOINTER_DstSrcCst_OPERATION; , \
+ NE10_DstSrcCst_MAINLOOP_VEC3F_NEON(loopCode1); , \
+ NE10_DstSrcCst_SECONDLOOP_VEC3F_NEON(loopCode2); \
+ ); \
+ }
+
+#define NE10_XC_OPERATION_VEC4F_NEON(loopCode) { \
+ NE10_DstSrcCst_OPERATION_VEC4F_NEON( \
+ NE10_CHECKPOINTER_DstSrcCst_OPERATION; , \
+ NE10_DstSrcCst_MAINLOOP_VEC4F_NEON(loopCode); \
+ ); \
+ }
+
+///// The "DstAccSrcCst" group of functions - FLOAT //////
+
+#define NE10_MLAC_OPERATION_X_C(loopCode) { \
+ NE10_TEMPLATE_XC_OPERATION_X_C( \
+ NE10_CHECKPOINTER_DstAccSrcCst_OPERATION; , \
+ loopCode); \
+ }
+
+#define NE10_MLAC_OPERATION_FLOAT_NEON(loopCode1, loopCode2) { \
+ float32x4_t n_acc; \
+ float32x4_t n_cst = { cst, cst, cst, cst }; \
+ NE10_DstAccSrcCst_OPERATION_FLOAT_NEON( \
+ NE10_CHECKPOINTER_DstAccSrcCst_OPERATION; , \
+ NE10_DstAccSrcCst_MAINLOOP_FLOAT_NEON(loopCode1); , \
+ NE10_DstAccSrcCst_SECONDLOOP_FLOAT_NEON(loopCode2); \
+ ); \
+ }
+
+#define NE10_MLAC_OPERATION_VEC2F_NEON(loopCode1, loopCode2) { \
+ float32x4_t n_acc; \
+ NE10_DstAccSrcCst_OPERATION_VEC2F_NEON( \
+ NE10_CHECKPOINTER_DstAccSrcCst_OPERATION; , \
+ NE10_DstAccSrcCst_MAINLOOP_VEC2F_NEON(loopCode1); , \
+ NE10_DstAccSrcCst_SECONDLOOP_VEC2F_NEON(loopCode2); \
+ ); \
+ }
+
+#define NE10_MLAC_OPERATION_VEC3F_NEON(loopCode1, loopCode2) { \
+ float32x4_t n_acc1, n_acc2, n_acc3; \
+ NE10_DstAccSrcCst_OPERATION_VEC3F_NEON( \
+ NE10_CHECKPOINTER_DstAccSrcCst_OPERATION; , \
+ NE10_DstAccSrcCst_MAINLOOP_VEC3F_NEON(loopCode1); , \
+ NE10_DstAccSrcCst_SECONDLOOP_VEC3F_NEON(loopCode2); \
+ ); \
+ }
+
+#define NE10_MLAC_OPERATION_VEC4F_NEON(loopCode) { \
+ float32x4_t n_acc; \
+ NE10_DstAccSrcCst_OPERATION_VEC4F_NEON( \
+ NE10_CHECKPOINTER_DstAccSrcCst_OPERATION; , \
+ NE10_DstAccSrcCst_MAINLOOP_VEC4F_NEON(loopCode); \
+ ); \
+ }
+
+///// The "DstCst" group of functions - FLOAT /////
+
+#define NE10_SETC_OPERATION_X_C(loopCode) { \
+ NE10_TEMPLATE_XC_OPERATION_X_C( \
+ NE10_CHECKPOINTER_DstCst_OPERATION; , \
+ loopCode); \
+ }
+
+#define NE10_SETC_OPERATION_FLOAT_NEON(loopCode1, loopCode2) { \
+ float32x4_t n_cst = { cst, cst, cst, cst }; \
+ NE10_DstCst_OPERATION_FLOAT_NEON( \
+ NE10_CHECKPOINTER_DstCst_OPERATION; , \
+ NE10_DstCst_MAINLOOP_FLOAT_NEON(loopCode1); , \
+ NE10_DstCst_SECONDLOOP_FLOAT_NEON(loopCode2); \
+ ); \
+ }
+
+#define NE10_SETC_OPERATION_VEC2F_NEON(loopCode1, loopCode2) { \
+ NE10_DstCst_OPERATION_VEC2F_NEON( \
+ NE10_CHECKPOINTER_DstCst_OPERATION; , \
+ NE10_DstCst_MAINLOOP_VEC2F_NEON(loopCode1); , \
+ NE10_DstCst_SECONDLOOP_VEC2F_NEON(loopCode2); \
+ ); \
+ }
+
+/* This macro uses interleaving to boost the performance */
+#define NE10_SETC_OPERATION_VEC3F_NEON(loopCode1, loopCode2) { \
+ NE10_DstCst_OPERATION_VEC3F_NEON( \
+ NE10_CHECKPOINTER_DstCst_OPERATION; , \
+ NE10_DstCst_MAINLOOP_VEC3F_NEON(loopCode1); , \
+ NE10_DstCst_SECONDLOOP_VEC3F_NEON(loopCode2); \
+ ); \
+ }
+
+#define NE10_SETC_OPERATION_VEC4F_NEON(loopCode) { \
+ NE10_DstCst_OPERATION_VEC4F_NEON( \
+ NE10_CHECKPOINTER_DstCst_OPERATION; , \
+ NE10_DstCst_MAINLOOP_VEC4F_NEON(loopCode); \
+ ); \
+ }
+
+///// The "DstSrc1Src2" group of functions //////
+
+#define NE10_X_OPERATION_FLOAT_C(loopCode) { \
+ NE10_TEMPLATE_XC_OPERATION_X_C( \
+ NE10_CHECKPOINTER_DstSrc1Src2_OPERATION; , \
+ loopCode); \
+ }
+
+#define NE10_X_OPERATION_FLOAT_NEON(loopCode1, loopCode2) { \
+ float32x4_t n_src2; \
+ NE10_DstSrc1Src2_OPERATION_FLOAT_NEON( \
+ NE10_CHECKPOINTER_DstSrc1Src2_OPERATION; , \
+ NE10_DstSrc1Src2_MAINLOOP_FLOAT_NEON(loopCode1); , \
+ NE10_DstSrc1Src2_SECONDLOOP_FLOAT_NEON(loopCode2); \
+ ); \
+ }
+
+#define NE10_DOT_OPERATION_X_C NE10_X_OPERATION_FLOAT_C
+
+///// The "DstSrc" group of functions //////
+
+#define NE10_ABS_OPERATION_X_C(loopCode) { \
+ NE10_TEMPLATE_XC_OPERATION_X_C( \
+ NE10_CHECKPOINTER_DstSrc_OPERATION, \
+ loopCode); \
+ }
+
+#define NE10_ABS_OPERATION_FLOAT_C NE10_ABS_OPERATION_X_C
+
+#define NE10_ABS_OPERATION_FLOAT_NEON(loopCode1, loopCode2) { \
+ arm_float_t cst = 0.0f; /* this is used to compare the values against. */ \
+ float32x4_t n_cst = { cst, cst, cst, cst }; \
+ NE10_DstSrc_OPERATION_FLOAT_NEON( \
+ NE10_CHECKPOINTER_DstSrc_OPERATION; , \
+ NE10_DstSrc_MAINLOOP_FLOAT_NEON(loopCode1); , \
+ NE10_DstSrc_SECONDLOOP_FLOAT_NEON(loopCode2); \
+ ); \
+ }
+
+/* Alias: the LEN X_C operation expands to the ABS (DstSrc) X_C operation. */
+#define NE10_LEN_OPERATION_X_C NE10_ABS_OPERATION_X_C
+
+#define NE10_CMATVEC_OPERATION_X_C NE10_ABS_OPERATION_X_C
+
+#define NE10_LEN_OPERATION_VEC2F_NEON(loopCode1, loopCode2) { \
+ NE10_DstSrc_OPERATION_VEC2F_NEON( \
+ NE10_CHECKPOINTER_DstSrcCst_OPERATION, \
+ NE10_DstSrc_MAINLOOP_VEC2F_NEON(loopCode1), \
+ NE10_DstSrc_SECONDLOOP_VEC2F_NEON(loopCode2) \
+ ); \
+ }
+
+#define NE10_LEN_OPERATION_VEC3F_NEON(loopCode1, loopCode2) { \
+ NE10_DstSrc_OPERATION_VEC3F_NEON( \
+ NE10_CHECKPOINTER_DstSrcCst_OPERATION, \
+ NE10_DstSrc_MAINLOOP_VEC3F_NEON(loopCode1), \
+ NE10_DstSrc_SECONDLOOP_VEC3F_NEON(loopCode2) \
+ ); \
+ }
+
+#define NE10_LEN_OPERATION_VEC4F_NEON(loopCode) { \
+ NE10_DstSrc_OPERATION_VEC4F_NEON( \
+ NE10_CHECKPOINTER_DstSrcCst_OPERATION, \
+ NE10_DstSrc_MAINLOOP_VEC4F_NEON(loopCode) \
+ ); \
+ }
+
+#define NE10_DETMAT_OPERATION_X_C NE10_ABS_OPERATION_X_C
+
+///// The "DstAccSrc1Src2" group of functions //////
+
+#define NE10_MLA_OPERATION_FLOAT_NEON(loopCode1, loopCode2) { \
+ float32x4_t n_acc; \
+ float32x4_t n_src2; \
+ NE10_DstAccSrc1Src2_OPERATION_FLOAT_NEON( \
+ NE10_CHECKPOINTER_DstAccSrc1Src2_OPERATION; , \
+ NE10_DstAccSrc1Src2_MAINLOOP_FLOAT_NEON(loopCode1); , \
+ NE10_DstAccSrc1Src2_SECONDLOOP_FLOAT_NEON(loopCode2); \
+ ); \
+ }
--- /dev/null
+/*
+ * Copyright 2011-12 ARM Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NE10 Library : common/versionheader.h
+ */
+
+/////////////////////////////////////////////////////////
+// version information
+/////////////////////////////////////////////////////////
+
+#define VERSION_MAJOR 0
+#define VERSION_MINOR 9
+#define VERSION_REVISION 10
+
+#define PHASE 1
+#define COPYRIGHT_YEAR 2012
+#define COPYRIGHT_HOLDER "ARM Ltd."
--- /dev/null
+@
+@ Copyright 2011-12 ARM Limited
+@
+@ Licensed under the Apache License, Version 2.0 (the "License");
+@ you may not use this file except in compliance with the License.
+@ You may obtain a copy of the License at
+@
+@ http://www.apache.org/licenses/LICENSE-2.0
+@
+@ Unless required by applicable law or agreed to in writing, software
+@ distributed under the License is distributed on an "AS IS" BASIS,
+@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ See the License for the specific language governing permissions and
+@ limitations under the License.
+@
+
+@
+@ NE10 Library : common/versionheader.s
+@
+
+@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+@ version information
+@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ .equ VERSION_MAJOR, 0
+ .equ VERSION_MINOR, 9
+ .equ VERSION_REVISION, 10
+
+ .equ PHASE, 1
+ .equ COPYRIGHT_YEAR, 2012
+
+COPYRIGHT_HOLDER:
+ .asciz "ARM Ltd."
--- /dev/null
+/*
+ * Copyright 2011-12 ARM Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NE10 Library : CMakeBuilding.txt
+ */
+
+=========================BUILDING METHOD=================================
+
+NE10 uses CMake to describe the build in a platform independent manner.
+
+First download and install cmake from cmake.org.
+In Ubuntu, you can install cmake by "sudo apt-get install cmake"
+
+---------------------------NATIVE-COMPILING------------------------------
+For Unix platforms, say the following on a terminal: (Replace $NE10PATH with the directory where this file is located.)
+ cd $NE10PATH
+ mkdir build && cd build
+ cmake ..
+ make
+The static library libNE10.a is then placed in ./modules/ and a sample program "NE10_test_static" is placed in ./samples/. You can run it directly.
+You might want to add -DNE10_BUILD_SHARED=ON to the cmake call to generate the dynamic library and test program "NE10_test_dynamic".
+
+---------------------------CROSS-COMPILING------------------------------
+For cross-compiling, the process is as follows:
+ cd $NE10PATH
+
+Open config.cmake and change the compiler toolchain to your own. The toolchain used here is Linaro GCC 4.6.
+In Ubuntu 11.10 you can install it by "sudo apt-get install gcc-arm-linux-gnueabi".
+ set( CMAKE_C_COMPILER arm-linux-gnueabi-gcc )
+ set( CMAKE_CXX_COMPILER arm-linux-gnueabi-g++ )
+ set( CMAKE_ASM_COMPILER arm-linux-gnueabi-as )
+
+ find_program(CMAKE_AR NAMES "arm-linux-gnueabi-ar")
+ mark_as_advanced(CMAKE_AR)
+
+ find_program(CMAKE_RANLIB NAMES "arm-linux-gnueabi-ranlib")
+ mark_as_advanced(CMAKE_RANLIB)
+
+Now you can use the following commands to generate makefile.
+ mkdir build && cd build
+ cmake -DCMAKE_TOOLCHAIN_FILE=../config.cmake ..
+ make
+
+The static library libNE10.a is then placed in ./modules/ and a sample program "NE10_test_static" is placed in ./samples/. You can copy these to the target and run the sample there.
+You might want to add -DNE10_BUILD_SHARED=ON to the cmake call to generate the dynamic library and test program "NE10_test_dynamic".
+
+Note:
+When you run NE10_test_dynamic on the target, you might encounter the following error:
+ "NE10_test_dynamic: error while loading shared libraries: libNE10_shared.so.10: cannot open shared object file: No such file or directory"
+You can run the following command:
+ export LD_LIBRARY_PATH=$NE10PATH/build/modules
+
+--------------------------------END--------------------------------------
--- /dev/null
+ /*
+ * Copyright 2011-12 ARM Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+=========================CODE FORMATTER=================================
+
+NE10 uses astyle(http://astyle.sourceforge.net/) as code formatter.
+First download and install astyle from its home page.
+Then you can run the script in "tools" folder to indent the source code
+ ./cformatter.sh $PATH/file.c
+After checking the new file, you can remove the backup file.c.orig
+
--- /dev/null
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
--- /dev/null
+NE10 Library
+Copyright 2011-12 ARM Limited
+
+This product was produced by ARM Limited.
--- /dev/null
+/*
+ * Copyright 2011-12 ARM Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NE10 Library : ReleaseNote.txt
+ */
+NE10 SIMD LIBRARY - Release Note
+LAST UPDATED ON: 10 / APR / 2012
+
+========
+Contents
+========
+
+ 1. Preface
+ 1-a. License details
+ 1-b. Product status
+ 2. Release details
+ 2-a. Product release status
+ 2-b. Functions included
+ 2-c. Test cases and results
+ 3. Installation
+ 3-a. Requirements
+ 3-b. Native Building
+ 3-c. Android Building
+ 3-d. Alternative Approach
+ 4. Changelog
+ 4-a. r1.0_beta
+
+
+==========
+1. Preface
+==========
+
+1-a. License details
+--------------------
+NE10 is an open source project released under the Apache License,
+Version 2.0 (ALv2). See the file LICENSE for the full text of the ALv2.
+
+
+1-b. Product status
+-------------------
+This is the first publicly available version of NE10. This open source project
+is actively under development and more functions as well as improved versions of
+the available functions will be contributed to the source code.
+
+
+==================
+2. Release details
+==================
+
+2-a. Product release status
+---------------------------
+Version 1.0 beta
+
+The set of functions planned for this release are in place. However some issues
+remain where their intended behaviour diverges from the planned specification:
+
+In the release version, unless impractical and explicitly stated, all functions
+will operate correctly when the output area of the result is the same as one
+of the input areas. (ie. where the src1 or src2 parameter == the dst parameter)
+
+In this beta release that behaviour cannot be assumed.
+
+2-b. Functions included
+---------------------------
+NE10 is a software library that provides Linux and Android support for Single
+Instruction Multiple Data (SIMD) functionality. In this release, a number of
+mathematical functions (mainly vector and scalar operations) have been
+implemented for the ARM v7 instruction set architecture as well as ARM NEON
+SIMD architecture extensions.
+
+This library has been developed and tested on the following processors:
+
+ 1) ARM Cortex-A9 with NEON extension
+ 2) ARM Cortex-A8 with NEON extension
+
+The following is a list of currently available functions.
+
+ a) Vector-Constant Arithmetic
+
+ addc_float, addc_vec2f, addc_vec3f, addc_vec4f,
+ subc_float, subc_vec2f, subc_vec3f, subc_vec4f,
+ rsbc_float, rsbc_vec2f, rsbc_vec3f, rsbc_vec4f,
+ mulc_float, mulc_vec2f, mulc_vec3f, mulc_vec4f,
+ divc_float, divc_vec2f, divc_vec3f, divc_vec4f,
+ setc_float, setc_vec2f, setc_vec3f, setc_vec4f,
+ mlac_float, mlac_vec2f, mlac_vec3f, mlac_vec4f
+
+ b) Arithmetic functions over arrays of cst values:
+
+ add_float, sub_float, mul_float, div_float, mla_float, abs_float
+
+ c) Operations on Vectors:
+
+ abs_vec2f, abs_vec3f, abs_vec4f,
+ addc_vec2f, addc_vec3f, addc_vec4f,
+ add_vec2f, add_vec3f, add_vec4f,
+ divc_vec2f, divc_vec3f, divc_vec4f,
+ dot_vec2f, dot_vec3f, dot_vec4f,
+ len_vec2f, len_vec3f, len_vec4f,
+ mlac_vec2f, mlac_vec3f, mlac_vec4f,
+ mulc_vec2f, mulc_vec3f, mulc_vec4f,
+ normalize_vec2f, normalize_vec3f, normalize_vec4f,
+ rsbc_vec2f, rsbc_vec3f, rsbc_vec4f,
+ setc_vec2f, setc_vec3f, setc_vec4f,
+ subc_vec2f, subc_vec3f, subc_vec4f,
+ sub_vec2f, sub_vec3f, sub_vec4f,
+ vdiv_vec2f, vdiv_vec3f, vdiv_vec4f,
+ vmla_vec2f, vmla_vec3f, vmla_vec4f,
+ vmul_vec2f, vmul_vec3f, vmul_vec4f,
+ cross_vec3f
+
+ d) Matrix operations:
+
+ addmat_2x2f, addmat_3x3f, addmat_4x4f,
+ detmat_2x2f, detmat_3x3f, detmat_4x4f,
+ divmat_2x2f, divmat_3x3f, divmat_4x4f,
+ identitymat_2x2f, identitymat_3x3f, identitymat_4x4f,
+ invmat_2x2f, invmat_3x3f, invmat_4x4f,
+ mulcmatvec_2x2f, mulcmatvec_3x3f, mulcmatvec_4x4f,
+ mulmat_2x2f, mulmat_3x3f, mulmat_4x4f,
+ multrans_mat2x2f, multrans_mat3x3f, multrans_mat4x4f,
+ setmat_2x2f, setmat_3x3f, setmat_4x4f,
+ submat_2x2f, submat_3x3f, submat_4x4f,
+ transmat_2x2f, transmat_3x3f, transmat_4x4f
+
+2-c. Test cases and results
+---------------------------
+The provided functions are categorized according to the operations that they
+perform. Functions in each of these categories accept different types of input
+data. Each set is accompanied with a unit test. These unit tests are provided
+as part of this library and can be used to verify and benchmark these functions
+on a target platform.
+
+===============
+3. Installation
+===============
+
+3-a. Requirements
+-----------------
+This release has been built and tested on the following host environments:
+
+ 1) ARM Versatile Express / Linux linaro 2.6.38-1003
+ 2) BeagleBoard RevC / Linux linaro-developer 3.1.0-4
+ 3) Android AOSP Emulator / Android Open Source Project Toolchain
+
+
+The source code has been successfully built with the following toolchains:
+
+ 1) Linaro GCC v4.6.1 ( https://launchpad.net/gcc-linaro/4.6 )
+ 2) Prebuilt GCC toolchain provided with ICS release of AOSP
+
+
+3-b. Native Building
+--------------------
+
+Native building (building directly on an ARM platform) is supported via
+
+ make
+
+This will build a libne10.a and libne10.so in the local directory along with
+some test binaries.
+
+ ./nightly.pl
+
+The nightly.pl script shown above will build and run a set of tests.
+
+3-c. Android Building
+---------------------
+
+To build as part of the Android Open Source Project, copy the release
+directory into 'external' within the source directories and build as
+normal. This will install the libne10.so library into system/lib on the
+final Android OS image, where other applications will be able to access it in
+a similar way to other shared libraries. You will need to build with
+TARGET_ARCH_VARIANT=armv7-a-neon defined to enable NEON support.
+
+3-d. Alternative Approach
+-------------------------
+While not supported, the functions within this library can be taken and
+incorporated (licensing conflicts permitting) within other projects as is.
+Details of how to do this are too project specific to detail here.
+
+============
+4. Changelog
+============
+
+4-a. r1.0_beta
+
+ * Updated AOSP Makefile, cleaned native Makefile
+ * Adding new files to the AOSP build
+ * Made the default makefile a little more readable
+ * New functions: Matrix transpose and identity matrix routines.
+ * New functions: Matrix inversion routines.
+ * New functions: Matrix determinant routines.
+ * New functions: Matrix-vector multiplication routines.
+ * New functions: Matrix multiplication routines.
+ * New functions: Matrix addition and subtraction.
+ * New functions: Cross product routine.
+ * New functions: Dot product routines.
+ * New functions: Vectorized mla routines.
+ * New functions: Vectorized division routines.
+ * New functions: Vectorized abs routine.
+ * New functions: Vector-sub routines.
+ * New functions: Vector-add routines.
+ * Added the disclaimer:
+ Each function is implemented in C, ARM Assembly and NEON code as a
+ basis for comparison. Assembly versions, while efficient, are not
+ intended as best-practice examples.
+ * Added CMake to implement cross-platform build system
+ * Added support for C++
--- /dev/null
+ /*
+ * Copyright 2011-12 ARM Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#Contents
+
+#NE10 Usage
+##Using NE10
+
+#NE10 Usage
+
+This file explains use of NE10 library.
+
+#Contents
+##Using NE10
+###General Notes
+###C Bindings
+###Future Bindings
+
+#Using NE10
+
+NE10 is implemented in a mix of C, intrinsics and assembler, however all
+functions are exposed as C. It can be used as a shared or static library and
+individual functions can be safely excluded from a build to reduce final
+binary size.
+
+##General Notes
+
+The type checking is relaxed, to enable compatibility with any pre-existing or
+prevailing system of types a project might have. The debug version of the
+library will check the ranges passed in a call conform to the API limitations.
+The production version avoids these checks for performance reasons.
+
+It is assumed that the ranges of input arrays to be processed do not overlap.
+Clean handling of overlapping arrays is not designed for or tested. It is
+possible for source and destination pointers to be the same, or for you to pass
+in pointers inside the same array *as long as the regions indicated by
+pointer+length do not overlap*. Incorrect usage will typically result in an assert
+in debug builds and variable and inaccurate results in production builds.
+
+##C Bindings
+
+The C bindings (available in inc/NE10.h) aim for a balance between simple to
+use and efficient from an execution perspective. They are intended to be usable
+in C and C++ code, or in theory, in any other language with a well constructed
+mechanism for calling out to C code.
+
+The calls themselves are listed in inc/NE10.h. However, depending on your
+circumstances - for example, if you know that your code will only ever be
+executed on platforms with NEON available - you could use the inc/NE10_neon.h
+include file and access those functions directly.
+
+Usage of all the functions is generally consistent, and function specific
+differences are documented in the header, but here is an example as a taste:
+
+ arm_vec3f_t *destination;
+ arm_vec3f_t *source1;
+ arm_vec3f_t *source2;
+ int feedback;
+
+ /* Fill your arrays with interesting vector data.. */
+ ...
+
+ /* Normalize the vectors in source1, returning the result in place */
+ feedback = normalize_vec3f(source1, source1);
+ if (feedback == <check error code>) {
+ printf("Bad Thing happened normalizing!\n");
+ }
+ /* Multiply source1 by source2, returning the result in destination */
+ feedback = mul_vec3f(destination, source1, source2);
+ if (feedback == <check error code>) {
+ printf("Bad Thing happened multiplying!\n");
+ }
+
+While the functions all return an integer value to indicate success or failure,
+in practice almost none of the functions currently implemented can 'fail' in that
+way, however future functions may. This is to allow for a more consistent interface
+across the API in the future.
+
+##Future Bindings
+
+We hope to add C++ bindings at a later date, based on feedback on the most
+appropriate way to provide that sort of API. Other languages will be
+considered, however the priority will be to improve the scope and performance
+of functions provided under the existing bindings.
+
+
--- /dev/null
+# Doxyfile 1.7.3
+
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project.
+#
+# All text after a hash (#) is considered a comment and will be ignored.
+# The format is:
+# TAG = value [value, ...]
+# For lists items can also be appended using:
+# TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (" ").
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+
+# This tag specifies the encoding used for all characters in the config file
+# that follow. The default is UTF-8 which is also the encoding used for all
+# text before the first occurrence of this tag. Doxygen uses libiconv (or the
+# iconv built into libc) for the transcoding. See
+# http://www.gnu.org/software/libiconv for the list of possible encodings.
+
+DOXYFILE_ENCODING = UTF-8
+
+# The PROJECT_NAME tag is a single word (or a sequence of words surrounded
+# by quotes) that should identify the project.
+
+PROJECT_NAME = "NE10 SIMD LIBRARY"
+
+# The PROJECT_NUMBER tag can be used to enter a project or revision number.
+# This could be handy for archiving the generated documentation or
+# if some version control system is used.
+
+PROJECT_NUMBER =
+
+# Using the PROJECT_BRIEF tag one can provide an optional one line description for a project that appears at the top of each page and should give viewer a quick idea about the purpose of the project. Keep the description short.
+
+PROJECT_BRIEF = "SIMD FUNCTION LIBRARY FOR LINUX AND ANDROID"
+
+# With the PROJECT_LOGO tag one can specify a logo or icon that is
+# included in the documentation. The maximum height of the logo should not
+# exceed 55 pixels and the maximum width should not exceed 200 pixels.
+# Doxygen will copy the logo to the output directory.
+
+PROJECT_LOGO = NE10
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
+# base path where the generated documentation will be put.
+# If a relative path is entered, it will be relative to the location
+# where doxygen was started. If left blank the current directory will be used.
+
+OUTPUT_DIRECTORY = ./docs
+
+# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create
+# 4096 sub-directories (in 2 levels) under the output directory of each output
+# format and will distribute the generated files over these directories.
+# Enabling this option can be useful when feeding doxygen a huge amount of
+# source files, where putting all generated files in the same directory would
+# otherwise cause performance problems for the file system.
+
+CREATE_SUBDIRS = NO
+
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all constant output in the proper language.
+# The default language is English, other supported languages are:
+# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional,
+# Croatian, Czech, Danish, Dutch, Esperanto, Farsi, Finnish, French, German,
+# Greek, Hungarian, Italian, Japanese, Japanese-en (Japanese with English
+# messages), Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian,
+# Polish, Portuguese, Romanian, Russian, Serbian, Serbian-Cyrillic, Slovak,
+# Slovene, Spanish, Swedish, Ukrainian, and Vietnamese.
+
+OUTPUT_LANGUAGE = English
+
+# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will
+# include brief member descriptions after the members that are listed in
+# the file and class documentation (similar to JavaDoc).
+# Set to NO to disable this.
+
+BRIEF_MEMBER_DESC = YES
+
+# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend
+# the brief description of a member or function before the detailed description.
+# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
+# brief descriptions will be completely suppressed.
+
+REPEAT_BRIEF = YES
+
+# This tag implements a quasi-intelligent brief description abbreviator
+# that is used to form the text in various listings. Each string
+# in this list, if found as the leading text of the brief description, will be
+# stripped from the text and the result after processing the whole list, is
+# used as the annotated text. Otherwise, the brief description is used as-is.
+# If left blank, the following values are used ("$name" is automatically
+# replaced with the name of the entity): "The $name class" "The $name widget"
+# "The $name file" "is" "provides" "specifies" "contains"
+# "represents" "a" "an" "the"
+
+ABBREVIATE_BRIEF =
+
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
+# Doxygen will generate a detailed section even if there is only a brief
+# description.
+
+ALWAYS_DETAILED_SEC = NO
+
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
+# inherited members of a class in the documentation of that class as if those
+# members were ordinary class members. Constructors, destructors and assignment
+# operators of the base classes will not be shown.
+
+INLINE_INHERITED_MEMB = NO
+
+# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full
+# path before files name in the file list and in the header files. If set
+# to NO the shortest path that makes the file name unique will be used.
+
+FULL_PATH_NAMES = YES
+
+# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag
+# can be used to strip a user-defined part of the path. Stripping is
+# only done if one of the specified strings matches the left-hand part of
+# the path. The tag can be used to show relative paths in the file list.
+# If left blank the directory from which doxygen is run is used as the
+# path to strip.
+
+STRIP_FROM_PATH =
+
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of
+# the path mentioned in the documentation of a class, which tells
+# the reader which header file to include in order to use a class.
+# If left blank only the name of the header file containing the class
+# definition is used. Otherwise one should specify the include paths that
+# are normally passed to the compiler using the -I flag.
+
+STRIP_FROM_INC_PATH =
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter
+# (but less readable) file names. This can be useful if your file system
+# doesn't support long names like on DOS, Mac, or CD-ROM.
+
+SHORT_NAMES = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen
+# will interpret the first line (until the first dot) of a JavaDoc-style
+# comment as the brief description. If set to NO, the JavaDoc
+# comments will behave just like regular Qt-style comments
+# (thus requiring an explicit @brief command for a brief description.)
+
+JAVADOC_AUTOBRIEF = NO
+
+# If the QT_AUTOBRIEF tag is set to YES then Doxygen will
+# interpret the first line (until the first dot) of a Qt-style
+# comment as the brief description. If set to NO, the comments
+# will behave just like regular Qt-style comments (thus requiring
+# an explicit \brief command for a brief description.)
+
+QT_AUTOBRIEF = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen
+# treat a multi-line C++ special comment block (i.e. a block of //! or ///
+# comments) as a brief description. This used to be the default behaviour.
+# The new default is to treat a multi-line C++ comment block as a detailed
+# description. Set this tag to YES if you prefer the old behaviour instead.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented
+# member inherits the documentation from any documented member that it
+# re-implements.
+
+INHERIT_DOCS = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce
+# a new page for each member. If set to NO, the documentation of a member will
+# be part of the file/class/namespace that contains it.
+
+SEPARATE_MEMBER_PAGES = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab.
+# Doxygen uses this value to replace tabs by spaces in code fragments.
+
+TAB_SIZE = 8
+
+# This tag can be used to specify a number of aliases that acts
+# as commands in the documentation. An alias has the form "name=value".
+# For example adding "sideeffect=\par Side Effects:\n" will allow you to
+# put the command \sideeffect (or @sideeffect) in the documentation, which
+# will result in a user-defined paragraph with heading "Side Effects:".
+# You can put \n's in the value part of an alias to insert newlines.
+
+ALIASES =
+
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C
+# sources only. Doxygen will then generate output that is more tailored for C.
+# For instance, some of the names that are used will be different. The list
+# of all members will be omitted, etc.
+
+OPTIMIZE_OUTPUT_FOR_C = NO
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java
+# sources only. Doxygen will then generate output that is more tailored for
+# Java. For instance, namespaces will be presented as packages, qualified
+# scopes will look different, etc.
+
+OPTIMIZE_OUTPUT_JAVA = NO
+
+# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
+# sources only. Doxygen will then generate output that is more tailored for
+# Fortran.
+
+OPTIMIZE_FOR_FORTRAN = NO
+
+# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
+# sources. Doxygen will then generate output that is tailored for
+# VHDL.
+
+OPTIMIZE_OUTPUT_VHDL = NO
+
+# Doxygen selects the parser to use depending on the extension of the files it
+# parses. With this tag you can assign which parser to use for a given extension.
+# Doxygen has a built-in mapping, but you can override or extend it using this
+# tag. The format is ext=language, where ext is a file extension, and language
+# is one of the parsers supported by doxygen: IDL, Java, Javascript, CSharp, C,
+# C++, D, PHP, Objective-C, Python, Fortran, VHDL, C, C++. For instance to make
+# doxygen treat .inc files as Fortran files (default is PHP), and .f files as C
+# (default is Fortran), use: inc=Fortran f=C. Note that for custom extensions
+# you also need to set FILE_PATTERNS otherwise the files are not read by doxygen.
+
+EXTENSION_MAPPING =
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
+# to include (a tag file for) the STL sources as input, then you should
+# set this tag to YES in order to let doxygen match functions declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string); v.s.
+# func(std::string) {}). This also makes the inheritance and collaboration
+# diagrams that involve STL classes more complete and accurate.
+
+BUILTIN_STL_SUPPORT = NO
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
+# enable parsing support.
+
+CPP_CLI_SUPPORT = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only.
+# Doxygen will parse them like normal C++ but will assume all classes use public
+# instead of private inheritance when no explicit protection keyword is present.
+
+SIP_SUPPORT = NO
+
+# For Microsoft's IDL there are propget and propput attributes to indicate getter
+# and setter methods for a property. Setting this option to YES (the default)
+# will make doxygen replace the get and set methods by a property in the
+# documentation. This will only work if the methods are indeed getting or
+# setting a simple type. If this is not the case, or you want to show the
+# methods anyway, you should set this option to NO.
+
+IDL_PROPERTY_SUPPORT = YES
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
+# tag is set to YES, then doxygen will reuse the documentation of the first
+# member in the group (if any) for the other members of the group. By default
+# all members of a group must be documented explicitly.
+
+DISTRIBUTE_GROUP_DOC = NO
+
+# Set the SUBGROUPING tag to YES (the default) to allow class member groups of
+# the same type (for instance a group of public functions) to be put as a
+# subgroup of that type (e.g. under the Public Functions section). Set it to
+# NO to prevent subgrouping. Alternatively, this can be done per class using
+# the \nosubgrouping command.
+
+SUBGROUPING = YES
+
+# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum
+# is documented as struct, union, or enum with the name of the typedef. So
+# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
+# with name TypeT. When disabled the typedef will appear as a member of a file,
+# namespace, or class. And the struct will be named TypeS. This can typically
+# be useful for C code in case the coding convention dictates that all compound
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+
+TYPEDEF_HIDES_STRUCT = NO
+
+# The SYMBOL_CACHE_SIZE determines the size of the internal cache used to
+# determine which symbols to keep in memory and which to flush to disk.
+# When the cache is full, less often used symbols will be written to disk.
+# For small to medium size projects (<1000 input files) the default value is
+# probably good enough. For larger projects a too small cache size can cause
+# doxygen to be busy swapping symbols to and from disk most of the time
+# causing a significant performance penalty.
+# If the system has enough physical memory increasing the cache will improve the
+# performance by keeping more symbols in memory. Note that the value works on
+# a logarithmic scale so increasing the size by one will roughly double the
+# memory usage. The cache size is given by this formula:
+# 2^(16+SYMBOL_CACHE_SIZE). The valid range is 0..9, the default is 0,
+# corresponding to a cache size of 2^16 = 65536 symbols
+
+SYMBOL_CACHE_SIZE = 0
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in
+# documentation are documented, even if no documentation was available.
+# Private class members and static file members will be hidden unless
+# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES
+
+EXTRACT_ALL = NO
+
+# If the EXTRACT_PRIVATE tag is set to YES all private members of a class
+# will be included in the documentation.
+
+EXTRACT_PRIVATE = NO
+
+# If the EXTRACT_STATIC tag is set to YES all static members of a file
+# will be included in the documentation.
+
+EXTRACT_STATIC = NO
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs)
+# defined locally in source files will be included in the documentation.
+# If set to NO only classes defined in header files are included.
+
+EXTRACT_LOCAL_CLASSES = YES
+
+# This flag is only useful for Objective-C code. When set to YES local
+# methods, which are defined in the implementation section but not in
+# the interface are included in the documentation.
+# If set to NO (the default) only methods in the interface are included.
+
+EXTRACT_LOCAL_METHODS = NO
+
+# If this flag is set to YES, the members of anonymous namespaces will be
+# extracted and appear in the documentation as a namespace called
+# 'anonymous_namespace{file}', where file will be replaced with the base
+# name of the file that contains the anonymous namespace. By default
+# anonymous namespaces are hidden.
+
+EXTRACT_ANON_NSPACES = NO
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all
+# undocumented members of documented classes, files or namespaces.
+# If set to NO (the default) these members will be included in the
+# various overviews, but no documentation section is generated.
+# This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_MEMBERS = NO
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all
+# undocumented classes that are normally visible in the class hierarchy.
+# If set to NO (the default) these classes will be included in the various
+# overviews. This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_CLASSES = NO
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all
+# friend (class|struct|union) declarations.
+# If set to NO (the default) these declarations will be included in the
+# documentation.
+
+HIDE_FRIEND_COMPOUNDS = NO
+
+# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any
+# documentation blocks found inside the body of a function.
+# If set to NO (the default) these blocks will be appended to the
+# function's detailed documentation block.
+
+HIDE_IN_BODY_DOCS = NO
+
+# The INTERNAL_DOCS tag determines if documentation
+# that is typed after a \internal command is included. If the tag is set
+# to NO (the default) then the documentation will be excluded.
+# Set it to YES to include the internal documentation.
+
+INTERNAL_DOCS = NO
+
+# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate
+# file names in lower-case letters. If set to YES upper-case letters are also
+# allowed. This is useful if you have classes or files whose names only differ
+# in case and if your file system supports case sensitive file names. Windows
+# and Mac users are advised to set this option to NO.
+
+CASE_SENSE_NAMES = YES
+
+# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen
+# will show members with their full class and namespace scopes in the
+# documentation. If set to YES the scope will be hidden.
+
+HIDE_SCOPE_NAMES = NO
+
+# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen
+# will put a list of the files that are included by a file in the documentation
+# of that file.
+
+SHOW_INCLUDE_FILES = YES
+
+# If the FORCE_LOCAL_INCLUDES tag is set to YES then Doxygen
+# will list include files with double quotes in the documentation
+# rather than with sharp brackets.
+
+FORCE_LOCAL_INCLUDES = NO
+
+# If the INLINE_INFO tag is set to YES (the default) then a tag [inline]
+# is inserted in the documentation for inline members.
+
+INLINE_INFO = YES
+
+# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen
+# will sort the (detailed) documentation of file and class members
+# alphabetically by member name. If set to NO the members will appear in
+# declaration order.
+
+SORT_MEMBER_DOCS = YES
+
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the
+# brief documentation of file, namespace and class members alphabetically
+# by member name. If set to NO (the default) the members will appear in
+# declaration order.
+
+SORT_BRIEF_DOCS = NO
+
+# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen
+# will sort the (brief and detailed) documentation of class members so that
+# constructors and destructors are listed first. If set to NO (the default)
+# the constructors will appear in the respective orders defined by
+# SORT_MEMBER_DOCS and SORT_BRIEF_DOCS.
+# This tag will be ignored for brief docs if SORT_BRIEF_DOCS is set to NO
+# and ignored for detailed docs if SORT_MEMBER_DOCS is set to NO.
+
+SORT_MEMBERS_CTORS_1ST = NO
+
+# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the
+# hierarchy of group names into alphabetical order. If set to NO (the default)
+# the group names will appear in their defined order.
+
+SORT_GROUP_NAMES = NO
+
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be
+# sorted by fully-qualified names, including namespaces. If set to
+# NO (the default), the class list will be sorted only by class name,
+# not including the namespace part.
+# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
+# Note: This option applies only to the class list, not to the
+# alphabetical list.
+
+SORT_BY_SCOPE_NAME = NO
+
+# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper
+# type resolution of all parameters of a function it will reject a match between
+# the prototype and the implementation of a member function even if there is
+# only one candidate or it is obvious which candidate to choose by doing a
+# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still
+# accept a match between prototype and implementation in such cases.
+
+STRICT_PROTO_MATCHING = NO
+
+# The GENERATE_TESTLIST tag can be used to enable (YES) or
+# disable (NO) the test list. This list is created by putting \test
+# commands in the documentation.
+
+GENERATE_TESTLIST = YES
+
+# The GENERATE_BUGLIST tag can be used to enable (YES) or
+# disable (NO) the bug list. This list is created by putting \bug
+# commands in the documentation.
+
+GENERATE_BUGLIST = YES
+
+# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or
+# disable (NO) the deprecated list. This list is created by putting
+# \deprecated commands in the documentation.
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional
+# documentation sections, marked by \if sectionname ... \endif.
+
+ENABLED_SECTIONS =
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines
+# the initial value of a variable or macro consists of for it to appear in
+# the documentation. If the initializer consists of more lines than specified
+# here it will be hidden. Use a value of 0 to hide initializers completely.
+# The appearance of the initializer of individual variables and macros in the
+# documentation can be controlled using \showinitializer or \hideinitializer
+# command in the documentation regardless of this setting.
+
+MAX_INITIALIZER_LINES = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated
+# at the bottom of the documentation of classes and structs. If set to YES the
+# list will mention the files that were used to generate the documentation.
+
+SHOW_USED_FILES = YES
+
+# If the sources in your project are distributed over multiple directories
+# then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy
+# in the documentation. The default is NO.
+
+SHOW_DIRECTORIES = NO
+
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page.
+# This will remove the Files entry from the Quick Index and from the
+# Folder Tree View (if specified). The default is YES.
+
+SHOW_FILES = YES
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the
+# Namespaces page.
+# This will remove the Namespaces entry from the Quick Index
+# and from the Folder Tree View (if specified). The default is YES.
+
+SHOW_NAMESPACES = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from
+# the version control system). Doxygen will invoke the program by executing (via
+# popen()) the command <command> <input-file>, where <command> is the value of
+# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file
+# provided by doxygen. Whatever the program writes to standard output
+# is used as the file version. See the manual for examples.
+
+FILE_VERSION_FILTER =
+
+# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
+# by doxygen. The layout file controls the global structure of the generated
+# output files in an output format independent way. To create the layout file
+# that represents doxygen's defaults, run doxygen with the -l option.
+# You can optionally specify a file name after the option, if omitted
+# DoxygenLayout.xml will be used as the name of the layout file.
+
+LAYOUT_FILE =
+
+#---------------------------------------------------------------------------
+# configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated
+# by doxygen. Possible values are YES and NO. If left blank NO is used.
+
+QUIET = NO
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated by doxygen. Possible values are YES and NO. If left blank
+# NO is used.
+
+WARNINGS = YES
+
+# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings
+# for undocumented members. If EXTRACT_ALL is set to YES then this flag will
+# automatically be disabled.
+
+WARN_IF_UNDOCUMENTED = YES
+
+# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for
+# potential errors in the documentation, such as not documenting some
+# parameters in a documented function, or documenting parameters that
+# don't exist or using markup commands wrongly.
+
+WARN_IF_DOC_ERROR = YES
+
+# The WARN_NO_PARAMDOC option can be enabled to get warnings for
+# functions that are documented, but have no documentation for their parameters
+# or return value. If set to NO (the default) doxygen will only warn about
+# wrong or incomplete parameter documentation, but not about the absence of
+# documentation.
+
+WARN_NO_PARAMDOC = NO
+
+# The WARN_FORMAT tag determines the format of the warning messages that
+# doxygen can produce. The string should contain the $file, $line, and $text
+# tags, which will be replaced by the file and line number from which the
+# warning originated and the warning text. Optionally the format may contain
+# $version, which will be replaced by the version of the file (if it could
+# be obtained via FILE_VERSION_FILTER)
+
+WARN_FORMAT = "$file:$line: $text"
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning
+# and error messages should be written. If left blank the output is written
+# to stderr.
+
+WARN_LOGFILE =
+
+#---------------------------------------------------------------------------
+# configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag can be used to specify the files and/or directories that contain
+# documented source files. You may enter file names like "myfile.cpp" or
+# directories like "/usr/src/myproject". Separate the files or directories
+# with spaces.
+
+INPUT = ./inc/
+
+# This tag can be used to specify the character encoding of the source files
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is
+# also the default input encoding. Doxygen uses libiconv (or the iconv built
+# into libc) for the transcoding. See http://www.gnu.org/software/libiconv for
+# the list of possible encodings.
+
+INPUT_ENCODING = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank the following patterns are tested:
+# *.c *.cc *.cxx *.cpp *.c++ *.d *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh
+# *.hxx *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.dox *.py
+# *.f90 *.f *.for *.vhd *.vhdl
+
+FILE_PATTERNS = NE10.h
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories
+# should be searched for input files as well. Possible values are YES and NO.
+# If left blank NO is used.
+
+RECURSIVE = YES
+
+# The EXCLUDE tag can be used to specify files and/or directories that should
+# be excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+
+EXCLUDE = .git
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix file system feature) are excluded
+# from the input.
+
+EXCLUDE_SYMLINKS = NO
+
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories. Note that the wildcards are matched
+# against the file with absolute path, so to exclude all test directories
+# for example use the pattern */test/*
+
+EXCLUDE_PATTERNS =
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
+# (namespaces, classes, functions, etc.) that should be excluded from the
+# output. The symbol name can be a fully qualified name, a word, or if the
+# wildcard * is used, a substring. Examples: ANamespace, AClass,
+# AClass::ANamespace, ANamespace::*Test
+
+EXCLUDE_SYMBOLS =
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or
+# directories that contain example code fragments that are included (see
+# the \include command).
+
+EXAMPLE_PATH =
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank all files are included.
+
+EXAMPLE_PATTERNS =
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude
+# commands irrespective of the value of the RECURSIVE tag.
+# Possible values are YES and NO. If left blank NO is used.
+
+EXAMPLE_RECURSIVE = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or
+# directories that contain images that are included in the documentation (see
+# the \image command).
+
+IMAGE_PATH =
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command <filter> <input-file>, where <filter>
+# is the value of the INPUT_FILTER tag, and <input-file> is the name of an
+# input file. Doxygen will then use the output that the filter program writes
+# to standard output.
+# If FILTER_PATTERNS is specified, this tag will be
+# ignored.
+
+INPUT_FILTER =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis.
+# Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match.
+# The filters are a list of the form:
+# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further
+# info on how filters are used. If FILTER_PATTERNS is empty or if
+# none of the patterns match the file name, INPUT_FILTER is applied.
+
+FILTER_PATTERNS =
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER) will be used to filter the input files when producing source
+# files to browse (i.e. when SOURCE_BROWSER is set to YES).
+
+FILTER_SOURCE_FILES = NO
+
+# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
+# pattern. A pattern will override the setting for FILTER_PATTERNS (if any)
+# and it is also possible to disable source filtering for a specific pattern
+# using *.ext= (so without naming a filter). This option only has effect when
+# FILTER_SOURCE_FILES is enabled.
+
+FILTER_SOURCE_PATTERNS =
+
+#---------------------------------------------------------------------------
+# configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will
+# be generated. Documented entities will be cross-referenced with these sources.
+# Note: To get rid of all source code in the generated output, make sure also
+# VERBATIM_HEADERS is set to NO.
+
+SOURCE_BROWSER = NO
+
+# Setting the INLINE_SOURCES tag to YES will include the body
+# of functions and classes directly in the documentation.
+
+INLINE_SOURCES = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct
+# doxygen to hide any special comment blocks from generated source code
+# fragments. Normal C and C++ comments will always remain visible.
+
+STRIP_CODE_COMMENTS = YES
+
+# If the REFERENCED_BY_RELATION tag is set to YES
+# then for each documented function all documented
+# functions referencing it will be listed.
+
+REFERENCED_BY_RELATION = NO
+
+# If the REFERENCES_RELATION tag is set to YES
+# then for each documented function all documented entities
+# called/used by that function will be listed.
+
+REFERENCES_RELATION = NO
+
+# If the REFERENCES_LINK_SOURCE tag is set to YES (the default)
+# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from
+# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will
+# link to the source code.
+# Otherwise they will link to the documentation.
+
+REFERENCES_LINK_SOURCE = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code
+# will point to the HTML generated by the htags(1) tool instead of doxygen
+# built-in source browser. The htags tool is part of GNU's global source
+# tagging system (see http://www.gnu.org/software/global/global.html). You
+# will need version 4.8.6 or higher.
+
+USE_HTAGS = NO
+
+# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen
+# will generate a verbatim copy of the header file for each class for
+# which an include is specified. Set to NO to disable this.
+
+VERBATIM_HEADERS = YES
+
+#---------------------------------------------------------------------------
+# configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index
+# of all compounds will be generated. Enable this if the project
+# contains a lot of classes, structs, unions or interfaces.
+
+ALPHABETICAL_INDEX = YES
+
+# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then
+# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns
+# in which this list will be split (can be a number in the range [1..20])
+
+COLS_IN_ALPHA_INDEX = 5
+
+# In case all classes in a project start with a common prefix, all
+# classes will be put under the same header in the alphabetical index.
+# The IGNORE_PREFIX tag can be used to specify one or more prefixes that
+# should be ignored while generating the index headers.
+
+IGNORE_PREFIX =
+
+#---------------------------------------------------------------------------
+# configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES (the default) Doxygen will
+# generate HTML output.
+
+GENERATE_HTML = YES
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `html' will be used as the default path.
+
+HTML_OUTPUT = html
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for
+# each generated HTML page (for example: .htm,.php,.asp). If it is left blank
+# doxygen will generate files with .html extension.
+
+HTML_FILE_EXTENSION = .html
+
+# The HTML_HEADER tag can be used to specify a personal HTML header for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard header.
+
+HTML_HEADER =
+
+# The HTML_FOOTER tag can be used to specify a personal HTML footer for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard footer.
+
+HTML_FOOTER =
+
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading
+# style sheet that is used by each HTML page. It can be used to
+# fine-tune the look of the HTML output. If the tag is left blank doxygen
+# will generate a default style sheet. Note that doxygen will try to copy
+# the style sheet file to the HTML output directory, so don't put your own
+# stylesheet in the HTML output directory as well, or it will be erased!
+
+HTML_STYLESHEET =
+
+# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output.
+# Doxygen will adjust the colors in the stylesheet and background images
+# according to this color. Hue is specified as an angle on a colorwheel,
+# see http://en.wikipedia.org/wiki/Hue for more information.
+# For instance the value 0 represents red, 60 is yellow, 120 is green,
+# 180 is cyan, 240 is blue, 300 purple, and 360 is red again.
+# The allowed range is 0 to 359.
+
+HTML_COLORSTYLE_HUE = 220
+
+# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of
+# the colors in the HTML output. For a value of 0 the output will use
+# grayscales only. A value of 255 will produce the most vivid colors.
+
+HTML_COLORSTYLE_SAT = 100
+
+# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to
+# the luminance component of the colors in the HTML output. Values below
+# 100 gradually make the output lighter, whereas values above 100 make
+# the output darker. The value divided by 100 is the actual gamma applied,
+# so 80 represents a gamma of 0.8, the value 220 represents a gamma of 2.2,
+# and 100 does not change the gamma.
+
+HTML_COLORSTYLE_GAMMA = 80
+
+# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
+# page will contain the date and time when the page was generated. Setting
+# this to NO can help when comparing the output of multiple runs.
+
+HTML_TIMESTAMP = YES
+
+# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes,
+# files or namespaces will be aligned in HTML using tables. If set to
+# NO a bullet list will be used.
+
+HTML_ALIGN_MEMBERS = YES
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
+# documentation will contain sections that can be hidden and shown after the
+# page has loaded. For this to work a browser that supports
+# JavaScript and DHTML is required (for instance Mozilla 1.0+, Firefox,
+# Netscape 6.0+, Internet explorer 5.0+, Konqueror, or Safari).
+
+HTML_DYNAMIC_SECTIONS = NO
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files
+# will be generated that can be used as input for Apple's Xcode 3
+# integrated development environment, introduced with OSX 10.5 (Leopard).
+# To create a documentation set, doxygen will generate a Makefile in the
+# HTML output directory. Running make will produce the docset in that
+# directory and running "make install" will install the docset in
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find
+# it at startup.
+# See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html
+# for more information.
+
+GENERATE_DOCSET = NO
+
+# When GENERATE_DOCSET tag is set to YES, this tag determines the name of the
+# feed. A documentation feed provides an umbrella under which multiple
+# documentation sets from a single provider (such as a company or product suite)
+# can be grouped.
+
+DOCSET_FEEDNAME = "Doxygen generated docs"
+
+# When GENERATE_DOCSET tag is set to YES, this tag specifies a string that
+# should uniquely identify the documentation set bundle. This should be a
+# reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen
+# will append .docset to the name.
+
+DOCSET_BUNDLE_ID = org.doxygen.Project
+
+# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify
+# the documentation publisher. This should be a reverse domain-name style
+# string, e.g. com.mycompany.MyDocSet.documentation.
+
+DOCSET_PUBLISHER_ID = org.doxygen.Publisher
+
+# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
+
+DOCSET_PUBLISHER_NAME = Publisher
+
+# If the GENERATE_HTMLHELP tag is set to YES, additional index files
+# will be generated that can be used as input for tools like the
+# Microsoft HTML help workshop to generate a compiled HTML help file (.chm)
+# of the generated HTML documentation.
+
+GENERATE_HTMLHELP = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can
+# be used to specify the file name of the resulting .chm file. You
+# can add a path in front of the file if the result should not be
+# written to the html output directory.
+
+CHM_FILE =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can
+# be used to specify the location (absolute path including file name) of
+# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run
+# the HTML help compiler on the generated index.hhp.
+
+HHC_LOCATION =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag
+# controls if a separate .chi index file is generated (YES) or that
+# it should be included in the master .chm file (NO).
+
+GENERATE_CHI = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING
+# is used to encode HtmlHelp index (hhk), content (hhc) and project file
+# content.
+
+CHM_INDEX_ENCODING =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag
+# controls whether a binary table of contents is generated (YES) or a
+# normal table of contents (NO) in the .chm file.
+
+BINARY_TOC = NO
+
+# The TOC_EXPAND flag can be set to YES to add extra items for group members
+# to the contents of the HTML help documentation and to the tree view.
+
+TOC_EXPAND = NO
+
+# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
+# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated
+# that can be used as input for Qt's qhelpgenerator to generate a
+# Qt Compressed Help (.qch) of the generated HTML documentation.
+
+GENERATE_QHP = NO
+
+# If the QHG_LOCATION tag is specified, the QCH_FILE tag can
+# be used to specify the file name of the resulting .qch file.
+# The path specified is relative to the HTML output folder.
+
+QCH_FILE =
+
+# The QHP_NAMESPACE tag specifies the namespace to use when generating
+# Qt Help Project output. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#namespace
+
+QHP_NAMESPACE = org.doxygen.Project
+
+# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating
+# Qt Help Project output. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#virtual-folders
+
+QHP_VIRTUAL_FOLDER = doc
+
+# If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to
+# add. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#custom-filters
+
+QHP_CUST_FILTER_NAME =
+
+# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
+# custom filter to add. For more information please see
+# <a href="http://doc.trolltech.com/qthelpproject.html#custom-filters">
+# Qt Help Project / Custom Filters</a>.
+
+QHP_CUST_FILTER_ATTRS =
+
+# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
+# project's
+# filter section matches.
+# <a href="http://doc.trolltech.com/qthelpproject.html#filter-attributes">
+# Qt Help Project / Filter Attributes</a>.
+
+QHP_SECT_FILTER_ATTRS =
+
+# If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can
+# be used to specify the location of Qt's qhelpgenerator.
+# If non-empty doxygen will try to run qhelpgenerator on the generated
+# .qhp file.
+
+QHG_LOCATION =
+
+# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files
+# will be generated, which together with the HTML files, form an Eclipse help
+# plugin. To install this plugin and make it available under the help contents
+# menu in Eclipse, the contents of the directory containing the HTML and XML
+# files needs to be copied into the plugins directory of eclipse. The name of
+# the directory within the plugins directory should be the same as
+# the ECLIPSE_DOC_ID value. After copying Eclipse needs to be restarted before
+# the help appears.
+
+GENERATE_ECLIPSEHELP = NO
+
+# A unique identifier for the eclipse help plugin. When installing the plugin
+# the directory name containing the HTML and XML files should also have
+# this name.
+
+ECLIPSE_DOC_ID = org.doxygen.Project
+
+# The DISABLE_INDEX tag can be used to turn on/off the condensed index at
+# top of each HTML page. The value NO (the default) enables the index and
+# the value YES disables it.
+
+DISABLE_INDEX = NO
+
+# This tag can be used to set the number of enum values (range [0,1..20])
+# that doxygen will group on one line in the generated HTML documentation.
+# Note that a value of 0 will completely suppress the enum values from appearing in the overview section.
+
+ENUM_VALUES_PER_LINE = 4
+
+# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
+# structure should be generated to display hierarchical information.
+# If the tag value is set to YES, a side panel will be generated
+# containing a tree-like index structure (just like the one that
+# is generated for HTML Help). For this to work a browser that supports
+# JavaScript, DHTML, CSS and frames is required (i.e. any modern browser).
+# Windows users are probably better off using the HTML help feature.
+
+GENERATE_TREEVIEW = NO
+
+# By enabling USE_INLINE_TREES, doxygen will generate the Groups, Directories,
+# and Class Hierarchy pages using a tree view instead of an ordered list.
+
+USE_INLINE_TREES = NO
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be
+# used to set the initial width (in pixels) of the frame in which the tree
+# is shown.
+
+TREEVIEW_WIDTH = 250
+
+# When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open
+# links to external symbols imported via tag files in a separate window.
+
+EXT_LINKS_IN_WINDOW = NO
+
+# Use this tag to change the font size of Latex formulas included
+# as images in the HTML documentation. The default is 10. Note that
+# when you change the font size after a successful doxygen run you need
+# to manually remove any form_*.png images from the HTML output directory
+# to force them to be regenerated.
+
+FORMULA_FONTSIZE = 10
+
+# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
+# generated for formulas are transparent PNGs. Transparent PNGs are
+# not supported properly for IE 6.0, but are supported on all modern browsers.
+# Note that when changing this option you need to delete any form_*.png files
+# in the HTML output before the changes have effect.
+
+FORMULA_TRANSPARENT = YES
+
+# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax
+# (see http://www.mathjax.org) which uses client side Javascript for the
+# rendering instead of using prerendered bitmaps. Use this if you do not
+# have LaTeX installed or if you want formulas to look prettier in the HTML
+# output. When enabled you also need to install MathJax separately and
+# configure the path to it using the MATHJAX_RELPATH option.
+
+USE_MATHJAX = NO
+
+# When MathJax is enabled you need to specify the location relative to the
+# HTML output directory using the MATHJAX_RELPATH option. The destination
+# directory should contain the MathJax.js script. For instance, if the mathjax
+# directory is located at the same level as the HTML output directory, then
+# MATHJAX_RELPATH should be ../mathjax. The default value points to the mathjax.org site, so you can quickly see the result without installing
+# MathJax, but it is strongly recommended to install a local copy of MathJax
+# before deployment.
+
+MATHJAX_RELPATH = http://www.mathjax.org/mathjax
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box
+# for the HTML output. The underlying search engine uses javascript
+# and DHTML and should work on any modern browser. Note that when using
+# HTML help (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets
+# (GENERATE_DOCSET) there is already a search function so this one should
+# typically be disabled. For large projects the javascript based search engine
+# can be slow, then enabling SERVER_BASED_SEARCH may provide a better solution.
+
+SEARCHENGINE = YES
+
+# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
+# implemented using a PHP enabled web server instead of at the web client
+# using Javascript. Doxygen will generate the search PHP script and index
+# file to put on the web server. The advantage of the server
+# based approach is that it scales better to large projects and allows
+# full text search. The disadvantages are that it is more difficult to setup
+# and does not have live searching capabilities.
+
+SERVER_BASED_SEARCH = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will
+# generate Latex output.
+
+GENERATE_LATEX = NO
+
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `latex' will be used as the default path.
+
+LATEX_OUTPUT = latex
+
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
+# invoked. If left blank `latex' will be used as the default command name.
+# Note that when enabling USE_PDFLATEX this option is only used for
+# generating bitmaps for formulas in the HTML output, but not in the
+# Makefile that is written to the output directory.
+
+LATEX_CMD_NAME = latex
+
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to
+# generate index for LaTeX. If left blank `makeindex' will be used as the
+# default command name.
+
+MAKEINDEX_CMD_NAME = makeindex
+
+# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact
+# LaTeX documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_LATEX = NO
+
+# The PAPER_TYPE tag can be used to set the paper type that is used
+# by the printer. Possible values are: a4, letter, legal and
+# executive. If left blank a4wide will be used.
+
+PAPER_TYPE = a4
+
+# The EXTRA_PACKAGES tag can be used to specify one or more names of LaTeX
+# packages that should be included in the LaTeX output.
+
+EXTRA_PACKAGES =
+
+# The LATEX_HEADER tag can be used to specify a personal LaTeX header for
+# the generated latex document. The header should contain everything until
+# the first chapter. If it is left blank doxygen will generate a
+# standard header. Notice: only use this tag if you know what you are doing!
+
+LATEX_HEADER =
+
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated
+# is prepared for conversion to pdf (using ps2pdf). The pdf file will
+# contain links (just like the HTML output) instead of page references.
+# This makes the output suitable for online browsing using a pdf viewer.
+
+PDF_HYPERLINKS = YES
+
+# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of
+# plain latex in the generated Makefile. Set this option to YES to get a
+# higher quality PDF documentation.
+
+USE_PDFLATEX = YES
+
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode
+# command to the generated LaTeX files. This will instruct LaTeX to keep
+# running if errors occur, instead of asking the user for help.
+# This option is also used when generating formulas in HTML.
+
+LATEX_BATCHMODE = NO
+
+# If LATEX_HIDE_INDICES is set to YES then doxygen will not
+# include the index chapters (such as File Index, Compound Index, etc.)
+# in the output.
+
+LATEX_HIDE_INDICES = NO
+
+# If LATEX_SOURCE_CODE is set to YES then doxygen will include
+# source code with syntax highlighting in the LaTeX output.
+# Note that which sources are shown also depends on other settings
+# such as SOURCE_BROWSER.
+
+LATEX_SOURCE_CODE = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the RTF output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output
+# The RTF output is optimized for Word 97 and may not look very pretty with
+# other RTF readers or editors.
+
+GENERATE_RTF = NO
+
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `rtf' will be used as the default path.
+
+RTF_OUTPUT = rtf
+
+# If the COMPACT_RTF tag is set to YES Doxygen generates more compact
+# RTF documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_RTF = NO
+
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated
+# will contain hyperlink fields. The RTF file will
+# contain links (just like the HTML output) instead of page references.
+# This makes the output suitable for online browsing using WORD or other
+# programs which support those fields.
+# Note: wordpad (write) and others do not support links.
+
+RTF_HYPERLINKS = NO
+
+# Load stylesheet definitions from file. Syntax is similar to doxygen's
+# config file, i.e. a series of assignments. You only have to provide
+# replacements, missing definitions are set to their default value.
+
+RTF_STYLESHEET_FILE =
+
+# Set optional variables used in the generation of an rtf document.
+# Syntax is similar to doxygen's config file.
+
+RTF_EXTENSIONS_FILE =
+
+#---------------------------------------------------------------------------
+# configuration options related to the man page output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_MAN tag is set to YES (the default) Doxygen will
+# generate man pages
+
+GENERATE_MAN = YES
+
+# The MAN_OUTPUT tag is used to specify where the man pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `man' will be used as the default path.
+
+MAN_OUTPUT = man
+
+# The MAN_EXTENSION tag determines the extension that is added to
+# the generated man pages (default is the subroutine's section .3)
+
+MAN_EXTENSION = .3
+
+# If the MAN_LINKS tag is set to YES and Doxygen generates man output,
+# then it will generate one additional man file for each entity
+# documented in the real man page(s). These additional files
+# only source the real man page, but without them the man command
+# would be unable to find the correct page. The default is NO.
+
+MAN_LINKS = YES
+
+#---------------------------------------------------------------------------
+# configuration options related to the XML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_XML tag is set to YES Doxygen will
+# generate an XML file that captures the structure of
+# the code including all documentation.
+
+GENERATE_XML = YES
+
+# The XML_OUTPUT tag is used to specify where the XML pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `xml' will be used as the default path.
+
+XML_OUTPUT = xml
+
+# The XML_SCHEMA tag can be used to specify an XML schema,
+# which can be used by a validating XML parser to check the
+# syntax of the XML files.
+
+XML_SCHEMA =
+
+# The XML_DTD tag can be used to specify an XML DTD,
+# which can be used by a validating XML parser to check the
+# syntax of the XML files.
+
+XML_DTD =
+
+# If the XML_PROGRAMLISTING tag is set to YES Doxygen will
+# dump the program listings (including syntax highlighting
+# and cross-referencing information) to the XML output. Note that
+# enabling this will significantly increase the size of the XML output.
+
+XML_PROGRAMLISTING = YES
+
+#---------------------------------------------------------------------------
+# configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will
+# generate an AutoGen Definitions (see autogen.sf.net) file
+# that captures the structure of the code including all
+# documentation. Note that this feature is still experimental
+# and incomplete at the moment.
+
+GENERATE_AUTOGEN_DEF = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_PERLMOD tag is set to YES Doxygen will
+# generate a Perl module file that captures the structure of
+# the code including all documentation. Note that this
+# feature is still experimental and incomplete at the
+# moment.
+
+GENERATE_PERLMOD = NO
+
+# If the PERLMOD_LATEX tag is set to YES Doxygen will generate
+# the necessary Makefile rules, Perl scripts and LaTeX code to be able
+# to generate PDF and DVI output from the Perl module output.
+
+PERLMOD_LATEX = NO
+
+# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be
+# nicely formatted so it can be parsed by a human reader.
+# This is useful
+# if you want to understand what is going on.
+# On the other hand, if this
+# tag is set to NO the size of the Perl module output will be much smaller
+# and Perl will parse it just the same.
+
+PERLMOD_PRETTY = YES
+
+# The names of the make variables in the generated doxyrules.make file
+# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX.
+# This is useful so different doxyrules.make files included by the same
+# Makefile don't overwrite each other's variables.
+
+PERLMOD_MAKEVAR_PREFIX =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor
+#---------------------------------------------------------------------------
+
+# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will
+# evaluate all C-preprocessor directives found in the sources and include
+# files.
+
+ENABLE_PREPROCESSING = YES
+
+# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro
+# names in the source code. If set to NO (the default) only conditional
+# compilation will be performed. Macro expansion can be done in a controlled
+# way by setting EXPAND_ONLY_PREDEF to YES.
+
+MACRO_EXPANSION = NO
+
+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES
+# then the macro expansion is limited to the macros specified with the
+# PREDEFINED and EXPAND_AS_DEFINED tags.
+
+EXPAND_ONLY_PREDEF = NO
+
+# If the SEARCH_INCLUDES tag is set to YES (the default) the include files
+# in the INCLUDE_PATH (see below) will be searched if a #include is found.
+
+SEARCH_INCLUDES = YES
+
+# The INCLUDE_PATH tag can be used to specify one or more directories that
+# contain include files that are not input files but should be processed by
+# the preprocessor.
+
+INCLUDE_PATH =
+
+# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
+# patterns (like *.h and *.hpp) to filter out the header-files in the
+# directories. If left blank, the patterns specified with FILE_PATTERNS will
+# be used.
+
+INCLUDE_FILE_PATTERNS =
+
+# The PREDEFINED tag can be used to specify one or more macro names that
+# are defined before the preprocessor is started (similar to the -D option of
+# gcc). The argument of the tag is a list of macros of the form: name
+# or name=definition (no spaces). If the definition and the = are
+# omitted =1 is assumed. To prevent a macro definition from being
+# undefined via #undef or recursively expanded use the := operator
+# instead of the = operator.
+
+PREDEFINED =
+
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then
+# this tag can be used to specify a list of macro names that should be expanded.
+# The macro definition that is found in the sources will be used.
+# Use the PREDEFINED tag if you want to use a different macro definition that overrules the definition found in the source code.
+
+EXPAND_AS_DEFINED =
+
+# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then
+# doxygen's preprocessor will remove all references to function-like macros
+# that are alone on a line, have an all uppercase name, and do not end with a
+# semicolon, because these will confuse the parser if not removed.
+
+SKIP_FUNCTION_MACROS = YES
+
+#---------------------------------------------------------------------------
+# Configuration::additions related to external references
+#---------------------------------------------------------------------------
+
+# The TAGFILES option can be used to specify one or more tagfiles.
+# Optionally an initial location of the external documentation
+# can be added for each tagfile. The format of a tag file without
+# this location is as follows:
+#
+# TAGFILES = file1 file2 ...
+# Adding location for the tag files is done as follows:
+#
+# TAGFILES = file1=loc1 "file2 = loc2" ...
+# where "loc1" and "loc2" can be relative or absolute paths or
+# URLs. If a location is present for each tag, the installdox tool
+# does not have to be run to correct the links.
+# Note that each tag file must have a unique name
+# (where the name does NOT include the path)
+# If a tag file is not located in the directory in which doxygen
+# is run, you must also specify the path to the tagfile here.
+
+TAGFILES =
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create
+# a tag file that is based on the input files it reads.
+
+GENERATE_TAGFILE =
+
+# If the ALLEXTERNALS tag is set to YES all external classes will be listed
+# in the class index. If set to NO only the inherited external classes
+# will be listed.
+
+ALLEXTERNALS = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed
+# in the modules index. If set to NO, only the current project's groups will
+# be listed.
+
+EXTERNAL_GROUPS = YES
+
+# The PERL_PATH should be the absolute path and name of the perl script
+# interpreter (i.e. the result of `which perl').
+
+PERL_PATH = /usr/bin/perl
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+
+# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will
+# generate an inheritance diagram (in HTML, RTF and LaTeX) for classes with base
+# or super classes. Setting the tag to NO turns the diagrams off. Note that
+# this option also works with HAVE_DOT disabled, but it is recommended to
+# install and use dot, since it yields more powerful graphs.
+
+CLASS_DIAGRAMS = YES
+
+# You can define message sequence charts within doxygen comments using the \msc
+# command. Doxygen will then run the mscgen tool (see
+# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the
+# documentation. The MSCGEN_PATH tag allows you to specify the directory where
+# the mscgen tool resides. If left empty the tool is assumed to be found in the
+# default search path.
+
+MSCGEN_PATH =
+
+# If set to YES, the inheritance and collaboration graphs will hide
+# inheritance and usage relations if the target is undocumented
+# or is not a class.
+
+HIDE_UNDOC_RELATIONS = YES
+
+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
+# available from the path. This tool is part of Graphviz, a graph visualization
+# toolkit from AT&T and Lucent Bell Labs. The other options in this section
+# have no effect if this option is set to NO (the default)
+
+HAVE_DOT = NO
+
+# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is
+# allowed to run in parallel. When set to 0 (the default) doxygen will
+# base this on the number of processors available in the system. You can set it
+# explicitly to a value larger than 0 to get control over the balance
+# between CPU load and processing speed.
+
+DOT_NUM_THREADS = 0
+
+# By default doxygen will write a font called Helvetica to the output
+# directory and reference it in all dot files that doxygen generates.
+# When you want a differently looking font you can specify the font name
+# using DOT_FONTNAME. You need to make sure dot is able to find the font,
+# which can be done by putting it in a standard location or by setting the
+# DOTFONTPATH environment variable or by setting DOT_FONTPATH to the directory
+# containing the font.
+
+DOT_FONTNAME = Helvetica
+
+# The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs.
+# The default size is 10pt.
+
+DOT_FONTSIZE = 10
+
+# By default doxygen will tell dot to use the output directory to look for the
+# FreeSans.ttf font (which doxygen will put there itself). If you specify a
+# different font using DOT_FONTNAME you can set the path where dot
+# can find it using this tag.
+
+DOT_FONTPATH =
+
+# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect inheritance relations. Setting this tag to YES will force the
+# CLASS_DIAGRAMS tag to NO.
+
+CLASS_GRAPH = YES
+
+# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect implementation dependencies (inheritance, containment, and
+# class references variables) of the class with other documented classes.
+
+COLLABORATION_GRAPH = YES
+
+# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for groups, showing the direct group dependencies.
+
+GROUP_GRAPHS = YES
+
+# If the UML_LOOK tag is set to YES doxygen will generate inheritance and
+# collaboration diagrams in a style similar to the OMG's Unified Modeling
+# Language.
+
+UML_LOOK = NO
+
+# If set to YES, the inheritance and collaboration graphs will show the
+# relations between templates and their instances.
+
+TEMPLATE_RELATIONS = NO
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT
+# tags are set to YES then doxygen will generate a graph for each documented
+# file showing the direct and indirect include dependencies of the file with
+# other documented files.
+
+INCLUDE_GRAPH = YES
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and
+# HAVE_DOT tags are set to YES then doxygen will generate a graph for each
+# documented header file showing the documented files that directly or
+# indirectly include this file.
+
+INCLUDED_BY_GRAPH = YES
+
+# If the CALL_GRAPH and HAVE_DOT options are set to YES then
+# doxygen will generate a call dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable call graphs
+# for selected functions only using the \callgraph command.
+
+CALL_GRAPH = NO
+
+# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then
+# doxygen will generate a caller dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable caller
+# graphs for selected functions only using the \callergraph command.
+
+CALLER_GRAPH = NO
+
+# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen
+# will generate a graphical hierarchy of all classes instead of a textual one.
+
+GRAPHICAL_HIERARCHY = YES
+
+# If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES
+# then doxygen will show the dependencies a directory has on other directories
+# in a graphical way. The dependency relations are determined by the #include
+# relations between the files in the directories.
+
+DIRECTORY_GRAPH = YES
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
+# generated by dot. Possible values are png, gif or svg.
+# If left blank png will be used.
+
+DOT_IMAGE_FORMAT = png
+
+# The tag DOT_PATH can be used to specify the path where the dot tool can be
+# found. If left blank, it is assumed the dot tool can be found in the path.
+
+DOT_PATH =
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that
+# contain dot files that are included in the documentation (see the
+# \dotfile command).
+
+DOTFILE_DIRS =
+
+# The MSCFILE_DIRS tag can be used to specify one or more directories that
+# contain msc files that are included in the documentation (see the
+# \mscfile command).
+
+MSCFILE_DIRS =
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of
+# nodes that will be shown in the graph. If the number of nodes in a graph
+# becomes larger than this value, doxygen will truncate the graph, which is
+# visualized by representing a node as a red box. Note that if the
+# number of direct children of the root node in a graph is already larger than
+# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note
+# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+
+DOT_GRAPH_MAX_NODES = 50
+
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the
+# graphs generated by dot. A depth value of 3 means that only nodes reachable
+# from the root by following a path via at most 3 edges will be shown. Nodes
+# that lie further from the root node will be omitted. Note that setting this
+# option to 1 or 2 may greatly reduce the computation time needed for large
+# code bases. Also note that the size of a graph can be further restricted by
+# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+
+MAX_DOT_GRAPH_DEPTH = 0
+
+# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
+# background. This is disabled by default, because dot on Windows does not
+# seem to support this out of the box. Warning: Depending on the platform used,
+# enabling this option may lead to badly anti-aliased labels on the edges of
+# a graph (i.e. they become hard to read).
+
+DOT_TRANSPARENT = NO
+
+# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
+# files in one run (i.e. multiple -o and -T options on the command line). This
+# makes dot run faster, but since only newer versions of dot (>1.8.10)
+# support this, this feature is disabled by default.
+
+DOT_MULTI_TARGETS = YES
+
+# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will
+# generate a legend page explaining the meaning of the various boxes and
+# arrows in the dot generated graphs.
+
+GENERATE_LEGEND = YES
+
+# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will
+# remove the intermediate dot files that are used to generate
+# the various graphs.
+
+DOT_CLEANUP = YES
+++ /dev/null
-# Doxyfile 1.7.3
-
-# This file describes the settings to be used by the documentation system
-# doxygen (www.doxygen.org) for a project.
-#
-# All text after a hash (#) is considered a comment and will be ignored.
-# The format is:
-# TAG = value [value, ...]
-# For lists items can also be appended using:
-# TAG += value [value, ...]
-# Values that contain spaces should be placed between quotes (" ").
-
-#---------------------------------------------------------------------------
-# Project related configuration options
-#---------------------------------------------------------------------------
-
-# This tag specifies the encoding used for all characters in the config file
-# that follow. The default is UTF-8 which is also the encoding used for all
-# text before the first occurrence of this tag. Doxygen uses libiconv (or the
-# iconv built into libc) for the transcoding. See
-# http://www.gnu.org/software/libiconv for the list of possible encodings.
-
-DOXYFILE_ENCODING = UTF-8
-
-# The PROJECT_NAME tag is a single word (or a sequence of words surrounded
-# by quotes) that should identify the project.
-
-PROJECT_NAME = NE10 SIMD LIBRARY
-
-# The PROJECT_NUMBER tag can be used to enter a project or revision number.
-# This could be handy for archiving the generated documentation or
-# if some version control system is used.
-
-PROJECT_NUMBER =
-
-# Using the PROJECT_BRIEF tag one can provide an optional one line description for a project that appears at the top of each page and should give viewer a quick idea about the purpose of the project. Keep the description short.
-
-PROJECT_BRIEF = SIMD FUNCTION LIBRARY FOR LINUX AND ANDROID
-
-# With the PROJECT_LOGO tag one can specify an logo or icon that is
-# included in the documentation. The maximum height of the logo should not
-# exceed 55 pixels and the maximum width should not exceed 200 pixels.
-# Doxygen will copy the logo to the output directory.
-
-PROJECT_LOGO = NE10
-
-# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
-# base path where the generated documentation will be put.
-# If a relative path is entered, it will be relative to the location
-# where doxygen was started. If left blank the current directory will be used.
-
-OUTPUT_DIRECTORY = ./docs
-
-# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create
-# 4096 sub-directories (in 2 levels) under the output directory of each output
-# format and will distribute the generated files over these directories.
-# Enabling this option can be useful when feeding doxygen a huge amount of
-# source files, where putting all generated files in the same directory would
-# otherwise cause performance problems for the file system.
-
-CREATE_SUBDIRS = NO
-
-# The OUTPUT_LANGUAGE tag is used to specify the language in which all
-# documentation generated by doxygen is written. Doxygen will use this
-# information to generate all constant output in the proper language.
-# The default language is English, other supported languages are:
-# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional,
-# Croatian, Czech, Danish, Dutch, Esperanto, Farsi, Finnish, French, German,
-# Greek, Hungarian, Italian, Japanese, Japanese-en (Japanese with English
-# messages), Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian,
-# Polish, Portuguese, Romanian, Russian, Serbian, Serbian-Cyrillic, Slovak,
-# Slovene, Spanish, Swedish, Ukrainian, and Vietnamese.
-
-OUTPUT_LANGUAGE = English
-
-# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will
-# include brief member descriptions after the members that are listed in
-# the file and class documentation (similar to JavaDoc).
-# Set to NO to disable this.
-
-BRIEF_MEMBER_DESC = YES
-
-# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend
-# the brief description of a member or function before the detailed description.
-# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
-# brief descriptions will be completely suppressed.
-
-REPEAT_BRIEF = YES
-
-# This tag implements a quasi-intelligent brief description abbreviator
-# that is used to form the text in various listings. Each string
-# in this list, if found as the leading text of the brief description, will be
-# stripped from the text and the result after processing the whole list, is
-# used as the annotated text. Otherwise, the brief description is used as-is.
-# If left blank, the following values are used ("$name" is automatically
-# replaced with the name of the entity): "The $name class" "The $name widget"
-# "The $name file" "is" "provides" "specifies" "contains"
-# "represents" "a" "an" "the"
-
-ABBREVIATE_BRIEF =
-
-# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
-# Doxygen will generate a detailed section even if there is only a brief
-# description.
-
-ALWAYS_DETAILED_SEC = NO
-
-# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
-# inherited members of a class in the documentation of that class as if those
-# members were ordinary class members. Constructors, destructors and assignment
-# operators of the base classes will not be shown.
-
-INLINE_INHERITED_MEMB = NO
-
-# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full
-# path before file names in the file list and in the header files. If set
-# to NO the shortest path that makes the file name unique will be used.
-
-FULL_PATH_NAMES = YES
-
-# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag
-# can be used to strip a user-defined part of the path. Stripping is
-# only done if one of the specified strings matches the left-hand part of
-# the path. The tag can be used to show relative paths in the file list.
-# If left blank the directory from which doxygen is run is used as the
-# path to strip.
-
-STRIP_FROM_PATH =
-
-# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of
-# the path mentioned in the documentation of a class, which tells
-# the reader which header file to include in order to use a class.
-# If left blank only the name of the header file containing the class
-# definition is used. Otherwise one should specify the include paths that
-# are normally passed to the compiler using the -I flag.
-
-STRIP_FROM_INC_PATH =
-
-# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter
-# (but less readable) file names. This can be useful if your file system
-# doesn't support long names like on DOS, Mac, or CD-ROM.
-
-SHORT_NAMES = NO
-
-# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen
-# will interpret the first line (until the first dot) of a JavaDoc-style
-# comment as the brief description. If set to NO, the JavaDoc
-# comments will behave just like regular Qt-style comments
-# (thus requiring an explicit @brief command for a brief description.)
-
-JAVADOC_AUTOBRIEF = NO
-
-# If the QT_AUTOBRIEF tag is set to YES then Doxygen will
-# interpret the first line (until the first dot) of a Qt-style
-# comment as the brief description. If set to NO, the comments
-# will behave just like regular Qt-style comments (thus requiring
-# an explicit \brief command for a brief description.)
-
-QT_AUTOBRIEF = NO
-
-# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen
-# treat a multi-line C++ special comment block (i.e. a block of //! or ///
-# comments) as a brief description. This used to be the default behaviour.
-# The new default is to treat a multi-line C++ comment block as a detailed
-# description. Set this tag to YES if you prefer the old behaviour instead.
-
-MULTILINE_CPP_IS_BRIEF = NO
-
-# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented
-# member inherits the documentation from any documented member that it
-# re-implements.
-
-INHERIT_DOCS = YES
-
-# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce
-# a new page for each member. If set to NO, the documentation of a member will
-# be part of the file/class/namespace that contains it.
-
-SEPARATE_MEMBER_PAGES = NO
-
-# The TAB_SIZE tag can be used to set the number of spaces in a tab.
-# Doxygen uses this value to replace tabs by spaces in code fragments.
-
-TAB_SIZE = 8
-
-# This tag can be used to specify a number of aliases that acts
-# as commands in the documentation. An alias has the form "name=value".
-# For example adding "sideeffect=\par Side Effects:\n" will allow you to
-# put the command \sideeffect (or @sideeffect) in the documentation, which
-# will result in a user-defined paragraph with heading "Side Effects:".
-# You can put \n's in the value part of an alias to insert newlines.
-
-ALIASES =
-
-# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C
-# sources only. Doxygen will then generate output that is more tailored for C.
-# For instance, some of the names that are used will be different. The list
-# of all members will be omitted, etc.
-
-OPTIMIZE_OUTPUT_FOR_C = NO
-
-# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java
-# sources only. Doxygen will then generate output that is more tailored for
-# Java. For instance, namespaces will be presented as packages, qualified
-# scopes will look different, etc.
-
-OPTIMIZE_OUTPUT_JAVA = NO
-
-# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
-# sources only. Doxygen will then generate output that is more tailored for
-# Fortran.
-
-OPTIMIZE_FOR_FORTRAN = NO
-
-# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
-# sources. Doxygen will then generate output that is tailored for
-# VHDL.
-
-OPTIMIZE_OUTPUT_VHDL = NO
-
-# Doxygen selects the parser to use depending on the extension of the files it
-# parses. With this tag you can assign which parser to use for a given extension.
-# Doxygen has a built-in mapping, but you can override or extend it using this
-# tag. The format is ext=language, where ext is a file extension, and language
-# is one of the parsers supported by doxygen: IDL, Java, Javascript, CSharp, C,
-# C++, D, PHP, Objective-C, Python, Fortran, VHDL. For instance to make
-# doxygen treat .inc files as Fortran files (default is PHP), and .f files as C
-# (default is Fortran), use: inc=Fortran f=C. Note that for custom extensions
-# you also need to set FILE_PATTERNS otherwise the files are not read by doxygen.
-
-EXTENSION_MAPPING =
-
-# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
-# to include (a tag file for) the STL sources as input, then you should
-# set this tag to YES in order to let doxygen match functions declarations and
-# definitions whose arguments contain STL classes (e.g. func(std::string); vs.
-# func(std::string) {}). This also makes the inheritance and collaboration
-# diagrams that involve STL classes more complete and accurate.
-
-BUILTIN_STL_SUPPORT = NO
-
-# If you use Microsoft's C++/CLI language, you should set this option to YES to
-# enable parsing support.
-
-CPP_CLI_SUPPORT = NO
-
-# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only.
-# Doxygen will parse them like normal C++ but will assume all classes use public
-# instead of private inheritance when no explicit protection keyword is present.
-
-SIP_SUPPORT = NO
-
-# For Microsoft's IDL there are propget and propput attributes to indicate getter
-# and setter methods for a property. Setting this option to YES (the default)
-# will make doxygen replace the get and set methods by a property in the
-# documentation. This will only work if the methods are indeed getting or
-# setting a simple type. If this is not the case, or you want to show the
-# methods anyway, you should set this option to NO.
-
-IDL_PROPERTY_SUPPORT = YES
-
-# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
-# tag is set to YES, then doxygen will reuse the documentation of the first
-# member in the group (if any) for the other members of the group. By default
-# all members of a group must be documented explicitly.
-
-DISTRIBUTE_GROUP_DOC = NO
-
-# Set the SUBGROUPING tag to YES (the default) to allow class member groups of
-# the same type (for instance a group of public functions) to be put as a
-# subgroup of that type (e.g. under the Public Functions section). Set it to
-# NO to prevent subgrouping. Alternatively, this can be done per class using
-# the \nosubgrouping command.
-
-SUBGROUPING = YES
-
-# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum
-# is documented as struct, union, or enum with the name of the typedef. So
-# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
-# with name TypeT. When disabled the typedef will appear as a member of a file,
-# namespace, or class. And the struct will be named TypeS. This can typically
-# be useful for C code in case the coding convention dictates that all compound
-# types are typedef'ed and only the typedef is referenced, never the tag name.
-
-TYPEDEF_HIDES_STRUCT = NO
-
-# The SYMBOL_CACHE_SIZE determines the size of the internal cache used to
-# determine which symbols to keep in memory and which to flush to disk.
-# When the cache is full, less often used symbols will be written to disk.
-# For small to medium size projects (<1000 input files) the default value is
-# probably good enough. For larger projects a too small cache size can cause
-# doxygen to be busy swapping symbols to and from disk most of the time
-# causing a significant performance penalty.
-# If the system has enough physical memory increasing the cache will improve the
-# performance by keeping more symbols in memory. Note that the value works on
-# a logarithmic scale so increasing the size by one will roughly double the
-# memory usage. The cache size is given by this formula:
-# 2^(16+SYMBOL_CACHE_SIZE). The valid range is 0..9, the default is 0,
-# corresponding to a cache size of 2^16 = 65536 symbols
-
-SYMBOL_CACHE_SIZE = 0
-
-#---------------------------------------------------------------------------
-# Build related configuration options
-#---------------------------------------------------------------------------
-
-# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in
-# documentation are documented, even if no documentation was available.
-# Private class members and static file members will be hidden unless
-# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES
-
-EXTRACT_ALL = NO
-
-# If the EXTRACT_PRIVATE tag is set to YES all private members of a class
-# will be included in the documentation.
-
-EXTRACT_PRIVATE = NO
-
-# If the EXTRACT_STATIC tag is set to YES all static members of a file
-# will be included in the documentation.
-
-EXTRACT_STATIC = NO
-
-# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs)
-# defined locally in source files will be included in the documentation.
-# If set to NO only classes defined in header files are included.
-
-EXTRACT_LOCAL_CLASSES = YES
-
-# This flag is only useful for Objective-C code. When set to YES local
-# methods, which are defined in the implementation section but not in
-# the interface are included in the documentation.
-# If set to NO (the default) only methods in the interface are included.
-
-EXTRACT_LOCAL_METHODS = NO
-
-# If this flag is set to YES, the members of anonymous namespaces will be
-# extracted and appear in the documentation as a namespace called
-# 'anonymous_namespace{file}', where file will be replaced with the base
-# name of the file that contains the anonymous namespace. By default
-# anonymous namespaces are hidden.
-
-EXTRACT_ANON_NSPACES = NO
-
-# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all
-# undocumented members of documented classes, files or namespaces.
-# If set to NO (the default) these members will be included in the
-# various overviews, but no documentation section is generated.
-# This option has no effect if EXTRACT_ALL is enabled.
-
-HIDE_UNDOC_MEMBERS = NO
-
-# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all
-# undocumented classes that are normally visible in the class hierarchy.
-# If set to NO (the default) these classes will be included in the various
-# overviews. This option has no effect if EXTRACT_ALL is enabled.
-
-HIDE_UNDOC_CLASSES = NO
-
-# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all
-# friend (class|struct|union) declarations.
-# If set to NO (the default) these declarations will be included in the
-# documentation.
-
-HIDE_FRIEND_COMPOUNDS = NO
-
-# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any
-# documentation blocks found inside the body of a function.
-# If set to NO (the default) these blocks will be appended to the
-# function's detailed documentation block.
-
-HIDE_IN_BODY_DOCS = NO
-
-# The INTERNAL_DOCS tag determines if documentation
-# that is typed after a \internal command is included. If the tag is set
-# to NO (the default) then the documentation will be excluded.
-# Set it to YES to include the internal documentation.
-
-INTERNAL_DOCS = NO
-
-# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate
-# file names in lower-case letters. If set to YES upper-case letters are also
-# allowed. This is useful if you have classes or files whose names only differ
-# in case and if your file system supports case sensitive file names. Windows
-# and Mac users are advised to set this option to NO.
-
-CASE_SENSE_NAMES = YES
-
-# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen
-# will show members with their full class and namespace scopes in the
-# documentation. If set to YES the scope will be hidden.
-
-HIDE_SCOPE_NAMES = NO
-
-# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen
-# will put a list of the files that are included by a file in the documentation
-# of that file.
-
-SHOW_INCLUDE_FILES = YES
-
-# If the FORCE_LOCAL_INCLUDES tag is set to YES then Doxygen
-# will list include files with double quotes in the documentation
-# rather than with sharp brackets.
-
-FORCE_LOCAL_INCLUDES = NO
-
-# If the INLINE_INFO tag is set to YES (the default) then a tag [inline]
-# is inserted in the documentation for inline members.
-
-INLINE_INFO = YES
-
-# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen
-# will sort the (detailed) documentation of file and class members
-# alphabetically by member name. If set to NO the members will appear in
-# declaration order.
-
-SORT_MEMBER_DOCS = YES
-
-# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the
-# brief documentation of file, namespace and class members alphabetically
-# by member name. If set to NO (the default) the members will appear in
-# declaration order.
-
-SORT_BRIEF_DOCS = NO
-
-# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen
-# will sort the (brief and detailed) documentation of class members so that
-# constructors and destructors are listed first. If set to NO (the default)
-# the constructors will appear in the respective orders defined by
-# SORT_MEMBER_DOCS and SORT_BRIEF_DOCS.
-# This tag will be ignored for brief docs if SORT_BRIEF_DOCS is set to NO
-# and ignored for detailed docs if SORT_MEMBER_DOCS is set to NO.
-
-SORT_MEMBERS_CTORS_1ST = NO
-
-# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the
-# hierarchy of group names into alphabetical order. If set to NO (the default)
-# the group names will appear in their defined order.
-
-SORT_GROUP_NAMES = NO
-
-# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be
-# sorted by fully-qualified names, including namespaces. If set to
-# NO (the default), the class list will be sorted only by class name,
-# not including the namespace part.
-# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
-# Note: This option applies only to the class list, not to the
-# alphabetical list.
-
-SORT_BY_SCOPE_NAME = NO
-
-# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper
-# type resolution of all parameters of a function it will reject a match between
-# the prototype and the implementation of a member function even if there is
-# only one candidate or it is obvious which candidate to choose by doing a
-# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still
-# accept a match between prototype and implementation in such cases.
-
-STRICT_PROTO_MATCHING = NO
-
-# The GENERATE_TESTLIST tag can be used to enable (YES) or
-# disable (NO) the test list. This list is created by putting \test
-# commands in the documentation.
-
-GENERATE_TESTLIST = YES
-
-# The GENERATE_BUGLIST tag can be used to enable (YES) or
-# disable (NO) the bug list. This list is created by putting \bug
-# commands in the documentation.
-
-GENERATE_BUGLIST = YES
-
-# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or
-# disable (NO) the deprecated list. This list is created by putting
-# \deprecated commands in the documentation.
-
-GENERATE_DEPRECATEDLIST= YES
-
-# The ENABLED_SECTIONS tag can be used to enable conditional
-# documentation sections, marked by \if sectionname ... \endif.
-
-ENABLED_SECTIONS =
-
-# The MAX_INITIALIZER_LINES tag determines the maximum number of lines
-# the initial value of a variable or macro consists of for it to appear in
-# the documentation. If the initializer consists of more lines than specified
-# here it will be hidden. Use a value of 0 to hide initializers completely.
-# The appearance of the initializer of individual variables and macros in the
-# documentation can be controlled using \showinitializer or \hideinitializer
-# command in the documentation regardless of this setting.
-
-MAX_INITIALIZER_LINES = 30
-
-# Set the SHOW_USED_FILES tag to NO to disable the list of files generated
-# at the bottom of the documentation of classes and structs. If set to YES the
-# list will mention the files that were used to generate the documentation.
-
-SHOW_USED_FILES = YES
-
-# If the sources in your project are distributed over multiple directories
-# then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy
-# in the documentation. The default is NO.
-
-SHOW_DIRECTORIES = NO
-
-# Set the SHOW_FILES tag to NO to disable the generation of the Files page.
-# This will remove the Files entry from the Quick Index and from the
-# Folder Tree View (if specified). The default is YES.
-
-SHOW_FILES = YES
-
-# Set the SHOW_NAMESPACES tag to NO to disable the generation of the
-# Namespaces page.
-# This will remove the Namespaces entry from the Quick Index
-# and from the Folder Tree View (if specified). The default is YES.
-
-SHOW_NAMESPACES = YES
-
-# The FILE_VERSION_FILTER tag can be used to specify a program or script that
-# doxygen should invoke to get the current version for each file (typically from
-# the version control system). Doxygen will invoke the program by executing (via
-# popen()) the command <command> <input-file>, where <command> is the value of
-# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file
-# provided by doxygen. Whatever the program writes to standard output
-# is used as the file version. See the manual for examples.
-
-FILE_VERSION_FILTER =
-
-# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
-# by doxygen. The layout file controls the global structure of the generated
-# output files in an output format independent way. To create the layout file
-# that represents doxygen's defaults, run doxygen with the -l option.
-# You can optionally specify a file name after the option, if omitted
-# DoxygenLayout.xml will be used as the name of the layout file.
-
-LAYOUT_FILE =
-
-#---------------------------------------------------------------------------
-# configuration options related to warning and progress messages
-#---------------------------------------------------------------------------
-
-# The QUIET tag can be used to turn on/off the messages that are generated
-# by doxygen. Possible values are YES and NO. If left blank NO is used.
-
-QUIET = NO
-
-# The WARNINGS tag can be used to turn on/off the warning messages that are
-# generated by doxygen. Possible values are YES and NO. If left blank
-# NO is used.
-
-WARNINGS = YES
-
-# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings
-# for undocumented members. If EXTRACT_ALL is set to YES then this flag will
-# automatically be disabled.
-
-WARN_IF_UNDOCUMENTED = YES
-
-# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for
-# potential errors in the documentation, such as not documenting some
-# parameters in a documented function, or documenting parameters that
-# don't exist or using markup commands wrongly.
-
-WARN_IF_DOC_ERROR = YES
-
-# The WARN_NO_PARAMDOC option can be enabled to get warnings for
-# functions that are documented, but have no documentation for their parameters
-# or return value. If set to NO (the default) doxygen will only warn about
-# wrong or incomplete parameter documentation, but not about the absence of
-# documentation.
-
-WARN_NO_PARAMDOC = NO
-
-# The WARN_FORMAT tag determines the format of the warning messages that
-# doxygen can produce. The string should contain the $file, $line, and $text
-# tags, which will be replaced by the file and line number from which the
-# warning originated and the warning text. Optionally the format may contain
-# $version, which will be replaced by the version of the file (if it could
-# be obtained via FILE_VERSION_FILTER)
-
-WARN_FORMAT = "$file:$line: $text"
-
-# The WARN_LOGFILE tag can be used to specify a file to which warning
-# and error messages should be written. If left blank the output is written
-# to stderr.
-
-WARN_LOGFILE =
-
-#---------------------------------------------------------------------------
-# configuration options related to the input files
-#---------------------------------------------------------------------------
-
-# The INPUT tag can be used to specify the files and/or directories that contain
-# documented source files. You may enter file names like "myfile.cpp" or
-# directories like "/usr/src/myproject". Separate the files or directories
-# with spaces.
-
-INPUT = ./inc/
-
-# This tag can be used to specify the character encoding of the source files
-# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is
-# also the default input encoding. Doxygen uses libiconv (or the iconv built
-# into libc) for the transcoding. See http://www.gnu.org/software/libiconv for
-# the list of possible encodings.
-
-INPUT_ENCODING = UTF-8
-
-# If the value of the INPUT tag contains directories, you can use the
-# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp
-# and *.h) to filter out the source-files in the directories. If left
-# blank the following patterns are tested:
-# *.c *.cc *.cxx *.cpp *.c++ *.d *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh
-# *.hxx *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.dox *.py
-# *.f90 *.f *.for *.vhd *.vhdl
-
-FILE_PATTERNS = NE10.h
-
-# The RECURSIVE tag can be used to specify whether or not subdirectories
-# should be searched for input files as well. Possible values are YES and NO.
-# If left blank NO is used.
-
-RECURSIVE = YES
-
-# The EXCLUDE tag can be used to specify files and/or directories that should
-# be excluded from the INPUT source files. This way you can easily exclude a
-# subdirectory from a directory tree whose root is specified with the INPUT tag.
-
-EXCLUDE = .git
-
-# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
-# directories that are symbolic links (a Unix file system feature) are excluded
-# from the input.
-
-EXCLUDE_SYMLINKS = NO
-
-# If the value of the INPUT tag contains directories, you can use the
-# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
-# certain files from those directories. Note that the wildcards are matched
-# against the file with absolute path, so to exclude all test directories
-# for example use the pattern */test/*
-
-EXCLUDE_PATTERNS =
-
-# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
-# (namespaces, classes, functions, etc.) that should be excluded from the
-# output. The symbol name can be a fully qualified name, a word, or if the
-# wildcard * is used, a substring. Examples: ANamespace, AClass,
-# AClass::ANamespace, ANamespace::*Test
-
-EXCLUDE_SYMBOLS =
-
-# The EXAMPLE_PATH tag can be used to specify one or more files or
-# directories that contain example code fragments that are included (see
-# the \include command).
-
-EXAMPLE_PATH =
-
-# If the value of the EXAMPLE_PATH tag contains directories, you can use the
-# EXAMPLE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp
-# and *.h) to filter out the source-files in the directories. If left
-# blank all files are included.
-
-EXAMPLE_PATTERNS =
-
-# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
-# searched for input files to be used with the \include or \dontinclude
-# commands irrespective of the value of the RECURSIVE tag.
-# Possible values are YES and NO. If left blank NO is used.
-
-EXAMPLE_RECURSIVE = NO
-
-# The IMAGE_PATH tag can be used to specify one or more files or
-# directories that contain images that are included in the documentation (see
-# the \image command).
-
-IMAGE_PATH =
-
-# The INPUT_FILTER tag can be used to specify a program that doxygen should
-# invoke to filter for each input file. Doxygen will invoke the filter program
-# by executing (via popen()) the command <filter> <input-file>, where <filter>
-# is the value of the INPUT_FILTER tag, and <input-file> is the name of an
-# input file. Doxygen will then use the output that the filter program writes
-# to standard output.
-# If FILTER_PATTERNS is specified, this tag will be
-# ignored.
-
-INPUT_FILTER =
-
-# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
-# basis.
-# Doxygen will compare the file name with each pattern and apply the
-# filter if there is a match.
-# The filters are a list of the form:
-# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further
-# info on how filters are used. If FILTER_PATTERNS is empty or if
-# none of the patterns match the file name, INPUT_FILTER is applied.
-
-FILTER_PATTERNS =
-
-# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
-# INPUT_FILTER) will be used to filter the input files when producing source
-# files to browse (i.e. when SOURCE_BROWSER is set to YES).
-
-FILTER_SOURCE_FILES = NO
-
-# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
-# pattern. A pattern will override the setting for FILTER_PATTERNS (if any)
-# and it is also possible to disable source filtering for a specific pattern
-# using *.ext= (so without naming a filter). This option only has effect when
-# FILTER_SOURCE_FILES is enabled.
-
-FILTER_SOURCE_PATTERNS =
-
-#---------------------------------------------------------------------------
-# configuration options related to source browsing
-#---------------------------------------------------------------------------
-
-# If the SOURCE_BROWSER tag is set to YES then a list of source files will
-# be generated. Documented entities will be cross-referenced with these sources.
-# Note: To get rid of all source code in the generated output, make sure also
-# VERBATIM_HEADERS is set to NO.
-
-SOURCE_BROWSER = NO
-
-# Setting the INLINE_SOURCES tag to YES will include the body
-# of functions and classes directly in the documentation.
-
-INLINE_SOURCES = NO
-
-# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct
-# doxygen to hide any special comment blocks from generated source code
-# fragments. Normal C and C++ comments will always remain visible.
-
-STRIP_CODE_COMMENTS = YES
-
-# If the REFERENCED_BY_RELATION tag is set to YES
-# then for each documented function all documented
-# functions referencing it will be listed.
-
-REFERENCED_BY_RELATION = NO
-
-# If the REFERENCES_RELATION tag is set to YES
-# then for each documented function all documented entities
-# called/used by that function will be listed.
-
-REFERENCES_RELATION = NO
-
-# If the REFERENCES_LINK_SOURCE tag is set to YES (the default)
-# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from
-# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will
-# link to the source code.
-# Otherwise they will link to the documentation.
-
-REFERENCES_LINK_SOURCE = YES
-
-# If the USE_HTAGS tag is set to YES then the references to source code
-# will point to the HTML generated by the htags(1) tool instead of doxygen
-# built-in source browser. The htags tool is part of GNU's global source
-# tagging system (see http://www.gnu.org/software/global/global.html). You
-# will need version 4.8.6 or higher.
-
-USE_HTAGS = NO
-
-# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen
-# will generate a verbatim copy of the header file for each class for
-# which an include is specified. Set to NO to disable this.
-
-VERBATIM_HEADERS = YES
-
-#---------------------------------------------------------------------------
-# configuration options related to the alphabetical class index
-#---------------------------------------------------------------------------
-
-# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index
-# of all compounds will be generated. Enable this if the project
-# contains a lot of classes, structs, unions or interfaces.
-
-ALPHABETICAL_INDEX = YES
-
-# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then
-# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns
-# in which this list will be split (can be a number in the range [1..20])
-
-COLS_IN_ALPHA_INDEX = 5
-
-# In case all classes in a project start with a common prefix, all
-# classes will be put under the same header in the alphabetical index.
-# The IGNORE_PREFIX tag can be used to specify one or more prefixes that
-# should be ignored while generating the index headers.
-
-IGNORE_PREFIX =
-
-#---------------------------------------------------------------------------
-# configuration options related to the HTML output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_HTML tag is set to YES (the default) Doxygen will
-# generate HTML output.
-
-GENERATE_HTML = YES
-
-# The HTML_OUTPUT tag is used to specify where the HTML docs will be put.
-# If a relative path is entered the value of OUTPUT_DIRECTORY will be
-# put in front of it. If left blank `html' will be used as the default path.
-
-HTML_OUTPUT = html
-
-# The HTML_FILE_EXTENSION tag can be used to specify the file extension for
-# each generated HTML page (for example: .htm,.php,.asp). If it is left blank
-# doxygen will generate files with .html extension.
-
-HTML_FILE_EXTENSION = .html
-
-# The HTML_HEADER tag can be used to specify a personal HTML header for
-# each generated HTML page. If it is left blank doxygen will generate a
-# standard header.
-
-HTML_HEADER =
-
-# The HTML_FOOTER tag can be used to specify a personal HTML footer for
-# each generated HTML page. If it is left blank doxygen will generate a
-# standard footer.
-
-HTML_FOOTER =
-
-# The HTML_STYLESHEET tag can be used to specify a user-defined cascading
-# style sheet that is used by each HTML page. It can be used to
-# fine-tune the look of the HTML output. If the tag is left blank doxygen
-# will generate a default style sheet. Note that doxygen will try to copy
-# the style sheet file to the HTML output directory, so don't put your own
-# stylesheet in the HTML output directory as well, or it will be erased!
-
-HTML_STYLESHEET =
-
-# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output.
-# Doxygen will adjust the colors in the stylesheet and background images
-# according to this color. Hue is specified as an angle on a colorwheel,
-# see http://en.wikipedia.org/wiki/Hue for more information.
-# For instance the value 0 represents red, 60 is yellow, 120 is green,
-# 180 is cyan, 240 is blue, 300 purple, and 360 is red again.
-# The allowed range is 0 to 359.
-
-HTML_COLORSTYLE_HUE = 220
-
-# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of
-# the colors in the HTML output. For a value of 0 the output will use
-# grayscales only. A value of 255 will produce the most vivid colors.
-
-HTML_COLORSTYLE_SAT = 100
-
-# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to
-# the luminance component of the colors in the HTML output. Values below
-# 100 gradually make the output lighter, whereas values above 100 make
-# the output darker. The value divided by 100 is the actual gamma applied,
-# so 80 represents a gamma of 0.8, the value 220 represents a gamma of 2.2,
-# and 100 does not change the gamma.
-
-HTML_COLORSTYLE_GAMMA = 80
-
-# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
-# page will contain the date and time when the page was generated. Setting
-# this to NO can help when comparing the output of multiple runs.
-
-HTML_TIMESTAMP = YES
-
-# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes,
-# files or namespaces will be aligned in HTML using tables. If set to
-# NO a bullet list will be used.
-
-HTML_ALIGN_MEMBERS = YES
-
-# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
-# documentation will contain sections that can be hidden and shown after the
-# page has loaded. For this to work a browser that supports
-# JavaScript and DHTML is required (for instance Mozilla 1.0+, Firefox,
-# Netscape 6.0+, Internet explorer 5.0+, Konqueror, or Safari).
-
-HTML_DYNAMIC_SECTIONS = NO
-
-# If the GENERATE_DOCSET tag is set to YES, additional index files
-# will be generated that can be used as input for Apple's Xcode 3
-# integrated development environment, introduced with OSX 10.5 (Leopard).
-# To create a documentation set, doxygen will generate a Makefile in the
-# HTML output directory. Running make will produce the docset in that
-# directory and running "make install" will install the docset in
-# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find
-# it at startup.
-# See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html
-# for more information.
-
-GENERATE_DOCSET = NO
-
-# When GENERATE_DOCSET tag is set to YES, this tag determines the name of the
-# feed. A documentation feed provides an umbrella under which multiple
-# documentation sets from a single provider (such as a company or product suite)
-# can be grouped.
-
-DOCSET_FEEDNAME = "Doxygen generated docs"
-
-# When GENERATE_DOCSET tag is set to YES, this tag specifies a string that
-# should uniquely identify the documentation set bundle. This should be a
-# reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen
-# will append .docset to the name.
-
-DOCSET_BUNDLE_ID = org.doxygen.Project
-
-# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify
-# the documentation publisher. This should be a reverse domain-name style
-# string, e.g. com.mycompany.MyDocSet.documentation.
-
-DOCSET_PUBLISHER_ID = org.doxygen.Publisher
-
-# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
-
-DOCSET_PUBLISHER_NAME = Publisher
-
-# If the GENERATE_HTMLHELP tag is set to YES, additional index files
-# will be generated that can be used as input for tools like the
-# Microsoft HTML help workshop to generate a compiled HTML help file (.chm)
-# of the generated HTML documentation.
-
-GENERATE_HTMLHELP = NO
-
-# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can
-# be used to specify the file name of the resulting .chm file. You
-# can add a path in front of the file if the result should not be
-# written to the html output directory.
-
-CHM_FILE =
-
-# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can
-# be used to specify the location (absolute path including file name) of
-# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run
-# the HTML help compiler on the generated index.hhp.
-
-HHC_LOCATION =
-
-# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag
-# controls if a separate .chi index file is generated (YES) or that
-# it should be included in the master .chm file (NO).
-
-GENERATE_CHI = NO
-
-# If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING
-# is used to encode HtmlHelp index (hhk), content (hhc) and project file
-# content.
-
-CHM_INDEX_ENCODING =
-
-# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag
-# controls whether a binary table of contents is generated (YES) or a
-# normal table of contents (NO) in the .chm file.
-
-BINARY_TOC = NO
-
-# The TOC_EXPAND flag can be set to YES to add extra items for group members
-# to the contents of the HTML help documentation and to the tree view.
-
-TOC_EXPAND = NO
-
-# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
-# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated
-# that can be used as input for Qt's qhelpgenerator to generate a
-# Qt Compressed Help (.qch) of the generated HTML documentation.
-
-GENERATE_QHP = NO
-
-# If the QHG_LOCATION tag is specified, the QCH_FILE tag can
-# be used to specify the file name of the resulting .qch file.
-# The path specified is relative to the HTML output folder.
-
-QCH_FILE =
-
-# The QHP_NAMESPACE tag specifies the namespace to use when generating
-# Qt Help Project output. For more information please see
-# http://doc.trolltech.com/qthelpproject.html#namespace
-
-QHP_NAMESPACE = org.doxygen.Project
-
-# The QHP_VIRTUAL_FOLDER tag specifies the virtual folder to use when generating
-# Qt Help Project output. For more information please see
-# http://doc.trolltech.com/qthelpproject.html#virtual-folders
-
-QHP_VIRTUAL_FOLDER = doc
-
-# If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to
-# add. For more information please see
-# http://doc.trolltech.com/qthelpproject.html#custom-filters
-
-QHP_CUST_FILTER_NAME =
-
-# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
-# custom filter to add. For more information please see
-# <a href="http://doc.trolltech.com/qthelpproject.html#custom-filters">
-# Qt Help Project / Custom Filters</a>.
-
-QHP_CUST_FILTER_ATTRS =
-
-# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
-# project's
-# filter section matches.
-# <a href="http://doc.trolltech.com/qthelpproject.html#filter-attributes">
-# Qt Help Project / Filter Attributes</a>.
-
-QHP_SECT_FILTER_ATTRS =
-
-# If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can
-# be used to specify the location of Qt's qhelpgenerator.
-# If non-empty doxygen will try to run qhelpgenerator on the generated
-# .qhp file.
-
-QHG_LOCATION =
-
-# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files
-# will be generated, which together with the HTML files, form an Eclipse help
-# plugin. To install this plugin and make it available under the help contents
-# menu in Eclipse, the contents of the directory containing the HTML and XML
-# files needs to be copied into the plugins directory of eclipse. The name of
-# the directory within the plugins directory should be the same as
-# the ECLIPSE_DOC_ID value. After copying Eclipse needs to be restarted before
-# the help appears.
-
-GENERATE_ECLIPSEHELP = NO
-
-# A unique identifier for the eclipse help plugin. When installing the plugin
-# the directory name containing the HTML and XML files should also have
-# this name.
-
-ECLIPSE_DOC_ID = org.doxygen.Project
-
-# The DISABLE_INDEX tag can be used to turn on/off the condensed index at
-# top of each HTML page. The value NO (the default) enables the index and
-# the value YES disables it.
-
-DISABLE_INDEX = NO
-
-# This tag can be used to set the number of enum values (range [0,1..20])
-# that doxygen will group on one line in the generated HTML documentation.
-# Note that a value of 0 will completely suppress the enum values from appearing in the overview section.
-
-ENUM_VALUES_PER_LINE = 4
-
-# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
-# structure should be generated to display hierarchical information.
-# If the tag value is set to YES, a side panel will be generated
-# containing a tree-like index structure (just like the one that
-# is generated for HTML Help). For this to work a browser that supports
-# JavaScript, DHTML, CSS and frames is required (i.e. any modern browser).
-# Windows users are probably better off using the HTML help feature.
-
-GENERATE_TREEVIEW = NO
-
-# By enabling USE_INLINE_TREES, doxygen will generate the Groups, Directories,
-# and Class Hierarchy pages using a tree view instead of an ordered list.
-
-USE_INLINE_TREES = NO
-
-# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be
-# used to set the initial width (in pixels) of the frame in which the tree
-# is shown.
-
-TREEVIEW_WIDTH = 250
-
-# When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open
-# links to external symbols imported via tag files in a separate window.
-
-EXT_LINKS_IN_WINDOW = NO
-
-# Use this tag to change the font size of Latex formulas included
-# as images in the HTML documentation. The default is 10. Note that
-# when you change the font size after a successful doxygen run you need
-# to manually remove any form_*.png images from the HTML output directory
-# to force them to be regenerated.
-
-FORMULA_FONTSIZE = 10
-
-# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
-# generated for formulas are transparent PNGs. Transparent PNGs are
-# not supported properly for IE 6.0, but are supported on all modern browsers.
-# Note that when changing this option you need to delete any form_*.png files
-# in the HTML output before the changes have effect.
-
-FORMULA_TRANSPARENT = YES
-
-# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax
-# (see http://www.mathjax.org) which uses client side Javascript for the
-# rendering instead of using prerendered bitmaps. Use this if you do not
-# have LaTeX installed or if you want formulas to look prettier in the HTML
-# output. When enabled you also need to install MathJax separately and
-# configure the path to it using the MATHJAX_RELPATH option.
-
-USE_MATHJAX = NO
-
-# When MathJax is enabled you need to specify the location relative to the
-# HTML output directory using the MATHJAX_RELPATH option. The destination
-# directory should contain the MathJax.js script. For instance, if the mathjax
-# directory is located at the same level as the HTML output directory, then
-# MATHJAX_RELPATH should be ../mathjax. The default value points to the mathjax.org site, so you can quickly see the result without installing
-# MathJax, but it is strongly recommended to install a local copy of MathJax
-# before deployment.
-
-MATHJAX_RELPATH = http://www.mathjax.org/mathjax
-
-# When the SEARCHENGINE tag is enabled doxygen will generate a search box
-# for the HTML output. The underlying search engine uses javascript
-# and DHTML and should work on any modern browser. Note that when using
-# HTML help (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets
-# (GENERATE_DOCSET) there is already a search function so this one should
-# typically be disabled. For large projects the javascript based search engine
-# can be slow, then enabling SERVER_BASED_SEARCH may provide a better solution.
-
-SEARCHENGINE = YES
-
-# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
-# implemented using a PHP enabled web server instead of at the web client
-# using Javascript. Doxygen will generate the search PHP script and index
-# file to put on the web server. The advantage of the server
-# based approach is that it scales better to large projects and allows
-# full text search. The disadvantages are that it is more difficult to setup
-# and does not have live searching capabilities.
-
-SERVER_BASED_SEARCH = NO
-
-#---------------------------------------------------------------------------
-# configuration options related to the LaTeX output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will
-# generate Latex output.
-
-GENERATE_LATEX = NO
-
-# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put.
-# If a relative path is entered the value of OUTPUT_DIRECTORY will be
-# put in front of it. If left blank `latex' will be used as the default path.
-
-LATEX_OUTPUT = latex
-
-# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
-# invoked. If left blank `latex' will be used as the default command name.
-# Note that when enabling USE_PDFLATEX this option is only used for
-# generating bitmaps for formulas in the HTML output, but not in the
-# Makefile that is written to the output directory.
-
-LATEX_CMD_NAME = latex
-
-# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to
-# generate index for LaTeX. If left blank `makeindex' will be used as the
-# default command name.
-
-MAKEINDEX_CMD_NAME = makeindex
-
-# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact
-# LaTeX documents. This may be useful for small projects and may help to
-# save some trees in general.
-
-COMPACT_LATEX = NO
-
-# The PAPER_TYPE tag can be used to set the paper type that is used
-# by the printer. Possible values are: a4, letter, legal and
-# executive. If left blank a4wide will be used.
-
-PAPER_TYPE = a4
-
-# The EXTRA_PACKAGES tag can be used to specify one or more names of LaTeX
-# packages that should be included in the LaTeX output.
-
-EXTRA_PACKAGES =
-
-# The LATEX_HEADER tag can be used to specify a personal LaTeX header for
-# the generated latex document. The header should contain everything until
-# the first chapter. If it is left blank doxygen will generate a
-# standard header. Notice: only use this tag if you know what you are doing!
-
-LATEX_HEADER =
-
-# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated
-# is prepared for conversion to pdf (using ps2pdf). The pdf file will
-# contain links (just like the HTML output) instead of page references
-# This makes the output suitable for online browsing using a pdf viewer.
-
-PDF_HYPERLINKS = YES
-
-# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of
-# plain latex in the generated Makefile. Set this option to YES to get a
-# higher quality PDF documentation.
-
-USE_PDFLATEX = YES
-
-# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode
-# command to the generated LaTeX files. This will instruct LaTeX to keep
-# running if errors occur, instead of asking the user for help.
-# This option is also used when generating formulas in HTML.
-
-LATEX_BATCHMODE = NO
-
-# If LATEX_HIDE_INDICES is set to YES then doxygen will not
-# include the index chapters (such as File Index, Compound Index, etc.)
-# in the output.
-
-LATEX_HIDE_INDICES = NO
-
-# If LATEX_SOURCE_CODE is set to YES then doxygen will include
-# source code with syntax highlighting in the LaTeX output.
-# Note that which sources are shown also depends on other settings
-# such as SOURCE_BROWSER.
-
-LATEX_SOURCE_CODE = NO
-
-#---------------------------------------------------------------------------
-# configuration options related to the RTF output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output
-# The RTF output is optimized for Word 97 and may not look very pretty with
-# other RTF readers or editors.
-
-GENERATE_RTF = NO
-
-# The RTF_OUTPUT tag is used to specify where the RTF docs will be put.
-# If a relative path is entered the value of OUTPUT_DIRECTORY will be
-# put in front of it. If left blank `rtf' will be used as the default path.
-
-RTF_OUTPUT = rtf
-
-# If the COMPACT_RTF tag is set to YES Doxygen generates more compact
-# RTF documents. This may be useful for small projects and may help to
-# save some trees in general.
-
-COMPACT_RTF = NO
-
-# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated
-# will contain hyperlink fields. The RTF file will
-# contain links (just like the HTML output) instead of page references.
-# This makes the output suitable for online browsing using WORD or other
-# programs which support those fields.
-# Note: wordpad (write) and others do not support links.
-
-RTF_HYPERLINKS = NO
-
-# Load stylesheet definitions from file. Syntax is similar to doxygen's
-# config file, i.e. a series of assignments. You only have to provide
-# replacements, missing definitions are set to their default value.
-
-RTF_STYLESHEET_FILE =
-
-# Set optional variables used in the generation of an rtf document.
-# Syntax is similar to doxygen's config file.
-
-RTF_EXTENSIONS_FILE =
-
-#---------------------------------------------------------------------------
-# configuration options related to the man page output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_MAN tag is set to YES (the default) Doxygen will
-# generate man pages
-
-GENERATE_MAN = YES
-
-# The MAN_OUTPUT tag is used to specify where the man pages will be put.
-# If a relative path is entered the value of OUTPUT_DIRECTORY will be
-# put in front of it. If left blank `man' will be used as the default path.
-
-MAN_OUTPUT = man
-
-# The MAN_EXTENSION tag determines the extension that is added to
-# the generated man pages (default is the subroutine's section .3)
-
-MAN_EXTENSION = .3
-
-# If the MAN_LINKS tag is set to YES and Doxygen generates man output,
-# then it will generate one additional man file for each entity
-# documented in the real man page(s). These additional files
-# only source the real man page, but without them the man command
-# would be unable to find the correct page. The default is NO.
-
-MAN_LINKS = YES
-
-#---------------------------------------------------------------------------
-# configuration options related to the XML output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_XML tag is set to YES Doxygen will
-# generate an XML file that captures the structure of
-# the code including all documentation.
-
-GENERATE_XML = YES
-
-# The XML_OUTPUT tag is used to specify where the XML pages will be put.
-# If a relative path is entered the value of OUTPUT_DIRECTORY will be
-# put in front of it. If left blank `xml' will be used as the default path.
-
-XML_OUTPUT = xml
-
-# The XML_SCHEMA tag can be used to specify an XML schema,
-# which can be used by a validating XML parser to check the
-# syntax of the XML files.
-
-XML_SCHEMA =
-
-# The XML_DTD tag can be used to specify an XML DTD,
-# which can be used by a validating XML parser to check the
-# syntax of the XML files.
-
-XML_DTD =
-
-# If the XML_PROGRAMLISTING tag is set to YES Doxygen will
-# dump the program listings (including syntax highlighting
-# and cross-referencing information) to the XML output. Note that
-# enabling this will significantly increase the size of the XML output.
-
-XML_PROGRAMLISTING = YES
-
-#---------------------------------------------------------------------------
-# configuration options for the AutoGen Definitions output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will
-# generate an AutoGen Definitions (see autogen.sf.net) file
-# that captures the structure of the code including all
-# documentation. Note that this feature is still experimental
-# and incomplete at the moment.
-
-GENERATE_AUTOGEN_DEF = NO
-
-#---------------------------------------------------------------------------
-# configuration options related to the Perl module output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_PERLMOD tag is set to YES Doxygen will
-# generate a Perl module file that captures the structure of
-# the code including all documentation. Note that this
-# feature is still experimental and incomplete at the
-# moment.
-
-GENERATE_PERLMOD = NO
-
-# If the PERLMOD_LATEX tag is set to YES Doxygen will generate
-# the necessary Makefile rules, Perl scripts and LaTeX code to be able
-# to generate PDF and DVI output from the Perl module output.
-
-PERLMOD_LATEX = NO
-
-# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be
-# nicely formatted so it can be parsed by a human reader.
-# This is useful
-# if you want to understand what is going on.
-# On the other hand, if this
-# tag is set to NO the size of the Perl module output will be much smaller
-# and Perl will parse it just the same.
-
-PERLMOD_PRETTY = YES
-
-# The names of the make variables in the generated doxyrules.make file
-# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX.
-# This is useful so different doxyrules.make files included by the same
-# Makefile don't overwrite each other's variables.
-
-PERLMOD_MAKEVAR_PREFIX =
-
-#---------------------------------------------------------------------------
-# Configuration options related to the preprocessor
-#---------------------------------------------------------------------------
-
-# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will
-# evaluate all C-preprocessor directives found in the sources and include
-# files.
-
-ENABLE_PREPROCESSING = YES
-
-# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro
-# names in the source code. If set to NO (the default) only conditional
-# compilation will be performed. Macro expansion can be done in a controlled
-# way by setting EXPAND_ONLY_PREDEF to YES.
-
-MACRO_EXPANSION = NO
-
-# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES
-# then the macro expansion is limited to the macros specified with the
-# PREDEFINED and EXPAND_AS_DEFINED tags.
-
-EXPAND_ONLY_PREDEF = NO
-
-# If the SEARCH_INCLUDES tag is set to YES (the default) the include files
-# in the INCLUDE_PATH (see below) will be searched if a #include is found.
-
-SEARCH_INCLUDES = YES
-
-# The INCLUDE_PATH tag can be used to specify one or more directories that
-# contain include files that are not input files but should be processed by
-# the preprocessor.
-
-INCLUDE_PATH =
-
-# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
-# patterns (like *.h and *.hpp) to filter out the header-files in the
-# directories. If left blank, the patterns specified with FILE_PATTERNS will
-# be used.
-
-INCLUDE_FILE_PATTERNS =
-
-# The PREDEFINED tag can be used to specify one or more macro names that
-# are defined before the preprocessor is started (similar to the -D option of
-# gcc). The argument of the tag is a list of macros of the form: name
-# or name=definition (no spaces). If the definition and the = are
-# omitted =1 is assumed. To prevent a macro definition from being
-# undefined via #undef or recursively expanded use the := operator
-# instead of the = operator.
-
-PREDEFINED =
-
-# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then
-# this tag can be used to specify a list of macro names that should be expanded.
-# The macro definition that is found in the sources will be used.
-# Use the PREDEFINED tag if you want to use a different macro definition that overrules the definition found in the source code.
-
-EXPAND_AS_DEFINED =
-
-# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then
-# doxygen's preprocessor will remove all references to function-like macros
-# that are alone on a line, have an all uppercase name, and do not end with a
-# semicolon, because these will confuse the parser if not removed.
-
-SKIP_FUNCTION_MACROS = YES
-
-#---------------------------------------------------------------------------
-# Configuration::additions related to external references
-#---------------------------------------------------------------------------
-
-# The TAGFILES option can be used to specify one or more tagfiles.
-# Optionally an initial location of the external documentation
-# can be added for each tagfile. The format of a tag file without
-# this location is as follows:
-#
-# TAGFILES = file1 file2 ...
-# Adding location for the tag files is done as follows:
-#
-# TAGFILES = file1=loc1 "file2 = loc2" ...
-# where "loc1" and "loc2" can be relative or absolute paths or
-# URLs. If a location is present for each tag, the installdox tool
-# does not have to be run to correct the links.
-# Note that each tag file must have a unique name
-# (where the name does NOT include the path)
-# If a tag file is not located in the directory in which doxygen
-# is run, you must also specify the path to the tagfile here.
-
-TAGFILES =
-
-# When a file name is specified after GENERATE_TAGFILE, doxygen will create
-# a tag file that is based on the input files it reads.
-
-GENERATE_TAGFILE =
-
-# If the ALLEXTERNALS tag is set to YES all external classes will be listed
-# in the class index. If set to NO only the inherited external classes
-# will be listed.
-
-ALLEXTERNALS = NO
-
-# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed
-# in the modules index. If set to NO, only the current project's groups will
-# be listed.
-
-EXTERNAL_GROUPS = YES
-
-# The PERL_PATH should be the absolute path and name of the perl script
-# interpreter (i.e. the result of `which perl').
-
-PERL_PATH = /usr/bin/perl
-
-#---------------------------------------------------------------------------
-# Configuration options related to the dot tool
-#---------------------------------------------------------------------------
-
-# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will
-# generate an inheritance diagram (in HTML, RTF and LaTeX) for classes with base
-# or super classes. Setting the tag to NO turns the diagrams off. Note that
-# this option also works with HAVE_DOT disabled, but it is recommended to
-# install and use dot, since it yields more powerful graphs.
-
-CLASS_DIAGRAMS = YES
-
-# You can define message sequence charts within doxygen comments using the \msc
-# command. Doxygen will then run the mscgen tool (see
-# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the
-# documentation. The MSCGEN_PATH tag allows you to specify the directory where
-# the mscgen tool resides. If left empty the tool is assumed to be found in the
-# default search path.
-
-MSCGEN_PATH =
-
-# If set to YES, the inheritance and collaboration graphs will hide
-# inheritance and usage relations if the target is undocumented
-# or is not a class.
-
-HIDE_UNDOC_RELATIONS = YES
-
-# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
-# available from the path. This tool is part of Graphviz, a graph visualization
-# toolkit from AT&T and Lucent Bell Labs. The other options in this section
-# have no effect if this option is set to NO (the default)
-
-HAVE_DOT = NO
-
-# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is
-# allowed to run in parallel. When set to 0 (the default) doxygen will
-# base this on the number of processors available in the system. You can set it
-# explicitly to a value larger than 0 to get control over the balance
-# between CPU load and processing speed.
-
-DOT_NUM_THREADS = 0
-
-# By default doxygen will write a font called Helvetica to the output
-# directory and reference it in all dot files that doxygen generates.
-# When you want a differently looking font you can specify the font name
-# using DOT_FONTNAME. You need to make sure dot is able to find the font,
-# which can be done by putting it in a standard location or by setting the
-# DOTFONTPATH environment variable or by setting DOT_FONTPATH to the directory
-# containing the font.
-
-DOT_FONTNAME = Helvetica
-
-# The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs.
-# The default size is 10pt.
-
-DOT_FONTSIZE = 10
-
-# By default doxygen will tell dot to use the output directory to look for the
-# FreeSans.ttf font (which doxygen will put there itself). If you specify a
-# different font using DOT_FONTNAME you can set the path where dot
-# can find it using this tag.
-
-DOT_FONTPATH =
-
-# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen
-# will generate a graph for each documented class showing the direct and
-# indirect inheritance relations. Setting this tag to YES will force the
-# CLASS_DIAGRAMS tag to NO.
-
-CLASS_GRAPH = YES
-
-# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen
-# will generate a graph for each documented class showing the direct and
-# indirect implementation dependencies (inheritance, containment, and
-# class references variables) of the class with other documented classes.
-
-COLLABORATION_GRAPH = YES
-
-# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen
-# will generate a graph for groups, showing the direct groups dependencies
-
-GROUP_GRAPHS = YES
-
-# If the UML_LOOK tag is set to YES doxygen will generate inheritance and
-# collaboration diagrams in a style similar to the OMG's Unified Modeling
-# Language.
-
-UML_LOOK = NO
-
-# If set to YES, the inheritance and collaboration graphs will show the
-# relations between templates and their instances.
-
-TEMPLATE_RELATIONS = NO
-
-# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT
-# tags are set to YES then doxygen will generate a graph for each documented
-# file showing the direct and indirect include dependencies of the file with
-# other documented files.
-
-INCLUDE_GRAPH = YES
-
-# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and
-# HAVE_DOT tags are set to YES then doxygen will generate a graph for each
-# documented header file showing the documented files that directly or
-# indirectly include this file.
-
-INCLUDED_BY_GRAPH = YES
-
-# If the CALL_GRAPH and HAVE_DOT options are set to YES then
-# doxygen will generate a call dependency graph for every global function
-# or class method. Note that enabling this option will significantly increase
-# the time of a run. So in most cases it will be better to enable call graphs
-# for selected functions only using the \callgraph command.
-
-CALL_GRAPH = NO
-
-# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then
-# doxygen will generate a caller dependency graph for every global function
-# or class method. Note that enabling this option will significantly increase
-# the time of a run. So in most cases it will be better to enable caller
-# graphs for selected functions only using the \callergraph command.
-
-CALLER_GRAPH = NO
-
-# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen
-# will generate a graphical hierarchy of all classes instead of a textual one.
-
-GRAPHICAL_HIERARCHY = YES
-
-# If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES
-# then doxygen will show the dependencies a directory has on other directories
-# in a graphical way. The dependency relations are determined by the #include
-# relations between the files in the directories.
-
-DIRECTORY_GRAPH = YES
-
-# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
-# generated by dot. Possible values are png, svg or gif.
-# If left blank png will be used.
-
-DOT_IMAGE_FORMAT = png
-
-# The tag DOT_PATH can be used to specify the path where the dot tool can be
-# found. If left blank, it is assumed the dot tool can be found in the path.
-
-DOT_PATH =
-
-# The DOTFILE_DIRS tag can be used to specify one or more directories that
-# contain dot files that are included in the documentation (see the
-# \dotfile command).
-
-DOTFILE_DIRS =
-
-# The MSCFILE_DIRS tag can be used to specify one or more directories that
-# contain msc files that are included in the documentation (see the
-# \mscfile command).
-
-MSCFILE_DIRS =
-
-# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of
-# nodes that will be shown in the graph. If the number of nodes in a graph
-# becomes larger than this value, doxygen will truncate the graph, which is
-# visualized by representing a node as a red box. Note that if the
-# number of direct children of the root node in a graph is already larger than
-# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note
-# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
-
-DOT_GRAPH_MAX_NODES = 50
-
-# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the
-# graphs generated by dot. A depth value of 3 means that only nodes reachable
-# from the root by following a path via at most 3 edges will be shown. Nodes
-# that lay further from the root node will be omitted. Note that setting this
-# option to 1 or 2 may greatly reduce the computation time needed for large
-# code bases. Also note that the size of a graph can be further restricted by
-# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
-
-MAX_DOT_GRAPH_DEPTH = 0
-
-# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
-# background. This is disabled by default, because dot on Windows does not
-# seem to support this out of the box. Warning: Depending on the platform used,
-# enabling this option may lead to badly anti-aliased labels on the edges of
-# a graph (i.e. they become hard to read).
-
-DOT_TRANSPARENT = NO
-
-# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
-# files in one run (i.e. multiple -o and -T options on the command line). This
-# makes dot run faster, but since only newer versions of dot (>1.8.10)
-# support this, this feature is disabled by default.
-
-DOT_MULTI_TARGETS = YES
-
-# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will
-# generate a legend page explaining the meaning of the various boxes and
-# arrows in the dot generated graphs.
-
-GENERATE_LEGEND = YES
-
-# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will
-# remove the intermediate dot files that are used to generate
-# the various graphs.
-
-DOT_CLEANUP = YES
+++ /dev/null
-#!/bin/sh
-#
-# Copyright 2011-12 ARM Limited
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-#
-# NE10 Library : getlog.sh
-#
-
-echo "NE10 NIGHTLY BUILD SCRIPT"
-echo "(C) 2011, ARM Ltd."
-date
-
-echo
-echo
-echo -e "\033[4mSYSTEM:\033[0m"
-uname -a
-cat /proc/cpuinfo
-
-echo
-echo
-echo -e "\033[4mINSTALLED TOOLS:\033[0m"
-echo "git:"
-if [ "`which git`" = "" ]; then
- echo "fatal: 'git' is not installed on this system" 1>&2
- exit 1
-fi
-git --version | paste -s -d ';' -
-echo
-echo "gcc:"
-if [ "`which gcc`" = "" ]; then
- echo "fatal: 'gcc' is not installed on this system" 1>&2
- exit 1
-fi
-gcc --version | paste -s -d ';' -
-echo
-echo "as:"
-if [ "`which as`" = "" ]; then
- echo "fatal: 'as' is not installed on this system" 1>&2
- exit 1
-fi
-as --version | paste -s -d ';' -
-echo
-echo "ar:"
-if [ "`which ar`" = "" ]; then
- echo "fatal: 'ar' is not installed on this system" 1>&2
- exit 1
-fi
-ar --version | paste -s -d ';' -
-echo
-echo
-echo "perl:"
-if [ "`which perl`" = "" ]; then
- echo "fatal: 'perl' is not installed on this system" 1>&2
- exit 1
-fi
-perl --version | paste -s -d ';' -
-
-echo
-if [ -e .git ]; then
- echo
- echo -e "\033[4mCURRENT 'git' CONFIGURATION:\033[0m"
- git config -l
-fi
-
-echo
-echo
-echo -e "\033[4mCURRENT USER AND PATH:\033[0m"
-echo `whoami` "@" `pwd`
-
-echo
-echo
-echo -e "\033[4mENVIRONMENT VARIABLES:\033[0m"
-echo
-echo "PATH = " $PATH
-echo
-echo "LD_LIBRARY_PATH = " $LD_LIBRARY_PATH
-
-
-echo
-if [ -e .git ]; then
-echo
-echo -e "\033[4mCURRENT GIT/SOURCE STATUS:\033[0m"
- git show
-fi
-
-
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : headers/NE10_random.h
- */
-
-
-#ifndef NE10_RANDOM
-#define NE10_RANDOM
-
-#include <assert.h>
-#include <float.h>
-#include <math.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-// Please look at http://en.wikipedia.org/wiki/Linear_congruential_generator
-// According to this page, these values are the ones used in "glibc"
-
- //uint32_t _M = 4294967296L; // 2^32 // modulus, must be 0 < _M
- const uint32_t _A = 1103515245L; // a, must be 0 < _A < _M
- const uint32_t _C = 12345L; // c, must be 0 < _C < _M
- // uint32_t m_X_NM1 = 0; // X(n-1), at first this value is the seed or the start value
-
-// State of one linear congruential generator. Separate instances can be created
-// so that each one carries its own seed and its own position in the sequence.
-typedef struct
-{
- // these are used as internal values, please do not change them directly
- uint32_t _private_m_A ;// = 1103515245L; // a, must be 0 < _A < _M
- uint32_t _private_m_C ;// = 12345L; // c, must be 0 < _C < _M
- uint32_t _private_m_X_NM1 ;// = 0; // X(n-1), at first this value is the seed or the start value
-} NE10_rng_t;
-
-// State of a float generator: three independent integer generators.
-typedef struct
-{
- // these are used as internal values, please do not change them directly
- // the functions in this header use [0] for the fraction bits, [1] for the
- // exponent bits, and [2] for the sign bit
- NE10_rng_t _private_m_rngs[3];
-} NE10_float_rng_t;
-
-
-// generic functions
-// Prepares a generator instance: stores the seed as the initial state X(n-1)
-// and installs the shared multiplier (_A) and increment (_C).
-void NE10_rng_init_g(NE10_rng_t *rng, uint32_t seed)
-{
-    assert( NULL != rng );
-
-    rng->_private_m_X_NM1 = seed;
-    rng->_private_m_C     = _C;
-    rng->_private_m_A     = _A;
-}
-
-// Advances the generator by one step and returns the new state.
-uint32_t NE10_rng_next_g(NE10_rng_t *rng)
-{
-    uint32_t next_state;
-
-    assert( NULL != rng );
-
-    // Linear Congruential Generator step; the modulus of 2^32 is implied by
-    // the wrap-around of 32-bit unsigned arithmetic
-    next_state = rng->_private_m_A * rng->_private_m_X_NM1 + rng->_private_m_C;
-    rng->_private_m_X_NM1 = next_state;
-
-    return next_state;
-}
-
-// Returns the largest value the integer generator can produce.
-// The rng argument is not read.
-const uint32_t NE10_rng_max_g(NE10_rng_t *rng)
-{
- return 0xffffffff; // this is 2^32 - 1
-}
-
-
-
-// the same functions using a rng which is shared across the library
-static NE10_rng_t __NE10_rng; // used as the global random number generator shared across the library
-
-// Seeds the file-level generator __NE10_rng.
-void NE10_rng_init(uint32_t seed)
-{
- NE10_rng_init_g( &__NE10_rng, seed );
-}
-
-// Returns the next value from the file-level generator __NE10_rng.
-uint32_t NE10_rng_next()
-{
- return NE10_rng_next_g( &__NE10_rng );
-}
-
-// Returns the upper bound reported by NE10_rng_max_g; NULL is passed because
-// that function does not read its argument.
-const uint32_t NE10_rng_max()
-{
- return NE10_rng_max_g(NULL);
-}
-
-
-
-// a random number generator that generates IEEE 754 float numbers
-
-// NAN_OR_INF has all eight bits of the exponent field (bits 23..30) set; a
-// pattern whose exponent field equals it is treated as a NAN or an INF.
-// The macro argument is parenthesized so that compound expressions such as
-// IS_NAN_OR_INF(a | b) are evaluated as a whole.
-#define NAN_OR_INF (0xFF << 23)
-#define IS_NAN_OR_INF(x) ( ( ((x) & NAN_OR_INF) == NAN_OR_INF ) ? 1 : 0 )
-
-// EXPONENT_MASK keeps the sign and fraction bits and clears the exponent field;
-// a pattern left unchanged by the mask has an all-zero exponent field.
-#define EXPONENT_MASK 0x807FFFFF
-#define IS_SUBNORMAL(x) ( ( ((x) & EXPONENT_MASK) == (x) ) ? 1 : 0 )
-
-// Seeds the three internal generators of a float generator. A temporary
-// generator is seeded with `seed`, and its first three outputs become, in
-// order, the seeds of slots [0], [1] and [2].
-void NE10_float_rng_init_g(NE10_float_rng_t* float_rng, uint32_t seed)
-{
-    // slot [0] feeds the fraction, [1] the exponent, and [2] the sign bit
-    NE10_rng_t seeder;
-    unsigned int slot;
-
-    NE10_rng_init_g( &seeder, seed );
-
-    for ( slot = 0; slot < 3; slot++ )
-    {
-        NE10_rng_init_g( &float_rng->_private_m_rngs[slot], NE10_rng_next_g( &seeder ) );
-    }
-}
-
-// Builds a random float from three integer draws (fraction, exponent, sign)
-// and retries until the bit pattern passes both the IS_NAN_OR_INF and the
-// IS_SUBNORMAL filters.
-float NE10_float_rng_next_g(NE10_float_rng_t* float_rng)
-{
-    uint32_t bits;
-    float value;
-
-    for (;;)
-    {
-        // one draw per field, always in the order fraction, exponent, sign
-        const uint32_t raw_frac = NE10_rng_next_g( &float_rng->_private_m_rngs[0] );
-        const uint32_t raw_expo = NE10_rng_next_g( &float_rng->_private_m_rngs[1] );
-        const uint32_t raw_sign = NE10_rng_next_g( &float_rng->_private_m_rngs[2] );
-
-        // take the top bits of each draw ( the sign uses the 17th bit )
-        const uint32_t frac = ( raw_frac >> 9 ) & 0x7FFFFF;              // 23 bits
-        const uint32_t expo = ( ( raw_expo >> 24 ) & 0x0000FF ) << 23;   //  8 bits
-        const uint32_t sign = ( ( raw_sign >> 16 ) & 0x000001 ) << 31;
-
-        // assemble the final bit pattern
-        bits = frac | expo | sign;
-
-        if ( !IS_NAN_OR_INF(bits) && !IS_SUBNORMAL(bits) )
-        {
-            break;
-        }
-    }
-
-    // reinterpret the 32-bit pattern as a float
-    memcpy( &value, &bits, 1*sizeof(float) );
-    return value;
-}
-
-// Returns FLT_MAX as the upper bound of the float generator.
-// The float_rng argument is not read.
-float NE10_float_rng_max_g(NE10_float_rng_t* float_rng)
-{
- return FLT_MAX;
-}
-
-
-// the same functions using a float_rng which is shared across the library
-
-static NE10_float_rng_t __NE10_float_rng; // local array for internal use only
-
-// Seeds the file-level float generator __NE10_float_rng.
-void NE10_float_rng_init(uint32_t seed)
-{
- NE10_float_rng_init_g( &__NE10_float_rng , seed );
-}
-
-// Returns the next float from the file-level float generator __NE10_float_rng.
-float NE10_float_rng_next()
-{
- return NE10_float_rng_next_g( &__NE10_float_rng );
-}
-
-// Returns the bound reported by NE10_float_rng_max_g; NULL is passed because
-// that function does not read its argument.
-float NE10_float_rng_max()
-{
- return NE10_float_rng_max_g(NULL);
-}
-
-// the same as above functions except the range of values are limited
-
-#define IS_TOO_SMALL(f) ((fabs(f)<1.0e-6)?1:0)
-#define IS_TOO_BIG(f) ((fabs(f)>1.0e12)?1:0)
-
-static NE10_float_rng_t __NE10_float_rng_limit; // local array for internal use only
-
-// Seeds the file-level generator used by the range-limited float stream.
-void NE10_float_rng_limit_init(uint32_t seed)
-{
- NE10_float_rng_init_g( &__NE10_float_rng_limit , seed );
-}
-
-// Draws floats from __NE10_float_rng_limit until one is accepted by both the
-// IS_TOO_SMALL and the IS_TOO_BIG filters, and returns it.
-float NE10_float_rng_limit_next()
-{
-    float candidate;
-
-    for (;;)
-    {
-        candidate = NE10_float_rng_next_g( &__NE10_float_rng_limit );
-
-        // accept only magnitudes that are neither too small nor too big
-        if ( !IS_TOO_SMALL(candidate) && !IS_TOO_BIG(candidate) )
-        {
-            return candidate;
-        }
-    }
-}
-
-// Returns the bound reported by NE10_float_rng_max_g (FLT_MAX); note that this
-// is not the tighter bound applied by NE10_float_rng_limit_next.
-float NE10_float_rng_limit_max()
-{
- return NE10_float_rng_max_g(NULL);
-}
-
-// the same as above functions except the range of values are limited and all the values are greater than 1.0e-6
-
-#define IS_TOO_SMALL_GT1(f) ((fabs(f)<1.0e-6)?1:0)
-#define IS_TOO_BIG_GT1(f) ((fabs(f)>1.0e+3)?1:0)
-
-static NE10_float_rng_t __NE10_float_rng_limit_gt1; // local state for internal use only
-
-// Seeds the generator that backs the "limit_gt1" stream.
-// This now seeds the dedicated __NE10_float_rng_limit_gt1 state; previously it
-// re-seeded __NE10_float_rng_limit and so disturbed the "limit" stream.
-// NE10_float_rng_limit_gt1_init must be called before
-// NE10_float_rng_limit_gt1_next, because an unseeded state never yields an
-// acceptable value.
-void NE10_float_rng_limit_gt1_init(uint32_t seed)
-{
-    NE10_float_rng_init_g( &__NE10_float_rng_limit_gt1 , seed );
-}
-
-// Draws floats from the dedicated "limit_gt1" generator until one is accepted
-// by both the IS_TOO_SMALL_GT1 and the IS_TOO_BIG_GT1 filters.
-float NE10_float_rng_limit_gt1_next()
-{
-    float ret = 0.0f;
-
-    do
-    {
-        // use the gt1 state so this stream is independent of the "limit" stream
-        ret = NE10_float_rng_next_g( &__NE10_float_rng_limit_gt1 );
-    } while ( IS_TOO_SMALL_GT1(ret) || IS_TOO_BIG_GT1(ret) );
-
-    return ret;
-}
-
-// Returns the bound reported by NE10_float_rng_max_g (FLT_MAX); note that this
-// is not the tighter bound applied by NE10_float_rng_limit_gt1_next.
-float NE10_float_rng_limit_gt1_max()
-{
- return NE10_float_rng_max_g(NULL);
-}
-
-#endif // NE10_RANDOM
-
+++ /dev/null
-@
-@ Copyright 2011-12 ARM Limited
-@
-@ Licensed under the Apache License, Version 2.0 (the "License");
-@ you may not use this file except in compliance with the License.
-@ You may obtain a copy of the License at
-@
-@ http://www.apache.org/licenses/LICENSE-2.0
-@
-@ Unless required by applicable law or agreed to in writing, software
-@ distributed under the License is distributed on an "AS IS" BASIS,
-@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@ See the License for the specific language governing permissions and
-@ limitations under the License.
-@
-
-@
-@ NE10 Library : headers/NE10header.s
-@
-
-.include "headers/versionheader.s"
-
-@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-@ constant values that are used across the library
-@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- .equ NE10_OK, 0
- .equ NE10_ERR, -1
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : headers/factor.h
- */
-
-// Typebuilding MACROs
-// - Slight difference between toolchain versions on intrinsics
-#define FLOAT32_2x3(x1,y1,x2,y2,x3,y3) \
- {{ \
- {x1, y1}, {x2,y2}, {x3,y3} \
- }}
-
-// Unit tests use this macro to index into their function table.
-// "opc" stands for operation's code (which function),
-// and "imp" stands for implementation (which implementation of the function).
-// Both codes are 1-based. The arguments are parenthesized so that expressions
-// such as FTBL_IDX(a + 1, b) expand correctly.
-#define FTBL_IDX(opc, imp) ((((opc)-1)*IMPL_COUNT)+((imp)-1))
-
-// This macro helps measure the performance of the code passed to it through the "code" argument
-// It is used in the unit tests
-// The elapsed wall-clock time, in seconds, is stored into "res".
-// The macro declares nothing itself: it reads and writes the names "before",
-// "after", "lapsed" and "zone", which must already be in scope where it is used.
-#define MEASURE(res, code) \
- { \
- gettimeofday (&before, &zone); \
- code \
- gettimeofday (&after, &zone); \
- if (before.tv_usec > after.tv_usec) \
- { \
- after.tv_usec += 1000000; \
- after.tv_sec--; \
- } \
- lapsed.tv_usec = after.tv_usec - before.tv_usec; \
- lapsed.tv_sec = after.tv_sec - before.tv_sec; \
- res = lapsed.tv_sec + ((double)lapsed.tv_usec / 1000000.0); \
- }
-
-// There are several categories of functions that share common code:
-
-// Different groups of functions take different number of inputs
-//
-// Group 1 = Functions that take a dst, a src, and a cst ("DstSrcCst" for short)
-// Group 2 = Those that take a dst, an acc, a src, and a cst ("DstAccSrcCst" for short)
-// Group 3 = The ones that take a dst, and a cst only ("DstCst" for short)
-//
-// Group 4 = These take a dst, and two src inputs, src1 and src2 ("DstSrc1Src2")
-// Group 5 = These take a dst, an acc, and two src inputs ("DstAccSrc1Src2")
-// Group 6 = These take a dst, and a src ("DstSrc")
-//
-
-// The naming convention used in the following macros is as follows:
-// NE10_<A>_OPERATION_<T>_<I>
-// where
-// <A> Stands for the title of the operation (add, mul, etc) followed by its type (C = const as in addc).
-// The letter X - if used - means any such operation.
-// <T> Indicates the type of the operation (float, vec2, etc.)
-// The letter X - if used - means any type.
-// <I> This indicates the implementation (it can be C, ASM, or NEON).
-
-// A few macros to check pointers and their address range to make sure there's
-// no unwanted overlap between any two of them
-#define NE10_CHECKPOINTER_DstSrcCst_OPERATION \
- if ( dst < src ) \
- { assert ( dst + count <= src ); } \
- else if ( dst > src ) \
- { assert ( src + count <= dst ); }
-
-#define NE10_CHECKPOINTER_DstSrc_OPERATION NE10_CHECKPOINTER_DstSrcCst_OPERATION
-
-#define NE10_CHECKPOINTER_3POINTER_OPERATION(arg1, arg2, arg3) \
- if ( arg1 < arg2 ) \
- { assert ( arg1 + count <= arg2 ); } \
- else if ( arg1 > arg2 ) \
- { assert ( arg2 + count <= arg1 ); } \
- if ( arg1 < arg3 ) \
- { assert ( arg1 + count <= arg3 ); } \
- else if ( arg1 > arg3 ) \
- { assert ( arg3 + count <= arg1 ); } \
- if ( arg3 < arg2 ) \
- { assert ( arg3 + count <= arg2 ); } \
- else if ( arg3 > arg2 ) \
- { assert ( arg2 + count <= arg3 ); }
-
-#define NE10_CHECKPOINTER_4POINTER_OPERATION(arg1, arg2, arg3, arg4) \
- NE10_CHECKPOINTER_3POINTER_OPERATION(arg1, arg2, arg3) \
- if ( arg1 < arg4 ) \
- { assert ( arg1 + count <= arg4 ); } \
- else if ( arg1 > arg4 ) \
- { assert ( arg4 + count <= arg1 ); } \
- if ( arg2 < arg4 ) \
- { assert ( arg2 + count <= arg4 ); } \
- else if ( arg2 > arg4 ) \
- { assert ( arg4 + count <= arg2 ); } \
- if ( arg4 < arg3 ) \
- { assert ( arg4 + count <= arg3 ); } \
- else if ( arg4 > arg3 ) \
- { assert ( arg3 + count <= arg4 ); }
-
-
-
-#define NE10_CHECKPOINTER_DstAccSrcCst_OPERATION { \
- NE10_CHECKPOINTER_3POINTER_OPERATION(dst, acc, src); }
-
-#define NE10_CHECKPOINTER_DstCst_OPERATION {}
-
-#define NE10_CHECKPOINTER_DstSrc1Src2_OPERATION { \
- NE10_CHECKPOINTER_3POINTER_OPERATION(dst, src1, src2); }
-
-#define NE10_CHECKPOINTER_DstAccSrc1Src2_OPERATION { \
- NE10_CHECKPOINTER_4POINTER_OPERATION(dst, acc, src1, src2); }
-
-// These macros generalise implementation of the functions.
-
-// Macros used in C implementations
-#define NE10_TEMPLATE_XC_OPERATION_X_C(checkPointer, loopCode) { \
- arm_result_t res = NE10_OK; \
- unsigned int itr = 0; \
- checkPointer; \
- for ( itr = 0; itr < count; itr++ ) \
- { loopCode ; /* this loop iterates through each and every float item one at a time */ \
- } \
- return res; \
- }
-
-// macros used in the NEON implementations
-
-// Main Loop = The loop where the number of items to be processed is exactly the
-// number that we can process in a single iteration.
-//
-// Secondary Loop = The loop that follows a Main Loop to fill in the entries that
-// did not fit into the Main Loop. This is needed when the number of
-// input items is not a multiple of the number of items that we
-// process in every iteration of the Main Loop.
-
-
-/****************************************************
- * *
- * The "DstSrcCst" group of functions *
- * *
- ****************************************************/
-
-///// - FLOAT - /////
-
-#define NE10_DstSrcCst_MAINLOOP_FLOAT_NEON(loopCode) { \
- /* load 4 values */ \
- n_src = vld1q_f32( (float32_t*)src ); \
- src += 4; /* move to the next 4 float items; 4*float */ \
- loopCode; /* the actual operation is placed here... */ /* The main loop iterates through four float values each time */ \
- vst1q_f32 ( (float32_t*)dst , n_dst ); /* store the results back */ \
- dst += 4; /* move to the next items; 4*float */ \
- }
-
-#define NE10_DstSrcCst_SECONDLOOP_FLOAT_NEON(loopCode) { \
- float32x2_t n_tmp_src = { 0.0f , 0.0f }; /* temporary storage to be used with NEON load/store intrinsics */ \
- float32x2_t n_tmp_cst = { cst, cst }; /* temporary constant value for use in the main NEON operation */ \
- n_tmp_src = vld1_lane_f32 ( (float32_t*)src, n_tmp_src, 0); /* load into the first lane of d0 */ \
- loopCode; /* the actual operation is placed here ... */ /* exceptional cases where the count is not a multiple of 4 */ \
- vst1_lane_f32( (float32_t*)dst, n_tmp_src, 0); /* store the lane back into the memory */ \
- /* move to the next item in the stream */ \
- src++; \
- dst++; \
- }
-
-#define NE10_DstSrcCst_OPERATION_FLOAT_NEON(checkPointer, loopCode1, loopCode2) { \
- arm_result_t res = NE10_OK; \
- float32x4_t n_src; \
- float32x4_t n_dst; \
- checkPointer; \
- int dif = 0; \
- dif = count % 4; /* either 0 or one of 1,2,3; in the latter cases the second path is taken */ \
- for (; count > dif; count -= 4) { \
- loopCode1; \
- } \
- if ( 0 != dif ) { \
- unsigned int idx; \
- for ( idx = 0 ; idx < dif; idx++ ) { \
- loopCode2; \
- } \
- } \
- return res; \
- }
-
-///// - VEC2F - /////
-
-#define NE10_DstSrcCst_MAINLOOP_VEC2F_NEON(loopCode) { \
- n_src = vld1q_f32( (float32_t*)src ); /* load two vectors */ \
- src += 2; /* move to the next two vectors */ \
- loopCode; /* actual operation */ /* The main loop iterates through two 2D vectors each time */ \
- vst1q_f32 ( (float32_t*)dst , n_dst ); /* store back */ \
- dst += 2; /* move to the next 2 vectors */ \
- }
-
-#define NE10_DstSrcCst_SECONDLOOP_VEC2F_NEON(loopCode) { \
- float32x2_t n_tmp_src; \
- float32x2_t n_tmp_cst = { cst->x, cst->y }; \
- n_tmp_src = vld1_f32( (float32_t*)src ); \
- loopCode; /* exceptional cases where the count isn't a multiple of 2 */ \
- vst1_f32( (float32_t*)dst, n_tmp_src); \
- }
-
-#define NE10_DstSrcCst_OPERATION_VEC2F_NEON(checkPointer, loopCode1, loopCode2) { \
- arm_result_t res = NE10_OK; \
- float32x4_t n_cst = { cst->x, cst->y, cst->x, cst->y }; \
- float32x4_t n_src; \
- float32x4_t n_dst; \
- checkPointer; \
- int dif = count % 2; \
- for (; count > dif; count -= 2) { \
- loopCode1; \
- } \
- if ( 0 != dif ) { \
- loopCode2; \
- } \
- return res; \
- }
-
-///// - VEC3F - /////
-
-#define NE10_DstSrcCst_MAINLOOP_VEC3F_NEON(loopCode) { \
- n_src1 = vld1q_f32( (float32_t*)src ); \
- src = ((void*)src)+(4*sizeof(arm_float_t)); \
- n_src2 = vld1q_f32( (float32_t*)src ); \
- src = ((void*)src)+(4*sizeof(arm_float_t)); \
- n_src3 = vld1q_f32( (float32_t*)src ); \
- src = ((void*)src)+(4*sizeof(arm_float_t)); \
- loopCode; /* Each pass handles three q-registers, i.e. twelve floats; the OPERATION macro counts that as 4 items */ \
- vst1q_f32 ( (float32_t*)dst , n_dst1 ); \
- dst = ((void*)dst)+(4*sizeof(arm_float_t)); \
- vst1q_f32 ( (float32_t*)dst , n_dst2 ); \
- dst = ((void*)dst)+(4*sizeof(arm_float_t)); \
- vst1q_f32 ( (float32_t*)dst , n_dst3 ); \
- dst = ((void*)dst)+(4*sizeof(arm_float_t)); \
- }
-
-#define NE10_DstSrcCst_SECONDLOOP_VEC3F_NEON(loopCode) { \
- float32x2x3_t n_tmp_src = FLOAT32_2x3( \
- 0.0f, 0.0f, 0.0f , 0.0f, 0.0f , 0.0f); \
- float32x2x3_t n_tmp_cst = FLOAT32_2x3( \
- cst->x, 0, cst->y, 0, cst->z, 0); \
- n_tmp_src = vld3_lane_f32 ( (float32_t*)src, n_tmp_src, 0); \
- loopCode; /* leftover items when the count isn't a multiple of 4; one item per pass */ \
- vst3_lane_f32( (float32_t*)dst, n_tmp_src, 0); \
- src++; \
- dst++; \
- }
-
-#define NE10_DstSrcCst_OPERATION_VEC3F_NEON(checkPointer, loopCode1, loopCode2) { \
- arm_result_t res = NE10_OK; \
- float32x4_t n_cst1 = { cst->x, cst->y, cst->z, cst->x }; \
- float32x4_t n_cst2 = { cst->y, cst->z, cst->x, cst->y }; \
- float32x4_t n_cst3 = { cst->z, cst->x, cst->y, cst->z }; \
- float32x4_t n_src1, n_src2, n_src3; \
- float32x4_t n_dst1, n_dst2, n_dst3; \
- checkPointer; \
- int dif = count % 4; \
- for (; count > dif; count -= 4) { \
- loopCode1; \
- } \
- if ( 0 != dif ) { \
- unsigned int idx; \
- for ( idx = 0 ; idx < dif; idx++ ) { \
- loopCode2; \
- } \
- } \
- return res; \
- }
-
-///// - VEC4F - /////
-
-/* Note that for the VEC4* types, we do not need a second loop as the number
- of input items is always a multiple of four. */
-
-#define NE10_DstSrcCst_MAINLOOP_VEC4F_NEON(loopCode) { \
- n_src = vld1q_f32( (float32_t*)src ); \
- src ++; \
- loopCode; \
- vst1q_f32 ( (float32_t*)dst , n_dst ); /* The main loop iterates through one 4D vector each time */ \
- dst ++; \
- }
-
-#define NE10_DstSrcCst_OPERATION_VEC4F_NEON(checkPointer, loopCode) { \
- arm_result_t res = NE10_OK; \
- float32x4_t n_cst = { cst->x, cst->y, cst->z, cst->w }; \
- float32x4_t n_src; \
- float32x4_t n_dst; \
- checkPointer; \
- for (; count != 0; count --) { \
- loopCode; \
- } \
- return res; \
- }
-
-/****************************************************
- * *
- * The "DstAccSrcCst" group of functions *
- * *
- ****************************************************/
-
-///// - FLOAT - /////
-
-#define NE10_DstAccSrcCst_MAINLOOP_FLOAT_NEON(loopCode) { \
- /* load 4 accumulator values and 4 source values */ \
- n_acc = vld1q_f32( (float32_t*)acc ); \
- n_src = vld1q_f32( (float32_t*)src ); \
- acc += 4; /* move to the next 4 float items; 4*float */ \
- src += 4; \
- loopCode; /* the actual operation is placed here... */ /* The main loop iterates through four float values each time */ \
- vst1q_f32 ( (float32_t*)dst , n_dst ); /* store the results back */ \
- dst += 4; /* move to the next items; 4*float */ \
- }
-
-#define NE10_DstAccSrcCst_SECONDLOOP_FLOAT_NEON(loopCode) { \
- float32x2_t n_tmp_acc = { 0.0f , 0.0f }; /* temporary storage to be used with NEON load/store intrinsics */ \
- float32x2_t n_tmp_src = { 0.0f , 0.0f }; /* temporary storage to be used with NEON load/store intrinsics */ \
- float32x2_t n_tmp_cst = { cst, cst }; /* temporary constant value for use in the main NEON operation */ \
- n_tmp_acc = vld1_lane_f32 ( (float32_t*)acc, n_tmp_acc, 0); /* load one accumulator value into lane 0 */ \
- n_tmp_src = vld1_lane_f32 ( (float32_t*)src, n_tmp_src, 0); /* load one source value into lane 0 */ \
- loopCode; /* the actual operation is placed here ... */ /* exceptional cases where the count is not a multiple of 4 */ \
- vst1_lane_f32( (float32_t*)dst, n_tmp_src, 0); /* store lane 0 of n_tmp_src back into the memory */ \
- /* move to the next item in the stream */ \
- acc++; \
- src++; \
- dst++; \
- }
-
-#define NE10_DstAccSrcCst_OPERATION_FLOAT_NEON NE10_DstSrcCst_OPERATION_FLOAT_NEON
-
-///// - VEC2F - /////
-
-#define NE10_DstAccSrcCst_MAINLOOP_VEC2F_NEON(loopCode) { \
- n_acc = vld1q_f32( (float32_t*)acc ); /* load two vectors */ \
- n_src = vld1q_f32( (float32_t*)src ); /* load two vectors */ \
- acc += 2; /* move to the next two vectors */ \
- src += 2; \
- loopCode; /* actual operation */ /* The main loop iterates through two 2D vectors each time */ \
- vst1q_f32 ( (float32_t*)dst , n_dst ); /* store back */ \
- dst += 2; /* move to the next 2 vectors */ \
- }
-
-#define NE10_DstAccSrcCst_SECONDLOOP_VEC2F_NEON(loopCode) { \
- float32x2_t n_tmp_acc; \
- float32x2_t n_tmp_src; \
- float32x2_t n_tmp_cst = { cst->x, cst->y }; \
- n_tmp_acc = vld1_f32( (float32_t*)acc ); \
- n_tmp_src = vld1_f32( (float32_t*)src ); \
- loopCode; /* exceptional cases where the count isn't a multiple of 2 */ \
- vst1_f32( (float32_t*)dst, n_tmp_src); \
- }
-
-#define NE10_DstAccSrcCst_OPERATION_VEC2F_NEON NE10_DstSrcCst_OPERATION_VEC2F_NEON
-
-///// - VEC3F - /////
-
-#define NE10_DstAccSrcCst_MAINLOOP_VEC3F_NEON(loopCode) { \
- n_acc1 = vld1q_f32( (float32_t*)acc ); /* Load accumulator values */ \
- acc = ((void*)acc)+(4*sizeof(arm_float_t)); \
- n_acc2 = vld1q_f32( (float32_t*)acc ); \
- acc = ((void*)acc)+(4*sizeof(arm_float_t)); \
- n_acc3 = vld1q_f32( (float32_t*)acc ); \
- acc = ((void*)acc)+(4*sizeof(arm_float_t)); \
- n_src1 = vld1q_f32( (float32_t*)src ); /* Load source values */ \
- src = ((void*)src)+(4*sizeof(arm_float_t)); \
- n_src2 = vld1q_f32( (float32_t*)src ); \
- src = ((void*)src)+(4*sizeof(arm_float_t)); \
- n_src3 = vld1q_f32( (float32_t*)src ); \
- src = ((void*)src)+(4*sizeof(arm_float_t)); \
- loopCode; /* Each pass handles three q-registers per stream, i.e. twelve floats; the OPERATION macro counts that as 4 items */ \
- vst1q_f32 ( (float32_t*)dst , n_dst1 ); /* Store the results back into the memory */ \
- dst = ((void*)dst)+(4*sizeof(arm_float_t)); \
- vst1q_f32 ( (float32_t*)dst , n_dst2 ); \
- dst = ((void*)dst)+(4*sizeof(arm_float_t)); \
- vst1q_f32 ( (float32_t*)dst , n_dst3 ); \
- dst = ((void*)dst)+(4*sizeof(arm_float_t)); \
- }
-
-#define NE10_DstAccSrcCst_SECONDLOOP_VEC3F_NEON(loopCode) { \
- float32x2x3_t n_tmp_acc = FLOAT32_2x3( \
- 0.0f, 0.0f, \
- 0.0f, 0.0f, \
- 0.0f, 0.0f \
- ); \
- float32x2x3_t n_tmp_src = FLOAT32_2x3( \
- 0.0f, 0.0f, \
- 0.0f, 0.0f, \
- 0.0f, 0.0f \
- ); \
- float32x2x3_t n_tmp_cst = FLOAT32_2x3( \
- cst->x, 0, \
- cst->y, 0, \
- cst->z, 0 \
- ); \
- n_tmp_acc = vld3_lane_f32 ( (float32_t*)acc, n_tmp_acc, 0); \
- n_tmp_src = vld3_lane_f32 ( (float32_t*)src, n_tmp_src, 0); \
- loopCode; /* leftover items when the count isn't a multiple of 4; one item per pass */ \
- vst3_lane_f32( (float32_t*)dst, n_tmp_src, 0); \
- acc++; \
- src++; \
- dst++; \
- }
-
-#define NE10_DstAccSrcCst_OPERATION_VEC3F_NEON NE10_DstSrcCst_OPERATION_VEC3F_NEON
-
-///// - VEC4F - /////
-
-#define NE10_DstAccSrcCst_MAINLOOP_VEC4F_NEON(loopCode) { \
- n_acc = vld1q_f32( (float32_t*)acc ); \
- n_src = vld1q_f32( (float32_t*)src ); \
- acc ++; \
- src ++; \
- loopCode; \
- vst1q_f32 ( (float32_t*)dst , n_dst ); /* The main loop iterates through one 4D vector each time */ \
- dst ++; \
- }
-
-#define NE10_DstAccSrcCst_OPERATION_VEC4F_NEON NE10_DstSrcCst_OPERATION_VEC4F_NEON
-
-/****************************************************
- * *
- * The "DstCst" group of functions *
- * *
- ****************************************************/
-
-///// - FLOAT - /////
-
-#define NE10_DstCst_MAINLOOP_FLOAT_NEON(loopCode) { \
- /* this group has no input stream, so nothing is loaded here */ \
- loopCode; /* the actual operation is placed here... */ /* The main loop iterates through four float values each time */ \
- vst1q_f32 ( (float32_t*)dst , n_cst ); /* store the results back */ \
- dst += 4; /* move to the next items; 4*float */ \
- }
-
-#define NE10_DstCst_SECONDLOOP_FLOAT_NEON(loopCode) { \
- float32x2_t n_tmp_cst = { cst, cst }; /* temporary constant value for use in the main NEON operation */ \
- loopCode; /* the actual operation is placed here ... */ /* exceptional cases where the count is not a multiple of 4 */ \
- vst1_lane_f32( (float32_t*)dst, n_tmp_cst, 0); /* store the lane back into the memory */ \
- /* move to the next item in the stream */ \
- dst++; \
- }
-
-#define NE10_DstCst_OPERATION_FLOAT_NEON(checkPointer, loopCode1, loopCode2) { \
- arm_result_t res = NE10_OK; \
- checkPointer; \
- int dif = 0; \
- dif = count % 4; /* either 0 or one of 1,2,3; in the latter cases the second path is taken */ \
- for (; count > dif; count -= 4) { \
- loopCode1; \
- } \
- if ( 0 != dif ) { \
- unsigned int idx; \
- for ( idx = 0 ; idx < dif; idx++ ) { \
- loopCode2; \
- } \
- } \
- return res; \
- }
-
-///// - VEC2F - /////
-
-
-#define NE10_DstCst_MAINLOOP_VEC2F_NEON(loopCode) { \
- loopCode; /* actual operation */ /* The main loop iterates through two 2D vectors each time */ \
- vst1q_f32 ( (float32_t*)dst , n_cst ); /* store back */ \
- dst += 2; /* move to the next 2 vectors */ \
- }
-
-#define NE10_DstCst_SECONDLOOP_VEC2F_NEON(loopCode) { \
- float32x2_t n_tmp_cst = { cst->x, cst->y }; \
- loopCode; /* exceptional cases where the count isn't a multiple of 2 */ \
- vst1_f32( (float32_t*)dst, n_tmp_cst); \
- }
-
-#define NE10_DstCst_OPERATION_VEC2F_NEON(checkPointer, loopCode1, loopCode2) { \
- arm_result_t res = NE10_OK; \
- float32x4_t n_cst = { cst->x, cst->y, cst->x, cst->y }; \
- checkPointer; \
- int dif = count % 2; \
- for (; count > dif; count -= 2) { \
- loopCode1; \
- } \
- if ( 0 != dif ) { \
- loopCode2; \
- } \
- return res; \
- }
-
-///// - VEC3F - /////
-
-#define NE10_DstCst_MAINLOOP_VEC3F_NEON(loopCode) { \
- loopCode; /* Each pass stores three q-registers, i.e. twelve floats; the OPERATION macro counts that as 4 items */ \
- vst1q_f32 ( (float32_t*)dst , n_cst1 ); \
- dst = ((void*)dst)+(4*sizeof(arm_float_t)); \
- vst1q_f32 ( (float32_t*)dst , n_cst2 ); \
- dst = ((void*)dst)+(4*sizeof(arm_float_t)); \
- vst1q_f32 ( (float32_t*)dst , n_cst3 ); \
- dst = ((void*)dst)+(4*sizeof(arm_float_t)); \
- }
-
-#define NE10_DstCst_SECONDLOOP_VEC3F_NEON(loopCode) { \
- float32x2x3_t n_tmp_cst = FLOAT32_2x3( \
- cst->x, 0, \
- cst->y, 0, \
- cst->z, 0 \
- ); \
- loopCode; /* leftover items when the count isn't a multiple of 4; one item per pass */ \
- vst3_lane_f32( (float32_t*)dst, n_tmp_cst, 0); \
- dst++; \
- }
-
-#define NE10_DstCst_OPERATION_VEC3F_NEON(checkPointer, loopCode1, loopCode2) { \
- arm_result_t res = NE10_OK; \
- float32x4_t n_cst1 = { cst->x, cst->y, cst->z, cst->x }; \
- float32x4_t n_cst2 = { cst->y, cst->z, cst->x, cst->y }; \
- float32x4_t n_cst3 = { cst->z, cst->x, cst->y, cst->z }; \
- checkPointer; \
- int dif = count % 4; \
- for (; count > dif; count -= 4) { \
- loopCode1; \
- } \
- if ( 0 != dif ) { \
- unsigned int idx; \
- for ( idx = 0 ; idx < dif; idx++ ) { \
- loopCode2; \
- } \
- } \
- return res; \
- }
-
-///// - VEC4F - /////
-
-#define NE10_DstCst_MAINLOOP_VEC4F_NEON(loopCode) { \
- loopCode; \
- vst1q_f32 ( (float32_t*)dst , n_cst ); /* The main loop iterates through one 4D vector each time */ \
- dst ++; \
- }
-
-#define NE10_DstCst_OPERATION_VEC4F_NEON(checkPointer, loopCode) { \
- arm_result_t res = NE10_OK; \
- float32x4_t n_cst = { cst->x, cst->y, cst->z, cst->w }; \
- checkPointer; \
- for (; count != 0; count --) { \
- loopCode; \
- } \
- return res; \
- }
-
-/****************************************************
- * *
- * The "DstSrc1Src2" group of functions *
- * *
- ****************************************************/
-
-///// - FLOAT - /////
-
-#define NE10_DstSrc1Src2_MAINLOOP_FLOAT_NEON(loopCode) { \
- /* load 4 values */ \
- n_src = vld1q_f32( (float32_t*)src1 ); \
- src1 += 4; /* move to the next 4 float items; 4*float */ \
- n_src2 = vld1q_f32( (float32_t*)src2 ); \
- src2 += 4; /* move to the next 4 float items; 4*float */ \
- loopCode; /* the actual operation is placed here... */ /* The main loop iterates through four float values each time */ \
- vst1q_f32 ( (float32_t*)dst , n_dst ); /* store the results back */ \
- dst += 4; /* move to the next items; 4*float */ \
- }
-
-/* Handles one leftover float when the count is not a multiple of 4: loads a   */
-/* single value from each of src1 and src2 into lane 0 of its own temporary,   */
-/* runs loopCode, stores lane 0 of n_tmp_src to dst and advances all pointers. */
-#define NE10_DstSrc1Src2_SECONDLOOP_FLOAT_NEON(loopCode) { \
- float32x2_t n_tmp_src = { 0.0f , 0.0f }; /* temporary storage to be used with NEON load/store intrinsics */ \
- float32x2_t n_tmp_src2 = { 0.0f , 0.0f }; \
- n_tmp_src = vld1_lane_f32 ( (float32_t*)src1, n_tmp_src, 0); /* load one src1 value into lane 0 */ \
- n_tmp_src2 = vld1_lane_f32 ( (float32_t*)src2, n_tmp_src2, 0); /* load one src2 value into lane 0 of its own vector */ \
- loopCode; /* the actual operation is placed here ... */ /* exceptional cases where the count is not a multiple of 4 */ \
- vst1_lane_f32( (float32_t*)dst, n_tmp_src, 0); /* store the lane back into the memory */ \
- /* move to the next item in the stream */ \
- src1++; \
- src2++; \
- dst++; \
- }
-
-/* The "DstSrc1Src2" float driver reuses the "DstSrcCst" float driver template unchanged. */
-#define NE10_DstSrc1Src2_OPERATION_FLOAT_NEON NE10_DstSrcCst_OPERATION_FLOAT_NEON
-
-/****************************************************
- * *
- * The "DstAccSrc1Src2" group of functions *
- * *
- ****************************************************/
-
-///// - FLOAT - /////
-
-/*
- * Main step of the "DstAccSrc1Src2" float template: loads four floats each from
- * `acc`, `src1` and `src2` into `n_acc`, `n_src` and `n_src2`, advances the
- * three input pointers, runs `loopCode`, then stores `n_dst` to `dst` and
- * advances `dst` by four floats.
- */
-#define NE10_DstAccSrc1Src2_MAINLOOP_FLOAT_NEON(loopCode) { \
- /* load 4 values */ \
- n_acc = vld1q_f32( (float32_t*)acc ); \
- n_src = vld1q_f32( (float32_t*)src1 ); \
- n_src2 = vld1q_f32( (float32_t*)src2 ); \
- acc += 4; /* move to the next 4 float items; 4*float */ \
- src1 += 4; \
- src2 += 4; \
- loopCode; /* the actual operation is placed here... */ /* The main loop iterates through four float values each time */ \
- vst1q_f32 ( (float32_t*)dst , n_dst ); /* store the results back */ \
- dst += 4; /* move to the next items; 4*float */ \
- }
-
-/*
- * Tail step of the "DstAccSrc1Src2" float template: handles ONE float per
- * expansion. One value from each of `acc`, `src1` and `src2` is loaded into
- * lane 0 of its own temporary register (lane 1 stays 0.0f). After `loopCode`
- * runs, lane 0 of `n_tmp_src` is what gets stored to `dst`, so `loopCode` is
- * expected to leave its result there. All four pointers then advance by one.
- */
-#define NE10_DstAccSrc1Src2_SECONDLOOP_FLOAT_NEON(loopCode) { \
- float32x2_t n_tmp_acc = { 0.0f , 0.0f }; /* temporary storage to be used with NEON load/store intrinsics */ \
- float32x2_t n_tmp_src = { 0.0f , 0.0f }; \
- float32x2_t n_tmp_src2 = { 0.0f, 0.0f }; \
- n_tmp_acc = vld1_lane_f32 ( (float32_t*)acc, n_tmp_acc, 0); /* load into the first lane of d0 */ \
- n_tmp_src = vld1_lane_f32 ( (float32_t*)src1, n_tmp_src, 0); /* load into the first lane of d1 */ \
- n_tmp_src2 = vld1_lane_f32 ( (float32_t*)src2, n_tmp_src2, 0); /* load into the first lane of d2 */ \
- loopCode; /* the actual operation is placed here ... */ /* exceptional cases where the count is not a multiple of 4 */ \
- vst1_lane_f32( (float32_t*)dst, n_tmp_src, 0); /* store the lane back into the memory */ \
- /* move to the next item in the stream */ \
- acc++; \
- src1++; \
- src2++; \
- dst++; \
- }
-
-/* The "DstAccSrc1Src2" float driver reuses the "DstAccSrcCst" float driver template unchanged. */
-#define NE10_DstAccSrc1Src2_OPERATION_FLOAT_NEON NE10_DstAccSrcCst_OPERATION_FLOAT_NEON
-
-/****************************************************
- * *
- * The "DstSrc" group of functions *
- * *
- ****************************************************/
-
-///// - FLOAT - /////
-
-/* For plain floats the "DstSrc" group has no templates of its own: the main
-   loop, the tail loop and the driver are all aliases of the "DstSrcCst" ones. */
-#define NE10_DstSrc_MAINLOOP_FLOAT_NEON NE10_DstSrcCst_MAINLOOP_FLOAT_NEON
-
-#define NE10_DstSrc_SECONDLOOP_FLOAT_NEON NE10_DstSrcCst_SECONDLOOP_FLOAT_NEON
-
-#define NE10_DstSrc_OPERATION_FLOAT_NEON NE10_DstSrcCst_OPERATION_FLOAT_NEON
-
-///// - VEC2F - /////
-
-/*
- * Main step of the "DstSrc" 2D-vector template: loads two 2D vectors from `src`
- * into `n_src` with vld2_f32, advances `src` by two elements, then runs
- * `loopCode`. Unlike the float templates, nothing is stored here: `loopCode`
- * must write the result and advance `dst` itself.
- */
-#define NE10_DstSrc_MAINLOOP_VEC2F_NEON(loopCode) { \
- n_src = vld2_f32( (float32_t*)src ); /* load two vectors */ \
- src += 2; /* move to the next two vectors */ \
- loopCode; /* actual operation */ /* The main loop iterates through two 2D vectors each time */ \
- /* store the results and increment the destination pointer within the loopCode */ \
- }
-
-/*
- * Tail step of the "DstSrc" 2D-vector template: a bare wrapper around
- * `loopCode`, which must do its own load, store and pointer updates.
- */
-#define NE10_DstSrc_SECONDLOOP_VEC2F_NEON(loopCode) { \
- loopCode; /* exceptional cases where the count isn't a multiple of 2 */ \
- /* store the results within the loopCode */ \
- }
-
-/*
- * Function-body template for "DstSrc" operations on 2D float vectors.
- * Declares the `n_src`/`n_dst` registers used by the loop macros, runs
- * `checkPointer`, then executes `loopCode1` while more than `count % 2` items
- * remain (two items per pass) and `loopCode2` once if one item is left over.
- * The expansion ends with `return res`.
- */
-#define NE10_DstSrc_OPERATION_VEC2F_NEON(checkPointer, loopCode1, loopCode2) { \
- arm_result_t res = NE10_OK; \
- float32x2x2_t n_src; \
- float32x2_t n_dst; \
- checkPointer; \
- int dif = count % 2; \
- for (; count > dif; count -= 2) { \
- loopCode1; \
- } \
- if ( 0 != dif ) { \
- loopCode2; \
- } \
- return res; \
- }
-
-///// - VEC3F - /////
-
-/*
- * Main step of the "DstSrc" 3D-vector template: loads four 3D vectors (twelve
- * floats) from `src` into `n_src` with vld3q_f32, advances `src` by
- * 12 * sizeof(arm_float_t) bytes, then runs `loopCode`, which must store the
- * result and advance `dst` itself.
- * NOTE(review): the pointer bump does arithmetic on a `void*`, which is a GNU C
- * extension rather than ISO C.
- */
-#define NE10_DstSrc_MAINLOOP_VEC3F_NEON(loopCode) { \
- n_src = vld3q_f32( (float32_t*)src ); \
- src = ((void*)src)+(12*sizeof(arm_float_t)); \
- loopCode; /* The main loop iterates through four 3D vectors each time */ \
- /* store the results and increment the destination pointer within the loopCode */ \
- }
-
-/*
- * Tail step of the "DstSrc" 3D-vector template: a bare wrapper around
- * `loopCode`, which must do its own load, store and pointer updates.
- */
-#define NE10_DstSrc_SECONDLOOP_VEC3F_NEON(loopCode) { \
- loopCode; /* exceptional cases where the count isn't a multiple of 4 */ \
- /* store the results within the loopCode */ \
- }
-
-/*
- * Function-body template for "DstSrc" operations on 3D float vectors.
- * Declares the `n_src`/`n_dst` registers used by the loop macros, runs
- * `checkPointer`, then executes `loopCode1` while more than `count % 4` items
- * remain (four items per pass) and `loopCode2` once for each of the
- * `count % 4` leftover items. The expansion ends with `return res`.
- */
-#define NE10_DstSrc_OPERATION_VEC3F_NEON(checkPointer, loopCode1, loopCode2) { \
- arm_result_t res = NE10_OK; \
- float32x4x3_t n_src; \
- float32x4_t n_dst; \
- checkPointer; \
- int dif = count % 4; \
- for (; count > dif; count -= 4) { \
- loopCode1; \
- } \
- if ( 0 != dif ) { \
- unsigned int idx; \
- for ( idx = 0 ; idx < dif; idx++ ) { \
- loopCode2; \
- } \
- } \
- return res; \
- }
-
-///// - VEC4F - /////
-
-/* Note that for the VEC4* types, we do not need a second loop as the number
- of input items is always a multiple of four. */
-
-/*
- * Per-item step of the "DstSrc" 4D-vector template: loads four floats from
- * `src` into `n_src`, advances `src` by one element, then runs `loopCode`,
- * which must store the result and advance `dst` itself.
- */
-#define NE10_DstSrc_MAINLOOP_VEC4F_NEON(loopCode) { \
- n_src = vld1q_f32( (float32_t*)src ); \
- src ++; \
- loopCode; \
- /* store the results and increment the destination pointer within the loopCode */ \
- }
-
-/*
- * Function-body template for "DstSrc" operations on 4D float vectors.
- * Declares `n_src`, runs `checkPointer`, then executes `loopCode` once per item
- * until `count` reaches zero and returns `res`.
- */
-#define NE10_DstSrc_OPERATION_VEC4F_NEON(checkPointer, loopCode) { \
- arm_result_t res = NE10_OK; \
- float32x4_t n_src; \
- checkPointer; \
- for (; count != 0; count --) { \
- loopCode; \
- } \
- return res; \
- }
-
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : headers/macros.h
- */
-
-#include "factor.h"
-
-// Macros used in actual implementations
-
-///// The "DstSrcCst" group of functions - FLOAT /////
-
-/* Plain-C driver: expands NE10_TEMPLATE_XC_OPERATION_X_C with
-   NE10_CHECKPOINTER_DstSrcCst_OPERATION (both defined elsewhere) and the caller's loop body. */
-#define NE10_XC_OPERATION_X_C(loopCode) { \
- NE10_TEMPLATE_XC_OPERATION_X_C( \
- NE10_CHECKPOINTER_DstSrcCst_OPERATION; , \
- loopCode); \
- }
-
-/* NEON float driver: broadcasts the scalar `cst` into all four lanes of `n_cst`,
-   then expands the DstSrcCst float template with the main-loop and tail-loop bodies. */
-#define NE10_XC_OPERATION_FLOAT_NEON(loopCode1, loopCode2) { \
- float32x4_t n_cst = { cst, cst, cst, cst }; \
- NE10_DstSrcCst_OPERATION_FLOAT_NEON( \
- NE10_CHECKPOINTER_DstSrcCst_OPERATION; , \
- NE10_DstSrcCst_MAINLOOP_FLOAT_NEON(loopCode1); , \
- NE10_DstSrcCst_SECONDLOOP_FLOAT_NEON(loopCode2); \
- ); \
- }
-
-/* NEON 2D-vector driver: expands the DstSrcCst VEC2F template with the main-loop and tail-loop bodies. */
-#define NE10_XC_OPERATION_VEC2F_NEON(loopCode1, loopCode2) { \
- NE10_DstSrcCst_OPERATION_VEC2F_NEON( \
- NE10_CHECKPOINTER_DstSrcCst_OPERATION; , \
- NE10_DstSrcCst_MAINLOOP_VEC2F_NEON(loopCode1); , \
- NE10_DstSrcCst_SECONDLOOP_VEC2F_NEON(loopCode2); \
- ); \
- }
-
-/* This macro uses interleaving to boost the performance */
-#define NE10_XC_OPERATION_VEC3F_NEON(loopCode1, loopCode2) { \
- NE10_DstSrcCst_OPERATION_VEC3F_NEON( \
- NE10_CHECKPOINTER_DstSrcCst_OPERATION; , \
- NE10_DstSrcCst_MAINLOOP_VEC3F_NEON(loopCode1); , \
- NE10_DstSrcCst_SECONDLOOP_VEC3F_NEON(loopCode2); \
- ); \
- }
-
-/* NEON 4D-vector driver: a single loop body, no tail loop. */
-#define NE10_XC_OPERATION_VEC4F_NEON(loopCode) { \
- NE10_DstSrcCst_OPERATION_VEC4F_NEON( \
- NE10_CHECKPOINTER_DstSrcCst_OPERATION; , \
- NE10_DstSrcCst_MAINLOOP_VEC4F_NEON(loopCode); \
- ); \
- }
-
-///// The "DstAccSrcCst" group of functions - FLOAT //////
-
-/* Plain-C driver for the accumulate-with-constant group: same C template as the
-   XC driver, but with NE10_CHECKPOINTER_DstAccSrcCst_OPERATION as the check. */
-#define NE10_MLAC_OPERATION_X_C(loopCode) { \
- NE10_TEMPLATE_XC_OPERATION_X_C( \
- NE10_CHECKPOINTER_DstAccSrcCst_OPERATION; , \
- loopCode); \
- }
-
-/* NEON float driver: declares the accumulator register `n_acc`, broadcasts the
-   scalar `cst` into `n_cst`, then expands the DstAccSrcCst float template. */
-#define NE10_MLAC_OPERATION_FLOAT_NEON(loopCode1, loopCode2) { \
- float32x4_t n_acc; \
- float32x4_t n_cst = { cst, cst, cst, cst }; \
- NE10_DstAccSrcCst_OPERATION_FLOAT_NEON( \
- NE10_CHECKPOINTER_DstAccSrcCst_OPERATION; , \
- NE10_DstAccSrcCst_MAINLOOP_FLOAT_NEON(loopCode1); , \
- NE10_DstAccSrcCst_SECONDLOOP_FLOAT_NEON(loopCode2); \
- ); \
- }
-
-/* NEON 2D-vector driver: declares `n_acc`, then expands the DstAccSrcCst VEC2F template. */
-#define NE10_MLAC_OPERATION_VEC2F_NEON(loopCode1, loopCode2) { \
- float32x4_t n_acc; \
- NE10_DstAccSrcCst_OPERATION_VEC2F_NEON( \
- NE10_CHECKPOINTER_DstAccSrcCst_OPERATION; , \
- NE10_DstAccSrcCst_MAINLOOP_VEC2F_NEON(loopCode1); , \
- NE10_DstAccSrcCst_SECONDLOOP_VEC2F_NEON(loopCode2); \
- ); \
- }
-
-/* NEON 3D-vector driver: declares three accumulator registers, then expands the DstAccSrcCst VEC3F template. */
-#define NE10_MLAC_OPERATION_VEC3F_NEON(loopCode1, loopCode2) { \
- float32x4_t n_acc1, n_acc2, n_acc3; \
- NE10_DstAccSrcCst_OPERATION_VEC3F_NEON( \
- NE10_CHECKPOINTER_DstAccSrcCst_OPERATION; , \
- NE10_DstAccSrcCst_MAINLOOP_VEC3F_NEON(loopCode1); , \
- NE10_DstAccSrcCst_SECONDLOOP_VEC3F_NEON(loopCode2); \
- ); \
- }
-
-/* NEON 4D-vector driver: declares `n_acc`; a single loop body, no tail loop. */
-#define NE10_MLAC_OPERATION_VEC4F_NEON(loopCode) { \
- float32x4_t n_acc; \
- NE10_DstAccSrcCst_OPERATION_VEC4F_NEON( \
- NE10_CHECKPOINTER_DstAccSrcCst_OPERATION; , \
- NE10_DstAccSrcCst_MAINLOOP_VEC4F_NEON(loopCode); \
- ); \
- }
-
-///// The "DstCst" group of functions - FLOAT /////
-
-/* Plain-C driver for the set-to-constant group: same C template as the XC
-   driver, but with NE10_CHECKPOINTER_DstCst_OPERATION as the check. */
-#define NE10_SETC_OPERATION_X_C(loopCode) { \
- NE10_TEMPLATE_XC_OPERATION_X_C( \
- NE10_CHECKPOINTER_DstCst_OPERATION; , \
- loopCode); \
- }
-
-/* NEON float driver: broadcasts the scalar `cst` into all four lanes of `n_cst`,
-   then expands the DstCst float template. */
-#define NE10_SETC_OPERATION_FLOAT_NEON(loopCode1, loopCode2) { \
- float32x4_t n_cst = { cst, cst, cst, cst }; \
- NE10_DstCst_OPERATION_FLOAT_NEON( \
- NE10_CHECKPOINTER_DstCst_OPERATION; , \
- NE10_DstCst_MAINLOOP_FLOAT_NEON(loopCode1); , \
- NE10_DstCst_SECONDLOOP_FLOAT_NEON(loopCode2); \
- ); \
- }
-
-/* NEON 2D-vector driver: expands the DstCst VEC2F template with the main-loop and tail-loop bodies. */
-#define NE10_SETC_OPERATION_VEC2F_NEON(loopCode1, loopCode2) { \
- NE10_DstCst_OPERATION_VEC2F_NEON( \
- NE10_CHECKPOINTER_DstCst_OPERATION; , \
- NE10_DstCst_MAINLOOP_VEC2F_NEON(loopCode1); , \
- NE10_DstCst_SECONDLOOP_VEC2F_NEON(loopCode2); \
- ); \
- }
-
-/* This macro uses interleaving to boost the performance */
-#define NE10_SETC_OPERATION_VEC3F_NEON(loopCode1, loopCode2) { \
- NE10_DstCst_OPERATION_VEC3F_NEON( \
- NE10_CHECKPOINTER_DstCst_OPERATION; , \
- NE10_DstCst_MAINLOOP_VEC3F_NEON(loopCode1); , \
- NE10_DstCst_SECONDLOOP_VEC3F_NEON(loopCode2); \
- ); \
- }
-
-/* NEON 4D-vector driver: a single loop body, no tail loop. */
-#define NE10_SETC_OPERATION_VEC4F_NEON(loopCode) { \
- NE10_DstCst_OPERATION_VEC4F_NEON( \
- NE10_CHECKPOINTER_DstCst_OPERATION; , \
- NE10_DstCst_MAINLOOP_VEC4F_NEON(loopCode); \
- ); \
- }
-
-///// The "DstSrc1Src2" group of functions //////
-
-/* Plain-C driver for two-source operations: same C template as the XC driver,
-   but with NE10_CHECKPOINTER_DstSrc1Src2_OPERATION as the check. */
-#define NE10_X_OPERATION_FLOAT_C(loopCode) { \
- NE10_TEMPLATE_XC_OPERATION_X_C( \
- NE10_CHECKPOINTER_DstSrc1Src2_OPERATION; , \
- loopCode); \
- }
-
-/* NEON float driver for two-source operations: declares the second-operand
-   register `n_src2`, then expands the DstSrc1Src2 float template. */
-#define NE10_X_OPERATION_FLOAT_NEON(loopCode1, loopCode2) { \
- float32x4_t n_src2; \
- NE10_DstSrc1Src2_OPERATION_FLOAT_NEON( \
- NE10_CHECKPOINTER_DstSrc1Src2_OPERATION; , \
- NE10_DstSrc1Src2_MAINLOOP_FLOAT_NEON(loopCode1); , \
- NE10_DstSrc1Src2_SECONDLOOP_FLOAT_NEON(loopCode2); \
- ); \
- }
-
-/* The plain-C DOT driver is the same two-source C driver. */
-#define NE10_DOT_OPERATION_X_C NE10_X_OPERATION_FLOAT_C
-
-///// The "DstSrc" group of functions //////
-
-/* Plain-C driver for single-source operations: same C template as the XC
-   driver, but with NE10_CHECKPOINTER_DstSrc_OPERATION as the check.
-   NOTE(review): unlike the other drivers, the check is passed without a
-   trailing ';' here - confirm this is what the template expects. */
-#define NE10_ABS_OPERATION_X_C(loopCode) { \
- NE10_TEMPLATE_XC_OPERATION_X_C( \
- NE10_CHECKPOINTER_DstSrc_OPERATION, \
- loopCode); \
- }
-
-/* The float flavour of the plain-C ABS driver is the generic one. */
-#define NE10_ABS_OPERATION_FLOAT_C NE10_ABS_OPERATION_X_C
-
-/* NEON float driver: defines a local `cst` of 0.0f, broadcasts it into `n_cst`
-   so the loop bodies can compare against zero, then expands the DstSrc float template. */
-#define NE10_ABS_OPERATION_FLOAT_NEON(loopCode1, loopCode2) { \
- arm_float_t cst = 0.0f; /* this is used to compare the values against. */ \
- float32x4_t n_cst = { cst, cst, cst, cst }; \
- NE10_DstSrc_OPERATION_FLOAT_NEON( \
- NE10_CHECKPOINTER_DstSrc_OPERATION; , \
- NE10_DstSrc_MAINLOOP_FLOAT_NEON(loopCode1); , \
- NE10_DstSrc_SECONDLOOP_FLOAT_NEON(loopCode2); \
- ); \
- }
-
-/* The plain-C LEN and CMATVEC drivers reuse the single-source ABS driver.
-   (NE10_LEN_OPERATION_X_C used to be defined twice in a row; one definition is enough.) */
-#define NE10_LEN_OPERATION_X_C NE10_ABS_OPERATION_X_C
-
-#define NE10_CMATVEC_OPERATION_X_C NE10_ABS_OPERATION_X_C
-
-/* NEON 2D-vector LEN driver: expands the DstSrc VEC2F template.
-   NOTE(review): the check passed here (and in the VEC3F/VEC4F drivers below) is
-   the DstSrcCst one although the operation belongs to the DstSrc group, and it
-   is passed without a trailing ';' - confirm both are intentional. */
-#define NE10_LEN_OPERATION_VEC2F_NEON(loopCode1, loopCode2) { \
- NE10_DstSrc_OPERATION_VEC2F_NEON( \
- NE10_CHECKPOINTER_DstSrcCst_OPERATION, \
- NE10_DstSrc_MAINLOOP_VEC2F_NEON(loopCode1), \
- NE10_DstSrc_SECONDLOOP_VEC2F_NEON(loopCode2) \
- ); \
- }
-
-/* NEON 3D-vector LEN driver: expands the DstSrc VEC3F template. */
-#define NE10_LEN_OPERATION_VEC3F_NEON(loopCode1, loopCode2) { \
- NE10_DstSrc_OPERATION_VEC3F_NEON( \
- NE10_CHECKPOINTER_DstSrcCst_OPERATION, \
- NE10_DstSrc_MAINLOOP_VEC3F_NEON(loopCode1), \
- NE10_DstSrc_SECONDLOOP_VEC3F_NEON(loopCode2) \
- ); \
- }
-
-/* NEON 4D-vector LEN driver: a single loop body, no tail loop. */
-#define NE10_LEN_OPERATION_VEC4F_NEON(loopCode) { \
- NE10_DstSrc_OPERATION_VEC4F_NEON( \
- NE10_CHECKPOINTER_DstSrcCst_OPERATION, \
- NE10_DstSrc_MAINLOOP_VEC4F_NEON(loopCode) \
- ); \
- }
-
-/* The plain-C DETMAT driver reuses the single-source ABS driver. */
-#define NE10_DETMAT_OPERATION_X_C NE10_ABS_OPERATION_X_C
-
-///// The "DstAccSrc1Src2" group of functions //////
-
-/* NEON float driver for accumulate operations with two sources: declares the
-   accumulator register `n_acc` and the second-operand register `n_src2`, then
-   expands the DstAccSrc1Src2 float template with the main-loop and tail-loop bodies. */
-#define NE10_MLA_OPERATION_FLOAT_NEON(loopCode1, loopCode2) { \
- float32x4_t n_acc; \
- float32x4_t n_src2; \
- NE10_DstAccSrc1Src2_OPERATION_FLOAT_NEON( \
- NE10_CHECKPOINTER_DstAccSrc1Src2_OPERATION; , \
- NE10_DstAccSrc1Src2_MAINLOOP_FLOAT_NEON(loopCode1); , \
- NE10_DstAccSrc1Src2_SECONDLOOP_FLOAT_NEON(loopCode2); \
- ); \
- }
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : headers/unit_test_abs_operation_x.h
- */
-
-#include "./unit_test_common.h"
-
-// Signature shared by every function tested through this header: (dst, src, count).
-typedef arm_result_t (*arm_func_3args_t)(void * dst, void * src, unsigned int count);
-// Function table with one slot per (operation, implementation) pair; indexed via FTBL_IDX.
-arm_func_3args_t ftbl[ OP_COUNT * IMPL_COUNT ];
-
-
-// this function is implemented in the unit test source files
-// it is meant to initialise the function table defined above.
-extern void init_ftbl();
-
-
-unsigned int i = 0; // loop iterator
-unsigned int max = 0; // number of iterations in each function
-int opcode = -1; // the operation which will be tested (a single unit can have any number of operations/functions)
-int impl = -1; // selects which particular implementation of the chosen operation must run
-int mute = 0; // 0 == print output; 1 == do not print anything;
-
-// timing state; presumably used by the MEASURE macro, which is defined elsewhere
-struct timeval before, after, lapsed, dummy;
-double dt_test_overhead = 0.0;
-double dt_test_sample = 0.0;
-double elapsed = 0.0;
-struct timezone zone;
-
-// there is a max of "4" components in a vec
-#define MAX_VEC_COMPONENTS 4
-
-// raw allocations, including the guard bytes at both ends
-arm_float_t * guarded_src = NULL;
-arm_float_t * guarded_dst[IMPL_COUNT];
-
-// usable payload areas inside the guarded allocations
-arm_float_t * thesrc = NULL;
-arm_float_t * thedst[IMPL_COUNT]; // output from different implementations are stored in separate arrays for verification
-int done_init = 0; // set to 1 once the guarded buffers have been allocated
-
-
-// Eight buffers that are used for special test cases such as when the destination and source point to the same address.
-// They may vary in size from one case to another and from one function to another.
-arm_float_t* esp_buf[8];
-
-/*
- * Exercises the implementation selected by the globals `opcode` and `impl`:
- *   1. on the first call only, allocates the guarded source/destination buffers
- *      and fills the source with random values;
- *   2. checks that running in place (dst == src) produces the same output as
- *      running into a separate buffer;
- *   3. performs one timed sample run and verifies the guard signatures of the
- *      buffers involved;
- *   4. calls the function with a zero count (must not crash);
- *   5. times `max` back-to-back calls and, unless `mute` is set, prints the
- *      elapsed time and the resulting throughput.
- *
- * Any failure terminates the process with exit( NE10_ERR ); otherwise NE10_OK
- * is returned.
- */
-arm_result_t test_operation()
-{
-    const unsigned int fixed_length = ARRLEN * sizeof(arm_float_t) * MAX_VEC_COMPONENTS;
-    const unsigned int guarded_length = (2*ARRAY_GUARD_LEN) + fixed_length; // payload plus one guard at each end
-
-    // initialize if not done so
-    if ( 0 == done_init )
-    {
-        guarded_src = (arm_float_t*) malloc( guarded_length );
-        if ( NULL == guarded_src )
-        {
-            fprintf ( stderr, "\t FATAL ERROR: Failed to allocate the source buffer. \n" );
-            exit( NE10_ERR );
-        }
-        GUARD_ARRAY( guarded_src, guarded_length );
-        // Skip the leading guard. char* is used because arithmetic on void* is a GNU extension,
-        // and ARRAY_GUARD_LEN keeps the offset in sync with what GUARD_ARRAY wrote.
-        thesrc = (arm_float_t*) ( (char*)guarded_src + ARRAY_GUARD_LEN );
-        FILL_FLOAT_ARRAY( thesrc, ARRLEN * MAX_VEC_COMPONENTS ); // random initialize
-
-        for ( i = 0; i<IMPL_COUNT; i++ )
-        {
-            guarded_dst[i] = (arm_float_t*) malloc( guarded_length );
-            if ( NULL == guarded_dst[i] )
-            {
-                fprintf ( stderr, "\t FATAL ERROR: Failed to allocate a destination buffer. \n" );
-                exit( NE10_ERR );
-            }
-            GUARD_ARRAY( guarded_dst[i], guarded_length );
-            thedst[i] = (arm_float_t*) ( (char*)guarded_dst[i] + ARRAY_GUARD_LEN );
-        }
-
-        done_init = 1;
-    }
-
-
-    // test the special case where dst == src
-    unsigned int tmp_len = 13; // Just an odd number bigger than 8
-    unsigned int inbytes = tmp_len * MAX_VEC_COMPONENTS * sizeof(arm_float_t);
-    esp_buf[0] = (arm_float_t*) malloc( inbytes ); // input 1
-    esp_buf[2] = (arm_float_t*) malloc( inbytes ); // copy of 1st input
-    esp_buf[4] = (arm_float_t*) malloc( inbytes ); // use this as the output buffer
-    if ( NULL == esp_buf[0] || NULL == esp_buf[2] || NULL == esp_buf[4] )
-    {
-        fprintf ( stderr, "\t FATAL ERROR: Failed to allocate the dst==src test buffers. \n" );
-        exit( NE10_ERR );
-    }
-
-    FILL_FLOAT_ARRAY_LIMIT( esp_buf[0], tmp_len * MAX_VEC_COMPONENTS ); // initialize the array with random numbers
-    memcpy( esp_buf[2], esp_buf[0], inbytes );
-
-    // first run works in place, second run reads the pristine copy and writes elsewhere
-    ftbl [ FTBL_IDX(opcode, impl) ] ( esp_buf[0] , esp_buf[0], tmp_len );
-    ftbl [ FTBL_IDX(opcode, impl) ] ( esp_buf[4] , esp_buf[2], tmp_len );
-
-    for ( i = 0; i < tmp_len * opcode; i++ ) // at this point the two outputs must be identical
-    {
-        if ( esp_buf[0][i] != esp_buf[4][i] )
-        {
-            fprintf ( stderr, "\t FATAL ERROR: Operation number %d implementation [%d] has failed the dst==src test case. \n", opcode, impl );
-            fprintf ( stderr, "\t NOTE: Usually implementation 1=C, 2=ASM/VFP, and 3=ASM/NEON. \n");
-            exit( NE10_ERR );
-        }
-    }
-
-    free(esp_buf[0]); free(esp_buf[2]); free(esp_buf[4]);
-
-    // sample run
-    MEASURE( dt_test_sample,
-        ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl-1 ] , thesrc, ARRLEN );
-    );
-    if ( ! CHECK_ARRAY_GUARD(guarded_dst[ impl -1 ], guarded_length) ||
-         ! CHECK_ARRAY_GUARD(guarded_src, guarded_length) )
-    {
-        fprintf ( stderr, "\t FATAL ERROR: Operation number %d implementation [%d] has failed the guard test. \n", opcode, impl );
-        exit( NE10_ERR );
-    }
-
-    // this test to make sure passing zero as the length won't cause segmentation faults
-    ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl-1 ] , thesrc, 0 );
-
-    MEASURE( elapsed,
-        for ( i = 0; i < max; i++ )
-        {
-            // call the function
-            ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl -1 ] , thesrc, ARRLEN );
-        }
-    );
-
-    if ( !mute )
-        printf( "%02.8f;%013.3f\n", elapsed - dt_test_overhead,
-                ( 1.0f * max * ARRLEN / ( elapsed - dt_test_overhead )) );
-
-    return NE10_OK;
-}
-
-/*
- * Command-line driver shared by the unit tests that include this header.
- *
- *   argc == 2 : argv[1] must parse to 0, in which case OP_COUNT (the number of
- *               operations in this unit) is returned; anything else exits.
- *   argc == 4 : argv[1] = operation number (1..OP_COUNT),
- *               argv[2] = implementation (0 = run every implementation and
- *                         compare their outputs, 1..IMPL_COUNT = time that one),
- *               argv[3] = number of timed iterations (must be positive).
- *
- * Invalid arguments, a missing implementation, or ACCEPTABLE_WARNS mismatches
- * terminate the process with exit( NE10_ERR ). Otherwise the guarded buffers
- * are released and NE10_OK is returned.
- */
-arm_result_t run_test( int argc, char **argv )
-{
-    if ( argc == 2 ) // requesting the number of available operations/routines in this unit
-    {
-        opcode = atoi ( argv[1] ); // get the command being requested, 0 = return the number of functions in this unit
-        if ( opcode == 0 ) return OP_COUNT;
-        exit( NE10_ERR );
-    } else if ( argc == 4 ) // requesting a particular implementation of one of the operations
-    {
-        opcode = atoi ( argv[1] );
-        if ( opcode <= 0 ) exit( NE10_ERR );
-        impl = atoi ( argv[2] );
-        if ( impl < 0 ) exit( NE10_ERR ); // impl == 0 means run all and compare the results to verify they produce identical outputs
-        // Validate as a signed value first: `max` is unsigned, so a negative
-        // argument would otherwise wrap around to a huge iteration count.
-        int iterations = atoi ( argv[3] );
-        if ( iterations <= 0 ) exit( NE10_ERR );
-        max = (unsigned int) iterations;
-    } else exit( NE10_ERR );
-
-    // initialize the table with NULL
-    memset( ftbl, 0, sizeof(ftbl));
-
-    // manually initialize the functions which have actual implementations
-    init_ftbl(); // this function is implemented in the unit test source file
-
-    if ( opcode <= 0 || opcode > OP_COUNT
-         || impl < 0 || impl > IMPL_COUNT )
-    {
-        fprintf ( stderr, "\t WARNING: Operation number %d and/or implementaion number %d are not acceptable values. \n", opcode, impl );
-        exit( NE10_ERR );
-    }
-
-    if ( impl == 0 ) // run all implementations and verify
-    {
-        // first, make sure all of the implementations do exist
-        for ( i = FTBL_IDX(opcode, 1); i <= FTBL_IDX(opcode, IMPL_COUNT); i++ )
-        {
-            if ( NULL == ftbl[i] )
-            {
-                fprintf ( stderr, "\t WARNING: One or more implementations of operation number %d were not found. \n", opcode );
-                exit( NE10_ERR );
-            }
-        }
-
-        // try all the implementations here..
-        mute = 1; // do not print anything
-
-        // opcode remains the same but we iterate through different implementations here..
-        for ( impl= 1; impl <= IMPL_COUNT; impl ++ )
-        {
-            test_operation();
-        }
-
-        // now verify
-        arm_float_t * _output = NULL; // holds one item per implementation (c, asm, neon...) for the element being compared
-        int warns = 0;
-        int item_width = opcode; // there's no easy way to guess the actual number of an item's components but using the opcode (1=float, 2=vec2, ...)
-        _output = (arm_float_t*) malloc( IMPL_COUNT * sizeof(arm_float_t) * item_width );
-        if ( NULL == _output )
-        {
-            fprintf ( stderr, "\t FATAL ERROR: Failed to allocate the verification buffer. \n" );
-            exit( NE10_ERR );
-        }
-        for ( i = 0; i < ARRLEN; i++ )
-        {
-            // gather item i from every implementation's output
-            for ( impl= 1; impl <= IMPL_COUNT; impl ++ )
-            {
-                memcpy ( &_output[ (impl-1) * item_width ], &thedst[ impl-1 ][ i * item_width ], sizeof(arm_float_t) * item_width );
-            }
-
-            int pos = 0;
-            for ( impl = 2; impl <= IMPL_COUNT; impl ++ ) // compare the output from the 2nd, 3rd, 4th, etc. to the first one so start at 2
-            {
-                for ( pos = 0; pos < item_width; pos++ ) // compare corresponding components of the items
-                {
-                    assert ( _output[ ((1-1)*item_width)+pos ] == _output[ ((1-1)*item_width)+pos ] ); // check for not-a-number
-                    assert ( _output[ ((impl-1)*item_width)+pos ] == _output[ ((impl-1)*item_width)+pos ] ); // check for not-a-number
-
-                    if ( ! EQUALS_FLOAT( _output[ ((1-1)*item_width)+pos ] , _output[ ((impl-1)*item_width)+pos ] , ERROR_MARGIN_SMALL ) )
-                    {
-                        fprintf( stderr, "\t\t WARNING: In opcode [%d], implementation [1] != implemenation [%d] on item [%d -> %d]\n",
-                                 opcode, impl, i, pos+1 );
-                        warns++;
-                    }
-
-                    // stop once ACCEPTABLE_WARNS mismatches have been reported
-                    if ( warns >= ACCEPTABLE_WARNS )
-                    {
-                        fprintf ( stderr, "\t WARNING: One or more mismatching values were found. \n" );
-                        exit( NE10_ERR );
-                    }
-                }
-            }
-        }
-        free( _output ); _output = (arm_float_t *) NULL;
-
-        // Defensive: the loop above already exits when the limit is reached.
-        if ( warns >= ACCEPTABLE_WARNS )
-        {
-            fprintf ( stderr, "\t WARNING: One or more mismatching values were found. \n" );
-            exit( NE10_ERR );
-        }
-        // fall through to the shared clean-up below instead of returning with the buffers still allocated
-    }
-    else // run a particular implementation
-    {
-        if ( !mute ) printf( "opcode=%d;impl=%d;%d;%d;", opcode, impl, ARRLEN, max );
-
-        // get the overhead of an empty loop with the same trip count
-        MEASURE( dt_test_overhead,
-            for ( i = 0 ; i < max; i++ )
-            {
-            }
-        );
-
-        test_operation();
-    }
-
-    // free any allocated memory and reset the lazy-init state so that a later
-    // call re-allocates instead of touching freed buffers
-    free( guarded_src );
-    guarded_src = NULL;
-    thesrc = NULL;
-    for ( i = 0; i<IMPL_COUNT; i++ )
-    {
-        free( guarded_dst[i] );
-        guarded_dst[i] = NULL;
-        thedst[i] = NULL;
-    }
-    done_init = 0;
-
-    return NE10_OK;
-}
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : headers/unit_test_common.h
- */
-
-#ifndef __UNIT_TEST_COMMON__
-#define __UNIT_TEST_COMMON__
-
-// Make sure the following values are defined before including this header file:
-// 1- length of the data arrays
-// #define ARRLEN
-// 2- number of the operations in a given unit
-// #define OP_COUNT
-// 3- number of the different implementations of each of the functions (C, ASM, NEON, ...)
-// #define IMPL_COUNT
-#ifndef ARRLEN
- #error Pelease define ARRLEN
-#endif
-#ifndef OP_COUNT
- #error Please define OP_COUNT
-#endif
-#ifndef IMPL_COUNT
- #error Please define IMPL_COUNT
-#endif
-
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <assert.h>
-#include <math.h>
-
-#include <sys/types.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/time.h>
-
-#include "../headers/macros.h"
-#include "NE10.h"
-#include "../headers/NE10_random.h"
-
-// length of the test data arrays
-// A number that is not divisible by 2 3 and 4 so that all the
-// execution paths are tested; The larger the number the more
-// number of random values are stored in the array and passed
-// into the array as the input stream.
-// 2^11 + 3 = 2051, it is not divisible by 2, 3, or 4
-#define TEST_ARRLEN 2051
-#define TEST_ARRLEN_MATRICES 1051
-
-// NAN_OR_INF is to check whether the value is a NAN or an INF
-// (0xFF << 23 == 0x7F800000: all eight exponent bits of a 32-bit float set)
-#define NAN_OR_INF (0xFF << 23)
-// ANDing with this mask clears the sign bit (bit 31) and keeps everything else
-#define SIGNBIT_MASK 0x7FFFFFFF
-// ANDing with this mask clears the exponent bits (23..30) and keeps sign and mantissa
-#define EXPONENT_MASK 0x807FFFFF
-
-// What's the acceptable error between the integer representations of two float values
-#define ERROR_MARGIN_SMALL 0x02
-#define ERROR_MARGIN_LARGE 0xFF
-
-// What's the acceptable number of warnings in a test
-#define ACCEPTABLE_WARNS 12
-#define ACCEPTABLE_WARNS_MATRICES 48
-
-// Fills the first `count` entries of `arr` with values drawn from the NE10
-// float generator, which is re-seeded from the wall clock on every call.
-inline void FILL_FLOAT_ARRAY( arm_float_t *arr, unsigned int count )
-{
-    unsigned int idx;
-
-    // presumably waits so that back-to-back calls get a different time(NULL) seed
-    sleep ( 1 );
-    NE10_float_rng_init( time(NULL) );
-
-    idx = 0;
-    while ( idx < count )
-    {
-        arr[idx] = NE10_float_rng_next();
-        ++idx;
-    }
-}
-
-// Fills the first `count` entries of `arr` with values drawn from the NE10
-// "limit" float generator, which is re-seeded from the wall clock on every call.
-inline void FILL_FLOAT_ARRAY_LIMIT( arm_float_t *arr, unsigned int count )
-{
-    unsigned int idx;
-
-    // presumably waits so that back-to-back calls get a different time(NULL) seed
-    sleep ( 1 );
-    NE10_float_rng_limit_init( time(NULL) );
-
-    idx = 0;
-    while ( idx < count )
-    {
-        arr[idx] = NE10_float_rng_limit_next();
-        ++idx;
-    }
-}
-
-// Fills the first `count` entries of `arr` with values drawn from the NE10
-// "limit_gt1" float generator, which is re-seeded from the wall clock on every call.
-inline void FILL_FLOAT_ARRAY_LIMIT_GT1( arm_float_t *arr, unsigned int count )
-{
-    unsigned int idx;
-
-    // presumably waits so that back-to-back calls get a different time(NULL) seed
-    sleep ( 1 );
-    NE10_float_rng_limit_gt1_init( time(NULL) );
-
-    idx = 0;
-    while ( idx < count )
-    {
-        arr[idx] = NE10_float_rng_limit_gt1_next();
-        ++idx;
-    }
-}
-
-// this function checks whether the difference between two float values is within the acceptable error range
-inline int EQUALS_FLOAT( float fa, float fb , unsigned int err )
-{
- union
- {
- int vi;
- float vf;
- } conv1, conv2;
-
- unsigned int ui1, ui2;
-
- if ( fa == fb ) return 1; // if identical, then return TRUE
-
- conv1.vf = fa;
- conv2.vf = fb;
-
- if ( (conv1.vi & NAN_OR_INF) == NAN_OR_INF )
- {
- fprintf( stderr, "HINT: The 1st floating-point value is either \'Not a number\' or \'Infinity\'. " );
- return 0; // INF or NAN, unacceptable return FALSE
- }
-
- if ( (conv2.vi & NAN_OR_INF) == NAN_OR_INF )
- {
- fprintf( stderr, "HINT: The 1st floating-point value is either \'Not a number\' or \'Infinity\'. " );
- return 0; // INF or NAN, unacceptable return FALSE
- }
-
- int cut1 = conv1.vi & SIGNBIT_MASK; // drop the sign bit - i.e. the left most bit
- int cut2 = conv2.vi & SIGNBIT_MASK;
-
- if ( (cut1 & EXPONENT_MASK) == cut1 ) { cut1 = 0; } // zero out subnormal float values
- if ( (cut2 & EXPONENT_MASK) == cut2 ) { cut2 = 0; } // zero out subnormal float values
-
- memcpy( &ui1, &fa, sizeof(arm_float_t) );
- memcpy( &ui2, &fb, sizeof(arm_float_t) );
-
- if ( abs( cut1 - cut2 ) > err ) // this is the log() of the actual error
- { // then we have an unacceptable error
-
- // report an unacceptable error
- fprintf( stderr, "HINT: %e (0x%04X) != %e (0x%04X) ", fa, ui1, fb, ui2 );
-
- return 0;
- }
-
- if ( fb*fa < 0.0f )
- {
-
- fprintf( stderr, "HINT: %e (0x%04X) is the opposite of %e (0x%04X) ", fa, ui1, fb, ui2 );
-
- return 0;
- }
-
- return 1; // acceptable, return TRUE
-}
-
-// 16-byte signature that GUARD_ARRAY writes at both ends of a buffer and that
-// CHECK_ARRAY_GUARD later looks for. ARRAY_GUARD_LEN must equal its size.
-char ARRAY_GUARD_SIG[] = { 0x66, 0xAB, 0xCD, 0xAB,
- 0xCD, 0xAB, 0xCD, 0xAB,
- 0xCD, 0xAB, 0xCD, 0xAB,
- 0xCD, 0xAB, 0xCD, 0x99 };
-#define ARRAY_GUARD_LEN 16
-
-// this function adds a ARRAY_GUARD_LEN byte signature to the begining and the end of an array, minimum acceptable size for the array is 2*ARRAY_GUARD_LEN bytes.
-// Stamps the guard signature onto the first and the last ARRAY_GUARD_LEN bytes
-// of `array`, where `array_length` is the total size of the buffer in bytes.
-// Returns 1 on success, or 0 (leaving the buffer untouched) when the buffer is
-// too small to hold both signatures.
-inline int GUARD_ARRAY( void* array, unsigned int array_length )
-{
-    char* head = (char*) array;
-
-    if ( array_length >= (2*ARRAY_GUARD_LEN) )
-    {
-        char* tail = &head[array_length-ARRAY_GUARD_LEN];
-
-        memcpy( head, ARRAY_GUARD_SIG, ARRAY_GUARD_LEN );
-        memcpy( tail, ARRAY_GUARD_SIG, ARRAY_GUARD_LEN );
-        return 1;
-    }
-
-    return 0;
-}
-
-// this function returns TRUE if the signature matches the guard and returns FALSE otherwise
-// Verifies that both guard signatures written by GUARD_ARRAY are still intact.
-//
-//   array        : start of the guarded buffer (i.e. of the leading signature)
-//   array_length : total size of the buffer in bytes, guards included
-//
-// Returns 1 (TRUE) when the leading and the trailing signature both match, and
-// 0 (FALSE) after printing an error to stderr otherwise.
-inline int CHECK_ARRAY_GUARD( void* array, unsigned int array_length )
-{
-    char* the_array = (char*) array;
-
-    // The signature is binary data, not a C string, so compare it with memcmp:
-    // strncmp stops at the first NUL byte and is only correct by accident here.
-    if ( memcmp(the_array, ARRAY_GUARD_SIG, ARRAY_GUARD_LEN) != 0 ) {
-        fprintf( stderr, " ERROR: Array guard signature is wrong. \n" );
-        return 0; // Match not found, return FALSE
-    }
-
-    if ( memcmp(&the_array[array_length-ARRAY_GUARD_LEN], ARRAY_GUARD_SIG, ARRAY_GUARD_LEN) != 0 ) {
-        fprintf( stderr, " ERROR: Array guard signature is wrong. \n" );
-        return 0; // Match not found, return FALSE
-    }
-
-    return 1;
-}
-
-#endif // __UNIT_TEST_COMMON
-
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : headers/unit_test_cross_operation_x.h
- */
-
-#include "./unit_test_common.h"
-
-// Signature shared by every function tested through this header: (dst, src1, src2, count).
-typedef arm_result_t (*arm_func_4args_t)(void * dst, void * src1, void * src2, unsigned int count);
-// Function table with one slot per (operation, implementation) pair; indexed via FTBL_IDX.
-arm_func_4args_t ftbl[ OP_COUNT * IMPL_COUNT ];
-
-
-// this function is implemented in the unit test source files
-// it is meant to initialise the function table defined above.
-extern void init_ftbl();
-
-
-unsigned int i = 0; // loop iterator
-unsigned int max = 0; // number of iterations in each function
-int opcode = -1; // the operation which will be tested (a single unit can have any number of operations/functions)
-int impl = -1; // selects which particular implementation of the chosen operation must run
-int mute = 0; // 0 == print output; 1 == do not print anything;
-
-// timing state; presumably used by the MEASURE macro, which is defined elsewhere
-struct timeval before, after, lapsed, dummy;
-double dt_test_overhead = 0.0;
-double dt_test_sample = 0.0;
-double elapsed = 0.0;
-struct timezone zone;
-
-// there is a max of "4" components in a vec
-#define MAX_VEC_COMPONENTS 4
-
-// raw allocations, including the guard bytes at both ends
-arm_float_t * guarded_src1 = NULL;
-arm_float_t * guarded_src2 = NULL;
-arm_float_t * guarded_dst[IMPL_COUNT];
-
-// usable payload areas inside the guarded allocations
-arm_float_t * thesrc1 = NULL;
-arm_float_t * thesrc2 = NULL;
-arm_float_t * thedst[IMPL_COUNT]; // output from different implementations are stored in separate arrays for verification
-int done_init = 0; // set to 1 once the guarded buffers have been allocated
-
-// Eight buffers that are used for special test cases such as when the destination and source point to the same address.
-// They may vary in size from one case to another and from one function to another.
-arm_float_t* esp_buf[8];
-
-arm_result_t test_operation()
-{
- const unsigned int fixed_length = ARRLEN * sizeof(arm_float_t) * MAX_VEC_COMPONENTS;
-
- // initialize if not done so
- if ( 0 == done_init )
- {
- guarded_src1 = (arm_float_t*) malloc( (2*ARRAY_GUARD_LEN) + fixed_length ); // 16 extra bytes at the begining and 16 extra bytes at the end
- GUARD_ARRAY( guarded_src1, (2*ARRAY_GUARD_LEN) + fixed_length );
- thesrc1 = (arm_float_t*) ( (void*)guarded_src1 + 16);
- FILL_FLOAT_ARRAY_LIMIT( thesrc1, ARRLEN * MAX_VEC_COMPONENTS ); // random initialization
-
- guarded_src2 = (arm_float_t*) malloc( (2*ARRAY_GUARD_LEN) + fixed_length ); // 16 extra bytes at the begining and 16 extra bytes at the end
- GUARD_ARRAY( guarded_src2, (2*ARRAY_GUARD_LEN) + fixed_length );
- thesrc2 = (arm_float_t*) ( (void*)guarded_src2 + 16);
- FILL_FLOAT_ARRAY_LIMIT( thesrc2, ARRLEN * MAX_VEC_COMPONENTS ); // random initialization
-
- for ( i = 0; i<IMPL_COUNT; i++ )
- {
- guarded_dst[i] = (arm_float_t*) malloc( (2*ARRAY_GUARD_LEN) + fixed_length ); // 16 extra bytes at the begining and 16 extra bytes at the end
- GUARD_ARRAY( guarded_dst[i], (2*ARRAY_GUARD_LEN) + fixed_length );
- thedst[i] = (arm_float_t*) ( (void*)guarded_dst[i] + 16);
- }
-
- done_init = 1;
- }
-
-
- // test the especial case where dst == src
- unsigned int tmp_len = 13; // Just an odd number bigger than 8
- unsigned int inbytes = tmp_len * MAX_VEC_COMPONENTS * sizeof(arm_float_t);
- esp_buf[0] = (arm_float_t*) malloc( inbytes ); // input 1
- esp_buf[1] = (arm_float_t*) malloc( inbytes ); // input 2
- esp_buf[2] = (arm_float_t*) malloc( inbytes ); // copy of 1st input
- esp_buf[3] = (arm_float_t*) malloc( inbytes ); // copy of 2nd input
- esp_buf[4] = (arm_float_t*) malloc( inbytes ); // use this as the output buffer
-
- FILL_FLOAT_ARRAY_LIMIT( esp_buf[0], tmp_len * MAX_VEC_COMPONENTS ); // initialization the array with random numbers
- FILL_FLOAT_ARRAY_LIMIT( esp_buf[1], tmp_len * MAX_VEC_COMPONENTS ); // initialization the array with random numbers
- memcpy( esp_buf[2], esp_buf[0], inbytes );
- memcpy( esp_buf[3], esp_buf[1], inbytes );
-
- ftbl [ FTBL_IDX(opcode, impl) ] ( esp_buf[0] , esp_buf[0], esp_buf[1], tmp_len );
- ftbl [ FTBL_IDX(opcode, impl) ] ( esp_buf[4] , esp_buf[2], esp_buf[3], tmp_len );
-
- fprintf ( stderr, "** NTOE: Due to the nature of this test we cannot use an assert - the values may or may not be the same... make sure NAN values are not geenrated by using FILL_FLOAT_ARRAY_\'LIMIT\'. \n" );
-
- for ( i = 0; i < tmp_len * opcode; i++ ) // at this point the two outputs must be identical
- {
- // assert( esp_buf[0][i] == esp_buf[4][i] ); // check for not-a-number
-
- if ( ! EQUALS_FLOAT( esp_buf[0][i] , esp_buf[4][i], ERROR_MARGIN_LARGE*10 ) )
- {
- fprintf ( stderr, "\t FATAL ERROR: Operation number %d implementation [%d] has failed the dst==src test case. \n", opcode, impl );
- fprintf ( stderr, "\t NOTE: Usually implementation 1=C, 2=ASM/VFP, and 3=ASM/NEON. \n");
- exit( NE10_ERR );
- }
- }
-
- free(esp_buf[0]); free(esp_buf[1]); free(esp_buf[2]); free(esp_buf[3]); free(esp_buf[4]);
-
-
- // sample run
- MEASURE( dt_test_sample,
- ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl-1 ] , thesrc1, thesrc2, ARRLEN );
- );
- if ( ! CHECK_ARRAY_GUARD(guarded_dst[ impl -1 ], (2*ARRAY_GUARD_LEN) + fixed_length) ||
- ! CHECK_ARRAY_GUARD(guarded_src1, (2*ARRAY_GUARD_LEN) + fixed_length) ||
- ! CHECK_ARRAY_GUARD(guarded_src2, (2*ARRAY_GUARD_LEN) + fixed_length) )
- {
- fprintf ( stderr, "\t FATAL ERROR: Operation number %d implementation [%d] has failed the guard test. \n", opcode, impl );
- exit( NE10_ERR );
- }
-
- // this test to make sure passing zero as the length won't cause segmentation faults
- ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl-1 ] , thesrc1, thesrc2, 0 );
-
- MEASURE( elapsed,
- for ( i = 0; i < max; i++ )
- {
- // call the function
- ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl -1 ] , thesrc1, thesrc2, ARRLEN );
- }
- );
-
- if ( !mute )
- printf( "%02.8f;%013.3f\n", elapsed - dt_test_overhead,
- ( 1.0f * max * ARRLEN / ( elapsed - dt_test_overhead )) );
-
- return NE10_OK;
-}
-
-arm_result_t run_test( int argc, char **argv )
-{
- if ( argc == 2 ) // requesting the number of available operations/routines in this unit
- {
- opcode = atoi ( argv[1] ); // get the command being requested, 0 = return the number of functions in this unit
- if ( opcode == 0 ) return OP_COUNT;
- exit( NE10_ERR );
- } else if ( argc == 4 ) // requesting a particular implementation of one of the operations
- {
- opcode = atoi ( argv[1] );
- if ( opcode <= 0 ) exit( NE10_ERR );
- impl = atoi ( argv[2] );
- if ( impl < 0 ) exit( NE10_ERR ); // impl == 0 means run all and compare the results to verify they produce identical outputs
- max = atoi ( argv[3] );
- if ( max <= 0 ) exit( NE10_ERR );
- } else exit( NE10_ERR );
-
- // initialize the table with NULL
- memset( ftbl, 0, sizeof(ftbl));
-
- // manually initialize the functions which have actual implementations
- init_ftbl(); // this function is implemented in the unit test source file
-
- if ( opcode <= 0 || opcode > OP_COUNT
- || impl < 0 || impl > IMPL_COUNT )
- {
- fprintf ( stderr, "\t WARNING: Operation number %d and/or implementaion number %d are not acceptable values. \n", opcode, impl );
- exit( NE10_ERR );
- }
-
- if ( impl == 0 ) // run all implementations and verify
- {
- // first, make sure all of the implementations do exist
- for ( i = FTBL_IDX(opcode, 1); i <= FTBL_IDX(opcode, IMPL_COUNT); i++ )
- {
- if ( NULL == ftbl[i] )
- {
- fprintf ( stderr, "\t WARNING: One or more implementations of operation number %d were not found. \n", opcode );
- exit( NE10_ERR );
- }
- }
-
- // try all the implementatins here..
- mute = 1; // do not print anything
-
- // opcode remains the same but we iterate through different implementations here..
- for ( impl= 1; impl <= IMPL_COUNT; impl ++ )
- {
- test_operation();
- }
-
- // now verify
- arm_float_t * _output = NULL; // [ IMPL_COUNT * MAX_VEC_COMPONENTS ]; // one for each implementation, c, asm, neon...
- int warns = 0;
- int item_width = 3; // cross has only one form at the moment, the vec3 with 3 components, x, y, and z
- _output = (arm_float_t*) malloc( IMPL_COUNT * sizeof(arm_float_t) * item_width );
- for ( i = 0; i < ARRLEN; i++ )
- {
- for ( impl= 1; impl <= IMPL_COUNT; impl ++ )
- {
- memcpy ( &_output[ (impl-1) * item_width ], &thedst[ impl-1 ][ i * item_width ], sizeof(arm_float_t) * item_width );
- }
-
- int pos = 0;
- for ( impl = 2; impl <= IMPL_COUNT; impl ++ ) // compare the output from the 2nd, 3rd, 4th, etc. to the first one so start at 2
- {
- for ( pos = 0; pos < item_width; pos++ ) // compare corresponding components of the items
- {
- assert ( _output[ ((1-1)*item_width)+pos ] == _output[ ((1-1)*item_width)+pos ] ); // check for not-a-number
- assert ( _output[ ((impl-1)*item_width)+pos ] == _output[ ((impl-1)*item_width)+pos ] ); // check for not-a-number
-
- if ( ! EQUALS_FLOAT( _output[ ((1-1)*item_width)+pos ] , _output[ ((impl-1)*item_width)+pos ], ERROR_MARGIN_SMALL ) )
- { fprintf( stderr, "\t\t WARNING: In opcode [%d], implementation [1] != implemenation [%d] on item [%d -> %d]\n",
- opcode, impl, i, pos+1 );
- warns++; }
-
- // stop after 10 warnings
- if ( warns >= ACCEPTABLE_WARNS )
- { fprintf ( stderr, "\t WARNING: One or more mismatching values were found. \n" );
- exit( NE10_ERR );
- }
- }
- }
- }
- free( _output ); _output = (arm_float_t *) NULL;
-
- if ( warns < ACCEPTABLE_WARNS )
- {
- return NE10_OK;
- }
-
- fprintf ( stderr, "\t WARNING: One or more mismatching values were found. \n" );
- exit( NE10_ERR );
- }
- else // run a particular implementation
- {
- if ( !mute ) printf( "opcode=%d;impl=%d;%d;%d;", opcode, impl, ARRLEN, max );
-
- // ge the overhead
- MEASURE( dt_test_overhead,
- for ( i = 0 ; i < max; i++ )
- {
- }
- );
-
- test_operation();
- }
-
-
-
- // free any allocated memory...
- free( guarded_src1 );
- free( guarded_src2 );
- for ( i = 0; i<IMPL_COUNT; i++ )
- {
- free( guarded_dst[i] );
- }
-
- return NE10_OK;
-}
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : headers/unit_test_addmat_operation_x.h
- */
-
-#include "./unit_test_common.h"
-#include "../inc/NE10_types.h"
-
-// This function signature applies the operations with the format "*c_*_*" (e.g. 'add'c_'float'_'neon')
-typedef arm_result_t (*arm_func_3args_t)(void * dst, void * src, unsigned int count);
-arm_func_3args_t ftbl[ OP_COUNT * IMPL_COUNT ];
-
-
-// this function is implemented in the unit test source files
-// it is meant to initialise the function table defined above.
-extern void init_ftbl();
-
-
-unsigned int i = 0; // loop iterator
-unsigned int max = 0; // number of iterations in each function
-int opcode = -1; // the operation which will be tested (a single unit can have any number of operations/functions)
-int impl = -1; // selects which particular implementation of the chosen operation must run
-int mute = 0; // 0 == print output; 1 == do not print anything;
-
-struct timeval before, after, lapsed, dummy;
-double dt_test_overhead = 0.0;
-double dt_test_sample = 0.0;
-double elapsed = 0.0;
-struct timezone zone;
-
-// there is a max of "16" components in a matrix
-#define MAX_VEC_COMPONENTS 16
-
-arm_float_t * guarded_src = NULL;
-arm_float_t * guarded_dst[IMPL_COUNT];
-
-arm_float_t * thesrc = NULL;
-arm_float_t * thedst[IMPL_COUNT]; // output from different implementations are stored in separate arrays for varification
-int done_init = 0;
-
-arm_result_t test_operation()
-{
- const unsigned int fixed_length = ARRLEN * sizeof(arm_float_t) * MAX_VEC_COMPONENTS;
-
- // initialize if not done so
- if ( 0 == done_init )
- {
- guarded_src = (arm_float_t*) malloc( (2*ARRAY_GUARD_LEN) + fixed_length ); // 16 extra bytes at the begining and 16 extra bytes at the end
- GUARD_ARRAY( guarded_src, (2*ARRAY_GUARD_LEN) + fixed_length );
- thesrc = (arm_float_t*) ( (void*)guarded_src + 16);
- FILL_FLOAT_ARRAY_LIMIT_GT1( thesrc, ARRLEN * MAX_VEC_COMPONENTS ); // random initialization
-
- for ( i = 0; i<IMPL_COUNT; i++ )
- {
- guarded_dst[i] = (arm_float_t*) malloc( (2*ARRAY_GUARD_LEN) + fixed_length ); // 16 extra bytes at the begining and 16 extra bytes at the end
- GUARD_ARRAY( guarded_dst[i], (2*ARRAY_GUARD_LEN) + fixed_length );
- thedst[i] = (arm_float_t*) ( (void*)guarded_dst[i] + 16);
- }
-
- done_init = 1;
- }
-
- // sample run
- MEASURE( dt_test_sample,
- ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl-1 ] , thesrc, ARRLEN );
- );
- if ( ! CHECK_ARRAY_GUARD(guarded_dst[ impl -1 ], (2*ARRAY_GUARD_LEN) + fixed_length) ||
- ! CHECK_ARRAY_GUARD(guarded_src, (2*ARRAY_GUARD_LEN) + fixed_length) )
- {
- fprintf ( stderr, "\t FATAL ERROR: Operation number %d implementation [%d] has failed the guard test. \n", opcode, impl );
- exit( NE10_ERR );
- }
-
- // this test to make sure passing zero as the length won't cause segmentation faults
- ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl-1 ] , thesrc, 0 );
-
- MEASURE( elapsed,
- for ( i = 0; i < max; i++ )
- {
- // call the function
- ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl -1 ] , thesrc, ARRLEN );
- }
- );
-
- if ( !mute )
- printf( "%02.8f;%013.3f\n", elapsed - dt_test_overhead,
- ( 1.0f * max * ARRLEN / ( elapsed - dt_test_overhead )) );
-
- return NE10_OK;
-}
-
-arm_result_t run_test( int argc, char **argv )
-{
- if ( argc == 2 ) // requesting the number of available operations/routines in this unit
- {
- opcode = atoi ( argv[1] ); // get the command being requested, 0 = return the number of functions in this unit
- if ( opcode == 0 ) return OP_COUNT;
- exit( NE10_ERR );
- } else if ( argc == 4 ) // requesting a particular implementation of one of the operations
- {
- opcode = atoi ( argv[1] );
- if ( opcode <= 0 ) exit( NE10_ERR );
- impl = atoi ( argv[2] );
- if ( impl < 0 ) exit( NE10_ERR ); // impl == 0 means run all and compare the results to verify they produce identical outputs
- max = atoi ( argv[3] );
- if ( max <= 0 ) exit( NE10_ERR );
- } else exit( NE10_ERR );
-
- // initialize the table with NULL
- memset( ftbl, 0, sizeof(ftbl));
-
- // manually initialize the functions which have actual implementations
- init_ftbl(); // this function is implemented in the unit test source file
-
- if ( opcode <= 0 || opcode > OP_COUNT
- || impl < 0 || impl > IMPL_COUNT )
- {
- fprintf ( stderr, "\t WARNING: Operation number %d and/or implementaion number %d are not acceptable values. \n", opcode, impl );
- exit( NE10_ERR );
- }
-
- if ( impl == 0 ) // run all implementations and verify
- {
- // first, make sure all of the implementations do exist
- for ( i = FTBL_IDX(opcode, 1); i <= FTBL_IDX(opcode, IMPL_COUNT); i++ )
- {
- if ( NULL == ftbl[i] )
- {
- fprintf ( stderr, "\t WARNING: One or more implementations of operation number %d were not found. \n", opcode );
- exit( NE10_ERR );
- }
- }
-
- // try all the implementatins here..
- mute = 1; // do not print anything
-
- // opcode remains the same but we iterate through different implementations here..
- for ( impl= 1; impl <= IMPL_COUNT; impl ++ )
- {
- test_operation();
- }
-
- // now verify
- arm_float_t * _output = NULL; // [ IMPL_COUNT * MAX_VEC_COMPONENTS ]; // one for each implementation, c, asm, neon...
- int warns = 0;
- int item_width = 1; // determinant is always a scalar value
- const int item_width_p2 = item_width * item_width;
- _output = (arm_float_t*) malloc( IMPL_COUNT * sizeof(arm_float_t) * item_width_p2 );
- for ( i = 0; i < ARRLEN; i++ )
- {
- for ( impl= 1; impl <= IMPL_COUNT; impl ++ )
- {
- memcpy ( &_output[ (impl-1) * item_width_p2 ], &thedst[ impl-1 ][ i * item_width_p2 ], sizeof(arm_float_t) * item_width_p2 );
- }
-
- int pos = 0;
- for ( impl = 2; impl <= IMPL_COUNT; impl ++ ) // compare the output from the 2nd, 3rd, 4th, etc. to the first one so start at 2
- {
- for ( pos = 0; pos < item_width_p2; pos++ ) // compare corresponding components of the items
- {
- assert ( _output[ ((1-1)*item_width_p2)+pos ] == _output[ ((1-1)*item_width_p2)+pos ] ); // check for not-a-number
- assert ( _output[ ((impl-1)*item_width_p2)+pos ] == _output[ ((impl-1)*item_width_p2)+pos ] ); // check for not-a-number
-
- if ( ! EQUALS_FLOAT( _output[ ((1-1)*item_width_p2)+pos ] , _output[ ((impl-1)*item_width_p2)+pos ], ERROR_MARGIN_SMALL ) )
- { fprintf( stderr, "\t\t WARNING: In opcode [%d], implementation [1] != implemenation [%d] on item [%d -> %d]\n",
- opcode, impl, i, pos+1 );
- warns++; }
-
- // stop after 10 warnings
- if ( warns >= ACCEPTABLE_WARNS_MATRICES )
- { fprintf ( stderr, "\t WARNING: One or more mismatching values were found. \n" );
- exit( NE10_ERR );
- }
- }
- }
- }
- free( _output ); _output = (arm_float_t *) NULL;
-
- if ( warns < ACCEPTABLE_WARNS_MATRICES )
- {
- return NE10_OK;
- }
-
- fprintf ( stderr, "\t WARNING: One or more mismatching values were found. \n" );
- exit( NE10_ERR );
- }
- else // run a particular implementation
- {
- if ( !mute ) printf( "opcode=%d;impl=%d;%d;%d;", opcode, impl, ARRLEN, max );
-
- // ge the overhead
- MEASURE( dt_test_overhead,
- for ( i = 0 ; i < max; i++ )
- {
- }
- );
-
- test_operation();
- }
-
-
-
- // free any allocated memory...
- free( guarded_src );
- for ( i = 0; i<IMPL_COUNT; i++ )
- {
- free( guarded_dst[i] );
- }
-
- return NE10_OK;
-}
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : headers/unit_test_dot_operation_x.h
- */
-
-#include "./unit_test_common.h"
-
-// This function signature applies the operations with the format "*c_*_*" (e.g. 'add'c_'float'_'neon')
-typedef arm_result_t (*arm_func_4args_t)(void * dst, void * src1, void * src2, unsigned int count);
-arm_func_4args_t ftbl[ OP_COUNT * IMPL_COUNT ];
-
-
-// this function is implemented in the unit test source files
-// it is meant to initialise the function table defined above.
-extern void init_ftbl();
-
-
-unsigned int i = 0; // loop iterator
-unsigned int max = 0; // number of iterations in each function
-int opcode = -1; // the operation which will be tested (a single unit can have any number of operations/functions)
-int impl = -1; // selects which particular implementation of the chosen operation must run
-int mute = 0; // 0 == print output; 1 == do not print anything;
-
-struct timeval before, after, lapsed, dummy;
-double dt_test_overhead = 0.0;
-double dt_test_sample = 0.0;
-double elapsed = 0.0;
-struct timezone zone;
-
-// there is a max of "4" components in a vec
-#define MAX_VEC_COMPONENTS 4
-
-arm_float_t * guarded_src1 = NULL;
-arm_float_t * guarded_src2 = NULL;
-arm_float_t * guarded_dst[IMPL_COUNT];
-
-arm_float_t * thesrc1 = NULL;
-arm_float_t * thesrc2 = NULL;
-arm_float_t * thedst[IMPL_COUNT]; // output from different implementations are stored in separate arrays for varification
-int done_init = 0;
-
-arm_result_t test_operation()
-{
- const unsigned int fixed_length = ARRLEN * sizeof(arm_float_t) * MAX_VEC_COMPONENTS;
-
- // initialize if not done so
- if ( 0 == done_init )
- {
- guarded_src1 = (arm_float_t*) malloc( (2*ARRAY_GUARD_LEN) + fixed_length ); // 16 extra bytes at the begining and 16 extra bytes at the end
- GUARD_ARRAY( guarded_src1, (2*ARRAY_GUARD_LEN) + fixed_length );
- thesrc1 = (arm_float_t*) ( (void*)guarded_src1 + 16);
- FILL_FLOAT_ARRAY_LIMIT( thesrc1, ARRLEN * MAX_VEC_COMPONENTS ); // random initialization
-
- guarded_src2 = (arm_float_t*) malloc( (2*ARRAY_GUARD_LEN) + fixed_length ); // 16 extra bytes at the begining and 16 extra bytes at the end
- GUARD_ARRAY( guarded_src2, (2*ARRAY_GUARD_LEN) + fixed_length );
- thesrc2 = (arm_float_t*) ( (void*)guarded_src2 + 16);
- FILL_FLOAT_ARRAY_LIMIT( thesrc2, ARRLEN * MAX_VEC_COMPONENTS ); // random initialization
-
- for ( i = 0; i<IMPL_COUNT; i++ )
- {
- guarded_dst[i] = (arm_float_t*) malloc( (2*ARRAY_GUARD_LEN) + fixed_length ); // 16 extra bytes at the begining and 16 extra bytes at the end
- GUARD_ARRAY( guarded_dst[i], (2*ARRAY_GUARD_LEN) + fixed_length );
- thedst[i] = (arm_float_t*) ( (void*)guarded_dst[i] + 16);
- }
-
- done_init = 1;
- }
-
- // sample run
- MEASURE( dt_test_sample,
- ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl-1 ] , thesrc1, thesrc2, ARRLEN );
- );
- if ( ! CHECK_ARRAY_GUARD(guarded_dst[ impl -1 ], (2*ARRAY_GUARD_LEN) + fixed_length) ||
- ! CHECK_ARRAY_GUARD(guarded_src1, (2*ARRAY_GUARD_LEN) + fixed_length) ||
- ! CHECK_ARRAY_GUARD(guarded_src2, (2*ARRAY_GUARD_LEN) + fixed_length) )
- {
- fprintf ( stderr, "\t FATAL ERROR: Operation number %d implementation [%d] has failed the guard test. \n", opcode, impl );
- exit( NE10_ERR );
- }
-
- // this test to make sure passing zero as the length won't cause segmentation faults
- ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl-1 ] , thesrc1, thesrc2, 0 );
-
- MEASURE( elapsed,
- for ( i = 0; i < max; i++ )
- {
- // call the function
- ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl -1 ] , thesrc1, thesrc2, ARRLEN );
- }
- );
-
- if ( !mute )
- printf( "%02.8f;%013.3f\n", elapsed - dt_test_overhead,
- ( 1.0f * max * ARRLEN / ( elapsed - dt_test_overhead )) );
-
- return NE10_OK;
-}
-
-arm_result_t run_test( int argc, char **argv )
-{
- if ( argc == 2 ) // requesting the number of available operations/routines in this unit
- {
- opcode = atoi ( argv[1] ); // get the command being requested, 0 = return the number of functions in this unit
- if ( opcode == 0 ) return OP_COUNT;
- exit( NE10_ERR );
- } else if ( argc == 4 ) // requesting a particular implementation of one of the operations
- {
- opcode = atoi ( argv[1] );
- if ( opcode <= 0 ) exit( NE10_ERR );
- impl = atoi ( argv[2] );
- if ( impl < 0 ) exit( NE10_ERR ); // impl == 0 means run all and compare the results to verify they produce identical outputs
- max = atoi ( argv[3] );
- if ( max <= 0 ) exit( NE10_ERR );
- } else exit( NE10_ERR );
-
- // initialize the table with NULL
- memset( ftbl, 0, sizeof(ftbl));
-
- // manually initialize the functions which have actual implementations
- init_ftbl(); // this function is implemented in the unit test source file
-
- if ( opcode <= 0 || opcode > OP_COUNT
- || impl < 0 || impl > IMPL_COUNT )
- {
- fprintf ( stderr, "\t WARNING: Operation number %d and/or implementaion number %d are not acceptable values. \n", opcode, impl );
- exit( NE10_ERR );
- }
-
- if ( impl == 0 ) // run all implementations and verify
- {
- // first, make sure all of the implementations do exist
- for ( i = FTBL_IDX(opcode, 1); i <= FTBL_IDX(opcode, IMPL_COUNT); i++ )
- {
- if ( NULL == ftbl[i] )
- {
- fprintf ( stderr, "\t WARNING: One or more implementations of operation number %d were not found. \n", opcode );
- exit( NE10_ERR );
- }
- }
-
- // try all the implementatins here..
- mute = 1; // do not print anything
-
- // opcode remains the same but we iterate through different implementations here..
- for ( impl= 1; impl <= IMPL_COUNT; impl ++ )
- {
- test_operation();
- }
-
- // now verify
- arm_float_t * _output = NULL; // [ IMPL_COUNT * MAX_VEC_COMPONENTS ]; // one for each implementation, c, asm, neon...
- int warns = 0;
- int item_width = 1; // dot product is always a scalar value
- _output = (arm_float_t*) malloc( IMPL_COUNT * sizeof(arm_float_t) * item_width );
- for ( i = 0; i < ARRLEN; i++ )
- {
- for ( impl= 1; impl <= IMPL_COUNT; impl ++ )
- {
- memcpy ( &_output[ (impl-1) * item_width ], &thedst[ impl-1 ][ i * item_width ], sizeof(arm_float_t) * item_width );
- }
-
- int pos = 0;
- for ( impl = 2; impl <= IMPL_COUNT; impl ++ ) // compare the output from the 2nd, 3rd, 4th, etc. to the first one so start at 2
- {
- for ( pos = 0; pos < item_width; pos++ ) // compare corresponding components of the items
- {
- assert ( _output[ ((1-1)*item_width)+pos ] == _output[ ((1-1)*item_width)+pos ] ); // check for not-a-number
- assert ( _output[ ((impl-1)*item_width)+pos ] == _output[ ((impl-1)*item_width)+pos ] ); // check for not-a-number
-
- if ( ! EQUALS_FLOAT( _output[ ((1-1)*item_width)+pos ] , _output[ ((impl-1)*item_width)+pos ], ERROR_MARGIN_LARGE ) )
- { fprintf( stderr, "\t\t WARNING: In opcode [%d], implementation [1] != implemenation [%d] on item [%d -> %d]\n",
- opcode, impl, i, pos+1 );
- warns++; }
-
- // stop after 10 warnings
- if ( warns >= ACCEPTABLE_WARNS )
- { fprintf ( stderr, "\t WARNING: One or more mismatching values were found. \n" );
- exit( NE10_ERR );
- }
- }
- }
- }
- free( _output ); _output = (arm_float_t *) NULL;
-
- if ( warns < ACCEPTABLE_WARNS )
- {
- return NE10_OK;
- }
-
- fprintf ( stderr, "\t WARNING: One or more mismatching values were found. \n" );
- exit( NE10_ERR );
- }
- else // run a particular implementation
- {
- if ( !mute ) printf( "opcode=%d;impl=%d;%d;%d;", opcode, impl, ARRLEN, max );
-
- // ge the overhead
- MEASURE( dt_test_overhead,
- for ( i = 0 ; i < max; i++ )
- {
- }
- );
-
- test_operation();
- }
-
-
-
- // free any allocated memory...
- free( guarded_src1 );
- free( guarded_src2 );
- for ( i = 0; i<IMPL_COUNT; i++ )
- {
- free( guarded_dst[i] );
- }
-
- return NE10_OK;
-}
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : headers/unit_test_addmat_operation_x.h
- */
-
-#include "./unit_test_common.h"
-
-// This function signature applies the operations with the format "*c_*_*" (e.g. 'add'c_'float'_'neon')
-typedef arm_result_t (*arm_func_2args_t)(void * dst, unsigned int count);
-arm_func_2args_t ftbl[ OP_COUNT * IMPL_COUNT ];
-
-
-// this function is implemented in the unit test source files
-// it is meant to initialise the function table defined above.
-extern void init_ftbl();
-
-
-unsigned int i = 0; // loop iterator
-unsigned int max = 0; // number of iterations in each function
-int opcode = -1; // the operation which will be tested (a single unit can have any number of operations/functions)
-int impl = -1; // selects which particular implementation of the chosen operation must run
-int mute = 0; // 0 == print output; 1 == do not print anything;
-
-struct timeval before, after, lapsed, dummy;
-double dt_test_overhead = 0.0;
-double dt_test_sample = 0.0;
-double elapsed = 0.0;
-struct timezone zone;
-
-// there is a max of "16" components in a matrix
-#define MAX_VEC_COMPONENTS 16
-
-arm_float_t * guarded_dst[IMPL_COUNT];
-
-arm_float_t * thedst[IMPL_COUNT]; // output from different implementations are stored in separate arrays for varification
-int done_init = 0;
-
-arm_result_t test_operation()
-{
- const unsigned int fixed_length = ARRLEN * sizeof(arm_float_t) * MAX_VEC_COMPONENTS;
-
- // initialize if not done so
- if ( 0 == done_init )
- {
- for ( i = 0; i<IMPL_COUNT; i++ )
- {
- guarded_dst[i] = (arm_float_t*) malloc( (2*ARRAY_GUARD_LEN) + fixed_length ); // 16 extra bytes at the begining and 16 extra bytes at the end
- GUARD_ARRAY( guarded_dst[i], (2*ARRAY_GUARD_LEN) + fixed_length );
- thedst[i] = (arm_float_t*) ( (void*)guarded_dst[i] + 16);
- }
-
- done_init = 1;
- }
-
- // sample run
- MEASURE( dt_test_sample,
- ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl-1 ], ARRLEN );
- );
- if ( ! CHECK_ARRAY_GUARD(guarded_dst[ impl -1 ], (2*ARRAY_GUARD_LEN) + fixed_length) )
-
- {
- fprintf ( stderr, "\t FATAL ERROR: Operation number %d implementation [%d] has failed the guard test. \n", opcode, impl );
- exit( NE10_ERR );
- }
-
- // this test to make sure passing zero as the length won't cause segmentation faults
- ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl-1 ] , 0 );
-
- MEASURE( elapsed,
- for ( i = 0; i < max; i++ )
- {
- // call the function
- ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl -1 ] , ARRLEN );
- }
- );
-
- if ( !mute )
- printf( "%02.8f;%013.3f\n", elapsed - dt_test_overhead,
- ( 1.0f * max * ARRLEN / ( elapsed - dt_test_overhead )) );
-
- return NE10_OK;
-}
-
-
-
-
-arm_result_t run_test( int argc, char **argv )
-{
- if ( argc == 2 ) // requesting the number of available operations/routines in this unit
- {
- opcode = atoi ( argv[1] ); // get the command being requested, 0 = return the number of functions in this unit
- if ( opcode == 0 ) return OP_COUNT;
- exit( NE10_ERR );
- } else if ( argc == 4 ) // requesting a particular implementation of one of the operations
- {
- opcode = atoi ( argv[1] );
- if ( opcode <= 0 ) exit( NE10_ERR );
- impl = atoi ( argv[2] );
- if ( impl < 0 ) exit( NE10_ERR ); // impl == 0 means run all and compare the results to verify they produce identical outputs
- max = atoi ( argv[3] );
- if ( max <= 0 ) exit( NE10_ERR );
- } else exit( NE10_ERR );
-
- // initialize the table with NULL
- memset( ftbl, 0, sizeof(ftbl));
-
- // manually initialize the functions which have actual implementations
- init_ftbl(); // this function is implemented in the unit test source file
-
- if ( opcode <= 0 || opcode > OP_COUNT
- || impl < 0 || impl > IMPL_COUNT )
- {
- fprintf ( stderr, "\t WARNING: Operation number %d and/or implementaion number %d are not acceptable values. \n", opcode, impl );
- exit( NE10_ERR );
- }
-
- if ( impl == 0 ) // run all implementations and verify
- {
- // first, make sure all of the implementations do exist
- for ( i = FTBL_IDX(opcode, 1); i <= FTBL_IDX(opcode, IMPL_COUNT); i++ )
- {
- if ( NULL == ftbl[i] )
- {
- fprintf ( stderr, "\t WARNING: One or more implementations of operation number %d were not found. \n", opcode );
- exit( NE10_ERR );
- }
- }
-
- // try all the implementatins here..
- mute = 1; // do not print anything
-
- // opcode remains the same but we iterate through different implementations here..
- for ( impl= 1; impl <= IMPL_COUNT; impl ++ )
- {
- test_operation();
- }
-
- // now verify
- arm_float_t * _output = NULL; // [ IMPL_COUNT * MAX_VEC_COMPONENTS ]; // one for each implementation, c, asm, neon...
- int warns = 0;
- int item_width = opcode+1; // using the opcode (1=mat2x2, 2=mat3x3, ...)
- const int item_width_p2 = item_width * item_width;
- _output = (arm_float_t*) malloc( IMPL_COUNT * sizeof(arm_float_t) * item_width_p2 );
- for ( i = 0; i < ARRLEN; i++ )
- {
- for ( impl= 1; impl <= IMPL_COUNT; impl ++ )
- {
- memcpy ( &_output[ (impl-1) * item_width_p2 ], &thedst[ impl-1 ][ i * item_width_p2 ], sizeof(arm_float_t) * item_width_p2 );
- }
-
- int pos = 0;
- for ( impl = 2; impl <= IMPL_COUNT; impl ++ ) // compare the output from the 2nd, 3rd, 4th, etc. to the first one so start at 2
- {
- for ( pos = 0; pos < item_width_p2; pos++ ) // compare corresponding components of the items
- {
- assert ( _output[ ((1-1)*item_width_p2)+pos ] == _output[ ((1-1)*item_width_p2)+pos ] ); // check for not-a-number
- assert ( _output[ ((impl-1)*item_width_p2)+pos ] == _output[ ((impl-1)*item_width_p2)+pos ] ); // check for not-a-number
-
- if ( ! EQUALS_FLOAT( _output[ ((1-1)*item_width_p2)+pos ] , _output[ ((impl-1)*item_width_p2)+pos ], ERROR_MARGIN_LARGE ) )
- { fprintf( stderr, "\t\t WARNING: In opcode [%d], implementation [1] != implemenation [%d] on item [%d -> %d]\n",
- opcode, impl, i, pos+1 );
- warns++; }
-
- // stop after 10 warnings
- if ( warns >= ACCEPTABLE_WARNS_MATRICES )
- { fprintf ( stderr, "\t WARNING: One or more mismatching values were found. \n" );
- exit( NE10_ERR );
- }
- }
- }
- }
- free( _output ); _output = (arm_float_t *) NULL;
-
- if ( warns < ACCEPTABLE_WARNS_MATRICES )
- {
- return NE10_OK;
- }
-
- fprintf ( stderr, "\t WARNING: One or more mismatching values were found. \n" );
- exit( NE10_ERR );
- }
- else // run a particular implementation
- {
- if ( !mute ) printf( "opcode=%d;impl=%d;%d;%d;", opcode, impl, ARRLEN, max );
-
- // ge the overhead
- MEASURE( dt_test_overhead,
- for ( i = 0 ; i < max; i++ )
- {
- }
- );
-
- test_operation();
- }
-
-
-
- // free any allocated memory...
- for ( i = 0; i<IMPL_COUNT; i++ )
- {
- free( guarded_dst[i] );
- }
-
- return NE10_OK;
-}
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : headers/unit_test_addmat_operation_x.h
- */
-
-#include "./unit_test_common.h"
-
-// This function signature applies the operations with the format "*c_*_*" (e.g. 'add'c_'float'_'neon')
-typedef arm_result_t (*arm_func_3args_t)(void * dst, void * src, unsigned int count);
-arm_func_3args_t ftbl[ OP_COUNT * IMPL_COUNT ];
-
-
-// this function is implemented in the unit test source files
-// it is meant to initialise the function table defined above.
-extern void init_ftbl();
-
-
-unsigned int i = 0; // loop iterator
-unsigned int max = 0; // number of iterations in each function
-int opcode = -1; // the operation which will be tested (a single unit can have any number of operations/functions)
-int impl = -1; // selects which particular implementation of the chosen operation must run
-int mute = 0; // 0 == print output; 1 == do not print anything;
-
-struct timeval before, after, lapsed, dummy;
-double dt_test_overhead = 0.0;
-double dt_test_sample = 0.0;
-double elapsed = 0.0;
-struct timezone zone;
-
-// there is a max of "16" components in a matrix
-#define MAX_VEC_COMPONENTS 16
-
-arm_float_t * guarded_src = NULL;
-arm_float_t * guarded_dst[IMPL_COUNT];
-
-arm_float_t * thesrc = NULL;
-arm_float_t * thedst[IMPL_COUNT]; // output from different implementations are stored in separate arrays for varification
-int done_init = 0;
-
-// Eight buffers that are used for special test cases such as when the destination and source point to the same address.
-// They may vary in size from one case to another and from one function to another.
-arm_float_t* esp_buf[8];
-
-/*
- * Allocates `bytes` bytes for a test buffer. On failure a message is printed
- * on stderr and the process ends with exit( NE10_ERR ), so callers never
- * receive NULL.
- */
-static void * alloc_or_exit( size_t bytes )
-{
-    void * mem = malloc( bytes );
-    if ( NULL == mem )
-    {
-        fprintf ( stderr, "\t FATAL ERROR: Could not allocate %lu bytes for a test buffer. \n", (unsigned long) bytes );
-        exit( NE10_ERR );
-    }
-    return mem;
-}
-
-/*
- * Exercises the routine selected by the globals `opcode` and `impl`:
- *   1. in-place check: the routine is run once with dst == src and once with
- *      separate buffers on identical input; both outputs must match exactly,
- *   2. one sample call over ARRLEN items, followed by a check of the guard
- *      bytes around the source and destination buffers,
- *   3. one call with a length of zero (must not crash),
- *   4. `max` timed calls over ARRLEN items; the timing is printed unless `mute` is set.
- *
- * The ARRLEN-sized buffers are allocated on the first call only and are
- * released by run_test().
- *
- * Returns NE10_OK. Every failure is reported on stderr and ends the process
- * with exit( NE10_ERR ).
- */
-arm_result_t test_operation()
-{
-    const unsigned int fixed_length = ARRLEN * sizeof(arm_float_t) * MAX_VEC_COMPONENTS;
-
-    // one-time set-up of the buffers shared by all calls
-    if ( 0 == done_init )
-    {
-        guarded_src = (arm_float_t*) alloc_or_exit( (2*ARRAY_GUARD_LEN) + fixed_length ); // guard bytes at the beginning and at the end
-        GUARD_ARRAY( guarded_src, (2*ARRAY_GUARD_LEN) + fixed_length );
-        // The payload starts 16 bytes into the allocation. The cast to char* keeps the byte
-        // arithmetic standard C (arithmetic on void* is a GNU extension).
-        // NOTE(review): the literal 16 presumably equals ARRAY_GUARD_LEN - confirm.
-        thesrc = (arm_float_t*) ( (char*)guarded_src + 16 );
-        FILL_FLOAT_ARRAY_LIMIT_GT1( thesrc, ARRLEN * MAX_VEC_COMPONENTS ); // random initialization
-
-        for ( i = 0; i<IMPL_COUNT; i++ )
-        {
-            guarded_dst[i] = (arm_float_t*) alloc_or_exit( (2*ARRAY_GUARD_LEN) + fixed_length );
-            GUARD_ARRAY( guarded_dst[i], (2*ARRAY_GUARD_LEN) + fixed_length );
-            thedst[i] = (arm_float_t*) ( (char*)guarded_dst[i] + 16 );
-        }
-
-        done_init = 1;
-    }
-
-    // test the special case where dst == src
-    unsigned int tmp_len = 13; // Just an odd number bigger than 8
-    unsigned int inbytes = tmp_len * MAX_VEC_COMPONENTS * sizeof(arm_float_t);
-    esp_buf[0] = (arm_float_t*) alloc_or_exit( inbytes ); // input 1, also the in-place output
-    esp_buf[2] = (arm_float_t*) alloc_or_exit( inbytes ); // copy of 1st input
-    esp_buf[4] = (arm_float_t*) alloc_or_exit( inbytes ); // output of the out-of-place run
-
-    FILL_FLOAT_ARRAY_LIMIT( esp_buf[0], tmp_len * MAX_VEC_COMPONENTS ); // initialize the array with random numbers
-    // This second fill is overwritten by the memcpy right below; it is kept so that the
-    // sequence of fill calls (and whatever state they advance) stays exactly as before.
-    FILL_FLOAT_ARRAY_LIMIT( esp_buf[2], tmp_len * MAX_VEC_COMPONENTS );
-    memcpy( esp_buf[2], esp_buf[0], inbytes );
-
-    ftbl [ FTBL_IDX(opcode, impl) ] ( esp_buf[0] , esp_buf[0], tmp_len ); // in place
-    ftbl [ FTBL_IDX(opcode, impl) ] ( esp_buf[4] , esp_buf[2], tmp_len ); // out of place
-
-    // an item holds (opcode+1) x (opcode+1) floats; at this point the two outputs must be identical
-    for ( i = 0; i < tmp_len * (opcode+1) * (opcode+1); i++ )
-    {
-        if ( esp_buf[0][i] != esp_buf[4][i] )
-        {
-            fprintf ( stderr, "\t FATAL ERROR: Operation number %d implementation [%d] has failed the dst==src test case. \n", opcode, impl );
-            fprintf ( stderr, "\t NOTE: Usually implementation 1=C, 2=ASM/VFP, and 3=ASM/NEON. \n" );
-            exit( NE10_ERR );
-        }
-    }
-
-    free(esp_buf[0]); free(esp_buf[2]); free(esp_buf[4]);
-
-    // sample run
-    MEASURE( dt_test_sample,
-        ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl-1 ] , thesrc, ARRLEN );
-    );
-    // the routine must not have written outside the payload of either buffer
-    if ( ! CHECK_ARRAY_GUARD(guarded_dst[ impl -1 ], (2*ARRAY_GUARD_LEN) + fixed_length) ||
-         ! CHECK_ARRAY_GUARD(guarded_src, (2*ARRAY_GUARD_LEN) + fixed_length) )
-    {
-        fprintf ( stderr, "\t FATAL ERROR: Operation number %d implementation [%d] has failed the guard test. \n", opcode, impl );
-        exit( NE10_ERR );
-    }
-
-    // this test to make sure passing zero as the length won't cause segmentation faults
-    ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl-1 ] , thesrc, 0 );
-
-    // timed run
-    MEASURE( elapsed,
-        for ( i = 0; i < max; i++ )
-        {
-            // call the function
-            ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl -1 ] , thesrc, ARRLEN );
-        }
-    );
-
-    // prints "<net seconds>;<items per second>"
-    if ( !mute )
-        printf( "%02.8f;%013.3f\n", elapsed - dt_test_overhead,
-                ( 1.0f * max * ARRLEN / ( elapsed - dt_test_overhead )) );
-
-    return NE10_OK;
-}
-
-
-
-
-/*
- * Entry point of the unit test.
- *
- *   argc == 2 : argv[1] == "0" returns OP_COUNT (the number of operations in
- *               this unit); any other value ends the process with NE10_ERR.
- *   argc == 4 : <opcode> <impl> <iterations>
- *                 opcode      1..OP_COUNT
- *                 impl        1..IMPL_COUNT runs and times that implementation;
- *                             0 runs every implementation and compares each
- *                             output with the one of implementation 1
- *                 iterations  number of timed calls; must be positive
- *   otherwise : the process ends with NE10_ERR.
- *
- * Returns NE10_OK (or OP_COUNT for the query form). All failures end the
- * process with exit( NE10_ERR ).
- */
-arm_result_t run_test( int argc, char **argv )
-{
-    if ( argc == 2 ) // requesting the number of available operations/routines in this unit
-    {
-        opcode = atoi ( argv[1] ); // get the command being requested, 0 = return the number of functions in this unit
-        if ( opcode == 0 ) return OP_COUNT;
-        exit( NE10_ERR );
-    } else if ( argc == 4 ) // requesting a particular implementation of one of the operations
-    {
-        opcode = atoi ( argv[1] );
-        if ( opcode <= 0 ) exit( NE10_ERR );
-        impl = atoi ( argv[2] );
-        if ( impl < 0 ) exit( NE10_ERR ); // impl == 0 means run all and compare the results to verify they produce identical outputs
-        // Parse into a signed value first: `max` is unsigned, so a negative argument
-        // would otherwise wrap to a huge iteration count instead of being rejected.
-        const int iterations = atoi ( argv[3] );
-        if ( iterations <= 0 ) exit( NE10_ERR );
-        max = (unsigned int) iterations;
-    } else exit( NE10_ERR );
-
-    // initialize the table with NULL
-    memset( ftbl, 0, sizeof(ftbl));
-
-    // manually initialize the functions which have actual implementations
-    init_ftbl(); // this function is implemented in the unit test source file
-
-    if ( opcode <= 0 || opcode > OP_COUNT
-      || impl < 0 || impl > IMPL_COUNT )
-    {
-        fprintf ( stderr, "\t WARNING: Operation number %d and/or implementaion number %d are not acceptable values. \n", opcode, impl );
-        exit( NE10_ERR );
-    }
-
-    if ( impl == 0 ) // run all implementations and verify
-    {
-        // first, make sure all of the implementations do exist
-        for ( i = FTBL_IDX(opcode, 1); i <= FTBL_IDX(opcode, IMPL_COUNT); i++ )
-        {
-            if ( NULL == ftbl[i] )
-            {
-                fprintf ( stderr, "\t WARNING: One or more implementations of operation number %d were not found. \n", opcode );
-                exit( NE10_ERR );
-            }
-        }
-
-        // try all the implementations here..
-        mute = 1; // do not print anything
-
-        // opcode remains the same but we iterate through different implementations here..
-        for ( impl= 1; impl <= IMPL_COUNT; impl ++ )
-        {
-            test_operation();
-        }
-
-        // now verify: gather item `i` of every implementation side by side, then compare against implementation 1
-        int warns = 0;
-        const int item_width = opcode+1; // using the opcode (1=mat2x2, 2=mat3x3, ...)
-        const int item_width_p2 = item_width * item_width; // floats per item
-        arm_float_t * _output = (arm_float_t*) malloc( IMPL_COUNT * sizeof(arm_float_t) * item_width_p2 ); // one item for each implementation, c, asm, neon...
-        if ( NULL == _output )
-        {
-            fprintf ( stderr, "\t FATAL ERROR: Could not allocate the verification buffer. \n" );
-            exit( NE10_ERR );
-        }
-
-        for ( i = 0; i < ARRLEN; i++ )
-        {
-            for ( impl= 1; impl <= IMPL_COUNT; impl ++ )
-            {
-                memcpy ( &_output[ (impl-1) * item_width_p2 ], &thedst[ impl-1 ][ i * item_width_p2 ], sizeof(arm_float_t) * item_width_p2 );
-            }
-
-            int pos = 0;
-            for ( impl = 2; impl <= IMPL_COUNT; impl ++ ) // compare the output from the 2nd, 3rd, 4th, etc. to the first one so start at 2
-            {
-                for ( pos = 0; pos < item_width_p2; pos++ ) // compare corresponding components of the items
-                {
-                    const arm_float_t reference = _output[ pos ]; // component produced by implementation 1
-                    const arm_float_t candidate = _output[ ((impl-1)*item_width_p2)+pos ];
-
-                    assert ( reference == reference ); // check for not-a-number
-                    assert ( candidate == candidate ); // check for not-a-number
-
-                    if ( ! EQUALS_FLOAT( reference , candidate, ERROR_MARGIN_LARGE ) )
-                    {
-                        // `i` is unsigned, hence %u
-                        fprintf( stderr, "\t\t WARNING: In opcode [%d], implementation [1] != implemenation [%d] on item [%u -> %d]\n",
-                                 opcode, impl, i, pos+1 );
-                        warns++;
-                    }
-
-                    // stop once the tolerated number of warnings is reached
-                    if ( warns >= ACCEPTABLE_WARNS_MATRICES )
-                    {
-                        fprintf ( stderr, "\t WARNING: One or more mismatching values were found. \n" );
-                        exit( NE10_ERR );
-                    }
-                }
-            }
-        }
-        free( _output ); _output = (arm_float_t *) NULL;
-
-        if ( warns >= ACCEPTABLE_WARNS_MATRICES )
-        {
-            fprintf ( stderr, "\t WARNING: One or more mismatching values were found. \n" );
-            exit( NE10_ERR );
-        }
-        // success: fall through to the common clean-up below (the buffers used to leak on this path)
-    }
-    else // run a particular implementation
-    {
-        // `max` is unsigned, hence %u
-        if ( !mute ) printf( "opcode=%d;impl=%d;%d;%u;", opcode, impl, ARRLEN, max );
-
-        // get the overhead of an empty loop with the same iteration count
-        MEASURE( dt_test_overhead,
-            for ( i = 0 ; i < max; i++ )
-            {
-            }
-        );
-
-        test_operation();
-    }
-
-    // free any allocated memory and forget the stale pointers,
-    // so that a later test_operation() call would allocate fresh buffers
-    free( guarded_src );
-    guarded_src = NULL;
-    thesrc = NULL;
-    for ( i = 0; i<IMPL_COUNT; i++ )
-    {
-        free( guarded_dst[i] );
-        guarded_dst[i] = NULL;
-        thedst[i] = NULL;
-    }
-    done_init = 0;
-
-    return NE10_OK;
-}
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : headers/unit_test_len_operation_x.h
- */
-
-#include "./unit_test_common.h"
-
-// Signature shared by every routine tested in this unit: (dst, src, count). The callers below pass an item count (ARRLEN), not a byte count.
-typedef arm_result_t (*arm_func_3args_t)(arm_float_t * dst, void * src, unsigned int count);
-arm_func_3args_t ftbl[ OP_COUNT * IMPL_COUNT ]; // one slot per (operation, implementation) pair; looked up through FTBL_IDX(opcode, impl)
-
-
-// init_ftbl() is implemented in the unit test source files.
-// It is meant to initialise the function table defined above; run_test() zeroes the table right before calling it.
-extern void init_ftbl();
-
-
-unsigned int i = 0; // loop iterator (shared, file-scope: test_operation() and run_test() both use it)
-unsigned int max = 0; // number of timed calls made by test_operation(); taken from argv[3]
-int opcode = -1; // the operation which will be tested (a single unit can have any number of operations/functions); 1-based, taken from argv[1]
-int impl = -1; // selects which implementation of the chosen operation must run; 1-based, 0 == run all of them and compare
-int mute = 0; // 0 == print output; 1 == do not print anything
-
-struct timeval before, after, lapsed, dummy; // NOTE(review): not referenced directly in this header - presumably scratch variables of the MEASURE macro; confirm in unit_test_common.h
-double dt_test_overhead = 0.0; // time taken by an empty loop of `max` iterations; subtracted from `elapsed` when reporting
-double dt_test_sample = 0.0; // time taken by the single sample call in test_operation()
-double elapsed = 0.0; // time taken by the `max` timed calls
-struct timezone zone; // NOTE(review): presumably also used by MEASURE - confirm
-
-// there is a max of "4" components in a vec (the buffers below are sized for ARRLEN items of this many floats)
-#define MAX_VEC_COMPONENTS 4
-
-arm_float_t * guarded_src = NULL; // raw allocation of the source buffer, guard bytes included
-arm_float_t * guarded_dst[IMPL_COUNT]; // raw allocations of the destination buffers, guard bytes included
-
-arm_float_t * thesrc = NULL; // payload of guarded_src (starts 16 bytes into the allocation)
-arm_float_t * thedst[IMPL_COUNT]; // payloads of guarded_dst; outputs of the different implementations are kept in separate arrays for verification
-int done_init = 0; // set to 1 once test_operation() has allocated the buffers above
-
-/*
- * Allocates `bytes` bytes for a test buffer. On failure a message is printed
- * on stderr and the process ends with exit( NE10_ERR ), so callers never
- * receive NULL.
- */
-static void * alloc_or_exit( size_t bytes )
-{
-    void * mem = malloc( bytes );
-    if ( NULL == mem )
-    {
-        fprintf ( stderr, "\t FATAL ERROR: Could not allocate %lu bytes for a test buffer. \n", (unsigned long) bytes );
-        exit( NE10_ERR );
-    }
-    return mem;
-}
-
-/*
- * Exercises the routine selected by the globals `opcode` and `impl`:
- *   1. one sample call over ARRLEN items, followed by a check of the guard
- *      bytes around the source and destination buffers,
- *   2. one call with a length of zero (must not crash),
- *   3. `max` timed calls over ARRLEN items; the timing is printed unless `mute` is set.
- *
- * The buffers are allocated on the first call only and are released by run_test().
- *
- * Returns NE10_OK. Every failure is reported on stderr and ends the process
- * with exit( NE10_ERR ).
- */
-arm_result_t test_operation()
-{
-    const unsigned int fixed_length = ARRLEN * sizeof(arm_float_t) * MAX_VEC_COMPONENTS;
-
-    // one-time set-up of the buffers shared by all calls
-    if ( 0 == done_init )
-    {
-        guarded_src = (arm_float_t*) alloc_or_exit( (2*ARRAY_GUARD_LEN) + fixed_length ); // guard bytes at the beginning and at the end
-        GUARD_ARRAY( guarded_src, (2*ARRAY_GUARD_LEN) + fixed_length );
-        // The payload starts 16 bytes into the allocation. The cast to char* keeps the byte
-        // arithmetic standard C (arithmetic on void* is a GNU extension).
-        // NOTE(review): the literal 16 presumably equals ARRAY_GUARD_LEN - confirm.
-        thesrc = (arm_float_t*) ( (char*)guarded_src + 16 );
-        FILL_FLOAT_ARRAY_LIMIT( thesrc, ARRLEN * MAX_VEC_COMPONENTS ); // random initialization
-
-        for ( i = 0; i<IMPL_COUNT; i++ )
-        {
-            guarded_dst[i] = (arm_float_t*) alloc_or_exit( (2*ARRAY_GUARD_LEN) + fixed_length );
-            GUARD_ARRAY( guarded_dst[i], (2*ARRAY_GUARD_LEN) + fixed_length );
-            thedst[i] = (arm_float_t*) ( (char*)guarded_dst[i] + 16 );
-        }
-
-        done_init = 1;
-    }
-
-    // sample run
-    MEASURE( dt_test_sample,
-        ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl-1 ] , thesrc, ARRLEN );
-    );
-    // the routine must not have written outside the payload of either buffer
-    if ( ! CHECK_ARRAY_GUARD(guarded_dst[ impl -1 ], (2*ARRAY_GUARD_LEN) + fixed_length) ||
-         ! CHECK_ARRAY_GUARD(guarded_src, (2*ARRAY_GUARD_LEN) + fixed_length) )
-    {
-        fprintf ( stderr, "\t FATAL ERROR: Operation number %d implementation [%d] has failed the guard test. \n", opcode, impl );
-        exit( NE10_ERR );
-    }
-
-    // this test to make sure passing zero as the length won't cause segmentation faults
-    ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl-1 ] , thesrc, 0 );
-
-    // actual test
-    MEASURE( elapsed,
-        for ( i = 0; i < max; i++ )
-        {
-            // call the function
-            ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl -1 ] , thesrc, ARRLEN );
-        }
-    );
-
-    // prints "<net seconds>;<items per second>"
-    if ( !mute )
-        printf( "%02.8f;%013.3f\n", elapsed - dt_test_overhead,
-                ( 1.0f * max * ARRLEN / ( elapsed - dt_test_overhead )) );
-
-    return NE10_OK;
-}
-
-/*
- * Entry point of the unit test.
- *
- *   argc == 2 : argv[1] == "0" returns OP_COUNT (the number of operations in
- *               this unit); any other value ends the process with NE10_ERR.
- *   argc == 4 : <opcode> <impl> <iterations>
- *                 opcode      1..OP_COUNT
- *                 impl        1..IMPL_COUNT runs and times that implementation;
- *                             0 runs every implementation and compares each
- *                             output with the one of implementation 1
- *                 iterations  number of timed calls; must be positive
- *   otherwise : the process ends with NE10_ERR.
- *
- * Returns NE10_OK (or OP_COUNT for the query form). All failures end the
- * process with exit( NE10_ERR ).
- */
-arm_result_t run_test( int argc, char **argv )
-{
-    if ( argc == 2 ) // requesting the number of available operations/routines in this unit
-    {
-        opcode = atoi ( argv[1] ); // get the command being requested, 0 = return the number of functions in this unit
-        if ( opcode == 0 ) return OP_COUNT;
-        exit( NE10_ERR );
-    } else if ( argc == 4 ) // requesting a particular implementation of one of the operations
-    {
-        opcode = atoi ( argv[1] );
-        if ( opcode <= 0 ) exit( NE10_ERR );
-        impl = atoi ( argv[2] );
-        if ( impl < 0 ) exit( NE10_ERR ); // impl == 0 means run all and compare the results to verify they produce identical outputs
-        // Parse into a signed value first: `max` is unsigned, so a negative argument
-        // would otherwise wrap to a huge iteration count instead of being rejected.
-        const int iterations = atoi ( argv[3] );
-        if ( iterations <= 0 ) exit( NE10_ERR );
-        max = (unsigned int) iterations;
-    } else exit( NE10_ERR );
-
-    // initialize the table with NULL
-    memset( ftbl, 0, sizeof(ftbl));
-
-    // manually initialize the functions which have actual implementations
-    init_ftbl(); // this function is implemented in the unit test source files
-
-    if ( opcode <= 0 || opcode > OP_COUNT
-      || impl < 0 || impl > IMPL_COUNT )
-    {
-        fprintf ( stderr, "\t WARNING: Operation number %d and/or implementaion number %d are not acceptable values. \n", opcode, impl );
-        exit( NE10_ERR );
-    }
-
-    if ( impl == 0 ) // run all implementations and verify
-    {
-        // first, make sure all of the implementations do exist
-        for ( i = FTBL_IDX(opcode, 1); i <= FTBL_IDX(opcode, IMPL_COUNT); i++ )
-        {
-            if ( NULL == ftbl[i] )
-            {
-                fprintf ( stderr, "\t WARNING: One or more implementations of operation number %d were not found. \n", opcode );
-                exit( NE10_ERR );
-            }
-        }
-
-        // try all the implementations here..
-        mute = 1; // do not print anything
-
-        // opcode remains the same but we iterate through different implementations here..
-        for ( impl= 1; impl <= IMPL_COUNT; impl ++ )
-        {
-            test_operation();
-        }
-
-        // now verify: gather item `i` of every implementation side by side, then compare against implementation 1
-        int warns = 0;
-        const int item_width = 1; // LEN() is always a scalar
-        arm_float_t * _output = (arm_float_t*) malloc( IMPL_COUNT * sizeof(arm_float_t) * item_width ); // one item for each implementation, c, asm, neon...
-        if ( NULL == _output )
-        {
-            fprintf ( stderr, "\t FATAL ERROR: Could not allocate the verification buffer. \n" );
-            exit( NE10_ERR );
-        }
-
-        for ( i = 0; i < ARRLEN; i++ )
-        {
-            for ( impl= 1; impl <= IMPL_COUNT; impl ++ )
-            {
-                memcpy ( &_output[ (impl-1) * item_width ], &thedst[ impl-1 ][ i * item_width ], sizeof(arm_float_t) * item_width );
-            }
-
-            int pos = 0;
-            for ( impl = 2; impl <= IMPL_COUNT; impl ++ ) // compare the output from the 2nd, 3rd, 4th, etc. to the first one so start at 2
-            {
-                for ( pos = 0; pos < item_width; pos++ ) // compare corresponding components of the items
-                {
-                    const arm_float_t reference = _output[ pos ]; // component produced by implementation 1
-                    const arm_float_t candidate = _output[ ((impl-1)*item_width)+pos ];
-
-                    assert ( reference == reference ); // check for not-a-number
-                    assert ( candidate == candidate ); // check for not-a-number
-
-                    if ( ! EQUALS_FLOAT( reference , candidate , ERROR_MARGIN_LARGE ) ) // accept larger errors as we're doing a single step
-                    {
-                        // `i` is unsigned, hence %u
-                        fprintf( stderr, "\t\t WARNING: In opcode [%d], implementation [1] != implemenation [%d] on item [%u -> %d]\n",
-                                 opcode, impl, i, pos+1 );
-                        warns++;
-                    }
-
-                    // stop once the tolerated number of warnings is reached
-                    if ( warns >= ACCEPTABLE_WARNS )
-                    {
-                        fprintf ( stderr, "\t WARNING: One or more mismatching values were found. \n" );
-                        exit( NE10_ERR );
-                    }
-                }
-            }
-        }
-        free( _output ); _output = (arm_float_t *) NULL;
-
-        if ( warns >= ACCEPTABLE_WARNS )
-        {
-            fprintf ( stderr, "\t WARNING: One or more mismatching values were found. \n" );
-            exit( NE10_ERR );
-        }
-        // success: fall through to the common clean-up below (the buffers used to leak on this path)
-    }
-    else // run a particular implementation
-    {
-        // `max` is unsigned, hence %u
-        if ( !mute ) printf( "opcode=%d;impl=%d;%d;%u;", opcode, impl, ARRLEN, max );
-
-        // get the overhead of an empty loop with the same iteration count
-        MEASURE( dt_test_overhead,
-            for ( i = 0 ; i < max; i++ )
-            {
-            }
-        );
-
-        test_operation();
-    }
-
-    // free any allocated memory and forget the stale pointers,
-    // so that a later test_operation() call would allocate fresh buffers
-    free( guarded_src );
-    guarded_src = NULL;
-    thesrc = NULL;
-    for ( i = 0; i<IMPL_COUNT; i++ )
-    {
-        free( guarded_dst[i] );
-        guarded_dst[i] = NULL;
-        thedst[i] = NULL;
-    }
-    done_init = 0;
-
-    return NE10_OK;
-}
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : headers/unit_test_mla_operation_x.h
- */
-
-#include "./unit_test_common.h"
-
-// Signature shared by every routine tested in this unit: (dst, acc, src1, src2, count). The callers below pass an item count (ARRLEN / tmp_len), not a byte count.
-typedef arm_result_t (*arm_func_5args_t)(void * dst, void * acc, void * src1, void * src2, unsigned int count);
-arm_func_5args_t ftbl[ OP_COUNT * IMPL_COUNT ]; // one slot per (operation, implementation) pair; looked up through FTBL_IDX(opcode, impl)
-
-
-// init_ftbl() is implemented in the unit test source files.
-// It is meant to initialise the function table defined above; run_test() zeroes the table right before calling it.
-extern void init_ftbl();
-
-
-unsigned int i = 0; // loop iterator (shared, file-scope: test_operation() and run_test() both use it)
-unsigned int max = 0; // number of timed calls made by test_operation(); taken from argv[3]
-int opcode = -1; // the operation which will be tested (a single unit can have any number of operations/functions); 1-based, taken from argv[1]
-int impl = -1; // selects which implementation of the chosen operation must run; 1-based, 0 == run all of them and compare
-int mute = 0; // 0 == print output; 1 == do not print anything
-
-struct timeval before, after, lapsed, dummy; // NOTE(review): not referenced directly in this header - presumably scratch variables of the MEASURE macro; confirm in unit_test_common.h
-double dt_test_overhead = 0.0; // time taken by an empty loop of `max` iterations; subtracted from `elapsed` when reporting
-double dt_test_sample = 0.0; // time taken by the single sample call in test_operation()
-double elapsed = 0.0; // time taken by the `max` timed calls
-struct timezone zone; // NOTE(review): presumably also used by MEASURE - confirm
-
-// there is a max of "4" components in a vec (the buffers below are sized for ARRLEN items of this many floats)
-#define MAX_VEC_COMPONENTS 4
-
-arm_float_t * guarded_acc = NULL; // raw allocation of the accumulator input, guard bytes included
-arm_float_t * guarded_src1 = NULL; // raw allocation of the first source input, guard bytes included
-arm_float_t * guarded_src2 = NULL; // raw allocation of the second source input, guard bytes included
-arm_float_t * guarded_dst[IMPL_COUNT]; // raw allocations of the destination buffers, guard bytes included
-
-arm_float_t * theacc = NULL; // payload of guarded_acc (starts 16 bytes into the allocation)
-arm_float_t * thesrc1 = NULL; // payload of guarded_src1
-arm_float_t * thesrc2 = NULL; // payload of guarded_src2
-arm_float_t * thedst[IMPL_COUNT]; // payloads of guarded_dst; outputs of the different implementations are kept in separate arrays for verification
-int done_init = 0; // set to 1 once test_operation() has allocated the buffers above
-
-// Nine buffers that are used for special test cases such as when the destination and source point to the same address.
-// They may vary in size from one case to another and from one function to another. (test_operation() below uses slots 0 to 6.)
-arm_float_t* esp_buf[9];
-
-/*
- * Allocates `bytes` bytes for a test buffer. On failure a message is printed
- * on stderr and the process ends with exit( NE10_ERR ), so callers never
- * receive NULL.
- */
-static void * alloc_or_exit( size_t bytes )
-{
-    void * mem = malloc( bytes );
-    if ( NULL == mem )
-    {
-        fprintf ( stderr, "\t FATAL ERROR: Could not allocate %lu bytes for a test buffer. \n", (unsigned long) bytes );
-        exit( NE10_ERR );
-    }
-    return mem;
-}
-
-/*
- * Exercises the routine selected by the globals `opcode` and `impl`:
- *   1. aliasing checks on small 13-item buffers: the routine is run with
- *      dst == acc, then with dst == src1, and each result must match exactly
- *      the result of an out-of-place run on copies of the same inputs,
- *   2. one sample call over ARRLEN items, followed by a check of the guard
- *      bytes around all input and output buffers,
- *   3. one call with a length of zero (must not crash),
- *   4. `max` timed calls over ARRLEN items; the timing is printed unless `mute` is set.
- *
- * The ARRLEN-sized buffers are allocated on the first call only and are
- * released by run_test().
- *
- * Returns NE10_OK. Every failure is reported on stderr and ends the process
- * with exit( NE10_ERR ).
- */
-arm_result_t test_operation()
-{
-    const unsigned int fixed_length = ARRLEN * sizeof(arm_float_t) * MAX_VEC_COMPONENTS;
-
-    // one-time set-up of the buffers shared by all calls
-    if ( 0 == done_init )
-    {
-        // The payload of each buffer starts 16 bytes into its allocation. The casts to char*
-        // keep the byte arithmetic standard C (arithmetic on void* is a GNU extension).
-        // NOTE(review): the literal 16 presumably equals ARRAY_GUARD_LEN - confirm.
-        guarded_acc = (arm_float_t*) alloc_or_exit( (2*ARRAY_GUARD_LEN) + fixed_length ); // guard bytes at the beginning and at the end
-        GUARD_ARRAY( guarded_acc, (2*ARRAY_GUARD_LEN) + fixed_length );
-        theacc = (arm_float_t*) ( (char*)guarded_acc + 16 );
-        FILL_FLOAT_ARRAY( theacc, ARRLEN * MAX_VEC_COMPONENTS ); // random initialization
-
-        guarded_src1 = (arm_float_t*) alloc_or_exit( (2*ARRAY_GUARD_LEN) + fixed_length );
-        GUARD_ARRAY( guarded_src1, (2*ARRAY_GUARD_LEN) + fixed_length );
-        thesrc1 = (arm_float_t*) ( (char*)guarded_src1 + 16 );
-        FILL_FLOAT_ARRAY( thesrc1, ARRLEN * MAX_VEC_COMPONENTS ); // random initialization
-
-        guarded_src2 = (arm_float_t*) alloc_or_exit( (2*ARRAY_GUARD_LEN) + fixed_length );
-        GUARD_ARRAY( guarded_src2, (2*ARRAY_GUARD_LEN) + fixed_length );
-        thesrc2 = (arm_float_t*) ( (char*)guarded_src2 + 16 );
-        FILL_FLOAT_ARRAY( thesrc2, ARRLEN * MAX_VEC_COMPONENTS ); // random initialization
-
-        for ( i = 0; i<IMPL_COUNT; i++ )
-        {
-            guarded_dst[i] = (arm_float_t*) alloc_or_exit( (2*ARRAY_GUARD_LEN) + fixed_length );
-            GUARD_ARRAY( guarded_dst[i], (2*ARRAY_GUARD_LEN) + fixed_length );
-            thedst[i] = (arm_float_t*) ( (char*)guarded_dst[i] + 16 );
-        }
-
-        done_init = 1;
-    }
-
-    // test the special cases where dst aliases one of the inputs
-    unsigned int tmp_len = 13; // Just an odd number bigger than 8
-    // the buffers hold arm_float_t values, so they are sized with that type (this used to be sizeof(float))
-    unsigned int inbytes = tmp_len * MAX_VEC_COMPONENTS * sizeof(arm_float_t);
-    esp_buf[0] = (arm_float_t*) alloc_or_exit( inbytes ); // input 1 (acc)
-    esp_buf[1] = (arm_float_t*) alloc_or_exit( inbytes ); // input 2 (src1)
-    esp_buf[2] = (arm_float_t*) alloc_or_exit( inbytes ); // input 3 (src2)
-    esp_buf[3] = (arm_float_t*) alloc_or_exit( inbytes ); // copy of 1st input
-    esp_buf[4] = (arm_float_t*) alloc_or_exit( inbytes ); // copy of 2nd input
-    esp_buf[5] = (arm_float_t*) alloc_or_exit( inbytes ); // copy of 3rd input
-    esp_buf[6] = (arm_float_t*) alloc_or_exit( inbytes ); // use this as the output buffer
-
-    FILL_FLOAT_ARRAY_LIMIT( esp_buf[0], tmp_len * MAX_VEC_COMPONENTS ); // initialize the array with random numbers
-    FILL_FLOAT_ARRAY_LIMIT( esp_buf[1], tmp_len * MAX_VEC_COMPONENTS ); // initialize the array with random numbers
-    FILL_FLOAT_ARRAY_LIMIT( esp_buf[2], tmp_len * MAX_VEC_COMPONENTS ); // initialize the array with random numbers
-    memcpy( esp_buf[3], esp_buf[0], inbytes );
-    memcpy( esp_buf[4], esp_buf[1], inbytes );
-    memcpy( esp_buf[5], esp_buf[2], inbytes );
-
-    ftbl [ FTBL_IDX(opcode, impl) ] ( esp_buf[0] , esp_buf[0], esp_buf[1], esp_buf[2], tmp_len ); // DST == ACC
-    ftbl [ FTBL_IDX(opcode, impl) ] ( esp_buf[6] , esp_buf[3], esp_buf[4], esp_buf[5], tmp_len );
-
-    // an item holds `opcode` floats; at this point the two outputs must be identical
-    for ( i = 0; i < tmp_len * opcode; i++ )
-    {
-        if ( esp_buf[0][i] != esp_buf[6][i] )
-        {
-            fprintf ( stderr, "\t FATAL ERROR: MLA Operation number %d implementation [%d] has failed the DST==ACC test case. \n", opcode, impl );
-            fprintf ( stderr, "\t NOTE: Usually implementation 1=C, 2=ASM/VFP, and 3=ASM/NEON. \n" );
-            exit( NE10_ERR );
-        }
-    }
-
-    // refresh the copies: esp_buf[0] now holds the result of the previous run
-    memcpy( esp_buf[3], esp_buf[0], inbytes );
-    memcpy( esp_buf[4], esp_buf[1], inbytes );
-    memcpy( esp_buf[5], esp_buf[2], inbytes );
-
-    ftbl [ FTBL_IDX(opcode, impl) ] ( esp_buf[1] , esp_buf[0], esp_buf[1], esp_buf[2], tmp_len ); // DST == SRC
-    ftbl [ FTBL_IDX(opcode, impl) ] ( esp_buf[6] , esp_buf[3], esp_buf[4], esp_buf[5], tmp_len );
-
-    for ( i = 0; i < tmp_len * opcode; i++ ) // at this point the two outputs must be identical
-    {
-        if ( esp_buf[1][i] != esp_buf[6][i] )
-        {
-            fprintf ( stderr, "\t FATAL ERROR: MLA Operation number %d implementation [%d] has failed the DST==SRC test case. \n", opcode, impl );
-            fprintf ( stderr, "\t NOTE: Usually implementation 1=C, 2=ASM/VFP, and 3=ASM/NEON. \n" );
-            exit( NE10_ERR );
-        }
-    }
-
-    free(esp_buf[0]); free(esp_buf[1]); free(esp_buf[2]); free(esp_buf[3]); free(esp_buf[4]); free(esp_buf[5]); free(esp_buf[6]);
-
-    // sample run
-    MEASURE( dt_test_sample,
-        ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl-1 ] , theacc, thesrc1, thesrc2, ARRLEN );
-    );
-    // the routine must not have written outside the payload of any buffer
-    if ( ! CHECK_ARRAY_GUARD(guarded_dst[ impl -1 ], (2*ARRAY_GUARD_LEN) + fixed_length) ||
-         ! CHECK_ARRAY_GUARD(guarded_acc, (2*ARRAY_GUARD_LEN) + fixed_length) ||
-         ! CHECK_ARRAY_GUARD(guarded_src1, (2*ARRAY_GUARD_LEN) + fixed_length) ||
-         ! CHECK_ARRAY_GUARD(guarded_src2, (2*ARRAY_GUARD_LEN) + fixed_length) )
-    {
-        fprintf ( stderr, "\t FATAL ERROR: Operation number %d implementation [%d] has failed the guard test. \n", opcode, impl );
-        exit( NE10_ERR );
-    }
-
-    // this test to make sure passing zero as the length won't cause segmentation faults
-    ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl-1 ] , theacc, thesrc1, thesrc2, 0 );
-
-    // timed run
-    MEASURE( elapsed,
-        for ( i = 0; i < max; i++ )
-        {
-            // call the function
-            ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl -1 ] , theacc, thesrc1, thesrc2, ARRLEN );
-        }
-    );
-
-    // prints "<net seconds>;<items per second>"
-    if ( !mute )
-        printf( "%02.8f;%013.3f\n", elapsed - dt_test_overhead,
-                ( 1.0f * max * ARRLEN / ( elapsed - dt_test_overhead )) );
-
-    return NE10_OK;
-}
-
-/*
- * Entry point of the unit test.
- *
- *   argc == 2 : argv[1] == "0" returns OP_COUNT (the number of operations in
- *               this unit); any other value ends the process with NE10_ERR.
- *   argc == 4 : <opcode> <impl> <iterations>
- *                 opcode      1..OP_COUNT
- *                 impl        1..IMPL_COUNT runs and times that implementation;
- *                             0 runs every implementation and compares each
- *                             output with the one of implementation 1
- *                 iterations  number of timed calls; must be positive
- *   otherwise : the process ends with NE10_ERR.
- *
- * Returns NE10_OK (or OP_COUNT for the query form). All failures end the
- * process with exit( NE10_ERR ).
- */
-arm_result_t run_test( int argc, char **argv )
-{
-    if ( argc == 2 ) // requesting the number of available operations/routines in this unit
-    {
-        opcode = atoi ( argv[1] ); // get the command being requested, 0 = return the number of functions in this unit
-        if ( opcode == 0 ) return OP_COUNT;
-        exit( NE10_ERR );
-    } else if ( argc == 4 ) // requesting a particular implementation of one of the operations
-    {
-        opcode = atoi ( argv[1] );
-        if ( opcode <= 0 ) exit( NE10_ERR );
-        impl = atoi ( argv[2] );
-        if ( impl < 0 ) exit( NE10_ERR ); // impl == 0 means run all and compare the results to verify they produce identical outputs
-        // Parse into a signed value first: `max` is unsigned, so a negative argument
-        // would otherwise wrap to a huge iteration count instead of being rejected.
-        const int iterations = atoi ( argv[3] );
-        if ( iterations <= 0 ) exit( NE10_ERR );
-        max = (unsigned int) iterations;
-    } else exit( NE10_ERR );
-
-    // initialize the table with NULL
-    memset( ftbl, 0, sizeof(ftbl));
-
-    // manually initialize the functions which have actual implementations
-    init_ftbl(); // this function is implemented in the unit test source file
-
-    if ( opcode <= 0 || opcode > OP_COUNT
-      || impl < 0 || impl > IMPL_COUNT )
-    {
-        fprintf ( stderr, "\t WARNING: Operation number %d and/or implementaion number %d are not acceptable values. \n", opcode, impl );
-        exit( NE10_ERR );
-    }
-
-    if ( impl == 0 ) // run all implementations and verify
-    {
-        // first, make sure all of the implementations do exist
-        for ( i = FTBL_IDX(opcode, 1); i <= FTBL_IDX(opcode, IMPL_COUNT); i++ )
-        {
-            if ( NULL == ftbl[i] )
-            {
-                fprintf ( stderr, "\t WARNING: One or more implementations of operation number %d were not found. \n", opcode );
-                exit( NE10_ERR );
-            }
-        }
-
-        // try all the implementations here..
-        mute = 1; // do not print anything
-
-        // opcode remains the same but we iterate through different implementations here..
-        for ( impl= 1; impl <= IMPL_COUNT; impl ++ )
-        {
-            test_operation();
-        }
-
-        // now verify: gather item `i` of every implementation side by side, then compare against implementation 1
-        int warns = 0;
-        const int item_width = opcode; // there's no easy way to guess the actual number of an item's components but using the opcode (1=float, 2=vec2, ...)
-        arm_float_t * _output = (arm_float_t*) malloc( IMPL_COUNT * sizeof(arm_float_t) * item_width ); // one item for each implementation, c, asm, neon...
-        if ( NULL == _output )
-        {
-            fprintf ( stderr, "\t FATAL ERROR: Could not allocate the verification buffer. \n" );
-            exit( NE10_ERR );
-        }
-
-        for ( i = 0; i < ARRLEN; i++ )
-        {
-            for ( impl= 1; impl <= IMPL_COUNT; impl ++ )
-            {
-                memcpy ( &_output[ (impl-1) * item_width ], &thedst[ impl-1 ][ i * item_width ], sizeof(arm_float_t) * item_width );
-            }
-
-            int pos = 0;
-            for ( impl = 2; impl <= IMPL_COUNT; impl ++ ) // compare the output from the 2nd, 3rd, 4th, etc. to the first one so start at 2
-            {
-                for ( pos = 0; pos < item_width; pos++ ) // compare corresponding components of the items
-                {
-                    const arm_float_t reference = _output[ pos ]; // component produced by implementation 1
-                    const arm_float_t candidate = _output[ ((impl-1)*item_width)+pos ];
-
-                    assert ( reference == reference ); // check for not-a-number
-                    assert ( candidate == candidate ); // check for not-a-number
-
-                    if ( ! EQUALS_FLOAT( reference , candidate, ERROR_MARGIN_LARGE ) )
-                    {
-                        // `i` is unsigned, hence %u
-                        fprintf( stderr, "\t\t WARNING: In opcode [%d], implementation [1] != implemenation [%d] on item [%u -> %d]\n",
-                                 opcode, impl, i, pos+1 );
-                        warns++;
-                    }
-
-                    // stop once the tolerated number of warnings is reached
-                    if ( warns >= ACCEPTABLE_WARNS )
-                    {
-                        fprintf ( stderr, "\t WARNING: One or more mismatching values were found. \n" );
-                        exit( NE10_ERR );
-                    }
-                }
-            }
-        }
-        free( _output ); _output = (arm_float_t *) NULL;
-
-        if ( warns >= ACCEPTABLE_WARNS )
-        {
-            fprintf ( stderr, "\t WARNING: One or more mismatching values were found. \n" );
-            exit( NE10_ERR );
-        }
-        // success: fall through to the common clean-up below (the buffers used to leak on this path)
-    }
-    else // run a particular implementation
-    {
-        // `max` is unsigned, hence %u
-        if ( !mute ) printf( "opcode=%d;impl=%d;%d;%u;", opcode, impl, ARRLEN, max );
-
-        // get the overhead of an empty loop with the same iteration count
-        MEASURE( dt_test_overhead,
-            for ( i = 0 ; i < max; i++ )
-            {
-            }
-        );
-
-        test_operation();
-    }
-
-    // free any allocated memory and forget the stale pointers,
-    // so that a later test_operation() call would allocate fresh buffers
-    free( guarded_acc );
-    guarded_acc = NULL;
-    theacc = NULL;
-    free( guarded_src1 );
-    guarded_src1 = NULL;
-    thesrc1 = NULL;
-    free( guarded_src2 );
-    guarded_src2 = NULL;
-    thesrc2 = NULL;
-    for ( i = 0; i<IMPL_COUNT; i++ )
-    {
-        free( guarded_dst[i] );
-        guarded_dst[i] = NULL;
-        thedst[i] = NULL;
-    }
-    done_init = 0;
-
-    return NE10_OK;
-}
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : headers/unit_test_mlac_operation_x.h
- */
-
-#include "./unit_test_common.h"
-
-// Signature shared by every routine tested in this unit: (dst, acc, src, cst, count). The callers below pass an item count (ARRLEN / tmp_len), not a byte count.
-typedef arm_result_t (*arm_func_5args_t)(void * dst, void * acc, void * src, const void * cst, unsigned int count);
-arm_func_5args_t ftbl[ OP_COUNT * IMPL_COUNT ]; // one slot per (operation, implementation) pair; looked up through FTBL_IDX(opcode, impl)
-
-
-// init_ftbl() is implemented in the unit test source files.
-// It is meant to initialise the function table defined above.
-extern void init_ftbl();
-
-
-unsigned int i = 0; // loop iterator (shared, file-scope)
-unsigned int max = 0; // number of timed calls made by test_operation()
-int opcode = -1; // the operation which will be tested (a single unit can have any number of operations/functions)
-int impl = -1; // selects which particular implementation of the chosen operation must run; used 1-based when indexing thedst
-int mute = 0; // 0 == print output; 1 == do not print anything
-
-struct timeval before, after, lapsed, dummy; // NOTE(review): not referenced directly in this header - presumably scratch variables of the MEASURE macro; confirm in unit_test_common.h
-double dt_test_overhead = 0.0; // subtracted from `elapsed` when the timing is reported
-double dt_test_sample = 0.0; // time taken by the single sample call in test_operation()
-double elapsed = 0.0; // time taken by the `max` timed calls
-struct timezone zone; // NOTE(review): presumably also used by MEASURE - confirm
-
-// there is a max of "4" components in a vec (the buffers below are sized for ARRLEN items of this many floats)
-#define MAX_VEC_COMPONENTS 4
-
-arm_float_t * guarded_cst = NULL; // raw allocation of the constant operand (MAX_VEC_COMPONENTS floats), guard bytes included
-arm_float_t * guarded_acc = NULL; // raw allocation of the accumulator input, guard bytes included
-arm_float_t * guarded_src = NULL; // raw allocation of the source input, guard bytes included
-arm_float_t * guarded_dst[IMPL_COUNT]; // raw allocations of the destination buffers, guard bytes included
-
-arm_float_t * thecst = NULL; // payload of guarded_cst (starts 16 bytes into the allocation)
-arm_float_t * theacc = NULL; // payload of guarded_acc
-arm_float_t * thesrc = NULL; // payload of guarded_src
-arm_float_t * thedst[IMPL_COUNT]; // payloads of guarded_dst; outputs of the different implementations are kept in separate arrays for verification
-int done_init = 0; // set to 1 once test_operation() has allocated the buffers above
-
-// Nine buffers that are used for special test cases such as when the destination and source point to the same address.
-// They may vary in size from one case to another and from one function to another.
-arm_float_t* esp_buf[9];
-
-arm_result_t test_operation()
-{
- const unsigned int fixed_length = ARRLEN * sizeof(arm_float_t) * MAX_VEC_COMPONENTS;
-
- // initialize if not done so
- if ( 0 == done_init )
- {
- guarded_cst = (arm_float_t*) malloc( (2*ARRAY_GUARD_LEN) + sizeof(arm_float_t) * MAX_VEC_COMPONENTS );
- GUARD_ARRAY( guarded_cst, (2*ARRAY_GUARD_LEN) + sizeof(arm_float_t) * MAX_VEC_COMPONENTS );
- thecst = (arm_float_t*) ( (void*)guarded_cst + 16);
- thecst[0] = (arm_float_t) 1.4f;
- thecst[1] = (arm_float_t) 6.2f;
- thecst[2] = (arm_float_t) 3.3f;
- thecst[3] = (arm_float_t) 2.5f;
-
- guarded_acc = (arm_float_t*) malloc( (2*ARRAY_GUARD_LEN) + fixed_length ); // 16 extra bytes at the begining and 16 extra bytes at the end
- GUARD_ARRAY( guarded_acc, (2*ARRAY_GUARD_LEN) + fixed_length );
- theacc = (arm_float_t*) ( (void*)guarded_acc + 16);
- FILL_FLOAT_ARRAY( theacc, ARRLEN * MAX_VEC_COMPONENTS ); // random initialization
-
- guarded_src = (arm_float_t*) malloc( (2*ARRAY_GUARD_LEN) + fixed_length ); // 16 extra bytes at the begining and 16 extra bytes at the end
- GUARD_ARRAY( guarded_src, (2*ARRAY_GUARD_LEN) + fixed_length );
- thesrc = (arm_float_t*) ( (void*)guarded_src + 16);
- FILL_FLOAT_ARRAY( thesrc, ARRLEN * MAX_VEC_COMPONENTS ); // random initialization
-
- for ( i = 0; i<IMPL_COUNT; i++ )
- {
- guarded_dst[i] = (arm_float_t*) malloc( (2*ARRAY_GUARD_LEN) + fixed_length ); // 16 extra bytes at the begining and 16 extra bytes at the end
- GUARD_ARRAY( guarded_dst[i], (2*ARRAY_GUARD_LEN) + fixed_length );
- thedst[i] = (arm_float_t*) ( (void*)guarded_dst[i] + 16);
- }
-
- done_init = 1;
- }
-
-
- // test the special case where dst == src
- unsigned int tmp_len = 13; // Just an odd number bigger than 8
- unsigned int inbytes = tmp_len * MAX_VEC_COMPONENTS * sizeof(float);
- esp_buf[0] = (arm_float_t*) malloc( inbytes ); // input 1
- esp_buf[1] = (arm_float_t*) malloc( inbytes ); // input 2
- esp_buf[2] = (arm_float_t*) malloc( inbytes ); // input 3
- esp_buf[3] = (arm_float_t*) malloc( inbytes ); // copy of 1st input
- esp_buf[4] = (arm_float_t*) malloc( inbytes ); // copy of 2nd input
- esp_buf[5] = (arm_float_t*) malloc( inbytes ); // copy of 3nd input
- esp_buf[6] = (arm_float_t*) malloc( inbytes ); // use this as the output buffer
-
- FILL_FLOAT_ARRAY_LIMIT( esp_buf[0], tmp_len * MAX_VEC_COMPONENTS ); // initialize the array with random numbers
- FILL_FLOAT_ARRAY_LIMIT( esp_buf[1], tmp_len * MAX_VEC_COMPONENTS ); // initialize the array with random numbers
- FILL_FLOAT_ARRAY_LIMIT( esp_buf[2], tmp_len * MAX_VEC_COMPONENTS ); // initialize the array with random numbers
- memcpy( esp_buf[3], esp_buf[0], inbytes );
- memcpy( esp_buf[4], esp_buf[1], inbytes );
- memcpy( esp_buf[5], esp_buf[2], inbytes );
-
- ftbl [ FTBL_IDX(opcode, impl) ] ( esp_buf[0] , esp_buf[0], esp_buf[1], esp_buf[2], tmp_len ); // DST == ACC
- ftbl [ FTBL_IDX(opcode, impl) ] ( esp_buf[6] , esp_buf[3], esp_buf[4], esp_buf[5], tmp_len );
-
- for ( i = 0; i < tmp_len * opcode; i++ ) // at this point the two outputs must be identical
- {
- if ( esp_buf[0][i] != esp_buf[6][i] )
- {
- fprintf ( stderr, "\t FATAL ERROR: MLAC Operation number %d implementation [%d] has failed the DST==ACC test case. \n", opcode, impl );
- fprintf ( stderr, "\t NOTE: Usually implementation 1=C, 2=ASM/VFP, and 3=ASM/NEON. \n" );
- exit( NE10_ERR );
- }
- }
-
- memcpy( esp_buf[3], esp_buf[0], inbytes );
- memcpy( esp_buf[4], esp_buf[1], inbytes );
- memcpy( esp_buf[5], esp_buf[2], inbytes );
-
- ftbl [ FTBL_IDX(opcode, impl) ] ( esp_buf[1] , esp_buf[0], esp_buf[1], esp_buf[2], tmp_len ); // DST == SRC
- ftbl [ FTBL_IDX(opcode, impl) ] ( esp_buf[6] , esp_buf[3], esp_buf[4], esp_buf[5], tmp_len );
-
- for ( i = 0; i < tmp_len * opcode; i++ ) // at this point the two outputs must be identical
- {
- if ( esp_buf[1][i] != esp_buf[6][i] )
- {
- fprintf ( stderr, "\t FATAL ERROR: MLAC Operation number %d implementation [%d] has failed the DST==SRC test case. \n", opcode, impl );
- fprintf ( stderr, "\t NOTE: Usually implementation 1=C, 2=ASM/VFP, and 3=ASM/NEON. \n" );
- exit( NE10_ERR );
- }
- }
-
- free(esp_buf[0]); free(esp_buf[1]); free(esp_buf[2]); free(esp_buf[3]); free(esp_buf[4]); free(esp_buf[5]); free(esp_buf[6]);
-
- // sample run
- MEASURE( dt_test_sample,
- ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl-1 ] , theacc, thesrc, thecst, ARRLEN );
- );
- if ( ! CHECK_ARRAY_GUARD(guarded_dst[ impl -1 ], (2*ARRAY_GUARD_LEN) + fixed_length) ||
- ! CHECK_ARRAY_GUARD(guarded_acc, (2*ARRAY_GUARD_LEN) + fixed_length) ||
- ! CHECK_ARRAY_GUARD(guarded_src, (2*ARRAY_GUARD_LEN) + fixed_length) ||
- ! CHECK_ARRAY_GUARD(guarded_cst, (2*ARRAY_GUARD_LEN) + sizeof(arm_float_t) * MAX_VEC_COMPONENTS) )
- {
- fprintf ( stderr, "\t FATAL ERROR: Operation number %d implementation [%d] has failed the guard test. \n", opcode, impl );
- exit( NE10_ERR );
- }
-
- // this test to make sure passing zero as the length won't cause segmentation faults
- ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl-1 ] , theacc, thesrc, thecst, 0 );
-
- // actual test
- if ( 1 == opcode )
- { // in this case the const argument is not a pointer but an actual float value
- union fp_bitwise {
- arm_float_t _f;
- unsigned int _i;
- } _icst;
-
- _icst._f = thecst[0];
-
- MEASURE( elapsed,
- for ( i = 0; i < max; i++ )
- {
- // call the function
- ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl -1 ] , theacc, thesrc, (void*)_icst._i, ARRLEN );
- }
- );
- }
- else
- {
- MEASURE( elapsed,
- for ( i = 0; i < max; i++ )
- {
- // call the function
- ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl -1 ] , theacc, thesrc, thecst, ARRLEN );
- }
- );
- }
-
- if ( !mute )
- printf( "%02.8f;%013.3f\n", elapsed - dt_test_overhead,
- ( 1.0f * max * ARRLEN / ( elapsed - dt_test_overhead )) );
-
- return NE10_OK;
-}
-
-/*
- * Command-line driver for the operations registered in ftbl.
- *
- *   argc == 2 : argv[1] must be "0"; the number of operations (OP_COUNT) is returned.
- *   argc == 4 : argv[1] = opcode (> 0), argv[2] = implementation (0 = run all and
- *               cross-check their outputs), argv[3] = iteration count (> 0).
- *
- * Any malformed request terminates the process with exit( NE10_ERR ).
- * Returns NE10_OK once the requested run (and, for impl == 0, the comparison) succeeded.
- */
-arm_result_t run_test( int argc, char **argv )
-{
-    if ( argc == 2 ) // requesting the number of available operations/routines in this unit
-    {
-        opcode = atoi ( argv[1] ); // get the command being requested, 0 = return the number of functions in this unit
-        if ( opcode == 0 ) return OP_COUNT;
-        exit( NE10_ERR );
-    }
-    else if ( argc == 4 ) // requesting a particular implementation of one of the operations
-    {
-        opcode = atoi ( argv[1] );
-        if ( opcode <= 0 ) exit( NE10_ERR );
-        impl = atoi ( argv[2] );
-        if ( impl < 0 ) exit( NE10_ERR ); // impl == 0 means run all and compare the results to verify they produce identical outputs
-
-        // 'max' is unsigned: a negative count would wrap to a huge value and pass a "<= 0" test,
-        // so the sign is validated on a signed temporary before it is stored.
-        const int iterations = atoi ( argv[3] );
-        if ( iterations <= 0 ) exit( NE10_ERR );
-        max = (unsigned int) iterations;
-    }
-    else exit( NE10_ERR );
-
-    // initialize the table with NULL
-    memset( ftbl, 0, sizeof(ftbl));
-
-    // manually initialize the functions which have actual implementations
-    init_ftbl(); // this function is implemented in the unit test source file
-
-    if ( opcode <= 0 || opcode > OP_COUNT
-         || impl < 0 || impl > IMPL_COUNT )
-    {
-        fprintf ( stderr, "\t WARNING: Operation number %d and/or implementaion number %d are not acceptable values. \n", opcode, impl );
-        exit( NE10_ERR );
-    }
-
-    if ( impl == 0 ) // run all implementations and verify
-    {
-        // first, make sure all of the implementations do exist
-        for ( i = FTBL_IDX(opcode, 1); i <= FTBL_IDX(opcode, IMPL_COUNT); i++ )
-        {
-            if ( NULL == ftbl[i] )
-            {
-                fprintf ( stderr, "\t WARNING: One or more implementations of operation number %d were not found. \n", opcode );
-                exit( NE10_ERR );
-            }
-        }
-
-        // try all the implementations here..
-        mute = 1; // do not print anything
-
-        // opcode remains the same but we iterate through different implementations here..
-        for ( impl = 1; impl <= IMPL_COUNT; impl++ )
-        {
-            test_operation();
-        }
-
-        // now verify: stage one item per implementation in _output and compare them to implementation 1
-        int warns = 0;
-        int item_width = opcode; // there's no easy way to guess the actual number of an item's components but using the opcode (1=float, 2=vec2, ...)
-        arm_float_t * _output = (arm_float_t*) malloc( IMPL_COUNT * sizeof(arm_float_t) * item_width );
-        if ( NULL == _output )
-        {
-            fprintf ( stderr, "\t FATAL ERROR: Could not allocate the verification buffer. \n" );
-            exit( NE10_ERR );
-        }
-
-        for ( i = 0; i < ARRLEN; i++ )
-        {
-            for ( impl = 1; impl <= IMPL_COUNT; impl++ )
-            {
-                memcpy ( &_output[ (impl-1) * item_width ], &thedst[ impl-1 ][ i * item_width ], sizeof(arm_float_t) * item_width );
-            }
-
-            int pos = 0;
-            for ( impl = 2; impl <= IMPL_COUNT; impl++ ) // compare the output from the 2nd, 3rd, 4th, etc. to the first one so start at 2
-            {
-                for ( pos = 0; pos < item_width; pos++ ) // compare corresponding components of the items
-                {
-                    const arm_float_t expected = _output[ pos ];                           // implementation 1
-                    const arm_float_t actual   = _output[ ((impl-1)*item_width) + pos ];   // implementation 'impl'
-
-                    assert ( expected == expected ); // check for not-a-number
-                    assert ( actual == actual );     // check for not-a-number
-
-                    if ( ! EQUALS_FLOAT( expected, actual, ERROR_MARGIN_SMALL ) )
-                    {
-                        fprintf( stderr, "\t\t WARNING: In opcode [%d], implementation [1] != implemenation [%d] on item [%d -> %d]\n",
-                                 opcode, impl, i, pos+1 );
-                        warns++;
-                    }
-
-                    // stop once the number of warnings reaches ACCEPTABLE_WARNS
-                    if ( warns >= ACCEPTABLE_WARNS )
-                    {
-                        fprintf ( stderr, "\t WARNING: One or more mismatching values were found. \n" );
-                        exit( NE10_ERR );
-                    }
-                }
-            }
-        }
-        free( _output ); _output = (arm_float_t *) NULL;
-
-        if ( warns >= ACCEPTABLE_WARNS )
-        {
-            fprintf ( stderr, "\t WARNING: One or more mismatching values were found. \n" );
-            exit( NE10_ERR );
-        }
-        // success: fall through so the guarded buffers are released below instead of leaking
-    }
-    else // run a particular implementation
-    {
-        if ( !mute ) printf( "opcode=%d;impl=%d;%d;%d;", opcode, impl, ARRLEN, max );
-
-        // get the overhead of an empty loop with the same trip count
-        MEASURE( dt_test_overhead,
-            for ( i = 0 ; i < max; i++ )
-            {
-            }
-        );
-
-        test_operation();
-    }
-
-    // free any allocated memory...
-    free( guarded_cst );
-    free( guarded_src );
-    free( guarded_acc );
-    for ( i = 0; i<IMPL_COUNT; i++ )
-    {
-        free( guarded_dst[i] );
-    }
-
-    return NE10_OK;
-}
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : headers/unit_test_addmat_operation_x.h
- */
-
-#include "./unit_test_common.h"
-#include "../inc/NE10_types.h"
-
-// Signature shared by every function stored in ftbl for this unit: (dst, cst, src, count).
-// test_operation() passes thecst as 'cst' and thesrc as 'src'.
-typedef arm_result_t (*arm_func_4args_t)(void * dst, void * cst, void * src, unsigned int count);
-// Function table; entries are looked up with FTBL_IDX(opcode, impl).
-arm_func_4args_t ftbl[ OP_COUNT * IMPL_COUNT ];
-
-
-// this function is implemented in the unit test source files
-// it is meant to initialise the function table defined above.
-extern void init_ftbl();
-
-
-unsigned int i = 0; // loop iterator
-unsigned int max = 0; // number of iterations in each function
-int opcode = -1; // the operation which will be tested (a single unit can have any number of operations/functions)
-int impl = -1; // selects which particular implementation of the chosen operation must run
-int mute = 0; // 0 == print output; 1 == do not print anything;
-
-// Timing state. NOTE(review): before/after/lapsed/dummy/zone are presumably used by the MEASURE macro - confirm in unit_test_common.h.
-struct timeval before, after, lapsed, dummy;
-double dt_test_overhead = 0.0; // time of an empty loop of 'max' iterations, subtracted from 'elapsed' when reporting
-double dt_test_sample = 0.0; // time of the single sample run
-double elapsed = 0.0; // time of the 'max' timed iterations
-struct timezone zone;
-
-// there is a max of "16" components in a matrix
-#define MAX_VEC_COMPONENTS 16
-
-// Raw allocations; each includes ARRAY_GUARD_LEN guard bytes before and after the payload.
-arm_float_t * guarded_cst = NULL;
-arm_float_t * guarded_src = NULL;
-arm_float_t * guarded_dst[IMPL_COUNT];
-
-// Payload pointers into the guarded allocations above (16 bytes past their start).
-arm_float_t * thecst = NULL;
-arm_float_t * thesrc = NULL;
-arm_float_t * thedst[IMPL_COUNT]; // output from different implementations are stored in separate arrays for verification
-int done_init = 0; // set to 1 once test_operation() has allocated and filled the buffers above
-
-// Eight buffers that are used for special test cases such as when the destination and source point to the same address.
-// They may vary in size from one case to another and from one function to another.
-arm_float_t* esp_buf[8];
-
-/*
- * Exercises the function selected by the globals 'opcode' and 'impl':
- *   1. lazily allocates and fills the guarded cst/src/dst buffers (first call only);
- *   2. checks that an in-place call (dst == src) gives the same result as an out-of-place call;
- *   3. performs one sample run and verifies that no array guard was overwritten;
- *   4. calls the function with a zero count;
- *   5. times 'max' iterations and, unless 'mute' is set, prints the timing.
- *
- * Any failure terminates the process with exit( NE10_ERR ); otherwise NE10_OK is returned.
- */
-arm_result_t test_operation()
-{
-    const unsigned int fixed_length  = ARRLEN * sizeof(arm_float_t) * MAX_VEC_COMPONENTS;
-    const unsigned int guarded_bytes = (2*ARRAY_GUARD_LEN) + fixed_length; // payload plus a guard at each end
-
-    // initialize if not done so
-    if ( 0 == done_init )
-    {
-        guarded_cst = (arm_float_t*) malloc( guarded_bytes );
-        guarded_src = (arm_float_t*) malloc( guarded_bytes );
-        if ( NULL == guarded_cst || NULL == guarded_src )
-        {
-            fprintf ( stderr, "\t FATAL ERROR: Could not allocate the test buffers. \n" );
-            exit( NE10_ERR );
-        }
-
-        // NOTE(review): the payload offset is hard-coded to 16 bytes; presumably equal to ARRAY_GUARD_LEN - confirm.
-        // The arithmetic is done on char* because pointer arithmetic on void* is not standard C.
-        GUARD_ARRAY( guarded_cst, guarded_bytes );
-        thecst = (arm_float_t*) ( (char*)guarded_cst + 16 );
-        FILL_FLOAT_ARRAY_LIMIT_GT1( thecst, ARRLEN * MAX_VEC_COMPONENTS ); // random initialization
-
-        GUARD_ARRAY( guarded_src, guarded_bytes );
-        thesrc = (arm_float_t*) ( (char*)guarded_src + 16 );
-        FILL_FLOAT_ARRAY_LIMIT_GT1( thesrc, ARRLEN * MAX_VEC_COMPONENTS ); // random initialization
-
-        for ( i = 0; i<IMPL_COUNT; i++ )
-        {
-            guarded_dst[i] = (arm_float_t*) malloc( guarded_bytes );
-            if ( NULL == guarded_dst[i] )
-            {
-                fprintf ( stderr, "\t FATAL ERROR: Could not allocate the test buffers. \n" );
-                exit( NE10_ERR );
-            }
-            GUARD_ARRAY( guarded_dst[i], guarded_bytes );
-            thedst[i] = (arm_float_t*) ( (char*)guarded_dst[i] + 16 );
-        }
-
-        done_init = 1;
-    }
-
-    // test the special case where dst == src
-    const unsigned int tmp_len = 13; // Just an odd number bigger than 8
-    const unsigned int inbytes = tmp_len * MAX_VEC_COMPONENTS * sizeof(arm_float_t);
-    // opcode 1 = mat2x2, 2 = mat3x3, ... so one item holds (opcode+1)^2 floats, not (opcode+1)
-    const unsigned int floats_per_item = (unsigned int)( (opcode+1) * (opcode+1) );
-
-    esp_buf[1] = (arm_float_t*) malloc( inbytes ); // input 2
-    esp_buf[3] = (arm_float_t*) malloc( inbytes ); // copy of 2nd input
-    esp_buf[4] = (arm_float_t*) malloc( inbytes ); // use this as the output buffer
-    if ( NULL == esp_buf[1] || NULL == esp_buf[3] || NULL == esp_buf[4] )
-    {
-        fprintf ( stderr, "\t FATAL ERROR: Could not allocate the test buffers. \n" );
-        exit( NE10_ERR );
-    }
-
-    FILL_FLOAT_ARRAY_LIMIT( esp_buf[1], tmp_len * MAX_VEC_COMPONENTS ); // initialize the array with random numbers
-    memcpy( esp_buf[3], esp_buf[1], inbytes );
-
-    ftbl [ FTBL_IDX(opcode, impl) ] ( esp_buf[1] , thecst, esp_buf[1], tmp_len ); // in place
-    ftbl [ FTBL_IDX(opcode, impl) ] ( esp_buf[4] , thecst, esp_buf[3], tmp_len ); // out of place
-
-    for ( i = 0; i < tmp_len * floats_per_item; i++ ) // at this point the two outputs must be identical
-    {
-        if ( esp_buf[1][i] != esp_buf[4][i] )
-        {
-            fprintf ( stderr, "\t FATAL ERROR: Operation number %d implementation [%d] has failed the dst==src test case. \n", opcode, impl );
-            fprintf ( stderr, "\t NOTE: Usually implementation 1=C, 2=ASM/VFP, and 3=ASM/NEON. \n");
-            exit( NE10_ERR );
-        }
-    }
-
-    free(esp_buf[1]); free(esp_buf[3]); free(esp_buf[4]);
-
-    // sample run
-    MEASURE( dt_test_sample,
-        ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl-1 ] , thecst, thesrc, ARRLEN );
-    );
-    if ( ! CHECK_ARRAY_GUARD(guarded_dst[ impl -1 ], guarded_bytes) ||
-         ! CHECK_ARRAY_GUARD(guarded_cst, guarded_bytes) ||
-         ! CHECK_ARRAY_GUARD(guarded_src, guarded_bytes) )
-    {
-        fprintf ( stderr, "\t FATAL ERROR: Operation number %d implementation [%d] has failed the guard test. \n", opcode, impl );
-        exit( NE10_ERR );
-    }
-
-    // this test to make sure passing zero as the length won't cause segmentation faults
-    ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl-1 ] , thecst, thesrc, 0 );
-
-    // actual test
-    MEASURE( elapsed,
-        for ( i = 0; i < max; i++ )
-        {
-            // call the function
-            ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl -1 ] , thecst, thesrc, ARRLEN );
-        }
-    );
-
-    if ( !mute )
-        printf( "%02.8f;%013.3f\n", elapsed - dt_test_overhead,
-                ( 1.0f * max * ARRLEN / ( elapsed - dt_test_overhead )) );
-
-    return NE10_OK;
-}
-
-/*
- * Command-line driver for the matrix operations registered in ftbl.
- *
- *   argc == 2 : argv[1] must be "0"; the number of operations (OP_COUNT) is returned.
- *   argc == 4 : argv[1] = opcode (> 0), argv[2] = implementation (0 = run all and
- *               cross-check their outputs), argv[3] = iteration count (> 0).
- *
- * Any malformed request terminates the process with exit( NE10_ERR ).
- * Returns NE10_OK once the requested run (and, for impl == 0, the comparison) succeeded.
- */
-arm_result_t run_test( int argc, char **argv )
-{
-    if ( argc == 2 ) // requesting the number of available operations/routines in this unit
-    {
-        opcode = atoi ( argv[1] ); // get the command being requested, 0 = return the number of functions in this unit
-        if ( opcode == 0 ) return OP_COUNT;
-        exit( NE10_ERR );
-    }
-    else if ( argc == 4 ) // requesting a particular implementation of one of the operations
-    {
-        opcode = atoi ( argv[1] );
-        if ( opcode <= 0 ) exit( NE10_ERR );
-        impl = atoi ( argv[2] );
-        if ( impl < 0 ) exit( NE10_ERR ); // impl == 0 means run all and compare the results to verify they produce identical outputs
-
-        // 'max' is unsigned: a negative count would wrap to a huge value and pass a "<= 0" test,
-        // so the sign is validated on a signed temporary before it is stored.
-        const int iterations = atoi ( argv[3] );
-        if ( iterations <= 0 ) exit( NE10_ERR );
-        max = (unsigned int) iterations;
-    }
-    else exit( NE10_ERR );
-
-    // initialize the table with NULL
-    memset( ftbl, 0, sizeof(ftbl));
-
-    // manually initialize the functions which have actual implementations
-    init_ftbl(); // this function is implemented in the unit test source file
-
-    if ( opcode <= 0 || opcode > OP_COUNT
-         || impl < 0 || impl > IMPL_COUNT )
-    {
-        fprintf ( stderr, "\t WARNING: Operation number %d and/or implementaion number %d are not acceptable values. \n", opcode, impl );
-        exit( NE10_ERR );
-    }
-
-    if ( impl == 0 ) // run all implementations and verify
-    {
-        // first, make sure all of the implementations do exist
-        for ( i = FTBL_IDX(opcode, 1); i <= FTBL_IDX(opcode, IMPL_COUNT); i++ )
-        {
-            if ( NULL == ftbl[i] )
-            {
-                fprintf ( stderr, "\t WARNING: One or more implementations of operation number %d were not found. \n", opcode );
-                exit( NE10_ERR );
-            }
-        }
-
-        // try all the implementations here..
-        mute = 1; // do not print anything
-
-        // opcode remains the same but we iterate through different implementations here..
-        for ( impl = 1; impl <= IMPL_COUNT; impl++ )
-        {
-            test_operation();
-        }
-
-        // now verify: stage one whole matrix per implementation in _output and compare them to implementation 1
-        int warns = 0;
-        const int item_width = opcode+1;              // using the opcode (1=mat2x2, 2=mat3x3, ...)
-        const int item_len   = item_width * item_width; // floats in one matrix; this is the stride between items
-        arm_float_t * _output = (arm_float_t*) malloc( IMPL_COUNT * sizeof(arm_float_t) * item_len );
-        if ( NULL == _output )
-        {
-            fprintf ( stderr, "\t FATAL ERROR: Could not allocate the verification buffer. \n" );
-            exit( NE10_ERR );
-        }
-
-        for ( i = 0; i < ARRLEN; i++ )
-        {
-            for ( impl = 1; impl <= IMPL_COUNT; impl++ )
-            {
-                memcpy ( &_output[ (impl-1) * item_len ], &thedst[ impl-1 ][ i * item_len ], sizeof(arm_float_t) * item_len );
-            }
-
-            int pos = 0;
-            for ( impl = 2; impl <= IMPL_COUNT; impl++ ) // compare the output from the 2nd, 3rd, 4th, etc. to the first one so start at 2
-            {
-                for ( pos = 0; pos < item_len; pos++ ) // compare corresponding elements of the matrices
-                {
-                    const arm_float_t expected = _output[ pos ];                         // implementation 1
-                    const arm_float_t actual   = _output[ ((impl-1)*item_len) + pos ];   // implementation 'impl'
-
-                    assert ( expected == expected ); // check for not-a-number
-                    assert ( actual == actual );     // check for not-a-number
-
-                    if ( ! EQUALS_FLOAT( expected, actual, ERROR_MARGIN_SMALL ) )
-                    {
-                        fprintf( stderr, "\t\t WARNING: In opcode [%d], implementation [1] != implemenation [%d] on item [%d -> %d]\n",
-                                 opcode, impl, i, pos+1 );
-                        warns++;
-                    }
-
-                    // stop once the number of warnings reaches ACCEPTABLE_WARNS_MATRICES
-                    if ( warns >= ACCEPTABLE_WARNS_MATRICES )
-                    {
-                        fprintf ( stderr, "\t WARNING: One or more mismatching values were found. \n" );
-                        exit( NE10_ERR );
-                    }
-                }
-            }
-        }
-        free( _output ); _output = (arm_float_t *) NULL;
-
-        if ( warns >= ACCEPTABLE_WARNS_MATRICES )
-        {
-            fprintf ( stderr, "\t WARNING: One or more mismatching values were found. \n" );
-            exit( NE10_ERR );
-        }
-        // success: fall through so the guarded buffers are released below instead of leaking
-    }
-    else // run a particular implementation
-    {
-        if ( !mute ) printf( "opcode=%d;impl=%d;%d;%d;", opcode, impl, ARRLEN, max );
-
-        // get the overhead of an empty loop with the same trip count
-        MEASURE( dt_test_overhead,
-            for ( i = 0 ; i < max; i++ )
-            {
-            }
-        );
-
-        test_operation();
-    }
-
-    // free any allocated memory...
-    free( guarded_cst );
-    free( guarded_src );
-    for ( i = 0; i<IMPL_COUNT; i++ )
-    {
-        free( guarded_dst[i] );
-    }
-
-    return NE10_OK;
-}
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : headers/unit_test_normalize_operation_x.h
- */
-
-#include "./unit_test_common.h"
-
-// Signature shared by every function stored in ftbl for this unit: (dst, src, count).
-typedef arm_result_t (*arm_func_3args_t)(void * dst, void * src, unsigned int count);
-// Function table; entries are looked up with FTBL_IDX(opcode, impl).
-arm_func_3args_t ftbl[ OP_COUNT * IMPL_COUNT ];
-
-
-// this function is implemented in the unit test source files
-// it is meant to initialise the function table defined above.
-extern void init_ftbl();
-
-
-unsigned int i = 0; // loop iterator
-unsigned int max = 0; // number of iterations in each function
-int opcode = -1; // the operation which will be tested (a single unit can have any number of operations/functions)
-int impl = -1; // selects which particular implementation of the chosen operation must run
-int mute = 0; // 0 == print output; 1 == do not print anything;
-
-// Timing state. NOTE(review): before/after/lapsed/dummy/zone are presumably used by the MEASURE macro - confirm in unit_test_common.h.
-struct timeval before, after, lapsed, dummy;
-double dt_test_overhead = 0.0; // time of an empty loop of 'max' iterations, subtracted from 'elapsed' when reporting
-double dt_test_sample = 0.0;
-double elapsed = 0.0; // time of the 'max' timed iterations
-struct timezone zone;
-
-// there is a max of "4" components in a vec
-#define MAX_VEC_COMPONENTS 4
-
-// Raw allocations; each includes ARRAY_GUARD_LEN guard bytes before and after the payload.
-arm_float_t * guarded_src = NULL;
-arm_float_t * guarded_dst[IMPL_COUNT];
-
-// Payload pointers into the guarded allocations above (16 bytes past their start).
-arm_float_t * thesrc = NULL;
-arm_float_t * thedst[IMPL_COUNT]; // output from different implementations are stored in separate arrays for verification
-int done_init = 0; // set to 1 once test_operation() has allocated and filled the buffers above
-
-// Eight buffers that are used for special test cases such as when the destination and source point to the same address.
-// They may vary in size from one case to another and from one function to another.
-arm_float_t* esp_buf[8];
-
-/*
- * Exercises the function selected by the globals 'opcode' and 'impl':
- *   1. lazily allocates and fills the guarded src/dst buffers (first call only);
- *   2. checks that an in-place call (dst == src) gives the same result as an out-of-place call;
- *   3. performs one sample run and verifies that no array guard was overwritten;
- *   4. calls the function with a zero count;
- *   5. times 'max' iterations and, unless 'mute' is set, prints the timing.
- *
- * Any failure terminates the process with exit( NE10_ERR ); otherwise NE10_OK is returned.
- */
-arm_result_t test_operation()
-{
-    const unsigned int fixed_length  = ARRLEN * sizeof(arm_float_t) * MAX_VEC_COMPONENTS;
-    const unsigned int guarded_bytes = (2*ARRAY_GUARD_LEN) + fixed_length; // payload plus a guard at each end
-
-    // initialize if not done so
-    if ( 0 == done_init )
-    {
-        guarded_src = (arm_float_t*) malloc( guarded_bytes );
-        if ( NULL == guarded_src )
-        {
-            fprintf ( stderr, "\t FATAL ERROR: Could not allocate the test buffers. \n" );
-            exit( NE10_ERR );
-        }
-
-        // NOTE(review): the payload offset is hard-coded to 16 bytes; presumably equal to ARRAY_GUARD_LEN - confirm.
-        // The arithmetic is done on char* because pointer arithmetic on void* is not standard C.
-        GUARD_ARRAY( guarded_src, guarded_bytes );
-        thesrc = (arm_float_t*) ( (char*)guarded_src + 16 );
-        FILL_FLOAT_ARRAY_LIMIT( thesrc, ARRLEN * MAX_VEC_COMPONENTS ); // random initialization
-
-        for ( i = 0; i<IMPL_COUNT; i++ )
-        {
-            guarded_dst[i] = (arm_float_t*) malloc( guarded_bytes );
-            if ( NULL == guarded_dst[i] )
-            {
-                fprintf ( stderr, "\t FATAL ERROR: Could not allocate the test buffers. \n" );
-                exit( NE10_ERR );
-            }
-            GUARD_ARRAY( guarded_dst[i], guarded_bytes );
-            thedst[i] = (arm_float_t*) ( (char*)guarded_dst[i] + 16 );
-        }
-
-        done_init = 1;
-    }
-
-    // test the special case where dst == src
-    const unsigned int tmp_len = 13; // Just an odd number bigger than 8
-    const unsigned int inbytes = tmp_len * MAX_VEC_COMPONENTS * sizeof(arm_float_t);
-    // opcode 1 = vec2, 2 = vec3, 3 = vec4, so one item holds opcode+1 floats
-    const unsigned int floats_per_item = (unsigned int)( opcode + 1 );
-
-    esp_buf[0] = (arm_float_t*) malloc( inbytes ); // input 1
-    esp_buf[2] = (arm_float_t*) malloc( inbytes ); // copy of 1st input
-    esp_buf[4] = (arm_float_t*) malloc( inbytes ); // use this as the output buffer
-    if ( NULL == esp_buf[0] || NULL == esp_buf[2] || NULL == esp_buf[4] )
-    {
-        fprintf ( stderr, "\t FATAL ERROR: Could not allocate the test buffers. \n" );
-        exit( NE10_ERR );
-    }
-
-    FILL_FLOAT_ARRAY_LIMIT( esp_buf[0], tmp_len * MAX_VEC_COMPONENTS ); // initialize the array with random numbers
-    memcpy( esp_buf[2], esp_buf[0], inbytes );
-
-    ftbl [ FTBL_IDX(opcode, impl) ] ( esp_buf[0] , esp_buf[0], tmp_len ); // in place
-    ftbl [ FTBL_IDX(opcode, impl) ] ( esp_buf[4] , esp_buf[2], tmp_len ); // out of place
-
-    for ( i = 0; i < tmp_len * floats_per_item; i++ ) // at this point the two outputs must be identical
-    {
-        if ( esp_buf[0][i] != esp_buf[4][i] )
-        {
-            fprintf ( stderr, "\t FATAL ERROR: Operation number %d implementation [%d] has failed the dst==src test case. \n", opcode, impl );
-            fprintf ( stderr, "\t NOTE: Usually implementation 1=C, 2=ASM/VFP, and 3=ASM/NEON. \n");
-            exit( NE10_ERR );
-        }
-    }
-
-    free(esp_buf[0]); free(esp_buf[2]); free(esp_buf[4]);
-
-    // sample run (timed into dt_test_sample so that 'elapsed' only ever holds the real measurement)
-    MEASURE( dt_test_sample,
-        ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl -1 ] , thesrc, ARRLEN );
-    );
-    if ( ! CHECK_ARRAY_GUARD(guarded_dst[ impl -1 ], guarded_bytes) ||
-         ! CHECK_ARRAY_GUARD(guarded_src, guarded_bytes) )
-    {
-        fprintf ( stderr, "\t FATAL ERROR: Operation number %d implementation [%d] has failed the guard test. \n", opcode, impl );
-        exit( NE10_ERR );
-    }
-
-    // this test to make sure passing zero as the length won't cause segmentation faults
-    ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl -1 ] , thesrc, 0 );
-
-    // actual test
-    MEASURE( elapsed,
-        for ( i = 0; i < max; i++ )
-        {
-            // call the function
-            ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl -1 ] , thesrc, ARRLEN );
-        }
-    );
-
-    if ( !mute )
-        printf( "%02.8f;%013.3f\n", elapsed - dt_test_overhead,
-                ( 1.0f * max * ARRLEN / ( elapsed - dt_test_overhead )) );
-
-    return NE10_OK;
-}
-
-/*
- * Command-line driver for the vector operations registered in ftbl.
- *
- *   argc == 2 : argv[1] must be "0"; the number of operations (OP_COUNT) is returned.
- *   argc == 4 : argv[1] = opcode (> 0), argv[2] = implementation (0 = run all and
- *               cross-check their outputs), argv[3] = iteration count (> 0).
- *
- * Any malformed request terminates the process with exit( NE10_ERR ).
- * Returns NE10_OK once the requested run (and, for impl == 0, the comparison) succeeded.
- */
-arm_result_t run_test( int argc, char **argv )
-{
-    if ( argc == 2 ) // requesting the number of available operations/routines in this unit
-    {
-        opcode = atoi ( argv[1] ); // get the command being requested, 0 = return the number of functions in this unit
-        if ( opcode == 0 ) return OP_COUNT;
-        exit( NE10_ERR );
-    }
-    else if ( argc == 4 ) // requesting a particular implementation of one of the operations
-    {
-        opcode = atoi ( argv[1] );
-        if ( opcode <= 0 ) exit( NE10_ERR );
-        impl = atoi ( argv[2] );
-        if ( impl < 0 ) exit( NE10_ERR ); // impl == 0 means run all and compare the results to verify they produce identical outputs
-
-        // 'max' is unsigned: a negative count would wrap to a huge value and pass a "<= 0" test,
-        // so the sign is validated on a signed temporary before it is stored.
-        const int iterations = atoi ( argv[3] );
-        if ( iterations <= 0 ) exit( NE10_ERR );
-        max = (unsigned int) iterations;
-    }
-    else exit( NE10_ERR );
-
-    // initialize the table with NULL
-    memset( ftbl, 0, sizeof(ftbl));
-
-    // manually initialize the functions which have actual implementations
-    init_ftbl(); // this function is implemented in the unit test source files
-
-    if ( opcode <= 0 || opcode > OP_COUNT
-         || impl < 0 || impl > IMPL_COUNT )
-    {
-        fprintf ( stderr, "\t WARNING: Operation number %d and/or implementaion number %d are not acceptable values. \n", opcode, impl );
-        exit( NE10_ERR );
-    }
-
-    if ( impl == 0 ) // run all implementations and verify
-    {
-        // first, make sure all of the implementations do exist
-        for ( i = FTBL_IDX(opcode, 1); i <= FTBL_IDX(opcode, IMPL_COUNT); i++ )
-        {
-            if ( NULL == ftbl[i] )
-            {
-                fprintf ( stderr, "\t WARNING: One or more implementations of operation number %d were not found. \n", opcode );
-                exit( NE10_ERR );
-            }
-        }
-
-        // try all the implementations here..
-        mute = 1; // do not print anything
-
-        // opcode remains the same but we iterate through different implementations here..
-        for ( impl = 1; impl <= IMPL_COUNT; impl++ )
-        {
-            test_operation();
-        }
-
-        // now verify: stage one item per implementation in _output and compare them to implementation 1
-        int warns = 0;
-        int item_width = opcode+1; // 1=vec2, 2=vec3, 3=vec4
-        arm_float_t * _output = (arm_float_t*) malloc( IMPL_COUNT * sizeof(arm_float_t) * item_width );
-        if ( NULL == _output )
-        {
-            fprintf ( stderr, "\t FATAL ERROR: Could not allocate the verification buffer. \n" );
-            exit( NE10_ERR );
-        }
-
-        for ( i = 0; i < ARRLEN; i++ )
-        {
-            for ( impl = 1; impl <= IMPL_COUNT; impl++ )
-            {
-                memcpy ( &_output[ (impl-1) * item_width ], &thedst[ impl-1 ][ i * item_width ], sizeof(arm_float_t) * item_width );
-            }
-
-            int pos = 0;
-            for ( impl = 2; impl <= IMPL_COUNT; impl++ ) // compare the output from the 2nd, 3rd, 4th, etc. to the first one so start at 2
-            {
-                for ( pos = 0; pos < item_width; pos++ ) // compare corresponding components of the items
-                {
-                    const arm_float_t expected = _output[ pos ];                           // implementation 1
-                    const arm_float_t actual   = _output[ ((impl-1)*item_width) + pos ];   // implementation 'impl'
-
-                    assert ( expected == expected ); // check for not-a-number
-                    assert ( actual == actual );     // check for not-a-number
-
-                    if ( ! EQUALS_FLOAT( expected, actual, ERROR_MARGIN_LARGE ) ) // accept larger errors as we're doing a single step
-                    {
-                        fprintf( stderr, "\t\t WARNING: In opcode [%d], implementation [1] != implemenation [%d] on item [%d -> %d]\n",
-                                 opcode, impl, i, pos+1 );
-                        warns++;
-                    }
-
-                    // stop once the number of warnings reaches ACCEPTABLE_WARNS
-                    if ( warns >= ACCEPTABLE_WARNS )
-                    {
-                        fprintf ( stderr, "\t WARNING: One or more mismatching values were found. \n" );
-                        exit( NE10_ERR );
-                    }
-                }
-            }
-        }
-        free( _output ); _output = (arm_float_t *) NULL;
-
-        if ( warns >= ACCEPTABLE_WARNS )
-        {
-            fprintf ( stderr, "\t WARNING: One or more mismatching values were found. \n" );
-            exit( NE10_ERR );
-        }
-        // success: fall through so the guarded buffers are released below instead of leaking
-    }
-    else // run a particular implementation
-    {
-        if ( !mute ) printf( "opcode=%d;impl=%d;%d;%d;", opcode, impl, ARRLEN, max );
-
-        // get the overhead of an empty loop with the same trip count
-        MEASURE( dt_test_overhead,
-            for ( i = 0 ; i < max; i++ )
-            {
-            }
-        );
-
-        test_operation();
-    }
-
-    // free any allocated memory...
-    free( guarded_src );
-    for ( i = 0; i<IMPL_COUNT; i++ )
-    {
-        free( guarded_dst[i] );
-    }
-
-    return NE10_OK;
-}
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : headers/unit_test_setc_operation_x.h
- */
-
-#include "./unit_test_common.h"
-
-// Signature shared by every function stored in ftbl for this unit: (dst, cst, count).
-// For opcode 1 the timed loop in test_operation() passes the float's bit pattern in 'cst' instead of a pointer.
-typedef arm_result_t (*arm_func_3args_t)(void * dst, const void * cst, unsigned int count);
-// Function table; entries are looked up with FTBL_IDX(opcode, impl).
-arm_func_3args_t ftbl[ OP_COUNT * IMPL_COUNT ];
-
-
-// this function is implemented in the unit test source files
-// it is meant to initialise the function table defined above.
-extern void init_ftbl();
-
-
-unsigned int i = 0; // loop iterator
-unsigned int max = 0; // number of iterations in each function
-int opcode = -1; // the operation which will be tested (a single unit can have any number of operations/functions)
-int impl = -1; // selects which particular implementation of the chosen operation must run
-int mute = 0; // 0 == print output; 1 == do not print anything;
-
-// Timing state. NOTE(review): before/after/lapsed/dummy/zone are presumably used by the MEASURE macro - confirm in unit_test_common.h.
-struct timeval before, after, lapsed, dummy;
-double dt_test_overhead = 0.0; // time of an empty loop of 'max' iterations, subtracted from 'elapsed' when reporting
-double dt_test_sample = 0.0; // time of the single sample run
-double elapsed = 0.0; // time of the 'max' timed iterations
-struct timezone zone;
-
-// there is a max of "4" components in a vec
-#define MAX_VEC_COMPONENTS 4
-
-// Raw allocations; each includes ARRAY_GUARD_LEN guard bytes before and after the payload.
-arm_float_t * guarded_cst = NULL;
-arm_float_t * guarded_dst[IMPL_COUNT];
-
-// Payload pointers into the guarded allocations above (16 bytes past their start).
-arm_float_t * thecst = NULL;
-arm_float_t * thedst[IMPL_COUNT]; // output from different implementations are stored in separate arrays for verification
-int done_init = 0; // set to 1 once test_operation() has allocated and filled the buffers above
-
-/*
- * Exercises the function selected by the globals 'opcode' and 'impl':
- *   1. lazily allocates the guarded constant and destination buffers (first call only);
- *   2. performs one sample run and verifies that no array guard was overwritten;
- *   3. calls the function with a zero count;
- *   4. times 'max' iterations and, unless 'mute' is set, prints the timing.
- *
- * For opcode 1 the constant is handed over by value (its bit pattern travels in the
- * pointer-typed argument); for every other opcode a pointer to the constant is passed.
- *
- * Any failure terminates the process with exit( NE10_ERR ); otherwise NE10_OK is returned.
- */
-arm_result_t test_operation()
-{
-    const unsigned int fixed_length = ARRLEN * sizeof(arm_float_t) * MAX_VEC_COMPONENTS;
-    const unsigned int dst_bytes    = (2*ARRAY_GUARD_LEN) + fixed_length;                              // payload plus a guard at each end
-    const unsigned int cst_bytes    = (2*ARRAY_GUARD_LEN) + sizeof(arm_float_t) * MAX_VEC_COMPONENTS;  // one item plus guards
-
-    // initialize if not done so
-    if ( 0 == done_init )
-    {
-        guarded_cst = (arm_float_t*) malloc( cst_bytes );
-        if ( NULL == guarded_cst )
-        {
-            fprintf ( stderr, "\t FATAL ERROR: Could not allocate the test buffers. \n" );
-            exit( NE10_ERR );
-        }
-
-        // NOTE(review): the payload offset is hard-coded to 16 bytes; presumably equal to ARRAY_GUARD_LEN - confirm.
-        // The arithmetic is done on char* because pointer arithmetic on void* is not standard C.
-        GUARD_ARRAY( guarded_cst, cst_bytes );
-        thecst = (arm_float_t*) ( (char*)guarded_cst + 16 );
-        FILL_FLOAT_ARRAY( thecst, MAX_VEC_COMPONENTS ); // random initialization
-
-        for ( i = 0; i<IMPL_COUNT; i++ )
-        {
-            guarded_dst[i] = (arm_float_t*) malloc( dst_bytes );
-            if ( NULL == guarded_dst[i] )
-            {
-                fprintf ( stderr, "\t FATAL ERROR: Could not allocate the test buffers. \n" );
-                exit( NE10_ERR );
-            }
-            GUARD_ARRAY( guarded_dst[i], dst_bytes );
-            thedst[i] = (arm_float_t*) ( (char*)guarded_dst[i] + 16 );
-        }
-
-        done_init = 1;
-    }
-
-    // Pick the constant argument once so that every call below passes the same thing.
-    union fp_bitwise {
-        arm_float_t _f;
-        unsigned int _i;
-    } _icst;
-    const void * cst_arg = thecst;
-    if ( 1 == opcode )
-    {   // in this case the const argument is not a pointer but an actual float value
-        _icst._f = thecst[0];
-        cst_arg  = (void*)_icst._i;
-    }
-
-    // sample run
-    MEASURE( dt_test_sample,
-        ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl-1 ] , cst_arg, ARRLEN );
-    );
-    if ( ! CHECK_ARRAY_GUARD(guarded_dst[ impl -1 ], dst_bytes) ||
-         ! CHECK_ARRAY_GUARD(guarded_cst, cst_bytes) )
-    {
-        fprintf ( stderr, "\t FATAL ERROR: Operation number %d implementation [%d] has failed the guard test. \n", opcode, impl );
-        exit( NE10_ERR );
-    }
-
-    // this test to make sure passing zero as the length won't cause segmentation faults
-    ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl-1 ] , cst_arg, 0 );
-
-    // actual test
-    MEASURE( elapsed,
-        for ( i = 0; i < max; i++ )
-        {
-            // call the function
-            ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl -1 ] , cst_arg, ARRLEN );
-        }
-    );
-
-    if ( !mute )
-        printf( "%02.8f;%013.3f\n", elapsed - dt_test_overhead,
-                ( 1.0f * max * ARRLEN / ( elapsed - dt_test_overhead )) );
-
-    return NE10_OK;
-}
-
-/*
- * Command-line driver for the operations registered in ftbl.
- *
- *   argc == 2 : argv[1] must be "0"; the number of operations (OP_COUNT) is returned.
- *   argc == 4 : argv[1] = opcode (> 0), argv[2] = implementation (0 = run all and
- *               cross-check their outputs), argv[3] = iteration count (> 0).
- *
- * Any malformed request terminates the process with exit( NE10_ERR ).
- * Returns NE10_OK once the requested run (and, for impl == 0, the comparison) succeeded.
- */
-arm_result_t run_test( int argc, char **argv )
-{
-    if ( argc == 2 ) // requesting the number of available operations/routines in this unit
-    {
-        opcode = atoi ( argv[1] ); // get the command being requested, 0 = return the number of functions in this unit
-        if ( opcode == 0 ) return OP_COUNT;
-        exit( NE10_ERR );
-    }
-    else if ( argc == 4 ) // requesting a particular implementation of one of the operations
-    {
-        opcode = atoi ( argv[1] );
-        if ( opcode <= 0 ) exit( NE10_ERR );
-        impl = atoi ( argv[2] );
-        if ( impl < 0 ) exit( NE10_ERR ); // impl == 0 means run all and compare the results to verify they produce identical outputs
-
-        // 'max' is unsigned: a negative count would wrap to a huge value and pass a "<= 0" test,
-        // so the sign is validated on a signed temporary before it is stored.
-        const int iterations = atoi ( argv[3] );
-        if ( iterations <= 0 ) exit( NE10_ERR );
-        max = (unsigned int) iterations;
-    }
-    else exit( NE10_ERR );
-
-    // initialize the table with NULL
-    memset( ftbl, 0, sizeof(ftbl));
-
-    // manually initialize the functions which have actual implementations
-    init_ftbl(); // this function is implemented in the unit test source file
-
-    if ( opcode <= 0 || opcode > OP_COUNT
-         || impl < 0 || impl > IMPL_COUNT )
-    {
-        fprintf ( stderr, "\t WARNING: Operation number %d and/or implementaion number %d are not acceptable values. \n", opcode, impl );
-        exit( NE10_ERR );
-    }
-
-    if ( impl == 0 ) // run all implementations and verify
-    {
-        // first, make sure all of the implementations do exist
-        for ( i = FTBL_IDX(opcode, 1); i <= FTBL_IDX(opcode, IMPL_COUNT); i++ )
-        {
-            if ( NULL == ftbl[i] )
-            {
-                fprintf ( stderr, "\t WARNING: One or more implementations of operation number %d were not found. \n", opcode );
-                exit( NE10_ERR );
-            }
-        }
-
-        // try all the implementations here..
-        mute = 1; // do not print anything
-
-        // opcode remains the same but we iterate through different implementations here..
-        for ( impl = 1; impl <= IMPL_COUNT; impl++ )
-        {
-            test_operation();
-        }
-
-        // now verify: stage one item per implementation in _output and compare them to implementation 1
-        int warns = 0;
-        int item_width = opcode; // there's no easy way to guess the actual number of an item's components but using the opcode (1=float, 2=vec2, ...)
-        arm_float_t * _output = (arm_float_t*) malloc( IMPL_COUNT * sizeof(arm_float_t) * item_width );
-        if ( NULL == _output )
-        {
-            fprintf ( stderr, "\t FATAL ERROR: Could not allocate the verification buffer. \n" );
-            exit( NE10_ERR );
-        }
-
-        for ( i = 0; i < ARRLEN; i++ )
-        {
-            for ( impl = 1; impl <= IMPL_COUNT; impl++ )
-            {
-                memcpy ( &_output[ (impl-1) * item_width ], &thedst[ impl-1 ][ i * item_width ], sizeof(arm_float_t) * item_width );
-            }
-
-            int pos = 0;
-            for ( impl = 2; impl <= IMPL_COUNT; impl++ ) // compare the output from the 2nd, 3rd, 4th, etc. to the first one so start at 2
-            {
-                for ( pos = 0; pos < item_width; pos++ ) // compare corresponding components of the items
-                {
-                    const arm_float_t expected = _output[ pos ];                           // implementation 1
-                    const arm_float_t actual   = _output[ ((impl-1)*item_width) + pos ];   // implementation 'impl'
-
-                    assert ( expected == expected ); // check for not-a-number
-                    assert ( actual == actual );     // check for not-a-number
-
-                    if ( ! EQUALS_FLOAT( expected, actual, ERROR_MARGIN_SMALL ) )
-                    {
-                        fprintf( stderr, "\t\t WARNING: In opcode [%d], implementation [1] != implemenation [%d] on item [%d -> %d]\n",
-                                 opcode, impl, i, pos+1 );
-                        warns++;
-                    }
-
-                    // stop once the number of warnings reaches ACCEPTABLE_WARNS
-                    if ( warns >= ACCEPTABLE_WARNS )
-                    {
-                        fprintf ( stderr, "\t WARNING: One or more mismatching values were found. \n" );
-                        exit( NE10_ERR );
-                    }
-                }
-            }
-        }
-        free( _output ); _output = (arm_float_t *) NULL;
-
-        if ( warns >= ACCEPTABLE_WARNS )
-        {
-            fprintf ( stderr, "\t WARNING: One or more mismatching values were found. \n" );
-            exit( NE10_ERR );
-        }
-        // success: fall through so the guarded buffers are released below instead of leaking
-    }
-    else // run a particular implementation
-    {
-        if ( !mute ) printf( "opcode=%d;impl=%d;%d;%d;", opcode, impl, ARRLEN, max );
-
-        // get the overhead of an empty loop with the same trip count
-        MEASURE( dt_test_overhead,
-            for ( i = 0 ; i < max; i++ )
-            {
-            }
-        );
-
-        test_operation();
-    }
-
-    // free any allocated memory...
-    free( guarded_cst );
-    for ( i = 0; i<IMPL_COUNT; i++ )
-    {
-        free( guarded_dst[i] );
-    }
-
-    return NE10_OK;
-}
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : headers/unit_test_x_operation_x.h
- */
-
-#include "./unit_test_common.h"
-
-// This function signature applies the operations with the format "*c_*_*" (e.g. 'add'c_'float'_'neon')
-typedef arm_result_t (*arm_func_4args_t)(void * dst, void * src1, void * src2, unsigned int count);
-arm_func_4args_t ftbl[ OP_COUNT * IMPL_COUNT ];
-
-
-// this function is implemented in the unit test source files
-// it is meant to initialise the function table defined above.
-extern void init_ftbl();
-
-
-unsigned int i = 0; // loop iterator
-unsigned int max = 0; // number of iterations in each function
-int opcode = -1; // the operation which will be tested (a single unit can have any number of operations/functions)
-int impl = -1; // selects which particular implementation of the chosen operation must run
-int mute = 0; // 0 == print output; 1 == do not print anything;
-
-struct timeval before, after, lapsed, dummy;
-double dt_test_overhead = 0.0;
-double dt_test_sample = 0.0;
-double elapsed = 0.0;
-struct timezone zone;
-
-// there is a max of "4" components in a vec
-#define MAX_VEC_COMPONENTS 4
-
-arm_float_t * guarded_src1 = NULL;
-arm_float_t * guarded_src2 = NULL;
-arm_float_t * guarded_dst[IMPL_COUNT];
-
-arm_float_t * thesrc1 = NULL;
-arm_float_t * thesrc2 = NULL;
-arm_float_t * thedst[IMPL_COUNT]; // output from different implementations are stored in separate arrays for varification
-int done_init = 0;
-
-// Eight buffers that are used for special test cases such as when the destination and source point to the same address.
-// They may vary in size from one case to another and from one function to another.
-arm_float_t* esp_buf[8];
-
-arm_result_t test_operation()
-{
- const unsigned int fixed_length = ARRLEN * sizeof(arm_float_t) * MAX_VEC_COMPONENTS;
-
- // initialize if not done so
- if ( 0 == done_init )
- {
- guarded_src1 = (arm_float_t*) malloc( (2*ARRAY_GUARD_LEN) + fixed_length ); // 16 extra bytes at the begining and 16 extra bytes at the end
- GUARD_ARRAY( guarded_src1, (2*ARRAY_GUARD_LEN) + fixed_length );
- thesrc1 = (arm_float_t*) ( (void*)guarded_src1 + 16);
- FILL_FLOAT_ARRAY( thesrc1, ARRLEN * MAX_VEC_COMPONENTS ); // random initialization
-
- guarded_src2 = (arm_float_t*) malloc( (2*ARRAY_GUARD_LEN) + fixed_length ); // 16 extra bytes at the begining and 16 extra bytes at the end
- GUARD_ARRAY( guarded_src2, (2*ARRAY_GUARD_LEN) + fixed_length );
- thesrc2 = (arm_float_t*) ( (void*)guarded_src2 + 16);
- FILL_FLOAT_ARRAY( thesrc2, ARRLEN * MAX_VEC_COMPONENTS ); // random initialization
-
- for ( i = 0; i<IMPL_COUNT; i++ )
- {
- guarded_dst[i] = (arm_float_t*) malloc( (2*ARRAY_GUARD_LEN) + fixed_length ); // 16 extra bytes at the begining and 16 extra bytes at the end
- GUARD_ARRAY( guarded_dst[i], (2*ARRAY_GUARD_LEN) + fixed_length );
- thedst[i] = (arm_float_t*) ( (void*)guarded_dst[i] + 16);
- }
-
- done_init = 1;
- }
-
-
- // test the special case where dst == src
- unsigned int tmp_len = 13; // Just an odd number bigger than 8
- unsigned int inbytes = tmp_len * MAX_VEC_COMPONENTS * sizeof(arm_float_t);
- esp_buf[0] = (arm_float_t*) malloc( inbytes ); // input 1
- esp_buf[1] = (arm_float_t*) malloc( inbytes ); // input 2
- esp_buf[2] = (arm_float_t*) malloc( inbytes ); // copy of 1st input
- esp_buf[3] = (arm_float_t*) malloc( inbytes ); // copy of 2nd input
- esp_buf[4] = (arm_float_t*) malloc( inbytes ); // use this as the output buffer
-
- FILL_FLOAT_ARRAY_LIMIT( esp_buf[0], tmp_len * MAX_VEC_COMPONENTS ); // initialize the array with random numbers
- FILL_FLOAT_ARRAY_LIMIT( esp_buf[1], tmp_len * MAX_VEC_COMPONENTS ); // initialize the array with random numbers
- memcpy( esp_buf[2], esp_buf[0], inbytes );
- memcpy( esp_buf[3], esp_buf[1], inbytes );
-
- ftbl [ FTBL_IDX(opcode, impl) ] ( esp_buf[0] , esp_buf[0], esp_buf[1], tmp_len );
- ftbl [ FTBL_IDX(opcode, impl) ] ( esp_buf[4] , esp_buf[2], esp_buf[3], tmp_len );
-
- for ( i = 0; i < tmp_len * opcode; i++ ) // at this point the two outputs must be identical
- {
- if ( esp_buf[0][i] != esp_buf[4][i] )
- {
- fprintf ( stderr, "\t FATAL ERROR: Operation number %d implementation [%d] has failed the dst==src test case. \n", opcode, impl );
- fprintf ( stderr, "\t NOTE: Usually implementation 1=C, 2=ASM/VFP, and 3=ASM/NEON. \n");
- exit( NE10_ERR );
- }
- }
-
- free(esp_buf[0]); free(esp_buf[1]); free(esp_buf[2]); free(esp_buf[3]); free(esp_buf[4]);
-
-
- // sample run
- MEASURE( dt_test_sample,
- ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl-1 ] , thesrc1, thesrc2, ARRLEN );
- );
- if ( ! CHECK_ARRAY_GUARD(guarded_dst[ impl -1 ], (2*ARRAY_GUARD_LEN) + fixed_length) ||
- ! CHECK_ARRAY_GUARD(guarded_src1, (2*ARRAY_GUARD_LEN) + fixed_length) ||
- ! CHECK_ARRAY_GUARD(guarded_src2, (2*ARRAY_GUARD_LEN) + fixed_length) )
- {
- fprintf ( stderr, "\t FATAL ERROR: Operation number %d implementation [%d] has failed the guard test. \n", opcode, impl );
- exit( NE10_ERR );
- }
-
- // this test to make sure passing zero as the length won't cause segmentation faults
- ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl-1 ] , thesrc1, thesrc2, 0 );
-
- MEASURE( elapsed,
- for ( i = 0; i < max; i++ )
- {
- // call the function
- ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl -1 ] , thesrc1, thesrc2, ARRLEN );
- }
- );
-
- if ( !mute )
- printf( "%02.8f;%013.3f\n", elapsed - dt_test_overhead,
- ( 1.0f * max * ARRLEN / ( elapsed - dt_test_overhead )) );
-
- return NE10_OK;
-}
-
-arm_result_t run_test( int argc, char **argv )
-{
- if ( argc == 2 ) // requesting the number of available operations/routines in this unit
- {
- opcode = atoi ( argv[1] ); // get the command being requested, 0 = return the number of functions in this unit
- if ( opcode == 0 ) return OP_COUNT;
- exit( NE10_ERR );
- } else if ( argc == 4 ) // requesting a particular implementation of one of the operations
- {
- opcode = atoi ( argv[1] );
- if ( opcode <= 0 ) exit( NE10_ERR );
- impl = atoi ( argv[2] );
- if ( impl < 0 ) exit( NE10_ERR ); // impl == 0 means run all and compare the results to verify they produce identical outputs
- max = atoi ( argv[3] );
- if ( max <= 0 ) exit( NE10_ERR );
- } else exit( NE10_ERR );
-
- // initialize the table with NULL
- memset( ftbl, 0, sizeof(ftbl));
-
- // manually initialize the functions which have actual implementations
- init_ftbl(); // this function is implemented in the unit test source file
-
- if ( opcode <= 0 || opcode > OP_COUNT
- || impl < 0 || impl > IMPL_COUNT )
- {
- fprintf ( stderr, "\t WARNING: Operation number %d and/or implementaion number %d are not acceptable values. \n", opcode, impl );
- exit( NE10_ERR );
- }
-
- if ( impl == 0 ) // run all implementations and verify
- {
- // first, make sure all of the implementations do exist
- for ( i = FTBL_IDX(opcode, 1); i <= FTBL_IDX(opcode, IMPL_COUNT); i++ )
- {
- if ( NULL == ftbl[i] )
- {
- fprintf ( stderr, "\t WARNING: One or more implementations of operation number %d were not found. \n", opcode );
- exit( NE10_ERR );
- }
- }
-
- // try all the implementatins here..
- mute = 1; // do not print anything
-
- // opcode remains the same but we iterate through different implementations here..
- for ( impl= 1; impl <= IMPL_COUNT; impl ++ )
- {
- test_operation();
- }
-
- // now verify
- arm_float_t * _output = NULL; // [ IMPL_COUNT * MAX_VEC_COMPONENTS ]; // one for each implementation, c, asm, neon...
- int warns = 0;
- int item_width = opcode; // there's no easy way to guess the actual number of an item's components but using the opcode (1=float, 2=vec2, ...)
- _output = (arm_float_t*) malloc( IMPL_COUNT * sizeof(arm_float_t) * item_width );
- for ( i = 0; i < ARRLEN; i++ )
- {
- for ( impl= 1; impl <= IMPL_COUNT; impl ++ )
- {
- memcpy ( &_output[ (impl-1) * item_width ], &thedst[ impl-1 ][ i * item_width ], sizeof(arm_float_t) * item_width );
- }
-
- int pos = 0;
- for ( impl = 2; impl <= IMPL_COUNT; impl ++ ) // compare the output from the 2nd, 3rd, 4th, etc. to the first one so start at 2
- {
- for ( pos = 0; pos < item_width; pos++ ) // compare corresponding components of the items
- {
- assert ( _output[ ((1-1)*item_width)+pos ] == _output[ ((1-1)*item_width)+pos ] ); // check for not-a-number
- assert ( _output[ ((impl-1)*item_width)+pos ] == _output[ ((impl-1)*item_width)+pos ] ); // check for not-a-number
-
- if ( ! EQUALS_FLOAT( _output[ ((1-1)*item_width)+pos ] , _output[ ((impl-1)*item_width)+pos ], ERROR_MARGIN_SMALL ) )
- { fprintf( stderr, "\t\t WARNING: In opcode [%d], implementation [1] != implemenation [%d] on item [%d -> %d]\n",
- opcode, impl, i, pos+1 );
- warns++; }
-
- // stop after 10 warnings
- if ( warns >= ACCEPTABLE_WARNS )
- { fprintf ( stderr, "\t WARNING: One or more mismatching values were found. \n" );
- exit( NE10_ERR );
- }
- }
- }
- }
- free( _output ); _output = (arm_float_t *) NULL;
-
- if ( warns < ACCEPTABLE_WARNS )
- {
- return NE10_OK;
- }
-
- fprintf ( stderr, "\t WARNING: One or more mismatching values were found. \n" );
- exit( NE10_ERR );
- }
- else // run a particular implementation
- {
- if ( !mute ) printf( "opcode=%d;impl=%d;%d;%d;", opcode, impl, ARRLEN, max );
-
- // ge the overhead
- MEASURE( dt_test_overhead,
- for ( i = 0 ; i < max; i++ )
- {
- }
- );
-
- test_operation();
- }
-
-
-
- // free any allocated memory...
- free( guarded_src1 );
- free( guarded_src2 );
- for ( i = 0; i<IMPL_COUNT; i++ )
- {
- free( guarded_dst[i] );
- }
-
- return NE10_OK;
-}
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : headers/unit_test_x_operation_x.h
- */
-
-#include "./unit_test_common.h"
-
-// This function signature applies the operations with the format "*c_*_*" (e.g. 'add'c_'float'_'neon')
-typedef arm_result_t (*arm_func_4args_t)(void * dst, void * src1, void * src2, unsigned int count);
-arm_func_4args_t ftbl[ OP_COUNT * IMPL_COUNT ];
-
-
-// this function is implemented in the unit test source files
-// it is meant to initialise the function table defined above.
-extern void init_ftbl();
-
-
-unsigned int i = 0; // loop iterator
-unsigned int max = 0; // number of iterations in each function
-int opcode = -1; // the operation which will be tested (a single unit can have any number of operations/functions)
-int impl = -1; // selects which particular implementation of the chosen operation must run
-int mute = 0; // 0 == print output; 1 == do not print anything;
-
-struct timeval before, after, lapsed, dummy;
-double dt_test_overhead = 0.0;
-double dt_test_sample = 0.0;
-double elapsed = 0.0;
-struct timezone zone;
-
-// there is a max of "4" components in a vec
-#define MAX_VEC_COMPONENTS 4
-
-arm_float_t * guarded_src1 = NULL;
-arm_float_t * guarded_src2 = NULL;
-arm_float_t * guarded_dst[IMPL_COUNT];
-
-arm_float_t * thesrc1 = NULL;
-arm_float_t * thesrc2 = NULL;
-arm_float_t * thedst[IMPL_COUNT]; // output from different implementations are stored in separate arrays for varification
-int done_init = 0;
-
-// Eight buffers that are used for special test cases such as when the destination and source point to the same address.
-// They may vary in size from one case to another and from one function to another.
-arm_float_t* esp_buf[8];
-
-arm_result_t test_operation()
-{
- const unsigned int fixed_length = ARRLEN * sizeof(arm_float_t) * MAX_VEC_COMPONENTS;
-
- // initialize if not done so
- if ( 0 == done_init )
- {
- guarded_src1 = (arm_float_t*) malloc( (2*ARRAY_GUARD_LEN) + fixed_length ); // 16 extra bytes at the begining and 16 extra bytes at the end
- GUARD_ARRAY( guarded_src1, (2*ARRAY_GUARD_LEN) + fixed_length );
- thesrc1 = (arm_float_t*) ( (void*)guarded_src1 + 16);
- FILL_FLOAT_ARRAY_LIMIT( thesrc1, ARRLEN * MAX_VEC_COMPONENTS ); // random initialization
-
- guarded_src2 = (arm_float_t*) malloc( (2*ARRAY_GUARD_LEN) + fixed_length ); // 16 extra bytes at the begining and 16 extra bytes at the end
- GUARD_ARRAY( guarded_src2, (2*ARRAY_GUARD_LEN) + fixed_length );
- thesrc2 = (arm_float_t*) ( (void*)guarded_src2 + 16);
- FILL_FLOAT_ARRAY_LIMIT( thesrc2, ARRLEN * MAX_VEC_COMPONENTS ); // random initialization
-
- for ( i = 0; i<IMPL_COUNT; i++ )
- {
- guarded_dst[i] = (arm_float_t*) malloc( (2*ARRAY_GUARD_LEN) + fixed_length ); // 16 extra bytes at the begining and 16 extra bytes at the end
- GUARD_ARRAY( guarded_dst[i], (2*ARRAY_GUARD_LEN) + fixed_length );
- thedst[i] = (arm_float_t*) ( (void*)guarded_dst[i] + 16);
- }
-
- done_init = 1;
- }
-
- // test the special case where dst == src
- unsigned int tmp_len = 13; // Just an odd number bigger than 8
- unsigned int inbytes = tmp_len * MAX_VEC_COMPONENTS * sizeof(arm_float_t);
- esp_buf[0] = (arm_float_t*) malloc( inbytes ); // input 1
- esp_buf[1] = (arm_float_t*) malloc( inbytes ); // input 2
- esp_buf[2] = (arm_float_t*) malloc( inbytes ); // copy of 1st input
- esp_buf[3] = (arm_float_t*) malloc( inbytes ); // copy of 2nd input
- esp_buf[4] = (arm_float_t*) malloc( inbytes ); // use this as the output buffer
-
- FILL_FLOAT_ARRAY_LIMIT( esp_buf[0], tmp_len * MAX_VEC_COMPONENTS ); // initialize the array with random numbers
- FILL_FLOAT_ARRAY_LIMIT( esp_buf[1], tmp_len * MAX_VEC_COMPONENTS ); // initialize the array with random numbers
- memcpy( esp_buf[2], esp_buf[0], inbytes );
- memcpy( esp_buf[3], esp_buf[1], inbytes );
-
- ftbl [ FTBL_IDX(opcode, impl) ] ( esp_buf[0] , esp_buf[0], esp_buf[1], tmp_len );
- ftbl [ FTBL_IDX(opcode, impl) ] ( esp_buf[4] , esp_buf[2], esp_buf[3], tmp_len );
-
- for ( i = 0; i < tmp_len * opcode; i++ ) // at this point the two outputs must be identical
- {
- if ( esp_buf[0][i] != esp_buf[4][i] )
- {
- fprintf ( stderr, "\t FATAL ERROR: Operation number %d implementation [%d] has failed the dst==src test case. \n", opcode, impl );
- fprintf ( stderr, "\t NOTE: Usually implementation 1=C, 2=ASM/VFP, and 3=ASM/NEON. \n");
- exit( NE10_ERR );
- }
- }
-
- free(esp_buf[0]); free(esp_buf[1]); free(esp_buf[2]); free(esp_buf[3]); free(esp_buf[4]);
-
-
- // sample run
- MEASURE( dt_test_sample,
- ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl-1 ] , thesrc1, thesrc2, ARRLEN );
- );
-
- if ( ! CHECK_ARRAY_GUARD(guarded_dst[ impl -1 ], (2*ARRAY_GUARD_LEN) + fixed_length) ||
- ! CHECK_ARRAY_GUARD(guarded_src1, (2*ARRAY_GUARD_LEN) + fixed_length) ||
- ! CHECK_ARRAY_GUARD(guarded_src2, (2*ARRAY_GUARD_LEN) + fixed_length) )
- {
- fprintf ( stderr, "\t FATAL ERROR: Operation number %d implementation [%d] has failed the guard test. \n", opcode, impl );
- exit( NE10_ERR );
- }
-
-
- MEASURE( elapsed,
- for ( i = 0; i < max; i++ )
- {
- // call the function
- ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl -1 ] , thesrc1, thesrc2, ARRLEN );
- }
- );
-
- if ( !mute )
- printf( "%02.8f;%013.3f\n", elapsed - dt_test_overhead,
- ( 1.0f * max * ARRLEN / ( elapsed - dt_test_overhead )) );
-
- return NE10_OK;
-}
-
-arm_result_t run_test( int argc, char **argv )
-{
- if ( argc == 2 ) // requesting the number of available operations/routines in this unit
- {
- opcode = atoi ( argv[1] ); // get the command being requested, 0 = return the number of functions in this unit
- if ( opcode == 0 ) return OP_COUNT;
- exit( NE10_ERR );
- } else if ( argc == 4 ) // requesting a particular implementation of one of the operations
- {
- opcode = atoi ( argv[1] );
- if ( opcode <= 0 ) exit( NE10_ERR );
- impl = atoi ( argv[2] );
- if ( impl < 0 ) exit( NE10_ERR ); // impl == 0 means run all and compare the results to verify they produce identical outputs
- max = atoi ( argv[3] );
- if ( max <= 0 ) exit( NE10_ERR );
- } else exit( NE10_ERR );
-
- // initialize the table with NULL
- memset( ftbl, 0, sizeof(ftbl));
-
- // manually initialize the functions which have actual implementations
- init_ftbl(); // this function is implemented in the unit test source file
-
- if ( opcode <= 0 || opcode > OP_COUNT
- || impl < 0 || impl > IMPL_COUNT )
- {
- fprintf ( stderr, "\t WARNING: Operation number %d and/or implementaion number %d are not acceptable values. \n", opcode, impl );
- exit( NE10_ERR );
- }
-
- if ( impl == 0 ) // run all implementations and verify
- {
- // first, make sure all of the implementations do exist
- for ( i = FTBL_IDX(opcode, 1); i <= FTBL_IDX(opcode, IMPL_COUNT); i++ )
- {
- if ( NULL == ftbl[i] )
- {
- fprintf ( stderr, "\t WARNING: One or more implementations of operation number %d were not found. \n", opcode );
- exit( NE10_ERR );
- }
- }
-
- // try all the implementatins here..
- mute = 1; // do not print anything
-
- // opcode remains the same but we iterate through different implementations here..
- for ( impl= 1; impl <= IMPL_COUNT; impl ++ )
- {
- test_operation();
- }
-
- // now verify
- arm_float_t * _output = NULL; // [ IMPL_COUNT * MAX_VEC_COMPONENTS ]; // one for each implementation, c, asm, neon...
- int warns = 0;
- int item_width = opcode; // there's no easy way to guess the actual number of an item's components but using the opcode (1=float, 2=vec2, ...)
- _output = (arm_float_t*) malloc( IMPL_COUNT * sizeof(arm_float_t) * item_width );
- for ( i = 0; i < ARRLEN; i++ )
- {
- for ( impl= 1; impl <= IMPL_COUNT; impl ++ )
- {
- memcpy ( &_output[ (impl-1) * item_width ], &thedst[ impl-1 ][ i * item_width ], sizeof(arm_float_t) * item_width );
- }
-
- int pos = 0;
- for ( impl = 2; impl <= IMPL_COUNT; impl ++ ) // compare the output from the 2nd, 3rd, 4th, etc. to the first one so start at 2
- {
- for ( pos = 0; pos < item_width; pos++ ) // compare corresponding components of the items
- {
- assert ( _output[ ((1-1)*item_width)+pos ] == _output[ ((1-1)*item_width)+pos ] ); // check for not-a-number
- assert ( _output[ ((impl-1)*item_width)+pos ] == _output[ ((impl-1)*item_width)+pos ] ); // check for not-a-number
-
- if ( ! EQUALS_FLOAT( _output[ ((1-1)*item_width)+pos ] , _output[ ((impl-1)*item_width)+pos ], ERROR_MARGIN_LARGE ) ) // accept larger errors
- { fprintf( stderr, "\t\t WARNING: In opcode [%d], implementation [1] != implemenation [%d] on item [%d -> %d]\n",
- opcode, impl, i, pos+1 );
- warns++; }
-
- // stop after 10 warnings
- if ( warns >= ACCEPTABLE_WARNS )
- { fprintf ( stderr, "\t WARNING: One or more mismatching values were found. \n" );
- exit( NE10_ERR );
- }
- }
- }
- }
- free( _output ); _output = (arm_float_t *) NULL;
-
- if ( 0 == warns )
- {
- return NE10_OK;
- }
-
- fprintf ( stderr, "\t WARNING: One or more mismatching values were found. \n" );
- exit( NE10_ERR );
- }
- else // run a particular implementation
- {
- if ( !mute ) printf( "opcode=%d;impl=%d;%d;%d;", opcode, impl, ARRLEN, max );
-
- // ge the overhead
- MEASURE( dt_test_overhead,
- for ( i = 0 ; i < max; i++ )
- {
- }
- );
-
- test_operation();
- }
-
-
-
- // free any allocated memory...
- free( guarded_src1 );
- free( guarded_src2 );
- for ( i = 0; i<IMPL_COUNT; i++ )
- {
- free( guarded_dst[i] );
- }
-
- return NE10_OK;
-}
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : headers/unit_test_xc_operation_x.h
- */
-
-#include "./unit_test_common.h"
-
-// This function signature applies the operations with the format "*c_*_*" (e.g. 'add'c_'float'_'neon')
-typedef arm_result_t (*arm_func_4args_t)(void * dst, void * src, const void * cst, unsigned int count);
-arm_func_4args_t ftbl[ OP_COUNT * IMPL_COUNT ];
-
-
-// this function is implemented in the unit test source files
-// it is meant to initialise the function table defined above.
-extern void init_ftbl();
-
-
-unsigned int i = 0; // loop iterator
-unsigned int max = 0; // number of iterations in each function
-int opcode = -1; // the operation which will be tested (a single unit can have any number of operations/functions)
-int impl = -1; // selects which particular implementation of the chosen operation must run
-int mute = 0; // 0 == print output; 1 == do not print anything;
-
-struct timeval before, after, lapsed, dummy;
-double dt_test_overhead = 0.0;
-double dt_test_sample = 0.0;
-double elapsed = 0.0;
-struct timezone zone;
-
-// there is a max of "4" components in a vec
-#define MAX_VEC_COMPONENTS 4
-
-arm_float_t * guarded_cst = NULL;
-arm_float_t * guarded_src = NULL;
-arm_float_t * guarded_dst[IMPL_COUNT];
-
-arm_float_t * thecst = NULL;
-arm_float_t * thesrc = NULL;
-arm_float_t * thedst[IMPL_COUNT]; // output from different implementations are stored in separate arrays for varification
-int done_init = 0;
-
-// Eight buffers that are used for special test cases such as when the destination and source point to the same address.
-// They may vary in size from one case to another and from one function to another.
-arm_float_t* esp_buf[8];
-
-arm_result_t test_operation()
-{
- const unsigned int fixed_length = ARRLEN * sizeof(arm_float_t) * MAX_VEC_COMPONENTS;
-
- // initialize if not done so
- if ( 0 == done_init )
- {
- guarded_cst = (arm_float_t*) malloc( (2*ARRAY_GUARD_LEN) + sizeof(arm_float_t) * MAX_VEC_COMPONENTS );
- GUARD_ARRAY( guarded_cst, (2*ARRAY_GUARD_LEN) + sizeof(arm_float_t) * MAX_VEC_COMPONENTS );
- thecst = (arm_float_t*) ( (void*)guarded_cst + 16);
- FILL_FLOAT_ARRAY( thecst, 4 ); // random initialization
-
- guarded_src = (arm_float_t*) malloc( (2*ARRAY_GUARD_LEN) + fixed_length ); // 16 extra bytes at the begining and 16 extra bytes at the end
- GUARD_ARRAY( guarded_src, (2*ARRAY_GUARD_LEN) + fixed_length );
- thesrc = (arm_float_t*) ( (void*)guarded_src + 16);
- FILL_FLOAT_ARRAY( thesrc, ARRLEN * MAX_VEC_COMPONENTS ); // random initialization
-
- for ( i = 0; i<IMPL_COUNT; i++ )
- {
- guarded_dst[i] = (arm_float_t*) malloc( (2*ARRAY_GUARD_LEN) + fixed_length ); // 16 extra bytes at the begining and 16 extra bytes at the end
- GUARD_ARRAY( guarded_dst[i], (2*ARRAY_GUARD_LEN) + fixed_length );
- thedst[i] = (arm_float_t*) ( (void*)guarded_dst[i] + 16);
- }
-
- done_init = 1;
- }
-
-
- // test the special case where dst == src
- unsigned int tmp_len = 13; // Just an odd number bigger than 8
- unsigned int inbytes = tmp_len * MAX_VEC_COMPONENTS * sizeof(arm_float_t);
- esp_buf[0] = (arm_float_t*) malloc( inbytes ); // input 1
- esp_buf[1] = (arm_float_t*) malloc( MAX_VEC_COMPONENTS * sizeof(arm_float_t) ); // input 2 - constant
- esp_buf[2] = (arm_float_t*) malloc( inbytes ); // copy of 1st input
- esp_buf[3] = (arm_float_t*) malloc( MAX_VEC_COMPONENTS * sizeof(arm_float_t) ); // copy of 2nd input
- esp_buf[4] = (arm_float_t*) malloc( inbytes ); // use this as the output buffer
-
- FILL_FLOAT_ARRAY_LIMIT( esp_buf[0], tmp_len * MAX_VEC_COMPONENTS ); // initialize the array with random numbers
- FILL_FLOAT_ARRAY_LIMIT( esp_buf[1], MAX_VEC_COMPONENTS ); // initialize the array with random numbers
- memcpy( esp_buf[2], esp_buf[0], inbytes );
- memcpy( esp_buf[3], esp_buf[1], MAX_VEC_COMPONENTS * sizeof(arm_float_t) );
-
- ftbl [ FTBL_IDX(opcode, impl) ] ( esp_buf[0] , esp_buf[0], esp_buf[1], tmp_len );
- ftbl [ FTBL_IDX(opcode, impl) ] ( esp_buf[4] , esp_buf[2], esp_buf[3], tmp_len );
-
-
- for ( i = 0; i < tmp_len * opcode; i++ ) // at this point the two outputs must be identical
- {
- assert ( esp_buf[0][i] == esp_buf[0][i] ); // check for NAN values
- assert ( esp_buf[4][i] == esp_buf[4][i] );
-
- if ( ! EQUALS_FLOAT( esp_buf[0][i] , esp_buf[4][i], ERROR_MARGIN_LARGE*10 ) )
- {
- fprintf ( stderr, "\t FATAL ERROR: Operation number %d implementation [%d] has failed the dst==src test case. \n", opcode, impl );
- fprintf ( stderr, "\t NOTE: Usually implementation 1=C, 2=ASM/VFP, and 3=ASM/NEON. \n");
- exit( NE10_ERR );
- }
- }
-
- free(esp_buf[0]); free(esp_buf[1]); free(esp_buf[2]); free(esp_buf[3]); free(esp_buf[4]);
-
-
- // sample run
- MEASURE( dt_test_sample,
- ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl-1 ] , thesrc, thecst, ARRLEN );
- );
- if ( ! CHECK_ARRAY_GUARD(guarded_dst[ impl -1 ], (2*ARRAY_GUARD_LEN) + fixed_length) ||
- ! CHECK_ARRAY_GUARD(guarded_src, (2*ARRAY_GUARD_LEN) + fixed_length) ||
- ! CHECK_ARRAY_GUARD(guarded_cst, (2*ARRAY_GUARD_LEN) + sizeof(arm_float_t) * MAX_VEC_COMPONENTS) )
- {
- fprintf ( stderr, "\t FATAL ERROR: Operation number %d implementation [%d] has failed the guard test. \n", opcode, impl );
- exit( NE10_ERR );
- }
-
- // this test to make sure passing zero as the length won't cause segmentation faults
- ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl-1 ] , thesrc, thecst, 0 );
-
- // actual test
- if ( 1 == opcode )
- { // in this case the const argument is not a pointer but an actual float value
- union fp_bitwise {
- arm_float_t _f;
- unsigned int _i;
- } _icst;
-
- _icst._f = thecst[0];
-
- MEASURE( elapsed,
- for ( i = 0; i < max; i++ )
- {
- // call the function
- ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl -1 ] , thesrc, (void*)_icst._i, ARRLEN );
- }
- );
- }
- else
- {
- MEASURE( elapsed,
- for ( i = 0; i < max; i++ )
- {
- // call the function
- ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl -1 ] , thesrc, thecst, ARRLEN );
- }
- );
- }
-
- if ( !mute )
- printf( "%02.8f;%013.3f\n", elapsed - dt_test_overhead,
- ( 1.0f * max * ARRLEN / ( elapsed - dt_test_overhead )) );
-
- return NE10_OK;
-}
-
-arm_result_t run_test( int argc, char **argv )
-{
- if ( argc == 2 ) // requesting the number of available operations/routines in this unit
- {
- opcode = atoi ( argv[1] ); // get the command being requested, 0 = return the number of functions in this unit
- if ( opcode == 0 ) return OP_COUNT;
- exit( NE10_ERR );
- } else if ( argc == 4 ) // requesting a particular implementation of one of the operations
- {
- opcode = atoi ( argv[1] );
- if ( opcode <= 0 ) exit( NE10_ERR );
- impl = atoi ( argv[2] );
- if ( impl < 0 ) exit( NE10_ERR ); // impl == 0 means run all and compare the results to verify they produce identical outputs
- max = atoi ( argv[3] );
- if ( max <= 0 ) exit( NE10_ERR );
- } else exit( NE10_ERR );
-
- // initialize the table with NULL
- memset( ftbl, 0, sizeof(ftbl));
-
- // manually initialize the functions which have actual implementations
- init_ftbl(); // this function is implemented in the unit test file
-
- if ( opcode <= 0 || opcode > OP_COUNT
- || impl < 0 || impl > IMPL_COUNT )
- {
- fprintf ( stderr, "\t WARNING: Operation number %d and/or implementaion number %d are not acceptable values. \n", opcode, impl );
- exit( NE10_ERR );
- }
-
- if ( impl == 0 ) // run all implementations and verify
- {
- // first, make sure all of the implementations do exist
- for ( i = FTBL_IDX(opcode, 1); i <= FTBL_IDX(opcode, IMPL_COUNT); i++ )
- {
- if ( NULL == ftbl[i] )
- {
- fprintf ( stderr, "\t WARNING: One or more implementations of operation number %d were not found. \n", opcode );
- exit( NE10_ERR );
- }
- }
-
- // try all the implementatins here..
- mute = 1; // do not print anything
-
- // opcode remains the same but we iterate through different implementations here..
- for ( impl= 1; impl <= IMPL_COUNT; impl ++ )
- {
- test_operation();
- }
-
- // now verify
- arm_float_t * _output = NULL; // [ IMPL_COUNT * MAX_VEC_COMPONENTS ]; // one for each implementation, c, asm, neon...
- int warns = 0;
- int item_width = opcode; // there's no easy way to guess the actual number of an item's components but using the opcode (1=float, 2=vec2, ...)
- _output = (arm_float_t*) malloc( IMPL_COUNT * sizeof(arm_float_t) * item_width );
- for ( i = 0; i < ARRLEN; i++ )
- {
- for ( impl= 1; impl <= IMPL_COUNT; impl ++ )
- {
- memcpy ( &_output[ (impl-1) * item_width ], &thedst[ impl-1 ][ i * item_width ], sizeof(arm_float_t) * item_width );
- }
-
- int pos = 0;
- for ( impl = 2; impl <= IMPL_COUNT; impl ++ ) // compare the output from the 2nd, 3rd, 4th, etc. to the first one so start at 2
- {
- for ( pos = 0; pos < item_width; pos++ ) // compare corresponding components of the items
- {
- assert ( _output[ ((1-1)*item_width)+pos ] == _output[ ((1-1)*item_width)+pos ] ); // check for not-a-number
- assert ( _output[ ((impl-1)*item_width)+pos ] == _output[ ((impl-1)*item_width)+pos ] ); // check for not-a-number
-
- if ( ! EQUALS_FLOAT( _output[ ((1-1)*item_width)+pos ] , _output[ ((impl-1)*item_width)+pos ], ERROR_MARGIN_SMALL ) )
- { fprintf( stderr, "\t\t WARNING: In opcode [%d], implementation [1] != implemenation [%d] on item [%d -> %d]\n",
- opcode, impl, i, pos+1 );
- warns++; }
-
- // stop after 10 warnings
- if ( warns >= ACCEPTABLE_WARNS )
- { fprintf ( stderr, "\t WARNING: One or more mismatching values were found. \n" );
- exit( NE10_ERR );
- }
- }
- }
- }
- free( _output ); _output = (arm_float_t *) NULL;
-
- if ( warns < ACCEPTABLE_WARNS )
- {
- return NE10_OK;
- }
-
- fprintf ( stderr, "\t WARNING: One or more mismatching values were found. \n" );
- exit( NE10_ERR );
- }
- else // run a particular implementation
- {
- if ( !mute ) printf( "opcode=%d;impl=%d;%d;%d;", opcode, impl, ARRLEN, max );
-
- // ge the overhead
- MEASURE( dt_test_overhead,
- for ( i = 0 ; i < max; i++ )
- {
- }
- );
-
- test_operation();
- }
-
-
-
- // free any allocated memory...
- free( guarded_cst );
- free( guarded_src );
- for ( i = 0; i<IMPL_COUNT; i++ )
- {
- free( guarded_dst[i] );
- }
-
- return NE10_OK;
-}
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : headers/unit_test_addmat_operation_x.h
- */
-
-#include "./unit_test_common.h"
-#include "../inc/NE10_types.h"
-
-// Signature shared by the routines exercised in this unit: a destination array, two source arrays and the item count.
-typedef arm_result_t (*arm_func_4args_t)(void * dst, void * src1, void * src2, unsigned int count);
-arm_func_4args_t ftbl[ OP_COUNT * IMPL_COUNT ]; // function table; entries are looked up with FTBL_IDX(opcode, impl)
-
-
-// this function is implemented in the unit test source files
-// it is meant to initialise the function table defined above.
-extern void init_ftbl();
-
-
-unsigned int i = 0; // loop iterator (file-scope, shared by test_operation and run_test)
-unsigned int max = 0; // number of iterations in each function
-int opcode = -1; // the operation which will be tested (a single unit can have any number of operations/functions)
-int impl = -1; // selects which particular implementation of the chosen operation must run
-int mute = 0; // 0 == print output; 1 == do not print anything;
-
-struct timeval before, after, lapsed, dummy; // presumably used by the MEASURE macro -- TODO confirm
-double dt_test_overhead = 0.0; // time measured for an empty loop of max iterations
-double dt_test_sample = 0.0; // time measured for the single sample call
-double elapsed = 0.0; // time measured for max back-to-back calls
-struct timezone zone;
-
-// there is a max of "16" components in a matrix
-#define MAX_VEC_COMPONENTS 16
-
-// raw allocations, including the guard bytes placed before and after the usable area
-arm_float_t * guarded_src1 = NULL;
-arm_float_t * guarded_src2 = NULL;
-arm_float_t * guarded_dst[IMPL_COUNT];
-
-// usable areas inside the guarded allocations above
-arm_float_t * thesrc1 = NULL;
-arm_float_t * thesrc2 = NULL;
-arm_float_t * thedst[IMPL_COUNT]; // output from different implementations are stored in separate arrays for verification
-int done_init = 0; // set to 1 once the buffers above have been allocated
-
-// Eight buffers that are used for special test cases such as when the destination and source point to the same address.
-// They may vary in size from one case to another and from one function to another.
-arm_float_t* esp_buf[8];
-/*
- * Exercises the routine selected by the globals `opcode` and `impl` through ftbl:
- *  1. on the first call only, allocates the guarded source and destination buffers;
- *  2. checks that a run with dst aliased to src1 gives the same output as a run with a separate dst;
- *  3. performs one timed sample run and verifies the array guards;
- *  4. calls the routine once with a count of zero;
- *  5. times `max` back-to-back calls and, unless `mute` is set, prints the timing.
- * Calls exit( NE10_ERR ) when the aliasing check or the guard check fails; otherwise returns NE10_OK.
- */
-arm_result_t test_operation()
-{
- const unsigned int fixed_length = ARRLEN * sizeof(arm_float_t) * MAX_VEC_COMPONENTS; // usable bytes per buffer, excluding guards
-
- // one-time setup: allocate, guard and fill the shared buffers on the first call only
- if ( 0 == done_init )
- {
- guarded_src1 = (arm_float_t*) malloc( (2*ARRAY_GUARD_LEN) + fixed_length ); // 16 extra bytes at the beginning and 16 extra bytes at the end
- GUARD_ARRAY( guarded_src1, (2*ARRAY_GUARD_LEN) + fixed_length );
- thesrc1 = (arm_float_t*) ( (void*)guarded_src1 + 16); // NOTE(review): literal 16, presumably ARRAY_GUARD_LEN -- confirm; arithmetic on void* is a GNU extension
- FILL_FLOAT_ARRAY_LIMIT_GT1( thesrc1, ARRLEN * MAX_VEC_COMPONENTS ); // random initialization
-
- guarded_src2 = (arm_float_t*) malloc( (2*ARRAY_GUARD_LEN) + fixed_length ); // 16 extra bytes at the beginning and 16 extra bytes at the end
- GUARD_ARRAY( guarded_src2, (2*ARRAY_GUARD_LEN) + fixed_length );
- thesrc2 = (arm_float_t*) ( (void*)guarded_src2 + 16);
- FILL_FLOAT_ARRAY_LIMIT_GT1( thesrc2, ARRLEN * MAX_VEC_COMPONENTS ); // random initialization
-
- for ( i = 0; i<IMPL_COUNT; i++ )
- {
- guarded_dst[i] = (arm_float_t*) malloc( (2*ARRAY_GUARD_LEN) + fixed_length ); // 16 extra bytes at the beginning and 16 extra bytes at the end
- GUARD_ARRAY( guarded_dst[i], (2*ARRAY_GUARD_LEN) + fixed_length );
- thedst[i] = (arm_float_t*) ( (void*)guarded_dst[i] + 16);
- }
-
- done_init = 1;
- }
-
- // test the special case where dst == src1 (the destination aliases the first input)
- unsigned int tmp_len = 13; // Just an odd number bigger than 8
- unsigned int inbytes = tmp_len * MAX_VEC_COMPONENTS * sizeof(arm_float_t);
- esp_buf[0] = (arm_float_t*) malloc( inbytes ); // input 1
- esp_buf[1] = (arm_float_t*) malloc( inbytes ); // input 2
- esp_buf[2] = (arm_float_t*) malloc( inbytes ); // copy of 1st input
- esp_buf[3] = (arm_float_t*) malloc( inbytes ); // copy of 2nd input
- esp_buf[4] = (arm_float_t*) malloc( inbytes ); // use this as the output buffer
-
- FILL_FLOAT_ARRAY_LIMIT( esp_buf[0], tmp_len * MAX_VEC_COMPONENTS ); // initialize the array with random numbers
- FILL_FLOAT_ARRAY_LIMIT( esp_buf[1], tmp_len * MAX_VEC_COMPONENTS ); // initialize the array with random numbers
- memcpy( esp_buf[2], esp_buf[0], inbytes );
- memcpy( esp_buf[3], esp_buf[1], inbytes );
-
- ftbl [ FTBL_IDX(opcode, impl) ] ( esp_buf[0] , esp_buf[0], esp_buf[1], tmp_len ); // aliased run: output overwrites input 1
- ftbl [ FTBL_IDX(opcode, impl) ] ( esp_buf[4] , esp_buf[2], esp_buf[3], tmp_len ); // reference run on the untouched copies
-
- for ( i = 0; i < tmp_len * (opcode+1) * (opcode+1); i++ ) // the two outputs must match exactly over tmp_len items of (opcode+1)*(opcode+1) floats each
- {
- if ( esp_buf[0][i] != esp_buf[4][i] )
- {
- fprintf ( stderr, "\t FATAL ERROR: Operation number %d implementation [%d] has failed the dst==src test case. \n", opcode, impl );
- fprintf ( stderr, "\t NOTE: Usually implementation 1=C, 2=ASM/VFP, and 3=ASM/NEON. \n");
- exit( NE10_ERR );
- }
- }
-
- free(esp_buf[0]); free(esp_buf[1]); free(esp_buf[2]); free(esp_buf[3]); free(esp_buf[4]);
-
- // sample run: a single timed call whose duration is stored in dt_test_sample
- MEASURE( dt_test_sample,
- ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl-1 ] , thesrc1, thesrc2, ARRLEN );
- );
- if ( ! CHECK_ARRAY_GUARD(guarded_dst[ impl -1 ], (2*ARRAY_GUARD_LEN) + fixed_length) ||
- ! CHECK_ARRAY_GUARD(guarded_src1, (2*ARRAY_GUARD_LEN) + fixed_length) ||
- ! CHECK_ARRAY_GUARD(guarded_src2, (2*ARRAY_GUARD_LEN) + fixed_length) )
- {
- fprintf ( stderr, "\t FATAL ERROR: Operation number %d implementation [%d] has failed the guard test. \n", opcode, impl );
- exit( NE10_ERR );
- }
-
- // this call makes sure that passing zero as the length does not cause a segmentation fault
- ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl-1 ] , thesrc1, thesrc2, 0 );
-
- MEASURE( elapsed,
- for ( i = 0; i < max; i++ )
- {
- // call the function
- ftbl [ FTBL_IDX(opcode, impl) ] ( thedst[ impl -1 ] , thesrc1, thesrc2, ARRLEN );
- }
- );
-
- if ( !mute ) // prints the time net of dt_test_overhead, then max * ARRLEN divided by that net time
- printf( "%02.8f;%013.3f\n", elapsed - dt_test_overhead,
- ( 1.0f * max * ARRLEN / ( elapsed - dt_test_overhead )) );
-
- return NE10_OK;
-}
-/*
- * Entry point of the unit: parses the command line, fills the function table and runs the requested test.
- *  - argc == 2: argv[1] must be 0, in which case OP_COUNT is returned; any other value ends in exit( NE10_ERR ).
- *  - argc == 4: argv[1] is the operation number, argv[2] the implementation number (0 means run every
- *    implementation and compare their outputs) and argv[3] the iteration count stored in `max`.
- *  - any other argc ends in exit( NE10_ERR ).
- * Returns NE10_OK when the requested run completes; mismatching outputs or invalid arguments end in exit( NE10_ERR ).
- */
-arm_result_t run_test( int argc, char **argv )
-{
- if ( argc == 2 ) // requesting the number of available operations/routines in this unit
- {
- opcode = atoi ( argv[1] ); // get the command being requested, 0 = return the number of functions in this unit
- if ( opcode == 0 ) return OP_COUNT;
- exit( NE10_ERR );
- } else if ( argc == 4 ) // requesting a particular implementation of one of the operations
- {
- opcode = atoi ( argv[1] );
- if ( opcode <= 0 ) exit( NE10_ERR );
- impl = atoi ( argv[2] );
- if ( impl < 0 ) exit( NE10_ERR ); // impl == 0 means run all and compare the results to verify they produce identical outputs
- max = atoi ( argv[3] );
- if ( max <= 0 ) exit( NE10_ERR ); // NOTE(review): max is declared unsigned int, so this only rejects 0; a negative argv[3] is not caught here
- } else exit( NE10_ERR );
-
- // initialize the table with NULL
- memset( ftbl, 0, sizeof(ftbl));
-
- // manually initialize the functions which have actual implementations
- init_ftbl(); // this function is implemented in the unit test source file
-
- if ( opcode <= 0 || opcode > OP_COUNT
- || impl < 0 || impl > IMPL_COUNT )
- {
- fprintf ( stderr, "\t WARNING: Operation number %d and/or implementaion number %d are not acceptable values. \n", opcode, impl );
- exit( NE10_ERR );
- }
-
- if ( impl == 0 ) // run all implementations and verify
- {
- // first, make sure all of the implementations do exist
- for ( i = FTBL_IDX(opcode, 1); i <= FTBL_IDX(opcode, IMPL_COUNT); i++ )
- {
- if ( NULL == ftbl[i] )
- {
- fprintf ( stderr, "\t WARNING: One or more implementations of operation number %d were not found. \n", opcode );
- exit( NE10_ERR );
- }
- }
-
- // try all the implementations here..
- mute = 1; // do not print anything
-
- // opcode remains the same but we iterate through different implementations here..
- for ( impl= 1; impl <= IMPL_COUNT; impl ++ )
- {
- test_operation();
- }
-
- // now verify
- arm_float_t * _output = NULL; // [ IMPL_COUNT * MAX_VEC_COMPONENTS ]; // one for each implementation, c, asm, neon...
- int warns = 0;
- int item_width = opcode+1; // using the opcode (1=mat2x2, 2=mat3x3, ...)
- const int item_width_p2 = item_width * item_width; // number of floats in one item
- _output = (arm_float_t*) malloc( IMPL_COUNT * sizeof(arm_float_t) * item_width_p2 );
- for ( i = 0; i < ARRLEN; i++ )
- {
- for ( impl= 1; impl <= IMPL_COUNT; impl ++ ) // gather item i from every implementation's output into _output
- {
- memcpy ( &_output[ (impl-1) * item_width_p2 ], &thedst[ impl-1 ][ i * item_width_p2 ], sizeof(arm_float_t) * item_width_p2 );
- }
-
- int pos = 0;
- for ( impl = 2; impl <= IMPL_COUNT; impl ++ ) // compare the output from the 2nd, 3rd, 4th, etc. to the first one so start at 2
- {
- for ( pos = 0; pos < item_width_p2; pos++ ) // compare corresponding components of the items
- {
- assert ( _output[ ((1-1)*item_width_p2)+pos ] == _output[ ((1-1)*item_width_p2)+pos ] ); // check for not-a-number
- assert ( _output[ ((impl-1)*item_width_p2)+pos ] == _output[ ((impl-1)*item_width_p2)+pos ] ); // check for not-a-number
-
- if ( ! EQUALS_FLOAT( _output[ ((1-1)*item_width_p2)+pos ] , _output[ ((impl-1)*item_width_p2)+pos ], ERROR_MARGIN_SMALL ) )
- { fprintf( stderr, "\t\t WARNING: In opcode [%d], implementation [1] != implemenation [%d] on item [%d -> %d]\n",
- opcode, impl, i, pos+1 );
- warns++; }
-
- // stop once ACCEPTABLE_WARNS_MATRICES warnings have accumulated
- if ( warns >= ACCEPTABLE_WARNS_MATRICES )
- { fprintf ( stderr, "\t WARNING: One or more mismatching values were found. \n" );
- exit( NE10_ERR );
- }
- }
- }
- }
- free( _output ); _output = (arm_float_t *) NULL;
-
- if ( warns < ACCEPTABLE_WARNS_MATRICES )
- {
- return NE10_OK; // NOTE(review): this return skips the free() calls at the end of the function
- }
-
- // NOTE(review): looks unreachable when ACCEPTABLE_WARNS_MATRICES > 0, since the loop above already exits at the threshold -- confirm
- fprintf ( stderr, "\t WARNING: One or more mismatching values were found. \n" );
- exit( NE10_ERR );
- }
- else // run a particular implementation
- {
- if ( !mute ) printf( "opcode=%d;impl=%d;%d;%d;", opcode, impl, ARRLEN, max );
-
- // get the overhead: time an empty loop of max iterations
- MEASURE( dt_test_overhead,
- for ( i = 0 ; i < max; i++ )
- {
- }
- );
-
- test_operation();
- }
-
-
-
- // free any allocated memory...
- free( guarded_src1 );
- free( guarded_src2 );
- for ( i = 0; i<IMPL_COUNT; i++ )
- {
- free( guarded_dst[i] );
- }
-
- return NE10_OK;
-}
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : headers/versionheader.h
- */
-
-/////////////////////////////////////////////////////////
-// version information
-/////////////////////////////////////////////////////////
-
-#define VERSION_MAJOR 0 // major version number
-#define VERSION_MINOR 9 // minor version number
-#define VERSION_REVISION 10 // revision number
-
-#define PHASE 1 // NOTE(review): the meaning of PHASE is not documented in this header -- confirm
-#define COPYRIGHT_YEAR 2012
-#define COPYRIGHT_HOLDER "ARM Ltd."
+++ /dev/null
-@
-@ Copyright 2011-12 ARM Limited
-@
-@ Licensed under the Apache License, Version 2.0 (the "License");
-@ you may not use this file except in compliance with the License.
-@ You may obtain a copy of the License at
-@
-@ http://www.apache.org/licenses/LICENSE-2.0
-@
-@ Unless required by applicable law or agreed to in writing, software
-@ distributed under the License is distributed on an "AS IS" BASIS,
-@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@ See the License for the specific language governing permissions and
-@ limitations under the License.
-@
-
-@
-@ NE10 Library : headers/versionheader.s
-@
-
-@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-@ version information
-@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- .equ VERSION_MAJOR, 0 @ same value as VERSION_MAJOR in headers/versionheader.h
- .equ VERSION_MINOR, 9 @ same value as VERSION_MINOR in headers/versionheader.h
- .equ VERSION_REVISION, 10 @ same value as VERSION_REVISION in headers/versionheader.h
-
- .equ PHASE, 1 @ NOTE(review): the meaning of PHASE is not documented here -- confirm
- .equ COPYRIGHT_YEAR, 2012
-
-COPYRIGHT_HOLDER: @ label of a NUL-terminated string (.asciz), not an .equ constant
- .asciz "ARM Ltd."
* NE10 Library : inc/NE10.h
*/
-/*! \file NE10.h
- \brief All NE10 routines declarations.
-
- The routines that are provided by this library are all declared in this header file.
- */
-
-//#include "../headers/versionheader.h"
-#include <NE10_types.h>
-#include <NE10_c.h>
-#include <NE10_asm.h>
-#include <NE10_neon.h>
-
#ifndef NE10_H
#define NE10_H
extern "C" {
#endif
-///////////////////////////
-// function prototypes:
-///////////////////////////
-
-
-// ## Vector-Constant Arithmetic ##
-
-/*!
- Adds a constant scalar value to all the elements of an input array and stores the results in an output array.
- @param[out] dst Pointer to the destination array
- @param[in] src Pointer to the source array
- @param[in] cst The constant scalar added to the input values
- @param[in] count The number of items in the input array
- */
-extern arm_result_t (*addc_float)(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count);
-/*!
- Adds a constant 2D vector to all of the vectors in an input array and stores the results in an output array.
- @param[out] dst Pointer to the destination array
- @param[in] src Pointer to the source array
- @param[in] cst Pointer to the 2D vector added to the input values
- @param[in] count The number of items in the input array
- */
-extern arm_result_t (*addc_vec2f)(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count);
-/*!
- Adds a constant 3D vector to all of the vectors in an input array and stores the results in an output array.
- @param[out] dst Pointer to the destination array
- @param[in] src Pointer to the source array
- @param[in] cst Pointer to the 3D vector added to the input values
- @param[in] count The number of items in the input array
- */
-extern arm_result_t (*addc_vec3f)(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count);
-/*!
- Adds a constant 4D vector to all of the vectors in an input array and stores the results in an output array.
- @param[out] dst Pointer to the destination array
- @param[in] src Pointer to the source array
- @param[in] cst Pointer to the 4D vector added to the input values
- @param[in] count The number of items in the input array
- */
-extern arm_result_t (*addc_vec4f)(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count);
-
-
-
-/*!
- Subtracts a constant scalar from all the elements of an input array and stores the results in an output array.
- @param[out] dst Pointer to the destination array
- @param[in] src Pointer to the source array
- @param[in] cst The constant scalar subtracted from the input values
- @param[in] count The number of items in the input array
- */
-extern arm_result_t (*subc_float)(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count);
-/*!
- Subtracts a constant 2D vector from all of the vectors in an input array and stores the results in an output array.
- @param[out] dst Pointer to the destination array
- @param[in] src Pointer to the source array
- @param[in] cst Pointer to the 2D vector subtracted from the input values
- @param[in] count The number of items in the input array
- */
-extern arm_result_t (*subc_vec2f)(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count);
-/*!
- Subtracts a constant 3D vector from all of the vectors in an input array and stores the results in an output array.
- @param[out] dst Pointer to the destination array
- @param[in] src Pointer to the source array
- @param[in] cst Pointer to the 3D vector subtracted from the input values
- @param[in] count The number of items in the input array
- */
-extern arm_result_t (*subc_vec3f)(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count);
-/*!
- Subtracts a constant 4D vector from all of the vectors in an input array and stores the results in an output array.
- @param[out] dst Pointer to the destination array
- @param[in] src Pointer to the source array
- @param[in] cst Pointer to the 4D vector subtracted from the input values
- @param[in] count The number of items in the input array
- */
-extern arm_result_t (*subc_vec4f)(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count);
-
-
-
-/*!
- Subtracts the elements of an input array from a constant scalar and stores the results in an output array.
- @param[out] dst Pointer to the destination array
- @param[in] src Pointer to the source array
- @param[in] cst The constant scalar to subtract the input values from
- @param[in] count The number of items in the input array
- */
-extern arm_result_t (*rsbc_float)(arm_float_t * dst, arm_float_t *src, const arm_float_t cst, unsigned int count);
-/*!
- Subtracts the vectors in an input array from a constant 2D vector and stores the results in an output array.
- @param[out] dst Pointer to the destination array
- @param[in] src Pointer to the source array
- @param[in] cst Pointer to the 2D vector to subtract the input values from
- @param[in] count The number of items in the input array
- */
-extern arm_result_t (*rsbc_vec2f)(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count);
-/*!
- Subtracts the vectors in an input array from a constant 3D vector and stores the results in an output array.
- @param[out] dst Pointer to the destination array
- @param[in] src Pointer to the source array
- @param[in] cst Pointer to the 3D vector to subtract the input values from
- @param[in] count The number of items in the input array
- */
-extern arm_result_t (*rsbc_vec3f)(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count);
-/*!
- Subtracts the vectors in an input array from a constant 4D vector and stores the results in an output array.
- @param[out] dst Pointer to the destination array
- @param[in] src Pointer to the source array
- @param[in] cst Pointer to the 4D vector to subtract the input values from
- @param[in] count The number of items in the input array
- */
-extern arm_result_t (*rsbc_vec4f)(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count);
-
-
-
-/*!
- Multiplies the elements of an input array by a constant scalar and stores the results in an output array.
- @param[out] dst Pointer to the destination array
- @param[in] src Pointer to the source array
- @param[in] cst The constant scalar to multiply the input values with
- @param[in] count The number of items in the input array
- */
-extern arm_result_t (*mulc_float)(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count);
-/*!
- Multiplies the components of 2D vectors in an input array by the components of a constant 2D vector and stores the results in an output array.
- @param[out] dst Pointer to the destination array
- @param[in] src Pointer to the source array
- @param[in] cst Pointer to the 2D vector to multiply the input values with
- @param[in] count The number of items in the input array
- */
-extern arm_result_t (*mulc_vec2f)(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count);
-/*!
- Multiplies the components of 3D vectors in an input array by the components of a constant 3D vector and stores the results in an output array.
- @param[out] dst Pointer to the destination array
- @param[in] src Pointer to the source array
- @param[in] cst Pointer to the 3D vector to multiply the input values with
- @param[in] count The number of items in the input array
- */
-extern arm_result_t (*mulc_vec3f)(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count);
-/*!
- Multiplies the components of 4D vectors in an input array by the components of a constant 4D vector and stores the results in an output array.
- @param[out] dst Pointer to the destination array
- @param[in] src Pointer to the source array
- @param[in] cst Pointer to the 4D vector to multiply the input values with
- @param[in] count The number of items in the input array
- */
-extern arm_result_t (*mulc_vec4f)(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count);
-
-
-
-/*!
- Divides the elements of an input array by a constant scalar and stores the results in an output array.
- @param[out] dst Pointer to the destination array
- @param[in] src Pointer to the source array
- @param[in] cst The constant scalar to divide the input values by
- @param[in] count The number of items in the input array
- */
-extern arm_result_t (*divc_float)(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count);
-/*!
- Divides the components of 2D vectors in an input array with the components of a constant 2D vector and stores the results in an output array.
- @param[out] dst Pointer to the destination array
- @param[in] src Pointer to the source array
- @param[in] cst Pointer to the 2D vector to divide the input values by
- @param[in] count The number of items in the input array
- */
-extern arm_result_t (*divc_vec2f)(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count);
-/*!
- Divides the components of 3D vectors in an input array with the components of a constant 3D vector and stores the results in an output array.
- @param[out] dst Pointer to the destination array
- @param[in] src Pointer to the source array
- @param[in] cst Pointer to the 3D vector to divide the input values by
- @param[in] count The number of items in the input array
- */
-extern arm_result_t (*divc_vec3f)(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count);
-/*!
- Divides the components of 4D vectors in an input array with the components of a constant 4D vector and stores the results in an output array.
- @param[out] dst Pointer to the destination array
- @param[in] src Pointer to the source array
- @param[in] cst Pointer to the 4D vector to divide the input values by
- @param[in] count The number of items in the input array
- */
-extern arm_result_t (*divc_vec4f)(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count);
-
-
-
-/*!
- Sets the elements of an input array to a constant scalar and stores the results in an output array.
- @param[out] dst Pointer to the destination array
- @param[in] cst The constant scalar to set the input values to
- @param[in] count The number of items in the input array
- */
-extern arm_result_t (*setc_float)(arm_float_t * dst, const arm_float_t cst, unsigned int count);
-/*!
- Sets the components of 2D vectors in an input array to the components of a constant 2D vector and stores the results in an output array.
- @param[out] dst Pointer to the destination array
- @param[in] cst Pointer to the 2D vector to set the input values to
- @param[in] count The number of items in the input array
- */
-extern arm_result_t (*setc_vec2f)(arm_vec2f_t * dst, const arm_vec2f_t * cst, unsigned int count);
-/*!
- Sets the components of 3D vectors in an input array to the components of a constant 3D vector and stores the results in an output array.
- @param[out] dst Pointer to the destination array
- @param[in] cst Pointer to the 3D vector to set the input values to
- @param[in] count The number of items in the input array
- */
-extern arm_result_t (*setc_vec3f)(arm_vec3f_t * dst, const arm_vec3f_t * cst, unsigned int count);
-/*!
- Sets the components of 4D vectors in an input array to the components of a constant 4D vector and stores the results in an output array.
- @param[out] dst Pointer to the destination array
- @param[in] cst Pointer to the 4D vector to set the input values to
- @param[in] count The number of items in the input array
- */
-extern arm_result_t (*setc_vec4f)(arm_vec4f_t * dst, const arm_vec4f_t * cst, unsigned int count);
-
-
-
-/*!
- Multiplies each entry in the source array (src) by cst, then adds the result to
- the corresponding item of the accumulation array (acc), and stores the result in the destination array.
- @param[out] dst Pointer to the destination array
- @param[in] acc Pointer to the accumulation array; the corresponding element is added to the result of the multiplication
- @param[in] src Pointer to the source array
- @param[in] cst The constant scalar to multiply the input elements with
- @param[in] count The number of items in the input array
- */
-extern arm_result_t (*mlac_float)(arm_float_t * dst, arm_float_t * acc, arm_float_t * src, const arm_float_t cst, unsigned int count);
-/*!
- Multiplies each entry in the source array (src) by the 2D vector cst, then adds the result to
- the corresponding item of the accumulation array (acc), and stores the result in the destination array.
- @param[out] dst Pointer to the destination array
- @param[in] acc Pointer to the accumulation array; the corresponding element is added to the result of the multiplication
- @param[in] src Pointer to the source array
- @param[in] cst Pointer to the 2D vector to multiply the input vectors with
- @param[in] count The number of items in the input array
- */
-extern arm_result_t (*mlac_vec2f)(arm_vec2f_t * dst, arm_vec2f_t * acc, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count);
-/*!
- Multiplies each entry in the source array (src) by the 3D vector cst, then adds the result to
- the corresponding item of the accumulation array (acc), and stores the result in the destination array.
- @param[out] dst Pointer to the destination array
- @param[in] acc Pointer to the accumulation array; the corresponding element is added to the result of the multiplication
- @param[in] src Pointer to the source array
- @param[in] cst Pointer to the 3D vector to multiply the input vectors with
- @param[in] count The number of items in the input array
- */
-extern arm_result_t (*mlac_vec3f)(arm_vec3f_t * dst, arm_vec3f_t * acc, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count);
-/*!
- Multiplies each entry in the source array (src) by the 4D vector cst, then adds the result to
- the corresponding item of the accumulation array (acc), and stores the result in the destination array.
- @param[out] dst Pointer to the destination array
- @param[in] acc Pointer to the accumulation array; the corresponding element is added to the result of the multiplication
- @param[in] src Pointer to the source array
- @param[in] cst Pointer to the 4D vector to multiply the input vectors with
- @param[in] count The number of items in the input array
- */
-extern arm_result_t (*mlac_vec4f)(arm_vec4f_t * dst, arm_vec4f_t * acc, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count);
-
-
-
-// ## Arithmetic functions over arrays of cst values ##
-
-/*!
- Adds the elements of src1 to the elements of src2 and stores the results in the dst.
- @param[out] dst Pointer to the destination array
- @param[in] src1 The first array to use as the input array
- @param[in] src2 The second array to use as the input array
- @param[in] count The number of items in the two input arrays
- */
-extern arm_result_t (*add_float)(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count);
-/*!
- Subtracts the elements of src2 from the elements of src1 and stores the results in the dst.
- @param[out] dst Pointer to the destination array
- @param[in] src1 The first array to use as the input array
- @param[in] src2 The second array to use as the input array
- @param[in] count The number of items in the two input arrays
- */
-extern arm_result_t (*sub_float)(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count);
-/*!
- Multiplies the elements of src1 by the elements of src2 and stores the results in the dst.
- @param[out] dst Pointer to the destination array
- @param[in] src1 The first array to use as the input array
- @param[in] src2 The second array to use as the input array
- @param[in] count The number of items in the two input arrays
- */
-extern arm_result_t (*mul_float)(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count);
-/*!
- Divides the elements of src1 by the elements of src2 and stores the results in the dst.
- @param[out] dst Pointer to the destination array
- @param[in] src1 The first array to use as the input array
- @param[in] src2 The second array to use as the input array
- @param[in] count The number of items in the two input arrays
- */
-extern arm_result_t (*div_float)(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count);
-/*!
- Performs a multiply and accumulate operation using the corresponding elements in acc, src1, and src2.
- @param[out] dst Pointer to the destination array
- @param[in] acc These elements are added to the result of the multiplication operation
- @param[in] src1 The first array to use as the input array
- @param[in] src2 The second array to use as the input array
- @param[in] count The number of items in the two input arrays
- */
-extern arm_result_t (*mla_float)(arm_float_t * dst, arm_float_t * acc, arm_float_t * src1, arm_float_t * src2, unsigned int count);
-/*!
- Calculates the absolute value of each element in the source array and stores the result in the corresponding entry of the destination array.
- @param[out] dst Pointer to the destination array
- @param[in] src Pointer to the source array
- @param[in] count The number of items in the input array
- */
-extern arm_result_t (*abs_float)(arm_float_t * dst, arm_float_t * src, unsigned int count);
-
-
-
-// ## Operations on Vectors ##
-/*!
- Returns length of 2D vectors in corresponding elements of the output array.
- @param[out] dst Pointer to the destination array
- @param[in] src Pointer to the source array
- @param[in] count The number of items in the input array
- */
-extern arm_result_t (*len_vec2f)(arm_float_t * dst, arm_vec2f_t * src, unsigned int count);
-/*!
- Returns length of 3D vectors in corresponding elements of the output array.
- @param[out] dst Pointer to the destination array
- @param[in] src Pointer to the source array
- @param[in] count The number of items in the input array
- */
-extern arm_result_t (*len_vec3f)(arm_float_t * dst, arm_vec3f_t * src, unsigned int count);
-/*!
- Returns length of 4D vectors in corresponding elements of the output array.
- @param[out] dst Pointer to the destination array
- @param[in] src Pointer to the source array
- @param[in] count The number of items in the input array
- */
-extern arm_result_t (*len_vec4f)(arm_float_t * dst, arm_vec4f_t * src, unsigned int count);
-
-
-
-/*!
- Normalizes 2D vectors of the input array and stores them in the corresponding elements of the output array.
- @param[out] dst Pointer to the destination array
- @param[in] src Pointer to the source array
- @param[in] count The number of items in the input array
- */
-extern arm_result_t (*normalize_vec2f)(arm_vec2f_t * dst, arm_vec2f_t * src, unsigned int count);
-/*!
- Normalizes 3D vectors of the input array and stores them in the corresponding elements of the output array.
- @param[out] dst Pointer to the destination array
- @param[in] src Pointer to the source array
- @param[in] count The number of items in the input array
- */
-extern arm_result_t (*normalize_vec3f)(arm_vec3f_t * dst, arm_vec3f_t * src, unsigned int count);
-/*!
- Normalizes 4D vectors of the input array and stores them in the corresponding elements of the output array.
- @param[out] dst Pointer to the destination array
- @param[in] src Pointer to the source array
- @param[in] count The number of items in the input array
- */
-extern arm_result_t (*normalize_vec4f)(arm_vec4f_t * dst, arm_vec4f_t * src, unsigned int count);
-
-
-
-
-/*!
- Generates a 2D vector from the absolute values of each of the components of an input vector
- @param[out] dst Pointer to the destination array
- @param[in] src Pointer to the source array
- @param[in] count The number of items in the input array
- */
-extern arm_result_t (*abs_vec2f)(arm_vec2f_t * dst, arm_vec2f_t * src, unsigned int count);
-/*!
- Generates a 3D vector from the absolute values of each of the components of an input vector
- @param[out] dst Pointer to the destination array
- @param[in] src Pointer to the source array
- @param[in] count The number of items in the input array
- */
-extern arm_result_t (*abs_vec3f)(arm_vec3f_t * dst, arm_vec3f_t * src, unsigned int count);
-/*!
- Generates a 4D vector from the absolute values of each of the components of an input vector
- @param[out] dst Pointer to the destination array
- @param[in] src Pointer to the source array
- @param[in] count The number of items in the input array
- */
-extern arm_result_t (*abs_vec4f)(arm_vec4f_t * dst, arm_vec4f_t * src, unsigned int count);
-
-
-
-// ## SIMD Component-wise Arithmetic on Two Vectors ##
-
-/*!
- Multiplies the components of a 2D vector with the corresponding components of another
- @param[out] dst Pointer to the destination array
- @param[in] src1 Pointer to the first source array
- @param[in] src2 Pointer to the second source array
- @param[in] count The number of items in the input arrays
- */
-extern arm_result_t (*vmul_vec2f)(arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
-/*!
- Multiplies the components of a 3D vector with the corresponding components of another
- @param[out] dst Pointer to the destination array
- @param[in] src1 Pointer to the first source array
- @param[in] src2 Pointer to the second source array
- @param[in] count The number of items in the input arrays
- */
-extern arm_result_t (*vmul_vec3f)(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
-/*!
- Multiplies the components of a 4D vector with the corresponding components of another
- @param[out] dst Pointer to the destination array
- @param[in] src1 Pointer to the first source array
- @param[in] src2 Pointer to the second source array
- @param[in] count The number of items in the input arrays
- */
-extern arm_result_t (*vmul_vec4f)(arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
-
-
-
-/*!
- Divides the components of a 2D vector with the corresponding components of another
- @param[out] dst Pointer to the destination array
- @param[in] src1 Pointer to the nominators' source array
- @param[in] src2 Pointer to the denominators' source array
- @param[in] count The number of items in the input arrays
- */
-extern arm_result_t (*vdiv_vec2f)(arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
-/*!
- Divides the components of a 3D vector with the corresponding components of another
- @param[out] dst Pointer to the destination array
- @param[in] src1 Pointer to the nominators' source array
- @param[in] src2 Pointer to the denominators' source array
- @param[in] count The number of items in the input arrays
- */
-extern arm_result_t (*vdiv_vec3f)(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
-/*!
- Divides the components of a 4D vector with the corresponding components of another
- @param[out] dst Pointer to the destination array
- @param[in] src1 Pointer to the nominators' source array
- @param[in] src2 Pointer to the denominators' source array
- @param[in] count The number of items in the input arrays
- */
-extern arm_result_t (*vdiv_vec4f)(arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
-
-
-
-/*!
- Performs a multiply and accumulate operation on the components of a 2D vector with the corresponding components of another
- @param[out] dst Pointer to the destination array
- @param[in] src1 Pointer to the first source array
- @param[in] src2 Pointer to the second source array
- @param[in] count The number of items in the input arrays
- */
-extern arm_result_t (*vmla_vec2f)(arm_vec2f_t * acc, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
-/*!
- Performs a multiply and accumulate operation on the components of a 3D vector with the corresponding components of another
- @param[out] dst Pointer to the destination array
- @param[in] src1 Pointer to the first source array
- @param[in] src2 Pointer to the second source array
- @param[in] count The number of items in the input arrays
- */
-extern arm_result_t (*vmla_vec3f)(arm_vec3f_t * acc, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
-/*!
- Performs a multiply and accumulate operation on the components of a 4D vector with the corresponding components of another
- @param[out] dst Pointer to the destination array
- @param[in] src1 Pointer to the first source array
- @param[in] src2 Pointer to the second source array
- @param[in] count The number of items in the input arrays
- */
-extern arm_result_t (*vmla_vec4f)(arm_vec4f_t * acc, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
-
-
-
-// ## Vector-Vector Algebra ##
-
-/*!
- Vector addition of two 2D vectors
- @param[out] dst Pointer to the destination array
- @param[in] src1 Pointer to the first source array
- @param[in] src2 Pointer to the second source array
- @param[in] count The number of items in the input arrays
- */
-extern arm_result_t (*add_vec2f)(arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
-/*!
- Vector addition of two 3D vectors
- @param[out] dst Pointer to the destination array
- @param[in] src1 Pointer to the first source array
- @param[in] src2 Pointer to the second source array
- @param[in] count The number of items in the input arrays
- */
-extern arm_result_t (*add_vec3f)(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
-/*!
- Vector addition of two 4D vectors
- @param[out] dst Pointer to the destination array
- @param[in] src1 Pointer to the first source array
- @param[in] src2 Pointer to the second source array
- @param[in] count The number of items in the input arrays
- */
-extern arm_result_t (*add_vec4f)(arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
-
-
-
-/*!
- Vector subtraction of two 2D vectors
- @param[out] dst Pointer to the destination array
- @param[in] src1 Pointer to the first source array
- @param[in] src2 Pointer to the second source array
- @param[in] count The number of items in the input arrays
- */
-extern arm_result_t (*sub_vec2f)(arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
-/*!
- Vector subtraction of two 3D vectors
- @param[out] dst Pointer to the destination array
- @param[in] src1 Pointer to the first source array
- @param[in] src2 Pointer to the second source array
- @param[in] count The number of items in the input arrays
- */
-extern arm_result_t (*sub_vec3f)(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
-/*!
- Vector subtraction of two 4D vectors
- @param[out] dst Pointer to the destination array
- @param[in] src1 Pointer to the first source array
- @param[in] src2 Pointer to the second source array
- @param[in] count The number of items in the input arrays
- */
-extern arm_result_t (*sub_vec4f)(arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
-
-
-
-/*!
- Dot product of two 2D vectors
- @param[out] dst Pointer to the destination array
- @param[in] src1 Pointer to the first source array
- @param[in] src2 Pointer to the second source array
- @param[in] count The number of items in the input arrays
- */
-extern arm_result_t (*dot_vec2f)(arm_float_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
-/*!
- Dot product of two 3D vectors
- @param[out] dst Pointer to the destination array
- @param[in] src1 Pointer to the first source array
- @param[in] src2 Pointer to the second source array
- @param[in] count The number of items in the input arrays
- */
-extern arm_result_t (*dot_vec3f)(arm_float_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
-/*!
- Dot product of two 4D vectors
- @param[out] dst Pointer to the destination array
- @param[in] src1 Pointer to the first source array
- @param[in] src2 Pointer to the second source array
- @param[in] count The number of items in the input arrays
- */
-extern arm_result_t (*dot_vec4f)(arm_float_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
-
-
-
-/*!
- Performs a cross product operation on the two input vectors
- @param[out] dst Pointer to the destination array
- @param[in] src1 Pointer to the first source array
- @param[in] src2 Pointer to the second source array
- @param[in] count The number of items in the input arrays
- */
-extern arm_result_t (*cross_vec3f)(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
-
-
-
-
-// ## Matrix-Constant Arithmetic ##
-
-// arm_mat4x4f_t
-extern arm_result_t (*addmat_4x4f)(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count);
-extern arm_result_t (*submat_4x4f)(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count);
-extern arm_result_t (*mulmat_4x4f)(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count);
-extern arm_result_t (*divmat_4x4f)(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count);
-extern arm_result_t (*setmat_4x4f)(arm_mat4x4f_t * dst, const arm_float_t cst, unsigned int count);
-
-extern arm_result_t (*addmat_3x3f)(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count);
-extern arm_result_t (*submat_3x3f)(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count);
-extern arm_result_t (*mulmat_3x3f)(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count);
-extern arm_result_t (*divmat_3x3f)(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count);
-extern arm_result_t (*setmat_3x3f)(arm_mat3x3f_t * dst, const arm_float_t cst, unsigned int count);
-
-extern arm_result_t (*addmat_2x2f)(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count);
-extern arm_result_t (*submat_2x2f)(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count);
-extern arm_result_t (*mulmat_2x2f)(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count);
-extern arm_result_t (*divmat_2x2f)(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count);
-extern arm_result_t (*setmat_2x2f)(arm_mat2x2f_t * dst, const arm_float_t cst, unsigned int count);
-
-
-
-// ## Operations on Matrices ##
-
-extern arm_result_t (*detmat_4x4f)(arm_float_t * dst, arm_mat4x4f_t * src, unsigned int count);
-extern arm_result_t (*detmat_3x3f)(arm_float_t * dst, arm_mat3x3f_t * src, unsigned int count);
-extern arm_result_t (*detmat_2x2f)(arm_float_t * dst, arm_mat2x2f_t * src, unsigned int count);
-
-extern arm_result_t (*invmat_4x4f)(arm_mat4x4f_t * dst, arm_mat4x4f_t * src, unsigned int count);
-extern arm_result_t (*invmat_3x3f)(arm_mat3x3f_t * dst, arm_mat3x3f_t * src, unsigned int count);
-extern arm_result_t (*invmat_2x2f)(arm_mat2x2f_t * dst, arm_mat2x2f_t * src, unsigned int count);
-
-extern arm_result_t (*transmat_4x4f)(arm_mat4x4f_t * dst, arm_mat4x4f_t * src, unsigned int count);
-extern arm_result_t (*identitymat_4x4f)(arm_mat4x4f_t * dst, unsigned int count);
-
-extern arm_result_t (*transmat_3x3f)(arm_mat3x3f_t * dst, arm_mat3x3f_t * src, unsigned int count);
-extern arm_result_t (*identitymat_3x3f)(arm_mat3x3f_t * dst, unsigned int count);
-
-extern arm_result_t (*transmat_2x2f)(arm_mat2x2f_t * dst, arm_mat2x2f_t * src, unsigned int count);
-extern arm_result_t (*identitymat_2x2f)(arm_mat2x2f_t * dst, unsigned int count);
-
-
-
-// ## Matrix-Vector Algebra ##
-extern arm_result_t (*mulcmatvec_cm4x4f_v4f)(arm_vec4f_t * dst, const arm_mat4x4f_t * cst, arm_vec4f_t * src, unsigned int count);
-extern arm_result_t (*mulcmatvec_cm3x3f_v3f)(arm_vec3f_t * dst, const arm_mat3x3f_t * cst, arm_vec3f_t * src, unsigned int count);
-extern arm_result_t (*mulcmatvec_cm2x2f_v2f)(arm_vec2f_t * dst, const arm_mat2x2f_t * cst, arm_vec2f_t * src, unsigned int count);
-
-
-// ## Matrix-Matrix Algebra ##
-extern arm_result_t (*multrans_mat4x4f)(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count);
-extern arm_result_t (*multrans_mat3x3f)(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count);
-extern arm_result_t (*multrans_mat2x2f)(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count);
+#include "NE10_types.h"
+#include "NE10_init.h"
+#include "NE10_math.h"
#ifdef __cplusplus
}
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : inc/NE10_asm.h
- */
-
-//#include "../headers/versionheader.h"
-
-#ifndef NE10_ASM_H
-#define NE10_ASM_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-///////////////////////////
-// function prototypes:
-///////////////////////////
-
-// ## Vector-Constant Arithmetic ##
-
-extern arm_result_t addc_float_asm(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count);
-extern arm_result_t addc_vec2f_asm(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count);
-extern arm_result_t addc_vec3f_asm(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count);
-extern arm_result_t addc_vec4f_asm(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count);
-
-
-
-extern arm_result_t subc_float_asm(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count); // subtract cst from the element(s)
-extern arm_result_t subc_vec2f_asm(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count); // subtract cst from the element(s)
-extern arm_result_t subc_vec3f_asm(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count); // subtract cst from the element(s)
-extern arm_result_t subc_vec4f_asm(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count); // subtract cst from the element(s)
-
-
-
-extern arm_result_t rsbc_float_asm(arm_float_t * dst, arm_float_t *src, const arm_float_t cst, unsigned int count); // subtract element(s) from a cst
-extern arm_result_t rsbc_vec2f_asm(arm_vec2f_t * dst, arm_vec2f_t *src, const arm_vec2f_t * cst, unsigned int count); // subtract element(s) from a cst
-extern arm_result_t rsbc_vec3f_asm(arm_vec3f_t * dst, arm_vec3f_t *src, const arm_vec3f_t * cst, unsigned int count); // subtract element(s) from a cst
-extern arm_result_t rsbc_vec4f_asm(arm_vec4f_t * dst, arm_vec4f_t *src, const arm_vec4f_t * cst, unsigned int count); // subtract element(s) from a cst
-
-
-
-extern arm_result_t mulc_float_asm(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count);
-extern arm_result_t mulc_vec2f_asm(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count);
-extern arm_result_t mulc_vec3f_asm(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count);
-extern arm_result_t mulc_vec4f_asm(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count);
-
-
-
-extern arm_result_t divc_float_asm(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count);
-extern arm_result_t divc_vec2f_asm(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count);
-extern arm_result_t divc_vec3f_asm(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count);
-extern arm_result_t divc_vec4f_asm(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count);
-
-
-
-extern arm_result_t setc_float_asm(arm_float_t * dst, const arm_float_t cst, unsigned int count);
-extern arm_result_t setc_vec2f_asm(arm_vec2f_t * dst, const arm_vec2f_t * cst, unsigned int count);
-extern arm_result_t setc_vec3f_asm(arm_vec3f_t * dst, const arm_vec3f_t * cst, unsigned int count);
-extern arm_result_t setc_vec4f_asm(arm_vec4f_t * dst, const arm_vec4f_t * cst, unsigned int count);
-
-
-
-extern arm_result_t mlac_float_asm(arm_float_t * dst, arm_float_t * acc, arm_float_t * src, const arm_float_t cst, unsigned int count);
-extern arm_result_t mlac_vec2f_asm(arm_vec2f_t * dst, arm_vec2f_t * acc, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count);
-extern arm_result_t mlac_vec3f_asm(arm_vec3f_t * dst, arm_vec3f_t * acc, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count);
-extern arm_result_t mlac_vec4f_asm(arm_vec4f_t * dst, arm_vec4f_t * acc, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count);
-
-
-
-// ## Arithmetic functions over arrays of cst values ##
-extern arm_result_t add_float_asm(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count);
-extern arm_result_t sub_float_asm(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count);
-extern arm_result_t mul_float_asm(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count);
-extern arm_result_t div_float_asm(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count);
-extern arm_result_t mla_float_asm(arm_float_t * dst, arm_float_t * acc, arm_float_t * src1, arm_float_t * src2, unsigned int count);
-extern arm_result_t abs_float_asm(arm_float_t * dst, arm_float_t * src, unsigned int count);
-
-// ## Operations on Vectors ##
-extern arm_result_t len_vec2f_asm(arm_float_t * dst, arm_vec2f_t * src, unsigned int count);
-extern arm_result_t len_vec3f_asm(arm_float_t * dst, arm_vec3f_t * src, unsigned int count);
-extern arm_result_t len_vec4f_asm(arm_float_t * dst, arm_vec4f_t * src, unsigned int count);
-
-
-
-extern arm_result_t normalize_vec2f_asm(arm_vec2f_t * dst, arm_vec2f_t * src, unsigned int count);
-extern arm_result_t normalize_vec3f_asm(arm_vec3f_t * dst, arm_vec3f_t * src, unsigned int count);
-extern arm_result_t normalize_vec4f_asm(arm_vec4f_t * dst, arm_vec4f_t * src, unsigned int count);
-
-
-
-extern arm_result_t abs_vec2f_asm(arm_vec2f_t * dst, arm_vec2f_t * src, unsigned int count);
-extern arm_result_t abs_vec3f_asm(arm_vec3f_t * dst, arm_vec3f_t * src, unsigned int count);
-extern arm_result_t abs_vec4f_asm(arm_vec4f_t * dst, arm_vec4f_t * src, unsigned int count);
-
-
-
-// ## SIMD Component-wise Arithmetic on Two Vectors ##
-extern arm_result_t vmul_vec2f_asm(arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
-extern arm_result_t vmul_vec3f_asm(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
-extern arm_result_t vmul_vec4f_asm(arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
-
-
-
-extern arm_result_t vdiv_vec2f_asm(arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
-extern arm_result_t vdiv_vec3f_asm(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
-extern arm_result_t vdiv_vec4f_asm(arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
-
-
-
-extern arm_result_t vmla_vec2f_asm(arm_vec2f_t * dst, arm_vec2f_t * acc, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
-extern arm_result_t vmla_vec3f_asm(arm_vec3f_t * dst, arm_vec3f_t * acc, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
-extern arm_result_t vmla_vec4f_asm(arm_vec4f_t * dst, arm_vec4f_t * acc, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
-
-
-
-// ## Vector-Vector Algebra ##
-extern arm_result_t add_vec2f_asm(arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
-extern arm_result_t add_vec3f_asm(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
-extern arm_result_t add_vec4f_asm(arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
-
-
-
-extern arm_result_t sub_vec2f_asm(arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
-extern arm_result_t sub_vec3f_asm(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
-extern arm_result_t sub_vec4f_asm(arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
-
-
-
-extern arm_result_t dot_vec2f_asm(arm_float_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
-extern arm_result_t dot_vec3f_asm(arm_float_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
-extern arm_result_t dot_vec4f_asm(arm_float_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
-
-
-
-extern arm_result_t cross_vec3f_asm(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
-
-
-// ## Matrix-Constant Arithmetic ##
-
-// arm_mat4x4f_t
-extern arm_result_t addmat_4x4f_asm(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count);
-extern arm_result_t submat_4x4f_asm(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count);
-extern arm_result_t mulmat_4x4f_asm(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count);
-extern arm_result_t divmat_4x4f_asm(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count);
-extern arm_result_t setmat_4x4f_asm(arm_mat4x4f_t * dst, const arm_float_t cst, unsigned int count);
-
-extern arm_result_t addmat_3x3f_asm(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count);
-extern arm_result_t submat_3x3f_asm(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count);
-extern arm_result_t mulmat_3x3f_asm(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count);
-extern arm_result_t divmat_3x3f_asm(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count);
-extern arm_result_t setmat_3x3f_asm(arm_mat3x3f_t * dst, const arm_float_t cst, unsigned int count);
-
-extern arm_result_t addmat_2x2f_asm(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count);
-extern arm_result_t submat_2x2f_asm(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count);
-extern arm_result_t mulmat_2x2f_asm(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count);
-extern arm_result_t divmat_2x2f_asm(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count);
-extern arm_result_t setmat_2x2f_asm(arm_mat2x2f_t * dst, const arm_float_t cst, unsigned int count);
-
-
-
-// ## Operations on Matrices ##
-
-extern arm_result_t detmat_4x4f_asm(arm_float_t * dst, arm_mat4x4f_t * src, unsigned int count);
-extern arm_result_t detmat_3x3f_asm(arm_float_t * dst, arm_mat3x3f_t * src, unsigned int count);
-extern arm_result_t detmat_2x2f_asm(arm_float_t * dst, arm_mat2x2f_t * src, unsigned int count);
-
-extern arm_result_t invmat_4x4f_asm(arm_mat4x4f_t * dst, arm_mat4x4f_t * src, unsigned int count);
-extern arm_result_t invmat_3x3f_asm(arm_mat3x3f_t * dst, arm_mat3x3f_t * src, unsigned int count);
-extern arm_result_t invmat_2x2f_asm(arm_mat2x2f_t * dst, arm_mat2x2f_t * src, unsigned int count);
-
-extern arm_result_t transmat_4x4f_asm(arm_mat4x4f_t * dst, arm_mat4x4f_t * src, unsigned int count);
-extern arm_result_t identitymat_4x4f_asm(arm_mat4x4f_t * dst, unsigned int count);
-
-extern arm_result_t transmat_3x3f_asm(arm_mat3x3f_t * dst, arm_mat3x3f_t * src, unsigned int count);
-extern arm_result_t identitymat_3x3f_asm(arm_mat3x3f_t * dst, unsigned int count);
-
-extern arm_result_t trans_mat2x2f_asm(arm_mat2x2f_t * dst, arm_mat2x2f_t * src, unsigned int count);
-extern arm_result_t identity_mat2x2f_asm(arm_mat2x2f_t * dst, unsigned int count);
-
-
-
-// ## Matrix-Vector Algebra ##
-extern arm_result_t mulcmatvec_cm4x4f_v4f_asm(arm_vec4f_t * dst, const arm_mat4x4f_t * cst, arm_vec4f_t * src, unsigned int count);
-extern arm_result_t mulcmatvec_cm3x3f_v3f_asm(arm_vec3f_t * dst, const arm_mat3x3f_t * cst, arm_vec3f_t * src, unsigned int count);
-extern arm_result_t mulcmatvec_cm2x2f_v2f_asm(arm_vec2f_t * dst, const arm_mat2x2f_t * cst, arm_vec2f_t * src, unsigned int count);
-
-
-
-
-// ## Matrix-Matrix Algebra ##
-extern arm_result_t multrans_mat4x4f_asm(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count);
-extern arm_result_t multrans_mat3x3f_asm(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count);
-extern arm_result_t multrans_mat2x2f_asm(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : inc/NE10_c.h
- */
-
-//#include "../headers/versionheader.h"
-#include <NE10_types.h>
-
-#ifndef NE10_C_H
-#define NE10_C_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-///////////////////////////
-// function prototypes:
-///////////////////////////
-
-
-// ## Vector-Constant Arithmetic ##
-
-extern arm_result_t addc_float_c(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count);
-extern arm_result_t addc_vec2f_c(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count);
-extern arm_result_t addc_vec3f_c(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count);
-extern arm_result_t addc_vec4f_c(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count);
-
-
-
-extern arm_result_t subc_float_c(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count); // subtract cst from the element(s)
-extern arm_result_t subc_vec2f_c(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count); // subtract cst from the element(s)
-extern arm_result_t subc_vec3f_c(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count); // subtract cst from the element(s)
-extern arm_result_t subc_vec4f_c(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count); // subtract cst from the element(s)
-
-
-
-extern arm_result_t rsbc_float_c(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count); // subtract element(s) from a cst
-extern arm_result_t rsbc_vec2f_c(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count); // subtract element(s) from a cst
-extern arm_result_t rsbc_vec3f_c(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count); // subtract element(s) from a cst
-extern arm_result_t rsbc_vec4f_c(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count); // subtract element(s) from a cst
-
-
-
-extern arm_result_t mulc_float_c(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count);
-extern arm_result_t mulc_vec2f_c(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count);
-extern arm_result_t mulc_vec3f_c(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count);
-extern arm_result_t mulc_vec4f_c(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count);
-
-
-
-extern arm_result_t divc_float_c(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count);
-extern arm_result_t divc_vec2f_c(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count);
-extern arm_result_t divc_vec3f_c(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count);
-extern arm_result_t divc_vec4f_c(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count);
-
-
-
-extern arm_result_t setc_float_c(arm_float_t * dst, const arm_float_t cst, unsigned int count);
-extern arm_result_t setc_vec2f_c(arm_vec2f_t * dst, const arm_vec2f_t * cst, unsigned int count);
-extern arm_result_t setc_vec3f_c(arm_vec3f_t * dst, const arm_vec3f_t * cst, unsigned int count);
-extern arm_result_t setc_vec4f_c(arm_vec4f_t * dst, const arm_vec4f_t * cst, unsigned int count);
-
-
-
-extern arm_result_t mlac_float_c(arm_float_t * dst, arm_float_t * acc, arm_float_t * src, const arm_float_t cst, unsigned int count);
-extern arm_result_t mlac_vec2f_c(arm_vec2f_t * dst, arm_vec2f_t * acc, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count);
-extern arm_result_t mlac_vec3f_c(arm_vec3f_t * dst, arm_vec3f_t * acc, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count);
-extern arm_result_t mlac_vec4f_c(arm_vec4f_t * dst, arm_vec4f_t * acc, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count);
-
-// ## Arithmetic functions over arrays of cst values ##
-extern arm_result_t add_float_c(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count);
-extern arm_result_t sub_float_c(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count);
-extern arm_result_t mul_float_c(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count);
-extern arm_result_t div_float_c(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count);
-extern arm_result_t mla_float_c(arm_float_t * dst, arm_float_t * acc, arm_float_t * src1, arm_float_t * src2, unsigned int count);
-extern arm_result_t abs_float_c(arm_float_t * dst, arm_float_t * src, unsigned int count);
-
-// ## Operations on Vectors ##
-extern arm_result_t len_vec2f_c(arm_float_t * dst, arm_vec2f_t * src, unsigned int count);
-extern arm_result_t len_vec3f_c(arm_float_t * dst, arm_vec3f_t * src, unsigned int count);
-extern arm_result_t len_vec4f_c(arm_float_t * dst, arm_vec4f_t * src, unsigned int count);
-
-
-
-extern arm_result_t normalize_vec2f_c(arm_vec2f_t * dst, arm_vec2f_t * src, unsigned int count);
-extern arm_result_t normalize_vec3f_c(arm_vec3f_t * dst, arm_vec3f_t * src, unsigned int count);
-extern arm_result_t normalize_vec4f_c(arm_vec4f_t * dst, arm_vec4f_t * src, unsigned int count);
-
-
-
-extern arm_result_t abs_vec2f_c(arm_vec2f_t * dst, arm_vec2f_t * src, unsigned int count);
-extern arm_result_t abs_vec3f_c(arm_vec3f_t * dst, arm_vec3f_t * src, unsigned int count);
-extern arm_result_t abs_vec4f_c(arm_vec4f_t * dst, arm_vec4f_t * src, unsigned int count);
-
-
-
-// ## SIMD Component-wise Arithmetic on Two Vectors ##
-extern arm_result_t vmul_vec2f_c(arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
-extern arm_result_t vmul_vec3f_c(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
-extern arm_result_t vmul_vec4f_c(arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
-
-
-
-extern arm_result_t vdiv_vec2f_c(arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
-extern arm_result_t vdiv_vec3f_c(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
-extern arm_result_t vdiv_vec4f_c(arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
-
-
-
-extern arm_result_t vmla_vec2f_c(arm_vec2f_t * dst, arm_vec2f_t * acc, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
-extern arm_result_t vmla_vec3f_c(arm_vec3f_t * dst, arm_vec3f_t * acc, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
-extern arm_result_t vmla_vec4f_c(arm_vec4f_t * dst, arm_vec4f_t * acc, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
-
-
-
-// ## Vector-Vector Algebra ##
-extern arm_result_t add_vec2f_c(arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
-extern arm_result_t add_vec3f_c(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
-extern arm_result_t add_vec4f_c(arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
-
-
-
-extern arm_result_t sub_vec2f_c(arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
-extern arm_result_t sub_vec3f_c(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
-extern arm_result_t sub_vec4f_c(arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
-
-
-
-extern arm_result_t dot_vec2f_c(arm_float_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
-extern arm_result_t dot_vec3f_c(arm_float_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
-extern arm_result_t dot_vec4f_c(arm_float_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
-
-
-
-extern arm_result_t cross_vec3f_c(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
-
-
-
-// ## Matrix-Constant Arithmetic ##
-
-// arm_mat4x4f_t
-extern arm_result_t addmat_4x4f_c(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count);
-extern arm_result_t submat_4x4f_c(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count);
-extern arm_result_t mulmat_4x4f_c(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count);
-extern arm_result_t divmat_4x4f_c(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count);
-extern arm_result_t setmat_4x4f_c(arm_mat4x4f_t * dst, const arm_float_t cst, unsigned int count);
-
-extern arm_result_t addmat_3x3f_c(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count);
-extern arm_result_t submat_3x3f_c(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count);
-extern arm_result_t mulmat_3x3f_c(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count);
-extern arm_result_t divmat_3x3f_c(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count);
-extern arm_result_t setmat_3x3f_c(arm_mat3x3f_t * dst, const arm_float_t cst, unsigned int count);
-
-extern arm_result_t addmat_2x2f_c(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count);
-extern arm_result_t submat_2x2f_c(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count);
-extern arm_result_t mulmat_2x2f_c(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count);
-extern arm_result_t divmat_2x2f_c(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count);
-extern arm_result_t setmat_2x2f_c(arm_mat2x2f_t * dst, const arm_float_t cst, unsigned int count);
-
-
-
-// ## Operations on Matrices ##
-
-extern arm_result_t detmat_4x4f_c(arm_float_t * dst, arm_mat4x4f_t * src, unsigned int count);
-extern arm_result_t detmat_3x3f_c(arm_float_t * dst, arm_mat3x3f_t * src, unsigned int count);
-extern arm_result_t detmat_2x2f_c(arm_float_t * dst, arm_mat2x2f_t * src, unsigned int count);
-
-extern arm_result_t invmat_4x4f_c(arm_mat4x4f_t * dst, arm_mat4x4f_t * src, unsigned int count);
-extern arm_result_t invmat_3x3f_c(arm_mat3x3f_t * dst, arm_mat3x3f_t * src, unsigned int count);
-extern arm_result_t invmat_2x2f_c(arm_mat2x2f_t * dst, arm_mat2x2f_t * src, unsigned int count);
-
-extern arm_result_t transmat_4x4f_c(arm_mat4x4f_t * dst, arm_mat4x4f_t * src, unsigned int count);
-extern arm_result_t identitymat_4x4f_c(arm_mat4x4f_t * dst, unsigned int count);
-
-extern arm_result_t transmat_3x3f_c(arm_mat3x3f_t * dst, arm_mat3x3f_t * src, unsigned int count);
-extern arm_result_t identitymat_3x3f_c(arm_mat3x3f_t * dst, unsigned int count);
-
-extern arm_result_t transmat_2x2f_c(arm_mat2x2f_t * dst, arm_mat2x2f_t * src, unsigned int count);
-extern arm_result_t identitymat_2x2f_c(arm_mat2x2f_t * dst, unsigned int count);
-
-
-
-// ## Matrix-Vector Algebra ##
-extern arm_result_t mulcmatvec_cm4x4f_v4f_c(arm_vec4f_t * dst, const arm_mat4x4f_t * cst, arm_vec4f_t * src, unsigned int count);
-extern arm_result_t mulcmatvec_cm3x3f_v3f_c(arm_vec3f_t * dst, const arm_mat3x3f_t * cst, arm_vec3f_t * src, unsigned int count);
-extern arm_result_t mulcmatvec_cm2x2f_v2f_c(arm_vec2f_t * dst, const arm_mat2x2f_t * cst, arm_vec2f_t * src, unsigned int count);
-
-
-// ## Matrix-Matrix Algebra ##
-extern arm_result_t multrans_mat4x4f_c(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count);
-extern arm_result_t multrans_mat3x3f_c(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count);
-extern arm_result_t multrans_mat2x2f_c(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
--- /dev/null
+/*
+ * Copyright 2011-12 ARM Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "NE10.h"
+
+#ifndef NE10_init_H
+#define NE10_init_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!
+ Reports whether the running platform supports NEON.
+ Declared with an explicit (void) parameter list so that C callers get a real prototype
+ (an empty list in C leaves the parameters unspecified and disables argument checking).
+ @return NE10_OK if the running platform supports NEON, otherwise NE10_ERR
+ */
+extern ne10_result_t NE10_HasNEON(void);
+
+/*!
+ This routine initializes all the function pointers.
+ @return A ne10_result_t status code
+ */
+extern ne10_result_t NE10_init(void);
+
+/*!
+ This routine initializes all the math function pointers defined in "NE10_math.h" with pointers to ARM NEON or ARM VFP implementations.
+ @param[in] is_NEON_available Flag describing whether NEON is available; presumably a non-zero value selects the NEON implementations (TODO confirm against the implementation)
+ @return A ne10_result_t status code
+ */
+extern ne10_result_t NE10_init_math(ne10_int32_t is_NEON_available);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- /dev/null
+/*
+ * Copyright 2011-12 ARM Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NE10 Library : inc/NE10_math.h
+ */
+
+/*! \file NE10_math.h
+ \brief All NE10 math routines declarations.
+
+ The routines that are provided by this library are all declared in this header file.
+ */
+
+#include <NE10_types.h>
+
+#ifndef NE10_MATH_H
+#define NE10_MATH_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+///////////////////////////
+// function prototypes:
+///////////////////////////
+
+
+// ## Vector-Constant Arithmetic ##
+
+/*!
+ Adds a constant scalar value to all the elements of an input array and stores the results in an output array.
+ @param[out] dst Pointer to the destination array
+ @param[in] src Pointer to the source array
+ @param[in] cst The constant scalar added to the input values
+ @param[in] count The number of items in the input array
+ */
+extern ne10_result_t (*addc_float)(ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
+/*!
+ Adds a constant 2D vector to all of the vectors in an input array and stores the results in an output array.
+ @param[out] dst Pointer to the destination array
+ @param[in] src Pointer to the source array
+ @param[in] cst Pointer to the 2D vector added to the input values
+ @param[in] count The number of items in the input array
+ */
+extern ne10_result_t (*addc_vec2f)(ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
+/*!
+ Adds a constant 3D vector to all of the vectors in an input array and stores the results in an output array.
+ @param[out] dst Pointer to the destination array
+ @param[in] src Pointer to the source array
+ @param[in] cst Pointer to the 3D vector added to the input values
+ @param[in] count The number of items in the input array
+ */
+extern ne10_result_t (*addc_vec3f)(ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
+/*!
+ Adds a constant 4D vector to all of the vectors in an input array and stores the results in an output array.
+ @param[out] dst Pointer to the destination array
+ @param[in] src Pointer to the source array
+ @param[in] cst Pointer to the 4D vector added to the input values
+ @param[in] count The number of items in the input array
+ */
+extern ne10_result_t (*addc_vec4f)(ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
+
+
+
+/*!
+ Subtracts a constant scalar from all the elements of an input array and stores the results in an output array.
+ @param[out] dst Pointer to the destination array
+ @param[in] src Pointer to the source array
+ @param[in] cst The constant scalar subtracted from the input values
+ @param[in] count The number of items in the input array
+ */
+extern ne10_result_t (*subc_float)(ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
+/*!
+ Subtracts a constant 2D vector from all of the vectors in an input array and stores the results in an output array.
+ @param[out] dst Pointer to the destination array
+ @param[in] src Pointer to the source array
+ @param[in] cst Pointer to the 2D vector subtracted from the input values
+ @param[in] count The number of items in the input array
+ */
+extern ne10_result_t (*subc_vec2f)(ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
+/*!
+ Subtracts a constant 3D vector from all of the vectors in an input array and stores the results in an output array.
+ @param[out] dst Pointer to the destination array
+ @param[in] src Pointer to the source array
+ @param[in] cst Pointer to the 3D vector subtracted from the input values
+ @param[in] count The number of items in the input array
+ */
+extern ne10_result_t (*subc_vec3f)(ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
+/*!
+ Subtracts a constant 4D vector from all of the vectors in an input array and stores the results in an output array.
+ @param[out] dst Pointer to the destination array
+ @param[in] src Pointer to the source array
+ @param[in] cst Pointer to the 4D vector subtracted from the input values
+ @param[in] count The number of items in the input array
+ */
+extern ne10_result_t (*subc_vec4f)(ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
+
+
+
+/*!
+ Subtracts the elements of an input array from a constant scalar and stores the results in an output array.
+ @param[out] dst Pointer to the destination array
+ @param[in] src Pointer to the source array
+ @param[in] cst The constant scalar to subtract the input values from
+ @param[in] count The number of items in the input array
+ */
+extern ne10_result_t (*rsbc_float)(ne10_float32_t * dst, ne10_float32_t *src, const ne10_float32_t cst, ne10_uint32_t count);
+/*!
+ Subtracts the vectors in an input array from a constant 2D vector and stores the results in an output array.
+ @param[out] dst Pointer to the destination array
+ @param[in] src Pointer to the source array
+ @param[in] cst Pointer to the 2D vector to subtract the input values from
+ @param[in] count The number of items in the input array
+ */
+extern ne10_result_t (*rsbc_vec2f)(ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
+/*!
+ Subtracts the vectors in an input array from a constant 3D vector and stores the results in an output array.
+ @param[out] dst Pointer to the destination array
+ @param[in] src Pointer to the source array
+ @param[in] cst Pointer to the 3D vector to subtract the input values from
+ @param[in] count The number of items in the input array
+ */
+extern ne10_result_t (*rsbc_vec3f)(ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
+/*!
+ Subtracts the vectors in an input array from a constant 4D vector and stores the results in an output array.
+ @param[out] dst Pointer to the destination array
+ @param[in] src Pointer to the source array
+ @param[in] cst Pointer to the 4D vector to subtract the input values from
+ @param[in] count The number of items in the input array
+ */
+extern ne10_result_t (*rsbc_vec4f)(ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
+
+
+
+/*!
+ Multiplies the elements of an input array by a constant scalar and stores the results in an output array.
+ @param[out] dst Pointer to the destination array
+ @param[in] src Pointer to the source array
+ @param[in] cst The constant scalar to multiply the input values with
+ @param[in] count The number of items in the input array
+ */
+extern ne10_result_t (*mulc_float)(ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
+/*!
+ Multiplies the components of 2D vectors in an input array by the components of a constant 2D vector and stores the results in an output array.
+ @param[out] dst Pointer to the destination array
+ @param[in] src Pointer to the source array
+ @param[in] cst Pointer to the 2D vector to multiply the input values with
+ @param[in] count The number of items in the input array
+ */
+extern ne10_result_t (*mulc_vec2f)(ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
+/*!
+ Multiplies the components of 3D vectors in an input array by the components of a constant 3D vector and stores the results in an output array.
+ @param[out] dst Pointer to the destination array
+ @param[in] src Pointer to the source array
+ @param[in] cst Pointer to the 3D vector to multiply the input values with
+ @param[in] count The number of items in the input array
+ */
+extern ne10_result_t (*mulc_vec3f)(ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
+/*!
+ Multiplies the components of 4D vectors in an input array by the components of a constant 4D vector and stores the results in an output array.
+ @param[out] dst Pointer to the destination array
+ @param[in] src Pointer to the source array
+ @param[in] cst Pointer to the 4D vector to multiply the input values with
+ @param[in] count The number of items in the input array
+ */
+extern ne10_result_t (*mulc_vec4f)(ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
+
+
+
+/*!
+ Divides the elements of an input array by a constant scalar and stores the results in an output array.
+ @param[out] dst Pointer to the destination array
+ @param[in] src Pointer to the source array
+ @param[in] cst The constant scalar to divide the input values by
+ @param[in] count The number of items in the input array
+ */
+extern ne10_result_t (*divc_float)(ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
+/*!
+ Divides the components of 2D vectors in an input array with the components of a constant 2D vector and stores the results in an output array.
+ @param[out] dst Pointer to the destination array
+ @param[in] src Pointer to the source array
+ @param[in] cst Pointer to the 2D vector to divide the input values by
+ @param[in] count The number of items in the input array
+ */
+extern ne10_result_t (*divc_vec2f)(ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
+/*!
+ Divides the components of 3D vectors in an input array with the components of a constant 3D vector and stores the results in an output array.
+ @param[out] dst Pointer to the destination array
+ @param[in] src Pointer to the source array
+ @param[in] cst Pointer to the 3D vector to divide the input values by
+ @param[in] count The number of items in the input array
+ */
+extern ne10_result_t (*divc_vec3f)(ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
+/*!
+ Divides the components of 4D vectors in an input array with the components of a constant 4D vector and stores the results in an output array.
+ @param[out] dst Pointer to the destination array
+ @param[in] src Pointer to the source array
+ @param[in] cst Pointer to the 4D vector to divide the input values by
+ @param[in] count The number of items in the input array
+ */
+extern ne10_result_t (*divc_vec4f)(ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
+
+
+
+/*!
+ Sets the elements of an input array to a constant scalar and stores the results in an output array.
+ @param[out] dst Pointer to the destination array
+ @param[in] cst The constant scalar to set the input values to
+ @param[in] count The number of items in the input array
+ */
+extern ne10_result_t (*setc_float)(ne10_float32_t * dst, const ne10_float32_t cst, ne10_uint32_t count);
+/*!
+ Sets the components of 2D vectors in an input array to the components of a constant 2D vector and stores the results in an output array.
+ @param[out] dst Pointer to the destination array
+ @param[in] cst Pointer to the 2D vector to set the input values to
+ @param[in] count The number of items in the input array
+ */
+extern ne10_result_t (*setc_vec2f)(ne10_vec2f_t * dst, const ne10_vec2f_t * cst, ne10_uint32_t count);
+/*!
+ Sets the components of 3D vectors in an input array to the components of a constant 3D vector and stores the results in an output array.
+ @param[out] dst Pointer to the destination array
+ @param[in] cst Pointer to the 3D vector to set the input values to
+ @param[in] count The number of items in the input array
+ */
+extern ne10_result_t (*setc_vec3f)(ne10_vec3f_t * dst, const ne10_vec3f_t * cst, ne10_uint32_t count);
+/*!
+ Sets the components of 4D vectors in an input array to the components of a constant 4D vector and stores the results in an output array.
+ @param[out] dst Pointer to the destination array
+ @param[in] cst Pointer to the 4D vector to set the input values to
+ @param[in] count The number of items in the input array
+ */
+extern ne10_result_t (*setc_vec4f)(ne10_vec4f_t * dst, const ne10_vec4f_t * cst, ne10_uint32_t count);
+
+
+
+/*!
+ Multiplies each entry in the source array (src) by cst, then adds the result to
+ the corresponding item of the accumulation array (acc), and stores the result in the destination array.
+ @param[out] dst Pointer to the destination array
+ @param[in] acc The corresponding element is added to the result of the multiplication
+ @param[in] src Pointer to the source array
+ @param[in] cst The constant scalar to multiply the input elements with
+ @param[in] count The number of items in the input array
+ */
+extern ne10_result_t (*mlac_float)(ne10_float32_t * dst, ne10_float32_t * acc, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
+/*!
+ Multiplies each entry in the source array (src) by the 2D vector cst, then adds the result to
+ the corresponding item of the accumulation array (acc), and stores the result in the destination array.
+ @param[out] dst Pointer to the destination array
+ @param[in] acc The corresponding element is added to the result of the multiplication
+ @param[in] src Pointer to the source array
+ @param[in] cst Pointer to the 2D vector to multiply the input vectors with
+ @param[in] count The number of items in the input array
+ */
+extern ne10_result_t (*mlac_vec2f)(ne10_vec2f_t * dst, ne10_vec2f_t * acc, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
+/*!
+ Multiplies each entry in the source array (src) by the 3D vector cst, then adds the result to
+ the corresponding item of the accumulation array (acc), and stores the result in the destination array.
+ @param[out] dst Pointer to the destination array
+ @param[in] acc The corresponding element is added to the result of the multiplication
+ @param[in] src Pointer to the source array
+ @param[in] cst Pointer to the 3D vector to multiply the input vectors with
+ @param[in] count The number of items in the input array
+ */
+extern ne10_result_t (*mlac_vec3f)(ne10_vec3f_t * dst, ne10_vec3f_t * acc, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
+/*!
+ Multiplies each entry in the source array (src) by the 4D vector cst, then adds the result to
+ the corresponding item of the accumulation array (acc), and stores the result in the destination array.
+ @param[out] dst Pointer to the destination array
+ @param[in] acc The corresponding element is added to the result of the multiplication
+ @param[in] src Pointer to the source array
+ @param[in] cst Pointer to the 4D vector to multiply the input vectors with
+ @param[in] count The number of items in the input array
+ */
+extern ne10_result_t (*mlac_vec4f)(ne10_vec4f_t * dst, ne10_vec4f_t * acc, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
+
+
+
+// ## Arithmetic functions over arrays of cst values ##
+
+/*!
+ Adds the elements of src1 to the elements of src2 and stores the results in the dst.
+ @param[out] dst Pointer to the destination array
+ @param[in] src1 The first array to use as the input array
+ @param[in] src2 The second array to use as the input array
+ @param[in] count The number of items in the two input arrays
+ */
+extern ne10_result_t (*add_float)(ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
+/*!
+ Subtracts the elements of src2 from the elements of src1 and stores the results in the dst.
+ @param[out] dst Pointer to the destination array
+ @param[in] src1 The first array to use as the input array
+ @param[in] src2 The second array to use as the input array
+ @param[in] count The number of items in the two input arrays
+ */
+extern ne10_result_t (*sub_float)(ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
+/*!
+ Multiplies the elements of src1 by the elements of src2 and stores the results in the dst.
+ @param[out] dst Pointer to the destination array
+ @param[in] src1 The first array to use as the input array
+ @param[in] src2 The second array to use as the input array
+ @param[in] count The number of items in the two input arrays
+ */
+extern ne10_result_t (*mul_float)(ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
+/*!
+ Divides the elements of src1 by the elements of src2 and stores the results in the dst.
+ @param[out] dst Pointer to the destination array
+ @param[in] src1 The first array to use as the input array
+ @param[in] src2 The second array to use as the input array
+ @param[in] count The number of items in the two input arrays
+ */
+extern ne10_result_t (*div_float)(ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
+/*!
+ Performs a multiply and accumulate operation using the corresponding elements in acc, src1, and src2.
+ @param[out] dst Pointer to the destination array
+ @param[in] acc These elements are added to the result of the multiplication operation
+ @param[in] src1 The first array to use as the input array
+ @param[in] src2 The second array to use as the input array
+ @param[in] count The number of items in the two input arrays
+ */
+extern ne10_result_t (*mla_float)(ne10_float32_t * dst, ne10_float32_t * acc, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
+/*!
+ Calculates the absolute value of each element in the source array and stores the result in the corresponding entry of the destination array.
+ @param[out] dst Pointer to the destination array
+ @param[in] src Pointer to the source array
+ @param[in] count The number of items in the input array
+ */
+extern ne10_result_t (*abs_float)(ne10_float32_t * dst, ne10_float32_t * src, ne10_uint32_t count);
+
+
+
+// ## Operations on Vectors ##
+/*!
+ Returns length of 2D vectors in corresponding elements of the output array.
+ @param[out] dst Pointer to the destination array
+ @param[in] src Pointer to the source array
+ @param[in] count The number of items in the input array
+ */
+extern ne10_result_t (*len_vec2f)(ne10_float32_t * dst, ne10_vec2f_t * src, ne10_uint32_t count);
+/*!
+ Returns length of 3D vectors in corresponding elements of the output array.
+ @param[out] dst Pointer to the destination array
+ @param[in] src Pointer to the source array
+ @param[in] count The number of items in the input array
+ */
+extern ne10_result_t (*len_vec3f)(ne10_float32_t * dst, ne10_vec3f_t * src, ne10_uint32_t count);
+/*!
+ Returns length of 4D vectors in corresponding elements of the output array.
+ @param[out] dst Pointer to the destination array
+ @param[in] src Pointer to the source array
+ @param[in] count The number of items in the input array
+ */
+extern ne10_result_t (*len_vec4f)(ne10_float32_t * dst, ne10_vec4f_t * src, ne10_uint32_t count);
+
+
+
+/*!
+ Normalizes 2D vectors of the input array and stores them in the corresponding elements of the output array.
+ @param[out] dst Pointer to the destination array
+ @param[in] src Pointer to the source array
+ @param[in] count The number of items in the input array
+ */
+extern ne10_result_t (*normalize_vec2f)(ne10_vec2f_t * dst, ne10_vec2f_t * src, ne10_uint32_t count);
+/*!
+ Normalizes 3D vectors of the input array and stores them in the corresponding elements of the output array.
+ @param[out] dst Pointer to the destination array
+ @param[in] src Pointer to the source array
+ @param[in] count The number of items in the input array
+ */
+extern ne10_result_t (*normalize_vec3f)(ne10_vec3f_t * dst, ne10_vec3f_t * src, ne10_uint32_t count);
+/*!
+ Normalizes 4D vectors of the input array and stores them in the corresponding elements of the output array.
+ @param[out] dst Pointer to the destination array
+ @param[in] src Pointer to the source array
+ @param[in] count The number of items in the input array
+ */
+extern ne10_result_t (*normalize_vec4f)(ne10_vec4f_t * dst, ne10_vec4f_t * src, ne10_uint32_t count);
+
+
+
+
+/*!
+ Generates a 2D vector from the absolute values of each of the components of an input vector
+ @param[out] dst Pointer to the destination array
+ @param[in] src Pointer to the source array
+ @param[in] count The number of items in the input array
+ */
+extern ne10_result_t (*abs_vec2f)(ne10_vec2f_t * dst, ne10_vec2f_t * src, ne10_uint32_t count);
+/*!
+ Generates a 3D vector from the absolute values of each of the components of an input vector
+ @param[out] dst Pointer to the destination array
+ @param[in] src Pointer to the source array
+ @param[in] count The number of items in the input array
+ */
+extern ne10_result_t (*abs_vec3f)(ne10_vec3f_t * dst, ne10_vec3f_t * src, ne10_uint32_t count);
+/*!
+ Generates a 4D vector from the absolute values of each of the components of an input vector
+ @param[out] dst Pointer to the destination array
+ @param[in] src Pointer to the source array
+ @param[in] count The number of items in the input array
+ */
+extern ne10_result_t (*abs_vec4f)(ne10_vec4f_t * dst, ne10_vec4f_t * src, ne10_uint32_t count);
+
+
+
+// ## SIMD Component-wise Arithmetic on Two Vectors ##
+
+/*!
+ Multiplies the components of a 2D vector with the corresponding components of another
+ @param[out] dst Pointer to the destination array
+ @param[in] src1 Pointer to the first source array
+ @param[in] src2 Pointer to the second source array
+ @param[in] count The number of items in the input arrays
+ */
+extern ne10_result_t (*vmul_vec2f)(ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+/*!
+ Multiplies the components of a 3D vector with the corresponding components of another
+ @param[out] dst Pointer to the destination array
+ @param[in] src1 Pointer to the first source array
+ @param[in] src2 Pointer to the second source array
+ @param[in] count The number of items in the input arrays
+ */
+extern ne10_result_t (*vmul_vec3f)(ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+/*!
+ Multiplies the components of a 4D vector with the corresponding components of another
+ @param[out] dst Pointer to the destination array
+ @param[in] src1 Pointer to the first source array
+ @param[in] src2 Pointer to the second source array
+ @param[in] count The number of items in the input arrays
+ */
+extern ne10_result_t (*vmul_vec4f)(ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+
+
+
+/*!
+ Divides the components of a 2D vector with the corresponding components of another
+ @param[out] dst Pointer to the destination array
+ @param[in] src1 Pointer to the numerators' source array
+ @param[in] src2 Pointer to the denominators' source array
+ @param[in] count The number of items in the input arrays
+ */
+extern ne10_result_t (*vdiv_vec2f)(ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+/*!
+ Divides the components of a 3D vector with the corresponding components of another
+ @param[out] dst Pointer to the destination array
+ @param[in] src1 Pointer to the numerators' source array
+ @param[in] src2 Pointer to the denominators' source array
+ @param[in] count The number of items in the input arrays
+ */
+extern ne10_result_t (*vdiv_vec3f)(ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+/*!
+ Divides the components of a 4D vector with the corresponding components of another
+ @param[out] dst Pointer to the destination array
+ @param[in] src1 Pointer to the numerators' source array
+ @param[in] src2 Pointer to the denominators' source array
+ @param[in] count The number of items in the input arrays
+ */
+extern ne10_result_t (*vdiv_vec4f)(ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+
+
+
+/*!
+ Performs a multiply and accumulate operation on the components of a 2D vector with the corresponding components of another
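+ @param acc Pointer to the accumulation array. NOTE(review): this prototype has no separate dst parameter (unlike mla_float), so the result presumably goes to acc; confirm against the implementation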
+ @param[in] src1 Pointer to the first source array
+ @param[in] src2 Pointer to the second source array
+ @param[in] count The number of items in the input arrays
+ */
+extern ne10_result_t (*vmla_vec2f)(ne10_vec2f_t * acc, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+/*!
+ Performs a multiply and accumulate operation on the components of a 3D vector with the corresponding components of another
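+ @param acc Pointer to the accumulation array. NOTE(review): this prototype has no separate dst parameter (unlike mla_float), so the result presumably goes to acc; confirm against the implementation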
+ @param[in] src1 Pointer to the first source array
+ @param[in] src2 Pointer to the second source array
+ @param[in] count The number of items in the input arrays
+ */
+extern ne10_result_t (*vmla_vec3f)(ne10_vec3f_t * acc, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+/*!
+ Performs a multiply and accumulate operation on the components of a 4D vector with the corresponding components of another
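+ @param acc Pointer to the accumulation array. NOTE(review): this prototype has no separate dst parameter (unlike mla_float), so the result presumably goes to acc; confirm against the implementation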
+ @param[in] src1 Pointer to the first source array
+ @param[in] src2 Pointer to the second source array
+ @param[in] count The number of items in the input arrays
+ */
+extern ne10_result_t (*vmla_vec4f)(ne10_vec4f_t * acc, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+
+
+
+// ## Vector-Vector Algebra ##
+// The add_vecNf names below are extern function pointers; add_vecNf_c and add_vecNf_neon prototypes with the same parameter lists are declared later in this header.
+/*!
+ Vector addition of two arrays of 2D vectors (function pointer)
+ @param[out] dst Pointer to the destination array
+ @param[in] src1 Pointer to the first source array
+ @param[in] src2 Pointer to the second source array
+ @param[in] count The number of items in the input arrays
+ */
+extern ne10_result_t (*add_vec2f)(ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+/*!
+ Vector addition of two arrays of 3D vectors (function pointer)
+ @param[out] dst Pointer to the destination array
+ @param[in] src1 Pointer to the first source array
+ @param[in] src2 Pointer to the second source array
+ @param[in] count The number of items in the input arrays
+ */
+extern ne10_result_t (*add_vec3f)(ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+/*!
+ Vector addition of two arrays of 4D vectors (function pointer)
+ @param[out] dst Pointer to the destination array
+ @param[in] src1 Pointer to the first source array
+ @param[in] src2 Pointer to the second source array
+ @param[in] count The number of items in the input arrays
+ */
+extern ne10_result_t (*add_vec4f)(ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+
+
+
+/*!
+ Vector subtraction of two arrays of 2D vectors (function pointer)
+ @param[out] dst Pointer to the destination array
+ @param[in] src1 Pointer to the first source array (presumably the minuend -- confirm against the implementation)
+ @param[in] src2 Pointer to the second source array (presumably the subtrahend -- confirm against the implementation)
+ @param[in] count The number of items in the input arrays
+ */
+extern ne10_result_t (*sub_vec2f)(ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+/*!
+ Vector subtraction of two arrays of 3D vectors (function pointer)
+ @param[out] dst Pointer to the destination array
+ @param[in] src1 Pointer to the first source array (presumably the minuend -- confirm against the implementation)
+ @param[in] src2 Pointer to the second source array (presumably the subtrahend -- confirm against the implementation)
+ @param[in] count The number of items in the input arrays
+ */
+extern ne10_result_t (*sub_vec3f)(ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+/*!
+ Vector subtraction of two arrays of 4D vectors (function pointer)
+ @param[out] dst Pointer to the destination array
+ @param[in] src1 Pointer to the first source array (presumably the minuend -- confirm against the implementation)
+ @param[in] src2 Pointer to the second source array (presumably the subtrahend -- confirm against the implementation)
+ @param[in] count The number of items in the input arrays
+ */
+extern ne10_result_t (*sub_vec4f)(ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+
+
+
+/*!
+ Dot product of two arrays of 2D vectors (function pointer)
+ @param[out] dst Pointer to the destination array (ne10_float32_t elements, not vectors)
+ @param[in] src1 Pointer to the first source array
+ @param[in] src2 Pointer to the second source array
+ @param[in] count The number of items in the input arrays
+ */
+extern ne10_result_t (*dot_vec2f)(ne10_float32_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+/*!
+ Dot product of two arrays of 3D vectors (function pointer)
+ @param[out] dst Pointer to the destination array (ne10_float32_t elements, not vectors)
+ @param[in] src1 Pointer to the first source array
+ @param[in] src2 Pointer to the second source array
+ @param[in] count The number of items in the input arrays
+ */
+extern ne10_result_t (*dot_vec3f)(ne10_float32_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+/*!
+ Dot product of two arrays of 4D vectors (function pointer)
+ @param[out] dst Pointer to the destination array (ne10_float32_t elements, not vectors)
+ @param[in] src1 Pointer to the first source array
+ @param[in] src2 Pointer to the second source array
+ @param[in] count The number of items in the input arrays
+ */
+extern ne10_result_t (*dot_vec4f)(ne10_float32_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+
+
+
+/*!
+ Performs a cross product operation on two input arrays of 3D vectors (function pointer; declared for ne10_vec3f_t only)
+ @param[out] dst Pointer to the destination array
+ @param[in] src1 Pointer to the first source array
+ @param[in] src2 Pointer to the second source array
+ @param[in] count The number of items in the input arrays
+ */
+extern ne10_result_t (*cross_vec3f)(ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+
+
+
+
+// ## Matrix-Constant Arithmetic ##
+// Function pointers. addmat/submat/mulmat/divmat take two source matrix pointers (src1, src2); only setmat takes a scalar constant (cst) and it has no source parameter.
+// ne10_mat4x4f_t
+extern ne10_result_t (*addmat_4x4f)(ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
+extern ne10_result_t (*submat_4x4f)(ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
+extern ne10_result_t (*mulmat_4x4f)(ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
+extern ne10_result_t (*divmat_4x4f)(ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
+extern ne10_result_t (*setmat_4x4f)(ne10_mat4x4f_t * dst, const ne10_float32_t cst, ne10_uint32_t count);
+// ne10_mat3x3f_t
+extern ne10_result_t (*addmat_3x3f)(ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
+extern ne10_result_t (*submat_3x3f)(ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
+extern ne10_result_t (*mulmat_3x3f)(ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
+extern ne10_result_t (*divmat_3x3f)(ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
+extern ne10_result_t (*setmat_3x3f)(ne10_mat3x3f_t * dst, const ne10_float32_t cst, ne10_uint32_t count);
+// ne10_mat2x2f_t
+extern ne10_result_t (*addmat_2x2f)(ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
+extern ne10_result_t (*submat_2x2f)(ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
+extern ne10_result_t (*mulmat_2x2f)(ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
+extern ne10_result_t (*divmat_2x2f)(ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
+extern ne10_result_t (*setmat_2x2f)(ne10_mat2x2f_t * dst, const ne10_float32_t cst, ne10_uint32_t count);
+
+
+
+// ## Operations on Matrices ##
+// Function pointers. detmat_*: dst is ne10_float32_t *; invmat_* and transmat_*: dst has the same matrix type as src; identitymat_*: only dst and count, no source.
+extern ne10_result_t (*detmat_4x4f)(ne10_float32_t * dst, ne10_mat4x4f_t * src, ne10_uint32_t count);
+extern ne10_result_t (*detmat_3x3f)(ne10_float32_t * dst, ne10_mat3x3f_t * src, ne10_uint32_t count);
+extern ne10_result_t (*detmat_2x2f)(ne10_float32_t * dst, ne10_mat2x2f_t * src, ne10_uint32_t count);
+
+extern ne10_result_t (*invmat_4x4f)(ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src, ne10_uint32_t count);
+extern ne10_result_t (*invmat_3x3f)(ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src, ne10_uint32_t count);
+extern ne10_result_t (*invmat_2x2f)(ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src, ne10_uint32_t count);
+
+extern ne10_result_t (*transmat_4x4f)(ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src, ne10_uint32_t count);
+extern ne10_result_t (*identitymat_4x4f)(ne10_mat4x4f_t * dst, ne10_uint32_t count);
+
+extern ne10_result_t (*transmat_3x3f)(ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src, ne10_uint32_t count);
+extern ne10_result_t (*identitymat_3x3f)(ne10_mat3x3f_t * dst, ne10_uint32_t count);
+
+extern ne10_result_t (*transmat_2x2f)(ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src, ne10_uint32_t count);
+extern ne10_result_t (*identitymat_2x2f)(ne10_mat2x2f_t * dst, ne10_uint32_t count);
+
+
+// ## Matrix-Vector Algebra ##
+// Function pointers. cst is a const pointer to a matrix; src and dst point to vectors whose dimension matches it (4x4 with vec4f, 3x3 with vec3f, 2x2 with vec2f).
+extern ne10_result_t (*mulcmatvec_cm4x4f_v4f)(ne10_vec4f_t * dst, const ne10_mat4x4f_t * cst, ne10_vec4f_t * src, ne10_uint32_t count);
+extern ne10_result_t (*mulcmatvec_cm3x3f_v3f)(ne10_vec3f_t * dst, const ne10_mat3x3f_t * cst, ne10_vec3f_t * src, ne10_uint32_t count);
+extern ne10_result_t (*mulcmatvec_cm2x2f_v2f)(ne10_vec2f_t * dst, const ne10_mat2x2f_t * cst, ne10_vec2f_t * src, ne10_uint32_t count);
+
+// ## Matrix-Matrix Algebra ##
+// Function pointers taking two source matrix pointers and a dst of the same matrix type. NOTE(review): the name suggests a multiply combined with a transpose -- confirm against the implementation.
+extern ne10_result_t (*multrans_mat4x4f)(ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
+extern ne10_result_t (*multrans_mat3x3f)(ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
+extern ne10_result_t (*multrans_mat2x2f)(ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
+
+
+///////////////////////////
+// C function prototypes:
+///////////////////////////
+
+
+// ## Vector-Constant Arithmetic ##
+// In every group of this section the float variant receives cst by value, whereas the vec2f/vec3f/vec4f variants receive cst through a const pointer.
+extern ne10_result_t addc_float_c(ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
+extern ne10_result_t addc_vec2f_c(ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
+extern ne10_result_t addc_vec3f_c(ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
+extern ne10_result_t addc_vec4f_c(ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
+
+
+
+extern ne10_result_t subc_float_c(ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count); // subtract cst from the element(s)
+extern ne10_result_t subc_vec2f_c(ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count); // subtract cst from the element(s)
+extern ne10_result_t subc_vec3f_c(ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count); // subtract cst from the element(s)
+extern ne10_result_t subc_vec4f_c(ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count); // subtract cst from the element(s)
+
+
+
+extern ne10_result_t rsbc_float_c(ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count); // subtract element(s) from a cst
+extern ne10_result_t rsbc_vec2f_c(ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count); // subtract element(s) from a cst
+extern ne10_result_t rsbc_vec3f_c(ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count); // subtract element(s) from a cst
+extern ne10_result_t rsbc_vec4f_c(ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count); // subtract element(s) from a cst
+
+
+
+extern ne10_result_t mulc_float_c(ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
+extern ne10_result_t mulc_vec2f_c(ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
+extern ne10_result_t mulc_vec3f_c(ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
+extern ne10_result_t mulc_vec4f_c(ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
+
+
+
+extern ne10_result_t divc_float_c(ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
+extern ne10_result_t divc_vec2f_c(ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
+extern ne10_result_t divc_vec3f_c(ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
+extern ne10_result_t divc_vec4f_c(ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
+
+
+// setc_*: no src parameter -- only dst, cst and count.
+extern ne10_result_t setc_float_c(ne10_float32_t * dst, const ne10_float32_t cst, ne10_uint32_t count);
+extern ne10_result_t setc_vec2f_c(ne10_vec2f_t * dst, const ne10_vec2f_t * cst, ne10_uint32_t count);
+extern ne10_result_t setc_vec3f_c(ne10_vec3f_t * dst, const ne10_vec3f_t * cst, ne10_uint32_t count);
+extern ne10_result_t setc_vec4f_c(ne10_vec4f_t * dst, const ne10_vec4f_t * cst, ne10_uint32_t count);
+
+
+// mlac_*: takes an acc pointer between dst and src, in addition to cst.
+extern ne10_result_t mlac_float_c(ne10_float32_t * dst, ne10_float32_t * acc, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
+extern ne10_result_t mlac_vec2f_c(ne10_vec2f_t * dst, ne10_vec2f_t * acc, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
+extern ne10_result_t mlac_vec3f_c(ne10_vec3f_t * dst, ne10_vec3f_t * acc, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
+extern ne10_result_t mlac_vec4f_c(ne10_vec4f_t * dst, ne10_vec4f_t * acc, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
+
+// ## Arithmetic functions over arrays of cst values ## (every pointer parameter in this group is ne10_float32_t *)
+extern ne10_result_t add_float_c(ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
+extern ne10_result_t sub_float_c(ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
+extern ne10_result_t mul_float_c(ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
+extern ne10_result_t div_float_c(ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
+extern ne10_result_t mla_float_c(ne10_float32_t * dst, ne10_float32_t * acc, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count); // takes acc in addition to dst, src1 and src2
+extern ne10_result_t abs_float_c(ne10_float32_t * dst, ne10_float32_t * src, ne10_uint32_t count); // single source array
+
+// ## Operations on Vectors ##
+extern ne10_result_t len_vec2f_c(ne10_float32_t * dst, ne10_vec2f_t * src, ne10_uint32_t count); // len_*: dst is ne10_float32_t *, not a vector type
+extern ne10_result_t len_vec3f_c(ne10_float32_t * dst, ne10_vec3f_t * src, ne10_uint32_t count);
+extern ne10_result_t len_vec4f_c(ne10_float32_t * dst, ne10_vec4f_t * src, ne10_uint32_t count);
+
+
+// normalize_*: dst has the same vector type as src.
+extern ne10_result_t normalize_vec2f_c(ne10_vec2f_t * dst, ne10_vec2f_t * src, ne10_uint32_t count);
+extern ne10_result_t normalize_vec3f_c(ne10_vec3f_t * dst, ne10_vec3f_t * src, ne10_uint32_t count);
+extern ne10_result_t normalize_vec4f_c(ne10_vec4f_t * dst, ne10_vec4f_t * src, ne10_uint32_t count);
+
+
+// abs_*: dst has the same vector type as src.
+extern ne10_result_t abs_vec2f_c(ne10_vec2f_t * dst, ne10_vec2f_t * src, ne10_uint32_t count);
+extern ne10_result_t abs_vec3f_c(ne10_vec3f_t * dst, ne10_vec3f_t * src, ne10_uint32_t count);
+extern ne10_result_t abs_vec4f_c(ne10_vec4f_t * dst, ne10_vec4f_t * src, ne10_uint32_t count);
+
+
+
+// ## SIMD Component-wise Arithmetic on Two Vectors ##
+extern ne10_result_t vmul_vec2f_c(ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+extern ne10_result_t vmul_vec3f_c(ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+extern ne10_result_t vmul_vec4f_c(ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+
+
+
+extern ne10_result_t vdiv_vec2f_c(ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+extern ne10_result_t vdiv_vec3f_c(ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+extern ne10_result_t vdiv_vec4f_c(ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+
+
+// vmla_*_c: five parameters, with acc as a separate pointer after dst. NOTE(review): the vmla_vec2f/3f/4f function pointers declared earlier in this header take only (acc, src1, src2, count) -- confirm which is intended.
+extern ne10_result_t vmla_vec2f_c(ne10_vec2f_t * dst, ne10_vec2f_t * acc, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+extern ne10_result_t vmla_vec3f_c(ne10_vec3f_t * dst, ne10_vec3f_t * acc, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+extern ne10_result_t vmla_vec4f_c(ne10_vec4f_t * dst, ne10_vec4f_t * acc, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+
+
+
+// ## Vector-Vector Algebra ##
+extern ne10_result_t add_vec2f_c(ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+extern ne10_result_t add_vec3f_c(ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+extern ne10_result_t add_vec4f_c(ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+
+
+
+extern ne10_result_t sub_vec2f_c(ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+extern ne10_result_t sub_vec3f_c(ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+extern ne10_result_t sub_vec4f_c(ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+
+
+// dot_*: dst is ne10_float32_t * rather than a vector type.
+extern ne10_result_t dot_vec2f_c(ne10_float32_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+extern ne10_result_t dot_vec3f_c(ne10_float32_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+extern ne10_result_t dot_vec4f_c(ne10_float32_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+
+
+// cross_*: declared for ne10_vec3f_t only.
+extern ne10_result_t cross_vec3f_c(ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+
+
+
+// ## Matrix-Constant Arithmetic ##
+// addmat/submat/mulmat/divmat take two source matrix pointers (src1, src2); only setmat takes a scalar constant (cst) and it has no source parameter.
+// ne10_mat4x4f_t
+extern ne10_result_t addmat_4x4f_c(ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
+extern ne10_result_t submat_4x4f_c(ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
+extern ne10_result_t mulmat_4x4f_c(ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
+extern ne10_result_t divmat_4x4f_c(ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
+extern ne10_result_t setmat_4x4f_c(ne10_mat4x4f_t * dst, const ne10_float32_t cst, ne10_uint32_t count);
+// ne10_mat3x3f_t
+extern ne10_result_t addmat_3x3f_c(ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
+extern ne10_result_t submat_3x3f_c(ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
+extern ne10_result_t mulmat_3x3f_c(ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
+extern ne10_result_t divmat_3x3f_c(ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
+extern ne10_result_t setmat_3x3f_c(ne10_mat3x3f_t * dst, const ne10_float32_t cst, ne10_uint32_t count);
+// ne10_mat2x2f_t
+extern ne10_result_t addmat_2x2f_c(ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
+extern ne10_result_t submat_2x2f_c(ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
+extern ne10_result_t mulmat_2x2f_c(ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
+extern ne10_result_t divmat_2x2f_c(ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
+extern ne10_result_t setmat_2x2f_c(ne10_mat2x2f_t * dst, const ne10_float32_t cst, ne10_uint32_t count);
+
+
+
+// ## Operations on Matrices ##
+// detmat_*: dst is ne10_float32_t *; invmat_* and transmat_*: dst has the same matrix type as src; identitymat_*: only dst and count, no source.
+extern ne10_result_t detmat_4x4f_c(ne10_float32_t * dst, ne10_mat4x4f_t * src, ne10_uint32_t count);
+extern ne10_result_t detmat_3x3f_c(ne10_float32_t * dst, ne10_mat3x3f_t * src, ne10_uint32_t count);
+extern ne10_result_t detmat_2x2f_c(ne10_float32_t * dst, ne10_mat2x2f_t * src, ne10_uint32_t count);
+
+extern ne10_result_t invmat_4x4f_c(ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src, ne10_uint32_t count);
+extern ne10_result_t invmat_3x3f_c(ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src, ne10_uint32_t count);
+extern ne10_result_t invmat_2x2f_c(ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src, ne10_uint32_t count);
+
+extern ne10_result_t transmat_4x4f_c(ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src, ne10_uint32_t count);
+extern ne10_result_t identitymat_4x4f_c(ne10_mat4x4f_t * dst, ne10_uint32_t count);
+
+extern ne10_result_t transmat_3x3f_c(ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src, ne10_uint32_t count);
+extern ne10_result_t identitymat_3x3f_c(ne10_mat3x3f_t * dst, ne10_uint32_t count);
+
+extern ne10_result_t transmat_2x2f_c(ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src, ne10_uint32_t count);
+extern ne10_result_t identitymat_2x2f_c(ne10_mat2x2f_t * dst, ne10_uint32_t count);
+
+
+// ## Matrix-Vector Algebra ##
+// cst is a const pointer to a matrix; src and dst point to vectors whose dimension matches it (4x4 with vec4f, 3x3 with vec3f, 2x2 with vec2f).
+extern ne10_result_t mulcmatvec_cm4x4f_v4f_c(ne10_vec4f_t * dst, const ne10_mat4x4f_t * cst, ne10_vec4f_t * src, ne10_uint32_t count);
+extern ne10_result_t mulcmatvec_cm3x3f_v3f_c(ne10_vec3f_t * dst, const ne10_mat3x3f_t * cst, ne10_vec3f_t * src, ne10_uint32_t count);
+extern ne10_result_t mulcmatvec_cm2x2f_v2f_c(ne10_vec2f_t * dst, const ne10_mat2x2f_t * cst, ne10_vec2f_t * src, ne10_uint32_t count);
+
+// ## Matrix-Matrix Algebra ##
+// Two source matrix pointers and a dst of the same matrix type. NOTE(review): the name suggests a multiply combined with a transpose -- confirm against the implementation.
+extern ne10_result_t multrans_mat4x4f_c(ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
+extern ne10_result_t multrans_mat3x3f_c(ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
+extern ne10_result_t multrans_mat2x2f_c(ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
+
+
+/////////////////////////////
+// NEON function prototypes:
+/////////////////////////////
+
+
+// ## Vector-Constant Arithmetic ##
+// In every group of this section the float variant receives cst by value, whereas the vec2f/vec3f/vec4f variants receive cst through a const pointer.
+extern ne10_result_t addc_float_neon(ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
+extern ne10_result_t addc_vec2f_neon(ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
+extern ne10_result_t addc_vec3f_neon(ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
+extern ne10_result_t addc_vec4f_neon(ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
+
+
+
+extern ne10_result_t subc_float_neon(ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count); // subtract cst from the element(s)
+extern ne10_result_t subc_vec2f_neon(ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count); // subtract cst from the element(s)
+extern ne10_result_t subc_vec3f_neon(ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count); // subtract cst from the element(s)
+extern ne10_result_t subc_vec4f_neon(ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count); // subtract cst from the element(s)
+
+
+
+extern ne10_result_t rsbc_float_neon(ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count); // subtract element(s) from a cst
+extern ne10_result_t rsbc_vec2f_neon(ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count); // subtract element(s) from a cst
+extern ne10_result_t rsbc_vec3f_neon(ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count); // subtract element(s) from a cst
+extern ne10_result_t rsbc_vec4f_neon(ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count); // subtract element(s) from a cst
+
+
+
+extern ne10_result_t mulc_float_neon(ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
+extern ne10_result_t mulc_vec2f_neon(ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
+extern ne10_result_t mulc_vec3f_neon(ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
+extern ne10_result_t mulc_vec4f_neon(ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
+
+
+
+extern ne10_result_t divc_float_neon(ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
+extern ne10_result_t divc_vec2f_neon(ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
+extern ne10_result_t divc_vec3f_neon(ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
+extern ne10_result_t divc_vec4f_neon(ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
+
+
+// setc_*: no src parameter -- only dst, cst and count.
+extern ne10_result_t setc_float_neon(ne10_float32_t * dst, const ne10_float32_t cst, ne10_uint32_t count);
+extern ne10_result_t setc_vec2f_neon(ne10_vec2f_t * dst, const ne10_vec2f_t * cst, ne10_uint32_t count);
+extern ne10_result_t setc_vec3f_neon(ne10_vec3f_t * dst, const ne10_vec3f_t * cst, ne10_uint32_t count);
+extern ne10_result_t setc_vec4f_neon(ne10_vec4f_t * dst, const ne10_vec4f_t * cst, ne10_uint32_t count);
+
+
+// mlac_*: takes an acc pointer between dst and src, in addition to cst.
+extern ne10_result_t mlac_float_neon(ne10_float32_t * dst, ne10_float32_t * acc, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
+extern ne10_result_t mlac_vec2f_neon(ne10_vec2f_t * dst, ne10_vec2f_t * acc, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
+extern ne10_result_t mlac_vec3f_neon(ne10_vec3f_t * dst, ne10_vec3f_t * acc, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
+extern ne10_result_t mlac_vec4f_neon(ne10_vec4f_t * dst, ne10_vec4f_t * acc, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
+
+
+
+// ## Arithmetic functions over arrays of cst values ## (every pointer parameter in this group is ne10_float32_t *)
+extern ne10_result_t add_float_neon(ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
+extern ne10_result_t sub_float_neon(ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
+extern ne10_result_t mul_float_neon(ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
+extern ne10_result_t div_float_neon(ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
+extern ne10_result_t mla_float_neon(ne10_float32_t * dst, ne10_float32_t * acc, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count); // takes acc in addition to dst, src1 and src2
+extern ne10_result_t abs_float_neon(ne10_float32_t * dst, ne10_float32_t * src, ne10_uint32_t count); // single source array
+
+// ## Operations on Vectors ##
+extern ne10_result_t len_vec2f_neon(ne10_float32_t * dst, ne10_vec2f_t * src, ne10_uint32_t count); // len_*: dst is ne10_float32_t *, not a vector type
+extern ne10_result_t len_vec3f_neon(ne10_float32_t * dst, ne10_vec3f_t * src, ne10_uint32_t count);
+extern ne10_result_t len_vec4f_neon(ne10_float32_t * dst, ne10_vec4f_t * src, ne10_uint32_t count);
+
+
+// normalize_*: dst has the same vector type as src.
+extern ne10_result_t normalize_vec2f_neon(ne10_vec2f_t * dst, ne10_vec2f_t * src, ne10_uint32_t count);
+extern ne10_result_t normalize_vec3f_neon(ne10_vec3f_t * dst, ne10_vec3f_t * src, ne10_uint32_t count);
+extern ne10_result_t normalize_vec4f_neon(ne10_vec4f_t * dst, ne10_vec4f_t * src, ne10_uint32_t count);
+
+
+// abs_*: dst has the same vector type as src.
+extern ne10_result_t abs_vec2f_neon(ne10_vec2f_t * dst, ne10_vec2f_t * src, ne10_uint32_t count);
+extern ne10_result_t abs_vec3f_neon(ne10_vec3f_t * dst, ne10_vec3f_t * src, ne10_uint32_t count);
+extern ne10_result_t abs_vec4f_neon(ne10_vec4f_t * dst, ne10_vec4f_t * src, ne10_uint32_t count);
+
+
+
+// ## SIMD Component-wise Arithmetic on Two Vectors ##
+extern ne10_result_t vmul_vec2f_neon(ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+extern ne10_result_t vmul_vec3f_neon(ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+extern ne10_result_t vmul_vec4f_neon(ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+
+
+
+extern ne10_result_t vdiv_vec2f_neon(ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+extern ne10_result_t vdiv_vec3f_neon(ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+extern ne10_result_t vdiv_vec4f_neon(ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+
+
+// vmla_*_neon: five parameters, with acc as a separate pointer after dst. NOTE(review): the vmla_vec2f/3f/4f function pointers declared earlier in this header take only (acc, src1, src2, count) -- confirm which is intended.
+extern ne10_result_t vmla_vec2f_neon(ne10_vec2f_t * dst, ne10_vec2f_t * acc, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+extern ne10_result_t vmla_vec3f_neon(ne10_vec3f_t * dst, ne10_vec3f_t * acc, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+extern ne10_result_t vmla_vec4f_neon(ne10_vec4f_t * dst, ne10_vec4f_t * acc, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+
+
+
+// ## Vector-Vector Algebra ##
+extern ne10_result_t add_vec2f_neon(ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+extern ne10_result_t add_vec3f_neon(ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+extern ne10_result_t add_vec4f_neon(ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+
+
+
+extern ne10_result_t sub_vec2f_neon(ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+extern ne10_result_t sub_vec3f_neon(ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+extern ne10_result_t sub_vec4f_neon(ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+
+
+// dot_*: dst is ne10_float32_t * rather than a vector type.
+extern ne10_result_t dot_vec2f_neon(ne10_float32_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+extern ne10_result_t dot_vec3f_neon(ne10_float32_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+extern ne10_result_t dot_vec4f_neon(ne10_float32_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+
+
+// cross_*: declared for ne10_vec3f_t only.
+extern ne10_result_t cross_vec3f_neon(ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+
+
+
+// ## Matrix-Constant Arithmetic ##
+// addmat/submat/mulmat/divmat take two source matrix pointers (src1, src2); only setmat takes a scalar constant (cst) and it has no source parameter.
+// ne10_mat4x4f_t
+extern ne10_result_t addmat_4x4f_neon(ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
+extern ne10_result_t submat_4x4f_neon(ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
+extern ne10_result_t mulmat_4x4f_neon(ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
+extern ne10_result_t divmat_4x4f_neon(ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
+extern ne10_result_t setmat_4x4f_neon(ne10_mat4x4f_t * dst, const ne10_float32_t cst, ne10_uint32_t count);
+// ne10_mat3x3f_t
+extern ne10_result_t addmat_3x3f_neon(ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
+extern ne10_result_t submat_3x3f_neon(ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
+extern ne10_result_t mulmat_3x3f_neon(ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
+extern ne10_result_t divmat_3x3f_neon(ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
+extern ne10_result_t setmat_3x3f_neon(ne10_mat3x3f_t * dst, const ne10_float32_t cst, ne10_uint32_t count);
+// ne10_mat2x2f_t
+extern ne10_result_t addmat_2x2f_neon(ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
+extern ne10_result_t submat_2x2f_neon(ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
+extern ne10_result_t mulmat_2x2f_neon(ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
+extern ne10_result_t divmat_2x2f_neon(ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
+extern ne10_result_t setmat_2x2f_neon(ne10_mat2x2f_t * dst, const ne10_float32_t cst, ne10_uint32_t count);
+
+
+
+// ## Operations on Matrices ##
+// detmat_*: dst is ne10_float32_t *; invmat_* and transmat_*: dst has the same matrix type as src; identitymat_*: only dst and count, no source.
+
+extern ne10_result_t detmat_4x4f_neon(ne10_float32_t * dst, ne10_mat4x4f_t * src, ne10_uint32_t count);
+extern ne10_result_t detmat_3x3f_neon(ne10_float32_t * dst, ne10_mat3x3f_t * src, ne10_uint32_t count);
+extern ne10_result_t detmat_2x2f_neon(ne10_float32_t * dst, ne10_mat2x2f_t * src, ne10_uint32_t count);
+
+extern ne10_result_t invmat_4x4f_neon(ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src, ne10_uint32_t count);
+extern ne10_result_t invmat_3x3f_neon(ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src, ne10_uint32_t count);
+extern ne10_result_t invmat_2x2f_neon(ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src, ne10_uint32_t count);
+
+extern ne10_result_t transmat_4x4f_neon(ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src, ne10_uint32_t count);
+extern ne10_result_t identitymat_4x4f_neon(ne10_mat4x4f_t * dst, ne10_uint32_t count);
+
+extern ne10_result_t transmat_3x3f_neon(ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src, ne10_uint32_t count);
+extern ne10_result_t identitymat_3x3f_neon(ne10_mat3x3f_t * dst, ne10_uint32_t count);
+
+extern ne10_result_t transmat_2x2f_neon(ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src, ne10_uint32_t count);
+extern ne10_result_t identitymat_2x2f_neon(ne10_mat2x2f_t * dst, ne10_uint32_t count);
+
+
+// ## Matrix-Vector Algebra ##
+// cst is a const pointer to a matrix; src and dst point to vectors whose dimension matches it (4x4 with vec4f, 3x3 with vec3f, 2x2 with vec2f).
+extern ne10_result_t mulcmatvec_cm4x4f_v4f_neon(ne10_vec4f_t * dst, const ne10_mat4x4f_t * cst, ne10_vec4f_t * src, ne10_uint32_t count);
+extern ne10_result_t mulcmatvec_cm3x3f_v3f_neon(ne10_vec3f_t * dst, const ne10_mat3x3f_t * cst, ne10_vec3f_t * src, ne10_uint32_t count);
+extern ne10_result_t mulcmatvec_cm2x2f_v2f_neon(ne10_vec2f_t * dst, const ne10_mat2x2f_t * cst, ne10_vec2f_t * src, ne10_uint32_t count);
+
+
+
+// ## Matrix-Matrix Algebra ##
+// Two source matrix pointers and a dst of the same matrix type. NOTE(review): the name suggests a multiply combined with a transpose -- confirm against the implementation.
+extern ne10_result_t multrans_mat4x4f_neon(ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
+extern ne10_result_t multrans_mat3x3f_neon(ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
+extern ne10_result_t multrans_mat2x2f_neon(ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
+
+
+////////////////////////////
+// VFP function prototypes:
+////////////////////////////
+
+// ## Vector-Constant Arithmetic ##
+
+extern ne10_result_t addc_float_asm(ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
+extern ne10_result_t addc_vec2f_asm(ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
+extern ne10_result_t addc_vec3f_asm(ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
+extern ne10_result_t addc_vec4f_asm(ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
+
+
+
+extern ne10_result_t subc_float_asm(ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count); // subtract cst from the element(s)
+extern ne10_result_t subc_vec2f_asm(ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count); // subtract cst from the element(s)
+extern ne10_result_t subc_vec3f_asm(ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count); // subtract cst from the element(s)
+extern ne10_result_t subc_vec4f_asm(ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count); // subtract cst from the element(s)
+
+
+
+extern ne10_result_t rsbc_float_asm(ne10_float32_t * dst, ne10_float32_t *src, const ne10_float32_t cst, ne10_uint32_t count); // subtract element(s) from a cst
+extern ne10_result_t rsbc_vec2f_asm(ne10_vec2f_t * dst, ne10_vec2f_t *src, const ne10_vec2f_t * cst, ne10_uint32_t count); // subtract element(s) from a cst
+extern ne10_result_t rsbc_vec3f_asm(ne10_vec3f_t * dst, ne10_vec3f_t *src, const ne10_vec3f_t * cst, ne10_uint32_t count); // subtract element(s) from a cst
+extern ne10_result_t rsbc_vec4f_asm(ne10_vec4f_t * dst, ne10_vec4f_t *src, const ne10_vec4f_t * cst, ne10_uint32_t count); // subtract element(s) from a cst
+
+
+
+extern ne10_result_t mulc_float_asm(ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
+extern ne10_result_t mulc_vec2f_asm(ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
+extern ne10_result_t mulc_vec3f_asm(ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
+extern ne10_result_t mulc_vec4f_asm(ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
+
+
+
+extern ne10_result_t divc_float_asm(ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
+extern ne10_result_t divc_vec2f_asm(ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
+extern ne10_result_t divc_vec3f_asm(ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
+extern ne10_result_t divc_vec4f_asm(ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
+
+
+
+extern ne10_result_t setc_float_asm(ne10_float32_t * dst, const ne10_float32_t cst, ne10_uint32_t count);
+extern ne10_result_t setc_vec2f_asm(ne10_vec2f_t * dst, const ne10_vec2f_t * cst, ne10_uint32_t count);
+extern ne10_result_t setc_vec3f_asm(ne10_vec3f_t * dst, const ne10_vec3f_t * cst, ne10_uint32_t count);
+extern ne10_result_t setc_vec4f_asm(ne10_vec4f_t * dst, const ne10_vec4f_t * cst, ne10_uint32_t count);
+
+
+
+extern ne10_result_t mlac_float_asm(ne10_float32_t * dst, ne10_float32_t * acc, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
+extern ne10_result_t mlac_vec2f_asm(ne10_vec2f_t * dst, ne10_vec2f_t * acc, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
+extern ne10_result_t mlac_vec3f_asm(ne10_vec3f_t * dst, ne10_vec3f_t * acc, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
+extern ne10_result_t mlac_vec4f_asm(ne10_vec4f_t * dst, ne10_vec4f_t * acc, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
+
+
+
+// ## Arithmetic functions over arrays of float values ##
+extern ne10_result_t add_float_asm(ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
+extern ne10_result_t sub_float_asm(ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
+extern ne10_result_t mul_float_asm(ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
+extern ne10_result_t div_float_asm(ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
+extern ne10_result_t mla_float_asm(ne10_float32_t * dst, ne10_float32_t * acc, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
+extern ne10_result_t abs_float_asm(ne10_float32_t * dst, ne10_float32_t * src, ne10_uint32_t count);
+
+// ## Operations on Vectors ##
+extern ne10_result_t len_vec2f_asm(ne10_float32_t * dst, ne10_vec2f_t * src, ne10_uint32_t count);
+extern ne10_result_t len_vec3f_asm(ne10_float32_t * dst, ne10_vec3f_t * src, ne10_uint32_t count);
+extern ne10_result_t len_vec4f_asm(ne10_float32_t * dst, ne10_vec4f_t * src, ne10_uint32_t count);
+
+
+
+extern ne10_result_t normalize_vec2f_asm(ne10_vec2f_t * dst, ne10_vec2f_t * src, ne10_uint32_t count);
+extern ne10_result_t normalize_vec3f_asm(ne10_vec3f_t * dst, ne10_vec3f_t * src, ne10_uint32_t count);
+extern ne10_result_t normalize_vec4f_asm(ne10_vec4f_t * dst, ne10_vec4f_t * src, ne10_uint32_t count);
+
+
+
+extern ne10_result_t abs_vec2f_asm(ne10_vec2f_t * dst, ne10_vec2f_t * src, ne10_uint32_t count);
+extern ne10_result_t abs_vec3f_asm(ne10_vec3f_t * dst, ne10_vec3f_t * src, ne10_uint32_t count);
+extern ne10_result_t abs_vec4f_asm(ne10_vec4f_t * dst, ne10_vec4f_t * src, ne10_uint32_t count);
+
+
+
+// ## SIMD Component-wise Arithmetic on Two Vectors ##
+extern ne10_result_t vmul_vec2f_asm(ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+extern ne10_result_t vmul_vec3f_asm(ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+extern ne10_result_t vmul_vec4f_asm(ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+
+
+
+extern ne10_result_t vdiv_vec2f_asm(ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+extern ne10_result_t vdiv_vec3f_asm(ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+extern ne10_result_t vdiv_vec4f_asm(ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+
+
+
+extern ne10_result_t vmla_vec2f_asm(ne10_vec2f_t * dst, ne10_vec2f_t * acc, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+extern ne10_result_t vmla_vec3f_asm(ne10_vec3f_t * dst, ne10_vec3f_t * acc, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+extern ne10_result_t vmla_vec4f_asm(ne10_vec4f_t * dst, ne10_vec4f_t * acc, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+
+
+
+// ## Vector-Vector Algebra ##
+extern ne10_result_t add_vec2f_asm(ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+extern ne10_result_t add_vec3f_asm(ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+extern ne10_result_t add_vec4f_asm(ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+
+
+
+extern ne10_result_t sub_vec2f_asm(ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+extern ne10_result_t sub_vec3f_asm(ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+extern ne10_result_t sub_vec4f_asm(ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+
+
+
+extern ne10_result_t dot_vec2f_asm(ne10_float32_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+extern ne10_result_t dot_vec3f_asm(ne10_float32_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+extern ne10_result_t dot_vec4f_asm(ne10_float32_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+
+
+
+extern ne10_result_t cross_vec3f_asm(ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+
+
+// ## Matrix-Constant Arithmetic ##
+
+// ne10_mat4x4f_t
+extern ne10_result_t addmat_4x4f_asm(ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
+extern ne10_result_t submat_4x4f_asm(ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
+extern ne10_result_t mulmat_4x4f_asm(ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
+extern ne10_result_t divmat_4x4f_asm(ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
+extern ne10_result_t setmat_4x4f_asm(ne10_mat4x4f_t * dst, const ne10_float32_t cst, ne10_uint32_t count);
+
+extern ne10_result_t addmat_3x3f_asm(ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
+extern ne10_result_t submat_3x3f_asm(ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
+extern ne10_result_t mulmat_3x3f_asm(ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
+extern ne10_result_t divmat_3x3f_asm(ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
+extern ne10_result_t setmat_3x3f_asm(ne10_mat3x3f_t * dst, const ne10_float32_t cst, ne10_uint32_t count);
+
+extern ne10_result_t addmat_2x2f_asm(ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
+extern ne10_result_t submat_2x2f_asm(ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
+extern ne10_result_t mulmat_2x2f_asm(ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
+extern ne10_result_t divmat_2x2f_asm(ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
+extern ne10_result_t setmat_2x2f_asm(ne10_mat2x2f_t * dst, const ne10_float32_t cst, ne10_uint32_t count);
+
+
+
+// ## Operations on Matrices ##
+
+extern ne10_result_t detmat_4x4f_asm(ne10_float32_t * dst, ne10_mat4x4f_t * src, ne10_uint32_t count);
+extern ne10_result_t detmat_3x3f_asm(ne10_float32_t * dst, ne10_mat3x3f_t * src, ne10_uint32_t count);
+extern ne10_result_t detmat_2x2f_asm(ne10_float32_t * dst, ne10_mat2x2f_t * src, ne10_uint32_t count);
+
+extern ne10_result_t invmat_4x4f_asm(ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src, ne10_uint32_t count);
+extern ne10_result_t invmat_3x3f_asm(ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src, ne10_uint32_t count);
+extern ne10_result_t invmat_2x2f_asm(ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src, ne10_uint32_t count);
+
+extern ne10_result_t transmat_4x4f_asm(ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src, ne10_uint32_t count);
+extern ne10_result_t identitymat_4x4f_asm(ne10_mat4x4f_t * dst, ne10_uint32_t count);
+
+extern ne10_result_t transmat_3x3f_asm(ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src, ne10_uint32_t count);
+extern ne10_result_t identitymat_3x3f_asm(ne10_mat3x3f_t * dst, ne10_uint32_t count);
+
+extern ne10_result_t trans_mat2x2f_asm(ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src, ne10_uint32_t count); // NOTE(review): spelled trans_mat2x2f, not transmat_2x2f like the 3x3/4x4 declarations above - confirm against the assembly symbol before renaming
+extern ne10_result_t identity_mat2x2f_asm(ne10_mat2x2f_t * dst, ne10_uint32_t count); // NOTE(review): spelled identity_mat2x2f, not identitymat_2x2f like the 3x3/4x4 declarations above - confirm against the assembly symbol before renaming
+
+
+
+// ## Matrix-Vector Algebra ##
+extern ne10_result_t mulcmatvec_cm4x4f_v4f_asm(ne10_vec4f_t * dst, const ne10_mat4x4f_t * cst, ne10_vec4f_t * src, ne10_uint32_t count);
+extern ne10_result_t mulcmatvec_cm3x3f_v3f_asm(ne10_vec3f_t * dst, const ne10_mat3x3f_t * cst, ne10_vec3f_t * src, ne10_uint32_t count);
+extern ne10_result_t mulcmatvec_cm2x2f_v2f_asm(ne10_vec2f_t * dst, const ne10_mat2x2f_t * cst, ne10_vec2f_t * src, ne10_uint32_t count);
+
+
+
+
+// ## Matrix-Matrix Algebra ##
+extern ne10_result_t multrans_mat4x4f_asm(ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
+extern ne10_result_t multrans_mat3x3f_asm(ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
+extern ne10_result_t multrans_mat2x2f_asm(ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : inc/NE10_neon.h
- */
-
-//#include "../headers/versionheader.h"
-
-#ifndef NE10_NEON_H
-#define NE10_NEON_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-///////////////////////////
-// function prototypes:
-///////////////////////////
-
-
-// ## Vector-Constant Arithmetic ##
-
-extern arm_result_t addc_float_neon(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count);
-extern arm_result_t addc_vec2f_neon(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count);
-extern arm_result_t addc_vec3f_neon(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count);
-extern arm_result_t addc_vec4f_neon(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count);
-
-
-
-extern arm_result_t subc_float_neon(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count); // subtract cst from the element(s)
-extern arm_result_t subc_vec2f_neon(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count); // subtract cst from the element(s)
-extern arm_result_t subc_vec3f_neon(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count); // subtract cst from the element(s)
-extern arm_result_t subc_vec4f_neon(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count); // subtract cst from the element(s)
-
-
-
-extern arm_result_t rsbc_float_neon(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count); // subtract element(s) from a cst
-extern arm_result_t rsbc_vec2f_neon(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count); // subtract element(s) from a cst
-extern arm_result_t rsbc_vec3f_neon(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count); // subtract element(s) from a cst
-extern arm_result_t rsbc_vec4f_neon(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count); // subtract element(s) from a cst
-
-
-
-extern arm_result_t mulc_float_neon(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count);
-extern arm_result_t mulc_vec2f_neon(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count);
-extern arm_result_t mulc_vec3f_neon(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count);
-extern arm_result_t mulc_vec4f_neon(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count);
-
-
-
-extern arm_result_t divc_float_neon(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count);
-extern arm_result_t divc_vec2f_neon(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count);
-extern arm_result_t divc_vec3f_neon(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count);
-extern arm_result_t divc_vec4f_neon(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count);
-
-
-
-extern arm_result_t setc_float_neon(arm_float_t * dst, const arm_float_t cst, unsigned int count);
-extern arm_result_t setc_vec2f_neon(arm_vec2f_t * dst, const arm_vec2f_t * cst, unsigned int count);
-extern arm_result_t setc_vec3f_neon(arm_vec3f_t * dst, const arm_vec3f_t * cst, unsigned int count);
-extern arm_result_t setc_vec4f_neon(arm_vec4f_t * dst, const arm_vec4f_t * cst, unsigned int count);
-
-
-
-extern arm_result_t mlac_float_neon(arm_float_t * dst, arm_float_t * acc, arm_float_t * src, const arm_float_t cst, unsigned int count);
-extern arm_result_t mlac_vec2f_neon(arm_vec2f_t * dst, arm_vec2f_t * acc, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count);
-extern arm_result_t mlac_vec3f_neon(arm_vec3f_t * dst, arm_vec3f_t * acc, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count);
-extern arm_result_t mlac_vec4f_neon(arm_vec4f_t * dst, arm_vec4f_t * acc, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count);
-
-
-
-// ## Arithmetic functions over arrays of cst values ##
-extern arm_result_t add_float_neon(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count);
-extern arm_result_t sub_float_neon(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count);
-extern arm_result_t mul_float_neon(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count);
-extern arm_result_t div_float_neon(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count);
-extern arm_result_t mla_float_neon(arm_float_t * dst, arm_float_t * acc, arm_float_t * src1, arm_float_t * src2, unsigned int count);
-extern arm_result_t abs_float_neon(arm_float_t * dst, arm_float_t * src, unsigned int count);
-
-// ## Operations on Vectors ##
-extern arm_result_t len_vec2f_neon(arm_float_t * dst, arm_vec2f_t * src, unsigned int count);
-extern arm_result_t len_vec3f_neon(arm_float_t * dst, arm_vec3f_t * src, unsigned int count);
-extern arm_result_t len_vec4f_neon(arm_float_t * dst, arm_vec4f_t * src, unsigned int count);
-
-
-
-extern arm_result_t normalize_vec2f_neon(arm_vec2f_t * dst, arm_vec2f_t * src, unsigned int count);
-extern arm_result_t normalize_vec3f_neon(arm_vec3f_t * dst, arm_vec3f_t * src, unsigned int count);
-extern arm_result_t normalize_vec4f_neon(arm_vec4f_t * dst, arm_vec4f_t * src, unsigned int count);
-
-
-
-extern arm_result_t abs_vec2f_neon(arm_vec2f_t * dst, arm_vec2f_t * src, unsigned int count);
-extern arm_result_t abs_vec3f_neon(arm_vec3f_t * dst, arm_vec3f_t * src, unsigned int count);
-extern arm_result_t abs_vec4f_neon(arm_vec4f_t * dst, arm_vec4f_t * src, unsigned int count);
-
-
-
-// ## SIMD Component-wise Arithmetic on Two Vectors ##
-extern arm_result_t vmul_vec2f_neon(arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
-extern arm_result_t vmul_vec3f_neon(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
-extern arm_result_t vmul_vec4f_neon(arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
-
-
-
-extern arm_result_t vdiv_vec2f_neon(arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
-extern arm_result_t vdiv_vec3f_neon(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
-extern arm_result_t vdiv_vec4f_neon(arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
-
-
-
-extern arm_result_t vmla_vec2f_neon(arm_vec2f_t * dst, arm_vec2f_t * acc, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
-extern arm_result_t vmla_vec3f_neon(arm_vec3f_t * dst, arm_vec3f_t * acc, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
-extern arm_result_t vmla_vec4f_neon(arm_vec4f_t * dst, arm_vec4f_t * acc, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
-
-
-
-// ## Vector-Vector Algebra ##
-extern arm_result_t add_vec2f_neon(arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
-extern arm_result_t add_vec3f_neon(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
-extern arm_result_t add_vec4f_neon(arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
-
-
-
-extern arm_result_t sub_vec2f_neon(arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
-extern arm_result_t sub_vec3f_neon(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
-extern arm_result_t sub_vec4f_neon(arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
-
-
-
-extern arm_result_t dot_vec2f_neon(arm_float_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
-extern arm_result_t dot_vec3f_neon(arm_float_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
-extern arm_result_t dot_vec4f_neon(arm_float_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
-
-
-
-extern arm_result_t cross_vec3f_neon(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
-
-
-
-// ## Matrix-Constant Arithmetic ##
-
-// arm_mat4x4f_t
-extern arm_result_t addmat_4x4f_neon(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count);
-extern arm_result_t submat_4x4f_neon(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count);
-extern arm_result_t mulmat_4x4f_neon(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count);
-extern arm_result_t divmat_4x4f_neon(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count);
-extern arm_result_t setmat_4x4f_neon(arm_mat4x4f_t * dst, const arm_float_t cst, unsigned int count);
-
-extern arm_result_t addmat_3x3f_neon(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count);
-extern arm_result_t submat_3x3f_neon(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count);
-extern arm_result_t mulmat_3x3f_neon(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count);
-extern arm_result_t divmat_3x3f_neon(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count);
-extern arm_result_t setmat_3x3f_neon(arm_mat3x3f_t * dst, const arm_float_t cst, unsigned int count);
-
-extern arm_result_t addmat_2x2f_neon(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count);
-extern arm_result_t submat_2x2f_neon(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count);
-extern arm_result_t mulmat_2x2f_neon(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count);
-extern arm_result_t divmat_2x2f_neon(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count);
-extern arm_result_t setmat_2x2f_neon(arm_mat2x2f_t * dst, const arm_float_t cst, unsigned int count);
-
-
-
-// ## Operations on Matrices ##
-
-
-extern arm_result_t detmat_4x4f_neon(arm_float_t * dst, arm_mat4x4f_t * src, unsigned int count);
-extern arm_result_t detmat_3x3f_neon(arm_float_t * dst, arm_mat3x3f_t * src, unsigned int count);
-extern arm_result_t detmat_2x2f_neon(arm_float_t * dst, arm_mat2x2f_t * src, unsigned int count);
-
-extern arm_result_t invmat_4x4f_neon(arm_mat4x4f_t * dst, arm_mat4x4f_t * src, unsigned int count);
-extern arm_result_t invmat_3x3f_neon(arm_mat3x3f_t * dst, arm_mat3x3f_t * src, unsigned int count);
-extern arm_result_t invmat_2x2f_neon(arm_mat2x2f_t * dst, arm_mat2x2f_t * src, unsigned int count);
-
-extern arm_result_t transmat_4x4f_neon(arm_mat4x4f_t * dst, arm_mat4x4f_t * src, unsigned int count);
-extern arm_result_t identitymat_4x4f_neon(arm_mat4x4f_t * dst, unsigned int count);
-
-extern arm_result_t transmat_3x3f_neon(arm_mat3x3f_t * dst, arm_mat3x3f_t * src, unsigned int count);
-extern arm_result_t identitymat_3x3f_neon(arm_mat3x3f_t * dst, unsigned int count);
-
-extern arm_result_t transmat_2x2f_neon(arm_mat2x2f_t * dst, arm_mat2x2f_t * src, unsigned int count);
-extern arm_result_t identitymat_2x2f_neon(arm_mat2x2f_t * dst, unsigned int count);
-
-
-
-// ## Matrix-Vector Algebra ##
-extern arm_result_t mulcmatvec_cm4x4f_v4f_neon(arm_vec4f_t * dst, const arm_mat4x4f_t * cst, arm_vec4f_t * src, unsigned int count);
-extern arm_result_t mulcmatvec_cm3x3f_v3f_neon(arm_vec3f_t * dst, const arm_mat3x3f_t * cst, arm_vec3f_t * src, unsigned int count);
-extern arm_result_t mulcmatvec_cm2x2f_v2f_neon(arm_vec2f_t * dst, const arm_mat2x2f_t * cst, arm_vec2f_t * src, unsigned int count);
-
-
-
-
-// ## Matrix-Matrix Algebra ##
-extern arm_result_t multrans_mat4x4f_neon(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count);
-extern arm_result_t multrans_mat3x3f_neon(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count);
-extern arm_result_t multrans_mat2x2f_neon(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
/////////////////////////////////////////////////////////
// some external definitions to be exposed to the users
/////////////////////////////////////////////////////////
-typedef float arm_float_t; // a single float value
-typedef int arm_result_t; // resulting [error-]code
+
+// Scalar type aliases used by the NE10 API.
+// NOTE(review): the bit widths suggested by the names assume the usual ARM data
+// model (8-bit char, 16-bit short, 32-bit int, 64-bit long long) - confirm
+// before building for any other target.
+typedef signed char ne10_int8_t;
+typedef unsigned char ne10_uint8_t;
+typedef signed short ne10_int16_t;
+typedef unsigned short ne10_uint16_t;
+typedef signed int ne10_int32_t;
+typedef unsigned int ne10_uint32_t;
+typedef signed long long int ne10_int64_t;
+typedef unsigned long long int ne10_uint64_t;
+typedef float ne10_float32_t;
+typedef double ne10_float64_t;
+typedef int ne10_result_t; // resulting [error-]code
typedef struct
{
- float x;
- float y;
-} arm_vec2f_t; // a 2-tuple of float values
+ ne10_float32_t x;
+ ne10_float32_t y;
+} ne10_vec2f_t; // a 2-tuple of ne10_float32_t values
typedef struct
{
- float x;
- float y;
- float z;
-} arm_vec3f_t; // a 3-tuple of float values
+ ne10_float32_t x;
+ ne10_float32_t y;
+ ne10_float32_t z;
+} ne10_vec3f_t; // a 3-tuple of ne10_float32_t values
typedef struct
{
- float x;
- float y;
- float z;
- float w;
-} arm_vec4f_t; // a 4-tuple of float values
+ ne10_float32_t x;
+ ne10_float32_t y;
+ ne10_float32_t z;
+ ne10_float32_t w;
+} ne10_vec4f_t; // a 4-tuple of ne10_float32_t values
-typedef struct { float r1; float r2; } __attribute__((packed)) arm_mat_row2f;
+typedef struct { ne10_float32_t r1; ne10_float32_t r2; } __attribute__((packed)) ne10_mat_row2f;
typedef struct
{
- arm_mat_row2f c1;
- arm_mat_row2f c2;
+ ne10_mat_row2f c1;
+ ne10_mat_row2f c2;
-} __attribute__((packed)) arm_mat2x2f_t; // a 2x2 matrix
+} __attribute__((packed)) ne10_mat2x2f_t; // a 2x2 matrix
-static inline void createColumnMajorMatrix2x2( arm_mat2x2f_t * outMat, arm_float_t m11, arm_float_t m21, arm_float_t m12, arm_float_t m22)
+static inline void createColumnMajorMatrix2x2( ne10_mat2x2f_t * outMat, ne10_float32_t m11, ne10_float32_t m21, ne10_float32_t m12, ne10_float32_t m22)
{
assert( NULL != outMat );
}
-typedef struct { float r1; float r2; float r3; } __attribute__((packed)) arm_mat_row3f;
+typedef struct { ne10_float32_t r1; ne10_float32_t r2; ne10_float32_t r3; } __attribute__((packed)) ne10_mat_row3f;
typedef struct
{
- arm_mat_row3f c1;
- arm_mat_row3f c2;
- arm_mat_row3f c3;
+ ne10_mat_row3f c1;
+ ne10_mat_row3f c2;
+ ne10_mat_row3f c3;
-} __attribute__((packed)) arm_mat3x3f_t; // a 3x3 matrix
+} __attribute__((packed)) ne10_mat3x3f_t; // a 3x3 matrix
-static inline void createColumnMajorMatrix3x3( arm_mat3x3f_t * outMat, arm_float_t m11, arm_float_t m21, arm_float_t m31,
- arm_float_t m12, arm_float_t m22, arm_float_t m32,
- arm_float_t m13, arm_float_t m23, arm_float_t m33)
+static inline void createColumnMajorMatrix3x3( ne10_mat3x3f_t * outMat, ne10_float32_t m11, ne10_float32_t m21, ne10_float32_t m31,
+ ne10_float32_t m12, ne10_float32_t m22, ne10_float32_t m32,
+ ne10_float32_t m13, ne10_float32_t m23, ne10_float32_t m33)
{
assert( NULL != outMat );
}
-typedef struct { float r1; float r2; float r3; float r4; } __attribute__((packed)) arm_mat_row4f;
+typedef struct { ne10_float32_t r1; ne10_float32_t r2; ne10_float32_t r3; ne10_float32_t r4; } __attribute__((packed)) ne10_mat_row4f;
typedef struct
{
- arm_mat_row4f c1;
- arm_mat_row4f c2;
- arm_mat_row4f c3;
- arm_mat_row4f c4;
+ ne10_mat_row4f c1;
+ ne10_mat_row4f c2;
+ ne10_mat_row4f c3;
+ ne10_mat_row4f c4;
-} __attribute__((packed)) arm_mat4x4f_t; // a 4x4 matrix
+} __attribute__((packed)) ne10_mat4x4f_t; // a 4x4 matrix
-static inline void createColumnMajorMatrix4x4( arm_mat4x4f_t * outMat, arm_float_t m11, arm_float_t m21, arm_float_t m31, arm_float_t m41,
- arm_float_t m12, arm_float_t m22, arm_float_t m32, arm_float_t m42,
- arm_float_t m13, arm_float_t m23, arm_float_t m33, arm_float_t m43,
- arm_float_t m14, arm_float_t m24, arm_float_t m34, arm_float_t m44)
+static inline void createColumnMajorMatrix4x4( ne10_mat4x4f_t * outMat, ne10_float32_t m11, ne10_float32_t m21, ne10_float32_t m31, ne10_float32_t m41,
+ ne10_float32_t m12, ne10_float32_t m22, ne10_float32_t m32, ne10_float32_t m42,
+ ne10_float32_t m13, ne10_float32_t m23, ne10_float32_t m33, ne10_float32_t m43,
+ ne10_float32_t m14, ne10_float32_t m24, ne10_float32_t m34, ne10_float32_t m44)
{
assert( NULL != outMat );
--- /dev/null
+#
+# Copyright 2011-12 ARM Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Source lists shared by every library target below.
+# The init file is always built; the other three lists start out unset and are
+# filled in by each enabled module.
+set(NE10_INIT_SRCS ${PROJECT_SOURCE_DIR}/modules/NE10_init.c)
+unset(NE10_C_SRCS)
+unset(NE10_INTRINSIC_SRCS)
+unset(NE10_NEON_SRCS)
+
+if(NE10_ENABLE_MATH)
+    # Define NE10_ENABLE_MATH for the C sources (enables the NE10_init_math call).
+    add_definitions(-DNE10_ENABLE_MATH)
+
+    # Plain C implementations of the math module.
+    set(NE10_MATH_C_SRCS
+        ${PROJECT_SOURCE_DIR}/modules/math/NE10_abs.c
+        ${PROJECT_SOURCE_DIR}/modules/math/NE10_addc.c
+        ${PROJECT_SOURCE_DIR}/modules/math/NE10_add.c
+        ${PROJECT_SOURCE_DIR}/modules/math/NE10_divc.c
+        ${PROJECT_SOURCE_DIR}/modules/math/NE10_div.c
+        ${PROJECT_SOURCE_DIR}/modules/math/NE10_len.c
+        ${PROJECT_SOURCE_DIR}/modules/math/NE10_mlac.c
+        ${PROJECT_SOURCE_DIR}/modules/math/NE10_mla.c
+        ${PROJECT_SOURCE_DIR}/modules/math/NE10_mulc.c
+        ${PROJECT_SOURCE_DIR}/modules/math/NE10_mul.c
+        ${PROJECT_SOURCE_DIR}/modules/math/NE10_normalize.c
+        ${PROJECT_SOURCE_DIR}/modules/math/NE10_rsbc.c
+        ${PROJECT_SOURCE_DIR}/modules/math/NE10_setc.c
+        ${PROJECT_SOURCE_DIR}/modules/math/NE10_subc.c
+        ${PROJECT_SOURCE_DIR}/modules/math/NE10_sub.c
+        ${PROJECT_SOURCE_DIR}/modules/math/NE10_dot.c
+        ${PROJECT_SOURCE_DIR}/modules/math/NE10_cross.c
+        ${PROJECT_SOURCE_DIR}/modules/math/NE10_addmat.c
+        ${PROJECT_SOURCE_DIR}/modules/math/NE10_submat.c
+        ${PROJECT_SOURCE_DIR}/modules/math/NE10_mulmat.c
+        ${PROJECT_SOURCE_DIR}/modules/math/NE10_mulcmatvec.c
+        ${PROJECT_SOURCE_DIR}/modules/math/NE10_detmat.c
+        ${PROJECT_SOURCE_DIR}/modules/math/NE10_invmat.c
+        ${PROJECT_SOURCE_DIR}/modules/math/NE10_transmat.c
+        ${PROJECT_SOURCE_DIR}/modules/math/NE10_identitymat.c
+    )
+
+    # C files written with NEON intrinsics.
+    set(NE10_MATH_INTRINSIC_SRCS
+        ${PROJECT_SOURCE_DIR}/modules/math/NE10_addc.neon.c
+        ${PROJECT_SOURCE_DIR}/modules/math/NE10_divc.neon.c
+        ${PROJECT_SOURCE_DIR}/modules/math/NE10_mlac.neon.c
+        ${PROJECT_SOURCE_DIR}/modules/math/NE10_mulc.neon.c
+        ${PROJECT_SOURCE_DIR}/modules/math/NE10_rsbc.neon.c
+        ${PROJECT_SOURCE_DIR}/modules/math/NE10_setc.neon.c
+        ${PROJECT_SOURCE_DIR}/modules/math/NE10_subc.neon.c
+        ${PROJECT_SOURCE_DIR}/modules/math/NE10_addmat.neon.c
+        ${PROJECT_SOURCE_DIR}/modules/math/NE10_submat.neon.c
+    )
+
+    # Intrinsic files must be compiled with "-mfpu=neon"; the property is set
+    # on the whole list in a single call.
+    set_source_files_properties(${NE10_MATH_INTRINSIC_SRCS}
+        PROPERTIES COMPILE_FLAGS "-mfpu=neon"
+    )
+
+    # Hand-written NEON assembly files.
+    set(NE10_MATH_NEON_SRCS
+        ${PROJECT_SOURCE_DIR}/modules/math/NE10_abs.neon.s
+        ${PROJECT_SOURCE_DIR}/modules/math/NE10_add.neon.s
+        ${PROJECT_SOURCE_DIR}/modules/math/NE10_div.neon.s
+        ${PROJECT_SOURCE_DIR}/modules/math/NE10_len.neon.s
+        ${PROJECT_SOURCE_DIR}/modules/math/NE10_mla.neon.s
+        ${PROJECT_SOURCE_DIR}/modules/math/NE10_mul.neon.s
+        ${PROJECT_SOURCE_DIR}/modules/math/NE10_normalize.neon.s
+        ${PROJECT_SOURCE_DIR}/modules/math/NE10_sub.neon.s
+        ${PROJECT_SOURCE_DIR}/modules/math/NE10_dot.neon.s
+        ${PROJECT_SOURCE_DIR}/modules/math/NE10_cross.neon.s
+        ${PROJECT_SOURCE_DIR}/modules/math/NE10_mulmat.neon.s
+        ${PROJECT_SOURCE_DIR}/modules/math/NE10_mulcmatvec.neon.s
+        ${PROJECT_SOURCE_DIR}/modules/math/NE10_detmat.neon.s
+        ${PROJECT_SOURCE_DIR}/modules/math/NE10_invmat.neon.s
+        ${PROJECT_SOURCE_DIR}/modules/math/NE10_transmat.neon.s
+        ${PROJECT_SOURCE_DIR}/modules/math/NE10_identitymat.neon.s
+    )
+
+    # The assembly files go through the C compiler driver: mark them as
+    # LANGUAGE C and hand the include search paths to the assembler via -Wa,-I.
+    set(FLAGS "-mfpu=neon -Wa,-I${PROJECT_SOURCE_DIR}/inc -Wa,-I${PROJECT_SOURCE_DIR}/common -Wa,-I${PROJECT_SOURCE_DIR}/modules/math")
+    set_source_files_properties(${NE10_MATH_NEON_SRCS}
+        PROPERTIES
+            LANGUAGE C
+            COMPILE_FLAGS "${FLAGS}"
+    )
+
+    # Initialisation code of the math module.
+    set(NE10_MATH_INIT_SRCS
+        ${PROJECT_SOURCE_DIR}/modules/math/NE10_init_math.c
+    )
+
+    # Append the math module to the library-wide source lists.
+    list(APPEND NE10_INIT_SRCS ${NE10_MATH_INIT_SRCS})
+    list(APPEND NE10_C_SRCS ${NE10_MATH_C_SRCS})
+    list(APPEND NE10_INTRINSIC_SRCS ${NE10_MATH_INTRINSIC_SRCS})
+    list(APPEND NE10_NEON_SRCS ${NE10_MATH_NEON_SRCS})
+endif()
+
+# Header search paths for every target defined in this directory.
+include_directories(${PROJECT_SOURCE_DIR}/inc ${PROJECT_SOURCE_DIR}/common)
+
+# Static library target "NE10", built only when NE10_BUILD_STATIC is set.
+if(NE10_BUILD_STATIC)
+    add_library(NE10 STATIC
+        ${NE10_C_SRCS}
+        ${NE10_INTRINSIC_SRCS}
+        ${NE10_NEON_SRCS}
+        ${NE10_INIT_SRCS})
+
+    # NOTE(review): CLEAN_DIRECT_OUTPUT looks like a legacy property kept so the
+    # static and shared targets can share the output name "NE10" - confirm it
+    # is still needed for the minimum supported CMake version.
+    set_target_properties(NE10
+        PROPERTIES
+            CLEAN_DIRECT_OUTPUT 1
+            VERSION ${NE10_VERSION})
+endif()
+
+# Shared library targets, built only when NE10_BUILD_SHARED is set:
+#   NE10_shared - the shared flavour of the library, output name "NE10"
+#   NE10_test   - a second shared library built from the same sources,
+#                 output name "NE10_test"
+if(NE10_BUILD_SHARED)
+    add_library(NE10_shared SHARED
+        ${NE10_C_SRCS}
+        ${NE10_INTRINSIC_SRCS}
+        ${NE10_NEON_SRCS}
+        ${NE10_INIT_SRCS})
+    set_target_properties(NE10_shared
+        PROPERTIES
+            OUTPUT_NAME "NE10"
+            CLEAN_DIRECT_OUTPUT 1
+            VERSION ${NE10_VERSION})
+
+    add_library(NE10_test SHARED
+        ${NE10_C_SRCS}
+        ${NE10_INTRINSIC_SRCS}
+        ${NE10_NEON_SRCS}
+        ${NE10_INIT_SRCS})
+    set_target_properties(NE10_test
+        PROPERTIES
+            OUTPUT_NAME "NE10_test"
+            CLEAN_DIRECT_OUTPUT 1
+            VERSION ${NE10_VERSION})
+endif()
+
+
--- /dev/null
+/*
+ * Copyright 2011-12 ARM Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "NE10.h"
+
+#include <ctype.h>
+#include <stdio.h>
+#include <string.h>
+
+#define CPUINFO_BUFFER_SIZE (1024*4)
+
+// This file-scope (global) variable indicates whether or not the running platform supports ARM NEON
+ne10_result_t is_NEON_available = NE10_ERR;
+
+// Returns the current value of is_NEON_available: NE10_ERR until NE10_init()
+// has found "neon" in /proc/cpuinfo, NE10_OK afterwards.
+ne10_result_t NE10_HasNEON()
+{
+ return is_NEON_available;
+}
+
+// Detects ARM NEON support by scanning /proc/cpuinfo for the word "neon"
+// (case-insensitively), records the result in is_NEON_available and then
+// initialises the enabled NE10 modules.
+//
+// Returns NE10_OK on success. Returns NE10_ERR when /proc/cpuinfo cannot be
+// opened, yields no data, or fills the whole CPUINFO_BUFFER_SIZE-byte buffer.
+// A platform without NEON is not an error: is_NEON_available just stays NE10_ERR.
+ne10_result_t NE10_init()
+{
+    FILE* infofile = NULL;                      // Handle for /proc/cpuinfo
+    ne10_int8_t cpuinfo[CPUINFO_BUFFER_SIZE];   // The buffer to read in the string
+    ne10_uint32_t bytes = 0;                    // Number of bytes read from the file
+    ne10_int32_t i = 0;                         // Temporary loop counter
+
+    memset (cpuinfo, 0, CPUINFO_BUFFER_SIZE);
+
+    infofile = fopen ("/proc/cpuinfo", "r");
+    if (NULL == infofile)
+    {
+        // Without this check fread()/fclose() would be called on a NULL stream.
+        fprintf (stderr, "ERROR: Couldn't read the file \"/proc/cpuinfo\". NE10_init() failed.\n");
+        return NE10_ERR;
+    }
+
+    bytes = fread (cpuinfo, 1, sizeof (cpuinfo), infofile);
+    fclose (infofile);
+
+    // A completely filled buffer has no terminating '\0' left, so it is
+    // rejected just like an empty read.
+    if (0 == bytes || CPUINFO_BUFFER_SIZE == bytes)
+    {
+        fprintf (stderr, "ERROR: Couldn't read the file \"/proc/cpuinfo\". NE10_init() failed.\n");
+        return NE10_ERR;
+    }
+
+    // Lower-case the text in place. The index is advanced in its own expression
+    // (reading and modifying i inside one assignment is undefined behaviour),
+    // and tolower() is given an unsigned char value as the C standard requires.
+    for (i = 0; '\0' != cpuinfo[i]; i++)
+    {
+        cpuinfo[i] = (ne10_int8_t) tolower ((unsigned char) cpuinfo[i]);
+    }
+
+    if (NULL != strstr ((const char*) cpuinfo, "neon"))
+    {
+        is_NEON_available = NE10_OK;
+    }
+
+#if defined (NE10_ENABLE_MATH)
+    NE10_init_math (is_NEON_available);
+#endif
+
+    // The function is declared to return a status; report success explicitly.
+    return NE10_OK;
+}
--- /dev/null
+@
+@ Copyright 2011-12 ARM Limited
+@
+@ Licensed under the Apache License, Version 2.0 (the "License");
+@ you may not use this file except in compliance with the License.
+@ You may obtain a copy of the License at
+@
+@ http://www.apache.org/licenses/LICENSE-2.0
+@
+@ Unless required by applicable law or agreed to in writing, software
+@ distributed under the License is distributed on an "AS IS" BASIS,
+@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ See the License for the specific language governing permissions and
+@ limitations under the License.
+@
+
+@
+@ NE10 Library : math/NE10_abs.asm.s
+@
+
+ .text
+ .syntax unified
+
+.include "NE10header.s"
+
+ .balign 4
+ .global abs_float_asm
+ .thumb
+ .thumb_func
+
+abs_float_asm:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ Scalar (VFP) element-wise absolute value: dst[i] = |src[i]|.
+        @
+        @ Presumed C prototype - TODO confirm against the public header:
+        @ ne10_result_t abs_float_asm(ne10_float32_t * dst,
+        @                             ne10_float32_t * src,
+        @                             ne10_uint32_t count)
+        @
+        @  r0: *dst, advanced by 4 bytes after every store
+        @  r1: *src, advanced by 4 bytes after every load
+        @  r2: count, used directly as the down-counting loop counter
+        @
+        @  Returns NE10_OK in r0.
+        @
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        cbz     r2, .LoopEndFloat       @ count == 0: nothing to process
+
+.LoopBeginFloat:
+        vldr      s1, [r1]              @ Load s1 = src[i]
+        add       r1, r1, #4            @ move to the next item
+        vabs.f32  s1, s1                @ s1 = abs(s1)
+        vstr      s1, [r0]              @ dst[i] = s1
+        add       r0, r0, #4            @ move to the next entry
+        subs      r2, r2, #1            @ one item fewer to go
+        bne       .LoopBeginFloat       @ continue while items remain
+
+.LoopEndFloat:
+        mov     r0, NE10_OK             @ Return NE10_OK
+        bx      lr
--- /dev/null
+/*
+ * Copyright 2011-12 ARM Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NE10 Library : math/NE10_abs.c
+ */
+
+#include "NE10_types.h"
+#include "macros.h"
+
+#include <assert.h>
+
+#include <math.h>
+
+/*
+ * Writes the absolute value of src[itr] to dst[itr].
+ * NOTE(review): the loop over itr (presumably 0..count-1) and the return value
+ * come from NE10_ABS_OPERATION_X_C in macros.h, which is not visible here.
+ * fabsf() keeps the computation in single precision; fabs() would promote each
+ * float to double and back for the same result.
+ */
+ne10_result_t abs_float_c (ne10_float32_t * dst, ne10_float32_t * src, ne10_uint32_t count)
+{
+    NE10_ABS_OPERATION_X_C
+    (
+        dst[itr] = fabsf (src[itr]);
+    );
+}
+
+/*
+ * Writes the component-wise absolute value of the 2D vector src[itr] to dst[itr].
+ * NOTE(review): the loop over itr (presumably 0..count-1) and the return value
+ * come from NE10_ABS_OPERATION_X_C in macros.h, which is not visible here.
+ * fabsf() keeps the computation in single precision.
+ */
+ne10_result_t abs_vec2f_c (ne10_vec2f_t * dst, ne10_vec2f_t * src, ne10_uint32_t count)
+{
+    NE10_ABS_OPERATION_X_C
+    (
+        dst[ itr ].x = fabsf (src[ itr ].x);
+        dst[ itr ].y = fabsf (src[ itr ].y);
+    );
+}
+
+/*
+ * Writes the component-wise absolute value of the 3D vector src[itr] to dst[itr].
+ * NOTE(review): the loop over itr (presumably 0..count-1) and the return value
+ * come from NE10_ABS_OPERATION_X_C in macros.h, which is not visible here.
+ * fabsf() keeps the computation in single precision.
+ */
+ne10_result_t abs_vec3f_c (ne10_vec3f_t * dst, ne10_vec3f_t * src, ne10_uint32_t count)
+{
+    NE10_ABS_OPERATION_X_C
+    (
+        dst[ itr ].x = fabsf (src[ itr ].x);
+        dst[ itr ].y = fabsf (src[ itr ].y);
+        dst[ itr ].z = fabsf (src[ itr ].z);
+    );
+}
+
+/*
+ * Writes the component-wise absolute value of the 4D vector src[itr] to dst[itr].
+ * NOTE(review): the loop over itr (presumably 0..count-1) and the return value
+ * come from NE10_ABS_OPERATION_X_C in macros.h, which is not visible here.
+ * fabsf() keeps the computation in single precision.
+ */
+ne10_result_t abs_vec4f_c (ne10_vec4f_t * dst, ne10_vec4f_t * src, ne10_uint32_t count)
+{
+    NE10_ABS_OPERATION_X_C
+    (
+        dst[ itr ].x = fabsf (src[ itr ].x);
+        dst[ itr ].y = fabsf (src[ itr ].y);
+        dst[ itr ].z = fabsf (src[ itr ].z);
+        dst[ itr ].w = fabsf (src[ itr ].w);
+    );
+}
+
--- /dev/null
+@
+@ Copyright 2011-12 ARM Limited
+@
+@ Licensed under the Apache License, Version 2.0 (the "License");
+@ you may not use this file except in compliance with the License.
+@ You may obtain a copy of the License at
+@
+@ http://www.apache.org/licenses/LICENSE-2.0
+@
+@ Unless required by applicable law or agreed to in writing, software
+@ distributed under the License is distributed on an "AS IS" BASIS,
+@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ See the License for the specific language governing permissions and
+@ limitations under the License.
+@
+
+@
+@ NE10 Library : math/NE10_abs.neon.s
+@
+
+ .text
+ .syntax unified
+
+.include "NE10header.s"
+
+
+
+
+ .align 4
+ .global abs_float_neon
+ .thumb
+ .thumb_func
+
+abs_float_neon:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ NEON element-wise absolute value: dst[i] = |src[i]|.
+        @
+        @ Presumed C prototype - TODO confirm against the public header:
+        @ ne10_result_t abs_float_neon(ne10_float32_t * dst,
+        @                              ne10_float32_t * src,
+        @                              ne10_uint32_t count);
+        @
+        @  r0: *dst & the current dst entry's address
+        @  r1: *src & current src entry's address
+        @  r2: count on entry, then the number of 4-float chunks handled by
+        @      the main loop
+        @  r3: the number of residual items (count % 4), processed first at
+        @      the beginning of the input array
+        @
+        @  Always returns 0 in r0.
+        @
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        and     r3, r2, #3              @ r3 = count % 4; residual loop count
+        lsr     r2, r2, #2              @ r2 = count / 4; count is unsigned, so
+                                        @ shift logically, not arithmetically
+
+        cbz     r3, .L_check_mainloop_float
+
+.L_residualloop_float:
+        @ process the residual items one float at a time
+        vld1.f32        d0[0], [r1]!    @ d0 = { V.x, - }
+
+        subs            r3, r3, #1
+
+        @ absolute values
+        vabs.f32        d0, d0
+
+        vst1.32         {d0[0]}, [r0]!
+
+        bgt             .L_residualloop_float
+
+.L_check_mainloop_float:
+        cbz     r2, .L_return_float
+
+.L_mainloop_float:
+        @ Load at the top of the loop: preloading the "next" chunk after the
+        @ last iteration would read 16 bytes beyond the end of src.
+        vld1.32         {q0}, [r1]!
+
+        @ absolute values of the current set
+        vabs.f32        q3, q0          @ q3 = abs( q0 )
+
+        @ store the result for the current set
+        vst1.32         {d6,d7}, [r0]!
+
+        subs            r2, r2, #1
+        bgt             .L_mainloop_float   @ loop while another 4 floats remain
+
+.L_return_float:
+        @ return
+        mov     r0, #0
+        bx      lr
+
+
+
+
+ .align 4
+ .global abs_vec2f_neon
+ .thumb
+ .thumb_func
+
+abs_vec2f_neon:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ NEON component-wise absolute value of an array of 2D vectors.
+        @
+        @ Presumed C prototype - TODO confirm against the public header:
+        @ ne10_result_t abs_vec2f_neon(ne10_vec2f_t * dst,
+        @                              ne10_vec2f_t * src,
+        @                              ne10_uint32_t count);
+        @
+        @  r0: *dst & the current dst entry's address
+        @  r1: *src & current src entry's address
+        @  r2: count on entry, then the number of 4-vector chunks handled by
+        @      the main loop
+        @  r3: the number of residual vectors (count % 4), processed first at
+        @      the beginning of the input array
+        @
+        @  Only q0, q1, q8 and q9 are written; the callee-saved q4-q7 (d8-d15)
+        @  are left untouched as the AAPCS requires.
+        @  Always returns 0 in r0.
+        @
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        and     r3, r2, #3              @ r3 = count % 4; residual loop count
+        lsr     r2, r2, #2              @ r2 = count / 4; count is unsigned, so
+                                        @ shift logically, not arithmetically
+
+        cbz     r3, .L_check_mainloop_vec2
+
+.L_residualloop_vec2:
+        @ process the residual items one vector at a time
+        vld1.f32        d0, [r1]!       @ d0 = { V.x, V.y }
+
+        subs            r3, r3, #1
+
+        @ absolute values
+        vabs.f32        d0, d0
+
+        vst1.32         {d0}, [r0]!
+
+        bgt             .L_residualloop_vec2
+
+.L_check_mainloop_vec2:
+        cbz     r2, .L_return_vec2
+
+.L_mainloop_vec2:
+        @ Load at the top of the loop: preloading the "next" chunk after the
+        @ last iteration would read 32 bytes beyond the end of src.
+        vld2.32         {q0-q1}, [r1]!  @ q0 = four x values, q1 = four y values
+
+        @ absolute values of the current set, kept in caller-saved registers
+        vabs.f32        q8, q0          @ q8 = abs( q0 )
+        vabs.f32        q9, q1          @ q9 = abs( q1 )
+
+        @ store the result for the current set (re-interleaved as x,y pairs)
+        vst2.32         {d16,d17,d18,d19}, [r0]!
+
+        subs            r2, r2, #1
+        bgt             .L_mainloop_vec2    @ loop while another 4 vec2s remain
+
+.L_return_vec2:
+        @ return
+        mov     r0, #0
+        bx      lr
+
+
+ .align 4
+ .global abs_vec3f_neon
+ .thumb
+ .thumb_func
+abs_vec3f_neon:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ NEON component-wise absolute value of an array of 3D vectors.
+        @
+        @ Presumed C prototype - TODO confirm against the public header:
+        @ ne10_result_t abs_vec3f_neon(ne10_vec3f_t * dst,
+        @                              ne10_vec3f_t * src,
+        @                              ne10_uint32_t count);
+        @
+        @  r0: *dst & the current dst entry's address
+        @  r1: *src & current src entry's address
+        @  r2: count on entry, then the number of 4-vector chunks handled by
+        @      the main loop
+        @  r3: the number of residual vectors (count % 4), processed first at
+        @      the beginning of the input array
+        @
+        @  Only q0-q2 and q8-q10 are written; the callee-saved q4-q7 (d8-d15)
+        @  are left untouched as the AAPCS requires.
+        @  Always returns 0 in r0.
+        @
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        and     r3, r2, #3              @ r3 = count % 4; residual loop count
+        lsr     r2, r2, #2              @ r2 = count / 4; count is unsigned, so
+                                        @ shift logically, not arithmetically
+
+        cbz     r3, .L_check_mainloop_vec3
+
+.L_residualloop_vec3:
+        @ process the residual items one vector at a time
+        vld3.f32        {d0[0], d2[0], d4[0]}, [r1]!    @ The values are loaded like so:
+                                                        @      q0 = { V.x, -, -, - };
+                                                        @      q1 = { V.y, -, -, - };
+                                                        @      q2 = { V.z, -, -, - };
+        subs            r3, r3, #1
+
+        @ absolute values (only lane 0 of each register is stored below)
+        vabs.f32        d0, d0
+        vabs.f32        d2, d2
+        vabs.f32        d4, d4
+
+        vst3.32         {d0[0], d2[0], d4[0]}, [r0]!
+
+        bgt             .L_residualloop_vec3
+
+.L_check_mainloop_vec3:
+        cbz     r2, .L_return_vec3
+
+.L_mainloop_vec3:
+        @ Load at the top of the loop: preloading the "next" chunk after the
+        @ last iteration would read 48 bytes beyond the end of src.
+        vld3.32         {d0, d2, d4}, [r1]!     @ vectors 0-1, de-interleaved
+        vld3.32         {d1, d3, d5}, [r1]!     @ vectors 2-3: q0 = x, q1 = y, q2 = z
+
+        @ absolute values of the current set, kept in caller-saved registers
+        vabs.f32        q8, q0
+        vabs.f32        q9, q1
+        vabs.f32        q10, q2
+
+        @ store the result for the current set
+        vst3.32         {d16, d18, d20}, [r0]!
+        vst3.32         {d17, d19, d21}, [r0]!
+
+        subs            r2, r2, #1
+        bgt             .L_mainloop_vec3    @ loop while another 4 vec3s remain
+
+.L_return_vec3:
+        @ return
+        mov     r0, #0
+        bx      lr
+
+
+
+
+ .align 4
+ .global abs_vec4f_neon
+ .thumb
+ .thumb_func
+abs_vec4f_neon:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ NEON component-wise absolute value of an array of 4D vectors.
+        @
+        @ Presumed C prototype - TODO confirm against the public header:
+        @ ne10_result_t abs_vec4f_neon(ne10_vec4f_t * dst,
+        @                              ne10_vec4f_t * src,
+        @                              ne10_uint32_t count);
+        @
+        @  r0: *dst & the current dst entry's address
+        @  r1: *src & current src entry's address
+        @  r2: count on entry, then the number of 4-vector chunks handled by
+        @      the main loop
+        @  r3: the number of residual vectors (count % 4), processed first at
+        @      the beginning of the input array
+        @
+        @  Always returns 0 in r0.
+        @
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        and     r3, r2, #3              @ r3 = count % 4; residual loop count
+        lsr     r2, r2, #2              @ r2 = count / 4; count is unsigned, so
+                                        @ shift logically, not arithmetically
+
+        cbz     r3, .L_check_mainloop_vec4
+
+.L_residualloop_vec4:
+        @ process the residual items one vector at a time
+        vld1.f32        {d0, d1}, [r1]!     @ The values are loaded like so:
+                                            @      q0 = { V.x, V.y, V.z, V.w };
+        subs            r3, r3, #1
+
+        @ absolute values
+        vabs.f32        q0, q0
+
+        vst1.32         {d0, d1}, [r0]!
+
+        bgt             .L_residualloop_vec4
+
+.L_check_mainloop_vec4:
+        cbz     r2, .L_return_vec4
+
+.L_mainloop_vec4:
+        @ Load at the top of the loop: preloading the "next" chunk after the
+        @ last iteration would read 64 bytes beyond the end of src.
+        vld4.32         {d0, d2, d4, d6}, [r1]!     @ vectors 0-1, de-interleaved
+        vld4.32         {d1, d3, d5, d7}, [r1]!     @ vectors 2-3: q0 = x ... q3 = w
+
+        @ absolute values of the current set
+        vabs.f32        q10, q0
+        vabs.f32        q11, q1
+        vabs.f32        q12, q2
+        vabs.f32        q13, q3
+
+        @ store the result for the current set
+        vst4.32         {d20, d22, d24, d26}, [r0]!
+        vst4.32         {d21, d23, d25, d27}, [r0]!
+
+        subs            r2, r2, #1
+        bgt             .L_mainloop_vec4    @ loop while another 4 vec4s remain
+
+.L_return_vec4:
+        @ return
+        mov     r0, #0
+        bx      lr
+
--- /dev/null
+@
+@ Copyright 2011-12 ARM Limited
+@
+@ Licensed under the Apache License, Version 2.0 (the "License");
+@ you may not use this file except in compliance with the License.
+@ You may obtain a copy of the License at
+@
+@ http://www.apache.org/licenses/LICENSE-2.0
+@
+@ Unless required by applicable law or agreed to in writing, software
+@ distributed under the License is distributed on an "AS IS" BASIS,
+@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ See the License for the specific language governing permissions and
+@ limitations under the License.
+@
+
+@
+@ NE10 Library : math/NE10_add.asm.s
+@
+
+ .text
+ .syntax unified
+
+.include "NE10header.s"
+
+ .balign 4
+ .global add_float_asm
+ .thumb
+ .thumb_func
+
+add_float_asm:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ Scalar (VFP) element-wise addition: dst[i] = src1[i] + src2[i].
+ @
+ @ Presumed C prototype - TODO confirm against the public header:
+ @ ne10_result_t add_float_asm(ne10_float32_t * dst,
+ @ ne10_float32_t * src1, ne10_float32_t * src2,
+ @ ne10_uint32_t count)
+ @
+ @ r0: *dst & current dst entry's address (advanced by 4 per item)
+ @ r1: *src1 & current src1 entry's address (advanced by 4 per item)
+ @ r2: *src2 & current src2 entry's address (advanced by 4 per item)
+ @ r3: int count
+ @
+ @ r3: loop counter, counted down to zero
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ cbz r3, .LoopEndFloat
+
+.LoopBeginFloat:
+ vldr s1, [r1] @ Load s1 = src1[i]
+ add r1, r1, #4 @ move to the next entry
+ vldr s2, [r2] @ Load s2 = src2[i]
+ add r2, r2, #4 @ next entry
+ vadd.f32 s10, s1, s2 @ s10 = src1[i] + src2[i]
+ vstr s10, [r0] @ Store the result back into the main memory
+ add r0, r0, #4 @ next entry in the dst
+ subs r3, r3, #1 @ count down the number of remaining items
+ bne .LoopBeginFloat @ Continue while items remain
+
+.LoopEndFloat:
+ mov r0, NE10_OK @ Return NE10_OK
+ bx lr
--- /dev/null
+/*
+ * Copyright 2011-12 ARM Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NE10 Library : math/NE10_add.c
+ */
+
+#include "NE10_types.h"
+#include "macros.h"
+
+#include <assert.h>
+
+/*
+ * Element-wise addition of two float arrays: dst[itr] = src1[itr] + src2[itr].
+ * NOTE(review): the loop over itr (presumably 0..count-1) and the return value
+ * come from NE10_X_OPERATION_FLOAT_C in macros.h, which is not visible here.
+ */
+ne10_result_t add_float_c (ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count)
+{
+ NE10_X_OPERATION_FLOAT_C
+ (
+ dst[ itr ] = src1[ itr ] + src2[ itr ];
+ );
+}
+
+/*
+ * Component-wise addition of two arrays of 2D vectors (x and y).
+ * NOTE(review): the loop over itr (presumably 0..count-1) and the return value
+ * come from NE10_X_OPERATION_FLOAT_C in macros.h, which is not visible here.
+ */
+ne10_result_t add_vec2f_c (ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count)
+{
+ NE10_X_OPERATION_FLOAT_C
+ (
+ dst[ itr ].x = src1[ itr ].x + src2[ itr ].x;
+ dst[ itr ].y = src1[ itr ].y + src2[ itr ].y;
+ );
+}
+
+/*
+ * Component-wise addition of two arrays of 3D vectors (x, y and z).
+ * NOTE(review): the loop over itr (presumably 0..count-1) and the return value
+ * come from NE10_X_OPERATION_FLOAT_C in macros.h, which is not visible here.
+ */
+ne10_result_t add_vec3f_c (ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count)
+{
+ NE10_X_OPERATION_FLOAT_C
+ (
+ dst[ itr ].x = src1[ itr ].x + src2[ itr ].x;
+ dst[ itr ].y = src1[ itr ].y + src2[ itr ].y;
+ dst[ itr ].z = src1[ itr ].z + src2[ itr ].z;
+ );
+}
+
+/*
+ * Component-wise addition of two arrays of 4D vectors (x, y, z and w).
+ * NOTE(review): the loop over itr (presumably 0..count-1) and the return value
+ * come from NE10_X_OPERATION_FLOAT_C in macros.h, which is not visible here.
+ */
+ne10_result_t add_vec4f_c (ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count)
+{
+ NE10_X_OPERATION_FLOAT_C
+ (
+ dst[ itr ].x = src1[ itr ].x + src2[ itr ].x;
+ dst[ itr ].y = src1[ itr ].y + src2[ itr ].y;
+ dst[ itr ].z = src1[ itr ].z + src2[ itr ].z;
+ dst[ itr ].w = src1[ itr ].w + src2[ itr ].w;
+ );
+}
--- /dev/null
+@
+@ Copyright 2011-12 ARM Limited
+@
+@ Licensed under the Apache License, Version 2.0 (the "License");
+@ you may not use this file except in compliance with the License.
+@ You may obtain a copy of the License at
+@
+@ http://www.apache.org/licenses/LICENSE-2.0
+@
+@ Unless required by applicable law or agreed to in writing, software
+@ distributed under the License is distributed on an "AS IS" BASIS,
+@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ See the License for the specific language governing permissions and
+@ limitations under the License.
+@
+
+@
+@ NE10 Library : math/NE10_add.neon.s
+@
+
+ .text
+ .syntax unified
+
+.include "NE10header.s"
+
+
+
+
+ .align 4
+ .global add_float_neon
+ .thumb
+ .thumb_func
+
+add_float_neon:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ NEON element-wise addition: dst[i] = src1[i] + src2[i].
+        @
+        @ Presumed C prototype - TODO confirm against the public header:
+        @ ne10_result_t add_float_neon(ne10_float32_t * dst,
+        @                              ne10_float32_t * src1,
+        @                              ne10_float32_t * src2,
+        @                              ne10_uint32_t count)
+        @
+        @  r0: *dst & current dst entry's address
+        @  r1: *src1 & current src1 entry's address
+        @  r2: *src2 & current src2 entry's address
+        @  r3: count on entry, then the number of 4-float chunks handled by
+        @      the main loop
+        @
+        @  r4: the number of residual items (count % 4), processed first at
+        @      the beginning of the input arrays; saved and restored here
+        @
+        @  Always returns 0 in r0.
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        push    {r4}
+        and     r4, r3, #3              @ r4 = count % 4; residual loop count
+        lsr     r3, r3, #2              @ r3 = count / 4; count is unsigned, so
+                                        @ shift logically, not arithmetically
+
+        cbz     r4, .L_check_mainloop_float
+
+.L_residualloop_float:
+        @ process the residual items one float at a time
+        vld1.f32        d0[0], [r1]!    @ Fill in d0[0]
+        vld1.f32        d1[0], [r2]!    @ Fill in d1[0]
+
+        subs            r4, r4, #1
+
+        @ lane 0 of d0 receives the sum that is stored below
+        vadd.f32        d0, d0, d1
+
+        vst1.32         {d0[0]}, [r0]!
+
+        bgt             .L_residualloop_float
+
+.L_check_mainloop_float:
+        cbz     r3, .L_return_float
+
+.L_mainloop_float:
+        @ Load at the top of the loop: preloading the "next" chunk after the
+        @ last iteration would read 16 bytes beyond the end of src1 and src2.
+        vld1.32         {q0}, [r1]!
+        vld1.32         {q1}, [r2]!
+
+        @ calculate values for current set
+        vadd.f32        q3, q0, q1      @ q3 = q0 + q1
+
+        @ store the result for current set
+        vst1.32         {d6,d7}, [r0]!
+
+        subs            r3, r3, #1
+        bgt             .L_mainloop_float   @ loop while another 4 floats remain
+
+.L_return_float:
+        @ return
+        pop     {r4}
+        mov     r0, #0
+        bx      lr
+
+
+
+
+ .align 4
+ .global add_vec2f_neon
+ .thumb
+ .thumb_func
+
+add_vec2f_neon:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ NEON component-wise addition of two arrays of 2D vectors.
+        @
+        @ Presumed C prototype - TODO confirm against the public header:
+        @ ne10_result_t add_vec2f_neon(ne10_vec2f_t * dst,
+        @                              ne10_vec2f_t * src1,
+        @                              ne10_vec2f_t * src2,
+        @                              ne10_uint32_t count)
+        @
+        @  r0: *dst & current dst entry's address
+        @  r1: *src1 & current src1 entry's address
+        @  r2: *src2 & current src2 entry's address
+        @  r3: count on entry, then the number of 4-vector chunks handled by
+        @      the main loop
+        @
+        @  r4: the number of residual vectors (count % 4), processed first at
+        @      the beginning of the input arrays; saved and restored here
+        @
+        @  Always returns 0 in r0.
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        push    {r4}
+        and     r4, r3, #3              @ r4 = count % 4; residual loop count
+        lsr     r3, r3, #2              @ r3 = count / 4; count is unsigned, so
+                                        @ shift logically, not arithmetically
+
+        cbz     r4, .L_check_mainloop_vec2
+
+.L_residualloop_vec2:
+        @ process the residual items one vector at a time
+        vld1.f32        d0, [r1]!       @ d0 = { V1.x, V1.y }
+        vld1.f32        d1, [r2]!       @ d1 = { V2.x, V2.y }
+
+        subs            r4, r4, #1
+
+        @ calculate values
+        vadd.f32        d0, d0, d1
+
+        vst1.32         {d0}, [r0]!
+        bgt             .L_residualloop_vec2
+
+.L_check_mainloop_vec2:
+        cbz     r3, .L_return_vec2
+
+.L_mainloop_vec2:
+        @ Load at the top of the loop: preloading the "next" chunk after the
+        @ last iteration would read 32 bytes beyond the end of src1 and src2.
+        vld2.32         {q0-q1}, [r1]!  @ q0 = four V1.x, q1 = four V1.y
+        vld2.32         {q2-q3}, [r2]!  @ q2 = four V2.x, q3 = four V2.y
+
+        @ calculate values for current set
+        vadd.f32        q8, q0, q2
+        vadd.f32        q9, q1, q3
+
+        @ store the result for current set (re-interleaved as x,y pairs)
+        vst2.32         {d16,d17,d18,d19}, [r0]!
+
+        subs            r3, r3, #1
+        bgt             .L_mainloop_vec2    @ loop while another 4 vectors (8 floats) remain
+
+.L_return_vec2:
+        @ return
+        pop     {r4}
+        mov     r0, #0
+        bx      lr
+
+
+
+
+ .align 4
+ .global add_vec3f_neon
+ .thumb
+ .thumb_func
+add_vec3f_neon:
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+        @
+        @ NEON component-wise addition of two arrays of 3D vectors.
+        @
+        @ Presumed C prototype - TODO confirm against the public header:
+        @ ne10_result_t add_vec3f_neon(ne10_vec3f_t * dst,
+        @                              ne10_vec3f_t * src1,
+        @                              ne10_vec3f_t * src2,
+        @                              ne10_uint32_t count)
+        @
+        @  r0: *dst & current dst entry's address
+        @  r1: *src1 & current src1 entry's address
+        @  r2: *src2 & current src2 entry's address
+        @  r3: count on entry, then the number of 4-vector chunks handled by
+        @      the main loop
+        @
+        @  r4: the number of residual vectors (count % 4), processed first at
+        @      the beginning of the input arrays; saved and restored here
+        @
+        @  Always returns 0 in r0.
+        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        push    {r4}
+        and     r4, r3, #3              @ r4 = count % 4; residual loop count
+        lsr     r3, r3, #2              @ r3 = count / 4; count is unsigned, so
+                                        @ shift logically, not arithmetically
+
+        cbz     r4, .L_check_mainloop_vec3
+
+.L_residualloop_vec3:
+        @ process the residual items one vector at a time
+        vld3.f32        {d0[0], d2[0], d4[0]}, [r1]!    @ The values are loaded like so:
+                                                        @      q0 = { V1.x, -, -, - };
+                                                        @      q1 = { V1.y, -, -, - };
+                                                        @      q2 = { V1.z, -, -, - };
+        vld3.f32        {d1[0], d3[0], d5[0]}, [r2]!    @ The values are loaded like so:
+                                                        @      q0 = { V1.x, -, V2.x, - };
+                                                        @      q1 = { V1.y, -, V2.y, - };
+                                                        @      q2 = { V1.z, -, V2.z, - };
+
+        subs            r4, r4, #1
+
+        @ add the high half (V2) to the low half (V1); lane 0 holds the sums
+        vadd.f32        d0, d0, d1
+        vadd.f32        d2, d2, d3
+        vadd.f32        d4, d4, d5
+
+        vst3.32         {d0[0], d2[0], d4[0]}, [r0]!
+
+        bgt             .L_residualloop_vec3
+
+.L_check_mainloop_vec3:
+        cbz     r3, .L_return_vec3
+
+.L_mainloop_vec3:
+        @ Load at the top of the loop: preloading the "next" chunk after the
+        @ last iteration would read 48 bytes beyond the end of src1 and src2.
+        vld3.32         {d0, d2, d4}, [r1]!         @ src1 vectors 0-1
+        vld3.32         {d1, d3, d5}, [r1]!         @ src1 vectors 2-3: q0 = x, q1 = y, q2 = z
+        vld3.32         {d18, d20, d22}, [r2]!      @ src2 vectors 0-1
+        vld3.32         {d19, d21, d23}, [r2]!      @ src2 vectors 2-3: q9 = x, q10 = y, q11 = z
+
+        @ calculate values for current set
+        vadd.f32        q12, q0, q9
+        vadd.f32        q13, q1, q10
+        vadd.f32        q14, q2, q11
+
+        @ store the result for current set
+        vst3.32         {d24, d26, d28}, [r0]!
+        vst3.32         {d25, d27, d29}, [r0]!
+
+        subs            r3, r3, #1
+        bgt             .L_mainloop_vec3    @ loop while another 4 vectors (12 floats) remain
+
+.L_return_vec3:
+        @ return
+        pop     {r4}
+        mov     r0, #0
+        bx      lr
+
+
+
+
+ .align 4
+ .global add_vec4f_neon
+ .thumb
+ .thumb_func
+add_vec4f_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t add_vec4f_neon(arm_vec4f_t * dst,
+ @ arm_vec4f_t * src1,
+ @ arm_vec4f_t * src2,
+ @ unsigned int count)
+ @
+ @ Computes dst[i] = src1[i] + src2[i] for count 4-float vectors.
+ @
+ @ r0: *dst & current dst entry's address
+ @ r1: *src1 & current src1 entry's address
+ @ r2: *src2 & current src2 entry's address
+ @ r3: count on entry, then the number of 4-vector chunks (count / 4)
+ @ handled by the main loop
+ @
+ @ r4: count % 4, the residual vectors that are processed one at a
+ @ time at the beginning of the input arrays
+ @
+ @ Returns 0 in r0.
+ @
+ @ Every main-loop iteration loads its own operands before using them,
+ @ so nothing beyond the last element of src1/src2 is ever read.
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ push {r4}
+ and r4, r3, #3 @ r4 = count % 4; calculate the residual loop
+ lsr r3, r3, #2 @ r3 = count / 4; logical shift because count is unsigned
+
+ cbz r4, .L_check_mainloop_vec4
+
+.L_residualloop_vec4:
+ @ process the residual items one vector at a time
+ vld1.f32 {d0, d1}, [r1]! @ The values are loaded like so:
+ @ q0 = { V1.x, V1.y, V1.z, V1.w };
+ vld1.f32 {d2, d3}, [r2]! @ The values are loaded like so:
+ @ q1 = { V2.x, V2.y, V2.z, V2.w };
+
+ subs r4, r4, #1 @ NEON instructions below leave the flags untouched
+
+ @ calculate values
+ vadd.f32 q0, q0, q1
+
+ vst1.32 {d0, d1}, [r0]!
+
+ bgt .L_residualloop_vec4
+
+.L_check_mainloop_vec4:
+ cbz r3, .L_return_vec4
+
+.L_mainloop_vec4:
+ @ load the current set of 4 vectors from each source, de-interleaved:
+ @ q0..q3 = src1 x/y/z/w, q8..q11 = src2 x/y/z/w
+ vld4.32 {d0, d2, d4, d6}, [r1]!
+ vld4.32 {d1, d3, d5, d7}, [r1]!
+ vld4.32 {d16, d18, d20, d22}, [r2]!
+ vld4.32 {d17, d19, d21, d23}, [r2]!
+
+ @ calculate values for the current set
+ vadd.f32 q12, q0, q8
+ vadd.f32 q13, q1, q9
+ vadd.f32 q14, q2, q10
+ vadd.f32 q15, q3, q11
+
+ @ store the result for the current set, re-interleaving x/y/z/w
+ vst4.32 {d24, d26, d28, d30}, [r0]!
+ vst4.32 {d25, d27, d29, d31}, [r0]!
+
+ subs r3, r3, #1
+ bgt .L_mainloop_vec4 @ loop while r3 > 0, i.e. another 4 vectors (16 floats) remain
+
+.L_return_vec4:
+ @ return
+ pop {r4}
+ mov r0, #0
+ bx lr
--- /dev/null
+@
+@ Copyright 2011-12 ARM Limited
+@
+@ Licensed under the Apache License, Version 2.0 (the "License");
+@ you may not use this file except in compliance with the License.
+@ You may obtain a copy of the License at
+@
+@ http://www.apache.org/licenses/LICENSE-2.0
+@
+@ Unless required by applicable law or agreed to in writing, software
+@ distributed under the License is distributed on an "AS IS" BASIS,
+@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ See the License for the specific language governing permissions and
+@ limitations under the License.
+@
+
+@
+@ NE10 Library : math/NE10_addc.asm.s
+@
+
+ .text
+ .syntax unified
+
+.include "NE10header.s"
+
+ .balign 4
+ .global addc_float_asm
+ .thumb
+ .thumb_func
+
+addc_float_asm:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t addc_float(arm_float_t * dst,
+ @ arm_float_t * src, const arm_float_t cst,
+ @ unsigned int count)
+ @
+ @ Computes dst[i] = src[i] + cst for count floats.
+ @
+ @ r0: *dst
+ @ r1: *src
+ @ r2: cst (the float constant's bits, received in a core register)
+ @ r3: int count
+ @
+ @ r3: loop counter
+ @ r5: current item's offset in both src[] and dst[]
+ @ r6: current source item's address made of base(r1)+offset(r5)
+ @ r7: current destination item's address made of base(r0)+offset(r5)
+ @ s3: cst, moved out of r2 once before the loop
+ @
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ push {r4, r5, r6, r7}
+ cbz r3, .LoopEndFloat @ nothing to do when count == 0
+ mov r5, #0
+ vmov s3, r2 @ cst never changes: move it into s3 once, outside the loop
+
+.LoopBeginFloat:
+ add r6, r1, r5 @ Get current source item's address in memory
+ vldr s1, [r6, #0] @ Load src[i]
+ vadd.f32 s10, s1, s3 @ s10 = src[i] + cst
+ add r7, r0, r5 @ Get current destination item's address in memory
+ vstr s10, [r7, #0] @ Store the result back into the main memory
+ add r5, r5, #4 @ increase the offset by 1*sizeof(float)
+ subs r3, r3, #1 @ one item fewer left to process
+ bne .LoopBeginFloat @ Continue while items remain (r3 != 0)
+
+.LoopEndFloat:
+ mov r0, NE10_OK @ Return NE10_OK (symbol expected from NE10header.s)
+ pop {r4, r5, r6, r7}
+ bx lr
+
+
+
+
+ .balign 4
+ .global addc_vec2f_asm
+ .thumb
+ .thumb_func
+
+addc_vec2f_asm:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t addc_vec2f(arm_vec2f_t * dst,
+ @ arm_vec2f_t * src, const arm_vec2f_t * cst,
+ @ unsigned int count)
+ @
+ @ For each of the count items: dst[i].{x,y} = src[i].{x,y} + cst->{x,y}
+ @
+ @ Inputs:
+ @ r0 = dst base, r1 = src base, r2 = pointer to cst, r3 = count
+ @
+ @ Working registers:
+ @ r3 = items still to process
+ @ r5 = byte offset of the current item (shared by src[] and dst[])
+ @ r6 = r1 + r5, address of the current source item
+ @ r7 = r0 + r5, address of the current destination item
+ @
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ push {r4, r5, r6, r7}
+ cbz r3, .L_addc_vec2f_done @ count == 0: skip straight to the return
+ mov r5, #0 @ start at offset 0
+
+.L_addc_vec2f_next:
+ add r6, r1, r5 @ r6 = &src[i]
+ add r7, r0, r5 @ r7 = &dst[i]
+
+ vldr s3, [r2, #0] @ s3 = cst->x
+ vldr s4, [r2, #4] @ s4 = cst->y
+ vldr s1, [r6, #0] @ s1 = src[i].x
+ vldr s2, [r6, #4] @ s2 = src[i].y
+
+ vadd.f32 s10, s1, s3 @ x sum
+ vadd.f32 s11, s2, s4 @ y sum
+
+ vstr s10, [r7, #0] @ dst[i].x
+ vstr s11, [r7, #4] @ dst[i].y
+
+ add r5, r5, #8 @ advance by one item: 2*sizeof(float)
+ subs r3, r3, #1 @ one item done
+ bne .L_addc_vec2f_next @ keep going until r3 reaches zero
+
+.L_addc_vec2f_done:
+ mov r0, NE10_OK @ result code
+ pop {r4, r5, r6, r7}
+ bx lr
+
+
+
+
+ .balign 4
+ .global addc_vec3f_asm
+ .thumb
+ .thumb_func
+
+addc_vec3f_asm:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t addc_vec3f(arm_vec3f_t * dst,
+ @ arm_vec3f_t * src, const arm_vec3f_t * cst,
+ @ unsigned int count)
+ @
+ @ For each of the count items:
+ @ dst[i].{x,y,z} = src[i].{x,y,z} + cst->{x,y,z}
+ @
+ @ Inputs:
+ @ r0 = dst base, r1 = src base, r2 = pointer to cst, r3 = count
+ @
+ @ Working registers:
+ @ r3 = items still to process
+ @ r5 = byte offset of the current item (shared by src[] and dst[])
+ @ r6 = r1 + r5, address of the current source item
+ @ r7 = r0 + r5, address of the current destination item
+ @
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ push {r4, r5, r6, r7}
+ cbz r3, .L_addc_vec3f_done @ count == 0: skip straight to the return
+ mov r5, #0 @ start at offset 0
+
+.L_addc_vec3f_next:
+ add r6, r1, r5 @ r6 = &src[i]
+ add r7, r0, r5 @ r7 = &dst[i]
+
+ vldr s4, [r2, #0] @ s4 = cst->x
+ vldr s5, [r2, #4] @ s5 = cst->y
+ vldr s6, [r2, #8] @ s6 = cst->z
+ vldr s1, [r6, #0] @ s1 = src[i].x
+ vldr s2, [r6, #4] @ s2 = src[i].y
+ vldr s3, [r6, #8] @ s3 = src[i].z
+
+ vadd.f32 s10, s1, s4 @ x sum
+ vadd.f32 s11, s2, s5 @ y sum
+ vadd.f32 s12, s3, s6 @ z sum
+
+ vstr s10, [r7, #0] @ dst[i].x
+ vstr s11, [r7, #4] @ dst[i].y
+ vstr s12, [r7, #8] @ dst[i].z
+
+ add r5, r5, #12 @ advance by one item: 3*sizeof(float)
+ subs r3, r3, #1 @ one item done
+ bne .L_addc_vec3f_next @ keep going until r3 reaches zero
+
+.L_addc_vec3f_done:
+ mov r0, NE10_OK @ result code
+ pop {r4, r5, r6, r7}
+ bx lr
+
+
+
+
+ .balign 4
+ .global addc_vec4f_asm
+ .thumb
+ .thumb_func
+
+addc_vec4f_asm:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t addc_vec4f(arm_vec4f_t * dst,
+ @ arm_vec4f_t * src, const arm_vec4f_t * cst,
+ @ unsigned int count)
+ @
+ @ For each of the count items:
+ @ dst[i].{x,y,z,w} = src[i].{x,y,z,w} + cst->{x,y,z,w}
+ @
+ @ Inputs:
+ @ r0 = dst base, r1 = src base, r2 = pointer to cst, r3 = count
+ @
+ @ Working registers:
+ @ r3 = items still to process
+ @ r5 = byte offset of the current item (shared by src[] and dst[])
+ @ r6 = r1 + r5, address of the current source item
+ @ r7 = r0 + r5, address of the current destination item
+ @
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ push {r4, r5, r6, r7}
+ cbz r3, .L_addc_vec4f_done @ count == 0: skip straight to the return
+ mov r5, #0 @ start at offset 0
+
+.L_addc_vec4f_next:
+ add r6, r1, r5 @ r6 = &src[i]
+ add r7, r0, r5 @ r7 = &dst[i]
+
+ vldr s5, [r2, #0] @ s5 = cst->x
+ vldr s6, [r2, #4] @ s6 = cst->y
+ vldr s7, [r2, #8] @ s7 = cst->z
+ vldr s8, [r2, #12] @ s8 = cst->w
+ vldr s1, [r6, #0] @ s1 = src[i].x
+ vldr s2, [r6, #4] @ s2 = src[i].y
+ vldr s3, [r6, #8] @ s3 = src[i].z
+ vldr s4, [r6, #12] @ s4 = src[i].w
+
+ vadd.f32 s10, s1, s5 @ x sum
+ vadd.f32 s11, s2, s6 @ y sum
+ vadd.f32 s12, s3, s7 @ z sum
+ vadd.f32 s13, s4, s8 @ w sum
+
+ vstr s10, [r7, #0] @ dst[i].x
+ vstr s11, [r7, #4] @ dst[i].y
+ vstr s12, [r7, #8] @ dst[i].z
+ vstr s13, [r7, #12] @ dst[i].w
+
+ add r5, r5, #16 @ advance by one item: 4*sizeof(float)
+ subs r3, r3, #1 @ one item done
+ bne .L_addc_vec4f_next @ keep going until r3 reaches zero
+
+.L_addc_vec4f_done:
+ mov r0, NE10_OK @ result code
+ pop {r4, r5, r6, r7}
+ bx lr
--- /dev/null
+/*
+ * Copyright 2011-12 ARM Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NE10 Library : math/NE10_addc.c
+ */
+
+#include "NE10_types.h"
+#include "macros.h"
+
+#include <assert.h>
+
+/*
+ * Plain-C scalar add-constant: for each index itr the statement below stores
+ * src[itr] + cst into dst[itr].
+ * The iteration over itr, any use of count and the returned ne10_result_t all
+ * come from the NE10_XC_OPERATION_X_C macro (macros.h, not visible here).
+ */
+ne10_result_t addc_float_c (ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count)
+{
+ NE10_XC_OPERATION_X_C
+ (
+ dst[ itr ] = src[ itr ] + cst;
+ );
+}
+
+/*
+ * Plain-C 2D add-constant: for each index itr the statements below add the
+ * x and y components of *cst to src[itr] and store the sums in dst[itr].
+ * The iteration over itr, any use of count and the returned ne10_result_t all
+ * come from the NE10_XC_OPERATION_X_C macro (macros.h, not visible here).
+ */
+ne10_result_t addc_vec2f_c (ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count)
+{
+ NE10_XC_OPERATION_X_C
+ (
+ dst[ itr ].x = src[ itr ].x + cst->x;
+ dst[ itr ].y = src[ itr ].y + cst->y;
+ );
+}
+
+/*
+ * Plain-C 3D add-constant: for each index itr the statements below add the
+ * x, y and z components of *cst to src[itr] and store the sums in dst[itr].
+ * The iteration over itr, any use of count and the returned ne10_result_t all
+ * come from the NE10_XC_OPERATION_X_C macro (macros.h, not visible here).
+ */
+ne10_result_t addc_vec3f_c (ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count)
+{
+ NE10_XC_OPERATION_X_C
+ (
+ dst[ itr ].x = src[ itr ].x + cst->x;
+ dst[ itr ].y = src[ itr ].y + cst->y;
+ dst[ itr ].z = src[ itr ].z + cst->z;
+ );
+}
+
+/*
+ * Plain-C 4D add-constant: for each index itr the statements below add the
+ * x, y, z and w components of *cst to src[itr] and store the sums in dst[itr].
+ * The iteration over itr, any use of count and the returned ne10_result_t all
+ * come from the NE10_XC_OPERATION_X_C macro (macros.h, not visible here).
+ */
+ne10_result_t addc_vec4f_c (ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count)
+{
+ NE10_XC_OPERATION_X_C
+ (
+ dst[ itr ].x = src[ itr ].x + cst->x;
+ dst[ itr ].y = src[ itr ].y + cst->y;
+ dst[ itr ].z = src[ itr ].z + cst->z;
+ dst[ itr ].w = src[ itr ].w + cst->w;
+ );
+}
--- /dev/null
+/*
+ * Copyright 2011-12 ARM Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NE10 Library : math/NE10_addc.neon.c
+ */
+
+#include <assert.h>
+#include <arm_neon.h>
+
+#include "NE10.h"
+#include "macros.h"
+
+
+/*
+ * NEON-intrinsics scalar add-constant.
+ * This function only supplies two arithmetic fragments to the
+ * NE10_XC_OPERATION_FLOAT_NEON macro (macros.h, not visible here); the macro
+ * is expected to declare n_src/n_cst/n_dst and n_tmp_src/n_tmp_cst, perform
+ * the loads and stores, loop over count and produce the return value.
+ */
+ne10_result_t addc_float_neon (ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count)
+{
+ NE10_XC_OPERATION_FLOAT_NEON
+ (
+ n_dst = vaddq_f32 (n_src , n_cst); /* 4-lane (128-bit) add */
+ ,
+ n_tmp_src = vadd_f32 (n_tmp_src, n_tmp_cst); /* 2-lane (64-bit) add, result kept in n_tmp_src; presumably the leftover-items path - confirm in macros.h */
+ );
+}
+
+/*
+ * NEON-intrinsics 2D add-constant.
+ * This function only supplies two arithmetic fragments to the
+ * NE10_XC_OPERATION_VEC2F_NEON macro (macros.h, not visible here); the macro
+ * is expected to declare n_src/n_cst/n_dst and n_tmp_src/n_tmp_cst, perform
+ * the loads and stores, loop over count and produce the return value.
+ */
+ne10_result_t addc_vec2f_neon (ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count)
+{
+ NE10_XC_OPERATION_VEC2F_NEON
+ (
+ n_dst = vaddq_f32 (n_src , n_cst); /* 4-lane (128-bit) add */
+ ,
+ n_tmp_src = vadd_f32 (n_tmp_src, n_tmp_cst); /* 2-lane (64-bit) add, result kept in n_tmp_src; presumably the leftover-items path - confirm in macros.h */
+ );
+}
+
+/*
+ * NEON-intrinsics 3D add-constant.
+ * This function only supplies two arithmetic fragments to the
+ * NE10_XC_OPERATION_VEC3F_NEON macro (macros.h, not visible here); the macro
+ * is expected to declare the n_src1..3 / n_cst1..3 / n_dst1..3 quad
+ * registers and the n_tmp_src / n_tmp_cst register groups, perform the loads
+ * and stores, loop over count and produce the return value.
+ */
+ne10_result_t addc_vec3f_neon (ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count)
+{
+ NE10_XC_OPERATION_VEC3F_NEON
+ (
+ /* first fragment: three independent 4-lane adds */
+ n_dst1 = vaddq_f32 (n_src1 , n_cst1);
+ n_dst2 = vaddq_f32 (n_src2 , n_cst2);
+ n_dst3 = vaddq_f32 (n_src3 , n_cst3);
+ ,
+ /* second fragment: 2-lane adds on each element of the register group, results kept in n_tmp_src */
+ n_tmp_src.val[0] = vadd_f32 (n_tmp_src.val[0], n_tmp_cst.val[0]); /* the X lane */
+ n_tmp_src.val[1] = vadd_f32 (n_tmp_src.val[1], n_tmp_cst.val[1]); /* the Y lane */
+ n_tmp_src.val[2] = vadd_f32 (n_tmp_src.val[2], n_tmp_cst.val[2]); /* the Z lane */
+ );
+}
+
+/*
+ * NEON-intrinsics 4D add-constant.
+ * Unlike the float/vec2f/vec3f variants, only one arithmetic fragment is
+ * passed to the macro here. NE10_XC_OPERATION_VEC4F_NEON (macros.h, not
+ * visible here) is expected to declare n_src/n_cst/n_dst, perform the loads
+ * and stores, loop over count and produce the return value.
+ */
+ne10_result_t addc_vec4f_neon (ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count)
+{
+ NE10_XC_OPERATION_VEC4F_NEON
+ (
+ n_dst = vaddq_f32 (n_src , n_cst); /* 4-lane (128-bit) add */
+ );
+}
--- /dev/null
+@
+@ Copyright 2011-12 ARM Limited
+@
+@ Licensed under the Apache License, Version 2.0 (the "License");
+@ you may not use this file except in compliance with the License.
+@ You may obtain a copy of the License at
+@
+@ http://www.apache.org/licenses/LICENSE-2.0
+@
+@ Unless required by applicable law or agreed to in writing, software
+@ distributed under the License is distributed on an "AS IS" BASIS,
+@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ See the License for the specific language governing permissions and
+@ limitations under the License.
+@
+
+@
+@ NE10 Library : math/NE10_addmat.asm.s
+@
--- /dev/null
+/*
+ * Copyright 2011-12 ARM Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NE10 Library : math/NE10_addmat.c
+ */
+
+#include "NE10_types.h"
+#include "macros.h"
+
+#include <assert.h>
+
+/*
+ * Plain-C 2x2 matrix addition: for each index itr, every element (columns
+ * c1..c2, rows r1..r2) of dst[itr] is set to the sum of the matching elements
+ * of src1[itr] and src2[itr].
+ * The iteration over itr, any use of count and the returned ne10_result_t all
+ * come from the NE10_X_OPERATION_FLOAT_C macro (macros.h, not visible here).
+ */
+ne10_result_t addmat_2x2f_c (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count)
+{
+ NE10_X_OPERATION_FLOAT_C
+ (
+ dst[ itr ].c1.r1 = src1[ itr ].c1.r1 + src2[ itr ].c1.r1;
+ dst[ itr ].c1.r2 = src1[ itr ].c1.r2 + src2[ itr ].c1.r2;
+
+ dst[ itr ].c2.r1 = src1[ itr ].c2.r1 + src2[ itr ].c2.r1;
+ dst[ itr ].c2.r2 = src1[ itr ].c2.r2 + src2[ itr ].c2.r2;
+ );
+}
+
+/*
+ * Plain-C 3x3 matrix addition: for each index itr, every element (columns
+ * c1..c3, rows r1..r3) of dst[itr] is set to the sum of the matching elements
+ * of src1[itr] and src2[itr].
+ * The iteration over itr, any use of count and the returned ne10_result_t all
+ * come from the NE10_X_OPERATION_FLOAT_C macro (macros.h, not visible here).
+ */
+ne10_result_t addmat_3x3f_c (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count)
+{
+ NE10_X_OPERATION_FLOAT_C
+ (
+ dst[ itr ].c1.r1 = src1[ itr ].c1.r1 + src2[ itr ].c1.r1;
+ dst[ itr ].c1.r2 = src1[ itr ].c1.r2 + src2[ itr ].c1.r2;
+ dst[ itr ].c1.r3 = src1[ itr ].c1.r3 + src2[ itr ].c1.r3;
+
+ dst[ itr ].c2.r1 = src1[ itr ].c2.r1 + src2[ itr ].c2.r1;
+ dst[ itr ].c2.r2 = src1[ itr ].c2.r2 + src2[ itr ].c2.r2;
+ dst[ itr ].c2.r3 = src1[ itr ].c2.r3 + src2[ itr ].c2.r3;
+
+ dst[ itr ].c3.r1 = src1[ itr ].c3.r1 + src2[ itr ].c3.r1;
+ dst[ itr ].c3.r2 = src1[ itr ].c3.r2 + src2[ itr ].c3.r2;
+ dst[ itr ].c3.r3 = src1[ itr ].c3.r3 + src2[ itr ].c3.r3;
+ );
+}
+
+/*
+ * Plain-C 4x4 matrix addition: for each index itr, every element (columns
+ * c1..c4, rows r1..r4) of dst[itr] is set to the sum of the matching elements
+ * of src1[itr] and src2[itr].
+ * The iteration over itr, any use of count and the returned ne10_result_t all
+ * come from the NE10_X_OPERATION_FLOAT_C macro (macros.h, not visible here).
+ */
+ne10_result_t addmat_4x4f_c (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count)
+{
+ NE10_X_OPERATION_FLOAT_C
+ (
+ dst[ itr ].c1.r1 = src1[ itr ].c1.r1 + src2[ itr ].c1.r1;
+ dst[ itr ].c1.r2 = src1[ itr ].c1.r2 + src2[ itr ].c1.r2;
+ dst[ itr ].c1.r3 = src1[ itr ].c1.r3 + src2[ itr ].c1.r3;
+ dst[ itr ].c1.r4 = src1[ itr ].c1.r4 + src2[ itr ].c1.r4;
+
+ dst[ itr ].c2.r1 = src1[ itr ].c2.r1 + src2[ itr ].c2.r1;
+ dst[ itr ].c2.r2 = src1[ itr ].c2.r2 + src2[ itr ].c2.r2;
+ dst[ itr ].c2.r3 = src1[ itr ].c2.r3 + src2[ itr ].c2.r3;
+ dst[ itr ].c2.r4 = src1[ itr ].c2.r4 + src2[ itr ].c2.r4;
+
+ dst[ itr ].c3.r1 = src1[ itr ].c3.r1 + src2[ itr ].c3.r1;
+ dst[ itr ].c3.r2 = src1[ itr ].c3.r2 + src2[ itr ].c3.r2;
+ dst[ itr ].c3.r3 = src1[ itr ].c3.r3 + src2[ itr ].c3.r3;
+ dst[ itr ].c3.r4 = src1[ itr ].c3.r4 + src2[ itr ].c3.r4;
+
+ dst[ itr ].c4.r1 = src1[ itr ].c4.r1 + src2[ itr ].c4.r1;
+ dst[ itr ].c4.r2 = src1[ itr ].c4.r2 + src2[ itr ].c4.r2;
+ dst[ itr ].c4.r3 = src1[ itr ].c4.r3 + src2[ itr ].c4.r3;
+ dst[ itr ].c4.r4 = src1[ itr ].c4.r4 + src2[ itr ].c4.r4;
+ );
+}
--- /dev/null
+/*
+ * Copyright 2011-12 ARM Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "NE10_types.h"
+
+/*
+ * 2x2 matrix addition, delegated to the 2D vector adder: the matrix arrays
+ * are reinterpreted as arrays of 2-float vectors, two per matrix, so the
+ * element count handed on is count * 2.
+ */
+ne10_result_t addmat_2x2f_neon (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count)
+{
+ ne10_vec2f_t * out = (ne10_vec2f_t*) dst;
+ ne10_vec2f_t * lhs = (ne10_vec2f_t*) src1;
+ ne10_vec2f_t * rhs = (ne10_vec2f_t*) src2;
+
+ return add_vec2f_neon (out, lhs, rhs, count * 2);
+}
+
+/*
+ * 3x3 matrix addition, delegated to the 3D vector adder: the matrix arrays
+ * are reinterpreted as arrays of 3-float vectors, three per matrix, so the
+ * element count handed on is count * 3.
+ */
+ne10_result_t addmat_3x3f_neon (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count)
+{
+ ne10_vec3f_t * out = (ne10_vec3f_t*) dst;
+ ne10_vec3f_t * lhs = (ne10_vec3f_t*) src1;
+ ne10_vec3f_t * rhs = (ne10_vec3f_t*) src2;
+
+ return add_vec3f_neon (out, lhs, rhs, count * 3);
+}
+
+/*
+ * 4x4 matrix addition, delegated to the 4D vector adder: the matrix arrays
+ * are reinterpreted as arrays of 4-float vectors, four per matrix, so the
+ * element count handed on is count * 4.
+ */
+ne10_result_t addmat_4x4f_neon (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count)
+{
+ ne10_vec4f_t * out = (ne10_vec4f_t*) dst;
+ ne10_vec4f_t * lhs = (ne10_vec4f_t*) src1;
+ ne10_vec4f_t * rhs = (ne10_vec4f_t*) src2;
+
+ return add_vec4f_neon (out, lhs, rhs, count * 4);
+}
+
--- /dev/null
+@
+@ Copyright 2011-12 ARM Limited
+@
+@ Licensed under the Apache License, Version 2.0 (the "License");
+@ you may not use this file except in compliance with the License.
+@ You may obtain a copy of the License at
+@
+@ http://www.apache.org/licenses/LICENSE-2.0
+@
+@ Unless required by applicable law or agreed to in writing, software
+@ distributed under the License is distributed on an "AS IS" BASIS,
+@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ See the License for the specific language governing permissions and
+@ limitations under the License.
+@
+
+@
+@ NE10 Library : math/NE10_cross.asm.s
+@
--- /dev/null
+/*
+ * Copyright 2011-12 ARM Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NE10 Library : math/NE10_cross.c
+ */
+
+#include "NE10_types.h"
+#include "macros.h"
+
+#include <assert.h>
+
+/*
+ * Plain-C 3D cross product: for each index itr, dst[itr] = src1[itr] x src2[itr].
+ * Each dst component is written from src1/src2 components only, so the
+ * statements themselves do not read dst.
+ * NOTE(review): if dst aliases src1 or src2, later statements would read
+ * components already overwritten - confirm whether in-place use is intended.
+ * The iteration over itr, any use of count and the returned ne10_result_t all
+ * come from the NE10_X_OPERATION_FLOAT_C macro (macros.h, not visible here).
+ */
+ne10_result_t cross_vec3f_c (ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count)
+{
+ NE10_X_OPERATION_FLOAT_C
+ (
+ dst[ itr ].x = (src1[ itr ].y * src2[ itr ].z) - (src1[ itr ].z * src2[ itr ].y);
+ dst[ itr ].y = (src1[ itr ].z * src2[ itr ].x) - (src1[ itr ].x * src2[ itr ].z);
+ dst[ itr ].z = (src1[ itr ].x * src2[ itr ].y) - (src1[ itr ].y * src2[ itr ].x);
+ );
+}
--- /dev/null
+@
+@ Copyright 2011-12 ARM Limited
+@
+@ Licensed under the Apache License, Version 2.0 (the "License");
+@ you may not use this file except in compliance with the License.
+@ You may obtain a copy of the License at
+@
+@ http://www.apache.org/licenses/LICENSE-2.0
+@
+@ Unless required by applicable law or agreed to in writing, software
+@ distributed under the License is distributed on an "AS IS" BASIS,
+@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ See the License for the specific language governing permissions and
+@ limitations under the License.
+@
+
+@
+@ NE10 Library : math/NE10_cross.neon.s
+@
+
+ .text
+ .syntax unified
+
+.include "NE10header.s"
+
+
+
+
+ .align 4
+ .global cross_vec3f_neon
+ .thumb
+ .thumb_func
+cross_vec3f_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t cross_vec3f(arm_vec3f_t * dst,
+ @ arm_vec3f_t * src1,
+ @ arm_vec3f_t * src2,
+ @ unsigned int count)
+ @
+ @ Computes dst[i] = src1[i] x src2[i] (cross product) for count
+ @ 3-float vectors.
+ @
+ @ r0: *dst & current dst entry's address
+ @ r1: *src1 & current src1 entry's address
+ @ r2: *src2 & current src2 entry's address
+ @ r3: count on entry, then the number of 4-vector chunks (count / 4)
+ @ handled by the main loop
+ @
+ @ r4: count % 4, the residual vectors that are processed one at a
+ @ time at the beginning of the input arrays
+ @
+ @ Returns 0 in r0.
+ @
+ @ Every main-loop iteration loads its own operands before using them,
+ @ so nothing beyond the last element of src1/src2 is ever read.
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ push {r4}
+ and r4, r3, #3 @ r4 = count % 4; calculate the residual loop
+ lsr r3, r3, #2 @ r3 = count / 4; logical shift because count is unsigned
+
+ cbz r4, .L_check_mainloop_vec3
+
+.L_residualloop_vec3:
+ @ process the residual items one vector at a time
+ vld3.f32 {d0[0], d2[0], d4[0]}, [r1]! @ The values are loaded like so:
+ @ q0 = { V1.x, -, -, - };
+ @ q1 = { V1.y, -, -, - };
+ @ q2 = { V1.z, -, -, - };
+ vld3.f32 {d1[0], d3[0], d5[0]}, [r2]! @ The values are loaded like so:
+ @ q0 = { V1.x, -, V2.x, - };
+ @ q1 = { V1.y, -, V2.y, - };
+ @ q2 = { V1.z, -, V2.z, - };
+
+ subs r4, r4, #1 @ NEON instructions below leave the flags untouched
+
+ @ lane 0: d20 = x, d21 = y, d22 = z of V1 x V2
+ vmul.f32 d20, d2, d5 @ V1.y * V2.z
+ vmul.f32 d21, d4, d1 @ V1.z * V2.x
+ vmul.f32 d22, d0, d3 @ V1.x * V2.y
+
+ vmls.f32 d20, d3, d4 @ - V2.y * V1.z
+ vmls.f32 d21, d5, d0 @ - V2.z * V1.x
+ vmls.f32 d22, d1, d2 @ - V2.x * V1.y
+
+ vst3.32 {d20[0], d21[0], d22[0]}, [r0]!
+
+ bgt .L_residualloop_vec3
+
+.L_check_mainloop_vec3:
+ cbz r3, .L_return_vec3
+
+.L_mainloop_vec3:
+ @ load the current set of 4 vectors from each source, de-interleaved:
+ @ q0/q1/q2 = src1 x/y/z, q13/q14/q15 = src2 x/y/z
+ vld3.32 {d0, d2, d4}, [r1]!
+ vld3.32 {d1, d3, d5}, [r1]!
+ vld3.32 {d26, d28, d30}, [r2]!
+ vld3.32 {d27, d29, d31}, [r2]!
+
+ @ calculate the cross products for the current set
+ vmul.f32 q10, q1, q15 @ y1 * z2
+ vmul.f32 q11, q2, q13 @ z1 * x2
+ vmul.f32 q12, q0, q14 @ x1 * y2
+
+ vmls.f32 q10, q14, q2 @ - y2 * z1
+ vmls.f32 q11, q15, q0 @ - z2 * x1
+ vmls.f32 q12, q13, q1 @ - x2 * y1
+
+ @ store the result for the current set, re-interleaving x/y/z
+ vst3.32 {d20, d22, d24}, [r0]!
+ vst3.32 {d21, d23, d25}, [r0]!
+
+ subs r3, r3, #1
+ bgt .L_mainloop_vec3 @ loop while r3 > 0, i.e. another 4 vectors (12 floats) remain
+
+.L_return_vec3:
+ @ return
+ pop {r4}
+ mov r0, #0
+ bx lr
--- /dev/null
+@
+@ Copyright 2011-12 ARM Limited
+@
+@ Licensed under the Apache License, Version 2.0 (the "License");
+@ you may not use this file except in compliance with the License.
+@ You may obtain a copy of the License at
+@
+@ http://www.apache.org/licenses/LICENSE-2.0
+@
+@ Unless required by applicable law or agreed to in writing, software
+@ distributed under the License is distributed on an "AS IS" BASIS,
+@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ See the License for the specific language governing permissions and
+@ limitations under the License.
+@
+
+@
+@ NE10 Library : math/NE10_detmat.asm.s
+@
--- /dev/null
+/*
+ * Copyright 2011-12 ARM Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NE10 Library : math/NE10_detmat.c
+ */
+
+#include "NE10_types.h"
+#include "macros.h"
+#include "NE10_detmat.c.h"
+
+#include <assert.h>
+
+/*
+ * Plain-C 2x2 determinant: for each index itr, dst[itr] receives
+ * DET2x2(&src[itr]) (helper from NE10_detmat.c.h).
+ * The iteration over itr, any use of count and the returned ne10_result_t all
+ * come from the NE10_DETMAT_OPERATION_X_C macro (macros.h, not visible here).
+ */
+ne10_result_t detmat_2x2f_c (ne10_float32_t * dst, ne10_mat2x2f_t * src, ne10_uint32_t count)
+{
+ NE10_DETMAT_OPERATION_X_C
+ (
+ dst[ itr ] = DET2x2 (&src[ itr ]);
+ );
+}
+
+/*
+ * Plain-C 3x3 determinant: for each index itr, dst[itr] receives
+ * DET3x3(&src[itr]) (helper from NE10_detmat.c.h).
+ * The iteration over itr, any use of count and the returned ne10_result_t all
+ * come from the NE10_DETMAT_OPERATION_X_C macro (macros.h, not visible here).
+ */
+ne10_result_t detmat_3x3f_c (ne10_float32_t * dst, ne10_mat3x3f_t * src, ne10_uint32_t count)
+{
+ NE10_DETMAT_OPERATION_X_C
+ (
+ dst[ itr ] = DET3x3 (& (src[ itr ]));
+
+ );
+}
+
+/*
+ * Plain-C 4x4 determinant: for each index itr, dst[itr] receives
+ * DET4x4(&src[itr]) (helper from NE10_detmat.c.h).
+ * The iteration over itr, any use of count and the returned ne10_result_t all
+ * come from the NE10_DETMAT_OPERATION_X_C macro (macros.h, not visible here).
+ */
+ne10_result_t detmat_4x4f_c (ne10_float32_t * dst, ne10_mat4x4f_t * src, ne10_uint32_t count)
+{
+ NE10_DETMAT_OPERATION_X_C
+ (
+ dst[ itr ] = DET4x4 (&src[ itr ]);
+ );
+}
--- /dev/null
+/*
+ * Copyright 2011-12 ARM Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NE10 Library : math/NE10_detmat.c.h
+ */
+
+#ifndef __NE10_DETMAT_C_H__
+#define __NE10_DETMAT_C_H__
+
+#include "NE10_types.h"
+#include "macros.h"
+
+#include <assert.h>
+
+// Determinant of a single 2x2 matrix.
+//
+// Element naming (cNrM = column N, row M):
+// | c1r1 c2r1 |
+// | c1r2 c2r2 |
+//
+// Result: product of the main diagonal minus product of the other diagonal.
+static inline ne10_float32_t DET2x2( ne10_mat2x2f_t * mat )
+{
+ return (mat->c1.r1 * mat->c2.r2) - (mat->c2.r1 * mat->c1.r2);
+}
+
+// Determinant of a single 3x3 matrix, by cofactor expansion along the
+// first row.
+//
+// Element naming (cNrM = column N, row M):
+// | c1r1 c2r1 c3r1 |
+// | c1r2 c2r2 c3r2 |
+// | c1r3 c2r3 c3r3 |
+static inline ne10_float32_t DET3x3( ne10_mat3x3f_t * mat )
+{
+ // Each 2x2 block is rows 2-3 of the two columns left after dropping one.
+ ne10_mat2x2f_t without_c1 = { {mat->c2.r2, mat->c2.r3},
+ {mat->c3.r2, mat->c3.r3} };
+ ne10_mat2x2f_t without_c2 = { {mat->c1.r2, mat->c1.r3},
+ {mat->c3.r2, mat->c3.r3} };
+ ne10_mat2x2f_t without_c3 = { {mat->c1.r2, mat->c1.r3},
+ {mat->c2.r2, mat->c2.r3} };
+
+ // Alternating signs: + - +
+ return (mat->c1.r1 * DET2x2( &without_c1 ))
+ - (mat->c2.r1 * DET2x2( &without_c2 ))
+ + (mat->c3.r1 * DET2x2( &without_c3 ));
+}
+
+// Determinant of a single 4x4 matrix, by cofactor expansion along the
+// first row.
+//
+// Element naming (cNrM = column N, row M):
+// | c1r1 c2r1 c3r1 c4r1 |
+// | c1r2 c2r2 c3r2 c4r2 |
+// | c1r3 c2r3 c3r3 c4r3 |
+// | c1r4 c2r4 c3r4 c4r4 |
+static inline ne10_float32_t DET4x4( ne10_mat4x4f_t * mat )
+{
+ // Each 3x3 block is rows 2-4 of the three columns left after dropping one.
+ ne10_mat3x3f_t without_c1 = { {mat->c2.r2, mat->c2.r3, mat->c2.r4},
+ {mat->c3.r2, mat->c3.r3, mat->c3.r4},
+ {mat->c4.r2, mat->c4.r3, mat->c4.r4} };
+ ne10_mat3x3f_t without_c2 = { {mat->c1.r2, mat->c1.r3, mat->c1.r4},
+ {mat->c3.r2, mat->c3.r3, mat->c3.r4},
+ {mat->c4.r2, mat->c4.r3, mat->c4.r4} };
+ ne10_mat3x3f_t without_c3 = { {mat->c1.r2, mat->c1.r3, mat->c1.r4},
+ {mat->c2.r2, mat->c2.r3, mat->c2.r4},
+ {mat->c4.r2, mat->c4.r3, mat->c4.r4} };
+ ne10_mat3x3f_t without_c4 = { {mat->c1.r2, mat->c1.r3, mat->c1.r4},
+ {mat->c2.r2, mat->c2.r3, mat->c2.r4},
+ {mat->c3.r2, mat->c3.r3, mat->c3.r4} };
+
+ // Alternating signs: + - + -
+ return (mat->c1.r1 * DET3x3( &without_c1 ))
+ - (mat->c2.r1 * DET3x3( &without_c2 ))
+ + (mat->c3.r1 * DET3x3( &without_c3 ))
+ - (mat->c4.r1 * DET3x3( &without_c4 ));
+}
+
+
+
+
+#endif
--- /dev/null
+@
+@ Copyright 2011-12 ARM Limited
+@
+@ Licensed under the Apache License, Version 2.0 (the "License");
+@ you may not use this file except in compliance with the License.
+@ You may obtain a copy of the License at
+@
+@ http://www.apache.org/licenses/LICENSE-2.0
+@
+@ Unless required by applicable law or agreed to in writing, software
+@ distributed under the License is distributed on an "AS IS" BASIS,
+@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ See the License for the specific language governing permissions and
+@ limitations under the License.
+@
+
+@
+@ NE10 Library : math/NE10_detmat.neon.inc.s
+@
+
+
+
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ Get determinants of two 2x2 matrices in dRes
+ @
+ @ Computes dRes = dA*dD - dB*dC lane by lane. When the operands are
+ @ "d" registers each one holds two floats, so two determinants are
+ @ produced at once. dRes is overwritten before dB and dC are read,
+ @ so it should not be the same register as dB or dC.
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ .macro GET_DET_2x2MATS_ARGS dA, dB, dC, dD, dRes
+ vmul.f32 \dRes, \dA, \dD @ dRes = dA*dD
+ vmls.f32 \dRes, \dB, \dC @ dRes = dA*dD - dB*dC
+ .endm
+
+
+
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ Get negated determinants of two 2x2 matrices in dRes
+ @
+ @ Reuses GET_DET_2x2MATS_ARGS with the operand pairs swapped, which
+ @ yields dRes = dC*dB - dD*dA = -(dA*dD - dB*dC).
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ .macro GET_NEG_DET_2x2MATS_ARGS dA, dB, dC, dD, dRes
+ GET_DET_2x2MATS_ARGS \dC, \dD, \dA, \dB, \dRes
+ .endm
+
+
+
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ A macro used inside detmat_3x3f_neon() to load 3x3 matrices.
+ @ Two 3x3 matrices are loaded from the source address
+ @ into registers dst00-11. The corresponding qr00-qr05
+ @ registers are then rearranged so the order of the data fits the
+ @ code written in other macros below.
+ @
+ @ The address register is post-incremented by every load, so on
+ @ exit it has advanced by 18 floats (two 3x3 matrices).
+ @ NOTE(review): qr00-qr05 are presumably the "q" registers that
+ @ overlay the dst00-dst11 "d" registers - confirm at the call site.
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ .macro LOAD_3x3MATS_ARGS dst00, dst01, dst02, dst03, dst04, dst05, dst06, dst07, dst08, dst09, dst10, dst11, qr00, qr01, qr02, qr03, qr04, qr05, addr
+
+ @ first matrix: 6 floats de-interleaved by 3, then the remaining 3 floats into lane 0
+ vld3.32 { \dst00, \dst02, \dst04 }, [\addr]!
+ vld3.32 { \dst01[0], \dst03[0], \dst05[0] }, [\addr]!
+ @ second matrix, loaded the same way
+ vld3.32 { \dst06, \dst08, \dst10 }, [\addr]!
+ vld3.32 { \dst07[0], \dst09[0], \dst11[0] }, [\addr]!
+
+ @ transpose 32-bit elements between the register pairs of the two matrices
+ vtrn.32 \qr00, \qr03
+ vtrn.32 \qr01, \qr04
+ vtrn.32 \qr02, \qr05
+ .endm
+
+
+
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ This macro calculates the determinant of two 3x3 matrices
+ @ loaded using the above LOAD_3x3MATS_ARGS macro.
+ @ The result is stored in the \res register.
+ @ Registers \tmp2 and \tmp3 are used as scratch registers and will
+ @ not be restored in this macro - the caller needs to restore them
+ @ if needed. Each of the aa-ii parameters can be a "d" register
+ @ containing two floating-point values which correspond to the
+ @ following reference matrix:
+ @
+ @ |aa dd gg|
+ @ M = |bb ee hh|
+ @ |cc ff ii|
+ @
+ @ \res, \tmp2 and \tmp3 are written before all inputs have been
+ @ read, so they should not be the same registers as any of aa-ii.
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ .macro GET_DETERMINANT_of_3x3MATS_ARGS aa, bb, cc, dd, ee, ff, gg, hh, ii, res, tmp2, tmp3
+ @ det = a*(ei-fh) - d*(bi-ch) + g*(bf-ec)
+
+ vmul.f32 \res, \ee, \ii @ t1 = ei
+ vmul.f32 \tmp2, \bb, \ii @ t2 = bi
+ vmul.f32 \tmp3, \bb, \ff @ t3 = bf
+
+ vmls.f32 \res, \ff, \hh @ t1 = ei-fh
+ vmls.f32 \tmp2, \cc, \hh @ t2 = bi-ch
+ vmls.f32 \tmp3, \ee, \cc @ t3 = bf-ec
+
+ vmul.f32 \res, \aa, \res @ t1 = a*(ei-fh)
+ vmls.f32 \res, \dd, \tmp2 @ t1 = a*(ei-fh) - d*(bi-ch)
+ vmla.f32 \res, \gg, \tmp3 @ t1 = a*(ei-fh) - d*(bi-ch) + g*(bf-ec) = det(M1), det(M2)
+ .endm
+
+
+
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ This macro calculates the negated determinant of two 3x3 matrices
+ @ The result is stored in \res
+ @
+ @ It forwards to GET_DETERMINANT_of_3x3MATS_ARGS with the first two
+ @ columns (aa,bb,cc) and (dd,ee,ff) exchanged; swapping two columns
+ @ of a matrix flips the sign of its determinant.
+ @ \tmp2 and \tmp3 are scratch registers, as in that macro.
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ .macro GET_NEG_DET_3x3MATS_ARGS aa, bb, cc, dd, ee, ff, gg, hh, ii, res, tmp2, tmp3
+ @ det = - a*(ei-fh) + d*(bi-ch) - g*(bf-ec)
+ GET_DETERMINANT_of_3x3MATS_ARGS \dd, \ee, \ff, \aa, \bb, \cc, \gg, \hh, \ii, \res, \tmp2, \tmp3 @ Using the column exchange property
+ .endm
+
+
+
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ A macro used inside detmat_4x4f_neon() to load 4x4 matrices.
+ @ Two 4x4 matrices are loaded from the source address register \addr
+ @ into registers dst00-15. The corresponding qr00-qr07
+ @ registers are then rearranged so the order of the data fits the
+ @ code written in other macros below.
+ @
+ @ The address register is post-incremented by every load, so on
+ @ exit it has advanced by 32 floats (two 4x4 matrices).
+ @ NOTE(review): qr00-qr07 are presumably the "q" registers that
+ @ overlay the dst00-dst15 "d" registers - confirm at the call site.
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ .macro LOAD_4x4MATS_ARGS dst00, dst01, dst02, dst03, dst04, dst05, dst06, dst07, dst08, dst09, dst10, dst11, dst12, dst13, dst14, dst15, qr00, qr01, qr02, qr03, qr04, qr05, qr06, qr07, addr
+
+ @ first matrix: two loads of 8 floats each, de-interleaved by 4
+ vld4.32 { \dst00, \dst02, \dst04, \dst06 }, [\addr]!
+ vld4.32 { \dst01, \dst03, \dst05, \dst07 }, [\addr]!
+ @ second matrix, loaded the same way
+ vld4.32 { \dst08, \dst10, \dst12, \dst14 }, [\addr]!
+ vld4.32 { \dst09, \dst11, \dst13, \dst15 }, [\addr]!
+
+ @ transpose 32-bit elements between the register pairs of the two matrices
+ vtrn.32 \qr00, \qr04
+ vtrn.32 \qr01, \qr05
+ vtrn.32 \qr02, \qr06
+ vtrn.32 \qr03, \qr07
+ .endm
+
+
+
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ This macro calculates the determinant of 4x4 matrices
+ @ loaded using the above LOAD_4x4MATS_ARGS macro.
+ @ The result is stored in the \res register.
+ @ Registers \tmp2 to \tmp6 are used as scratch registers and will
+ @ not be restored in this macro - the caller needs to restore them
+ @ if needed. Each of the aa-pp parameters can be a "d" register
+ @ containing two floating-point values which correspond to the
+ @ following reference matrix:
+ @
+ @ |aa ee ii mm|
+ @ M = |bb ff jj nn|
+ @ |cc gg kk oo|
+ @ |dd hh ll pp|
+ @
+ @ The determinant is expanded along the first row of M:
+ @ det(M) = aa*det(SubM11) - ee*det(SubM12) + ii*det(SubM13) - mm*det(SubM14)
+ @ where SubM1k is M with its first row and k-th column removed.
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ .macro GET_DETERMINANT_of_4x4MATS_ARGS aa, bb, cc, dd, ee, ff, gg, hh, ii, jj, kk, ll, mm, nn, oo, pp, res, tmp2, tmp3, tmp4, tmp5, tmp6
+
+ @ \tmp5 and \tmp6 are the scratch registers of every 3x3 determinant below
+
+ @ res = det(SubM11)
+ GET_DETERMINANT_of_3x3MATS_ARGS \ff, \gg, \hh, \jj, \kk, \ll, \nn, \oo, \pp, \res, \tmp5, \tmp6
+
+ @ tmp2 = det(SubM12)
+ GET_DETERMINANT_of_3x3MATS_ARGS \bb, \cc, \dd, \jj, \kk, \ll, \nn, \oo, \pp, \tmp2, \tmp5, \tmp6
+
+ @ tmp3 = det(SubM13)
+ GET_DETERMINANT_of_3x3MATS_ARGS \bb, \cc, \dd, \ff, \gg, \hh, \nn, \oo, \pp, \tmp3, \tmp5, \tmp6
+
+ @ tmp4 = det(SubM14)
+ GET_DETERMINANT_of_3x3MATS_ARGS \bb, \cc, \dd, \ff, \gg, \hh, \jj, \kk, \ll, \tmp4, \tmp5, \tmp6
+
+
+ vmul.f32 \res, \aa, \res @ res = aa*det(SubM11)
+ vmls.f32 \res, \ee, \tmp2 @ res -= ee*det(SubM12)
+ vmla.f32 \res, \ii, \tmp3 @ res += ii*det(SubM13)
+ vmls.f32 \res, \mm, \tmp4 @ res -= mm*det(SubM14)
+ .endm
+
+
+
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ A macro used inside detmat_4x4f_neon() to load a single 4x4 matrix
+ @ (16 floats) from the memory location pointed to by the \addr
+ @ register; \addr is advanced past the matrix.
+ @ The loaded matrix is stored in registers dst00-07 and
+ @ finally rearranged using the corresponding registers qr00-qr03.
+ @ qtmp1-qtmp4 are scratch registers which are not restored in this
+ @ macro. The caller must restore them if needed.
+ @ NOTE: Throughout Ne10, matrices are loaded and stored in
+ @ column major format.
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ .macro LOAD_SINGLE_4x4MAT_ARGS dst00, dst01, dst02, dst03, dst04, dst05, dst06, dst07, qr00, qr01, qr02, qr03, qtmp1, qtmp2, qtmp3, qtmp4, addr
+
+ @ each vld4 reads 8 floats and de-interleaves them with a stride of 4
+ vld4.32 { \dst00, \dst02, \dst04, \dst06 }, [\addr]!
+ vld4.32 { \dst01, \dst03, \dst05, \dst07 }, [\addr]!
+
+ @ vtrn.32 exchanges the odd lanes of qrNN with the even lanes of qtmpN,
+ @ so both registers of every pair are modified
+ vtrn.32 \qr00, \qtmp1
+ vtrn.32 \qr01, \qtmp2
+ vtrn.32 \qr02, \qtmp3
+ vtrn.32 \qr03, \qtmp4
+ .endm
--- /dev/null
+@
+@ Copyright 2011-12 ARM Limited
+@
+@ Licensed under the Apache License, Version 2.0 (the "License");
+@ you may not use this file except in compliance with the License.
+@ You may obtain a copy of the License at
+@
+@ http://www.apache.org/licenses/LICENSE-2.0
+@
+@ Unless required by applicable law or agreed to in writing, software
+@ distributed under the License is distributed on an "AS IS" BASIS,
+@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ See the License for the specific language governing permissions and
+@ limitations under the License.
+@
+
+@
+@ NE10 Library : math/NE10_detmat.neon.s
+@
+
+
+
+
+ .text
+ .syntax unified
+
+.include "NE10header.s"
+.include "NE10_detmat.neon.inc.s"
+
+
+
+ .align 4
+ .global detmat_2x2f_neon
+ .thumb
+ .thumb_func
+
+detmat_2x2f_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t detmat_2x2f(arm_float_t * dst,
+ @ arm_mat2x2f_t * src,
+ @ unsigned int count)
+ @
+ @ Writes the determinant of each of the count 2x2 matrices in src[]
+ @ to dst[] and returns 0.
+ @
+ @ r0: *dst & current dst entry's address
+ @ r1: *src & current src entry's address
+ @ r2: int count & the number of items in the input array that can be
+ @ processed in chunks of 4 matrices
+ @
+ @ r3: the number of items that are left to be processed at the end
+ @ of the input array
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ and r3, r2, #3 @ r3 = count % 4;
+ sub r2, r2, r3 @ r2 = count - r3; the number of matrices handled by the main loop (a multiple of 4)
+
+ cbz r2, .L_check_mat2x2
+
+ @ We load four 2x2 matrices each time, calculate their
+ @ determinants, store the results in the destination
+ @ memory address, and move onto the next four.
+
+ @ load the 1st set of values
+ @ vld4 de-interleaves with a stride of 4: after both loads q0, q1, q2 and
+ @ q3 hold the 1st, 2nd, 3rd and 4th float of each of the four matrices
+ vld4.32 {d0, d2, d4, d6}, [r1]!
+ vld4.32 {d1, d3, d5, d7}, [r1]!
+ subs r2, r2, #4
+
+ @ calculate values for current set
+ vmul.f32 q15, q0, q3 @ q15 = m[0]*m[3]
+ vmls.f32 q15, q1, q2 @ q15 = m[0]*m[3] - m[1]*m[2]
+
+ ble .L_mainloopend_mat2x2 @ tests the flags of the subs above
+
+.L_mainloop_mat2x2:
+ @ store the result for current set
+ vst1.32 {q15}, [r0]!
+
+ @ load the next set of values
+ vld4.32 {d0, d2, d4, d6}, [r1]!
+ vld4.32 {d1, d3, d5, d7}, [r1]!
+ subs r2, r2, #4
+
+ @ calculate values for next set
+ vmul.f32 q15, q0, q3
+ vmls.f32 q15, q1, q2
+
+ bgt .L_mainloop_mat2x2 @ loop if r2 > 0, if we have at least another 4 matrices (16 floats) to process
+
+.L_mainloopend_mat2x2:
+ @ the last iteration for this call
+ @ store the result for the last set
+ vst1.32 {q15}, [r0]!
+
+.L_check_mat2x2:
+ @ check if anything left to process at the end of the input array
+ cmp r3, #0
+ ble .L_return_mat2x2
+
+.L_secondloop_mat2x2:
+ @ process the last few items left in the input array
+ vld1.32 {d0, d1}, [r1]! @ Load matrix [A]: d0 = { m[0], m[1] }, d1 = { m[2], m[3] }
+
+ subs r3, r3, #1
+
+ @ calculate det([A]) = |A|
+ vrev64.32 d1, d1 @ d1 = { m[3], m[2] }
+ vmul.f32 d2, d0, d1 @ d2 = { m[0]*m[3], m[1]*m[2] }
+ vrev64.32 d2, d2 @ d2 = { m[1]*m[2], m[0]*m[3] }
+ vmls.f32 d2, d0, d1 @ At this point d2 = { -|A|, |A| }
+
+ @ store the result which is in d2[1]
+ vst1.32 {d2[1]}, [r0]!
+
+ bgt .L_secondloop_mat2x2 @ tests the flags of the subs above
+
+.L_return_mat2x2:
+ @ return
+ mov r0, #0
+ bx lr
+
+
+
+
+ .align 4
+ .global detmat_3x3f_neon
+ .thumb
+ .thumb_func
+detmat_3x3f_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t detmat_3x3f(arm_float_t * dst,
+ @ arm_mat3x3f_t * src,
+ @ unsigned int count)
+ @
+ @ Writes the determinant of each of the count 3x3 matrices in src[]
+ @ to dst[] and returns 0.
+ @
+ @ r0: *dst & current dst entry's address
+ @ r1: *src & current src entry's address
+ @ r2: int count & the number of items in the input array that are
+ @ handled by the main loop (count rounded down to a multiple
+ @ of 4, consumed two matrices per iteration)
+ @
+ @ r3: the number of items that are left to be processed at the end
+ @ of the input array
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ and r3, r2, #3 @ r3 = count % 4;
+ sub r2, r2, r3 @ r2 = count - r3; the number of matrices handled by the main loop
+
+ cmp r2, #0
+ beq .L_check_mat3x3
+
+ @ We load two 3x3 matrices each time, calculate their
+ @ determinants, store the results in the destination
+ @ memory address, and move onto the next two.
+
+ @ load the 1st set of values
+ @ NOTE(review): LOAD_3x3MATS_ARGS is defined outside this file section
+ @ (presumably in the included NE10_detmat.neon.inc.s); the operand order
+ @ passed to the determinant macro below depends on its register layout.
+ LOAD_3x3MATS_ARGS d0, d1, d2, d3, d4, d5, d16, d17, d18, d19, d20, d21, q0, q1, q2, q8, q9, q10, r1
+ subs r2, r2, #2
+
+ @ calculate values for the current set
+ GET_DETERMINANT_of_3x3MATS_ARGS d0, d2, d4, d16, d18, d20, d1, d3, d5, d22, d24, d26
+
+ ble .L_mainloopend_mat3x3 @ tests the flags of the subs above
+
+.L_mainloop_mat3x3:
+ @ store the result for the current set
+ vst1.32 {d22}, [r0]!
+
+ @ load the next set of values
+ LOAD_3x3MATS_ARGS d0, d1, d2, d3, d4, d5, d16, d17, d18, d19, d20, d21, q0, q1, q2, q8, q9, q10, r1
+ subs r2, r2, #2
+
+ @ calculate values for the next set
+ GET_DETERMINANT_of_3x3MATS_ARGS d0, d2, d4, d16, d18, d20, d1, d3, d5, d22, d24, d26
+
+ bgt .L_mainloop_mat3x3 @ loop if r2 > 0, if we have at least another 2 matrices to process
+
+.L_mainloopend_mat3x3:
+ @ the last iteration for this call
+ @ store the result for the last set
+ vst1.32 {d22}, [r0]!
+
+.L_check_mat3x3:
+ @ check if anything left to process at the end of the input array
+ cmp r3, #0
+ ble .L_return_mat3x3
+
+.L_secondloop_mat3x3:
+ @ process the last few items left in the input array
+
+ @ load one matrix, three consecutive floats per vld3, into lane 0 only:
+ @ floats 0-2 -> d0, d2, d4; floats 3-5 -> d1, d3, d5; floats 6-8 -> d16, d18, d20
+ vld3.32 { d0[0], d2[0], d4[0]}, [r1]!
+ vld3.32 { d1[0], d3[0], d5[0]}, [r1]!
+ vld3.32 {d16[0], d18[0], d20[0]}, [r1]!
+
+ subs r3, r3, #1
+
+ @ calculate the determinant; only lane 0 of d22 is meaningful here
+ GET_DETERMINANT_of_3x3MATS_ARGS d0, d2, d4, d1, d3, d5, d16, d18, d20, d22, d24, d26
+
+ @ store the result for this matrix
+ vst1.32 {d22[0]}, [r0]!
+
+ bgt .L_secondloop_mat3x3 @ tests the flags of the subs above
+
+.L_return_mat3x3:
+ @ return
+
+ mov r0, #0
+ bx lr
+
+
+
+
+ .align 4
+ .global detmat_4x4f_neon
+ .thumb
+ .thumb_func
+detmat_4x4f_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t detmat_4x4f(arm_float_t * dst,
+ @ arm_mat4x4f_t * src1,
+ @ unsigned int count)
+ @
+ @ Writes the determinant of each of the count 4x4 matrices in src1[]
+ @ to dst[] and returns 0.
+ @
+ @ r0: *dst & current dst entry's address
+ @ r1: *src1 & current src1 entry's address
+ @ r2: int count & the number of items in the input array that are
+ @ handled by the main loop (count rounded down to a multiple
+ @ of 4, consumed two matrices per iteration)
+ @
+ @ r3: the number of items that are left to be processed at the end
+ @ of the input array
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ and r3, r2, #3 @ r3 = count % 4;
+ sub r2, r2, r3 @ r2 = count - r3; the number of matrices handled by the main loop
+
+ cmp r2, #0
+ beq .L_check_mat4x4
+
+
+ @ We load two 4x4 matrices each time, calculate their
+ @ determinants, store the results in the destination
+ @ memory address, and move onto the next two.
+
+ @ load the 1st set of values
+ LOAD_4x4MATS_ARGS d0, d1, d2, d3, d4, d5, d6, d7, d16, d17, d18, d19, d20, d21, d22, d23, q0, q1, q2, q3, q8, q9, q10, q11, r1
+ subs r2, r2, #2
+
+ @ calculate values for the current set
+ @ (result in d24; d26, d28, d30, d25 and d27 are scratch)
+ GET_DETERMINANT_of_4x4MATS_ARGS d0, d2, d4, d6, d16, d18, d20, d22, d1, d3, d5, d7, d17, d19, d21, d23, d24, d26, d28, d30, d25, d27
+
+ ble .L_mainloopend_mat4x4 @ tests the flags of the subs above
+
+.L_mainloop_mat4x4:
+ @ store the result for the current set
+ vst1.32 {d24}, [r0]!
+
+ @ load the next set of values
+ LOAD_4x4MATS_ARGS d0, d1, d2, d3, d4, d5, d6, d7, d16, d17, d18, d19, d20, d21, d22, d23, q0, q1, q2, q3, q8, q9, q10, q11, r1
+ subs r2, r2, #2
+
+ @ calculate values for the next set
+ GET_DETERMINANT_of_4x4MATS_ARGS d0, d2, d4, d6, d16, d18, d20, d22, d1, d3, d5, d7, d17, d19, d21, d23, d24, d26, d28, d30, d25, d27
+
+ bgt .L_mainloop_mat4x4 @ loop if r2 > 0, if we have at least another 2 matrices (32 floats) to process
+
+.L_mainloopend_mat4x4:
+ @ the last iteration for this call
+ @ store the result for the last set
+ vst1.32 {d24}, [r0]!
+
+.L_check_mat4x4:
+ @ check if anything left to process at the end of the input array
+ cmp r3, #0
+ ble .L_return_mat4x4
+
+.L_secondloop_mat4x4:
+ @ process the last few items left in the input array
+ @ load one matrix, four consecutive floats per vld4, into lane 0 only;
+ @ unlike the main loop no vtrn is applied, which is why the operand
+ @ order of the determinant macro below differs from the main loop
+ vld4.32 { d0[0], d2[0], d4[0], d6[0]}, [r1]!
+ vld4.32 { d1[0], d3[0], d5[0], d7[0]}, [r1]!
+ vld4.32 { d16[0], d18[0], d20[0], d22[0]}, [r1]!
+ vld4.32 { d17[0], d19[0], d21[0], d23[0]}, [r1]!
+
+
+
+ subs r3, r3, #1
+
+ @ calculate values; only lane 0 of d24 is meaningful here
+ GET_DETERMINANT_of_4x4MATS_ARGS d0, d2, d4, d6, d1, d3, d5, d7, d16, d18, d20, d22, d17, d19, d21, d23, d24, d26, d28, d30, d25, d27
+
+ @ store the results
+ vst1.32 {d24[0]}, [r0]!
+
+ bgt .L_secondloop_mat4x4 @ tests the flags of the subs above
+
+.L_return_mat4x4:
+ @ return
+ mov r0, #0
+ bx lr
--- /dev/null
+@
+@ Copyright 2011-12 ARM Limited
+@
+@ Licensed under the Apache License, Version 2.0 (the "License");
+@ you may not use this file except in compliance with the License.
+@ You may obtain a copy of the License at
+@
+@ http://www.apache.org/licenses/LICENSE-2.0
+@
+@ Unless required by applicable law or agreed to in writing, software
+@ distributed under the License is distributed on an "AS IS" BASIS,
+@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ See the License for the specific language governing permissions and
+@ limitations under the License.
+@
+
+@
+@ NE10 Library : math/NE10_div.asm.s
+@
+
+ .text
+ .syntax unified
+
+.include "NE10header.s"
+
+ .balign 4
+ .global div_float_asm
+ .thumb
+ .thumb_func
+
+div_float_asm:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t div_float(arm_float_t * dst,
+ @ arm_float_t * src1, const arm_float_t * src2,
+ @ unsigned int count)
+ @
+ @ Scalar VFP version: dst[i] = src1[i] / src2[i] for count floats.
+ @
+ @ r0: *dst, advanced by one float per iteration
+ @ r1: *src1, advanced by one float per iteration
+ @ r2: *src2, advanced by one float per iteration
+ @ r3: int count, used as the down-counting loop counter
+ @
+ @ s1, s2 and s10 are used as working registers.
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ cbz r3, .LoopEndFloat @ nothing to do when count == 0
+
+.LoopBeginFloat:
+ vldmia r1!, {s1} @ s1 = *src1, then step src1 to the next float
+ vldmia r2!, {s2} @ s2 = *src2, then step src2 to the next float
+ subs r3, r3, #1 @ one item fewer to go; sets the flags for the bne below
+ vdiv.f32 s10, s1, s2 @ s10 = src1[i] / src2[i] (leaves the APSR flags alone)
+ vstmia r0!, {s10} @ *dst = s10, then step dst to the next float
+ bne .LoopBeginFloat @ repeat until the counter reaches zero
+
+.LoopEndFloat:
+ mov r0, NE10_OK @ Return NE10_OK
+ bx lr
--- /dev/null
+/*
+ * Copyright 2011-12 ARM Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NE10 Library : math/NE10_div.c
+ */
+
+#include "NE10_types.h"
+#include "macros.h"
+
+#include <assert.h>
+
+/*
+ * Element-wise division of two float arrays: dst[itr] = src1[itr] / src2[itr].
+ * The loop over itr and the return statement are supplied by the
+ * NE10_X_OPERATION_FLOAT_C macro (macros.h, not visible here) - presumably it
+ * runs itr over [0, count) and returns the status code; confirm there.
+ * This function itself does not test for a zero divisor.
+ */
+ne10_result_t div_float_c (ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count)
+{
+ NE10_X_OPERATION_FLOAT_C
+ (
+ dst[ itr ] = src1[ itr ] / src2[ itr ];
+ );
+}
+
+/*
+ * Component-wise division of two arrays of 2D vectors:
+ * dst[itr].{x,y} = src1[itr].{x,y} / src2[itr].{x,y}.
+ * The loop over itr and the return statement are supplied by the
+ * NE10_X_OPERATION_FLOAT_C macro (macros.h, not visible here) - presumably it
+ * runs itr over [0, count) and returns the status code; confirm there.
+ * This function itself does not test for zero divisors.
+ */
+ne10_result_t vdiv_vec2f_c (ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count)
+{
+ NE10_X_OPERATION_FLOAT_C
+ (
+ dst[ itr ].x = src1[ itr ].x / src2[ itr ].x;
+ dst[ itr ].y = src1[ itr ].y / src2[ itr ].y;
+ );
+}
+
+/*
+ * Component-wise division of two arrays of 3D vectors:
+ * dst[itr].{x,y,z} = src1[itr].{x,y,z} / src2[itr].{x,y,z}.
+ * The loop over itr and the return statement are supplied by the
+ * NE10_X_OPERATION_FLOAT_C macro (macros.h, not visible here) - presumably it
+ * runs itr over [0, count) and returns the status code; confirm there.
+ * This function itself does not test for zero divisors.
+ */
+ne10_result_t vdiv_vec3f_c (ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count)
+{
+ NE10_X_OPERATION_FLOAT_C
+ (
+ dst[ itr ].x = src1[ itr ].x / src2[ itr ].x;
+ dst[ itr ].y = src1[ itr ].y / src2[ itr ].y;
+ dst[ itr ].z = src1[ itr ].z / src2[ itr ].z;
+ );
+}
+
+/*
+ * Component-wise division of two arrays of 4D vectors:
+ * dst[itr].{x,y,z,w} = src1[itr].{x,y,z,w} / src2[itr].{x,y,z,w}.
+ * The loop over itr and the return statement are supplied by the
+ * NE10_X_OPERATION_FLOAT_C macro (macros.h, not visible here) - presumably it
+ * runs itr over [0, count) and returns the status code; confirm there.
+ * This function itself does not test for zero divisors.
+ */
+ne10_result_t vdiv_vec4f_c (ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count)
+{
+ NE10_X_OPERATION_FLOAT_C
+ (
+ dst[ itr ].x = src1[ itr ].x / src2[ itr ].x;
+ dst[ itr ].y = src1[ itr ].y / src2[ itr ].y;
+ dst[ itr ].z = src1[ itr ].z / src2[ itr ].z;
+ dst[ itr ].w = src1[ itr ].w / src2[ itr ].w;
+ );
+}
--- /dev/null
+@
+@ Copyright 2011-12 ARM Limited
+@
+@ Licensed under the Apache License, Version 2.0 (the "License");
+@ you may not use this file except in compliance with the License.
+@ You may obtain a copy of the License at
+@
+@ http://www.apache.org/licenses/LICENSE-2.0
+@
+@ Unless required by applicable law or agreed to in writing, software
+@ distributed under the License is distributed on an "AS IS" BASIS,
+@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ See the License for the specific language governing permissions and
+@ limitations under the License.
+@
+
+@
+@ NE10 Library : math/NE10_div.neon.s
+@
+
+ .text
+ .syntax unified
+
+.include "NE10header.s"
+
+
+
+
+ .align 4
+ .global div_float_neon
+ .thumb
+ .thumb_func
+
+div_float_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t div_float(arm_float_t * dst,
+ @ arm_float_t * src1,
+ @ arm_float_t * src2,
+ @ unsigned int count)
+ @
+ @ Computes dst[i] = src1[i] / src2[i] for count floats and returns 0.
+ @ The quotient is formed as src1 * (1 / src2), where the reciprocal is
+ @ a VRECPE estimate refined by one VRECPS Newton-Raphson step, so the
+ @ results are approximations and not the output of a divide instruction.
+ @
+ @ r0: *dst & current dst entry's address
+ @ r1: *src1 & current src1 entry's address
+ @ r2: *src2 & current src2 entry's address
+ @ r3: int count & the number of chunks of 4 floats that are processed
+ @ by the main loop
+ @
+ @ r4: the number of items that are residual that will be processed at the begin of
+ @ the input array
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ push {r4}
+ and r4, r3, #3 @ r4 = count % 4; calculate the residual loop
+ asr r3, r3, #2 @ r3 = count >> 2; calculate the main loop
+
+ cbz r4, .L_check_mainloop_float
+
+.L_residualloop_float:
+ @ process the residual items in the input array
+ vld1.f32 d0[0], [r1]! @ Fill in d0[0]
+ vld1.f32 d1[0], [r2]! @ Fill in d1[0]
+
+
+ subs r4, r4, #1
+
+ @ values d0 = d0 / d1
+ vrecpe.f32 d3, d1 @ d3 = estimate of 1/d1
+ vrecps.f32 d1, d3, d1 @ d1 = 2 - d3*d1 (Newton-Raphson correction factor)
+ vmul.f32 d3, d1, d3 @ d3 = refined 1/d1
+ vmul.f32 d0, d0, d3
+
+ vst1.32 {d0[0]}, [r0]!
+
+ bgt .L_residualloop_float @ tests the flags of the subs above
+
+.L_check_mainloop_float:
+ cbz r3, .L_return_float
+
+.L_mainloop_float:
+ @ load the current set of values
+ @ The loads are done at the top of the loop, not as a preload of the
+ @ next set after the store, so that the last iteration does not read
+ @ 16 bytes beyond the end of src1[] and src2[].
+ vld1.32 {q0}, [r1]!
+ vld1.32 {q1}, [r2]!
+
+ @ calculate values for the current set
+ @ q3 = q0 / q1
+ vrecpe.f32 q3, q1
+ vrecps.f32 q1, q3, q1
+ vmul.f32 q3, q1, q3
+ vmul.f32 q3, q0, q3
+
+ @ store the result for the current set
+ vst1.32 {d6,d7}, [r0]!
+ subs r3, r3, #1
+
+ bgt .L_mainloop_float @ loop if r3 > 0, if we have at least another 4 floats
+
+.L_return_float:
+ @ return
+ pop {r4}
+ mov r0, #0
+ bx lr
+
+
+
+
+ .align 4
+ .global vdiv_vec2f_neon
+ .thumb
+ .thumb_func
+
+vdiv_vec2f_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t vdiv_vec2f(arm_vec2f_t * dst,
+ @ arm_vec2f_t * src1,
+ @ arm_vec2f_t * src2,
+ @ unsigned int count)
+ @
+ @ Component-wise division dst[i] = src1[i] / src2[i] for count 2D
+ @ vectors; returns 0. Each quotient is formed as src1 * (1 / src2),
+ @ where the reciprocal is a VRECPE estimate refined by one VRECPS
+ @ Newton-Raphson step, so the results are approximations.
+ @
+ @ r0: *dst & current dst entry's address
+ @ r1: *src1 & current src1 entry's address
+ @ r2: *src2 & current src2 entry's address
+ @ r3: int count & the number of chunks of 4 vectors that are processed
+ @ by the main loop
+ @
+ @ r4: the number of items that are residual that will be processed at the begin of
+ @ the input array
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ push {r4}
+ and r4, r3, #3 @ r4 = count % 4; calculate the residual loop
+ asr r3, r3, #2 @ r3 = count >> 2; calculate the main loop
+
+ cbz r4, .L_check_mainloop_vec2
+
+.L_residualloop_vec2:
+ @ process the residual items in the input array
+ vld1.f32 d0, [r1]! @ d0 = { V1.x, V1.y }
+ vld1.f32 d1, [r2]! @ d1 = { V2.x, V2.y }
+
+ subs r4, r4, #1
+
+ @ calculate values
+ @ d0 = d0 / d1
+ vrecpe.f32 d4, d1 @ d4 = estimate of 1/d1
+ vrecps.f32 d1, d4, d1 @ d1 = 2 - d4*d1 (Newton-Raphson correction factor)
+ vmul.f32 d4, d1, d4 @ d4 = refined 1/d1
+ vmul.f32 d0, d0, d4
+
+ vst1.32 {d0}, [r0]!
+
+ bgt .L_residualloop_vec2 @ tests the flags of the subs above
+
+.L_check_mainloop_vec2:
+ cbz r3, .L_return_vec2
+
+.L_mainloop_vec2:
+ @ load the current set of values (x components in q0/q2, y in q1/q3)
+ @ The loads are done at the top of the loop, not as a preload of the
+ @ next set after the store, so that the last iteration does not read
+ @ 32 bytes beyond the end of src1[] and src2[].
+ vld2.32 {q0-q1}, [r1]!
+ vld2.32 {q2-q3}, [r2]!
+
+ @ calculate values for current set
+ @ q8 = q0 / q2
+ vrecpe.f32 q8, q2
+ vrecps.f32 q2, q8, q2
+ vmul.f32 q8, q2, q8
+ vmul.f32 q8, q0, q8
+
+ @ q9 = q1 / q3
+ vrecpe.f32 q9, q3
+ vrecps.f32 q3, q9, q3
+ vmul.f32 q9, q3, q9
+ vmul.f32 q9, q1, q9
+
+ @ store the result for current set
+ vst2.32 {d16,d17,d18,d19}, [r0]!
+ subs r3, r3, #1
+
+ bgt .L_mainloop_vec2 @ loop if r3 > 0, if we have at least another 4 vectors (8 floats) to process
+
+.L_return_vec2:
+ @ return
+ pop {r4}
+ mov r0, #0
+ bx lr
+
+
+
+ .align 4
+ .global vdiv_vec3f_neon
+ .thumb
+ .thumb_func
+vdiv_vec3f_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t vdiv_vec3f(arm_vec3f_t * dst,
+ @ arm_vec3f_t * src1,
+ @ arm_vec3f_t * src2,
+ @ unsigned int count)
+ @
+ @ Component-wise division dst[i] = src1[i] / src2[i] for count 3D
+ @ vectors; returns 0. Each quotient is formed as src1 * (1 / src2),
+ @ where the reciprocal is a VRECPE estimate refined by one VRECPS
+ @ Newton-Raphson step, so the results are approximations.
+ @
+ @ r0: *dst & current dst entry's address
+ @ r1: *src1 & current src1 entry's address
+ @ r2: *src2 & current src2 entry's address
+ @ r3: int count & the number of chunks of 4 vectors that are processed
+ @ by the main loop
+ @
+ @ r4: the number of items that are residual that will be processed at the begin of
+ @ the input array
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ push {r4}
+ and r4, r3, #3 @ r4 = count % 4; calculate the residual loop
+ asr r3, r3, #2 @ r3 = count >> 2; calculate the main loop
+
+ cbz r4, .L_check_mainloop_vec3
+
+.L_residualloop_vec3:
+ @ process the residual items in the input array
+ vld3.f32 {d0[0], d2[0], d4[0]}, [r1]! @ The values are loaded like so:
+ @ q0 = { V1.x, -, -, - };
+ @ q1 = { V1.y, -, -, - };
+ @ q2 = { V1.z, -, -, - };
+ vld3.f32 {d1[0], d3[0], d5[0]}, [r2]! @ The values are loaded like so:
+ @ q0 = { V1.x, -, V2.x, - };
+ @ q1 = { V1.y, -, V2.y, - };
+ @ q2 = { V1.z, -, V2.z, - };
+
+ subs r4, r4, #1
+
+ @ calculate values; only lane 0 of every d register is meaningful
+ @ d0 = V1.x / V2.x
+ vrecpe.f32 d18, d1
+ vrecps.f32 d1 , d18, d1
+ vmul.f32 d18, d1 , d18
+ vmul.f32 d0 , d0 , d18
+
+ @ d2 = V1.y / V2.y
+ vrecpe.f32 d20, d3
+ vrecps.f32 d3 , d20, d3
+ vmul.f32 d20, d3 , d20
+ vmul.f32 d2 , d2 , d20
+
+ @ d4 = V1.z / V2.z
+ vrecpe.f32 d22, d5
+ vrecps.f32 d5 , d22, d5
+ vmul.f32 d22, d5 , d22
+ vmul.f32 d4 , d4 , d22
+
+ vst3.32 {d0[0], d2[0], d4[0]}, [r0]!
+
+ bgt .L_residualloop_vec3 @ tests the flags of the subs above
+
+.L_check_mainloop_vec3:
+ cbz r3, .L_return_vec3
+
+.L_mainloop_vec3:
+ @ load current set of values
+ @ (src1: x, y, z in q0, q1, q2; src2: x, y, z in q9, q10, q11)
+ @ The loads are done at the top of the loop, not as a preload of the
+ @ next set after the store, so that the last iteration does not read
+ @ 48 bytes beyond the end of src1[] and src2[].
+ vld3.32 {d0, d2, d4}, [r1]!
+ vld3.32 {d1, d3, d5}, [r1]!
+ vld3.32 {d18, d20, d22}, [r2]!
+ vld3.32 {d19, d21, d23}, [r2]!
+
+ @ calculate values for current set
+ @ q12 = q0 / q9
+ vrecpe.f32 q12, q9
+ vrecps.f32 q9 , q12, q9
+ vmul.f32 q12, q9 , q12
+ vmul.f32 q12, q0 , q12
+
+ @ q13 = q1 / q10
+ vrecpe.f32 q13, q10
+ vrecps.f32 q10 , q13, q10
+ vmul.f32 q13, q10 , q13
+ vmul.f32 q13, q1 , q13
+
+ @ q14 = q2 / q11
+ vrecpe.f32 q14, q11
+ vrecps.f32 q11 , q14, q11
+ vmul.f32 q14, q11 , q14
+ vmul.f32 q14, q2 , q14
+
+ @ store the result for current set
+ vst3.32 {d24, d26, d28}, [r0]!
+ vst3.32 {d25, d27, d29}, [r0]!
+ subs r3, r3, #1
+
+ bgt .L_mainloop_vec3 @ loop if r3 > 0, if we have at least another 4 vectors (12 floats) to process
+
+.L_return_vec3:
+ @ return
+ pop {r4}
+ mov r0, #0
+ bx lr
+
+
+
+
+ .align 4
+ .global vdiv_vec4f_neon
+ .thumb
+ .thumb_func
+vdiv_vec4f_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t vdiv_vec4f(arm_vec4f_t * dst,
+ @ arm_vec4f_t * src1,
+ @ arm_vec4f_t * src2,
+ @ unsigned int count)
+ @
+ @ Component-wise division dst[i] = src1[i] / src2[i] for count 4D
+ @ vectors; returns 0. Each quotient is formed as src1 * (1 / src2),
+ @ where the reciprocal is a VRECPE estimate refined by one VRECPS
+ @ Newton-Raphson step, so the results are approximations.
+ @
+ @ r0: *dst & current dst entry's address
+ @ r1: *src1 & current src1 entry's address
+ @ r2: *src2 & current src2 entry's address
+ @ r3: int count & the number of chunks of 4 vectors that are processed
+ @ by the main loop
+ @
+ @ r4: the number of items that are residual that will be processed at the begin of
+ @ the input array
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ push {r4}
+ and r4, r3, #3 @ r4 = count % 4; calculate the residual loop
+ asr r3, r3, #2 @ r3 = count >> 2; calculate the main loop
+
+ cbz r4, .L_check_mainloop_vec4
+
+.L_residualloop_vec4:
+ @ process the residual items in the input array
+ vld1.f32 {d0, d1}, [r1]! @ The values are loaded like so:
+ @ q0 = { V1.x, V1.y, V1.z, V1.w };
+ vld1.f32 {d2, d3}, [r2]! @ The values are loaded like so:
+ @ q1 = { V2.x, V2.y, V2.z, V2.w };
+
+ subs r4, r4, #1
+
+ @ calculate values
+ @ q0 = q0 / q1
+ vrecpe.f32 q2, q1 @ q2 = estimate of 1/q1
+ vrecps.f32 q1 , q2, q1 @ q1 = 2 - q2*q1 (Newton-Raphson correction factor)
+ vmul.f32 q2, q1 , q2 @ q2 = refined 1/q1
+ vmul.f32 q0 , q0 , q2
+
+ vst1.32 {d0, d1}, [r0]!
+
+ bgt .L_residualloop_vec4 @ tests the flags of the subs above
+
+.L_check_mainloop_vec4:
+ cbz r3, .L_return_vec4
+
+.L_mainloop_vec4:
+ @ load the current set of values
+ @ (src1: x, y, z, w in q0-q3; src2: x, y, z, w in q8-q11)
+ @ The loads are done at the top of the loop, not as a preload of the
+ @ next set after the store, so that the last iteration does not read
+ @ 64 bytes beyond the end of src1[] and src2[].
+ vld4.32 {d0, d2, d4, d6}, [r1]!
+ vld4.32 {d1, d3, d5, d7}, [r1]!
+ vld4.32 {d16, d18, d20, d22}, [r2]!
+ vld4.32 {d17, d19, d21, d23}, [r2]!
+
+ @ calculate values for current set
+ @ q12 = q0 / q8
+ vrecpe.f32 q12, q8
+ vrecps.f32 q8 , q12, q8
+ vmul.f32 q12, q8 , q12
+ vmul.f32 q12, q0 , q12
+
+ @ q13 = q1 / q9
+ vrecpe.f32 q13, q9
+ vrecps.f32 q9 , q13, q9
+ vmul.f32 q13, q9 , q13
+ vmul.f32 q13, q1 , q13
+
+ @ q14 = q2 / q10
+ vrecpe.f32 q14, q10
+ vrecps.f32 q10 , q14, q10
+ vmul.f32 q14, q10 , q14
+ vmul.f32 q14, q2 , q14
+
+ @ q15 = q3 / q11
+ vrecpe.f32 q15, q11
+ vrecps.f32 q11 , q15, q11
+ vmul.f32 q15, q11 , q15
+ vmul.f32 q15, q3 , q15
+
+ @ store the result for current set
+ vst4.32 {d24, d26, d28, d30}, [r0]!
+ vst4.32 {d25, d27, d29, d31}, [r0]!
+ subs r3, r3, #1
+
+ bgt .L_mainloop_vec4 @ loop if r3 > 0, if we have at least another 4 vectors (16 floats) to process
+
+.L_return_vec4:
+ @ return
+ pop {r4}
+ mov r0, #0
+ bx lr
--- /dev/null
+@
+@ Copyright 2011-12 ARM Limited
+@
+@ Licensed under the Apache License, Version 2.0 (the "License");
+@ you may not use this file except in compliance with the License.
+@ You may obtain a copy of the License at
+@
+@ http://www.apache.org/licenses/LICENSE-2.0
+@
+@ Unless required by applicable law or agreed to in writing, software
+@ distributed under the License is distributed on an "AS IS" BASIS,
+@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ See the License for the specific language governing permissions and
+@ limitations under the License.
+@
+
+@
+@ NE10 Library : math/NE10_divc.asm.s
+@
+
+ .text
+ .syntax unified
+
+.include "NE10header.s"
+
+ .balign 4
+ .global divc_float_asm
+ .thumb
+ .thumb_func
+
+divc_float_asm:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t divc_float(arm_float_t * dst,
+ @ arm_float_t * src, const arm_float_t cst,
+ @ unsigned int count)
+ @
+ @ Scalar VFP version: dst[i] = src[i] / cst for count floats.
+ @
+ @ r0: *dst
+ @ r1: *src
+ @ r2: cst (the float's bit pattern, passed by value in a core register)
+ @ r3: int count
+ @
+ @ r3: loop counter
+ @ r5: current item's offset in both src[] and dst[]
+ @ r6: current source item's address made of base(r1)+offset(r5)
+ @ r7: current destination item's address made of base(r0)+offset(r5)
+ @
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ push {r4, r5, r6, r7}
+ cbz r3, .LoopEndFloat
+ mov r5, #0
+ vmov s3, r2 @ Get cst into register s3; done once, as neither r2 nor s3 changes inside the loop
+
+.LoopBeginFloat:
+ add r6, r1, r5 @ Get current source item's address in memory
+ vldr s1, [r6, #0] @ Load src[i]
+ vdiv.f32 s10, s1, s3 @ s10 = src[i] / cst
+ add r7, r0, r5 @ Get current destination item's address in memory
+ vstr s10, [r7, #0] @ Store the result back into the main memory
+ add r5, r5, #4 @ increase the offset by 1*sizeof(float)
+ subs r3, r3, #1 @ count down the number of items left
+ bne .LoopBeginFloat @ Continue while there are items left (r3 != 0)
+
+.LoopEndFloat:
+ mov r0, NE10_OK @ Return NE10_OK
+ pop {r4, r5, r6, r7}
+ bx lr
+
+
+
+
+ .balign 4
+ .global divc_vec2f_asm
+ .thumb
+ .thumb_func
+
+divc_vec2f_asm:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t divc_vec2f(arm_vec2f_t * dst,
+ @ arm_vec2f_t * src, const arm_vec2f_t * cst,
+ @ unsigned int count)
+ @
+ @ Scalar VFP version: divides every 2D vector of src[] component-wise
+ @ by the vector *cst and stores the result in dst[].
+ @
+ @ r0: *dst
+ @ r1: *src
+ @ r2: *cst
+ @ r3: int count
+ @
+ @ r3: loop counter
+ @ r5: current item's offset in both src[] and dst[]
+ @ r6: current source item's address made of base(r1)+offset(r5)
+ @ r7: current destination item's address made of base(r0)+offset(r5)
+ @
+ @ r4 is saved and restored together with r5-r7 but is not used.
+ @
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ push {r4, r5, r6, r7}
+ cbz r3, .LoopEndVec2F
+ mov r5, #0
+
+.LoopBeginVec2F:
+ add r6, r1, r5 @ Get current source item's address in memory
+ vldr s1, [r6, #0] @ Load src[i].x and src[i].y
+ vldr s2, [r6, #4]
+ vldr s3, [r2, #0] @ Load cst->x and cst->y (re-read from memory on every iteration)
+ vldr s4, [r2, #4]
+ vdiv.f32 s10, s1, s3 @ s10 = src[i].x / cst->x
+ vdiv.f32 s11, s2, s4 @ s11 = src[i].y / cst->y
+ add r7, r0, r5 @ Get current destination item's address in memory
+ vstr s10, [r7, #0] @ Store the results back into the main memory
+ vstr s11, [r7, #4]
+ add r5, r5, #8 @ increase the offset by 2*sizeof(float) @@ (for x and y)
+ subs r3, r3, #1 @ count down the number of items left
+ bne .LoopBeginVec2F @ Continue while there are items left (r3 != 0)
+
+.LoopEndVec2F:
+ mov r0, NE10_OK @ Return NE10_OK
+ pop {r4, r5, r6, r7}
+ bx lr
+
+
+
+
+ .balign 4
+ .global divc_vec3f_asm
+ .thumb
+ .thumb_func
+
+divc_vec3f_asm:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t divc_vec3f(arm_vec3f_t * dst,
+ @ arm_vec3f_t * src, const arm_vec3f_t * cst,
+ @ unsigned int count)
+ @
+ @ Scalar VFP version: divides every 3D vector of src[] component-wise
+ @ by the vector *cst and stores the result in dst[].
+ @
+ @ r0: *dst
+ @ r1: *src
+ @ r2: *cst
+ @ r3: int count
+ @
+ @ r3: loop counter
+ @ r5: current item's offset in both src[] and dst[]
+ @ r6: current source item's address made of base(r1)+offset(r5)
+ @ r7: current destination item's address made of base(r0)+offset(r5)
+ @
+ @ r4 is saved and restored together with r5-r7 but is not used.
+ @
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ push {r4, r5, r6, r7}
+ cbz r3, .LoopEndVec3F
+ mov r5, #0
+
+.LoopBeginVec3F:
+ add r6, r1, r5 @ Get current source item's address in memory
+ vldr s1, [r6, #0] @ Load src[i].x, src[i].y , and src[i].z
+ vldr s2, [r6, #4]
+ vldr s3, [r6, #8]
+ vldr s4, [r2, #0] @ Load cst->x, cst->y, and cst->z (re-read from memory on every iteration)
+ vldr s5, [r2, #4]
+ vldr s6, [r2, #8]
+ vdiv.f32 s10, s1, s4 @ s10 = src[i].x / cst->x
+ vdiv.f32 s11, s2, s5 @ s11 = src[i].y / cst->y
+ vdiv.f32 s12, s3, s6 @ s12 = src[i].z / cst->z
+ add r7, r0, r5 @ Get current destination item's address in memory
+ vstr s10, [r7, #0] @ Store the results back into the main memory
+ vstr s11, [r7, #4]
+ vstr s12, [r7, #8]
+ add r5, r5, #12 @ increase the offset by 3*sizeof(float) @@ (for x, y, and z)
+ subs r3, r3, #1 @ count down the number of items left
+ bne .LoopBeginVec3F @ Continue while there are items left (r3 != 0)
+
+.LoopEndVec3F:
+ mov r0, NE10_OK @ Return NE10_OK
+ pop {r4, r5, r6, r7}
+ bx lr
+
+
+
+
+ .balign 4
+ .global divc_vec4f_asm
+ .thumb
+ .thumb_func
+
+divc_vec4f_asm:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t divc_vec4f(arm_vec4f_t * dst,
+ @ arm_vec4f_t * src, const arm_vec4f_t * cst,
+ @ unsigned int count)
+ @
+ @ Scalar VFP version: divides every 4D vector of src[] component-wise
+ @ by the vector *cst and stores the result in dst[].
+ @
+ @ r0: *dst
+ @ r1: *src
+ @ r2: *cst
+ @ r3: int count
+ @
+ @ r3: loop counter
+ @ r5: current item's offset in both src[] and dst[]
+ @ r6: current source item's address made of base(r1)+offset(r5)
+ @ r7: current destination item's address made of base(r0)+offset(r5)
+ @
+ @ r4 is saved and restored together with r5-r7 but is not used.
+ @
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ push {r4, r5, r6, r7}
+ cbz r3, .LoopEndVec4F
+ mov r5, #0
+
+.LoopBeginVec4F:
+ add r6, r1, r5 @ Get current source item's address in memory
+ vldr s1, [r6, #0] @ Load src[i].x, src[i].y , src[i].z, and w
+ vldr s2, [r6, #4]
+ vldr s3, [r6, #8]
+ vldr s4, [r6, #12]
+ vldr s5, [r2, #0] @ Load cst->x, cst->y, cst->z, and w (re-read from memory on every iteration)
+ vldr s6, [r2, #4]
+ vldr s7, [r2, #8]
+ vldr s8, [r2, #12]
+ vdiv.f32 s10, s1, s5 @ s10 = src[i].x / cst->x
+ vdiv.f32 s11, s2, s6 @ s11 = src[i].y / cst->y
+ vdiv.f32 s12, s3, s7 @ s12 = src[i].z / cst->z
+ vdiv.f32 s13, s4, s8 @ s13 = src[i].w / cst->w
+ add r7, r0, r5 @ Get current destination item's address in memory
+ vstr s10, [r7, #0] @ Store the results back into the main memory
+ vstr s11, [r7, #4]
+ vstr s12, [r7, #8]
+ vstr s13, [r7, #12]
+ add r5, r5, #16 @ increase the offset by 4*sizeof(float) @@ (for x, y, z, and w)
+ subs r3, r3, #1 @ count down the number of items left
+ bne .LoopBeginVec4F @ Continue while there are items left (r3 != 0)
+
+.LoopEndVec4F:
+ mov r0, NE10_OK @ Return NE10_OK
+ pop {r4, r5, r6, r7}
+ bx lr
--- /dev/null
+/*
+ * Copyright 2011-12 ARM Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NE10 Library : math/NE10_divc.c
+ */
+
+#include "NE10_types.h"
+#include "macros.h"
+
+#include <assert.h>
+
+/*
+ * Divides every element of a float array by one constant:
+ * dst[itr] = src[itr] / cst.
+ * The loop over itr and the return statement are supplied by the
+ * NE10_XC_OPERATION_X_C macro (macros.h, not visible here) - presumably it
+ * runs itr over [0, count) and returns the status code; confirm there.
+ * This function itself does not test for cst == 0.
+ */
+ne10_result_t divc_float_c (ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count)
+{
+ NE10_XC_OPERATION_X_C
+ (
+ dst[ itr ] = src[ itr ] / cst;
+ );
+}
+
+/*
+ * Plain C component-wise division of 2D vectors by a constant vector: the x and
+ * y of every src entry are divided by cst->x and cst->y respectively and stored
+ * in the dst entry at the same index.
+ *
+ * The loop itself is generated by NE10_XC_OPERATION_X_C (macros.h, not visible
+ * here); presumably it supplies the `itr` index, runs `count` times and
+ * produces the return value -- confirm against macros.h.
+ */
+ne10_result_t divc_vec2f_c (ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count)
+{
+ NE10_XC_OPERATION_X_C
+ (
+ dst[ itr ].x = src[ itr ].x / cst->x;
+ dst[ itr ].y = src[ itr ].y / cst->y;
+ );
+}
+
+/*
+ * Plain C component-wise division of 3D vectors by a constant vector: the x, y
+ * and z of every src entry are divided by the matching component of *cst and
+ * stored in the dst entry at the same index.
+ *
+ * The loop itself is generated by NE10_XC_OPERATION_X_C (macros.h, not visible
+ * here); presumably it supplies the `itr` index, runs `count` times and
+ * produces the return value -- confirm against macros.h.
+ */
+ne10_result_t divc_vec3f_c (ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count)
+{
+ NE10_XC_OPERATION_X_C
+ (
+ dst[ itr ].x = src[ itr ].x / cst->x;
+ dst[ itr ].y = src[ itr ].y / cst->y;
+ dst[ itr ].z = src[ itr ].z / cst->z;
+ );
+}
+
+/*
+ * Plain C component-wise division of 4D vectors by a constant vector: the x, y,
+ * z and w of every src entry are divided by the matching component of *cst and
+ * stored in the dst entry at the same index.
+ *
+ * The loop itself is generated by NE10_XC_OPERATION_X_C (macros.h, not visible
+ * here); presumably it supplies the `itr` index, runs `count` times and
+ * produces the return value -- confirm against macros.h.
+ */
+ne10_result_t divc_vec4f_c (ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count)
+{
+ NE10_XC_OPERATION_X_C
+ (
+ dst[ itr ].x = src[ itr ].x / cst->x;
+ dst[ itr ].y = src[ itr ].y / cst->y;
+ dst[ itr ].z = src[ itr ].z / cst->z;
+ dst[ itr ].w = src[ itr ].w / cst->w;
+ );
+}
--- /dev/null
+/*
+ * Copyright 2011-12 ARM Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NE10 Library : math/NE10_divc.neon.c
+ */
+
+#include "NE10_types.h"
+#include "macros.h"
+
+#include <assert.h>
+#include <arm_neon.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+
+/*
+ * NEON counterpart of divc_float_c, intended to produce dst = src / cst for an
+ * array of floats.
+ *
+ * No divide instruction is used. Each fragment below builds 1/cst from a
+ * reciprocal estimate (vrecpe) refined by two Newton-Raphson steps (vrecps)
+ * and then multiplies the source by it, so the result is an approximation of
+ * the quotient and may differ from the C version in the last bits.
+ *
+ * NE10_XC_OPERATION_FLOAT_NEON (macros.h, not visible here) is handed two
+ * fragments: the first works on float32x4_t values (n_cst, n_src, n_dst), the
+ * second on float32x2_t values (n_tmp_cst, n_tmp_src). Presumably the macro
+ * declares those variables, drives the loop and returns the result -- confirm
+ * against macros.h.
+ */
+ne10_result_t divc_float_neon (ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count)
+{
+ /* NOTE(review): ii and d are not referenced anywhere in this function's own
+ * text; unless the macro uses them they are leftovers -- confirm before
+ * removing. */
+ ne10_uint32_t ii = 0;
+ ne10_float32_t d[4];
+ NE10_XC_OPERATION_FLOAT_NEON
+ (
+ /* a single division operation */
+ float32x4_t rec = vrecpeq_f32 (n_cst); /* initial estimate of 1/cst */
+ rec = vmulq_f32 (vrecpsq_f32 (n_cst, rec), rec); /* refinement step 1: rec *= (2 - cst*rec) */
+ rec = vmulq_f32 (vrecpsq_f32 (n_cst, rec), rec); /* refinement step 2 */
+ n_dst = vmulq_f32 (n_src , rec); /* src * (1/cst) */
+ ,
+ /* a single division operation */
+ float32x2_t rec = vrecpe_f32 (n_tmp_cst); /* same scheme on a 2-lane register */
+ rec = vmul_f32 (vrecps_f32 (n_tmp_cst, rec), rec);
+ rec = vmul_f32 (vrecps_f32 (n_tmp_cst, rec), rec);
+ n_tmp_src = vmul_f32 (n_tmp_src, rec); /* result overwrites n_tmp_src */
+ );
+}
+
+/*
+ * NEON counterpart of divc_vec2f_c, intended to divide every 2D vector in src
+ * component-wise by *cst.
+ *
+ * The quotient is approximated: 1/cst comes from a reciprocal estimate
+ * (vrecpe) refined by two Newton-Raphson steps (vrecps) and is then multiplied
+ * with the source, so results may differ from the C version in the last bits.
+ *
+ * NE10_XC_OPERATION_VEC2F_NEON (macros.h, not visible here) is handed two
+ * fragments: the first works on float32x4_t values (n_cst, n_src, n_dst), the
+ * second on float32x2_t values (n_tmp_cst, n_tmp_src). Presumably the macro
+ * declares those variables, drives the loop and returns the result -- confirm
+ * against macros.h.
+ */
+ne10_result_t divc_vec2f_neon (ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count)
+{
+ NE10_XC_OPERATION_VEC2F_NEON
+ (
+ /* a single division operation */
+ float32x4_t rec = vrecpeq_f32 (n_cst); /* initial estimate of 1/cst */
+ rec = vmulq_f32 (vrecpsq_f32 (n_cst, rec), rec); /* refinement step 1 */
+ rec = vmulq_f32 (vrecpsq_f32 (n_cst, rec), rec); /* refinement step 2 */
+ n_dst = vmulq_f32 (n_src , rec); /* src * (1/cst) */
+ ,
+ /* a single division operation */
+ float32x2_t rec = vrecpe_f32 (n_tmp_cst); /* same scheme on a 2-lane register */
+ rec = vmul_f32 (vrecps_f32 (n_tmp_cst, rec), rec);
+ rec = vmul_f32 (vrecps_f32 (n_tmp_cst, rec), rec);
+ n_tmp_src = vmul_f32 (n_tmp_src, rec); /* result overwrites n_tmp_src */
+ );
+}
+
+/*
+ * NEON counterpart of divc_vec3f_c, intended to divide every 3D vector in src
+ * component-wise by *cst.
+ *
+ * The quotient is approximated: each reciprocal comes from an estimate
+ * (vrecpe) refined by two Newton-Raphson steps (vrecps) and is then multiplied
+ * with the source, so results may differ from the C version in the last bits.
+ *
+ * NE10_XC_OPERATION_VEC3F_NEON (macros.h, not visible here) is handed two
+ * fragments. The first processes three float32x4_t registers
+ * (n_src1..3 / n_cst1..3 -> n_dst1..3); the second processes the three
+ * float32x2_t members val[0..2] of n_tmp_src / n_tmp_cst in place. Presumably
+ * the macro declares those variables, drives the loop and returns the
+ * result -- confirm against macros.h.
+ */
+ne10_result_t divc_vec3f_neon (ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count)
+{
+ NE10_XC_OPERATION_VEC3F_NEON
+ (
+ /* three division operations */
+ float32x4_t rec = vrecpeq_f32 (n_cst1); /* first register: estimate, two refinements, multiply */
+ rec = vmulq_f32 (vrecpsq_f32 (n_cst1, rec), rec);
+ rec = vmulq_f32 (vrecpsq_f32 (n_cst1, rec), rec);
+ n_dst1 = vmulq_f32 (n_src1 , rec);
+
+ rec = vrecpeq_f32 (n_cst2); /* second register, rec is reused */
+ rec = vmulq_f32 (vrecpsq_f32 (n_cst2, rec), rec);
+ rec = vmulq_f32 (vrecpsq_f32 (n_cst2, rec), rec);
+ n_dst2 = vmulq_f32 (n_src2 , rec);
+
+ rec = vrecpeq_f32 (n_cst3); /* third register */
+ rec = vmulq_f32 (vrecpsq_f32 (n_cst3, rec), rec);
+ rec = vmulq_f32 (vrecpsq_f32 (n_cst3, rec), rec);
+ n_dst3 = vmulq_f32 (n_src3 , rec);
+ ,
+ /* three division operations */
+ float32x2_t rec = vrecpe_f32 (n_tmp_cst.val[0]); /* val[0], result written back into n_tmp_src */
+ rec = vmul_f32 (vrecps_f32 (n_tmp_cst.val[0], rec), rec);
+ rec = vmul_f32 (vrecps_f32 (n_tmp_cst.val[0], rec), rec);
+ n_tmp_src.val[0] = vmul_f32 (n_tmp_src.val[0] , rec);
+
+ rec = vrecpe_f32 (n_tmp_cst.val[1]); /* val[1] */
+ rec = vmul_f32 (vrecps_f32 (n_tmp_cst.val[1], rec), rec);
+ rec = vmul_f32 (vrecps_f32 (n_tmp_cst.val[1], rec), rec);
+ n_tmp_src.val[1] = vmul_f32 (n_tmp_src.val[1] , rec);
+
+ rec = vrecpe_f32 (n_tmp_cst.val[2]); /* val[2] */
+ rec = vmul_f32 (vrecps_f32 (n_tmp_cst.val[2], rec), rec);
+ rec = vmul_f32 (vrecps_f32 (n_tmp_cst.val[2], rec), rec);
+ n_tmp_src.val[2] = vmul_f32 (n_tmp_src.val[2] , rec);
+ );
+}
+
+/*
+ * NEON counterpart of divc_vec4f_c, intended to divide every 4D vector in src
+ * component-wise by *cst.
+ *
+ * The quotient is approximated: 1/cst comes from a reciprocal estimate
+ * (vrecpeq) refined by two Newton-Raphson steps (vrecpsq) and is then
+ * multiplied with the source, so results may differ from the C version in the
+ * last bits.
+ *
+ * NE10_XC_OPERATION_VEC4F_NEON (macros.h, not visible here) is handed a single
+ * fragment working on float32x4_t values (n_cst, n_src, n_dst). Presumably the
+ * macro declares those variables, drives the loop and returns the result --
+ * confirm against macros.h.
+ */
+ne10_result_t divc_vec4f_neon (ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count)
+{
+ NE10_XC_OPERATION_VEC4F_NEON
+ (
+ /* a single division operation */
+ float32x4_t rec = vrecpeq_f32 (n_cst); /* initial estimate of 1/cst */
+ rec = vmulq_f32 (vrecpsq_f32 (n_cst, rec), rec); /* refinement step 1 */
+ rec = vmulq_f32 (vrecpsq_f32 (n_cst, rec), rec); /* refinement step 2 */
+ n_dst = vmulq_f32 (n_src , rec); /* src * (1/cst) */
+ );
+}
--- /dev/null
+@
+@ Copyright 2011-12 ARM Limited
+@
+@ Licensed under the Apache License, Version 2.0 (the "License");
+@ you may not use this file except in compliance with the License.
+@ You may obtain a copy of the License at
+@
+@ http://www.apache.org/licenses/LICENSE-2.0
+@
+@ Unless required by applicable law or agreed to in writing, software
+@ distributed under the License is distributed on an "AS IS" BASIS,
+@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ See the License for the specific language governing permissions and
+@ limitations under the License.
+@
+
+@
+@ NE10 Library : math/NE10_dot.asm.s
+@
--- /dev/null
+/*
+ * Copyright 2011-12 ARM Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NE10 Library : math/NE10_dot.c
+ */
+
+#include "NE10_types.h"
+#include "macros.h"
+
+#include <assert.h>
+
+/*
+ * Plain C dot product of 2D vectors: for each index the products of the x and
+ * y components of src1 and src2 are summed and the scalar is written to dst.
+ *
+ * The loop itself is generated by NE10_DOT_OPERATION_X_C (macros.h, not
+ * visible here); presumably it supplies the `itr` index, runs `count` times
+ * and produces the return value -- confirm against macros.h.
+ */
+ne10_result_t dot_vec2f_c (ne10_float32_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count)
+{
+ NE10_DOT_OPERATION_X_C
+ (
+ dst[ itr ] = src1[ itr ].x * src2[ itr ].x +
+ src1[ itr ].y * src2[ itr ].y ;
+ );
+}
+
+/*
+ * Plain C dot product of 3D vectors: for each index the products of the x, y
+ * and z components of src1 and src2 are summed (in that order) and the scalar
+ * is written to dst.
+ *
+ * The loop itself is generated by NE10_DOT_OPERATION_X_C (macros.h, not
+ * visible here); presumably it supplies the `itr` index, runs `count` times
+ * and produces the return value -- confirm against macros.h.
+ */
+ne10_result_t dot_vec3f_c (ne10_float32_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count)
+{
+ NE10_DOT_OPERATION_X_C
+ (
+ dst[ itr ] = src1[ itr ].x * src2[ itr ].x +
+ src1[ itr ].y * src2[ itr ].y +
+ src1[ itr ].z * src2[ itr ].z ;
+ );
+}
+
+/*
+ * Plain C dot product of 4D vectors: for each index the products of the x, y,
+ * z and w components of src1 and src2 are summed (in that order) and the
+ * scalar is written to dst.
+ *
+ * The loop itself is generated by NE10_DOT_OPERATION_X_C (macros.h, not
+ * visible here); presumably it supplies the `itr` index, runs `count` times
+ * and produces the return value -- confirm against macros.h.
+ */
+ne10_result_t dot_vec4f_c (ne10_float32_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count)
+{
+ NE10_DOT_OPERATION_X_C
+ (
+ dst[ itr ] = src1[ itr ].x * src2[ itr ].x +
+ src1[ itr ].y * src2[ itr ].y +
+ src1[ itr ].z * src2[ itr ].z +
+ src1[ itr ].w * src2[ itr ].w ;
+ );
+}
--- /dev/null
+@
+@ Copyright 2011-12 ARM Limited
+@
+@ Licensed under the Apache License, Version 2.0 (the "License");
+@ you may not use this file except in compliance with the License.
+@ You may obtain a copy of the License at
+@
+@ http://www.apache.org/licenses/LICENSE-2.0
+@
+@ Unless required by applicable law or agreed to in writing, software
+@ distributed under the License is distributed on an "AS IS" BASIS,
+@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ See the License for the specific language governing permissions and
+@ limitations under the License.
+@
+
+@
+@ NE10 Library : math/NE10_dot.neon.s
+@
+
+ .text
+ .syntax unified
+
+.include "NE10header.s"
+
+
+
+
+ .align 4
+ .global dot_vec2f_neon
+ .thumb
+ .thumb_func
+
+dot_vec2f_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ ne10_result_t dot_vec2f_neon(ne10_float32_t * dst,
+ @ ne10_vec2f_t * src1,
+ @ ne10_vec2f_t * src2,
+ @ ne10_uint32_t count)
+ @
+ @ Writes dst[i] = src1[i].x * src2[i].x + src1[i].y * src2[i].y
+ @ for `count` vectors and returns 0 in r0.
+ @
+ @ r0: *dst & current dst entry's address
+ @ r1: *src1 & current src1 entry's address
+ @ r2: *src2 & current src2 entry's address
+ @ r3: count on entry, then the number of groups of 4 vectors that the
+ @ main loop still has to process
+ @
+ @ r4: count % 4, the residual vectors; they are processed one at a
+ @ time at the beginning of the input arrays
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ push {r4}
+ and r4, r3, #3 @ r4 = count % 4; length of the residual loop
+ lsr r3, r3, #2 @ r3 = count / 4; count is unsigned, so use a logical shift
+
+ cbz r4, .L_check_mainloop_vec2
+
+.L_residualloop_vec2:
+ @ process the residual items in the input array, one vector per pass
+ vld1.f32 d0, [r1]! @ d0 = { V1.x, V1.y }
+ vld1.f32 d1, [r2]! @ d1 = { V2.x, V2.y }
+
+ subs r4, r4, #1 @ the NEON instructions below leave the flags untouched
+
+ @ calculate values
+ vmul.f32 d0, d0, d1 @ d0 = { x1*x2, y1*y2 }
+ vpadd.f32 d0, d0 @ d0[0] = x1*x2 + y1*y2
+
+ vst1.32 {d0[0]}, [r0]!
+
+ bgt .L_residualloop_vec2
+
+.L_check_mainloop_vec2:
+ cbz r3, .L_return_vec2
+
+.L_mainloop_vec2:
+ @ Load the current group of 4 vectors from each source. The loads sit at
+ @ the top of the pass so that nothing beyond the last group is ever read.
+ vld2.32 {q0-q1}, [r1]! @ q0 = four x values, q1 = four y values of src1
+ vld2.32 {q2-q3}, [r2]! @ q2 = four x values, q3 = four y values of src2
+
+ @ calculate values for current set
+ vmul.f32 q8, q0, q2
+ vmla.f32 q8, q1, q3
+
+ @ store the result for current set
+ vst1.32 {d16,d17}, [r0]!
+ subs r3, r3, #1
+
+ bgt .L_mainloop_vec2 @ loop if r3 > 0, if we have at least another 4 vectors (8 floats) to process
+
+.L_return_vec2:
+ @ return
+ pop {r4}
+ mov r0, #0
+ bx lr
+
+
+
+
+ .align 4
+ .global dot_vec3f_neon
+ .thumb
+ .thumb_func
+dot_vec3f_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ ne10_result_t dot_vec3f_neon(ne10_float32_t * dst,
+ @ ne10_vec3f_t * src1,
+ @ ne10_vec3f_t * src2,
+ @ ne10_uint32_t count)
+ @
+ @ Writes dst[i] = src1[i].x * src2[i].x + src1[i].y * src2[i].y
+ @ + src1[i].z * src2[i].z
+ @ for `count` vectors and returns 0 in r0.
+ @
+ @ r0: *dst & current dst entry's address
+ @ r1: *src1 & current src1 entry's address
+ @ r2: *src2 & current src2 entry's address
+ @ r3: count on entry, then the number of groups of 4 vectors that the
+ @ main loop still has to process
+ @
+ @ r4: count % 4, the residual vectors; they are processed one at a
+ @ time at the beginning of the input arrays
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ push {r4}
+ and r4, r3, #3 @ r4 = count % 4; length of the residual loop
+ lsr r3, r3, #2 @ r3 = count / 4; count is unsigned, so use a logical shift
+
+ cbz r4, .L_check_mainloop_vec3
+
+.L_residualloop_vec3:
+ @ process the residual items in the input array, one vector per pass
+ vld3.f32 {d0[0], d2[0], d4[0]}, [r1]! @ The values are loaded like so:
+ @ q0 = { V1.x, -, -, - };
+ @ q1 = { V1.y, -, -, - };
+ @ q2 = { V1.z, -, -, - };
+ vld3.f32 {d1[0], d3[0], d5[0]}, [r2]! @ The values are loaded like so:
+ @ q0 = { V1.x, -, V2.x, - };
+ @ q1 = { V1.y, -, V2.y, - };
+ @ q2 = { V1.z, -, V2.z, - };
+
+ subs r4, r4, #1 @ the NEON instructions below leave the flags untouched
+
+ @ calculate the dot product in lane 0 of d0; lane 1 is not stored
+ vmul.f32 d0, d0, d1
+ vmla.f32 d0, d2, d3
+ vmla.f32 d0, d4, d5
+
+ vst1.32 {d0[0]}, [r0]!
+
+ bgt .L_residualloop_vec3
+
+.L_check_mainloop_vec3:
+ cbz r3, .L_return_vec3
+
+.L_mainloop_vec3:
+ @ Load the current group of 4 vectors from each source. The loads sit at
+ @ the top of the pass so that nothing beyond the last group is ever read.
+ vld3.32 {d0, d2, d4}, [r1]! @ q0/q1/q2 = x/y/z of four src1 vectors
+ vld3.32 {d1, d3, d5}, [r1]!
+ vld3.32 {d16, d18, d20}, [r2]! @ q8/q9/q10 = x/y/z of four src2 vectors
+ vld3.32 {d17, d19, d21}, [r2]!
+
+ @ calculate values for current set
+ vmul.f32 q15, q0, q8
+ vmla.f32 q15, q1, q9
+ vmla.f32 q15, q2, q10
+
+ @ store the result for current set
+ vst1.32 {d30, d31}, [r0]!
+ subs r3, r3, #1
+
+ bgt .L_mainloop_vec3 @ loop if r3 > 0, if we have at least another 4 vectors (12 floats) to process
+
+.L_return_vec3:
+ @ return
+ pop {r4}
+ mov r0, #0
+ bx lr
+
+
+
+
+ .align 4
+ .global dot_vec4f_neon
+ .thumb
+ .thumb_func
+dot_vec4f_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ ne10_result_t dot_vec4f_neon(ne10_float32_t * dst,
+ @ ne10_vec4f_t * src1,
+ @ ne10_vec4f_t * src2,
+ @ ne10_uint32_t count)
+ @
+ @ Writes dst[i] = src1[i].x * src2[i].x + src1[i].y * src2[i].y
+ @ + src1[i].z * src2[i].z + src1[i].w * src2[i].w
+ @ for `count` vectors and returns 0 in r0.
+ @
+ @ r0: *dst & current dst entry's address
+ @ r1: *src1 & current src1 entry's address
+ @ r2: *src2 & current src2 entry's address
+ @ r3: count on entry, then the number of groups of 4 vectors that the
+ @ main loop still has to process
+ @
+ @ r4: count % 4, the residual vectors; they are processed one at a
+ @ time at the beginning of the input arrays
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ push {r4}
+ and r4, r3, #3 @ r4 = count % 4; length of the residual loop
+ lsr r3, r3, #2 @ r3 = count / 4; count is unsigned, so use a logical shift
+
+ cbz r4, .L_check_mainloop_vec4
+
+.L_residualloop_vec4:
+ @ process the residual items in the input array, one vector per pass
+ vld1.f32 {d0, d1}, [r1]! @ The values are loaded like so:
+ @ q0 = { V1.x, V1.y, V1.z, V1.w };
+ vld1.f32 {d2, d3}, [r2]! @ The values are loaded like so:
+ @ q1 = { V2.x, V2.y, V2.z, V2.w };
+
+ subs r4, r4, #1 @ the NEON instructions below leave the flags untouched
+
+ @ calculate values
+ vmul.f32 q0, q0, q1 @ q0 = { x1*x2, y1*y2, z1*z2, w1*w2 }
+ vadd.f32 d0, d0, d1 @ d0 = { x1*x2 + z1*z2, y1*y2 + w1*w2 }
+ vpadd.f32 d0, d0 @ d0[0] = sum of all four products
+
+ vst1.32 {d0[0]}, [r0]!
+
+ bgt .L_residualloop_vec4
+
+.L_check_mainloop_vec4:
+ cbz r3, .L_return_vec4
+
+.L_mainloop_vec4:
+ @ Load the current group of 4 vectors from each source. The loads sit at
+ @ the top of the pass so that nothing beyond the last group is ever read.
+ vld4.32 {d0, d2, d4, d6}, [r1]! @ q0/q1/q2/q3 = x/y/z/w of four src1 vectors
+ vld4.32 {d1, d3, d5, d7}, [r1]!
+ vld4.32 {d16, d18, d20, d22}, [r2]! @ q8/q9/q10/q11 = x/y/z/w of four src2 vectors
+ vld4.32 {d17, d19, d21, d23}, [r2]!
+
+ @ calculate values for current set
+ vmul.f32 q15, q0, q8
+ vmla.f32 q15, q1, q9
+ vmla.f32 q15, q2, q10
+ vmla.f32 q15, q3, q11
+
+ @ store the result for current set
+ vst1.32 {d30, d31}, [r0]!
+ subs r3, r3, #1
+
+ bgt .L_mainloop_vec4 @ loop if r3 > 0, if we have at least another 4 vectors (16 floats) to process
+
+.L_return_vec4:
+ @ return
+ pop {r4}
+ mov r0, #0
+ bx lr
+
--- /dev/null
+@
+@ Copyright 2011-12 ARM Limited
+@
+@ Licensed under the Apache License, Version 2.0 (the "License");
+@ you may not use this file except in compliance with the License.
+@ You may obtain a copy of the License at
+@
+@ http://www.apache.org/licenses/LICENSE-2.0
+@
+@ Unless required by applicable law or agreed to in writing, software
+@ distributed under the License is distributed on an "AS IS" BASIS,
+@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ See the License for the specific language governing permissions and
+@ limitations under the License.
+@
+
+@
+@ NE10 Library : math/NE10_identitymat.asm.s
+@
--- /dev/null
+/*
+ * Copyright 2011-12 ARM Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NE10 Library : math/NE10_identitymat.c
+ */
+
+#include "NE10_types.h"
+#include "macros.h"
+
+/*
+ * Plain C routine that overwrites each entry of dst with the 2x2 identity
+ * matrix (ones at c1.r1 and c2.r2, zeros elsewhere).
+ *
+ * The loop itself is generated by NE10_DETMAT_OPERATION_X_C (macros.h, not
+ * visible here); presumably it supplies the `itr` index, runs `count` times
+ * and produces the return value -- confirm against macros.h.
+ */
+ne10_result_t identitymat_2x2f_c (ne10_mat2x2f_t * dst, ne10_uint32_t count)
+{
+ /* presumably the macro refers to a variable named src; aliasing it to dst
+ * lets the macro be reused for this source-less operation -- confirm */
+ ne10_mat2x2f_t *src = dst; // dummy placeholder
+
+ NE10_DETMAT_OPERATION_X_C
+ (
+ dst[ itr ].c1.r1 = 1.0f;
+ dst[ itr ].c1.r2 = 0.0f;
+ dst[ itr ].c2.r1 = 0.0f;
+ dst[ itr ].c2.r2 = 1.0f;
+ );
+}
+
+/*
+ * Plain C routine that overwrites each entry of dst with the 3x3 identity
+ * matrix (ones at c1.r1, c2.r2 and c3.r3, zeros elsewhere).
+ *
+ * The loop itself is generated by NE10_DETMAT_OPERATION_X_C (macros.h, not
+ * visible here); presumably it supplies the `itr` index, runs `count` times
+ * and produces the return value -- confirm against macros.h.
+ */
+ne10_result_t identitymat_3x3f_c (ne10_mat3x3f_t * dst, ne10_uint32_t count)
+{
+ /* presumably the macro refers to a variable named src; aliasing it to dst
+ * lets the macro be reused for this source-less operation -- confirm */
+ ne10_mat3x3f_t *src = dst; // dummy placeholder
+
+ NE10_DETMAT_OPERATION_X_C
+ (
+ dst[ itr ].c1.r1 = 1.0f;
+ dst[ itr ].c1.r2 = 0.0f;
+ dst[ itr ].c1.r3 = 0.0f;
+
+ dst[ itr ].c2.r1 = 0.0f;
+ dst[ itr ].c2.r2 = 1.0f;
+ dst[ itr ].c2.r3 = 0.0f;
+
+ dst[ itr ].c3.r1 = 0.0f;
+ dst[ itr ].c3.r2 = 0.0f;
+ dst[ itr ].c3.r3 = 1.0f;
+ );
+}
+
+/*
+ * Plain C routine that overwrites each entry of dst with the 4x4 identity
+ * matrix (ones at c1.r1, c2.r2, c3.r3 and c4.r4, zeros elsewhere).
+ *
+ * The loop itself is generated by NE10_DETMAT_OPERATION_X_C (macros.h, not
+ * visible here); presumably it supplies the `itr` index, runs `count` times
+ * and produces the return value -- confirm against macros.h.
+ */
+ne10_result_t identitymat_4x4f_c (ne10_mat4x4f_t * dst, ne10_uint32_t count)
+{
+ /* presumably the macro refers to a variable named src; aliasing it to dst
+ * lets the macro be reused for this source-less operation -- confirm */
+ ne10_mat4x4f_t *src = dst; // dummy placeholder
+
+ NE10_DETMAT_OPERATION_X_C
+ (
+ dst[ itr ].c1.r1 = 1.0f;
+ dst[ itr ].c1.r2 = 0.0f;
+ dst[ itr ].c1.r3 = 0.0f;
+ dst[ itr ].c1.r4 = 0.0f;
+
+ dst[ itr ].c2.r1 = 0.0f;
+ dst[ itr ].c2.r2 = 1.0f;
+ dst[ itr ].c2.r3 = 0.0f;
+ dst[ itr ].c2.r4 = 0.0f;
+
+ dst[ itr ].c3.r1 = 0.0f;
+ dst[ itr ].c3.r2 = 0.0f;
+ dst[ itr ].c3.r3 = 1.0f;
+ dst[ itr ].c3.r4 = 0.0f;
+
+ dst[ itr ].c4.r1 = 0.0f;
+ dst[ itr ].c4.r2 = 0.0f;
+ dst[ itr ].c4.r3 = 0.0f;
+ dst[ itr ].c4.r4 = 1.0f;
+ );
+}
--- /dev/null
+@
+@ Copyright 2011-12 ARM Limited
+@
+@ Licensed under the Apache License, Version 2.0 (the "License");
+@ you may not use this file except in compliance with the License.
+@ You may obtain a copy of the License at
+@
+@ http://www.apache.org/licenses/LICENSE-2.0
+@
+@ Unless required by applicable law or agreed to in writing, software
+@ distributed under the License is distributed on an "AS IS" BASIS,
+@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ See the License for the specific language governing permissions and
+@ limitations under the License.
+@
+
+@
+@ NE10 Library : math/NE10_identitymat.neon.s
+@
+
+
+
+
+ .text
+ .syntax unified
+
+.include "NE10header.s"
+
+
+
+
+ .balign 4
+ .global identitymat_2x2f_neon
+ .thumb
+ .thumb_func
+
+identitymat_2x2f_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ ne10_result_t identitymat_2x2f_neon(ne10_mat2x2f_t * dst,
+ @ ne10_uint32_t count)
+ @
+ @ Writes `count` 2x2 identity matrices { 1, 0, 0, 1 } to dst and
+ @ returns 0 in r0.
+ @
+ @ r0: *dst & current dst entry's address
+ @ r1: count on entry, then the number of matrices left for the main
+ @ loop (count rounded down to a multiple of 4)
+ @
+ @ r2: count % 4, the matrices left to be written one at a time at
+ @ the end of the output array
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ and r2, r1, #3 @ r2 = count % 4;
+ sub r1, r1, r2 @ r1 = count - r2; this part is handled by the main loop
+
+ vmov.f32 d2, 0.0
+ vmov.f32 d3, 0.0 @ q1 = { 0, 0, 0, 0 }
+ vmov.f32 d0, 1.0
+ vmov.f32 d1, 1.0 @ q0 = { 1, 1, 1, 1 }
+
+
+ vmov q3, q0 @ q3 = all ones
+ vmov q2, q1 @ q2 = all zeros
+
+ cmp r1, #0
+ beq .L_check_mat2x2
+
+.L_mainloop_mat2x2:
+
+ subs r1, r1, #4 @ the NEON stores below leave the flags untouched
+
+ @ vst4 interleaves its four registers, so every store emits the sequence
+ @ 1, 0, 0, 1, 1, 0, 0, 1 = two identity matrices; two stores = four matrices
+ vst4.32 {d0, d2, d4, d6}, [r0]!
+ vst4.32 {d1, d3, d5, d7}, [r0]!
+
+ bgt .L_mainloop_mat2x2 @ loop if r1 > 0, if we have at least another 4 matrices (16 floats) to write
+
+.L_mainloopend_mat2x2:
+
+.L_check_mat2x2:
+ @ check if anything left to process at the end of the output array
+ cmp r2, #0
+ ble .L_return_mat2x2
+
+.L_secondloop_mat2x2:
+ @ write the last few matrices one at a time
+
+ subs r2, r2, #1
+
+ vst4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]! @ lane 0 of each register = 1, 0, 0, 1
+
+ bgt .L_secondloop_mat2x2
+
+.L_return_mat2x2:
+ @ return
+ mov r0, #0
+ bx lr
+
+
+
+
+ .align 2
+ .global identitymat_3x3f_neon
+ .thumb
+ .thumb_func
+identitymat_3x3f_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ ne10_result_t identitymat_3x3f_neon(ne10_mat3x3f_t * dst,
+ @ ne10_uint32_t count)
+ @
+ @ Writes `count` 3x3 identity matrices to dst and returns 0 in r0.
+ @
+ @ r0: *dst & current dst entry's address
+ @ r1: count on entry, then the number of matrices left for the main
+ @ loop (count rounded down to a multiple of 4)
+ @
+ @ r2: count % 4, the matrices left to be written one at a time at
+ @ the end of the output array
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ and r2, r1, #3 @ r2 = count % 4;
+ sub r1, r1, r2 @ r1 = count - r2; this part is handled by the main loop
+
+ vmov.f32 d2, 0.0
+ vmov.f32 d3, 0.0 @ q1 = { 0, 0, 0, 0 }
+ vmov.f32 d0, 1.0
+ vmov.f32 d1, 1.0 @ q0 = { 1, 1, 1, 1 }
+
+ @ clear q8-q13 while q1 is still all zeros
+ vmov q8 , q1
+ vmov q9 , q1
+ vmov q10, q1
+ vmov q11, q1
+ vmov q12, q1
+ vmov q13, q1
+
+ vtrn.32 d2, d0 @ d0 = {0.0f, 1.0f}
+ vtrn.32 d1, d3 @ d1 = {1.0f, 0.0f}
+
+ @ place the ones so that the interleaving stores emit 1,0,0, 0,1,0, 0,0,1
+ vmov d16, d1 @ first matrix: d16 = {1, 0}
+ vmov d18, d0 @ d18 = {0, 1}
+ vmov d21, d1 @ d21[0] = 1
+ vmov d22, d1 @ second matrix: d22 = {1, 0}
+ vmov d24, d0 @ d24 = {0, 1}
+ vmov d27, d1 @ d27[0] = 1
+
+ cmp r1, #0
+ beq .L_check_mat3x3
+
+.L_mainloop_mat3x3:
+
+ subs r1, r1, #2 @ each pass writes two matrices; the stores leave the flags untouched
+
+ vst3.32 { d16 , d18 , d20 }, [r0]! @ first matrix, floats 0-5
+ vst3.32 { d17[0], d19[0], d21[0]}, [r0]! @ first matrix, floats 6-8
+ vst3.32 { d22 , d24 , d26 }, [r0]! @ second matrix, floats 0-5
+ vst3.32 { d23[0], d25[0], d27[0]}, [r0]! @ second matrix, floats 6-8
+
+ bgt .L_mainloop_mat3x3 @ loop if r1 > 0, if we have at least another 2 matrices (18 floats) to write
+
+.L_mainloopend_mat3x3:
+
+.L_check_mat3x3:
+ @ check if anything left to process at the end of the output array
+ cmp r2, #0
+ ble .L_return_mat3x3
+
+.L_secondloop_mat3x3:
+ @ write the last few matrices one at a time
+
+ subs r2, r2, #1
+
+ vst3.32 { d16 , d18 , d20 }, [r0]!
+ vst3.32 { d17[0], d19[0], d21[0]}, [r0]!
+
+ bgt .L_secondloop_mat3x3
+
+.L_return_mat3x3:
+ @ return
+ mov r0, #0
+ bx lr
+
+
+
+
+ .align 2
+ .global identitymat_4x4f_neon
+ .thumb
+ .thumb_func
+identitymat_4x4f_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ ne10_result_t identitymat_4x4f_neon(ne10_mat4x4f_t * dst,
+ @ ne10_uint32_t count)
+ @
+ @ Writes `count` 4x4 identity matrices to dst and returns 0 in r0.
+ @
+ @ r0: *dst & current dst entry's address
+ @ r1: count on entry, then the number of matrices left for the main
+ @ loop (count rounded down to a multiple of 4)
+ @
+ @ r2: count % 4, the matrices left to be written one at a time at
+ @ the end of the output array
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ and r2, r1, #3 @ r2 = count % 4;
+ sub r1, r1, r2 @ r1 = count - r2; this part is handled by the main loop
+
+ vmov.f32 d2, 0.0
+ vmov.f32 d3, 0.0 @ q1 = { 0, 0, 0, 0 }
+ vmov.f32 d0, 1.0
+ vmov.f32 d1, 1.0 @ q0 = { 1, 1, 1, 1 }
+
+ @ clear q8-q15 while q1 is still all zeros
+ vmov q8 , q1
+ vmov q9 , q1
+ vmov q10, q1
+ vmov q11, q1
+ vmov q12, q1
+ vmov q13, q1
+ vmov q14, q1
+ vmov q15, q1
+
+ vtrn.32 d2, d0 @ d0 = {0.0f, 1.0f}
+ vtrn.32 d1, d3 @ d1 = {1.0f, 0.0f}
+
+ @ first matrix: the interleaving stores emit 1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1
+ vmov d16, d1 @ d16 = {1, 0}
+ vmov d18, d0 @ d18 = {0, 1}
+ vmov d21, d1 @ d21 = {1, 0}
+ vmov d23, d0 @ d23 = {0, 1}
+
+ @ second matrix: same pattern in q12-q15
+ vmov d24, d1
+ vmov d26, d0
+ vmov d29, d1
+ vmov d31, d0
+
+ cmp r1, #0
+ beq .L_check_mat4x4
+
+.L_mainloop_mat4x4:
+
+ subs r1, r1, #2 @ each pass writes two matrices; the stores leave the flags untouched
+
+ vst4.32 { d16 , d18 , d20 , d22 }, [r0]! @ first matrix, floats 0-7
+ vst4.32 { d17 , d19 , d21 , d23 }, [r0]! @ first matrix, floats 8-15
+ vst4.32 { d24 , d26 , d28 , d30 }, [r0]! @ second matrix, floats 0-7
+ vst4.32 { d25 , d27 , d29 , d31 }, [r0]! @ second matrix, floats 8-15
+
+ bgt .L_mainloop_mat4x4 @ loop if r1 > 0, if we have at least another 2 matrices (32 floats) to write
+
+.L_mainloopend_mat4x4:
+
+.L_check_mat4x4:
+ @ check if anything left to process at the end of the output array
+ cmp r2, #0
+ ble .L_return_mat4x4
+
+.L_secondloop_mat4x4:
+ @ write the last few matrices one at a time
+
+ subs r2, r2, #1
+
+ vst4.32 { d16 , d18 , d20 , d22 }, [r0]!
+ vst4.32 { d17 , d19 , d21 , d23 }, [r0]!
+
+
+ bgt .L_secondloop_mat4x4
+
+.L_return_mat4x4:
+ @ return
+ mov r0, #0
+ bx lr
+
--- /dev/null
+/*
+ * Copyright 2011-12 ARM Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdio.h>
+
+#include "NE10_math.h"
+
+/*
+ * Binds the NE10 math function pointers assigned below to either their NEON
+ * or their plain C implementation.
+ *
+ * is_NEON_available: pass NE10_OK to select the *_neon routines; any other
+ * value selects the *_c routines.
+ *
+ * Returns NE10_OK once the pointers have been assigned.
+ */
+ne10_result_t NE10_init_math (int is_NEON_available)
+{
+ if (NE10_OK == is_NEON_available)
+ {
+ addc_float = addc_float_neon;
+ addc_vec2f = addc_vec2f_neon;
+ addc_vec3f = addc_vec3f_neon;
+ addc_vec4f = addc_vec4f_neon;
+ subc_float = subc_float_neon;
+ subc_vec2f = subc_vec2f_neon;
+ subc_vec3f = subc_vec3f_neon;
+ subc_vec4f = subc_vec4f_neon;
+ rsbc_float = rsbc_float_neon;
+ rsbc_vec2f = rsbc_vec2f_neon;
+ rsbc_vec3f = rsbc_vec3f_neon;
+ rsbc_vec4f = rsbc_vec4f_neon;
+ mulc_float = mulc_float_neon;
+ mulc_vec2f = mulc_vec2f_neon;
+ mulc_vec3f = mulc_vec3f_neon;
+ mulc_vec4f = mulc_vec4f_neon;
+ divc_float = divc_float_neon;
+ divc_vec2f = divc_vec2f_neon;
+ divc_vec3f = divc_vec3f_neon;
+ divc_vec4f = divc_vec4f_neon;
+ setc_float = setc_float_neon;
+ setc_vec2f = setc_vec2f_neon;
+ setc_vec3f = setc_vec3f_neon;
+ setc_vec4f = setc_vec4f_neon;
+ mlac_float = mlac_float_neon;
+ mlac_vec2f = mlac_vec2f_neon;
+ mlac_vec3f = mlac_vec3f_neon;
+ mlac_vec4f = mlac_vec4f_neon;
+ add_float = add_float_neon;
+ sub_float = sub_float_neon;
+ mul_float = mul_float_neon;
+ div_float = div_float_neon;
+ mla_float = mla_float_neon;
+ abs_float = abs_float_neon;
+ len_vec2f = len_vec2f_neon;
+ len_vec3f = len_vec3f_neon;
+ len_vec4f = len_vec4f_neon;
+ normalize_vec2f = normalize_vec2f_neon;
+ normalize_vec3f = normalize_vec3f_neon;
+ normalize_vec4f = normalize_vec4f_neon;
+
+ abs_vec2f = abs_vec2f_neon;
+ abs_vec3f = abs_vec3f_neon;
+ abs_vec4f = abs_vec4f_neon;
+ vmul_vec2f = vmul_vec2f_neon;
+ vmul_vec3f = vmul_vec3f_neon;
+ vmul_vec4f = vmul_vec4f_neon;
+ vdiv_vec2f = vdiv_vec2f_neon;
+ vdiv_vec3f = vdiv_vec3f_neon;
+ vdiv_vec4f = vdiv_vec4f_neon;
+ vmla_vec2f = vmla_vec2f_neon;
+ vmla_vec3f = vmla_vec3f_neon;
+ vmla_vec4f = vmla_vec4f_neon;
+ add_vec2f = add_vec2f_neon;
+ add_vec3f = add_vec3f_neon;
+ add_vec4f = add_vec4f_neon;
+ sub_vec2f = sub_vec2f_neon;
+ sub_vec3f = sub_vec3f_neon;
+ sub_vec4f = sub_vec4f_neon;
+ dot_vec2f = dot_vec2f_neon;
+ dot_vec3f = dot_vec3f_neon;
+ dot_vec4f = dot_vec4f_neon;
+ cross_vec3f = cross_vec3f_neon;
+
+ addmat_2x2f = addmat_2x2f_neon;
+ addmat_3x3f = addmat_3x3f_neon;
+ addmat_4x4f = addmat_4x4f_neon;
+ submat_2x2f = submat_2x2f_neon;
+ submat_3x3f = submat_3x3f_neon;
+ submat_4x4f = submat_4x4f_neon;
+ mulmat_2x2f = mulmat_2x2f_neon;
+ mulmat_3x3f = mulmat_3x3f_neon;
+ mulmat_4x4f = mulmat_4x4f_neon;
+ mulcmatvec_cm2x2f_v2f = mulcmatvec_cm2x2f_v2f_neon;
+ mulcmatvec_cm3x3f_v3f = mulcmatvec_cm3x3f_v3f_neon;
+ mulcmatvec_cm4x4f_v4f = mulcmatvec_cm4x4f_v4f_neon;
+ detmat_2x2f = detmat_2x2f_neon;
+ detmat_3x3f = detmat_3x3f_neon;
+ detmat_4x4f = detmat_4x4f_neon;
+ invmat_2x2f = invmat_2x2f_neon;
+ invmat_3x3f = invmat_3x3f_neon;
+ invmat_4x4f = invmat_4x4f_neon;
+ transmat_4x4f = transmat_4x4f_neon;
+ identitymat_4x4f = identitymat_4x4f_neon;
+ transmat_3x3f = transmat_3x3f_neon;
+ identitymat_3x3f = identitymat_3x3f_neon;
+ transmat_2x2f = transmat_2x2f_neon;
+ identitymat_2x2f = identitymat_2x2f_neon;
+ }
+ else
+ {
+ addc_float = addc_float_c;
+ addc_vec2f = addc_vec2f_c;
+ addc_vec3f = addc_vec3f_c;
+ addc_vec4f = addc_vec4f_c;
+ subc_float = subc_float_c;
+ subc_vec2f = subc_vec2f_c;
+ subc_vec3f = subc_vec3f_c;
+ subc_vec4f = subc_vec4f_c;
+ rsbc_float = rsbc_float_c;
+ rsbc_vec2f = rsbc_vec2f_c;
+ rsbc_vec3f = rsbc_vec3f_c;
+ rsbc_vec4f = rsbc_vec4f_c;
+ mulc_float = mulc_float_c;
+ mulc_vec2f = mulc_vec2f_c;
+ mulc_vec3f = mulc_vec3f_c;
+ mulc_vec4f = mulc_vec4f_c;
+ divc_float = divc_float_c;
+ divc_vec2f = divc_vec2f_c;
+ divc_vec3f = divc_vec3f_c;
+ divc_vec4f = divc_vec4f_c;
+ setc_float = setc_float_c;
+ setc_vec2f = setc_vec2f_c;
+ setc_vec3f = setc_vec3f_c;
+ setc_vec4f = setc_vec4f_c;
+ mlac_float = mlac_float_c;
+ mlac_vec2f = mlac_vec2f_c;
+ mlac_vec3f = mlac_vec3f_c;
+ mlac_vec4f = mlac_vec4f_c;
+ add_float = add_float_c;
+ sub_float = sub_float_c;
+ mul_float = mul_float_c;
+ div_float = div_float_c;
+ mla_float = mla_float_c;
+ abs_float = abs_float_c;
+ len_vec2f = len_vec2f_c;
+ len_vec3f = len_vec3f_c;
+ len_vec4f = len_vec4f_c;
+ normalize_vec2f = normalize_vec2f_c;
+ normalize_vec3f = normalize_vec3f_c;
+ normalize_vec4f = normalize_vec4f_c;
+
+ abs_vec2f = abs_vec2f_c;
+ abs_vec3f = abs_vec3f_c;
+ abs_vec4f = abs_vec4f_c;
+ vmul_vec2f = vmul_vec2f_c;
+ vmul_vec3f = vmul_vec3f_c;
+ vmul_vec4f = vmul_vec4f_c;
+ vdiv_vec2f = vdiv_vec2f_c;
+ vdiv_vec3f = vdiv_vec3f_c;
+ vdiv_vec4f = vdiv_vec4f_c;
+ vmla_vec2f = vmla_vec2f_c;
+ vmla_vec3f = vmla_vec3f_c;
+ vmla_vec4f = vmla_vec4f_c;
+ add_vec2f = add_vec2f_c;
+ add_vec3f = add_vec3f_c;
+ add_vec4f = add_vec4f_c;
+ sub_vec2f = sub_vec2f_c;
+ sub_vec3f = sub_vec3f_c;
+ sub_vec4f = sub_vec4f_c;
+ dot_vec2f = dot_vec2f_c;
+ dot_vec3f = dot_vec3f_c;
+ dot_vec4f = dot_vec4f_c;
+ cross_vec3f = cross_vec3f_c;
+
+ addmat_2x2f = addmat_2x2f_c;
+ addmat_3x3f = addmat_3x3f_c;
+ addmat_4x4f = addmat_4x4f_c;
+ submat_2x2f = submat_2x2f_c;
+ submat_3x3f = submat_3x3f_c;
+ submat_4x4f = submat_4x4f_c;
+ mulmat_2x2f = mulmat_2x2f_c;
+ mulmat_3x3f = mulmat_3x3f_c;
+ mulmat_4x4f = mulmat_4x4f_c;
+ mulcmatvec_cm2x2f_v2f = mulcmatvec_cm2x2f_v2f_c;
+ mulcmatvec_cm3x3f_v3f = mulcmatvec_cm3x3f_v3f_c;
+ mulcmatvec_cm4x4f_v4f = mulcmatvec_cm4x4f_v4f_c;
+ detmat_2x2f = detmat_2x2f_c;
+ detmat_3x3f = detmat_3x3f_c;
+ detmat_4x4f = detmat_4x4f_c;
+ invmat_2x2f = invmat_2x2f_c;
+ invmat_3x3f = invmat_3x3f_c;
+ invmat_4x4f = invmat_4x4f_c;
+ transmat_4x4f = transmat_4x4f_c;
+ identitymat_4x4f = identitymat_4x4f_c;
+ transmat_3x3f = transmat_3x3f_c;
+ identitymat_3x3f = identitymat_3x3f_c;
+ transmat_2x2f = transmat_2x2f_c;
+ identitymat_2x2f = identitymat_2x2f_c;
+ }
+
+ /* The function is declared to return ne10_result_t, so it must not fall off
+ * the end: a caller that reads the result would otherwise get an
+ * indeterminate value. */
+ return NE10_OK;
+}
+
+// These are actual definitions of our function pointers that are declared in inc/NE10_types.h
+ne10_result_t (*addc_float) (ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
+ne10_result_t (*addc_vec2f) (ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
+ne10_result_t (*addc_vec3f) (ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
+ne10_result_t (*addc_vec4f) (ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
+ne10_result_t (*subc_float) (ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
+ne10_result_t (*subc_vec2f) (ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
+ne10_result_t (*subc_vec3f) (ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
+ne10_result_t (*subc_vec4f) (ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
+ne10_result_t (*rsbc_float) (ne10_float32_t * dst, ne10_float32_t *src, const ne10_float32_t cst, ne10_uint32_t count);
+ne10_result_t (*rsbc_vec2f) (ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
+ne10_result_t (*rsbc_vec3f) (ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
+ne10_result_t (*rsbc_vec4f) (ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
+ne10_result_t (*mulc_float) (ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
+ne10_result_t (*mulc_vec2f) (ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
+ne10_result_t (*mulc_vec3f) (ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
+ne10_result_t (*mulc_vec4f) (ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
+ne10_result_t (*divc_float) (ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
+ne10_result_t (*divc_vec2f) (ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
+ne10_result_t (*divc_vec3f) (ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
+ne10_result_t (*divc_vec4f) (ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
+ne10_result_t (*setc_float) (ne10_float32_t * dst, const ne10_float32_t cst, ne10_uint32_t count);
+ne10_result_t (*setc_vec2f) (ne10_vec2f_t * dst, const ne10_vec2f_t * cst, ne10_uint32_t count);
+ne10_result_t (*setc_vec3f) (ne10_vec3f_t * dst, const ne10_vec3f_t * cst, ne10_uint32_t count);
+ne10_result_t (*setc_vec4f) (ne10_vec4f_t * dst, const ne10_vec4f_t * cst, ne10_uint32_t count);
+ne10_result_t (*mlac_float) (ne10_float32_t * dst, ne10_float32_t * acc, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count);
+ne10_result_t (*mlac_vec2f) (ne10_vec2f_t * dst, ne10_vec2f_t * acc, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count);
+ne10_result_t (*mlac_vec3f) (ne10_vec3f_t * dst, ne10_vec3f_t * acc, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count);
+ne10_result_t (*mlac_vec4f) (ne10_vec4f_t * dst, ne10_vec4f_t * acc, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count);
+ne10_result_t (*add_float) (ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
+ne10_result_t (*sub_float) (ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
+ne10_result_t (*mul_float) (ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
+ne10_result_t (*div_float) (ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
+ne10_result_t (*mla_float) (ne10_float32_t * dst, ne10_float32_t * acc, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count);
+ne10_result_t (*abs_float) (ne10_float32_t * dst, ne10_float32_t * src, ne10_uint32_t count);
+ne10_result_t (*len_vec2f) (ne10_float32_t * dst, ne10_vec2f_t * src, ne10_uint32_t count);
+ne10_result_t (*len_vec3f) (ne10_float32_t * dst, ne10_vec3f_t * src, ne10_uint32_t count);
+ne10_result_t (*len_vec4f) (ne10_float32_t * dst, ne10_vec4f_t * src, ne10_uint32_t count);
+ne10_result_t (*normalize_vec2f) (ne10_vec2f_t * dst, ne10_vec2f_t * src, ne10_uint32_t count);
+ne10_result_t (*normalize_vec3f) (ne10_vec3f_t * dst, ne10_vec3f_t * src, ne10_uint32_t count);
+ne10_result_t (*normalize_vec4f) (ne10_vec4f_t * dst, ne10_vec4f_t * src, ne10_uint32_t count);
+/*
+ * Function-pointer variables for the per-element vector routines: abs, vmul,
+ * vdiv, vmla, add, sub, dot and cross, in 2-, 3- and 4-component float
+ * variants (cross exists for 3 components only). Every pointer takes its
+ * array arguments followed by the element count and returns an ne10_result_t.
+ * The vmla_* pointers take an `acc` array where the others take `dst`, and
+ * the dot_* pointers write plain floats.
+ * NOTE(review): presumably each pointer is bound at start-up to a C, ASM or
+ * NEON implementation; the code that assigns them is not visible here.
+ */
+ne10_result_t (*abs_vec2f) (ne10_vec2f_t * dst, ne10_vec2f_t * src, ne10_uint32_t count);
+ne10_result_t (*abs_vec3f) (ne10_vec3f_t * dst, ne10_vec3f_t * src, ne10_uint32_t count);
+ne10_result_t (*abs_vec4f) (ne10_vec4f_t * dst, ne10_vec4f_t * src, ne10_uint32_t count);
+ne10_result_t (*vmul_vec2f) (ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+ne10_result_t (*vmul_vec3f) (ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+ne10_result_t (*vmul_vec4f) (ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+ne10_result_t (*vdiv_vec2f) (ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+ne10_result_t (*vdiv_vec3f) (ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+ne10_result_t (*vdiv_vec4f) (ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+ne10_result_t (*vmla_vec2f) (ne10_vec2f_t * acc, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+ne10_result_t (*vmla_vec3f) (ne10_vec3f_t * acc, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+ne10_result_t (*vmla_vec4f) (ne10_vec4f_t * acc, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+ne10_result_t (*add_vec2f) (ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+ne10_result_t (*add_vec3f) (ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+ne10_result_t (*add_vec4f) (ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+ne10_result_t (*sub_vec2f) (ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+ne10_result_t (*sub_vec3f) (ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+ne10_result_t (*sub_vec4f) (ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+ne10_result_t (*dot_vec2f) (ne10_float32_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count);
+ne10_result_t (*dot_vec3f) (ne10_float32_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+ne10_result_t (*dot_vec4f) (ne10_float32_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count);
+ne10_result_t (*cross_vec3f) (ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count);
+/*
+ * Function-pointer variables for the matrix routines on arrays of 2x2, 3x3
+ * and 4x4 float matrices: addmat, submat, mulmat (two source arrays),
+ * mulcmatvec (one constant matrix applied to an array of vectors), detmat
+ * (writes one float per matrix), invmat, transmat and identitymat (takes no
+ * source array). The last argument of every pointer is the element count.
+ * NOTE(review): presumably each pointer is bound at start-up to a C, ASM or
+ * NEON implementation; the code that assigns them is not visible here.
+ */
+ne10_result_t (*addmat_2x2f) (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
+ne10_result_t (*addmat_3x3f) (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
+ne10_result_t (*addmat_4x4f) (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
+ne10_result_t (*submat_2x2f) (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
+ne10_result_t (*submat_3x3f) (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
+ne10_result_t (*submat_4x4f) (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
+ne10_result_t (*mulmat_2x2f) (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count);
+ne10_result_t (*mulmat_3x3f) (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count);
+ne10_result_t (*mulmat_4x4f) (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count);
+ne10_result_t (*mulcmatvec_cm4x4f_v4f) (ne10_vec4f_t * dst, const ne10_mat4x4f_t * cst, ne10_vec4f_t * src, ne10_uint32_t count);
+ne10_result_t (*mulcmatvec_cm3x3f_v3f) (ne10_vec3f_t * dst, const ne10_mat3x3f_t * cst, ne10_vec3f_t * src, ne10_uint32_t count);
+ne10_result_t (*mulcmatvec_cm2x2f_v2f) (ne10_vec2f_t * dst, const ne10_mat2x2f_t * cst, ne10_vec2f_t * src, ne10_uint32_t count);
+ne10_result_t (*detmat_4x4f) (ne10_float32_t * dst, ne10_mat4x4f_t * src, ne10_uint32_t count);
+ne10_result_t (*detmat_3x3f) (ne10_float32_t * dst, ne10_mat3x3f_t * src, ne10_uint32_t count);
+ne10_result_t (*detmat_2x2f) (ne10_float32_t * dst, ne10_mat2x2f_t * src, ne10_uint32_t count);
+ne10_result_t (*invmat_4x4f) (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src, ne10_uint32_t count);
+ne10_result_t (*invmat_3x3f) (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src, ne10_uint32_t count);
+ne10_result_t (*invmat_2x2f) (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src, ne10_uint32_t count);
+ne10_result_t (*transmat_4x4f) (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src, ne10_uint32_t count);
+ne10_result_t (*identitymat_4x4f) (ne10_mat4x4f_t * dst, ne10_uint32_t count);
+ne10_result_t (*transmat_3x3f) (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src, ne10_uint32_t count);
+ne10_result_t (*identitymat_3x3f) (ne10_mat3x3f_t * dst, ne10_uint32_t count);
+ne10_result_t (*transmat_2x2f) (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src, ne10_uint32_t count);
+ne10_result_t (*identitymat_2x2f) (ne10_mat2x2f_t * dst, ne10_uint32_t count);
--- /dev/null
+@
+@ Copyright 2011-12 ARM Limited
+@
+@ Licensed under the Apache License, Version 2.0 (the "License");
+@ you may not use this file except in compliance with the License.
+@ You may obtain a copy of the License at
+@
+@ http://www.apache.org/licenses/LICENSE-2.0
+@
+@ Unless required by applicable law or agreed to in writing, software
+@ distributed under the License is distributed on an "AS IS" BASIS,
+@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ See the License for the specific language governing permissions and
+@ limitations under the License.
+@
+
+@
+@ NE10 Library : math/NE10_invmat.asm.s
+@
--- /dev/null
+/*
+ * Copyright 2011-12 ARM Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NE10 Library : math/NE10_invmat.c
+ */
+
+#include "NE10_types.h"
+#include "macros.h"
+#include "NE10_detmat.c.h"
+#include <math.h>
+
+#include <assert.h>
+
+// Evaluates to 1 when |x| < 1e-12 (a value small enough to be considered nearly zero) and to 0 otherwise; fabs() and the 1e-12 literal are both double precision.
+#define IS_FLOAT_NEAR_ZERO(x) ( ((fabs(x))<(1e-12)) ? 1 : 0 )
+
+/*
+ * Inverts `count` 2x2 matrices, reading src[ i ] and writing dst[ i ].
+ *
+ * For each matrix the determinant is obtained with DET2x2. A determinant
+ * whose magnitude is below 1e-12 (IS_FLOAT_NEAR_ZERO) is replaced by 1.0f, so
+ * a (near-)singular input yields its adjugate instead of a division by zero.
+ *
+ * The four source entries are copied into locals before dst[ i ] is written.
+ * Without that snapshot a call with dst == src overwrites c1.r1 with
+ * det * c2.r2 and then uses the overwritten value to compute c2.r2. The 3x3
+ * and 4x4 variants already gather their inputs into temporaries before the
+ * first write to dst.
+ *
+ * NOTE(review): `itr`, the loop over the array and the return statement are
+ * expected to come from NE10_DETMAT_OPERATION_X_C, whose definition is not
+ * visible here -- confirm.
+ */
+ne10_result_t invmat_2x2f_c (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src, ne10_uint32_t count)
+{
+    ne10_float32_t det = 0.0f;
+
+    /* Declared outside the macro call, one per statement: a top-level comma
+       inside the macro argument would be taken as an argument separator. */
+    ne10_float32_t c1r1 = 0.0f;
+    ne10_float32_t c1r2 = 0.0f;
+    ne10_float32_t c2r1 = 0.0f;
+    ne10_float32_t c2r2 = 0.0f;
+
+    NE10_DETMAT_OPERATION_X_C
+    (
+        /* Snapshot the input first so that dst[ itr ] may alias src[ itr ]. */
+        c1r1 = src[ itr ].c1.r1;
+        c1r2 = src[ itr ].c1.r2;
+        c2r1 = src[ itr ].c2.r1;
+        c2r2 = src[ itr ].c2.r2;
+
+        det = DET2x2 (&src[ itr ]);
+
+        if (1 == IS_FLOAT_NEAR_ZERO (det))
+        {
+            det = 1.0f;
+        }
+
+        det = 1.0f / det;
+
+        /* Swap the diagonal, negate the off-diagonal, scale by 1/det. */
+        dst[ itr ].c1.r1 = det * c2r2;
+        dst[ itr ].c1.r2 = -1 * det * c1r2;
+        dst[ itr ].c2.r1 = -1 * det * c2r1;
+        dst[ itr ].c2.r2 = det * c1r1;
+    );
+}
+/*
+ * Inverts `count` 3x3 matrices with the cofactor method, reading src[ itr ]
+ * and writing dst[ itr ].
+ *
+ * For each matrix: the determinant comes from DET3x3, a determinant whose
+ * magnitude is below 1e-12 (IS_FLOAT_NEAR_ZERO) is replaced by 1.0f, the nine
+ * 2x2 minors A..I are built from the source entries, and every dst entry is
+ * set to the determinant of one minor multiplied by 1/det, with the signs
+ * alternating in a checkerboard pattern.
+ *
+ * aa..ii are short-hands for the nine entries of src[ itr ], listed column by
+ * column (c1.r1 .. c3.r3); they are #undef'd again at the end of the function.
+ *
+ * NOTE(review): `itr`, the loop over the array and the return statement are
+ * expected to come from NE10_DETMAT_OPERATION_X_C, whose definition is not
+ * visible here -- confirm.
+ */
+ne10_result_t invmat_3x3f_c (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src, ne10_uint32_t count)
+{
+#define aa (src[ itr ].c1.r1)
+#define bb (src[ itr ].c1.r2)
+#define cc (src[ itr ].c1.r3)
+#define dd (src[ itr ].c2.r1)
+#define ee (src[ itr ].c2.r2)
+#define ff (src[ itr ].c2.r3)
+#define gg (src[ itr ].c3.r1)
+#define hh (src[ itr ].c3.r2)
+#define ii (src[ itr ].c3.r3)
+
+ ne10_float32_t det = 0.0f;
+ ne10_mat2x2f_t A, B, C, D, E, F, G, H, I;
+
+ NE10_DETMAT_OPERATION_X_C
+ (
+ det = DET3x3 (&src[ itr ]);
+
+ if (1 == IS_FLOAT_NEAR_ZERO (det))
+ {
+ det = 1.0f;
+ }
+ det = 1.0f / det;
+
+ // Build the nine 2x2 minors from the source entries; all of them are filled in before the first write to dst[ itr ]
+ createColumnMajorMatrix2x2 (&A, ee, ff, hh, ii);
+ createColumnMajorMatrix2x2 (&B, dd, ff, gg, ii);
+ createColumnMajorMatrix2x2 (&C, dd, ee, gg, hh);
+ createColumnMajorMatrix2x2 (&D, bb, cc, hh, ii);
+ createColumnMajorMatrix2x2 (&E, aa, cc, gg, ii);
+ createColumnMajorMatrix2x2 (&F, aa, bb, gg, hh);
+ createColumnMajorMatrix2x2 (&G, bb, cc, ee, ff);
+ createColumnMajorMatrix2x2 (&H, aa, cc, dd, ff);
+ createColumnMajorMatrix2x2 (&I, aa, bb, dd, ee);
+
+ dst[ itr ].c1.r1 = det * DET2x2 (&A);
+ dst[ itr ].c1.r2 = -1.0f * det * DET2x2 (&D);
+ dst[ itr ].c1.r3 = det * DET2x2 (&G);
+
+ dst[ itr ].c2.r1 = -1.0f * det * DET2x2 (&B);
+ dst[ itr ].c2.r2 = det * DET2x2 (&E);
+ dst[ itr ].c2.r3 = -1.0f * det * DET2x2 (&H);
+
+ dst[ itr ].c3.r1 = det * DET2x2 (&C);
+ dst[ itr ].c3.r2 = -1.0f * det * DET2x2 (&F);
+ dst[ itr ].c3.r3 = det * DET2x2 (&I);
+ );
+
+#undef aa
+#undef bb
+#undef cc
+#undef dd
+#undef ee
+#undef ff
+#undef gg
+#undef hh
+#undef ii
+}
+/*
+ * Inverts `count` 4x4 matrices with the cofactor method, reading src[ itr ]
+ * and writing dst[ itr ].
+ *
+ * For each matrix: the determinant comes from DET4x4, a determinant whose
+ * magnitude is below 1e-12 (IS_FLOAT_NEAR_ZERO) is replaced by 1.0f, the
+ * sixteen 3x3 minors A..P are built from the source entries, and every dst
+ * entry is set to the determinant of one minor multiplied by 1/det, with the
+ * signs alternating in a checkerboard pattern.
+ *
+ * aa..pp are short-hands for the sixteen entries of src[ itr ], listed column
+ * by column (c1.r1 .. c4.r4); they are #undef'd again at the end of the
+ * function.
+ *
+ * NOTE(review): `itr`, the loop over the array and the return statement are
+ * expected to come from NE10_DETMAT_OPERATION_X_C, whose definition is not
+ * visible here -- confirm.
+ */
+ne10_result_t invmat_4x4f_c (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src, ne10_uint32_t count)
+{
+#define aa (src[ itr ].c1.r1)
+#define bb (src[ itr ].c1.r2)
+#define cc (src[ itr ].c1.r3)
+#define dd (src[ itr ].c1.r4)
+
+#define ee (src[ itr ].c2.r1)
+#define ff (src[ itr ].c2.r2)
+#define gg (src[ itr ].c2.r3)
+#define hh (src[ itr ].c2.r4)
+
+#define ii (src[ itr ].c3.r1)
+#define jj (src[ itr ].c3.r2)
+#define kk (src[ itr ].c3.r3)
+#define ll (src[ itr ].c3.r4)
+
+#define mm (src[ itr ].c4.r1)
+#define nn (src[ itr ].c4.r2)
+#define oo (src[ itr ].c4.r3)
+#define pp (src[ itr ].c4.r4)
+
+ ne10_float32_t det = 0.0f;
+ ne10_mat3x3f_t A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P;
+
+ NE10_DETMAT_OPERATION_X_C
+ (
+ det = DET4x4 (&src[ itr ]);
+
+ if (1 == IS_FLOAT_NEAR_ZERO (det))
+ {
+ det = 1.0f;
+ }
+ det = 1.0f / det;
+
+ // Build the sixteen 3x3 minors from the source entries; all of them are filled in before the first write to dst[ itr ]
+ createColumnMajorMatrix3x3 (&A, ff, gg, hh, jj, kk, ll, nn, oo, pp);
+ createColumnMajorMatrix3x3 (&B, ee, gg, hh, ii, kk, ll, mm, oo, pp);
+ createColumnMajorMatrix3x3 (&C, ee, ff, hh, ii, jj, ll, mm, nn, pp);
+ createColumnMajorMatrix3x3 (&D, ee, ff, gg, ii, jj, kk, mm, nn, oo);
+ createColumnMajorMatrix3x3 (&E, bb, cc, dd, jj, kk, ll, nn, oo, pp);
+ createColumnMajorMatrix3x3 (&F, aa, cc, dd, ii, kk, ll, mm, oo, pp);
+ createColumnMajorMatrix3x3 (&G, aa, bb, dd, ii, jj, ll, mm, nn, pp);
+ createColumnMajorMatrix3x3 (&H, aa, bb, cc, ii, jj, kk, mm, nn, oo);
+ createColumnMajorMatrix3x3 (&I, bb, cc, dd, ff, gg, hh, nn, oo, pp);
+ createColumnMajorMatrix3x3 (&J, aa, cc, dd, ee, gg, hh, mm, oo, pp);
+ createColumnMajorMatrix3x3 (&K, aa, bb, dd, ee, ff, hh, mm, nn, pp);
+ createColumnMajorMatrix3x3 (&L, aa, bb, cc, ee, ff, gg, mm, nn, oo);
+ createColumnMajorMatrix3x3 (&M, bb, cc, dd, ff, gg, hh, jj, kk, ll);
+ createColumnMajorMatrix3x3 (&N, aa, cc, dd, ee, gg, hh, ii, kk, ll);
+ createColumnMajorMatrix3x3 (&O, aa, bb, dd, ee, ff, hh, ii, jj, ll);
+ createColumnMajorMatrix3x3 (&P, aa, bb, cc, ee, ff, gg, ii, jj, kk);
+
+
+ dst[ itr ].c1.r1 = det * DET3x3 (&A);
+ dst[ itr ].c1.r2 = -1.0f * det * DET3x3 (&E);
+ dst[ itr ].c1.r3 = det * DET3x3 (&I);
+ dst[ itr ].c1.r4 = -1.0f * det * DET3x3 (&M);
+
+ dst[ itr ].c2.r1 = -1.0f * det * DET3x3 (&B);
+ dst[ itr ].c2.r2 = det * DET3x3 (&F);
+ dst[ itr ].c2.r3 = -1.0f * det * DET3x3 (&J);
+ dst[ itr ].c2.r4 = det * DET3x3 (&N);
+
+ dst[ itr ].c3.r1 = det * DET3x3 (&C);
+ dst[ itr ].c3.r2 = -1.0f * det * DET3x3 (&G);
+ dst[ itr ].c3.r3 = det * DET3x3 (&K);
+ dst[ itr ].c3.r4 = -1.0f * det * DET3x3 (&O);
+
+ dst[ itr ].c4.r1 = -1.0f * det * DET3x3 (&D);
+ dst[ itr ].c4.r2 = det * DET3x3 (&H);
+ dst[ itr ].c4.r3 = -1.0f * det * DET3x3 (&L);
+ dst[ itr ].c4.r4 = det * DET3x3 (&P);
+ );
+
+#undef aa
+#undef bb
+#undef cc
+#undef dd
+#undef ee
+#undef ff
+#undef gg
+#undef hh
+#undef ii
+#undef jj
+#undef kk
+#undef ll
+#undef mm
+#undef nn
+#undef oo
+#undef pp
+}
--- /dev/null
+@
+@ Copyright 2011-12 ARM Limited
+@
+@ Licensed under the Apache License, Version 2.0 (the "License");
+@ you may not use this file except in compliance with the License.
+@ You may obtain a copy of the License at
+@
+@ http://www.apache.org/licenses/LICENSE-2.0
+@
+@ Unless required by applicable law or agreed to in writing, software
+@ distributed under the License is distributed on an "AS IS" BASIS,
+@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ See the License for the specific language governing permissions and
+@ limitations under the License.
+@
+
+@
+@ NE10 Library : math/NE10_invmat.neon.s
+@
+
+
+
+
+ .text
+ .syntax unified
+
+.include "NE10header.s"
+.include "NE10_detmat.neon.inc.s"
+
+
+
+
+CONST_FLOAT_ONE:
+ .word 0x3f800000 @ This is the hex value for 1.0f in IEEE-754, repeated four times so that
+ .word 0x3f800000 @ a single "vld1.32 {qN}" fills all four lanes of a Q register.
+ .word 0x3f800000 @ The inverse routines substitute it for determinants that are
+ .word 0x3f800000 @ considered near zero.
+
+CONST_FLOAT_1Em12:
+ .word 0x2B8CBCCC @ This is the hex representation of 1.0e-12 in IEEE-754 (one copy per lane)
+ .word 0x2B8CBCCC @ Any determinant whose magnitude is smaller than this value is
+ .word 0x2B8CBCCC @ considered near zero; the inverse routines then use 1.0f
+ .word 0x2B8CBCCC @ in its place instead of dividing by it.
+
+
+
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ This macro calculates the inverse of four 2x2 matrices, one matrix per lane.
+ @ It reads in the matrices from registers q8-q11 (1st..4th float of each matrix), the near-zero threshold from q0 and 1.0f from q1,
+ @ and returns its results in registers q12-q15 (1st..4th float of each inverse); q0, q1 and q8-q11 are not written.
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ .macro GET_INVERSE_2x2MATS
+ @ get the determinant of these four matrices in q15: q15 = q8*q11 - q9*q10
+ vmul.f32 q15, q8, q11
+ vmls.f32 q15, q9, q10
+
+ @ compare them to find the ones that are too small and set those to 1.0f
+ vacge.f32 q14, q15, q0 @ dst = q14: lane mask, set where |det| >= threshold
+
+ vand.f32 q13, q14, q15 @ tmp = q13: the determinant in accepted lanes, zero in the others
+ vbic.s32 q14, q1, q14 @ q14 = q1 AND NOT mask. NOTE: This must be of type S32, the type F32 only negates the sign bits
+ vorr.f32 q14, q14, q13 @ at this point q14 lanes that are too small are set to one and the rest are the determinants
+
+ @ q14 = 1.0f / q14: reciprocal estimate in q15, refined with one vrecps step
+ vrecpe.f32 q15, q14
+ vrecps.f32 q14, q15, q14
+ vmul.f32 q14, q14, q15
+
+
+ @ now multiply all the entries with q14 = { 1/det(M1), ..., 1/det(M4) }
+ vmul.f32 q12, q11, q14 @ 1st output float = 4th input float / det
+ vmul.f32 q15, q8, q14 @ 4th output float = 1st input float / det
+
+ vneg.f32 q14, q14 @ q14 = -1/det for the two remaining entries
+
+ vmul.f32 q13, q9, q14 @ 2nd output float = -(2nd input float) / det
+ vmul.f32 q14, q10, q14 @ 3rd output float = -(3rd input float) / det
+
+ .endm
+
+
+
+
+ .align 4
+ .global invmat_2x2f_neon
+ .thumb
+ .thumb_func
+
+invmat_2x2f_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ ne10_result_t invmat_2x2f_neon(ne10_mat2x2f_t * dst,
+ @ ne10_mat2x2f_t * src,
+ @ ne10_uint32_t count)
+ @
+ @ r0: *dst & current dst entry's address
+ @ r1: *src & current src entry's address
+ @ r2: int count & the number of items in the input array that can be
+ @ processed in chunks of 4 matrices
+ @
+ @ r3: the number of items that are left to be processed at the end of
+ @ the input array; r4 is scratch for the constant addresses. Always returns 0 in r0.
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ push {r4}
+
+ adr r4, CONST_FLOAT_1Em12
+ vld1.32 {q0}, [r4] @ q0 = near-zero threshold in every lane
+ adr r4, CONST_FLOAT_ONE
+ vld1.32 {q1}, [r4] @ q1 = 1.0f in every lane
+
+ and r3, r2, #3 @ r3 = count % 4;
+ sub r2, r2, r3 @ count = count - r3; r3 is what's left to be processed after the main loop
+
+ cmp r2, #0
+ beq .L_check_mat2x2
+
+ @ We load four 2x2 matrices each time, inverse them using the
+ @ provided macro above, and store the four resulting matrices
+ @ back into the memory location pointed to by the first parameter dst (r0)
+
+ @ load the 1st set of values
+ vld4.32 {d16, d18, d20, d22}, [r1]!
+ vld4.32 {d17, d19, d21, d23}, [r1]!
+ subs r2, r2, #4 @ 4 for this set
+
+ @ calculate values for the 1st set
+ GET_INVERSE_2x2MATS
+
+ ble .L_mainloopend_mat2x2 @ uses the flags of the "subs" above; the macro is made of NEON data-processing instructions only
+
+.L_mainloop_mat2x2:
+ @ store the result for the current set
+ vst4.32 {d24, d26, d28, d30}, [r0]!
+ vst4.32 {d25, d27, d29, d31}, [r0]!
+
+ @ load the next set of values
+ vld4.32 {d16, d18, d20, d22}, [r1]!
+ vld4.32 {d17, d19, d21, d23}, [r1]!
+ subs r2, r2, #4
+
+ @ calculate values for the next set
+ GET_INVERSE_2x2MATS
+
+
+ bgt .L_mainloop_mat2x2 @ loop if r2 > 0, if we have at least another 4 matrices (16 floats) to process
+
+.L_mainloopend_mat2x2:
+ @ the last iteration for this call
+ @ store the result for the last set
+ vst4.32 {d24, d26, d28, d30}, [r0]!
+ vst4.32 {d25, d27, d29, d31}, [r0]!
+
+.L_check_mat2x2:
+ @ check if anything left to process at the end of the input array
+ cmp r3, #0
+ ble .L_return_mat2x2
+
+.L_secondloop_mat2x2:
+ @ process the last few items left in the input array, one matrix at a time in lane 0
+ vld4.32 {d16[0], d18[0], d20[0], d22[0]}, [r1]!
+
+ subs r3, r3, #1
+
+ @ calculate values (only lane 0 of the results is stored below)
+ GET_INVERSE_2x2MATS
+
+ @ store the results
+ vst4.32 {d24[0], d26[0], d28[0], d30[0]}, [r0]!
+
+ bgt .L_secondloop_mat2x2
+
+.L_return_mat2x2:
+ @ return
+ pop {r4}
+ mov r0, #0
+ bx lr
+
+
+
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ This macro calculates the inverse of two 3x3 matrices.
+ @ It reads in the matrices from registers q0-q5 (d0, d2, d4, d6, d8, d10, d1, d3 and d5), the near-zero threshold from d12 and 1.0f from d14,
+ @ and returns its results in registers q10-q15. d9, d11 and q8 (d16, d17) are overwritten as scratch.
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ .macro GET_INVERSE_3x3MATS
+ @ get the determinant of these two matrices in d16
+ GET_DETERMINANT_of_3x3MATS_ARGS d0, d2, d4, d6, d8, d10, d1, d3, d5, d16, d9, d11 @ stores the results in d16
+
+ @ compare them to find the ones that are too small and set those to 1.0f
+ vacge.f32 d9, d16, d12 @ dst = d9 - the lanes that are too small are set to all (0)b
+
+ vand.f32 d11, d9, d16 @ tmp = d11
+ vbic.s32 d9, d14, d9 @ NOTE: This must be of type S32, the type F32 only negates the sign bits
+ vorr.f32 d9, d9, d11 @ at this point d9 lanes that are too small are set to one and the rest are the determinants
+
+ @ d16 = 1.0f / d9: reciprocal estimate refined with one vrecps step
+ vrecpe.f32 d16, d9
+ vrecps.f32 d9, d16, d9
+ vmul.f32 d16, d9, d16
+
+ vmov.f32 d17, d16 @ So q8 = { d16={1/det(M1), 1/det(M2)}, d17={1/det(M1), 1/det(M2)} }
+
+ @ get the coefficients in q10 to q15 (the last argument of each helper is the D register that is named as its destination)
+ GET_DET_2x2MATS_ARGS d8, d10, d3, d5, d20
+ GET_NEG_DET_2x2MATS_ARGS d6, d10, d1, d5, d26
+ GET_DET_2x2MATS_ARGS d6, d8, d1, d3, d21
+
+ GET_NEG_DET_2x2MATS_ARGS d2, d4, d3, d5, d22
+ GET_DET_2x2MATS_ARGS d0, d4, d1, d5, d28
+ GET_NEG_DET_2x2MATS_ARGS d0, d2, d1, d3, d23
+
+ GET_DET_2x2MATS_ARGS d2, d4, d8, d10, d24
+ GET_NEG_DET_2x2MATS_ARGS d0, d4, d6, d10, d30
+ GET_DET_2x2MATS_ARGS d0, d2, d6, d8, d25
+
+
+
+ @ now multiply all the entries with q8 = { d16={1/det(M1), 1/det(M2)}, d17={1/det(M1), 1/det(M2)} }
+
+ vmul.f32 q10, q10, q8
+ vmul.f32 q11, q11, q8
+ vmul.f32 q12, q12, q8
+
+ vmul.f32 q13, q13, q8
+ vmul.f32 q14, q14, q8
+ vmul.f32 q15, q15, q8
+
+ .endm
+
+
+
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ This macro stores two 3x3 matrices returned by the above macro
+ @ GET_INVERSE_3x3MATS from registers q10-q15 and into the memory
+ @ address pointed to by the register r0 (dst); r0 is advanced past the 18 floats written and q10-q15 are modified.
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ .macro STORE_3x3INVMATS
+ @ rearrange the results for use in a "vst3" instruction...
+ vtrn.32 q10, q13
+ vtrn.32 q11, q14
+ vtrn.32 q12, q15
+
+ vst3.32 { d20 , d22 , d24 }, [r0]! @ 6 floats
+ vst3.32 { d21[0], d23[0], d25[0]}, [r0]! @ 3 floats, completing the first matrix
+ vst3.32 { d26 , d28 , d30 }, [r0]! @ 6 floats
+ vst3.32 { d27[0], d29[0], d31[0]}, [r0]! @ 3 floats, completing the second matrix
+ .endm
+
+
+
+
+ .align 4
+ .global invmat_3x3f_neon
+ .thumb
+ .thumb_func
+invmat_3x3f_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ ne10_result_t invmat_3x3f_neon(ne10_mat3x3f_t * dst,
+ @ ne10_mat3x3f_t * src,
+ @ ne10_uint32_t count)
+ @
+ @ Inverts count 3x3 float matrices. A determinant whose magnitude is
+ @ below 1.0e-12 is replaced by 1.0f before the reciprocal is taken.
+ @ Always returns 0 in r0.
+ @
+ @ r0: *dst & current dst entry's address
+ @ r1: *src & current src entry's address
+ @ r2: int count & the number of matrices that are processed in pairs
+ @ by the main loop
+ @
+ @ r3: the number of matrices (0 or 1) left for the single-matrix loop
+ @ at the end of the input array
+ @ r4: scratch, used to address the constant tables
+ @ q6: near-zero threshold, q7: 1.0f (both read by GET_INVERSE_3x3MATS);
+ @ q4-q7 are saved on entry and restored before returning
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ push {r4}
+ vpush {q4, q5, q6, q7}
+
+ adr r4, CONST_FLOAT_1Em12
+ vld1.32 {q6}, [r4]
+ adr r4, CONST_FLOAT_ONE
+ vld1.32 {q7}, [r4]
+
+ @ The main loop consumes two matrices per pass, so only an odd count
+ @ leaves work for the single-matrix loop. (Splitting on count % 4 sent
+ @ up to three matrices through the one-at-a-time path.)
+ and r3, r2, #1 @ r3 = count % 2;
+ sub r2, r2, r3 @ count = count - r3; r3 is what's left to be processed after the main loop
+
+ cmp r2, #0
+ beq .L_check_mat3x3
+
+ @ We load two 3x3 matrices each time, inverse them using the
+ @ provided macro above, and store the two resulting matrices
+ @ back into the memory location pointed to by the first parameter dst (r0)
+
+ @ load the 1st set of values
+ LOAD_3x3MATS_ARGS d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, q0, q1, q2, q3, q4, q5, r1
+
+ subs r2, r2, #2 @ 2 for this set
+
+ @ calculate values for the 1st set
+ GET_INVERSE_3x3MATS
+
+
+ ble .L_mainloopend_mat3x3 @ still the flags of the "subs" above
+
+.L_mainloop_mat3x3:
+ @ store the result for the current set
+ STORE_3x3INVMATS
+
+ @ load the next set of values
+ LOAD_3x3MATS_ARGS d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, q0, q1, q2, q3, q4, q5, r1
+ subs r2, r2, #2
+
+ @ calculate values for the next set
+ GET_INVERSE_3x3MATS
+
+ bgt .L_mainloop_mat3x3 @ loop if r2 > 0, if we have at least another 2 matrices (18 floats) to process
+
+.L_mainloopend_mat3x3:
+ @ the last iteration for this call
+ @ store the result for the last set
+ STORE_3x3INVMATS
+
+
+.L_check_mat3x3:
+ @ check if anything left to process at the end of the input array
+ cmp r3, #0
+ ble .L_return_mat3x3
+
+.L_secondloop_mat3x3:
+ @ process the last item left in the input array
+ @ after these two loads q0/q1/q2 hold floats {0,3,6}/{1,4,7}/{2,5,8} of the matrix
+ vld3.32 { d0, d2, d4 }, [r1]!
+ vld3.32 { d1[0], d3[0], d5[0] }, [r1]!
+
+ @ move floats 3, 4 and 5 into lane 0 of d6, d8 and d10, where the
+ @ macro expects them; the second lane of each register is don't-care
+ vtrn.32 q0, q3
+ vtrn.32 q1, q4
+ vtrn.32 q2, q5
+
+ subs r3, r3, #1
+
+ @ calculate values for the last matrix (only lane 0 is meaningful)
+ GET_INVERSE_3x3MATS
+
+ @ store the result for the last matrix
+ vtrn.32 q10, q13
+ vtrn.32 q11, q14
+ vtrn.32 q12, q15
+
+ vst3.32 { d20 , d22 , d24 }, [r0]!
+ vst3.32 { d21[0], d23[0], d25[0]}, [r0]!
+
+ bgt .L_secondloop_mat3x3
+
+.L_return_mat3x3:
+ @ return
+ vpop {q4, q5, q6, q7}
+ pop {r4}
+ mov r0, #0
+ bx lr
+
+
+
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ This macro calculates the inverse of two 4x4 matrices.
+ @ It reads in the matrices from registers q0-q7, the near-zero threshold from [r4] and 1.0f from [r5], and returns
+ @ its results in registers q8-q15. It uses 32 bytes of stack (vpush/vpop) and overwrites q0.
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ .macro GET_INVERSE_4x4MATS
+ vld1.32 {q10}, [r4] @ d20 is used as the threshold below
+ vld1.32 {q11}, [r5] @ d22 is used as 1.0f below
+
+ @ get the determinant of these two matrices in d30
+ GET_DETERMINANT_of_4x4MATS_ARGS d0, d2, d4, d6, d8, d10, d12, d14, d1, d3, d5, d7, d9, d11, d13, d15, d30, d28, d26, d31, d29, d27
+
+ @ compare them to find the ones that are too small and set those to 1.0f
+ vacge.f32 d24, d30, d20 @ dst = d24
+
+ vand.f32 d25, d24, d30 @ tmp = d25
+ vbic.s32 d24, d22, d24 @ NOTE: The instruction here must be of type S32, the type F32 only negates the sign bits
+ vorr.f32 d24, d24, d25 @ at this point all d24 lanes that are too small are set to one and the rest are the determinants
+
+ @ d30 = 1.0f / d24: reciprocal estimate refined with one vrecps step
+ vrecpe.f32 d30, d24
+ vrecps.f32 d24, d30, d24
+ vmul.f32 d30, d24, d30
+
+ vmov.f32 d31, d30 @ So q15 = { d30={1/det(M1), 1/det(M2)}, d31={1/det(M1), 1/det(M2)} }
+
+
+ @ get the coefficients; these first two belong in d30/d31 but are parked in d18/d19 because q15 holds 1/det
+ GET_DETERMINANT_of_3x3MATS_ARGS d0 , d4 , d6 , d8 , d12, d14, d1 , d5 , d7 , d18, d20, d22
+ GET_DETERMINANT_of_3x3MATS_ARGS d0 , d2 , d4 , d8 , d10, d12, d1 , d3 , d5 , d19, d20, d22
+
+ GET_DETERMINANT_of_3x3MATS_ARGS d10, d12, d14, d3 , d5 , d7 , d11, d13, d15, d16, d20, d22
+ GET_NEG_DET_3x3MATS_ARGS d8 , d12, d14, d1 , d5 , d7 , d9 , d13, d15, d24, d20, d22
+ GET_DETERMINANT_of_3x3MATS_ARGS d8 , d10, d14, d1 , d3 , d7 , d9 , d11, d15, d17, d20, d22
+ GET_NEG_DET_3x3MATS_ARGS d8 , d10, d12, d1 , d3 , d5 , d9 , d11, d13, d25, d20, d22
+
+ vpush {d16, d17, d18, d19} @ save q8 (finished) and the parked d18/d19 pair; d16/d17 are passed as the trailing helper arguments from here on
+
+ GET_NEG_DET_3x3MATS_ARGS d2 , d4 , d6 , d3 , d5 , d7 , d11, d13, d15, d18, d16, d17
+ GET_DETERMINANT_of_3x3MATS_ARGS d0 , d4 , d6 , d1 , d5 , d7 , d9 , d13, d15, d26, d16, d17
+ GET_NEG_DET_3x3MATS_ARGS d0 , d2 , d6 , d1 , d3 , d7 , d9 , d11, d15, d19, d16, d17
+ GET_DETERMINANT_of_3x3MATS_ARGS d0 , d2 , d4 , d1 , d3 , d5 , d9 , d11, d13, d27, d16, d17
+
+ GET_DETERMINANT_of_3x3MATS_ARGS d2 , d4 , d6 , d10, d12, d14, d11, d13, d15, d20, d16, d17
+ GET_NEG_DET_3x3MATS_ARGS d0 , d4 , d6 , d8 , d12, d14, d9 , d13, d15, d28, d16, d17
+ GET_DETERMINANT_of_3x3MATS_ARGS d0 , d2 , d6 , d8 , d10, d14, d9 , d11, d15, d21, d16, d17
+ GET_NEG_DET_3x3MATS_ARGS d0 , d2 , d4 , d8 , d10, d12, d9 , d11, d13, d29, d16, d17
+
+ GET_NEG_DET_3x3MATS_ARGS d2 , d4 , d6 , d10, d12, d14, d3 , d5 , d7 , d22, d16, d17
+ @@ GET_DETERMINANT_of_3x3MATS_ARGS d0 , d4 , d6 , d8 , d12, d14, d1 , d5 , d7 , d30, d16, d17 @ This is moved to the top of this section as q15 must remain unchanged
+ GET_NEG_DET_3x3MATS_ARGS d0 , d2 , d6 , d8 , d10, d14, d1 , d3 , d7 , d23, d16, d17
+ @@ GET_DETERMINANT_of_3x3MATS_ARGS d0 , d2 , d4 , d8 , d10, d12, d1 , d3 , d5 , d31, d16, d17 @ This is moved to the top of this section as q15 must remain unchanged
+
+ vpop {d16, d17} @ restore q8
+
+ @ now multiply all the entries with q15 = { d30={1/det(M1), 1/det(M2)}, d31={1/det(M1), 1/det(M2)} }
+
+ vmul.f32 q11, q11, q15
+ vmul.f32 q10, q10, q15
+ vmul.f32 q9, q9, q15
+ vmul.f32 q8, q8, q15
+
+ vpop {d0, d1} @ q0 = the parked d18/d19 pair saved by the vpush above
+
+ vmul.f32 q12, q12, q15
+ vmul.f32 q13, q13, q15
+ vmul.f32 q14, q14, q15
+ vmul.f32 q15, q0, q15 @ last, because it overwrites 1/det
+
+ .endm
+
+
+
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ This macro stores two 4x4 matrices returned by the above macro
+ @ GET_INVERSE_4x4MATS from registers q8-q15 and into the memory
+ @ address pointed to by the register r0 (dst); r0 is advanced past the 32 floats written and q8-q15 are modified.
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ .macro STORE_4x4INVMATS
+ @ rearrange the results for use in a "vst4" instruction...
+ vtrn.32 q8, q12
+ vtrn.32 q9, q13
+ vtrn.32 q10, q14
+ vtrn.32 q11, q15
+
+ vst4.32 { d16 , d18 , d20 , d22 }, [r0]! @ first matrix, 8 floats
+ vst4.32 { d17 , d19 , d21 , d23 }, [r0]! @ first matrix, remaining 8 floats
+ vst4.32 { d24 , d26 , d28 , d30 }, [r0]! @ second matrix, 8 floats
+ vst4.32 { d25 , d27 , d29 , d31 }, [r0]! @ second matrix, remaining 8 floats
+ .endm
+
+
+
+
+ .align 4
+ .global invmat_4x4f_neon
+ .thumb
+ .thumb_func
+invmat_4x4f_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ ne10_result_t invmat_4x4f_neon(ne10_mat4x4f_t * dst,
+ @ ne10_mat4x4f_t * src,
+ @ ne10_uint32_t count)
+ @
+ @ Inverts count 4x4 float matrices. A determinant whose magnitude is
+ @ below 1.0e-12 is replaced by 1.0f before the reciprocal is taken.
+ @ Always returns 0 in r0.
+ @
+ @ r0: *dst & current dst entry's address
+ @ r1: *src & current src entry's address
+ @ r2: int count & the number of matrices that are processed in pairs
+ @ by the main loop
+ @
+ @ r3: the number of matrices (0 or 1) left for the single-matrix loop
+ @ at the end of the input array
+ @ r4: address of the near-zero threshold table, r5: address of the
+ @ 1.0f table (both dereferenced inside GET_INVERSE_4x4MATS)
+ @ q4-q7 are saved on entry and restored before returning
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ push {r4, r5}
+ vpush {q4, q5, q6, q7}
+
+ adr r4, CONST_FLOAT_1Em12
+ adr r5, CONST_FLOAT_ONE
+
+ @ The main loop consumes two matrices per pass, so only an odd count
+ @ leaves work for the single-matrix loop. (Splitting on count % 4 sent
+ @ up to three matrices through the one-at-a-time path.)
+ and r3, r2, #1 @ r3 = count % 2;
+ sub r2, r2, r3 @ count = count - r3; r3 is what's left to be processed after the main loop
+
+ cmp r2, #0
+ beq .L_check_mat4x4
+
+ @ We load two 4x4 matrices each time, inverse them using the
+ @ provided macro above, and store the two resulting matrices
+ @ back into the memory location pointed to by the first parameter dst (r0)
+
+ @ load the 1st set of values
+ LOAD_4x4MATS_ARGS d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, d15, q0, q1, q2, q3, q4, q5, q6, q7, r1
+ subs r2, r2, #2 @ two for the first set
+
+ @ calculate values for the 1st set
+ GET_INVERSE_4x4MATS
+
+ ble .L_mainloopend_mat4x4 @ still the flags of the "subs" above
+
+.L_mainloop_mat4x4:
+ @ store the result for the 1st/next (e.g. 3rd) set
+ STORE_4x4INVMATS
+
+ @ load the next (e.g. 3rd) set of values
+ LOAD_4x4MATS_ARGS d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, d15, q0, q1, q2, q3, q4, q5, q6, q7, r1
+ subs r2, r2, #2
+
+ @ calculate values for the 2nd/next (e.g. 3rd) set
+ GET_INVERSE_4x4MATS
+
+
+ bgt .L_mainloop_mat4x4 @ loop if r2 > 0, if we have at least another 2 matrices (32 floats) to process
+
+.L_mainloopend_mat4x4:
+ @ the last iteration for this call
+ @ store the result for the last set
+ STORE_4x4INVMATS
+
+.L_check_mat4x4:
+ @ check if anything left to process at the end of the input array
+ cmp r3, #0
+ ble .L_return_mat4x4
+
+.L_secondloop_mat4x4:
+ @ process the last item left in the input array
+ @ after these two loads q0/q1/q2/q3 hold floats {0,4,8,12}/{1,5,9,13}/{2,6,10,14}/{3,7,11,15} of the matrix
+ vld4.32 { d0, d2, d4, d6 }, [r1]!
+ vld4.32 { d1, d3, d5, d7 }, [r1]!
+
+ @ move floats 4-7 and 12-15 into lane 0 of d8-d15, where the macro
+ @ expects them; the second lane of each register is don't-care
+ vtrn.32 q0, q4
+ vtrn.32 q1, q5
+ vtrn.32 q2, q6
+ vtrn.32 q3, q7
+
+ subs r3, r3, #1
+ @ calculate values (only lane 0 is meaningful)
+ GET_INVERSE_4x4MATS
+
+ @ store the results
+ vtrn.32 q8, q12
+ vtrn.32 q9, q13
+ vtrn.32 q10, q14
+ vtrn.32 q11, q15
+
+ vst4.32 { d16 , d18 , d20 , d22 }, [r0]!
+ vst4.32 { d17 , d19 , d21 , d23 }, [r0]!
+
+
+ bgt .L_secondloop_mat4x4
+
+.L_return_mat4x4:
+ @ return
+ vpop {q4, q5, q6, q7}
+ pop {r4, r5}
+ mov r0, #0
+ bx lr
--- /dev/null
+@
+@ Copyright 2011-12 ARM Limited
+@
+@ Licensed under the Apache License, Version 2.0 (the "License");
+@ you may not use this file except in compliance with the License.
+@ You may obtain a copy of the License at
+@
+@ http://www.apache.org/licenses/LICENSE-2.0
+@
+@ Unless required by applicable law or agreed to in writing, software
+@ distributed under the License is distributed on an "AS IS" BASIS,
+@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ See the License for the specific language governing permissions and
+@ limitations under the License.
+@
+
+@
+@ NE10 Library : math/NE10_len.asm.s
+@
+
+ .text
+ .syntax unified
+
+.include "NE10header.s"
+
+ .balign 4
+ .global len_vec2f_asm
+ .thumb
+ .thumb_func
+
+len_vec2f_asm:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ ne10_result_t len_vec2f_asm(ne10_float32_t * dst,
+ @ ne10_vec2f_t * src, ne10_uint32_t count)
+ @
+ @ r0: *dst and current destination item's address
+ @ r1: *src and current source item's address
+ @ r2: int count
+ @
+ @ r2: loop counter; both arrays are walked from the last item down to the first
+ @
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ cbz r2, .LoopEndVec2F @ nothing to do when count is zero
+ add r0, r0, r2, lsl #2 @ r0 = r0 + count * 4
+ add r1, r1, r2, lsl #3 @ r1 = r1 + count * 8
+
+.LoopBeginVec2F:
+ vldmdb r1!, {s10-s11} @ move r1 back by 8 bytes and load x, y from there
+ vmul.f32 s14, s10, s10 @ s14 = x*x
+ vmla.f32 s14, s11, s11 @ s14 = x*x + y*y
+ vsqrt.f32 s15, s14 @ s15 = sqrt( s14 )
+ vstmdb r0!, {s15} @ move r0 back by 4 bytes and store s15 there (dst[ i ]=s15)
+ subs r2, r2, #1 @ decrement the loop counter
+ bne .LoopBeginVec2F @ loop while r2 is not zero
+.LoopEndVec2F:
+ mov r0, NE10_OK @ Return NE10_OK
+ bx lr
+
+
+
+
+ .balign 4
+ .global len_vec3f_asm
+ .thumb
+ .thumb_func
+
+len_vec3f_asm:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ ne10_result_t len_vec3f_asm(ne10_float32_t * dst,
+ @ ne10_vec3f_t * src, ne10_uint32_t count)
+ @
+ @ r0: *dst and current destination item's address
+ @ r1: *src and current source item's address
+ @ r2: int count
+ @
+ @ r2: loop counter; both arrays are walked from the last item down to the first
+ @
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ cbz r2, .LoopEndVec3F @ nothing to do when count is zero
+ add r0, r0, r2, lsl #2 @ r0 = r0 + count * 4
+ add r1, r1, r2, lsl #3 @ r1 = r1 + count * 8 ...
+ add r1, r1, r2, lsl #2 @ r1 = r1 + count * 12
+
+.LoopBeginVec3F:
+ vldmdb r1!, {s10-s12} @ move r1 back by 12 bytes and load x, y, z from there
+ vmul.f32 s14, s10, s10 @ s14 = x*x
+ vmla.f32 s14, s11, s11 @ s14 = x*x + y*y
+ vmla.f32 s14, s12, s12 @ s14 = x*x + y*y + z*z
+ vsqrt.f32 s15, s14 @ s15 = sqrt( s14 )
+ vstmdb r0!, {s15} @ move r0 back by 4 bytes and store s15 there (dst[ i ]=s15)
+ subs r2, r2, #1 @ decrement the loop counter
+ bne .LoopBeginVec3F @ loop while r2 is not zero
+.LoopEndVec3F:
+ mov r0, NE10_OK @ Return NE10_OK
+ bx lr
+
+
+
+
+ .balign 4
+ .global len_vec4f_asm
+ .thumb
+ .thumb_func
+
+len_vec4f_asm:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t len_vec4f(arm_float_t * dst,
+ @ arm_vec4f_t * src, unsigned int count)
+ @
+ @ Stores sqrt(x*x + y*y + z*z + w*w) into dst for each of the count
+ @ 4-float entries of src. Both pointers are first moved past the end of
+ @ their arrays and the entries are then processed from the last one to
+ @ the first one. r0 holds NE10_OK on return.
+ @
+ @ r0: *dst and current destination item's address
+ @ r1: *src and current source item's address
+ @ r2: int count
+ @
+ @ r2: loop counter
+ @
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ cbz r2, .LoopEndVec4F @ nothing to do when count is zero
+ add r0, r0, r2, lsl #2 @ r0 = r0 + count * 4
+ add r1, r1, r2, lsl #4 @ r1 = r1 + count * 16
+
+.LoopBeginVec4F:
+ vldmdb r1!, {s10-s13} @ step src back by 16 bytes, then load the four floats found there
+ vmul.f32 s14, s10, s10 @ s14 = x*x
+ vmla.f32 s14, s11, s11 @ s14 = x*x + y*y
+ vmla.f32 s14, s12, s12 @ s14 = x*x + y*y + z*z
+ vmla.f32 s14, s13, s13 @ s14 = x*x + y*y + z*z + w*w
+ vsqrt.f32 s15, s14 @ s15 = sqrt( s14 )
+ vstmdb r0!, {s15} @ step dst back by 4 bytes, then store s15 there
+ subs r2, r2, #1 @ decrement the loop counter
+ bne .LoopBeginVec4F @ loop while r2 is non-zero
+.LoopEndVec4F:
+ mov r0, NE10_OK @ Return NE10_OK
+ bx lr
--- /dev/null
+/*
+ * Copyright 2011-12 ARM Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NE10 Library : math/NE10_len.c
+ */
+
+#include "NE10_types.h"
+#include "macros.h"
+
+#include <assert.h>
+
+#include <math.h>
+
+/*
+ * Writes the Euclidean length sqrt(x*x + y*y) of each 2D vector in src to
+ * the matching slot of dst.
+ *
+ * The per-item statement is handed to NE10_LEN_OPERATION_X_C, which is
+ * defined outside this file (macros.h); it presumably supplies the loop
+ * over itr and the return value -- confirm against the macro definition.
+ *
+ * sqrtf is used instead of sqrt so the single-precision sum is not widened
+ * to double and narrowed back for every element.
+ */
+ne10_result_t len_vec2f_c (ne10_float32_t * dst, ne10_vec2f_t * src, ne10_uint32_t count)
+{
+ NE10_LEN_OPERATION_X_C
+ (
+ dst[ itr ] = sqrtf (src[ itr ].x * src[ itr ].x +
+ src[ itr ].y * src[ itr ].y) ;
+ );
+}
+
+/*
+ * Writes the Euclidean length sqrt(x*x + y*y + z*z) of each 3D vector in
+ * src to the matching slot of dst.
+ *
+ * The per-item statement is handed to NE10_LEN_OPERATION_X_C, which is
+ * defined outside this file (macros.h); it presumably supplies the loop
+ * over itr and the return value -- confirm against the macro definition.
+ *
+ * sqrtf is used instead of sqrt so the single-precision sum is not widened
+ * to double and narrowed back for every element.
+ */
+ne10_result_t len_vec3f_c (ne10_float32_t * dst, ne10_vec3f_t * src, ne10_uint32_t count)
+{
+ NE10_LEN_OPERATION_X_C
+ (
+ dst[ itr ] = sqrtf (src[ itr ].x * src[ itr ].x +
+ src[ itr ].y * src[ itr ].y +
+ src[ itr ].z * src[ itr ].z);
+ );
+}
+
+/*
+ * Writes the Euclidean length sqrt(x*x + y*y + z*z + w*w) of each 4D
+ * vector in src to the matching slot of dst.
+ *
+ * The per-item statement is handed to NE10_LEN_OPERATION_X_C, which is
+ * defined outside this file (macros.h); it presumably supplies the loop
+ * over itr and the return value -- confirm against the macro definition.
+ *
+ * sqrtf is used instead of sqrt so the single-precision sum is not widened
+ * to double and narrowed back for every element.
+ */
+ne10_result_t len_vec4f_c (ne10_float32_t * dst, ne10_vec4f_t * src, ne10_uint32_t count)
+{
+ NE10_LEN_OPERATION_X_C
+ (
+ dst[ itr ] = sqrtf (src[ itr ].x * src[ itr ].x +
+ src[ itr ].y * src[ itr ].y +
+ src[ itr ].z * src[ itr ].z +
+ src[ itr ].w * src[ itr ].w);
+ );
+}
--- /dev/null
+@
+@ Copyright 2011-12 ARM Limited
+@
+@ Licensed under the Apache License, Version 2.0 (the "License");
+@ you may not use this file except in compliance with the License.
+@ You may obtain a copy of the License at
+@
+@ http://www.apache.org/licenses/LICENSE-2.0
+@
+@ Unless required by applicable law or agreed to in writing, software
+@ distributed under the License is distributed on an "AS IS" BASIS,
+@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ See the License for the specific language governing permissions and
+@ limitations under the License.
+@
+
+@
+@ NE10 Library : math/NE10_len.neon.s
+@
+
+ .text
+ .syntax unified
+
+.include "NE10header.s"
+
+
+
+
+ .balign 4
+ .global len_vec2f_neon
+ .thumb
+ .thumb_func
+
+len_vec2f_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t len_vec2f(arm_float_t * dst,
+ @ arm_vec2f_t * src,
+ @ unsigned int count);
+ @
+ @ Writes the length of every 2D vector in src to dst. Vectors are
+ @ handled four at a time in the main loop and one at a time in the
+ @ second loop. The square root is computed as s * rsqrt(s), where
+ @ rsqrt(s) is the vrsqrte estimate refined by one vrsqrts step.
+ @ NOTE(review): a zero sum of squares looks like it yields NaN here
+ @ (infinite estimate multiplied by zero) - confirm against the C version.
+ @
+ @ r0: *dst & the current dst entry's address
+ @ r1: *src & current src entry's address
+ @ r2: int count & the number of items in the input array that can be
+ @ processed in chunks of 4 vectors
+ @ r3: the number of items that are left to be processed at the end of
+ @ the input array
+ @
+ @ NEON scratch is limited to q0-q3 and q8: q4-q7 (d8-d15) must be
+ @ preserved across calls by the procedure call standard, and this
+ @ routine does not save any NEON registers.
+ @
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ and r3, r2, #3 @ r3 = count % 4;
+ sub r2, r2, r3 @ count = count - r3; r3 items remain for the second loop
+ cbz r2, .L_check_vec2
+
+
+ @ load values for the first iteration
+ vld2.32 {q0-q1}, [r1]!
+ subs r2, r2, #4
+
+ @ calculate sum of square of the components
+ vmul.f32 q2, q0, q0
+ vmla.f32 q2, q1, q1
+
+ ble .L_mainloopend_vec2
+
+.L_mainloop_vec2:
+
+ @ load the next set of values
+ vld2.32 {q0-q1}, [r1]!
+ subs r2, r2, #4
+
+ @ get SQRT of the last vector while loading a new vector
+ vrsqrte.f32 q3, q2 @ q3 = estimate of 1/sqrt(q2)
+ vmul.f32 q8, q2, q3
+ vrsqrts.f32 q8, q8, q3 @ q8 = refinement factor for the estimate
+ vmul.f32 q8, q3, q8 @ q8 = refined 1/sqrt(q2)
+
+ vmul.f32 q2, q2, q8 @ q2 = q2 * (1/sqrt(q2)) = sqrt(q2)
+
+ vst1.32 {q2}, [r0]!
+
+ @ calculate sum of square of the components
+
+ vmul.f32 q2, q0, q0
+ vmla.f32 q2, q1, q1
+
+ bgt .L_mainloop_vec2 @ loop if r2 > 0, i.e. at least another 4 vectors (8 floats) remain after the set just loaded
+
+.L_mainloopend_vec2:
+ @ the last iteration for this call
+
+ @ get SQRT of the last vector
+ vrsqrte.f32 q3, q2
+ vmul.f32 q8, q2, q3
+ vrsqrts.f32 q8, q8, q3
+ vmul.f32 q8, q3, q8
+
+ vmul.f32 q2, q2, q8
+
+ vst1.32 {q2}, [r0]!
+
+.L_check_vec2:
+ @ check if anything left to process at the end of the input array
+ cmp r3, #0
+ ble .L_return_vec2
+
+.L_secondloop_vec2:
+ @ process the last few items left in the input array
+ vld1.f32 d0, [r1]! @ Fill in d0 = { V.x, V.y };
+
+ subs r3, r3, #1
+
+ vmul.f32 d0, d0, d0 @ d0= { V.x^2, V.y^2 };
+ vpadd.f32 d0, d0, d0 @ d0= { V.x^2 + V.y^2, V.x^2 + V.y^2 };
+
+ @ get SQRT of the vector
+ vrsqrte.f32 d2, d0
+ vmul.f32 d1, d0, d2
+ vrsqrts.f32 d1, d1, d2
+ vmul.f32 d1, d2, d1
+
+ vmul.f32 d0, d0, d1
+
+ vst1.32 d0[0], [r0]!
+
+ bgt .L_secondloop_vec2 @ flags still come from the subs above
+
+.L_return_vec2:
+ @ return
+ mov r0, #0
+ bx lr
+
+
+
+
+ .align 2
+ .global len_vec3f_neon
+ .thumb
+ .thumb_func
+len_vec3f_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t len_vec3f(arm_float_t * dst,
+ @ arm_vec3f_t * src,
+ @ unsigned int count);
+ @
+ @ Writes the length of every 3D vector in src to dst. Vectors are
+ @ handled four at a time in the main loop and one at a time in the
+ @ second loop. The square root is computed as s * rsqrt(s), where
+ @ rsqrt(s) is the vrsqrte estimate refined by one vrsqrts step.
+ @ NOTE(review): a zero sum of squares looks like it yields NaN here
+ @ (infinite estimate multiplied by zero) - confirm against the C version.
+ @
+ @ r0: *dst & the current dst entry's address
+ @ r1: *src & current src entry's address
+ @ r2: int count & the number of items in the input array that can be
+ @ processed in chunks of 4 vectors
+ @ r3: the number of items that are left to be processed at the end of
+ @ the input array
+ @
+ @ NEON scratch is limited to q0-q3 and q8-q9: q4-q7 (d8-d15) must be
+ @ preserved across calls by the procedure call standard, and this
+ @ routine does not save any NEON registers.
+ @
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ and r3, r2, #3 @ r3 = count % 4;
+ sub r2, r2, r3 @ count = count - r3; r3 items remain for the second loop
+ cbz r2, .L_check_vec3
+
+
+ @ load values for the first iteration
+ vld3.32 {d0, d2, d4}, [r1]!
+ vld3.32 {d1, d3, d5}, [r1]!
+ subs r2, r2, #4
+
+ @ calculate sum of square of the components
+ vmul.f32 q9, q0, q0
+ vmla.f32 q9, q1, q1
+ vmla.f32 q9, q2, q2
+
+ ble .L_mainloopend_vec3
+
+.L_mainloop_vec3:
+ @ load the next set of values
+ vld3.32 {d0,d2,d4}, [r1]!
+ vld3.32 {d1,d3,d5}, [r1]!
+ subs r2, r2, #4
+
+ @ get SQRT of the last vector while loading a new vector
+ vrsqrte.f32 q3, q9 @ q3 = estimate of 1/sqrt(q9)
+ vmul.f32 q8, q9, q3
+ vrsqrts.f32 q8, q8, q3 @ q8 = refinement factor for the estimate
+ vmul.f32 q8, q3, q8 @ q8 = refined 1/sqrt(q9)
+
+ vmul.f32 q9, q9, q8 @ q9 = q9 * (1/sqrt(q9)) = sqrt(q9)
+
+ vst1.32 {q9}, [r0]!
+
+ @ calculate sum of square of the components
+ vmul.f32 q9, q0, q0
+ vmla.f32 q9, q1, q1
+ vmla.f32 q9, q2, q2
+
+ bgt .L_mainloop_vec3 @ loop if r2 > 0, i.e. at least another 4 vectors (12 floats) remain after the set just loaded
+
+.L_mainloopend_vec3:
+ @ the last iteration for this call
+
+ @ get SQRT of the last vector
+ vrsqrte.f32 q3, q9
+ vmul.f32 q8, q9, q3
+ vrsqrts.f32 q8, q8, q3
+ vmul.f32 q8, q3, q8
+
+ vmul.f32 q9, q9, q8
+
+ vst1.32 {q9}, [r0]!
+
+.L_check_vec3:
+ @ check if anything left to process at the end of the input array
+ cmp r3, #0
+ ble .L_return_vec3
+
+.L_secondloop_vec3:
+ @ process the last few items left in the input array
+ vld3.f32 {d0[0], d2[0], d4[0]}, [r1]! @ The values are loaded like so:
+ @ q0 = { V.x, -, -, - };
+ @ q1 = { V.y, -, -, - };
+ @ q2 = { V.z, -, -, - };
+ subs r3, r3, #1
+
+ vmul.f32 q0, q0, q0 @ V.x^2
+ vmla.f32 q0, q1, q1 @ V.x^2 + V.y^2
+ vmla.f32 q0, q2, q2 @ V.x^2 + V.y^2 + V.z^2
+
+ @ get SQRT of the vector
+ vrsqrte.f32 q2, q0
+ vmul.f32 q1, q0, q2
+ vrsqrts.f32 q1, q1, q2
+ vmul.f32 q1, q2, q1
+
+ vmul.f32 q0, q0, q1
+
+ vst1.32 d0[0], [r0]! @ only lane 0 carries a meaningful result
+
+ bgt .L_secondloop_vec3 @ flags still come from the subs above
+
+.L_return_vec3:
+ @ return
+ mov r0, #0
+ bx lr
+
+
+
+
+ .align 2
+ .global len_vec4f_neon
+ .thumb
+ .thumb_func
+len_vec4f_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t len_vec4f(arm_float_t * dst,
+ @ arm_vec4f_t * src,
+ @ unsigned int count);
+ @
+ @ Writes the length of every 4D vector in src to dst. Vectors are
+ @ handled four at a time in the main loop and one at a time in the
+ @ second loop. The square root is computed as s * rsqrt(s), where
+ @ rsqrt(s) is the vrsqrte estimate refined by one vrsqrts step.
+ @ NOTE(review): a zero sum of squares looks like it yields NaN here
+ @ (infinite estimate multiplied by zero) - confirm against the C version.
+ @
+ @ r0: *dst & the current dst entry's address
+ @ r1: *src & current src entry's address
+ @ r2: int count & the number of items in the input array that can be
+ @ processed in chunks of 4 vectors
+ @ r3: the number of items that are left to be processed at the end of
+ @ the input array
+ @
+ @ NEON scratch is limited to q0-q3 and q8-q10: q4-q7 (d8-d15) must be
+ @ preserved across calls by the procedure call standard, and this
+ @ routine does not save any NEON registers.
+ @
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ and r3, r2, #3 @ r3 = count % 4;
+ sub r2, r2, r3 @ count = count - r3; r3 items remain for the second loop
+ cbz r2, .L_check_vec4
+
+
+ @ load values for the first iteration
+ vld4.32 {d0, d2, d4, d6}, [r1]!
+ vld4.32 {d1, d3, d5, d7}, [r1]!
+ subs r2, r2, #4
+
+ @ calculate sum of square of the components
+ vmul.f32 q9, q0, q0
+ vmla.f32 q9, q1, q1
+ vmla.f32 q9, q2, q2
+ vmla.f32 q9, q3, q3
+
+ ble .L_mainloopend_vec4
+
+.L_mainloop_vec4:
+ @ load the next set of values
+ vld4.32 {d0, d2, d4, d6}, [r1]!
+ vld4.32 {d1, d3, d5, d7}, [r1]!
+ subs r2, r2, #4
+
+ @ get SQRT of the last vector while loading a new vector
+ vrsqrte.f32 q10, q9 @ q10 = estimate of 1/sqrt(q9)
+ vmul.f32 q8, q9, q10
+ vrsqrts.f32 q8, q8, q10 @ q8 = refinement factor for the estimate
+ vmul.f32 q8, q10, q8 @ q8 = refined 1/sqrt(q9)
+
+ vmul.f32 q9, q9, q8 @ q9 = q9 * (1/sqrt(q9)) = sqrt(q9)
+
+ vst1.32 {q9}, [r0]!
+
+ @ calculate sum of square of the components
+ vmul.f32 q9, q0, q0
+ vmla.f32 q9, q1, q1
+ vmla.f32 q9, q2, q2
+ vmla.f32 q9, q3, q3
+
+ bgt .L_mainloop_vec4 @ loop if r2 > 0, i.e. at least another 4 vectors (16 floats) remain after the set just loaded
+
+.L_mainloopend_vec4:
+ @ the last iteration for this call
+
+ @ get SQRT of the last vector
+ vrsqrte.f32 q10, q9
+ vmul.f32 q8, q9, q10
+ vrsqrts.f32 q8, q8, q10
+ vmul.f32 q8, q10, q8
+
+ vmul.f32 q9, q9, q8
+
+ vst1.32 {q9}, [r0]!
+
+.L_check_vec4:
+ @ check if anything left to process at the end of the input array
+ cmp r3, #0
+ ble .L_return_vec4
+
+.L_secondloop_vec4:
+ @ process the last few items left in the input array
+ vld4.f32 {d0[0], d2[0], d4[0], d6[0]}, [r1]! @ The values are loaded like so:
+ @ q0 = { V.x, -, -, - };
+ @ q1 = { V.y, -, -, - };
+ @ q2 = { V.z, -, -, - };
+ @ q3 = { V.w, -, -, - };
+ subs r3, r3, #1
+
+ vmul.f32 q0, q0, q0 @ V.x^2
+ vmla.f32 q0, q1, q1 @ V.x^2 + V.y^2
+ vmla.f32 q0, q2, q2 @ V.x^2 + V.y^2 + V.z^2
+ vmla.f32 q0, q3, q3 @ V.x^2 + V.y^2 + V.z^2 + V.w^2
+
+ @ get SQRT of the vector
+ vrsqrte.f32 q2, q0
+ vmul.f32 q1, q0, q2
+ vrsqrts.f32 q1, q1, q2
+ vmul.f32 q1, q2, q1
+
+ vmul.f32 q0, q0, q1
+
+ vst1.32 d0[0], [r0]! @ only lane 0 carries a meaningful result
+
+ bgt .L_secondloop_vec4 @ flags still come from the subs above
+
+.L_return_vec4:
+ @ return
+ mov r0, #0
+ bx lr
+
--- /dev/null
+@
+@ Copyright 2011-12 ARM Limited
+@
+@ Licensed under the Apache License, Version 2.0 (the "License");
+@ you may not use this file except in compliance with the License.
+@ You may obtain a copy of the License at
+@
+@ http://www.apache.org/licenses/LICENSE-2.0
+@
+@ Unless required by applicable law or agreed to in writing, software
+@ distributed under the License is distributed on an "AS IS" BASIS,
+@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ See the License for the specific language governing permissions and
+@ limitations under the License.
+@
+
+@
+@ NE10 Library : math/NE10_mla.asm.s
+@
+
+ .text
+ .syntax unified
+
+.include "NE10header.s"
+
+ .balign 4
+ .global mla_float_asm
+ .thumb
+ .thumb_func
+
+mla_float_asm:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t mla_float(arm_float_t * dst, arm_float_t * acc,
+ @ arm_float_t * src1, const arm_float_t * src2,
+ @ unsigned int count)
+ @
+ @ Computes dst[i] = acc[i] + (src1[i] * src2[i]) for count floats,
+ @ one element per iteration, walking all four arrays forwards.
+ @ r0 holds NE10_OK on return.
+ @
+ @ r0: *dst & current dst entry's address
+ @ r1: *acc & current acc entry's address
+ @ r2: *src1 & current src1 entry's address
+ @ r3: *src2 & current src2 entry's address
+ @ r4: int count (read from the stack)
+ @
+ @ r4: loop counter
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ push {r4}
+ ldr r4, [r13, #4] @ r4 = count, found just above the saved r4 ( r13 is the stack pointer (sp) )
+ cbz r4, .LoopEndFloat @ nothing to do when count is zero
+
+.LoopBeginFloat:
+ vldr s10, [r1] @ Load s10 = acc[i]
+ vldr s1, [r2] @ Load s1 = src1[i]
+ vldr s2, [r3] @ Load s2 = src2[i]
+ add r1, r1, #4 @ move to the next acc entry
+ add r2, r2, #4 @ move to the next src1 entry
+ add r3, r3, #4 @ next entry in src2
+ vmla.f32 s10, s1, s2 @ s10 = acc[i] + (src1[i] * src2[i])
+ vstr s10, [r0] @ Store the result back into the main memory
+ add r0, r0, #4 @ next entry in the dst
+ subs r4, r4, #1 @ one fewer element left to process
+ bne .LoopBeginFloat @ loop while r4 is non-zero
+
+.LoopEndFloat:
+ mov r0, NE10_OK @ Return NE10_OK
+ pop {r4}
+ bx lr
--- /dev/null
+/*
+ * Copyright 2011-12 ARM Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NE10 Library : math/NE10_mla.c
+ */
+
+#include "NE10_types.h"
+#include "macros.h"
+
+#include <assert.h>
+
+/*
+ * Element-wise multiply-accumulate on plain floats:
+ * dst[i] = acc[i] + src1[i] * src2[i].
+ *
+ * The statement below is passed to NE10_X_OPERATION_FLOAT_C, which lives
+ * outside this file (macros.h); it presumably provides the loop over itr
+ * and the function's return value -- confirm against the macro definition.
+ */
+ne10_result_t mla_float_c (ne10_float32_t * dst,
+                           ne10_float32_t * acc,
+                           ne10_float32_t * src1,
+                           ne10_float32_t * src2,
+                           ne10_uint32_t count)
+{
+    NE10_X_OPERATION_FLOAT_C
+    (
+        dst[itr] = acc[itr] + src1[itr] * src2[itr];
+    );
+}
+
+/*
+ * Component-wise multiply-accumulate on 2D vectors:
+ * dst[i] = acc[i] + src1[i] * src2[i], applied to x and y separately.
+ *
+ * The statements below are passed to NE10_X_OPERATION_FLOAT_C, which lives
+ * outside this file (macros.h); it presumably provides the loop over itr
+ * and the function's return value -- confirm against the macro definition.
+ */
+ne10_result_t vmla_vec2f_c (ne10_vec2f_t * dst,
+                            ne10_vec2f_t * acc,
+                            ne10_vec2f_t * src1,
+                            ne10_vec2f_t * src2,
+                            ne10_uint32_t count)
+{
+    NE10_X_OPERATION_FLOAT_C
+    (
+        dst[itr].x = acc[itr].x + src1[itr].x * src2[itr].x;
+        dst[itr].y = acc[itr].y + src1[itr].y * src2[itr].y;
+    );
+}
+
+/*
+ * Component-wise multiply-accumulate on 3D vectors:
+ * dst[i] = acc[i] + src1[i] * src2[i], applied to x, y and z separately.
+ *
+ * The statements below are passed to NE10_X_OPERATION_FLOAT_C, which lives
+ * outside this file (macros.h); it presumably provides the loop over itr
+ * and the function's return value -- confirm against the macro definition.
+ */
+ne10_result_t vmla_vec3f_c (ne10_vec3f_t * dst,
+                            ne10_vec3f_t * acc,
+                            ne10_vec3f_t * src1,
+                            ne10_vec3f_t * src2,
+                            ne10_uint32_t count)
+{
+    NE10_X_OPERATION_FLOAT_C
+    (
+        dst[itr].x = acc[itr].x + src1[itr].x * src2[itr].x;
+        dst[itr].y = acc[itr].y + src1[itr].y * src2[itr].y;
+        dst[itr].z = acc[itr].z + src1[itr].z * src2[itr].z;
+    );
+}
+
+/*
+ * Component-wise multiply-accumulate on 4D vectors:
+ * dst[i] = acc[i] + src1[i] * src2[i], applied to x, y, z and w separately.
+ *
+ * The statements below are passed to NE10_X_OPERATION_FLOAT_C, which lives
+ * outside this file (macros.h); it presumably provides the loop over itr
+ * and the function's return value -- confirm against the macro definition.
+ */
+ne10_result_t vmla_vec4f_c (ne10_vec4f_t * dst,
+                            ne10_vec4f_t * acc,
+                            ne10_vec4f_t * src1,
+                            ne10_vec4f_t * src2,
+                            ne10_uint32_t count)
+{
+    NE10_X_OPERATION_FLOAT_C
+    (
+        dst[itr].x = acc[itr].x + src1[itr].x * src2[itr].x;
+        dst[itr].y = acc[itr].y + src1[itr].y * src2[itr].y;
+        dst[itr].z = acc[itr].z + src1[itr].z * src2[itr].z;
+        dst[itr].w = acc[itr].w + src1[itr].w * src2[itr].w;
+    );
+}
--- /dev/null
+@
+@ Copyright 2011-12 ARM Limited
+@
+@ Licensed under the Apache License, Version 2.0 (the "License");
+@ you may not use this file except in compliance with the License.
+@ You may obtain a copy of the License at
+@
+@ http://www.apache.org/licenses/LICENSE-2.0
+@
+@ Unless required by applicable law or agreed to in writing, software
+@ distributed under the License is distributed on an "AS IS" BASIS,
+@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ See the License for the specific language governing permissions and
+@ limitations under the License.
+@
+
+@
+@ NE10 Library : math/NE10_mla.neon.s
+@
+
+ .text
+ .syntax unified
+
+.include "NE10header.s"
+
+
+
+
+ .balign 4
+ .global mla_float_neon
+ .thumb
+ .thumb_func
+
+mla_float_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t mla_float(arm_float_t * dst,
+ @ arm_float_t * acc,
+ @ arm_float_t * src1,
+ @ arm_float_t * src2,
+ @ unsigned int count)
+ @
+ @ Computes dst[i] = acc[i] + (src1[i] * src2[i]). Floats are handled
+ @ four at a time in the main loop (the store of one set is issued
+ @ between the loads of the next set) and one at a time in the second
+ @ loop. r0 holds 0 on return.
+ @
+ @ r0: *dst & current dst entry's address
+ @ r1: *acc & current acc entry's address
+ @ r2: *src1 & current src1 entry's address
+ @ r3: *src2 & current src2 entry's address
+ @ r4: int count & the number of items in the input array that can be
+ @ processed in chunks of 4 vectors
+ @
+ @ r5: the number of items that are left to be processed at the end of
+ @ the input array
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ push {r4, r5}
+ ldr r4, [r13, #8] @ r4 = count, found just above the two saved registers; r13 is the stack pointer (sp)
+
+ and r5, r4, #3 @ r5 = count % 4; this is what's left to be processed after the main loop
+ sub r4, r4, r5 @ count = count - r5
+
+ cbz r4, .L_check_float
+
+ @ load the 1st set of values
+ vld1.32 {q0}, [r2]!
+ vld1.32 {q1}, [r3]!
+ vld1.32 {q3}, [r1]!
+ subs r4, r4, #4
+
+ @ calculate values for the 1st set
+ vmla.f32 q3, q0, q1 @ q3 += q0 * q1
+
+ ble .L_mainloopend_float
+
+.L_mainloop_float:
+ @ load the next (e.g. 2nd) set of values, leave loading acc until later
+ vld1.32 {q0}, [r2]!
+ vld1.32 {q1}, [r3]!
+
+ @ store the result for the previous set (q3 is d6,d7) before q3 is reloaded
+ vst1.32 {d6,d7}, [r0]!
+
+ @ load the next (e.g. 2nd) acc, and decrease the counter
+ vld1.32 {q3}, [r1]!
+ subs r4, r4, #4
+
+ @ calculate values for the next (e.g. 2nd) set
+ vmla.f32 q3, q0, q1 @ q3 += q0 * q1
+
+ bgt .L_mainloop_float @ loop if r4 > 0, if we have at least another 4 floats
+
+.L_mainloopend_float:
+ @ the last iteration for this call
+ @ store the result for the last set of values (e.g 2nd set)
+ vst1.32 {d6,d7}, [r0]!
+
+.L_check_float:
+ @ check if anything left to process at the end of the input array
+ cmp r5, #0
+ ble .L_return_float
+
+.L_secondloop_float:
+ @ process the last few items left in the input array
+ vld1.f32 d0[0], [r2]! @ Fill in d0[0] with src1[i]
+ vld1.f32 d1[0], [r3]! @ Fill in d1[0] with src2[i]
+ vld1.f32 d2[0], [r1]! @ Fill in d2[0] with acc[i]
+
+ subs r5, r5, #1
+
+ @ d2 += d0 * d1; only lane 0 is stored below
+ vmla.f32 d2, d0, d1
+
+ vst1.32 {d2[0]}, [r0]!
+
+ bgt .L_secondloop_float @ flags still come from the subs above
+
+.L_return_float:
+ @ return
+ pop {r4, r5}
+ mov r0, #0
+ bx lr
+
+
+
+
+ .balign 4
+ .global vmla_vec2f_neon
+ .thumb
+ .thumb_func
+
+vmla_vec2f_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t vmla_vec2f(arm_vec2f_t * dst,
+ @ arm_vec2f_t * acc,
+ @ arm_vec2f_t * src1,
+ @ arm_vec2f_t * src2,
+ @ unsigned int count)
+ @
+ @ Computes dst[i] = acc[i] + (src1[i] * src2[i]) component by component.
+ @ Vectors are handled four at a time in the main loop (the store of one
+ @ set is issued between the loads of the next set) and one at a time in
+ @ the second loop. r0 holds 0 on return.
+ @
+ @ r0: *dst & current dst entry's address
+ @ r1: *acc & current acc entry's address
+ @ r2: *src1 & current src1 entry's address
+ @ r3: *src2 & current src2 entry's address
+ @ r4: int count & the number of items in the input array that can be
+ @ processed in chunks of 4 vectors
+ @
+ @ r5: the number of items that are left to be processed at the end of
+ @ the input array
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ push {r4, r5}
+ ldr r4, [r13, #8] @ r4 = count, found just above the two saved registers; r13 is the stack pointer (sp)
+
+ and r5, r4, #3 @ r5 = count % 4;
+ sub r4, r4, r5 @ count = count - r5; r5 items remain for the second loop
+
+ cbz r4, .L_check_vec2
+
+ @ load the 1st set of values
+ vld2.32 {q0-q1}, [r2]!
+ vld2.32 {q2-q3}, [r3]!
+ vld2.32 {q8-q9}, [r1]!
+ subs r4, r4, #4
+
+ @ calculate values for the 1st set
+ vmla.f32 q8, q0, q2
+ vmla.f32 q9, q1, q3
+
+ ble .L_mainloopend_vec2
+
+.L_mainloop_vec2:
+ @ load the 2nd set of values
+ vld2.32 {q0-q1}, [r2]!
+ vld2.32 {q2-q3}, [r3]!
+
+ @ store the result for the previous set (q8-q9 are d16-d19) before q8-q9 are reloaded
+ vst2.32 {d16,d17,d18,d19}, [r0]!
+
+ @ load the next (e.g. 2nd) set of acc values
+ vld2.32 {q8-q9}, [r1]!
+ subs r4, r4, #4
+
+ @ calculate values for the 2nd set
+ vmla.f32 q8, q0, q2
+ vmla.f32 q9, q1, q3
+
+ bgt .L_mainloop_vec2 @ loop if r4 > 0, if we have at least another 4 vectors (8 floats) to process
+
+.L_mainloopend_vec2:
+ @ the last iteration for this call
+ @ store the result for the last set of values
+ vst2.32 {d16,d17,d18,d19}, [r0]!
+
+.L_check_vec2:
+ @ check if anything left to process at the end of the input array
+ cmp r5, #0
+ ble .L_return_vec2
+
+.L_secondloop_vec2:
+ @ process the last few items left in the input array
+ vld1.f32 d0, [r2]! @ d0 = one src1 vector
+ vld1.f32 d1, [r3]! @ d1 = one src2 vector
+ vld1.f32 d2, [r1]! @ d2 = one acc vector
+
+ subs r5, r5, #1
+
+ @ calculate values
+ vmla.f32 d2, d0, d1
+
+ vst1.32 {d2}, [r0]!
+
+ bgt .L_secondloop_vec2 @ flags still come from the subs above
+
+.L_return_vec2:
+ @ return
+ pop {r4, r5}
+ mov r0, #0
+ bx lr
+
+
+
+
+ .align 2
+ .global vmla_vec3f_neon
+ .thumb
+ .thumb_func
+vmla_vec3f_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t vmla_vec3f(arm_vec3f_t * dst,
+ @ arm_vec3f_t * acc,
+ @ arm_vec3f_t * src1,
+ @ arm_vec3f_t * src2,
+ @ unsigned int count)
+ @
+ @ Computes dst[i] = acc[i] + (src1[i] * src2[i]) component by component.
+ @ Vectors are handled four at a time in the main loop (the store of one
+ @ set is issued between the loads of the next set) and one at a time in
+ @ the second loop. r0 holds 0 on return.
+ @
+ @ r0: *dst & current dst entry's address
+ @ r1: *acc & current acc entry's address
+ @ r2: *src1 & current src1 entry's address
+ @ r3: *src2 & current src2 entry's address
+ @ r4: int count & the number of items in the input array that can be
+ @ processed in chunks of 4 vectors
+ @
+ @ r5: the number of items that are left to be processed at the end of
+ @ the input array
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ push {r4, r5}
+ ldr r4, [r13, #8] @ r4 = count, found just above the two saved registers; r13 is the stack pointer (sp)
+
+ and r5, r4, #3 @ r5 = count % 4;
+ sub r4, r4, r5 @ count = count - r5; r5 items remain for the second loop
+
+ cmp r4, #0
+ beq .L_check_vec3
+
+ @ load the 1st set of values
+ vld3.32 {d0, d2, d4}, [r2]! @ src1: part of q0, q1, and q2
+ vld3.32 {d1, d3, d5}, [r2]! @ src1: part of q0, q1, and q2
+ vld3.32 {d18, d20, d22}, [r3]! @ src2: part of q9, q10, and q11
+ vld3.32 {d19, d21, d23}, [r3]! @ src2: part of q9, q10, and q11
+ vld3.32 {d24, d26, d28}, [r1]! @ acc: part of q12, q13, and q14
+ vld3.32 {d25, d27, d29}, [r1]! @ acc: part of q12, q13, and q14
+ subs r4, r4, #4
+
+ @ calculate values for the 1st set
+ vmla.f32 q12, q0, q9
+ vmla.f32 q13, q1, q10
+ vmla.f32 q14, q2, q11
+
+ ble .L_mainloopend_vec3
+
+.L_mainloop_vec3:
+ @ load the next (e.g. 2nd) set of values
+ vld3.32 {d0, d2, d4}, [r2]!
+ vld3.32 {d1, d3, d5}, [r2]!
+ vld3.32 {d18, d20, d22}, [r3]!
+ vld3.32 {d19, d21, d23}, [r3]!
+
+ @ store the result for the previous set before q12-q14 are reloaded
+ vst3.32 {d24, d26, d28}, [r0]!
+ vst3.32 {d25, d27, d29}, [r0]!
+
+ @ finish loading ...
+ vld3.32 {d24, d26, d28}, [r1]! @ part of q12, q13, and q14
+ vld3.32 {d25, d27, d29}, [r1]! @ part of q12, q13, and q14
+ subs r4, r4, #4
+
+ @ calculate values for the next (e.g. 2nd) set
+ vmla.f32 q12, q0, q9
+ vmla.f32 q13, q1, q10
+ vmla.f32 q14, q2, q11
+
+ bgt .L_mainloop_vec3 @ loop if r4 > 0, if we have at least another 4 vectors (12 floats) to process
+
+.L_mainloopend_vec3:
+ @ the last iteration for this call
+ @ store the result for the last set of value
+ vst3.32 {d24, d26, d28}, [r0]!
+ vst3.32 {d25, d27, d29}, [r0]!
+
+.L_check_vec3:
+ @ check if anything left to process at the end of the input array
+ cmp r5, #0
+ ble .L_return_vec3
+
+.L_secondloop_vec3:
+ @ process the last few items left in the input array
+ vld3.f32 {d0[0], d2[0], d4[0]}, [r2]! @ The values are loaded like so:
+ @ q0 = { V1.x, -, -, - };
+ @ q1 = { V1.y, -, -, - };
+ @ q2 = { V1.z, -, -, - };
+ vld3.f32 {d1[0], d3[0], d5[0]}, [r3]! @ The values are loaded like so:
+ @ q0 = { V1.x, -, V2.x, - };
+ @ q1 = { V1.y, -, V2.y, - };
+ @ q2 = { V1.z, -, V2.z, - };
+ vld3.f32 {d18[0], d20[0], d22[0]}, [r1]! @ The values are loaded like so:
+ @ q9 = { acc.x, -, -, - };
+ @ q10 = { acc.y, -, -, - };
+ @ q11 = { acc.z, -, -, - };
+
+ subs r5, r5, #1
+
+ @ calculate acc + V1 * V2 per component; only lane 0 of each result is stored
+ vmla.f32 d18, d0, d1
+ vmla.f32 d20, d2, d3
+ vmla.f32 d22, d4, d5
+
+ vst3.32 {d18[0], d20[0], d22[0]}, [r0]!
+
+ bgt .L_secondloop_vec3 @ flags still come from the subs above
+
+.L_return_vec3:
+ @ return
+ pop {r4, r5}
+ mov r0, #0
+ bx lr
+
+
+
+
+ .align 2
+ .global vmla_vec4f_neon
+ .thumb
+ .thumb_func
+vmla_vec4f_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t vmla_vec4f(arm_vec4f_t * dst,
+ @ arm_vec4f_t * acc,
+ @ arm_vec4f_t * src1,
+ @ arm_vec4f_t * src2,
+ @ unsigned int count)
+ @
+ @ Computes dst[i] = acc[i] + (src1[i] * src2[i]) component by component.
+ @ Vectors are handled four at a time in the main loop (the store of one
+ @ set is issued between the loads of the next set) and one at a time in
+ @ the second loop. r0 holds 0 on return.
+ @
+ @ r0: *dst & current dst entry's address
+ @ r1: *acc & current acc entry's address
+ @ r2: *src1 & current src1 entry's address
+ @ r3: *src2 & current src2 entry's address
+ @ r4: int count & the number of items in the input array that can be
+ @ processed in chunks of 4 vectors
+ @
+ @ r5: the number of items that are left to be processed at the end of
+ @ the input array
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ push {r4, r5}
+ ldr r4, [r13, #8] @ r4 = count, found just above the two saved registers; r13 is the stack pointer (sp)
+
+ and r5, r4, #3 @ r5 = count % 4;
+ sub r4, r4, r5 @ count = count - r5; r5 items remain for the second loop
+
+ cmp r4, #0
+ beq .L_check_vec4
+
+ @ load the 1st set of values
+ vld4.32 {d0, d2, d4, d6}, [r2]! @ src1: part of q0, q1, q2, and q3
+ vld4.32 {d1, d3, d5, d7}, [r2]! @ src1: part of q0, q1, q2, and q3
+ vld4.32 {d16, d18, d20, d22}, [r3]! @ src2: part of q8, q9, q10, and q11
+ vld4.32 {d17, d19, d21, d23}, [r3]! @ src2: part of q8, q9, q10, and q11
+ vld4.32 {d24, d26, d28, d30}, [r1]! @ acc: part of q12, q13, q14, and q15
+ vld4.32 {d25, d27, d29, d31}, [r1]! @ acc: part of q12, q13, q14, and q15
+ subs r4, r4, #4
+
+ @ calculate values for the 1st set
+ vmla.f32 q12, q0, q8
+ vmla.f32 q13, q1, q9
+ vmla.f32 q14, q2, q10
+ vmla.f32 q15, q3, q11
+
+ ble .L_mainloopend_vec4
+
+.L_mainloop_vec4:
+ @ load the next (e.g. 2nd) set of values
+ vld4.32 {d0, d2, d4, d6}, [r2]!
+ vld4.32 {d1, d3, d5, d7}, [r2]!
+ vld4.32 {d16, d18, d20, d22}, [r3]!
+ vld4.32 {d17, d19, d21, d23}, [r3]!
+
+ @ store the result for the previous set before q12-q15 are reloaded
+ vst4.32 {d24, d26, d28, d30}, [r0]!
+ vst4.32 {d25, d27, d29, d31}, [r0]!
+
+ @ finish loading ....
+ vld4.32 {d24, d26, d28, d30}, [r1]! @ part of q12, q13, q14, and q15
+ vld4.32 {d25, d27, d29, d31}, [r1]! @ part of q12, q13, q14, and q15
+ subs r4, r4, #4
+
+ @ calculate values for the next (e.g. 2nd) set
+ vmla.f32 q12, q0, q8
+ vmla.f32 q13, q1, q9
+ vmla.f32 q14, q2, q10
+ vmla.f32 q15, q3, q11
+
+ bgt .L_mainloop_vec4 @ loop if r4 > 0, if we have at least another 4 vectors (16 floats) to process
+
+.L_mainloopend_vec4:
+ @ the last iteration for this call
+ @ store the result for the last set of values
+ vst4.32 {d24, d26, d28, d30}, [r0]!
+ vst4.32 {d25, d27, d29, d31}, [r0]!
+
+.L_check_vec4:
+ @ check if anything left to process at the end of the input array
+ cmp r5, #0
+ ble .L_return_vec4
+
+.L_secondloop_vec4:
+ @ process the last few items left in the input array
+ vld4.f32 {d0[0], d2[0], d4[0], d6[0]}, [r2]! @ The values are loaded like so:
+ @ q0 = { V1.x, -, -, - };
+ @ q1 = { V1.y, -, -, - };
+ @ q2 = { V1.z, -, -, - };
+ @ q3 = { V1.w, -, -, - };
+ vld4.f32 {d1[0], d3[0], d5[0], d7[0]}, [r3]! @ The values are loaded like so:
+ @ q0 = { V1.x, -, V2.x, - };
+ @ q1 = { V1.y, -, V2.y, - };
+ @ q2 = { V1.z, -, V2.z, - };
+ @ q3 = { V1.w, -, V2.w, - };
+ vld4.f32 {d24[0], d26[0], d28[0], d30[0]}, [r1]! @ The values are loaded like so:
+ @ q12 = { acc.x, -, -, - };
+ @ q13 = { acc.y, -, -, - };
+ @ q14 = { acc.z, -, -, - };
+ @ q15 = { acc.w, -, -, - };
+
+ subs r5, r5, #1
+
+ @ calculate acc + V1 * V2 per component; only lane 0 of each result is stored
+ vmla.f32 d24, d0, d1
+ vmla.f32 d26, d2, d3
+ vmla.f32 d28, d4, d5
+ vmla.f32 d30, d6, d7
+
+ vst4.32 {d24[0], d26[0], d28[0], d30[0]}, [r0]!
+
+ bgt .L_secondloop_vec4 @ flags still come from the subs above
+
+.L_return_vec4:
+ @ return
+ pop {r4, r5}
+ mov r0, #0
+ bx lr
--- /dev/null
+@
+@ Copyright 2011-12 ARM Limited
+@
+@ Licensed under the Apache License, Version 2.0 (the "License");
+@ you may not use this file except in compliance with the License.
+@ You may obtain a copy of the License at
+@
+@ http://www.apache.org/licenses/LICENSE-2.0
+@
+@ Unless required by applicable law or agreed to in writing, software
+@ distributed under the License is distributed on an "AS IS" BASIS,
+@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ See the License for the specific language governing permissions and
+@ limitations under the License.
+@
+
+@
+@ NE10 Library : math/NE10_mlac.asm.s
+@
+
+ .text
+ .syntax unified
+
+.include "NE10header.s"
+
+ .balign 4
+ .global mlac_float_asm
+ .thumb
+ .thumb_func
+
+mlac_float_asm:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t mlac_float(arm_float_t * dst, arm_float_t * acc,
+ @ arm_float_t * src, const arm_float_t cst,
+ @ unsigned int count)
+ @
+ @ Computes dst[i] = acc[i] + ( src[i] * cst ) for count floats, one
+ @ element per iteration. cst arrives as a value in r3 (not a pointer)
+ @ and count is read from the stack. r0 holds NE10_OK on return.
+ @
+ @ r0: *dst
+ @ r1: *acc
+ @ r2: *src
+ @ r3: cst
+ @ r4: int count
+ @
+ @ r4: loop counter
+ @ r5: current item's offset in acc[], src[], and dst[]
+ @ r6: current accumulator item's address made of base(r1)+offset(r5)
+ @ r7: current source item's address made of base(r2)+offset(r5)
+ @ r8: current destination item's address made of base(r0)+offset(r5)
+ @
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ push {r4, r5, r6, r7, r8}
+ ldr r4, [r13, #20] @ r4 = count, found just above the five saved registers ( r13 is the stack pointer (sp) )
+ cbz r4, .LoopEndFloat @ nothing to do when count is zero
+ mov r5, #0
+ vmov s3, r3 @ Get cst into register s3 once; neither r3 nor s3 changes inside the loop
+
+.LoopBeginFloat:
+ add r6, r1, r5 @ Get current accumulator item's address in memory
+ vldr s10, [r6, #0] @ Load acc[i]
+ add r7, r2, r5 @ Get current source item's address in memory
+ vldr s2, [r7, #0] @ Load src[i]
+ vmla.f32 s10, s2, s3 @ s10 = acc[i] + ( src[i] * cst )
+ add r8, r0, r5 @ Get current destination item's address in memory
+ vstr s10, [r8, #0] @ Store the result back into the main memory
+ add r5, r5, #4 @ increase the offset by 1*sizeof(float)
+ subs r4, r4, #1 @ one fewer element left to process
+ bne .LoopBeginFloat @ loop while r4 is non-zero
+
+.LoopEndFloat:
+ mov r0, NE10_OK @ Return NE10_OK
+ pop {r4, r5, r6, r7, r8}
+ bx lr
+
+
+
+
+ .balign 4
+ .global mlac_vec2f_asm
+ .thumb
+ .thumb_func
+
+mlac_vec2f_asm:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t mlac_vec2f(arm_vec2f_t * dst, arm_vec2f_t * acc,
+ @ arm_vec2f_t * src, const arm_vec2f_t * cst,
+ @ unsigned int count)
+ @
+ @ Computes dst[i] = acc[i] + ( src[i] * cst ) component by component
+ @ for count 2-float entries, one entry per iteration. cst is re-read
+ @ from memory on every iteration. r0 holds NE10_OK on return.
+ @
+ @ r0: *dst
+ @ r1: *acc
+ @ r2: *src
+ @ r3: *cst
+ @ r4: int count
+ @
+ @ r4: loop counter
+ @ r5: current item's offset in acc[], src[], and dst[]
+ @ r6: current accumulator item's address made of base(r1)+offset(r5)
+ @ r7: current source item's address made of base(r2)+offset(r5)
+ @ r8: current destination item's address made of base(r0)+offset(r5)
+ @
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ push {r4, r5, r6, r7, r8}
+ ldr r4, [r13, #20] @ r4 = count, found just above the five saved registers ( r13 is the stack pointer (sp) )
+ cbz r4, .LoopEndVec2F @ nothing to do when count is zero
+ mov r5, #0
+
+.LoopBeginVec2F:
+ add r6, r1, r5 @ Get current accumulator item's address in memory
+ vldr s10, [r6, #0] @ Load acc[i].x and acc[i].y
+ vldr s11, [r6, #4]
+ add r7, r2, r5 @ Get current source item's address in memory
+ vldr s1, [r7, #0] @ Load src[i].x and src[i].y
+ vldr s2, [r7, #4]
+ vldr s3, [r3, #0] @ Load cst->x and cst->y
+ vldr s4, [r3, #4]
+ vmla.f32 s10, s1, s3 @ s10 = acc[i].x + ( src[i].x * cst->x )
+ vmla.f32 s11, s2, s4 @ same for 'y'
+ add r8, r0, r5 @ Get current destination item's address in memory
+ vstr s10, [r8, #0] @ Store the results back into the main memory
+ vstr s11, [r8, #4]
+ add r5, r5, #8 @ increase the offset by 2*sizeof(float) @@ (for x and y)
+ subs r4, r4, #1 @ one fewer entry left to process
+ bne .LoopBeginVec2F @ loop while r4 is non-zero
+
+.LoopEndVec2F:
+ mov r0, NE10_OK @ Return NE10_OK
+ pop {r4, r5, r6, r7, r8}
+ bx lr
+
+
+
+
+ .balign 4
+ .global mlac_vec3f_asm
+ .thumb
+ .thumb_func
+
+mlac_vec3f_asm:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t mlac_vec3f(arm_vec3f_t * dst, arm_vec3f_t * acc,
+ @ arm_vec3f_t * src, const arm_vec3f_t * cst,
+ @ unsigned int count)
+ @
+ @ Computes dst[i] = acc[i] + ( src[i] * cst ) component by component
+ @ for count 3-float entries, one entry per iteration. cst is re-read
+ @ from memory on every iteration. r0 holds NE10_OK on return.
+ @
+ @ r0: *dst
+ @ r1: *acc
+ @ r2: *src
+ @ r3: *cst
+ @ r4: int count
+ @
+ @ r4: loop counter
+ @ r5: current item's offset in acc[], src[], and dst[]
+ @ r6: current accumulator item's address made of base(r1)+offset(r5)
+ @ r7: current source item's address made of base(r2)+offset(r5)
+ @ r8: current destination item's address made of base(r0)+offset(r5)
+ @
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ push {r4, r5, r6, r7, r8}
+ ldr r4, [r13, #20] @ r4 = count, found just above the five saved registers ( r13 is the stack pointer (sp) )
+ cbz r4, .LoopEndVec3F @ nothing to do when count is zero
+ mov r5, #0
+
+.LoopBeginVec3F:
+ add r6, r1, r5 @ Get current accumulator item's address in memory
+ vldr s10, [r6, #0] @ Load acc[i].x, acc[i].y , and acc[i].z
+ vldr s11, [r6, #4]
+ vldr s12, [r6, #8]
+ add r7, r2, r5 @ Get current source item's address in memory
+ vldr s1, [r7, #0] @ Load src[i].x, src[i].y , and src[i].z
+ vldr s2, [r7, #4]
+ vldr s3, [r7, #8]
+ vldr s4, [r3, #0] @ Load cst->x, cst->y, and cst->z
+ vldr s5, [r3, #4]
+ vldr s6, [r3, #8]
+ vmla.f32 s10, s1, s4 @ s10 = acc[i].x + ( src[i].x * cst->x )
+ vmla.f32 s11, s2, s5 @ same for 'y'
+ vmla.f32 s12, s3, s6 @ same for 'z'
+ add r8, r0, r5 @ Get current destination item's address in memory
+ vstr s10, [r8, #0] @ Store the results back into the main memory
+ vstr s11, [r8, #4]
+ vstr s12, [r8, #8]
+ add r5, r5, #12 @ increase the offset by 3*sizeof(float) @@ (for x, y, and z)
+ subs r4, r4, #1 @ one fewer entry left to process
+ bne .LoopBeginVec3F @ loop while r4 is non-zero
+
+.LoopEndVec3F:
+ mov r0, NE10_OK @ Return NE10_OK
+ pop {r4, r5, r6, r7, r8}
+ bx lr
+
+
+
+
+ .balign 4
+ .global mlac_vec4f_asm
+ .thumb
+ .thumb_func
+
+mlac_vec4f_asm:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t mlac_vec4f(arm_vec4f_t * dst, arm_vec4f_t * acc,
+ @ arm_vec4f_t * src, const arm_vec4f_t * cst,
+ @ unsigned int count)
+ @
+ @ For every item i: dst[i] = acc[i] + ( src[i] * (*cst) ), per component.
+ @
+ @ r0: *dst
+ @ r1: *acc
+ @ r2: *src
+ @ r3: *cst
+ @ count is read from the stack (word at [sp] on entry)
+ @
+ @ r4: loop counter (starts at count, counts down to zero)
+ @ r5: current item's offset in acc[], src[], and dst[]
+ @ r6: current accumulator item's address made of base(r1)+offset(r5)
+ @ r7: current source item's address made of base(r2)+offset(r5)
+ @ r8: current destination item's address made of base(r0)+offset(r5)
+ @
+ @ s5-s8: cst->x, cst->y, cst->z and cst->w, loaded once before the loop;
+ @ the constant is therefore expected not to overlap dst[]
+ @
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ push {r4, r5, r6, r7, r8}
+ ldr r4, [r13, #20] @ r4 = count ( 5 pushed registers = 20 bytes above the stack pointer r13 )
+ cbz r4, .LoopEndVec4F @ nothing to do for count == 0 ( cst is not read either )
+ mov r5, #0
+ vldr s5, [r3, #0] @ Load cst->x, cst->y, cst->z, and w once; they are loop-invariant
+ vldr s6, [r3, #4]
+ vldr s7, [r3, #8]
+ vldr s8, [r3, #12]
+
+.LoopBeginVec4F:
+ add r6, r1, r5 @ Get current accumulator item's address in memory
+ vldr s10, [r6, #0] @ Load acc[i].x, acc[i].y , acc[i].z, and w
+ vldr s11, [r6, #4]
+ vldr s12, [r6, #8]
+ vldr s13, [r6, #12]
+ add r7, r2, r5 @ Get current source item's address in memory
+ vldr s1, [r7, #0] @ Load src[i].x, src[i].y , src[i].z, and w
+ vldr s2, [r7, #4]
+ vldr s3, [r7, #8]
+ vldr s4, [r7, #12]
+ vmla.f32 s10, s1, s5 @ s10 = acc[i].x + ( src[i].x * cst->x )
+ vmla.f32 s11, s2, s6 @ same for 'y'
+ vmla.f32 s12, s3, s7 @ same for 'z'
+ vmla.f32 s13, s4, s8 @ same for 'w'
+ add r8, r0, r5 @ Get current destination item's address in memory
+ vstr s10, [r8, #0] @ Store the results back into the main memory
+ vstr s11, [r8, #4]
+ vstr s12, [r8, #8]
+ vstr s13, [r8, #12]
+ add r5, r5, #16 @ increase the offset by 4*sizeof(float) @@ (for x, y, z, and w)
+ subs r4, r4, #1 @ one item done; sets the flags for the branch below
+ bne .LoopBeginVec4F @ Continue while items remain ( r4 != 0 )
+
+.LoopEndVec4F:
+ mov r0, NE10_OK @ Return NE10_OK
+ pop {r4, r5, r6, r7, r8}
+ bx lr
--- /dev/null
+/*
+ * Copyright 2011-12 ARM Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NE10 Library : math/NE10_mlac.c
+ */
+
+#include "NE10_types.h"
+#include "macros.h"
+
+#include <assert.h>
+
+/*
+ * Scalar multiply-accumulate with a constant (plain C version).
+ * Per-item operation: dst[itr] = acc[itr] + src[itr] * cst.
+ * The loop over itr and the function's return statement come from
+ * NE10_MLAC_OPERATION_X_C (macros.h, not visible here).
+ * NOTE(review): presumably itr covers [0, count) -- confirm in macros.h.
+ */
+ne10_result_t mlac_float_c (ne10_float32_t * dst, ne10_float32_t * acc, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count)
+{
+ NE10_MLAC_OPERATION_X_C
+ (
+ dst[ itr ] = acc[ itr ] + (src[ itr ] * cst);
+ );
+}
+
+/*
+ * 2D-vector multiply-accumulate with a constant vector (plain C version).
+ * Per-item operation, applied to x and y independently:
+ * dst[itr] = acc[itr] + src[itr] * (*cst).
+ * The loop over itr and the function's return statement come from
+ * NE10_MLAC_OPERATION_X_C (macros.h, not visible here).
+ * NOTE(review): presumably itr covers [0, count) -- confirm in macros.h.
+ */
+ne10_result_t mlac_vec2f_c (ne10_vec2f_t * dst, ne10_vec2f_t * acc, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count)
+{
+ NE10_MLAC_OPERATION_X_C
+ (
+ dst[ itr ].x = acc[ itr ].x + (src[ itr ].x * cst->x);
+ dst[ itr ].y = acc[ itr ].y + (src[ itr ].y * cst->y);
+ );
+}
+
+/*
+ * 3D-vector multiply-accumulate with a constant vector (plain C version).
+ * Per-item operation, applied to x, y and z independently:
+ * dst[itr] = acc[itr] + src[itr] * (*cst).
+ * The loop over itr and the function's return statement come from
+ * NE10_MLAC_OPERATION_X_C (macros.h, not visible here).
+ * NOTE(review): presumably itr covers [0, count) -- confirm in macros.h.
+ */
+ne10_result_t mlac_vec3f_c (ne10_vec3f_t * dst, ne10_vec3f_t * acc, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count)
+{
+ NE10_MLAC_OPERATION_X_C
+ (
+ dst[ itr ].x = acc[ itr ].x + (src[ itr ].x * cst->x);
+ dst[ itr ].y = acc[ itr ].y + (src[ itr ].y * cst->y);
+ dst[ itr ].z = acc[ itr ].z + (src[ itr ].z * cst->z);
+ );
+}
+
+/*
+ * 4D-vector multiply-accumulate with a constant vector (plain C version).
+ * Per-item operation, applied to x, y, z and w independently:
+ * dst[itr] = acc[itr] + src[itr] * (*cst).
+ * The loop over itr and the function's return statement come from
+ * NE10_MLAC_OPERATION_X_C (macros.h, not visible here).
+ * NOTE(review): presumably itr covers [0, count) -- confirm in macros.h.
+ */
+ne10_result_t mlac_vec4f_c (ne10_vec4f_t * dst, ne10_vec4f_t * acc, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count)
+{
+ NE10_MLAC_OPERATION_X_C
+ (
+ dst[ itr ].x = acc[ itr ].x + (src[ itr ].x * cst->x);
+ dst[ itr ].y = acc[ itr ].y + (src[ itr ].y * cst->y);
+ dst[ itr ].z = acc[ itr ].z + (src[ itr ].z * cst->z);
+ dst[ itr ].w = acc[ itr ].w + (src[ itr ].w * cst->w);
+ );
+}
--- /dev/null
+/*
+ * Copyright 2011-12 ARM Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NE10 Library : math/NE10_mlac.neon.c
+ */
+
+#include "NE10_types.h"
+#include "macros.h"
+
+#include <assert.h>
+#include <arm_neon.h>
+
+
+/*
+ * Scalar multiply-accumulate with a constant (NEON intrinsics version).
+ * The two statements below are handed to NE10_MLAC_OPERATION_FLOAT_NEON
+ * (macros.h, not visible here), which declares the n_* variables and
+ * performs the loads, stores and looping.
+ * 1st argument: quad-word step, n_dst = n_acc + n_src * n_cst.
+ * 2nd argument: double-word step, result written back into n_tmp_src.
+ * NOTE(review): presumably the 2nd form handles the items left over after
+ * the quad-word loop -- confirm in macros.h.
+ */
+ne10_result_t mlac_float_neon (ne10_float32_t * dst, ne10_float32_t * acc, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count)
+{
+ NE10_MLAC_OPERATION_FLOAT_NEON
+ (
+ n_dst = vmlaq_f32 (n_acc, n_src, n_cst);
+ ,
+ n_tmp_src = vmla_f32 (n_tmp_acc, n_tmp_src, n_tmp_cst);
+ );
+}
+
+/*
+ * 2D-vector multiply-accumulate with a constant vector (NEON intrinsics
+ * version). The two statements below are handed to
+ * NE10_MLAC_OPERATION_VEC2F_NEON (macros.h, not visible here), which
+ * declares the n_* variables and performs the loads, stores and looping.
+ * 1st argument: quad-word step, n_dst = n_acc + n_src * n_cst.
+ * 2nd argument: double-word step, result written back into n_tmp_src.
+ * NOTE(review): presumably the 2nd form handles the items left over after
+ * the quad-word loop -- confirm in macros.h.
+ */
+ne10_result_t mlac_vec2f_neon (ne10_vec2f_t * dst, ne10_vec2f_t * acc, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count)
+{
+ NE10_MLAC_OPERATION_VEC2F_NEON
+ (
+ n_dst = vmlaq_f32 (n_acc, n_src , n_cst);
+ ,
+ n_tmp_src = vmla_f32 (n_tmp_acc, n_tmp_src, n_tmp_cst);
+ );
+}
+
+/*
+ * 3D-vector multiply-accumulate with a constant vector (NEON intrinsics
+ * version). The statements below are handed to
+ * NE10_MLAC_OPERATION_VEC3F_NEON (macros.h, not visible here), which
+ * declares the n_* variables and performs the loads, stores and looping.
+ * 1st argument: three quad-word steps, n_dstK = n_accK + n_srcK * n_cstK.
+ * 2nd argument: three double-word steps, one per val[] entry, with the
+ * result written back into n_tmp_src.
+ * NOTE(review): presumably the 2nd form handles the items left over after
+ * the quad-word loop -- confirm in macros.h.
+ */
+ne10_result_t mlac_vec3f_neon (ne10_vec3f_t * dst, ne10_vec3f_t * acc, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count)
+{
+ NE10_MLAC_OPERATION_VEC3F_NEON
+ (
+ n_dst1 = vmlaq_f32 (n_acc1, n_src1 , n_cst1);
+ n_dst2 = vmlaq_f32 (n_acc2, n_src2 , n_cst2);
+ n_dst3 = vmlaq_f32 (n_acc3, n_src3 , n_cst3);
+ ,
+ n_tmp_src.val[0] = vmla_f32 (n_tmp_acc.val[0], n_tmp_src.val[0], n_tmp_cst.val[0]); /* the X lane */
+ n_tmp_src.val[1] = vmla_f32 (n_tmp_acc.val[1], n_tmp_src.val[1], n_tmp_cst.val[1]); /* the Y lane */
+ n_tmp_src.val[2] = vmla_f32 (n_tmp_acc.val[2], n_tmp_src.val[2], n_tmp_cst.val[2]); /* the Z lane */
+ );
+}
+
+/*
+ * 4D-vector multiply-accumulate with a constant vector (NEON intrinsics
+ * version). The single statement below is handed to
+ * NE10_MLAC_OPERATION_VEC4F_NEON (macros.h, not visible here), which
+ * declares the n_* variables and performs the loads, stores and looping.
+ * Step: n_dst = n_acc + n_src * n_cst on one quad-word vector.
+ */
+ne10_result_t mlac_vec4f_neon (ne10_vec4f_t * dst, ne10_vec4f_t * acc, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count)
+{
+ NE10_MLAC_OPERATION_VEC4F_NEON
+ (
+ n_dst = vmlaq_f32 (n_acc, n_src , n_cst);
+ );
+}
--- /dev/null
+@
+@ Copyright 2011-12 ARM Limited
+@
+@ Licensed under the Apache License, Version 2.0 (the "License");
+@ you may not use this file except in compliance with the License.
+@ You may obtain a copy of the License at
+@
+@ http://www.apache.org/licenses/LICENSE-2.0
+@
+@ Unless required by applicable law or agreed to in writing, software
+@ distributed under the License is distributed on an "AS IS" BASIS,
+@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ See the License for the specific language governing permissions and
+@ limitations under the License.
+@
+
+@
+@ NE10 Library : math/NE10_mul.asm.s
+@
+
+ .text
+ .syntax unified
+
+.include "NE10header.s"
+
+ .balign 4
+ .global mul_float_asm
+ .thumb
+ .thumb_func
+
+mul_float_asm:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t mul_float(arm_float_t * dst,
+ @ arm_float_t * src1, arm_float_t * src2,
+ @ unsigned int count)
+ @
+ @ Element-wise product: dst[i] = src1[i] * src2[i].
+ @
+ @ r0: *dst - walks forward through dst[], one float per iteration
+ @ r1: *src1 - walks forward through src1[], one float per iteration
+ @ r2: *src2 - walks forward through src2[], one float per iteration
+ @ r3: int count - also the loop counter, counted down to zero
+ @
+ @ s0, s1: the two factors; the product replaces s0
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ cbz r3, .LoopEndFloat @ empty input: go straight to the return
+
+.LoopBeginFloat:
+ vldmia r1!, {s0} @ s0 = src1[i]; r1 advances by sizeof(float)
+ vldmia r2!, {s1} @ s1 = src2[i]; r2 advances by sizeof(float)
+ vmul.f32 s0, s0, s1 @ s0 = src1[i] * src2[i]
+ vstmia r0!, {s0} @ dst[i] = s0; r0 advances by sizeof(float)
+ subs r3, r3, #1 @ one item done; sets the flags for the branch below
+ bne .LoopBeginFloat @ keep going while items remain ( r3 != 0 )
+
+.LoopEndFloat:
+ mov r0, NE10_OK @ Return NE10_OK
+ bx lr
--- /dev/null
+/*
+ * Copyright 2011-12 ARM Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NE10 Library : math/NE10_mul.c
+ */
+
+#include "NE10_types.h"
+#include "macros.h"
+
+#include <assert.h>
+
+/*
+ * Element-wise product of two float arrays (plain C version).
+ * Per-item operation: dst[itr] = src1[itr] * src2[itr].
+ * The loop over itr and the function's return statement come from
+ * NE10_X_OPERATION_FLOAT_C (macros.h, not visible here).
+ * NOTE(review): presumably itr covers [0, count) -- confirm in macros.h.
+ */
+ne10_result_t mul_float_c (ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count)
+{
+ NE10_X_OPERATION_FLOAT_C
+ (
+ dst[ itr ] = src1[ itr ] * src2[ itr ];
+ );
+}
+
+/*
+ * Component-wise product of two arrays of 2D vectors (plain C version).
+ * Per-item operation: dst[itr].{x,y} = src1[itr].{x,y} * src2[itr].{x,y}.
+ * The loop over itr and the function's return statement come from
+ * NE10_X_OPERATION_FLOAT_C (macros.h, not visible here).
+ * NOTE(review): presumably itr covers [0, count) -- confirm in macros.h.
+ */
+ne10_result_t vmul_vec2f_c (ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count)
+{
+ NE10_X_OPERATION_FLOAT_C
+ (
+ dst[ itr ].x = src1[ itr ].x * src2[ itr ].x;
+ dst[ itr ].y = src1[ itr ].y * src2[ itr ].y;
+ );
+}
+
+/*
+ * Component-wise product of two arrays of 3D vectors (plain C version).
+ * Per-item operation: dst[itr].{x,y,z} = src1[itr].{x,y,z} * src2[itr].{x,y,z}.
+ * The loop over itr and the function's return statement come from
+ * NE10_X_OPERATION_FLOAT_C (macros.h, not visible here).
+ * NOTE(review): presumably itr covers [0, count) -- confirm in macros.h.
+ */
+ne10_result_t vmul_vec3f_c (ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count)
+{
+ NE10_X_OPERATION_FLOAT_C
+ (
+ dst[ itr ].x = src1[ itr ].x * src2[ itr ].x;
+ dst[ itr ].y = src1[ itr ].y * src2[ itr ].y;
+ dst[ itr ].z = src1[ itr ].z * src2[ itr ].z;
+ );
+}
+
+/*
+ * Component-wise product of two arrays of 4D vectors (plain C version).
+ * Per-item operation: each of x, y, z and w of dst[itr] is the product of
+ * the matching components of src1[itr] and src2[itr].
+ * The loop over itr and the function's return statement come from
+ * NE10_X_OPERATION_FLOAT_C (macros.h, not visible here).
+ * NOTE(review): presumably itr covers [0, count) -- confirm in macros.h.
+ */
+ne10_result_t vmul_vec4f_c (ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count)
+{
+ NE10_X_OPERATION_FLOAT_C
+ (
+ dst[ itr ].x = src1[ itr ].x * src2[ itr ].x;
+ dst[ itr ].y = src1[ itr ].y * src2[ itr ].y;
+ dst[ itr ].z = src1[ itr ].z * src2[ itr ].z;
+ dst[ itr ].w = src1[ itr ].w * src2[ itr ].w;
+ );
+}
--- /dev/null
+@
+@ Copyright 2011-12 ARM Limited
+@
+@ Licensed under the Apache License, Version 2.0 (the "License");
+@ you may not use this file except in compliance with the License.
+@ You may obtain a copy of the License at
+@
+@ http://www.apache.org/licenses/LICENSE-2.0
+@
+@ Unless required by applicable law or agreed to in writing, software
+@ distributed under the License is distributed on an "AS IS" BASIS,
+@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ See the License for the specific language governing permissions and
+@ limitations under the License.
+@
+
+@
+@ NE10 Library : math/NE10_mul.neon.s
+@
+
+ .text
+ .syntax unified
+
+.include "NE10header.s"
+
+
+
+
+ .align 4
+ .global mul_float_neon
+ .thumb
+ .thumb_func
+
+mul_float_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t mul_float(arm_float_t * dst,
+ @ arm_float_t * src1,
+ @ arm_float_t * src2,
+ @ unsigned int count)
+ @
+ @ Element-wise product dst[i] = src1[i] * src2[i]. Groups of 4 floats
+ @ go through the main loop; the remaining ( count % 4 ) floats are
+ @ handled one at a time in the second loop.
+ @
+ @ r0: *dst & current dst entry's address
+ @ r1: *src1 & current src1 entry's address
+ @ r2: *src2 & current src2 entry's address
+ @ r3: int count & the number of items in the input array that can be
+ @ processed in chunks of 4 floats
+ @
+ @ r4: the number of items that are left to be processed at the end of
+ @ the input array
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ push {r4}
+ and r4, r3, #3 @ r4 = count % 4;
+ sub r3, r3, r4 @ count = count - r4; r3 is now a multiple of 4, r4 is what's left to be processed after the main loop
+
+ cbz r3, .L_check_float
+
+ @ load the current set of values
+ vld1.32 {q0}, [r1]!
+ vld1.32 {q1}, [r2]!
+ subs r3, r3, #4 @ 4 for this set; the flags set here are consumed by the "ble" below
+
+ @ calculate values for the current set
+ vmul.f32 q3, q0, q1 @ q3 = q0 * q1
+
+ ble .L_mainloopend_float @ only one set of 4: skip the loop and just store it
+
+.L_mainloop_float:
+ @ store the result for the current set
+ vst1.32 {d6,d7}, [r0]!
+
+ @ load the next set of values
+ vld1.32 {q0}, [r1]!
+ vld1.32 {q1}, [r2]!
+ subs r3, r3, #4 @ the flags set here are consumed by the "bgt" below
+
+ @ calculate values for the next set
+ vmul.f32 q3, q0, q1 @ q3 = q0 * q1
+
+ bgt .L_mainloop_float @ loop if r3 > 0, if we have at least another 4 floats
+
+.L_mainloopend_float:
+ @ the last iteration for this call
+ @ store the result for the last one
+ vst1.32 {d6,d7}, [r0]!
+
+.L_check_float:
+ @ check if anything left to process at the end of the input array
+ cmp r4, #0
+ ble .L_return_float
+
+.L_secondloop_float:
+ @ process the last few items left in the input array
+ vld1.f32 d0[0], [r1]! @ Fill in d0[0]
+ vld1.f32 d1[0], [r2]! @ Fill in d1[0]
+
+ subs r4, r4, #1 @ the flags set here are consumed by the "bgt" below
+
+ @ values
+ vmul.f32 d0, d0, d1 @ both lanes are multiplied, only lane 0 is stored
+
+ vst1.32 {d0[0]}, [r0]!
+
+ bgt .L_secondloop_float
+
+.L_return_float:
+ @ return
+ pop {r4}
+ mov r0, #0
+ bx lr
+
+
+
+
+ .align 4
+ .global vmul_vec2f_neon
+ .thumb
+ .thumb_func
+
+vmul_vec2f_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t vmul_vec2f(arm_vec2f_t * dst,
+ @ arm_vec2f_t * src1,
+ @ arm_vec2f_t * src2,
+ @ unsigned int count)
+ @
+ @ Component-wise product dst[i] = src1[i] * src2[i] of 2D vectors.
+ @ Groups of 4 vectors go through the main loop; the remaining
+ @ ( count % 4 ) vectors are handled one at a time in the second loop.
+ @
+ @ r0: *dst & current dst entry's address
+ @ r1: *src1 & current src1 entry's address
+ @ r2: *src2 & current src2 entry's address
+ @ r3: int count & the number of items in the input array that can be
+ @ processed in chunks of 4 vectors
+ @
+ @ r4: the number of items that are left to be processed at the end of
+ @ the input array
+ @
+ @ q0-q3: the de-interleaved inputs; q8-q9: the products.
+ @ q4-q7 ( d8-d15 ) are deliberately not used: the procedure call
+ @ standard requires a callee to preserve them.
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ push {r4}
+ and r4, r3, #3 @ r4 = count % 4;
+ sub r3, r3, r4 @ count = count - r4; r3 is now a multiple of 4, r4 is what's left to be processed after the main loop
+
+ cbz r3, .L_check_vec2
+
+ @ load the 1st set of values
+ vld2.32 {q0-q1}, [r1]! @ q0 = 4 x src1.x, q1 = 4 x src1.y
+ vld2.32 {q2-q3}, [r2]! @ q2 = 4 x src2.x, q3 = 4 x src2.y
+ subs r3, r3, #4 @ 4 for this set; the flags set here are consumed by the "ble" below
+
+ @ calculate values for the 1st set
+ vmul.f32 q8, q0, q2
+ vmul.f32 q9, q1, q3
+
+ ble .L_mainloopend_vec2 @ only one set of 4: skip the loop and just store it
+
+.L_mainloop_vec2:
+ @ store the result for the current set
+ vst2.32 {d16,d17,d18,d19}, [r0]!
+
+ @ load the next set of values
+ vld2.32 {q0-q1}, [r1]!
+ vld2.32 {q2-q3}, [r2]!
+ subs r3, r3, #4 @ the flags set here are consumed by the "bgt" below
+
+ @ calculate values for the next set
+ vmul.f32 q8, q0, q2
+ vmul.f32 q9, q1, q3
+
+ bgt .L_mainloop_vec2 @ loop if r3 > 0, if we have at least another 4 vectors (8 floats) to process
+
+.L_mainloopend_vec2:
+ @ the last iteration for this call
+ @ store the result for the last set
+ vst2.32 {d16,d17,d18,d19}, [r0]!
+
+.L_check_vec2:
+ @ check if anything left to process at the end of the input array
+ cmp r4, #0
+ ble .L_return_vec2
+
+.L_secondloop_vec2:
+ @ process the last few items left in the input array
+ vld1.f32 d0, [r1]! @ d0 = { V1.x, V1.y }
+ vld1.f32 d1, [r2]! @ d1 = { V2.x, V2.y }
+
+ subs r4, r4, #1 @ the flags set here are consumed by the "bgt" below
+
+ @ calculate values
+ vmul.f32 d0, d0, d1
+
+ vst1.32 {d0}, [r0]!
+
+ bgt .L_secondloop_vec2
+
+.L_return_vec2:
+ @ return
+ pop {r4}
+ mov r0, #0
+ bx lr
+
+
+
+
+ .align 4
+ .global vmul_vec3f_neon
+ .thumb
+ .thumb_func
+vmul_vec3f_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t vmul_vec3f(arm_vec3f_t * dst,
+ @ arm_vec3f_t * src1,
+ @ arm_vec3f_t * src2,
+ @ unsigned int count)
+ @
+ @ Component-wise product dst[i] = src1[i] * src2[i] of 3D vectors.
+ @ Groups of 4 vectors go through the main loop; the remaining
+ @ ( count % 4 ) vectors are handled one at a time in the second loop.
+ @
+ @ r0: *dst & current dst entry's address
+ @ r1: *src1 & current src1 entry's address
+ @ r2: *src2 & current src2 entry's address
+ @ r3: int count & the number of items in the input array that can be
+ @ processed in chunks of 4 vectors
+ @
+ @ r4: the number of items that are left to be processed at the end of
+ @ the input array
+ @
+ @ q0-q2: src1 x/y/z, q8-q10: src2 x/y/z, q11-q13: the products.
+ @ q4-q7 ( d8-d15 ) are deliberately not used: the procedure call
+ @ standard requires a callee to preserve them.
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ push {r4}
+ and r4, r3, #3 @ r4 = count % 4;
+ sub r3, r3, r4 @ count = count - r4; r3 is now a multiple of 4, r4 is what's left to be processed after the main loop
+
+ cmp r3, #0
+ beq .L_check_vec3
+
+ @ load the 1st set of values
+ vld3.32 {d0, d2, d4}, [r1]! @ vectors 0-1 of src1: x -> d0, y -> d2, z -> d4
+ vld3.32 {d1, d3, d5}, [r1]! @ vectors 2-3 of src1: q0 = x, q1 = y, q2 = z
+ vld3.32 {d16, d18, d20}, [r2]! @ vectors 0-1 of src2
+ vld3.32 {d17, d19, d21}, [r2]! @ vectors 2-3 of src2: q8 = x, q9 = y, q10 = z
+ subs r3, r3, #4 @ the flags set here are consumed by the "ble" below
+
+ @ calculate values for the 1st set
+ vmul.f32 q11, q0, q8
+ vmul.f32 q12, q1, q9
+ vmul.f32 q13, q2, q10
+
+ ble .L_mainloopend_vec3 @ only one set of 4: skip the loop and just store it
+
+.L_mainloop_vec3:
+ @ store the result for the current set
+ vst3.32 {d22, d24, d26}, [r0]!
+ vst3.32 {d23, d25, d27}, [r0]!
+
+ @ load the next set of values
+ vld3.32 {d0, d2, d4}, [r1]!
+ vld3.32 {d1, d3, d5}, [r1]!
+ vld3.32 {d16, d18, d20}, [r2]!
+ vld3.32 {d17, d19, d21}, [r2]!
+ subs r3, r3, #4 @ the flags set here are consumed by the "bgt" below
+
+ @ calculate values for the next set
+ vmul.f32 q11, q0, q8
+ vmul.f32 q12, q1, q9
+ vmul.f32 q13, q2, q10
+
+ bgt .L_mainloop_vec3 @ loop if r3 > 0, if we have at least another 4 vectors (12 floats) to process
+
+.L_mainloopend_vec3:
+ @ the last iteration for this call
+ @ store the result for the last set
+ vst3.32 {d22, d24, d26}, [r0]!
+ vst3.32 {d23, d25, d27}, [r0]!
+
+.L_check_vec3:
+ @ check if anything left to process at the end of the input array
+ cmp r4, #0
+ ble .L_return_vec3
+
+.L_secondloop_vec3:
+ @ process the last few items left in the input array
+ vld3.f32 {d0[0], d2[0], d4[0]}, [r1]! @ The values are loaded like so:
+ @ q0 = { V1.x, -, -, - };
+ @ q1 = { V1.y, -, -, - };
+ @ q2 = { V1.z, -, -, - };
+ vld3.f32 {d1[0], d3[0], d5[0]}, [r2]! @ The values are loaded like so:
+ @ q0 = { V1.x, -, V2.x, - };
+ @ q1 = { V1.y, -, V2.y, - };
+ @ q2 = { V1.z, -, V2.z, - };
+
+ subs r4, r4, #1 @ the flags set here are consumed by the "bgt" below
+
+ @ calculate values ( only lane 0 of each product is stored )
+ vmul.f32 d0, d0, d1
+ vmul.f32 d2, d2, d3
+ vmul.f32 d4, d4, d5
+
+ vst3.32 {d0[0], d2[0], d4[0]}, [r0]!
+
+ bgt .L_secondloop_vec3
+
+.L_return_vec3:
+ @ return
+ pop {r4}
+ mov r0, #0
+ bx lr
+
+
+
+
+ .align 4
+ .global vmul_vec4f_neon
+ .thumb
+ .thumb_func
+vmul_vec4f_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t vmul_vec4f(arm_vec4f_t * dst,
+ @ arm_vec4f_t * src1,
+ @ arm_vec4f_t * src2,
+ @ unsigned int count)
+ @
+ @ Component-wise product dst[i] = src1[i] * src2[i] of 4D vectors.
+ @ Groups of 4 vectors go through the main loop; the remaining
+ @ ( count % 4 ) vectors are handled one at a time in the second loop.
+ @
+ @ r0: *dst & current dst entry's address
+ @ r1: *src1 & current src1 entry's address
+ @ r2: *src2 & current src2 entry's address
+ @ r3: int count & the number of items in the input array that can be
+ @ processed in chunks of 4 vectors
+ @
+ @ r4: the number of items that are left to be processed at the end of
+ @ the input array
+ @
+ @ q0-q3: src1 x/y/z/w, q8-q11: src2 x/y/z/w, q12-q15: the products.
+ @ q4-q7 ( d8-d15 ) are deliberately not used: the procedure call
+ @ standard requires a callee to preserve them.
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ push {r4}
+ and r4, r3, #3 @ r4 = count % 4;
+ sub r3, r3, r4 @ count = count - r4; r3 is now a multiple of 4, r4 is what's left to be processed after the main loop
+
+ cmp r3, #0
+ beq .L_check_vec4
+
+ @ load the 1st set of values
+ vld4.32 {d0, d2, d4, d6}, [r1]! @ vectors 0-1 of src1
+ vld4.32 {d1, d3, d5, d7}, [r1]! @ vectors 2-3 of src1: q0 = x, q1 = y, q2 = z, q3 = w
+ vld4.32 {d16, d18, d20, d22}, [r2]! @ vectors 0-1 of src2
+ vld4.32 {d17, d19, d21, d23}, [r2]! @ vectors 2-3 of src2: q8 = x, q9 = y, q10 = z, q11 = w
+
+ subs r3, r3, #4 @ the flags set here are consumed by the "ble" below
+
+ @ calculate values for the 1st set
+ vmul.f32 q12, q0, q8
+ vmul.f32 q13, q1, q9
+ vmul.f32 q14, q2, q10
+ vmul.f32 q15, q3, q11
+
+ ble .L_mainloopend_vec4 @ only one set of 4: skip the loop and just store it
+
+.L_mainloop_vec4:
+ @ store the result for current set
+ vst4.32 {d24, d26, d28, d30}, [r0]!
+ vst4.32 {d25, d27, d29, d31}, [r0]!
+
+ @ load the next set of values
+ vld4.32 {d0, d2, d4, d6}, [r1]!
+ vld4.32 {d1, d3, d5, d7}, [r1]!
+ vld4.32 {d16, d18, d20, d22}, [r2]!
+ vld4.32 {d17, d19, d21, d23}, [r2]!
+ subs r3, r3, #4 @ the flags set here are consumed by the "bgt" below
+
+ @ calculate values for the next set
+ vmul.f32 q12, q0, q8
+ vmul.f32 q13, q1, q9
+ vmul.f32 q14, q2, q10
+ vmul.f32 q15, q3, q11
+
+ bgt .L_mainloop_vec4 @ loop if r3 > 0, if we have at least another 4 vectors (16 floats) to process
+
+.L_mainloopend_vec4:
+ @ the last iteration for this call
+ @ store the result for the last set
+ vst4.32 {d24, d26, d28, d30}, [r0]!
+ vst4.32 {d25, d27, d29, d31}, [r0]!
+
+.L_check_vec4:
+ @ check if anything left to process at the end of the input array
+ cmp r4, #0
+ ble .L_return_vec4
+
+.L_secondloop_vec4:
+ @ process the last few items left in the input array
+ vld1.f32 {d0, d1}, [r1]! @ The values are loaded like so:
+ @ q0 = { V1.x, V1.y, V1.z, V1.w };
+ vld1.f32 {d2, d3}, [r2]! @ The values are loaded like so:
+ @ q1 = { V2.x, V2.y, V2.z, V2.w };
+
+ subs r4, r4, #1 @ the flags set here are consumed by the "bgt" below
+
+ @ calculate values
+ vmul.f32 q0, q0, q1
+
+ vst1.32 {d0, d1}, [r0]!
+
+ bgt .L_secondloop_vec4
+
+.L_return_vec4:
+ @ return
+ pop {r4}
+ mov r0, #0
+ bx lr
--- /dev/null
+@
+@ Copyright 2011-12 ARM Limited
+@
+@ Licensed under the Apache License, Version 2.0 (the "License");
+@ you may not use this file except in compliance with the License.
+@ You may obtain a copy of the License at
+@
+@ http://www.apache.org/licenses/LICENSE-2.0
+@
+@ Unless required by applicable law or agreed to in writing, software
+@ distributed under the License is distributed on an "AS IS" BASIS,
+@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ See the License for the specific language governing permissions and
+@ limitations under the License.
+@
+
+@
+@ NE10 Library : math/NE10_mulc.asm.s
+@
+
+ .text
+ .syntax unified
+
+.include "NE10header.s"
+
+ .balign 4
+ .global mulc_float_asm
+ .thumb
+ .thumb_func
+
+mulc_float_asm:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t mulc_float(arm_float_t * dst,
+ @ arm_float_t * src, const arm_float_t cst,
+ @ unsigned int count)
+ @
+ @ For every item i: dst[i] = src[i] * cst.
+ @
+ @ r0: *dst
+ @ r1: *src
+ @ r2: cst ( the float's bit pattern, passed by value in a core register )
+ @ r3: int count
+ @
+ @ r3: loop counter (counts down to zero)
+ @ r5: current item's offset in both src[] and dst[]
+ @ r6: current source item's address made of base(r1)+offset(r5)
+ @ r7: current destination item's address made of base(r0)+offset(r5)
+ @
+ @ s3: cst, moved out of r2 once before the loop
+ @
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ push {r4, r5, r6, r7}
+ cbz r3, .LoopEndFloat @ nothing to do for count == 0
+ mov r5, #0
+ vmov s3, r2 @ Get cst into register s3 once; it is loop-invariant
+
+.LoopBeginFloat:
+ add r6, r1, r5 @ Get current source item's address in memory
+ vldr s1, [r6, #0] @ Load src[i]
+ vmul.f32 s10, s1, s3 @ s10 = src[i] * cst
+ add r7, r0, r5 @ Get current destination item's address in memory
+ vstr s10, [r7, #0] @ Store the result back into the main memory
+ add r5, r5, #4 @ increase the offset by 1*sizeof(float)
+ subs r3, r3, #1 @ one item done; sets the flags for the branch below
+ bne .LoopBeginFloat @ Continue while items remain ( r3 != 0 )
+
+.LoopEndFloat:
+ mov r0, NE10_OK @ Return NE10_OK
+ pop {r4, r5, r6, r7}
+ bx lr
+
+
+
+
+ .balign 4
+ .global mulc_vec2f_asm
+ .thumb
+ .thumb_func
+
+mulc_vec2f_asm:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t mulc_vec2f(arm_vec2f_t * dst,
+ @ arm_vec2f_t * src, const arm_vec2f_t * cst,
+ @ unsigned int count)
+ @
+ @ For every item i: dst[i] = src[i] * (*cst), per component.
+ @
+ @ r0: *dst
+ @ r1: *src
+ @ r2: *cst
+ @ r3: int count
+ @
+ @ r3: loop counter (counts down to zero)
+ @ r5: current item's offset in both src[] and dst[]
+ @ r6: current source item's address made of base(r1)+offset(r5)
+ @ r7: current destination item's address made of base(r0)+offset(r5)
+ @
+ @ s3, s4: cst->x and cst->y, loaded once before the loop; the constant
+ @ is therefore expected not to overlap dst[]
+ @
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ push {r4, r5, r6, r7}
+ cbz r3, .LoopEndVec2F @ nothing to do for count == 0 ( cst is not read either )
+ mov r5, #0
+ vldr s3, [r2, #0] @ Load cst->x and cst->y once; they are loop-invariant
+ vldr s4, [r2, #4]
+
+.LoopBeginVec2F:
+ add r6, r1, r5 @ Get current source item's address in memory
+ vldr s1, [r6, #0] @ Load src[i].x and src[i].y
+ vldr s2, [r6, #4]
+ vmul.f32 s10, s1, s3 @ s10 = src[i].x * cst->x
+ vmul.f32 s11, s2, s4 @ s11 = src[i].y * cst->y
+ add r7, r0, r5 @ Get current destination item's address in memory
+ vstr s10, [r7, #0] @ Store the results back into the main memory
+ vstr s11, [r7, #4]
+ add r5, r5, #8 @ increase the offset by 2*sizeof(float) @@ (for x and y)
+ subs r3, r3, #1 @ one item done; sets the flags for the branch below
+ bne .LoopBeginVec2F @ Continue while items remain ( r3 != 0 )
+
+.LoopEndVec2F:
+ mov r0, NE10_OK @ Return NE10_OK
+ pop {r4, r5, r6, r7}
+ bx lr
+
+
+
+
+ .balign 4
+ .global mulc_vec3f_asm
+ .thumb
+ .thumb_func
+
+mulc_vec3f_asm:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t mulc_vec3f(arm_vec3f_t * dst,
+ @ arm_vec3f_t * src, const arm_vec3f_t * cst,
+ @ unsigned int count)
+ @
+ @ For every item i: dst[i] = src[i] * (*cst), per component.
+ @
+ @ r0: *dst
+ @ r1: *src
+ @ r2: *cst
+ @ r3: int count
+ @
+ @ r3: loop counter (counts down to zero)
+ @ r5: current item's offset in both src[] and dst[]
+ @ r6: current source item's address made of base(r1)+offset(r5)
+ @ r7: current destination item's address made of base(r0)+offset(r5)
+ @
+ @ s4-s6: cst->x, cst->y and cst->z, loaded once before the loop; the
+ @ constant is therefore expected not to overlap dst[]
+ @
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ push {r4, r5, r6, r7}
+ cbz r3, .LoopEndVec3F @ nothing to do for count == 0 ( cst is not read either )
+ mov r5, #0
+ vldr s4, [r2, #0] @ Load cst->x, cst->y, and cst->z once; they are loop-invariant
+ vldr s5, [r2, #4]
+ vldr s6, [r2, #8]
+
+.LoopBeginVec3F:
+ add r6, r1, r5 @ Get current source item's address in memory
+ vldr s1, [r6, #0] @ Load src[i].x, src[i].y , and src[i].z
+ vldr s2, [r6, #4]
+ vldr s3, [r6, #8]
+ vmul.f32 s10, s1, s4 @ s10 = src[i].x * cst->x
+ vmul.f32 s11, s2, s5 @ s11 = src[i].y * cst->y
+ vmul.f32 s12, s3, s6 @ s12 = src[i].z * cst->z
+ add r7, r0, r5 @ Get current destination item's address in memory
+ vstr s10, [r7, #0] @ Store the results back into the main memory
+ vstr s11, [r7, #4]
+ vstr s12, [r7, #8]
+ add r5, r5, #12 @ increase the offset by 3*sizeof(float) @@ (for x, y, and z)
+ subs r3, r3, #1 @ one item done; sets the flags for the branch below
+ bne .LoopBeginVec3F @ Continue while items remain ( r3 != 0 )
+
+.LoopEndVec3F:
+ mov r0, NE10_OK @ Return NE10_OK
+ pop {r4, r5, r6, r7}
+ bx lr
+
+
+
+
+ .balign 4
+ .global mulc_vec4f_asm
+ .thumb
+ .thumb_func
+
+mulc_vec4f_asm:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t mulc_vec4f(arm_vec4f_t * dst,
+ @ arm_vec4f_t * src, const arm_vec4f_t * cst,
+ @ unsigned int count)
+ @
+ @ For every item i: dst[i] = src[i] * (*cst), per component.
+ @
+ @ r0: *dst
+ @ r1: *src
+ @ r2: *cst
+ @ r3: int count
+ @
+ @ r3: loop counter (counts down to zero)
+ @ r5: current item's offset in both src[] and dst[]
+ @ r6: current source item's address made of base(r1)+offset(r5)
+ @ r7: current destination item's address made of base(r0)+offset(r5)
+ @
+ @ s5-s8: cst->x, cst->y, cst->z and cst->w, loaded once before the loop;
+ @ the constant is therefore expected not to overlap dst[]
+ @
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ push {r4, r5, r6, r7}
+ cbz r3, .LoopEndVec4F @ nothing to do for count == 0 ( cst is not read either )
+ mov r5, #0
+ vldr s5, [r2, #0] @ Load cst->x, cst->y, cst->z, and w once; they are loop-invariant
+ vldr s6, [r2, #4]
+ vldr s7, [r2, #8]
+ vldr s8, [r2, #12]
+
+.LoopBeginVec4F:
+ add r6, r1, r5 @ Get current source item's address in memory
+ vldr s1, [r6, #0] @ Load src[i].x, src[i].y , src[i].z, and w
+ vldr s2, [r6, #4]
+ vldr s3, [r6, #8]
+ vldr s4, [r6, #12]
+ vmul.f32 s10, s1, s5 @ s10 = src[i].x * cst->x
+ vmul.f32 s11, s2, s6 @ s11 = src[i].y * cst->y
+ vmul.f32 s12, s3, s7 @ s12 = src[i].z * cst->z
+ vmul.f32 s13, s4, s8 @ s13 = src[i].w * cst->w
+ add r7, r0, r5 @ Get current destination item's address in memory
+ vstr s10, [r7, #0] @ Store the results back into the main memory
+ vstr s11, [r7, #4]
+ vstr s12, [r7, #8]
+ vstr s13, [r7, #12]
+ add r5, r5, #16 @ increase the offset by 4*sizeof(float) @@ (for x, y, z, and w)
+ subs r3, r3, #1 @ one item done; sets the flags for the branch below
+ bne .LoopBeginVec4F @ Continue while items remain ( r3 != 0 )
+
+.LoopEndVec4F:
+ mov r0, NE10_OK @ Return NE10_OK
+ pop {r4, r5, r6, r7}
+ bx lr
--- /dev/null
+/*
+ * Copyright 2011-12 ARM Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NE10 Library : math/NE10_mulc.c
+ */
+
+#include "NE10_types.h"
+#include "macros.h"
+
+#include <assert.h>
+
+/*
+ * Multiply every float of an array by a constant (plain C version).
+ * Per-item operation: dst[itr] = src[itr] * cst.
+ * The loop over itr and the function's return statement come from
+ * NE10_XC_OPERATION_X_C (macros.h, not visible here).
+ * NOTE(review): presumably itr covers [0, count) -- confirm in macros.h.
+ */
+ne10_result_t mulc_float_c (ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count)
+{
+ NE10_XC_OPERATION_X_C
+ (
+ dst[ itr ] = src[ itr ] * cst;
+ );
+}
+
+/*
+ * Multiply every 2D vector of an array by a constant vector, component by
+ * component (plain C version).
+ * Per-item operation: dst[itr].{x,y} = src[itr].{x,y} * cst->{x,y}.
+ * The loop over itr and the function's return statement come from
+ * NE10_XC_OPERATION_X_C (macros.h, not visible here).
+ * NOTE(review): presumably itr covers [0, count) -- confirm in macros.h.
+ */
+ne10_result_t mulc_vec2f_c (ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count)
+{
+ NE10_XC_OPERATION_X_C
+ (
+ dst[ itr ].x = src[ itr ].x * cst->x;
+ dst[ itr ].y = src[ itr ].y * cst->y;
+ );
+}
+
+/*
+ * Multiply every 3D vector of an array by a constant vector, component by
+ * component (plain C version).
+ * Per-item operation: dst[itr].{x,y,z} = src[itr].{x,y,z} * cst->{x,y,z}.
+ * The loop over itr and the function's return statement come from
+ * NE10_XC_OPERATION_X_C (macros.h, not visible here).
+ * NOTE(review): presumably itr covers [0, count) -- confirm in macros.h.
+ */
+ne10_result_t mulc_vec3f_c (ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count)
+{
+ NE10_XC_OPERATION_X_C
+ (
+ dst[ itr ].x = src[ itr ].x * cst->x;
+ dst[ itr ].y = src[ itr ].y * cst->y;
+ dst[ itr ].z = src[ itr ].z * cst->z;
+ );
+}
+
+/*
+ * Multiply every 4D vector of an array by a constant vector, component by
+ * component (plain C version).
+ * Per-item operation: each of x, y, z and w of dst[itr] is the product of
+ * the matching components of src[itr] and *cst.
+ * The loop over itr and the function's return statement come from
+ * NE10_XC_OPERATION_X_C (macros.h, not visible here).
+ * NOTE(review): presumably itr covers [0, count) -- confirm in macros.h.
+ */
+ne10_result_t mulc_vec4f_c (ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count)
+{
+ NE10_XC_OPERATION_X_C
+ (
+ dst[ itr ].x = src[ itr ].x * cst->x;
+ dst[ itr ].y = src[ itr ].y * cst->y;
+ dst[ itr ].z = src[ itr ].z * cst->z;
+ dst[ itr ].w = src[ itr ].w * cst->w;
+ );
+}
--- /dev/null
+/*
+ * Copyright 2011-12 ARM Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NE10 Library : math/NE10_mulc.neon.c
+ */
+
+#include "NE10_types.h"
+#include "macros.h"
+
+#include <assert.h>
+#include <arm_neon.h>
+
+
+/*
+ * Multiply every float of an array by a constant (NEON intrinsics version).
+ * The two statements below are handed to NE10_XC_OPERATION_FLOAT_NEON
+ * (macros.h, not visible here), which declares the n_* variables and
+ * performs the loads, stores and looping.
+ * 1st argument: quad-word step, n_dst = n_src * n_cst.
+ * 2nd argument: double-word step, result written back into n_tmp_src.
+ * NOTE(review): presumably the 2nd form handles the items left over after
+ * the quad-word loop -- confirm in macros.h.
+ */
+ne10_result_t mulc_float_neon (ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count)
+{
+ NE10_XC_OPERATION_FLOAT_NEON
+ (
+ n_dst = vmulq_f32 (n_src , n_cst);
+ ,
+ n_tmp_src = vmul_f32 (n_tmp_src, n_tmp_cst);
+ );
+}
+
+/*
+ * Multiply every 2D vector of an array by a constant vector (NEON
+ * intrinsics version). The two statements below are handed to
+ * NE10_XC_OPERATION_VEC2F_NEON (macros.h, not visible here), which
+ * declares the n_* variables and performs the loads, stores and looping.
+ * 1st argument: quad-word step, n_dst = n_src * n_cst.
+ * 2nd argument: double-word step, result written back into n_tmp_src.
+ * NOTE(review): presumably the 2nd form handles the items left over after
+ * the quad-word loop -- confirm in macros.h.
+ */
+ne10_result_t mulc_vec2f_neon (ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count)
+{
+ NE10_XC_OPERATION_VEC2F_NEON
+ (
+ n_dst = vmulq_f32 (n_src , n_cst);
+ ,
+ n_tmp_src = vmul_f32 (n_tmp_src, n_tmp_cst);
+ );
+}
+
+/*
+ * Multiply every 3D vector of an array by a constant vector (NEON
+ * intrinsics version). The statements below are handed to
+ * NE10_XC_OPERATION_VEC3F_NEON (macros.h, not visible here), which
+ * declares the n_* variables and performs the loads, stores and looping.
+ * 1st argument: three quad-word steps, n_dstK = n_srcK * n_cstK.
+ * 2nd argument: three double-word steps, one per val[] entry, with the
+ * result written back into n_tmp_src.
+ * NOTE(review): presumably the 2nd form handles the items left over after
+ * the quad-word loop -- confirm in macros.h.
+ */
+ne10_result_t mulc_vec3f_neon (ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count)
+{
+ NE10_XC_OPERATION_VEC3F_NEON
+ (
+ n_dst1 = vmulq_f32 (n_src1 , n_cst1);
+ n_dst2 = vmulq_f32 (n_src2 , n_cst2);
+ n_dst3 = vmulq_f32 (n_src3 , n_cst3);
+ ,
+ n_tmp_src.val[0] = vmul_f32 (n_tmp_src.val[0], n_tmp_cst.val[0]);
+ n_tmp_src.val[1] = vmul_f32 (n_tmp_src.val[1], n_tmp_cst.val[1]);
+ n_tmp_src.val[2] = vmul_f32 (n_tmp_src.val[2], n_tmp_cst.val[2]);
+ );
+}
+
+/*
+ * Multiply every 4D vector of an array by a constant vector (NEON
+ * intrinsics version). The single statement below is handed to
+ * NE10_XC_OPERATION_VEC4F_NEON (macros.h, not visible here), which
+ * declares the n_* variables and performs the loads, stores and looping.
+ * Step: n_dst = n_src * n_cst on one quad-word vector.
+ */
+ne10_result_t mulc_vec4f_neon (ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count)
+{
+ NE10_XC_OPERATION_VEC4F_NEON
+ (
+ n_dst = vmulq_f32 (n_src , n_cst);
+ );
+}
--- /dev/null
+@
+@ Copyright 2011-12 ARM Limited
+@
+@ Licensed under the Apache License, Version 2.0 (the "License");
+@ you may not use this file except in compliance with the License.
+@ You may obtain a copy of the License at
+@
+@ http://www.apache.org/licenses/LICENSE-2.0
+@
+@ Unless required by applicable law or agreed to in writing, software
+@ distributed under the License is distributed on an "AS IS" BASIS,
+@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ See the License for the specific language governing permissions and
+@ limitations under the License.
+@
+
+@
+@ NE10 Library : math/NE10_mulcmatvec.asm.s
+@
--- /dev/null
+/*
+ * Copyright 2011-12 ARM Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NE10 Library : math/NE10_mulcmatvec.c
+ */
+
+#include "NE10_types.h"
+#include "macros.h"
+
+#include <assert.h>
+
+/*
+ * C reference implementation: multiplies the constant 2x2 matrix *cst by
+ * each 2-float vector of src and writes the product to the matching entry
+ * of dst.
+ *
+ * A1..D1 name the matrix elements by column (c1, c2) and row (r1, r2).
+ * The loop and the index `itr` are supplied by NE10_CMATVEC_OPERATION_X_C,
+ * which is not defined in this file (presumably macros.h - TODO confirm).
+ *
+ * Each input vector is copied before any component of dst[ itr ] is
+ * written, so the result is also correct when dst and src are the same
+ * array (in-place use).
+ */
+ne10_result_t mulcmatvec_cm2x2f_v2f_c (ne10_vec2f_t * dst, const ne10_mat2x2f_t * cst, ne10_vec2f_t * src, ne10_uint32_t count)
+{
+#define A1 cst->c1.r1
+#define B1 cst->c1.r2
+#define C1 cst->c2.r1
+#define D1 cst->c2.r2
+
+ NE10_CMATVEC_OPERATION_X_C
+ (
+ {
+ /* snapshot the input: writing dst[ itr ].x must not disturb the x used for .y */
+ const ne10_vec2f_t v = src[ itr ];
+ dst[ itr ].x = A1 * v.x + C1 * v.y;
+ dst[ itr ].y = B1 * v.x + D1 * v.y;
+ }
+ );
+
+#undef A1
+#undef B1
+#undef C1
+#undef D1
+}
+
+/*
+ * C reference implementation: multiplies the constant 3x3 matrix *cst by
+ * each 3-float vector of src and writes the product to the matching entry
+ * of dst.
+ *
+ * A1..I1 name the matrix elements by column (c1..c3) and row (r1..r3).
+ * The loop and the index `itr` are supplied by NE10_CMATVEC_OPERATION_X_C,
+ * which is not defined in this file (presumably macros.h - TODO confirm).
+ *
+ * Each input vector is copied before any component of dst[ itr ] is
+ * written, so the result is also correct when dst and src are the same
+ * array (in-place use).
+ */
+ne10_result_t mulcmatvec_cm3x3f_v3f_c (ne10_vec3f_t * dst, const ne10_mat3x3f_t * cst, ne10_vec3f_t * src, ne10_uint32_t count)
+{
+#define A1 cst->c1.r1
+#define B1 cst->c1.r2
+#define C1 cst->c1.r3
+#define D1 cst->c2.r1
+#define E1 cst->c2.r2
+#define F1 cst->c2.r3
+#define G1 cst->c3.r1
+#define H1 cst->c3.r2
+#define I1 cst->c3.r3
+
+ NE10_CMATVEC_OPERATION_X_C
+ (
+ {
+ /* snapshot the input: earlier writes to dst[ itr ] must not disturb later reads */
+ const ne10_vec3f_t v = src[ itr ];
+ dst[ itr ].x = A1 * v.x + D1 * v.y + G1 * v.z;
+ dst[ itr ].y = B1 * v.x + E1 * v.y + H1 * v.z;
+ dst[ itr ].z = C1 * v.x + F1 * v.y + I1 * v.z;
+ }
+ );
+
+#undef A1
+#undef B1
+#undef C1
+#undef D1
+#undef E1
+#undef F1
+#undef G1
+#undef H1
+#undef I1
+}
+
+/*
+ * C reference implementation: multiplies the constant 4x4 matrix *cst by
+ * each 4-float vector of src and writes the product to the matching entry
+ * of dst.
+ *
+ * A1..P1 name the matrix elements by column (c1..c4) and row (r1..r4).
+ * The loop and the index `itr` are supplied by NE10_CMATVEC_OPERATION_X_C,
+ * which is not defined in this file (presumably macros.h - TODO confirm).
+ *
+ * Each input vector is copied before any component of dst[ itr ] is
+ * written, so the result is also correct when dst and src are the same
+ * array (in-place use).
+ *
+ * The redundant `extern` on this definition was dropped to match the 2x2
+ * and 3x3 variants; the linkage is unchanged.
+ */
+ne10_result_t mulcmatvec_cm4x4f_v4f_c (ne10_vec4f_t * dst, const ne10_mat4x4f_t * cst, ne10_vec4f_t * src, ne10_uint32_t count)
+{
+#define A1 cst->c1.r1
+#define B1 cst->c1.r2
+#define C1 cst->c1.r3
+#define D1 cst->c1.r4
+#define E1 cst->c2.r1
+#define F1 cst->c2.r2
+#define G1 cst->c2.r3
+#define H1 cst->c2.r4
+#define I1 cst->c3.r1
+#define J1 cst->c3.r2
+#define K1 cst->c3.r3
+#define L1 cst->c3.r4
+#define M1 cst->c4.r1
+#define N1 cst->c4.r2
+#define O1 cst->c4.r3
+#define P1 cst->c4.r4
+
+ NE10_CMATVEC_OPERATION_X_C
+ (
+ {
+ /* snapshot the input: earlier writes to dst[ itr ] must not disturb later reads */
+ const ne10_vec4f_t v = src[ itr ];
+ dst[ itr ].x = A1 * v.x + E1 * v.y + I1 * v.z + M1 * v.w;
+ dst[ itr ].y = B1 * v.x + F1 * v.y + J1 * v.z + N1 * v.w;
+ dst[ itr ].z = C1 * v.x + G1 * v.y + K1 * v.z + O1 * v.w;
+ dst[ itr ].w = D1 * v.x + H1 * v.y + L1 * v.z + P1 * v.w;
+ }
+ );
+
+#undef A1
+#undef B1
+#undef C1
+#undef D1
+#undef E1
+#undef F1
+#undef G1
+#undef H1
+#undef I1
+#undef J1
+#undef K1
+#undef L1
+#undef M1
+#undef N1
+#undef O1
+#undef P1
+}
--- /dev/null
+@
+@ Copyright 2011-12 ARM Limited
+@
+@ Licensed under the Apache License, Version 2.0 (the "License");
+@ you may not use this file except in compliance with the License.
+@ You may obtain a copy of the License at
+@
+@ http://www.apache.org/licenses/LICENSE-2.0
+@
+@ Unless required by applicable law or agreed to in writing, software
+@ distributed under the License is distributed on an "AS IS" BASIS,
+@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ See the License for the specific language governing permissions and
+@ limitations under the License.
+@
+
+@
+@ NE10 Library : math/NE10_mulcmatvec.neon.s
+@
+
+
+
+
+ .text
+ .syntax unified
+
+.include "NE10header.s"
+
+
+
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ This macro multiplies a single 2x2 matrix by four vec2's.
+ @ The elements of the vectors are loaded into registers q8-q9
+ @ by the caller (mulcmatvec_cm2x2f_v2f_neon) in the following
+ @ order:
+ @
+ @ q8=(x1,x2,x3,x4) q9=(y1,y2,y3,y4)
+ @
+ @ This macro multiplies these four vectors by the 2x2 matrix
+ @ which is stored in registers d0[0] (a), d1[0] (b), d2[0] (c)
+ @ and d3[0] (d).
+ @ The x components of the four results are returned in q12 and
+ @ the y components in q13, in the same lane order as shown above.
+ @ q8-q11 are overwritten with intermediate products.
+ @
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ .macro MUL_MAT2x2_VEC2
+ vmul.f32 q10, q8 , d0[0] @ q10 = a*(x1,x2,x3,x4)
+ vmul.f32 q8 , q8 , d1[0] @ q8 = b*(x1,x2,x3,x4)
+ vmul.f32 q11, q9 , d2[0] @ q11 = c*(y1,y2,y3,y4)
+ vmul.f32 q9 , q9 , d3[0] @ q9 = d*(y1,y2,y3,y4)
+
+ vadd.f32 q12, q10, q11 @ res.x = a*(x1,x2,x3,x4) + c*(y1,y2,y3,y4)
+ vadd.f32 q13, q8, q9 @ res.y = b*(x1,x2,x3,x4) + d*(y1,y2,y3,y4)
+ .endm
+
+
+
+
+ .balign 4
+ .global mulcmatvec_cm2x2f_v2f_neon
+ .thumb
+ .thumb_func
+
+mulcmatvec_cm2x2f_v2f_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t mulcmatvec_cm2x2f_v2f ( arm_vec2f_t * dst,
+ @ const arm_mat2x2f_t * cst,
+ @ arm_vec2f_t * src,
+ @ unsigned int count)
+ @
+ @ r0: *dst & current dst entry's address
+ @ (this register is updated and moved to the next entry
+ @ after every store operation)
+ @ r1: *cst, memory pointer to where the constant matrix is kept
+ @ r2: *src & current src entry's address
+ @ r3: int count & the number of items in the input array that
+ @ can be processed in chunks of 4 vectors
+ @
+ @ r4: the number of items that are left to be processed at the
+ @ end of the input array
+ @
+ @ Always returns 0 in r0.
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ push {r4}
+ and r4, r3, #3 @ r4 = count % 4;
+ sub r3, r3, r4 @ count = count - r4; r4 is what's left to be processed after the main loop
+
+ @ First we load the constant 2x2 matrix, then each time we load
+ @ four vectors of 2-floats, multiply each vector with the matrix,
+ @ finally store the resulting vectors in the destination memory
+ @ address, and move on to the next four vectors.
+
+ @ load the constant matrix (four consecutive floats, one per register)
+ @ d0 = m11(a) d2 = m12(c)
+ @ d1 = m21(b) d3 = m22(d)
+ vld4.32 { d0[0], d1[0], d2[0], d3[0] }, [r1]
+
+ cmp r3, #0
+ beq .L_check_mat2x2 @ fewer than 4 vectors: only the one-at-a-time loop runs
+
+ @ load the 1st set of values
+ @ if {V1, V2, V3, V4} are 4 vec2's in memory
+ @ then after the load operations the 4 vectors
+ @ are stored in registers q8-q9 like so:
+ @
+ @ q8=(x1,x2,x3,x4)
+ @ q9=(y1,y2,y3,y4)
+
+ vld2.32 { d16, d17, d18, d19 }, [r2]!
+
+ subs r3, r3, #4 @ 4 vectors for this set
+
+ @ calculate values for the 1st set
+ @ (the macro contains only NEON instructions, so the flags set by
+ @ the subs above are still the ones tested by the branch below)
+ MUL_MAT2x2_VEC2
+
+ ble .L_mainloopend_mat2x2
+
+.L_mainloop_mat2x2:
+ @ store the result for the current set
+ vst2.32 { d24, d25, d26, d27 }, [r0]!
+
+ @ load the next set of values
+ vld2.32 { d16, d17, d18, d19 }, [r2]!
+ subs r3, r3, #4
+
+ @ calculate values for the next set
+ MUL_MAT2x2_VEC2
+
+ bgt .L_mainloop_mat2x2 @ loop while r3 > 0, i.e. while we have at least another 4 vectors (8 floats) to process
+
+.L_mainloopend_mat2x2:
+ @ the last iteration for this call
+ @ store the result for the last set
+ vst2.32 { d24, d25, d26, d27 }, [r0]!
+
+.L_check_mat2x2:
+ @ check if anything left to process at the end of the input array
+ cmp r4, #0
+ ble .L_return_mat2x2
+
+.L_secondloop_mat2x2:
+ @ process the last few items left in the input array, one vector
+ @ at a time; only lane 0 of q8/q9 is loaded and only lane 0 of
+ @ the results is stored
+ vld2.32 { d16[0], d18[0] }, [r2]!
+
+ subs r4, r4, #1
+
+ @ calculate values
+ MUL_MAT2x2_VEC2
+
+ @ store the results
+ vst2.32 { d24[0], d26[0] }, [r0]!
+
+ bgt .L_secondloop_mat2x2
+
+.L_return_mat2x2:
+ @ return
+ pop {r4}
+ mov r0, #0
+ bx lr
+
+
+
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ A macro to load four vec3's into registers q8-q10.
+ @ The vectors are read from [r2], which is post-incremented, and
+ @ de-interleaved so that
+ @ q8=(x1,x2,x3,x4) q9=(y1,y2,y3,y4) q10=(z1,z2,z3,z4)
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ .macro LOAD_FOUR_VEC3
+ vld3.32 { d16, d18, d20 }, [r2]! @ vectors 1 and 2 -> low halves of q8-q10
+ vld3.32 { d17, d19, d21 }, [r2]! @ vectors 3 and 4 -> high halves of q8-q10
+ .endm
+
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ This macro multiplies the constant 3x3 matrix loaded into
+ @ registers d0-d5 by four vec3's that the above macro LOAD_FOUR_VEC3
+ @ loads (q8 = x's, q9 = y's, q10 = z's).
+ @ The results are returned in registers q11 (x components),
+ @ q12 (y components) and q13 (z components).
+ @ For each result component the coefficients applied to x, y and z
+ @ are taken from lanes [0], [1] and [2] of q0, q1 and q2 respectively.
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ .macro MUL_MAT3x3_VEC3
+
+ vmul.f32 q11, q8 , d0[0] @ res.x = q0[0]*x
+ vmla.f32 q11, q9 , d0[1] @ + q0[1]*y
+ vmla.f32 q11, q10, d1[0] @ + q0[2]*z
+
+ vmul.f32 q12, q8 , d2[0] @ res.y = q1[0]*x
+ vmla.f32 q12, q9 , d2[1] @ + q1[1]*y
+ vmla.f32 q12, q10, d3[0] @ + q1[2]*z
+
+ vmul.f32 q13, q8 , d4[0] @ res.z = q2[0]*x
+ vmla.f32 q13, q9 , d4[1] @ + q2[1]*y
+ vmla.f32 q13, q10, d5[0] @ + q2[2]*z
+
+ .endm
+
+
+
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ A macro to store the resulting vec3's that were returned in
+ @ registers q11 to q13 in the above macro MUL_MAT3x3_VEC3.
+ @ The components are re-interleaved into (x,y,z) order and written
+ @ to [r0], which is post-incremented.
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ .macro STORE_FOUR_VEC3
+
+ vst3.32 { d22, d24, d26 }, [r0]! @ results 1 and 2 (low halves of q11-q13)
+ vst3.32 { d23, d25, d27 }, [r0]! @ results 3 and 4 (high halves of q11-q13)
+
+ .endm
+
+
+
+
+ .align 2
+ .global mulcmatvec_cm3x3f_v3f_neon
+ .thumb
+ .thumb_func
+
+mulcmatvec_cm3x3f_v3f_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t mulcmatvec_cm3x3f_v3f ( arm_vec3f_t * dst,
+ @ const arm_mat3x3f_t * cst,
+ @ arm_vec3f_t * src,
+ @ unsigned int count)
+ @
+ @ r0: *dst & current dst entry's address
+ @ (this register is updated and moved to the next entry
+ @ after every store operation)
+ @ r1: *cst, memory pointer to where the constant matrix is kept
+ @ r2: *src & current src entry's address
+ @ r3: int count & the number of items in the input array that
+ @ can be processed in chunks of 4 vectors
+ @
+ @ r4: the number of items that are left to be processed at the
+ @ end of the input array
+ @
+ @ Always returns 0 in r0.
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ push { r4 }
+ and r4, r3, #3 @ r4 = count % 4;
+ sub r3, r3, r4 @ count = count - r4; r4 is what's left to be processed after the main loop
+
+ @ First we load the constant 3x3 matrix, then each time we load
+ @ four vectors of 3-floats, multiply each vector with the matrix,
+ @ finally store the resulting vectors in the destination memory
+ @ address, and move on to the next four vectors.
+
+ @ load the constant matrix into q0-q2
+ @ (of the 9 floats in memory, elements 0,3,6 land in q0,
+ @ elements 1,4,7 in q1 and elements 2,5,8 in q2)
+ vld3.32 { d0 , d2 , d4 }, [r1]!
+ vld3.32 { d1[0], d3[0], d5[0] }, [r1]
+
+ cmp r3, #0
+ beq .L_check_mat3x3 @ fewer than 4 vectors: only the one-at-a-time loop runs
+
+
+ @ load the 1st set of values
+ LOAD_FOUR_VEC3
+ subs r3, r3, #4 @ 4 for this set
+
+ @ calculate values for the 1st set
+ @ (the macro contains only NEON instructions, so the flags set by
+ @ the subs above are still the ones tested by the branch below)
+ MUL_MAT3x3_VEC3
+
+ ble .L_mainloopend_mat3x3
+
+.L_mainloop_mat3x3:
+ @ store the result for the current set
+ STORE_FOUR_VEC3
+
+ @ load the next set of values
+ LOAD_FOUR_VEC3
+ subs r3, r3, #4
+
+ @ calculate values for the next set
+ MUL_MAT3x3_VEC3
+
+ bgt .L_mainloop_mat3x3 @ loop while r3 > 0, i.e. while we have at least another 4 vectors (12 floats) to process
+
+.L_mainloopend_mat3x3:
+ @ the last iteration for this call
+ @ store the result for the last set
+ STORE_FOUR_VEC3
+
+.L_check_mat3x3:
+ @ check if anything left to process at the end of the input array
+ cmp r4, #0
+ ble .L_return_mat3x3
+
+.L_secondloop_mat3x3:
+ @ process the last few items left in the input array, one vector
+ @ at a time; only lane 0 of q8-q10 is loaded and only lane 0 of
+ @ the results is stored
+ vld3.32 { d16[0], d18[0], d20[0] }, [r2]!
+
+ subs r4, r4, #1
+
+ MUL_MAT3x3_VEC3
+
+ vst3.32 { d22[0], d24[0], d26[0] }, [r0]!
+
+ bgt .L_secondloop_mat3x3
+
+.L_return_mat3x3:
+ @ return
+ pop { r4 }
+ mov r0, #0
+ bx lr
+
+
+
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ A macro to load four vec4's into registers q8-q11.
+ @ This macro uses r2 (the third parameter in
+ @ mulcmatvec_cm4x4f_v4f_neon) as the address register; r2 is
+ @ post-incremented. The vectors are de-interleaved so that
+ @ q8=(x1..x4) q9=(y1..y4) q10=(z1..z4) q11=(w1..w4)
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ .macro LOAD_FOUR_VEC4
+ vld4.32 { d16, d18, d20, d22 }, [r2]! @ vectors 1 and 2 -> low halves of q8-q11
+ vld4.32 { d17, d19, d21, d23 }, [r2]! @ vectors 3 and 4 -> high halves of q8-q11
+ .endm
+
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ This macro multiplies the constant 4x4 matrix that is loaded
+ @ in mulcmatvec_cm4x4f_v4f_neon (into q0-q3) by four vec4's that
+ @ are loaded in the above macro LOAD_FOUR_VEC4 (q8 = x's, q9 = y's,
+ @ q10 = z's, q11 = w's).
+ @ The resulting four vectors are returned in registers q12 to q15:
+ @ q12 = x components, q13 = y, q14 = z, q15 = w.
+ @ For each result component the coefficients applied to x, y, z and
+ @ w are lanes [0]..[3] of q0, q1, q2 and q3 respectively.
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ .macro MUL_MAT4x4_VEC4
+
+ vmul.f32 q12, q8 , d0[0] @ res.x = q0[0]*x
+ vmla.f32 q12, q9 , d0[1] @ + q0[1]*y
+ vmla.f32 q12, q10, d1[0] @ + q0[2]*z
+ vmla.f32 q12, q11, d1[1] @ + q0[3]*w
+
+ vmul.f32 q13, q8 , d2[0] @ res.y = q1[0]*x
+ vmla.f32 q13, q9 , d2[1] @ + q1[1]*y
+ vmla.f32 q13, q10, d3[0] @ + q1[2]*z
+ vmla.f32 q13, q11, d3[1] @ + q1[3]*w
+
+ vmul.f32 q14, q8 , d4[0] @ res.z = q2[0]*x
+ vmla.f32 q14, q9 , d4[1] @ + q2[1]*y
+ vmla.f32 q14, q10, d5[0] @ + q2[2]*z
+ vmla.f32 q14, q11, d5[1] @ + q2[3]*w
+
+ vmul.f32 q15, q8 , d6[0] @ res.w = q3[0]*x
+ vmla.f32 q15, q9 , d6[1] @ + q3[1]*y
+ vmla.f32 q15, q10, d7[0] @ + q3[2]*z
+ vmla.f32 q15, q11, d7[1] @ + q3[3]*w
+
+ .endm
+
+
+
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ This macro stores the results from the above macro MUL_MAT4x4_VEC4
+ @ from registers q12-q15 into the destination memory (r0) which is
+ @ the first parameter of mulcmatvec_cm4x4f_v4f_neon().
+ @ The components are re-interleaved into (x,y,z,w) order and r0 is
+ @ post-incremented.
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ .macro STORE_FOUR_VEC4
+
+ vst4.32 { d24, d26, d28, d30 }, [r0]! @ results 1 and 2 (low halves of q12-q15)
+ vst4.32 { d25, d27, d29, d31 }, [r0]! @ results 3 and 4 (high halves of q12-q15)
+
+ .endm
+
+
+
+
+ .align 2
+ .global mulcmatvec_cm4x4f_v4f_neon
+ .thumb
+ .thumb_func
+
+mulcmatvec_cm4x4f_v4f_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t mulcmatvec_cm4x4f_v4f ( arm_vec4f_t * dst,
+ @ const arm_mat4x4f_t * cst,
+ @ arm_vec4f_t * src,
+ @ unsigned int count)
+ @
+ @ r0: *dst & current dst entry's address
+ @ (this register is updated and moved to the next entry
+ @ after every store operation)
+ @ r1: *cst, pointer to memory where the constant matrix is kept
+ @ r2: *src & current src entry's address
+ @ r3: int count & the number of items in the input array that
+ @ can be processed in chunks of 4 vectors
+ @
+ @ r4: the number of items that are left to be processed at the
+ @ end of the input array
+ @
+ @ Always returns 0 in r0.
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ push {r4}
+ and r4, r3, #3 @ r4 = count % 4;
+ sub r3, r3, r4 @ count = count - r4; r4 is what's left to be processed after the main loop
+
+ @ First we load the constant 4x4 matrix, then each time we load
+ @ four vectors of 4-floats, multiply each vector with the matrix,
+ @ finally store the resulting vectors in the destination memory
+ @ address, and move on to the next four vectors.
+
+ @ load the constant matrix into q0-q3
+ @ (of the 16 floats in memory, elements 0,4,8,12 land in q0,
+ @ 1,5,9,13 in q1, 2,6,10,14 in q2 and 3,7,11,15 in q3)
+ vld4.32 { d0, d2, d4, d6 }, [r1]!
+ vld4.32 { d1, d3, d5, d7 }, [r1]
+
+ cmp r3, #0
+ beq .L_check_mat4x4 @ fewer than 4 vectors: only the one-at-a-time loop runs
+
+ @ load the 1st set of values
+ LOAD_FOUR_VEC4
+ subs r3, r3, #4
+
+ @ calculate values for the 1st set
+ @ (the macro contains only NEON instructions, so the flags set by
+ @ the subs above are still the ones tested by the branch below)
+ MUL_MAT4x4_VEC4
+
+ ble .L_mainloopend_mat4x4
+
+.L_mainloop_mat4x4:
+ @ store the result for the current set
+ STORE_FOUR_VEC4
+
+ @ load the next set of values
+ LOAD_FOUR_VEC4
+ subs r3, r3, #4
+
+ @ calculate values for the next set
+ MUL_MAT4x4_VEC4
+
+ bgt .L_mainloop_mat4x4 @ loop while r3 > 0, i.e. while we have at least another 4 vectors (16 floats) to process
+
+.L_mainloopend_mat4x4:
+ @ the last iteration for this call
+ @ store the result for the last set
+ STORE_FOUR_VEC4
+
+.L_check_mat4x4:
+ @ check if anything left to process at the end of the input array
+ cmp r4, #0
+ ble .L_return_mat4x4
+
+.L_secondloop_mat4x4:
+ @ process the last few items left in the input array, one vector
+ @ at a time; only lane 0 of q8-q11 is loaded and only lane 0 of
+ @ the results is stored
+ vld4.32 { d16[0], d18[0], d20[0], d22[0] }, [r2]!
+
+ subs r4, r4, #1
+
+ @ calculate values
+ MUL_MAT4x4_VEC4
+
+ @ store the results
+ vst4.32 { d24[0], d26[0], d28[0], d30[0] }, [r0]!
+
+ bgt .L_secondloop_mat4x4
+
+.L_return_mat4x4:
+ @ return
+ pop {r4}
+ mov r0, #0
+ bx lr
--- /dev/null
+@
+@ Copyright 2011-12 ARM Limited
+@
+@ Licensed under the Apache License, Version 2.0 (the "License");
+@ you may not use this file except in compliance with the License.
+@ You may obtain a copy of the License at
+@
+@ http://www.apache.org/licenses/LICENSE-2.0
+@
+@ Unless required by applicable law or agreed to in writing, software
+@ distributed under the License is distributed on an "AS IS" BASIS,
+@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ See the License for the specific language governing permissions and
+@ limitations under the License.
+@
+
+@
+@ NE10 Library : math/NE10_mulmat.asm.s
+@
--- /dev/null
+/*
+ * Copyright 2011-12 ARM Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NE10 Library : math/NE10_mulmat.c
+ */
+
+#include "NE10_types.h"
+#include "macros.h"
+
+#include <assert.h>
+
+/*
+ * C reference implementation: for every entry, multiplies the 2x2 matrix
+ * src1[ itr ] by src2[ itr ] and stores the product in dst[ itr ].
+ *
+ * The X1 macros name the elements of the src1 entry and the X2 macros the
+ * elements of the src2 entry, by column (c1, c2) and row (r1, r2).
+ * The loop and the index `itr` are supplied by NE10_X_OPERATION_FLOAT_C,
+ * which is not defined in this file (presumably macros.h - TODO confirm).
+ *
+ * The product is built in a local and stored with a single assignment, so
+ * the result is also correct when dst is the same array as src1 or src2.
+ */
+ne10_result_t mulmat_2x2f_c (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count)
+{
+#define A1 src1[ itr ].c1.r1
+#define A2 src2[ itr ].c1.r1
+#define B1 src1[ itr ].c1.r2
+#define B2 src2[ itr ].c1.r2
+#define C1 src1[ itr ].c2.r1
+#define C2 src2[ itr ].c2.r1
+#define D1 src1[ itr ].c2.r2
+#define D2 src2[ itr ].c2.r2
+
+ NE10_X_OPERATION_FLOAT_C
+ (
+ {
+ /* accumulate into a temporary: every input element is read before dst[ itr ] changes */
+ ne10_mat2x2f_t res;
+
+ res.c1.r1 = (A1 * A2) + (C1 * B2);
+ res.c1.r2 = (B1 * A2) + (D1 * B2);
+
+ res.c2.r1 = (A1 * C2) + (C1 * D2);
+ res.c2.r2 = (B1 * C2) + (D1 * D2);
+
+ dst[ itr ] = res;
+ }
+ );
+
+#undef A1
+#undef A2
+#undef B1
+#undef B2
+#undef C1
+#undef C2
+#undef D1
+#undef D2
+}
+
+/*
+ * C reference implementation: for every entry, multiplies the 3x3 matrix
+ * src1[ itr ] by src2[ itr ] and stores the product in dst[ itr ].
+ *
+ * The X1 macros name the elements of the src1 entry and the X2 macros the
+ * elements of the src2 entry, by column (c1..c3) and row (r1..r3).
+ * The loop and the index `itr` are supplied by NE10_X_OPERATION_FLOAT_C,
+ * which is not defined in this file (presumably macros.h - TODO confirm).
+ *
+ * The product is built in a local and stored with a single assignment, so
+ * the result is also correct when dst is the same array as src1 or src2.
+ */
+ne10_result_t mulmat_3x3f_c (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count)
+{
+#define A1 src1[ itr ].c1.r1
+#define A2 src2[ itr ].c1.r1
+#define B1 src1[ itr ].c1.r2
+#define B2 src2[ itr ].c1.r2
+#define C1 src1[ itr ].c1.r3
+#define C2 src2[ itr ].c1.r3
+#define D1 src1[ itr ].c2.r1
+#define D2 src2[ itr ].c2.r1
+#define E1 src1[ itr ].c2.r2
+#define E2 src2[ itr ].c2.r2
+#define F1 src1[ itr ].c2.r3
+#define F2 src2[ itr ].c2.r3
+#define G1 src1[ itr ].c3.r1
+#define G2 src2[ itr ].c3.r1
+#define H1 src1[ itr ].c3.r2
+#define H2 src2[ itr ].c3.r2
+#define I1 src1[ itr ].c3.r3
+#define I2 src2[ itr ].c3.r3
+
+ NE10_X_OPERATION_FLOAT_C
+ (
+ {
+ /* accumulate into a temporary: every input element is read before dst[ itr ] changes */
+ ne10_mat3x3f_t res;
+
+ res.c1.r1 = (A1 * A2) + (D1 * B2) + (G1 * C2);
+ res.c1.r2 = (B1 * A2) + (E1 * B2) + (H1 * C2);
+ res.c1.r3 = (C1 * A2) + (F1 * B2) + (I1 * C2);
+
+ res.c2.r1 = (A1 * D2) + (D1 * E2) + (G1 * F2);
+ res.c2.r2 = (B1 * D2) + (E1 * E2) + (H1 * F2);
+ res.c2.r3 = (C1 * D2) + (F1 * E2) + (I1 * F2);
+
+ res.c3.r1 = (A1 * G2) + (D1 * H2) + (G1 * I2);
+ res.c3.r2 = (B1 * G2) + (E1 * H2) + (H1 * I2);
+ res.c3.r3 = (C1 * G2) + (F1 * H2) + (I1 * I2);
+
+ dst[ itr ] = res;
+ }
+ );
+
+#undef A1
+#undef A2
+#undef B1
+#undef B2
+#undef C1
+#undef C2
+#undef D1
+#undef D2
+#undef E1
+#undef E2
+#undef F1
+#undef F2
+#undef G1
+#undef G2
+#undef H1
+#undef H2
+#undef I1
+#undef I2
+}
+
+/*
+ * C reference implementation: for every entry, multiplies the 4x4 matrix
+ * src1[ itr ] by src2[ itr ] and stores the product in dst[ itr ].
+ *
+ * The X1 macros name the elements of the src1 entry and the X2 macros the
+ * elements of the src2 entry, by column (c1..c4) and row (r1..r4).
+ * The loop and the index `itr` are supplied by NE10_X_OPERATION_FLOAT_C,
+ * which is not defined in this file (presumably macros.h - TODO confirm).
+ *
+ * The product is built in a local and stored with a single assignment, so
+ * the result is also correct when dst is the same array as src1 or src2.
+ */
+ne10_result_t mulmat_4x4f_c (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count)
+{
+#define A1 src1[ itr ].c1.r1
+#define A2 src2[ itr ].c1.r1
+#define B1 src1[ itr ].c1.r2
+#define B2 src2[ itr ].c1.r2
+#define C1 src1[ itr ].c1.r3
+#define C2 src2[ itr ].c1.r3
+#define D1 src1[ itr ].c1.r4
+#define D2 src2[ itr ].c1.r4
+
+#define E1 src1[ itr ].c2.r1
+#define E2 src2[ itr ].c2.r1
+#define F1 src1[ itr ].c2.r2
+#define F2 src2[ itr ].c2.r2
+#define G1 src1[ itr ].c2.r3
+#define G2 src2[ itr ].c2.r3
+#define H1 src1[ itr ].c2.r4
+#define H2 src2[ itr ].c2.r4
+
+#define I1 src1[ itr ].c3.r1
+#define I2 src2[ itr ].c3.r1
+#define J1 src1[ itr ].c3.r2
+#define J2 src2[ itr ].c3.r2
+#define K1 src1[ itr ].c3.r3
+#define K2 src2[ itr ].c3.r3
+#define L1 src1[ itr ].c3.r4
+#define L2 src2[ itr ].c3.r4
+
+#define M1 src1[ itr ].c4.r1
+#define M2 src2[ itr ].c4.r1
+#define N1 src1[ itr ].c4.r2
+#define N2 src2[ itr ].c4.r2
+#define O1 src1[ itr ].c4.r3
+#define O2 src2[ itr ].c4.r3
+#define P1 src1[ itr ].c4.r4
+#define P2 src2[ itr ].c4.r4
+
+ NE10_X_OPERATION_FLOAT_C
+ (
+ {
+ /* accumulate into a temporary: every input element is read before dst[ itr ] changes */
+ ne10_mat4x4f_t res;
+
+ res.c1.r1 = (A1 * A2) + (E1 * B2) + (I1 * C2) + (M1 * D2);
+ res.c1.r2 = (B1 * A2) + (F1 * B2) + (J1 * C2) + (N1 * D2);
+ res.c1.r3 = (C1 * A2) + (G1 * B2) + (K1 * C2) + (O1 * D2);
+ res.c1.r4 = (D1 * A2) + (H1 * B2) + (L1 * C2) + (P1 * D2);
+
+ res.c2.r1 = (A1 * E2) + (E1 * F2) + (I1 * G2) + (M1 * H2);
+ res.c2.r2 = (B1 * E2) + (F1 * F2) + (J1 * G2) + (N1 * H2);
+ res.c2.r3 = (C1 * E2) + (G1 * F2) + (K1 * G2) + (O1 * H2);
+ res.c2.r4 = (D1 * E2) + (H1 * F2) + (L1 * G2) + (P1 * H2);
+
+ res.c3.r1 = (A1 * I2) + (E1 * J2) + (I1 * K2) + (M1 * L2);
+ res.c3.r2 = (B1 * I2) + (F1 * J2) + (J1 * K2) + (N1 * L2);
+ res.c3.r3 = (C1 * I2) + (G1 * J2) + (K1 * K2) + (O1 * L2);
+ res.c3.r4 = (D1 * I2) + (H1 * J2) + (L1 * K2) + (P1 * L2);
+
+ res.c4.r1 = (A1 * M2) + (E1 * N2) + (I1 * O2) + (M1 * P2);
+ res.c4.r2 = (B1 * M2) + (F1 * N2) + (J1 * O2) + (N1 * P2);
+ res.c4.r3 = (C1 * M2) + (G1 * N2) + (K1 * O2) + (O1 * P2);
+ res.c4.r4 = (D1 * M2) + (H1 * N2) + (L1 * O2) + (P1 * P2);
+
+ dst[ itr ] = res;
+ }
+ );
+
+#undef A1
+#undef A2
+#undef B1
+#undef B2
+#undef C1
+#undef C2
+#undef D1
+#undef D2
+#undef E1
+#undef E2
+#undef F1
+#undef F2
+#undef G1
+#undef G2
+#undef H1
+#undef H2
+#undef I1
+#undef I2
+#undef J1
+#undef J2
+#undef K1
+#undef K2
+#undef L1
+#undef L2
+#undef M1
+#undef M2
+#undef N1
+#undef N2
+#undef O1
+#undef O2
+#undef P1
+#undef P2
+}
--- /dev/null
+@
+@ Copyright 2011-12 ARM Limited
+@
+@ Licensed under the Apache License, Version 2.0 (the "License");
+@ you may not use this file except in compliance with the License.
+@ You may obtain a copy of the License at
+@
+@ http://www.apache.org/licenses/LICENSE-2.0
+@
+@ Unless required by applicable law or agreed to in writing, software
+@ distributed under the License is distributed on an "AS IS" BASIS,
+@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ See the License for the specific language governing permissions and
+@ limitations under the License.
+@
+
+@
+@ NE10 Library : math/NE10_mulmat.neon.s
+@
+
+
+
+
+ .text
+ .syntax unified
+
+.include "NE10header.s"
+
+
+
+
+ .balign 4
+ .global mulmat_2x2f_neon
+ .thumb
+ .thumb_func
+
+mulmat_2x2f_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t mulmat_2x2f(arm_mat2x2f_t * dst,
+ @ arm_mat2x2f_t * src1,
+ @ arm_mat2x2f_t * src2,
+ @ unsigned int count)
+ @
+ @ r0: *dst & current dst entry's address
+ @ r1: *src1 & current src1 entry's address
+ @ r2: *src2 & current src2 entry's address
+ @ r3: int count & the number of items in the input array that can be
+ @ processed in chunks of 4 matrices
+ @
+ @ r4: the number of items that are left to be processed at the end of
+ @ the input array
+ @
+ @ Always returns 0 in r0.
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ push {r4}
+ and r4, r3, #3 @ r4 = count % 4;
+ sub r3, r3, r4 @ count = count - r4; r4 is what's left to be processed after the main loop
+
+ cmp r3, #0
+ beq .L_check_mat2x2 @ fewer than 4 matrices: only the one-at-a-time loop runs
+
+ @ We load four 2x2 matrices at a time (two from src1 and two from
+ @ src2), multiply them to get two resulting 2x2 matrices, store
+ @ them in the destination and then move on to the next four matrices.
+ @
+ @ Each vld4 de-interleaves two matrices so that lane 0 of a
+ @ register belongs to the first matrix and lane 1 to the second:
+ @ src1: d0 = 1st float, d1 = 2nd, d2 = 3rd, d3 = 4th of each matrix
+ @ src2: d4 = 1st float, d5 = 2nd, d6 = 3rd, d7 = 4th of each matrix
+
+ @ load the 1st set of values
+ vld4.32 { d0, d1, d2, d3 }, [r1]!
+ vld4.32 { d4, d5, d6, d7 }, [r2]!
+ subs r3, r3, #4 @ 2 for this set, and 2 for the 2nd set
+
+ @ calculate values for the 1st set
+ @ (d16..d19 = 1st..4th float of the two resulting matrices)
+ vmul.f32 d16, d0, d4
+ vmul.f32 d17, d1, d4
+ vmul.f32 d18, d0, d6
+ vmul.f32 d19, d1, d6
+
+ vmla.f32 d16, d2, d5
+ vmla.f32 d17, d3, d5
+ vmla.f32 d18, d2, d7
+ vmla.f32 d19, d3, d7
+
+
+ @ load the 2nd set of values
+ vld4.32 { d0, d1, d2, d3 }, [r1]!
+ vld4.32 { d4, d5, d6, d7 }, [r2]!
+
+ @ the flags tested here are still those of the subs above; the
+ @ NEON instructions in between do not change them
+ ble .L_mainloopend_mat2x2
+
+.L_mainloop_mat2x2:
+ @ store the result for the 1st/next (e.g. 3rd) set
+ vst4.32 { d16, d17, d18, d19}, [r0]!
+
+ @ calculate values for the 2nd/next (e.g. 3rd) set
+ vmul.f32 d16, d0, d4
+ vmul.f32 d17, d1, d4
+ vmul.f32 d18, d0, d6
+ vmul.f32 d19, d1, d6
+
+ vmla.f32 d16, d2, d5
+ vmla.f32 d17, d3, d5
+ vmla.f32 d18, d2, d7
+ vmla.f32 d19, d3, d7
+
+ @ load the next (e.g. 3rd) set of values
+ subs r3, r3, #2
+ vld4.32 { d0, d1, d2, d3 }, [r1]!
+ vld4.32 { d4, d5, d6, d7 }, [r2]!
+
+
+ bgt .L_mainloop_mat2x2 @ loop while r3 > 0, i.e. while there is at least one more set of 2 matrices (8 floats) per source to load
+
+.L_mainloopend_mat2x2:
+ @ the last iteration for this call
+ @ store the result for the set of values before the last one (e.g 2nd set)
+ vst4.32 { d16, d17, d18, d19}, [r0]!
+
+ @ calculate values for the last (e.g. 3rd) set
+ vmul.f32 d16, d0, d4
+ vmul.f32 d17, d1, d4
+ vmul.f32 d18, d0, d6
+ vmul.f32 d19, d1, d6
+
+ vmla.f32 d16, d2, d5
+ vmla.f32 d17, d3, d5
+ vmla.f32 d18, d2, d7
+ vmla.f32 d19, d3, d7
+
+ @ store the result for the last (e.g. 3rd) set
+ vst4.32 { d16, d17, d18, d19}, [r0]!
+
+
+.L_check_mat2x2:
+ @ check if anything left to process at the end of the input array
+ cmp r4, #0
+ ble .L_return_mat2x2
+
+.L_secondloop_mat2x2:
+ @ process the last few items left in the input array, one matrix
+ @ pair at a time; only lane 0 of d0-d7 is loaded and only lane 0
+ @ of the results is stored
+ vld4.32 { d0[0], d1[0], d2[0], d3[0] }, [r1]!
+ vld4.32 { d4[0], d5[0], d6[0], d7[0] }, [r2]!
+
+ subs r4, r4, #1
+
+ @ calculate values
+ vmul.f32 d16, d0, d4
+ vmul.f32 d17, d1, d4
+ vmul.f32 d18, d0, d6
+ vmul.f32 d19, d1, d6
+
+ vmla.f32 d16, d2, d5
+ vmla.f32 d17, d3, d5
+ vmla.f32 d18, d2, d7
+ vmla.f32 d19, d3, d7
+
+ vst4.32 { d16[0], d17[0], d18[0], d19[0] }, [r0]!
+
+ bgt .L_secondloop_mat2x2
+
+.L_return_mat2x2:
+ @ return
+ pop {r4}
+ mov r0, #0
+ bx lr
+
+
+
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ A macro to load four 3x3 matrices, two from the first source which
+ @ according to the function signatures is src1 (r1) and
+ @ another two from the second source which is src2 (r2).
+ @ Both address registers are post-incremented.
+ @
+ @ After the vtrn instructions lane 0 of every D register below
+ @ belongs to the first matrix of a source and lane 1 to the second.
+ @ Numbering the nine floats of a matrix 1..9 in memory order:
+ @ src1: d0=1st d4=2nd d1=3rd d5=4th d2=5th d6=6th d3=7th d7=8th d8=9th
+ @ src2: d16=1st d20=2nd d17=3rd d21=4th d18=5th d22=6th d19=7th d23=8th d9=9th
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ .macro LOAD_3x3MATS
+
+ # load two 3x3 matrices from src1 (8 floats + 1 float each)
+ vld1.32 { q0-q1 }, [r1]!
+ vld1.32 { d8[0] }, [r1]!
+ vld1.32 { q2-q3 }, [r1]!
+ vld1.32 { d8[1] }, [r1]!
+
+ # load two 3x3 matrices from src2 (8 floats + 1 float each)
+ vld1.32 { q8-q9 }, [r2]!
+ vld1.32 { d9[0] }, [r2]!
+ vld1.32 { q10-q11 }, [r2]!
+ vld1.32 { d9[1] }, [r2]!
+
+
+ # rearrange them both so that matching elements of the two
+ # matrices share one D register
+ vtrn.32 q0, q2
+ vtrn.32 q1, q3
+
+ vtrn.32 q8, q10
+ vtrn.32 q9, q11
+
+ .endm
+
+
+
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ This macro multiplies two pairs of 3x3 matrices that were
+ @ loaded using the above LOAD_3x3MATS macro in registers q0-q3,
+ @ q8-q11, d8 and d9.
+ @ The two resulting matrices are returned in q12, q13, q14, q15, & d10
+ @ (lane 0 = first product, lane 1 = second product). Numbering the
+ @ nine floats of a result 1..9 in memory order, they end up in
+ @ d24, d28, d25, d29, d26, d30, d27, d31 and d10 respectively.
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ .macro MULTIPLY_3x3MATS
+
+ @ Element names below follow memory order (a = 1st float ... i = 9th);
+ @ the first register holds the src1 element, the second the src2 one.
+ @ a = d0 & d16
+ @ b = d4 & d20
+ @ c = d1 & d17
+ @ d = d5 & d21
+ @ e = d2 & d18
+ @ f = d6 & d22
+ @ g = d3 & d19
+ @ h = d7 & d23
+ @ i = d8 & d9
+
+ @ first term of each of the nine result elements
+ vmul.f32 d24, d0, d16
+ vmul.f32 d28, d4, d16
+ vmul.f32 d25, d1, d16
+ vmul.f32 d29, d0, d21
+ vmul.f32 d26, d4, d21
+ vmul.f32 d30, d1, d21
+ vmul.f32 d27, d0, d19
+ vmul.f32 d31, d4, d19
+ vmul.f32 d10, d1, d19
+
+ @ accumulate the second term
+ vmla.f32 d24, d5, d20
+ vmla.f32 d28, d2, d20
+ vmla.f32 d25, d6, d20
+ vmla.f32 d29, d5, d18
+ vmla.f32 d26, d2, d18
+ vmla.f32 d30, d6, d18
+ vmla.f32 d27, d5, d23
+ vmla.f32 d31, d2, d23
+ vmla.f32 d10, d6, d23
+
+ @ accumulate the third term
+ vmla.f32 d24, d3, d17
+ vmla.f32 d28, d7, d17
+ vmla.f32 d25, d8, d17
+ vmla.f32 d29, d3, d22
+ vmla.f32 d26, d7, d22
+ vmla.f32 d30, d8, d22
+ vmla.f32 d27, d3, d9
+ vmla.f32 d31, d7, d9
+ vmla.f32 d10, d8, d9
+
+ .endm
+
+
+
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ A macro to store the two resulting 3x3 matrices from
+ @ the above MULTIPLY_3x3MATS macro (q12-q15, & d10 are stored).
+ @ The destination address register r0 is post-incremented.
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ .macro STORE_3x3MATS
+
+ # rearrange them both: afterwards q12-q13 hold the first eight
+ # floats of the first result and q14-q15 those of the second
+ vtrn.32 q12, q14
+ vtrn.32 q13, q15
+
+ # store two 3x3 matrices to dst (8 floats + the 9th from d10 each)
+ vst1.32 { q12-q13 }, [r0]!
+ vst1.32 { d10[0] }, [r0]!
+ vst1.32 { q14-q15 }, [r0]!
+ vst1.32 { d10[1] }, [r0]!
+
+ .endm
+
+
+
+
+ .align 2
+ .global mulmat_3x3f_neon
+ .thumb
+ .thumb_func
+mulmat_3x3f_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t mulmat_3x3f(arm_mat3x3f_t * dst,
+ @ arm_mat3x3f_t * src1,
+ @ arm_mat3x3f_t * src2,
+ @ unsigned int count)
+ @
+ @ r0: *dst & current dst entry's address
+ @ r1: *src1 & current src1 entry's address
+ @ r2: *src2 & current src2 entry's address
+ @ r3: int count & the number of items in the input array that can be
+ @ processed in chunks of 4 matrices
+ @
+ @ r4: the number of items that are left to be processed at the end of
+ @ the input array
+ @
+ @ Always returns 0 in r0.
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ push { r4 }
+ vpush { d8, d9, d10 } @ d8-d10 are used by the macros below and restored before returning
+ and r4, r3, #3 @ r4 = count % 4;
+ sub r3, r3, r4 @ count = count - r4; r4 is what's left to be processed after the main loop
+
+ cmp r3, #0
+ beq .L_check_mat3x3 @ fewer than 4 matrices: only the one-at-a-time loop runs
+
+ @ load the 1st set of values
+ LOAD_3x3MATS
+ subs r3, r3, #4 @ 2 for this set, and 2 for the 2nd set
+
+ @ calculate values for the 1st set
+ MULTIPLY_3x3MATS
+
+ @ load the 2nd set of values
+ LOAD_3x3MATS
+ @ the flags tested here are still those of the subs above; the
+ @ macros in between contain only NEON instructions
+ ble .L_mainloopend_mat3x3
+
+.L_mainloop_mat3x3:
+ @ store the result for the 1st/next (e.g. 3rd) set
+ STORE_3x3MATS
+
+ @ calculate values for the 2nd/next (e.g. 3rd) set
+ MULTIPLY_3x3MATS
+
+ @ load the next (e.g. 3rd) set of values
+ LOAD_3x3MATS
+
+ subs r3, r3, #2
+
+ bgt .L_mainloop_mat3x3 @ loop while r3 > 0, i.e. while there is at least one more set of 2 matrices (18 floats) per source to load
+
+.L_mainloopend_mat3x3:
+ @ the last iteration for this call
+ @ store the result for the set of values before the last one (e.g 2nd set)
+ STORE_3x3MATS
+
+ @ calculate values for the last (e.g. 3rd) set
+ MULTIPLY_3x3MATS
+
+ @ store the result for the last (e.g. 3rd) set
+ STORE_3x3MATS
+
+.L_check_mat3x3:
+ @ check if anything left to process at the end of the input array
+ cmp r4, #0
+ ble .L_return_mat3x3
+
+.L_secondloop_mat3x3:
+ @ process the last few items left in the input array
+ @ load one matrix from each source; q2, q3, q10, q11 and lane 1 of
+ @ d8/d9 are not reloaded here, so only lane 0 of each D register
+ @ is meaningful after the vtrn instructions below
+ vld1.32 { q0-q1 }, [r1]!
+ vld1.32 { d8[0] }, [r1]!
+ vld1.32 { q8-q9 }, [r2]!
+ vld1.32 { d9[0] }, [r2]!
+
+ vtrn.32 q0, q2
+ vtrn.32 q1, q3
+
+ vtrn.32 q8, q10
+ vtrn.32 q9, q11
+
+ subs r4, r4, #1
+
+ @ calculate values for this matrix pair (result in lane 0)
+ MULTIPLY_3x3MATS
+
+ @ store the result: after the vtrn, q12-q13 hold the first eight
+ @ floats of the lane-0 product and d10[0] the ninth
+ vtrn.32 q12, q14
+ vtrn.32 q13, q15
+
+ vst1.32 { q12-q13 }, [r0]!
+ vst1.32 { d10[0] }, [r0]!
+
+
+ bgt .L_secondloop_mat3x3
+
+.L_return_mat3x3:
+ @ return
+ vpop { d8, d9, d10 }
+ pop { r4 }
+ mov r0, #0
+ bx lr
+
+
+
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ A macro to load a pair of 4x4 matrices: the one from src1 (r1)
+ @ goes into registers q8-q11 and the one from src2 (r2) into
+ @ registers q0-q3. Each Q register receives four consecutive
+ @ floats; both address registers are post-incremented.
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ .macro LOAD_4x4MATS
+
+ # load a 4x4 matrix from src1
+ vld1.32 { q8-q9 }, [r1]!
+ vld1.32 {q10-q11}, [r1]!
+
+ # load a 4x4 matrix from src2
+ vld1.32 {q0-q1}, [r2]!
+ vld1.32 {q2-q3}, [r2]!
+ .endm
+
+
+
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ This macro multiplies the two 4x4 matrices loaded in the
+ @ above LOAD_4x4MATS macro and returns the resulting 4x4
+ @ matrix in q12-q15.
+ @ Result register q(12+n) is the sum of q8, q9, q10 and q11 (the
+ @ four 4-float groups of the src1 matrix), each scaled by lane
+ @ [0], [1], [2] and [3] of q(n) (the n-th 4-float group of the
+ @ src2 matrix), for n = 0..3.
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ .macro MULTIPLY_4x4MATS
+
+ @ q8 scaled by lane [0] of q0..q3
+ vmul.f32 q12, q8, d0[0]
+ vmul.f32 q13, q8, d2[0]
+ vmul.f32 q14, q8, d4[0]
+ vmul.f32 q15, q8, d6[0]
+
+ @ + q9 scaled by lane [1] of q0..q3
+ vmla.f32 q12, q9, d0[1]
+ vmla.f32 q13, q9, d2[1]
+ vmla.f32 q14, q9, d4[1]
+ vmla.f32 q15, q9, d6[1]
+
+
+ @ + q10 scaled by lane [2] of q0..q3
+ vmla.f32 q12, q10, d1[0]
+ vmla.f32 q13, q10, d3[0]
+ vmla.f32 q14, q10, d5[0]
+ vmla.f32 q15, q10, d7[0]
+
+ @ + q11 scaled by lane [3] of q0..q3
+ vmla.f32 q12, q11, d1[1]
+ vmla.f32 q13, q11, d3[1]
+ vmla.f32 q14, q11, d5[1]
+ vmla.f32 q15, q11, d7[1]
+
+ .endm
+
+
+
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ This macro stores the resulting 4x4 matrix which is
+ @ returned by the above MULTIPLY_4x4MATS macro from registers
+ @ q12-q15 into the dst (r0).
+ @ r0 is post-incremented by each store, so after the macro it points
+ @ just past the 16 floats that were written.
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ .macro STORE_4x4MATS
+
+ # store one 4x4 matrix (q12-q15, 16 floats) to dst
+ vst1.32 { q12-q13 }, [r0]!
+ vst1.32 { q14-q15 }, [r0]!
+
+ .endm
+
+
+
+
+ .align 2
+ .global mulmat_4x4f_neon
+ .thumb
+ .thumb_func
+mulmat_4x4f_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t mulmat_4x4f(arm_mat4x4f_t * dst,
+ @ arm_mat4x4f_t * src1,
+ @ arm_mat4x4f_t * src2,
+ @ unsigned int count)
+ @
+ @ Multiplies `count` pairs of 4x4 float matrices: every matrix read
+ @ from src1 is combined with the matching matrix from src2 by
+ @ MULTIPLY_4x4MATS and the 16-float result is written to dst.
+ @ Always returns 0 in r0.
+ @
+ @ r0: *dst & current dst entry's address
+ @ r1: *src1 & current src1 entry's address
+ @ r2: *src2 & current src2 entry's address
+ @ r3: int count; then (count - count % 4), the number of matrices
+ @ handled by the pipelined main loop
+ @
+ @ r4: count % 4, the number of matrices that are left to be processed
+ @ one by one at the end of the input array
+ @
+ @ The main loop is software pipelined: the operands for the next
+ @ product are loaded right after the current product has been
+ @ computed, and every product is stored at the start of the following
+ @ step. The conditional branches consume flags set by an earlier
+ @ `subs`; the macro invocations in between do not touch the flags.
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ push {r4}
+ and r4, r3, #3 @ r4 = count % 4;
+ sub r3, r3, r4 @ count = count - r4; r4 matrices are left for the second loop
+
+ cmp r3, #0
+ beq .L_check_mat4x4 @ fewer than 4 matrices: only the second loop runs
+
+ @ load the 1st set of values
+ LOAD_4x4MATS
+
+ subs r3, r3, #2 @ two sets are loaded before the loop is entered; flags are used by the ble below
+
+ @ calculate values for the 1st set
+ MULTIPLY_4x4MATS
+
+ @ load the 2nd set of values
+ LOAD_4x4MATS
+
+ ble .L_mainloopend_mat4x4 @ skip the loop when no further set has to be loaded
+
+.L_mainloop_mat4x4:
+ @ store the result for the 1st/next (e.g. 3rd) set
+ STORE_4x4MATS
+
+ @ calculate values for the 2nd/next (e.g. 3rd) set
+ MULTIPLY_4x4MATS
+
+ @ load the next (e.g. 3rd) set of values
+ subs r3, r3, #1 @ one more set consumed; flags are used by the bgt below
+ LOAD_4x4MATS
+
+ bgt .L_mainloop_mat4x4 @ loop while r3 > 0, i.e. while another matrix pair still has to be loaded
+
+.L_mainloopend_mat4x4:
+ @ the last iteration for this call
+ @ store the result for the set of values before the last one (e.g 2nd set)
+ STORE_4x4MATS
+
+ @ calculate values for the last (e.g. 3rd) set
+ MULTIPLY_4x4MATS
+
+ @ store the result for the last (e.g. 3rd) set
+ STORE_4x4MATS
+
+.L_check_mat4x4:
+ @ check if anything left to process at the end of the input array
+ cmp r4, #0
+ ble .L_return_mat4x4
+
+.L_secondloop_mat4x4:
+ @ process the last few items left in the input array, one matrix pair per pass
+ LOAD_4x4MATS
+
+ subs r4, r4, #1 @ flags are used by the bgt at the end of the loop
+
+ @ calculate values
+ MULTIPLY_4x4MATS
+
+ @ store the results
+ STORE_4x4MATS
+
+ bgt .L_secondloop_mat4x4
+
+.L_return_mat4x4:
+ @ return
+ pop {r4}
+ mov r0, #0
+ bx lr
--- /dev/null
+@
+@ Copyright 2011-12 ARM Limited
+@
+@ Licensed under the Apache License, Version 2.0 (the "License");
+@ you may not use this file except in compliance with the License.
+@ You may obtain a copy of the License at
+@
+@ http://www.apache.org/licenses/LICENSE-2.0
+@
+@ Unless required by applicable law or agreed to in writing, software
+@ distributed under the License is distributed on an "AS IS" BASIS,
+@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ See the License for the specific language governing permissions and
+@ limitations under the License.
+@
+
+@
+@ NE10 Library : math/NE10_normalize.asm.s
+@
+
+ .text
+ .syntax unified
+
+.include "NE10header.s"
+
+ .balign 4
+ .global normalize_vec2f_asm
+ .thumb
+ .thumb_func
+
+normalize_vec2f_asm:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t normalize_vec2f(arm_vec2f_t * dst,
+ @ arm_vec2f_t * src, unsigned int count)
+ @
+ @ Divides every 2D vector of src by its own length and writes the
+ @ result to dst. The arrays are walked from the last item down to
+ @ the first one.
+ @
+ @ r0: *dst, then the address just past the next item to be written
+ @ r1: *src, then the address just past the next item to be read
+ @ r2: count, used as the down-counting loop counter
+ @
+ @ s0-s1: x, y s2: x*x + y*y s3: length
+ @
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ cbz r2, .LoopEndVec2F @ nothing to do for count == 0
+ add r1, r1, r2, lsl #3 @ r1 = src + count * 8 (end of the input)
+ add r0, r0, r2, lsl #3 @ r0 = dst + count * 8 (end of the output)
+
+.LoopBeginVec2F:
+ vldmdb r1!, {s0-s1} @ step back one item: s0 = x, s1 = y
+ vmul.f32 s2, s0, s0 @ s2 = x*x
+ vmla.f32 s2, s1, s1 @ s2 = x*x + y*y
+ vsqrt.f32 s3, s2 @ s3 = length
+ vdiv.f32 s0, s0, s3 @ s0 = x / length
+ vdiv.f32 s1, s1, s3 @ s1 = y / length
+ vstmdb r0!, {s0-s1} @ step back one item and write the result
+ subs r2, r2, #1 @ one item less to go
+ bne .LoopBeginVec2F @ repeat until the counter (r2) reaches zero
+.LoopEndVec2F:
+ mov r0, NE10_OK @ Return NE10_OK
+ bx lr
+
+
+
+
+ .balign 4
+ .global normalize_vec3f_asm
+ .thumb
+ .thumb_func
+
+normalize_vec3f_asm:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t normalize_vec3f(arm_vec3f_t * dst,
+ @ arm_vec3f_t * src, unsigned int count)
+ @
+ @ Divides every 3D vector of src by its own length and writes the
+ @ result to dst. The arrays are walked from the last item down to
+ @ the first one.
+ @
+ @ r0: *dst, then the address just past the next item to be written
+ @ r1: *src, then the address just past the next item to be read
+ @ r2: count, used as the down-counting loop counter
+ @
+ @ s0-s2: x, y, z s3: x*x + y*y + z*z s4: length
+ @
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ cbz r2, .LoopEndVec3F @ nothing to do for count == 0
+ add r1, r1, r2, lsl #2 @ count * 4 ...
+ add r1, r1, r2, lsl #3 @ ... plus count * 8: r1 = src + count * 12
+ add r0, r0, r2, lsl #2 @ count * 4 ...
+ add r0, r0, r2, lsl #3 @ ... plus count * 8: r0 = dst + count * 12
+
+.LoopBeginVec3F:
+ vldmdb r1!, {s0-s2} @ step back one item: s0 = x, s1 = y, s2 = z
+ vmul.f32 s3, s0, s0 @ s3 = x*x
+ vmla.f32 s3, s1, s1 @ s3 = x*x + y*y
+ vmla.f32 s3, s2, s2 @ s3 = x*x + y*y + z*z
+ vsqrt.f32 s4, s3 @ s4 = length
+ vdiv.f32 s0, s0, s4 @ s0 = x / length
+ vdiv.f32 s1, s1, s4 @ s1 = y / length
+ vdiv.f32 s2, s2, s4 @ s2 = z / length
+ vstmdb r0!, {s0-s2} @ step back one item and write the result
+ subs r2, r2, #1 @ one item less to go
+ bne .LoopBeginVec3F @ repeat until the counter (r2) reaches zero
+.LoopEndVec3F:
+ mov r0, NE10_OK @ Return NE10_OK
+ bx lr
+
+
+
+
+ .balign 4
+ .global normalize_vec4f_asm
+ .thumb
+ .thumb_func
+
+normalize_vec4f_asm:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t normalize_vec4f(arm_vec4f_t * dst,
+ @ arm_vec4f_t * src, unsigned int count)
+ @
+ @ Divides every 4D vector of src by its own length and writes the
+ @ result to dst. The arrays are walked from the last item down to
+ @ the first one.
+ @
+ @ r0: *dst, then the address just past the next item to be written
+ @ r1: *src, then the address just past the next item to be read
+ @ r2: count, used as the down-counting loop counter
+ @
+ @ s0-s3: x, y, z, w s4: sum of the squares s5: length
+ @
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ cbz r2, .LoopEndVec4F @ nothing to do for count == 0
+ add r1, r1, r2, lsl #4 @ r1 = src + count * 16 (end of the input)
+ add r0, r0, r2, lsl #4 @ r0 = dst + count * 16 (end of the output)
+
+.LoopBeginVec4F:
+ vldmdb r1!, {s0-s3} @ step back one item: s0..s3 = x, y, z, w
+ vmul.f32 s4, s0, s0 @ s4 = x*x
+ vmla.f32 s4, s1, s1 @ s4 = x*x + y*y
+ vmla.f32 s4, s2, s2 @ s4 = x*x + y*y + z*z
+ vmla.f32 s4, s3, s3 @ s4 = x*x + y*y + z*z + w*w
+ vsqrt.f32 s5, s4 @ s5 = length
+ vdiv.f32 s0, s0, s5 @ s0 = x / length
+ vdiv.f32 s1, s1, s5 @ s1 = y / length
+ vdiv.f32 s2, s2, s5 @ s2 = z / length
+ vdiv.f32 s3, s3, s5 @ s3 = w / length
+ vstmdb r0!, {s0-s3} @ step back one item and write the result
+ subs r2, r2, #1 @ one item less to go
+ bne .LoopBeginVec4F @ repeat until the counter (r2) reaches zero
+.LoopEndVec4F:
+ mov r0, NE10_OK @ Return NE10_OK
+ bx lr
--- /dev/null
+/*
+ * Copyright 2011-12 ARM Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NE10 Library : math/NE10_normalize.c
+ */
+
+#include "NE10_types.h"
+#include "macros.h"
+
+#include <assert.h>
+
+#include <math.h>
+
+/*
+ * Normalizes `count` 2D vectors: every src[i] is divided by its own
+ * Euclidean length and the result is written to dst[i].
+ *
+ * The length is computed with the single-precision sqrtf(); all operands
+ * are ne10_float32_t, so the double-precision sqrt() only added
+ * float<->double conversions.
+ * A zero-length input vector is not special-cased; its components are
+ * divided by a length of zero.
+ * The loop, the `itr` index and the return value come from
+ * NE10_LEN_OPERATION_X_C (macros.h).
+ */
+ne10_result_t normalize_vec2f_c (ne10_vec2f_t * dst, ne10_vec2f_t * src, ne10_uint32_t count)
+{
+    ne10_float32_t len;
+
+    NE10_LEN_OPERATION_X_C
+    (
+        len = sqrtf (src[ itr ].x * src[ itr ].x +
+                     src[ itr ].y * src[ itr ].y) ;
+
+        dst[ itr ].x = src[ itr ].x / len;
+        dst[ itr ].y = src[ itr ].y / len;
+    );
+}
+
+/*
+ * Normalizes `count` 3D vectors: every src[i] is divided by its own
+ * Euclidean length and the result is written to dst[i].
+ *
+ * The length is computed with the single-precision sqrtf(); all operands
+ * are ne10_float32_t, so the double-precision sqrt() only added
+ * float<->double conversions.
+ * A zero-length input vector is not special-cased; its components are
+ * divided by a length of zero.
+ * The loop, the `itr` index and the return value come from
+ * NE10_LEN_OPERATION_X_C (macros.h).
+ */
+ne10_result_t normalize_vec3f_c (ne10_vec3f_t * dst, ne10_vec3f_t * src, ne10_uint32_t count)
+{
+    ne10_float32_t len;
+
+    NE10_LEN_OPERATION_X_C
+    (
+        len = sqrtf (src[ itr ].x * src[ itr ].x +
+                     src[ itr ].y * src[ itr ].y +
+                     src[ itr ].z * src[ itr ].z);
+
+        dst[ itr ].x = src[ itr ].x / len;
+        dst[ itr ].y = src[ itr ].y / len;
+        dst[ itr ].z = src[ itr ].z / len;
+    );
+}
+
+/*
+ * Normalizes `count` 4D vectors: every src[i] is divided by its own
+ * Euclidean length and the result is written to dst[i].
+ *
+ * The length is computed with the single-precision sqrtf(); all operands
+ * are ne10_float32_t, so the double-precision sqrt() only added
+ * float<->double conversions.
+ * A zero-length input vector is not special-cased; its components are
+ * divided by a length of zero.
+ * The loop, the `itr` index and the return value come from
+ * NE10_LEN_OPERATION_X_C (macros.h).
+ */
+ne10_result_t normalize_vec4f_c (ne10_vec4f_t * dst, ne10_vec4f_t * src, ne10_uint32_t count)
+{
+    ne10_float32_t len;
+
+    NE10_LEN_OPERATION_X_C
+    (
+        len = sqrtf (src[ itr ].x * src[ itr ].x +
+                     src[ itr ].y * src[ itr ].y +
+                     src[ itr ].z * src[ itr ].z +
+                     src[ itr ].w * src[ itr ].w);
+
+        dst[ itr ].x = src[ itr ].x / len;
+        dst[ itr ].y = src[ itr ].y / len;
+        dst[ itr ].z = src[ itr ].z / len;
+        dst[ itr ].w = src[ itr ].w / len;
+    );
+}
--- /dev/null
+@
+@ Copyright 2011-12 ARM Limited
+@
+@ Licensed under the Apache License, Version 2.0 (the "License");
+@ you may not use this file except in compliance with the License.
+@ You may obtain a copy of the License at
+@
+@ http://www.apache.org/licenses/LICENSE-2.0
+@
+@ Unless required by applicable law or agreed to in writing, software
+@ distributed under the License is distributed on an "AS IS" BASIS,
+@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ See the License for the specific language governing permissions and
+@ limitations under the License.
+@
+
+@
+@ NE10 Library : math/NE10_normalize.neon.s
+@
+
+ .text
+ .syntax unified
+
+.include "NE10header.s"
+
+
+
+
+ .balign 4
+ .global normalize_vec2f_neon
+ .thumb
+ .thumb_func
+
+normalize_vec2f_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t normalize_vec2f(arm_vec2f_t * dst,
+ @ arm_vec2f_t * src,
+ @ unsigned int count);
+ @
+ @ Scales every 2D vector of src by an estimate of 1/length
+ @ (vrsqrte refined by one vrsqrts step) and writes it to dst.
+ @ Always returns 0 in r0.
+ @
+ @ r0: *dst & the current dst entry's address
+ @ r1: *src & current src entry's address
+ @ r2: int count; then (count - count % 4), the number of vectors
+ @ processed four at a time by the main loop
+ @ r3: count % 4, the number of vectors that are left to be processed
+ @ one by one at the end of the input array
+ @
+ @ q4 (d8-d9) is used as scratch. d8-d15 are callee-saved under the
+ @ AAPCS, so d8-d9 are saved on entry and restored before returning.
+ @
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ vpush {d8-d9} @ preserve the callee-saved registers clobbered below (q4)
+
+ and r3, r2, #3 @ r3 = count % 4;
+ sub r2, r2, r3 @ count = count - r3; r3 vectors are left for the second loop
+
+ cbz r2, .L_check_vec2
+
+ @ load values for the first iteration (q0 = four x values, q1 = four y values)
+ vld2.32 {q0-q1}, [r1]!
+ subs r2, r2, #4 @ flags are used by the ble below
+
+ @ calculate sum of square of the components
+ vmul.f32 q2, q0, q0
+ vmla.f32 q2, q1, q1
+
+ ble .L_mainloopend_vec2
+
+.L_mainloop_vec2:
+ @ keep the current vectors in q10-q11 and load the next set of values
+ vmov.f32 q10, q0
+ vmov.f32 q11, q1
+ vld2.32 {q0-q1}, [r1]!
+ subs r2, r2, #4 @ flags are used by the bgt at the end of the loop
+
+ @ get reciprocal SQRT of the last vector while loading a new vector
+ vrsqrte.f32 q3, q2
+ vmul.f32 q4, q2, q3
+ vrsqrts.f32 q4, q4, q3
+ vmul.f32 q4, q3, q4
+
+ @ normalize the components
+ vmul.f32 q3, q10, q4 @ q3 = x values * 1/length
+ vmul.f32 q4, q11, q4 @ q4 = y values * 1/length
+
+ vst2.32 {d6,d7,d8,d9}, [r0]!
+
+ @ calculate sum of square of the components
+ vmul.f32 q2, q0, q0
+ vmla.f32 q2, q1, q1
+
+ bgt .L_mainloop_vec2 @ loop while r2 > 0, i.e. while at least another 4 vectors (8 floats) remain for the main loop
+
+.L_mainloopend_vec2:
+ @ the last iteration for this call
+ @ get reciprocal SQRT of the last vector
+ vrsqrte.f32 q3, q2
+ vmul.f32 q4, q2, q3
+ vrsqrts.f32 q4, q4, q3
+ vmul.f32 q4, q3, q4
+
+ @ normalize the components
+ vmul.f32 q3, q0, q4 @ q3 = q0 * q4
+ vmul.f32 q4, q1, q4 @ q4 = q1 * q4
+
+ vst2.32 {d6,d7,d8,d9}, [r0]!
+
+.L_check_vec2:
+ @ check if anything left to process at the end of the input array
+ cmp r3, #0
+ ble .L_return_vec2
+
+.L_secondloop_vec2:
+ @ process the last few items left in the input array
+ vld1.f32 d0, [r1]! @ Fill in d0 = { V.x, V.y };
+
+ subs r3, r3, #1 @ flags are used by the bgt at the end of the loop
+
+ @ calculate sum of square of the components
+ vmul.f32 d1, d0, d0 @ d1= { V.x^2, V.y^2 };
+ vpadd.f32 d3, d1, d1 @ d3= { V.x^2 + (V.y^2), V.y^2 + (V.x^2) };
+
+
+ @ get reciprocal SQRT of the last vector
+ vrsqrte.f32 d2, d3
+ vmul.f32 d1, d3, d2
+ vrsqrts.f32 d1, d1, d2
+ vmul.f32 d1, d2, d1
+
+ @ normalize the components
+ vmul.f32 d0, d0, d1
+
+ vst1.32 {d0}, [r0]!
+
+ bgt .L_secondloop_vec2
+
+.L_return_vec2:
+ @ return
+ vpop {d8-d9} @ restore the callee-saved registers
+ mov r0, #0
+ bx lr
+
+
+
+
+ .align 2
+ .global normalize_vec3f_neon
+ .thumb
+ .thumb_func
+normalize_vec3f_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t normalize_vec3f(arm_vec3f_t * dst,
+ @ arm_vec3f_t * src,
+ @ unsigned int count);
+ @
+ @ Scales every 3D vector of src by an estimate of 1/length
+ @ (vrsqrte refined by one vrsqrts step) and writes it to dst.
+ @ Always returns 0 in r0.
+ @
+ @ r0: *dst & the current dst entry's address
+ @ r1: *src & current src entry's address
+ @ r2: int count; then (count - count % 4), the number of vectors
+ @ processed four at a time by the main loop
+ @ r3: count % 4, the number of vectors that are left to be processed
+ @ one by one at the end of the input array
+ @
+ @ q4-q7 (d8-d15) are used as scratch. d8-d15 are callee-saved under
+ @ the AAPCS, so they are saved on entry and restored before returning.
+ @
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ vpush {d8-d15} @ preserve the callee-saved registers clobbered below (q4-q7)
+
+ and r3, r2, #3 @ r3 = count % 4;
+ sub r2, r2, r3 @ count = count - r3; r3 vectors are left for the second loop
+
+ cmp r2, #0
+ beq .L_check_vec3
+
+ @ load values for the first iteration (q0 = x values, q1 = y values, q2 = z values)
+ vld3.32 {d0, d2, d4}, [r1]!
+ vld3.32 {d1, d3, d5}, [r1]!
+ subs r2, r2, #4 @ flags are used by the ble below
+
+ @ calculate sum of square of the components
+ vmul.f32 q3, q0, q0
+ vmla.f32 q3, q1, q1
+ vmla.f32 q3, q2, q2
+
+
+ ble .L_mainloopend_vec3
+
+.L_mainloop_vec3:
+ @ keep the current vectors in q10-q12 and load the next set of values
+ vmov.f32 q10, q0
+ vmov.f32 q11, q1
+ vmov.f32 q12, q2
+
+ vld3.32 {d0,d2,d4}, [r1]!
+ vld3.32 {d1,d3,d5}, [r1]!
+ subs r2, r2, #4 @ flags are used by the bgt at the end of the loop
+
+ @ get reciprocal SQRT of the last vector while loading a new vector
+ vrsqrte.f32 q5, q3
+ vmul.f32 q4, q3, q5
+ vrsqrts.f32 q4, q4, q5
+ vmul.f32 q4, q5, q4
+
+ @ normalize the components
+ vmul.f32 q5, q10, q4
+ vmul.f32 q6, q11, q4
+ vmul.f32 q7, q12, q4
+
+ vst3.32 {d10, d12, d14}, [r0]!
+ vst3.32 {d11, d13, d15}, [r0]!
+
+ @ calculate sum of square of the components
+ vmul.f32 q3, q0, q0
+ vmla.f32 q3, q1, q1
+ vmla.f32 q3, q2, q2
+
+ bgt .L_mainloop_vec3 @ loop while r2 > 0, i.e. while at least another 4 vectors (12 floats) remain for the main loop
+
+.L_mainloopend_vec3:
+ @ the last iteration for this call
+ @ get reciprocal SQRT of the last vector
+ vrsqrte.f32 q5, q3
+ vmul.f32 q4, q3, q5
+ vrsqrts.f32 q4, q4, q5
+ vmul.f32 q4, q5, q4
+
+ @ normalize the components
+ vmul.f32 q5, q0, q4
+ vmul.f32 q6, q1, q4
+ vmul.f32 q7, q2, q4
+
+ vst3.32 {d10, d12, d14}, [r0]!
+ vst3.32 {d11, d13, d15}, [r0]!
+
+.L_check_vec3:
+ @ check if anything left to process at the end of the input array
+ cmp r3, #0
+ ble .L_return_vec3
+
+.L_secondloop_vec3:
+ @ process the last few items left in the input array
+ vld3.f32 {d0[0], d2[0], d4[0]}, [r1]! @ The values are loaded like so:
+ @ q0 = { V.x, -, -, - };
+ @ q1 = { V.y, -, -, - };
+ @ q2 = { V.z, -, -, - };
+ @ only lane 0 is meaningful below
+ subs r3, r3, #1 @ flags are used by the bgt at the end of the loop
+
+ @ calculate sum of square of the components
+ vmul.f32 q3, q0, q0 @ V.x^2
+ vmla.f32 q3, q1, q1 @ V.x^2 + V.y^2
+ vmla.f32 q3, q2, q2 @ V.x^2 + V.y^2 + V.z^2
+
+
+ @ get reciprocal SQRT of the last vector
+ vrsqrte.f32 q5, q3
+ vmul.f32 q4, q3, q5
+ vrsqrts.f32 q4, q4, q5
+ vmul.f32 q4, q5, q4
+
+ @ normalize the components
+ vmul.f32 q0, q0, q4
+ vmul.f32 q1, q1, q4
+ vmul.f32 q2, q2, q4
+
+ vst3.32 {d0[0], d2[0], d4[0]}, [r0]! @ only lane 0 (the one vector just processed) is written
+
+ bgt .L_secondloop_vec3
+
+.L_return_vec3:
+ @ return
+ vpop {d8-d15} @ restore the callee-saved registers
+ mov r0, #0
+ bx lr
+
+
+
+
+ .align 2
+ .global normalize_vec4f_neon
+ .thumb
+ .thumb_func
+normalize_vec4f_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t normalize_vec4f(arm_vec4f_t * dst,
+ @ arm_vec4f_t * src,
+ @ unsigned int count);
+ @
+ @ Scales every 4D vector of src by an estimate of 1/length
+ @ (vrsqrte refined by one vrsqrts step) and writes it to dst.
+ @ Always returns 0 in r0.
+ @
+ @ r0: *dst & the current dst entry's address
+ @ r1: *src & current src entry's address
+ @ r2: int count; then (count - count % 4), the number of vectors
+ @ processed four at a time by the main loop
+ @ r3: count % 4, the number of vectors that are left to be processed
+ @ one by one at the end of the input array
+ @
+ @ q4-q6 (d8-d13) are used as scratch. d8-d15 are callee-saved under
+ @ the AAPCS, so d8-d13 are saved on entry and restored before returning.
+ @
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ vpush {d8-d13} @ preserve the callee-saved registers clobbered below (q4-q6)
+
+ and r3, r2, #3 @ r3 = count % 4;
+ sub r2, r2, r3 @ count = count - r3; r3 vectors are left for the second loop
+
+ cmp r2, #0
+ beq .L_check_vec4
+
+ @ load values for the first iteration (q0 = x, q1 = y, q2 = z, q3 = w values)
+ vld4.32 {d0, d2, d4, d6}, [r1]!
+ vld4.32 {d1, d3, d5, d7}, [r1]!
+ subs r2, r2, #4 @ flags are used by the ble below
+
+ @ calculate sum of square of the components
+ vmul.f32 q5, q0, q0
+ vmla.f32 q5, q1, q1
+ vmla.f32 q5, q2, q2
+ vmla.f32 q5, q3, q3
+
+ ble .L_mainloopend_vec4
+
+.L_mainloop_vec4:
+ @ keep the current vectors in q10-q13 and load the next set of values
+ vmov q10, q0
+ vmov q11, q1
+ vmov q12, q2
+ vmov q13, q3
+
+ vld4.32 {d0, d2, d4, d6}, [r1]!
+ vld4.32 {d1, d3, d5, d7}, [r1]!
+ subs r2, r2, #4 @ flags are used by the bgt at the end of the loop
+
+ @ get reciprocal SQRT of the last vector while loading a new vector
+ vrsqrte.f32 q6, q5
+ vmul.f32 q4, q5, q6
+ vrsqrts.f32 q4, q4, q6
+ vmul.f32 q4, q6, q4
+
+ @ normalize the components
+ vmul.f32 q10, q10, q4
+ vmul.f32 q11, q11, q4
+ vmul.f32 q12, q12, q4
+ vmul.f32 q13, q13, q4
+
+ vst4.32 {d20, d22, d24, d26}, [r0]!
+ vst4.32 {d21, d23, d25, d27}, [r0]!
+
+ @ calculate sum of square of the components
+ vmul.f32 q5, q0, q0
+ vmla.f32 q5, q1, q1
+ vmla.f32 q5, q2, q2
+ vmla.f32 q5, q3, q3
+
+ bgt .L_mainloop_vec4 @ loop while r2 > 0, i.e. while at least another 4 vectors (16 floats) remain for the main loop
+
+.L_mainloopend_vec4:
+ @ the last iteration for this call
+ @ get reciprocal SQRT of the last vector
+ vrsqrte.f32 q6, q5
+ vmul.f32 q4, q5, q6
+ vrsqrts.f32 q4, q4, q6
+ vmul.f32 q4, q6, q4
+
+ @ normalize the components
+ vmul.f32 q0, q0, q4
+ vmul.f32 q1, q1, q4
+ vmul.f32 q2, q2, q4
+ vmul.f32 q3, q3, q4
+
+ vst4.32 {d0, d2, d4, d6}, [r0]!
+ vst4.32 {d1, d3, d5, d7}, [r0]!
+
+.L_check_vec4:
+ @ check if anything left to process at the end of the input array
+ cmp r3, #0
+ ble .L_return_vec4
+
+.L_secondloop_vec4:
+ @ process the last few items left in the input array
+ vld4.f32 {d0[0], d2[0], d4[0], d6[0]}, [r1]! @ The values are loaded like so:
+ @ q0 = { V.x, -, -, - };
+ @ q1 = { V.y, -, -, - };
+ @ q2 = { V.z, -, -, - };
+ @ q3 = { V.w, -, -, - };
+ @ only lane 0 is meaningful below
+ subs r3, r3, #1 @ flags are used by the bgt at the end of the loop
+
+ @ calculate sum of square of the components
+ vmul.f32 q4, q0, q0 @ V.x^2
+ vmla.f32 q4, q1, q1 @ V.x^2 + V.y^2
+ vmla.f32 q4, q2, q2 @ V.x^2 + V.y^2 + V.z^2
+ vmla.f32 q4, q3, q3 @ V.x^2 + V.y^2 + V.z^2 + V.w^2
+
+ @ get reciprocal SQRT of the last vector
+ vrsqrte.f32 q5, q4
+ vmul.f32 q6, q4, q5
+ vrsqrts.f32 q6, q6, q5
+ vmul.f32 q6, q5, q6
+
+ @ normalize the components
+ vmul.f32 q0, q0, q6
+ vmul.f32 q1, q1, q6
+ vmul.f32 q2, q2, q6
+ vmul.f32 q3, q3, q6
+
+ vst4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]! @ only lane 0 (the one vector just processed) is written
+
+ bgt .L_secondloop_vec4
+
+.L_return_vec4:
+ @ return
+ vpop {d8-d13} @ restore the callee-saved registers
+ mov r0, #0
+ bx lr
--- /dev/null
+@
+@ Copyright 2011-12 ARM Limited
+@
+@ Licensed under the Apache License, Version 2.0 (the "License");
+@ you may not use this file except in compliance with the License.
+@ You may obtain a copy of the License at
+@
+@ http://www.apache.org/licenses/LICENSE-2.0
+@
+@ Unless required by applicable law or agreed to in writing, software
+@ distributed under the License is distributed on an "AS IS" BASIS,
+@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ See the License for the specific language governing permissions and
+@ limitations under the License.
+@
+
+@
+@ NE10 Library : math/NE10_rsbc.asm.s
+@
+
+ .text
+ .syntax unified
+
+.include "NE10header.s"
+
+ .balign 4
+ .global rsbc_float_asm
+ .thumb
+ .thumb_func
+
+rsbc_float_asm:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t rsbc_float(arm_float_t * dst,
+ @ arm_float_t * src, const arm_float_t cst,
+ @ unsigned int count)
+ @
+ @ dst[i] = cst - src[i] for i = 0 .. count-1; returns NE10_OK.
+ @
+ @ r0: *dst, post-incremented to the next destination item
+ @ r1: *src, post-incremented to the next source item
+ @ r2: cst, the raw bits of the float constant (passed by value in a
+ @ core register)
+ @ r3: int count, used as the down-counting loop counter
+ @
+ @ s0: src[i] s1: cst s2: result
+ @
+ @ Only caller-saved registers (r0-r3, s0-s2) are used, so nothing has
+ @ to be saved on the stack.
+ @
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ cbz r3, .LoopEndFloat @ nothing to do for count == 0
+ vmov s1, r2 @ s1 = cst; r2 never changes, so this is done once
+
+.LoopBeginFloat:
+ vldmia r1!, {s0} @ s0 = src[i]; advance src by one float
+ vsub.f32 s2, s1, s0 @ s2 = cst - src[i]
+ vstmia r0!, {s2} @ dst[i] = s2; advance dst by one float
+ subs r3, r3, #1 @ one item less to go
+ bne .LoopBeginFloat @ repeat until the counter (r3) reaches zero
+
+.LoopEndFloat:
+ mov r0, NE10_OK @ Return NE10_OK
+ bx lr
+
+
+
+
+ .balign 4
+ .global rsbc_vec2f_asm
+ .thumb
+ .thumb_func
+
+rsbc_vec2f_asm:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t rsbc_vec2f(arm_vec2f_t * dst,
+ @ arm_vec2f_t * src, const arm_vec2f_t * cst,
+ @ unsigned int count)
+ @
+ @ dst[i] = *cst - src[i] (component-wise) for i = 0 .. count-1;
+ @ returns NE10_OK.
+ @
+ @ r0: *dst, post-incremented to the next destination item
+ @ r1: *src, post-incremented to the next source item
+ @ r2: *cst
+ @ r3: int count, used as the down-counting loop counter
+ @
+ @ s0-s1: src[i] s2-s3: *cst s4-s5: result
+ @
+ @ Only caller-saved registers (r0-r3, s0-s5) are used, so nothing has
+ @ to be saved on the stack. *cst is deliberately re-read for every
+ @ item, before that item's result is stored, so the outcome stays the
+ @ same even if cst points into the dst array.
+ @
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ cbz r3, .LoopEndVec2F @ nothing to do for count == 0
+
+.LoopBeginVec2F:
+ vldmia r1!, {s0-s1} @ s0 = src[i].x, s1 = src[i].y; advance src
+ vldmia r2, {s2-s3} @ s2 = cst->x, s3 = cst->y
+ vsub.f32 s4, s2, s0 @ s4 = cst->x - src[i].x
+ vsub.f32 s5, s3, s1 @ s5 = cst->y - src[i].y
+ vstmia r0!, {s4-s5} @ store dst[i]; advance dst
+ subs r3, r3, #1 @ one item less to go
+ bne .LoopBeginVec2F @ repeat until the counter (r3) reaches zero
+
+.LoopEndVec2F:
+ mov r0, NE10_OK @ Return NE10_OK
+ bx lr
+
+
+
+
+ .balign 4
+ .global rsbc_vec3f_asm
+ .thumb
+ .thumb_func
+
+rsbc_vec3f_asm:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t rsbc_vec3f(arm_vec3f_t * dst,
+ @ arm_vec3f_t * src, const arm_vec3f_t * cst,
+ @ unsigned int count)
+ @
+ @ dst[i] = *cst - src[i] (component-wise) for i = 0 .. count-1;
+ @ returns NE10_OK.
+ @
+ @ r0: *dst, post-incremented to the next destination item
+ @ r1: *src, post-incremented to the next source item
+ @ r2: *cst
+ @ r3: int count, used as the down-counting loop counter
+ @
+ @ s0-s2: src[i] s3-s5: *cst s6-s8: result
+ @
+ @ Only caller-saved registers (r0-r3, s0-s8) are used, so nothing has
+ @ to be saved on the stack. *cst is deliberately re-read for every
+ @ item, before that item's result is stored, so the outcome stays the
+ @ same even if cst points into the dst array.
+ @
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ cbz r3, .LoopEndVec3F @ nothing to do for count == 0
+
+.LoopBeginVec3F:
+ vldmia r1!, {s0-s2} @ s0..s2 = src[i].x, .y, .z; advance src
+ vldmia r2, {s3-s5} @ s3..s5 = cst->x, ->y, ->z
+ vsub.f32 s6, s3, s0 @ s6 = cst->x - src[i].x
+ vsub.f32 s7, s4, s1 @ s7 = cst->y - src[i].y
+ vsub.f32 s8, s5, s2 @ s8 = cst->z - src[i].z
+ vstmia r0!, {s6-s8} @ store dst[i]; advance dst
+ subs r3, r3, #1 @ one item less to go
+ bne .LoopBeginVec3F @ repeat until the counter (r3) reaches zero
+
+.LoopEndVec3F:
+ mov r0, NE10_OK @ Return NE10_OK
+ bx lr
+
+
+
+
+ .balign 4
+ .global rsbc_vec4f_asm
+ .thumb
+ .thumb_func
+
+rsbc_vec4f_asm:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t rsbc_vec4f(arm_vec4f_t * dst,
+ @ arm_vec4f_t * src, const arm_vec4f_t * cst,
+ @ unsigned int count)
+ @
+ @ dst[i] = *cst - src[i] (component-wise) for i = 0 .. count-1;
+ @ returns NE10_OK.
+ @
+ @ r0: *dst, post-incremented to the next destination item
+ @ r1: *src, post-incremented to the next source item
+ @ r2: *cst
+ @ r3: int count, used as the down-counting loop counter
+ @
+ @ s0-s3: src[i] s4-s7: *cst s8-s11: result
+ @
+ @ Only caller-saved registers (r0-r3, s0-s11) are used, so nothing has
+ @ to be saved on the stack. *cst is deliberately re-read for every
+ @ item, before that item's result is stored, so the outcome stays the
+ @ same even if cst points into the dst array.
+ @
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ cbz r3, .LoopEndVec4F @ nothing to do for count == 0
+
+.LoopBeginVec4F:
+ vldmia r1!, {s0-s3} @ s0..s3 = src[i].x, .y, .z, .w; advance src
+ vldmia r2, {s4-s7} @ s4..s7 = cst->x, ->y, ->z, ->w
+ vsub.f32 s8, s4, s0 @ s8 = cst->x - src[i].x
+ vsub.f32 s9, s5, s1 @ s9 = cst->y - src[i].y
+ vsub.f32 s10, s6, s2 @ s10 = cst->z - src[i].z
+ vsub.f32 s11, s7, s3 @ s11 = cst->w - src[i].w
+ vstmia r0!, {s8-s11} @ store dst[i]; advance dst
+ subs r3, r3, #1 @ one item less to go
+ bne .LoopBeginVec4F @ repeat until the counter (r3) reaches zero
+
+.LoopEndVec4F:
+ mov r0, NE10_OK @ Return NE10_OK
+ bx lr
--- /dev/null
+/*
+ * Copyright 2011-12 ARM Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NE10 Library : math/NE10_rsbc.c
+ */
+
+#include "NE10_types.h"
+#include "macros.h"
+
+#include <assert.h>
+
+/*
+ * Reverse subtraction with a constant: dst[i] = cst - src[i] for each of
+ * the `count` floats. The loop, the `itr` index and the return value are
+ * supplied by NE10_XC_OPERATION_X_C (macros.h).
+ */
+ne10_result_t rsbc_float_c (ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count)
+{
+    NE10_XC_OPERATION_X_C
+    (
+        const ne10_float32_t * p_src = src + itr;
+        ne10_float32_t * p_dst = dst + itr;
+
+        *p_dst = cst - *p_src;
+    );
+}
+
+/*
+ * Reverse subtraction with a constant 2D vector: dst[i] = *cst - src[i],
+ * component by component, for each of the `count` items. The loop, the
+ * `itr` index and the return value are supplied by NE10_XC_OPERATION_X_C
+ * (macros.h).
+ */
+ne10_result_t rsbc_vec2f_c (ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count)
+{
+    NE10_XC_OPERATION_X_C
+    (
+        const ne10_vec2f_t * p_src = src + itr;
+        ne10_vec2f_t * p_dst = dst + itr;
+
+        p_dst->x = cst->x - p_src->x;
+        p_dst->y = cst->y - p_src->y;
+    );
+}
+
+/*
+ * Reverse subtraction with a constant 3D vector: dst[i] = *cst - src[i],
+ * component by component, for each of the `count` items. The loop, the
+ * `itr` index and the return value are supplied by NE10_XC_OPERATION_X_C
+ * (macros.h).
+ */
+ne10_result_t rsbc_vec3f_c (ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count)
+{
+    NE10_XC_OPERATION_X_C
+    (
+        const ne10_vec3f_t * p_src = src + itr;
+        ne10_vec3f_t * p_dst = dst + itr;
+
+        p_dst->x = cst->x - p_src->x;
+        p_dst->y = cst->y - p_src->y;
+        p_dst->z = cst->z - p_src->z;
+    );
+}
+
+/*
+ * Reverse subtraction with a constant 4D vector: dst[i] = *cst - src[i],
+ * component by component, for each of the `count` items. The loop, the
+ * `itr` index and the return value are supplied by NE10_XC_OPERATION_X_C
+ * (macros.h).
+ */
+ne10_result_t rsbc_vec4f_c (ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count)
+{
+    NE10_XC_OPERATION_X_C
+    (
+        const ne10_vec4f_t * p_src = src + itr;
+        ne10_vec4f_t * p_dst = dst + itr;
+
+        p_dst->x = cst->x - p_src->x;
+        p_dst->y = cst->y - p_src->y;
+        p_dst->z = cst->z - p_src->z;
+        p_dst->w = cst->w - p_src->w;
+    );
+}
--- /dev/null
+/*
+ * Copyright 2011-12 ARM Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NE10 Library : math/NE10_rsbc.neon.c
+ */
+
+#include "NE10_types.h"
+#include "macros.h"
+
+#include <assert.h>
+#include <arm_neon.h>
+
+
+/*
+ * NEON-intrinsics variant of the float reverse subtraction (cst - src).
+ *
+ * The body is two code fragments handed to NE10_XC_OPERATION_FLOAT_NEON:
+ *  - the first subtracts with the 128-bit intrinsic vsubq_f32 and assigns
+ *    n_dst;
+ *  - the second subtracts with the 64-bit intrinsic vsub_f32 and writes the
+ *    result back into n_tmp_src.
+ * n_dst, n_cst, n_src, n_tmp_cst and n_tmp_src are not declared here;
+ * presumably the macro (macros.h) declares, loads and stores them and uses
+ * the second fragment for the items left over after the 128-bit loop --
+ * TODO confirm against macros.h.
+ */
+ne10_result_t rsbc_float_neon (ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count)
+{
+ NE10_XC_OPERATION_FLOAT_NEON
+ (
+ n_dst = vsubq_f32 (n_cst, n_src);
+ ,
+ n_tmp_src = vsub_f32 (n_tmp_cst, n_tmp_src);
+ );
+}
+
+/*
+ * NEON-intrinsics variant of the 2D-vector reverse subtraction (cst - src).
+ *
+ * The body is two code fragments handed to NE10_XC_OPERATION_VEC2F_NEON:
+ *  - the first subtracts with the 128-bit intrinsic vsubq_f32 and assigns
+ *    n_dst;
+ *  - the second subtracts with the 64-bit intrinsic vsub_f32 and writes the
+ *    result back into n_tmp_src.
+ * n_dst, n_cst, n_src, n_tmp_cst and n_tmp_src are not declared here;
+ * presumably the macro (macros.h) declares, loads and stores them and uses
+ * the second fragment for a single left-over vector -- TODO confirm
+ * against macros.h.
+ */
+ne10_result_t rsbc_vec2f_neon (ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count)
+{
+ NE10_XC_OPERATION_VEC2F_NEON
+ (
+ n_dst = vsubq_f32 (n_cst, n_src);
+ ,
+ n_tmp_src = vsub_f32 (n_tmp_cst, n_tmp_src);
+ );
+}
+
+/*
+ * NEON-intrinsics variant of the 3D-vector reverse subtraction (cst - src).
+ *
+ * The body is two code fragments handed to NE10_XC_OPERATION_VEC3F_NEON:
+ *  - the first performs three 128-bit subtractions (vsubq_f32), one for
+ *    each of the register triples n_dst1..3 / n_cst1..3 / n_src1..3;
+ *  - the second performs three 64-bit subtractions (vsub_f32) on the
+ *    val[0..2] members of n_tmp_cst / n_tmp_src and writes the results
+ *    back into n_tmp_src.
+ * None of these variables is declared here; presumably the macro
+ * (macros.h) declares, loads and stores them and uses the second fragment
+ * for the left-over vectors -- TODO confirm against macros.h.
+ */
+ne10_result_t rsbc_vec3f_neon (ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count)
+{
+ NE10_XC_OPERATION_VEC3F_NEON
+ (
+ n_dst1 = vsubq_f32 (n_cst1, n_src1);
+ n_dst2 = vsubq_f32 (n_cst2, n_src2);
+ n_dst3 = vsubq_f32 (n_cst3, n_src3);
+ ,
+ n_tmp_src.val[0] = vsub_f32 (n_tmp_cst.val[0], n_tmp_src.val[0]);
+ n_tmp_src.val[1] = vsub_f32 (n_tmp_cst.val[1], n_tmp_src.val[1]);
+ n_tmp_src.val[2] = vsub_f32 (n_tmp_cst.val[2], n_tmp_src.val[2]);
+ );
+}
+
+/*
+ * NEON-intrinsics variant of the 4D-vector reverse subtraction (cst - src).
+ *
+ * A single code fragment is handed to NE10_XC_OPERATION_VEC4F_NEON: one
+ * 128-bit subtraction (vsubq_f32) assigning n_dst. Unlike the float, vec2f
+ * and vec3f variants no second fragment is passed.
+ * n_dst, n_cst and n_src are not declared here; presumably the macro
+ * (macros.h) declares, loads and stores them -- TODO confirm against
+ * macros.h.
+ */
+ne10_result_t rsbc_vec4f_neon (ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count)
+{
+ NE10_XC_OPERATION_VEC4F_NEON
+ (
+ n_dst = vsubq_f32 (n_cst, n_src);
+ );
+}
--- /dev/null
+@
+@ Copyright 2011-12 ARM Limited
+@
+@ Licensed under the Apache License, Version 2.0 (the "License");
+@ you may not use this file except in compliance with the License.
+@ You may obtain a copy of the License at
+@
+@ http://www.apache.org/licenses/LICENSE-2.0
+@
+@ Unless required by applicable law or agreed to in writing, software
+@ distributed under the License is distributed on an "AS IS" BASIS,
+@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ See the License for the specific language governing permissions and
+@ limitations under the License.
+@
+
+@
+@ NE10 Library : math/NE10_setc.asm.s
+@
+
+ .text
+ .syntax unified
+
+.include "NE10header.s"
+
+ .balign 4
+ .global setc_float_asm
+ .thumb
+ .thumb_func
+
+setc_float_asm:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t setc_float(arm_float_t * dst,
+ @ const arm_float_t cst,
+ @ unsigned int count)
+ @
+ @ Writes the 32-bit value held in r1 to each of the `count` items of
+ @ dst and returns NE10_OK.
+ @
+ @ r0: *dst, post-incremented by 4 after every store
+ @ r1: cst (used as a raw 32-bit pattern; no floating-point instruction
+ @ is involved)
+ @ r2: int count
+ @
+ @ r2: loop counter
+ @
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ cbz r2, .LoopEndFloat @ nothing to do for count == 0
+
+.LoopBeginFloat:
+ str r1, [r0], #4 @ dst[i] = cst, then advance dst to the next item
+ subs r2, r2, #1 @ one item less to go
+ bne .LoopBeginFloat @ repeat until the counter (r2) reaches zero
+
+.LoopEndFloat:
+ mov r0, NE10_OK @ Return NE10_OK
+ bx lr
+
+
+
+
+ .balign 4
+ .global setc_vec2f_asm
+ .thumb
+ .thumb_func
+
+setc_vec2f_asm:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t setc_vec2f(arm_vec2f_t * dst,
+ @ const arm_vec2f_t * cst,
+ @ unsigned int count)
+ @
+ @ Copies the two 32-bit words of *cst into each of the `count` items
+ @ of dst and returns NE10_OK. *cst is read once, before the loop.
+ @
+ @ r0: *dst, post-incremented by 4 after every store
+ @ r1: *cst
+ @ r2: int count
+ @
+ @ r2: loop counter
+ @ r4, r5: cst->x, cst->y (callee-saved registers, hence the push/pop)
+ @
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ push {r4, r5}
+ cbz r2, .LoopEndVec2F @ nothing to do for count == 0 (the pop below still runs)
+ ldr r4, [r1, #0] @ Load cst->x into r4
+ ldr r5, [r1, #4] @ Load cst->y into r5
+
+.LoopBeginVec2F:
+ str r4, [r0], #4 @ dst[i].x = cst->x
+ str r5, [r0], #4 @ dst[i].y = cst->y
+ subs r2, r2, #1 @ one item less to go
+ bne .LoopBeginVec2F @ repeat until the counter (r2) reaches zero
+
+.LoopEndVec2F:
+ mov r0, NE10_OK @ Return NE10_OK
+ pop {r4, r5}
+ bx lr
+
+
+
+
+ .balign 4
+ .global setc_vec3f_asm
+ .thumb
+ .thumb_func
+
+setc_vec3f_asm:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t setc_vec3f(arm_vec3f_t * dst,
+ @ const arm_vec3f_t * cst,
+ @ unsigned int count)
+ @
+ @ Copies the three 32-bit words of *cst into each of the `count` items
+ @ of dst and returns NE10_OK. *cst is read once, before the loop.
+ @
+ @ r0: *dst, post-incremented by 4 after every store
+ @ r1: *cst
+ @ r2: int count
+ @
+ @ r2: loop counter
+ @ r4, r5, r6: cst->x, cst->y, cst->z (callee-saved registers, hence
+ @ the push/pop)
+ @
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ push {r4, r5, r6}
+ cbz r2, .LoopEndVec3F @ nothing to do for count == 0 (the pop below still runs)
+ ldr r4, [r1, #0] @ Load cst->x into r4
+ ldr r5, [r1, #4] @ Load cst->y into r5
+ ldr r6, [r1, #8] @ r6 = cst->z
+
+.LoopBeginVec3F:
+ str r4, [r0], #4 @ dst[i].x = cst->x
+ str r5, [r0], #4 @ dst[i].y = cst->y
+ str r6, [r0], #4 @ dst[i].z = cst->z
+ subs r2, r2, #1 @ one item less to go
+ bne .LoopBeginVec3F @ repeat until the counter (r2) reaches zero
+
+.LoopEndVec3F:
+ mov r0, NE10_OK @ Return NE10_OK
+ pop {r4, r5, r6}
+ bx lr
+
+
+
+
+ .balign 4
+ .global setc_vec4f_asm
+ .thumb
+ .thumb_func
+
+setc_vec4f_asm:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t setc_vec4f(arm_vec4f_t * dst,
+ @ const arm_vec4f_t * cst,
+ @ unsigned int count)
+ @
+ @ Copies the four words at cst into `count` consecutive 16-byte
+ @ entries starting at dst, then returns NE10_OK in r0.
+ @
+ @ r0: *dst (post-incremented by 4 after every store)
+ @ r1: *cst (read once, before the loop)
+ @ r2: int count
+ @
+ @ r2: loop counter (counts down to zero)
+ @ r4-r7: cached cst->x / y / z / w; saved on entry and restored
+ @ on exit
+ @
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ push {r4, r5, r6, r7}
+ cbz r2, .LoopEndVec4F
+ ldr r4, [r1, #0] @ Load cst->x into r4
+ ldr r5, [r1, #4] @ Load cst->y into r5
+ ldr r6, [r1, #8] @ r6 = cst->z
+ ldr r7, [r1, #12] @ r7 = cst->w
+
+.LoopBeginVec4F:
+ str r4, [r0], #4 @ dst[i].x = cst->x
+ str r5, [r0], #4 @ dst[i].y = cst->y
+ str r6, [r0], #4 @ dst[i].z = cst->z
+ str r7, [r0], #4 @ dst[i].w = cst->w
+ subs r2, r2, #1 @ one fewer element left; sets the flags used by bne
+ bne .LoopBeginVec4F @ Continue while the counter is non-zero
+
+.LoopEndVec4F:
+ mov r0, NE10_OK @ Return NE10_OK
+ pop {r4, r5, r6, r7}
+ bx lr
--- /dev/null
+/*
+ * Copyright 2011-12 ARM Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NE10 Library : math/NE10_setc.c
+ */
+
+#include "NE10_types.h"
+#include "macros.h"
+
+#include <assert.h>
+
+/*
+ * Plain-C variant of setc for scalars: the per-element statement assigns
+ * cst to dst[itr].
+ * NOTE(review): the iteration over itr (presumably 0..count-1) and the
+ * ne10_result_t return value come from NE10_SETC_OPERATION_X_C in macros.h,
+ * which is not visible here - confirm against the macro.
+ */
+ne10_result_t setc_float_c (ne10_float32_t * dst, const ne10_float32_t cst, ne10_uint32_t count)
+{
+ NE10_SETC_OPERATION_X_C
+ (
+ dst[itr] = cst;
+ );
+}
+
+/*
+ * Plain-C variant of setc for 2D vectors: the per-element statements copy
+ * cst->x and cst->y into dst[itr].
+ * NOTE(review): the iteration over itr and the return value come from
+ * NE10_SETC_OPERATION_X_C in macros.h, which is not visible here - confirm.
+ */
+ne10_result_t setc_vec2f_c (ne10_vec2f_t * dst, const ne10_vec2f_t * cst, ne10_uint32_t count)
+{
+ NE10_SETC_OPERATION_X_C
+ (
+ dst[itr].x = cst->x;
+ dst[itr].y = cst->y;
+ );
+}
+
+/*
+ * Plain-C variant of setc for 3D vectors: the per-element statements copy
+ * cst->x, cst->y and cst->z into dst[itr].
+ * NOTE(review): the iteration over itr and the return value come from
+ * NE10_SETC_OPERATION_X_C in macros.h, which is not visible here - confirm.
+ */
+ne10_result_t setc_vec3f_c (ne10_vec3f_t * dst, const ne10_vec3f_t * cst, ne10_uint32_t count)
+{
+ NE10_SETC_OPERATION_X_C
+ (
+ dst[itr].x = cst->x;
+ dst[itr].y = cst->y;
+ dst[itr].z = cst->z;
+ );
+}
+
+/*
+ * Plain-C variant of setc for 4D vectors: the per-element statements copy
+ * cst->x, cst->y, cst->z and cst->w into dst[itr].
+ * NOTE(review): the iteration over itr and the return value come from
+ * NE10_SETC_OPERATION_X_C in macros.h, which is not visible here - confirm.
+ */
+ne10_result_t setc_vec4f_c (ne10_vec4f_t * dst, const ne10_vec4f_t * cst, ne10_uint32_t count)
+{
+ NE10_SETC_OPERATION_X_C
+ (
+ dst[itr].x = cst->x;
+ dst[itr].y = cst->y;
+ dst[itr].z = cst->z;
+ dst[itr].w = cst->w;
+ );
+}
--- /dev/null
+/*
+ * Copyright 2011-12 ARM Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NE10 Library : math/NE10_setc.neon.c
+ */
+
+#include "NE10_types.h"
+#include "macros.h"
+
+#include <assert.h>
+#include <arm_neon.h>
+
+
+/*
+ * NEON-intrinsics variant of setc for scalars. All of the work is done by
+ * NE10_SETC_OPERATION_FLOAT_NEON (macros.h); this function only passes two
+ * empty statements because the constant is stored unmodified.
+ * NOTE(review): the two macro arguments presumably hook the main vector loop
+ * and the leftover-element loop respectively - confirm against the macro.
+ */
+ne10_result_t setc_float_neon (ne10_float32_t * dst, const ne10_float32_t cst, ne10_uint32_t count)
+{
+ NE10_SETC_OPERATION_FLOAT_NEON
+ (
+ ;// The cst need not be altered
+ ,
+ ;// n_tmp_cst need not be altered
+ );
+}
+
+/*
+ * NEON-intrinsics variant of setc for 2D vectors. All of the work is done by
+ * NE10_SETC_OPERATION_VEC2F_NEON (macros.h); this function only passes two
+ * empty statements because the constant is stored unmodified.
+ * NOTE(review): the two macro arguments presumably hook the main vector loop
+ * and the leftover-element loop respectively - confirm against the macro.
+ */
+ne10_result_t setc_vec2f_neon (ne10_vec2f_t * dst, const ne10_vec2f_t * cst, ne10_uint32_t count)
+{
+ NE10_SETC_OPERATION_VEC2F_NEON
+ (
+ ;// The cst need not be altered
+ ,
+ ;// n_tmp_cst need not be altered
+ );
+}
+
+/*
+ * NEON-intrinsics variant of setc for 3D vectors. All of the work is done by
+ * NE10_SETC_OPERATION_VEC3F_NEON (macros.h); this function only passes two
+ * empty statements because the constant is stored unmodified.
+ * NOTE(review): the two macro arguments presumably hook the main vector loop
+ * and the leftover-element loop respectively - confirm against the macro.
+ */
+ne10_result_t setc_vec3f_neon (ne10_vec3f_t * dst, const ne10_vec3f_t * cst, ne10_uint32_t count)
+{
+ NE10_SETC_OPERATION_VEC3F_NEON
+ (
+ ;// cst1, cst2, and cst3 need not be altered
+ ,
+ ;// n_tmp_cst.val[0], .val[1], and .val[2] need not be altered
+ );
+}
+
+/*
+ * NEON-intrinsics variant of setc for 4D vectors. All of the work is done by
+ * NE10_SETC_OPERATION_VEC4F_NEON (macros.h); a single empty statement is
+ * passed because the constant is stored unmodified.
+ * NOTE(review): unlike the float/vec2f/vec3f variants this macro takes only
+ * one statement argument - presumably no leftover loop is needed; confirm.
+ */
+ne10_result_t setc_vec4f_neon (ne10_vec4f_t * dst, const ne10_vec4f_t * cst, ne10_uint32_t count)
+{
+ NE10_SETC_OPERATION_VEC4F_NEON
+ (
+ ;// n_cst need not be altered
+ );
+}
--- /dev/null
+@
+@ Copyright 2011-12 ARM Limited
+@
+@ Licensed under the Apache License, Version 2.0 (the "License");
+@ you may not use this file except in compliance with the License.
+@ You may obtain a copy of the License at
+@
+@ http://www.apache.org/licenses/LICENSE-2.0
+@
+@ Unless required by applicable law or agreed to in writing, software
+@ distributed under the License is distributed on an "AS IS" BASIS,
+@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ See the License for the specific language governing permissions and
+@ limitations under the License.
+@
+
+@
+@ NE10 Library : math/NE10_sub.asm.s
+@
+
+ .text
+ .syntax unified
+
+.include "NE10header.s"
+
+ .balign 4
+ .global sub_float_asm
+ .thumb
+ .thumb_func
+
+sub_float_asm:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t sub_float(arm_float_t * dst,
+ @ arm_float_t * src1, const arm_float_t * src2,
+ @ unsigned int count)
+ @
+ @ Computes dst[i] = src1[i] - src2[i] for `count` single-precision
+ @ values, one element per iteration, then returns NE10_OK in r0.
+ @
+ @ r0: *dst & current dst entry's address (advanced by 4 per element)
+ @ r1: *src1 & current src1 entry's address (advanced by 4 per element)
+ @ r2: *src2 & current src2 entry's address (advanced by 4 per element)
+ @ r3: int count
+ @
+ @ r3: loop counter (counts down to zero)
+ @ s1, s2, s10: VFP scratch registers for the two operands and the result
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ cbz r3, .LoopEndFloat
+
+.LoopBeginFloat:
+ vldr s1, [r1] @ Load s1 = src1[i]
+ add r1, r1, #4 @ move to the next entry
+ vldr s2, [r2] @ Load s2 = src2[i]
+ add r2, r2, #4 @ next entry
+ vsub.f32 s10, s1, s2 @ s10 = src1[i] - src2[i]
+ vstr s10, [r0] @ Store the result back into the main memory
+ add r0, r0, #4 @ next entry in the dst
+ subs r3, r3, #1 @ one fewer element left; sets the flags used by bne
+ bne .LoopBeginFloat @ Continue while the counter is non-zero
+
+.LoopEndFloat:
+ mov r0, NE10_OK @ Return NE10_OK
+ bx lr
--- /dev/null
+/*
+ * Copyright 2011-12 ARM Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NE10 Library : math/NE10_sub.c
+ */
+
+#include "NE10_types.h"
+#include "macros.h"
+
+#include <assert.h>
+
+/*
+ * Plain-C element-wise subtraction of two float arrays: the per-element
+ * statement computes dst[itr] = src1[itr] - src2[itr].
+ * NOTE(review): the iteration over itr and the return value come from
+ * NE10_X_OPERATION_FLOAT_C in macros.h, which is not visible here - confirm.
+ */
+ne10_result_t sub_float_c (ne10_float32_t * dst, ne10_float32_t * src1, ne10_float32_t * src2, ne10_uint32_t count)
+{
+ NE10_X_OPERATION_FLOAT_C
+ (
+ dst[ itr ] = src1[ itr ] - src2[ itr ];
+ );
+}
+
+/*
+ * Plain-C component-wise subtraction of two arrays of 2D vectors:
+ * dst[itr] = src1[itr] - src2[itr] for the x and y components.
+ * NOTE(review): the iteration over itr and the return value come from
+ * NE10_X_OPERATION_FLOAT_C in macros.h, which is not visible here - confirm.
+ */
+ne10_result_t sub_vec2f_c (ne10_vec2f_t * dst, ne10_vec2f_t * src1, ne10_vec2f_t * src2, ne10_uint32_t count)
+{
+ NE10_X_OPERATION_FLOAT_C
+ (
+ dst[ itr ].x = src1[ itr ].x - src2[ itr ].x;
+ dst[ itr ].y = src1[ itr ].y - src2[ itr ].y;
+ );
+}
+
+/*
+ * Plain-C component-wise subtraction of two arrays of 3D vectors:
+ * dst[itr] = src1[itr] - src2[itr] for the x, y and z components.
+ * NOTE(review): the iteration over itr and the return value come from
+ * NE10_X_OPERATION_FLOAT_C in macros.h, which is not visible here - confirm.
+ */
+ne10_result_t sub_vec3f_c (ne10_vec3f_t * dst, ne10_vec3f_t * src1, ne10_vec3f_t * src2, ne10_uint32_t count)
+{
+ NE10_X_OPERATION_FLOAT_C
+ (
+ dst[ itr ].x = src1[ itr ].x - src2[ itr ].x;
+ dst[ itr ].y = src1[ itr ].y - src2[ itr ].y;
+ dst[ itr ].z = src1[ itr ].z - src2[ itr ].z;
+ );
+}
+
+/*
+ * Plain-C component-wise subtraction of two arrays of 4D vectors:
+ * dst[itr] = src1[itr] - src2[itr] for the x, y, z and w components.
+ * NOTE(review): the iteration over itr and the return value come from
+ * NE10_X_OPERATION_FLOAT_C in macros.h, which is not visible here - confirm.
+ */
+ne10_result_t sub_vec4f_c (ne10_vec4f_t * dst, ne10_vec4f_t * src1, ne10_vec4f_t * src2, ne10_uint32_t count)
+{
+ NE10_X_OPERATION_FLOAT_C
+ (
+ dst[ itr ].x = src1[ itr ].x - src2[ itr ].x;
+ dst[ itr ].y = src1[ itr ].y - src2[ itr ].y;
+ dst[ itr ].z = src1[ itr ].z - src2[ itr ].z;
+ dst[ itr ].w = src1[ itr ].w - src2[ itr ].w;
+ );
+}
--- /dev/null
+@
+@ Copyright 2011-12 ARM Limited
+@
+@ Licensed under the Apache License, Version 2.0 (the "License");
+@ you may not use this file except in compliance with the License.
+@ You may obtain a copy of the License at
+@
+@ http://www.apache.org/licenses/LICENSE-2.0
+@
+@ Unless required by applicable law or agreed to in writing, software
+@ distributed under the License is distributed on an "AS IS" BASIS,
+@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ See the License for the specific language governing permissions and
+@ limitations under the License.
+@
+
+@
+@ NE10 Library : math/NE10_sub.neon.s
+@
+
+ .text
+ .syntax unified
+
+.include "NE10header.s"
+
+
+
+
+ .align 4 @ NOTE(review): the non-NEON files use .balign 4; on ARM GNU as .align N is presumably a power-of-two (16-byte) request - confirm intent
+ .global sub_float_neon
+ .thumb
+ .thumb_func
+
+sub_float_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t sub_float(arm_float_t * dst,
+ @ arm_float_t * src1,
+ @ arm_float_t * src2,
+ @ unsigned int count)
+ @
+ @ Computes dst[i] = src1[i] - src2[i]. Multiples of four floats are
+ @ handled by a software-pipelined loop (load/subtract the next set
+ @ before storing the previous one); the remaining 0-3 floats are
+ @ handled one at a time. Always returns 0 in r0.
+ @
+ @ r0: *dst & current dst entry's address
+ @ r1: *src1 & current src1 entry's address
+ @ r2: *src2 & current src2 entry's address
+ @ r3: int count & the number of items in the input array that can be
+ @ processed in chunks of 4 floats
+ @
+ @ r4: the number of items that are left to be processed at the end of
+ @ the input array
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ push {r4}
+ and r4, r3, #3 @ r4 = count % 4;
+ sub r3, r3, r4 @ count = count - r4; r4 is what's left to be processed after the main loop
+
+ cbz r3, .L_check_float
+
+ @ load the 1st set of values
+ vld1.32 {q0}, [r1]!
+ vld1.32 {q1}, [r2]!
+ subs r3, r3, #4 @ 4 for this set
+
+ @ calculate values for the 1st set
+ vsub.f32 q3, q0, q1 @ q3 = q0 - q1
+
+ ble .L_mainloopend_float @ uses the flags from the subs above: only one set to process
+
+.L_mainloop_float:
+ @ store the result for the current set
+ vst1.32 {d6,d7}, [r0]!
+
+ @ load the next set of values
+ vld1.32 {q0}, [r1]!
+ vld1.32 {q1}, [r2]!
+ subs r3, r3, #4
+
+ @ calculate values for the next set
+ vsub.f32 q3, q0, q1 @ q3 = q0 - q1
+
+ bgt .L_mainloop_float @ loop if r3 > 0, if we have at least another 4 floats
+
+.L_mainloopend_float:
+ @ the last iteration for this call
+ @ store the result for the last set
+ vst1.32 {d6,d7}, [r0]!
+
+
+.L_check_float:
+ @ check if anything left to process at the end of the input array
+ cmp r4, #0
+ ble .L_return_float
+
+.L_secondloop_float:
+ @ process the last few items left in the input array
+ vld1.f32 d0[0], [r1]! @ Fill in d0[0]
+ vld1.f32 d1[0], [r2]! @ Fill in d1[0]
+
+
+ subs r4, r4, #1 @ flags set here are consumed by the bgt below
+
+ @ values (only lane 0 of the result is stored)
+ vsub.f32 d0, d0, d1
+
+ vst1.32 {d0[0]}, [r0]!
+
+ bgt .L_secondloop_float
+
+.L_return_float:
+ @ return
+ pop {r4}
+ mov r0, #0
+ bx lr
+
+
+
+
+ .align 4
+ .global sub_vec2f_neon
+ .thumb
+ .thumb_func
+
+sub_vec2f_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t sub_vec2f(arm_vec2f_t * dst,
+ @ arm_vec2f_t * src1,
+ @ arm_vec2f_t * src2,
+ @ unsigned int count)
+ @
+ @ Computes dst[i] = src1[i] - src2[i] component-wise. Multiples of
+ @ four vectors are handled by a software-pipelined loop using
+ @ de-interleaving vld2/vst2; the remaining 0-3 vectors are handled
+ @ one at a time. Always returns 0 in r0.
+ @
+ @ r0: *dst & current dst entry's address
+ @ r1: *src1 & current src1 entry's address
+ @ r2: *src2 & current src2 entry's address
+ @ r3: int count & the number of items in the input array that can be
+ @ processed in chunks of 4 vectors
+ @
+ @ r4: the number of items that are left to be processed at the end of
+ @ the input array
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ push {r4}
+ and r4, r3, #3 @ r4 = count % 4;
+ sub r3, r3, r4 @ count = count - r4; r4 is what's left to be processed after the main loop
+
+ cbz r3, .L_check_vec2
+
+ @ load the 1st set of values
+ vld2.32 {q0-q1}, [r1]!
+ vld2.32 {q2-q3}, [r2]!
+ subs r3, r3, #4 @ 4 for this set
+
+ @ calculate values for the 1st set
+ vsub.f32 q8, q0, q2
+ vsub.f32 q9, q1, q3
+
+ ble .L_mainloopend_vec2 @ uses the flags from the subs above: only one set to process
+
+.L_mainloop_vec2:
+ @ store the result for the current set
+ vst2.32 {d16,d17,d18,d19}, [r0]!
+
+ @ load the next set of values
+ vld2.32 {q0-q1}, [r1]!
+ vld2.32 {q2-q3}, [r2]!
+ subs r3, r3, #4
+
+ @ calculate values for the next set
+ vsub.f32 q8, q0, q2
+ vsub.f32 q9, q1, q3
+
+ bgt .L_mainloop_vec2 @ loop if r3 > 0, if we have at least another 4 vectors (8 floats) to process
+
+.L_mainloopend_vec2:
+ @ the last iteration for this call
+ @ store the result for the last set
+ vst2.32 {d16,d17,d18,d19}, [r0]!
+
+.L_check_vec2:
+ @ check if anything left to process at the end of the input array
+ cmp r4, #0
+ ble .L_return_vec2
+
+.L_secondloop_vec2:
+ @ process the last few items left in the input array
+ vld1.f32 d0, [r1]! @ d0 = one whole vector from src1 (x, y)
+ vld1.f32 d1, [r2]! @ d1 = one whole vector from src2 (x, y)
+
+ subs r4, r4, #1 @ flags set here are consumed by the bgt below
+
+ @ calculate values
+ vsub.f32 d0, d0, d1
+
+ vst1.32 {d0}, [r0]!
+
+ bgt .L_secondloop_vec2
+
+.L_return_vec2:
+ @ return
+ pop {r4}
+ mov r0, #0
+ bx lr
+
+
+
+
+ .align 4
+ .global sub_vec3f_neon
+ .thumb
+ .thumb_func
+sub_vec3f_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t sub_vec3f(arm_vec3f_t * dst,
+ @ arm_vec3f_t * src1,
+ @ arm_vec3f_t * src2,
+ @ unsigned int count)
+ @
+ @ Computes dst[i] = src1[i] - src2[i] component-wise. Multiples of
+ @ four vectors are handled by a software-pipelined loop using two
+ @ de-interleaving vld3/vst3 per operand (two vectors each); the
+ @ remaining 0-3 vectors are handled one at a time.
+ @ Always returns 0 in r0.
+ @
+ @ r0: *dst & current dst entry's address
+ @ r1: *src1 & current src1 entry's address
+ @ r2: *src2 & current src2 entry's address
+ @ r3: int count & the number of items in the input array that can be
+ @ processed in chunks of 4 vectors
+ @
+ @ r4: the number of items that are left to be processed at the end of
+ @ the input array
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ push {r4}
+ and r4, r3, #3 @ r4 = count % 4;
+ sub r3, r3, r4 @ count = count - r4; r4 is what's left to be processed after the main loop
+
+ cmp r3, #0
+ beq .L_check_vec3
+
+ @ load the 1st set of values
+ vld3.32 {d0, d2, d4}, [r1]!
+ vld3.32 {d1, d3, d5}, [r1]!
+ vld3.32 {d18, d20, d22}, [r2]!
+ vld3.32 {d19, d21, d23}, [r2]!
+ subs r3, r3, #4 @ 4 for this set
+
+ @ calculate values for the 1st set
+ vsub.f32 q12, q0, q9
+ vsub.f32 q13, q1, q10
+ vsub.f32 q14, q2, q11
+
+ ble .L_mainloopend_vec3 @ uses the flags from the subs above: only one set to process
+
+.L_mainloop_vec3:
+ @ store the result for the current set
+ vst3.32 {d24, d26, d28}, [r0]!
+ vst3.32 {d25, d27, d29}, [r0]!
+
+ @ load the next set of values
+ vld3.32 {d0, d2, d4}, [r1]!
+ vld3.32 {d1, d3, d5}, [r1]!
+ vld3.32 {d18, d20, d22}, [r2]!
+ vld3.32 {d19, d21, d23}, [r2]!
+ subs r3, r3, #4
+
+ @ calculate values for the next set
+ vsub.f32 q12, q0, q9
+ vsub.f32 q13, q1, q10
+ vsub.f32 q14, q2, q11
+
+ bgt .L_mainloop_vec3 @ loop if r3 > 0, if we have at least another 4 vectors (12 floats) to process
+
+.L_mainloopend_vec3:
+ @ the last iteration for this call
+ @ store the result for the last set
+ vst3.32 {d24, d26, d28}, [r0]!
+ vst3.32 {d25, d27, d29}, [r0]!
+
+.L_check_vec3:
+ @ check if anything left to process at the end of the input array
+ cmp r4, #0
+ ble .L_return_vec3
+
+.L_secondloop_vec3:
+ @ process the last few items left in the input array
+ vld3.f32 {d0[0], d2[0], d4[0]}, [r1]! @ The values are loaded like so:
+ @ q0 = { V1.x, -, -, - };
+ @ q1 = { V1.y, -, -, - };
+ @ q2 = { V1.z, -, -, - };
+ vld3.f32 {d1[0], d3[0], d5[0]}, [r2]! @ The values are loaded like so:
+ @ q0 = { V1.x, -, V2.x, - };
+ @ q1 = { V1.y, -, V2.y, - };
+ @ q2 = { V1.z, -, V2.z, - };
+
+ subs r4, r4, #1 @ flags set here are consumed by the bgt below
+
+ @ calculate values: low half minus high half of q0, q1, q2
+ @ (only lane 0 of each result is stored)
+ vsub.f32 d0, d0, d1
+ vsub.f32 d2, d2, d3
+ vsub.f32 d4, d4, d5
+
+ vst3.32 {d0[0], d2[0], d4[0]}, [r0]!
+
+ bgt .L_secondloop_vec3
+
+.L_return_vec3:
+ @ return
+ pop {r4}
+ mov r0, #0
+ bx lr
+
+
+
+
+ .align 4
+ .global sub_vec4f_neon
+ .thumb
+ .thumb_func
+sub_vec4f_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t sub_vec4f(arm_vec4f_t * dst,
+ @ arm_vec4f_t * src1,
+ @ arm_vec4f_t * src2,
+ @ unsigned int count)
+ @
+ @ Computes dst[i] = src1[i] - src2[i] component-wise. Multiples of
+ @ four vectors are handled by a software-pipelined loop using two
+ @ de-interleaving vld4/vst4 per operand (two vectors each); the
+ @ remaining 0-3 vectors are handled one whole vector at a time.
+ @ Always returns 0 in r0.
+ @
+ @ r0: *dst & current dst entry's address
+ @ r1: *src1 & current src1 entry's address
+ @ r2: *src2 & current src2 entry's address
+ @ r3: int count & the number of items in the input array that can be
+ @ processed in chunks of 4 vectors
+ @
+ @ r4: the number of items that are left to be processed at the end of
+ @ the input array
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ push {r4}
+ and r4, r3, #3 @ r4 = count % 4;
+ sub r3, r3, r4 @ count = count - r4; r4 is what's left to be processed after the main loop
+
+ cmp r3, #0
+ beq .L_check_vec4
+
+ @ load the 1st set of values
+ vld4.32 {d0, d2, d4, d6}, [r1]!
+ vld4.32 {d1, d3, d5, d7}, [r1]!
+ vld4.32 {d16, d18, d20, d22}, [r2]!
+ vld4.32 {d17, d19, d21, d23}, [r2]!
+
+ subs r3, r3, #4 @ 4 for this set
+
+ @ calculate values for the 1st set
+ vsub.f32 q12, q0, q8
+ vsub.f32 q13, q1, q9
+ vsub.f32 q14, q2, q10
+ vsub.f32 q15, q3, q11
+
+ ble .L_mainloopend_vec4 @ uses the flags from the subs above: only one set to process
+
+.L_mainloop_vec4:
+ @ store the result for the current set
+ vst4.32 {d24, d26, d28, d30}, [r0]!
+ vst4.32 {d25, d27, d29, d31}, [r0]!
+
+ @ load the next set of values
+ vld4.32 {d0, d2, d4, d6}, [r1]!
+ vld4.32 {d1, d3, d5, d7}, [r1]!
+ vld4.32 {d16, d18, d20, d22}, [r2]!
+ vld4.32 {d17, d19, d21, d23}, [r2]!
+ subs r3, r3, #4
+
+ @ calculate values for the next set
+ vsub.f32 q12, q0, q8
+ vsub.f32 q13, q1, q9
+ vsub.f32 q14, q2, q10
+ vsub.f32 q15, q3, q11
+
+ bgt .L_mainloop_vec4 @ loop if r3 > 0, if we have at least another 4 vectors (16 floats) to process
+
+.L_mainloopend_vec4:
+ @ the last iteration for this call
+ @ store the result for the last set
+ vst4.32 {d24, d26, d28, d30}, [r0]!
+ vst4.32 {d25, d27, d29, d31}, [r0]!
+
+.L_check_vec4:
+ @ check if anything left to process at the end of the input array
+ cmp r4, #0
+ ble .L_return_vec4
+
+.L_secondloop_vec4:
+ @ process the last few items left in the input array
+ vld1.f32 {d0, d1}, [r1]! @ The values are loaded like so:
+ @ q0 = { V1.x, V1.y, V1.z, V1.w };
+ vld1.f32 {d2, d3}, [r2]! @ The values are loaded like so:
+ @ q1 = { V2.x, V2.y, V2.z, V2.w };
+
+ subs r4, r4, #1 @ flags set here are consumed by the bgt below
+
+ @ calculate values
+ vsub.f32 q0, q0, q1
+
+ vst1.32 {d0, d1}, [r0]!
+
+ bgt .L_secondloop_vec4
+
+.L_return_vec4:
+ @ return
+ pop {r4}
+ mov r0, #0
+ bx lr
--- /dev/null
+@
+@ Copyright 2011-12 ARM Limited
+@
+@ Licensed under the Apache License, Version 2.0 (the "License");
+@ you may not use this file except in compliance with the License.
+@ You may obtain a copy of the License at
+@
+@ http://www.apache.org/licenses/LICENSE-2.0
+@
+@ Unless required by applicable law or agreed to in writing, software
+@ distributed under the License is distributed on an "AS IS" BASIS,
+@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ See the License for the specific language governing permissions and
+@ limitations under the License.
+@
+
+@
+@ NE10 Library : math/NE10_subc.asm.s
+@
+
+ .text
+ .syntax unified
+
+.include "NE10header.s"
+
+ .balign 4
+ .global subc_float_asm
+ .thumb
+ .thumb_func
+
+subc_float_asm:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t subc_float(arm_float_t * dst,
+ @ arm_float_t * src, const arm_float_t cst,
+ @ unsigned int count)
+ @
+ @ Computes dst[i] = src[i] - cst for `count` single-precision
+ @ values, one element per iteration, then returns NE10_OK in r0.
+ @
+ @ r0: *dst
+ @ r1: *src
+ @ r2: cst (taken from a core register and moved into s3;
+ @ presumably the float's bit pattern under a soft-float
+ @ calling convention - TODO confirm)
+ @ r3: int count
+ @
+ @ r3: loop counter (counts down to zero)
+ @ r5: current item's offset in both src[] and dst[]
+ @ r6: current source item's address made of base(r1)+offset(r5)
+ @ r7: current destination item's address made of base(r0)+offset(r5)
+ @ r4: saved and restored with r5-r7 but not otherwise used here
+ @
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ push {r4, r5, r6, r7}
+ cbz r3, .LoopEndFloat
+ mov r5, #0
+ vmov s3, r2 @ Get cst into register s3 once; neither r2 nor s3 changes inside the loop
+
+.LoopBeginFloat:
+ add r6, r1, r5 @ Get current source item's address in memory
+ vldr s1, [r6, #0] @ Load src[i]
+ vsub.f32 s10, s1, s3 @ s10 = src[i] - cst
+ add r7, r0, r5 @ Get current destination item's address in memory
+ vstr s10, [r7, #0] @ Store the result back into the main memory
+ add r5, r5, #4 @ increase the offset by 1*sizeof(float)
+ subs r3, r3, #1 @ one fewer element left; sets the flags used by bne
+ bne .LoopBeginFloat @ Continue while the counter is non-zero
+
+.LoopEndFloat:
+ mov r0, NE10_OK @ Return NE10_OK
+ pop {r4, r5, r6, r7}
+ bx lr
+
+
+
+
+ .balign 4
+ .global subc_vec2f_asm
+ .thumb
+ .thumb_func
+
+subc_vec2f_asm:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t subc_vec2f(arm_vec2f_t * dst,
+ @ arm_vec2f_t * src, const arm_vec2f_t * cst,
+ @ unsigned int count)
+ @
+ @ Computes dst[i] = src[i] - *cst component-wise for `count` 2D
+ @ vectors, one vector per iteration, then returns NE10_OK in r0.
+ @ cst is re-read from memory on every iteration.
+ @
+ @ r0: *dst
+ @ r1: *src
+ @ r2: *cst
+ @ r3: int count
+ @
+ @ r3: loop counter (counts down to zero)
+ @ r5: current item's offset in both src[] and dst[]
+ @ r6: current source item's address made of base(r1)+offset(r5)
+ @ r7: current destination item's address made of base(r0)+offset(r5)
+ @ r4: saved and restored with r5-r7 but not otherwise used here
+ @
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ push {r4, r5, r6, r7}
+ cbz r3, .LoopEndVec2F
+ mov r5, #0
+
+.LoopBeginVec2F:
+ add r6, r1, r5 @ Get current source item's address in memory
+ vldr s1, [r6, #0] @ Load src[i].x and src[i].y
+ vldr s2, [r6, #4]
+ vldr s3, [r2, #0] @ Load cst->x and cst->y
+ vldr s4, [r2, #4]
+ vsub.f32 s10, s1, s3 @ s10 = src[i].x - cst->x
+ vsub.f32 s11, s2, s4 @ s11 = src[i].y - cst->y
+ add r7, r0, r5 @ Get current destination item's address in memory
+ vstr s10, [r7, #0] @ Store the results back into the main memory
+ vstr s11, [r7, #4]
+ add r5, r5, #8 @ increase the offset by 2*sizeof(float) @@ (for x and y)
+ subs r3, r3, #1 @ one fewer element left; sets the flags used by bne
+ bne .LoopBeginVec2F @ Continue while the counter is non-zero
+
+.LoopEndVec2F:
+ mov r0, NE10_OK @ Return NE10_OK
+ pop {r4, r5, r6, r7}
+ bx lr
+
+
+
+
+ .balign 4
+ .global subc_vec3f_asm
+ .thumb
+ .thumb_func
+
+subc_vec3f_asm:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t subc_vec3f(arm_vec3f_t * dst,
+ @ arm_vec3f_t * src, const arm_vec3f_t * cst,
+ @ unsigned int count)
+ @
+ @ Computes dst[i] = src[i] - *cst component-wise for `count` 3D
+ @ vectors, one vector per iteration, then returns NE10_OK in r0.
+ @ cst is re-read from memory on every iteration.
+ @
+ @ r0: *dst
+ @ r1: *src
+ @ r2: *cst
+ @ r3: int count
+ @
+ @ r3: loop counter (counts down to zero)
+ @ r5: current item's offset in both src[] and dst[]
+ @ r6: current source item's address made of base(r1)+offset(r5)
+ @ r7: current destination item's address made of base(r0)+offset(r5)
+ @ r4: saved and restored with r5-r7 but not otherwise used here
+ @
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ push {r4, r5, r6, r7}
+ cbz r3, .LoopEndVec3F
+ mov r5, #0
+
+.LoopBeginVec3F:
+ add r6, r1, r5 @ Get current source item's address in memory
+ vldr s1, [r6, #0] @ Load src[i].x, src[i].y , and src[i].z
+ vldr s2, [r6, #4]
+ vldr s3, [r6, #8]
+ vldr s4, [r2, #0] @ Load cst->x, cst->y, and cst->z
+ vldr s5, [r2, #4]
+ vldr s6, [r2, #8]
+ vsub.f32 s10, s1, s4 @ s10 = src[i].x - cst->x
+ vsub.f32 s11, s2, s5 @ s11 = src[i].y - cst->y
+ vsub.f32 s12, s3, s6 @ s12 = src[i].z - cst->z
+ add r7, r0, r5 @ Get current destination item's address in memory
+ vstr s10, [r7, #0] @ Store the results back into the main memory
+ vstr s11, [r7, #4]
+ vstr s12, [r7, #8]
+ add r5, r5, #12 @ increase the offset by 3*sizeof(float) @@ (for x, y, and z)
+ subs r3, r3, #1 @ one fewer element left; sets the flags used by bne
+ bne .LoopBeginVec3F @ Continue while the counter is non-zero
+
+.LoopEndVec3F:
+ mov r0, NE10_OK @ Return NE10_OK
+ pop {r4, r5, r6, r7}
+ bx lr
+
+
+
+
+ .balign 4
+ .global subc_vec4f_asm
+ .thumb
+ .thumb_func
+
+subc_vec4f_asm:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t subc_vec4f(arm_vec4f_t * dst,
+ @ arm_vec4f_t * src, const arm_vec4f_t * cst,
+ @ unsigned int count)
+ @
+ @ Computes dst[i] = src[i] - *cst component-wise for `count` 4D
+ @ vectors, one vector per iteration, then returns NE10_OK in r0.
+ @ cst is re-read from memory on every iteration.
+ @
+ @ r0: *dst
+ @ r1: *src
+ @ r2: *cst
+ @ r3: int count
+ @
+ @ r3: loop counter (counts down to zero)
+ @ r5: current item's offset in both src[] and dst[]
+ @ r6: current source item's address made of base(r1)+offset(r5)
+ @ r7: current destination item's address made of base(r0)+offset(r5)
+ @ r4: saved and restored with r5-r7 but not otherwise used here
+ @
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ push {r4, r5, r6, r7}
+ cbz r3, .LoopEndVec4F
+ mov r5, #0
+
+.LoopBeginVec4F:
+ add r6, r1, r5 @ Get current source item's address in memory
+ vldr s1, [r6, #0] @ Load src[i].x, src[i].y , src[i].z, and w
+ vldr s2, [r6, #4]
+ vldr s3, [r6, #8]
+ vldr s4, [r6, #12]
+ vldr s5, [r2, #0] @ Load cst->x, cst->y, cst->z, and w
+ vldr s6, [r2, #4]
+ vldr s7, [r2, #8]
+ vldr s8, [r2, #12]
+ vsub.f32 s10, s1, s5 @ s10 = src[i].x - cst->x
+ vsub.f32 s11, s2, s6 @ s11 = src[i].y - cst->y
+ vsub.f32 s12, s3, s7 @ s12 = src[i].z - cst->z
+ vsub.f32 s13, s4, s8 @ s13 = src[i].w - cst->w
+ add r7, r0, r5 @ Get current destination item's address in memory
+ vstr s10, [r7, #0] @ Store the results back into the main memory
+ vstr s11, [r7, #4]
+ vstr s12, [r7, #8]
+ vstr s13, [r7, #12]
+ add r5, r5, #16 @ increase the offset by 4*sizeof(float) @@ (for x, y, z, and w)
+ subs r3, r3, #1 @ one fewer element left; sets the flags used by bne
+ bne .LoopBeginVec4F @ Continue while the counter is non-zero
+
+.LoopEndVec4F:
+ mov r0, NE10_OK @ Return NE10_OK
+ pop {r4, r5, r6, r7}
+ bx lr
--- /dev/null
+/*
+ * Copyright 2011-12 ARM Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NE10 Library : math/NE10_subc.c
+ */
+
+#include "NE10_types.h"
+#include "macros.h"
+
+#include <assert.h>
+
+/*
+ * Plain-C subtraction of a scalar constant from a float array: the
+ * per-element statement computes dst[itr] = src[itr] - cst.
+ * NOTE(review): the iteration over itr and the return value come from
+ * NE10_XC_OPERATION_X_C in macros.h, which is not visible here - confirm.
+ */
+ne10_result_t subc_float_c (ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count)
+{
+ NE10_XC_OPERATION_X_C
+ (
+ dst[ itr ] = src[ itr ] - cst;
+ );
+}
+
+/*
+ * Plain-C subtraction of a constant 2D vector from an array of 2D vectors:
+ * dst[itr] = src[itr] - *cst for the x and y components.
+ * NOTE(review): the iteration over itr and the return value come from
+ * NE10_XC_OPERATION_X_C in macros.h, which is not visible here - confirm.
+ */
+ne10_result_t subc_vec2f_c (ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count)
+{
+ NE10_XC_OPERATION_X_C
+ (
+ dst[ itr ].x = src[ itr ].x - cst->x;
+ dst[ itr ].y = src[ itr ].y - cst->y;
+ );
+}
+
+/*
+ * Plain-C subtraction of a constant 3D vector from an array of 3D vectors:
+ * dst[itr] = src[itr] - *cst for the x, y and z components.
+ * NOTE(review): the iteration over itr and the return value come from
+ * NE10_XC_OPERATION_X_C in macros.h, which is not visible here - confirm.
+ */
+ne10_result_t subc_vec3f_c (ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count)
+{
+ NE10_XC_OPERATION_X_C
+ (
+ dst[ itr ].x = src[ itr ].x - cst->x;
+ dst[ itr ].y = src[ itr ].y - cst->y;
+ dst[ itr ].z = src[ itr ].z - cst->z;
+ );
+}
+
+/*
+ * Plain-C subtraction of a constant 4D vector from an array of 4D vectors:
+ * dst[itr] = src[itr] - *cst for the x, y, z and w components.
+ * NOTE(review): the iteration over itr and the return value come from
+ * NE10_XC_OPERATION_X_C in macros.h, which is not visible here - confirm.
+ */
+ne10_result_t subc_vec4f_c (ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count)
+{
+ NE10_XC_OPERATION_X_C
+ (
+ dst[ itr ].x = src[ itr ].x - cst->x;
+ dst[ itr ].y = src[ itr ].y - cst->y;
+ dst[ itr ].z = src[ itr ].z - cst->z;
+ dst[ itr ].w = src[ itr ].w - cst->w;
+ );
+}
--- /dev/null
+/*
+ * Copyright 2011-12 ARM Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NE10 Library : math/NE10_subc.neon.c
+ */
+
+#include "NE10_types.h"
+#include "macros.h"
+
+#include <assert.h>
+#include <arm_neon.h>
+
+
+/*
+ * NEON-intrinsics variant of subc for scalars. Supplies two statements to
+ * NE10_XC_OPERATION_FLOAT_NEON (macros.h): a quad-register subtract
+ * (vsubq_f32) producing n_dst, and a double-register subtract (vsub_f32)
+ * updating n_tmp_src in place.
+ * NOTE(review): n_src, n_cst, n_dst, n_tmp_src and n_tmp_cst are declared by
+ * the macro; the first statement is presumably used in the main vector loop
+ * and the second for leftover elements - confirm against the macro.
+ */
+ne10_result_t subc_float_neon (ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count)
+{
+ NE10_XC_OPERATION_FLOAT_NEON
+ (
+ n_dst = vsubq_f32 (n_src , n_cst);
+ ,
+ n_tmp_src = vsub_f32 (n_tmp_src, n_tmp_cst);
+ );
+}
+
+/*
+ * NEON-intrinsics variant of subc for 2D vectors. Supplies two statements to
+ * NE10_XC_OPERATION_VEC2F_NEON (macros.h): a quad-register subtract
+ * (vsubq_f32) producing n_dst, and a double-register subtract (vsub_f32)
+ * updating n_tmp_src in place.
+ * NOTE(review): the n_* variables are declared by the macro; the first
+ * statement is presumably used in the main vector loop and the second for
+ * leftover elements - confirm against the macro.
+ */
+ne10_result_t subc_vec2f_neon (ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count)
+{
+ NE10_XC_OPERATION_VEC2F_NEON
+ (
+ n_dst = vsubq_f32 (n_src , n_cst);
+ ,
+ n_tmp_src = vsub_f32 (n_tmp_src, n_tmp_cst);
+ );
+}
+
+/*
+ * NEON-intrinsics variant of subc for 3D vectors. Supplies two statement
+ * groups to NE10_XC_OPERATION_VEC3F_NEON (macros.h): three quad-register
+ * subtracts (vsubq_f32) producing n_dst1..n_dst3, and three double-register
+ * subtracts (vsub_f32) updating n_tmp_src.val[0..2] in place.
+ * NOTE(review): the n_* variables are declared by the macro; the first group
+ * is presumably used in the main vector loop and the second for leftover
+ * elements - confirm against the macro.
+ */
+ne10_result_t subc_vec3f_neon (ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count)
+{
+ NE10_XC_OPERATION_VEC3F_NEON
+ (
+ n_dst1 = vsubq_f32 (n_src1 , n_cst1);
+ n_dst2 = vsubq_f32 (n_src2 , n_cst2);
+ n_dst3 = vsubq_f32 (n_src3 , n_cst3);
+ ,
+ n_tmp_src.val[0] = vsub_f32 (n_tmp_src.val[0], n_tmp_cst.val[0]);
+ n_tmp_src.val[1] = vsub_f32 (n_tmp_src.val[1], n_tmp_cst.val[1]);
+ n_tmp_src.val[2] = vsub_f32 (n_tmp_src.val[2], n_tmp_cst.val[2]);
+ );
+}
+
+/*
+ * NEON-intrinsics variant of subc for 4D vectors. Supplies one statement to
+ * NE10_XC_OPERATION_VEC4F_NEON (macros.h): a quad-register subtract
+ * (vsubq_f32) producing n_dst from n_src and n_cst.
+ * NOTE(review): the n_* variables are declared by the macro; unlike the
+ * float/vec2f/vec3f variants only one statement is passed - presumably no
+ * leftover loop is needed; confirm against the macro.
+ */
+ne10_result_t subc_vec4f_neon (ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count)
+{
+ NE10_XC_OPERATION_VEC4F_NEON
+ (
+ n_dst = vsubq_f32 (n_src , n_cst);
+ );
+}
--- /dev/null
+@
+@ Copyright 2011-12 ARM Limited
+@
+@ Licensed under the Apache License, Version 2.0 (the "License");
+@ you may not use this file except in compliance with the License.
+@ You may obtain a copy of the License at
+@
+@ http://www.apache.org/licenses/LICENSE-2.0
+@
+@ Unless required by applicable law or agreed to in writing, software
+@ distributed under the License is distributed on an "AS IS" BASIS,
+@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ See the License for the specific language governing permissions and
+@ limitations under the License.
+@
+
+@
+@ NE10 Library : math/NE10_submat.asm.s
+@
--- /dev/null
+/*
+ * Copyright 2011-12 ARM Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NE10 Library : math/NE10_submat.c
+ */
+
+#include "NE10_types.h"
+#include "macros.h"
+
+#include <assert.h>
+
+/*
+ * Plain-C element-wise subtraction of two arrays of 2x2 matrices:
+ * every cN.rM field of dst[itr] receives src1[itr].cN.rM - src2[itr].cN.rM.
+ * NOTE(review): the iteration over itr and the return value come from
+ * NE10_X_OPERATION_FLOAT_C in macros.h, which is not visible here - confirm.
+ */
+ne10_result_t submat_2x2f_c (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count)
+{
+ NE10_X_OPERATION_FLOAT_C
+ (
+ dst[ itr ].c1.r1 = src1[ itr ].c1.r1 - src2[ itr ].c1.r1;
+ dst[ itr ].c1.r2 = src1[ itr ].c1.r2 - src2[ itr ].c1.r2;
+
+ dst[ itr ].c2.r1 = src1[ itr ].c2.r1 - src2[ itr ].c2.r1;
+ dst[ itr ].c2.r2 = src1[ itr ].c2.r2 - src2[ itr ].c2.r2;
+ );
+}
+
+/*
+ * Plain-C element-wise subtraction of two arrays of 3x3 matrices:
+ * every cN.rM field of dst[itr] receives src1[itr].cN.rM - src2[itr].cN.rM.
+ * NOTE(review): the iteration over itr and the return value come from
+ * NE10_X_OPERATION_FLOAT_C in macros.h, which is not visible here - confirm.
+ */
+ne10_result_t submat_3x3f_c (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count)
+{
+ NE10_X_OPERATION_FLOAT_C
+ (
+ dst[ itr ].c1.r1 = src1[ itr ].c1.r1 - src2[ itr ].c1.r1;
+ dst[ itr ].c1.r2 = src1[ itr ].c1.r2 - src2[ itr ].c1.r2;
+ dst[ itr ].c1.r3 = src1[ itr ].c1.r3 - src2[ itr ].c1.r3;
+
+ dst[ itr ].c2.r1 = src1[ itr ].c2.r1 - src2[ itr ].c2.r1;
+ dst[ itr ].c2.r2 = src1[ itr ].c2.r2 - src2[ itr ].c2.r2;
+ dst[ itr ].c2.r3 = src1[ itr ].c2.r3 - src2[ itr ].c2.r3;
+
+ dst[ itr ].c3.r1 = src1[ itr ].c3.r1 - src2[ itr ].c3.r1;
+ dst[ itr ].c3.r2 = src1[ itr ].c3.r2 - src2[ itr ].c3.r2;
+ dst[ itr ].c3.r3 = src1[ itr ].c3.r3 - src2[ itr ].c3.r3;
+ );
+}
+
+/*
+ * Plain-C element-wise subtraction of two arrays of 4x4 matrices:
+ * every cN.rM field of dst[itr] receives src1[itr].cN.rM - src2[itr].cN.rM.
+ * NOTE(review): the iteration over itr and the return value come from
+ * NE10_X_OPERATION_FLOAT_C in macros.h, which is not visible here - confirm.
+ */
+ne10_result_t submat_4x4f_c (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count)
+{
+ NE10_X_OPERATION_FLOAT_C
+ (
+ dst[ itr ].c1.r1 = src1[ itr ].c1.r1 - src2[ itr ].c1.r1;
+ dst[ itr ].c1.r2 = src1[ itr ].c1.r2 - src2[ itr ].c1.r2;
+ dst[ itr ].c1.r3 = src1[ itr ].c1.r3 - src2[ itr ].c1.r3;
+ dst[ itr ].c1.r4 = src1[ itr ].c1.r4 - src2[ itr ].c1.r4;
+
+ dst[ itr ].c2.r1 = src1[ itr ].c2.r1 - src2[ itr ].c2.r1;
+ dst[ itr ].c2.r2 = src1[ itr ].c2.r2 - src2[ itr ].c2.r2;
+ dst[ itr ].c2.r3 = src1[ itr ].c2.r3 - src2[ itr ].c2.r3;
+ dst[ itr ].c2.r4 = src1[ itr ].c2.r4 - src2[ itr ].c2.r4;
+
+ dst[ itr ].c3.r1 = src1[ itr ].c3.r1 - src2[ itr ].c3.r1;
+ dst[ itr ].c3.r2 = src1[ itr ].c3.r2 - src2[ itr ].c3.r2;
+ dst[ itr ].c3.r3 = src1[ itr ].c3.r3 - src2[ itr ].c3.r3;
+ dst[ itr ].c3.r4 = src1[ itr ].c3.r4 - src2[ itr ].c3.r4;
+
+ dst[ itr ].c4.r1 = src1[ itr ].c4.r1 - src2[ itr ].c4.r1;
+ dst[ itr ].c4.r2 = src1[ itr ].c4.r2 - src2[ itr ].c4.r2;
+ dst[ itr ].c4.r3 = src1[ itr ].c4.r3 - src2[ itr ].c4.r3;
+ dst[ itr ].c4.r4 = src1[ itr ].c4.r4 - src2[ itr ].c4.r4;
+ );
+}
--- /dev/null
+/*
+ * Copyright 2011-12 ARM Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "NE10_types.h"
+
+/*
+ * NEON entry point for 2x2 matrix subtraction.
+ *
+ * Forwards to sub_vec2f_neon, reinterpreting each matrix array as an array
+ * of ne10_vec2f_t; one 2x2 matrix is handed over as two vectors, hence the
+ * vector count of count * 2. The callee's result is returned unchanged.
+ *
+ * NOTE(review): sub_vec2f_neon has no declaration visible in this file --
+ * confirm that NE10_types.h (or the build) provides its prototype.
+ */
+ne10_result_t submat_2x2f_neon (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src1, ne10_mat2x2f_t * src2, ne10_uint32_t count)
+{
+ ne10_vec2f_t *out = (ne10_vec2f_t*) dst;
+ ne10_vec2f_t *lhs = (ne10_vec2f_t*) src1;
+ ne10_vec2f_t *rhs = (ne10_vec2f_t*) src2;
+ ne10_uint32_t vec_count = count * 2;
+
+ return sub_vec2f_neon (out, lhs, rhs, vec_count);
+}
+
+/*
+ * NEON entry point for 3x3 matrix subtraction.
+ *
+ * Forwards to sub_vec3f_neon, reinterpreting each matrix array as an array
+ * of ne10_vec3f_t; one 3x3 matrix is handed over as three vectors, hence the
+ * vector count of count * 3. The callee's result is returned unchanged.
+ *
+ * NOTE(review): sub_vec3f_neon has no declaration visible in this file --
+ * confirm that NE10_types.h (or the build) provides its prototype.
+ */
+ne10_result_t submat_3x3f_neon (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src1, ne10_mat3x3f_t * src2, ne10_uint32_t count)
+{
+ ne10_vec3f_t *out = (ne10_vec3f_t*) dst;
+ ne10_vec3f_t *lhs = (ne10_vec3f_t*) src1;
+ ne10_vec3f_t *rhs = (ne10_vec3f_t*) src2;
+ ne10_uint32_t vec_count = count * 3;
+
+ return sub_vec3f_neon (out, lhs, rhs, vec_count);
+}
+
+/*
+ * NEON entry point for 4x4 matrix subtraction.
+ *
+ * Forwards to sub_vec4f_neon, reinterpreting each matrix array as an array
+ * of ne10_vec4f_t; one 4x4 matrix is handed over as four vectors, hence the
+ * vector count of count * 4. The callee's result is returned unchanged.
+ *
+ * NOTE(review): sub_vec4f_neon has no declaration visible in this file --
+ * confirm that NE10_types.h (or the build) provides its prototype.
+ */
+ne10_result_t submat_4x4f_neon (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src1, ne10_mat4x4f_t * src2, ne10_uint32_t count)
+{
+ ne10_vec4f_t *out = (ne10_vec4f_t*) dst;
+ ne10_vec4f_t *lhs = (ne10_vec4f_t*) src1;
+ ne10_vec4f_t *rhs = (ne10_vec4f_t*) src2;
+ ne10_uint32_t vec_count = count * 4;
+
+ return sub_vec4f_neon (out, lhs, rhs, vec_count);
+}
+
--- /dev/null
+@
+@ Copyright 2011-12 ARM Limited
+@
+@ Licensed under the Apache License, Version 2.0 (the "License");
+@ you may not use this file except in compliance with the License.
+@ You may obtain a copy of the License at
+@
+@ http://www.apache.org/licenses/LICENSE-2.0
+@
+@ Unless required by applicable law or agreed to in writing, software
+@ distributed under the License is distributed on an "AS IS" BASIS,
+@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ See the License for the specific language governing permissions and
+@ limitations under the License.
+@
+
+@
+@ NE10 Library : math/NE10_transmat.asm.s
+@
--- /dev/null
+/*
+ * Copyright 2011-12 ARM Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NE10 Library : math/NE10_transmat.c
+ */
+
+#include "NE10_types.h"
+#include "macros.h"
+#include <math.h>
+
+#include <assert.h>
+
+/*
+ * Exchanges the two floats that a and b point to.
+ *
+ * Declared `static inline` so the helper is private to this translation
+ * unit: a plain `inline` definition has external linkage, which under C99
+ * semantics emits no out-of-line body (any non-inlined call would fail to
+ * link) and under GNU89 semantics exports a global symbol named `swap`.
+ */
+static inline void swap (ne10_float32_t *a, ne10_float32_t *b)
+{
+ ne10_float32_t tmp = *a;
+ *a = *b;
+ *b = tmp;
+}
+
+/*
+ * Transposes 2x2 float matrices: dst[itr] receives src[itr] with rows and
+ * columns exchanged (dst.cN.rM = src.cM.rN).
+ *
+ * The statements below are passed as the argument of
+ * NE10_DETMAT_OPERATION_X_C.
+ * NOTE(review): the loop over `itr`, the use of `count` and the returned
+ * ne10_result_t are presumably supplied by that macro (macros.h) -- confirm.
+ * NOTE(review): dst is written while src is still being read, so if dst and
+ * src refer to the same array the off-diagonal elements would be read after
+ * having been overwritten, unless the macro guards against that -- confirm.
+ */
+ne10_result_t transmat_2x2f_c (ne10_mat2x2f_t * dst, ne10_mat2x2f_t * src, ne10_uint32_t count)
+{
+ NE10_DETMAT_OPERATION_X_C
+ (
+ dst[ itr ].c1.r1 = src[ itr ].c1.r1;
+ dst[ itr ].c1.r2 = src[ itr ].c2.r1;
+ dst[ itr ].c2.r1 = src[ itr ].c1.r2;
+ dst[ itr ].c2.r2 = src[ itr ].c2.r2;
+ );
+}
+
+/*
+ * Transposes 3x3 float matrices: dst[itr] receives src[itr] with rows and
+ * columns exchanged (dst.cN.rM = src.cM.rN).
+ *
+ * The statements below are passed as the argument of
+ * NE10_DETMAT_OPERATION_X_C.
+ * NOTE(review): the loop over `itr`, the use of `count` and the returned
+ * ne10_result_t are presumably supplied by that macro (macros.h) -- confirm.
+ * NOTE(review): dst is written while src is still being read, so if dst and
+ * src refer to the same array the off-diagonal elements would be read after
+ * having been overwritten, unless the macro guards against that -- confirm.
+ */
+ne10_result_t transmat_3x3f_c (ne10_mat3x3f_t * dst, ne10_mat3x3f_t * src, ne10_uint32_t count)
+{
+ NE10_DETMAT_OPERATION_X_C
+ (
+ dst[ itr ].c1.r1 = src[ itr ].c1.r1;
+ dst[ itr ].c1.r2 = src[ itr ].c2.r1;
+ dst[ itr ].c1.r3 = src[ itr ].c3.r1;
+
+ dst[ itr ].c2.r1 = src[ itr ].c1.r2;
+ dst[ itr ].c2.r2 = src[ itr ].c2.r2;
+ dst[ itr ].c2.r3 = src[ itr ].c3.r2;
+
+ dst[ itr ].c3.r1 = src[ itr ].c1.r3;
+ dst[ itr ].c3.r2 = src[ itr ].c2.r3;
+ dst[ itr ].c3.r3 = src[ itr ].c3.r3;
+ );
+}
+
+/*
+ * Transposes 4x4 float matrices: dst[itr] receives src[itr] with rows and
+ * columns exchanged (dst.cN.rM = src.cM.rN).
+ *
+ * The statements below are passed as the argument of
+ * NE10_DETMAT_OPERATION_X_C.
+ * NOTE(review): the loop over `itr`, the use of `count` and the returned
+ * ne10_result_t are presumably supplied by that macro (macros.h) -- confirm.
+ * NOTE(review): dst is written while src is still being read, so if dst and
+ * src refer to the same array the off-diagonal elements would be read after
+ * having been overwritten, unless the macro guards against that -- confirm.
+ */
+ne10_result_t transmat_4x4f_c (ne10_mat4x4f_t * dst, ne10_mat4x4f_t * src, ne10_uint32_t count)
+{
+ NE10_DETMAT_OPERATION_X_C
+ (
+ dst[ itr ].c1.r1 = src[ itr ].c1.r1;
+ dst[ itr ].c1.r2 = src[ itr ].c2.r1;
+ dst[ itr ].c1.r3 = src[ itr ].c3.r1;
+ dst[ itr ].c1.r4 = src[ itr ].c4.r1;
+
+ dst[ itr ].c2.r1 = src[ itr ].c1.r2;
+ dst[ itr ].c2.r2 = src[ itr ].c2.r2;
+ dst[ itr ].c2.r3 = src[ itr ].c3.r2;
+ dst[ itr ].c2.r4 = src[ itr ].c4.r2;
+
+ dst[ itr ].c3.r1 = src[ itr ].c1.r3;
+ dst[ itr ].c3.r2 = src[ itr ].c2.r3;
+ dst[ itr ].c3.r3 = src[ itr ].c3.r3;
+ dst[ itr ].c3.r4 = src[ itr ].c4.r3;
+
+ dst[ itr ].c4.r1 = src[ itr ].c1.r4;
+ dst[ itr ].c4.r2 = src[ itr ].c2.r4;
+ dst[ itr ].c4.r3 = src[ itr ].c3.r4;
+ dst[ itr ].c4.r4 = src[ itr ].c4.r4;
+ );
+}
--- /dev/null
+@
+@ Copyright 2011-12 ARM Limited
+@
+@ Licensed under the Apache License, Version 2.0 (the "License");
+@ you may not use this file except in compliance with the License.
+@ You may obtain a copy of the License at
+@
+@ http://www.apache.org/licenses/LICENSE-2.0
+@
+@ Unless required by applicable law or agreed to in writing, software
+@ distributed under the License is distributed on an "AS IS" BASIS,
+@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ See the License for the specific language governing permissions and
+@ limitations under the License.
+@
+
+@
+@ NE10 Library : math/NE10_transmat.neon.s
+@
+
+
+
+
+ .text
+ .syntax unified
+
+.include "NE10header.s"
+.include "NE10_detmat.neon.inc.s"
+
+
+
+
+ .balign 4
+ .global transmat_2x2f_neon
+ .thumb
+ .thumb_func
+
+transmat_2x2f_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t transmat_2x2f(arm_mat2x2f_t * dst,
+ @ arm_mat2x2f_t * src,
+ @ unsigned int count)
+ @
+ @ Writes the transpose of each of the `count` 2x2 float matrices at src
+ @ to dst. Four matrices are handled per main-loop iteration; the
+ @ remaining count % 4 matrices are handled one at a time afterwards.
+ @
+ @ r0: *dst & current dst entry's address
+ @ r1: *src & current src entry's address
+ @ r2: int count & the number of items in the input array that can be
+ @ processed in chunks of 4 matrices
+ @
+ @ r3: the number of items that are left to be processed at the end
+ @ of the input array
+ @
+ @ Always returns 0 in r0.
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ and r3, r2, #3 @ r3 = count % 4;
+ sub r2, r2, r3 @ r2 = count - r3; the multiple-of-4 part handled by the main loop
+
+ cmp r2, #0
+ beq .L_check_mat2x2
+
+.L_mainloop_mat2x2:
+
+ subs r2, r2, #4 @ sets the flags tested by the bgt below (the NEON ops in between leave them untouched)
+
+ @ vld4 de-interleaves groups of 4 floats (one 2x2 matrix each): after both
+ @ loads q8 holds the 1st float of four matrices, q9 the 2nd, q10 the 3rd
+ @ and q11 the 4th
+ vld4.32 {d16, d18, d20, d22}, [r1]!
+ vld4.32 {d17, d19, d21, d23}, [r1]!
+
+ @ exchanging the 2nd and 3rd float of every matrix is the 2x2 transpose
+ vswp q9, q10
+
+ vst4.32 {d16, d18, d20, d22}, [r0]!
+ vst4.32 {d17, d19, d21, d23}, [r0]!
+
+ bgt .L_mainloop_mat2x2 @ loop if r2 > 0, i.e. at least another 4 matrices (16 floats) remain
+
+.L_mainloopend_mat2x2:
+
+.L_check_mat2x2:
+ @ check if anything left to process at the end of the input array
+ cmp r3, #0
+ ble .L_return_mat2x2
+
+.L_secondloop_mat2x2:
+ @ process the last few items left in the input array, one matrix
+ @ (4 floats, lane 0 of d16/d18/d20/d22) per iteration
+ vld4.32 {d16[0], d18[0], d20[0], d22[0]}, [r1]!
+
+ vswp d18, d20
+
+ subs r3, r3, #1
+
+ vst4.32 {d16[0], d18[0], d20[0], d22[0]}, [r0]!
+
+ bgt .L_secondloop_mat2x2
+
+.L_return_mat2x2:
+ @ return 0
+ mov r0, #0
+ bx lr
+
+
+
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ This macro stores two 3x3 matrices (2 x 9 floats) held in q8-q13 to
+ @ the address in r0, advancing r0 past the stored data.
+ @ It is used by transmat_3x3f_neon to write out the transposed results.
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ .macro STORE_3x3TRNMATS
+ @ rearrange the results for use in a "vst3" instruction...
+ vtrn.32 q8 , q11
+ vtrn.32 q9 , q12
+ vtrn.32 q10, q13
+
+ @ each pair of stores writes 6 + 3 floats, i.e. one 3x3 matrix
+ vst3.32 { d16 , d18 , d20 }, [r0]!
+ vst3.32 { d17[0], d19[0], d21[0]}, [r0]!
+ vst3.32 { d22 , d24 , d26 }, [r0]!
+ vst3.32 { d23[0], d25[0], d27[0]}, [r0]!
+ .endm
+
+
+
+
+ .align 2
+ .global transmat_3x3f_neon
+ .thumb
+ .thumb_func
+transmat_3x3f_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t transmat_3x3f(arm_mat3x3f_t * dst,
+ @ arm_mat3x3f_t * src,
+ @ unsigned int count)
+ @
+ @ Writes the transpose of each of the `count` 3x3 float matrices at src
+ @ to dst. The main loop handles the multiple-of-4 part of count, two
+ @ matrices per iteration; the remaining count % 4 matrices are handled
+ @ one at a time afterwards.
+ @
+ @ r0: *dst & current dst entry's address
+ @ r1: *src & current src entry's address
+ @ r2: int count & the number of items in the input array that can be
+ @ processed by the main loop
+ @
+ @ r3: the number of items that are left to be processed at the end
+ @ of the input array
+ @
+ @ Always returns 0 in r0.
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ and r3, r2, #3 @ r3 = count % 4;
+ sub r2, r2, r3 @ r2 = count - r3; the multiple-of-4 part handled by the main loop
+
+ cmp r2, #0
+ beq .L_check_mat3x3
+
+.L_mainloop_mat3x3:
+ @ NOTE(review): LOAD_3x3MATS_ARGS comes from NE10_detmat.neon.inc.s (not
+ @ visible here); presumably it loads two matrices from [r1] into q8-q13
+ @ and advances r1 -- confirm
+ LOAD_3x3MATS_ARGS d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, q8, q9, q10, q11, q12, q13, r1
+
+ subs r2, r2, #2 @ two matrices per iteration; sets the flags tested by the bgt below
+
+ vswp d20, d17
+ vswp d22, d18
+ vswp d26, d19
+
+ STORE_3x3TRNMATS
+
+ bgt .L_mainloop_mat3x3 @ loop if r2 > 0, i.e. matrices remain for the main loop
+
+.L_mainloopend_mat3x3:
+
+.L_check_mat3x3:
+ @ check if anything left to process at the end of the input array
+ cmp r3, #0
+ ble .L_return_mat3x3
+
+.L_secondloop_mat3x3:
+ @ process the last few items left in the input array
+ @ load one matrix (6 + 3 floats) into d16-d21
+ vld3.32 { d16 , d18 , d20 }, [r1]!
+ vld3.32 { d17[0], d19[0], d21[0]}, [r1]!
+
+ vtrn.32 q8 , q11
+ vtrn.32 q9 , q12
+ vtrn.32 q10, q13
+
+ subs r3, r3, #1
+
+ vswp d20, d17
+ vswp d22, d18
+ vswp d26, d19
+
+
+
+ @ store the result for this matrix (only d16-d21 are written back)
+ vtrn.32 q8 , q11
+ vtrn.32 q9 , q12
+ vtrn.32 q10, q13
+
+ vst3.32 { d16 , d18 , d20 }, [r0]!
+ vst3.32 { d17[0], d19[0], d21[0]}, [r0]!
+
+ bgt .L_secondloop_mat3x3
+
+.L_return_mat3x3:
+ @ return 0
+ mov r0, #0
+ bx lr
+
+
+
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ This macro stores two 4x4 matrices (2 x 16 floats) held in q8-q15 to
+ @ the address in r0, advancing r0 past the stored data.
+ @ Despite the "INV" in its name it performs no inversion; it is used by
+ @ transmat_4x4f_neon to write out the transposed results.
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ .macro STORE_4x4INVMATS
+ @ rearrange the results for use in a "vst4" instruction...
+ vtrn.32 q8, q12
+ vtrn.32 q9, q13
+ vtrn.32 q10, q14
+ vtrn.32 q11, q15
+
+ @ each pair of stores writes 8 + 8 floats, i.e. one 4x4 matrix
+ vst4.32 { d16 , d18 , d20 , d22 }, [r0]!
+ vst4.32 { d17 , d19 , d21 , d23 }, [r0]!
+ vst4.32 { d24 , d26 , d28 , d30 }, [r0]!
+ vst4.32 { d25 , d27 , d29 , d31 }, [r0]!
+ .endm
+
+
+
+
+ .align 2
+ .global transmat_4x4f_neon
+ .thumb
+ .thumb_func
+transmat_4x4f_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @
+ @ arm_result_t transmat_4x4f(arm_mat4x4f_t * dst,
+ @ arm_mat4x4f_t * src,
+ @ unsigned int count)
+ @
+ @ Writes the transpose of each of the `count` 4x4 float matrices at src
+ @ to dst. The main loop handles the multiple-of-4 part of count, two
+ @ matrices per iteration; the remaining count % 4 matrices are handled
+ @ one at a time afterwards.
+ @
+ @ r0: *dst & current dst entry's address
+ @ r1: *src & current src entry's address
+ @ r2: int count & the number of items in the input array that can be
+ @ processed by the main loop
+ @
+ @ r3: the number of items that are left to be processed at the end
+ @ of the input array
+ @
+ @ Always returns 0 in r0.
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+ and r3, r2, #3 @ r3 = count % 4;
+ sub r2, r2, r3 @ r2 = count - r3; the multiple-of-4 part handled by the main loop
+
+ cmp r2, #0
+ beq .L_check_mat4x4
+
+.L_mainloop_mat4x4:
+
+ @ NOTE(review): LOAD_4x4MATS_ARGS comes from NE10_detmat.neon.inc.s (not
+ @ visible here); presumably it loads two matrices from [r1] into q8-q15
+ @ and advances r1 -- confirm
+ LOAD_4x4MATS_ARGS d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31, q8, q9, q10, q11, q12, q13, q14, q15, r1
+
+
+ subs r2, r2, #2 @ two matrices per iteration; sets the flags tested by the bgt below
+
+ vswp d18, d24
+ vswp d17, d20
+ vswp d22, d25
+ vswp d19, d28
+ vswp d27, d30
+ vswp d23, d29
+
+
+ STORE_4x4INVMATS
+
+ bgt .L_mainloop_mat4x4 @ loop if r2 > 0, i.e. matrices remain for the main loop
+
+.L_mainloopend_mat4x4:
+
+.L_check_mat4x4:
+ @ check if anything left to process at the end of the input array
+ cmp r3, #0
+ ble .L_return_mat4x4
+
+.L_secondloop_mat4x4:
+ @ process the last few items left in the input array
+ @ load one matrix (8 + 8 floats) into q8-q11
+ vld4.32 { d16 , d18 , d20 , d22 }, [r1]!
+ vld4.32 { d17 , d19 , d21 , d23 }, [r1]!
+
+ vtrn.32 q8, q12
+ vtrn.32 q9, q13
+ vtrn.32 q10, q14
+ vtrn.32 q11, q15
+
+ subs r3, r3, #1
+
+ vswp d18, d24
+ vswp d17, d20
+ vswp d22, d25
+ vswp d19, d28
+ vswp d27, d30
+ vswp d23, d29
+
+
+ @ store the results (only q8-q11 are written back)
+ vtrn.32 q8, q12
+ vtrn.32 q9, q13
+ vtrn.32 q10, q14
+ vtrn.32 q11, q15
+
+ vst4.32 { d16 , d18 , d20 , d22 }, [r0]!
+ vst4.32 { d17 , d19 , d21 , d23 }, [r0]!
+
+
+ bgt .L_secondloop_mat4x4
+
+.L_return_mat4x4:
+ @ return 0
+ mov r0, #0
+ bx lr
+
+++ /dev/null
-#!/usr/bin/env perl
-#
-# Copyright 2011-12 ARM Limited
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-#
-# NE10 Library : nightly.pl
-#
-
-use warnings;
-use strict;
-
-# other variables
-my $iterations = 2000; # how many iterations each test must go through ?
-my $files_list=`cat projectfile | tr '\n' ';'`; #`find . -maxdepth 1 -type f -iname *.prj | grep './'`; # units to be built are listed in "projectfile"
-my @files = split( /;/, $files_list ); # array of files to be built
-my @built; # list of successfully built units
-my @failed; # list of units that failed to build
-my @warn; # list of units that did build but returned with an error message or too many warnings
-
-my $units_count = 0;
-my $units_succeeded = 0;
-my $success_percentage = 0;
-
-# get list of units and build them
-foreach my $fl (@files) {
- $units_count ++;
- #print "<".$fl.">\n"; # debug print
- my $make_cmd = "make NE10_$fl.test_r.ex";
- system ( $make_cmd );
- if ( $? != 0 )
- {
- # failed to build
- push(@failed, $fl);
- }
- else
- {
- # built successfully...
- push(@built, $fl);
- $units_succeeded ++;
- }
-
-}
-
-$success_percentage = 100 * $units_succeeded / $units_count;
-
-
-#get a test log to be stored in the "test_index_tbl"
-system ( "./getlog.sh > ./testlog.txt" );
-my $platform = `echo \$NE10PLATFORM`;
-my $syslog = `cat ./testlog.txt`;
-my $testlog; # this will keep the summary text that will be stored in the database
-my $ACCEPTABLE_WARNS = 10; # note: this is defined in unit_test_common.h
-
-# try and run perf on the successfully built units
-
-foreach my $success (@built)
-{
- my $perf_cmd = "./runperf.sh NE10_$success $iterations";
- system ( $perf_cmd );
- if ( ($? < 0) || ($? > $ACCEPTABLE_WARNS) )
- {
- # an error while running the test
- push(@warn, $success);
- }
-}
-
-# print out a summary of this run
-if (scalar(@failed) == 0) {
- print "** No Build Failures\n";
-} else {
- print "** BUILDS FAILED\n";
- for my $fail (@failed) {
- print " $fail failed to build\n";
- }
-}
-if (scalar(@warn) == 0) {
- print "** No Test Failures\n";
-} else {
- print "** TESTS FAILED!\n";
- for my $warned (@warn) {
- print " $warned failed test\n";
- }
-}
-#print ( $testlog );
+++ /dev/null
-addc
-subc
-rsbc
-mulc
-divc
-mlac
-setc
-add
-sub
-mul
-div
-mla
-abs
-len
-normalize
-dot
-cross
-addmat
-submat
-mulmat
-mulcmatvec
-detmat
-invmat
-transmat
-identitymat
+++ /dev/null
-#!/bin/sh
-#
-# Copyright 2011-12 ARM Limited
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-#
-# NE10 Library : removetabs.sh
-#
-#!/bin/bash
-
-# This script removes tab characters in files and replaces them with
-# the right number of spaces. It also removes trailing whitespaces.
-
-# remove trailing whitespaces
-LSw=`grep -lsri --exclude="Makefile" --exclude-dir=".git" '\s$' .`;
-for flw in $LSw
-do
- echo "HAS SPACES: " $flw; # just to see a list of the files that include unwanted tabs
- perms=`stat -c '%a' $flw`;
- sed 's/[ \t]*$//gi' $flw > .exp.tmp;
- sync;
- # rename the file to the original file
- mv .exp.tmp $flw;
- chmod $perms $flw;
- sync;
-done
-
-# remove tabs
-chtab=$'\t'; # only works in bash but not in sh
-LSt=`grep -lrsi --exclude="Makefile" --exclude-dir=".git" "$chtab" .`;
-for flt in $LSt
-do
- echo "HAS TABS: " $flt; # just to see a list of the files that include unwanted tabs
- perms=`stat -c '%a' $flt`;
- # remove tabs
- expand $flt > .exp.tmp;
- sync;
- # rename the file to the original file
- mv .exp.tmp $flt;
- chmod $perms $flt;
- sync;
-done
-
+++ /dev/null
-#!/bin/sh
-#
-# Copyright 2011-12 ARM Limited
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-#
-# NE10 Library : review.sh
-#
-
-BRANCH=$1
-
-BASE=${2-"master"}
-
-if [ "$BRANCH" = "" ]; then
- echo "Usage: review.sh <branch to review> [parent branch]"
- exit
-else
-
- LABEL=`echo $1 | perl -pe '$_ =~ /dev\/([a-zA-Z0-9]+)\/(.+)/;$_=$2'`
- GLUSER=`echo $1 | perl -pe '$_ =~ /dev\/([a-zA-Z0-9]+)\/(.+)/;$_=$1'`
-
- NEWBRANCH="staging/$GLUSER/$LABEL"
-
- echo "Pushing $BRANCH from $BASE for review as $NEWBRANCH"
-
- git branch $NEWBRANCH $BASE
- git push origin $NEWBRANCH
- git checkout $NEWBRANCH
- git rebase $BRANCH
- git push origin $NEWBRANCH
-
-fi
-
+++ /dev/null
-#!/bin/sh
-#
-# Copyright 2011-12 ARM Limited
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-#
-# NE10 Library : runperf.sh
-#
-
-# NOTE: the following string comparisons differ between BASH and SH
-if [ ! -n "$1" ]; then exit; fi
-if [ ! -n "$2" ]; then exit; fi
-if [ ! -e "./$1.test_r.ex" ]; then exit; fi
-./$1.test_r.ex 0
-OP_COUNT=$?
-IMPL_COUNT=3
-ITERATIONS=$2
-PERF_CMD="perf stat -e cycles,instructions,cache-references,cache-misses,branches,branch-misses,bus-cycles,cpu-clock,task-clock,faults,minor-faults,major-faults,context-switches,migrations,alignment-faults,emulation-faults -x,"
-rm res_*_$1_*.txt
-for o in $(seq $OP_COUNT)
-do
- ./$1.test_r.ex $o 0 $ITERATIONS
- RET=$?
- if [ "$RET" -ne "0" ]; then
- echo " SEND MAIL ~~ ERROR: Unit [$1] operation [$o] has returned with error code $RET...";
- #continue; # if one of the operations in a unit has a mismatching implementation it doesnt mean that all other op's would do too
- # dont skip the operation, try different implementations
- if [ "$RET" -eq "10" ]; then
- exit $RET;
- fi
- fi
- for i in $(seq $IMPL_COUNT)
- do
- ./$1.test_r.ex $o $i $ITERATIONS 1>/dev/null 2>/dev/null
- RET=$?
- if [ "$RET" -ne "0" ]; then
- echo "ERROR;./$1.test_r.ex $o $i $ITERATIONS $RET"
- exit $RET;
- else
- STDOUT_FILE="res_std_"$1_$o"_"$i"_"$ITERATIONS".txt";
- STDERR_FILE="res_err_"$1_$o"_"$i"_"$ITERATIONS".txt";
-# Uncomment and use the following three lines if you would like to see the output from perf
-# echo "$STDOUT_FILE" > $STDOUT_FILE;
-# echo "$STDERR_FILE" > $STDERR_FILE;
-# $PERF_CMD ./$1.test_r.ex $o $i $ITERATIONS 1>>$STDOUT_FILE 2>>$STDERR_FILE;
- fi
- done
-done
--- /dev/null
+#
+# Copyright 2011-12 ARM Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Test executables: the same NE10_test.c is built once per enabled library
+# flavour. The public headers directory and the link dependencies are
+# attached to each target (PRIVATE) instead of to the whole directory, so
+# nothing leaks into other targets defined in or below this directory.
+
+# Dynamic flavour: links against the shared NE10_test library and libm.
+if(NE10_BUILD_SHARED)
+ add_executable(NE10_test_dynamic NE10_test.c)
+ target_include_directories(NE10_test_dynamic
+ PRIVATE
+ "${PROJECT_SOURCE_DIR}/inc"
+ )
+ target_link_libraries(NE10_test_dynamic
+ PRIVATE
+ NE10_test
+ m
+ )
+endif()
+
+# Static flavour: links against the static NE10 library and libm.
+if(NE10_BUILD_STATIC)
+ add_executable(NE10_test_static NE10_test.c)
+ target_include_directories(NE10_test_static
+ PRIVATE
+ "${PROJECT_SOURCE_DIR}/inc"
+ )
+ target_link_libraries(NE10_test_static
+ PRIVATE
+ NE10
+ m
+ )
+endif()
+
+
--- /dev/null
+/*
+ * Copyright 2011-12 ARM Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "NE10.h"
+#include "NE10_init.h"
+
+// This test code shows you how you can statically embed NE10 in your code
+
+/*
+ * Program entry point: announces and performs NE10 initialization via
+ * NE10_init(), reporting progress on stdout.
+ *
+ * Declared as `int main (void)` and returns 0 explicitly: `void main()` is
+ * not a standard signature and leaves the process exit status undefined.
+ * NOTE(review): the result of NE10_init() is ignored here, as before --
+ * consider checking it once its return type is confirmed.
+ */
+int main (void)
+{
+ printf ("Going to initialze NE10...\n");
+ NE10_init();
+ printf ("NE10 has been initialized.\n");
+ return 0;
+}
+
+++ /dev/null
-#
-# Copyright 2011-12 ARM Limited
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-
-# Define C files.
-set(NE10_C_SRCS
- NE10_abs.c
- NE10_addc.c
- NE10_add.c
- NE10_divc.c
- NE10_div.c
- NE10_len.c
- NE10_mlac.c
- NE10_mla.c
- NE10_mulc.c
- NE10_mul.c
- NE10_normalize.c
- NE10_rsbc.c
- NE10_setc.c
- NE10_subc.c
- NE10_sub.c
- NE10_dot.c
- NE10_cross.c
- NE10_addmat.c
- NE10_submat.c
- NE10_mulmat.c
- NE10_mulcmatvec.c
- NE10_detmat.c
- NE10_invmat.c
- NE10_transmat.c
- NE10_identitymat.c
-)
-
-# Define intrinsic NEON files.
-set(NE10_INTRINSIC_SRCS
- NE10_addc.neon.c
- NE10_divc.neon.c
- NE10_mlac.neon.c
- NE10_mulc.neon.c
- NE10_rsbc.neon.c
- NE10_setc.neon.c
- NE10_subc.neon.c
- NE10_addmat.neon.c
- NE10_submat.neon.c
-)
-
-# Tell CMake these files need to be compiled with "-mfpu=neon"
-foreach(intrinsic_file ${NE10_INTRINSIC_SRCS})
- set_source_files_properties(${intrinsic_file} PROPERTIES COMPILE_FLAGS "-mfpu=neon" )
-endforeach(intrinsic_file)
-
-# Define NEON files.
-set(NE10_NEON_SRCS
- NE10_abs.neon.s
- NE10_add.neon.s
- NE10_div.neon.s
- NE10_len.neon.s
- NE10_mla.neon.s
- NE10_mul.neon.s
- NE10_normalize.neon.s
- NE10_sub.neon.s
- NE10_dot.neon.s
- NE10_cross.neon.s
- NE10_mulmat.neon.s
- NE10_mulcmatvec.neon.s
- NE10_detmat.neon.s
- NE10_invmat.neon.s
- NE10_transmat.neon.s
- NE10_identitymat.neon.s
- NE10_detmat.neon.inc.s
-)
-
-# Tell CMake these files need to go to the C compiler
-set(FLAGS "-mfpu=neon -Wa,-I../../inc -Wa,-I../../headers -Wa,-I../../ -Wa,-I../../source" )
-foreach(neon_file ${NE10_NEON_SRCS})
- set_property (SOURCE ${neon_file} PROPERTY LANGUAGE C)
- set_source_files_properties(
- ${neon_file} PROPERTIES COMPILE_FLAGS
- ${FLAGS}
- )
-endforeach(neon_file)
-
-# Define init files.
-set(NE10_INIT_SRCS
- ../NE10_init.c
-)
-# Define test files.
-set(NE10_TEST_SRCS
- NE10_abs_test.c
- NE10_addc_test.c
- NE10_add_test.c
- NE10_divc_test.c
- NE10_div_test.c
- NE10_len_test.c
- NE10_mlac_test.c
- NE10_mla_test.c
- NE10_mulc_test.c
- NE10_mul_test.c
- NE10_normalize_test.c
- NE10_rsbc_test.c
- NE10_setc_test.c
- NE10_subc_test.c
- NE10_sub_test.c intrinsic_file
- NE10_dot_test.c
- NE10_cross_test.c
- NE10_addmat_test.c
- NE10_submat_test.c
- NE10_mulmat_test.c
- NE10_mulcmatvec_test.c
- NE10_detmat_test.c
- NE10_invmat_test.c
- NE10_transmat_test.c
- NE10_identitymat_test.c
-)
-
-include_directories (
- ../inc
- ../headers
- ../
- ../source
-)
-
-if(NE10_BUILD_STATIC)
- add_library( NE10 STATIC
- ${NE10_C_SRCS}
- ${NE10_INTRINSIC_SRCS}
- ${NE10_NEON_SRCS}
- ${NE10_INIT_SRCS}
- )
- set_target_properties(NE10 PROPERTIES
- CLEAN_DIRECT_OUTPUT 1
- VERSION ${NE10_VERSION}
- )
-endif()
-
-if(NE10_BUILD_SHARED)
-
- add_library( NE10_shared SHARED
- ${NE10_C_SRCS}
- ${NE10_INTRINSIC_SRCS}
- ${NE10_NEON_SRCS}
- ${NE10_INIT_SRCS}
- )
-
- set_target_properties(NE10_shared PROPERTIES
- OUTPUT_NAME "NE10"
- CLEAN_DIRECT_OUTPUT 1
- VERSION ${NE10_VERSION}
- )
-
- add_library( NE10_test SHARED
- ${NE10_C_SRCS}
- ${NE10_INTRINSIC_SRCS}
- ${NE10_NEON_SRCS}
- ${NE10_INIT_SRCS}
- )
-
- set_target_properties(NE10_test PROPERTIES
- OUTPUT_NAME "NE10_test"
- CLEAN_DIRECT_OUTPUT 1
- VERSION ${NE10_VERSION}
- )
-
-endif()
-
-# install libraries
-#if(NE10_BUILD_SHARED)
-# install(TARGETS NE10_shared DESTINATION lib)
-#endif()
-#if(NE10_BUILD_STATIC)
-# install(TARGETS NE10 DESTINATION lib)
-#endif()
+++ /dev/null
-@
-@ Copyright 2011-12 ARM Limited
-@
-@ Licensed under the Apache License, Version 2.0 (the "License");
-@ you may not use this file except in compliance with the License.
-@ You may obtain a copy of the License at
-@
-@ http://www.apache.org/licenses/LICENSE-2.0
-@
-@ Unless required by applicable law or agreed to in writing, software
-@ distributed under the License is distributed on an "AS IS" BASIS,
-@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@ See the License for the specific language governing permissions and
-@ limitations under the License.
-@
-
-@
-@ NE10 Library : source/NE10_abs.asm.s
-@
-
- .text
- .syntax unified
-
-.include "headers/NE10header.s"
-
- .balign 4
- .global abs_float_asm
- .thumb
- .thumb_func
-
-abs_float_asm:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t abs_float(arm_float_t * dst,
- @ arm_float_t * src,
- @ unsigned int count)
- @
- @ r0: *dst
- @ r1: *src
- @ r2: int count
- @
- @ r2: loop counter
- @
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- cbz r2, .LoopEndFloat
- mov r3, #0
- vmov s2, r3
-
-.LoopBeginFloat:
- vldr s1, [r1] @ Load s1 = src[i]
- add r1, r1, #4 @ move to the next item
- vabs.f32 s1, s1 @ get the absolute value; s1 = abs(s1 - 0)
- vstr s1, [r0] @ Store it back into the main memory; dst[i] = s1
- add r0, r0, #4 @ move to the next entry
- subs r2, r2, #1 @ count down using the current index (i--)
- bne .LoopBeginFloat @ Continue if "i < count"
-
-.LoopEndFloat:
- mov r0, NE10_OK @ Return NE10_OK
- bx lr
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : source/NE10_abs.c
- */
-
-#include "NE10.h"
-#include "../headers/macros.h"
-
-#include <assert.h>
-
-#include <math.h>
-
-arm_result_t abs_float_c(arm_float_t * dst, arm_float_t * src, unsigned int count)
-{
- NE10_ABS_OPERATION_X_C
- (
- dst[itr] = fabs( src[itr] );
- );
-}
-
-arm_result_t abs_vec2f_c(arm_vec2f_t * dst, arm_vec2f_t * src, unsigned int count)
-{
- NE10_ABS_OPERATION_X_C
- (
- dst[ itr ].x = fabs( src[ itr ].x );
- dst[ itr ].y = fabs( src[ itr ].y );
- );
-}
-
-arm_result_t abs_vec3f_c(arm_vec3f_t * dst, arm_vec3f_t * src, unsigned int count)
-{
- NE10_ABS_OPERATION_X_C
- (
- dst[ itr ].x = fabs( src[ itr ].x );
- dst[ itr ].y = fabs( src[ itr ].y );
- dst[ itr ].z = fabs( src[ itr ].z );
- );
-}
-
-arm_result_t abs_vec4f_c(arm_vec4f_t * dst, arm_vec4f_t * src, unsigned int count)
-{
- NE10_ABS_OPERATION_X_C
- (
- dst[ itr ].x = fabs( src[ itr ].x );
- dst[ itr ].y = fabs( src[ itr ].y );
- dst[ itr ].z = fabs( src[ itr ].z );
- dst[ itr ].w = fabs( src[ itr ].w );
- );
-}
-
+++ /dev/null
-@
-@ Copyright 2011-12 ARM Limited
-@
-@ Licensed under the Apache License, Version 2.0 (the "License");
-@ you may not use this file except in compliance with the License.
-@ You may obtain a copy of the License at
-@
-@ http://www.apache.org/licenses/LICENSE-2.0
-@
-@ Unless required by applicable law or agreed to in writing, software
-@ distributed under the License is distributed on an "AS IS" BASIS,
-@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@ See the License for the specific language governing permissions and
-@ limitations under the License.
-@
-
-@
-@ NE10 Library : source/NE10_abs.neon.s
-@
-
- .text
- .syntax unified
-
-.include "headers/NE10header.s"
-
-
-
-
- .align 4
- .global abs_float_neon
- .thumb
- .thumb_func
-
-abs_float_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t abs_float(arm_float_t * dst,
- @ arm_float_t * src,
- @ unsigned int count);
- @
- @ r0: *dst & the current dst entry's address
- @ r1: *src & current src entry's address
- @ r2: int count & the number of items in the input array that can be
- @ processed in chunks of 4 vectors
- @ r3: the number of items that are residual that will be processed at the begin of
- @ the input array
- @
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- and r3, r2, #3 @ r3 = count % 4; calculate the residual loop
- asr r2, r2, #2 @ r2 = count >> 2; calculate the main loop
-
- cbz r3, .L_check_mainloop_float
-
-.L_residualloop_float:
- @ process the residual items in the input array
- vld1.f32 d0[0], [r1]! @ Fill in d0 = { V.x, 0 };
-
- subs r3, r3, #1
-
- @ absolute values
- vabs.f32 d0, d0
-
- vst1.32 {d0[0]}, [r0]!
-
- bgt .L_residualloop_float
-
-.L_check_mainloop_float:
- cbz r2, .L_return_float
-
- @ load the current set of values
- vld1.32 {q0}, [r1]! @ for current set
-
-.L_mainloop_float:
- @ absolute values of the current set
- vabs.f32 q3, q0 @ q3 = abs( q0 )
-
- @ store the result for the current set
- vst1.32 {d6,d7}, [r0]!
-
- subs r2, r2, #1
-
- @ load the next set
- vld1.32 {q0}, [r1]!
-
- bgt .L_mainloop_float @ loop if r2 > 0, if we have another 4 floats
-
-.L_return_float:
- @ return
- mov r0, #0
- bx lr
-
-
-
-
- .align 4
- .global abs_vec2f_neon
- .thumb
- .thumb_func
-
-abs_vec2f_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t abs_vec2f(arm_vec2f_t * dst,
- @ arm_vec2f_t * src,
- @ unsigned int count);
- @
- @ Writes the component-wise absolute value of src[i] to dst[i] for
- @ i in [0, count) and returns 0.
- @
- @ r0: *dst & the current dst entry's address
- @ r1: *src & current src entry's address
- @ r2: int count & the number of 4-vector chunks handled by the main loop
- @ r3: the number of residual items (count % 4); they are processed first,
- @ at the beginning of the input array
- @
- @ Only q0-q3 are used: q4-q7 (d8-d15) are callee-saved under the AAPCS
- @ and this routine does not save any VFP registers.
- @ Every chunk is loaded at the top of the main loop, so nothing beyond
- @ the last src entry is ever read.
- @
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- and r3, r2, #3 @ r3 = count % 4; calculate the residual loop
- asr r2, r2, #2 @ r2 = count >> 2; calculate the main loop
-
- cbz r3, .L_check_mainloop_vec2
-
-.L_residualloop_vec2:
- @ process the residual items in the input array, one vec2 at a time
- vld1.f32 d0, [r1]! @ Fill in d0 = { V.x, V.y };
-
- subs r3, r3, #1
-
- @ absolute values (NEON instructions leave the flags set by subs intact)
- vabs.f32 d0, d0
-
- vst1.32 {d0}, [r0]!
-
- bgt .L_residualloop_vec2
-
-.L_check_mainloop_vec2:
- cbz r2, .L_return_vec2
-
-.L_mainloop_vec2:
- @ load the current set of values, de-interleaved:
- @ q0 = four x components, q1 = four y components
- vld2.32 {q0-q1}, [r1]!
-
- subs r2, r2, #1
-
- @ absolute values of the current set
- vabs.f32 q2, q0 @ q2 = abs( q0 )
- vabs.f32 q3, q1 @ q3 = abs( q1 )
-
- @ store the result for the current set, re-interleaved
- vst2.32 {d4,d5,d6,d7}, [r0]!
-
- bgt .L_mainloop_vec2 @ loop if r2 > 0, if we have another 4 vec2s
-
-.L_return_vec2:
- @ return
- mov r0, #0
- bx lr
-
-
- .align 4
- .global abs_vec3f_neon
- .thumb
- .thumb_func
-abs_vec3f_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t abs_vec3f(arm_vec3f_t * dst,
- @ arm_vec3f_t * src,
- @ unsigned int count);
- @
- @ Writes the component-wise absolute value of src[i] to dst[i] for
- @ i in [0, count) and returns 0.
- @
- @ r0: *dst & the current dst entry's address
- @ r1: *src & current src entry's address
- @ r2: int count & the number of 4-vector chunks handled by the main loop
- @ r3: the number of residual items (count % 4); they are processed first,
- @ at the beginning of the input array
- @
- @ Results are built in q8-q10: q4-q7 (d8-d15) are callee-saved under the
- @ AAPCS and this routine does not save any VFP registers.
- @ Every chunk is loaded at the top of the main loop, so nothing beyond
- @ the last src entry is ever read.
- @
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- and r3, r2, #3 @ r3 = count % 4; calculate the residual loop
- asr r2, r2, #2 @ r2 = count >> 2; calculate the main loop
-
- cbz r3, .L_check_mainloop_vec3
-
-.L_residualloop_vec3:
- @ process the residual items in the input array, one vec3 at a time
- vld3.f32 {d0[0], d2[0], d4[0]}, [r1]! @ The values are loaded like so:
- @ q0 = { V.x, -, -, - };
- @ q1 = { V.y, -, -, - };
- @ q2 = { V.z, -, -, - };
- subs r3, r3, #1
-
- @ absolute values (NEON instructions leave the flags set by subs intact)
- vabs.f32 d0, d0
- vabs.f32 d2, d2
- vabs.f32 d4, d4
-
- vst3.32 {d0[0], d2[0], d4[0]}, [r0]!
-
- bgt .L_residualloop_vec3
-
-.L_check_mainloop_vec3:
- cbz r2, .L_return_vec3
-
-.L_mainloop_vec3:
- @ load the current set of values, de-interleaved:
- @ q0 = four x components, q1 = four y, q2 = four z
- vld3.32 {d0, d2, d4}, [r1]!
- vld3.32 {d1, d3, d5}, [r1]!
-
- subs r2, r2, #1
-
- @ absolute values of the current set
- vabs.f32 q8, q0
- vabs.f32 q9, q1
- vabs.f32 q10, q2
-
- @ store the result for the current set, re-interleaved
- vst3.32 {d16, d18, d20}, [r0]!
- vst3.32 {d17, d19, d21}, [r0]!
-
- bgt .L_mainloop_vec3 @ loop if r2 > 0, if we have another 4 vec3s
-
-.L_return_vec3:
- @ return
- mov r0, #0
- bx lr
-
-
-
-
- .align 4
- .global abs_vec4f_neon
- .thumb
- .thumb_func
-abs_vec4f_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t abs_vec4f(arm_vec4f_t * dst,
- @ arm_vec4f_t * src,
- @ unsigned int count);
- @
- @ Writes the component-wise absolute value of src[i] to dst[i] for
- @ i in [0, count) and returns 0.
- @
- @ r0: *dst & the current dst entry's address
- @ r1: *src & current src entry's address
- @ r2: int count & the number of 4-vector chunks handled by the main loop
- @ r3: the number of residual items (count % 4); they are processed first,
- @ at the beginning of the input array
- @
- @ Every chunk is loaded at the top of the main loop, so nothing beyond
- @ the last src entry is ever read.
- @
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- and r3, r2, #3 @ r3 = count % 4; calculate the residual loop
- asr r2, r2, #2 @ r2 = count >> 2; calculate the main loop
-
- cbz r3, .L_check_mainloop_vec4
-
-.L_residualloop_vec4:
- @ process the residual items in the input array, one vec4 at a time
- vld1.f32 {d0, d1}, [r1]! @ The values are loaded like so:
- @ q0 = { V.x, V.y, V.z, V.w };
- subs r3, r3, #1
-
- @ absolute values (NEON instructions leave the flags set by subs intact)
- vabs.f32 q0, q0
-
- vst1.32 {d0, d1}, [r0]!
-
- bgt .L_residualloop_vec4
-
-.L_check_mainloop_vec4:
- cbz r2, .L_return_vec4
-
-.L_mainloop_vec4:
- @ load the current set of values, de-interleaved:
- @ q0 = four x components, q1 = four y, q2 = four z, q3 = four w
- vld4.32 {d0, d2, d4, d6}, [r1]!
- vld4.32 {d1, d3, d5, d7}, [r1]!
-
- subs r2, r2, #1
-
- @ absolute values of the current set
- vabs.f32 q10, q0
- vabs.f32 q11, q1
- vabs.f32 q12, q2
- vabs.f32 q13, q3
-
- @ store the result for the current set, re-interleaved
- vst4.32 {d20, d22, d24, d26}, [r0]!
- vst4.32 {d21, d23, d25, d27}, [r0]!
-
- bgt .L_mainloop_vec4 @ loop if r2 > 0, if we have another 4 vec4s
-
-.L_return_vec4:
- @ return
- mov r0, #0
- bx lr
-
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : source/NE10_abs_test.c
- */
-
-//Make sure the following are defined before including "unit_test.h"
-
-// length of the data arrays
-#define ARRLEN TEST_ARRLEN
-// number of the operations in a given unit
-#define OP_COUNT 4
-// number of the different implementations of each of the functions (C, ASM, NEON, ...)
-#define IMPL_COUNT 3
-
-
-#include "../headers/unit_test_abs_operation_x.h"
-
-extern arm_result_t abs_float_c (arm_float_t * dst, arm_float_t * src, unsigned int count);
-//extern arm_result_t abs_float_asm (arm_float_t * dst, arm_float_t * src, unsigned int count); // the assembly versions haven't been implemented; these are for future use
-extern arm_result_t abs_float_neon(arm_float_t * dst, arm_float_t * src, unsigned int count);
-
-extern arm_result_t abs_vec2f_c (arm_vec2f_t * dst, arm_vec2f_t * src, unsigned int count);
-//extern arm_result_t abs_vec2f_asm (arm_vec2f_t * dst, arm_vec2f_t * src, unsigned int count); // the assembly versions haven't been implemented; these are for future use
-extern arm_result_t abs_vec2f_neon(arm_vec2f_t * dst, arm_vec2f_t * src, unsigned int count);
-
-extern arm_result_t abs_vec3f_c (arm_vec3f_t * dst, arm_vec3f_t * src, unsigned int count);
-//extern arm_result_t abs_vec3f_asm (arm_vec3f_t * dst, arm_vec3f_t * src, unsigned int count);
-extern arm_result_t abs_vec3f_neon(arm_vec3f_t * dst, arm_vec3f_t * src, unsigned int count);
-
-extern arm_result_t abs_vec4f_c (arm_vec4f_t * dst, arm_vec4f_t * src, unsigned int count);
-//extern arm_result_t abs_vec4f_asm (arm_vec4f_t * dst, arm_vec4f_t * src, unsigned int count);
-extern arm_result_t abs_vec4f_neon(arm_vec4f_t * dst, arm_vec4f_t * src, unsigned int count);
-
-void init_ftbl()
-{
- // Fill the global function table from a local list laid out as
- // { C, ASM slot, NEON } per operation. The ASM slot reuses the C
- // version because no assembly implementation is declared for abs.
- const arm_func_3args_t impls[] =
- {
- (arm_func_3args_t) abs_float_c, (arm_func_3args_t) abs_float_c, (arm_func_3args_t) abs_float_neon,
- (arm_func_3args_t) abs_vec2f_c, (arm_func_3args_t) abs_vec2f_c, (arm_func_3args_t) abs_vec2f_neon,
- (arm_func_3args_t) abs_vec3f_c, (arm_func_3args_t) abs_vec3f_c, (arm_func_3args_t) abs_vec3f_neon,
- (arm_func_3args_t) abs_vec4f_c, (arm_func_3args_t) abs_vec4f_c, (arm_func_3args_t) abs_vec4f_neon
- };
- unsigned int slot;
-
- for ( slot = 0; slot < sizeof(impls) / sizeof(impls[0]); slot++ )
- {
- ftbl[ slot ] = impls[ slot ];
- }
-}
-
-arm_result_t main( int argc, char **argv )
-{
- // Hand the command line to the shared test driver (run_test comes from
- // "unit_test.h") and return its result as the exit status.
- const arm_result_t outcome = run_test( argc, argv );
- return outcome;
-}
+++ /dev/null
-@
-@ Copyright 2011-12 ARM Limited
-@
-@ Licensed under the Apache License, Version 2.0 (the "License");
-@ you may not use this file except in compliance with the License.
-@ You may obtain a copy of the License at
-@
-@ http://www.apache.org/licenses/LICENSE-2.0
-@
-@ Unless required by applicable law or agreed to in writing, software
-@ distributed under the License is distributed on an "AS IS" BASIS,
-@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@ See the License for the specific language governing permissions and
-@ limitations under the License.
-@
-
-@
-@ NE10 Library : source/NE10_add.asm.s
-@
-
- .text
- .syntax unified
-
-.include "headers/NE10header.s"
-
- .balign 4
- .global add_float_asm
- .thumb
- .thumb_func
-
-add_float_asm:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t add_float(arm_float_t * dst,
- @ arm_float_t * src1, const arm_float_t * src2,
- @ unsigned int count)
- @
- @ Stores src1[i] + src2[i] into dst[i], one float per iteration, for
- @ count items, then returns NE10_OK.
- @
- @ r0: *dst & current dst entry's address (advanced by 4 per iteration)
- @ r1: *src1 & current src1 entry's address (advanced by 4 per iteration)
- @ r2: *src2 & current src2 entry's address (advanced by 4 per iteration)
- @ r3: int count
- @
- @ r3: loop counter, counted down to zero
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- cbz r3, .LoopEndFloat @ nothing to do when count == 0
-
-.LoopBeginFloat:
- vldr s1, [r1] @ Load s1 = src1[i]
- add r1, r1, #4 @ move to the next entry
- vldr s2, [r2] @ Load s2 = src2[i]
- add r2, r2, #4 @ next entry
- vadd.f32 s10, s1, s2 @ s10 = src1[i] + src2[i]
- vstr s10, [r0] @ Store the result back into the main memory
- add r0, r0, #4 @ next entry in the dst
- subs r3, r3, #1 @ count down the remaining items
- bne .LoopBeginFloat @ Continue while items remain
-
-.LoopEndFloat:
- mov r0, NE10_OK @ Return NE10_OK
- bx lr
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : source/NE10_add.c
- */
-
-#include "NE10.h"
-#include "../headers/macros.h"
-
-#include <assert.h>
-
-/*
- * Element-wise sum of two float arrays: the statement handed to
- * NE10_X_OPERATION_FLOAT_C assigns src1[itr] + src2[itr] to dst[itr].
- * NOTE(review): the iteration over `itr`, any argument checking and the
- * returned arm_result_t come from the macro in ../headers/macros.h, which is
- * not visible here - confirm there.
- */
-arm_result_t add_float_c(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count)
-{
- NE10_X_OPERATION_FLOAT_C
- (
- dst[ itr ] = src1[ itr ] + src2[ itr ];
- );
-}
-
-/*
- * Component-wise sum of two arrays of 2D vectors: the statements handed to
- * NE10_X_OPERATION_FLOAT_C add the x and y members of src1[itr] and src2[itr]
- * into dst[itr].
- * NOTE(review): the iteration over `itr`, any argument checking and the
- * returned arm_result_t come from the macro in ../headers/macros.h, which is
- * not visible here - confirm there.
- */
-arm_result_t add_vec2f_c(arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count)
-{
- NE10_X_OPERATION_FLOAT_C
- (
- dst[ itr ].x = src1[ itr ].x + src2[ itr ].x;
- dst[ itr ].y = src1[ itr ].y + src2[ itr ].y;
- );
-}
-
-/*
- * Component-wise sum of two arrays of 3D vectors: the statements handed to
- * NE10_X_OPERATION_FLOAT_C add the x, y and z members of src1[itr] and
- * src2[itr] into dst[itr].
- * NOTE(review): the iteration over `itr`, any argument checking and the
- * returned arm_result_t come from the macro in ../headers/macros.h, which is
- * not visible here - confirm there.
- */
-arm_result_t add_vec3f_c(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count)
-{
- NE10_X_OPERATION_FLOAT_C
- (
- dst[ itr ].x = src1[ itr ].x + src2[ itr ].x;
- dst[ itr ].y = src1[ itr ].y + src2[ itr ].y;
- dst[ itr ].z = src1[ itr ].z + src2[ itr ].z;
- );
-}
-
-/*
- * Component-wise sum of two arrays of 4D vectors: the statements handed to
- * NE10_X_OPERATION_FLOAT_C add the x, y, z and w members of src1[itr] and
- * src2[itr] into dst[itr].
- * NOTE(review): the iteration over `itr`, any argument checking and the
- * returned arm_result_t come from the macro in ../headers/macros.h, which is
- * not visible here - confirm there.
- */
-arm_result_t add_vec4f_c(arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count)
-{
- NE10_X_OPERATION_FLOAT_C
- (
- dst[ itr ].x = src1[ itr ].x + src2[ itr ].x;
- dst[ itr ].y = src1[ itr ].y + src2[ itr ].y;
- dst[ itr ].z = src1[ itr ].z + src2[ itr ].z;
- dst[ itr ].w = src1[ itr ].w + src2[ itr ].w;
- );
-}
+++ /dev/null
-@
-@ Copyright 2011-12 ARM Limited
-@
-@ Licensed under the Apache License, Version 2.0 (the "License");
-@ you may not use this file except in compliance with the License.
-@ You may obtain a copy of the License at
-@
-@ http://www.apache.org/licenses/LICENSE-2.0
-@
-@ Unless required by applicable law or agreed to in writing, software
-@ distributed under the License is distributed on an "AS IS" BASIS,
-@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@ See the License for the specific language governing permissions and
-@ limitations under the License.
-@
-
-@
-@ NE10 Library : source/NE10_add.neon.s
-@
-
- .text
- .syntax unified
-
-.include "headers/NE10header.s"
-
-
-
-
- .align 4
- .global add_float_neon
- .thumb
- .thumb_func
-
-add_float_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t add_float(arm_float_t * dst,
- @ arm_float_t * src1,
- @ arm_float_t * src2,
- @ unsigned int count)
- @
- @ Writes dst[i] = src1[i] + src2[i] for i in [0, count) and returns 0.
- @
- @ r0: *dst & current dst entry's address
- @ r1: *src1 & current src1 entry's address
- @ r2: *src2 & current src2 entry's address
- @ r3: int count & the number of 4-float chunks handled by the main loop
- @
- @ r4: the number of residual items (count % 4); they are processed first,
- @ at the beginning of the input array
- @
- @ Every chunk is loaded at the top of the main loop, so nothing beyond
- @ the last entry of src1 or src2 is ever read.
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- push {r4}
- and r4, r3, #3 @ r4 = count % 4; calculate the residual loop
- asr r3, r3, #2 @ r3 = count >> 2; calculate the main loop
-
- cbz r4, .L_check_mainloop_float
-
-.L_residualloop_float:
- @ process the residual items in the input array, one float at a time
- vld1.f32 d0[0], [r1]! @ Fill in d0[0]
- vld1.f32 d1[0], [r2]! @ Fill in d1[0]
-
- subs r4, r4, #1
-
- @ sum (NEON instructions leave the flags set by subs intact)
- vadd.f32 d0, d0, d1
-
- vst1.32 {d0[0]}, [r0]!
-
- bgt .L_residualloop_float
-
-.L_check_mainloop_float:
- cbz r3, .L_return_float
-
-.L_mainloop_float:
- @ load the current set of values
- vld1.32 {q0}, [r1]!
- vld1.32 {q1}, [r2]!
-
- subs r3, r3, #1
-
- @ calculate values for current set
- vadd.f32 q3, q0, q1 @ q3 = q0 + q1
-
- @ store the result for current set
- vst1.32 {d6,d7}, [r0]!
-
- bgt .L_mainloop_float @ loop if r3 > 0, if we have at least another 4 floats
-
-.L_return_float:
- @ return
- pop {r4}
- mov r0, #0
- bx lr
-
-
-
-
- .align 4
- .global add_vec2f_neon
- .thumb
- .thumb_func
-
-add_vec2f_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t add_vec2f(arm_vec2f_t * dst,
- @ arm_vec2f_t * src1,
- @ arm_vec2f_t * src2,
- @ unsigned int count)
- @
- @ Writes the component-wise sum src1[i] + src2[i] to dst[i] for
- @ i in [0, count) and returns 0.
- @
- @ r0: *dst & current dst entry's address
- @ r1: *src1 & current src1 entry's address
- @ r2: *src2 & current src2 entry's address
- @ r3: int count & the number of 4-vector chunks handled by the main loop
- @
- @ r4: the number of residual items (count % 4); they are processed first,
- @ at the beginning of the input array
- @
- @ Every chunk is loaded at the top of the main loop, so nothing beyond
- @ the last entry of src1 or src2 is ever read.
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- push {r4}
- and r4, r3, #3 @ r4 = count % 4; calculate the residual loop
- asr r3, r3, #2 @ r3 = count >> 2; calculate the main loop
-
- cbz r4, .L_check_mainloop_vec2
-
-.L_residualloop_vec2:
- @ process the residual items in the input array, one vec2 at a time
- vld1.f32 d0, [r1]! @ d0 = { V1.x, V1.y }
- vld1.f32 d1, [r2]! @ d1 = { V2.x, V2.y }
-
- subs r4, r4, #1
-
- @ calculate values (NEON instructions leave the flags set by subs intact)
- vadd.f32 d0, d0, d1
-
- vst1.32 {d0}, [r0]!
- bgt .L_residualloop_vec2
-
-.L_check_mainloop_vec2:
- cbz r3, .L_return_vec2
-
-.L_mainloop_vec2:
- @ load the current set of values, de-interleaved:
- @ q0/q2 = four x components, q1/q3 = four y components
- vld2.32 {q0-q1}, [r1]!
- vld2.32 {q2-q3}, [r2]!
-
- subs r3, r3, #1
-
- @ calculate values for current set
- vadd.f32 q8, q0, q2
- vadd.f32 q9, q1, q3
-
- @ store the result for current set, re-interleaved
- vst2.32 {d16,d17,d18,d19}, [r0]!
-
- bgt .L_mainloop_vec2 @ loop if r3 > 0, if we have at least another 4 vectors (8 floats) to process
-
-.L_return_vec2:
- @ return
- pop {r4}
- mov r0, #0
- bx lr
-
-
-
-
- .align 4
- .global add_vec3f_neon
- .thumb
- .thumb_func
-add_vec3f_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t add_vec3f(arm_vec3f_t * dst,
- @ arm_vec3f_t * src1,
- @ arm_vec3f_t * src2,
- @ unsigned int count)
- @
- @ Writes the component-wise sum src1[i] + src2[i] to dst[i] for
- @ i in [0, count) and returns 0.
- @
- @ r0: *dst & current dst entry's address
- @ r1: *src1 & current src1 entry's address
- @ r2: *src2 & current src2 entry's address
- @ r3: int count & the number of 4-vector chunks handled by the main loop
- @
- @ r4: the number of residual items (count % 4); they are processed first,
- @ at the beginning of the input array
- @
- @ Every chunk is loaded at the top of the main loop, so nothing beyond
- @ the last entry of src1 or src2 is ever read.
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- push {r4}
- and r4, r3, #3 @ r4 = count % 4; calculate the residual loop
- asr r3, r3, #2 @ r3 = count >> 2; calculate the main loop
-
- cbz r4, .L_check_mainloop_vec3
-
-.L_residualloop_vec3:
- @ process the residual items in the input array, one vec3 at a time
- vld3.f32 {d0[0], d2[0], d4[0]}, [r1]! @ The values are loaded like so:
- @ q0 = { V1.x, -, -, - };
- @ q1 = { V1.y, -, -, - };
- @ q2 = { V1.z, -, -, - };
- vld3.f32 {d1[0], d3[0], d5[0]}, [r2]! @ The values are loaded like so:
- @ q0 = { V1.x, -, V2.x, - };
- @ q1 = { V1.y, -, V2.y, - };
- @ q2 = { V1.z, -, V2.z, - };
-
- subs r4, r4, #1
-
- @ calculate values (NEON instructions leave the flags set by subs intact)
- vadd.f32 d0, d0, d1
- vadd.f32 d2, d2, d3
- vadd.f32 d4, d4, d5
-
- vst3.32 {d0[0], d2[0], d4[0]}, [r0]!
-
- bgt .L_residualloop_vec3
-
-.L_check_mainloop_vec3:
- cbz r3, .L_return_vec3
-
-.L_mainloop_vec3:
- @ load current set of values, de-interleaved:
- @ q0/q9 = four x components, q1/q10 = four y, q2/q11 = four z
- vld3.32 {d0, d2, d4}, [r1]!
- vld3.32 {d1, d3, d5}, [r1]!
- vld3.32 {d18, d20, d22}, [r2]!
- vld3.32 {d19, d21, d23}, [r2]!
-
- subs r3, r3, #1
-
- @ calculate values for current set
- vadd.f32 q12, q0, q9
- vadd.f32 q13, q1, q10
- vadd.f32 q14, q2, q11
-
- @ store the result for current set, re-interleaved
- vst3.32 {d24, d26, d28}, [r0]!
- vst3.32 {d25, d27, d29}, [r0]!
-
- bgt .L_mainloop_vec3 @ loop if r3 > 0, if we have at least another 4 vectors (12 floats) to process
-
-.L_return_vec3:
- @ return
- pop {r4}
- mov r0, #0
- bx lr
-
-
-
-
- .align 4
- .global add_vec4f_neon
- .thumb
- .thumb_func
-add_vec4f_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t add_vec4f(arm_vec4f_t * dst,
- @ arm_vec4f_t * src1,
- @ arm_vec4f_t * src2,
- @ unsigned int count)
- @
- @ Writes the component-wise sum src1[i] + src2[i] to dst[i] for
- @ i in [0, count) and returns 0.
- @
- @ r0: *dst & current dst entry's address
- @ r1: *src1 & current src1 entry's address
- @ r2: *src2 & current src2 entry's address
- @ r3: int count & the number of 4-vector chunks handled by the main loop
- @
- @ r4: the number of residual items (count % 4); they are processed first,
- @ at the beginning of the input array
- @
- @ Every chunk is loaded at the top of the main loop, so nothing beyond
- @ the last entry of src1 or src2 is ever read.
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- push {r4}
- and r4, r3, #3 @ r4 = count % 4; calculate the residual loop
- asr r3, r3, #2 @ r3 = count >> 2; calculate the main loop
-
- cbz r4, .L_check_mainloop_vec4
-
-.L_residualloop_vec4:
- @ process the residual items in the input array, one vec4 at a time
- vld1.f32 {d0, d1}, [r1]! @ The values are loaded like so:
- @ q0 = { V1.x, V1.y, V1.z, V1.w };
- vld1.f32 {d2, d3}, [r2]! @ The values are loaded like so:
- @ q1 = { V2.x, V2.y, V2.z, V2.w };
-
- subs r4, r4, #1
-
- @ calculate values (NEON instructions leave the flags set by subs intact)
- vadd.f32 q0, q0, q1
-
- vst1.32 {d0, d1}, [r0]!
-
- bgt .L_residualloop_vec4
-
-.L_check_mainloop_vec4:
- cbz r3, .L_return_vec4
-
-.L_mainloop_vec4:
- @ load the current set of values, de-interleaved:
- @ q0/q8 = four x, q1/q9 = four y, q2/q10 = four z, q3/q11 = four w
- vld4.32 {d0, d2, d4, d6}, [r1]!
- vld4.32 {d1, d3, d5, d7}, [r1]!
- vld4.32 {d16, d18, d20, d22}, [r2]!
- vld4.32 {d17, d19, d21, d23}, [r2]!
-
- subs r3, r3, #1
-
- @ calculate values for the current set
- vadd.f32 q12, q0, q8
- vadd.f32 q13, q1, q9
- vadd.f32 q14, q2, q10
- vadd.f32 q15, q3, q11
-
- @ store the result for the current set, re-interleaved
- vst4.32 {d24, d26, d28, d30}, [r0]!
- vst4.32 {d25, d27, d29, d31}, [r0]!
-
- bgt .L_mainloop_vec4 @ loop if r3 > 0, if we have at least another 4 vectors (16 floats) to process
-
-.L_return_vec4:
- @ return
- pop {r4}
- mov r0, #0
- bx lr
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : source/NE10_add_test.c
- */
-
-//Make sure the following are defined before including "unit_test.h"
-
-// length of the data arrays
-#define ARRLEN TEST_ARRLEN
-// number of the operations in a given unit
-#define OP_COUNT 4
-// number of the different implementations of each of the functions (C, ASM, NEON, ...)
-#define IMPL_COUNT 3
-
-
-#include "../headers/unit_test_x_operation_x.h"
-
-extern arm_result_t add_float_c (arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count);
-//extern arm_result_t add_float_asm (arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count); // the assembly versions haven't been implemented; these are for future use
-extern arm_result_t add_float_neon(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count);
-
-extern arm_result_t add_vec2f_c (arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
-//extern arm_result_t add_vec2f_asm (arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
-extern arm_result_t add_vec2f_neon(arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
-
-extern arm_result_t add_vec3f_c (arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
-//extern arm_result_t add_vec3f_asm (arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
-extern arm_result_t add_vec3f_neon(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
-
-extern arm_result_t add_vec4f_c (arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
-//extern arm_result_t add_vec4f_asm (arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
-extern arm_result_t add_vec4f_neon(arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
-
-void init_ftbl()
-{
- // Fill the global function table from a local list laid out as
- // { C, ASM slot, NEON } per operation. The ASM slot reuses the C
- // version because no assembly implementation is declared for add.
- const arm_func_4args_t impls[] =
- {
- (arm_func_4args_t) add_float_c, (arm_func_4args_t) add_float_c, (arm_func_4args_t) add_float_neon,
- (arm_func_4args_t) add_vec2f_c, (arm_func_4args_t) add_vec2f_c, (arm_func_4args_t) add_vec2f_neon,
- (arm_func_4args_t) add_vec3f_c, (arm_func_4args_t) add_vec3f_c, (arm_func_4args_t) add_vec3f_neon,
- (arm_func_4args_t) add_vec4f_c, (arm_func_4args_t) add_vec4f_c, (arm_func_4args_t) add_vec4f_neon
- };
- unsigned int slot;
-
- for ( slot = 0; slot < sizeof(impls) / sizeof(impls[0]); slot++ )
- {
- ftbl[ slot ] = impls[ slot ];
- }
-}
-
-arm_result_t main( int argc, char **argv )
-{
- // Hand the command line to the shared test driver (run_test comes from
- // "unit_test.h") and return its result as the exit status.
- const arm_result_t outcome = run_test( argc, argv );
- return outcome;
-}
+++ /dev/null
-@
-@ Copyright 2011-12 ARM Limited
-@
-@ Licensed under the Apache License, Version 2.0 (the "License");
-@ you may not use this file except in compliance with the License.
-@ You may obtain a copy of the License at
-@
-@ http://www.apache.org/licenses/LICENSE-2.0
-@
-@ Unless required by applicable law or agreed to in writing, software
-@ distributed under the License is distributed on an "AS IS" BASIS,
-@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@ See the License for the specific language governing permissions and
-@ limitations under the License.
-@
-
-@
-@ NE10 Library : source/NE10_addc.asm.s
-@
-
- .text
- .syntax unified
-
-.include "headers/NE10header.s"
-
- .balign 4
- .global addc_float_asm
- .thumb
- .thumb_func
-
-addc_float_asm:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t addc_float(arm_float_t * dst,
- @ arm_float_t * src, const arm_float_t cst,
- @ unsigned int count)
- @
- @ Stores src[i] + cst into dst[i] for count items and returns NE10_OK.
- @
- @ r0: *dst
- @ r1: *src
- @ r2: cst (the float's bit pattern; copied into s3 with vmov)
- @ NOTE(review): this relies on cst arriving in a core register,
- @ i.e. soft-float argument passing - confirm against the build flags
- @ r3: int count
- @
- @ r3: loop counter
- @ r5: current item's offset in both src[] and dst[]
- @ r6: current source item's address made of base(r1)+offset(r5)
- @ r7: current destination item's address made of base(r0)+offset(r5)
- @
- @ r4 is saved and restored together with r5-r7 but is not used here.
- @
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- push {r4, r5, r6, r7}
- cbz r3, .LoopEndFloat
- mov r5, #0
-
-.LoopBeginFloat:
- add r6, r1, r5 @ Get current source item's address in memory
- vldr s1, [r6, #0] @ Load src[i]
- vmov s3, r2 @ Get cst into register s3
- vadd.f32 s10, s1, s3 @ s10 = src[i] + cst
- add r7, r0, r5 @ Get current destination item's address in memory
- vstr s10, [r7, #0] @ Store the result back into the main memory
- add r5, r5, #4 @ increase the offset by 1*sizeof(float)
- subs r3, r3, #1 @ count down the remaining items
- bne .LoopBeginFloat @ Continue while items remain
-
-.LoopEndFloat:
- mov r0, NE10_OK @ Return NE10_OK
- pop {r4, r5, r6, r7}
- bx lr
-
-
-
-
- .balign 4
- .global addc_vec2f_asm
- .thumb
- .thumb_func
-
-addc_vec2f_asm:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t addc_vec2f(arm_vec2f_t * dst,
- @ arm_vec2f_t * src, const arm_vec2f_t * cst,
- @ unsigned int count)
- @
- @ Stores src[i] + *cst (component-wise) into dst[i] for count items and
- @ returns NE10_OK. The constant is re-read from memory on every iteration.
- @
- @ r0: *dst
- @ r1: *src
- @ r2: *cst
- @ r3: int count
- @
- @ r3: loop counter
- @ r5: current item's offset in both src[] and dst[]
- @ r6: current source item's address made of base(r1)+offset(r5)
- @ r7: current destination item's address made of base(r0)+offset(r5)
- @
- @ r4 is saved and restored together with r5-r7 but is not used here.
- @
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- push {r4, r5, r6, r7}
- cbz r3, .LoopEndVec2F
- mov r5, #0
-
-.LoopBeginVec2F:
-
- add r6, r1, r5 @ Get current source item's address in memory
- vldr s1, [r6, #0] @ Load src[i].x and src[i].y
- vldr s2, [r6, #4]
- vldr s3, [r2, #0] @ Load cst->x and cst->y
- vldr s4, [r2, #4]
- vadd.f32 s10, s1, s3 @ s10 = src[i].x + cst->x
- vadd.f32 s11, s2, s4 @ s11 = src[i].y + cst->y
- add r7, r0, r5 @ Get current destination item's address in memory
- vstr s10, [r7, #0] @ Store the results back into the main memory
- vstr s11, [r7, #4]
- add r5, r5, #8 @ increase the offset by 2*sizeof(float) @@ (for x and y)
- subs r3, r3, #1 @ count down the remaining items
- bne .LoopBeginVec2F @ Continue while items remain
-
-.LoopEndVec2F:
- mov r0, NE10_OK @ Return NE10_OK
- pop {r4, r5, r6, r7}
- bx lr
-
-
-
-
- .balign 4
- .global addc_vec3f_asm
- .thumb
- .thumb_func
-
-addc_vec3f_asm:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t addc_vec3f(arm_vec3f_t * dst,
- @ arm_vec3f_t * src, const arm_vec3f_t * cst,
- @ unsigned int count)
- @
- @ Stores src[i] + *cst (component-wise) into dst[i] for count items and
- @ returns NE10_OK. The constant is re-read from memory on every iteration.
- @
- @ r0: *dst
- @ r1: *src
- @ r2: *cst
- @ r3: int count
- @
- @ r3: loop counter
- @ r5: current item's offset in both src[] and dst[]
- @ r6: current source item's address made of base(r1)+offset(r5)
- @ r7: current destination item's address made of base(r0)+offset(r5)
- @
- @ r4 is saved and restored together with r5-r7 but is not used here.
- @
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- push {r4, r5, r6, r7}
- cbz r3, .LoopEndVec3F
- mov r5, #0
-
-.LoopBeginVec3F:
- add r6, r1, r5 @ Get current source item's address in memory
- vldr s1, [r6, #0] @ Load src[i].x, src[i].y , and src[i].z
- vldr s2, [r6, #4]
- vldr s3, [r6, #8]
- vldr s4, [r2, #0] @ Load cst->x, cst->y, and cst->z
- vldr s5, [r2, #4]
- vldr s6, [r2, #8]
- vadd.f32 s10, s1, s4 @ s10 = src[i].x + cst->x
- vadd.f32 s11, s2, s5 @ s11 = src[i].y + cst->y
- vadd.f32 s12, s3, s6 @ s12 = src[i].z + cst->z
- add r7, r0, r5 @ Get current destination item's address in memory
- vstr s10, [r7, #0] @ Store the results back into the main memory
- vstr s11, [r7, #4]
- vstr s12, [r7, #8]
- add r5, r5, #12 @ increase the offset by 3*sizeof(float) @@ (for x, y, and z)
- subs r3, r3, #1 @ count down the remaining items
- bne .LoopBeginVec3F @ Continue while items remain
-
-.LoopEndVec3F:
- mov r0, NE10_OK @ Return NE10_OK
- pop {r4, r5, r6, r7}
- bx lr
-
-
-
-
- .balign 4
- .global addc_vec4f_asm
- .thumb
- .thumb_func
-
-addc_vec4f_asm:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t addc_vec4f(arm_vec4f_t * dst,
- @ arm_vec4f_t * src, const arm_vec4f_t * cst,
- @ unsigned int count)
- @
- @ Stores src[i] + *cst (component-wise) into dst[i] for count items and
- @ returns NE10_OK. The constant is re-read from memory on every iteration.
- @
- @ r0: *dst
- @ r1: *src
- @ r2: *cst
- @ r3: int count
- @
- @ r3: loop counter
- @ r5: current item's offset in both src[] and dst[]
- @ r6: current source item's address made of base(r1)+offset(r5)
- @ r7: current destination item's address made of base(r0)+offset(r5)
- @
- @ r4 is saved and restored together with r5-r7 but is not used here.
- @
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- push {r4, r5, r6, r7}
- cbz r3, .LoopEndVec4F
- mov r5, #0
-
-.LoopBeginVec4F:
- add r6, r1, r5 @ Get current source item's address in memory
- vldr s1, [r6, #0] @ Load src[i].x, src[i].y , src[i].z, and w
- vldr s2, [r6, #4]
- vldr s3, [r6, #8]
- vldr s4, [r6, #12]
- vldr s5, [r2, #0] @ Load cst->x, cst->y, cst->z, and w
- vldr s6, [r2, #4]
- vldr s7, [r2, #8]
- vldr s8, [r2, #12]
- vadd.f32 s10, s1, s5 @ s10 = src[i].x + cst->x
- vadd.f32 s11, s2, s6 @ s11 = src[i].y + cst->y
- vadd.f32 s12, s3, s7 @ s12 = src[i].z + cst->z
- vadd.f32 s13, s4, s8 @ s13 = src[i].w + cst->w
- add r7, r0, r5 @ Get current destination item's address in memory
- vstr s10, [r7, #0] @ Store the results back into the main memory
- vstr s11, [r7, #4]
- vstr s12, [r7, #8]
- vstr s13, [r7, #12]
- add r5, r5, #16 @ increase the offset by 4*sizeof(float) @@ (for x, y, z, and w)
- subs r3, r3, #1 @ count down the remaining items
- bne .LoopBeginVec4F @ Continue while items remain
-
-.LoopEndVec4F:
- mov r0, NE10_OK @ Return NE10_OK
- pop {r4, r5, r6, r7}
- bx lr
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : source/NE10_addc.c
- */
-
-#include "NE10.h"
-#include "../headers/macros.h"
-
-#include <assert.h>
-
-/*
- * Adds the scalar constant cst to every element of a float array: the
- * statement handed to NE10_XC_OPERATION_X_C assigns src[itr] + cst to
- * dst[itr].
- * NOTE(review): the iteration over `itr`, any argument checking and the
- * returned arm_result_t come from the macro in ../headers/macros.h, which is
- * not visible here - confirm there.
- */
-arm_result_t addc_float_c(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count)
-{
- NE10_XC_OPERATION_X_C
- (
- dst[ itr ] = src[ itr ] + cst;
- );
-}
-
-/*
- * Adds the constant 2D vector *cst to every element of an array of 2D
- * vectors: the statements handed to NE10_XC_OPERATION_X_C add cst->x and
- * cst->y to the matching members of src[itr] and store them in dst[itr].
- * NOTE(review): the iteration over `itr`, any argument checking and the
- * returned arm_result_t come from the macro in ../headers/macros.h, which is
- * not visible here - confirm there.
- */
-arm_result_t addc_vec2f_c(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count)
-{
- NE10_XC_OPERATION_X_C
- (
- dst[ itr ].x = src[ itr ].x + cst->x;
- dst[ itr ].y = src[ itr ].y + cst->y;
- );
-}
-
-/*
- * Adds the constant 3D vector *cst to every element of an array of 3D
- * vectors: the statements handed to NE10_XC_OPERATION_X_C add cst->x, cst->y
- * and cst->z to the matching members of src[itr] and store them in dst[itr].
- * NOTE(review): the iteration over `itr`, any argument checking and the
- * returned arm_result_t come from the macro in ../headers/macros.h, which is
- * not visible here - confirm there.
- */
-arm_result_t addc_vec3f_c(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count)
-{
- NE10_XC_OPERATION_X_C
- (
- dst[ itr ].x = src[ itr ].x + cst->x;
- dst[ itr ].y = src[ itr ].y + cst->y;
- dst[ itr ].z = src[ itr ].z + cst->z;
- );
-}
-
-/*
- * Adds the constant 4D vector *cst to every element of an array of 4D
- * vectors: the statements handed to NE10_XC_OPERATION_X_C add cst->x, cst->y,
- * cst->z and cst->w to the matching members of src[itr] and store them in
- * dst[itr].
- * NOTE(review): the iteration over `itr`, any argument checking and the
- * returned arm_result_t come from the macro in ../headers/macros.h, which is
- * not visible here - confirm there.
- */
-arm_result_t addc_vec4f_c(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count)
-{
- NE10_XC_OPERATION_X_C
- (
- dst[ itr ].x = src[ itr ].x + cst->x;
- dst[ itr ].y = src[ itr ].y + cst->y;
- dst[ itr ].z = src[ itr ].z + cst->z;
- dst[ itr ].w = src[ itr ].w + cst->w;
- );
-}
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : source/NE10_addc.neon.c
- */
-
-#include "NE10.h"
-#include "../headers/macros.h"
-
-#include <assert.h>
-#include <arm_neon.h>
-
-
-/*
- * NEON add-constant for plain floats.
- * NE10_XC_OPERATION_FLOAT_NEON (not defined in this file) receives two code
- * fragments: a 128-bit add (vaddq_f32) and a 64-bit add (vadd_f32).
- * The n_* variables are declared by the macro; presumably the first fragment
- * is the main loop and the second handles leftover elements - confirm in the macro.
- */
-arm_result_t addc_float_neon(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count)
-{
- NE10_XC_OPERATION_FLOAT_NEON
- (
- n_dst = vaddq_f32( n_src , n_cst );
- ,
- n_tmp_src = vadd_f32( n_tmp_src, n_tmp_cst );
- );
-}
-
-/*
- * NEON add-constant for 2D vectors.
- * NE10_XC_OPERATION_VEC2F_NEON (not defined in this file) receives two code
- * fragments: a 128-bit add (vaddq_f32) and a 64-bit add (vadd_f32).
- * The n_* variables are declared by the macro; presumably the second fragment
- * handles a leftover vector - confirm in the macro.
- */
-arm_result_t addc_vec2f_neon(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count)
-{
- NE10_XC_OPERATION_VEC2F_NEON
- (
- n_dst = vaddq_f32( n_src , n_cst );
- ,
- n_tmp_src = vadd_f32( n_tmp_src, n_tmp_cst );
- );
-}
-
-/*
- * NEON add-constant for 3D vectors.
- * NE10_XC_OPERATION_VEC3F_NEON (not defined in this file) receives two code
- * fragments: the first adds three 128-bit registers (n_src1..3 + n_cst1..3),
- * the second adds the three 64-bit members of n_tmp_src / n_tmp_cst.
- * The n_* variables are declared by the macro; how the macro lays vectors out
- * across those registers is not visible here.
- */
-arm_result_t addc_vec3f_neon(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count)
-{
- NE10_XC_OPERATION_VEC3F_NEON
- (
- n_dst1 = vaddq_f32( n_src1 , n_cst1 );
- n_dst2 = vaddq_f32( n_src2 , n_cst2 );
- n_dst3 = vaddq_f32( n_src3 , n_cst3 );
- ,
- n_tmp_src.val[0] = vadd_f32( n_tmp_src.val[0], n_tmp_cst.val[0] ); /* the X lane */
- n_tmp_src.val[1] = vadd_f32( n_tmp_src.val[1], n_tmp_cst.val[1] ); /* the Y lane */
- n_tmp_src.val[2] = vadd_f32( n_tmp_src.val[2], n_tmp_cst.val[2] ); /* the Z lane */
- );
-}
-
-/*
- * NEON add-constant for 4D vectors.
- * NE10_XC_OPERATION_VEC4F_NEON (not defined in this file) receives a single
- * code fragment: one 128-bit add of n_src and n_cst into n_dst. Unlike the
- * float/vec2f/vec3f variants no second (64-bit) fragment is passed.
- */
-arm_result_t addc_vec4f_neon(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count)
-{
- NE10_XC_OPERATION_VEC4F_NEON
- (
- n_dst = vaddq_f32( n_src , n_cst );
- );
-}
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : source/NE10_addc_test.c
- */
-
-//Make sure the following are defined before including "unit_test.h"
-
-// length of the data arrays
-#define ARRLEN TEST_ARRLEN
-// number of the operations in a given unit
-#define OP_COUNT 4
-// number of the different implementations of each of the functions (C, ASM, NEON, ...)
-#define IMPL_COUNT 3
-
-
-#include "../headers/unit_test_xc_operation_x.h"
-
-void init_ftbl()
-{
- // Implementations under test, grouped per operation (float, vec2f, vec3f,
- // vec4f) and ordered C, ASM, NEON inside each group. The position in this
- // list is the slot the function occupies in the global function table.
- const arm_func_4args_t impls[] =
- {
- (arm_func_4args_t) addc_float_c,
- (arm_func_4args_t) addc_float_asm,
- (arm_func_4args_t) addc_float_neon,
-
- (arm_func_4args_t) addc_vec2f_c,
- (arm_func_4args_t) addc_vec2f_asm,
- (arm_func_4args_t) addc_vec2f_neon,
-
- (arm_func_4args_t) addc_vec3f_c,
- (arm_func_4args_t) addc_vec3f_asm,
- (arm_func_4args_t) addc_vec3f_neon,
-
- (arm_func_4args_t) addc_vec4f_c,
- (arm_func_4args_t) addc_vec4f_asm,
- (arm_func_4args_t) addc_vec4f_neon
- };
- unsigned int slot;
-
- // copy the list into the global function table, slot by slot
- for ( slot = 0; slot < sizeof(impls) / sizeof(impls[0]); slot++ )
- {
- ftbl[ slot ] = impls[ slot ];
- }
-}
-
-arm_result_t main( int argc, char **argv )
-{
- // hand the command line to the shared test driver and report its status
- const arm_result_t status = run_test( argc, argv ); // defined in "unit_test.h"
- return status;
-}
+++ /dev/null
-@
-@ Copyright 2011-12 ARM Limited
-@
-@ Licensed under the Apache License, Version 2.0 (the "License");
-@ you may not use this file except in compliance with the License.
-@ You may obtain a copy of the License at
-@
-@ http://www.apache.org/licenses/LICENSE-2.0
-@
-@ Unless required by applicable law or agreed to in writing, software
-@ distributed under the License is distributed on an "AS IS" BASIS,
-@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@ See the License for the specific language governing permissions and
-@ limitations under the License.
-@
-
-@
-@ NE10 Library : source/NE10_addmat.asm.s
-@
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : source/NE10_addmat.c
- */
-
-#include "NE10.h"
-#include "../headers/macros.h"
-
-#include <assert.h>
-
-/*
- * Element-wise sum of 2x2 matrices: dst[itr] = src1[itr] + src2[itr],
- * written out for each column (c1, c2) and row (r1, r2).
- * Looping and the return value come from the NE10_X_OPERATION_FLOAT_C macro,
- * which is not defined in this file.
- */
-arm_result_t addmat_2x2f_c(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count)
-{
- NE10_X_OPERATION_FLOAT_C
- (
- dst[ itr ].c1.r1 = src1[ itr ].c1.r1 + src2[ itr ].c1.r1;
- dst[ itr ].c1.r2 = src1[ itr ].c1.r2 + src2[ itr ].c1.r2;
-
- dst[ itr ].c2.r1 = src1[ itr ].c2.r1 + src2[ itr ].c2.r1;
- dst[ itr ].c2.r2 = src1[ itr ].c2.r2 + src2[ itr ].c2.r2;
- );
-}
-
-/*
- * Element-wise sum of 3x3 matrices: dst[itr] = src1[itr] + src2[itr],
- * written out for each column (c1..c3) and row (r1..r3).
- * Looping and the return value come from the NE10_X_OPERATION_FLOAT_C macro,
- * which is not defined in this file.
- */
-arm_result_t addmat_3x3f_c(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count)
-{
- NE10_X_OPERATION_FLOAT_C
- (
- dst[ itr ].c1.r1 = src1[ itr ].c1.r1 + src2[ itr ].c1.r1;
- dst[ itr ].c1.r2 = src1[ itr ].c1.r2 + src2[ itr ].c1.r2;
- dst[ itr ].c1.r3 = src1[ itr ].c1.r3 + src2[ itr ].c1.r3;
-
- dst[ itr ].c2.r1 = src1[ itr ].c2.r1 + src2[ itr ].c2.r1;
- dst[ itr ].c2.r2 = src1[ itr ].c2.r2 + src2[ itr ].c2.r2;
- dst[ itr ].c2.r3 = src1[ itr ].c2.r3 + src2[ itr ].c2.r3;
-
- dst[ itr ].c3.r1 = src1[ itr ].c3.r1 + src2[ itr ].c3.r1;
- dst[ itr ].c3.r2 = src1[ itr ].c3.r2 + src2[ itr ].c3.r2;
- dst[ itr ].c3.r3 = src1[ itr ].c3.r3 + src2[ itr ].c3.r3;
- );
-}
-
-/*
- * Element-wise sum of 4x4 matrices: dst[itr] = src1[itr] + src2[itr],
- * written out for each column (c1..c4) and row (r1..r4).
- * Looping and the return value come from the NE10_X_OPERATION_FLOAT_C macro,
- * which is not defined in this file.
- */
-arm_result_t addmat_4x4f_c(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count)
-{
- NE10_X_OPERATION_FLOAT_C
- (
- dst[ itr ].c1.r1 = src1[ itr ].c1.r1 + src2[ itr ].c1.r1;
- dst[ itr ].c1.r2 = src1[ itr ].c1.r2 + src2[ itr ].c1.r2;
- dst[ itr ].c1.r3 = src1[ itr ].c1.r3 + src2[ itr ].c1.r3;
- dst[ itr ].c1.r4 = src1[ itr ].c1.r4 + src2[ itr ].c1.r4;
-
- dst[ itr ].c2.r1 = src1[ itr ].c2.r1 + src2[ itr ].c2.r1;
- dst[ itr ].c2.r2 = src1[ itr ].c2.r2 + src2[ itr ].c2.r2;
- dst[ itr ].c2.r3 = src1[ itr ].c2.r3 + src2[ itr ].c2.r3;
- dst[ itr ].c2.r4 = src1[ itr ].c2.r4 + src2[ itr ].c2.r4;
-
- dst[ itr ].c3.r1 = src1[ itr ].c3.r1 + src2[ itr ].c3.r1;
- dst[ itr ].c3.r2 = src1[ itr ].c3.r2 + src2[ itr ].c3.r2;
- dst[ itr ].c3.r3 = src1[ itr ].c3.r3 + src2[ itr ].c3.r3;
- dst[ itr ].c3.r4 = src1[ itr ].c3.r4 + src2[ itr ].c3.r4;
-
- dst[ itr ].c4.r1 = src1[ itr ].c4.r1 + src2[ itr ].c4.r1;
- dst[ itr ].c4.r2 = src1[ itr ].c4.r2 + src2[ itr ].c4.r2;
- dst[ itr ].c4.r3 = src1[ itr ].c4.r3 + src2[ itr ].c4.r3;
- dst[ itr ].c4.r4 = src1[ itr ].c4.r4 + src2[ itr ].c4.r4;
- );
-}
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "NE10.h"
-
-arm_result_t addmat_2x2f_neon(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count)
-{
- /* Reuse the vec2f adder: every 2x2 matrix is handed over as 2 consecutive
- * vec2f items, so count matrices become count*2 vectors. */
- arm_vec2f_t * const out = (arm_vec2f_t*)dst;
- arm_vec2f_t * const lhs = (arm_vec2f_t*)src1;
- arm_vec2f_t * const rhs = (arm_vec2f_t*)src2;
- const unsigned int vec_count = count*2;
-
- return add_vec2f_neon( out, lhs, rhs, vec_count );
-}
-
-arm_result_t addmat_3x3f_neon(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count)
-{
- /* Reuse the vec3f adder: every 3x3 matrix is handed over as 3 consecutive
- * vec3f items, so count matrices become count*3 vectors. */
- arm_vec3f_t * const out = (arm_vec3f_t*)dst;
- arm_vec3f_t * const lhs = (arm_vec3f_t*)src1;
- arm_vec3f_t * const rhs = (arm_vec3f_t*)src2;
- const unsigned int vec_count = count*3;
-
- return add_vec3f_neon( out, lhs, rhs, vec_count );
-}
-
-arm_result_t addmat_4x4f_neon(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count)
-{
- /* Reuse the vec4f adder: every 4x4 matrix is handed over as 4 consecutive
- * vec4f items, so count matrices become count*4 vectors. */
- arm_vec4f_t * const out = (arm_vec4f_t*)dst;
- arm_vec4f_t * const lhs = (arm_vec4f_t*)src1;
- arm_vec4f_t * const rhs = (arm_vec4f_t*)src2;
- const unsigned int vec_count = count*4;
-
- return add_vec4f_neon( out, lhs, rhs, vec_count );
-}
-
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : source/NE10_addmat_test.c
- */
-
-//Make sure the following are defined before including "unit_test.h"
-
-// length of the data arrays
-#define ARRLEN TEST_ARRLEN_MATRICES
-// number of the operations in a given unit
-#define OP_COUNT 3
-// number of the different implementations of each of the functions (C, ASM, NEON, ...)
-#define IMPL_COUNT 3
-
-
-#include "../headers/unit_test_xmat_operation_x.h"
-
-extern arm_result_t addmat_2x2f_c (arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count);
-extern arm_result_t addmat_2x2f_neon(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count);
-
-extern arm_result_t addmat_3x3f_c (arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count);
-extern arm_result_t addmat_3x3f_neon(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count);
-
-extern arm_result_t addmat_4x4f_c (arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count);
-extern arm_result_t addmat_4x4f_neon(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count);
-
-void init_ftbl()
-{
- // Implementations under test, grouped per matrix size (2x2, 3x3, 4x4) and
- // ordered C, ASM, NEON inside each group. The position in this list is the
- // slot the function occupies in the global function table.
- const arm_func_4args_t impls[] =
- {
- (arm_func_4args_t) addmat_2x2f_c,
- (arm_func_4args_t) addmat_2x2f_c, // using the c version in place of the assembly version
- (arm_func_4args_t) addmat_2x2f_neon,
-
- (arm_func_4args_t) addmat_3x3f_c,
- (arm_func_4args_t) addmat_3x3f_c, // using the c version in place of the assembly version
- (arm_func_4args_t) addmat_3x3f_neon,
-
- (arm_func_4args_t) addmat_4x4f_c,
- (arm_func_4args_t) addmat_4x4f_c, // using the c version in place of the assembly version
- (arm_func_4args_t) addmat_4x4f_neon
- };
- unsigned int slot;
-
- // copy the list into the global function table, slot by slot
- for ( slot = 0; slot < sizeof(impls) / sizeof(impls[0]); slot++ )
- {
- ftbl[ slot ] = impls[ slot ];
- }
-}
-
-arm_result_t main( int argc, char **argv )
-{
- // hand the command line to the shared test driver and report its status
- const arm_result_t status = run_test( argc, argv ); // defined in "unit_test.h"
- return status;
-}
+++ /dev/null
-@
-@ Copyright 2011-12 ARM Limited
-@
-@ Licensed under the Apache License, Version 2.0 (the "License");
-@ you may not use this file except in compliance with the License.
-@ You may obtain a copy of the License at
-@
-@ http://www.apache.org/licenses/LICENSE-2.0
-@
-@ Unless required by applicable law or agreed to in writing, software
-@ distributed under the License is distributed on an "AS IS" BASIS,
-@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@ See the License for the specific language governing permissions and
-@ limitations under the License.
-@
-
-@
-@ NE10 Library : source/NE10_cross.asm.s
-@
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : source/NE10_cross.c
- */
-
-#include "NE10.h"
-#include "../headers/macros.h"
-
-#include <assert.h>
-
-/*
- * Cross product of 3D vectors: dst[itr] = src1[itr] x src2[itr].
- *
- * All three components are computed into temporaries before anything is
- * written to dst. Writing dst[itr].x first (as this routine used to do) would
- * corrupt the .y and .z results whenever dst is the same array as src1 or
- * src2, because those results still need the original .x / .y inputs.
- *
- * Looping and the return value come from the NE10_X_OPERATION_FLOAT_C macro,
- * which is not defined in this file.
- */
-arm_result_t cross_vec3f_c(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count)
-{
- NE10_X_OPERATION_FLOAT_C
- (
- const arm_float_t cross_x = (src1[ itr ].y * src2[ itr ].z) - (src1[ itr ].z * src2[ itr ].y);
- const arm_float_t cross_y = (src1[ itr ].z * src2[ itr ].x) - (src1[ itr ].x * src2[ itr ].z);
- const arm_float_t cross_z = (src1[ itr ].x * src2[ itr ].y) - (src1[ itr ].y * src2[ itr ].x);
-
- dst[ itr ].x = cross_x;
- dst[ itr ].y = cross_y;
- dst[ itr ].z = cross_z;
- );
-}
+++ /dev/null
-@
-@ Copyright 2011-12 ARM Limited
-@
-@ Licensed under the Apache License, Version 2.0 (the "License");
-@ you may not use this file except in compliance with the License.
-@ You may obtain a copy of the License at
-@
-@ http://www.apache.org/licenses/LICENSE-2.0
-@
-@ Unless required by applicable law or agreed to in writing, software
-@ distributed under the License is distributed on an "AS IS" BASIS,
-@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@ See the License for the specific language governing permissions and
-@ limitations under the License.
-@
-
-@
-@ NE10 Library : source/NE10_cross.neon.s
-@
-
- .text
- .syntax unified
-
-.include "headers/NE10header.s"
-
-
-
-
- .align 4
- .global cross_vec3f_neon
- .thumb
- .thumb_func
-cross_vec3f_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t cross_vec3f(arm_vec3f_t * dst,
- @ arm_vec3f_t * src1,
- @ arm_vec3f_t * src2,
- @ unsigned int count)
- @
- @ Computes dst[i] = src1[i] x src2[i] for count 3-float vectors.
- @
- @ r0: *dst & current dst entry's address
- @ r1: *src1 & current src1 entry's address
- @ r2: *src2 & current src2 entry's address
- @ r3: int count on entry; afterwards count / 4, the number of
- @ 4-vector chunks handled by the main loop
- @
- @ r4: count % 4, the number of vectors handled one at a time by the
- @ residual loop (which runs BEFORE the main loop)
- @
- @ Returns 0 in r0.
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- push {r4}
- and r4, r3, #3 @ r4 = count % 4; calculate the residual loop
- lsr r3, r3, #2 @ r3 = count / 4; logical shift because count is unsigned
-
- cbz r4, .L_check_mainloop_vec3
-
-.L_residualloop_vec3:
- @ process the first (count % 4) items of the input arrays, one vector at a time
- vld3.f32 {d0[0], d2[0], d4[0]}, [r1]! @ The values are loaded like so:
- @ q0 = { V1.x, -, -, - };
- @ q1 = { V1.y, -, -, - };
- @ q2 = { V1.z, -, -, - };
- vld3.f32 {d1[0], d3[0], d5[0]}, [r2]! @ The values are loaded like so:
- @ q0 = { V1.x, -, V2.x, - };
- @ q1 = { V1.y, -, V2.y, - };
- @ q2 = { V1.z, -, V2.z, - };
-
- subs r4, r4, #1 @ sets the flags tested by the bgt below
-
- @ calculate the cross product; only lane 0 of each result is meaningful
- vmul.f32 d20, d2, d5 @ x = V1.y * V2.z
- vmul.f32 d21, d4, d1 @ y = V1.z * V2.x
- vmul.f32 d22, d0, d3 @ z = V1.x * V2.y
-
- vmls.f32 d20, d3, d4 @ x -= V2.y * V1.z
- vmls.f32 d21, d5, d0 @ y -= V2.z * V1.x
- vmls.f32 d22, d1, d2 @ z -= V2.x * V1.y
-
- vst3.32 {d20[0], d21[0], d22[0]}, [r0]!
-
- bgt .L_residualloop_vec3 @ loop while r4 > 0
-
-.L_check_mainloop_vec3:
- cbz r3, .L_return_vec3
-
- @ load the first chunk: q0/q1/q2 = x/y/z of 4 src1 vectors,
- @ q13/q14/q15 = x/y/z of 4 src2 vectors
- vld3.32 {d0, d2, d4}, [r1]!
- vld3.32 {d1, d3, d5}, [r1]!
- vld3.32 {d26, d28, d30}, [r2]!
- vld3.32 {d27, d29, d31}, [r2]!
-
-.L_mainloop_vec3:
- @ calculate the cross products for the chunk that is currently loaded
- vmul.f32 q10, q1, q15
- vmul.f32 q11, q2, q13
- vmul.f32 q12, q0, q14
-
- vmls.f32 q10, q14, q2
- vmls.f32 q11, q15, q0
- vmls.f32 q12, q13, q1
-
- @ store the result for the current chunk
- vst3.32 {d20, d22, d24}, [r0]!
- vst3.32 {d21, d23, d25}, [r0]!
- subs r3, r3, #1
- beq .L_return_vec3 @ that was the last chunk: do not read past the end of src1/src2
-
- @ load the next chunk (only reached when at least one more chunk remains)
- vld3.32 {d0, d2, d4}, [r1]!
- vld3.32 {d1, d3, d5}, [r1]!
- vld3.32 {d26, d28, d30}, [r2]!
- vld3.32 {d27, d29, d31}, [r2]!
-
- b .L_mainloop_vec3
-
-.L_return_vec3:
- @ return
- pop {r4}
- mov r0, #0
- bx lr
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : source/NE10_cross_test.c
- */
-
-//Make sure the following are defined before including "unit_test.h"
-
-// length of the data arrays
-#define ARRLEN TEST_ARRLEN
-// number of the operations in a given unit
-#define OP_COUNT 1
-// number of the different implementations of each of the functions (C, ASM, NEON, ...)
-#define IMPL_COUNT 3
-
-
-#include "../headers/unit_test_cross_operation_x.h"
-
-extern arm_result_t cross_vec3f_c (arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
-//extern arm_result_t cross_vec3f_asm (arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
-extern arm_result_t cross_vec3f_neon(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
-
-void init_ftbl()
-{
- // Implementations under test for the single cross-product operation,
- // ordered C, ASM, NEON. The position in this list is the slot the function
- // occupies in the global function table.
- const arm_func_4args_t impls[] =
- {
- (arm_func_4args_t) cross_vec3f_c,
- (arm_func_4args_t) cross_vec3f_c, // using the c version in place of the assembly version
- (arm_func_4args_t) cross_vec3f_neon
- };
- unsigned int slot;
-
- // copy the list into the global function table, slot by slot
- for ( slot = 0; slot < sizeof(impls) / sizeof(impls[0]); slot++ )
- {
- ftbl[ slot ] = impls[ slot ];
- }
-}
-
-arm_result_t main( int argc, char **argv )
-{
- // hand the command line to the shared test driver and report its status
- const arm_result_t status = run_test( argc, argv ); // defined in "unit_test.h"
- return status;
-}
+++ /dev/null
-@
-@ Copyright 2011-12 ARM Limited
-@
-@ Licensed under the Apache License, Version 2.0 (the "License");
-@ you may not use this file except in compliance with the License.
-@ You may obtain a copy of the License at
-@
-@ http://www.apache.org/licenses/LICENSE-2.0
-@
-@ Unless required by applicable law or agreed to in writing, software
-@ distributed under the License is distributed on an "AS IS" BASIS,
-@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@ See the License for the specific language governing permissions and
-@ limitations under the License.
-@
-
-@
-@ NE10 Library : source/NE10_detmat.asm.s
-@
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : source/NE10_detmat.c
- */
-
-#include "NE10.h"
-#include "../headers/macros.h"
-#include "NE10_detmat.c.h"
-
-#include <assert.h>
-
-/*
- * Determinants of 2x2 matrices: dst[itr] = det( src[itr] ), using the
- * DET2x2 helper from NE10_detmat.c.h.
- * Looping and the return value come from the NE10_DETMAT_OPERATION_X_C macro,
- * which is not defined in this file.
- */
-arm_result_t detmat_2x2f_c(arm_float_t * dst, arm_mat2x2f_t * src, unsigned int count)
-{
- NE10_DETMAT_OPERATION_X_C
- (
- dst[ itr ] = DET2x2( &src[ itr ] );
- );
-}
-
-/*
- * Determinants of 3x3 matrices: dst[itr] = det( src[itr] ), using the
- * DET3x3 helper from NE10_detmat.c.h.
- * Looping and the return value come from the NE10_DETMAT_OPERATION_X_C macro,
- * which is not defined in this file.
- */
-arm_result_t detmat_3x3f_c(arm_float_t * dst, arm_mat3x3f_t * src, unsigned int count)
-{
- NE10_DETMAT_OPERATION_X_C
- (
- dst[ itr ] = DET3x3( &(src[ itr ]) );
-
- );
-}
-
-/*
- * Determinants of 4x4 matrices: dst[itr] = det( src[itr] ), using the
- * DET4x4 helper from NE10_detmat.c.h.
- * Looping and the return value come from the NE10_DETMAT_OPERATION_X_C macro,
- * which is not defined in this file.
- */
-arm_result_t detmat_4x4f_c(arm_float_t * dst, arm_mat4x4f_t * src, unsigned int count)
-{
- NE10_DETMAT_OPERATION_X_C
- (
- dst[ itr ] = DET4x4( &src[ itr ] );
- );
-}
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : source/NE10_detmat.c.h
- */
-
-#ifndef __NE10_DETMAT_C_H__
-#define __NE10_DETMAT_C_H__
-
-#include "NE10.h"
-#include "../headers/macros.h"
-
-#include <assert.h>
-
-static inline arm_float_t DET2x2( arm_mat2x2f_t * mat )
-{
- // 2x2 matrix layout (cN = column N, rN = row N)
- // c1r1 c2r1
- // c1r2 c2r2
- //
- // determinant = (main diagonal product) - (anti-diagonal product)
-
- const arm_float_t main_diag = mat->c1.r1 * mat->c2.r2;
- const arm_float_t anti_diag = mat->c2.r1 * mat->c1.r2;
-
- return main_diag - anti_diag;
-}
-
-static inline arm_float_t DET3x3( arm_mat3x3f_t * mat )
-{
- // 3x3 matrix layout (cN = column N, rN = row N)
- // c1r1 c2r1 c3r1
- // c1r2 c2r2 c3r2
- // c1r3 c2r3 c3r3
- //
- // Cofactor expansion along the first row: each 2x2 minor keeps rows 2..3
- // of the two columns that remain after dropping one column.
-
- arm_mat2x2f_t minor_no_c1 = { {mat->c2.r2, mat->c2.r3}, {mat->c3.r2, mat->c3.r3} };
- arm_mat2x2f_t minor_no_c2 = { {mat->c1.r2, mat->c1.r3}, {mat->c3.r2, mat->c3.r3} };
- arm_mat2x2f_t minor_no_c3 = { {mat->c1.r2, mat->c1.r3}, {mat->c2.r2, mat->c2.r3} };
-
- const arm_float_t term_c1 = mat->c1.r1*DET2x2( &minor_no_c1 );
- const arm_float_t term_c2 = mat->c2.r1*DET2x2( &minor_no_c2 );
- const arm_float_t term_c3 = mat->c3.r1*DET2x2( &minor_no_c3 );
-
- // alternating signs: + - +
- return term_c1 - term_c2 + term_c3;
-}
-
-static inline arm_float_t DET4x4( arm_mat4x4f_t * mat )
-{
- // 4x4 matrix layout (cN = column N, rN = row N)
- // c1r1 c2r1 c3r1 c4r1
- // c1r2 c2r2 c3r2 c4r2
- // c1r3 c2r3 c3r3 c4r3
- // c1r4 c2r4 c3r4 c4r4
- //
- // Cofactor expansion along the first row: each 3x3 minor keeps rows 2..4
- // of the three columns that remain after dropping one column.
-
- arm_mat3x3f_t minor_no_c1 = { {mat->c2.r2, mat->c2.r3, mat->c2.r4},
- {mat->c3.r2, mat->c3.r3, mat->c3.r4},
- {mat->c4.r2, mat->c4.r3, mat->c4.r4} };
-
- arm_mat3x3f_t minor_no_c2 = { {mat->c1.r2, mat->c1.r3, mat->c1.r4},
- {mat->c3.r2, mat->c3.r3, mat->c3.r4},
- {mat->c4.r2, mat->c4.r3, mat->c4.r4} };
-
- arm_mat3x3f_t minor_no_c3 = { {mat->c1.r2, mat->c1.r3, mat->c1.r4},
- {mat->c2.r2, mat->c2.r3, mat->c2.r4},
- {mat->c4.r2, mat->c4.r3, mat->c4.r4} };
-
- arm_mat3x3f_t minor_no_c4 = { {mat->c1.r2, mat->c1.r3, mat->c1.r4},
- {mat->c2.r2, mat->c2.r3, mat->c2.r4},
- {mat->c3.r2, mat->c3.r3, mat->c3.r4} };
-
- const arm_float_t term_c1 = mat->c1.r1*DET3x3( &minor_no_c1 );
- const arm_float_t term_c2 = mat->c2.r1*DET3x3( &minor_no_c2 );
- const arm_float_t term_c3 = mat->c3.r1*DET3x3( &minor_no_c3 );
- const arm_float_t term_c4 = mat->c4.r1*DET3x3( &minor_no_c4 );
-
- // alternating signs: + - + -
- return term_c1 - term_c2 + term_c3 - term_c4;
-}
-
-
-
-
-#endif
+++ /dev/null
-@
-@ Copyright 2011-12 ARM Limited
-@
-@ Licensed under the Apache License, Version 2.0 (the "License");
-@ you may not use this file except in compliance with the License.
-@ You may obtain a copy of the License at
-@
-@ http://www.apache.org/licenses/LICENSE-2.0
-@
-@ Unless required by applicable law or agreed to in writing, software
-@ distributed under the License is distributed on an "AS IS" BASIS,
-@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@ See the License for the specific language governing permissions and
-@ limitations under the License.
-@
-
-@
-@ NE10 Library : source/NE10_detmat.neon.inc.s
-@
-
-
-
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ Get determinants of two 2x2 matrices in dRes
- @ Computes, lane by lane, dRes = dA*dD - dB*dC. With "d" registers
- @ (two float lanes each) this gives two determinants at once.
- @ Only \dRes is written.
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- .macro GET_DET_2x2MATS_ARGS dA, dB, dC, dD, dRes
- vmul.f32 \dRes, \dA, \dD @ dRes = A*D
- vmls.f32 \dRes, \dB, \dC @ dRes = A*D - B*C
- .endm
-
-
-
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ Get negated determinants of two 2x2 matrices in dRes
- @ Passes the arguments to GET_DET_2x2MATS_ARGS in swapped order
- @ (C, D, A, B), which yields dRes = C*B - D*A = -(A*D - B*C).
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- .macro GET_NEG_DET_2x2MATS_ARGS dA, dB, dC, dD, dRes
- GET_DET_2x2MATS_ARGS \dC, \dD, \dA, \dB, \dRes
- .endm
-
-
-
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ A macro used inside detmat_3x3f_neon() to load 3x3 matrices.
- @ Two 3x3 matrices are loaded from the source address
- @ into registers dst00-11. The corresponding qr00-qr05
- @ registers are then rearranged so the order of the data fits the
- @ code written in other macros below.
- @ Each matrix (9 floats) is read with two vld3.32 loads: 6 floats into
- @ full "d" registers, then the remaining 3 floats into lane 0 only.
- @ \addr is post-incremented past both matrices.
- @ NOTE(review): the qr arguments are presumably the "q" registers that
- @ overlay the dst "d" registers - confirm at the call site.
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- .macro LOAD_3x3MATS_ARGS dst00, dst01, dst02, dst03, dst04, dst05, dst06, dst07, dst08, dst09, dst10, dst11, qr00, qr01, qr02, qr03, qr04, qr05, addr
-
- vld3.32 { \dst00, \dst02, \dst04 }, [\addr]!
- vld3.32 { \dst01[0], \dst03[0], \dst05[0] }, [\addr]!
- vld3.32 { \dst06, \dst08, \dst10 }, [\addr]!
- vld3.32 { \dst07[0], \dst09[0], \dst11[0] }, [\addr]!
-
- vtrn.32 \qr00, \qr03
- vtrn.32 \qr01, \qr04
- vtrn.32 \qr02, \qr05
- .endm
-
-
-
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ This macro calculates the determinant of two 3x3 matrices
- @ loaded using the above LOAD_3x3MATS_ARGS macro.
- @ The result is stored in the \res register.
- @ Registers \tmp2 and \tmp3 are used as scratch registers and will
- @ not be restored in this macro - the caller needs to restore them
- @ if needed. Each of the aa-ii parameters can be a "d" register
- @ containing two floating-point values which correspond to the
- @ following reference matrix:
- @
- @ |aa dd gg|
- @ M = |bb ee hh|
- @ |cc ff ii|
- @
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- .macro GET_DETERMINANT_of_3x3MATS_ARGS aa, bb, cc, dd, ee, ff, gg, hh, ii, res, tmp2, tmp3
- @ det = a*(ei-fh) - d*(bi-ch) + g*(bf-ec)
-
- vmul.f32 \res, \ee, \ii @ t1 = ei
- vmul.f32 \tmp2, \bb, \ii @ t2 = bi
- vmul.f32 \tmp3, \bb, \ff @ t3 = bf
-
- vmls.f32 \res, \ff, \hh @ t1 = ei-fh
- vmls.f32 \tmp2, \cc, \hh @ t2 = bi-ch
- vmls.f32 \tmp3, \ee, \cc @ t3 = bf-ec
-
- vmul.f32 \res, \aa, \res @ t1 = a*(ei-fh)
- vmls.f32 \res, \dd, \tmp2 @ t1 = a*(ei-fh) - d*(bi-ch)
- vmla.f32 \res, \gg, \tmp3 @ t1 = a*(ei-fh) - d*(bi-ch) + g*(bf-ec) = det(M1), det(M2)
- .endm
-
-
-
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ This macro calculates the negated determinant of two 3x3 matrices
- @ The result is stored in \res
- @ It invokes GET_DETERMINANT_of_3x3MATS_ARGS with the first two columns
- @ (aa,bb,cc) and (dd,ee,ff) exchanged, which flips the sign of the result.
- @ \tmp2 and \tmp3 are scratch registers and are not restored.
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- .macro GET_NEG_DET_3x3MATS_ARGS aa, bb, cc, dd, ee, ff, gg, hh, ii, res, tmp2, tmp3
- @ det = - a*(ei-fh) + d*(bi-ch) - g*(bf-ec)
- GET_DETERMINANT_of_3x3MATS_ARGS \dd, \ee, \ff, \aa, \bb, \cc, \gg, \hh, \ii, \res, \tmp2, \tmp3 @ Using the column exchange property
- .endm
-
-
-
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ A macro used inside detmat_4x4f_neon() to load 4x4 matrices.
- @ Two 4x4 matrices are loaded from the source address register \addr
- @ into registers dst00-15. The corresponding qr00-qr07
- @ registers are then rearranged so the order of the data fits the
- @ code written in other macros below.
- @ Each vld4.32 reads 8 floats, so the four loads cover 32 floats
- @ (two matrices); \addr is post-incremented past them.
- @ NOTE(review): the qr arguments are presumably the "q" registers that
- @ overlay the dst "d" registers - confirm at the call site.
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- .macro LOAD_4x4MATS_ARGS dst00, dst01, dst02, dst03, dst04, dst05, dst06, dst07, dst08, dst09, dst10, dst11, dst12, dst13, dst14, dst15, qr00, qr01, qr02, qr03, qr04, qr05, qr06, qr07, addr
-
- vld4.32 { \dst00, \dst02, \dst04, \dst06 }, [\addr]!
- vld4.32 { \dst01, \dst03, \dst05, \dst07 }, [\addr]!
- vld4.32 { \dst08, \dst10, \dst12, \dst14 }, [\addr]!
- vld4.32 { \dst09, \dst11, \dst13, \dst15 }, [\addr]!
-
- vtrn.32 \qr00, \qr04
- vtrn.32 \qr01, \qr05
- vtrn.32 \qr02, \qr06
- vtrn.32 \qr03, \qr07
- .endm
-
-
-
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ This macro calculates the determinant of 4x4 matrices
- @ loaded using the above LOAD_4x4MATS_ARGS macro.
- @ The result is stored in the \res register.
- @ Registers \tmp2 to \tmp6 are used as scratch registers and will
- @ not be restored in this macro - the caller needs to restore them
- @ if needed. Each of the aa-pp parameters can be a "d" register
- @ containing two floating-point values which correspond to the
- @ following reference matrix:
- @
- @ |aa ee ii mm|
- @ M = |bb ff jj nn|
- @ |cc gg kk oo|
- @ |dd hh ll pp|
- @
- @ The determinant is expanded along the first row:
- @ det = aa*det(SubM11) - ee*det(SubM12) + ii*det(SubM13) - mm*det(SubM14)
- @ where SubM1k is M without its first row and k-th column.
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- .macro GET_DETERMINANT_of_4x4MATS_ARGS aa, bb, cc, dd, ee, ff, gg, hh, ii, jj, kk, ll, mm, nn, oo, pp, res, tmp2, tmp3, tmp4, tmp5, tmp6
-
- @ res = det(SubM11)
- GET_DETERMINANT_of_3x3MATS_ARGS \ff, \gg, \hh, \jj, \kk, \ll, \nn, \oo, \pp, \res, \tmp5, \tmp6
-
- @ tmp2 = det(SubM12)
- GET_DETERMINANT_of_3x3MATS_ARGS \bb, \cc, \dd, \jj, \kk, \ll, \nn, \oo, \pp, \tmp2, \tmp5, \tmp6
-
- @ tmp3 = det(SubM13)
- GET_DETERMINANT_of_3x3MATS_ARGS \bb, \cc, \dd, \ff, \gg, \hh, \nn, \oo, \pp, \tmp3, \tmp5, \tmp6
-
- @ tmp4 = det(SubM14)
- GET_DETERMINANT_of_3x3MATS_ARGS \bb, \cc, \dd, \ff, \gg, \hh, \jj, \kk, \ll, \tmp4, \tmp5, \tmp6
-
-
- vmul.f32 \res, \aa, \res @ res = aa*det(SubM11)
- vmls.f32 \res, \ee, \tmp2 @ res -= ee*det(SubM12)
- vmla.f32 \res, \ii, \tmp3 @ res += ii*det(SubM13)
- vmls.f32 \res, \mm, \tmp4 @ res -= mm*det(SubM14)
- .endm
-
-
-
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ A macro used inside detmat_4x4f_neon() to load a single 4x4 matrix
- @ (16 floats, read by two vld4.32 loads of 8 floats each)
- @ from the memory location pointed to by the \addr register.
- @ The loaded matrix is stored in registers dst00-07 and
- @ finally rearranged using the corresponding registers qr00-qr03.
- @ qtmp1-qtmp4 are scratch registers which are not restored in this
- @ macro. The caller must restore them if needed.
- @ \addr is post-incremented past the matrix.
- @ NOTE: Throughout Ne10, matrices are loaded and stored in
- @ column major format.
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- .macro LOAD_SINGLE_4x4MAT_ARGS dst00, dst01, dst02, dst03, dst04, dst05, dst06, dst07, qr00, qr01, qr02, qr03, qtmp1, qtmp2, qtmp3, qtmp4, addr
-
- vld4.32 { \dst00, \dst02, \dst04, \dst06 }, [\addr]!
- vld4.32 { \dst01, \dst03, \dst05, \dst07 }, [\addr]!
-
- vtrn.32 \qr00, \qtmp1
- vtrn.32 \qr01, \qtmp2
- vtrn.32 \qr02, \qtmp3
- vtrn.32 \qr03, \qtmp4
- .endm
+++ /dev/null
-@
-@ Copyright 2011-12 ARM Limited
-@
-@ Licensed under the Apache License, Version 2.0 (the "License");
-@ you may not use this file except in compliance with the License.
-@ You may obtain a copy of the License at
-@
-@ http://www.apache.org/licenses/LICENSE-2.0
-@
-@ Unless required by applicable law or agreed to in writing, software
-@ distributed under the License is distributed on an "AS IS" BASIS,
-@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@ See the License for the specific language governing permissions and
-@ limitations under the License.
-@
-
-@
-@ NE10 Library : source/NE10_detmat.neon.s
-@
-
-
-
-
- .text
- .syntax unified
-
-.include "headers/NE10header.s"
-.include "source/NE10_detmat.neon.inc.s"
-
-
-
- .align 4
- .global detmat_2x2f_neon
- .thumb
- .thumb_func
-
-detmat_2x2f_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t detmat_2x2f(arm_float_t * dst,
- @ arm_mat2x2f_t * src,
- @ unsigned int count)
- @
- @ Writes the determinant of each of the count 2x2 matrices in src to dst.
- @
- @ r0: *dst & current dst entry's address
- @ r1: *src & current src entry's address
- @ r2: int count on entry; afterwards the number of matrices handled by
- @ the main loop (a multiple of 4, processed 4 matrices at a time)
- @
- @ r3: count % 4, the number of matrices left for the second loop at
- @ the end of the input array
- @
- @ Returns 0 in r0.
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- and r3, r2, #3 @ r3 = count % 4;
- sub r2, r2, r3 @ r2 = count - r3; the part handled by the main loop, r3 is what's left after it
-
- cbz r2, .L_check_mat2x2
-
- @ We load four 2x2 matrices each time, calculate their
- @ determinants, store the results in the destination
- @ memory address, and move onto the next four.
-
- @ load the 1st set of values
- @ (de-interleaved: q0 = c1r1, q1 = c1r2, q2 = c2r1, q3 = c2r2 of the four matrices)
- vld4.32 {d0, d2, d4, d6}, [r1]!
- vld4.32 {d1, d3, d5, d7}, [r1]!
- subs r2, r2, #4
-
- @ calculate values for current set: q15 = q0*q3 - q1*q2
- vmul.f32 q15, q0, q3
- vmls.f32 q15, q1, q2
-
- ble .L_mainloopend_mat2x2 @ flags come from the subs above: nothing more to load
-
-.L_mainloop_mat2x2:
- @ store the result for current set
- vst1.32 {q15}, [r0]!
-
- @ load the next set of values
- vld4.32 {d0, d2, d4, d6}, [r1]!
- vld4.32 {d1, d3, d5, d7}, [r1]!
- subs r2, r2, #4
-
- @ calculate values for next set
- vmul.f32 q15, q0, q3
- vmls.f32 q15, q1, q2
-
- bgt .L_mainloop_mat2x2 @ loop if r2 > 0, if we have at least another 4 matrices (16 floats) to process
-
-.L_mainloopend_mat2x2:
- @ the last iteration for this call
- @ store the result for the last set
- vst1.32 {q15}, [r0]!
-
-.L_check_mat2x2:
- @ check if anything left to process at the end of the input array
- cmp r3, #0
- ble .L_return_mat2x2
-
-.L_secondloop_mat2x2:
- @ process the last few items left in the input array
- vld1.32 {d0, d1}, [r1]! @ Load matrix [A]
-
- subs r3, r3, #1 @ sets the flags tested by the bgt below
-
- @ calculate det([A]) = |A|
- vrev64.32 d1, d1
- vmul.f32 d2, d0, d1
- vrev64.32 d2, d2
- vmls.f32 d2, d0, d1 @ At this point d2 = { -|A|, |A| }
-
- @ store the result which is in d2[1]
- vst1.32 {d2[1]}, [r0]!
-
- bgt .L_secondloop_mat2x2
-
-.L_return_mat2x2:
- @ return
- mov r0, #0
- bx lr
-
-
-
-
- .align 4
- .global detmat_3x3f_neon
- .thumb
- .thumb_func
-detmat_3x3f_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t detmat_3x3f(arm_float_t * dst,
- @ arm_mat3x3f_t * src,
- @ unsigned int count)
- @
- @ Writes one determinant per 3x3 input matrix to dst and returns 0.
- @ LOAD_3x3MATS_ARGS and GET_DETERMINANT_of_3x3MATS_ARGS are macros that
- @ are not defined in this file (presumably they come from the included
- @ source/NE10_detmat.neon.inc.s -- confirm there for register usage).
- @
- @ r0: *dst & current dst entry's address
- @ r1: *src & current src entry's address
- @ r2: int count; rounded down to a multiple of 4 and then used as the
- @ countdown of the main loop, which consumes two matrices per pass
- @
- @ r3: count % 4, the matrices left over for the one-at-a-time loop
- @ at the end of the input array
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- and r3, r2, #3 @ r3 = count % 4;
- sub r2, r2, r3 @ r2 = count - r3; the part handled by the main loop
-
- cmp r2, #0
- beq .L_check_mat3x3
-
- @ We load two 3x3 matrices each time, calculate their
- @ determinants, store the results in the destination
- @ memory address, and move onto the next two.
- @ The loop is software-pipelined: the result of one pair is stored
- @ at the top of the next iteration (or at .L_mainloopend_mat3x3).
-
- @ load the 1st set of values
- LOAD_3x3MATS_ARGS d0, d1, d2, d3, d4, d5, d16, d17, d18, d19, d20, d21, q0, q1, q2, q8, q9, q10, r1
- subs r2, r2, #2
-
- @ calculate values for the current set
- @ NOTE(review): the ble below relies on the macro leaving the flags
- @ from subs untouched -- confirm against the macro definition
- GET_DETERMINANT_of_3x3MATS_ARGS d0, d2, d4, d16, d18, d20, d1, d3, d5, d22, d24, d26
-
- ble .L_mainloopend_mat3x3
-
-.L_mainloop_mat3x3:
- @ store the result for the current set (d22 = two determinants)
- vst1.32 {d22}, [r0]!
-
- @ load the next set of values
- LOAD_3x3MATS_ARGS d0, d1, d2, d3, d4, d5, d16, d17, d18, d19, d20, d21, q0, q1, q2, q8, q9, q10, r1
- subs r2, r2, #2
-
- @ calculate values for the next set
- GET_DETERMINANT_of_3x3MATS_ARGS d0, d2, d4, d16, d18, d20, d1, d3, d5, d22, d24, d26
-
- bgt .L_mainloop_mat3x3 @ loop if r2 > 0, i.e. at least another 2 matrices (18 floats) to process
-
-.L_mainloopend_mat3x3:
- @ the last iteration for this call
- @ store the result for the last set
- vst1.32 {d22}, [r0]!
-
-.L_check_mat3x3:
- @ check if anything left to process at the end of the input array
- cmp r3, #0
- ble .L_return_mat3x3
-
-.L_secondloop_mat3x3:
- @ process the last few items left in the input array, one matrix per pass
-
- @ load one matrix: three single-lane vld3, each reading 3 consecutive floats
- @ into lane 0 of the listed registers
- vld3.32 { d0[0], d2[0], d4[0]}, [r1]!
- vld3.32 { d1[0], d3[0], d5[0]}, [r1]!
- vld3.32 {d16[0], d18[0], d20[0]}, [r1]!
-
- subs r3, r3, #1
-
- @ calculate the determinant of this matrix (note the argument order
- @ differs from the main loop because the data sits in different registers)
- GET_DETERMINANT_of_3x3MATS_ARGS d0, d2, d4, d1, d3, d5, d16, d18, d20, d22, d24, d26
-
- @ store the single result, which is in d22[0]
- vst1.32 {d22[0]}, [r0]!
-
- bgt .L_secondloop_mat3x3
-
-.L_return_mat3x3:
- @ return 0
-
- mov r0, #0
- bx lr
-
-
-
-
- .align 4
- .global detmat_4x4f_neon
- .thumb
- .thumb_func
-detmat_4x4f_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t detmat_4x4f(arm_float_t * dst,
- @ arm_mat4x4f_t * src,
- @ unsigned int count)
- @
- @ Writes one determinant per 4x4 input matrix to dst and returns 0.
- @ LOAD_4x4MATS_ARGS and GET_DETERMINANT_of_4x4MATS_ARGS are macros that
- @ are not defined in this file (presumably they come from the included
- @ source/NE10_detmat.neon.inc.s -- confirm there for register usage).
- @
- @ r0: *dst & current dst entry's address
- @ r1: *src & current src entry's address
- @ r2: int count; rounded down to a multiple of 4 and then used as the
- @ countdown of the main loop, which consumes two matrices per pass
- @
- @ r3: count % 4, the matrices left over for the one-at-a-time loop
- @ at the end of the input array
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- and r3, r2, #3 @ r3 = count % 4;
- sub r2, r2, r3 @ r2 = count - r3; the part handled by the main loop
-
- cmp r2, #0
- beq .L_check_mat4x4
-
-
- @ We load two 4x4 matrices each time, calculate their
- @ determinants, store the results in the destination
- @ memory address, and move onto the next two.
- @ The loop is software-pipelined: the result of one pair is stored
- @ at the top of the next iteration (or at .L_mainloopend_mat4x4).
-
- @ load the 1st set of values
- LOAD_4x4MATS_ARGS d0, d1, d2, d3, d4, d5, d6, d7, d16, d17, d18, d19, d20, d21, d22, d23, q0, q1, q2, q3, q8, q9, q10, q11, r1
- subs r2, r2, #2
-
- @ calculate values for the current set
- @ NOTE(review): the ble below relies on the macro leaving the flags
- @ from subs untouched -- confirm against the macro definition
- GET_DETERMINANT_of_4x4MATS_ARGS d0, d2, d4, d6, d16, d18, d20, d22, d1, d3, d5, d7, d17, d19, d21, d23, d24, d26, d28, d30, d25, d27
-
- ble .L_mainloopend_mat4x4
-
-.L_mainloop_mat4x4:
- @ store the result for the current set (d24 = two determinants)
- vst1.32 {d24}, [r0]!
-
- @ load the next set of values
- LOAD_4x4MATS_ARGS d0, d1, d2, d3, d4, d5, d6, d7, d16, d17, d18, d19, d20, d21, d22, d23, q0, q1, q2, q3, q8, q9, q10, q11, r1
- subs r2, r2, #2
-
- @ calculate values for the next set
- GET_DETERMINANT_of_4x4MATS_ARGS d0, d2, d4, d6, d16, d18, d20, d22, d1, d3, d5, d7, d17, d19, d21, d23, d24, d26, d28, d30, d25, d27
-
- bgt .L_mainloop_mat4x4 @ loop if r2 > 0, i.e. at least another 2 matrices (32 floats) to process
-
-.L_mainloopend_mat4x4:
- @ the last iteration for this call
- @ store the result for the last set
- vst1.32 {d24}, [r0]!
-
-.L_check_mat4x4:
- @ check if anything left to process at the end of the input array
- cmp r3, #0
- ble .L_return_mat4x4
-
-.L_secondloop_mat4x4:
- @ process the last few items left in the input array, one matrix per pass:
- @ four single-lane vld4, each reading 4 consecutive floats into lane 0
- @ of the listed registers
- vld4.32 { d0[0], d2[0], d4[0], d6[0]}, [r1]!
- vld4.32 { d1[0], d3[0], d5[0], d7[0]}, [r1]!
- vld4.32 { d16[0], d18[0], d20[0], d22[0]}, [r1]!
- vld4.32 { d17[0], d19[0], d21[0], d23[0]}, [r1]!
-
-
-
- subs r3, r3, #1
-
- @ calculate values (note the argument order differs from the main loop
- @ because the data sits in different registers)
- GET_DETERMINANT_of_4x4MATS_ARGS d0, d2, d4, d6, d1, d3, d5, d7, d16, d18, d20, d22, d17, d19, d21, d23, d24, d26, d28, d30, d25, d27
-
- @ store the single result, which is in d24[0]
- vst1.32 {d24[0]}, [r0]!
-
- bgt .L_secondloop_mat4x4
-
-.L_return_mat4x4:
- @ return 0
- mov r0, #0
- bx lr
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : source/NE10_detmat_test.c
- */
-
-//Make sure the following are defined before including "unit_test.h"
-
-// length of the data arrays
-#define ARRLEN TEST_ARRLEN_MATRICES
-// number of the operations in a given unit
-#define OP_COUNT 3
-// number of the different implementations of each of the functions (C, ASM, NEON, ...)
-#define IMPL_COUNT 3
-
-
-#include "../headers/unit_test_detmat_operation_x.h"
-
-extern arm_result_t detmat_2x2f_c (arm_float_t * dst, arm_mat2x2f_t * src, unsigned int count);
-extern arm_result_t detmat_2x2f_neon(arm_float_t * dst, arm_mat2x2f_t * src, unsigned int count);
-
-extern arm_result_t detmat_3x3f_c (arm_float_t * dst, arm_mat3x3f_t * src, unsigned int count);
-extern arm_result_t detmat_3x3f_neon(arm_float_t * dst, arm_mat3x3f_t * src, unsigned int count);
-
-extern arm_result_t detmat_4x4f_c (arm_float_t * dst, arm_mat4x4f_t * src, unsigned int count);
-extern arm_result_t detmat_4x4f_neon(arm_float_t * dst, arm_mat4x4f_t * src, unsigned int count);
-
-/*
- * Fills the global function table ftbl with the determinant routines under
- * test. Slots are grouped per operation (2x2, 3x3, 4x4) in the order
- * C, ASM, NEON; the ASM slot reuses the C routine because no plain-assembly
- * implementation is referenced here.
- */
-void init_ftbl()
-{
-    static const arm_func_3args_t impl_table[] =
-    {
-        (arm_func_3args_t) detmat_2x2f_c,
-        (arm_func_3args_t) detmat_2x2f_c,    /* C routine stands in for ASM */
-        (arm_func_3args_t) detmat_2x2f_neon,
-
-        (arm_func_3args_t) detmat_3x3f_c,
-        (arm_func_3args_t) detmat_3x3f_c,    /* C routine stands in for ASM */
-        (arm_func_3args_t) detmat_3x3f_neon,
-
-        (arm_func_3args_t) detmat_4x4f_c,
-        (arm_func_3args_t) detmat_4x4f_c,    /* C routine stands in for ASM */
-        (arm_func_3args_t) detmat_4x4f_neon
-    };
-    unsigned int slot;
-
-    /* copy the table into ftbl[0..8], one slot at a time */
-    for ( slot = 0; slot < sizeof( impl_table ) / sizeof( impl_table[0] ); slot++ )
-    {
-        ftbl[ slot ] = impl_table[ slot ];
-    }
-}
-
-/* Test entry point: hands the command line to run_test and returns its result. */
-arm_result_t main( int argc, char **argv )
-{
-    const arm_result_t status = run_test( argc, argv ); // run_test is defined in "unit_test.h"
-    return status;
-}
+++ /dev/null
-@
-@ Copyright 2011-12 ARM Limited
-@
-@ Licensed under the Apache License, Version 2.0 (the "License");
-@ you may not use this file except in compliance with the License.
-@ You may obtain a copy of the License at
-@
-@ http://www.apache.org/licenses/LICENSE-2.0
-@
-@ Unless required by applicable law or agreed to in writing, software
-@ distributed under the License is distributed on an "AS IS" BASIS,
-@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@ See the License for the specific language governing permissions and
-@ limitations under the License.
-@
-
-@
-@ NE10 Library : source/NE10_div.asm.s
-@
-
- .text
- .syntax unified
-
-.include "headers/NE10header.s"
-
-        .balign 4
-        .global div_float_asm
-        .thumb
-        .thumb_func
-
-div_float_asm:
-        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-        @
-        @ Scalar VFP element-wise division: dst[i] = src1[i] / src2[i]
-        @ for i in [0, count). One float is handled per iteration.
-        @
-        @ r0: dst cursor  (advanced by 4 bytes per element)
-        @ r1: src1 cursor (advanced by 4 bytes per element)
-        @ r2: src2 cursor (advanced by 4 bytes per element)
-        @ r3: count, used directly as the countdown
-        @
-        @ Returns NE10_OK in r0. Uses s1, s2 and s10 as scratch.
-        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
-        cbz             r3, .L_div_float_done           @ nothing to do for count == 0
-
-.L_div_float_loop:
-        vldmia          r1!, {s1}                       @ s1 = *src1++
-        vldmia          r2!, {s2}                       @ s2 = *src2++
-        vdiv.f32        s10, s1, s2                     @ s10 = s1 / s2
-        vstmia          r0!, {s10}                      @ *dst++ = s10
-        subs            r3, r3, #1                      @ one element fewer to go
-        bne             .L_div_float_loop               @ repeat until the countdown reaches zero
-
-.L_div_float_done:
-        mov r0, NE10_OK                                 @ Return NE10_OK
-        bx              lr
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : source/NE10_div.c
- */
-
-#include "NE10.h"
-#include "../headers/macros.h"
-
-#include <assert.h>
-
-/*
- * Element-wise float division: dst[itr] = src1[itr] / src2[itr].
- * The statement is handed to NE10_X_OPERATION_FLOAT_C, a project macro that is
- * not defined in this file; presumably it runs the statement for every itr in
- * [0, count) and supplies the return value -- confirm against the macro.
- */
-arm_result_t div_float_c(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count)
-{
- NE10_X_OPERATION_FLOAT_C
- (
- dst[ itr ] = src1[ itr ] / src2[ itr ];
- );
-}
-
-/*
- * Component-wise division of 2D vectors: for each element, x and y of src1
- * are divided by the matching component of src2 and written to dst.
- * The statements are handed to NE10_X_OPERATION_FLOAT_C, a project macro that
- * is not defined in this file; presumably it runs them for every itr in
- * [0, count) and supplies the return value -- confirm against the macro.
- */
-arm_result_t vdiv_vec2f_c(arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count)
-{
- NE10_X_OPERATION_FLOAT_C
- (
- dst[ itr ].x = src1[ itr ].x / src2[ itr ].x;
- dst[ itr ].y = src1[ itr ].y / src2[ itr ].y;
- );
-}
-
-/*
- * Component-wise division of 3D vectors: for each element, x, y and z of src1
- * are divided by the matching component of src2 and written to dst.
- * The statements are handed to NE10_X_OPERATION_FLOAT_C, a project macro that
- * is not defined in this file; presumably it runs them for every itr in
- * [0, count) and supplies the return value -- confirm against the macro.
- */
-arm_result_t vdiv_vec3f_c(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count)
-{
- NE10_X_OPERATION_FLOAT_C
- (
- dst[ itr ].x = src1[ itr ].x / src2[ itr ].x;
- dst[ itr ].y = src1[ itr ].y / src2[ itr ].y;
- dst[ itr ].z = src1[ itr ].z / src2[ itr ].z;
- );
-}
-
-/*
- * Component-wise division of 4D vectors: for each element, x, y, z and w of
- * src1 are divided by the matching component of src2 and written to dst.
- * The statements are handed to NE10_X_OPERATION_FLOAT_C, a project macro that
- * is not defined in this file; presumably it runs them for every itr in
- * [0, count) and supplies the return value -- confirm against the macro.
- */
-arm_result_t vdiv_vec4f_c(arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count)
-{
- NE10_X_OPERATION_FLOAT_C
- (
- dst[ itr ].x = src1[ itr ].x / src2[ itr ].x;
- dst[ itr ].y = src1[ itr ].y / src2[ itr ].y;
- dst[ itr ].z = src1[ itr ].z / src2[ itr ].z;
- dst[ itr ].w = src1[ itr ].w / src2[ itr ].w;
- );
-}
+++ /dev/null
-@
-@ Copyright 2011-12 ARM Limited
-@
-@ Licensed under the Apache License, Version 2.0 (the "License");
-@ you may not use this file except in compliance with the License.
-@ You may obtain a copy of the License at
-@
-@ http://www.apache.org/licenses/LICENSE-2.0
-@
-@ Unless required by applicable law or agreed to in writing, software
-@ distributed under the License is distributed on an "AS IS" BASIS,
-@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@ See the License for the specific language governing permissions and
-@ limitations under the License.
-@
-
-@
-@ NE10 Library : source/NE10_div.neon.s
-@
-
- .text
- .syntax unified
-
-.include "headers/NE10header.s"
-
-
-
-
-        .align 4
-        .global div_float_neon
-        .thumb
-        .thumb_func
-
-div_float_neon:
-        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-        @
-        @ arm_result_t div_float(arm_float_t * dst,
-        @                        arm_float_t * src1,
-        @                        arm_float_t * src2,
-        @                        unsigned int count)
-        @
-        @ Approximate element-wise division dst[i] = src1[i] / src2[i].
-        @ The quotient is formed as src1 * (1/src2), where the reciprocal is
-        @ the vrecpe estimate refined by a single vrecps step, so results are
-        @ not bit-exact with a true divide.
-        @
-        @ r0: *dst  & current dst entry's address
-        @ r1: *src1 & current src1 entry's address
-        @ r2: *src2 & current src2 entry's address
-        @ r3: int count; becomes count / 4, the number of 4-float chunks
-        @     handled by the main loop
-        @
-        @ r4: count % 4, the residual floats, which are processed one at a
-        @     time at the BEGINNING of the arrays (saved/restored on the stack)
-        @
-        @ Every load happens inside the iteration that consumes it, so nothing
-        @ is read beyond the count elements of src1/src2.
-        @ Returns 0 in r0.
-        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
-        push            {r4}
-        and             r4, r3, #3                      @ r4 = count % 4; residual loop trip count
-        lsr             r3, r3, #2                      @ r3 = count / 4 (count is unsigned); main loop trip count
-
-        cbz             r4, .L_check_mainloop_float
-
-.L_residualloop_float:
-        @ process the residual items in the input array, one float per pass
-        vld1.f32        d0[0], [r1]!                    @ Fill in d0[0] with src1[i]
-        vld1.f32        d1[0], [r2]!                    @ Fill in d1[0] with src2[i]
-
-        subs            r4, r4, #1
-
-        @ d0 = d0 / d1 (only lane 0 is meaningful and stored)
-        vrecpe.f32      d3, d1                          @ d3 ~= 1 / d1
-        vrecps.f32      d1, d3, d1                      @ d1 = 2 - d3 * d1 (refinement factor)
-        vmul.f32        d3, d1, d3                      @ d3 = refined reciprocal
-        vmul.f32        d0, d0, d3
-
-        vst1.32         {d0[0]}, [r0]!
-
-        bgt             .L_residualloop_float           @ flags still come from the subs above
-
-.L_check_mainloop_float:
-        cbz             r3, .L_return_float
-
-.L_mainloop_float:
-        @ load the current set of values (4 floats from each source)
-        vld1.32         {q0}, [r1]!
-        vld1.32         {q1}, [r2]!
-
-        @ q3 = q0 / q1
-        vrecpe.f32      q3, q1
-        vrecps.f32      q1, q3, q1
-        vmul.f32        q3, q1, q3
-        vmul.f32        q3, q0, q3
-
-        @ store the result for the current set
-        vst1.32         {d6,d7}, [r0]!
-        subs            r3, r3, #1
-
-        bgt             .L_mainloop_float               @ loop if r3 > 0, i.e. at least another 4 floats to process
-
-.L_return_float:
-        @ return 0
-        pop             {r4}
-        mov             r0, #0
-        bx              lr
-
-
-
-
-        .align 4
-        .global vdiv_vec2f_neon
-        .thumb
-        .thumb_func
-
-vdiv_vec2f_neon:
-        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-        @
-        @ arm_result_t vdiv_vec2f(arm_vec2f_t * dst,
-        @                         arm_vec2f_t * src1,
-        @                         arm_vec2f_t * src2,
-        @                         unsigned int count)
-        @
-        @ Approximate component-wise division of 2-float vectors,
-        @ dst[i] = src1[i] / src2[i]. The quotient is formed as
-        @ src1 * (1/src2), where the reciprocal is the vrecpe estimate refined
-        @ by a single vrecps step, so results are not bit-exact with a divide.
-        @
-        @ r0: *dst  & current dst entry's address
-        @ r1: *src1 & current src1 entry's address
-        @ r2: *src2 & current src2 entry's address
-        @ r3: int count; becomes count / 4, the number of 4-vector chunks
-        @     handled by the main loop
-        @
-        @ r4: count % 4, the residual vectors, which are processed one at a
-        @     time at the BEGINNING of the arrays (saved/restored on the stack)
-        @
-        @ Every load happens inside the iteration that consumes it, so nothing
-        @ is read beyond the count vectors of src1/src2.
-        @ Returns 0 in r0.
-        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
-        push            {r4}
-        and             r4, r3, #3                      @ r4 = count % 4; residual loop trip count
-        lsr             r3, r3, #2                      @ r3 = count / 4 (count is unsigned); main loop trip count
-
-        cbz             r4, .L_check_mainloop_vec2
-
-.L_residualloop_vec2:
-        @ process the residual items in the input array, one vector per pass
-        vld1.f32        d0, [r1]!                       @ d0 = one src1 vector (2 floats)
-        vld1.f32        d1, [r2]!                       @ d1 = one src2 vector (2 floats)
-
-        subs            r4, r4, #1
-
-        @ calculate values
-        @ d0 = d0 / d1
-        vrecpe.f32      d4, d1
-        vrecps.f32      d1, d4, d1
-        vmul.f32        d4, d1, d4
-        vmul.f32        d0, d0, d4
-
-        vst1.32         {d0}, [r0]!
-
-        bgt             .L_residualloop_vec2            @ flags still come from the subs above
-
-.L_check_mainloop_vec2:
-        cbz             r3, .L_return_vec2
-
-.L_mainloop_vec2:
-        @ load the current set of values: vld2 de-interleaves four vectors so
-        @ that the even floats land in q0/q2 and the odd floats in q1/q3
-        vld2.32         {q0-q1}, [r1]!
-        vld2.32         {q2-q3}, [r2]!
-
-        @ calculate values for current set
-        @ q8 = q0 / q2
-        vrecpe.f32      q8, q2
-        vrecps.f32      q2, q8, q2
-        vmul.f32        q8, q2, q8
-        vmul.f32        q8, q0, q8
-
-        @ q9 = q1 / q3
-        vrecpe.f32      q9, q3
-        vrecps.f32      q3, q9, q3
-        vmul.f32        q9, q3, q9
-        vmul.f32        q9, q1, q9
-
-        @ store the result for current set (vst2 re-interleaves q8/q9)
-        vst2.32         {d16,d17,d18,d19}, [r0]!
-        subs            r3, r3, #1
-
-        bgt             .L_mainloop_vec2                @ loop if r3 > 0, i.e. at least another 4 vectors (8 floats) to process
-
-.L_return_vec2:
-        @ return 0
-        pop             {r4}
-        mov             r0, #0
-        bx              lr
-
-
-
-        .align 4
-        .global vdiv_vec3f_neon
-        .thumb
-        .thumb_func
-vdiv_vec3f_neon:
-        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-        @
-        @ arm_result_t vdiv_vec3f(arm_vec3f_t * dst,
-        @                         arm_vec3f_t * src1,
-        @                         arm_vec3f_t * src2,
-        @                         unsigned int count)
-        @
-        @ Approximate component-wise division of 3-float vectors,
-        @ dst[i] = src1[i] / src2[i]. The quotient is formed as
-        @ src1 * (1/src2), where the reciprocal is the vrecpe estimate refined
-        @ by a single vrecps step, so results are not bit-exact with a divide.
-        @
-        @ r0: *dst  & current dst entry's address
-        @ r1: *src1 & current src1 entry's address
-        @ r2: *src2 & current src2 entry's address
-        @ r3: int count; becomes count / 4, the number of 4-vector chunks
-        @     handled by the main loop
-        @
-        @ r4: count % 4, the residual vectors, which are processed one at a
-        @     time at the BEGINNING of the arrays (saved/restored on the stack)
-        @
-        @ Every load happens inside the iteration that consumes it, so nothing
-        @ is read beyond the count vectors of src1/src2.
-        @ Returns 0 in r0.
-        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
-        push            {r4}
-        and             r4, r3, #3                      @ r4 = count % 4; residual loop trip count
-        lsr             r3, r3, #2                      @ r3 = count / 4 (count is unsigned); main loop trip count
-
-        cbz             r4, .L_check_mainloop_vec3
-
-.L_residualloop_vec3:
-        @ process the residual items in the input array, one vector per pass
-        vld3.f32        {d0[0], d2[0], d4[0]}, [r1]!    @ The values are loaded like so:
-                                                        @ q0 = { V1.x, -, -, - };
-                                                        @ q1 = { V1.y, -, -, - };
-                                                        @ q2 = { V1.z, -, -, - };
-        vld3.f32        {d1[0], d3[0], d5[0]}, [r2]!    @ The values are loaded like so:
-                                                        @ q0 = { V1.x, -, V2.x, - };
-                                                        @ q1 = { V1.y, -, V2.y, - };
-                                                        @ q2 = { V1.z, -, V2.z, - };
-
-        subs            r4, r4, #1
-
-        @ x: d0 = d0 / d1 (only lane 0 is meaningful and stored)
-        vrecpe.f32      d18, d1
-        vrecps.f32      d1 , d18, d1
-        vmul.f32        d18, d1 , d18
-        vmul.f32        d0 , d0 , d18
-
-        @ y: d2 = d2 / d3
-        vrecpe.f32      d20, d3
-        vrecps.f32      d3 , d20, d3
-        vmul.f32        d20, d3 , d20
-        vmul.f32        d2 , d2 , d20
-
-        @ z: d4 = d4 / d5
-        vrecpe.f32      d22, d5
-        vrecps.f32      d5 , d22, d5
-        vmul.f32        d22, d5 , d22
-        vmul.f32        d4 , d4 , d22
-
-        vst3.32         {d0[0], d2[0], d4[0]}, [r0]!
-
-        bgt             .L_residualloop_vec3            @ flags still come from the subs above
-
-.L_check_mainloop_vec3:
-        cbz             r3, .L_return_vec3
-
-.L_mainloop_vec3:
-        @ load current set of values: each vld3 de-interleaves two vectors, so
-        @ q0/q1/q2 hold x/y/z of four src1 vectors and q9/q10/q11 those of src2
-        vld3.32         {d0, d2, d4}, [r1]!
-        vld3.32         {d1, d3, d5}, [r1]!
-        vld3.32         {d18, d20, d22}, [r2]!
-        vld3.32         {d19, d21, d23}, [r2]!
-
-        @ calculate values for current set
-        @ q12 = q0 / q9
-        vrecpe.f32      q12, q9
-        vrecps.f32      q9 , q12, q9
-        vmul.f32        q12, q9 , q12
-        vmul.f32        q12, q0 , q12
-
-        @ q13 = q1 / q10
-        vrecpe.f32      q13, q10
-        vrecps.f32      q10 , q13, q10
-        vmul.f32        q13, q10 , q13
-        vmul.f32        q13, q1 , q13
-
-        @ q14 = q2 / q11
-        vrecpe.f32      q14, q11
-        vrecps.f32      q11 , q14, q11
-        vmul.f32        q14, q11 , q14
-        vmul.f32        q14, q2 , q14
-
-        @ store the result for current set (vst3 re-interleaves x/y/z)
-        vst3.32         {d24, d26, d28}, [r0]!
-        vst3.32         {d25, d27, d29}, [r0]!
-        subs            r3, r3, #1
-
-        bgt             .L_mainloop_vec3                @ loop if r3 > 0, i.e. at least another 4 vectors (12 floats) to process
-
-.L_return_vec3:
-        @ return 0
-        pop             {r4}
-        mov             r0, #0
-        bx              lr
-
-
-
-
-        .align 4
-        .global vdiv_vec4f_neon
-        .thumb
-        .thumb_func
-vdiv_vec4f_neon:
-        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-        @
-        @ arm_result_t vdiv_vec4f(arm_vec4f_t * dst,
-        @                         arm_vec4f_t * src1,
-        @                         arm_vec4f_t * src2,
-        @                         unsigned int count)
-        @
-        @ Approximate component-wise division of 4-float vectors,
-        @ dst[i] = src1[i] / src2[i]. The quotient is formed as
-        @ src1 * (1/src2), where the reciprocal is the vrecpe estimate refined
-        @ by a single vrecps step, so results are not bit-exact with a divide.
-        @
-        @ r0: *dst  & current dst entry's address
-        @ r1: *src1 & current src1 entry's address
-        @ r2: *src2 & current src2 entry's address
-        @ r3: int count; becomes count / 4, the number of 4-vector chunks
-        @     handled by the main loop
-        @
-        @ r4: count % 4, the residual vectors, which are processed one at a
-        @     time at the BEGINNING of the arrays (saved/restored on the stack)
-        @
-        @ Every load happens inside the iteration that consumes it, so nothing
-        @ is read beyond the count vectors of src1/src2.
-        @ Returns 0 in r0.
-        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
-        push            {r4}
-        and             r4, r3, #3                      @ r4 = count % 4; residual loop trip count
-        lsr             r3, r3, #2                      @ r3 = count / 4 (count is unsigned); main loop trip count
-
-        cbz             r4, .L_check_mainloop_vec4
-
-.L_residualloop_vec4:
-        @ process the residual items in the input array, one vector per pass
-        vld1.f32        {d0, d1}, [r1]!                 @ The values are loaded like so:
-                                                        @ q0 = { V1.x, V1.y, V1.z, V1.w };
-        vld1.f32        {d2, d3}, [r2]!                 @ The values are loaded like so:
-                                                        @ q1 = { V2.x, V2.y, V2.z, V2.w };
-
-        subs            r4, r4, #1
-
-        @ calculate values
-        @ q0 = q0 / q1
-        vrecpe.f32      q2, q1
-        vrecps.f32      q1 , q2, q1
-        vmul.f32        q2, q1 , q2
-        vmul.f32        q0 , q0 , q2
-
-        vst1.32         {d0, d1}, [r0]!
-
-        bgt             .L_residualloop_vec4            @ flags still come from the subs above
-
-.L_check_mainloop_vec4:
-        cbz             r3, .L_return_vec4
-
-.L_mainloop_vec4:
-        @ load the current set of values: each vld4 de-interleaves two vectors,
-        @ so q0..q3 hold x/y/z/w of four src1 vectors and q8..q11 those of src2
-        vld4.32         {d0, d2, d4, d6}, [r1]!
-        vld4.32         {d1, d3, d5, d7}, [r1]!
-        vld4.32         {d16, d18, d20, d22}, [r2]!
-        vld4.32         {d17, d19, d21, d23}, [r2]!
-
-        @ calculate values for current set
-        @ q12 = q0 / q8
-        vrecpe.f32      q12, q8
-        vrecps.f32      q8 , q12, q8
-        vmul.f32        q12, q8 , q12
-        vmul.f32        q12, q0 , q12
-
-        @ q13 = q1 / q9
-        vrecpe.f32      q13, q9
-        vrecps.f32      q9 , q13, q9
-        vmul.f32        q13, q9 , q13
-        vmul.f32        q13, q1 , q13
-
-        @ q14 = q2 / q10
-        vrecpe.f32      q14, q10
-        vrecps.f32      q10 , q14, q10
-        vmul.f32        q14, q10 , q14
-        vmul.f32        q14, q2 , q14
-
-        @ q15 = q3 / q11
-        vrecpe.f32      q15, q11
-        vrecps.f32      q11 , q15, q11
-        vmul.f32        q15, q11 , q15
-        vmul.f32        q15, q3 , q15
-
-        @ store the result for current set (vst4 re-interleaves x/y/z/w)
-        vst4.32         {d24, d26, d28, d30}, [r0]!
-        vst4.32         {d25, d27, d29, d31}, [r0]!
-        subs            r3, r3, #1
-
-        bgt             .L_mainloop_vec4                @ loop if r3 > 0, i.e. at least another 4 vectors (16 floats) to process
-
-.L_return_vec4:
-        @ return 0
-        pop             {r4}
-        mov             r0, #0
-        bx              lr
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : source/NE10_div_test.c
- */
-
-//Make sure the following are defined before including "unit_test.h"
-
-// length of the data arrays
-#define ARRLEN TEST_ARRLEN
-// number of the operations in a given unit
-#define OP_COUNT 4
-// number of the different implementations of each of the functions (C, ASM, NEON, ...)
-#define IMPL_COUNT 3
-
-
-#include "../headers/unit_test_x_operation_x_tolerant.h"
-
-extern arm_result_t div_float_c (arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count);
-//extern arm_result_t div_float_asm (arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count); // the assembly versions haven't been implemented; these are for future use
-extern arm_result_t div_float_neon(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count);
-
-extern arm_result_t vdiv_vec2f_c (arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
-//extern arm_result_t vdiv_vec2f_asm (arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
-extern arm_result_t vdiv_vec2f_neon(arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
-
-extern arm_result_t vdiv_vec3f_c (arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
-//extern arm_result_t vdiv_vec3f_asm (arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
-extern arm_result_t vdiv_vec3f_neon(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
-
-extern arm_result_t vdiv_vec4f_c (arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
-//extern arm_result_t vdiv_vec4f_asm (arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
-extern arm_result_t vdiv_vec4f_neon(arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
-
-/*
- * Fills the global function table ftbl with the division routines under test.
- * Slots are grouped per operation (float, vec2f, vec3f, vec4f) in the order
- * C, ASM, NEON; the ASM slot reuses the C routine because no plain-assembly
- * implementation is referenced here.
- */
-void init_ftbl()
-{
-    static const arm_func_4args_t impl_table[] =
-    {
-        (arm_func_4args_t) div_float_c,
-        (arm_func_4args_t) div_float_c,     /* C routine stands in for ASM */
-        (arm_func_4args_t) div_float_neon,
-
-        (arm_func_4args_t) vdiv_vec2f_c,
-        (arm_func_4args_t) vdiv_vec2f_c,    /* C routine stands in for ASM */
-        (arm_func_4args_t) vdiv_vec2f_neon,
-
-        (arm_func_4args_t) vdiv_vec3f_c,
-        (arm_func_4args_t) vdiv_vec3f_c,    /* C routine stands in for ASM */
-        (arm_func_4args_t) vdiv_vec3f_neon,
-
-        (arm_func_4args_t) vdiv_vec4f_c,
-        (arm_func_4args_t) vdiv_vec4f_c,    /* C routine stands in for ASM */
-        (arm_func_4args_t) vdiv_vec4f_neon
-    };
-    unsigned int slot;
-
-    /* copy the table into ftbl[0..11], one slot at a time */
-    for ( slot = 0; slot < sizeof( impl_table ) / sizeof( impl_table[0] ); slot++ )
-    {
-        ftbl[ slot ] = impl_table[ slot ];
-    }
-}
-
-/* Test entry point: hands the command line to run_test and returns its result. */
-arm_result_t main( int argc, char **argv )
-{
-    const arm_result_t status = run_test( argc, argv ); // run_test is defined in "unit_test.h"
-    return status;
-}
+++ /dev/null
-@
-@ Copyright 2011-12 ARM Limited
-@
-@ Licensed under the Apache License, Version 2.0 (the "License");
-@ you may not use this file except in compliance with the License.
-@ You may obtain a copy of the License at
-@
-@ http://www.apache.org/licenses/LICENSE-2.0
-@
-@ Unless required by applicable law or agreed to in writing, software
-@ distributed under the License is distributed on an "AS IS" BASIS,
-@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@ See the License for the specific language governing permissions and
-@ limitations under the License.
-@
-
-@
-@ NE10 Library : source/NE10_divc.asm.s
-@
-
- .text
- .syntax unified
-
-.include "headers/NE10header.s"
-
-        .balign 4
-        .global divc_float_asm
-        .thumb
-        .thumb_func
-
-divc_float_asm:
-        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-        @
-        @ arm_result_t divc_float(arm_float_t * dst,
-        @                         arm_float_t * src, const arm_float_t cst,
-        @                         unsigned int count)
-        @
-        @ Scalar VFP division of every element by one constant:
-        @ dst[i] = src[i] / cst for i in [0, count).
-        @
-        @ r0: *dst
-        @ r1: *src
-        @ r2: cst, the raw bits of the divisor passed BY VALUE in a core
-        @     register (it is moved into s3, never dereferenced)
-        @ r3: int count, used directly as the loop counter
-        @
-        @ r5: current item's byte offset in both src[] and dst[]
-        @ r6: current source item's address made of base(r1)+offset(r5)
-        @ r7: current destination item's address made of base(r0)+offset(r5)
-        @
-        @ r4-r7 are saved on entry and restored on exit.
-        @ Returns NE10_OK in r0.
-        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
-        push            {r4, r5, r6, r7}
-        cbz             r3, .LoopEndFloat               @ nothing to do for count == 0
-        mov             r5, #0
-        vmov            s3, r2                          @ Get cst into s3 once; it does not change inside the loop
-
-.LoopBeginFloat:
-        add             r6, r1, r5                      @ Get current source item's address in memory
-        vldr            s1, [r6, #0]                    @ Load src[i]
-        vdiv.f32        s10, s1, s3                     @ s10 = src[i] / cst
-        add             r7, r0, r5                      @ Get current destination item's address in memory
-        vstr            s10, [r7, #0]                   @ Store the result back into the main memory
-        add             r5, r5, #4                      @ increase the offset by 1*sizeof(float)
-        subs            r3, r3, #1                      @ one element fewer to go
-        bne             .LoopBeginFloat                 @ repeat until the countdown reaches zero
-
-.LoopEndFloat:
-        mov r0, NE10_OK                                 @ Return NE10_OK
-        pop             {r4, r5, r6, r7}
-        bx              lr
-
-
-
-
-        .balign 4
-        .global divc_vec2f_asm
-        .thumb
-        .thumb_func
-
-divc_vec2f_asm:
-        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-        @
-        @ arm_result_t divc_vec2f(arm_vec2f_t * dst,
-        @                         arm_vec2f_t * src, const arm_vec2f_t * cst,
-        @                         unsigned int count)
-        @
-        @ Scalar VFP division of every 2-float vector by one constant vector:
-        @ dst[i].x = src[i].x / cst->x, dst[i].y = src[i].y / cst->y.
-        @
-        @ r0: *dst
-        @ r1: *src
-        @ r2: *cst (re-read on every iteration)
-        @ r3: int count, used directly as the countdown
-        @
-        @ r5: byte offset of the current vector in both src[] and dst[]
-        @ r6: address of the current source vector, r1 + r5
-        @ r7: address of the current destination vector, r0 + r5
-        @
-        @ r4-r7 are saved on entry and restored on exit.
-        @ Returns NE10_OK in r0.
-        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
-        push            {r4-r7}
-        cbz             r3, .L_divc_vec2f_done          @ nothing to do for count == 0
-        mov             r5, #0
-
-.L_divc_vec2f_loop:
-        add             r6, r1, r5                      @ r6 = &src[i]
-        add             r7, r0, r5                      @ r7 = &dst[i]
-        vldr            s3, [r2]                        @ s3 = cst->x
-        vldr            s4, [r2, #4]                    @ s4 = cst->y
-        vldr            s1, [r6]                        @ s1 = src[i].x
-        vldr            s2, [r6, #4]                    @ s2 = src[i].y
-        vdiv.f32        s10, s1, s3                     @ x quotient
-        vdiv.f32        s11, s2, s4                     @ y quotient
-        vstr            s10, [r7]                       @ dst[i].x
-        vstr            s11, [r7, #4]                   @ dst[i].y
-        add             r5, r5, #8                      @ step to the next vector: 2*sizeof(float)
-        subs            r3, r3, #1                      @ one vector fewer to go
-        bne             .L_divc_vec2f_loop              @ repeat until the countdown reaches zero
-
-.L_divc_vec2f_done:
-        mov r0, NE10_OK                                 @ Return NE10_OK
-        pop             {r4-r7}
-        bx              lr
-
-
-
-
-        .balign 4
-        .global divc_vec3f_asm
-        .thumb
-        .thumb_func
-
-divc_vec3f_asm:
-        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-        @
-        @ arm_result_t divc_vec3f(arm_vec3f_t * dst,
-        @                         arm_vec3f_t * src, const arm_vec3f_t * cst,
-        @                         unsigned int count)
-        @
-        @ Scalar VFP division of every 3-float vector by one constant vector:
-        @ each of x, y, z of src[i] is divided by the matching component of
-        @ *cst and written to dst[i].
-        @
-        @ r0: *dst
-        @ r1: *src
-        @ r2: *cst (re-read on every iteration)
-        @ r3: int count, used directly as the countdown
-        @
-        @ r5: byte offset of the current vector in both src[] and dst[]
-        @ r6: address of the current source vector, r1 + r5
-        @ r7: address of the current destination vector, r0 + r5
-        @
-        @ r4-r7 are saved on entry and restored on exit.
-        @ Returns NE10_OK in r0.
-        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
-        push            {r4-r7}
-        cbz             r3, .L_divc_vec3f_done          @ nothing to do for count == 0
-        mov             r5, #0
-
-.L_divc_vec3f_loop:
-        add             r6, r1, r5                      @ r6 = &src[i]
-        add             r7, r0, r5                      @ r7 = &dst[i]
-        vldr            s4, [r2]                        @ s4 = cst->x
-        vldr            s5, [r2, #4]                    @ s5 = cst->y
-        vldr            s6, [r2, #8]                    @ s6 = cst->z
-        vldr            s1, [r6]                        @ s1 = src[i].x
-        vldr            s2, [r6, #4]                    @ s2 = src[i].y
-        vldr            s3, [r6, #8]                    @ s3 = src[i].z
-        vdiv.f32        s10, s1, s4                     @ x quotient
-        vdiv.f32        s11, s2, s5                     @ y quotient
-        vdiv.f32        s12, s3, s6                     @ z quotient
-        vstr            s10, [r7]                       @ dst[i].x
-        vstr            s11, [r7, #4]                   @ dst[i].y
-        vstr            s12, [r7, #8]                   @ dst[i].z
-        add             r5, r5, #12                     @ step to the next vector: 3*sizeof(float)
-        subs            r3, r3, #1                      @ one vector fewer to go
-        bne             .L_divc_vec3f_loop              @ repeat until the countdown reaches zero
-
-.L_divc_vec3f_done:
-        mov r0, NE10_OK                                 @ Return NE10_OK
-        pop             {r4-r7}
-        bx              lr
-
-
-
-
-        .balign 4
-        .global divc_vec4f_asm
-        .thumb
-        .thumb_func
-
-divc_vec4f_asm:
-        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-        @
-        @ arm_result_t divc_vec4f(arm_vec4f_t * dst,
-        @                         arm_vec4f_t * src, const arm_vec4f_t * cst,
-        @                         unsigned int count)
-        @
-        @ Scalar VFP division of every 4-float vector by one constant vector:
-        @ each of x, y, z, w of src[i] is divided by the matching component of
-        @ *cst and written to dst[i].
-        @
-        @ r0: *dst
-        @ r1: *src
-        @ r2: *cst (re-read on every iteration)
-        @ r3: int count, used directly as the countdown
-        @
-        @ r5: byte offset of the current vector in both src[] and dst[]
-        @ r6: address of the current source vector, r1 + r5
-        @ r7: address of the current destination vector, r0 + r5
-        @
-        @ r4-r7 are saved on entry and restored on exit.
-        @ Returns NE10_OK in r0.
-        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
-        push            {r4-r7}
-        cbz             r3, .L_divc_vec4f_done          @ nothing to do for count == 0
-        mov             r5, #0
-
-.L_divc_vec4f_loop:
-        add             r6, r1, r5                      @ r6 = &src[i]
-        add             r7, r0, r5                      @ r7 = &dst[i]
-        vldr            s5, [r2]                        @ s5 = cst->x
-        vldr            s6, [r2, #4]                    @ s6 = cst->y
-        vldr            s7, [r2, #8]                    @ s7 = cst->z
-        vldr            s8, [r2, #12]                   @ s8 = cst->w
-        vldr            s1, [r6]                        @ s1 = src[i].x
-        vldr            s2, [r6, #4]                    @ s2 = src[i].y
-        vldr            s3, [r6, #8]                    @ s3 = src[i].z
-        vldr            s4, [r6, #12]                   @ s4 = src[i].w
-        vdiv.f32        s10, s1, s5                     @ x quotient
-        vdiv.f32        s11, s2, s6                     @ y quotient
-        vdiv.f32        s12, s3, s7                     @ z quotient
-        vdiv.f32        s13, s4, s8                     @ w quotient
-        vstr            s10, [r7]                       @ dst[i].x
-        vstr            s11, [r7, #4]                   @ dst[i].y
-        vstr            s12, [r7, #8]                   @ dst[i].z
-        vstr            s13, [r7, #12]                  @ dst[i].w
-        add             r5, r5, #16                     @ step to the next vector: 4*sizeof(float)
-        subs            r3, r3, #1                      @ one vector fewer to go
-        bne             .L_divc_vec4f_loop              @ repeat until the countdown reaches zero
-
-.L_divc_vec4f_done:
-        mov r0, NE10_OK                                 @ Return NE10_OK
-        pop             {r4-r7}
-        bx              lr
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : source/NE10_divc.c
- */
-
-#include "NE10.h"
-#include "../headers/macros.h"
-
-#include <assert.h>
-
-/*
- * divc_float_c: plain-C "divide by a constant" for scalar floats.
- * For each index itr the body stores src[itr] / cst in dst[itr].
- * The loop over itr and the returned status are supplied by
- * NE10_XC_OPERATION_X_C (macros.h, not visible here) -- presumably one
- * pass over `count` items; confirm against the macro definition.
- */
-arm_result_t divc_float_c(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count)
-{
- NE10_XC_OPERATION_X_C
- (
- dst[ itr ] = src[ itr ] / cst;
- );
-}
-
-/*
- * divc_vec2f_c: plain-C component-wise division of 2D vectors by *cst.
- * For each index itr: dst[itr].x = src[itr].x / cst->x, likewise for y.
- * Iteration and return value come from NE10_XC_OPERATION_X_C
- * (macros.h, not visible here; behaviour to be confirmed there).
- */
-arm_result_t divc_vec2f_c(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count)
-{
- NE10_XC_OPERATION_X_C
- (
- dst[ itr ].x = src[ itr ].x / cst->x;
- dst[ itr ].y = src[ itr ].y / cst->y;
- );
-}
-
-/*
- * divc_vec3f_c: plain-C component-wise division of 3D vectors by *cst.
- * For each index itr the x, y and z components of src[itr] are divided by
- * the matching component of *cst and written to dst[itr].
- * Iteration and return value come from NE10_XC_OPERATION_X_C
- * (macros.h, not visible here; behaviour to be confirmed there).
- */
-arm_result_t divc_vec3f_c(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count)
-{
- NE10_XC_OPERATION_X_C
- (
- dst[ itr ].x = src[ itr ].x / cst->x;
- dst[ itr ].y = src[ itr ].y / cst->y;
- dst[ itr ].z = src[ itr ].z / cst->z;
- );
-}
-
-/*
- * divc_vec4f_c: plain-C component-wise division of 4D vectors by *cst.
- * For each index itr the x, y, z and w components of src[itr] are divided
- * by the matching component of *cst and written to dst[itr].
- * Iteration and return value come from NE10_XC_OPERATION_X_C
- * (macros.h, not visible here; behaviour to be confirmed there).
- */
-arm_result_t divc_vec4f_c(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count)
-{
- NE10_XC_OPERATION_X_C
- (
- dst[ itr ].x = src[ itr ].x / cst->x;
- dst[ itr ].y = src[ itr ].y / cst->y;
- dst[ itr ].z = src[ itr ].z / cst->z;
- dst[ itr ].w = src[ itr ].w / cst->w;
- );
-}
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : source/NE10_divc.neon.c
- */
-
-#include "NE10.h"
-#include "../headers/macros.h"
-
-#include <assert.h>
-#include <arm_neon.h>
-
- #include <stdio.h>
- #include <stdlib.h>
-
-/*
- * divc_float_neon: NEON-intrinsics version of "divide floats by a constant".
- *
- * Division is done as a multiplication by a reciprocal: vrecpe gives an
- * initial estimate of 1/cst and each vmul(vrecps(cst, rec), rec) line is one
- * Newton-Raphson refinement (vrecps returns 2 - cst*rec). The result is
- * therefore an approximation of src/cst, not a correctly rounded quotient.
- *
- * The two code fragments are handed to NE10_XC_OPERATION_FLOAT_NEON
- * (macros.h, not visible here): the first works on 128-bit values
- * (n_src / n_cst / n_dst), the second on 64-bit values (n_tmp_src /
- * n_tmp_cst). Those names, the loop and the return value come from the
- * macro -- presumably a 4-wide main loop plus a tail path; TODO confirm.
- */
-arm_result_t divc_float_neon(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count)
-{
- /* NOTE(review): ii and d are not referenced in the text of this function;
-    they may be needed by the macro expansion -- confirm before removing. */
- unsigned int ii = 0;
- float d[4];
- NE10_XC_OPERATION_FLOAT_NEON
- (
- /* a single division operation */
- float32x4_t rec = vrecpeq_f32( n_cst );
- rec = vmulq_f32(vrecpsq_f32(n_cst, rec), rec);
- rec = vmulq_f32(vrecpsq_f32(n_cst, rec), rec);
- n_dst = vmulq_f32( n_src , rec );
- ,
- /* a single division operation */
- float32x2_t rec = vrecpe_f32( n_tmp_cst );
- rec = vmul_f32(vrecps_f32(n_tmp_cst, rec), rec);
- rec = vmul_f32(vrecps_f32(n_tmp_cst, rec), rec);
- n_tmp_src = vmul_f32( n_tmp_src, rec );
- );
-}
-
-/*
- * divc_vec2f_neon: NEON-intrinsics component-wise division of 2D vectors
- * by *cst, computed as src * (1/cst).
- *
- * The reciprocal is a vrecpe estimate refined by two Newton-Raphson steps
- * (vrecps returns 2 - cst*rec), so the quotient is an approximation.
- * The first fragment uses 128-bit registers, the second 64-bit ones;
- * n_src / n_cst / n_dst / n_tmp_src / n_tmp_cst, the loop and the return
- * value are supplied by NE10_XC_OPERATION_VEC2F_NEON (macros.h, not
- * visible here; confirm details there).
- */
-arm_result_t divc_vec2f_neon(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count)
-{
- NE10_XC_OPERATION_VEC2F_NEON
- (
- /* a single division operation */
- float32x4_t rec = vrecpeq_f32( n_cst );
- rec = vmulq_f32(vrecpsq_f32(n_cst, rec), rec);
- rec = vmulq_f32(vrecpsq_f32(n_cst, rec), rec);
- n_dst = vmulq_f32( n_src , rec );
- ,
- /* a single division operation */
- float32x2_t rec = vrecpe_f32( n_tmp_cst );
- rec = vmul_f32(vrecps_f32(n_tmp_cst, rec), rec);
- rec = vmul_f32(vrecps_f32(n_tmp_cst, rec), rec);
- n_tmp_src = vmul_f32( n_tmp_src, rec );
- );
-}
-
-/*
- * divc_vec3f_neon: NEON-intrinsics component-wise division of 3D vectors
- * by *cst, computed as src * (1/cst).
- *
- * The same reciprocal sequence is repeated three times in each fragment:
- * a vrecpe estimate followed by two Newton-Raphson refinements (vrecps
- * returns 2 - cst*rec), then a multiply. The quotients are approximations.
- *
- * First fragment: three 128-bit register triples (n_src1..3, n_cst1..3,
- * n_dst1..3). Second fragment: the three 64-bit halves n_tmp_src.val[0..2]
- * and n_tmp_cst.val[0..2], updated in place. All of these names, the loop
- * and the return value come from NE10_XC_OPERATION_VEC3F_NEON (macros.h,
- * not visible here; confirm the data layout there).
- */
-arm_result_t divc_vec3f_neon(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count)
-{
- NE10_XC_OPERATION_VEC3F_NEON
- (
- /* three division operations */
- float32x4_t rec = vrecpeq_f32( n_cst1 );
- rec = vmulq_f32(vrecpsq_f32(n_cst1, rec), rec);
- rec = vmulq_f32(vrecpsq_f32(n_cst1, rec), rec);
- n_dst1 = vmulq_f32( n_src1 , rec );
-
- rec = vrecpeq_f32( n_cst2 );
- rec = vmulq_f32(vrecpsq_f32(n_cst2, rec), rec);
- rec = vmulq_f32(vrecpsq_f32(n_cst2, rec), rec);
- n_dst2 = vmulq_f32( n_src2 , rec );
-
- rec = vrecpeq_f32( n_cst3 );
- rec = vmulq_f32(vrecpsq_f32(n_cst3, rec), rec);
- rec = vmulq_f32(vrecpsq_f32(n_cst3, rec), rec);
- n_dst3 = vmulq_f32( n_src3 , rec );
- ,
- /* three division operations */
- float32x2_t rec = vrecpe_f32( n_tmp_cst.val[0] );
- rec = vmul_f32(vrecps_f32(n_tmp_cst.val[0], rec), rec);
- rec = vmul_f32(vrecps_f32(n_tmp_cst.val[0], rec), rec);
- n_tmp_src.val[0] = vmul_f32( n_tmp_src.val[0] , rec );
-
- rec = vrecpe_f32( n_tmp_cst.val[1] );
- rec = vmul_f32(vrecps_f32(n_tmp_cst.val[1], rec), rec);
- rec = vmul_f32(vrecps_f32(n_tmp_cst.val[1], rec), rec);
- n_tmp_src.val[1] = vmul_f32( n_tmp_src.val[1] , rec );
-
- rec = vrecpe_f32( n_tmp_cst.val[2] );
- rec = vmul_f32(vrecps_f32(n_tmp_cst.val[2], rec), rec);
- rec = vmul_f32(vrecps_f32(n_tmp_cst.val[2], rec), rec);
- n_tmp_src.val[2] = vmul_f32( n_tmp_src.val[2] , rec );
- );
-}
-
-/*
- * divc_vec4f_neon: NEON-intrinsics component-wise division of 4D vectors
- * by *cst, computed as src * (1/cst).
- *
- * Only one fragment is passed to the macro here. The reciprocal is a
- * vrecpe estimate refined by two Newton-Raphson steps (vrecps returns
- * 2 - cst*rec), so the quotient is an approximation. n_src / n_cst /
- * n_dst, the loop and the return value come from
- * NE10_XC_OPERATION_VEC4F_NEON (macros.h, not visible here).
- */
-arm_result_t divc_vec4f_neon(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count)
-{
- NE10_XC_OPERATION_VEC4F_NEON
- (
- /* a single division operation */
- float32x4_t rec = vrecpeq_f32( n_cst );
- rec = vmulq_f32(vrecpsq_f32(n_cst, rec), rec);
- rec = vmulq_f32(vrecpsq_f32(n_cst, rec), rec);
- n_dst = vmulq_f32( n_src , rec );
- );
-}
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : source/NE10_divc_test.c
- */
-
-//Make sure the following are defined before including "unit_test.h"
-
-// length of the data arrays
-#define ARRLEN TEST_ARRLEN
-// number of the operations in a given unit
-#define OP_COUNT 4
-// number of the different implementations of each of the functions (C, ASM, NEON, ...)
-#define IMPL_COUNT 3
-
-
-#include "../headers/unit_test_xc_operation_x.h"
-
-// Fills the global function table ftbl[] with the divc implementations under
-// test. Entries are grouped per operation (float, vec2f, vec3f, vec4f) and,
-// within each group, ordered C, ASM, NEON.
-void init_ftbl()
-{
- static const arm_func_4args_t divc_impls[] =
- {
- (arm_func_4args_t) divc_float_c, (arm_func_4args_t) divc_float_asm, (arm_func_4args_t) divc_float_neon,
- (arm_func_4args_t) divc_vec2f_c, (arm_func_4args_t) divc_vec2f_asm, (arm_func_4args_t) divc_vec2f_neon,
- (arm_func_4args_t) divc_vec3f_c, (arm_func_4args_t) divc_vec3f_asm, (arm_func_4args_t) divc_vec3f_neon,
- (arm_func_4args_t) divc_vec4f_c, (arm_func_4args_t) divc_vec4f_asm, (arm_func_4args_t) divc_vec4f_neon
- };
- unsigned int slot = 0;
-
- // copy the twelve entries into slots 0..11 of the global table
- for ( slot = 0; slot < sizeof(divc_impls) / sizeof(divc_impls[0]); slot++ )
- {
- ftbl[ slot ] = divc_impls[ slot ];
- }
-}
-
-// Test entry point: forwards the command line to the shared test driver and
-// hands its status back to the caller.
-arm_result_t main( int argc, char **argv )
-{
- const arm_result_t status = run_test( argc, argv ); // run_test is defined in "unit_test.h"
- return status;
-}
+++ /dev/null
-@
-@ Copyright 2011-12 ARM Limited
-@
-@ Licensed under the Apache License, Version 2.0 (the "License");
-@ you may not use this file except in compliance with the License.
-@ You may obtain a copy of the License at
-@
-@ http://www.apache.org/licenses/LICENSE-2.0
-@
-@ Unless required by applicable law or agreed to in writing, software
-@ distributed under the License is distributed on an "AS IS" BASIS,
-@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@ See the License for the specific language governing permissions and
-@ limitations under the License.
-@
-
-@
-@ NE10 Library : source/NE10_dot.asm.s
-@
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : source/NE10_dot.c
- */
-
-#include "NE10.h"
-#include "../headers/macros.h"
-
-#include <assert.h>
-
-/*
- * dot_vec2f_c: plain-C dot product of paired 2D vectors.
- * For each index itr: dst[itr] = src1[itr] . src2[itr] (x*x + y*y).
- * The loop over itr and the return value are supplied by
- * NE10_DOT_OPERATION_X_C (macros.h, not visible here; confirm there).
- */
-arm_result_t dot_vec2f_c(arm_float_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count)
-{
- NE10_DOT_OPERATION_X_C
- (
- dst[ itr ] = src1[ itr ].x * src2[ itr ].x +
- src1[ itr ].y * src2[ itr ].y ;
- );
-}
-
-/*
- * dot_vec3f_c: plain-C dot product of paired 3D vectors.
- * For each index itr: dst[itr] = x*x + y*y + z*z of src1[itr] and src2[itr].
- * The loop over itr and the return value are supplied by
- * NE10_DOT_OPERATION_X_C (macros.h, not visible here; confirm there).
- */
-arm_result_t dot_vec3f_c(arm_float_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count)
-{
- NE10_DOT_OPERATION_X_C
- (
- dst[ itr ] = src1[ itr ].x * src2[ itr ].x +
- src1[ itr ].y * src2[ itr ].y +
- src1[ itr ].z * src2[ itr ].z ;
- );
-}
-
-/*
- * dot_vec4f_c: plain-C dot product of paired 4D vectors.
- * For each index itr: dst[itr] = x*x + y*y + z*z + w*w of src1[itr] and
- * src2[itr]. The loop over itr and the return value are supplied by
- * NE10_DOT_OPERATION_X_C (macros.h, not visible here; confirm there).
- */
-arm_result_t dot_vec4f_c(arm_float_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count)
-{
- NE10_DOT_OPERATION_X_C
- (
- dst[ itr ] = src1[ itr ].x * src2[ itr ].x +
- src1[ itr ].y * src2[ itr ].y +
- src1[ itr ].z * src2[ itr ].z +
- src1[ itr ].w * src2[ itr ].w ;
- );
-}
+++ /dev/null
-@
-@ Copyright 2011-12 ARM Limited
-@
-@ Licensed under the Apache License, Version 2.0 (the "License");
-@ you may not use this file except in compliance with the License.
-@ You may obtain a copy of the License at
-@
-@ http://www.apache.org/licenses/LICENSE-2.0
-@
-@ Unless required by applicable law or agreed to in writing, software
-@ distributed under the License is distributed on an "AS IS" BASIS,
-@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@ See the License for the specific language governing permissions and
-@ limitations under the License.
-@
-
-@
-@ NE10 Library : source/NE10_dot.neon.s
-@
-
- .text
- .syntax unified
-
-.include "headers/NE10header.s"
-
-
-
-
- .align 4
- .global dot_vec2f_neon
- .thumb
- .thumb_func
-
-dot_vec2f_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t dot_vec2f_neon(arm_float_t * dst,
- @ arm_vec2f_t * src1,
- @ arm_vec2f_t * src2,
- @ unsigned int count)
- @
- @ Writes dst[i] = src1[i].x*src2[i].x + src1[i].y*src2[i].y for every
- @ i in [0, count). The first (count % 4) vectors are processed one at a
- @ time, the remaining ones four at a time. Always returns 0 in r0.
- @
- @ r0: *dst & current dst entry's address
- @ r1: *src1 & current src1 entry's address
- @ r2: *src2 & current src2 entry's address
- @ r3: int count & the number of groups of 4 vectors left to process
- @
- @ r4: the number of residual items, processed at the beginning of
- @ the input array
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- push {r4}
- and r4, r3, #3 @ r4 = count % 4; calculate the residual loop
- lsr r3, r3, #2 @ r3 = count / 4; logical shift because count is unsigned
-
- cbz r4, .L_check_mainloop_vec2
-
-.L_residualloop_vec2:
- @ process the residual items in the input array
- vld1.f32 d0, [r1]!
- vld1.f32 d1, [r2]!
-
- subs r4, r4, #1
-
- @ calculate values
- vmul.f32 d0, d0, d1
- vpadd.f32 d0, d0
-
- vst1.32 {d0[0]}, [r0]!
-
- bgt .L_residualloop_vec2
-
-.L_check_mainloop_vec2:
- cbz r3, .L_return_vec2
-
-.L_mainloop_vec2:
- @ load the current group of 4 vectors, de-interleaved:
- @ q0/q2 receive the x components, q1/q3 the y components.
- @ Loading at the top of the loop (instead of preloading the next group
- @ at the bottom) means nothing is read beyond the end of the arrays.
- vld2.32 {q0-q1}, [r1]!
- vld2.32 {q2-q3}, [r2]!
-
- subs r3, r3, #1 @ the NEON instructions below leave the flags untouched
-
- @ calculate values for current set
- vmul.f32 q8, q0, q2
- vmla.f32 q8, q1, q3
-
- @ store the result for current set
- vst1.32 {d16,d17}, [r0]!
-
- bne .L_mainloop_vec2 @ loop while there is at least another group of 4 vectors (8 floats)
-
-.L_return_vec2:
- @ return
- pop {r4}
- mov r0, #0
- bx lr
-
-
-
-
- .align 4
- .global dot_vec3f_neon
- .thumb
- .thumb_func
-dot_vec3f_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t dot_vec3f_neon(arm_float_t * dst,
- @ arm_vec3f_t * src1,
- @ arm_vec3f_t * src2,
- @ unsigned int count)
- @
- @ Writes dst[i] = dot(src1[i], src2[i]) over x, y and z for every
- @ i in [0, count). The first (count % 4) vectors are processed one at a
- @ time, the remaining ones four at a time. Always returns 0 in r0.
- @
- @ r0: *dst & current dst entry's address
- @ r1: *src1 & current src1 entry's address
- @ r2: *src2 & current src2 entry's address
- @ r3: int count & the number of groups of 4 vectors left to process
- @
- @ r4: the number of residual items, processed at the beginning of
- @ the input array
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- push {r4}
- and r4, r3, #3 @ r4 = count % 4; calculate the residual loop
- lsr r3, r3, #2 @ r3 = count / 4; logical shift because count is unsigned
-
- cbz r4, .L_check_mainloop_vec3
-
-.L_residualloop_vec3:
- @ process the residual items in the input array
- vld3.f32 {d0[0], d2[0], d4[0]}, [r1]! @ The values are loaded like so:
- @ q0 = { V1.x, -, -, - };
- @ q1 = { V1.y, -, -, - };
- @ q2 = { V1.z, -, -, - };
- vld3.f32 {d1[0], d3[0], d5[0]}, [r2]! @ The values are loaded like so:
- @ q0 = { V1.x, -, V2.x, - };
- @ q1 = { V1.y, -, V2.y, - };
- @ q2 = { V1.z, -, V2.z, - };
-
- subs r4, r4, #1
-
- @ calculate values; only lane 0 of d0 is meaningful and stored
- vmul.f32 d0, d0, d1
- vmla.f32 d0, d2, d3
- vmla.f32 d0, d4, d5
-
- vst1.32 {d0[0]}, [r0]!
-
- bgt .L_residualloop_vec3
-
-.L_check_mainloop_vec3:
- cbz r3, .L_return_vec3
-
-.L_mainloop_vec3:
- @ load the current group of 4 vectors, de-interleaved so that
- @ q0/q8 hold the x, q1/q9 the y and q2/q10 the z components.
- @ Loading at the top of the loop (instead of preloading the next group
- @ at the bottom) means nothing is read beyond the end of the arrays.
- vld3.32 {d0, d2, d4}, [r1]!
- vld3.32 {d1, d3, d5}, [r1]!
- vld3.32 {d16, d18, d20}, [r2]!
- vld3.32 {d17, d19, d21}, [r2]!
-
- subs r3, r3, #1 @ the NEON instructions below leave the flags untouched
-
- @ calculate values for current set
- vmul.f32 q15, q0, q8
- vmla.f32 q15, q1, q9
- vmla.f32 q15, q2, q10
-
- @ store the result for current set
- vst1.32 {d30, d31}, [r0]!
-
- bne .L_mainloop_vec3 @ loop while there is at least another group of 4 vectors (12 floats)
-
-.L_return_vec3:
- @ return
- pop {r4}
- mov r0, #0
- bx lr
-
-
-
-
- .align 4
- .global dot_vec4f_neon
- .thumb
- .thumb_func
-dot_vec4f_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t dot_vec4f_neon(arm_float_t * dst,
- @ arm_vec4f_t * src1,
- @ arm_vec4f_t * src2,
- @ unsigned int count)
- @
- @ Writes dst[i] = dot(src1[i], src2[i]) over x, y, z and w for every
- @ i in [0, count). The first (count % 4) vectors are processed one at a
- @ time, the remaining ones four at a time. Always returns 0 in r0.
- @
- @ r0: *dst & current dst entry's address
- @ r1: *src1 & current src1 entry's address
- @ r2: *src2 & current src2 entry's address
- @ r3: int count & the number of groups of 4 vectors left to process
- @
- @ r4: the number of residual items, processed at the beginning of
- @ the input array
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- push {r4}
- and r4, r3, #3 @ r4 = count % 4; calculate the residual loop
- lsr r3, r3, #2 @ r3 = count / 4; logical shift because count is unsigned
-
- cbz r4, .L_check_mainloop_vec4
-
-.L_residualloop_vec4:
- @ process the residual items in the input array
- vld1.f32 {d0, d1}, [r1]! @ The values are loaded like so:
- @ q0 = { V1.x, V1.y, V1.z, V1.w };
- vld1.f32 {d2, d3}, [r2]! @ The values are loaded like so:
- @ q1 = { V2.x, V2.y, V2.z, V2.w };
-
- subs r4, r4, #1
-
- @ calculate values: four products, then a horizontal sum into d0[0]
- vmul.f32 q0, q0, q1
- vadd.f32 d0, d0, d1
- vpadd.f32 d0, d0
-
- vst1.32 {d0[0]}, [r0]!
-
- bgt .L_residualloop_vec4
-
-.L_check_mainloop_vec4:
- cbz r3, .L_return_vec4
-
-.L_mainloop_vec4:
- @ load the current group of 4 vectors, de-interleaved so that
- @ q0/q8 hold the x, q1/q9 the y, q2/q10 the z and q3/q11 the w components.
- @ Loading at the top of the loop (instead of preloading the next group
- @ at the bottom) means nothing is read beyond the end of the arrays.
- vld4.32 {d0, d2, d4, d6}, [r1]!
- vld4.32 {d1, d3, d5, d7}, [r1]!
- vld4.32 {d16, d18, d20, d22}, [r2]!
- vld4.32 {d17, d19, d21, d23}, [r2]!
-
- subs r3, r3, #1 @ the NEON instructions below leave the flags untouched
-
- @ calculate values for current set
- vmul.f32 q15, q0, q8
- vmla.f32 q15, q1, q9
- vmla.f32 q15, q2, q10
- vmla.f32 q15, q3, q11
-
- @ store the result for current set
- vst1.32 {d30, d31}, [r0]!
-
- bne .L_mainloop_vec4 @ loop while there is at least another group of 4 vectors (16 floats)
-
-.L_return_vec4:
- @ return
- pop {r4}
- mov r0, #0
- bx lr
-
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : source/NE10_dot_test.c
- */
-
-//Make sure the following are defined before including "unit_test.h"
-
-// length of the data arrays
-#define ARRLEN TEST_ARRLEN
-// number of the operations in a given unit
-#define OP_COUNT 3
-// number of the different implementations of each of the functions (C, ASM, NEON, ...)
-#define IMPL_COUNT 3
-
-
-#include "../headers/unit_test_dot_operation_x.h"
-
-extern arm_result_t dot_vec2f_c (arm_float_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
-//extern arm_result_t dot_vec2f_asm (arm_float_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
-extern arm_result_t dot_vec2f_neon(arm_float_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
-
-extern arm_result_t dot_vec3f_c (arm_float_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
-//extern arm_result_t dot_vec3f_asm (arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
-extern arm_result_t dot_vec3f_neon(arm_float_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
-
-extern arm_result_t dot_vec4f_c (arm_float_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
-//extern arm_result_t dot_vec4f_asm (arm_floaf_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
-extern arm_result_t dot_vec4f_neon(arm_float_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
-
-// Fills the global function table ftbl[] with the dot-product implementations
-// under test. Entries are grouped per operation (vec2f, vec3f, vec4f) and,
-// within each group, ordered C, ASM, NEON. There is no assembly version, so
-// the C function is registered in the ASM slot as well.
-void init_ftbl()
-{
- static const arm_func_4args_t dot_impls[] =
- {
- (arm_func_4args_t) dot_vec2f_c, (arm_func_4args_t) dot_vec2f_c, (arm_func_4args_t) dot_vec2f_neon,
- (arm_func_4args_t) dot_vec3f_c, (arm_func_4args_t) dot_vec3f_c, (arm_func_4args_t) dot_vec3f_neon,
- (arm_func_4args_t) dot_vec4f_c, (arm_func_4args_t) dot_vec4f_c, (arm_func_4args_t) dot_vec4f_neon
- };
- unsigned int slot = 0;
-
- // copy the nine entries into slots 0..8 of the global table
- for ( slot = 0; slot < sizeof(dot_impls) / sizeof(dot_impls[0]); slot++ )
- {
- ftbl[ slot ] = dot_impls[ slot ];
- }
-}
-
-// Test entry point: forwards the command line to the shared test driver and
-// hands its status back to the caller.
-arm_result_t main( int argc, char **argv )
-{
- const arm_result_t status = run_test( argc, argv ); // run_test is defined in "unit_test.h"
- return status;
-}
+++ /dev/null
-@
-@ Copyright 2011-12 ARM Limited
-@
-@ Licensed under the Apache License, Version 2.0 (the "License");
-@ you may not use this file except in compliance with the License.
-@ You may obtain a copy of the License at
-@
-@ http://www.apache.org/licenses/LICENSE-2.0
-@
-@ Unless required by applicable law or agreed to in writing, software
-@ distributed under the License is distributed on an "AS IS" BASIS,
-@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@ See the License for the specific language governing permissions and
-@ limitations under the License.
-@
-
-@
-@ NE10 Library : source/NE10_identitymat.asm.s
-@
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : source/NE10_identitymat.c
- */
-
-#include "NE10.h"
-#include "../headers/macros.h"
-
-/*
- * identitymat_2x2f_c: plain-C routine that overwrites each dst[itr] with the
- * 2x2 identity matrix (1 on the diagonal c1.r1 / c2.r2, 0 elsewhere).
- * The loop over itr and the return value are supplied by
- * NE10_DETMAT_OPERATION_X_C (macros.h, not visible here; confirm there).
- */
-arm_result_t identitymat_2x2f_c(arm_mat2x2f_t * dst, unsigned int count)
-{
- // src is never read below; presumably it exists only because the macro
- // refers to a variable named src -- verify against macros.h.
- arm_mat2x2f_t *src = dst; // dummy placeholder
-
- NE10_DETMAT_OPERATION_X_C
- (
- dst[ itr ].c1.r1 = 1.0f;
- dst[ itr ].c1.r2 = 0.0f;
- dst[ itr ].c2.r1 = 0.0f;
- dst[ itr ].c2.r2 = 1.0f;
- );
-}
-
-/*
- * identitymat_3x3f_c: plain-C routine that overwrites each dst[itr] with the
- * 3x3 identity matrix (1 on the diagonal c1.r1 / c2.r2 / c3.r3, 0 elsewhere).
- * The loop over itr and the return value are supplied by
- * NE10_DETMAT_OPERATION_X_C (macros.h, not visible here; confirm there).
- */
-arm_result_t identitymat_3x3f_c(arm_mat3x3f_t * dst, unsigned int count)
-{
- // src is never read below; presumably it exists only because the macro
- // refers to a variable named src -- verify against macros.h.
- arm_mat3x3f_t *src = dst; // dummy placeholder
-
- NE10_DETMAT_OPERATION_X_C
- (
- dst[ itr ].c1.r1 = 1.0f;
- dst[ itr ].c1.r2 = 0.0f;
- dst[ itr ].c1.r3 = 0.0f;
-
- dst[ itr ].c2.r1 = 0.0f;
- dst[ itr ].c2.r2 = 1.0f;
- dst[ itr ].c2.r3 = 0.0f;
-
- dst[ itr ].c3.r1 = 0.0f;
- dst[ itr ].c3.r2 = 0.0f;
- dst[ itr ].c3.r3 = 1.0f;
- );
-}
-
-/*
- * identitymat_4x4f_c: plain-C routine that overwrites each dst[itr] with the
- * 4x4 identity matrix (1 on the diagonal c1.r1 .. c4.r4, 0 elsewhere).
- * The loop over itr and the return value are supplied by
- * NE10_DETMAT_OPERATION_X_C (macros.h, not visible here; confirm there).
- */
-arm_result_t identitymat_4x4f_c(arm_mat4x4f_t * dst, unsigned int count)
-{
- // src is never read below; presumably it exists only because the macro
- // refers to a variable named src -- verify against macros.h.
- arm_mat4x4f_t *src = dst; // dummy placeholder
-
- NE10_DETMAT_OPERATION_X_C
- (
- dst[ itr ].c1.r1 = 1.0f;
- dst[ itr ].c1.r2 = 0.0f;
- dst[ itr ].c1.r3 = 0.0f;
- dst[ itr ].c1.r4 = 0.0f;
-
- dst[ itr ].c2.r1 = 0.0f;
- dst[ itr ].c2.r2 = 1.0f;
- dst[ itr ].c2.r3 = 0.0f;
- dst[ itr ].c2.r4 = 0.0f;
-
- dst[ itr ].c3.r1 = 0.0f;
- dst[ itr ].c3.r2 = 0.0f;
- dst[ itr ].c3.r3 = 1.0f;
- dst[ itr ].c3.r4 = 0.0f;
-
- dst[ itr ].c4.r1 = 0.0f;
- dst[ itr ].c4.r2 = 0.0f;
- dst[ itr ].c4.r3 = 0.0f;
- dst[ itr ].c4.r4 = 1.0f;
- );
-}
+++ /dev/null
-@
-@ Copyright 2011-12 ARM Limited
-@
-@ Licensed under the Apache License, Version 2.0 (the "License");
-@ you may not use this file except in compliance with the License.
-@ You may obtain a copy of the License at
-@
-@ http://www.apache.org/licenses/LICENSE-2.0
-@
-@ Unless required by applicable law or agreed to in writing, software
-@ distributed under the License is distributed on an "AS IS" BASIS,
-@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@ See the License for the specific language governing permissions and
-@ limitations under the License.
-@
-
-@
-@ NE10 Library : source/NE10_identitymat.neon.s
-@
-
-
-
-
- .text
- .syntax unified
-
-.include "headers/NE10header.s"
-
-
-
-
- .balign 4
- .global identitymat_2x2f_neon
- .thumb
- .thumb_func
-
-identitymat_2x2f_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t identitymat_2x2f_neon(arm_mat2x2f_t * dst,
- @ unsigned int count)
- @
- @ Overwrites dst[0..count-1] with 2x2 identity matrices, i.e. the float
- @ sequence 1, 0, 0, 1 per matrix. Always returns 0 in r0.
- @
- @ r0: *dst & current dst entry's address
- @ r1: int count & the number of matrices written by the main loop,
- @ four per iteration (count rounded down to a multiple of 4)
- @
- @ r2: the number of matrices (count % 4) that are left to be written
- @ one at a time after the main loop
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- and r2, r1, #3 @ r2 = count % 4;
- sub r1, r1, r2 @ r1 = count - r2; the part handled by the main loop
-
- vmov.f32 d2, 0.0
- vmov.f32 d3, 0.0
- vmov.f32 d0, 1.0
- vmov.f32 d1, 1.0
-
-
- vmov q3, q0 @ q0 = q3 = all ones
- vmov q2, q1 @ q1 = q2 = all zeros
-
- cmp r1, #0
- beq .L_check_mat2x2
-
-.L_mainloop_mat2x2:
-
- subs r1, r1, #4
-
- @ each vst4 interleaves lanes as d0[i], d2[i], d4[i], d6[i] = 1, 0, 0, 1
- @ and therefore writes two identity matrices
- vst4.32 {d0, d2, d4, d6}, [r0]!
- vst4.32 {d1, d3, d5, d7}, [r0]!
-
- bgt .L_mainloop_mat2x2 @ loop if r1 > 0, if we have at least another 4 matrices (16 floats) to write
-
-.L_mainloopend_mat2x2:
-
-.L_check_mat2x2:
- @ check if anything left to process at the end of the output array
- cmp r2, #0
- ble .L_return_mat2x2
-
-.L_secondloop_mat2x2:
- @ write the last few matrices one at a time
- @ (a stray "vswp d18, d20" used to sit here; those registers are not
- @ used anywhere else in this routine, so it has been dropped)
-
- subs r2, r2, #1
-
- vst4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]!
-
- bgt .L_secondloop_mat2x2
-
-.L_return_mat2x2:
- @ return
- mov r0, #0
- bx lr
-
-
-
-
- .align 2
- .global identitymat_3x3f_neon
- .thumb
- .thumb_func
-identitymat_3x3f_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t identitymat_3x3f_neon(arm_mat3x3f_t * dst,
- @ unsigned int count)
- @
- @ Overwrites dst[0..count-1] with 3x3 identity matrices (nine floats
- @ each: 1,0,0, 0,1,0, 0,0,1). Always returns 0 in r0.
- @
- @ r0: *dst & current dst entry's address
- @ r1: int count & the number of matrices written by the main loop,
- @ two per iteration (count rounded down to a multiple of 4)
- @
- @ r2: the number of matrices (count % 4) that are left to be written
- @ one at a time after the main loop
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- and r2, r1, #3 @ r2 = count % 4;
- sub r1, r1, r2 @ r1 = count - r2; the part handled by the main loop
-
- vmov.f32 d2, 0.0
- vmov.f32 d3, 0.0
- vmov.f32 d0, 1.0
- vmov.f32 d1, 1.0
-
- @ clear q8-q13 (q1 is all zeros at this point)
- vmov q8 , q1
- vmov q9 , q1
- vmov q10, q1
- vmov q11, q1
- vmov q12, q1
- vmov q13, q1
-
- vtrn.32 d2, d0 @ d0 = {0.0f, 1.0f}
- vtrn.32 d1, d3 @ d1 = {1.0f, 0.0f}
-
- @ place the diagonal ones: q8-q10 describe one matrix, q11-q13 a second one
- vmov d16, d1
- vmov d18, d0
- vmov d21, d1
- vmov d22, d1
- vmov d24, d0
- vmov d27, d1
-
- cmp r1, #0
- beq .L_check_mat3x3
-
-.L_mainloop_mat3x3:
-
- subs r1, r1, #2
-
- @ each pair of vst3 writes 6 + 3 floats, i.e. one 3x3 matrix
- vst3.32 { d16 , d18 , d20 }, [r0]!
- vst3.32 { d17[0], d19[0], d21[0]}, [r0]!
- vst3.32 { d22 , d24 , d26 }, [r0]!
- vst3.32 { d23[0], d25[0], d27[0]}, [r0]!
-
- bgt .L_mainloop_mat3x3 @ loop if r1 > 0, if we have at least another 2 matrices (18 floats) to write
-
-.L_mainloopend_mat3x3:
-
-.L_check_mat3x3:
- @ check if anything left to process at the end of the output array
- cmp r2, #0
- ble .L_return_mat3x3
-
-.L_secondloop_mat3x3:
- @ write the last few matrices one at a time
-
- subs r2, r2, #1
-
- vst3.32 { d16 , d18 , d20 }, [r0]!
- vst3.32 { d17[0], d19[0], d21[0]}, [r0]!
-
- bgt .L_secondloop_mat3x3
-
-.L_return_mat3x3:
- @ return
- mov r0, #0
- bx lr
-
-
-
-
- .align 2
- .global identitymat_4x4f_neon
- .thumb
- .thumb_func
-identitymat_4x4f_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t identitymat_4x4f_neon(arm_mat4x4f_t * dst,
- @ unsigned int count)
- @
- @ Overwrites dst[0..count-1] with 4x4 identity matrices (sixteen floats
- @ each). Always returns 0 in r0.
- @
- @ r0: *dst & current dst entry's address
- @ r1: int count & the number of matrices written by the main loop,
- @ two per iteration (count rounded down to a multiple of 4)
- @
- @ r2: the number of matrices (count % 4) that are left to be written
- @ one at a time after the main loop
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- and r2, r1, #3 @ r2 = count % 4;
- sub r1, r1, r2 @ r1 = count - r2; the part handled by the main loop
-
- vmov.f32 d2, 0.0
- vmov.f32 d3, 0.0
- vmov.f32 d0, 1.0
- vmov.f32 d1, 1.0
-
- @ clear q8-q15 (q1 is all zeros at this point)
- vmov q8 , q1
- vmov q9 , q1
- vmov q10, q1
- vmov q11, q1
- vmov q12, q1
- vmov q13, q1
- vmov q14, q1
- vmov q15, q1
-
- vtrn.32 d2, d0 @ d0 = {0.0f, 1.0f}
- vtrn.32 d1, d3 @ d1 = {1.0f, 0.0f}
-
- @ place the diagonal ones of the first matrix (q8-q11)
- vmov d16, d1
- vmov d18, d0
- vmov d21, d1
- vmov d23, d0
-
- @ and of the second matrix (q12-q15)
- vmov d24, d1
- vmov d26, d0
- vmov d29, d1
- vmov d31, d0
-
- cmp r1, #0
- beq .L_check_mat4x4
-
-.L_mainloop_mat4x4:
-
- subs r1, r1, #2
-
- @ each pair of vst4 writes 8 + 8 floats, i.e. one 4x4 matrix
- vst4.32 { d16 , d18 , d20 , d22 }, [r0]!
- vst4.32 { d17 , d19 , d21 , d23 }, [r0]!
- vst4.32 { d24 , d26 , d28 , d30 }, [r0]!
- vst4.32 { d25 , d27 , d29 , d31 }, [r0]!
-
- bgt .L_mainloop_mat4x4 @ loop if r1 > 0, if we have at least another 2 matrices (32 floats) to write
-
-.L_mainloopend_mat4x4:
-
-.L_check_mat4x4:
- @ check if anything left to process at the end of the output array
- cmp r2, #0
- ble .L_return_mat4x4
-
-.L_secondloop_mat4x4:
- @ write the last few matrices one at a time
-
- subs r2, r2, #1
-
- vst4.32 { d16 , d18 , d20 , d22 }, [r0]!
- vst4.32 { d17 , d19 , d21 , d23 }, [r0]!
-
-
- bgt .L_secondloop_mat4x4
-
-.L_return_mat4x4:
- @ return
- mov r0, #0
- bx lr
-
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : source/NE10_identitymat_test.c
- */
-
-//Make sure the following are defined before including "unit_test.h"
-
-// length of the data arrays
-#define ARRLEN TEST_ARRLEN_MATRICES
-// number of the operations in a given unit
-#define OP_COUNT 3
-// number of the different implementations of each of the functions (C, ASM, NEON, ...)
-#define IMPL_COUNT 3
-
-
-#include "../headers/unit_test_identitymat_operation_x.h"
-
-extern arm_result_t identitymat_2x2f_c (arm_mat2x2f_t * dst, unsigned int count);
-extern arm_result_t identitymat_2x2f_neon(arm_mat2x2f_t * dst, unsigned int count);
-
-extern arm_result_t identitymat_3x3f_c (arm_mat3x3f_t * dst, unsigned int count);
-extern arm_result_t identitymat_3x3f_neon(arm_mat3x3f_t * dst, unsigned int count);
-
-extern arm_result_t identitymat_4x4f_c (arm_mat4x4f_t * dst, unsigned int count);
-extern arm_result_t identitymat_4x4f_neon(arm_mat4x4f_t * dst, unsigned int count);
-
-// Fills the global function table ftbl[] with the identity-matrix
-// implementations under test. Entries are grouped per operation (2x2, 3x3,
-// 4x4) and, within each group, ordered C, ASM, NEON. There is no assembly
-// version, so the C function is registered in the ASM slot as well.
-void init_ftbl()
-{
- static const arm_func_2args_t identity_impls[] =
- {
- (arm_func_2args_t) identitymat_2x2f_c, (arm_func_2args_t) identitymat_2x2f_c, (arm_func_2args_t) identitymat_2x2f_neon,
- (arm_func_2args_t) identitymat_3x3f_c, (arm_func_2args_t) identitymat_3x3f_c, (arm_func_2args_t) identitymat_3x3f_neon,
- (arm_func_2args_t) identitymat_4x4f_c, (arm_func_2args_t) identitymat_4x4f_c, (arm_func_2args_t) identitymat_4x4f_neon
- };
- unsigned int slot = 0;
-
- // copy the nine entries into slots 0..8 of the global table
- for ( slot = 0; slot < sizeof(identity_impls) / sizeof(identity_impls[0]); slot++ )
- {
- ftbl[ slot ] = identity_impls[ slot ];
- }
-}
-
-// Test entry point: forwards the command line to the shared test driver and
-// hands its status back to the caller.
-arm_result_t main( int argc, char **argv )
-{
- const arm_result_t status = run_test( argc, argv ); // run_test is defined in "unit_test.h"
- return status;
-}
+++ /dev/null
-@
-@ Copyright 2011-12 ARM Limited
-@
-@ Licensed under the Apache License, Version 2.0 (the "License");
-@ you may not use this file except in compliance with the License.
-@ You may obtain a copy of the License at
-@
-@ http://www.apache.org/licenses/LICENSE-2.0
-@
-@ Unless required by applicable law or agreed to in writing, software
-@ distributed under the License is distributed on an "AS IS" BASIS,
-@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@ See the License for the specific language governing permissions and
-@ limitations under the License.
-@
-
-@
-@ NE10 Library : source/NE10_invmat.asm.s
-@
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : source/NE10_invmat.c
- */
-
-#include "NE10.h"
-#include "../headers/macros.h"
-#include "NE10_detmat.c.h"
-#include <math.h>
-
-#include <assert.h>
-
-// This macro is used to determine floating point values that are small enough to be considered nearly zero.
-// It evaluates to 1 when fabs(x) is below 1e-12 and to 0 otherwise.
-#define IS_FLOAT_NEAR_ZERO(x) ( ((fabs(x))<(1e-12)) ? 1 : 0 )
-
-/*
- * Inverts `count` 2x2 matrices.
- *
- * dst   : receives the inverse of each input matrix
- * src   : the matrices to invert
- * count : number of matrices in both arrays
- *
- * A determinant that IS_FLOAT_NEAR_ZERO reports as (nearly) zero is replaced
- * by 1.0f, so for such a matrix the stored result is the unscaled
- * rearrangement of its elements rather than a true inverse.
- *
- * Each input matrix is copied into a local before any element of dst is
- * written. The 3x3 and 4x4 variants already gather their inputs into locals
- * first; doing the same here keeps the result correct when dst and src refer
- * to the same array.
- * NOTE(review): whether NE10_DETMAT_OPERATION_X_C itself accepts dst == src
- * is not visible in this file - confirm against headers/macros.h.
- *
- * The loop (iterator `itr`) and the return statement are supplied by the
- * NE10_DETMAT_OPERATION_X_C macro.
- */
-arm_result_t invmat_2x2f_c(arm_mat2x2f_t * dst, arm_mat2x2f_t * src, unsigned int count)
-{
- float det = 0.0f;
- arm_mat2x2f_t m; // private copy of the matrix currently being inverted
-
- NE10_DETMAT_OPERATION_X_C
- (
- // Snapshot the input so that the writes to dst below cannot change it
- m = src[ itr ];
-
- det = DET2x2( &m );
-
- if ( 1 == IS_FLOAT_NEAR_ZERO(det) )
- {
- det = 1.0f;
- }
-
- det = 1.0f / det;
- dst[ itr ].c1.r1 = det * m.c2.r2;
- dst[ itr ].c1.r2 = -1 * det * m.c1.r2;
- dst[ itr ].c2.r1 = -1 * det * m.c2.r1;
- dst[ itr ].c2.r2 = det * m.c1.r1;
- );
-}
-
-/*
- * Inverts `count` 3x3 matrices.
- *
- * dst   : receives the inverse of each input matrix
- * src   : the matrices to invert
- * count : number of matrices in both arrays
- *
- * Every output element is (+/-) 1/det times the determinant of a 2x2 matrix
- * assembled from four elements of the input; this looks like the classical
- * cofactor/adjugate formula (TODO confirm the argument order expected by
- * createColumnMajorMatrix2x2). A determinant that IS_FLOAT_NEAR_ZERO reports
- * as (nearly) zero is replaced by 1.0f, so such matrices yield the unscaled
- * 2x2 determinants instead of a true inverse.
- *
- * The loop (iterator `itr`) and the return statement are supplied by the
- * NE10_DETMAT_OPERATION_X_C macro.
- */
-arm_result_t invmat_3x3f_c(arm_mat3x3f_t * dst, arm_mat3x3f_t * src, unsigned int count)
-{
- // Short-hand names for the nine elements of the current source matrix;
- // aa..cc are column 1, dd..ff column 2 and gg..ii column 3.
- #define aa (src[ itr ].c1.r1)
- #define bb (src[ itr ].c1.r2)
- #define cc (src[ itr ].c1.r3)
- #define dd (src[ itr ].c2.r1)
- #define ee (src[ itr ].c2.r2)
- #define ff (src[ itr ].c2.r3)
- #define gg (src[ itr ].c3.r1)
- #define hh (src[ itr ].c3.r2)
- #define ii (src[ itr ].c3.r3)
-
- float det = 0.0f;
- arm_mat2x2f_t A, B, C, D, E, F, G, H, I; // the nine 2x2 sub-matrices, one per output element
-
- NE10_DETMAT_OPERATION_X_C
- (
- det = DET3x3( &src[ itr ] );
-
- // Avoid dividing by a (nearly) zero determinant
- if ( 1 == IS_FLOAT_NEAR_ZERO(det) )
- {
- det = 1.0f;
- }
- det = 1.0f / det; // from here on det holds the reciprocal of the determinant
-
- // Calculate the coefficients
- // (all of them are gathered from src before dst is written)
- createColumnMajorMatrix2x2( &A, ee, ff, hh, ii );
- createColumnMajorMatrix2x2( &B, dd, ff, gg, ii );
- createColumnMajorMatrix2x2( &C, dd, ee, gg, hh );
- createColumnMajorMatrix2x2( &D, bb, cc, hh, ii );
- createColumnMajorMatrix2x2( &E, aa, cc, gg, ii );
- createColumnMajorMatrix2x2( &F, aa, bb, gg, hh );
- createColumnMajorMatrix2x2( &G, bb, cc, ee, ff );
- createColumnMajorMatrix2x2( &H, aa, cc, dd, ff );
- createColumnMajorMatrix2x2( &I, aa, bb, dd, ee );
-
- // Scale each 2x2 determinant by 1/det; the sign alternates in a checkerboard pattern
- dst[ itr ].c1.r1 = det * DET2x2( &A );
- dst[ itr ].c1.r2 = -1.0f * det * DET2x2( &D );
- dst[ itr ].c1.r3 = det * DET2x2( &G );
-
- dst[ itr ].c2.r1 = -1.0f * det * DET2x2( &B );
- dst[ itr ].c2.r2 = det * DET2x2( &E );
- dst[ itr ].c2.r3 = -1.0f * det * DET2x2( &H );
-
- dst[ itr ].c3.r1 = det * DET2x2( &C );
- dst[ itr ].c3.r2 = -1.0f * det * DET2x2( &F );
- dst[ itr ].c3.r3 = det * DET2x2( &I );
- );
-
- // The element short-hands are local to this function
- #undef aa
- #undef bb
- #undef cc
- #undef dd
- #undef ee
- #undef ff
- #undef gg
- #undef hh
- #undef ii
-}
-
-/*
- * Inverts `count` 4x4 matrices.
- *
- * dst   : receives the inverse of each input matrix
- * src   : the matrices to invert
- * count : number of matrices in both arrays
- *
- * Every output element is (+/-) 1/det times the determinant of a 3x3 matrix
- * assembled from nine elements of the input; this looks like the classical
- * cofactor/adjugate formula (TODO confirm the argument order expected by
- * createColumnMajorMatrix3x3). A determinant that IS_FLOAT_NEAR_ZERO reports
- * as (nearly) zero is replaced by 1.0f, so such matrices yield the unscaled
- * 3x3 determinants instead of a true inverse.
- *
- * The loop (iterator `itr`) and the return statement are supplied by the
- * NE10_DETMAT_OPERATION_X_C macro.
- */
-arm_result_t invmat_4x4f_c(arm_mat4x4f_t * dst, arm_mat4x4f_t * src, unsigned int count)
-{
- // Short-hand names for the sixteen elements of the current source matrix,
- // four per column: aa..dd = column 1, ee..hh = column 2,
- // ii..ll = column 3, mm..pp = column 4.
- #define aa (src[ itr ].c1.r1)
- #define bb (src[ itr ].c1.r2)
- #define cc (src[ itr ].c1.r3)
- #define dd (src[ itr ].c1.r4)
-
- #define ee (src[ itr ].c2.r1)
- #define ff (src[ itr ].c2.r2)
- #define gg (src[ itr ].c2.r3)
- #define hh (src[ itr ].c2.r4)
-
- #define ii (src[ itr ].c3.r1)
- #define jj (src[ itr ].c3.r2)
- #define kk (src[ itr ].c3.r3)
- #define ll (src[ itr ].c3.r4)
-
- #define mm (src[ itr ].c4.r1)
- #define nn (src[ itr ].c4.r2)
- #define oo (src[ itr ].c4.r3)
- #define pp (src[ itr ].c4.r4)
-
- float det = 0.0f;
- arm_mat3x3f_t A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P; // the sixteen 3x3 sub-matrices, one per output element
-
- NE10_DETMAT_OPERATION_X_C
- (
- det = DET4x4( &src[ itr ] );
-
- // Avoid dividing by a (nearly) zero determinant
- if ( 1 == IS_FLOAT_NEAR_ZERO(det) )
- {
- det = 1.0f;
- }
- det = 1.0f / det; // from here on det holds the reciprocal of the determinant
-
- // Calculate the coefficients
- // (all of them are gathered from src before dst is written)
- createColumnMajorMatrix3x3( &A, ff, gg, hh, jj, kk, ll, nn, oo, pp );
- createColumnMajorMatrix3x3( &B, ee, gg, hh, ii, kk, ll, mm, oo, pp );
- createColumnMajorMatrix3x3( &C, ee, ff, hh, ii, jj, ll, mm, nn, pp );
- createColumnMajorMatrix3x3( &D, ee, ff, gg, ii, jj, kk, mm, nn, oo );
- createColumnMajorMatrix3x3( &E, bb, cc, dd, jj, kk, ll, nn, oo, pp );
- createColumnMajorMatrix3x3( &F, aa, cc, dd, ii, kk, ll, mm, oo, pp );
- createColumnMajorMatrix3x3( &G, aa, bb, dd, ii, jj, ll, mm, nn, pp );
- createColumnMajorMatrix3x3( &H, aa, bb, cc, ii, jj, kk, mm, nn, oo );
- createColumnMajorMatrix3x3( &I, bb, cc, dd, ff, gg, hh, nn, oo, pp );
- createColumnMajorMatrix3x3( &J, aa, cc, dd, ee, gg, hh, mm, oo, pp );
- createColumnMajorMatrix3x3( &K, aa, bb, dd, ee, ff, hh, mm, nn, pp );
- createColumnMajorMatrix3x3( &L, aa, bb, cc, ee, ff, gg, mm, nn, oo );
- createColumnMajorMatrix3x3( &M, bb, cc, dd, ff, gg, hh, jj, kk, ll );
- createColumnMajorMatrix3x3( &N, aa, cc, dd, ee, gg, hh, ii, kk, ll );
- createColumnMajorMatrix3x3( &O, aa, bb, dd, ee, ff, hh, ii, jj, ll );
- createColumnMajorMatrix3x3( &P, aa, bb, cc, ee, ff, gg, ii, jj, kk );
-
-
- // Scale each 3x3 determinant by 1/det; the sign alternates in a checkerboard pattern
- dst[ itr ].c1.r1 = det * DET3x3( &A );
- dst[ itr ].c1.r2 = -1.0f * det * DET3x3( &E );
- dst[ itr ].c1.r3 = det * DET3x3( &I );
- dst[ itr ].c1.r4 = -1.0f * det * DET3x3( &M );
-
- dst[ itr ].c2.r1 = -1.0f * det * DET3x3( &B );
- dst[ itr ].c2.r2 = det * DET3x3( &F );
- dst[ itr ].c2.r3 = -1.0f * det * DET3x3( &J );
- dst[ itr ].c2.r4 = det * DET3x3( &N );
-
- dst[ itr ].c3.r1 = det * DET3x3( &C );
- dst[ itr ].c3.r2 = -1.0f * det * DET3x3( &G );
- dst[ itr ].c3.r3 = det * DET3x3( &K );
- dst[ itr ].c3.r4 = -1.0f * det * DET3x3( &O );
-
- dst[ itr ].c4.r1 = -1.0f * det * DET3x3( &D );
- dst[ itr ].c4.r2 = det * DET3x3( &H );
- dst[ itr ].c4.r3 = -1.0f * det * DET3x3( &L );
- dst[ itr ].c4.r4 = det * DET3x3( &P );
- );
-
- // The element short-hands are local to this function
- #undef aa
- #undef bb
- #undef cc
- #undef dd
- #undef ee
- #undef ff
- #undef gg
- #undef hh
- #undef ii
- #undef jj
- #undef kk
- #undef ll
- #undef mm
- #undef nn
- #undef oo
- #undef pp
-}
+++ /dev/null
-@
-@ Copyright 2011-12 ARM Limited
-@
-@ Licensed under the Apache License, Version 2.0 (the "License");
-@ you may not use this file except in compliance with the License.
-@ You may obtain a copy of the License at
-@
-@ http://www.apache.org/licenses/LICENSE-2.0
-@
-@ Unless required by applicable law or agreed to in writing, software
-@ distributed under the License is distributed on an "AS IS" BASIS,
-@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@ See the License for the specific language governing permissions and
-@ limitations under the License.
-@
-
-@
-@ NE10 Library : source/NE10_invmat.neon.s
-@
-
-
-
-
- .text
- .syntax unified
-
-.include "headers/NE10header.s"
-.include "source/NE10_detmat.neon.inc.s"
-
-
-
-
-@ Four lanes of 1.0f. The inverse routines below load this into a NEON
-@ register and substitute it for every determinant that is too close to zero.
-CONST_FLOAT_ONE:
- .word 0x3f800000 @ This is the hex value for 1.0f in IEEE-754
- .word 0x3f800000
- .word 0x3f800000
- .word 0x3f800000
-
-@ Four lanes of the near-zero threshold. Determinants are compared against it
-@ with vacge, i.e. by absolute value.
-CONST_FLOAT_1Em12:
- .word 0x2B8CBCCC @ This is the hex representation of 1.0e-12 in IEEE-754
- .word 0x2B8CBCCC @ Any determinant smaller than this value is
- .word 0x2B8CBCCC @ considered near zero and refused for
- .word 0x2B8CBCCC @ calculating the inverse of a matrix.
-
-
-
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ This macro calculates the inverse of four 2x2 matrices.
- @ It reads in the matrices from registers q8-q11 (one matrix element
- @ per register, one matrix per lane) and returns its results in
- @ registers q12-q15 using the same arrangement.
- @
- @ Expects: q0 = near-zero threshold in every lane (CONST_FLOAT_1Em12)
- @          q1 = 1.0f in every lane (CONST_FLOAT_ONE)
- @ q8-q11, q0 and q1 are left unchanged. None of the instructions
- @ used here writes the APSR flags, so callers keep their loop
- @ condition across an invocation.
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- .macro GET_INVERSE_2x2MATS
- @ get the determinant of these four matrices in q15 ( q8*q11 - q9*q10 )
- vmul.f32 q15, q8, q11
- vmls.f32 q15, q9, q10
-
- @ compare them to find the ones that are too small and set those to 1.0f
- vacge.f32 q14, q15, q0 @ dst = q14: all ones in the lanes where |det| >= |threshold|, zero elsewhere
-
- vand.f32 q13, q14, q15 @ tmp = q13: the determinants that are large enough, zero in the other lanes
- vbic.s32 q14, q1, q14 @ NOTE: This must be of type S32, the type F32 only negates the sign bits
- vorr.f32 q14, q14, q13 @ at this point q14 lanes that are too small are set to one and the rest are the determinants
-
- @ q14 = 1.0f / q14 : reciprocal estimate in q15, refined by one vrecps step
- vrecpe.f32 q15, q14
- vrecps.f32 q14, q15, q14
- vmul.f32 q14, q14, q15
-
-
- @ now multiply all the entries with q14 = { 1/det(M1) ... 1/det(M4) }
- vmul.f32 q12, q11, q14
- vmul.f32 q15, q8, q14
-
- @ the two remaining entries are scaled by -1/det
- vneg.f32 q14, q14
-
- vmul.f32 q13, q9, q14
- vmul.f32 q14, q10, q14
-
- .endm
-
-
-
-
- .align 4
- .global invmat_2x2f_neon
- .thumb
- .thumb_func
-
-invmat_2x2f_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t invmat_2x2f(arm_mat2x2f_t * dst,
- @ arm_mat2x2f_t * src,
- @ unsigned int count)
- @
- @ Inverts `count` 2x2 matrices, four at a time in the main loop and
- @ one at a time for the remaining (count % 4) matrices.
- @ Always returns 0.
- @
- @ r0: *dst & current dst entry's address
- @ r1: *src & current src entry's address
- @ r2: int count & the number of items in the input array that can be
- @ processed in chunks of 4 matrices
- @
- @ r3: the number of items that are left to be processed at the end of
- @ the input array
- @ r4: scratch pointer used to load the two constants (saved/restored)
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- push {r4}
-
- @ q0 = near-zero threshold, q1 = 1.0f; both are inputs of GET_INVERSE_2x2MATS
- adr r4, CONST_FLOAT_1Em12
- vld1.32 {q0}, [r4]
- adr r4, CONST_FLOAT_ONE
- vld1.32 {q1}, [r4]
-
- and r3, r2, #3 @ r3 = count % 4;
- sub r2, r2, r3 @ count = count - r3; r2 is now the (multiple of 4) number of matrices handled by the main loop
-
- cmp r2, #0
- beq .L_check_mat2x2
-
- @ We load four 2x2 matrices each time, inverse them using the
- @ provided macro above, and store the four resulting matrices
- @ back into the memory location pointed to by the first parameter dst (r0)
-
- @ load the 1st set of values
- @ (vld4 de-interleaves: each of q8-q11 receives one matrix element of all four matrices)
- vld4.32 {d16, d18, d20, d22}, [r1]!
- vld4.32 {d17, d19, d21, d23}, [r1]!
- subs r2, r2, #4 @ 4 for this set
-
- @ calculate values for the 1st set
- GET_INVERSE_2x2MATS
-
- @ the flags tested here are still those of the "subs" above
- ble .L_mainloopend_mat2x2
-
-.L_mainloop_mat2x2:
- @ store the result for the current set
- vst4.32 {d24, d26, d28, d30}, [r0]!
- vst4.32 {d25, d27, d29, d31}, [r0]!
-
- @ load the next set of values
- vld4.32 {d16, d18, d20, d22}, [r1]!
- vld4.32 {d17, d19, d21, d23}, [r1]!
- subs r2, r2, #4
-
- @ calculate values for the next set
- GET_INVERSE_2x2MATS
-
-
- bgt .L_mainloop_mat2x2 @ loop if r2 > 0, if we have at least another 4 matrices (16 floats) to process
-
-.L_mainloopend_mat2x2:
- @ the last iteration for this call
- @ store the result for the last set
- vst4.32 {d24, d26, d28, d30}, [r0]!
- vst4.32 {d25, d27, d29, d31}, [r0]!
-
-.L_check_mat2x2:
- @ check if anything left to process at the end of the input array
- cmp r3, #0
- ble .L_return_mat2x2
-
-.L_secondloop_mat2x2:
- @ process the last few items left in the input array
- @ a single matrix is loaded into lane 0; the other lanes keep stale
- @ values whose results are computed by the macro but never stored
- vld4.32 {d16[0], d18[0], d20[0], d22[0]}, [r1]!
-
- subs r3, r3, #1
-
- @ calculate values
- GET_INVERSE_2x2MATS
-
- @ store the results
- vst4.32 {d24[0], d26[0], d28[0], d30[0]}, [r0]!
-
- bgt .L_secondloop_mat2x2
-
-.L_return_mat2x2:
- @ return
- pop {r4}
- mov r0, #0
- bx lr
-
-
-
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ This macro calculates the inverse of two 3x3 matrices.
- @ It reads in the matrices from registers q0-q5 and returns
- @ its results in registers q10-q15.
- @
- @ Expects: d12 = near-zero threshold in both lanes (low half of q6)
- @          d14 = 1.0f in both lanes (low half of q7)
- @ Overwrites d9 and d11 (halves of the input registers q4/q5) and q8.
- @ The determinant helper macros come from NE10_detmat.neon.inc.s;
- @ their last argument(s) are presumably result/scratch registers -
- @ TODO confirm against that file.
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- .macro GET_INVERSE_3x3MATS
- @ get the determinant of these two matrices in d16
- GET_DETERMINANT_of_3x3MATS_ARGS d0, d2, d4, d6, d8, d10, d1, d3, d5, d16, d9, d11 @ stores the results in d16
-
- @ compare them to find the ones that are too small and set those to 1.0f
- vacge.f32 d9, d16, d12 @ dst = d9 - the lanes that are too small are set to all (0)b
-
- vand.f32 d11, d9, d16 @ tmp = d11
- vbic.s32 d9, d14, d9 @ NOTE: This must be of type S32, the type F32 only negates the sign bits
- vorr.f32 d9, d9, d11 @ at this point d9 lanes that are too small are set to one and the rest are the determinants
-
- @ d16 = 1.0f / d9 : reciprocal estimate refined by one vrecps step
- vrecpe.f32 d16, d9
- vrecps.f32 d9, d16, d9
- vmul.f32 d16, d9, d16
-
- vmov.f32 d17, d16 @ So q8 = { d16={1/det(M1), 1/det(M2)}, d17={1/det(M1), 1/det(M2)} }
-
- @ get the coefficients in q10 to q15
- @ (one 2x2 determinant per output element, negated where the sign alternates)
- GET_DET_2x2MATS_ARGS d8, d10, d3, d5, d20
- GET_NEG_DET_2x2MATS_ARGS d6, d10, d1, d5, d26
- GET_DET_2x2MATS_ARGS d6, d8, d1, d3, d21
-
- GET_NEG_DET_2x2MATS_ARGS d2, d4, d3, d5, d22
- GET_DET_2x2MATS_ARGS d0, d4, d1, d5, d28
- GET_NEG_DET_2x2MATS_ARGS d0, d2, d1, d3, d23
-
- GET_DET_2x2MATS_ARGS d2, d4, d8, d10, d24
- GET_NEG_DET_2x2MATS_ARGS d0, d4, d6, d10, d30
- GET_DET_2x2MATS_ARGS d0, d2, d6, d8, d25
-
-
-
- @ now multiply all the entries with q8 = { d16={1/det(M1), 1/det(M2)}, d17={1/det(M1), 1/det(M2)} }
-
- vmul.f32 q10, q10, q8
- vmul.f32 q11, q11, q8
- vmul.f32 q12, q12, q8
-
- vmul.f32 q13, q13, q8
- vmul.f32 q14, q14, q8
- vmul.f32 q15, q15, q8
-
- .endm
-
-
-
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ This macro stores two 3x3 matrices returned by the above macro
- @ GET_INVERSE_3x3MATS from registers q10-q15 and into the memory
- @ address pointed to by the register r0 (dst).
- @ r0 is advanced past the 18 floats written; q10-q15 are modified
- @ by the vtrn instructions.
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- .macro STORE_3x3INVMATS
- @ rearrange the results for use in a "vst3" instruction...
- @ (afterwards q10-q12 are written out as the first matrix and q13-q15 as the second)
- vtrn.32 q10, q13
- vtrn.32 q11, q14
- vtrn.32 q12, q15
-
- @ first matrix: 6 floats followed by the remaining 3
- vst3.32 { d20 , d22 , d24 }, [r0]!
- vst3.32 { d21[0], d23[0], d25[0]}, [r0]!
- @ second matrix: 6 floats followed by the remaining 3
- vst3.32 { d26 , d28 , d30 }, [r0]!
- vst3.32 { d27[0], d29[0], d31[0]}, [r0]!
- .endm
-
-
-
-
- .align 4
- .global invmat_3x3f_neon
- .thumb
- .thumb_func
-invmat_3x3f_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t invmat_3x3f(arm_mat3x3f_t * dst,
- @ arm_mat3x3f_t * src,
- @ unsigned int count)
- @
- @ Inverts `count` 3x3 matrices, two at a time in the main loop and
- @ one at a time for the remaining (count % 4) matrices.
- @ Always returns 0.
- @
- @ r0: *dst & current dst entry's address
- @ r1: *src & current src entry's address
- @ r2: int count & the number of items in the input array that are
- @ processed by the main loop (a multiple of 4, consumed 2 at a time)
- @
- @ r3: the number of items that are left to be processed at the end
- @ of the input array
- @ r4: scratch pointer used to load the two constants (saved/restored)
- @ q4-q7 are saved on entry and restored before returning.
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- push {r4}
- vpush {q4, q5, q6, q7}
-
- @ q6 = near-zero threshold, q7 = 1.0f; GET_INVERSE_3x3MATS reads d12 and d14
- adr r4, CONST_FLOAT_1Em12
- vld1.32 {q6}, [r4]
- adr r4, CONST_FLOAT_ONE
- vld1.32 {q7}, [r4]
-
- and r3, r2, #3 @ r3 = count % 4;
- sub r2, r2, r3 @ count = count - r3; r2 is now the (multiple of 4) number of matrices handled by the main loop
-
- cmp r2, #0
- beq .L_check_mat3x3
-
- @ We load two 3x3 matrices each time, inverse them using the
- @ provided macro above, and store the two resulting matrices
- @ back into the memory location pointed to by the first parameter dst (r0)
-
- @ load the 1st set of values
- @ (LOAD_3x3MATS_ARGS is not defined in this file - presumably it comes from NE10_detmat.neon.inc.s)
- LOAD_3x3MATS_ARGS d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, q0, q1, q2, q3, q4, q5, r1
-
- subs r2, r2, #2 @ 2 for this set
-
- @ calculate values for the 1st set
- GET_INVERSE_3x3MATS
-
-
- @ NOTE(review): this branch relies on the flags set by the "subs" above
- @ surviving the macro - confirm the included helper macros do not alter them
- ble .L_mainloopend_mat3x3
-
-.L_mainloop_mat3x3:
- @ store the result for the current set
- STORE_3x3INVMATS
-
- @ load the next set of values
- LOAD_3x3MATS_ARGS d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, q0, q1, q2, q3, q4, q5, r1
- subs r2, r2, #2
-
- @ calculate values for the next set
- GET_INVERSE_3x3MATS
-
- bgt .L_mainloop_mat3x3 @ loop if r2 > 0, if we have at least another 2 matrices (18 floats) to process
-
-.L_mainloopend_mat3x3:
- @ the last iteration for this call
- @ store the result for the last set
- STORE_3x3INVMATS
-
-
-.L_check_mat3x3:
- @ check if anything left to process at the end of the input array
- cmp r3, #0
- ble .L_return_mat3x3
-
-.L_secondloop_mat3x3:
- @ process the last few items left in the input array
- @ load the next (e.g. 3rd) set of values: one matrix, 6 floats then 3 floats
- vld3.32 { d0, d2, d4 }, [r1]!
- vld3.32 { d1[0], d3[0], d5[0] }, [r1]!
-
- @ bring the single matrix into the register layout the macro expects;
- @ the lanes belonging to a second matrix hold stale data that is never stored
- vtrn.32 q0, q3
- vtrn.32 q1, q4
- vtrn.32 q2, q5
-
- subs r3, r3, #1
-
- @ calculate values for the last (e.g. 3rd) set
- GET_INVERSE_3x3MATS
-
- @ store the result for the last (e.g. 3rd) set
- @ (same as STORE_3x3INVMATS but only the first matrix is written)
- vtrn.32 q10, q13
- vtrn.32 q11, q14
- vtrn.32 q12, q15
-
- vst3.32 { d20 , d22 , d24 }, [r0]!
- vst3.32 { d21[0], d23[0], d25[0]}, [r0]!
-
- bgt .L_secondloop_mat3x3
-
-.L_return_mat3x3:
- @ return
- vpop {q4, q5, q6, q7}
- pop {r4}
- mov r0, #0
- bx lr
-
-
-
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ This macro calculates the inverse of two 4x4 matrices.
- @ It reads in the matrices from registers q0-q7 and returns
- @ its results in registers q8-q15.
- @
- @ Expects: r4 = address of the near-zero threshold (CONST_FLOAT_1Em12)
- @          r5 = address of the 1.0f constant (CONST_FLOAT_ONE)
- @ The macro temporarily pushes 32 bytes (d16-d19) on the stack and
- @ pops them again before it ends. q0 is overwritten near the end.
- @ The determinant helper macros come from NE10_detmat.neon.inc.s;
- @ their trailing arguments are presumably result/scratch registers -
- @ TODO confirm against that file.
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- .macro GET_INVERSE_4x4MATS
- @ q10 = near-zero threshold, q11 = 1.0f (reloaded on every invocation
- @ because both registers are reused for results further down)
- vld1.32 {q10}, [r4]
- vld1.32 {q11}, [r5]
-
- @ get the determinant of these two matrices in d30
- GET_DETERMINANT_of_4x4MATS_ARGS d0, d2, d4, d6, d8, d10, d12, d14, d1, d3, d5, d7, d9, d11, d13, d15, d30, d28, d26, d31, d29, d27
-
- @ compare them to find the ones that are too small and set those to 1.0f
- vacge.f32 d24, d30, d20 @ dst = d24
-
- vand.f32 d25, d24, d30 @ tmp = d25
- vbic.s32 d24, d22, d24 @ NOTE: The instruction here must be of type S32, the type F32 only negates the sign bits
- vorr.f32 d24, d24, d25 @ at this point all d24 lanes that are too small are set to one and the rest are the determinants
-
- @ d30 = 1.0f / d24 : reciprocal estimate refined by one vrecps step
- vrecpe.f32 d30, d24
- vrecps.f32 d24, d30, d24
- vmul.f32 d30, d24, d30
-
- vmov.f32 d31, d30 @ So q15 = { d30={1/det(M1), 1/det(M2)}, d31={1/det(M1), 1/det(M2)} }
-
-
- @ get the coefficients
- @ the two coefficients that finally belong in d30/d31 are computed first
- @ into d18/d19, because q15 currently holds 1/det
- GET_DETERMINANT_of_3x3MATS_ARGS d0 , d4 , d6 , d8 , d12, d14, d1 , d5 , d7 , d18, d20, d22
- GET_DETERMINANT_of_3x3MATS_ARGS d0 , d2 , d4 , d8 , d10, d12, d1 , d3 , d5 , d19, d20, d22
-
- GET_DETERMINANT_of_3x3MATS_ARGS d10, d12, d14, d3 , d5 , d7 , d11, d13, d15, d16, d20, d22
- GET_NEG_DET_3x3MATS_ARGS d8 , d12, d14, d1 , d5 , d7 , d9 , d13, d15, d24, d20, d22
- GET_DETERMINANT_of_3x3MATS_ARGS d8 , d10, d14, d1 , d3 , d7 , d9 , d11, d15, d17, d20, d22
- GET_NEG_DET_3x3MATS_ARGS d8 , d10, d12, d1 , d3 , d5 , d9 , d11, d13, d25, d20, d22
-
- @ park q8 (d16/d17) and the early d18/d19 values on the stack so that
- @ d16-d19 can be reused by the following macro invocations
- vpush {d16, d17, d18, d19}
-
- GET_NEG_DET_3x3MATS_ARGS d2 , d4 , d6 , d3 , d5 , d7 , d11, d13, d15, d18, d16, d17
- GET_DETERMINANT_of_3x3MATS_ARGS d0 , d4 , d6 , d1 , d5 , d7 , d9 , d13, d15, d26, d16, d17
- GET_NEG_DET_3x3MATS_ARGS d0 , d2 , d6 , d1 , d3 , d7 , d9 , d11, d15, d19, d16, d17
- GET_DETERMINANT_of_3x3MATS_ARGS d0 , d2 , d4 , d1 , d3 , d5 , d9 , d11, d13, d27, d16, d17
-
- GET_DETERMINANT_of_3x3MATS_ARGS d2 , d4 , d6 , d10, d12, d14, d11, d13, d15, d20, d16, d17
- GET_NEG_DET_3x3MATS_ARGS d0 , d4 , d6 , d8 , d12, d14, d9 , d13, d15, d28, d16, d17
- GET_DETERMINANT_of_3x3MATS_ARGS d0 , d2 , d6 , d8 , d10, d14, d9 , d11, d15, d21, d16, d17
- GET_NEG_DET_3x3MATS_ARGS d0 , d2 , d4 , d8 , d10, d12, d9 , d11, d13, d29, d16, d17
-
- GET_NEG_DET_3x3MATS_ARGS d2 , d4 , d6 , d10, d12, d14, d3 , d5 , d7 , d22, d16, d17
- @@ GET_DETERMINANT_of_3x3MATS_ARGS d0 , d4 , d6 , d8 , d12, d14, d1 , d5 , d7 , d30, d16, d17 @ This is moved to the top of this section as q15 must remain unchanged
- GET_NEG_DET_3x3MATS_ARGS d0 , d2 , d6 , d8 , d10, d14, d1 , d3 , d7 , d23, d16, d17
- @@ GET_DETERMINANT_of_3x3MATS_ARGS d0 , d2 , d4 , d8 , d10, d12, d1 , d3 , d5 , d31, d16, d17 @ This is moved to the top of this section as q15 must remain unchanged
-
- @ restore q8 (the d16/d17 values pushed above)
- vpop {d16, d17}
-
- @ now multiply all the entries with q15 = { d30={1/det(M1), 1/det(M2)}, d31={1/det(M1), 1/det(M2)} }
-
- vmul.f32 q11, q11, q15
- vmul.f32 q10, q10, q15
- vmul.f32 q9, q9, q15
- vmul.f32 q8, q8, q15
-
- @ q0 = the d18/d19 values pushed above, i.e. the coefficients meant for q15
- vpop {d0, d1}
-
- vmul.f32 q12, q12, q15
- vmul.f32 q13, q13, q15
- vmul.f32 q14, q14, q15
- vmul.f32 q15, q0, q15
-
- .endm
-
-
-
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ This macro stores two 4x4 matrices returned by the above macro
- @ GET_INVERSE_4x4MATS from registers q8-q15 and into the memory
- @ address pointed to by the register r0 (dst).
- @ r0 is advanced past the 32 floats written; q8-q15 are modified
- @ by the vtrn instructions.
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- .macro STORE_4x4INVMATS
- @ rearrange the results for use in a "vst4" instruction...
- @ (afterwards q8-q11 are written out as the first matrix and q12-q15 as the second)
- vtrn.32 q8, q12
- vtrn.32 q9, q13
- vtrn.32 q10, q14
- vtrn.32 q11, q15
-
- @ first matrix: two stores of 8 floats each
- vst4.32 { d16 , d18 , d20 , d22 }, [r0]!
- vst4.32 { d17 , d19 , d21 , d23 }, [r0]!
- @ second matrix: two stores of 8 floats each
- vst4.32 { d24 , d26 , d28 , d30 }, [r0]!
- vst4.32 { d25 , d27 , d29 , d31 }, [r0]!
- .endm
-
-
-
-
- .align 4
- .global invmat_4x4f_neon
- .thumb
- .thumb_func
-invmat_4x4f_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t invmat_4x4f(arm_mat4x4f_t * dst,
- @ arm_mat4x4f_t * src,
- @ unsigned int count)
- @
- @ Inverts `count` 4x4 matrices, two at a time in the main loop and
- @ one at a time for the remaining (count % 4) matrices.
- @ Always returns 0.
- @
- @ r0: *dst & current dst entry's address
- @ r1: *src & current src entry's address
- @ r2: int count & the number of items in the input array that are
- @ processed by the main loop (a multiple of 4, consumed 2 at a time)
- @
- @ r3: the number of items that are left to be processed at the end of
- @ the input array
- @ r4: address of the near-zero threshold (saved/restored)
- @ r5: address of the 1.0f constant (saved/restored)
- @ q4-q7 are saved on entry and restored before returning.
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- push {r4, r5}
- vpush {q4, q5, q6, q7}
-
- @ GET_INVERSE_4x4MATS loads both constants through these pointers
- adr r4, CONST_FLOAT_1Em12
- adr r5, CONST_FLOAT_ONE
-
- and r3, r2, #3 @ r3 = count % 4;
- sub r2, r2, r3 @ count = count - r3; r2 is now the (multiple of 4) number of matrices handled by the main loop
-
- cmp r2, #0
- beq .L_check_mat4x4
-
- @ We load two 4x4 matrices each time, inverse them using the
- @ provided macro above, and store the two resulting matrices
- @ back into the memory location pointed to by the first parameter dst (r0)
-
- @ load the 1st set of values
- @ (LOAD_4x4MATS_ARGS is not defined in this file - presumably it comes from NE10_detmat.neon.inc.s)
- LOAD_4x4MATS_ARGS d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, d15, q0, q1, q2, q3, q4, q5, q6, q7, r1
- subs r2, r2, #2 @ two for the first set
-
- @ calculate values for the 1st set
- GET_INVERSE_4x4MATS
-
- @ NOTE(review): this branch relies on the flags set by the "subs" above
- @ surviving the macro - confirm the included helper macros do not alter them
- ble .L_mainloopend_mat4x4
-
-.L_mainloop_mat4x4:
- @ store the result for the 1st/next (e.g. 3rd) set
- STORE_4x4INVMATS
-
- @ load the next (e.g. 3rd) set of values
- LOAD_4x4MATS_ARGS d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, d15, q0, q1, q2, q3, q4, q5, q6, q7, r1
- subs r2, r2, #2
-
- @ calculate values for the 2nd/next (e.g. 3rd) set
- GET_INVERSE_4x4MATS
-
-
- bgt .L_mainloop_mat4x4 @ loop if r2 > 0, if we have at least another 2 matrices (32 floats) to process
-
-.L_mainloopend_mat4x4:
- @ the last iteration for this call
- @ store the result for the last set
- STORE_4x4INVMATS
-
-.L_check_mat4x4:
- @ check if anything left to process at the end of the input array
- cmp r3, #0
- ble .L_return_mat4x4
-
-.L_secondloop_mat4x4:
- @ process the last few items left in the input array
- @ one matrix is loaded: two loads of 8 floats each
- vld4.32 { d0, d2, d4, d6 }, [r1]!
- vld4.32 { d1, d3, d5, d7 }, [r1]!
-
- @ bring the single matrix into the register layout the macro expects;
- @ the lanes belonging to a second matrix hold stale data that is never stored
- vtrn.32 q0, q4
- vtrn.32 q1, q5
- vtrn.32 q2, q6
- vtrn.32 q3, q7
-
- subs r3, r3, #1
- @ calculate values
- GET_INVERSE_4x4MATS
-
- @ store the results
- @ (same as STORE_4x4INVMATS but only the first matrix is written)
- vtrn.32 q8, q12
- vtrn.32 q9, q13
- vtrn.32 q10, q14
- vtrn.32 q11, q15
-
- vst4.32 { d16 , d18 , d20 , d22 }, [r0]!
- vst4.32 { d17 , d19 , d21 , d23 }, [r0]!
-
-
- bgt .L_secondloop_mat4x4
-
-.L_return_mat4x4:
- @ return
- vpop {q4, q5, q6, q7}
- pop {r4, r5}
- mov r0, #0
- bx lr
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : source/NE10_invmat_test.c
- */
-
-//Make sure the following are defined before including "unit_test.h"
-
-// length of the data arrays
-#define ARRLEN TEST_ARRLEN_MATRICES
-// number of the operations in a given unit
-#define OP_COUNT 3
-// number of the different implementations of each of the functions (C, ASM, NEON, ...)
-#define IMPL_COUNT 3
-
-
-#include "../headers/unit_test_invmat_operation_x.h"
-
-extern arm_result_t invmat_2x2f_c (arm_mat2x2f_t * dst, arm_mat2x2f_t * src, unsigned int count);
-extern arm_result_t invmat_2x2f_neon(arm_mat2x2f_t * dst, arm_mat2x2f_t * src, unsigned int count);
-
-extern arm_result_t invmat_3x3f_c (arm_mat3x3f_t * dst, arm_mat3x3f_t * src, unsigned int count);
-extern arm_result_t invmat_3x3f_neon(arm_mat3x3f_t * dst, arm_mat3x3f_t * src, unsigned int count);
-
-extern arm_result_t invmat_4x4f_c (arm_mat4x4f_t * dst, arm_mat4x4f_t * src, unsigned int count);
-extern arm_result_t invmat_4x4f_neon(arm_mat4x4f_t * dst, arm_mat4x4f_t * src, unsigned int count);
-
-/*
- * Populate the global function table `ftbl` used by the unit-test driver.
- *
- * The table holds three consecutive entries per operation (2x2, 3x3, 4x4):
- * the C version, the slot reserved for the assembly version (filled with the
- * C version because no assembly implementation exists) and the NEON version.
- */
-void init_ftbl()
-{
- // Entries are listed in exactly the order in which they are stored in ftbl.
- static const arm_func_3args_t implementations[] =
- {
- (arm_func_3args_t) invmat_2x2f_c,
- (arm_func_3args_t) invmat_2x2f_c, // using the c version in place of the assembly version
- (arm_func_3args_t) invmat_2x2f_neon,
-
- (arm_func_3args_t) invmat_3x3f_c,
- (arm_func_3args_t) invmat_3x3f_c, // using the c version in place of the assembly version
- (arm_func_3args_t) invmat_3x3f_neon,
-
- (arm_func_3args_t) invmat_4x4f_c,
- (arm_func_3args_t) invmat_4x4f_c, // using the c version in place of the assembly version
- (arm_func_3args_t) invmat_4x4f_neon
- };
- unsigned int slot;
-
- for ( slot = 0; slot < sizeof( implementations ) / sizeof( implementations[ 0 ] ); ++slot )
- {
- ftbl[ slot ] = implementations[ slot ];
- }
-}
-
-/*
- * Test program entry point: forwards the command line to the shared test
- * driver and returns whatever status the driver reports.
- */
-arm_result_t main( int argc, char **argv )
-{
- const arm_result_t status = run_test( argc, argv ); // defined in "unit_test.h"
-
- return status;
-}
+++ /dev/null
-@
-@ Copyright 2011-12 ARM Limited
-@
-@ Licensed under the Apache License, Version 2.0 (the "License");
-@ you may not use this file except in compliance with the License.
-@ You may obtain a copy of the License at
-@
-@ http://www.apache.org/licenses/LICENSE-2.0
-@
-@ Unless required by applicable law or agreed to in writing, software
-@ distributed under the License is distributed on an "AS IS" BASIS,
-@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@ See the License for the specific language governing permissions and
-@ limitations under the License.
-@
-
-@
-@ NE10 Library : source/NE10_len.asm.s
-@
-
- .text
- .syntax unified
-
-.include "headers/NE10header.s"
-
- .balign 4
- .global len_vec2f_asm
- .thumb
- .thumb_func
-
-len_vec2f_asm:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t len_vec2f(arm_float_t * dst,
- @ arm_vec2f_t * src, unsigned int count)
- @
- @ Stores sqrt(x*x + y*y) of every source vector in dst. Both arrays
- @ are walked from their last element back to the first.
- @
- @ r0: *dst and current destination item's address
- @ r1: *src and current source item's address
- @ r2: int count
- @
- @ r2: loop counter
- @
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- cbz r2, .LoopEndVec2F @ nothing to do for count == 0
- add r0, r0, r2, lsl #2 @ r0 = r0 + count * 4 (one past the last dst entry)
- add r1, r1, r2, lsl #3 @ r1 = r1 + count * 8
-
-.LoopBeginVec2F:
- vldmdb r1!, {s10-s11} @ step src back by one vector (8 bytes) and load s10 = x, s11 = y
- vmul.f32 s14, s10, s10 @ s14 = x*x
- vmla.f32 s14, s11, s11 @ s14 = x*x + y*y
- vsqrt.f32 s15, s14 @ s15 = sqrt( s14 )
- vstmdb r0!, {s15} @ step dst back by one entry (4 bytes) and store s15 there
- subs r2, r2, #1 @ decrement the loop counter
- bne .LoopBeginVec2F @ loop while r2 is not zero
-.LoopEndVec2F:
- mov r0, NE10_OK @ Return NE10_OK
- bx lr
-
-
-
-
- .balign 4
- .global len_vec3f_asm
- .thumb
- .thumb_func
-
-len_vec3f_asm:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t len_vec3f(arm_float_t * dst,
- @ arm_vec3f_t * src, unsigned int count)
- @
- @ Stores sqrt(x*x + y*y + z*z) of every source vector in dst. Both
- @ arrays are walked from their last element back to the first.
- @
- @ r0: *dst and current destination item's address
- @ r1: *src and current source item's address
- @ r2: int count
- @
- @ r2: loop counter
- @
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- cbz r2, .LoopEndVec3F @ nothing to do for count == 0
- add r0, r0, r2, lsl #2 @ r0 = r0 + count * 4 (one past the last dst entry)
- add r1, r1, r2, lsl #3 @ r1 = r1 + count * 8 ...
- add r1, r1, r2, lsl #2 @ ... + count * 4, i.e. r1 = r1 + count * 12
-
-.LoopBeginVec3F:
- vldmdb r1!, {s10-s12} @ step src back by one vector (12 bytes) and load s10 = x, s11 = y, s12 = z
- vmul.f32 s14, s10, s10 @ s14 = x*x
- vmla.f32 s14, s11, s11 @ s14 = x*x + y*y
- vmla.f32 s14, s12, s12 @ s14 = x*x + y*y + z*z
- vsqrt.f32 s15, s14 @ s15 = sqrt( s14 )
- vstmdb r0!, {s15} @ step dst back by one entry (4 bytes) and store s15 there
- subs r2, r2, #1 @ decrement the loop counter
- bne .LoopBeginVec3F @ loop while r2 is not zero
-.LoopEndVec3F:
- mov r0, NE10_OK @ Return NE10_OK
- bx lr
-
-
-
-
- .balign 4
- .global len_vec4f_asm
- .thumb
- .thumb_func
-
-len_vec4f_asm:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t len_vec4f(arm_float_t * dst,
- @ arm_vec4f_t * src, unsigned int count)
- @
- @ Stores sqrt(x*x + y*y + z*z + w*w) of every source vector in dst.
- @ Both arrays are walked from their last element back to the first.
- @
- @ r0: *dst and current destination item's address
- @ r1: *src and current source item's address
- @ r2: int count
- @
- @ r2: loop counter
- @
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- cbz r2, .LoopEndVec4F @ nothing to do for count == 0
- add r0, r0, r2, lsl #2 @ r0 = r0 + count * 4 (one past the last dst entry)
- add r1, r1, r2, lsl #4 @ r1 = r1 + count * 16
-
-.LoopBeginVec4F:
- vldmdb r1!, {s10-s13} @ step src back by one vector (16 bytes) and load s10 = x, s11 = y, s12 = z, s13 = w
- vmul.f32 s14, s10, s10 @ s14 = x*x
- vmla.f32 s14, s11, s11 @ s14 = x*x + y*y
- vmla.f32 s14, s12, s12 @ s14 = x*x + y*y + z*z
- vmla.f32 s14, s13, s13 @ s14 = x*x + y*y + z*z + w*w
- vsqrt.f32 s15, s14 @ s15 = sqrt( s14 )
- vstmdb r0!, {s15} @ step dst back by one entry (4 bytes) and store s15 there
- subs r2, r2, #1 @ decrement the loop counter
- bne .LoopBeginVec4F @ loop while r2 is not zero
-.LoopEndVec4F:
- mov r0, NE10_OK @ Return NE10_OK
- bx lr
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : source/NE10_len.c
- */
-
-#include "NE10.h"
-#include "../headers/macros.h"
-
-#include <assert.h>
-
-#include <math.h>
-
-/*
- * Computes the length of each of the `count` 2D vectors in `src` and stores
- * it in the matching entry of `dst`. The loop (iterator `itr`) and the return
- * statement are supplied by the NE10_LEN_OPERATION_X_C macro.
- */
-arm_result_t len_vec2f_c(arm_float_t * dst, arm_vec2f_t * src, unsigned int count)
-{
- arm_vec2f_t * cur; // the vector handled by the current iteration
-
- NE10_LEN_OPERATION_X_C
- (
- cur = src + itr;
- dst[ itr ] = sqrt( cur->x * cur->x + cur->y * cur->y );
- );
-}
-
-/*
- * Computes the length of each of the `count` 3D vectors in `src` and stores
- * it in the matching entry of `dst`. The loop (iterator `itr`) and the return
- * statement are supplied by the NE10_LEN_OPERATION_X_C macro.
- */
-arm_result_t len_vec3f_c(arm_float_t * dst, arm_vec3f_t * src, unsigned int count)
-{
- arm_vec3f_t * cur; // the vector handled by the current iteration
-
- NE10_LEN_OPERATION_X_C
- (
- cur = src + itr;
- dst[ itr ] = sqrt( cur->x * cur->x + cur->y * cur->y + cur->z * cur->z );
- );
-}
-
-/*
- * Computes the length of each of the `count` 4D vectors in `src` and stores
- * it in the matching entry of `dst`. The loop (iterator `itr`) and the return
- * statement are supplied by the NE10_LEN_OPERATION_X_C macro.
- */
-arm_result_t len_vec4f_c(arm_float_t * dst, arm_vec4f_t * src, unsigned int count)
-{
- arm_vec4f_t * cur; // the vector handled by the current iteration
-
- NE10_LEN_OPERATION_X_C
- (
- cur = src + itr;
- dst[ itr ] = sqrt( cur->x * cur->x + cur->y * cur->y + cur->z * cur->z + cur->w * cur->w );
- );
-}
+++ /dev/null
-@
-@ Copyright 2011-12 ARM Limited
-@
-@ Licensed under the Apache License, Version 2.0 (the "License");
-@ you may not use this file except in compliance with the License.
-@ You may obtain a copy of the License at
-@
-@ http://www.apache.org/licenses/LICENSE-2.0
-@
-@ Unless required by applicable law or agreed to in writing, software
-@ distributed under the License is distributed on an "AS IS" BASIS,
-@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@ See the License for the specific language governing permissions and
-@ limitations under the License.
-@
-
-@
-@ NE10 Library : source/NE10_len.neon.s
-@
-
- .text
- .syntax unified
-
-.include "headers/NE10header.s"
-
-
-
-
- .balign 4
- .global len_vec2f_neon
- .thumb
- .thumb_func
-
-len_vec2f_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t len_vec2f(arm_float_t * dst,
- @ arm_vec2f_t * src,
- @ unsigned int count);
- @
- @ Stores the length of every 2D source vector in dst. Four vectors are
- @ handled per main-loop iteration, the remaining (count % 4) vectors
- @ one at a time. The square root is obtained as v * rsqrt(v), where
- @ the vrsqrte estimate is refined by a single vrsqrts step, so the
- @ result is an approximation. Always returns 0.
- @ NOTE(review): for a zero-length vector vrsqrte presumably yields
- @ +infinity and 0 * infinity is NaN, so the stored length would be
- @ NaN rather than 0 - confirm against the C implementation.
- @
- @ r0: *dst & the current dst entry's address
- @ r1: *src & current src entry's address
- @ r2: int count & the number of items in the input array that can be
- @ processed in chunks of 4 vectors
- @ r3: the number of items that are left to be processed at the end of
- @ the input array
- @
- @ NEON registers used: q0-q3 and q8. All of them are caller-saved, so
- @ nothing has to be preserved. (q4-q7 must NOT be used as scratch
- @ here: they are callee-saved under the AAPCS and this routine does
- @ not save them.)
- @
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- and r3, r2, #3 @ r3 = count % 4;
- sub r2, r2, r3 @ count = count - r3; r2 is now the (multiple of 4) number of vectors handled by the main loop
- cbz r2, .L_check_vec2
-
-
- @ load values for the first iteration
- @ (vld2 de-interleaves: q0 = four x components, q1 = four y components)
- vld2.32 {q0-q1}, [r1]!
- subs r2, r2, #4
-
- @ calculate sum of square of the components
- vmul.f32 q2, q0, q0
- vmla.f32 q2, q1, q1
-
- @ the NEON instructions above leave the flags of "subs" untouched
- ble .L_mainloopend_vec2
-
-.L_mainloop_vec2:
-
- @ load the next set of values
- vld2.32 {q0-q1}, [r1]!
- subs r2, r2, #4
-
- @ get SQRT of the last vector while loading a new vector
- vrsqrte.f32 q3, q2 @ q3 = estimate of 1/sqrt(q2)
- vmul.f32 q8, q2, q3
- vrsqrts.f32 q8, q8, q3 @ q8 = refinement factor for the estimate
- vmul.f32 q8, q3, q8 @ q8 = refined 1/sqrt(q2)
-
- vmul.f32 q2, q2, q8 @ q2 = q2 * 1/sqrt(q2) = sqrt(q2)
-
- vst1.32 {q2}, [r0]!
-
- @ calculate sum of square of the components
-
- vmul.f32 q2, q0, q0
- vmla.f32 q2, q1, q1
-
- bgt .L_mainloop_vec2 @ loop if r2 is > 0, if we have at least another 4 vectors (8 floats) to process
-
-.L_mainloopend_vec2:
- @ the last iteration for this call
-
- @ get SQRT of the last vector
- vrsqrte.f32 q3, q2
- vmul.f32 q8, q2, q3
- vrsqrts.f32 q8, q8, q3
- vmul.f32 q8, q3, q8
-
- vmul.f32 q2, q2, q8
-
- vst1.32 {q2}, [r0]!
-
-.L_check_vec2:
- @ check if anything left to process at the end of the input array
- cmp r3, #0
- ble .L_return_vec2
-
-.L_secondloop_vec2:
- @ process the last few items left in the input array
- vld1.f32 d0, [r1]! @ Fill in d0 = { V.x, V.y };
-
- subs r3, r3, #1
-
- vmul.f32 d0, d0, d0 @ d0= { V.x^2, V.y^2 };
- vpadd.f32 d0, d0, d0 @ d0= { V.x^2 + V.y^2, V.x^2 + V.y^2 }; both lanes hold the sum of squares
-
- @ get SQRT of the vector (same scheme as in the main loop, on d registers)
- vrsqrte.f32 d2, d0
- vmul.f32 d1, d0, d2
- vrsqrts.f32 d1, d1, d2
- vmul.f32 d1, d2, d1
-
- vmul.f32 d0, d0, d1
-
- vst1.32 d0[0], [r0]! @ only lane 0 is written: one float per vector
-
- bgt .L_secondloop_vec2
-
-.L_return_vec2:
- @ return
- mov r0, #0
- bx lr
-
-
-
-
- .align 2
- .global len_vec3f_neon
- .thumb
- .thumb_func
-len_vec3f_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t len_vec3f(arm_float_t * dst,
- @ arm_vec3f_t * src,
- @ unsigned int count);
- @
- @ Stores dst[i] = sqrt(src[i].x^2 + src[i].y^2 + src[i].z^2) for every
- @ i < count and returns 0 in r0.
- @
- @ r0: *dst & the current dst entry's address
- @ r1: *src & current src entry's address
- @ r2: int count & the number of items in the input array that can be
- @ processed in chunks of 4 vectors
- @ r3: the number of items that are left to be processed at the end of
- @ the input array
- @
- @ The square root is approximated as x * rsqrt(x), where rsqrt(x) is a
- @ vrsqrte estimate refined by one vrsqrts Newton-Raphson step.
- @
- @ Only caller-saved NEON registers are used (q0-q3, q8-q10). q4-q7 are
- @ callee-saved under the AAPCS and must not be used as scratch without
- @ being saved and restored.
- @
- @ A sum of squares of zero gives a length of 0: vrsqrte(0) is +infinity
- @ and 0 * infinity is NaN, so those lanes are cleared with a vceq/vbic
- @ mask before the result is stored.
- @
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- and r3, r2, #3 @ r3 = count % 4; handled one vector at a time by the second loop
- sub r2, r2, r3 @ r2 = count - r3; a multiple of 4, handled by the main loop
- cbz r2, .L_check_vec3
-
-
- @ load values for the first iteration
- @ after both loads: q0 = four x, q1 = four y, q2 = four z components
- vld3.32 {d0, d2, d4}, [r1]!
- vld3.32 {d1, d3, d5}, [r1]!
- subs r2, r2, #4
-
- @ calculate sum of square of the components
- vmul.f32 q8, q0, q0
- vmla.f32 q8, q1, q1
- vmla.f32 q8, q2, q2
-
- ble .L_mainloopend_vec3 @ uses the flags of the subs above; the NEON instructions leave them untouched
-
-.L_mainloop_vec3:
- @ load the next set of values
- vld3.32 {d0,d2,d4}, [r1]!
- vld3.32 {d1,d3,d5}, [r1]!
- subs r2, r2, #4
-
- @ get SQRT of the last vector while loading a new vector
- vceq.f32 q10, q8, #0 @ q10 = all ones in the lanes whose sum of squares is zero
- vrsqrte.f32 q3, q8 @ q3 = estimate of 1/sqrt(q8)
- vmul.f32 q9, q8, q3
- vrsqrts.f32 q9, q9, q3 @ Newton-Raphson correction factor
- vmul.f32 q9, q3, q9 @ q9 = refined 1/sqrt(q8)
-
- vmul.f32 q8, q8, q9 @ q8 = q8 * (1/sqrt(q8)) = sqrt(q8)
- vbic q8, q8, q10 @ force 0 where the input was zero (would be NaN otherwise)
-
- vst1.32 {q8}, [r0]!
-
- @ calculate sum of square of the components
- vmul.f32 q8, q0, q0
- vmla.f32 q8, q1, q1
- vmla.f32 q8, q2, q2
-
- bgt .L_mainloop_vec3 @ loop while r2 > 0, i.e. at least another 4 vectors (12 floats) have been loaded
-
-.L_mainloopend_vec3:
- @ the last iteration for this call
-
- @ get SQRT of the last vector
- vceq.f32 q10, q8, #0
- vrsqrte.f32 q3, q8
- vmul.f32 q9, q8, q3
- vrsqrts.f32 q9, q9, q3
- vmul.f32 q9, q3, q9
-
- vmul.f32 q8, q8, q9
- vbic q8, q8, q10
-
- vst1.32 {q8}, [r0]!
-
-.L_check_vec3:
- @ check if anything left to process at the end of the input array
- cmp r3, #0
- ble .L_return_vec3
-
-.L_secondloop_vec3:
- @ process the last few items left in the input array
- vld3.f32 {d0[0], d2[0], d4[0]}, [r1]! @ The values are loaded like so:
- @ q0 = { V.x, -, -, - };
- @ q1 = { V.y, -, -, - };
- @ q2 = { V.z, -, -, - };
- subs r3, r3, #1
-
- @ only lane 0 is meaningful below; the other lanes are never stored
- vmul.f32 q0, q0, q0 @ V.x^2
- vmla.f32 q0, q1, q1 @ V.x^2 + V.y^2
- vmla.f32 q0, q2, q2 @ V.x^2 + V.y^2 + V.z^2
-
- @ get SQRT of the vector
- vceq.f32 q3, q0, #0 @ q3 = all ones where the sum of squares is zero
- vrsqrte.f32 q2, q0
- vmul.f32 q1, q0, q2
- vrsqrts.f32 q1, q1, q2
- vmul.f32 q1, q2, q1
-
- vmul.f32 q0, q0, q1
- vbic q0, q0, q3 @ force 0 where the input was zero
-
- vst1.32 d0[0], [r0]!
-
- bgt .L_secondloop_vec3
-
-.L_return_vec3:
- @ return
- mov r0, #0
- bx lr
-
-
-
-
- .align 2
- .global len_vec4f_neon
- .thumb
- .thumb_func
-len_vec4f_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t len_vec4f(arm_float_t * dst,
- @ arm_vec4f_t * src,
- @ unsigned int count);
- @
- @ Stores dst[i] = sqrt(src[i].x^2 + src[i].y^2 + src[i].z^2 + src[i].w^2)
- @ for every i < count and returns 0 in r0.
- @
- @ r0: *dst & the current dst entry's address
- @ r1: *src & current src entry's address
- @ r2: int count & the number of items in the input array that can be
- @ processed in chunks of 4 vectors
- @ r3: the number of items that are left to be processed at the end of
- @ the input array
- @
- @ The square root is approximated as x * rsqrt(x), where rsqrt(x) is a
- @ vrsqrte estimate refined by one vrsqrts Newton-Raphson step.
- @
- @ Only caller-saved NEON registers are used (q0-q3, q8-q11). q4-q7 are
- @ callee-saved under the AAPCS and must not be used as scratch without
- @ being saved and restored.
- @
- @ A sum of squares of zero gives a length of 0: vrsqrte(0) is +infinity
- @ and 0 * infinity is NaN, so those lanes are cleared with a vceq/vbic
- @ mask before the result is stored.
- @
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- and r3, r2, #3 @ r3 = count % 4; handled one vector at a time by the second loop
- sub r2, r2, r3 @ r2 = count - r3; a multiple of 4, handled by the main loop
- cbz r2, .L_check_vec4
-
-
- @ load values for the first iteration
- @ after both loads: q0 = four x, q1 = four y, q2 = four z, q3 = four w components
- vld4.32 {d0, d2, d4, d6}, [r1]!
- vld4.32 {d1, d3, d5, d7}, [r1]!
- subs r2, r2, #4
-
- @ calculate sum of square of the components
- vmul.f32 q8, q0, q0
- vmla.f32 q8, q1, q1
- vmla.f32 q8, q2, q2
- vmla.f32 q8, q3, q3
-
- ble .L_mainloopend_vec4 @ uses the flags of the subs above; the NEON instructions leave them untouched
-
-.L_mainloop_vec4:
- @ load the next set of values
- vld4.32 {d0, d2, d4, d6}, [r1]!
- vld4.32 {d1, d3, d5, d7}, [r1]!
- subs r2, r2, #4
-
- @ get SQRT of the last vector while loading a new vector
- vceq.f32 q11, q8, #0 @ q11 = all ones in the lanes whose sum of squares is zero
- vrsqrte.f32 q9, q8 @ q9 = estimate of 1/sqrt(q8)
- vmul.f32 q10, q8, q9
- vrsqrts.f32 q10, q10, q9 @ Newton-Raphson correction factor
- vmul.f32 q10, q9, q10 @ q10 = refined 1/sqrt(q8)
-
- vmul.f32 q8, q8, q10 @ q8 = q8 * (1/sqrt(q8)) = sqrt(q8)
- vbic q8, q8, q11 @ force 0 where the input was zero (would be NaN otherwise)
-
- vst1.32 {q8}, [r0]!
-
- @ calculate sum of square of the components
- vmul.f32 q8, q0, q0
- vmla.f32 q8, q1, q1
- vmla.f32 q8, q2, q2
- vmla.f32 q8, q3, q3
-
- bgt .L_mainloop_vec4 @ loop while r2 > 0, i.e. at least another 4 vectors (16 floats) have been loaded
-
-.L_mainloopend_vec4:
- @ the last iteration for this call
-
- @ get SQRT of the last vector
- vceq.f32 q11, q8, #0
- vrsqrte.f32 q9, q8
- vmul.f32 q10, q8, q9
- vrsqrts.f32 q10, q10, q9
- vmul.f32 q10, q9, q10
-
- vmul.f32 q8, q8, q10
- vbic q8, q8, q11
-
- vst1.32 {q8}, [r0]!
-
-.L_check_vec4:
- @ check if anything left to process at the end of the input array
- cmp r3, #0
- ble .L_return_vec4
-
-.L_secondloop_vec4:
- @ process the last few items left in the input array
- vld4.f32 {d0[0], d2[0], d4[0], d6[0]}, [r1]! @ The values are loaded like so:
- @ q0 = { V.x, -, -, - };
- @ q1 = { V.y, -, -, - };
- @ q2 = { V.z, -, -, - };
- @ q3 = { V.w, -, -, - };
- subs r3, r3, #1
-
- @ only lane 0 is meaningful below; the other lanes are never stored
- vmul.f32 q0, q0, q0 @ V.x^2
- vmla.f32 q0, q1, q1 @ V.x^2 + V.y^2
- vmla.f32 q0, q2, q2 @ V.x^2 + V.y^2 + V.z^2
- vmla.f32 q0, q3, q3 @ V.x^2 + V.y^2 + V.z^2 + V.w^2
-
- @ get SQRT of the vector
- vceq.f32 q8, q0, #0 @ q8 = all ones where the sum of squares is zero
- vrsqrte.f32 q2, q0
- vmul.f32 q1, q0, q2
- vrsqrts.f32 q1, q1, q2
- vmul.f32 q1, q2, q1
-
- vmul.f32 q0, q0, q1
- vbic q0, q0, q8 @ force 0 where the input was zero
-
- vst1.32 d0[0], [r0]!
-
- bgt .L_secondloop_vec4
-
-.L_return_vec4:
- @ return
- mov r0, #0
- bx lr
-
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : source/NE10_len_test.c
- */
-
-//Make sure the following are defined before including "unit_test.h"
-
-// length of the data arrays
-#define ARRLEN TEST_ARRLEN
-// number of the operations in a given unit
-#define OP_COUNT 3
-// number of the different implementations of each of the functions (C, ASM, NEON, ...)
-#define IMPL_COUNT 3
-
-
-#include "../headers/unit_test_len_operation_x.h"
-
-void init_ftbl()
-{
-    // Implementations in function-table order: for each operation
-    // (vec2f, vec3f, vec4f) the C, ASM and NEON variants, in that order.
-    static const arm_func_3args_t impls[] =
-    {
-        (arm_func_3args_t) len_vec2f_c,
-        (arm_func_3args_t) len_vec2f_asm,
-        (arm_func_3args_t) len_vec2f_neon,
-
-        (arm_func_3args_t) len_vec3f_c,
-        (arm_func_3args_t) len_vec3f_asm,
-        (arm_func_3args_t) len_vec3f_neon,
-
-        (arm_func_3args_t) len_vec4f_c,
-        (arm_func_3args_t) len_vec4f_asm,
-        (arm_func_3args_t) len_vec4f_neon
-    };
-    unsigned int slot;
-
-    // copy the entries into the global function table, slot by slot
-    for ( slot = 0; slot < sizeof( impls ) / sizeof( impls[ 0 ] ); slot++ )
-    {
-        ftbl[ slot ] = impls[ slot ];
-    }
-}
-
-arm_result_t main( int argc, char **argv )
-{
-    // Hand the command line to run_test() (defined in "unit_test.h") and
-    // return its result unchanged.
-    const arm_result_t status = run_test( argc, argv );
-
-    return status;
-}
+++ /dev/null
-@
-@ Copyright 2011-12 ARM Limited
-@
-@ Licensed under the Apache License, Version 2.0 (the "License");
-@ you may not use this file except in compliance with the License.
-@ You may obtain a copy of the License at
-@
-@ http://www.apache.org/licenses/LICENSE-2.0
-@
-@ Unless required by applicable law or agreed to in writing, software
-@ distributed under the License is distributed on an "AS IS" BASIS,
-@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@ See the License for the specific language governing permissions and
-@ limitations under the License.
-@
-
-@
-@ NE10 Library : source/NE10_mla.asm.s
-@
-
- .text
- .syntax unified
-
-.include "headers/NE10header.s"
-
- .balign 4
- .global mla_float_asm
- .thumb
- .thumb_func
-
-mla_float_asm:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ Scalar (VFP) multiply-accumulate over float arrays:
- @ dst[i] = acc[i] + src1[i] * src2[i] for every i < count
- @
- @ NOTE(review): judging by the 4-byte stride all four arrays hold
- @ single floats - confirm the C prototype against NE10.h.
- @
- @ r0: current dst entry's address
- @ r1: current acc entry's address
- @ r2: current src1 entry's address
- @ r3: current src2 entry's address
- @ r4: count, read from the stack; used as the down-counter
- @
- @ Returns NE10_OK in r0.
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- push {r4}
- ldr r4, [sp, #4] @ r4 = count; it sits 4 bytes above sp after the push
- cbz r4, .L_mla_float_asm_done @ nothing to do when count == 0
-
-.L_mla_float_asm_next:
- vldr s1, [r2] @ s1 = src1[i]
- add r2, r2, #4 @ advance src1
- vldr s2, [r3] @ s2 = src2[i]
- add r3, r3, #4 @ advance src2
- vldr s10, [r1] @ s10 = acc[i]
- add r1, r1, #4 @ advance acc
- vmla.f32 s10, s1, s2 @ s10 = acc[i] + (src1[i] * src2[i])
- vstr s10, [r0] @ dst[i] = s10
- add r0, r0, #4 @ advance dst
- subs r4, r4, #1 @ one item fewer to go
- bne .L_mla_float_asm_next @ repeat until the counter reaches zero
-
-.L_mla_float_asm_done:
- mov r0, NE10_OK @ Return NE10_OK
- pop {r4}
- bx lr
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : source/NE10_mla.c
- */
-
-#include "NE10.h"
-#include "../headers/macros.h"
-
-#include <assert.h>
-
-/*
- * mla_float_c: plain C multiply-accumulate on float arrays.
- * The statement handed to the macro stores
- * acc[ itr ] + src1[ itr ] * src2[ itr ] into dst[ itr ].
- *
- * NE10_X_OPERATION_FLOAT_C is not defined in this file.
- * NOTE(review): the macro presumably supplies the loop over `itr` (bounded by
- * `count`) and the function's return value - confirm against macros.h.
- */
-arm_result_t mla_float_c(arm_float_t * dst, arm_float_t * acc, arm_float_t * src1, arm_float_t * src2, unsigned int count)
-{
- NE10_X_OPERATION_FLOAT_C
- (
- dst[ itr ] = acc[ itr ] + (src1[ itr ] * src2[ itr ]);
- );
-}
-
-/*
- * vmla_vec2f_c: plain C component-wise multiply-accumulate on 2D vectors.
- * For the x and y components the statements handed to the macro store
- * acc + src1 * src2 of entry `itr` into dst[ itr ].
- *
- * NE10_X_OPERATION_FLOAT_C is not defined in this file.
- * NOTE(review): the macro presumably supplies the loop over `itr` (bounded by
- * `count`) and the function's return value - confirm against macros.h.
- */
-arm_result_t vmla_vec2f_c(arm_vec2f_t * dst, arm_vec2f_t * acc, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count)
-{
- NE10_X_OPERATION_FLOAT_C
- (
- dst[ itr ].x = acc[ itr ].x + (src1[ itr ].x * src2[ itr ].x);
- dst[ itr ].y = acc[ itr ].y + (src1[ itr ].y * src2[ itr ].y);
- );
-}
-
-/*
- * vmla_vec3f_c: plain C component-wise multiply-accumulate on 3D vectors.
- * For the x, y and z components the statements handed to the macro store
- * acc + src1 * src2 of entry `itr` into dst[ itr ].
- *
- * NE10_X_OPERATION_FLOAT_C is not defined in this file.
- * NOTE(review): the macro presumably supplies the loop over `itr` (bounded by
- * `count`) and the function's return value - confirm against macros.h.
- */
-arm_result_t vmla_vec3f_c(arm_vec3f_t * dst, arm_vec3f_t * acc, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count)
-{
- NE10_X_OPERATION_FLOAT_C
- (
- dst[ itr ].x = acc[ itr ].x + (src1[ itr ].x * src2[ itr ].x);
- dst[ itr ].y = acc[ itr ].y + (src1[ itr ].y * src2[ itr ].y);
- dst[ itr ].z = acc[ itr ].z + (src1[ itr ].z * src2[ itr ].z);
- );
-}
-
-/*
- * vmla_vec4f_c: plain C component-wise multiply-accumulate on 4D vectors.
- * For the x, y, z and w components the statements handed to the macro store
- * acc + src1 * src2 of entry `itr` into dst[ itr ].
- *
- * NE10_X_OPERATION_FLOAT_C is not defined in this file.
- * NOTE(review): the macro presumably supplies the loop over `itr` (bounded by
- * `count`) and the function's return value - confirm against macros.h.
- */
-arm_result_t vmla_vec4f_c(arm_vec4f_t * dst, arm_vec4f_t * acc, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count)
-{
- NE10_X_OPERATION_FLOAT_C
- (
- dst[ itr ].x = acc[ itr ].x + (src1[ itr ].x * src2[ itr ].x);
- dst[ itr ].y = acc[ itr ].y + (src1[ itr ].y * src2[ itr ].y);
- dst[ itr ].z = acc[ itr ].z + (src1[ itr ].z * src2[ itr ].z);
- dst[ itr ].w = acc[ itr ].w + (src1[ itr ].w * src2[ itr ].w);
- );
-}
+++ /dev/null
-@
-@ Copyright 2011-12 ARM Limited
-@
-@ Licensed under the Apache License, Version 2.0 (the "License");
-@ you may not use this file except in compliance with the License.
-@ You may obtain a copy of the License at
-@
-@ http://www.apache.org/licenses/LICENSE-2.0
-@
-@ Unless required by applicable law or agreed to in writing, software
-@ distributed under the License is distributed on an "AS IS" BASIS,
-@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@ See the License for the specific language governing permissions and
-@ limitations under the License.
-@
-
-@
-@ NE10 Library : source/NE10_mla.neon.s
-@
-
- .text
- .syntax unified
-
-.include "headers/NE10header.s"
-
-
-
-
- .balign 4
- .global mla_float_neon
- .thumb
- .thumb_func
-
-mla_float_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t mla_float(arm_float_t * dst,
- @ arm_float_t * acc,
- @ arm_float_t * src1,
- @ arm_float_t * src2,
- @ unsigned int count)
- @
- @ Stores dst[i] = acc[i] + src1[i] * src2[i] for every i < count and
- @ returns 0 in r0.
- @
- @ r0: *dst & current dst entry's address
- @ r1: *acc & current acc entry's address
- @ r2: *src1 & current src1 entry's address
- @ r3: *src2 & current src2 entry's address
- @ r4: int count (read from the stack) & the number of items in the input
- @ array that can be processed in chunks of 4 floats
- @
- @ r5: the number of items that are left to be processed at the end of
- @ the input array
- @
- @ The main loop is software pipelined: the result of the previous chunk
- @ is stored while the operands of the next chunk are being loaded.
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- push {r4, r5}
- ldr r4, [r13, #8] @ r4 = count; r13 is the stack pointer (sp), count sits 8 bytes above it after the push
-
- and r5, r4, #3 @ r5 = count % 4; this is what is left for the second loop
- sub r4, r4, r5 @ r4 = count - r5; a multiple of 4, handled by the main loop
-
- cbz r4, .L_check_float
-
- @ load the 1st set of values
- vld1.32 {q0}, [r2]!
- vld1.32 {q1}, [r3]!
- vld1.32 {q3}, [r1]!
- subs r4, r4, #4
-
- @ calculate values for the 1st set
- vmla.f32 q3, q0, q1 @ q3 += q0 * q1
-
- ble .L_mainloopend_float @ uses the flags of the subs above
-
-.L_mainloop_float:
- @ load the next (e.g. 2nd) set of values, leave loading acc until later
- vld1.32 {q0}, [r2]!
- vld1.32 {q1}, [r3]!
-
- @ store the result for the 1st/next (e.g. 2nd) set
- vst1.32 {d6,d7}, [r0]! @ d6,d7 are the two halves of q3
-
- @ load the next (e.g. 2nd) acc, and decrease the counter
- vld1.32 {q3}, [r1]!
- subs r4, r4, #4
-
- @ calculate values for the next (e.g. 2nd) set
- vmla.f32 q3, q0, q1 @ q3 += q0 * q1
-
- bgt .L_mainloop_float @ loop if r4 > 0, if we have at least another 4 floats
-
-.L_mainloopend_float:
- @ the last iteration for this call
- @ store the result for the last set of values (e.g 2nd set)
- vst1.32 {d6,d7}, [r0]!
-
-.L_check_float:
- @ check if anything left to process at the end of the input array
- cmp r5, #0
- ble .L_return_float
-
-.L_secondloop_float:
- @ process the last few items left in the input array, one float per pass
- vld1.f32 d0[0], [r2]! @ d0[0] = src1[i]
- vld1.f32 d1[0], [r3]! @ d1[0] = src2[i]
- vld1.f32 d2[0], [r1]! @ d2[0] = acc[i]
-
- subs r5, r5, #1
-
- @ d2 += d0 * d1; only lane 0 is stored below
- vmla.f32 d2, d0, d1
-
- vst1.32 {d2[0]}, [r0]!
-
- bgt .L_secondloop_float @ uses the flags of the subs above
-
-.L_return_float:
- @ return 0 after restoring the saved registers
- pop {r4, r5}
- mov r0, #0
- bx lr
-
-
-
-
- .balign 4
- .global vmla_vec2f_neon
- .thumb
- .thumb_func
-
-vmla_vec2f_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t vmla_vec2f(arm_vec2f_t * dst,
- @ arm_vec2f_t * acc,
- @ arm_vec2f_t * src1,
- @ arm_vec2f_t * src2,
- @ unsigned int count)
- @
- @ Component-wise dst[i] = acc[i] + src1[i] * src2[i] for every i < count;
- @ returns 0 in r0.
- @
- @ r0: *dst & current dst entry's address
- @ r1: *acc & current acc entry's address
- @ r2: *src1 & current src1 entry's address
- @ r3: *src2 & current src2 entry's address
- @ r4: int count (read from the stack) & the number of items in the input
- @ array that can be processed in chunks of 4 vectors
- @
- @ r5: the number of items that are left to be processed at the end of
- @ the input array
- @
- @ The main loop is software pipelined: the result of the previous chunk
- @ is stored while the operands of the next chunk are being loaded.
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- push {r4, r5}
- ldr r4, [r13, #8] @ r4 = count; r13 is the stack pointer (sp), count sits 8 bytes above it after the push
-
- and r5, r4, #3 @ r5 = count % 4; this is what is left for the second loop
- sub r4, r4, r5 @ r4 = count - r5; a multiple of 4, handled by the main loop
-
- cbz r4, .L_check_vec2
-
- @ load the 1st set of values
- @ vld2 de-interleaves: the first register of each pair gets the x
- @ components, the second one the y components
- vld2.32 {q0-q1}, [r2]!
- vld2.32 {q2-q3}, [r3]!
- vld2.32 {q8-q9}, [r1]!
- subs r4, r4, #4
-
- @ calculate values for the 1st set
- vmla.f32 q8, q0, q2
- vmla.f32 q9, q1, q3
-
- ble .L_mainloopend_vec2 @ uses the flags of the subs above
-
-.L_mainloop_vec2:
- @ load the 2nd set of values
- vld2.32 {q0-q1}, [r2]!
- vld2.32 {q2-q3}, [r3]!
-
- @ store the result for the 1st/next (e.g. 2nd) set
- vst2.32 {d16,d17,d18,d19}, [r0]! @ d16-d19 are q8 and q9; vst2 re-interleaves x and y
-
- @ load the next (e.g. 2nd) set of values
- vld2.32 {q8-q9}, [r1]!
- subs r4, r4, #4
-
- @ calculate values for the 2nd set
- vmla.f32 q8, q0, q2
- vmla.f32 q9, q1, q3
-
- bgt .L_mainloop_vec2 @ loop if r4 > 0, if we have at least another 4 vectors (8 floats) to process
-
-.L_mainloopend_vec2:
- @ the last iteration for this call
- @ store the result for the last set of values
- vst2.32 {d16,d17,d18,d19}, [r0]!
-
-.L_check_vec2:
- @ check if anything left to process at the end of the input array
- cmp r5, #0
- ble .L_return_vec2
-
-.L_secondloop_vec2:
- @ process the last few items left in the input array, one vector per pass
- vld1.f32 d0, [r2]! @ d0 = { src1.x, src1.y }
- vld1.f32 d1, [r3]! @ d1 = { src2.x, src2.y }
- vld1.f32 d2, [r1]! @ d2 = { acc.x, acc.y }
-
- subs r5, r5, #1
-
- @ calculate values
- vmla.f32 d2, d0, d1
-
- vst1.32 {d2}, [r0]!
-
- bgt .L_secondloop_vec2 @ uses the flags of the subs above
-
-.L_return_vec2:
- @ return 0 after restoring the saved registers
- pop {r4, r5}
- mov r0, #0
- bx lr
-
-
-
-
- .align 2
- .global vmla_vec3f_neon
- .thumb
- .thumb_func
-vmla_vec3f_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t vmla_vec3f(arm_vec3f_t * dst,
- @ arm_vec3f_t * acc,
- @ arm_vec3f_t * src1,
- @ arm_vec3f_t * src2,
- @ unsigned int count)
- @
- @ Component-wise dst[i] = acc[i] + src1[i] * src2[i] for every i < count;
- @ returns 0 in r0.
- @
- @ r0: *dst & current dst entry's address
- @ r1: *acc & current acc entry's address
- @ r2: *src1 & current src1 entry's address
- @ r3: *src2 & current src2 entry's address
- @ r4: int count (read from the stack) & the number of items in the input
- @ array that can be processed in chunks of 4 vectors
- @
- @ r5: the number of items that are left to be processed at the end of
- @ the input array
- @
- @ The main loop is software pipelined: the result of the previous chunk
- @ is stored while the operands of the next chunk are being loaded.
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- push {r4, r5}
- ldr r4, [r13, #8] @ r4 = count; r13 is the stack pointer (sp), count sits 8 bytes above it after the push
-
- and r5, r4, #3 @ r5 = count % 4; this is what is left for the second loop
- sub r4, r4, r5 @ r4 = count - r5; a multiple of 4, handled by the main loop
-
- cmp r4, #0
- beq .L_check_vec3
-
- @ load the 1st set of values
- @ each pair of vld3 fills three q registers with four x, four y and four z values
- vld3.32 {d0, d2, d4}, [r2]! @ src1: low halves of q0, q1, q2
- vld3.32 {d1, d3, d5}, [r2]! @ src1: high halves of q0, q1, q2
- vld3.32 {d18, d20, d22}, [r3]! @ src2: low halves of q9, q10, q11
- vld3.32 {d19, d21, d23}, [r3]! @ src2: high halves of q9, q10, q11
- vld3.32 {d24, d26, d28}, [r1]! @ part of q12, q13, and q14
- vld3.32 {d25, d27, d29}, [r1]! @ part of q12, q13, and q14
- subs r4, r4, #4
-
- @ calculate values for the 1st set
- vmla.f32 q12, q0, q9
- vmla.f32 q13, q1, q10
- vmla.f32 q14, q2, q11
-
- ble .L_mainloopend_vec3 @ uses the flags of the subs above
-
-.L_mainloop_vec3:
- @ load the next (e.g. 2nd) set of values
- vld3.32 {d0, d2, d4}, [r2]!
- vld3.32 {d1, d3, d5}, [r2]!
- vld3.32 {d18, d20, d22}, [r3]!
- vld3.32 {d19, d21, d23}, [r3]!
-
- @ store the result for the 1st/next (e.g. 2nd) set
- vst3.32 {d24, d26, d28}, [r0]!
- vst3.32 {d25, d27, d29}, [r0]!
-
- @ finish loading ...
- vld3.32 {d24, d26, d28}, [r1]! @ part of q12, q13, and q14
- vld3.32 {d25, d27, d29}, [r1]! @ part of q12, q13, and q14
- subs r4, r4, #4
-
- @ calculate values for the next (e.g. 2nd) set
- vmla.f32 q12, q0, q9
- vmla.f32 q13, q1, q10
- vmla.f32 q14, q2, q11
-
- bgt .L_mainloop_vec3 @ loop if r4 > 0, if we have at least another 4 vectors (12 floats) to process
-
-.L_mainloopend_vec3:
- @ the last iteration for this call
- @ store the result for the last set of value
- vst3.32 {d24, d26, d28}, [r0]!
- vst3.32 {d25, d27, d29}, [r0]!
-
-.L_check_vec3:
- @ check if anything left to process at the end of the input array
- cmp r5, #0
- ble .L_return_vec3
-
-.L_secondloop_vec3:
- @ process the last few items left in the input array, one vector per pass
- vld3.f32 {d0[0], d2[0], d4[0]}, [r2]! @ The values are loaded like so:
- @ q0 = { V1.x, -, -, - };
- @ q1 = { V1.y, -, -, - };
- @ q2 = { V1.z, -, -, - };
- vld3.f32 {d1[0], d3[0], d5[0]}, [r3]! @ The values are loaded like so:
- @ q0 = { V1.x, -, V2.x, - };
- @ q1 = { V1.y, -, V2.y, - };
- @ q2 = { V1.z, -, V2.z, - };
- vld3.f32 {d18[0], d20[0], d22[0]}, [r1]! @ The values are loaded like so:
- @ q9 = { acc.x, -, -, - };
- @ q10 = { acc.y, -, -, - };
- @ q11 = { acc.z, -, -, - };
-
- subs r5, r5, #1
-
- @ calculate acc + V1 * V2 per component; only lane 0 of each result is stored
- vmla.f32 d18, d0, d1
- vmla.f32 d20, d2, d3
- vmla.f32 d22, d4, d5
-
- vst3.32 {d18[0], d20[0], d22[0]}, [r0]!
-
- bgt .L_secondloop_vec3 @ uses the flags of the subs above
-
-.L_return_vec3:
- @ return 0 after restoring the saved registers
- pop {r4, r5}
- mov r0, #0
- bx lr
-
-
-
-
- .align 2
- .global vmla_vec4f_neon
- .thumb
- .thumb_func
-vmla_vec4f_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t vmla_vec4f(arm_vec4f_t * dst,
- @ arm_vec4f_t * acc,
- @ arm_vec4f_t * src1,
- @ arm_vec4f_t * src2,
- @ unsigned int count)
- @
- @ Component-wise dst[i] = acc[i] + src1[i] * src2[i] for every i < count;
- @ returns 0 in r0.
- @
- @ r0: *dst & current dst entry's address
- @ r1: *acc & current acc entry's address
- @ r2: *src1 & current src1 entry's address
- @ r3: *src2 & current src2 entry's address
- @ r4: int count (read from the stack) & the number of items in the input
- @ array that can be processed in chunks of 4 vectors
- @
- @ r5: the number of items that are left to be processed at the end of
- @ the input array
- @
- @ The main loop is software pipelined: the result of the previous chunk
- @ is stored while the operands of the next chunk are being loaded.
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- push {r4, r5}
- ldr r4, [r13, #8] @ r4 = count; r13 is the stack pointer (sp), count sits 8 bytes above it after the push
-
- and r5, r4, #3 @ r5 = count % 4; this is what is left for the second loop
- sub r4, r4, r5 @ r4 = count - r5; a multiple of 4, handled by the main loop
-
- cmp r4, #0
- beq .L_check_vec4
-
- @ load the 1st set of values
- @ each pair of vld4 fills four q registers with four x, y, z and w values
- vld4.32 {d0, d2, d4, d6}, [r2]! @ src1: low halves of q0, q1, q2, q3
- vld4.32 {d1, d3, d5, d7}, [r2]! @ src1: high halves of q0, q1, q2, q3
- vld4.32 {d16, d18, d20, d22}, [r3]! @ src2: low halves of q8, q9, q10, q11
- vld4.32 {d17, d19, d21, d23}, [r3]! @ src2: high halves of q8, q9, q10, q11
- vld4.32 {d24, d26, d28, d30}, [r1]! @ part of q12, q13, q14, and q15
- vld4.32 {d25, d27, d29, d31}, [r1]! @ part of q12, q13, q14, and q15
- subs r4, r4, #4
-
- @ calculate values for the 1st set
- vmla.f32 q12, q0, q8
- vmla.f32 q13, q1, q9
- vmla.f32 q14, q2, q10
- vmla.f32 q15, q3, q11
-
- ble .L_mainloopend_vec4 @ uses the flags of the subs above
-
-.L_mainloop_vec4:
- @ load the next (e.g. 2nd) set of values
- vld4.32 {d0, d2, d4, d6}, [r2]!
- vld4.32 {d1, d3, d5, d7}, [r2]!
- vld4.32 {d16, d18, d20, d22}, [r3]!
- vld4.32 {d17, d19, d21, d23}, [r3]!
-
- @ store the result for the 1st/next (e.g. 2nd) set
- vst4.32 {d24, d26, d28, d30}, [r0]!
- vst4.32 {d25, d27, d29, d31}, [r0]!
-
- @ finish loading ....
- vld4.32 {d24, d26, d28, d30}, [r1]! @ part of q12, q13, q14, and q15
- vld4.32 {d25, d27, d29, d31}, [r1]! @ part of q12, q13, q14, and q15
- subs r4, r4, #4
-
- @ calculate values for the next (e.g. 2nd) set
- vmla.f32 q12, q0, q8
- vmla.f32 q13, q1, q9
- vmla.f32 q14, q2, q10
- vmla.f32 q15, q3, q11
-
- bgt .L_mainloop_vec4 @ loop if r4 > 0, if we have at least another 4 vectors (16 floats) to process
-
-.L_mainloopend_vec4:
- @ the last iteration for this call
- @ store the result for the last set of values
- vst4.32 {d24, d26, d28, d30}, [r0]!
- vst4.32 {d25, d27, d29, d31}, [r0]!
-
-.L_check_vec4:
- @ check if anything left to process at the end of the input array
- cmp r5, #0
- ble .L_return_vec4
-
-.L_secondloop_vec4:
- @ process the last few items left in the input array, one vector per pass
- vld4.f32 {d0[0], d2[0], d4[0], d6[0]}, [r2]! @ The values are loaded like so:
- @ q0 = { V1.x, -, -, - };
- @ q1 = { V1.y, -, -, - };
- @ q2 = { V1.z, -, -, - };
- @ q3 = { V1.w, -, -, - };
- vld4.f32 {d1[0], d3[0], d5[0], d7[0]}, [r3]! @ The values are loaded like so:
- @ q0 = { V1.x, -, V2.x, - };
- @ q1 = { V1.y, -, V2.y, - };
- @ q2 = { V1.z, -, V2.z, - };
- @ q3 = { V1.w, -, V2.w, - };
- vld4.f32 {d24[0], d26[0], d28[0], d30[0]}, [r1]! @ The values are loaded like so:
- @ q12 = { acc.x, -, -, - };
- @ q13 = { acc.y, -, -, - };
- @ q14 = { acc.z, -, -, - };
- @ q15 = { acc.w, -, -, - };
-
- subs r5, r5, #1
-
- @ calculate acc + V1 * V2 per component; only lane 0 of each result is stored
- vmla.f32 d24, d0, d1
- vmla.f32 d26, d2, d3
- vmla.f32 d28, d4, d5
- vmla.f32 d30, d6, d7
-
- vst4.32 {d24[0], d26[0], d28[0], d30[0]}, [r0]!
-
- bgt .L_secondloop_vec4 @ uses the flags of the subs above
-
-.L_return_vec4:
- @ return 0 after restoring the saved registers
- pop {r4, r5}
- mov r0, #0
- bx lr
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : source/NE10_mla_test.c
- */
-
-//Make sure the following are defined before including "unit_test.h"
-
-// length of the data arrays
-#define ARRLEN TEST_ARRLEN
-// number of the operations in a given unit
-#define OP_COUNT 4
-// number of the different implementations of each of the functions (C, ASM, NEON, ...)
-#define IMPL_COUNT 3
-
-
-#include "../headers/unit_test_mla_operation_x.h"
-
-// Prototypes of the multiply-accumulate implementations exercised by this test.
-// The *_asm prototypes are commented out because those versions are not used here.
-arm_result_t mla_float_c (arm_float_t * dst, arm_float_t * acc, arm_float_t * src1, arm_float_t * src2, unsigned int count);
-//arm_result_t mla_float_asm (arm_float_t * dst, arm_float_t * acc, arm_float_t * src1, arm_float_t * src2, unsigned int count); // the assembly versions haven't been implemented; these are for future use
-arm_result_t mla_float_neon(arm_float_t * dst, arm_float_t * acc, arm_float_t * src1, arm_float_t * src2, unsigned int count);
-
-arm_result_t vmla_vec2f_c (arm_vec2f_t * dst, arm_vec2f_t * acc, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
-//arm_result_t vmla_vec2f_asm (arm_vec2f_t * dst, arm_vec2f_t * acc, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
-arm_result_t vmla_vec2f_neon(arm_vec2f_t * dst, arm_vec2f_t * acc, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
-
-arm_result_t vmla_vec3f_c (arm_vec3f_t * dst, arm_vec3f_t * acc, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
-//arm_result_t vmla_vec3f_asm (arm_vec3f_t * dst, arm_vec3f_t * acc, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
-arm_result_t vmla_vec3f_neon(arm_vec3f_t * dst, arm_vec3f_t * acc, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
-
-arm_result_t vmla_vec4f_c (arm_vec4f_t * dst, arm_vec4f_t * acc, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
-//arm_result_t vmla_vec4f_asm (arm_vec4f_t * dst, arm_vec4f_t * acc, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
-arm_result_t vmla_vec4f_neon(arm_vec4f_t * dst, arm_vec4f_t * acc, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
-
-void init_ftbl()
-{
-    // Implementations in function-table order: for each operation
-    // (float, vec2f, vec3f, vec4f) the C, ASM and NEON slots, in that order.
-    // The ASM slots reuse the C version because no assembly version is used here.
-    static const arm_func_5args_t impls[] =
-    {
-        (arm_func_5args_t) mla_float_c,
-        (arm_func_5args_t) mla_float_c,     // C version in place of the assembly version
-        (arm_func_5args_t) mla_float_neon,
-
-        (arm_func_5args_t) vmla_vec2f_c,
-        (arm_func_5args_t) vmla_vec2f_c,    // C version in place of the assembly version
-        (arm_func_5args_t) vmla_vec2f_neon,
-
-        (arm_func_5args_t) vmla_vec3f_c,
-        (arm_func_5args_t) vmla_vec3f_c,    // C version in place of the assembly version
-        (arm_func_5args_t) vmla_vec3f_neon,
-
-        (arm_func_5args_t) vmla_vec4f_c,
-        (arm_func_5args_t) vmla_vec4f_c,    // C version in place of the assembly version
-        (arm_func_5args_t) vmla_vec4f_neon
-    };
-    unsigned int slot;
-
-    // copy the entries into the global function table, slot by slot
-    for ( slot = 0; slot < sizeof( impls ) / sizeof( impls[ 0 ] ); slot++ )
-    {
-        ftbl[ slot ] = impls[ slot ];
-    }
-}
-
-arm_result_t main( int argc, char **argv )
-{
-    // Hand the command line to run_test() (defined in "unit_test.h") and
-    // return its result unchanged.
-    const arm_result_t status = run_test( argc, argv );
-
-    return status;
-}
+++ /dev/null
-@
-@ Copyright 2011-12 ARM Limited
-@
-@ Licensed under the Apache License, Version 2.0 (the "License");
-@ you may not use this file except in compliance with the License.
-@ You may obtain a copy of the License at
-@
-@ http://www.apache.org/licenses/LICENSE-2.0
-@
-@ Unless required by applicable law or agreed to in writing, software
-@ distributed under the License is distributed on an "AS IS" BASIS,
-@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@ See the License for the specific language governing permissions and
-@ limitations under the License.
-@
-
-@
-@ NE10 Library : source/NE10_mlac.asm.s
-@
-
- .text
- .syntax unified
-
-.include "headers/NE10header.s"
-
- .balign 4
- .global mlac_float_asm
- .thumb
- .thumb_func
-
-mlac_float_asm:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t mlac_float(arm_float_t * dst, arm_float_t * acc,
- @ arm_float_t * src, const arm_float_t cst,
- @ unsigned int count)
- @
- @ Stores dst[i] = acc[i] + src[i] * cst for every i < count and returns
- @ NE10_OK in r0.
- @ NOTE(review): the prototype above is inferred from the code (4-byte
- @ stride, cst taken by value from r3) - confirm against NE10.h.
- @
- @ r0: *dst
- @ r1: *acc
- @ r2: *src
- @ r3: cst (the bit pattern of the float constant, passed by value)
- @ r4: int count (read from the stack)
- @
- @ r4: loop counter
- @ r5: current item's offset in acc[], src[], and dst[]
- @ r6: current accumulator item's address made of base(r1)+offset(r5)
- @ r7: current source item's address made of base(r2)+offset(r5)
- @ r8: current destination item's address made of base(r0)+offset(r5)
- @
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- push {r4, r5, r6, r7, r8}
- ldr r4, [r13, #20] @ r4 = count ( 20 bytes above the stack pointer (sp) - which is r13 - after the push )
- cbz r4, .LoopEndFloat
- mov r5, #0
- vmov s3, r3 @ Get cst into register s3; loop-invariant, so done once before the loop
-
-.LoopBeginFloat:
- add r6, r1, r5 @ Get current accumulator item's address in memory
- vldr s10, [r6, #0] @ Load acc[i]
- add r7, r2, r5 @ Get current source item's address in memory
- vldr s2, [r7, #0] @ Load src[i]
- vmla.f32 s10, s2, s3 @ s10 = acc[i] + ( src[i] * cst )
- add r8, r0, r5 @ Get current destination item's address in memory
- vstr s10, [r8, #0] @ Store the result back into the main memory
- add r5, r5, #4 @ increase the offset by 1*sizeof(float)
- subs r4, r4, #1 @ count down the remaining items
- bne .LoopBeginFloat @ Continue while items remain
-
-.LoopEndFloat:
- mov r0, NE10_OK @ Return NE10_OK
- pop {r4, r5, r6, r7, r8}
- bx lr
-
-
-
-
- .balign 4
- .global mlac_vec2f_asm
- .thumb
- .thumb_func
-
-mlac_vec2f_asm:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ Scalar (VFP) multiply-accumulate of 2D vectors by a constant vector:
- @ dst[i].x = acc[i].x + src[i].x * cst->x
- @ dst[i].y = acc[i].y + src[i].y * cst->y for every i < count
- @
- @ NOTE(review): the C prototype is presumably
- @ mlac_vec2f(dst, acc, src, const arm_vec2f_t * cst, count)
- @ - confirm against NE10.h.
- @
- @ r0: dst base address
- @ r1: acc base address
- @ r2: src base address
- @ r3: address of cst
- @ r4: count, read from the stack; used as the down-counter
- @ r5: byte offset of the current item in acc[], src[] and dst[]
- @ r6 / r7 / r8: address of the current acc / src / dst item
- @
- @ Returns NE10_OK in r0.
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- push {r4, r5, r6, r7, r8}
- ldr r4, [sp, #20] @ r4 = count; it sits 20 bytes above sp after the push
- cbz r4, .L_mlac_vec2f_asm_done @ nothing to do when count == 0
- mov r5, #0 @ start at offset 0
-
-.L_mlac_vec2f_asm_next:
- add r6, r1, r5 @ r6 = &acc[i]
- add r7, r2, r5 @ r7 = &src[i]
- add r8, r0, r5 @ r8 = &dst[i]
- vldr s3, [r3] @ s3 = cst->x (re-read on every pass)
- vldr s4, [r3, #4] @ s4 = cst->y
- vldr s1, [r7] @ s1 = src[i].x
- vldr s2, [r7, #4] @ s2 = src[i].y
- vldr s10, [r6] @ s10 = acc[i].x
- vldr s11, [r6, #4] @ s11 = acc[i].y
- vmla.f32 s10, s1, s3 @ s10 = acc[i].x + ( src[i].x * cst->x )
- vmla.f32 s11, s2, s4 @ s11 = acc[i].y + ( src[i].y * cst->y )
- vstr s10, [r8] @ dst[i].x = s10
- vstr s11, [r8, #4] @ dst[i].y = s11
- add r5, r5, #8 @ step the offset by 2*sizeof(float)
- subs r4, r4, #1 @ one item fewer to go
- bne .L_mlac_vec2f_asm_next @ repeat until the counter reaches zero
-
-.L_mlac_vec2f_asm_done:
- mov r0, NE10_OK @ Return NE10_OK
- pop {r4, r5, r6, r7, r8}
- bx lr
-
-
-
-
- .balign 4
- .global mlac_vec3f_asm
- .thumb
- .thumb_func
-
-mlac_vec3f_asm:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t mlac_vec3f(arm_vec3f_t * dst, arm_vec3f_t * acc,
- @ arm_vec3f_t * src, const arm_vec3f_t * cst,
- @ unsigned int count)
- @
- @ For every item: dst[i] = acc[i] + ( src[i] * (*cst) ), done separately
- @ for the x, y, and z components.
- @
- @ r0: *dst
- @ r1: *acc
- @ r2: *src
- @ r3: *cst
- @ count is not in a register on entry; it is read from the stack into r4
- @
- @ r4: loop counter
- @ r5: current item's offset in acc[], src[], and dst[]
- @ r6: current accumulator item's address made of base(r1)+offset(r5)
- @ r7: current source item's address made of base(r2)+offset(r5)
- @ r8: current destination item's address made of base(r0)+offset(r5)
- @
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- push {r4, r5, r6, r7, r8}
- ldr r4, [r13, #20] @ r4 = count; the five registers just pushed take 20 bytes, so [sp, #20] is the word that was on top of the stack at entry
- cbz r4, .LoopEndVec3F @ nothing to do when count is zero
- mov r5, #0
-
-.LoopBeginVec3F:
- add r6, r1, r5 @ Get current accumulator item's address in memory
- vldr s10, [r6, #0] @ Load acc[i].x, acc[i].y , and acc[i].z
- vldr s11, [r6, #4]
- vldr s12, [r6, #8]
- add r7, r2, r5 @ Get current source item's address in memory
- vldr s1, [r7, #0] @ Load src[i].x, src[i].y , and src[i].z
- vldr s2, [r7, #4]
- vldr s3, [r7, #8]
- vldr s4, [r3, #0] @ Load cst->x, cst->y, and cst->z (reloaded on every iteration)
- vldr s5, [r3, #4]
- vldr s6, [r3, #8]
- vmla.f32 s10, s1, s4 @ s10 = acc[i].x + ( src[i].x * cst->x )
- vmla.f32 s11, s2, s5 @ same for 'y'
- vmla.f32 s12, s3, s6 @ same for 'z'
- add r8, r0, r5 @ Get current destination item's address in memory
- vstr s10, [r8, #0] @ Store the results back into the main memory
- vstr s11, [r8, #4]
- vstr s12, [r8, #8]
- add r5, r5, #12 @ increase the offset by 3*sizeof(float) @@ (for x, y, and z)
- subs r4, r4, #1 @ one item done; this also sets the flags tested by the branch below
- bne .LoopBeginVec3F @ Loop again while the remaining count in r4 is non-zero
-
-.LoopEndVec3F:
- mov r0, NE10_OK @ Return NE10_OK
- pop {r4, r5, r6, r7, r8}
- bx lr
-
-
-
-
- .balign 4
- .global mlac_vec4f_asm
- .thumb
- .thumb_func
-
-mlac_vec4f_asm:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t mlac_vec4f(arm_vec4f_t * dst, arm_vec4f_t * acc,
- @ arm_vec4f_t * src, const arm_vec4f_t * cst,
- @ unsigned int count)
- @
- @ For every item: dst[i] = acc[i] + ( src[i] * (*cst) ), done separately
- @ for the x, y, z, and w components.
- @
- @ r0: *dst
- @ r1: *acc
- @ r2: *src
- @ r3: *cst
- @ count is not in a register on entry; it is read from the stack into r4
- @
- @ r4: loop counter
- @ r5: current item's offset in acc[], src[], and dst[]
- @ r6: current accumulator item's address made of base(r1)+offset(r5)
- @ r7: current source item's address made of base(r2)+offset(r5)
- @ r8: current destination item's address made of base(r0)+offset(r5)
- @
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- push {r4, r5, r6, r7, r8}
- ldr r4, [r13, #20] @ r4 = count; the five registers just pushed take 20 bytes, so [sp, #20] is the word that was on top of the stack at entry
- cbz r4, .LoopEndVec4F @ nothing to do when count is zero
- mov r5, #0
-
-.LoopBeginVec4F:
- add r6, r1, r5 @ Get current accumulator item's address in memory
- vldr s10, [r6, #0] @ Load acc[i].x, acc[i].y , acc[i].z, and w
- vldr s11, [r6, #4]
- vldr s12, [r6, #8]
- vldr s13, [r6, #12]
- add r7, r2, r5 @ Get current source item's address in memory
- vldr s1, [r7, #0] @ Load src[i].x, src[i].y , src[i].z, and w
- vldr s2, [r7, #4]
- vldr s3, [r7, #8]
- vldr s4, [r7, #12]
- vldr s5, [r3, #0] @ Load cst->x, cst->y, cst->z, and w (reloaded on every iteration)
- vldr s6, [r3, #4]
- vldr s7, [r3, #8]
- vldr s8, [r3, #12]
- vmla.f32 s10, s1, s5 @ s10 = acc[i].x + ( src[i].x * cst->x )
- vmla.f32 s11, s2, s6 @ same for 'y'
- vmla.f32 s12, s3, s7 @ same for 'z'
- vmla.f32 s13, s4, s8 @ same for 'w'
- add r8, r0, r5 @ Get current destination item's address in memory
- vstr s10, [r8, #0] @ Store the results back into the main memory
- vstr s11, [r8, #4]
- vstr s12, [r8, #8]
- vstr s13, [r8, #12]
- add r5, r5, #16 @ increase the offset by 4*sizeof(float) @@ (for x, y, z, and w)
- subs r4, r4, #1 @ one item done; this also sets the flags tested by the branch below
- bne .LoopBeginVec4F @ Loop again while the remaining count in r4 is non-zero
-
-.LoopEndVec4F:
- mov r0, NE10_OK @ Return NE10_OK
- pop {r4, r5, r6, r7, r8}
- bx lr
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : source/NE10_mlac.c
- */
-
-#include "NE10.h"
-#include "../headers/macros.h"
-
-#include <assert.h>
-
-/*
- * Scalar multiply-accumulate by a constant, plain C version.
- * The statement handed to NE10_MLAC_OPERATION_X_C computes
- * dst[itr] = acc[itr] + src[itr] * cst.
- * NOTE(review): the macro lives in ../headers/macros.h and is not visible here;
- * presumably it supplies `itr`, the loop over `count` items and the return
- * value - confirm against the macro definition.
- */
-arm_result_t mlac_float_c(arm_float_t * dst, arm_float_t * acc, arm_float_t * src, const arm_float_t cst, unsigned int count)
-{
- NE10_MLAC_OPERATION_X_C
- (
- dst[ itr ] = acc[ itr ] + (src[ itr ] * cst);
- );
-}
-
-/*
- * 2D-vector multiply-accumulate by a constant vector, plain C version.
- * The statements handed to NE10_MLAC_OPERATION_X_C compute, per component,
- * dst[itr] = acc[itr] + src[itr] * (*cst) for x and y.
- * NOTE(review): the macro lives in ../headers/macros.h and is not visible here;
- * presumably it supplies `itr`, the loop over `count` items and the return
- * value - confirm against the macro definition.
- */
-arm_result_t mlac_vec2f_c(arm_vec2f_t * dst, arm_vec2f_t * acc, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count)
-{
- NE10_MLAC_OPERATION_X_C
- (
- dst[ itr ].x = acc[ itr ].x + (src[ itr ].x * cst->x);
- dst[ itr ].y = acc[ itr ].y + (src[ itr ].y * cst->y);
- );
-}
-
-/*
- * 3D-vector multiply-accumulate by a constant vector, plain C version.
- * The statements handed to NE10_MLAC_OPERATION_X_C compute, per component,
- * dst[itr] = acc[itr] + src[itr] * (*cst) for x, y and z.
- * NOTE(review): the macro lives in ../headers/macros.h and is not visible here;
- * presumably it supplies `itr`, the loop over `count` items and the return
- * value - confirm against the macro definition.
- */
-arm_result_t mlac_vec3f_c(arm_vec3f_t * dst, arm_vec3f_t * acc, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count)
-{
- NE10_MLAC_OPERATION_X_C
- (
- dst[ itr ].x = acc[ itr ].x + (src[ itr ].x * cst->x);
- dst[ itr ].y = acc[ itr ].y + (src[ itr ].y * cst->y);
- dst[ itr ].z = acc[ itr ].z + (src[ itr ].z * cst->z);
- );
-}
-
-/*
- * 4D-vector multiply-accumulate by a constant vector, plain C version.
- * The statements handed to NE10_MLAC_OPERATION_X_C compute, per component,
- * dst[itr] = acc[itr] + src[itr] * (*cst) for x, y, z and w.
- * NOTE(review): the macro lives in ../headers/macros.h and is not visible here;
- * presumably it supplies `itr`, the loop over `count` items and the return
- * value - confirm against the macro definition.
- */
-arm_result_t mlac_vec4f_c(arm_vec4f_t * dst, arm_vec4f_t * acc, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count)
-{
- NE10_MLAC_OPERATION_X_C
- (
- dst[ itr ].x = acc[ itr ].x + (src[ itr ].x * cst->x);
- dst[ itr ].y = acc[ itr ].y + (src[ itr ].y * cst->y);
- dst[ itr ].z = acc[ itr ].z + (src[ itr ].z * cst->z);
- dst[ itr ].w = acc[ itr ].w + (src[ itr ].w * cst->w);
- );
-}
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : source/NE10_mlac.neon.c
- */
-
-#include "NE10.h"
-#include "../headers/macros.h"
-
-#include <assert.h>
-#include <arm_neon.h>
-
-
-/*
- * Scalar multiply-accumulate by a constant, NEON-intrinsics version.
- * Two statements are handed to NE10_MLAC_OPERATION_FLOAT_NEON, separated by
- * the bare comma: a 128-bit vmlaq_f32 (acc + src * cst) and a 64-bit vmla_f32
- * whose result is written back into n_tmp_src.
- * NOTE(review): the macro and the n_* variables come from ../headers/macros.h
- * and are not visible here; presumably the first statement serves the main
- * loop and the second the leftover items - confirm against the macro.
- */
-arm_result_t mlac_float_neon(arm_float_t * dst, arm_float_t * acc, arm_float_t * src, const arm_float_t cst, unsigned int count)
-{
- NE10_MLAC_OPERATION_FLOAT_NEON
- (
- n_dst = vmlaq_f32( n_acc, n_src, n_cst );
- ,
- n_tmp_src = vmla_f32( n_tmp_acc, n_tmp_src, n_tmp_cst );
- );
-}
-
-/*
- * 2D-vector multiply-accumulate by a constant vector, NEON-intrinsics version.
- * Two statements are handed to NE10_MLAC_OPERATION_VEC2F_NEON, separated by
- * the bare comma: a 128-bit vmlaq_f32 (acc + src * cst) and a 64-bit vmla_f32
- * whose result is written back into n_tmp_src.
- * NOTE(review): the macro and the n_* variables come from ../headers/macros.h
- * and are not visible here; presumably the first statement serves the main
- * loop and the second the leftover items - confirm against the macro.
- */
-arm_result_t mlac_vec2f_neon(arm_vec2f_t * dst, arm_vec2f_t * acc, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count)
-{
- NE10_MLAC_OPERATION_VEC2F_NEON
- (
- n_dst = vmlaq_f32( n_acc, n_src , n_cst );
- ,
- n_tmp_src = vmla_f32( n_tmp_acc, n_tmp_src, n_tmp_cst );
- );
-}
-
-/*
- * 3D-vector multiply-accumulate by a constant vector, NEON-intrinsics version.
- * Two statement groups are handed to NE10_MLAC_OPERATION_VEC3F_NEON, separated
- * by the bare comma: three 128-bit vmlaq_f32 operations (acc + src * cst) on
- * the register triples n_*1..n_*3, and three 64-bit vmla_f32 operations, one
- * per .val[] entry, whose results are written back into n_tmp_src.
- * NOTE(review): the macro and the n_* variables come from ../headers/macros.h
- * and are not visible here; presumably the first group serves the main loop
- * and the second the leftover items - confirm against the macro.
- */
-arm_result_t mlac_vec3f_neon(arm_vec3f_t * dst, arm_vec3f_t * acc, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count)
-{
- NE10_MLAC_OPERATION_VEC3F_NEON
- (
- n_dst1 = vmlaq_f32( n_acc1, n_src1 , n_cst1 );
- n_dst2 = vmlaq_f32( n_acc2, n_src2 , n_cst2 );
- n_dst3 = vmlaq_f32( n_acc3, n_src3 , n_cst3 );
- ,
- n_tmp_src.val[0] = vmla_f32( n_tmp_acc.val[0], n_tmp_src.val[0], n_tmp_cst.val[0] ); /* the X lane */
- n_tmp_src.val[1] = vmla_f32( n_tmp_acc.val[1], n_tmp_src.val[1], n_tmp_cst.val[1] ); /* the Y lane */
- n_tmp_src.val[2] = vmla_f32( n_tmp_acc.val[2], n_tmp_src.val[2], n_tmp_cst.val[2] ); /* the Z lane */
- );
-}
-
-/*
- * 4D-vector multiply-accumulate by a constant vector, NEON-intrinsics version.
- * Unlike the float/vec2f/vec3f variants, only one statement is handed to the
- * macro: a 128-bit vmlaq_f32 computing acc + src * cst.
- * NOTE(review): NE10_MLAC_OPERATION_VEC4F_NEON and the n_* variables come from
- * ../headers/macros.h and are not visible here; presumably no leftover path is
- * needed because one vec4f fills a whole 128-bit register - confirm.
- */
-arm_result_t mlac_vec4f_neon(arm_vec4f_t * dst, arm_vec4f_t * acc, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count)
-{
- NE10_MLAC_OPERATION_VEC4F_NEON
- (
- n_dst = vmlaq_f32( n_acc, n_src , n_cst );
- );
-}
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : source/NE10_mlac_test.c
- */
-
-//Make sure the following are defined before including "unit_test.h"
-
-// length of the data arrays
-#define ARRLEN TEST_ARRLEN
-// number of the operations in a given unit
-#define OP_COUNT 4
-// number of the different implementations of each of the functions (C, ASM, NEON, ...)
-#define IMPL_COUNT 3
-
-
-#include "../headers/unit_test_mlac_operation_x.h"
-
-void init_ftbl()
-{
- // Every implementation that really exists, laid out in the same order as
- // the global function table: one row per operation (float, vec2f, vec3f,
- // vec4f), and within a row the C, ASM and NEON flavours.
- static const arm_func_5args_t available_impls[] =
- {
- (arm_func_5args_t) mlac_float_c, (arm_func_5args_t) mlac_float_asm, (arm_func_5args_t) mlac_float_neon,
- (arm_func_5args_t) mlac_vec2f_c, (arm_func_5args_t) mlac_vec2f_asm, (arm_func_5args_t) mlac_vec2f_neon,
- (arm_func_5args_t) mlac_vec3f_c, (arm_func_5args_t) mlac_vec3f_asm, (arm_func_5args_t) mlac_vec3f_neon,
- (arm_func_5args_t) mlac_vec4f_c, (arm_func_5args_t) mlac_vec4f_asm, (arm_func_5args_t) mlac_vec4f_neon
- };
- unsigned int slot;
-
- // copy the twelve entries into slots 0..11 of the global table
- for ( slot = 0; slot < sizeof( available_impls ) / sizeof( available_impls[0] ); slot++ )
- {
- ftbl[ slot ] = available_impls[ slot ];
- }
-}
-
-arm_result_t main( int argc, char **argv )
-{
- // hand the command line to the shared test driver (see "unit_test.h")
- // and pass its result straight back as this program's result
- const arm_result_t outcome = run_test( argc, argv );
- return outcome;
-}
+++ /dev/null
-@
-@ Copyright 2011-12 ARM Limited
-@
-@ Licensed under the Apache License, Version 2.0 (the "License");
-@ you may not use this file except in compliance with the License.
-@ You may obtain a copy of the License at
-@
-@ http://www.apache.org/licenses/LICENSE-2.0
-@
-@ Unless required by applicable law or agreed to in writing, software
-@ distributed under the License is distributed on an "AS IS" BASIS,
-@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@ See the License for the specific language governing permissions and
-@ limitations under the License.
-@
-
-@
-@ NE10 Library : source/NE10_mul.asm.s
-@
-
- .text
- .syntax unified
-
-.include "headers/NE10header.s"
-
- .balign 4
- .global mul_float_asm
- .thumb
- .thumb_func
-
-mul_float_asm:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t mul_float(arm_float_t * dst,
- @ arm_float_t * src1, arm_float_t * src2,
- @ unsigned int count)
- @
- @ For every item: dst[i] = src1[i] * src2[i]
- @
- @ r0: *dst & current dst entry's address (advanced by 4 each iteration)
- @ r1: *src1 & current src1 entry's address (advanced by 4 each iteration)
- @ r2: *src2 & current src2 entry's address (advanced by 4 each iteration)
- @ r3: int count
- @
- @ r3: loop counter
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- cbz r3, .LoopEndFloat @ nothing to do when count is zero
-
-.LoopBeginFloat:
- vldr s1, [r1] @ Load s1 = src1[i]
- add r1, r1, #4 @ move to the next entry
- vldr s2, [r2] @ Load s2 = src2[i]
- add r2, r2, #4 @ next entry
- vmul.f32 s10, s1, s2 @ s10 = src1[i] * src2[i]
- vstr s10, [r0] @ Store the result back into the main memory
- add r0, r0, #4 @ next entry in the dst
- subs r3, r3, #1 @ one item done; this also sets the flags tested by the branch below
- bne .LoopBeginFloat @ Loop again while the remaining count in r3 is non-zero
-
-.LoopEndFloat:
- mov r0, NE10_OK @ Return NE10_OK
- bx lr
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : source/NE10_mul.c
- */
-
-#include "NE10.h"
-#include "../headers/macros.h"
-
-#include <assert.h>
-
-/*
- * Element-wise product of two float arrays, plain C version.
- * The statement handed to NE10_X_OPERATION_FLOAT_C computes
- * dst[itr] = src1[itr] * src2[itr].
- * NOTE(review): the macro lives in ../headers/macros.h and is not visible here;
- * presumably it supplies `itr`, the loop over `count` items and the return
- * value - confirm against the macro definition.
- */
-arm_result_t mul_float_c(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count)
-{
- NE10_X_OPERATION_FLOAT_C
- (
- dst[ itr ] = src1[ itr ] * src2[ itr ];
- );
-}
-
-/*
- * Component-wise product of two arrays of 2D vectors, plain C version.
- * The statements handed to NE10_X_OPERATION_FLOAT_C multiply the x and y
- * components of src1[itr] and src2[itr] and store them in dst[itr].
- * NOTE(review): the macro lives in ../headers/macros.h and is not visible here;
- * presumably it supplies `itr`, the loop over `count` items and the return
- * value - confirm against the macro definition.
- */
-arm_result_t vmul_vec2f_c(arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count)
-{
- NE10_X_OPERATION_FLOAT_C
- (
- dst[ itr ].x = src1[ itr ].x * src2[ itr ].x;
- dst[ itr ].y = src1[ itr ].y * src2[ itr ].y;
- );
-}
-
-/*
- * Component-wise product of two arrays of 3D vectors, plain C version.
- * The statements handed to NE10_X_OPERATION_FLOAT_C multiply the x, y and z
- * components of src1[itr] and src2[itr] and store them in dst[itr].
- * NOTE(review): the macro lives in ../headers/macros.h and is not visible here;
- * presumably it supplies `itr`, the loop over `count` items and the return
- * value - confirm against the macro definition.
- */
-arm_result_t vmul_vec3f_c(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count)
-{
- NE10_X_OPERATION_FLOAT_C
- (
- dst[ itr ].x = src1[ itr ].x * src2[ itr ].x;
- dst[ itr ].y = src1[ itr ].y * src2[ itr ].y;
- dst[ itr ].z = src1[ itr ].z * src2[ itr ].z;
- );
-}
-
-/*
- * Component-wise product of two arrays of 4D vectors, plain C version.
- * The statements handed to NE10_X_OPERATION_FLOAT_C multiply the x, y, z and
- * w components of src1[itr] and src2[itr] and store them in dst[itr].
- * NOTE(review): the macro lives in ../headers/macros.h and is not visible here;
- * presumably it supplies `itr`, the loop over `count` items and the return
- * value - confirm against the macro definition.
- */
-arm_result_t vmul_vec4f_c(arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count)
-{
- NE10_X_OPERATION_FLOAT_C
- (
- dst[ itr ].x = src1[ itr ].x * src2[ itr ].x;
- dst[ itr ].y = src1[ itr ].y * src2[ itr ].y;
- dst[ itr ].z = src1[ itr ].z * src2[ itr ].z;
- dst[ itr ].w = src1[ itr ].w * src2[ itr ].w;
- );
-}
+++ /dev/null
-@
-@ Copyright 2011-12 ARM Limited
-@
-@ Licensed under the Apache License, Version 2.0 (the "License");
-@ you may not use this file except in compliance with the License.
-@ You may obtain a copy of the License at
-@
-@ http://www.apache.org/licenses/LICENSE-2.0
-@
-@ Unless required by applicable law or agreed to in writing, software
-@ distributed under the License is distributed on an "AS IS" BASIS,
-@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@ See the License for the specific language governing permissions and
-@ limitations under the License.
-@
-
-@
-@ NE10 Library : source/NE10_mul.neon.s
-@
-
- .text
- .syntax unified
-
-.include "headers/NE10header.s"
-
-
-
-
- .align 4
- .global mul_float_neon
- .thumb
- .thumb_func
-
-mul_float_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t mul_float(arm_float_t * dst,
- @ arm_float_t * src1,
- @ arm_float_t * src2,
- @ unsigned int count)
- @
- @ For every item: dst[i] = src1[i] * src2[i]. Items are processed four at
- @ a time in the main loop; the remaining (count % 4) items are processed
- @ one at a time in the second loop.
- @
- @ r0: *dst & current dst entry's address
- @ r1: *src1 & current src1 entry's address
- @ r2: *src2 & current src2 entry's address
- @ r3: int count & the number of items in the input array that can be
- @ processed in chunks of 4 floats
- @
- @ r4: the number of items that are left to be processed at the end of
- @ the input array
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- push {r4}
- and r4, r3, #3 @ r4 = count % 4; the leftovers for the second loop
- sub r3, r3, r4 @ r3 = count - r4; a multiple of 4, consumed by the main loop
-
- cbz r3, .L_check_float @ fewer than 4 items: go straight to the leftovers
-
- @ load the current set of values
- vld1.32 {q0}, [r1]!
- vld1.32 {q1}, [r2]!
- subs r3, r3, #4 @ 4 for this set
-
- @ calculate values for the current set
- vmul.f32 q3, q0, q1 @ q3 = q0 * q1
-
- ble .L_mainloopend_float @ flags come from the subs above: no further full set to load
-
-.L_mainloop_float:
- @ store the result for the current set
- vst1.32 {d6,d7}, [r0]!
-
- @ load the next set of values
- vld1.32 {q0}, [r1]!
- vld1.32 {q1}, [r2]!
- subs r3, r3, #4
-
- @ calculate values for the next set
- vmul.f32 q3, q0, q1 @ q3 = q0 * q1
-
- bgt .L_mainloop_float @ loop if r3 > 0, if we have at least another 4 floats
-
-.L_mainloopend_float:
- @ the last iteration for this call
- @ store the result for the last one
- vst1.32 {d6,d7}, [r0]!
-
-.L_check_float:
- @ check if anything left to process at the end of the input array
- cmp r4, #0
- ble .L_return_float
-
-.L_secondloop_float:
- @ process the last few items left in the input array
- vld1.f32 d0[0], [r1]! @ Fill in d0[0] with the next src1 item
- vld1.f32 d1[0], [r2]! @ Fill in d1[0] with the next src2 item
-
- subs r4, r4, #1
-
- @ values; both lanes are multiplied, but only lane 0 is stored below
- vmul.f32 d0, d0, d1
-
- vst1.32 {d0[0]}, [r0]!
-
- bgt .L_secondloop_float @ flags come from the subs above: loop while leftovers remain
-
-.L_return_float:
- @ return
- pop {r4}
- mov r0, #0
- bx lr
-
-
-
-
- .align 4
- .global vmul_vec2f_neon
- .thumb
- .thumb_func
-
-vmul_vec2f_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t vmul_vec2f(arm_vec2f_t * dst,
- @ arm_vec2f_t * src1,
- @ arm_vec2f_t * src2,
- @ unsigned int count)
- @
- @ For every item: dst[i].x = src1[i].x * src2[i].x, and the same for y.
- @ Vectors are processed four at a time in the main loop; the remaining
- @ (count % 4) vectors are processed one at a time in the second loop.
- @
- @ r0: *dst & current dst entry's address
- @ r1: *src1 & current src1 entry's address
- @ r2: *src2 & current src2 entry's address
- @ r3: int count & the number of items in the input array that can be
- @ processed in chunks of 4 vectors
- @
- @ r4: the number of items that are left to be processed at the end of
- @ the input array
- @
- @ q0-q3: de-interleaved inputs, q8-q9: products.
- @ d8-d15 (q4-q7) have to be preserved by a callee under the AAPCS and
- @ this routine does not save any NEON registers, so the products are
- @ kept in q8/q9 rather than q4/q5.
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- push {r4}
- and r4, r3, #3 @ r4 = count % 4; the leftovers for the second loop
- sub r3, r3, r4 @ r3 = count - r4; a multiple of 4, consumed by the main loop
-
- cbz r3, .L_check_vec2 @ fewer than 4 vectors: go straight to the leftovers
-
- @ load the 1st set of values
- vld2.32 {q0-q1}, [r1]! @ q0 = four src1 x values, q1 = four src1 y values
- vld2.32 {q2-q3}, [r2]! @ q2 = four src2 x values, q3 = four src2 y values
- subs r3, r3, #4 @ 4 for this set
-
- @ calculate values for the 1st set
- vmul.f32 q8, q0, q2 @ x products
- vmul.f32 q9, q1, q3 @ y products
-
- ble .L_mainloopend_vec2 @ flags come from the subs above: no further full set to load
-
-.L_mainloop_vec2:
- @ store the result for the current set (re-interleaved as x,y pairs)
- vst2.32 {d16,d17,d18,d19}, [r0]!
-
- @ load the next set of values
- vld2.32 {q0-q1}, [r1]!
- vld2.32 {q2-q3}, [r2]!
- subs r3, r3, #4
-
- @ calculate values for the next set
- vmul.f32 q8, q0, q2
- vmul.f32 q9, q1, q3
-
- bgt .L_mainloop_vec2 @ loop if r3 > 0, if we have at least another 4 vectors (8 floats) to process
-
-.L_mainloopend_vec2:
- @ the last iteration for this call
- @ store the result for the last set
- vst2.32 {d16,d17,d18,d19}, [r0]!
-
-.L_check_vec2:
- @ check if anything left to process at the end of the input array
- cmp r4, #0
- ble .L_return_vec2
-
-.L_secondloop_vec2:
- @ process the last few items left in the input array
- vld1.f32 d0, [r1]! @ d0 = { src1[i].x, src1[i].y }
- vld1.f32 d1, [r2]! @ d1 = { src2[i].x, src2[i].y }
-
- subs r4, r4, #1
-
- @ calculate values
- vmul.f32 d0, d0, d1
-
- vst1.32 {d0}, [r0]!
-
- bgt .L_secondloop_vec2 @ flags come from the subs above: loop while leftovers remain
-
-.L_return_vec2:
- @ return
- pop {r4}
- mov r0, #0
- bx lr
-
-
-
-
- .align 4
- .global vmul_vec3f_neon
- .thumb
- .thumb_func
-vmul_vec3f_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t vmul_vec3f(arm_vec3f_t * dst,
- @ arm_vec3f_t * src1,
- @ arm_vec3f_t * src2,
- @ unsigned int count)
- @
- @ For every item: dst[i].x = src1[i].x * src2[i].x, and the same for y
- @ and z. Vectors are processed four at a time in the main loop; the
- @ remaining (count % 4) vectors are processed one at a time in the
- @ second loop.
- @
- @ r0: *dst & current dst entry's address
- @ r1: *src1 & current src1 entry's address
- @ r2: *src2 & current src2 entry's address
- @ r3: int count & the number of items in the input array that can be
- @ processed in chunks of 4 vectors
- @
- @ r4: the number of items that are left to be processed at the end of
- @ the input array
- @
- @ q0-q2: de-interleaved src1, q13-q15: de-interleaved src2,
- @ q10-q12: products.
- @ d8-d15 (q4-q7) have to be preserved by a callee under the AAPCS and
- @ this routine does not save any NEON registers, so src2 is loaded into
- @ q13-q15 rather than q3-q5.
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- push {r4}
- and r4, r3, #3 @ r4 = count % 4; the leftovers for the second loop
- sub r3, r3, r4 @ r3 = count - r4; a multiple of 4, consumed by the main loop
-
- cmp r3, #0
- beq .L_check_vec3 @ fewer than 4 vectors: go straight to the leftovers
-
- @ load the 1st set of values
- vld3.32 {d0, d2, d4}, [r1]! @ src1 vectors 0-1: x -> d0, y -> d2, z -> d4
- vld3.32 {d1, d3, d5}, [r1]! @ src1 vectors 2-3: q0 = x, q1 = y, q2 = z are now complete
- vld3.32 {d26, d28, d30}, [r2]! @ src2 vectors 0-1: x -> d26, y -> d28, z -> d30
- vld3.32 {d27, d29, d31}, [r2]! @ src2 vectors 2-3: q13 = x, q14 = y, q15 = z are now complete
- subs r3, r3, #4
-
- @ calculate values for the 1st set
- vmul.f32 q10, q0, q13 @ x products
- vmul.f32 q11, q1, q14 @ y products
- vmul.f32 q12, q2, q15 @ z products
-
- ble .L_mainloopend_vec3 @ flags come from the subs above: no further full set to load
-
-.L_mainloop_vec3:
- @ store the result for the current set
- vst3.32 {d20, d22, d24}, [r0]!
- vst3.32 {d21, d23, d25}, [r0]!
-
- @ load the next set of values
- vld3.32 {d0, d2, d4}, [r1]!
- vld3.32 {d1, d3, d5}, [r1]!
- vld3.32 {d26, d28, d30}, [r2]!
- vld3.32 {d27, d29, d31}, [r2]!
- subs r3, r3, #4
-
- @ calculate values for the next set
- vmul.f32 q10, q0, q13
- vmul.f32 q11, q1, q14
- vmul.f32 q12, q2, q15
-
- bgt .L_mainloop_vec3 @ loop if r3 > 0, if we have at least another 4 vectors (12 floats) to process
-
-.L_mainloopend_vec3:
- @ the last iteration for this call
- @ store the result for the last set
- vst3.32 {d20, d22, d24}, [r0]!
- vst3.32 {d21, d23, d25}, [r0]!
-
-.L_check_vec3:
- @ check if anything left to process at the end of the input array
- cmp r4, #0
- ble .L_return_vec3
-
-.L_secondloop_vec3:
- @ process the last few items left in the input array
- vld3.f32 {d0[0], d2[0], d4[0]}, [r1]! @ The values are loaded like so:
- @ q0 = { V1.x, -, -, - };
- @ q1 = { V1.y, -, -, - };
- @ q2 = { V1.z, -, -, - };
- vld3.f32 {d1[0], d3[0], d5[0]}, [r2]! @ The values are loaded like so:
- @ q0 = { V1.x, -, V2.x, - };
- @ q1 = { V1.y, -, V2.y, - };
- @ q2 = { V1.z, -, V2.z, - };
-
- subs r4, r4, #1
-
- @ calculate values: lane 0 of d0/d2/d4 becomes V1 * V2 for x/y/z
- vmul.f32 d0, d0, d1
- vmul.f32 d2, d2, d3
- vmul.f32 d4, d4, d5
-
- vst3.32 {d0[0], d2[0], d4[0]}, [r0]!
-
- bgt .L_secondloop_vec3 @ flags come from the subs above: loop while leftovers remain
-
-.L_return_vec3:
- @ return
- pop {r4}
- mov r0, #0
- bx lr
-
-
-
-
- .align 4
- .global vmul_vec4f_neon
- .thumb
- .thumb_func
-vmul_vec4f_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t vmul_vec4f(arm_vec4f_t * dst,
- @ arm_vec4f_t * src1,
- @ arm_vec4f_t * src2,
- @ unsigned int count)
- @
- @ For every item: dst[i].x = src1[i].x * src2[i].x, and the same for y,
- @ z and w. Vectors are processed four at a time in the main loop; the
- @ remaining (count % 4) vectors are processed one at a time in the
- @ second loop.
- @
- @ r0: *dst & current dst entry's address
- @ r1: *src1 & current src1 entry's address
- @ r2: *src2 & current src2 entry's address
- @ r3: int count & the number of items in the input array that can be
- @ processed in chunks of 4 vectors
- @
- @ r4: the number of items that are left to be processed at the end of
- @ the input array
- @
- @ q0-q3: de-interleaved src1, q8-q11: de-interleaved src2,
- @ q12-q15: products.
- @ d8-d15 (q4-q7) have to be preserved by a callee under the AAPCS and
- @ this routine does not save any NEON registers, so src2 is loaded into
- @ q8-q11 rather than q4-q7 and the products moved up to q12-q15.
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- push {r4}
- and r4, r3, #3 @ r4 = count % 4; the leftovers for the second loop
- sub r3, r3, r4 @ r3 = count - r4; a multiple of 4, consumed by the main loop
-
- cmp r3, #0
- beq .L_check_vec4 @ fewer than 4 vectors: go straight to the leftovers
-
- @ load the 1st set of values
- vld4.32 {d0, d2, d4, d6}, [r1]! @ src1 vectors 0-1: x -> d0, y -> d2, z -> d4, w -> d6
- vld4.32 {d1, d3, d5, d7}, [r1]! @ src1 vectors 2-3: q0 = x, q1 = y, q2 = z, q3 = w are now complete
- vld4.32 {d16, d18, d20, d22}, [r2]! @ src2 vectors 0-1: x -> d16, y -> d18, z -> d20, w -> d22
- vld4.32 {d17, d19, d21, d23}, [r2]! @ src2 vectors 2-3: q8 = x, q9 = y, q10 = z, q11 = w are now complete
-
- subs r3, r3, #4
-
- @ calculate values for the 1st set
- vmul.f32 q12, q0, q8 @ x products
- vmul.f32 q13, q1, q9 @ y products
- vmul.f32 q14, q2, q10 @ z products
- vmul.f32 q15, q3, q11 @ w products
-
- ble .L_mainloopend_vec4 @ flags come from the subs above: no further full set to load
-
-.L_mainloop_vec4:
- @ store the result for current set
- vst4.32 {d24, d26, d28, d30}, [r0]!
- vst4.32 {d25, d27, d29, d31}, [r0]!
-
- @ load the next set of values
- vld4.32 {d0, d2, d4, d6}, [r1]!
- vld4.32 {d1, d3, d5, d7}, [r1]!
- vld4.32 {d16, d18, d20, d22}, [r2]!
- vld4.32 {d17, d19, d21, d23}, [r2]!
- subs r3, r3, #4
-
- @ calculate values for the next set
- vmul.f32 q12, q0, q8
- vmul.f32 q13, q1, q9
- vmul.f32 q14, q2, q10
- vmul.f32 q15, q3, q11
-
- bgt .L_mainloop_vec4 @ loop if r3 > 0, if we have at least another 4 vectors (16 floats) to process
-
-.L_mainloopend_vec4:
- @ the last iteration for this call
- @ store the result for the last set
- vst4.32 {d24, d26, d28, d30}, [r0]!
- vst4.32 {d25, d27, d29, d31}, [r0]!
-
-.L_check_vec4:
- @ check if anything left to process at the end of the input array
- cmp r4, #0
- ble .L_return_vec4
-
-.L_secondloop_vec4:
- @ process the last few items left in the input array
- vld1.f32 {d0, d1}, [r1]! @ The values are loaded like so:
- @ q0 = { V1.x, V1.y, V1.z, V1.w };
- vld1.f32 {d2, d3}, [r2]! @ The values are loaded like so:
- @ q1 = { V2.x, V2.y, V2.z, V2.w };
-
- subs r4, r4, #1
-
- @ calculate values
- vmul.f32 q0, q0, q1
-
- vst1.32 {d0, d1}, [r0]!
-
- bgt .L_secondloop_vec4 @ flags come from the subs above: loop while leftovers remain
-
-.L_return_vec4:
- @ return
- pop {r4}
- mov r0, #0
- bx lr
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : source/NE10_mul_test.c
- */
-
-//Make sure the following are defined before including "unit_test.h"
-
-// length of the data arrays
-#define ARRLEN TEST_ARRLEN
-// number of the operations in a given unit
-#define OP_COUNT 4
-// number of the different implementations of each of the functions (C, ASM, NEON, ...)
-#define IMPL_COUNT 3
-
-
-#include "../headers/unit_test_x_operation_x.h"
-
-extern arm_result_t mul_float_c (arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count);
-//extern arm_result_t mul_float_asm (arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count); // the assembly versions haven't been implemented; these are for future use
-extern arm_result_t mul_float_neon(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count);
-
-extern arm_result_t vmul_vec2f_c (arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
-//extern arm_result_t vmul_vec2f_asm (arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
-extern arm_result_t vmul_vec2f_neon(arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
-
-extern arm_result_t vmul_vec3f_c (arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
-//extern arm_result_t vmul_vec3f_asm (arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
-extern arm_result_t vmul_vec3f_neon(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
-
-extern arm_result_t vmul_vec4f_c (arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
-//extern arm_result_t vmul_vec4f_asm (arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
-extern arm_result_t vmul_vec4f_neon(arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
-
-void init_ftbl()
-{
- // Every implementation that really exists, laid out in the same order as
- // the global function table: one row per operation (float, vec2f, vec3f,
- // vec4f), and within a row the C, ASM and NEON flavours. The middle (ASM)
- // column reuses the C version in place of the assembly version.
- static const arm_func_4args_t available_impls[] =
- {
- (arm_func_4args_t) mul_float_c, (arm_func_4args_t) mul_float_c, (arm_func_4args_t) mul_float_neon,
- (arm_func_4args_t) vmul_vec2f_c, (arm_func_4args_t) vmul_vec2f_c, (arm_func_4args_t) vmul_vec2f_neon,
- (arm_func_4args_t) vmul_vec3f_c, (arm_func_4args_t) vmul_vec3f_c, (arm_func_4args_t) vmul_vec3f_neon,
- (arm_func_4args_t) vmul_vec4f_c, (arm_func_4args_t) vmul_vec4f_c, (arm_func_4args_t) vmul_vec4f_neon
- };
- unsigned int slot;
-
- // copy the twelve entries into slots 0..11 of the global table
- for ( slot = 0; slot < sizeof( available_impls ) / sizeof( available_impls[0] ); slot++ )
- {
- ftbl[ slot ] = available_impls[ slot ];
- }
-}
-
-arm_result_t main( int argc, char **argv )
-{
- // hand the command line to the shared test driver (see "unit_test.h")
- // and pass its result straight back as this program's result
- const arm_result_t outcome = run_test( argc, argv );
- return outcome;
-}
+++ /dev/null
-@
-@ Copyright 2011-12 ARM Limited
-@
-@ Licensed under the Apache License, Version 2.0 (the "License");
-@ you may not use this file except in compliance with the License.
-@ You may obtain a copy of the License at
-@
-@ http://www.apache.org/licenses/LICENSE-2.0
-@
-@ Unless required by applicable law or agreed to in writing, software
-@ distributed under the License is distributed on an "AS IS" BASIS,
-@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@ See the License for the specific language governing permissions and
-@ limitations under the License.
-@
-
-@
-@ NE10 Library : source/NE10_mulc.asm.s
-@
-
- .text
- .syntax unified
-
-.include "headers/NE10header.s"
-
- .balign 4
- .global mulc_float_asm
- .thumb
- .thumb_func
-
-mulc_float_asm:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t mulc_float(arm_float_t * dst,
- @ arm_float_t * src, const arm_float_t cst,
- @ unsigned int count)
- @
- @ For every item: dst[i] = src[i] * cst
- @
- @ r0: *dst
- @ r1: *src
- @ r2: cst (the float's bit pattern, moved into s3 inside the loop)
- @ r3: int count
- @
- @ r3: loop counter
- @ r5: current item's offset in both src[] and dst[]
- @ r6: current source item's address made of base(r1)+offset(r5)
- @ r7: current destination item's address made of base(r0)+offset(r5)
- @
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- push {r4, r5, r6, r7} @ r4 is saved and restored although this routine never writes it
- cbz r3, .LoopEndFloat @ nothing to do when count is zero
- mov r5, #0
-
-.LoopBeginFloat:
- add r6, r1, r5 @ Get current source item's address in memory
- vldr s1, [r6, #0] @ Load src[i]
- vmov s3, r2 @ Get cst into register s3
- vmul.f32 s10, s1, s3 @ s10 = src[i] * cst
- add r7, r0, r5 @ Get current destination item's address in memory
- vstr s10, [r7, #0] @ Store the result back into the main memory
- add r5, r5, #4 @ increase the offset by 1*sizeof(float)
- subs r3, r3, #1 @ one item done; this also sets the flags tested by the branch below
- bne .LoopBeginFloat @ Loop again while the remaining count in r3 is non-zero
-
-.LoopEndFloat:
- mov r0, NE10_OK @ Return NE10_OK
- pop {r4, r5, r6, r7}
- bx lr
-
-
-
-
- .balign 4
- .global mulc_vec2f_asm
- .thumb
- .thumb_func
-
-mulc_vec2f_asm:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t mulc_vec2f(arm_vec2f_t * dst,
- @ arm_vec2f_t * src, const arm_vec2f_t * cst,
- @ unsigned int count)
- @
- @ For every item: dst[i] = src[i] * (*cst), done separately for the
- @ x and y components.
- @
- @ r0: *dst
- @ r1: *src
- @ r2: *cst
- @ r3: int count
- @
- @ r3: loop counter
- @ r5: current item's offset in both src[] and dst[]
- @ r6: current source item's address made of base(r1)+offset(r5)
- @ r7: current destination item's address made of base(r0)+offset(r5)
- @
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- push {r4, r5, r6, r7} @ r4 is saved and restored although this routine never writes it
- cbz r3, .LoopEndVec2F @ nothing to do when count is zero
- mov r5, #0
-
-.LoopBeginVec2F:
- add r6, r1, r5 @ Get current source item's address in memory
- vldr s1, [r6, #0] @ Load src[i].x and src[i].y
- vldr s2, [r6, #4]
- vldr s3, [r2, #0] @ Load cst->x and cst->y (reloaded on every iteration)
- vldr s4, [r2, #4]
- vmul.f32 s10, s1, s3 @ s10 = src[i].x * cst->x
- vmul.f32 s11, s2, s4 @ s11 = src[i].y * cst->y
- add r7, r0, r5 @ Get current destination item's address in memory
- vstr s10, [r7, #0] @ Store the results back into the main memory
- vstr s11, [r7, #4]
- add r5, r5, #8 @ increase the offset by 2*sizeof(float) @@ (for x and y)
- subs r3, r3, #1 @ one item done; this also sets the flags tested by the branch below
- bne .LoopBeginVec2F @ Loop again while the remaining count in r3 is non-zero
-
-.LoopEndVec2F:
- mov r0, NE10_OK @ Return NE10_OK
- pop {r4, r5, r6, r7}
- bx lr
-
-
-
-
- .balign 4
- .global mulc_vec3f_asm
- .thumb
- .thumb_func
-
-mulc_vec3f_asm:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t mulc_vec3f(arm_vec3f_t * dst,
- @ arm_vec3f_t * src, const arm_vec3f_t * cst,
- @ unsigned int count)
- @
- @ For every item: dst[i] = src[i] * (*cst), done separately for the
- @ x, y, and z components.
- @
- @ r0: *dst
- @ r1: *src
- @ r2: *cst
- @ r3: int count
- @
- @ r3: loop counter
- @ r5: current item's offset in both src[] and dst[]
- @ r6: current source item's address made of base(r1)+offset(r5)
- @ r7: current destination item's address made of base(r0)+offset(r5)
- @
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- push {r4, r5, r6, r7} @ r4 is saved and restored although this routine never writes it
- cbz r3, .LoopEndVec3F @ nothing to do when count is zero
- mov r5, #0
-
-.LoopBeginVec3F:
- add r6, r1, r5 @ Get current source item's address in memory
- vldr s1, [r6, #0] @ Load src[i].x, src[i].y , and src[i].z
- vldr s2, [r6, #4]
- vldr s3, [r6, #8]
- vldr s4, [r2, #0] @ Load cst->x, cst->y, and cst->z (reloaded on every iteration)
- vldr s5, [r2, #4]
- vldr s6, [r2, #8]
- vmul.f32 s10, s1, s4 @ s10 = src[i].x * cst->x
- vmul.f32 s11, s2, s5 @ s11 = src[i].y * cst->y
- vmul.f32 s12, s3, s6 @ s12 = src[i].z * cst->z
- add r7, r0, r5 @ Get current destination item's address in memory
- vstr s10, [r7, #0] @ Store the results back into the main memory
- vstr s11, [r7, #4]
- vstr s12, [r7, #8]
- add r5, r5, #12 @ increase the offset by 3*sizeof(float) @@ (for x, y, and z)
- subs r3, r3, #1 @ one item done; this also sets the flags tested by the branch below
- bne .LoopBeginVec3F @ Loop again while the remaining count in r3 is non-zero
-
-.LoopEndVec3F:
- mov r0, NE10_OK @ Return NE10_OK
- pop {r4, r5, r6, r7}
- bx lr
-
-
-
-
- .balign 4
- .global mulc_vec4f_asm
- .thumb
- .thumb_func
-
-mulc_vec4f_asm:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t mulc_vec4f(arm_vec4f_t * dst,
- @ arm_vec4f_t * src, const arm_vec4f_t * cst,
- @ unsigned int count)
- @
- @ Component-wise multiply by a constant vector. For each of the
- @ `count` items: dst[i].x = src[i].x * cst->x, and likewise for
- @ y, z, and w. One vec4 is processed per loop iteration using
- @ scalar VFP loads, multiplies, and stores.
- @
- @ r0: *dst
- @ r1: *src
- @ r2: *cst
- @ r3: unsigned int count
- @
- @ r3: loop counter (counts down to zero)
- @ r5: current item's offset in both src[] and dst[]
- @ r6: current source item's address made of base(r1)+offset(r5)
- @ r7: current destination item's address made of base(r0)+offset(r5)
- @
- @ NOTE: r4 is pushed and popped below but is never used here.
- @
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- push {r4, r5, r6, r7}
- cbz r3, .LoopEndVec4F @ Nothing to process when count == 0
- mov r5, #0 @ Start at byte offset 0 (the first item)
-
-.LoopBeginVec4F:
- add r6, r1, r5 @ Get current source item's address in memory
- vldr s1, [r6, #0] @ Load src[i].x, src[i].y , src[i].z, and w
- vldr s2, [r6, #4]
- vldr s3, [r6, #8]
- vldr s4, [r6, #12]
- vldr s5, [r2, #0] @ Load cst->x, cst->y, cst->z, and w (re-read on every iteration)
- vldr s6, [r2, #4]
- vldr s7, [r2, #8]
- vldr s8, [r2, #12]
- vmul.f32 s10, s1, s5 @ s10 = src[i].x * cst->x
- vmul.f32 s11, s2, s6 @ s11 = src[i].y * cst->y
- vmul.f32 s12, s3, s7 @ s12 = src[i].z * cst->z
- vmul.f32 s13, s4, s8 @ s13 = src[i].w * cst->w
- add r7, r0, r5 @ Get current destination item's address in memory
- vstr s10, [r7, #0] @ Store the results back into the main memory
- vstr s11, [r7, #4]
- vstr s12, [r7, #8]
- vstr s13, [r7, #12]
- add r5, r5, #16 @ increase the offset by 4*sizeof(float) @@ (for x, y, z, and w)
- subs r3, r3, #1 @ decrement the remaining-item counter and set the flags
- bne .LoopBeginVec4F @ Loop again while the counter (r3) is non-zero
-
-.LoopEndVec4F:
- mov r0, NE10_OK @ Return NE10_OK (symbol is not defined in this routine; presumably from the included header)
- pop {r4, r5, r6, r7}
- bx lr
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : source/NE10_mulc.c
- */
-
-#include "NE10.h"
-#include "../headers/macros.h"
-
-#include <assert.h>
-
-/*
- * Plain-C reference: dst[i] = src[i] * cst for each array element.
- *
- * The loop itself is not written here: the statement below is handed to
- * NE10_XC_OPERATION_X_C (from ../headers/macros.h, not shown in this file),
- * which presumably supplies the `itr` index, iterates `count` times, and
- * produces the return value -- confirm against the macro definition.
- */
-arm_result_t mulc_float_c(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count)
-{
- NE10_XC_OPERATION_X_C
- (
- dst[ itr ] = src[ itr ] * cst;
- );
-}
-
-/*
- * Plain-C reference: component-wise product of each src[i] with the
- * constant vector *cst (x with x, y with y), stored in dst[i].
- *
- * The statements are passed to NE10_XC_OPERATION_X_C (defined in
- * ../headers/macros.h, not shown here), which presumably provides the
- * `itr` loop over `count` items and the return value -- confirm there.
- */
-arm_result_t mulc_vec2f_c(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count)
-{
- NE10_XC_OPERATION_X_C
- (
- dst[ itr ].x = src[ itr ].x * cst->x;
- dst[ itr ].y = src[ itr ].y * cst->y;
- );
-}
-
-/*
- * Plain-C reference: component-wise product of each src[i] with the
- * constant vector *cst (x, y, and z independently), stored in dst[i].
- *
- * The statements are passed to NE10_XC_OPERATION_X_C (defined in
- * ../headers/macros.h, not shown here), which presumably provides the
- * `itr` loop over `count` items and the return value -- confirm there.
- */
-arm_result_t mulc_vec3f_c(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count)
-{
- NE10_XC_OPERATION_X_C
- (
- dst[ itr ].x = src[ itr ].x * cst->x;
- dst[ itr ].y = src[ itr ].y * cst->y;
- dst[ itr ].z = src[ itr ].z * cst->z;
- );
-}
-
-/*
- * Plain-C reference: component-wise product of each src[i] with the
- * constant vector *cst (x, y, z, and w independently), stored in dst[i].
- *
- * The statements are passed to NE10_XC_OPERATION_X_C (defined in
- * ../headers/macros.h, not shown here), which presumably provides the
- * `itr` loop over `count` items and the return value -- confirm there.
- */
-arm_result_t mulc_vec4f_c(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count)
-{
- NE10_XC_OPERATION_X_C
- (
- dst[ itr ].x = src[ itr ].x * cst->x;
- dst[ itr ].y = src[ itr ].y * cst->y;
- dst[ itr ].z = src[ itr ].z * cst->z;
- dst[ itr ].w = src[ itr ].w * cst->w;
- );
-}
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : source/NE10_mulc.neon.c
- */
-
-#include "NE10.h"
-#include "../headers/macros.h"
-
-#include <assert.h>
-#include <arm_neon.h>
-
-
-/*
- * NEON-intrinsics version of the scalar-constant multiply.
- *
- * Two code fragments are passed to NE10_XC_OPERATION_FLOAT_NEON (from
- * ../headers/macros.h, not shown here):
- *   1) a 128-bit multiply (vmulq_f32) of n_src by n_cst into n_dst;
- *   2) a 64-bit multiply (vmul_f32) that updates n_tmp_src in place.
- * The n_* variables are not declared in this function; presumably the
- * macro declares, loads, and stores them, using fragment 1 for the bulk
- * of the array and fragment 2 for leftover items -- confirm in macros.h.
- */
-arm_result_t mulc_float_neon(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count)
-{
- NE10_XC_OPERATION_FLOAT_NEON
- (
- n_dst = vmulq_f32( n_src , n_cst );
- ,
- n_tmp_src = vmul_f32( n_tmp_src, n_tmp_cst );
- );
-}
-
-/*
- * NEON-intrinsics version of the vec2 component-wise constant multiply.
- *
- * Two code fragments are passed to NE10_XC_OPERATION_VEC2F_NEON (from
- * ../headers/macros.h, not shown here):
- *   1) a 128-bit multiply (vmulq_f32) of n_src by n_cst into n_dst;
- *   2) a 64-bit multiply (vmul_f32) that updates n_tmp_src in place.
- * The n_* variables are not declared in this function; presumably the
- * macro declares, loads, and stores them, using fragment 1 for the bulk
- * of the array and fragment 2 for leftover items -- confirm in macros.h.
- */
-arm_result_t mulc_vec2f_neon(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count)
-{
- NE10_XC_OPERATION_VEC2F_NEON
- (
- n_dst = vmulq_f32( n_src , n_cst );
- ,
- n_tmp_src = vmul_f32( n_tmp_src, n_tmp_cst );
- );
-}
-
-/*
- * NEON-intrinsics version of the vec3 component-wise constant multiply.
- *
- * Two code fragments are passed to NE10_XC_OPERATION_VEC3F_NEON (from
- * ../headers/macros.h, not shown here):
- *   1) three 128-bit multiplies (vmulq_f32), n_srcK by n_cstK into n_dstK
- *      for K = 1..3;
- *   2) three 64-bit multiplies (vmul_f32) that update n_tmp_src.val[0..2]
- *      in place using n_tmp_cst.val[0..2].
- * The n_* variables are not declared in this function; presumably the
- * macro declares, loads, and stores them, using fragment 1 for the bulk
- * of the array and fragment 2 for leftover items -- confirm in macros.h.
- */
-arm_result_t mulc_vec3f_neon(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count)
-{
- NE10_XC_OPERATION_VEC3F_NEON
- (
- n_dst1 = vmulq_f32( n_src1 , n_cst1 );
- n_dst2 = vmulq_f32( n_src2 , n_cst2 );
- n_dst3 = vmulq_f32( n_src3 , n_cst3 );
- ,
- n_tmp_src.val[0] = vmul_f32( n_tmp_src.val[0], n_tmp_cst.val[0] );
- n_tmp_src.val[1] = vmul_f32( n_tmp_src.val[1], n_tmp_cst.val[1] );
- n_tmp_src.val[2] = vmul_f32( n_tmp_src.val[2], n_tmp_cst.val[2] );
- );
-}
-
-/*
- * NEON-intrinsics version of the vec4 component-wise constant multiply.
- *
- * A single code fragment is passed to NE10_XC_OPERATION_VEC4F_NEON (from
- * ../headers/macros.h, not shown here): a 128-bit multiply (vmulq_f32)
- * of n_src by n_cst into n_dst. Unlike the float/vec2/vec3 variants no
- * second "leftover" fragment is supplied -- presumably because one vec4
- * fills a whole 128-bit register; confirm in macros.h.
- */
-arm_result_t mulc_vec4f_neon(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count)
-{
- NE10_XC_OPERATION_VEC4F_NEON
- (
- n_dst = vmulq_f32( n_src , n_cst );
- );
-}
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : source/NE10_mulc_test.c
- */
-
-//Make sure the following are defined before including "unit_test.h"
-
-// length of the data arrays
-#define ARRLEN TEST_ARRLEN
-// number of the operations in a given unit
-#define OP_COUNT 4
-// number of the different implementations of each of the functions (C, ASM, NEON, ...)
-#define IMPL_COUNT 3
-
-
-#include "../headers/unit_test_xc_operation_x.h"
-
-void init_ftbl()
-{
- // Implementations to register in the global function table, one row
- // per operation (float, vec2f, vec3f, vec4f) in the order C, ASM, NEON.
- const arm_func_4args_t entries[] =
- {
- (arm_func_4args_t) mulc_float_c, (arm_func_4args_t) mulc_float_asm, (arm_func_4args_t) mulc_float_neon,
- (arm_func_4args_t) mulc_vec2f_c, (arm_func_4args_t) mulc_vec2f_asm, (arm_func_4args_t) mulc_vec2f_neon,
- (arm_func_4args_t) mulc_vec3f_c, (arm_func_4args_t) mulc_vec3f_asm, (arm_func_4args_t) mulc_vec3f_neon,
- (arm_func_4args_t) mulc_vec4f_c, (arm_func_4args_t) mulc_vec4f_asm, (arm_func_4args_t) mulc_vec4f_neon
- };
- unsigned int slot;
-
- // Copy the rows into ftbl[0..11], preserving their order.
- for ( slot = 0; slot < sizeof( entries ) / sizeof( entries[ 0 ] ); slot++ )
- {
- ftbl[ slot ] = entries[ slot ];
- }
-}
-
-arm_result_t main( int argc, char **argv )
-{
- // Hand the command line to the shared test driver (declared in
- // "unit_test.h" per the original note) and return its result.
- const arm_result_t status = run_test( argc, argv );
- return status;
-}
+++ /dev/null
-@
-@ Copyright 2011-12 ARM Limited
-@
-@ Licensed under the Apache License, Version 2.0 (the "License");
-@ you may not use this file except in compliance with the License.
-@ You may obtain a copy of the License at
-@
-@ http://www.apache.org/licenses/LICENSE-2.0
-@
-@ Unless required by applicable law or agreed to in writing, software
-@ distributed under the License is distributed on an "AS IS" BASIS,
-@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@ See the License for the specific language governing permissions and
-@ limitations under the License.
-@
-
-@
-@ NE10 Library : source/NE10_mulcmatvec.asm.s
-@
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : source/NE10_mulcmatvec.c
- */
-
-#include "NE10.h"
-#include "../headers/macros.h"
-
-#include <assert.h>
-
-/*
- * Plain-C reference: multiplies each vec2 in src[] by the constant 2x2
- * matrix *cst and stores the result in dst[]:
- *   dst.x = c1.r1 * x + c2.r1 * y
- *   dst.y = c1.r2 * x + c2.r2 * y
- *
- * Each input vector is copied to a local before dst[ itr ] is written, so
- * the function also gives correct results when dst and src are the same
- * array (in-place use). Without the copy, dst.y would be computed from an
- * already overwritten x.
- *
- * The loop over `itr`/`count` and the return value come from
- * NE10_CMATVEC_OPERATION_X_C (../headers/macros.h, not shown here).
- */
-arm_result_t mulcmatvec_cm2x2f_v2f_c (arm_vec2f_t * dst, const arm_mat2x2f_t * cst, arm_vec2f_t * src, unsigned int count)
-{
- #define A1 cst->c1.r1
- #define B1 cst->c1.r2
- #define C1 cst->c2.r1
- #define D1 cst->c2.r2
-
- NE10_CMATVEC_OPERATION_X_C
- (
- const arm_vec2f_t vec_in = src[ itr ];
-
- dst[ itr ].x = A1 * vec_in.x + C1 * vec_in.y;
- dst[ itr ].y = B1 * vec_in.x + D1 * vec_in.y;
- );
-
- #undef A1
- #undef B1
- #undef C1
- #undef D1
-}
-
-/*
- * Plain-C reference: multiplies each vec3 in src[] by the constant 3x3
- * matrix *cst and stores the result in dst[]:
- *   dst.x = c1.r1 * x + c2.r1 * y + c3.r1 * z
- *   dst.y = c1.r2 * x + c2.r2 * y + c3.r2 * z
- *   dst.z = c1.r3 * x + c2.r3 * y + c3.r3 * z
- *
- * Each input vector is copied to a local before dst[ itr ] is written, so
- * the function also gives correct results when dst and src are the same
- * array (in-place use). Without the copy, later components would be
- * computed from already overwritten inputs.
- *
- * The loop over `itr`/`count` and the return value come from
- * NE10_CMATVEC_OPERATION_X_C (../headers/macros.h, not shown here).
- */
-arm_result_t mulcmatvec_cm3x3f_v3f_c (arm_vec3f_t * dst, const arm_mat3x3f_t * cst, arm_vec3f_t * src, unsigned int count)
-{
- #define A1 cst->c1.r1
- #define B1 cst->c1.r2
- #define C1 cst->c1.r3
- #define D1 cst->c2.r1
- #define E1 cst->c2.r2
- #define F1 cst->c2.r3
- #define G1 cst->c3.r1
- #define H1 cst->c3.r2
- #define I1 cst->c3.r3
-
- NE10_CMATVEC_OPERATION_X_C
- (
- const arm_vec3f_t vec_in = src[ itr ];
-
- dst[ itr ].x = A1 * vec_in.x + D1 * vec_in.y + G1 * vec_in.z;
- dst[ itr ].y = B1 * vec_in.x + E1 * vec_in.y + H1 * vec_in.z;
- dst[ itr ].z = C1 * vec_in.x + F1 * vec_in.y + I1 * vec_in.z;
- );
-
- #undef A1
- #undef B1
- #undef C1
- #undef D1
- #undef E1
- #undef F1
- #undef G1
- #undef H1
- #undef I1
-}
-
-/*
- * Plain-C reference: multiplies each vec4 in src[] by the constant 4x4
- * matrix *cst and stores the result in dst[]:
- *   dst.x = c1.r1 * x + c2.r1 * y + c3.r1 * z + c4.r1 * w
- *   dst.y = c1.r2 * x + c2.r2 * y + c3.r2 * z + c4.r2 * w
- *   dst.z = c1.r3 * x + c2.r3 * y + c3.r3 * z + c4.r3 * w
- *   dst.w = c1.r4 * x + c2.r4 * y + c3.r4 * z + c4.r4 * w
- *
- * Each input vector is copied to a local before dst[ itr ] is written, so
- * the function also gives correct results when dst and src are the same
- * array (in-place use). Without the copy, later components would be
- * computed from already overwritten inputs.
- *
- * The redundant `extern` on this definition was dropped to match the 2x2
- * and 3x3 variants; linkage is unchanged.
- *
- * The loop over `itr`/`count` and the return value come from
- * NE10_CMATVEC_OPERATION_X_C (../headers/macros.h, not shown here).
- */
-arm_result_t mulcmatvec_cm4x4f_v4f_c (arm_vec4f_t * dst, const arm_mat4x4f_t * cst, arm_vec4f_t * src, unsigned int count)
-{
- #define A1 cst->c1.r1
- #define B1 cst->c1.r2
- #define C1 cst->c1.r3
- #define D1 cst->c1.r4
- #define E1 cst->c2.r1
- #define F1 cst->c2.r2
- #define G1 cst->c2.r3
- #define H1 cst->c2.r4
- #define I1 cst->c3.r1
- #define J1 cst->c3.r2
- #define K1 cst->c3.r3
- #define L1 cst->c3.r4
- #define M1 cst->c4.r1
- #define N1 cst->c4.r2
- #define O1 cst->c4.r3
- #define P1 cst->c4.r4
-
- NE10_CMATVEC_OPERATION_X_C
- (
- const arm_vec4f_t vec_in = src[ itr ];
-
- dst[ itr ].x = A1 * vec_in.x + E1 * vec_in.y + I1 * vec_in.z + M1 * vec_in.w;
- dst[ itr ].y = B1 * vec_in.x + F1 * vec_in.y + J1 * vec_in.z + N1 * vec_in.w;
- dst[ itr ].z = C1 * vec_in.x + G1 * vec_in.y + K1 * vec_in.z + O1 * vec_in.w;
- dst[ itr ].w = D1 * vec_in.x + H1 * vec_in.y + L1 * vec_in.z + P1 * vec_in.w;
- );
-
- #undef A1
- #undef B1
- #undef C1
- #undef D1
- #undef E1
- #undef F1
- #undef G1
- #undef H1
- #undef I1
- #undef J1
- #undef K1
- #undef L1
- #undef M1
- #undef N1
- #undef O1
- #undef P1
-}
+++ /dev/null
-@
-@ Copyright 2011-12 ARM Limited
-@
-@ Licensed under the Apache License, Version 2.0 (the "License");
-@ you may not use this file except in compliance with the License.
-@ You may obtain a copy of the License at
-@
-@ http://www.apache.org/licenses/LICENSE-2.0
-@
-@ Unless required by applicable law or agreed to in writing, software
-@ distributed under the License is distributed on an "AS IS" BASIS,
-@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@ See the License for the specific language governing permissions and
-@ limitations under the License.
-@
-
-@
-@ NE10 Library : source/NE10_mulcmatvec.neon.s
-@
-
-
-
-
- .text
- .syntax unified
-
-.include "headers/NE10header.s"
-
-
-
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ This macro multiplies a single 2x2 matrix by four vec2's.
- @ The elements of the vectors are loaded de-interleaved into
- @ registers q8-q9 by the caller (mulcmatvec_cm2x2f_v2f_neon):
- @
- @ q8=(x1,x2,x3,x4) q9=(y1,y2,y3,y4)
- @
- @ The matrix elements are read from d0[0] (a), d1[0] (b),
- @ d2[0] (c), and d3[0] (d).
- @
- @ The results are returned in q12-q13 in the same layout:
- @ q12 = a*x + c*y (new x's), q13 = b*x + d*y (new y's).
- @ q8-q11 are overwritten with the intermediate products.
- @
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- .macro MUL_MAT2x2_VEC2
- vmul.f32 q10, q8 , d0[0] @ a*x1,x2,x3,x4
- vmul.f32 q8 , q8 , d1[0] @ b*x1,x2,x3,x4
- vmul.f32 q11, q9 , d2[0] @ c*y1,y2,y3,y4
- vmul.f32 q9 , q9 , d3[0] @ d*y1,y2,y3,y4
-
- vadd.f32 q12, q10, q11 @ res.x = a*(x1,x2,x3,x4) + c*(y1,y2,y3,y4)
- vadd.f32 q13, q8, q9 @ res.y = b*(x1,x2,x3,x4) + d*(y1,y2,y3,y4)
- .endm
-
-
-
-
- .balign 4
- .global mulcmatvec_cm2x2f_v2f_neon
- .thumb
- .thumb_func
-
-mulcmatvec_cm2x2f_v2f_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t mulcmatvec_cm2x2f_v2f ( arm_vec2f_t * dst,
- @ const arm_mat2x2f_t * cst,
- @ arm_vec2f_t * src,
- @ unsigned int count)
- @
- @ Multiplies each of the `count` vec2's in src[] by the constant
- @ 2x2 matrix *cst and stores the products in dst[]. Vectors are
- @ processed four at a time; the remaining (count % 4) vectors are
- @ processed one at a time at the end.
- @
- @ r0: *dst & current dst entry's address
- @ (this register is updated and moved to the next entry
- @ after every store operation)
- @ r1: *cst, memory pointer to where the constant matrix is kept
- @ r2: *src & current src entry's address
- @ r3: int count & the number of items in the input array
- @
- @ r4: the number of items that are left to be processed at the
- @ end of the input array
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- push {r4}
- and r4, r3, #3 @ r4 = count % 4;
- sub r3, r3, r4 @ count = count - r4; r3 is now a multiple of 4, r4 is what's left to be processed after the main loop
-
- @ First we load the constant 2x2 matrix, then each time we load
- @ four vectors of 2-floats, multiply each vector with the matrix,
- @ finally store the resulting vectors in the destination memory
- @ address, and move on to the next four vectors.
-
- @ load the constant matrix
- @ d0 = m11(a) d2 = m12(c)
- @ d1 = m21(b) d3 = m22(d)
- vld4.32 { d0[0], d1[0], d2[0], d3[0] }, [r1]
-
- cmp r3, #0
- beq .L_check_mat2x2 @ fewer than four vectors in total: skip the main loop
-
- @ load the 1st set of values
- @ if {V1, V2, V3, V4} are 4 vec2's in memory
- @ then after the load operations the 4 vectors
- @ are stored in registers q8-q9 like so:
- @
- @ q8=(x1,x2,x3,x4)
- @ q9=(y1,y2,y3,y4)
-
- vld2.32 { d16, d17, d18, d19 }, [r2]!
-
- subs r3, r3, #4 @ 4 vectors consumed by this set
-
- @ calculate values for the 1st set
- @ (the macro contains only NEON instructions, which leave the
- @ condition flags from the subs above untouched)
- MUL_MAT2x2_VEC2
-
- ble .L_mainloopend_mat2x2
-
-.L_mainloop_mat2x2:
- @ store the result for the current set
- vst2.32 { d24, d25, d26, d27 }, [r0]!
-
- @ load the next set of values
- vld2.32 { d16, d17, d18, d19 }, [r2]!
- subs r3, r3, #4
-
- @ calculate values for the next set
- MUL_MAT2x2_VEC2
-
- bgt .L_mainloop_mat2x2 @ loop while r3 > 0, i.e. we have at least another 4 vectors (8 floats) to process
-
-.L_mainloopend_mat2x2:
- @ the last iteration for this call
- @ store the result for the last set
- vst2.32 { d24, d25, d26, d27 }, [r0]!
-
-.L_check_mat2x2:
- @ check if anything left to process at the end of the input array
- cmp r4, #0
- ble .L_return_mat2x2
-
-.L_secondloop_mat2x2:
- @ process the last few items left in the input array
- @ (one vec2 per iteration; only lane 0 of q8/q9 is loaded)
- vld2.32 { d16[0], d18[0] }, [r2]!
-
- subs r4, r4, #1
-
- @ calculate values
- MUL_MAT2x2_VEC2
-
- @ store the results (lane 0 only)
- vst2.32 { d24[0], d26[0] }, [r0]!
-
- bgt .L_secondloop_mat2x2 @ flags still come from the subs above
-
-.L_return_mat2x2:
- @ return
- pop {r4}
- mov r0, #0
- bx lr
-
-
-
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ A macro to load four vec3's into registers q8-q10.
- @ r2 is the address register and is advanced past the loaded data.
- @ Each vld3 de-interleaves two vec3's, so after both loads:
- @ q8=(x1,x2,x3,x4) q9=(y1,y2,y3,y4) q10=(z1,z2,z3,z4)
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- .macro LOAD_FOUR_VEC3
- vld3.32 { d16, d18, d20 }, [r2]!
- vld3.32 { d17, d19, d21 }, [r2]!
- .endm
-
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ This macro multiplies the constant 3x3 matrix loaded into
- @ registers d0-d5 by four vec3's that the above macro LOAD_FOUR_VEC3
- @ loads (q8 = x's, q9 = y's, q10 = z's).
- @ The results are returned in registers q11, q12, and q13:
- @ q11 = x*d0[0] + y*d0[1] + z*d1[0]
- @ q12 = x*d2[0] + y*d2[1] + z*d3[0]
- @ q13 = x*d4[0] + y*d4[1] + z*d5[0]
- @ The inputs q8-q10 are left unchanged.
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- .macro MUL_MAT3x3_VEC3
-
- vmul.f32 q11, q8 , d0[0]
- vmla.f32 q11, q9 , d0[1]
- vmla.f32 q11, q10, d1[0]
-
- vmul.f32 q12, q8 , d2[0]
- vmla.f32 q12, q9 , d2[1]
- vmla.f32 q12, q10, d3[0]
-
- vmul.f32 q13, q8 , d4[0]
- vmla.f32 q13, q9 , d4[1]
- vmla.f32 q13, q10, d5[0]
-
- .endm
-
-
-
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ A macro to store the resulting vec3's that were returned in
- @ registers q11 to q13 in the above macro MUL_MAT3x3_VEC3.
- @ Each vst3 re-interleaves two vec3's (x,y,z,x,y,z) at [r0] and
- @ advances r0 past the stored data.
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- .macro STORE_FOUR_VEC3
-
- vst3.32 { d22, d24, d26 }, [r0]!
- vst3.32 { d23, d25, d27 }, [r0]!
-
- .endm
-
-
-
-
- .align 2
- .global mulcmatvec_cm3x3f_v3f_neon
- .thumb
- .thumb_func
-
-mulcmatvec_cm3x3f_v3f_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t mulcmatvec_cm3x3f_v3f ( arm_vec3f_t * dst,
- @ const arm_mat3x3f_t * cst,
- @ arm_vec3f_t * src,
- @ unsigned int count)
- @
- @ Multiplies each of the `count` vec3's in src[] by the constant
- @ 3x3 matrix *cst and stores the products in dst[]. Vectors are
- @ processed four at a time; the remaining (count % 4) vectors are
- @ processed one at a time at the end.
- @
- @ r0: *dst & current dst entry's address
- @ (this register is updated and moved to the next entry
- @ after every store operation)
- @ r1: *cst, memory pointer to where the constant matrix is kept
- @ r2: *src & current src entry's address
- @ r3: int count & the number of items in the input array
- @
- @ r4: the number of items that are left to be processed at the
- @ end of the input array
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- push { r4 }
- and r4, r3, #3 @ r4 = count % 4;
- sub r3, r3, r4 @ count = count - r4; r3 is now a multiple of 4, r4 is what's left to be processed after the main loop
-
- @ First we load the constant 3x3 matrix, then each time we load
- @ four vectors of 3-floats, multiply each vector with the matrix,
- @ finally store the resulting vectors in the destination memory
- @ address, and move on to the next four vectors.
-
- @ load the constant matrix into q0-q2
- @ (the first load de-interleaves floats 0-5 of the matrix:
- @ d0 = floats 0,3; d2 = floats 1,4; d4 = floats 2,5;
- @ the second puts floats 6,7,8 into d1[0], d3[0], d5[0])
- vld3.32 { d0 , d2 , d4 }, [r1]!
- vld3.32 { d1[0], d3[0], d5[0] }, [r1]
-
- cmp r3, #0
- beq .L_check_mat3x3 @ fewer than four vectors in total: skip the main loop
-
-
- @ load the 1st set of values
- LOAD_FOUR_VEC3
- subs r3, r3, #4 @ 4 for this set
-
- @ calculate values for the 1st set
- @ (the macro contains only NEON instructions, which leave the
- @ condition flags from the subs above untouched)
- MUL_MAT3x3_VEC3
-
- ble .L_mainloopend_mat3x3
-
-.L_mainloop_mat3x3:
- @ store the result for the current set
- STORE_FOUR_VEC3
-
- @ load the next set of values
- LOAD_FOUR_VEC3
- subs r3, r3, #4
-
- @ calculate values for the next set
- MUL_MAT3x3_VEC3
-
- bgt .L_mainloop_mat3x3 @ loop while r3 > 0, i.e. we have at least another 4 vectors (12 floats) to process
-
-.L_mainloopend_mat3x3:
- @ the last iteration for this call
- @ store the result for the last set
- STORE_FOUR_VEC3
-
-.L_check_mat3x3:
- @ check if anything left to process at the end of the input array
- cmp r4, #0
- ble .L_return_mat3x3
-
-.L_secondloop_mat3x3:
- @ process the last few items left in the input array
- @ (one vec3 per iteration; only lane 0 of q8-q10 is loaded)
- vld3.32 { d16[0], d18[0], d20[0] }, [r2]!
-
- subs r4, r4, #1
-
- MUL_MAT3x3_VEC3
-
- @ store the results (lane 0 only)
- vst3.32 { d22[0], d24[0], d26[0] }, [r0]!
-
- bgt .L_secondloop_mat3x3 @ flags still come from the subs above
-
-.L_return_mat3x3:
- @ return
- pop { r4 }
- mov r0, #0
- bx lr
-
-
-
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ A macro to load four vec4's into registers q8-q11.
- @ This macro uses r2 (the third parameter in
- @ mulcmatvec_cm4x4f_v4f_neon) as the address register and advances
- @ it past the loaded data. Each vld4 de-interleaves two vec4's, so
- @ after both loads:
- @ q8=(x1..x4) q9=(y1..y4) q10=(z1..z4) q11=(w1..w4)
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- .macro LOAD_FOUR_VEC4
- vld4.32 { d16, d18, d20, d22 }, [r2]!
- vld4.32 { d17, d19, d21, d23 }, [r2]!
- .endm
-
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ This macro multiplies the constant 4x4 matrix that is loaded
- @ (into d0-d7) in mulcmatvec_cm4x4f_v4f_neon by four vec4's that
- @ are loaded in the above macro LOAD_FOUR_VEC4
- @ (q8 = x's, q9 = y's, q10 = z's, q11 = w's).
- @ The resulting four vectors are returned in registers q12 to q15:
- @ q12 = x*d0[0] + y*d0[1] + z*d1[0] + w*d1[1]
- @ q13 = x*d2[0] + y*d2[1] + z*d3[0] + w*d3[1]
- @ q14 = x*d4[0] + y*d4[1] + z*d5[0] + w*d5[1]
- @ q15 = x*d6[0] + y*d6[1] + z*d7[0] + w*d7[1]
- @ The inputs q8-q11 are left unchanged.
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- .macro MUL_MAT4x4_VEC4
-
- vmul.f32 q12, q8 , d0[0]
- vmla.f32 q12, q9 , d0[1]
- vmla.f32 q12, q10, d1[0]
- vmla.f32 q12, q11, d1[1]
-
- vmul.f32 q13, q8 , d2[0]
- vmla.f32 q13, q9 , d2[1]
- vmla.f32 q13, q10, d3[0]
- vmla.f32 q13, q11, d3[1]
-
- vmul.f32 q14, q8 , d4[0]
- vmla.f32 q14, q9 , d4[1]
- vmla.f32 q14, q10, d5[0]
- vmla.f32 q14, q11, d5[1]
-
- vmul.f32 q15, q8 , d6[0]
- vmla.f32 q15, q9 , d6[1]
- vmla.f32 q15, q10, d7[0]
- vmla.f32 q15, q11, d7[1]
-
- .endm
-
-
-
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ This macro stores the results from the above macro MUL_MAT4x4_VEC4
- @ from registers q12-q15 into the destination memory (r0) which is
- @ the first parameter of mulcmatvec_cm4x4f_v4f_neon().
- @ Each vst4 re-interleaves two vec4's (x,y,z,w,x,y,z,w) and
- @ advances r0 past the stored data.
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- .macro STORE_FOUR_VEC4
-
- vst4.32 { d24, d26, d28, d30 }, [r0]!
- vst4.32 { d25, d27, d29, d31 }, [r0]!
-
- .endm
-
-
-
-
- .align 2
- .global mulcmatvec_cm4x4f_v4f_neon
- .thumb
- .thumb_func
-
-mulcmatvec_cm4x4f_v4f_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t mulcmatvec_cm4x4f_v4f ( arm_vec4f_t * dst,
- @ const arm_mat4x4f_t * cst,
- @ arm_vec4f_t * src,
- @ unsigned int count)
- @
- @ Multiplies each of the `count` vec4's in src[] by the constant
- @ 4x4 matrix *cst and stores the products in dst[]. Vectors are
- @ processed four at a time; the remaining (count % 4) vectors are
- @ processed one at a time at the end.
- @
- @ r0: *dst & current dst entry's address
- @ (this register is updated and moved to the next entry
- @ after every store operation)
- @ r1: *cst, pointer to memory where the constant matrix is kept
- @ r2: *src & current src entry's address
- @ r3: int count & the number of items in the input array
- @
- @ r4: the number of items that are left to be processed at the
- @ end of the input array
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- push {r4}
- and r4, r3, #3 @ r4 = count % 4;
- sub r3, r3, r4 @ count = count - r4; r3 is now a multiple of 4, r4 is what's left to be processed after the main loop
-
- @ First we load the constant 4x4 matrix, then each time we load
- @ four vectors of 4-floats, multiply each vector with the matrix,
- @ finally store the resulting vectors in the destination memory
- @ address, and move on to the next four vectors.
-
- @ load the constant matrix into q0-q3
- @ (each vld4 de-interleaves eight floats of the matrix:
- @ d0 = floats 0,4; d2 = floats 1,5; d4 = floats 2,6; d6 = floats 3,7;
- @ d1 = floats 8,12; d3 = floats 9,13; d5 = floats 10,14; d7 = floats 11,15)
- vld4.32 { d0, d2, d4, d6 }, [r1]!
- vld4.32 { d1, d3, d5, d7 }, [r1]
-
- cmp r3, #0
- beq .L_check_mat4x4 @ fewer than four vectors in total: skip the main loop
-
- @ load the 1st set of values
- LOAD_FOUR_VEC4
- subs r3, r3, #4
-
- @ calculate values for the 1st set
- @ (the macro contains only NEON instructions, which leave the
- @ condition flags from the subs above untouched)
- MUL_MAT4x4_VEC4
-
- ble .L_mainloopend_mat4x4
-
-.L_mainloop_mat4x4:
- @ store the result for the current set
- STORE_FOUR_VEC4
-
- @ load the next set of values
- LOAD_FOUR_VEC4
- subs r3, r3, #4
-
- @ calculate values for the next set
- MUL_MAT4x4_VEC4
-
- bgt .L_mainloop_mat4x4 @ loop while r3 > 0, i.e. we have at least another 4 vectors (16 floats) to process
-
-.L_mainloopend_mat4x4:
- @ the last iteration for this call
- @ store the result for the last set
- STORE_FOUR_VEC4
-
-.L_check_mat4x4:
- @ check if anything left to process at the end of the input array
- cmp r4, #0
- ble .L_return_mat4x4
-
-.L_secondloop_mat4x4:
- @ process the last few items left in the input array
- @ (one vec4 per iteration; only lane 0 of q8-q11 is loaded)
- vld4.32 { d16[0], d18[0], d20[0], d22[0] }, [r2]!
-
- subs r4, r4, #1
-
- @ calculate values
- MUL_MAT4x4_VEC4
-
- @ store the results (lane 0 only)
- vst4.32 { d24[0], d26[0], d28[0], d30[0] }, [r0]!
-
- bgt .L_secondloop_mat4x4 @ flags still come from the subs above
-
-.L_return_mat4x4:
- @ return
- pop {r4}
- mov r0, #0
- bx lr
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : source/NE10_mulcmatvec_test.c
- */
-
-//Make sure the following are defined before including "unit_test.h"
-
-// length of the data arrays
-#define ARRLEN TEST_ARRLEN_MATRICES
-// number of the operations in a given unit
-#define OP_COUNT 3
-// number of the different implementations of each of the functions (C, ASM, NEON, ...)
-#define IMPL_COUNT 3
-
-
-#include "../headers/unit_test_mulcmatvec_operation_x.h"
-
-extern arm_result_t mulcmatvec_cm4x4f_v4f_c (arm_vec4f_t * dst, const arm_mat4x4f_t * cst, arm_vec4f_t * src, unsigned int count);
-extern arm_result_t mulcmatvec_cm4x4f_v4f_neon (arm_vec4f_t * dst, const arm_mat4x4f_t * cst, arm_vec4f_t * src, unsigned int count);
-
-extern arm_result_t mulcmatvec_cm3x3f_v3f_c (arm_vec3f_t * dst, const arm_mat3x3f_t * cst, arm_vec3f_t * src, unsigned int count);
-extern arm_result_t mulcmatvec_cm3x3f_v3f_neon (arm_vec3f_t * dst, const arm_mat3x3f_t * cst, arm_vec3f_t * src, unsigned int count);
-
-extern arm_result_t mulcmatvec_cm2x2f_v2f_c (arm_vec2f_t * dst, const arm_mat2x2f_t * cst, arm_vec2f_t * src, unsigned int count);
-extern arm_result_t mulcmatvec_cm2x2f_v2f_neon (arm_vec2f_t * dst, const arm_mat2x2f_t * cst, arm_vec2f_t * src, unsigned int count);
-
-void init_ftbl()
-{
- // Implementations to register in the global function table, one row
- // per operation (2x2, 3x3, 4x4) in the order C, ASM, NEON. There is no
- // assembly version of these operations, so the C version fills the
- // ASM slot of every row.
- const arm_func_4args_t entries[] =
- {
- (arm_func_4args_t) mulcmatvec_cm2x2f_v2f_c, (arm_func_4args_t) mulcmatvec_cm2x2f_v2f_c, (arm_func_4args_t) mulcmatvec_cm2x2f_v2f_neon,
- (arm_func_4args_t) mulcmatvec_cm3x3f_v3f_c, (arm_func_4args_t) mulcmatvec_cm3x3f_v3f_c, (arm_func_4args_t) mulcmatvec_cm3x3f_v3f_neon,
- (arm_func_4args_t) mulcmatvec_cm4x4f_v4f_c, (arm_func_4args_t) mulcmatvec_cm4x4f_v4f_c, (arm_func_4args_t) mulcmatvec_cm4x4f_v4f_neon
- };
- unsigned int slot;
-
- // Copy the rows into ftbl[0..8], preserving their order.
- for ( slot = 0; slot < sizeof( entries ) / sizeof( entries[ 0 ] ); slot++ )
- {
- ftbl[ slot ] = entries[ slot ];
- }
-}
-
-arm_result_t main( int argc, char **argv )
-{
- // Hand the command line to the shared test driver (declared in
- // "unit_test.h" per the original note) and return its result.
- const arm_result_t status = run_test( argc, argv );
- return status;
-}
+++ /dev/null
-@
-@ Copyright 2011-12 ARM Limited
-@
-@ Licensed under the Apache License, Version 2.0 (the "License");
-@ you may not use this file except in compliance with the License.
-@ You may obtain a copy of the License at
-@
-@ http://www.apache.org/licenses/LICENSE-2.0
-@
-@ Unless required by applicable law or agreed to in writing, software
-@ distributed under the License is distributed on an "AS IS" BASIS,
-@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@ See the License for the specific language governing permissions and
-@ limitations under the License.
-@
-
-@
-@ NE10 Library : source/NE10_mulmat.asm.s
-@
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : source/NE10_mulmat.c
- */
-
-#include "NE10.h"
-#include "../headers/macros.h"
-
-#include <assert.h>
-
-/*
- * Plain-C reference: dst[i] = src1[i] * src2[i] (2x2 matrix product) for
- * each array element. In the aliases below the suffix 1 refers to the
- * left operand and 2 to the right operand; e.g. the c1.r1 entry of the
- * product is (c1.r1, c2.r1) of src1 dotted with (c1.r1, c1.r2) of src2.
- *
- * Both operands are copied to locals before dst[ itr ] is written, so the
- * result is also correct when dst is the same array as src1 or src2.
- * Without the copies, later entries would be computed from already
- * overwritten inputs.
- *
- * The loop over `itr`/`count` and the return value come from
- * NE10_X_OPERATION_FLOAT_C (../headers/macros.h, not shown here).
- */
-arm_result_t mulmat_2x2f_c(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count)
-{
- #define A1 lhs.c1.r1
- #define A2 rhs.c1.r1
- #define B1 lhs.c1.r2
- #define B2 rhs.c1.r2
- #define C1 lhs.c2.r1
- #define C2 rhs.c2.r1
- #define D1 lhs.c2.r2
- #define D2 rhs.c2.r2
-
- NE10_X_OPERATION_FLOAT_C
- (
- const arm_mat2x2f_t lhs = src1[ itr ];
- const arm_mat2x2f_t rhs = src2[ itr ];
-
- dst[ itr ].c1.r1 = (A1*A2)+(C1*B2);
- dst[ itr ].c1.r2 = (B1*A2)+(D1*B2);
-
- dst[ itr ].c2.r1 = (A1*C2)+(C1*D2);
- dst[ itr ].c2.r2 = (B1*C2)+(D1*D2);
- );
-
- #undef A1
- #undef A2
- #undef B1
- #undef B2
- #undef C1
- #undef C2
- #undef D1
- #undef D2
-}
-
-/*
- * Plain-C reference: dst[i] = src1[i] * src2[i] (3x3 matrix product) for
- * each array element. In the aliases below the suffix 1 refers to the
- * left operand and 2 to the right operand; e.g. the c1.r1 entry of the
- * product is (c1.r1, c2.r1, c3.r1) of src1 dotted with
- * (c1.r1, c1.r2, c1.r3) of src2.
- *
- * Both operands are copied to locals before dst[ itr ] is written, so the
- * result is also correct when dst is the same array as src1 or src2.
- * Without the copies, later entries would be computed from already
- * overwritten inputs.
- *
- * The loop over `itr`/`count` and the return value come from
- * NE10_X_OPERATION_FLOAT_C (../headers/macros.h, not shown here).
- */
-arm_result_t mulmat_3x3f_c(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count)
-{
- #define A1 lhs.c1.r1
- #define A2 rhs.c1.r1
- #define B1 lhs.c1.r2
- #define B2 rhs.c1.r2
- #define C1 lhs.c1.r3
- #define C2 rhs.c1.r3
- #define D1 lhs.c2.r1
- #define D2 rhs.c2.r1
- #define E1 lhs.c2.r2
- #define E2 rhs.c2.r2
- #define F1 lhs.c2.r3
- #define F2 rhs.c2.r3
- #define G1 lhs.c3.r1
- #define G2 rhs.c3.r1
- #define H1 lhs.c3.r2
- #define H2 rhs.c3.r2
- #define I1 lhs.c3.r3
- #define I2 rhs.c3.r3
-
- NE10_X_OPERATION_FLOAT_C
- (
- const arm_mat3x3f_t lhs = src1[ itr ];
- const arm_mat3x3f_t rhs = src2[ itr ];
-
- dst[ itr ].c1.r1 = (A1*A2)+(D1*B2)+(G1*C2);
- dst[ itr ].c1.r2 = (B1*A2)+(E1*B2)+(H1*C2);
- dst[ itr ].c1.r3 = (C1*A2)+(F1*B2)+(I1*C2);
-
- dst[ itr ].c2.r1 = (A1*D2)+(D1*E2)+(G1*F2);
- dst[ itr ].c2.r2 = (B1*D2)+(E1*E2)+(H1*F2);
- dst[ itr ].c2.r3 = (C1*D2)+(F1*E2)+(I1*F2);
-
- dst[ itr ].c3.r1 = (A1*G2)+(D1*H2)+(G1*I2);
- dst[ itr ].c3.r2 = (B1*G2)+(E1*H2)+(H1*I2);
- dst[ itr ].c3.r3 = (C1*G2)+(F1*H2)+(I1*I2);
- );
-
- #undef A1
- #undef A2
- #undef B1
- #undef B2
- #undef C1
- #undef C2
- #undef D1
- #undef D2
- #undef E1
- #undef E2
- #undef F1
- #undef F2
- #undef G1
- #undef G2
- #undef H1
- #undef H2
- #undef I1
- #undef I2
-}
-
-/*
- * Plain-C reference: dst[i] = src1[i] * src2[i] (4x4 matrix product) for
- * each array element. In the aliases below the suffix 1 refers to the
- * left operand and 2 to the right operand; e.g. the c1.r1 entry of the
- * product is (c1.r1, c2.r1, c3.r1, c4.r1) of src1 dotted with
- * (c1.r1, c1.r2, c1.r3, c1.r4) of src2.
- *
- * Both operands are copied to locals before dst[ itr ] is written, so the
- * result is also correct when dst is the same array as src1 or src2.
- * Without the copies, later entries would be computed from already
- * overwritten inputs.
- *
- * The loop over `itr`/`count` and the return value come from
- * NE10_X_OPERATION_FLOAT_C (../headers/macros.h, not shown here).
- */
-arm_result_t mulmat_4x4f_c(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count)
-{
- #define A1 lhs.c1.r1
- #define A2 rhs.c1.r1
- #define B1 lhs.c1.r2
- #define B2 rhs.c1.r2
- #define C1 lhs.c1.r3
- #define C2 rhs.c1.r3
- #define D1 lhs.c1.r4
- #define D2 rhs.c1.r4
-
- #define E1 lhs.c2.r1
- #define E2 rhs.c2.r1
- #define F1 lhs.c2.r2
- #define F2 rhs.c2.r2
- #define G1 lhs.c2.r3
- #define G2 rhs.c2.r3
- #define H1 lhs.c2.r4
- #define H2 rhs.c2.r4
-
- #define I1 lhs.c3.r1
- #define I2 rhs.c3.r1
- #define J1 lhs.c3.r2
- #define J2 rhs.c3.r2
- #define K1 lhs.c3.r3
- #define K2 rhs.c3.r3
- #define L1 lhs.c3.r4
- #define L2 rhs.c3.r4
-
- #define M1 lhs.c4.r1
- #define M2 rhs.c4.r1
- #define N1 lhs.c4.r2
- #define N2 rhs.c4.r2
- #define O1 lhs.c4.r3
- #define O2 rhs.c4.r3
- #define P1 lhs.c4.r4
- #define P2 rhs.c4.r4
-
- NE10_X_OPERATION_FLOAT_C
- (
- const arm_mat4x4f_t lhs = src1[ itr ];
- const arm_mat4x4f_t rhs = src2[ itr ];
-
- dst[ itr ].c1.r1 = (A1*A2)+(E1*B2)+(I1*C2)+(M1*D2);
- dst[ itr ].c1.r2 = (B1*A2)+(F1*B2)+(J1*C2)+(N1*D2);
- dst[ itr ].c1.r3 = (C1*A2)+(G1*B2)+(K1*C2)+(O1*D2);
- dst[ itr ].c1.r4 = (D1*A2)+(H1*B2)+(L1*C2)+(P1*D2);
-
- dst[ itr ].c2.r1 = (A1*E2)+(E1*F2)+(I1*G2)+(M1*H2);
- dst[ itr ].c2.r2 = (B1*E2)+(F1*F2)+(J1*G2)+(N1*H2);
- dst[ itr ].c2.r3 = (C1*E2)+(G1*F2)+(K1*G2)+(O1*H2);
- dst[ itr ].c2.r4 = (D1*E2)+(H1*F2)+(L1*G2)+(P1*H2);
-
- dst[ itr ].c3.r1 = (A1*I2)+(E1*J2)+(I1*K2)+(M1*L2);
- dst[ itr ].c3.r2 = (B1*I2)+(F1*J2)+(J1*K2)+(N1*L2);
- dst[ itr ].c3.r3 = (C1*I2)+(G1*J2)+(K1*K2)+(O1*L2);
- dst[ itr ].c3.r4 = (D1*I2)+(H1*J2)+(L1*K2)+(P1*L2);
-
- dst[ itr ].c4.r1 = (A1*M2)+(E1*N2)+(I1*O2)+(M1*P2);
- dst[ itr ].c4.r2 = (B1*M2)+(F1*N2)+(J1*O2)+(N1*P2);
- dst[ itr ].c4.r3 = (C1*M2)+(G1*N2)+(K1*O2)+(O1*P2);
- dst[ itr ].c4.r4 = (D1*M2)+(H1*N2)+(L1*O2)+(P1*P2);
- );
-
- #undef A1
- #undef A2
- #undef B1
- #undef B2
- #undef C1
- #undef C2
- #undef D1
- #undef D2
- #undef E1
- #undef E2
- #undef F1
- #undef F2
- #undef G1
- #undef G2
- #undef H1
- #undef H2
- #undef I1
- #undef I2
- #undef J1
- #undef J2
- #undef K1
- #undef K2
- #undef L1
- #undef L2
- #undef M1
- #undef M2
- #undef N1
- #undef N2
- #undef O1
- #undef O2
- #undef P1
- #undef P2
-}
+++ /dev/null
-@
-@ Copyright 2011-12 ARM Limited
-@
-@ Licensed under the Apache License, Version 2.0 (the "License");
-@ you may not use this file except in compliance with the License.
-@ You may obtain a copy of the License at
-@
-@ http://www.apache.org/licenses/LICENSE-2.0
-@
-@ Unless required by applicable law or agreed to in writing, software
-@ distributed under the License is distributed on an "AS IS" BASIS,
-@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@ See the License for the specific language governing permissions and
-@ limitations under the License.
-@
-
-@
-@ NE10 Library : source/NE10_mulmat.neon.s
-@
-
-
-
-
- .text
- .syntax unified
-
-.include "headers/NE10header.s"
-
-
-
-
- .balign 4
- .global mulmat_2x2f_neon
- .thumb
- .thumb_func
-
-mulmat_2x2f_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t mulmat_2x2f(arm_mat2x2f_t * dst,
- @ arm_mat2x2f_t * src1,
- @ arm_mat2x2f_t * src2,
- @ unsigned int count)
- @
- @ Multiplies count pairs of 2x2 float matrices: dst[i] = src1[i] * src2[i].
- @
- @ r0: *dst & current dst entry's address
- @ r1: *src1 & current src1 entry's address
- @ r2: *src2 & current src2 entry's address
- @ r3: int count; then count rounded down to a multiple of 4, i.e. the
- @ number of matrices handled by the main loop (two per iteration)
- @
- @ r4: count % 4, the number of matrices left over for the second loop
- @
- @ The main loop is software pipelined: the results of set N are stored
- @ while set N+1 is multiplied and set N+2 is loaded.
- @ Always returns 0 in r0.
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- push {r4}
- and r4, r3, #3 @ r4 = count % 4;
- sub r3, r3, r4 @ r3 = count - r4; r4 is what's left to be processed after the main loop
-
- cmp r3, #0
- beq .L_check_mat2x2
-
- @ Each vld4.32 below de-interleaves two consecutive 2x2 matrices, so
- @ every d register holds the same element of both matrices (one per
- @ lane). Two products are therefore computed per set.
-
- @ load the 1st set of values
- vld4.32 { d0, d1, d2, d3 }, [r1]!
- vld4.32 { d4, d5, d6, d7 }, [r2]!
- subs r3, r3, #4 @ 2 for this set, and 2 for the 2nd set
-
- @ calculate values for the 1st set
- vmul.f32 d16, d0, d4
- vmul.f32 d17, d1, d4
- vmul.f32 d18, d0, d6
- vmul.f32 d19, d1, d6
-
- vmla.f32 d16, d2, d5
- vmla.f32 d17, d3, d5
- vmla.f32 d18, d2, d7
- vmla.f32 d19, d3, d7
-
-
- @ load the 2nd set of values
- vld4.32 { d0, d1, d2, d3 }, [r1]!
- vld4.32 { d4, d5, d6, d7 }, [r2]!
-
- ble .L_mainloopend_mat2x2 @ uses the flags from the subs above (NEON ops leave them untouched)
-
-.L_mainloop_mat2x2:
- @ store the result for the 1st/next (e.g. 3rd) set
- vst4.32 { d16, d17, d18, d19}, [r0]!
-
- @ calculate values for the 2nd/next (e.g. 3rd) set
- vmul.f32 d16, d0, d4
- vmul.f32 d17, d1, d4
- vmul.f32 d18, d0, d6
- vmul.f32 d19, d1, d6
-
- vmla.f32 d16, d2, d5
- vmla.f32 d17, d3, d5
- vmla.f32 d18, d2, d7
- vmla.f32 d19, d3, d7
-
- @ load the next (e.g. 3rd) set of values
- subs r3, r3, #2
- vld4.32 { d0, d1, d2, d3 }, [r1]!
- vld4.32 { d4, d5, d6, d7 }, [r2]!
-
-
- bgt .L_mainloop_mat2x2 @ loop while r3 > 0, i.e. at least another pair of matrices is pending
-
-.L_mainloopend_mat2x2:
- @ the last iteration for this call
- @ store the result for the set of values before the last one (e.g 2nd set)
- vst4.32 { d16, d17, d18, d19}, [r0]!
-
- @ calculate values for the last (e.g. 3rd) set
- vmul.f32 d16, d0, d4
- vmul.f32 d17, d1, d4
- vmul.f32 d18, d0, d6
- vmul.f32 d19, d1, d6
-
- vmla.f32 d16, d2, d5
- vmla.f32 d17, d3, d5
- vmla.f32 d18, d2, d7
- vmla.f32 d19, d3, d7
-
- @ store the result for the last (e.g. 3rd) set
- vst4.32 { d16, d17, d18, d19}, [r0]!
-
-
-.L_check_mat2x2:
- @ check if anything left to process at the end of the input array
- cmp r4, #0
- ble .L_return_mat2x2
-
-.L_secondloop_mat2x2:
- @ process the last few items left in the input array, one matrix at a
- @ time; only lane 0 of each register is loaded, used and stored
- vld4.32 { d0[0], d1[0], d2[0], d3[0] }, [r1]!
- vld4.32 { d4[0], d5[0], d6[0], d7[0] }, [r2]!
-
- subs r4, r4, #1
-
- @ calculate values
- vmul.f32 d16, d0, d4
- vmul.f32 d17, d1, d4
- vmul.f32 d18, d0, d6
- vmul.f32 d19, d1, d6
-
- vmla.f32 d16, d2, d5
- vmla.f32 d17, d3, d5
- vmla.f32 d18, d2, d7
- vmla.f32 d19, d3, d7
-
- vst4.32 { d16[0], d17[0], d18[0], d19[0] }, [r0]!
-
- bgt .L_secondloop_mat2x2
-
-.L_return_mat2x2:
- @ return
- pop {r4}
- mov r0, #0
- bx lr
-
-
-
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ A macro to load four 3x3 matrices, two from the first source which
- @ according to the function signatures is src1 (r1) and
- @ another two from the second source which is src2 (r2).
- @ src1 ends up in q0-q3 plus d8, src2 in q8-q11 plus d9. After the
- @ vtrn.32 shuffles each d register holds the same matrix element of
- @ both matrices of a source: lane 0 = 1st matrix, lane 1 = 2nd matrix.
- @ r1 and r2 are advanced past the loaded data.
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- .macro LOAD_3x3MATS
-
- # load two 3x3 matrices from src1 (8 floats + the 9th float, twice)
- vld1.32 { q0-q1 }, [r1]!
- vld1.32 { d8[0] }, [r1]!
- vld1.32 { q2-q3 }, [r1]!
- vld1.32 { d8[1] }, [r1]!
-
- # load two 3x3 matrices from src2 (8 floats + the 9th float, twice)
- vld1.32 { q8-q9 }, [r2]!
- vld1.32 { d9[0] }, [r2]!
- vld1.32 { q10-q11 }, [r2]!
- vld1.32 { d9[1] }, [r2]!
-
-
- # rearrange them both so matching elements of the two matrices share a d register
- vtrn.32 q0, q2
- vtrn.32 q1, q3
-
- vtrn.32 q8, q10
- vtrn.32 q9, q11
-
- .endm
-
-
-
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ This macro multiplies two pairs of 3x3 matrices that were
- @ loaded using the above LOAD_3x3MATS macro (q0-q3, d8, q8-q11, d9).
- @ The two resulting matrices are returned in q12, q13, q14, q15, & d10,
- @ still in the per-element layout (lane 0 = 1st product, lane 1 = 2nd).
- @ Note that d8, d9 and d10 are callee-saved registers; the caller of
- @ this macro is responsible for preserving them.
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- .macro MULTIPLY_3x3MATS
-
- @ Element -> register map (src1 register & src2 register); a..i are
- @ the nine matrix elements in memory order:
- @ a = d0 & d16
- @ b = d4 & d20
- @ c = d1 & d17
- @ d = d5 & d21
- @ e = d2 & d18
- @ f = d6 & d22
- @ g = d3 & d19
- @ h = d7 & d23
- @ i = d8 & d9
-
- @ first term of each of the nine dot products
- vmul.f32 d24, d0, d16
- vmul.f32 d28, d4, d16
- vmul.f32 d25, d1, d16
- vmul.f32 d29, d0, d21
- vmul.f32 d26, d4, d21
- vmul.f32 d30, d1, d21
- vmul.f32 d27, d0, d19
- vmul.f32 d31, d4, d19
- vmul.f32 d10, d1, d19
-
- @ accumulate the second term
- vmla.f32 d24, d5, d20
- vmla.f32 d28, d2, d20
- vmla.f32 d25, d6, d20
- vmla.f32 d29, d5, d18
- vmla.f32 d26, d2, d18
- vmla.f32 d30, d6, d18
- vmla.f32 d27, d5, d23
- vmla.f32 d31, d2, d23
- vmla.f32 d10, d6, d23
-
- @ accumulate the third term
- vmla.f32 d24, d3, d17
- vmla.f32 d28, d7, d17
- vmla.f32 d25, d8, d17
- vmla.f32 d29, d3, d22
- vmla.f32 d26, d7, d22
- vmla.f32 d30, d8, d22
- vmla.f32 d27, d3, d9
- vmla.f32 d31, d7, d9
- vmla.f32 d10, d8, d9
-
- .endm
-
-
-
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ A macro to store the two resulting 3x3 matrices from
- @ the above MULTIPLY_3x3MATS macro (q12-q15, & d10 are stored).
- @ r0 is advanced past the two stored matrices.
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- .macro STORE_3x3MATS
-
- # rearrange them both back into memory order (undoes the LOAD_3x3MATS layout)
- vtrn.32 q12, q14
- vtrn.32 q13, q15
-
- # store two 3x3 matrices to dst (8 floats + the 9th float, twice)
- vst1.32 { q12-q13 }, [r0]!
- vst1.32 { d10[0] }, [r0]!
- vst1.32 { q14-q15 }, [r0]!
- vst1.32 { d10[1] }, [r0]!
-
- .endm
-
-
-
-
- .align 2
- .global mulmat_3x3f_neon
- .thumb
- .thumb_func
-mulmat_3x3f_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t mulmat_3x3f(arm_mat3x3f_t * dst,
- @ arm_mat3x3f_t * src1,
- @ arm_mat3x3f_t * src2,
- @ unsigned int count)
- @
- @ Multiplies count pairs of 3x3 float matrices: dst[i] = src1[i] * src2[i].
- @
- @ r0: *dst & current dst entry's address
- @ r1: *src1 & current src1 entry's address
- @ r2: *src2 & current src2 entry's address
- @ r3: int count; then count rounded down to a multiple of 4, i.e. the
- @ number of matrices handled by the main loop (two per iteration)
- @
- @ r4: count % 4, the number of matrices left over for the second loop
- @
- @ d8-d10 are callee-saved and are used by the macros, so they are
- @ pushed on entry and popped before returning.
- @ Always returns 0 in r0.
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- push { r4 }
- vpush { d8, d9, d10 }
- and r4, r3, #3 @ r4 = count % 4;
- sub r3, r3, r4 @ r3 = count - r4; r4 is what's left to be processed after the main loop
-
- cmp r3, #0
- beq .L_check_mat3x3
-
- @ load the 1st set of values
- LOAD_3x3MATS
- subs r3, r3, #4 @ 2 for this set, and 2 for the 2nd set
-
- @ calculate values for the 1st set
- MULTIPLY_3x3MATS
-
- @ load the 2nd set of values
- LOAD_3x3MATS
- ble .L_mainloopend_mat3x3 @ uses the flags from the subs above (NEON ops leave them untouched)
-
-.L_mainloop_mat3x3:
- @ store the result for the 1st/next (e.g. 3rd) set
- STORE_3x3MATS
-
- @ calculate values for the 2nd/next (e.g. 3rd) set
- MULTIPLY_3x3MATS
-
- @ load the next (e.g. 3rd) set of values
- LOAD_3x3MATS
-
- subs r3, r3, #2
-
- bgt .L_mainloop_mat3x3 @ loop while r3 > 0, i.e. at least another pair of matrices is pending
-
-.L_mainloopend_mat3x3:
- @ the last iteration for this call
- @ store the result for the set of values before the last one (e.g 2nd set)
- STORE_3x3MATS
-
- @ calculate values for the last (e.g. 3rd) set
- MULTIPLY_3x3MATS
-
- @ store the result for the last (e.g. 3rd) set
- STORE_3x3MATS
-
-.L_check_mat3x3:
- @ check if anything left to process at the end of the input array
- cmp r4, #0
- ble .L_return_mat3x3
-
-.L_secondloop_mat3x3:
- @ process the last few items left in the input array
- @ load a single matrix from each source; q2-q3 and q10-q11 are NOT
- @ reloaded here, so after the vtrn.32 shuffles lane 1 of every
- @ register holds leftover data. Only the lane-0 results are stored.
- vld1.32 { q0-q1 }, [r1]!
- vld1.32 { d8[0] }, [r1]!
- vld1.32 { q8-q9 }, [r2]!
- vld1.32 { d9[0] }, [r2]!
-
- vtrn.32 q0, q2
- vtrn.32 q1, q3
-
- vtrn.32 q8, q10
- vtrn.32 q9, q11
-
- subs r4, r4, #1
-
- @ calculate values for this matrix (lane 0)
- MULTIPLY_3x3MATS
-
- @ store the result for this matrix: gather lane 0 back into q12-q13 & d10[0]
- vtrn.32 q12, q14
- vtrn.32 q13, q15
-
- vst1.32 { q12-q13 }, [r0]!
- vst1.32 { d10[0] }, [r0]!
-
-
- bgt .L_secondloop_mat3x3
-
-.L_return_mat3x3:
- @ return
- vpop { d8, d9, d10 }
- pop { r4 }
- mov r0, #0
- bx lr
-
-
-
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ A macro to load one pair of 4x4 matrices: the matrix at src1 (r1)
- @ goes into q8-q11 and the matrix at src2 (r2) goes into q0-q3.
- @ Each q register receives four consecutive floats.
- @ r1 and r2 are advanced past the loaded matrices.
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- .macro LOAD_4x4MATS
-
- # load a 4x4 matrix from src1
- vld1.32 { q8-q9 }, [r1]!
- vld1.32 {q10-q11}, [r1]!
-
- # load a 4x4 matrix from src2
- vld1.32 {q0-q1}, [r2]!
- vld1.32 {q2-q3}, [r2]!
- .endm
-
-
-
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ This macro multiplies the two 4x4 matrices loaded in the
- @ above LOAD_4x4MATS macro (src1 in q8-q11, src2 in q0-q3) and
- @ returns the resulting 4x4 matrix in q12-q15.
- @ Each result register is a linear combination of the four src1
- @ registers, weighted by the four scalars of one src2 register:
- @ q12 uses q0 (d0-d1), q13 uses q1 (d2-d3), q14 uses q2 (d4-d5) and
- @ q15 uses q3 (d6-d7).
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- .macro MULTIPLY_4x4MATS
-
- @ q8 times the 1st scalar of each src2 register
- vmul.f32 q12, q8, d0[0]
- vmul.f32 q13, q8, d2[0]
- vmul.f32 q14, q8, d4[0]
- vmul.f32 q15, q8, d6[0]
-
- @ + q9 times the 2nd scalar
- vmla.f32 q12, q9, d0[1]
- vmla.f32 q13, q9, d2[1]
- vmla.f32 q14, q9, d4[1]
- vmla.f32 q15, q9, d6[1]
-
-
- @ + q10 times the 3rd scalar
- vmla.f32 q12, q10, d1[0]
- vmla.f32 q13, q10, d3[0]
- vmla.f32 q14, q10, d5[0]
- vmla.f32 q15, q10, d7[0]
-
- @ + q11 times the 4th scalar
- vmla.f32 q12, q11, d1[1]
- vmla.f32 q13, q11, d3[1]
- vmla.f32 q14, q11, d5[1]
- vmla.f32 q15, q11, d7[1]
-
- .endm
-
-
-
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ This macro stores the resulting 4x4 matrix which is
- @ returned by the above MULTIPLY_4x4MATS macro from registers
- @ q12-q15 into the dst (r0). r0 is advanced past the stored matrix.
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- .macro STORE_4x4MATS
-
- # store one 4x4 matrix (16 floats) to dst
- vst1.32 { q12-q13 }, [r0]!
- vst1.32 { q14-q15 }, [r0]!
-
- .endm
-
-
-
-
- .align 2
- .global mulmat_4x4f_neon
- .thumb
- .thumb_func
-mulmat_4x4f_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t mulmat_4x4f(arm_mat4x4f_t * dst,
- @ arm_mat4x4f_t * src1,
- @ arm_mat4x4f_t * src2,
- @ unsigned int count)
- @
- @ Multiplies count pairs of 4x4 float matrices: dst[i] = src1[i] * src2[i].
- @
- @ r0: *dst & current dst entry's address
- @ r1: *src1 & current src1 entry's address
- @ r2: *src2 & current src2 entry's address
- @ r3: int count; then count rounded down to a multiple of 4, i.e. the
- @ number of matrices handled by the main loop (one per iteration)
- @
- @ r4: count % 4, the number of matrices left over for the second loop
- @
- @ Only q0-q3 and q8-q15 are used, so no NEON registers need saving.
- @ Always returns 0 in r0.
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- push {r4}
- and r4, r3, #3 @ r4 = count % 4;
- sub r3, r3, r4 @ r3 = count - r4; r4 is what's left to be processed after the main loop
-
- cmp r3, #0
- beq .L_check_mat4x4
-
- @ load the 1st set of values
- LOAD_4x4MATS
-
- subs r3, r3, #2 @ 1 for this set, and 1 for the 2nd set
-
- @ calculate values for the 1st set
- MULTIPLY_4x4MATS
-
- @ load the 2nd set of values
- LOAD_4x4MATS
-
- ble .L_mainloopend_mat4x4 @ uses the flags from the subs above (NEON ops leave them untouched)
-
-.L_mainloop_mat4x4:
- @ store the result for the 1st/next (e.g. 3rd) set
- STORE_4x4MATS
-
- @ calculate values for the 2nd/next (e.g. 3rd) set
- MULTIPLY_4x4MATS
-
- @ load the next (e.g. 3rd) set of values
- subs r3, r3, #1
- LOAD_4x4MATS
-
- bgt .L_mainloop_mat4x4 @ loop while r3 > 0, i.e. at least another matrix pair is pending
-
-.L_mainloopend_mat4x4:
- @ the last iteration for this call
- @ store the result for the set of values before the last one (e.g 2nd set)
- STORE_4x4MATS
-
- @ calculate values for the last (e.g. 3rd) set
- MULTIPLY_4x4MATS
-
- @ store the result for the last (e.g. 3rd) set
- STORE_4x4MATS
-
-.L_check_mat4x4:
- @ check if anything left to process at the end of the input array
- cmp r4, #0
- ble .L_return_mat4x4
-
-.L_secondloop_mat4x4:
- @ process the last few items left in the input array, one matrix pair per pass
- LOAD_4x4MATS
-
- subs r4, r4, #1
-
- @ calculate values
- MULTIPLY_4x4MATS
-
- @ store the results
- STORE_4x4MATS
-
- bgt .L_secondloop_mat4x4
-
-.L_return_mat4x4:
- @ return
- pop {r4}
- mov r0, #0
- bx lr
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : source/NE10_mulmat_test.c
- */
-
-//Make sure the following are defined before including "unit_test.h"
-
-// length of the data arrays
-#define ARRLEN TEST_ARRLEN_MATRICES
-// number of the operations in a given unit
-#define OP_COUNT 3
-// number of the different implementations of each of the functions (C, ASM, NEON, ...)
-#define IMPL_COUNT 3
-
-
-#include "../headers/unit_test_xmat_operation_x.h"
-
-extern arm_result_t mulmat_2x2f_c (arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count);
-extern arm_result_t mulmat_2x2f_neon(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count);
-
-extern arm_result_t mulmat_3x3f_c (arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count);
-extern arm_result_t mulmat_3x3f_neon(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count);
-
-extern arm_result_t mulmat_4x4f_c (arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count);
-extern arm_result_t mulmat_4x4f_neon(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count);
-
-// Fills the global function table used by the test harness.
-// Entries are grouped in threes per operation (2x2, 3x3, 4x4); within a
-// group the slots are filled with the C version, the C version again (as a
-// stand-in for a missing assembly version) and the NEON version.
-void init_ftbl()
-{
- // manually initialize the global function table with
- // those functions that do have an actual implementation.
- ftbl[ 0] = (arm_func_4args_t) mulmat_2x2f_c;
- ftbl[ 1] = (arm_func_4args_t) mulmat_2x2f_c; // using the c version in place of the assembly version
- ftbl[ 2] = (arm_func_4args_t) mulmat_2x2f_neon;
-
- ftbl[ 3] = (arm_func_4args_t) mulmat_3x3f_c;
- ftbl[ 4] = (arm_func_4args_t) mulmat_3x3f_c; // using the c version in place of the assembly version
- ftbl[ 5] = (arm_func_4args_t) mulmat_3x3f_neon;
-
- ftbl[ 6] = (arm_func_4args_t) mulmat_4x4f_c;
- ftbl[ 7] = (arm_func_4args_t) mulmat_4x4f_c; // using the c version in place of the assembly version
- ftbl[ 8] = (arm_func_4args_t) mulmat_4x4f_neon;
-}
-
-// Test entry point: forwards the command-line arguments to run_test() and
-// returns its result as the process exit status.
-arm_result_t main( int argc, char **argv )
-{
- return run_test( argc, argv ); // defined in "unit_test.h"
-}
+++ /dev/null
-@
-@ Copyright 2011-12 ARM Limited
-@
-@ Licensed under the Apache License, Version 2.0 (the "License");
-@ you may not use this file except in compliance with the License.
-@ You may obtain a copy of the License at
-@
-@ http://www.apache.org/licenses/LICENSE-2.0
-@
-@ Unless required by applicable law or agreed to in writing, software
-@ distributed under the License is distributed on an "AS IS" BASIS,
-@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@ See the License for the specific language governing permissions and
-@ limitations under the License.
-@
-
-@
-@ NE10 Library : source/NE10_normalize.asm.s
-@
-
- .text
- .syntax unified
-
-.include "headers/NE10header.s"
-
- .balign 4
- .global normalize_vec2f_asm
- .thumb
- .thumb_func
-
-normalize_vec2f_asm:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t normalize_vec2f(arm_vec2f_t * dst,
- @ arm_vec2f_t * src, unsigned int count)
- @
- @ Scales every 2D vector in src to unit length using VFP sqrt/divide.
- @ The arrays are walked from the last element down to the first.
- @ A zero-length input vector is not special-cased (it divides by zero).
- @
- @ r0: *dst and current destination item's address
- @ r1: *src and current source item's address
- @ r2: int count
- @
- @ r2: loop counter
- @
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- cbz r2, .LoopEndVec2F @ nothing to do when count == 0
- add r0, r0, r2, lsl #3 @ r0 = r0 + count * 8 (one past the last dst item)
- add r1, r1, r2, lsl #3 @ r1 = r1 + count * 8 (one past the last src item)
-
-.LoopBeginVec2F:
- vldmdb r1!, {s10-s11} @ step back one item and load s10 = x and s11 = y
- vmul.f32 s14, s10, s10 @ s14 = x*x
- vmla.f32 s14, s11, s11 @ s14 = x*x + y*y
- vsqrt.f32 s15, s14 @ s15 = sqrt( s14 )
- vdiv.f32 s10, s10, s15 @ s10 = x / length
- vdiv.f32 s11, s11, s15 @ s11 = y / length
- vstmdb r0!, {s10-s11} @ step back one item and store the results
- subs r2, r2, #1 @ decrement the loop counter
- bne .LoopBeginVec2F @ loop while r2 is not zero
-.LoopEndVec2F:
- mov r0, NE10_OK @ Return NE10_OK
- bx lr
-
-
-
-
- .balign 4
- .global normalize_vec3f_asm
- .thumb
- .thumb_func
-
-normalize_vec3f_asm:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t normalize_vec3f(arm_vec3f_t * dst,
- @ arm_vec3f_t * src, unsigned int count)
- @
- @ Scales every 3D vector in src to unit length using VFP sqrt/divide.
- @ The arrays are walked from the last element down to the first.
- @ A zero-length input vector is not special-cased (it divides by zero).
- @
- @ r0: *dst and current destination item's address
- @ r1: *src and current source item's address
- @ r2: int count
- @
- @ r2: loop counter
- @
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- cbz r2, .LoopEndVec3F @ nothing to do when count == 0
- add r0, r0, r2, lsl #3 @ r0 = r0 + count * 8 ...
- add r0, r0, r2, lsl #2 @ ... + count * 4, i.e. r0 = r0 + count * 12
- add r1, r1, r2, lsl #3 @ r1 = r1 + count * 8 ...
- add r1, r1, r2, lsl #2 @ ... + count * 4, i.e. r1 = r1 + count * 12
-
-.LoopBeginVec3F:
- vldmdb r1!, {s10-s12} @ step back one item and load s10 = x, s11 = y, s12 = z
- vmul.f32 s14, s10, s10 @ s14 = x*x
- vmla.f32 s14, s11, s11 @ s14 = x*x + y*y
- vmla.f32 s14, s12, s12 @ s14 = x*x + y*y + z*z
- vsqrt.f32 s15, s14 @ s15 = sqrt( s14 )
- vdiv.f32 s10, s10, s15 @ s10 = x / length
- vdiv.f32 s11, s11, s15 @ s11 = y / length
- vdiv.f32 s12, s12, s15 @ s12 = z / length
- vstmdb r0!, {s10-s12} @ step back one item and store the results
- subs r2, r2, #1 @ decrement the loop counter
- bne .LoopBeginVec3F @ loop while r2 is not zero
-.LoopEndVec3F:
- mov r0, NE10_OK @ Return NE10_OK
- bx lr
-
-
-
-
- .balign 4
- .global normalize_vec4f_asm
- .thumb
- .thumb_func
-
-normalize_vec4f_asm:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t normalize_vec4f(arm_vec4f_t * dst,
- @ arm_vec4f_t * src, unsigned int count)
- @
- @ Scales every 4D vector in src to unit length using VFP sqrt/divide.
- @ The arrays are walked from the last element down to the first.
- @ A zero-length input vector is not special-cased (it divides by zero).
- @
- @ r0: *dst and current destination item's address
- @ r1: *src and current source item's address
- @ r2: int count
- @
- @ r2: loop counter
- @
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- cbz r2, .LoopEndVec4F @ nothing to do when count == 0
- add r0, r0, r2, lsl #4 @ r0 = r0 + count * 16 (one past the last dst item)
- add r1, r1, r2, lsl #4 @ r1 = r1 + count * 16 (one past the last src item)
-
-.LoopBeginVec4F:
- vldmdb r1!, {s10-s13} @ step back one item and load s10 = x, s11 = y, s12 = z, s13 = w
- vmul.f32 s14, s10, s10 @ s14 = x*x
- vmla.f32 s14, s11, s11 @ s14 = x*x + y*y
- vmla.f32 s14, s12, s12 @ s14 = x*x + y*y + z*z
- vmla.f32 s14, s13, s13 @ s14 = x*x + y*y + z*z + w*w
- vsqrt.f32 s15, s14 @ s15 = sqrt( s14 )
- vdiv.f32 s10, s10, s15 @ s10 = x / length
- vdiv.f32 s11, s11, s15 @ s11 = y / length
- vdiv.f32 s12, s12, s15 @ s12 = z / length
- vdiv.f32 s13, s13, s15 @ s13 = w / length
- vstmdb r0!, {s10-s13} @ step back one item and store the results
- subs r2, r2, #1 @ decrement the loop counter
- bne .LoopBeginVec4F @ loop while r2 is not zero
-.LoopEndVec4F:
- mov r0, NE10_OK @ Return NE10_OK
- bx lr
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : source/NE10_normalize.c
- */
-
-#include "NE10.h"
-#include "../headers/macros.h"
-
-#include <assert.h>
-
-#include <math.h>
-
-// Writes the unit-length version of each 2D vector in src to dst.
-// The per-element loop over `itr` and the return value come from the
-// NE10_LEN_OPERATION_X_C macro, which is defined outside this file.
-// A zero-length input is not guarded: the divisions then divide by zero.
-arm_result_t normalize_vec2f_c(arm_vec2f_t * dst, arm_vec2f_t * src, unsigned int count)
-{
- float len; // Euclidean length of the current vector
-
- NE10_LEN_OPERATION_X_C
- (
- len = sqrt( src[ itr ].x * src[ itr ].x +
- src[ itr ].y * src[ itr ].y ) ;
-
- dst[ itr ].x = src[ itr ].x / len;
- dst[ itr ].y = src[ itr ].y / len;
- );
-}
-
-// Writes the unit-length version of each 3D vector in src to dst.
-// The per-element loop over `itr` and the return value come from the
-// NE10_LEN_OPERATION_X_C macro, which is defined outside this file.
-// A zero-length input is not guarded: the divisions then divide by zero.
-arm_result_t normalize_vec3f_c(arm_vec3f_t * dst, arm_vec3f_t * src, unsigned int count)
-{
- float len; // Euclidean length of the current vector
-
- NE10_LEN_OPERATION_X_C
- (
- len = sqrt( src[ itr ].x * src[ itr ].x +
- src[ itr ].y * src[ itr ].y +
- src[ itr ].z * src[ itr ].z );
-
- dst[ itr ].x = src[ itr ].x / len;
- dst[ itr ].y = src[ itr ].y / len;
- dst[ itr ].z = src[ itr ].z / len;
- );
-}
-
-// Writes the unit-length version of each 4D vector in src to dst.
-// The per-element loop over `itr` and the return value come from the
-// NE10_LEN_OPERATION_X_C macro, which is defined outside this file.
-// A zero-length input is not guarded: the divisions then divide by zero.
-arm_result_t normalize_vec4f_c(arm_vec4f_t * dst, arm_vec4f_t * src, unsigned int count)
-{
- float len; // Euclidean length of the current vector
-
- NE10_LEN_OPERATION_X_C
- (
- len = sqrt( src[ itr ].x * src[ itr ].x +
- src[ itr ].y * src[ itr ].y +
- src[ itr ].z * src[ itr ].z +
- src[ itr ].w * src[ itr ].w );
-
- dst[ itr ].x = src[ itr ].x / len;
- dst[ itr ].y = src[ itr ].y / len;
- dst[ itr ].z = src[ itr ].z / len;
- dst[ itr ].w = src[ itr ].w / len;
- );
-}
+++ /dev/null
-@
-@ Copyright 2011-12 ARM Limited
-@
-@ Licensed under the Apache License, Version 2.0 (the "License");
-@ you may not use this file except in compliance with the License.
-@ You may obtain a copy of the License at
-@
-@ http://www.apache.org/licenses/LICENSE-2.0
-@
-@ Unless required by applicable law or agreed to in writing, software
-@ distributed under the License is distributed on an "AS IS" BASIS,
-@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@ See the License for the specific language governing permissions and
-@ limitations under the License.
-@
-
-@
-@ NE10 Library : source/NE10_normalize.neon.s
-@
-
- .text
- .syntax unified
-
-.include "headers/NE10header.s"
-
-
-
-
- .balign 4
- .global normalize_vec2f_neon
- .thumb
- .thumb_func
-
-normalize_vec2f_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t normalize_vec2f(arm_vec2f_t * dst,
- @ arm_vec2f_t * src,
- @ unsigned int count);
- @
- @ Scales every 2D vector in src to unit length. The reciprocal square
- @ root is a NEON estimate refined by one Newton-Raphson step, so the
- @ results are approximate.
- @
- @ r0: *dst & the current dst entry's address
- @ r1: *src & current src entry's address
- @ r2: int count; then count rounded down to a multiple of 4, i.e. the
- @ number of vectors handled by the main loop (four per iteration)
- @ r3: count % 4, the number of vectors left over for the second loop
- @
- @ q4 (d8-d9) is callee-saved under the AAPCS and is used as scratch in
- @ the main loop, so it is saved on entry and restored before returning.
- @ Always returns 0 in r0.
- @
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- vpush {d8, d9} @ preserve callee-saved q4
- and r3, r2, #3 @ r3 = count % 4;
- sub r2, r2, r3 @ r2 = count - r3; r3 is what's left to be processed after the main loop
-
- cbz r2, .L_check_vec2
-
- @ load values for the first iteration: q0 = four x's, q1 = four y's
- vld2.32 {q0-q1}, [r1]!
- subs r2, r2, #4
-
- @ calculate sum of square of the components
- vmul.f32 q2, q0, q0
- vmla.f32 q2, q1, q1
-
- ble .L_mainloopend_vec2 @ uses the flags from the subs above (NEON ops leave them untouched)
-
-.L_mainloop_vec2:
- @ keep the previous vectors in q10-q11 and load the next set of values
- vmov.f32 q10, q0
- vmov.f32 q11, q1
- vld2.32 {q0-q1}, [r1]!
- subs r2, r2, #4
-
- @ get reciprocal SQRT of the last vector while loading a new vector
- vrsqrte.f32 q3, q2 @ q3 = estimate of 1/sqrt(q2)
- vmul.f32 q4, q2, q3
- vrsqrts.f32 q4, q4, q3 @ Newton-Raphson correction factor
- vmul.f32 q4, q3, q4 @ q4 = refined 1/length
-
- @ normalize the components
- vmul.f32 q3, q10, q4 @ q3 = previous x's * (1/length)
- vmul.f32 q4, q11, q4 @ q4 = previous y's * (1/length)
-
- vst2.32 {d6,d7,d8,d9}, [r0]!
-
- @ calculate sum of square of the components
- vmul.f32 q2, q0, q0
- vmla.f32 q2, q1, q1
-
- bgt .L_mainloop_vec2 @ loop while r2 > 0, i.e. at least another 4 vectors (8 floats) are pending
-
-.L_mainloopend_vec2:
- @ the last iteration for this call
- @ get reciprocal SQRT of the last vector
- vrsqrte.f32 q3, q2
- vmul.f32 q4, q2, q3
- vrsqrts.f32 q4, q4, q3
- vmul.f32 q4, q3, q4
-
- @ normalize the components
- vmul.f32 q3, q0, q4 @ q3 = q0 * q4
- vmul.f32 q4, q1, q4 @ q4 = q1 * q4
-
- vst2.32 {d6,d7,d8,d9}, [r0]!
-
-.L_check_vec2:
- @ check if anything left to process at the end of the input array
- cmp r3, #0
- ble .L_return_vec2
-
-.L_secondloop_vec2:
- @ process the last few items left in the input array
- vld1.f32 d0, [r1]! @ Fill in d0 = { V.x, V.y };
-
- subs r3, r3, #1
-
- @ calculate sum of square of the components
- vmul.f32 d1, d0, d0 @ d1= { V.x^2, V.y^2 };
- vpadd.f32 d3, d1, d1 @ d3= { V.x^2 + (V.y^2), V.y^2 + (V.x^2) };
-
-
- @ get reciprocal SQRT of the last vector
- vrsqrte.f32 d2, d3
- vmul.f32 d1, d3, d2
- vrsqrts.f32 d1, d1, d2
- vmul.f32 d1, d2, d1
-
- @ normalize the components
- vmul.f32 d0, d0, d1
-
- vst1.32 {d0}, [r0]!
-
- bgt .L_secondloop_vec2
-
-.L_return_vec2:
- @ return
- vpop {d8, d9} @ restore callee-saved q4
- mov r0, #0
- bx lr
-
-
-
-
- .align 2
- .global normalize_vec3f_neon
- .thumb
- .thumb_func
-normalize_vec3f_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t normalize_vec3f(arm_vec3f_t * dst,
- @ arm_vec3f_t * src,
- @ unsigned int count);
- @
- @ Scales every 3D vector in src to unit length. The reciprocal square
- @ root is a NEON estimate refined by one Newton-Raphson step, so the
- @ results are approximate.
- @
- @ r0: *dst & the current dst entry's address
- @ r1: *src & current src entry's address
- @ r2: int count; then count rounded down to a multiple of 4, i.e. the
- @ number of vectors handled by the main loop (four per iteration)
- @ r3: count % 4, the number of vectors left over for the second loop
- @
- @ q4-q7 (d8-d15) are callee-saved under the AAPCS and are used as
- @ scratch, so they are saved on entry and restored before returning.
- @ Always returns 0 in r0.
- @
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- vpush {d8-d15} @ preserve callee-saved q4-q7
- and r3, r2, #3 @ r3 = count % 4;
- sub r2, r2, r3 @ r2 = count - r3; r3 is what's left to be processed after the main loop
-
- cmp r2, #0
- beq .L_check_vec3
-
- @ load values for the first iteration: q0 = four x's, q1 = four y's, q2 = four z's
- vld3.32 {d0, d2, d4}, [r1]!
- vld3.32 {d1, d3, d5}, [r1]!
- subs r2, r2, #4
-
- @ calculate sum of square of the components
- vmul.f32 q3, q0, q0
- vmla.f32 q3, q1, q1
- vmla.f32 q3, q2, q2
-
-
- ble .L_mainloopend_vec3 @ uses the flags from the subs above (NEON ops leave them untouched)
-
-.L_mainloop_vec3:
- @ keep the previous vectors in q10-q12 and load the next set of values
- vmov.f32 q10, q0
- vmov.f32 q11, q1
- vmov.f32 q12, q2
-
- vld3.32 {d0,d2,d4}, [r1]!
- vld3.32 {d1,d3,d5}, [r1]!
- subs r2, r2, #4
-
- @ get reciprocal SQRT of the last vector while loading a new vector
- vrsqrte.f32 q5, q3 @ q5 = estimate of 1/sqrt(q3)
- vmul.f32 q4, q3, q5
- vrsqrts.f32 q4, q4, q5 @ Newton-Raphson correction factor
- vmul.f32 q4, q5, q4 @ q4 = refined 1/length
-
- @ normalize the components
- vmul.f32 q5, q10, q4
- vmul.f32 q6, q11, q4
- vmul.f32 q7, q12, q4
-
- vst3.32 {d10, d12, d14}, [r0]!
- vst3.32 {d11, d13, d15}, [r0]!
-
- @ calculate sum of square of the components
- vmul.f32 q3, q0, q0
- vmla.f32 q3, q1, q1
- vmla.f32 q3, q2, q2
-
- bgt .L_mainloop_vec3 @ loop while r2 > 0, i.e. at least another 4 vectors (12 floats) are pending
-
-.L_mainloopend_vec3:
- @ the last iteration for this call
- @ get reciprocal SQRT of the last vector
- vrsqrte.f32 q5, q3
- vmul.f32 q4, q3, q5
- vrsqrts.f32 q4, q4, q5
- vmul.f32 q4, q5, q4
-
- @ normalize the components
- vmul.f32 q5, q0, q4
- vmul.f32 q6, q1, q4
- vmul.f32 q7, q2, q4
-
- vst3.32 {d10, d12, d14}, [r0]!
- vst3.32 {d11, d13, d15}, [r0]!
-
-.L_check_vec3:
- @ check if anything left to process at the end of the input array
- cmp r3, #0
- ble .L_return_vec3
-
-.L_secondloop_vec3:
- @ process the last few items left in the input array
- vld3.f32 {d0[0], d2[0], d4[0]}, [r1]! @ The values are loaded like so:
- @ q0 = { V.x, -, -, - };
- @ q1 = { V.y, -, -, - };
- @ q2 = { V.z, -, -, - };
- subs r3, r3, #1
-
- @ calculate sum of square of the components (only lane 0 is meaningful)
- vmul.f32 q3, q0, q0 @ V.x^2
- vmla.f32 q3, q1, q1 @ V.x^2 + V.y^2
- vmla.f32 q3, q2, q2 @ V.x^2 + V.y^2 + V.z^2
-
-
- @ get reciprocal SQRT of the last vector
- vrsqrte.f32 q5, q3
- vmul.f32 q4, q3, q5
- vrsqrts.f32 q4, q4, q5
- vmul.f32 q4, q5, q4
-
- @ normalize the components
- vmul.f32 q0, q0, q4
- vmul.f32 q1, q1, q4
- vmul.f32 q2, q2, q4
-
- vst3.32 {d0[0], d2[0], d4[0]}, [r0]! @ store lane 0 only: the normalized { x, y, z }
-
- bgt .L_secondloop_vec3
-
-.L_return_vec3:
- @ return
- vpop {d8-d15} @ restore callee-saved q4-q7
- mov r0, #0
- bx lr
-
-
-
-
- .align 2
- .global normalize_vec4f_neon
- .thumb
- .thumb_func
-normalize_vec4f_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t normalize_vec4f(arm_vec4f_t * dst,
- @ arm_vec4f_t * src,
- @ unsigned int count);
- @
- @ Scales every 4D vector in src to unit length. The reciprocal square
- @ root is a NEON estimate refined by one Newton-Raphson step, so the
- @ results are approximate.
- @
- @ r0: *dst & the current dst entry's address
- @ r1: *src & current src entry's address
- @ r2: int count; then count rounded down to a multiple of 4, i.e. the
- @ number of vectors handled by the main loop (four per iteration)
- @ r3: count % 4, the number of vectors left over for the second loop
- @
- @ q4-q6 (d8-d13) are callee-saved under the AAPCS and are used as
- @ scratch, so they are saved on entry and restored before returning.
- @ Always returns 0 in r0.
- @
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- vpush {d8-d13} @ preserve callee-saved q4-q6
- and r3, r2, #3 @ r3 = count % 4;
- sub r2, r2, r3 @ r2 = count - r3; r3 is what's left to be processed after the main loop
-
- cmp r2, #0
- beq .L_check_vec4
-
- @ load values for the first iteration:
- @ q0 = four x's, q1 = four y's, q2 = four z's, q3 = four w's
- vld4.32 {d0, d2, d4, d6}, [r1]!
- vld4.32 {d1, d3, d5, d7}, [r1]!
- subs r2, r2, #4
-
- @ calculate sum of square of the components
- vmul.f32 q5, q0, q0
- vmla.f32 q5, q1, q1
- vmla.f32 q5, q2, q2
- vmla.f32 q5, q3, q3
-
- ble .L_mainloopend_vec4 @ uses the flags from the subs above (NEON ops leave them untouched)
-
-.L_mainloop_vec4:
- @ keep the previous vectors in q10-q13 and load the next set of values
- vmov q10, q0
- vmov q11, q1
- vmov q12, q2
- vmov q13, q3
-
- vld4.32 {d0, d2, d4, d6}, [r1]!
- vld4.32 {d1, d3, d5, d7}, [r1]!
- subs r2, r2, #4
-
- @ get reciprocal SQRT of the last vector while loading a new vector
- vrsqrte.f32 q6, q5 @ q6 = estimate of 1/sqrt(q5)
- vmul.f32 q4, q5, q6
- vrsqrts.f32 q4, q4, q6 @ Newton-Raphson correction factor
- vmul.f32 q4, q6, q4 @ q4 = refined 1/length
-
- @ normalize the components
- vmul.f32 q10, q10, q4
- vmul.f32 q11, q11, q4
- vmul.f32 q12, q12, q4
- vmul.f32 q13, q13, q4
-
- vst4.32 {d20, d22, d24, d26}, [r0]!
- vst4.32 {d21, d23, d25, d27}, [r0]!
-
- @ calculate sum of square of the components
- vmul.f32 q5, q0, q0
- vmla.f32 q5, q1, q1
- vmla.f32 q5, q2, q2
- vmla.f32 q5, q3, q3
-
- bgt .L_mainloop_vec4 @ loop while r2 > 0, i.e. at least another 4 vectors (16 floats) are pending
-
-.L_mainloopend_vec4:
- @ the last iteration for this call
- @ get reciprocal SQRT of the last vector
- vrsqrte.f32 q6, q5
- vmul.f32 q4, q5, q6
- vrsqrts.f32 q4, q4, q6
- vmul.f32 q4, q6, q4
-
- @ normalize the components
- vmul.f32 q0, q0, q4
- vmul.f32 q1, q1, q4
- vmul.f32 q2, q2, q4
- vmul.f32 q3, q3, q4
-
- vst4.32 {d0, d2, d4, d6}, [r0]!
- vst4.32 {d1, d3, d5, d7}, [r0]!
-
-.L_check_vec4:
- @ check if anything left to process at the end of the input array
- cmp r3, #0
- ble .L_return_vec4
-
-.L_secondloop_vec4:
- @ process the last few items left in the input array
- vld4.f32 {d0[0], d2[0], d4[0], d6[0]}, [r1]! @ The values are loaded like so:
- @ q0 = { V.x, -, -, - };
- @ q1 = { V.y, -, -, - };
- @ q2 = { V.z, -, -, - };
- @ q3 = { V.w, -, -, - };
- subs r3, r3, #1
-
- @ calculate sum of square of the components (only lane 0 is meaningful)
- vmul.f32 q4, q0, q0 @ V.x^2
- vmla.f32 q4, q1, q1 @ V.x^2 + V.y^2
- vmla.f32 q4, q2, q2 @ V.x^2 + V.y^2 + V.z^2
- vmla.f32 q4, q3, q3 @ V.x^2 + V.y^2 + V.z^2 + V.w^2
-
- @ get reciprocal SQRT of the last vector
- vrsqrte.f32 q5, q4
- vmul.f32 q6, q4, q5
- vrsqrts.f32 q6, q6, q5
- vmul.f32 q6, q5, q6
-
- @ normalize the components
- vmul.f32 q0, q0, q6
- vmul.f32 q1, q1, q6
- vmul.f32 q2, q2, q6
- vmul.f32 q3, q3, q6
-
- vst4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]! @ store lane 0 only: the normalized { x, y, z, w }
-
- bgt .L_secondloop_vec4
-
-.L_return_vec4:
- @ return
- vpop {d8-d13} @ restore callee-saved q4-q6
- mov r0, #0
- bx lr
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : source/NE10_normalize_test.c
- */
-
-//Make sure the following are defined before including "unit_test.h"
-
-// length of the data arrays
-#define ARRLEN TEST_ARRLEN
-// number of the operations in a given unit
-#define OP_COUNT 3
-// number of the different implementations of each of the functions (C, ASM, NEON, ...)
-#define IMPL_COUNT 3
-
-
-#include "../headers/unit_test_normalize_operation_x.h"
-
-
-extern arm_result_t normalize_vec2f_c (arm_vec2f_t * dst, arm_vec2f_t * src, unsigned int count);
-extern arm_result_t normalize_vec2f_asm (arm_vec2f_t * dst, arm_vec2f_t * src, unsigned int count);
-extern arm_result_t normalize_vec2f_neon(arm_vec2f_t * dst, arm_vec2f_t * src, unsigned int count);
-
-extern arm_result_t normalize_vec3f_c (arm_vec3f_t * dst, arm_vec3f_t * src, unsigned int count);
-extern arm_result_t normalize_vec3f_asm (arm_vec3f_t * dst, arm_vec3f_t * src, unsigned int count);
-extern arm_result_t normalize_vec3f_neon(arm_vec3f_t * dst, arm_vec3f_t * src, unsigned int count);
-
-extern arm_result_t normalize_vec4f_c (arm_vec4f_t * dst, arm_vec4f_t * src, unsigned int count);
-extern arm_result_t normalize_vec4f_asm (arm_vec4f_t * dst, arm_vec4f_t * src, unsigned int count);
-extern arm_result_t normalize_vec4f_neon(arm_vec4f_t * dst, arm_vec4f_t * src, unsigned int count);
-
-/*
- * Fill the global function table with every normalize implementation.
- * Slots are grouped per operation (vec2f, vec3f, vec4f); inside each group
- * the order is C, ASM, NEON.
- */
-void init_ftbl()
-{
- // every function listed here has an actual implementation
- static const arm_func_3args_t impls[] =
- {
-  (arm_func_3args_t) normalize_vec2f_c,
-  (arm_func_3args_t) normalize_vec2f_asm,
-  (arm_func_3args_t) normalize_vec2f_neon,
-
-  (arm_func_3args_t) normalize_vec3f_c,
-  (arm_func_3args_t) normalize_vec3f_asm,
-  (arm_func_3args_t) normalize_vec3f_neon,
-
-  (arm_func_3args_t) normalize_vec4f_c,
-  (arm_func_3args_t) normalize_vec4f_asm,
-  (arm_func_3args_t) normalize_vec4f_neon
- };
- unsigned int slot;
-
- // copy the entries into ftbl[0] .. ftbl[8], preserving their order
- for ( slot = 0; slot < sizeof( impls ) / sizeof( impls[0] ); slot++ )
- {
-  ftbl[ slot ] = impls[ slot ];
- }
-}
-
-/* Entry point: hands the command line to run_test() and returns its result. */
-arm_result_t main( int argc, char **argv )
-{
- arm_result_t status = run_test( argc, argv ); // run_test is defined in "unit_test.h"
-
- return status;
-}
+++ /dev/null
-@
-@ Copyright 2011-12 ARM Limited
-@
-@ Licensed under the Apache License, Version 2.0 (the "License");
-@ you may not use this file except in compliance with the License.
-@ You may obtain a copy of the License at
-@
-@ http://www.apache.org/licenses/LICENSE-2.0
-@
-@ Unless required by applicable law or agreed to in writing, software
-@ distributed under the License is distributed on an "AS IS" BASIS,
-@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@ See the License for the specific language governing permissions and
-@ limitations under the License.
-@
-
-@
-@ NE10 Library : source/NE10_rsbc.asm.s
-@
-
- .text
- .syntax unified
-
-.include "headers/NE10header.s"
-
- .balign 4
- .global rsbc_float_asm
- .thumb
- .thumb_func
-
-rsbc_float_asm:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t rsbc_float(arm_float_t * dst,
- @ arm_float_t * src, const arm_float_t cst,
- @ unsigned int count)
- @
- @ Reverse subtraction with a constant: dst[i] = cst - src[i]
- @
- @ r0: *dst
- @ r1: *src
- @ r2: cst (the float's raw bits, taken from a core register;
- @ NOTE(review): this implies a soft-float argument convention - confirm
- @ against the build flags)
- @ r3: int count
- @
- @ r3: loop counter (counts down to zero)
- @ r5: current item's byte offset in both src[] and dst[]
- @ r6: current source item's address made of base(r1)+offset(r5)
- @ r7: current destination item's address made of base(r0)+offset(r5)
- @ s3: cst, moved in once before the loop
- @ r4: saved and restored but not otherwise used in this routine
- @
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- push {r4, r5, r6, r7}
- cbz r3, .LoopEndFloat @ Nothing to do when count == 0
- mov r5, #0 @ Start at byte offset 0
- vmov s3, r2 @ cst is loop-invariant: move it into s3 once, outside the loop
-
-.LoopBeginFloat:
- add r6, r1, r5 @ Get current source item's address in memory
- vldr s1, [r6, #0] @ Load src[i]
- vsub.f32 s10, s3, s1 @ s10 = cst - src[i]
- add r7, r0, r5 @ Get current destination item's address in memory
- vstr s10, [r7, #0] @ Store the result back into the main memory
- add r5, r5, #4 @ increase the offset by 1*sizeof(float)
- subs r3, r3, #1 @ one item done: decrement the remaining count
- bne .LoopBeginFloat @ Repeat while the remaining count is non-zero
-
-.LoopEndFloat:
- mov r0, NE10_OK @ Return NE10_OK
- pop {r4, r5, r6, r7}
- bx lr
-
-
-
-
- .balign 4
- .global rsbc_vec2f_asm
- .thumb
- .thumb_func
-
-rsbc_vec2f_asm:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t rsbc_vec2f(arm_vec2f_t * dst,
- @ arm_vec2f_t * src, const arm_vec2f_t * cst,
- @ unsigned int count)
- @
- @ Reverse subtraction with a constant vector, per component:
- @ dst[i].x = cst->x - src[i].x, dst[i].y = cst->y - src[i].y
- @
- @ r0: *dst
- @ r1: *src
- @ r2: *cst
- @ r3: int count
- @
- @ r3: loop counter (counts down to zero)
- @ r5: current item's byte offset in both src[] and dst[]
- @ r6: current source item's address made of base(r1)+offset(r5)
- @ r7: current destination item's address made of base(r0)+offset(r5)
- @ r4: saved and restored but not otherwise used in this routine
- @
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- push {r4, r5, r6, r7}
- cbz r3, .LoopEndVec2F @ Nothing to do when count == 0
- mov r5, #0 @ Start at byte offset 0
-
-.LoopBeginVec2F:
- add r6, r1, r5 @ Get current source item's address in memory
- vldr s1, [r6, #0] @ Load src[i].x and src[i].y
- vldr s2, [r6, #4]
- vldr s3, [r2, #0] @ Load cst->x and cst->y (re-read from memory on every iteration)
- vldr s4, [r2, #4]
- vsub.f32 s10, s3, s1 @ s10 = cst->x - src[i].x
- vsub.f32 s11, s4, s2 @ s11 = cst->y - src[i].y
- add r7, r0, r5 @ Get current destination item's address in memory
- vstr s10, [r7, #0] @ Store the results back into the main memory
- vstr s11, [r7, #4]
- add r5, r5, #8 @ increase the offset by 2*sizeof(float) @@ (for x and y)
- subs r3, r3, #1 @ one item done: decrement the remaining count
- bne .LoopBeginVec2F @ Repeat while the remaining count is non-zero
-
-.LoopEndVec2F:
- mov r0, NE10_OK @ Return NE10_OK
- pop {r4, r5, r6, r7}
- bx lr
-
-
-
-
- .balign 4
- .global rsbc_vec3f_asm
- .thumb
- .thumb_func
-
-rsbc_vec3f_asm:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t rsbc_vec3f(arm_vec3f_t * dst,
- @ arm_vec3f_t * src, const arm_vec3f_t * cst,
- @ unsigned int count)
- @
- @ Reverse subtraction with a constant vector, per component:
- @ dst[i].{x,y,z} = cst->{x,y,z} - src[i].{x,y,z}
- @
- @ r0: *dst
- @ r1: *src
- @ r2: *cst
- @ r3: int count
- @
- @ r3: loop counter (counts down to zero)
- @ r5: current item's byte offset in both src[] and dst[]
- @ r6: current source item's address made of base(r1)+offset(r5)
- @ r7: current destination item's address made of base(r0)+offset(r5)
- @ r4: saved and restored but not otherwise used in this routine
- @
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- push {r4, r5, r6, r7}
- cbz r3, .LoopEndVec3F @ Nothing to do when count == 0
- mov r5, #0 @ Start at byte offset 0
-
-.LoopBeginVec3F:
-
- add r6, r1, r5 @ Get current source item's address in memory
- vldr s1, [r6, #0] @ Load src[i].x, src[i].y , and src[i].z
- vldr s2, [r6, #4]
- vldr s3, [r6, #8]
- vldr s4, [r2, #0] @ Load cst->x, cst->y, and cst->z (re-read from memory on every iteration)
- vldr s5, [r2, #4]
- vldr s6, [r2, #8]
- vsub.f32 s10, s4, s1 @ s10 = cst->x - src[i].x
- vsub.f32 s11, s5, s2 @ s11 = cst->y - src[i].y
- vsub.f32 s12, s6, s3 @ s12 = cst->z - src[i].z
- add r7, r0, r5 @ Get current destination item's address in memory
- vstr s10, [r7, #0] @ Store the results back into the main memory
- vstr s11, [r7, #4]
- vstr s12, [r7, #8]
- add r5, r5, #12 @ increase the offset by 3*sizeof(float) @@ (for x, y, and z)
- subs r3, r3, #1 @ one item done: decrement the remaining count
- bne .LoopBeginVec3F @ Repeat while the remaining count is non-zero
-
-.LoopEndVec3F:
- mov r0, NE10_OK @ Return NE10_OK
- pop {r4, r5, r6, r7}
- bx lr
-
-
-
-
- .balign 4
- .global rsbc_vec4f_asm
- .thumb
- .thumb_func
-
-rsbc_vec4f_asm:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t rsbc_vec4f(arm_vec4f_t * dst,
- @ arm_vec4f_t * src, const arm_vec4f_t * cst,
- @ unsigned int count)
- @
- @ Reverse subtraction with a constant vector, per component:
- @ dst[i].{x,y,z,w} = cst->{x,y,z,w} - src[i].{x,y,z,w}
- @
- @ r0: *dst
- @ r1: *src
- @ r2: *cst
- @ r3: int count
- @
- @ r3: loop counter (counts down to zero)
- @ r5: current item's byte offset in both src[] and dst[]
- @ r6: current source item's address made of base(r1)+offset(r5)
- @ r7: current destination item's address made of base(r0)+offset(r5)
- @ r4: saved and restored but not otherwise used in this routine
- @
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- push {r4, r5, r6, r7}
- cbz r3, .LoopEndVec4F @ Nothing to do when count == 0
- mov r5, #0 @ Start at byte offset 0
-
-.LoopBeginVec4F:
- add r6, r1, r5 @ Get current source item's address in memory
- vldr s1, [r6, #0] @ Load src[i].x, src[i].y , src[i].z, and w
- vldr s2, [r6, #4]
- vldr s3, [r6, #8]
- vldr s4, [r6, #12]
- vldr s5, [r2, #0] @ Load cst->x, cst->y, cst->z, and w (re-read from memory on every iteration)
- vldr s6, [r2, #4]
- vldr s7, [r2, #8]
- vldr s8, [r2, #12]
- vsub.f32 s10, s5, s1 @ s10 = cst->x - src[i].x
- vsub.f32 s11, s6, s2 @ s11 = cst->y - src[i].y
- vsub.f32 s12, s7, s3 @ s12 = cst->z - src[i].z
- vsub.f32 s13, s8, s4 @ s13 = cst->w - src[i].w
- add r7, r0, r5 @ Get current destination item's address in memory
- vstr s10, [r7, #0] @ Store the results back into the main memory
- vstr s11, [r7, #4]
- vstr s12, [r7, #8]
- vstr s13, [r7, #12]
- add r5, r5, #16 @ increase the offset by 4*sizeof(float) @@ (for x, y, z, and w)
- subs r3, r3, #1 @ one item done: decrement the remaining count
- bne .LoopBeginVec4F @ Repeat while the remaining count is non-zero
-
-.LoopEndVec4F:
- mov r0, NE10_OK @ Return NE10_OK
- pop {r4, r5, r6, r7}
- bx lr
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : source/NE10_rsbc.c
- */
-
-#include "NE10.h"
-#include "../headers/macros.h"
-
-#include <assert.h>
-
-/*
- * Reverse subtraction with a scalar constant: dst[itr] = cst - src[itr].
- * The statement below is passed to NE10_XC_OPERATION_X_C, which is defined
- * outside this file; presumably it supplies the loop over `count`, the `itr`
- * index and the return value - TODO confirm against the macro definition.
- */
-arm_result_t rsbc_float_c(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count)
-{
- NE10_XC_OPERATION_X_C
- (
- dst[ itr ] = cst - src[ itr ];
- );
-}
-
-/*
- * Reverse subtraction with a constant 2D vector, component by component:
- * dst[itr] = *cst - src[itr].
- * The statements are passed to NE10_XC_OPERATION_X_C (defined outside this
- * file); presumably it supplies the loop, `itr` and the return value - TODO confirm.
- */
-arm_result_t rsbc_vec2f_c(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count)
-{
- NE10_XC_OPERATION_X_C
- (
- dst[ itr ].x = cst->x - src[ itr ].x;
- dst[ itr ].y = cst->y - src[ itr ].y;
- );
-}
-
-/*
- * Reverse subtraction with a constant 3D vector, component by component:
- * dst[itr] = *cst - src[itr].
- * The statements are passed to NE10_XC_OPERATION_X_C (defined outside this
- * file); presumably it supplies the loop, `itr` and the return value - TODO confirm.
- */
-arm_result_t rsbc_vec3f_c(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count)
-{
- NE10_XC_OPERATION_X_C
- (
- dst[ itr ].x = cst->x - src[ itr ].x;
- dst[ itr ].y = cst->y - src[ itr ].y;
- dst[ itr ].z = cst->z - src[ itr ].z;
- );
-}
-
-/*
- * Reverse subtraction with a constant 4D vector, component by component:
- * dst[itr] = *cst - src[itr].
- * The statements are passed to NE10_XC_OPERATION_X_C (defined outside this
- * file); presumably it supplies the loop, `itr` and the return value - TODO confirm.
- */
-arm_result_t rsbc_vec4f_c(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count)
-{
- NE10_XC_OPERATION_X_C
- (
- dst[ itr ].x = cst->x - src[ itr ].x;
- dst[ itr ].y = cst->y - src[ itr ].y;
- dst[ itr ].z = cst->z - src[ itr ].z;
- dst[ itr ].w = cst->w - src[ itr ].w;
- );
-}
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : source/NE10_rsbc.neon.c
- */
-
-#include "NE10.h"
-#include "../headers/macros.h"
-
-#include <assert.h>
-#include <arm_neon.h>
-
-
-/*
- * NEON-intrinsics variant of the scalar reverse subtraction (cst - src).
- * Two code fragments are passed to NE10_XC_OPERATION_FLOAT_NEON (defined
- * outside this file): one using the 128-bit vsubq_f32 and one using the
- * 64-bit vsub_f32. Presumably the first is the main loop body and the second
- * handles the left-over elements, with n_dst/n_cst/n_src/n_tmp_* declared by
- * the macro - TODO confirm against the macro definition.
- */
-arm_result_t rsbc_float_neon(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count)
-{
- NE10_XC_OPERATION_FLOAT_NEON
- (
- n_dst = vsubq_f32( n_cst, n_src );
- ,
- n_tmp_src = vsub_f32( n_tmp_cst, n_tmp_src );
- );
-}
-
-/*
- * NEON-intrinsics variant of the 2D reverse subtraction (*cst - src[i]).
- * Two code fragments are passed to NE10_XC_OPERATION_VEC2F_NEON (defined
- * outside this file): one using the 128-bit vsubq_f32 and one using the
- * 64-bit vsub_f32. Presumably the first is the main loop body and the second
- * handles the left-over elements, with the n_* variables declared by the
- * macro - TODO confirm against the macro definition.
- */
-arm_result_t rsbc_vec2f_neon(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count)
-{
- NE10_XC_OPERATION_VEC2F_NEON
- (
- n_dst = vsubq_f32( n_cst, n_src );
- ,
- n_tmp_src = vsub_f32( n_tmp_cst, n_tmp_src );
- );
-}
-
-/*
- * NEON-intrinsics variant of the 3D reverse subtraction (*cst - src[i]).
- * Two code fragments are passed to NE10_XC_OPERATION_VEC3F_NEON (defined
- * outside this file): the first subtracts three 128-bit registers
- * (n_src1..3 from n_cst1..3), the second subtracts the three .val[] halves of
- * n_tmp_src from n_tmp_cst. Presumably these are the main loop body and the
- * left-over handling respectively - TODO confirm against the macro definition.
- */
-arm_result_t rsbc_vec3f_neon(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count)
-{
- NE10_XC_OPERATION_VEC3F_NEON
- (
- n_dst1 = vsubq_f32( n_cst1, n_src1 );
- n_dst2 = vsubq_f32( n_cst2, n_src2 );
- n_dst3 = vsubq_f32( n_cst3, n_src3 );
- ,
- n_tmp_src.val[0] = vsub_f32( n_tmp_cst.val[0], n_tmp_src.val[0] );
- n_tmp_src.val[1] = vsub_f32( n_tmp_cst.val[1], n_tmp_src.val[1] );
- n_tmp_src.val[2] = vsub_f32( n_tmp_cst.val[2], n_tmp_src.val[2] );
- );
-}
-
-/*
- * NEON-intrinsics variant of the 4D reverse subtraction (*cst - src[i]).
- * Unlike the float/vec2f/vec3f variants, only one code fragment is passed to
- * the macro here; presumably a vec4f fills a whole 128-bit register so no
- * separate left-over fragment is needed - TODO confirm against the
- * NE10_XC_OPERATION_VEC4F_NEON definition (outside this file).
- */
-arm_result_t rsbc_vec4f_neon(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count)
-{
- NE10_XC_OPERATION_VEC4F_NEON
- (
- n_dst = vsubq_f32( n_cst, n_src );
- );
-}
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : source/NE10_rsbc_test.c
- */
-
-//Make sure the following are defined before including "unit_test.h"
-
-// length of the data arrays
-#define ARRLEN TEST_ARRLEN
-// number of the operations in a given unit
-#define OP_COUNT 4
-// number of the different implementations of each of the functions (C, ASM, NEON, ...)
-#define IMPL_COUNT 3
-
-
-#include "../headers/unit_test_xc_operation_x.h"
-
-
-extern arm_result_t rsbc_float_c (arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count);
-extern arm_result_t rsbc_float_asm (arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count);
-extern arm_result_t rsbc_float_neon (arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count);
-
-extern arm_result_t rsbc_vec2f_c (arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count);
-extern arm_result_t rsbc_vec2f_asm (arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count);
-extern arm_result_t rsbc_vec2f_neon(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count);
-
-extern arm_result_t rsbc_vec3f_c (arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count);
-extern arm_result_t rsbc_vec3f_asm (arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count);
-extern arm_result_t rsbc_vec3f_neon(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count);
-
-extern arm_result_t rsbc_vec4f_c (arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count);
-extern arm_result_t rsbc_vec4f_asm (arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count);
-extern arm_result_t rsbc_vec4f_neon(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count);
-
-
-/*
- * Fill the global function table with every rsbc implementation.
- * Slots are grouped per operation (float, vec2f, vec3f, vec4f); inside each
- * group the order is C, ASM, NEON.
- */
-void init_ftbl()
-{
- // every function listed here has an actual implementation
- static const arm_func_4args_t impls[] =
- {
-  (arm_func_4args_t) rsbc_float_c,
-  (arm_func_4args_t) rsbc_float_asm,
-  (arm_func_4args_t) rsbc_float_neon,
-
-  (arm_func_4args_t) rsbc_vec2f_c,
-  (arm_func_4args_t) rsbc_vec2f_asm,
-  (arm_func_4args_t) rsbc_vec2f_neon,
-
-  (arm_func_4args_t) rsbc_vec3f_c,
-  (arm_func_4args_t) rsbc_vec3f_asm,
-  (arm_func_4args_t) rsbc_vec3f_neon,
-
-  (arm_func_4args_t) rsbc_vec4f_c,
-  (arm_func_4args_t) rsbc_vec4f_asm,
-  (arm_func_4args_t) rsbc_vec4f_neon
- };
- unsigned int slot;
-
- // copy the entries into ftbl[0] .. ftbl[11], preserving their order
- for ( slot = 0; slot < sizeof( impls ) / sizeof( impls[0] ); slot++ )
- {
-  ftbl[ slot ] = impls[ slot ];
- }
-}
-
-/* Entry point: hands the command line to run_test() and returns its result. */
-arm_result_t main( int argc, char **argv )
-{
- arm_result_t status = run_test( argc, argv ); // run_test is defined in "unit_test.h"
-
- return status;
-}
+++ /dev/null
-@
-@ Copyright 2011-12 ARM Limited
-@
-@ Licensed under the Apache License, Version 2.0 (the "License");
-@ you may not use this file except in compliance with the License.
-@ You may obtain a copy of the License at
-@
-@ http://www.apache.org/licenses/LICENSE-2.0
-@
-@ Unless required by applicable law or agreed to in writing, software
-@ distributed under the License is distributed on an "AS IS" BASIS,
-@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@ See the License for the specific language governing permissions and
-@ limitations under the License.
-@
-
-@
-@ NE10 Library : source/NE10_setc.asm.s
-@
-
- .text
- .syntax unified
-
-.include "headers/NE10header.s"
-
- .balign 4
- .global setc_float_asm
- .thumb
- .thumb_func
-
-setc_float_asm:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t setc_float(arm_float_t * dst,
- @ const arm_float_t cst,
- @ unsigned int count)
- @
- @ Fills dst[0..count-1] with the constant cst.
- @
- @ r0: *dst (advanced by 4 bytes after every store)
- @ r1: cst (the float's raw bits, taken from a core register;
- @ NOTE(review): this implies a soft-float argument convention - confirm)
- @ r2: int count
- @
- @ r2: loop counter (counts down to zero)
- @
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- cbz r2, .LoopEndFloat @ Nothing to do when count == 0
-
-.LoopBeginFloat:
- str r1, [r0], #4 @ Store cst into dst[i] and post-increment the dst pointer
- subs r2, r2, #1 @ one item done: decrement the remaining count
- bne .LoopBeginFloat @ Repeat while the remaining count is non-zero
-
-.LoopEndFloat:
- mov r0, NE10_OK @ Return NE10_OK
- bx lr
-
-
-
-
- .balign 4
- .global setc_vec2f_asm
- .thumb
- .thumb_func
-
-setc_vec2f_asm:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t setc_vec2f(arm_vec2f_t * dst,
- @ const arm_vec2f_t * cst,
- @ unsigned int count)
- @
- @ Fills dst[0..count-1] with a copy of *cst.
- @
- @ r0: *dst (advanced by 4 bytes after every store)
- @ r1: *cst
- @ r2: int count
- @
- @ r2: loop counter (counts down to zero)
- @ r4, r5: cst->x and cst->y, loaded once before the loop
- @
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- push {r4, r5}
- cbz r2, .LoopEndVec2F @ Nothing to do when count == 0
- ldr r4, [r1, #0] @ Load cst->x into r4
- ldr r5, [r1, #4] @ Load cst->y into r5
-
-.LoopBeginVec2F:
- str r4, [r0], #4 @ Store them in the destination
- str r5, [r0], #4
- subs r2, r2, #1 @ one item done: decrement the remaining count
- bne .LoopBeginVec2F @ Repeat while the remaining count is non-zero
-
-.LoopEndVec2F:
- mov r0, NE10_OK @ Return NE10_OK
- pop {r4, r5}
- bx lr
-
-
-
-
- .balign 4
- .global setc_vec3f_asm
- .thumb
- .thumb_func
-
-setc_vec3f_asm:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t setc_vec3f(arm_vec3f_t * dst,
- @ const arm_vec3f_t * cst,
- @ unsigned int count)
- @
- @ Fills dst[0..count-1] with a copy of *cst.
- @
- @ r0: *dst (advanced by 4 bytes after every store)
- @ r1: *cst
- @ r2: int count
- @
- @ r2: loop counter (counts down to zero)
- @ r4, r5, r6: cst->x, cst->y and cst->z, loaded once before the loop
- @
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- push {r4, r5, r6}
- cbz r2, .LoopEndVec3F @ Nothing to do when count == 0
- ldr r4, [r1, #0] @ Load cst->x into r4
- ldr r5, [r1, #4] @ Load cst->y into r5
- ldr r6, [r1, #8] @ r6 = cst->z
-
-.LoopBeginVec3F:
- str r4, [r0], #4 @ Store them in the destination
- str r5, [r0], #4
- str r6, [r0], #4
- subs r2, r2, #1 @ one item done: decrement the remaining count
- bne .LoopBeginVec3F @ Repeat while the remaining count is non-zero
-
-.LoopEndVec3F:
- mov r0, NE10_OK @ Return NE10_OK
- pop {r4, r5, r6}
- bx lr
-
-
-
-
- .balign 4
- .global setc_vec4f_asm
- .thumb
- .thumb_func
-
-setc_vec4f_asm:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t setc_vec4f(arm_vec4f_t * dst,
- @ const arm_vec4f_t * cst,
- @ unsigned int count)
- @
- @ Fills dst[0..count-1] with a copy of *cst.
- @
- @ r0: *dst (advanced by 4 bytes after every store)
- @ r1: *cst
- @ r2: int count
- @
- @ r2: loop counter (counts down to zero)
- @ r4-r7: cst->x, cst->y, cst->z and cst->w, loaded once before the loop
- @
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- push {r4, r5, r6, r7}
- cbz r2, .LoopEndVec4F @ Nothing to do when count == 0
- ldr r4, [r1, #0] @ Load cst->x into r4
- ldr r5, [r1, #4] @ Load cst->y into r5
- ldr r6, [r1, #8] @ r6 = cst->z
- ldr r7, [r1, #12] @ r7 = cst->w
-
-.LoopBeginVec4F:
- str r4, [r0], #4 @ Store them in the destination
- str r5, [r0], #4
- str r6, [r0], #4
- str r7, [r0], #4
- subs r2, r2, #1 @ one item done: decrement the remaining count
- bne .LoopBeginVec4F @ Repeat while the remaining count is non-zero
-
-.LoopEndVec4F:
- mov r0, NE10_OK @ Return NE10_OK
- pop {r4, r5, r6, r7}
- bx lr
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : source/NE10_setc.c
- */
-
-#include "NE10.h"
-#include "../headers/macros.h"
-
-#include <assert.h>
-
-/*
- * Set every element of dst to the scalar constant: dst[itr] = cst.
- * The statement is passed to NE10_SETC_OPERATION_X_C, which is defined outside
- * this file; presumably it supplies the loop over `count`, the `itr` index and
- * the return value - TODO confirm against the macro definition.
- */
-arm_result_t setc_float_c(arm_float_t * dst, const arm_float_t cst, unsigned int count)
-{
- NE10_SETC_OPERATION_X_C
- (
- dst[itr] = cst;
- );
-}
-
-/*
- * Set every 2D vector in dst to *cst, component by component.
- * The statements are passed to NE10_SETC_OPERATION_X_C (defined outside this
- * file); presumably it supplies the loop, `itr` and the return value - TODO confirm.
- */
-arm_result_t setc_vec2f_c(arm_vec2f_t * dst, const arm_vec2f_t * cst, unsigned int count)
-{
- NE10_SETC_OPERATION_X_C
- (
- dst[itr].x = cst->x;
- dst[itr].y = cst->y;
- );
-}
-
-/*
- * Set every 3D vector in dst to *cst, component by component.
- * The statements are passed to NE10_SETC_OPERATION_X_C (defined outside this
- * file); presumably it supplies the loop, `itr` and the return value - TODO confirm.
- */
-arm_result_t setc_vec3f_c(arm_vec3f_t * dst, const arm_vec3f_t * cst, unsigned int count)
-{
- NE10_SETC_OPERATION_X_C
- (
- dst[itr].x = cst->x;
- dst[itr].y = cst->y;
- dst[itr].z = cst->z;
- );
-}
-
-/*
- * Set every 4D vector in dst to *cst, component by component.
- * The statements are passed to NE10_SETC_OPERATION_X_C (defined outside this
- * file); presumably it supplies the loop, `itr` and the return value - TODO confirm.
- */
-arm_result_t setc_vec4f_c(arm_vec4f_t * dst, const arm_vec4f_t * cst, unsigned int count)
-{
- NE10_SETC_OPERATION_X_C
- (
- dst[itr].x = cst->x;
- dst[itr].y = cst->y;
- dst[itr].z = cst->z;
- dst[itr].w = cst->w;
- );
-}
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : source/NE10_setc.neon.c
- */
-
-#include "NE10.h"
-#include "../headers/macros.h"
-
-#include <assert.h>
-#include <arm_neon.h>
-
-
-/*
- * NEON variant of the scalar fill. All of the work is done by
- * NE10_SETC_OPERATION_FLOAT_NEON (defined outside this file); both code
- * fragments passed to it are empty statements because the constant is stored
- * as-is. Presumably the first fragment belongs to the main loop and the
- * second to the left-over elements - TODO confirm against the macro.
- */
-arm_result_t setc_float_neon(arm_float_t * dst, const arm_float_t cst, unsigned int count)
-{
- NE10_SETC_OPERATION_FLOAT_NEON
- (
- ;// The cst need not be altered
- ,
- ;// n_tmp_cst need not be altered
- );
-}
-
-/*
- * NEON variant of the 2D fill. All of the work is done by
- * NE10_SETC_OPERATION_VEC2F_NEON (defined outside this file); both code
- * fragments passed to it are empty statements because the constant is stored
- * as-is. Presumably the first fragment belongs to the main loop and the
- * second to the left-over elements - TODO confirm against the macro.
- */
-arm_result_t setc_vec2f_neon(arm_vec2f_t * dst, const arm_vec2f_t * cst, unsigned int count)
-{
- NE10_SETC_OPERATION_VEC2F_NEON
- (
- ;// The cst need not be altered
- ,
- ;// n_tmp_cst need not be altered
- );
-}
-
-/*
- * NEON variant of the 3D fill. All of the work is done by
- * NE10_SETC_OPERATION_VEC3F_NEON (defined outside this file); both code
- * fragments passed to it are empty statements because the constant is stored
- * as-is. Presumably the first fragment belongs to the main loop and the
- * second to the left-over elements - TODO confirm against the macro.
- */
-arm_result_t setc_vec3f_neon(arm_vec3f_t * dst, const arm_vec3f_t * cst, unsigned int count)
-{
- NE10_SETC_OPERATION_VEC3F_NEON
- (
- ;// cst1, cst2, and cst3 need not be altered
- ,
- ;// n_tmp_cst.val[0], .val[1], and .val[2] need not be altered
- );
-}
-
-/*
- * NEON variant of the 4D fill. All of the work is done by
- * NE10_SETC_OPERATION_VEC4F_NEON (defined outside this file); the single code
- * fragment passed to it is an empty statement because the constant is stored
- * as-is. Unlike the float/vec2f/vec3f variants there is no second fragment.
- */
-arm_result_t setc_vec4f_neon(arm_vec4f_t * dst, const arm_vec4f_t * cst, unsigned int count)
-{
- NE10_SETC_OPERATION_VEC4F_NEON
- (
- ;// n_cst need not be altered
- );
-}
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : source/NE10_setc_test.c
- */
-
-//Make sure the following are defined before including "unit_test.h"
-
-// length of the data arrays
-#define ARRLEN TEST_ARRLEN
-// number of the operations in a given unit
-#define OP_COUNT 4
-// number of the different implementations of each of the functions (C, ASM, NEON, ...)
-#define IMPL_COUNT 3
-
-
-#include "../headers/unit_test_setc_operation_x.h"
-
-
-extern arm_result_t setc_float_c (arm_float_t * dst, const arm_float_t cst, unsigned int count);
-extern arm_result_t setc_float_asm (arm_float_t * dst, const arm_float_t cst, unsigned int count);
-extern arm_result_t setc_float_neon (arm_float_t * dst, const arm_float_t cst, unsigned int count);
-
-extern arm_result_t setc_vec2f_c (arm_vec2f_t * dst, const arm_vec2f_t * cst, unsigned int count);
-extern arm_result_t setc_vec2f_asm (arm_vec2f_t * dst, const arm_vec2f_t * cst, unsigned int count);
-extern arm_result_t setc_vec2f_neon(arm_vec2f_t * dst, const arm_vec2f_t * cst, unsigned int count);
-
-extern arm_result_t setc_vec3f_c (arm_vec3f_t * dst, const arm_vec3f_t * cst, unsigned int count);
-extern arm_result_t setc_vec3f_asm (arm_vec3f_t * dst, const arm_vec3f_t * cst, unsigned int count);
-extern arm_result_t setc_vec3f_neon(arm_vec3f_t * dst, const arm_vec3f_t * cst, unsigned int count);
-
-extern arm_result_t setc_vec4f_c (arm_vec4f_t * dst, const arm_vec4f_t * cst, unsigned int count);
-extern arm_result_t setc_vec4f_asm (arm_vec4f_t * dst, const arm_vec4f_t * cst, unsigned int count);
-extern arm_result_t setc_vec4f_neon(arm_vec4f_t * dst, const arm_vec4f_t * cst, unsigned int count);
-
-
-/*
- * Fill the global function table with every setc implementation.
- * Slots are grouped per operation (float, vec2f, vec3f, vec4f); inside each
- * group the order is C, ASM, NEON.
- */
-void init_ftbl()
-{
- // every function listed here has an actual implementation
- static const arm_func_3args_t impls[] =
- {
-  (arm_func_3args_t) setc_float_c,
-  (arm_func_3args_t) setc_float_asm,
-  (arm_func_3args_t) setc_float_neon,
-
-  (arm_func_3args_t) setc_vec2f_c,
-  (arm_func_3args_t) setc_vec2f_asm,
-  (arm_func_3args_t) setc_vec2f_neon,
-
-  (arm_func_3args_t) setc_vec3f_c,
-  (arm_func_3args_t) setc_vec3f_asm,
-  (arm_func_3args_t) setc_vec3f_neon,
-
-  (arm_func_3args_t) setc_vec4f_c,
-  (arm_func_3args_t) setc_vec4f_asm,
-  (arm_func_3args_t) setc_vec4f_neon
- };
- unsigned int slot;
-
- // copy the entries into ftbl[0] .. ftbl[11], preserving their order
- for ( slot = 0; slot < sizeof( impls ) / sizeof( impls[0] ); slot++ )
- {
-  ftbl[ slot ] = impls[ slot ];
- }
-}
-
-/* Entry point: hands the command line to run_test() and returns its result. */
-arm_result_t main( int argc, char **argv )
-{
- arm_result_t status = run_test( argc, argv ); // run_test is defined in "unit_test.h"
-
- return status;
-}
+++ /dev/null
-@
-@ Copyright 2011-12 ARM Limited
-@
-@ Licensed under the Apache License, Version 2.0 (the "License");
-@ you may not use this file except in compliance with the License.
-@ You may obtain a copy of the License at
-@
-@ http://www.apache.org/licenses/LICENSE-2.0
-@
-@ Unless required by applicable law or agreed to in writing, software
-@ distributed under the License is distributed on an "AS IS" BASIS,
-@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@ See the License for the specific language governing permissions and
-@ limitations under the License.
-@
-
-@
-@ NE10 Library : source/NE10_sub.asm.s
-@
-
- .text
- .syntax unified
-
-.include "headers/NE10header.s"
-
- .balign 4
- .global sub_float_asm
- .thumb
- .thumb_func
-
-sub_float_asm:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t sub_float(arm_float_t * dst,
- @ arm_float_t * src1, arm_float_t * src2,
- @ unsigned int count)
- @
- @ Element-wise subtraction: dst[i] = src1[i] - src2[i]
- @
- @ r0: *dst & current dst entry's address (advanced by 4 bytes per item)
- @ r1: *src1 & current src1 entry's address (advanced by 4 bytes per item)
- @ r2: *src2 & current src2 entry's address (advanced by 4 bytes per item)
- @ r3: int count
- @
- @ r3: loop counter (counts down to zero)
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- cbz r3, .LoopEndFloat @ Nothing to do when count == 0
-
-.LoopBeginFloat:
- vldr s1, [r1] @ Load s1 = src1[i]
- add r1, r1, #4 @ move to the next entry
- vldr s2, [r2] @ Load s2 = src2[i]
- add r2, r2, #4 @ next entry
- vsub.f32 s10, s1, s2 @ s10 = src1[i] - src2[i]
- vstr s10, [r0] @ Store the result back into the main memory
- add r0, r0, #4 @ next entry in the dst
- subs r3, r3, #1 @ one item done: decrement the remaining count
- bne .LoopBeginFloat @ Repeat while the remaining count is non-zero
-
-.LoopEndFloat:
- mov r0, NE10_OK @ Return NE10_OK
- bx lr
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : source/NE10_sub.c
- */
-
-#include "NE10.h"
-#include "../headers/macros.h"
-
-#include <assert.h>
-
-/*
- * Element-wise subtraction of two float arrays: dst[itr] = src1[itr] - src2[itr].
- * The statement is passed to NE10_X_OPERATION_FLOAT_C, which is defined outside
- * this file; presumably it supplies the loop over `count`, the `itr` index and
- * the return value - TODO confirm against the macro definition.
- */
-arm_result_t sub_float_c(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count)
-{
- NE10_X_OPERATION_FLOAT_C
- (
- dst[ itr ] = src1[ itr ] - src2[ itr ];
- );
-}
-
-/*
- * Element-wise subtraction of two arrays of 2D vectors, component by component:
- * dst[itr] = src1[itr] - src2[itr].
- * NOTE(review): this uses the same NE10_X_OPERATION_FLOAT_C wrapper as the
- * scalar variant; confirm the macro (defined outside this file) does not
- * depend on the element type.
- */
-arm_result_t sub_vec2f_c(arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count)
-{
- NE10_X_OPERATION_FLOAT_C
- (
- dst[ itr ].x = src1[ itr ].x - src2[ itr ].x;
- dst[ itr ].y = src1[ itr ].y - src2[ itr ].y;
- );
-}
-
-/*
- * Element-wise subtraction of two arrays of 3D vectors, component by component:
- * dst[itr] = src1[itr] - src2[itr].
- * NOTE(review): this uses the same NE10_X_OPERATION_FLOAT_C wrapper as the
- * scalar variant; confirm the macro (defined outside this file) does not
- * depend on the element type.
- */
-arm_result_t sub_vec3f_c(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count)
-{
- NE10_X_OPERATION_FLOAT_C
- (
- dst[ itr ].x = src1[ itr ].x - src2[ itr ].x;
- dst[ itr ].y = src1[ itr ].y - src2[ itr ].y;
- dst[ itr ].z = src1[ itr ].z - src2[ itr ].z;
- );
-}
-
-/*
- * Element-wise subtraction of two arrays of 4D vectors, component by component:
- * dst[itr] = src1[itr] - src2[itr].
- * NOTE(review): this uses the same NE10_X_OPERATION_FLOAT_C wrapper as the
- * scalar variant; confirm the macro (defined outside this file) does not
- * depend on the element type.
- */
-arm_result_t sub_vec4f_c(arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count)
-{
- NE10_X_OPERATION_FLOAT_C
- (
- dst[ itr ].x = src1[ itr ].x - src2[ itr ].x;
- dst[ itr ].y = src1[ itr ].y - src2[ itr ].y;
- dst[ itr ].z = src1[ itr ].z - src2[ itr ].z;
- dst[ itr ].w = src1[ itr ].w - src2[ itr ].w;
- );
-}
+++ /dev/null
-@
-@ Copyright 2011-12 ARM Limited
-@
-@ Licensed under the Apache License, Version 2.0 (the "License");
-@ you may not use this file except in compliance with the License.
-@ You may obtain a copy of the License at
-@
-@ http://www.apache.org/licenses/LICENSE-2.0
-@
-@ Unless required by applicable law or agreed to in writing, software
-@ distributed under the License is distributed on an "AS IS" BASIS,
-@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@ See the License for the specific language governing permissions and
-@ limitations under the License.
-@
-
-@
-@ NE10 Library : source/NE10_sub.neon.s
-@
-
- .text
- .syntax unified
-
-.include "headers/NE10header.s"
-
-
-
-
- .align 4
- .global sub_float_neon
- .thumb
- .thumb_func
-
-sub_float_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t sub_float(arm_float_t * dst,
- @ arm_float_t * src1,
- @ arm_float_t * src2,
- @ unsigned int count)
- @
- @ Element-wise subtraction: dst[i] = src1[i] - src2[i]. The bulk of the
- @ array is processed four floats at a time, then the remaining (count % 4)
- @ floats are processed one by one.
- @
- @ r0: *dst & current dst entry's address
- @ r1: *src1 & current src1 entry's address
- @ r2: *src2 & current src2 entry's address
- @ r3: int count & the number of items in the input array that can be
- @ processed in chunks of 4 floats
- @
- @ r4: the number of items that are left to be processed at the end of
- @ the input array
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- push {r4}
- and r4, r3, #3 @ r4 = count % 4; the left-over items handled by the second loop
- sub r3, r3, r4 @ r3 = count - r4; a multiple of 4, handled by the main loop
-
- cbz r3, .L_check_float @ fewer than 4 items: skip the main loop
-
- @ load the 1st set of values
- vld1.32 {q0}, [r1]!
- vld1.32 {q1}, [r2]!
- subs r3, r3, #4 @ 4 for this set
-
- @ calculate values for the 1st set
- vsub.f32 q3, q0, q1 @ q3 = q0 - q1
-
- ble .L_mainloopend_float @ uses the flags set by the subs above: no further full set to load
-
-.L_mainloop_float:
- @ store the result for the current set
- vst1.32 {d6,d7}, [r0]!
-
- @ load the next set of values
- vld1.32 {q0}, [r1]!
- vld1.32 {q1}, [r2]!
- subs r3, r3, #4
-
- @ calculate values for the next set
- vsub.f32 q3, q0, q1 @ q3 = q0 - q1
-
- bgt .L_mainloop_float @ loop if r3 > 0, if we have at least another 4 floats
-
-.L_mainloopend_float:
- @ the last iteration for this call
- @ store the result for the last set
- vst1.32 {d6,d7}, [r0]!
-
-
-.L_check_float:
- @ check if anything left to process at the end of the input array
- cmp r4, #0
- ble .L_return_float
-
-.L_secondloop_float:
- @ process the last few items left in the input array
- vld1.f32 d0[0], [r1]! @ Fill in d0[0] with the next src1 item
- vld1.f32 d1[0], [r2]! @ Fill in d1[0] with the next src2 item
-
-
- subs r4, r4, #1
-
- @ values
- vsub.f32 d0, d0, d1 @ both lanes are subtracted, but only lane 0 is stored below
-
- vst1.32 {d0[0]}, [r0]!
-
- bgt .L_secondloop_float @ uses the flags set by the subs above: loop while items remain
-
-.L_return_float:
- @ return
- pop {r4}
- mov r0, #0
- bx lr
-
-
-
-
- .align 4
- .global sub_vec2f_neon
- .thumb
- .thumb_func
- @ NEON component-wise subtraction of 2D float vectors: dst[i] = src1[i] - src2[i]
-sub_vec2f_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t sub_vec2f(arm_vec2f_t * dst,
- @ arm_vec2f_t * src1,
- @ arm_vec2f_t * src2,
- @ unsigned int count)
- @
- @ r0: *dst & current dst entry's address
- @ r1: *src1 & current src1 entry's address
- @ r2: *src2 & current src2 entry's address
- @ r3: int count & the number of items in the input array that can be
- @ processed in chunks of 4 vectors
- @
- @ r4: the number of items that are left to be processed at the end of
- @ the input array (count % 4), handled one vector at a time
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- push {r4} @ r4 is used as scratch below and restored at .L_return_vec2
- and r4, r3, #3 @ r4 = count % 4;
- sub r3, r3, r4 @ r3 = count - r4; the multiple-of-4 part handled by the main loop (r4 vectors remain for the second loop)
-
- cbz r3, .L_check_vec2 @ fewer than 4 vectors in total: skip the main loop
-
- @ load the 1st set of values
- vld2.32 {q0-q1}, [r1]! @ 4 vectors of src1, de-interleaved: q0 = x components, q1 = y components
- vld2.32 {q2-q3}, [r2]! @ 4 vectors of src2, de-interleaved: q2 = x components, q3 = y components
- subs r3, r3, #4 @ 4 for this set
-
- @ calculate values for the 1st set
- vsub.f32 q8, q0, q2 @ q8 = src1.x - src2.x
- vsub.f32 q9, q1, q3 @ q9 = src1.y - src2.y
-
- ble .L_mainloopend_vec2 @ flags still come from the subs above: only one set, go store it
-
-.L_mainloop_vec2:
- @ store the result for the current set
- vst2.32 {d16,d17,d18,d19}, [r0]! @ q8,q9 re-interleaved back into x,y pairs
-
- @ load the next set of values
- vld2.32 {q0-q1}, [r1]!
- vld2.32 {q2-q3}, [r2]!
- subs r3, r3, #4
-
- @ calculate values for the next set
- vsub.f32 q8, q0, q2
- vsub.f32 q9, q1, q3
-
- bgt .L_mainloop_vec2 @ loop if r3 > 0, if we have at least another 4 vectors (8 floats) to process
-
-.L_mainloopend_vec2:
- @ the last iteration for this call
- @ store the result for the last set
- vst2.32 {d16,d17,d18,d19}, [r0]!
-
-.L_check_vec2:
- @ check if anything left to process at the end of the input array
- cmp r4, #0
- ble .L_return_vec2
-
-.L_secondloop_vec2:
- @ process the last few items left in the input array
- vld1.f32 d0, [r1]! @ d0 = one src1 vector { x, y }
- vld1.f32 d1, [r2]! @ d1 = one src2 vector { x, y }
-
- subs r4, r4, #1 @ sets the flags tested by the bgt at the end of this loop
-
- @ calculate values
- vsub.f32 d0, d0, d1
-
- vst1.32 {d0}, [r0]!
-
- bgt .L_secondloop_vec2 @ loop while r4 > 0; the NEON instructions above do not alter the flags
-
-.L_return_vec2:
- @ return
- pop {r4}
- mov r0, #0 @ return value 0 (presumably NE10_OK -- confirm against headers/NE10header.s)
- bx lr
-
-
-
-
- .align 4
- .global sub_vec3f_neon
- .thumb
- .thumb_func
-sub_vec3f_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ NEON component-wise subtraction of 3D float vectors: dst[i] = src1[i] - src2[i]
- @ arm_result_t sub_vec3f(arm_vec3f_t * dst,
- @ arm_vec3f_t * src1,
- @ arm_vec3f_t * src2,
- @ unsigned int count)
- @
- @ r0: *dst & current dst entry's address
- @ r1: *src1 & current src1 entry's address
- @ r2: *src2 & current src2 entry's address
- @ r3: int count & the number of items in the input array that can be
- @ processed in chunks of 4 vectors
- @
- @ r4: the number of items that are left to be processed at the end of
- @ the input array (count % 4), handled one vector at a time
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- push {r4} @ r4 is used as scratch below and restored at .L_return_vec3
- and r4, r3, #3 @ r4 = count % 4;
- sub r3, r3, r4 @ r3 = count - r4; the multiple-of-4 part handled by the main loop (r4 vectors remain for the second loop)
-
- cmp r3, #0
- beq .L_check_vec3 @ fewer than 4 vectors in total: skip the main loop
-
- @ load the 1st set of values
- vld3.32 {d0, d2, d4}, [r1]! @ src1 vectors 0-1, de-interleaved into the low halves of q0 (x), q1 (y), q2 (z)
- vld3.32 {d1, d3, d5}, [r1]! @ src1 vectors 2-3 into the high halves of q0, q1, q2
- vld3.32 {d18, d20, d22}, [r2]! @ src2 vectors 0-1 into the low halves of q9 (x), q10 (y), q11 (z)
- vld3.32 {d19, d21, d23}, [r2]! @ src2 vectors 2-3 into the high halves of q9, q10, q11
- subs r3, r3, #4 @ 4 for this set
-
- @ calculate values for the 1st set
- vsub.f32 q12, q0, q9 @ q12 = src1.x - src2.x
- vsub.f32 q13, q1, q10 @ q13 = src1.y - src2.y
- vsub.f32 q14, q2, q11 @ q14 = src1.z - src2.z
-
- ble .L_mainloopend_vec3 @ flags still come from the subs above: only one set, go store it
-
-.L_mainloop_vec3:
- @ store the result for the current set
- vst3.32 {d24, d26, d28}, [r0]! @ result vectors 0-1 (low halves of q12, q13, q14), re-interleaved
- vst3.32 {d25, d27, d29}, [r0]! @ result vectors 2-3 (high halves)
-
- @ load the next set of values
- vld3.32 {d0, d2, d4}, [r1]!
- vld3.32 {d1, d3, d5}, [r1]!
- vld3.32 {d18, d20, d22}, [r2]!
- vld3.32 {d19, d21, d23}, [r2]!
- subs r3, r3, #4
-
- @ calculate values for the next set
- vsub.f32 q12, q0, q9
- vsub.f32 q13, q1, q10
- vsub.f32 q14, q2, q11
-
- bgt .L_mainloop_vec3 @ loop if r3 > 0, if we have at least another 4 vectors (12 floats) to process
-
-.L_mainloopend_vec3:
- @ the last iteration for this call
- @ store the result for the last set
- vst3.32 {d24, d26, d28}, [r0]!
- vst3.32 {d25, d27, d29}, [r0]!
-
-.L_check_vec3:
- @ check if anything left to process at the end of the input array
- cmp r4, #0
- ble .L_return_vec3
-
-.L_secondloop_vec3:
- @ process the last few items left in the input array
- vld3.f32 {d0[0], d2[0], d4[0]}, [r1]! @ The values are loaded like so:
- @ q0 = { V1.x, -, -, - };
- @ q1 = { V1.y, -, -, - };
- @ q2 = { V1.z, -, -, - };
- vld3.f32 {d1[0], d3[0], d5[0]}, [r2]! @ The values are loaded like so:
- @ q0 = { V1.x, -, V2.x, - };
- @ q1 = { V1.y, -, V2.y, - };
- @ q2 = { V1.z, -, V2.z, - };
-
- subs r4, r4, #1 @ sets the flags tested by the bgt at the end of this loop
-
- @ calculate values for this vector: low half (V1) minus high half (V2) of each q register
- vsub.f32 d0, d0, d1
- vsub.f32 d2, d2, d3
- vsub.f32 d4, d4, d5
-
- vst3.32 {d0[0], d2[0], d4[0]}, [r0]! @ store lane 0 of each result as { x, y, z }
-
- bgt .L_secondloop_vec3 @ loop while r4 > 0; the NEON instructions above do not alter the flags
-
-.L_return_vec3:
- @ return
- pop {r4}
- mov r0, #0 @ return value 0 (presumably NE10_OK -- confirm against headers/NE10header.s)
- bx lr
-
-
-
-
- .align 4
- .global sub_vec4f_neon
- .thumb
- .thumb_func
-sub_vec4f_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ NEON component-wise subtraction of 4D float vectors: dst[i] = src1[i] - src2[i]
- @ arm_result_t sub_vec4f(arm_vec4f_t * dst,
- @ arm_vec4f_t * src1,
- @ arm_vec4f_t * src2,
- @ unsigned int count)
- @
- @ r0: *dst & current dst entry's address
- @ r1: *src1 & current src1 entry's address
- @ r2: *src2 & current src2 entry's address
- @ r3: int count & the number of items in the input array that can be
- @ processed in chunks of 4 vectors
- @
- @ r4: the number of items that are left to be processed at the end of
- @ the input array (count % 4), handled one vector at a time
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- push {r4} @ r4 is used as scratch below and restored at .L_return_vec4
- and r4, r3, #3 @ r4 = count % 4;
- sub r3, r3, r4 @ r3 = count - r4; the multiple-of-4 part handled by the main loop (r4 vectors remain for the second loop)
-
- cmp r3, #0
- beq .L_check_vec4 @ fewer than 4 vectors in total: skip the main loop
-
- @ load the 1st set of values
- vld4.32 {d0, d2, d4, d6}, [r1]! @ src1 vectors 0-1, de-interleaved into the low halves of q0 (x), q1 (y), q2 (z), q3 (w)
- vld4.32 {d1, d3, d5, d7}, [r1]! @ src1 vectors 2-3 into the high halves of q0..q3
- vld4.32 {d16, d18, d20, d22}, [r2]! @ src2 vectors 0-1 into the low halves of q8 (x), q9 (y), q10 (z), q11 (w)
- vld4.32 {d17, d19, d21, d23}, [r2]! @ src2 vectors 2-3 into the high halves of q8..q11
-
- subs r3, r3, #4 @ 4 for this set
-
- @ calculate values for the 1st set
- vsub.f32 q12, q0, q8 @ q12 = src1.x - src2.x
- vsub.f32 q13, q1, q9 @ q13 = src1.y - src2.y
- vsub.f32 q14, q2, q10 @ q14 = src1.z - src2.z
- vsub.f32 q15, q3, q11 @ q15 = src1.w - src2.w
-
- ble .L_mainloopend_vec4 @ flags still come from the subs above: only one set, go store it
-
-.L_mainloop_vec4:
- @ store the result for the current set
- vst4.32 {d24, d26, d28, d30}, [r0]! @ result vectors 0-1 (low halves of q12..q15), re-interleaved
- vst4.32 {d25, d27, d29, d31}, [r0]! @ result vectors 2-3 (high halves)
-
- @ load the next set of values
- vld4.32 {d0, d2, d4, d6}, [r1]!
- vld4.32 {d1, d3, d5, d7}, [r1]!
- vld4.32 {d16, d18, d20, d22}, [r2]!
- vld4.32 {d17, d19, d21, d23}, [r2]!
- subs r3, r3, #4
-
- @ calculate values for the next set
- vsub.f32 q12, q0, q8
- vsub.f32 q13, q1, q9
- vsub.f32 q14, q2, q10
- vsub.f32 q15, q3, q11
-
- bgt .L_mainloop_vec4 @ loop if r3 > 0, if we have at least another 4 vectors (16 floats) to process
-
-.L_mainloopend_vec4:
- @ the last iteration for this call
- @ store the result for the last set
- vst4.32 {d24, d26, d28, d30}, [r0]!
- vst4.32 {d25, d27, d29, d31}, [r0]!
-
-.L_check_vec4:
- @ check if anything left to process at the end of the input array
- cmp r4, #0
- ble .L_return_vec4
-
-.L_secondloop_vec4:
- @ process the last few items left in the input array
- vld1.f32 {d0, d1}, [r1]! @ The values are loaded like so:
- @ q0 = { V1.x, V1.y, V1.z, V1.w };
- vld1.f32 {d2, d3}, [r2]! @ The values are loaded like so:
- @ q1 = { V2.x, V2.y, V2.z, V2.w };
-
- subs r4, r4, #1 @ sets the flags tested by the bgt at the end of this loop
-
- @ calculate values
- vsub.f32 q0, q0, q1
-
- vst1.32 {d0, d1}, [r0]! @ d0,d1 are the two halves of q0
-
- bgt .L_secondloop_vec4 @ loop while r4 > 0; the NEON instructions above do not alter the flags
-
-.L_return_vec4:
- @ return
- pop {r4}
- mov r0, #0 @ return value 0 (presumably NE10_OK -- confirm against headers/NE10header.s)
- bx lr
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : source/NE10_sub_test.c
- */
-
-//Make sure the following are defined before including "unit_test.h"
-
-// length of the data arrays
-#define ARRLEN TEST_ARRLEN
-// number of the operations in a given unit
-#define OP_COUNT 4
-// number of the different implementations of each of the functions (C, ASM, NEON, ...)
-#define IMPL_COUNT 3
-
-
-#include "../headers/unit_test_x_operation_x.h"
-
-extern arm_result_t sub_float_c (arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count);
-//extern arm_result_t sub_float_asm (arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count); // the assembly versions haven't been implemented; these are for future use
-extern arm_result_t sub_float_neon(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count);
-
-extern arm_result_t sub_vec2f_c (arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
-//extern arm_result_t sub_vec2f_asm (arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
-extern arm_result_t sub_vec2f_neon(arm_vec2f_t * dst, arm_vec2f_t * src1, arm_vec2f_t * src2, unsigned int count);
-
-extern arm_result_t sub_vec3f_c (arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
-//extern arm_result_t sub_vec3f_asm (arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
-extern arm_result_t sub_vec3f_neon(arm_vec3f_t * dst, arm_vec3f_t * src1, arm_vec3f_t * src2, unsigned int count);
-
-extern arm_result_t sub_vec4f_c (arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
-//extern arm_result_t sub_vec4f_asm (arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
-extern arm_result_t sub_vec4f_neon(arm_vec4f_t * dst, arm_vec4f_t * src1, arm_vec4f_t * src2, unsigned int count);
-
-void init_ftbl()
-{
- // Implementations listed in ftbl order: three consecutive slots per
- // operation. No assembly version of these routines has been implemented,
- // so the middle slot of every triple reuses the C implementation.
- const arm_func_4args_t impls[] =
- {
- (arm_func_4args_t) sub_float_c, (arm_func_4args_t) sub_float_c, (arm_func_4args_t) sub_float_neon,
- (arm_func_4args_t) sub_vec2f_c, (arm_func_4args_t) sub_vec2f_c, (arm_func_4args_t) sub_vec2f_neon,
- (arm_func_4args_t) sub_vec3f_c, (arm_func_4args_t) sub_vec3f_c, (arm_func_4args_t) sub_vec3f_neon,
- (arm_func_4args_t) sub_vec4f_c, (arm_func_4args_t) sub_vec4f_c, (arm_func_4args_t) sub_vec4f_neon
- };
- const unsigned int impl_total = sizeof( impls ) / sizeof( impls[0] );
- unsigned int slot;
-
- // copy the local table into the global function table, slot by slot
- for ( slot = 0; slot < impl_total; slot++ )
- {
- ftbl[ slot ] = impls[ slot ];
- }
-}
-
-arm_result_t main( int argc, char **argv )
-{
- // Hand the command line to the shared test driver (defined in "unit_test.h")
- // and use its result as this program's return value.
- const arm_result_t status = run_test( argc, argv );
-
- return status;
-}
+++ /dev/null
-@
-@ Copyright 2011-12 ARM Limited
-@
-@ Licensed under the Apache License, Version 2.0 (the "License");
-@ you may not use this file except in compliance with the License.
-@ You may obtain a copy of the License at
-@
-@ http://www.apache.org/licenses/LICENSE-2.0
-@
-@ Unless required by applicable law or agreed to in writing, software
-@ distributed under the License is distributed on an "AS IS" BASIS,
-@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@ See the License for the specific language governing permissions and
-@ limitations under the License.
-@
-
-@
-@ NE10 Library : source/NE10_subc.asm.s
-@
-
- .text
- .syntax unified
-
-.include "headers/NE10header.s"
-
- .balign 4
- .global subc_float_asm
- .thumb
- .thumb_func
- @ Scalar VFP routine: dst[i] = src[i] - cst for i in [0, count)
-subc_float_asm:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t subc_float(arm_float_t * dst,
- @ arm_float_t * src, const arm_float_t cst,
- @ unsigned int count)
- @
- @ r0: *dst
- @ r1: *src
- @ r2: cst (the float is passed by value; its bits are moved into s3 below)
- @ r3: int count
- @
- @ r3: loop counter
- @ r5: current item's offset in both src[] and dst[]
- @ r6: current source item's address made of base(r1)+offset(r5)
- @ r7: current destination item's address made of base(r0)+offset(r5)
- @
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- push {r4, r5, r6, r7} @ r4 is saved and restored but not otherwise used in this routine
- cbz r3, .LoopEndFloat @ count == 0: nothing to do
- mov r5, #0
-
-.LoopBeginFloat:
- add r6, r1, r5 @ Get current source item's address in memory
- vldr s1, [r6, #0] @ Load src[i]
- vmov s3, r2 @ Get cst into register s3 (repeated every iteration although r2 never changes)
- vsub.f32 s10, s1, s3 @ s10 = src[i] - cst
- add r7, r0, r5 @ Get current destination item's address in memory
- vstr s10, [r7, #0] @ Store the result back into the main memory
- add r5, r5, #4 @ increase the offset by 1*sizeof(float)
- subs r3, r3, #1 @ count down the number of remaining items
- bne .LoopBeginFloat @ Continue while items remain (r3 != 0)
-
-.LoopEndFloat:
- mov r0, NE10_OK @ Return NE10_OK
- pop {r4, r5, r6, r7}
- bx lr
-
-
-
-
- .balign 4
- .global subc_vec2f_asm
- .thumb
- .thumb_func
- @ Scalar VFP routine: dst[i] = src[i] - *cst, component-wise, for 2D float vectors
-subc_vec2f_asm:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t subc_vec2f(arm_vec2f_t * dst,
- @ arm_vec2f_t * src, const arm_vec2f_t * cst,
- @ unsigned int count)
- @
- @ r0: *dst
- @ r1: *src
- @ r2: *cst
- @ r3: int count
- @
- @ r3: loop counter
- @ r5: current item's offset in both src[] and dst[]
- @ r6: current source item's address made of base(r1)+offset(r5)
- @ r7: current destination item's address made of base(r0)+offset(r5)
- @
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- push {r4, r5, r6, r7} @ r4 is saved and restored but not otherwise used in this routine
- cbz r3, .LoopEndVec2F @ count == 0: nothing to do
- mov r5, #0
-
-.LoopBeginVec2F:
- add r6, r1, r5 @ Get current source item's address in memory
- vldr s1, [r6, #0] @ Load src[i].x and src[i].y
- vldr s2, [r6, #4]
- vldr s3, [r2, #0] @ Load cst->x and cst->y (re-read from memory on every iteration)
- vldr s4, [r2, #4]
- vsub.f32 s10, s1, s3 @ s10 = src[i].x - cst->x
- vsub.f32 s11, s2, s4 @ s11 = src[i].y - cst->y
- add r7, r0, r5 @ Get current destination item's address in memory
- vstr s10, [r7, #0] @ Store the results back into the main memory
- vstr s11, [r7, #4]
- add r5, r5, #8 @ increase the offset by 2*sizeof(float) @@ (for x and y)
- subs r3, r3, #1 @ count down the number of remaining items
- bne .LoopBeginVec2F @ Continue while items remain (r3 != 0)
-
-.LoopEndVec2F:
- mov r0, NE10_OK @ Return NE10_OK
- pop {r4, r5, r6, r7}
- bx lr
-
-
-
-
- .balign 4
- .global subc_vec3f_asm
- .thumb
- .thumb_func
- @ Scalar VFP routine: dst[i] = src[i] - *cst, component-wise, for 3D float vectors
-subc_vec3f_asm:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t subc_vec3f(arm_vec3f_t * dst,
- @ arm_vec3f_t * src, const arm_vec3f_t * cst,
- @ unsigned int count)
- @
- @ r0: *dst
- @ r1: *src
- @ r2: *cst
- @ r3: int count
- @
- @ r3: loop counter
- @ r5: current item's offset in both src[] and dst[]
- @ r6: current source item's address made of base(r1)+offset(r5)
- @ r7: current destination item's address made of base(r0)+offset(r5)
- @
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- push {r4, r5, r6, r7} @ r4 is saved and restored but not otherwise used in this routine
- cbz r3, .LoopEndVec3F @ count == 0: nothing to do
- mov r5, #0
-
-.LoopBeginVec3F:
- add r6, r1, r5 @ Get current source item's address in memory
- vldr s1, [r6, #0] @ Load src[i].x, src[i].y , and src[i].z
- vldr s2, [r6, #4]
- vldr s3, [r6, #8]
- vldr s4, [r2, #0] @ Load cst->x, cst->y, and cst->z (re-read from memory on every iteration)
- vldr s5, [r2, #4]
- vldr s6, [r2, #8]
- vsub.f32 s10, s1, s4 @ s10 = src[i].x - cst->x
- vsub.f32 s11, s2, s5 @ s11 = src[i].y - cst->y
- vsub.f32 s12, s3, s6 @ s12 = src[i].z - cst->z
- add r7, r0, r5 @ Get current destination item's address in memory
- vstr s10, [r7, #0] @ Store the results back into the main memory
- vstr s11, [r7, #4]
- vstr s12, [r7, #8]
- add r5, r5, #12 @ increase the offset by 3*sizeof(float) @@ (for x, y, and z)
- subs r3, r3, #1 @ count down the number of remaining items
- bne .LoopBeginVec3F @ Continue while items remain (r3 != 0)
-
-.LoopEndVec3F:
- mov r0, NE10_OK @ Return NE10_OK
- pop {r4, r5, r6, r7}
- bx lr
-
-
-
-
- .balign 4
- .global subc_vec4f_asm
- .thumb
- .thumb_func
- @ Scalar VFP routine: dst[i] = src[i] - *cst, component-wise, for 4D float vectors
-subc_vec4f_asm:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t subc_vec4f(arm_vec4f_t * dst,
- @ arm_vec4f_t * src, const arm_vec4f_t * cst,
- @ unsigned int count)
- @
- @ r0: *dst
- @ r1: *src
- @ r2: *cst
- @ r3: int count
- @
- @ r3: loop counter
- @ r5: current item's offset in both src[] and dst[]
- @ r6: current source item's address made of base(r1)+offset(r5)
- @ r7: current destination item's address made of base(r0)+offset(r5)
- @
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- push {r4, r5, r6, r7} @ r4 is saved and restored but not otherwise used in this routine
- cbz r3, .LoopEndVec4F @ count == 0: nothing to do
- mov r5, #0
-
-.LoopBeginVec4F:
- add r6, r1, r5 @ Get current source item's address in memory
- vldr s1, [r6, #0] @ Load src[i].x, src[i].y , src[i].z, and w
- vldr s2, [r6, #4]
- vldr s3, [r6, #8]
- vldr s4, [r6, #12]
- vldr s5, [r2, #0] @ Load cst->x, cst->y, cst->z, and w (re-read from memory on every iteration)
- vldr s6, [r2, #4]
- vldr s7, [r2, #8]
- vldr s8, [r2, #12]
- vsub.f32 s10, s1, s5 @ s10 = src[i].x - cst->x
- vsub.f32 s11, s2, s6 @ s11 = src[i].y - cst->y
- vsub.f32 s12, s3, s7 @ s12 = src[i].z - cst->z
- vsub.f32 s13, s4, s8 @ s13 = src[i].w - cst->w
- add r7, r0, r5 @ Get current destination item's address in memory
- vstr s10, [r7, #0] @ Store the results back into the main memory
- vstr s11, [r7, #4]
- vstr s12, [r7, #8]
- vstr s13, [r7, #12]
- add r5, r5, #16 @ increase the offset by 4*sizeof(float) @@ (for x, y, z, and w)
- subs r3, r3, #1 @ count down the number of remaining items
- bne .LoopBeginVec4F @ Continue while items remain (r3 != 0)
-
-.LoopEndVec4F:
- mov r0, NE10_OK @ Return NE10_OK
- pop {r4, r5, r6, r7}
- bx lr
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : source/NE10_subc.c
- */
-
-#include "NE10.h"
-#include "../headers/macros.h"
-
-#include <assert.h>
-
-arm_result_t subc_float_c(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count)
-{ /* Plain-C subtraction of one constant from every float: dst = src - cst.
- * `cst` is passed by value. The statement handed to NE10_XC_OPERATION_X_C is
- * written for a single element indexed by `itr`; the macro (not visible here,
- * presumably from headers/macros.h) is assumed to declare `itr`, iterate over
- * `count` elements and produce the return value -- TODO confirm.
- */
- NE10_XC_OPERATION_X_C
- (
- dst[ itr ] = src[ itr ] - cst;
- );
-}
-
-arm_result_t subc_vec2f_c(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count)
-{ /* Plain-C subtraction of one constant 2D vector from every element:
- * dst = src - *cst, component-wise. The statements handed to
- * NE10_XC_OPERATION_X_C are written for a single element indexed by `itr`;
- * the macro (not visible here, presumably from headers/macros.h) is assumed
- * to declare `itr`, iterate over `count` elements and produce the return
- * value -- TODO confirm.
- */
- NE10_XC_OPERATION_X_C
- (
- dst[ itr ].x = src[ itr ].x - cst->x; /* x component */
- dst[ itr ].y = src[ itr ].y - cst->y; /* y component */
- );
-}
-
-arm_result_t subc_vec3f_c(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count)
-{ /* Plain-C subtraction of one constant 3D vector from every element:
- * dst = src - *cst, component-wise. The statements handed to
- * NE10_XC_OPERATION_X_C are written for a single element indexed by `itr`;
- * the macro (not visible here, presumably from headers/macros.h) is assumed
- * to declare `itr`, iterate over `count` elements and produce the return
- * value -- TODO confirm.
- */
- NE10_XC_OPERATION_X_C
- (
- dst[ itr ].x = src[ itr ].x - cst->x; /* x component */
- dst[ itr ].y = src[ itr ].y - cst->y; /* y component */
- dst[ itr ].z = src[ itr ].z - cst->z; /* z component */
- );
-}
-
-arm_result_t subc_vec4f_c(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count)
-{ /* Plain-C subtraction of one constant 4D vector from every element:
- * dst = src - *cst, component-wise. The statements handed to
- * NE10_XC_OPERATION_X_C are written for a single element indexed by `itr`;
- * the macro (not visible here, presumably from headers/macros.h) is assumed
- * to declare `itr`, iterate over `count` elements and produce the return
- * value -- TODO confirm.
- */
- NE10_XC_OPERATION_X_C
- (
- dst[ itr ].x = src[ itr ].x - cst->x; /* x component */
- dst[ itr ].y = src[ itr ].y - cst->y; /* y component */
- dst[ itr ].z = src[ itr ].z - cst->z; /* z component */
- dst[ itr ].w = src[ itr ].w - cst->w; /* w component */
- );
-}
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : source/NE10_subc.neon.c
- */
-
-#include "NE10.h"
-#include "../headers/macros.h"
-
-#include <assert.h>
-#include <arm_neon.h>
-
-
-arm_result_t subc_float_neon(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count)
-{ /* NEON-intrinsics version of dst = src - cst for float arrays.
- * NE10_XC_OPERATION_FLOAT_NEON receives two code fragments separated by the
- * bare comma. The variables n_dst, n_src, n_cst, n_tmp_src and n_tmp_cst are
- * not declared in this block; presumably the macro declares, loads and stores
- * them and uses the second fragment for leftover elements -- TODO confirm
- * against headers/macros.h.
- */
- NE10_XC_OPERATION_FLOAT_NEON
- (
- n_dst = vsubq_f32( n_src , n_cst ); /* 128-bit form: four float lanes per call */
- ,
- n_tmp_src = vsub_f32( n_tmp_src, n_tmp_cst ); /* 64-bit form: two float lanes, result kept in n_tmp_src */
- );
-}
-
-arm_result_t subc_vec2f_neon(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count)
-{ /* NEON-intrinsics version of dst = src - *cst for 2D float vectors.
- * NE10_XC_OPERATION_VEC2F_NEON receives two code fragments separated by the
- * bare comma. The variables n_dst, n_src, n_cst, n_tmp_src and n_tmp_cst are
- * not declared in this block; presumably the macro declares, loads and stores
- * them and uses the second fragment for leftover elements -- TODO confirm
- * against headers/macros.h.
- */
- NE10_XC_OPERATION_VEC2F_NEON
- (
- n_dst = vsubq_f32( n_src , n_cst ); /* 128-bit form: four float lanes per call */
- ,
- n_tmp_src = vsub_f32( n_tmp_src, n_tmp_cst ); /* 64-bit form: two float lanes, result kept in n_tmp_src */
- );
-}
-
-arm_result_t subc_vec3f_neon(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count)
-{ /* NEON-intrinsics version of dst = src - *cst for 3D float vectors.
- * NE10_XC_OPERATION_VEC3F_NEON receives two code fragments separated by the
- * bare comma. The n_dst1..3, n_src1..3, n_cst1..3, n_tmp_src and n_tmp_cst
- * variables are not declared in this block; presumably the macro declares,
- * loads and stores them and uses the second fragment for leftover elements
- * -- TODO confirm against headers/macros.h.
- */
- NE10_XC_OPERATION_VEC3F_NEON
- (
- n_dst1 = vsubq_f32( n_src1 , n_cst1 ); /* three 128-bit subtractions, one per register triple member */
- n_dst2 = vsubq_f32( n_src2 , n_cst2 );
- n_dst3 = vsubq_f32( n_src3 , n_cst3 );
- ,
- n_tmp_src.val[0] = vsub_f32( n_tmp_src.val[0], n_tmp_cst.val[0] ); /* 64-bit subtractions on each member of the .val[] array, in place */
- n_tmp_src.val[1] = vsub_f32( n_tmp_src.val[1], n_tmp_cst.val[1] );
- n_tmp_src.val[2] = vsub_f32( n_tmp_src.val[2], n_tmp_cst.val[2] );
- );
-}
-
-arm_result_t subc_vec4f_neon(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count)
-{ /* NEON-intrinsics version of dst = src - *cst for 4D float vectors.
- * Unlike the float/vec2f/vec3f variants, NE10_XC_OPERATION_VEC4F_NEON takes a
- * single code fragment. n_dst, n_src and n_cst are not declared in this
- * block; presumably the macro declares, loads and stores them -- TODO confirm
- * against headers/macros.h.
- */
- NE10_XC_OPERATION_VEC4F_NEON
- (
- n_dst = vsubq_f32( n_src , n_cst ); /* 128-bit form: four float lanes per call */
- );
-}
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : source/NE10_subc_test.c
- */
-
-//Make sure the following are defined before including "unit_test.h"
-
-// length of the data arrays
-#define ARRLEN TEST_ARRLEN
-// number of the operations in a given unit
-#define OP_COUNT 4
-// number of the different implementations of each of the functions (C, ASM, NEON, ...)
-#define IMPL_COUNT 3
-
-
-#include "../headers/unit_test_xc_operation_x.h"
-
-
-extern arm_result_t subc_float_c (arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count);
-extern arm_result_t subc_float_asm (arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count);
-extern arm_result_t subc_float_neon (arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count);
-
-extern arm_result_t subc_vec2f_c (arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count);
-extern arm_result_t subc_vec2f_asm (arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count);
-extern arm_result_t subc_vec2f_neon(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count);
-
-extern arm_result_t subc_vec3f_c (arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count);
-extern arm_result_t subc_vec3f_asm (arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count);
-extern arm_result_t subc_vec3f_neon(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count);
-
-extern arm_result_t subc_vec4f_c (arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count);
-extern arm_result_t subc_vec4f_asm (arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count);
-extern arm_result_t subc_vec4f_neon(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count);
-
-
-void init_ftbl()
-{
- // Implementations listed in ftbl order: three consecutive slots
- // (C, assembly, NEON) for each of the four operations.
- const arm_func_4args_t impls[] =
- {
- (arm_func_4args_t) subc_float_c, (arm_func_4args_t) subc_float_asm, (arm_func_4args_t) subc_float_neon,
- (arm_func_4args_t) subc_vec2f_c, (arm_func_4args_t) subc_vec2f_asm, (arm_func_4args_t) subc_vec2f_neon,
- (arm_func_4args_t) subc_vec3f_c, (arm_func_4args_t) subc_vec3f_asm, (arm_func_4args_t) subc_vec3f_neon,
- (arm_func_4args_t) subc_vec4f_c, (arm_func_4args_t) subc_vec4f_asm, (arm_func_4args_t) subc_vec4f_neon
- };
- const unsigned int impl_total = sizeof( impls ) / sizeof( impls[0] );
- unsigned int slot;
-
- // copy the local table into the global function table, slot by slot
- for ( slot = 0; slot < impl_total; slot++ )
- {
- ftbl[ slot ] = impls[ slot ];
- }
-}
-
-arm_result_t main( int argc, char **argv )
-{
- // Hand the command line to the shared test driver (defined in "unit_test.h")
- // and use its result as this program's return value.
- const arm_result_t status = run_test( argc, argv );
-
- return status;
-}
+++ /dev/null
-@
-@ Copyright 2011-12 ARM Limited
-@
-@ Licensed under the Apache License, Version 2.0 (the "License");
-@ you may not use this file except in compliance with the License.
-@ You may obtain a copy of the License at
-@
-@ http://www.apache.org/licenses/LICENSE-2.0
-@
-@ Unless required by applicable law or agreed to in writing, software
-@ distributed under the License is distributed on an "AS IS" BASIS,
-@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@ See the License for the specific language governing permissions and
-@ limitations under the License.
-@
-
-@
-@ NE10 Library : source/NE10_submat.asm.s
-@
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : source/NE10_submat.c
- */
-
-#include "NE10.h"
-#include "../headers/macros.h"
-
-#include <assert.h>
-
-arm_result_t submat_2x2f_c(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count)
-{ /* Plain-C entry-wise subtraction of 2x2 float matrices: dst = src1 - src2.
- * Entries are addressed as cN.rM (the names suggest column N, row M -- confirm
- * against the arm_mat2x2f_t declaration). The statements handed to
- * NE10_X_OPERATION_FLOAT_C are written for a single matrix indexed by `itr`;
- * the macro (not visible here, presumably from headers/macros.h) is assumed to
- * declare `itr`, iterate over `count` matrices and produce the return value
- * -- TODO confirm.
- */
- NE10_X_OPERATION_FLOAT_C
- (
- dst[ itr ].c1.r1 = src1[ itr ].c1.r1 - src2[ itr ].c1.r1;
- dst[ itr ].c1.r2 = src1[ itr ].c1.r2 - src2[ itr ].c1.r2;
-
- dst[ itr ].c2.r1 = src1[ itr ].c2.r1 - src2[ itr ].c2.r1;
- dst[ itr ].c2.r2 = src1[ itr ].c2.r2 - src2[ itr ].c2.r2;
- );
-}
-
-arm_result_t submat_3x3f_c(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count)
-{ /* Plain-C entry-wise subtraction of 3x3 float matrices: dst = src1 - src2.
- * Entries are addressed as cN.rM (the names suggest column N, row M -- confirm
- * against the arm_mat3x3f_t declaration). The statements handed to
- * NE10_X_OPERATION_FLOAT_C are written for a single matrix indexed by `itr`;
- * the macro (not visible here, presumably from headers/macros.h) is assumed to
- * declare `itr`, iterate over `count` matrices and produce the return value
- * -- TODO confirm.
- */
- NE10_X_OPERATION_FLOAT_C
- (
- dst[ itr ].c1.r1 = src1[ itr ].c1.r1 - src2[ itr ].c1.r1;
- dst[ itr ].c1.r2 = src1[ itr ].c1.r2 - src2[ itr ].c1.r2;
- dst[ itr ].c1.r3 = src1[ itr ].c1.r3 - src2[ itr ].c1.r3;
-
- dst[ itr ].c2.r1 = src1[ itr ].c2.r1 - src2[ itr ].c2.r1;
- dst[ itr ].c2.r2 = src1[ itr ].c2.r2 - src2[ itr ].c2.r2;
- dst[ itr ].c2.r3 = src1[ itr ].c2.r3 - src2[ itr ].c2.r3;
-
- dst[ itr ].c3.r1 = src1[ itr ].c3.r1 - src2[ itr ].c3.r1;
- dst[ itr ].c3.r2 = src1[ itr ].c3.r2 - src2[ itr ].c3.r2;
- dst[ itr ].c3.r3 = src1[ itr ].c3.r3 - src2[ itr ].c3.r3;
- );
-}
-
-arm_result_t submat_4x4f_c(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count)
-{ /* Plain-C entry-wise subtraction of 4x4 float matrices: dst = src1 - src2.
- * Entries are addressed as cN.rM (the names suggest column N, row M -- confirm
- * against the arm_mat4x4f_t declaration). The statements handed to
- * NE10_X_OPERATION_FLOAT_C are written for a single matrix indexed by `itr`;
- * the macro (not visible here, presumably from headers/macros.h) is assumed to
- * declare `itr`, iterate over `count` matrices and produce the return value
- * -- TODO confirm.
- */
- NE10_X_OPERATION_FLOAT_C
- (
- dst[ itr ].c1.r1 = src1[ itr ].c1.r1 - src2[ itr ].c1.r1;
- dst[ itr ].c1.r2 = src1[ itr ].c1.r2 - src2[ itr ].c1.r2;
- dst[ itr ].c1.r3 = src1[ itr ].c1.r3 - src2[ itr ].c1.r3;
- dst[ itr ].c1.r4 = src1[ itr ].c1.r4 - src2[ itr ].c1.r4;
-
- dst[ itr ].c2.r1 = src1[ itr ].c2.r1 - src2[ itr ].c2.r1;
- dst[ itr ].c2.r2 = src1[ itr ].c2.r2 - src2[ itr ].c2.r2;
- dst[ itr ].c2.r3 = src1[ itr ].c2.r3 - src2[ itr ].c2.r3;
- dst[ itr ].c2.r4 = src1[ itr ].c2.r4 - src2[ itr ].c2.r4;
-
- dst[ itr ].c3.r1 = src1[ itr ].c3.r1 - src2[ itr ].c3.r1;
- dst[ itr ].c3.r2 = src1[ itr ].c3.r2 - src2[ itr ].c3.r2;
- dst[ itr ].c3.r3 = src1[ itr ].c3.r3 - src2[ itr ].c3.r3;
- dst[ itr ].c3.r4 = src1[ itr ].c3.r4 - src2[ itr ].c3.r4;
-
- dst[ itr ].c4.r1 = src1[ itr ].c4.r1 - src2[ itr ].c4.r1;
- dst[ itr ].c4.r2 = src1[ itr ].c4.r2 - src2[ itr ].c4.r2;
- dst[ itr ].c4.r3 = src1[ itr ].c4.r3 - src2[ itr ].c4.r3;
- dst[ itr ].c4.r4 = src1[ itr ].c4.r4 - src2[ itr ].c4.r4;
- );
-}
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "NE10.h"
-
-arm_result_t submat_2x2f_neon(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count)
-{
- /* Subtract 2x2 matrices by reinterpreting every matrix as 2 consecutive
- * vec2f values and delegating to the NEON vec2f subtraction routine. */
- arm_vec2f_t * const out = (arm_vec2f_t*)dst;
- arm_vec2f_t * const lhs = (arm_vec2f_t*)src1;
- arm_vec2f_t * const rhs = (arm_vec2f_t*)src2;
- const unsigned int vec_count = count*2;
-
- return sub_vec2f_neon( out, lhs, rhs, vec_count );
-}
-
-arm_result_t submat_3x3f_neon(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count)
-{
- /* Subtract 3x3 matrices by reinterpreting every matrix as 3 consecutive
- * vec3f values and delegating to the NEON vec3f subtraction routine. */
- arm_vec3f_t * const out = (arm_vec3f_t*)dst;
- arm_vec3f_t * const lhs = (arm_vec3f_t*)src1;
- arm_vec3f_t * const rhs = (arm_vec3f_t*)src2;
- const unsigned int vec_count = count*3;
-
- return sub_vec3f_neon( out, lhs, rhs, vec_count );
-}
-
-arm_result_t submat_4x4f_neon(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count)
-{
- /* Subtract 4x4 matrices by reinterpreting every matrix as 4 consecutive
- * vec4f values and delegating to the NEON vec4f subtraction routine. */
- arm_vec4f_t * const out = (arm_vec4f_t*)dst;
- arm_vec4f_t * const lhs = (arm_vec4f_t*)src1;
- arm_vec4f_t * const rhs = (arm_vec4f_t*)src2;
- const unsigned int vec_count = count*4;
-
- return sub_vec4f_neon( out, lhs, rhs, vec_count );
-}
-
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : source/NE10_submat_test.c
- */
-
-//Make sure the following are defined before including "unit_test.h"
-
-// length of the data arrays
-#define ARRLEN TEST_ARRLEN_MATRICES
-// number of the operations in a given unit
-#define OP_COUNT 3
-// number of the different implementations of each of the functions (C, ASM, NEON, ...)
-#define IMPL_COUNT 3
-
-
-#include "../headers/unit_test_xmat_operation_x.h"
-
-extern arm_result_t submat_2x2f_c (arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count);
-extern arm_result_t submat_2x2f_neon(arm_mat2x2f_t * dst, arm_mat2x2f_t * src1, arm_mat2x2f_t * src2, unsigned int count);
-
-extern arm_result_t submat_3x3f_c (arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count);
-extern arm_result_t submat_3x3f_neon(arm_mat3x3f_t * dst, arm_mat3x3f_t * src1, arm_mat3x3f_t * src2, unsigned int count);
-
-extern arm_result_t submat_4x4f_c (arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count);
-extern arm_result_t submat_4x4f_neon(arm_mat4x4f_t * dst, arm_mat4x4f_t * src1, arm_mat4x4f_t * src2, unsigned int count);
-
-void init_ftbl()
-{
- // manually initialize the global function table with
- // those functions that do have an actual implementation.
- ftbl[ 0] = (arm_func_4args_t) submat_2x2f_c;
- ftbl[ 1] = (arm_func_4args_t) submat_2x2f_c; // using the c version in place of the assembly version
- ftbl[ 2] = (arm_func_4args_t) submat_2x2f_neon;
-
- ftbl[ 3] = (arm_func_4args_t) submat_3x3f_c;
- ftbl[ 4] = (arm_func_4args_t) submat_3x3f_c; // using the c version in place of the assembly version
- ftbl[ 5] = (arm_func_4args_t) submat_3x3f_neon;
-
- ftbl[ 6] = (arm_func_4args_t) submat_4x4f_c;
- ftbl[ 7] = (arm_func_4args_t) submat_4x4f_c; // using the c version in place of the assembly version
- ftbl[ 8] = (arm_func_4args_t) submat_4x4f_neon;
-}
-
-arm_result_t main( int argc, char **argv )
-{
- return run_test( argc, argv ); // defined in "unit_test.h"
-}
+++ /dev/null
-@
-@ Copyright 2011-12 ARM Limited
-@
-@ Licensed under the Apache License, Version 2.0 (the "License");
-@ you may not use this file except in compliance with the License.
-@ You may obtain a copy of the License at
-@
-@ http://www.apache.org/licenses/LICENSE-2.0
-@
-@ Unless required by applicable law or agreed to in writing, software
-@ distributed under the License is distributed on an "AS IS" BASIS,
-@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@ See the License for the specific language governing permissions and
-@ limitations under the License.
-@
-
-@
-@ NE10 Library : source/NE10_transmat.asm.s
-@
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : source/NE10_addmat.c
- */
-
-#include "NE10.h"
-#include "../headers/macros.h"
-#include <math.h>
-
-#include <assert.h>
-
-inline void swap( arm_float_t *a, arm_float_t *b )
-{
- arm_float_t tmp = *a;
- *a = *b;
- *b = tmp;
-}
-
-arm_result_t transmat_2x2f_c(arm_mat2x2f_t * dst, arm_mat2x2f_t * src, unsigned int count)
-{
- NE10_DETMAT_OPERATION_X_C
- (
- dst[ itr ].c1.r1 = src[ itr ].c1.r1;
- dst[ itr ].c1.r2 = src[ itr ].c2.r1;
- dst[ itr ].c2.r1 = src[ itr ].c1.r2;
- dst[ itr ].c2.r2 = src[ itr ].c2.r2;
- );
-}
-
-arm_result_t transmat_3x3f_c(arm_mat3x3f_t * dst, arm_mat3x3f_t * src, unsigned int count)
-{
- NE10_DETMAT_OPERATION_X_C
- (
- dst[ itr ].c1.r1 = src[ itr ].c1.r1;
- dst[ itr ].c1.r2 = src[ itr ].c2.r1;
- dst[ itr ].c1.r3 = src[ itr ].c3.r1;
-
- dst[ itr ].c2.r1 = src[ itr ].c1.r2;
- dst[ itr ].c2.r2 = src[ itr ].c2.r2;
- dst[ itr ].c2.r3 = src[ itr ].c3.r2;
-
- dst[ itr ].c3.r1 = src[ itr ].c1.r3;
- dst[ itr ].c3.r2 = src[ itr ].c2.r3;
- dst[ itr ].c3.r3 = src[ itr ].c3.r3;
- );
-}
-
-arm_result_t transmat_4x4f_c(arm_mat4x4f_t * dst, arm_mat4x4f_t * src, unsigned int count)
-{
- NE10_DETMAT_OPERATION_X_C
- (
- dst[ itr ].c1.r1 = src[ itr ].c1.r1;
- dst[ itr ].c1.r2 = src[ itr ].c2.r1;
- dst[ itr ].c1.r3 = src[ itr ].c3.r1;
- dst[ itr ].c1.r4 = src[ itr ].c4.r1;
-
- dst[ itr ].c2.r1 = src[ itr ].c1.r2;
- dst[ itr ].c2.r2 = src[ itr ].c2.r2;
- dst[ itr ].c2.r3 = src[ itr ].c3.r2;
- dst[ itr ].c2.r4 = src[ itr ].c4.r2;
-
- dst[ itr ].c3.r1 = src[ itr ].c1.r3;
- dst[ itr ].c3.r2 = src[ itr ].c2.r3;
- dst[ itr ].c3.r3 = src[ itr ].c3.r3;
- dst[ itr ].c3.r4 = src[ itr ].c4.r3;
-
- dst[ itr ].c4.r1 = src[ itr ].c1.r4;
- dst[ itr ].c4.r2 = src[ itr ].c2.r4;
- dst[ itr ].c4.r3 = src[ itr ].c3.r4;
- dst[ itr ].c4.r4 = src[ itr ].c4.r4;
- );
-}
+++ /dev/null
-@
-@ Copyright 2011-12 ARM Limited
-@
-@ Licensed under the Apache License, Version 2.0 (the "License");
-@ you may not use this file except in compliance with the License.
-@ You may obtain a copy of the License at
-@
-@ http://www.apache.org/licenses/LICENSE-2.0
-@
-@ Unless required by applicable law or agreed to in writing, software
-@ distributed under the License is distributed on an "AS IS" BASIS,
-@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@ See the License for the specific language governing permissions and
-@ limitations under the License.
-@
-
-@
-@ NE10 Library : source/NE10_transmat.neon.s
-@
-
-
-
-
- .text
- .syntax unified
-
-.include "headers/NE10header.s"
-.include "source/NE10_detmat.neon.inc.s"
-
-
-
-
- .balign 4
- .global transmat_2x2f_neon
- .thumb
- .thumb_func
-
-transmat_2x2f_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t transmat_2x2f(arm_mat2x2f_t * dst,
- @ arm_mat2x2f_t * src,
- @ unsigned int count)
- @
- @ r0: *dst & current dst entry's address
- @ r1: *src1 & current src1 entry's address
- @ r2: int count & the number of items in the input array that can be
- @ processed in chunks of 4 vectors
- @
- @ r3: the number of items that are left to be processed at the end
- @ of the input array
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- and r3, r2, #3 @ r3 = count % 4;
- sub r2, r2, r3 @ count = count - r2; This is what's left to be processed after this loop
-
- cmp r2, #0
- beq .L_check_mat2x2
-
-.L_mainloop_mat2x2:
-
- subs r2, r2, #4
-
- vld4.32 {d16, d18, d20, d22}, [r1]!
- vld4.32 {d17, d19, d21, d23}, [r1]!
-
- vswp q9, q10
-
- vst4.32 {d16, d18, d20, d22}, [r0]!
- vst4.32 {d17, d19, d21, d23}, [r0]!
-
- bgt .L_mainloop_mat2x2 @ loop if r2 > 0, if we have at least another 4 vectors (8 floats) to process
-
-.L_mainloopend_mat2x2:
-
-.L_check_mat2x2:
- @ check if anything left to process at the end of the input array
- cmp r3, #0
- ble .L_return_mat2x2
-
-.L_secondloop_mat2x2:
- @ process the last few items left in the input array
- vld4.32 {d16[0], d18[0], d20[0], d22[0]}, [r1]!
-
- vswp d18, d20
-
- subs r3, r3, #1
-
- vst4.32 {d16[0], d18[0], d20[0], d22[0]}, [r0]!
-
- bgt .L_secondloop_mat2x2
-
-.L_return_mat2x2:
- @ return
- mov r0, #0
- bx lr
-
-
-
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ This macro calculates the inverse of two 3x3 marices
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- .macro STORE_3x3TRNMATS
- @ rearrange the results for use in a "vst3" instruction...
- vtrn.32 q8 , q11
- vtrn.32 q9 , q12
- vtrn.32 q10, q13
-
- vst3.32 { d16 , d18 , d20 }, [r0]!
- vst3.32 { d17[0], d19[0], d21[0]}, [r0]!
- vst3.32 { d22 , d24 , d26 }, [r0]!
- vst3.32 { d23[0], d25[0], d27[0]}, [r0]!
- .endm
-
-
-
-
- .align 2
- .global transmat_3x3f_neon
- .thumb
- .thumb_func
-transmat_3x3f_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t transmat_3x3f(arm_mat3x3f_t * dst,
- @ arm_mat3x3f_t * src1,
- @ arm_mat3x3f_t * src2,
- @ unsigned int count)
- @
- @ r0: *dst & current dst entry's address
- @ r1: *src1 & current src1 entry's address
- @ r2: int count & the number of items in the input array that can be
- @ processed in chunks of 4 vectors
- @
- @ r3: the number of items that are left to be processed at the end
- @ of the input array
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- and r3, r2, #3 @ r2 = count % 4;
- sub r2, r2, r3 @ count = count - r2; This is what's left to be processed after this loop
-
- cmp r2, #0
- beq .L_check_mat3x3
-
-.L_mainloop_mat3x3:
- LOAD_3x3MATS_ARGS d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, q8, q9, q10, q11, q12, q13, r1
-
- subs r2, r2, #2
-
- vswp d20, d17
- vswp d22, d18
- vswp d26, d19
-
- STORE_3x3TRNMATS
-
- bgt .L_mainloop_mat3x3 @ loop if r2 > 0, if we have at least another 4 vectors (12 floats) to process
-
-.L_mainloopend_mat3x3:
-
-.L_check_mat3x3:
- @ check if anything left to process at the end of the input array
- cmp r3, #0
- ble .L_return_mat3x3
-
-.L_secondloop_mat3x3:
- @ process the last few items left in the input array
- @ load the next (e.g. 3rd) set of values
- vld3.32 { d16 , d18 , d20 }, [r1]!
- vld3.32 { d17[0], d19[0], d21[0]}, [r1]!
-
- vtrn.32 q8 , q11
- vtrn.32 q9 , q12
- vtrn.32 q10, q13
-
- subs r3, r3, #1
-
- vswp d20, d17
- vswp d22, d18
- vswp d26, d19
-
-
-
- @ store the result for the last (e.g. 3rd) set
- vtrn.32 q8 , q11
- vtrn.32 q9 , q12
- vtrn.32 q10, q13
-
- vst3.32 { d16 , d18 , d20 }, [r0]!
- vst3.32 { d17[0], d19[0], d21[0]}, [r0]!
-
- bgt .L_secondloop_mat3x3
-
-.L_return_mat3x3:
- @ return
- mov r0, #0
- bx lr
-
-
-
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ This macro calculates the inverse of two 4x4 marices
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- .macro STORE_4x4INVMATS
- @ rearrange the results for use in a "vst3" instruction...
- vtrn.32 q8, q12
- vtrn.32 q9, q13
- vtrn.32 q10, q14
- vtrn.32 q11, q15
-
- vst4.32 { d16 , d18 , d20 , d22 }, [r0]!
- vst4.32 { d17 , d19 , d21 , d23 }, [r0]!
- vst4.32 { d24 , d26 , d28 , d30 }, [r0]!
- vst4.32 { d25 , d27 , d29 , d31 }, [r0]!
- .endm
-
-
-
-
- .align 2
- .global transmat_4x4f_neon
- .thumb
- .thumb_func
-transmat_4x4f_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @
- @ arm_result_t transmat_4x4f(arm_mat4x4f_t * dst,
- @ arm_mat4x4f_t * src1,
- @ arm_mat4x4f_t * src2,
- @ unsigned int count)
- @
- @ r0: *dst & current dst entry's address
- @ r1: *src1 & current src1 entry's address
- @ r2: int count & the number of items in the input array that can be
- @ processed in chunks of 4 vectors
- @
- @ r3: the number of items that are left to be processed at the end
- @ of the input array
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-
- and r3, r2, #3 @ r3 = count % 4;
- sub r2, r2, r3 @ count = count - r3; This is what's left to be processed after this loop
-
- cmp r2, #0
- beq .L_check_mat4x4
-
-.L_mainloop_mat4x4:
-
- LOAD_4x4MATS_ARGS d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31, q8, q9, q10, q11, q12, q13, q14, q15, r1
-
-
- subs r2, r2, #2
-
- vswp d18, d24
- vswp d17, d20
- vswp d22, d25
- vswp d19, d28
- vswp d27, d30
- vswp d23, d29
-
-
- STORE_4x4INVMATS
-
- bgt .L_mainloop_mat4x4 @ loop if r2 > 0, if we have at least another 4 vectors (16 floats) to process
-
-.L_mainloopend_mat4x4:
-
-.L_check_mat4x4:
- @ check if anything left to process at the end of the input array
- cmp r3, #0
- ble .L_return_mat4x4
-
-.L_secondloop_mat4x4:
- @ process the last few items left in the input array
- vld4.32 { d16 , d18 , d20 , d22 }, [r1]!
- vld4.32 { d17 , d19 , d21 , d23 }, [r1]!
-
- vtrn.32 q8, q12
- vtrn.32 q9, q13
- vtrn.32 q10, q14
- vtrn.32 q11, q15
-
- subs r3, r3, #1
-
- vswp d18, d24
- vswp d17, d20
- vswp d22, d25
- vswp d19, d28
- vswp d27, d30
- vswp d23, d29
-
-
- @ store the results
- vtrn.32 q8, q12
- vtrn.32 q9, q13
- vtrn.32 q10, q14
- vtrn.32 q11, q15
-
- vst4.32 { d16 , d18 , d20 , d22 }, [r0]!
- vst4.32 { d17 , d19 , d21 , d23 }, [r0]!
-
-
- bgt .L_secondloop_mat4x4
-
-.L_return_mat4x4:
- @ return
- mov r0, #0
- bx lr
-
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NE10 Library : source/NE10_transmat_test.c
- */
-
-//Make sure the following are defined before including "unit_test.h"
-
-// length of the data arrays
-#define ARRLEN TEST_ARRLEN_MATRICES
-// number of the operations in a given unit
-#define OP_COUNT 3
-// number of the different implementations of each of the functions (C, ASM, NEON, ...)
-#define IMPL_COUNT 3
-
-
-#include "../headers/unit_test_invmat_operation_x.h"
-
-extern arm_result_t transmat_2x2f_c (arm_mat2x2f_t * dst, arm_mat2x2f_t * src, unsigned int count);
-extern arm_result_t transmat_2x2f_neon(arm_mat2x2f_t * dst, arm_mat2x2f_t * src, unsigned int count);
-
-extern arm_result_t transmat_3x3f_c (arm_mat3x3f_t * dst, arm_mat3x3f_t * src, unsigned int count);
-extern arm_result_t transmat_3x3f_neon(arm_mat3x3f_t * dst, arm_mat3x3f_t * src, unsigned int count);
-
-extern arm_result_t transmat_4x4f_c (arm_mat4x4f_t * dst, arm_mat4x4f_t * src, unsigned int count);
-extern arm_result_t transmat_4x4f_neon(arm_mat4x4f_t * dst, arm_mat4x4f_t * src, unsigned int count);
-
-void init_ftbl()
-{
- // manually initialize the global function table with
- // those functions that do have an actual implementation.
- ftbl[ 0] = (arm_func_3args_t) transmat_2x2f_c;
- ftbl[ 1] = (arm_func_3args_t) transmat_2x2f_c; // using the c version in place of the assembly version
- ftbl[ 2] = (arm_func_3args_t) transmat_2x2f_neon;
-
- ftbl[ 3] = (arm_func_3args_t) transmat_3x3f_c;
- ftbl[ 4] = (arm_func_3args_t) transmat_3x3f_c; // using the c version in place of the assembly version
- ftbl[ 5] = (arm_func_3args_t) transmat_3x3f_neon;
-
- ftbl[ 6] = (arm_func_3args_t) transmat_4x4f_c;
- ftbl[ 7] = (arm_func_3args_t) transmat_4x4f_c; // using the c version in place of the assembly version
- ftbl[ 8] = (arm_func_3args_t) transmat_4x4f_neon;
-}
-
-arm_result_t main( int argc, char **argv )
-{
- return run_test( argc, argv ); // defined in "unit_test.h"
-}
+++ /dev/null
-#
-# Copyright 2011-12 ARM Limited
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-# header
-include_directories (
- ../inc
- ../
-)
-
-if(NE10_BUILD_SHARED)
- add_executable(NE10_test_dynamic NE10_test.c)
- target_link_libraries (
- NE10_test_dynamic
- NE10_test
- m
- )
-endif()
-
-if(NE10_BUILD_STATIC)
- add_executable(NE10_test_static NE10_test.c)
- target_link_libraries (
- NE10_test_static
- NE10
- m
- )
-endif()
-
-
+++ /dev/null
-/*
- * Copyright 2011-12 ARM Limited
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "../inc/NE10.h"
-#include "./NE10_init.h"
-
-// This test code shows you how you can statically embed NE10 in your code
-void main()
-{
- printf ( "Going to initialze NE10...\n" );
-
- NE10_init();
-
- printf ( "NE10 has been initialized.\n" );
-
-}
-
--- /dev/null
+#!/bin/sh
+#
+# Copyright 2011-12 ARM Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+#
+# NE10 Library : astyle.sh
+#
+#!/bin/bash
+
+# This script is used to format the source code
+#################################################
+#
+# Usage: astyle.sh [astyle options] <files or wildcard patterns>
+# Runs astyle with the project's fixed formatting options followed by
+# whatever was given on the command line.
+# Exit status: 1 when called without arguments, otherwise astyle's own.
+
+if [ $# -ge 1 ]; then
+    CMD_OPS=$*
+else
+    echo "too less param..."
+    echo " eg: ${0} a.c"
+    echo "-r: recursive process subdirectories."
+    echo " eg: ${0} -r \"\${DIR}/\\*.cpp \${DIR}/\\*.h\""
+    # A missing argument is a usage error: report it through the exit
+    # status so that calling scripts do not mistake it for success.
+    exit 1
+fi
+
+# Formatting options handed to astyle on every run.
+FORMAT_OPS="--style=ansi --options=none -p -d -U -c"
+
+# NOTE(review): both variables are expanded unquoted on purpose, presumably
+# so that one quoted argument holding several patterns (see the -r example
+# above) is split into separate astyle arguments -- confirm before quoting.
+astyle ${FORMAT_OPS} ${CMD_OPS}
--- /dev/null
+#!/bin/sh
+#
+# Copyright 2011-12 ARM Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+#
+# NE10 Library : cleanall.sh
+#
+
+# Prefix shared by the generated directories and release tarballs.
+PRODUCT_NAME=NE10
+
+# Remove build outputs and scratch files from the current directory.
+# -f keeps rm silent when a pattern matches nothing (a clean tree is not an
+# error); the "./" prefix protects against file names that start with "-".
+rm -f ./*.ex ./*.a ./*.o ./*.so
+rm -f ./res_*.txt
+rm -f ./.*.swp
+rm -f ./.exp.tmp
+rm -f ./testlog.txt
+
+# Remove the generated "${PRODUCT_NAME}_*" directories and tarballs that sit
+# directly in the current directory. Letting find run rm itself avoids
+# word-splitting a command substitution, so names containing whitespace are
+# handled, and it also works when the directory has no visible entries.
+find . -maxdepth 1 -type d -name "${PRODUCT_NAME}_*" -exec rm -rf {} +
+rm -rf ./java
+find . -maxdepth 1 -type f -name "${PRODUCT_NAME}_*.tgz" -exec rm -rf {} +
+
+# Unless the caller sets CLS=0, clear the terminal and list what is left.
+if [ "$CLS" != "0" ]; then
+    clear
+    echo
+    ls -la --color=auto
+    echo
+fi
+echo
+
--- /dev/null
+#!/bin/sh
+#
+# Copyright 2011-12 ARM Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+#
+# NE10 Library : getlog.sh
+#
+
+# Prints a snapshot of the build environment: date, system and CPU details,
+# versions of the required tools, git configuration and status, the current
+# user and directory, and selected environment variables.
+# Exits with status 1 as soon as one of the required tools is missing.
+
+# heading TEXT
+# Print TEXT underlined with ANSI escape sequences. printf is used because
+# the handling of "echo -e" is implementation-defined for /bin/sh; some
+# shells print the "-e" literally instead of interpreting the escapes.
+heading() {
+    printf '\033[4m%s\033[0m\n' "$1"
+}
+
+# report_tool NAME
+# Print "NAME:" followed by the tool's --version output joined onto a single
+# line with ';'. Aborts the whole script with status 1 if NAME cannot be
+# found. "command -v" is the POSIX way to look a command up and reports the
+# result through its exit status, unlike comparing the output of "which".
+report_tool() {
+    echo "$1:"
+    if ! command -v "$1" >/dev/null 2>&1; then
+        echo "fatal: '$1' is not installed on this system" 1>&2
+        exit 1
+    fi
+    "$1" --version | paste -s -d ';' -
+}
+
+echo "NE10 NIGHTLY BUILD SCRIPT"
+echo "(C) 2011, ARM Ltd."
+date
+
+echo
+echo
+heading "SYSTEM:"
+uname -a
+cat /proc/cpuinfo
+
+echo
+echo
+heading "INSTALLED TOOLS:"
+report_tool git
+echo
+report_tool gcc
+echo
+report_tool as
+echo
+report_tool ar
+echo
+echo
+report_tool perl
+
+echo
+# The git sections are only printed when run from the top of a git checkout.
+if [ -e .git ]; then
+    echo
+    heading "CURRENT 'git' CONFIGURATION:"
+    git config -l
+fi
+
+echo
+echo
+heading "CURRENT USER AND PATH:"
+echo "$(whoami) @ $(pwd)"
+
+echo
+echo
+heading "ENVIRONMENT VARIABLES:"
+echo
+echo "PATH = " "$PATH"
+echo
+echo "LD_LIBRARY_PATH = " "$LD_LIBRARY_PATH"
+
+
+echo
+if [ -e .git ]; then
+    echo
+    heading "CURRENT GIT/SOURCE STATUS:"
+    git show
+fi
+
+
--- /dev/null
+#!/bin/sh
+#
+# Copyright 2011-12 ARM Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+#
+# NE10 Library : removetabs.sh
+#
+#!/bin/bash
+
+# This script removes tab characters in files and replaces them with
+# the right number of spaces. It also removes trailing whitespaces.
+
+# A literal tab character. It is produced with printf because the $'\t'
+# quoting form is a bash feature and this script runs under #!/bin/sh.
+chtab=$(printf '\t')
+
+# remove trailing whitespaces
+# The list of affected files is collected first, then each file is rewritten
+# through a temporary file and its original permission bits are restored.
+# NOTE(review): the list is split on whitespace, so file names that contain
+# spaces are still not supported.
+LSw=`grep -lsri --exclude="Makefile" --exclude-dir=".git" '\s$' .`;
+for flw in $LSw
+do
+    echo "HAS SPACES: " $flw; # just to see a list of the files that include unwanted trailing whitespace
+    perms=`stat -c '%a' "$flw"`;
+    # strip spaces and tabs at the end of every line; the tab is inserted
+    # literally because "\t" inside a bracket expression is not portable
+    sed "s/[ ${chtab}]*\$//" "$flw" > .exp.tmp;
+    sync;
+    # rename the file to the original file
+    mv .exp.tmp "$flw";
+    chmod "$perms" "$flw";
+    sync;
+done
+
+# remove tabs
+LSt=`grep -lrsi --exclude="Makefile" --exclude-dir=".git" "$chtab" .`;
+for flt in $LSt
+do
+    echo "HAS TABS: " $flt; # just to see a list of the files that include unwanted tabs
+    perms=`stat -c '%a' "$flt"`;
+    # replace every tab with the matching number of spaces
+    expand "$flt" > .exp.tmp;
+    sync;
+    # rename the file to the original file
+    mv .exp.tmp "$flt";
+    chmod "$perms" "$flt";
+    sync;
+done
+
--- /dev/null
+#!/bin/sh
+#
+# Copyright 2011-12 ARM Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+#
+# NE10 Library : review.sh
+#
+
+# Usage: review.sh <branch to review> [parent branch]
+#
+# Publishes a development branch named dev/<user>/<label> for review as
+# staging/<user>/<label>: the staging branch is created from the parent
+# branch (default "master"), pushed, rebased onto the branch under review
+# and pushed again.
+# Exit status: 1 on a usage error, on a branch name that does not have the
+# dev/<user>/<label> form, or when the checkout or the rebase fails.
+
+BRANCH=$1
+
+BASE=${2-"master"}
+
+if [ "$BRANCH" = "" ]; then
+    echo "Usage: review.sh <branch to review> [parent branch]"
+    exit 1
+fi
+
+# Split "dev/<user>/<label>" into its user and label components.
+LABEL=`echo "$BRANCH" | perl -pe '$_ =~ /dev\/([a-zA-Z0-9]+)\/(.+)/;$_=$2'`
+GLUSER=`echo "$BRANCH" | perl -pe '$_ =~ /dev\/([a-zA-Z0-9]+)\/(.+)/;$_=$1'`
+
+# When the name does not match, both captures are empty and the target
+# would become "staging//"; refuse instead of creating such a branch.
+if [ "$LABEL" = "" ] || [ "$GLUSER" = "" ]; then
+    echo "fatal: '$BRANCH' is not of the form dev/<user>/<label>" 1>&2
+    exit 1
+fi
+
+NEWBRANCH="staging/$GLUSER/$LABEL"
+
+echo "Pushing $BRANCH from $BASE for review as $NEWBRANCH"
+
+# Creating and first pushing the staging branch is allowed to fail (for
+# example when it already exists from an earlier review round).
+git branch "$NEWBRANCH" "$BASE"
+git push origin "$NEWBRANCH"
+
+# The rebase must only ever run on the staging branch: if the checkout
+# fails, stop rather than rebase whatever branch is currently checked out.
+git checkout "$NEWBRANCH" || exit 1
+
+# Do not publish a half-finished rebase.
+git rebase "$BRANCH" || exit 1
+git push origin "$NEWBRANCH"
+