From b1a750f68ffa52dc9a817f6d70f2515381ba7ac6 Mon Sep 17 00:00:00 2001 From: Dave Butcher Date: Thu, 27 Oct 2011 16:06:46 +0100 Subject: [PATCH] Fixes against RC1 --- Android.mk | 4 +- Makefile | 18 +++++- NE10_init.c | 171 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ NE10_init.h | 18 ++++++ NE10_test.c | 17 ++++++ cleanall.sh | 3 +- doxy.conf | 2 +- headers/factor.h | 5 +- 8 files changed, 231 insertions(+), 7 deletions(-) create mode 100644 NE10_init.c create mode 100644 NE10_init.h create mode 100644 NE10_test.c mode change 100755 => 100644 cleanall.sh diff --git a/Android.mk b/Android.mk index a252399..50dd810 100644 --- a/Android.mk +++ b/Android.mk @@ -8,11 +8,11 @@ ne10_neon_source := \ source/NE10_addc.neon.c \ source/NE10_normalize.neon.s \ source/NE10_mlac.neon.c \ - source/NE10_abs.neon.c \ + source/NE10_abs.neon.s \ source/NE10_div.neon.c \ source/NE10_add.neon.c \ source/NE10_divc.neon.c \ - source/NE10_mul.neon.c \ + source/NE10_mul.neon.s \ source/NE10_len.neon.s \ source/NE10_sub.neon.c \ source/NE10_rsbc.neon.c \ diff --git a/Makefile b/Makefile index 4629871..1a20ed3 100644 --- a/Makefile +++ b/Makefile @@ -1,3 +1,4 @@ +# COPYRIGHT NOTICE TBD NOT FOR RELEASE C_TOOL = gcc EXE_TOOL = gcc @@ -13,15 +14,29 @@ OPTIMIZE_FLAGS = -O3 LDFLAGS+=-L. 
-L/usr/local/lib -L/client/lib -L/lib/arm-linux-gnueabi LDFLAGS+=-lm +ALLFILES = NE10_addc.c_r.o NE10_subc.c_r.o NE10_rsbc.c_r.o NE10_mulc.c_r.o NE10_divc.c_r.o NE10_mlac.c_r.o NE10_setc.c_r.o NE10_add.c_r.o NE10_sub.c_r.o NE10_mul.c_r.o NE10_div.c_r.o NE10_mla.c_r.o NE10_abs.c_r.o NE10_len.c_r.o NE10_normalize.c_r.o NE10_addc.neon_r.o NE10_subc.neon_r.o NE10_rsbc.neon_r.o NE10_mulc.neon_r.o NE10_divc.neon_r.o NE10_mlac.neon_r.o NE10_setc.neon_r.o NE10_add.neon_r.o NE10_sub.neon_r.o NE10_mul.neon_r.o NE10_div.neon_r.o NE10_mla.neon_r.o NE10_abs.neon_r.o NE10_len.neon_r.o NE10_normalize.neon_r.o + #TARGET_ARCH = stdc .PHONY: all clean -all : NE10_addc.test_r.ex +all: NE10_test_static.ex NE10_test_dynamic.ex clean: ./cleanall.sh +NE10_test_static.ex : libNE10.a NE10_init.h NE10_test.c + $(EXE_TOOL) $(OPTIMIZE_FLAGS) $(ARM_FLAGS) ./NE10_init.c ./NE10_test.c -o $@ -l:libNE10.a $(C_FLAGS) -L/lib/arm-linux-gnueabi + +NE10_test_dynamic.ex : libNE10.so NE10_init.h NE10_test.c + $(EXE_TOOL) $(OPTIMIZE_FLAGS) $(ARM_FLAGS) ./NE10_init.c ./NE10_test.c -o $@ -l:libNE10.so $(C_FLAGS) -L/lib/arm-linux-gnueabi + +libNE10.a : $(ALLFILES) NE10_init.h NE10_init.c + ar rcs libNE10.a $(ALLFILES) + +libNE10.so : $(ALLFILES) NE10_init.h NE10_init.c + gcc -shared -o $@ $(ALLFILES) + %.test_r.ex : %.asm_r.o %.c_r.o %.neon_r.o ./source/%_test.c ./inc/NE10.h $(EXE_TOOL) $(OPTIMIZE_FLAGS) $(ARM_FLAGS) $^ -o $@ $(C_FLAGS) -L/lib/arm-linux-gnueabi @@ -40,4 +55,3 @@ clean: # Rules for the C version %.neon_r.o : ./source/%.neon.c ./inc/NE10.h $(C_TOOL) $(OPTIMIZE_FLAGS) $(ARM_FLAGS) -mfpu=neon -c $< -o $@ $(C_FLAGS) - diff --git a/NE10_init.c b/NE10_init.c new file mode 100644 index 0000000..1e2d560 --- /dev/null +++ b/NE10_init.c @@ -0,0 +1,171 @@ +// COPYRIGHT NOTICE TBD NOT FOR RELEASE + +#include "NE10.h" + +#include + +#define CPUINFO_BUFFER_SIZE (1024*4) + +// This local variable indicates whether or not the running platform supports ARM NEON +arm_result_t is_NEON_available = NE10_ERR; + 
+// Returns NE10_OK once NE10_init() has detected NEON; NE10_ERR otherwise, including before NE10_init() is called.
+arm_result_t NE10_HasNEON()
+{
+    return is_NEON_available;
+}
+
+// Binds every NE10 function pointer to its NEON implementation when /proc/cpuinfo mentions "neon"
+// (matched case-insensitively), otherwise to the plain C one, and returns NE10_OK. Returns NE10_ERR,
+// leaving the pointers untouched, if /proc/cpuinfo cannot be opened, reads as empty, or fills the buffer.
+arm_result_t NE10_init()
+{
+    char cpuinfo[CPUINFO_BUFFER_SIZE];  // Contents of /proc/cpuinfo
+    size_t bytes = 0;                   // Number of bytes read from the file
+    size_t i;                           // Index into cpuinfo
+    FILE* infofile = fopen( "/proc/cpuinfo", "r" );
+    // Zero-fill first so that any read shorter than the buffer stays NUL-terminated.
+    memset( cpuinfo, 0, CPUINFO_BUFFER_SIZE );
+    if ( NULL != infofile ) {  // fopen() returns NULL on failure; never pass that to fread()/fclose()
+        bytes = fread( cpuinfo, 1, sizeof(cpuinfo), infofile );
+        fclose( infofile );
+    }
+    // A completely full buffer leaves no room for the terminating NUL: reject it like an empty read.
+    if ( 0 == bytes || CPUINFO_BUFFER_SIZE == bytes ) {
+        fprintf( stderr, "ERROR: Couldn't read the file \"/proc/cpuinfo\". NE10_init() failed.\n");
+        return NE10_ERR;
+    }
+    // Lower-case in place; the index advances in its own expression and tolower() gets an unsigned char value.
+    for ( i = 0; '\0' != cpuinfo[i]; i++ ) {
+        cpuinfo[i] = (char)tolower( (unsigned char)cpuinfo[i] );
+    }
+    if ( NULL != strstr( cpuinfo, "neon" ) ) { is_NEON_available = NE10_OK; }
+
+    if ( NE10_OK == NE10_HasNEON() ) {
+        addc_float = addc_float_neon;
+        addc_vec2f = addc_vec2f_neon;
+        addc_vec3f = addc_vec3f_neon;
+        addc_vec4f = addc_vec4f_neon;
+        subc_float = subc_float_neon;
+        subc_vec2f = subc_vec2f_neon;
+        subc_vec3f = subc_vec3f_neon;
+        subc_vec4f = subc_vec4f_neon;
+        rsbc_float = rsbc_float_neon;
+        rsbc_vec2f = rsbc_vec2f_neon;
+        rsbc_vec3f = rsbc_vec3f_neon;
+        rsbc_vec4f = rsbc_vec4f_neon;
+        mulc_float = mulc_float_neon;
+        mulc_vec2f = mulc_vec2f_neon;
+        mulc_vec3f = mulc_vec3f_neon;
+        mulc_vec4f = mulc_vec4f_neon;
+        divc_float = divc_float_neon;
+        divc_vec2f = divc_vec2f_neon;
+        divc_vec3f = divc_vec3f_neon;
+        divc_vec4f = divc_vec4f_neon;
+        setc_float = setc_float_neon;
+        setc_vec2f = setc_vec2f_neon;
+        setc_vec3f = setc_vec3f_neon;
+        setc_vec4f = setc_vec4f_neon;
+        mlac_float = mlac_float_neon;
+        mlac_vec2f = mlac_vec2f_neon;
+        mlac_vec3f = mlac_vec3f_neon;
+        mlac_vec4f = mlac_vec4f_neon;
+        add_float = add_float_neon;
+        sub_float = sub_float_neon;
+        mul_float = mul_float_neon;
+        div_float = div_float_neon;
+        mla_float = mla_float_neon;
+        abs_float = abs_float_neon;
+        len_vec2f = len_vec2f_neon;
+        len_vec3f = len_vec3f_neon;
+        len_vec4f = len_vec4f_neon;
+        normalize_vec2f = normalize_vec2f_neon;
+        normalize_vec3f = normalize_vec3f_neon;
+        normalize_vec4f = normalize_vec4f_neon;
+    } else {
+        addc_float = addc_float_c;
+        addc_vec2f = addc_vec2f_c;
+        addc_vec3f = addc_vec3f_c;
+        addc_vec4f = addc_vec4f_c;
+        subc_float = subc_float_c;
+        subc_vec2f = subc_vec2f_c;
+        subc_vec3f = subc_vec3f_c;
+        subc_vec4f = subc_vec4f_c;
+        rsbc_float = rsbc_float_c;
+        rsbc_vec2f = rsbc_vec2f_c;
+        rsbc_vec3f = rsbc_vec3f_c;
+        rsbc_vec4f = rsbc_vec4f_c;
+        mulc_float = mulc_float_c;
+        mulc_vec2f = mulc_vec2f_c;
+        mulc_vec3f = mulc_vec3f_c;
+        mulc_vec4f = mulc_vec4f_c;
+        divc_float = divc_float_c;
+        divc_vec2f = divc_vec2f_c;
+        divc_vec3f = divc_vec3f_c;
+        divc_vec4f = divc_vec4f_c;
+        setc_float = setc_float_c;
+        setc_vec2f = setc_vec2f_c;
+        setc_vec3f = setc_vec3f_c;
+        setc_vec4f = setc_vec4f_c;
+        mlac_float = mlac_float_c;
+        mlac_vec2f = mlac_vec2f_c;
+        mlac_vec3f = mlac_vec3f_c;
+        mlac_vec4f = mlac_vec4f_c;
+        add_float = add_float_c;
+        sub_float = sub_float_c;
+        mul_float = mul_float_c;
+        div_float = div_float_c;
+        mla_float = mla_float_c;
+        abs_float = abs_float_c;
+        len_vec2f = len_vec2f_c;
+        len_vec3f = len_vec3f_c;
+        len_vec4f = len_vec4f_c;
+        normalize_vec2f = normalize_vec2f_c;
+        normalize_vec3f = normalize_vec3f_c;
+        normalize_vec4f = normalize_vec4f_c;
+    }
+    return NE10_OK;  // Every pointer is bound at this point.
+}
+
+// These are actual definitions of our function pointers that are declared in inc/NE10.h
+arm_result_t (*addc_float)(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count);
+arm_result_t (*addc_vec2f)(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count);
+arm_result_t (*addc_vec3f)(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count);
+arm_result_t (*addc_vec4f)(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count);
+arm_result_t (*subc_float)(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count);
+arm_result_t (*subc_vec2f)(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count); +arm_result_t (*subc_vec3f)(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count); +arm_result_t (*subc_vec4f)(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count); +arm_result_t (*rsbc_float)(arm_float_t * dst, arm_float_t *src, const arm_float_t cst, unsigned int count); +arm_result_t (*rsbc_vec2f)(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count); +arm_result_t (*rsbc_vec3f)(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count); +arm_result_t (*rsbc_vec4f)(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count); +arm_result_t (*mulc_float)(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count); +arm_result_t (*mulc_vec2f)(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count); +arm_result_t (*mulc_vec3f)(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count); +arm_result_t (*mulc_vec4f)(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count); +arm_result_t (*divc_float)(arm_float_t * dst, arm_float_t * src, const arm_float_t cst, unsigned int count); +arm_result_t (*divc_vec2f)(arm_vec2f_t * dst, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count); +arm_result_t (*divc_vec3f)(arm_vec3f_t * dst, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count); +arm_result_t (*divc_vec4f)(arm_vec4f_t * dst, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count); +arm_result_t (*setc_float)(arm_float_t * dst, const arm_float_t cst, unsigned int count); +arm_result_t (*setc_vec2f)(arm_vec2f_t * dst, const arm_vec2f_t * cst, unsigned int count); +arm_result_t (*setc_vec3f)(arm_vec3f_t * dst, const arm_vec3f_t * cst, unsigned int count); +arm_result_t (*setc_vec4f)(arm_vec4f_t * dst, 
const arm_vec4f_t * cst, unsigned int count); +arm_result_t (*mlac_float)(arm_float_t * dst, arm_float_t * acc, arm_float_t * src, const arm_float_t cst, unsigned int count); +arm_result_t (*mlac_vec2f)(arm_vec2f_t * dst, arm_vec2f_t * acc, arm_vec2f_t * src, const arm_vec2f_t * cst, unsigned int count); +arm_result_t (*mlac_vec3f)(arm_vec3f_t * dst, arm_vec3f_t * acc, arm_vec3f_t * src, const arm_vec3f_t * cst, unsigned int count); +arm_result_t (*mlac_vec4f)(arm_vec4f_t * dst, arm_vec4f_t * acc, arm_vec4f_t * src, const arm_vec4f_t * cst, unsigned int count); +arm_result_t (*add_float)(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count); +arm_result_t (*sub_float)(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count); +arm_result_t (*mul_float)(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count); +arm_result_t (*div_float)(arm_float_t * dst, arm_float_t * src1, arm_float_t * src2, unsigned int count); +arm_result_t (*mla_float)(arm_float_t * dst, arm_float_t * acc, arm_float_t * src1, arm_float_t * src2, unsigned int count); +arm_result_t (*abs_float)(arm_float_t * dst, arm_float_t * src, unsigned int count); +arm_result_t (*len_vec2f)(arm_float_t * dst, arm_vec2f_t * src, unsigned int count); +arm_result_t (*len_vec3f)(arm_float_t * dst, arm_vec3f_t * src, unsigned int count); +arm_result_t (*len_vec4f)(arm_float_t * dst, arm_vec4f_t * src, unsigned int count); +arm_result_t (*normalize_vec2f)(arm_vec2f_t * dst, arm_vec2f_t * src, unsigned int count); +arm_result_t (*normalize_vec3f)(arm_vec3f_t * dst, arm_vec3f_t * src, unsigned int count); +arm_result_t (*normalize_vec4f)(arm_vec4f_t * dst, arm_vec4f_t * src, unsigned int count); + diff --git a/NE10_init.h b/NE10_init.h new file mode 100644 index 0000000..7f807aa --- /dev/null +++ b/NE10_init.h @@ -0,0 +1,18 @@ +// COPYRIGHT NOTICE TBD NOT FOR RELEASE + +#include + +#ifndef NE10_init_H +#define NE10_init_H + +/*! 
+ This routine returns NE10_OK if the running platform supports NEON, otherwise it returns NE10_ERR + */ +extern arm_result_t NE10_HasNEON(); + +/*! + This routine initializes all the function pointers defined in "NE10.h" with pointers to ARM NEON or ARM VFP implementations. + */ +extern arm_result_t NE10_init(); + +#endif diff --git a/NE10_test.c b/NE10_test.c new file mode 100644 index 0000000..827e7e7 --- /dev/null +++ b/NE10_test.c @@ -0,0 +1,17 @@ +// COPYRIGHT NOTICE TBD NOT FOR RELEASE + +#include "./inc/NE10.h" +#include "./NE10_init.h" + +// This test code shows you how you can statically embed NE10 in your code: it calls NE10_init() and exits with status 0 + +int main(void) +{ + printf ( "Going to initialze NE10...\n" ); + + NE10_init(); + + printf ( "NE10 has been initialized.\n" ); + return 0; +} + diff --git a/cleanall.sh b/cleanall.sh old mode 100755 new mode 100644 index fb72c51..722e758 --- a/cleanall.sh +++ b/cleanall.sh @@ -21,12 +21,13 @@ PRODUCT_NAME=NE10 -rm *.ex *.a *.o +rm *.ex *.a *.o *.so rm res_*.txt rm .*.swp rm .exp.tmp rm testlog.txt for dir in `find * -maxdepth 0 -type d -name "${PRODUCT_NAME}_*"`; do rm -rf $dir; done; +rm -rf ./java for fl in `find * -maxdepth 0 -type f -name "${PRODUCT_NAME}_*.tgz"`; do rm -rf $fl; done; if [ "$CLS" != "0" ]; then clear diff --git a/doxy.conf b/doxy.conf index 90f4cd9..a19f46c 100644 --- a/doxy.conf +++ b/doxy.conf @@ -818,7 +818,7 @@ HTML_HEADER = # each generated HTML page. If it is left blank doxygen will generate a # standard footer. -HTML_FOOTER = copyright_notice +HTML_FOOTER = # The HTML_STYLESHEET tag can be used to specify a user-defined cascading # style sheet that is used by each HTML page. 
It can be used to diff --git a/headers/factor.h b/headers/factor.h index 046f148..e72bddb 100644 --- a/headers/factor.h +++ b/headers/factor.h @@ -19,8 +19,11 @@ */ // Typebuilding MACROs +// - Slight difference between toolchain versions on intrinsics #define FLOAT32_2x3(x1,y1,x2,y2,x3,y3) \ - { x1,y1,x2,y2,x3,y3 } + {{ \ + {x1, y1}, {x2,y2}, {x3,y3} \ + }} // Unit test use this macro to index into their function table // "opc" stands for operation's code (which function), -- 2.7.4