+*.obj
+*.lib
+*.dll
+*.def
*.o
lapack-3.1.1
lapack-3.1.1.tgz
*.so
*.a
+.svn
*~
config.h
Makefile.conf
OpenBLAS ChangeLog
====================================================================
-Version 0.1 alpha2(in development)
+Version 0.1 alpha2
+23-Jun-2011
common:
- *
+ * Fixed blasint undefined bug in <cblas.h> file. Other software
+ could include this header successfully(Refs issue #13 on github)
+ * Fixed the SEGFAULT bug on 64 cores. On SMP server, the number
+ of CPUs or cores should be less than or equal to 64.(Refs issue #14
+ on github)
+ * Support "void goto_set_num_threads(int num_threads)" and "void
+ openblas_set_num_threads(int num_threads)" when USE_OPENMP=1
+ * Added extern "C" to support C++. Thanks to Tasio for the patch (Refs
+ issue #21 on github)
+ * Provided an error message when the arch is not supported.(Refs
+ issue #19 on github)
+ * Fixed issue #23. Fixed a bug of f_check script about generating link flags.
+ * Added openblas_set_num_threads for Fortran.
+ * Fixed #25 a wrong result of rotmg.
+ * Fixed a bug about detecting underscore prefix in c_check.
+ * Print the wall time (cycles) when FUNCTION_PROFILE is enabled
+ * Fixed #35 a build bug with NO_LAPACK=1 & DYNAMIC_ARCH=1
+ * Added install target. You can use "make install". (Refs #20)
+
+
x86/x86_64:
- *
+ * Fixed #28 a wrong result of dsdot on x86_64.
+ * Fixed #32 a SEGFAULT bug of zdotc with gcc-4.6.
+ * Fixed #33 ztrmm bug on Nehalem.
+ * Worked around #27, the low performance axpy issue with small input size & multithreads.
+
MIPS64:
- *
+ * Fixed #28 a wrong result of dsdot on Loongson3A/MIPS64.
+ * Optimized single/double precision BLAS Level3 on Loongson3A/MIPS64. (Refs #2)
+ * Optimized single/double precision axpy function on Loongson3A/MIPS64. (Refs #3)
+
====================================================================
Version 0.1 alpha1
20-Mar-2011
BLASDIRS += reference
endif
+ifndef PREFIX
+PREFIX = /opt/OpenBLAS
+endif
+
SUBDIRS = $(BLASDIRS)
ifneq ($(NO_LAPACK), 1)
SUBDIRS += lapack
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench
-.PHONY : all libs netlib test ctest shared
-.NOTPARALLEL : all libs prof lapack-test
+.PHONY : all libs netlib test ctest shared install
+.NOTPARALLEL : all libs prof lapack-test install
all :: libs netlib tests shared
@echo
endif
ifeq ($(OSNAME), WINNT)
$(MAKE) -C exports dll
-# -ln -fs $(LIBDLLNAME) libopenblas.dll
+ -ln -fs $(LIBDLLNAME) libopenblas.dll
endif
ifeq ($(OSNAME), CYGWIN_NT)
$(MAKE) -C exports dll
endif
libs :
+ifeq ($(CORE), UNKOWN)
+ $(error OpenBLAS: Detecting CPU failed. Please set TARGET explicitly, e.g. make TARGET=your_cpu_target. Please read README for the detail.)
+endif
-ln -fs $(LIBNAME) libopenblas.$(LIBSUFFIX)
for d in $(SUBDIRS) ; \
do if test -d $$d; then \
$(MAKE) -C $$d $(@F) || exit 1 ; \
fi; \
done
+#Save the config files for installation
+ cp Makefile.conf Makefile.conf_last
+ cp config.h config_last.h
ifdef DYNAMIC_ARCH
$(MAKE) -C kernel commonlibs || exit 1
for d in $(DYNAMIC_CORE) ; \
do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\
done
+ echo DYNAMIC_ARCH=1 >> Makefile.conf_last
endif
+ touch lib.grd
prof : prof_blas prof_lapack
dummy :
+# Installation is delegated to a separate makefile: re-invoke make on
+# Makefile.install and build its own "install" target.
+install :
+ $(MAKE) -f Makefile.install install
+
clean ::
@for d in $(SUBDIRS_ALL) ; \
do if test -d $$d; then \
$(MAKE) -C $$d $(@F) || exit 1 ; \
fi; \
done
-ifdef DYNAMIC_ARCH
+#ifdef DYNAMIC_ARCH
@$(MAKE) -C kernel clean
-endif
+#endif
@rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf libopenblas.$(LIBSUFFIX) libopenblas_p.$(LIBSUFFIX) *.lnk myconfig.h
@rm -f Makefile.conf config.h Makefile_kernel.conf config_kernel.h st* *.dylib
@if test -d lapack-3.1.1; then \
echo deleting lapack-3.1.1; \
rm -rf lapack-3.1.1 ;\
fi
+ @rm -f *.grd Makefile.conf_last config_last.h
@echo Done.
\ No newline at end of file
--- /dev/null
+# Setup for the install target defined below.
+TOPDIR = .
+# Exported so that sub-makes and included makefiles see it as already set
+# (presumably lets Makefile.system skip its one-time detection step -- TODO confirm).
+export GOTOBLAS_MAKEFILE = 1
+# The leading '-' makes this include optional: a missing Makefile.conf_last
+# is silently ignored instead of aborting the parse.
+-include $(TOPDIR)/Makefile.conf_last
+include ./Makefile.system
+
+.PHONY : install
+.NOTPARALLEL : install
+
+# Guard rule: its recipe only runs when no file named lib.grd exists, and it
+# then stops make with an error telling the user to build first.
+lib.grd :
+ $(error OpenBLAS: Please run "make" firstly)
+
+# Install the generated headers and the built libraries into $(PREFIX).
+# lib.grd is a prerequisite, so the lib.grd rule rejects an install that is
+# attempted before that file exists.
+install : lib.grd
+# "mkdir -p" already succeeds when the directory exists, so a failure here is
+# a real error (e.g. no permission) and must stop the install.
+ @mkdir -p $(PREFIX)
+ @echo Generating openblas_config.h in $(PREFIX)
+#for inc
+ @echo \#ifndef OPENBLAS_CONFIG_H > $(PREFIX)/openblas_config.h
+ @echo \#define OPENBLAS_CONFIG_H >> $(PREFIX)/openblas_config.h
+ @cat config_last.h >> $(PREFIX)/openblas_config.h
+ @echo \#define VERSION \" OpenBLAS $(VERSION) \" >> $(PREFIX)/openblas_config.h
+ @cat openblas_config_template.h >> $(PREFIX)/openblas_config.h
+ @echo \#endif >> $(PREFIX)/openblas_config.h
+
+ @echo Generating f77blas.h in $(PREFIX)
+ @echo \#ifndef OPENBLAS_F77BLAS_H > $(PREFIX)/f77blas.h
+ @echo \#define OPENBLAS_F77BLAS_H >> $(PREFIX)/f77blas.h
+ @echo \#include \"openblas_config.h\" >> $(PREFIX)/f77blas.h
+ @cat common_interface.h >> $(PREFIX)/f77blas.h
+ @echo \#endif >> $(PREFIX)/f77blas.h
+
+ @echo Generating cblas.h in $(PREFIX)
+ @sed 's/common/openblas_config/g' cblas.h > $(PREFIX)/cblas.h
+
+#for install static library
+ @echo Copy the static library to $(PREFIX)
+ @cp $(LIBNAME) $(PREFIX)
+# Symlink targets are given relative to the link's own directory: a target
+# written as $(PREFIX)/name would dangle whenever PREFIX is a relative path.
+ @-ln -fs $(LIBNAME) $(PREFIX)/libopenblas.$(LIBSUFFIX)
+#for install shared library
+ @echo Copy the shared library to $(PREFIX)
+# The shared library is optional, hence the '-' prefixes: a missing file
+# must not fail the install.
+ifneq ($(filter Linux FreeBSD NetBSD, $(OSNAME)),)
+ -cp $(LIBSONAME) $(PREFIX)
+ -ln -fs $(LIBSONAME) $(PREFIX)/libopenblas.so
+endif
+ifeq ($(OSNAME), Darwin)
+ -cp $(LIBDYNNAME) $(PREFIX)
+ -ln -fs $(LIBDYNNAME) $(PREFIX)/libopenblas.dylib
+endif
+ifneq ($(filter WINNT CYGWIN_NT, $(OSNAME)),)
+ -cp $(LIBDLLNAME) $(PREFIX)
+ -ln -fs $(LIBDLLNAME) $(PREFIX)/libopenblas.dll
+endif
+
+ @echo Install OK!
+
# SANITY_CHECK to compare the result with reference BLAS.
# UTEST_CHECK = 1
+# The installation directory.
+# PREFIX = /opt/OpenBLAS
+
# Common Optimization Flag; -O2 is enough.
# DEBUG = 1
GETARCH_FLAGS += -DFORCE_$(TARGET)
endif
+ifdef INTERFACE64
+GETARCH_FLAGS += -DUSE64BITINT
+endif
+
# This operation is expensive, so execution should be once.
ifndef GOTOBLAS_MAKEFILE
export GOTOBLAS_MAKEFILE = 1
CCOMMON_OPT += -wd981
endif
-ifdef USE_OPENMP
+ifeq ($(USE_OPENMP), 1)
ifeq ($(C_COMPILER), GCC)
CCOMMON_OPT += -fopenmp
endif
ifdef BINARY64
ifdef INTERFACE64
-CCOMMON_OPT += -DUSE64BITINT
+CCOMMON_OPT +=
+#-DUSE64BITINT
endif
endif
CCOMMON_OPT += -DDYNAMIC_ARCH
endif
+ifeq ($(NO_LAPACK), 1)
+CCOMMON_OPT += -DNO_LAPACK
+endif
+
ifdef SMP
CCOMMON_OPT += -DSMP_SERVER
Or,
check out codes from git://github.com/xianyi/OpenBLAS.git
1)Normal compile
-Please read GotoBLAS_02QuickInstall.txt or type "make"
+ (a) type "make" to detect the CPU automatically.
+ or
+ (b) type "make TARGET=xxx" to set target CPU, e.g. "make TARGET=NEHALEM". The full target list is in file TargetList.txt.
2)Cross compile
Please set CC and FC with the cross toolchains. Then, set HOSTCC with your host C compiler. At last, set TARGET explicitly.
3)Debug version
make DEBUG=1
+4)Install to the directory (Optional)
+e.g.
+make install PREFIX=your_installation_directory
+The default directory is /opt/OpenBLAS
+
3.Support CPU & OS
Please read GotoBLAS_01Readme.txt
or
export OMP_NUM_THREADS=4
-The priorities are OPENBLAS_NUM_THREAD > GOTO_NUM_THREADS > OMP_NUM_THREADS.
+The priorities are OPENBLAS_NUM_THREADS > GOTO_NUM_THREADS > OMP_NUM_THREADS.
+
+If you compile this lib with USE_OPENMP=1, you should only set OMP_NUM_THREADS environment variable.
4.2 Set the number of threads with calling functions. for example,
void goto_set_num_threads(int num_threads);
or
void openblas_set_num_threads(int num_threads);
+If you compile this lib with USE_OPENMP=1, you should use the above functions, too.
+
5.Report Bugs
Please add a issue in https://github.com/xianyi/OpenBLAS/issues
OpenBLAS users mailing list: http://list.rdcps.ac.cn/mailman/listinfo/openblas
8.ChangeLog
-Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD version.
\ No newline at end of file
+Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD version.
+
+9.Known Issues
+* The number of CPUs/Cores should be less than or equal to 8*sizeof(unsigned long). On 64 bits, the limit
+ is 64. On 32 bits, it is 32.
+* This library is not compatible with EKOPath Compiler Suite 4.0.10 (http://www.pathscale.com/ekopath-compiler-suite). However, Path64 (https://github.com/path64/compiler) could compile the codes successfully.
+
+10. Specification of Git Branches
+We used the git branching model in this article (http://nvie.com/posts/a-successful-git-branching-model/).
+Now, there are 4 branches in github.com.
+ * The master branch. This is a main branch that reflects a production-ready state.
+ * The develop branch. This is a main branch that reflects the latest delivered development changes for the next release.
+ * The loongson3a branch. This is a feature branch. We develop Loongson3A codes on this branch. We will merge this feature to develop branch in future.
+ * The gh-pages branch. This is for web pages
--- /dev/null
+Force Target Examples:
+
+make TARGET=NEHALEM
+make TARGET=LOONGSON3A BINARY=64
+make TARGET=ISTANBUL
+
+Supported List:
+1.X86/X86_64
+a)Intel CPU:
+P2
+COPPERMINE
+KATMAI
+NORTHWOOD
+PRESCOTT
+BANIAS
+YONAH
+CORE2
+PENRYN
+DUNNINGTON
+NEHALEM
+ATOM
+
+b)AMD CPU:
+ATHLON
+OPTERON
+OPTERON_SSE3
+BARCELONA
+SHANGHAI
+ISTANBUL
+
+c)VIA CPU:
+SSE_GENERIC
+VIAC3
+NANO
+
+2.Power CPU:
+POWER4
+POWER5
+POWER6
+PPCG4
+PPC970
+PPC970MP
+PPC440
+PPC440FP2
+CELL
+
+3.MIPS64 CPU:
+SICORTEX
+LOONGSON3A
+
+4.IA64 CPU:
+ITANIUM2
+
+5.SPARC CPU:
+SPARC
+SPARCV7
+
$data = `$compiler_name -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`;
-$data =~ /globl\ ([_\.]*)(.*)/;
+$data =~ /globl\s([_\.]*)(.*)/;
$need_fu = $1;
#ifndef CBLAS_H
#define CBLAS_H
+#ifdef __cplusplus
+extern "C" {
+ /* Assume C declarations for C++ */
+#endif /* __cplusplus */
+
+#include <stddef.h>
+#include "common.h"
+
#define CBLAS_INDEX size_t
enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102};
double *alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc);
void cblas_xerbla(blasint p, char *rout, char *form, ...);
+
+#ifdef __cplusplus
+}
+
+#endif /* __cplusplus */
+
#endif
#ifndef COMMON_H
#define COMMON_H
+#ifdef __cplusplus
+extern "C" {
+ /* Assume C declarations for C++ */
+#endif /* __cplusplus */
+
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#define PRINT_DEBUG_NAME if (readenv("GOTO_DEBUG")) fprintf(stderr, "GotoBLAS : %s\n", CHAR_NAME)
#endif
+#ifdef __cplusplus
+}
+
+#endif /* __cplusplus */
+
#endif
double _Complex BLASFUNC_REF(zdotu) (blasint *, double *, blasint *, double *, blasint *);
double _Complex BLASFUNC_REF(zdotc) (blasint *, double *, blasint *, double *, blasint *);
+void BLASFUNC_REF(drotmg)(double *, double *, double *, double *, double *);
+
+double BLASFUNC_REF(dsdot)(blasint *, float *, blasint *, float *, blasint*);
+
#endif
case 13:
return CORE_DUNNINGTON;
}
- break;
- case 2:
- switch (model) {
- case 5:
- //Intel Core (Clarkdale) / Core (Arrandale)
- // Pentium (Clarkdale) / Pentium Mobile (Arrandale)
- // Xeon (Clarkdale), 32nm
- return CORE_NEHALEM;
- case 12:
- //Xeon Processor 5600 (Westmere-EP)
- return CORE_NEHALEM;
- }
- break;
-
+ break;
+ case 2:
+ switch (model) {
+ case 5:
+ //Intel Core (Clarkdale) / Core (Arrandale)
+ // Pentium (Clarkdale) / Pentium Mobile (Arrandale)
+ // Xeon (Clarkdale), 32nm
+ return CORE_NEHALEM;
+ case 12:
+ //Xeon Processor 5600 (Westmere-EP)
+ return CORE_NEHALEM;
+ }
+ break;
}
+ break;
+
case 15:
- if (model <= 0x2) return CORE_NORTHWOOD;
- return CORE_PRESCOTT;
+ if (model <= 0x2) return CORE_NORTHWOOD;
+ else return CORE_PRESCOTT;
}
}
COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX)
ifdef SMP
-COMMONOBJS += blas_server.$(SUFFIX) divtable.$(SUFFIX) blasL1thread.$(SUFFIX)
+COMMONOBJS += blas_server.$(SUFFIX) divtable.$(SUFFIX) blasL1thread.$(SUFFIX) openblas_set_num_threads.$(SUFFIX)
ifndef NO_AFFINITY
COMMONOBJS += init.$(SUFFIX)
endif
blas_server.$(SUFFIX) : $(BLAS_SERVER) ../../common.h ../../common_thread.h ../../param.h
$(CC) $(CFLAGS) -c $< -o $(@F)
+openblas_set_num_threads.$(SUFFIX) : openblas_set_num_threads.c
+ $(CC) $(CFLAGS) -c $< -o $(@F)
+
blasL1thread.$(SUFFIX) : blas_l1_thread.c ../../common.h ../../common_thread.h
$(CC) $(CFLAGS) -c $< -o $(@F)
#include <stdio.h>
#include <stdlib.h>
-#include <sys/mman.h>
+//#include <sys/mman.h>
#include "common.h"
#ifndef USE_OPENMP
int blas_server_avail = 0;
+/* Set the thread count used for subsequent calls.
+   - a request below 1 falls back to the current blas_num_threads;
+   - the result is capped at MAX_CPU_NUMBER;
+   - blas_num_threads is raised to the new value but never lowered;
+   - the value is stored in blas_cpu_number and passed to omp_set_num_threads(). */
+void goto_set_num_threads(int num_threads) {
+
+ if (num_threads <= 0) {
+ num_threads = blas_num_threads;
+ }
+
+ if (MAX_CPU_NUMBER < num_threads) {
+ num_threads = MAX_CPU_NUMBER;
+ }
+
+ if (blas_num_threads < num_threads) {
+ blas_num_threads = num_threads;
+ }
+
+ blas_cpu_number = num_threads;
+ omp_set_num_threads(blas_cpu_number);
+
+}
+/* Second entry point with the same effect as goto_set_num_threads():
+   num_threads is forwarded unchanged. */
+void openblas_set_num_threads(int num_threads) {
+
+ goto_set_num_threads(num_threads);
+}
+
int blas_thread_init(void){
blas_get_cpu_number();
return count;
}
+/***
+ Known issue: The number of CPUs/cores should be less than
+ or equal to 8*sizeof(unsigned long). On 64 bits, the limit
+ is 64. On 32 bits, it is 32.
+***/
static inline unsigned long get_cpumap(int node) {
int infile;
unsigned long affinity;
char name[160];
+ char cpumap[160];
char *p, *dummy;
-
+ int i=0;
+
sprintf(name, CPUMAP_NAME, node);
infile = open(name, O_RDONLY);
if (infile != -1) {
- read(infile, name, sizeof(name));
-
+ read(infile, cpumap, sizeof(cpumap));
+ p = cpumap;
+ while (*p != '\n' && i<160){
+ if(*p != ',') {
+ name[i++]=*p;
+ }
+ p++;
+ }
p = name;
- while ((*p == '0') || (*p == ',')) p++;
+ // while ((*p == '0') || (*p == ',')) p++;
- affinity = strtol(p, &dummy, 16);
+ affinity = strtoul(p, &dummy, 16);
close(infile);
}
unsigned long share;
int cpu;
- common -> avail = (1UL << common -> num_procs) - 1;
+ if(common->num_procs > 64){
+ fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->num_procs);
+ exit(1);
+ }else if(common->num_procs == 64){
+ common -> avail = 0xFFFFFFFFFFFFFFFFUL;
+ }else
+ common -> avail = (1UL << common -> num_procs) - 1;
#ifdef DEBUG
fprintf(stderr, "\nAvail CPUs : %04lx.\n", common -> avail);
fprintf(stderr, "CPU mask : %04lx.\n\n", *(unsigned long *)&cpu_orig_mask[0]);
#endif
- lprocmask = (1UL << common -> final_num_procs) - 1;
+ if(common->final_num_procs > 64){
+ fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->final_num_procs);
+ exit(1);
+ }else if(common->final_num_procs == 64){
+ lprocmask = 0xFFFFFFFFFFFFFFFFUL;
+ }else
+ lprocmask = (1UL << common -> final_num_procs) - 1;
#ifndef USE_OPENMP
lprocmask &= *(unsigned long *)&cpu_orig_mask[0];
--- /dev/null
+/*****************************************************************************
+Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ 1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ 3. Neither the name of the ISCAS nor the names of its contributors may
+ be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "common.h"
+
+/* Everything below is compiled only when both SMP_SERVER and OS_LINUX are defined. */
+#ifdef SMP_SERVER
+#ifdef OS_LINUX
+
+extern void openblas_set_num_threads(int num_threads) ;
+
+/* Pointer-argument wrapper: dereferences num_threads and forwards the value
+   to openblas_set_num_threads(). NAME is a macro supplied from outside this
+   file (presumably the Fortran-visible symbol name -- TODO confirm). */
+void NAME(int* num_threads){
+ openblas_set_num_threads(*num_threads);
+}
+
+#endif
+#endif
if (cycles > 0) {
fprintf(stderr, "\n\t====== BLAS Profiling Result =======\n\n");
- fprintf(stderr, " Function No. of Calls Time Consumption Efficiency Bytes/cycle\n");
+ fprintf(stderr, " Function No. of Calls Time Consumption Efficiency Bytes/cycle Wall Time(Cycles)\n");
for (i = 0; i < MAX_PROF_TABLE; i ++) {
if (function_profile_table[i].calls) {
#ifndef OS_WINDOWS
- fprintf(stderr, "%-12s : %10Ld %8.2f%% %10.3f%% %8.2f\n",
+ fprintf(stderr, "%-12s : %10Ld %8.2f%% %10.3f%% %8.2f %Ld\n",
#else
- fprintf(stderr, "%-12s : %10lld %8.2f%% %10.3f%% %8.2f\n",
+ fprintf(stderr, "%-12s : %10lld %8.2f%% %10.3f%% %8.2f %lld\n",
#endif
func_table[i],
function_profile_table[i].calls,
(double)function_profile_table[i].cycles / (double)cycles * 100.,
(double)function_profile_table[i].fops / (double)function_profile_table[i].tcycles * 100.,
- (double)function_profile_table[i].area / (double)function_profile_table[i].cycles
+ (double)function_profile_table[i].area / (double)function_profile_table[i].cycles,
+ function_profile_table[i].cycles
);
}
}
zip : dll
zip $(LIBZIPNAME) $(LIBDLLNAME) $(LIBNAME)
-dll : libgoto2.dll
+dll : ../$(LIBDLLNAME)
+#libgoto2.dll
dll2 : libgoto2_shared.dll
-libgoto2.dll : ../$(LIBNAME) libgoto2.def dllinit.$(SUFFIX)
+../$(LIBDLLNAME) : ../$(LIBNAME) libgoto2.def dllinit.$(SUFFIX)
$(RANLIB) ../$(LIBNAME)
ifeq ($(BINARY32), 1)
- $(DLLWRAP) -o $(@F) --def libgoto2.def \
+ $(DLLWRAP) -o ../$(LIBDLLNAME) --def libgoto2.def \
--entry _dllinit@12 -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(FEXTRALIB)
-lib /machine:i386 /def:libgoto2.def
else
- $(DLLWRAP) -o $(@F) --def libgoto2.def \
+ $(DLLWRAP) -o ../$(LIBDLLNAME) --def libgoto2.def \
--entry _dllinit -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(FEXTRALIB)
-lib /machine:X64 /def:libgoto2.def
endif
perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) > $(@F)
$(LIBDYNNAME) : ../$(LIBNAME) osx.def
- $(PREFIX)gcc $(CFLAGS) -all_load -dynamiclib -o $(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
+ $(PREFIX)gcc $(CFLAGS) -all_load -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
symbol.$(SUFFIX) : symbol.S
$(CC) $(CFLAGS) -c -o $(@F) $^
&& ($flags !~ /kernel32/)
&& ($flags !~ /advapi32/)
&& ($flags !~ /shell32/)
+ && ($flags !~ /^\-l$/)
) {
$linker_l .= $flags . " ";
}
#ifndef POWER
#define POWER
#endif
+#define OPENBLAS_SUPPORTED
#endif
#if defined(__i386__) || (__x86_64__)
#include "cpuid_x86.c"
+#define OPENBLAS_SUPPORTED
#endif
#ifdef __ia64__
#include "cpuid_ia64.c"
+#define OPENBLAS_SUPPORTED
#endif
#ifdef __alpha
#include "cpuid_alpha.c"
+#define OPENBLAS_SUPPORTED
#endif
#ifdef POWER
#include "cpuid_power.c"
+#define OPENBLAS_SUPPORTED
#endif
#ifdef sparc
#include "cpuid_sparc.c"
+#define OPENBLAS_SUPPORTED
#endif
#ifdef __mips__
#include "cpuid_mips.c"
+#define OPENBLAS_SUPPORTED
+#endif
+
+#ifndef OPENBLAS_SUPPORTED
+#error "This arch/CPU is not supported by OpenBLAS."
#endif
#else
printf("#define DLOCAL_BUFFER_SIZE\t%ld\n", (DGEMM_DEFAULT_Q * DGEMM_DEFAULT_UNROLL_N * 2 * 1 * sizeof(double)));
printf("#define CLOCAL_BUFFER_SIZE\t%ld\n", (CGEMM_DEFAULT_Q * CGEMM_DEFAULT_UNROLL_N * 4 * 2 * sizeof(float)));
printf("#define ZLOCAL_BUFFER_SIZE\t%ld\n", (ZGEMM_DEFAULT_Q * ZGEMM_DEFAULT_UNROLL_N * 2 * 2 * sizeof(double)));
+
+#ifdef USE64BITINT
+ printf("#define USE64BITINT\n");
+#endif
}
return 0;
//In that case, the threads would be dependent.
if (incx == 0 || incy == 0)
nthreads = 1;
-
+
+ //Temporarily work around the low performance issue with small input size & multithreads.
+ if (n <= 10000)
+ nthreads = 1;
+
if (nthreads == 1) {
#endif
BLASLONG n = *N;
BLASLONG incx = *INCX;
BLASLONG incy = *INCY;
+ double ret = 0.0;
PRINT_DEBUG_NAME;
if (incx < 0) x -= (n - 1) * incx;
if (incy < 0) y -= (n - 1) * incy;
- return DSDOT_K(n, x, incx, y, incy);
+ ret=DSDOT_K(n, x, incx, y, incy);
FUNCTION_PROFILE_END(1, n, n);
IDEBUG_END;
- return 0;
+ return ret;
}
#else
double CNAME(blasint n, float *x, blasint incx, float *y, blasint incy){
+
+ double ret = 0.0;
PRINT_DEBUG_CNAME;
if (incx < 0) x -= (n - 1) * incx;
if (incy < 0) y -= (n - 1) * incy;
- return DSDOT_K(n, x, incx, y, incy);
+ ret=DSDOT_K(n, x, incx, y, incy);
FUNCTION_PROFILE_END(1, n, n);
IDEBUG_END;
- return 0;
+ return ret;
}
#define GAMSQ 16777216.e0
#define RGAMSQ 5.9604645e-8
+#ifdef DOUBLE
+#define ABS(x) fabs(x)
+#else
+#define ABS(x) fabsf(x)
+#endif
+
#ifndef CBLAS
void NAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT *DY1, FLOAT *dparam){
dq2 = dp2 * dy1;
dq1 = dp1 * *dx1;
- if (! (abs(dq1) > abs(dq2))) goto L40;
+ if (! (ABS(dq1) > ABS(dq2))) goto L40;
dh21 = -(dy1) / *dx1;
dh12 = dp2 / dp1;
goto L130;
L160:
- if (! (abs(*dd2) <= RGAMSQ)) {
+ if (! (ABS(*dd2) <= RGAMSQ)) {
goto L190;
}
if (*dd2 == ZERO) {
goto L160;
L190:
- if (! (abs(*dd2) >= GAMSQ)) {
+ if (! (ABS(*dd2) >= GAMSQ)) {
goto L220;
}
igo = 3;
CCOMMON_OPT += -DTS=$(TSUFFIX)
endif
+KERNEL_INTERFACE = ../common_level1.h ../common_level2.h ../common_level3.h
+ifneq ($(NO_LAPACK), 1)
+KERNEL_INTERFACE += ../common_lapack.h
+endif
+
ifeq ($(ARCH), x86)
COMMONOBJS += cpuid.$(SUFFIX)
endif
setparam$(TSUFFIX).c : setparam-ref.c
sed 's/TS/$(TSUFFIX)/g' $< > $(@F)
-kernel$(TSUFFIX).h : ../common_level1.h ../common_level2.h ../common_level3.h ../common_lapack.h
+kernel$(TSUFFIX).h : $(KERNEL_INTERFACE)
sed 's/\ *(/$(TSUFFIX)(/g' $^ > $(@F)
+
cpuid.$(SUFFIX): $(KERNELDIR)/cpuid.S
$(CC) -c $(CFLAGS) $< -o $(@F)
cpuid.$(PSUFFIX): $(KERNELDIR)/cpuid.S
$(CC) -c $(PFLAGS) $< -o $(@F)
-ifdef DYNAMIC_ARCH
+#ifdef DYNAMIC_ARCH
clean ::
@rm -f setparam_*.c kernel_*.h setparam.h kernel.h
-endif
+#endif
include $(TOPDIR)/Makefile.tail
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@
$(KDIR)dsdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)dsdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SDOTKERNEL)
- $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@
+ $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DDSDOT $< -o $@
$(KDIR)sdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)sdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SDOTKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@
.align 3
.L999:
- j $31
ADD s1, s1, s2
-
+#ifdef DSDOT
+ cvt.d.s s1, s1
+#endif
+ j $31
+ NOP
+
EPILOGUE
#endif
ssymm_outcopyTS, ssymm_oltcopyTS,
+#ifndef NO_LAPACK
sneg_tcopyTS, slaswp_ncopyTS,
+#else
+ NULL,NULL,
+#endif
0, 0, 0,
DGEMM_DEFAULT_UNROLL_M, DGEMM_DEFAULT_UNROLL_N, MAX(DGEMM_DEFAULT_UNROLL_M, DGEMM_DEFAULT_UNROLL_N),
#endif
dsymm_outcopyTS, dsymm_oltcopyTS,
+#ifndef NO_LAPACK
dneg_tcopyTS, dlaswp_ncopyTS,
+#else
+ NULL, NULL,
+#endif
#ifdef EXPRECISION
#endif
qsymm_outcopyTS, qsymm_oltcopyTS,
+#ifndef NO_LAPACK
qneg_tcopyTS, qlaswp_ncopyTS,
+#else
+ NULL, NULL,
+#endif
#endif
chemm3m_oucopyrTS, chemm3m_olcopyrTS,
chemm3m_oucopyiTS, chemm3m_olcopyiTS,
+#ifndef NO_LAPACK
cneg_tcopyTS, claswp_ncopyTS,
+#else
+ NULL, NULL,
+#endif
0, 0, 0,
ZGEMM_DEFAULT_UNROLL_M, ZGEMM_DEFAULT_UNROLL_N, MAX(ZGEMM_DEFAULT_UNROLL_M, ZGEMM_DEFAULT_UNROLL_N),
zhemm3m_oucopyrTS, zhemm3m_olcopyrTS,
zhemm3m_oucopyiTS, zhemm3m_olcopyiTS,
+#ifndef NO_LAPACK
zneg_tcopyTS, zlaswp_ncopyTS,
+#else
+ NULL, NULL,
+#endif
#ifdef EXPRECISION
xhemm3m_oucopyrTS, xhemm3m_olcopyrTS,
xhemm3m_oucopyiTS, xhemm3m_olcopyiTS,
+#ifndef NO_LAPACK
xneg_tcopyTS, xlaswp_ncopyTS,
+#else
+ NULL, NULL,
+#endif
#endif
popl %ebx
popl %esi
popl %edi
+/*remove the hidden return value address from the stack.*/
+ popl %ecx
+ xchgl %ecx, 0(%esp)
ret
EPILOGUE
haddps %xmm0, %xmm0
#endif
+#ifdef DSDOT
+ cvtss2sd %xmm0, %xmm0
+#endif
+
RESTOREREGISTERS
ret
jg .L11
#if defined(TRMMKERNEL) && !defined(LEFT)
- addq $1, KK
+ addq $4, KK
#endif
leaq (C, LDC, 4), C
jg .L11
#if defined(TRMMKERNEL) && !defined(LEFT)
- addq $1, KK
+ addq $4, KK
#endif
leaq (C, LDC, 4), C
--- /dev/null
+/*This is only for "make install" target.*/
+
+/* BLASFUNC(FUNC) expands to the bare routine name, with a trailing
+   underscore appended when NEEDBUNDERSCORE is defined. */
+#ifdef NEEDBUNDERSCORE
+#define BLASFUNC(FUNC) FUNC##_
+#else
+#define BLASFUNC(FUNC) FUNC
+#endif
+
+/* BLASLONG / BLASULONG: long long when both OS_WINDOWS and __64BIT__ are
+   defined, plain long otherwise. */
+#if defined(OS_WINDOWS) && defined(__64BIT__)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+/* blasint: BLASLONG when USE64BITINT is defined, int otherwise. */
+#ifdef USE64BITINT
+typedef BLASLONG blasint;
+#else
+typedef int blasint;
+#endif
ZBLASOBJS = $(ZBLAS1OBJS) $(ZBLAS2OBJS) $(ZBLAS3OBJS)
XBLASOBJS = $(XBLAS1OBJS) $(XBLAS2OBJS) $(XBLAS3OBJS)
+ifneq ($(NO_LAPACK), 1)
+
SBLASOBJS += \
sgetf2f.$(SUFFIX) sgetrff.$(SUFFIX) slauu2f.$(SUFFIX) slauumf.$(SUFFIX) \
spotf2f.$(SUFFIX) spotrff.$(SUFFIX) strti2f.$(SUFFIX) strtrif.$(SUFFIX) \
xpotf2f.$(SUFFIX) xpotrff.$(SUFFIX) xtrti2f.$(SUFFIX) xtrtrif.$(SUFFIX) \
xlaswpf.$(SUFFIX) xgetrsf.$(SUFFIX) xgesvf.$(SUFFIX) xpotrif.$(SUFFIX) \
+endif
include $(TOPDIR)/Makefile.tail
TARGET=openblas_utest
CUNIT_LIB=/usr/local/lib/libcunit.a
-OBJS=main.o test_rot.o test_swap.o test_axpy.o test_dotu.o
+OBJS=main.o test_rot.o test_swap.o test_axpy.o test_dotu.o test_rotmg.o test_dsdot.o
all : run_test
$(TARGET): $(OBJS)
- $(CC) -o $@ $^ ../$(LIBNAME) $(CUNIT_LIB) $(EXTRALIB)
+ $(FC) -o $@ $^ ../$(LIBNAME) $(CUNIT_LIB) $(EXTRALIB)
run_test: $(TARGET)
./$(TARGET)
void test_zdotu_n_1(void);
void test_zdotu_offset_1(void);
+void test_drotmg(void);
+
+void test_dsdot_n_1(void);
+
#endif
{"Testing zdotu with n == 1",test_zdotu_n_1},
{"Testing zdotu with input x & y offset == 1",test_zdotu_offset_1},
-
+
+ {"Testing drotmg",test_drotmg},
+
+ {"Testing dsdot with n == 1",test_dsdot_n_1},
CU_TEST_INFO_NULL,
};
--- /dev/null
+/*****************************************************************************
+Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ 1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ 3. Neither the name of the ISCAS nor the names of its contributors may
+ be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "common_utest.h"
+
+/* Call dsdot through BLASFUNC and through BLASFUNC_REF on the same
+   one-element vectors (n == 1, both increments 1) and require the two
+   results to agree within CHECK_EPS. */
+void test_dsdot_n_1()
+{
+ int n=1, incx=1, incy=1;
+ float x= 0.172555164;
+ float y= -0.0138700781;
+ /* Local names are kept: they are passed as expressions to the assertion macro. */
+ double res1, res2;
+
+ res1=BLASFUNC(dsdot)(&n, &x, &incx, &y, &incy);
+ res2=BLASFUNC_REF(dsdot)(&n, &x, &incx, &y, &incy);
+
+ CU_ASSERT_DOUBLE_EQUAL(res1, res2, CHECK_EPS);
+
+}
--- /dev/null
+/*****************************************************************************
+Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ 1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ 3. Neither the name of the ISCAS nor the names of its contributors may
+ be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "common_utest.h"
+
+/* Call drotmg through BLASFUNC and through BLASFUNC_REF on identical
+   inputs, then require the four in/out scalars and all five param entries
+   to agree within CHECK_EPS. */
+void test_drotmg()
+{
+ /* te_* values go to the BLASFUNC call, tr_* values to the BLASFUNC_REF call. */
+ double te_d1=0.21149573940783739, tr_d1=0.21149573940783739;
+ double te_d2=0.046892057172954082, tr_d2=0.046892057172954082;
+ double te_x1=-0.42272687517106533, tr_x1=-0.42272687517106533;
+ double te_y1=0.42211309121921659, tr_y1=0.42211309121921659;
+ double te_param[5],tr_param[5];
+ int i;
+
+ //OpenBLAS
+ BLASFUNC(drotmg)(&te_d1, &te_d2, &te_x1, &te_y1, te_param);
+ //reference
+ BLASFUNC_REF(drotmg)(&tr_d1, &tr_d2, &tr_x1, &tr_y1, tr_param);
+
+ CU_ASSERT_DOUBLE_EQUAL(te_d1, tr_d1, CHECK_EPS);
+ CU_ASSERT_DOUBLE_EQUAL(te_d2, tr_d2, CHECK_EPS);
+ CU_ASSERT_DOUBLE_EQUAL(te_x1, tr_x1, CHECK_EPS);
+ CU_ASSERT_DOUBLE_EQUAL(te_y1, tr_y1, CHECK_EPS);
+
+ for(i=0; i!=5; ++i){
+ CU_ASSERT_DOUBLE_EQUAL(te_param[i], tr_param[i], CHECK_EPS);
+ }
+}