1 /*********************************************************************/
2 /* Copyright 2009, 2010 The University of Texas at Austin. */
3 /* All rights reserved. */
5 /* Redistribution and use in source and binary forms, with or */
6 /* without modification, are permitted provided that the following */
7 /* conditions are met: */
9 /* 1. Redistributions of source code must retain the above */
10 /* copyright notice, this list of conditions and the following */
13 /* 2. Redistributions in binary form must reproduce the above */
14 /* copyright notice, this list of conditions and the following */
15 /* disclaimer in the documentation and/or other materials */
16 /* provided with the distribution. */
18 /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
19 /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
20 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
21 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
22 /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
23 /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
24 /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
25 /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
26 /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
27 /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
28 /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
29 /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
30 /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
31 /* POSSIBILITY OF SUCH DAMAGE. */
33 /* The views and conclusions contained in the software and */
34 /* documentation are those of the authors and should not be */
35 /* interpreted as representing official policies, either expressed */
36 /* or implied, of The University of Texas at Austin. */
37 /*********************************************************************/
44 #ifndef MAP_WRITECOMBINED
45 #define MAP_WRITECOMBINED 0x10000
53 #include <ia64intrin.h>
59 static __inline void blas_lock(volatile unsigned long *address){
64 while (*address) {YIELDING;};
66 __asm__ __volatile__ ("mov ar.ccv=r0\n;;\n"
67 "cmpxchg4.acq %0=[%2],%1,ar.ccv\n"
68 : "=r"(ret) : "r"(1), "r"(address)
69 : "ar.ccv", "memory");
72 #define BLAS_LOCK_DEFINED
74 static __inline unsigned long rpcc(void) {
77 __asm__ __volatile__ ("mov %0=ar.itc" : "=r"(clocks));
83 static __inline unsigned long stmxcsr(void){
86 __asm__ __volatile__ ("mov.m %0=ar.fpsr" : "=r" (fp));
91 static __inline void ldmxcsr(unsigned long fp) {
93 __asm__ __volatile__ ("mov.m ar.fpsr=%0" :: "r" (fp));
/*
 * GET_IMAGE(res): copy floating-point register f9 into `res` with a single
 * inline-asm move ("=f" binds `res` to an FP register; memory is listed as
 * clobbered).
 * Spelled __asm__, like every other inline-asm statement in this file, so the
 * macro still compiles when the plain `asm` keyword is disabled (strict ISO
 * modes such as -std=c99 / -std=c11).
 */
#define GET_IMAGE(res) __asm__ __volatile__("mov %0 = f9" : "=f"(res) : : "memory")
101 static __inline void blas_lock(volatile unsigned long *address){
102 while (*address || _InterlockedCompareExchange((volatile int *) address,1,0))
105 #define BLAS_LOCK_DEFINED
107 static __inline unsigned int rpcc(void) {
108 return __getReg(_IA64_REG_AR_ITC);
112 static __inline unsigned int stmxcsr(void) {
113 return __getReg(_IA64_REG_AR_FPSR);
116 static __inline void ldmxcsr(unsigned long fp) {
118 return __setReg(_IA64_REG_AR_FPSR, fp);
123 #define GET_IMAGE(res) __stfd(&res, 9)
125 #define GET_IMAGE(res) __stfs(&res, 9)
130 #define GET_IMAGE_CANCEL
132 #ifdef ENABLE_SSE_EXCEPTION
134 #define IDEBUG_START \
136 unsigned long fp_sse_mode, new_fp_mode; \
137 fp_sse_mode = stmxcsr();\
138 new_fp_mode = (fp_sse_mode & ~(FE_UNDERFLOW | FE_OVERFLOW | FE_UNNORMAL | FE_INVALID));\
139 ldmxcsr(new_fp_mode);
142 ldmxcsr(fp_sse_mode); \
153 extern unsigned long blas_quick_divide_table[];
156 static __inline long blas_quickdivide(unsigned long int x, unsigned long int y){
159 if (y <= 1) return x;
161 __asm__ __volatile__("setf.sig f6 = %1\n\t"
162 "ldf8 f7 = [%2];;\n\t"
163 "xmpy.hu f6= f6, f7;;\n\t"
164 "getf.sig %0 = f6;;\n"
166 : "r"(x), "r"(&blas_quick_divide_table[y]) : "f6", "f7"
172 /* Using Intel Compiler */
173 static __inline long blas_quickdivide(unsigned long int x, unsigned long int y){
174 if (y <= 1) return x;
175 return _m64_xmahu(x, blas_quick_divide_table[y], 0);
181 extern unsigned int blas_quick_divide_table[];
183 static __inline int blas_quickdivide(unsigned int x, unsigned int y){
184 if (y <= 1) return x;
185 return (int)((x * (unsigned long)blas_quick_divide_table[y]) >> 32);
/* Generic GEMM copy/kernel names bound to the d- (real) and z- (complex)
   prefixed routines. */
194 #define GEMM_NCOPY dgemm_ncopy
195 #define GEMM_TCOPY dgemm_tcopy
196 #define ZGEMM_NCOPY zgemm_ncopy
197 #define ZGEMM_TCOPY zgemm_tcopy
198 #define GEMM_KERNEL dgemm_kernel
/* ZGEMM_KERNEL is bound to the _n, _l, _r or _b zgemm kernel depending on
   which two-letter mode macro is defined.
   NOTE(review): the letters presumably encode the transpose/conjugate mode of
   the two operands -- confirm with the kernel sources. */
200 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
201 #define ZGEMM_KERNEL zgemm_kernel_n
203 #if defined(CN) || defined(CT) || defined(RN) || defined(RT)
204 #define ZGEMM_KERNEL zgemm_kernel_l
206 #if defined(NC) || defined(TC) || defined(NR) || defined(TR)
207 #define ZGEMM_KERNEL zgemm_kernel_r
209 #if defined(CC) || defined(CR) || defined(RC) || defined(RR)
210 #define ZGEMM_KERNEL zgemm_kernel_b
/* Generic GEMM copy/kernel names bound to the s- (real) and c- (complex)
   prefixed routines. */
214 #define GEMM_NCOPY sgemm_ncopy
215 #define GEMM_TCOPY sgemm_tcopy
216 #define ZGEMM_NCOPY cgemm_ncopy
217 #define ZGEMM_TCOPY cgemm_tcopy
218 #define GEMM_KERNEL sgemm_kernel
/* ZGEMM_KERNEL is bound to the _n, _l, _r or _b cgemm kernel depending on
   which two-letter mode macro is defined.
   NOTE(review): the letters presumably encode the transpose/conjugate mode of
   the two operands -- confirm with the kernel sources. */
220 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
221 #define ZGEMM_KERNEL cgemm_kernel_n
223 #if defined(CN) || defined(CT) || defined(RN) || defined(RT)
224 #define ZGEMM_KERNEL cgemm_kernel_l
226 #if defined(NC) || defined(TC) || defined(NR) || defined(TR)
227 #define ZGEMM_KERNEL cgemm_kernel_r
229 #if defined(CC) || defined(CR) || defined(RC) || defined(RR)
230 #define ZGEMM_KERNEL cgemm_kernel_b
/*
 * Integer-compare aliases bound to the full-width `cmp` forms.
 * Each alias keeps its own relation (ge / ne / eq); CMP4NE previously
 * expanded to cmp.ge, which tests the wrong relation and disagreed with the
 * cmp4.ne definition used by the cmp4 set of the same aliases.
 */
#define CMP4GE cmp.ge
#define CMP4NE cmp.ne
#define CMP4EQ cmp.eq
/* Integer-compare aliases bound to the `cmp4` forms (ge / ne / eq).
   NOTE(review): the guarding #if/#else is not visible here; presumably this
   is the set used when the integer type is 4 bytes wide -- confirm. */
245 #define CMP4GE cmp4.ge
246 #define CMP4NE cmp4.ne
247 #define CMP4EQ cmp4.eq
250 #define HALT mov r0 = 0
/*
 * Floating-point load/store aliases bound to the ldfe / ldfpe / stfe
 * mnemonics.  The suffix of each alias (T1, NT1, NT2, NTA) is carried over as
 * the instruction completer (.t1, .nt1, .nt2, .nta).
 * NOTE(review): the guarding #if is not visible here; the `e` forms suggest
 * the extended-precision build -- confirm.
 */
257 #define LDFD_T1 ldfe.t1
258 #define LDFD_NT1 ldfe.nt1
259 #define LDFD_NT2 ldfe.nt2
260 #define LDFD_NTA ldfe.nta
261 #define LDFPD_NT1 ldfpe.nt1
262 #define LDFPD_NT2 ldfpe.nt2
263 #define LDFPD_NTA ldfpe.nta
265 #define STFD_NTA stfe.nta
274 #elif defined(DOUBLE)
/*
 * DOUBLE build: the LDF8 and STF8 aliases map to ldf8 / stf8, and the LDFD, LDFPD and STFD
 * aliases map to ldfd / ldfpd / stfd.  The alias suffix (T1, NT1, NT2, NTA)
 * is carried over as the instruction completer.
 */
278 #define LDF8_NT1 ldf8.nt1
279 #define LDF8_NTA ldf8.nta
281 #define STF8_NTA stf8.nta
284 #define LDFD_T1 ldfd.t1
285 #define LDFD_NT1 ldfd.nt1
286 #define LDFD_NT2 ldfd.nt2
287 #define LDFD_NTA ldfd.nta
288 #define LDFPD_NT1 ldfpd.nt1
289 #define LDFPD_NT2 ldfpd.nt2
290 #define LDFPD_NTA ldfpd.nta
292 #define STFD_NTA stfd.nta
/*
 * Here every load alias (LDF8, LDFD) maps to ldfs, the paired load (LDFPD)
 * maps to ldfps, and both store aliases (STF8, STFD) map to stfs.  The alias
 * suffix (T1, NT1, NT2, NTA) is carried over as the instruction completer.
 * NOTE(review): the guarding directive is not visible here; presumably this
 * is the fallback branch for the single-precision build -- confirm.
 */
305 #define LDF8_NT1 ldfs.nt1
306 #define LDF8_NTA ldfs.nta
308 #define STF8_NTA stfs.nta
311 #define LDFD_T1 ldfs.t1
312 #define LDFD_NT1 ldfs.nt1
313 #define LDFD_NT2 ldfs.nt2
314 #define LDFD_NTA ldfs.nta
315 #define LDFPD_NT1 ldfps.nt1
316 #define LDFPD_NT2 ldfps.nt2
317 #define LDFPD_NTA ldfps.nta
319 #define STFD_NTA stfs.nta
341 #define REALNAME ASMNAME
343 #define REALNAME ASMFNAME
/*
 * Pick the return-convention macro from the Fortran interface in use:
 * G77, G95 and INTEL define RETURN_BY_STACK; GFORT defines RETURN_BY_REGS.
 * NOTE(review): presumably this governs how functions with complex results
 * hand their value back to Fortran callers -- confirm where the macros are
 * consumed.
 */
346 #ifdef F_INTERFACE_G77
347 #define RETURN_BY_STACK
350 #ifdef F_INTERFACE_G95
351 #define RETURN_BY_STACK
354 #ifdef F_INTERFACE_GFORT
355 #define RETURN_BY_REGS
358 #ifdef F_INTERFACE_INTEL
359 #define RETURN_BY_STACK
378 alloc out0 = ar.pfs, 8, 0, 4, 0; \
381 addl out3 = @ltoff(.LP0), r1;;; \
382 br.call.sptk.many b0 = _mcount;;
/* On Linux/ELF, GNUSTACK expands to an assembler directive that opens an
   empty .note.GNU-stack section (no flags, @progbits).
   NOTE(review): GNU toolchains conventionally read that section as "this
   object does not need an executable stack" -- confirm for the target
   linker. */
387 #if defined(__linux__) && defined(__ELF__)
388 #define GNUSTACK .section .note.GNU-stack,"",@progbits
397 #define START_ADDRESS 0x20000fc800000000UL
402 #ifdef CONFIG_IA64_PAGE_SIZE_4KB
406 #ifdef CONFIG_IA64_PAGE_SIZE_8KB
/* Size of one buffer: 128 << 20 bytes (128 MiB). */
411 #define BUFFER_SIZE (128 << 20)
/* Page size: 16 << 10 bytes (16 KiB), as an unsigned long constant. */
414 #define PAGESIZE (16UL << 10)
/* Huge-page size: 4 << 20 bytes (4 MiB). */
416 #define HUGE_PAGESIZE ( 4 << 20)
/* BASE_ADDRESS lies BUFFER_SIZE * MAX_CPU_NUMBER bytes below START_ADDRESS.
   BUFFER_SIZE is cast to BLASULONG before the multiplication so the product
   is computed in that type rather than in int. */
418 #define BASE_ADDRESS (START_ADDRESS - (BLASULONG)BUFFER_SIZE * MAX_CPU_NUMBER)