1 /*********************************************************************/
2 /* Copyright 2009, 2010 The University of Texas at Austin. */
3 /* All rights reserved. */
5 /* Redistribution and use in source and binary forms, with or */
6 /* without modification, are permitted provided that the following */
7 /* conditions are met: */
9 /* 1. Redistributions of source code must retain the above */
10 /* copyright notice, this list of conditions and the following */
13 /* 2. Redistributions in binary form must reproduce the above */
14 /* copyright notice, this list of conditions and the following */
15 /* disclaimer in the documentation and/or other materials */
16 /* provided with the distribution. */
18 /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
19 /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
20 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
21 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
22 /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
23 /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
24 /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
25 /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
26 /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
27 /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
28 /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
29 /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
30 /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
31 /* POSSIBILITY OF SUCH DAMAGE. */
33 /* The views and conclusions contained in the software and */
34 /* documentation are those of the authors and should not be */
35 /* interpreted as representing official policies, either expressed */
36 /* or implied, of The University of Texas at Austin. */
37 /*********************************************************************/
/* Fallback definition of MAP_WRITECOMBINED (0x10000), used only when the
 * macro is not already defined.
 * NOTE(review): the matching #endif is not visible in this excerpt. */
44 #ifndef MAP_WRITECOMBINED
45 #define MAP_WRITECOMBINED 0x10000
52 #include <ia64intrin.h>
/*
 * blas_lock (inline-asm variant): acquire the lock word at *address.
 *
 * The visible code polls *address, executing YIELDING while it is non-zero,
 * then issues cmpxchg4.acq on [address] with ar.ccv loaded from r0 and the
 * constant 1 as the replacement value. The value returned by the
 * instruction is placed in `ret`.
 *
 * NOTE(review): the declaration of `ret`, any enclosing retry loop and the
 * closing brace are not visible in this excerpt; confirm against the full
 * file.
 * NOTE(review): cmpxchg4 is the 4-byte form while the parameter type is
 * `volatile unsigned long *`; verify that the width is intended.
 */
58 static __inline void blas_lock(volatile unsigned long *address){
63 while (*address) {YIELDING;};
65 __asm__ __volatile__ ("mov ar.ccv=r0\n;;\n"
66 "cmpxchg4.acq %0=[%2],%1,ar.ccv\n"
67 : "=r"(ret) : "r"(1), "r"(address)
68 : "ar.ccv", "memory");
/* Marks that this header supplied a blas_lock implementation. */
71 #define BLAS_LOCK_DEFINED
/*
 * rpcc (inline-asm variant): copies the ar.itc application register into
 * the local `clocks`.
 * NOTE(review): the declaration of `clocks`, the return statement and the
 * closing brace are not visible in this excerpt.
 */
73 static __inline unsigned long rpcc(void) {
76 __asm__ __volatile__ ("mov %0=ar.itc" : "=r"(clocks));
/*
 * stmxcsr (inline-asm variant): copies the ar.fpsr application register
 * into the local `fp`.
 * NOTE(review): the declaration of `fp`, the return statement and the
 * closing brace are not visible in this excerpt.
 */
82 static __inline unsigned long stmxcsr(void){
85 __asm__ __volatile__ ("mov.m %0=ar.fpsr" : "=r" (fp));
/*
 * ldmxcsr (inline-asm variant): writes `fp` into the ar.fpsr application
 * register.
 * NOTE(review): the closing brace is not visible in this excerpt.
 */
90 static __inline void ldmxcsr(unsigned long fp) {
92 __asm__ __volatile__ ("mov.m ar.fpsr=%0" :: "r" (fp));
/*
 * GET_IMAGE(res): moves floating-point register f9 into `res` (an "=f"
 * output operand) and declares a "memory" clobber.
 *
 * Spelled with __asm__ rather than bare `asm`, like every other inline-asm
 * statement in this file, so the macro still expands to a recognised
 * keyword when the compiler runs in a strict ISO mode.
 */
#define GET_IMAGE(res) __asm__ __volatile__("mov %0 = f9" : "=f"(res) : : "memory")
/*
 * blas_lock (intrinsic variant): loops while *address is non-zero or while
 * _InterlockedCompareExchange(address, 1, 0) returns non-zero.
 *
 * The pointer is cast to `volatile int *` for the intrinsic, so the exchange
 * acts on an int-sized object at that address.
 * NOTE(review): the loop body and closing brace are not visible in this
 * excerpt.
 */
100 static __inline void blas_lock(volatile unsigned long *address){
101 while (*address || _InterlockedCompareExchange((volatile int *) address,1,0))
/* Marks that this header supplied a blas_lock implementation. */
104 #define BLAS_LOCK_DEFINED
/*
 * rpcc (intrinsic variant): returns __getReg(_IA64_REG_AR_ITC).
 * NOTE(review): the return type here is `unsigned int`, whereas the
 * inline-asm rpcc in this file returns `unsigned long`; if __getReg yields
 * a wider value it is narrowed on return. Confirm this is intended.
 * NOTE(review): the closing brace is not visible in this excerpt.
 */
106 static __inline unsigned int rpcc(void) {
107 return __getReg(_IA64_REG_AR_ITC);
/*
 * stmxcsr (intrinsic variant): returns __getReg(_IA64_REG_AR_FPSR).
 * NOTE(review): the return type is `unsigned int`, but IDEBUG_START stores
 * the result in an `unsigned long` and the inline-asm variant returns
 * `unsigned long`; any upper bits would be dropped here. Confirm.
 * NOTE(review): the closing brace is not visible in this excerpt.
 */
111 static __inline unsigned int stmxcsr(void) {
112 return __getReg(_IA64_REG_AR_FPSR);
/*
 * ldmxcsr (intrinsic variant): passes `fp` to
 * __setReg(_IA64_REG_AR_FPSR, ...).
 * NOTE(review): the function is declared void yet uses `return <expr>;`.
 * ISO C does not allow a return statement with an expression in a void
 * function; consider dropping the `return` keyword.
 * NOTE(review): the closing brace is not visible in this excerpt.
 */
115 static __inline void ldmxcsr(unsigned long fp) {
117 return __setReg(_IA64_REG_AR_FPSR, fp);
/*
 * Intrinsic forms of GET_IMAGE: __stfd or __stfs is invoked with the address
 * of `res` and the constant 9.
 * NOTE(review): the conditionals choosing between the two definitions are
 * not visible in this excerpt. The inline-asm GET_IMAGE in this file reads
 * f9, so the 9 presumably names that register; confirm.
 */
122 #define GET_IMAGE(res) __stfd(&res, 9)
124 #define GET_IMAGE(res) __stfs(&res, 9)
/* GET_IMAGE_CANCEL expands to nothing. */
129 #define GET_IMAGE_CANCEL
131 #ifdef ENABLE_SSE_EXCEPTION
/*
 * IDEBUG_START: declares fp_sse_mode and new_fp_mode, saves the value
 * returned by stmxcsr() in fp_sse_mode, clears the FE_UNDERFLOW,
 * FE_OVERFLOW, FE_UNNORMAL and FE_INVALID bits in a copy, and loads that
 * copy with ldmxcsr().
 * NOTE(review): part of the macro text (e.g. an opening brace) may be
 * missing from this excerpt; confirm against the full file.
 */
133 #define IDEBUG_START \
135 unsigned long fp_sse_mode, new_fp_mode; \
136 fp_sse_mode = stmxcsr();\
137 new_fp_mode = (fp_sse_mode & ~(FE_UNDERFLOW | FE_OVERFLOW | FE_UNNORMAL | FE_INVALID));\
138 ldmxcsr(new_fp_mode);
141 ldmxcsr(fp_sse_mode); \
152 extern unsigned long blas_quick_divide_table[];
/*
 * blas_quickdivide (inline-asm variant).
 *
 * Returns x unchanged when y <= 1. Otherwise the asm moves x into f6, loads
 * blas_quick_divide_table[y] into f7, multiplies them with xmpy.hu and
 * moves the result from f6 into the output operand.
 * NOTE(review): the table is presumably a list of precomputed reciprocals,
 * so the result approximates x / y; the table contents are not visible.
 * NOTE(review): the output-operand line, the return statement and the
 * closing brace are not visible in this excerpt.
 */
155 static __inline long blas_quickdivide(unsigned long int x, unsigned long int y){
158 if (y <= 1) return x;
160 __asm__ __volatile__("setf.sig f6 = %1\n\t"
161 "ldf8 f7 = [%2];;\n\t"
162 "xmpy.hu f6= f6, f7;;\n\t"
163 "getf.sig %0 = f6;;\n"
165 : "r"(x), "r"(&blas_quick_divide_table[y]) : "f6", "f7"
171 /* Using Intel Compiler */
/*
 * blas_quickdivide (Intel intrinsic variant): returns x when y <= 1,
 * otherwise _m64_xmahu(x, blas_quick_divide_table[y], 0).
 * NOTE(review): the closing brace is not visible in this excerpt.
 */
172 static __inline long blas_quickdivide(unsigned long int x, unsigned long int y){
173 if (y <= 1) return x;
174 return _m64_xmahu(x, blas_quick_divide_table[y], 0);
180 extern unsigned int blas_quick_divide_table[];
/*
 * blas_quickdivide (32-bit table variant): returns x when y <= 1, otherwise
 * the product x * blas_quick_divide_table[y], computed in `unsigned long`,
 * shifted right by 32 and converted to int.
 * NOTE(review): the shift by 32 is only defined when `unsigned long` is
 * wider than 32 bits; confirm for every target that uses this branch.
 * NOTE(review): the closing brace is not visible in this excerpt.
 */
182 static __inline int blas_quickdivide(unsigned int x, unsigned int y){
183 if (y <= 1) return x;
184 return (int)((x * (unsigned long)blas_quick_divide_table[y]) >> 32);
/*
 * Generic GEMM names mapped to the d-prefixed (real) and z-prefixed
 * (complex) routines.
 * NOTE(review): the conditional that selects this group is not visible in
 * this excerpt.
 */
193 #define GEMM_NCOPY dgemm_ncopy
194 #define GEMM_TCOPY dgemm_tcopy
195 #define ZGEMM_NCOPY zgemm_ncopy
196 #define ZGEMM_TCOPY zgemm_tcopy
197 #define GEMM_KERNEL dgemm_kernel
/*
 * ZGEMM_KERNEL is chosen from the two-letter variant macros: the _n, _l, _r
 * or _b kernel depending on which of the listed macros is defined.
 * NOTE(review): the matching #endif lines are not visible in this excerpt.
 */
199 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
200 #define ZGEMM_KERNEL zgemm_kernel_n
202 #if defined(CN) || defined(CT) || defined(RN) || defined(RT)
203 #define ZGEMM_KERNEL zgemm_kernel_l
205 #if defined(NC) || defined(TC) || defined(NR) || defined(TR)
206 #define ZGEMM_KERNEL zgemm_kernel_r
208 #if defined(CC) || defined(CR) || defined(RC) || defined(RR)
209 #define ZGEMM_KERNEL zgemm_kernel_b
/*
 * Generic GEMM names mapped to the s-prefixed (real) and c-prefixed
 * (complex) routines. The generic ZGEMM_* names are reused for the
 * c-prefixed routines.
 * NOTE(review): the conditional that selects this group is not visible in
 * this excerpt.
 */
213 #define GEMM_NCOPY sgemm_ncopy
214 #define GEMM_TCOPY sgemm_tcopy
215 #define ZGEMM_NCOPY cgemm_ncopy
216 #define ZGEMM_TCOPY cgemm_tcopy
217 #define GEMM_KERNEL sgemm_kernel
/*
 * ZGEMM_KERNEL is chosen from the two-letter variant macros: the _n, _l, _r
 * or _b kernel depending on which of the listed macros is defined.
 * NOTE(review): the matching #endif lines are not visible in this excerpt.
 */
219 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
220 #define ZGEMM_KERNEL cgemm_kernel_n
222 #if defined(CN) || defined(CT) || defined(RN) || defined(RT)
223 #define ZGEMM_KERNEL cgemm_kernel_l
225 #if defined(NC) || defined(TC) || defined(NR) || defined(TR)
226 #define ZGEMM_KERNEL cgemm_kernel_r
228 #if defined(CC) || defined(CR) || defined(RC) || defined(RR)
229 #define ZGEMM_KERNEL cgemm_kernel_b
/*
 * Integer-compare mnemonics for the configuration that uses the plain `cmp`
 * forms; the sibling group in this file uses the `cmp4` forms instead.
 *
 * CMP4NE must expand to the not-equal relation. It matches the macro name
 * and the cmp4.ne definition in the sibling group, and must not duplicate
 * CMP4GE.
 */
#define CMP4GE cmp.ge
#define CMP4NE cmp.ne
#define CMP4EQ cmp.eq
/*
 * Integer-compare mnemonics using the `cmp4` forms (ge / ne / eq).
 * NOTE(review): the conditional that selects this group is not visible in
 * this excerpt.
 */
244 #define CMP4GE cmp4.ge
245 #define CMP4NE cmp4.ne
246 #define CMP4EQ cmp4.eq
/*
 * HALT expands to an assembly move of 0 into r0.
 * NOTE(review): r0 is presumably not a writable register on this
 * architecture, in which case the instruction is meant to fault; confirm.
 */
249 #define HALT mov r0 = 0
/*
 * Floating-point load/store macro names mapped to the ldfe / ldfpe / stfe
 * mnemonics, with the locality-hint suffix (.t1, .nt1, .nt2, .nta) carried
 * in the macro name.
 * NOTE(review): the conditional that selects this group and the remaining
 * macros of the group are not visible in this excerpt.
 */
256 #define LDFD_T1 ldfe.t1
257 #define LDFD_NT1 ldfe.nt1
258 #define LDFD_NT2 ldfe.nt2
259 #define LDFD_NTA ldfe.nta
260 #define LDFPD_NT1 ldfpe.nt1
261 #define LDFPD_NT2 ldfpe.nt2
262 #define LDFPD_NTA ldfpe.nta
264 #define STFD_NTA stfe.nta
273 #elif defined(DOUBLE)
/*
 * DOUBLE configuration: LDF8_* / STF8_* map to ldf8 / stf8, and LDFD_* /
 * LDFPD_* / STFD_* map to ldfd / ldfpd / stfd, each with the locality-hint
 * suffix named by the macro.
 * NOTE(review): other macros of this group are not visible in this excerpt.
 */
277 #define LDF8_NT1 ldf8.nt1
278 #define LDF8_NTA ldf8.nta
280 #define STF8_NTA stf8.nta
283 #define LDFD_T1 ldfd.t1
284 #define LDFD_NT1 ldfd.nt1
285 #define LDFD_NT2 ldfd.nt2
286 #define LDFD_NTA ldfd.nta
287 #define LDFPD_NT1 ldfpd.nt1
288 #define LDFPD_NT2 ldfpd.nt2
289 #define LDFPD_NTA ldfpd.nta
291 #define STFD_NTA stfd.nta
/*
 * Group in which every load/store macro maps to the single-precision
 * mnemonics: LDF8_* and LDFD_* both expand to ldfs, LDFPD_* to ldfps, and
 * STF8_* / STFD_* to stfs.
 * NOTE(review): presumably the fall-through branch after the DOUBLE case;
 * the selecting directive is not visible in this excerpt.
 */
304 #define LDF8_NT1 ldfs.nt1
305 #define LDF8_NTA ldfs.nta
307 #define STF8_NTA stfs.nta
310 #define LDFD_T1 ldfs.t1
311 #define LDFD_NT1 ldfs.nt1
312 #define LDFD_NT2 ldfs.nt2
313 #define LDFD_NTA ldfs.nta
314 #define LDFPD_NT1 ldfps.nt1
315 #define LDFPD_NT2 ldfps.nt2
316 #define LDFPD_NTA ldfps.nta
318 #define STFD_NTA stfs.nta
/*
 * REALNAME aliases either ASMNAME or ASMFNAME.
 * NOTE(review): the conditional choosing between the two definitions is not
 * visible in this excerpt.
 */
340 #define REALNAME ASMNAME
342 #define REALNAME ASMFNAME
/*
 * Return-convention macro per Fortran interface macro:
 * F_INTERFACE_G77, F_INTERFACE_G95 and F_INTERFACE_INTEL define
 * RETURN_BY_STACK; F_INTERFACE_GFORT defines RETURN_BY_REGS.
 * NOTE(review): the matching #endif lines are not visible in this excerpt.
 */
345 #ifdef F_INTERFACE_G77
346 #define RETURN_BY_STACK
349 #ifdef F_INTERFACE_G95
350 #define RETURN_BY_STACK
353 #ifdef F_INTERFACE_GFORT
354 #define RETURN_BY_REGS
357 #ifdef F_INTERFACE_INTEL
358 #define RETURN_BY_STACK
377 alloc out0 = ar.pfs, 8, 0, 4, 0; \
380 addl out3 = @ltoff(.LP0), r1;;; \
381 br.call.sptk.many b0 = _mcount;;
/*
 * On Linux ELF targets, GNUSTACK expands to an assembler directive that
 * emits an empty .note.GNU-stack section.
 * NOTE(review): the #else / #endif part of this conditional is not visible
 * in this excerpt.
 */
386 #if defined(__linux__) && defined(__ELF__)
387 #define GNUSTACK .section .note.GNU-stack,"",@progbits
/* Fixed address constant from which BASE_ADDRESS is derived by subtraction. */
396 #define START_ADDRESS 0x20000fc800000000UL
401 #ifdef CONFIG_IA64_PAGE_SIZE_4KB
405 #ifdef CONFIG_IA64_PAGE_SIZE_8KB
/* 128 << 20 bytes = 128 MiB. */
410 #define BUFFER_SIZE (128 << 20)
/* 16UL << 10 bytes = 16 KiB.
 * NOTE(review): the conditional that selects this value is not visible in
 * this excerpt. */
413 #define PAGESIZE (16UL << 10)
/* 4 << 20 bytes = 4 MiB. */
415 #define HUGE_PAGESIZE ( 4 << 20)
/* START_ADDRESS minus BUFFER_SIZE * MAX_CPU_NUMBER. BUFFER_SIZE is cast to
 * BLASULONG before the multiplication, so the product is not computed in
 * int. */
417 #define BASE_ADDRESS (START_ADDRESS - (BLASULONG)BUFFER_SIZE * MAX_CPU_NUMBER)