#ifndef ASSEMBLER
-static void INLINE blas_lock(volatile unsigned long *address){
+static void __inline blas_lock(volatile BLASULONG *address){
+
+  register int ret;
-// long int ret, val = 1;
-/*
  do {
    while (*address) {YIELDING;};
    __asm__ __volatile__(
- "1: ll %0, %3\n"
- " ori %2, %0, 1\n"
- " sc %2, %1\n"
- " beqz %2, 1b\n"
- " andi %2, %0, 1\n"
- " sync\n"
- : "=&r" (val), "=m" (address), "=&r" (ret)
- : "m" (address)
- : "memory");
+ // Atomic test-and-set, mirroring the MIPS ll/sc sequence above.
+ // ret is nonzero when the lock was already held OR the strex lost
+ // its exclusive reservation; the enclosing do/while retries in both
+ // cases. (Storing 0 here, or discarding the loaded value, would
+ // mean the lock is never actually acquired.)
+ "ldrex %0, [%1] \n\t" // ret = old lock value
+ "mov r2, #1 \n\t" // value meaning "locked"
+ "strex r3, r2, [%1] \n\t" // r3 = 0 iff the store succeeded
+ "orr %0, %0, r3 \n\t" // retry if held or store failed
+ "dmb \n\t" // acquire barrier -- ARMv7; TODO confirm target arch
+ : "=&r"(ret), "=r"(address)
+ : "1"(address)
+ : "memory", "r2" , "r3"
+
+
+ );
  } while (ret);
-*/
+
}
-static inline unsigned int rpcc(void){
- unsigned long ret=0;
+// Pseudo cycle counter: microsecond wall-clock time from gettimeofday.
+// NOTE(review): wraps quickly when BLASULONG is 32-bit; callers appear
+// to use only short deltas -- confirm.
+static inline BLASULONG rpcc(void){
+  BLASULONG ret=0;
+  struct timeval tv;
+  gettimeofday(&tv,NULL);
+  ret=1000000* tv.tv_sec + tv.tv_usec;
  return ret;
}
--- /dev/null
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/10/11 Saar
+* BLASTEST : xOK
+* CTEST : xOK
+* TEST : xOK
+*
+**************************************************************************************/
+
+// NOTE(review): new file. Packs panels of a single-precision matrix A
+// (vectors separated by LDA -- columns in column-major storage) into a
+// contiguous buffer B, four vectors at a time with two- and one-vector
+// tails. This looks like a SGEMM "ncopy"-style packing kernel for ARM
+// VFP -- confirm the intended file name against the build rules.
+
+#define ASSEMBLER
+#include "common.h"
+
+#define STACKSIZE 256
+
+// Argument registers at entry (AAPCS): r0=M, r1=N, r2=A, r3=lda.
+#define OLD_M r0
+#define OLD_N r1
+#define OLD_A r2
+#define OLD_LDA r3
+
+
+/******************************************************
+* [fp, #-128] - [fp, #-64] is reserved
+* for store and restore of floating point
+* registers
+*******************************************************/
+
+#define LDA [fp, #-260 ] // stack slot holding lda scaled to bytes (see "str r3, LDA")
+
+#define B [fp, #4 ] // fifth argument (destination buffer) passed on the stack
+
+#define M r0
+#define N r1
+#define A r2
+
+#define BO r5
+
+// AO1..AO4: pointers to the four lda-separated source vectors being packed.
+#define AO1 r6
+#define AO2 r7
+#define AO3 r8
+#define AO4 r9
+
+#define I r3
+#define J r12
+
+#define A_PRE 96 // NOTE(review): defined but never referenced in this file
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+// COPY4x4: move one 4x4 tile. Element k of each of the four source
+// vectors (AO1..AO4) is stored consecutively in B (s0-s3 = element 0 of
+// each vector, s4-s7 = element 1, s8-s15 = elements 2 and 3); all four
+// pointers advance by 16 bytes. The pointer increments are interleaved
+// with the loads, presumably to hide latency.
+.macro COPY4x4
+
+	flds s0 , [ AO1, #0 ]
+	flds s1 , [ AO2, #0 ]
+	flds s2 , [ AO3, #0 ]
+	flds s3 , [ AO4, #0 ]
+
+	flds s4 , [ AO1, #4 ]
+	flds s8 , [ AO1, #8 ]
+	flds s12, [ AO1, #12 ]
+
+	flds s5 , [ AO2, #4 ]
+	add AO1, AO1, #16
+	flds s9 , [ AO2, #8 ]
+	flds s13, [ AO2, #12 ]
+
+	flds s6 , [ AO3, #4 ]
+	add AO2, AO2, #16
+	flds s10, [ AO3, #8 ]
+	flds s14, [ AO3, #12 ]
+
+	flds s7 , [ AO4, #4 ]
+	add AO3, AO3, #16
+	flds s11, [ AO4, #8 ]
+	flds s15, [ AO4, #12 ]
+
+	fstmias BO!, { s0 - s3 }
+	add AO4, AO4, #16
+	fstmias BO!, { s4 - s7 }
+	fstmias BO!, { s8 - s15 }
+
+.endm
+
+// COPY1x4: move one element from each of the four source vectors into B
+// (4 consecutive floats); advance all four pointers by 4 bytes.
+.macro COPY1x4
+
+	flds s0 , [ AO1, #0 ]
+	flds s1 , [ AO2, #0 ]
+	add AO1, AO1, #4
+	flds s2 , [ AO3, #0 ]
+	add AO2, AO2, #4
+	flds s3 , [ AO4, #0 ]
+
+	add AO3, AO3, #4
+	fstmias BO!, { s0 - s3 }
+	add AO4, AO4, #4
+
+.endm
+
+// COPY4x2: move a 4x2 tile from AO1/AO2; the two vectors are interleaved
+// pairwise in B (s0,s1 = element 0 of each, s2,s3 = element 1, ...);
+// both pointers advance by 16 bytes.
+.macro COPY4x2
+
+	flds s0 , [ AO1, #0 ]
+	flds s2 , [ AO1, #4 ]
+	flds s4 , [ AO1, #8 ]
+	flds s6 , [ AO1, #12 ]
+
+	flds s1 , [ AO2, #0 ]
+	flds s3 , [ AO2, #4 ]
+	add AO1, AO1, #16
+	flds s5 , [ AO2, #8 ]
+	flds s7 , [ AO2, #12 ]
+
+	fstmias BO!, { s0 - s7 }
+	add AO2, AO2, #16
+
+.endm
+
+
+// COPY1x2: move one element from each of AO1/AO2 into B; advance both
+// pointers by 4 bytes.
+.macro COPY1x2
+
+	flds s0 , [ AO1, #0 ]
+	flds s1 , [ AO2, #0 ]
+	add AO1, AO1, #4
+
+	fstmias BO!, { s0 - s1 }
+	add AO2, AO2, #4
+
+.endm
+
+// COPY4x1: move four consecutive elements of the single vector AO1 into
+// B; advance AO1 by 16 bytes.
+.macro COPY4x1
+
+	flds s0 , [ AO1, #0 ]
+	flds s1 , [ AO1, #4 ]
+	flds s2 , [ AO1, #8 ]
+	flds s3 , [ AO1, #12 ]
+
+	fstmias BO!, { s0 - s3 }
+	add AO1, AO1, #16
+
+.endm
+
+
+// COPY1x1: move a single element from AO1 into B; advance AO1 by 4 bytes.
+.macro COPY1x1
+
+	flds s0 , [ AO1, #0 ]
+
+	fstmias BO!, { s0 }
+	add AO1, AO1, #4
+
+.endm
+
+
+
+
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+// Entry (AAPCS): r0 = M, r1 = N, r2 = A, r3 = lda (in elements); the
+// destination pointer B is the fifth argument, read from the stack at
+// [fp, #4]. Returns 0 in r0.
+	PROLOGUE
+
+	.align 5
+
+	push	{r4 - r9, fp}
+	add	fp, sp, #24
+	sub	sp, sp, #STACKSIZE // reserve stack
+
+
+	lsl	r3, r3, #2 // lda = lda * 4
+	str	r3, LDA
+
+	sub	r4, fp, #128
+	vstm	r4, { s8 - s15} // store floating point registers
+
+	ldr	BO, B
+
+// Main loop: process N in blocks of four lda-separated vectors, each
+// block copied M elements deep (4 at a time, then a 1-at-a-time tail).
+_L4_BEGIN:
+
+	asrs	J, N, #2 // J = N / 4
+	ble	_L2_BEGIN
+
+_L4_M4_BEGIN:
+
+	mov	AO1, A // AO1 = A
+	ldr	r4 , LDA
+	add	AO2, AO1, r4
+	add	AO3, AO2, r4
+	add	AO4, AO3, r4
+	add	A , AO4, r4 // A = A + 4 * LDA
+
+	asrs	I, M, #2 // I = M / 4
+	ble	_L4_M4_40
+
+_L4_M4_20:
+
+	COPY4x4
+
+	subs	I , I , #1
+	bne	_L4_M4_20
+
+
+_L4_M4_40:
+
+	ands	I, M , #3 // remaining M tail (0..3 elements)
+	ble	_L4_M4_END
+
+_L4_M4_60:
+
+	COPY1x4
+
+	subs	I , I , #1
+	bne	_L4_M4_60
+
+
+_L4_M4_END:
+
+	subs	J , J, #1 // j--
+	bne	_L4_M4_BEGIN
+
+
+
+/*********************************************************************************************/
+
+// Handle a remaining pair of vectors (N & 2).
+_L2_BEGIN:
+
+	tst	N, #3
+	ble	_L999
+
+	tst	N, #2
+	ble	_L1_BEGIN
+
+_L2_M4_BEGIN:
+
+	mov	AO1, A // AO1 = A
+	ldr	r4 , LDA
+	add	AO2, AO1, r4
+	add	A , AO2, r4 // A = A + 2 * LDA
+
+	asrs	I, M, #2 // I = M / 4
+	ble	_L2_M4_40
+
+_L2_M4_20:
+
+	COPY4x2
+
+	subs	I , I , #1
+	bne	_L2_M4_20
+
+
+_L2_M4_40:
+
+	ands	I, M , #3 // remaining M tail (0..3 elements)
+	ble	_L2_M4_END
+
+_L2_M4_60:
+
+	COPY1x2
+
+	subs	I , I , #1
+	bne	_L2_M4_60
+
+
+_L2_M4_END:
+
+
+/*********************************************************************************************/
+
+// Handle the final single vector (N & 1).
+_L1_BEGIN:
+
+	tst	N, #1
+	ble	_L999
+
+
+_L1_M4_BEGIN:
+
+	mov	AO1, A // AO1 = A
+	ldr	r4 , LDA
+	add	A , AO1, r4 // A = A + 1 * LDA
+
+	asrs	I, M, #2 // I = M / 4
+	ble	_L1_M4_40
+
+_L1_M4_20:
+
+	COPY4x1
+
+	subs	I , I , #1
+	bne	_L1_M4_20
+
+
+_L1_M4_40:
+
+	ands	I, M , #3 // remaining M tail (0..3 elements)
+	ble	_L1_M4_END
+
+_L1_M4_60:
+
+	COPY1x1
+
+	subs	I , I , #1
+	bne	_L1_M4_60
+
+
+_L1_M4_END:
+
+
+
+// Common exit: restore callee-saved VFP registers and the stack frame.
+_L999:
+
+	sub	r3, fp, #128
+	vldm	r3, { s8 - s15} // restore floating point registers
+
+	movs	r0, #0 // set return value
+	sub	sp, fp, #24
+	pop	{r4 - r9, fp}
+	bx	lr
+
+	EPILOGUE
+
#define ZGEMM_DEFAULT_UNROLL_M 2
#define ZGEMM_DEFAULT_UNROLL_N 2
+// NOTE(review): GEMM blocking retune -- P/Q are the cache-blocking
+// dimensions and R partitions the working buffer; presumably sized for
+// the new target's cache hierarchy. Confirm with benchmarks before
+// merging.
-#define SGEMM_DEFAULT_P 128
+#define SGEMM_DEFAULT_P 192
#define DGEMM_DEFAULT_P 128
-#define CGEMM_DEFAULT_P 24
+#define CGEMM_DEFAULT_P 96
#define ZGEMM_DEFAULT_P 20
-#define SGEMM_DEFAULT_Q 240
-#define DGEMM_DEFAULT_Q 96
-#define CGEMM_DEFAULT_Q 128
+#define SGEMM_DEFAULT_Q 120
+#define DGEMM_DEFAULT_Q 120
+#define CGEMM_DEFAULT_Q 120
#define ZGEMM_DEFAULT_Q 64
-#define SGEMM_DEFAULT_R 4096
-#define DGEMM_DEFAULT_R 512
-#define CGEMM_DEFAULT_R 512
+#define SGEMM_DEFAULT_R 16384
+#define DGEMM_DEFAULT_R 8192
+#define CGEMM_DEFAULT_R 4096
#define ZGEMM_DEFAULT_R 512