*****************************************************************************/
/**************************************************************************************
-* 2013/10/13 Saar
+* 2013/11/02 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
*
-* 2013/10/13 Saar
+* 2013/11/02 Saar
* UNROLL_N 4
* UNROLL_M 4
* DGEMM_P 128
* DGEMM_Q 240
-* DGEMM_R 4096
-* A_PRE 96
-* B_PRE 96
-* C_PRE 64
+* DGEMM_R 12288
+* A_PRE 128
+* B_PRE 128
+* C_PRE 32
*
-* Performance on Odroid U2:
+* Performance on Odroid U2:
*
-* 1 Core: 2.60 GFLOPS ATLAS: 2.67 GFLOPS
-* 2 Cores: 5.17 GFLOPS ATLAS: 5.25 GFLOPS
-* 3 Cores: 7.60 GFLOPS ATLAS: 7.82 GFLOPS
-* 4 Cores: 9.98 GFLOPS ATLAS: 9.95 GFLOPS
+* 3072x3072 1 Core: 2.62 GFLOPS ATLAS: 2.69 GFLOPS
+* 3072x3072 2 Cores: 5.23 GFLOPS ATLAS: 5.27 GFLOPS
+* 3072x3072 3 Cores: 7.78 GFLOPS ATLAS: 7.87 GFLOPS
+* 3072x3072 4 Cores: 10.10 GFLOPS ATLAS: 9.98 GFLOPS
**************************************************************************************/
#define ASSEMBLER
#define K1 r7
#define BC r12
-#define A_PRE 96
-#define B_PRE 96
-#define C_PRE 64
+#define A_PRE 128
+#define B_PRE 128
+#define C_PRE 32
/**************************************************************************************
* Macro definitions
.macro KERNEL4x4_I
+ pld [ AO , #A_PRE ]
fldmias AO!, { s0 - s1 }
- pld [ AO , #A_PRE-8 ]
+ pld [ BO , #B_PRE ]
fldmias BO!, { s8 - s9 }
- pld [ BO , #B_PRE-8 ]
fmuls s16 , s0, s8
fldmias AO!, { s2 - s3 }
pld [ AO , #A_PRE ]
fmacs s16 , s4, s12
fmacs s17 , s5, s12
- fldmias AO!, { s0 - s1 }
+ fldmias AO!, { s0 - s3 }
fmacs s18 , s6, s12
pld [ BO , #B_PRE ]
fmacs s19 , s7, s12
fmacs s20 , s4, s13
- fldmias AO!, { s2 - s3 }
+ fldmias BO!, { s8 - s11 }
fmacs s21 , s5, s13
fmacs s22 , s6, s13
- fldmias BO!, { s8 - s9 }
+ //fldmias AO!, { s2 - s3 }
fmacs s23 , s7, s13
fmacs s24 , s4, s14
- fldmias BO!, { s10 - s11 }
+ //fldmias BO!, { s10 - s11 }
fmacs s25 , s5, s14
fmacs s26 , s6, s14
fmacs s27 , s7, s14
.macro KERNEL4x4_M1
fmacs s16 , s0, s8
- fldmias AO!, { s4 - s5 }
+ fldmias AO!, { s4 - s7 }
fmacs s17 , s1, s8
fmacs s18 , s2, s8
- fldmias AO!, { s6 - s7 }
+ fldmias BO!, { s12 - s15 }
+ //fldmias AO!, { s6 - s7 }
fmacs s19 , s3, s8
fmacs s20 , s0, s9
- fldmias BO!, { s12 - s13 }
fmacs s21 , s1, s9
fmacs s22 , s2, s9
- fldmias BO!, { s14 - s15 }
+ //fldmias BO!, { s14 - s15 }
fmacs s23 , s3, s9
fmacs s24 , s0, s10
.macro KERNEL4x4_SUB
flds s8 , [ BO ]
- pld [ BO , #B_PRE ]
flds s0 , [ AO ]
- pld [ AO , #A_PRE ]
flds s1 , [ AO, #4 ]
fmacs s16 , s0, s8
.endm
.macro SAVE4x4
- pld [ CO1 , #C_PRE ]
ldr r3 , LDC
add CO2 , CO1, r3
flds s0, ALPHA
add r4 , CO2, r3
- pld [ CO2 , #C_PRE ]
fldmias CO1, { s8 - s11 }
- pld [ r4 , #C_PRE ]
fmacs s8 , s0 , s16
flds s12, [CO2]
fmacs s15, s0 , s23
fsts s11, [CO1, #12 ]
+ pld [ CO1 , #C_PRE ]
+
fldmias r4, { s8 - s11 }
fmacs s8 , s0 , s24
fmacs s11, s0 , s27
fsts s15, [CO2, #12 ]
+ pld [ CO2 , #C_PRE ]
+
add CO2, r4 , r3
- pld [ CO2 , #C_PRE ]
fldmias CO2, { s12 - s15 }
fsts s11, [r4 , #12 ]
fmacs s15, s0 , s31
+ pld [ r4 , #C_PRE ]
fstmias CO2, { s12 - s15 }
+ pld [ CO2 , #C_PRE ]
add CO1, CO1, #16
mov BO, BC
- asrs L , K1, #3 // L = L / 8
- cmp L , #3
- blt _L4_M4_30
- .align 5
+ asrs L , K1, #1 // L = L / 2
+ cmp L , #2
+ blt _L4_M4_32
KERNEL4x4_I
KERNEL4x4_M2
- KERNEL4x4_M1
- KERNEL4x4_M2
-
- KERNEL4x4_M1
- KERNEL4x4_M2
- KERNEL4x4_M1
- KERNEL4x4_M2
- sub L, L, #2
+ subs L, L, #2
+ ble _L4_M4_22a
+ .align 5
_L4_M4_22:
KERNEL4x4_M1
KERNEL4x4_M2
- KERNEL4x4_M1
- KERNEL4x4_M2
-
- KERNEL4x4_M1
- KERNEL4x4_M2
- KERNEL4x4_M1
- KERNEL4x4_M2
subs L, L, #1
bgt _L4_M4_22
- KERNEL4x4_M1
- KERNEL4x4_M2
- KERNEL4x4_M1
- KERNEL4x4_M2
+_L4_M4_22a:
KERNEL4x4_M1
- KERNEL4x4_M2
- KERNEL4x4_M1
- KERNEL4x4_E
-
- b _L4_M4_44
-
-
-_L4_M4_30:
- tst L, #3
- ble _L4_M4_40
-
- tst L, #2
- ble _L4_M4_32
-
- KERNEL4x4_I
- KERNEL4x4_M2
- KERNEL4x4_M1
- KERNEL4x4_M2
-
- KERNEL4x4_M1
- KERNEL4x4_M2
- KERNEL4x4_M1
- KERNEL4x4_M2
-
- KERNEL4x4_M1
- KERNEL4x4_M2
- KERNEL4x4_M1
- KERNEL4x4_M2
-
-
- KERNEL4x4_M1
- KERNEL4x4_M2
- KERNEL4x4_M1
KERNEL4x4_E
b _L4_M4_44
ble _L4_M4_40
KERNEL4x4_I
- KERNEL4x4_M2
- KERNEL4x4_M1
- KERNEL4x4_M2
- KERNEL4x4_M1
- KERNEL4x4_M2
- KERNEL4x4_M1
KERNEL4x4_E
b _L4_M4_44
_L4_M4_44:
- ands L , K1, #7 // L = L % 8
+ ands L , K1, #1 // L = L % 2
ble _L4_M4_100
_L4_M4_46:
#define ZGEMM_DEFAULT_UNROLL_M 2
#define ZGEMM_DEFAULT_UNROLL_N 2
-#define SGEMM_DEFAULT_P 192
+#define SGEMM_DEFAULT_P 128
#define DGEMM_DEFAULT_P 128
#define CGEMM_DEFAULT_P 96
#define ZGEMM_DEFAULT_P 64
-#define SGEMM_DEFAULT_Q 120
+#define SGEMM_DEFAULT_Q 240
#define DGEMM_DEFAULT_Q 120
#define CGEMM_DEFAULT_Q 120
#define ZGEMM_DEFAULT_Q 120
-#define SGEMM_DEFAULT_R 16384
+#define SGEMM_DEFAULT_R 12288
#define DGEMM_DEFAULT_R 8192
#define CGEMM_DEFAULT_R 4096
#define ZGEMM_DEFAULT_R 4096