*****************************************************************************/\r
#define MY_ALIGN .align 3\r
b ZGEMM_L2\r
+/* MINI SUBROUTINES */ \r
+/* 2x8 MAIN 128x+2 LOOP */ \r
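+/* Each helper below is entered with bl and returns with blr; T8 carries the */ \r
+/* CTR trip count. The *_L2/*_E2 kernel macros consume two K iterations per */ \r
+/* call; their first two arguments are the AO and BO byte strides covering */ \r
+/* those two iterations. */ \r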
\r
-/* MINI SUBROUTINES */\r
\r
-\r
-\r
-/* 2x8 MAIN 128x+1 LOOP */ \r
-ZGEMM_L2x8_LMAIN_SUB: \r
- mtctr L\r
- LOAD2x8 0 \r
- MY_ALIGN\r
+ZGEMM_L2x8_LMAIN_SUB:\r
+/*----------------------------------------*/ \r
+ mtctr T8\r
+ LOAD2x8_2 \r
+ MY_ALIGN\r
ZGEMM_L2x8_LOOP:\r
- dcbt AO, PRE\r
- dcbt BO, PRE\r
- KERNEL2x8_L 128,32,0,0 \r
- KERNEL2x8_L 128,32,1,0\r
- dcbt AO, T2 \r
- KERNEL2x8_L 128,32,2,0\r
- KERNEL2x8_L 128,32,3,0 \r
- dcbt AO, T3\r
- dcbt BO, T2\r
- KERNEL2x8_L 128,32,4,0\r
- KERNEL2x8_L 128,32,5,0\r
- dcbt AO, T4 \r
- KERNEL2x8_L 128,32,6,0\r
- KERNEL2x8_L 128,32,7,0 \r
- dcbt AO, T5 \r
- dcbt BO, T3\r
- KERNEL2x8_L 128,32,8,0\r
- KERNEL2x8_L 128,32,9,0\r
- KERNEL2x8_L 128,32,10,0\r
- KERNEL2x8_L 128,32,11,0 \r
- dcbt BO, T4\r
- KERNEL2x8_L 128,32,12,0\r
- KERNEL2x8_L 128,32,13,0\r
- KERNEL2x8_L 128,32,14,0\r
- KERNEL2x8_L 128,32,15,0 \r
- KERNEL2x8_L 128,32,16,0\r
- KERNEL2x8_L 128,32,17,0 \r
- KERNEL2x8_L 128,32,18,0\r
- KERNEL2x8_L 128,32,19,0 \r
- KERNEL2x8_L 128,32,20,0\r
- KERNEL2x8_L 128,32,21,0 \r
- KERNEL2x8_L 128,32,22,0\r
- KERNEL2x8_L 128,32,23,0 \r
- KERNEL2x8_L 128,32,24,0\r
- KERNEL2x8_L 128,32,25,0\r
- KERNEL2x8_L 128,32,26,0\r
- KERNEL2x8_L 128,32,27,0 \r
- KERNEL2x8_L 128,32,28,0\r
- KERNEL2x8_L 128,32,29,0\r
- KERNEL2x8_L 128,32,30,0\r
- KERNEL2x8_L 128,32,31,0 \r
- KERNEL2x8_L 128,32,32,0\r
- KERNEL2x8_L 128,32,33,0\r
- KERNEL2x8_L 128,32,34,0\r
- KERNEL2x8_L 128,32,35,0 \r
- KERNEL2x8_L 128,32,36,0\r
- KERNEL2x8_L 128,32,37,0\r
- KERNEL2x8_L 128,32,38,0\r
- KERNEL2x8_L 128,32,39,0 \r
- KERNEL2x8_L 128,32,40,0\r
- KERNEL2x8_L 128,32,41,0\r
- KERNEL2x8_L 128,32,42,0\r
- KERNEL2x8_L 128,32,43,0 \r
- KERNEL2x8_L 128,32,44,0\r
- KERNEL2x8_L 128,32,45,0\r
- KERNEL2x8_L 128,32,46,0\r
- KERNEL2x8_L 128,32,47,0 \r
- KERNEL2x8_L 128,32,48,0\r
- KERNEL2x8_L 128,32,49,0 \r
- KERNEL2x8_L 128,32,50,0\r
- KERNEL2x8_L 128,32,51,0 \r
- KERNEL2x8_L 128,32,52,0\r
- KERNEL2x8_L 128,32,53,0 \r
- KERNEL2x8_L 128,32,54,0\r
- KERNEL2x8_L 128,32,55,0 \r
- KERNEL2x8_L 128,32,56,0\r
- KERNEL2x8_L 128,32,57,0\r
- KERNEL2x8_L 128,32,58,0\r
- KERNEL2x8_L 128,32,59,0 \r
- KERNEL2x8_L 128,32,60,0\r
- KERNEL2x8_L 128,32,61,0\r
- KERNEL2x8_L 128,32,62,0 \r
- KERNEL2x8_L 128,32,63,1 \r
- bdnz ZGEMM_L2x8_LOOP\r
- MY_ALIGN \r
+/*----------------------------------------*/ \r
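+/* dcbt issues cache prefetch touches; PRE and T2..T5 (1024, 1536, 2048, */ \r
+/* 2560, set in the *_BEGIN prologue) stagger the distances over the body. */ \r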
+ dcbt AO, PRE\r
+ dcbt BO, PRE\r
+ KERNEL2x8_L2 256,64,0,0 \r
+ZGEMM_L2x8_K128:\r
+/*----------------------------------------*/ \r
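+/* ZGEMM_L2x8_SUB0 branches here (past unroll step 0) to reuse this body */ \r
+/* exactly once when K is 128 or 129. */ \r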
+ KERNEL2x8_L2 256,64,1,0\r
+ dcbt AO, T2 \r
+ KERNEL2x8_L2 256,64,2,0\r
+ KERNEL2x8_L2 256,64,3,0 \r
+ dcbt AO, T3\r
+ dcbt BO, T2\r
+ KERNEL2x8_L2 256,64,4,0\r
+ KERNEL2x8_L2 256,64,5,0\r
+ dcbt AO, T4 \r
+ KERNEL2x8_L2 256,64,6,0\r
+ KERNEL2x8_L2 256,64,7,0 \r
+ dcbt AO, T5 \r
+ dcbt BO, T3\r
+ KERNEL2x8_L2 256,64,8,0\r
+ KERNEL2x8_L2 256,64,9,0\r
+ KERNEL2x8_L2 256,64,10,0\r
+ KERNEL2x8_L2 256,64,11,0 \r
+ dcbt BO, T4\r
+ KERNEL2x8_L2 256,64,12,0\r
+ KERNEL2x8_L2 256,64,13,0\r
+ KERNEL2x8_L2 256,64,14,0\r
+ KERNEL2x8_L2 256,64,15,0 \r
+ KERNEL2x8_L2 256,64,16,0\r
+ KERNEL2x8_L2 256,64,17,0 \r
+ KERNEL2x8_L2 256,64,18,0\r
+ KERNEL2x8_L2 256,64,19,0 \r
+ KERNEL2x8_L2 256,64,20,0\r
+ KERNEL2x8_L2 256,64,21,0 \r
+ KERNEL2x8_L2 256,64,22,0\r
+ KERNEL2x8_L2 256,64,23,0 \r
+ KERNEL2x8_L2 256,64,24,0\r
+ KERNEL2x8_L2 256,64,25,0\r
+ KERNEL2x8_L2 256,64,26,0\r
+ KERNEL2x8_L2 256,64,27,0 \r
+ KERNEL2x8_L2 256,64,28,0\r
+ KERNEL2x8_L2 256,64,29,0\r
+ KERNEL2x8_L2 256,64,30,0\r
+ KERNEL2x8_L2 256,64,31,0 \r
+ KERNEL2x8_L2 256,64,32,0\r
+ KERNEL2x8_L2 256,64,33,0\r
+ KERNEL2x8_L2 256,64,34,0\r
+ KERNEL2x8_L2 256,64,35,0 \r
+ KERNEL2x8_L2 256,64,36,0\r
+ KERNEL2x8_L2 256,64,37,0\r
+ KERNEL2x8_L2 256,64,38,0\r
+ KERNEL2x8_L2 256,64,39,0 \r
+ KERNEL2x8_L2 256,64,40,0\r
+ KERNEL2x8_L2 256,64,41,0\r
+ KERNEL2x8_L2 256,64,42,0\r
+ KERNEL2x8_L2 256,64,43,0 \r
+ KERNEL2x8_L2 256,64,44,0\r
+ KERNEL2x8_L2 256,64,45,0\r
+ KERNEL2x8_L2 256,64,46,0\r
+ KERNEL2x8_L2 256,64,47,0 \r
+ KERNEL2x8_L2 256,64,48,0\r
+ KERNEL2x8_L2 256,64,49,0 \r
+ KERNEL2x8_L2 256,64,50,0\r
+ KERNEL2x8_L2 256,64,51,0 \r
+ KERNEL2x8_L2 256,64,52,0\r
+ KERNEL2x8_L2 256,64,53,0 \r
+ KERNEL2x8_L2 256,64,54,0\r
+ KERNEL2x8_L2 256,64,55,0 \r
+ KERNEL2x8_L2 256,64,56,0\r
+ KERNEL2x8_L2 256,64,57,0\r
+ KERNEL2x8_L2 256,64,58,0\r
+ KERNEL2x8_L2 256,64,59,0 \r
+ KERNEL2x8_L2 256,64,60,0\r
+ KERNEL2x8_L2 256,64,61,0\r
+ KERNEL2x8_L2 256,64,62,0 \r
+ KERNEL2x8_L2 256,64,63,1 \r
+ bdnz ZGEMM_L2x8_LOOP\r
+ MY_ALIGN \r
ZGEMM_L2x8_LOOP_END:\r
- END2x8 AO, BO, 128,32 \r
- blr\r
-\r
+/*----------------------------------------*/ \r
+ END2x8_2\r
+ blr\r
MY_ALIGN\r
-ZGEMM_2x8_L64_SUB:\r
- LOAD2x8 0 \r
- dcbt AO, PRE\r
- dcbt BO, PRE\r
- KERNEL2x8_L 128,32,0,0 \r
- KERNEL2x8_L 128,32,1,0\r
- dcbt AO, T2 \r
- KERNEL2x8_L 128,32,2,0\r
- KERNEL2x8_L 128,32,3,0 \r
- dcbt AO, T3\r
- dcbt BO, T2\r
- KERNEL2x8_L 128,32,4,0\r
- KERNEL2x8_L 128,32,5,0\r
- dcbt AO, T4 \r
- KERNEL2x8_L 128,32,6,0\r
- KERNEL2x8_L 128,32,7,0 \r
- dcbt AO, T5 \r
- dcbt BO, T3\r
- KERNEL2x8_L 128,32,8,0\r
- KERNEL2x8_L 128,32,9,0\r
- KERNEL2x8_L 128,32,10,0\r
- KERNEL2x8_L 128,32,11,0 \r
- dcbt BO, T4\r
- KERNEL2x8_L 128,32,12,0\r
- KERNEL2x8_L 128,32,13,0\r
- KERNEL2x8_L 128,32,14,0\r
- KERNEL2x8_L 128,32,15,0 \r
- KERNEL2x8_L 128,32,16,0\r
- KERNEL2x8_L 128,32,17,0 \r
- KERNEL2x8_L 128,32,18,0\r
- KERNEL2x8_L 128,32,19,0 \r
- KERNEL2x8_L 128,32,20,0\r
- KERNEL2x8_L 128,32,21,0 \r
- KERNEL2x8_L 128,32,22,0\r
- KERNEL2x8_L 128,32,23,0 \r
- KERNEL2x8_L 128,32,24,0\r
- KERNEL2x8_L 128,32,25,0\r
- KERNEL2x8_L 128,32,26,0\r
- KERNEL2x8_L 128,32,27,0 \r
- KERNEL2x8_L 128,32,28,0\r
- KERNEL2x8_L 128,32,29,0\r
- KERNEL2x8_L 128,32,30,0\r
- KERNEL2x8_E 128,32,31,1\r
- blr\r
\r
\r
+ZGEMM_2x8_L64_SUB:\r
+/*----------------------------------------*/ \r
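+/* Fixed-count remainder helper: 64 K iterations (32 paired kernel calls); */ \r
+/* the L32/L16 variants below follow the same pattern. */ \r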
+ LOAD2x8_2 \r
+ dcbt AO, PRE\r
+ dcbt BO, PRE\r
+ KERNEL2x8_L2 256,64,0,0 \r
+ KERNEL2x8_L2 256,64,1,0\r
+ dcbt AO, T2 \r
+ KERNEL2x8_L2 256,64,2,0\r
+ KERNEL2x8_L2 256,64,3,0 \r
+ dcbt AO, T3\r
+ dcbt BO, T2\r
+ KERNEL2x8_L2 256,64,4,0\r
+ KERNEL2x8_L2 256,64,5,0\r
+ dcbt AO, T4 \r
+ KERNEL2x8_L2 256,64,6,0\r
+ KERNEL2x8_L2 256,64,7,0 \r
+ dcbt AO, T5 \r
+ dcbt BO, T3\r
+ KERNEL2x8_L2 256,64,8,0\r
+ KERNEL2x8_L2 256,64,9,0\r
+ KERNEL2x8_L2 256,64,10,0\r
+ KERNEL2x8_L2 256,64,11,0 \r
+ dcbt BO, T4\r
+ KERNEL2x8_L2 256,64,12,0\r
+ KERNEL2x8_L2 256,64,13,0\r
+ KERNEL2x8_L2 256,64,14,0\r
+ KERNEL2x8_L2 256,64,15,0 \r
+ KERNEL2x8_L2 256,64,16,0\r
+ KERNEL2x8_L2 256,64,17,0 \r
+ KERNEL2x8_L2 256,64,18,0\r
+ KERNEL2x8_L2 256,64,19,0 \r
+ KERNEL2x8_L2 256,64,20,0\r
+ KERNEL2x8_L2 256,64,21,0 \r
+ KERNEL2x8_L2 256,64,22,0\r
+ KERNEL2x8_L2 256,64,23,0 \r
+ KERNEL2x8_L2 256,64,24,0\r
+ KERNEL2x8_L2 256,64,25,0\r
+ KERNEL2x8_L2 256,64,26,0\r
+ KERNEL2x8_L2 256,64,27,0 \r
+ KERNEL2x8_L2 256,64,28,0\r
+ KERNEL2x8_L2 256,64,29,0\r
+ KERNEL2x8_L2 256,64,30,0\r
+ KERNEL2x8_E2 256,64,31,1\r
+ blr\r
MY_ALIGN\r
+\r
+\r
ZGEMM_2x8_L32_SUB:\r
- LOAD2x8 0 \r
- dcbt AO, PRE\r
- dcbt BO, PRE\r
- KERNEL2x8_L 128,32,0,0 \r
- KERNEL2x8_L 128,32,1,0\r
- dcbt AO, T2 \r
- KERNEL2x8_L 128,32,2,0\r
- KERNEL2x8_L 128,32,3,0 \r
- dcbt AO, T3\r
- dcbt BO, T2\r
- KERNEL2x8_L 128,32,4,0\r
- KERNEL2x8_L 128,32,5,0\r
- dcbt AO, T4 \r
- KERNEL2x8_L 128,32,6,0\r
- KERNEL2x8_L 128,32,7,0 \r
- dcbt AO, T5 \r
- dcbt BO, T3\r
- KERNEL2x8_L 128,32,8,0\r
- KERNEL2x8_L 128,32,9,0\r
- KERNEL2x8_L 128,32,10,0\r
- KERNEL2x8_L 128,32,11,0 \r
- dcbt BO, T4\r
- KERNEL2x8_L 128,32,12,0\r
- KERNEL2x8_L 128,32,13,0\r
- KERNEL2x8_L 128,32,14,0\r
- KERNEL2x8_L 128,32,15,1\r
- blr\r
+/*----------------------------------------*/ \r
+ LOAD2x8_2 \r
+ dcbt AO, PRE\r
+ dcbt BO, PRE\r
+ KERNEL2x8_L2 256,64,0,0 \r
+ KERNEL2x8_L2 256,64,1,0\r
+ dcbt AO, T2 \r
+ KERNEL2x8_L2 256,64,2,0\r
+ KERNEL2x8_L2 256,64,3,0 \r
+ dcbt AO, T3\r
+ dcbt BO, T2\r
+ KERNEL2x8_L2 256,64,4,0\r
+ KERNEL2x8_L2 256,64,5,0\r
+ dcbt AO, T4 \r
+ KERNEL2x8_L2 256,64,6,0\r
+ KERNEL2x8_L2 256,64,7,0 \r
+ dcbt AO, T5 \r
+ dcbt BO, T3\r
+ KERNEL2x8_L2 256,64,8,0\r
+ KERNEL2x8_L2 256,64,9,0\r
+ KERNEL2x8_L2 256,64,10,0\r
+ KERNEL2x8_L2 256,64,11,0 \r
+ dcbt BO, T4\r
+ KERNEL2x8_L2 256,64,12,0\r
+ KERNEL2x8_L2 256,64,13,0\r
+ KERNEL2x8_L2 256,64,14,0\r
+ KERNEL2x8_E2 256,64,15,1\r
+ blr\r
MY_ALIGN\r
\r
+\r
ZGEMM_2x8_L16_SUB:\r
- LOAD2x8 0 \r
- dcbt AO, PRE\r
- dcbt BO, PRE\r
- KERNEL2x8_L 128,32,0,0 \r
- KERNEL2x8_L 128,32,1,0\r
- dcbt AO, T2 \r
- KERNEL2x8_L 128,32,2,0\r
- KERNEL2x8_L 128,32,3,0 \r
- dcbt AO, T3\r
- dcbt BO, T2\r
- KERNEL2x8_L 128,32,4,0\r
- KERNEL2x8_L 128,32,5,0\r
- dcbt AO, T4 \r
- KERNEL2x8_L 128,32,6,0\r
- KERNEL2x8_L 128,32,7,1\r
- blr\r
- MY_ALIGN\r
+/*----------------------------------------*/ \r
+ LOAD2x8_2 \r
+ dcbt AO, PRE\r
+ dcbt BO, PRE\r
+ KERNEL2x8_L2 256,64,0,0 \r
+ KERNEL2x8_L2 256,64,1,0\r
+ dcbt AO, T2 \r
+ KERNEL2x8_L2 256,64,2,0\r
+ KERNEL2x8_L2 256,64,3,0 \r
+ dcbt AO, T3\r
+ dcbt BO, T2\r
+ KERNEL2x8_L2 256,64,4,0\r
+ KERNEL2x8_L2 256,64,5,0\r
+ dcbt AO, T4 \r
+ KERNEL2x8_L2 256,64,6,0\r
+ KERNEL2x8_E2 256,64,7,1\r
+ blr\r
+ MY_ALIGN\r
+\r
\r
ZGEMM_2x4_LMAIN_SUB:\r
- mtctr L\r
- LOAD2x4 0 \r
- MY_ALIGN\r
-ZGEMM_L2x4_LOOP: \r
- KERNEL2x4_L 64,32,0,0\r
- KERNEL2x4_L 64,32,1,0 \r
- KERNEL2x4_L 64,32,2,0\r
- KERNEL2x4_L 64,32,3,0 \r
- KERNEL2x4_L 64,32,4,0\r
- KERNEL2x4_L 64,32,5,0 \r
- KERNEL2x4_L 64,32,6,0\r
- KERNEL2x4_L 64,32,7,0\r
- KERNEL2x4_L 64,32,8,0\r
- KERNEL2x4_L 64,32,9,0 \r
- KERNEL2x4_L 64,32,10,0\r
- KERNEL2x4_L 64,32,11,0 \r
- KERNEL2x4_L 64,32,12,0\r
- KERNEL2x4_L 64,32,13,0 \r
- KERNEL2x4_L 64,32,14,0\r
- KERNEL2x4_L 64,32,15,1 \r
- bdnz ZGEMM_L2x4_LOOP\r
- MY_ALIGN \r
+/*----------------------------------------*/ \r
+ mtctr T8\r
+ LOAD2x4_2 \r
+ MY_ALIGN\r
+ZGEMM_L2x4_LOOP:\r
+/*----------------------------------------*/ \r
+ KERNEL2x4_L2 128,64,0,0\r
+ZGEMM_L2x4_K32:\r
+/*----------------------------------------*/ \r
+ KERNEL2x4_L2 128,64,1,0 \r
+ KERNEL2x4_L2 128,64,2,0\r
+ KERNEL2x4_L2 128,64,3,0 \r
+ KERNEL2x4_L2 128,64,4,0\r
+ KERNEL2x4_L2 128,64,5,0 \r
+ KERNEL2x4_L2 128,64,6,0\r
+ KERNEL2x4_L2 128,64,7,0\r
+ KERNEL2x4_L2 128,64,8,0\r
+ KERNEL2x4_L2 128,64,9,0 \r
+ KERNEL2x4_L2 128,64,10,0\r
+ KERNEL2x4_L2 128,64,11,0 \r
+ KERNEL2x4_L2 128,64,12,0\r
+ KERNEL2x4_L2 128,64,13,0 \r
+ KERNEL2x4_L2 128,64,14,0\r
+ KERNEL2x4_L2 128,64,15,1 \r
+ bdnz ZGEMM_L2x4_LOOP\r
+ MY_ALIGN \r
ZGEMM_L2x4_LOOP_END:\r
- END2x4 AO, BO, 64,32 \r
- blr\r
-\r
+/*----------------------------------------*/ \r
+ END2x4_2 \r
+ blr\r
MY_ALIGN\r
+\r
+\r
ZGEMM_2x4_L16_SUB:\r
- LOAD2x4 0 \r
- KERNEL2x4_L 64,32, 0,0\r
- KERNEL2x4_L 64,32, 1,0\r
- KERNEL2x4_L 64,32, 2,0\r
- KERNEL2x4_L 64,32, 3,0\r
- KERNEL2x4_L 64,32, 4,0\r
- KERNEL2x4_L 64,32, 5,0\r
- KERNEL2x4_L 64,32, 6,0\r
- KERNEL2x4_E 64,32, 7,1\r
+/*----------------------------------------*/ \r
+ LOAD2x4_2\r
+ KERNEL2x4_L2 128,64,0,0\r
+ KERNEL2x4_L2 128,64,1,0 \r
+ KERNEL2x4_L2 128,64,2,0\r
+ KERNEL2x4_L2 128,64,3,0 \r
+ KERNEL2x4_L2 128,64,4,0\r
+ KERNEL2x4_L2 128,64,5,0 \r
+ KERNEL2x4_L2 128,64,6,0\r
+ KERNEL2x4_E2 128,64,7,1\r
blr\r
-\r
MY_ALIGN\r
+\r
+\r
ZGEMM_2x4_L8_SUB:\r
- LOAD2x4 0 \r
- KERNEL2x4_L 64,32, 0,0\r
- KERNEL2x4_L 64,32, 1,0\r
- KERNEL2x4_L 64,32, 2,0\r
- KERNEL2x4_E 64,32, 3,1\r
+/*----------------------------------------*/ \r
+ LOAD2x4_2\r
+ KERNEL2x4_L2 128,64,0,0\r
+ KERNEL2x4_L2 128,64,1,0 \r
+ KERNEL2x4_L2 128,64,2,0\r
+ KERNEL2x4_E2 128,64,3,1 \r
+ blr\r
+\r
+\r
+ZGEMM_2x2_LMAIN_SUB:\r
+/*----------------------------------------*/ \r
+ mtctr T8\r
+ LOAD2x2_2 \r
+ MY_ALIGN \r
+ZGEMM_L2x2_LOOP:\r
+/*----------------------------------------*/ \r
+ KERNEL2x2_L2 64,64,0,0 \r
+ZGEMM_L2x2_K32:\r
+/*----------------------------------------*/ \r
+ KERNEL2x2_L2 64,64,1,0 \r
+ KERNEL2x2_L2 64,64,2,0\r
+ KERNEL2x2_L2 64,64,3,0 \r
+ KERNEL2x2_L2 64,64,4,0\r
+ KERNEL2x2_L2 64,64,5,0 \r
+ KERNEL2x2_L2 64,64,6,0\r
+ KERNEL2x2_L2 64,64,7,0\r
+ KERNEL2x2_L2 64,64,8,0\r
+ KERNEL2x2_L2 64,64,9,0 \r
+ KERNEL2x2_L2 64,64,10,0\r
+ KERNEL2x2_L2 64,64,11,0 \r
+ KERNEL2x2_L2 64,64,12,0\r
+ KERNEL2x2_L2 64,64,13,0 \r
+ KERNEL2x2_L2 64,64,14,0\r
+ KERNEL2x2_L2 64,64,15,1 \r
+ bdnz ZGEMM_L2x2_LOOP\r
+ MY_ALIGN \r
+\r
+\r
+ZGEMM_L2x2_LOOP_END:\r
+/*----------------------------------------*/ \r
+ END2x2_2 \r
+ blr\r
+ MY_ALIGN\r
+ZGEMM_2x2_L16_SUB:\r
+/*----------------------------------------*/ \r
+ LOAD2x2_2\r
+ KERNEL2x2_L2 64,64,0,0\r
+ KERNEL2x2_L2 64,64,1,0 \r
+ KERNEL2x2_L2 64,64,2,0\r
+ KERNEL2x2_L2 64,64,3,0 \r
+ KERNEL2x2_L2 64,64,4,0\r
+ KERNEL2x2_L2 64,64,5,0 \r
+ KERNEL2x2_L2 64,64,6,0\r
+ KERNEL2x2_E2 64,64,7,1\r
+ blr\r
+ MY_ALIGN\r
+ZGEMM_2x2_L8_SUB:\r
+/*----------------------------------------*/ \r
+ LOAD2x2_2\r
+ KERNEL2x2_L2 64,64,0,0\r
+ KERNEL2x2_L2 64,64,1,0 \r
+ KERNEL2x2_L2 64,64,2,0\r
+ KERNEL2x2_E2 64,64,3,1 \r
+ blr\r
+\r
+\r
+ZGEMM_2x1_LMAIN_SUB:\r
+/*----------------------------------------*/ \r
+ mtctr T8\r
+ LOAD2x1_2 \r
+ MY_ALIGN\r
+ZGEMM_L2x1_LOOP:\r
+/*----------------------------------------*/ \r
+ KERNEL2x1_L2 32,64,0,0 \r
+ZGEMM_L2x1_K32:\r
+/*----------------------------------------*/ \r
+ KERNEL2x1_L2 32,64,1,0 \r
+ KERNEL2x1_L2 32,64,2,0\r
+ KERNEL2x1_L2 32,64,3,0 \r
+ KERNEL2x1_L2 32,64,4,0\r
+ KERNEL2x1_L2 32,64,5,0 \r
+ KERNEL2x1_L2 32,64,6,0\r
+ KERNEL2x1_L2 32,64,7,0\r
+ KERNEL2x1_L2 32,64,8,0\r
+ KERNEL2x1_L2 32,64,9,0 \r
+ KERNEL2x1_L2 32,64,10,0\r
+ KERNEL2x1_L2 32,64,11,0 \r
+ KERNEL2x1_L2 32,64,12,0\r
+ KERNEL2x1_L2 32,64,13,0 \r
+ KERNEL2x1_L2 32,64,14,0\r
+ KERNEL2x1_L2 32,64,15,1 \r
+ bdnz ZGEMM_L2x1_LOOP\r
+ MY_ALIGN \r
+ZGEMM_L2x1_LOOP_END:\r
+/*----------------------------------------*/ \r
+ END2x1_2 \r
+ blr\r
+\r
+ MY_ALIGN\r
+ZGEMM_2x1_L16_SUB:\r
+/*----------------------------------------*/ \r
+ LOAD2x1_2\r
+ KERNEL2x1_L2 32,64,0,0\r
+ KERNEL2x1_L2 32,64,1,0 \r
+ KERNEL2x1_L2 32,64,2,0\r
+ KERNEL2x1_L2 32,64,3,0 \r
+ KERNEL2x1_L2 32,64,4,0\r
+ KERNEL2x1_L2 32,64,5,0 \r
+ KERNEL2x1_L2 32,64,6,0\r
+ KERNEL2x1_E2 32,64,7,1\r
+ blr\r
+ MY_ALIGN\r
+\r
+\r
+ZGEMM_2x1_L8_SUB:\r
+/*----------------------------------------*/ \r
+ LOAD2x1_2\r
+ KERNEL2x1_L2 32,64,0,0\r
+ KERNEL2x1_L2 32,64,1,0 \r
+ KERNEL2x1_L2 32,64,2,0\r
+ KERNEL2x1_E2 32,64,3,1 \r
blr\r
\r
-/* MAIN LOOP BEGINS */\r
\r
- MY_ALIGN\r
+\r
+/* MAIN LOOP BEGINS */ \r
+ MY_ALIGN\r
+\r
+\r
ZGEMM_L2:\r
- srawi. J, N, 1\r
- ble ZGEMM_L2_END\r
+/*----------------------------------------*/ \r
+#if defined(TRMMKERNEL) && !defined(LEFT) \r
+ neg TEMP_REG, OFFSET \r
+#endif \r
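+/* TRMM with B-side offsets (!LEFT): start the running offset at -OFFSET; */ \r
+/* it is advanced by 2 after each N=2 sweep (see ZGEMM_L2x1_END). */ \r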
+ srawi. J, N, 1\r
+ ble ZGEMM_L2_END\r
+\r
\r
ZGEMM_L2_BEGIN:\r
- mr CO, C\r
- slwi T1, LDC , 1 \r
+/*----------------------------------------*/ \r
+ mr CO, C\r
+ slwi T1, LDC , 1 \r
add T2,C,LDC \r
- mr AO, A \r
- add C, C, T1\r
- srawi. I, M, 3\r
- ble ZGEMM_L2x8_END\r
+ mr AO, A \r
+ add C, C, T1\r
+#if defined(TRMMKERNEL) && defined(LEFT) \r
+ mr TEMP_REG, OFFSET /*off = offset;*/\r
+#endif \r
+ srawi. I, M, 3\r
+ ble ZGEMM_L2x8_END\r
dcbt CO,r0 /*just prefetch*/\r
dcbt T2,r0 \r
-ZGEMM_L2x8_BEGIN: \r
- mr T1, K\r
- mr BO, B \r
- dcbt B, r0 \r
- dcbt AO, r0 \r
- /* TEMPS FOR PREFETCH */\r
- li T2, 1024\r
- li T3, 1024+512\r
-\r
- addi T1,T1, -1\r
- /* TEMPS FOR PREFETCH */ \r
- li T4, 2048\r
- li T5, 2048+512 \r
- srawi. L, T1, 7 /**(K-1) % 128x */ \r
-\r
- ZERO2x8 \r
- ble ZGEMM_L2x8_SUB0\r
- bl ZGEMM_L2x8_LMAIN_SUB \r
- \r
- andi. L, T1, 127\r
- ble ZGEMM_L2x8_SAVE\r
- b ZGEMM_L2x8_SUB2\r
- \r
-ZGEMM_L2x8_SUB0: \r
- andi. L, K, 255\r
+\r
+\r
+ZGEMM_L2x8_BEGIN:\r
+/*----------------------------------------*/ \r
+#if defined(TRMMKERNEL) \r
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2\r
+#else \r
+ mr BO, B \r
+ dcbt B, r0 \r
+#endif \r
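+/* REFRESH_POINTERS / REFRESH_TEMP_BK / REFRESH_AFTER_SAVE are TRMM helper */ \r
+/* macros defined outside this excerpt; T6 receives the effective K span */ \r
+/* for the current tile. */ \r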
+ dcbt AO, r0\r
+#if defined(TRMMKERNEL) \r
+ REFRESH_TEMP_BK T6,K,TEMP_REG,8,2\r
+ mr T1, T6\r
+/* TEMPS FOR PREFETCH */ \r
+ li T2, 1024\r
+ li T3, 1024+512\r
+ addi T1,T1, -2\r
+/* TEMPS FOR PREFETCH */ \r
+ li T4, 2048\r
+ li T5, 2048+512 \r
+	srawi. T8, T1, 7 /* T8 = (T6-2) / 128 */\r
+#else \r
+ mr T1, K\r
+/* TEMPS FOR PREFETCH */ \r
+ li T2, 1024\r
+ li T3, 1024+512\r
+ addi T1,T1, -2\r
+/* TEMPS FOR PREFETCH */ \r
+ li T4, 2048\r
+ li T5, 2048+512 \r
+	srawi. T8, T1, 7 /* T8 = (K-2) / 128 */\r
+#endif \r
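+/* CTR counts 128-iteration blocks of K-2: two iterations live outside the */ \r
+/* counted loop (the pair preloaded by LOAD2x8_2 is drained by END2x8_2). */ \r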
+ ZERO2x8 \r
+ ble ZGEMM_L2x8_SUB0\r
+ bl ZGEMM_L2x8_LMAIN_SUB\r
+ andi. L, T1, 127\r
+ ble ZGEMM_L2x8_SAVE\r
+ b ZGEMM_L2x8_SUB2\r
+\r
+\r
+ZGEMM_L2x8_SUB0:\r
+/*----------------------------------------*/ \r
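+/* Whole-K fast paths: K==129 peels one leading iteration via LOAD2x8O + */ \r
+/* END2x8_WITHOUT_ADD, then both it and K==128 run the unrolled body once */ \r
+/* (CTR=1). The negative addi pre-bias AO/BO so the displacement-addressed */ \r
+/* *O load macros land on the current elements. */ \r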
+#if defined(TRMMKERNEL) \r
+ andi. L, T6, 255\r
+ cmpwi T6,129\r
+#else \r
+ andi. L, K, 255\r
+ cmpwi K,129\r
+#endif \r
+ li T8,1\r
+ bne CMP2x8_128K\r
+ addi BO,BO,-32\r
+ addi AO,AO,-128 \r
+ LOAD2x8O 128,32 \r
+ END2x8_WITHOUT_ADD \r
+ LOAD2x8_2O 256, 64 \r
+ mtctr T8 \r
+ bl ZGEMM_L2x8_K128 \r
+ b ZGEMM_L2x8_SAVE \r
+ CMP2x8_128K:\r
+/*----------------------------------------*/ \r
+#if defined(TRMMKERNEL) \r
+ cmpwi T6,128\r
+#else \r
cmpwi K,128\r
- bne ZGEMM_L2x8_SUB2 \r
- MY_ALIGN \r
-ZGEMM_L2x8_SUB2_128:\r
- bl ZGEMM_2x8_L64_SUB\r
- bl ZGEMM_2x8_L64_SUB \r
- b ZGEMM_L2x8_SAVE \r
+#endif \r
+ bne ZGEMM_L2x8_SUB2 \r
+ MY_ALIGN \r
+ mtctr T8\r
+ addi BO,BO,-64\r
+ addi AO,AO,-256 \r
+ LOAD2x8_2O 256,64\r
+ bl ZGEMM_L2x8_K128 \r
+ b ZGEMM_L2x8_SAVE \r
MY_ALIGN\r
+\r
+\r
ZGEMM_L2x8_SUB2:\r
+/*----------------------------------------*/ \r
andi. T1,L, 64\r
- ble ZGEMM_L2x8_SUB2_32\r
- bl ZGEMM_2x8_L64_SUB\r
+ ble ZGEMM_L2x8_SUB2_32\r
+ bl ZGEMM_2x8_L64_SUB\r
MY_ALIGN\r
+\r
+\r
ZGEMM_L2x8_SUB2_32:\r
+/*----------------------------------------*/ \r
andi. T1,L, 32\r
- ble ZGEMM_L2x8_SUB2_16 \r
- bl ZGEMM_2x8_L32_SUB\r
+ ble ZGEMM_L2x8_SUB2_16 \r
+ bl ZGEMM_2x8_L32_SUB\r
MY_ALIGN \r
+\r
+\r
ZGEMM_L2x8_SUB2_16:\r
+/*----------------------------------------*/ \r
andi. T1,L, 16\r
ble ZGEMM_L2x8_SUB2_8\r
- bl ZGEMM_2x8_L16_SUB \r
- MY_ALIGN \r
+ bl ZGEMM_2x8_L16_SUB \r
+ MY_ALIGN \r
+\r
+\r
ZGEMM_L2x8_SUB2_8:\r
+/*----------------------------------------*/ \r
andi. T1,L, 8\r
ble ZGEMM_L2x8_SUB2_4\r
- LOAD2x8 0 \r
- KERNEL2x8_L 128,32, 0,0\r
- KERNEL2x8_L 128,32, 1,0\r
- KERNEL2x8_L 128,32, 2,0\r
- KERNEL2x8_E 128,32, 3,1\r
- MY_ALIGN \r
+ LOAD2x8_2\r
+ KERNEL2x8_L2 256,64, 0,0\r
+ KERNEL2x8_L2 256,64, 1,0\r
+ KERNEL2x8_L2 256,64, 2,0\r
+ KERNEL2x8_E2 256,64, 3,1\r
+ MY_ALIGN \r
+\r
+\r
ZGEMM_L2x8_SUB2_4:\r
+/*----------------------------------------*/ \r
andi. T1,L, 4\r
ble ZGEMM_L2x8_SUB2_2\r
- LOAD2x8 0 \r
- KERNEL2x8_L 128,32, 0,0\r
- KERNEL2x8_E 128,32, 1,1\r
+ LOAD2x8_2\r
+ KERNEL2x8_L2 256,64, 0,0\r
+ KERNEL2x8_E2 256,64, 1,1\r
MY_ALIGN\r
+\r
+\r
ZGEMM_L2x8_SUB2_2:\r
+/*----------------------------------------*/ \r
andi. T1,L, 2\r
ble ZGEMM_L2x8_SUB2_1\r
- LOAD2x8 0 \r
- KERNEL2x8_E 128,32, 0,1\r
+ LOAD2x8_2 \r
+ KERNEL2x8_E2 256,64, 0,1\r
MY_ALIGN \r
+\r
+\r
ZGEMM_L2x8_SUB2_1:\r
+/*----------------------------------------*/ \r
andi. T1,L, 1\r
- ble ZGEMM_L2x8_SAVE \r
- KERNEL2x8 \r
+ ble ZGEMM_L2x8_SAVE \r
+ KERNEL2x8\r
\r
-ZGEMM_L2x8_SAVE:\r
- addic. I, I, -1\r
- SAVE2x8\r
\r
- bgt ZGEMM_L2x8_BEGIN\r
+ZGEMM_L2x8_SAVE:\r
+/*----------------------------------------*/ \r
+ addic. I, I, -1\r
+ SAVE2x8\r
+#if defined(TRMMKERNEL) \r
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,2\r
+#endif \r
+ bgt ZGEMM_L2x8_BEGIN\r
+ andi. T2, M, 7\r
+ ble ZGEMM_L2x1_END\r
+ andi. T1, M, 4\r
+ ble ZGEMM_L2x4_END\r
+ b ZGEMM_L2x4_BEGIN\r
+ MY_ALIGN \r
\r
- andi. T2, M, 7\r
- ble ZGEMM_L2x1_END\r
\r
- andi. T1, M, 4\r
- ble ZGEMM_L2x4_END\r
- b ZGEMM_L2x4_BEGIN\r
- MY_ALIGN \r
ZGEMM_L2x8_END:\r
+/*----------------------------------------*/ \r
\r
-ZGEMM_L2x4_BEGIN:\r
-\r
- andi. T2, M, 7\r
- ble ZGEMM_L2x1_END\r
\r
- andi. T1, M, 4\r
- ble ZGEMM_L2x4_END\r
- mr BO, B\r
- mr T1, K\r
- addi T1,T1, -1\r
- ZERO2x4 \r
- srawi. L, T1, 5 /**(K-1) % 32x */ \r
-\r
- ble ZGEMM_L2x4_SUB0 \r
+ZGEMM_L2x4_BEGIN:\r
+/*----------------------------------------*/ \r
+ andi. T2, M, 7\r
+ ble ZGEMM_L2x1_END\r
+ andi. T1, M, 4\r
+ ble ZGEMM_L2x4_END\r
+#if defined(TRMMKERNEL) \r
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2\r
+#else \r
+ mr BO, B \r
+#endif \r
+#if defined(TRMMKERNEL) \r
+ REFRESH_TEMP_BK T6,K,TEMP_REG,4,2\r
+ mr T1, T6 \r
+ addi T1,T1, -2 \r
+	srawi. T8, T1, 5 /* T8 = (T6-2) / 32 */\r
+#else \r
+ mr T1, K \r
+ addi T1,T1, -2\r
+	srawi. T8, T1, 5 /* T8 = (K-2) / 32 */\r
+#endif \r
+ ZERO2x4\r
+ ble ZGEMM_L2x4_SUB0 \r
bl ZGEMM_2x4_LMAIN_SUB\r
- andi. L, T1, 31\r
- ble ZGEMM_L2x4_SAVE\r
- b ZGEMM_L2x4_SUB2\r
+ andi. L, T1, 31\r
+ ble ZGEMM_L2x4_SAVE\r
+ b ZGEMM_L2x4_SUB2\r
+\r
\r
ZGEMM_L2x4_SUB0:\r
- andi. L, K, 63\r
+/*----------------------------------------*/ \r
+#if defined(TRMMKERNEL) \r
+ andi. L, T6, 63\r
+ cmpwi T6,33\r
+#else \r
+ andi. L, K, 63\r
+ cmpwi K,33\r
+#endif \r
+ li T8,1\r
+ bne CMP2x4_32K\r
+ addi BO,BO,-32\r
+ addi AO,AO,-64 \r
+ LOAD2x4O 64,32 \r
+ END2x4_WITHOUT_ADD \r
+ LOAD2x4_2O 128, 64 \r
+ mtctr T8 \r
+ bl ZGEMM_L2x4_K32 \r
+ b ZGEMM_L2x4_SAVE \r
+ CMP2x4_32K:\r
+/*----------------------------------------*/ \r
+#if defined(TRMMKERNEL) \r
+ cmpwi T6,32\r
+#else \r
cmpwi K,32\r
- bne ZGEMM_L2x4_SUB2 \r
- MY_ALIGN \r
-ZGEMM_L2x4_SUB2_32:\r
- bl ZGEMM_2x4_L16_SUB\r
- bl ZGEMM_2x4_L16_SUB \r
- b ZGEMM_L2x4_SAVE \r
+#endif \r
+ bne ZGEMM_L2x4_SUB2 \r
+ MY_ALIGN \r
+ mtctr T8\r
+ addi BO,BO,-64\r
+ addi AO,AO,-128 \r
+ LOAD2x4_2O 128,64\r
+ bl ZGEMM_L2x4_K32 \r
+ b ZGEMM_L2x4_SAVE \r
MY_ALIGN \r
-ZGEMM_L2x4_SUB2: \r
+\r
+\r
+ZGEMM_L2x4_SUB2:\r
+/*----------------------------------------*/ \r
andi. T1,L, 16\r
ble ZGEMM_L2x4_SUB2_8\r
- bl ZGEMM_2x4_L16_SUB \r
+ bl ZGEMM_2x4_L16_SUB \r
MY_ALIGN\r
-ZGEMM_L2x4_SUB2_8: \r
+\r
+\r
+ZGEMM_L2x4_SUB2_8:\r
+/*----------------------------------------*/ \r
andi. T1,L, 8\r
ble ZGEMM_L2x4_SUB2_4\r
bl ZGEMM_2x4_L8_SUB\r
MY_ALIGN \r
+\r
+\r
ZGEMM_L2x4_SUB2_4:\r
+/*----------------------------------------*/ \r
andi. T1,L, 4\r
ble ZGEMM_L2x4_SUB2_2\r
- LOAD2x4 0 \r
- KERNEL2x4_L 64,32, 0,0\r
- KERNEL2x4_E 64,32, 1,1\r
+ LOAD2x4_2\r
+ KERNEL2x4_L2 128,64, 0,0\r
+ KERNEL2x4_E2 128,64, 1,1\r
MY_ALIGN\r
+\r
+\r
ZGEMM_L2x4_SUB2_2:\r
+/*----------------------------------------*/ \r
andi. T1,L, 2\r
ble ZGEMM_L2x4_SUB2_1\r
- LOAD2x4 0 \r
- KERNEL2x4_E 64,32, 0,1\r
+ LOAD2x4_2\r
+ KERNEL2x4_E2 128,64, 0,1\r
MY_ALIGN \r
+\r
+\r
ZGEMM_L2x4_SUB2_1:\r
+/*----------------------------------------*/ \r
andi. T1,L, 1\r
- ble ZGEMM_L2x4_SAVE \r
- KERNEL2x4 \r
+ ble ZGEMM_L2x4_SAVE \r
+ KERNEL2x4\r
+\r
\r
ZGEMM_L2x4_SAVE:\r
+/*----------------------------------------*/ \r
+ SAVE2x4\r
+#if defined(TRMMKERNEL) \r
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,2\r
+#endif \r
\r
- SAVE2x4\r
\r
ZGEMM_L2x4_END:\r
+/*----------------------------------------*/ \r
+\r
+\r
+ZGEMM_L2x2_BEGIN:\r
+/*----------------------------------------*/ \r
+ andi. T1, M, 2\r
+ ble ZGEMM_L2x2_END\r
+#if defined(TRMMKERNEL) \r
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2\r
+#else \r
+ mr BO, B \r
+#endif \r
+#if defined(TRMMKERNEL) \r
+ REFRESH_TEMP_BK T6,K,TEMP_REG,2,2\r
+ mr T1, T6 \r
+ addi T1,T1, -2 \r
+	srawi. T8, T1, 5 /* T8 = (T6-2) / 32 */\r
+#else \r
+ mr T1, K \r
+ addi T1,T1, -2\r
+	srawi. T8, T1, 5 /* T8 = (K-2) / 32 */\r
+#endif \r
+ ZERO2x2\r
+ ble ZGEMM_L2x2_SUB0 \r
+ bl ZGEMM_2x2_LMAIN_SUB\r
+ andi. L, T1, 31\r
+ ble ZGEMM_L2x2_SAVE\r
+ b ZGEMM_L2x2_SUB2\r
+\r
\r
-ZGEMM_L2x2_BEGIN: \r
-\r
- andi. T1, M, 2\r
- ble ZGEMM_L2x2_END\r
- mr BO, B\r
- mr T1, K\r
- addi T1,T1, -1\r
- srawi. L, T1, 4 /**(K-1) % 16x */ \r
- ZERO2x2 \r
- ble ZGEMM_L2x2_SUB0 \r
-\r
-ZGEMM_L2x2_LOOP_START:\r
- LOAD2x2 0 \r
- mtctr L\r
-\r
- MY_ALIGN\r
-ZGEMM_L2x2_LOOP: \r
- KERNEL2x2_L 32,32,0,0\r
- KERNEL2x2_L 32,32,1,0 \r
- KERNEL2x2_L 32,32,2,0\r
- KERNEL2x2_L 32,32,3,0 \r
- KERNEL2x2_L 32,32,4,0\r
- KERNEL2x2_L 32,32,5,0 \r
- KERNEL2x2_L 32,32,6,0\r
- KERNEL2x2_L 32,32,7,1 \r
- bdnz ZGEMM_L2x2_LOOP\r
- MY_ALIGN \r
-ZGEMM_L2x2_LOOP_END:\r
- END2x2 AO, BO, 32,32 \r
- \r
- b ZGEMM_L2x2_SUB1\r
- \r
ZGEMM_L2x2_SUB0:\r
+/*----------------------------------------*/ \r
+#if defined(TRMMKERNEL) \r
+ andi. L, T6, 63\r
+ cmpwi T6,33\r
+#else \r
+ andi. L, K, 63\r
+ cmpwi K,33\r
+#endif \r
+ li T8,1\r
+ bne CMP2x2_32K\r
+ addi BO,BO,-32\r
+ addi AO,AO,-32 \r
+ LOAD2x2O 32,32 \r
+ END2x2_WITHOUT_ADD \r
+ LOAD2x2_2O 64, 64 \r
+ mtctr T8 \r
+ bl ZGEMM_L2x2_K32 \r
+ b ZGEMM_L2x2_SAVE \r
+ CMP2x2_32K:\r
+/*----------------------------------------*/ \r
+#if defined(TRMMKERNEL) \r
+ cmpwi T6,32\r
+#else \r
+ cmpwi K,32\r
+#endif \r
+ bne ZGEMM_L2x2_SUB2 \r
+ MY_ALIGN \r
+ mtctr T8\r
+ addi BO,BO,-64\r
+ addi AO,AO,-64 \r
+ LOAD2x2_2O 64,64\r
+ bl ZGEMM_L2x2_K32 \r
+ b ZGEMM_L2x2_SAVE \r
+	MY_ALIGN \r
\r
- andi. L, K, 31\r
- \r
- b ZGEMM_L2x2_SUB2\r
\r
-ZGEMM_L2x2_SUB1:\r
+ZGEMM_L2x2_SUB2:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 16\r
+ ble ZGEMM_L2x2_SUB2_8\r
+ bl ZGEMM_2x2_L16_SUB \r
+ MY_ALIGN\r
\r
- andi. L, T1, 15\r
- ble ZGEMM_L2x2_SAVE\r
\r
-ZGEMM_L2x2_SUB2:\r
- srawi. T1,L, 3\r
+ZGEMM_L2x2_SUB2_8:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 8\r
ble ZGEMM_L2x2_SUB2_4\r
- mtctr T1\r
- MY_ALIGN\r
-ZGEMM_L2x2_SUB2_LOOP:\r
- LOAD2x2 0 \r
- KERNEL2x2_L 32,32, 0,0\r
- KERNEL2x2_L 32,32, 1,0\r
- KERNEL2x2_L 32,32, 2,0\r
- KERNEL2x2_E 32,32, 3,1\r
- bdnz ZGEMM_L2x2_SUB2_LOOP \r
+ bl ZGEMM_2x2_L8_SUB\r
MY_ALIGN \r
+\r
+\r
ZGEMM_L2x2_SUB2_4:\r
+/*----------------------------------------*/ \r
andi. T1,L, 4\r
ble ZGEMM_L2x2_SUB2_2\r
- LOAD2x2 0 \r
- KERNEL2x2_L 32,32, 0,0\r
- KERNEL2x2_E 32,32, 1,1\r
+ LOAD2x2_2\r
+ KERNEL2x2_L2 64,64, 0,0\r
+ KERNEL2x2_E2 64,64, 1,1\r
MY_ALIGN\r
+\r
+\r
ZGEMM_L2x2_SUB2_2:\r
+/*----------------------------------------*/ \r
andi. T1,L, 2\r
ble ZGEMM_L2x2_SUB2_1\r
- LOAD2x2 0 \r
- KERNEL2x2_E 32,32, 0,1\r
+ LOAD2x2_2\r
+ KERNEL2x2_E2 64,64, 0,1\r
MY_ALIGN \r
+\r
+\r
ZGEMM_L2x2_SUB2_1:\r
+/*----------------------------------------*/ \r
andi. T1,L, 1\r
- ble ZGEMM_L2x2_SAVE \r
- KERNEL2x2 \r
+ ble ZGEMM_L2x2_SAVE \r
+ KERNEL2x2\r
+\r
+\r
ZGEMM_L2x2_SAVE:\r
+/*----------------------------------------*/ \r
+ SAVE2x2\r
+#if defined(TRMMKERNEL) \r
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,2\r
+#endif \r
\r
- SAVE2x2\r
\r
ZGEMM_L2x2_END:\r
+/*----------------------------------------*/ \r
+\r
+\r
+ZGEMM_L2x1_BEGIN:\r
+/*----------------------------------------*/ \r
+ andi. T1, M, 1\r
+ ble ZGEMM_L2x1_END\r
+#if defined(TRMMKERNEL) \r
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2\r
+#else \r
+ mr BO, B \r
+#endif \r
+#if defined(TRMMKERNEL) \r
+ REFRESH_TEMP_BK T6,K,TEMP_REG,1,2\r
+ mr T1, T6 \r
+ addi T1,T1, -2 \r
+	srawi. T8, T1, 5 /* T8 = (T6-2) / 32 */\r
+#else \r
+ mr T1, K \r
+ addi T1,T1, -2\r
+	srawi. T8, T1, 5 /* T8 = (K-2) / 32 */\r
+#endif \r
+ ZERO2x1\r
+ ble ZGEMM_L2x1_SUB0 \r
+ bl ZGEMM_2x1_LMAIN_SUB\r
+ andi. L, T1, 31\r
+ ble ZGEMM_L2x1_SAVE\r
+ b ZGEMM_L2x1_SUB2\r
\r
\r
-\r
-ZGEMM_L2x1_BEGIN: \r
- andi. T1, M, 1\r
- ble ZGEMM_L2x1_END\r
- mr BO, B\r
- mr T1, K\r
- addi T1,T1, -1\r
- srawi. L, T1, 4 /**(K-1) % 16x */ \r
- ZERO2x1 \r
- ble ZGEMM_L2x1_SUB0 \r
-\r
-ZGEMM_L2x1_LOOP_START:\r
-\r
- LOAD2x1 0 \r
- mtctr L\r
-\r
- MY_ALIGN\r
-ZGEMM_L2x1_LOOP: \r
- KERNEL2x1_L 16,32,0,0\r
- KERNEL2x1_L 16,32,1,0 \r
- KERNEL2x1_L 16,32,2,0\r
- KERNEL2x1_L 16,32,3,0 \r
- KERNEL2x1_L 16,32,4,0\r
- KERNEL2x1_L 16,32,5,0 \r
- KERNEL2x1_L 16,32,6,0\r
- KERNEL2x1_L 16,32,7,1 \r
- bdnz ZGEMM_L2x1_LOOP\r
- MY_ALIGN \r
-ZGEMM_L2x1_LOOP_END:\r
- END2x1 AO, BO, 16,32 \r
- \r
- b ZGEMM_L2x1_SUB1\r
- \r
ZGEMM_L2x1_SUB0:\r
+/*----------------------------------------*/ \r
+#if defined(TRMMKERNEL) \r
+ andi. L, T6, 63\r
+ cmpwi T6,33\r
+#else \r
+ andi. L, K, 63\r
+ cmpwi K,33\r
+#endif \r
+ li T8,1\r
+ bne CMP2x1_32K\r
+ addi BO,BO,-32\r
+ addi AO,AO,-16 \r
+ LOAD2x1O 16,32 \r
+ END2x1_WITHOUT_ADD \r
+ LOAD2x1_2O 32, 64 \r
+ mtctr T8 \r
+ bl ZGEMM_L2x1_K32 \r
+ b ZGEMM_L2x1_SAVE \r
+ CMP2x1_32K:\r
+/*----------------------------------------*/ \r
+#if defined(TRMMKERNEL) \r
+ cmpwi T6,32\r
+#else \r
+ cmpwi K,32\r
+#endif \r
+ bne ZGEMM_L2x1_SUB2 \r
+ MY_ALIGN \r
+ mtctr T8\r
+ addi BO,BO,-64\r
+ addi AO,AO,-32 \r
+ LOAD2x1_2O 32,64\r
+ bl ZGEMM_L2x1_K32 \r
+ b ZGEMM_L2x1_SAVE \r
+ MY_ALIGN \r
+ MY_ALIGN \r
\r
- andi. L, K, 31\r
- \r
- b ZGEMM_L2x1_SUB2\r
\r
-ZGEMM_L2x1_SUB1:\r
+ZGEMM_L2x1_SUB2:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 16\r
+ ble ZGEMM_L2x1_SUB2_8\r
+ bl ZGEMM_2x1_L16_SUB \r
+ MY_ALIGN\r
\r
- andi. L, T1, 15\r
- ble ZGEMM_L2x1_SAVE\r
\r
-ZGEMM_L2x1_SUB2:\r
- srawi. T1,L, 3\r
+ZGEMM_L2x1_SUB2_8:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 8\r
ble ZGEMM_L2x1_SUB2_4\r
- mtctr T1\r
- MY_ALIGN\r
-ZGEMM_L2x1_SUB2_LOOP:\r
- LOAD2x1 0 \r
- KERNEL2x1_L 16,32, 0,0\r
- KERNEL2x1_L 16,32, 1,0\r
- KERNEL2x1_L 16,32, 2,0\r
- KERNEL2x1_E 16,32, 3,1\r
- bdnz ZGEMM_L2x1_SUB2_LOOP \r
+ bl ZGEMM_2x1_L8_SUB\r
MY_ALIGN \r
+\r
+\r
ZGEMM_L2x1_SUB2_4:\r
+/*----------------------------------------*/ \r
andi. T1,L, 4\r
ble ZGEMM_L2x1_SUB2_2\r
- LOAD2x1 0 \r
- KERNEL2x1_L 16,32, 0,0\r
- KERNEL2x1_E 16,32, 1,1\r
+ LOAD2x1_2\r
+ KERNEL2x1_L2 32,64, 0,0\r
+ KERNEL2x1_E2 32,64, 1,1\r
MY_ALIGN\r
+\r
+\r
ZGEMM_L2x1_SUB2_2:\r
+/*----------------------------------------*/ \r
andi. T1,L, 2\r
ble ZGEMM_L2x1_SUB2_1\r
- LOAD2x1 0 \r
- KERNEL2x1_E 16,32, 0,1\r
+ LOAD2x1_2\r
+ KERNEL2x1_E2 32,64, 0,1\r
MY_ALIGN \r
+\r
+\r
ZGEMM_L2x1_SUB2_1:\r
+/*----------------------------------------*/ \r
andi. T1,L, 1\r
- ble ZGEMM_L2x1_SAVE \r
- KERNEL2x1 \r
+ ble ZGEMM_L2x1_SAVE \r
+ KERNEL2x1\r
+\r
\r
ZGEMM_L2x1_SAVE:\r
+/*----------------------------------------*/ \r
+ SAVE2x1\r
+#if defined(TRMMKERNEL) \r
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,2\r
+#endif \r
\r
- SAVE2x1\r
\r
ZGEMM_L2x1_END:\r
+/*----------------------------------------*/ \r
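+/* advance B past the two packed columns just consumed: K*2*16 bytes = K<<5 */ \r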
+ slwi T1, K, 5\r
+ addic. J, J, -1\r
+ add B, B, T1\r
+#if defined(TRMMKERNEL) && !defined(LEFT) \r
+ addi TEMP_REG, TEMP_REG, 2\r
+#endif \r
+ bgt ZGEMM_L2_BEGIN\r
\r
- slwi T1, K, 5\r
- add B, B, T1\r
\r
- addic. J, J, -1\r
- bgt ZGEMM_L2_BEGIN\r
+ZGEMM_L2_END:\r
\r
- andi. T2, N, 1\r
- ble L999\r
+b ZGEMM_L1\r
+/* MINI SUBROUTINES */ \r
+/* 1x8 MAIN 128x+2 LOOP */ \r
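+/* The N=1 half mirrors the N=2 structure above; only the BO stride halves */ \r
+/* (one packed column: 16 bytes per K iteration, 32 per paired call). */ \r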
\r
-ZGEMM_L2_END:\r
\r
- b ZGEMM_L1_BEGIN\r
+ZGEMM_L1x8_LMAIN_SUB:\r
+/*----------------------------------------*/ \r
+ mtctr T8\r
+ LOAD1x8_2 \r
+ MY_ALIGN\r
+ZGEMM_L1x8_LOOP:\r
+/*----------------------------------------*/ \r
+ dcbt AO, PRE\r
+ dcbt BO, PRE\r
+ KERNEL1x8_L2 256,32,0,0 \r
+ZGEMM_L1x8_K128:\r
+/*----------------------------------------*/ \r
+ KERNEL1x8_L2 256,32,1,0\r
+ dcbt AO, T2 \r
+ KERNEL1x8_L2 256,32,2,0\r
+ KERNEL1x8_L2 256,32,3,0 \r
+ dcbt AO, T3\r
+ dcbt BO, T2\r
+ KERNEL1x8_L2 256,32,4,0\r
+ KERNEL1x8_L2 256,32,5,0\r
+ dcbt AO, T4 \r
+ KERNEL1x8_L2 256,32,6,0\r
+ KERNEL1x8_L2 256,32,7,0 \r
+ dcbt AO, T5 \r
+ dcbt BO, T3\r
+ KERNEL1x8_L2 256,32,8,0\r
+ KERNEL1x8_L2 256,32,9,0\r
+ KERNEL1x8_L2 256,32,10,0\r
+ KERNEL1x8_L2 256,32,11,0 \r
+ dcbt BO, T4\r
+ KERNEL1x8_L2 256,32,12,0\r
+ KERNEL1x8_L2 256,32,13,0\r
+ KERNEL1x8_L2 256,32,14,0\r
+ KERNEL1x8_L2 256,32,15,0 \r
+ KERNEL1x8_L2 256,32,16,0\r
+ KERNEL1x8_L2 256,32,17,0 \r
+ KERNEL1x8_L2 256,32,18,0\r
+ KERNEL1x8_L2 256,32,19,0 \r
+ KERNEL1x8_L2 256,32,20,0\r
+ KERNEL1x8_L2 256,32,21,0 \r
+ KERNEL1x8_L2 256,32,22,0\r
+ KERNEL1x8_L2 256,32,23,0 \r
+ KERNEL1x8_L2 256,32,24,0\r
+ KERNEL1x8_L2 256,32,25,0\r
+ KERNEL1x8_L2 256,32,26,0\r
+ KERNEL1x8_L2 256,32,27,0 \r
+ KERNEL1x8_L2 256,32,28,0\r
+ KERNEL1x8_L2 256,32,29,0\r
+ KERNEL1x8_L2 256,32,30,0\r
+ KERNEL1x8_L2 256,32,31,0 \r
+ KERNEL1x8_L2 256,32,32,0\r
+ KERNEL1x8_L2 256,32,33,0\r
+ KERNEL1x8_L2 256,32,34,0\r
+ KERNEL1x8_L2 256,32,35,0 \r
+ KERNEL1x8_L2 256,32,36,0\r
+ KERNEL1x8_L2 256,32,37,0\r
+ KERNEL1x8_L2 256,32,38,0\r
+ KERNEL1x8_L2 256,32,39,0 \r
+ KERNEL1x8_L2 256,32,40,0\r
+ KERNEL1x8_L2 256,32,41,0\r
+ KERNEL1x8_L2 256,32,42,0\r
+ KERNEL1x8_L2 256,32,43,0 \r
+ KERNEL1x8_L2 256,32,44,0\r
+ KERNEL1x8_L2 256,32,45,0\r
+ KERNEL1x8_L2 256,32,46,0\r
+ KERNEL1x8_L2 256,32,47,0 \r
+ KERNEL1x8_L2 256,32,48,0\r
+ KERNEL1x8_L2 256,32,49,0 \r
+ KERNEL1x8_L2 256,32,50,0\r
+ KERNEL1x8_L2 256,32,51,0 \r
+ KERNEL1x8_L2 256,32,52,0\r
+ KERNEL1x8_L2 256,32,53,0 \r
+ KERNEL1x8_L2 256,32,54,0\r
+ KERNEL1x8_L2 256,32,55,0 \r
+ KERNEL1x8_L2 256,32,56,0\r
+ KERNEL1x8_L2 256,32,57,0\r
+ KERNEL1x8_L2 256,32,58,0\r
+ KERNEL1x8_L2 256,32,59,0 \r
+ KERNEL1x8_L2 256,32,60,0\r
+ KERNEL1x8_L2 256,32,61,0\r
+ KERNEL1x8_L2 256,32,62,0 \r
+ KERNEL1x8_L2 256,32,63,1 \r
+ bdnz ZGEMM_L1x8_LOOP\r
+ MY_ALIGN \r
+ZGEMM_L1x8_LOOP_END:\r
+/*----------------------------------------*/ \r
+ END1x8_2\r
+ blr\r
+ MY_ALIGN\r
+\r
\r
-L999_H1:\r
+ZGEMM_1x8_L64_SUB:\r
+/*----------------------------------------*/ \r
+ LOAD1x8_2 \r
+ dcbt AO, PRE\r
+ dcbt BO, PRE\r
+ KERNEL1x8_L2 256,32,0,0 \r
+ KERNEL1x8_L2 256,32,1,0\r
+ dcbt AO, T2 \r
+ KERNEL1x8_L2 256,32,2,0\r
+ KERNEL1x8_L2 256,32,3,0 \r
+ dcbt AO, T3\r
+ dcbt BO, T2\r
+ KERNEL1x8_L2 256,32,4,0\r
+ KERNEL1x8_L2 256,32,5,0\r
+ dcbt AO, T4 \r
+ KERNEL1x8_L2 256,32,6,0\r
+ KERNEL1x8_L2 256,32,7,0 \r
+ dcbt AO, T5 \r
+ dcbt BO, T3\r
+ KERNEL1x8_L2 256,32,8,0\r
+ KERNEL1x8_L2 256,32,9,0\r
+ KERNEL1x8_L2 256,32,10,0\r
+ KERNEL1x8_L2 256,32,11,0 \r
+ dcbt BO, T4\r
+ KERNEL1x8_L2 256,32,12,0\r
+ KERNEL1x8_L2 256,32,13,0\r
+ KERNEL1x8_L2 256,32,14,0\r
+ KERNEL1x8_L2 256,32,15,0 \r
+ KERNEL1x8_L2 256,32,16,0\r
+ KERNEL1x8_L2 256,32,17,0 \r
+ KERNEL1x8_L2 256,32,18,0\r
+ KERNEL1x8_L2 256,32,19,0 \r
+ KERNEL1x8_L2 256,32,20,0\r
+ KERNEL1x8_L2 256,32,21,0 \r
+ KERNEL1x8_L2 256,32,22,0\r
+ KERNEL1x8_L2 256,32,23,0 \r
+ KERNEL1x8_L2 256,32,24,0\r
+ KERNEL1x8_L2 256,32,25,0\r
+ KERNEL1x8_L2 256,32,26,0\r
+ KERNEL1x8_L2 256,32,27,0 \r
+ KERNEL1x8_L2 256,32,28,0\r
+ KERNEL1x8_L2 256,32,29,0\r
+ KERNEL1x8_L2 256,32,30,0\r
+ KERNEL1x8_E2 256,32,31,1\r
+ blr\r
+ MY_ALIGN\r
\r
- b L999\r
\r
-ZGEMM_L1_BEGIN:\r
- andi. T1, N, 1\r
- ble ZGEMM_L1_END\r
+ZGEMM_1x8_L32_SUB:\r
+/*----------------------------------------*/ \r
+ LOAD1x8_2 \r
+ dcbt AO, PRE\r
+ dcbt BO, PRE\r
+ KERNEL1x8_L2 256,32,0,0 \r
+ KERNEL1x8_L2 256,32,1,0\r
+ dcbt AO, T2 \r
+ KERNEL1x8_L2 256,32,2,0\r
+ KERNEL1x8_L2 256,32,3,0 \r
+ dcbt AO, T3\r
+ dcbt BO, T2\r
+ KERNEL1x8_L2 256,32,4,0\r
+ KERNEL1x8_L2 256,32,5,0\r
+ dcbt AO, T4 \r
+ KERNEL1x8_L2 256,32,6,0\r
+ KERNEL1x8_L2 256,32,7,0 \r
+ dcbt AO, T5 \r
+ dcbt BO, T3\r
+ KERNEL1x8_L2 256,32,8,0\r
+ KERNEL1x8_L2 256,32,9,0\r
+ KERNEL1x8_L2 256,32,10,0\r
+ KERNEL1x8_L2 256,32,11,0 \r
+ dcbt BO, T4\r
+ KERNEL1x8_L2 256,32,12,0\r
+ KERNEL1x8_L2 256,32,13,0\r
+ KERNEL1x8_L2 256,32,14,0\r
+ KERNEL1x8_E2 256,32,15,1\r
+ blr\r
+ MY_ALIGN\r
\r
- mr CO, C\r
- mr AO, A\r
- srawi. I, M, 3\r
- ble ZGEMM_L1x8_END\r
\r
-ZGEMM_L1x8_BEGIN:\r
+ZGEMM_1x8_L16_SUB:\r
+/*----------------------------------------*/ \r
+ LOAD1x8_2 \r
+ dcbt AO, PRE\r
+ dcbt BO, PRE\r
+ KERNEL1x8_L2 256,32,0,0 \r
+ KERNEL1x8_L2 256,32,1,0\r
+ dcbt AO, T2 \r
+ KERNEL1x8_L2 256,32,2,0\r
+ KERNEL1x8_L2 256,32,3,0 \r
+ dcbt AO, T3\r
+ dcbt BO, T2\r
+ KERNEL1x8_L2 256,32,4,0\r
+ KERNEL1x8_L2 256,32,5,0\r
+ dcbt AO, T4 \r
+ KERNEL1x8_L2 256,32,6,0\r
+ KERNEL1x8_E2 256,32,7,1\r
+ blr\r
+ MY_ALIGN\r
+\r
+\r
+ZGEMM_1x4_LMAIN_SUB:\r
+/*----------------------------------------*/ \r
+ mtctr T8\r
+ LOAD1x4_2 \r
+ MY_ALIGN\r
+\r
+\r
+ZGEMM_L1x4_LOOP:\r
+/*----------------------------------------*/ \r
+ KERNEL1x4_L2 128,32,0,0\r
+\r
+\r
+ZGEMM_L1x4_K32:\r
+/*----------------------------------------*/ \r
+ KERNEL1x4_L2 128,32,1,0 \r
+ KERNEL1x4_L2 128,32,2,0\r
+ KERNEL1x4_L2 128,32,3,0 \r
+ KERNEL1x4_L2 128,32,4,0\r
+ KERNEL1x4_L2 128,32,5,0 \r
+ KERNEL1x4_L2 128,32,6,0\r
+ KERNEL1x4_L2 128,32,7,0\r
+ KERNEL1x4_L2 128,32,8,0\r
+ KERNEL1x4_L2 128,32,9,0 \r
+ KERNEL1x4_L2 128,32,10,0\r
+ KERNEL1x4_L2 128,32,11,0 \r
+ KERNEL1x4_L2 128,32,12,0\r
+ KERNEL1x4_L2 128,32,13,0 \r
+ KERNEL1x4_L2 128,32,14,0\r
+ KERNEL1x4_L2 128,32,15,1 \r
+ bdnz ZGEMM_L1x4_LOOP\r
+ MY_ALIGN \r
+\r
+\r
+ZGEMM_L1x4_LOOP_END:\r
+/*----------------------------------------*/ \r
+ END1x4_2 \r
+ blr\r
+ MY_ALIGN\r
+\r
+\r
+ZGEMM_1x4_L16_SUB:\r
+/*----------------------------------------*/ \r
+ LOAD1x4_2\r
+ KERNEL1x4_L2 128,32,0,0\r
+ KERNEL1x4_L2 128,32,1,0 \r
+ KERNEL1x4_L2 128,32,2,0\r
+ KERNEL1x4_L2 128,32,3,0 \r
+ KERNEL1x4_L2 128,32,4,0\r
+ KERNEL1x4_L2 128,32,5,0 \r
+ KERNEL1x4_L2 128,32,6,0\r
+ KERNEL1x4_E2 128,32,7,1\r
+ blr\r
+ MY_ALIGN\r
+\r
+\r
+ZGEMM_1x4_L8_SUB:\r
+/*----------------------------------------*/ \r
+ LOAD1x4_2\r
+ KERNEL1x4_L2 128,32,0,0\r
+ KERNEL1x4_L2 128,32,1,0 \r
+ KERNEL1x4_L2 128,32,2,0\r
+ KERNEL1x4_E2 128,32,3,1 \r
+ blr\r
+\r
+\r
+ZGEMM_1x2_LMAIN_SUB:\r
+/*----------------------------------------*/ \r
+ mtctr T8\r
+ LOAD1x2_2 \r
+ MY_ALIGN\r
+\r
+\r
+ZGEMM_L1x2_LOOP:\r
+/*----------------------------------------*/ \r
+ KERNEL1x2_L2 64,32,0,0\r
+\r
+\r
+ZGEMM_L1x2_K32:\r
+/*----------------------------------------*/ \r
+ KERNEL1x2_L2 64,32,1,0 \r
+ KERNEL1x2_L2 64,32,2,0\r
+ KERNEL1x2_L2 64,32,3,0 \r
+ KERNEL1x2_L2 64,32,4,0\r
+ KERNEL1x2_L2 64,32,5,0 \r
+ KERNEL1x2_L2 64,32,6,0\r
+ KERNEL1x2_L2 64,32,7,0\r
+ KERNEL1x2_L2 64,32,8,0\r
+ KERNEL1x2_L2 64,32,9,0 \r
+ KERNEL1x2_L2 64,32,10,0\r
+ KERNEL1x2_L2 64,32,11,0 \r
+ KERNEL1x2_L2 64,32,12,0\r
+ KERNEL1x2_L2 64,32,13,0 \r
+ KERNEL1x2_L2 64,32,14,0\r
+ KERNEL1x2_L2 64,32,15,1 \r
+ bdnz ZGEMM_L1x2_LOOP\r
+ MY_ALIGN \r
+\r
+\r
+ZGEMM_L1x2_LOOP_END:\r
+/*----------------------------------------*/ \r
+ END1x2_2 \r
+ blr\r
+ MY_ALIGN\r
+\r
+\r
+ZGEMM_1x2_L16_SUB:\r
+/*----------------------------------------*/ \r
+ LOAD1x2_2\r
+ KERNEL1x2_L2 64,32,0,0\r
+ KERNEL1x2_L2 64,32,1,0 \r
+ KERNEL1x2_L2 64,32,2,0\r
+ KERNEL1x2_L2 64,32,3,0 \r
+ KERNEL1x2_L2 64,32,4,0\r
+ KERNEL1x2_L2 64,32,5,0 \r
+ KERNEL1x2_L2 64,32,6,0\r
+ KERNEL1x2_E2 64,32,7,1\r
+ blr\r
+ MY_ALIGN\r
+\r
\r
+ZGEMM_1x2_L8_SUB:\r
+/*----------------------------------------*/ \r
+ LOAD1x2_2\r
+ KERNEL1x2_L2 64,32,0,0\r
+ KERNEL1x2_L2 64,32,1,0 \r
+ KERNEL1x2_L2 64,32,2,0\r
+ KERNEL1x2_E2 64,32,3,1 \r
+ blr\r
+\r
+\r
+ZGEMM_1x1_LMAIN_SUB:\r
+/*----------------------------------------*/ \r
+ mtctr T8\r
+ LOAD1x1_2 \r
+ MY_ALIGN\r
+\r
+\r
+ZGEMM_L1x1_LOOP:\r
+/*----------------------------------------*/ \r
+ KERNEL1x1_L2 32,32,0,0\r
+\r
+\r
+ZGEMM_L1x1_K32:\r
+/*----------------------------------------*/ \r
+ KERNEL1x1_L2 32,32,1,0 \r
+ KERNEL1x1_L2 32,32,2,0\r
+ KERNEL1x1_L2 32,32,3,0 \r
+ KERNEL1x1_L2 32,32,4,0\r
+ KERNEL1x1_L2 32,32,5,0 \r
+ KERNEL1x1_L2 32,32,6,0\r
+ KERNEL1x1_L2 32,32,7,0\r
+ KERNEL1x1_L2 32,32,8,0\r
+ KERNEL1x1_L2 32,32,9,0 \r
+ KERNEL1x1_L2 32,32,10,0\r
+ KERNEL1x1_L2 32,32,11,0 \r
+ KERNEL1x1_L2 32,32,12,0\r
+ KERNEL1x1_L2 32,32,13,0 \r
+ KERNEL1x1_L2 32,32,14,0\r
+ KERNEL1x1_L2 32,32,15,1 \r
+ bdnz ZGEMM_L1x1_LOOP\r
+ MY_ALIGN \r
+\r
+\r
+ZGEMM_L1x1_LOOP_END:\r
+/*----------------------------------------*/ \r
+ END1x1_2 \r
+ blr\r
+ MY_ALIGN\r
+\r
+\r
+ZGEMM_1x1_L16_SUB:\r
+/*----------------------------------------*/ \r
+ LOAD1x1_2\r
+ KERNEL1x1_L2 32,32,0,0\r
+ KERNEL1x1_L2 32,32,1,0 \r
+ KERNEL1x1_L2 32,32,2,0\r
+ KERNEL1x1_L2 32,32,3,0 \r
+ KERNEL1x1_L2 32,32,4,0\r
+ KERNEL1x1_L2 32,32,5,0 \r
+ KERNEL1x1_L2 32,32,6,0\r
+ KERNEL1x1_E2 32,32,7,1\r
+ blr\r
+ MY_ALIGN\r
+\r
+\r
+ZGEMM_1x1_L8_SUB:\r
+/*----------------------------------------*/ \r
+ LOAD1x1_2\r
+ KERNEL1x1_L2 32,32,0,0\r
+ KERNEL1x1_L2 32,32,1,0 \r
+ KERNEL1x1_L2 32,32,2,0\r
+ KERNEL1x1_E2 32,32,3,1 \r
+ blr\r
\r
- mr BO, B\r
- mr T1, K\r
- addi T1,T1, -1\r
- srawi. L, T1, 5 /**(K-1) % 32x */ \r
- ZERO1x8 \r
- ble ZGEMM_L1x8_SUB0\r
- \r
\r
-ZGEMM_L1x8_LOOP_START:\r
+/*----------------------N1 BEGINS---------*/\r
+ZGEMM_L1:\r
+/*----------------------------------------*/ \r
+ andi. T1, N, 1\r
+ ble ZGEMM_L1_END\r
+ \r
+ZGEMM_L1_BEGIN:\r
+/*----------------------------------------*/ \r
+ mr CO, C\r
+ slwi T1, LDC , 1 \r
+ add T2,C,LDC \r
+ mr AO, A \r
+ add C, C, T1\r
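+/* note: C advances by 2*LDC and T2 prefetches a second column even though */ \r
+/* this N=1 block writes only one; benign, as it is the final column block. */ \r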
+#if defined(TRMMKERNEL) && defined(LEFT) \r
+ mr TEMP_REG, OFFSET /*off = offset;*/\r
+#endif \r
+ srawi. I, M, 3\r
+ ble ZGEMM_L1x8_END\r
+ dcbt CO,r0 /*just prefetch*/\r
+ dcbt T2,r0 \r
\r
- LOAD1x8 0 \r
+\r
+ZGEMM_L1x8_BEGIN:\r
+/*----------------------------------------*/ \r
+#if defined(TRMMKERNEL) \r
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1\r
+#else \r
+ mr BO, B \r
+ dcbt B, r0 \r
+#endif \r
+ dcbt AO, r0\r
+#if defined(TRMMKERNEL) \r
+ REFRESH_TEMP_BK T6,K,TEMP_REG,8,1\r
+ mr T1, T6\r
+/* TEMPS FOR PREFETCH */ \r
li T2, 1024\r
- li T3, 1024+512\r
- li T4, 2048\r
- li T5, 2048+512\r
- mtctr L\r
+ li T3, 1024+512\r
+ addi T1,T1, -2\r
+/* TEMPS FOR PREFETCH */ \r
+ li T4, 2048\r
+ li T5, 2048+512 \r
+	srawi. T8, T1, 7 /* T8 = (T6-2) / 128 */\r
+#else \r
+ mr T1, K\r
+/* TEMPS FOR PREFETCH */ \r
+ li T2, 1024\r
+ li T3, 1024+512\r
+ addi T1,T1, -2\r
+/* TEMPS FOR PREFETCH */ \r
+ li T4, 2048\r
+ li T5, 2048+512 \r
+	srawi. T8, T1, 7 /* T8 = (K-2) / 128 */\r
+#endif \r
+ ZERO1x8 \r
+ ble ZGEMM_L1x8_SUB0\r
+ bl ZGEMM_L1x8_LMAIN_SUB\r
+ andi. L, T1, 127\r
+ ble ZGEMM_L1x8_SAVE\r
+ b ZGEMM_L1x8_SUB2\r
+\r
\r
- MY_ALIGN\r
-ZGEMM_L1x8_LOOP:\r
- dcbt AO, PRE\r
- dcbt BO, PRE\r
- KERNEL1x8_L 128,16,0,0\r
- KERNEL1x8_L 128,16,1,0\r
- dcbt AO, T2 \r
- KERNEL1x8_L 128,16,2,0\r
- KERNEL1x8_L 128,16,3,0 \r
- dcbt AO, T3\r
- dcbt BO, T2\r
- KERNEL1x8_L 128,16,4,0\r
- KERNEL1x8_L 128,16,5,0\r
- dcbt AO, T4 \r
- KERNEL1x8_L 128,16,6,0\r
- KERNEL1x8_L 128,16,7,0 \r
- dcbt AO, T5 \r
- dcbt BO, T3\r
- KERNEL1x8_L 128,16,8,0\r
- KERNEL1x8_L 128,16,9,0\r
- KERNEL1x8_L 128,16,10,0\r
- KERNEL1x8_L 128,16,11,0 \r
- dcbt BO, T4\r
- KERNEL1x8_L 128,16,12,0\r
- KERNEL1x8_L 128,16,13,0\r
- KERNEL1x8_L 128,16,14,0\r
- KERNEL1x8_L 128,16,15,1 \r
- bdnz ZGEMM_L1x8_LOOP\r
- MY_ALIGN \r
-ZGEMM_L1x8_LOOP_END:\r
- END1x8 AO, BO, 128,16 \r
- \r
- b ZGEMM_L1x8_SUB1\r
- \r
ZGEMM_L1x8_SUB0:\r
+/*----------------------------------------*/ \r
+#if defined(TRMMKERNEL) \r
+ andi. L, T6, 255\r
+ cmpwi T6,129\r
+#else \r
+ andi. L, K, 255\r
+ cmpwi K,129\r
+#endif \r
+ li T8,1\r
+ bne CMP1x8_128K\r
+ addi BO,BO,-16\r
+ addi AO,AO,-128 \r
+ LOAD1x8O 128,16 \r
+ END1x8_WITHOUT_ADD \r
+ LOAD1x8_2O 256, 32 \r
+ mtctr T8 \r
+ bl ZGEMM_L1x8_K128 \r
+ b ZGEMM_L1x8_SAVE \r
+ CMP1x8_128K:\r
+/*----------------------------------------*/ \r
+#if defined(TRMMKERNEL) \r
+ cmpwi T6,128\r
+#else \r
+ cmpwi K,128\r
+#endif \r
+ bne ZGEMM_L1x8_SUB2 \r
+ MY_ALIGN \r
+ mtctr T8\r
+ addi BO,BO,-32\r
+ addi AO,AO,-256 \r
+ LOAD1x8_2O 256,32\r
+ bl ZGEMM_L1x8_K128 \r
+ b ZGEMM_L1x8_SAVE \r
+ MY_ALIGN\r
\r
- andi. L, K, 63\r
- \r
- b ZGEMM_L1x8_SUB2\r
\r
-ZGEMM_L1x8_SUB1:\r
+ZGEMM_L1x8_SUB2:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 64\r
+ ble ZGEMM_L1x8_SUB2_32\r
+ bl ZGEMM_1x8_L64_SUB\r
+ MY_ALIGN\r
\r
- andi. L, T1, 31\r
- ble ZGEMM_L1x8_SAVE\r
\r
-ZGEMM_L1x8_SUB2:\r
- srawi. T1,L, 3\r
+ZGEMM_L1x8_SUB2_32:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 32\r
+ ble ZGEMM_L1x8_SUB2_16 \r
+ bl ZGEMM_1x8_L32_SUB\r
+ MY_ALIGN \r
+\r
+\r
+ZGEMM_L1x8_SUB2_16:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 16\r
+ ble ZGEMM_L1x8_SUB2_8\r
+ bl ZGEMM_1x8_L16_SUB \r
+ MY_ALIGN \r
+\r
+\r
+ZGEMM_L1x8_SUB2_8:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 8\r
ble ZGEMM_L1x8_SUB2_4\r
- mtctr T1\r
- MY_ALIGN\r
-ZGEMM_L1x8_SUB2_LOOP:\r
- LOAD1x8 0 \r
- KERNEL1x8_L 128,16, 0,0\r
- KERNEL1x8_L 128,16, 1,0\r
- KERNEL1x8_L 128,16, 2,0\r
- KERNEL1x8_E 128,16, 3,1\r
- bdnz ZGEMM_L1x8_SUB2_LOOP \r
- MY_ALIGN \r
+ LOAD1x8_2\r
+ KERNEL1x8_L2 256,32, 0,0\r
+ KERNEL1x8_L2 256,32, 1,0\r
+ KERNEL1x8_L2 256,32, 2,0\r
+ KERNEL1x8_E2 256,32, 3,1\r
+ MY_ALIGN \r
+\r
+\r
ZGEMM_L1x8_SUB2_4:\r
+/*----------------------------------------*/ \r
andi. T1,L, 4\r
ble ZGEMM_L1x8_SUB2_2\r
- LOAD1x8 0 \r
- KERNEL1x8_L 128,16, 0,0\r
- KERNEL1x8_E 128,16, 1,1\r
+ LOAD1x8_2\r
+ KERNEL1x8_L2 256,32, 0,0\r
+ KERNEL1x8_E2 256,32, 1,1\r
MY_ALIGN\r
+\r
+\r
ZGEMM_L1x8_SUB2_2:\r
+/*----------------------------------------*/ \r
andi. T1,L, 2\r
ble ZGEMM_L1x8_SUB2_1\r
- LOAD1x8 0 \r
- KERNEL1x8_E 128,16, 0,1\r
+ LOAD1x8_2 \r
+ KERNEL1x8_E2 256,32, 0,1\r
MY_ALIGN \r
+\r
+\r
ZGEMM_L1x8_SUB2_1:\r
+/*----------------------------------------*/ \r
andi. T1,L, 1\r
- ble ZGEMM_L1x8_SAVE \r
- KERNEL1x8 \r
- \r
+ ble ZGEMM_L1x8_SAVE \r
+ KERNEL1x8\r
\r
-ZGEMM_L1x8_SAVE:\r
\r
- SAVE1x8\r
+ZGEMM_L1x8_SAVE:\r
+/*----------------------------------------*/ \r
+ addic. I, I, -1\r
+ SAVE1x8\r
+#if defined(TRMMKERNEL) \r
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,1\r
+#endif \r
+ bgt ZGEMM_L1x8_BEGIN\r
+ andi. T2, M, 7\r
+ ble ZGEMM_L1x1_END\r
+ andi. T1, M, 4\r
+ ble ZGEMM_L1x4_END\r
+ b ZGEMM_L1x4_BEGIN\r
+ MY_ALIGN \r
\r
- addic. I, I, -1\r
- bgt ZGEMM_L1x8_BEGIN\r
\r
ZGEMM_L1x8_END:\r
+/*----------------------------------------*/ \r
+\r
\r
ZGEMM_L1x4_BEGIN:\r
+/*----------------------------------------*/ \r
+ andi. T2, M, 7\r
+ ble ZGEMM_L1x1_END\r
+ andi. T1, M, 4\r
+ ble ZGEMM_L1x4_END\r
+#if defined(TRMMKERNEL) \r
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1\r
+#else \r
+ mr BO, B \r
+#endif \r
+#if defined(TRMMKERNEL) \r
+ REFRESH_TEMP_BK T6,K,TEMP_REG,4,1\r
+ mr T1, T6 \r
+ addi T1,T1, -2 \r
+	srawi. T8, T1, 5 /* T8 = (T6-2) / 32 */\r
+#else \r
+ mr T1, K \r
+ addi T1,T1, -2\r
+	srawi. T8, T1, 5 /* T8 = (K-2) / 32 */\r
+#endif \r
+ ZERO1x4\r
+ ble ZGEMM_L1x4_SUB0 \r
+ bl ZGEMM_1x4_LMAIN_SUB\r
+ andi. L, T1, 31\r
+ ble ZGEMM_L1x4_SAVE\r
+ b ZGEMM_L1x4_SUB2\r
+\r
\r
- andi. T2, M, 7\r
- ble ZGEMM_L1x1_END\r
-\r
- andi. T1, M, 4\r
- ble ZGEMM_L1x4_END\r
- mr BO, B\r
- mr T1, K\r
- addi T1,T1, -1\r
- srawi. L, T1, 5 /**(K-1) % 16x */ \r
- ZERO1x4 \r
- ble ZGEMM_L1x4_SUB0 \r
-\r
-ZGEMM_L1x4_LOOP_START:\r
- LOAD1x4 0 \r
- mtctr L\r
-\r
- MY_ALIGN\r
-ZGEMM_L1x4_LOOP: \r
- KERNEL1x4_L 64,16,0,0\r
- KERNEL1x4_L 64,16,1,0 \r
- KERNEL1x4_L 64,16,2,0\r
- KERNEL1x4_L 64,16,3,0 \r
- KERNEL1x4_L 64,16,4,0\r
- KERNEL1x4_L 64,16,5,0 \r
- KERNEL1x4_L 64,16,6,0\r
- KERNEL1x4_L 64,16,7,0 \r
- KERNEL1x4_L 64,16,8,0\r
- KERNEL1x4_L 64,16,9,0\r
- KERNEL1x4_L 64,16,10,0\r
- KERNEL1x4_L 64,16,11,0 \r
- KERNEL1x4_L 64,16,12,0\r
- KERNEL1x4_L 64,16,13,0\r
- KERNEL1x4_L 64,16,14,0\r
- KERNEL1x4_L 64,16,15,1 \r
- bdnz ZGEMM_L1x4_LOOP\r
- MY_ALIGN \r
-ZGEMM_L1x4_LOOP_END:\r
- END1x4 AO, BO, 64,16 \r
- \r
- b ZGEMM_L1x4_SUB1\r
- \r
ZGEMM_L1x4_SUB0:\r
+/*----------------------------------------*/ \r
+#if defined(TRMMKERNEL) \r
+ andi. L, T6, 63\r
+ cmpwi T6,33\r
+#else \r
+ andi. L, K, 63\r
+ cmpwi K,33\r
+#endif \r
+ li T8,1\r
+ bne CMP1x4_32K\r
+ addi BO,BO,-16\r
+ addi AO,AO,-64 \r
+ LOAD1x4O 64,16 \r
+ END1x4_WITHOUT_ADD \r
+ LOAD1x4_2O 128, 32 \r
+ mtctr T8 \r
+ bl ZGEMM_L1x4_K32 \r
+ b ZGEMM_L1x4_SAVE \r
+ CMP1x4_32K:\r
+/*----------------------------------------*/ \r
+#if defined(TRMMKERNEL) \r
+ cmpwi T6,32\r
+#else \r
+ cmpwi K,32\r
+#endif \r
+ bne ZGEMM_L1x4_SUB2 \r
+ MY_ALIGN \r
+ mtctr T8\r
+ addi BO,BO,-32\r
+ addi AO,AO,-128 \r
+ LOAD1x4_2O 128,32\r
+ bl ZGEMM_L1x4_K32 \r
+ b ZGEMM_L1x4_SAVE \r
+	MY_ALIGN \r
\r
- andi. L, K, 63\r
- \r
- b ZGEMM_L1x4_SUB2\r
\r
-ZGEMM_L1x4_SUB1:\r
+ZGEMM_L1x4_SUB2:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 16\r
+ ble ZGEMM_L1x4_SUB2_8\r
+ bl ZGEMM_1x4_L16_SUB \r
+ MY_ALIGN\r
\r
- andi. L, T1, 31\r
- ble ZGEMM_L1x4_SAVE\r
\r
-ZGEMM_L1x4_SUB2:\r
- srawi. T1,L, 3\r
+ZGEMM_L1x4_SUB2_8:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 8\r
ble ZGEMM_L1x4_SUB2_4\r
- mtctr T1\r
- MY_ALIGN\r
-ZGEMM_L1x4_SUB2_LOOP:\r
- LOAD1x4 0 \r
- KERNEL1x4_L 64,16, 0,0\r
- KERNEL1x4_L 64,16, 1,0\r
- KERNEL1x4_L 64,16, 2,0\r
- KERNEL1x4_E 64,16, 3,1\r
- bdnz ZGEMM_L1x4_SUB2_LOOP \r
+ bl ZGEMM_1x4_L8_SUB\r
MY_ALIGN \r
+\r
+\r
ZGEMM_L1x4_SUB2_4:\r
+/*----------------------------------------*/ \r
andi. T1,L, 4\r
ble ZGEMM_L1x4_SUB2_2\r
- LOAD1x4 0 \r
- KERNEL1x4_L 64,16, 0,0\r
- KERNEL1x4_E 64,16, 1,1\r
+ LOAD1x4_2\r
+ KERNEL1x4_L2 128,32, 0,0\r
+ KERNEL1x4_E2 128,32, 1,1\r
MY_ALIGN\r
+\r
+\r
ZGEMM_L1x4_SUB2_2:\r
+/*----------------------------------------*/ \r
andi. T1,L, 2\r
ble ZGEMM_L1x4_SUB2_1\r
- LOAD1x4 0 \r
- KERNEL1x4_E 64,16, 0,1\r
+ LOAD1x4_2\r
+ KERNEL1x4_E2 128,32, 0,1\r
MY_ALIGN \r
+\r
+\r
ZGEMM_L1x4_SUB2_1:\r
+/*----------------------------------------*/ \r
andi. T1,L, 1\r
- ble ZGEMM_L1x4_SAVE \r
- KERNEL1x4 \r
+ ble ZGEMM_L1x4_SAVE \r
+ KERNEL1x4\r
+\r
\r
ZGEMM_L1x4_SAVE:\r
+/*----------------------------------------*/ \r
+ SAVE1x4\r
+#if defined(TRMMKERNEL) \r
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,1\r
+#endif \r
\r
- SAVE1x4\r
\r
ZGEMM_L1x4_END:\r
+/*----------------------------------------*/ \r
+\r
\r
ZGEMM_L1x2_BEGIN:\r
+/*----------------------------------------*/ \r
+ andi. T1, M, 2\r
+ ble ZGEMM_L1x2_END\r
+#if defined(TRMMKERNEL) \r
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1\r
+#else \r
+ mr BO, B \r
+#endif \r
+#if defined(TRMMKERNEL) \r
+ REFRESH_TEMP_BK T6,K,TEMP_REG,2,1\r
+ mr T1, T6 \r
+ addi T1,T1, -2 \r
+	srawi. T8, T1, 5 /* T8 = (T6-2) / 32 */\r
+#else \r
+ mr T1, K \r
+ addi T1,T1, -2\r
+	srawi. T8, T1, 5 /* T8 = (K-2) / 32 */\r
+#endif \r
+ ZERO1x2\r
+ ble ZGEMM_L1x2_SUB0 \r
+ bl ZGEMM_1x2_LMAIN_SUB\r
+ andi. L, T1, 31\r
+ ble ZGEMM_L1x2_SAVE\r
+ b ZGEMM_L1x2_SUB2\r
\r
\r
- andi. T1, M, 2\r
- ble ZGEMM_L1x2_END\r
- mr BO, B\r
- mr T1, K\r
- addi T1,T1, -1\r
- srawi. L, T1, 5 /**(K-1) % 16x */ \r
- ZERO1x2 \r
- ble ZGEMM_L1x2_SUB0 \r
-\r
-ZGEMM_L1x2_LOOP_START:\r
- LOAD1x2 0 \r
- mtctr L\r
-\r
- MY_ALIGN\r
-ZGEMM_L1x2_LOOP: \r
- KERNEL1x2_L 32,16,0,0\r
- KERNEL1x2_L 32,16,1,0 \r
- KERNEL1x2_L 32,16,2,0\r
- KERNEL1x2_L 32,16,3,0 \r
- KERNEL1x2_L 32,16,4,0\r
- KERNEL1x2_L 32,16,5,0 \r
- KERNEL1x2_L 32,16,6,0\r
- KERNEL1x2_L 32,16,7,0 \r
- KERNEL1x2_L 32,16,8,0\r
- KERNEL1x2_L 32,16,9,0\r
- KERNEL1x2_L 32,16,10,0\r
- KERNEL1x2_L 32,16,11,0 \r
- KERNEL1x2_L 32,16,12,0\r
- KERNEL1x2_L 32,16,13,0\r
- KERNEL1x2_L 32,16,14,0\r
- KERNEL1x2_L 32,16,15,1 \r
- bdnz ZGEMM_L1x2_LOOP\r
- MY_ALIGN \r
-ZGEMM_L1x2_LOOP_END:\r
- END1x2 AO, BO, 32,16 \r
- \r
- b ZGEMM_L1x2_SUB1\r
- \r
ZGEMM_L1x2_SUB0:\r
+/*----------------------------------------*/ \r
+#if defined(TRMMKERNEL) \r
+ andi. L, T6, 63\r
+ cmpwi T6,33\r
+#else \r
+ andi. L, K, 63\r
+ cmpwi K,33\r
+#endif \r
+ li T8,1\r
+ bne CMP1x2_32K\r
+ addi BO,BO,-16\r
+ addi AO,AO,-32 \r
+ LOAD1x2O 32,16 \r
+ END1x2_WITHOUT_ADD \r
+ LOAD1x2_2O 64, 32 \r
+ mtctr T8 \r
+ bl ZGEMM_L1x2_K32 \r
+ b ZGEMM_L1x2_SAVE \r
+ CMP1x2_32K:\r
+/*----------------------------------------*/ \r
+#if defined(TRMMKERNEL) \r
+ cmpwi T6,32\r
+#else \r
+ cmpwi K,32\r
+#endif \r
+ bne ZGEMM_L1x2_SUB2 \r
+ MY_ALIGN \r
+ mtctr T8\r
+ addi BO,BO,-32\r
+ addi AO,AO,-64 \r
+ LOAD1x2_2O 64,32\r
+ bl ZGEMM_L1x2_K32 \r
+ b ZGEMM_L1x2_SAVE \r
+	MY_ALIGN \r
\r
- andi. L, K, 63\r
- \r
- b ZGEMM_L1x2_SUB2\r
\r
-ZGEMM_L1x2_SUB1:\r
+ZGEMM_L1x2_SUB2:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 16\r
+ ble ZGEMM_L1x2_SUB2_8\r
+ bl ZGEMM_1x2_L16_SUB \r
+ MY_ALIGN\r
\r
- andi. L, T1, 31\r
- ble ZGEMM_L1x2_SAVE\r
\r
-ZGEMM_L1x2_SUB2:\r
- srawi. T1,L, 3\r
+ZGEMM_L1x2_SUB2_8:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 8\r
ble ZGEMM_L1x2_SUB2_4\r
- mtctr T1\r
- MY_ALIGN\r
-ZGEMM_L1x2_SUB2_LOOP:\r
- LOAD1x2 0 \r
- KERNEL1x2_L 32,16, 0,0\r
- KERNEL1x2_L 32,16, 1,0\r
- KERNEL1x2_L 32,16, 2,0\r
- KERNEL1x2_E 32,16, 3,1\r
- bdnz ZGEMM_L1x2_SUB2_LOOP \r
+ bl ZGEMM_1x2_L8_SUB\r
MY_ALIGN \r
+\r
+\r
ZGEMM_L1x2_SUB2_4:\r
+/*----------------------------------------*/ \r
andi. T1,L, 4\r
ble ZGEMM_L1x2_SUB2_2\r
- LOAD1x2 0 \r
- KERNEL1x2_L 32,16, 0,0\r
- KERNEL1x2_E 32,16, 1,1\r
+ LOAD1x2_2\r
+ KERNEL1x2_L2 64,32, 0,0\r
+ KERNEL1x2_E2 64,32, 1,1\r
MY_ALIGN\r
+\r
+\r
ZGEMM_L1x2_SUB2_2:\r
+/*----------------------------------------*/ \r
andi. T1,L, 2\r
ble ZGEMM_L1x2_SUB2_1\r
- LOAD1x2 0 \r
- KERNEL1x2_E 32,16, 0,1\r
+ LOAD1x2_2\r
+ KERNEL1x2_E2 64,32, 0,1\r
MY_ALIGN \r
+\r
+\r
ZGEMM_L1x2_SUB2_1:\r
+/*----------------------------------------*/ \r
andi. T1,L, 1\r
- ble ZGEMM_L1x2_SAVE \r
- KERNEL1x2 \r
+ ble ZGEMM_L1x2_SAVE \r
+ KERNEL1x2\r
+\r
+\r
ZGEMM_L1x2_SAVE:\r
+/*----------------------------------------*/ \r
+ SAVE1x2\r
+#if defined(TRMMKERNEL) \r
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,1\r
+#endif \r
\r
- SAVE1x2\r
\r
ZGEMM_L1x2_END:\r
+/*----------------------------------------*/ \r
+\r
\r
ZGEMM_L1x1_BEGIN:\r
+/*----------------------------------------*/ \r
+ andi. T1, M, 1\r
+ ble ZGEMM_L1x1_END\r
+#if defined(TRMMKERNEL) \r
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1\r
+#else \r
+ mr BO, B \r
+#endif \r
+#if defined(TRMMKERNEL) \r
+ REFRESH_TEMP_BK T6,K,TEMP_REG,1,1\r
+ mr T1, T6 \r
+ addi T1,T1, -2 \r
+	srawi. T8, T1, 5 /* T8 = (T6-2) / 32 */\r
+#else \r
+ mr T1, K \r
+ addi T1,T1, -2\r
+	srawi. T8, T1, 5 /* T8 = (K-2) / 32 */\r
+#endif \r
+ ZERO1x1\r
+ ble ZGEMM_L1x1_SUB0 \r
+ bl ZGEMM_1x1_LMAIN_SUB\r
+ andi. L, T1, 31\r
+ ble ZGEMM_L1x1_SAVE\r
+ b ZGEMM_L1x1_SUB2\r
\r
\r
- andi. T1, M, 1\r
- ble ZGEMM_L1x1_END\r
- mr BO, B\r
- mr T1, K\r
- addi T1,T1, -1\r
- srawi. L, T1, 5 /**(K-1) % 16x */ \r
- ZERO1x1 \r
- ble ZGEMM_L1x1_SUB0 \r
-\r
-ZGEMM_L1x1_LOOP_START:\r
-\r
- LOAD1x1 0 \r
- mtctr L\r
-\r
- MY_ALIGN\r
-ZGEMM_L1x1_LOOP: \r
- KERNEL1x1_L 16,16,0,0\r
- KERNEL1x1_L 16,16,1,0 \r
- KERNEL1x1_L 16,16,2,0\r
- KERNEL1x1_L 16,16,3,0 \r
- KERNEL1x1_L 16,16,4,0\r
- KERNEL1x1_L 16,16,5,0 \r
- KERNEL1x1_L 16,16,6,0\r
- KERNEL1x1_L 16,16,7,0 \r
- KERNEL1x1_L 16,16,8,0\r
- KERNEL1x1_L 16,16,9,0\r
- KERNEL1x1_L 16,16,10,0\r
- KERNEL1x1_L 16,16,11,0 \r
- KERNEL1x1_L 16,16,12,0\r
- KERNEL1x1_L 16,16,13,0\r
- KERNEL1x1_L 16,16,14,0\r
- KERNEL1x1_L 16,16,15,1 \r
- bdnz ZGEMM_L1x1_LOOP\r
- MY_ALIGN \r
-ZGEMM_L1x1_LOOP_END:\r
- END1x1 AO, BO, 16, 16 \r
- \r
- b ZGEMM_L1x1_SUB1\r
- \r
ZGEMM_L1x1_SUB0:\r
+/*----------------------------------------*/ \r
+#if defined(TRMMKERNEL) \r
+ andi. L, T6, 63\r
+ cmpwi T6,33\r
+#else \r
+ andi. L, K, 63\r
+ cmpwi K,33\r
+#endif \r
+ li T8,1\r
+ bne CMP1x1_32K\r
+ addi BO,BO,-16\r
+ addi AO,AO,-16 \r
+ LOAD1x1O 16,16 \r
+ END1x1_WITHOUT_ADD \r
+ LOAD1x1_2O 32, 32 \r
+ mtctr T8 \r
+ bl ZGEMM_L1x1_K32 \r
+ b ZGEMM_L1x1_SAVE \r
+ CMP1x1_32K:\r
+/*----------------------------------------*/ \r
+#if defined(TRMMKERNEL) \r
+ cmpwi T6,32\r
+#else \r
+ cmpwi K,32\r
+#endif \r
+ bne ZGEMM_L1x1_SUB2 \r
+ MY_ALIGN \r
+ mtctr T8\r
+ addi BO,BO,-32\r
+ addi AO,AO,-32 \r
+ LOAD1x1_2O 32,32\r
+ bl ZGEMM_L1x1_K32 \r
+ b ZGEMM_L1x1_SAVE \r
+	MY_ALIGN \r
\r
- andi. L, K, 63\r
- \r
- b ZGEMM_L1x1_SUB2\r
\r
-ZGEMM_L1x1_SUB1:\r
+ZGEMM_L1x1_SUB2:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 16\r
+ ble ZGEMM_L1x1_SUB2_8\r
+ bl ZGEMM_1x1_L16_SUB \r
+ MY_ALIGN\r
\r
- andi. L, T1, 31\r
- ble ZGEMM_L1x1_SAVE\r
\r
-ZGEMM_L1x1_SUB2:\r
- srawi. T1,L, 3\r
+ZGEMM_L1x1_SUB2_8:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 8\r
ble ZGEMM_L1x1_SUB2_4\r
- mtctr T1\r
- MY_ALIGN\r
-ZGEMM_L1x1_SUB2_LOOP:\r
- LOAD1x1 0 \r
- KERNEL1x1_L 16,16, 0,0\r
- KERNEL1x1_L 16,16, 1,0\r
- KERNEL1x1_L 16,16, 2,0\r
- KERNEL1x1_E 16,16, 3,1\r
- bdnz ZGEMM_L1x1_SUB2_LOOP \r
+ bl ZGEMM_1x1_L8_SUB\r
MY_ALIGN \r
+\r
+\r
ZGEMM_L1x1_SUB2_4:\r
+/*----------------------------------------*/ \r
andi. T1,L, 4\r
ble ZGEMM_L1x1_SUB2_2\r
- LOAD1x1 0 \r
- KERNEL1x1_L 16,16, 0,0\r
- KERNEL1x1_E 16,16, 1,1\r
+ LOAD1x1_2\r
+ KERNEL1x1_L2 32,32, 0,0\r
+ KERNEL1x1_E2 32,32, 1,1\r
MY_ALIGN\r
+\r
+\r
ZGEMM_L1x1_SUB2_2:\r
+/*----------------------------------------*/ \r
andi. T1,L, 2\r
ble ZGEMM_L1x1_SUB2_1\r
- LOAD1x1 0 \r
- KERNEL1x1_E 16,16, 0,1\r
+ LOAD1x1_2\r
+ KERNEL1x1_E2 32,32, 0,1\r
MY_ALIGN \r
+\r
+\r
ZGEMM_L1x1_SUB2_1:\r
+/*----------------------------------------*/ \r
andi. T1,L, 1\r
- ble ZGEMM_L1x1_SAVE \r
- KERNEL1x1 \r
+ ble ZGEMM_L1x1_SAVE \r
+ KERNEL1x1\r
+\r
\r
ZGEMM_L1x1_SAVE:\r
+/*----------------------------------------*/ \r
+ SAVE1x1\r
+#if defined(TRMMKERNEL) \r
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,1\r
+#endif \r
\r
- SAVE1x1\r
\r
ZGEMM_L1x1_END:\r
+/*----------------------------------------*/ \r
+#if defined(TRMMKERNEL) && !defined(LEFT) \r
+ addi TEMP_REG, TEMP_REG, 1\r
+#endif \r
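+/* keep the TRMM column offset in step: +1 for this single-column sweep */ \r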
+\r
\r
ZGEMM_L1_END:\r
+/*----------------------------------------*/ \r
+
\ No newline at end of file
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
*****************************************************************************/\r
\r
-\r
#define unit_size 16\r
#define DISP32(ind,disp) (ind*unit_size*32+disp)\r
#define DISP16(ind,disp) (ind*unit_size*16+disp)\r
#define DISP2(ind,disp) (ind*unit_size*2+disp)\r
#define DISP1(ind,disp) (ind*unit_size+disp)\r
#define DISPX(disp) (disp)\r
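/* unit_size is sizeof(double complex); DISPn(ind,disp) = ind*n*16 + disp, */\r
/* e.g. DISP32(3,256) = 3*512 + 256 = 1792, the byte offset of unroll step 3. */\r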
-\r
/* HELPERS FOR SAVE */\r
-\r
/* {r0,i0} and {r1,i1} into {r0,r1} {i0,i1} */\r
+\r
+\r
.macro LOAD_COUPLE_AS_RR_II VS_OUT1,VS_OUT2,VS_TEMP1,VS_TEMP2,REG,LOFFSET \r
#ifndef TRMMKERNEL \r
lxv \VS_TEMP1, DISPX(\LOFFSET)(\REG)\r
xxmrghd \VS_OUT2,\VS_TEMP1,\VS_TEMP2 \r
#endif \r
.endm\r
-\r
/* from 2 results {a0r*br,a0i*bi} and {a1r*br,a1i*bi} pack into {a0r*br,a1r*br} and {a0i*bi,a1i*bi} */\r
+\r
+\r
.macro RESULT_INTO_REALREAL_IMAGEIMAGE VSIN1,VSIN2,VSOUT1,VSOUT2\r
xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results*/\r
xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results*/\r
.endm \r
-\r
/*from 2 result {a0r*bi,a0i*br} and {a1r*bi,a1i*br} pack into {a0r*bi,a1r*bi} and {a0i*br,a1i*br}*/\r
+\r
+\r
.macro RESULT_INTO_REALIMAG_IMAGREAL VSIN1,VSIN2,VSOUT1,VSOUT2 \r
xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*imag */\r
xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*real*/\r
.endm\r
-\r
/* {a0r*br op a0i*bi ,a1r*br op a1i*bi} ~ {r0,r1}; {a0r*bi op a0i*br ,a1r*bi op a1i*br} ~ {i0,i1}*/\r
+\r
+\r
.macro AGGREGATE_REALS_IMAGES VSINR_OUT1,VSINR,VSINI_OUT2,VSINI\r
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) \r
xvsubdp \VSINR_OUT1,\VSINR_OUT1,\VSINR\r
xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI \r
#endif\r
.endm \r
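/* Standard complex-product bookkeeping: (ar + i*ai)*(br + i*bi) has
   real = ar*br - ai*bi and imag = ar*bi + ai*br, hence the subtract on the
   real couple and the add on the imaginary couple for the plain NN/NT/TN/TT
   cases; conjugating kernel variants flip these signs. */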
-\r
/* {i0,i1} * {alpha_i,alpha_i} - VSOUT1 ;VSOUT2 + {r0,r1}*{alpha_i,alpha_i} */\r
+\r
+\r
.macro MULT_APLHA_PART1 VSINRR,VSINII,VSOUT1,VSOUT2\r
#ifndef TRMMKERNEL \r
 xvmsubadp \VSOUT1,\VSINII, alpha_i
 xvmaddadp \VSOUT2,\VSINRR, alpha_i
#else 
 xvmuldp \VSOUT1,\VSINII, alpha_i
 xvmuldp \VSOUT2,\VSINRR, alpha_i
#endif \r
.endm\r
-\r
/* {r0,r1} * {alpha_r,alpha_r} - VSOUT1 ;VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */\r
+\r
+\r
.macro MULT_APLHA_PART2 VSINRR,VSINII,VSOUT1,VSOUT2 \r
xvmsubadp \VSOUT1,\VSINRR, alpha_r\r
xvmaddadp \VSOUT2,\VSINII, alpha_r\r
.endm\r
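/* Net effect of PART1 then PART2 per couple (non-TRMM path, with VSOUT1/2
   preloaded from C): VSOUT1 = C_r + r*alpha_r - i*alpha_i and
   VSOUT2 = C_i + r*alpha_i + i*alpha_r, i.e. C += alpha*result; the TRMM
   build computes the same products starting from zero. */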
-\r
/* unpack to store 2{r,r} {i,i} into {r,i} {r,i} (big endian because of stxv) */\r
+\r
+\r
.macro UNPACK_FOR_STORE VSIN1,VSIN2,VSOUT1,VSOUT2 \r
xxmrghd \VSOUT1,\VSIN2,\VSIN1\r
xxmrgld \VSOUT2,\VSIN2,\VSIN1\r
.endm\r
+\r
+\r
.macro STORE_COUPLE REG,LOFFSET,VSIN1,VSIN2\r
stxv \VSIN1, DISPX(\LOFFSET)(\REG)\r
stxv \VSIN2, DISPX(\LOFFSET+16)(\REG)\r
.endm\r
\r
+\r
.macro SAVE8 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,VSRes9,VSRes10,VSRes11,VSRes12,VSRes13,VSRes14,VSRes15,VSRes16,BASE_REG,LOFFSET\r
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3\r
LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET\r
STORE_COUPLE \BASE_REG,(\LOFFSET+96),\VSRes1,\VSRes3\r
.endm\r
\r
+\r
.macro SAVE4 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,BASE_REG,LOFFSET\r
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3\r
LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET\r
.endm\r
\r
\r
+\r
.macro SAVE2 VSRes1,VSRes2,VSRes3,VSRes4,BASE_REG,LOFFSET\r
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3\r
LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET\r
.endm\r
\r
\r
+\r
.macro SAVE1 VSRes1,VSRes2,BASE_REG,LOFFSET\r
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes1,vs2,vs3\r
#ifndef TRMMKERNEL 
 lxv vs18, (\LOFFSET)(\BASE_REG)
 xxmrgld vs14,vs18,vs18
 xxmrghd vs15,vs18,vs18
#endif 
 xxmrghd vs7,vs15,vs14 
 stxv vs7, (\LOFFSET)(\BASE_REG) 
.endm
-\r
/**********************************************************************************************\r
* Macros for N=2 and M=8
**********************************************************************************************/\r
\r
.macro Zero2x8\r
xxlxor vs63, vs63, vs63\r
.endm\r
\r
-.macro LOAD2x8 Zero\r
\r
- lxv vs16, 0(BO) // load real imag from B\r
- lxv vs18, 16(BO) // load real,imag from B\r
+.macro LOAD2x8 \r
+ LOAD2x8O 0,0 \r
+.endm\r
+\r
+\r
+.macro LOAD2x8O OffsetA,OffsetB\r
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B\r
+ lxv vs18, (\OffsetB+16)(BO) // load real,imag from B \r
xxswapd vs17, vs16\r
xxswapd vs19, vs18\r
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs1, (16+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs2, (32+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs3, (48+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs4, (64+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs5, (80+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs6, (96+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs7, (112+\OffsetA)(AO) // load real,imag from A\r
+ \r
+.endm\r
\r
- lxv vs0, 0(AO) // load real,imag from A\r
- lxv vs1, 16(AO) // load real,imag from A\r
- lxv vs2, 32(AO) // load real,imag from A\r
- lxv vs3, 48(AO) // load real,imag from A\r
\r
- lxv vs4, 64(AO) // load real,imag from A\r
- lxv vs5, 80(AO) // load real,imag from A\r
- lxv vs6, 96(AO) // load real,imag from A\r
- lxv vs7, 112(AO) // load real,imag from A\r
+.macro END2x8_NORMAL\r
+ END2x8 AO,BO,128,32\r
+.endm\r
\r
-.if \Zero==1\r
- Zero2x8\r
-.endif\r
\r
+.macro END2x8_WITHOUT_ADD\r
+ END2x8 AO,BO,0,0\r
.endm\r
\r
-.macro END2x8_NORMAL\r
- END2x8 AO,BO,128,32\r
-.endm\r
\r
.macro END2x8 AREG, BREG, OffsetA, OffsetB\r
-\r
.if \OffsetB != 0\r
addi \BREG, \BREG, \OffsetB\r
.endif\r
.if \OffsetA != 0\r
addi \AREG, \AREG, \OffsetA\r
.endif\r
-\r
xvmaddadp vs32, vs0, vs16\r
xvmaddadp vs48, vs0, vs18\r
-\r
+ xvmaddadp vs33, vs0, vs17\r
+ xvmaddadp vs49, vs0, vs19\r
xvmaddadp vs34, vs1, vs16\r
xvmaddadp vs50, vs1, vs18\r
-\r
+ xvmaddadp vs35, vs1, vs17\r
+ xvmaddadp vs51, vs1, vs19\r
xvmaddadp vs36, vs2, vs16\r
xvmaddadp vs52, vs2, vs18\r
-\r
+ xvmaddadp vs37, vs2, vs17\r
+ xvmaddadp vs53, vs2, vs19\r
xvmaddadp vs38, vs3, vs16\r
xvmaddadp vs54, vs3, vs18\r
-\r
+ xvmaddadp vs39, vs3, vs17\r
+ xvmaddadp vs55, vs3, vs19\r
xvmaddadp vs40, vs4, vs16\r
xvmaddadp vs56, vs4, vs18\r
-\r
+ xvmaddadp vs41, vs4, vs17\r
+ xvmaddadp vs57, vs4, vs19\r
xvmaddadp vs42, vs5, vs16\r
xvmaddadp vs58, vs5, vs18\r
-\r
+ xvmaddadp vs43, vs5, vs17\r
+ xvmaddadp vs59, vs5, vs19\r
xvmaddadp vs44, vs6, vs16\r
xvmaddadp vs60, vs6, vs18\r
-\r
+ xvmaddadp vs45, vs6, vs17\r
+ xvmaddadp vs61, vs6, vs19\r
xvmaddadp vs46, vs7, vs16\r
xvmaddadp vs62, vs7, vs18\r
+ xvmaddadp vs47, vs7, vs17\r
+ xvmaddadp vs63, vs7, vs19\r
+.endm\r
\r
\r
- xvmaddadp vs33, vs0, vs17\r
- xvmaddadp vs49, vs0, vs19\r
-\r
- xvmaddadp vs35, vs1, vs17\r
- xvmaddadp vs51, vs1, vs19\r
-\r
- xvmaddadp vs37, vs2, vs17\r
- xvmaddadp vs53, vs2, vs19\r
-\r
- xvmaddadp vs39, vs3, vs17\r
- xvmaddadp vs55, vs3, vs19\r
-\r
- xvmaddadp vs41, vs4, vs17\r
- xvmaddadp vs57, vs4, vs19\r
+.macro LOAD2x8_2\r
+ LOAD2x8_2O 0,0\r
+.endm \r
\r
- xvmaddadp vs43, vs5, vs17\r
- xvmaddadp vs59, vs5, vs19\r
\r
- xvmaddadp vs45, vs6, vs17\r
- xvmaddadp vs61, vs6, vs19\r
+.macro LOAD2x8_2O OffsetA,OffsetB\r
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B\r
+ lxv vs18, (\OffsetB+16)(BO) // load real,imag from B\r
+ lxv vs20, (\OffsetB+32)(BO) // load real,imag from B\r
+ lxv vs22, (\OffsetB+48)(BO) // load real,imag from B \r
+ xxswapd vs17, vs16\r
+ xxswapd vs19, vs18\r
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs1, (16+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs2, (32+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs3, (48+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs4, (64+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs5, (80+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs6, (96+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs7, (112+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs8, (128+0+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs9, (128+16+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs10, (128+32+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs11, (128+48+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs12, (128+64+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs13, (128+80+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs14, (128+96+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs15, (128+112+\OffsetA)(AO) // load real,imag from A\r
+.endm \r
+\r
+\r
+.macro END2x8_2 \r
+ /* for the two-step load the offsets are 256 (A) and 64 (B) */
+ KERNEL2x8_2 AO,BO, 256,64,0 ,1,1 \r
+.endm\r
+ \r
\r
- xvmaddadp vs47, vs7, vs17\r
- xvmaddadp vs63, vs7, vs19\r
\r
+.macro KERNEL2x8_E2 OffsetA,OffsetB, Index,IsLast \r
+ KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 \r
.endm\r
\r
-.macro KERNEL2x8_L OffsetA,OffsetB, Index,IsLast\r
- KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0\r
-.endm\r
\r
-.macro KERNEL2x8_E OffsetA,OffsetB, Index,IsLast\r
- KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1\r
+.macro KERNEL2x8_L2 OffsetA,OffsetB, Index,IsLast\r
+ KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 \r
.endm\r
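/* Naming scheme used throughout: the _L2 form runs two unrolled k-steps and
   keeps preloading the next pair (Complete=0); the _E2 form is the epilogue
   (Complete=1) that skips the lookahead loads and, when IsLast=1, advances
   AO/BO by the caller-supplied offsets instead of the full 2-step stride. */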
\r
\r
.macro KERNEL2x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete\r
-\r
- lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real,imag from B\r
- lxv vs22, DISP4(\Index,16+\OffsetB)(\BREG) // load real,imag from B\r
xvmaddadp vs32, vs0, vs16\r
xvmaddadp vs48, vs0, vs18\r
xvmaddadp vs33, vs0, vs17\r
xvmaddadp vs49, vs0, vs19\r
-\r
- xxswapd vs21, vs20\r
- xxswapd vs23, vs22\r
-\r
+ xxswapd vs21, vs20\r
+ xxswapd vs23, vs22\r
xvmaddadp vs34, vs1, vs16\r
xvmaddadp vs50, vs1, vs18\r
-\r
- lxv vs8, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs9, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A\r
-\r
xvmaddadp vs35, vs1, vs17\r
xvmaddadp vs51, vs1, vs19\r
-\r
- lxv vs10, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs11, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A\r
-\r
+.if \Complete==0 \r
+ lxv vs0, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs1, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A\r
+.endif \r
xvmaddadp vs36, vs2, vs16\r
xvmaddadp vs52, vs2, vs18\r
-\r
- lxv vs12, DISP16(\Index, 64 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs13, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A\r
-\r
xvmaddadp vs37, vs2, vs17\r
xvmaddadp vs53, vs2, vs19\r
-\r
- lxv vs14, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs15, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A\r
-\r
-.if \IsLast==1\r
-.if \Complete==1 \r
- addi \AREG, \AREG, DISP16(\Index,128+\OffsetA)\r
- addi \BREG, \BREG, DISP4(\Index,32+\OffsetB)\r
-.endif\r
-.endif\r
-\r
-\r
xvmaddadp vs38, vs3, vs16\r
xvmaddadp vs54, vs3, vs18\r
-\r
-.if \Complete==0\r
- lxv vs0, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs1, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A\r
-.endif\r
-\r
-\r
xvmaddadp vs39, vs3, vs17\r
xvmaddadp vs55, vs3, vs19\r
-\r
-.if \Complete==0\r
- lxv vs2, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs3, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A\r
-.endif\r
+.if \Complete==0 \r
+ lxv vs2, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs3, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A\r
+.endif \r
xvmaddadp vs40, vs4, vs16\r
xvmaddadp vs56, vs4, vs18\r
-\r
xvmaddadp vs41, vs4, vs17\r
xvmaddadp vs57, vs4, vs19\r
-\r
xvmaddadp vs42, vs5, vs16\r
xvmaddadp vs58, vs5, vs18\r
xvmaddadp vs43, vs5, vs17\r
xvmaddadp vs59, vs5, vs19\r
-\r
-.if \Complete==0\r
- lxv vs4, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs5, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A\r
-.endif\r
-\r
+.if \Complete==0 \r
+ lxv vs4, DISP16(\Index,64+ \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs5, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A\r
+.endif \r
xvmaddadp vs44, vs6, vs16\r
xvmaddadp vs60, vs6, vs18\r
xvmaddadp vs45, vs6, vs17\r
xvmaddadp vs61, vs6, vs19\r
-\r
xvmaddadp vs46, vs7, vs16\r
xvmaddadp vs62, vs7, vs18\r
xvmaddadp vs47, vs7, vs17\r
- xvmaddadp vs63, vs7, vs19\r
-\r
-.if \Complete==0\r
- lxv vs6, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs7, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A\r
+ xvmaddadp vs63, vs7, vs19 \r
+.if \Complete==0 \r
+ lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B\r
+ lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B\r
.endif\r
-\r
xvmaddadp vs32, vs8, vs20\r
xvmaddadp vs48, vs8, vs22\r
.if \Complete==0\r
- lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real imag from B\r
- lxv vs18, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B\r
-.endif\r
-.if \Complete==0\r
-.if \IsLast==1 \r
- addi \AREG, \AREG, DISP16(\Index,256)\r
- addi \BREG, \BREG, DISP4(\Index,64)\r
-.endif\r
-\r
-.endif\r
+ lxv vs6, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs7, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A \r
+.endif \r
xvmaddadp vs33, vs8, vs21\r
xvmaddadp vs49, vs8, vs23\r
-\r
-.if \Complete==0\r
- xxswapd vs17, vs16\r
- xxswapd vs19, vs18\r
+.if \Complete==0 \r
+ xxswapd vs17, vs16\r
+ xxswapd vs19, vs18\r
.endif\r
-\r
xvmaddadp vs34, vs9, vs20\r
xvmaddadp vs50, vs9, vs22\r
xvmaddadp vs35, vs9, vs21\r
xvmaddadp vs51, vs9, vs23\r
-\r
+.if \Complete==0 \r
+ lxv vs8, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs9, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A\r
+.endif\r
xvmaddadp vs36, vs10, vs20\r
xvmaddadp vs52, vs10, vs22\r
xvmaddadp vs37, vs10, vs21\r
xvmaddadp vs53, vs10, vs23\r
-\r
xvmaddadp vs38, vs11, vs20\r
xvmaddadp vs54, vs11, vs22\r
xvmaddadp vs39, vs11, vs21\r
xvmaddadp vs55, vs11, vs23\r
-\r
+.if \Complete==0 \r
+ lxv vs10, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs11, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A\r
+.endif \r
xvmaddadp vs40, vs12, vs20\r
xvmaddadp vs56, vs12, vs22\r
xvmaddadp vs41, vs12, vs21\r
xvmaddadp vs57, vs12, vs23\r
-\r
xvmaddadp vs42, vs13, vs20\r
xvmaddadp vs58, vs13, vs22\r
xvmaddadp vs43, vs13, vs21\r
xvmaddadp vs59, vs13, vs23\r
-\r
+.if \Complete==0 \r
+ lxv vs12, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs13, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A\r
+.endif \r
xvmaddadp vs44, vs14, vs20\r
xvmaddadp vs60, vs14, vs22\r
xvmaddadp vs45, vs14, vs21\r
xvmaddadp vs61, vs14, vs23\r
-\r
xvmaddadp vs46, vs15, vs20\r
xvmaddadp vs62, vs15, vs22\r
xvmaddadp vs47, vs15, vs21\r
xvmaddadp vs63, vs15, vs23\r
-\r
+.if \Complete==0 \r
+ lxv vs14, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs15, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B\r
+ lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B\r
+.endif\r
+.if \IsLast==1\r
+.if \Complete==1\r
+ addi \AREG, \AREG, DISP16(\Index,\OffsetA)\r
+ addi \BREG, \BREG, DISP4(\Index,\OffsetB)\r
+.else\r
+ addi \AREG, \AREG, DISP16(\Index,256)\r
+ addi \BREG, \BREG, DISP4(\Index,64)\r
+.endif\r
+.endif \r
.endm\r
\r
+ \r
+\r
+\r
+\r
.macro KERNEL2x8\r
- LOAD2x8 0\r
+ LOAD2x8\r
END2x8 AO, BO, 128,32\r
.endm\r
\r
-.macro SAVE2x8\r
\r
+.macro SAVE2x8\r
add T1, CO ,LDC \r
SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0\r
SAVE8 vs48,vs49,vs50,vs51,vs52,vs53,vs54,vs55,vs56,vs57,vs58,vs59,vs60,vs61,vs62,vs63,T1,0 \r
addi CO, CO, 128\r
-\r
.endm\r
-\r
/**********************************************************************************************\r
* Macros for N=2 and M=4
**********************************************************************************************/\r
\r
+\r
.macro Zero2x4\r
xxlxor vs32, vs32, vs32\r
xxlxor vs33, vs33, vs33\r
xxlxor vs47, vs47, vs47\r
.endm\r
\r
-.macro LOAD2x4 Zero\r
-\r
- lxv vs16, 0(BO) // load real imag from B\r
- lxv vs18, 16(BO) // load real,imag from B\r
- xxswapd vs17, vs16\r
- xxswapd vs19, vs18\r
\r
- lxv vs0, 0(AO) // load real,imag from A\r
- lxv vs1, 16(AO) // load real,imag from A\r
- lxv vs2, 32(AO) // load real,imag from A\r
- lxv vs3, 48(AO) // load real,imag from A\r
+.macro LOAD2x4 \r
+ LOAD2x4O 0,0 \r
+.endm\r
\r
-.if \Zero==1\r
- Zero2x4\r
-.endif\r
\r
+.macro LOAD2x4O OffsetA,OffsetB\r
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B\r
+ lxv vs18, (\OffsetB+16)(BO) // load real,imag from B \r
+ xxswapd vs17, vs16\r
+ xxswapd vs19, vs18\r
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs1, (16+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs2, (32+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs3, (48+\OffsetA)(AO) // load real,imag from A \r
.endm\r
\r
+\r
.macro END2x4_NORMAL\r
END2x4 AO,BO,64,32\r
.endm\r
\r
-.macro END2x4 AREG, BREG, OffsetA, OffsetB\r
\r
+.macro END2x4_WITHOUT_ADD\r
+ END2x4 AO,BO,0,0\r
+.endm\r
+\r
+\r
+.macro END2x4 AREG, BREG, OffsetA, OffsetB\r
.if \OffsetB != 0\r
addi \BREG, \BREG, \OffsetB\r
.endif\r
.if \OffsetA != 0\r
addi \AREG, \AREG, \OffsetA\r
.endif\r
-\r
xvmaddadp vs32, vs0, vs16\r
- xvmaddadp vs33, vs0, vs17\r
xvmaddadp vs40, vs0, vs18\r
+ xvmaddadp vs33, vs0, vs17\r
xvmaddadp vs41, vs0, vs19\r
-\r
xvmaddadp vs34, vs1, vs16\r
- xvmaddadp vs35, vs1, vs17\r
xvmaddadp vs42, vs1, vs18\r
+ xvmaddadp vs35, vs1, vs17\r
xvmaddadp vs43, vs1, vs19\r
- \r
xvmaddadp vs36, vs2, vs16\r
- xvmaddadp vs37, vs2, vs17\r
xvmaddadp vs44, vs2, vs18\r
+ xvmaddadp vs37, vs2, vs17\r
xvmaddadp vs45, vs2, vs19\r
-\r
xvmaddadp vs38, vs3, vs16\r
- xvmaddadp vs39, vs3, vs17\r
xvmaddadp vs46, vs3, vs18\r
+ xvmaddadp vs39, vs3, vs17\r
xvmaddadp vs47, vs3, vs19\r
\r
.endm\r
\r
-.macro KERNEL2x4_L OffsetA,OffsetB, Index,IsLast\r
- KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0\r
+\r
+.macro LOAD2x4_2\r
+ LOAD2x4_2O 0,0\r
+.endm \r
+\r
+\r
+.macro LOAD2x4_2O OffsetA,OffsetB\r
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B\r
+ lxv vs18, (\OffsetB+16)(BO) // load real,imag from B\r
+ lxv vs20, (\OffsetB+32)(BO) // load real,imag from B\r
+ lxv vs22, (\OffsetB+48)(BO) // load real,imag from B \r
+ xxswapd vs17, vs16\r
+ xxswapd vs19, vs18\r
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs1, (16+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs2, (32+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs3, (48+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs8, (64+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs9, (80+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs10, (96+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs11, (112+\OffsetA)(AO) // load real,imag from A \r
+.endm \r
+\r
+\r
+.macro END2x4_2 \r
+ /* for the two-step load the offsets are 128 (A) and 64 (B) */
+ KERNEL2x4_2 AO,BO, 128,64,0 ,1,1 \r
+.endm\r
+ \r
+\r
+\r
+.macro KERNEL2x4_E2 OffsetA,OffsetB, Index,IsLast \r
+ KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 \r
.endm\r
\r
-.macro KERNEL2x4_E OffsetA,OffsetB, Index,IsLast\r
- KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1\r
+\r
+.macro KERNEL2x4_L2 OffsetA,OffsetB, Index,IsLast\r
+ KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 \r
.endm\r
\r
-.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete\r
\r
- lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real,imag from B\r
- lxv vs22, DISP4(\Index,16+\OffsetB)(\BREG) // load real,imag from B\r
- \r
+.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete\r
xvmaddadp vs32, vs0, vs16\r
- xvmaddadp vs33, vs0, vs17\r
- xxswapd vs21, vs20\r
- xxswapd vs23, vs22 \r
- lxv vs8, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs9, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A\r
xvmaddadp vs40, vs0, vs18\r
+ xvmaddadp vs33, vs0, vs17\r
xvmaddadp vs41, vs0, vs19\r
- lxv vs10, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs11, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A\r
-.if \IsLast==1\r
-.if \Complete==1\r
- addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) \r
- addi \AREG, \AREG, DISP8(\Index,64+\OffsetA) \r
-.endif\r
-.endif\r
-\r
+ xxswapd vs21, vs20\r
+ xxswapd vs23, vs22\r
xvmaddadp vs34, vs1, vs16\r
- xvmaddadp vs35, vs1, vs17\r
xvmaddadp vs42, vs1, vs18\r
+ xvmaddadp vs35, vs1, vs17\r
xvmaddadp vs43, vs1, vs19\r
- \r
- xvmaddadp vs36, vs2, vs16\r
- xvmaddadp vs37, vs2, vs17\r
-.if \Complete==0\r
- lxv vs0, DISP8(\Index,64+ \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs1, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A\r
+.if \Complete==0 \r
+ lxv vs0, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs1, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A\r
.endif \r
+ xvmaddadp vs36, vs2, vs16\r
xvmaddadp vs44, vs2, vs18\r
+ xvmaddadp vs37, vs2, vs17\r
xvmaddadp vs45, vs2, vs19\r
- \r
xvmaddadp vs38, vs3, vs16\r
- xvmaddadp vs39, vs3, vs17\r
xvmaddadp vs46, vs3, vs18\r
+ xvmaddadp vs39, vs3, vs17\r
xvmaddadp vs47, vs3, vs19\r
-\r
-\r
.if \Complete==0 \r
- lxv vs2, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs3, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A\r
-\r
+ lxv vs2, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs3, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A\r
+.endif \r
+ \r
+.if \Complete==0 \r
+ lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B\r
+ lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B\r
.endif\r
xvmaddadp vs32, vs8, vs20\r
+ xvmaddadp vs40, vs8, vs22 \r
xvmaddadp vs33, vs8, vs21\r
-.if \Complete==0\r
- lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real imag from B\r
- lxv vs18, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B\r
-.if \IsLast==1 \r
- addi \AREG, \AREG, DISP8(\Index,128)\r
- addi \BREG, \BREG, DISP4(\Index,64) \r
-.endif \r
-.endif\r
-\r
-.if \Complete==0\r
- xxswapd vs17, vs16\r
- xxswapd vs19, vs18\r
-.endif\r
- \r
- xvmaddadp vs40, vs8, vs22\r
xvmaddadp vs41, vs8, vs23\r
-\r
+.if \Complete==0 \r
+ xxswapd vs17, vs16\r
+ xxswapd vs19, vs18\r
+.endif\r
xvmaddadp vs34, vs9, vs20\r
- xvmaddadp vs35, vs9, vs21\r
xvmaddadp vs42, vs9, vs22\r
+ xvmaddadp vs35, vs9, vs21\r
xvmaddadp vs43, vs9, vs23\r
-\r
+.if \Complete==0 \r
+ lxv vs8, DISP8(\Index,64+0+ \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs9, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A\r
+.endif\r
xvmaddadp vs36, vs10, vs20\r
- xvmaddadp vs37, vs10, vs21\r
xvmaddadp vs44, vs10, vs22\r
+ xvmaddadp vs37, vs10, vs21\r
xvmaddadp vs45, vs10, vs23\r
-\r
xvmaddadp vs38, vs11, vs20\r
- xvmaddadp vs39, vs11, vs21\r
xvmaddadp vs46, vs11, vs22\r
+ xvmaddadp vs39, vs11, vs21\r
xvmaddadp vs47, vs11, vs23\r
-\r
+.if \Complete==0 \r
+ lxv vs10, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs11, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A\r
+.endif \r
+ \r
+.if \Complete==0 \r
+ lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B\r
+ lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B\r
+.endif\r
+.if \IsLast==1\r
+.if \Complete==1\r
+ addi \AREG, \AREG, DISP8(\Index,\OffsetA)\r
+ addi \BREG, \BREG, DISP4(\Index,\OffsetB)\r
+.else\r
+ addi \AREG, \AREG, DISP8(\Index,128)\r
+ addi \BREG, \BREG, DISP4(\Index,64)\r
+.endif\r
+.endif \r
.endm\r
+ \r
+\r
\r
.macro KERNEL2x4\r
- LOAD2x4 0\r
+ LOAD2x4\r
END2x4 AO, BO, 64,32\r
.endm\r
\r
+\r
+\r
.macro SAVE2x4 \r
add T1, CO ,LDC \r
SAVE4 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,CO,0\r
SAVE4 vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,T1,0 \r
addi CO, CO, 64\r
-\r
.endm\r
-\r
/**********************************************************************************************\r
* Macros for N=2 and M=2
**********************************************************************************************/\r
\r
+\r
.macro Zero2x2\r
xxlxor vs32, vs32, vs32\r
xxlxor vs33, vs33, vs33\r
xxlxor vs37, vs37, vs37\r
xxlxor vs38, vs38, vs38\r
xxlxor vs39, vs39, vs39\r
-.endm\r
\r
-.macro LOAD2x2 Zero\r
+.endm\r
\r
- lxv vs16, 0(BO) // load real imag from B\r
- lxv vs18, 16(BO) // load real,imag from B\r
- xxswapd vs17, vs16\r
- xxswapd vs19, vs18\r
\r
- lxv vs0, 0(AO) // load real,imag from A\r
- lxv vs1, 16(AO) // load real,imag from A\r
+.macro LOAD2x2 \r
+ LOAD2x2O 0,0 \r
+.endm\r
\r
\r
-.if \Zero==1\r
- Zero2x2\r
-.endif \r
+.macro LOAD2x2O OffsetA,OffsetB\r
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B\r
+ lxv vs18, (\OffsetB+16)(BO) // load real,imag from B \r
+ xxswapd vs17, vs16\r
+ xxswapd vs19, vs18\r
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs1, (16+\OffsetA)(AO) // load real,imag from A\r
+ \r
.endm\r
\r
+\r
.macro END2x2_NORMAL\r
END2x2 AO,BO,32,32\r
.endm\r
\r
-.macro END2x2 AREG, BREG, OffsetA, OffsetB\r
\r
+.macro END2x2_WITHOUT_ADD\r
+ END2x2 AO,BO,0,0\r
+.endm\r
+\r
+\r
+.macro END2x2 AREG, BREG, OffsetA, OffsetB\r
.if \OffsetB != 0\r
addi \BREG, \BREG, \OffsetB\r
.endif\r
.if \OffsetA != 0\r
addi \AREG, \AREG, \OffsetA\r
.endif\r
-\r
xvmaddadp vs32, vs0, vs16\r
- xvmaddadp vs33, vs0, vs17\r
xvmaddadp vs36, vs0, vs18\r
+ xvmaddadp vs33, vs0, vs17\r
xvmaddadp vs37, vs0, vs19\r
-\r
xvmaddadp vs34, vs1, vs16\r
- xvmaddadp vs35, vs1, vs17 \r
xvmaddadp vs38, vs1, vs18\r
- xvmaddadp vs39, vs1, vs19\r
+ xvmaddadp vs35, vs1, vs17\r
+ xvmaddadp vs39, vs1, vs19 \r
\r
.endm\r
\r
-.macro KERNEL2x2_L OffsetA,OffsetB, Index,IsLast\r
- KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0\r
+\r
+.macro LOAD2x2_2\r
+ LOAD2x2_2O 0,0\r
+.endm \r
+\r
+\r
+.macro LOAD2x2_2O OffsetA,OffsetB\r
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B\r
+ lxv vs18, (\OffsetB+16)(BO) // load real,imag from B\r
+ lxv vs20, (\OffsetB+32)(BO) // load real,imag from B\r
+ lxv vs22, (\OffsetB+48)(BO) // load real,imag from B \r
+ xxswapd vs17, vs16\r
+ xxswapd vs19, vs18\r
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs1, (16+\OffsetA)(AO) // load real,imag from A \r
+ lxv vs8, (32+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs9, (48+\OffsetA)(AO) // load real,imag from A\r
+ \r
+.endm \r
+\r
+\r
+.macro END2x2_2 \r
+ /* for the two-step load the offsets are 64 (A) and 64 (B) */
+ KERNEL2x2_2 AO,BO, 64,64,0 ,1,1 \r
.endm\r
+ \r
+\r
\r
-.macro KERNEL2x2_E OffsetA,OffsetB, Index,IsLast\r
- KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1\r
+.macro KERNEL2x2_E2 OffsetA,OffsetB, Index,IsLast \r
+ KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 \r
.endm\r
\r
-.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete\r
\r
- lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real,imag from B\r
- lxv vs22, DISP4(\Index,16+\OffsetB)(\BREG) // load real,imag from B\r
- xvmaddadp vs32, vs0, vs16\r
- xvmaddadp vs33, vs0, vs17\r
- xxswapd vs21, vs20\r
- xxswapd vs23, vs22\r
+.macro KERNEL2x2_L2 OffsetA,OffsetB, Index,IsLast\r
+ KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 \r
+.endm\r
\r
- lxv vs8, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs9, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A\r
-.if \IsLast==1\r
-.if \Complete==1\r
- addi \AREG, \AREG, DISP4(\Index,32+\OffsetA)\r
- addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) \r
-.endif\r
-.endif \r
+\r
+.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete\r
+ xvmaddadp vs32, vs0, vs16\r
xvmaddadp vs36, vs0, vs18\r
+ xvmaddadp vs33, vs0, vs17\r
xvmaddadp vs37, vs0, vs19\r
-\r
+ xxswapd vs21, vs20\r
+ xxswapd vs23, vs22\r
xvmaddadp vs34, vs1, vs16\r
- xvmaddadp vs35, vs1, vs17 \r
xvmaddadp vs38, vs1, vs18\r
+ xvmaddadp vs35, vs1, vs17\r
xvmaddadp vs39, vs1, vs19\r
-\r
-.if \Complete==0\r
- lxv vs0, DISP4(\Index,32 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs1, DISP4(\Index,48+ \OffsetA)(\AREG) // load real,imag from A\r
-.endif\r
-.if \Complete==0\r
- lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real imag from B\r
- lxv vs18, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B\r
-.if \IsLast==1 \r
- addi \AREG, \AREG, DISP4(\Index,64)\r
- addi \BREG, \BREG, DISP4(\Index,64)\r
-.endif \r
+.if \Complete==0 \r
+ lxv vs0, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs1, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A\r
+.endif \r
+.if \Complete==0 \r
+ lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B\r
+ lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B\r
.endif\r
-\r
xvmaddadp vs32, vs8, vs20\r
+ xvmaddadp vs36, vs8, vs22 \r
xvmaddadp vs33, vs8, vs21\r
-\r
-.if \Complete==0\r
- xxswapd vs17, vs16\r
- xxswapd vs19, vs18\r
-.endif \r
- xvmaddadp vs36, vs8, vs22\r
xvmaddadp vs37, vs8, vs23\r
-\r
+.if \Complete==0 \r
+ xxswapd vs17, vs16\r
+ xxswapd vs19, vs18\r
+.endif\r
xvmaddadp vs34, vs9, vs20\r
- xvmaddadp vs35, vs9, vs21 \r
-\r
xvmaddadp vs38, vs9, vs22\r
+ xvmaddadp vs35, vs9, vs21\r
xvmaddadp vs39, vs9, vs23\r
+.if \Complete==0 \r
+ lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B\r
+ lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B\r
+.endif\r
+.if \Complete==0 \r
+ lxv vs8, DISP4(\Index,32+0+ \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs9, DISP4(\Index,32+16 + \OffsetA)(\AREG) // load real,imag from A\r
+.endif\r
+ \r
+ \r
\r
+.if \IsLast==1\r
+.if \Complete==1\r
+ addi \AREG, \AREG, DISP4(\Index,\OffsetA)\r
+ addi \BREG, \BREG, DISP4(\Index,\OffsetB)\r
+.else\r
+ addi \AREG, \AREG, DISP4(\Index,64)\r
+ addi \BREG, \BREG, DISP4(\Index,64)\r
+.endif\r
+.endif \r
.endm\r
+ \r
+\r
\r
.macro KERNEL2x2\r
- LOAD2x2 0\r
+ LOAD2x2\r
END2x2 AO, BO, 32,32\r
.endm\r
\r
+\r
+\r
.macro SAVE2x2 \r
add T1, CO ,LDC \r
SAVE2 vs32,vs33,vs34,vs35,CO,0\r
SAVE2 vs36,vs37,vs38,vs39,T1,0 \r
addi CO, CO, 32 \r
.endm\r
-\r
/**********************************************************************************************\r
* Macros for N=2 and M=1
**********************************************************************************************/\r
\r
+\r
+\r
.macro Zero2x1\r
xxlxor vs32, vs32, vs32\r
xxlxor vs33, vs33, vs33\r
xxlxor vs34, vs34, vs34\r
xxlxor vs35, vs35, vs35\r
+ \r
.endm\r
\r
-.macro LOAD2x1 Zero\r
- lxv vs0, 0(AO) // load real,imag from A\r
\r
- lxv vs16, 0(BO) // load real imag from B\r
- lxv vs18, 16(BO) // load real,imag from B\r
+.macro LOAD2x1 \r
+ LOAD2x1O 0,0 \r
+.endm\r
\r
+\r
+.macro LOAD2x1O OffsetA,OffsetB\r
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B\r
+ lxv vs18, (\OffsetB+16)(BO) // load real,imag from B \r
xxswapd vs17, vs16\r
xxswapd vs19, vs18\r
-.if \Zero==1\r
- Zero2x1\r
-.endif \r
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A \r
.endm\r
\r
+\r
.macro END2x1_NORMAL\r
END2x1 AO,BO,16,32\r
.endm\r
\r
-.macro END2x1 AREG, BREG, OffsetA, OffsetB\r
\r
-.if \OffsetA != 0\r
- addi \AREG, \AREG, \OffsetA\r
-.endif\r
+.macro END2x1_WITHOUT_ADD\r
+ END2x1 AO,BO,0,0\r
+.endm\r
+\r
+\r
+.macro END2x1 AREG, BREG, OffsetA, OffsetB\r
.if \OffsetB != 0\r
addi \BREG, \BREG, \OffsetB\r
.endif\r
-\r
+.if \OffsetA != 0\r
+ addi \AREG, \AREG, \OffsetA\r
+.endif\r
xvmaddadp vs32, vs0, vs16\r
+ xvmaddadp vs34, vs0, vs18\r
xvmaddadp vs33, vs0, vs17\r
+ xvmaddadp vs35, vs0, vs19 \r
+.endm\r
\r
- xvmaddadp vs34, vs0, vs18\r
- xvmaddadp vs35, vs0, vs19\r
\r
-.endm\r
+.macro LOAD2x1_2\r
+ LOAD2x1_2O 0,0\r
+.endm \r
\r
-.macro KERNEL2x1_L OffsetA,OffsetB, Index,IsLast\r
- KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0\r
-.endm\r
\r
-.macro KERNEL2x1_E OffsetA,OffsetB, Index,IsLast\r
- KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1\r
+.macro LOAD2x1_2O OffsetA,OffsetB\r
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B\r
+ lxv vs18, (\OffsetB+16)(BO) // load real,imag from B\r
+ lxv vs20, (\OffsetB+32)(BO) // load real,imag from B\r
+ lxv vs22, (\OffsetB+48)(BO) // load real,imag from B \r
+ xxswapd vs17, vs16\r
+ xxswapd vs19, vs18\r
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs8, (16+\OffsetA)(AO) // load real,imag from A \r
+.endm \r
+\r
+\r
+.macro END2x1_2 \r
+ /* for the two-step load the offsets are 32 (A) and 64 (B) */
+ KERNEL2x1_2 AO,BO, 32,64,0 ,1,1 \r
.endm\r
+ \r
\r
-.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete\r
\r
- lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real,imag from B\r
- lxv vs22, DISP4(\Index,16+\OffsetB)(\BREG) // load real,imag from B\r
+.macro KERNEL2x1_E2 OffsetA,OffsetB, Index,IsLast \r
+ KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 \r
+.endm\r
\r
- lxv vs8, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A\r
\r
- xxswapd vs21, vs20\r
- xxswapd vs23, vs22\r
-.if \IsLast==1\r
-.if \Complete==1\r
- addi \AREG, \AREG, DISP2(\Index,16+\OffsetA)\r
- addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) \r
-.endif\r
-.endif\r
+.macro KERNEL2x1_L2 OffsetA,OffsetB, Index,IsLast\r
+ KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 \r
+.endm\r
\r
- xvmaddadp vs32, vs0, vs16\r
- xvmaddadp vs33, vs0, vs17\r
\r
+.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete\r
+ xxswapd vs21, vs20\r
+ xxswapd vs23, vs22 \r
+ xvmaddadp vs32, vs0, vs16\r
xvmaddadp vs34, vs0, vs18\r
+ xvmaddadp vs33, vs0, vs17\r
xvmaddadp vs35, vs0, vs19\r
-\r
-.if \Complete==0\r
- lxv vs0, DISP2(\Index,16 + \OffsetA)(\AREG) // load real,imag from A\r
-\r
-.endif\r
-.if \Complete==0\r
- lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real imag from B\r
- lxv vs18, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B\r
-.if \IsLast==1 \r
- addi \AREG, \AREG, DISP2(\Index,32)\r
- addi \BREG, \BREG, DISP4(\Index,64)\r
-.endif \r
-.endif\r
- \r
-.if \Complete==0\r
- xxswapd vs17, vs16\r
- xxswapd vs19, vs18\r
+.if \Complete==0 \r
+ lxv vs0, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A \r
+.endif \r
+.if \Complete==0 \r
+ lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B\r
+ lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B\r
.endif\r
-\r
+.if \Complete==0 \r
+ xxswapd vs17, vs16\r
+ xxswapd vs19, vs18\r
+.endif \r
xvmaddadp vs32, vs8, vs20\r
+ xvmaddadp vs34, vs8, vs22 \r
xvmaddadp vs33, vs8, vs21\r
-\r
- xvmaddadp vs34, vs8, vs22\r
xvmaddadp vs35, vs8, vs23\r
-\r
+.if \Complete==0 \r
+ lxv vs8, DISP2(\Index,16+0+ \OffsetA)(\AREG) // load real,imag from A \r
+.endif\r
+ \r
+.if \Complete==0 \r
+ lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B\r
+ lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B\r
+.endif\r
+.if \IsLast==1\r
+.if \Complete==1\r
+ addi \AREG, \AREG, DISP2(\Index,\OffsetA)\r
+ addi \BREG, \BREG, DISP4(\Index,\OffsetB)\r
+.else\r
+ addi \AREG, \AREG, DISP2(\Index,32)\r
+ addi \BREG, \BREG, DISP4(\Index,64)\r
+.endif\r
+.endif \r
.endm\r
+ \r
+\r
\r
.macro KERNEL2x1\r
- LOAD2x1 0\r
+ LOAD2x1\r
END2x1 AO, BO, 16,32\r
.endm\r
\r
+\r
+\r
.macro SAVE2x1\r
add T1, CO ,LDC \r
SAVE1 vs32,vs33,CO,0\r
.endm\r
\r
/**********************************************************************************************\r
* Macros for N=1 and M=8
**********************************************************************************************/\r
+\r
+\r
.macro Zero1x8\r
xxlxor vs32, vs32, vs32\r
xxlxor vs33, vs33, vs33\r
xxlxor vs45, vs45, vs45\r
xxlxor vs46, vs46, vs46\r
xxlxor vs47, vs47, vs47\r
+ xxlxor vs48, vs48, vs48\r
.endm\r
\r
-.macro LOAD1x8 Zero\r
\r
- lxv vs16, 0(BO) // load real imag from B\r
- xxswapd vs17, vs16\r
- lxv vs0, 0(AO) // load real,imag from A\r
- lxv vs1, 16(AO) // load real,imag from A\r
- lxv vs2, 32(AO) // load real,imag from A\r
- lxv vs3, 48(AO) // load real,imag from A\r
-\r
- lxv vs4, 64(AO) // load real,imag from A\r
- lxv vs5, 80(AO) // load real,imag from A\r
- lxv vs6, 96(AO) // load real,imag from A\r
- lxv vs7, 112(AO) // load real,imag from A\r
-\r
-.if \Zero==1\r
- Zero1x8\r
-.endif\r
+.macro LOAD1x8 \r
+ LOAD1x8O 0,0 \r
+.endm\r
+\r
\r
+.macro LOAD1x8O OffsetA,OffsetB\r
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B \r
+ xxswapd vs17, vs16 \r
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs1, (16+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs2, (32+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs3, (48+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs4, (64+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs5, (80+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs6, (96+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs7, (112+\OffsetA)(AO) // load real,imag from A\r
+ \r
.endm\r
\r
+\r
.macro END1x8_NORMAL\r
END1x8 AO,BO,128,16\r
.endm\r
\r
-.macro END1x8 AREG, BREG, OffsetA, OffsetB\r
\r
+.macro END1x8_WITHOUT_ADD\r
+ END1x8 AO,BO,0,0\r
+.endm\r
+\r
+\r
+.macro END1x8 AREG, BREG, OffsetA, OffsetB\r
.if \OffsetB != 0\r
addi \BREG, \BREG, \OffsetB\r
.endif\r
.if \OffsetA != 0\r
addi \AREG, \AREG, \OffsetA\r
.endif\r
-\r
xvmaddadp vs32, vs0, vs16\r
xvmaddadp vs33, vs0, vs17\r
+\r
xvmaddadp vs34, vs1, vs16\r
xvmaddadp vs35, vs1, vs17\r
+\r
xvmaddadp vs36, vs2, vs16\r
xvmaddadp vs37, vs2, vs17\r
+\r
xvmaddadp vs38, vs3, vs16\r
xvmaddadp vs39, vs3, vs17\r
+\r
xvmaddadp vs40, vs4, vs16\r
xvmaddadp vs41, vs4, vs17\r
+\r
xvmaddadp vs42, vs5, vs16\r
xvmaddadp vs43, vs5, vs17\r
+\r
xvmaddadp vs44, vs6, vs16\r
xvmaddadp vs45, vs6, vs17\r
+\r
xvmaddadp vs46, vs7, vs16\r
xvmaddadp vs47, vs7, vs17\r
\r
.endm\r
\r
-.macro KERNEL1x8_L OffsetA,OffsetB, Index,IsLast\r
- KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0\r
+\r
+.macro LOAD1x8_2\r
+ LOAD1x8_2O 0,0\r
+.endm \r
+\r
+\r
+.macro LOAD1x8_2O OffsetA,OffsetB\r
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B\r
+ lxv vs20, (\OffsetB+16)(BO) // load real,imag from B\r
+ xxswapd vs17, vs16\r
+\r
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs1, (16+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs2, (32+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs3, (48+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs4, (64+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs5, (80+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs6, (96+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs7, (112+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs8, (128+0+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs9, (128+16+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs10, (128+32+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs11, (128+48+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs12, (128+64+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs13, (128+80+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs14, (128+96+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs15, (128+112+\OffsetA)(AO) // load real,imag from A\r
+.endm \r
+\r
+\r
+.macro END1x8_2 \r
+ /* for the two-step load the offsets are 256 (A) and 32 (B) */
+ KERNEL1x8_2 AO,BO, 256,32,0 ,1,1 \r
.endm\r
+ \r
+\r
\r
-.macro KERNEL1x8_E OffsetA,OffsetB, Index,IsLast\r
- KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1\r
+.macro KERNEL1x8_E2 OffsetA,OffsetB, Index,IsLast \r
+ KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 \r
.endm\r
\r
-.macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete\r
\r
- lxv vs20, DISP2(\Index, 0+\OffsetB)(\BREG) // load real,imag from B\r
- xxswapd vs21, vs20\r
+.macro KERNEL1x8_L2 OffsetA,OffsetB, Index,IsLast\r
+ KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 \r
+.endm\r
\r
\r
- lxv vs8, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs9, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A\r
+.macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete\r
xvmaddadp vs32, vs0, vs16\r
- xvmaddadp vs33, vs0, vs17 \r
- lxv vs10, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs11, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A\r
+ xvmaddadp vs33, vs0, vs17\r
+ xxswapd vs21, vs20\r
xvmaddadp vs34, vs1, vs16\r
xvmaddadp vs35, vs1, vs17\r
- lxv vs12, DISP16(\Index, 64 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs13, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A\r
+.if \Complete==0 \r
+ lxv vs0, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs1, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A\r
+.endif \r
xvmaddadp vs36, vs2, vs16\r
xvmaddadp vs37, vs2, vs17\r
- lxv vs14, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs15, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A\r
\r
xvmaddadp vs38, vs3, vs16\r
xvmaddadp vs39, vs3, vs17\r
-.if \Complete==0\r
- lxv vs0, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs1, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A\r
+.if \Complete==0 \r
+ lxv vs2, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs3, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A\r
.endif \r
xvmaddadp vs40, vs4, vs16\r
xvmaddadp vs41, vs4, vs17\r
-.if \Complete==0 \r
- lxv vs2, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs3, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A\r
-.endif \r
+\r
xvmaddadp vs42, vs5, vs16\r
xvmaddadp vs43, vs5, vs17\r
+.if \Complete==0 \r
+ lxv vs4, DISP16(\Index,64+ \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs5, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A\r
+.endif \r
xvmaddadp vs44, vs6, vs16\r
xvmaddadp vs45, vs6, vs17\r
-.if \Complete==0\r
- lxv vs4, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs5, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A\r
-.endif \r
+\r
xvmaddadp vs46, vs7, vs16\r
xvmaddadp vs47, vs7, vs17\r
-\r
-\r
+.if \Complete==0 \r
+ lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B\r
+.endif\r
+.if \Complete==0 \r
+ xxswapd vs17, vs16\r
+.endif\r
xvmaddadp vs32, vs8, vs20\r
xvmaddadp vs33, vs8, vs21\r
-.if \Complete==0 \r
- lxv vs6, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs7, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A \r
-.endif\r
+.if \Complete==0\r
+ lxv vs6, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs7, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A \r
+.endif \r
xvmaddadp vs34, vs9, vs20\r
xvmaddadp vs35, vs9, vs21\r
-.if \Complete==0\r
- lxv vs16, DISP2(\Index, 16+\OffsetB)(\BREG) // load real imag from B\r
- xxswapd vs17,vs16\r
-.endif\r
-.if \IsLast==1\r
-.if \Complete==1\r
- addi \AREG, \AREG, DISP16(\Index,128+\OffsetA)\r
- addi \BREG, \BREG, DISP2(\Index,16+\OffsetB)\r
-.else\r
- addi \AREG, \AREG, DISP16(\Index,256)\r
- addi \BREG, \BREG, DISP2(\Index,32)\r
-.endif\r
+.if \Complete==0 \r
+ lxv vs8, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs9, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A\r
.endif\r
xvmaddadp vs36, vs10, vs20\r
xvmaddadp vs37, vs10, vs21\r
-\r
xvmaddadp vs38, vs11, vs20\r
xvmaddadp vs39, vs11, vs21\r
-\r
+.if \Complete==0 \r
+ lxv vs10, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs11, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A\r
+.endif \r
xvmaddadp vs40, vs12, vs20\r
xvmaddadp vs41, vs12, vs21\r
xvmaddadp vs42, vs13, vs20\r
xvmaddadp vs43, vs13, vs21\r
+.if \Complete==0 \r
+ lxv vs12, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs13, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A\r
+.endif \r
xvmaddadp vs44, vs14, vs20\r
xvmaddadp vs45, vs14, vs21\r
xvmaddadp vs46, vs15, vs20\r
xvmaddadp vs47, vs15, vs21\r
-\r
+.if \Complete==0 \r
+ lxv vs14, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs15, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B\r
+.endif\r
+.if \IsLast==1\r
+.if \Complete==1\r
+ addi \AREG, \AREG, DISP16(\Index,\OffsetA)\r
+ addi \BREG, \BREG, DISP2(\Index,\OffsetB)\r
+.else\r
+ addi \AREG, \AREG, DISP16(\Index,256)\r
+ addi \BREG, \BREG, DISP2(\Index,32)\r
+.endif\r
+.endif \r
.endm\r
\r
+ \r
+\r
+\r
+\r
.macro KERNEL1x8\r
- LOAD1x8 0\r
+ LOAD1x8\r
END1x8 AO, BO, 128,16\r
.endm\r
\r
-.macro SAVE1x8\r
\r
- SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0 \r
+.macro SAVE1x8\r
+ SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0\r
addi CO, CO, 128\r
-\r
.endm\r
-\r
/**********************************************************************************************\r
* Macros for N=1 and M=4
**********************************************************************************************/\r
\r
+\r
.macro Zero1x4\r
xxlxor vs32, vs32, vs32\r
xxlxor vs33, vs33, vs33\r
xxlxor vs39, vs39, vs39\r
.endm\r
\r
-.macro LOAD1x4 Zero\r
\r
- lxv vs16, 0(BO) // load real imag from B\r
- xxswapd vs17,vs16\r
- lxv vs0, 0(AO) // load real,imag from A\r
- lxv vs1, 16(AO) // load real,imag from A\r
- lxv vs2, 32(AO) // load real,imag from A\r
- lxv vs3, 48(AO) // load real,imag from A\r
+.macro LOAD1x4 \r
+ LOAD1x4O 0,0 \r
+.endm\r
\r
-.if \Zero==1\r
- Zero1x4\r
-.endif\r
\r
+.macro LOAD1x4O OffsetA,OffsetB\r
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B\r
+ xxswapd vs17, vs16\r
+\r
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs1, (16+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs2, (32+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs3, (48+\OffsetA)(AO) // load real,imag from A \r
+ \r
.endm\r
\r
+\r
.macro END1x4_NORMAL\r
END1x4 AO,BO,64,16\r
.endm\r
\r
-.macro END1x4 AREG, BREG, OffsetA, OffsetB\r
\r
+.macro END1x4_WITHOUT_ADD\r
+ END1x4 AO,BO,0,0\r
+.endm\r
+\r
+\r
+.macro END1x4 AREG, BREG, OffsetA, OffsetB\r
.if \OffsetB != 0\r
addi \BREG, \BREG, \OffsetB\r
.endif\r
.if \OffsetA != 0\r
addi \AREG, \AREG, \OffsetA\r
.endif\r
-\r
xvmaddadp vs32, vs0, vs16\r
xvmaddadp vs33, vs0, vs17\r
+\r
xvmaddadp vs34, vs1, vs16\r
xvmaddadp vs35, vs1, vs17\r
+\r
xvmaddadp vs36, vs2, vs16\r
xvmaddadp vs37, vs2, vs17\r
+\r
xvmaddadp vs38, vs3, vs16\r
xvmaddadp vs39, vs3, vs17\r
\r
.endm\r
\r
-.macro KERNEL1x4_L OffsetA,OffsetB, Index,IsLast\r
- KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0\r
+\r
+.macro LOAD1x4_2\r
+ LOAD1x4_2O 0,0\r
+.endm \r
+\r
+\r
+.macro LOAD1x4_2O OffsetA,OffsetB\r
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B\r
+ lxv vs20, (\OffsetB+16)(BO) // load real,imag from B\r
+ xxswapd vs17, vs16\r
+\r
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs1, (16+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs2, (32+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs3, (48+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs8, (64+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs9, (80+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs10, (96+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs11, (112+\OffsetA)(AO) // load real,imag from A \r
+.endm \r
+\r
+\r
+.macro END1x4_2 \r
+ /* for the two-step load the offsets are 128 (A) and 32 (B) */
+ KERNEL1x4_2 AO,BO, 128,32,0 ,1,1 \r
.endm\r
+ \r
+\r
\r
-.macro KERNEL1x4_E OffsetA,OffsetB, Index,IsLast\r
- KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1\r
+.macro KERNEL1x4_E2 OffsetA,OffsetB, Index,IsLast \r
+ KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 \r
.endm\r
\r
-.macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete\r
\r
- lxv vs20, DISP2(\Index, 0+\OffsetB)(\BREG) // load real,imag from B\r
- xxswapd vs21,vs20\r
+.macro KERNEL1x4_L2 OffsetA,OffsetB, Index,IsLast\r
+ KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 \r
+.endm\r
+\r
\r
+.macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete\r
xvmaddadp vs32, vs0, vs16\r
xvmaddadp vs33, vs0, vs17\r
-\r
- lxv vs8, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs9, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A\r
+ xxswapd vs21, vs20\r
xvmaddadp vs34, vs1, vs16\r
- xvmaddadp vs35, vs1, vs17 \r
- lxv vs10, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs11, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A\r
-\r
+ xvmaddadp vs35, vs1, vs17\r
+.if \Complete==0 \r
+ lxv vs0, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs1, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A\r
+.endif \r
xvmaddadp vs36, vs2, vs16\r
xvmaddadp vs37, vs2, vs17\r
+\r
xvmaddadp vs38, vs3, vs16\r
xvmaddadp vs39, vs3, vs17\r
-\r
- xvmaddadp vs40, vs0, vs18\r
- xvmaddadp vs41, vs0, vs19\r
- xvmaddadp vs42, vs1, vs18\r
- xvmaddadp vs43, vs1, vs19\r
- xvmaddadp vs44, vs2, vs18\r
- xvmaddadp vs45, vs2, vs19\r
- xvmaddadp vs46, vs3, vs18\r
- xvmaddadp vs47, vs3, vs19\r
-\r
-.if \Complete==0\r
- lxv vs0, DISP8(\Index,64+ \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs1, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A\r
-.endif\r
.if \Complete==0 \r
- lxv vs2, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs3, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs2, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs3, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A\r
+.endif \r
\r
+.if \Complete==0 \r
+ lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B\r
.endif\r
-.if \Complete==0\r
- lxv vs16, DISP2(\Index, 16+\OffsetB)(\BREG) // load real imag from B\r
- xxswapd vs17,vs16\r
-.endif\r
-.if \IsLast==1\r
-.if \Complete==1\r
- addi \AREG, \AREG, DISP8(\Index,64+\OffsetA)\r
- addi \BREG, \BREG, DISP2(\Index,16+\OffsetB)\r
-.else\r
- addi \AREG, \AREG, DISP8(\Index,128)\r
- addi \BREG, \BREG, DISP2(\Index,32)\r
-.endif\r
-.endif\r
-\r
xvmaddadp vs32, vs8, vs20\r
xvmaddadp vs33, vs8, vs21\r
+.if \Complete==0 \r
+ xxswapd vs17, vs16\r
+.endif\r
xvmaddadp vs34, vs9, vs20\r
xvmaddadp vs35, vs9, vs21\r
+.if \Complete==0 \r
+ lxv vs8, DISP8(\Index,64+0+ \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs9, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A\r
+.endif\r
xvmaddadp vs36, vs10, vs20\r
xvmaddadp vs37, vs10, vs21\r
xvmaddadp vs38, vs11, vs20\r
xvmaddadp vs39, vs11, vs21\r
-\r
- xvmaddadp vs40, vs8, vs22\r
- xvmaddadp vs41, vs8, vs23\r
- xvmaddadp vs42, vs9, vs22\r
- xvmaddadp vs43, vs9, vs23\r
- xvmaddadp vs44, vs10, vs22\r
- xvmaddadp vs45, vs10, vs23\r
- xvmaddadp vs46, vs11, vs22\r
- xvmaddadp vs47, vs11, vs23\r
-\r
+.if \Complete==0 \r
+ lxv vs10, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs11, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A\r
+.endif \r
+ \r
+.if \Complete==0 \r
+ lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B\r
+.endif\r
+.if \IsLast==1\r
+.if \Complete==1\r
+ addi \AREG, \AREG, DISP8(\Index,\OffsetA)\r
+ addi \BREG, \BREG, DISP2(\Index,\OffsetB)\r
+.else\r
+ addi \AREG, \AREG, DISP8(\Index,128)\r
+ addi \BREG, \BREG, DISP2(\Index,32)\r
+.endif\r
+.endif \r
.endm\r
+ \r
+\r
\r
.macro KERNEL1x4\r
- LOAD1x4 0\r
+ LOAD1x4\r
END1x4 AO, BO, 64,16\r
.endm\r
\r
-.macro SAVE1x4\r
+\r
+\r
+.macro SAVE1x4 \r
SAVE4 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,CO,0\r
addi CO, CO, 64\r
-\r
.endm\r
-\r
/**********************************************************************************************\r
* Macros for N=1 and M=2
**********************************************************************************************/\r
\r
+\r
.macro Zero1x2\r
xxlxor vs32, vs32, vs32\r
xxlxor vs33, vs33, vs33\r
xxlxor vs34, vs34, vs34\r
- xxlxor vs35, vs35, vs35\r
+ xxlxor vs35, vs35, vs35 \r
+\r
.endm\r
\r
-.macro LOAD1x2 Zero\r
\r
- lxv vs16, 0(BO) // load real imag from B\r
- xxswapd vs17,vs16\r
- lxv vs0, 0(AO) // load real,imag from A\r
- lxv vs1, 16(AO) // load real,imag from A\r
- \r
-.if \Zero==1\r
- Zero1x2\r
-.endif\r
+.macro LOAD1x2 \r
+ LOAD1x2O 0,0 \r
+.endm\r
+\r
+\r
+.macro LOAD1x2O OffsetA,OffsetB\r
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B\r
+ xxswapd vs17, vs16\r
+\r
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs1, (16+\OffsetA)(AO) // load real,imag from A \r
\r
.endm\r
\r
+\r
.macro END1x2_NORMAL\r
END1x2 AO,BO,32,16\r
.endm\r
\r
-.macro END1x2 AREG, BREG, OffsetA, OffsetB\r
\r
+.macro END1x2_WITHOUT_ADD\r
+ END1x2 AO,BO,0,0\r
+.endm\r
+\r
+\r
+.macro END1x2 AREG, BREG, OffsetA, OffsetB\r
.if \OffsetB != 0\r
addi \BREG, \BREG, \OffsetB\r
.endif\r
.if \OffsetA != 0\r
addi \AREG, \AREG, \OffsetA\r
.endif\r
-\r
xvmaddadp vs32, vs0, vs16\r
xvmaddadp vs33, vs0, vs17\r
+\r
xvmaddadp vs34, vs1, vs16\r
xvmaddadp vs35, vs1, vs17\r
\r
.endm\r
\r
-.macro KERNEL1x2_L OffsetA,OffsetB, Index,IsLast\r
- KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0\r
+\r
+.macro LOAD1x2_2\r
+ LOAD1x2_2O 0,0\r
+.endm \r
+\r
+\r
+.macro LOAD1x2_2O OffsetA,OffsetB\r
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B\r
+ lxv vs20, (\OffsetB+16)(BO) // load real,imag from B\r
+ xxswapd vs17, vs16\r
+\r
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs1, (16+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs8, (32+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs9, (48+\OffsetA)(AO) // load real,imag from A\r
+.endm \r
+\r
+\r
+.macro END1x2_2 \r
+ /* for the two-step load the offsets are 64 (A) and 32 (B) */
+ KERNEL1x2_2 AO,BO, 64,32,0 ,1,1 \r
.endm\r
+ \r
+\r
\r
-.macro KERNEL1x2_E OffsetA,OffsetB, Index,IsLast\r
- KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1\r
+.macro KERNEL1x2_E2 OffsetA,OffsetB, Index,IsLast \r
+ KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 \r
.endm\r
\r
-.macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete\r
\r
- lxv vs20, DISP2(\Index, 0+\OffsetB)(\BREG) // load real,imag from B\r
- xxswapd vs21,vs20\r
+.macro KERNEL1x2_L2 OffsetA,OffsetB, Index,IsLast\r
+ KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 \r
+.endm\r
+\r
\r
+.macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete\r
xvmaddadp vs32, vs0, vs16\r
xvmaddadp vs33, vs0, vs17\r
-\r
- lxv vs8, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs9, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A\r
-\r
+ xxswapd vs21, vs20\r
xvmaddadp vs34, vs1, vs16\r
xvmaddadp vs35, vs1, vs17\r
-.if \Complete==0\r
- lxv vs0, DISP4(\Index,32 + \OffsetA)(\AREG) // load real,imag from A\r
- lxv vs1, DISP4(\Index,48+ \OffsetA)(\AREG) // load real,imag from A \r
+.if \Complete==0 \r
+ lxv vs0, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs1, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A\r
+.endif \r
+.if \Complete==0 \r
+ lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B\r
.endif\r
-.if \Complete==0\r
- lxv vs16, DISP2(\Index, 16+\OffsetB)(\BREG) // load real imag from B\r
- xxswapd vs17,vs16\r
+ xvmaddadp vs32, vs8, vs20\r
+ xvmaddadp vs33, vs8, vs21\r
+.if \Complete==0 \r
+ xxswapd vs17, vs16\r
+.endif\r
+ xvmaddadp vs34, vs9, vs20\r
+ xvmaddadp vs35, vs9, vs21\r
+.if \Complete==0 \r
+ lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B\r
.endif\r
+.if \Complete==0 \r
+ lxv vs8, DISP4(\Index,32+0+ \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs9, DISP4(\Index,32+16 + \OffsetA)(\AREG) // load real,imag from A\r
+.endif\r
+ \r
+ \r
+\r
.if \IsLast==1\r
.if \Complete==1\r
- addi \AREG, \AREG, DISP4(\Index,32+\OffsetA)\r
- addi \BREG, \BREG, DISP2(\Index,16+\OffsetB)\r
+ addi \AREG, \AREG, DISP4(\Index,\OffsetA)\r
+ addi \BREG, \BREG, DISP2(\Index,\OffsetB)\r
.else\r
- addi \AREG, \AREG, DISP4(\Index,64)\r
+ addi \AREG, \AREG, DISP4(\Index,64)\r
addi \BREG, \BREG, DISP2(\Index,32)\r
.endif\r
-.endif\r
-\r
- xvmaddadp vs32, vs8, vs20\r
- xvmaddadp vs33, vs8, vs21\r
- xvmaddadp vs34, vs9, vs20\r
- xvmaddadp vs35, vs9, vs21\r
-\r
+.endif \r
.endm\r
+ \r
+\r
\r
.macro KERNEL1x2\r
- LOAD1x2 0\r
+ LOAD1x2\r
END1x2 AO, BO, 32,16\r
.endm\r
\r
-.macro SAVE1x2\r
+\r
+\r
+.macro SAVE1x2 \r
SAVE2 vs32,vs33,vs34,vs35,CO,0\r
addi CO, CO, 32 \r
.endm\r
-\r
/**********************************************************************************************\r
* Macros for N=1 and M=1
**********************************************************************************************/\r
\r
+\r
+\r
.macro Zero1x1\r
xxlxor vs32, vs32, vs32\r
- xxlxor vs33, vs33, vs33\r
+ xxlxor vs33, vs33, vs33 \r
.endm\r
\r
-.macro LOAD1x1 Zero\r
- lxv vs0, 0(AO) // load real,imag from A\r
\r
- lxv vs16, 0(BO) // load real imag from B\r
- xxswapd vs17, vs16\r
-.if \Zero==1\r
- Zero1x1\r
-.endif\r
- \r
+.macro LOAD1x1 \r
+ LOAD1x1O 0,0 \r
.endm\r
\r
+\r
+.macro LOAD1x1O OffsetA,OffsetB\r
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B\r
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A \r
+ xxswapd vs17, vs16\r
+\r
+.endm\r
+\r
+\r
.macro END1x1_NORMAL\r
END1x1 AO,BO,16,16\r
.endm\r
\r
-.macro END1x1 AREG, BREG, OffsetA, OffsetB\r
\r
-.if \OffsetA != 0\r
- addi \AREG, \AREG, \OffsetA\r
-.endif\r
+.macro END1x1_WITHOUT_ADD\r
+ END1x1 AO,BO,0,0\r
+.endm\r
+\r
+\r
+.macro END1x1 AREG, BREG, OffsetA, OffsetB\r
.if \OffsetB != 0\r
addi \BREG, \BREG, \OffsetB\r
.endif\r
+.if \OffsetA != 0\r
+ addi \AREG, \AREG, \OffsetA\r
+.endif\r
+ xvmaddadp vs32, vs0, vs16 \r
+ xvmaddadp vs33, vs0, vs17 \r
+.endm\r
\r
- xvmaddadp vs32, vs0, vs16\r
- xvmaddadp vs33, vs0, vs17\r
\r
+.macro LOAD1x1_2\r
+ LOAD1x1_2O 0,0\r
+.endm \r
\r
-.endm\r
\r
-.macro KERNEL1x1_L OffsetA,OffsetB, Index,IsLast\r
- KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0\r
+.macro LOAD1x1_2O OffsetA,OffsetB\r
+ lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B\r
+ lxv vs20, (\OffsetB+16)(BO) // load real,imag from B\r
+ xxswapd vs17, vs16\r
+\r
+ lxv vs0, (0+\OffsetA)(AO) // load real,imag from A\r
+ lxv vs8, (16+\OffsetA)(AO) // load real,imag from A \r
+.endm \r
+\r
+\r
+.macro END1x1_2 \r
+ /* for the two-step load the offsets are 32 (A) and 32 (B) */
+ KERNEL1x1_2 AO,BO, 32,32,0 ,1,1 \r
.endm\r
+ \r
\r
-.macro KERNEL1x1_E OffsetA,OffsetB, Index,IsLast\r
- KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1\r
+\r
+.macro KERNEL1x1_E2 OffsetA,OffsetB, Index,IsLast \r
+ KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 \r
.endm\r
\r
-.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete\r
- lxv vs20, DISP2(\Index, 0+\OffsetB)(\BREG) // load real,imag from B\r
- xxswapd vs21, vs20\r
\r
- lxv vs8, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A\r
- \r
- xvmaddadp vs32, vs0, vs16\r
- xvmaddadp vs33, vs0, vs17\r
+.macro KERNEL1x1_L2 OffsetA,OffsetB, Index,IsLast\r
+ KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 \r
+.endm\r
\r
-.if \Complete==0\r
- lxv vs0, DISP2(\Index,16 + \OffsetA)(\AREG) // load real,imag from A\r
+\r
+.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete\r
+ xxswapd vs21, vs20\r
+ xvmaddadp vs32, vs0, vs16 \r
+ xvmaddadp vs33, vs0, vs17 \r
+.if \Complete==0 \r
+ lxv vs0, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A \r
+.endif \r
+.if \Complete==0 \r
+ lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B\r
.endif\r
-.if \Complete==0\r
- lxv vs16, DISP2(\Index, 16+\OffsetB)(\BREG) // load real imag from B\r
- xxswapd vs17, vs16 \r
+.if \Complete==0 \r
+ xxswapd vs17, vs16\r
+.endif \r
+ xvmaddadp vs32, vs8, vs20\r
+ xvmaddadp vs33, vs8, vs21 \r
+.if \Complete==0 \r
+ lxv vs8, DISP2(\Index,16+0+ \OffsetA)(\AREG) // load real,imag from A \r
+.endif\r
+ \r
+.if \Complete==0 \r
+ lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B\r
.endif\r
-\r
.if \IsLast==1\r
.if \Complete==1\r
- addi \AREG, \AREG, DISP2(\Index,16+\OffsetA)\r
- addi \BREG, \BREG, DISP2(\Index,16+\OffsetB)\r
+ addi \AREG, \AREG, DISP2(\Index,\OffsetA)\r
+ addi \BREG, \BREG, DISP2(\Index,\OffsetB)\r
.else\r
 addi \AREG, \AREG, DISP2(\Index,32)\r
addi \BREG, \BREG, DISP2(\Index,32)\r
.endif\r
-.endif\r
-\r
- xvmaddadp vs32, vs8, vs20\r
- xvmaddadp vs33, vs8, vs21\r
-\r
-\r
+.endif \r
.endm\r
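+\r
+/* Same two-iterations-per-call pattern as the wider kernels, narrowed to a\r
+ single A vector: the FMAs on (vs0, vs16/vs17) overlap the loads for the\r
+ next pair, and the FMAs on (vs8, vs20/vs21) overlap the remaining loads. */\r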
+ \r
+\r
\r
.macro KERNEL1x1\r
- LOAD1x1 0\r
+ LOAD1x1\r
END1x1 AO, BO, 16,16\r
-\r
.endm\r
\r
-.macro SAVE1x1 \r
+\r
+\r
+.macro SAVE1x1\r
SAVE1 vs32,vs33,CO,0\r
addi CO, CO, 16 \r
.endm\r
\r
+/**************************** TRMM POINTER REFRESH MACROS *************************/\r
+\r
+\r
+.macro SHIFT_REG REG1,REG2,SHIFT_VAL\r
+ .if \SHIFT_VAL==16 \r
+ slwi \REG1, \REG2, 8 \r
+ .elseif \SHIFT_VAL==8 \r
+ slwi \REG1, \REG2, 7 \r
+ .elseif \SHIFT_VAL==4\r
+ slwi \REG1, \REG2, 6 \r
+ .elseif \SHIFT_VAL==2\r
+ slwi \REG1, \REG2, 5 \r
+ .elseif \SHIFT_VAL==1\r
+ slwi \REG1, \REG2, 4 \r
+ .endif\r
+.endm\r
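+/* SHIFT_REG turns an element count into a byte offset: each double-complex\r
+ value is 16 bytes, so REG1 = REG2 * SHIFT_VAL * 16, i.e. a left shift by\r
+ log2(SHIFT_VAL) + 4. For example SHIFT_VAL==8: off*8*16 = off << 7. */\r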
+/*\r
+//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))\r
+// ptrbb = bb;\r
+// #else\r
+// ptrba += off*C_A;\r
+// ptrbb = bb + off*C_B;\r
+// #endif\r
+*/\r
+\r
+\r
+.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B\r
+ #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))\r
+ /* ptrbb = bb;*/\r
+ mr \PTR_B,\B_VAL /* refresh BPOINT */\r
+ #else\r
+ /*\r
+ // ptrba =ptrba+ off*C_A;\r
+ // ptrbb = bb + off*C_B; \r
+ */\r
+ SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */\r
+ SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */\r
+ add \PTR_B, \B_VAL , T4 /* Add values to BO */\r
+ add \PTR_A, \PTR_A, T2 /* Add values to AO */\r
+ #endif \r
+.endm\r
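+/* Illustrative call (register names OFF and B are placeholders; the real\r
+ call sites appear elsewhere in this patch): for a 1x2 tile, with 2 values\r
+ of A and 1 value of B consumed per k,\r
+ REFRESH_POINTERS AO,BO,OFF,B, 2,1\r
+ rebases BO to B + off*1 values and advances AO by off*2 values. */\r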
+\r
+/*\r
+// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))\r
+// temp = bk-off;\r
+// #elif defined(LEFT)\r
+// temp = off+INCR_A; // number of values in A\r
+// #else\r
+// temp = off+INCR_B; // number of values in B\r
+// #endif\r
+*/\r
+\r
+\r
+.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B\r
+ #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))\r
+ /* temp = bk-off;*/\r
+ sub \TEMP_BK,\BK_VAL,\OFF_VAL\r
+ #elif defined(LEFT)\r
+ /* temp = off+INCR_A; // number of values in A */\r
+ addi \TEMP_BK, \OFF_VAL, \INCR_A\r
+ #else\r
+ /* temp = off+INCR_B // number of values in B*/\r
+ addi \TEMP_BK,\OFF_VAL, \INCR_B\r
+ #endif\r
+.endm\r
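+/* REFRESH_TEMP_BK yields the effective K trip count for the current TRMM\r
+ tile: bk-off on one triangular side, otherwise off+INCR_A (LEFT) or\r
+ off+INCR_B, mirroring the reference C code quoted above. */\r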
+/*\r
+// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))\r
+// temp = bk - off;\r
+// #ifdef LEFT\r
+// temp -= C_A; // number of values in A\r
+// #else\r
+// temp -= C_B; // number of values in B\r
+// #endif\r
+// ptrba += temp*C_A;\r
+// ptrbb += temp*C_B;\r
+// #endif\r
+// #ifdef LEFT\r
+// off += C_A; // number of values in A\r
+// #endif\r
+*/\r
+ \r
+\r
+\r
+.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B\r
+ #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))\r
+ /*temp = bk - off;*/\r
+ sub \TEMP_BK,\BK_VAL,\OFF_VAL\r
+ #ifdef LEFT\r
+ /*temp -= C_A; // number of values in A*/\r
+ addi \TEMP_BK,\TEMP_BK,-\C_A\r
+ #else\r
+ /*temp -= C_B; // number of values in B*/\r
+ addi \TEMP_BK,\TEMP_BK,-\C_B \r
+ #endif\r
+ /*ptrba += temp*C_A;\r
+ ptrbb += temp*C_B;*/ \r
+ SHIFT_REG T4,\TEMP_BK,\C_A\r
+ SHIFT_REG T2,\TEMP_BK,\C_B\r
+ add \PTR_A, \PTR_A, T4 /* ptrba += temp*C_A */\r
+ add \PTR_B, \PTR_B, T2 /* ptrbb += temp*C_B */\r
+ #endif\r
+ #ifdef LEFT\r
+ /*off += C_A; // number of values in A*/\r
+ addi \OFF_VAL,\OFF_VAL,\C_A\r
+ #endif\r
+.endm
\ No newline at end of file