vmulpd %ymm0 , %ymm9 , %ymm9\r
vmulpd %ymm0 , %ymm10, %ymm10\r
vmulpd %ymm0 , %ymm11, %ymm11\r
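+	/* B_PR1 controls how far ahead of BUFFER1 we prefetch; the guarded */
+	/* prefetches below are interleaved with the arithmetic so that     */
+	/* their latency overlaps useful work                               */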
+#if B_PR1 >= 96\r
prefetcht0 128 + BUFFER1\r
+#endif\r
vmulpd %ymm0 , %ymm12, %ymm12\r
vmulpd %ymm0 , %ymm13, %ymm13\r
vmulpd %ymm0 , %ymm14, %ymm14\r
vmulpd %ymm0 , %ymm15, %ymm15\r
+#if B_PR1 >= 160\r
prefetcht0 192 + BUFFER1\r
+#endif\r
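+	/* rearrange the four accumulators before they are written back:     */
+	/* vpermilpd swaps the two doubles within each 128-bit lane,         */
+	/* vperm2f128 swaps the lanes, and the blends recombine the pieces   */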
vpermilpd $ 0x05 , %ymm5, %ymm5\r
vpermilpd $ 0x05 , %ymm7, %ymm7\r
-\r
+#if B_PR1 >= 224\r
+ prefetcht0 256 + BUFFER1\r
+#endif\r
vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0\r
vblendpd $ 0x05, %ymm5, %ymm4, %ymm1\r
vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2\r
vblendpd $ 0x05, %ymm7, %ymm6, %ymm3\r
-\r
+#if B_PR1 >= 288\r
+ prefetcht0 320 + BUFFER1\r
+#endif\r
vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2\r
vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3\r
-\r
+#if B_PR1 >= 352\r
+ prefetcht0 384 + BUFFER1\r
+#endif\r
vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4\r
vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5\r
vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6\r
vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7\r
-\r
+#if B_PR1 >= 416\r
+ prefetcht0 448 + BUFFER1\r
+#endif\r
leaq (CO1, LDC, 2), %rax \r
\r
+#if B_PR1 >= 480\r
+ prefetcht0 512 + BUFFER1\r
+#endif\r
\r
#if !defined(TRMMKERNEL)\r
\r
\r
	/* here we prefetch the next source block of B */
	/* the increment should be proportional to the ratio GEMM_Q/GEMM_P */
- /* currently an increment of 128 byte is suitable */\r
+\r
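+	/* scale K by 8 so that (B, K, 8) addresses B + K*64 bytes, giving a */
+	/* second prefetch stream K*8 doubles ahead; sarq below undoes this  */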
salq $3, K\r
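+	/* with GEMM_P == GEMM_Q * 2 (linux x86_64) we touch two cache lines */
+	/* per stream and advance B by 128 bytes; under WINDOWS_ABI, where   */
+	/* GEMM_P == GEMM_Q * 4, one line per stream and 64 bytes suffice    */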
+#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */\r
+ prefetcht2 32(B)\r
+ prefetcht2 32(B, K, 8)\r
+ addq $64, B /* increment */\r
+#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */\r
prefetcht2 32(B)\r
prefetcht2 32(B, K, 8)\r
prefetcht2 96(B)\r
prefetcht2 96(B, K, 8)\r
addq $128, B /* increment */\r
+#endif\r
sarq $3, K\r
\r
decq I # i --\r
/**************************************************************************\r
* Rest of M \r
***************************************************************************/\r
- /* recover the original value of pointer B */\r
+\r
+ /* recover the original value of pointer B after prefetch */\r
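+	/* the loop above advanced B by the increment once per iteration,    */
+	/* i.e. M/4 times, so subtract (M >> 2) << 7 (128 bytes per step) or */
+	/* (M >> 2) << 6 (64 bytes per step) under WINDOWS_ABI               */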
movq M, I\r
sarq $2, I\r
+#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */\r
+ salq $6, I\r
+#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */\r
salq $7, I\r
+#endif\r
subq I, B\r
\r
.L12_20:\r
\r
	/* here we prefetch the next source block of B */
	/* the increment should be proportional to the ratio GEMM_Q/GEMM_P */
- /* currently an increment of 128 byte is suitable */\r
+\r
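+	/* same two-stream prefetch scheme as above, starting at offset 0 of B */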
salq $3, K\r
+#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */\r
+ prefetcht2 (B)\r
+ prefetcht2 (B, K, 8)\r
+ addq $64, B\r
+#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */\r
prefetcht2 (B)\r
prefetcht2 (B, K, 8)\r
prefetcht2 64(B)\r
prefetcht2 64(B, K, 8)\r
addq $128, B\r
+#endif\r
sarq $3, K\r
\r
decq I # i --\r
/* recover the original value of pointer B */\r
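+	/* as above: subtract the (M/4) * increment added to B while prefetching */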
movq M, I\r
sarq $2, I\r
+#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */\r
+ salq $6, I\r
+#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */\r
salq $7, I\r
+#endif\r
subq I, B\r
\r
.L13_20:\r