*******************************************************************************************/\r
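// KERNEL16x3_1..4: one k-step each of the unrolled 16x3 micro-kernel. xmm0 is
// reloaded with four floats of A four times per step, xmm1-xmm3 hold three
// broadcast values of B, and xmm4-xmm15 accumulate the 16x3 tile with FMA4
// (vfmaddps). The reordering below moves each prefetcht0 of A into the middle
// of the FMA stream, hoists the next step's first two vbroadcastss loads to
// the end of the macro, inserts a nop after the third vbroadcastss, and in
// KERNEL16x3_4 interleaves the BI/%rax pointer updates with the final FMAs,
// presumably to hide load and broadcast latency.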
\r
#define KERNEL16x3_1(xx) \\r
- prefetcht0 A_PR1(AO,%rax,SIZE) ;\\r
vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\\r
vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\\r
vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\\r
vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\\r
vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\\r
vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\\r
+ nop ;\\r
vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\\r
vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\\r
vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\\r
vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\\r
+ prefetcht0 A_PR1(AO,%rax,SIZE) ;\\r
vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\\r
vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\\r
vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\
@@ ... @@
vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\\r
vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\\r
+ vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\\r
+ vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\\r
vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\\r
\r
#define KERNEL16x3_2(xx) \\r
- prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\\r
- vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\\r
vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\\r
vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\\r
- vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\\r
vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\\r
vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\\r
+ nop ;\\r
vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\\r
vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\\r
vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\\r
vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\\r
+ prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\\r
vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\\r
vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\\r
vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\
@@ ... @@
vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\\r
vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\\r
+ vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\\r
+ vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\\r
vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\\r
\r
#define KERNEL16x3_3(xx) \\r
- prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\\r
- vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\\r
vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\\r
vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\\r
- vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\\r
vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\\r
vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\\r
+ nop ;\\r
vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\\r
vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\\r
vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\\r
vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\\r
+ prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\\r
vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\\r
vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\\r
vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\
@@ ... @@
vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\
vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\\r
vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\\r
+ vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\\r
+ vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\\r
vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\\r
\r
#define KERNEL16x3_4(xx) \\r
- prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\\r
- vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\\r
vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\\r
vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\\r
- vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\\r
vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\\r
vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\\r
+ nop ;\\r
vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\\r
vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\\r
vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\\r
vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\\r
+ prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\\r
vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\\r
vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\\r
vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\\r
vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\\r
+ addq $12, BI ;\\r
vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\\r
vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\\r
vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\\r
vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\\r
- vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\\r
- addq $12, BI ;\\r
addq $64, %rax ;\\r
+ vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\\r
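// The hunks below insert the same nop after the third vbroadcastss of the
// remaining compute macros (KERNEL16x3_SUB and, judging by the BO/AO offsets
// shown in the context lines, the KERNEL8x3 variants); the nop only pads the
// instruction stream and does not change the results.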
\r
#define KERNEL16x3_SUB(xx) \\r
vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\
@@ ... @@
vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\
vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\\r
vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\\r
+ nop ;\\r
vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\\r
vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\\r
vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\
@@ ... @@
vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\
vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\\r
vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\\r
+ nop ;\\r
vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\\r
vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\\r
vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\
@@ ... @@
vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\
vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\\r
vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\\r
+ nop ;\\r
vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\\r
vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\\r
vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\
@@ ... @@
vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\
vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\\r
vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\\r
+ nop ;\\r
vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\\r
vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\\r
vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\
@@ ... @@
vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\
vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\\r
vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\\r
+ nop ;\\r
vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\\r
vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\\r
vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\
@@ ... @@
vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\
vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\\r
vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\\r
+ nop ;\\r
vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\\r
vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\\r
vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\
@@ ... @@
leaq (B,%rax, SIZE), BO2 // next offset to BO2
leaq BUFFER1, BO // first buffer to BO\r
movq K, %rax\r
+ sarq $3 , %rax // K / 8\r
+ jz .L6_01a_2\r
+ ALIGN_4\r
+\r
+.L6_01a_1:\r
+\r
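// Unrolled packing loop for BUFFER1: each pass consumes 8 values of K as two
// identical 4-iteration blocks. Per iteration two consecutive floats are read
// from BO1 and one from BO2 (whose second value is skipped), and the three are
// stored contiguously, so BO advances by 12*SIZE while BO1 and BO2 advance by
// 8*SIZE.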
+ prefetcht0 512(BO1)\r
+ prefetcht0 512(BO2)\r
+ prefetchw 512(BO)\r
+\r
+ vmovsd 0 * SIZE(BO1), %xmm0\r
+ vmovsd 2 * SIZE(BO1), %xmm2\r
+ vmovsd 4 * SIZE(BO1), %xmm4\r
+ vmovsd 6 * SIZE(BO1), %xmm6\r
+ vmovss 0 * SIZE(BO2), %xmm1\r
+ vmovss 2 * SIZE(BO2), %xmm3\r
+ vmovss 4 * SIZE(BO2), %xmm5\r
+ vmovss 6 * SIZE(BO2), %xmm7\r
+ vmovsd %xmm0, 0*SIZE(BO)\r
+ vmovss %xmm1, 2*SIZE(BO)\r
+ vmovsd %xmm2, 3*SIZE(BO)\r
+ vmovss %xmm3, 5*SIZE(BO)\r
+ vmovsd %xmm4, 6*SIZE(BO)\r
+ vmovss %xmm5, 8*SIZE(BO)\r
+ vmovsd %xmm6, 9*SIZE(BO)\r
+ vmovss %xmm7,11*SIZE(BO)\r
+ addq $8*SIZE,BO1\r
+ addq $8*SIZE,BO2\r
+ addq $12*SIZE,BO\r
+\r
+ vmovsd 0 * SIZE(BO1), %xmm0\r
+ vmovsd 2 * SIZE(BO1), %xmm2\r
+ vmovsd 4 * SIZE(BO1), %xmm4\r
+ vmovsd 6 * SIZE(BO1), %xmm6\r
+ vmovss 0 * SIZE(BO2), %xmm1\r
+ vmovss 2 * SIZE(BO2), %xmm3\r
+ vmovss 4 * SIZE(BO2), %xmm5\r
+ vmovss 6 * SIZE(BO2), %xmm7\r
+ vmovsd %xmm0, 0*SIZE(BO)\r
+ vmovss %xmm1, 2*SIZE(BO)\r
+ vmovsd %xmm2, 3*SIZE(BO)\r
+ vmovss %xmm3, 5*SIZE(BO)\r
+ vmovsd %xmm4, 6*SIZE(BO)\r
+ vmovss %xmm5, 8*SIZE(BO)\r
+ vmovsd %xmm6, 9*SIZE(BO)\r
+ vmovss %xmm7,11*SIZE(BO)\r
+ addq $8*SIZE,BO1\r
+ addq $8*SIZE,BO2\r
+ addq $12*SIZE,BO\r
+\r
+ decq %rax\r
+ jnz .L6_01a_1\r
+\r
+\r
+\r
+.L6_01a_2:\r
+\r
+ movq K, %rax\r
+ andq $7, %rax // K % 8\r
+ jz .L6_02c\r
ALIGN_4\r
\r
+\r
.L6_02b:\r
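// Tail loop for the remaining K % 8 iterations of the BUFFER1 packing; the two
// BO1 values are now loaded with a single vmovsd instead of two vmovss.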
\r
- vmovss 0 * SIZE(BO1), %xmm0\r
- vmovss 1 * SIZE(BO1), %xmm1\r
+ vmovsd 0 * SIZE(BO1), %xmm0\r
vmovss 0 * SIZE(BO2), %xmm2\r
- vmovss %xmm0, 0*SIZE(BO)\r
- vmovss %xmm1, 1*SIZE(BO)\r
+ vmovsd %xmm0, 0*SIZE(BO)\r
vmovss %xmm2, 2*SIZE(BO)\r
addq $2*SIZE,BO1\r
addq $2*SIZE,BO2
@@ ... @@
leaq (BO1,%rax, SIZE), BO2 // next offset to BO2
leaq BUFFER2, BO // second buffer to BO\r
movq K, %rax\r
+ sarq $3 , %rax // K / 8\r
+ jz .L6_02c_2\r
ALIGN_4\r
\r
+.L6_02c_1:\r
+\r
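// Same 8-way unrolled packing for BUFFER2: one float per iteration comes from
// the odd offsets of BO1 and a pair comes from BO2, stored as three
// consecutive floats in BO.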
+ prefetcht0 512(BO2)\r
+ prefetchw 512(BO)\r
+\r
+ vmovsd 0 * SIZE(BO2), %xmm0\r
+ vmovsd 2 * SIZE(BO2), %xmm2\r
+ vmovsd 4 * SIZE(BO2), %xmm4\r
+ vmovsd 6 * SIZE(BO2), %xmm6\r
+ vmovss 1 * SIZE(BO1), %xmm1\r
+ vmovss 3 * SIZE(BO1), %xmm3\r
+ vmovss 5 * SIZE(BO1), %xmm5\r
+ vmovss 7 * SIZE(BO1), %xmm7\r
+ vmovss %xmm1, 0*SIZE(BO)\r
+ vmovsd %xmm0, 1*SIZE(BO)\r
+ vmovss %xmm3, 3*SIZE(BO)\r
+ vmovsd %xmm2, 4*SIZE(BO)\r
+ vmovss %xmm5, 6*SIZE(BO)\r
+ vmovsd %xmm4, 7*SIZE(BO)\r
+ vmovss %xmm7, 9*SIZE(BO)\r
+ vmovsd %xmm6,10*SIZE(BO)\r
+ addq $8*SIZE,BO1\r
+ addq $8*SIZE,BO2\r
+ addq $12*SIZE,BO\r
+\r
+\r
+ vmovsd 0 * SIZE(BO2), %xmm0\r
+ vmovsd 2 * SIZE(BO2), %xmm2\r
+ vmovsd 4 * SIZE(BO2), %xmm4\r
+ vmovsd 6 * SIZE(BO2), %xmm6\r
+ vmovss 1 * SIZE(BO1), %xmm1\r
+ vmovss 3 * SIZE(BO1), %xmm3\r
+ vmovss 5 * SIZE(BO1), %xmm5\r
+ vmovss 7 * SIZE(BO1), %xmm7\r
+ vmovss %xmm1, 0*SIZE(BO)\r
+ vmovsd %xmm0, 1*SIZE(BO)\r
+ vmovss %xmm3, 3*SIZE(BO)\r
+ vmovsd %xmm2, 4*SIZE(BO)\r
+ vmovss %xmm5, 6*SIZE(BO)\r
+ vmovsd %xmm4, 7*SIZE(BO)\r
+ vmovss %xmm7, 9*SIZE(BO)\r
+ vmovsd %xmm6,10*SIZE(BO)\r
+ addq $8*SIZE,BO1\r
+ addq $8*SIZE,BO2\r
+ addq $12*SIZE,BO\r
+\r
+ decq %rax\r
+ jnz .L6_02c_1\r
+\r
+\r
+.L6_02c_2:\r
+\r
+ movq K, %rax\r
+ andq $7, %rax // K % 8\r
+ jz .L6_03c\r
+ ALIGN_4\r
\r
.L6_03b:\r
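// Tail loop for the BUFFER2 packing (K % 8 iterations); the two BO2 values are
// loaded and stored with a single vmovsd.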
\r
vmovss 1*SIZE(BO1), %xmm0\r
- vmovss 0*SIZE(BO2), %xmm1\r
- vmovss 1*SIZE(BO2), %xmm2\r
+ vmovsd 0*SIZE(BO2), %xmm1\r
vmovss %xmm0, 0*SIZE(BO)\r
- vmovss %xmm1, 1*SIZE(BO)\r
- vmovss %xmm2, 2*SIZE(BO)\r
+ vmovsd %xmm1, 1*SIZE(BO)\r
addq $2*SIZE,BO1\r
addq $2*SIZE,BO2\r
addq $3*SIZE,BO\r