/*********************************************************************/\r
\r
/*********************************************************************\r
-* 2013/10/19 Saar\r
-* BLASTEST : \r
+* 2013/10/20 Saar\r
+* BLASTEST : OK\r
* CTEST : OK\r
* TEST : OK\r
\r
*\r
*\r
-* 2013/08/15 Saar\r
+* 2013/10/20 Saar\r
* Parameter:\r
-* SGEMM_DEFAULT_UNROLL_N 2\r
-* SGEMM_DEFAULT_UNROLL_M 16\r
-* SGEMM_DEFAULT_P 384\r
-* SGEMM_DEFAULT_Q 168\r
+* DGEMM_DEFAULT_UNROLL_N 2\r
+* DGEMM_DEFAULT_UNROLL_M 16\r
+* DGEMM_DEFAULT_P 192\r
+* DGEMM_DEFAULT_Q 128\r
+* A_PR1 512\r
*\r
-* BLASTEST: OK\r
-*\r
-* Performance:\r
-* 1 thread: 2.31 times faster than sandybridge\r
-* 4 threads: 2.26 times faster than sandybridge\r
-*\r
-* Compile for FMA3: OK\r
*\r
+* Performance without prefetch of B:\r
+* 1 thread: 45.8 GFLOPS (MKL: 45)\r
+* 2 threads: 80.0 GFLOPS (MKL: 91)\r
+* 4 threads: 135.0 GFLOPS (MKL: 135)\r
*********************************************************************/\r
\r
\r
#endif\r
\r
\r
-#define A_PR1 384\r
-#define B_PR1 192\r
+#define A_PR1 512\r
+#define B_PR1 256\r
\r
/*******************************************************************************************\r
* 3 lines of N\r
*******************************************************************************************/\r
\r
+// One K-iteration of the M=16 x N=3 DGEMM micro-kernel.
+// Broadcasts three consecutive doubles of B into %ymm1-%ymm3, then
+// multiplies each against four 4-double vectors of A, accumulating into
+// the twelve accumulators %ymm4-%ymm15 (one per A-vector/B-value pair).
+// NOTE(review): assumes VFMADD231PD_ (macro defined elsewhere in this
+// file) accumulates into its FIRST operand — confirm against its definition.
+// Side effects: advances BO by 3 doubles and AO by 16 doubles;
+// issues two prefetches of A one A_PR1 distance ahead.
+.macro KERNEL16x3_SUBN
+	prefetcht0	 A_PR1(AO)
+	vbroadcastsd	-12 * SIZE(BO), %ymm1
+	vmovaps 	-16 * SIZE(AO), %ymm0
+	VFMADD231PD_	%ymm4,%ymm1,%ymm0
+	vbroadcastsd	-11 * SIZE(BO), %ymm2
+	VFMADD231PD_	%ymm5,%ymm2,%ymm0
+	vbroadcastsd	-10 * SIZE(BO), %ymm3
+	VFMADD231PD_	%ymm6,%ymm3,%ymm0
+	vmovaps 	-12 * SIZE(AO), %ymm0
+	VFMADD231PD_	%ymm7,%ymm1,%ymm0
+	prefetcht0	 A_PR1+64(AO)
+	VFMADD231PD_	%ymm8,%ymm2,%ymm0
+	VFMADD231PD_	%ymm9,%ymm3,%ymm0
+	vmovaps 	-8 * SIZE(AO), %ymm0
+	VFMADD231PD_	%ymm10,%ymm1,%ymm0
+	VFMADD231PD_	%ymm11,%ymm2,%ymm0
+	VFMADD231PD_	%ymm12,%ymm3,%ymm0
+	vmovaps 	-4 * SIZE(AO), %ymm0
+	VFMADD231PD_	%ymm13,%ymm1,%ymm0
+	VFMADD231PD_	%ymm14,%ymm2,%ymm0
+	VFMADD231PD_	%ymm15,%ymm3,%ymm0
+	addq	$3*SIZE , BO	
+	addq	$16*SIZE, AO
+.endm
+\r
+\r
+// One K-iteration of the M=8 x N=3 DGEMM micro-kernel.
+// Same scheme as KERNEL16x3_SUBN but with only two 4-double A vectors,
+// accumulating into %ymm4-%ymm9. A-prefetches are disabled here (commented
+// out); instead B is prefetched one B_PR1 distance ahead each iteration.
+// Side effects: advances BO by 3 doubles and AO by 8 doubles.
+.macro KERNEL8x3_SUBN
+	//prefetcht0	 A_PR1(AO)
+	vbroadcastsd	-12 * SIZE(BO), %ymm1
+	vmovaps 	-16 * SIZE(AO), %ymm0
+	VFMADD231PD_	%ymm4,%ymm1,%ymm0
+	vbroadcastsd	-11 * SIZE(BO), %ymm2
+	VFMADD231PD_	%ymm5,%ymm2,%ymm0
+	vbroadcastsd	-10 * SIZE(BO), %ymm3
+	VFMADD231PD_	%ymm6,%ymm3,%ymm0
+	vmovaps 	-12 * SIZE(AO), %ymm0
+	VFMADD231PD_	%ymm7,%ymm1,%ymm0
+	//prefetcht0	 A_PR1+64(AO)
+	VFMADD231PD_	%ymm8,%ymm2,%ymm0
+	VFMADD231PD_	%ymm9,%ymm3,%ymm0
+	prefetcht0	 B_PR1(BO)
+	addq	$3*SIZE , BO	
+	addq	$8*SIZE, AO
+.endm
+\r
+// One K-iteration of the M=4 x N=3 DGEMM micro-kernel.
+// A single 4-double A vector against three broadcast B values,
+// accumulating into %ymm4-%ymm6. No prefetching at this width.
+// Side effects: advances BO by 3 doubles and AO by 4 doubles.
+.macro KERNEL4x3_SUBN
+	vbroadcastsd	-12 * SIZE(BO), %ymm1
+	vmovaps 	-16 * SIZE(AO), %ymm0
+	VFMADD231PD_	%ymm4,%ymm1,%ymm0
+	vbroadcastsd	-11 * SIZE(BO), %ymm2
+	VFMADD231PD_	%ymm5,%ymm2,%ymm0
+	vbroadcastsd	-10 * SIZE(BO), %ymm3
+	VFMADD231PD_	%ymm6,%ymm3,%ymm0
+	addq	$3*SIZE , BO	
+	addq	$4*SIZE, AO
+.endm
+\r
+// One K-iteration of the M=2 x N=3 DGEMM micro-kernel (scalar doubles).
+// Loads the three B values into %xmm1-%xmm3 once, then processes the two
+// A rows separately: row 0 (offset -16) accumulates into %xmm4/%xmm5/%xmm6,
+// row 1 (offset -15) into %xmm8/%xmm10/%xmm12.
+// NOTE(review): assumes VFMADD231SD_ (defined elsewhere) accumulates into
+// its first operand, mirroring VFMADD231PD_ — confirm.
+// Side effects: advances BO by 3 doubles and AO by 2 doubles.
+.macro KERNEL2x3_SUBN
+	vmovsd	-12 * SIZE(BO), %xmm1
+	vmovsd	-16 * SIZE(AO), %xmm0
+	VFMADD231SD_	%xmm4,%xmm1,%xmm0
+	vmovsd	-11 * SIZE(BO), %xmm2
+	VFMADD231SD_	%xmm5,%xmm2,%xmm0
+	vmovsd	-10 * SIZE(BO), %xmm3
+	VFMADD231SD_	%xmm6,%xmm3,%xmm0
+	vmovsd	-15 * SIZE(AO), %xmm0
+	VFMADD231SD_	%xmm8,%xmm1,%xmm0
+	VFMADD231SD_	%xmm10,%xmm2,%xmm0
+	VFMADD231SD_	%xmm12,%xmm3,%xmm0
+	addq	$3*SIZE , BO	
+	addq	$2*SIZE, AO
+.endm
+\r
+// One K-iteration of the M=1 x N=3 DGEMM micro-kernel (scalar doubles).
+// One A value against three B values, accumulating into %xmm4-%xmm6.
+// Side effects: advances BO by 3 doubles and AO by 1 double.
+.macro KERNEL1x3_SUBN
+	vmovsd	-12 * SIZE(BO), %xmm1
+	vmovsd	-16 * SIZE(AO), %xmm0
+	VFMADD231SD_	%xmm4,%xmm1,%xmm0
+	vmovsd	-11 * SIZE(BO), %xmm2
+	VFMADD231SD_	%xmm5,%xmm2,%xmm0
+	vmovsd	-10 * SIZE(BO), %xmm3
+	VFMADD231SD_	%xmm6,%xmm3,%xmm0
+	addq	$3*SIZE , BO	
+	addq	$1*SIZE, AO
+.endm
+\r
+\r
+\r
+\r
+\r
+\r
+/******************************************************************************************/\r
+\r
.macro KERNEL16x3_1\r
prefetcht0 A_PR1(AO, %rax, SIZE)\r
vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1\r
\r
\r
movq A, AO // aoffset = a\r
- addq $32 * SIZE, AO\r
+ addq $16 * SIZE, AO\r
\r
movq M, I\r
sarq $4, I // i = (m >> 4)\r
\r
.L6_11:\r
leaq BUFFER1, BO // first buffer to BO\r
- addq $6 * SIZE, BO\r
+ addq $12 * SIZE, BO\r
+\r
+ prefetcht0 (CO1)\r
+ prefetcht0 (CO1,LDC,1)\r
+ prefetcht0 (CO1,LDC,2)\r
+ prefetcht0 64(CO1)\r
+ prefetcht0 64(CO1,LDC,1)\r
+ prefetcht0 64(CO1,LDC,2)\r
\r
vzeroall\r
\r
movq K, %rax\r
\r
- andq $-8, %rax // K = K - ( K % 8 )\r
+	sarq $1, %rax			// K / 2 (two KERNEL16x3_SUBN per loop pass; K&1 handled at .L6_16)
je .L6_16\r
- movq %rax, BI // Index for BO\r
- leaq (BI,BI,2), BI // BI = BI * 3 ; number of values\r
\r
- salq $4, %rax // rax = rax * 16 ; number of values\r
- leaq (AO, %rax, SIZE), AO\r
- leaq (BO, BI, SIZE), BO\r
- negq BI\r
- negq %rax\r
- ALIGN_4\r
+ ALIGN_5\r
\r
.L6_12:\r
-\r
- prefetcht0 B_PR1(BO,BI,8)\r
- KERNEL16x3_1\r
- KERNEL16x3_2\r
- KERNEL16x3_3\r
- prefetcht0 B_PR1+64(BO,BI,8)\r
- KERNEL16x3_4\r
-\r
- KERNEL16x3_1\r
- KERNEL16x3_2\r
- prefetcht0 B_PR1+32(BO,BI,8)\r
- KERNEL16x3_3\r
- KERNEL16x3_4\r
-\r
- je .L6_16\r
-\r
- prefetcht0 B_PR1(BO,BI,8)\r
- KERNEL16x3_1\r
- KERNEL16x3_2\r
- KERNEL16x3_3\r
- prefetcht0 B_PR1+64(BO,BI,8)\r
- KERNEL16x3_4\r
-\r
- KERNEL16x3_1\r
- KERNEL16x3_2\r
- prefetcht0 B_PR1+32(BO,BI,8)\r
- KERNEL16x3_3\r
- KERNEL16x3_4\r
-\r
- je .L6_16\r
-\r
- jmp .L6_12\r
- ALIGN_4\r
+/*\r
+ prefetcht0 B_PR1(BO)\r
+ prefetcht0 B_PR1+64(BO)\r
+ prefetcht0 B_PR1+128(BO)\r
+*/\r
+ KERNEL16x3_SUBN\r
+ KERNEL16x3_SUBN\r
+/*\r
+ KERNEL16x3_SUBN\r
+ KERNEL16x3_SUBN\r
+\r
+ KERNEL16x3_SUBN\r
+ KERNEL16x3_SUBN\r
+ KERNEL16x3_SUBN\r
+ KERNEL16x3_SUBN\r
+*/\r
+ dec %rax\r
+ jne .L6_12\r
\r
.L6_16:\r
movq K, %rax\r
\r
- andq $7, %rax # if (k & 1)\r
+ andq $1, %rax # if (k & 1)\r
je .L6_19\r
\r
- movq %rax, BI // Index for BO\r
- leaq (BI,BI,2), BI // BI = BI * 3 ; number of values\r
-\r
- salq $4, %rax // rax = rax * 16 ; number of values\r
- leaq (AO, %rax, SIZE), AO\r
- leaq (BO, BI, SIZE), BO\r
- negq BI\r
- negq %rax\r
ALIGN_4\r
\r
.L6_17:\r
\r
- KERNEL16x3_SUB\r
+ KERNEL16x3_SUBN\r
\r
- jl .L6_17\r
+ dec %rax\r
+ jne .L6_17\r
ALIGN_4\r
\r
\r
\r
.L6_20_1:\r
leaq BUFFER1, BO // first buffer to BO\r
- addq $6 * SIZE, BO\r
+ addq $12 * SIZE, BO\r
\r
vzeroall\r
\r
movq K, %rax\r
\r
- andq $-8, %rax\r
+ sarq $3, %rax\r
je .L6_20_6\r
- movq %rax, BI // Index for BO\r
- leaq (BI,BI,2), BI // BI = BI * 3 ; number of values\r
\r
- salq $3, %rax // rax = rax * 8 ; number of values\r
- leaq (AO, %rax, SIZE), AO\r
- leaq (BO, BI, SIZE), BO\r
- negq BI\r
- negq %rax\r
ALIGN_4\r
\r
.L6_20_2:\r
\r
- prefetcht0 B_PR1(BO,BI,8)\r
- KERNEL8x3_1\r
- KERNEL8x3_2\r
- KERNEL8x3_3\r
- prefetcht0 B_PR1+64(BO,BI,8)\r
- KERNEL8x3_4\r
-\r
- KERNEL8x3_1\r
- KERNEL8x3_2\r
- prefetcht0 B_PR1+32(BO,BI,8)\r
- KERNEL8x3_3\r
- KERNEL8x3_4\r
-\r
- je .L6_20_6\r
-\r
- prefetcht0 B_PR1(BO,BI,8)\r
- KERNEL8x3_1\r
- KERNEL8x3_2\r
- KERNEL8x3_3\r
- prefetcht0 B_PR1+64(BO,BI,8)\r
- KERNEL8x3_4\r
-\r
- KERNEL8x3_1\r
- KERNEL8x3_2\r
- prefetcht0 B_PR1+32(BO,BI,8)\r
- KERNEL8x3_3\r
- KERNEL8x3_4\r
-\r
- je .L6_20_6\r
+ KERNEL8x3_SUBN\r
+ KERNEL8x3_SUBN\r
+ KERNEL8x3_SUBN\r
+ KERNEL8x3_SUBN\r
\r
- jmp .L6_20_2\r
+ KERNEL8x3_SUBN\r
+ KERNEL8x3_SUBN\r
+ KERNEL8x3_SUBN\r
+ KERNEL8x3_SUBN\r
+ dec %rax\r
+ jne .L6_20_2\r
ALIGN_4\r
\r
.L6_20_6:\r
andq $7, %rax # if (k & 1)\r
je .L6_20_9\r
\r
- movq %rax, BI // Index for BO\r
- leaq (BI,BI,2), BI // BI = BI * 3 ; number of values\r
\r
- salq $3, %rax // rax = rax * 8 ; number of values\r
- leaq (AO, %rax, SIZE), AO\r
- leaq (BO, BI, SIZE), BO\r
- negq BI\r
- negq %rax\r
ALIGN_4\r
\r
.L6_20_7:\r
\r
- KERNEL8x3_SUB\r
+ KERNEL8x3_SUBN\r
\r
- jl .L6_20_7\r
+ dec %rax\r
+ jne .L6_20_7\r
ALIGN_4\r
\r
\r
\r
.L6_21:\r
leaq BUFFER1, BO // first buffer to BO\r
- addq $6 * SIZE, BO\r
+ addq $12 * SIZE, BO\r
\r
vzeroall\r
\r
movq K, %rax\r
\r
- andq $-8, %rax\r
+ sarq $3, %rax\r
je .L6_26\r
- movq %rax, BI // Index for BO\r
- leaq (BI,BI,2), BI // BI = BI * 3 ; number of values\r
\r
- salq $2, %rax // rax = rax * 4 ; number of values\r
- leaq (AO, %rax, SIZE), AO\r
- leaq (BO, BI, SIZE), BO\r
- negq BI\r
- negq %rax\r
ALIGN_4\r
\r
.L6_22:\r
\r
- prefetcht0 B_PR1(BO,BI,8)\r
- KERNEL4x3_1\r
- KERNEL4x3_2\r
- KERNEL4x3_3\r
- prefetcht0 B_PR1+64(BO,BI,8)\r
- KERNEL4x3_4\r
-\r
- KERNEL4x3_1\r
- KERNEL4x3_2\r
- prefetcht0 B_PR1+32(BO,BI,8)\r
- KERNEL4x3_3\r
- KERNEL4x3_4\r
-\r
- je .L6_26\r
-\r
- prefetcht0 B_PR1(BO,BI,8)\r
- KERNEL4x3_1\r
- KERNEL4x3_2\r
- KERNEL4x3_3\r
- prefetcht0 B_PR1+64(BO,BI,8)\r
- KERNEL4x3_4\r
-\r
- KERNEL4x3_1\r
- KERNEL4x3_2\r
- prefetcht0 B_PR1+32(BO,BI,8)\r
- KERNEL4x3_3\r
- KERNEL4x3_4\r
+ KERNEL4x3_SUBN\r
+ KERNEL4x3_SUBN\r
+ KERNEL4x3_SUBN\r
+ KERNEL4x3_SUBN\r
\r
- je .L6_26\r
-\r
- jmp .L6_22\r
+ KERNEL4x3_SUBN\r
+ KERNEL4x3_SUBN\r
+ KERNEL4x3_SUBN\r
+ KERNEL4x3_SUBN\r
+ dec %rax\r
+ jne .L6_22\r
ALIGN_4\r
\r
.L6_26:\r
andq $7, %rax # if (k & 1)\r
je .L6_29\r
\r
- movq %rax, BI // Index for BO\r
- leaq (BI,BI,2), BI // BI = BI * 3 ; number of values\r
-\r
- salq $2, %rax // rax = rax * 4 ; number of values\r
- leaq (AO, %rax, SIZE), AO\r
- leaq (BO, BI, SIZE), BO\r
- negq BI\r
- negq %rax\r
ALIGN_4\r
\r
.L6_27:\r
\r
- KERNEL4x3_SUB\r
+ KERNEL4x3_SUBN\r
\r
- jl .L6_27\r
+ dec %rax\r
+ jne .L6_27\r
ALIGN_4\r
\r
\r
\r
.L6_31:\r
leaq BUFFER1, BO // first buffer to BO\r
- addq $6 * SIZE, BO\r
+ addq $12 * SIZE, BO\r
\r
vzeroall\r
\r
movq K, %rax\r
\r
- andq $-8, %rax\r
+ sarq $3, %rax\r
je .L6_36\r
- movq %rax, BI // Index for BO\r
- leaq (BI,BI,2), BI // BI = BI * 3 ; number of values\r
-\r
- salq $1, %rax // rax = rax *2 ; number of values\r
- leaq (AO, %rax, SIZE), AO\r
- leaq (BO, BI, SIZE), BO\r
- negq BI\r
- negq %rax\r
ALIGN_4\r
\r
.L6_32:\r
\r
- KERNEL2x3_1\r
- KERNEL2x3_2\r
- KERNEL2x3_3\r
- KERNEL2x3_4\r
-\r
- KERNEL2x3_1\r
- KERNEL2x3_2\r
- KERNEL2x3_3\r
- KERNEL2x3_4\r
-\r
- je .L6_36\r
-\r
- KERNEL2x3_1\r
- KERNEL2x3_2\r
- KERNEL2x3_3\r
- KERNEL2x3_4\r
-\r
- KERNEL2x3_1\r
- KERNEL2x3_2\r
- KERNEL2x3_3\r
- KERNEL2x3_4\r
-\r
- je .L6_36\r
+ KERNEL2x3_SUBN\r
+ KERNEL2x3_SUBN\r
+ KERNEL2x3_SUBN\r
+ KERNEL2x3_SUBN\r
\r
- jmp .L6_32\r
+ KERNEL2x3_SUBN\r
+ KERNEL2x3_SUBN\r
+ KERNEL2x3_SUBN\r
+ KERNEL2x3_SUBN\r
+ dec %rax\r
+ jne .L6_32\r
ALIGN_4\r
\r
.L6_36:\r
andq $7, %rax # if (k & 1)\r
je .L6_39\r
\r
- movq %rax, BI // Index for BO\r
- leaq (BI,BI,2), BI // BI = BI * 3 ; number of values\r
- \r
- salq $1, %rax // rax = rax *2 ; number of values\r
- leaq (AO, %rax, SIZE), AO\r
- leaq (BO, BI, SIZE), BO\r
- negq BI\r
- negq %rax\r
ALIGN_4\r
\r
.L6_37:\r
\r
- KERNEL2x3_SUB\r
+ KERNEL2x3_SUBN\r
\r
- jl .L6_37\r
+ dec %rax\r
+ jne .L6_37\r
ALIGN_4\r
\r
\r
\r
.L6_41:\r
leaq BUFFER1, BO // first buffer to BO\r
- addq $6 * SIZE, BO\r
+ addq $12 * SIZE, BO\r
\r
vzeroall\r
\r
movq K, %rax\r
\r
- andq $-8, %rax\r
+ sarq $3,%rax\r
je .L6_46\r
- movq %rax, BI // Index for BO\r
- leaq (BI,BI,2), BI // BI = BI * 3 ; number of values\r
\r
- leaq (AO, %rax, SIZE), AO\r
- leaq (BO, BI, SIZE), BO\r
- negq BI\r
- negq %rax\r
ALIGN_4\r
\r
.L6_42:\r
\r
- KERNEL1x3_1\r
- KERNEL1x3_2\r
- KERNEL1x3_3\r
- KERNEL1x3_4\r
-\r
- KERNEL1x3_1\r
- KERNEL1x3_2\r
- KERNEL1x3_3\r
- KERNEL1x3_4\r
-\r
- je .L6_46\r
-\r
- KERNEL1x3_1\r
- KERNEL1x3_2\r
- KERNEL1x3_3\r
- KERNEL1x3_4\r
+ KERNEL1x3_SUBN\r
+ KERNEL1x3_SUBN\r
+ KERNEL1x3_SUBN\r
+ KERNEL1x3_SUBN\r
\r
- KERNEL1x3_1\r
- KERNEL1x3_2\r
- KERNEL1x3_3\r
- KERNEL1x3_4\r
+ KERNEL1x3_SUBN\r
+ KERNEL1x3_SUBN\r
+ KERNEL1x3_SUBN\r
+ KERNEL1x3_SUBN\r
\r
- je .L6_46\r
-\r
- jmp .L6_42\r
+ dec %rax\r
+ jne .L6_42\r
ALIGN_4\r
\r
.L6_46:\r
andq $7, %rax # if (k & 1)\r
je .L6_49\r
\r
- movq %rax, BI // Index for BO\r
- leaq (BI,BI,2), BI // BI = BI * 3 ; number of values\r
-\r
- leaq (AO, %rax, SIZE), AO\r
- leaq (BO, BI, SIZE), BO\r
- negq BI\r
- negq %rax\r
ALIGN_4\r
\r
.L6_47:\r
\r
- KERNEL1x3_SUB\r
+ KERNEL1x3_SUBN\r
\r
- jl .L6_47\r
+ dec %rax\r
+ jne .L6_47\r
ALIGN_4\r
\r
\r
\r
\r
movq A, AO // aoffset = a\r
- addq $32 * SIZE, AO\r
+ addq $16 * SIZE, AO\r
\r
movq M, I\r
sarq $4, I // i = (m >> 4)\r
\r
.L7_11:\r
leaq BUFFER2, BO // second buffer to BO\r
- addq $6 * SIZE, BO\r
+ addq $12 * SIZE, BO\r
+\r
+ prefetcht0 (CO1)\r
+ prefetcht0 (CO1,LDC,1)\r
+ prefetcht0 (CO1,LDC,2)\r
+ prefetcht0 64(CO1)\r
+ prefetcht0 64(CO1,LDC,1)\r
+ prefetcht0 64(CO1,LDC,2)\r
\r
vzeroall\r
\r
movq K, %rax\r
\r
- andq $-8, %rax // K = K - ( K % 8 )\r
+ sarq $3, %rax // K / 8\r
je .L7_16\r
- movq %rax, BI // Index for BO\r
- leaq (BI,BI,2), BI // BI = BI * 3 ; number of values\r
-\r
- salq $4, %rax // rax = rax * 16 ; number of values\r
- leaq (AO, %rax, SIZE), AO\r
- leaq (BO, BI, SIZE), BO\r
- negq BI\r
- negq %rax\r
- ALIGN_4\r
+ ALIGN_5\r
\r
.L7_12:\r
-\r
- prefetcht0 B_PR1(BO,BI,8)\r
- KERNEL16x3_1\r
- KERNEL16x3_2\r
- KERNEL16x3_3\r
- prefetcht0 B_PR1+64(BO,BI,8)\r
- KERNEL16x3_4\r
-\r
- KERNEL16x3_1\r
- KERNEL16x3_2\r
- prefetcht0 B_PR1+32(BO,BI,8)\r
- KERNEL16x3_3\r
- KERNEL16x3_4\r
-\r
- je .L7_16\r
-\r
- prefetcht0 B_PR1(BO,BI,8)\r
- KERNEL16x3_1\r
- KERNEL16x3_2\r
- KERNEL16x3_3\r
- prefetcht0 B_PR1+64(BO,BI,8)\r
- KERNEL16x3_4\r
-\r
- KERNEL16x3_1\r
- KERNEL16x3_2\r
- prefetcht0 B_PR1+32(BO,BI,8)\r
- KERNEL16x3_3\r
- KERNEL16x3_4\r
-\r
- je .L7_16\r
-\r
- jmp .L7_12\r
+/*\r
+ prefetcht0 B_PR1(BO)\r
+ prefetcht0 B_PR1+64(BO)\r
+ prefetcht0 B_PR1+128(BO)\r
+*/\r
+ KERNEL16x3_SUBN\r
+ KERNEL16x3_SUBN\r
+ KERNEL16x3_SUBN\r
+ KERNEL16x3_SUBN\r
+\r
+ KERNEL16x3_SUBN\r
+ KERNEL16x3_SUBN\r
+ KERNEL16x3_SUBN\r
+ KERNEL16x3_SUBN\r
+ dec %rax\r
+ jne .L7_12\r
ALIGN_4\r
\r
.L7_16:\r
andq $7, %rax # if (k & 1)\r
je .L7_19\r
\r
- movq %rax, BI // Index for BO\r
- leaq (BI,BI,2), BI // BI = BI * 3 ; number of values\r
-\r
- salq $4, %rax // rax = rax * 16 ; number of values\r
- leaq (AO, %rax, SIZE), AO\r
- leaq (BO, BI, SIZE), BO\r
- negq BI\r
- negq %rax\r
- ALIGN_4\r
+ ALIGN_5\r
\r
.L7_17:\r
\r
- KERNEL16x3_SUB\r
+ KERNEL16x3_SUBN\r
\r
- jl .L7_17\r
- ALIGN_4\r
+ dec %rax\r
+ jne .L7_17\r
\r
\r
.L7_19:\r
\r
.L7_20_1:\r
leaq BUFFER2, BO // first buffer to BO\r
- addq $6 * SIZE, BO\r
+ addq $12 * SIZE, BO\r
\r
vzeroall\r
\r
movq K, %rax\r
\r
- andq $-8, %rax\r
+ sarq $3, %rax\r
je .L7_20_6\r
- movq %rax, BI // Index for BO\r
- leaq (BI,BI,2), BI // BI = BI * 3 ; number of values\r
\r
- salq $3, %rax // rax = rax * 8 ; number of values\r
- leaq (AO, %rax, SIZE), AO\r
- leaq (BO, BI, SIZE), BO\r
- negq BI\r
- negq %rax\r
ALIGN_4\r
\r
.L7_20_2:\r
\r
- prefetcht0 B_PR1(BO,BI,8)\r
- KERNEL8x3_1\r
- KERNEL8x3_2\r
- KERNEL8x3_3\r
- prefetcht0 B_PR1+64(BO,BI,8)\r
- KERNEL8x3_4\r
-\r
- KERNEL8x3_1\r
- KERNEL8x3_2\r
- prefetcht0 B_PR1+32(BO,BI,8)\r
- KERNEL8x3_3\r
- KERNEL8x3_4\r
-\r
- je .L7_20_6\r
-\r
- prefetcht0 B_PR1(BO,BI,8)\r
- KERNEL8x3_1\r
- KERNEL8x3_2\r
- KERNEL8x3_3\r
- prefetcht0 B_PR1+64(BO,BI,8)\r
- KERNEL8x3_4\r
-\r
- KERNEL8x3_1\r
- KERNEL8x3_2\r
- prefetcht0 B_PR1+128(BO,BI,8)\r
- KERNEL8x3_3\r
- KERNEL8x3_4\r
+ KERNEL8x3_SUBN\r
+ KERNEL8x3_SUBN\r
+ KERNEL8x3_SUBN\r
+ KERNEL8x3_SUBN\r
\r
- je .L7_20_6\r
+ KERNEL8x3_SUBN\r
+ KERNEL8x3_SUBN\r
+ KERNEL8x3_SUBN\r
+ KERNEL8x3_SUBN\r
\r
- jmp .L7_20_2\r
+ dec %rax\r
+ jne .L7_20_2\r
ALIGN_4\r
\r
.L7_20_6:\r
andq $7, %rax # if (k & 1)\r
je .L7_20_9\r
\r
- movq %rax, BI // Index for BO\r
- leaq (BI,BI,2), BI // BI = BI * 3 ; number of values\r
-\r
- salq $3, %rax // rax = rax * 8 ; number of values\r
- leaq (AO, %rax, SIZE), AO\r
- leaq (BO, BI, SIZE), BO\r
- negq BI\r
- negq %rax\r
ALIGN_4\r
\r
.L7_20_7:\r
\r
- KERNEL8x3_SUB\r
+ KERNEL8x3_SUBN\r
\r
- jl .L7_20_7\r
+ dec %rax\r
+ jne .L7_20_7\r
ALIGN_4\r
\r
.L7_20_9:\r
\r
.L7_21:\r
leaq BUFFER2, BO // second buffer to BO\r
- addq $6 * SIZE, BO\r
+ addq $12 * SIZE, BO\r
\r
vzeroall\r
\r
movq K, %rax\r
\r
- andq $-8, %rax\r
+ sarq $3, %rax\r
je .L7_26\r
- movq %rax, BI // Index for BO\r
- leaq (BI,BI,2), BI // BI = BI * 3 ; number of values\r
\r
- salq $2, %rax // rax = rax * 4 ; number of values\r
- leaq (AO, %rax, SIZE), AO\r
- leaq (BO, BI, SIZE), BO\r
- negq BI\r
- negq %rax\r
ALIGN_4\r
\r
.L7_22:\r
\r
- prefetcht0 B_PR1(BO,BI,8)\r
- KERNEL4x3_1\r
- KERNEL4x3_2\r
- KERNEL4x3_3\r
- prefetcht0 B_PR1+64(BO,BI,8)\r
- KERNEL4x3_4\r
-\r
- KERNEL4x3_1\r
- KERNEL4x3_2\r
- prefetcht0 B_PR1+32(BO,BI,8)\r
- KERNEL4x3_3\r
- KERNEL4x3_4\r
+ KERNEL4x3_SUBN\r
+ KERNEL4x3_SUBN\r
+ KERNEL4x3_SUBN\r
+ KERNEL4x3_SUBN\r
\r
- je .L7_26\r
+ KERNEL4x3_SUBN\r
+ KERNEL4x3_SUBN\r
+ KERNEL4x3_SUBN\r
+ KERNEL4x3_SUBN\r
\r
- prefetcht0 B_PR1(BO,BI,8)\r
- KERNEL4x3_1\r
- KERNEL4x3_2\r
- KERNEL4x3_3\r
- prefetcht0 B_PR1+64(BO,BI,8)\r
- KERNEL4x3_4\r
-\r
- KERNEL4x3_1\r
- KERNEL4x3_2\r
- prefetcht0 B_PR1+32(BO,BI,8)\r
- KERNEL4x3_3\r
- KERNEL4x3_4\r
-\r
- je .L7_26\r
-\r
- jmp .L7_22\r
+ dec %rax\r
+ jne .L7_22\r
ALIGN_4\r
\r
.L7_26:\r
andq $7, %rax # if (k & 1)\r
je .L7_29\r
\r
- movq %rax, BI // Index for BO\r
- leaq (BI,BI,2), BI // BI = BI * 3 ; number of values\r
-\r
- salq $2, %rax // rax = rax * 4 ; number of values\r
- leaq (AO, %rax, SIZE), AO\r
- leaq (BO, BI, SIZE), BO\r
- negq BI\r
- negq %rax\r
ALIGN_4\r
\r
.L7_27:\r
\r
- KERNEL4x3_SUB\r
+ KERNEL4x3_SUBN\r
\r
- jl .L7_27\r
+ dec %rax\r
+ jne .L7_27\r
ALIGN_4\r
\r
\r
\r
.L7_31:\r
leaq BUFFER2, BO // second buffer to BO\r
- addq $6 * SIZE, BO\r
+ addq $12 * SIZE, BO\r
\r
vzeroall\r
\r
movq K, %rax\r
\r
- andq $-8, %rax\r
+ sarq $3, %rax\r
je .L7_36\r
- movq %rax, BI // Index for BO\r
- leaq (BI,BI,2), BI // BI = BI * 3 ; number of values\r
\r
- salq $1, %rax // rax = rax *2 ; number of values\r
- leaq (AO, %rax, SIZE), AO\r
- leaq (BO, BI, SIZE), BO\r
- negq BI\r
- negq %rax\r
ALIGN_4\r
\r
.L7_32:\r
\r
- KERNEL2x3_1\r
- KERNEL2x3_2\r
- KERNEL2x3_3\r
- KERNEL2x3_4\r
-\r
- KERNEL2x3_1\r
- KERNEL2x3_2\r
- KERNEL2x3_3\r
- KERNEL2x3_4\r
-\r
- je .L7_36\r
-\r
- KERNEL2x3_1\r
- KERNEL2x3_2\r
- KERNEL2x3_3\r
- KERNEL2x3_4\r
+ KERNEL2x3_SUBN\r
+ KERNEL2x3_SUBN\r
+ KERNEL2x3_SUBN\r
+ KERNEL2x3_SUBN\r
\r
- KERNEL2x3_1\r
- KERNEL2x3_2\r
- KERNEL2x3_3\r
- KERNEL2x3_4\r
+ KERNEL2x3_SUBN\r
+ KERNEL2x3_SUBN\r
+ KERNEL2x3_SUBN\r
+ KERNEL2x3_SUBN\r
\r
- je .L7_36\r
-\r
- jmp .L7_32\r
+ dec %rax\r
+ jne .L7_32\r
ALIGN_4\r
\r
.L7_36:\r
andq $7, %rax # if (k & 1)\r
je .L7_39\r
\r
- movq %rax, BI // Index for BO\r
- leaq (BI,BI,2), BI // BI = BI * 3 ; number of values\r
- \r
- salq $1, %rax // rax = rax *2 ; number of values\r
- leaq (AO, %rax, SIZE), AO\r
- leaq (BO, BI, SIZE), BO\r
- negq BI\r
- negq %rax\r
ALIGN_4\r
\r
.L7_37:\r
\r
- KERNEL2x3_SUB\r
+ KERNEL2x3_SUBN\r
\r
- jl .L7_37\r
+ dec %rax\r
+ jne .L7_37\r
ALIGN_4\r
\r
\r
\r
.L7_41:\r
leaq BUFFER2, BO // second buffer to BO\r
- addq $6 * SIZE, BO\r
+ addq $12 * SIZE, BO\r
\r
vzeroall\r
\r
movq K, %rax\r
\r
- andq $-8, %rax\r
+ sarq $3, %rax\r
je .L7_46\r
- movq %rax, BI // Index for BO\r
- leaq (BI,BI,2), BI // BI = BI * 3 ; number of values\r
\r
- leaq (AO, %rax, SIZE), AO\r
- leaq (BO, BI, SIZE), BO\r
- negq BI\r
- negq %rax\r
ALIGN_4\r
\r
.L7_42:\r
+ KERNEL1x3_SUBN\r
+ KERNEL1x3_SUBN\r
+ KERNEL1x3_SUBN\r
+ KERNEL1x3_SUBN\r
\r
- KERNEL1x3_1\r
- KERNEL1x3_2\r
- KERNEL1x3_3\r
- KERNEL1x3_4\r
-\r
- KERNEL1x3_1\r
- KERNEL1x3_2\r
- KERNEL1x3_3\r
- KERNEL1x3_4\r
-\r
- je .L7_46\r
-\r
- KERNEL1x3_1\r
- KERNEL1x3_2\r
- KERNEL1x3_3\r
- KERNEL1x3_4\r
-\r
- KERNEL1x3_1\r
- KERNEL1x3_2\r
- KERNEL1x3_3\r
- KERNEL1x3_4\r
+ KERNEL1x3_SUBN\r
+ KERNEL1x3_SUBN\r
+ KERNEL1x3_SUBN\r
+ KERNEL1x3_SUBN\r
\r
- je .L7_46\r
-\r
- jmp .L7_42\r
+ dec %rax\r
+ jne .L7_42\r
ALIGN_4\r
\r
.L7_46:\r
andq $7, %rax # if (k & 1)\r
je .L7_49\r
\r
- movq %rax, BI // Index for BO\r
- leaq (BI,BI,2), BI // BI = BI * 3 ; number of values\r
-\r
- leaq (AO, %rax, SIZE), AO\r
- leaq (BO, BI, SIZE), BO\r
- negq BI\r
- negq %rax\r
ALIGN_4\r
\r
.L7_47:\r
\r
- KERNEL1x3_SUB\r
+ KERNEL1x3_SUBN\r
\r
- jl .L7_47\r
+ dec %rax\r
+ jne .L7_47\r
ALIGN_4\r
\r
\r