optimized dgemm_kernel for HASWELL
author: wernsaar <wernsaar@googlemail.com>
Sun, 20 Oct 2013 14:52:26 +0000 (16:52 +0200)
committer: wernsaar <wernsaar@googlemail.com>
Sun, 20 Oct 2013 14:52:26 +0000 (16:52 +0200)
driver/level3/level3.c
driver/level3/level3_thread.c
kernel/x86_64/dgemm_kernel_16x2_haswell.S
param.h

index 2fe8895..959c7f1 100644 (file)
@@ -333,7 +333,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
       for(jjs = js; jjs < js + min_j; jjs += min_jj){
        min_jj = min_j + js - jjs;
 
-#if defined(BULLDOZER) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX)
+#if defined(HASWELL) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX)
         if (min_jj >= 12*GEMM_UNROLL_N) min_jj = 12*GEMM_UNROLL_N;
         else
                 if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
index 3242790..cd99172 100644 (file)
@@ -367,7 +367,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
       for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
        min_jj = MIN(n_to, xxx + div_n) - jjs;
 
-#if defined(BULLDOZER) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX)
+#if defined(HASWELL) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX)
        if (min_jj >= 12*GEMM_UNROLL_N) min_jj = 12*GEMM_UNROLL_N;
        else
                if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
index e015bbd..2907a68 100644 (file)
 /*********************************************************************/\r
 \r
 /*********************************************************************\r
-* 2013/10/19 Saar\r
-*        BLASTEST               : \r
+* 2013/10/20 Saar\r
+*        BLASTEST               : OK\r
 *        CTEST                  : OK\r
 *        TEST                   : OK\r
 \r
 *\r
 *\r
-* 2013/08/15 Saar\r
+* 2013/10/20 Saar\r
 * Parameter:\r
-*       SGEMM_DEFAULT_UNROLL_N  2\r
-*       SGEMM_DEFAULT_UNROLL_M  16\r
-*       SGEMM_DEFAULT_P         384\r
-*       SGEMM_DEFAULT_Q         168\r
+*       DGEMM_DEFAULT_UNROLL_N  2\r
+*       DGEMM_DEFAULT_UNROLL_M  16\r
+*       DGEMM_DEFAULT_P         192\r
+*       DGEMM_DEFAULT_Q         128\r
+*      A_PR1                   512\r
 *\r
-* BLASTEST: OK\r
-*\r
-* Performance:\r
-*       1 thread:       2.31 times faster than sandybridge\r
-*       4 threads:      2.26 times faster than sandybridge\r
-*\r
-* Compile for FMA3: OK\r
 *\r
+* Performance without prefetch of B:\r
+*       1 thread:       45.8 GFLOPS (MKL:  45)\r
+*       2 threads:      80.0 GFLOPS (MKL:  91)\r
+*       4 threads:     135.0 GFLOPS (MKL: 135)\r
 *********************************************************************/\r
 \r
 \r
 #endif\r
 \r
 \r
-#define        A_PR1   384\r
-#define        B_PR1   192\r
+#define        A_PR1   512\r
+#define        B_PR1   256\r
 \r
 /*******************************************************************************************\r
 * 3 lines of N\r
 *******************************************************************************************/\r
 \r
+.macro KERNEL16x3_SUBN\r
+       prefetcht0      A_PR1(AO)\r
+       vbroadcastsd    -12 * SIZE(BO), %ymm1\r
+       vmovaps         -16 * SIZE(AO), %ymm0\r
+       VFMADD231PD_    %ymm4,%ymm1,%ymm0\r
+       vbroadcastsd    -11 * SIZE(BO), %ymm2\r
+       VFMADD231PD_    %ymm5,%ymm2,%ymm0\r
+       vbroadcastsd    -10 * SIZE(BO), %ymm3\r
+       VFMADD231PD_    %ymm6,%ymm3,%ymm0\r
+       vmovaps         -12 * SIZE(AO), %ymm0\r
+       VFMADD231PD_    %ymm7,%ymm1,%ymm0\r
+       prefetcht0      A_PR1+64(AO)\r
+       VFMADD231PD_    %ymm8,%ymm2,%ymm0\r
+       VFMADD231PD_    %ymm9,%ymm3,%ymm0\r
+       vmovaps          -8 * SIZE(AO), %ymm0\r
+       VFMADD231PD_    %ymm10,%ymm1,%ymm0\r
+       VFMADD231PD_    %ymm11,%ymm2,%ymm0\r
+       VFMADD231PD_    %ymm12,%ymm3,%ymm0\r
+       vmovaps          -4 * SIZE(AO), %ymm0\r
+       VFMADD231PD_    %ymm13,%ymm1,%ymm0\r
+       VFMADD231PD_    %ymm14,%ymm2,%ymm0\r
+       VFMADD231PD_    %ymm15,%ymm3,%ymm0\r
+       addq    $3*SIZE , BO    \r
+       addq    $16*SIZE, AO\r
+.endm\r
+\r
+\r
+.macro KERNEL8x3_SUBN\r
+       //prefetcht0    A_PR1(AO)\r
+       vbroadcastsd    -12 * SIZE(BO), %ymm1\r
+       vmovaps         -16 * SIZE(AO), %ymm0\r
+       VFMADD231PD_    %ymm4,%ymm1,%ymm0\r
+       vbroadcastsd    -11 * SIZE(BO), %ymm2\r
+       VFMADD231PD_    %ymm5,%ymm2,%ymm0\r
+       vbroadcastsd    -10 * SIZE(BO), %ymm3\r
+       VFMADD231PD_    %ymm6,%ymm3,%ymm0\r
+       vmovaps         -12 * SIZE(AO), %ymm0\r
+       VFMADD231PD_    %ymm7,%ymm1,%ymm0\r
+       //prefetcht0    A_PR1+64(AO)\r
+       VFMADD231PD_    %ymm8,%ymm2,%ymm0\r
+       VFMADD231PD_    %ymm9,%ymm3,%ymm0\r
+       prefetcht0      B_PR1(BO)\r
+       addq    $3*SIZE , BO    \r
+       addq    $8*SIZE, AO\r
+.endm\r
+\r
+.macro KERNEL4x3_SUBN\r
+       vbroadcastsd    -12 * SIZE(BO), %ymm1\r
+       vmovaps         -16 * SIZE(AO), %ymm0\r
+       VFMADD231PD_    %ymm4,%ymm1,%ymm0\r
+       vbroadcastsd    -11 * SIZE(BO), %ymm2\r
+       VFMADD231PD_    %ymm5,%ymm2,%ymm0\r
+       vbroadcastsd    -10 * SIZE(BO), %ymm3\r
+       VFMADD231PD_    %ymm6,%ymm3,%ymm0\r
+       addq    $3*SIZE , BO    \r
+       addq    $4*SIZE, AO\r
+.endm\r
+\r
+.macro KERNEL2x3_SUBN\r
+       vmovsd  -12 * SIZE(BO), %xmm1\r
+       vmovsd  -16 * SIZE(AO), %xmm0\r
+       VFMADD231SD_    %xmm4,%xmm1,%xmm0\r
+       vmovsd  -11 * SIZE(BO), %xmm2\r
+       VFMADD231SD_    %xmm5,%xmm2,%xmm0\r
+       vmovsd  -10 * SIZE(BO), %xmm3\r
+       VFMADD231SD_    %xmm6,%xmm3,%xmm0\r
+       vmovsd  -15 * SIZE(AO), %xmm0\r
+       VFMADD231SD_    %xmm8,%xmm1,%xmm0\r
+       VFMADD231SD_    %xmm10,%xmm2,%xmm0\r
+       VFMADD231SD_    %xmm12,%xmm3,%xmm0\r
+       addq    $3*SIZE , BO    \r
+       addq    $2*SIZE, AO\r
+.endm\r
+\r
+.macro KERNEL1x3_SUBN\r
+       vmovsd  -12 * SIZE(BO), %xmm1\r
+       vmovsd  -16 * SIZE(AO), %xmm0\r
+       VFMADD231SD_    %xmm4,%xmm1,%xmm0\r
+       vmovsd  -11 * SIZE(BO), %xmm2\r
+       VFMADD231SD_    %xmm5,%xmm2,%xmm0\r
+       vmovsd  -10 * SIZE(BO), %xmm3\r
+       VFMADD231SD_    %xmm6,%xmm3,%xmm0\r
+       addq    $3*SIZE , BO    \r
+       addq    $1*SIZE, AO\r
+.endm\r
+\r
+\r
+\r
+\r
+\r
+\r
+/******************************************************************************************/\r
+\r
 .macro KERNEL16x3_1\r
        prefetcht0      A_PR1(AO, %rax, SIZE)\r
        vbroadcastsd     -6 * SIZE(BO, BI, SIZE), %ymm1\r
 \r
        \r
        movq    A, AO                   // aoffset = a\r
-       addq    $32 * SIZE, AO\r
+       addq    $16 * SIZE, AO\r
 \r
        movq    M,  I\r
        sarq    $4, I                   // i = (m >> 4)\r
 \r
 .L6_11:\r
         leaq    BUFFER1, BO             // first buffer to BO\r
-        addq    $6 * SIZE, BO\r
+        addq    $12 * SIZE, BO\r
+\r
+       prefetcht0      (CO1)\r
+       prefetcht0      (CO1,LDC,1)\r
+       prefetcht0      (CO1,LDC,2)\r
+       prefetcht0      64(CO1)\r
+       prefetcht0      64(CO1,LDC,1)\r
+       prefetcht0      64(CO1,LDC,2)\r
 \r
        vzeroall\r
 \r
         movq    K, %rax\r
 \r
-       andq    $-8, %rax                       //  K = K - ( K % 8 )\r
+       sarq $1, %rax                   //  K / 2\r
        je      .L6_16\r
-       movq    %rax, BI                        //  Index for BO\r
-        leaq    (BI,BI,2), BI                   //  BI = BI * 3 ; number of values\r
 \r
-       salq    $4, %rax                        // rax = rax * 16 ; number of values\r
-       leaq    (AO, %rax, SIZE), AO\r
-       leaq    (BO, BI, SIZE), BO\r
-       negq    BI\r
-       negq    %rax\r
-       ALIGN_4\r
+       ALIGN_5\r
 \r
 .L6_12:\r
-\r
-       prefetcht0      B_PR1(BO,BI,8)\r
-       KERNEL16x3_1\r
-       KERNEL16x3_2\r
-       KERNEL16x3_3\r
-       prefetcht0      B_PR1+64(BO,BI,8)\r
-       KERNEL16x3_4\r
-\r
-       KERNEL16x3_1\r
-       KERNEL16x3_2\r
-       prefetcht0      B_PR1+32(BO,BI,8)\r
-       KERNEL16x3_3\r
-       KERNEL16x3_4\r
-\r
-       je      .L6_16\r
-\r
-       prefetcht0      B_PR1(BO,BI,8)\r
-       KERNEL16x3_1\r
-       KERNEL16x3_2\r
-       KERNEL16x3_3\r
-       prefetcht0      B_PR1+64(BO,BI,8)\r
-       KERNEL16x3_4\r
-\r
-       KERNEL16x3_1\r
-       KERNEL16x3_2\r
-       prefetcht0      B_PR1+32(BO,BI,8)\r
-       KERNEL16x3_3\r
-       KERNEL16x3_4\r
-\r
-       je      .L6_16\r
-\r
-       jmp     .L6_12\r
-       ALIGN_4\r
+/*\r
+       prefetcht0      B_PR1(BO)\r
+       prefetcht0      B_PR1+64(BO)\r
+       prefetcht0      B_PR1+128(BO)\r
+*/\r
+       KERNEL16x3_SUBN\r
+       KERNEL16x3_SUBN\r
+/*\r
+       KERNEL16x3_SUBN\r
+       KERNEL16x3_SUBN\r
+\r
+       KERNEL16x3_SUBN\r
+       KERNEL16x3_SUBN\r
+       KERNEL16x3_SUBN\r
+       KERNEL16x3_SUBN\r
+*/\r
+       dec     %rax\r
+       jne     .L6_12\r
 \r
 .L6_16:\r
         movq    K, %rax\r
 \r
-       andq    $7, %rax                # if (k & 1)\r
+       andq    $1, %rax                # if (k & 1)\r
        je .L6_19\r
 \r
-       movq    %rax, BI                        //  Index for BO\r
-        leaq    (BI,BI,2), BI                   //  BI = BI * 3 ; number of values\r
-\r
-       salq    $4, %rax                        // rax = rax * 16 ; number of values\r
-       leaq    (AO, %rax, SIZE), AO\r
-       leaq    (BO, BI, SIZE), BO\r
-       negq    BI\r
-       negq    %rax\r
        ALIGN_4\r
 \r
 .L6_17:\r
 \r
-       KERNEL16x3_SUB\r
+       KERNEL16x3_SUBN\r
 \r
-       jl      .L6_17\r
+       dec     %rax\r
+       jne     .L6_17\r
        ALIGN_4\r
 \r
 \r
 \r
 .L6_20_1:\r
         leaq    BUFFER1, BO             // first buffer to BO\r
-        addq    $6 * SIZE, BO\r
+        addq    $12 * SIZE, BO\r
 \r
        vzeroall\r
 \r
         movq    K, %rax\r
 \r
-       andq    $-8, %rax\r
+       sarq    $3, %rax\r
        je      .L6_20_6\r
-       movq    %rax, BI                        //  Index for BO\r
-        leaq    (BI,BI,2), BI                   //  BI = BI * 3 ; number of values\r
 \r
-       salq    $3, %rax                        // rax = rax * 8 ; number of values\r
-       leaq    (AO, %rax, SIZE), AO\r
-       leaq    (BO, BI, SIZE), BO\r
-       negq    BI\r
-       negq    %rax\r
        ALIGN_4\r
 \r
 .L6_20_2:\r
 \r
-       prefetcht0      B_PR1(BO,BI,8)\r
-       KERNEL8x3_1\r
-       KERNEL8x3_2\r
-       KERNEL8x3_3\r
-       prefetcht0      B_PR1+64(BO,BI,8)\r
-       KERNEL8x3_4\r
-\r
-       KERNEL8x3_1\r
-       KERNEL8x3_2\r
-       prefetcht0      B_PR1+32(BO,BI,8)\r
-       KERNEL8x3_3\r
-       KERNEL8x3_4\r
-\r
-       je      .L6_20_6\r
-\r
-       prefetcht0      B_PR1(BO,BI,8)\r
-       KERNEL8x3_1\r
-       KERNEL8x3_2\r
-       KERNEL8x3_3\r
-       prefetcht0      B_PR1+64(BO,BI,8)\r
-       KERNEL8x3_4\r
-\r
-       KERNEL8x3_1\r
-       KERNEL8x3_2\r
-       prefetcht0      B_PR1+32(BO,BI,8)\r
-       KERNEL8x3_3\r
-       KERNEL8x3_4\r
-\r
-       je      .L6_20_6\r
+       KERNEL8x3_SUBN\r
+       KERNEL8x3_SUBN\r
+       KERNEL8x3_SUBN\r
+       KERNEL8x3_SUBN\r
 \r
-       jmp     .L6_20_2\r
+       KERNEL8x3_SUBN\r
+       KERNEL8x3_SUBN\r
+       KERNEL8x3_SUBN\r
+       KERNEL8x3_SUBN\r
+       dec     %rax\r
+       jne     .L6_20_2\r
        ALIGN_4\r
 \r
 .L6_20_6:\r
        andq    $7, %rax                # if (k & 1)\r
        je .L6_20_9\r
 \r
-       movq    %rax, BI                        //  Index for BO\r
-        leaq    (BI,BI,2), BI                   //  BI = BI * 3 ; number of values\r
 \r
-       salq    $3, %rax                        // rax = rax * 8 ; number of values\r
-       leaq    (AO, %rax, SIZE), AO\r
-       leaq    (BO, BI, SIZE), BO\r
-       negq    BI\r
-       negq    %rax\r
        ALIGN_4\r
 \r
 .L6_20_7:\r
 \r
-       KERNEL8x3_SUB\r
+       KERNEL8x3_SUBN\r
 \r
-       jl      .L6_20_7\r
+       dec     %rax\r
+       jne     .L6_20_7\r
        ALIGN_4\r
 \r
 \r
 \r
 .L6_21:\r
         leaq    BUFFER1, BO             // first buffer to BO\r
-        addq    $6 * SIZE, BO\r
+        addq    $12 * SIZE, BO\r
 \r
        vzeroall\r
 \r
         movq    K, %rax\r
 \r
-       andq    $-8, %rax\r
+       sarq    $3, %rax\r
        je      .L6_26\r
-       movq    %rax, BI                        //  Index for BO\r
-        leaq    (BI,BI,2), BI                   //  BI = BI * 3 ; number of values\r
 \r
-       salq    $2, %rax                        // rax = rax * 4 ; number of values\r
-       leaq    (AO, %rax, SIZE), AO\r
-       leaq    (BO, BI, SIZE), BO\r
-       negq    BI\r
-       negq    %rax\r
        ALIGN_4\r
 \r
 .L6_22:\r
 \r
-       prefetcht0      B_PR1(BO,BI,8)\r
-       KERNEL4x3_1\r
-       KERNEL4x3_2\r
-       KERNEL4x3_3\r
-       prefetcht0      B_PR1+64(BO,BI,8)\r
-       KERNEL4x3_4\r
-\r
-       KERNEL4x3_1\r
-       KERNEL4x3_2\r
-       prefetcht0      B_PR1+32(BO,BI,8)\r
-       KERNEL4x3_3\r
-       KERNEL4x3_4\r
-\r
-       je      .L6_26\r
-\r
-       prefetcht0      B_PR1(BO,BI,8)\r
-       KERNEL4x3_1\r
-       KERNEL4x3_2\r
-       KERNEL4x3_3\r
-       prefetcht0      B_PR1+64(BO,BI,8)\r
-       KERNEL4x3_4\r
-\r
-       KERNEL4x3_1\r
-       KERNEL4x3_2\r
-       prefetcht0      B_PR1+32(BO,BI,8)\r
-       KERNEL4x3_3\r
-       KERNEL4x3_4\r
+       KERNEL4x3_SUBN\r
+       KERNEL4x3_SUBN\r
+       KERNEL4x3_SUBN\r
+       KERNEL4x3_SUBN\r
 \r
-       je      .L6_26\r
-\r
-       jmp     .L6_22\r
+       KERNEL4x3_SUBN\r
+       KERNEL4x3_SUBN\r
+       KERNEL4x3_SUBN\r
+       KERNEL4x3_SUBN\r
+       dec     %rax\r
+       jne     .L6_22\r
        ALIGN_4\r
 \r
 .L6_26:\r
        andq    $7, %rax                # if (k & 1)\r
        je .L6_29\r
 \r
-       movq    %rax, BI                        //  Index for BO\r
-        leaq    (BI,BI,2), BI                   //  BI = BI * 3 ; number of values\r
-\r
-       salq    $2, %rax                        // rax = rax * 4 ; number of values\r
-       leaq    (AO, %rax, SIZE), AO\r
-       leaq    (BO, BI, SIZE), BO\r
-       negq    BI\r
-       negq    %rax\r
        ALIGN_4\r
 \r
 .L6_27:\r
 \r
-       KERNEL4x3_SUB\r
+       KERNEL4x3_SUBN\r
 \r
-       jl      .L6_27\r
+       dec %rax\r
+       jne     .L6_27\r
        ALIGN_4\r
 \r
 \r
 \r
 .L6_31:\r
         leaq    BUFFER1, BO             // first buffer to BO\r
-        addq    $6 * SIZE, BO\r
+        addq    $12 * SIZE, BO\r
 \r
        vzeroall\r
 \r
         movq    K, %rax\r
 \r
-       andq    $-8, %rax\r
+       sarq    $3, %rax\r
        je      .L6_36\r
-       movq    %rax, BI                        //  Index for BO\r
-        leaq    (BI,BI,2), BI                   //  BI = BI * 3 ; number of values\r
-\r
-       salq    $1, %rax                        // rax = rax *2 ; number of values\r
-       leaq    (AO, %rax, SIZE), AO\r
-       leaq    (BO, BI, SIZE), BO\r
-       negq    BI\r
-       negq    %rax\r
        ALIGN_4\r
 \r
 .L6_32:\r
 \r
-       KERNEL2x3_1\r
-       KERNEL2x3_2\r
-       KERNEL2x3_3\r
-       KERNEL2x3_4\r
-\r
-       KERNEL2x3_1\r
-       KERNEL2x3_2\r
-       KERNEL2x3_3\r
-       KERNEL2x3_4\r
-\r
-       je      .L6_36\r
-\r
-       KERNEL2x3_1\r
-       KERNEL2x3_2\r
-       KERNEL2x3_3\r
-       KERNEL2x3_4\r
-\r
-       KERNEL2x3_1\r
-       KERNEL2x3_2\r
-       KERNEL2x3_3\r
-       KERNEL2x3_4\r
-\r
-       je      .L6_36\r
+       KERNEL2x3_SUBN\r
+       KERNEL2x3_SUBN\r
+       KERNEL2x3_SUBN\r
+       KERNEL2x3_SUBN\r
 \r
-       jmp     .L6_32\r
+       KERNEL2x3_SUBN\r
+       KERNEL2x3_SUBN\r
+       KERNEL2x3_SUBN\r
+       KERNEL2x3_SUBN\r
+       dec %rax\r
+       jne     .L6_32\r
        ALIGN_4\r
 \r
 .L6_36:\r
        andq    $7, %rax                # if (k & 1)\r
        je .L6_39\r
 \r
-       movq    %rax, BI                        //  Index for BO\r
-        leaq    (BI,BI,2), BI                   //  BI = BI * 3 ; number of values\r
-       \r
-       salq    $1, %rax                        // rax = rax *2 ; number of values\r
-       leaq    (AO, %rax, SIZE), AO\r
-       leaq    (BO, BI, SIZE), BO\r
-       negq    BI\r
-       negq    %rax\r
        ALIGN_4\r
 \r
 .L6_37:\r
 \r
-       KERNEL2x3_SUB\r
+       KERNEL2x3_SUBN\r
 \r
-       jl      .L6_37\r
+       dec %rax\r
+       jne     .L6_37\r
        ALIGN_4\r
 \r
 \r
 \r
 .L6_41:\r
         leaq    BUFFER1, BO             // first buffer to BO\r
-        addq    $6 * SIZE, BO\r
+        addq    $12 * SIZE, BO\r
 \r
        vzeroall\r
 \r
         movq    K, %rax\r
 \r
-       andq    $-8, %rax\r
+       sarq    $3,%rax\r
        je      .L6_46\r
-       movq    %rax, BI                        //  Index for BO\r
-        leaq    (BI,BI,2), BI                   //  BI = BI * 3 ; number of values\r
 \r
-       leaq    (AO, %rax, SIZE), AO\r
-       leaq    (BO, BI, SIZE), BO\r
-       negq    BI\r
-       negq    %rax\r
        ALIGN_4\r
 \r
 .L6_42:\r
 \r
-       KERNEL1x3_1\r
-       KERNEL1x3_2\r
-       KERNEL1x3_3\r
-       KERNEL1x3_4\r
-\r
-       KERNEL1x3_1\r
-       KERNEL1x3_2\r
-       KERNEL1x3_3\r
-       KERNEL1x3_4\r
-\r
-       je      .L6_46\r
-\r
-       KERNEL1x3_1\r
-       KERNEL1x3_2\r
-       KERNEL1x3_3\r
-       KERNEL1x3_4\r
+       KERNEL1x3_SUBN\r
+       KERNEL1x3_SUBN\r
+       KERNEL1x3_SUBN\r
+       KERNEL1x3_SUBN\r
 \r
-       KERNEL1x3_1\r
-       KERNEL1x3_2\r
-       KERNEL1x3_3\r
-       KERNEL1x3_4\r
+       KERNEL1x3_SUBN\r
+       KERNEL1x3_SUBN\r
+       KERNEL1x3_SUBN\r
+       KERNEL1x3_SUBN\r
 \r
-       je      .L6_46\r
-\r
-       jmp     .L6_42\r
+       dec %rax\r
+       jne     .L6_42\r
        ALIGN_4\r
 \r
 .L6_46:\r
        andq    $7, %rax                # if (k & 1)\r
        je .L6_49\r
 \r
-       movq    %rax, BI                        //  Index for BO\r
-        leaq    (BI,BI,2), BI                   //  BI = BI * 3 ; number of values\r
-\r
-       leaq    (AO, %rax, SIZE), AO\r
-       leaq    (BO, BI, SIZE), BO\r
-       negq    BI\r
-       negq    %rax\r
        ALIGN_4\r
 \r
 .L6_47:\r
 \r
-       KERNEL1x3_SUB\r
+       KERNEL1x3_SUBN\r
 \r
-       jl      .L6_47\r
+       dec     %rax\r
+       jne     .L6_47\r
        ALIGN_4\r
 \r
 \r
 \r
        \r
        movq    A, AO                   // aoffset = a\r
-       addq    $32 * SIZE, AO\r
+       addq    $16 * SIZE, AO\r
 \r
        movq    M,  I\r
        sarq    $4, I                   // i = (m >> 4)\r
 \r
 .L7_11:\r
         leaq    BUFFER2, BO             // second buffer to BO\r
-        addq    $6 * SIZE, BO\r
+        addq    $12 * SIZE, BO\r
+\r
+       prefetcht0      (CO1)\r
+       prefetcht0      (CO1,LDC,1)\r
+       prefetcht0      (CO1,LDC,2)\r
+       prefetcht0      64(CO1)\r
+       prefetcht0      64(CO1,LDC,1)\r
+       prefetcht0      64(CO1,LDC,2)\r
 \r
        vzeroall\r
 \r
         movq    K, %rax\r
 \r
-       andq    $-8, %rax                       //  K = K - ( K % 8 )\r
+       sarq $3, %rax                   // K / 8\r
        je      .L7_16\r
-       movq    %rax, BI                        //  Index for BO\r
-        leaq    (BI,BI,2), BI                   //  BI = BI * 3 ; number of values\r
-\r
-       salq    $4, %rax                        // rax = rax * 16 ; number of values\r
-       leaq    (AO, %rax, SIZE), AO\r
-       leaq    (BO, BI, SIZE), BO\r
-       negq    BI\r
-       negq    %rax\r
-       ALIGN_4\r
+       ALIGN_5\r
 \r
 .L7_12:\r
-\r
-       prefetcht0      B_PR1(BO,BI,8)\r
-       KERNEL16x3_1\r
-       KERNEL16x3_2\r
-       KERNEL16x3_3\r
-       prefetcht0      B_PR1+64(BO,BI,8)\r
-       KERNEL16x3_4\r
-\r
-       KERNEL16x3_1\r
-       KERNEL16x3_2\r
-       prefetcht0      B_PR1+32(BO,BI,8)\r
-       KERNEL16x3_3\r
-       KERNEL16x3_4\r
-\r
-       je      .L7_16\r
-\r
-       prefetcht0      B_PR1(BO,BI,8)\r
-       KERNEL16x3_1\r
-       KERNEL16x3_2\r
-       KERNEL16x3_3\r
-       prefetcht0      B_PR1+64(BO,BI,8)\r
-       KERNEL16x3_4\r
-\r
-       KERNEL16x3_1\r
-       KERNEL16x3_2\r
-       prefetcht0      B_PR1+32(BO,BI,8)\r
-       KERNEL16x3_3\r
-       KERNEL16x3_4\r
-\r
-       je      .L7_16\r
-\r
-       jmp     .L7_12\r
+/*\r
+       prefetcht0      B_PR1(BO)\r
+       prefetcht0      B_PR1+64(BO)\r
+       prefetcht0      B_PR1+128(BO)\r
+*/\r
+       KERNEL16x3_SUBN\r
+       KERNEL16x3_SUBN\r
+       KERNEL16x3_SUBN\r
+       KERNEL16x3_SUBN\r
+\r
+       KERNEL16x3_SUBN\r
+       KERNEL16x3_SUBN\r
+       KERNEL16x3_SUBN\r
+       KERNEL16x3_SUBN\r
+       dec %rax\r
+       jne     .L7_12\r
        ALIGN_4\r
 \r
 .L7_16:\r
        andq    $7, %rax                # if (k & 1)\r
        je .L7_19\r
 \r
-       movq    %rax, BI                        //  Index for BO\r
-        leaq    (BI,BI,2), BI                   //  BI = BI * 3 ; number of values\r
-\r
-       salq    $4, %rax                        // rax = rax * 16 ; number of values\r
-       leaq    (AO, %rax, SIZE), AO\r
-       leaq    (BO, BI, SIZE), BO\r
-       negq    BI\r
-       negq    %rax\r
-       ALIGN_4\r
+       ALIGN_5\r
 \r
 .L7_17:\r
 \r
-       KERNEL16x3_SUB\r
+       KERNEL16x3_SUBN\r
 \r
-       jl      .L7_17\r
-       ALIGN_4\r
+       dec     %rax\r
+       jne     .L7_17\r
 \r
 \r
 .L7_19:\r
 \r
 .L7_20_1:\r
         leaq    BUFFER2, BO             // first buffer to BO\r
-        addq    $6 * SIZE, BO\r
+        addq    $12 * SIZE, BO\r
 \r
        vzeroall\r
 \r
         movq    K, %rax\r
 \r
-       andq    $-8, %rax\r
+       sarq    $3, %rax\r
        je      .L7_20_6\r
-       movq    %rax, BI                        //  Index for BO\r
-        leaq    (BI,BI,2), BI                   //  BI = BI * 3 ; number of values\r
 \r
-       salq    $3, %rax                        // rax = rax * 8 ; number of values\r
-       leaq    (AO, %rax, SIZE), AO\r
-       leaq    (BO, BI, SIZE), BO\r
-       negq    BI\r
-       negq    %rax\r
        ALIGN_4\r
 \r
 .L7_20_2:\r
 \r
-       prefetcht0      B_PR1(BO,BI,8)\r
-       KERNEL8x3_1\r
-       KERNEL8x3_2\r
-       KERNEL8x3_3\r
-       prefetcht0      B_PR1+64(BO,BI,8)\r
-       KERNEL8x3_4\r
-\r
-       KERNEL8x3_1\r
-       KERNEL8x3_2\r
-       prefetcht0      B_PR1+32(BO,BI,8)\r
-       KERNEL8x3_3\r
-       KERNEL8x3_4\r
-\r
-       je      .L7_20_6\r
-\r
-       prefetcht0      B_PR1(BO,BI,8)\r
-       KERNEL8x3_1\r
-       KERNEL8x3_2\r
-       KERNEL8x3_3\r
-       prefetcht0      B_PR1+64(BO,BI,8)\r
-       KERNEL8x3_4\r
-\r
-       KERNEL8x3_1\r
-       KERNEL8x3_2\r
-       prefetcht0      B_PR1+128(BO,BI,8)\r
-       KERNEL8x3_3\r
-       KERNEL8x3_4\r
+       KERNEL8x3_SUBN\r
+       KERNEL8x3_SUBN\r
+       KERNEL8x3_SUBN\r
+       KERNEL8x3_SUBN\r
 \r
-       je      .L7_20_6\r
+       KERNEL8x3_SUBN\r
+       KERNEL8x3_SUBN\r
+       KERNEL8x3_SUBN\r
+       KERNEL8x3_SUBN\r
 \r
-       jmp     .L7_20_2\r
+       dec %rax\r
+       jne     .L7_20_2\r
        ALIGN_4\r
 \r
 .L7_20_6:\r
        andq    $7, %rax                # if (k & 1)\r
        je .L7_20_9\r
 \r
-       movq    %rax, BI                        //  Index for BO\r
-        leaq    (BI,BI,2), BI                   //  BI = BI * 3 ; number of values\r
-\r
-       salq    $3, %rax                        // rax = rax * 8 ; number of values\r
-       leaq    (AO, %rax, SIZE), AO\r
-       leaq    (BO, BI, SIZE), BO\r
-       negq    BI\r
-       negq    %rax\r
        ALIGN_4\r
 \r
 .L7_20_7:\r
 \r
-       KERNEL8x3_SUB\r
+       KERNEL8x3_SUBN\r
 \r
-       jl      .L7_20_7\r
+       dec %rax\r
+       jne     .L7_20_7\r
        ALIGN_4\r
 \r
 .L7_20_9:\r
 \r
 .L7_21:\r
         leaq    BUFFER2, BO             // second buffer to BO\r
-        addq    $6 * SIZE, BO\r
+        addq    $12 * SIZE, BO\r
 \r
        vzeroall\r
 \r
         movq    K, %rax\r
 \r
-       andq    $-8, %rax\r
+       sarq    $3, %rax\r
        je      .L7_26\r
-       movq    %rax, BI                        //  Index for BO\r
-        leaq    (BI,BI,2), BI                   //  BI = BI * 3 ; number of values\r
 \r
-       salq    $2, %rax                        // rax = rax * 4 ; number of values\r
-       leaq    (AO, %rax, SIZE), AO\r
-       leaq    (BO, BI, SIZE), BO\r
-       negq    BI\r
-       negq    %rax\r
        ALIGN_4\r
 \r
 .L7_22:\r
 \r
-       prefetcht0      B_PR1(BO,BI,8)\r
-       KERNEL4x3_1\r
-       KERNEL4x3_2\r
-       KERNEL4x3_3\r
-       prefetcht0      B_PR1+64(BO,BI,8)\r
-       KERNEL4x3_4\r
-\r
-       KERNEL4x3_1\r
-       KERNEL4x3_2\r
-       prefetcht0      B_PR1+32(BO,BI,8)\r
-       KERNEL4x3_3\r
-       KERNEL4x3_4\r
+       KERNEL4x3_SUBN\r
+       KERNEL4x3_SUBN\r
+       KERNEL4x3_SUBN\r
+       KERNEL4x3_SUBN\r
 \r
-       je      .L7_26\r
+       KERNEL4x3_SUBN\r
+       KERNEL4x3_SUBN\r
+       KERNEL4x3_SUBN\r
+       KERNEL4x3_SUBN\r
 \r
-       prefetcht0      B_PR1(BO,BI,8)\r
-       KERNEL4x3_1\r
-       KERNEL4x3_2\r
-       KERNEL4x3_3\r
-       prefetcht0      B_PR1+64(BO,BI,8)\r
-       KERNEL4x3_4\r
-\r
-       KERNEL4x3_1\r
-       KERNEL4x3_2\r
-       prefetcht0      B_PR1+32(BO,BI,8)\r
-       KERNEL4x3_3\r
-       KERNEL4x3_4\r
-\r
-       je      .L7_26\r
-\r
-       jmp     .L7_22\r
+       dec %rax\r
+       jne     .L7_22\r
        ALIGN_4\r
 \r
 .L7_26:\r
        andq    $7, %rax                # if (k & 1)\r
        je .L7_29\r
 \r
-       movq    %rax, BI                        //  Index for BO\r
-        leaq    (BI,BI,2), BI                   //  BI = BI * 3 ; number of values\r
-\r
-       salq    $2, %rax                        // rax = rax * 4 ; number of values\r
-       leaq    (AO, %rax, SIZE), AO\r
-       leaq    (BO, BI, SIZE), BO\r
-       negq    BI\r
-       negq    %rax\r
        ALIGN_4\r
 \r
 .L7_27:\r
 \r
-       KERNEL4x3_SUB\r
+       KERNEL4x3_SUBN\r
 \r
-       jl      .L7_27\r
+       dec %rax\r
+       jne     .L7_27\r
        ALIGN_4\r
 \r
 \r
 \r
 .L7_31:\r
         leaq    BUFFER2, BO             // second buffer to BO\r
-        addq    $6 * SIZE, BO\r
+        addq    $12 * SIZE, BO\r
 \r
        vzeroall\r
 \r
         movq    K, %rax\r
 \r
-       andq    $-8, %rax\r
+       sarq    $3, %rax\r
        je      .L7_36\r
-       movq    %rax, BI                        //  Index for BO\r
-        leaq    (BI,BI,2), BI                   //  BI = BI * 3 ; number of values\r
 \r
-       salq    $1, %rax                        // rax = rax *2 ; number of values\r
-       leaq    (AO, %rax, SIZE), AO\r
-       leaq    (BO, BI, SIZE), BO\r
-       negq    BI\r
-       negq    %rax\r
        ALIGN_4\r
 \r
 .L7_32:\r
 \r
-       KERNEL2x3_1\r
-       KERNEL2x3_2\r
-       KERNEL2x3_3\r
-       KERNEL2x3_4\r
-\r
-       KERNEL2x3_1\r
-       KERNEL2x3_2\r
-       KERNEL2x3_3\r
-       KERNEL2x3_4\r
-\r
-       je      .L7_36\r
-\r
-       KERNEL2x3_1\r
-       KERNEL2x3_2\r
-       KERNEL2x3_3\r
-       KERNEL2x3_4\r
+       KERNEL2x3_SUBN\r
+       KERNEL2x3_SUBN\r
+       KERNEL2x3_SUBN\r
+       KERNEL2x3_SUBN\r
 \r
-       KERNEL2x3_1\r
-       KERNEL2x3_2\r
-       KERNEL2x3_3\r
-       KERNEL2x3_4\r
+       KERNEL2x3_SUBN\r
+       KERNEL2x3_SUBN\r
+       KERNEL2x3_SUBN\r
+       KERNEL2x3_SUBN\r
 \r
-       je      .L7_36\r
-\r
-       jmp     .L7_32\r
+       dec %rax\r
+       jne     .L7_32\r
        ALIGN_4\r
 \r
 .L7_36:\r
        andq    $7, %rax                # if (k & 1)\r
        je .L7_39\r
 \r
-       movq    %rax, BI                        //  Index for BO\r
-        leaq    (BI,BI,2), BI                   //  BI = BI * 3 ; number of values\r
-       \r
-       salq    $1, %rax                        // rax = rax *2 ; number of values\r
-       leaq    (AO, %rax, SIZE), AO\r
-       leaq    (BO, BI, SIZE), BO\r
-       negq    BI\r
-       negq    %rax\r
        ALIGN_4\r
 \r
 .L7_37:\r
 \r
-       KERNEL2x3_SUB\r
+       KERNEL2x3_SUBN\r
 \r
-       jl      .L7_37\r
+       dec %rax\r
+       jne     .L7_37\r
        ALIGN_4\r
 \r
 \r
 \r
 .L7_41:\r
         leaq    BUFFER2, BO             // second buffer to BO\r
-        addq    $6 * SIZE, BO\r
+        addq    $12 * SIZE, BO\r
 \r
        vzeroall\r
 \r
         movq    K, %rax\r
 \r
-       andq    $-8, %rax\r
+       sarq    $3, %rax\r
        je      .L7_46\r
-       movq    %rax, BI                        //  Index for BO\r
-        leaq    (BI,BI,2), BI                   //  BI = BI * 3 ; number of values\r
 \r
-       leaq    (AO, %rax, SIZE), AO\r
-       leaq    (BO, BI, SIZE), BO\r
-       negq    BI\r
-       negq    %rax\r
        ALIGN_4\r
 \r
 .L7_42:\r
+       KERNEL1x3_SUBN\r
+       KERNEL1x3_SUBN\r
+       KERNEL1x3_SUBN\r
+       KERNEL1x3_SUBN\r
 \r
-       KERNEL1x3_1\r
-       KERNEL1x3_2\r
-       KERNEL1x3_3\r
-       KERNEL1x3_4\r
-\r
-       KERNEL1x3_1\r
-       KERNEL1x3_2\r
-       KERNEL1x3_3\r
-       KERNEL1x3_4\r
-\r
-       je      .L7_46\r
-\r
-       KERNEL1x3_1\r
-       KERNEL1x3_2\r
-       KERNEL1x3_3\r
-       KERNEL1x3_4\r
-\r
-       KERNEL1x3_1\r
-       KERNEL1x3_2\r
-       KERNEL1x3_3\r
-       KERNEL1x3_4\r
+       KERNEL1x3_SUBN\r
+       KERNEL1x3_SUBN\r
+       KERNEL1x3_SUBN\r
+       KERNEL1x3_SUBN\r
 \r
-       je      .L7_46\r
-\r
-       jmp     .L7_42\r
+       dec %rax\r
+       jne     .L7_42\r
        ALIGN_4\r
 \r
 .L7_46:\r
        andq    $7, %rax                # if (k & 1)\r
        je .L7_49\r
 \r
-       movq    %rax, BI                        //  Index for BO\r
-        leaq    (BI,BI,2), BI                   //  BI = BI * 3 ; number of values\r
-\r
-       leaq    (AO, %rax, SIZE), AO\r
-       leaq    (BO, BI, SIZE), BO\r
-       negq    BI\r
-       negq    %rax\r
        ALIGN_4\r
 \r
 .L7_47:\r
 \r
-       KERNEL1x3_SUB\r
+       KERNEL1x3_SUBN\r
 \r
-       jl      .L7_47\r
+       dec %rax\r
+       jne     .L7_47\r
        ALIGN_4\r
 \r
 \r
diff --git a/param.h b/param.h
index e4b3871..38ac15c 100644 (file)
--- a/param.h
+++ b/param.h
@@ -1164,6 +1164,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SWITCH_RATIO   4
 
 #ifdef ARCH_X86
+
 #define SGEMM_DEFAULT_UNROLL_M 4
 #define DGEMM_DEFAULT_UNROLL_M 2
 #define QGEMM_DEFAULT_UNROLL_M 2
@@ -1177,44 +1178,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define CGEMM_DEFAULT_UNROLL_N 2
 #define ZGEMM_DEFAULT_UNROLL_N 2
 #define XGEMM_DEFAULT_UNROLL_N 1
+
 #else
-#define SGEMM_DEFAULT_UNROLL_M 8
-#define DGEMM_DEFAULT_UNROLL_M 8
+
+#define SGEMM_DEFAULT_UNROLL_M 16
+#define DGEMM_DEFAULT_UNROLL_M 16
 #define QGEMM_DEFAULT_UNROLL_M 2
 #define CGEMM_DEFAULT_UNROLL_M 8
 #define ZGEMM_DEFAULT_UNROLL_M 4
 #define XGEMM_DEFAULT_UNROLL_M 1
 
-#define SGEMM_DEFAULT_UNROLL_N 8
-#define DGEMM_DEFAULT_UNROLL_N 4
+#define SGEMM_DEFAULT_UNROLL_N 4
+#define DGEMM_DEFAULT_UNROLL_N 2
 #define QGEMM_DEFAULT_UNROLL_N 2
-#define CGEMM_DEFAULT_UNROLL_N 4
-#define ZGEMM_DEFAULT_UNROLL_N 4
+#define CGEMM_DEFAULT_UNROLL_N 2
+#define ZGEMM_DEFAULT_UNROLL_N 2
 #define XGEMM_DEFAULT_UNROLL_N 1
+
 #endif
 
+#ifdef ARCH_X86
+
 #define SGEMM_DEFAULT_P 512
 #define SGEMM_DEFAULT_R sgemm_r
-//#define SGEMM_DEFAULT_R 1024
-
 #define DGEMM_DEFAULT_P 512
 #define DGEMM_DEFAULT_R dgemm_r
-//#define DGEMM_DEFAULT_R 1024
-
 #define QGEMM_DEFAULT_P 504
 #define QGEMM_DEFAULT_R qgemm_r
-
 #define CGEMM_DEFAULT_P 128
-//#define CGEMM_DEFAULT_R cgemm_r
 #define CGEMM_DEFAULT_R 1024
-
 #define ZGEMM_DEFAULT_P 512
 #define ZGEMM_DEFAULT_R zgemm_r
-//#define ZGEMM_DEFAULT_R 1024
-
 #define XGEMM_DEFAULT_P 252
 #define XGEMM_DEFAULT_R xgemm_r
-
 #define SGEMM_DEFAULT_Q 256
 #define DGEMM_DEFAULT_Q 256
 #define QGEMM_DEFAULT_Q 128
@@ -1222,7 +1218,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define ZGEMM_DEFAULT_Q 192
 #define XGEMM_DEFAULT_Q 128
 
-#define GETRF_FACTOR 0.72
+#else
+
+#define SGEMM_DEFAULT_P 768
+#define DGEMM_DEFAULT_P 192
+#define CGEMM_DEFAULT_P 384
+#define ZGEMM_DEFAULT_P 192
+
+#define SGEMM_DEFAULT_Q 168
+#define DGEMM_DEFAULT_Q 128
+#define CGEMM_DEFAULT_Q 168 
+#define ZGEMM_DEFAULT_Q 168
+
+#define SGEMM_DEFAULT_R sgemm_r
+#define DGEMM_DEFAULT_R dgemm_r
+#define CGEMM_DEFAULT_R cgemm_r
+#define ZGEMM_DEFAULT_R zgemm_r
+
+#define QGEMM_DEFAULT_Q 128
+#define QGEMM_DEFAULT_P 504
+#define QGEMM_DEFAULT_R qgemm_r
+#define XGEMM_DEFAULT_P 252
+#define XGEMM_DEFAULT_R xgemm_r
+#define XGEMM_DEFAULT_Q 128
+
+#define CGEMM3M_DEFAULT_UNROLL_N 4
+#define CGEMM3M_DEFAULT_UNROLL_M 8
+#define ZGEMM3M_DEFAULT_UNROLL_N 2
+#define ZGEMM3M_DEFAULT_UNROLL_M 8
+#endif
+
 
 #endif