optimization of sandybridge cgemm-kernel

author     wernsaar <wernsaar@googlemail.com>
           Tue, 29 Jul 2014 17:07:21 +0000 (19:07 +0200)
committer  wernsaar <wernsaar@googlemail.com>
           Tue, 29 Jul 2014 17:07:21 +0000 (19:07 +0200)
kernel/x86_64/cgemm_kernel_8x2_sandy.S
param.h

diff --git a/kernel/x86_64/cgemm_kernel_8x2_sandy.S b/kernel/x86_64/cgemm_kernel_8x2_sandy.S
index 564b733..c85646d 100644
--- a/kernel/x86_64/cgemm_kernel_8x2_sandy.S
+++ b/kernel/x86_64/cgemm_kernel_8x2_sandy.S
@@ -25,6 +25,32 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
 **********************************************************************************/\r
 \r
+/*********************************************************************\r
+* 2014/07/29 Saar\r
+*        BLASTEST               : OK\r
+*        CTEST                  : OK\r
+*        TEST                   : OK\r
+*\r
+* 2013/10/28 Saar\r
+* Parameters:\r
+*       CGEMM_DEFAULT_UNROLL_N  2\r
+*       CGEMM_DEFAULT_UNROLL_M  8\r
+*       CGEMM_DEFAULT_P         768\r
+*       CGEMM_DEFAULT_Q         512\r
+*       A_PR1                   512\r
+*       B_PR1                   512\r
+*\r
+* 2014/07/29 Saar\r
+* Performance at 6192x6192x6192:\r
+*       1 thread:       49 GFLOPS       (MKL:   52)\r
+*       2 threads:      99 GFLOPS       (MKL:  102)\r
+*       3 threads:     148 GFLOPS       (MKL:  150)\r
+*       4 threads:     195 GFLOPS       (MKL:  194)\r
+*       8 threads:     354 GFLOPS       (MKL:  317)\r
+*\r
+*\r
+*********************************************************************/\r
+\r
 \r
 #define ASSEMBLER\r
 #include "common.h"\r
@@ -192,22 +218,108 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 \r
 /***************************************************************************************************************************/\r
 \r
-.macro KERNEL8x2_SUB\r
+.macro KERNEL8x2_1\r
 \r
         vmovups         -16 * SIZE(AO, %rax, SIZE), %ymm0\r
         vbroadcastss         -8 * SIZE(BO, BI, SIZE), %ymm4\r
-        VFMADDPS_YR(        %ymm8,%ymm4,%ymm0  )\r
         vmovups          -8 * SIZE(AO, %rax, SIZE), %ymm1\r
-        VFMADDPS_YR(        %ymm12,%ymm4,%ymm1 )\r
         vbroadcastss         -7 * SIZE(BO, BI, SIZE), %ymm5\r
+       prefetcht0      A_PR1(AO, %rax, SIZE)\r
+\r
+        VFMADDPS_YR(        %ymm8,%ymm4,%ymm0  )\r
+        vbroadcastss         -6 * SIZE(BO, BI, SIZE), %ymm6\r
         VFMADDPS_YI(        %ymm9,%ymm5,%ymm0  )\r
+        vbroadcastss         -5 * SIZE(BO, BI, SIZE), %ymm7\r
+        VFMADDPS_YR(        %ymm12,%ymm4,%ymm1 )\r
         VFMADDPS_YI(        %ymm13,%ymm5,%ymm1 )\r
-        vbroadcastss         -6 * SIZE(BO, BI, SIZE), %ymm6\r
+\r
+\r
+        VFMADDPS_YR(        %ymm10,%ymm6,%ymm0 )\r
+        vbroadcastss         -4 * SIZE(BO, BI, SIZE), %ymm4\r
+        VFMADDPS_YI(        %ymm11,%ymm7,%ymm0 )\r
+        vbroadcastss         -3 * SIZE(BO, BI, SIZE), %ymm5\r
+        VFMADDPS_YR(        %ymm14,%ymm6,%ymm1 )\r
+        vmovups           0 * SIZE(AO, %rax, SIZE), %ymm0\r
+        VFMADDPS_YI(        %ymm15,%ymm7,%ymm1 )\r
+\r
+        vmovups           8 * SIZE(AO, %rax, SIZE), %ymm1\r
+       prefetcht0      A_PR1+64(AO, %rax, SIZE)\r
+\r
+        VFMADDPS_YR(        %ymm8,%ymm4,%ymm0  )\r
+        vbroadcastss         -2 * SIZE(BO, BI, SIZE), %ymm6\r
+        VFMADDPS_YI(        %ymm9,%ymm5,%ymm0  )\r
+        vbroadcastss         -1 * SIZE(BO, BI, SIZE), %ymm7\r
+        VFMADDPS_YR(        %ymm12,%ymm4,%ymm1 )\r
+        VFMADDPS_YI(        %ymm13,%ymm5,%ymm1 )\r
+\r
+\r
+        VFMADDPS_YR(        %ymm10,%ymm6,%ymm0 )\r
+        vbroadcastss          0 * SIZE(BO, BI, SIZE), %ymm4\r
+        VFMADDPS_YI(        %ymm11,%ymm7,%ymm0 )\r
+        vbroadcastss          1 * SIZE(BO, BI, SIZE), %ymm5\r
+        VFMADDPS_YR(        %ymm14,%ymm6,%ymm1 )\r
+        vmovups          16 * SIZE(AO, %rax, SIZE), %ymm0\r
+        VFMADDPS_YI(        %ymm15,%ymm7,%ymm1 )\r
+\r
+        vmovups          24 * SIZE(AO, %rax, SIZE), %ymm1\r
+       prefetcht0      A_PR1+128(AO, %rax, SIZE)\r
+\r
+        VFMADDPS_YR(        %ymm8,%ymm4,%ymm0  )\r
+        vbroadcastss          2 * SIZE(BO, BI, SIZE), %ymm6\r
+        VFMADDPS_YI(        %ymm9,%ymm5,%ymm0  )\r
+        vbroadcastss          3 * SIZE(BO, BI, SIZE), %ymm7\r
+        VFMADDPS_YR(        %ymm12,%ymm4,%ymm1 )\r
+        VFMADDPS_YI(        %ymm13,%ymm5,%ymm1 )\r
+\r
+\r
         VFMADDPS_YR(        %ymm10,%ymm6,%ymm0 )\r
+        vbroadcastss          4 * SIZE(BO, BI, SIZE), %ymm4\r
+        VFMADDPS_YI(        %ymm11,%ymm7,%ymm0 )\r
+        vbroadcastss          5 * SIZE(BO, BI, SIZE), %ymm5\r
         VFMADDPS_YR(        %ymm14,%ymm6,%ymm1 )\r
+        vmovups          32 * SIZE(AO, %rax, SIZE), %ymm0\r
+        VFMADDPS_YI(        %ymm15,%ymm7,%ymm1 )\r
+\r
+        vmovups          40 * SIZE(AO, %rax, SIZE), %ymm1\r
+       prefetcht0      A_PR1+192(AO, %rax, SIZE)\r
+\r
+        VFMADDPS_YR(        %ymm8,%ymm4,%ymm0  )\r
+        vbroadcastss          6 * SIZE(BO, BI, SIZE), %ymm6\r
+        VFMADDPS_YI(        %ymm9,%ymm5,%ymm0  )\r
+        vbroadcastss          7 * SIZE(BO, BI, SIZE), %ymm7\r
+        VFMADDPS_YR(        %ymm12,%ymm4,%ymm1 )\r
+        VFMADDPS_YI(        %ymm13,%ymm5,%ymm1 )\r
+\r
+        VFMADDPS_YR(        %ymm10,%ymm6,%ymm0 )\r
+        VFMADDPS_YI(        %ymm11,%ymm7,%ymm0 )\r
+        addq    $ 16, BI                           \r
+        VFMADDPS_YR(        %ymm14,%ymm6,%ymm1 )\r
+        VFMADDPS_YI(        %ymm15,%ymm7,%ymm1 )\r
+\r
+        addq    $ 64, %rax                         \r
+.endm\r
+\r
+\r
+.macro KERNEL8x2_SUB\r
+\r
+        vmovups         -16 * SIZE(AO, %rax, SIZE), %ymm0\r
+        vmovups          -8 * SIZE(AO, %rax, SIZE), %ymm1\r
+        vbroadcastss         -8 * SIZE(BO, BI, SIZE), %ymm4\r
+        vbroadcastss         -7 * SIZE(BO, BI, SIZE), %ymm5\r
+\r
+        VFMADDPS_YR(        %ymm8,%ymm4,%ymm0  )\r
+        vbroadcastss         -6 * SIZE(BO, BI, SIZE), %ymm6\r
+        VFMADDPS_YI(        %ymm9,%ymm5,%ymm0  )\r
         vbroadcastss         -5 * SIZE(BO, BI, SIZE), %ymm7\r
+        VFMADDPS_YR(        %ymm12,%ymm4,%ymm1 )\r
+        VFMADDPS_YI(        %ymm13,%ymm5,%ymm1 )\r
+\r
+\r
+        VFMADDPS_YR(        %ymm10,%ymm6,%ymm0 )\r
         VFMADDPS_YI(        %ymm11,%ymm7,%ymm0 )\r
+        VFMADDPS_YR(        %ymm14,%ymm6,%ymm1 )\r
         VFMADDPS_YI(        %ymm15,%ymm7,%ymm1 )\r
+\r
         addq    $ 4 , BI                           \r
         addq    $ 16, %rax                         \r
 .endm\r
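
On the macro structure: KERNEL8x2_1 is KERNEL8x2_SUB unrolled four times (BI advances by 16 instead of 4, %rax by 64 instead of 16), with the A loads, B broadcasts and prefetches of adjacent sub-steps interleaved to hide latency. The VFMADDPS_YR/VFMADDPS_YI pairs accumulate the real and imaginary broadcasts of B into separate registers. A rough scalar model of what one sub-step adds to the 8x2 tile, with hypothetical acc_r/acc_i arrays standing in for ymm8-ymm15 and the vector register layout ignored:

    /* One KERNEL8x2_SUB step: rank-1 update of an 8x2 complex tile.
     * a: 8 complex elements of A (interleaved re,im); b: 2 complex of B.
     * acc_r accumulates b.re * a, acc_i accumulates b.im * a. */
    typedef struct { float re, im; } cfloat;

    static void kernel8x2_sub_model(const cfloat a[8], const cfloat b[2],
                                    cfloat acc_r[2][8], cfloat acc_i[2][8])
    {
        for (int j = 0; j < 2; j++) {        /* UNROLL_N = 2 columns */
            for (int i = 0; i < 8; i++) {    /* UNROLL_M = 8 rows    */
                acc_r[j][i].re += b[j].re * a[i].re;
                acc_r[j][i].im += b[j].re * a[i].im;
                acc_i[j][i].re += b[j].im * a[i].re;
                acc_i[j][i].im += b[j].im * a[i].im;
            }
        }
    }

After the K loop the two halves are combined, with sign flips depending on the conjugation mode, into the complex result; the model deliberately leaves that step out.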
@@ -984,47 +1096,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 \r
 .L2_8_12:\r
 \r
-       prefetcht0      A_PR1(AO,%rax,SIZE)\r
        prefetcht0      B_PR1(BO,BI,SIZE)\r
-       KERNEL8x2_SUB\r
-       prefetcht0      A_PR1(AO,%rax,SIZE)\r
-       KERNEL8x2_SUB\r
-       prefetcht0      A_PR1(AO,%rax,SIZE)\r
-       KERNEL8x2_SUB\r
-       prefetcht0      A_PR1(AO,%rax,SIZE)\r
-       KERNEL8x2_SUB\r
+       KERNEL8x2_1\r
 \r
-       prefetcht0      A_PR1(AO,%rax,SIZE)\r
        prefetcht0      B_PR1(BO,BI,SIZE)\r
-       KERNEL8x2_SUB\r
-       prefetcht0      A_PR1(AO,%rax,SIZE)\r
-       KERNEL8x2_SUB\r
-       prefetcht0      A_PR1(AO,%rax,SIZE)\r
-       KERNEL8x2_SUB\r
-       prefetcht0      A_PR1(AO,%rax,SIZE)\r
-       KERNEL8x2_SUB\r
+       KERNEL8x2_1\r
 \r
        je      .L2_8_16\r
 \r
-       prefetcht0      A_PR1(AO,%rax,SIZE)\r
        prefetcht0      B_PR1(BO,BI,SIZE)\r
-       KERNEL8x2_SUB\r
-       prefetcht0      A_PR1(AO,%rax,SIZE)\r
-       KERNEL8x2_SUB\r
-       prefetcht0      A_PR1(AO,%rax,SIZE)\r
-       KERNEL8x2_SUB\r
-       prefetcht0      A_PR1(AO,%rax,SIZE)\r
-       KERNEL8x2_SUB\r
+       KERNEL8x2_1\r
 \r
-       prefetcht0      A_PR1(AO,%rax,SIZE)\r
        prefetcht0      B_PR1(BO,BI,SIZE)\r
-       KERNEL8x2_SUB\r
-       prefetcht0      A_PR1(AO,%rax,SIZE)\r
-       KERNEL8x2_SUB\r
-       prefetcht0      A_PR1(AO,%rax,SIZE)\r
-       KERNEL8x2_SUB\r
-       prefetcht0      A_PR1(AO,%rax,SIZE)\r
-       KERNEL8x2_SUB\r
+       KERNEL8x2_1\r
 \r
        je      .L2_8_16\r
 \r
@@ -1152,7 +1236,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .L2_4_12:\r
 \r
        prefetcht0      A_PR1(AO,%rax,SIZE)\r
-       prefetcht0      B_PR1(BO,BI,SIZE)\r
        KERNEL4x2_SUB\r
        KERNEL4x2_SUB\r
        prefetcht0      A_PR1(AO,%rax,SIZE)\r
@@ -1160,7 +1243,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x2_SUB\r
 \r
        prefetcht0      A_PR1(AO,%rax,SIZE)\r
-       prefetcht0      B_PR1(BO,BI,SIZE)\r
        KERNEL4x2_SUB\r
        KERNEL4x2_SUB\r
        prefetcht0      A_PR1(AO,%rax,SIZE)\r
@@ -1170,7 +1252,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        je      .L2_4_16\r
 \r
        prefetcht0      A_PR1(AO,%rax,SIZE)\r
-       prefetcht0      B_PR1(BO,BI,SIZE)\r
        KERNEL4x2_SUB\r
        KERNEL4x2_SUB\r
        prefetcht0      A_PR1(AO,%rax,SIZE)\r
@@ -1178,7 +1259,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x2_SUB\r
 \r
        prefetcht0      A_PR1(AO,%rax,SIZE)\r
-       prefetcht0      B_PR1(BO,BI,SIZE)\r
        KERNEL4x2_SUB\r
        KERNEL4x2_SUB\r
        prefetcht0      A_PR1(AO,%rax,SIZE)\r
@@ -1305,14 +1385,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .L2_4_22:\r
 \r
        prefetcht0      A_PR1(AO,%rax,SIZE)\r
-       prefetcht0      B_PR1(BO,BI,SIZE)\r
        KERNEL2x2_SUB\r
        KERNEL2x2_SUB\r
        KERNEL2x2_SUB\r
        KERNEL2x2_SUB\r
 \r
        prefetcht0      A_PR1(AO,%rax,SIZE)\r
-       prefetcht0      B_PR1(BO,BI,SIZE)\r
        KERNEL2x2_SUB\r
        KERNEL2x2_SUB\r
        KERNEL2x2_SUB\r
@@ -1321,14 +1399,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        je      .L2_4_26\r
 \r
        prefetcht0      A_PR1(AO,%rax,SIZE)\r
-       prefetcht0      B_PR1(BO,BI,SIZE)\r
        KERNEL2x2_SUB\r
        KERNEL2x2_SUB\r
        KERNEL2x2_SUB\r
        KERNEL2x2_SUB\r
 \r
        prefetcht0      A_PR1(AO,%rax,SIZE)\r
-       prefetcht0      B_PR1(BO,BI,SIZE)\r
        KERNEL2x2_SUB\r
        KERNEL2x2_SUB\r
        KERNEL2x2_SUB\r
@@ -1507,13 +1583,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .L2_4_42:\r
 \r
        prefetcht0      A_PR1(AO,%rax,SIZE)\r
-       prefetcht0      B_PR1(BO,BI,SIZE)\r
        KERNEL1x2_SUB\r
        KERNEL1x2_SUB\r
        KERNEL1x2_SUB\r
        KERNEL1x2_SUB\r
 \r
-       prefetcht0      B_PR1(BO,BI,SIZE)\r
        KERNEL1x2_SUB\r
        KERNEL1x2_SUB\r
        KERNEL1x2_SUB\r
@@ -1522,13 +1596,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        je      .L2_4_46\r
 \r
        prefetcht0      A_PR1(AO,%rax,SIZE)\r
-       prefetcht0      B_PR1(BO,BI,SIZE)\r
        KERNEL1x2_SUB\r
        KERNEL1x2_SUB\r
        KERNEL1x2_SUB\r
        KERNEL1x2_SUB\r
 \r
-       prefetcht0      B_PR1(BO,BI,SIZE)\r
        KERNEL1x2_SUB\r
        KERNEL1x2_SUB\r
        KERNEL1x2_SUB\r
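
The patch states no rationale, but dropping the B_PR1 prefetches from the 4x2, 2x2 and 1x2 tail kernels is consistent with how little of B they touch: each KERNELnx2_SUB advances BI by 4 floats (as in KERNEL8x2_SUB above), so a block of eight calls streams through only two cache lines of B, well within what the hardware prefetcher tracks on its own. The arithmetic, as an illustrative sketch:

    #include <stdio.h>

    int main(void)
    {
        /* Each KERNELnx2_SUB advances BI by 4 floats (addq $4, BI). */
        int b_bytes_per_sub = 4 * 4;           /* 16 bytes           */
        int subs_per_block  = 8;               /* calls per unroll   */
        printf("B touched per 8-call block: %d bytes (%d cache lines)\n",
               b_bytes_per_sub * subs_per_block,
               b_bytes_per_sub * subs_per_block / 64);
        return 0;
    }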
diff --git a/param.h b/param.h
index c545d21..82f4ad8 100644
--- a/param.h
+++ b/param.h
@@ -1134,9 +1134,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define QGEMM_DEFAULT_P 504
 #define QGEMM_DEFAULT_R qgemm_r
 
-#define CGEMM_DEFAULT_P 384
-//#define CGEMM_DEFAULT_R cgemm_r
-#define CGEMM_DEFAULT_R 1024
+#define CGEMM_DEFAULT_P 768
+#define CGEMM_DEFAULT_R cgemm_r
+//#define CGEMM_DEFAULT_R 1024
 
 #define ZGEMM_DEFAULT_P 512
 #define ZGEMM_DEFAULT_R zgemm_r
@@ -1148,7 +1148,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SGEMM_DEFAULT_Q 384
 #define DGEMM_DEFAULT_Q 256
 #define QGEMM_DEFAULT_Q 128
-#define CGEMM_DEFAULT_Q 192
+#define CGEMM_DEFAULT_Q 512
 #define ZGEMM_DEFAULT_Q 192
 #define XGEMM_DEFAULT_Q 128
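
The param.h side matches the header comment in the kernel: CGEMM_DEFAULT_P rises from 384 to 768, CGEMM_DEFAULT_Q from 192 to 512, and CGEMM_DEFAULT_R returns to the runtime-computed cgemm_r instead of a fixed 1024. A quick footprint check on the packed A panel (the cache-fit remark is an assumption, not from the patch):

    #include <stdio.h>

    int main(void)
    {
        long P = 768, Q = 512;            /* CGEMM_DEFAULT_P / _Q           */
        long bytes = P * Q * 2 * 4;       /* complex float = 8 bytes        */
        /* 3072 KB: sized for Sandy Bridge's L3 rather than the 256 KB L2.  */
        printf("packed A panel: %ld KB\n", bytes / 1024);
        return 0;
    }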