From e7c969e164900ed19461566af1a2201f98bc0a36 Mon Sep 17 00:00:00 2001
From: Werner Saar
Date: Sat, 13 Jun 2015 16:16:29 +0200
Subject: [PATCH] added optimized dtrmm_kernel for haswell

---
 kernel/x86_64/KERNEL.HASWELL             |    2 +-
 kernel/x86_64/dtrmm_kernel_4x8_haswell.c | 1546 ++++++++++++++++++++++++++++++
 2 files changed, 1547 insertions(+), 1 deletion(-)
 create mode 100644 kernel/x86_64/dtrmm_kernel_4x8_haswell.c

diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL
index a01dc1a..a4686de 100644
--- a/kernel/x86_64/KERNEL.HASWELL
+++ b/kernel/x86_64/KERNEL.HASWELL
@@ -40,7 +40,7 @@ SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
 SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
 SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
 
-DTRMMKERNEL = ../generic/trmmkernel_4x8.c
+DTRMMKERNEL = dtrmm_kernel_4x8_haswell.c
 DGEMMKERNEL = dgemm_kernel_4x8_haswell.S
 DGEMMINCOPY = ../generic/gemm_ncopy_4.c
 DGEMMITCOPY = ../generic/gemm_tcopy_4.c
diff --git a/kernel/x86_64/dtrmm_kernel_4x8_haswell.c b/kernel/x86_64/dtrmm_kernel_4x8_haswell.c
new file mode 100644
index 0000000..504c784
--- /dev/null
+++ b/kernel/x86_64/dtrmm_kernel_4x8_haswell.c
@@ -0,0 +1,1546 @@
+#include "common.h"
+#include <stdbool.h>
+
+/* 4x8 micro kernel: writes a 4x8 block of C = alpha * A * B, where a points
+   to a packed 4 x n panel of A, b to a packed n x 8 panel of B, and C0..C7
+   are the eight column pointers of the output block. */
+static void dtrmm_kernel_4x8( BLASLONG n, FLOAT *alpha, FLOAT *a, FLOAT *b, FLOAT *C0, FLOAT *C1, FLOAT *C2, FLOAT *C3, FLOAT *C4, FLOAT *C5, FLOAT *C6, FLOAT *C7) __attribute__ ((noinline));
+
+static void dtrmm_kernel_4x8( BLASLONG n, FLOAT *alpha, FLOAT *a, FLOAT *b, FLOAT *C0, FLOAT *C1, FLOAT *C2, FLOAT *C3, FLOAT *C4, FLOAT *C5, FLOAT *C6, FLOAT *C7)
+{
+   BLASLONG I = 0;
+   BLASLONG temp1 = n * 8;
+
+   __asm__ __volatile__
+   (
+   "   vxorpd   %%ymm4 , %%ymm4 , %%ymm4    \n\t"   // clear the eight accumulators
+   "   vxorpd   %%ymm5 , %%ymm5 , %%ymm5    \n\t"
+   "   vxorpd   %%ymm6 , %%ymm6 , %%ymm6    \n\t"
+   "   vxorpd   %%ymm7 , %%ymm7 , %%ymm7    \n\t"
+   "   vxorpd   %%ymm8 , %%ymm8 , %%ymm8    \n\t"
+   "   vxorpd   %%ymm9 , %%ymm9 , %%ymm9    \n\t"
+   "   vxorpd   %%ymm10, %%ymm10, %%ymm10   \n\t"
+   "   vxorpd   %%ymm11, %%ymm11, %%ymm11   \n\t"
+
+   "   cmp      $0, %1                      \n\t"   // skip the loop when n == 0
+   "   jz       2f                          \n\t"
+
+   "   .align 16                            \n\t"
+   "1:                                      \n\t"
+   "   vmovups    (%2,%0,4) , %%ymm0        \n\t"   // four doubles of A
+   "   vmovups    (%3,%0,8) , %%ymm1        \n\t"   // b0..b3
+   "   vmovups  32(%3,%0,8) , %%ymm2        \n\t"   // b4..b7
+
+   "   vfmadd231pd  %%ymm0 , %%ymm1 , %%ymm4    \n\t"
+   "   vfmadd231pd  %%ymm0 , %%ymm2 , %%ymm8    \n\t"
+
+   "   vpermpd  $0xb1 , %%ymm0 , %%ymm0     \n\t"   // rotate A in-register instead of broadcasting
+   "   vfmadd231pd  %%ymm0 , %%ymm1 , %%ymm5    \n\t"
+   "   vfmadd231pd  %%ymm0 , %%ymm2 , %%ymm9    \n\t"
+
+   "   vpermpd  $0x1b , %%ymm0 , %%ymm0     \n\t"
+   "   vfmadd231pd  %%ymm0 , %%ymm1 , %%ymm6    \n\t"
+   "   vfmadd231pd  %%ymm0 , %%ymm2 , %%ymm10   \n\t"
+
+   "   vpermpd  $0xb1 , %%ymm0 , %%ymm0     \n\t"
+   "   vfmadd231pd  %%ymm0 , %%ymm1 , %%ymm7    \n\t"
+   "   vfmadd231pd  %%ymm0 , %%ymm2 , %%ymm11   \n\t"
+
+   "   addq     $8 , %0                     \n\t"
+   "   cmp      %0 , %1                     \n\t"
+   "   jne      1b                          \n\t"
+
+   "2:                                      \n\t"
+
+   "   vbroadcastsd  (%4), %%ymm0           \n\t"   // alpha
+
+   "   vmulpd   %%ymm0 , %%ymm4 , %%ymm4    \n\t"
+   "   vmulpd   %%ymm0 , %%ymm5 , %%ymm5    \n\t"
+   "   vmulpd   %%ymm0 , %%ymm6 , %%ymm6    \n\t"
+   "   vmulpd   %%ymm0 , %%ymm7 , %%ymm7    \n\t"
+   "   vmulpd   %%ymm0 , %%ymm8 , %%ymm8    \n\t"
+   "   vmulpd   %%ymm0 , %%ymm9 , %%ymm9    \n\t"
+   "   vmulpd   %%ymm0 , %%ymm10, %%ymm10   \n\t"
+   "   vmulpd   %%ymm0 , %%ymm11, %%ymm11   \n\t"
+
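+   /*
+    * Because of the vpermpd rotations in the FMA loop, each accumulator
+    * holds the sixteen a_i*b_j products in a shuffled order rather than
+    * one column per register. The vpermpd/vblendpd sequence below is an
+    * in-register 4x4 transpose that regroups them, so that each ymm ends
+    * up holding one full column a0*bj .. a3*bj of the 4x8 block, ready
+    * for a contiguous store through C0..C7.
+    */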
+   "   vpermpd  $0xb1 , %%ymm5 , %%ymm5     \n\t"
+   "   vpermpd  $0xb1 , %%ymm7 , %%ymm7     \n\t"
+
+   "   vblendpd $0x0a , %%ymm5 , %%ymm4 , %%ymm0    \n\t"
+   "   vblendpd $0x05 , %%ymm5 , %%ymm4 , %%ymm1    \n\t"
+   "   vblendpd $0x0a , %%ymm7 , %%ymm6 , %%ymm2    \n\t"
+   "   vblendpd $0x05 , %%ymm7 , %%ymm6 , %%ymm3    \n\t"
+
+   "   vpermpd  $0x1b , %%ymm2 , %%ymm2     \n\t"
+   "   vpermpd  $0x1b , %%ymm3 , %%ymm3     \n\t"
+   "   vpermpd  $0xb1 , %%ymm2 , %%ymm2     \n\t"
+   "   vpermpd  $0xb1 , %%ymm3 , %%ymm3     \n\t"
+
+   "   vblendpd $0x03 , %%ymm0 , %%ymm2 , %%ymm4    \n\t"
+   "   vblendpd $0x03 , %%ymm1 , %%ymm3 , %%ymm5    \n\t"
+   "   vblendpd $0x03 , %%ymm2 , %%ymm0 , %%ymm6    \n\t"
+   "   vblendpd $0x03 , %%ymm3 , %%ymm1 , %%ymm7    \n\t"
+
+   "   vmovups  %%ymm4 , (%5)               \n\t"   // store columns 0..3 of the block
+   "   vmovups  %%ymm5 , (%6)               \n\t"
+   "   vmovups  %%ymm6 , (%7)               \n\t"
+   "   vmovups  %%ymm7 , (%8)               \n\t"
+
+   "   vpermpd  $0xb1 , %%ymm9 , %%ymm9     \n\t"   // same transpose for columns 4..7
+   "   vpermpd  $0xb1 , %%ymm11, %%ymm11    \n\t"
+
+   "   vblendpd $0x0a , %%ymm9 , %%ymm8 , %%ymm0    \n\t"
+   "   vblendpd $0x05 , %%ymm9 , %%ymm8 , %%ymm1    \n\t"
+   "   vblendpd $0x0a , %%ymm11, %%ymm10, %%ymm2    \n\t"
+   "   vblendpd $0x05 , %%ymm11, %%ymm10, %%ymm3    \n\t"
+
+   "   vpermpd  $0x1b , %%ymm2 , %%ymm2     \n\t"
+   "   vpermpd  $0x1b , %%ymm3 , %%ymm3     \n\t"
+   "   vpermpd  $0xb1 , %%ymm2 , %%ymm2     \n\t"
+   "   vpermpd  $0xb1 , %%ymm3 , %%ymm3     \n\t"
+
+   "   vblendpd $0x03 , %%ymm0 , %%ymm2 , %%ymm4    \n\t"
+   "   vblendpd $0x03 , %%ymm1 , %%ymm3 , %%ymm5    \n\t"
+   "   vblendpd $0x03 , %%ymm2 , %%ymm0 , %%ymm6    \n\t"
+   "   vblendpd $0x03 , %%ymm3 , %%ymm1 , %%ymm7    \n\t"
+
+   "   vmovups  %%ymm4 , (%9)               \n\t"
+   "   vmovups  %%ymm5 , (%10)              \n\t"
+   "   vmovups  %%ymm6 , (%11)              \n\t"
+   "   vmovups  %%ymm7 , (%12)              \n\t"
+
+   :
+     "+a" (I)      // 0   loop index; read-write, since it is updated by the addq above
+   :
+     "r" (temp1),  // 1
+     "S" (a),      // 2
+     "D" (b),      // 3
+     "r" (alpha),  // 4
+     "r" (C0),     // 5
+     "r" (C1),     // 6
+     "r" (C2),     // 7
+     "r" (C3),     // 8
+     "r" (C4),     // 9
+     "r" (C5),     // 10
+     "r" (C6),     // 11
+     "r" (C7)      // 12
+   : "cc",
+     "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+     "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+     "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+     "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+     "memory"
+   );
+
+}
+
+
+int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, FLOAT* ba, FLOAT* bb, FLOAT* C, BLASLONG ldc, BLASLONG offset)
+{
+   BLASLONG i,j,k;
+   FLOAT *C0,*C1,*C2,*C3,*C4,*C5,*C6,*C7,*ptrba,*ptrbb;
+
+   FLOAT res0_0, res0_1, res0_2, res0_3;
+   FLOAT res1_0, res1_1, res1_2, res1_3;
+   FLOAT res2_0, res2_1, res2_2, res2_3;
+   FLOAT res3_0, res3_1, res3_2, res3_3;
+   FLOAT res4_0, res4_1, res4_2, res4_3;
+   FLOAT res5_0, res5_1, res5_2, res5_3;
+   FLOAT res6_0, res6_1, res6_2, res6_3;
+   FLOAT res7_0, res7_1, res7_2, res7_3;
+
+   FLOAT a0, a1;
+   FLOAT b0, b1, b2, b3, b4, b5, b6, b7;
+
+   BLASLONG off, temp;
+
+   bool left;
+   bool transposed;
+   bool backwards;
+
+#ifdef LEFT
+   left = true;
+#else
+   left = false;
+#endif
+
+#ifdef TRANSA
+   transposed = true;
+#else
+   transposed = false;
+#endif
+
+   // The traversal direction of the triangular region depends on the
+   // side (LEFT) and transpose (TRANSA) settings.
+   backwards = left != transposed;
+
+   if (!left) {
+      off = -offset;
+   }
+
+   for (j=0; j