Merge pull request #877 from jeromerobert/bug873
author    Zhang Xianyi <traits.zhang@gmail.com>
          Mon, 16 May 2016 15:21:56 +0000 (23:21 +0800)
committer Zhang Xianyi <traits.zhang@gmail.com>
          Mon, 16 May 2016 15:21:56 +0000 (23:21 +0800)
Disable multi-threading in swap

Makefile.power
common_power.h
kernel/power/dgemm_logic_16x4_power8.S
kernel/power/dgemm_macros_16x4_power8.S
param.h

diff --git a/Makefile.power b/Makefile.power
index 48bcb77..589d674 100644
--- a/Makefile.power
+++ b/Makefile.power
@@ -13,10 +13,10 @@ endif
 
 ifeq ($(CORE), POWER8)
 ifeq ($(USE_OPENMP), 1)
-COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DALLOC_SHM -DUSE_OPENMP -fno-fast-math -fopenmp
+COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
 FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
 else
-COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DALLOC_SHM -fno-fast-math
+COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -fno-fast-math
 FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fast-math
 endif
 endif
diff --git a/common_power.h b/common_power.h
index b62aca3..e3a1a7a 100644
--- a/common_power.h
+++ b/common_power.h
@@ -803,7 +803,7 @@ Lmcount$lazy_ptr:
 #elif defined(PPC440FP2)
 #define BUFFER_SIZE     ( 16 << 20)
 #elif defined(POWER8)
-#define BUFFER_SIZE     ( 32 << 20)
+#define BUFFER_SIZE     ( 64 << 20)
 #else
 #define BUFFER_SIZE     ( 16 << 20)
 #endif
diff --git a/kernel/power/dgemm_logic_16x4_power8.S b/kernel/power/dgemm_logic_16x4_power8.S
index 718f80b..edfcc4b 100644
--- a/kernel/power/dgemm_logic_16x4_power8.S
+++ b/kernel/power/dgemm_logic_16x4_power8.S
@@ -39,14 +39,153 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 LDGEMM_L4_BEGIN:
 
-       mr              CO,     C
+       li              T1,     128
+       li              T2,     256
        mr              AO,     A
-       slwi            T1,     LDC     ,       2
-       add             C,      C,      T1
+
+       mr              CO,     C
+       slwi            T3,     LDC     ,       2
+       add             C,      C,      T3
+
+       dcbt            A,      T1
+       dcbt            A,      T2
+
        srawi.          I,      M,      4
        ble             LDGEMM_L4x16_END
 
        .align 4
+LDGEMM_L4x16_BEGIN_FIRST:
+
+       li              L,      -128
+
+       mr              T1,     CO
+       add             T2,     T1,     LDC
+       add             T3,     T2,     LDC
+       add             T4,     T3,     LDC
+
+       and             T1,     T1,     L
+       and             T2,     T2,     L
+       and             T3,     T3,     L
+       and             T4,     T4,     L
+
+       dcbt            T1,     r0
+       dcbt            T2,     r0
+       dcbt            T3,     r0
+       dcbt            T4,     r0
+
+       mr              BO,     B
+       srawi.          L,      K,      2
+
+       addi            T1, T1, 128
+       addi            T2, T2, 128
+       addi            T3, T3, 128
+       addi            T4, T4, 128
+
+       dcbt            T1,     r0
+       dcbt            T2,     r0
+       dcbt            T3,     r0
+       dcbt            T4,     r0
+       
+       ble             LDGEMM_L4x16_SUB0_FIRST
+       cmpwi           cr0,    L,      1
+       ble             LDGEMM_L4x16_SUB4_FIRST
+
+       .align 4
+LDGEMM_L4x16_LOOP_START_FIRST:
+
+       li      T2,     512
+       li      o40,    40
+       li      o56,    56
+
+       dcbt    AO,     PRE
+       dcbt    BO,     T2
+       LOAD4x16_1
+       dcbt    AO,     PRE
+       KERNEL4x16_I1
+       dcbt    AO,     PRE
+       addic.          L,      L,      -2
+       KERNEL4x16_L2
+
+       dcbt    AO,     PRE
+       KERNEL4x16_L1
+       dcbt    AO,     PRE
+       dcbt    BO,     T2
+       KERNEL4x16_L2
+
+       ble             LDGEMM_L4x16_LOOP_END_FIRST
+       mtctr           L
+
+       .align 4
+
+LDGEMM_L4x16_LOOP_FIRST:
+
+       dcbt    AO,     PRE
+       KERNEL4x16_L1
+       dcbt    AO,     PRE
+       KERNEL4x16_L2
+
+       dcbt    AO,     PRE
+       KERNEL4x16_L1
+       dcbt    AO,     PRE
+       dcbt    BO,     T2
+       KERNEL4x16_L2
+
+       bdnz            LDGEMM_L4x16_LOOP_FIRST
+
+       .align 4
+
+LDGEMM_L4x16_LOOP_END_FIRST:
+
+       KERNEL4x16_L1
+       KERNEL4x16_L2
+
+       KERNEL4x16_1
+       KERNEL4x16_E2
+
+       b               LDGEMM_L4x16_SUB1_FIRST
+
+LDGEMM_L4x16_SUB4_FIRST:
+
+       KERNEL4x16_SUBI1
+       KERNEL4x16_SUB1
+       KERNEL4x16_SUB1
+       KERNEL4x16_SUB1
+
+       b               LDGEMM_L4x16_SUB1_FIRST
+
+LDGEMM_L4x16_SUB0_FIRST:
+
+       andi.           L,      K,      3
+
+       KERNEL4x16_SUBI1
+
+       addic.          L,      L,      -1
+       ble             LDGEMM_L4x16_SAVE_FIRST
+       b               LDGEMM_L4x16_SUB2_FIRST
+
+LDGEMM_L4x16_SUB1_FIRST:
+
+       andi.           L,      K,      3
+       ble             LDGEMM_L4x16_SAVE_FIRST
+
+LDGEMM_L4x16_SUB2_FIRST:
+
+       KERNEL4x16_SUB1
+
+       addic.          L,      L,      -1
+       bgt             LDGEMM_L4x16_SUB2_FIRST
+
+       .align 4
+LDGEMM_L4x16_SAVE_FIRST:
+
+       SAVE4x16
+
+       addic.          I,      I,      -1
+       ble             LDGEMM_L4x16_END
+
+LDGEMM_L4x16_END_FIRST:
+
+       .align 4
 LDGEMM_L4x16_BEGIN:
 
        li              L,      -128
@@ -79,9 +218,9 @@ LDGEMM_L4x16_BEGIN:
        dcbt            T3,     r0
        dcbt            T4,     r0
        
-       ble             LDGEMM_L4x16_SUB0
+       ble-            LDGEMM_L4x16_SUB0
        cmpwi           cr0,    L,      1
-       ble             LDGEMM_L4x16_SUB4
+       ble-            LDGEMM_L4x16_SUB4
 
        .align 4
 LDGEMM_L4x16_LOOP_START:
@@ -97,7 +236,8 @@ LDGEMM_L4x16_LOOP_START:
        addic.          L,      L,      -2
        KERNEL4x16_L2
 
-       ble             LDGEMM_L4x16_LOOP_END
+       ble-            LDGEMM_L4x16_LOOP_END
+       mtctr           L
 
        .align 4
 
@@ -107,10 +247,10 @@ LDGEMM_L4x16_LOOP:
        dcbt    AO,     PRE
        KERNEL4x16_L1
        dcbt    AO,     PRE
-       addic.          L,      L,      -1
+       // addic.               L,      L,      -1
        KERNEL4x16_L2
 
-       bgt             LDGEMM_L4x16_LOOP
+       bdnz+           LDGEMM_L4x16_LOOP
 
        .align 4
 
@@ -156,7 +296,7 @@ LDGEMM_L4x16_SAVE:
        SAVE4x16
 
        addic.          I,      I,      -1
-       bgt             LDGEMM_L4x16_BEGIN
+       bgt+            LDGEMM_L4x16_BEGIN
 
 LDGEMM_L4x16_END:
 
diff --git a/kernel/power/dgemm_macros_16x4_power8.S b/kernel/power/dgemm_macros_16x4_power8.S
index 2c78512..5be517f 100644
--- a/kernel/power/dgemm_macros_16x4_power8.S
+++ b/kernel/power/dgemm_macros_16x4_power8.S
@@ -559,10 +559,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 .macro SAVE4x16
 
-       mr              T1,     CO
-       add             T2,     T1,     LDC
-       add             T3,     T2,     LDC
-       add             T4,     T3,     LDC
+       add             T2,     CO,     LDC
 
        lxvd2x          vs0,    0,      CO
        lxvd2x          vs1,    o16,    CO
@@ -570,6 +567,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        lxvd2x          vs3,    o48,    CO
        lxvd2x          vs4,    o64,    CO
        lxvd2x          vs5,    o80,    CO
+       add             T3,     T2,     LDC
        lxvd2x          vs6,    o96,    CO
        lxvd2x          vs7,    o112,   CO
 
@@ -579,6 +577,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        lxvd2x          vs11,   o48,    T2
        lxvd2x          vs12,   o64,    T2
        lxvd2x          vs13,   o80,    T2
+       add             T4,     T3,     LDC
        lxvd2x          vs14,   o96,    T2
        lxvd2x          vs15,   o112,   T2
 
@@ -592,21 +591,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        lxvd2x          vs31,   o112,   T3
 
        xvmaddadp       vs0,    vs32,   alpha_r
-       xvmaddadp       vs1,    vs33,   alpha_r
-       xvmaddadp       vs2,    vs34,   alpha_r
-       xvmaddadp       vs3,    vs35,   alpha_r
-       xvmaddadp       vs4,    vs36,   alpha_r
-       xvmaddadp       vs5,    vs37,   alpha_r
-       xvmaddadp       vs6,    vs38,   alpha_r
-       xvmaddadp       vs7,    vs39,   alpha_r
-
        lxvd2x          vs32,   0,      T4
+       xvmaddadp       vs1,    vs33,   alpha_r
        lxvd2x          vs33,   o16,    T4
+       xvmaddadp       vs2,    vs34,   alpha_r
        lxvd2x          vs34,   o32,    T4
+       xvmaddadp       vs3,    vs35,   alpha_r
        lxvd2x          vs35,   o48,    T4
+       xvmaddadp       vs4,    vs36,   alpha_r
        lxvd2x          vs36,   o64,    T4
+       xvmaddadp       vs5,    vs37,   alpha_r
        lxvd2x          vs37,   o80,    T4
+       xvmaddadp       vs6,    vs38,   alpha_r
        lxvd2x          vs38,   o96,    T4
+       xvmaddadp       vs7,    vs39,   alpha_r
        lxvd2x          vs39,   o112,   T4
 
        xvmaddadp       vs8,    vs40,   alpha_r
@@ -614,58 +612,60 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvmaddadp       vs10,   vs42,   alpha_r
        xvmaddadp       vs11,   vs43,   alpha_r
 
-       stxvd2x         vs0,    0,      T1
-       stxvd2x         vs1,    o16,    T1
-       stxvd2x         vs2,    o32,    T1
-       stxvd2x         vs3,    o48,    T1
-
        xvmaddadp       vs12,   vs44,   alpha_r
        xvmaddadp       vs13,   vs45,   alpha_r
        xvmaddadp       vs14,   vs46,   alpha_r
        xvmaddadp       vs15,   vs47,   alpha_r
 
-       stxvd2x         vs4,    o64,    T1
-       stxvd2x         vs5,    o80,    T1
-       stxvd2x         vs6,    o96,    T1
-       stxvd2x         vs7,    o112,   T1
-
        xvmaddadp       vs24,   vs48,   alpha_r
        xvmaddadp       vs25,   vs49,   alpha_r
        xvmaddadp       vs26,   vs50,   alpha_r
        xvmaddadp       vs27,   vs51,   alpha_r
 
-       stxvd2x         vs8,    o0,     T2
-       stxvd2x         vs9,    o16,    T2
-       stxvd2x         vs10,   o32,    T2
-       stxvd2x         vs11,   o48,    T2
-
        xvmaddadp       vs28,   vs52,   alpha_r
        xvmaddadp       vs29,   vs53,   alpha_r
        xvmaddadp       vs30,   vs54,   alpha_r
        xvmaddadp       vs31,   vs55,   alpha_r
 
-       stxvd2x         vs12,   o64,    T2
-       stxvd2x         vs13,   o80,    T2
-       stxvd2x         vs14,   o96,    T2
-       stxvd2x         vs15,   o112,   T2
+       stxvd2x         vs0,    0,      CO
+       stxvd2x         vs1,    o16,    CO
+       stxvd2x         vs2,    o32,    CO
+       stxvd2x         vs3,    o48,    CO
+
+       stxvd2x         vs4,    o64,    CO
+       stxvd2x         vs5,    o80,    CO
+       stxvd2x         vs6,    o96,    CO
+       stxvd2x         vs7,    o112,   CO
 
        xvmaddadp       vs32,   vs56,   alpha_r
        xvmaddadp       vs33,   vs57,   alpha_r
        xvmaddadp       vs34,   vs58,   alpha_r
        xvmaddadp       vs35,   vs59,   alpha_r
 
-       stxvd2x         vs24,   0,      T3
-       stxvd2x         vs25,   o16,    T3
-       stxvd2x         vs26,   o32,    T3
-       stxvd2x         vs27,   o48,    T3
-
        xvmaddadp       vs36,   vs60,   alpha_r
        xvmaddadp       vs37,   vs61,   alpha_r
        xvmaddadp       vs38,   vs62,   alpha_r
        xvmaddadp       vs39,   vs63,   alpha_r
 
+       addi            CO,     CO,     128
+
+       stxvd2x         vs8,    o0,     T2
+       stxvd2x         vs9,    o16,    T2
+       stxvd2x         vs10,   o32,    T2
+       stxvd2x         vs11,   o48,    T2
+
+       stxvd2x         vs12,   o64,    T2
+       stxvd2x         vs13,   o80,    T2
+       stxvd2x         vs14,   o96,    T2
+       stxvd2x         vs15,   o112,   T2
+
+       stxvd2x         vs24,   0,      T3
+       stxvd2x         vs25,   o16,    T3
        stxvd2x         vs28,   o64,    T3
        stxvd2x         vs29,   o80,    T3
+
+       stxvd2x         vs26,   o32,    T3
+       stxvd2x         vs27,   o48,    T3
        stxvd2x         vs30,   o96,    T3
        stxvd2x         vs31,   o112,   T3
 
@@ -674,8 +674,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        stxvd2x         vs34,   o32,    T4
        stxvd2x         vs35,   o48,    T4
 
-       addi            CO,     CO,     128
-
        stxvd2x         vs36,   o64,    T4
        stxvd2x         vs37,   o80,    T4
        stxvd2x         vs38,   o96,    T4
diff --git a/param.h b/param.h
index abe739a..67f0578 100644
--- a/param.h
+++ b/param.h
@@ -1965,8 +1965,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define DNUMOPT                8
 
 #define GEMM_DEFAULT_OFFSET_A 0 
-#define GEMM_DEFAULT_OFFSET_B 4096
-#define GEMM_DEFAULT_ALIGN 0x03fffUL
+#define GEMM_DEFAULT_OFFSET_B 65536
+#define GEMM_DEFAULT_ALIGN 0x0ffffUL
 
 #define SGEMM_DEFAULT_UNROLL_M 16
 #define SGEMM_DEFAULT_UNROLL_N 8
@@ -1983,7 +1983,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define ZGEMM_DEFAULT_P  320
 
 #define SGEMM_DEFAULT_Q  640
-#define DGEMM_DEFAULT_Q  640
+#define DGEMM_DEFAULT_Q  720
 #define CGEMM_DEFAULT_Q  640
 #define ZGEMM_DEFAULT_Q  640