optimized dtrsm_kernel_LT for POWER8
author Werner Saar <wernsaar@googlemail.com>
Sun, 22 May 2016 13:20:04 +0000 (15:20 +0200)
committer Werner Saar <wernsaar@googlemail.com>
Sun, 22 May 2016 13:20:04 +0000 (15:20 +0200)
kernel/power/dtrsm_kernel_LT_16x4_power8.S
kernel/power/dtrsm_logic_LT_16x4_power8.S

index e1c6249..fdfc5ac 100644 (file)
        li      o24,    24
        li      o32,    32
        li      o48,    48
+       li      PRE,    384
 
        mr      KK,     OFFSET
 
index d5d34b4..540a640 100644 (file)
@@ -18,6 +18,33 @@ DSTRM_LT_L4x16_BEGIN:
 
        mr              BO,     B
 
+       li              L,      -128
+       
+       mr              T1,     CO
+        add             T2,     T1,     LDC
+        add             T3,     T2,     LDC
+        add             T4,     T3,     LDC
+
+        and             T1,     T1,     L
+        and             T2,     T2,     L
+        and             T3,     T3,     L
+        and             T4,     T4,     L
+
+        dcbt            T1,     r0
+        dcbt            T2,     r0
+        dcbt            T3,     r0
+        dcbt            T4,     r0
+
+        addi            T1, T1, 128
+        addi            T2, T2, 128
+        addi            T3, T3, 128
+        addi            T4, T4, 128
+
+        dcbt            T1,     r0
+        dcbt            T2,     r0
+        dcbt            T3,     r0
+        dcbt            T4,     r0
+
 
 DSTRM_LT_L4x16_LOOP_START:
 
@@ -26,15 +53,30 @@ DSTRM_LT_L4x16_LOOP_START:
 
 
        addic.          L,      KK,     0
-       ble             DSTRM_LT_L4x16_SAVE
+       ble-            DSTRM_LT_L4x16_SAVE
 
 DSTRM_LT_L4x16_LOOP:
 
+       dcbt            AO,     PRE
+       dcbt            BO,     PRE
+       KERNEL_16x4
+       addic.          L,      L,      -1
+       ble-            DSTRM_LT_L4x16_SAVE
+
+       dcbt            AO,     PRE
+       KERNEL_16x4
+       addic.          L,      L,      -1
+       ble-            DSTRM_LT_L4x16_SAVE
 
+       dcbt            AO,     PRE
        KERNEL_16x4
+       addic.          L,      L,      -1
+       ble-            DSTRM_LT_L4x16_SAVE
 
+       dcbt            AO,     PRE
+       KERNEL_16x4
        addic.          L,      L,      -1
-       bgt             DSTRM_LT_L4x16_LOOP
+       bgt+            DSTRM_LT_L4x16_LOOP
 
 
 DSTRM_LT_L4x16_SAVE: