optimized dgemm for POWER8
author     Werner Saar <wernsaar@googlemail.com>   Wed, 27 Apr 2016 12:01:08 +0000 (14:01 +0200)
committer  Werner Saar <wernsaar@googlemail.com>   Wed, 27 Apr 2016 12:01:08 +0000 (14:01 +0200)
kernel/power/KERNEL.POWER8
kernel/power/dgemm_kernel_16x4_power8.S
kernel/power/dgemm_logic_16x4_power8.S
kernel/power/dgemm_macros_16x4_power8.S
kernel/power/dgemm_tcopy_16_power8.S
kernel/power/dgemm_tcopy_logic_16_power8.S
kernel/power/dtrmm_kernel_16x4_power8.S
kernel/power/dtrmm_macros_16x4_power8.S [new file with mode: 0644]
param.h

diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8
index 0b6a7f3..fb07ccf 100644
@@ -21,12 +21,12 @@ SGEMMOTCOPYOBJ =  sgemm_otcopy.o
 DGEMMKERNEL    =  dgemm_kernel_16x4_power8.S
 DGEMMINCOPY    = ../generic/gemm_ncopy_16.c
 DGEMMITCOPY    =  dgemm_tcopy_16_power8.S
-DGEMMONCOPY    =  gemm_ncopy_4.S
-DGEMMOTCOPY    =  gemm_tcopy_4.S
-DGEMMINCOPYOBJ = dgemm_incopy.o
-DGEMMITCOPYOBJ = dgemm_itcopy.o
-DGEMMONCOPYOBJ = dgemm_oncopy.o
-DGEMMOTCOPYOBJ = dgemm_otcopy.o
+DGEMMONCOPY    =  ../generic/gemm_ncopy_4.c
+DGEMMOTCOPY    =  ../generic/gemm_tcopy_4.c
+DGEMMINCOPYOBJ =  dgemm_incopy.o
+DGEMMITCOPYOBJ =  dgemm_itcopy.o
+DGEMMONCOPYOBJ =  dgemm_oncopy.o
+DGEMMOTCOPYOBJ =  dgemm_otcopy.o
 
 CGEMMKERNEL    = cgemm_kernel_8x4_power8.S
 CGEMMINCOPY    = ../generic/zgemm_ncopy_8.c
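
For orientation, the outer copy routines selected above pack the 4-wide operand by interleaving four columns per k step. Below is a minimal C sketch of that packing pattern; it is illustrative only, not the actual ../generic/gemm_ncopy_4.c source, and the n % 4 tail handling is omitted.

    #include <stddef.h>

    /* Sketch: interleave four columns of a column-major matrix A (leading
     * dimension lda) row by row into a contiguous panel b, so the GEMM
     * kernel can stream the packed operand sequentially. */
    static void ncopy_4_sketch(size_t m, size_t n, const double *a,
                               size_t lda, double *b)
    {
        for (size_t j = 0; j + 4 <= n; j += 4) {
            const double *a0 = a + (j + 0) * lda;
            const double *a1 = a + (j + 1) * lda;
            const double *a2 = a + (j + 2) * lda;
            const double *a3 = a + (j + 3) * lda;
            for (size_t i = 0; i < m; i++) {
                *b++ = a0[i];
                *b++ = a1[i];
                *b++ = a2[i];
                *b++ = a3[i];
            }
        }
    }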
diff --git a/kernel/power/dgemm_kernel_16x4_power8.S b/kernel/power/dgemm_kernel_16x4_power8.S
index 4c14b0c..bcc6ce3 100644
@@ -131,6 +131,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #define o0     0
 
+#define T4     r12
+#define T3     r11
+
 #define o8     r15
 #define o24    r16
 #define ALPHA  r17
@@ -265,7 +268,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        addi    ALPHA, SP, 224
 #endif
 
-       li      PRE, 256 
+       li      PRE, 384 
        li      o8 , 8
        li      o16, 16
        li      o24, 24
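
PRE is the byte offset used by the dcbt hints on the packed A stream in the logic file (dcbt AO, PRE touches the line at AO + PRE). A hedged C equivalent of that fixed-distance software prefetch, assuming POWER8's 128-byte cache lines:

    /* Illustrative only: touch the data 384 bytes (= 3 x 128-byte lines,
     * i.e. 48 doubles) ahead of the packed-A pointer before computing on
     * the current block.  __builtin_prefetch(addr, rw, locality) is the
     * GCC/Clang builtin; rw = 0 requests a read prefetch like dcbt. */
    enum { PRE_BYTES = 384 };

    static inline void prefetch_a(const double *ao)
    {
        __builtin_prefetch((const char *)ao + PRE_BYTES, 0, 1);
    }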
diff --git a/kernel/power/dgemm_logic_16x4_power8.S b/kernel/power/dgemm_logic_16x4_power8.S
index 49c438f..4ad3387 100644
@@ -35,160 +35,154 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
        srawi.          J,      N,      2
-       ble             .LDGEMM_L4_END
+       ble             LDGEMM_L4_END
 
-.LDGEMM_L4_BEGIN:
+LDGEMM_L4_BEGIN:
 
        mr              CO,     C
        mr              AO,     A
        slwi            T1,     LDC     ,       2
        add             C,      C,      T1
        srawi.          I,      M,      4
-       ble             .LDGEMM_L4x16_END
+       ble             LDGEMM_L4x16_END
 
-.LDGEMM_L4x16_BEGIN:
+       .align 5
+LDGEMM_L4x16_BEGIN:
+
+       li              T4,     -128
+
+       and             T1,     CO,     T4
+       add             T2,     T1,     LDC
+       add             T3,     T2,     LDC
+       add             T4,     T3,     LDC
+
+       dcbt            T1,     r0
+       dcbt            T2,     r0
+       dcbt            T3,     r0
+       dcbt            T4,     r0
 
+       andi.           cr0,    CO,     127
+       ble             LDGEMM_L4x16_BEGIN_NOPRE
+
+       addi            T1, T1, 128
+       addi            T2, T2, 128
+       addi            T3, T3, 128
+       addi            T4, T4, 128
+
+       dcbt            T1,     r0
+       dcbt            T2,     r0
+       dcbt            T3,     r0
+       dcbt            T4,     r0
+       
+
+LDGEMM_L4x16_BEGIN_NOPRE:
 
        mr              BO,     B
-       srawi.          L,      K,      3
-       ble             .LDGEMM_L4x16_SUB0
+       srawi.          L,      K,      2
+       ble             LDGEMM_L4x16_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LDGEMM_L4x16_SUB4
+       ble             LDGEMM_L4x16_SUB4
 
-.LDGEMM_L4x16_LOOP_START:
+       .align 5
+LDGEMM_L4x16_LOOP_START:
 
-       dcbt            AO,     PRE
+       dcbt    AO,     PRE
        LOAD4x16_1
-       dcbt            AO,     PRE
+       dcbt    AO,     PRE
        KERNEL4x16_I1
-       dcbt            AO,     PRE
+       dcbt    AO,     PRE
        KERNEL4x16_2
-       dcbt            AO,     PRE
+       dcbt    AO,     PRE
        KERNEL4x16_1
-       dcbt            AO,     PRE
-       KERNEL4x16_2
-
-       dcbt            AO,     PRE
-       KERNEL4x16_1
-       dcbt            AO,     PRE
-       KERNEL4x16_2
-       dcbt            AO,     PRE
-       KERNEL4x16_1
-       dcbt            AO,     PRE
+       dcbt    AO,     PRE
        KERNEL4x16_2
 
        addic.          L,      L,      -2
-       ble             .LDGEMM_L4x16_LOOP_END
+       ble             LDGEMM_L4x16_LOOP_END
 
-       .align 5
+       .align 7
 
-.LDGEMM_L4x16_LOOP:
-
-       dcbt            AO,     PRE
-       KERNEL4x16_1
-       dcbt            AO,     PRE
-       KERNEL4x16_2
-       dcbt            AO,     PRE
-       KERNEL4x16_1
-       dcbt            AO,     PRE
-       KERNEL4x16_2
+LDGEMM_L4x16_LOOP:
 
-       dcbt            AO,     PRE
+       dcbt    AO,     PRE
        KERNEL4x16_1
-       dcbt            AO,     PRE
+       dcbt    AO,     PRE
        KERNEL4x16_2
-       dcbt            AO,     PRE
+       dcbt    AO,     PRE
        KERNEL4x16_1
-       dcbt            AO,     PRE
+       dcbt    AO,     PRE
        KERNEL4x16_2
 
        addic.          L,      L,      -1
-       bgt             .LDGEMM_L4x16_LOOP
+       bgt             LDGEMM_L4x16_LOOP
 
-.LDGEMM_L4x16_LOOP_END:
+       .align 5
+LDGEMM_L4x16_LOOP_END:
 
-       dcbt            AO,     PRE
+       dcbt    AO,     PRE
        KERNEL4x16_1
-       dcbt            AO,     PRE
-       KERNEL4x16_2
-       dcbt            AO,     PRE
-       KERNEL4x16_1
-       dcbt            AO,     PRE
+       dcbt    AO,     PRE
        KERNEL4x16_2
-
-       dcbt            AO,     PRE
-       KERNEL4x16_1
-       dcbt            AO,     PRE
-       KERNEL4x16_2
-       dcbt            AO,     PRE
        KERNEL4x16_1
        KERNEL4x16_E2
 
-       b               .LDGEMM_L4x16_SUB1
+       b               LDGEMM_L4x16_SUB1
 
-.LDGEMM_L4x16_SUB4:
+LDGEMM_L4x16_SUB4:
 
-       dcbt            AO,     PRE
        KERNEL4x16_SUBI1
-       dcbt            AO,     PRE
        KERNEL4x16_SUB1
-       dcbt            AO,     PRE
        KERNEL4x16_SUB1
-       dcbt            AO,     PRE
        KERNEL4x16_SUB1
 
-       KERNEL4x16_SUB1
-       KERNEL4x16_SUB1
-       KERNEL4x16_SUB1
-       KERNEL4x16_SUB1
+       b               LDGEMM_L4x16_SUB1
 
-       b               .LDGEMM_L4x16_SUB1
+LDGEMM_L4x16_SUB0:
 
-.LDGEMM_L4x16_SUB0:
-
-       andi.           L,      K,      7
+       andi.           L,      K,      3
 
        KERNEL4x16_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LDGEMM_L4x16_SAVE
-       b               .LDGEMM_L4x16_SUB2
+       ble             LDGEMM_L4x16_SAVE
+       b               LDGEMM_L4x16_SUB2
 
-.LDGEMM_L4x16_SUB1:
+LDGEMM_L4x16_SUB1:
 
-       andi.           L,      K,      7
-       ble             .LDGEMM_L4x16_SAVE
+       andi.           L,      K,      3
+       ble             LDGEMM_L4x16_SAVE
 
-.LDGEMM_L4x16_SUB2:
+LDGEMM_L4x16_SUB2:
 
        KERNEL4x16_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LDGEMM_L4x16_SUB2
+       bgt             LDGEMM_L4x16_SUB2
 
-.LDGEMM_L4x16_SAVE:
+       .align 5
+LDGEMM_L4x16_SAVE:
 
        SAVE4x16
 
        addic.          I,      I,      -1
-       bgt             .LDGEMM_L4x16_BEGIN
+       bgt             LDGEMM_L4x16_BEGIN
 
-.LDGEMM_L4x16_END:
+LDGEMM_L4x16_END:
 
-.LDGEMM_L4x8_BEGIN:
+LDGEMM_L4x8_BEGIN:
 
        andi.           T2,     M,      15
-       ble             .LDGEMM_L4x1_END
+       ble             LDGEMM_L4x1_END
 
        andi.           T1,     M,      8
-       ble             .LDGEMM_L4x8_END
+       ble             LDGEMM_L4x8_END
        mr              BO,     B
        srawi.          L,      K,      3
-       ble             .LDGEMM_L4x8_SUB0
+       ble             LDGEMM_L4x8_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LDGEMM_L4x8_SUB4
+       ble             LDGEMM_L4x8_SUB4
 
-.LDGEMM_L4x8_LOOP_START:
+LDGEMM_L4x8_LOOP_START:
 
        LOAD4x8_1
        KERNEL4x8_I1
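
The prefetch block introduced at LDGEMM_L4x16_BEGIN rounds CO down to a 128-byte boundary (and with -128), steps through the four output columns with LDC, and touches each line with dcbt; when CO is not line-aligned (the andi. against 127) it also touches the following line, since a 16-double strip then spans two lines. A hedged C sketch of that address arithmetic, assuming 128-byte lines:

    #include <stdint.h>

    /* Illustrative sketch: warm the cache line(s) covering a 16-double
     * strip of C in each of the four columns before the FMA loop starts.
     * ldc_bytes is the column stride of C in bytes. */
    static inline void touch_c_tile(const double *co, uintptr_t ldc_bytes)
    {
        uintptr_t line = (uintptr_t)co & ~(uintptr_t)127;  /* and   T1, CO, -128 */
        int unaligned  = ((uintptr_t)co & 127) != 0;       /* andi. ..., CO, 127 */

        for (int col = 0; col < 4; col++, line += ldc_bytes) {
            __builtin_prefetch((const void *)line, 0, 1);  /* dcbt  Tn, r0       */
            if (unaligned)
                __builtin_prefetch((const void *)(line + 128), 0, 1);
        }
    }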
@@ -202,11 +196,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x8_2
 
        addic.          L,      L,      -2
-       ble             .LDGEMM_L4x8_LOOP_END
+       ble             LDGEMM_L4x8_LOOP_END
 
        .align 5
 
-.LDGEMM_L4x8_LOOP:
+LDGEMM_L4x8_LOOP:
 
        KERNEL4x8_1
        KERNEL4x8_2
@@ -219,9 +213,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x8_2
 
        addic.          L,      L,      -1
-       bgt             .LDGEMM_L4x8_LOOP
+       bgt             LDGEMM_L4x8_LOOP
 
-.LDGEMM_L4x8_LOOP_END:
+LDGEMM_L4x8_LOOP_END:
 
        KERNEL4x8_1
        KERNEL4x8_2
@@ -233,9 +227,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x8_1
        KERNEL4x8_E2
 
-       b               .LDGEMM_L4x8_SUB1
+       b               LDGEMM_L4x8_SUB1
 
-.LDGEMM_L4x8_SUB4:
+LDGEMM_L4x8_SUB4:
 
        KERNEL4x8_SUBI1
        KERNEL4x8_SUB1
@@ -247,48 +241,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x8_SUB1
        KERNEL4x8_SUB1
 
-       b               .LDGEMM_L4x8_SUB1
+       b               LDGEMM_L4x8_SUB1
 
-.LDGEMM_L4x8_SUB0:
+LDGEMM_L4x8_SUB0:
 
        andi.           L,      K,      7
 
        KERNEL4x8_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LDGEMM_L4x8_SAVE
-       b               .LDGEMM_L4x8_SUB2
+       ble             LDGEMM_L4x8_SAVE
+       b               LDGEMM_L4x8_SUB2
 
-.LDGEMM_L4x8_SUB1:
+LDGEMM_L4x8_SUB1:
 
        andi.           L,      K,      7
-       ble             .LDGEMM_L4x8_SAVE
+       ble             LDGEMM_L4x8_SAVE
 
-.LDGEMM_L4x8_SUB2:
+LDGEMM_L4x8_SUB2:
 
        KERNEL4x8_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LDGEMM_L4x8_SUB2
+       bgt             LDGEMM_L4x8_SUB2
 
-.LDGEMM_L4x8_SAVE:
+LDGEMM_L4x8_SAVE:
 
        SAVE4x8
 
-.LDGEMM_L4x8_END:
+LDGEMM_L4x8_END:
 
-.LDGEMM_L4x4_BEGIN:
+LDGEMM_L4x4_BEGIN:
 
 
        andi.           T1,     M,      4
-       ble             .LDGEMM_L4x4_END
+       ble             LDGEMM_L4x4_END
        mr              BO,     B
        srawi.          L,      K,      3
-       ble             .LDGEMM_L4x4_SUB0
+       ble             LDGEMM_L4x4_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LDGEMM_L4x4_SUB4
+       ble             LDGEMM_L4x4_SUB4
 
-.LDGEMM_L4x4_LOOP_START:
+LDGEMM_L4x4_LOOP_START:
 
        LOAD4x4_1
        KERNEL4x4_I1
@@ -302,11 +296,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x4_2
 
        addic.          L,      L,      -2
-       ble             .LDGEMM_L4x4_LOOP_END
+       ble             LDGEMM_L4x4_LOOP_END
 
        .align 5
 
-.LDGEMM_L4x4_LOOP:
+LDGEMM_L4x4_LOOP:
 
        KERNEL4x4_1
        KERNEL4x4_2
@@ -319,9 +313,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x4_2
 
        addic.          L,      L,      -1
-       bgt             .LDGEMM_L4x4_LOOP
+       bgt             LDGEMM_L4x4_LOOP
 
-.LDGEMM_L4x4_LOOP_END:
+LDGEMM_L4x4_LOOP_END:
 
        KERNEL4x4_1
        KERNEL4x4_2
@@ -333,9 +327,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x4_1
        KERNEL4x4_E2
 
-       b               .LDGEMM_L4x4_SUB1
+       b               LDGEMM_L4x4_SUB1
 
-.LDGEMM_L4x4_SUB4:
+LDGEMM_L4x4_SUB4:
 
        KERNEL4x4_SUBI1
        KERNEL4x4_SUB1
@@ -347,48 +341,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x4_SUB1
        KERNEL4x4_SUB1
 
-       b               .LDGEMM_L4x4_SUB1
+       b               LDGEMM_L4x4_SUB1
 
-.LDGEMM_L4x4_SUB0:
+LDGEMM_L4x4_SUB0:
 
        andi.           L,      K,      7
 
        KERNEL4x4_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LDGEMM_L4x4_SAVE
-       b               .LDGEMM_L4x4_SUB2
+       ble             LDGEMM_L4x4_SAVE
+       b               LDGEMM_L4x4_SUB2
 
-.LDGEMM_L4x4_SUB1:
+LDGEMM_L4x4_SUB1:
 
        andi.           L,      K,      7
-       ble             .LDGEMM_L4x4_SAVE
+       ble             LDGEMM_L4x4_SAVE
 
-.LDGEMM_L4x4_SUB2:
+LDGEMM_L4x4_SUB2:
 
        KERNEL4x4_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LDGEMM_L4x4_SUB2
+       bgt             LDGEMM_L4x4_SUB2
 
-.LDGEMM_L4x4_SAVE:
+LDGEMM_L4x4_SAVE:
 
        SAVE4x4
 
-.LDGEMM_L4x4_END:
+LDGEMM_L4x4_END:
 
-.LDGEMM_L4x2_BEGIN:
+LDGEMM_L4x2_BEGIN:
 
 
        andi.           T1,     M,      2
-       ble             .LDGEMM_L4x2_END
+       ble             LDGEMM_L4x2_END
        mr              BO,     B
        srawi.          L,      K,      3
-       ble             .LDGEMM_L4x2_SUB0
+       ble             LDGEMM_L4x2_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LDGEMM_L4x2_SUB4
+       ble             LDGEMM_L4x2_SUB4
 
-.LDGEMM_L4x2_LOOP_START:
+LDGEMM_L4x2_LOOP_START:
 
        LOAD4x2_1
        KERNEL4x2_I1
@@ -402,11 +396,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x2_2
 
        addic.          L,      L,      -2
-       ble             .LDGEMM_L4x2_LOOP_END
+       ble             LDGEMM_L4x2_LOOP_END
 
        .align 5
 
-.LDGEMM_L4x2_LOOP:
+LDGEMM_L4x2_LOOP:
 
        KERNEL4x2_1
        KERNEL4x2_2
@@ -419,9 +413,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x2_2
 
        addic.          L,      L,      -1
-       bgt             .LDGEMM_L4x2_LOOP
+       bgt             LDGEMM_L4x2_LOOP
 
-.LDGEMM_L4x2_LOOP_END:
+LDGEMM_L4x2_LOOP_END:
 
        KERNEL4x2_1
        KERNEL4x2_2
@@ -433,9 +427,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x2_1
        KERNEL4x2_E2
 
-       b               .LDGEMM_L4x2_SUB1
+       b               LDGEMM_L4x2_SUB1
 
-.LDGEMM_L4x2_SUB4:
+LDGEMM_L4x2_SUB4:
 
        KERNEL4x2_SUBI1
        KERNEL4x2_SUB1
@@ -447,48 +441,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x2_SUB1
        KERNEL4x2_SUB1
 
-       b               .LDGEMM_L4x2_SUB1
+       b               LDGEMM_L4x2_SUB1
 
-.LDGEMM_L4x2_SUB0:
+LDGEMM_L4x2_SUB0:
 
        andi.           L,      K,      7
 
        KERNEL4x2_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LDGEMM_L4x2_SAVE
-       b               .LDGEMM_L4x2_SUB2
+       ble             LDGEMM_L4x2_SAVE
+       b               LDGEMM_L4x2_SUB2
 
-.LDGEMM_L4x2_SUB1:
+LDGEMM_L4x2_SUB1:
 
        andi.           L,      K,      7
-       ble             .LDGEMM_L4x2_SAVE
+       ble             LDGEMM_L4x2_SAVE
 
-.LDGEMM_L4x2_SUB2:
+LDGEMM_L4x2_SUB2:
 
        KERNEL4x2_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LDGEMM_L4x2_SUB2
+       bgt             LDGEMM_L4x2_SUB2
 
-.LDGEMM_L4x2_SAVE:
+LDGEMM_L4x2_SAVE:
 
        SAVE4x2
 
-.LDGEMM_L4x2_END:
+LDGEMM_L4x2_END:
 
-.LDGEMM_L4x1_BEGIN:
+LDGEMM_L4x1_BEGIN:
 
 
        andi.           T1,     M,      1
-       ble             .LDGEMM_L4x1_END
+       ble             LDGEMM_L4x1_END
        mr              BO,     B
        srawi.          L,      K,      3
-       ble             .LDGEMM_L4x1_SUB0
+       ble             LDGEMM_L4x1_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LDGEMM_L4x1_SUB4
+       ble             LDGEMM_L4x1_SUB4
 
-.LDGEMM_L4x1_LOOP_START:
+LDGEMM_L4x1_LOOP_START:
 
        LOAD4x1_1
        KERNEL4x1_I1
@@ -502,11 +496,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x1_2
 
        addic.          L,      L,      -2
-       ble             .LDGEMM_L4x1_LOOP_END
+       ble             LDGEMM_L4x1_LOOP_END
 
        .align 5
 
-.LDGEMM_L4x1_LOOP:
+LDGEMM_L4x1_LOOP:
 
        KERNEL4x1_1
        KERNEL4x1_2
@@ -519,9 +513,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x1_2
 
        addic.          L,      L,      -1
-       bgt             .LDGEMM_L4x1_LOOP
+       bgt             LDGEMM_L4x1_LOOP
 
-.LDGEMM_L4x1_LOOP_END:
+LDGEMM_L4x1_LOOP_END:
 
        KERNEL4x1_1
        KERNEL4x1_2
@@ -533,9 +527,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x1_1
        KERNEL4x1_E2
 
-       b               .LDGEMM_L4x1_SUB1
+       b               LDGEMM_L4x1_SUB1
 
-.LDGEMM_L4x1_SUB4:
+LDGEMM_L4x1_SUB4:
 
        KERNEL4x1_SUBI1
        KERNEL4x1_SUB1
@@ -547,74 +541,74 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x1_SUB1
        KERNEL4x1_SUB1
 
-       b               .LDGEMM_L4x1_SUB1
+       b               LDGEMM_L4x1_SUB1
 
-.LDGEMM_L4x1_SUB0:
+LDGEMM_L4x1_SUB0:
 
        andi.           L,      K,      7
 
        KERNEL4x1_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LDGEMM_L4x1_SAVE
-       b               .LDGEMM_L4x1_SUB2
+       ble             LDGEMM_L4x1_SAVE
+       b               LDGEMM_L4x1_SUB2
 
-.LDGEMM_L4x1_SUB1:
+LDGEMM_L4x1_SUB1:
 
        andi.           L,      K,      7
-       ble             .LDGEMM_L4x1_SAVE
+       ble             LDGEMM_L4x1_SAVE
 
-.LDGEMM_L4x1_SUB2:
+LDGEMM_L4x1_SUB2:
 
        KERNEL4x1_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LDGEMM_L4x1_SUB2
+       bgt             LDGEMM_L4x1_SUB2
 
-.LDGEMM_L4x1_SAVE:
+LDGEMM_L4x1_SAVE:
 
        SAVE4x1
 
-.LDGEMM_L4x1_END:
+LDGEMM_L4x1_END:
 
        slwi            T1,     K,      5
        add             B,      B,      T1
 
        addic.          J,      J,      -1
-       bgt             .LDGEMM_L4_BEGIN
+       bgt             LDGEMM_L4_BEGIN
 
        andi.           T2,     N,      3
        ble             .L999
 
-.LDGEMM_L4_END:
+LDGEMM_L4_END:
 
-       b               .LDGEMM_L2_BEGIN
+       b               LDGEMM_L2_BEGIN
 
 .L999_H1:
 
        b               .L999
 
-.LDGEMM_L2_BEGIN:
+LDGEMM_L2_BEGIN:
 
        andi.           T1,     N,      2
-       ble             .LDGEMM_L2_END
+       ble             LDGEMM_L2_END
        mr              CO,     C
        mr              AO,     A
        slwi            T1,     LDC     ,       1
        add             C,      C,      T1
        srawi.          I,      M,      4
-       ble             .LDGEMM_L2x16_END
+       ble             LDGEMM_L2x16_END
 
-.LDGEMM_L2x16_BEGIN:
+LDGEMM_L2x16_BEGIN:
 
 
        mr              BO,     B
        srawi.          L,      K,      3
-       ble             .LDGEMM_L2x16_SUB0
+       ble             LDGEMM_L2x16_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LDGEMM_L2x16_SUB4
+       ble             LDGEMM_L2x16_SUB4
 
-.LDGEMM_L2x16_LOOP_START:
+LDGEMM_L2x16_LOOP_START:
 
        dcbt            AO,     PRE
        LOAD2x16_1
@@ -637,11 +631,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x16_2
 
        addic.          L,      L,      -2
-       ble             .LDGEMM_L2x16_LOOP_END
+       ble             LDGEMM_L2x16_LOOP_END
 
        .align 5
 
-.LDGEMM_L2x16_LOOP:
+LDGEMM_L2x16_LOOP:
 
        dcbt            AO,     PRE
        KERNEL2x16_1
@@ -662,9 +656,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x16_2
 
        addic.          L,      L,      -1
-       bgt             .LDGEMM_L2x16_LOOP
+       bgt             LDGEMM_L2x16_LOOP
 
-.LDGEMM_L2x16_LOOP_END:
+LDGEMM_L2x16_LOOP_END:
 
        dcbt            AO,     PRE
        KERNEL2x16_1
@@ -683,9 +677,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x16_1
        KERNEL2x16_E2
 
-       b               .LDGEMM_L2x16_SUB1
+       b               LDGEMM_L2x16_SUB1
 
-.LDGEMM_L2x16_SUB4:
+LDGEMM_L2x16_SUB4:
 
        dcbt            AO,     PRE
        KERNEL2x16_SUBI1
@@ -701,53 +695,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x16_SUB1
        KERNEL2x16_SUB1
 
-       b               .LDGEMM_L2x16_SUB1
+       b               LDGEMM_L2x16_SUB1
 
-.LDGEMM_L2x16_SUB0:
+LDGEMM_L2x16_SUB0:
 
        andi.           L,      K,      7
 
        KERNEL2x16_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LDGEMM_L2x16_SAVE
-       b               .LDGEMM_L2x16_SUB2
+       ble             LDGEMM_L2x16_SAVE
+       b               LDGEMM_L2x16_SUB2
 
-.LDGEMM_L2x16_SUB1:
+LDGEMM_L2x16_SUB1:
 
        andi.           L,      K,      7
-       ble             .LDGEMM_L2x16_SAVE
+       ble             LDGEMM_L2x16_SAVE
 
-.LDGEMM_L2x16_SUB2:
+LDGEMM_L2x16_SUB2:
 
        KERNEL2x16_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LDGEMM_L2x16_SUB2
+       bgt             LDGEMM_L2x16_SUB2
 
-.LDGEMM_L2x16_SAVE:
+LDGEMM_L2x16_SAVE:
 
        SAVE2x16
 
        addic.          I,      I,      -1
-       bgt             .LDGEMM_L2x16_BEGIN
+       bgt             LDGEMM_L2x16_BEGIN
 
-.LDGEMM_L2x16_END:
+LDGEMM_L2x16_END:
 
-.LDGEMM_L2x8_BEGIN:
+LDGEMM_L2x8_BEGIN:
 
        andi.           T2,     M,      15
-       ble             .LDGEMM_L2x1_END
+       ble             LDGEMM_L2x1_END
 
        andi.           T1,     M,      8
-       ble             .LDGEMM_L2x8_END
+       ble             LDGEMM_L2x8_END
        mr              BO,     B
        srawi.          L,      K,      3
-       ble             .LDGEMM_L2x8_SUB0
+       ble             LDGEMM_L2x8_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LDGEMM_L2x8_SUB4
+       ble             LDGEMM_L2x8_SUB4
 
-.LDGEMM_L2x8_LOOP_START:
+LDGEMM_L2x8_LOOP_START:
 
        LOAD2x8_1
        KERNEL2x8_I1
@@ -761,11 +755,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x8_2
 
        addic.          L,      L,      -2
-       ble             .LDGEMM_L2x8_LOOP_END
+       ble             LDGEMM_L2x8_LOOP_END
 
        .align 5
 
-.LDGEMM_L2x8_LOOP:
+LDGEMM_L2x8_LOOP:
 
        KERNEL2x8_1
        KERNEL2x8_2
@@ -778,9 +772,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x8_2
 
        addic.          L,      L,      -1
-       bgt             .LDGEMM_L2x8_LOOP
+       bgt             LDGEMM_L2x8_LOOP
 
-.LDGEMM_L2x8_LOOP_END:
+LDGEMM_L2x8_LOOP_END:
 
        KERNEL2x8_1
        KERNEL2x8_2
@@ -792,9 +786,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x8_1
        KERNEL2x8_E2
 
-       b               .LDGEMM_L2x8_SUB1
+       b               LDGEMM_L2x8_SUB1
 
-.LDGEMM_L2x8_SUB4:
+LDGEMM_L2x8_SUB4:
 
        KERNEL2x8_SUBI1
        KERNEL2x8_SUB1
@@ -806,48 +800,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x8_SUB1
        KERNEL2x8_SUB1
 
-       b               .LDGEMM_L2x8_SUB1
+       b               LDGEMM_L2x8_SUB1
 
-.LDGEMM_L2x8_SUB0:
+LDGEMM_L2x8_SUB0:
 
        andi.           L,      K,      7
 
        KERNEL2x8_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LDGEMM_L2x8_SAVE
-       b               .LDGEMM_L2x8_SUB2
+       ble             LDGEMM_L2x8_SAVE
+       b               LDGEMM_L2x8_SUB2
 
-.LDGEMM_L2x8_SUB1:
+LDGEMM_L2x8_SUB1:
 
        andi.           L,      K,      7
-       ble             .LDGEMM_L2x8_SAVE
+       ble             LDGEMM_L2x8_SAVE
 
-.LDGEMM_L2x8_SUB2:
+LDGEMM_L2x8_SUB2:
 
        KERNEL2x8_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LDGEMM_L2x8_SUB2
+       bgt             LDGEMM_L2x8_SUB2
 
-.LDGEMM_L2x8_SAVE:
+LDGEMM_L2x8_SAVE:
 
        SAVE2x8
 
-.LDGEMM_L2x8_END:
+LDGEMM_L2x8_END:
 
-.LDGEMM_L2x4_BEGIN:
+LDGEMM_L2x4_BEGIN:
 
 
        andi.           T1,     M,      4
-       ble             .LDGEMM_L2x4_END
+       ble             LDGEMM_L2x4_END
        mr              BO,     B
        srawi.          L,      K,      3
-       ble             .LDGEMM_L2x4_SUB0
+       ble             LDGEMM_L2x4_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LDGEMM_L2x4_SUB4
+       ble             LDGEMM_L2x4_SUB4
 
-.LDGEMM_L2x4_LOOP_START:
+LDGEMM_L2x4_LOOP_START:
 
        LOAD2x4_1
        KERNEL2x4_I1
@@ -861,11 +855,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x4_2
 
        addic.          L,      L,      -2
-       ble             .LDGEMM_L2x4_LOOP_END
+       ble             LDGEMM_L2x4_LOOP_END
 
        .align 5
 
-.LDGEMM_L2x4_LOOP:
+LDGEMM_L2x4_LOOP:
 
        KERNEL2x4_1
        KERNEL2x4_2
@@ -878,9 +872,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x4_2
 
        addic.          L,      L,      -1
-       bgt             .LDGEMM_L2x4_LOOP
+       bgt             LDGEMM_L2x4_LOOP
 
-.LDGEMM_L2x4_LOOP_END:
+LDGEMM_L2x4_LOOP_END:
 
        KERNEL2x4_1
        KERNEL2x4_2
@@ -892,9 +886,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x4_1
        KERNEL2x4_E2
 
-       b               .LDGEMM_L2x4_SUB1
+       b               LDGEMM_L2x4_SUB1
 
-.LDGEMM_L2x4_SUB4:
+LDGEMM_L2x4_SUB4:
 
        KERNEL2x4_SUBI1
        KERNEL2x4_SUB1
@@ -906,48 +900,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x4_SUB1
        KERNEL2x4_SUB1
 
-       b               .LDGEMM_L2x4_SUB1
+       b               LDGEMM_L2x4_SUB1
 
-.LDGEMM_L2x4_SUB0:
+LDGEMM_L2x4_SUB0:
 
        andi.           L,      K,      7
 
        KERNEL2x4_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LDGEMM_L2x4_SAVE
-       b               .LDGEMM_L2x4_SUB2
+       ble             LDGEMM_L2x4_SAVE
+       b               LDGEMM_L2x4_SUB2
 
-.LDGEMM_L2x4_SUB1:
+LDGEMM_L2x4_SUB1:
 
        andi.           L,      K,      7
-       ble             .LDGEMM_L2x4_SAVE
+       ble             LDGEMM_L2x4_SAVE
 
-.LDGEMM_L2x4_SUB2:
+LDGEMM_L2x4_SUB2:
 
        KERNEL2x4_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LDGEMM_L2x4_SUB2
+       bgt             LDGEMM_L2x4_SUB2
 
-.LDGEMM_L2x4_SAVE:
+LDGEMM_L2x4_SAVE:
 
        SAVE2x4
 
-.LDGEMM_L2x4_END:
+LDGEMM_L2x4_END:
 
-.LDGEMM_L2x2_BEGIN:
+LDGEMM_L2x2_BEGIN:
 
 
        andi.           T1,     M,      2
-       ble             .LDGEMM_L2x2_END
+       ble             LDGEMM_L2x2_END
        mr              BO,     B
        srawi.          L,      K,      3
-       ble             .LDGEMM_L2x2_SUB0
+       ble             LDGEMM_L2x2_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LDGEMM_L2x2_SUB4
+       ble             LDGEMM_L2x2_SUB4
 
-.LDGEMM_L2x2_LOOP_START:
+LDGEMM_L2x2_LOOP_START:
 
        LOAD2x2_1
        KERNEL2x2_I1
@@ -961,11 +955,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x2_2
 
        addic.          L,      L,      -2
-       ble             .LDGEMM_L2x2_LOOP_END
+       ble             LDGEMM_L2x2_LOOP_END
 
        .align 5
 
-.LDGEMM_L2x2_LOOP:
+LDGEMM_L2x2_LOOP:
 
        KERNEL2x2_1
        KERNEL2x2_2
@@ -978,9 +972,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x2_2
 
        addic.          L,      L,      -1
-       bgt             .LDGEMM_L2x2_LOOP
+       bgt             LDGEMM_L2x2_LOOP
 
-.LDGEMM_L2x2_LOOP_END:
+LDGEMM_L2x2_LOOP_END:
 
        KERNEL2x2_1
        KERNEL2x2_2
@@ -992,9 +986,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x2_1
        KERNEL2x2_E2
 
-       b               .LDGEMM_L2x2_SUB1
+       b               LDGEMM_L2x2_SUB1
 
-.LDGEMM_L2x2_SUB4:
+LDGEMM_L2x2_SUB4:
 
        KERNEL2x2_SUBI1
        KERNEL2x2_SUB1
@@ -1006,48 +1000,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x2_SUB1
        KERNEL2x2_SUB1
 
-       b               .LDGEMM_L2x2_SUB1
+       b               LDGEMM_L2x2_SUB1
 
-.LDGEMM_L2x2_SUB0:
+LDGEMM_L2x2_SUB0:
 
        andi.           L,      K,      7
 
        KERNEL2x2_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LDGEMM_L2x2_SAVE
-       b               .LDGEMM_L2x2_SUB2
+       ble             LDGEMM_L2x2_SAVE
+       b               LDGEMM_L2x2_SUB2
 
-.LDGEMM_L2x2_SUB1:
+LDGEMM_L2x2_SUB1:
 
        andi.           L,      K,      7
-       ble             .LDGEMM_L2x2_SAVE
+       ble             LDGEMM_L2x2_SAVE
 
-.LDGEMM_L2x2_SUB2:
+LDGEMM_L2x2_SUB2:
 
        KERNEL2x2_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LDGEMM_L2x2_SUB2
+       bgt             LDGEMM_L2x2_SUB2
 
-.LDGEMM_L2x2_SAVE:
+LDGEMM_L2x2_SAVE:
 
        SAVE2x2
 
-.LDGEMM_L2x2_END:
+LDGEMM_L2x2_END:
 
-.LDGEMM_L2x1_BEGIN:
+LDGEMM_L2x1_BEGIN:
 
 
        andi.           T1,     M,      1
-       ble             .LDGEMM_L2x1_END
+       ble             LDGEMM_L2x1_END
        mr              BO,     B
        srawi.          L,      K,      3
-       ble             .LDGEMM_L2x1_SUB0
+       ble             LDGEMM_L2x1_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LDGEMM_L2x1_SUB4
+       ble             LDGEMM_L2x1_SUB4
 
-.LDGEMM_L2x1_LOOP_START:
+LDGEMM_L2x1_LOOP_START:
 
        LOAD2x1_1
        KERNEL2x1_I1
@@ -1061,11 +1055,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x1_2
 
        addic.          L,      L,      -2
-       ble             .LDGEMM_L2x1_LOOP_END
+       ble             LDGEMM_L2x1_LOOP_END
 
        .align 5
 
-.LDGEMM_L2x1_LOOP:
+LDGEMM_L2x1_LOOP:
 
        KERNEL2x1_1
        KERNEL2x1_2
@@ -1078,9 +1072,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x1_2
 
        addic.          L,      L,      -1
-       bgt             .LDGEMM_L2x1_LOOP
+       bgt             LDGEMM_L2x1_LOOP
 
-.LDGEMM_L2x1_LOOP_END:
+LDGEMM_L2x1_LOOP_END:
 
        KERNEL2x1_1
        KERNEL2x1_2
@@ -1092,9 +1086,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x1_1
        KERNEL2x1_E2
 
-       b               .LDGEMM_L2x1_SUB1
+       b               LDGEMM_L2x1_SUB1
 
-.LDGEMM_L2x1_SUB4:
+LDGEMM_L2x1_SUB4:
 
        KERNEL2x1_SUBI1
        KERNEL2x1_SUB1
@@ -1106,59 +1100,59 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x1_SUB1
        KERNEL2x1_SUB1
 
-       b               .LDGEMM_L2x1_SUB1
+       b               LDGEMM_L2x1_SUB1
 
-.LDGEMM_L2x1_SUB0:
+LDGEMM_L2x1_SUB0:
 
        andi.           L,      K,      7
 
        KERNEL2x1_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LDGEMM_L2x1_SAVE
-       b               .LDGEMM_L2x1_SUB2
+       ble             LDGEMM_L2x1_SAVE
+       b               LDGEMM_L2x1_SUB2
 
-.LDGEMM_L2x1_SUB1:
+LDGEMM_L2x1_SUB1:
 
        andi.           L,      K,      7
-       ble             .LDGEMM_L2x1_SAVE
+       ble             LDGEMM_L2x1_SAVE
 
-.LDGEMM_L2x1_SUB2:
+LDGEMM_L2x1_SUB2:
 
        KERNEL2x1_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LDGEMM_L2x1_SUB2
+       bgt             LDGEMM_L2x1_SUB2
 
-.LDGEMM_L2x1_SAVE:
+LDGEMM_L2x1_SAVE:
 
        SAVE2x1
 
-.LDGEMM_L2x1_END:
+LDGEMM_L2x1_END:
 
        slwi            T1,     K,      4
        add             B,      B,      T1
 
-.LDGEMM_L2_END:
-.LDGEMM_L1_BEGIN:
+LDGEMM_L2_END:
+LDGEMM_L1_BEGIN:
 
        andi.           T1,     N,      1
-       ble             .LDGEMM_L1_END
+       ble             LDGEMM_L1_END
        mr              CO,     C
        mr              AO,     A
        srawi.          I,      M,      4
-       ble             .LDGEMM_L1x16_END
+       ble             LDGEMM_L1x16_END
 
-.LDGEMM_L1x16_BEGIN:
+LDGEMM_L1x16_BEGIN:
 
 
        mr              BO,     B
        srawi.          L,      K,      3
-       ble             .LDGEMM_L1x16_SUB0
+       ble             LDGEMM_L1x16_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LDGEMM_L1x16_SUB4
+       ble             LDGEMM_L1x16_SUB4
 
-.LDGEMM_L1x16_LOOP_START:
+LDGEMM_L1x16_LOOP_START:
 
        dcbt            AO,     PRE
        LOAD1x16_1
@@ -1181,11 +1175,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x16_2
 
        addic.          L,      L,      -2
-       ble             .LDGEMM_L1x16_LOOP_END
+       ble             LDGEMM_L1x16_LOOP_END
 
        .align 5
 
-.LDGEMM_L1x16_LOOP:
+LDGEMM_L1x16_LOOP:
 
        dcbt            AO,     PRE
        KERNEL1x16_1
@@ -1206,9 +1200,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x16_2
 
        addic.          L,      L,      -1
-       bgt             .LDGEMM_L1x16_LOOP
+       bgt             LDGEMM_L1x16_LOOP
 
-.LDGEMM_L1x16_LOOP_END:
+LDGEMM_L1x16_LOOP_END:
 
        dcbt            AO,     PRE
        KERNEL1x16_1
@@ -1227,9 +1221,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x16_1
        KERNEL1x16_E2
 
-       b               .LDGEMM_L1x16_SUB1
+       b               LDGEMM_L1x16_SUB1
 
-.LDGEMM_L1x16_SUB4:
+LDGEMM_L1x16_SUB4:
 
        dcbt            AO,     PRE
        KERNEL1x16_SUBI1
@@ -1245,53 +1239,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x16_SUB1
        KERNEL1x16_SUB1
 
-       b               .LDGEMM_L1x16_SUB1
+       b               LDGEMM_L1x16_SUB1
 
-.LDGEMM_L1x16_SUB0:
+LDGEMM_L1x16_SUB0:
 
        andi.           L,      K,      7
 
        KERNEL1x16_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LDGEMM_L1x16_SAVE
-       b               .LDGEMM_L1x16_SUB2
+       ble             LDGEMM_L1x16_SAVE
+       b               LDGEMM_L1x16_SUB2
 
-.LDGEMM_L1x16_SUB1:
+LDGEMM_L1x16_SUB1:
 
        andi.           L,      K,      7
-       ble             .LDGEMM_L1x16_SAVE
+       ble             LDGEMM_L1x16_SAVE
 
-.LDGEMM_L1x16_SUB2:
+LDGEMM_L1x16_SUB2:
 
        KERNEL1x16_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LDGEMM_L1x16_SUB2
+       bgt             LDGEMM_L1x16_SUB2
 
-.LDGEMM_L1x16_SAVE:
+LDGEMM_L1x16_SAVE:
 
        SAVE1x16
 
        addic.          I,      I,      -1
-       bgt             .LDGEMM_L1x16_BEGIN
+       bgt             LDGEMM_L1x16_BEGIN
 
-.LDGEMM_L1x16_END:
+LDGEMM_L1x16_END:
 
-.LDGEMM_L1x8_BEGIN:
+LDGEMM_L1x8_BEGIN:
 
        andi.           T2,     M,      15
-       ble             .LDGEMM_L1x1_END
+       ble             LDGEMM_L1x1_END
 
        andi.           T1,     M,      8
-       ble             .LDGEMM_L1x8_END
+       ble             LDGEMM_L1x8_END
        mr              BO,     B
        srawi.          L,      K,      3
-       ble             .LDGEMM_L1x8_SUB0
+       ble             LDGEMM_L1x8_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LDGEMM_L1x8_SUB4
+       ble             LDGEMM_L1x8_SUB4
 
-.LDGEMM_L1x8_LOOP_START:
+LDGEMM_L1x8_LOOP_START:
 
        LOAD1x8_1
        KERNEL1x8_I1
@@ -1305,11 +1299,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x8_2
 
        addic.          L,      L,      -2
-       ble             .LDGEMM_L1x8_LOOP_END
+       ble             LDGEMM_L1x8_LOOP_END
 
        .align 5
 
-.LDGEMM_L1x8_LOOP:
+LDGEMM_L1x8_LOOP:
 
        KERNEL1x8_1
        KERNEL1x8_2
@@ -1322,9 +1316,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x8_2
 
        addic.          L,      L,      -1
-       bgt             .LDGEMM_L1x8_LOOP
+       bgt             LDGEMM_L1x8_LOOP
 
-.LDGEMM_L1x8_LOOP_END:
+LDGEMM_L1x8_LOOP_END:
 
        KERNEL1x8_1
        KERNEL1x8_2
@@ -1336,9 +1330,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x8_1
        KERNEL1x8_E2
 
-       b               .LDGEMM_L1x8_SUB1
+       b               LDGEMM_L1x8_SUB1
 
-.LDGEMM_L1x8_SUB4:
+LDGEMM_L1x8_SUB4:
 
        KERNEL1x8_SUBI1
        KERNEL1x8_SUB1
@@ -1350,48 +1344,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x8_SUB1
        KERNEL1x8_SUB1
 
-       b               .LDGEMM_L1x8_SUB1
+       b               LDGEMM_L1x8_SUB1
 
-.LDGEMM_L1x8_SUB0:
+LDGEMM_L1x8_SUB0:
 
        andi.           L,      K,      7
 
        KERNEL1x8_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LDGEMM_L1x8_SAVE
-       b               .LDGEMM_L1x8_SUB2
+       ble             LDGEMM_L1x8_SAVE
+       b               LDGEMM_L1x8_SUB2
 
-.LDGEMM_L1x8_SUB1:
+LDGEMM_L1x8_SUB1:
 
        andi.           L,      K,      7
-       ble             .LDGEMM_L1x8_SAVE
+       ble             LDGEMM_L1x8_SAVE
 
-.LDGEMM_L1x8_SUB2:
+LDGEMM_L1x8_SUB2:
 
        KERNEL1x8_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LDGEMM_L1x8_SUB2
+       bgt             LDGEMM_L1x8_SUB2
 
-.LDGEMM_L1x8_SAVE:
+LDGEMM_L1x8_SAVE:
 
        SAVE1x8
 
-.LDGEMM_L1x8_END:
+LDGEMM_L1x8_END:
 
-.LDGEMM_L1x4_BEGIN:
+LDGEMM_L1x4_BEGIN:
 
 
        andi.           T1,     M,      4
-       ble             .LDGEMM_L1x4_END
+       ble             LDGEMM_L1x4_END
        mr              BO,     B
        srawi.          L,      K,      3
-       ble             .LDGEMM_L1x4_SUB0
+       ble             LDGEMM_L1x4_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LDGEMM_L1x4_SUB4
+       ble             LDGEMM_L1x4_SUB4
 
-.LDGEMM_L1x4_LOOP_START:
+LDGEMM_L1x4_LOOP_START:
 
        LOAD1x4_1
        KERNEL1x4_I1
@@ -1405,11 +1399,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x4_2
 
        addic.          L,      L,      -2
-       ble             .LDGEMM_L1x4_LOOP_END
+       ble             LDGEMM_L1x4_LOOP_END
 
        .align 5
 
-.LDGEMM_L1x4_LOOP:
+LDGEMM_L1x4_LOOP:
 
        KERNEL1x4_1
        KERNEL1x4_2
@@ -1422,9 +1416,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x4_2
 
        addic.          L,      L,      -1
-       bgt             .LDGEMM_L1x4_LOOP
+       bgt             LDGEMM_L1x4_LOOP
 
-.LDGEMM_L1x4_LOOP_END:
+LDGEMM_L1x4_LOOP_END:
 
        KERNEL1x4_1
        KERNEL1x4_2
@@ -1436,9 +1430,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x4_1
        KERNEL1x4_E2
 
-       b               .LDGEMM_L1x4_SUB1
+       b               LDGEMM_L1x4_SUB1
 
-.LDGEMM_L1x4_SUB4:
+LDGEMM_L1x4_SUB4:
 
        KERNEL1x4_SUBI1
        KERNEL1x4_SUB1
@@ -1450,48 +1444,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x4_SUB1
        KERNEL1x4_SUB1
 
-       b               .LDGEMM_L1x4_SUB1
+       b               LDGEMM_L1x4_SUB1
 
-.LDGEMM_L1x4_SUB0:
+LDGEMM_L1x4_SUB0:
 
        andi.           L,      K,      7
 
        KERNEL1x4_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LDGEMM_L1x4_SAVE
-       b               .LDGEMM_L1x4_SUB2
+       ble             LDGEMM_L1x4_SAVE
+       b               LDGEMM_L1x4_SUB2
 
-.LDGEMM_L1x4_SUB1:
+LDGEMM_L1x4_SUB1:
 
        andi.           L,      K,      7
-       ble             .LDGEMM_L1x4_SAVE
+       ble             LDGEMM_L1x4_SAVE
 
-.LDGEMM_L1x4_SUB2:
+LDGEMM_L1x4_SUB2:
 
        KERNEL1x4_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LDGEMM_L1x4_SUB2
+       bgt             LDGEMM_L1x4_SUB2
 
-.LDGEMM_L1x4_SAVE:
+LDGEMM_L1x4_SAVE:
 
        SAVE1x4
 
-.LDGEMM_L1x4_END:
+LDGEMM_L1x4_END:
 
-.LDGEMM_L1x2_BEGIN:
+LDGEMM_L1x2_BEGIN:
 
 
        andi.           T1,     M,      2
-       ble             .LDGEMM_L1x2_END
+       ble             LDGEMM_L1x2_END
        mr              BO,     B
        srawi.          L,      K,      3
-       ble             .LDGEMM_L1x2_SUB0
+       ble             LDGEMM_L1x2_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LDGEMM_L1x2_SUB4
+       ble             LDGEMM_L1x2_SUB4
 
-.LDGEMM_L1x2_LOOP_START:
+LDGEMM_L1x2_LOOP_START:
 
        LOAD1x2_1
        KERNEL1x2_I1
@@ -1505,11 +1499,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x2_2
 
        addic.          L,      L,      -2
-       ble             .LDGEMM_L1x2_LOOP_END
+       ble             LDGEMM_L1x2_LOOP_END
 
        .align 5
 
-.LDGEMM_L1x2_LOOP:
+LDGEMM_L1x2_LOOP:
 
        KERNEL1x2_1
        KERNEL1x2_2
@@ -1522,9 +1516,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x2_2
 
        addic.          L,      L,      -1
-       bgt             .LDGEMM_L1x2_LOOP
+       bgt             LDGEMM_L1x2_LOOP
 
-.LDGEMM_L1x2_LOOP_END:
+LDGEMM_L1x2_LOOP_END:
 
        KERNEL1x2_1
        KERNEL1x2_2
@@ -1536,9 +1530,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x2_1
        KERNEL1x2_E2
 
-       b               .LDGEMM_L1x2_SUB1
+       b               LDGEMM_L1x2_SUB1
 
-.LDGEMM_L1x2_SUB4:
+LDGEMM_L1x2_SUB4:
 
        KERNEL1x2_SUBI1
        KERNEL1x2_SUB1
@@ -1550,48 +1544,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x2_SUB1
        KERNEL1x2_SUB1
 
-       b               .LDGEMM_L1x2_SUB1
+       b               LDGEMM_L1x2_SUB1
 
-.LDGEMM_L1x2_SUB0:
+LDGEMM_L1x2_SUB0:
 
        andi.           L,      K,      7
 
        KERNEL1x2_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LDGEMM_L1x2_SAVE
-       b               .LDGEMM_L1x2_SUB2
+       ble             LDGEMM_L1x2_SAVE
+       b               LDGEMM_L1x2_SUB2
 
-.LDGEMM_L1x2_SUB1:
+LDGEMM_L1x2_SUB1:
 
        andi.           L,      K,      7
-       ble             .LDGEMM_L1x2_SAVE
+       ble             LDGEMM_L1x2_SAVE
 
-.LDGEMM_L1x2_SUB2:
+LDGEMM_L1x2_SUB2:
 
        KERNEL1x2_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LDGEMM_L1x2_SUB2
+       bgt             LDGEMM_L1x2_SUB2
 
-.LDGEMM_L1x2_SAVE:
+LDGEMM_L1x2_SAVE:
 
        SAVE1x2
 
-.LDGEMM_L1x2_END:
+LDGEMM_L1x2_END:
 
-.LDGEMM_L1x1_BEGIN:
+LDGEMM_L1x1_BEGIN:
 
 
        andi.           T1,     M,      1
-       ble             .LDGEMM_L1x1_END
+       ble             LDGEMM_L1x1_END
        mr              BO,     B
        srawi.          L,      K,      3
-       ble             .LDGEMM_L1x1_SUB0
+       ble             LDGEMM_L1x1_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LDGEMM_L1x1_SUB4
+       ble             LDGEMM_L1x1_SUB4
 
-.LDGEMM_L1x1_LOOP_START:
+LDGEMM_L1x1_LOOP_START:
 
        LOAD1x1_1
        KERNEL1x1_I1
@@ -1605,11 +1599,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x1_2
 
        addic.          L,      L,      -2
-       ble             .LDGEMM_L1x1_LOOP_END
+       ble             LDGEMM_L1x1_LOOP_END
 
        .align 5
 
-.LDGEMM_L1x1_LOOP:
+LDGEMM_L1x1_LOOP:
 
        KERNEL1x1_1
        KERNEL1x1_2
@@ -1622,9 +1616,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x1_2
 
        addic.          L,      L,      -1
-       bgt             .LDGEMM_L1x1_LOOP
+       bgt             LDGEMM_L1x1_LOOP
 
-.LDGEMM_L1x1_LOOP_END:
+LDGEMM_L1x1_LOOP_END:
 
        KERNEL1x1_1
        KERNEL1x1_2
@@ -1636,9 +1630,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x1_1
        KERNEL1x1_E2
 
-       b               .LDGEMM_L1x1_SUB1
+       b               LDGEMM_L1x1_SUB1
 
-.LDGEMM_L1x1_SUB4:
+LDGEMM_L1x1_SUB4:
 
        KERNEL1x1_SUBI1
        KERNEL1x1_SUB1
@@ -1650,34 +1644,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x1_SUB1
        KERNEL1x1_SUB1
 
-       b               .LDGEMM_L1x1_SUB1
+       b               LDGEMM_L1x1_SUB1
 
-.LDGEMM_L1x1_SUB0:
+LDGEMM_L1x1_SUB0:
 
        andi.           L,      K,      7
 
        KERNEL1x1_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LDGEMM_L1x1_SAVE
-       b               .LDGEMM_L1x1_SUB2
+       ble             LDGEMM_L1x1_SAVE
+       b               LDGEMM_L1x1_SUB2
 
-.LDGEMM_L1x1_SUB1:
+LDGEMM_L1x1_SUB1:
 
        andi.           L,      K,      7
-       ble             .LDGEMM_L1x1_SAVE
+       ble             LDGEMM_L1x1_SAVE
 
-.LDGEMM_L1x1_SUB2:
+LDGEMM_L1x1_SUB2:
 
        KERNEL1x1_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LDGEMM_L1x1_SUB2
+       bgt             LDGEMM_L1x1_SUB2
 
-.LDGEMM_L1x1_SAVE:
+LDGEMM_L1x1_SAVE:
 
        SAVE1x1
 
-.LDGEMM_L1x1_END:
+LDGEMM_L1x1_END:
 
-.LDGEMM_L1_END:
+LDGEMM_L1_END:
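
Taken together, the labels above implement the standard blocked loop nest: N is walked in strips of 4 (then 2, then 1), M in strips of 16 (then 8/4/2/1), and K is split into K >> 2 unrolled iterations plus a K & 3 tail. A plain C sketch of the full-tile path only, with illustrative names and packed operands as produced by the copy routines:

    #include <stddef.h>

    /* Sketch of the LDGEMM_L4x16 path (full 16x4 tiles, column-major C
     * with leading dimension ldc).  A is packed 16 doubles per k step,
     * B is packed 4 doubles per k step; acc stands in for vs32..vs63. */
    static void dgemm_l4x16_sketch(size_t m, size_t n, size_t k, double alpha,
                                   const double *a, const double *b,
                                   double *c, size_t ldc)
    {
        for (size_t j = 0; j + 4 <= n; j += 4) {            /* LDGEMM_L4_BEGIN  */
            const double *ao = a;
            for (size_t i = 0; i + 16 <= m; i += 16) {      /* LDGEMM_L4x16_... */
                const double *bo = b + j * k;
                double acc[4][16] = {{0.0}};
                for (size_t l = 0; l < k; l++) {            /* (K>>2)*4 + (K&3) */
                    for (int jj = 0; jj < 4; jj++)
                        for (int ii = 0; ii < 16; ii++)
                            acc[jj][ii] += ao[ii] * bo[jj];
                    ao += 16;
                    bo += 4;
                }
                for (int jj = 0; jj < 4; jj++)              /* SAVE4x16         */
                    for (int ii = 0; ii < 16; ii++)
                        c[(j + jj) * ldc + i + ii] += alpha * acc[jj][ii];
            }
        }
    }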
diff --git a/kernel/power/dgemm_macros_16x4_power8.S b/kernel/power/dgemm_macros_16x4_power8.S
index 27c05e0..36531fb 100644
@@ -431,6 +431,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        mr              T1,     CO
        addi            T2,     T1,     64
+       add             T3,     T1,     LDC
+       addi            T4,     T3,     64
 
 #ifndef TRMMKERNEL
        lxvd2x          vs0,    0,      T1
@@ -442,6 +444,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        lxvd2x          vs5,    o16,    T2
        lxvd2x          vs6,    o32,    T2
        lxvd2x          vs7,    o48,    T2
+
+       lxvd2x          vs8,    0,      T3
+       lxvd2x          vs9,    o16,    T3
+       lxvd2x          vs10,   o32,    T3
+       lxvd2x          vs11,   o48,    T3
+
+       lxvd2x          vs12,   0,      T4
+       lxvd2x          vs13,   o16,    T4
+       lxvd2x          vs14,   o32,    T4
+       lxvd2x          vs15,   o48,    T4
 #endif
 
 #ifndef TRMMKERNEL
@@ -453,6 +465,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvmaddadp       vs5,    vs37,   alpha_r
        xvmaddadp       vs6,    vs38,   alpha_r
        xvmaddadp       vs7,    vs39,   alpha_r
+       xvmaddadp       vs8,    vs40,   alpha_r
+       xvmaddadp       vs9,    vs41,   alpha_r
+       xvmaddadp       vs10,   vs42,   alpha_r
+       xvmaddadp       vs11,   vs43,   alpha_r
+       xvmaddadp       vs12,   vs44,   alpha_r
+       xvmaddadp       vs13,   vs45,   alpha_r
+       xvmaddadp       vs14,   vs46,   alpha_r
+       xvmaddadp       vs15,   vs47,   alpha_r
 #else
        xvmuldp         vs0,    vs32,   alpha_r
        xvmuldp         vs1,    vs33,   alpha_r
@@ -462,6 +482,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvmuldp         vs5,    vs37,   alpha_r
        xvmuldp         vs6,    vs38,   alpha_r
        xvmuldp         vs7,    vs39,   alpha_r
+       xvmuldp         vs8,    vs40,   alpha_r
+       xvmuldp         vs9,    vs41,   alpha_r
+       xvmuldp         vs10,   vs42,   alpha_r
+       xvmuldp         vs11,   vs43,   alpha_r
+       xvmuldp         vs12,   vs44,   alpha_r
+       xvmuldp         vs13,   vs45,   alpha_r
+       xvmuldp         vs14,   vs46,   alpha_r
+       xvmuldp         vs15,   vs47,   alpha_r
 #endif
 
        stxvd2x         vs0,    0,      T1
@@ -469,62 +497,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        stxvd2x         vs2,    o32,    T1
        stxvd2x         vs3,    o48,    T1
 
-       dcbt            T1, PRE
-
        stxvd2x         vs4,    0,      T2
        stxvd2x         vs5,    o16,    T2
        stxvd2x         vs6,    o32,    T2
        stxvd2x         vs7,    o48,    T2
 
-       add             T1,     T1,     LDC
-       add             T2,     T2,     LDC
-
-#ifndef TRMMKERNEL
-       lxvd2x          vs8,    0,      T1
-       lxvd2x          vs9,    o16,    T1
-       lxvd2x          vs10,   o32,    T1
-       lxvd2x          vs11,   o48,    T1
+       stxvd2x         vs8,    0,      T3
+       stxvd2x         vs9,    o16,    T3
+       stxvd2x         vs10,   o32,    T3
+       stxvd2x         vs11,   o48,    T3
 
-       lxvd2x          vs12,   0,      T2
-       lxvd2x          vs13,   o16,    T2
-       lxvd2x          vs14,   o32,    T2
-       lxvd2x          vs15,   o48,    T2
-#endif
+       stxvd2x         vs12,   0,      T4
+       stxvd2x         vs13,   o16,    T4
+       stxvd2x         vs14,   o32,    T4
+       stxvd2x         vs15,   o48,    T4
 
-#ifndef TRMMKERNEL
-       xvmaddadp       vs8,    vs40,   alpha_r
-       xvmaddadp       vs9,    vs41,   alpha_r
-       xvmaddadp       vs10,   vs42,   alpha_r
-       xvmaddadp       vs11,   vs43,   alpha_r
-       xvmaddadp       vs12,   vs44,   alpha_r
-       xvmaddadp       vs13,   vs45,   alpha_r
-       xvmaddadp       vs14,   vs46,   alpha_r
-       xvmaddadp       vs15,   vs47,   alpha_r
-#else
-       xvmuldp         vs8,    vs40,   alpha_r
-       xvmuldp         vs9,    vs41,   alpha_r
-       xvmuldp         vs10,   vs42,   alpha_r
-       xvmuldp         vs11,   vs43,   alpha_r
-       xvmuldp         vs12,   vs44,   alpha_r
-       xvmuldp         vs13,   vs45,   alpha_r
-       xvmuldp         vs14,   vs46,   alpha_r
-       xvmuldp         vs15,   vs47,   alpha_r
-#endif
-
-       stxvd2x         vs8,    0,      T1
-       stxvd2x         vs9,    o16,    T1
-       stxvd2x         vs10,   o32,    T1
-       stxvd2x         vs11,   o48,    T1
-
-       dcbt            T1, PRE
-
-       stxvd2x         vs12,   0,      T2
-       stxvd2x         vs13,   o16,    T2
-       stxvd2x         vs14,   o32,    T2
-       stxvd2x         vs15,   o48,    T2
-
-       add             T1,     T1,     LDC
-       add             T2,     T2,     LDC
+       slwi            T4,     LDC,    1
+       add             T1,     T1,     T4
+       add             T3,     T3,     T4
+       addi            T2,     T1,     64
+       addi            T4,     T3,     64
 
 #ifndef TRMMKERNEL
        lxvd2x          vs0,    0,      T1
@@ -536,6 +528,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        lxvd2x          vs5,    o16,    T2
        lxvd2x          vs6,    o32,    T2
        lxvd2x          vs7,    o48,    T2
+
+       lxvd2x          vs8,    0,      T3
+       lxvd2x          vs9,    o16,    T3
+       lxvd2x          vs10,   o32,    T3
+       lxvd2x          vs11,   o48,    T3
+
+       lxvd2x          vs12,   0,      T4
+       lxvd2x          vs13,   o16,    T4
+       lxvd2x          vs14,   o32,    T4
+       lxvd2x          vs15,   o48,    T4
 #endif
 
 #ifndef TRMMKERNEL
@@ -547,6 +549,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvmaddadp       vs5,    vs53,   alpha_r
        xvmaddadp       vs6,    vs54,   alpha_r
        xvmaddadp       vs7,    vs55,   alpha_r
+       xvmaddadp       vs8,    vs56,   alpha_r
+       xvmaddadp       vs9,    vs57,   alpha_r
+       xvmaddadp       vs10,   vs58,   alpha_r
+       xvmaddadp       vs11,   vs59,   alpha_r
+       xvmaddadp       vs12,   vs60,   alpha_r
+       xvmaddadp       vs13,   vs61,   alpha_r
+       xvmaddadp       vs14,   vs62,   alpha_r
+       xvmaddadp       vs15,   vs63,   alpha_r
 #else
        xvmuldp         vs0,    vs48,   alpha_r
        xvmuldp         vs1,    vs49,   alpha_r
@@ -556,6 +566,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvmuldp         vs5,    vs53,   alpha_r
        xvmuldp         vs6,    vs54,   alpha_r
        xvmuldp         vs7,    vs55,   alpha_r
+       xvmuldp         vs8,    vs56,   alpha_r
+       xvmuldp         vs9,    vs57,   alpha_r
+       xvmuldp         vs10,   vs58,   alpha_r
+       xvmuldp         vs11,   vs59,   alpha_r
+       xvmuldp         vs12,   vs60,   alpha_r
+       xvmuldp         vs13,   vs61,   alpha_r
+       xvmuldp         vs14,   vs62,   alpha_r
+       xvmuldp         vs15,   vs63,   alpha_r
 #endif
 
        stxvd2x         vs0,    0,      T1
@@ -563,59 +581,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        stxvd2x         vs2,    o32,    T1
        stxvd2x         vs3,    o48,    T1
 
-       dcbt            T1, PRE
-
        stxvd2x         vs4,    0,      T2
        stxvd2x         vs5,    o16,    T2
        stxvd2x         vs6,    o32,    T2
        stxvd2x         vs7,    o48,    T2
 
-       add             T1,     T1,     LDC
-       add             T2,     T2,     LDC
-
-#ifndef TRMMKERNEL
-       lxvd2x          vs8,    0,      T1
-       lxvd2x          vs9,    o16,    T1
-       lxvd2x          vs10,   o32,    T1
-       lxvd2x          vs11,   o48,    T1
-
-       lxvd2x          vs12,   0,      T2
-       lxvd2x          vs13,   o16,    T2
-       lxvd2x          vs14,   o32,    T2
-       lxvd2x          vs15,   o48,    T2
-#endif
-
-#ifndef TRMMKERNEL
-       xvmaddadp       vs8,    vs56,   alpha_r
-       xvmaddadp       vs9,    vs57,   alpha_r
-       xvmaddadp       vs10,   vs58,   alpha_r
-       xvmaddadp       vs11,   vs59,   alpha_r
-       xvmaddadp       vs12,   vs60,   alpha_r
-       xvmaddadp       vs13,   vs61,   alpha_r
-       xvmaddadp       vs14,   vs62,   alpha_r
-       xvmaddadp       vs15,   vs63,   alpha_r
-#else
-       xvmuldp         vs8,    vs56,   alpha_r
-       xvmuldp         vs9,    vs57,   alpha_r
-       xvmuldp         vs10,   vs58,   alpha_r
-       xvmuldp         vs11,   vs59,   alpha_r
-       xvmuldp         vs12,   vs60,   alpha_r
-       xvmuldp         vs13,   vs61,   alpha_r
-       xvmuldp         vs14,   vs62,   alpha_r
-       xvmuldp         vs15,   vs63,   alpha_r
-#endif
-
-       stxvd2x         vs8,    0,      T1
-       stxvd2x         vs9,    o16,    T1
-       stxvd2x         vs10,   o32,    T1
-       stxvd2x         vs11,   o48,    T1
-
-       dcbt            T1, PRE
+       stxvd2x         vs8,    0,      T3
+       stxvd2x         vs9,    o16,    T3
+       stxvd2x         vs10,   o32,    T3
+       stxvd2x         vs11,   o48,    T3
 
-       stxvd2x         vs12,   0,      T2
-       stxvd2x         vs13,   o16,    T2
-       stxvd2x         vs14,   o32,    T2
-       stxvd2x         vs15,   o48,    T2
+       stxvd2x         vs12,   0,      T4
+       stxvd2x         vs13,   o16,    T4
+       stxvd2x         vs14,   o32,    T4
+       stxvd2x         vs15,   o48,    T4
 
        addi            CO,     CO,     128
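
The #ifndef TRMMKERNEL split above is the usual distinction between updating C and overwriting it: the DGEMM build loads C and uses xvmaddadp (C := alpha*AB + C), while the TRMM build skips the loads and uses xvmuldp (C := alpha*AB). A scalar illustration of the two store paths:

    /* Illustrative scalar equivalent of the two paths above; acc plays
     * the role of one accumulator lane (vs32..vs63). */
    static inline void store_elem(double *c, double acc, double alpha, int trmm)
    {
        if (!trmm)
            *c += alpha * acc;   /* lxvd2x + xvmaddadp + stxvd2x */
        else
            *c  = alpha * acc;   /* xvmuldp + stxvd2x            */
    }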
 
diff --git a/kernel/power/dgemm_tcopy_16_power8.S b/kernel/power/dgemm_tcopy_16_power8.S
index f87af53..eca78ba 100644
@@ -170,7 +170,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        add     B2, B2, B
        add     B1, B1, B
 
-       li      PREA,  768
+       li      PREA,  256
        addi    PREB,  M16, 128
 
        li      o8,     8
diff --git a/kernel/power/dgemm_tcopy_logic_16_power8.S b/kernel/power/dgemm_tcopy_logic_16_power8.S
index 776cd34..28fc747 100644
@@ -57,16 +57,20 @@ DCOPYT_L4_BEGIN:
 
 DCOPYT_L4x16_LOOP:
 
+/*
        addi    T1,     PREB,   128
        addi    T2,     PREB,   256
+*/
        dcbt    A0,     PREA
        dcbt    A1,     PREA
        dcbt    A2,     PREA
        dcbt    A3,     PREA
+/*
        dcbtst  BO,     M16
        dcbtst  BO,     PREB
        dcbtst  BO,     T1
        dcbtst  BO,     T2
+*/
        COPY_4x16
 
        add             BO,     BO,     M16
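
The remaining dcbt hints are read prefetches of the four source rows, while the commented-out dcbtst lines were prefetch-for-store hints on the destination buffer BO. In GCC/Clang builtin terms the difference is only the rw argument; a minimal illustration:

    /* Illustrative only: rw = 0 asks for a read prefetch (dcbt-like),
     * rw = 1 asks for a prefetch with intent to store (dcbtst-like). */
    static inline void touch_src(const void *a) { __builtin_prefetch(a, 0, 1); }
    static inline void touch_dst(void *b)       { __builtin_prefetch(b, 1, 1); }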
diff --git a/kernel/power/dtrmm_kernel_16x4_power8.S b/kernel/power/dtrmm_kernel_16x4_power8.S
index 2294128..e9dbd99 100644
@@ -152,7 +152,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define PRE    r30
 #define T2     r31
 
-#include "dgemm_macros_16x4_power8.S"
+#include "dtrmm_macros_16x4_power8.S"
 
 
 #ifndef NEEDPARAM
diff --git a/kernel/power/dtrmm_macros_16x4_power8.S b/kernel/power/dtrmm_macros_16x4_power8.S
new file mode 100644
index 0000000..079144a
--- /dev/null
@@ -0,0 +1,3431 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/05 Werner Saar (wernsaar@googlemail.com)
+*       BLASTEST               : OK
+*       CTEST                  : OK
+*       TEST                   : OK
+*       LAPACK-TEST            : OK
+**************************************************************************************/
+
+/*********************************************************************
+* Macros for N=4, M=16                                               *
+*********************************************************************/
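+/* The 16x4 kernel keeps a 16x4 tile of C in vs32..vs63 (two doubles per
+*  VSX register).  vs0..vs7 hold 16 doubles of A, vs24..vs27 the four B
+*  values broadcast with lxvdsx.  KERNEL4x16_1 and KERNEL4x16_2 form a
+*  two-stage software pipeline: _1 multiplies the current A/B set while
+*  loading the next one into vs8..vs15 and vs28..vs31, _2 does the reverse.
+*  KERNEL4x16_I1 initializes the accumulators with xvmuldp, KERNEL4x16_E2
+*  drains the pipeline, and the SUBI1/SUB1 variants handle the K remainder.
+*
+*  Equivalent scalar computation on the packed buffers (reference only):
+*
+*      for (k = 0; k < K; k++)
+*          for (j = 0; j < 4; j++)
+*              for (i = 0; i < 16; i++)
+*                  acc[i][j] += A[k*16 + i] * B[k*4 + j];
+*/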
+
+.macro LOAD4x16_1
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+       lxvd2x  vs2,    o32,    AO
+       lxvd2x  vs3,    o48,    AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+
+       addi            AO, AO, 64
+
+       lxvd2x  vs4,    0,      AO
+       lxvd2x  vs5,    o16,    AO
+       lxvd2x  vs6,    o32,    AO
+       lxvd2x  vs7,    o48,    AO
+
+       lxvdsx  vs26,   o16,    BO
+       lxvdsx  vs27,   o24,    BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 32
+
+.endm
+
+.macro KERNEL4x16_I1
+
+       xvmuldp                 vs32,   vs0,    vs24
+       xvmuldp                 vs33,   vs1,    vs24
+       xvmuldp                 vs34,   vs2,    vs24
+       xvmuldp                 vs35,   vs3,    vs24
+
+       lxvd2x  vs8,    0,      AO
+       lxvd2x  vs9,    o16,    AO
+
+       xvmuldp                 vs36,   vs4,    vs24
+       xvmuldp                 vs37,   vs5,    vs24
+       xvmuldp                 vs38,   vs6,    vs24
+       xvmuldp                 vs39,   vs7,    vs24
+
+       lxvdsx  vs28,   0,      BO
+       lxvdsx  vs29,   o8,     BO
+
+       xvmuldp                 vs40,   vs0,    vs25
+       xvmuldp                 vs41,   vs1,    vs25
+       xvmuldp                 vs42,   vs2,    vs25
+       xvmuldp                 vs43,   vs3,    vs25
+
+       lxvd2x  vs10,   o32,    AO
+       lxvd2x  vs11,   o48,    AO
+
+       xvmuldp                 vs44,   vs4,    vs25
+       xvmuldp                 vs45,   vs5,    vs25
+       xvmuldp                 vs46,   vs6,    vs25
+       xvmuldp                 vs47,   vs7,    vs25
+
+       addi            AO, AO, 64
+
+       xvmuldp                 vs48,   vs0,    vs26
+       xvmuldp                 vs49,   vs1,    vs26
+       xvmuldp                 vs50,   vs2,    vs26
+       xvmuldp                 vs51,   vs3,    vs26
+
+       lxvd2x  vs12,   0,      AO
+       lxvd2x  vs13,   o16,    AO
+
+       xvmuldp                 vs52,   vs4,    vs26
+       xvmuldp                 vs53,   vs5,    vs26
+       xvmuldp                 vs54,   vs6,    vs26
+       xvmuldp                 vs55,   vs7,    vs26
+
+       lxvd2x  vs14,   o32,    AO
+       lxvd2x  vs15,   o48,    AO
+
+       xvmuldp                 vs56,   vs0,    vs27
+       xvmuldp                 vs57,   vs1,    vs27
+       xvmuldp                 vs58,   vs2,    vs27
+       xvmuldp                 vs59,   vs3,    vs27
+
+       lxvdsx  vs30,   o16,    BO
+       lxvdsx  vs31,   o24,    BO
+
+       xvmuldp                 vs60,   vs4,    vs27
+       xvmuldp                 vs61,   vs5,    vs27
+       xvmuldp                 vs62,   vs6,    vs27
+       xvmuldp                 vs63,   vs7,    vs27
+
+       addi            AO, AO, 64
+       addi            BO, BO, 32
+
+.endm
+
+.macro KERNEL4x16_1
+
+       xvmaddadp               vs32,   vs0,    vs24
+       xvmaddadp               vs33,   vs1,    vs24
+       xvmaddadp               vs34,   vs2,    vs24
+       xvmaddadp               vs35,   vs3,    vs24
+
+       lxvd2x  vs8,    0,      AO
+       lxvd2x  vs9,    o16,    AO
+
+       xvmaddadp               vs36,   vs4,    vs24
+       xvmaddadp               vs37,   vs5,    vs24
+       xvmaddadp               vs38,   vs6,    vs24
+       xvmaddadp               vs39,   vs7,    vs24
+
+       lxvdsx  vs28,   0,      BO
+       lxvdsx  vs29,   o8,     BO
+
+       xvmaddadp               vs40,   vs0,    vs25
+       xvmaddadp               vs41,   vs1,    vs25
+       xvmaddadp               vs42,   vs2,    vs25
+       xvmaddadp               vs43,   vs3,    vs25
+
+       lxvd2x  vs10,   o32,    AO
+       lxvd2x  vs11,   o48,    AO
+
+       xvmaddadp               vs44,   vs4,    vs25
+       xvmaddadp               vs45,   vs5,    vs25
+       xvmaddadp               vs46,   vs6,    vs25
+       xvmaddadp               vs47,   vs7,    vs25
+
+       addi            AO, AO, 64
+
+       xvmaddadp               vs48,   vs0,    vs26
+       xvmaddadp               vs49,   vs1,    vs26
+       xvmaddadp               vs50,   vs2,    vs26
+       xvmaddadp               vs51,   vs3,    vs26
+
+       lxvd2x  vs12,   0,      AO
+       lxvd2x  vs13,   o16,    AO
+
+       xvmaddadp               vs52,   vs4,    vs26
+       xvmaddadp               vs53,   vs5,    vs26
+       xvmaddadp               vs54,   vs6,    vs26
+       xvmaddadp               vs55,   vs7,    vs26
+
+       lxvd2x  vs14,   o32,    AO
+       lxvd2x  vs15,   o48,    AO
+
+       xvmaddadp               vs56,   vs0,    vs27
+       xvmaddadp               vs57,   vs1,    vs27
+       xvmaddadp               vs58,   vs2,    vs27
+       xvmaddadp               vs59,   vs3,    vs27
+
+
+       lxvdsx  vs30,   o16,    BO
+       lxvdsx  vs31,   o24,    BO
+
+       xvmaddadp               vs60,   vs4,    vs27
+       xvmaddadp               vs61,   vs5,    vs27
+       xvmaddadp               vs62,   vs6,    vs27
+       xvmaddadp               vs63,   vs7,    vs27
+
+       addi            AO, AO, 64
+       addi            BO, BO, 32
+
+.endm
+
+.macro KERNEL4x16_2
+
+       xvmaddadp               vs32,   vs8,    vs28
+       xvmaddadp               vs33,   vs9,    vs28
+       xvmaddadp               vs34,   vs10,   vs28
+       xvmaddadp               vs35,   vs11,   vs28
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+
+       xvmaddadp               vs36,   vs12,   vs28
+       xvmaddadp               vs37,   vs13,   vs28
+       xvmaddadp               vs38,   vs14,   vs28
+       xvmaddadp               vs39,   vs15,   vs28
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+
+       xvmaddadp               vs40,   vs8,    vs29
+       xvmaddadp               vs41,   vs9,    vs29
+       xvmaddadp               vs42,   vs10,   vs29
+       xvmaddadp               vs43,   vs11,   vs29
+
+       lxvd2x  vs2,    o32,    AO
+       lxvd2x  vs3,    o48,    AO
+
+       xvmaddadp               vs44,   vs12,   vs29
+       xvmaddadp               vs45,   vs13,   vs29
+       xvmaddadp               vs46,   vs14,   vs29
+       xvmaddadp               vs47,   vs15,   vs29
+
+       addi            AO, AO, 64
+
+       xvmaddadp               vs48,   vs8,    vs30
+       xvmaddadp               vs49,   vs9,    vs30
+       xvmaddadp               vs50,   vs10,   vs30
+       xvmaddadp               vs51,   vs11,   vs30
+
+       lxvd2x  vs4,    0,      AO
+       lxvd2x  vs5,    o16,    AO
+
+       xvmaddadp               vs52,   vs12,   vs30
+       xvmaddadp               vs53,   vs13,   vs30
+       xvmaddadp               vs54,   vs14,   vs30
+       xvmaddadp               vs55,   vs15,   vs30
+
+       lxvd2x  vs6,    o32,    AO
+       lxvd2x  vs7,    o48,    AO
+
+       xvmaddadp               vs56,   vs8,    vs31
+       xvmaddadp               vs57,   vs9,    vs31
+       xvmaddadp               vs58,   vs10,   vs31
+       xvmaddadp               vs59,   vs11,   vs31
+
+       lxvdsx  vs26,   o16,    BO
+       lxvdsx  vs27,   o24,    BO
+
+       xvmaddadp               vs60,   vs12,   vs31
+       xvmaddadp               vs61,   vs13,   vs31
+       xvmaddadp               vs62,   vs14,   vs31
+       xvmaddadp               vs63,   vs15,   vs31
+
+       addi            AO, AO, 64
+       addi            BO, BO, 32
+
+.endm
+
+.macro KERNEL4x16_E2
+
+
+       xvmaddadp               vs32,   vs8,    vs28
+       xvmaddadp               vs33,   vs9,    vs28
+       xvmaddadp               vs34,   vs10,   vs28
+       xvmaddadp               vs35,   vs11,   vs28
+       xvmaddadp               vs36,   vs12,   vs28
+       xvmaddadp               vs37,   vs13,   vs28
+       xvmaddadp               vs38,   vs14,   vs28
+       xvmaddadp               vs39,   vs15,   vs28
+
+       xvmaddadp               vs40,   vs8,    vs29
+       xvmaddadp               vs41,   vs9,    vs29
+       xvmaddadp               vs42,   vs10,   vs29
+       xvmaddadp               vs43,   vs11,   vs29
+       xvmaddadp               vs44,   vs12,   vs29
+       xvmaddadp               vs45,   vs13,   vs29
+       xvmaddadp               vs46,   vs14,   vs29
+       xvmaddadp               vs47,   vs15,   vs29
+
+       xvmaddadp               vs48,   vs8,    vs30
+       xvmaddadp               vs49,   vs9,    vs30
+       xvmaddadp               vs50,   vs10,   vs30
+       xvmaddadp               vs51,   vs11,   vs30
+       xvmaddadp               vs52,   vs12,   vs30
+       xvmaddadp               vs53,   vs13,   vs30
+       xvmaddadp               vs54,   vs14,   vs30
+       xvmaddadp               vs55,   vs15,   vs30
+
+       xvmaddadp               vs56,   vs8,    vs31
+       xvmaddadp               vs57,   vs9,    vs31
+       xvmaddadp               vs58,   vs10,   vs31
+       xvmaddadp               vs59,   vs11,   vs31
+       xvmaddadp               vs60,   vs12,   vs31
+       xvmaddadp               vs61,   vs13,   vs31
+       xvmaddadp               vs62,   vs14,   vs31
+       xvmaddadp               vs63,   vs15,   vs31
+
+.endm
+
+.macro KERNEL4x16_SUBI1
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+       lxvd2x  vs2,    o32,    AO
+       lxvd2x  vs3,    o48,    AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+       lxvdsx  vs26,   o16,    BO
+       lxvdsx  vs27,   o24,    BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 32
+
+       lxvd2x  vs4,    0,      AO
+       lxvd2x  vs5,    o16,    AO
+       lxvd2x  vs6,    o32,    AO
+       lxvd2x  vs7,    o48,    AO
+
+       addi            AO, AO, 64
+
+
+       xvmuldp                 vs32,   vs0,    vs24
+       xvmuldp                 vs33,   vs1,    vs24
+       xvmuldp                 vs34,   vs2,    vs24
+       xvmuldp                 vs35,   vs3,    vs24
+       xvmuldp                 vs36,   vs4,    vs24
+       xvmuldp                 vs37,   vs5,    vs24
+       xvmuldp                 vs38,   vs6,    vs24
+       xvmuldp                 vs39,   vs7,    vs24
+
+       xvmuldp                 vs40,   vs0,    vs25
+       xvmuldp                 vs41,   vs1,    vs25
+       xvmuldp                 vs42,   vs2,    vs25
+       xvmuldp                 vs43,   vs3,    vs25
+       xvmuldp                 vs44,   vs4,    vs25
+       xvmuldp                 vs45,   vs5,    vs25
+       xvmuldp                 vs46,   vs6,    vs25
+       xvmuldp                 vs47,   vs7,    vs25
+
+       xvmuldp                 vs48,   vs0,    vs26
+       xvmuldp                 vs49,   vs1,    vs26
+       xvmuldp                 vs50,   vs2,    vs26
+       xvmuldp                 vs51,   vs3,    vs26
+       xvmuldp                 vs52,   vs4,    vs26
+       xvmuldp                 vs53,   vs5,    vs26
+       xvmuldp                 vs54,   vs6,    vs26
+       xvmuldp                 vs55,   vs7,    vs26
+
+       xvmuldp                 vs56,   vs0,    vs27
+       xvmuldp                 vs57,   vs1,    vs27
+       xvmuldp                 vs58,   vs2,    vs27
+       xvmuldp                 vs59,   vs3,    vs27
+       xvmuldp                 vs60,   vs4,    vs27
+       xvmuldp                 vs61,   vs5,    vs27
+       xvmuldp                 vs62,   vs6,    vs27
+       xvmuldp                 vs63,   vs7,    vs27
+
+.endm
+
+.macro KERNEL4x16_SUB1
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+       lxvd2x  vs2,    o32,    AO
+       lxvd2x  vs3,    o48,    AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+       lxvdsx  vs26,   o16,    BO
+       lxvdsx  vs27,   o24,    BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 32
+
+       lxvd2x  vs4,    0,      AO
+       lxvd2x  vs5,    o16,    AO
+       lxvd2x  vs6,    o32,    AO
+       lxvd2x  vs7,    o48,    AO
+
+       addi            AO, AO, 64
+
+
+       xvmaddadp               vs32,   vs0,    vs24
+       xvmaddadp               vs33,   vs1,    vs24
+       xvmaddadp               vs34,   vs2,    vs24
+       xvmaddadp               vs35,   vs3,    vs24
+       xvmaddadp               vs36,   vs4,    vs24
+       xvmaddadp               vs37,   vs5,    vs24
+       xvmaddadp               vs38,   vs6,    vs24
+       xvmaddadp               vs39,   vs7,    vs24
+
+       xvmaddadp               vs40,   vs0,    vs25
+       xvmaddadp               vs41,   vs1,    vs25
+       xvmaddadp               vs42,   vs2,    vs25
+       xvmaddadp               vs43,   vs3,    vs25
+       xvmaddadp               vs44,   vs4,    vs25
+       xvmaddadp               vs45,   vs5,    vs25
+       xvmaddadp               vs46,   vs6,    vs25
+       xvmaddadp               vs47,   vs7,    vs25
+
+       xvmaddadp               vs48,   vs0,    vs26
+       xvmaddadp               vs49,   vs1,    vs26
+       xvmaddadp               vs50,   vs2,    vs26
+       xvmaddadp               vs51,   vs3,    vs26
+       xvmaddadp               vs52,   vs4,    vs26
+       xvmaddadp               vs53,   vs5,    vs26
+       xvmaddadp               vs54,   vs6,    vs26
+       xvmaddadp               vs55,   vs7,    vs26
+
+       xvmaddadp               vs56,   vs0,    vs27
+       xvmaddadp               vs57,   vs1,    vs27
+       xvmaddadp               vs58,   vs2,    vs27
+       xvmaddadp               vs59,   vs3,    vs27
+       xvmaddadp               vs60,   vs4,    vs27
+       xvmaddadp               vs61,   vs5,    vs27
+       xvmaddadp               vs62,   vs6,    vs27
+       xvmaddadp               vs63,   vs7,    vs27
+
+.endm
+
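+/* SAVE4x16 writes the 16x4 tile back to C, one column (LDC apart) at a
+*  time.  Without TRMMKERNEL the existing C values are loaded and updated
+*  as C += alpha * acc (xvmaddadp); with TRMMKERNEL the result is stored
+*  as alpha * acc (xvmuldp) and C is never read.
+*/
+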
+.macro SAVE4x16
+
+       mr              T1,     CO
+       addi            T2,     T1,     64
+
+#ifndef TRMMKERNEL
+       lxvd2x          vs0,    0,      T1
+       lxvd2x          vs1,    o16,    T1
+       lxvd2x          vs2,    o32,    T1
+       lxvd2x          vs3,    o48,    T1
+
+       lxvd2x          vs4,    0,      T2
+       lxvd2x          vs5,    o16,    T2
+       lxvd2x          vs6,    o32,    T2
+       lxvd2x          vs7,    o48,    T2
+#endif
+
+#ifndef TRMMKERNEL
+       xvmaddadp       vs0,    vs32,   alpha_r
+       xvmaddadp       vs1,    vs33,   alpha_r
+       xvmaddadp       vs2,    vs34,   alpha_r
+       xvmaddadp       vs3,    vs35,   alpha_r
+       xvmaddadp       vs4,    vs36,   alpha_r
+       xvmaddadp       vs5,    vs37,   alpha_r
+       xvmaddadp       vs6,    vs38,   alpha_r
+       xvmaddadp       vs7,    vs39,   alpha_r
+#else
+       xvmuldp         vs0,    vs32,   alpha_r
+       xvmuldp         vs1,    vs33,   alpha_r
+       xvmuldp         vs2,    vs34,   alpha_r
+       xvmuldp         vs3,    vs35,   alpha_r
+       xvmuldp         vs4,    vs36,   alpha_r
+       xvmuldp         vs5,    vs37,   alpha_r
+       xvmuldp         vs6,    vs38,   alpha_r
+       xvmuldp         vs7,    vs39,   alpha_r
+#endif
+
+       stxvd2x         vs0,    0,      T1
+       stxvd2x         vs1,    o16,    T1
+       stxvd2x         vs2,    o32,    T1
+       stxvd2x         vs3,    o48,    T1
+
+
+       stxvd2x         vs4,    0,      T2
+       stxvd2x         vs5,    o16,    T2
+       stxvd2x         vs6,    o32,    T2
+       stxvd2x         vs7,    o48,    T2
+
+       add             T1,     T1,     LDC
+       add             T2,     T2,     LDC
+
+#ifndef TRMMKERNEL
+       lxvd2x          vs8,    0,      T1
+       lxvd2x          vs9,    o16,    T1
+       lxvd2x          vs10,   o32,    T1
+       lxvd2x          vs11,   o48,    T1
+
+       lxvd2x          vs12,   0,      T2
+       lxvd2x          vs13,   o16,    T2
+       lxvd2x          vs14,   o32,    T2
+       lxvd2x          vs15,   o48,    T2
+#endif
+
+#ifndef TRMMKERNEL
+       xvmaddadp       vs8,    vs40,   alpha_r
+       xvmaddadp       vs9,    vs41,   alpha_r
+       xvmaddadp       vs10,   vs42,   alpha_r
+       xvmaddadp       vs11,   vs43,   alpha_r
+       xvmaddadp       vs12,   vs44,   alpha_r
+       xvmaddadp       vs13,   vs45,   alpha_r
+       xvmaddadp       vs14,   vs46,   alpha_r
+       xvmaddadp       vs15,   vs47,   alpha_r
+#else
+       xvmuldp         vs8,    vs40,   alpha_r
+       xvmuldp         vs9,    vs41,   alpha_r
+       xvmuldp         vs10,   vs42,   alpha_r
+       xvmuldp         vs11,   vs43,   alpha_r
+       xvmuldp         vs12,   vs44,   alpha_r
+       xvmuldp         vs13,   vs45,   alpha_r
+       xvmuldp         vs14,   vs46,   alpha_r
+       xvmuldp         vs15,   vs47,   alpha_r
+#endif
+
+       stxvd2x         vs8,    0,      T1
+       stxvd2x         vs9,    o16,    T1
+       stxvd2x         vs10,   o32,    T1
+       stxvd2x         vs11,   o48,    T1
+
+
+       stxvd2x         vs12,   0,      T2
+       stxvd2x         vs13,   o16,    T2
+       stxvd2x         vs14,   o32,    T2
+       stxvd2x         vs15,   o48,    T2
+
+       add             T1,     T1,     LDC
+       add             T2,     T2,     LDC
+
+#ifndef TRMMKERNEL
+       lxvd2x          vs0,    0,      T1
+       lxvd2x          vs1,    o16,    T1
+       lxvd2x          vs2,    o32,    T1
+       lxvd2x          vs3,    o48,    T1
+
+       lxvd2x          vs4,    0,      T2
+       lxvd2x          vs5,    o16,    T2
+       lxvd2x          vs6,    o32,    T2
+       lxvd2x          vs7,    o48,    T2
+#endif
+
+#ifndef TRMMKERNEL
+       xvmaddadp       vs0,    vs48,   alpha_r
+       xvmaddadp       vs1,    vs49,   alpha_r
+       xvmaddadp       vs2,    vs50,   alpha_r
+       xvmaddadp       vs3,    vs51,   alpha_r
+       xvmaddadp       vs4,    vs52,   alpha_r
+       xvmaddadp       vs5,    vs53,   alpha_r
+       xvmaddadp       vs6,    vs54,   alpha_r
+       xvmaddadp       vs7,    vs55,   alpha_r
+#else
+       xvmuldp         vs0,    vs48,   alpha_r
+       xvmuldp         vs1,    vs49,   alpha_r
+       xvmuldp         vs2,    vs50,   alpha_r
+       xvmuldp         vs3,    vs51,   alpha_r
+       xvmuldp         vs4,    vs52,   alpha_r
+       xvmuldp         vs5,    vs53,   alpha_r
+       xvmuldp         vs6,    vs54,   alpha_r
+       xvmuldp         vs7,    vs55,   alpha_r
+#endif
+
+       stxvd2x         vs0,    0,      T1
+       stxvd2x         vs1,    o16,    T1
+       stxvd2x         vs2,    o32,    T1
+       stxvd2x         vs3,    o48,    T1
+
+
+       stxvd2x         vs4,    0,      T2
+       stxvd2x         vs5,    o16,    T2
+       stxvd2x         vs6,    o32,    T2
+       stxvd2x         vs7,    o48,    T2
+
+       add             T1,     T1,     LDC
+       add             T2,     T2,     LDC
+
+#ifndef TRMMKERNEL
+       lxvd2x          vs8,    0,      T1
+       lxvd2x          vs9,    o16,    T1
+       lxvd2x          vs10,   o32,    T1
+       lxvd2x          vs11,   o48,    T1
+
+       lxvd2x          vs12,   0,      T2
+       lxvd2x          vs13,   o16,    T2
+       lxvd2x          vs14,   o32,    T2
+       lxvd2x          vs15,   o48,    T2
+#endif
+
+#ifndef TRMMKERNEL
+       xvmaddadp       vs8,    vs56,   alpha_r
+       xvmaddadp       vs9,    vs57,   alpha_r
+       xvmaddadp       vs10,   vs58,   alpha_r
+       xvmaddadp       vs11,   vs59,   alpha_r
+       xvmaddadp       vs12,   vs60,   alpha_r
+       xvmaddadp       vs13,   vs61,   alpha_r
+       xvmaddadp       vs14,   vs62,   alpha_r
+       xvmaddadp       vs15,   vs63,   alpha_r
+#else
+       xvmuldp         vs8,    vs56,   alpha_r
+       xvmuldp         vs9,    vs57,   alpha_r
+       xvmuldp         vs10,   vs58,   alpha_r
+       xvmuldp         vs11,   vs59,   alpha_r
+       xvmuldp         vs12,   vs60,   alpha_r
+       xvmuldp         vs13,   vs61,   alpha_r
+       xvmuldp         vs14,   vs62,   alpha_r
+       xvmuldp         vs15,   vs63,   alpha_r
+#endif
+
+       stxvd2x         vs8,    0,      T1
+       stxvd2x         vs9,    o16,    T1
+       stxvd2x         vs10,   o32,    T1
+       stxvd2x         vs11,   o48,    T1
+
+
+       stxvd2x         vs12,   0,      T2
+       stxvd2x         vs13,   o16,    T2
+       stxvd2x         vs14,   o32,    T2
+       stxvd2x         vs15,   o48,    T2
+
+       addi            CO,     CO,     128
+
+.endm
+
+/*********************************************************************
+* Macros for N=4, M=8                                                *
+*********************************************************************/
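+/* Same load/compute pipeline as the 16x4 kernel, with eight rows of A in
+*  vs0..vs3 and the 8x4 accumulator tile in vs32..vs35, vs40..vs43,
+*  vs48..vs51 and vs56..vs59.
+*/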
+
+.macro LOAD4x8_1
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+
+       lxvd2x  vs2,    o32,    AO
+       lxvd2x  vs3,    o48,    AO
+
+       lxvdsx  vs26,   o16,    BO
+       lxvdsx  vs27,   o24,    BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 32
+
+.endm
+
+.macro KERNEL4x8_I1
+
+       xvmuldp                 vs32,   vs0,    vs24
+       xvmuldp                 vs33,   vs1,    vs24
+       xvmuldp                 vs34,   vs2,    vs24
+       xvmuldp                 vs35,   vs3,    vs24
+
+       lxvd2x  vs8,    0,      AO
+       lxvd2x  vs9,    o16,    AO
+
+       xvmuldp                 vs40,   vs0,    vs25
+       xvmuldp                 vs41,   vs1,    vs25
+
+       lxvdsx  vs28,   0,      BO
+       lxvdsx  vs29,   o8,     BO
+
+       xvmuldp                 vs42,   vs2,    vs25
+       xvmuldp                 vs43,   vs3,    vs25
+
+       xvmuldp                 vs48,   vs0,    vs26
+       xvmuldp                 vs49,   vs1,    vs26
+
+       lxvd2x  vs10,   o32,    AO
+       lxvd2x  vs11,   o48,    AO
+
+       xvmuldp                 vs50,   vs2,    vs26
+       xvmuldp                 vs51,   vs3,    vs26
+
+       lxvdsx  vs30,   o16,    BO
+       lxvdsx  vs31,   o24,    BO
+
+       xvmuldp                 vs56,   vs0,    vs27
+       xvmuldp                 vs57,   vs1,    vs27
+       xvmuldp                 vs58,   vs2,    vs27
+       xvmuldp                 vs59,   vs3,    vs27
+
+       addi            AO, AO, 64
+       addi            BO, BO, 32
+
+.endm
+
+.macro KERNEL4x8_1
+
+       xvmaddadp               vs32,   vs0,    vs24
+       xvmaddadp               vs33,   vs1,    vs24
+       xvmaddadp               vs34,   vs2,    vs24
+       xvmaddadp               vs35,   vs3,    vs24
+
+       lxvd2x  vs8,    0,      AO
+       lxvd2x  vs9,    o16,    AO
+
+       xvmaddadp               vs40,   vs0,    vs25
+       xvmaddadp               vs41,   vs1,    vs25
+       xvmaddadp               vs42,   vs2,    vs25
+       xvmaddadp               vs43,   vs3,    vs25
+
+       lxvdsx  vs28,   0,      BO
+       lxvdsx  vs29,   o8,     BO
+
+       xvmaddadp               vs48,   vs0,    vs26
+       xvmaddadp               vs49,   vs1,    vs26
+
+       lxvd2x  vs10,   o32,    AO
+       lxvd2x  vs11,   o48,    AO
+
+       xvmaddadp               vs50,   vs2,    vs26
+       xvmaddadp               vs51,   vs3,    vs26
+
+       lxvdsx  vs30,   o16,    BO
+       lxvdsx  vs31,   o24,    BO
+
+       xvmaddadp               vs56,   vs0,    vs27
+       xvmaddadp               vs57,   vs1,    vs27
+       xvmaddadp               vs58,   vs2,    vs27
+       xvmaddadp               vs59,   vs3,    vs27
+
+       addi            AO, AO, 64
+       addi            BO, BO, 32
+
+.endm
+
+.macro KERNEL4x8_2
+
+       xvmaddadp               vs32,   vs8,    vs28
+       xvmaddadp               vs33,   vs9,    vs28
+       xvmaddadp               vs34,   vs10,   vs28
+       xvmaddadp               vs35,   vs11,   vs28
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+
+       xvmaddadp               vs40,   vs8,    vs29
+       xvmaddadp               vs41,   vs9,    vs29
+       xvmaddadp               vs42,   vs10,   vs29
+       xvmaddadp               vs43,   vs11,   vs29
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+
+       xvmaddadp               vs48,   vs8,    vs30
+       xvmaddadp               vs49,   vs9,    vs30
+
+       lxvd2x  vs2,    o32,    AO
+       lxvd2x  vs3,    o48,    AO
+
+       xvmaddadp               vs50,   vs10,   vs30
+       xvmaddadp               vs51,   vs11,   vs30
+
+       lxvdsx  vs26,   o16,    BO
+       lxvdsx  vs27,   o24,    BO
+
+       xvmaddadp               vs56,   vs8,    vs31
+       xvmaddadp               vs57,   vs9,    vs31
+       xvmaddadp               vs58,   vs10,   vs31
+       xvmaddadp               vs59,   vs11,   vs31
+
+       addi            AO, AO, 64
+       addi            BO, BO, 32
+
+.endm
+
+.macro KERNEL4x8_E2
+
+
+       xvmaddadp               vs32,   vs8,    vs28
+       xvmaddadp               vs33,   vs9,    vs28
+       xvmaddadp               vs34,   vs10,   vs28
+       xvmaddadp               vs35,   vs11,   vs28
+
+       xvmaddadp               vs40,   vs8,    vs29
+       xvmaddadp               vs41,   vs9,    vs29
+       xvmaddadp               vs42,   vs10,   vs29
+       xvmaddadp               vs43,   vs11,   vs29
+
+       xvmaddadp               vs48,   vs8,    vs30
+       xvmaddadp               vs49,   vs9,    vs30
+       xvmaddadp               vs50,   vs10,   vs30
+       xvmaddadp               vs51,   vs11,   vs30
+
+       xvmaddadp               vs56,   vs8,    vs31
+       xvmaddadp               vs57,   vs9,    vs31
+       xvmaddadp               vs58,   vs10,   vs31
+       xvmaddadp               vs59,   vs11,   vs31
+
+.endm
+
+.macro KERNEL4x8_SUBI1
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+       lxvd2x  vs2,    o32,    AO
+       lxvd2x  vs3,    o48,    AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+       lxvdsx  vs26,   o16,    BO
+       lxvdsx  vs27,   o24,    BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 32
+
+
+       xvmuldp                 vs32,   vs0,    vs24
+       xvmuldp                 vs33,   vs1,    vs24
+       xvmuldp                 vs34,   vs2,    vs24
+       xvmuldp                 vs35,   vs3,    vs24
+
+       xvmuldp                 vs40,   vs0,    vs25
+       xvmuldp                 vs41,   vs1,    vs25
+       xvmuldp                 vs42,   vs2,    vs25
+       xvmuldp                 vs43,   vs3,    vs25
+
+       xvmuldp                 vs48,   vs0,    vs26
+       xvmuldp                 vs49,   vs1,    vs26
+       xvmuldp                 vs50,   vs2,    vs26
+       xvmuldp                 vs51,   vs3,    vs26
+
+       xvmuldp                 vs56,   vs0,    vs27
+       xvmuldp                 vs57,   vs1,    vs27
+       xvmuldp                 vs58,   vs2,    vs27
+       xvmuldp                 vs59,   vs3,    vs27
+
+.endm
+
+.macro KERNEL4x8_SUB1
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+       lxvd2x  vs2,    o32,    AO
+       lxvd2x  vs3,    o48,    AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+       lxvdsx  vs26,   o16,    BO
+       lxvdsx  vs27,   o24,    BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 32
+
+
+       xvmaddadp               vs32,   vs0,    vs24
+       xvmaddadp               vs33,   vs1,    vs24
+       xvmaddadp               vs34,   vs2,    vs24
+       xvmaddadp               vs35,   vs3,    vs24
+
+       xvmaddadp               vs40,   vs0,    vs25
+       xvmaddadp               vs41,   vs1,    vs25
+       xvmaddadp               vs42,   vs2,    vs25
+       xvmaddadp               vs43,   vs3,    vs25
+
+       xvmaddadp               vs48,   vs0,    vs26
+       xvmaddadp               vs49,   vs1,    vs26
+       xvmaddadp               vs50,   vs2,    vs26
+       xvmaddadp               vs51,   vs3,    vs26
+
+       xvmaddadp               vs56,   vs0,    vs27
+       xvmaddadp               vs57,   vs1,    vs27
+       xvmaddadp               vs58,   vs2,    vs27
+       xvmaddadp               vs59,   vs3,    vs27
+
+.endm
+
+.macro SAVE4x8
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+       lxvd2x          vs0,    0,      T1
+       lxvd2x          vs1,    o16,    T1
+       lxvd2x          vs2,    o32,    T1
+       lxvd2x          vs3,    o48,    T1
+#endif
+
+#ifndef TRMMKERNEL
+       xvmaddadp       vs0,    vs32,   alpha_r
+       xvmaddadp       vs1,    vs33,   alpha_r
+       xvmaddadp       vs2,    vs34,   alpha_r
+       xvmaddadp       vs3,    vs35,   alpha_r
+#else
+       xvmuldp         vs0,    vs32,   alpha_r
+       xvmuldp         vs1,    vs33,   alpha_r
+       xvmuldp         vs2,    vs34,   alpha_r
+       xvmuldp         vs3,    vs35,   alpha_r
+#endif
+
+       stxvd2x         vs0,    0,      T1
+       stxvd2x         vs1,    o16,    T1
+       stxvd2x         vs2,    o32,    T1
+       stxvd2x         vs3,    o48,    T1
+
+       add             T1,     T1,     LDC
+
+#ifndef TRMMKERNEL
+       lxvd2x          vs8,    0,      T1
+       lxvd2x          vs9,    o16,    T1
+       lxvd2x          vs10,   o32,    T1
+       lxvd2x          vs11,   o48,    T1
+#endif
+
+#ifndef TRMMKERNEL
+       xvmaddadp       vs8,    vs40,   alpha_r
+       xvmaddadp       vs9,    vs41,   alpha_r
+       xvmaddadp       vs10,   vs42,   alpha_r
+       xvmaddadp       vs11,   vs43,   alpha_r
+#else
+       xvmuldp         vs8,    vs40,   alpha_r
+       xvmuldp         vs9,    vs41,   alpha_r
+       xvmuldp         vs10,   vs42,   alpha_r
+       xvmuldp         vs11,   vs43,   alpha_r
+#endif
+
+       stxvd2x         vs8,    0,      T1
+       stxvd2x         vs9,    o16,    T1
+       stxvd2x         vs10,   o32,    T1
+       stxvd2x         vs11,   o48,    T1
+
+       add             T1,     T1,     LDC
+
+#ifndef TRMMKERNEL
+       lxvd2x          vs0,    0,      T1
+       lxvd2x          vs1,    o16,    T1
+       lxvd2x          vs2,    o32,    T1
+       lxvd2x          vs3,    o48,    T1
+#endif
+
+#ifndef TRMMKERNEL
+       xvmaddadp       vs0,    vs48,   alpha_r
+       xvmaddadp       vs1,    vs49,   alpha_r
+       xvmaddadp       vs2,    vs50,   alpha_r
+       xvmaddadp       vs3,    vs51,   alpha_r
+#else
+       xvmuldp         vs0,    vs48,   alpha_r
+       xvmuldp         vs1,    vs49,   alpha_r
+       xvmuldp         vs2,    vs50,   alpha_r
+       xvmuldp         vs3,    vs51,   alpha_r
+#endif
+
+       stxvd2x         vs0,    0,      T1
+       stxvd2x         vs1,    o16,    T1
+       stxvd2x         vs2,    o32,    T1
+       stxvd2x         vs3,    o48,    T1
+
+       add             T1,     T1,     LDC
+
+#ifndef TRMMKERNEL
+       lxvd2x          vs8,    0,      T1
+       lxvd2x          vs9,    o16,    T1
+       lxvd2x          vs10,   o32,    T1
+       lxvd2x          vs11,   o48,    T1
+#endif
+
+#ifndef TRMMKERNEL
+       xvmaddadp       vs8,    vs56,   alpha_r
+       xvmaddadp       vs9,    vs57,   alpha_r
+       xvmaddadp       vs10,   vs58,   alpha_r
+       xvmaddadp       vs11,   vs59,   alpha_r
+#else
+       xvmuldp         vs8,    vs56,   alpha_r
+       xvmuldp         vs9,    vs57,   alpha_r
+       xvmuldp         vs10,   vs58,   alpha_r
+       xvmuldp         vs11,   vs59,   alpha_r
+#endif
+
+       stxvd2x         vs8,    0,      T1
+       stxvd2x         vs9,    o16,    T1
+       stxvd2x         vs10,   o32,    T1
+       stxvd2x         vs11,   o48,    T1
+
+       addi            CO,     CO,     64
+
+.endm
+
+/*********************************************************************
+* Macros for N=4, M=4                                                *
+*********************************************************************/
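+/* Four rows of A in vs0 and vs1; the 4x4 tile accumulates in vs32..vs33,
+*  vs40..vs41, vs48..vs49 and vs56..vs57.
+*/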
+
+.macro LOAD4x4_1
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+       lxvdsx  vs26,   o16,    BO
+       lxvdsx  vs27,   o24,    BO
+
+       addi            AO, AO, 32
+       addi            BO, BO, 32
+
+.endm
+
+.macro KERNEL4x4_I1
+
+       lxvd2x  vs8,    0,      AO
+       lxvd2x  vs9,    o16,    AO
+
+       lxvdsx  vs28,   0,      BO
+       lxvdsx  vs29,   o8,     BO
+       lxvdsx  vs30,   o16,    BO
+       lxvdsx  vs31,   o24,    BO
+
+       addi            AO, AO, 32
+       addi            BO, BO, 32
+
+
+       xvmuldp                 vs32,   vs0,    vs24
+       xvmuldp                 vs33,   vs1,    vs24
+
+       xvmuldp                 vs40,   vs0,    vs25
+       xvmuldp                 vs41,   vs1,    vs25
+
+       xvmuldp                 vs48,   vs0,    vs26
+       xvmuldp                 vs49,   vs1,    vs26
+
+       xvmuldp                 vs56,   vs0,    vs27
+       xvmuldp                 vs57,   vs1,    vs27
+
+.endm
+
+.macro KERNEL4x4_1
+
+       lxvd2x  vs8,    0,      AO
+       lxvd2x  vs9,    o16,    AO
+
+       lxvdsx  vs28,   0,      BO
+       lxvdsx  vs29,   o8,     BO
+       lxvdsx  vs30,   o16,    BO
+       lxvdsx  vs31,   o24,    BO
+
+       addi            AO, AO, 32
+       addi            BO, BO, 32
+
+
+       xvmaddadp               vs32,   vs0,    vs24
+       xvmaddadp               vs33,   vs1,    vs24
+
+       xvmaddadp               vs40,   vs0,    vs25
+       xvmaddadp               vs41,   vs1,    vs25
+
+       xvmaddadp               vs48,   vs0,    vs26
+       xvmaddadp               vs49,   vs1,    vs26
+
+       xvmaddadp               vs56,   vs0,    vs27
+       xvmaddadp               vs57,   vs1,    vs27
+
+.endm
+
+.macro KERNEL4x4_2
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+       lxvdsx  vs26,   o16,    BO
+       lxvdsx  vs27,   o24,    BO
+
+       addi            AO, AO, 32
+       addi            BO, BO, 32
+
+
+       xvmaddadp               vs32,   vs8,    vs28
+       xvmaddadp               vs33,   vs9,    vs28
+
+       xvmaddadp               vs40,   vs8,    vs29
+       xvmaddadp               vs41,   vs9,    vs29
+
+       xvmaddadp               vs48,   vs8,    vs30
+       xvmaddadp               vs49,   vs9,    vs30
+
+       xvmaddadp               vs56,   vs8,    vs31
+       xvmaddadp               vs57,   vs9,    vs31
+
+.endm
+
+.macro KERNEL4x4_E2
+
+
+       xvmaddadp               vs32,   vs8,    vs28
+       xvmaddadp               vs33,   vs9,    vs28
+
+       xvmaddadp               vs40,   vs8,    vs29
+       xvmaddadp               vs41,   vs9,    vs29
+
+       xvmaddadp               vs48,   vs8,    vs30
+       xvmaddadp               vs49,   vs9,    vs30
+
+       xvmaddadp               vs56,   vs8,    vs31
+       xvmaddadp               vs57,   vs9,    vs31
+
+.endm
+
+.macro KERNEL4x4_SUBI1
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+       lxvdsx  vs26,   o16,    BO
+       lxvdsx  vs27,   o24,    BO
+
+       addi            AO, AO, 32
+       addi            BO, BO, 32
+
+
+       xvmuldp                 vs32,   vs0,    vs24
+       xvmuldp                 vs33,   vs1,    vs24
+
+       xvmuldp                 vs40,   vs0,    vs25
+       xvmuldp                 vs41,   vs1,    vs25
+
+       xvmuldp                 vs48,   vs0,    vs26
+       xvmuldp                 vs49,   vs1,    vs26
+
+       xvmuldp                 vs56,   vs0,    vs27
+       xvmuldp                 vs57,   vs1,    vs27
+
+.endm
+
+.macro KERNEL4x4_SUB1
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+       lxvdsx  vs26,   o16,    BO
+       lxvdsx  vs27,   o24,    BO
+
+       addi            AO, AO, 32
+       addi            BO, BO, 32
+
+
+       xvmaddadp               vs32,   vs0,    vs24
+       xvmaddadp               vs33,   vs1,    vs24
+
+       xvmaddadp               vs40,   vs0,    vs25
+       xvmaddadp               vs41,   vs1,    vs25
+
+       xvmaddadp               vs48,   vs0,    vs26
+       xvmaddadp               vs49,   vs1,    vs26
+
+       xvmaddadp               vs56,   vs0,    vs27
+       xvmaddadp               vs57,   vs1,    vs27
+
+.endm
+
+.macro SAVE4x4
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+       lxvd2x          vs0,    0,      T1
+       lxvd2x          vs1,    o16,    T1
+#endif
+
+#ifndef TRMMKERNEL
+       xvmaddadp       vs0,    vs32,   alpha_r
+       xvmaddadp       vs1,    vs33,   alpha_r
+#else
+       xvmuldp         vs0,    vs32,   alpha_r
+       xvmuldp         vs1,    vs33,   alpha_r
+#endif
+
+       stxvd2x         vs0,    0,      T1
+       stxvd2x         vs1,    o16,    T1
+
+       add             T1,     T1,     LDC
+
+#ifndef TRMMKERNEL
+       lxvd2x          vs8,    0,      T1
+       lxvd2x          vs9,    o16,    T1
+#endif
+
+#ifndef TRMMKERNEL
+       xvmaddadp       vs8,    vs40,   alpha_r
+       xvmaddadp       vs9,    vs41,   alpha_r
+#else
+       xvmuldp         vs8,    vs40,   alpha_r
+       xvmuldp         vs9,    vs41,   alpha_r
+#endif
+
+       stxvd2x         vs8,    0,      T1
+       stxvd2x         vs9,    o16,    T1
+
+       add             T1,     T1,     LDC
+
+#ifndef TRMMKERNEL
+       lxvd2x          vs0,    0,      T1
+       lxvd2x          vs1,    o16,    T1
+#endif
+
+#ifndef TRMMKERNEL
+       xvmaddadp       vs0,    vs48,   alpha_r
+       xvmaddadp       vs1,    vs49,   alpha_r
+#else
+       xvmuldp         vs0,    vs48,   alpha_r
+       xvmuldp         vs1,    vs49,   alpha_r
+#endif
+
+       stxvd2x         vs0,    0,      T1
+       stxvd2x         vs1,    o16,    T1
+
+       add             T1,     T1,     LDC
+
+#ifndef TRMMKERNEL
+       lxvd2x          vs8,    0,      T1
+       lxvd2x          vs9,    o16,    T1
+#endif
+
+#ifndef TRMMKERNEL
+       xvmaddadp       vs8,    vs56,   alpha_r
+       xvmaddadp       vs9,    vs57,   alpha_r
+#else
+       xvmuldp         vs8,    vs56,   alpha_r
+       xvmuldp         vs9,    vs57,   alpha_r
+#endif
+
+       stxvd2x         vs8,    0,      T1
+       stxvd2x         vs9,    o16,    T1
+
+       addi            CO,     CO,     32
+
+.endm
+
+/*********************************************************************
+* Macros for N=4, M=2                                                *
+*********************************************************************/
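+/* Two rows of A in vs0; one accumulator register per column of B
+*  (vs32, vs40, vs48, vs56).
+*/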
+
+.macro LOAD4x2_1
+
+       lxvd2x  vs0,    0,      AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+       lxvdsx  vs26,   o16,    BO
+       lxvdsx  vs27,   o24,    BO
+
+       addi            AO, AO, 16
+       addi            BO, BO, 32
+
+.endm
+
+.macro KERNEL4x2_I1
+
+       lxvd2x  vs8,    0,      AO
+
+       lxvdsx  vs28,   0,      BO
+       lxvdsx  vs29,   o8,     BO
+       lxvdsx  vs30,   o16,    BO
+       lxvdsx  vs31,   o24,    BO
+
+       addi            AO, AO, 16
+       addi            BO, BO, 32
+
+
+       xvmuldp                 vs32,   vs0,    vs24
+
+       xvmuldp                 vs40,   vs0,    vs25
+
+       xvmuldp                 vs48,   vs0,    vs26
+
+       xvmuldp                 vs56,   vs0,    vs27
+
+.endm
+
+.macro KERNEL4x2_1
+
+       lxvd2x  vs8,    0,      AO
+
+       lxvdsx  vs28,   0,      BO
+       lxvdsx  vs29,   o8,     BO
+       lxvdsx  vs30,   o16,    BO
+       lxvdsx  vs31,   o24,    BO
+
+       addi            AO, AO, 16
+       addi            BO, BO, 32
+
+
+       xvmaddadp               vs32,   vs0,    vs24
+
+       xvmaddadp               vs40,   vs0,    vs25
+
+       xvmaddadp               vs48,   vs0,    vs26
+
+       xvmaddadp               vs56,   vs0,    vs27
+
+.endm
+
+.macro KERNEL4x2_2
+
+       lxvd2x  vs0,    0,      AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+       lxvdsx  vs26,   o16,    BO
+       lxvdsx  vs27,   o24,    BO
+
+       addi            AO, AO, 16
+       addi            BO, BO, 32
+
+
+       xvmaddadp               vs32,   vs8,    vs28
+
+       xvmaddadp               vs40,   vs8,    vs29
+
+       xvmaddadp               vs48,   vs8,    vs30
+
+       xvmaddadp               vs56,   vs8,    vs31
+
+.endm
+
+.macro KERNEL4x2_E2
+
+
+       xvmaddadp               vs32,   vs8,    vs28
+
+       xvmaddadp               vs40,   vs8,    vs29
+
+       xvmaddadp               vs48,   vs8,    vs30
+
+       xvmaddadp               vs56,   vs8,    vs31
+
+.endm
+
+.macro KERNEL4x2_SUBI1
+
+       lxvd2x  vs0,    0,      AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+       lxvdsx  vs26,   o16,    BO
+       lxvdsx  vs27,   o24,    BO
+
+       addi            AO, AO, 16
+       addi            BO, BO, 32
+
+
+       xvmuldp                 vs32,   vs0,    vs24
+
+       xvmuldp                 vs40,   vs0,    vs25
+
+       xvmuldp                 vs48,   vs0,    vs26
+
+       xvmuldp                 vs56,   vs0,    vs27
+
+.endm
+
+.macro KERNEL4x2_SUB1
+
+       lxvd2x  vs0,    0,      AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+       lxvdsx  vs26,   o16,    BO
+       lxvdsx  vs27,   o24,    BO
+
+       addi            AO, AO, 16
+       addi            BO, BO, 32
+
+
+       xvmaddadp               vs32,   vs0,    vs24
+
+       xvmaddadp               vs40,   vs0,    vs25
+
+       xvmaddadp               vs48,   vs0,    vs26
+
+       xvmaddadp               vs56,   vs0,    vs27
+
+.endm
+
+.macro SAVE4x2
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+       lxvd2x          vs0,    0,      T1
+#endif
+
+#ifndef TRMMKERNEL
+       xvmaddadp       vs0,    vs32,   alpha_r
+#else
+       xvmuldp         vs0,    vs32,   alpha_r
+#endif
+
+       stxvd2x         vs0,    0,      T1
+
+       add             T1,     T1,     LDC
+
+#ifndef TRMMKERNEL
+       lxvd2x          vs8,    0,      T1
+#endif
+
+#ifndef TRMMKERNEL
+       xvmaddadp       vs8,    vs40,   alpha_r
+#else
+       xvmuldp         vs8,    vs40,   alpha_r
+#endif
+
+       stxvd2x         vs8,    0,      T1
+
+       add             T1,     T1,     LDC
+
+#ifndef TRMMKERNEL
+       lxvd2x          vs0,    0,      T1
+#endif
+
+#ifndef TRMMKERNEL
+       xvmaddadp       vs0,    vs48,   alpha_r
+#else
+       xvmuldp         vs0,    vs48,   alpha_r
+#endif
+
+       stxvd2x         vs0,    0,      T1
+
+       add             T1,     T1,     LDC
+
+#ifndef TRMMKERNEL
+       lxvd2x          vs8,    0,      T1
+#endif
+
+#ifndef TRMMKERNEL
+       xvmaddadp       vs8,    vs56,   alpha_r
+#else
+       xvmuldp         vs8,    vs56,   alpha_r
+#endif
+
+       stxvd2x         vs8,    0,      T1
+
+       addi            CO,     CO,     16
+
+.endm
+
+/*********************************************************************
+* Macros for N=4, M=1                                                *
+*********************************************************************/
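+/* Single-row tail: A and B are loaded as scalar doubles with lxsdx and
+*  accumulated with xsmuldp/xsmaddadp into vs32, vs40, vs48 and vs56.
+*/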
+
+.macro LOAD4x1_1
+
+       lxsdx   vs0,    0,      AO
+
+       lxsdx   vs24,   0,      BO
+       lxsdx   vs25,   o8,     BO
+       lxsdx   vs26,   o16,    BO
+       lxsdx   vs27,   o24,    BO
+
+       addi            AO, AO, 8
+       addi            BO, BO, 32
+
+.endm
+
+.macro KERNEL4x1_I1
+
+       lxsdx   vs8,    0,      AO
+
+       lxsdx   vs28,   0,      BO
+       lxsdx   vs29,   o8,     BO
+       lxsdx   vs30,   o16,    BO
+       lxsdx   vs31,   o24,    BO
+
+       addi            AO, AO, 8
+       addi            BO, BO, 32
+
+
+       xsmuldp                 vs32,   vs0,    vs24
+
+       xsmuldp                 vs40,   vs0,    vs25
+
+       xsmuldp                 vs48,   vs0,    vs26
+
+       xsmuldp                 vs56,   vs0,    vs27
+
+.endm
+
+.macro KERNEL4x1_1
+
+       lxsdx   vs8,    0,      AO
+
+       lxsdx   vs28,   0,      BO
+       lxsdx   vs29,   o8,     BO
+       lxsdx   vs30,   o16,    BO
+       lxsdx   vs31,   o24,    BO
+
+       addi            AO, AO, 8
+       addi            BO, BO, 32
+
+
+       xsmaddadp               vs32,   vs0,    vs24
+
+       xsmaddadp               vs40,   vs0,    vs25
+
+       xsmaddadp               vs48,   vs0,    vs26
+
+       xsmaddadp               vs56,   vs0,    vs27
+
+.endm
+
+.macro KERNEL4x1_2
+
+       lxsdx   vs0,    0,      AO
+
+       lxsdx   vs24,   0,      BO
+       lxsdx   vs25,   o8,     BO
+       lxsdx   vs26,   o16,    BO
+       lxsdx   vs27,   o24,    BO
+
+       addi            AO, AO, 8
+       addi            BO, BO, 32
+
+
+       xsmaddadp               vs32,   vs8,    vs28
+
+       xsmaddadp               vs40,   vs8,    vs29
+
+       xsmaddadp               vs48,   vs8,    vs30
+
+       xsmaddadp               vs56,   vs8,    vs31
+
+.endm
+
+.macro KERNEL4x1_E2
+
+
+       xsmaddadp               vs32,   vs8,    vs28
+
+       xsmaddadp               vs40,   vs8,    vs29
+
+       xsmaddadp               vs48,   vs8,    vs30
+
+       xsmaddadp               vs56,   vs8,    vs31
+
+.endm
+
+.macro KERNEL4x1_SUBI1
+
+       lxsdx   vs0,    0,      AO
+
+       lxsdx   vs24,   0,      BO
+       lxsdx   vs25,   o8,     BO
+       lxsdx   vs26,   o16,    BO
+       lxsdx   vs27,   o24,    BO
+
+       addi            AO, AO, 8
+       addi            BO, BO, 32
+
+
+       xsmuldp                 vs32,   vs0,    vs24
+
+       xsmuldp                 vs40,   vs0,    vs25
+
+       xsmuldp                 vs48,   vs0,    vs26
+
+       xsmuldp                 vs56,   vs0,    vs27
+
+.endm
+
+.macro KERNEL4x1_SUB1
+
+       lxsdx   vs0,    0,      AO
+
+       lxsdx   vs24,   0,      BO
+       lxsdx   vs25,   o8,     BO
+       lxsdx   vs26,   o16,    BO
+       lxsdx   vs27,   o24,    BO
+
+       addi            AO, AO, 8
+       addi            BO, BO, 32
+
+
+       xsmaddadp               vs32,   vs0,    vs24
+
+       xsmaddadp               vs40,   vs0,    vs25
+
+       xsmaddadp               vs48,   vs0,    vs26
+
+       xsmaddadp               vs56,   vs0,    vs27
+
+.endm
+
+.macro SAVE4x1
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+       lxsdx           vs0,    0,      T1
+#endif
+
+#ifndef TRMMKERNEL
+       xsmaddadp       vs0,    vs32,   alpha_r
+#else
+       xsmuldp         vs0,    vs32,   alpha_r
+#endif
+
+       stxsdx          vs0,    0,      T1
+
+       add             T1,     T1,     LDC
+
+#ifndef TRMMKERNEL
+       lxsdx           vs8,    0,      T1
+#endif
+
+#ifndef TRMMKERNEL
+       xsmaddadp       vs8,    vs40,   alpha_r
+#else
+       xsmuldp         vs8,    vs40,   alpha_r
+#endif
+
+       stxsdx          vs8,    0,      T1
+
+       add             T1,     T1,     LDC
+
+#ifndef TRMMKERNEL
+       lxsdx           vs0,    0,      T1
+#endif
+
+#ifndef TRMMKERNEL
+       xsmaddadp       vs0,    vs48,   alpha_r
+#else
+       xsmuldp         vs0,    vs48,   alpha_r
+#endif
+
+       stxsdx          vs0,    0,      T1
+
+       add             T1,     T1,     LDC
+
+#ifndef TRMMKERNEL
+       lxsdx           vs8,    0,      T1
+#endif
+
+#ifndef TRMMKERNEL
+       xsmaddadp       vs8,    vs56,   alpha_r
+#else
+       xsmuldp         vs8,    vs56,   alpha_r
+#endif
+
+       stxsdx          vs8,    0,      T1
+
+       addi            CO,     CO,     8
+
+.endm
+
+/*********************************************************************
+* Macros for N=2, M=16                                               *
+*********************************************************************/
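+/* N=2 variant of the 16x4 kernel: only two B values (vs24, vs25) are
+*  broadcast per iteration and BO advances by 16 bytes; the 16x2 tile
+*  accumulates in vs32..vs39 and vs40..vs47.
+*/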
+
+.macro LOAD2x16_1
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+       lxvd2x  vs2,    o32,    AO
+       lxvd2x  vs3,    o48,    AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 16
+
+       lxvd2x  vs4,    0,      AO
+       lxvd2x  vs5,    o16,    AO
+       lxvd2x  vs6,    o32,    AO
+       lxvd2x  vs7,    o48,    AO
+
+       addi            AO, AO, 64
+
+.endm
+
+.macro KERNEL2x16_I1
+
+       lxvd2x  vs8,    0,      AO
+       lxvd2x  vs9,    o16,    AO
+       lxvd2x  vs10,   o32,    AO
+       lxvd2x  vs11,   o48,    AO
+
+       lxvdsx  vs28,   0,      BO
+       lxvdsx  vs29,   o8,     BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 16
+
+       lxvd2x  vs12,   0,      AO
+       lxvd2x  vs13,   o16,    AO
+       lxvd2x  vs14,   o32,    AO
+       lxvd2x  vs15,   o48,    AO
+
+       addi            AO, AO, 64
+
+
+       xvmuldp                 vs32,   vs0,    vs24
+       xvmuldp                 vs33,   vs1,    vs24
+       xvmuldp                 vs34,   vs2,    vs24
+       xvmuldp                 vs35,   vs3,    vs24
+       xvmuldp                 vs36,   vs4,    vs24
+       xvmuldp                 vs37,   vs5,    vs24
+       xvmuldp                 vs38,   vs6,    vs24
+       xvmuldp                 vs39,   vs7,    vs24
+
+       xvmuldp                 vs40,   vs0,    vs25
+       xvmuldp                 vs41,   vs1,    vs25
+       xvmuldp                 vs42,   vs2,    vs25
+       xvmuldp                 vs43,   vs3,    vs25
+       xvmuldp                 vs44,   vs4,    vs25
+       xvmuldp                 vs45,   vs5,    vs25
+       xvmuldp                 vs46,   vs6,    vs25
+       xvmuldp                 vs47,   vs7,    vs25
+
+.endm
+
+.macro KERNEL2x16_1
+
+       lxvd2x  vs8,    0,      AO
+       lxvd2x  vs9,    o16,    AO
+       lxvd2x  vs10,   o32,    AO
+       lxvd2x  vs11,   o48,    AO
+
+       lxvdsx  vs28,   0,      BO
+       lxvdsx  vs29,   o8,     BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 16
+
+       lxvd2x  vs12,   0,      AO
+       lxvd2x  vs13,   o16,    AO
+       lxvd2x  vs14,   o32,    AO
+       lxvd2x  vs15,   o48,    AO
+
+       addi            AO, AO, 64
+
+
+       xvmaddadp               vs32,   vs0,    vs24
+       xvmaddadp               vs33,   vs1,    vs24
+       xvmaddadp               vs34,   vs2,    vs24
+       xvmaddadp               vs35,   vs3,    vs24
+       xvmaddadp               vs36,   vs4,    vs24
+       xvmaddadp               vs37,   vs5,    vs24
+       xvmaddadp               vs38,   vs6,    vs24
+       xvmaddadp               vs39,   vs7,    vs24
+
+       xvmaddadp               vs40,   vs0,    vs25
+       xvmaddadp               vs41,   vs1,    vs25
+       xvmaddadp               vs42,   vs2,    vs25
+       xvmaddadp               vs43,   vs3,    vs25
+       xvmaddadp               vs44,   vs4,    vs25
+       xvmaddadp               vs45,   vs5,    vs25
+       xvmaddadp               vs46,   vs6,    vs25
+       xvmaddadp               vs47,   vs7,    vs25
+
+.endm
+
+.macro KERNEL2x16_2
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+       lxvd2x  vs2,    o32,    AO
+       lxvd2x  vs3,    o48,    AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 16
+
+       lxvd2x  vs4,    0,      AO
+       lxvd2x  vs5,    o16,    AO
+       lxvd2x  vs6,    o32,    AO
+       lxvd2x  vs7,    o48,    AO
+
+       addi            AO, AO, 64
+
+
+       xvmaddadp               vs32,   vs8,    vs28
+       xvmaddadp               vs33,   vs9,    vs28
+       xvmaddadp               vs34,   vs10,   vs28
+       xvmaddadp               vs35,   vs11,   vs28
+       xvmaddadp               vs36,   vs12,   vs28
+       xvmaddadp               vs37,   vs13,   vs28
+       xvmaddadp               vs38,   vs14,   vs28
+       xvmaddadp               vs39,   vs15,   vs28
+
+       xvmaddadp               vs40,   vs8,    vs29
+       xvmaddadp               vs41,   vs9,    vs29
+       xvmaddadp               vs42,   vs10,   vs29
+       xvmaddadp               vs43,   vs11,   vs29
+       xvmaddadp               vs44,   vs12,   vs29
+       xvmaddadp               vs45,   vs13,   vs29
+       xvmaddadp               vs46,   vs14,   vs29
+       xvmaddadp               vs47,   vs15,   vs29
+
+.endm
+
+.macro KERNEL2x16_E2
+
+
+       xvmaddadp               vs32,   vs8,    vs28
+       xvmaddadp               vs33,   vs9,    vs28
+       xvmaddadp               vs34,   vs10,   vs28
+       xvmaddadp               vs35,   vs11,   vs28
+       xvmaddadp               vs36,   vs12,   vs28
+       xvmaddadp               vs37,   vs13,   vs28
+       xvmaddadp               vs38,   vs14,   vs28
+       xvmaddadp               vs39,   vs15,   vs28
+
+       xvmaddadp               vs40,   vs8,    vs29
+       xvmaddadp               vs41,   vs9,    vs29
+       xvmaddadp               vs42,   vs10,   vs29
+       xvmaddadp               vs43,   vs11,   vs29
+       xvmaddadp               vs44,   vs12,   vs29
+       xvmaddadp               vs45,   vs13,   vs29
+       xvmaddadp               vs46,   vs14,   vs29
+       xvmaddadp               vs47,   vs15,   vs29
+
+.endm
+
+.macro KERNEL2x16_SUBI1
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+       lxvd2x  vs2,    o32,    AO
+       lxvd2x  vs3,    o48,    AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 16
+
+       lxvd2x  vs4,    0,      AO
+       lxvd2x  vs5,    o16,    AO
+       lxvd2x  vs6,    o32,    AO
+       lxvd2x  vs7,    o48,    AO
+
+       addi            AO, AO, 64
+
+
+       xvmuldp                 vs32,   vs0,    vs24
+       xvmuldp                 vs33,   vs1,    vs24
+       xvmuldp                 vs34,   vs2,    vs24
+       xvmuldp                 vs35,   vs3,    vs24
+       xvmuldp                 vs36,   vs4,    vs24
+       xvmuldp                 vs37,   vs5,    vs24
+       xvmuldp                 vs38,   vs6,    vs24
+       xvmuldp                 vs39,   vs7,    vs24
+
+       xvmuldp                 vs40,   vs0,    vs25
+       xvmuldp                 vs41,   vs1,    vs25
+       xvmuldp                 vs42,   vs2,    vs25
+       xvmuldp                 vs43,   vs3,    vs25
+       xvmuldp                 vs44,   vs4,    vs25
+       xvmuldp                 vs45,   vs5,    vs25
+       xvmuldp                 vs46,   vs6,    vs25
+       xvmuldp                 vs47,   vs7,    vs25
+
+.endm
+
+.macro KERNEL2x16_SUB1
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+       lxvd2x  vs2,    o32,    AO
+       lxvd2x  vs3,    o48,    AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 16
+
+       lxvd2x  vs4,    0,      AO
+       lxvd2x  vs5,    o16,    AO
+       lxvd2x  vs6,    o32,    AO
+       lxvd2x  vs7,    o48,    AO
+
+       addi            AO, AO, 64
+
+
+       xvmaddadp               vs32,   vs0,    vs24
+       xvmaddadp               vs33,   vs1,    vs24
+       xvmaddadp               vs34,   vs2,    vs24
+       xvmaddadp               vs35,   vs3,    vs24
+       xvmaddadp               vs36,   vs4,    vs24
+       xvmaddadp               vs37,   vs5,    vs24
+       xvmaddadp               vs38,   vs6,    vs24
+       xvmaddadp               vs39,   vs7,    vs24
+
+       xvmaddadp               vs40,   vs0,    vs25
+       xvmaddadp               vs41,   vs1,    vs25
+       xvmaddadp               vs42,   vs2,    vs25
+       xvmaddadp               vs43,   vs3,    vs25
+       xvmaddadp               vs44,   vs4,    vs25
+       xvmaddadp               vs45,   vs5,    vs25
+       xvmaddadp               vs46,   vs6,    vs25
+       xvmaddadp               vs47,   vs7,    vs25
+
+.endm
+
+.macro SAVE2x16
+
+       mr              T1,     CO
+       addi            T2,     T1,     64
+
+#ifndef TRMMKERNEL
+       lxvd2x          vs0,    0,      T1
+       lxvd2x          vs1,    o16,    T1
+       lxvd2x          vs2,    o32,    T1
+       lxvd2x          vs3,    o48,    T1
+
+       lxvd2x          vs4,    0,      T2
+       lxvd2x          vs5,    o16,    T2
+       lxvd2x          vs6,    o32,    T2
+       lxvd2x          vs7,    o48,    T2
+#endif
+
+#ifndef TRMMKERNEL
+       xvmaddadp       vs0,    vs32,   alpha_r
+       xvmaddadp       vs1,    vs33,   alpha_r
+       xvmaddadp       vs2,    vs34,   alpha_r
+       xvmaddadp       vs3,    vs35,   alpha_r
+       xvmaddadp       vs4,    vs36,   alpha_r
+       xvmaddadp       vs5,    vs37,   alpha_r
+       xvmaddadp       vs6,    vs38,   alpha_r
+       xvmaddadp       vs7,    vs39,   alpha_r
+#else
+       xvmuldp         vs0,    vs32,   alpha_r
+       xvmuldp         vs1,    vs33,   alpha_r
+       xvmuldp         vs2,    vs34,   alpha_r
+       xvmuldp         vs3,    vs35,   alpha_r
+       xvmuldp         vs4,    vs36,   alpha_r
+       xvmuldp         vs5,    vs37,   alpha_r
+       xvmuldp         vs6,    vs38,   alpha_r
+       xvmuldp         vs7,    vs39,   alpha_r
+#endif
+
+       stxvd2x         vs0,    0,      T1
+       stxvd2x         vs1,    o16,    T1
+       stxvd2x         vs2,    o32,    T1
+       stxvd2x         vs3,    o48,    T1
+
+       stxvd2x         vs4,    0,      T2
+       stxvd2x         vs5,    o16,    T2
+       stxvd2x         vs6,    o32,    T2
+       stxvd2x         vs7,    o48,    T2
+
+       add             T1,     T1,     LDC
+       add             T2,     T2,     LDC
+
+#ifndef TRMMKERNEL
+       lxvd2x          vs8,    0,      T1
+       lxvd2x          vs9,    o16,    T1
+       lxvd2x          vs10,   o32,    T1
+       lxvd2x          vs11,   o48,    T1
+
+       lxvd2x          vs12,   0,      T2
+       lxvd2x          vs13,   o16,    T2
+       lxvd2x          vs14,   o32,    T2
+       lxvd2x          vs15,   o48,    T2
+#endif
+
+#ifndef TRMMKERNEL
+       xvmaddadp       vs8,    vs40,   alpha_r
+       xvmaddadp       vs9,    vs41,   alpha_r
+       xvmaddadp       vs10,   vs42,   alpha_r
+       xvmaddadp       vs11,   vs43,   alpha_r
+       xvmaddadp       vs12,   vs44,   alpha_r
+       xvmaddadp       vs13,   vs45,   alpha_r
+       xvmaddadp       vs14,   vs46,   alpha_r
+       xvmaddadp       vs15,   vs47,   alpha_r
+#else
+       xvmuldp         vs8,    vs40,   alpha_r
+       xvmuldp         vs9,    vs41,   alpha_r
+       xvmuldp         vs10,   vs42,   alpha_r
+       xvmuldp         vs11,   vs43,   alpha_r
+       xvmuldp         vs12,   vs44,   alpha_r
+       xvmuldp         vs13,   vs45,   alpha_r
+       xvmuldp         vs14,   vs46,   alpha_r
+       xvmuldp         vs15,   vs47,   alpha_r
+#endif
+
+       stxvd2x         vs8,    0,      T1
+       stxvd2x         vs9,    o16,    T1
+       stxvd2x         vs10,   o32,    T1
+       stxvd2x         vs11,   o48,    T1
+
+       stxvd2x         vs12,   0,      T2
+       stxvd2x         vs13,   o16,    T2
+       stxvd2x         vs14,   o32,    T2
+       stxvd2x         vs15,   o48,    T2
+
+       addi            CO,     CO,     128
+
+.endm
+
+/*********************************************************************
+* Macros for N=2, M=8                                                *
+*********************************************************************/
+
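+/* The 2x8 macros below follow the same software-pipelined pattern as the
+*  larger tiles: LOAD2x8_1 preloads A into vs0-vs3 and B into vs24/vs25;
+*  KERNEL2x8_I1 starts the accumulators with xvmuldp while fetching the next
+*  A/B set into vs8-vs11 and vs28/vs29; KERNEL2x8_1 and KERNEL2x8_2 then
+*  alternate between the two register sets; KERNEL2x8_E2 drains the pipeline
+*  without further loads; KERNEL2x8_SUBI1/_SUB1 handle the K remainder one
+*  iteration at a time.
+*/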
+.macro LOAD2x8_1
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+       lxvd2x  vs2,    o32,    AO
+       lxvd2x  vs3,    o48,    AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 16
+
+.endm
+
+.macro KERNEL2x8_I1
+
+       lxvd2x  vs8,    0,      AO
+       lxvd2x  vs9,    o16,    AO
+       lxvd2x  vs10,   o32,    AO
+       lxvd2x  vs11,   o48,    AO
+
+       lxvdsx  vs28,   0,      BO
+       lxvdsx  vs29,   o8,     BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 16
+
+
+       xvmuldp                 vs32,   vs0,    vs24
+       xvmuldp                 vs33,   vs1,    vs24
+       xvmuldp                 vs34,   vs2,    vs24
+       xvmuldp                 vs35,   vs3,    vs24
+
+       xvmuldp                 vs40,   vs0,    vs25
+       xvmuldp                 vs41,   vs1,    vs25
+       xvmuldp                 vs42,   vs2,    vs25
+       xvmuldp                 vs43,   vs3,    vs25
+
+.endm
+
+.macro KERNEL2x8_1
+
+       lxvd2x  vs8,    0,      AO
+       lxvd2x  vs9,    o16,    AO
+       lxvd2x  vs10,   o32,    AO
+       lxvd2x  vs11,   o48,    AO
+
+       lxvdsx  vs28,   0,      BO
+       lxvdsx  vs29,   o8,     BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 16
+
+
+       xvmaddadp               vs32,   vs0,    vs24
+       xvmaddadp               vs33,   vs1,    vs24
+       xvmaddadp               vs34,   vs2,    vs24
+       xvmaddadp               vs35,   vs3,    vs24
+
+       xvmaddadp               vs40,   vs0,    vs25
+       xvmaddadp               vs41,   vs1,    vs25
+       xvmaddadp               vs42,   vs2,    vs25
+       xvmaddadp               vs43,   vs3,    vs25
+
+.endm
+
+.macro KERNEL2x8_2
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+       lxvd2x  vs2,    o32,    AO
+       lxvd2x  vs3,    o48,    AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 16
+
+
+       xvmaddadp               vs32,   vs8,    vs28
+       xvmaddadp               vs33,   vs9,    vs28
+       xvmaddadp               vs34,   vs10,   vs28
+       xvmaddadp               vs35,   vs11,   vs28
+
+       xvmaddadp               vs40,   vs8,    vs29
+       xvmaddadp               vs41,   vs9,    vs29
+       xvmaddadp               vs42,   vs10,   vs29
+       xvmaddadp               vs43,   vs11,   vs29
+
+.endm
+
+.macro KERNEL2x8_E2
+
+
+       xvmaddadp               vs32,   vs8,    vs28
+       xvmaddadp               vs33,   vs9,    vs28
+       xvmaddadp               vs34,   vs10,   vs28
+       xvmaddadp               vs35,   vs11,   vs28
+
+       xvmaddadp               vs40,   vs8,    vs29
+       xvmaddadp               vs41,   vs9,    vs29
+       xvmaddadp               vs42,   vs10,   vs29
+       xvmaddadp               vs43,   vs11,   vs29
+
+.endm
+
+.macro KERNEL2x8_SUBI1
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+       lxvd2x  vs2,    o32,    AO
+       lxvd2x  vs3,    o48,    AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 16
+
+
+       xvmuldp                 vs32,   vs0,    vs24
+       xvmuldp                 vs33,   vs1,    vs24
+       xvmuldp                 vs34,   vs2,    vs24
+       xvmuldp                 vs35,   vs3,    vs24
+
+       xvmuldp                 vs40,   vs0,    vs25
+       xvmuldp                 vs41,   vs1,    vs25
+       xvmuldp                 vs42,   vs2,    vs25
+       xvmuldp                 vs43,   vs3,    vs25
+
+.endm
+
+.macro KERNEL2x8_SUB1
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+       lxvd2x  vs2,    o32,    AO
+       lxvd2x  vs3,    o48,    AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 16
+
+
+       xvmaddadp               vs32,   vs0,    vs24
+       xvmaddadp               vs33,   vs1,    vs24
+       xvmaddadp               vs34,   vs2,    vs24
+       xvmaddadp               vs35,   vs3,    vs24
+
+       xvmaddadp               vs40,   vs0,    vs25
+       xvmaddadp               vs41,   vs1,    vs25
+       xvmaddadp               vs42,   vs2,    vs25
+       xvmaddadp               vs43,   vs3,    vs25
+
+.endm
+
+.macro SAVE2x8
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+       lxvd2x          vs0,    0,      T1
+       lxvd2x          vs1,    o16,    T1
+       lxvd2x          vs2,    o32,    T1
+       lxvd2x          vs3,    o48,    T1
+#endif
+
+#ifndef TRMMKERNEL
+       xvmaddadp       vs0,    vs32,   alpha_r
+       xvmaddadp       vs1,    vs33,   alpha_r
+       xvmaddadp       vs2,    vs34,   alpha_r
+       xvmaddadp       vs3,    vs35,   alpha_r
+#else
+       xvmuldp         vs0,    vs32,   alpha_r
+       xvmuldp         vs1,    vs33,   alpha_r
+       xvmuldp         vs2,    vs34,   alpha_r
+       xvmuldp         vs3,    vs35,   alpha_r
+#endif
+
+       stxvd2x         vs0,    0,      T1
+       stxvd2x         vs1,    o16,    T1
+       stxvd2x         vs2,    o32,    T1
+       stxvd2x         vs3,    o48,    T1
+
+       add             T1,     T1,     LDC
+
+#ifndef TRMMKERNEL
+       lxvd2x          vs8,    0,      T1
+       lxvd2x          vs9,    o16,    T1
+       lxvd2x          vs10,   o32,    T1
+       lxvd2x          vs11,   o48,    T1
+#endif
+
+#ifndef TRMMKERNEL
+       xvmaddadp       vs8,    vs40,   alpha_r
+       xvmaddadp       vs9,    vs41,   alpha_r
+       xvmaddadp       vs10,   vs42,   alpha_r
+       xvmaddadp       vs11,   vs43,   alpha_r
+#else
+       xvmuldp         vs8,    vs40,   alpha_r
+       xvmuldp         vs9,    vs41,   alpha_r
+       xvmuldp         vs10,   vs42,   alpha_r
+       xvmuldp         vs11,   vs43,   alpha_r
+#endif
+
+       stxvd2x         vs8,    0,      T1
+       stxvd2x         vs9,    o16,    T1
+       stxvd2x         vs10,   o32,    T1
+       stxvd2x         vs11,   o48,    T1
+
+       addi            CO,     CO,     64
+
+.endm
+
+/*********************************************************************
+* Macros for N=2, M=4                                                *
+*********************************************************************/
+
+.macro LOAD2x4_1
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+
+       addi            AO, AO, 32
+       addi            BO, BO, 16
+
+.endm
+
+.macro KERNEL2x4_I1
+
+       lxvd2x  vs8,    0,      AO
+       lxvd2x  vs9,    o16,    AO
+
+       lxvdsx  vs28,   0,      BO
+       lxvdsx  vs29,   o8,     BO
+
+       addi            AO, AO, 32
+       addi            BO, BO, 16
+
+
+       xvmuldp                 vs32,   vs0,    vs24
+       xvmuldp                 vs33,   vs1,    vs24
+
+       xvmuldp                 vs40,   vs0,    vs25
+       xvmuldp                 vs41,   vs1,    vs25
+
+.endm
+
+.macro KERNEL2x4_1
+
+       lxvd2x  vs8,    0,      AO
+       lxvd2x  vs9,    o16,    AO
+
+       lxvdsx  vs28,   0,      BO
+       lxvdsx  vs29,   o8,     BO
+
+       addi            AO, AO, 32
+       addi            BO, BO, 16
+
+
+       xvmaddadp               vs32,   vs0,    vs24
+       xvmaddadp               vs33,   vs1,    vs24
+
+       xvmaddadp               vs40,   vs0,    vs25
+       xvmaddadp               vs41,   vs1,    vs25
+
+.endm
+
+.macro KERNEL2x4_2
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+
+       addi            AO, AO, 32
+       addi            BO, BO, 16
+
+
+       xvmaddadp               vs32,   vs8,    vs28
+       xvmaddadp               vs33,   vs9,    vs28
+
+       xvmaddadp               vs40,   vs8,    vs29
+       xvmaddadp               vs41,   vs9,    vs29
+
+.endm
+
+.macro KERNEL2x4_E2
+
+
+       xvmaddadp               vs32,   vs8,    vs28
+       xvmaddadp               vs33,   vs9,    vs28
+
+       xvmaddadp               vs40,   vs8,    vs29
+       xvmaddadp               vs41,   vs9,    vs29
+
+.endm
+
+.macro KERNEL2x4_SUBI1
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+
+       addi            AO, AO, 32
+       addi            BO, BO, 16
+
+
+       xvmuldp                 vs32,   vs0,    vs24
+       xvmuldp                 vs33,   vs1,    vs24
+
+       xvmuldp                 vs40,   vs0,    vs25
+       xvmuldp                 vs41,   vs1,    vs25
+
+.endm
+
+.macro KERNEL2x4_SUB1
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+
+       addi            AO, AO, 32
+       addi            BO, BO, 16
+
+
+       xvmaddadp               vs32,   vs0,    vs24
+       xvmaddadp               vs33,   vs1,    vs24
+
+       xvmaddadp               vs40,   vs0,    vs25
+       xvmaddadp               vs41,   vs1,    vs25
+
+.endm
+
+.macro SAVE2x4
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+       lxvd2x          vs0,    0,      T1
+       lxvd2x          vs1,    o16,    T1
+#endif
+
+#ifndef TRMMKERNEL
+       xvmaddadp       vs0,    vs32,   alpha_r
+       xvmaddadp       vs1,    vs33,   alpha_r
+#else
+       xvmuldp         vs0,    vs32,   alpha_r
+       xvmuldp         vs1,    vs33,   alpha_r
+#endif
+
+       stxvd2x         vs0,    0,      T1
+       stxvd2x         vs1,    o16,    T1
+
+       add             T1,     T1,     LDC
+
+#ifndef TRMMKERNEL
+       lxvd2x          vs8,    0,      T1
+       lxvd2x          vs9,    o16,    T1
+#endif
+
+#ifndef TRMMKERNEL
+       xvmaddadp       vs8,    vs40,   alpha_r
+       xvmaddadp       vs9,    vs41,   alpha_r
+#else
+       xvmuldp         vs8,    vs40,   alpha_r
+       xvmuldp         vs9,    vs41,   alpha_r
+#endif
+
+       stxvd2x         vs8,    0,      T1
+       stxvd2x         vs9,    o16,    T1
+
+       addi            CO,     CO,     32
+
+.endm
+
+/*********************************************************************
+* Macros for N=2, M=2                                                *
+*********************************************************************/
+
+.macro LOAD2x2_1
+
+       lxvd2x  vs0,    0,      AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+
+       addi            AO, AO, 16
+       addi            BO, BO, 16
+
+.endm
+
+.macro KERNEL2x2_I1
+
+       lxvd2x  vs8,    0,      AO
+
+       lxvdsx  vs28,   0,      BO
+       lxvdsx  vs29,   o8,     BO
+
+       addi            AO, AO, 16
+       addi            BO, BO, 16
+
+
+       xvmuldp                 vs32,   vs0,    vs24
+
+       xvmuldp                 vs40,   vs0,    vs25
+
+.endm
+
+.macro KERNEL2x2_1
+
+       lxvd2x  vs8,    0,      AO
+
+       lxvdsx  vs28,   0,      BO
+       lxvdsx  vs29,   o8,     BO
+
+       addi            AO, AO, 16
+       addi            BO, BO, 16
+
+
+       xvmaddadp               vs32,   vs0,    vs24
+
+       xvmaddadp               vs40,   vs0,    vs25
+
+.endm
+
+.macro KERNEL2x2_2
+
+       lxvd2x  vs0,    0,      AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+
+       addi            AO, AO, 16
+       addi            BO, BO, 16
+
+
+       xvmaddadp               vs32,   vs8,    vs28
+
+       xvmaddadp               vs40,   vs8,    vs29
+
+.endm
+
+.macro KERNEL2x2_E2
+
+
+       xvmaddadp               vs32,   vs8,    vs28
+
+       xvmaddadp               vs40,   vs8,    vs29
+
+.endm
+
+.macro KERNEL2x2_SUBI1
+
+       lxvd2x  vs0,    0,      AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+
+       addi            AO, AO, 16
+       addi            BO, BO, 16
+
+
+       xvmuldp                 vs32,   vs0,    vs24
+
+       xvmuldp                 vs40,   vs0,    vs25
+
+.endm
+
+.macro KERNEL2x2_SUB1
+
+       lxvd2x  vs0,    0,      AO
+
+       lxvdsx  vs24,   0,      BO
+       lxvdsx  vs25,   o8,     BO
+
+       addi            AO, AO, 16
+       addi            BO, BO, 16
+
+
+       xvmaddadp               vs32,   vs0,    vs24
+
+       xvmaddadp               vs40,   vs0,    vs25
+
+.endm
+
+.macro SAVE2x2
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+       lxvd2x          vs0,    0,      T1
+#endif
+
+#ifndef TRMMKERNEL
+       xvmaddadp       vs0,    vs32,   alpha_r
+#else
+       xvmuldp         vs0,    vs32,   alpha_r
+#endif
+
+       stxvd2x         vs0,    0,      T1
+
+       add             T1,     T1,     LDC
+
+#ifndef TRMMKERNEL
+       lxvd2x          vs8,    0,      T1
+#endif
+
+#ifndef TRMMKERNEL
+       xvmaddadp       vs8,    vs40,   alpha_r
+#else
+       xvmuldp         vs8,    vs40,   alpha_r
+#endif
+
+       stxvd2x         vs8,    0,      T1
+
+       addi            CO,     CO,     16
+
+.endm
+
+/*********************************************************************
+* Macros for N=2, M=1                                                *
+*********************************************************************/
+
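+/* For M=1 the kernels use the scalar VSX forms (lxsdx, xsmuldp, xsmaddadp):
+*  one double of A per iteration against the two B values.
+*/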
+.macro LOAD2x1_1
+
+       lxsdx   vs0,    0,      AO
+
+       lxsdx   vs24,   0,      BO
+       lxsdx   vs25,   o8,     BO
+
+       addi            AO, AO, 8
+       addi            BO, BO, 16
+
+.endm
+
+.macro KERNEL2x1_I1
+
+       lxsdx   vs8,    0,      AO
+
+       lxsdx   vs28,   0,      BO
+       lxsdx   vs29,   o8,     BO
+
+       addi            AO, AO, 8
+       addi            BO, BO, 16
+
+
+       xsmuldp                 vs32,   vs0,    vs24
+
+       xsmuldp                 vs40,   vs0,    vs25
+
+.endm
+
+.macro KERNEL2x1_1
+
+       lxsdx   vs8,    0,      AO
+
+       lxsdx   vs28,   0,      BO
+       lxsdx   vs29,   o8,     BO
+
+       addi            AO, AO, 8
+       addi            BO, BO, 16
+
+
+       xsmaddadp               vs32,   vs0,    vs24
+
+       xsmaddadp               vs40,   vs0,    vs25
+
+.endm
+
+.macro KERNEL2x1_2
+
+       lxsdx   vs0,    0,      AO
+
+       lxsdx   vs24,   0,      BO
+       lxsdx   vs25,   o8,     BO
+
+       addi            AO, AO, 8
+       addi            BO, BO, 16
+
+
+       xsmaddadp               vs32,   vs8,    vs28
+
+       xsmaddadp               vs40,   vs8,    vs29
+
+.endm
+
+.macro KERNEL2x1_E2
+
+
+       xsmaddadp               vs32,   vs8,    vs28
+
+       xsmaddadp               vs40,   vs8,    vs29
+
+.endm
+
+.macro KERNEL2x1_SUBI1
+
+       lxsdx   vs0,    0,      AO
+
+       lxsdx   vs24,   0,      BO
+       lxsdx   vs25,   o8,     BO
+
+       addi            AO, AO, 8
+       addi            BO, BO, 16
+
+
+       xsmuldp                 vs32,   vs0,    vs24
+
+       xsmuldp                 vs40,   vs0,    vs25
+
+.endm
+
+.macro KERNEL2x1_SUB1
+
+       lxsdx   vs0,    0,      AO
+
+       lxsdx   vs24,   0,      BO
+       lxsdx   vs25,   o8,     BO
+
+       addi            AO, AO, 8
+       addi            BO, BO, 16
+
+
+       xsmaddadp               vs32,   vs0,    vs24
+
+       xsmaddadp               vs40,   vs0,    vs25
+
+.endm
+
+.macro SAVE2x1
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+       lxsdx           vs0,    0,      T1
+#endif
+
+#ifndef TRMMKERNEL
+       xsmaddadp       vs0,    vs32,   alpha_r
+#else
+       xsmuldp         vs0,    vs32,   alpha_r
+#endif
+
+       stxsdx          vs0,    0,      T1
+
+       add             T1,     T1,     LDC
+
+#ifndef TRMMKERNEL
+       lxsdx           vs8,    0,      T1
+#endif
+
+#ifndef TRMMKERNEL
+       xsmaddadp       vs8,    vs40,   alpha_r
+#else
+       xsmuldp         vs8,    vs40,   alpha_r
+#endif
+
+       stxsdx          vs8,    0,      T1
+
+       addi            CO,     CO,     8
+
+.endm
+
+/*********************************************************************
+* Macros for N=1, M=16                                               *
+*********************************************************************/
+
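+/* The N=1 variants below use a single B value per iteration (vs24 or vs28)
+*  and one row of accumulators, but otherwise mirror the N=2 macros above.
+*/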
+.macro LOAD1x16_1
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+       lxvd2x  vs2,    o32,    AO
+       lxvd2x  vs3,    o48,    AO
+
+       lxvdsx  vs24,   0,      BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 8
+
+       lxvd2x  vs4,    0,      AO
+       lxvd2x  vs5,    o16,    AO
+       lxvd2x  vs6,    o32,    AO
+       lxvd2x  vs7,    o48,    AO
+
+       addi            AO, AO, 64
+
+.endm
+
+.macro KERNEL1x16_I1
+
+       lxvd2x  vs8,    0,      AO
+       lxvd2x  vs9,    o16,    AO
+       lxvd2x  vs10,   o32,    AO
+       lxvd2x  vs11,   o48,    AO
+
+       lxvdsx  vs28,   0,      BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 8
+
+       lxvd2x  vs12,   0,      AO
+       lxvd2x  vs13,   o16,    AO
+       lxvd2x  vs14,   o32,    AO
+       lxvd2x  vs15,   o48,    AO
+
+       addi            AO, AO, 64
+
+
+       xvmuldp                 vs32,   vs0,    vs24
+       xvmuldp                 vs33,   vs1,    vs24
+       xvmuldp                 vs34,   vs2,    vs24
+       xvmuldp                 vs35,   vs3,    vs24
+       xvmuldp                 vs36,   vs4,    vs24
+       xvmuldp                 vs37,   vs5,    vs24
+       xvmuldp                 vs38,   vs6,    vs24
+       xvmuldp                 vs39,   vs7,    vs24
+
+.endm
+
+.macro KERNEL1x16_1
+
+       lxvd2x  vs8,    0,      AO
+       lxvd2x  vs9,    o16,    AO
+       lxvd2x  vs10,   o32,    AO
+       lxvd2x  vs11,   o48,    AO
+
+       lxvdsx  vs28,   0,      BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 8
+
+       lxvd2x  vs12,   0,      AO
+       lxvd2x  vs13,   o16,    AO
+       lxvd2x  vs14,   o32,    AO
+       lxvd2x  vs15,   o48,    AO
+
+       addi            AO, AO, 64
+
+
+       xvmaddadp               vs32,   vs0,    vs24
+       xvmaddadp               vs33,   vs1,    vs24
+       xvmaddadp               vs34,   vs2,    vs24
+       xvmaddadp               vs35,   vs3,    vs24
+       xvmaddadp               vs36,   vs4,    vs24
+       xvmaddadp               vs37,   vs5,    vs24
+       xvmaddadp               vs38,   vs6,    vs24
+       xvmaddadp               vs39,   vs7,    vs24
+
+.endm
+
+.macro KERNEL1x16_2
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+       lxvd2x  vs2,    o32,    AO
+       lxvd2x  vs3,    o48,    AO
+
+       lxvdsx  vs24,   0,      BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 8
+
+       lxvd2x  vs4,    0,      AO
+       lxvd2x  vs5,    o16,    AO
+       lxvd2x  vs6,    o32,    AO
+       lxvd2x  vs7,    o48,    AO
+
+       addi            AO, AO, 64
+
+
+       xvmaddadp               vs32,   vs8,    vs28
+       xvmaddadp               vs33,   vs9,    vs28
+       xvmaddadp               vs34,   vs10,   vs28
+       xvmaddadp               vs35,   vs11,   vs28
+       xvmaddadp               vs36,   vs12,   vs28
+       xvmaddadp               vs37,   vs13,   vs28
+       xvmaddadp               vs38,   vs14,   vs28
+       xvmaddadp               vs39,   vs15,   vs28
+
+.endm
+
+.macro KERNEL1x16_E2
+
+
+       xvmaddadp               vs32,   vs8,    vs28
+       xvmaddadp               vs33,   vs9,    vs28
+       xvmaddadp               vs34,   vs10,   vs28
+       xvmaddadp               vs35,   vs11,   vs28
+       xvmaddadp               vs36,   vs12,   vs28
+       xvmaddadp               vs37,   vs13,   vs28
+       xvmaddadp               vs38,   vs14,   vs28
+       xvmaddadp               vs39,   vs15,   vs28
+
+.endm
+
+.macro KERNEL1x16_SUBI1
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+       lxvd2x  vs2,    o32,    AO
+       lxvd2x  vs3,    o48,    AO
+
+       lxvdsx  vs24,   0,      BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 8
+
+       lxvd2x  vs4,    0,      AO
+       lxvd2x  vs5,    o16,    AO
+       lxvd2x  vs6,    o32,    AO
+       lxvd2x  vs7,    o48,    AO
+
+       addi            AO, AO, 64
+
+
+       xvmuldp                 vs32,   vs0,    vs24
+       xvmuldp                 vs33,   vs1,    vs24
+       xvmuldp                 vs34,   vs2,    vs24
+       xvmuldp                 vs35,   vs3,    vs24
+       xvmuldp                 vs36,   vs4,    vs24
+       xvmuldp                 vs37,   vs5,    vs24
+       xvmuldp                 vs38,   vs6,    vs24
+       xvmuldp                 vs39,   vs7,    vs24
+
+.endm
+
+.macro KERNEL1x16_SUB1
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+       lxvd2x  vs2,    o32,    AO
+       lxvd2x  vs3,    o48,    AO
+
+       lxvdsx  vs24,   0,      BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 8
+
+       lxvd2x  vs4,    0,      AO
+       lxvd2x  vs5,    o16,    AO
+       lxvd2x  vs6,    o32,    AO
+       lxvd2x  vs7,    o48,    AO
+
+       addi            AO, AO, 64
+
+
+       xvmaddadp               vs32,   vs0,    vs24
+       xvmaddadp               vs33,   vs1,    vs24
+       xvmaddadp               vs34,   vs2,    vs24
+       xvmaddadp               vs35,   vs3,    vs24
+       xvmaddadp               vs36,   vs4,    vs24
+       xvmaddadp               vs37,   vs5,    vs24
+       xvmaddadp               vs38,   vs6,    vs24
+       xvmaddadp               vs39,   vs7,    vs24
+
+.endm
+
+.macro SAVE1x16
+
+       mr              T1,     CO
+       addi            T2,     T1,     64
+
+#ifndef TRMMKERNEL
+       lxvd2x          vs0,    0,      T1
+       lxvd2x          vs1,    o16,    T1
+       lxvd2x          vs2,    o32,    T1
+       lxvd2x          vs3,    o48,    T1
+
+       lxvd2x          vs4,    0,      T2
+       lxvd2x          vs5,    o16,    T2
+       lxvd2x          vs6,    o32,    T2
+       lxvd2x          vs7,    o48,    T2
+#endif
+
+#ifndef TRMMKERNEL
+       xvmaddadp       vs0,    vs32,   alpha_r
+       xvmaddadp       vs1,    vs33,   alpha_r
+       xvmaddadp       vs2,    vs34,   alpha_r
+       xvmaddadp       vs3,    vs35,   alpha_r
+       xvmaddadp       vs4,    vs36,   alpha_r
+       xvmaddadp       vs5,    vs37,   alpha_r
+       xvmaddadp       vs6,    vs38,   alpha_r
+       xvmaddadp       vs7,    vs39,   alpha_r
+#else
+       xvmuldp         vs0,    vs32,   alpha_r
+       xvmuldp         vs1,    vs33,   alpha_r
+       xvmuldp         vs2,    vs34,   alpha_r
+       xvmuldp         vs3,    vs35,   alpha_r
+       xvmuldp         vs4,    vs36,   alpha_r
+       xvmuldp         vs5,    vs37,   alpha_r
+       xvmuldp         vs6,    vs38,   alpha_r
+       xvmuldp         vs7,    vs39,   alpha_r
+#endif
+
+       stxvd2x         vs0,    0,      T1
+       stxvd2x         vs1,    o16,    T1
+       stxvd2x         vs2,    o32,    T1
+       stxvd2x         vs3,    o48,    T1
+
+       stxvd2x         vs4,    0,      T2
+       stxvd2x         vs5,    o16,    T2
+       stxvd2x         vs6,    o32,    T2
+       stxvd2x         vs7,    o48,    T2
+
+       addi            CO,     CO,     128
+
+.endm
+
+/*********************************************************************
+* Macros for N=1, M=8                                                *
+*********************************************************************/
+
+.macro LOAD1x8_1
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+       lxvd2x  vs2,    o32,    AO
+       lxvd2x  vs3,    o48,    AO
+
+       lxvdsx  vs24,   0,      BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 8
+
+.endm
+
+.macro KERNEL1x8_I1
+
+       lxvd2x  vs8,    0,      AO
+       lxvd2x  vs9,    o16,    AO
+       lxvd2x  vs10,   o32,    AO
+       lxvd2x  vs11,   o48,    AO
+
+       lxvdsx  vs28,   0,      BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 8
+
+
+       xvmuldp                 vs32,   vs0,    vs24
+       xvmuldp                 vs33,   vs1,    vs24
+       xvmuldp                 vs34,   vs2,    vs24
+       xvmuldp                 vs35,   vs3,    vs24
+
+.endm
+
+.macro KERNEL1x8_1
+
+       lxvd2x  vs8,    0,      AO
+       lxvd2x  vs9,    o16,    AO
+       lxvd2x  vs10,   o32,    AO
+       lxvd2x  vs11,   o48,    AO
+
+       lxvdsx  vs28,   0,      BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 8
+
+
+       xvmaddadp               vs32,   vs0,    vs24
+       xvmaddadp               vs33,   vs1,    vs24
+       xvmaddadp               vs34,   vs2,    vs24
+       xvmaddadp               vs35,   vs3,    vs24
+
+.endm
+
+.macro KERNEL1x8_2
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+       lxvd2x  vs2,    o32,    AO
+       lxvd2x  vs3,    o48,    AO
+
+       lxvdsx  vs24,   0,      BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 8
+
+
+       xvmaddadp               vs32,   vs8,    vs28
+       xvmaddadp               vs33,   vs9,    vs28
+       xvmaddadp               vs34,   vs10,   vs28
+       xvmaddadp               vs35,   vs11,   vs28
+
+.endm
+
+.macro KERNEL1x8_E2
+
+
+       xvmaddadp               vs32,   vs8,    vs28
+       xvmaddadp               vs33,   vs9,    vs28
+       xvmaddadp               vs34,   vs10,   vs28
+       xvmaddadp               vs35,   vs11,   vs28
+
+.endm
+
+.macro KERNEL1x8_SUBI1
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+       lxvd2x  vs2,    o32,    AO
+       lxvd2x  vs3,    o48,    AO
+
+       lxvdsx  vs24,   0,      BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 8
+
+
+       xvmuldp                 vs32,   vs0,    vs24
+       xvmuldp                 vs33,   vs1,    vs24
+       xvmuldp                 vs34,   vs2,    vs24
+       xvmuldp                 vs35,   vs3,    vs24
+
+.endm
+
+.macro KERNEL1x8_SUB1
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+       lxvd2x  vs2,    o32,    AO
+       lxvd2x  vs3,    o48,    AO
+
+       lxvdsx  vs24,   0,      BO
+
+       addi            AO, AO, 64
+       addi            BO, BO, 8
+
+
+       xvmaddadp               vs32,   vs0,    vs24
+       xvmaddadp               vs33,   vs1,    vs24
+       xvmaddadp               vs34,   vs2,    vs24
+       xvmaddadp               vs35,   vs3,    vs24
+
+.endm
+
+.macro SAVE1x8
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+       lxvd2x          vs0,    0,      T1
+       lxvd2x          vs1,    o16,    T1
+       lxvd2x          vs2,    o32,    T1
+       lxvd2x          vs3,    o48,    T1
+#endif
+
+#ifndef TRMMKERNEL
+       xvmaddadp       vs0,    vs32,   alpha_r
+       xvmaddadp       vs1,    vs33,   alpha_r
+       xvmaddadp       vs2,    vs34,   alpha_r
+       xvmaddadp       vs3,    vs35,   alpha_r
+#else
+       xvmuldp         vs0,    vs32,   alpha_r
+       xvmuldp         vs1,    vs33,   alpha_r
+       xvmuldp         vs2,    vs34,   alpha_r
+       xvmuldp         vs3,    vs35,   alpha_r
+#endif
+
+       stxvd2x         vs0,    0,      T1
+       stxvd2x         vs1,    o16,    T1
+       stxvd2x         vs2,    o32,    T1
+       stxvd2x         vs3,    o48,    T1
+
+       addi            CO,     CO,     64
+
+.endm
+
+/*********************************************************************
+* Macros for N=1, M=4                                                *
+*********************************************************************/
+
+.macro LOAD1x4_1
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+
+       lxvdsx  vs24,   0,      BO
+
+       addi            AO, AO, 32
+       addi            BO, BO, 8
+
+.endm
+
+.macro KERNEL1x4_I1
+
+       lxvd2x  vs8,    0,      AO
+       lxvd2x  vs9,    o16,    AO
+
+       lxvdsx  vs28,   0,      BO
+
+       addi            AO, AO, 32
+       addi            BO, BO, 8
+
+
+       xvmuldp                 vs32,   vs0,    vs24
+       xvmuldp                 vs33,   vs1,    vs24
+
+.endm
+
+.macro KERNEL1x4_1
+
+       lxvd2x  vs8,    0,      AO
+       lxvd2x  vs9,    o16,    AO
+
+       lxvdsx  vs28,   0,      BO
+
+       addi            AO, AO, 32
+       addi            BO, BO, 8
+
+
+       xvmaddadp               vs32,   vs0,    vs24
+       xvmaddadp               vs33,   vs1,    vs24
+
+.endm
+
+.macro KERNEL1x4_2
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+
+       lxvdsx  vs24,   0,      BO
+
+       addi            AO, AO, 32
+       addi            BO, BO, 8
+
+
+       xvmaddadp               vs32,   vs8,    vs28
+       xvmaddadp               vs33,   vs9,    vs28
+
+.endm
+
+.macro KERNEL1x4_E2
+
+
+       xvmaddadp               vs32,   vs8,    vs28
+       xvmaddadp               vs33,   vs9,    vs28
+
+.endm
+
+.macro KERNEL1x4_SUBI1
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+
+       lxvdsx  vs24,   0,      BO
+
+       addi            AO, AO, 32
+       addi            BO, BO, 8
+
+
+       xvmuldp                 vs32,   vs0,    vs24
+       xvmuldp                 vs33,   vs1,    vs24
+
+.endm
+
+.macro KERNEL1x4_SUB1
+
+       lxvd2x  vs0,    0,      AO
+       lxvd2x  vs1,    o16,    AO
+
+       lxvdsx  vs24,   0,      BO
+
+       addi            AO, AO, 32
+       addi            BO, BO, 8
+
+
+       xvmaddadp               vs32,   vs0,    vs24
+       xvmaddadp               vs33,   vs1,    vs24
+
+.endm
+
+.macro SAVE1x4
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+       lxvd2x          vs0,    0,      T1
+       lxvd2x          vs1,    o16,    T1
+#endif
+
+#ifndef TRMMKERNEL
+       xvmaddadp       vs0,    vs32,   alpha_r
+       xvmaddadp       vs1,    vs33,   alpha_r
+#else
+       xvmuldp         vs0,    vs32,   alpha_r
+       xvmuldp         vs1,    vs33,   alpha_r
+#endif
+
+       stxvd2x         vs0,    0,      T1
+       stxvd2x         vs1,    o16,    T1
+
+       addi            CO,     CO,     32
+
+.endm
+
+/*********************************************************************
+* Macros for N=1, M=2                                                *
+*********************************************************************/
+
+.macro LOAD1x2_1
+
+       lxvd2x  vs0,    0,      AO
+
+       lxvdsx  vs24,   0,      BO
+
+       addi            AO, AO, 16
+       addi            BO, BO, 8
+
+.endm
+
+.macro KERNEL1x2_I1
+
+       lxvd2x  vs8,    0,      AO
+
+       lxvdsx  vs28,   0,      BO
+
+       addi            AO, AO, 16
+       addi            BO, BO, 8
+
+
+       xvmuldp                 vs32,   vs0,    vs24
+
+.endm
+
+.macro KERNEL1x2_1
+
+       lxvd2x  vs8,    0,      AO
+
+       lxvdsx  vs28,   0,      BO
+
+       addi            AO, AO, 16
+       addi            BO, BO, 8
+
+
+       xvmaddadp               vs32,   vs0,    vs24
+
+.endm
+
+.macro KERNEL1x2_2
+
+       lxvd2x  vs0,    0,      AO
+
+       lxvdsx  vs24,   0,      BO
+
+       addi            AO, AO, 16
+       addi            BO, BO, 8
+
+
+       xvmaddadp               vs32,   vs8,    vs28
+
+.endm
+
+.macro KERNEL1x2_E2
+
+
+       xvmaddadp               vs32,   vs8,    vs28
+
+.endm
+
+.macro KERNEL1x2_SUBI1
+
+       lxvd2x  vs0,    0,      AO
+
+       lxvdsx  vs24,   0,      BO
+
+       addi            AO, AO, 16
+       addi            BO, BO, 8
+
+
+       xvmuldp                 vs32,   vs0,    vs24
+
+.endm
+
+.macro KERNEL1x2_SUB1
+
+       lxvd2x  vs0,    0,      AO
+
+       lxvdsx  vs24,   0,      BO
+
+       addi            AO, AO, 16
+       addi            BO, BO, 8
+
+
+       xvmaddadp               vs32,   vs0,    vs24
+
+.endm
+
+.macro SAVE1x2
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+       lxvd2x          vs0,    0,      T1
+#endif
+
+#ifndef TRMMKERNEL
+       xvmaddadp       vs0,    vs32,   alpha_r
+#else
+       xvmuldp         vs0,    vs32,   alpha_r
+#endif
+
+       stxvd2x         vs0,    0,      T1
+
+       addi            CO,     CO,     16
+
+.endm
+
+/*********************************************************************
+* Macros for N=1, M=1                                                *
+*********************************************************************/
+
+.macro LOAD1x1_1
+
+       lxsdx   vs0,    0,      AO
+
+       lxsdx   vs24,   0,      BO
+
+       addi            AO, AO, 8
+       addi            BO, BO, 8
+
+.endm
+
+.macro KERNEL1x1_I1
+
+       lxsdx   vs8,    0,      AO
+
+       lxsdx   vs28,   0,      BO
+
+       addi            AO, AO, 8
+       addi            BO, BO, 8
+
+
+       xsmuldp                 vs32,   vs0,    vs24
+
+.endm
+
+.macro KERNEL1x1_1
+
+       lxsdx   vs8,    0,      AO
+
+       lxsdx   vs28,   0,      BO
+
+       addi            AO, AO, 8
+       addi            BO, BO, 8
+
+
+       xsmaddadp               vs32,   vs0,    vs24
+
+.endm
+
+.macro KERNEL1x1_2
+
+       lxsdx   vs0,    0,      AO
+
+       lxsdx   vs24,   0,      BO
+
+       addi            AO, AO, 8
+       addi            BO, BO, 8
+
+
+       xsmaddadp               vs32,   vs8,    vs28
+
+.endm
+
+.macro KERNEL1x1_E2
+
+
+       xsmaddadp               vs32,   vs8,    vs28
+
+.endm
+
+.macro KERNEL1x1_SUBI1
+
+       lxsdx   vs0,    0,      AO
+
+       lxsdx   vs24,   0,      BO
+
+       addi            AO, AO, 8
+       addi            BO, BO, 8
+
+
+       xsmuldp                 vs32,   vs0,    vs24
+
+.endm
+
+.macro KERNEL1x1_SUB1
+
+       lxsdx   vs0,    0,      AO
+
+       lxsdx   vs24,   0,      BO
+
+       addi            AO, AO, 8
+       addi            BO, BO, 8
+
+
+       xsmaddadp               vs32,   vs0,    vs24
+
+.endm
+
+.macro SAVE1x1
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+       lxsdx           vs0,    0,      T1
+#endif
+
+#ifndef TRMMKERNEL
+       xsmaddadp       vs0,    vs32,   alpha_r
+#else
+       xsmuldp         vs0,    vs32,   alpha_r
+#endif
+
+       stxsdx          vs0,    0,      T1
+
+       addi            CO,     CO,     8
+
+.endm
+
diff --git a/param.h b/param.h
index aa09f6d..e693755 100644 (file)
--- a/param.h
+++ b/param.h
@@ -410,7 +410,100 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-#if defined(STEAMROLLER) || defined(EXCAVATOR)
+#ifdef STEAMROLLER
+#define SNUMOPT         8
+#define DNUMOPT         4
+
+#define GEMM_DEFAULT_OFFSET_A  64
+#define GEMM_DEFAULT_OFFSET_B 832
+#define GEMM_DEFAULT_ALIGN 0x0fffUL
+
+
+
+#define QGEMM_DEFAULT_UNROLL_N 2
+#define CGEMM_DEFAULT_UNROLL_N 2
+#define ZGEMM_DEFAULT_UNROLL_N 2
+#define XGEMM_DEFAULT_UNROLL_N 1
+
+#ifdef ARCH_X86
+#define SGEMM_DEFAULT_UNROLL_N 4
+#define DGEMM_DEFAULT_UNROLL_N 4
+#define SGEMM_DEFAULT_UNROLL_M 4
+#define DGEMM_DEFAULT_UNROLL_M 2
+#define QGEMM_DEFAULT_UNROLL_M 2
+#define CGEMM_DEFAULT_UNROLL_M 2
+#define ZGEMM_DEFAULT_UNROLL_M 1
+#define XGEMM_DEFAULT_UNROLL_M 1
+#else
+#define SGEMM_DEFAULT_UNROLL_N 2
+#define DGEMM_DEFAULT_UNROLL_N 2
+#define SGEMM_DEFAULT_UNROLL_M 16
+#define DGEMM_DEFAULT_UNROLL_M 8
+#define QGEMM_DEFAULT_UNROLL_M 2
+#define CGEMM_DEFAULT_UNROLL_M 4
+#define ZGEMM_DEFAULT_UNROLL_M 2
+#define XGEMM_DEFAULT_UNROLL_M 1
+#define CGEMM3M_DEFAULT_UNROLL_N 4
+#define CGEMM3M_DEFAULT_UNROLL_M 8
+#define ZGEMM3M_DEFAULT_UNROLL_N 4
+#define ZGEMM3M_DEFAULT_UNROLL_M 4
+#define GEMV_UNROLL 8
+#endif
+
+#if defined(ARCH_X86_64)
+#define SGEMM_DEFAULT_P 768
+#define DGEMM_DEFAULT_P 576
+#define ZGEMM_DEFAULT_P 288
+#define CGEMM_DEFAULT_P 576
+#else
+#define SGEMM_DEFAULT_P 448
+#define DGEMM_DEFAULT_P 480
+#define ZGEMM_DEFAULT_P 112
+#define CGEMM_DEFAULT_P 224
+#endif
+#define QGEMM_DEFAULT_P 112
+#define XGEMM_DEFAULT_P  56
+
+#if defined(ARCH_X86_64)
+#define SGEMM_DEFAULT_Q 192
+#define DGEMM_DEFAULT_Q 160
+#define ZGEMM_DEFAULT_Q 160
+#define CGEMM_DEFAULT_Q 160
+#else
+#define SGEMM_DEFAULT_Q 224
+#define DGEMM_DEFAULT_Q 224
+#define ZGEMM_DEFAULT_Q 224
+#define CGEMM_DEFAULT_Q 224
+#endif
+#define QGEMM_DEFAULT_Q 224
+#define XGEMM_DEFAULT_Q 224
+
+#define CGEMM3M_DEFAULT_P 448
+#define ZGEMM3M_DEFAULT_P 224
+#define XGEMM3M_DEFAULT_P 112
+#define CGEMM3M_DEFAULT_Q 224
+#define ZGEMM3M_DEFAULT_Q 224
+#define XGEMM3M_DEFAULT_Q 224
+#define CGEMM3M_DEFAULT_R 12288
+#define ZGEMM3M_DEFAULT_R 12288
+#define XGEMM3M_DEFAULT_R 12288
+
+#define SGEMM_DEFAULT_R 12288
+#define QGEMM_DEFAULT_R qgemm_r
+#define DGEMM_DEFAULT_R 12288
+#define CGEMM_DEFAULT_R cgemm_r
+#define ZGEMM_DEFAULT_R zgemm_r
+#define XGEMM_DEFAULT_R xgemm_r
+
+#define SYMV_P  16
+#define HAVE_EXCLUSIVE_CACHE
+
+#define GEMM_THREAD gemm_thread_mn
+
+#endif
+
+
+#ifdef EXCAVATOR
 #define SNUMOPT         8
 #define DNUMOPT         4
 
@@ -1885,12 +1978,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define ZGEMM_DEFAULT_UNROLL_N 2
 
 #define SGEMM_DEFAULT_P  1280
-#define DGEMM_DEFAULT_P  640
+#define DGEMM_DEFAULT_P  768
 #define CGEMM_DEFAULT_P  640
 #define ZGEMM_DEFAULT_P  320
 
 #define SGEMM_DEFAULT_Q  640
-#define DGEMM_DEFAULT_Q  640
+#define DGEMM_DEFAULT_Q  768
 #define CGEMM_DEFAULT_Q  640
 #define ZGEMM_DEFAULT_Q  640