Added SGEMM and STRMM kernels for POWER8
author    Werner Saar <wernsaar@googlemail.com>
Mon, 14 Mar 2016 12:52:44 +0000 (13:52 +0100)
committer Werner Saar <wernsaar@googlemail.com>
Mon, 14 Mar 2016 12:52:44 +0000 (13:52 +0100)
kernel/power/KERNEL.POWER8
kernel/power/sgemm_kernel_16x8_power8.S [new file with mode: 0644]
kernel/power/sgemm_logic_16x8_power8.S [new file with mode: 0644]
kernel/power/sgemm_macros_16x8_power8.S [new file with mode: 0644]
kernel/power/strmm_kernel_16x8_power8.S [new file with mode: 0644]
kernel/power/strmm_logic_16x8_power8.S [new file with mode: 0644]
param.h

diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8
index 760d568..d40b20d 100644
@@ -3,14 +3,18 @@
 #CGEMM_BETA = ../generic/zgemm_beta.c
 #ZGEMM_BETA = ../generic/zgemm_beta.c
 
-STRMMKERNEL    = gemm_kernel_power6.S
+STRMMKERNEL    = strmm_kernel_16x8_power8.S
 DTRMMKERNEL    = dtrmm_kernel_16x4_power8.S
 CTRMMKERNEL    = ../generic/ztrmmkernel_2x2.c
 ZTRMMKERNEL    = ztrmm_kernel_8x2_power8.S
 
-SGEMMKERNEL    =  gemm_kernel_power6.S
-SGEMMONCOPY    =  ../generic/gemm_ncopy_4.c
-SGEMMOTCOPY    =  ../generic/gemm_tcopy_4.c
+SGEMMKERNEL    =  sgemm_kernel_16x8_power8.S
+SGEMMINCOPY    = ../generic/gemm_ncopy_16.c
+SGEMMITCOPY    = ../generic/gemm_tcopy_16.c
+SGEMMONCOPY    =  ../generic/gemm_ncopy_8.c
+SGEMMOTCOPY    =  ../generic/gemm_tcopy_8.c
+SGEMMINCOPYOBJ =  sgemm_incopy.o
+SGEMMITCOPYOBJ =  sgemm_itcopy.o
 SGEMMONCOPYOBJ =  sgemm_oncopy.o
 SGEMMOTCOPYOBJ =  sgemm_otcopy.o
 
@@ -146,7 +150,7 @@ ZTRSMKERNEL_RT      = ../generic/trsm_kernel_RT.c
 #SGEMVTKERNEL = ../arm/gemv_t.c
 #DGEMVTKERNEL = ../arm/gemv_t.c
 #CGEMVTKERNEL = ../arm/zgemv_t.c
-#ZGEMVTKERNEL = ../arm/zgemv_t.c
+ZGEMVTKERNEL = zgemv_t_4.c
 
 
 #SSYMV_U_KERNEL =  ../generic/symv_k.c
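
The copy-kernel changes above follow directly from the new 16x8 micro-tile: A has to be packed into 16-row panels (SGEMMINCOPY/SGEMMITCOPY now use the generic 16-wide copy routines) and B into 8-column panels (SGEMMONCOPY/SGEMMOTCOPY use the 8-wide ones). As a rough illustration only -- a hedged C sketch, not the contents of the generic copy files referenced in the Makefile -- an "incopy"-style packing of the A side looks roughly like this:

    /*
     * Hedged sketch of a 16-row "incopy" packing step (illustrative only,
     * not ../generic/gemm_ncopy_16.c). a is column-major m x n with leading
     * dimension lda; b receives contiguous 16-row panels so the 16x8
     * micro-kernel can stream them with unit stride.
     */
    static void pack_a_16(long m, long n, const float *a, long lda, float *b)
    {
            long i, j, ii;

            for (i = 0; i + 16 <= m; i += 16)        /* full 16-row panels   */
                    for (j = 0; j < n; j++)          /* one column at a time */
                            for (ii = 0; ii < 16; ii++)
                                    *b++ = a[(i + ii) + j * lda];

            /* simplified tail: pack leftover rows one at a time; the real
             * copy kernels fall back to narrower (8/4/2/1) panels instead  */
            for (; i < m; i++)
                    for (j = 0; j < n; j++)
                            *b++ = a[i + j * lda];
    }

The B side is packed analogously in 8-column panels, which is why the ONCOPY/OTCOPY entries switch from the 4-wide to the 8-wide generic routines.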
diff --git a/kernel/power/sgemm_kernel_16x8_power8.S b/kernel/power/sgemm_kernel_16x8_power8.S
new file mode 100644
index 0000000..9f22130
--- /dev/null
@@ -0,0 +1,354 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/14 Werner Saar (wernsaar@googlemail.com)
+*       BLASTEST               : OK
+*       CTEST                  : OK
+*       TEST                   : OK
+**************************************************************************************/
+
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* All rights reserved.                                              */
+/*                                                                   */
+/* Redistribution and use in source and binary forms, with or        */
+/* without modification, are permitted provided that the following   */
+/* conditions are met:                                               */
+/*                                                                   */
+/*   1. Redistributions of source code must retain the above         */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer.                                                  */
+/*                                                                   */
+/*   2. Redistributions in binary form must reproduce the above      */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer in the documentation and/or other materials       */
+/*      provided with the distribution.                              */
+/*                                                                   */
+/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
+/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
+/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
+/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
+/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
+/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
+/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
+/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
+/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
+/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
+/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
+/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
+/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
+/*    POSSIBILITY OF SUCH DAMAGE.                                    */
+/*                                                                   */
+/* The views and conclusions contained in the software and           */
+/* documentation are those of the authors and should not be          */
+/* interpreted as representing official policies, either expressed   */
+/* or implied, of The University of Texas at Austin.                 */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "def_vsx.h"
+
+#ifndef __64BIT__
+#define LOAD   lwz
+#else
+#define LOAD   ld
+#endif
+
+#ifdef __64BIT__
+#define STACKSIZE 320
+#define ALPHA_SP   296(SP)
+#define FZERO  304(SP)
+#else
+#define STACKSIZE 240
+#define ALPHA_SP   224(SP)
+#define FZERO  232(SP)
+#endif
+
+#define        M       r3
+#define        N       r4
+#define        K       r5
+
+#ifdef linux
+#ifndef __64BIT__
+#define A      r6
+#define        B       r7
+#define        C       r8
+#define        LDC     r9
+#define OFFSET r10
+#else
+#define A      r7
+#define        B       r8
+#define        C       r9
+#define        LDC     r10
+#define OFFSET r6
+#endif
+#endif
+
+#if defined(_AIX) || defined(__APPLE__)
+#if !defined(__64BIT__) && defined(DOUBLE)
+#define A      r8
+#define        B       r9
+#define        C       r10
+#define        LDC     r7
+#define OFFSET r6
+#else
+#define A      r7
+#define        B       r8
+#define        C       r9
+#define        LDC     r10
+#define OFFSET r6
+#endif
+#endif
+
+#define alpha_r vs30
+#define alpha_vr vs31
+
+#define o0     0
+
+#define o4     r15
+#define o12    r16
+#define o8     r17
+#define L      r18
+#define T1     r19
+#define KK     r20
+#define BB     r21
+#define        I       r22
+#define J      r23
+#define AO     r24
+#define        BO      r25
+#define        CO      r26
+#define o16    r27
+#define        o32     r28
+#define        o48     r29
+
+#define PRE    r30
+#define T2     r31
+
+#include "sgemm_macros_16x8_power8.S"
+
+
+#ifndef NEEDPARAM
+
+       PROLOGUE
+       PROFCODE
+
+       addi    SP, SP, -STACKSIZE
+       li      r0, 0
+
+       stfd    f14,    0(SP)
+       stfd    f15,    8(SP)
+       stfd    f16,   16(SP)
+       stfd    f17,   24(SP)
+
+       stfd    f18,   32(SP)
+       stfd    f19,   40(SP)
+       stfd    f20,   48(SP)
+       stfd    f21,   56(SP)
+
+       stfd    f22,   64(SP)
+       stfd    f23,   72(SP)
+       stfd    f24,   80(SP)
+       stfd    f25,   88(SP)
+
+       stfd    f26,   96(SP)
+       stfd    f27,  104(SP)
+       stfd    f28,  112(SP)
+       stfd    f29,  120(SP)
+
+       stfd    f30,  128(SP)
+       stfd    f31,  136(SP)
+
+#ifdef __64BIT__
+       std     r31,  144(SP)
+       std     r30,  152(SP)
+       std     r29,  160(SP)
+       std     r28,  168(SP)
+       std     r27,  176(SP)
+       std     r26,  184(SP)
+       std     r25,  192(SP)
+       std     r24,  200(SP)
+       std     r23,  208(SP)
+       std     r22,  216(SP)
+       std     r21,  224(SP)
+       std     r20,  232(SP)
+       std     r19,  240(SP)
+       std     r18,  248(SP)
+       std     r17,  256(SP)
+       std     r16,  264(SP)
+       std     r15,  272(SP)
+#else
+       stw     r31,  144(SP)
+       stw     r30,  148(SP)
+       stw     r29,  152(SP)
+       stw     r28,  156(SP)
+       stw     r27,  160(SP)
+       stw     r26,  164(SP)
+       stw     r25,  168(SP)
+       stw     r24,  172(SP)
+       stw     r23,  176(SP)
+       stw     r22,  180(SP)
+       stw     r21,  184(SP)
+       stw     r20,  188(SP)
+       stw     r19,  192(SP)
+       stw     r18,  196(SP)
+       stw     r17,  200(SP)
+       stw     r16,  204(SP)
+       stw     r15,  208(SP)
+#endif
+
+       // stfd f1,  ALPHA_SP
+       // stw  r0,  FZERO
+
+#if defined(_AIX) || defined(__APPLE__)
+#if !defined(__64BIT__) && defined(DOUBLE)
+       lwz     LDC, FRAMESLOT(0) + STACKSIZE(SP)
+#endif
+#endif
+
+       slwi    LDC, LDC, 2
+
+#if defined(TRMMKERNEL)
+#if defined(linux) && defined(__64BIT__)
+       ld      OFFSET,  FRAMESLOT(0) + STACKSIZE(SP)
+#endif
+
+#if defined(_AIX) || defined(__APPLE__)
+#ifdef __64BIT__
+       ld      OFFSET,  FRAMESLOT(0) + STACKSIZE(SP)
+#else
+#ifdef DOUBLE
+       lwz     OFFSET,  FRAMESLOT(1) + STACKSIZE(SP)
+#else
+       lwz     OFFSET,  FRAMESLOT(0) + STACKSIZE(SP)
+#endif
+#endif
+#endif
+#endif
+
+
+       cmpwi   cr0, M, 0
+       ble     .L999_H1
+       cmpwi   cr0, N, 0
+       ble     .L999_H1
+       cmpwi   cr0, K, 0
+       ble     .L999_H1
+
+       li      PRE, 384 
+       li      o4 , 4
+       li      o8 , 8
+       li      o12, 12
+       li      o16, 16
+       li      o32, 32
+       li      o48, 48
+
+        addi    T1, SP, 300
+        stfs    f1, 0(T1)
+        stfs    f1, 4(T1)
+        stfs    f1, 8(T1)
+        stfs    f1,12(T1)
+
+        lxsspx  vs28, 0, T1
+
+        xxspltw alpha_r, vs28 , 0 
+        lxvw4x  alpha_vr, 0, T1
+
+
+
+#include "sgemm_logic_16x8_power8.S"
+
+.L999:
+       addi    r3, 0, 0
+
+       lfd     f14,    0(SP)
+       lfd     f15,    8(SP)
+       lfd     f16,   16(SP)
+       lfd     f17,   24(SP)
+
+       lfd     f18,   32(SP)
+       lfd     f19,   40(SP)
+       lfd     f20,   48(SP)
+       lfd     f21,   56(SP)
+
+       lfd     f22,   64(SP)
+       lfd     f23,   72(SP)
+       lfd     f24,   80(SP)
+       lfd     f25,   88(SP)
+
+       lfd     f26,   96(SP)
+       lfd     f27,  104(SP)
+       lfd     f28,  112(SP)
+       lfd     f29,  120(SP)
+
+       lfd     f30,  128(SP)
+       lfd     f31,  136(SP)
+
+#ifdef __64BIT__
+       ld      r31,  144(SP)
+       ld      r30,  152(SP)
+       ld      r29,  160(SP)
+       ld      r28,  168(SP)
+       ld      r27,  176(SP)
+       ld      r26,  184(SP)
+       ld      r25,  192(SP)
+       ld      r24,  200(SP)
+       ld      r23,  208(SP)
+       ld      r22,  216(SP)
+       ld      r21,  224(SP)
+       ld      r20,  232(SP)
+       ld      r19,  240(SP)
+       ld      r18,  248(SP)
+       ld      r17,  256(SP)
+       ld      r16,  264(SP)
+       ld      r15,  272(SP)
+#else
+       lwz     r31,  144(SP)
+       lwz     r30,  148(SP)
+       lwz     r29,  152(SP)
+       lwz     r28,  156(SP)
+       lwz     r27,  160(SP)
+       lwz     r26,  164(SP)
+       lwz     r25,  168(SP)
+       lwz     r24,  172(SP)
+       lwz     r23,  176(SP)
+       lwz     r22,  180(SP)
+       lwz     r21,  184(SP)
+       lwz     r20,  188(SP)
+       lwz     r19,  192(SP)
+       lwz     r18,  196(SP)
+       lwz     r17,  200(SP)
+       lwz     r16,  204(SP)
+       lwz     r15,  208(SP)
+#endif
+
+       addi    SP, SP, STACKSIZE
+
+       blr
+
+       EPILOGUE
+#endif
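
Apart from the standard save/restore of f14-f31 and r15-r31 and the LDC scaling by 4 (sizeof(float)), the only non-obvious part of the prologue above is the alpha handling: the scalar alpha arrives in f1, is stored four times to the stack, and is reloaded as a full vector (alpha_vr) plus a splatted word (alpha_r) so the SAVE macros can scale four results per VSX multiply. A hedged C sketch of the same effect, assuming GCC/Clang Altivec intrinsics:

    #include <altivec.h>

    /* Hedged sketch: replicate a scalar alpha across a 4 x float vector,
     * the same result the assembly obtains by storing f1 four times to the
     * stack and reloading it with lxvw4x / xxspltw.                        */
    static vector float splat_alpha(float alpha)
    {
            return vec_splats(alpha);   /* {alpha, alpha, alpha, alpha} */
    }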
diff --git a/kernel/power/sgemm_logic_16x8_power8.S b/kernel/power/sgemm_logic_16x8_power8.S
new file mode 100644
index 0000000..6c5a1c7
--- /dev/null
@@ -0,0 +1,2172 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/14 Werner Saar (wernsaar@googlemail.com)
+*       BLASTEST               : OK
+*       CTEST                  : OK
+*       TEST                   : OK
+**************************************************************************************/
+
+
+       srawi.          J,      N,      3
+       ble             .LSGEMM_L8_END
+
+.LSGEMM_L8_BEGIN:
+
+       mr              CO,     C
+       mr              AO,     A
+       slwi            T1,     LDC     ,       3
+       add             C,      C,      T1
+       srawi.          I,      M,      4
+       ble             .LSGEMM_L8x16_END
+
+.LSGEMM_L8x16_BEGIN:
+
+
+       mr              BO,     B
+       srawi.          L,      K,      3
+       ble             .LSGEMM_L8x16_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LSGEMM_L8x16_SUB4
+
+.LSGEMM_L8x16_LOOP_START:
+
+       dcbt            AO,     PRE
+       LOAD8x16_1
+       KERNEL8x16_I1
+       dcbt            AO,     PRE
+       KERNEL8x16_2
+       KERNEL8x16_1
+       dcbt            AO,     PRE
+       KERNEL8x16_2
+
+       KERNEL8x16_1
+       dcbt            AO,     PRE
+       KERNEL8x16_2
+       KERNEL8x16_1
+       dcbt            AO,     PRE
+       KERNEL8x16_2
+
+       addic.          L,      L,      -2
+       ble             .LSGEMM_L8x16_LOOP_END
+
+       .align 5
+
+.LSGEMM_L8x16_LOOP:
+
+       KERNEL8x16_1
+       dcbt            AO,     PRE
+       KERNEL8x16_2
+       KERNEL8x16_1
+       dcbt            AO,     PRE
+       KERNEL8x16_2
+
+       KERNEL8x16_1
+       dcbt            AO,     PRE
+       KERNEL8x16_2
+       KERNEL8x16_1
+       dcbt            AO,     PRE
+       KERNEL8x16_2
+
+       addic.          L,      L,      -1
+       bgt             .LSGEMM_L8x16_LOOP
+
+.LSGEMM_L8x16_LOOP_END:
+
+       KERNEL8x16_1
+       dcbt            AO,     PRE
+       KERNEL8x16_2
+       KERNEL8x16_1
+       dcbt            AO,     PRE
+       KERNEL8x16_2
+
+       KERNEL8x16_1
+       dcbt            AO,     PRE
+       KERNEL8x16_2
+       KERNEL8x16_1
+       KERNEL8x16_E2
+
+       b               .LSGEMM_L8x16_SUB1
+
+.LSGEMM_L8x16_SUB4:
+
+       dcbt            AO,     PRE
+       KERNEL8x16_SUBI1
+       KERNEL8x16_SUB1
+       dcbt            AO,     PRE
+       KERNEL8x16_SUB1
+       KERNEL8x16_SUB1
+
+       KERNEL8x16_SUB1
+       KERNEL8x16_SUB1
+       KERNEL8x16_SUB1
+       KERNEL8x16_SUB1
+
+       b               .LSGEMM_L8x16_SUB1
+
+.LSGEMM_L8x16_SUB0:
+
+       andi.           L,      K,      7
+
+       KERNEL8x16_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LSGEMM_L8x16_SAVE
+       b               .LSGEMM_L8x16_SUB2
+
+.LSGEMM_L8x16_SUB1:
+
+       andi.           L,      K,      7
+       ble             .LSGEMM_L8x16_SAVE
+
+.LSGEMM_L8x16_SUB2:
+
+       KERNEL8x16_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LSGEMM_L8x16_SUB2
+
+.LSGEMM_L8x16_SAVE:
+
+       SAVE8x16
+
+       addic.          I,      I,      -1
+       bgt             .LSGEMM_L8x16_BEGIN
+
+.LSGEMM_L8x16_END:
+
+.LSGEMM_L8x8_BEGIN:
+
+       andi.           T2,     M,      15
+       ble             .LSGEMM_L8x1_END
+
+       andi.           T1,     M,      8
+       ble             .LSGEMM_L8x8_END
+       mr              BO,     B
+       srawi.          L,      K,      3
+       ble             .LSGEMM_L8x8_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LSGEMM_L8x8_SUB4
+
+.LSGEMM_L8x8_LOOP_START:
+
+       LOAD8x8_1
+       KERNEL8x8_I1
+       KERNEL8x8_2
+       KERNEL8x8_1
+       KERNEL8x8_2
+
+       KERNEL8x8_1
+       KERNEL8x8_2
+       KERNEL8x8_1
+       KERNEL8x8_2
+
+       addic.          L,      L,      -2
+       ble             .LSGEMM_L8x8_LOOP_END
+
+       .align 5
+
+.LSGEMM_L8x8_LOOP:
+
+       KERNEL8x8_1
+       KERNEL8x8_2
+       KERNEL8x8_1
+       KERNEL8x8_2
+
+       KERNEL8x8_1
+       KERNEL8x8_2
+       KERNEL8x8_1
+       KERNEL8x8_2
+
+       addic.          L,      L,      -1
+       bgt             .LSGEMM_L8x8_LOOP
+
+.LSGEMM_L8x8_LOOP_END:
+
+       KERNEL8x8_1
+       KERNEL8x8_2
+       KERNEL8x8_1
+       KERNEL8x8_2
+
+       KERNEL8x8_1
+       KERNEL8x8_2
+       KERNEL8x8_1
+       KERNEL8x8_E2
+
+       b               .LSGEMM_L8x8_SUB1
+
+.LSGEMM_L8x8_SUB4:
+
+       KERNEL8x8_SUBI1
+       KERNEL8x8_SUB1
+       KERNEL8x8_SUB1
+       KERNEL8x8_SUB1
+
+       KERNEL8x8_SUB1
+       KERNEL8x8_SUB1
+       KERNEL8x8_SUB1
+       KERNEL8x8_SUB1
+
+       b               .LSGEMM_L8x8_SUB1
+
+.LSGEMM_L8x8_SUB0:
+
+       andi.           L,      K,      7
+
+       KERNEL8x8_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LSGEMM_L8x8_SAVE
+       b               .LSGEMM_L8x8_SUB2
+
+.LSGEMM_L8x8_SUB1:
+
+       andi.           L,      K,      7
+       ble             .LSGEMM_L8x8_SAVE
+
+.LSGEMM_L8x8_SUB2:
+
+       KERNEL8x8_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LSGEMM_L8x8_SUB2
+
+.LSGEMM_L8x8_SAVE:
+
+       SAVE8x8
+
+.LSGEMM_L8x8_END:
+
+.LSGEMM_L8x4_BEGIN:
+
+
+       andi.           T1,     M,      4
+       ble             .LSGEMM_L8x4_END
+       mr              BO,     B
+       srawi.          L,      K,      3
+       ble             .LSGEMM_L8x4_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LSGEMM_L8x4_SUB4
+
+.LSGEMM_L8x4_LOOP_START:
+
+       LOAD8x4_1
+       KERNEL8x4_I1
+       KERNEL8x4_2
+       KERNEL8x4_1
+       KERNEL8x4_2
+
+       KERNEL8x4_1
+       KERNEL8x4_2
+       KERNEL8x4_1
+       KERNEL8x4_2
+
+       addic.          L,      L,      -2
+       ble             .LSGEMM_L8x4_LOOP_END
+
+       .align 5
+
+.LSGEMM_L8x4_LOOP:
+
+       KERNEL8x4_1
+       KERNEL8x4_2
+       KERNEL8x4_1
+       KERNEL8x4_2
+
+       KERNEL8x4_1
+       KERNEL8x4_2
+       KERNEL8x4_1
+       KERNEL8x4_2
+
+       addic.          L,      L,      -1
+       bgt             .LSGEMM_L8x4_LOOP
+
+.LSGEMM_L8x4_LOOP_END:
+
+       KERNEL8x4_1
+       KERNEL8x4_2
+       KERNEL8x4_1
+       KERNEL8x4_2
+
+       KERNEL8x4_1
+       KERNEL8x4_2
+       KERNEL8x4_1
+       KERNEL8x4_E2
+
+       b               .LSGEMM_L8x4_SUB1
+
+.LSGEMM_L8x4_SUB4:
+
+       KERNEL8x4_SUBI1
+       KERNEL8x4_SUB1
+       KERNEL8x4_SUB1
+       KERNEL8x4_SUB1
+
+       KERNEL8x4_SUB1
+       KERNEL8x4_SUB1
+       KERNEL8x4_SUB1
+       KERNEL8x4_SUB1
+
+       b               .LSGEMM_L8x4_SUB1
+
+.LSGEMM_L8x4_SUB0:
+
+       andi.           L,      K,      7
+
+       KERNEL8x4_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LSGEMM_L8x4_SAVE
+       b               .LSGEMM_L8x4_SUB2
+
+.LSGEMM_L8x4_SUB1:
+
+       andi.           L,      K,      7
+       ble             .LSGEMM_L8x4_SAVE
+
+.LSGEMM_L8x4_SUB2:
+
+       KERNEL8x4_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LSGEMM_L8x4_SUB2
+
+.LSGEMM_L8x4_SAVE:
+
+       SAVE8x4
+
+.LSGEMM_L8x4_END:
+
+.LSGEMM_L8x2_BEGIN:
+
+
+       andi.           T1,     M,      2
+       ble             .LSGEMM_L8x2_END
+       mr              BO,     B
+       srawi.          L,      K,      3
+       ble             .LSGEMM_L8x2_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LSGEMM_L8x2_SUB4
+
+.LSGEMM_L8x2_LOOP_START:
+
+       LOAD8x2_1
+       KERNEL8x2_I1
+       KERNEL8x2_2
+       KERNEL8x2_1
+       KERNEL8x2_2
+
+       KERNEL8x2_1
+       KERNEL8x2_2
+       KERNEL8x2_1
+       KERNEL8x2_2
+
+       addic.          L,      L,      -2
+       ble             .LSGEMM_L8x2_LOOP_END
+
+       .align 5
+
+.LSGEMM_L8x2_LOOP:
+
+       KERNEL8x2_1
+       KERNEL8x2_2
+       KERNEL8x2_1
+       KERNEL8x2_2
+
+       KERNEL8x2_1
+       KERNEL8x2_2
+       KERNEL8x2_1
+       KERNEL8x2_2
+
+       addic.          L,      L,      -1
+       bgt             .LSGEMM_L8x2_LOOP
+
+.LSGEMM_L8x2_LOOP_END:
+
+       KERNEL8x2_1
+       KERNEL8x2_2
+       KERNEL8x2_1
+       KERNEL8x2_2
+
+       KERNEL8x2_1
+       KERNEL8x2_2
+       KERNEL8x2_1
+       KERNEL8x2_E2
+
+       b               .LSGEMM_L8x2_SUB1
+
+.LSGEMM_L8x2_SUB4:
+
+       KERNEL8x2_SUBI1
+       KERNEL8x2_SUB1
+       KERNEL8x2_SUB1
+       KERNEL8x2_SUB1
+
+       KERNEL8x2_SUB1
+       KERNEL8x2_SUB1
+       KERNEL8x2_SUB1
+       KERNEL8x2_SUB1
+
+       b               .LSGEMM_L8x2_SUB1
+
+.LSGEMM_L8x2_SUB0:
+
+       andi.           L,      K,      7
+
+       KERNEL8x2_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LSGEMM_L8x2_SAVE
+       b               .LSGEMM_L8x2_SUB2
+
+.LSGEMM_L8x2_SUB1:
+
+       andi.           L,      K,      7
+       ble             .LSGEMM_L8x2_SAVE
+
+.LSGEMM_L8x2_SUB2:
+
+       KERNEL8x2_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LSGEMM_L8x2_SUB2
+
+.LSGEMM_L8x2_SAVE:
+
+       SAVE8x2
+
+.LSGEMM_L8x2_END:
+
+.LSGEMM_L8x1_BEGIN:
+
+
+       andi.           T1,     M,      1
+       ble             .LSGEMM_L8x1_END
+       mr              BO,     B
+       srawi.          L,      K,      3
+       ble             .LSGEMM_L8x1_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LSGEMM_L8x1_SUB4
+
+.LSGEMM_L8x1_LOOP_START:
+
+       LOAD8x1_1
+       KERNEL8x1_I1
+       KERNEL8x1_2
+       KERNEL8x1_1
+       KERNEL8x1_2
+
+       KERNEL8x1_1
+       KERNEL8x1_2
+       KERNEL8x1_1
+       KERNEL8x1_2
+
+       addic.          L,      L,      -2
+       ble             .LSGEMM_L8x1_LOOP_END
+
+       .align 5
+
+.LSGEMM_L8x1_LOOP:
+
+       KERNEL8x1_1
+       KERNEL8x1_2
+       KERNEL8x1_1
+       KERNEL8x1_2
+
+       KERNEL8x1_1
+       KERNEL8x1_2
+       KERNEL8x1_1
+       KERNEL8x1_2
+
+       addic.          L,      L,      -1
+       bgt             .LSGEMM_L8x1_LOOP
+
+.LSGEMM_L8x1_LOOP_END:
+
+       KERNEL8x1_1
+       KERNEL8x1_2
+       KERNEL8x1_1
+       KERNEL8x1_2
+
+       KERNEL8x1_1
+       KERNEL8x1_2
+       KERNEL8x1_1
+       KERNEL8x1_E2
+
+       b               .LSGEMM_L8x1_SUB1
+
+.LSGEMM_L8x1_SUB4:
+
+       KERNEL8x1_SUBI1
+       KERNEL8x1_SUB1
+       KERNEL8x1_SUB1
+       KERNEL8x1_SUB1
+
+       KERNEL8x1_SUB1
+       KERNEL8x1_SUB1
+       KERNEL8x1_SUB1
+       KERNEL8x1_SUB1
+
+       b               .LSGEMM_L8x1_SUB1
+
+.LSGEMM_L8x1_SUB0:
+
+       andi.           L,      K,      7
+
+       KERNEL8x1_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LSGEMM_L8x1_SAVE
+       b               .LSGEMM_L8x1_SUB2
+
+.LSGEMM_L8x1_SUB1:
+
+       andi.           L,      K,      7
+       ble             .LSGEMM_L8x1_SAVE
+
+.LSGEMM_L8x1_SUB2:
+
+       KERNEL8x1_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LSGEMM_L8x1_SUB2
+
+.LSGEMM_L8x1_SAVE:
+
+       SAVE8x1
+
+.LSGEMM_L8x1_END:
+
+       slwi            T1,     K,      5
+       add             B,      B,      T1
+
+       addic.          J,      J,      -1
+       bgt             .LSGEMM_L8_BEGIN
+
+       andi.           T2,     N,      7
+       ble             .L999
+
+.LSGEMM_L8_END:
+
+       b               .LSGEMM_L4_BEGIN
+
+.L999_H1:
+
+       b               .L999
+
+.LSGEMM_L4_BEGIN:
+
+       andi.           T1,     N,      4
+       ble             .LSGEMM_L4_END
+       mr              CO,     C
+       mr              AO,     A
+       slwi            T1,     LDC     ,       2
+       add             C,      C,      T1
+       srawi.          I,      M,      4
+       ble             .LSGEMM_L4x16_END
+
+.LSGEMM_L4x16_BEGIN:
+
+
+       mr              BO,     B
+       srawi.          L,      K,      3
+       ble             .LSGEMM_L4x16_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LSGEMM_L4x16_SUB4
+
+.LSGEMM_L4x16_LOOP_START:
+
+       dcbt            AO,     PRE
+       LOAD4x16_1
+       KERNEL4x16_I1
+       dcbt            AO,     PRE
+       KERNEL4x16_2
+       KERNEL4x16_1
+       dcbt            AO,     PRE
+       KERNEL4x16_2
+
+       KERNEL4x16_1
+       dcbt            AO,     PRE
+       KERNEL4x16_2
+       KERNEL4x16_1
+       dcbt            AO,     PRE
+       KERNEL4x16_2
+
+       addic.          L,      L,      -2
+       ble             .LSGEMM_L4x16_LOOP_END
+
+       .align 5
+
+.LSGEMM_L4x16_LOOP:
+
+       KERNEL4x16_1
+       dcbt            AO,     PRE
+       KERNEL4x16_2
+       KERNEL4x16_1
+       dcbt            AO,     PRE
+       KERNEL4x16_2
+
+       KERNEL4x16_1
+       dcbt            AO,     PRE
+       KERNEL4x16_2
+       KERNEL4x16_1
+       dcbt            AO,     PRE
+       KERNEL4x16_2
+
+       addic.          L,      L,      -1
+       bgt             .LSGEMM_L4x16_LOOP
+
+.LSGEMM_L4x16_LOOP_END:
+
+       KERNEL4x16_1
+       dcbt            AO,     PRE
+       KERNEL4x16_2
+       KERNEL4x16_1
+       dcbt            AO,     PRE
+       KERNEL4x16_2
+
+       KERNEL4x16_1
+       dcbt            AO,     PRE
+       KERNEL4x16_2
+       KERNEL4x16_1
+       KERNEL4x16_E2
+
+       b               .LSGEMM_L4x16_SUB1
+
+.LSGEMM_L4x16_SUB4:
+
+       dcbt            AO,     PRE
+       KERNEL4x16_SUBI1
+       KERNEL4x16_SUB1
+       dcbt            AO,     PRE
+       KERNEL4x16_SUB1
+       KERNEL4x16_SUB1
+
+       KERNEL4x16_SUB1
+       KERNEL4x16_SUB1
+       KERNEL4x16_SUB1
+       KERNEL4x16_SUB1
+
+       b               .LSGEMM_L4x16_SUB1
+
+.LSGEMM_L4x16_SUB0:
+
+       andi.           L,      K,      7
+
+       KERNEL4x16_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LSGEMM_L4x16_SAVE
+       b               .LSGEMM_L4x16_SUB2
+
+.LSGEMM_L4x16_SUB1:
+
+       andi.           L,      K,      7
+       ble             .LSGEMM_L4x16_SAVE
+
+.LSGEMM_L4x16_SUB2:
+
+       KERNEL4x16_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LSGEMM_L4x16_SUB2
+
+.LSGEMM_L4x16_SAVE:
+
+       SAVE4x16
+
+       addic.          I,      I,      -1
+       bgt             .LSGEMM_L4x16_BEGIN
+
+.LSGEMM_L4x16_END:
+
+.LSGEMM_L4x8_BEGIN:
+
+       andi.           T2,     M,      15
+       ble             .LSGEMM_L4x1_END
+
+       andi.           T1,     M,      8
+       ble             .LSGEMM_L4x8_END
+       mr              BO,     B
+       srawi.          L,      K,      3
+       ble             .LSGEMM_L4x8_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LSGEMM_L4x8_SUB4
+
+.LSGEMM_L4x8_LOOP_START:
+
+       LOAD4x8_1
+       KERNEL4x8_I1
+       KERNEL4x8_2
+       KERNEL4x8_1
+       KERNEL4x8_2
+
+       KERNEL4x8_1
+       KERNEL4x8_2
+       KERNEL4x8_1
+       KERNEL4x8_2
+
+       addic.          L,      L,      -2
+       ble             .LSGEMM_L4x8_LOOP_END
+
+       .align 5
+
+.LSGEMM_L4x8_LOOP:
+
+       KERNEL4x8_1
+       KERNEL4x8_2
+       KERNEL4x8_1
+       KERNEL4x8_2
+
+       KERNEL4x8_1
+       KERNEL4x8_2
+       KERNEL4x8_1
+       KERNEL4x8_2
+
+       addic.          L,      L,      -1
+       bgt             .LSGEMM_L4x8_LOOP
+
+.LSGEMM_L4x8_LOOP_END:
+
+       KERNEL4x8_1
+       KERNEL4x8_2
+       KERNEL4x8_1
+       KERNEL4x8_2
+
+       KERNEL4x8_1
+       KERNEL4x8_2
+       KERNEL4x8_1
+       KERNEL4x8_E2
+
+       b               .LSGEMM_L4x8_SUB1
+
+.LSGEMM_L4x8_SUB4:
+
+       KERNEL4x8_SUBI1
+       KERNEL4x8_SUB1
+       KERNEL4x8_SUB1
+       KERNEL4x8_SUB1
+
+       KERNEL4x8_SUB1
+       KERNEL4x8_SUB1
+       KERNEL4x8_SUB1
+       KERNEL4x8_SUB1
+
+       b               .LSGEMM_L4x8_SUB1
+
+.LSGEMM_L4x8_SUB0:
+
+       andi.           L,      K,      7
+
+       KERNEL4x8_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LSGEMM_L4x8_SAVE
+       b               .LSGEMM_L4x8_SUB2
+
+.LSGEMM_L4x8_SUB1:
+
+       andi.           L,      K,      7
+       ble             .LSGEMM_L4x8_SAVE
+
+.LSGEMM_L4x8_SUB2:
+
+       KERNEL4x8_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LSGEMM_L4x8_SUB2
+
+.LSGEMM_L4x8_SAVE:
+
+       SAVE4x8
+
+.LSGEMM_L4x8_END:
+
+.LSGEMM_L4x4_BEGIN:
+
+
+       andi.           T1,     M,      4
+       ble             .LSGEMM_L4x4_END
+       mr              BO,     B
+       srawi.          L,      K,      3
+       ble             .LSGEMM_L4x4_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LSGEMM_L4x4_SUB4
+
+.LSGEMM_L4x4_LOOP_START:
+
+       LOAD4x4_1
+       KERNEL4x4_I1
+       KERNEL4x4_2
+       KERNEL4x4_1
+       KERNEL4x4_2
+
+       KERNEL4x4_1
+       KERNEL4x4_2
+       KERNEL4x4_1
+       KERNEL4x4_2
+
+       addic.          L,      L,      -2
+       ble             .LSGEMM_L4x4_LOOP_END
+
+       .align 5
+
+.LSGEMM_L4x4_LOOP:
+
+       KERNEL4x4_1
+       KERNEL4x4_2
+       KERNEL4x4_1
+       KERNEL4x4_2
+
+       KERNEL4x4_1
+       KERNEL4x4_2
+       KERNEL4x4_1
+       KERNEL4x4_2
+
+       addic.          L,      L,      -1
+       bgt             .LSGEMM_L4x4_LOOP
+
+.LSGEMM_L4x4_LOOP_END:
+
+       KERNEL4x4_1
+       KERNEL4x4_2
+       KERNEL4x4_1
+       KERNEL4x4_2
+
+       KERNEL4x4_1
+       KERNEL4x4_2
+       KERNEL4x4_1
+       KERNEL4x4_E2
+
+       b               .LSGEMM_L4x4_SUB1
+
+.LSGEMM_L4x4_SUB4:
+
+       KERNEL4x4_SUBI1
+       KERNEL4x4_SUB1
+       KERNEL4x4_SUB1
+       KERNEL4x4_SUB1
+
+       KERNEL4x4_SUB1
+       KERNEL4x4_SUB1
+       KERNEL4x4_SUB1
+       KERNEL4x4_SUB1
+
+       b               .LSGEMM_L4x4_SUB1
+
+.LSGEMM_L4x4_SUB0:
+
+       andi.           L,      K,      7
+
+       KERNEL4x4_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LSGEMM_L4x4_SAVE
+       b               .LSGEMM_L4x4_SUB2
+
+.LSGEMM_L4x4_SUB1:
+
+       andi.           L,      K,      7
+       ble             .LSGEMM_L4x4_SAVE
+
+.LSGEMM_L4x4_SUB2:
+
+       KERNEL4x4_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LSGEMM_L4x4_SUB2
+
+.LSGEMM_L4x4_SAVE:
+
+       SAVE4x4
+
+.LSGEMM_L4x4_END:
+
+.LSGEMM_L4x2_BEGIN:
+
+
+       andi.           T1,     M,      2
+       ble             .LSGEMM_L4x2_END
+       mr              BO,     B
+       srawi.          L,      K,      3
+       ble             .LSGEMM_L4x2_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LSGEMM_L4x2_SUB4
+
+.LSGEMM_L4x2_LOOP_START:
+
+       LOAD4x2_1
+       KERNEL4x2_I1
+       KERNEL4x2_2
+       KERNEL4x2_1
+       KERNEL4x2_2
+
+       KERNEL4x2_1
+       KERNEL4x2_2
+       KERNEL4x2_1
+       KERNEL4x2_2
+
+       addic.          L,      L,      -2
+       ble             .LSGEMM_L4x2_LOOP_END
+
+       .align 5
+
+.LSGEMM_L4x2_LOOP:
+
+       KERNEL4x2_1
+       KERNEL4x2_2
+       KERNEL4x2_1
+       KERNEL4x2_2
+
+       KERNEL4x2_1
+       KERNEL4x2_2
+       KERNEL4x2_1
+       KERNEL4x2_2
+
+       addic.          L,      L,      -1
+       bgt             .LSGEMM_L4x2_LOOP
+
+.LSGEMM_L4x2_LOOP_END:
+
+       KERNEL4x2_1
+       KERNEL4x2_2
+       KERNEL4x2_1
+       KERNEL4x2_2
+
+       KERNEL4x2_1
+       KERNEL4x2_2
+       KERNEL4x2_1
+       KERNEL4x2_E2
+
+       b               .LSGEMM_L4x2_SUB1
+
+.LSGEMM_L4x2_SUB4:
+
+       KERNEL4x2_SUBI1
+       KERNEL4x2_SUB1
+       KERNEL4x2_SUB1
+       KERNEL4x2_SUB1
+
+       KERNEL4x2_SUB1
+       KERNEL4x2_SUB1
+       KERNEL4x2_SUB1
+       KERNEL4x2_SUB1
+
+       b               .LSGEMM_L4x2_SUB1
+
+.LSGEMM_L4x2_SUB0:
+
+       andi.           L,      K,      7
+
+       KERNEL4x2_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LSGEMM_L4x2_SAVE
+       b               .LSGEMM_L4x2_SUB2
+
+.LSGEMM_L4x2_SUB1:
+
+       andi.           L,      K,      7
+       ble             .LSGEMM_L4x2_SAVE
+
+.LSGEMM_L4x2_SUB2:
+
+       KERNEL4x2_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LSGEMM_L4x2_SUB2
+
+.LSGEMM_L4x2_SAVE:
+
+       SAVE4x2
+
+.LSGEMM_L4x2_END:
+
+.LSGEMM_L4x1_BEGIN:
+
+
+       andi.           T1,     M,      1
+       ble             .LSGEMM_L4x1_END
+       mr              BO,     B
+       srawi.          L,      K,      3
+       ble             .LSGEMM_L4x1_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LSGEMM_L4x1_SUB4
+
+.LSGEMM_L4x1_LOOP_START:
+
+       LOAD4x1_1
+       KERNEL4x1_I1
+       KERNEL4x1_2
+       KERNEL4x1_1
+       KERNEL4x1_2
+
+       KERNEL4x1_1
+       KERNEL4x1_2
+       KERNEL4x1_1
+       KERNEL4x1_2
+
+       addic.          L,      L,      -2
+       ble             .LSGEMM_L4x1_LOOP_END
+
+       .align 5
+
+.LSGEMM_L4x1_LOOP:
+
+       KERNEL4x1_1
+       KERNEL4x1_2
+       KERNEL4x1_1
+       KERNEL4x1_2
+
+       KERNEL4x1_1
+       KERNEL4x1_2
+       KERNEL4x1_1
+       KERNEL4x1_2
+
+       addic.          L,      L,      -1
+       bgt             .LSGEMM_L4x1_LOOP
+
+.LSGEMM_L4x1_LOOP_END:
+
+       KERNEL4x1_1
+       KERNEL4x1_2
+       KERNEL4x1_1
+       KERNEL4x1_2
+
+       KERNEL4x1_1
+       KERNEL4x1_2
+       KERNEL4x1_1
+       KERNEL4x1_E2
+
+       b               .LSGEMM_L4x1_SUB1
+
+.LSGEMM_L4x1_SUB4:
+
+       KERNEL4x1_SUBI1
+       KERNEL4x1_SUB1
+       KERNEL4x1_SUB1
+       KERNEL4x1_SUB1
+
+       KERNEL4x1_SUB1
+       KERNEL4x1_SUB1
+       KERNEL4x1_SUB1
+       KERNEL4x1_SUB1
+
+       b               .LSGEMM_L4x1_SUB1
+
+.LSGEMM_L4x1_SUB0:
+
+       andi.           L,      K,      7
+
+       KERNEL4x1_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LSGEMM_L4x1_SAVE
+       b               .LSGEMM_L4x1_SUB2
+
+.LSGEMM_L4x1_SUB1:
+
+       andi.           L,      K,      7
+       ble             .LSGEMM_L4x1_SAVE
+
+.LSGEMM_L4x1_SUB2:
+
+       KERNEL4x1_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LSGEMM_L4x1_SUB2
+
+.LSGEMM_L4x1_SAVE:
+
+       SAVE4x1
+
+.LSGEMM_L4x1_END:
+
+       slwi            T1,     K,      4
+       add             B,      B,      T1
+
+.LSGEMM_L4_END:
+.LSGEMM_L2_BEGIN:
+
+       andi.           T1,     N,      2
+       ble             .LSGEMM_L2_END
+       mr              CO,     C
+       mr              AO,     A
+       slwi            T1,     LDC     ,       1
+       add             C,      C,      T1
+       srawi.          I,      M,      4
+       ble             .LSGEMM_L2x16_END
+
+.LSGEMM_L2x16_BEGIN:
+
+
+       mr              BO,     B
+       srawi.          L,      K,      3
+       ble             .LSGEMM_L2x16_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LSGEMM_L2x16_SUB4
+
+.LSGEMM_L2x16_LOOP_START:
+
+       dcbt            AO,     PRE
+       LOAD2x16_1
+       KERNEL2x16_I1
+       dcbt            AO,     PRE
+       KERNEL2x16_2
+       KERNEL2x16_1
+       dcbt            AO,     PRE
+       KERNEL2x16_2
+
+       KERNEL2x16_1
+       dcbt            AO,     PRE
+       KERNEL2x16_2
+       KERNEL2x16_1
+       dcbt            AO,     PRE
+       KERNEL2x16_2
+
+       addic.          L,      L,      -2
+       ble             .LSGEMM_L2x16_LOOP_END
+
+       .align 5
+
+.LSGEMM_L2x16_LOOP:
+
+       KERNEL2x16_1
+       dcbt            AO,     PRE
+       KERNEL2x16_2
+       KERNEL2x16_1
+       dcbt            AO,     PRE
+       KERNEL2x16_2
+
+       KERNEL2x16_1
+       dcbt            AO,     PRE
+       KERNEL2x16_2
+       KERNEL2x16_1
+       dcbt            AO,     PRE
+       KERNEL2x16_2
+
+       addic.          L,      L,      -1
+       bgt             .LSGEMM_L2x16_LOOP
+
+.LSGEMM_L2x16_LOOP_END:
+
+       KERNEL2x16_1
+       dcbt            AO,     PRE
+       KERNEL2x16_2
+       KERNEL2x16_1
+       dcbt            AO,     PRE
+       KERNEL2x16_2
+
+       KERNEL2x16_1
+       dcbt            AO,     PRE
+       KERNEL2x16_2
+       KERNEL2x16_1
+       KERNEL2x16_E2
+
+       b               .LSGEMM_L2x16_SUB1
+
+.LSGEMM_L2x16_SUB4:
+
+       dcbt            AO,     PRE
+       KERNEL2x16_SUBI1
+       KERNEL2x16_SUB1
+       dcbt            AO,     PRE
+       KERNEL2x16_SUB1
+       KERNEL2x16_SUB1
+
+       KERNEL2x16_SUB1
+       KERNEL2x16_SUB1
+       KERNEL2x16_SUB1
+       KERNEL2x16_SUB1
+
+       b               .LSGEMM_L2x16_SUB1
+
+.LSGEMM_L2x16_SUB0:
+
+       andi.           L,      K,      7
+
+       KERNEL2x16_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LSGEMM_L2x16_SAVE
+       b               .LSGEMM_L2x16_SUB2
+
+.LSGEMM_L2x16_SUB1:
+
+       andi.           L,      K,      7
+       ble             .LSGEMM_L2x16_SAVE
+
+.LSGEMM_L2x16_SUB2:
+
+       KERNEL2x16_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LSGEMM_L2x16_SUB2
+
+.LSGEMM_L2x16_SAVE:
+
+       SAVE2x16
+
+       addic.          I,      I,      -1
+       bgt             .LSGEMM_L2x16_BEGIN
+
+.LSGEMM_L2x16_END:
+
+.LSGEMM_L2x8_BEGIN:
+
+       andi.           T2,     M,      15
+       ble             .LSGEMM_L2x1_END
+
+       andi.           T1,     M,      8
+       ble             .LSGEMM_L2x8_END
+       mr              BO,     B
+       srawi.          L,      K,      3
+       ble             .LSGEMM_L2x8_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LSGEMM_L2x8_SUB4
+
+.LSGEMM_L2x8_LOOP_START:
+
+       LOAD2x8_1
+       KERNEL2x8_I1
+       KERNEL2x8_2
+       KERNEL2x8_1
+       KERNEL2x8_2
+
+       KERNEL2x8_1
+       KERNEL2x8_2
+       KERNEL2x8_1
+       KERNEL2x8_2
+
+       addic.          L,      L,      -2
+       ble             .LSGEMM_L2x8_LOOP_END
+
+       .align 5
+
+.LSGEMM_L2x8_LOOP:
+
+       KERNEL2x8_1
+       KERNEL2x8_2
+       KERNEL2x8_1
+       KERNEL2x8_2
+
+       KERNEL2x8_1
+       KERNEL2x8_2
+       KERNEL2x8_1
+       KERNEL2x8_2
+
+       addic.          L,      L,      -1
+       bgt             .LSGEMM_L2x8_LOOP
+
+.LSGEMM_L2x8_LOOP_END:
+
+       KERNEL2x8_1
+       KERNEL2x8_2
+       KERNEL2x8_1
+       KERNEL2x8_2
+
+       KERNEL2x8_1
+       KERNEL2x8_2
+       KERNEL2x8_1
+       KERNEL2x8_E2
+
+       b               .LSGEMM_L2x8_SUB1
+
+.LSGEMM_L2x8_SUB4:
+
+       KERNEL2x8_SUBI1
+       KERNEL2x8_SUB1
+       KERNEL2x8_SUB1
+       KERNEL2x8_SUB1
+
+       KERNEL2x8_SUB1
+       KERNEL2x8_SUB1
+       KERNEL2x8_SUB1
+       KERNEL2x8_SUB1
+
+       b               .LSGEMM_L2x8_SUB1
+
+.LSGEMM_L2x8_SUB0:
+
+       andi.           L,      K,      7
+
+       KERNEL2x8_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LSGEMM_L2x8_SAVE
+       b               .LSGEMM_L2x8_SUB2
+
+.LSGEMM_L2x8_SUB1:
+
+       andi.           L,      K,      7
+       ble             .LSGEMM_L2x8_SAVE
+
+.LSGEMM_L2x8_SUB2:
+
+       KERNEL2x8_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LSGEMM_L2x8_SUB2
+
+.LSGEMM_L2x8_SAVE:
+
+       SAVE2x8
+
+.LSGEMM_L2x8_END:
+
+.LSGEMM_L2x4_BEGIN:
+
+
+       andi.           T1,     M,      4
+       ble             .LSGEMM_L2x4_END
+       mr              BO,     B
+       srawi.          L,      K,      3
+       ble             .LSGEMM_L2x4_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LSGEMM_L2x4_SUB4
+
+.LSGEMM_L2x4_LOOP_START:
+
+       LOAD2x4_1
+       KERNEL2x4_I1
+       KERNEL2x4_2
+       KERNEL2x4_1
+       KERNEL2x4_2
+
+       KERNEL2x4_1
+       KERNEL2x4_2
+       KERNEL2x4_1
+       KERNEL2x4_2
+
+       addic.          L,      L,      -2
+       ble             .LSGEMM_L2x4_LOOP_END
+
+       .align 5
+
+.LSGEMM_L2x4_LOOP:
+
+       KERNEL2x4_1
+       KERNEL2x4_2
+       KERNEL2x4_1
+       KERNEL2x4_2
+
+       KERNEL2x4_1
+       KERNEL2x4_2
+       KERNEL2x4_1
+       KERNEL2x4_2
+
+       addic.          L,      L,      -1
+       bgt             .LSGEMM_L2x4_LOOP
+
+.LSGEMM_L2x4_LOOP_END:
+
+       KERNEL2x4_1
+       KERNEL2x4_2
+       KERNEL2x4_1
+       KERNEL2x4_2
+
+       KERNEL2x4_1
+       KERNEL2x4_2
+       KERNEL2x4_1
+       KERNEL2x4_E2
+
+       b               .LSGEMM_L2x4_SUB1
+
+.LSGEMM_L2x4_SUB4:
+
+       KERNEL2x4_SUBI1
+       KERNEL2x4_SUB1
+       KERNEL2x4_SUB1
+       KERNEL2x4_SUB1
+
+       KERNEL2x4_SUB1
+       KERNEL2x4_SUB1
+       KERNEL2x4_SUB1
+       KERNEL2x4_SUB1
+
+       b               .LSGEMM_L2x4_SUB1
+
+.LSGEMM_L2x4_SUB0:
+
+       andi.           L,      K,      7
+
+       KERNEL2x4_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LSGEMM_L2x4_SAVE
+       b               .LSGEMM_L2x4_SUB2
+
+.LSGEMM_L2x4_SUB1:
+
+       andi.           L,      K,      7
+       ble             .LSGEMM_L2x4_SAVE
+
+.LSGEMM_L2x4_SUB2:
+
+       KERNEL2x4_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LSGEMM_L2x4_SUB2
+
+.LSGEMM_L2x4_SAVE:
+
+       SAVE2x4
+
+.LSGEMM_L2x4_END:
+
+.LSGEMM_L2x2_BEGIN:
+
+
+       andi.           T1,     M,      2
+       ble             .LSGEMM_L2x2_END
+       mr              BO,     B
+       srawi.          L,      K,      3
+       ble             .LSGEMM_L2x2_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LSGEMM_L2x2_SUB4
+
+.LSGEMM_L2x2_LOOP_START:
+
+       LOAD2x2_1
+       KERNEL2x2_I1
+       KERNEL2x2_2
+       KERNEL2x2_1
+       KERNEL2x2_2
+
+       KERNEL2x2_1
+       KERNEL2x2_2
+       KERNEL2x2_1
+       KERNEL2x2_2
+
+       addic.          L,      L,      -2
+       ble             .LSGEMM_L2x2_LOOP_END
+
+       .align 5
+
+.LSGEMM_L2x2_LOOP:
+
+       KERNEL2x2_1
+       KERNEL2x2_2
+       KERNEL2x2_1
+       KERNEL2x2_2
+
+       KERNEL2x2_1
+       KERNEL2x2_2
+       KERNEL2x2_1
+       KERNEL2x2_2
+
+       addic.          L,      L,      -1
+       bgt             .LSGEMM_L2x2_LOOP
+
+.LSGEMM_L2x2_LOOP_END:
+
+       KERNEL2x2_1
+       KERNEL2x2_2
+       KERNEL2x2_1
+       KERNEL2x2_2
+
+       KERNEL2x2_1
+       KERNEL2x2_2
+       KERNEL2x2_1
+       KERNEL2x2_E2
+
+       b               .LSGEMM_L2x2_SUB1
+
+.LSGEMM_L2x2_SUB4:
+
+       KERNEL2x2_SUBI1
+       KERNEL2x2_SUB1
+       KERNEL2x2_SUB1
+       KERNEL2x2_SUB1
+
+       KERNEL2x2_SUB1
+       KERNEL2x2_SUB1
+       KERNEL2x2_SUB1
+       KERNEL2x2_SUB1
+
+       b               .LSGEMM_L2x2_SUB1
+
+.LSGEMM_L2x2_SUB0:
+
+       andi.           L,      K,      7
+
+       KERNEL2x2_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LSGEMM_L2x2_SAVE
+       b               .LSGEMM_L2x2_SUB2
+
+.LSGEMM_L2x2_SUB1:
+
+       andi.           L,      K,      7
+       ble             .LSGEMM_L2x2_SAVE
+
+.LSGEMM_L2x2_SUB2:
+
+       KERNEL2x2_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LSGEMM_L2x2_SUB2
+
+.LSGEMM_L2x2_SAVE:
+
+       SAVE2x2
+
+.LSGEMM_L2x2_END:
+
+.LSGEMM_L2x1_BEGIN:
+
+
+       andi.           T1,     M,      1
+       ble             .LSGEMM_L2x1_END
+       mr              BO,     B
+       srawi.          L,      K,      3
+       ble             .LSGEMM_L2x1_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LSGEMM_L2x1_SUB4
+
+.LSGEMM_L2x1_LOOP_START:
+
+       LOAD2x1_1
+       KERNEL2x1_I1
+       KERNEL2x1_2
+       KERNEL2x1_1
+       KERNEL2x1_2
+
+       KERNEL2x1_1
+       KERNEL2x1_2
+       KERNEL2x1_1
+       KERNEL2x1_2
+
+       addic.          L,      L,      -2
+       ble             .LSGEMM_L2x1_LOOP_END
+
+       .align 5
+
+.LSGEMM_L2x1_LOOP:
+
+       KERNEL2x1_1
+       KERNEL2x1_2
+       KERNEL2x1_1
+       KERNEL2x1_2
+
+       KERNEL2x1_1
+       KERNEL2x1_2
+       KERNEL2x1_1
+       KERNEL2x1_2
+
+       addic.          L,      L,      -1
+       bgt             .LSGEMM_L2x1_LOOP
+
+.LSGEMM_L2x1_LOOP_END:
+
+       KERNEL2x1_1
+       KERNEL2x1_2
+       KERNEL2x1_1
+       KERNEL2x1_2
+
+       KERNEL2x1_1
+       KERNEL2x1_2
+       KERNEL2x1_1
+       KERNEL2x1_E2
+
+       b               .LSGEMM_L2x1_SUB1
+
+.LSGEMM_L2x1_SUB4:
+
+       KERNEL2x1_SUBI1
+       KERNEL2x1_SUB1
+       KERNEL2x1_SUB1
+       KERNEL2x1_SUB1
+
+       KERNEL2x1_SUB1
+       KERNEL2x1_SUB1
+       KERNEL2x1_SUB1
+       KERNEL2x1_SUB1
+
+       b               .LSGEMM_L2x1_SUB1
+
+.LSGEMM_L2x1_SUB0:
+
+       andi.           L,      K,      7
+
+       KERNEL2x1_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LSGEMM_L2x1_SAVE
+       b               .LSGEMM_L2x1_SUB2
+
+.LSGEMM_L2x1_SUB1:
+
+       andi.           L,      K,      7
+       ble             .LSGEMM_L2x1_SAVE
+
+.LSGEMM_L2x1_SUB2:
+
+       KERNEL2x1_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LSGEMM_L2x1_SUB2
+
+.LSGEMM_L2x1_SAVE:
+
+       SAVE2x1
+
+.LSGEMM_L2x1_END:
+
+       slwi            T1,     K,      3
+       add             B,      B,      T1
+
+.LSGEMM_L2_END:
+.LSGEMM_L1_BEGIN:
+
+       andi.           T1,     N,      1
+       ble             .LSGEMM_L1_END
+       mr              CO,     C
+       mr              AO,     A
+       srawi.          I,      M,      4
+       ble             .LSGEMM_L1x16_END
+
+.LSGEMM_L1x16_BEGIN:
+
+
+       mr              BO,     B
+       srawi.          L,      K,      3
+       ble             .LSGEMM_L1x16_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LSGEMM_L1x16_SUB4
+
+.LSGEMM_L1x16_LOOP_START:
+
+       dcbt            AO,     PRE
+       LOAD1x16_1
+       KERNEL1x16_I1
+       dcbt            AO,     PRE
+       KERNEL1x16_2
+       KERNEL1x16_1
+       dcbt            AO,     PRE
+       KERNEL1x16_2
+
+       KERNEL1x16_1
+       dcbt            AO,     PRE
+       KERNEL1x16_2
+       KERNEL1x16_1
+       dcbt            AO,     PRE
+       KERNEL1x16_2
+
+       addic.          L,      L,      -2
+       ble             .LSGEMM_L1x16_LOOP_END
+
+       .align 5
+
+.LSGEMM_L1x16_LOOP:
+
+       KERNEL1x16_1
+       dcbt            AO,     PRE
+       KERNEL1x16_2
+       KERNEL1x16_1
+       dcbt            AO,     PRE
+       KERNEL1x16_2
+
+       KERNEL1x16_1
+       dcbt            AO,     PRE
+       KERNEL1x16_2
+       KERNEL1x16_1
+       dcbt            AO,     PRE
+       KERNEL1x16_2
+
+       addic.          L,      L,      -1
+       bgt             .LSGEMM_L1x16_LOOP
+
+.LSGEMM_L1x16_LOOP_END:
+
+       KERNEL1x16_1
+       dcbt            AO,     PRE
+       KERNEL1x16_2
+       KERNEL1x16_1
+       dcbt            AO,     PRE
+       KERNEL1x16_2
+
+       KERNEL1x16_1
+       dcbt            AO,     PRE
+       KERNEL1x16_2
+       KERNEL1x16_1
+       KERNEL1x16_E2
+
+       b               .LSGEMM_L1x16_SUB1
+
+.LSGEMM_L1x16_SUB4:
+
+       dcbt            AO,     PRE
+       KERNEL1x16_SUBI1
+       KERNEL1x16_SUB1
+       dcbt            AO,     PRE
+       KERNEL1x16_SUB1
+       KERNEL1x16_SUB1
+
+       KERNEL1x16_SUB1
+       KERNEL1x16_SUB1
+       KERNEL1x16_SUB1
+       KERNEL1x16_SUB1
+
+       b               .LSGEMM_L1x16_SUB1
+
+.LSGEMM_L1x16_SUB0:
+
+       andi.           L,      K,      7
+
+       KERNEL1x16_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LSGEMM_L1x16_SAVE
+       b               .LSGEMM_L1x16_SUB2
+
+.LSGEMM_L1x16_SUB1:
+
+       andi.           L,      K,      7
+       ble             .LSGEMM_L1x16_SAVE
+
+.LSGEMM_L1x16_SUB2:
+
+       KERNEL1x16_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LSGEMM_L1x16_SUB2
+
+.LSGEMM_L1x16_SAVE:
+
+       SAVE1x16
+
+       addic.          I,      I,      -1
+       bgt             .LSGEMM_L1x16_BEGIN
+
+.LSGEMM_L1x16_END:
+
+.LSGEMM_L1x8_BEGIN:
+
+       andi.           T2,     M,      15
+       ble             .LSGEMM_L1x1_END
+
+       andi.           T1,     M,      8
+       ble             .LSGEMM_L1x8_END
+       mr              BO,     B
+       srawi.          L,      K,      3
+       ble             .LSGEMM_L1x8_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LSGEMM_L1x8_SUB4
+
+.LSGEMM_L1x8_LOOP_START:
+
+       LOAD1x8_1
+       KERNEL1x8_I1
+       KERNEL1x8_2
+       KERNEL1x8_1
+       KERNEL1x8_2
+
+       KERNEL1x8_1
+       KERNEL1x8_2
+       KERNEL1x8_1
+       KERNEL1x8_2
+
+       addic.          L,      L,      -2
+       ble             .LSGEMM_L1x8_LOOP_END
+
+       .align 5
+
+.LSGEMM_L1x8_LOOP:
+
+       KERNEL1x8_1
+       KERNEL1x8_2
+       KERNEL1x8_1
+       KERNEL1x8_2
+
+       KERNEL1x8_1
+       KERNEL1x8_2
+       KERNEL1x8_1
+       KERNEL1x8_2
+
+       addic.          L,      L,      -1
+       bgt             .LSGEMM_L1x8_LOOP
+
+.LSGEMM_L1x8_LOOP_END:
+
+       KERNEL1x8_1
+       KERNEL1x8_2
+       KERNEL1x8_1
+       KERNEL1x8_2
+
+       KERNEL1x8_1
+       KERNEL1x8_2
+       KERNEL1x8_1
+       KERNEL1x8_E2
+
+       b               .LSGEMM_L1x8_SUB1
+
+.LSGEMM_L1x8_SUB4:
+
+       KERNEL1x8_SUBI1
+       KERNEL1x8_SUB1
+       KERNEL1x8_SUB1
+       KERNEL1x8_SUB1
+
+       KERNEL1x8_SUB1
+       KERNEL1x8_SUB1
+       KERNEL1x8_SUB1
+       KERNEL1x8_SUB1
+
+       b               .LSGEMM_L1x8_SUB1
+
+.LSGEMM_L1x8_SUB0:
+
+       andi.           L,      K,      7
+
+       KERNEL1x8_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LSGEMM_L1x8_SAVE
+       b               .LSGEMM_L1x8_SUB2
+
+.LSGEMM_L1x8_SUB1:
+
+       andi.           L,      K,      7
+       ble             .LSGEMM_L1x8_SAVE
+
+.LSGEMM_L1x8_SUB2:
+
+       KERNEL1x8_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LSGEMM_L1x8_SUB2
+
+.LSGEMM_L1x8_SAVE:
+
+       SAVE1x8
+
+.LSGEMM_L1x8_END:
+
+.LSGEMM_L1x4_BEGIN:
+
+
+       andi.           T1,     M,      4
+       ble             .LSGEMM_L1x4_END
+       mr              BO,     B
+       srawi.          L,      K,      3
+       ble             .LSGEMM_L1x4_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LSGEMM_L1x4_SUB4
+
+.LSGEMM_L1x4_LOOP_START:
+
+       LOAD1x4_1
+       KERNEL1x4_I1
+       KERNEL1x4_2
+       KERNEL1x4_1
+       KERNEL1x4_2
+
+       KERNEL1x4_1
+       KERNEL1x4_2
+       KERNEL1x4_1
+       KERNEL1x4_2
+
+       addic.          L,      L,      -2
+       ble             .LSGEMM_L1x4_LOOP_END
+
+       .align 5
+
+.LSGEMM_L1x4_LOOP:
+
+       KERNEL1x4_1
+       KERNEL1x4_2
+       KERNEL1x4_1
+       KERNEL1x4_2
+
+       KERNEL1x4_1
+       KERNEL1x4_2
+       KERNEL1x4_1
+       KERNEL1x4_2
+
+       addic.          L,      L,      -1
+       bgt             .LSGEMM_L1x4_LOOP
+
+.LSGEMM_L1x4_LOOP_END:
+
+       KERNEL1x4_1
+       KERNEL1x4_2
+       KERNEL1x4_1
+       KERNEL1x4_2
+
+       KERNEL1x4_1
+       KERNEL1x4_2
+       KERNEL1x4_1
+       KERNEL1x4_E2
+
+       b               .LSGEMM_L1x4_SUB1
+
+.LSGEMM_L1x4_SUB4:
+
+       KERNEL1x4_SUBI1
+       KERNEL1x4_SUB1
+       KERNEL1x4_SUB1
+       KERNEL1x4_SUB1
+
+       KERNEL1x4_SUB1
+       KERNEL1x4_SUB1
+       KERNEL1x4_SUB1
+       KERNEL1x4_SUB1
+
+       b               .LSGEMM_L1x4_SUB1
+
+.LSGEMM_L1x4_SUB0:
+
+       andi.           L,      K,      7
+
+       KERNEL1x4_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LSGEMM_L1x4_SAVE
+       b               .LSGEMM_L1x4_SUB2
+
+.LSGEMM_L1x4_SUB1:
+
+       andi.           L,      K,      7
+       ble             .LSGEMM_L1x4_SAVE
+
+.LSGEMM_L1x4_SUB2:
+
+       KERNEL1x4_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LSGEMM_L1x4_SUB2
+
+.LSGEMM_L1x4_SAVE:
+
+       SAVE1x4
+
+.LSGEMM_L1x4_END:
+
+.LSGEMM_L1x2_BEGIN:
+
+
+       andi.           T1,     M,      2
+       ble             .LSGEMM_L1x2_END
+       mr              BO,     B
+       srawi.          L,      K,      3
+       ble             .LSGEMM_L1x2_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LSGEMM_L1x2_SUB4
+
+.LSGEMM_L1x2_LOOP_START:
+
+       LOAD1x2_1
+       KERNEL1x2_I1
+       KERNEL1x2_2
+       KERNEL1x2_1
+       KERNEL1x2_2
+
+       KERNEL1x2_1
+       KERNEL1x2_2
+       KERNEL1x2_1
+       KERNEL1x2_2
+
+       addic.          L,      L,      -2
+       ble             .LSGEMM_L1x2_LOOP_END
+
+       .align 5
+
+.LSGEMM_L1x2_LOOP:
+
+       KERNEL1x2_1
+       KERNEL1x2_2
+       KERNEL1x2_1
+       KERNEL1x2_2
+
+       KERNEL1x2_1
+       KERNEL1x2_2
+       KERNEL1x2_1
+       KERNEL1x2_2
+
+       addic.          L,      L,      -1
+       bgt             .LSGEMM_L1x2_LOOP
+
+.LSGEMM_L1x2_LOOP_END:
+
+       KERNEL1x2_1
+       KERNEL1x2_2
+       KERNEL1x2_1
+       KERNEL1x2_2
+
+       KERNEL1x2_1
+       KERNEL1x2_2
+       KERNEL1x2_1
+       KERNEL1x2_E2
+
+       b               .LSGEMM_L1x2_SUB1
+
+.LSGEMM_L1x2_SUB4:
+
+       KERNEL1x2_SUBI1
+       KERNEL1x2_SUB1
+       KERNEL1x2_SUB1
+       KERNEL1x2_SUB1
+
+       KERNEL1x2_SUB1
+       KERNEL1x2_SUB1
+       KERNEL1x2_SUB1
+       KERNEL1x2_SUB1
+
+       b               .LSGEMM_L1x2_SUB1
+
+.LSGEMM_L1x2_SUB0:
+
+       andi.           L,      K,      7
+
+       KERNEL1x2_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LSGEMM_L1x2_SAVE
+       b               .LSGEMM_L1x2_SUB2
+
+.LSGEMM_L1x2_SUB1:
+
+       andi.           L,      K,      7
+       ble             .LSGEMM_L1x2_SAVE
+
+.LSGEMM_L1x2_SUB2:
+
+       KERNEL1x2_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LSGEMM_L1x2_SUB2
+
+.LSGEMM_L1x2_SAVE:
+
+       SAVE1x2
+
+.LSGEMM_L1x2_END:
+
+.LSGEMM_L1x1_BEGIN:
+
+
+       andi.           T1,     M,      1
+       ble             .LSGEMM_L1x1_END
+       mr              BO,     B
+       srawi.          L,      K,      3
+       ble             .LSGEMM_L1x1_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LSGEMM_L1x1_SUB4
+
+.LSGEMM_L1x1_LOOP_START:
+
+       LOAD1x1_1
+       KERNEL1x1_I1
+       KERNEL1x1_2
+       KERNEL1x1_1
+       KERNEL1x1_2
+
+       KERNEL1x1_1
+       KERNEL1x1_2
+       KERNEL1x1_1
+       KERNEL1x1_2
+
+       addic.          L,      L,      -2
+       ble             .LSGEMM_L1x1_LOOP_END
+
+       .align 5
+
+.LSGEMM_L1x1_LOOP:
+
+       KERNEL1x1_1
+       KERNEL1x1_2
+       KERNEL1x1_1
+       KERNEL1x1_2
+
+       KERNEL1x1_1
+       KERNEL1x1_2
+       KERNEL1x1_1
+       KERNEL1x1_2
+
+       addic.          L,      L,      -1
+       bgt             .LSGEMM_L1x1_LOOP
+
+.LSGEMM_L1x1_LOOP_END:
+
+       KERNEL1x1_1
+       KERNEL1x1_2
+       KERNEL1x1_1
+       KERNEL1x1_2
+
+       KERNEL1x1_1
+       KERNEL1x1_2
+       KERNEL1x1_1
+       KERNEL1x1_E2
+
+       b               .LSGEMM_L1x1_SUB1
+
+.LSGEMM_L1x1_SUB4:
+
+       KERNEL1x1_SUBI1
+       KERNEL1x1_SUB1
+       KERNEL1x1_SUB1
+       KERNEL1x1_SUB1
+
+       KERNEL1x1_SUB1
+       KERNEL1x1_SUB1
+       KERNEL1x1_SUB1
+       KERNEL1x1_SUB1
+
+       b               .LSGEMM_L1x1_SUB1
+
+.LSGEMM_L1x1_SUB0:
+
+       andi.           L,      K,      7
+
+       KERNEL1x1_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LSGEMM_L1x1_SAVE
+       b               .LSGEMM_L1x1_SUB2
+
+.LSGEMM_L1x1_SUB1:
+
+       andi.           L,      K,      7
+       ble             .LSGEMM_L1x1_SAVE
+
+.LSGEMM_L1x1_SUB2:
+
+       KERNEL1x1_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LSGEMM_L1x1_SUB2
+
+.LSGEMM_L1x1_SAVE:
+
+       SAVE1x1
+
+.LSGEMM_L1x1_END:
+
+.LSGEMM_L1_END:
diff --git a/kernel/power/sgemm_macros_16x8_power8.S b/kernel/power/sgemm_macros_16x8_power8.S
new file mode 100644 (file)
index 0000000..78f530c
--- /dev/null
@@ -0,0 +1,6145 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/14 Werner Saar (wernsaar@googlemail.com)
+*       BLASTEST               : OK
+*       CTEST                  : OK
+*       TEST                   : OK
+**************************************************************************************/
+
+/**********************************************************************************************
+* Macros for N=8 and M=16
+**********************************************************************************************/
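+
+/* The 16x8 micro-kernel keeps the whole 16x8 tile of C in vs32-vs63.
+ * vs0-vs3 and vs4-vs7 alternately hold 16 packed floats of A, while
+ * vs8-vs15 and vs16-vs23 hold the 8 B values of the current k, each
+ * splatted across a full vector with xxspltw from vs28/vs29.
+ * KERNEL8x16_I1 primes the accumulators with xvmulsp, KERNEL8x16_1/_2
+ * ping-pong between the two register sets with xvmaddasp while loading
+ * the next A/B, KERNEL8x16_E2 drains the pipeline without further loads,
+ * and SUBI1/SUB1 handle single K iterations for the remainder. */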
+
+.macro LOAD8x16_1
+
+       lxvw4x          vs28,   o0,     BO
+       lxvw4x          vs29,   o16,    BO
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+       lxvw4x          vs2,    o32,    AO
+       lxvw4x          vs3,    o48,    AO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+       xxspltw         vs10,   vs28,   2
+       xxspltw         vs11,   vs28,   3
+
+       xxspltw         vs12,   vs29,   0
+       xxspltw         vs13,   vs29,   1
+       addi            AO,     AO,     64
+       addi            BO,     BO,     32
+       xxspltw         vs14,   vs29,   2
+       xxspltw         vs15,   vs29,   3
+
+
+.endm
+
+.macro KERNEL8x16_I1
+
+       xvmulsp         vs32,   vs0,    vs8
+       xvmulsp         vs33,   vs1,    vs8
+
+       lxvw4x          vs4,    o0,     AO
+       lxvw4x          vs5,    o16,    AO
+
+       xvmulsp         vs34,   vs2,    vs8
+       xvmulsp         vs35,   vs3,    vs8
+
+       lxvw4x          vs28,   o0,     BO
+       lxvw4x          vs29,   o16,    BO
+
+       xvmulsp         vs36,   vs0,    vs9
+       xvmulsp         vs37,   vs1,    vs9
+
+       lxvw4x          vs6,    o32,    AO
+       lxvw4x          vs7,    o48,    AO
+
+       xvmulsp         vs38,   vs2,    vs9
+       xvmulsp         vs39,   vs3,    vs9
+
+       xvmulsp         vs40,   vs0,    vs10
+       xvmulsp         vs41,   vs1,    vs10
+       xvmulsp         vs42,   vs2,    vs10
+       xvmulsp         vs43,   vs3,    vs10
+
+       xvmulsp         vs44,   vs0,    vs11
+       xvmulsp         vs45,   vs1,    vs11
+       xvmulsp         vs46,   vs2,    vs11
+       xvmulsp         vs47,   vs3,    vs11
+
+       xvmulsp         vs48,   vs0,    vs12
+       xvmulsp         vs49,   vs1,    vs12
+       xvmulsp         vs50,   vs2,    vs12
+       xvmulsp         vs51,   vs3,    vs12
+
+       xvmulsp         vs52,   vs0,    vs13
+       xvmulsp         vs53,   vs1,    vs13
+       xvmulsp         vs54,   vs2,    vs13
+       xvmulsp         vs55,   vs3,    vs13
+
+       xxspltw         vs16,   vs28,   0
+       xxspltw         vs17,   vs28,   1
+       xxspltw         vs18,   vs28,   2
+       xxspltw         vs19,   vs28,   3
+
+       xvmulsp         vs56,   vs0,    vs14
+       xvmulsp         vs57,   vs1,    vs14
+       xvmulsp         vs58,   vs2,    vs14
+       xvmulsp         vs59,   vs3,    vs14
+
+       xxspltw         vs20,   vs29,   0
+       xxspltw         vs21,   vs29,   1
+       xxspltw         vs22,   vs29,   2
+       xxspltw         vs23,   vs29,   3
+
+       xvmulsp         vs60,   vs0,    vs15
+       xvmulsp         vs61,   vs1,    vs15
+
+       addi            AO,     AO,     64
+       addi            BO,     BO,     32
+
+       xvmulsp         vs62,   vs2,    vs15
+       xvmulsp         vs63,   vs3,    vs15
+
+
+.endm
+
+.macro KERNEL8x16_1
+
+       xvmaddasp       vs32,   vs0,    vs8
+       xvmaddasp       vs33,   vs1,    vs8
+
+       lxvw4x          vs4,    o0,     AO
+       lxvw4x          vs5,    o16,    AO
+
+       xvmaddasp       vs34,   vs2,    vs8
+       xvmaddasp       vs35,   vs3,    vs8
+
+       xvmaddasp       vs36,   vs0,    vs9
+       xvmaddasp       vs37,   vs1,    vs9
+       lxvw4x          vs28,   o0,     BO
+       xvmaddasp       vs38,   vs2,    vs9
+       xvmaddasp       vs39,   vs3,    vs9
+
+       xvmaddasp       vs40,   vs0,    vs10
+       xvmaddasp       vs41,   vs1,    vs10
+
+       lxvw4x          vs6,    o32,    AO
+       lxvw4x          vs7,    o48,    AO
+
+       xvmaddasp       vs42,   vs2,    vs10
+       xvmaddasp       vs43,   vs3,    vs10
+
+       xvmaddasp       vs44,   vs0,    vs11
+       xvmaddasp       vs45,   vs1,    vs11
+
+       lxvw4x          vs29,   o16,    BO
+
+       xvmaddasp       vs46,   vs2,    vs11
+       xvmaddasp       vs47,   vs3,    vs11
+
+       xvmaddasp       vs48,   vs0,    vs12
+       xvmaddasp       vs49,   vs1,    vs12
+       xvmaddasp       vs50,   vs2,    vs12
+       xvmaddasp       vs51,   vs3,    vs12
+
+       xvmaddasp       vs52,   vs0,    vs13
+       xvmaddasp       vs53,   vs1,    vs13
+
+       xxspltw         vs16,   vs28,   0
+       xxspltw         vs17,   vs28,   1
+
+       xvmaddasp       vs54,   vs2,    vs13
+       xvmaddasp       vs55,   vs3,    vs13
+
+       xvmaddasp       vs56,   vs0,    vs14
+       xvmaddasp       vs57,   vs1,    vs14
+
+       xxspltw         vs18,   vs28,   2
+       xxspltw         vs19,   vs28,   3
+
+       xvmaddasp       vs58,   vs2,    vs14
+       xvmaddasp       vs59,   vs3,    vs14
+
+       xxspltw         vs20,   vs29,   0
+       xxspltw         vs21,   vs29,   1
+
+       xvmaddasp       vs60,   vs0,    vs15
+       xvmaddasp       vs61,   vs1,    vs15
+
+       addi            AO,     AO,     64
+       addi            BO,     BO,     32
+
+       xvmaddasp       vs62,   vs2,    vs15
+       xvmaddasp       vs63,   vs3,    vs15
+
+       xxspltw         vs22,   vs29,   2
+       xxspltw         vs23,   vs29,   3
+
+.endm
+
+.macro KERNEL8x16_2
+
+       xvmaddasp       vs32,   vs4,    vs16
+       xvmaddasp       vs33,   vs5,    vs16
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+
+       xvmaddasp       vs34,   vs6,    vs16
+       xvmaddasp       vs35,   vs7,    vs16
+
+       xvmaddasp       vs36,   vs4,    vs17
+       xvmaddasp       vs37,   vs5,    vs17
+
+       lxvw4x          vs28,   o0,     BO
+
+       xvmaddasp       vs38,   vs6,    vs17
+       xvmaddasp       vs39,   vs7,    vs17
+
+       xvmaddasp       vs40,   vs4,    vs18
+       xvmaddasp       vs41,   vs5,    vs18
+
+       lxvw4x          vs2,    o32,    AO
+       lxvw4x          vs3,    o48,    AO
+
+       xvmaddasp       vs42,   vs6,    vs18
+       xvmaddasp       vs43,   vs7,    vs18
+
+       xvmaddasp       vs44,   vs4,    vs19
+       xvmaddasp       vs45,   vs5,    vs19
+
+       lxvw4x          vs29,   o16,    BO
+
+       xvmaddasp       vs46,   vs6,    vs19
+       xvmaddasp       vs47,   vs7,    vs19
+
+       xvmaddasp       vs48,   vs4,    vs20
+       xvmaddasp       vs49,   vs5,    vs20
+       xvmaddasp       vs50,   vs6,    vs20
+       xvmaddasp       vs51,   vs7,    vs20
+
+       xvmaddasp       vs52,   vs4,    vs21
+       xvmaddasp       vs53,   vs5,    vs21
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+       xxspltw         vs10,   vs28,   2
+       xxspltw         vs11,   vs28,   3
+
+       xvmaddasp       vs54,   vs6,    vs21
+       xvmaddasp       vs55,   vs7,    vs21
+
+       xvmaddasp       vs56,   vs4,    vs22
+       xvmaddasp       vs57,   vs5,    vs22
+
+       xxspltw         vs12,   vs29,   0
+       xxspltw         vs13,   vs29,   1
+       xxspltw         vs14,   vs29,   2
+       xxspltw         vs15,   vs29,   3
+
+       xvmaddasp       vs58,   vs6,    vs22
+       xvmaddasp       vs59,   vs7,    vs22
+
+       xvmaddasp       vs60,   vs4,    vs23
+       xvmaddasp       vs61,   vs5,    vs23
+
+       addi            AO,     AO,     64
+       addi            BO,     BO,     32
+
+       xvmaddasp       vs62,   vs6,    vs23
+       xvmaddasp       vs63,   vs7,    vs23
+
+
+.endm
+
+.macro KERNEL8x16_E2
+
+
+       xvmaddasp       vs32,   vs4,    vs16
+       xvmaddasp       vs33,   vs5,    vs16
+       xvmaddasp       vs34,   vs6,    vs16
+       xvmaddasp       vs35,   vs7,    vs16
+
+       xvmaddasp       vs36,   vs4,    vs17
+       xvmaddasp       vs37,   vs5,    vs17
+       xvmaddasp       vs38,   vs6,    vs17
+       xvmaddasp       vs39,   vs7,    vs17
+
+       xvmaddasp       vs40,   vs4,    vs18
+       xvmaddasp       vs41,   vs5,    vs18
+       xvmaddasp       vs42,   vs6,    vs18
+       xvmaddasp       vs43,   vs7,    vs18
+
+       xvmaddasp       vs44,   vs4,    vs19
+       xvmaddasp       vs45,   vs5,    vs19
+       xvmaddasp       vs46,   vs6,    vs19
+       xvmaddasp       vs47,   vs7,    vs19
+
+       xvmaddasp       vs48,   vs4,    vs20
+       xvmaddasp       vs49,   vs5,    vs20
+       xvmaddasp       vs50,   vs6,    vs20
+       xvmaddasp       vs51,   vs7,    vs20
+
+       xvmaddasp       vs52,   vs4,    vs21
+       xvmaddasp       vs53,   vs5,    vs21
+       xvmaddasp       vs54,   vs6,    vs21
+       xvmaddasp       vs55,   vs7,    vs21
+
+       xvmaddasp       vs56,   vs4,    vs22
+       xvmaddasp       vs57,   vs5,    vs22
+       xvmaddasp       vs58,   vs6,    vs22
+       xvmaddasp       vs59,   vs7,    vs22
+
+       xvmaddasp       vs60,   vs4,    vs23
+       xvmaddasp       vs61,   vs5,    vs23
+       xvmaddasp       vs62,   vs6,    vs23
+       xvmaddasp       vs63,   vs7,    vs23
+
+
+.endm
+
+.macro KERNEL8x16_SUBI1
+
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+       lxvw4x          vs2,    o32,    AO
+       lxvw4x          vs3,    o48,    AO
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+       xxspltw         vs10,   vs28,   2
+       xxspltw         vs11,   vs28,   3
+
+       lxvw4x          vs29,   o16,    BO
+
+       xxspltw         vs12,   vs29,   0
+       xxspltw         vs13,   vs29,   1
+       xxspltw         vs14,   vs29,   2
+       xxspltw         vs15,   vs29,   3
+
+       addi            BO,     BO,     32
+
+
+       xvmulsp         vs32,   vs0,    vs8
+       xvmulsp         vs33,   vs1,    vs8
+       xvmulsp         vs34,   vs2,    vs8
+       xvmulsp         vs35,   vs3,    vs8
+
+       xvmulsp         vs36,   vs0,    vs9
+       xvmulsp         vs37,   vs1,    vs9
+       xvmulsp         vs38,   vs2,    vs9
+       xvmulsp         vs39,   vs3,    vs9
+
+       xvmulsp         vs40,   vs0,    vs10
+       xvmulsp         vs41,   vs1,    vs10
+       xvmulsp         vs42,   vs2,    vs10
+       xvmulsp         vs43,   vs3,    vs10
+
+       xvmulsp         vs44,   vs0,    vs11
+       xvmulsp         vs45,   vs1,    vs11
+       xvmulsp         vs46,   vs2,    vs11
+       xvmulsp         vs47,   vs3,    vs11
+
+       xvmulsp         vs48,   vs0,    vs12
+       xvmulsp         vs49,   vs1,    vs12
+       xvmulsp         vs50,   vs2,    vs12
+       xvmulsp         vs51,   vs3,    vs12
+
+       xvmulsp         vs52,   vs0,    vs13
+       xvmulsp         vs53,   vs1,    vs13
+       xvmulsp         vs54,   vs2,    vs13
+       xvmulsp         vs55,   vs3,    vs13
+
+       xvmulsp         vs56,   vs0,    vs14
+       xvmulsp         vs57,   vs1,    vs14
+       xvmulsp         vs58,   vs2,    vs14
+       xvmulsp         vs59,   vs3,    vs14
+
+       xvmulsp         vs60,   vs0,    vs15
+       xvmulsp         vs61,   vs1,    vs15
+       xvmulsp         vs62,   vs2,    vs15
+       xvmulsp         vs63,   vs3,    vs15
+
+
+.endm
+
+.macro KERNEL8x16_SUB1
+
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+       lxvw4x          vs2,    o32,    AO
+       lxvw4x          vs3,    o48,    AO
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+       xxspltw         vs10,   vs28,   2
+       xxspltw         vs11,   vs28,   3
+
+       lxvw4x          vs29,   o16,    BO
+
+       xxspltw         vs12,   vs29,   0
+       xxspltw         vs13,   vs29,   1
+       xxspltw         vs14,   vs29,   2
+       xxspltw         vs15,   vs29,   3
+
+       addi            BO,     BO,     32
+
+
+       xvmaddasp       vs32,   vs0,    vs8
+       xvmaddasp       vs33,   vs1,    vs8
+       xvmaddasp       vs34,   vs2,    vs8
+       xvmaddasp       vs35,   vs3,    vs8
+
+       xvmaddasp       vs36,   vs0,    vs9
+       xvmaddasp       vs37,   vs1,    vs9
+       xvmaddasp       vs38,   vs2,    vs9
+       xvmaddasp       vs39,   vs3,    vs9
+
+       xvmaddasp       vs40,   vs0,    vs10
+       xvmaddasp       vs41,   vs1,    vs10
+       xvmaddasp       vs42,   vs2,    vs10
+       xvmaddasp       vs43,   vs3,    vs10
+
+       xvmaddasp       vs44,   vs0,    vs11
+       xvmaddasp       vs45,   vs1,    vs11
+       xvmaddasp       vs46,   vs2,    vs11
+       xvmaddasp       vs47,   vs3,    vs11
+
+       xvmaddasp       vs48,   vs0,    vs12
+       xvmaddasp       vs49,   vs1,    vs12
+       xvmaddasp       vs50,   vs2,    vs12
+       xvmaddasp       vs51,   vs3,    vs12
+
+       xvmaddasp       vs52,   vs0,    vs13
+       xvmaddasp       vs53,   vs1,    vs13
+       xvmaddasp       vs54,   vs2,    vs13
+       xvmaddasp       vs55,   vs3,    vs13
+
+       xvmaddasp       vs56,   vs0,    vs14
+       xvmaddasp       vs57,   vs1,    vs14
+       xvmaddasp       vs58,   vs2,    vs14
+       xvmaddasp       vs59,   vs3,    vs14
+
+       xvmaddasp       vs60,   vs0,    vs15
+       xvmaddasp       vs61,   vs1,    vs15
+       xvmaddasp       vs62,   vs2,    vs15
+       xvmaddasp       vs63,   vs3,    vs15
+
+
+.endm
+
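+/* SAVE8x16: for each of the 8 columns of the tile (stride LDC), either
+ * scale the accumulators by alpha (TRMMKERNEL) or load the existing C
+ * values and add alpha times the accumulators (plain GEMM), store the
+ * 64 bytes back, and finally advance CO by 16 floats.  Roughly the
+ * scalar form is:
+ *
+ *     for (j = 0; j < 8; j++)
+ *         for (i = 0; i < 16; i++)
+ *             C[j*ldc + i] = acc[j][i] * alpha + (TRMM ? 0 : C[j*ldc + i]);
+ */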
+.macro SAVE8x16
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+       lxvw4x          vs2,    o32,    T1
+       lxvw4x          vs3,    o48,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xvmulsp         vs0,    vs32,   alpha_vr
+       xvmulsp         vs1,    vs33,   alpha_vr
+       xvmulsp         vs2,    vs34,   alpha_vr
+       xvmulsp         vs3,    vs35,   alpha_vr
+
+#else
+
+       xvmaddasp       vs0,    vs32,   alpha_vr
+       xvmaddasp       vs1,    vs33,   alpha_vr
+       xvmaddasp       vs2,    vs34,   alpha_vr
+       xvmaddasp       vs3,    vs35,   alpha_vr
+
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+       stxvw4x         vs2,    o32,    T1
+       stxvw4x         vs3,    o48,    T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+       lxvw4x          vs2,    o32,    T1
+       lxvw4x          vs3,    o48,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xvmulsp         vs0,    vs36,   alpha_vr
+       xvmulsp         vs1,    vs37,   alpha_vr
+       xvmulsp         vs2,    vs38,   alpha_vr
+       xvmulsp         vs3,    vs39,   alpha_vr
+
+#else
+
+       xvmaddasp       vs0,    vs36,   alpha_vr
+       xvmaddasp       vs1,    vs37,   alpha_vr
+       xvmaddasp       vs2,    vs38,   alpha_vr
+       xvmaddasp       vs3,    vs39,   alpha_vr
+
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+       stxvw4x         vs2,    o32,    T1
+       stxvw4x         vs3,    o48,    T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+       lxvw4x          vs2,    o32,    T1
+       lxvw4x          vs3,    o48,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xvmulsp         vs0,    vs40,   alpha_vr
+       xvmulsp         vs1,    vs41,   alpha_vr
+       xvmulsp         vs2,    vs42,   alpha_vr
+       xvmulsp         vs3,    vs43,   alpha_vr
+
+#else
+
+       xvmaddasp       vs0,    vs40,   alpha_vr
+       xvmaddasp       vs1,    vs41,   alpha_vr
+       xvmaddasp       vs2,    vs42,   alpha_vr
+       xvmaddasp       vs3,    vs43,   alpha_vr
+
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+       stxvw4x         vs2,    o32,    T1
+       stxvw4x         vs3,    o48,    T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+       lxvw4x          vs2,    o32,    T1
+       lxvw4x          vs3,    o48,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xvmulsp         vs0,    vs44,   alpha_vr
+       xvmulsp         vs1,    vs45,   alpha_vr
+       xvmulsp         vs2,    vs46,   alpha_vr
+       xvmulsp         vs3,    vs47,   alpha_vr
+
+#else
+
+       xvmaddasp       vs0,    vs44,   alpha_vr
+       xvmaddasp       vs1,    vs45,   alpha_vr
+       xvmaddasp       vs2,    vs46,   alpha_vr
+       xvmaddasp       vs3,    vs47,   alpha_vr
+
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+       stxvw4x         vs2,    o32,    T1
+       stxvw4x         vs3,    o48,    T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+       lxvw4x          vs2,    o32,    T1
+       lxvw4x          vs3,    o48,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xvmulsp         vs0,    vs48,   alpha_vr
+       xvmulsp         vs1,    vs49,   alpha_vr
+       xvmulsp         vs2,    vs50,   alpha_vr
+       xvmulsp         vs3,    vs51,   alpha_vr
+
+#else
+
+       xvmaddasp       vs0,    vs48,   alpha_vr
+       xvmaddasp       vs1,    vs49,   alpha_vr
+       xvmaddasp       vs2,    vs50,   alpha_vr
+       xvmaddasp       vs3,    vs51,   alpha_vr
+
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+       stxvw4x         vs2,    o32,    T1
+       stxvw4x         vs3,    o48,    T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+       lxvw4x          vs2,    o32,    T1
+       lxvw4x          vs3,    o48,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xvmulsp         vs0,    vs52,   alpha_vr
+       xvmulsp         vs1,    vs53,   alpha_vr
+       xvmulsp         vs2,    vs54,   alpha_vr
+       xvmulsp         vs3,    vs55,   alpha_vr
+
+#else
+
+       xvmaddasp       vs0,    vs52,   alpha_vr
+       xvmaddasp       vs1,    vs53,   alpha_vr
+       xvmaddasp       vs2,    vs54,   alpha_vr
+       xvmaddasp       vs3,    vs55,   alpha_vr
+
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+       stxvw4x         vs2,    o32,    T1
+       stxvw4x         vs3,    o48,    T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+       lxvw4x          vs2,    o32,    T1
+       lxvw4x          vs3,    o48,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xvmulsp         vs0,    vs56,   alpha_vr
+       xvmulsp         vs1,    vs57,   alpha_vr
+       xvmulsp         vs2,    vs58,   alpha_vr
+       xvmulsp         vs3,    vs59,   alpha_vr
+
+#else
+
+       xvmaddasp       vs0,    vs56,   alpha_vr
+       xvmaddasp       vs1,    vs57,   alpha_vr
+       xvmaddasp       vs2,    vs58,   alpha_vr
+       xvmaddasp       vs3,    vs59,   alpha_vr
+
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+       stxvw4x         vs2,    o32,    T1
+       stxvw4x         vs3,    o48,    T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+       lxvw4x          vs2,    o32,    T1
+       lxvw4x          vs3,    o48,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xvmulsp         vs0,    vs60,   alpha_vr
+       xvmulsp         vs1,    vs61,   alpha_vr
+       xvmulsp         vs2,    vs62,   alpha_vr
+       xvmulsp         vs3,    vs63,   alpha_vr
+
+#else
+
+       xvmaddasp       vs0,    vs60,   alpha_vr
+       xvmaddasp       vs1,    vs61,   alpha_vr
+       xvmaddasp       vs2,    vs62,   alpha_vr
+       xvmaddasp       vs3,    vs63,   alpha_vr
+
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+       stxvw4x         vs2,    o32,    T1
+       stxvw4x         vs3,    o48,    T1
+
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     64
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=8 and M=8
+**********************************************************************************************/
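+
+/* Same pipeline as the 16x8 case, but with only two A vectors per
+ * iteration (vs0-vs1 / vs4-vs5) and the 8x8 tile of C accumulated in
+ * vs32-vs47. */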
+
+.macro LOAD8x8_1
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+       xxspltw         vs10,   vs28,   2
+       xxspltw         vs11,   vs28,   3
+
+       lxvw4x          vs29,   o16,    BO
+
+       xxspltw         vs12,   vs29,   0
+       xxspltw         vs13,   vs29,   1
+       xxspltw         vs14,   vs29,   2
+       xxspltw         vs15,   vs29,   3
+
+       addi            BO,     BO,     32
+
+.endm
+
+.macro KERNEL8x8_I1
+
+
+       lxvw4x          vs4,    o0,     AO
+       lxvw4x          vs5,    o16,    AO
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs16,   vs28,   0
+       xxspltw         vs17,   vs28,   1
+       xxspltw         vs18,   vs28,   2
+       xxspltw         vs19,   vs28,   3
+
+       lxvw4x          vs29,   o16,    BO
+
+       xxspltw         vs20,   vs29,   0
+       xxspltw         vs21,   vs29,   1
+       xxspltw         vs22,   vs29,   2
+       xxspltw         vs23,   vs29,   3
+
+       addi            BO,     BO,     32
+
+
+       xvmulsp         vs32,   vs0,    vs8
+       xvmulsp         vs33,   vs1,    vs8
+
+       xvmulsp         vs34,   vs0,    vs9
+       xvmulsp         vs35,   vs1,    vs9
+
+       xvmulsp         vs36,   vs0,    vs10
+       xvmulsp         vs37,   vs1,    vs10
+
+       xvmulsp         vs38,   vs0,    vs11
+       xvmulsp         vs39,   vs1,    vs11
+
+       xvmulsp         vs40,   vs0,    vs12
+       xvmulsp         vs41,   vs1,    vs12
+
+       xvmulsp         vs42,   vs0,    vs13
+       xvmulsp         vs43,   vs1,    vs13
+
+       xvmulsp         vs44,   vs0,    vs14
+       xvmulsp         vs45,   vs1,    vs14
+
+       xvmulsp         vs46,   vs0,    vs15
+       xvmulsp         vs47,   vs1,    vs15
+
+
+.endm
+
+.macro KERNEL8x8_1
+
+
+       lxvw4x          vs4,    o0,     AO
+       lxvw4x          vs5,    o16,    AO
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs16,   vs28,   0
+       xxspltw         vs17,   vs28,   1
+       xxspltw         vs18,   vs28,   2
+       xxspltw         vs19,   vs28,   3
+
+       lxvw4x          vs29,   o16,    BO
+
+       xxspltw         vs20,   vs29,   0
+       xxspltw         vs21,   vs29,   1
+       xxspltw         vs22,   vs29,   2
+       xxspltw         vs23,   vs29,   3
+
+       addi            BO,     BO,     32
+
+
+       xvmaddasp       vs32,   vs0,    vs8
+       xvmaddasp       vs33,   vs1,    vs8
+
+       xvmaddasp       vs34,   vs0,    vs9
+       xvmaddasp       vs35,   vs1,    vs9
+
+       xvmaddasp       vs36,   vs0,    vs10
+       xvmaddasp       vs37,   vs1,    vs10
+
+       xvmaddasp       vs38,   vs0,    vs11
+       xvmaddasp       vs39,   vs1,    vs11
+
+       xvmaddasp       vs40,   vs0,    vs12
+       xvmaddasp       vs41,   vs1,    vs12
+
+       xvmaddasp       vs42,   vs0,    vs13
+       xvmaddasp       vs43,   vs1,    vs13
+
+       xvmaddasp       vs44,   vs0,    vs14
+       xvmaddasp       vs45,   vs1,    vs14
+
+       xvmaddasp       vs46,   vs0,    vs15
+       xvmaddasp       vs47,   vs1,    vs15
+
+
+.endm
+
+.macro KERNEL8x8_2
+
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+       xxspltw         vs10,   vs28,   2
+       xxspltw         vs11,   vs28,   3
+
+       lxvw4x          vs29,   o16,    BO
+
+       xxspltw         vs12,   vs29,   0
+       xxspltw         vs13,   vs29,   1
+       xxspltw         vs14,   vs29,   2
+       xxspltw         vs15,   vs29,   3
+
+       addi            BO,     BO,     32
+
+
+       xvmaddasp       vs32,   vs4,    vs16
+       xvmaddasp       vs33,   vs5,    vs16
+
+       xvmaddasp       vs34,   vs4,    vs17
+       xvmaddasp       vs35,   vs5,    vs17
+
+       xvmaddasp       vs36,   vs4,    vs18
+       xvmaddasp       vs37,   vs5,    vs18
+
+       xvmaddasp       vs38,   vs4,    vs19
+       xvmaddasp       vs39,   vs5,    vs19
+
+       xvmaddasp       vs40,   vs4,    vs20
+       xvmaddasp       vs41,   vs5,    vs20
+
+       xvmaddasp       vs42,   vs4,    vs21
+       xvmaddasp       vs43,   vs5,    vs21
+
+       xvmaddasp       vs44,   vs4,    vs22
+       xvmaddasp       vs45,   vs5,    vs22
+
+       xvmaddasp       vs46,   vs4,    vs23
+       xvmaddasp       vs47,   vs5,    vs23
+
+
+.endm
+
+.macro KERNEL8x8_E2
+
+
+       xvmaddasp       vs32,   vs4,    vs16
+       xvmaddasp       vs33,   vs5,    vs16
+
+       xvmaddasp       vs34,   vs4,    vs17
+       xvmaddasp       vs35,   vs5,    vs17
+
+       xvmaddasp       vs36,   vs4,    vs18
+       xvmaddasp       vs37,   vs5,    vs18
+
+       xvmaddasp       vs38,   vs4,    vs19
+       xvmaddasp       vs39,   vs5,    vs19
+
+       xvmaddasp       vs40,   vs4,    vs20
+       xvmaddasp       vs41,   vs5,    vs20
+
+       xvmaddasp       vs42,   vs4,    vs21
+       xvmaddasp       vs43,   vs5,    vs21
+
+       xvmaddasp       vs44,   vs4,    vs22
+       xvmaddasp       vs45,   vs5,    vs22
+
+       xvmaddasp       vs46,   vs4,    vs23
+       xvmaddasp       vs47,   vs5,    vs23
+
+
+.endm
+
+.macro KERNEL8x8_SUBI1
+
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+       xxspltw         vs10,   vs28,   2
+       xxspltw         vs11,   vs28,   3
+
+       lxvw4x          vs29,   o16,    BO
+
+       xxspltw         vs12,   vs29,   0
+       xxspltw         vs13,   vs29,   1
+       xxspltw         vs14,   vs29,   2
+       xxspltw         vs15,   vs29,   3
+
+       addi            BO,     BO,     32
+
+
+       xvmulsp         vs32,   vs0,    vs8
+       xvmulsp         vs33,   vs1,    vs8
+
+       xvmulsp         vs34,   vs0,    vs9
+       xvmulsp         vs35,   vs1,    vs9
+
+       xvmulsp         vs36,   vs0,    vs10
+       xvmulsp         vs37,   vs1,    vs10
+
+       xvmulsp         vs38,   vs0,    vs11
+       xvmulsp         vs39,   vs1,    vs11
+
+       xvmulsp         vs40,   vs0,    vs12
+       xvmulsp         vs41,   vs1,    vs12
+
+       xvmulsp         vs42,   vs0,    vs13
+       xvmulsp         vs43,   vs1,    vs13
+
+       xvmulsp         vs44,   vs0,    vs14
+       xvmulsp         vs45,   vs1,    vs14
+
+       xvmulsp         vs46,   vs0,    vs15
+       xvmulsp         vs47,   vs1,    vs15
+
+
+.endm
+
+.macro KERNEL8x8_SUB1
+
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+       xxspltw         vs10,   vs28,   2
+       xxspltw         vs11,   vs28,   3
+
+       lxvw4x          vs29,   o16,    BO
+
+       xxspltw         vs12,   vs29,   0
+       xxspltw         vs13,   vs29,   1
+       xxspltw         vs14,   vs29,   2
+       xxspltw         vs15,   vs29,   3
+
+       addi            BO,     BO,     32
+
+
+       xvmaddasp       vs32,   vs0,    vs8
+       xvmaddasp       vs33,   vs1,    vs8
+
+       xvmaddasp       vs34,   vs0,    vs9
+       xvmaddasp       vs35,   vs1,    vs9
+
+       xvmaddasp       vs36,   vs0,    vs10
+       xvmaddasp       vs37,   vs1,    vs10
+
+       xvmaddasp       vs38,   vs0,    vs11
+       xvmaddasp       vs39,   vs1,    vs11
+
+       xvmaddasp       vs40,   vs0,    vs12
+       xvmaddasp       vs41,   vs1,    vs12
+
+       xvmaddasp       vs42,   vs0,    vs13
+       xvmaddasp       vs43,   vs1,    vs13
+
+       xvmaddasp       vs44,   vs0,    vs14
+       xvmaddasp       vs45,   vs1,    vs14
+
+       xvmaddasp       vs46,   vs0,    vs15
+       xvmaddasp       vs47,   vs1,    vs15
+
+
+.endm
+
+.macro SAVE8x8
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xvmulsp         vs0,    vs32,   alpha_vr
+       xvmulsp         vs1,    vs33,   alpha_vr
+
+#else
+
+       xvmaddasp       vs0,    vs32,   alpha_vr
+       xvmaddasp       vs1,    vs33,   alpha_vr
+
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xvmulsp         vs0,    vs34,   alpha_vr
+       xvmulsp         vs1,    vs35,   alpha_vr
+
+#else
+
+       xvmaddasp       vs0,    vs34,   alpha_vr
+       xvmaddasp       vs1,    vs35,   alpha_vr
+
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xvmulsp         vs0,    vs36,   alpha_vr
+       xvmulsp         vs1,    vs37,   alpha_vr
+
+#else
+
+       xvmaddasp       vs0,    vs36,   alpha_vr
+       xvmaddasp       vs1,    vs37,   alpha_vr
+
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xvmulsp         vs0,    vs38,   alpha_vr
+       xvmulsp         vs1,    vs39,   alpha_vr
+
+#else
+
+       xvmaddasp       vs0,    vs38,   alpha_vr
+       xvmaddasp       vs1,    vs39,   alpha_vr
+
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xvmulsp         vs0,    vs40,   alpha_vr
+       xvmulsp         vs1,    vs41,   alpha_vr
+
+#else
+
+       xvmaddasp       vs0,    vs40,   alpha_vr
+       xvmaddasp       vs1,    vs41,   alpha_vr
+
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xvmulsp         vs0,    vs42,   alpha_vr
+       xvmulsp         vs1,    vs43,   alpha_vr
+
+#else
+
+       xvmaddasp       vs0,    vs42,   alpha_vr
+       xvmaddasp       vs1,    vs43,   alpha_vr
+
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xvmulsp         vs0,    vs44,   alpha_vr
+       xvmulsp         vs1,    vs45,   alpha_vr
+
+#else
+
+       xvmaddasp       vs0,    vs44,   alpha_vr
+       xvmaddasp       vs1,    vs45,   alpha_vr
+
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xvmulsp         vs0,    vs46,   alpha_vr
+       xvmulsp         vs1,    vs47,   alpha_vr
+
+#else
+
+       xvmaddasp       vs0,    vs46,   alpha_vr
+       xvmaddasp       vs1,    vs47,   alpha_vr
+
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     32
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=8 and M=4
+**********************************************************************************************/
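+
+/* One A vector per iteration (vs0 / vs4); the 4x8 tile of C is
+ * accumulated in vs32-vs39. */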
+
+.macro LOAD8x4_1
+
+       lxvw4x          vs0,    o0,     AO
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+       xxspltw         vs10,   vs28,   2
+       xxspltw         vs11,   vs28,   3
+
+       lxvw4x          vs29,   o16,    BO
+
+       xxspltw         vs12,   vs29,   0
+       xxspltw         vs13,   vs29,   1
+       xxspltw         vs14,   vs29,   2
+       xxspltw         vs15,   vs29,   3
+
+       addi            BO,     BO,     32
+
+.endm
+
+.macro KERNEL8x4_I1
+
+
+       lxvw4x          vs4,    o0,     AO
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs16,   vs28,   0
+       xxspltw         vs17,   vs28,   1
+       xxspltw         vs18,   vs28,   2
+       xxspltw         vs19,   vs28,   3
+
+       lxvw4x          vs29,   o16,    BO
+
+       xxspltw         vs20,   vs29,   0
+       xxspltw         vs21,   vs29,   1
+       xxspltw         vs22,   vs29,   2
+       xxspltw         vs23,   vs29,   3
+
+       addi            BO,     BO,     32
+
+
+       xvmulsp         vs32,   vs0,    vs8
+
+       xvmulsp         vs33,   vs0,    vs9
+
+       xvmulsp         vs34,   vs0,    vs10
+
+       xvmulsp         vs35,   vs0,    vs11
+
+       xvmulsp         vs36,   vs0,    vs12
+
+       xvmulsp         vs37,   vs0,    vs13
+
+       xvmulsp         vs38,   vs0,    vs14
+
+       xvmulsp         vs39,   vs0,    vs15
+
+
+.endm
+
+.macro KERNEL8x4_1
+
+
+       lxvw4x          vs4,    o0,     AO
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs16,   vs28,   0
+       xxspltw         vs17,   vs28,   1
+       xxspltw         vs18,   vs28,   2
+       xxspltw         vs19,   vs28,   3
+
+       lxvw4x          vs29,   o16,    BO
+
+       xxspltw         vs20,   vs29,   0
+       xxspltw         vs21,   vs29,   1
+       xxspltw         vs22,   vs29,   2
+       xxspltw         vs23,   vs29,   3
+
+       addi            BO,     BO,     32
+
+
+       xvmaddasp       vs32,   vs0,    vs8
+
+       xvmaddasp       vs33,   vs0,    vs9
+
+       xvmaddasp       vs34,   vs0,    vs10
+
+       xvmaddasp       vs35,   vs0,    vs11
+
+       xvmaddasp       vs36,   vs0,    vs12
+
+       xvmaddasp       vs37,   vs0,    vs13
+
+       xvmaddasp       vs38,   vs0,    vs14
+
+       xvmaddasp       vs39,   vs0,    vs15
+
+
+.endm
+
+.macro KERNEL8x4_2
+
+
+       lxvw4x          vs0,    o0,     AO
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+       xxspltw         vs10,   vs28,   2
+       xxspltw         vs11,   vs28,   3
+
+       lxvw4x          vs29,   o16,    BO
+
+       xxspltw         vs12,   vs29,   0
+       xxspltw         vs13,   vs29,   1
+       xxspltw         vs14,   vs29,   2
+       xxspltw         vs15,   vs29,   3
+
+       addi            BO,     BO,     32
+
+
+       xvmaddasp       vs32,   vs4,    vs16
+
+       xvmaddasp       vs33,   vs4,    vs17
+
+       xvmaddasp       vs34,   vs4,    vs18
+
+       xvmaddasp       vs35,   vs4,    vs19
+
+       xvmaddasp       vs36,   vs4,    vs20
+
+       xvmaddasp       vs37,   vs4,    vs21
+
+       xvmaddasp       vs38,   vs4,    vs22
+
+       xvmaddasp       vs39,   vs4,    vs23
+
+
+.endm
+
+.macro KERNEL8x4_E2
+
+
+       xvmaddasp       vs32,   vs4,    vs16
+
+       xvmaddasp       vs33,   vs4,    vs17
+
+       xvmaddasp       vs34,   vs4,    vs18
+
+       xvmaddasp       vs35,   vs4,    vs19
+
+       xvmaddasp       vs36,   vs4,    vs20
+
+       xvmaddasp       vs37,   vs4,    vs21
+
+       xvmaddasp       vs38,   vs4,    vs22
+
+       xvmaddasp       vs39,   vs4,    vs23
+
+
+.endm
+
+.macro KERNEL8x4_SUBI1
+
+
+       lxvw4x          vs0,    o0,     AO
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+       xxspltw         vs10,   vs28,   2
+       xxspltw         vs11,   vs28,   3
+
+       lxvw4x          vs29,   o16,    BO
+
+       xxspltw         vs12,   vs29,   0
+       xxspltw         vs13,   vs29,   1
+       xxspltw         vs14,   vs29,   2
+       xxspltw         vs15,   vs29,   3
+
+       addi            BO,     BO,     32
+
+
+       xvmulsp         vs32,   vs0,    vs8
+
+       xvmulsp         vs33,   vs0,    vs9
+
+       xvmulsp         vs34,   vs0,    vs10
+
+       xvmulsp         vs35,   vs0,    vs11
+
+       xvmulsp         vs36,   vs0,    vs12
+
+       xvmulsp         vs37,   vs0,    vs13
+
+       xvmulsp         vs38,   vs0,    vs14
+
+       xvmulsp         vs39,   vs0,    vs15
+
+
+.endm
+
+.macro KERNEL8x4_SUB1
+
+
+       lxvw4x          vs0,    o0,     AO
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+       xxspltw         vs10,   vs28,   2
+       xxspltw         vs11,   vs28,   3
+
+       lxvw4x          vs29,   o16,    BO
+
+       xxspltw         vs12,   vs29,   0
+       xxspltw         vs13,   vs29,   1
+       xxspltw         vs14,   vs29,   2
+       xxspltw         vs15,   vs29,   3
+
+       addi            BO,     BO,     32
+
+
+       xvmaddasp       vs32,   vs0,    vs8
+
+       xvmaddasp       vs33,   vs0,    vs9
+
+       xvmaddasp       vs34,   vs0,    vs10
+
+       xvmaddasp       vs35,   vs0,    vs11
+
+       xvmaddasp       vs36,   vs0,    vs12
+
+       xvmaddasp       vs37,   vs0,    vs13
+
+       xvmaddasp       vs38,   vs0,    vs14
+
+       xvmaddasp       vs39,   vs0,    vs15
+
+
+.endm
+
+.macro SAVE8x4
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xvmulsp         vs0,    vs32,   alpha_vr
+
+#else
+
+       xvmaddasp       vs0,    vs32,   alpha_vr
+
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xvmulsp         vs0,    vs33,   alpha_vr
+
+#else
+
+       xvmaddasp       vs0,    vs33,   alpha_vr
+
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xvmulsp         vs0,    vs34,   alpha_vr
+
+#else
+
+       xvmaddasp       vs0,    vs34,   alpha_vr
+
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xvmulsp         vs0,    vs35,   alpha_vr
+
+#else
+
+       xvmaddasp       vs0,    vs35,   alpha_vr
+
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xvmulsp         vs0,    vs36,   alpha_vr
+
+#else
+
+       xvmaddasp       vs0,    vs36,   alpha_vr
+
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xvmulsp         vs0,    vs37,   alpha_vr
+
+#else
+
+       xvmaddasp       vs0,    vs37,   alpha_vr
+
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xvmulsp         vs0,    vs38,   alpha_vr
+
+#else
+
+       xvmaddasp       vs0,    vs38,   alpha_vr
+
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xvmulsp         vs0,    vs39,   alpha_vr
+
+#else
+
+       xvmaddasp       vs0,    vs39,   alpha_vr
+
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     16
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=8 and M=2
+**********************************************************************************************/
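+
+/* Only two A elements remain, so the loads switch to the scalar lxsspx
+ * form and the updates to xsmulsp/xsmaddasp; the 2x8 tile is accumulated
+ * in vs32-vs47 and scaled with the scalar alpha_r in SAVE8x2. */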
+
+.macro LOAD8x2_1
+
+       lxsspx          vs0,    o0,     AO
+       lxsspx          vs1,    o4,     AO
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+       lxsspx          vs9,    o4,     T1
+       lxsspx          vs10,   o8,     T1
+       lxsspx          vs11,   o12,    T1
+
+       addi            T1,     T1,     16
+
+       lxsspx          vs12,   o0,     T1
+       lxsspx          vs13,   o4,     T1
+       lxsspx          vs14,   o8,     T1
+       lxsspx          vs15,   o12,    T1
+
+       addi            BO,     BO,     32
+
+.endm
+
+.macro KERNEL8x2_I1
+
+
+       lxsspx          vs4,    o0,     AO
+       lxsspx          vs5,    o4,     AO
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs16,   o0,     T1
+       lxsspx          vs17,   o4,     T1
+       lxsspx          vs18,   o8,     T1
+       lxsspx          vs19,   o12,    T1
+
+       addi            T1,     T1,     16
+
+       lxsspx          vs20,   o0,     T1
+       lxsspx          vs21,   o4,     T1
+       lxsspx          vs22,   o8,     T1
+       lxsspx          vs23,   o12,    T1
+
+       addi            BO,     BO,     32
+
+
+       xsmulsp         vs32,   vs0,    vs8
+       xsmulsp         vs33,   vs1,    vs8
+
+       xsmulsp         vs34,   vs0,    vs9
+       xsmulsp         vs35,   vs1,    vs9
+
+       xsmulsp         vs36,   vs0,    vs10
+       xsmulsp         vs37,   vs1,    vs10
+
+       xsmulsp         vs38,   vs0,    vs11
+       xsmulsp         vs39,   vs1,    vs11
+
+       xsmulsp         vs40,   vs0,    vs12
+       xsmulsp         vs41,   vs1,    vs12
+
+       xsmulsp         vs42,   vs0,    vs13
+       xsmulsp         vs43,   vs1,    vs13
+
+       xsmulsp         vs44,   vs0,    vs14
+       xsmulsp         vs45,   vs1,    vs14
+
+       xsmulsp         vs46,   vs0,    vs15
+       xsmulsp         vs47,   vs1,    vs15
+
+
+.endm
+
+.macro KERNEL8x2_1
+
+
+       lxsspx          vs4,    o0,     AO
+       lxsspx          vs5,    o4,     AO
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs16,   o0,     T1
+       lxsspx          vs17,   o4,     T1
+       lxsspx          vs18,   o8,     T1
+       lxsspx          vs19,   o12,    T1
+
+       addi            T1,     T1,     16
+
+       lxsspx          vs20,   o0,     T1
+       lxsspx          vs21,   o4,     T1
+       lxsspx          vs22,   o8,     T1
+       lxsspx          vs23,   o12,    T1
+
+       addi            BO,     BO,     32
+
+
+       xsmaddasp       vs32,   vs0,    vs8
+       xsmaddasp       vs33,   vs1,    vs8
+
+       xsmaddasp       vs34,   vs0,    vs9
+       xsmaddasp       vs35,   vs1,    vs9
+
+       xsmaddasp       vs36,   vs0,    vs10
+       xsmaddasp       vs37,   vs1,    vs10
+
+       xsmaddasp       vs38,   vs0,    vs11
+       xsmaddasp       vs39,   vs1,    vs11
+
+       xsmaddasp       vs40,   vs0,    vs12
+       xsmaddasp       vs41,   vs1,    vs12
+
+       xsmaddasp       vs42,   vs0,    vs13
+       xsmaddasp       vs43,   vs1,    vs13
+
+       xsmaddasp       vs44,   vs0,    vs14
+       xsmaddasp       vs45,   vs1,    vs14
+
+       xsmaddasp       vs46,   vs0,    vs15
+       xsmaddasp       vs47,   vs1,    vs15
+
+
+.endm
+
+.macro KERNEL8x2_2
+
+
+       lxsspx          vs0,    o0,     AO
+       lxsspx          vs1,    o4,     AO
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+       lxsspx          vs9,    o4,     T1
+       lxsspx          vs10,   o8,     T1
+       lxsspx          vs11,   o12,    T1
+
+       addi            T1,     T1,     16
+
+       lxsspx          vs12,   o0,     T1
+       lxsspx          vs13,   o4,     T1
+       lxsspx          vs14,   o8,     T1
+       lxsspx          vs15,   o12,    T1
+
+       addi            BO,     BO,     32
+
+
+       xsmaddasp       vs32,   vs4,    vs16
+       xsmaddasp       vs33,   vs5,    vs16
+
+       xsmaddasp       vs34,   vs4,    vs17
+       xsmaddasp       vs35,   vs5,    vs17
+
+       xsmaddasp       vs36,   vs4,    vs18
+       xsmaddasp       vs37,   vs5,    vs18
+
+       xsmaddasp       vs38,   vs4,    vs19
+       xsmaddasp       vs39,   vs5,    vs19
+
+       xsmaddasp       vs40,   vs4,    vs20
+       xsmaddasp       vs41,   vs5,    vs20
+
+       xsmaddasp       vs42,   vs4,    vs21
+       xsmaddasp       vs43,   vs5,    vs21
+
+       xsmaddasp       vs44,   vs4,    vs22
+       xsmaddasp       vs45,   vs5,    vs22
+
+       xsmaddasp       vs46,   vs4,    vs23
+       xsmaddasp       vs47,   vs5,    vs23
+
+
+.endm
+
+.macro KERNEL8x2_E2
+
+
+       xsmaddasp       vs32,   vs4,    vs16
+       xsmaddasp       vs33,   vs5,    vs16
+
+       xsmaddasp       vs34,   vs4,    vs17
+       xsmaddasp       vs35,   vs5,    vs17
+
+       xsmaddasp       vs36,   vs4,    vs18
+       xsmaddasp       vs37,   vs5,    vs18
+
+       xsmaddasp       vs38,   vs4,    vs19
+       xsmaddasp       vs39,   vs5,    vs19
+
+       xsmaddasp       vs40,   vs4,    vs20
+       xsmaddasp       vs41,   vs5,    vs20
+
+       xsmaddasp       vs42,   vs4,    vs21
+       xsmaddasp       vs43,   vs5,    vs21
+
+       xsmaddasp       vs44,   vs4,    vs22
+       xsmaddasp       vs45,   vs5,    vs22
+
+       xsmaddasp       vs46,   vs4,    vs23
+       xsmaddasp       vs47,   vs5,    vs23
+
+
+.endm
+
+.macro KERNEL8x2_SUBI1
+
+
+       lxsspx          vs0,    o0,     AO
+       lxsspx          vs1,    o4,     AO
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+       lxsspx          vs9,    o4,     T1
+       lxsspx          vs10,   o8,     T1
+       lxsspx          vs11,   o12,    T1
+
+       addi            T1,     T1,     16
+
+       lxsspx          vs12,   o0,     T1
+       lxsspx          vs13,   o4,     T1
+       lxsspx          vs14,   o8,     T1
+       lxsspx          vs15,   o12,    T1
+
+       addi            BO,     BO,     32
+
+
+       xsmulsp         vs32,   vs0,    vs8
+       xsmulsp         vs33,   vs1,    vs8
+
+       xsmulsp         vs34,   vs0,    vs9
+       xsmulsp         vs35,   vs1,    vs9
+
+       xsmulsp         vs36,   vs0,    vs10
+       xsmulsp         vs37,   vs1,    vs10
+
+       xsmulsp         vs38,   vs0,    vs11
+       xsmulsp         vs39,   vs1,    vs11
+
+       xsmulsp         vs40,   vs0,    vs12
+       xsmulsp         vs41,   vs1,    vs12
+
+       xsmulsp         vs42,   vs0,    vs13
+       xsmulsp         vs43,   vs1,    vs13
+
+       xsmulsp         vs44,   vs0,    vs14
+       xsmulsp         vs45,   vs1,    vs14
+
+       xsmulsp         vs46,   vs0,    vs15
+       xsmulsp         vs47,   vs1,    vs15
+
+
+.endm
+
+.macro KERNEL8x2_SUB1
+
+
+       lxsspx          vs0,    o0,     AO
+       lxsspx          vs1,    o4,     AO
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+       lxsspx          vs9,    o4,     T1
+       lxsspx          vs10,   o8,     T1
+       lxsspx          vs11,   o12,    T1
+
+       addi            T1,     T1,     16
+
+       lxsspx          vs12,   o0,     T1
+       lxsspx          vs13,   o4,     T1
+       lxsspx          vs14,   o8,     T1
+       lxsspx          vs15,   o12,    T1
+
+       addi            BO,     BO,     32
+
+
+       xsmaddasp       vs32,   vs0,    vs8
+       xsmaddasp       vs33,   vs1,    vs8
+
+       xsmaddasp       vs34,   vs0,    vs9
+       xsmaddasp       vs35,   vs1,    vs9
+
+       xsmaddasp       vs36,   vs0,    vs10
+       xsmaddasp       vs37,   vs1,    vs10
+
+       xsmaddasp       vs38,   vs0,    vs11
+       xsmaddasp       vs39,   vs1,    vs11
+
+       xsmaddasp       vs40,   vs0,    vs12
+       xsmaddasp       vs41,   vs1,    vs12
+
+       xsmaddasp       vs42,   vs0,    vs13
+       xsmaddasp       vs43,   vs1,    vs13
+
+       xsmaddasp       vs44,   vs0,    vs14
+       xsmaddasp       vs45,   vs1,    vs14
+
+       xsmaddasp       vs46,   vs0,    vs15
+       xsmaddasp       vs47,   vs1,    vs15
+
+
+.endm
+
+.macro SAVE8x2
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+       lxsspx          vs1,    o4,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xsmulsp         vs0,    vs32,   alpha_r
+       xsmulsp         vs1,    vs33,   alpha_r
+
+#else
+
+       xsmaddasp       vs0,    vs32,   alpha_r
+       xsmaddasp       vs1,    vs33,   alpha_r
+
+#endif
+
+       stxsspx         vs0,    o0,     T1
+       stxsspx         vs1,    o4,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+       lxsspx          vs1,    o4,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xsmulsp         vs0,    vs34,   alpha_r
+       xsmulsp         vs1,    vs35,   alpha_r
+
+#else
+
+       xsmaddasp       vs0,    vs34,   alpha_r
+       xsmaddasp       vs1,    vs35,   alpha_r
+
+#endif
+
+       stxsspx         vs0,    o0,     T1
+       stxsspx         vs1,    o4,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+       lxsspx          vs1,    o4,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xsmulsp         vs0,    vs36,   alpha_r
+       xsmulsp         vs1,    vs37,   alpha_r
+
+#else
+
+       xsmaddasp       vs0,    vs36,   alpha_r
+       xsmaddasp       vs1,    vs37,   alpha_r
+
+#endif
+
+       stxsspx         vs0,    o0,     T1
+       stxsspx         vs1,    o4,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+       lxsspx          vs1,    o4,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xsmulsp         vs0,    vs38,   alpha_r
+       xsmulsp         vs1,    vs39,   alpha_r
+
+#else
+
+       xsmaddasp       vs0,    vs38,   alpha_r
+       xsmaddasp       vs1,    vs39,   alpha_r
+
+#endif
+
+       stxsspx         vs0,    o0,     T1
+       stxsspx         vs1,    o4,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+       lxsspx          vs1,    o4,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xsmulsp         vs0,    vs40,   alpha_r
+       xsmulsp         vs1,    vs41,   alpha_r
+
+#else
+
+       xsmaddasp       vs0,    vs40,   alpha_r
+       xsmaddasp       vs1,    vs41,   alpha_r
+
+#endif
+
+       stxsspx         vs0,    o0,     T1
+       stxsspx         vs1,    o4,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+       lxsspx          vs1,    o4,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xsmulsp         vs0,    vs42,   alpha_r
+       xsmulsp         vs1,    vs43,   alpha_r
+
+#else
+
+       xsmaddasp       vs0,    vs42,   alpha_r
+       xsmaddasp       vs1,    vs43,   alpha_r
+
+#endif
+
+       stxsspx         vs0,    o0,     T1
+       stxsspx         vs1,    o4,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+       lxsspx          vs1,    o4,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xsmulsp         vs0,    vs44,   alpha_r
+       xsmulsp         vs1,    vs45,   alpha_r
+
+#else
+
+       xsmaddasp       vs0,    vs44,   alpha_r
+       xsmaddasp       vs1,    vs45,   alpha_r
+
+#endif
+
+       stxsspx         vs0,    o0,     T1
+       stxsspx         vs1,    o4,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+       lxsspx          vs1,    o4,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xsmulsp         vs0,    vs46,   alpha_r
+       xsmulsp         vs1,    vs47,   alpha_r
+
+#else
+
+       xsmaddasp       vs0,    vs46,   alpha_r
+       xsmaddasp       vs1,    vs47,   alpha_r
+
+#endif
+
+       stxsspx         vs0,    o0,     T1
+       stxsspx         vs1,    o4,     T1
+
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     8
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=8 and M=1
+**********************************************************************************************/
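+/* Scalar path: one float of A per step is combined with eight B values.
+ * LOAD8x1_1 preloads vs0 and vs8-vs15; KERNEL8x1_1/KERNEL8x1_2 ping-pong
+ * between the vs0/vs8-15 and vs4/vs16-23 register sets for pipelining,
+ * _I1 starts the vs32-vs39 accumulators with xsmulsp, _E2 drains the last
+ * preloaded values, and the SUBI1/SUB1 variants provide single,
+ * non-pipelined steps for the remaining iterations of the k loop.
+ */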
+
+.macro LOAD8x1_1
+
+       lxsspx          vs0,    o0,     AO
+
+       addi            AO,     AO,     4
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+       lxsspx          vs9,    o4,     T1
+       lxsspx          vs10,   o8,     T1
+       lxsspx          vs11,   o12,    T1
+
+       addi            T1,     T1,     16
+
+       lxsspx          vs12,   o0,     T1
+       lxsspx          vs13,   o4,     T1
+       lxsspx          vs14,   o8,     T1
+       lxsspx          vs15,   o12,    T1
+
+       addi            BO,     BO,     32
+
+.endm
+
+.macro KERNEL8x1_I1
+
+
+       lxsspx          vs4,    o0,     AO
+
+       addi            AO,     AO,     4
+
+       mr              T1,     BO
+
+       lxsspx          vs16,   o0,     T1
+       lxsspx          vs17,   o4,     T1
+       lxsspx          vs18,   o8,     T1
+       lxsspx          vs19,   o12,    T1
+
+       addi            T1,     T1,     16
+
+       lxsspx          vs20,   o0,     T1
+       lxsspx          vs21,   o4,     T1
+       lxsspx          vs22,   o8,     T1
+       lxsspx          vs23,   o12,    T1
+
+       addi            BO,     BO,     32
+
+
+       xsmulsp         vs32,   vs0,    vs8
+
+       xsmulsp         vs33,   vs0,    vs9
+
+       xsmulsp         vs34,   vs0,    vs10
+
+       xsmulsp         vs35,   vs0,    vs11
+
+       xsmulsp         vs36,   vs0,    vs12
+
+       xsmulsp         vs37,   vs0,    vs13
+
+       xsmulsp         vs38,   vs0,    vs14
+
+       xsmulsp         vs39,   vs0,    vs15
+
+
+.endm
+
+.macro KERNEL8x1_1
+
+
+       lxsspx          vs4,    o0,     AO
+
+       addi            AO,     AO,     4
+
+       mr              T1,     BO
+
+       lxsspx          vs16,   o0,     T1
+       lxsspx          vs17,   o4,     T1
+       lxsspx          vs18,   o8,     T1
+       lxsspx          vs19,   o12,    T1
+
+       addi            T1,     T1,     16
+
+       lxsspx          vs20,   o0,     T1
+       lxsspx          vs21,   o4,     T1
+       lxsspx          vs22,   o8,     T1
+       lxsspx          vs23,   o12,    T1
+
+       addi            BO,     BO,     32
+
+
+       xsmaddasp       vs32,   vs0,    vs8
+
+       xsmaddasp       vs33,   vs0,    vs9
+
+       xsmaddasp       vs34,   vs0,    vs10
+
+       xsmaddasp       vs35,   vs0,    vs11
+
+       xsmaddasp       vs36,   vs0,    vs12
+
+       xsmaddasp       vs37,   vs0,    vs13
+
+       xsmaddasp       vs38,   vs0,    vs14
+
+       xsmaddasp       vs39,   vs0,    vs15
+
+
+.endm
+
+.macro KERNEL8x1_2
+
+
+       lxsspx          vs0,    o0,     AO
+
+       addi            AO,     AO,     4
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+       lxsspx          vs9,    o4,     T1
+       lxsspx          vs10,   o8,     T1
+       lxsspx          vs11,   o12,    T1
+
+       addi            T1,     T1,     16
+
+       lxsspx          vs12,   o0,     T1
+       lxsspx          vs13,   o4,     T1
+       lxsspx          vs14,   o8,     T1
+       lxsspx          vs15,   o12,    T1
+
+       addi            BO,     BO,     32
+
+
+       xsmaddasp       vs32,   vs4,    vs16
+
+       xsmaddasp       vs33,   vs4,    vs17
+
+       xsmaddasp       vs34,   vs4,    vs18
+
+       xsmaddasp       vs35,   vs4,    vs19
+
+       xsmaddasp       vs36,   vs4,    vs20
+
+       xsmaddasp       vs37,   vs4,    vs21
+
+       xsmaddasp       vs38,   vs4,    vs22
+
+       xsmaddasp       vs39,   vs4,    vs23
+
+
+.endm
+
+.macro KERNEL8x1_E2
+
+
+       xsmaddasp       vs32,   vs4,    vs16
+
+       xsmaddasp       vs33,   vs4,    vs17
+
+       xsmaddasp       vs34,   vs4,    vs18
+
+       xsmaddasp       vs35,   vs4,    vs19
+
+       xsmaddasp       vs36,   vs4,    vs20
+
+       xsmaddasp       vs37,   vs4,    vs21
+
+       xsmaddasp       vs38,   vs4,    vs22
+
+       xsmaddasp       vs39,   vs4,    vs23
+
+
+.endm
+
+.macro KERNEL8x1_SUBI1
+
+
+       lxsspx          vs0,    o0,     AO
+
+       addi            AO,     AO,     4
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+       lxsspx          vs9,    o4,     T1
+       lxsspx          vs10,   o8,     T1
+       lxsspx          vs11,   o12,    T1
+
+       addi            T1,     T1,     16
+
+       lxsspx          vs12,   o0,     T1
+       lxsspx          vs13,   o4,     T1
+       lxsspx          vs14,   o8,     T1
+       lxsspx          vs15,   o12,    T1
+
+       addi            BO,     BO,     32
+
+
+       xsmulsp         vs32,   vs0,    vs8
+
+       xsmulsp         vs33,   vs0,    vs9
+
+       xsmulsp         vs34,   vs0,    vs10
+
+       xsmulsp         vs35,   vs0,    vs11
+
+       xsmulsp         vs36,   vs0,    vs12
+
+       xsmulsp         vs37,   vs0,    vs13
+
+       xsmulsp         vs38,   vs0,    vs14
+
+       xsmulsp         vs39,   vs0,    vs15
+
+
+.endm
+
+.macro KERNEL8x1_SUB1
+
+
+       lxsspx          vs0,    o0,     AO
+
+       addi            AO,     AO,     4
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+       lxsspx          vs9,    o4,     T1
+       lxsspx          vs10,   o8,     T1
+       lxsspx          vs11,   o12,    T1
+
+       addi            T1,     T1,     16
+
+       lxsspx          vs12,   o0,     T1
+       lxsspx          vs13,   o4,     T1
+       lxsspx          vs14,   o8,     T1
+       lxsspx          vs15,   o12,    T1
+
+       addi            BO,     BO,     32
+
+
+       xsmaddasp       vs32,   vs0,    vs8
+
+       xsmaddasp       vs33,   vs0,    vs9
+
+       xsmaddasp       vs34,   vs0,    vs10
+
+       xsmaddasp       vs35,   vs0,    vs11
+
+       xsmaddasp       vs36,   vs0,    vs12
+
+       xsmaddasp       vs37,   vs0,    vs13
+
+       xsmaddasp       vs38,   vs0,    vs14
+
+       xsmaddasp       vs39,   vs0,    vs15
+
+
+.endm
+
+.macro SAVE8x1
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xsmulsp         vs0,    vs32,   alpha_r
+
+#else
+
+       xsmaddasp       vs0,    vs32,   alpha_r
+
+#endif
+
+       stxsspx         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xsmulsp         vs0,    vs33,   alpha_r
+
+#else
+
+       xsmaddasp       vs0,    vs33,   alpha_r
+
+#endif
+
+       stxsspx         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xsmulsp         vs0,    vs34,   alpha_r
+
+#else
+
+       xsmaddasp       vs0,    vs34,   alpha_r
+
+#endif
+
+       stxsspx         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xsmulsp         vs0,    vs35,   alpha_r
+
+#else
+
+       xsmaddasp       vs0,    vs35,   alpha_r
+
+#endif
+
+       stxsspx         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xsmulsp         vs0,    vs36,   alpha_r
+
+#else
+
+       xsmaddasp       vs0,    vs36,   alpha_r
+
+#endif
+
+       stxsspx         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xsmulsp         vs0,    vs37,   alpha_r
+
+#else
+
+       xsmaddasp       vs0,    vs37,   alpha_r
+
+#endif
+
+       stxsspx         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xsmulsp         vs0,    vs38,   alpha_r
+
+#else
+
+       xsmaddasp       vs0,    vs38,   alpha_r
+
+#endif
+
+       stxsspx         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xsmulsp         vs0,    vs39,   alpha_r
+
+#else
+
+       xsmaddasp       vs0,    vs39,   alpha_r
+
+#endif
+
+       stxsspx         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     4
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=16
+**********************************************************************************************/
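+/* Widest vector path for four B columns: four lxvw4x loads fetch 16 floats
+ * of A per step, one lxvw4x of B is splatted word-wise into vs8-vs11, and
+ * the products accumulate into vs32-vs47.  The _I1/_1/_2/_E2 and
+ * SUBI1/SUB1 variants follow the same pipelining scheme as above.
+ */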
+
+.macro LOAD4x16_1
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+       lxvw4x          vs2,    o32,    AO
+       lxvw4x          vs3,    o48,    AO
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+       xxspltw         vs10,   vs28,   2
+       xxspltw         vs11,   vs28,   3
+
+       addi            BO,     BO,     16
+
+.endm
+
+.macro KERNEL4x16_I1
+
+
+       lxvw4x          vs4,    o0,     AO
+       lxvw4x          vs5,    o16,    AO
+       lxvw4x          vs6,    o32,    AO
+       lxvw4x          vs7,    o48,    AO
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs16,   vs28,   0
+       xxspltw         vs17,   vs28,   1
+       xxspltw         vs18,   vs28,   2
+       xxspltw         vs19,   vs28,   3
+
+       addi            BO,     BO,     16
+
+
+       xvmulsp         vs32,   vs0,    vs8
+       xvmulsp         vs33,   vs1,    vs8
+       xvmulsp         vs34,   vs2,    vs8
+       xvmulsp         vs35,   vs3,    vs8
+
+       xvmulsp         vs36,   vs0,    vs9
+       xvmulsp         vs37,   vs1,    vs9
+       xvmulsp         vs38,   vs2,    vs9
+       xvmulsp         vs39,   vs3,    vs9
+
+       xvmulsp         vs40,   vs0,    vs10
+       xvmulsp         vs41,   vs1,    vs10
+       xvmulsp         vs42,   vs2,    vs10
+       xvmulsp         vs43,   vs3,    vs10
+
+       xvmulsp         vs44,   vs0,    vs11
+       xvmulsp         vs45,   vs1,    vs11
+       xvmulsp         vs46,   vs2,    vs11
+       xvmulsp         vs47,   vs3,    vs11
+
+
+.endm
+
+.macro KERNEL4x16_1
+
+
+       lxvw4x          vs4,    o0,     AO
+       lxvw4x          vs5,    o16,    AO
+       lxvw4x          vs6,    o32,    AO
+       lxvw4x          vs7,    o48,    AO
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs16,   vs28,   0
+       xxspltw         vs17,   vs28,   1
+       xxspltw         vs18,   vs28,   2
+       xxspltw         vs19,   vs28,   3
+
+       addi            BO,     BO,     16
+
+
+       xvmaddasp       vs32,   vs0,    vs8
+       xvmaddasp       vs33,   vs1,    vs8
+       xvmaddasp       vs34,   vs2,    vs8
+       xvmaddasp       vs35,   vs3,    vs8
+
+       xvmaddasp       vs36,   vs0,    vs9
+       xvmaddasp       vs37,   vs1,    vs9
+       xvmaddasp       vs38,   vs2,    vs9
+       xvmaddasp       vs39,   vs3,    vs9
+
+       xvmaddasp       vs40,   vs0,    vs10
+       xvmaddasp       vs41,   vs1,    vs10
+       xvmaddasp       vs42,   vs2,    vs10
+       xvmaddasp       vs43,   vs3,    vs10
+
+       xvmaddasp       vs44,   vs0,    vs11
+       xvmaddasp       vs45,   vs1,    vs11
+       xvmaddasp       vs46,   vs2,    vs11
+       xvmaddasp       vs47,   vs3,    vs11
+
+
+.endm
+
+.macro KERNEL4x16_2
+
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+       lxvw4x          vs2,    o32,    AO
+       lxvw4x          vs3,    o48,    AO
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+       xxspltw         vs10,   vs28,   2
+       xxspltw         vs11,   vs28,   3
+
+       addi            BO,     BO,     16
+
+
+       xvmaddasp       vs32,   vs4,    vs16
+       xvmaddasp       vs33,   vs5,    vs16
+       xvmaddasp       vs34,   vs6,    vs16
+       xvmaddasp       vs35,   vs7,    vs16
+
+       xvmaddasp       vs36,   vs4,    vs17
+       xvmaddasp       vs37,   vs5,    vs17
+       xvmaddasp       vs38,   vs6,    vs17
+       xvmaddasp       vs39,   vs7,    vs17
+
+       xvmaddasp       vs40,   vs4,    vs18
+       xvmaddasp       vs41,   vs5,    vs18
+       xvmaddasp       vs42,   vs6,    vs18
+       xvmaddasp       vs43,   vs7,    vs18
+
+       xvmaddasp       vs44,   vs4,    vs19
+       xvmaddasp       vs45,   vs5,    vs19
+       xvmaddasp       vs46,   vs6,    vs19
+       xvmaddasp       vs47,   vs7,    vs19
+
+
+.endm
+
+.macro KERNEL4x16_E2
+
+
+       xvmaddasp       vs32,   vs4,    vs16
+       xvmaddasp       vs33,   vs5,    vs16
+       xvmaddasp       vs34,   vs6,    vs16
+       xvmaddasp       vs35,   vs7,    vs16
+
+       xvmaddasp       vs36,   vs4,    vs17
+       xvmaddasp       vs37,   vs5,    vs17
+       xvmaddasp       vs38,   vs6,    vs17
+       xvmaddasp       vs39,   vs7,    vs17
+
+       xvmaddasp       vs40,   vs4,    vs18
+       xvmaddasp       vs41,   vs5,    vs18
+       xvmaddasp       vs42,   vs6,    vs18
+       xvmaddasp       vs43,   vs7,    vs18
+
+       xvmaddasp       vs44,   vs4,    vs19
+       xvmaddasp       vs45,   vs5,    vs19
+       xvmaddasp       vs46,   vs6,    vs19
+       xvmaddasp       vs47,   vs7,    vs19
+
+
+.endm
+
+.macro KERNEL4x16_SUBI1
+
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+       lxvw4x          vs2,    o32,    AO
+       lxvw4x          vs3,    o48,    AO
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+       xxspltw         vs10,   vs28,   2
+       xxspltw         vs11,   vs28,   3
+
+       addi            BO,     BO,     16
+
+
+       xvmulsp         vs32,   vs0,    vs8
+       xvmulsp         vs33,   vs1,    vs8
+       xvmulsp         vs34,   vs2,    vs8
+       xvmulsp         vs35,   vs3,    vs8
+
+       xvmulsp         vs36,   vs0,    vs9
+       xvmulsp         vs37,   vs1,    vs9
+       xvmulsp         vs38,   vs2,    vs9
+       xvmulsp         vs39,   vs3,    vs9
+
+       xvmulsp         vs40,   vs0,    vs10
+       xvmulsp         vs41,   vs1,    vs10
+       xvmulsp         vs42,   vs2,    vs10
+       xvmulsp         vs43,   vs3,    vs10
+
+       xvmulsp         vs44,   vs0,    vs11
+       xvmulsp         vs45,   vs1,    vs11
+       xvmulsp         vs46,   vs2,    vs11
+       xvmulsp         vs47,   vs3,    vs11
+
+
+.endm
+
+.macro KERNEL4x16_SUB1
+
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+       lxvw4x          vs2,    o32,    AO
+       lxvw4x          vs3,    o48,    AO
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+       xxspltw         vs10,   vs28,   2
+       xxspltw         vs11,   vs28,   3
+
+       addi            BO,     BO,     16
+
+
+       xvmaddasp       vs32,   vs0,    vs8
+       xvmaddasp       vs33,   vs1,    vs8
+       xvmaddasp       vs34,   vs2,    vs8
+       xvmaddasp       vs35,   vs3,    vs8
+
+       xvmaddasp       vs36,   vs0,    vs9
+       xvmaddasp       vs37,   vs1,    vs9
+       xvmaddasp       vs38,   vs2,    vs9
+       xvmaddasp       vs39,   vs3,    vs9
+
+       xvmaddasp       vs40,   vs0,    vs10
+       xvmaddasp       vs41,   vs1,    vs10
+       xvmaddasp       vs42,   vs2,    vs10
+       xvmaddasp       vs43,   vs3,    vs10
+
+       xvmaddasp       vs44,   vs0,    vs11
+       xvmaddasp       vs45,   vs1,    vs11
+       xvmaddasp       vs46,   vs2,    vs11
+       xvmaddasp       vs47,   vs3,    vs11
+
+
+.endm
+
+.macro SAVE4x16
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+       lxvw4x          vs2,    o32,    T1
+       lxvw4x          vs3,    o48,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xvmulsp         vs0,    vs32,   alpha_vr
+       xvmulsp         vs1,    vs33,   alpha_vr
+       xvmulsp         vs2,    vs34,   alpha_vr
+       xvmulsp         vs3,    vs35,   alpha_vr
+
+#else
+
+       xvmaddasp       vs0,    vs32,   alpha_vr
+       xvmaddasp       vs1,    vs33,   alpha_vr
+       xvmaddasp       vs2,    vs34,   alpha_vr
+       xvmaddasp       vs3,    vs35,   alpha_vr
+
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+       stxvw4x         vs2,    o32,    T1
+       stxvw4x         vs3,    o48,    T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+       lxvw4x          vs2,    o32,    T1
+       lxvw4x          vs3,    o48,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xvmulsp         vs0,    vs36,   alpha_vr
+       xvmulsp         vs1,    vs37,   alpha_vr
+       xvmulsp         vs2,    vs38,   alpha_vr
+       xvmulsp         vs3,    vs39,   alpha_vr
+
+#else
+
+       xvmaddasp       vs0,    vs36,   alpha_vr
+       xvmaddasp       vs1,    vs37,   alpha_vr
+       xvmaddasp       vs2,    vs38,   alpha_vr
+       xvmaddasp       vs3,    vs39,   alpha_vr
+
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+       stxvw4x         vs2,    o32,    T1
+       stxvw4x         vs3,    o48,    T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+       lxvw4x          vs2,    o32,    T1
+       lxvw4x          vs3,    o48,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xvmulsp         vs0,    vs40,   alpha_vr
+       xvmulsp         vs1,    vs41,   alpha_vr
+       xvmulsp         vs2,    vs42,   alpha_vr
+       xvmulsp         vs3,    vs43,   alpha_vr
+
+#else
+
+       xvmaddasp       vs0,    vs40,   alpha_vr
+       xvmaddasp       vs1,    vs41,   alpha_vr
+       xvmaddasp       vs2,    vs42,   alpha_vr
+       xvmaddasp       vs3,    vs43,   alpha_vr
+
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+       stxvw4x         vs2,    o32,    T1
+       stxvw4x         vs3,    o48,    T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+       lxvw4x          vs2,    o32,    T1
+       lxvw4x          vs3,    o48,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xvmulsp         vs0,    vs44,   alpha_vr
+       xvmulsp         vs1,    vs45,   alpha_vr
+       xvmulsp         vs2,    vs46,   alpha_vr
+       xvmulsp         vs3,    vs47,   alpha_vr
+
+#else
+
+       xvmaddasp       vs0,    vs44,   alpha_vr
+       xvmaddasp       vs1,    vs45,   alpha_vr
+       xvmaddasp       vs2,    vs46,   alpha_vr
+       xvmaddasp       vs3,    vs47,   alpha_vr
+
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+       stxvw4x         vs2,    o32,    T1
+       stxvw4x         vs3,    o48,    T1
+
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     64
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=8
+**********************************************************************************************/
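+/* Same scheme as the 4x16 block, reduced to two A vectors (8 floats) per
+ * step and the eight accumulators vs32-vs39.
+ */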
+
+.macro LOAD4x8_1
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+       xxspltw         vs10,   vs28,   2
+       xxspltw         vs11,   vs28,   3
+
+       addi            BO,     BO,     16
+
+.endm
+
+.macro KERNEL4x8_I1
+
+
+       lxvw4x          vs4,    o0,     AO
+       lxvw4x          vs5,    o16,    AO
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs16,   vs28,   0
+       xxspltw         vs17,   vs28,   1
+       xxspltw         vs18,   vs28,   2
+       xxspltw         vs19,   vs28,   3
+
+       addi            BO,     BO,     16
+
+
+       xvmulsp         vs32,   vs0,    vs8
+       xvmulsp         vs33,   vs1,    vs8
+
+       xvmulsp         vs34,   vs0,    vs9
+       xvmulsp         vs35,   vs1,    vs9
+
+       xvmulsp         vs36,   vs0,    vs10
+       xvmulsp         vs37,   vs1,    vs10
+
+       xvmulsp         vs38,   vs0,    vs11
+       xvmulsp         vs39,   vs1,    vs11
+
+
+.endm
+
+.macro KERNEL4x8_1
+
+
+       lxvw4x          vs4,    o0,     AO
+       lxvw4x          vs5,    o16,    AO
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs16,   vs28,   0
+       xxspltw         vs17,   vs28,   1
+       xxspltw         vs18,   vs28,   2
+       xxspltw         vs19,   vs28,   3
+
+       addi            BO,     BO,     16
+
+
+       xvmaddasp       vs32,   vs0,    vs8
+       xvmaddasp       vs33,   vs1,    vs8
+
+       xvmaddasp       vs34,   vs0,    vs9
+       xvmaddasp       vs35,   vs1,    vs9
+
+       xvmaddasp       vs36,   vs0,    vs10
+       xvmaddasp       vs37,   vs1,    vs10
+
+       xvmaddasp       vs38,   vs0,    vs11
+       xvmaddasp       vs39,   vs1,    vs11
+
+
+.endm
+
+.macro KERNEL4x8_2
+
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+       xxspltw         vs10,   vs28,   2
+       xxspltw         vs11,   vs28,   3
+
+       addi            BO,     BO,     16
+
+
+       xvmaddasp       vs32,   vs4,    vs16
+       xvmaddasp       vs33,   vs5,    vs16
+
+       xvmaddasp       vs34,   vs4,    vs17
+       xvmaddasp       vs35,   vs5,    vs17
+
+       xvmaddasp       vs36,   vs4,    vs18
+       xvmaddasp       vs37,   vs5,    vs18
+
+       xvmaddasp       vs38,   vs4,    vs19
+       xvmaddasp       vs39,   vs5,    vs19
+
+
+.endm
+
+.macro KERNEL4x8_E2
+
+
+       xvmaddasp       vs32,   vs4,    vs16
+       xvmaddasp       vs33,   vs5,    vs16
+
+       xvmaddasp       vs34,   vs4,    vs17
+       xvmaddasp       vs35,   vs5,    vs17
+
+       xvmaddasp       vs36,   vs4,    vs18
+       xvmaddasp       vs37,   vs5,    vs18
+
+       xvmaddasp       vs38,   vs4,    vs19
+       xvmaddasp       vs39,   vs5,    vs19
+
+
+.endm
+
+.macro KERNEL4x8_SUBI1
+
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+       xxspltw         vs10,   vs28,   2
+       xxspltw         vs11,   vs28,   3
+
+       addi            BO,     BO,     16
+
+
+       xvmulsp         vs32,   vs0,    vs8
+       xvmulsp         vs33,   vs1,    vs8
+
+       xvmulsp         vs34,   vs0,    vs9
+       xvmulsp         vs35,   vs1,    vs9
+
+       xvmulsp         vs36,   vs0,    vs10
+       xvmulsp         vs37,   vs1,    vs10
+
+       xvmulsp         vs38,   vs0,    vs11
+       xvmulsp         vs39,   vs1,    vs11
+
+
+.endm
+
+.macro KERNEL4x8_SUB1
+
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+       xxspltw         vs10,   vs28,   2
+       xxspltw         vs11,   vs28,   3
+
+       addi            BO,     BO,     16
+
+
+       xvmaddasp       vs32,   vs0,    vs8
+       xvmaddasp       vs33,   vs1,    vs8
+
+       xvmaddasp       vs34,   vs0,    vs9
+       xvmaddasp       vs35,   vs1,    vs9
+
+       xvmaddasp       vs36,   vs0,    vs10
+       xvmaddasp       vs37,   vs1,    vs10
+
+       xvmaddasp       vs38,   vs0,    vs11
+       xvmaddasp       vs39,   vs1,    vs11
+
+
+.endm
+
+.macro SAVE4x8
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xvmulsp         vs0,    vs32,   alpha_vr
+       xvmulsp         vs1,    vs33,   alpha_vr
+
+#else
+
+       xvmaddasp       vs0,    vs32,   alpha_vr
+       xvmaddasp       vs1,    vs33,   alpha_vr
+
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xvmulsp         vs0,    vs34,   alpha_vr
+       xvmulsp         vs1,    vs35,   alpha_vr
+
+#else
+
+       xvmaddasp       vs0,    vs34,   alpha_vr
+       xvmaddasp       vs1,    vs35,   alpha_vr
+
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xvmulsp         vs0,    vs36,   alpha_vr
+       xvmulsp         vs1,    vs37,   alpha_vr
+
+#else
+
+       xvmaddasp       vs0,    vs36,   alpha_vr
+       xvmaddasp       vs1,    vs37,   alpha_vr
+
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xvmulsp         vs0,    vs38,   alpha_vr
+       xvmulsp         vs1,    vs39,   alpha_vr
+
+#else
+
+       xvmaddasp       vs0,    vs38,   alpha_vr
+       xvmaddasp       vs1,    vs39,   alpha_vr
+
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     32
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=4
+**********************************************************************************************/
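+/* One A vector (4 floats) per step against the four splatted B values;
+ * one accumulator per B value in vs32-vs35.
+ */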
+
+.macro LOAD4x4_1
+
+       lxvw4x          vs0,    o0,     AO
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+       xxspltw         vs10,   vs28,   2
+       xxspltw         vs11,   vs28,   3
+
+       addi            BO,     BO,     16
+
+.endm
+
+.macro KERNEL4x4_I1
+
+
+       lxvw4x          vs4,    o0,     AO
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs16,   vs28,   0
+       xxspltw         vs17,   vs28,   1
+       xxspltw         vs18,   vs28,   2
+       xxspltw         vs19,   vs28,   3
+
+       addi            BO,     BO,     16
+
+
+       xvmulsp         vs32,   vs0,    vs8
+
+       xvmulsp         vs33,   vs0,    vs9
+
+       xvmulsp         vs34,   vs0,    vs10
+
+       xvmulsp         vs35,   vs0,    vs11
+
+
+.endm
+
+.macro KERNEL4x4_1
+
+
+       lxvw4x          vs4,    o0,     AO
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs16,   vs28,   0
+       xxspltw         vs17,   vs28,   1
+       xxspltw         vs18,   vs28,   2
+       xxspltw         vs19,   vs28,   3
+
+       addi            BO,     BO,     16
+
+
+       xvmaddasp       vs32,   vs0,    vs8
+
+       xvmaddasp       vs33,   vs0,    vs9
+
+       xvmaddasp       vs34,   vs0,    vs10
+
+       xvmaddasp       vs35,   vs0,    vs11
+
+
+.endm
+
+.macro KERNEL4x4_2
+
+
+       lxvw4x          vs0,    o0,     AO
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+       xxspltw         vs10,   vs28,   2
+       xxspltw         vs11,   vs28,   3
+
+       addi            BO,     BO,     16
+
+
+       xvmaddasp       vs32,   vs4,    vs16
+
+       xvmaddasp       vs33,   vs4,    vs17
+
+       xvmaddasp       vs34,   vs4,    vs18
+
+       xvmaddasp       vs35,   vs4,    vs19
+
+
+.endm
+
+.macro KERNEL4x4_E2
+
+
+       xvmaddasp       vs32,   vs4,    vs16
+
+       xvmaddasp       vs33,   vs4,    vs17
+
+       xvmaddasp       vs34,   vs4,    vs18
+
+       xvmaddasp       vs35,   vs4,    vs19
+
+
+.endm
+
+.macro KERNEL4x4_SUBI1
+
+
+       lxvw4x          vs0,    o0,     AO
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+       xxspltw         vs10,   vs28,   2
+       xxspltw         vs11,   vs28,   3
+
+       addi            BO,     BO,     16
+
+
+       xvmulsp         vs32,   vs0,    vs8
+
+       xvmulsp         vs33,   vs0,    vs9
+
+       xvmulsp         vs34,   vs0,    vs10
+
+       xvmulsp         vs35,   vs0,    vs11
+
+
+.endm
+
+.macro KERNEL4x4_SUB1
+
+
+       lxvw4x          vs0,    o0,     AO
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+       xxspltw         vs10,   vs28,   2
+       xxspltw         vs11,   vs28,   3
+
+       addi            BO,     BO,     16
+
+
+       xvmaddasp       vs32,   vs0,    vs8
+
+       xvmaddasp       vs33,   vs0,    vs9
+
+       xvmaddasp       vs34,   vs0,    vs10
+
+       xvmaddasp       vs35,   vs0,    vs11
+
+
+.endm
+
+.macro SAVE4x4
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xvmulsp         vs0,    vs32,   alpha_vr
+
+#else
+
+       xvmaddasp       vs0,    vs32,   alpha_vr
+
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xvmulsp         vs0,    vs33,   alpha_vr
+
+#else
+
+       xvmaddasp       vs0,    vs33,   alpha_vr
+
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xvmulsp         vs0,    vs34,   alpha_vr
+
+#else
+
+       xvmaddasp       vs0,    vs34,   alpha_vr
+
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xvmulsp         vs0,    vs35,   alpha_vr
+
+#else
+
+       xvmaddasp       vs0,    vs35,   alpha_vr
+
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     16
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=2
+**********************************************************************************************/
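+/* Scalar edge case for M=2: two floats of A and four of B are loaded with
+ * lxsspx and combined with xsmulsp/xsmaddasp into vs32-vs39.
+ */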
+
+.macro LOAD4x2_1
+
+       lxsspx          vs0,    o0,     AO
+       lxsspx          vs1,    o4,     AO
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+       lxsspx          vs9,    o4,     T1
+       lxsspx          vs10,   o8,     T1
+       lxsspx          vs11,   o12,    T1
+
+       addi            BO,     BO,     16
+
+.endm
+
+.macro KERNEL4x2_I1
+
+
+       lxsspx          vs4,    o0,     AO
+       lxsspx          vs5,    o4,     AO
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs16,   o0,     T1
+       lxsspx          vs17,   o4,     T1
+       lxsspx          vs18,   o8,     T1
+       lxsspx          vs19,   o12,    T1
+
+       addi            BO,     BO,     16
+
+
+       xsmulsp         vs32,   vs0,    vs8
+       xsmulsp         vs33,   vs1,    vs8
+
+       xsmulsp         vs34,   vs0,    vs9
+       xsmulsp         vs35,   vs1,    vs9
+
+       xsmulsp         vs36,   vs0,    vs10
+       xsmulsp         vs37,   vs1,    vs10
+
+       xsmulsp         vs38,   vs0,    vs11
+       xsmulsp         vs39,   vs1,    vs11
+
+
+.endm
+
+.macro KERNEL4x2_1
+
+
+       lxsspx          vs4,    o0,     AO
+       lxsspx          vs5,    o4,     AO
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs16,   o0,     T1
+       lxsspx          vs17,   o4,     T1
+       lxsspx          vs18,   o8,     T1
+       lxsspx          vs19,   o12,    T1
+
+       addi            BO,     BO,     16
+
+
+       xsmaddasp       vs32,   vs0,    vs8
+       xsmaddasp       vs33,   vs1,    vs8
+
+       xsmaddasp       vs34,   vs0,    vs9
+       xsmaddasp       vs35,   vs1,    vs9
+
+       xsmaddasp       vs36,   vs0,    vs10
+       xsmaddasp       vs37,   vs1,    vs10
+
+       xsmaddasp       vs38,   vs0,    vs11
+       xsmaddasp       vs39,   vs1,    vs11
+
+
+.endm
+
+.macro KERNEL4x2_2
+
+
+       lxsspx          vs0,    o0,     AO
+       lxsspx          vs1,    o4,     AO
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+       lxsspx          vs9,    o4,     T1
+       lxsspx          vs10,   o8,     T1
+       lxsspx          vs11,   o12,    T1
+
+       addi            BO,     BO,     16
+
+
+       xsmaddasp       vs32,   vs4,    vs16
+       xsmaddasp       vs33,   vs5,    vs16
+
+       xsmaddasp       vs34,   vs4,    vs17
+       xsmaddasp       vs35,   vs5,    vs17
+
+       xsmaddasp       vs36,   vs4,    vs18
+       xsmaddasp       vs37,   vs5,    vs18
+
+       xsmaddasp       vs38,   vs4,    vs19
+       xsmaddasp       vs39,   vs5,    vs19
+
+
+.endm
+
+.macro KERNEL4x2_E2
+
+
+       xsmaddasp       vs32,   vs4,    vs16
+       xsmaddasp       vs33,   vs5,    vs16
+
+       xsmaddasp       vs34,   vs4,    vs17
+       xsmaddasp       vs35,   vs5,    vs17
+
+       xsmaddasp       vs36,   vs4,    vs18
+       xsmaddasp       vs37,   vs5,    vs18
+
+       xsmaddasp       vs38,   vs4,    vs19
+       xsmaddasp       vs39,   vs5,    vs19
+
+
+.endm
+
+.macro KERNEL4x2_SUBI1
+
+
+       lxsspx          vs0,    o0,     AO
+       lxsspx          vs1,    o4,     AO
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+       lxsspx          vs9,    o4,     T1
+       lxsspx          vs10,   o8,     T1
+       lxsspx          vs11,   o12,    T1
+
+       addi            BO,     BO,     16
+
+
+       xsmulsp         vs32,   vs0,    vs8
+       xsmulsp         vs33,   vs1,    vs8
+
+       xsmulsp         vs34,   vs0,    vs9
+       xsmulsp         vs35,   vs1,    vs9
+
+       xsmulsp         vs36,   vs0,    vs10
+       xsmulsp         vs37,   vs1,    vs10
+
+       xsmulsp         vs38,   vs0,    vs11
+       xsmulsp         vs39,   vs1,    vs11
+
+
+.endm
+
+.macro KERNEL4x2_SUB1
+
+
+       lxsspx          vs0,    o0,     AO
+       lxsspx          vs1,    o4,     AO
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+       lxsspx          vs9,    o4,     T1
+       lxsspx          vs10,   o8,     T1
+       lxsspx          vs11,   o12,    T1
+
+       addi            BO,     BO,     16
+
+
+       xsmaddasp       vs32,   vs0,    vs8
+       xsmaddasp       vs33,   vs1,    vs8
+
+       xsmaddasp       vs34,   vs0,    vs9
+       xsmaddasp       vs35,   vs1,    vs9
+
+       xsmaddasp       vs36,   vs0,    vs10
+       xsmaddasp       vs37,   vs1,    vs10
+
+       xsmaddasp       vs38,   vs0,    vs11
+       xsmaddasp       vs39,   vs1,    vs11
+
+
+.endm
+
+.macro SAVE4x2
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+       lxsspx          vs1,    o4,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xsmulsp         vs0,    vs32,   alpha_r
+       xsmulsp         vs1,    vs33,   alpha_r
+
+#else
+
+       xsmaddasp       vs0,    vs32,   alpha_r
+       xsmaddasp       vs1,    vs33,   alpha_r
+
+#endif
+
+       stxsspx         vs0,    o0,     T1
+       stxsspx         vs1,    o4,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+       lxsspx          vs1,    o4,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xsmulsp         vs0,    vs34,   alpha_r
+       xsmulsp         vs1,    vs35,   alpha_r
+
+#else
+
+       xsmaddasp       vs0,    vs34,   alpha_r
+       xsmaddasp       vs1,    vs35,   alpha_r
+
+#endif
+
+       stxsspx         vs0,    o0,     T1
+       stxsspx         vs1,    o4,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+       lxsspx          vs1,    o4,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xsmulsp         vs0,    vs36,   alpha_r
+       xsmulsp         vs1,    vs37,   alpha_r
+
+#else
+
+       xsmaddasp       vs0,    vs36,   alpha_r
+       xsmaddasp       vs1,    vs37,   alpha_r
+
+#endif
+
+       stxsspx         vs0,    o0,     T1
+       stxsspx         vs1,    o4,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+       lxsspx          vs1,    o4,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xsmulsp         vs0,    vs38,   alpha_r
+       xsmulsp         vs1,    vs39,   alpha_r
+
+#else
+
+       xsmaddasp       vs0,    vs38,   alpha_r
+       xsmaddasp       vs1,    vs39,   alpha_r
+
+#endif
+
+       stxsspx         vs0,    o0,     T1
+       stxsspx         vs1,    o4,     T1
+
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     8
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=1
+**********************************************************************************************/
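+/* Single-element edge case: one float of A against four B values,
+ * accumulating into vs32-vs35 and storing one LDC-strided element at a time.
+ */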
+
+.macro LOAD4x1_1
+
+       lxsspx          vs0,    o0,     AO
+
+       addi            AO,     AO,     4
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+       lxsspx          vs9,    o4,     T1
+       lxsspx          vs10,   o8,     T1
+       lxsspx          vs11,   o12,    T1
+
+       addi            BO,     BO,     16
+
+.endm
+
+.macro KERNEL4x1_I1
+
+
+       lxsspx          vs4,    o0,     AO
+
+       addi            AO,     AO,     4
+
+       mr              T1,     BO
+
+       lxsspx          vs16,   o0,     T1
+       lxsspx          vs17,   o4,     T1
+       lxsspx          vs18,   o8,     T1
+       lxsspx          vs19,   o12,    T1
+
+       addi            BO,     BO,     16
+
+
+       xsmulsp         vs32,   vs0,    vs8
+
+       xsmulsp         vs33,   vs0,    vs9
+
+       xsmulsp         vs34,   vs0,    vs10
+
+       xsmulsp         vs35,   vs0,    vs11
+
+
+.endm
+
+.macro KERNEL4x1_1
+
+
+       lxsspx          vs4,    o0,     AO
+
+       addi            AO,     AO,     4
+
+       mr              T1,     BO
+
+       lxsspx          vs16,   o0,     T1
+       lxsspx          vs17,   o4,     T1
+       lxsspx          vs18,   o8,     T1
+       lxsspx          vs19,   o12,    T1
+
+       addi            BO,     BO,     16
+
+
+       xsmaddasp       vs32,   vs0,    vs8
+
+       xsmaddasp       vs33,   vs0,    vs9
+
+       xsmaddasp       vs34,   vs0,    vs10
+
+       xsmaddasp       vs35,   vs0,    vs11
+
+
+.endm
+
+.macro KERNEL4x1_2
+
+
+       lxsspx          vs0,    o0,     AO
+
+       addi            AO,     AO,     4
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+       lxsspx          vs9,    o4,     T1
+       lxsspx          vs10,   o8,     T1
+       lxsspx          vs11,   o12,    T1
+
+       addi            BO,     BO,     16
+
+
+       xsmaddasp       vs32,   vs4,    vs16
+
+       xsmaddasp       vs33,   vs4,    vs17
+
+       xsmaddasp       vs34,   vs4,    vs18
+
+       xsmaddasp       vs35,   vs4,    vs19
+
+
+.endm
+
+.macro KERNEL4x1_E2
+
+
+       xsmaddasp       vs32,   vs4,    vs16
+
+       xsmaddasp       vs33,   vs4,    vs17
+
+       xsmaddasp       vs34,   vs4,    vs18
+
+       xsmaddasp       vs35,   vs4,    vs19
+
+
+.endm
+
+.macro KERNEL4x1_SUBI1
+
+
+       lxsspx          vs0,    o0,     AO
+
+       addi            AO,     AO,     4
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+       lxsspx          vs9,    o4,     T1
+       lxsspx          vs10,   o8,     T1
+       lxsspx          vs11,   o12,    T1
+
+       addi            BO,     BO,     16
+
+
+       xsmulsp         vs32,   vs0,    vs8
+
+       xsmulsp         vs33,   vs0,    vs9
+
+       xsmulsp         vs34,   vs0,    vs10
+
+       xsmulsp         vs35,   vs0,    vs11
+
+
+.endm
+
+.macro KERNEL4x1_SUB1
+
+
+       lxsspx          vs0,    o0,     AO
+
+       addi            AO,     AO,     4
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+       lxsspx          vs9,    o4,     T1
+       lxsspx          vs10,   o8,     T1
+       lxsspx          vs11,   o12,    T1
+
+       addi            BO,     BO,     16
+
+
+       xsmaddasp       vs32,   vs0,    vs8
+
+       xsmaddasp       vs33,   vs0,    vs9
+
+       xsmaddasp       vs34,   vs0,    vs10
+
+       xsmaddasp       vs35,   vs0,    vs11
+
+
+.endm
+
+.macro SAVE4x1
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xsmulsp         vs0,    vs32,   alpha_r
+
+#else
+
+       xsmaddasp       vs0,    vs32,   alpha_r
+
+#endif
+
+       stxsspx         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xsmulsp         vs0,    vs33,   alpha_r
+
+#else
+
+       xsmaddasp       vs0,    vs33,   alpha_r
+
+#endif
+
+       stxsspx         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xsmulsp         vs0,    vs34,   alpha_r
+
+#else
+
+       xsmaddasp       vs0,    vs34,   alpha_r
+
+#endif
+
+       stxsspx         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xsmulsp         vs0,    vs35,   alpha_r
+
+#else
+
+       xsmaddasp       vs0,    vs35,   alpha_r
+
+#endif
+
+       stxsspx         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     4
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=16
+**********************************************************************************************/
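+/* Two B columns with 16-wide A vectors: only words 0 and 1 of the B load
+ * are splatted (vs8/vs9) and BO advances by 8 bytes per step; the eight
+ * vector accumulators are vs32-vs39.
+ */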
+
+.macro LOAD2x16_1
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+       lxvw4x          vs2,    o32,    AO
+       lxvw4x          vs3,    o48,    AO
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+
+       addi            BO,     BO,     8
+
+.endm
+
+.macro KERNEL2x16_I1
+
+
+       lxvw4x          vs4,    o0,     AO
+       lxvw4x          vs5,    o16,    AO
+       lxvw4x          vs6,    o32,    AO
+       lxvw4x          vs7,    o48,    AO
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs16,   vs28,   0
+       xxspltw         vs17,   vs28,   1
+
+       addi            BO,     BO,     8
+
+
+       xvmulsp         vs32,   vs0,    vs8
+       xvmulsp         vs33,   vs1,    vs8
+       xvmulsp         vs34,   vs2,    vs8
+       xvmulsp         vs35,   vs3,    vs8
+
+       xvmulsp         vs36,   vs0,    vs9
+       xvmulsp         vs37,   vs1,    vs9
+       xvmulsp         vs38,   vs2,    vs9
+       xvmulsp         vs39,   vs3,    vs9
+
+
+.endm
+
+.macro KERNEL2x16_1
+
+
+       lxvw4x          vs4,    o0,     AO
+       lxvw4x          vs5,    o16,    AO
+       lxvw4x          vs6,    o32,    AO
+       lxvw4x          vs7,    o48,    AO
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs16,   vs28,   0
+       xxspltw         vs17,   vs28,   1
+
+       addi            BO,     BO,     8
+
+
+       xvmaddasp       vs32,   vs0,    vs8
+       xvmaddasp       vs33,   vs1,    vs8
+       xvmaddasp       vs34,   vs2,    vs8
+       xvmaddasp       vs35,   vs3,    vs8
+
+       xvmaddasp       vs36,   vs0,    vs9
+       xvmaddasp       vs37,   vs1,    vs9
+       xvmaddasp       vs38,   vs2,    vs9
+       xvmaddasp       vs39,   vs3,    vs9
+
+
+.endm
+
+.macro KERNEL2x16_2
+
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+       lxvw4x          vs2,    o32,    AO
+       lxvw4x          vs3,    o48,    AO
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+
+       addi            BO,     BO,     8
+
+
+       xvmaddasp       vs32,   vs4,    vs16
+       xvmaddasp       vs33,   vs5,    vs16
+       xvmaddasp       vs34,   vs6,    vs16
+       xvmaddasp       vs35,   vs7,    vs16
+
+       xvmaddasp       vs36,   vs4,    vs17
+       xvmaddasp       vs37,   vs5,    vs17
+       xvmaddasp       vs38,   vs6,    vs17
+       xvmaddasp       vs39,   vs7,    vs17
+
+
+.endm
+
+.macro KERNEL2x16_E2
+
+
+       xvmaddasp       vs32,   vs4,    vs16
+       xvmaddasp       vs33,   vs5,    vs16
+       xvmaddasp       vs34,   vs6,    vs16
+       xvmaddasp       vs35,   vs7,    vs16
+
+       xvmaddasp       vs36,   vs4,    vs17
+       xvmaddasp       vs37,   vs5,    vs17
+       xvmaddasp       vs38,   vs6,    vs17
+       xvmaddasp       vs39,   vs7,    vs17
+
+
+.endm
+
+.macro KERNEL2x16_SUBI1
+
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+       lxvw4x          vs2,    o32,    AO
+       lxvw4x          vs3,    o48,    AO
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+
+       addi            BO,     BO,     8
+
+
+       xvmulsp         vs32,   vs0,    vs8
+       xvmulsp         vs33,   vs1,    vs8
+       xvmulsp         vs34,   vs2,    vs8
+       xvmulsp         vs35,   vs3,    vs8
+
+       xvmulsp         vs36,   vs0,    vs9
+       xvmulsp         vs37,   vs1,    vs9
+       xvmulsp         vs38,   vs2,    vs9
+       xvmulsp         vs39,   vs3,    vs9
+
+
+.endm
+
+.macro KERNEL2x16_SUB1
+
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+       lxvw4x          vs2,    o32,    AO
+       lxvw4x          vs3,    o48,    AO
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+
+       addi            BO,     BO,     8
+
+
+       xvmaddasp       vs32,   vs0,    vs8
+       xvmaddasp       vs33,   vs1,    vs8
+       xvmaddasp       vs34,   vs2,    vs8
+       xvmaddasp       vs35,   vs3,    vs8
+
+       xvmaddasp       vs36,   vs0,    vs9
+       xvmaddasp       vs37,   vs1,    vs9
+       xvmaddasp       vs38,   vs2,    vs9
+       xvmaddasp       vs39,   vs3,    vs9
+
+
+.endm
+
+.macro SAVE2x16
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+       lxvw4x          vs2,    o32,    T1
+       lxvw4x          vs3,    o48,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xvmulsp         vs0,    vs32,   alpha_vr
+       xvmulsp         vs1,    vs33,   alpha_vr
+       xvmulsp         vs2,    vs34,   alpha_vr
+       xvmulsp         vs3,    vs35,   alpha_vr
+
+#else
+
+       xvmaddasp       vs0,    vs32,   alpha_vr
+       xvmaddasp       vs1,    vs33,   alpha_vr
+       xvmaddasp       vs2,    vs34,   alpha_vr
+       xvmaddasp       vs3,    vs35,   alpha_vr
+
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+       stxvw4x         vs2,    o32,    T1
+       stxvw4x         vs3,    o48,    T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+       lxvw4x          vs2,    o32,    T1
+       lxvw4x          vs3,    o48,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xvmulsp         vs0,    vs36,   alpha_vr
+       xvmulsp         vs1,    vs37,   alpha_vr
+       xvmulsp         vs2,    vs38,   alpha_vr
+       xvmulsp         vs3,    vs39,   alpha_vr
+
+#else
+
+       xvmaddasp       vs0,    vs36,   alpha_vr
+       xvmaddasp       vs1,    vs37,   alpha_vr
+       xvmaddasp       vs2,    vs38,   alpha_vr
+       xvmaddasp       vs3,    vs39,   alpha_vr
+
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+       stxvw4x         vs2,    o32,    T1
+       stxvw4x         vs3,    o48,    T1
+
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     64
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=8
+**********************************************************************************************/
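+/* Two A vectors (8 floats) per step against the two splatted B values;
+ * accumulators vs32-vs35.
+ */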
+
+.macro LOAD2x8_1
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+
+       addi            BO,     BO,     8
+
+.endm
+
+.macro KERNEL2x8_I1
+
+
+       lxvw4x          vs4,    o0,     AO
+       lxvw4x          vs5,    o16,    AO
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs16,   vs28,   0
+       xxspltw         vs17,   vs28,   1
+
+       addi            BO,     BO,     8
+
+
+       xvmulsp         vs32,   vs0,    vs8
+       xvmulsp         vs33,   vs1,    vs8
+
+       xvmulsp         vs34,   vs0,    vs9
+       xvmulsp         vs35,   vs1,    vs9
+
+
+.endm
+
+.macro KERNEL2x8_1
+
+
+       lxvw4x          vs4,    o0,     AO
+       lxvw4x          vs5,    o16,    AO
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs16,   vs28,   0
+       xxspltw         vs17,   vs28,   1
+
+       addi            BO,     BO,     8
+
+
+       xvmaddasp       vs32,   vs0,    vs8
+       xvmaddasp       vs33,   vs1,    vs8
+
+       xvmaddasp       vs34,   vs0,    vs9
+       xvmaddasp       vs35,   vs1,    vs9
+
+
+.endm
+
+.macro KERNEL2x8_2
+
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+
+       addi            BO,     BO,     8
+
+
+       xvmaddasp       vs32,   vs4,    vs16
+       xvmaddasp       vs33,   vs5,    vs16
+
+       xvmaddasp       vs34,   vs4,    vs17
+       xvmaddasp       vs35,   vs5,    vs17
+
+
+.endm
+
+.macro KERNEL2x8_E2
+
+
+       xvmaddasp       vs32,   vs4,    vs16
+       xvmaddasp       vs33,   vs5,    vs16
+
+       xvmaddasp       vs34,   vs4,    vs17
+       xvmaddasp       vs35,   vs5,    vs17
+
+
+.endm
+
+.macro KERNEL2x8_SUBI1
+
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+
+       addi            BO,     BO,     8
+
+
+       xvmulsp         vs32,   vs0,    vs8
+       xvmulsp         vs33,   vs1,    vs8
+
+       xvmulsp         vs34,   vs0,    vs9
+       xvmulsp         vs35,   vs1,    vs9
+
+
+.endm
+
+.macro KERNEL2x8_SUB1
+
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+
+       addi            BO,     BO,     8
+
+
+       xvmaddasp       vs32,   vs0,    vs8
+       xvmaddasp       vs33,   vs1,    vs8
+
+       xvmaddasp       vs34,   vs0,    vs9
+       xvmaddasp       vs35,   vs1,    vs9
+
+
+.endm
+
+.macro SAVE2x8
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xvmulsp         vs0,    vs32,   alpha_vr
+       xvmulsp         vs1,    vs33,   alpha_vr
+
+#else
+
+       xvmaddasp       vs0,    vs32,   alpha_vr
+       xvmaddasp       vs1,    vs33,   alpha_vr
+
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xvmulsp         vs0,    vs34,   alpha_vr
+       xvmulsp         vs1,    vs35,   alpha_vr
+
+#else
+
+       xvmaddasp       vs0,    vs34,   alpha_vr
+       xvmaddasp       vs1,    vs35,   alpha_vr
+
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     32
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=4
+**********************************************************************************************/
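+/* One A vector per step against the two splatted B values; accumulators
+ * vs32 and vs33.
+ */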
+
+.macro LOAD2x4_1
+
+       lxvw4x          vs0,    o0,     AO
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+
+       addi            BO,     BO,     8
+
+.endm
+
+.macro KERNEL2x4_I1
+
+
+       lxvw4x          vs4,    o0,     AO
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs16,   vs28,   0
+       xxspltw         vs17,   vs28,   1
+
+       addi            BO,     BO,     8
+
+
+       xvmulsp         vs32,   vs0,    vs8
+
+       xvmulsp         vs33,   vs0,    vs9
+
+
+.endm
+
+.macro KERNEL2x4_1
+
+
+       lxvw4x          vs4,    o0,     AO
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs16,   vs28,   0
+       xxspltw         vs17,   vs28,   1
+
+       addi            BO,     BO,     8
+
+
+       xvmaddasp       vs32,   vs0,    vs8
+
+       xvmaddasp       vs33,   vs0,    vs9
+
+
+.endm
+
+.macro KERNEL2x4_2
+
+
+       lxvw4x          vs0,    o0,     AO
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+
+       addi            BO,     BO,     8
+
+
+       xvmaddasp       vs32,   vs4,    vs16
+
+       xvmaddasp       vs33,   vs4,    vs17
+
+
+.endm
+
+.macro KERNEL2x4_E2
+
+
+       xvmaddasp       vs32,   vs4,    vs16
+
+       xvmaddasp       vs33,   vs4,    vs17
+
+
+.endm
+
+.macro KERNEL2x4_SUBI1
+
+
+       lxvw4x          vs0,    o0,     AO
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+
+       addi            BO,     BO,     8
+
+
+       xvmulsp         vs32,   vs0,    vs8
+
+       xvmulsp         vs33,   vs0,    vs9
+
+
+.endm
+
+.macro KERNEL2x4_SUB1
+
+
+       lxvw4x          vs0,    o0,     AO
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+
+       addi            BO,     BO,     8
+
+
+       xvmaddasp       vs32,   vs0,    vs8
+
+       xvmaddasp       vs33,   vs0,    vs9
+
+
+.endm
+
+.macro SAVE2x4
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xvmulsp         vs0,    vs32,   alpha_vr
+
+#else
+
+       xvmaddasp       vs0,    vs32,   alpha_vr
+
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xvmulsp         vs0,    vs33,   alpha_vr
+
+#else
+
+       xvmaddasp       vs0,    vs33,   alpha_vr
+
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     16
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=2
+**********************************************************************************************/
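+
+// For M=2 (and M=1 below) the tile is narrower than a 4-float vector, so the
+// macros switch to scalar single-precision VSX ops: lxsspx/stxsspx for the
+// loads and stores and xsmaddasp/xsmulsp for the arithmetic, with BO copied
+// to T1 so both B values can be read via the o0/o4 offsets.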
+
+.macro LOAD2x2_1
+
+       lxsspx          vs0,    o0,     AO
+       lxsspx          vs1,    o4,     AO
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+       lxsspx          vs9,    o4,     T1
+
+       addi            BO,     BO,     8
+
+.endm
+
+.macro KERNEL2x2_I1
+
+
+       lxsspx          vs4,    o0,     AO
+       lxsspx          vs5,    o4,     AO
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs16,   o0,     T1
+       lxsspx          vs17,   o4,     T1
+
+       addi            BO,     BO,     8
+
+
+       xsmulsp         vs32,   vs0,    vs8
+       xsmulsp         vs33,   vs1,    vs8
+
+       xsmulsp         vs34,   vs0,    vs9
+       xsmulsp         vs35,   vs1,    vs9
+
+
+.endm
+
+.macro KERNEL2x2_1
+
+
+       lxsspx          vs4,    o0,     AO
+       lxsspx          vs5,    o4,     AO
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs16,   o0,     T1
+       lxsspx          vs17,   o4,     T1
+
+       addi            BO,     BO,     8
+
+
+       xsmaddasp       vs32,   vs0,    vs8
+       xsmaddasp       vs33,   vs1,    vs8
+
+       xsmaddasp       vs34,   vs0,    vs9
+       xsmaddasp       vs35,   vs1,    vs9
+
+
+.endm
+
+.macro KERNEL2x2_2
+
+
+       lxsspx          vs0,    o0,     AO
+       lxsspx          vs1,    o4,     AO
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+       lxsspx          vs9,    o4,     T1
+
+       addi            BO,     BO,     8
+
+
+       xsmaddasp       vs32,   vs4,    vs16
+       xsmaddasp       vs33,   vs5,    vs16
+
+       xsmaddasp       vs34,   vs4,    vs17
+       xsmaddasp       vs35,   vs5,    vs17
+
+
+.endm
+
+.macro KERNEL2x2_E2
+
+
+       xsmaddasp       vs32,   vs4,    vs16
+       xsmaddasp       vs33,   vs5,    vs16
+
+       xsmaddasp       vs34,   vs4,    vs17
+       xsmaddasp       vs35,   vs5,    vs17
+
+
+.endm
+
+.macro KERNEL2x2_SUBI1
+
+
+       lxsspx          vs0,    o0,     AO
+       lxsspx          vs1,    o4,     AO
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+       lxsspx          vs9,    o4,     T1
+
+       addi            BO,     BO,     8
+
+
+       xsmulsp         vs32,   vs0,    vs8
+       xsmulsp         vs33,   vs1,    vs8
+
+       xsmulsp         vs34,   vs0,    vs9
+       xsmulsp         vs35,   vs1,    vs9
+
+
+.endm
+
+.macro KERNEL2x2_SUB1
+
+
+       lxsspx          vs0,    o0,     AO
+       lxsspx          vs1,    o4,     AO
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+       lxsspx          vs9,    o4,     T1
+
+       addi            BO,     BO,     8
+
+
+       xsmaddasp       vs32,   vs0,    vs8
+       xsmaddasp       vs33,   vs1,    vs8
+
+       xsmaddasp       vs34,   vs0,    vs9
+       xsmaddasp       vs35,   vs1,    vs9
+
+
+.endm
+
+.macro SAVE2x2
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+       lxsspx          vs1,    o4,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xsmulsp         vs0,    vs32,   alpha_r
+       xsmulsp         vs1,    vs33,   alpha_r
+
+#else
+
+       xsmaddasp       vs0,    vs32,   alpha_r
+       xsmaddasp       vs1,    vs33,   alpha_r
+
+#endif
+
+       stxsspx         vs0,    o0,     T1
+       stxsspx         vs1,    o4,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+       lxsspx          vs1,    o4,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xsmulsp         vs0,    vs34,   alpha_r
+       xsmulsp         vs1,    vs35,   alpha_r
+
+#else
+
+       xsmaddasp       vs0,    vs34,   alpha_r
+       xsmaddasp       vs1,    vs35,   alpha_r
+
+#endif
+
+       stxsspx         vs0,    o0,     T1
+       stxsspx         vs1,    o4,     T1
+
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     8
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=1
+**********************************************************************************************/
+
+.macro LOAD2x1_1
+
+       lxsspx          vs0,    o0,     AO
+
+       addi            AO,     AO,     4
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+       lxsspx          vs9,    o4,     T1
+
+       addi            BO,     BO,     8
+
+.endm
+
+.macro KERNEL2x1_I1
+
+
+       lxsspx          vs4,    o0,     AO
+
+       addi            AO,     AO,     4
+
+       mr              T1,     BO
+
+       lxsspx          vs16,   o0,     T1
+       lxsspx          vs17,   o4,     T1
+
+       addi            BO,     BO,     8
+
+
+       xsmulsp         vs32,   vs0,    vs8
+
+       xsmulsp         vs33,   vs0,    vs9
+
+
+.endm
+
+.macro KERNEL2x1_1
+
+
+       lxsspx          vs4,    o0,     AO
+
+       addi            AO,     AO,     4
+
+       mr              T1,     BO
+
+       lxsspx          vs16,   o0,     T1
+       lxsspx          vs17,   o4,     T1
+
+       addi            BO,     BO,     8
+
+
+       xsmaddasp       vs32,   vs0,    vs8
+
+       xsmaddasp       vs33,   vs0,    vs9
+
+
+.endm
+
+.macro KERNEL2x1_2
+
+
+       lxsspx          vs0,    o0,     AO
+
+       addi            AO,     AO,     4
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+       lxsspx          vs9,    o4,     T1
+
+       addi            BO,     BO,     8
+
+
+       xsmaddasp       vs32,   vs4,    vs16
+
+       xsmaddasp       vs33,   vs4,    vs17
+
+
+.endm
+
+.macro KERNEL2x1_E2
+
+
+       xsmaddasp       vs32,   vs4,    vs16
+
+       xsmaddasp       vs33,   vs4,    vs17
+
+
+.endm
+
+.macro KERNEL2x1_SUBI1
+
+
+       lxsspx          vs0,    o0,     AO
+
+       addi            AO,     AO,     4
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+       lxsspx          vs9,    o4,     T1
+
+       addi            BO,     BO,     8
+
+
+       xsmulsp         vs32,   vs0,    vs8
+
+       xsmulsp         vs33,   vs0,    vs9
+
+
+.endm
+
+.macro KERNEL2x1_SUB1
+
+
+       lxsspx          vs0,    o0,     AO
+
+       addi            AO,     AO,     4
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+       lxsspx          vs9,    o4,     T1
+
+       addi            BO,     BO,     8
+
+
+       xsmaddasp       vs32,   vs0,    vs8
+
+       xsmaddasp       vs33,   vs0,    vs9
+
+
+.endm
+
+.macro SAVE2x1
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xsmulsp         vs0,    vs32,   alpha_r
+
+#else
+
+       xsmaddasp       vs0,    vs32,   alpha_r
+
+#endif
+
+       stxsspx         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xsmulsp         vs0,    vs33,   alpha_r
+
+#else
+
+       xsmaddasp       vs0,    vs33,   alpha_r
+
+#endif
+
+       stxsspx         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     4
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=16
+**********************************************************************************************/
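+
+// In the N=1 kernels only a single B value is live per iteration: vs28 is
+// loaded, its first word is splatted into vs8 (or vs16), and BO advances by
+// just 4 bytes, while the sixteen A values pass through vs0-vs3 / vs4-vs7
+// and accumulate into vs32-vs35.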
+
+.macro LOAD1x16_1
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+       lxvw4x          vs2,    o32,    AO
+       lxvw4x          vs3,    o48,    AO
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+
+       addi            BO,     BO,     4
+
+.endm
+
+.macro KERNEL1x16_I1
+
+
+       lxvw4x          vs4,    o0,     AO
+       lxvw4x          vs5,    o16,    AO
+       lxvw4x          vs6,    o32,    AO
+       lxvw4x          vs7,    o48,    AO
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs16,   vs28,   0
+
+       addi            BO,     BO,     4
+
+
+       xvmulsp         vs32,   vs0,    vs8
+       xvmulsp         vs33,   vs1,    vs8
+       xvmulsp         vs34,   vs2,    vs8
+       xvmulsp         vs35,   vs3,    vs8
+
+
+.endm
+
+.macro KERNEL1x16_1
+
+
+       lxvw4x          vs4,    o0,     AO
+       lxvw4x          vs5,    o16,    AO
+       lxvw4x          vs6,    o32,    AO
+       lxvw4x          vs7,    o48,    AO
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs16,   vs28,   0
+
+       addi            BO,     BO,     4
+
+
+       xvmaddasp       vs32,   vs0,    vs8
+       xvmaddasp       vs33,   vs1,    vs8
+       xvmaddasp       vs34,   vs2,    vs8
+       xvmaddasp       vs35,   vs3,    vs8
+
+
+.endm
+
+.macro KERNEL1x16_2
+
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+       lxvw4x          vs2,    o32,    AO
+       lxvw4x          vs3,    o48,    AO
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+
+       addi            BO,     BO,     4
+
+
+       xvmaddasp       vs32,   vs4,    vs16
+       xvmaddasp       vs33,   vs5,    vs16
+       xvmaddasp       vs34,   vs6,    vs16
+       xvmaddasp       vs35,   vs7,    vs16
+
+
+.endm
+
+.macro KERNEL1x16_E2
+
+
+       xvmaddasp       vs32,   vs4,    vs16
+       xvmaddasp       vs33,   vs5,    vs16
+       xvmaddasp       vs34,   vs6,    vs16
+       xvmaddasp       vs35,   vs7,    vs16
+
+
+.endm
+
+.macro KERNEL1x16_SUBI1
+
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+       lxvw4x          vs2,    o32,    AO
+       lxvw4x          vs3,    o48,    AO
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+
+       addi            BO,     BO,     4
+
+
+       xvmulsp         vs32,   vs0,    vs8
+       xvmulsp         vs33,   vs1,    vs8
+       xvmulsp         vs34,   vs2,    vs8
+       xvmulsp         vs35,   vs3,    vs8
+
+
+.endm
+
+.macro KERNEL1x16_SUB1
+
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+       lxvw4x          vs2,    o32,    AO
+       lxvw4x          vs3,    o48,    AO
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+
+       addi            BO,     BO,     4
+
+
+       xvmaddasp       vs32,   vs0,    vs8
+       xvmaddasp       vs33,   vs1,    vs8
+       xvmaddasp       vs34,   vs2,    vs8
+       xvmaddasp       vs35,   vs3,    vs8
+
+
+.endm
+
+.macro SAVE1x16
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+       lxvw4x          vs2,    o32,    T1
+       lxvw4x          vs3,    o48,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xvmulsp         vs0,    vs32,   alpha_vr
+       xvmulsp         vs1,    vs33,   alpha_vr
+       xvmulsp         vs2,    vs34,   alpha_vr
+       xvmulsp         vs3,    vs35,   alpha_vr
+
+#else
+
+       xvmaddasp       vs0,    vs32,   alpha_vr
+       xvmaddasp       vs1,    vs33,   alpha_vr
+       xvmaddasp       vs2,    vs34,   alpha_vr
+       xvmaddasp       vs3,    vs35,   alpha_vr
+
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+       stxvw4x         vs2,    o32,    T1
+       stxvw4x         vs3,    o48,    T1
+
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     64
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=8
+**********************************************************************************************/
+
+.macro LOAD1x8_1
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+
+       addi            BO,     BO,     4
+
+.endm
+
+.macro KERNEL1x8_I1
+
+
+       lxvw4x          vs4,    o0,     AO
+       lxvw4x          vs5,    o16,    AO
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs16,   vs28,   0
+
+       addi            BO,     BO,     4
+
+
+       xvmulsp         vs32,   vs0,    vs8
+       xvmulsp         vs33,   vs1,    vs8
+
+
+.endm
+
+.macro KERNEL1x8_1
+
+
+       lxvw4x          vs4,    o0,     AO
+       lxvw4x          vs5,    o16,    AO
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs16,   vs28,   0
+
+       addi            BO,     BO,     4
+
+
+       xvmaddasp       vs32,   vs0,    vs8
+       xvmaddasp       vs33,   vs1,    vs8
+
+
+.endm
+
+.macro KERNEL1x8_2
+
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+
+       addi            BO,     BO,     4
+
+
+       xvmaddasp       vs32,   vs4,    vs16
+       xvmaddasp       vs33,   vs5,    vs16
+
+
+.endm
+
+.macro KERNEL1x8_E2
+
+
+       xvmaddasp       vs32,   vs4,    vs16
+       xvmaddasp       vs33,   vs5,    vs16
+
+
+.endm
+
+.macro KERNEL1x8_SUBI1
+
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+
+       addi            BO,     BO,     4
+
+
+       xvmulsp         vs32,   vs0,    vs8
+       xvmulsp         vs33,   vs1,    vs8
+
+
+.endm
+
+.macro KERNEL1x8_SUB1
+
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+
+       addi            BO,     BO,     4
+
+
+       xvmaddasp       vs32,   vs0,    vs8
+       xvmaddasp       vs33,   vs1,    vs8
+
+
+.endm
+
+.macro SAVE1x8
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xvmulsp         vs0,    vs32,   alpha_vr
+       xvmulsp         vs1,    vs33,   alpha_vr
+
+#else
+
+       xvmaddasp       vs0,    vs32,   alpha_vr
+       xvmaddasp       vs1,    vs33,   alpha_vr
+
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     32
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=4
+**********************************************************************************************/
+
+.macro LOAD1x4_1
+
+       lxvw4x          vs0,    o0,     AO
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+
+       addi            BO,     BO,     4
+
+.endm
+
+.macro KERNEL1x4_I1
+
+
+       lxvw4x          vs4,    o0,     AO
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs16,   vs28,   0
+
+       addi            BO,     BO,     4
+
+
+       xvmulsp         vs32,   vs0,    vs8
+
+
+.endm
+
+.macro KERNEL1x4_1
+
+
+       lxvw4x          vs4,    o0,     AO
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs16,   vs28,   0
+
+       addi            BO,     BO,     4
+
+
+       xvmaddasp       vs32,   vs0,    vs8
+
+
+.endm
+
+.macro KERNEL1x4_2
+
+
+       lxvw4x          vs0,    o0,     AO
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+
+       addi            BO,     BO,     4
+
+
+       xvmaddasp       vs32,   vs4,    vs16
+
+
+.endm
+
+.macro KERNEL1x4_E2
+
+
+       xvmaddasp       vs32,   vs4,    vs16
+
+
+.endm
+
+.macro KERNEL1x4_SUBI1
+
+
+       lxvw4x          vs0,    o0,     AO
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+
+       addi            BO,     BO,     4
+
+
+       xvmulsp         vs32,   vs0,    vs8
+
+
+.endm
+
+.macro KERNEL1x4_SUB1
+
+
+       lxvw4x          vs0,    o0,     AO
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+
+       addi            BO,     BO,     4
+
+
+       xvmaddasp       vs32,   vs0,    vs8
+
+
+.endm
+
+.macro SAVE1x4
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xvmulsp         vs0,    vs32,   alpha_vr
+
+#else
+
+       xvmaddasp       vs0,    vs32,   alpha_vr
+
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     16
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=2
+**********************************************************************************************/
+
+.macro LOAD1x2_1
+
+       lxsspx          vs0,    o0,     AO
+       lxsspx          vs1,    o4,     AO
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+
+       addi            BO,     BO,     4
+
+.endm
+
+.macro KERNEL1x2_I1
+
+
+       lxsspx          vs4,    o0,     AO
+       lxsspx          vs5,    o4,     AO
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs16,   o0,     T1
+
+       addi            BO,     BO,     4
+
+
+       xsmulsp         vs32,   vs0,    vs8
+       xsmulsp         vs33,   vs1,    vs8
+
+
+.endm
+
+.macro KERNEL1x2_1
+
+
+       lxsspx          vs4,    o0,     AO
+       lxsspx          vs5,    o4,     AO
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs16,   o0,     T1
+
+       addi            BO,     BO,     4
+
+
+       xsmaddasp       vs32,   vs0,    vs8
+       xsmaddasp       vs33,   vs1,    vs8
+
+
+.endm
+
+.macro KERNEL1x2_2
+
+
+       lxsspx          vs0,    o0,     AO
+       lxsspx          vs1,    o4,     AO
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+
+       addi            BO,     BO,     4
+
+
+       xsmaddasp       vs32,   vs4,    vs16
+       xsmaddasp       vs33,   vs5,    vs16
+
+
+.endm
+
+.macro KERNEL1x2_E2
+
+
+       xsmaddasp       vs32,   vs4,    vs16
+       xsmaddasp       vs33,   vs5,    vs16
+
+
+.endm
+
+.macro KERNEL1x2_SUBI1
+
+
+       lxsspx          vs0,    o0,     AO
+       lxsspx          vs1,    o4,     AO
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+
+       addi            BO,     BO,     4
+
+
+       xsmulsp         vs32,   vs0,    vs8
+       xsmulsp         vs33,   vs1,    vs8
+
+
+.endm
+
+.macro KERNEL1x2_SUB1
+
+
+       lxsspx          vs0,    o0,     AO
+       lxsspx          vs1,    o4,     AO
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+
+       addi            BO,     BO,     4
+
+
+       xsmaddasp       vs32,   vs0,    vs8
+       xsmaddasp       vs33,   vs1,    vs8
+
+
+.endm
+
+.macro SAVE1x2
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+       lxsspx          vs1,    o4,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xsmulsp         vs0,    vs32,   alpha_r
+       xsmulsp         vs1,    vs33,   alpha_r
+
+#else
+
+       xsmaddasp       vs0,    vs32,   alpha_r
+       xsmaddasp       vs1,    vs33,   alpha_r
+
+#endif
+
+       stxsspx         vs0,    o0,     T1
+       stxsspx         vs1,    o4,     T1
+
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     8
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=1
+**********************************************************************************************/
+
+.macro LOAD1x1_1
+
+       lxsspx          vs0,    o0,     AO
+
+       addi            AO,     AO,     4
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+
+       addi            BO,     BO,     4
+
+.endm
+
+.macro KERNEL1x1_I1
+
+
+       lxsspx          vs4,    o0,     AO
+
+       addi            AO,     AO,     4
+
+       mr              T1,     BO
+
+       lxsspx          vs16,   o0,     T1
+
+       addi            BO,     BO,     4
+
+
+       xsmulsp         vs32,   vs0,    vs8
+
+
+.endm
+
+.macro KERNEL1x1_1
+
+
+       lxsspx          vs4,    o0,     AO
+
+       addi            AO,     AO,     4
+
+       mr              T1,     BO
+
+       lxsspx          vs16,   o0,     T1
+
+       addi            BO,     BO,     4
+
+
+       xsmaddasp       vs32,   vs0,    vs8
+
+
+.endm
+
+.macro KERNEL1x1_2
+
+
+       lxsspx          vs0,    o0,     AO
+
+       addi            AO,     AO,     4
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+
+       addi            BO,     BO,     4
+
+
+       xsmaddasp       vs32,   vs4,    vs16
+
+
+.endm
+
+.macro KERNEL1x1_E2
+
+
+       xsmaddasp       vs32,   vs4,    vs16
+
+
+.endm
+
+.macro KERNEL1x1_SUBI1
+
+
+       lxsspx          vs0,    o0,     AO
+
+       addi            AO,     AO,     4
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+
+       addi            BO,     BO,     4
+
+
+       xsmulsp         vs32,   vs0,    vs8
+
+
+.endm
+
+.macro KERNEL1x1_SUB1
+
+
+       lxsspx          vs0,    o0,     AO
+
+       addi            AO,     AO,     4
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+
+       addi            BO,     BO,     4
+
+
+       xsmaddasp       vs32,   vs0,    vs8
+
+
+.endm
+
+.macro SAVE1x1
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+       xsmulsp         vs0,    vs32,   alpha_r
+
+#else
+
+       xsmaddasp       vs0,    vs32,   alpha_r
+
+#endif
+
+       stxsspx         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     4
+
+.endm
+
diff --git a/kernel/power/strmm_kernel_16x8_power8.S b/kernel/power/strmm_kernel_16x8_power8.S
new file mode 100644 (file)
index 0000000..5b1c5ca
--- /dev/null
@@ -0,0 +1,364 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/14 Werner Saar (wernsaar@googlemail.com)
+*       BLASTEST               : OK
+*       CTEST                  : OK
+*       TEST                   : OK
+**************************************************************************************/
+
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* All rights reserved.                                              */
+/*                                                                   */
+/* Redistribution and use in source and binary forms, with or        */
+/* without modification, are permitted provided that the following   */
+/* conditions are met:                                               */
+/*                                                                   */
+/*   1. Redistributions of source code must retain the above         */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer.                                                  */
+/*                                                                   */
+/*   2. Redistributions in binary form must reproduce the above      */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer in the documentation and/or other materials       */
+/*      provided with the distribution.                              */
+/*                                                                   */
+/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
+/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
+/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
+/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
+/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
+/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
+/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
+/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
+/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
+/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
+/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
+/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
+/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
+/*    POSSIBILITY OF SUCH DAMAGE.                                    */
+/*                                                                   */
+/* The views and conclusions contained in the software and           */
+/* documentation are those of the authors and should not be          */
+/* interpreted as representing official policies, either expressed   */
+/* or implied, of The University of Texas at Austin.                 */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "def_vsx.h"
+
+#ifndef __64BIT__
+#define LOAD   lwz
+#else
+#define LOAD   ld
+#endif
+
+#ifdef __64BIT__
+#define STACKSIZE 320
+#define ALPHA_SP   296(SP)
+#define FZERO  304(SP)
+#else
+#define STACKSIZE 240
+#define ALPHA_SP   224(SP)
+#define FZERO  232(SP)
+#endif
+
+#define        M       r3
+#define        N       r4
+#define        K       r5
+
+#ifdef linux
+#ifndef __64BIT__
+#define A      r6
+#define        B       r7
+#define        C       r8
+#define        LDC     r9
+#define OFFSET r10
+#else
+#define A      r7
+#define        B       r8
+#define        C       r9
+#define        LDC     r10
+#define OFFSET r6
+#endif
+#endif
+
+#if defined(_AIX) || defined(__APPLE__)
+#if !defined(__64BIT__) && defined(DOUBLE)
+#define A      r8
+#define        B       r9
+#define        C       r10
+#define        LDC     r7
+#define OFFSET r6
+#else
+#define A      r7
+#define        B       r8
+#define        C       r9
+#define        LDC     r10
+#define OFFSET r6
+#endif
+#endif
+
+#define alpha_r vs30
+#define alpha_vr vs31
+
+#define o0     0
+
+#define o12    r14
+#define o4     r15
+#define K1     r16
+#define o8     r17
+#define L      r18
+#define T1     r19
+#define KK     r20
+#define KKK    r21
+#define        I       r22
+#define J      r23
+#define AO     r24
+#define        BO      r25
+#define        CO      r26
+#define o16    r27
+#define        o32     r28
+#define        o48     r29
+
+#define PRE    r30
+#define T2     r31
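+
+// AO, BO and CO walk the packed A panel, the packed B panel and the C tile;
+// o4-o48 hold constant byte offsets for the indexed vector loads/stores,
+// PRE is the dcbt prefetch distance, and KK/KKK carry the TRMM K offsets
+// updated after each tile.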
+
+#include "sgemm_macros_16x8_power8.S"
+
+
+#ifndef NEEDPARAM
+
+       PROLOGUE
+       PROFCODE
+
+       addi    SP, SP, -STACKSIZE
+       li      r0, 0
+
+       stfd    f14,    0(SP)
+       stfd    f15,    8(SP)
+       stfd    f16,   16(SP)
+       stfd    f17,   24(SP)
+
+       stfd    f18,   32(SP)
+       stfd    f19,   40(SP)
+       stfd    f20,   48(SP)
+       stfd    f21,   56(SP)
+
+       stfd    f22,   64(SP)
+       stfd    f23,   72(SP)
+       stfd    f24,   80(SP)
+       stfd    f25,   88(SP)
+
+       stfd    f26,   96(SP)
+       stfd    f27,  104(SP)
+       stfd    f28,  112(SP)
+       stfd    f29,  120(SP)
+
+       stfd    f30,  128(SP)
+       stfd    f31,  136(SP)
+
+#ifdef __64BIT__
+       std     r31,  144(SP)
+       std     r30,  152(SP)
+       std     r29,  160(SP)
+       std     r28,  168(SP)
+       std     r27,  176(SP)
+       std     r26,  184(SP)
+       std     r25,  192(SP)
+       std     r24,  200(SP)
+       std     r23,  208(SP)
+       std     r22,  216(SP)
+       std     r21,  224(SP)
+       std     r20,  232(SP)
+       std     r19,  240(SP)
+       std     r18,  248(SP)
+       std     r17,  256(SP)
+       std     r16,  264(SP)
+       std     r15,  272(SP)
+       std     r14,  280(SP)
+#else
+       stw     r31,  144(SP)
+       stw     r30,  148(SP)
+       stw     r29,  152(SP)
+       stw     r28,  156(SP)
+       stw     r27,  160(SP)
+       stw     r26,  164(SP)
+       stw     r25,  168(SP)
+       stw     r24,  172(SP)
+       stw     r23,  176(SP)
+       stw     r22,  180(SP)
+       stw     r21,  184(SP)
+       stw     r20,  188(SP)
+       stw     r19,  192(SP)
+       stw     r18,  196(SP)
+       stw     r17,  200(SP)
+       stw     r16,  204(SP)
+       stw     r15,  208(SP)
+       stw     r14,  212(SP)
+#endif
+
+       // stfd f1,  ALPHA_SP
+       // stw  r0,  FZERO
+
+#if defined(_AIX) || defined(__APPLE__)
+#if !defined(__64BIT__) && defined(DOUBLE)
+       lwz     LDC, FRAMESLOT(0) + STACKSIZE(SP)
+#endif
+#endif
+
+       slwi    LDC, LDC, BASE_SHIFT
+
+#if defined(TRMMKERNEL)
+#if defined(linux) && defined(__64BIT__)
+       ld      OFFSET,  FRAMESLOT(0) + STACKSIZE(SP)
+#endif
+
+#if defined(_AIX) || defined(__APPLE__)
+#ifdef __64BIT__
+       ld      OFFSET,  FRAMESLOT(0) + STACKSIZE(SP)
+#else
+#ifdef DOUBLE
+       lwz     OFFSET,  FRAMESLOT(1) + STACKSIZE(SP)
+#else
+       lwz     OFFSET,  FRAMESLOT(0) + STACKSIZE(SP)
+#endif
+#endif
+#endif
+#endif
+
+        mr      KK, OFFSET
+#if defined(TRMMKERNEL) && !defined(LEFT)
+        neg     KK, KK
+#endif
+
+
+       cmpwi   cr0, M, 0
+       ble     .L999_H1
+       cmpwi   cr0, N, 0
+       ble     .L999_H1
+       cmpwi   cr0, K, 0
+       ble     .L999_H1
+
+       li      PRE, 256 
+       li      o4 , 4
+       li      o8 , 8
+       li      o12, 12
+       li      o16, 16
+       li      o32, 32
+       li      o48, 48
+
+        addi    T1, SP, 300
+        stfs    f1, 0(T1)
+        stfs    f1, 4(T1)
+        stfs    f1, 8(T1)
+        stfs    f1,12(T1)
+
+        lxsspx  vs28, 0, T1
+
+        xxspltw alpha_r, vs28 , 0 
+        lxvw4x  alpha_vr, 0, T1
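+
+        // The four stfs above replicate alpha (f1) into a 16-byte stack
+        // slot so it can be reloaded as both the scalar copy (alpha_r)
+        // and the vector copy (alpha_vr) used by the SAVE macros.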
+
+
+
+#include "strmm_logic_16x8_power8.S"
+
+.L999:
+       addi    r3, 0, 0
+
+       lfd     f14,    0(SP)
+       lfd     f15,    8(SP)
+       lfd     f16,   16(SP)
+       lfd     f17,   24(SP)
+
+       lfd     f18,   32(SP)
+       lfd     f19,   40(SP)
+       lfd     f20,   48(SP)
+       lfd     f21,   56(SP)
+
+       lfd     f22,   64(SP)
+       lfd     f23,   72(SP)
+       lfd     f24,   80(SP)
+       lfd     f25,   88(SP)
+
+       lfd     f26,   96(SP)
+       lfd     f27,  104(SP)
+       lfd     f28,  112(SP)
+       lfd     f29,  120(SP)
+
+       lfd     f30,  128(SP)
+       lfd     f31,  136(SP)
+
+#ifdef __64BIT__
+       ld      r31,  144(SP)
+       ld      r30,  152(SP)
+       ld      r29,  160(SP)
+       ld      r28,  168(SP)
+       ld      r27,  176(SP)
+       ld      r26,  184(SP)
+       ld      r25,  192(SP)
+       ld      r24,  200(SP)
+       ld      r23,  208(SP)
+       ld      r22,  216(SP)
+       ld      r21,  224(SP)
+       ld      r20,  232(SP)
+       ld      r19,  240(SP)
+       ld      r18,  248(SP)
+       ld      r17,  256(SP)
+       ld      r16,  264(SP)
+       ld      r15,  272(SP)
+       ld      r14,  280(SP)
+#else
+       lwz     r31,  144(SP)
+       lwz     r30,  148(SP)
+       lwz     r29,  152(SP)
+       lwz     r28,  156(SP)
+       lwz     r27,  160(SP)
+       lwz     r26,  164(SP)
+       lwz     r25,  168(SP)
+       lwz     r24,  172(SP)
+       lwz     r23,  176(SP)
+       lwz     r22,  180(SP)
+       lwz     r21,  184(SP)
+       lwz     r20,  188(SP)
+       lwz     r19,  192(SP)
+       lwz     r18,  196(SP)
+       lwz     r17,  200(SP)
+       lwz     r16,  204(SP)
+       lwz     r15,  208(SP)
+       lwz     r14,  212(SP)
+#endif
+
+       addi    SP, SP, STACKSIZE
+
+       blr
+
+       EPILOGUE
+#endif
diff --git a/kernel/power/strmm_logic_16x8_power8.S b/kernel/power/strmm_logic_16x8_power8.S
new file mode 100644 (file)
index 0000000..0d6d048
--- /dev/null
@@ -0,0 +1,2969 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/14 Werner Saar (wernsaar@googlemail.com)
+*       BLASTEST               : OK
+*       CTEST                  : OK
+*       TEST                   : OK
+**************************************************************************************/
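+
+// Loop structure: J counts the N/8 column panels; inside each panel I counts
+// the M/16 row blocks, followed by tails for M&8, M&4, M&2 and M&1. The K
+// loop is unrolled eight times, with a SUB0/SUB4/SUB1 path for short or
+// leftover K counts, and the KK/KKK updates implement the TRMM offsets.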
+
+
+
+       srawi.          J,      N,      3
+       ble             .LSTRMM_L8_END
+
+.LSTRMM_L8_BEGIN:
+
+       mr              CO,     C
+       mr              AO,     A
+       slwi            T1,     LDC     ,       3
+       add             C,      C,      T1
+
+#if defined(LEFT)
+       mr              KK,     OFFSET          // OFFSET -> KK
+#endif
+
+       srawi.          I,      M,      4
+       ble             .LSTRMM_L8x16_END
+
+.LSTRMM_L8x16_BEGIN:
+
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       mr              BO,     B                                       // B -> BO
+#else
+       mr              BO,     B                                       // B -> BO
+       slwi            T1,     KK,     5                               // Number of values in B shifted
+       slwi            T2,     KK,     6                               // Number of values in A shifted
+       add             BO,     BO,     T1                              // Add values to BO
+       add             AO,     AO,     T2                              // Add values to AO
+#endif
+
+#if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       sub             T1,     K,      KK                              // K - KK -> TEMP1
+#else
+       mr              T1,     KK                                      // KK -> KTEMP
+#ifdef LEFT
+       addi            T1,     T1,     16                              // KTEMP + Number of values in A -> KTEMP
+#else
+       addi            T1,     T1,     8                               // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
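+
+// T1 now holds the number of K iterations for this tile: K-KK on one side of
+// the triangle, or KK plus the tile width (16 rows from A when LEFT, 8
+// columns from B otherwise), depending on LEFT/TRANSA; it is kept in KKK for
+// the pointer fix-up after the save and in K1 for the unrolled loop counts.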
+
+       mr              KKK,    T1
+       mr              K1,     T1
+       srawi.          L,      K1,     3                               // KTEMP / 8 -> L
+       ble             .LSTRMM_L8x16_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LSTRMM_L8x16_SUB4
+
+.LSTRMM_L8x16_LOOP_START:
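+
+// Main 8x16 block: each pass of the unrolled loop covers eight K iterations.
+// dcbt prefetches the A stream PRE bytes ahead, and the _1/_2 kernel macros
+// alternate their two register sets so the loads for the next step overlap
+// the multiply-adds of the current one; KERNEL8x16_E2 drains the last set
+// without issuing further loads.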
+
+       dcbt            AO,     PRE
+       LOAD8x16_1
+       dcbt            AO,     PRE
+       KERNEL8x16_I1
+       dcbt            AO,     PRE
+       KERNEL8x16_2
+       dcbt            AO,     PRE
+       KERNEL8x16_1
+       dcbt            AO,     PRE
+       KERNEL8x16_2
+
+       dcbt            AO,     PRE
+       KERNEL8x16_1
+       dcbt            AO,     PRE
+       KERNEL8x16_2
+       dcbt            AO,     PRE
+       KERNEL8x16_1
+       dcbt            AO,     PRE
+       KERNEL8x16_2
+
+       addic.          L,      L,      -2
+       ble             .LSTRMM_L8x16_LOOP_END
+
+       .align 5
+
+.LSTRMM_L8x16_LOOP:
+
+       dcbt            AO,     PRE
+       KERNEL8x16_1
+       dcbt            AO,     PRE
+       KERNEL8x16_2
+       dcbt            AO,     PRE
+       KERNEL8x16_1
+       dcbt            AO,     PRE
+       KERNEL8x16_2
+
+       dcbt            AO,     PRE
+       KERNEL8x16_1
+       dcbt            AO,     PRE
+       KERNEL8x16_2
+       dcbt            AO,     PRE
+       KERNEL8x16_1
+       dcbt            AO,     PRE
+       KERNEL8x16_2
+
+       addic.          L,      L,      -1
+       bgt             .LSTRMM_L8x16_LOOP
+
+.LSTRMM_L8x16_LOOP_END:
+
+       dcbt            AO,     PRE
+       KERNEL8x16_1
+       dcbt            AO,     PRE
+       KERNEL8x16_2
+       dcbt            AO,     PRE
+       KERNEL8x16_1
+       dcbt            AO,     PRE
+       KERNEL8x16_2
+
+       dcbt            AO,     PRE
+       KERNEL8x16_1
+       dcbt            AO,     PRE
+       KERNEL8x16_2
+       dcbt            AO,     PRE
+       KERNEL8x16_1
+       KERNEL8x16_E2
+
+       b               .LSTRMM_L8x16_SUB1
+
+.LSTRMM_L8x16_SUB4:
+
+       dcbt            AO,     PRE
+       KERNEL8x16_SUBI1
+       dcbt            AO,     PRE
+       KERNEL8x16_SUB1
+       dcbt            AO,     PRE
+       KERNEL8x16_SUB1
+       dcbt            AO,     PRE
+       KERNEL8x16_SUB1
+
+       KERNEL8x16_SUB1
+       KERNEL8x16_SUB1
+       KERNEL8x16_SUB1
+       KERNEL8x16_SUB1
+
+       b               .LSTRMM_L8x16_SUB1
+
+.LSTRMM_L8x16_SUB0:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+
+       KERNEL8x16_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LSTRMM_L8x16_SAVE
+       b               .LSTRMM_L8x16_SUB2
+
+.LSTRMM_L8x16_SUB1:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+       ble             .LSTRMM_L8x16_SAVE
+
+.LSTRMM_L8x16_SUB2:
+
+       KERNEL8x16_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LSTRMM_L8x16_SUB2
+
+.LSTRMM_L8x16_SAVE:
+
+       SAVE8x16
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
+       slwi            T2,     T1,     5                       // TEMP1 * Number of values in B shifted -> TEMP2
+       slwi            T1,     T1,     6                       // TEMP1 * Number of values in A shifted -> TEMP1
+       add             BO,     BO,     T2                                      // BO += TEMP2 * number of values in B shifted
+       add             AO,     AO,     T1                                      // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+       addi            KK,     KK,     16                              // KK += Number of values in A
+#endif
+
+
+       addic.          I,      I,      -1
+       bgt             .LSTRMM_L8x16_BEGIN
+
+.LSTRMM_L8x16_END:
+
+.LSTRMM_L8x8_BEGIN:
+       andi.           T2,     M,      15
+       ble             .LSTRMM_L8x1_END
+
+       andi.           T1,     M,      8
+       ble             .LSTRMM_L8x8_END
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       mr              BO,     B                                       // B -> BO
+#else
+       mr              BO,     B                                       // B -> BO
+       slwi            T1,     KK,     5                               // Number of values in B shifted
+       slwi            T2,     KK,     5                               // Number of values in A shifted
+       add             BO,     BO,     T1                              // Add values to BO
+       add             AO,     AO,     T2                              // Add values to AO
+#endif
+
+#if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       sub             T1,     K,      KK                              // K - KK -> TEMP1
+#else
+       mr              T1,     KK                                      // KK -> KTEMP
+#ifdef LEFT
+       addi            T1,     T1,     8                               // KTEMP + Number of values in A -> KTEMP
+#else
+       addi            T1,     T1,     8                               // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+       mr              KKK,    T1
+       mr              K1,     T1
+       srawi.          L,      K1,     3                               // KTEMP / 8 -> L
+       ble             .LSTRMM_L8x8_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LSTRMM_L8x8_SUB4
+
+.LSTRMM_L8x8_LOOP_START:
+
+       LOAD8x8_1
+       KERNEL8x8_I1
+       KERNEL8x8_2
+       KERNEL8x8_1
+       KERNEL8x8_2
+
+       KERNEL8x8_1
+       KERNEL8x8_2
+       KERNEL8x8_1
+       KERNEL8x8_2
+
+       addic.          L,      L,      -2
+       ble             .LSTRMM_L8x8_LOOP_END
+
+       .align 5
+
+.LSTRMM_L8x8_LOOP:
+
+       KERNEL8x8_1
+       KERNEL8x8_2
+       KERNEL8x8_1
+       KERNEL8x8_2
+
+       KERNEL8x8_1
+       KERNEL8x8_2
+       KERNEL8x8_1
+       KERNEL8x8_2
+
+       addic.          L,      L,      -1
+       bgt             .LSTRMM_L8x8_LOOP
+
+.LSTRMM_L8x8_LOOP_END:
+
+       KERNEL8x8_1
+       KERNEL8x8_2
+       KERNEL8x8_1
+       KERNEL8x8_2
+
+       KERNEL8x8_1
+       KERNEL8x8_2
+       KERNEL8x8_1
+       KERNEL8x8_E2
+
+       b               .LSTRMM_L8x8_SUB1
+
+.LSTRMM_L8x8_SUB4:
+
+       KERNEL8x8_SUBI1
+       KERNEL8x8_SUB1
+       KERNEL8x8_SUB1
+       KERNEL8x8_SUB1
+
+       KERNEL8x8_SUB1
+       KERNEL8x8_SUB1
+       KERNEL8x8_SUB1
+       KERNEL8x8_SUB1
+
+       b               .LSTRMM_L8x8_SUB1
+
+.LSTRMM_L8x8_SUB0:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+
+       KERNEL8x8_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LSTRMM_L8x8_SAVE
+       b               .LSTRMM_L8x8_SUB2
+
+.LSTRMM_L8x8_SUB1:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+       ble             .LSTRMM_L8x8_SAVE
+
+.LSTRMM_L8x8_SUB2:
+
+       KERNEL8x8_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LSTRMM_L8x8_SUB2
+
+.LSTRMM_L8x8_SAVE:
+
+       SAVE8x8
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
+       slwi            T2,     T1,     5                       // TEMP1 * Number of values in B shifted -> TEMP2
+       slwi            T1,     T1,     5                       // TEMP1 * Number of values in A shifted -> TEMP1
+       add             BO,     BO,     T2                                      // BO += TEMP2 * number of values in B shifted
+       add             AO,     AO,     T1                                      // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+       addi            KK,     KK,     8                               // KK += Number of values in A
+#endif
+
+
+.LSTRMM_L8x8_END:
+
+.LSTRMM_L8x4_BEGIN:
+
+       andi.           T1,     M,      4
+       ble             .LSTRMM_L8x4_END
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       mr              BO,     B                                       // B -> BO
+#else
+       mr              BO,     B                                       // B -> BO
+       slwi            T1,     KK,     5                               // Number of values in B shifted
+       slwi            T2,     KK,     4                               // Number of values in A shifted
+       add             BO,     BO,     T1                              // Add values to BO
+       add             AO,     AO,     T2                              // Add values to AO
+#endif
+
+#if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       sub             T1,     K,      KK                              // K - KK -> TEMP1
+#else
+       mr              T1,     KK                                      // KK -> KTEMP
+#ifdef LEFT
+       addi            T1,     T1,     4                               // KTEMP + Number of values in A -> KTEMP
+#else
+       addi            T1,     T1,     8                               // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+       mr              KKK,    T1
+       mr              K1,     T1
+       srawi.          L,      K1,     3                               // KTEMP / 8 -> L
+       ble             .LSTRMM_L8x4_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LSTRMM_L8x4_SUB4
+
+.LSTRMM_L8x4_LOOP_START:
+
+       LOAD8x4_1
+       KERNEL8x4_I1
+       KERNEL8x4_2
+       KERNEL8x4_1
+       KERNEL8x4_2
+
+       KERNEL8x4_1
+       KERNEL8x4_2
+       KERNEL8x4_1
+       KERNEL8x4_2
+
+       addic.          L,      L,      -2
+       ble             .LSTRMM_L8x4_LOOP_END
+
+       .align 5
+
+.LSTRMM_L8x4_LOOP:
+
+       KERNEL8x4_1
+       KERNEL8x4_2
+       KERNEL8x4_1
+       KERNEL8x4_2
+
+       KERNEL8x4_1
+       KERNEL8x4_2
+       KERNEL8x4_1
+       KERNEL8x4_2
+
+       addic.          L,      L,      -1
+       bgt             .LSTRMM_L8x4_LOOP
+
+.LSTRMM_L8x4_LOOP_END:
+
+       KERNEL8x4_1
+       KERNEL8x4_2
+       KERNEL8x4_1
+       KERNEL8x4_2
+
+       KERNEL8x4_1
+       KERNEL8x4_2
+       KERNEL8x4_1
+       KERNEL8x4_E2
+
+       b               .LSTRMM_L8x4_SUB1
+
+.LSTRMM_L8x4_SUB4:
+
+       KERNEL8x4_SUBI1
+       KERNEL8x4_SUB1
+       KERNEL8x4_SUB1
+       KERNEL8x4_SUB1
+
+       KERNEL8x4_SUB1
+       KERNEL8x4_SUB1
+       KERNEL8x4_SUB1
+       KERNEL8x4_SUB1
+
+       b               .LSTRMM_L8x4_SUB1
+
+.LSTRMM_L8x4_SUB0:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+
+       KERNEL8x4_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LSTRMM_L8x4_SAVE
+       b               .LSTRMM_L8x4_SUB2
+
+.LSTRMM_L8x4_SUB1:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+       ble             .LSTRMM_L8x4_SAVE
+
+.LSTRMM_L8x4_SUB2:
+
+       KERNEL8x4_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LSTRMM_L8x4_SUB2
+
+.LSTRMM_L8x4_SAVE:
+
+       SAVE8x4
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
+       slwi            T2,     T1,     5                       // TEMP1 * Number of values in B shifted -> TEMP2
+       slwi            T1,     T1,     4                       // TEMP1 * Number of values in A shifted -> TEMP1
+       add             BO,     BO,     T2                                      // BO += TEMP2 * number of values in B shifted
+       add             AO,     AO,     T1                                      // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+       addi            KK,     KK,     4                               // KK += Number of values in A
+#endif
+
+
+.LSTRMM_L8x4_END:
+
+.LSTRMM_L8x2_BEGIN:
+
+       andi.           T1,     M,      2
+       ble             .LSTRMM_L8x2_END
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       mr              BO,     B                                       // B -> BO
+#else
+       mr              BO,     B                                       // B -> BO
+       slwi            T1,     KK,     5                               // Number of values in B shifted
+       slwi            T2,     KK,     3                               // Number of values in A shifted
+       add             BO,     BO,     T1                              // Add values to BO
+       add             AO,     AO,     T2                              // Add values to AO
+#endif
+
+#if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       sub             T1,     K,      KK                              // K - KK -> TEMP1
+#else
+       mr              T1,     KK                                      // KK -> KTEMP
+#ifdef LEFT
+       addi            T1,     T1,     2                               // KTEMP + Number of values in A -> KTEMP
+#else
+       addi            T1,     T1,     8                               // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+       mr              KKK,    T1
+       mr              K1,     T1
+       srawi.          L,      K1,     3                               // KTEMP / 8 -> L
+       ble             .LSTRMM_L8x2_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LSTRMM_L8x2_SUB4
+
+.LSTRMM_L8x2_LOOP_START:
+
+       LOAD8x2_1
+       KERNEL8x2_I1
+       KERNEL8x2_2
+       KERNEL8x2_1
+       KERNEL8x2_2
+
+       KERNEL8x2_1
+       KERNEL8x2_2
+       KERNEL8x2_1
+       KERNEL8x2_2
+
+       addic.          L,      L,      -2
+       ble             .LSTRMM_L8x2_LOOP_END
+
+       .align 5
+
+.LSTRMM_L8x2_LOOP:
+
+       KERNEL8x2_1
+       KERNEL8x2_2
+       KERNEL8x2_1
+       KERNEL8x2_2
+
+       KERNEL8x2_1
+       KERNEL8x2_2
+       KERNEL8x2_1
+       KERNEL8x2_2
+
+       addic.          L,      L,      -1
+       bgt             .LSTRMM_L8x2_LOOP
+
+.LSTRMM_L8x2_LOOP_END:
+
+       KERNEL8x2_1
+       KERNEL8x2_2
+       KERNEL8x2_1
+       KERNEL8x2_2
+
+       KERNEL8x2_1
+       KERNEL8x2_2
+       KERNEL8x2_1
+       KERNEL8x2_E2
+
+       b               .LSTRMM_L8x2_SUB1
+
+.LSTRMM_L8x2_SUB4:
+
+       KERNEL8x2_SUBI1
+       KERNEL8x2_SUB1
+       KERNEL8x2_SUB1
+       KERNEL8x2_SUB1
+
+       KERNEL8x2_SUB1
+       KERNEL8x2_SUB1
+       KERNEL8x2_SUB1
+       KERNEL8x2_SUB1
+
+       b               .LSTRMM_L8x2_SUB1
+
+.LSTRMM_L8x2_SUB0:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+
+       KERNEL8x2_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LSTRMM_L8x2_SAVE
+       b               .LSTRMM_L8x2_SUB2
+
+.LSTRMM_L8x2_SUB1:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+       ble             .LSTRMM_L8x2_SAVE
+
+.LSTRMM_L8x2_SUB2:
+
+       KERNEL8x2_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LSTRMM_L8x2_SUB2
+
+.LSTRMM_L8x2_SAVE:
+
+       SAVE8x2
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
+       slwi            T2,     T1,     5                       // TEMP1 * Number of values in B shifted -> TEMP2
+       slwi            T1,     T1,     3                       // TEMP1 * Number of values in A shifted -> TEMP1
+       add             BO,     BO,     T2                                      // BO += TEMP2 (bytes of B to skip)
+       add             AO,     AO,     T1                                      // AO += TEMP1 (bytes of A to skip)
+#endif
+
+#if defined(LEFT)
+       addi            KK,     KK,     2                               // KK += Number of values in A
+#endif
+
+
+.LSTRMM_L8x2_END:
+
+.LSTRMM_L8x1_BEGIN:
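+       // M & 1 remainder: the last single row of A against the 8-column panel of B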
+
+       andi.           T1,     M,      1
+       ble             .LSTRMM_L8x1_END
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       mr              BO,     B                                       // B -> BO
+#else
+       mr              BO,     B                                       // B -> BO
+       slwi            T1,     KK,     5                               // Number of values in B shifted
+       slwi            T2,     KK,     2                               // Number of values in A shifted
+       add             BO,     BO,     T1                              // Add values to BO
+       add             AO,     AO,     T2                              // Add values to AO
+#endif
+
+#if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       sub             T1,     K,      KK                              // K - KK -> TEMP1
+#else
+       mr              T1,     KK                                      // KK -> KTEMP
+#ifdef LEFT
+       addi            T1,     T1,     1                               // KTEMP + Number of values in A -> KTEMP
+#else
+       addi            T1,     T1,     8                               // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+       mr              KKK,    T1
+       mr              K1,     T1
+       srawi.          L,      K1,     3                               // KTEMP / 8 -> L
+       ble             .LSTRMM_L8x1_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LSTRMM_L8x1_SUB4
+
+.LSTRMM_L8x1_LOOP_START:
+
+       LOAD8x1_1
+       KERNEL8x1_I1
+       KERNEL8x1_2
+       KERNEL8x1_1
+       KERNEL8x1_2
+
+       KERNEL8x1_1
+       KERNEL8x1_2
+       KERNEL8x1_1
+       KERNEL8x1_2
+
+       addic.          L,      L,      -2
+       ble             .LSTRMM_L8x1_LOOP_END
+
+       .align 5
+
+.LSTRMM_L8x1_LOOP:
+
+       KERNEL8x1_1
+       KERNEL8x1_2
+       KERNEL8x1_1
+       KERNEL8x1_2
+
+       KERNEL8x1_1
+       KERNEL8x1_2
+       KERNEL8x1_1
+       KERNEL8x1_2
+
+       addic.          L,      L,      -1
+       bgt             .LSTRMM_L8x1_LOOP
+
+.LSTRMM_L8x1_LOOP_END:
+
+       KERNEL8x1_1
+       KERNEL8x1_2
+       KERNEL8x1_1
+       KERNEL8x1_2
+
+       KERNEL8x1_1
+       KERNEL8x1_2
+       KERNEL8x1_1
+       KERNEL8x1_E2
+
+       b               .LSTRMM_L8x1_SUB1
+
+.LSTRMM_L8x1_SUB4:
+
+       KERNEL8x1_SUBI1
+       KERNEL8x1_SUB1
+       KERNEL8x1_SUB1
+       KERNEL8x1_SUB1
+
+       KERNEL8x1_SUB1
+       KERNEL8x1_SUB1
+       KERNEL8x1_SUB1
+       KERNEL8x1_SUB1
+
+       b               .LSTRMM_L8x1_SUB1
+
+.LSTRMM_L8x1_SUB0:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+
+       KERNEL8x1_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LSTRMM_L8x1_SAVE
+       b               .LSTRMM_L8x1_SUB2
+
+.LSTRMM_L8x1_SUB1:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+       ble             .LSTRMM_L8x1_SAVE
+
+.LSTRMM_L8x1_SUB2:
+
+       KERNEL8x1_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LSTRMM_L8x1_SUB2
+
+.LSTRMM_L8x1_SAVE:
+
+       SAVE8x1
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
+       slwi            T2,     T1,     5                       // TEMP1 * Number of values in B shifted -> TEMP2
+       slwi            T1,     T1,     2                       // TEMP1 * Number of values in A shifted -> TEMP1
+       add             BO,     BO,     T2                                      // BO += TEMP2 (bytes of B to skip)
+       add             AO,     AO,     T1                                      // AO += TEMP1 (bytes of A to skip)
+#endif
+
+#if defined(LEFT)
+       addi            KK,     KK,     1                               // KK += Number of values in A
+#endif
+
+
+.LSTRMM_L8x1_END:
+
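+       // advance B past the 8-column panel just processed: B += K * 8 * 4 bytes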
+       slwi            T1,     K,      5
+       add             B,      B,      T1
+
+#if !defined(LEFT)
+       addi            KK,     KK,     8                                       // KK += Number of values in B
+#endif
+
+
+       addic.          J,      J,      -1
+       bgt             .LSTRMM_L8_BEGIN
+
+       andi.           T2,     N,      7
+       ble             .L999
+
+.LSTRMM_L8_END:
+
+       b               .LSTRMM_L4_BEGIN
+
+.L999_H1:
+
+       b               .L999
+
+.LSTRMM_L4_BEGIN:
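+       // N & 4: process a 4-column panel of B, tiling M with 16/8/4/2/1-row kernels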
+
+       andi.           T1,     N,      4
+       ble             .LSTRMM_L4_END
+       mr              CO,     C
+       mr              AO,     A
+       slwi            T1,     LDC,    2
+       add             C,      C,      T1
+
+#if defined(LEFT)
+       mr              KK,     OFFSET          // OFFSET -> KK
+#endif
+
+       srawi.          I,      M,      4
+       ble             .LSTRMM_L4x16_END
+
+.LSTRMM_L4x16_BEGIN:
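+       // 16-row x 4-column tiles: main loop unrolled by 8, with dcbt prefetch of AO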
+
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       mr              BO,     B                                       // B -> BO
+#else
+       mr              BO,     B                                       // B -> BO
+       slwi            T1,     KK,     4                               // Number of values in B shifted
+       slwi            T2,     KK,     6                               // Number of values in A shifted
+       add             BO,     BO,     T1                              // Add values to BO
+       add             AO,     AO,     T2                              // Add values to AO
+#endif
+
+#if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       sub             T1,     K,      KK                              // K - KK -> TEMP1
+#else
+       mr              T1,     KK                                      // KK -> KTEMP
+#ifdef LEFT
+       addi            T1,     T1,     16                              // KTEMP + Number of values in A -> KTEMP
+#else
+       addi            T1,     T1,     4                               // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+       mr              KKK,    T1
+       mr              K1,     T1
+       srawi.          L,      K1,     3                               // KTEMP / 8 -> L
+       ble             .LSTRMM_L4x16_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LSTRMM_L4x16_SUB4
+
+.LSTRMM_L4x16_LOOP_START:
+
+       dcbt            AO,     PRE
+       LOAD4x16_1
+       dcbt            AO,     PRE
+       KERNEL4x16_I1
+       dcbt            AO,     PRE
+       KERNEL4x16_2
+       dcbt            AO,     PRE
+       KERNEL4x16_1
+       dcbt            AO,     PRE
+       KERNEL4x16_2
+
+       dcbt            AO,     PRE
+       KERNEL4x16_1
+       dcbt            AO,     PRE
+       KERNEL4x16_2
+       dcbt            AO,     PRE
+       KERNEL4x16_1
+       dcbt            AO,     PRE
+       KERNEL4x16_2
+
+       addic.          L,      L,      -2
+       ble             .LSTRMM_L4x16_LOOP_END
+
+       .align 5
+
+.LSTRMM_L4x16_LOOP:
+
+       dcbt            AO,     PRE
+       KERNEL4x16_1
+       dcbt            AO,     PRE
+       KERNEL4x16_2
+       dcbt            AO,     PRE
+       KERNEL4x16_1
+       dcbt            AO,     PRE
+       KERNEL4x16_2
+
+       dcbt            AO,     PRE
+       KERNEL4x16_1
+       dcbt            AO,     PRE
+       KERNEL4x16_2
+       dcbt            AO,     PRE
+       KERNEL4x16_1
+       dcbt            AO,     PRE
+       KERNEL4x16_2
+
+       addic.          L,      L,      -1
+       bgt             .LSTRMM_L4x16_LOOP
+
+.LSTRMM_L4x16_LOOP_END:
+
+       dcbt            AO,     PRE
+       KERNEL4x16_1
+       dcbt            AO,     PRE
+       KERNEL4x16_2
+       dcbt            AO,     PRE
+       KERNEL4x16_1
+       dcbt            AO,     PRE
+       KERNEL4x16_2
+
+       dcbt            AO,     PRE
+       KERNEL4x16_1
+       dcbt            AO,     PRE
+       KERNEL4x16_2
+       dcbt            AO,     PRE
+       KERNEL4x16_1
+       KERNEL4x16_E2
+
+       b               .LSTRMM_L4x16_SUB1
+
+.LSTRMM_L4x16_SUB4:
+
+       dcbt            AO,     PRE
+       KERNEL4x16_SUBI1
+       dcbt            AO,     PRE
+       KERNEL4x16_SUB1
+       dcbt            AO,     PRE
+       KERNEL4x16_SUB1
+       dcbt            AO,     PRE
+       KERNEL4x16_SUB1
+
+       KERNEL4x16_SUB1
+       KERNEL4x16_SUB1
+       KERNEL4x16_SUB1
+       KERNEL4x16_SUB1
+
+       b               .LSTRMM_L4x16_SUB1
+
+.LSTRMM_L4x16_SUB0:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+
+       KERNEL4x16_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LSTRMM_L4x16_SAVE
+       b               .LSTRMM_L4x16_SUB2
+
+.LSTRMM_L4x16_SUB1:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+       ble             .LSTRMM_L4x16_SAVE
+
+.LSTRMM_L4x16_SUB2:
+
+       KERNEL4x16_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LSTRMM_L4x16_SUB2
+
+.LSTRMM_L4x16_SAVE:
+
+       SAVE4x16
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
+       slwi            T2,     T1,     4                       // TEMP1 * Number of values in B shifted -> TEMP2
+       slwi            T1,     T1,     6                       // TEMP1 * Number of values in A shifted -> TEMP1
+       add             BO,     BO,     T2                                      // BO += TEMP2 (bytes of B to skip)
+       add             AO,     AO,     T1                                      // AO += TEMP1 (bytes of A to skip)
+#endif
+
+#if defined(LEFT)
+       addi            KK,     KK,     16                              // KK += Number of values in A
+#endif
+
+
+       addic.          I,      I,      -1
+       bgt             .LSTRMM_L4x16_BEGIN
+
+.LSTRMM_L4x16_END:
+
+.LSTRMM_L4x8_BEGIN:
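+       // leftover rows (M & 15) of the 4-column panel; try an 8-row tile first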
+       andi.           T2,     M,      15
+       ble             .LSTRMM_L4x1_END
+
+       andi.           T1,     M,      8
+       ble             .LSTRMM_L4x8_END
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       mr              BO,     B                                       // B -> BO
+#else
+       mr              BO,     B                                       // B -> BO
+       slwi            T1,     KK,     4                               // Number of values in B shifted
+       slwi            T2,     KK,     5                               // Number of values in A shifted
+       add             BO,     BO,     T1                              // Add values to BO
+       add             AO,     AO,     T2                              // Add values to AO
+#endif
+
+#if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       sub             T1,     K,      KK                              // K - KK -> TEMP1
+#else
+       mr              T1,     KK                                      // KK -> KTEMP
+#ifdef LEFT
+       addi            T1,     T1,     8                               // KTEMP + Number of values in A -> KTEMP
+#else
+       addi            T1,     T1,     4                               // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+       mr              KKK,    T1
+       mr              K1,     T1
+       srawi.          L,      K1,     3                               // KTEMP / 8 -> L
+       ble             .LSTRMM_L4x8_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LSTRMM_L4x8_SUB4
+
+.LSTRMM_L4x8_LOOP_START:
+
+       LOAD4x8_1
+       KERNEL4x8_I1
+       KERNEL4x8_2
+       KERNEL4x8_1
+       KERNEL4x8_2
+
+       KERNEL4x8_1
+       KERNEL4x8_2
+       KERNEL4x8_1
+       KERNEL4x8_2
+
+       addic.          L,      L,      -2
+       ble             .LSTRMM_L4x8_LOOP_END
+
+       .align 5
+
+.LSTRMM_L4x8_LOOP:
+
+       KERNEL4x8_1
+       KERNEL4x8_2
+       KERNEL4x8_1
+       KERNEL4x8_2
+
+       KERNEL4x8_1
+       KERNEL4x8_2
+       KERNEL4x8_1
+       KERNEL4x8_2
+
+       addic.          L,      L,      -1
+       bgt             .LSTRMM_L4x8_LOOP
+
+.LSTRMM_L4x8_LOOP_END:
+
+       KERNEL4x8_1
+       KERNEL4x8_2
+       KERNEL4x8_1
+       KERNEL4x8_2
+
+       KERNEL4x8_1
+       KERNEL4x8_2
+       KERNEL4x8_1
+       KERNEL4x8_E2
+
+       b               .LSTRMM_L4x8_SUB1
+
+.LSTRMM_L4x8_SUB4:
+
+       KERNEL4x8_SUBI1
+       KERNEL4x8_SUB1
+       KERNEL4x8_SUB1
+       KERNEL4x8_SUB1
+
+       KERNEL4x8_SUB1
+       KERNEL4x8_SUB1
+       KERNEL4x8_SUB1
+       KERNEL4x8_SUB1
+
+       b               .LSTRMM_L4x8_SUB1
+
+.LSTRMM_L4x8_SUB0:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+
+       KERNEL4x8_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LSTRMM_L4x8_SAVE
+       b               .LSTRMM_L4x8_SUB2
+
+.LSTRMM_L4x8_SUB1:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+       ble             .LSTRMM_L4x8_SAVE
+
+.LSTRMM_L4x8_SUB2:
+
+       KERNEL4x8_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LSTRMM_L4x8_SUB2
+
+.LSTRMM_L4x8_SAVE:
+
+       SAVE4x8
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
+       slwi            T2,     T1,     4                       // TEMP1 * Number of values in B shifted -> TEMP2
+       slwi            T1,     T1,     5                       // TEMP1 * Number of values in A shifted -> TEMP1
+       add             BO,     BO,     T2                                      // BO += TEMP2 (bytes of B to skip)
+       add             AO,     AO,     T1                                      // AO += TEMP1 (bytes of A to skip)
+#endif
+
+#if defined(LEFT)
+       addi            KK,     KK,     8                               // KK += Number of values in A
+#endif
+
+
+.LSTRMM_L4x8_END:
+
+.LSTRMM_L4x4_BEGIN:
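+       // M & 4 remainder against the 4-column panel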
+
+       andi.           T1,     M,      4
+       ble             .LSTRMM_L4x4_END
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       mr              BO,     B                                       // B -> BO
+#else
+       mr              BO,     B                                       // B -> BO
+       slwi            T1,     KK,     4                               // Number of values in B shifted
+       slwi            T2,     KK,     4                               // Number of values in A shifted
+       add             BO,     BO,     T1                              // Add values to BO
+       add             AO,     AO,     T2                              // Add values to AO
+#endif
+
+#if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       sub             T1,     K,      KK                              // K - KK -> TEMP1
+#else
+       mr              T1,     KK                                      // KK -> KTEMP
+#ifdef LEFT
+       addi            T1,     T1,     4                               // KTEMP + Number of values in A -> KTEMP
+#else
+       addi            T1,     T1,     4                               // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+       mr              KKK,    T1
+       mr              K1,     T1
+       srawi.          L,      K1,     3                               // KTEMP / 8 -> L
+       ble             .LSTRMM_L4x4_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LSTRMM_L4x4_SUB4
+
+.LSTRMM_L4x4_LOOP_START:
+
+       LOAD4x4_1
+       KERNEL4x4_I1
+       KERNEL4x4_2
+       KERNEL4x4_1
+       KERNEL4x4_2
+
+       KERNEL4x4_1
+       KERNEL4x4_2
+       KERNEL4x4_1
+       KERNEL4x4_2
+
+       addic.          L,      L,      -2
+       ble             .LSTRMM_L4x4_LOOP_END
+
+       .align 5
+
+.LSTRMM_L4x4_LOOP:
+
+       KERNEL4x4_1
+       KERNEL4x4_2
+       KERNEL4x4_1
+       KERNEL4x4_2
+
+       KERNEL4x4_1
+       KERNEL4x4_2
+       KERNEL4x4_1
+       KERNEL4x4_2
+
+       addic.          L,      L,      -1
+       bgt             .LSTRMM_L4x4_LOOP
+
+.LSTRMM_L4x4_LOOP_END:
+
+       KERNEL4x4_1
+       KERNEL4x4_2
+       KERNEL4x4_1
+       KERNEL4x4_2
+
+       KERNEL4x4_1
+       KERNEL4x4_2
+       KERNEL4x4_1
+       KERNEL4x4_E2
+
+       b               .LSTRMM_L4x4_SUB1
+
+.LSTRMM_L4x4_SUB4:
+
+       KERNEL4x4_SUBI1
+       KERNEL4x4_SUB1
+       KERNEL4x4_SUB1
+       KERNEL4x4_SUB1
+
+       KERNEL4x4_SUB1
+       KERNEL4x4_SUB1
+       KERNEL4x4_SUB1
+       KERNEL4x4_SUB1
+
+       b               .LSTRMM_L4x4_SUB1
+
+.LSTRMM_L4x4_SUB0:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+
+       KERNEL4x4_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LSTRMM_L4x4_SAVE
+       b               .LSTRMM_L4x4_SUB2
+
+.LSTRMM_L4x4_SUB1:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+       ble             .LSTRMM_L4x4_SAVE
+
+.LSTRMM_L4x4_SUB2:
+
+       KERNEL4x4_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LSTRMM_L4x4_SUB2
+
+.LSTRMM_L4x4_SAVE:
+
+       SAVE4x4
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
+       slwi            T2,     T1,     4                       // TEMP1 * Number of values in B shifted -> TEMP2
+       slwi            T1,     T1,     4                       // TEMP1 * Number of values in A shifted -> TEMP1
+       add             BO,     BO,     T2                                      // BO += TEMP2 (bytes of B to skip)
+       add             AO,     AO,     T1                                      // AO += TEMP1 (bytes of A to skip)
+#endif
+
+#if defined(LEFT)
+       addi            KK,     KK,     4                               // KK += Number of values in A
+#endif
+
+
+.LSTRMM_L4x4_END:
+
+.LSTRMM_L4x2_BEGIN:
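+       // M & 2 remainder against the 4-column panel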
+
+       andi.           T1,     M,      2
+       ble             .LSTRMM_L4x2_END
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       mr              BO,     B                                       // B -> BO
+#else
+       mr              BO,     B                                       // B -> BO
+       slwi            T1,     KK,     4                               // Number of values in B shifted
+       slwi            T2,     KK,     3                               // Number of values in A shifted
+       add             BO,     BO,     T1                              // Add values to BO
+       add             AO,     AO,     T2                              // Add values to AO
+#endif
+
+#if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       sub             T1,     K,      KK                              // K - KK -> TEMP1
+#else
+       mr              T1,     KK                                      // KK -> KTEMP
+#ifdef LEFT
+       addi            T1,     T1,     2                               // KTEMP + Number of values in A -> KTEMP
+#else
+       addi            T1,     T1,     4                               // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+       mr              KKK,    T1
+       mr              K1,     T1
+       srawi.          L,      K1,     3                               // KTEMP / 8 -> L
+       ble             .LSTRMM_L4x2_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LSTRMM_L4x2_SUB4
+
+.LSTRMM_L4x2_LOOP_START:
+
+       LOAD4x2_1
+       KERNEL4x2_I1
+       KERNEL4x2_2
+       KERNEL4x2_1
+       KERNEL4x2_2
+
+       KERNEL4x2_1
+       KERNEL4x2_2
+       KERNEL4x2_1
+       KERNEL4x2_2
+
+       addic.          L,      L,      -2
+       ble             .LSTRMM_L4x2_LOOP_END
+
+       .align 5
+
+.LSTRMM_L4x2_LOOP:
+
+       KERNEL4x2_1
+       KERNEL4x2_2
+       KERNEL4x2_1
+       KERNEL4x2_2
+
+       KERNEL4x2_1
+       KERNEL4x2_2
+       KERNEL4x2_1
+       KERNEL4x2_2
+
+       addic.          L,      L,      -1
+       bgt             .LSTRMM_L4x2_LOOP
+
+.LSTRMM_L4x2_LOOP_END:
+
+       KERNEL4x2_1
+       KERNEL4x2_2
+       KERNEL4x2_1
+       KERNEL4x2_2
+
+       KERNEL4x2_1
+       KERNEL4x2_2
+       KERNEL4x2_1
+       KERNEL4x2_E2
+
+       b               .LSTRMM_L4x2_SUB1
+
+.LSTRMM_L4x2_SUB4:
+
+       KERNEL4x2_SUBI1
+       KERNEL4x2_SUB1
+       KERNEL4x2_SUB1
+       KERNEL4x2_SUB1
+
+       KERNEL4x2_SUB1
+       KERNEL4x2_SUB1
+       KERNEL4x2_SUB1
+       KERNEL4x2_SUB1
+
+       b               .LSTRMM_L4x2_SUB1
+
+.LSTRMM_L4x2_SUB0:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+
+       KERNEL4x2_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LSTRMM_L4x2_SAVE
+       b               .LSTRMM_L4x2_SUB2
+
+.LSTRMM_L4x2_SUB1:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+       ble             .LSTRMM_L4x2_SAVE
+
+.LSTRMM_L4x2_SUB2:
+
+       KERNEL4x2_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LSTRMM_L4x2_SUB2
+
+.LSTRMM_L4x2_SAVE:
+
+       SAVE4x2
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
+       slwi            T2,     T1,     4                       // TEMP1 * Number of values in B shifted -> TEMP2
+       slwi            T1,     T1,     3                       // TEMP1 * Number of values in A shifted -> TEMP1
+       add             BO,     BO,     T2                                      // BO += TEMP2 (bytes of B to skip)
+       add             AO,     AO,     T1                                      // AO += TEMP1 (bytes of A to skip)
+#endif
+
+#if defined(LEFT)
+       addi            KK,     KK,     2                               // KK += Number of values in A
+#endif
+
+
+.LSTRMM_L4x2_END:
+
+.LSTRMM_L4x1_BEGIN:
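+       // M & 1 remainder against the 4-column panel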
+
+       andi.           T1,     M,      1
+       ble             .LSTRMM_L4x1_END
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       mr              BO,     B                                       // B -> BO
+#else
+       mr              BO,     B                                       // B -> BO
+       slwi            T1,     KK,     4                               // Number of values in B shifted
+       slwi            T2,     KK,     2                               // Number of values in A shifted
+       add             BO,     BO,     T1                              // Add values to BO
+       add             AO,     AO,     T2                              // Add values to AO
+#endif
+
+#if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       sub             T1,     K,      KK                              // K - KK -> TEMP1
+#else
+       mr              T1,     KK                                      // KK -> KTEMP
+#ifdef LEFT
+       addi            T1,     T1,     1                               // KTEMP + Number of values in A -> KTEMP
+#else
+       addi            T1,     T1,     4                               // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+       mr              KKK,    T1
+       mr              K1,     T1
+       srawi.          L,      K1,     3                               // KTEMP / 8 -> L
+       ble             .LSTRMM_L4x1_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LSTRMM_L4x1_SUB4
+
+.LSTRMM_L4x1_LOOP_START:
+
+       LOAD4x1_1
+       KERNEL4x1_I1
+       KERNEL4x1_2
+       KERNEL4x1_1
+       KERNEL4x1_2
+
+       KERNEL4x1_1
+       KERNEL4x1_2
+       KERNEL4x1_1
+       KERNEL4x1_2
+
+       addic.          L,      L,      -2
+       ble             .LSTRMM_L4x1_LOOP_END
+
+       .align 5
+
+.LSTRMM_L4x1_LOOP:
+
+       KERNEL4x1_1
+       KERNEL4x1_2
+       KERNEL4x1_1
+       KERNEL4x1_2
+
+       KERNEL4x1_1
+       KERNEL4x1_2
+       KERNEL4x1_1
+       KERNEL4x1_2
+
+       addic.          L,      L,      -1
+       bgt             .LSTRMM_L4x1_LOOP
+
+.LSTRMM_L4x1_LOOP_END:
+
+       KERNEL4x1_1
+       KERNEL4x1_2
+       KERNEL4x1_1
+       KERNEL4x1_2
+
+       KERNEL4x1_1
+       KERNEL4x1_2
+       KERNEL4x1_1
+       KERNEL4x1_E2
+
+       b               .LSTRMM_L4x1_SUB1
+
+.LSTRMM_L4x1_SUB4:
+
+       KERNEL4x1_SUBI1
+       KERNEL4x1_SUB1
+       KERNEL4x1_SUB1
+       KERNEL4x1_SUB1
+
+       KERNEL4x1_SUB1
+       KERNEL4x1_SUB1
+       KERNEL4x1_SUB1
+       KERNEL4x1_SUB1
+
+       b               .LSTRMM_L4x1_SUB1
+
+.LSTRMM_L4x1_SUB0:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+
+       KERNEL4x1_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LSTRMM_L4x1_SAVE
+       b               .LSTRMM_L4x1_SUB2
+
+.LSTRMM_L4x1_SUB1:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+       ble             .LSTRMM_L4x1_SAVE
+
+.LSTRMM_L4x1_SUB2:
+
+       KERNEL4x1_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LSTRMM_L4x1_SUB2
+
+.LSTRMM_L4x1_SAVE:
+
+       SAVE4x1
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
+       slwi            T2,     T1,     4                       // TEMP1 * Number of values in B shifted -> TEMP2
+       slwi            T1,     T1,     2                       // TEMP1 * Number of values in A shifted -> TEMP1
+       add             BO,     BO,     T2                                      // BO += TEMP2 (bytes of B to skip)
+       add             AO,     AO,     T1                                      // AO += TEMP1 (bytes of A to skip)
+#endif
+
+#if defined(LEFT)
+       addi            KK,     KK,     1                               // KK += Number of values in A
+#endif
+
+
+.LSTRMM_L4x1_END:
+
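+       // advance B past the 4-column panel: B += K * 4 * 4 bytes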
+       slwi            T1,     K,      4
+       add             B,      B,      T1
+
+#if !defined(LEFT)
+       addi            KK,     KK,     4                                       // KK += Number of values in B
+#endif
+
+
+.LSTRMM_L4_END:
+.LSTRMM_L2_BEGIN:
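+       // N & 2: process a 2-column panel of B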
+
+       andi.           T1,     N,      2
+       ble             .LSTRMM_L2_END
+       mr              CO,     C
+       mr              AO,     A
+       slwi            T1,     LDC,    1
+       add             C,      C,      T1
+
+#if defined(LEFT)
+       mr              KK,     OFFSET          // OFFSET -> KK
+#endif
+
+       srawi.          I,      M,      4
+       ble             .LSTRMM_L2x16_END
+
+.LSTRMM_L2x16_BEGIN:
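+       // 16-row x 2-column tiles, again with dcbt prefetch of AO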
+
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       mr              BO,     B                                       // B -> BO
+#else
+       mr              BO,     B                                       // B -> BO
+       slwi            T1,     KK,     3                               // Number of values in B shifted
+       slwi            T2,     KK,     6                               // Number of values in A shifted
+       add             BO,     BO,     T1                              // Add values to BO
+       add             AO,     AO,     T2                              // Add values to AO
+#endif
+
+#if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       sub             T1,     K,      KK                              // K - KK -> TEMP1
+#else
+       mr              T1,     KK                                      // KK -> KTEMP
+#ifdef LEFT
+       addi            T1,     T1,     16                              // KTEMP + Number of values in A -> KTEMP
+#else
+       addi            T1,     T1,     2                               // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+       mr              KKK,    T1
+       mr              K1,     T1
+       srawi.          L,      K1,     3                               // KTEMP / 8 -> L
+       ble             .LSTRMM_L2x16_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LSTRMM_L2x16_SUB4
+
+.LSTRMM_L2x16_LOOP_START:
+
+       dcbt            AO,     PRE
+       LOAD2x16_1
+       dcbt            AO,     PRE
+       KERNEL2x16_I1
+       dcbt            AO,     PRE
+       KERNEL2x16_2
+       dcbt            AO,     PRE
+       KERNEL2x16_1
+       dcbt            AO,     PRE
+       KERNEL2x16_2
+
+       dcbt            AO,     PRE
+       KERNEL2x16_1
+       dcbt            AO,     PRE
+       KERNEL2x16_2
+       dcbt            AO,     PRE
+       KERNEL2x16_1
+       dcbt            AO,     PRE
+       KERNEL2x16_2
+
+       addic.          L,      L,      -2
+       ble             .LSTRMM_L2x16_LOOP_END
+
+       .align 5
+
+.LSTRMM_L2x16_LOOP:
+
+       dcbt            AO,     PRE
+       KERNEL2x16_1
+       dcbt            AO,     PRE
+       KERNEL2x16_2
+       dcbt            AO,     PRE
+       KERNEL2x16_1
+       dcbt            AO,     PRE
+       KERNEL2x16_2
+
+       dcbt            AO,     PRE
+       KERNEL2x16_1
+       dcbt            AO,     PRE
+       KERNEL2x16_2
+       dcbt            AO,     PRE
+       KERNEL2x16_1
+       dcbt            AO,     PRE
+       KERNEL2x16_2
+
+       addic.          L,      L,      -1
+       bgt             .LSTRMM_L2x16_LOOP
+
+.LSTRMM_L2x16_LOOP_END:
+
+       dcbt            AO,     PRE
+       KERNEL2x16_1
+       dcbt            AO,     PRE
+       KERNEL2x16_2
+       dcbt            AO,     PRE
+       KERNEL2x16_1
+       dcbt            AO,     PRE
+       KERNEL2x16_2
+
+       dcbt            AO,     PRE
+       KERNEL2x16_1
+       dcbt            AO,     PRE
+       KERNEL2x16_2
+       dcbt            AO,     PRE
+       KERNEL2x16_1
+       KERNEL2x16_E2
+
+       b               .LSTRMM_L2x16_SUB1
+
+.LSTRMM_L2x16_SUB4:
+
+       dcbt            AO,     PRE
+       KERNEL2x16_SUBI1
+       dcbt            AO,     PRE
+       KERNEL2x16_SUB1
+       dcbt            AO,     PRE
+       KERNEL2x16_SUB1
+       dcbt            AO,     PRE
+       KERNEL2x16_SUB1
+
+       KERNEL2x16_SUB1
+       KERNEL2x16_SUB1
+       KERNEL2x16_SUB1
+       KERNEL2x16_SUB1
+
+       b               .LSTRMM_L2x16_SUB1
+
+.LSTRMM_L2x16_SUB0:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+
+       KERNEL2x16_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LSTRMM_L2x16_SAVE
+       b               .LSTRMM_L2x16_SUB2
+
+.LSTRMM_L2x16_SUB1:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+       ble             .LSTRMM_L2x16_SAVE
+
+.LSTRMM_L2x16_SUB2:
+
+       KERNEL2x16_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LSTRMM_L2x16_SUB2
+
+.LSTRMM_L2x16_SAVE:
+
+       SAVE2x16
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
+       slwi            T2,     T1,     3                       // TEMP1 * Number of values in B shifted -> TEMP2
+       slwi            T1,     T1,     6                       // TEMP1 * Number of values in A shifted -> TEMP1
+       add             BO,     BO,     T2                                      // BO += TEMP2 (bytes of B to skip)
+       add             AO,     AO,     T1                                      // AO += TEMP1 (bytes of A to skip)
+#endif
+
+#if defined(LEFT)
+       addi            KK,     KK,     16                              // KK += Number of values in A
+#endif
+
+
+       addic.          I,      I,      -1
+       bgt             .LSTRMM_L2x16_BEGIN
+
+.LSTRMM_L2x16_END:
+
+.LSTRMM_L2x8_BEGIN:
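+       // leftover rows (M & 15) of the 2-column panel; try an 8-row tile first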
+       andi.           T2,     M,      15
+       ble             .LSTRMM_L2x1_END
+
+       andi.           T1,     M,      8
+       ble             .LSTRMM_L2x8_END
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       mr              BO,     B                                       // B -> BO
+#else
+       mr              BO,     B                                       // B -> BO
+       slwi            T1,     KK,     3                               // Number of values in B shifted
+       slwi            T2,     KK,     5                               // Number of values in A shifted
+       add             BO,     BO,     T1                              // Add values to BO
+       add             AO,     AO,     T2                              // Add values to AO
+#endif
+
+#if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       sub             T1,     K,      KK                              // K - KK -> TEMP1
+#else
+       mr              T1,     KK                                      // KK -> KTEMP
+#ifdef LEFT
+       addi            T1,     T1,     8                               // KTEMP + Number of values in A -> KTEMP
+#else
+       addi            T1,     T1,     2                               // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+       mr              KKK,    T1
+       mr              K1,     T1
+       srawi.          L,      K1,     3                               // KTEMP / 8 -> L
+       ble             .LSTRMM_L2x8_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LSTRMM_L2x8_SUB4
+
+.LSTRMM_L2x8_LOOP_START:
+
+       LOAD2x8_1
+       KERNEL2x8_I1
+       KERNEL2x8_2
+       KERNEL2x8_1
+       KERNEL2x8_2
+
+       KERNEL2x8_1
+       KERNEL2x8_2
+       KERNEL2x8_1
+       KERNEL2x8_2
+
+       addic.          L,      L,      -2
+       ble             .LSTRMM_L2x8_LOOP_END
+
+       .align 5
+
+.LSTRMM_L2x8_LOOP:
+
+       KERNEL2x8_1
+       KERNEL2x8_2
+       KERNEL2x8_1
+       KERNEL2x8_2
+
+       KERNEL2x8_1
+       KERNEL2x8_2
+       KERNEL2x8_1
+       KERNEL2x8_2
+
+       addic.          L,      L,      -1
+       bgt             .LSTRMM_L2x8_LOOP
+
+.LSTRMM_L2x8_LOOP_END:
+
+       KERNEL2x8_1
+       KERNEL2x8_2
+       KERNEL2x8_1
+       KERNEL2x8_2
+
+       KERNEL2x8_1
+       KERNEL2x8_2
+       KERNEL2x8_1
+       KERNEL2x8_E2
+
+       b               .LSTRMM_L2x8_SUB1
+
+.LSTRMM_L2x8_SUB4:
+
+       KERNEL2x8_SUBI1
+       KERNEL2x8_SUB1
+       KERNEL2x8_SUB1
+       KERNEL2x8_SUB1
+
+       KERNEL2x8_SUB1
+       KERNEL2x8_SUB1
+       KERNEL2x8_SUB1
+       KERNEL2x8_SUB1
+
+       b               .LSTRMM_L2x8_SUB1
+
+.LSTRMM_L2x8_SUB0:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+
+       KERNEL2x8_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LSTRMM_L2x8_SAVE
+       b               .LSTRMM_L2x8_SUB2
+
+.LSTRMM_L2x8_SUB1:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+       ble             .LSTRMM_L2x8_SAVE
+
+.LSTRMM_L2x8_SUB2:
+
+       KERNEL2x8_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LSTRMM_L2x8_SUB2
+
+.LSTRMM_L2x8_SAVE:
+
+       SAVE2x8
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
+       slwi            T2,     T1,     3                       // TEMP1 * Number of values in B shifted -> TEMP2
+       slwi            T1,     T1,     5                       // TEMP1 * Number of values in A shifted -> TEMP1
+       add             BO,     BO,     T2                                      // BO += TEMP2 (bytes of B to skip)
+       add             AO,     AO,     T1                                      // AO += TEMP1 (bytes of A to skip)
+#endif
+
+#if defined(LEFT)
+       addi            KK,     KK,     8                               // KK += Number of values in A
+#endif
+
+
+.LSTRMM_L2x8_END:
+
+.LSTRMM_L2x4_BEGIN:
+
+       andi.           T1,     M,      4
+       ble             .LSTRMM_L2x4_END
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       mr              BO,     B                                       // B -> BO
+#else
+       mr              BO,     B                                       // B -> BO
+       slwi            T1,     KK,     3                               // Number of values in B shifted
+       slwi            T2,     KK,     4                               // Number of values in A shifted
+       add             BO,     BO,     T1                              // Add values to BO
+       add             AO,     AO,     T2                              // Add values to AO
+#endif
+
+#if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       sub             T1,     K,      KK                              // K - KK -> TEMP1
+#else
+       mr              T1,     KK                                      // KK -> KTEMP
+#ifdef LEFT
+       addi            T1,     T1,     4                               // KTEMP + Number of values in A -> KTEMP
+#else
+       addi            T1,     T1,     2                               // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+       mr              KKK,    T1
+       mr              K1,     T1
+       srawi.          L,      K1,     3                               // KTEMP / 8 -> L
+       ble             .LSTRMM_L2x4_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LSTRMM_L2x4_SUB4
+
+.LSTRMM_L2x4_LOOP_START:
+
+       LOAD2x4_1
+       KERNEL2x4_I1
+       KERNEL2x4_2
+       KERNEL2x4_1
+       KERNEL2x4_2
+
+       KERNEL2x4_1
+       KERNEL2x4_2
+       KERNEL2x4_1
+       KERNEL2x4_2
+
+       addic.          L,      L,      -2
+       ble             .LSTRMM_L2x4_LOOP_END
+
+       .align 5
+
+.LSTRMM_L2x4_LOOP:
+
+       KERNEL2x4_1
+       KERNEL2x4_2
+       KERNEL2x4_1
+       KERNEL2x4_2
+
+       KERNEL2x4_1
+       KERNEL2x4_2
+       KERNEL2x4_1
+       KERNEL2x4_2
+
+       addic.          L,      L,      -1
+       bgt             .LSTRMM_L2x4_LOOP
+
+.LSTRMM_L2x4_LOOP_END:
+
+       KERNEL2x4_1
+       KERNEL2x4_2
+       KERNEL2x4_1
+       KERNEL2x4_2
+
+       KERNEL2x4_1
+       KERNEL2x4_2
+       KERNEL2x4_1
+       KERNEL2x4_E2
+
+       b               .LSTRMM_L2x4_SUB1
+
+.LSTRMM_L2x4_SUB4:
+
+       KERNEL2x4_SUBI1
+       KERNEL2x4_SUB1
+       KERNEL2x4_SUB1
+       KERNEL2x4_SUB1
+
+       KERNEL2x4_SUB1
+       KERNEL2x4_SUB1
+       KERNEL2x4_SUB1
+       KERNEL2x4_SUB1
+
+       b               .LSTRMM_L2x4_SUB1
+
+.LSTRMM_L2x4_SUB0:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+
+       KERNEL2x4_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LSTRMM_L2x4_SAVE
+       b               .LSTRMM_L2x4_SUB2
+
+.LSTRMM_L2x4_SUB1:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+       ble             .LSTRMM_L2x4_SAVE
+
+.LSTRMM_L2x4_SUB2:
+
+       KERNEL2x4_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LSTRMM_L2x4_SUB2
+
+.LSTRMM_L2x4_SAVE:
+
+       SAVE2x4
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
+       slwi            T2,     T1,     3                       // TEMP1 * Number of values in B shifted -> TEMP2
+       slwi            T1,     T1,     4                       // TEMP1 * Number of values in A shifted -> TEMP1
+       add             BO,     BO,     T2                                      // BO += TEMP2 (bytes of B to skip)
+       add             AO,     AO,     T1                                      // AO += TEMP1 (bytes of A to skip)
+#endif
+
+#if defined(LEFT)
+       addi            KK,     KK,     4                               // KK += Number of values in A
+#endif
+
+
+.LSTRMM_L2x4_END:
+
+.LSTRMM_L2x2_BEGIN:
+
+       andi.           T1,     M,      2
+       ble             .LSTRMM_L2x2_END
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       mr              BO,     B                                       // B -> BO
+#else
+       mr              BO,     B                                       // B -> BO
+       slwi            T1,     KK,     3                               // Number of values in B shifted
+       slwi            T2,     KK,     3                               // Number of values in A shifted
+       add             BO,     BO,     T1                              // Add values to BO
+       add             AO,     AO,     T2                              // Add values to AO
+#endif
+
+#if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       sub             T1,     K,      KK                              // K - KK -> TEMP1
+#else
+       mr              T1,     KK                                      // KK -> KTEMP
+#ifdef LEFT
+       addi            T1,     T1,     2                               // KTEMP + Number of values in A -> KTEMP
+#else
+       addi            T1,     T1,     2                               // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+       mr              KKK,    T1
+       mr              K1,     T1
+       srawi.          L,      K1,     3                               // KTEMP / 8 -> L
+       ble             .LSTRMM_L2x2_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LSTRMM_L2x2_SUB4
+
+.LSTRMM_L2x2_LOOP_START:
+
+       LOAD2x2_1
+       KERNEL2x2_I1
+       KERNEL2x2_2
+       KERNEL2x2_1
+       KERNEL2x2_2
+
+       KERNEL2x2_1
+       KERNEL2x2_2
+       KERNEL2x2_1
+       KERNEL2x2_2
+
+       addic.          L,      L,      -2
+       ble             .LSTRMM_L2x2_LOOP_END
+
+       .align 5
+
+.LSTRMM_L2x2_LOOP:
+
+       KERNEL2x2_1
+       KERNEL2x2_2
+       KERNEL2x2_1
+       KERNEL2x2_2
+
+       KERNEL2x2_1
+       KERNEL2x2_2
+       KERNEL2x2_1
+       KERNEL2x2_2
+
+       addic.          L,      L,      -1
+       bgt             .LSTRMM_L2x2_LOOP
+
+.LSTRMM_L2x2_LOOP_END:
+
+       KERNEL2x2_1
+       KERNEL2x2_2
+       KERNEL2x2_1
+       KERNEL2x2_2
+
+       KERNEL2x2_1
+       KERNEL2x2_2
+       KERNEL2x2_1
+       KERNEL2x2_E2
+
+       b               .LSTRMM_L2x2_SUB1
+
+.LSTRMM_L2x2_SUB4:
+
+       KERNEL2x2_SUBI1
+       KERNEL2x2_SUB1
+       KERNEL2x2_SUB1
+       KERNEL2x2_SUB1
+
+       KERNEL2x2_SUB1
+       KERNEL2x2_SUB1
+       KERNEL2x2_SUB1
+       KERNEL2x2_SUB1
+
+       b               .LSTRMM_L2x2_SUB1
+
+.LSTRMM_L2x2_SUB0:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+
+       KERNEL2x2_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LSTRMM_L2x2_SAVE
+       b               .LSTRMM_L2x2_SUB2
+
+.LSTRMM_L2x2_SUB1:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+       ble             .LSTRMM_L2x2_SAVE
+
+.LSTRMM_L2x2_SUB2:
+
+       KERNEL2x2_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LSTRMM_L2x2_SUB2
+
+.LSTRMM_L2x2_SAVE:
+
+       SAVE2x2
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
+       slwi            T2,     T1,     3                       // TEMP1 * Number of values in B shifted -> TEMP2
+       slwi            T1,     T1,     3                       // TEMP1 * Number of values in A shifted -> TEMP1
+       add             BO,     BO,     T2                                      // BO += TEMP2 (bytes of B to skip)
+       add             AO,     AO,     T1                                      // AO += TEMP1 (bytes of A to skip)
+#endif
+
+#if defined(LEFT)
+       addi            KK,     KK,     2                               // KK += Number of values in A
+#endif
+
+
+.LSTRMM_L2x2_END:
+
+.LSTRMM_L2x1_BEGIN:
+
+       andi.           T1,     M,      1
+       ble             .LSTRMM_L2x1_END
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       mr              BO,     B                                       // B -> BO
+#else
+       mr              BO,     B                                       // B -> BO
+       slwi            T1,     KK,     3                               // Number of values in B shifted
+       slwi            T2,     KK,     2                               // Number of values in A shifted
+       add             BO,     BO,     T1                              // Add values to BO
+       add             AO,     AO,     T2                              // Add values to AO
+#endif
+
+#if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       sub             T1,     K,      KK                              // K - KK -> TEMP1
+#else
+       mr              T1,     KK                                      // KK -> KTEMP
+#ifdef LEFT
+       addi            T1,     T1,     1                               // KTEMP + Number of values in A -> KTEMP
+#else
+       addi            T1,     T1,     2                               // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+       mr              KKK,    T1
+       mr              K1,     T1
+       srawi.          L,      K1,     3                               // KTEMP / 8 -> L
+       ble             .LSTRMM_L2x1_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LSTRMM_L2x1_SUB4
+
+.LSTRMM_L2x1_LOOP_START:
+
+       LOAD2x1_1
+       KERNEL2x1_I1
+       KERNEL2x1_2
+       KERNEL2x1_1
+       KERNEL2x1_2
+
+       KERNEL2x1_1
+       KERNEL2x1_2
+       KERNEL2x1_1
+       KERNEL2x1_2
+
+       addic.          L,      L,      -2
+       ble             .LSTRMM_L2x1_LOOP_END
+
+       .align 5
+
+.LSTRMM_L2x1_LOOP:
+
+       KERNEL2x1_1
+       KERNEL2x1_2
+       KERNEL2x1_1
+       KERNEL2x1_2
+
+       KERNEL2x1_1
+       KERNEL2x1_2
+       KERNEL2x1_1
+       KERNEL2x1_2
+
+       addic.          L,      L,      -1
+       bgt             .LSTRMM_L2x1_LOOP
+
+.LSTRMM_L2x1_LOOP_END:
+
+       KERNEL2x1_1
+       KERNEL2x1_2
+       KERNEL2x1_1
+       KERNEL2x1_2
+
+       KERNEL2x1_1
+       KERNEL2x1_2
+       KERNEL2x1_1
+       KERNEL2x1_E2
+
+       b               .LSTRMM_L2x1_SUB1
+
+.LSTRMM_L2x1_SUB4:
+
+       KERNEL2x1_SUBI1
+       KERNEL2x1_SUB1
+       KERNEL2x1_SUB1
+       KERNEL2x1_SUB1
+
+       KERNEL2x1_SUB1
+       KERNEL2x1_SUB1
+       KERNEL2x1_SUB1
+       KERNEL2x1_SUB1
+
+       b               .LSTRMM_L2x1_SUB1
+
+.LSTRMM_L2x1_SUB0:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+
+       KERNEL2x1_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LSTRMM_L2x1_SAVE
+       b               .LSTRMM_L2x1_SUB2
+
+.LSTRMM_L2x1_SUB1:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+       ble             .LSTRMM_L2x1_SAVE
+
+.LSTRMM_L2x1_SUB2:
+
+       KERNEL2x1_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LSTRMM_L2x1_SUB2
+
+.LSTRMM_L2x1_SAVE:
+
+       SAVE2x1
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
+       slwi            T2,     T1,     3                       // TEMP1 * Number of values in B shifted -> TEMP2
+       slwi            T1,     T1,     2                       // TEMP1 * Number of values in A shifted -> TEMP1
+       add             BO,     BO,     T2                                      // BO += TEMP2 (bytes of B to skip)
+       add             AO,     AO,     T1                                      // AO += TEMP1 (bytes of A to skip)
+#endif
+
+#if defined(LEFT)
+       addi            KK,     KK,     1                               // KK += Number of values in A
+#endif
+
+
+.LSTRMM_L2x1_END:
+
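+       // advance B past the 2-column panel: B += K * 2 * 4 bytes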
+       slwi            T1,     K,      3
+       add             B,      B,      T1
+
+#if !defined(LEFT)
+       addi            KK,     KK,     2                                       // KK += Number of values in B
+#endif
+
+
+.LSTRMM_L2_END:
+.LSTRMM_L1_BEGIN:
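+       // N & 1: last single column of B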
+
+       andi.           T1,     N,      1
+       ble             .LSTRMM_L1_END
+       mr              CO,     C
+       mr              AO,     A
+
+#if defined(LEFT)
+       mr              KK,     OFFSET          // OFFSET -> KK
+#endif
+
+       srawi.          I,      M,      4
+       ble             .LSTRMM_L1x16_END
+
+.LSTRMM_L1x16_BEGIN:
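+       // 16-row x 1-column tiles with dcbt prefetch of AO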
+
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       mr              BO,     B                                       // B -> BO
+#else
+       mr              BO,     B                                       // B -> BO
+       slwi            T1,     KK,     2                               // Number of values in B shifted
+       slwi            T2,     KK,     6                               // Number of values in A shifted
+       add             BO,     BO,     T1                              // Add values to BO
+       add             AO,     AO,     T2                              // Add values to AO
+#endif
+
+#if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       sub             T1,     K,      KK                              // K - KK -> TEMP1
+#else
+       mr              T1,     KK                                      // KK -> KTEMP
+#ifdef LEFT
+       addi            T1,     T1,     16                              // KTEMP + Number of values in A -> KTEMP
+#else
+       addi            T1,     T1,     1                               // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+       mr              KKK,    T1
+       mr              K1,     T1
+       srawi.          L,      K1,     3                               // KTEMP / 8 -> L
+       ble             .LSTRMM_L1x16_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LSTRMM_L1x16_SUB4
+
+.LSTRMM_L1x16_LOOP_START:
+
+       dcbt            AO,     PRE
+       LOAD1x16_1
+       dcbt            AO,     PRE
+       KERNEL1x16_I1
+       dcbt            AO,     PRE
+       KERNEL1x16_2
+       dcbt            AO,     PRE
+       KERNEL1x16_1
+       dcbt            AO,     PRE
+       KERNEL1x16_2
+
+       dcbt            AO,     PRE
+       KERNEL1x16_1
+       dcbt            AO,     PRE
+       KERNEL1x16_2
+       dcbt            AO,     PRE
+       KERNEL1x16_1
+       dcbt            AO,     PRE
+       KERNEL1x16_2
+
+       addic.          L,      L,      -2
+       ble             .LSTRMM_L1x16_LOOP_END
+
+       .align 5
+
+.LSTRMM_L1x16_LOOP:
+
+       dcbt            AO,     PRE
+       KERNEL1x16_1
+       dcbt            AO,     PRE
+       KERNEL1x16_2
+       dcbt            AO,     PRE
+       KERNEL1x16_1
+       dcbt            AO,     PRE
+       KERNEL1x16_2
+
+       dcbt            AO,     PRE
+       KERNEL1x16_1
+       dcbt            AO,     PRE
+       KERNEL1x16_2
+       dcbt            AO,     PRE
+       KERNEL1x16_1
+       dcbt            AO,     PRE
+       KERNEL1x16_2
+
+       addic.          L,      L,      -1
+       bgt             .LSTRMM_L1x16_LOOP
+
+.LSTRMM_L1x16_LOOP_END:
+
+       dcbt            AO,     PRE
+       KERNEL1x16_1
+       dcbt            AO,     PRE
+       KERNEL1x16_2
+       dcbt            AO,     PRE
+       KERNEL1x16_1
+       dcbt            AO,     PRE
+       KERNEL1x16_2
+
+       dcbt            AO,     PRE
+       KERNEL1x16_1
+       dcbt            AO,     PRE
+       KERNEL1x16_2
+       dcbt            AO,     PRE
+       KERNEL1x16_1
+       KERNEL1x16_E2
+
+       b               .LSTRMM_L1x16_SUB1
+
+.LSTRMM_L1x16_SUB4:
+
+       dcbt            AO,     PRE
+       KERNEL1x16_SUBI1
+       dcbt            AO,     PRE
+       KERNEL1x16_SUB1
+       dcbt            AO,     PRE
+       KERNEL1x16_SUB1
+       dcbt            AO,     PRE
+       KERNEL1x16_SUB1
+
+       KERNEL1x16_SUB1
+       KERNEL1x16_SUB1
+       KERNEL1x16_SUB1
+       KERNEL1x16_SUB1
+
+       b               .LSTRMM_L1x16_SUB1
+
+.LSTRMM_L1x16_SUB0:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+
+       KERNEL1x16_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LSTRMM_L1x16_SAVE
+       b               .LSTRMM_L1x16_SUB2
+
+.LSTRMM_L1x16_SUB1:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+       ble             .LSTRMM_L1x16_SAVE
+
+.LSTRMM_L1x16_SUB2:
+
+       KERNEL1x16_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LSTRMM_L1x16_SUB2
+
+.LSTRMM_L1x16_SAVE:
+
+       SAVE1x16
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
+       slwi            T2,     T1,     2                       // TEMP1 * Number of values in B shifted -> TEMP2
+       slwi            T1,     T1,     6                       // TEMP1 * Number of values in A shifted -> TEMP1
+       add             BO,     BO,     T2                                      // BO += TEMP2 (bytes of B to skip)
+       add             AO,     AO,     T1                                      // AO += TEMP1 (bytes of A to skip)
+#endif
+
+#if defined(LEFT)
+       addi            KK,     KK,     16                              // KK += Number of values in A
+#endif
+
+
+       addic.          I,      I,      -1
+       bgt             .LSTRMM_L1x16_BEGIN
+
+.LSTRMM_L1x16_END:
+
+.LSTRMM_L1x8_BEGIN:
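+       // leftover rows (M & 15) of the last column; try an 8-row tile first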
+       andi.           T2,     M,      15
+       ble             .LSTRMM_L1x1_END
+
+       andi.           T1,     M,      8
+       ble             .LSTRMM_L1x8_END
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       mr              BO,     B                                       // B -> BO
+#else
+       mr              BO,     B                                       // B -> BO
+       slwi            T1,     KK,     2                               // KK * (number of values in B) * size
+       slwi            T2,     KK,     5                               // KK * (number of values in A) * size
+       add             BO,     BO,     T1                              // advance BO past the consumed part of B
+       add             AO,     AO,     T2                              // advance AO past the consumed part of A
+#endif
+
+#if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       sub             T1,     K,      KK                              // K - KK -> TEMP1
+#else
+       mr              T1,     KK                                      // KK -> KTEMP
+#ifdef LEFT
+       addi            T1,     T1,     8                               // KTEMP + Number of values in A -> KTEMP
+#else
+       addi            T1,     T1,     1                               // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+       mr              KKK,    T1
+       mr              K1,     T1
+       srawi.          L,      K1,     3                               // KTEMP / 8 -> L
+       ble             .LSTRMM_L1x8_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LSTRMM_L1x8_SUB4
+
+.LSTRMM_L1x8_LOOP_START:
+
+       LOAD1x8_1
+       KERNEL1x8_I1
+       KERNEL1x8_2
+       KERNEL1x8_1
+       KERNEL1x8_2
+
+       KERNEL1x8_1
+       KERNEL1x8_2
+       KERNEL1x8_1
+       KERNEL1x8_2
+
+       addic.          L,      L,      -2
+       ble             .LSTRMM_L1x8_LOOP_END
+
+       .align 5
+
+.LSTRMM_L1x8_LOOP:
+
+       KERNEL1x8_1
+       KERNEL1x8_2
+       KERNEL1x8_1
+       KERNEL1x8_2
+
+       KERNEL1x8_1
+       KERNEL1x8_2
+       KERNEL1x8_1
+       KERNEL1x8_2
+
+       addic.          L,      L,      -1
+       bgt             .LSTRMM_L1x8_LOOP
+
+.LSTRMM_L1x8_LOOP_END:
+
+       KERNEL1x8_1
+       KERNEL1x8_2
+       KERNEL1x8_1
+       KERNEL1x8_2
+
+       KERNEL1x8_1
+       KERNEL1x8_2
+       KERNEL1x8_1
+       KERNEL1x8_E2
+
+       b               .LSTRMM_L1x8_SUB1
+
+.LSTRMM_L1x8_SUB4:
+
+       KERNEL1x8_SUBI1
+       KERNEL1x8_SUB1
+       KERNEL1x8_SUB1
+       KERNEL1x8_SUB1
+
+       KERNEL1x8_SUB1
+       KERNEL1x8_SUB1
+       KERNEL1x8_SUB1
+       KERNEL1x8_SUB1
+
+       b               .LSTRMM_L1x8_SUB1
+
+.LSTRMM_L1x8_SUB0:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+
+       KERNEL1x8_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LSTRMM_L1x8_SAVE
+       b               .LSTRMM_L1x8_SUB2
+
+.LSTRMM_L1x8_SUB1:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+       ble             .LSTRMM_L1x8_SAVE
+
+.LSTRMM_L1x8_SUB2:
+
+       KERNEL1x8_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LSTRMM_L1x8_SUB2
+
+.LSTRMM_L1x8_SAVE:
+
+       SAVE1x8
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       sub             T1,     K,      KKK                                     // K - KKK -> TEMP1 (K values not used by this tile)
+       slwi            T2,     T1,     2                       // TEMP1 * (number of values in B) * size -> TEMP2
+       slwi            T1,     T1,     5                       // TEMP1 * (number of values in A) * size -> TEMP1
+       add             BO,     BO,     T2                                      // advance BO by TEMP2 bytes
+       add             AO,     AO,     T1                                      // advance AO by TEMP1 bytes
+#endif
+
+#if defined(LEFT)
+       addi            KK,     KK,     8                               // KK += Number of values in A
+#endif
+
+
+.LSTRMM_L1x8_END:
+
+.LSTRMM_L1x4_BEGIN:
+
+       andi.           T1,     M,      4
+       ble             .LSTRMM_L1x4_END
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       mr              BO,     B                                       // B -> BO
+#else
+       mr              BO,     B                                       // B -> BO
+       slwi            T1,     KK,     2                               // KK * (number of values in B) * size
+       slwi            T2,     KK,     4                               // KK * (number of values in A) * size
+       add             BO,     BO,     T1                              // advance BO past the consumed part of B
+       add             AO,     AO,     T2                              // advance AO past the consumed part of A
+#endif
+
+#if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       sub             T1,     K,      KK                              // K - KK -> TEMP1
+#else
+       mr              T1,     KK                                      // KK -> KTEMP
+#ifdef LEFT
+       addi            T1,     T1,     4                               // KTEMP + Number of values in A -> KTEMP
+#else
+       addi            T1,     T1,     1                               // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+       mr              KKK,    T1
+       mr              K1,     T1
+       srawi.          L,      K1,     3                               // KTEMP / 8 -> L
+       ble             .LSTRMM_L1x4_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LSTRMM_L1x4_SUB4
+
+.LSTRMM_L1x4_LOOP_START:
+
+       LOAD1x4_1
+       KERNEL1x4_I1
+       KERNEL1x4_2
+       KERNEL1x4_1
+       KERNEL1x4_2
+
+       KERNEL1x4_1
+       KERNEL1x4_2
+       KERNEL1x4_1
+       KERNEL1x4_2
+
+       addic.          L,      L,      -2
+       ble             .LSTRMM_L1x4_LOOP_END
+
+       .align 5
+
+.LSTRMM_L1x4_LOOP:
+
+       KERNEL1x4_1
+       KERNEL1x4_2
+       KERNEL1x4_1
+       KERNEL1x4_2
+
+       KERNEL1x4_1
+       KERNEL1x4_2
+       KERNEL1x4_1
+       KERNEL1x4_2
+
+       addic.          L,      L,      -1
+       bgt             .LSTRMM_L1x4_LOOP
+
+.LSTRMM_L1x4_LOOP_END:
+
+       KERNEL1x4_1
+       KERNEL1x4_2
+       KERNEL1x4_1
+       KERNEL1x4_2
+
+       KERNEL1x4_1
+       KERNEL1x4_2
+       KERNEL1x4_1
+       KERNEL1x4_E2
+
+       b               .LSTRMM_L1x4_SUB1
+
+.LSTRMM_L1x4_SUB4:
+
+       KERNEL1x4_SUBI1
+       KERNEL1x4_SUB1
+       KERNEL1x4_SUB1
+       KERNEL1x4_SUB1
+
+       KERNEL1x4_SUB1
+       KERNEL1x4_SUB1
+       KERNEL1x4_SUB1
+       KERNEL1x4_SUB1
+
+       b               .LSTRMM_L1x4_SUB1
+
+.LSTRMM_L1x4_SUB0:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+
+       KERNEL1x4_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LSTRMM_L1x4_SAVE
+       b               .LSTRMM_L1x4_SUB2
+
+.LSTRMM_L1x4_SUB1:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+       ble             .LSTRMM_L1x4_SAVE
+
+.LSTRMM_L1x4_SUB2:
+
+       KERNEL1x4_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LSTRMM_L1x4_SUB2
+
+.LSTRMM_L1x4_SAVE:
+
+       SAVE1x4
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       sub             T1,     K,      KKK                                     // K - KKK -> TEMP1 (K values not used by this tile)
+       slwi            T2,     T1,     2                       // TEMP1 * (number of values in B) * size -> TEMP2
+       slwi            T1,     T1,     4                       // TEMP1 * (number of values in A) * size -> TEMP1
+       add             BO,     BO,     T2                                      // advance BO by TEMP2 bytes
+       add             AO,     AO,     T1                                      // advance AO by TEMP1 bytes
+#endif
+
+#if defined(LEFT)
+       addi            KK,     KK,     4                               // KK += Number of values in A
+#endif
+
+
+.LSTRMM_L1x4_END:
+
+.LSTRMM_L1x2_BEGIN:
+
+       andi.           T1,     M,      2
+       ble             .LSTRMM_L1x2_END
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       mr              BO,     B                                       // B -> BO
+#else
+       mr              BO,     B                                       // B -> BO
+       slwi            T1,     KK,     2                               // KK * (number of values in B) * size
+       slwi            T2,     KK,     3                               // KK * (number of values in A) * size
+       add             BO,     BO,     T1                              // advance BO past the consumed part of B
+       add             AO,     AO,     T2                              // advance AO past the consumed part of A
+#endif
+
+#if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       sub             T1,     K,      KK                              // K - KK -> TEMP1
+#else
+       mr              T1,     KK                                      // KK -> KTEMP
+#ifdef LEFT
+       addi            T1,     T1,     2                               // KTEMP + Number of values in A -> KTEMP
+#else
+       addi            T1,     T1,     1                               // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+       mr              KKK,    T1
+       mr              K1,     T1
+       srawi.          L,      K1,     3                               // KTEMP / 8 -> L
+       ble             .LSTRMM_L1x2_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LSTRMM_L1x2_SUB4
+
+.LSTRMM_L1x2_LOOP_START:
+
+       LOAD1x2_1
+       KERNEL1x2_I1
+       KERNEL1x2_2
+       KERNEL1x2_1
+       KERNEL1x2_2
+
+       KERNEL1x2_1
+       KERNEL1x2_2
+       KERNEL1x2_1
+       KERNEL1x2_2
+
+       addic.          L,      L,      -2
+       ble             .LSTRMM_L1x2_LOOP_END
+
+       .align 5
+
+.LSTRMM_L1x2_LOOP:
+
+       KERNEL1x2_1
+       KERNEL1x2_2
+       KERNEL1x2_1
+       KERNEL1x2_2
+
+       KERNEL1x2_1
+       KERNEL1x2_2
+       KERNEL1x2_1
+       KERNEL1x2_2
+
+       addic.          L,      L,      -1
+       bgt             .LSTRMM_L1x2_LOOP
+
+.LSTRMM_L1x2_LOOP_END:
+
+       KERNEL1x2_1
+       KERNEL1x2_2
+       KERNEL1x2_1
+       KERNEL1x2_2
+
+       KERNEL1x2_1
+       KERNEL1x2_2
+       KERNEL1x2_1
+       KERNEL1x2_E2
+
+       b               .LSTRMM_L1x2_SUB1
+
+.LSTRMM_L1x2_SUB4:
+
+       KERNEL1x2_SUBI1
+       KERNEL1x2_SUB1
+       KERNEL1x2_SUB1
+       KERNEL1x2_SUB1
+
+       KERNEL1x2_SUB1
+       KERNEL1x2_SUB1
+       KERNEL1x2_SUB1
+       KERNEL1x2_SUB1
+
+       b               .LSTRMM_L1x2_SUB1
+
+.LSTRMM_L1x2_SUB0:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+
+       KERNEL1x2_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LSTRMM_L1x2_SAVE
+       b               .LSTRMM_L1x2_SUB2
+
+.LSTRMM_L1x2_SUB1:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+       ble             .LSTRMM_L1x2_SAVE
+
+.LSTRMM_L1x2_SUB2:
+
+       KERNEL1x2_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LSTRMM_L1x2_SUB2
+
+.LSTRMM_L1x2_SAVE:
+
+       SAVE1x2
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       sub             T1,     K,      KKK                                     // K - KKK -> TEMP1 (K values not used by this tile)
+       slwi            T2,     T1,     2                       // TEMP1 * (number of values in B) * size -> TEMP2
+       slwi            T1,     T1,     3                       // TEMP1 * (number of values in A) * size -> TEMP1
+       add             BO,     BO,     T2                                      // advance BO by TEMP2 bytes
+       add             AO,     AO,     T1                                      // advance AO by TEMP1 bytes
+#endif
+
+#if defined(LEFT)
+       addi            KK,     KK,     2                               // KK += Number of values in A
+#endif
+
+
+.LSTRMM_L1x2_END:
+
+.LSTRMM_L1x1_BEGIN:
+
+       andi.           T1,     M,      1
+       ble             .LSTRMM_L1x1_END
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       mr              BO,     B                                       // B -> BO
+#else
+       mr              BO,     B                                       // B -> BO
+       slwi            T1,     KK,     2                               // KK * (number of values in B) * size
+       slwi            T2,     KK,     2                               // KK * (number of values in A) * size
+       add             BO,     BO,     T1                              // advance BO past the consumed part of B
+       add             AO,     AO,     T2                              // advance AO past the consumed part of A
+#endif
+
+#if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       sub             T1,     K,      KK                              // K - KK -> TEMP1
+#else
+       mr              T1,     KK                                      // KK -> KTEMP
+#ifdef LEFT
+       addi            T1,     T1,     1                               // KTEMP + Number of values in A -> KTEMP
+#else
+       addi            T1,     T1,     1                               // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+       mr              KKK,    T1
+       mr              K1,     T1
+       srawi.          L,      K1,     3                               // KTEMP / 8 -> L
+       ble             .LSTRMM_L1x1_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LSTRMM_L1x1_SUB4
+
+.LSTRMM_L1x1_LOOP_START:
+
+       LOAD1x1_1
+       KERNEL1x1_I1
+       KERNEL1x1_2
+       KERNEL1x1_1
+       KERNEL1x1_2
+
+       KERNEL1x1_1
+       KERNEL1x1_2
+       KERNEL1x1_1
+       KERNEL1x1_2
+
+       addic.          L,      L,      -2
+       ble             .LSTRMM_L1x1_LOOP_END
+
+       .align 5
+
+.LSTRMM_L1x1_LOOP:
+
+       KERNEL1x1_1
+       KERNEL1x1_2
+       KERNEL1x1_1
+       KERNEL1x1_2
+
+       KERNEL1x1_1
+       KERNEL1x1_2
+       KERNEL1x1_1
+       KERNEL1x1_2
+
+       addic.          L,      L,      -1
+       bgt             .LSTRMM_L1x1_LOOP
+
+.LSTRMM_L1x1_LOOP_END:
+
+       KERNEL1x1_1
+       KERNEL1x1_2
+       KERNEL1x1_1
+       KERNEL1x1_2
+
+       KERNEL1x1_1
+       KERNEL1x1_2
+       KERNEL1x1_1
+       KERNEL1x1_E2
+
+       b               .LSTRMM_L1x1_SUB1
+
+.LSTRMM_L1x1_SUB4:
+
+       KERNEL1x1_SUBI1
+       KERNEL1x1_SUB1
+       KERNEL1x1_SUB1
+       KERNEL1x1_SUB1
+
+       KERNEL1x1_SUB1
+       KERNEL1x1_SUB1
+       KERNEL1x1_SUB1
+       KERNEL1x1_SUB1
+
+       b               .LSTRMM_L1x1_SUB1
+
+.LSTRMM_L1x1_SUB0:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+
+       KERNEL1x1_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LSTRMM_L1x1_SAVE
+       b               .LSTRMM_L1x1_SUB2
+
+.LSTRMM_L1x1_SUB1:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+       ble             .LSTRMM_L1x1_SAVE
+
+.LSTRMM_L1x1_SUB2:
+
+       KERNEL1x1_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LSTRMM_L1x1_SUB2
+
+.LSTRMM_L1x1_SAVE:
+
+       SAVE1x1
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       sub             T1,     K,      KKK                                     // K - KKK -> TEMP1 (K values not used by this tile)
+       slwi            T2,     T1,     2                       // TEMP1 * (number of values in B) * size -> TEMP2
+       slwi            T1,     T1,     2                       // TEMP1 * (number of values in A) * size -> TEMP1
+       add             BO,     BO,     T2                                      // advance BO by TEMP2 bytes
+       add             AO,     AO,     T1                                      // advance AO by TEMP1 bytes
+#endif
+
+#if defined(LEFT)
+       addi            KK,     KK,     1                               // KK += Number of values in A
+#endif
+
+
+.LSTRMM_L1x1_END:
+
+#if !defined(LEFT)
+       addi            KK,     KK,     1                                       // KK += Number of values in B
+#endif
+
+
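+// End of the single-column (N & 1) pass.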
+.LSTRMM_L1_END:
diff --git a/param.h b/param.h
index 31125d8..f5d1ab2 100644 (file)
--- a/param.h
+++ b/param.h
@@ -1961,15 +1961,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #if defined(POWER8)
 
-#define SNUMOPT                4
+#define SNUMOPT                16
 #define DNUMOPT                8
 
 #define GEMM_DEFAULT_OFFSET_A  384
 #define GEMM_DEFAULT_OFFSET_B 1024
 #define GEMM_DEFAULT_ALIGN 0x03fffUL
 
-#define SGEMM_DEFAULT_UNROLL_M 4
-#define SGEMM_DEFAULT_UNROLL_N 4
+#define SGEMM_DEFAULT_UNROLL_M 16
+#define SGEMM_DEFAULT_UNROLL_N 8
 #define DGEMM_DEFAULT_UNROLL_M 16
 #define DGEMM_DEFAULT_UNROLL_N 4
 #define CGEMM_DEFAULT_UNROLL_M 2
@@ -1977,12 +1977,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define ZGEMM_DEFAULT_UNROLL_M 8
 #define ZGEMM_DEFAULT_UNROLL_N 2
 
-#define SGEMM_DEFAULT_P  992
+#define SGEMM_DEFAULT_P  960
 #define DGEMM_DEFAULT_P  480
 #define CGEMM_DEFAULT_P  488
 #define ZGEMM_DEFAULT_P  240
 
-#define SGEMM_DEFAULT_Q  504
+#define SGEMM_DEFAULT_Q  720
 #define DGEMM_DEFAULT_Q  720
 #define CGEMM_DEFAULT_Q  400
 #define ZGEMM_DEFAULT_Q  360