add optimized cgemm and ctrmm kernels for POWER8
author     Werner Saar <wernsaar@googlemail.com>
           Fri, 18 Mar 2016 07:17:25 +0000 (08:17 +0100)
committer  Werner Saar <wernsaar@googlemail.com>
           Fri, 18 Mar 2016 07:17:25 +0000 (08:17 +0100)
kernel/power/KERNEL.POWER8
kernel/power/cgemm_kernel_8x4_power8.S [new file with mode: 0644]
kernel/power/cgemm_logic_8x4_power8.S [new file with mode: 0644]
kernel/power/cgemm_macros_8x4_power8.S [new file with mode: 0644]
kernel/power/ctrmm_kernel_8x4_power8.S [new file with mode: 0644]
kernel/power/ctrmm_logic_8x4_power8.S [new file with mode: 0644]
param.h

diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8
index f8be1d4..eaa9f26 100644
@@ -5,7 +5,7 @@
 
 STRMMKERNEL    = strmm_kernel_16x8_power8.S
 DTRMMKERNEL    = dtrmm_kernel_16x4_power8.S
-CTRMMKERNEL    = ../generic/ztrmmkernel_2x2.c
+CTRMMKERNEL    = ctrmm_kernel_8x4_power8.S
 ZTRMMKERNEL    = ztrmm_kernel_8x2_power8.S
 
 SGEMMKERNEL    =  sgemm_kernel_16x8_power8.S
@@ -28,11 +28,15 @@ DGEMMITCOPYOBJ = dgemm_itcopy.o
 DGEMMONCOPYOBJ = dgemm_oncopy.o
 DGEMMOTCOPYOBJ = dgemm_otcopy.o
 
-CGEMMKERNEL    = ../generic/zgemmkernel_2x2.c
-CGEMMONCOPY    = ../generic/zgemm_ncopy_2.c
-CGEMMOTCOPY    = ../generic/zgemm_tcopy_2.c
+CGEMMKERNEL    = cgemm_kernel_8x4_power8.S
+CGEMMINCOPY    = ../generic/zgemm_ncopy_8.c
+CGEMMITCOPY    = ../generic/zgemm_tcopy_8.c
+CGEMMONCOPY    = ../generic/zgemm_ncopy_4.c
+CGEMMOTCOPY    = ../generic/zgemm_tcopy_4.c
 CGEMMONCOPYOBJ =  cgemm_oncopy.o
 CGEMMOTCOPYOBJ =  cgemm_otcopy.o
+CGEMMINCOPYOBJ =  cgemm_incopy.o
+CGEMMITCOPYOBJ =  cgemm_itcopy.o
 
 ZGEMMKERNEL    = zgemm_kernel_8x2_power8.S
 ZGEMMONCOPY    = ../generic/zgemm_ncopy_2.c
diff --git a/kernel/power/cgemm_kernel_8x4_power8.S b/kernel/power/cgemm_kernel_8x4_power8.S
new file mode 100644
index 0000000..f732c81
--- /dev/null
@@ -0,0 +1,375 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/18 Werner Saar (wernsaar@googlemail.com)
+*       BLASTEST               : OK
+*       CTEST                  : OK
+*       TEST                   : OK
+*       LAPACK-TEST            : OK
+**************************************************************************************/
+
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* All rights reserved.                                              */
+/*                                                                   */
+/* Redistribution and use in source and binary forms, with or        */
+/* without modification, are permitted provided that the following   */
+/* conditions are met:                                               */
+/*                                                                   */
+/*   1. Redistributions of source code must retain the above         */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer.                                                  */
+/*                                                                   */
+/*   2. Redistributions in binary form must reproduce the above      */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer in the documentation and/or other materials       */
+/*      provided with the distribution.                              */
+/*                                                                   */
+/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
+/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
+/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
+/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
+/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
+/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
+/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
+/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
+/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
+/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
+/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
+/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
+/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
+/*    POSSIBILITY OF SUCH DAMAGE.                                    */
+/*                                                                   */
+/* The views and conclusions contained in the software and           */
+/* documentation are those of the authors and should not be          */
+/* interpreted as representing official policies, either expressed   */
+/* or implied, of The University of Texas at Austin.                 */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "def_vsx.h"
+
+#ifndef __64BIT__
+#define LOAD   lwz
+#else
+#define LOAD   ld
+#endif
+
+#ifdef __64BIT__
+#define STACKSIZE 400
+#define ALPHA_R_SP 296(SP)
+#define ALPHA_I_SP 304(SP)
+#define FZERO  312(SP)
+#else
+#define STACKSIZE 256
+#define ALPHA_R_SP 224(SP)
+#define ALPHA_I_SP 232(SP)
+#define FZERO  240(SP)
+#endif
+
+#define        M       r3
+#define        N       r4
+#define        K       r5
+
+#ifdef linux
+#ifndef __64BIT__
+#define A      r6
+#define        B       r7
+#define        C       r8
+#define        LDC     r9
+#define OFFSET r10
+#else
+#define A      r8
+#define        B       r9
+#define        C       r10
+#define        LDC     r6
+#define OFFSET r7
+#endif
+#endif
+
+#if defined(_AIX) || defined(__APPLE__)
+#if !defined(__64BIT__) && defined(DOUBLE)
+#define A      r10
+#define        B       r6
+#define        C       r7
+#define        LDC     r8
+#define OFFSET r9
+#else
+#define A      r8
+#define        B       r9
+#define        C       r10
+#define        LDC     r6
+#define OFFSET r7
+#endif
+#endif
+
+#define o0     0
+#define alpha_r vs30
+#define alpha_i vs31
+
+#define TBUFFER        r14
+#define L      r15
+#define o12    r16
+#define o4     r17
+#define T2     r19
+#define KK     r20
+#define        o8      r21
+#define        I       r22
+#define J      r23
+#define AO     r24
+#define        BO      r25
+#define        CO      r26
+#define o16    r27
+#define        o32     r28
+#define o48    r29
+
+#define PRE    r30
+#define T1     r31
+
+#ifndef NEEDPARAM
+
+       PROLOGUE
+       PROFCODE
+
+       addi    SP, SP, -STACKSIZE
+       li      r0, 0
+
+       stfd    f14,    0(SP)
+       stfd    f15,    8(SP)
+       stfd    f16,   16(SP)
+       stfd    f17,   24(SP)
+
+       stfd    f18,   32(SP)
+       stfd    f19,   40(SP)
+       stfd    f20,   48(SP)
+       stfd    f21,   56(SP)
+
+       stfd    f22,   64(SP)
+       stfd    f23,   72(SP)
+       stfd    f24,   80(SP)
+       stfd    f25,   88(SP)
+
+       stfd    f26,   96(SP)
+       stfd    f27,  104(SP)
+       stfd    f28,  112(SP)
+       stfd    f29,  120(SP)
+
+       stfd    f30,  128(SP)
+       stfd    f31,  136(SP)
+
+#ifdef __64BIT__
+       std     r31,  144(SP)
+       std     r30,  152(SP)
+       std     r29,  160(SP)
+       std     r28,  168(SP)
+       std     r27,  176(SP)
+       std     r26,  184(SP)
+       std     r25,  192(SP)
+       std     r24,  200(SP)
+       std     r23,  208(SP)
+       std     r22,  216(SP)
+       std     r21,  224(SP)
+       std     r20,  232(SP)
+       std     r19,  240(SP)
+       std     r18,  248(SP)
+       std     r17,  256(SP)
+       std     r16,  264(SP)
+       std     r15,  272(SP)
+       std     r14,  280(SP)
+#else
+       stw     r31,  144(SP)
+       stw     r30,  148(SP)
+       stw     r29,  152(SP)
+       stw     r28,  156(SP)
+       stw     r27,  160(SP)
+       stw     r26,  164(SP)
+       stw     r25,  168(SP)
+       stw     r24,  172(SP)
+       stw     r23,  176(SP)
+       stw     r22,  180(SP)
+       stw     r21,  184(SP)
+       stw     r20,  188(SP)
+       stw     r19,  192(SP)
+       stw     r18,  196(SP)
+       stw     r17,  200(SP)
+       stw     r16,  204(SP)
+       stw     r15,  208(SP)
+       stw     r14,  212(SP)
+#endif
+
+       stfs    f1,  ALPHA_R_SP
+       stfs    f2,  ALPHA_I_SP
+       // stw  r0,  FZERO
+
+#ifdef linux
+#ifdef __64BIT__
+       ld      LDC, FRAMESLOT(0) + STACKSIZE(SP)
+#endif
+#endif
+
+#if defined(_AIX) || defined(__APPLE__)
+#ifdef __64BIT__
+       ld      LDC, FRAMESLOT(0) + STACKSIZE(SP)
+#else
+#ifdef DOUBLE
+       lwz     B,   FRAMESLOT(0) + STACKSIZE(SP)
+       lwz     C,   FRAMESLOT(1) + STACKSIZE(SP)
+       lwz     LDC, FRAMESLOT(2) + STACKSIZE(SP)
+#else
+       lwz     LDC, FRAMESLOT(0) + STACKSIZE(SP)
+#endif
+#endif
+#endif
+
+#ifdef TRMMKERNEL
+#if defined(linux) && defined(__64BIT__)
+       ld      OFFSET,  FRAMESLOT(1) + STACKSIZE(SP)
+#endif
+
+#if defined(_AIX) || defined(__APPLE__)
+#ifdef __64BIT__
+       ld      OFFSET,  FRAMESLOT(1) + STACKSIZE(SP)
+#else
+#ifdef DOUBLE
+       lwz     OFFSET,  FRAMESLOT(3) + STACKSIZE(SP)
+#else
+       lwz     OFFSET,  FRAMESLOT(1) + STACKSIZE(SP)
+#endif
+#endif
+#endif
+#if defined(TRMMKERNEL) && !defined(LEFT)
+       neg     KK, OFFSET
+#endif
+#endif
+
+#include "cgemm_macros_8x4_power8.S"
+
+       cmpwi   cr0, M, 0
+       ble     .L999_H1
+       cmpwi   cr0, N, 0
+       ble     .L999_H1
+       cmpwi   cr0, K, 0
+       ble     .L999_H1
+
+       slwi    LDC, LDC, ZBASE_SHIFT
+       li      PRE, 256 
+       li      o4  , 4
+       li      o8  , 8
+       li      o12 , 12
+       li      o16 , 16
+       li      o32 , 32
+       li      o48 , 48
+       addi    TBUFFER, SP, 360
+       
+
+#ifdef __64BIT__
+       addi    T1 , SP, 296
+#else
+       addi    T1 , SP, 224
+#endif
+
+       lxsspx  alpha_r, 0, T1
+       lxsspx  alpha_i, o8, T1
+
+       .align 5
+
+#include "cgemm_logic_8x4_power8.S"
+
+.L999:
+       addi    r3, 0, 0
+
+       lfd     f14,    0(SP)
+       lfd     f15,    8(SP)
+       lfd     f16,   16(SP)
+       lfd     f17,   24(SP)
+
+       lfd     f18,   32(SP)
+       lfd     f19,   40(SP)
+       lfd     f20,   48(SP)
+       lfd     f21,   56(SP)
+
+       lfd     f22,   64(SP)
+       lfd     f23,   72(SP)
+       lfd     f24,   80(SP)
+       lfd     f25,   88(SP)
+
+       lfd     f26,   96(SP)
+       lfd     f27,  104(SP)
+       lfd     f28,  112(SP)
+       lfd     f29,  120(SP)
+
+       lfd     f30,  128(SP)
+       lfd     f31,  136(SP)
+
+#ifdef __64BIT__
+       ld      r31,  144(SP)
+       ld      r30,  152(SP)
+       ld      r29,  160(SP)
+       ld      r28,  168(SP)
+       ld      r27,  176(SP)
+       ld      r26,  184(SP)
+       ld      r25,  192(SP)
+       ld      r24,  200(SP)
+       ld      r23,  208(SP)
+       ld      r22,  216(SP)
+       ld      r21,  224(SP)
+       ld      r20,  232(SP)
+       ld      r19,  240(SP)
+       ld      r18,  248(SP)
+       ld      r17,  256(SP)
+       ld      r16,  264(SP)
+       ld      r15,  272(SP)
+       ld      r14,  280(SP)
+#else
+       lwz     r31,  144(SP)
+       lwz     r30,  148(SP)
+       lwz     r29,  152(SP)
+       lwz     r28,  156(SP)
+       lwz     r27,  160(SP)
+       lwz     r26,  164(SP)
+       lwz     r25,  168(SP)
+       lwz     r24,  172(SP)
+       lwz     r23,  176(SP)
+       lwz     r22,  180(SP)
+       lwz     r21,  184(SP)
+       lwz     r20,  188(SP)
+       lwz     r19,  192(SP)
+       lwz     r18,  196(SP)
+       lwz     r17,  200(SP)
+       lwz     r16,  204(SP)
+       lwz     r15,  208(SP)
+       lwz     r14,  212(SP)
+#endif
+
+       addi    SP, SP, STACKSIZE
+
+       blr
+
+       EPILOGUE
+#endif
diff --git a/kernel/power/cgemm_logic_8x4_power8.S b/kernel/power/cgemm_logic_8x4_power8.S
new file mode 100644
index 0000000..51a0631
--- /dev/null
@@ -0,0 +1,1342 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/18 Werner Saar (wernsaar@googlemail.com)
+*       BLASTEST               : OK
+*       CTEST                  : OK
+*       TEST                   : OK
+*       LAPACK-TEST            : OK
+**************************************************************************************/
+
+
+       srawi.          J,      N,      2
+       ble             .LCGEMM_L4_END
+
+.LCGEMM_L4_BEGIN:
+
+       mr              CO,     C
+       mr              AO,     A
+       slwi            T1,     LDC     ,       2
+       add             C,      C,      T1
+       srawi.          I,      M,      3
+       ble             .LCGEMM_L4x8_END
+
+.LCGEMM_L4x8_BEGIN:
+
+
+       mr              BO,     B
+       srawi.          L,      K,      3
+       ble             .LCGEMM_L4x8_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LCGEMM_L4x8_SUB4
+
+.LCGEMM_L4x8_LOOP_START:
+
+       dcbt            AO,     PRE
+       LOAD4x8_1
+       KERNEL4x8_I1
+       dcbt            AO,     PRE
+       KERNEL4x8_2
+       KERNEL4x8_1
+       dcbt            AO,     PRE
+       KERNEL4x8_2
+
+       KERNEL4x8_1
+       dcbt            AO,     PRE
+       KERNEL4x8_2
+       KERNEL4x8_1
+       dcbt            AO,     PRE
+       KERNEL4x8_2
+
+       addic.          L,      L,      -2
+       ble             .LCGEMM_L4x8_LOOP_END
+
+       .align 5
+
+.LCGEMM_L4x8_LOOP:
+
+       KERNEL4x8_1
+       dcbt            AO,     PRE
+       KERNEL4x8_2
+       KERNEL4x8_1
+       dcbt            AO,     PRE
+       KERNEL4x8_2
+
+       KERNEL4x8_1
+       dcbt            AO,     PRE
+       KERNEL4x8_2
+       KERNEL4x8_1
+       dcbt            AO,     PRE
+       KERNEL4x8_2
+
+       addic.          L,      L,      -1
+       bgt             .LCGEMM_L4x8_LOOP
+
+.LCGEMM_L4x8_LOOP_END:
+
+       KERNEL4x8_1
+       dcbt            AO,     PRE
+       KERNEL4x8_2
+       KERNEL4x8_1
+       dcbt            AO,     PRE
+       KERNEL4x8_2
+
+       KERNEL4x8_1
+       dcbt            AO,     PRE
+       KERNEL4x8_2
+       KERNEL4x8_1
+       KERNEL4x8_E2
+
+       b               .LCGEMM_L4x8_SUB1
+
+.LCGEMM_L4x8_SUB4:
+
+       KERNEL4x8_SUBI1
+       KERNEL4x8_SUB1
+       KERNEL4x8_SUB1
+       KERNEL4x8_SUB1
+
+       KERNEL4x8_SUB1
+       KERNEL4x8_SUB1
+       KERNEL4x8_SUB1
+       KERNEL4x8_SUB1
+
+       b               .LCGEMM_L4x8_SUB1
+
+.LCGEMM_L4x8_SUB0:
+
+       andi.           L,      K,      7
+
+       KERNEL4x8_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LCGEMM_L4x8_SAVE
+       b               .LCGEMM_L4x8_SUB2
+
+.LCGEMM_L4x8_SUB1:
+
+       andi.           L,      K,      7
+       ble             .LCGEMM_L4x8_SAVE
+
+.LCGEMM_L4x8_SUB2:
+
+       KERNEL4x8_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LCGEMM_L4x8_SUB2
+
+.LCGEMM_L4x8_SAVE:
+
+       SAVE4x8
+
+       addic.          I,      I,      -1
+       bgt             .LCGEMM_L4x8_BEGIN
+
+.LCGEMM_L4x8_END:
+
+.LCGEMM_L4x4_BEGIN:
+
+       andi.           T2,     M,      7
+       ble             .LCGEMM_L4x1_END
+
+       andi.           T1,     M,      4
+       ble             .LCGEMM_L4x4_END
+       mr              BO,     B
+       srawi.          L,      K,      3
+       ble             .LCGEMM_L4x4_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LCGEMM_L4x4_SUB4
+
+.LCGEMM_L4x4_LOOP_START:
+
+       LOAD4x4_1
+       KERNEL4x4_I1
+       KERNEL4x4_2
+       KERNEL4x4_1
+       KERNEL4x4_2
+
+       KERNEL4x4_1
+       KERNEL4x4_2
+       KERNEL4x4_1
+       KERNEL4x4_2
+
+       addic.          L,      L,      -2
+       ble             .LCGEMM_L4x4_LOOP_END
+
+       .align 5
+
+.LCGEMM_L4x4_LOOP:
+
+       KERNEL4x4_1
+       KERNEL4x4_2
+       KERNEL4x4_1
+       KERNEL4x4_2
+
+       KERNEL4x4_1
+       KERNEL4x4_2
+       KERNEL4x4_1
+       KERNEL4x4_2
+
+       addic.          L,      L,      -1
+       bgt             .LCGEMM_L4x4_LOOP
+
+.LCGEMM_L4x4_LOOP_END:
+
+       KERNEL4x4_1
+       KERNEL4x4_2
+       KERNEL4x4_1
+       KERNEL4x4_2
+
+       KERNEL4x4_1
+       KERNEL4x4_2
+       KERNEL4x4_1
+       KERNEL4x4_E2
+
+       b               .LCGEMM_L4x4_SUB1
+
+.LCGEMM_L4x4_SUB4:
+
+       KERNEL4x4_SUBI1
+       KERNEL4x4_SUB1
+       KERNEL4x4_SUB1
+       KERNEL4x4_SUB1
+
+       KERNEL4x4_SUB1
+       KERNEL4x4_SUB1
+       KERNEL4x4_SUB1
+       KERNEL4x4_SUB1
+
+       b               .LCGEMM_L4x4_SUB1
+
+.LCGEMM_L4x4_SUB0:
+
+       andi.           L,      K,      7
+
+       KERNEL4x4_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LCGEMM_L4x4_SAVE
+       b               .LCGEMM_L4x4_SUB2
+
+.LCGEMM_L4x4_SUB1:
+
+       andi.           L,      K,      7
+       ble             .LCGEMM_L4x4_SAVE
+
+.LCGEMM_L4x4_SUB2:
+
+       KERNEL4x4_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LCGEMM_L4x4_SUB2
+
+.LCGEMM_L4x4_SAVE:
+
+       SAVE4x4
+
+.LCGEMM_L4x4_END:
+
+.LCGEMM_L4x2_BEGIN:
+
+
+       andi.           T1,     M,      2
+       ble             .LCGEMM_L4x2_END
+       mr              BO,     B
+       srawi.          L,      K,      3
+       ble             .LCGEMM_L4x2_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LCGEMM_L4x2_SUB4
+
+.LCGEMM_L4x2_LOOP_START:
+
+       LOAD4x2_1
+       KERNEL4x2_I1
+       KERNEL4x2_2
+       KERNEL4x2_1
+       KERNEL4x2_2
+
+       KERNEL4x2_1
+       KERNEL4x2_2
+       KERNEL4x2_1
+       KERNEL4x2_2
+
+       addic.          L,      L,      -2
+       ble             .LCGEMM_L4x2_LOOP_END
+
+       .align 5
+
+.LCGEMM_L4x2_LOOP:
+
+       KERNEL4x2_1
+       KERNEL4x2_2
+       KERNEL4x2_1
+       KERNEL4x2_2
+
+       KERNEL4x2_1
+       KERNEL4x2_2
+       KERNEL4x2_1
+       KERNEL4x2_2
+
+       addic.          L,      L,      -1
+       bgt             .LCGEMM_L4x2_LOOP
+
+.LCGEMM_L4x2_LOOP_END:
+
+       KERNEL4x2_1
+       KERNEL4x2_2
+       KERNEL4x2_1
+       KERNEL4x2_2
+
+       KERNEL4x2_1
+       KERNEL4x2_2
+       KERNEL4x2_1
+       KERNEL4x2_E2
+
+       b               .LCGEMM_L4x2_SUB1
+
+.LCGEMM_L4x2_SUB4:
+
+       KERNEL4x2_SUBI1
+       KERNEL4x2_SUB1
+       KERNEL4x2_SUB1
+       KERNEL4x2_SUB1
+
+       KERNEL4x2_SUB1
+       KERNEL4x2_SUB1
+       KERNEL4x2_SUB1
+       KERNEL4x2_SUB1
+
+       b               .LCGEMM_L4x2_SUB1
+
+.LCGEMM_L4x2_SUB0:
+
+       andi.           L,      K,      7
+
+       KERNEL4x2_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LCGEMM_L4x2_SAVE
+       b               .LCGEMM_L4x2_SUB2
+
+.LCGEMM_L4x2_SUB1:
+
+       andi.           L,      K,      7
+       ble             .LCGEMM_L4x2_SAVE
+
+.LCGEMM_L4x2_SUB2:
+
+       KERNEL4x2_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LCGEMM_L4x2_SUB2
+
+.LCGEMM_L4x2_SAVE:
+
+       SAVE4x2
+
+.LCGEMM_L4x2_END:
+
+.LCGEMM_L4x1_BEGIN:
+
+
+       andi.           T1,     M,      1
+       ble             .LCGEMM_L4x1_END
+       mr              BO,     B
+       srawi.          L,      K,      3
+       ble             .LCGEMM_L4x1_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LCGEMM_L4x1_SUB4
+
+.LCGEMM_L4x1_LOOP_START:
+
+       LOAD4x1_1
+       KERNEL4x1_I1
+       KERNEL4x1_2
+       KERNEL4x1_1
+       KERNEL4x1_2
+
+       KERNEL4x1_1
+       KERNEL4x1_2
+       KERNEL4x1_1
+       KERNEL4x1_2
+
+       addic.          L,      L,      -2
+       ble             .LCGEMM_L4x1_LOOP_END
+
+       .align 5
+
+.LCGEMM_L4x1_LOOP:
+
+       KERNEL4x1_1
+       KERNEL4x1_2
+       KERNEL4x1_1
+       KERNEL4x1_2
+
+       KERNEL4x1_1
+       KERNEL4x1_2
+       KERNEL4x1_1
+       KERNEL4x1_2
+
+       addic.          L,      L,      -1
+       bgt             .LCGEMM_L4x1_LOOP
+
+.LCGEMM_L4x1_LOOP_END:
+
+       KERNEL4x1_1
+       KERNEL4x1_2
+       KERNEL4x1_1
+       KERNEL4x1_2
+
+       KERNEL4x1_1
+       KERNEL4x1_2
+       KERNEL4x1_1
+       KERNEL4x1_E2
+
+       b               .LCGEMM_L4x1_SUB1
+
+.LCGEMM_L4x1_SUB4:
+
+       KERNEL4x1_SUBI1
+       KERNEL4x1_SUB1
+       KERNEL4x1_SUB1
+       KERNEL4x1_SUB1
+
+       KERNEL4x1_SUB1
+       KERNEL4x1_SUB1
+       KERNEL4x1_SUB1
+       KERNEL4x1_SUB1
+
+       b               .LCGEMM_L4x1_SUB1
+
+.LCGEMM_L4x1_SUB0:
+
+       andi.           L,      K,      7
+
+       KERNEL4x1_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LCGEMM_L4x1_SAVE
+       b               .LCGEMM_L4x1_SUB2
+
+.LCGEMM_L4x1_SUB1:
+
+       andi.           L,      K,      7
+       ble             .LCGEMM_L4x1_SAVE
+
+.LCGEMM_L4x1_SUB2:
+
+       KERNEL4x1_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LCGEMM_L4x1_SUB2
+
+.LCGEMM_L4x1_SAVE:
+
+       SAVE4x1
+
+.LCGEMM_L4x1_END:
+
+       slwi            T1,     K,      5
+       add             B,      B,      T1
+
+       addic.          J,      J,      -1
+       bgt             .LCGEMM_L4_BEGIN
+
+       andi.           T2,     N,      3
+       ble             .L999_H2
+
+.LCGEMM_L4_END:
+
+       b               .LCGEMM_L2_BEGIN
+
+.L999_H1:
+
+       b               .L999_H2
+
+.LCGEMM_L2_BEGIN:
+
+       andi.           T1,     N,      2
+       ble             .LCGEMM_L2_END
+       mr              CO,     C
+       mr              AO,     A
+       slwi            T1,     LDC     ,       1
+       add             C,      C,      T1
+       srawi.          I,      M,      3
+       ble             .LCGEMM_L2x8_END
+
+.LCGEMM_L2x8_BEGIN:
+
+
+       mr              BO,     B
+       srawi.          L,      K,      3
+       ble             .LCGEMM_L2x8_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LCGEMM_L2x8_SUB4
+
+.LCGEMM_L2x8_LOOP_START:
+
+       dcbt            AO,     PRE
+       LOAD2x8_1
+       KERNEL2x8_I1
+       dcbt            AO,     PRE
+       KERNEL2x8_2
+       KERNEL2x8_1
+       dcbt            AO,     PRE
+       KERNEL2x8_2
+
+       KERNEL2x8_1
+       dcbt            AO,     PRE
+       KERNEL2x8_2
+       KERNEL2x8_1
+       dcbt            AO,     PRE
+       KERNEL2x8_2
+
+       addic.          L,      L,      -2
+       ble             .LCGEMM_L2x8_LOOP_END
+
+       .align 5
+
+.LCGEMM_L2x8_LOOP:
+
+       KERNEL2x8_1
+       dcbt            AO,     PRE
+       KERNEL2x8_2
+       KERNEL2x8_1
+       dcbt            AO,     PRE
+       KERNEL2x8_2
+
+       KERNEL2x8_1
+       dcbt            AO,     PRE
+       KERNEL2x8_2
+       KERNEL2x8_1
+       dcbt            AO,     PRE
+       KERNEL2x8_2
+
+       addic.          L,      L,      -1
+       bgt             .LCGEMM_L2x8_LOOP
+
+.LCGEMM_L2x8_LOOP_END:
+
+       KERNEL2x8_1
+       dcbt            AO,     PRE
+       KERNEL2x8_2
+       KERNEL2x8_1
+       dcbt            AO,     PRE
+       KERNEL2x8_2
+
+       KERNEL2x8_1
+       dcbt            AO,     PRE
+       KERNEL2x8_2
+       KERNEL2x8_1
+       KERNEL2x8_E2
+
+       b               .LCGEMM_L2x8_SUB1
+
+.LCGEMM_L2x8_SUB4:
+
+       KERNEL2x8_SUBI1
+       KERNEL2x8_SUB1
+       KERNEL2x8_SUB1
+       KERNEL2x8_SUB1
+
+       KERNEL2x8_SUB1
+       KERNEL2x8_SUB1
+       KERNEL2x8_SUB1
+       KERNEL2x8_SUB1
+
+       b               .LCGEMM_L2x8_SUB1
+
+.LCGEMM_L2x8_SUB0:
+
+       andi.           L,      K,      7
+
+       KERNEL2x8_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LCGEMM_L2x8_SAVE
+       b               .LCGEMM_L2x8_SUB2
+
+.LCGEMM_L2x8_SUB1:
+
+       andi.           L,      K,      7
+       ble             .LCGEMM_L2x8_SAVE
+
+.LCGEMM_L2x8_SUB2:
+
+       KERNEL2x8_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LCGEMM_L2x8_SUB2
+
+.LCGEMM_L2x8_SAVE:
+
+       SAVE2x8
+
+       addic.          I,      I,      -1
+       bgt             .LCGEMM_L2x8_BEGIN
+
+.LCGEMM_L2x8_END:
+
+.LCGEMM_L2x4_BEGIN:
+
+       andi.           T2,     M,      7
+       ble             .LCGEMM_L2x1_END
+
+       andi.           T1,     M,      4
+       ble             .LCGEMM_L2x4_END
+       mr              BO,     B
+       srawi.          L,      K,      3
+       ble             .LCGEMM_L2x4_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LCGEMM_L2x4_SUB4
+
+.LCGEMM_L2x4_LOOP_START:
+
+       LOAD2x4_1
+       KERNEL2x4_I1
+       KERNEL2x4_2
+       KERNEL2x4_1
+       KERNEL2x4_2
+
+       KERNEL2x4_1
+       KERNEL2x4_2
+       KERNEL2x4_1
+       KERNEL2x4_2
+
+       addic.          L,      L,      -2
+       ble             .LCGEMM_L2x4_LOOP_END
+
+       .align 5
+
+.LCGEMM_L2x4_LOOP:
+
+       KERNEL2x4_1
+       KERNEL2x4_2
+       KERNEL2x4_1
+       KERNEL2x4_2
+
+       KERNEL2x4_1
+       KERNEL2x4_2
+       KERNEL2x4_1
+       KERNEL2x4_2
+
+       addic.          L,      L,      -1
+       bgt             .LCGEMM_L2x4_LOOP
+
+.LCGEMM_L2x4_LOOP_END:
+
+       KERNEL2x4_1
+       KERNEL2x4_2
+       KERNEL2x4_1
+       KERNEL2x4_2
+
+       KERNEL2x4_1
+       KERNEL2x4_2
+       KERNEL2x4_1
+       KERNEL2x4_E2
+
+       b               .LCGEMM_L2x4_SUB1
+
+.LCGEMM_L2x4_SUB4:
+
+       KERNEL2x4_SUBI1
+       KERNEL2x4_SUB1
+       KERNEL2x4_SUB1
+       KERNEL2x4_SUB1
+
+       KERNEL2x4_SUB1
+       KERNEL2x4_SUB1
+       KERNEL2x4_SUB1
+       KERNEL2x4_SUB1
+
+       b               .LCGEMM_L2x4_SUB1
+
+.LCGEMM_L2x4_SUB0:
+
+       andi.           L,      K,      7
+
+       KERNEL2x4_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LCGEMM_L2x4_SAVE
+       b               .LCGEMM_L2x4_SUB2
+
+.LCGEMM_L2x4_SUB1:
+
+       andi.           L,      K,      7
+       ble             .LCGEMM_L2x4_SAVE
+
+.LCGEMM_L2x4_SUB2:
+
+       KERNEL2x4_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LCGEMM_L2x4_SUB2
+
+.LCGEMM_L2x4_SAVE:
+
+       SAVE2x4
+
+.LCGEMM_L2x4_END:
+
+.LCGEMM_L2x2_BEGIN:
+
+
+       andi.           T1,     M,      2
+       ble             .LCGEMM_L2x2_END
+       mr              BO,     B
+       srawi.          L,      K,      3
+       ble             .LCGEMM_L2x2_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LCGEMM_L2x2_SUB4
+
+.LCGEMM_L2x2_LOOP_START:
+
+       LOAD2x2_1
+       KERNEL2x2_I1
+       KERNEL2x2_2
+       KERNEL2x2_1
+       KERNEL2x2_2
+
+       KERNEL2x2_1
+       KERNEL2x2_2
+       KERNEL2x2_1
+       KERNEL2x2_2
+
+       addic.          L,      L,      -2
+       ble             .LCGEMM_L2x2_LOOP_END
+
+       .align 5
+
+.LCGEMM_L2x2_LOOP:
+
+       KERNEL2x2_1
+       KERNEL2x2_2
+       KERNEL2x2_1
+       KERNEL2x2_2
+
+       KERNEL2x2_1
+       KERNEL2x2_2
+       KERNEL2x2_1
+       KERNEL2x2_2
+
+       addic.          L,      L,      -1
+       bgt             .LCGEMM_L2x2_LOOP
+
+.LCGEMM_L2x2_LOOP_END:
+
+       KERNEL2x2_1
+       KERNEL2x2_2
+       KERNEL2x2_1
+       KERNEL2x2_2
+
+       KERNEL2x2_1
+       KERNEL2x2_2
+       KERNEL2x2_1
+       KERNEL2x2_E2
+
+       b               .LCGEMM_L2x2_SUB1
+
+.LCGEMM_L2x2_SUB4:
+
+       KERNEL2x2_SUBI1
+       KERNEL2x2_SUB1
+       KERNEL2x2_SUB1
+       KERNEL2x2_SUB1
+
+       KERNEL2x2_SUB1
+       KERNEL2x2_SUB1
+       KERNEL2x2_SUB1
+       KERNEL2x2_SUB1
+
+       b               .LCGEMM_L2x2_SUB1
+
+.LCGEMM_L2x2_SUB0:
+
+       andi.           L,      K,      7
+
+       KERNEL2x2_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LCGEMM_L2x2_SAVE
+       b               .LCGEMM_L2x2_SUB2
+
+.LCGEMM_L2x2_SUB1:
+
+       andi.           L,      K,      7
+       ble             .LCGEMM_L2x2_SAVE
+
+.LCGEMM_L2x2_SUB2:
+
+       KERNEL2x2_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LCGEMM_L2x2_SUB2
+
+.LCGEMM_L2x2_SAVE:
+
+       SAVE2x2
+
+.LCGEMM_L2x2_END:
+
+.LCGEMM_L2x1_BEGIN:
+
+
+       andi.           T1,     M,      1
+       ble             .LCGEMM_L2x1_END
+       mr              BO,     B
+       srawi.          L,      K,      3
+       ble             .LCGEMM_L2x1_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LCGEMM_L2x1_SUB4
+
+.LCGEMM_L2x1_LOOP_START:
+
+       LOAD2x1_1
+       KERNEL2x1_I1
+       KERNEL2x1_2
+       KERNEL2x1_1
+       KERNEL2x1_2
+
+       KERNEL2x1_1
+       KERNEL2x1_2
+       KERNEL2x1_1
+       KERNEL2x1_2
+
+       addic.          L,      L,      -2
+       ble             .LCGEMM_L2x1_LOOP_END
+
+       .align 5
+
+.LCGEMM_L2x1_LOOP:
+
+       KERNEL2x1_1
+       KERNEL2x1_2
+       KERNEL2x1_1
+       KERNEL2x1_2
+
+       KERNEL2x1_1
+       KERNEL2x1_2
+       KERNEL2x1_1
+       KERNEL2x1_2
+
+       addic.          L,      L,      -1
+       bgt             .LCGEMM_L2x1_LOOP
+
+.LCGEMM_L2x1_LOOP_END:
+
+       KERNEL2x1_1
+       KERNEL2x1_2
+       KERNEL2x1_1
+       KERNEL2x1_2
+
+       KERNEL2x1_1
+       KERNEL2x1_2
+       KERNEL2x1_1
+       KERNEL2x1_E2
+
+       b               .LCGEMM_L2x1_SUB1
+
+.LCGEMM_L2x1_SUB4:
+
+       KERNEL2x1_SUBI1
+       KERNEL2x1_SUB1
+       KERNEL2x1_SUB1
+       KERNEL2x1_SUB1
+
+       KERNEL2x1_SUB1
+       KERNEL2x1_SUB1
+       KERNEL2x1_SUB1
+       KERNEL2x1_SUB1
+
+       b               .LCGEMM_L2x1_SUB1
+
+.LCGEMM_L2x1_SUB0:
+
+       andi.           L,      K,      7
+
+       KERNEL2x1_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LCGEMM_L2x1_SAVE
+       b               .LCGEMM_L2x1_SUB2
+
+.LCGEMM_L2x1_SUB1:
+
+       andi.           L,      K,      7
+       ble             .LCGEMM_L2x1_SAVE
+
+.LCGEMM_L2x1_SUB2:
+
+       KERNEL2x1_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LCGEMM_L2x1_SUB2
+
+.LCGEMM_L2x1_SAVE:
+
+       SAVE2x1
+
+.LCGEMM_L2x1_END:
+
+       slwi            T1,     K,      4
+       add             B,      B,      T1
+
+.LCGEMM_L2_END:
+
+       b               .LCGEMM_L1_BEGIN
+
+.L999_H2:
+
+       b               .L999
+
+.LCGEMM_L1_BEGIN:
+
+       andi.           T1,     N,      1
+       ble             .LCGEMM_L1_END
+       mr              CO,     C
+       mr              AO,     A
+       srawi.          I,      M,      3
+       ble             .LCGEMM_L1x8_END
+
+.LCGEMM_L1x8_BEGIN:
+
+
+       mr              BO,     B
+       srawi.          L,      K,      3
+       ble             .LCGEMM_L1x8_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LCGEMM_L1x8_SUB4
+
+.LCGEMM_L1x8_LOOP_START:
+
+       dcbt            AO,     PRE
+       LOAD1x8_1
+       KERNEL1x8_I1
+       dcbt            AO,     PRE
+       KERNEL1x8_2
+       KERNEL1x8_1
+       dcbt            AO,     PRE
+       KERNEL1x8_2
+
+       KERNEL1x8_1
+       dcbt            AO,     PRE
+       KERNEL1x8_2
+       KERNEL1x8_1
+       dcbt            AO,     PRE
+       KERNEL1x8_2
+
+       addic.          L,      L,      -2
+       ble             .LCGEMM_L1x8_LOOP_END
+
+       .align 5
+
+.LCGEMM_L1x8_LOOP:
+
+       KERNEL1x8_1
+       dcbt            AO,     PRE
+       KERNEL1x8_2
+       KERNEL1x8_1
+       dcbt            AO,     PRE
+       KERNEL1x8_2
+
+       KERNEL1x8_1
+       dcbt            AO,     PRE
+       KERNEL1x8_2
+       KERNEL1x8_1
+       dcbt            AO,     PRE
+       KERNEL1x8_2
+
+       addic.          L,      L,      -1
+       bgt             .LCGEMM_L1x8_LOOP
+
+.LCGEMM_L1x8_LOOP_END:
+
+       KERNEL1x8_1
+       dcbt            AO,     PRE
+       KERNEL1x8_2
+       KERNEL1x8_1
+       dcbt            AO,     PRE
+       KERNEL1x8_2
+
+       KERNEL1x8_1
+       dcbt            AO,     PRE
+       KERNEL1x8_2
+       KERNEL1x8_1
+       KERNEL1x8_E2
+
+       b               .LCGEMM_L1x8_SUB1
+
+.LCGEMM_L1x8_SUB4:
+
+       KERNEL1x8_SUBI1
+       KERNEL1x8_SUB1
+       KERNEL1x8_SUB1
+       KERNEL1x8_SUB1
+
+       KERNEL1x8_SUB1
+       KERNEL1x8_SUB1
+       KERNEL1x8_SUB1
+       KERNEL1x8_SUB1
+
+       b               .LCGEMM_L1x8_SUB1
+
+.LCGEMM_L1x8_SUB0:
+
+       andi.           L,      K,      7
+
+       KERNEL1x8_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LCGEMM_L1x8_SAVE
+       b               .LCGEMM_L1x8_SUB2
+
+.LCGEMM_L1x8_SUB1:
+
+       andi.           L,      K,      7
+       ble             .LCGEMM_L1x8_SAVE
+
+.LCGEMM_L1x8_SUB2:
+
+       KERNEL1x8_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LCGEMM_L1x8_SUB2
+
+.LCGEMM_L1x8_SAVE:
+
+       SAVE1x8
+
+       addic.          I,      I,      -1
+       bgt             .LCGEMM_L1x8_BEGIN
+
+.LCGEMM_L1x8_END:
+
+.LCGEMM_L1x4_BEGIN:
+
+       andi.           T2,     M,      7
+       ble             .LCGEMM_L1x1_END
+
+       andi.           T1,     M,      4
+       ble             .LCGEMM_L1x4_END
+       mr              BO,     B
+       srawi.          L,      K,      3
+       ble             .LCGEMM_L1x4_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LCGEMM_L1x4_SUB4
+
+.LCGEMM_L1x4_LOOP_START:
+
+       LOAD1x4_1
+       KERNEL1x4_I1
+       KERNEL1x4_2
+       KERNEL1x4_1
+       KERNEL1x4_2
+
+       KERNEL1x4_1
+       KERNEL1x4_2
+       KERNEL1x4_1
+       KERNEL1x4_2
+
+       addic.          L,      L,      -2
+       ble             .LCGEMM_L1x4_LOOP_END
+
+       .align 5
+
+.LCGEMM_L1x4_LOOP:
+
+       KERNEL1x4_1
+       KERNEL1x4_2
+       KERNEL1x4_1
+       KERNEL1x4_2
+
+       KERNEL1x4_1
+       KERNEL1x4_2
+       KERNEL1x4_1
+       KERNEL1x4_2
+
+       addic.          L,      L,      -1
+       bgt             .LCGEMM_L1x4_LOOP
+
+.LCGEMM_L1x4_LOOP_END:
+
+       KERNEL1x4_1
+       KERNEL1x4_2
+       KERNEL1x4_1
+       KERNEL1x4_2
+
+       KERNEL1x4_1
+       KERNEL1x4_2
+       KERNEL1x4_1
+       KERNEL1x4_E2
+
+       b               .LCGEMM_L1x4_SUB1
+
+.LCGEMM_L1x4_SUB4:
+
+       KERNEL1x4_SUBI1
+       KERNEL1x4_SUB1
+       KERNEL1x4_SUB1
+       KERNEL1x4_SUB1
+
+       KERNEL1x4_SUB1
+       KERNEL1x4_SUB1
+       KERNEL1x4_SUB1
+       KERNEL1x4_SUB1
+
+       b               .LCGEMM_L1x4_SUB1
+
+.LCGEMM_L1x4_SUB0:
+
+       andi.           L,      K,      7
+
+       KERNEL1x4_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LCGEMM_L1x4_SAVE
+       b               .LCGEMM_L1x4_SUB2
+
+.LCGEMM_L1x4_SUB1:
+
+       andi.           L,      K,      7
+       ble             .LCGEMM_L1x4_SAVE
+
+.LCGEMM_L1x4_SUB2:
+
+       KERNEL1x4_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LCGEMM_L1x4_SUB2
+
+.LCGEMM_L1x4_SAVE:
+
+       SAVE1x4
+
+.LCGEMM_L1x4_END:
+
+.LCGEMM_L1x2_BEGIN:
+
+
+       andi.           T1,     M,      2
+       ble             .LCGEMM_L1x2_END
+       mr              BO,     B
+       srawi.          L,      K,      3
+       ble             .LCGEMM_L1x2_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LCGEMM_L1x2_SUB4
+
+.LCGEMM_L1x2_LOOP_START:
+
+       LOAD1x2_1
+       KERNEL1x2_I1
+       KERNEL1x2_2
+       KERNEL1x2_1
+       KERNEL1x2_2
+
+       KERNEL1x2_1
+       KERNEL1x2_2
+       KERNEL1x2_1
+       KERNEL1x2_2
+
+       addic.          L,      L,      -2
+       ble             .LCGEMM_L1x2_LOOP_END
+
+       .align 5
+
+.LCGEMM_L1x2_LOOP:
+
+       KERNEL1x2_1
+       KERNEL1x2_2
+       KERNEL1x2_1
+       KERNEL1x2_2
+
+       KERNEL1x2_1
+       KERNEL1x2_2
+       KERNEL1x2_1
+       KERNEL1x2_2
+
+       addic.          L,      L,      -1
+       bgt             .LCGEMM_L1x2_LOOP
+
+.LCGEMM_L1x2_LOOP_END:
+
+       KERNEL1x2_1
+       KERNEL1x2_2
+       KERNEL1x2_1
+       KERNEL1x2_2
+
+       KERNEL1x2_1
+       KERNEL1x2_2
+       KERNEL1x2_1
+       KERNEL1x2_E2
+
+       b               .LCGEMM_L1x2_SUB1
+
+.LCGEMM_L1x2_SUB4:
+
+       KERNEL1x2_SUBI1
+       KERNEL1x2_SUB1
+       KERNEL1x2_SUB1
+       KERNEL1x2_SUB1
+
+       KERNEL1x2_SUB1
+       KERNEL1x2_SUB1
+       KERNEL1x2_SUB1
+       KERNEL1x2_SUB1
+
+       b               .LCGEMM_L1x2_SUB1
+
+.LCGEMM_L1x2_SUB0:
+
+       andi.           L,      K,      7
+
+       KERNEL1x2_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LCGEMM_L1x2_SAVE
+       b               .LCGEMM_L1x2_SUB2
+
+.LCGEMM_L1x2_SUB1:
+
+       andi.           L,      K,      7
+       ble             .LCGEMM_L1x2_SAVE
+
+.LCGEMM_L1x2_SUB2:
+
+       KERNEL1x2_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LCGEMM_L1x2_SUB2
+
+.LCGEMM_L1x2_SAVE:
+
+       SAVE1x2
+
+.LCGEMM_L1x2_END:
+
+.LCGEMM_L1x1_BEGIN:
+
+
+       andi.           T1,     M,      1
+       ble             .LCGEMM_L1x1_END
+       mr              BO,     B
+       srawi.          L,      K,      3
+       ble             .LCGEMM_L1x1_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LCGEMM_L1x1_SUB4
+
+.LCGEMM_L1x1_LOOP_START:
+
+       LOAD1x1_1
+       KERNEL1x1_I1
+       KERNEL1x1_2
+       KERNEL1x1_1
+       KERNEL1x1_2
+
+       KERNEL1x1_1
+       KERNEL1x1_2
+       KERNEL1x1_1
+       KERNEL1x1_2
+
+       addic.          L,      L,      -2
+       ble             .LCGEMM_L1x1_LOOP_END
+
+       .align 5
+
+.LCGEMM_L1x1_LOOP:
+
+       KERNEL1x1_1
+       KERNEL1x1_2
+       KERNEL1x1_1
+       KERNEL1x1_2
+
+       KERNEL1x1_1
+       KERNEL1x1_2
+       KERNEL1x1_1
+       KERNEL1x1_2
+
+       addic.          L,      L,      -1
+       bgt             .LCGEMM_L1x1_LOOP
+
+.LCGEMM_L1x1_LOOP_END:
+
+       KERNEL1x1_1
+       KERNEL1x1_2
+       KERNEL1x1_1
+       KERNEL1x1_2
+
+       KERNEL1x1_1
+       KERNEL1x1_2
+       KERNEL1x1_1
+       KERNEL1x1_E2
+
+       b               .LCGEMM_L1x1_SUB1
+
+.LCGEMM_L1x1_SUB4:
+
+       KERNEL1x1_SUBI1
+       KERNEL1x1_SUB1
+       KERNEL1x1_SUB1
+       KERNEL1x1_SUB1
+
+       KERNEL1x1_SUB1
+       KERNEL1x1_SUB1
+       KERNEL1x1_SUB1
+       KERNEL1x1_SUB1
+
+       b               .LCGEMM_L1x1_SUB1
+
+.LCGEMM_L1x1_SUB0:
+
+       andi.           L,      K,      7
+
+       KERNEL1x1_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LCGEMM_L1x1_SAVE
+       b               .LCGEMM_L1x1_SUB2
+
+.LCGEMM_L1x1_SUB1:
+
+       andi.           L,      K,      7
+       ble             .LCGEMM_L1x1_SAVE
+
+.LCGEMM_L1x1_SUB2:
+
+       KERNEL1x1_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LCGEMM_L1x1_SUB2
+
+.LCGEMM_L1x1_SAVE:
+
+       SAVE1x1
+
+.LCGEMM_L1x1_END:
+
+.LCGEMM_L1_END:
diff --git a/kernel/power/cgemm_macros_8x4_power8.S b/kernel/power/cgemm_macros_8x4_power8.S
new file mode 100644
index 0000000..2085d37
--- /dev/null
@@ -0,0 +1,6713 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/18 Werner Saar (wernsaar@googlemail.com)
+*       BLASTEST               : OK
+*       CTEST                  : OK
+*       TEST                   : OK
+*       LAPACK-TEST            : OK
+**************************************************************************************/
+
+#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
+
+       #define XSFADD_R1   xsaddsp
+       #define XSFADD_R2   xssubsp
+       #define XSFADD_I1   xsaddsp
+       #define XSFADD_I2   xsaddsp
+
+#elif  defined(CN) || defined(CT) || defined(RN) || defined(RT)
+
+       #define XSFADD_R1   xsaddsp
+       #define XSFADD_R2   xsaddsp
+       #define XSFADD_I1   xssubsp
+       #define XSFADD_I2   xsaddsp
+
+#elif  defined(NC) || defined(TC) || defined(NR) || defined(TR)
+
+       #define XSFADD_R1   xsaddsp
+       #define XSFADD_R2   xsaddsp
+       #define XSFADD_I1   xsaddsp
+       #define XSFADD_I2   xssubsp
+
+#else             // CC || CR || RC || RR
+
+       #define XSFADD_R1   xsaddsp
+       #define XSFADD_R2   xssubsp
+       #define XSFADD_I1   xssubsp
+       #define XSFADD_I2   xssubsp
+
+#endif
+
+/**********************************************************************************************
+* Macros for N=4 and M=8
+**********************************************************************************************/
+
+.macro LOAD4x8_1
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs1,    o16,    AO              // load a2, a3
+
+       lxvw4x          vs2,    o32,    AO              // load a4, a5
+
+       lxvw4x          vs3,    o48,    AO              // load a6, a7
+
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+       lxvw4x          vs25,   o16,    BO              //  load b2, b3
+
+       xxspltw         vs12,   vs25,   0
+       xxspltw         vs13,   vs25,   1
+       xxspltw         vs14,   vs25,   2
+       xxspltw         vs15,   vs25,   3
+
+
+       addi            BO,     BO,     32
+
+.endm
+
+.macro KERNEL4x8_I1
+
+
+       lxvw4x          vs4,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs5,    o16,    AO              // load a2, a3
+
+       lxvw4x          vs6,    o32,    AO              // load a4, a5
+
+       lxvw4x          vs7,    o48,    AO              // load a6, a7
+
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs16,   vs24,   0
+       xxspltw         vs17,   vs24,   1
+       xxspltw         vs18,   vs24,   2
+       xxspltw         vs19,   vs24,   3
+
+       lxvw4x          vs25,   o16,    BO              //  load b2, b3
+
+       xxspltw         vs20,   vs25,   0
+       xxspltw         vs21,   vs25,   1
+       xxspltw         vs22,   vs25,   2
+       xxspltw         vs23,   vs25,   3
+
+
+       addi            BO,     BO,     32
+
+
+       xvmulsp         vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmulsp         vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmulsp         vs34,   vs1,    vs8             // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmulsp         vs35,   vs1,    vs9             // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmulsp         vs36,   vs2,    vs8             // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmulsp         vs37,   vs2,    vs9             // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmulsp         vs38,   vs3,    vs8             // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmulsp         vs39,   vs3,    vs9             // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+       xvmulsp         vs40,   vs0,    vs10            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmulsp         vs41,   vs0,    vs11            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+       xvmulsp         vs42,   vs1,    vs10            // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmulsp         vs43,   vs1,    vs11            // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i
+       xvmulsp         vs44,   vs2,    vs10            // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmulsp         vs45,   vs2,    vs11            // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i
+       xvmulsp         vs46,   vs3,    vs10            // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmulsp         vs47,   vs3,    vs11            // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+       xvmulsp         vs48,   vs0,    vs12            // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+       xvmulsp         vs49,   vs0,    vs13            // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+       xvmulsp         vs50,   vs1,    vs12            // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r
+       xvmulsp         vs51,   vs1,    vs13            // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i
+       xvmulsp         vs52,   vs2,    vs12            // a2_r*b2_r, a2_i*b2_r, a1_r*b2_r, a1_i*b2_r
+       xvmulsp         vs53,   vs2,    vs13            // a2_r*b2_i, a2_i*b2_i, a1_r*b2_i, a1_i*b2_i
+       xvmulsp         vs54,   vs3,    vs12            // a3_r*b2_r, a3_i*b2_r, a1_r*b2_r, a1_i*b2_r
+       xvmulsp         vs55,   vs3,    vs13            // a3_r*b2_i, a3_i*b2_i, a1_r*b2_i, a1_i*b2_i
+
+       xvmulsp         vs56,   vs0,    vs14            // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+       xvmulsp         vs57,   vs0,    vs15            // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+       xvmulsp         vs58,   vs1,    vs14            // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r
+       xvmulsp         vs59,   vs1,    vs15            // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i
+       xvmulsp         vs60,   vs2,    vs14            // a2_r*b3_r, a2_i*b3_r, a1_r*b3_r, a1_i*b3_r
+       xvmulsp         vs61,   vs2,    vs15            // a2_r*b3_i, a2_i*b3_i, a1_r*b3_i, a1_i*b3_i
+       xvmulsp         vs62,   vs3,    vs14            // a3_r*b3_r, a3_i*b3_r, a1_r*b3_r, a1_i*b3_r
+       xvmulsp         vs63,   vs3,    vs15            // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i
+
+
+.endm
+
+.macro KERNEL4x8_1
+
+       xvmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+       lxvw4x          vs4,    o0,     AO              // load a0, a1
+
+       xvmaddasp       vs34,   vs1,    vs8             // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs35,   vs1,    vs9             // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+       lxvw4x          vs25,   o16,    BO              //  load b2, b3
+       lxvw4x          vs5,    o16,    AO              // load a2, a3
+
+       xvmaddasp       vs36,   vs2,    vs8             // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs37,   vs2,    vs9             // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+       lxvw4x          vs6,    o32,    AO              // load a4, a5
+       lxvw4x          vs7,    o48,    AO              // load a6, a7
+
+       xvmaddasp       vs38,   vs3,    vs8             // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs39,   vs3,    vs9             // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+       xvmaddasp       vs40,   vs0,    vs10            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmaddasp       vs41,   vs0,    vs11            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+       xvmaddasp       vs42,   vs1,    vs10            // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmaddasp       vs43,   vs1,    vs11            // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i
+       xvmaddasp       vs44,   vs2,    vs10            // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmaddasp       vs45,   vs2,    vs11            // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i
+       xvmaddasp       vs46,   vs3,    vs10            // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmaddasp       vs47,   vs3,    vs11            // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+       xvmaddasp       vs48,   vs0,    vs12            // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+       xvmaddasp       vs49,   vs0,    vs13            // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+       xvmaddasp       vs50,   vs1,    vs12            // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r
+       xvmaddasp       vs51,   vs1,    vs13            // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i
+       xvmaddasp       vs52,   vs2,    vs12            // a2_r*b2_r, a2_i*b2_r, a1_r*b2_r, a1_i*b2_r
+       xvmaddasp       vs53,   vs2,    vs13            // a2_r*b2_i, a2_i*b2_i, a1_r*b2_i, a1_i*b2_i
+       xvmaddasp       vs54,   vs3,    vs12            // a3_r*b2_r, a3_i*b2_r, a1_r*b2_r, a1_i*b2_r
+       xvmaddasp       vs55,   vs3,    vs13            // a3_r*b2_i, a3_i*b2_i, a1_r*b2_i, a1_i*b2_i
+
+       xxspltw         vs16,   vs24,   0
+       xxspltw         vs17,   vs24,   1
+       xxspltw         vs18,   vs24,   2
+       xxspltw         vs19,   vs24,   3
+
+       xvmaddasp       vs56,   vs0,    vs14            // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+       xvmaddasp       vs57,   vs0,    vs15            // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+       xvmaddasp       vs58,   vs1,    vs14            // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r
+       xvmaddasp       vs59,   vs1,    vs15            // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i
+       addi            BO,     BO,     32
+       xvmaddasp       vs60,   vs2,    vs14            // a2_r*b3_r, a2_i*b3_r, a1_r*b3_r, a1_i*b3_r
+       xvmaddasp       vs61,   vs2,    vs15            // a2_r*b3_i, a2_i*b3_i, a1_r*b3_i, a1_i*b3_i
+       addi            AO,     AO,     64
+       xvmaddasp       vs62,   vs3,    vs14            // a3_r*b3_r, a3_i*b3_r, a1_r*b3_r, a1_i*b3_r
+       xvmaddasp       vs63,   vs3,    vs15            // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i
+
+       xxspltw         vs20,   vs25,   0
+       xxspltw         vs21,   vs25,   1
+       xxspltw         vs22,   vs25,   2
+       xxspltw         vs23,   vs25,   3
+
+.endm
+
+.macro KERNEL4x8_2
+
+       xvmaddasp       vs32,   vs4,    vs16            // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs4,    vs17            // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+       xvmaddasp       vs34,   vs5,    vs16            // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs35,   vs5,    vs17            // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+       lxvw4x          vs25,   o16,    BO              //  load b2, b3
+       lxvw4x          vs1,    o16,    AO              // load a2, a3
+
+       xvmaddasp       vs36,   vs6,    vs16            // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs37,   vs6,    vs17            // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       lxvw4x          vs2,    o32,    AO              // load a4, a5
+       lxvw4x          vs3,    o48,    AO              // load a6, a7
+       xvmaddasp       vs38,   vs7,    vs16            // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs39,   vs7,    vs17            // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+       xvmaddasp       vs40,   vs4,    vs18            // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmaddasp       vs41,   vs4,    vs19            // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i
+       xvmaddasp       vs42,   vs5,    vs18            // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmaddasp       vs43,   vs5,    vs19            // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i
+       xvmaddasp       vs44,   vs6,    vs18            // a6_r*b1_r, a6_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmaddasp       vs45,   vs6,    vs19            // a6_r*b1_i, a6_i*b1_i, a1_r*b1_i, a1_i*b1_i
+       xvmaddasp       vs46,   vs7,    vs18            // a7_r*b1_r, a7_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmaddasp       vs47,   vs7,    vs19            // a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+       xvmaddasp       vs48,   vs4,    vs20            // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r
+       xvmaddasp       vs49,   vs4,    vs21            // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i
+       xvmaddasp       vs50,   vs5,    vs20            // a5_r*b2_r, a5_i*b2_r, a1_r*b2_r, a1_i*b2_r
+       xvmaddasp       vs51,   vs5,    vs21            // a5_r*b2_i, a5_i*b2_i, a1_r*b2_i, a1_i*b2_i
+       xvmaddasp       vs52,   vs6,    vs20            // a6_r*b2_r, a6_i*b2_r, a1_r*b2_r, a1_i*b2_r
+       xvmaddasp       vs53,   vs6,    vs21            // a6_r*b2_i, a6_i*b2_i, a1_r*b2_i, a1_i*b2_i
+       xvmaddasp       vs54,   vs7,    vs20            // a7_r*b2_r, a7_i*b2_r, a1_r*b2_r, a1_i*b2_r
+       xvmaddasp       vs55,   vs7,    vs21            // a7_r*b2_i, a7_i*b2_i, a1_r*b2_i, a1_i*b2_i
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+       xvmaddasp       vs56,   vs4,    vs22            // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r
+       xvmaddasp       vs57,   vs4,    vs23            // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i
+       xvmaddasp       vs58,   vs5,    vs22            // a5_r*b3_r, a5_i*b3_r, a1_r*b3_r, a1_i*b3_r
+       xvmaddasp       vs59,   vs5,    vs23            // a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i
+       addi            AO,     AO,     64
+       xvmaddasp       vs60,   vs6,    vs22            // a6_r*b3_r, a6_i*b3_r, a1_r*b3_r, a1_i*b3_r
+       xvmaddasp       vs61,   vs6,    vs23            // a6_r*b3_i, a6_i*b3_i, a1_r*b3_i, a1_i*b3_i
+       addi            BO,     BO,     32
+       xvmaddasp       vs62,   vs7,    vs22            // a7_r*b3_r, a7_i*b3_r, a1_r*b3_r, a1_i*b3_r
+       xvmaddasp       vs63,   vs7,    vs23            // a7_r*b3_i, a7_i*b3_i, a1_r*b3_i, a1_i*b3_i
+
+       xxspltw         vs12,   vs25,   0
+       xxspltw         vs13,   vs25,   1
+       xxspltw         vs14,   vs25,   2
+       xxspltw         vs15,   vs25,   3
+
+.endm
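The addi updates interleaved with the FMAs above advance the packed-buffer pointers once per half: the 8x4 micro-tile consumes 8 packed complex singles of A (64 bytes) and 4 packed complex singles of B (32 bytes) per k-step. A small C sketch of that bookkeeping (advance_packed_ptrs is an invented name, shown only to make the byte counts explicit):

    #include <complex.h>

    /* Per-half pointer advance for the 8x4 complex-single micro-tile. */
    static void advance_packed_ptrs(float complex **ao, float complex **bo)
    {
        *ao += 8;   /* matches: addi AO, AO, 64 */
        *bo += 4;   /* matches: addi BO, BO, 32 */
    }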
+
+.macro KERNEL4x8_E2
+
+
+       xvmaddasp       vs32,   vs4,    vs16            // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs4,    vs17            // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmaddasp       vs34,   vs5,    vs16            // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs35,   vs5,    vs17            // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmaddasp       vs36,   vs6,    vs16            // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs37,   vs6,    vs17            // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmaddasp       vs38,   vs7,    vs16            // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs39,   vs7,    vs17            // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+       xvmaddasp       vs40,   vs4,    vs18            // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmaddasp       vs41,   vs4,    vs19            // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i
+       xvmaddasp       vs42,   vs5,    vs18            // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmaddasp       vs43,   vs5,    vs19            // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i
+       xvmaddasp       vs44,   vs6,    vs18            // a6_r*b1_r, a6_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmaddasp       vs45,   vs6,    vs19            // a6_r*b1_i, a6_i*b1_i, a1_r*b1_i, a1_i*b1_i
+       xvmaddasp       vs46,   vs7,    vs18            // a7_r*b1_r, a7_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmaddasp       vs47,   vs7,    vs19            // a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+       xvmaddasp       vs48,   vs4,    vs20            // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r
+       xvmaddasp       vs49,   vs4,    vs21            // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i
+       xvmaddasp       vs50,   vs5,    vs20            // a5_r*b2_r, a5_i*b2_r, a1_r*b2_r, a1_i*b2_r
+       xvmaddasp       vs51,   vs5,    vs21            // a5_r*b2_i, a5_i*b2_i, a1_r*b2_i, a1_i*b2_i
+       xvmaddasp       vs52,   vs6,    vs20            // a6_r*b2_r, a6_i*b2_r, a1_r*b2_r, a1_i*b2_r
+       xvmaddasp       vs53,   vs6,    vs21            // a6_r*b2_i, a6_i*b2_i, a1_r*b2_i, a1_i*b2_i
+       xvmaddasp       vs54,   vs7,    vs20            // a7_r*b2_r, a7_i*b2_r, a1_r*b2_r, a1_i*b2_r
+       xvmaddasp       vs55,   vs7,    vs21            // a7_r*b2_i, a7_i*b2_i, a1_r*b2_i, a1_i*b2_i
+
+       xvmaddasp       vs56,   vs4,    vs22            // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r
+       xvmaddasp       vs57,   vs4,    vs23            // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i
+       xvmaddasp       vs58,   vs5,    vs22            // a5_r*b3_r, a5_i*b3_r, a1_r*b3_r, a1_i*b3_r
+       xvmaddasp       vs59,   vs5,    vs23            // a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i
+       xvmaddasp       vs60,   vs6,    vs22            // a6_r*b3_r, a6_i*b3_r, a1_r*b3_r, a1_i*b3_r
+       xvmaddasp       vs61,   vs6,    vs23            // a6_r*b3_i, a6_i*b3_i, a1_r*b3_i, a1_i*b3_i
+       xvmaddasp       vs62,   vs7,    vs22            // a7_r*b3_r, a7_i*b3_r, a1_r*b3_r, a1_i*b3_r
+       xvmaddasp       vs63,   vs7,    vs23            // a7_r*b3_i, a7_i*b3_i, a1_r*b3_i, a1_i*b3_i
+
+
+.endm
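KERNEL4x8_1 and KERNEL4x8_2 form a double-buffered pair: each half runs the multiply-adds on one register set (vs0-vs3/vs8-vs15 or vs4-vs7/vs16-vs23) while loading and splatting the operands for the other, and KERNEL4x8_E2 is the drain step that consumes the last prefetched set without issuing further loads or pointer updates. A runnable C sketch of that pipelining under the assumption of an even unroll count (all names and the loop bound are invented; the real control flow lives in the logic file):

    #include <stdio.h>

    static void load_buf(int b)    { printf("load  buffer %d\n", b); }
    static void compute_buf(int b) { printf("fma   buffer %d\n", b); }

    int main(void)
    {
        int k_unrolled = 8;                 /* assumed multiple of 2 */
        load_buf(0);                        /* prologue fill */
        for (int k = 0; k < k_unrolled - 2; k += 2) {
            load_buf(1); compute_buf(0);    /* KERNEL4x8_1-style half */
            load_buf(0); compute_buf(1);    /* KERNEL4x8_2-style half */
        }
        load_buf(1); compute_buf(0);        /* last _1 half */
        compute_buf(1);                     /* KERNEL4x8_E2: drain, no loads */
        return 0;
    }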
+
+.macro KERNEL4x8_SUBI1
+
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs1,    o16,    AO              // load a2, a3
+
+       lxvw4x          vs2,    o32,    AO              // load a4, a5
+
+       lxvw4x          vs3,    o48,    AO              // load a6, a7
+
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+       lxvw4x          vs25,   o16,    BO              //  load b2, b3
+
+       xxspltw         vs12,   vs25,   0
+       xxspltw         vs13,   vs25,   1
+       xxspltw         vs14,   vs25,   2
+       xxspltw         vs15,   vs25,   3
+
+
+       addi            BO,     BO,     32
+
+
+       xvmulsp         vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmulsp         vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmulsp         vs34,   vs1,    vs8             // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmulsp         vs35,   vs1,    vs9             // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+       xvmulsp         vs36,   vs2,    vs8             // a4_r*b0_r, a4_i*b0_r, a5_r*b0_r, a5_i*b0_r
+       xvmulsp         vs37,   vs2,    vs9             // a4_r*b0_i, a4_i*b0_i, a5_r*b0_i, a5_i*b0_i
+       xvmulsp         vs38,   vs3,    vs8             // a6_r*b0_r, a6_i*b0_r, a7_r*b0_r, a7_i*b0_r
+       xvmulsp         vs39,   vs3,    vs9             // a6_r*b0_i, a6_i*b0_i, a7_r*b0_i, a7_i*b0_i
+
+       xvmulsp         vs40,   vs0,    vs10            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmulsp         vs41,   vs0,    vs11            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+       xvmulsp         vs42,   vs1,    vs10            // a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+       xvmulsp         vs43,   vs1,    vs11            // a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+       xvmulsp         vs44,   vs2,    vs10            // a4_r*b1_r, a4_i*b1_r, a5_r*b1_r, a5_i*b1_r
+       xvmulsp         vs45,   vs2,    vs11            // a4_r*b1_i, a4_i*b1_i, a5_r*b1_i, a5_i*b1_i
+       xvmulsp         vs46,   vs3,    vs10            // a6_r*b1_r, a6_i*b1_r, a7_r*b1_r, a7_i*b1_r
+       xvmulsp         vs47,   vs3,    vs11            // a6_r*b1_i, a6_i*b1_i, a7_r*b1_i, a7_i*b1_i
+
+       xvmulsp         vs48,   vs0,    vs12            // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+       xvmulsp         vs49,   vs0,    vs13            // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+       xvmulsp         vs50,   vs1,    vs12            // a2_r*b2_r, a2_i*b2_r, a3_r*b2_r, a3_i*b2_r
+       xvmulsp         vs51,   vs1,    vs13            // a2_r*b2_i, a2_i*b2_i, a3_r*b2_i, a3_i*b2_i
+       xvmulsp         vs52,   vs2,    vs12            // a4_r*b2_r, a4_i*b2_r, a5_r*b2_r, a5_i*b2_r
+       xvmulsp         vs53,   vs2,    vs13            // a4_r*b2_i, a4_i*b2_i, a5_r*b2_i, a5_i*b2_i
+       xvmulsp         vs54,   vs3,    vs12            // a6_r*b2_r, a6_i*b2_r, a7_r*b2_r, a7_i*b2_r
+       xvmulsp         vs55,   vs3,    vs13            // a6_r*b2_i, a6_i*b2_i, a7_r*b2_i, a7_i*b2_i
+
+       xvmulsp         vs56,   vs0,    vs14            // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+       xvmulsp         vs57,   vs0,    vs15            // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+       xvmulsp         vs58,   vs1,    vs14            // a2_r*b3_r, a2_i*b3_r, a3_r*b3_r, a3_i*b3_r
+       xvmulsp         vs59,   vs1,    vs15            // a2_r*b3_i, a2_i*b3_i, a3_r*b3_i, a3_i*b3_i
+       xvmulsp         vs60,   vs2,    vs14            // a4_r*b3_r, a4_i*b3_r, a5_r*b3_r, a5_i*b3_r
+       xvmulsp         vs61,   vs2,    vs15            // a4_r*b3_i, a4_i*b3_i, a5_r*b3_i, a5_i*b3_i
+       xvmulsp         vs62,   vs3,    vs14            // a6_r*b3_r, a6_i*b3_r, a7_r*b3_r, a7_i*b3_r
+       xvmulsp         vs63,   vs3,    vs15            // a6_r*b3_i, a6_i*b3_i, a7_r*b3_i, a7_i*b3_i
+
+
+.endm
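KERNEL4x8_SUBI1 differs from KERNEL4x8_SUB1 only in using xvmulsp instead of xvmaddasp: the first k-step writes the accumulators outright, so vs32..vs63 never need a separate zeroing pass. A tiny C sketch of that distinction (acc_init and acc_fma are invented helper names):

    /* First k-step initializes the accumulator, later k-steps fuse into it. */
    static void acc_init(float *acc, const float *a, float b, int n)   /* xvmulsp   */
    {
        for (int i = 0; i < n; i++) acc[i] = a[i] * b;
    }
    static void acc_fma(float *acc, const float *a, float b, int n)    /* xvmaddasp */
    {
        for (int i = 0; i < n; i++) acc[i] += a[i] * b;
    }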
+
+.macro KERNEL4x8_SUB1
+
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs1,    o16,    AO              // load a2, a3
+
+       lxvw4x          vs2,    o32,    AO              // load a4, a5
+
+       lxvw4x          vs3,    o48,    AO              // load a6, a7
+
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+       lxvw4x          vs25,   o16,    BO              //  load b2, b3
+
+       xxspltw         vs12,   vs25,   0
+       xxspltw         vs13,   vs25,   1
+       xxspltw         vs14,   vs25,   2
+       xxspltw         vs15,   vs25,   3
+
+
+       addi            BO,     BO,     32
+
+
+       xvmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmaddasp       vs34,   vs1,    vs8             // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmaddasp       vs35,   vs1,    vs9             // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+       xvmaddasp       vs36,   vs2,    vs8             // a4_r*b0_r, a4_i*b0_r, a5_r*b0_r, a5_i*b0_r
+       xvmaddasp       vs37,   vs2,    vs9             // a4_r*b0_i, a4_i*b0_i, a5_r*b0_i, a5_i*b0_i
+       xvmaddasp       vs38,   vs3,    vs8             // a6_r*b0_r, a6_i*b0_r, a7_r*b0_r, a7_i*b0_r
+       xvmaddasp       vs39,   vs3,    vs9             // a6_r*b0_i, a6_i*b0_i, a7_r*b0_i, a7_i*b0_i
+
+       xvmaddasp       vs40,   vs0,    vs10            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmaddasp       vs41,   vs0,    vs11            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+       xvmaddasp       vs42,   vs1,    vs10            // a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+       xvmaddasp       vs43,   vs1,    vs11            // a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+       xvmaddasp       vs44,   vs2,    vs10            // a4_r*b1_r, a4_i*b1_r, a5_r*b1_r, a5_i*b1_r
+       xvmaddasp       vs45,   vs2,    vs11            // a4_r*b1_i, a4_i*b1_i, a5_r*b1_i, a5_i*b1_i
+       xvmaddasp       vs46,   vs3,    vs10            // a6_r*b1_r, a6_i*b1_r, a7_r*b1_r, a7_i*b1_r
+       xvmaddasp       vs47,   vs3,    vs11            // a6_r*b1_i, a6_i*b1_i, a7_r*b1_i, a7_i*b1_i
+
+       xvmaddasp       vs48,   vs0,    vs12            // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+       xvmaddasp       vs49,   vs0,    vs13            // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+       xvmaddasp       vs50,   vs1,    vs12            // a2_r*b2_r, a2_i*b2_r, a3_r*b2_r, a3_i*b2_r
+       xvmaddasp       vs51,   vs1,    vs13            // a2_r*b2_i, a2_i*b2_i, a3_r*b2_i, a3_i*b2_i
+       xvmaddasp       vs52,   vs2,    vs12            // a4_r*b2_r, a4_i*b2_r, a5_r*b2_r, a5_i*b2_r
+       xvmaddasp       vs53,   vs2,    vs13            // a4_r*b2_i, a4_i*b2_i, a5_r*b2_i, a5_i*b2_i
+       xvmaddasp       vs54,   vs3,    vs12            // a6_r*b2_r, a6_i*b2_r, a7_r*b2_r, a7_i*b2_r
+       xvmaddasp       vs55,   vs3,    vs13            // a6_r*b2_i, a6_i*b2_i, a7_r*b2_i, a7_i*b2_i
+
+       xvmaddasp       vs56,   vs0,    vs14            // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+       xvmaddasp       vs57,   vs0,    vs15            // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+       xvmaddasp       vs58,   vs1,    vs14            // a2_r*b3_r, a2_i*b3_r, a3_r*b3_r, a3_i*b3_r
+       xvmaddasp       vs59,   vs1,    vs15            // a2_r*b3_i, a2_i*b3_i, a3_r*b3_i, a3_i*b3_i
+       xvmaddasp       vs60,   vs2,    vs14            // a4_r*b3_r, a4_i*b3_r, a5_r*b3_r, a5_i*b3_r
+       xvmaddasp       vs61,   vs2,    vs15            // a4_r*b3_i, a4_i*b3_i, a5_r*b3_i, a5_i*b3_i
+       xvmaddasp       vs62,   vs3,    vs14            // a6_r*b3_r, a6_i*b3_r, a7_r*b3_r, a7_i*b3_r
+       xvmaddasp       vs63,   vs3,    vs15            // a6_r*b3_i, a6_i*b3_i, a7_r*b3_i, a7_i*b3_i
+
+
+.endm
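At the end of the k-loop each even-numbered accumulator (vs32, vs34, ...) holds lane-wise sums of a*b_r and its odd-numbered partner the matching sums of a*b_i. For the plain (non-conjugated) case the save phase combines them and applies the complex alpha as below; the XSFADD_R1/R2/I1/I2 macros presumably encode the sign choices for the conjugated variants (an assumption, since their definitions are outside this hunk):

    t_r = \sum_k \left( a_r b_r - a_i b_i \right), \qquad
    t_i = \sum_k \left( a_r b_i + a_i b_r \right), \qquad
    c \mathrel{{+}{=}} (\alpha_r + i\,\alpha_i)\,(t_r + i\,t_i)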
+
+.macro SAVE4x8
+
+       mr              T1,     CO
+
+// N=0
+
+       mr              T2,     T1
+
+// N=0 M=0
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+       stxvw4x         vs32,   o0,     TBUFFER
+
+       lxsspx          vs8,    o0,     TBUFFER
+       lxsspx          vs9,    o4,     TBUFFER
+       lxsspx          vs10,   o8,     TBUFFER
+       lxsspx          vs11,   o12,    TBUFFER
+
+       stxvw4x         vs33,   o0,     TBUFFER
+
+       lxsspx          vs12,   o0,     TBUFFER
+       lxsspx          vs13,   o4,     TBUFFER
+       lxsspx          vs14,   o8,     TBUFFER
+       lxsspx          vs15,   o12,    TBUFFER
+
+       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
+       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
+       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
+       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+
+       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
+       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
+       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
+       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+
+       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
+       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
+       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
+       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
+       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+
+// N=0 M=2
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+       stxvw4x         vs34,   o0,     TBUFFER
+
+       lxsspx          vs8,    o0,     TBUFFER
+       lxsspx          vs9,    o4,     TBUFFER
+       lxsspx          vs10,   o8,     TBUFFER
+       lxsspx          vs11,   o12,    TBUFFER
+
+       stxvw4x         vs35,   o0,     TBUFFER
+
+       lxsspx          vs12,   o0,     TBUFFER
+       lxsspx          vs13,   o4,     TBUFFER
+       lxsspx          vs14,   o8,     TBUFFER
+       lxsspx          vs15,   o12,    TBUFFER
+
+       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
+       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
+       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
+       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+
+       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
+       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
+       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
+       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+
+       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
+       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
+       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
+       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
+       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+
+// N=0 M=4
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+       stxvw4x         vs36,   o0,     TBUFFER
+
+       lxsspx          vs8,    o0,     TBUFFER
+       lxsspx          vs9,    o4,     TBUFFER
+       lxsspx          vs10,   o8,     TBUFFER
+       lxsspx          vs11,   o12,    TBUFFER
+
+       stxvw4x         vs37,   o0,     TBUFFER
+
+       lxsspx          vs12,   o0,     TBUFFER
+       lxsspx          vs13,   o4,     TBUFFER
+       lxsspx          vs14,   o8,     TBUFFER
+       lxsspx          vs15,   o12,    TBUFFER
+
+       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
+       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
+       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
+       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+
+       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
+       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
+       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
+       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+
+       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
+       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
+       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
+       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
+       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+
+// N=0 M=6
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+       stxvw4x         vs38,   o0,     TBUFFER
+
+       lxsspx          vs8,    o0,     TBUFFER
+       lxsspx          vs9,    o4,     TBUFFER
+       lxsspx          vs10,   o8,     TBUFFER
+       lxsspx          vs11,   o12,    TBUFFER
+
+       stxvw4x         vs39,   o0,     TBUFFER
+
+       lxsspx          vs12,   o0,     TBUFFER
+       lxsspx          vs13,   o4,     TBUFFER
+       lxsspx          vs14,   o8,     TBUFFER
+       lxsspx          vs15,   o12,    TBUFFER
+
+       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
+       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
+       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
+       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+
+       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
+       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
+       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
+       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+
+       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
+       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
+       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
+       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
+       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+       add             T1,     T1,     LDC
+
+
+// N=1
+
+       mr              T2,     T1
+
+// N=1 M=0
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+       stxvw4x         vs40,   o0,     TBUFFER
+
+       lxsspx          vs8,    o0,     TBUFFER
+       lxsspx          vs9,    o4,     TBUFFER
+       lxsspx          vs10,   o8,     TBUFFER
+       lxsspx          vs11,   o12,    TBUFFER
+
+       stxvw4x         vs41,   o0,     TBUFFER
+
+       lxsspx          vs12,   o0,     TBUFFER
+       lxsspx          vs13,   o4,     TBUFFER
+       lxsspx          vs14,   o8,     TBUFFER
+       lxsspx          vs15,   o12,    TBUFFER
+
+       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
+       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
+       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
+       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+
+       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
+       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
+       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
+       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+
+       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
+       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
+       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
+       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
+       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+
+// N=1 M=2
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+       stxvw4x         vs42,   o0,     TBUFFER
+
+       lxsspx          vs8,    o0,     TBUFFER
+       lxsspx          vs9,    o4,     TBUFFER
+       lxsspx          vs10,   o8,     TBUFFER
+       lxsspx          vs11,   o12,    TBUFFER
+
+       stxvw4x         vs43,   o0,     TBUFFER
+
+       lxsspx          vs12,   o0,     TBUFFER
+       lxsspx          vs13,   o4,     TBUFFER
+       lxsspx          vs14,   o8,     TBUFFER
+       lxsspx          vs15,   o12,    TBUFFER
+
+       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
+       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
+       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
+       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+
+       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
+       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
+       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
+       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+
+       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
+       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
+       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
+       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
+       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+
+// N=1 M=4
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+       stxvw4x         vs44,   o0,     TBUFFER
+
+       lxsspx          vs8,    o0,     TBUFFER
+       lxsspx          vs9,    o4,     TBUFFER
+       lxsspx          vs10,   o8,     TBUFFER
+       lxsspx          vs11,   o12,    TBUFFER
+
+       stxvw4x         vs45,   o0,     TBUFFER
+
+       lxsspx          vs12,   o0,     TBUFFER
+       lxsspx          vs13,   o4,     TBUFFER
+       lxsspx          vs14,   o8,     TBUFFER
+       lxsspx          vs15,   o12,    TBUFFER
+
+       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
+       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
+       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
+       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+
+       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
+       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
+       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
+       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+
+       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
+       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
+       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
+       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
+       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+
+// N=1 M=6
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+       stxvw4x         vs46,   o0,     TBUFFER
+
+       lxsspx          vs8,    o0,     TBUFFER
+       lxsspx          vs9,    o4,     TBUFFER
+       lxsspx          vs10,   o8,     TBUFFER
+       lxsspx          vs11,   o12,    TBUFFER
+
+       stxvw4x         vs47,   o0,     TBUFFER
+
+       lxsspx          vs12,   o0,     TBUFFER
+       lxsspx          vs13,   o4,     TBUFFER
+       lxsspx          vs14,   o8,     TBUFFER
+       lxsspx          vs15,   o12,    TBUFFER
+
+       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
+       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
+       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
+       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+
+       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
+       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
+       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
+       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+
+       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
+       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
+       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
+       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
+       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+       add             T1,     T1,     LDC
+
+
+// N=2
+
+       mr              T2,     T1
+
+// N=2 M=0
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+       stxvw4x         vs48,   o0,     TBUFFER
+
+       lxsspx          vs8,    o0,     TBUFFER
+       lxsspx          vs9,    o4,     TBUFFER
+       lxsspx          vs10,   o8,     TBUFFER
+       lxsspx          vs11,   o12,    TBUFFER
+
+       stxvw4x         vs49,   o0,     TBUFFER
+
+       lxsspx          vs12,   o0,     TBUFFER
+       lxsspx          vs13,   o4,     TBUFFER
+       lxsspx          vs14,   o8,     TBUFFER
+       lxsspx          vs15,   o12,    TBUFFER
+
+       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
+       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
+       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
+       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+
+       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
+       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
+       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
+       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+
+       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
+       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
+       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
+       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
+       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+
+// N=2 M=2
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+       stxvw4x         vs50,   o0,     TBUFFER
+
+       lxsspx          vs8,    o0,     TBUFFER
+       lxsspx          vs9,    o4,     TBUFFER
+       lxsspx          vs10,   o8,     TBUFFER
+       lxsspx          vs11,   o12,    TBUFFER
+
+       stxvw4x         vs51,   o0,     TBUFFER
+
+       lxsspx          vs12,   o0,     TBUFFER
+       lxsspx          vs13,   o4,     TBUFFER
+       lxsspx          vs14,   o8,     TBUFFER
+       lxsspx          vs15,   o12,    TBUFFER
+
+       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
+       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
+       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
+       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+
+       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
+       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
+       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
+       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+
+       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
+       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
+       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
+       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
+       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+
+// N=2 M=4
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+       stxvw4x         vs52,   o0,     TBUFFER
+
+       lxsspx          vs8,    o0,     TBUFFER
+       lxsspx          vs9,    o4,     TBUFFER
+       lxsspx          vs10,   o8,     TBUFFER
+       lxsspx          vs11,   o12,    TBUFFER
+
+       stxvw4x         vs53,   o0,     TBUFFER
+
+       lxsspx          vs12,   o0,     TBUFFER
+       lxsspx          vs13,   o4,     TBUFFER
+       lxsspx          vs14,   o8,     TBUFFER
+       lxsspx          vs15,   o12,    TBUFFER
+
+       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
+       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
+       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
+       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+
+       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
+       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
+       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
+       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+
+       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
+       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
+       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
+       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
+       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+
+// N=2 M=6
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+       stxvw4x         vs54,   o0,     TBUFFER
+
+       lxsspx          vs8,    o0,     TBUFFER
+       lxsspx          vs9,    o4,     TBUFFER
+       lxsspx          vs10,   o8,     TBUFFER
+       lxsspx          vs11,   o12,    TBUFFER
+
+       stxvw4x         vs55,   o0,     TBUFFER
+
+       lxsspx          vs12,   o0,     TBUFFER
+       lxsspx          vs13,   o4,     TBUFFER
+       lxsspx          vs14,   o8,     TBUFFER
+       lxsspx          vs15,   o12,    TBUFFER
+
+       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
+       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
+       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
+       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+
+       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
+       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
+       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
+       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+
+       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
+       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
+       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
+       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
+       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+       add             T1,     T1,     LDC
+
+
+// N=3
+
+       mr              T2,     T1
+
+// N=3 M=0
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+       stxvw4x         vs56,   o0,     TBUFFER
+
+       lxsspx          vs8,    o0,     TBUFFER
+       lxsspx          vs9,    o4,     TBUFFER
+       lxsspx          vs10,   o8,     TBUFFER
+       lxsspx          vs11,   o12,    TBUFFER
+
+       stxvw4x         vs57,   o0,     TBUFFER
+
+       lxsspx          vs12,   o0,     TBUFFER
+       lxsspx          vs13,   o4,     TBUFFER
+       lxsspx          vs14,   o8,     TBUFFER
+       lxsspx          vs15,   o12,    TBUFFER
+
+       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
+       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
+       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
+       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+
+       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
+       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
+       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
+       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+
+       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
+       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
+       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
+       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
+       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+
+// N=3 M=2
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+       stxvw4x         vs58,   o0,     TBUFFER
+
+       lxsspx          vs8,    o0,     TBUFFER
+       lxsspx          vs9,    o4,     TBUFFER
+       lxsspx          vs10,   o8,     TBUFFER
+       lxsspx          vs11,   o12,    TBUFFER
+
+       stxvw4x         vs59,   o0,     TBUFFER
+
+       lxsspx          vs12,   o0,     TBUFFER
+       lxsspx          vs13,   o4,     TBUFFER
+       lxsspx          vs14,   o8,     TBUFFER
+       lxsspx          vs15,   o12,    TBUFFER
+
+       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
+       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
+       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
+       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+
+       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
+       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
+       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
+       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+
+       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
+       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
+       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
+       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
+       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+
+// N=3 M=4
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+       stxvw4x         vs60,   o0,     TBUFFER
+
+       lxsspx          vs8,    o0,     TBUFFER
+       lxsspx          vs9,    o4,     TBUFFER
+       lxsspx          vs10,   o8,     TBUFFER
+       lxsspx          vs11,   o12,    TBUFFER
+
+       stxvw4x         vs61,   o0,     TBUFFER
+
+       lxsspx          vs12,   o0,     TBUFFER
+       lxsspx          vs13,   o4,     TBUFFER
+       lxsspx          vs14,   o8,     TBUFFER
+       lxsspx          vs15,   o12,    TBUFFER
+
+       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
+       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
+       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
+       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+
+       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
+       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
+       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
+       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+
+       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
+       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
+       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
+       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
+       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+
+// N=3 M=6
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+       stxvw4x         vs62,   o0,     TBUFFER
+
+       lxsspx          vs8,    o0,     TBUFFER
+       lxsspx          vs9,    o4,     TBUFFER
+       lxsspx          vs10,   o8,     TBUFFER
+       lxsspx          vs11,   o12,    TBUFFER
+
+       stxvw4x         vs63,   o0,     TBUFFER
+
+       lxsspx          vs12,   o0,     TBUFFER
+       lxsspx          vs13,   o4,     TBUFFER
+       lxsspx          vs14,   o8,     TBUFFER
+       lxsspx          vs15,   o12,    TBUFFER
+
+       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
+       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
+       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
+       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+
+       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
+       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
+       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
+       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+
+       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
+       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
+       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
+       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
+       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     64
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=4
+**********************************************************************************************/
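+
+/* Register usage for the 4x4 complex kernels below:
+ *   vs0 = a0_r, a0_i, a1_r, a1_i          vs1 = a2_r, a2_i, a3_r, a3_i
+ *   vs8 .. vs15 = b0_r, b0_i, b1_r, b1_i, b2_r, b2_i, b3_r, b3_i,
+ *                 each splatted across all four lanes
+ * For B column j the partial products are collected in vs(32+4*j) .. vs(32+4*j+3):
+ *   a0/a1 * bj_r,  a0/a1 * bj_i,  a2/a3 * bj_r,  a2/a3 * bj_i
+ * so the complex recombination and the alpha scaling can be deferred to SAVE4x4.
+ */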
+
+.macro LOAD4x4_1
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs1,    o16,    AO              // load a2, a3
+
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+       lxvw4x          vs25,   o16,    BO              //  load b2, b3
+
+       xxspltw         vs12,   vs25,   0
+       xxspltw         vs13,   vs25,   1
+       xxspltw         vs14,   vs25,   2
+       xxspltw         vs15,   vs25,   3
+
+
+       addi            BO,     BO,     32
+
+.endm
+
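+/* KERNEL4x4_I1 and KERNEL4x4_1 multiply the values already sitting in
+ * vs0,vs1 / vs8..vs15 while loading the next k-step into vs4,vs5 / vs16..vs23;
+ * KERNEL4x4_2 does the opposite, and KERNEL4x4_E2 only drains the last
+ * prefetched set, so the main loop can alternate the two register sets
+ * without stalling on the loads.
+ */
+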
+.macro KERNEL4x4_I1
+
+
+       lxvw4x          vs4,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs5,    o16,    AO              // load a2, a3
+
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs16,   vs24,   0
+       xxspltw         vs17,   vs24,   1
+       xxspltw         vs18,   vs24,   2
+       xxspltw         vs19,   vs24,   3
+
+       lxvw4x          vs25,   o16,    BO              //  load b2, b3
+
+       xxspltw         vs20,   vs25,   0
+       xxspltw         vs21,   vs25,   1
+       xxspltw         vs22,   vs25,   2
+       xxspltw         vs23,   vs25,   3
+
+
+       addi            BO,     BO,     32
+
+
+       xvmulsp         vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmulsp         vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmulsp         vs34,   vs1,    vs8             // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmulsp         vs35,   vs1,    vs9             // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+
+       xvmulsp         vs36,   vs0,    vs10            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmulsp         vs37,   vs0,    vs11            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+       xvmulsp         vs38,   vs1,    vs10            // a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+       xvmulsp         vs39,   vs1,    vs11            // a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+
+       xvmulsp         vs40,   vs0,    vs12            // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+       xvmulsp         vs41,   vs0,    vs13            // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+       xvmulsp         vs42,   vs1,    vs12            // a2_r*b2_r, a2_i*b2_r, a3_r*b2_r, a3_i*b2_r
+       xvmulsp         vs43,   vs1,    vs13            // a2_r*b2_i, a2_i*b2_i, a3_r*b2_i, a3_i*b2_i
+
+       xvmulsp         vs44,   vs0,    vs14            // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+       xvmulsp         vs45,   vs0,    vs15            // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+       xvmulsp         vs46,   vs1,    vs14            // a2_r*b3_r, a2_i*b3_r, a3_r*b3_r, a3_i*b3_r
+       xvmulsp         vs47,   vs1,    vs15            // a2_r*b3_i, a2_i*b3_i, a3_r*b3_i, a3_i*b3_i
+
+
+.endm
+
+.macro KERNEL4x4_1
+
+
+       lxvw4x          vs4,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs5,    o16,    AO              // load a2, a3
+
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs16,   vs24,   0
+       xxspltw         vs17,   vs24,   1
+       xxspltw         vs18,   vs24,   2
+       xxspltw         vs19,   vs24,   3
+
+       lxvw4x          vs25,   o16,    BO              //  load b2, b3
+
+       xxspltw         vs20,   vs25,   0
+       xxspltw         vs21,   vs25,   1
+       xxspltw         vs22,   vs25,   2
+       xxspltw         vs23,   vs25,   3
+
+
+       addi            BO,     BO,     32
+
+
+       xvmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmaddasp       vs34,   vs1,    vs8             // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmaddasp       vs35,   vs1,    vs9             // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+
+       xvmaddasp       vs36,   vs0,    vs10            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmaddasp       vs37,   vs0,    vs11            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+       xvmaddasp       vs38,   vs1,    vs10            // a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+       xvmaddasp       vs39,   vs1,    vs11            // a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+
+       xvmaddasp       vs40,   vs0,    vs12            // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+       xvmaddasp       vs41,   vs0,    vs13            // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+       xvmaddasp       vs42,   vs1,    vs12            // a2_r*b2_r, a2_i*b2_r, a3_r*b2_r, a3_i*b2_r
+       xvmaddasp       vs43,   vs1,    vs13            // a2_r*b2_i, a2_i*b2_i, a3_r*b2_i, a3_i*b2_i
+
+       xvmaddasp       vs44,   vs0,    vs14            // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+       xvmaddasp       vs45,   vs0,    vs15            // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+       xvmaddasp       vs46,   vs1,    vs14            // a2_r*b3_r, a2_i*b3_r, a3_r*b3_r, a3_i*b3_r
+       xvmaddasp       vs47,   vs1,    vs15            // a2_r*b3_i, a2_i*b3_i, a3_r*b3_i, a3_i*b3_i
+
+
+.endm
+
+.macro KERNEL4x4_2
+
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs1,    o16,    AO              // load a2, a3
+
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+       lxvw4x          vs25,   o16,    BO              //  load b2, b3
+
+       xxspltw         vs12,   vs25,   0
+       xxspltw         vs13,   vs25,   1
+       xxspltw         vs14,   vs25,   2
+       xxspltw         vs15,   vs25,   3
+
+
+       addi            BO,     BO,     32
+
+
+       xvmaddasp       vs32,   vs4,    vs16            // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs4,    vs17            // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmaddasp       vs34,   vs5,    vs16            // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmaddasp       vs35,   vs5,    vs17            // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+
+       xvmaddasp       vs36,   vs4,    vs18            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmaddasp       vs37,   vs4,    vs19            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+       xvmaddasp       vs38,   vs5,    vs18            // a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+       xvmaddasp       vs39,   vs5,    vs19            // a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+
+       xvmaddasp       vs40,   vs4,    vs20            // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+       xvmaddasp       vs41,   vs4,    vs21            // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+       xvmaddasp       vs42,   vs5,    vs20            // a2_r*b2_r, a2_i*b2_r, a3_r*b2_r, a3_i*b2_r
+       xvmaddasp       vs43,   vs5,    vs21            // a2_r*b2_i, a2_i*b2_i, a3_r*b2_i, a3_i*b2_i
+
+       xvmaddasp       vs44,   vs4,    vs22            // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+       xvmaddasp       vs45,   vs4,    vs23            // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+       xvmaddasp       vs46,   vs5,    vs22            // a2_r*b3_r, a2_i*b3_r, a3_r*b3_r, a3_i*b3_r
+       xvmaddasp       vs47,   vs5,    vs23            // a2_r*b3_i, a2_i*b3_i, a3_r*b3_i, a3_i*b3_i
+
+
+.endm
+
+.macro KERNEL4x4_E2
+
+
+       xvmaddasp       vs32,   vs4,    vs16            // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs4,    vs17            // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmaddasp       vs34,   vs5,    vs16            // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmaddasp       vs35,   vs5,    vs17            // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+
+       xvmaddasp       vs36,   vs4,    vs18            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmaddasp       vs37,   vs4,    vs19            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+       xvmaddasp       vs38,   vs5,    vs18            // a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+       xvmaddasp       vs39,   vs5,    vs19            // a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+
+       xvmaddasp       vs40,   vs4,    vs20            // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+       xvmaddasp       vs41,   vs4,    vs21            // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+       xvmaddasp       vs42,   vs5,    vs20            // a2_r*b2_r, a2_i*b2_r, a3_r*b2_r, a3_i*b2_r
+       xvmaddasp       vs43,   vs5,    vs21            // a2_r*b2_i, a2_i*b2_i, a3_r*b2_i, a3_i*b2_i
+
+       xvmaddasp       vs44,   vs4,    vs22            // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+       xvmaddasp       vs45,   vs4,    vs23            // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+       xvmaddasp       vs46,   vs5,    vs22            // a2_r*b3_r, a2_i*b3_r, a3_r*b3_r, a3_i*b3_r
+       xvmaddasp       vs47,   vs5,    vs23            // a2_r*b3_i, a2_i*b3_i, a3_r*b3_i, a3_i*b3_i
+
+
+.endm
+
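+/* KERNEL4x4_SUBI1 (first k-step, plain multiply) and KERNEL4x4_SUB1
+ * (multiply-add) are the unpipelined one-step variants; the loop logic in
+ * cgemm_logic_8x4_power8.S presumably uses them for the K remainder that
+ * does not fill a full pipelined pair.
+ */
+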
+.macro KERNEL4x4_SUBI1
+
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs1,    o16,    AO              // load a2, a3
+
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+       lxvw4x          vs25,   o16,    BO              //  load b2, b3
+
+       xxspltw         vs12,   vs25,   0
+       xxspltw         vs13,   vs25,   1
+       xxspltw         vs14,   vs25,   2
+       xxspltw         vs15,   vs25,   3
+
+
+       addi            BO,     BO,     32
+
+
+       xvmulsp         vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmulsp         vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmulsp         vs34,   vs1,    vs8             // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmulsp         vs35,   vs1,    vs9             // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+
+       xvmulsp         vs36,   vs0,    vs10            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmulsp         vs37,   vs0,    vs11            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+       xvmulsp         vs38,   vs1,    vs10            // a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+       xvmulsp         vs39,   vs1,    vs11            // a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+
+       xvmulsp         vs40,   vs0,    vs12            // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+       xvmulsp         vs41,   vs0,    vs13            // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+       xvmulsp         vs42,   vs1,    vs12            // a2_r*b2_r, a2_i*b2_r, a3_r*b2_r, a3_i*b2_r
+       xvmulsp         vs43,   vs1,    vs13            // a2_r*b2_i, a2_i*b2_i, a3_r*b2_i, a3_i*b2_i
+
+       xvmulsp         vs44,   vs0,    vs14            // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+       xvmulsp         vs45,   vs0,    vs15            // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+       xvmulsp         vs46,   vs1,    vs14            // a2_r*b3_r, a2_i*b3_r, a3_r*b3_r, a3_i*b3_r
+       xvmulsp         vs47,   vs1,    vs15            // a2_r*b3_i, a2_i*b3_i, a3_r*b3_i, a3_i*b3_i
+
+
+.endm
+
+.macro KERNEL4x4_SUB1
+
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs1,    o16,    AO              // load a2, a3
+
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+       lxvw4x          vs25,   o16,    BO              //  load b2, b3
+
+       xxspltw         vs12,   vs25,   0
+       xxspltw         vs13,   vs25,   1
+       xxspltw         vs14,   vs25,   2
+       xxspltw         vs15,   vs25,   3
+
+
+       addi            BO,     BO,     32
+
+
+       xvmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmaddasp       vs34,   vs1,    vs8             // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmaddasp       vs35,   vs1,    vs9             // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+
+       xvmaddasp       vs36,   vs0,    vs10            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmaddasp       vs37,   vs0,    vs11            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+       xvmaddasp       vs38,   vs1,    vs10            // a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+       xvmaddasp       vs39,   vs1,    vs11            // a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+
+       xvmaddasp       vs40,   vs0,    vs12            // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+       xvmaddasp       vs41,   vs0,    vs13            // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+       xvmaddasp       vs42,   vs1,    vs12            // a2_r*b2_r, a2_i*b2_r, a3_r*b2_r, a3_i*b2_r
+       xvmaddasp       vs43,   vs1,    vs13            // a2_r*b2_i, a2_i*b2_i, a3_r*b2_i, a3_i*b2_i
+
+       xvmaddasp       vs44,   vs0,    vs14            // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+       xvmaddasp       vs45,   vs0,    vs15            // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+       xvmaddasp       vs46,   vs1,    vs14            // a2_r*b3_r, a2_i*b3_r, a3_r*b3_r, a3_i*b3_r
+       xvmaddasp       vs47,   vs1,    vs15            // a2_r*b3_i, a2_i*b3_i, a3_r*b3_i, a3_i*b3_i
+
+
+.endm
+
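+/* SAVE4x4 walks the four C columns (N=0..3) in steps of LDC, handling two
+ * complex elements (one 16-byte vector) per step within a column.  The packed
+ * partial products are spilled to TBUFFER and reloaded as scalars, then
+ * combined with XSFADD_R1/R2/I1/I2 (defined elsewhere in these kernel sources;
+ * they are expected to expand to xsaddsp/xssubsp with the signs required by
+ * the conjugation variant) into r0 and r1, and finally scaled by alpha:
+ *   c_r += r_r * alpha_r - r_i * alpha_i
+ *   c_i += r_r * alpha_i + r_i * alpha_r
+ * For TRMMKERNEL the old C value is not loaded, so the result replaces C.
+ * The a0/a1 and b0 names in the per-block comments are generic: every block
+ * applies the same pattern to its own pair of elements and its own column.
+ */
+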
+.macro SAVE4x4
+
+       mr              T1,     CO
+
+// N=0
+
+       mr              T2,     T1
+
+// N=0 M=0
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+       stxvw4x         vs32,   o0,     TBUFFER
+
+       lxsspx          vs8,    o0,     TBUFFER
+       lxsspx          vs9,    o4,     TBUFFER
+       lxsspx          vs10,   o8,     TBUFFER
+       lxsspx          vs11,   o12,    TBUFFER
+
+       stxvw4x         vs33,   o0,     TBUFFER
+
+       lxsspx          vs12,   o0,     TBUFFER
+       lxsspx          vs13,   o4,     TBUFFER
+       lxsspx          vs14,   o8,     TBUFFER
+       lxsspx          vs15,   o12,    TBUFFER
+
+       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
+       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
+       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
+       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+
+       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
+       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
+       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
+       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+
+       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
+       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
+       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
+       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
+       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+
+// N=0 M=2
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+       stxvw4x         vs34,   o0,     TBUFFER
+
+       lxsspx          vs8,    o0,     TBUFFER
+       lxsspx          vs9,    o4,     TBUFFER
+       lxsspx          vs10,   o8,     TBUFFER
+       lxsspx          vs11,   o12,    TBUFFER
+
+       stxvw4x         vs35,   o0,     TBUFFER
+
+       lxsspx          vs12,   o0,     TBUFFER
+       lxsspx          vs13,   o4,     TBUFFER
+       lxsspx          vs14,   o8,     TBUFFER
+       lxsspx          vs15,   o12,    TBUFFER
+
+       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
+       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
+       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
+       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+
+       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
+       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
+       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
+       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+
+       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
+       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
+       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
+       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
+       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+       add             T1,     T1,     LDC
+
+
+// N=1
+
+       mr              T2,     T1
+
+// N=1 M=0
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+       stxvw4x         vs36,   o0,     TBUFFER
+
+       lxsspx          vs8,    o0,     TBUFFER
+       lxsspx          vs9,    o4,     TBUFFER
+       lxsspx          vs10,   o8,     TBUFFER
+       lxsspx          vs11,   o12,    TBUFFER
+
+       stxvw4x         vs37,   o0,     TBUFFER
+
+       lxsspx          vs12,   o0,     TBUFFER
+       lxsspx          vs13,   o4,     TBUFFER
+       lxsspx          vs14,   o8,     TBUFFER
+       lxsspx          vs15,   o12,    TBUFFER
+
+       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
+       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
+       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
+       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+
+       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
+       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
+       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
+       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+
+       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
+       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
+       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
+       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
+       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+
+// N=1 M=2
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+       stxvw4x         vs38,   o0,     TBUFFER
+
+       lxsspx          vs8,    o0,     TBUFFER
+       lxsspx          vs9,    o4,     TBUFFER
+       lxsspx          vs10,   o8,     TBUFFER
+       lxsspx          vs11,   o12,    TBUFFER
+
+       stxvw4x         vs39,   o0,     TBUFFER
+
+       lxsspx          vs12,   o0,     TBUFFER
+       lxsspx          vs13,   o4,     TBUFFER
+       lxsspx          vs14,   o8,     TBUFFER
+       lxsspx          vs15,   o12,    TBUFFER
+
+       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
+       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
+       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
+       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+
+       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
+       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
+       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
+       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+
+       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
+       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
+       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
+       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
+       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+       add             T1,     T1,     LDC
+
+
+// N=2
+
+       mr              T2,     T1
+
+// N=2 M=0
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+       stxvw4x         vs40,   o0,     TBUFFER
+
+       lxsspx          vs8,    o0,     TBUFFER
+       lxsspx          vs9,    o4,     TBUFFER
+       lxsspx          vs10,   o8,     TBUFFER
+       lxsspx          vs11,   o12,    TBUFFER
+
+       stxvw4x         vs41,   o0,     TBUFFER
+
+       lxsspx          vs12,   o0,     TBUFFER
+       lxsspx          vs13,   o4,     TBUFFER
+       lxsspx          vs14,   o8,     TBUFFER
+       lxsspx          vs15,   o12,    TBUFFER
+
+       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
+       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
+       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
+       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+
+       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
+       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
+       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
+       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+
+       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
+       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
+       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
+       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
+       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+
+// N=2 M=2
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+       stxvw4x         vs42,   o0,     TBUFFER
+
+       lxsspx          vs8,    o0,     TBUFFER
+       lxsspx          vs9,    o4,     TBUFFER
+       lxsspx          vs10,   o8,     TBUFFER
+       lxsspx          vs11,   o12,    TBUFFER
+
+       stxvw4x         vs43,   o0,     TBUFFER
+
+       lxsspx          vs12,   o0,     TBUFFER
+       lxsspx          vs13,   o4,     TBUFFER
+       lxsspx          vs14,   o8,     TBUFFER
+       lxsspx          vs15,   o12,    TBUFFER
+
+       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
+       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
+       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
+       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+
+       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
+       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
+       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
+       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+
+       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
+       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
+       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
+       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
+       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+       add             T1,     T1,     LDC
+
+
+// N=3
+
+       mr              T2,     T1
+
+// N=3 M=0
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+       stxvw4x         vs44,   o0,     TBUFFER
+
+       lxsspx          vs8,    o0,     TBUFFER
+       lxsspx          vs9,    o4,     TBUFFER
+       lxsspx          vs10,   o8,     TBUFFER
+       lxsspx          vs11,   o12,    TBUFFER
+
+       stxvw4x         vs45,   o0,     TBUFFER
+
+       lxsspx          vs12,   o0,     TBUFFER
+       lxsspx          vs13,   o4,     TBUFFER
+       lxsspx          vs14,   o8,     TBUFFER
+       lxsspx          vs15,   o12,    TBUFFER
+
+       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
+       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
+       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
+       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+
+       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
+       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
+       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
+       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+
+       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
+       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
+       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
+       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
+       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+
+// N=3 M=2
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+       stxvw4x         vs46,   o0,     TBUFFER
+
+       lxsspx          vs8,    o0,     TBUFFER
+       lxsspx          vs9,    o4,     TBUFFER
+       lxsspx          vs10,   o8,     TBUFFER
+       lxsspx          vs11,   o12,    TBUFFER
+
+       stxvw4x         vs47,   o0,     TBUFFER
+
+       lxsspx          vs12,   o0,     TBUFFER
+       lxsspx          vs13,   o4,     TBUFFER
+       lxsspx          vs14,   o8,     TBUFFER
+       lxsspx          vs15,   o12,    TBUFFER
+
+       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
+       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
+       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
+       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+
+       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
+       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
+       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
+       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+
+       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
+       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
+       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
+       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
+       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     32
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=2
+**********************************************************************************************/
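+
+/* Same scheme as the 4x4 macros above, but with a single A vector per k-step
+ * (vs0 = a0_r, a0_i, a1_r, a1_i), so only vs32..vs39 are needed:
+ * vs(32+2*j) and vs(32+2*j+1) hold a*bj_r and a*bj_i for B column j.
+ */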
+
+.macro LOAD4x2_1
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+       lxvw4x          vs25,   o16,    BO              //  load b2, b3
+
+       xxspltw         vs12,   vs25,   0
+       xxspltw         vs13,   vs25,   1
+       xxspltw         vs14,   vs25,   2
+       xxspltw         vs15,   vs25,   3
+
+
+       addi            BO,     BO,     32
+
+.endm
+
+.macro KERNEL4x2_I1
+
+
+       lxvw4x          vs4,    o0,     AO              // load a0, a1
+
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs16,   vs24,   0
+       xxspltw         vs17,   vs24,   1
+       xxspltw         vs18,   vs24,   2
+       xxspltw         vs19,   vs24,   3
+
+       lxvw4x          vs25,   o16,    BO              //  load b2, b3
+
+       xxspltw         vs20,   vs25,   0
+       xxspltw         vs21,   vs25,   1
+       xxspltw         vs22,   vs25,   2
+       xxspltw         vs23,   vs25,   3
+
+
+       addi            BO,     BO,     32
+
+
+       xvmulsp         vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmulsp         vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+       xvmulsp         vs34,   vs0,    vs10            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmulsp         vs35,   vs0,    vs11            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+       xvmulsp         vs36,   vs0,    vs12            // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+       xvmulsp         vs37,   vs0,    vs13            // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+
+       xvmulsp         vs38,   vs0,    vs14            // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+       xvmulsp         vs39,   vs0,    vs15            // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+
+
+.endm
+
+.macro KERNEL4x2_1
+
+
+       lxvw4x          vs4,    o0,     AO              // load a0, a1
+
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs16,   vs24,   0
+       xxspltw         vs17,   vs24,   1
+       xxspltw         vs18,   vs24,   2
+       xxspltw         vs19,   vs24,   3
+
+       lxvw4x          vs25,   o16,    BO              //  load b2, b3
+
+       xxspltw         vs20,   vs25,   0
+       xxspltw         vs21,   vs25,   1
+       xxspltw         vs22,   vs25,   2
+       xxspltw         vs23,   vs25,   3
+
+
+       addi            BO,     BO,     32
+
+
+       xvmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+       xvmaddasp       vs34,   vs0,    vs10            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmaddasp       vs35,   vs0,    vs11            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+       xvmaddasp       vs36,   vs0,    vs12            // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+       xvmaddasp       vs37,   vs0,    vs13            // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+
+       xvmaddasp       vs38,   vs0,    vs14            // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+       xvmaddasp       vs39,   vs0,    vs15            // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+
+
+.endm
+
+.macro KERNEL4x2_2
+
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+       lxvw4x          vs25,   o16,    BO              //  load b2, b3
+
+       xxspltw         vs12,   vs25,   0
+       xxspltw         vs13,   vs25,   1
+       xxspltw         vs14,   vs25,   2
+       xxspltw         vs15,   vs25,   3
+
+
+       addi            BO,     BO,     32
+
+
+       xvmaddasp       vs32,   vs4,    vs16            // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs4,    vs17            // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+       xvmaddasp       vs34,   vs4,    vs18            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmaddasp       vs35,   vs4,    vs19            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+       xvmaddasp       vs36,   vs4,    vs20            // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+       xvmaddasp       vs37,   vs4,    vs21            // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+
+       xvmaddasp       vs38,   vs4,    vs22            // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+       xvmaddasp       vs39,   vs4,    vs23            // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+
+
+.endm
+
+.macro KERNEL4x2_E2
+
+
+       xvmaddasp       vs32,   vs4,    vs16            // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs4,    vs17            // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+       xvmaddasp       vs34,   vs4,    vs18            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmaddasp       vs35,   vs4,    vs19            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+       xvmaddasp       vs36,   vs4,    vs20            // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+       xvmaddasp       vs37,   vs4,    vs21            // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+
+       xvmaddasp       vs38,   vs4,    vs22            // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+       xvmaddasp       vs39,   vs4,    vs23            // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+
+
+.endm
+
+.macro KERNEL4x2_SUBI1
+
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+       lxvw4x          vs25,   o16,    BO              //  load b2, b3
+
+       xxspltw         vs12,   vs25,   0
+       xxspltw         vs13,   vs25,   1
+       xxspltw         vs14,   vs25,   2
+       xxspltw         vs15,   vs25,   3
+
+
+       addi            BO,     BO,     32
+
+
+       xvmulsp         vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmulsp         vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+       xvmulsp         vs34,   vs0,    vs10            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmulsp         vs35,   vs0,    vs11            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+       xvmulsp         vs36,   vs0,    vs12            // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+       xvmulsp         vs37,   vs0,    vs13            // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+
+       xvmulsp         vs38,   vs0,    vs14            // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+       xvmulsp         vs39,   vs0,    vs15            // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+
+
+.endm
+
+.macro KERNEL4x2_SUB1
+
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+       lxvw4x          vs25,   o16,    BO              //  load b2, b3
+
+       xxspltw         vs12,   vs25,   0
+       xxspltw         vs13,   vs25,   1
+       xxspltw         vs14,   vs25,   2
+       xxspltw         vs15,   vs25,   3
+
+
+       addi            BO,     BO,     32
+
+
+       xvmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+       xvmaddasp       vs34,   vs0,    vs10            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmaddasp       vs35,   vs0,    vs11            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+       xvmaddasp       vs36,   vs0,    vs12            // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+       xvmaddasp       vs37,   vs0,    vs13            // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+
+       xvmaddasp       vs38,   vs0,    vs14            // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+       xvmaddasp       vs39,   vs0,    vs15            // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+
+
+.endm
+
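+/* SAVE4x2 stores one 16-byte vector (two complex results) per C column and
+ * then advances by LDC, using the same per-column reduction as SAVE4x4.
+ */
+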
+.macro SAVE4x2
+
+       mr              T1,     CO
+
+// N=0
+
+       mr              T2,     T1
+
+// N=0 M=0
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+       stxvw4x         vs32,   o0,     TBUFFER
+
+       lxsspx          vs8,    o0,     TBUFFER
+       lxsspx          vs9,    o4,     TBUFFER
+       lxsspx          vs10,   o8,     TBUFFER
+       lxsspx          vs11,   o12,    TBUFFER
+
+       stxvw4x         vs33,   o0,     TBUFFER
+
+       lxsspx          vs12,   o0,     TBUFFER
+       lxsspx          vs13,   o4,     TBUFFER
+       lxsspx          vs14,   o8,     TBUFFER
+       lxsspx          vs15,   o12,    TBUFFER
+
+       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
+       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
+       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
+       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+
+       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
+       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
+       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
+       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+
+       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
+       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
+       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
+       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
+       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+       add             T1,     T1,     LDC
+
+
+// N=1
+
+       mr              T2,     T1
+
+// N=1 M=0
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+       stxvw4x         vs34,   o0,     TBUFFER
+
+       lxsspx          vs8,    o0,     TBUFFER
+       lxsspx          vs9,    o4,     TBUFFER
+       lxsspx          vs10,   o8,     TBUFFER
+       lxsspx          vs11,   o12,    TBUFFER
+
+       stxvw4x         vs35,   o0,     TBUFFER
+
+       lxsspx          vs12,   o0,     TBUFFER
+       lxsspx          vs13,   o4,     TBUFFER
+       lxsspx          vs14,   o8,     TBUFFER
+       lxsspx          vs15,   o12,    TBUFFER
+
+       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
+       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
+       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
+       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+
+       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
+       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
+       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
+       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+
+       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
+       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
+       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
+       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
+       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+       add             T1,     T1,     LDC
+
+
+// N=2
+
+       mr              T2,     T1
+
+// N=2 M=0
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+       stxvw4x         vs36,   o0,     TBUFFER
+
+       lxsspx          vs8,    o0,     TBUFFER
+       lxsspx          vs9,    o4,     TBUFFER
+       lxsspx          vs10,   o8,     TBUFFER
+       lxsspx          vs11,   o12,    TBUFFER
+
+       stxvw4x         vs37,   o0,     TBUFFER
+
+       lxsspx          vs12,   o0,     TBUFFER
+       lxsspx          vs13,   o4,     TBUFFER
+       lxsspx          vs14,   o8,     TBUFFER
+       lxsspx          vs15,   o12,    TBUFFER
+
+       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
+       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
+       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
+       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+
+       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
+       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
+       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
+       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+
+       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
+       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
+       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
+       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
+       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+       add             T1,     T1,     LDC
+
+
+// N=3
+
+       mr              T2,     T1
+
+// N=3 M=0
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+       stxvw4x         vs38,   o0,     TBUFFER
+
+       lxsspx          vs8,    o0,     TBUFFER
+       lxsspx          vs9,    o4,     TBUFFER
+       lxsspx          vs10,   o8,     TBUFFER
+       lxsspx          vs11,   o12,    TBUFFER
+
+       stxvw4x         vs39,   o0,     TBUFFER
+
+       lxsspx          vs12,   o0,     TBUFFER
+       lxsspx          vs13,   o4,     TBUFFER
+       lxsspx          vs14,   o8,     TBUFFER
+       lxsspx          vs15,   o12,    TBUFFER
+
+       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
+       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
+       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
+       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+
+       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
+       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
+       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
+       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+
+       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
+       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
+       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
+       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
+       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     16
+
+.endm
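+
+/* Editorial note (illustrative sketch only, not assembled): the SAVE
+ * sequences above spill each pair of vector accumulators through the
+ * scratch area TBUFFER and re-read them as scalars, so the per-element
+ * partial sums can be recombined and scaled by alpha before the C tile is
+ * updated.  The names acc_rr/acc_ii/acc_ri/acc_ir below are placeholders
+ * for the a_r*b_r, a_i*b_i, a_r*b_i and a_i*b_r partial products; the NN
+ * (non-conjugated) case is shown, and the XSFADD_* macros presumably flip
+ * the signs of the a_i terms for the conjugate variants:
+ *
+ *     float res_r = acc_rr - acc_ii;                 // XSFADD_R1 / XSFADD_R2
+ *     float res_i = acc_ri + acc_ir;                 // XSFADD_I1 / XSFADD_I2
+ *     c_r += res_r * alpha_r - res_i * alpha_i;
+ *     c_i += res_r * alpha_i + res_i * alpha_r;
+ */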
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=1
+**********************************************************************************************/
+
+.macro LOAD4x1_1
+
+       lxsspx          vs0,    o0,     AO              // load a0_r
+       lxsspx          vs1,    o4,     AO              // load a0_i
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1              //  load b0_r
+       lxsspx          vs9,    o4,     T1              //  load b0_i
+
+       addi            T1,     T1,8
+
+       lxsspx          vs10,   o0,     T1              //  load b1_r
+       lxsspx          vs11,   o4,     T1              //  load b1_i
+
+       addi            T1,     T1,8
+
+       lxsspx          vs12,   o0,     T1              //  load b2_r
+       lxsspx          vs13,   o4,     T1              //  load b2_i
+
+       addi            T1,     T1,8
+
+       lxsspx          vs14,   o0,     T1              //  load b3_r
+       lxsspx          vs15,   o4,     T1              //  load b3_i
+
+       addi            BO,     BO,     32
+
+.endm
+
+.macro KERNEL4x1_I1
+
+
+       lxsspx          vs4,    o0,     AO              // load a0_r
+       lxsspx          vs5,    o4,     AO              // load a0_i
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs16,   o0,     T1              //  load b0_r
+       lxsspx          vs17,   o4,     T1              //  load b0_i
+
+       addi            T1,     T1,8
+
+       lxsspx          vs18,   o0,     T1              //  load b1_r
+       lxsspx          vs19,   o4,     T1              //  load b1_i
+
+       addi            T1,     T1,8
+
+       lxsspx          vs20,   o0,     T1              //  load b2_r
+       lxsspx          vs21,   o4,     T1              //  load b2_i
+
+       addi            T1,     T1,8
+
+       lxsspx          vs22,   o0,     T1              //  load b3_r
+       lxsspx          vs23,   o4,     T1              //  load b3_i
+
+       addi            BO,     BO,     32
+
+
+       xsmulsp         vs32,   vs0,    vs8             // a0_r*b0_r
+       xsmulsp         vs33,   vs1,    vs9             // a0_i*b0_i
+       xsmulsp         vs34,   vs0,    vs9             // a0_r*b0_i
+       xsmulsp         vs35,   vs1,    vs8             // a0_i*b0_r
+
+       xsmulsp         vs36,   vs0,    vs10            // a0_r*b1_r
+       xsmulsp         vs37,   vs1,    vs11            // a0_i*b1_i
+       xsmulsp         vs38,   vs0,    vs11            // a0_r*b1_i
+       xsmulsp         vs39,   vs1,    vs10            // a0_i*b1_r
+
+       xsmulsp         vs40,   vs0,    vs12            // a0_r*b2_r
+       xsmulsp         vs41,   vs1,    vs13            // a0_i*b2_i
+       xsmulsp         vs42,   vs0,    vs13            // a0_r*b2_i
+       xsmulsp         vs43,   vs1,    vs12            // a0_i*b2_r
+
+       xsmulsp         vs44,   vs0,    vs14            // a0_r*b3_r
+       xsmulsp         vs45,   vs1,    vs15            // a0_i*b3_i
+       xsmulsp         vs46,   vs0,    vs15            // a0_r*b3_i
+       xsmulsp         vs47,   vs1,    vs14            // a0_i*b3_r
+
+
+.endm
+
+.macro KERNEL4x1_1
+
+
+       lxsspx          vs4,    o0,     AO              // load a0_r
+       lxsspx          vs5,    o4,     AO              // load a0_i
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs16,   o0,     T1              //  load b0_r
+       lxsspx          vs17,   o4,     T1              //  load b0_i
+
+       addi            T1,     T1,8
+
+       lxsspx          vs18,   o0,     T1              //  load b1_r
+       lxsspx          vs19,   o4,     T1              //  load b1_i
+
+       addi            T1,     T1,8
+
+       lxsspx          vs20,   o0,     T1              //  load b2_r
+       lxsspx          vs21,   o4,     T1              //  load b2_i
+
+       addi            T1,     T1,8
+
+       lxsspx          vs22,   o0,     T1              //  load b3_r
+       lxsspx          vs23,   o4,     T1              //  load b3_i
+
+       addi            BO,     BO,     32
+
+
+       xsmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r
+       xsmaddasp       vs33,   vs1,    vs9             // a0_i*b0_i
+       xsmaddasp       vs34,   vs0,    vs9             // a0_r*b0_i
+       xsmaddasp       vs35,   vs1,    vs8             // a0_i*b0_r
+
+       xsmaddasp       vs36,   vs0,    vs10            // a0_r*b1_r
+       xsmaddasp       vs37,   vs1,    vs11            // a0_i*b1_i
+       xsmaddasp       vs38,   vs0,    vs11            // a0_r*b1_i
+       xsmaddasp       vs39,   vs1,    vs10            // a0_i*b1_r
+
+       xsmaddasp       vs40,   vs0,    vs12            // a0_r*b2_r
+       xsmaddasp       vs41,   vs1,    vs13            // a0_i*b2_i
+       xsmaddasp       vs42,   vs0,    vs13            // a0_r*b2_i
+       xsmaddasp       vs43,   vs1,    vs12            // a0_i*b2_r
+
+       xsmaddasp       vs44,   vs0,    vs14            // a0_r*b3_r
+       xsmaddasp       vs45,   vs1,    vs15            // a0_i*b3_i
+       xsmaddasp       vs46,   vs0,    vs15            // a0_r*b3_i
+       xsmaddasp       vs47,   vs1,    vs14            // a0_i*b3_r
+
+
+.endm
+
+.macro KERNEL4x1_2
+
+
+       lxsspx          vs0,    o0,     AO              // load a0_r
+       lxsspx          vs1,    o4,     AO              // load a0_i
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1              //  load b0_r
+       lxsspx          vs9,    o4,     T1              //  load b0_i
+
+       addi            T1,     T1,8
+
+       lxsspx          vs10,   o0,     T1              //  load b1_r
+       lxsspx          vs11,   o4,     T1              //  load b1_i
+
+       addi            T1,     T1,8
+
+       lxsspx          vs12,   o0,     T1              //  load b2_r
+       lxsspx          vs13,   o4,     T1              //  load b2_i
+
+       addi            T1,     T1,8
+
+       lxsspx          vs14,   o0,     T1              //  load b3_r
+       lxsspx          vs15,   o4,     T1              //  load b3_i
+
+       addi            BO,     BO,     32
+
+
+       xsmaddasp       vs32,   vs4,    vs16            // a0_r*b0_r
+       xsmaddasp       vs33,   vs5,    vs17            // a0_i*b0_i
+       xsmaddasp       vs34,   vs4,    vs17            // a0_r*b0_i
+       xsmaddasp       vs35,   vs5,    vs16            // a0_i*b0_r
+
+       xsmaddasp       vs36,   vs4,    vs18            // a0_r*b1_r
+       xsmaddasp       vs37,   vs5,    vs19            // a0_i*b1_i
+       xsmaddasp       vs38,   vs4,    vs19            // a0_r*b1_i
+       xsmaddasp       vs39,   vs5,    vs18            // a0_i*b1_r
+
+       xsmaddasp       vs40,   vs4,    vs20            // a0_r*b2_r
+       xsmaddasp       vs41,   vs5,    vs21            // a0_i*b2_i
+       xsmaddasp       vs42,   vs4,    vs21            // a0_r*b2_i
+       xsmaddasp       vs43,   vs5,    vs20            // a0_i*b2_r
+
+       xsmaddasp       vs44,   vs4,    vs22            // a0_r*b3_r
+       xsmaddasp       vs45,   vs5,    vs23            // a0_i*b3_i
+       xsmaddasp       vs46,   vs4,    vs23            // a0_r*b3_i
+       xsmaddasp       vs47,   vs5,    vs22            // a0_i*b3_r
+
+
+.endm
+
+.macro KERNEL4x1_E2
+
+
+       xsmaddasp       vs32,   vs4,    vs16            // a0_r*b0_r
+       xsmaddasp       vs33,   vs5,    vs17            // a0_i*b0_i
+       xsmaddasp       vs34,   vs4,    vs17            // a0_r*b0_i
+       xsmaddasp       vs35,   vs5,    vs16            // a0_i*b0_r
+
+       xsmaddasp       vs36,   vs4,    vs18            // a0_r*b1_r
+       xsmaddasp       vs37,   vs5,    vs19            // a0_i*b1_i
+       xsmaddasp       vs38,   vs4,    vs19            // a0_r*b1_i
+       xsmaddasp       vs39,   vs5,    vs18            // a0_i*b1_r
+
+       xsmaddasp       vs40,   vs4,    vs20            // a0_r*b2_r
+       xsmaddasp       vs41,   vs5,    vs21            // a0_i*b2_i
+       xsmaddasp       vs42,   vs4,    vs21            // a0_r*b2_i
+       xsmaddasp       vs43,   vs5,    vs20            // a0_i*b2_r
+
+       xsmaddasp       vs44,   vs4,    vs22            // a0_r*b3_r
+       xsmaddasp       vs45,   vs5,    vs23            // a0_i*b3_i
+       xsmaddasp       vs46,   vs4,    vs23            // a0_r*b3_i
+       xsmaddasp       vs47,   vs5,    vs22            // a0_i*b3_r
+
+
+.endm
+
+.macro KERNEL4x1_SUBI1
+
+
+       lxsspx          vs0,    o0,     AO              // load a0_r
+       lxsspx          vs1,    o4,     AO              // load a0_i
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1              //  load b0_r
+       lxsspx          vs9,    o4,     T1              //  load b0_i
+
+       addi            T1,     T1,8
+
+       lxsspx          vs10,   o0,     T1              //  load b1_r
+       lxsspx          vs11,   o4,     T1              //  load b1_i
+
+       addi            T1,     T1,8
+
+       lxsspx          vs12,   o0,     T1              //  load b2_r
+       lxsspx          vs13,   o4,     T1              //  load b2_i
+
+       addi            T1,     T1,8
+
+       lxsspx          vs14,   o0,     T1              //  load b3_r
+       lxsspx          vs15,   o4,     T1              //  load b3_i
+
+       addi            BO,     BO,     32
+
+
+       xsmulsp         vs32,   vs0,    vs8             // a0_r*b0_r
+       xsmulsp         vs33,   vs1,    vs9             // a0_i*b0_i
+       xsmulsp         vs34,   vs0,    vs9             // a0_r*b0_i
+       xsmulsp         vs35,   vs1,    vs8             // a0_i*b0_r
+
+       xsmulsp         vs36,   vs0,    vs10            // a0_r*b1_r
+       xsmulsp         vs37,   vs1,    vs11            // a0_i*b1_i
+       xsmulsp         vs38,   vs0,    vs11            // a0_r*b1_i
+       xsmulsp         vs39,   vs1,    vs10            // a0_i*b1_r
+
+       xsmulsp         vs40,   vs0,    vs12            // a0_r*b2_r
+       xsmulsp         vs41,   vs1,    vs13            // a0_i*b2_i
+       xsmulsp         vs42,   vs0,    vs13            // a0_r*b2_i
+       xsmulsp         vs43,   vs1,    vs12            // a0_i*b2_r
+
+       xsmulsp         vs44,   vs0,    vs14            // a0_r*b3_r
+       xsmulsp         vs45,   vs1,    vs15            // a0_i*b3_i
+       xsmulsp         vs46,   vs0,    vs15            // a0_r*b3_i
+       xsmulsp         vs47,   vs1,    vs14            // a0_i*b3_r
+
+
+.endm
+
+.macro KERNEL4x1_SUB1
+
+
+       lxsspx          vs0,    o0,     AO              // load a0_r
+       lxsspx          vs1,    o4,     AO              // load a0_i
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1              //  load b0_r
+       lxsspx          vs9,    o4,     T1              //  load b0_i
+
+       addi            T1,     T1,8
+
+       lxsspx          vs10,   o0,     T1              //  load b1_r
+       lxsspx          vs11,   o4,     T1              //  load b1_i
+
+       addi            T1,     T1,8
+
+       lxsspx          vs12,   o0,     T1              //  load b2_r
+       lxsspx          vs13,   o4,     T1              //  load b2_i
+
+       addi            T1,     T1,8
+
+       lxsspx          vs14,   o0,     T1              //  load b3_r
+       lxsspx          vs15,   o4,     T1              //  load b3_i
+
+       addi            BO,     BO,     32
+
+
+       xsmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r
+       xsmaddasp       vs33,   vs1,    vs9             // a0_i*b0_i
+       xsmaddasp       vs34,   vs0,    vs9             // a0_r*b0_i
+       xsmaddasp       vs35,   vs1,    vs8             // a0_i*b0_r
+
+       xsmaddasp       vs36,   vs0,    vs10            // a0_r*b1_r
+       xsmaddasp       vs37,   vs1,    vs11            // a0_i*b1_i
+       xsmaddasp       vs38,   vs0,    vs11            // a0_r*b1_i
+       xsmaddasp       vs39,   vs1,    vs10            // a0_i*b1_r
+
+       xsmaddasp       vs40,   vs0,    vs12            // a0_r*b2_r
+       xsmaddasp       vs41,   vs1,    vs13            // a0_i*b2_i
+       xsmaddasp       vs42,   vs0,    vs13            // a0_r*b2_i
+       xsmaddasp       vs43,   vs1,    vs12            // a0_i*b2_r
+
+       xsmaddasp       vs44,   vs0,    vs14            // a0_r*b3_r
+       xsmaddasp       vs45,   vs1,    vs15            // a0_i*b3_i
+       xsmaddasp       vs46,   vs0,    vs15            // a0_r*b3_i
+       xsmaddasp       vs47,   vs1,    vs14            // a0_i*b3_r
+
+
+.endm
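+
+/* Editorial note (illustrative sketch only, not assembled): one KERNEL4x1
+ * step multiplies a single complex element of A by the four complex
+ * elements of B and keeps every scalar partial product in its own
+ * accumulator, deferring the real/imaginary combination (and any
+ * conjugation) to SAVE4x1.  With placeholder names acc_rr/acc_ii/acc_ri/
+ * acc_ir for the accumulator groups, the step is equivalent to:
+ *
+ *     for (int n = 0; n < 4; n++) {
+ *         acc_rr[n] += a_r * b_r[n];   // vs32, vs36, vs40, vs44
+ *         acc_ii[n] += a_i * b_i[n];   // vs33, vs37, vs41, vs45
+ *         acc_ri[n] += a_r * b_i[n];   // vs34, vs38, vs42, vs46
+ *         acc_ir[n] += a_i * b_r[n];   // vs35, vs39, vs43, vs47
+ *     }
+ */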
+
+.macro SAVE4x1
+
+       mr              T1,     CO
+
+// N=0
+
+       mr              T2,     T1
+
+// N=0 M=0
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+
+#ifndef TRMMKERNEL
+       lxsspx          vs0,    o0,     T2      // load c0_r
+       lxsspx          vs1,    o4,     T2      // load c0_i
+#else
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+#endif
+
+       XSFADD_R1       vs4,    vs4,    vs32            // add a0_r * b0_r
+       XSFADD_I1       vs5,    vs5,    vs35            // add a0_i * b0_r
+
+       XSFADD_R2       vs4,    vs4,    vs33            // add a0_i * b0_i
+       XSFADD_I2       vs5,    vs5,    vs34            // add a0_r * b0_i
+
+       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
+       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
+       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
+       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+
+       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsaddsp         vs0,    vs0,    vs20
+       xsaddsp         vs1,    vs1,    vs21
+
+
+       stxsspx         vs0,    o0,     T2      // store c0_r
+       stxsspx         vs1,    o4,     T2      // store c0_i
+
+       addi            T2,     T2,     8
+       add             T1,     T1,     LDC
+
+
+// N=1
+
+       mr              T2,     T1
+
+// N=1 M=0
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+
+#ifndef TRMMKERNEL
+       lxsspx          vs0,    o0,     T2      // load c0_r
+       lxsspx          vs1,    o4,     T2      // load c0_i
+#else
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+#endif
+
+       XSFADD_R1       vs4,    vs4,    vs36            // add a0_r * b0_r
+       XSFADD_I1       vs5,    vs5,    vs39            // add a0_i * b0_r
+
+       XSFADD_R2       vs4,    vs4,    vs37            // add a0_i * b0_i
+       XSFADD_I2       vs5,    vs5,    vs38            // add a0_r * b0_i
+
+       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
+       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
+       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
+       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+
+       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsaddsp         vs0,    vs0,    vs20
+       xsaddsp         vs1,    vs1,    vs21
+
+
+       stxsspx         vs0,    o0,     T2      // store c0_r
+       stxsspx         vs1,    o4,     T2      // store c0_i
+
+       addi            T2,     T2,     8
+       add             T1,     T1,     LDC
+
+
+// N=2
+
+       mr              T2,     T1
+
+// N=2 M=0
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+
+#ifndef TRMMKERNEL
+       lxsspx          vs0,    o0,     T2      // load c0_r
+       lxsspx          vs1,    o4,     T2      // load c0_i
+#else
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+#endif
+
+       XSFADD_R1       vs4,    vs4,    vs40            // add a0_r * b0_r
+       XSFADD_I1       vs5,    vs5,    vs43            // add a0_i * b0_r
+
+       XSFADD_R2       vs4,    vs4,    vs41            // add a0_i * b0_i
+       XSFADD_I2       vs5,    vs5,    vs42            // add a0_r * b0_i
+
+       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
+       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
+       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
+       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+
+       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsaddsp         vs0,    vs0,    vs20
+       xsaddsp         vs1,    vs1,    vs21
+
+
+       stxsspx         vs0,    o0,     T2      // store c0_r
+       stxsspx         vs1,    o4,     T2      // store c0_i
+
+       addi            T2,     T2,     8
+       add             T1,     T1,     LDC
+
+
+// N=3
+
+       mr              T2,     T1
+
+// N=3 M=0
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+
+#ifndef TRMMKERNEL
+       lxsspx          vs0,    o0,     T2      // load c0_r
+       lxsspx          vs1,    o4,     T2      // load c0_i
+#else
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+#endif
+
+       XSFADD_R1       vs4,    vs4,    vs44            // add a0_r * b0_r
+       XSFADD_I1       vs5,    vs5,    vs47            // add a0_i * b0_r
+
+       XSFADD_R2       vs4,    vs4,    vs45            // add a0_i * b0_i
+       XSFADD_I2       vs5,    vs5,    vs46            // add a0_r * b0_i
+
+       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
+       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
+       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
+       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+
+       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsaddsp         vs0,    vs0,    vs20
+       xsaddsp         vs1,    vs1,    vs21
+
+
+       stxsspx         vs0,    o0,     T2      // store c0_r
+       stxsspx         vs1,    o4,     T2      // store c0_i
+
+       addi            T2,     T2,     8
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     8
+
+.endm
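+
+/* Editorial note (illustrative sketch only, not assembled): because M=1,
+ * SAVE4x1 touches C with scalar lxsspx/stxsspx pairs instead of 16-byte
+ * vector accesses, walking down one column of C per LDC stride.  With the
+ * placeholder names res_r/res_i for the combined partial sums of column n,
+ * the update performed is:
+ *
+ *     for (int n = 0; n < 4; n++) {
+ *         c[n * ldc].r += res_r[n] * alpha_r - res_i[n] * alpha_i;
+ *         c[n * ldc].i += res_r[n] * alpha_i + res_i[n] * alpha_r;
+ *     }
+ */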
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=8
+**********************************************************************************************/
+
+.macro LOAD2x8_1
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs1,    o16,    AO              // load a2, a3
+
+       lxvw4x          vs2,    o32,    AO              // load a4, a5
+
+       lxvw4x          vs3,    o48,    AO              // load a6, a7
+
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+
+       addi            BO,     BO,     16
+
+.endm
+
+.macro KERNEL2x8_I1
+
+
+       lxvw4x          vs4,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs5,    o16,    AO              // load a2, a3
+
+       lxvw4x          vs6,    o32,    AO              // load a4, a5
+
+       lxvw4x          vs7,    o48,    AO              // load a6, a7
+
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs16,   vs24,   0
+       xxspltw         vs17,   vs24,   1
+       xxspltw         vs18,   vs24,   2
+       xxspltw         vs19,   vs24,   3
+
+
+       addi            BO,     BO,     16
+
+
+       xvmulsp         vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmulsp         vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmulsp         vs34,   vs1,    vs8             // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmulsp         vs35,   vs1,    vs9             // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+       xvmulsp         vs36,   vs2,    vs8             // a4_r*b0_r, a4_i*b0_r, a5_r*b0_r, a5_i*b0_r
+       xvmulsp         vs37,   vs2,    vs9             // a4_r*b0_i, a4_i*b0_i, a5_r*b0_i, a5_i*b0_i
+       xvmulsp         vs38,   vs3,    vs8             // a6_r*b0_r, a6_i*b0_r, a7_r*b0_r, a7_i*b0_r
+       xvmulsp         vs39,   vs3,    vs9             // a6_r*b0_i, a6_i*b0_i, a7_r*b0_i, a7_i*b0_i
+
+       xvmulsp         vs40,   vs0,    vs10            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmulsp         vs41,   vs0,    vs11            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+       xvmulsp         vs42,   vs1,    vs10            // a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+       xvmulsp         vs43,   vs1,    vs11            // a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+       xvmulsp         vs44,   vs2,    vs10            // a4_r*b1_r, a4_i*b1_r, a5_r*b1_r, a5_i*b1_r
+       xvmulsp         vs45,   vs2,    vs11            // a4_r*b1_i, a4_i*b1_i, a5_r*b1_i, a5_i*b1_i
+       xvmulsp         vs46,   vs3,    vs10            // a6_r*b1_r, a6_i*b1_r, a7_r*b1_r, a7_i*b1_r
+       xvmulsp         vs47,   vs3,    vs11            // a6_r*b1_i, a6_i*b1_i, a7_r*b1_i, a7_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x8_1
+
+
+       lxvw4x          vs4,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs5,    o16,    AO              // load a2, a3
+
+       lxvw4x          vs6,    o32,    AO              // load a4, a5
+
+       lxvw4x          vs7,    o48,    AO              // load a6, a7
+
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs16,   vs24,   0
+       xxspltw         vs17,   vs24,   1
+       xxspltw         vs18,   vs24,   2
+       xxspltw         vs19,   vs24,   3
+
+
+       addi            BO,     BO,     16
+
+
+       xvmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmaddasp       vs34,   vs1,    vs8             // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmaddasp       vs35,   vs1,    vs9             // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+       xvmaddasp       vs36,   vs2,    vs8             // a4_r*b0_r, a4_i*b0_r, a5_r*b0_r, a5_i*b0_r
+       xvmaddasp       vs37,   vs2,    vs9             // a4_r*b0_i, a4_i*b0_i, a5_r*b0_i, a5_i*b0_i
+       xvmaddasp       vs38,   vs3,    vs8             // a6_r*b0_r, a6_i*b0_r, a7_r*b0_r, a7_i*b0_r
+       xvmaddasp       vs39,   vs3,    vs9             // a6_r*b0_i, a6_i*b0_i, a7_r*b0_i, a7_i*b0_i
+
+       xvmaddasp       vs40,   vs0,    vs10            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmaddasp       vs41,   vs0,    vs11            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+       xvmaddasp       vs42,   vs1,    vs10            // a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+       xvmaddasp       vs43,   vs1,    vs11            // a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+       xvmaddasp       vs44,   vs2,    vs10            // a4_r*b1_r, a4_i*b1_r, a5_r*b1_r, a5_i*b1_r
+       xvmaddasp       vs45,   vs2,    vs11            // a4_r*b1_i, a4_i*b1_i, a5_r*b1_i, a5_i*b1_i
+       xvmaddasp       vs46,   vs3,    vs10            // a6_r*b1_r, a6_i*b1_r, a7_r*b1_r, a7_i*b1_r
+       xvmaddasp       vs47,   vs3,    vs11            // a6_r*b1_i, a6_i*b1_i, a7_r*b1_i, a7_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x8_2
+
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs1,    o16,    AO              // load a2, a3
+
+       lxvw4x          vs2,    o32,    AO              // load a4, a5
+
+       lxvw4x          vs3,    o48,    AO              // load a6, a7
+
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+
+       addi            BO,     BO,     16
+
+
+       xvmaddasp       vs32,   vs4,    vs16            // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs4,    vs17            // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmaddasp       vs34,   vs5,    vs16            // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmaddasp       vs35,   vs5,    vs17            // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+       xvmaddasp       vs36,   vs6,    vs16            // a4_r*b0_r, a4_i*b0_r, a5_r*b0_r, a5_i*b0_r
+       xvmaddasp       vs37,   vs6,    vs17            // a4_r*b0_i, a4_i*b0_i, a5_r*b0_i, a5_i*b0_i
+       xvmaddasp       vs38,   vs7,    vs16            // a6_r*b0_r, a6_i*b0_r, a7_r*b0_r, a7_i*b0_r
+       xvmaddasp       vs39,   vs7,    vs17            // a6_r*b0_i, a6_i*b0_i, a7_r*b0_i, a7_i*b0_i
+
+       xvmaddasp       vs40,   vs4,    vs18            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmaddasp       vs41,   vs4,    vs19            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+       xvmaddasp       vs42,   vs5,    vs18            // a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+       xvmaddasp       vs43,   vs5,    vs19            // a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+       xvmaddasp       vs44,   vs6,    vs18            // a4_r*b1_r, a4_i*b1_r, a5_r*b1_r, a5_i*b1_r
+       xvmaddasp       vs45,   vs6,    vs19            // a4_r*b1_i, a4_i*b1_i, a5_r*b1_i, a5_i*b1_i
+       xvmaddasp       vs46,   vs7,    vs18            // a6_r*b1_r, a6_i*b1_r, a7_r*b1_r, a7_i*b1_r
+       xvmaddasp       vs47,   vs7,    vs19            // a6_r*b1_i, a6_i*b1_i, a7_r*b1_i, a7_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x8_E2
+
+
+       xvmaddasp       vs32,   vs4,    vs16            // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs4,    vs17            // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmaddasp       vs34,   vs5,    vs16            // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmaddasp       vs35,   vs5,    vs17            // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+       xvmaddasp       vs36,   vs6,    vs16            // a4_r*b0_r, a4_i*b0_r, a5_r*b0_r, a5_i*b0_r
+       xvmaddasp       vs37,   vs6,    vs17            // a4_r*b0_i, a4_i*b0_i, a5_r*b0_i, a5_i*b0_i
+       xvmaddasp       vs38,   vs7,    vs16            // a6_r*b0_r, a6_i*b0_r, a7_r*b0_r, a7_i*b0_r
+       xvmaddasp       vs39,   vs7,    vs17            // a6_r*b0_i, a6_i*b0_i, a7_r*b0_i, a7_i*b0_i
+
+       xvmaddasp       vs40,   vs4,    vs18            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmaddasp       vs41,   vs4,    vs19            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+       xvmaddasp       vs42,   vs5,    vs18            // a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+       xvmaddasp       vs43,   vs5,    vs19            // a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+       xvmaddasp       vs44,   vs6,    vs18            // a4_r*b1_r, a4_i*b1_r, a5_r*b1_r, a5_i*b1_r
+       xvmaddasp       vs45,   vs6,    vs19            // a4_r*b1_i, a4_i*b1_i, a5_r*b1_i, a5_i*b1_i
+       xvmaddasp       vs46,   vs7,    vs18            // a6_r*b1_r, a6_i*b1_r, a7_r*b1_r, a7_i*b1_r
+       xvmaddasp       vs47,   vs7,    vs19            // a6_r*b1_i, a6_i*b1_i, a7_r*b1_i, a7_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x8_SUBI1
+
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs1,    o16,    AO              // load a2, a3
+
+       lxvw4x          vs2,    o32,    AO              // load a4, a5
+
+       lxvw4x          vs3,    o48,    AO              // load a6, a7
+
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+
+       addi            BO,     BO,     16
+
+
+       xvmulsp         vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmulsp         vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmulsp         vs34,   vs1,    vs8             // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmulsp         vs35,   vs1,    vs9             // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+       xvmulsp         vs36,   vs2,    vs8             // a4_r*b0_r, a4_i*b0_r, a5_r*b0_r, a5_i*b0_r
+       xvmulsp         vs37,   vs2,    vs9             // a4_r*b0_i, a4_i*b0_i, a5_r*b0_i, a5_i*b0_i
+       xvmulsp         vs38,   vs3,    vs8             // a6_r*b0_r, a6_i*b0_r, a7_r*b0_r, a7_i*b0_r
+       xvmulsp         vs39,   vs3,    vs9             // a6_r*b0_i, a6_i*b0_i, a7_r*b0_i, a7_i*b0_i
+
+       xvmulsp         vs40,   vs0,    vs10            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmulsp         vs41,   vs0,    vs11            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+       xvmulsp         vs42,   vs1,    vs10            // a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+       xvmulsp         vs43,   vs1,    vs11            // a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+       xvmulsp         vs44,   vs2,    vs10            // a4_r*b1_r, a4_i*b1_r, a5_r*b1_r, a5_i*b1_r
+       xvmulsp         vs45,   vs2,    vs11            // a4_r*b1_i, a4_i*b1_i, a5_r*b1_i, a5_i*b1_i
+       xvmulsp         vs46,   vs3,    vs10            // a6_r*b1_r, a6_i*b1_r, a7_r*b1_r, a7_i*b1_r
+       xvmulsp         vs47,   vs3,    vs11            // a6_r*b1_i, a6_i*b1_i, a7_r*b1_i, a7_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x8_SUB1
+
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs1,    o16,    AO              // load a2, a3
+
+       lxvw4x          vs2,    o32,    AO              // load a4, a5
+
+       lxvw4x          vs3,    o48,    AO              // load a6, a7
+
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+
+       addi            BO,     BO,     16
+
+
+       xvmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmaddasp       vs34,   vs1,    vs8             // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmaddasp       vs35,   vs1,    vs9             // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+       xvmaddasp       vs36,   vs2,    vs8             // a4_r*b0_r, a4_i*b0_r, a5_r*b0_r, a5_i*b0_r
+       xvmaddasp       vs37,   vs2,    vs9             // a4_r*b0_i, a4_i*b0_i, a5_r*b0_i, a5_i*b0_i
+       xvmaddasp       vs38,   vs3,    vs8             // a6_r*b0_r, a6_i*b0_r, a7_r*b0_r, a7_i*b0_r
+       xvmaddasp       vs39,   vs3,    vs9             // a6_r*b0_i, a6_i*b0_i, a7_r*b0_i, a7_i*b0_i
+
+       xvmaddasp       vs40,   vs0,    vs10            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmaddasp       vs41,   vs0,    vs11            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+       xvmaddasp       vs42,   vs1,    vs10            // a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+       xvmaddasp       vs43,   vs1,    vs11            // a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+       xvmaddasp       vs44,   vs2,    vs10            // a4_r*b1_r, a4_i*b1_r, a5_r*b1_r, a5_i*b1_r
+       xvmaddasp       vs45,   vs2,    vs11            // a4_r*b1_i, a4_i*b1_i, a5_r*b1_i, a5_i*b1_i
+       xvmaddasp       vs46,   vs3,    vs10            // a6_r*b1_r, a6_i*b1_r, a7_r*b1_r, a7_i*b1_r
+       xvmaddasp       vs47,   vs3,    vs11            // a6_r*b1_i, a6_i*b1_i, a7_r*b1_i, a7_i*b1_i
+
+
+.endm
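+
+/* Editorial note (illustrative sketch only, not assembled): the 2x8 kernels
+ * keep A interleaved as (re,im) pairs, two complex elements per VSX
+ * register, while xxspltw broadcasts one scalar of B (b0_r, b0_i, b1_r or
+ * b1_i) across all four lanes.  A single xvmaddasp therefore advances two
+ * complex partial products at once; e.g. after "xvmaddasp vs32, vs0, vs8"
+ * (vs8 = splat of b0_r) the lanes of vs32 hold:
+ *
+ *     vs32[0] += a0_r * b0_r;    vs32[1] += a0_i * b0_r;
+ *     vs32[2] += a1_r * b0_r;    vs32[3] += a1_i * b0_r;
+ *
+ * SAVE2x8 below spills these vectors through TBUFFER to recombine the real
+ * and imaginary parts per element.
+ */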
+
+.macro SAVE2x8
+
+       mr              T1,     CO
+
+// N=0
+
+       mr              T2,     T1
+
+// N=0 M=0
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+       stxvw4x         vs32,   o0,     TBUFFER
+
+       lxsspx          vs8,    o0,     TBUFFER
+       lxsspx          vs9,    o4,     TBUFFER
+       lxsspx          vs10,   o8,     TBUFFER
+       lxsspx          vs11,   o12,    TBUFFER
+
+       stxvw4x         vs33,   o0,     TBUFFER
+
+       lxsspx          vs12,   o0,     TBUFFER
+       lxsspx          vs13,   o4,     TBUFFER
+       lxsspx          vs14,   o8,     TBUFFER
+       lxsspx          vs15,   o12,    TBUFFER
+
+       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
+       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
+       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
+       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+
+       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
+       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
+       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
+       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+
+       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
+       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
+       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
+       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
+       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+
+// N=0 M=2
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+       stxvw4x         vs34,   o0,     TBUFFER
+
+       lxsspx          vs8,    o0,     TBUFFER
+       lxsspx          vs9,    o4,     TBUFFER
+       lxsspx          vs10,   o8,     TBUFFER
+       lxsspx          vs11,   o12,    TBUFFER
+
+       stxvw4x         vs35,   o0,     TBUFFER
+
+       lxsspx          vs12,   o0,     TBUFFER
+       lxsspx          vs13,   o4,     TBUFFER
+       lxsspx          vs14,   o8,     TBUFFER
+       lxsspx          vs15,   o12,    TBUFFER
+
+       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
+       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
+       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
+       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+
+       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
+       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
+       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
+       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+
+       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
+       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
+       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
+       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
+       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+
+// N=0 M=4
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+       stxvw4x         vs36,   o0,     TBUFFER
+
+       lxsspx          vs8,    o0,     TBUFFER
+       lxsspx          vs9,    o4,     TBUFFER
+       lxsspx          vs10,   o8,     TBUFFER
+       lxsspx          vs11,   o12,    TBUFFER
+
+       stxvw4x         vs37,   o0,     TBUFFER
+
+       lxsspx          vs12,   o0,     TBUFFER
+       lxsspx          vs13,   o4,     TBUFFER
+       lxsspx          vs14,   o8,     TBUFFER
+       lxsspx          vs15,   o12,    TBUFFER
+
+       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
+       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
+       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
+       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+
+       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
+       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
+       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
+       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+
+       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
+       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
+       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
+       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
+       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+
+// N=0 M=6
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+       stxvw4x         vs38,   o0,     TBUFFER
+
+       lxsspx          vs8,    o0,     TBUFFER
+       lxsspx          vs9,    o4,     TBUFFER
+       lxsspx          vs10,   o8,     TBUFFER
+       lxsspx          vs11,   o12,    TBUFFER
+
+       stxvw4x         vs39,   o0,     TBUFFER
+
+       lxsspx          vs12,   o0,     TBUFFER
+       lxsspx          vs13,   o4,     TBUFFER
+       lxsspx          vs14,   o8,     TBUFFER
+       lxsspx          vs15,   o12,    TBUFFER
+
+       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
+       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
+       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
+       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+
+       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
+       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
+       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
+       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+
+       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
+       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
+       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
+       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
+       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+       add             T1,     T1,     LDC
+
+
+// N=1
+
+       mr              T2,     T1
+
+// N=1 M=0
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+       stxvw4x         vs40,   o0,     TBUFFER
+
+       lxsspx          vs8,    o0,     TBUFFER
+       lxsspx          vs9,    o4,     TBUFFER
+       lxsspx          vs10,   o8,     TBUFFER
+       lxsspx          vs11,   o12,    TBUFFER
+
+       stxvw4x         vs41,   o0,     TBUFFER
+
+       lxsspx          vs12,   o0,     TBUFFER
+       lxsspx          vs13,   o4,     TBUFFER
+       lxsspx          vs14,   o8,     TBUFFER
+       lxsspx          vs15,   o12,    TBUFFER
+
+       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
+       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
+       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
+       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+
+       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
+       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
+       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
+       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+
+       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
+       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
+       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
+       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
+       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+
+// N=1 M=2
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+       stxvw4x         vs42,   o0,     TBUFFER
+
+       lxsspx          vs8,    o0,     TBUFFER
+       lxsspx          vs9,    o4,     TBUFFER
+       lxsspx          vs10,   o8,     TBUFFER
+       lxsspx          vs11,   o12,    TBUFFER
+
+       stxvw4x         vs43,   o0,     TBUFFER
+
+       lxsspx          vs12,   o0,     TBUFFER
+       lxsspx          vs13,   o4,     TBUFFER
+       lxsspx          vs14,   o8,     TBUFFER
+       lxsspx          vs15,   o12,    TBUFFER
+
+       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
+       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
+       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
+       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+
+       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
+       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
+       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
+       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+
+       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
+       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
+       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
+       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
+       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+
+// N=1 M=4
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+       stxvw4x         vs44,   o0,     TBUFFER
+
+       lxsspx          vs8,    o0,     TBUFFER
+       lxsspx          vs9,    o4,     TBUFFER
+       lxsspx          vs10,   o8,     TBUFFER
+       lxsspx          vs11,   o12,    TBUFFER
+
+       stxvw4x         vs45,   o0,     TBUFFER
+
+       lxsspx          vs12,   o0,     TBUFFER
+       lxsspx          vs13,   o4,     TBUFFER
+       lxsspx          vs14,   o8,     TBUFFER
+       lxsspx          vs15,   o12,    TBUFFER
+
+       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
+       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
+       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
+       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+
+       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
+       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
+       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
+       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+
+       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
+       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
+       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
+       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
+       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+
+// N=1 M=6
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+       stxvw4x         vs46,   o0,     TBUFFER
+
+       lxsspx          vs8,    o0,     TBUFFER
+       lxsspx          vs9,    o4,     TBUFFER
+       lxsspx          vs10,   o8,     TBUFFER
+       lxsspx          vs11,   o12,    TBUFFER
+
+       stxvw4x         vs47,   o0,     TBUFFER
+
+       lxsspx          vs12,   o0,     TBUFFER
+       lxsspx          vs13,   o4,     TBUFFER
+       lxsspx          vs14,   o8,     TBUFFER
+       lxsspx          vs15,   o12,    TBUFFER
+
+       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
+       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
+       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
+       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+
+       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
+       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
+       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
+       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+
+       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
+       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
+       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
+       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
+       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     64
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=4
+**********************************************************************************************/
+
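+/* Overview (informal sketch, not part of the build):
+ * the kernel streams packed panels of A and B, keeps the four partial
+ * sums a_r*b_r, a_i*b_r, a_r*b_i and a_i*b_i in separate vector
+ * accumulators, and only recombines them (with the conjugation signs and
+ * the complex alpha) in the SAVE macros.  As plain C, one N=2 / M=4 tile
+ * update is roughly the following; the function name and the assumption
+ * that ldc is counted here in complex elements are illustrative only:
+ *
+ *   static void ctile_4x2_ref(long K, const float *A, const float *B,
+ *                             float *C, long ldc,
+ *                             float alpha_r, float alpha_i)
+ *   {
+ *       for (int j = 0; j < 2; j++)
+ *           for (int i = 0; i < 4; i++) {
+ *               float rr = 0.0f, ri = 0.0f;
+ *               for (long l = 0; l < K; l++) {
+ *                   float ar = A[8*l + 2*i], ai = A[8*l + 2*i + 1];
+ *                   float br = B[4*l + 2*j], bi = B[4*l + 2*j + 1];
+ *                   rr += ar * br - ai * bi;   // non-conjugated case
+ *                   ri += ar * bi + ai * br;   // signs differ for CN/NC/CC
+ *               }
+ *               C[2*(j*ldc + i)]     += rr * alpha_r - ri * alpha_i;
+ *               C[2*(j*ldc + i) + 1] += rr * alpha_i + ri * alpha_r;
+ *           }
+ *   }
+ */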
+.macro LOAD2x4_1
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs1,    o16,    AO              // load a2, a3
+
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+
+       addi            BO,     BO,     16
+
+.endm
+
+.macro KERNEL2x4_I1
+
+
+       lxvw4x          vs4,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs5,    o16,    AO              // load a2, a3
+
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs16,   vs24,   0
+       xxspltw         vs17,   vs24,   1
+       xxspltw         vs18,   vs24,   2
+       xxspltw         vs19,   vs24,   3
+
+
+       addi            BO,     BO,     16
+
+
+       xvmulsp         vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmulsp         vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmulsp         vs34,   vs1,    vs8             // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmulsp         vs35,   vs1,    vs9             // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+
+       xvmulsp         vs36,   vs0,    vs10            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmulsp         vs37,   vs0,    vs11            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+       xvmulsp         vs38,   vs1,    vs10            // a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+       xvmulsp         vs39,   vs1,    vs11            // a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x4_1
+
+
+       lxvw4x          vs4,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs5,    o16,    AO              // load a2, a3
+
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs16,   vs24,   0
+       xxspltw         vs17,   vs24,   1
+       xxspltw         vs18,   vs24,   2
+       xxspltw         vs19,   vs24,   3
+
+
+       addi            BO,     BO,     16
+
+
+       xvmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmaddasp       vs34,   vs1,    vs8             // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmaddasp       vs35,   vs1,    vs9             // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+
+       xvmaddasp       vs36,   vs0,    vs10            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmaddasp       vs37,   vs0,    vs11            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+       xvmaddasp       vs38,   vs1,    vs10            // a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+       xvmaddasp       vs39,   vs1,    vs11            // a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x4_2
+
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs1,    o16,    AO              // load a2, a3
+
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+
+       addi            BO,     BO,     16
+
+
+       xvmaddasp       vs32,   vs4,    vs16            // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs4,    vs17            // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmaddasp       vs34,   vs5,    vs16            // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmaddasp       vs35,   vs5,    vs17            // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+
+       xvmaddasp       vs36,   vs4,    vs18            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmaddasp       vs37,   vs4,    vs19            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+       xvmaddasp       vs38,   vs5,    vs18            // a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+       xvmaddasp       vs39,   vs5,    vs19            // a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x4_E2
+
+
+       xvmaddasp       vs32,   vs4,    vs16            // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs4,    vs17            // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmaddasp       vs34,   vs5,    vs16            // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmaddasp       vs35,   vs5,    vs17            // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+
+       xvmaddasp       vs36,   vs4,    vs18            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmaddasp       vs37,   vs4,    vs19            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+       xvmaddasp       vs38,   vs5,    vs18            // a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+       xvmaddasp       vs39,   vs5,    vs19            // a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x4_SUBI1
+
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs1,    o16,    AO              // load a2, a3
+
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+
+       addi            BO,     BO,     16
+
+
+       xvmulsp         vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmulsp         vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmulsp         vs34,   vs1,    vs8             // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmulsp         vs35,   vs1,    vs9             // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+
+       xvmulsp         vs36,   vs0,    vs10            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmulsp         vs37,   vs0,    vs11            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+       xvmulsp         vs38,   vs1,    vs10            // a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+       xvmulsp         vs39,   vs1,    vs11            // a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x4_SUB1
+
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs1,    o16,    AO              // load a2, a3
+
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+
+       addi            BO,     BO,     16
+
+
+       xvmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmaddasp       vs34,   vs1,    vs8             // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmaddasp       vs35,   vs1,    vs9             // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+
+       xvmaddasp       vs36,   vs0,    vs10            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmaddasp       vs37,   vs0,    vs11            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+       xvmaddasp       vs38,   vs1,    vs10            // a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+       xvmaddasp       vs39,   vs1,    vs11            // a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+
+
+.endm
+
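+/* Note (sketch): SAVE2x4 writes one 4x2 tile of C.  Each accumulator holds
+ * the partial products for two complex results; it is spilled to TBUFFER,
+ * reloaded as scalars and recombined with the XSFADD_R1/R2 and
+ * XSFADD_I1/I2 macros (defined elsewhere in this kernel, they carry the
+ * signs of the conjugation variants), giving r_r and r_i.  The store then
+ * applies the complex alpha:
+ *     c_r += r_r*alpha_r - r_i*alpha_i
+ *     c_i += r_r*alpha_i + r_i*alpha_r
+ * For the TRMM kernel C is not read, so vs0 is cleared instead of loaded.
+ */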
+.macro SAVE2x4
+
+       mr              T1,     CO
+
+// N=0
+
+       mr              T2,     T1
+
+// N=0 M=0
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+       stxvw4x         vs32,   o0,     TBUFFER
+
+       lxsspx          vs8,    o0,     TBUFFER
+       lxsspx          vs9,    o4,     TBUFFER
+       lxsspx          vs10,   o8,     TBUFFER
+       lxsspx          vs11,   o12,    TBUFFER
+
+       stxvw4x         vs33,   o0,     TBUFFER
+
+       lxsspx          vs12,   o0,     TBUFFER
+       lxsspx          vs13,   o4,     TBUFFER
+       lxsspx          vs14,   o8,     TBUFFER
+       lxsspx          vs15,   o12,    TBUFFER
+
+       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
+       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
+       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
+       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+
+       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
+       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
+       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
+       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+
+       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
+       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
+       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
+       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
+       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+
+// N=0 M=2
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+       stxvw4x         vs34,   o0,     TBUFFER
+
+       lxsspx          vs8,    o0,     TBUFFER
+       lxsspx          vs9,    o4,     TBUFFER
+       lxsspx          vs10,   o8,     TBUFFER
+       lxsspx          vs11,   o12,    TBUFFER
+
+       stxvw4x         vs35,   o0,     TBUFFER
+
+       lxsspx          vs12,   o0,     TBUFFER
+       lxsspx          vs13,   o4,     TBUFFER
+       lxsspx          vs14,   o8,     TBUFFER
+       lxsspx          vs15,   o12,    TBUFFER
+
+       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
+       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
+       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
+       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+
+       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
+       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
+       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
+       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+
+       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
+       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
+       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
+       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
+       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+       add             T1,     T1,     LDC
+
+
+// N=1
+
+       mr              T2,     T1
+
+// N=1 M=0
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+       stxvw4x         vs36,   o0,     TBUFFER
+
+       lxsspx          vs8,    o0,     TBUFFER
+       lxsspx          vs9,    o4,     TBUFFER
+       lxsspx          vs10,   o8,     TBUFFER
+       lxsspx          vs11,   o12,    TBUFFER
+
+       stxvw4x         vs37,   o0,     TBUFFER
+
+       lxsspx          vs12,   o0,     TBUFFER
+       lxsspx          vs13,   o4,     TBUFFER
+       lxsspx          vs14,   o8,     TBUFFER
+       lxsspx          vs15,   o12,    TBUFFER
+
+       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
+       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
+       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
+       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+
+       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
+       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
+       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
+       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+
+       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
+       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
+       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
+       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
+       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+
+// N=1 M=2
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+       stxvw4x         vs38,   o0,     TBUFFER
+
+       lxsspx          vs8,    o0,     TBUFFER
+       lxsspx          vs9,    o4,     TBUFFER
+       lxsspx          vs10,   o8,     TBUFFER
+       lxsspx          vs11,   o12,    TBUFFER
+
+       stxvw4x         vs39,   o0,     TBUFFER
+
+       lxsspx          vs12,   o0,     TBUFFER
+       lxsspx          vs13,   o4,     TBUFFER
+       lxsspx          vs14,   o8,     TBUFFER
+       lxsspx          vs15,   o12,    TBUFFER
+
+       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
+       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
+       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
+       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+
+       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
+       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
+       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
+       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+
+       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
+       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
+       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
+       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
+       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     32
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=2
+**********************************************************************************************/
+
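+/* Note (sketch): same scheme as the 2x4 macros above, but with a single
+ * vector of A (two complex elements) per K step, so only the accumulators
+ * vs32-vs35 are live.
+ */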
+.macro LOAD2x2_1
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+
+       addi            BO,     BO,     16
+
+.endm
+
+.macro KERNEL2x2_I1
+
+
+       lxvw4x          vs4,    o0,     AO              // load a0, a1
+
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs16,   vs24,   0
+       xxspltw         vs17,   vs24,   1
+       xxspltw         vs18,   vs24,   2
+       xxspltw         vs19,   vs24,   3
+
+
+       addi            BO,     BO,     16
+
+
+       xvmulsp         vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmulsp         vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+       xvmulsp         vs34,   vs0,    vs10            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmulsp         vs35,   vs0,    vs11            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x2_1
+
+
+       lxvw4x          vs4,    o0,     AO              // load a0, a1
+
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs16,   vs24,   0
+       xxspltw         vs17,   vs24,   1
+       xxspltw         vs18,   vs24,   2
+       xxspltw         vs19,   vs24,   3
+
+
+       addi            BO,     BO,     16
+
+
+       xvmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+       xvmaddasp       vs34,   vs0,    vs10            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmaddasp       vs35,   vs0,    vs11            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x2_2
+
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+
+       addi            BO,     BO,     16
+
+
+       xvmaddasp       vs32,   vs4,    vs16            // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs4,    vs17            // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+       xvmaddasp       vs34,   vs4,    vs18            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmaddasp       vs35,   vs4,    vs19            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x2_E2
+
+
+       xvmaddasp       vs32,   vs4,    vs16            // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs4,    vs17            // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+       xvmaddasp       vs34,   vs4,    vs18            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmaddasp       vs35,   vs4,    vs19            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x2_SUBI1
+
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+
+       addi            BO,     BO,     16
+
+
+       xvmulsp         vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmulsp         vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+       xvmulsp         vs34,   vs0,    vs10            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmulsp         vs35,   vs0,    vs11            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x2_SUB1
+
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+
+       addi            BO,     BO,     16
+
+
+       xvmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+       xvmaddasp       vs34,   vs0,    vs10            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmaddasp       vs35,   vs0,    vs11            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+
+.endm
+
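+/* Note (sketch): SAVE2x2 follows the same TBUFFER/XSFADD/alpha pattern as
+ * SAVE2x4, with one vector (two complex results) per column of the tile.
+ */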
+.macro SAVE2x2
+
+       mr              T1,     CO
+
+// N=0
+
+       mr              T2,     T1
+
+// N=0 M=0
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+       stxvw4x         vs32,   o0,     TBUFFER
+
+       lxsspx          vs8,    o0,     TBUFFER
+       lxsspx          vs9,    o4,     TBUFFER
+       lxsspx          vs10,   o8,     TBUFFER
+       lxsspx          vs11,   o12,    TBUFFER
+
+       stxvw4x         vs33,   o0,     TBUFFER
+
+       lxsspx          vs12,   o0,     TBUFFER
+       lxsspx          vs13,   o4,     TBUFFER
+       lxsspx          vs14,   o8,     TBUFFER
+       lxsspx          vs15,   o12,    TBUFFER
+
+       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
+       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
+       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
+       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+
+       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
+       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
+       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
+       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+
+       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
+       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
+       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
+       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
+       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+       add             T1,     T1,     LDC
+
+
+// N=1
+
+       mr              T2,     T1
+
+// N=1 M=0
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+       stxvw4x         vs34,   o0,     TBUFFER
+
+       lxsspx          vs8,    o0,     TBUFFER
+       lxsspx          vs9,    o4,     TBUFFER
+       lxsspx          vs10,   o8,     TBUFFER
+       lxsspx          vs11,   o12,    TBUFFER
+
+       stxvw4x         vs35,   o0,     TBUFFER
+
+       lxsspx          vs12,   o0,     TBUFFER
+       lxsspx          vs13,   o4,     TBUFFER
+       lxsspx          vs14,   o8,     TBUFFER
+       lxsspx          vs15,   o12,    TBUFFER
+
+       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
+       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
+       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
+       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+
+       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
+       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
+       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
+       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+
+       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
+       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
+       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
+       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
+       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     16
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=1
+**********************************************************************************************/
+
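+/* Note (sketch): the M=1 path is purely scalar: the real and imaginary
+ * parts of a0 and of b0/b1 live in separate VSX scalar registers and are
+ * accumulated with xsmulsp/xsmaddasp, so SAVE2x1 can recombine them
+ * directly without going through TBUFFER.
+ */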
+.macro LOAD2x1_1
+
+       lxsspx          vs0,    o0,     AO              // load a0_r
+       lxsspx          vs1,    o4,     AO              // load a0_i
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1              //  load b0_r
+       lxsspx          vs9,    o4,     T1              //  load b0_i
+
+       addi            T1,     T1,8
+
+       lxsspx          vs10,   o0,     T1              //  load b1_r
+       lxsspx          vs11,   o4,     T1              //  load b1_i
+
+       addi            BO,     BO,     16
+
+.endm
+
+.macro KERNEL2x1_I1
+
+
+       lxsspx          vs4,    o0,     AO              // load a0_r
+       lxsspx          vs5,    o4,     AO              // load a0_i
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs16,   o0,     T1              //  load b0_r
+       lxsspx          vs17,   o4,     T1              //  load b0_i
+
+       addi            T1,     T1,8
+
+       lxsspx          vs18,   o0,     T1              //  load b1_r
+       lxsspx          vs19,   o4,     T1              //  load b1_i
+
+       addi            BO,     BO,     16
+
+
+       xsmulsp         vs32,   vs0,    vs8             // a0_r*b0_r
+       xsmulsp         vs33,   vs1,    vs9             // a0_i*b0_i
+       xsmulsp         vs34,   vs0,    vs9             // a0_r*b0_i
+       xsmulsp         vs35,   vs1,    vs8             // a0_i*b0_r
+
+       xsmulsp         vs36,   vs0,    vs10            // a0_r*b1_r
+       xsmulsp         vs37,   vs1,    vs11            // a0_i*b1_i
+       xsmulsp         vs38,   vs0,    vs11            // a0_r*b1_i
+       xsmulsp         vs39,   vs1,    vs10            // a0_i*b1_r
+
+
+.endm
+
+.macro KERNEL2x1_1
+
+
+       lxsspx          vs4,    o0,     AO              // load a0_r
+       lxsspx          vs5,    o4,     AO              // load a0_i
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs16,   o0,     T1              //  load b0_r
+       lxsspx          vs17,   o4,     T1              //  load b0_i
+
+       addi            T1,     T1,8
+
+       lxsspx          vs18,   o0,     T1              //  load b1_r
+       lxsspx          vs19,   o4,     T1              //  load b1_i
+
+       addi            BO,     BO,     16
+
+
+       xsmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r
+       xsmaddasp       vs33,   vs1,    vs9             // a0_i*b0_i
+       xsmaddasp       vs34,   vs0,    vs9             // a0_r*b0_i
+       xsmaddasp       vs35,   vs1,    vs8             // a0_i*b0_r
+
+       xsmaddasp       vs36,   vs0,    vs10            // a0_r*b1_r
+       xsmaddasp       vs37,   vs1,    vs11            // a0_i*b1_i
+       xsmaddasp       vs38,   vs0,    vs11            // a0_r*b1_i
+       xsmaddasp       vs39,   vs1,    vs10            // a0_i*b1_r
+
+
+.endm
+
+.macro KERNEL2x1_2
+
+
+       lxsspx          vs0,    o0,     AO              // load a0_r
+       lxsspx          vs1,    o4,     AO              // load a0_i
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1              //  load b0_r
+       lxsspx          vs9,    o4,     T1              //  load b0_i
+
+       addi            T1,     T1,8
+
+       lxsspx          vs10,   o0,     T1              //  load b1_r
+       lxsspx          vs11,   o4,     T1              //  load b1_i
+
+       addi            BO,     BO,     16
+
+
+       xsmaddasp       vs32,   vs4,    vs16            // a4_r*b0_r
+       xsmaddasp       vs33,   vs5,    vs17            // a4_i*b0_i
+       xsmaddasp       vs34,   vs4,    vs17            // a4_r*b0_i
+       xsmaddasp       vs35,   vs5,    vs16            // a4_i*b0_r
+
+       xsmaddasp       vs36,   vs4,    vs18            // a4_r*b1_r
+       xsmaddasp       vs37,   vs5,    vs19            // a4_i*b1_i
+       xsmaddasp       vs38,   vs4,    vs19            // a4_r*b1_i
+       xsmaddasp       vs39,   vs5,    vs18            // a4_i*b1_r
+
+
+.endm
+
+.macro KERNEL2x1_E2
+
+
+       xsmaddasp       vs32,   vs4,    vs16            // a4_r*b0_r
+       xsmaddasp       vs33,   vs5,    vs17            // a4_i*b0_i
+       xsmaddasp       vs34,   vs4,    vs17            // a4_r*b0_i
+       xsmaddasp       vs35,   vs5,    vs16            // a4_i*b0_r
+
+       xsmaddasp       vs36,   vs4,    vs18            // a4_r*b1_r
+       xsmaddasp       vs37,   vs5,    vs19            // a4_i*b1_i
+       xsmaddasp       vs38,   vs4,    vs19            // a4_r*b1_i
+       xsmaddasp       vs39,   vs5,    vs18            // a4_i*b1_r
+
+
+.endm
+
+.macro KERNEL2x1_SUBI1
+
+
+       lxsspx          vs0,    o0,     AO              // load a0_r
+       lxsspx          vs1,    o4,     AO              // load a0_i
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1              //  load b0_r
+       lxsspx          vs9,    o4,     T1              //  load b0_i
+
+       addi            T1,     T1,8
+
+       lxsspx          vs10,   o0,     T1              //  load b1_r
+       lxsspx          vs11,   o4,     T1              //  load b1_i
+
+       addi            BO,     BO,     16
+
+
+       xsmulsp         vs32,   vs0,    vs8             // a0_r*b0_r
+       xsmulsp         vs33,   vs1,    vs9             // a0_i*b0_i
+       xsmulsp         vs34,   vs0,    vs9             // a0_r*b0_i
+       xsmulsp         vs35,   vs1,    vs8             // a0_i*b0_r
+
+       xsmulsp         vs36,   vs0,    vs10            // a0_r*b1_r
+       xsmulsp         vs37,   vs1,    vs11            // a0_i*b1_i
+       xsmulsp         vs38,   vs0,    vs11            // a0_r*b1_i
+       xsmulsp         vs39,   vs1,    vs10            // a0_i*b1_r
+
+
+.endm
+
+.macro KERNEL2x1_SUB1
+
+
+       lxsspx          vs0,    o0,     AO              // load a0_r
+       lxsspx          vs1,    o4,     AO              // load a0_i
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1              //  load b0_r
+       lxsspx          vs9,    o4,     T1              //  load b0_i
+
+       addi            T1,     T1,8
+
+       lxsspx          vs10,   o0,     T1              //  load b1_r
+       lxsspx          vs11,   o4,     T1              //  load b1_i
+
+       addi            BO,     BO,     16
+
+
+       xsmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r
+       xsmaddasp       vs33,   vs1,    vs9             // a0_i*b0_i
+       xsmaddasp       vs34,   vs0,    vs9             // a0_r*b0_i
+       xsmaddasp       vs35,   vs1,    vs8             // a0_i*b0_r
+
+       xsmaddasp       vs36,   vs0,    vs10            // a0_r*b1_r
+       xsmaddasp       vs37,   vs1,    vs11            // a0_i*b1_i
+       xsmaddasp       vs38,   vs0,    vs11            // a0_r*b1_i
+       xsmaddasp       vs39,   vs1,    vs10            // a0_i*b1_r
+
+
+.endm
+
+.macro SAVE2x1
+
+       mr              T1,     CO
+
+// N=0
+
+       mr              T2,     T1
+
+// N=0 M=0
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+
+#ifndef TRMMKERNEL
+       lxsspx          vs0,    o0,     T2      // load c0_r
+       lxsspx          vs1,    o4,     T2      // load c0_i
+#else
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+#endif
+
+       XSFADD_R1       vs4,    vs4,    vs32            // add a0_r * b0_r
+       XSFADD_I1       vs5,    vs5,    vs35            // add a0_i * b0_r
+
+       XSFADD_R2       vs4,    vs4,    vs33            // add a0_i * b0_i
+       XSFADD_I2       vs5,    vs5,    vs34            // add a0_r * b0_i
+
+       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
+       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
+       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
+       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+
+       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsaddsp         vs0,    vs0,    vs20
+       xsaddsp         vs1,    vs1,    vs21
+
+
+       stxsspx         vs0,    o0,     T2      // store c0_r
+       stxsspx         vs1,    o4,     T2      // store c0_i
+
+       addi            T2,     T2,     8
+       add             T1,     T1,     LDC
+
+
+// N=1
+
+       mr              T2,     T1
+
+// N=1 M=0
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+
+#ifndef TRMMKERNEL
+       lxsspx          vs0,    o0,     T2      // load c0_r
+       lxsspx          vs1,    o4,     T2      // load c0_i
+#else
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+#endif
+
+       XSFADD_R1       vs4,    vs4,    vs36            // add a0_r * b1_r
+       XSFADD_I1       vs5,    vs5,    vs39            // add a0_i * b1_r
+
+       XSFADD_R2       vs4,    vs4,    vs37            // add a0_i * b1_i
+       XSFADD_I2       vs5,    vs5,    vs38            // add a0_r * b1_i
+
+       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
+       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
+       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
+       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+
+       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsaddsp         vs0,    vs0,    vs20
+       xsaddsp         vs1,    vs1,    vs21
+
+
+       stxsspx         vs0,    o0,     T2      // store c0_r
+       stxsspx         vs1,    o4,     T2      // store c0_i
+
+       addi            T2,     T2,     8
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     8
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=8
+**********************************************************************************************/
+
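+/* Note (sketch): with a single column of B only b0 is consumed; BO
+ * advances by 8 bytes per K step and only the b0_r/b0_i splats (vs8/vs9,
+ * resp. vs16/vs17) feed the multiply-adds, so the accumulators vs32-vs39
+ * all belong to one column of C.
+ */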
+.macro LOAD1x8_1
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs1,    o16,    AO              // load a2, a3
+
+       lxvw4x          vs2,    o32,    AO              // load a4, a5
+
+       lxvw4x          vs3,    o48,    AO              // load a6, a7
+
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+
+       addi            BO,     BO,     8
+
+.endm
+
+.macro KERNEL1x8_I1
+
+
+       lxvw4x          vs4,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs5,    o16,    AO              // load a2, a3
+
+       lxvw4x          vs6,    o32,    AO              // load a4, a5
+
+       lxvw4x          vs7,    o48,    AO              // load a6, a7
+
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+
+
+       xxspltw         vs16,   vs24,   0
+       xxspltw         vs17,   vs24,   1
+       xxspltw         vs18,   vs24,   2
+       xxspltw         vs19,   vs24,   3
+
+
+       addi            BO,     BO,     8
+
+
+       xvmulsp         vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmulsp         vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmulsp         vs34,   vs1,    vs8             // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmulsp         vs35,   vs1,    vs9             // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+       xvmulsp         vs36,   vs2,    vs8             // a4_r*b0_r, a4_i*b0_r, a5_r*b0_r, a5_i*b0_r
+       xvmulsp         vs37,   vs2,    vs9             // a4_r*b0_i, a4_i*b0_i, a5_r*b0_i, a5_i*b0_i
+       xvmulsp         vs38,   vs3,    vs8             // a6_r*b0_r, a6_i*b0_r, a7_r*b0_r, a7_i*b0_r
+       xvmulsp         vs39,   vs3,    vs9             // a6_r*b0_i, a6_i*b0_i, a7_r*b0_i, a7_i*b0_i
+
+
+.endm
+
+.macro KERNEL1x8_1
+
+
+       lxvw4x          vs4,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs5,    o16,    AO              // load a2, a3
+
+       lxvw4x          vs6,    o32,    AO              // load a4, a5
+
+       lxvw4x          vs7,    o48,    AO              // load a6, a7
+
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+
+
+       xxspltw         vs16,   vs24,   0
+       xxspltw         vs17,   vs24,   1
+       xxspltw         vs18,   vs24,   2
+       xxspltw         vs19,   vs24,   3
+
+
+       addi            BO,     BO,     8
+
+
+       xvmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmaddasp       vs34,   vs1,    vs8             // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmaddasp       vs35,   vs1,    vs9             // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+       xvmaddasp       vs36,   vs2,    vs8             // a4_r*b0_r, a4_i*b0_r, a5_r*b0_r, a5_i*b0_r
+       xvmaddasp       vs37,   vs2,    vs9             // a4_r*b0_i, a4_i*b0_i, a5_r*b0_i, a5_i*b0_i
+       xvmaddasp       vs38,   vs3,    vs8             // a6_r*b0_r, a6_i*b0_r, a7_r*b0_r, a7_i*b0_r
+       xvmaddasp       vs39,   vs3,    vs9             // a6_r*b0_i, a6_i*b0_i, a7_r*b0_i, a7_i*b0_i
+
+
+.endm
+
+.macro KERNEL1x8_2
+
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs1,    o16,    AO              // load a2, a3
+
+       lxvw4x          vs2,    o32,    AO              // load a4, a5
+
+       lxvw4x          vs3,    o48,    AO              // load a6, a7
+
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+
+       addi            BO,     BO,     8
+
+
+       xvmaddasp       vs32,   vs4,    vs16            // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs4,    vs17            // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmaddasp       vs34,   vs5,    vs16            // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmaddasp       vs35,   vs5,    vs17            // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+       xvmaddasp       vs36,   vs6,    vs16            // a4_r*b0_r, a4_i*b0_r, a5_r*b0_r, a5_i*b0_r
+       xvmaddasp       vs37,   vs6,    vs17            // a4_r*b0_i, a4_i*b0_i, a5_r*b0_i, a5_i*b0_i
+       xvmaddasp       vs38,   vs7,    vs16            // a6_r*b0_r, a6_i*b0_r, a7_r*b0_r, a7_i*b0_r
+       xvmaddasp       vs39,   vs7,    vs17            // a6_r*b0_i, a6_i*b0_i, a7_r*b0_i, a7_i*b0_i
+
+
+.endm
+
+.macro KERNEL1x8_E2
+
+
+       xvmaddasp       vs32,   vs4,    vs16            // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs4,    vs17            // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmaddasp       vs34,   vs5,    vs16            // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmaddasp       vs35,   vs5,    vs17            // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+       xvmaddasp       vs36,   vs6,    vs16            // a4_r*b0_r, a4_i*b0_r, a5_r*b0_r, a5_i*b0_r
+       xvmaddasp       vs37,   vs6,    vs17            // a4_r*b0_i, a4_i*b0_i, a5_r*b0_i, a5_i*b0_i
+       xvmaddasp       vs38,   vs7,    vs16            // a6_r*b0_r, a6_i*b0_r, a7_r*b0_r, a7_i*b0_r
+       xvmaddasp       vs39,   vs7,    vs17            // a6_r*b0_i, a6_i*b0_i, a7_r*b0_i, a7_i*b0_i
+
+
+.endm
+
+.macro KERNEL1x8_SUBI1
+
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs1,    o16,    AO              // load a2, a3
+
+       lxvw4x          vs2,    o32,    AO              // load a4, a5
+
+       lxvw4x          vs3,    o48,    AO              // load a6, a7
+
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+
+       addi            BO,     BO,     8
+
+
+       xvmulsp         vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmulsp         vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmulsp         vs34,   vs1,    vs8             // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmulsp         vs35,   vs1,    vs9             // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+       xvmulsp         vs36,   vs2,    vs8             // a4_r*b0_r, a4_i*b0_r, a5_r*b0_r, a5_i*b0_r
+       xvmulsp         vs37,   vs2,    vs9             // a4_r*b0_i, a4_i*b0_i, a5_r*b0_i, a5_i*b0_i
+       xvmulsp         vs38,   vs3,    vs8             // a6_r*b0_r, a6_i*b0_r, a7_r*b0_r, a7_i*b0_r
+       xvmulsp         vs39,   vs3,    vs9             // a6_r*b0_i, a6_i*b0_i, a7_r*b0_i, a7_i*b0_i
+
+
+.endm
+
+.macro KERNEL1x8_SUB1
+
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs1,    o16,    AO              // load a2, a3
+
+       lxvw4x          vs2,    o32,    AO              // load a4, a5
+
+       lxvw4x          vs3,    o48,    AO              // load a6, a7
+
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+
+       addi            BO,     BO,     8
+
+
+       xvmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmaddasp       vs34,   vs1,    vs8             // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmaddasp       vs35,   vs1,    vs9             // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+       xvmaddasp       vs36,   vs2,    vs8             // a4_r*b0_r, a4_i*b0_r, a5_r*b0_r, a5_i*b0_r
+       xvmaddasp       vs37,   vs2,    vs9             // a4_r*b0_i, a4_i*b0_i, a5_r*b0_i, a5_i*b0_i
+       xvmaddasp       vs38,   vs3,    vs8             // a6_r*b0_r, a6_i*b0_r, a7_r*b0_r, a7_i*b0_r
+       xvmaddasp       vs39,   vs3,    vs9             // a6_r*b0_i, a6_i*b0_i, a7_r*b0_i, a7_i*b0_i
+
+
+.endm
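
Each *_SUB1 invocation is one k step of the N=1, M=8 tail: 64 bytes of packed A (eight complex singles) are loaded, b0_r and b0_i are splat across full vectors, and two interleaved sets of accumulators are kept, one holding every A component times b0_r (vs32/34/36/38) and one times b0_i (vs33/35/37/39); they are only recombined into real and imaginary parts in SAVE1x8. A scalar C model of one step, with illustrative names that are not the kernel's:

        /* Scalar model of one KERNEL1x8_SUB1 step.  ao: 8 packed complex floats
         * of A (16 floats), bo: 1 complex float of B.  acc_br[i] gathers
         * ao[i]*b0_r (vs32/34/36/38), acc_bi[i] gathers ao[i]*b0_i
         * (vs33/35/37/39); SAVE1x8 later combines them per element. */
        static inline void kernel1x8_sub1_model(const float *ao, const float *bo,
                                                float acc_br[16], float acc_bi[16])
        {
            const float b0_r = bo[0], b0_i = bo[1];
            for (int i = 0; i < 16; i++) {
                acc_br[i] += ao[i] * b0_r;
                acc_bi[i] += ao[i] * b0_i;
            }
        }
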
+
+.macro SAVE1x8
+
+       mr              T1,     CO
+
+// N=0
+
+       mr              T2,     T1
+
+// N=0 M=0
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+       stxvw4x         vs32,   o0,     TBUFFER
+
+       lxsspx          vs8,    o0,     TBUFFER
+       lxsspx          vs9,    o4,     TBUFFER
+       lxsspx          vs10,   o8,     TBUFFER
+       lxsspx          vs11,   o12,    TBUFFER
+
+       stxvw4x         vs33,   o0,     TBUFFER
+
+       lxsspx          vs12,   o0,     TBUFFER
+       lxsspx          vs13,   o4,     TBUFFER
+       lxsspx          vs14,   o8,     TBUFFER
+       lxsspx          vs15,   o12,    TBUFFER
+
+       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
+       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
+       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
+       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+
+       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
+       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
+       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
+       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+
+       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
+       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
+       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
+       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
+       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+
+// N=0 M=2
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+       stxvw4x         vs34,   o0,     TBUFFER
+
+       lxsspx          vs8,    o0,     TBUFFER
+       lxsspx          vs9,    o4,     TBUFFER
+       lxsspx          vs10,   o8,     TBUFFER
+       lxsspx          vs11,   o12,    TBUFFER
+
+       stxvw4x         vs35,   o0,     TBUFFER
+
+       lxsspx          vs12,   o0,     TBUFFER
+       lxsspx          vs13,   o4,     TBUFFER
+       lxsspx          vs14,   o8,     TBUFFER
+       lxsspx          vs15,   o12,    TBUFFER
+
+       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
+       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
+       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
+       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+
+       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
+       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
+       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
+       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+
+       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
+       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
+       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
+       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
+       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+
+// N=0 M=4
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+       stxvw4x         vs36,   o0,     TBUFFER
+
+       lxsspx          vs8,    o0,     TBUFFER
+       lxsspx          vs9,    o4,     TBUFFER
+       lxsspx          vs10,   o8,     TBUFFER
+       lxsspx          vs11,   o12,    TBUFFER
+
+       stxvw4x         vs37,   o0,     TBUFFER
+
+       lxsspx          vs12,   o0,     TBUFFER
+       lxsspx          vs13,   o4,     TBUFFER
+       lxsspx          vs14,   o8,     TBUFFER
+       lxsspx          vs15,   o12,    TBUFFER
+
+       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
+       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
+       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
+       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+
+       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
+       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
+       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
+       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+
+       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
+       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
+       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
+       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
+       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+
+// N=0 M=6
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+       stxvw4x         vs38,   o0,     TBUFFER
+
+       lxsspx          vs8,    o0,     TBUFFER
+       lxsspx          vs9,    o4,     TBUFFER
+       lxsspx          vs10,   o8,     TBUFFER
+       lxsspx          vs11,   o12,    TBUFFER
+
+       stxvw4x         vs39,   o0,     TBUFFER
+
+       lxsspx          vs12,   o0,     TBUFFER
+       lxsspx          vs13,   o4,     TBUFFER
+       lxsspx          vs14,   o8,     TBUFFER
+       lxsspx          vs15,   o12,    TBUFFER
+
+       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
+       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
+       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
+       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+
+       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
+       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
+       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
+       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+
+       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
+       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
+       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
+       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
+       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     64
+
+.endm
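
SAVE1x8 moves each accumulator pair through TBUFFER (a small scratch area on the stack) with stxvw4x/lxsspx to pick the four 32-bit lanes apart; the XSFADD_R*/XSFADD_I* macros then fold the partial products with the sign pattern of the selected conjugation variant, the result is scaled by alpha, and C is accumulated (GEMM) or written from scratch (TRMMKERNEL). For the plain non-conjugated case the per-element update is essentially the following sketch (all names illustrative):

        /* Per-element update performed by the SAVE macros, non-conjugated case
         * only (the XSFADD_* macros flip signs for the conjugated variants).
         * rr, ri, ir, ii are the accumulated a_r*b_r, a_r*b_i, a_i*b_r, a_i*b_i. */
        static inline void save_element(float *c, float rr, float ri, float ir,
                                        float ii, float alpha_r, float alpha_i,
                                        int trmm)
        {
            float t_r = rr - ii;                        /* XSFADD_R1 / XSFADD_R2 */
            float t_i = ri + ir;                        /* XSFADD_I1 / XSFADD_I2 */
            float out_r = t_r * alpha_r - t_i * alpha_i;
            float out_i = t_r * alpha_i + t_i * alpha_r;
            if (trmm) {              /* TRMMKERNEL: C tile starts from zero */
                c[0] = out_r;
                c[1] = out_i;
            } else {                 /* GEMM: accumulate into C */
                c[0] += out_r;
                c[1] += out_i;
            }
        }
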
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=4
+**********************************************************************************************/
+
+.macro LOAD1x4_1
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs1,    o16,    AO              // load a2, a3
+
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+
+       addi            BO,     BO,     8
+
+.endm
+
+.macro KERNEL1x4_I1
+
+
+       lxvw4x          vs4,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs5,    o16,    AO              // load a2, a3
+
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+
+
+       xxspltw         vs16,   vs24,   0
+       xxspltw         vs17,   vs24,   1
+       xxspltw         vs18,   vs24,   2
+       xxspltw         vs19,   vs24,   3
+
+
+       addi            BO,     BO,     8
+
+
+       xvmulsp         vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmulsp         vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmulsp         vs34,   vs1,    vs8             // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmulsp         vs35,   vs1,    vs9             // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+
+
+.endm
+
+.macro KERNEL1x4_1
+
+
+       lxvw4x          vs4,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs5,    o16,    AO              // load a2, a3
+
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+
+
+       xxspltw         vs16,   vs24,   0
+       xxspltw         vs17,   vs24,   1
+       xxspltw         vs18,   vs24,   2
+       xxspltw         vs19,   vs24,   3
+
+
+       addi            BO,     BO,     8
+
+
+       xvmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmaddasp       vs34,   vs1,    vs8             // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmaddasp       vs35,   vs1,    vs9             // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+
+
+.endm
+
+.macro KERNEL1x4_2
+
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs1,    o16,    AO              // load a2, a3
+
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+
+       addi            BO,     BO,     8
+
+
+       xvmaddasp       vs32,   vs4,    vs16            // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs4,    vs17            // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmaddasp       vs34,   vs5,    vs16            // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmaddasp       vs35,   vs5,    vs17            // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+
+
+.endm
+
+.macro KERNEL1x4_E2
+
+
+       xvmaddasp       vs32,   vs4,    vs16            // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs4,    vs17            // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmaddasp       vs34,   vs5,    vs16            // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmaddasp       vs35,   vs5,    vs17            // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+
+
+.endm
+
+.macro KERNEL1x4_SUBI1
+
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs1,    o16,    AO              // load a2, a3
+
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+
+       addi            BO,     BO,     8
+
+
+       xvmulsp         vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmulsp         vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmulsp         vs34,   vs1,    vs8             // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmulsp         vs35,   vs1,    vs9             // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+
+
+.endm
+
+.macro KERNEL1x4_SUB1
+
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs1,    o16,    AO              // load a2, a3
+
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+
+       addi            BO,     BO,     8
+
+
+       xvmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmaddasp       vs34,   vs1,    vs8             // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmaddasp       vs35,   vs1,    vs9             // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+
+
+.endm
+
+.macro SAVE1x4
+
+       mr              T1,     CO
+
+// N=0
+
+       mr              T2,     T1
+
+// N=0 M=0
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+       stxvw4x         vs32,   o0,     TBUFFER
+
+       lxsspx          vs8,    o0,     TBUFFER
+       lxsspx          vs9,    o4,     TBUFFER
+       lxsspx          vs10,   o8,     TBUFFER
+       lxsspx          vs11,   o12,    TBUFFER
+
+       stxvw4x         vs33,   o0,     TBUFFER
+
+       lxsspx          vs12,   o0,     TBUFFER
+       lxsspx          vs13,   o4,     TBUFFER
+       lxsspx          vs14,   o8,     TBUFFER
+       lxsspx          vs15,   o12,    TBUFFER
+
+       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
+       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
+       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
+       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+
+       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
+       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
+       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
+       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+
+       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
+       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
+       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
+       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
+       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+
+// N=0 M=2
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+       stxvw4x         vs34,   o0,     TBUFFER
+
+       lxsspx          vs8,    o0,     TBUFFER
+       lxsspx          vs9,    o4,     TBUFFER
+       lxsspx          vs10,   o8,     TBUFFER
+       lxsspx          vs11,   o12,    TBUFFER
+
+       stxvw4x         vs35,   o0,     TBUFFER
+
+       lxsspx          vs12,   o0,     TBUFFER
+       lxsspx          vs13,   o4,     TBUFFER
+       lxsspx          vs14,   o8,     TBUFFER
+       lxsspx          vs15,   o12,    TBUFFER
+
+       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
+       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
+       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
+       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+
+       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
+       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
+       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
+       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+
+       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
+       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
+       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
+       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
+       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     32
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=2
+**********************************************************************************************/
+
+.macro LOAD1x2_1
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+
+       addi            BO,     BO,     8
+
+.endm
+
+.macro KERNEL1x2_I1
+
+
+       lxvw4x          vs4,    o0,     AO              // load a0, a1
+
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+
+
+       xxspltw         vs16,   vs24,   0
+       xxspltw         vs17,   vs24,   1
+       xxspltw         vs18,   vs24,   2
+       xxspltw         vs19,   vs24,   3
+
+
+       addi            BO,     BO,     8
+
+
+       xvmulsp         vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmulsp         vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+
+.endm
+
+.macro KERNEL1x2_1
+
+
+       lxvw4x          vs4,    o0,     AO              // load a0, a1
+
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+
+
+       xxspltw         vs16,   vs24,   0
+       xxspltw         vs17,   vs24,   1
+       xxspltw         vs18,   vs24,   2
+       xxspltw         vs19,   vs24,   3
+
+
+       addi            BO,     BO,     8
+
+
+       xvmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+
+.endm
+
+.macro KERNEL1x2_2
+
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+
+       addi            BO,     BO,     8
+
+
+       xvmaddasp       vs32,   vs4,    vs16            // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs4,    vs17            // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+
+.endm
+
+.macro KERNEL1x2_E2
+
+
+       xvmaddasp       vs32,   vs4,    vs16            // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs4,    vs17            // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+
+.endm
+
+.macro KERNEL1x2_SUBI1
+
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+
+       addi            BO,     BO,     8
+
+
+       xvmulsp         vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmulsp         vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+
+.endm
+
+.macro KERNEL1x2_SUB1
+
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+
+       addi            BO,     BO,     8
+
+
+       xvmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+
+.endm
+
+.macro SAVE1x2
+
+       mr              T1,     CO
+
+// N=0
+
+       mr              T2,     T1
+
+// N=0 M=0
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+       stxvw4x         vs32,   o0,     TBUFFER
+
+       lxsspx          vs8,    o0,     TBUFFER
+       lxsspx          vs9,    o4,     TBUFFER
+       lxsspx          vs10,   o8,     TBUFFER
+       lxsspx          vs11,   o12,    TBUFFER
+
+       stxvw4x         vs33,   o0,     TBUFFER
+
+       lxsspx          vs12,   o0,     TBUFFER
+       lxsspx          vs13,   o4,     TBUFFER
+       lxsspx          vs14,   o8,     TBUFFER
+       lxsspx          vs15,   o12,    TBUFFER
+
+       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
+       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
+       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
+       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+
+       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
+       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
+       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
+       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+
+       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
+       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
+       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
+       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
+       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     16
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=1
+**********************************************************************************************/
+
+.macro LOAD1x1_1
+
+       lxsspx          vs0,    o0,     AO              // load a0_r
+       lxsspx          vs1,    o4,     AO              // load a0_i
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1              //  load b0_r
+       lxsspx          vs9,    o4,     T1              //  load b0_i
+
+       addi            BO,     BO,     8
+
+.endm
+
+.macro KERNEL1x1_I1
+
+
+       lxsspx          vs4,    o0,     AO              // load a0_r
+       lxsspx          vs5,    o4,     AO              // load a0_i
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs16,   o0,     T1              //  load b0_r
+       lxsspx          vs17,   o4,     T1              //  load b0_i
+
+       addi            BO,     BO,     8
+
+
+       xsmulsp         vs32,   vs0,    vs8             // a0_r*b0_r
+       xsmulsp         vs33,   vs1,    vs9             // a0_i*b0_i
+       xsmulsp         vs34,   vs0,    vs9             // a0_r*b0_i
+       xsmulsp         vs35,   vs1,    vs8             // a0_i*b0_r
+
+
+.endm
+
+.macro KERNEL1x1_1
+
+
+       lxsspx          vs4,    o0,     AO              // load a0_r
+       lxsspx          vs5,    o4,     AO              // load a0_i
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs16,   o0,     T1              //  load b0_r
+       lxsspx          vs17,   o4,     T1              //  load b0_i
+
+       addi            BO,     BO,     8
+
+
+       xsmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r
+       xsmaddasp       vs33,   vs1,    vs9             // a0_i*b0_i
+       xsmaddasp       vs34,   vs0,    vs9             // a0_r*b0_i
+       xsmaddasp       vs35,   vs1,    vs8             // a0_i*b0_r
+
+
+.endm
+
+.macro KERNEL1x1_2
+
+
+       lxsspx          vs0,    o0,     AO              // load a0_r
+       lxsspx          vs1,    o4,     AO              // load a0_i
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1              //  load b0_r
+       lxsspx          vs9,    o4,     T1              //  load b0_i
+
+       addi            BO,     BO,     8
+
+
+       xsmaddasp       vs32,   vs4,    vs16            // a0_r*b0_r
+       xsmaddasp       vs33,   vs5,    vs17            // a0_i*b0_i
+       xsmaddasp       vs34,   vs4,    vs17            // a0_r*b0_i
+       xsmaddasp       vs35,   vs5,    vs16            // a0_i*b0_r
+
+
+.endm
+
+.macro KERNEL1x1_E2
+
+
+       xsmaddasp       vs32,   vs4,    vs16            // a0_r*b0_r
+       xsmaddasp       vs33,   vs5,    vs17            // a0_i*b0_i
+       xsmaddasp       vs34,   vs4,    vs17            // a0_r*b0_i
+       xsmaddasp       vs35,   vs5,    vs16            // a0_i*b0_r
+
+
+.endm
+
+.macro KERNEL1x1_SUBI1
+
+
+       lxsspx          vs0,    o0,     AO              // load a0_r
+       lxsspx          vs1,    o4,     AO              // load a0_i
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1              //  load b0_r
+       lxsspx          vs9,    o4,     T1              //  load b0_i
+
+       addi            BO,     BO,     8
+
+
+       xsmulsp         vs32,   vs0,    vs8             // a0_r*b0_r
+       xsmulsp         vs33,   vs1,    vs9             // a0_i*b0_i
+       xsmulsp         vs34,   vs0,    vs9             // a0_r*b0_i
+       xsmulsp         vs35,   vs1,    vs8             // a0_i*b0_r
+
+
+.endm
+
+.macro KERNEL1x1_SUB1
+
+
+       lxsspx          vs0,    o0,     AO              // load a0_r
+       lxsspx          vs1,    o4,     AO              // load a0_i
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1              //  load b0_r
+       lxsspx          vs9,    o4,     T1              //  load b0_i
+
+       addi            BO,     BO,     8
+
+
+       xsmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r
+       xsmaddasp       vs33,   vs1,    vs9             // a0_i*b0_i
+       xsmaddasp       vs34,   vs0,    vs9             // a0_r*b0_i
+       xsmaddasp       vs35,   vs1,    vs8             // a0_i*b0_r
+
+
+.endm
+
+.macro SAVE1x1
+
+       mr              T1,     CO
+
+// N=0
+
+       mr              T2,     T1
+
+// N=0 M=0
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+
+#ifndef TRMMKERNEL
+       lxsspx          vs0,    o0,     T2      // load c0_r
+       lxsspx          vs1,    o4,     T2      // load c0_i
+#else
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+#endif
+
+       XSFADD_R1       vs4,    vs4,    vs32            // add a0_r * b0_r
+       XSFADD_I1       vs5,    vs5,    vs35            // add a0_i * b0_r
+
+       XSFADD_R2       vs4,    vs4,    vs33            // add a0_i * b0_i
+       XSFADD_I2       vs5,    vs5,    vs34            // add a0_r * b0_i
+
+       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
+       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
+       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
+       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+
+       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsaddsp         vs0,    vs0,    vs20
+       xsaddsp         vs1,    vs1,    vs21
+
+
+       stxsspx         vs0,    o0,     T2      // store c0_r
+       stxsspx         vs1,    o4,     T2      // store c0_i
+
+       addi            T2,     T2,     8
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     8
+
+.endm
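
The M=1 tail works entirely in scalars: KERNEL1x1_* keep four running sums (a_r*b_r, a_i*b_i, a_r*b_i, a_i*b_r in vs32..vs35) and SAVE1x1 folds them, scales by alpha and updates C, which is loaded and accumulated for GEMM but starts from zero when TRMMKERNEL is defined. A compact C model of the whole path for the non-conjugated GEMM case, with illustrative names:

        /* End-to-end model of the M=1, N=1 path (KERNEL1x1_SUB1 over K followed
         * by SAVE1x1), non-conjugated case, GEMM flavour (C accumulated).
         * ao/bo hold packed complex singles. */
        static void cgemm_1x1_model(long K, float alpha_r, float alpha_i,
                                    const float *ao, const float *bo, float *c)
        {
            float rr = 0.0f, ii = 0.0f, ri = 0.0f, ir = 0.0f;
            for (long k = 0; k < K; k++) {
                rr += ao[2*k]     * bo[2*k];        /* vs32: a0_r*b0_r */
                ii += ao[2*k + 1] * bo[2*k + 1];    /* vs33: a0_i*b0_i */
                ri += ao[2*k]     * bo[2*k + 1];    /* vs34: a0_r*b0_i */
                ir += ao[2*k + 1] * bo[2*k];        /* vs35: a0_i*b0_r */
            }
            float t_r = rr - ii;                    /* XSFADD_R1 / XSFADD_R2 */
            float t_i = ir + ri;                    /* XSFADD_I1 / XSFADD_I2 */
            c[0] += t_r * alpha_r - t_i * alpha_i;
            c[1] += t_r * alpha_i + t_i * alpha_r;
        }
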
+
diff --git a/kernel/power/ctrmm_kernel_8x4_power8.S b/kernel/power/ctrmm_kernel_8x4_power8.S
new file mode 100644 (file)
index 0000000..b154857
--- /dev/null
@@ -0,0 +1,385 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/18 Werner Saar (wernsaar@googlemail.com)
+*       BLASTEST               : OK
+*       CTEST                  : OK
+*       TEST                   : OK
+*       LAPACK-TEST            : OK
+**************************************************************************************/
+
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* All rights reserved.                                              */
+/*                                                                   */
+/* Redistribution and use in source and binary forms, with or        */
+/* without modification, are permitted provided that the following   */
+/* conditions are met:                                               */
+/*                                                                   */
+/*   1. Redistributions of source code must retain the above         */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer.                                                  */
+/*                                                                   */
+/*   2. Redistributions in binary form must reproduce the above      */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer in the documentation and/or other materials       */
+/*      provided with the distribution.                              */
+/*                                                                   */
+/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
+/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
+/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
+/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
+/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
+/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
+/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
+/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
+/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
+/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
+/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
+/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
+/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
+/*    POSSIBILITY OF SUCH DAMAGE.                                    */
+/*                                                                   */
+/* The views and conclusions contained in the software and           */
+/* documentation are those of the authors and should not be          */
+/* interpreted as representing official policies, either expressed   */
+/* or implied, of The University of Texas at Austin.                 */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "def_vsx.h"
+
+#ifndef __64BIT__
+#define LOAD   lwz
+#else
+#define LOAD   ld
+#endif
+
+#ifdef __64BIT__
+#define STACKSIZE 400
+#define ALPHA_R_SP 304(SP)
+#define ALPHA_I_SP 312(SP)
+#else
+#define STACKSIZE 256
+#define ALPHA_R_SP 224(SP)
+#define ALPHA_I_SP 232(SP)
+#define FZERO  240(SP)
+#endif
+
+#define        M       r3
+#define        N       r4
+#define        K       r5
+
+#ifdef linux
+#ifndef __64BIT__
+#define A      r6
+#define        B       r7
+#define        C       r8
+#define        LDC     r9
+#define OFFSET r10
+#else
+#define A      r8
+#define        B       r9
+#define        C       r10
+#define        LDC     r6
+#define OFFSET r7
+#endif
+#endif
+
+#if defined(_AIX) || defined(__APPLE__)
+#if !defined(__64BIT__) && defined(DOUBLE)
+#define A      r10
+#define        B       r6
+#define        C       r7
+#define        LDC     r8
+#define OFFSET r9
+#else
+#define A      r8
+#define        B       r9
+#define        C       r10
+#define        LDC     r6
+#define OFFSET r7
+#endif
+#endif
+
+#define o0     0
+#define alpha_r vs30
+#define alpha_i vs31
+#define alpha_vr vs28
+#define alpha_vi vs29
+
+
+#define o12    r12
+#define KKK    r13
+#define K1     r14
+#define L      r15
+#define o16    r16
+#define TBUFFER        r17
+#define T2     r19
+#define KK     r20
+#define        o8      r21
+#define        I       r22
+#define J      r23
+#define AO     r24
+#define        BO      r25
+#define        CO      r26
+#define o4     r27
+#define        o32     r28
+#define o48    r29
+
+#define PRE    r30
+#define T1     r31
+
+#ifndef NEEDPARAM
+
+       PROLOGUE
+       PROFCODE
+
+       addi    SP, SP, -STACKSIZE
+       li      r0, 0
+
+       stfd    f14,    0(SP)
+       stfd    f15,    8(SP)
+       stfd    f16,   16(SP)
+       stfd    f17,   24(SP)
+
+       stfd    f18,   32(SP)
+       stfd    f19,   40(SP)
+       stfd    f20,   48(SP)
+       stfd    f21,   56(SP)
+
+       stfd    f22,   64(SP)
+       stfd    f23,   72(SP)
+       stfd    f24,   80(SP)
+       stfd    f25,   88(SP)
+
+       stfd    f26,   96(SP)
+       stfd    f27,  104(SP)
+       stfd    f28,  112(SP)
+       stfd    f29,  120(SP)
+
+       stfd    f30,  128(SP)
+       stfd    f31,  136(SP)
+
+#ifdef __64BIT__
+       std     r31,  144(SP)
+       std     r30,  152(SP)
+       std     r29,  160(SP)
+       std     r28,  168(SP)
+       std     r27,  176(SP)
+       std     r26,  184(SP)
+       std     r25,  192(SP)
+       std     r24,  200(SP)
+       std     r23,  208(SP)
+       std     r22,  216(SP)
+       std     r21,  224(SP)
+       std     r20,  232(SP)
+       std     r19,  240(SP)
+       std     r18,  248(SP)
+       std     r17,  256(SP)
+       std     r16,  264(SP)
+       std     r15,  272(SP)
+       std     r14,  280(SP)
+       std     r13,  288(SP)
+       std     r12,  296(SP)
+#else
+       stw     r31,  144(SP)
+       stw     r30,  148(SP)
+       stw     r29,  152(SP)
+       stw     r28,  156(SP)
+       stw     r27,  160(SP)
+       stw     r26,  164(SP)
+       stw     r25,  168(SP)
+       stw     r24,  172(SP)
+       stw     r23,  176(SP)
+       stw     r22,  180(SP)
+       stw     r21,  184(SP)
+       stw     r20,  188(SP)
+       stw     r19,  192(SP)
+       stw     r18,  196(SP)
+       stw     r17,  200(SP)
+       stw     r16,  204(SP)
+       stw     r15,  208(SP)
+       stw     r14,  212(SP)
+       stw     r13,  216(SP)
+#endif
+
+       stfs    f1,  ALPHA_R_SP
+       stfs    f2,  ALPHA_I_SP
+       // stw  r0,  FZERO
+
+#ifdef linux
+#ifdef __64BIT__
+       ld      LDC, FRAMESLOT(0) + STACKSIZE(SP)
+#endif
+#endif
+
+#if defined(_AIX) || defined(__APPLE__)
+#ifdef __64BIT__
+       ld      LDC, FRAMESLOT(0) + STACKSIZE(SP)
+#else
+#ifdef DOUBLE
+       lwz     B,   FRAMESLOT(0) + STACKSIZE(SP)
+       lwz     C,   FRAMESLOT(1) + STACKSIZE(SP)
+       lwz     LDC, FRAMESLOT(2) + STACKSIZE(SP)
+#else
+       lwz     LDC, FRAMESLOT(0) + STACKSIZE(SP)
+#endif
+#endif
+#endif
+
+#ifdef TRMMKERNEL
+#if defined(linux) && defined(__64BIT__)
+       ld      OFFSET,  FRAMESLOT(1) + STACKSIZE(SP)
+#endif
+
+#if defined(_AIX) || defined(__APPLE__)
+#ifdef __64BIT__
+       ld      OFFSET,  FRAMESLOT(1) + STACKSIZE(SP)
+#else
+#ifdef DOUBLE
+       lwz     OFFSET,  FRAMESLOT(3) + STACKSIZE(SP)
+#else
+       lwz     OFFSET,  FRAMESLOT(1) + STACKSIZE(SP)
+#endif
+#endif
+#endif
+#if defined(TRMMKERNEL) && !defined(LEFT)
+       neg     KK, OFFSET
+#endif
+#endif
+
+#include "cgemm_macros_8x4_power8.S"
+
+       cmpwi   cr0, M, 0
+       ble     .L999_H1
+       cmpwi   cr0, N, 0
+       ble     .L999_H1
+       cmpwi   cr0, K, 0
+       ble     .L999_H1
+
+        slwi    LDC, LDC, ZBASE_SHIFT
+        li      PRE, 256
+        li      o4  , 4
+        li      o8  , 8
+        li      o12 , 12
+        li      o16 , 16
+        li      o32 , 32
+        li      o48 , 48
+       addi    TBUFFER, SP, 360
+
+
+#ifdef __64BIT__
+       addi    T1, SP, 304
+#else
+       addi    T1, SP, 224
+#endif
+
+       lxsspx  alpha_r, 0, T1
+       lxsspx  alpha_i, o8, T1
+
+       .align 5
+
+#include "ctrmm_logic_8x4_power8.S"
+
+.L999:
+       addi    r3, 0, 0
+
+       lfd     f14,    0(SP)
+       lfd     f15,    8(SP)
+       lfd     f16,   16(SP)
+       lfd     f17,   24(SP)
+
+       lfd     f18,   32(SP)
+       lfd     f19,   40(SP)
+       lfd     f20,   48(SP)
+       lfd     f21,   56(SP)
+
+       lfd     f22,   64(SP)
+       lfd     f23,   72(SP)
+       lfd     f24,   80(SP)
+       lfd     f25,   88(SP)
+
+       lfd     f26,   96(SP)
+       lfd     f27,  104(SP)
+       lfd     f28,  112(SP)
+       lfd     f29,  120(SP)
+
+       lfd     f30,  128(SP)
+       lfd     f31,  136(SP)
+
+#ifdef __64BIT__
+       ld      r31,  144(SP)
+       ld      r30,  152(SP)
+       ld      r29,  160(SP)
+       ld      r28,  168(SP)
+       ld      r27,  176(SP)
+       ld      r26,  184(SP)
+       ld      r25,  192(SP)
+       ld      r24,  200(SP)
+       ld      r23,  208(SP)
+       ld      r22,  216(SP)
+       ld      r21,  224(SP)
+       ld      r20,  232(SP)
+       ld      r19,  240(SP)
+       ld      r18,  248(SP)
+       ld      r17,  256(SP)
+       ld      r16,  264(SP)
+       ld      r15,  272(SP)
+       ld      r14,  280(SP)
+       ld      r13,  288(SP)
+       ld      r12,  296(SP)
+#else
+       lwz     r31,  144(SP)
+       lwz     r30,  148(SP)
+       lwz     r29,  152(SP)
+       lwz     r28,  156(SP)
+       lwz     r27,  160(SP)
+       lwz     r26,  164(SP)
+       lwz     r25,  168(SP)
+       lwz     r24,  172(SP)
+       lwz     r23,  176(SP)
+       lwz     r22,  180(SP)
+       lwz     r21,  184(SP)
+       lwz     r20,  188(SP)
+       lwz     r19,  192(SP)
+       lwz     r18,  196(SP)
+       lwz     r17,  200(SP)
+       lwz     r16,  204(SP)
+       lwz     r15,  208(SP)
+       lwz     r14,  212(SP)
+       lwz     r13,  216(SP)
+#endif
+
+       addi    SP, SP, STACKSIZE
+
+       blr
+
+       EPILOGUE
+#endif
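
The prologue and epilogue above follow the usual OpenBLAS kernel calling convention: m, n, k arrive in the first integer argument registers, alpha_r/alpha_i in f1/f2, and on 64-bit Linux ldc and (for TRMM) offset spill to the caller's frame, which is why they are reloaded via FRAMESLOT(). The presumed C-level prototype, a sketch rather than a declaration taken from the patch:

        /* Presumed C-level interface of ctrmm_kernel_8x4_power8.S, following the
         * usual OpenBLAS trmm kernel signature; "long" stands in for BLASLONG. */
        int ctrmm_kernel(long m, long n, long k,
                         float alpha_r, float alpha_i,
                         float *a, float *b, float *c,
                         long ldc, long offset);
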
diff --git a/kernel/power/ctrmm_logic_8x4_power8.S b/kernel/power/ctrmm_logic_8x4_power8.S
new file mode 100644 (file)
index 0000000..f9656e9
--- /dev/null
@@ -0,0 +1,1756 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/18 Werner Saar (wernsaar@googlemail.com)
+*       BLASTEST               : OK
+*       CTEST                  : OK
+*       TEST                   : OK
+*       LAPACK-TEST            : OK
+**************************************************************************************/
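
The logic below blocks N in chunks of 4 and M in chunks of 8, 4, 2 and 1; for each tile it derives the effective inner trip count (KKK) and, depending on LEFT/TRANSA, skips the parts of the packed A/B panels that lie outside the triangle. A hedged C sketch of that KK/KKK bookkeeping for the 8x4 path, with offsets counted in complex elements (the assembly shifts them into bytes, <<6 for A and <<5 for B); all names are illustrative:

        /* Sketch of the TRMM offset bookkeeping done at .LCTRMM_L4x8_BEGIN for an
         * mr x nr = 8 x 4 tile.  kk is the running OFFSET-derived counter; the
         * returned ktemp is what the K loop actually iterates (KKK/K1 above). */
        typedef struct { long ao_elems, bo_elems, ktemp; } trmm_tile_t;

        static trmm_tile_t trmm_tile_setup(long K, long kk, int left, int transa,
                                           int mr, int nr)
        {
            trmm_tile_t t = { 0, 0, 0 };
            if (!((left && transa) || (!left && !transa))) {
                t.ao_elems = kk * mr;          /* advance into the packed A panel */
                t.bo_elems = kk * nr;          /* advance into the packed B panel */
            }
            if ((left && !transa) || (!left && transa))
                t.ktemp = K - kk;              /* only the trailing K part counts */
            else
                t.ktemp = kk + (left ? mr : nr);
            return t;
        }
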
+
+
+       srawi.          J,      N,      2
+       ble             .LCTRMM_L4_END
+
+.LCTRMM_L4_BEGIN:
+
+       mr              CO,     C
+       mr              AO,     A
+       slwi            T1,     LDC     ,       2
+       add             C,      C,      T1
+
+#if defined(LEFT)
+       mr              KK,     OFFSET          // OFFSET -> KK
+#endif
+
+       srawi.          I,      M,      3
+       ble             .LCTRMM_L4x8_END
+
+.LCTRMM_L4x8_BEGIN:
+
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       mr              BO,     B                                       // B -> BO
+#else
+       mr              BO,     B                                       // B -> BO
+       slwi            T1,     KK,     5                               // Number of values in B shifted
+       slwi            T2,     KK,     6                               // Number of values in A shifted
+       add             BO,     BO,     T1                              // Add values to BO
+       add             AO,     AO,     T2                              // Add values to AO
+#endif
+
+#if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       sub             T1,     K,      KK                              // K - KK -> TEMP1
+#else
+       mr              T1,     KK                                      // KK -> KTEMP
+#ifdef LEFT
+       addi            T1,     T1,     8                               // KTEMP + Number of values in A -> KTEMP
+#else
+       addi            T1,     T1,     4                               // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+       mr              KKK,    T1
+       mr              K1,     T1
+       srawi.          L,      K1,     3                               // KTEMP / 8 -> L
+       ble             .LCTRMM_L4x8_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LCTRMM_L4x8_SUB4
+
+.LCTRMM_L4x8_LOOP_START:
+
+       LOAD4x8_1
+       KERNEL4x8_I1
+       KERNEL4x8_2
+       KERNEL4x8_1
+       KERNEL4x8_2
+
+       KERNEL4x8_1
+       KERNEL4x8_2
+       KERNEL4x8_1
+       KERNEL4x8_2
+
+       addic.          L,      L,      -2
+       ble             .LCTRMM_L4x8_LOOP_END
+
+       .align 5
+
+.LCTRMM_L4x8_LOOP:
+
+       KERNEL4x8_1
+       KERNEL4x8_2
+       KERNEL4x8_1
+       KERNEL4x8_2
+
+       KERNEL4x8_1
+       KERNEL4x8_2
+       KERNEL4x8_1
+       KERNEL4x8_2
+
+       addic.          L,      L,      -1
+       bgt             .LCTRMM_L4x8_LOOP
+
+.LCTRMM_L4x8_LOOP_END:
+
+       KERNEL4x8_1
+       KERNEL4x8_2
+       KERNEL4x8_1
+       KERNEL4x8_2
+
+       KERNEL4x8_1
+       KERNEL4x8_2
+       KERNEL4x8_1
+       KERNEL4x8_E2
+
+       b               .LCTRMM_L4x8_SUB1
+
+.LCTRMM_L4x8_SUB4:
+
+       KERNEL4x8_SUBI1
+       KERNEL4x8_SUB1
+       KERNEL4x8_SUB1
+       KERNEL4x8_SUB1
+
+       KERNEL4x8_SUB1
+       KERNEL4x8_SUB1
+       KERNEL4x8_SUB1
+       KERNEL4x8_SUB1
+
+       b               .LCTRMM_L4x8_SUB1
+
+.LCTRMM_L4x8_SUB0:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+
+       KERNEL4x8_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LCTRMM_L4x8_SAVE
+       b               .LCTRMM_L4x8_SUB2
+
+.LCTRMM_L4x8_SUB1:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+       ble             .LCTRMM_L4x8_SAVE
+
+.LCTRMM_L4x8_SUB2:
+
+       KERNEL4x8_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LCTRMM_L4x8_SUB2
+
+.LCTRMM_L4x8_SAVE:
+
+       SAVE4x8
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
+       slwi            T2,     T1,     5                       // TEMP1 * Number of values in B shifted -> TEMP2
+       slwi            T1,     T1,     6                       // TEMP1 * Number of values in A shifted -> TEMP1
+       add             BO,     BO,     T2                                      // BO += TEMP2 * number of values in B shifted
+       add             AO,     AO,     T1                                      // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+       addi            KK,     KK,     8                               // KK += Number of values in A
+#endif
+
+
+       addic.          I,      I,      -1
+       bgt             .LCTRMM_L4x8_BEGIN
+
+.LCTRMM_L4x8_END:
+
+.LCTRMM_L4x4_BEGIN:
+       andi.           T2,     M,      7
+       ble             .LCTRMM_L4x1_END
+
+       andi.           T1,     M,      4
+       ble             .LCTRMM_L4x4_END
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       mr              BO,     B                                       // B -> BO
+#else
+       mr              BO,     B                                       // B -> BO
+       slwi            T1,     KK,     5                               // Number of values in B shifted
+       slwi            T2,     KK,     5                               // Number of values in A shifted
+       add             BO,     BO,     T1                              // Add values to BO
+       add             AO,     AO,     T2                              // Add values to AO
+#endif
+
+#if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       sub             T1,     K,      KK                              // K - KK -> TEMP1
+#else
+       mr              T1,     KK                                      // KK -> KTEMP
+#ifdef LEFT
+       addi            T1,     T1,     4                               // KTEMP + Number of values in A -> KTEMP
+#else
+       addi            T1,     T1,     4                               // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+       mr              KKK,    T1
+       mr              K1,     T1
+       srawi.          L,      K1,     3                               // KTEMP / 8 -> L
+       ble             .LCTRMM_L4x4_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LCTRMM_L4x4_SUB4
+
+.LCTRMM_L4x4_LOOP_START:
+
+       LOAD4x4_1
+       KERNEL4x4_I1
+       KERNEL4x4_2
+       KERNEL4x4_1
+       KERNEL4x4_2
+
+       KERNEL4x4_1
+       KERNEL4x4_2
+       KERNEL4x4_1
+       KERNEL4x4_2
+
+       addic.          L,      L,      -2
+       ble             .LCTRMM_L4x4_LOOP_END
+
+       .align 5
+
+.LCTRMM_L4x4_LOOP:
+
+       KERNEL4x4_1
+       KERNEL4x4_2
+       KERNEL4x4_1
+       KERNEL4x4_2
+
+       KERNEL4x4_1
+       KERNEL4x4_2
+       KERNEL4x4_1
+       KERNEL4x4_2
+
+       addic.          L,      L,      -1
+       bgt             .LCTRMM_L4x4_LOOP
+
+.LCTRMM_L4x4_LOOP_END:
+
+       KERNEL4x4_1
+       KERNEL4x4_2
+       KERNEL4x4_1
+       KERNEL4x4_2
+
+       KERNEL4x4_1
+       KERNEL4x4_2
+       KERNEL4x4_1
+       KERNEL4x4_E2
+
+       b               .LCTRMM_L4x4_SUB1
+
+.LCTRMM_L4x4_SUB4:
+
+       KERNEL4x4_SUBI1
+       KERNEL4x4_SUB1
+       KERNEL4x4_SUB1
+       KERNEL4x4_SUB1
+
+       KERNEL4x4_SUB1
+       KERNEL4x4_SUB1
+       KERNEL4x4_SUB1
+       KERNEL4x4_SUB1
+
+       b               .LCTRMM_L4x4_SUB1
+
+.LCTRMM_L4x4_SUB0:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+
+       KERNEL4x4_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LCTRMM_L4x4_SAVE
+       b               .LCTRMM_L4x4_SUB2
+
+.LCTRMM_L4x4_SUB1:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+       ble             .LCTRMM_L4x4_SAVE
+
+.LCTRMM_L4x4_SUB2:
+
+       KERNEL4x4_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LCTRMM_L4x4_SUB2
+
+.LCTRMM_L4x4_SAVE:
+
+       SAVE4x4
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
+       slwi            T2,     T1,     5                       // TEMP1 * Number of values in B shifted -> TEMP2
+       slwi            T1,     T1,     5                       // TEMP1 * Number of values in A shifted -> TEMP1
+       add             BO,     BO,     T2                                      // BO += TEMP2  (skip the unprocessed tail of the packed B panel)
+       add             AO,     AO,     T1                                      // AO += TEMP1  (skip the unprocessed tail of the packed A panel)
+#endif
+
+#if defined(LEFT)
+       addi            KK,     KK,     4                               // KK += Number of values in A
+#endif
+
+
+.LCTRMM_L4x4_END:
+
+.LCTRMM_L4x2_BEGIN:
+
+       andi.           T1,     M,      2
+       ble             .LCTRMM_L4x2_END
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       mr              BO,     B                                       // B -> BO
+#else
+       mr              BO,     B                                       // B -> BO
+       slwi            T1,     KK,     5                               // Number of values in B shifted
+       slwi            T2,     KK,     4                               // Number of values in A shifted
+       add             BO,     BO,     T1                              // Add values to BO
+       add             AO,     AO,     T2                              // Add values to AO
+#endif
+
+#if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       sub             T1,     K,      KK                              // K - KK -> TEMP1
+#else
+       mr              T1,     KK                                      // KK -> KTEMP
+#ifdef LEFT
+       addi            T1,     T1,     2                               // KTEMP + Number of values in A -> KTEMP
+#else
+       addi            T1,     T1,     4                               // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+       mr              KKK,    T1
+       mr              K1,     T1
+       srawi.          L,      K1,     3                               // KTEMP / 8 -> L
+       ble             .LCTRMM_L4x2_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LCTRMM_L4x2_SUB4
+
+.LCTRMM_L4x2_LOOP_START:
+
+       LOAD4x2_1
+       KERNEL4x2_I1
+       KERNEL4x2_2
+       KERNEL4x2_1
+       KERNEL4x2_2
+
+       KERNEL4x2_1
+       KERNEL4x2_2
+       KERNEL4x2_1
+       KERNEL4x2_2
+
+       addic.          L,      L,      -2
+       ble             .LCTRMM_L4x2_LOOP_END
+
+       .align 5
+
+.LCTRMM_L4x2_LOOP:
+
+       KERNEL4x2_1
+       KERNEL4x2_2
+       KERNEL4x2_1
+       KERNEL4x2_2
+
+       KERNEL4x2_1
+       KERNEL4x2_2
+       KERNEL4x2_1
+       KERNEL4x2_2
+
+       addic.          L,      L,      -1
+       bgt             .LCTRMM_L4x2_LOOP
+
+.LCTRMM_L4x2_LOOP_END:
+
+       KERNEL4x2_1
+       KERNEL4x2_2
+       KERNEL4x2_1
+       KERNEL4x2_2
+
+       KERNEL4x2_1
+       KERNEL4x2_2
+       KERNEL4x2_1
+       KERNEL4x2_E2
+
+       b               .LCTRMM_L4x2_SUB1
+
+.LCTRMM_L4x2_SUB4:
+
+       KERNEL4x2_SUBI1
+       KERNEL4x2_SUB1
+       KERNEL4x2_SUB1
+       KERNEL4x2_SUB1
+
+       KERNEL4x2_SUB1
+       KERNEL4x2_SUB1
+       KERNEL4x2_SUB1
+       KERNEL4x2_SUB1
+
+       b               .LCTRMM_L4x2_SUB1
+
+.LCTRMM_L4x2_SUB0:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+
+       KERNEL4x2_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LCTRMM_L4x2_SAVE
+       b               .LCTRMM_L4x2_SUB2
+
+.LCTRMM_L4x2_SUB1:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+       ble             .LCTRMM_L4x2_SAVE
+
+.LCTRMM_L4x2_SUB2:
+
+       KERNEL4x2_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LCTRMM_L4x2_SUB2
+
+.LCTRMM_L4x2_SAVE:
+
+       SAVE4x2
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
+       slwi            T2,     T1,     5                       // TEMP1 * Number of values in B shifted -> TEMP2
+       slwi            T1,     T1,     4                       // TEMP1 * Number of values in A shifted -> TEMP1
+       add             BO,     BO,     T2                                      // BO += TEMP2  (skip the unprocessed tail of the packed B panel)
+       add             AO,     AO,     T1                                      // AO += TEMP1  (skip the unprocessed tail of the packed A panel)
+#endif
+
+#if defined(LEFT)
+       addi            KK,     KK,     2                               // KK += Number of values in A
+#endif
+
+
+.LCTRMM_L4x2_END:
+
+.LCTRMM_L4x1_BEGIN:
+
+       andi.           T1,     M,      1
+       ble             .LCTRMM_L4x1_END
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       mr              BO,     B                                       // B -> BO
+#else
+       mr              BO,     B                                       // B -> BO
+       slwi            T1,     KK,     5                               // Number of values in B shifted
+       slwi            T2,     KK,     3                               // Number of values in A shifted
+       add             BO,     BO,     T1                              // Add values to BO
+       add             AO,     AO,     T2                              // Add values to AO
+#endif
+
+#if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       sub             T1,     K,      KK                              // K - KK -> TEMP1
+#else
+       mr              T1,     KK                                      // KK -> KTEMP
+#ifdef LEFT
+       addi            T1,     T1,     1                               // KTEMP + Number of values in A -> KTEMP
+#else
+       addi            T1,     T1,     4                               // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+       mr              KKK,    T1
+       mr              K1,     T1
+       srawi.          L,      K1,     3                               // KTEMP / 8 -> L
+       ble             .LCTRMM_L4x1_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LCTRMM_L4x1_SUB4
+
+.LCTRMM_L4x1_LOOP_START:
+
+       LOAD4x1_1
+       KERNEL4x1_I1
+       KERNEL4x1_2
+       KERNEL4x1_1
+       KERNEL4x1_2
+
+       KERNEL4x1_1
+       KERNEL4x1_2
+       KERNEL4x1_1
+       KERNEL4x1_2
+
+       addic.          L,      L,      -2
+       ble             .LCTRMM_L4x1_LOOP_END
+
+       .align 5
+
+.LCTRMM_L4x1_LOOP:
+
+       KERNEL4x1_1
+       KERNEL4x1_2
+       KERNEL4x1_1
+       KERNEL4x1_2
+
+       KERNEL4x1_1
+       KERNEL4x1_2
+       KERNEL4x1_1
+       KERNEL4x1_2
+
+       addic.          L,      L,      -1
+       bgt             .LCTRMM_L4x1_LOOP
+
+.LCTRMM_L4x1_LOOP_END:
+
+       KERNEL4x1_1
+       KERNEL4x1_2
+       KERNEL4x1_1
+       KERNEL4x1_2
+
+       KERNEL4x1_1
+       KERNEL4x1_2
+       KERNEL4x1_1
+       KERNEL4x1_E2
+
+       b               .LCTRMM_L4x1_SUB1
+
+.LCTRMM_L4x1_SUB4:
+
+       KERNEL4x1_SUBI1
+       KERNEL4x1_SUB1
+       KERNEL4x1_SUB1
+       KERNEL4x1_SUB1
+
+       KERNEL4x1_SUB1
+       KERNEL4x1_SUB1
+       KERNEL4x1_SUB1
+       KERNEL4x1_SUB1
+
+       b               .LCTRMM_L4x1_SUB1
+
+.LCTRMM_L4x1_SUB0:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+
+       KERNEL4x1_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LCTRMM_L4x1_SAVE
+       b               .LCTRMM_L4x1_SUB2
+
+.LCTRMM_L4x1_SUB1:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+       ble             .LCTRMM_L4x1_SAVE
+
+.LCTRMM_L4x1_SUB2:
+
+       KERNEL4x1_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LCTRMM_L4x1_SUB2
+
+.LCTRMM_L4x1_SAVE:
+
+       SAVE4x1
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
+       slwi            T2,     T1,     5                       // TEMP1 * Number of values in B shifted -> TEMP2
+       slwi            T1,     T1,     3                       // TEMP1 * Number of values in A shifted -> TEMP1
+       add             BO,     BO,     T2                                      // BO += TEMP2  (skip the unprocessed tail of the packed B panel)
+       add             AO,     AO,     T1                                      // AO += TEMP1  (skip the unprocessed tail of the packed A panel)
+#endif
+
+#if defined(LEFT)
+       addi            KK,     KK,     1                               // KK += Number of values in A
+#endif
+
+
+.LCTRMM_L4x1_END:
+
+       slwi            T1,     K,      5                               // K * 4 values in B * 8 bytes
+       add             B,      B,      T1                              // advance B to the next four-column panel
+
+#if !defined(LEFT)
+       addi            KK,     KK,     4                                       // KK += Number of values in B
+#endif
+
+
+       addic.          J,      J,      -1
+       bgt             .LCTRMM_L4_BEGIN
+
+       andi.           T2,     N,      3
+       ble             .L999_H2
+
+.LCTRMM_L4_END:
+
+       b               .LCTRMM_L2_BEGIN
+
+.L999_H1:
+
+       b               .L999_H2
+
+.LCTRMM_L2_BEGIN:
+
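+       // Two remaining columns (N & 2): same structure as above with the 2xM kernels.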
+       andi.           T1,     N,      2
+       ble             .LCTRMM_L2_END
+       mr              CO,     C
+       mr              AO,     A
+       slwi            T1,     LDC,    1                               // T1 = 2 * LDC (bytes)
+       add             C,      C,      T1                              // C += 2 * LDC, past this two-column panel
+
+#if defined(LEFT)
+       mr              KK,     OFFSET          // OFFSET -> KK
+#endif
+
+       srawi.          I,      M,      3
+       ble             .LCTRMM_L2x8_END
+
+.LCTRMM_L2x8_BEGIN:
+
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       mr              BO,     B                                       // B -> BO
+#else
+       mr              BO,     B                                       // B -> BO
+       slwi            T1,     KK,     4                               // Number of values in B shifted
+       slwi            T2,     KK,     6                               // Number of values in A shifted
+       add             BO,     BO,     T1                              // Add values to BO
+       add             AO,     AO,     T2                              // Add values to AO
+#endif
+
+#if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       sub             T1,     K,      KK                              // K - KK -> TEMP1
+#else
+       mr              T1,     KK                                      // KK -> KTEMP
+#ifdef LEFT
+       addi            T1,     T1,     8                               // KTEMP + Number of values in A -> KTEMP
+#else
+       addi            T1,     T1,     2                               // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+       mr              KKK,    T1
+       mr              K1,     T1
+       srawi.          L,      K1,     3                               // KTEMP / 8 -> L
+       ble             .LCTRMM_L2x8_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LCTRMM_L2x8_SUB4
+
+.LCTRMM_L2x8_LOOP_START:
+
+       LOAD2x8_1
+       KERNEL2x8_I1
+       KERNEL2x8_2
+       KERNEL2x8_1
+       KERNEL2x8_2
+
+       KERNEL2x8_1
+       KERNEL2x8_2
+       KERNEL2x8_1
+       KERNEL2x8_2
+
+       addic.          L,      L,      -2
+       ble             .LCTRMM_L2x8_LOOP_END
+
+       .align 5
+
+.LCTRMM_L2x8_LOOP:
+
+       KERNEL2x8_1
+       KERNEL2x8_2
+       KERNEL2x8_1
+       KERNEL2x8_2
+
+       KERNEL2x8_1
+       KERNEL2x8_2
+       KERNEL2x8_1
+       KERNEL2x8_2
+
+       addic.          L,      L,      -1
+       bgt             .LCTRMM_L2x8_LOOP
+
+.LCTRMM_L2x8_LOOP_END:
+
+       KERNEL2x8_1
+       KERNEL2x8_2
+       KERNEL2x8_1
+       KERNEL2x8_2
+
+       KERNEL2x8_1
+       KERNEL2x8_2
+       KERNEL2x8_1
+       KERNEL2x8_E2
+
+       b               .LCTRMM_L2x8_SUB1
+
+.LCTRMM_L2x8_SUB4:
+
+       KERNEL2x8_SUBI1
+       KERNEL2x8_SUB1
+       KERNEL2x8_SUB1
+       KERNEL2x8_SUB1
+
+       KERNEL2x8_SUB1
+       KERNEL2x8_SUB1
+       KERNEL2x8_SUB1
+       KERNEL2x8_SUB1
+
+       b               .LCTRMM_L2x8_SUB1
+
+.LCTRMM_L2x8_SUB0:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+
+       KERNEL2x8_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LCTRMM_L2x8_SAVE
+       b               .LCTRMM_L2x8_SUB2
+
+.LCTRMM_L2x8_SUB1:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+       ble             .LCTRMM_L2x8_SAVE
+
+.LCTRMM_L2x8_SUB2:
+
+       KERNEL2x8_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LCTRMM_L2x8_SUB2
+
+.LCTRMM_L2x8_SAVE:
+
+       SAVE2x8
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
+       slwi            T2,     T1,     4                       // TEMP1 * Number of values in B shifted -> TEMP2
+       slwi            T1,     T1,     6                       // TEMP1 * Number of values in A shifted -> TEMP1
+       add             BO,     BO,     T2                                      // BO += TEMP2  (skip the unprocessed tail of the packed B panel)
+       add             AO,     AO,     T1                                      // AO += TEMP1  (skip the unprocessed tail of the packed A panel)
+#endif
+
+#if defined(LEFT)
+       addi            KK,     KK,     8                               // KK += Number of values in A
+#endif
+
+
+       addic.          I,      I,      -1
+       bgt             .LCTRMM_L2x8_BEGIN
+
+.LCTRMM_L2x8_END:
+
+.LCTRMM_L2x4_BEGIN:
+       andi.           T2,     M,      7
+       ble             .LCTRMM_L2x1_END
+
+       andi.           T1,     M,      4
+       ble             .LCTRMM_L2x4_END
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       mr              BO,     B                                       // B -> BO
+#else
+       mr              BO,     B                                       // B -> BO
+       slwi            T1,     KK,     4                               // Number of values in B shifted
+       slwi            T2,     KK,     5                               // Number of values in A shifted
+       add             BO,     BO,     T1                              // Add values to BO
+       add             AO,     AO,     T2                              // Add values to AO
+#endif
+
+#if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       sub             T1,     K,      KK                              // K - KK -> TEMP1
+#else
+       mr              T1,     KK                                      // KK -> KTEMP
+#ifdef LEFT
+       addi            T1,     T1,     4                               // KTEMP + Number of values in A -> KTEMP
+#else
+       addi            T1,     T1,     2                               // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+       mr              KKK,    T1
+       mr              K1,     T1
+       srawi.          L,      K1,     3                               // KTEMP / 8 -> L
+       ble             .LCTRMM_L2x4_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LCTRMM_L2x4_SUB4
+
+.LCTRMM_L2x4_LOOP_START:
+
+       LOAD2x4_1
+       KERNEL2x4_I1
+       KERNEL2x4_2
+       KERNEL2x4_1
+       KERNEL2x4_2
+
+       KERNEL2x4_1
+       KERNEL2x4_2
+       KERNEL2x4_1
+       KERNEL2x4_2
+
+       addic.          L,      L,      -2
+       ble             .LCTRMM_L2x4_LOOP_END
+
+       .align 5
+
+.LCTRMM_L2x4_LOOP:
+
+       KERNEL2x4_1
+       KERNEL2x4_2
+       KERNEL2x4_1
+       KERNEL2x4_2
+
+       KERNEL2x4_1
+       KERNEL2x4_2
+       KERNEL2x4_1
+       KERNEL2x4_2
+
+       addic.          L,      L,      -1
+       bgt             .LCTRMM_L2x4_LOOP
+
+.LCTRMM_L2x4_LOOP_END:
+
+       KERNEL2x4_1
+       KERNEL2x4_2
+       KERNEL2x4_1
+       KERNEL2x4_2
+
+       KERNEL2x4_1
+       KERNEL2x4_2
+       KERNEL2x4_1
+       KERNEL2x4_E2
+
+       b               .LCTRMM_L2x4_SUB1
+
+.LCTRMM_L2x4_SUB4:
+
+       KERNEL2x4_SUBI1
+       KERNEL2x4_SUB1
+       KERNEL2x4_SUB1
+       KERNEL2x4_SUB1
+
+       KERNEL2x4_SUB1
+       KERNEL2x4_SUB1
+       KERNEL2x4_SUB1
+       KERNEL2x4_SUB1
+
+       b               .LCTRMM_L2x4_SUB1
+
+.LCTRMM_L2x4_SUB0:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+
+       KERNEL2x4_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LCTRMM_L2x4_SAVE
+       b               .LCTRMM_L2x4_SUB2
+
+.LCTRMM_L2x4_SUB1:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+       ble             .LCTRMM_L2x4_SAVE
+
+.LCTRMM_L2x4_SUB2:
+
+       KERNEL2x4_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LCTRMM_L2x4_SUB2
+
+.LCTRMM_L2x4_SAVE:
+
+       SAVE2x4
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
+       slwi            T2,     T1,     4                       // TEMP1 * Number of values in B shifted -> TEMP2
+       slwi            T1,     T1,     5                       // TEMP1 * Number of values in A shifted -> TEMP1
+       add             BO,     BO,     T2                                      // BO += TEMP2  (skip the unprocessed tail of the packed B panel)
+       add             AO,     AO,     T1                                      // AO += TEMP1  (skip the unprocessed tail of the packed A panel)
+#endif
+
+#if defined(LEFT)
+       addi            KK,     KK,     4                               // KK += Number of values in A
+#endif
+
+
+.LCTRMM_L2x4_END:
+
+.LCTRMM_L2x2_BEGIN:
+
+       andi.           T1,     M,      2
+       ble             .LCTRMM_L2x2_END
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       mr              BO,     B                                       // B -> BO
+#else
+       mr              BO,     B                                       // B -> BO
+       slwi            T1,     KK,     4                               // Number of values in B shifted
+       slwi            T2,     KK,     4                               // Number of values in A shifted
+       add             BO,     BO,     T1                              // Add values to BO
+       add             AO,     AO,     T2                              // Add values to AO
+#endif
+
+#if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       sub             T1,     K,      KK                              // K - KK -> TEMP1
+#else
+       mr              T1,     KK                                      // KK -> KTEMP
+#ifdef LEFT
+       addi            T1,     T1,     2                               // KTEMP + Number of values in A -> KTEMP
+#else
+       addi            T1,     T1,     2                               // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+       mr              KKK,    T1
+       mr              K1,     T1
+       srawi.          L,      K1,     3                               // KTEMP / 8 -> L
+       ble             .LCTRMM_L2x2_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LCTRMM_L2x2_SUB4
+
+.LCTRMM_L2x2_LOOP_START:
+
+       LOAD2x2_1
+       KERNEL2x2_I1
+       KERNEL2x2_2
+       KERNEL2x2_1
+       KERNEL2x2_2
+
+       KERNEL2x2_1
+       KERNEL2x2_2
+       KERNEL2x2_1
+       KERNEL2x2_2
+
+       addic.          L,      L,      -2
+       ble             .LCTRMM_L2x2_LOOP_END
+
+       .align 5
+
+.LCTRMM_L2x2_LOOP:
+
+       KERNEL2x2_1
+       KERNEL2x2_2
+       KERNEL2x2_1
+       KERNEL2x2_2
+
+       KERNEL2x2_1
+       KERNEL2x2_2
+       KERNEL2x2_1
+       KERNEL2x2_2
+
+       addic.          L,      L,      -1
+       bgt             .LCTRMM_L2x2_LOOP
+
+.LCTRMM_L2x2_LOOP_END:
+
+       KERNEL2x2_1
+       KERNEL2x2_2
+       KERNEL2x2_1
+       KERNEL2x2_2
+
+       KERNEL2x2_1
+       KERNEL2x2_2
+       KERNEL2x2_1
+       KERNEL2x2_E2
+
+       b               .LCTRMM_L2x2_SUB1
+
+.LCTRMM_L2x2_SUB4:
+
+       KERNEL2x2_SUBI1
+       KERNEL2x2_SUB1
+       KERNEL2x2_SUB1
+       KERNEL2x2_SUB1
+
+       KERNEL2x2_SUB1
+       KERNEL2x2_SUB1
+       KERNEL2x2_SUB1
+       KERNEL2x2_SUB1
+
+       b               .LCTRMM_L2x2_SUB1
+
+.LCTRMM_L2x2_SUB0:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+
+       KERNEL2x2_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LCTRMM_L2x2_SAVE
+       b               .LCTRMM_L2x2_SUB2
+
+.LCTRMM_L2x2_SUB1:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+       ble             .LCTRMM_L2x2_SAVE
+
+.LCTRMM_L2x2_SUB2:
+
+       KERNEL2x2_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LCTRMM_L2x2_SUB2
+
+.LCTRMM_L2x2_SAVE:
+
+       SAVE2x2
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
+       slwi            T2,     T1,     4                       // TEMP1 * Number of values in B shifted -> TEMP2
+       slwi            T1,     T1,     4                       // TEMP1 * Number of values in A shifted -> TEMP1
+       add             BO,     BO,     T2                                      // BO += TEMP2  (skip the unprocessed tail of the packed B panel)
+       add             AO,     AO,     T1                                      // AO += TEMP1  (skip the unprocessed tail of the packed A panel)
+#endif
+
+#if defined(LEFT)
+       addi            KK,     KK,     2                               // KK += Number of values in A
+#endif
+
+
+.LCTRMM_L2x2_END:
+
+.LCTRMM_L2x1_BEGIN:
+
+       andi.           T1,     M,      1
+       ble             .LCTRMM_L2x1_END
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       mr              BO,     B                                       // B -> BO
+#else
+       mr              BO,     B                                       // B -> BO
+       slwi            T1,     KK,     4                               // Number of values in B shifted
+       slwi            T2,     KK,     3                               // Number of values in A shifted
+       add             BO,     BO,     T1                              // Add values to BO
+       add             AO,     AO,     T2                              // Add values to AO
+#endif
+
+#if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       sub             T1,     K,      KK                              // K - KK -> TEMP1
+#else
+       mr              T1,     KK                                      // KK -> KTEMP
+#ifdef LEFT
+       addi            T1,     T1,     1                               // KTEMP + Number of values in A -> KTEMP
+#else
+       addi            T1,     T1,     2                               // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+       mr              KKK,    T1
+       mr              K1,     T1
+       srawi.          L,      K1,     3                               // KTEMP / 8 -> L
+       ble             .LCTRMM_L2x1_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LCTRMM_L2x1_SUB4
+
+.LCTRMM_L2x1_LOOP_START:
+
+       LOAD2x1_1
+       KERNEL2x1_I1
+       KERNEL2x1_2
+       KERNEL2x1_1
+       KERNEL2x1_2
+
+       KERNEL2x1_1
+       KERNEL2x1_2
+       KERNEL2x1_1
+       KERNEL2x1_2
+
+       addic.          L,      L,      -2
+       ble             .LCTRMM_L2x1_LOOP_END
+
+       .align 5
+
+.LCTRMM_L2x1_LOOP:
+
+       KERNEL2x1_1
+       KERNEL2x1_2
+       KERNEL2x1_1
+       KERNEL2x1_2
+
+       KERNEL2x1_1
+       KERNEL2x1_2
+       KERNEL2x1_1
+       KERNEL2x1_2
+
+       addic.          L,      L,      -1
+       bgt             .LCTRMM_L2x1_LOOP
+
+.LCTRMM_L2x1_LOOP_END:
+
+       KERNEL2x1_1
+       KERNEL2x1_2
+       KERNEL2x1_1
+       KERNEL2x1_2
+
+       KERNEL2x1_1
+       KERNEL2x1_2
+       KERNEL2x1_1
+       KERNEL2x1_E2
+
+       b               .LCTRMM_L2x1_SUB1
+
+.LCTRMM_L2x1_SUB4:
+
+       KERNEL2x1_SUBI1
+       KERNEL2x1_SUB1
+       KERNEL2x1_SUB1
+       KERNEL2x1_SUB1
+
+       KERNEL2x1_SUB1
+       KERNEL2x1_SUB1
+       KERNEL2x1_SUB1
+       KERNEL2x1_SUB1
+
+       b               .LCTRMM_L2x1_SUB1
+
+.LCTRMM_L2x1_SUB0:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+
+       KERNEL2x1_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LCTRMM_L2x1_SAVE
+       b               .LCTRMM_L2x1_SUB2
+
+.LCTRMM_L2x1_SUB1:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+       ble             .LCTRMM_L2x1_SAVE
+
+.LCTRMM_L2x1_SUB2:
+
+       KERNEL2x1_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LCTRMM_L2x1_SUB2
+
+.LCTRMM_L2x1_SAVE:
+
+       SAVE2x1
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
+       slwi            T2,     T1,     4                       // TEMP1 * Number of values in B shifted -> TEMP2
+       slwi            T1,     T1,     3                       // TEMP1 * Number of values in A shifted -> TEMP1
+       add             BO,     BO,     T2                                      // BO += TEMP2  (skip the unprocessed tail of the packed B panel)
+       add             AO,     AO,     T1                                      // AO += TEMP1  (skip the unprocessed tail of the packed A panel)
+#endif
+
+#if defined(LEFT)
+       addi            KK,     KK,     1                               // KK += Number of values in A
+#endif
+
+
+.LCTRMM_L2x1_END:
+
+       slwi            T1,     K,      4                               // K * 2 values in B * 8 bytes
+       add             B,      B,      T1                              // advance B past the two-column panel
+
+#if !defined(LEFT)
+       addi            KK,     KK,     2                                       // KK += Number of values in B
+#endif
+
+
+.LCTRMM_L2_END:
+
+       b               .LCTRMM_L1_BEGIN
+
+.L999_H2:
+
+       b               .L999
+
+.LCTRMM_L1_BEGIN:
+
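+       // One remaining column (N & 1): same structure with the 1xM kernels.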
+       andi.           T1,     N,      1
+       ble             .LCTRMM_L1_END
+       mr              CO,     C
+       mr              AO,     A
+
+#if defined(LEFT)
+       mr              KK,     OFFSET          // OFFSET -> KK
+#endif
+
+       srawi.          I,      M,      3
+       ble             .LCTRMM_L1x8_END
+
+.LCTRMM_L1x8_BEGIN:
+
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       mr              BO,     B                                       // B -> BO
+#else
+       mr              BO,     B                                       // B -> BO
+       slwi            T1,     KK,     3                               // Number of values in B shifted
+       slwi            T2,     KK,     6                               // Number of values in A shifted
+       add             BO,     BO,     T1                              // Add values to BO
+       add             AO,     AO,     T2                              // Add values to AO
+#endif
+
+#if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       sub             T1,     K,      KK                              // K - KK -> TEMP1
+#else
+       mr              T1,     KK                                      // KK -> KTEMP
+#ifdef LEFT
+       addi            T1,     T1,     8                               // KTEMP + Number of values in A -> KTEMP
+#else
+       addi            T1,     T1,     1                               // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+       mr              KKK,    T1
+       mr              K1,     T1
+       srawi.          L,      K1,     3                               // KTEMP / 8 -> L
+       ble             .LCTRMM_L1x8_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LCTRMM_L1x8_SUB4
+
+.LCTRMM_L1x8_LOOP_START:
+
+       LOAD1x8_1
+       KERNEL1x8_I1
+       KERNEL1x8_2
+       KERNEL1x8_1
+       KERNEL1x8_2
+
+       KERNEL1x8_1
+       KERNEL1x8_2
+       KERNEL1x8_1
+       KERNEL1x8_2
+
+       addic.          L,      L,      -2
+       ble             .LCTRMM_L1x8_LOOP_END
+
+       .align 5
+
+.LCTRMM_L1x8_LOOP:
+
+       KERNEL1x8_1
+       KERNEL1x8_2
+       KERNEL1x8_1
+       KERNEL1x8_2
+
+       KERNEL1x8_1
+       KERNEL1x8_2
+       KERNEL1x8_1
+       KERNEL1x8_2
+
+       addic.          L,      L,      -1
+       bgt             .LCTRMM_L1x8_LOOP
+
+.LCTRMM_L1x8_LOOP_END:
+
+       KERNEL1x8_1
+       KERNEL1x8_2
+       KERNEL1x8_1
+       KERNEL1x8_2
+
+       KERNEL1x8_1
+       KERNEL1x8_2
+       KERNEL1x8_1
+       KERNEL1x8_E2
+
+       b               .LCTRMM_L1x8_SUB1
+
+.LCTRMM_L1x8_SUB4:
+
+       KERNEL1x8_SUBI1
+       KERNEL1x8_SUB1
+       KERNEL1x8_SUB1
+       KERNEL1x8_SUB1
+
+       KERNEL1x8_SUB1
+       KERNEL1x8_SUB1
+       KERNEL1x8_SUB1
+       KERNEL1x8_SUB1
+
+       b               .LCTRMM_L1x8_SUB1
+
+.LCTRMM_L1x8_SUB0:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+
+       KERNEL1x8_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LCTRMM_L1x8_SAVE
+       b               .LCTRMM_L1x8_SUB2
+
+.LCTRMM_L1x8_SUB1:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+       ble             .LCTRMM_L1x8_SAVE
+
+.LCTRMM_L1x8_SUB2:
+
+       KERNEL1x8_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LCTRMM_L1x8_SUB2
+
+.LCTRMM_L1x8_SAVE:
+
+       SAVE1x8
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
+       slwi            T2,     T1,     3                       // TEMP1 * Number of values in B shifted -> TEMP2
+       slwi            T1,     T1,     6                       // TEMP1 * Number of values in A shifted -> TEMP1
+       add             BO,     BO,     T2                                      // BO += TEMP2  (skip the unprocessed tail of the packed B panel)
+       add             AO,     AO,     T1                                      // AO += TEMP1  (skip the unprocessed tail of the packed A panel)
+#endif
+
+#if defined(LEFT)
+       addi            KK,     KK,     8                               // KK += Number of values in A
+#endif
+
+
+       addic.          I,      I,      -1
+       bgt             .LCTRMM_L1x8_BEGIN
+
+.LCTRMM_L1x8_END:
+
+.LCTRMM_L1x4_BEGIN:
+       andi.           T2,     M,      7
+       ble             .LCTRMM_L1x1_END
+
+       andi.           T1,     M,      4
+       ble             .LCTRMM_L1x4_END
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       mr              BO,     B                                       // B -> BO
+#else
+       mr              BO,     B                                       // B -> BO
+       slwi            T1,     KK,     3                               // Number of values in B shifted
+       slwi            T2,     KK,     5                               // Number of values in A shifted
+       add             BO,     BO,     T1                              // Add values to BO
+       add             AO,     AO,     T2                              // Add values to AO
+#endif
+
+#if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       sub             T1,     K,      KK                              // K - KK -> TEMP1
+#else
+       mr              T1,     KK                                      // KK -> KTEMP
+#ifdef LEFT
+       addi            T1,     T1,     4                               // KTEMP + Number of values in A -> KTEMP
+#else
+       addi            T1,     T1,     1                               // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+       mr              KKK,    T1
+       mr              K1,     T1
+       srawi.          L,      K1,     3                               // KTEMP / 8 -> L
+       ble             .LCTRMM_L1x4_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LCTRMM_L1x4_SUB4
+
+.LCTRMM_L1x4_LOOP_START:
+
+       LOAD1x4_1
+       KERNEL1x4_I1
+       KERNEL1x4_2
+       KERNEL1x4_1
+       KERNEL1x4_2
+
+       KERNEL1x4_1
+       KERNEL1x4_2
+       KERNEL1x4_1
+       KERNEL1x4_2
+
+       addic.          L,      L,      -2
+       ble             .LCTRMM_L1x4_LOOP_END
+
+       .align 5
+
+.LCTRMM_L1x4_LOOP:
+
+       KERNEL1x4_1
+       KERNEL1x4_2
+       KERNEL1x4_1
+       KERNEL1x4_2
+
+       KERNEL1x4_1
+       KERNEL1x4_2
+       KERNEL1x4_1
+       KERNEL1x4_2
+
+       addic.          L,      L,      -1
+       bgt             .LCTRMM_L1x4_LOOP
+
+.LCTRMM_L1x4_LOOP_END:
+
+       KERNEL1x4_1
+       KERNEL1x4_2
+       KERNEL1x4_1
+       KERNEL1x4_2
+
+       KERNEL1x4_1
+       KERNEL1x4_2
+       KERNEL1x4_1
+       KERNEL1x4_E2
+
+       b               .LCTRMM_L1x4_SUB1
+
+.LCTRMM_L1x4_SUB4:
+
+       KERNEL1x4_SUBI1
+       KERNEL1x4_SUB1
+       KERNEL1x4_SUB1
+       KERNEL1x4_SUB1
+
+       KERNEL1x4_SUB1
+       KERNEL1x4_SUB1
+       KERNEL1x4_SUB1
+       KERNEL1x4_SUB1
+
+       b               .LCTRMM_L1x4_SUB1
+
+.LCTRMM_L1x4_SUB0:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+
+       KERNEL1x4_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LCTRMM_L1x4_SAVE
+       b               .LCTRMM_L1x4_SUB2
+
+.LCTRMM_L1x4_SUB1:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+       ble             .LCTRMM_L1x4_SAVE
+
+.LCTRMM_L1x4_SUB2:
+
+       KERNEL1x4_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LCTRMM_L1x4_SUB2
+
+.LCTRMM_L1x4_SAVE:
+
+       SAVE1x4
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
+       slwi            T2,     T1,     3                       // TEMP1 * Number of values in B shifted -> TEMP2
+       slwi            T1,     T1,     5                       // TEMP1 * Number of values in A shifted -> TEMP1
+       add             BO,     BO,     T2                                      // BO += TEMP2  (skip the unprocessed tail of the packed B panel)
+       add             AO,     AO,     T1                                      // AO += TEMP1  (skip the unprocessed tail of the packed A panel)
+#endif
+
+#if defined(LEFT)
+       addi            KK,     KK,     4                               // KK += Number of values in A
+#endif
+
+
+.LCTRMM_L1x4_END:
+
+.LCTRMM_L1x2_BEGIN:
+
+       andi.           T1,     M,      2
+       ble             .LCTRMM_L1x2_END
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       mr              BO,     B                                       // B -> BO
+#else
+       mr              BO,     B                                       // B -> BO
+       slwi            T1,     KK,     3                               // Number of values in B shifted
+       slwi            T2,     KK,     4                               // Number of values in A shifted
+       add             BO,     BO,     T1                              // Add values to BO
+       add             AO,     AO,     T2                              // Add values to AO
+#endif
+
+#if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       sub             T1,     K,      KK                              // K - KK -> TEMP1
+#else
+       mr              T1,     KK                                      // KK -> KTEMP
+#ifdef LEFT
+       addi            T1,     T1,     2                               // KTEMP + Number of values in A -> KTEMP
+#else
+       addi            T1,     T1,     1                               // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+       mr              KKK,    T1
+       mr              K1,     T1
+       srawi.          L,      K1,     3                               // KTEMP / 8 -> L
+       ble             .LCTRMM_L1x2_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LCTRMM_L1x2_SUB4
+
+.LCTRMM_L1x2_LOOP_START:
+
+       LOAD1x2_1
+       KERNEL1x2_I1
+       KERNEL1x2_2
+       KERNEL1x2_1
+       KERNEL1x2_2
+
+       KERNEL1x2_1
+       KERNEL1x2_2
+       KERNEL1x2_1
+       KERNEL1x2_2
+
+       addic.          L,      L,      -2
+       ble             .LCTRMM_L1x2_LOOP_END
+
+       .align 5
+
+.LCTRMM_L1x2_LOOP:
+
+       KERNEL1x2_1
+       KERNEL1x2_2
+       KERNEL1x2_1
+       KERNEL1x2_2
+
+       KERNEL1x2_1
+       KERNEL1x2_2
+       KERNEL1x2_1
+       KERNEL1x2_2
+
+       addic.          L,      L,      -1
+       bgt             .LCTRMM_L1x2_LOOP
+
+.LCTRMM_L1x2_LOOP_END:
+
+       KERNEL1x2_1
+       KERNEL1x2_2
+       KERNEL1x2_1
+       KERNEL1x2_2
+
+       KERNEL1x2_1
+       KERNEL1x2_2
+       KERNEL1x2_1
+       KERNEL1x2_E2
+
+       b               .LCTRMM_L1x2_SUB1
+
+.LCTRMM_L1x2_SUB4:
+
+       KERNEL1x2_SUBI1
+       KERNEL1x2_SUB1
+       KERNEL1x2_SUB1
+       KERNEL1x2_SUB1
+
+       KERNEL1x2_SUB1
+       KERNEL1x2_SUB1
+       KERNEL1x2_SUB1
+       KERNEL1x2_SUB1
+
+       b               .LCTRMM_L1x2_SUB1
+
+.LCTRMM_L1x2_SUB0:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+
+       KERNEL1x2_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LCTRMM_L1x2_SAVE
+       b               .LCTRMM_L1x2_SUB2
+
+.LCTRMM_L1x2_SUB1:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+       ble             .LCTRMM_L1x2_SAVE
+
+.LCTRMM_L1x2_SUB2:
+
+       KERNEL1x2_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LCTRMM_L1x2_SUB2
+
+.LCTRMM_L1x2_SAVE:
+
+       SAVE1x2
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
+       slwi            T2,     T1,     3                       // TEMP1 * Number of values in B shifted -> TEMP2
+       slwi            T1,     T1,     4                       // TEMP1 * Number of values in A shifted -> TEMP1
+       add             BO,     BO,     T2                                      // BO += TEMP2  (skip the unprocessed tail of the packed B panel)
+       add             AO,     AO,     T1                                      // AO += TEMP1  (skip the unprocessed tail of the packed A panel)
+#endif
+
+#if defined(LEFT)
+       addi            KK,     KK,     2                               // KK += Number of values in A
+#endif
+
+
+.LCTRMM_L1x2_END:
+
+.LCTRMM_L1x1_BEGIN:
+
+       andi.           T1,     M,      1
+       ble             .LCTRMM_L1x1_END
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       mr              BO,     B                                       // B -> BO
+#else
+       mr              BO,     B                                       // B -> BO
+       slwi            T1,     KK,     3                               // Number of values in B shifted
+       slwi            T2,     KK,     3                               // Number of values in A shifted
+       add             BO,     BO,     T1                              // Add values to BO
+       add             AO,     AO,     T2                              // Add values to AO
+#endif
+
+#if (defined(LEFT) &&  !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+       sub             T1,     K,      KK                              // K - KK -> TEMP1
+#else
+       mr              T1,     KK                                      // KK -> KTEMP
+#ifdef LEFT
+       addi            T1,     T1,     1                               // KTEMP + Number of values in A -> KTEMP
+#else
+       addi            T1,     T1,     1                               // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+       mr              KKK,    T1
+       mr              K1,     T1
+       srawi.          L,      K1,     3                               // KTEMP / 8 -> L
+       ble             .LCTRMM_L1x1_SUB0
+       cmpwi           cr0,    L,      1
+       ble             .LCTRMM_L1x1_SUB4
+
+.LCTRMM_L1x1_LOOP_START:
+
+       LOAD1x1_1
+       KERNEL1x1_I1
+       KERNEL1x1_2
+       KERNEL1x1_1
+       KERNEL1x1_2
+
+       KERNEL1x1_1
+       KERNEL1x1_2
+       KERNEL1x1_1
+       KERNEL1x1_2
+
+       addic.          L,      L,      -2
+       ble             .LCTRMM_L1x1_LOOP_END
+
+       .align 5
+
+.LCTRMM_L1x1_LOOP:
+
+       KERNEL1x1_1
+       KERNEL1x1_2
+       KERNEL1x1_1
+       KERNEL1x1_2
+
+       KERNEL1x1_1
+       KERNEL1x1_2
+       KERNEL1x1_1
+       KERNEL1x1_2
+
+       addic.          L,      L,      -1
+       bgt             .LCTRMM_L1x1_LOOP
+
+.LCTRMM_L1x1_LOOP_END:
+
+       KERNEL1x1_1
+       KERNEL1x1_2
+       KERNEL1x1_1
+       KERNEL1x1_2
+
+       KERNEL1x1_1
+       KERNEL1x1_2
+       KERNEL1x1_1
+       KERNEL1x1_E2
+
+       b               .LCTRMM_L1x1_SUB1
+
+.LCTRMM_L1x1_SUB4:
+
+       KERNEL1x1_SUBI1
+       KERNEL1x1_SUB1
+       KERNEL1x1_SUB1
+       KERNEL1x1_SUB1
+
+       KERNEL1x1_SUB1
+       KERNEL1x1_SUB1
+       KERNEL1x1_SUB1
+       KERNEL1x1_SUB1
+
+       b               .LCTRMM_L1x1_SUB1
+
+.LCTRMM_L1x1_SUB0:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+
+       KERNEL1x1_SUBI1
+
+       addic.          L,      L,      -1
+       ble             .LCTRMM_L1x1_SAVE
+       b               .LCTRMM_L1x1_SUB2
+
+.LCTRMM_L1x1_SUB1:
+
+       andi.           L,      K1,     7                                               // K1 & 7 -> L
+       ble             .LCTRMM_L1x1_SAVE
+
+.LCTRMM_L1x1_SUB2:
+
+       KERNEL1x1_SUB1
+
+       addic.          L,      L,      -1
+       bgt             .LCTRMM_L1x1_SUB2
+
+.LCTRMM_L1x1_SAVE:
+
+       SAVE1x1
+
+#if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+       sub             T1,     K,      KKK                                     // K - KKK -> TEMP1
+       slwi            T2,     T1,     3                       // TEMP1 * Number of values in B shifted -> TEMP2
+       slwi            T1,     T1,     3                       // TEMP1 * Number of values in A shifted -> TEMP1
+       add             BO,     BO,     T2                                      // BO += TEMP2  (skip the unprocessed tail of the packed B panel)
+       add             AO,     AO,     T1                                      // AO += TEMP1  (skip the unprocessed tail of the packed A panel)
+#endif
+
+#if defined(LEFT)
+       addi            KK,     KK,     1                               // KK += Number of values in A
+#endif
+
+
+.LCTRMM_L1x1_END:
+
+#if !defined(LEFT)
+       addi            KK,     KK,     1                                       // KK += Number of values in B
+#endif
+
+
+.LCTRMM_L1_END:
diff --git a/param.h b/param.h
index f5d1ab2..980650e 100644 (file)
--- a/param.h
+++ b/param.h
@@ -1972,23 +1972,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SGEMM_DEFAULT_UNROLL_N 8
 #define DGEMM_DEFAULT_UNROLL_M 16
 #define DGEMM_DEFAULT_UNROLL_N 4
-#define CGEMM_DEFAULT_UNROLL_M 2
-#define CGEMM_DEFAULT_UNROLL_N 2
+#define CGEMM_DEFAULT_UNROLL_M 8
+#define CGEMM_DEFAULT_UNROLL_N 4
 #define ZGEMM_DEFAULT_UNROLL_M 8
 #define ZGEMM_DEFAULT_UNROLL_N 2
 
 #define SGEMM_DEFAULT_P  960
 #define DGEMM_DEFAULT_P  480
-#define CGEMM_DEFAULT_P  488
+#define CGEMM_DEFAULT_P  480
 #define ZGEMM_DEFAULT_P  240
 
 #define SGEMM_DEFAULT_Q  720
 #define DGEMM_DEFAULT_Q  720
-#define CGEMM_DEFAULT_Q  400
+#define CGEMM_DEFAULT_Q  720
 #define ZGEMM_DEFAULT_Q  360
 
 #define SGEMM_DEFAULT_R 28800
 #define DGEMM_DEFAULT_R 14400
+#define CGEMM_DEFAULT_R 14400
 #define ZGEMM_DEFAULT_R 7200
 
 #define SYMV_P  8
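
A note on the param.h defaults above: UNROLL_M and UNROLL_N describe the micro-tile the new assembly kernel computes per inner step (8 rows by 4 columns of C), while P and Q are cache-blocking sizes (the M and K directions in OpenBLAS's usual scheme) and R bounds the N direction. The following is a minimal C sketch, not OpenBLAS code, of how such defaults drive a blocked complex GEMM: the names cgemm_blocked and ref_kernel_8x4 are hypothetical stand-ins, packing (CGEMMINCOPY/CGEMMONCOPY), alpha/beta scaling, the R blocking and the M/N edge kernels (4x, 2x, 1x) are omitted, and column-major storage is assumed.

#include <complex.h>
#include <stddef.h>

#define UNROLL_M 8      /* CGEMM_DEFAULT_UNROLL_M */
#define UNROLL_N 4      /* CGEMM_DEFAULT_UNROLL_N */
#define BLOCK_P  480    /* CGEMM_DEFAULT_P: block size in the M direction */
#define BLOCK_Q  720    /* CGEMM_DEFAULT_Q: block size in the K direction */

/* Reference for what one 8x4 tile computes: C[8x4] += A[8xq] * B[qx4]. */
static void ref_kernel_8x4(int q, const float complex *a, int lda,
                           const float complex *b, int ldb,
                           float complex *c, int ldc)
{
    for (int j = 0; j < UNROLL_N; j++)
        for (int i = 0; i < UNROLL_M; i++) {
            float complex acc = 0.0f;
            for (int l = 0; l < q; l++)
                acc += a[i + (size_t)l * lda] * b[l + (size_t)j * ldb];
            c[i + (size_t)j * ldc] += acc;
        }
}

/* C += A*B with M a multiple of 8 and N a multiple of 4 (edge cases omitted). */
void cgemm_blocked(int m, int n, int k,
                   const float complex *A, int lda,
                   const float complex *B, int ldb,
                   float complex *C, int ldc)
{
    for (int ks = 0; ks < k; ks += BLOCK_Q) {          /* Q: K blocking */
        int q = (k - ks < BLOCK_Q) ? k - ks : BLOCK_Q;
        for (int is = 0; is < m; is += BLOCK_P) {      /* P: M blocking */
            int p = (m - is < BLOCK_P) ? m - is : BLOCK_P;
            for (int j = 0; j < n; j += UNROLL_N)      /* 8x4 micro-tiles */
                for (int i = 0; i < p; i += UNROLL_M)
                    ref_kernel_8x4(q,
                                   &A[(is + i) + (size_t)ks * lda], lda,
                                   &B[ks + (size_t)j * ldb], ldb,
                                   &C[(is + i) + (size_t)j * ldc], ldc);
        }
    }
}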