updated optimized cgemm and ctrmm kernels for POWER8
author     Werner Saar <wernsaar@googlemail.com>
           Mon, 4 Apr 2016 07:12:08 +0000 (09:12 +0200)
committer  Werner Saar <wernsaar@googlemail.com>
           Mon, 4 Apr 2016 07:12:08 +0000 (09:12 +0200)
kernel/power/cgemm_kernel_8x4_power8.S
kernel/power/cgemm_logic_8x4_power8.S
kernel/power/cgemm_macros_8x4_power8.S
kernel/power/ctrmm_kernel_8x4_power8.S
kernel/power/ctrmm_logic_8x4_power8.S
kernel/power/ctrmm_macros_8x4_power8.S [new file with mode: 0644]
param.h

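The central change in the cgemm files is a pre-broadcast copy of B: the new CGEMM_L4_COPYB / CGEMM_L2_COPYB / CGEMM_L1_COPYB loops below splat every real and imaginary word of the packed B panel into a full 16-byte vector and store the result in a scratch buffer (BBUFFER), so the inner kernels can load b_r/b_i with plain lxvw4x instead of doing xxspltw on every iteration. A minimal C sketch of that copy, assuming an interleaved re/im packed B panel of k*n complex values (the function and variable names here are illustrative, not from the source):

    /* Sketch of the CGEMM_Lx_COPYB expansion: every 32-bit word of B is
     * broadcast into a 16-byte vector, mirroring one xxspltw + stxvw4x. */
    #include <stddef.h>

    static void copy_b_broadcast(const float *b, float *bbuffer,
                                 size_t k, size_t n)
    {
        for (size_t i = 0; i < k * n; i++) {     /* k*n complex values   */
            float re = b[2 * i];
            float im = b[2 * i + 1];
            for (int w = 0; w < 4; w++) {
                bbuffer[8 * i + w]     = re;     /* b_r broadcast vector */
                bbuffer[8 * i + 4 + w] = im;     /* b_i broadcast vector */
            }
        }
    }

Each 8-byte complex element of B grows to 32 bytes in BBUFFER (a 4x expansion), which is why the kernel macros now advance BO in 64-byte steps instead of 32-byte steps, and why the copy loops add dcbtst prefetches on the store stream.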
kernel/power/cgemm_kernel_8x4_power8.S
index a7e7066..f90069e 100644
@@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 /**************************************************************************************
-* 2016/04/03 Werner Saar (wernsaar@googlemail.com)
+* 2016/04/04 Werner Saar (wernsaar@googlemail.com)
 *       BLASTEST               : OK
 *       CTEST                  : OK
 *       TEST                   : OK
@@ -137,12 +137,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define alpha_si vs31
 
 
-#define NOTUSED        r14
+#define BBUFFER        r14
 #define L      r15
 #define o12    r16
 #define o4     r17
 #define T2     r19
-#define KK     r20
+#define BBO    r20
 #define        o8      r21
 #define        I       r22
 #define J      r23
@@ -290,6 +290,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        li      o32 , 32
        li      o48 , 48
        
+        li      T1, 256
+        slwi    T1, T1, 9               // 131072
+        sub     BBUFFER, A, T1          // temp buffer for B unrolled
+
 
 #ifdef __64BIT__
        addi    T1 , SP, 296
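The scratch area itself is addressed relative to A: T1 = 256 << 9 = 131072, and BBUFFER = A - 131072, i.e. the expanded copy of B is written into the 128 KiB window just below the packed A buffer. A hedged size sketch of what that window has to hold (the constant and the per-element expansion come from the diff; the helper name is made up):

    /* Worst case is the 4-column N block: each of the 4 B values per
     * k-step expands to 32 bytes, so the panel needs 4 * K * 32 bytes. */
    #include <assert.h>

    static void check_bbuffer_fits(size_t K)
    {
        size_t needed = 4u * K * 32u;   /* expanded B panel, N block = 4 */
        assert(needed <= 131072u);      /* 256 << 9 bytes below A        */
    }

Under this assumption the K blocking set in param.h has to stay at or below 1024 for the 4-column case.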
kernel/power/cgemm_logic_8x4_power8.S
index 851a09a..db2a57f 100644
@@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 /**************************************************************************************
-* 2016/04/03 Werner Saar (wernsaar@googlemail.com)
+* 2016/04/04 Werner Saar (wernsaar@googlemail.com)
 *       BLASTEST               : OK
 *       CTEST                  : OK
 *       TEST                   : OK
@@ -38,6 +38,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 CGEMM_L4_BEGIN:
 
+       mr              BO,     B
+       mr              BBO,    BBUFFER
+       slwi            T1,     K,      3
+
+CGEMM_L4_COPYB:
+       dcbtst          BBO,    PRE
+
+       lxvw4x          vs3,    o0,     BO
+       lxvw4x          vs11,   o16,    BO
+       xxspltw         vs4,    vs3,    0
+       xxspltw         vs5,    vs3,    1
+       xxspltw         vs6,    vs3,    2
+       xxspltw         vs7,    vs3,    3
+       xxspltw         vs12,   vs11,   0
+       xxspltw         vs13,   vs11,   1
+       xxspltw         vs14,   vs11,   2
+       xxspltw         vs15,   vs11,   3
+       stxvw4x         vs4,    o0,     BBO
+       stxvw4x         vs5,    o16,    BBO
+       stxvw4x         vs6,    o32,    BBO
+       stxvw4x         vs7,    o48,    BBO
+       addi            BO,     BO,     32
+       addi            BBO,    BBO,    64
+       stxvw4x         vs12,   o0,     BBO
+       stxvw4x         vs13,   o16,    BBO
+       stxvw4x         vs14,   o32,    BBO
+       stxvw4x         vs15,   o48,    BBO
+       addic.          T1,     T1,     -8
+       addi            BBO,    BBO,    64
+
+       bge             CGEMM_L4_COPYB
+
+
        mr              CO,     C
        mr              AO,     A
        slwi            T1,     LDC     ,       2
@@ -48,7 +81,7 @@ CGEMM_L4_BEGIN:
 CGEMM_L4x8_BEGIN:
 
 
-       mr              BO,     B
+       mr              BO,     BBUFFER
        srawi.          L,      K,      3
        ble             CGEMM_L4x8_SUB0
        cmpwi           cr0,    L,      1
@@ -59,18 +92,25 @@ CGEMM_L4x8_LOOP_START:
        dcbt            AO,     PRE
        dcbt            BO,     PRE
        LOAD4x8_1
+       dcbt            BO,     PRE
        KERNEL4x8_I1
+       dcbt            BO,     PRE
        dcbt            AO,     PRE
        KERNEL4x8_2
+       dcbt            BO,     PRE
        KERNEL4x8_1
+       dcbt            BO,     PRE
        dcbt            AO,     PRE
        KERNEL4x8_2
 
+       dcbt            BO,     PRE
        KERNEL4x8_1
-       dcbt            AO,     PRE
        dcbt            BO,     PRE
+       dcbt            AO,     PRE
        KERNEL4x8_2
+       dcbt            BO,     PRE
        KERNEL4x8_1
+       dcbt            BO,     PRE
        dcbt            AO,     PRE
        KERNEL4x8_2
 
@@ -81,18 +121,25 @@ CGEMM_L4x8_LOOP_START:
 
 CGEMM_L4x8_LOOP:
 
+       dcbt            BO,     PRE
        KERNEL4x8_1
+       dcbt            BO,     PRE
        dcbt            AO,     PRE
        KERNEL4x8_2
+       dcbt            BO,     PRE
        KERNEL4x8_1
+       dcbt            BO,     PRE
        dcbt            AO,     PRE
        KERNEL4x8_2
 
+       dcbt            BO,     PRE
        KERNEL4x8_1
-       dcbt            AO,     PRE
        dcbt            BO,     PRE
+       dcbt            AO,     PRE
        KERNEL4x8_2
+       dcbt            BO,     PRE
        KERNEL4x8_1
+       dcbt            BO,     PRE
        dcbt            AO,     PRE
        KERNEL4x8_2
 
@@ -101,7 +148,9 @@ CGEMM_L4x8_LOOP:
 
 CGEMM_L4x8_LOOP_END:
 
+       dcbt            BO,     PRE
        KERNEL4x8_1
+       dcbt            BO,     PRE
        dcbt            AO,     PRE
        KERNEL4x8_2
        KERNEL4x8_1
@@ -168,7 +217,7 @@ CGEMM_L4x4_BEGIN:
 
        andi.           T1,     M,      4
        ble             CGEMM_L4x4_END
-       mr              BO,     B
+       mr              BO,     BBUFFER
        srawi.          L,      K,      3
        ble             CGEMM_L4x4_SUB0
        cmpwi           cr0,    L,      1
@@ -268,7 +317,7 @@ CGEMM_L4x2_BEGIN:
 
        andi.           T1,     M,      2
        ble             CGEMM_L4x2_END
-       mr              BO,     B
+       mr              BO,     BBUFFER
        srawi.          L,      K,      3
        ble             CGEMM_L4x2_SUB0
        cmpwi           cr0,    L,      1
@@ -368,7 +417,7 @@ CGEMM_L4x1_BEGIN:
 
        andi.           T1,     M,      1
        ble             CGEMM_L4x1_END
-       mr              BO,     B
+       mr              BO,     BBUFFER
        srawi.          L,      K,      3
        ble             CGEMM_L4x1_SUB0
        cmpwi           cr0,    L,      1
@@ -482,6 +531,39 @@ L999_H1:
 
 CGEMM_L2_BEGIN:
 
+       mr              BO,     B
+       mr              BBO,    BBUFFER
+       slwi            T1,     K,      2
+
+CGEMM_L2_COPYB:
+       dcbtst          BBO,    PRE
+
+       lxvw4x          vs3,    o0,     BO
+       lxvw4x          vs11,   o16,    BO
+       xxspltw         vs4,    vs3,    0
+       xxspltw         vs5,    vs3,    1
+       xxspltw         vs6,    vs3,    2
+       xxspltw         vs7,    vs3,    3
+       xxspltw         vs12,   vs11,   0
+       xxspltw         vs13,   vs11,   1
+       xxspltw         vs14,   vs11,   2
+       xxspltw         vs15,   vs11,   3
+       stxvw4x         vs4,    o0,     BBO
+       stxvw4x         vs5,    o16,    BBO
+       stxvw4x         vs6,    o32,    BBO
+       stxvw4x         vs7,    o48,    BBO
+       addi            BO,     BO,     32
+       addi            BBO,    BBO,    64
+       stxvw4x         vs12,   o0,     BBO
+       stxvw4x         vs13,   o16,    BBO
+       stxvw4x         vs14,   o32,    BBO
+       stxvw4x         vs15,   o48,    BBO
+       addic.          T1,     T1,     -8
+       addi            BBO,    BBO,    64
+
+       bge             CGEMM_L2_COPYB
+
+
        andi.           T1,     N,      2
        ble             CGEMM_L2_END
        mr              CO,     C
@@ -494,7 +576,7 @@ CGEMM_L2_BEGIN:
 CGEMM_L2x8_BEGIN:
 
 
-       mr              BO,     B
+       mr              BO,     BBUFFER
        srawi.          L,      K,      3
        ble             CGEMM_L2x8_SUB0
        cmpwi           cr0,    L,      1
@@ -611,7 +693,7 @@ CGEMM_L2x4_BEGIN:
 
        andi.           T1,     M,      4
        ble             CGEMM_L2x4_END
-       mr              BO,     B
+       mr              BO,     BBUFFER
        srawi.          L,      K,      3
        ble             CGEMM_L2x4_SUB0
        cmpwi           cr0,    L,      1
@@ -711,7 +793,7 @@ CGEMM_L2x2_BEGIN:
 
        andi.           T1,     M,      2
        ble             CGEMM_L2x2_END
-       mr              BO,     B
+       mr              BO,     BBUFFER
        srawi.          L,      K,      3
        ble             CGEMM_L2x2_SUB0
        cmpwi           cr0,    L,      1
@@ -811,7 +893,7 @@ CGEMM_L2x1_BEGIN:
 
        andi.           T1,     M,      1
        ble             CGEMM_L2x1_END
-       mr              BO,     B
+       mr              BO,     BBUFFER
        srawi.          L,      K,      3
        ble             CGEMM_L2x1_SUB0
        cmpwi           cr0,    L,      1
@@ -919,6 +1001,39 @@ L999_H2:
 
 CGEMM_L1_BEGIN:
 
+       mr              BO,     B
+       mr              BBO,    BBUFFER
+       slwi            T1,     K,      1
+
+CGEMM_L1_COPYB:
+       dcbtst          BBO,    PRE
+
+       lxvw4x          vs3,    o0,     BO
+       lxvw4x          vs11,   o16,    BO
+       xxspltw         vs4,    vs3,    0
+       xxspltw         vs5,    vs3,    1
+       xxspltw         vs6,    vs3,    2
+       xxspltw         vs7,    vs3,    3
+       xxspltw         vs12,   vs11,   0
+       xxspltw         vs13,   vs11,   1
+       xxspltw         vs14,   vs11,   2
+       xxspltw         vs15,   vs11,   3
+       stxvw4x         vs4,    o0,     BBO
+       stxvw4x         vs5,    o16,    BBO
+       stxvw4x         vs6,    o32,    BBO
+       stxvw4x         vs7,    o48,    BBO
+       addi            BO,     BO,     32
+       addi            BBO,    BBO,    64
+       stxvw4x         vs12,   o0,     BBO
+       stxvw4x         vs13,   o16,    BBO
+       stxvw4x         vs14,   o32,    BBO
+       stxvw4x         vs15,   o48,    BBO
+       addic.          T1,     T1,     -8
+       addi            BBO,    BBO,    64
+
+       bge             CGEMM_L1_COPYB
+
+
        andi.           T1,     N,      1
        ble             CGEMM_L1_END
        mr              CO,     C
@@ -929,7 +1044,7 @@ CGEMM_L1_BEGIN:
 CGEMM_L1x8_BEGIN:
 
 
-       mr              BO,     B
+       mr              BO,     BBUFFER
        srawi.          L,      K,      3
        ble             CGEMM_L1x8_SUB0
        cmpwi           cr0,    L,      1
@@ -1046,7 +1161,7 @@ CGEMM_L1x4_BEGIN:
 
        andi.           T1,     M,      4
        ble             CGEMM_L1x4_END
-       mr              BO,     B
+       mr              BO,     BBUFFER
        srawi.          L,      K,      3
        ble             CGEMM_L1x4_SUB0
        cmpwi           cr0,    L,      1
@@ -1146,7 +1261,7 @@ CGEMM_L1x2_BEGIN:
 
        andi.           T1,     M,      2
        ble             CGEMM_L1x2_END
-       mr              BO,     B
+       mr              BO,     BBUFFER
        srawi.          L,      K,      3
        ble             CGEMM_L1x2_SUB0
        cmpwi           cr0,    L,      1
@@ -1246,7 +1361,7 @@ CGEMM_L1x1_BEGIN:
 
        andi.           T1,     M,      1
        ble             CGEMM_L1x1_END
-       mr              BO,     B
+       mr              BO,     BBUFFER
        srawi.          L,      K,      3
        ble             CGEMM_L1x1_SUB0
        cmpwi           cr0,    L,      1
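On the consumer side (cgemm_macros_8x4_power8.S, below), the LOAD/KERNEL macros now read the pre-broadcast b_r/b_i vectors straight from BBUFFER with plain lxvw4x loads, advancing BO by 64 bytes twice per load group, and the xxspltw splats disappear from the inner loop; the logic file above interleaves extra dcbt BO,PRE prefetches because BO now walks the 4x larger expanded stream. A scalar C sketch of one k-step of the 4x8 micro-kernel, following the vs32/vs33 accumulator comments (array and function names are illustrative; the real code keeps everything in VSX registers):

    /* One k-step: 8 complex values of A (16 interleaved floats) times the
     * pre-broadcast B slice for 4 columns.  acc_br holds the a*b_r products
     * (vs32-style accumulators), acc_bi the a*b_i products (vs33-style);
     * the SAVE macros later combine them with alpha via xvsubsp/xvaddsp. */
    static void kernel_4x8_step(const float *ao, const float *bo,
                                float acc_br[4][16], float acc_bi[4][16])
    {
        for (int j = 0; j < 4; j++) {         /* four B columns            */
            float b_r = bo[8 * j];            /* any lane of the broadcast */
            float b_i = bo[8 * j + 4];        /* vectors written by COPYB  */
            for (int x = 0; x < 16; x++) {    /* interleaved re/im of A    */
                acc_br[j][x] += ao[x] * b_r;  /* xvmaddasp vsN,   a, b_r   */
                acc_bi[j][x] += ao[x] * b_i;  /* xvmaddasp vsN+1, a, b_i   */
            }
        }
    }

The SAVE macros were also tightened: the xxlxor that zeroes vs24 for the xxsldwi shifts is now done once at the top of each SAVE macro instead of being repeated before every store group.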
kernel/power/cgemm_macros_8x4_power8.S
index 48a2125..9a18cb1 100644
@@ -86,66 +86,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro LOAD4x8_1
 
        lxvw4x          vs0,    o0,     AO              // load a0, a1
-
        lxvw4x          vs1,    o16,    AO              // load a2, a3
-
        lxvw4x          vs2,    o32,    AO              // load a4, a5
-
        lxvw4x          vs3,    o48,    AO              // load a6, a7
 
-
        addi            AO,     AO,     64
 
-       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+       lxvw4x          vs8,    o0,     BO              //  load b0_r
+       lxvw4x          vs9,    o16,    BO              //  load b0_i
+       lxvw4x          vs10,   o32,    BO              //  load b1_r
+       lxvw4x          vs11,   o48,    BO              //  load b1_i
 
-       xxspltw         vs8,    vs24,   0
-       xxspltw         vs9,    vs24,   1
-       xxspltw         vs10,   vs24,   2
-       xxspltw         vs11,   vs24,   3
+       addi            BO,     BO,     64
 
-       lxvw4x          vs25,   o16,    BO              //  load b2, b3
+       lxvw4x          vs12,   o0,     BO              //  load b2_r
+       lxvw4x          vs13,   o16,    BO              //  load b2_i
+       lxvw4x          vs14,   o32,    BO              //  load b3_r
+       lxvw4x          vs15,   o48,    BO              //  load b3_i
 
-       xxspltw         vs12,   vs25,   0
-       xxspltw         vs13,   vs25,   1
-       xxspltw         vs14,   vs25,   2
-       xxspltw         vs15,   vs25,   3
+       addi            BO,     BO,     64
 
 
-       addi            BO,     BO,     32
-
 .endm
 
 .macro KERNEL4x8_I1
 
 
        lxvw4x          vs4,    o0,     AO              // load a0, a1
-
        lxvw4x          vs5,    o16,    AO              // load a2, a3
-
        lxvw4x          vs6,    o32,    AO              // load a4, a5
-
        lxvw4x          vs7,    o48,    AO              // load a6, a7
 
-
        addi            AO,     AO,     64
 
-       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+       lxvw4x          vs16,   o0,     BO              //  load b0_r
+       lxvw4x          vs17,   o16,    BO              //  load b0_i
+       lxvw4x          vs18,   o32,    BO              //  load b1_r
+       lxvw4x          vs19,   o48,    BO              //  load b1_i
 
-       xxspltw         vs16,   vs24,   0
-       xxspltw         vs17,   vs24,   1
-       xxspltw         vs18,   vs24,   2
-       xxspltw         vs19,   vs24,   3
+       addi            BO,     BO,     64
 
-       lxvw4x          vs25,   o16,    BO              //  load b2, b3
+       lxvw4x          vs20,   o0,     BO              //  load b2_r
+       lxvw4x          vs21,   o16,    BO              //  load b2_i
+       lxvw4x          vs22,   o32,    BO              //  load b3_r
+       lxvw4x          vs23,   o48,    BO              //  load b3_i
 
-       xxspltw         vs20,   vs25,   0
-       xxspltw         vs21,   vs25,   1
-       xxspltw         vs22,   vs25,   2
-       xxspltw         vs23,   vs25,   3
+       addi            BO,     BO,     64
 
 
-       addi            BO,     BO,     32
-
 
        xvmulsp         vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
        xvmulsp         vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
@@ -190,33 +178,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
        lxvw4x          vs4,    o0,     AO              // load a0, a1
-
        lxvw4x          vs5,    o16,    AO              // load a2, a3
-
        lxvw4x          vs6,    o32,    AO              // load a4, a5
-
        lxvw4x          vs7,    o48,    AO              // load a6, a7
 
-
        addi            AO,     AO,     64
 
-       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+       lxvw4x          vs16,   o0,     BO              //  load b0_r
+       lxvw4x          vs17,   o16,    BO              //  load b0_i
+       lxvw4x          vs18,   o32,    BO              //  load b1_r
+       lxvw4x          vs19,   o48,    BO              //  load b1_i
 
-       xxspltw         vs16,   vs24,   0
-       xxspltw         vs17,   vs24,   1
-       xxspltw         vs18,   vs24,   2
-       xxspltw         vs19,   vs24,   3
+       addi            BO,     BO,     64
 
-       lxvw4x          vs25,   o16,    BO              //  load b2, b3
+       lxvw4x          vs20,   o0,     BO              //  load b2_r
+       lxvw4x          vs21,   o16,    BO              //  load b2_i
+       lxvw4x          vs22,   o32,    BO              //  load b3_r
+       lxvw4x          vs23,   o48,    BO              //  load b3_i
 
-       xxspltw         vs20,   vs25,   0
-       xxspltw         vs21,   vs25,   1
-       xxspltw         vs22,   vs25,   2
-       xxspltw         vs23,   vs25,   3
+       addi            BO,     BO,     64
 
 
-       addi            BO,     BO,     32
-
 
        xvmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
        xvmaddasp       vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
@@ -261,33 +243,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
        lxvw4x          vs0,    o0,     AO              // load a0, a1
-
        lxvw4x          vs1,    o16,    AO              // load a2, a3
-
        lxvw4x          vs2,    o32,    AO              // load a4, a5
-
        lxvw4x          vs3,    o48,    AO              // load a6, a7
 
-
        addi            AO,     AO,     64
 
-       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+       lxvw4x          vs8,    o0,     BO              //  load b0_r
+       lxvw4x          vs9,    o16,    BO              //  load b0_i
+       lxvw4x          vs10,   o32,    BO              //  load b1_r
+       lxvw4x          vs11,   o48,    BO              //  load b1_i
 
-       xxspltw         vs8,    vs24,   0
-       xxspltw         vs9,    vs24,   1
-       xxspltw         vs10,   vs24,   2
-       xxspltw         vs11,   vs24,   3
+       addi            BO,     BO,     64
 
-       lxvw4x          vs25,   o16,    BO              //  load b2, b3
+       lxvw4x          vs12,   o0,     BO              //  load b2_r
+       lxvw4x          vs13,   o16,    BO              //  load b2_i
+       lxvw4x          vs14,   o32,    BO              //  load b3_r
+       lxvw4x          vs15,   o48,    BO              //  load b3_i
 
-       xxspltw         vs12,   vs25,   0
-       xxspltw         vs13,   vs25,   1
-       xxspltw         vs14,   vs25,   2
-       xxspltw         vs15,   vs25,   3
+       addi            BO,     BO,     64
 
 
-       addi            BO,     BO,     32
-
 
        xvmaddasp       vs32,   vs4,    vs16            // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r
        xvmaddasp       vs33,   vs4,    vs17            // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i
@@ -374,33 +350,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
        lxvw4x          vs0,    o0,     AO              // load a0, a1
-
        lxvw4x          vs1,    o16,    AO              // load a2, a3
-
        lxvw4x          vs2,    o32,    AO              // load a4, a5
-
        lxvw4x          vs3,    o48,    AO              // load a6, a7
 
-
        addi            AO,     AO,     64
 
-       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+       lxvw4x          vs8,    o0,     BO              //  load b0_r
+       lxvw4x          vs9,    o16,    BO              //  load b0_i
+       lxvw4x          vs10,   o32,    BO              //  load b1_r
+       lxvw4x          vs11,   o48,    BO              //  load b1_i
 
-       xxspltw         vs8,    vs24,   0
-       xxspltw         vs9,    vs24,   1
-       xxspltw         vs10,   vs24,   2
-       xxspltw         vs11,   vs24,   3
+       addi            BO,     BO,     64
 
-       lxvw4x          vs25,   o16,    BO              //  load b2, b3
+       lxvw4x          vs12,   o0,     BO              //  load b2_r
+       lxvw4x          vs13,   o16,    BO              //  load b2_i
+       lxvw4x          vs14,   o32,    BO              //  load b3_r
+       lxvw4x          vs15,   o48,    BO              //  load b3_i
 
-       xxspltw         vs12,   vs25,   0
-       xxspltw         vs13,   vs25,   1
-       xxspltw         vs14,   vs25,   2
-       xxspltw         vs15,   vs25,   3
+       addi            BO,     BO,     64
 
 
-       addi            BO,     BO,     32
-
 
        xvmulsp         vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
        xvmulsp         vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
@@ -445,33 +415,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
        lxvw4x          vs0,    o0,     AO              // load a0, a1
-
        lxvw4x          vs1,    o16,    AO              // load a2, a3
-
        lxvw4x          vs2,    o32,    AO              // load a4, a5
-
        lxvw4x          vs3,    o48,    AO              // load a6, a7
 
-
        addi            AO,     AO,     64
 
-       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+       lxvw4x          vs8,    o0,     BO              //  load b0_r
+       lxvw4x          vs9,    o16,    BO              //  load b0_i
+       lxvw4x          vs10,   o32,    BO              //  load b1_r
+       lxvw4x          vs11,   o48,    BO              //  load b1_i
 
-       xxspltw         vs8,    vs24,   0
-       xxspltw         vs9,    vs24,   1
-       xxspltw         vs10,   vs24,   2
-       xxspltw         vs11,   vs24,   3
+       addi            BO,     BO,     64
 
-       lxvw4x          vs25,   o16,    BO              //  load b2, b3
+       lxvw4x          vs12,   o0,     BO              //  load b2_r
+       lxvw4x          vs13,   o16,    BO              //  load b2_i
+       lxvw4x          vs14,   o32,    BO              //  load b3_r
+       lxvw4x          vs15,   o48,    BO              //  load b3_i
 
-       xxspltw         vs12,   vs25,   0
-       xxspltw         vs13,   vs25,   1
-       xxspltw         vs14,   vs25,   2
-       xxspltw         vs15,   vs25,   3
+       addi            BO,     BO,     64
 
 
-       addi            BO,     BO,     32
-
 
        xvmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
        xvmaddasp       vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
@@ -515,6 +479,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro SAVE4x8
 
        mr              T1,     CO
+       xxlxor          vs24,   vs24,   vs24
 
 // N=0
 
@@ -571,7 +536,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
        xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       xxlxor          vs24,   vs24,   vs24
        xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
        xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
        xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
@@ -637,7 +601,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
        xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       xxlxor          vs24,   vs24,   vs24
        xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
        xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
        xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
@@ -703,7 +666,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
        xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       xxlxor          vs24,   vs24,   vs24
        xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
        xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
        xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
@@ -769,7 +731,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
        xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       xxlxor          vs24,   vs24,   vs24
        xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
        xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
        xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
@@ -841,7 +802,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
        xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       xxlxor          vs24,   vs24,   vs24
        xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
        xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
        xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
@@ -907,7 +867,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
        xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       xxlxor          vs24,   vs24,   vs24
        xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
        xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
        xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
@@ -973,7 +932,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
        xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       xxlxor          vs24,   vs24,   vs24
        xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
        xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
        xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
@@ -1039,7 +997,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
        xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       xxlxor          vs24,   vs24,   vs24
        xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
        xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
        xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
@@ -1111,7 +1068,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
        xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       xxlxor          vs24,   vs24,   vs24
        xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
        xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
        xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
@@ -1177,7 +1133,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
        xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       xxlxor          vs24,   vs24,   vs24
        xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
        xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
        xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
@@ -1243,7 +1198,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
        xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       xxlxor          vs24,   vs24,   vs24
        xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
        xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
        xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
@@ -1309,7 +1263,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
        xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       xxlxor          vs24,   vs24,   vs24
        xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
        xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
        xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
@@ -1381,7 +1334,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
        xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       xxlxor          vs24,   vs24,   vs24
        xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
        xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
        xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
@@ -1447,7 +1399,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
        xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       xxlxor          vs24,   vs24,   vs24
        xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
        xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
        xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
@@ -1513,7 +1464,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
        xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       xxlxor          vs24,   vs24,   vs24
        xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
        xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
        xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
@@ -1579,7 +1529,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
        xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       xxlxor          vs24,   vs24,   vs24
        xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
        xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
        xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
@@ -1607,57 +1556,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro LOAD4x4_1
 
        lxvw4x          vs0,    o0,     AO              // load a0, a1
-
        lxvw4x          vs1,    o16,    AO              // load a2, a3
 
-
        addi            AO,     AO,     32
 
-       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+       lxvw4x          vs8,    o0,     BO              //  load b0_r
+       lxvw4x          vs9,    o16,    BO              //  load b0_i
+       lxvw4x          vs10,   o32,    BO              //  load b1_r
+       lxvw4x          vs11,   o48,    BO              //  load b1_i
 
-       xxspltw         vs8,    vs24,   0
-       xxspltw         vs9,    vs24,   1
-       xxspltw         vs10,   vs24,   2
-       xxspltw         vs11,   vs24,   3
+       addi            BO,     BO,     64
 
-       lxvw4x          vs25,   o16,    BO              //  load b2, b3
+       lxvw4x          vs12,   o0,     BO              //  load b2_r
+       lxvw4x          vs13,   o16,    BO              //  load b2_i
+       lxvw4x          vs14,   o32,    BO              //  load b3_r
+       lxvw4x          vs15,   o48,    BO              //  load b3_i
 
-       xxspltw         vs12,   vs25,   0
-       xxspltw         vs13,   vs25,   1
-       xxspltw         vs14,   vs25,   2
-       xxspltw         vs15,   vs25,   3
+       addi            BO,     BO,     64
 
 
-       addi            BO,     BO,     32
-
 .endm
 
 .macro KERNEL4x4_I1
 
 
        lxvw4x          vs4,    o0,     AO              // load a0, a1
-
        lxvw4x          vs5,    o16,    AO              // load a2, a3
 
-
        addi            AO,     AO,     32
 
-       lxvw4x          vs24,   o0,     BO              //  load b0, b1
-
-       xxspltw         vs16,   vs24,   0
-       xxspltw         vs17,   vs24,   1
-       xxspltw         vs18,   vs24,   2
-       xxspltw         vs19,   vs24,   3
+       lxvw4x          vs16,   o0,     BO              //  load b0_r
+       lxvw4x          vs17,   o16,    BO              //  load b0_i
+       lxvw4x          vs18,   o32,    BO              //  load b1_r
+       lxvw4x          vs19,   o48,    BO              //  load b1_i
 
-       lxvw4x          vs25,   o16,    BO              //  load b2, b3
+       addi            BO,     BO,     64
 
-       xxspltw         vs20,   vs25,   0
-       xxspltw         vs21,   vs25,   1
-       xxspltw         vs22,   vs25,   2
-       xxspltw         vs23,   vs25,   3
+       lxvw4x          vs20,   o0,     BO              //  load b2_r
+       lxvw4x          vs21,   o16,    BO              //  load b2_i
+       lxvw4x          vs22,   o32,    BO              //  load b3_r
+       lxvw4x          vs23,   o48,    BO              //  load b3_i
 
+       addi            BO,     BO,     64
 
-       addi            BO,     BO,     32
 
 
        xvmulsp         vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
@@ -1687,28 +1628,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
        lxvw4x          vs4,    o0,     AO              // load a0, a1
-
        lxvw4x          vs5,    o16,    AO              // load a2, a3
 
-
        addi            AO,     AO,     32
 
-       lxvw4x          vs24,   o0,     BO              //  load b0, b1
-
-       xxspltw         vs16,   vs24,   0
-       xxspltw         vs17,   vs24,   1
-       xxspltw         vs18,   vs24,   2
-       xxspltw         vs19,   vs24,   3
+       lxvw4x          vs16,   o0,     BO              //  load b0_r
+       lxvw4x          vs17,   o16,    BO              //  load b0_i
+       lxvw4x          vs18,   o32,    BO              //  load b1_r
+       lxvw4x          vs19,   o48,    BO              //  load b1_i
 
-       lxvw4x          vs25,   o16,    BO              //  load b2, b3
+       addi            BO,     BO,     64
 
-       xxspltw         vs20,   vs25,   0
-       xxspltw         vs21,   vs25,   1
-       xxspltw         vs22,   vs25,   2
-       xxspltw         vs23,   vs25,   3
+       lxvw4x          vs20,   o0,     BO              //  load b2_r
+       lxvw4x          vs21,   o16,    BO              //  load b2_i
+       lxvw4x          vs22,   o32,    BO              //  load b3_r
+       lxvw4x          vs23,   o48,    BO              //  load b3_i
 
+       addi            BO,     BO,     64
 
-       addi            BO,     BO,     32
 
 
        xvmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
@@ -1738,29 +1675,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
        lxvw4x          vs0,    o0,     AO              // load a0, a1
-
        lxvw4x          vs1,    o16,    AO              // load a2, a3
 
-
        addi            AO,     AO,     32
 
-       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+       lxvw4x          vs8,    o0,     BO              //  load b0_r
+       lxvw4x          vs9,    o16,    BO              //  load b0_i
+       lxvw4x          vs10,   o32,    BO              //  load b1_r
+       lxvw4x          vs11,   o48,    BO              //  load b1_i
 
-       xxspltw         vs8,    vs24,   0
-       xxspltw         vs9,    vs24,   1
-       xxspltw         vs10,   vs24,   2
-       xxspltw         vs11,   vs24,   3
+       addi            BO,     BO,     64
 
-       lxvw4x          vs25,   o16,    BO              //  load b2, b3
+       lxvw4x          vs12,   o0,     BO              //  load b2_r
+       lxvw4x          vs13,   o16,    BO              //  load b2_i
+       lxvw4x          vs14,   o32,    BO              //  load b3_r
+       lxvw4x          vs15,   o48,    BO              //  load b3_i
 
-       xxspltw         vs12,   vs25,   0
-       xxspltw         vs13,   vs25,   1
-       xxspltw         vs14,   vs25,   2
-       xxspltw         vs15,   vs25,   3
+       addi            BO,     BO,     64
 
 
-       addi            BO,     BO,     32
-
 
        xvmaddasp       vs32,   vs4,    vs16            // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r
        xvmaddasp       vs33,   vs4,    vs17            // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i
@@ -1815,28 +1748,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
        lxvw4x          vs0,    o0,     AO              // load a0, a1
-
        lxvw4x          vs1,    o16,    AO              // load a2, a3
 
-
        addi            AO,     AO,     32
 
-       lxvw4x          vs24,   o0,     BO              //  load b0, b1
-
-       xxspltw         vs8,    vs24,   0
-       xxspltw         vs9,    vs24,   1
-       xxspltw         vs10,   vs24,   2
-       xxspltw         vs11,   vs24,   3
+       lxvw4x          vs8,    o0,     BO              //  load b0_r
+       lxvw4x          vs9,    o16,    BO              //  load b0_i
+       lxvw4x          vs10,   o32,    BO              //  load b1_r
+       lxvw4x          vs11,   o48,    BO              //  load b1_i
 
-       lxvw4x          vs25,   o16,    BO              //  load b2, b3
+       addi            BO,     BO,     64
 
-       xxspltw         vs12,   vs25,   0
-       xxspltw         vs13,   vs25,   1
-       xxspltw         vs14,   vs25,   2
-       xxspltw         vs15,   vs25,   3
+       lxvw4x          vs12,   o0,     BO              //  load b2_r
+       lxvw4x          vs13,   o16,    BO              //  load b2_i
+       lxvw4x          vs14,   o32,    BO              //  load b3_r
+       lxvw4x          vs15,   o48,    BO              //  load b3_i
 
+       addi            BO,     BO,     64
 
-       addi            BO,     BO,     32
 
 
        xvmulsp         vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
@@ -1866,28 +1795,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
        lxvw4x          vs0,    o0,     AO              // load a0, a1
-
        lxvw4x          vs1,    o16,    AO              // load a2, a3
 
-
        addi            AO,     AO,     32
 
-       lxvw4x          vs24,   o0,     BO              //  load b0, b1
-
-       xxspltw         vs8,    vs24,   0
-       xxspltw         vs9,    vs24,   1
-       xxspltw         vs10,   vs24,   2
-       xxspltw         vs11,   vs24,   3
+       lxvw4x          vs8,    o0,     BO              //  load b0_r
+       lxvw4x          vs9,    o16,    BO              //  load b0_i
+       lxvw4x          vs10,   o32,    BO              //  load b1_r
+       lxvw4x          vs11,   o48,    BO              //  load b1_i
 
-       lxvw4x          vs25,   o16,    BO              //  load b2, b3
+       addi            BO,     BO,     64
 
-       xxspltw         vs12,   vs25,   0
-       xxspltw         vs13,   vs25,   1
-       xxspltw         vs14,   vs25,   2
-       xxspltw         vs15,   vs25,   3
+       lxvw4x          vs12,   o0,     BO              //  load b2_r
+       lxvw4x          vs13,   o16,    BO              //  load b2_i
+       lxvw4x          vs14,   o32,    BO              //  load b3_r
+       lxvw4x          vs15,   o48,    BO              //  load b3_i
 
+       addi            BO,     BO,     64
 
-       addi            BO,     BO,     32
 
 
        xvmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
@@ -1916,6 +1841,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro SAVE4x4
 
        mr              T1,     CO
+       xxlxor          vs24,   vs24,   vs24
 
 // N=0
 
@@ -1972,7 +1898,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
        xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       xxlxor          vs24,   vs24,   vs24
        xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
        xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
        xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
@@ -2038,7 +1963,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
        xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       xxlxor          vs24,   vs24,   vs24
        xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
        xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
        xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
@@ -2110,7 +2034,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
        xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       xxlxor          vs24,   vs24,   vs24
        xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
        xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
        xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
@@ -2176,7 +2099,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
        xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       xxlxor          vs24,   vs24,   vs24
        xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
        xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
        xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
@@ -2248,7 +2170,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
        xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       xxlxor          vs24,   vs24,   vs24
        xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
        xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
        xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
@@ -2314,7 +2235,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
        xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       xxlxor          vs24,   vs24,   vs24
        xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
        xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
        xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
@@ -2386,7 +2306,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
        xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       xxlxor          vs24,   vs24,   vs24
        xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
        xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
        xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
@@ -2452,7 +2371,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
        xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       xxlxor          vs24,   vs24,   vs24
        xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
        xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
        xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
@@ -2481,25 +2399,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        lxvw4x          vs0,    o0,     AO              // load a0, a1
 
-
        addi            AO,     AO,     16
 
-       lxvw4x          vs24,   o0,     BO              //  load b0, b1
-
-       xxspltw         vs8,    vs24,   0
-       xxspltw         vs9,    vs24,   1
-       xxspltw         vs10,   vs24,   2
-       xxspltw         vs11,   vs24,   3
+       lxvw4x          vs8,    o0,     BO              //  load b0_r
+       lxvw4x          vs9,    o16,    BO              //  load b0_i
+       lxvw4x          vs10,   o32,    BO              //  load b1_r
+       lxvw4x          vs11,   o48,    BO              //  load b1_i
 
-       lxvw4x          vs25,   o16,    BO              //  load b2, b3
+       addi            BO,     BO,     64
 
-       xxspltw         vs12,   vs25,   0
-       xxspltw         vs13,   vs25,   1
-       xxspltw         vs14,   vs25,   2
-       xxspltw         vs15,   vs25,   3
+       lxvw4x          vs12,   o0,     BO              //  load b2_r
+       lxvw4x          vs13,   o16,    BO              //  load b2_i
+       lxvw4x          vs14,   o32,    BO              //  load b3_r
+       lxvw4x          vs15,   o48,    BO              //  load b3_i
 
+       addi            BO,     BO,     64
 
-       addi            BO,     BO,     32
 
 .endm
 
@@ -2508,25 +2423,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        lxvw4x          vs4,    o0,     AO              // load a0, a1
 
-
        addi            AO,     AO,     16
 
-       lxvw4x          vs24,   o0,     BO              //  load b0, b1
-
-       xxspltw         vs16,   vs24,   0
-       xxspltw         vs17,   vs24,   1
-       xxspltw         vs18,   vs24,   2
-       xxspltw         vs19,   vs24,   3
+       lxvw4x          vs16,   o0,     BO              //  load b0_r
+       lxvw4x          vs17,   o16,    BO              //  load b0_i
+       lxvw4x          vs18,   o32,    BO              //  load b1_r
+       lxvw4x          vs19,   o48,    BO              //  load b1_i
 
-       lxvw4x          vs25,   o16,    BO              //  load b2, b3
+       addi            BO,     BO,     64
 
-       xxspltw         vs20,   vs25,   0
-       xxspltw         vs21,   vs25,   1
-       xxspltw         vs22,   vs25,   2
-       xxspltw         vs23,   vs25,   3
+       lxvw4x          vs20,   o0,     BO              //  load b2_r
+       lxvw4x          vs21,   o16,    BO              //  load b2_i
+       lxvw4x          vs22,   o32,    BO              //  load b3_r
+       lxvw4x          vs23,   o48,    BO              //  load b3_i
 
+       addi            BO,     BO,     64
 
-       addi            BO,     BO,     32
 
 
        xvmulsp         vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
@@ -2549,25 +2461,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        lxvw4x          vs4,    o0,     AO              // load a0, a1
 
-
        addi            AO,     AO,     16
 
-       lxvw4x          vs24,   o0,     BO              //  load b0, b1
-
-       xxspltw         vs16,   vs24,   0
-       xxspltw         vs17,   vs24,   1
-       xxspltw         vs18,   vs24,   2
-       xxspltw         vs19,   vs24,   3
+       lxvw4x          vs16,   o0,     BO              //  load b0_r
+       lxvw4x          vs17,   o16,    BO              //  load b0_i
+       lxvw4x          vs18,   o32,    BO              //  load b1_r
+       lxvw4x          vs19,   o48,    BO              //  load b1_i
 
-       lxvw4x          vs25,   o16,    BO              //  load b2, b3
+       addi            BO,     BO,     64
 
-       xxspltw         vs20,   vs25,   0
-       xxspltw         vs21,   vs25,   1
-       xxspltw         vs22,   vs25,   2
-       xxspltw         vs23,   vs25,   3
+       lxvw4x          vs20,   o0,     BO              //  load b2_r
+       lxvw4x          vs21,   o16,    BO              //  load b2_i
+       lxvw4x          vs22,   o32,    BO              //  load b3_r
+       lxvw4x          vs23,   o48,    BO              //  load b3_i
 
+       addi            BO,     BO,     64
 
-       addi            BO,     BO,     32
 
 
        xvmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
@@ -2590,26 +2499,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        lxvw4x          vs0,    o0,     AO              // load a0, a1
 
-
        addi            AO,     AO,     16
 
-       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+       lxvw4x          vs8,    o0,     BO              //  load b0_r
+       lxvw4x          vs9,    o16,    BO              //  load b0_i
+       lxvw4x          vs10,   o32,    BO              //  load b1_r
+       lxvw4x          vs11,   o48,    BO              //  load b1_i
 
-       xxspltw         vs8,    vs24,   0
-       xxspltw         vs9,    vs24,   1
-       xxspltw         vs10,   vs24,   2
-       xxspltw         vs11,   vs24,   3
+       addi            BO,     BO,     64
 
-       lxvw4x          vs25,   o16,    BO              //  load b2, b3
+       lxvw4x          vs12,   o0,     BO              //  load b2_r
+       lxvw4x          vs13,   o16,    BO              //  load b2_i
+       lxvw4x          vs14,   o32,    BO              //  load b3_r
+       lxvw4x          vs15,   o48,    BO              //  load b3_i
 
-       xxspltw         vs12,   vs25,   0
-       xxspltw         vs13,   vs25,   1
-       xxspltw         vs14,   vs25,   2
-       xxspltw         vs15,   vs25,   3
+       addi            BO,     BO,     64
 
 
-       addi            BO,     BO,     32
-
 
        xvmaddasp       vs32,   vs4,    vs16            // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r
        xvmaddasp       vs33,   vs4,    vs17            // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i
@@ -2649,25 +2555,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        lxvw4x          vs0,    o0,     AO              // load a0, a1
 
-
        addi            AO,     AO,     16
 
-       lxvw4x          vs24,   o0,     BO              //  load b0, b1
-
-       xxspltw         vs8,    vs24,   0
-       xxspltw         vs9,    vs24,   1
-       xxspltw         vs10,   vs24,   2
-       xxspltw         vs11,   vs24,   3
+       lxvw4x          vs8,    o0,     BO              //  load b0_r
+       lxvw4x          vs9,    o16,    BO              //  load b0_i
+       lxvw4x          vs10,   o32,    BO              //  load b1_r
+       lxvw4x          vs11,   o48,    BO              //  load b1_i
 
-       lxvw4x          vs25,   o16,    BO              //  load b2, b3
+       addi            BO,     BO,     64
 
-       xxspltw         vs12,   vs25,   0
-       xxspltw         vs13,   vs25,   1
-       xxspltw         vs14,   vs25,   2
-       xxspltw         vs15,   vs25,   3
+       lxvw4x          vs12,   o0,     BO              //  load b2_r
+       lxvw4x          vs13,   o16,    BO              //  load b2_i
+       lxvw4x          vs14,   o32,    BO              //  load b3_r
+       lxvw4x          vs15,   o48,    BO              //  load b3_i
 
+       addi            BO,     BO,     64
 
-       addi            BO,     BO,     32
 
 
        xvmulsp         vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
@@ -2690,25 +2593,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        lxvw4x          vs0,    o0,     AO              // load a0, a1
 
-
        addi            AO,     AO,     16
 
-       lxvw4x          vs24,   o0,     BO              //  load b0, b1
-
-       xxspltw         vs8,    vs24,   0
-       xxspltw         vs9,    vs24,   1
-       xxspltw         vs10,   vs24,   2
-       xxspltw         vs11,   vs24,   3
+       lxvw4x          vs8,    o0,     BO              //  load b0_r
+       lxvw4x          vs9,    o16,    BO              //  load b0_i
+       lxvw4x          vs10,   o32,    BO              //  load b1_r
+       lxvw4x          vs11,   o48,    BO              //  load b1_i
 
-       lxvw4x          vs25,   o16,    BO              //  load b2, b3
+       addi            BO,     BO,     64
 
-       xxspltw         vs12,   vs25,   0
-       xxspltw         vs13,   vs25,   1
-       xxspltw         vs14,   vs25,   2
-       xxspltw         vs15,   vs25,   3
+       lxvw4x          vs12,   o0,     BO              //  load b2_r
+       lxvw4x          vs13,   o16,    BO              //  load b2_i
+       lxvw4x          vs14,   o32,    BO              //  load b3_r
+       lxvw4x          vs15,   o48,    BO              //  load b3_i
 
+       addi            BO,     BO,     64
 
-       addi            BO,     BO,     32
 
 
        xvmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
@@ -2729,6 +2629,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro SAVE4x2
 
        mr              T1,     CO
+       xxlxor          vs24,   vs24,   vs24
 
 // N=0
 
@@ -2785,7 +2686,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
        xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       xxlxor          vs24,   vs24,   vs24
        xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
        xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
        xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
@@ -2857,7 +2757,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
        xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       xxlxor          vs24,   vs24,   vs24
        xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
        xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
        xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
@@ -2929,7 +2828,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
        xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       xxlxor          vs24,   vs24,   vs24
        xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
        xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
        xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
@@ -3001,7 +2899,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
        xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       xxlxor          vs24,   vs24,   vs24
        xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
        xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
        xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
@@ -3033,27 +2930,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     8
 
-       mr              T1,     BO
+       lxsspx          vs8,    o0,     BO              //  load b0_r
+       lxsspx          vs9,    o16,    BO              //  load b0_i
+       lxsspx          vs10,   o32,    BO              //  load b1_r
+       lxsspx          vs11,   o48,    BO              //  load b1_i
 
-       lxsspx          vs8,    o0,     T1              //  load b0_r
-       lxsspx          vs9,    o4,     T1              //  load b0_i
+       addi            BO,     BO,     64
 
-       addi            T1,     T1,8
+       lxsspx          vs12,   o0,     BO              //  load b2_r
+       lxsspx          vs13,   o16,    BO              //  load b2_i
+       lxsspx          vs14,   o32,    BO              //  load b3_r
+       lxsspx          vs15,   o48,    BO              //  load b3_i
 
-       lxsspx          vs10,   o0,     T1              //  load b1_r
-       lxsspx          vs11,   o4,     T1              //  load b1_i
+       addi            BO,     BO,     64
 
-       addi            T1,     T1,8
-
-       lxsspx          vs12,   o0,     T1              //  load b2_r
-       lxsspx          vs13,   o4,     T1              //  load b2_i
-
-       addi            T1,     T1,8
-
-       lxsspx          vs14,   o0,     T1              //  load b3_r
-       lxsspx          vs15,   o4,     T1              //  load b3_i
-
-       addi            BO,     BO,     32
 
 .endm
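The scalar LOAD4x1/KERNEL4x1 path reads B from the same pre-expanded buffer: each real and imaginary part of B is stored as four identical words, so consecutive scalars sit 16 bytes apart (hence o0/o16/o32/o48 in place of the old o0/o4 pairs and the T1 pointer walk) and BO advances 64 bytes per group of two complex B values. A small C sketch of that addressing, assuming the splatted layout of the 4-column panel (names are illustrative):

    #include <stdio.h>

    /* Byte offset of one B scalar inside the splatted 4-column panel:
     * per k step there are 4 complex values = 8 scalars, 16 bytes each. */
    static long splat_offset(long k, int col, int imag) {
        return k * 8 * 16 + (col * 2 + imag) * 16;
    }

    int main(void) {
        /* b0_r, b0_i, b1_r, b1_i of the first k step: 0, 16, 32, 48 as in LOAD4x1_1 */
        printf("%ld %ld %ld %ld\n",
               splat_offset(0, 0, 0), splat_offset(0, 0, 1),
               splat_offset(0, 1, 0), splat_offset(0, 1, 1));
        /* b2_r starts at 64, reached by the addi BO,BO,64 between the two load groups */
        printf("%ld\n", splat_offset(0, 2, 0));
        return 0;
    }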
 
@@ -3065,27 +2955,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     8
 
-       mr              T1,     BO
-
-       lxsspx          vs16,   o0,     T1              //  load b0_r
-       lxsspx          vs17,   o4,     T1              //  load b0_i
+       lxsspx          vs16,   o0,     BO              //  load b0_r
+       lxsspx          vs17,   o16,    BO              //  load b0_i
+       lxsspx          vs18,   o32,    BO              //  load b1_r
+       lxsspx          vs19,   o48,    BO              //  load b1_i
 
-       addi            T1,     T1,8
+       addi            BO,     BO,     64
 
-       lxsspx          vs18,   o0,     T1              //  load b1_r
-       lxsspx          vs19,   o4,     T1              //  load b1_i
+       lxsspx          vs20,   o0,     BO              //  load b2_r
+       lxsspx          vs21,   o16,    BO              //  load b2_i
+       lxsspx          vs22,   o32,    BO              //  load b3_r
+       lxsspx          vs23,   o48,    BO              //  load b3_i
 
-       addi            T1,     T1,8
+       addi            BO,     BO,     64
 
-       lxsspx          vs20,   o0,     T1              //  load b2_r
-       lxsspx          vs21,   o4,     T1              //  load b2_i
-
-       addi            T1,     T1,8
-
-       lxsspx          vs22,   o0,     T1              //  load b3_r
-       lxsspx          vs23,   o4,     T1              //  load b3_i
-
-       addi            BO,     BO,     32
 
 
        xsmuldp         vs32,   vs0,    vs8             // a0_r*b0_r
@@ -3119,27 +3002,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     8
 
-       mr              T1,     BO
-
-       lxsspx          vs16,   o0,     T1              //  load b0_r
-       lxsspx          vs17,   o4,     T1              //  load b0_i
-
-       addi            T1,     T1,8
+       lxsspx          vs16,   o0,     BO              //  load b0_r
+       lxsspx          vs17,   o16,    BO              //  load b0_i
+       lxsspx          vs18,   o32,    BO              //  load b1_r
+       lxsspx          vs19,   o48,    BO              //  load b1_i
 
-       lxsspx          vs18,   o0,     T1              //  load b1_r
-       lxsspx          vs19,   o4,     T1              //  load b1_i
+       addi            BO,     BO,     64
 
-       addi            T1,     T1,8
+       lxsspx          vs20,   o0,     BO              //  load b2_r
+       lxsspx          vs21,   o16,    BO              //  load b2_i
+       lxsspx          vs22,   o32,    BO              //  load b3_r
+       lxsspx          vs23,   o48,    BO              //  load b3_i
 
-       lxsspx          vs20,   o0,     T1              //  load b2_r
-       lxsspx          vs21,   o4,     T1              //  load b2_i
+       addi            BO,     BO,     64
 
-       addi            T1,     T1,8
-
-       lxsspx          vs22,   o0,     T1              //  load b3_r
-       lxsspx          vs23,   o4,     T1              //  load b3_i
-
-       addi            BO,     BO,     32
 
 
        xsmaddadp       vs32,   vs0,    vs8             // a0_r*b0_r
@@ -3173,27 +3049,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     8
 
-       mr              T1,     BO
-
-       lxsspx          vs8,    o0,     T1              //  load b0_r
-       lxsspx          vs9,    o4,     T1              //  load b0_i
-
-       addi            T1,     T1,8
-
-       lxsspx          vs10,   o0,     T1              //  load b1_r
-       lxsspx          vs11,   o4,     T1              //  load b1_i
+       lxsspx          vs8,    o0,     BO              //  load b0_r
+       lxsspx          vs9,    o16,    BO              //  load b0_i
+       lxsspx          vs10,   o32,    BO              //  load b1_r
+       lxsspx          vs11,   o48,    BO              //  load b1_i
 
-       addi            T1,     T1,8
+       addi            BO,     BO,     64
 
-       lxsspx          vs12,   o0,     T1              //  load b2_r
-       lxsspx          vs13,   o4,     T1              //  load b2_i
+       lxsspx          vs12,   o0,     BO              //  load b2_r
+       lxsspx          vs13,   o16,    BO              //  load b2_i
+       lxsspx          vs14,   o32,    BO              //  load b3_r
+       lxsspx          vs15,   o48,    BO              //  load b3_i
 
-       addi            T1,     T1,8
+       addi            BO,     BO,     64
 
-       lxsspx          vs14,   o0,     T1              //  load b3_r
-       lxsspx          vs15,   o4,     T1              //  load b3_i
-
-       addi            BO,     BO,     32
 
 
        xsmaddadp       vs32,   vs4,    vs16            // a4_r*b0_r
@@ -3253,27 +3122,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     8
 
-       mr              T1,     BO
-
-       lxsspx          vs8,    o0,     T1              //  load b0_r
-       lxsspx          vs9,    o4,     T1              //  load b0_i
-
-       addi            T1,     T1,8
-
-       lxsspx          vs10,   o0,     T1              //  load b1_r
-       lxsspx          vs11,   o4,     T1              //  load b1_i
-
-       addi            T1,     T1,8
+       lxsspx          vs8,    o0,     BO              //  load b0_r
+       lxsspx          vs9,    o16,    BO              //  load b0_i
+       lxsspx          vs10,   o32,    BO              //  load b1_r
+       lxsspx          vs11,   o48,    BO              //  load b1_i
 
-       lxsspx          vs12,   o0,     T1              //  load b2_r
-       lxsspx          vs13,   o4,     T1              //  load b2_i
+       addi            BO,     BO,     64
 
-       addi            T1,     T1,8
+       lxsspx          vs12,   o0,     BO              //  load b2_r
+       lxsspx          vs13,   o16,    BO              //  load b2_i
+       lxsspx          vs14,   o32,    BO              //  load b3_r
+       lxsspx          vs15,   o48,    BO              //  load b3_i
 
-       lxsspx          vs14,   o0,     T1              //  load b3_r
-       lxsspx          vs15,   o4,     T1              //  load b3_i
+       addi            BO,     BO,     64
 
-       addi            BO,     BO,     32
 
 
        xsmuldp         vs32,   vs0,    vs8             // a0_r*b0_r
@@ -3307,27 +3169,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     8
 
-       mr              T1,     BO
-
-       lxsspx          vs8,    o0,     T1              //  load b0_r
-       lxsspx          vs9,    o4,     T1              //  load b0_i
-
-       addi            T1,     T1,8
+       lxsspx          vs8,    o0,     BO              //  load b0_r
+       lxsspx          vs9,    o16,    BO              //  load b0_i
+       lxsspx          vs10,   o32,    BO              //  load b1_r
+       lxsspx          vs11,   o48,    BO              //  load b1_i
 
-       lxsspx          vs10,   o0,     T1              //  load b1_r
-       lxsspx          vs11,   o4,     T1              //  load b1_i
+       addi            BO,     BO,     64
 
-       addi            T1,     T1,8
+       lxsspx          vs12,   o0,     BO              //  load b2_r
+       lxsspx          vs13,   o16,    BO              //  load b2_i
+       lxsspx          vs14,   o32,    BO              //  load b3_r
+       lxsspx          vs15,   o48,    BO              //  load b3_i
 
-       lxsspx          vs12,   o0,     T1              //  load b2_r
-       lxsspx          vs13,   o4,     T1              //  load b2_i
+       addi            BO,     BO,     64
 
-       addi            T1,     T1,8
-
-       lxsspx          vs14,   o0,     T1              //  load b3_r
-       lxsspx          vs15,   o4,     T1              //  load b3_i
-
-       addi            BO,     BO,     32
 
 
        xsmaddadp       vs32,   vs0,    vs8             // a0_r*b0_r
@@ -3356,6 +3211,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro SAVE4x1
 
        mr              T1,     CO
+       xxlxor          vs24,   vs24,   vs24
 
 // N=0
 
@@ -3536,25 +3392,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro LOAD2x8_1
 
        lxvw4x          vs0,    o0,     AO              // load a0, a1
-
        lxvw4x          vs1,    o16,    AO              // load a2, a3
-
        lxvw4x          vs2,    o32,    AO              // load a4, a5
-
        lxvw4x          vs3,    o48,    AO              // load a6, a7
 
-
        addi            AO,     AO,     64
 
-       lxvw4x          vs24,   o0,     BO              //  load b0, b1
-
-       xxspltw         vs8,    vs24,   0
-       xxspltw         vs9,    vs24,   1
-       xxspltw         vs10,   vs24,   2
-       xxspltw         vs11,   vs24,   3
+       lxvw4x          vs8,    o0,     BO              //  load b0_r
+       lxvw4x          vs9,    o16,    BO              //  load b0_i
+       lxvw4x          vs10,   o32,    BO              //  load b1_r
+       lxvw4x          vs11,   o48,    BO              //  load b1_i
 
+       addi            BO,     BO,     64
 
-       addi            BO,     BO,     16
 
 .endm
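LOAD2x8_1 shows the core of the rewrite: instead of one packed lxvw4x of B followed by four xxspltw splats inside the inner loop, the kernel now loads four already-splatted vectors (b0_r, b0_i, b1_r, b1_i) straight from the temporary buffer and steps BO by 64. The splatting is paid once per panel by a copy loop outside the compute kernel; below is a minimal C sketch of that expansion for one k step of a 2-column panel, under the assumption that the 2-column buffer uses the same 16-bytes-per-scalar layout (the helper name is made up):

    #include <stdio.h>

    /* Expand one k step of a 2-column complex B panel (b0_r, b0_i, b1_r, b1_i)
     * into four splatted vectors of four identical words each. */
    static void splat_b2(const float *b, float *bbuf) {
        for (int s = 0; s < 4; s++)        /* 4 scalars per k step when N=2 */
            for (int w = 0; w < 4; w++)    /* 4 copies -> one 16-byte vector */
                bbuf[s * 4 + w] = b[s];
    }

    int main(void) {
        float b[4] = {1.0f, 2.0f, 3.0f, 4.0f};   /* b0_r, b0_i, b1_r, b1_i */
        float bbuf[16];
        splat_b2(b, bbuf);
        for (int i = 0; i < 16; i++)
            printf("%g%c", bbuf[i], (i % 4 == 3) ? '\n' : ' ');
        return 0;
    }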
 
@@ -3562,25 +3412,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
        lxvw4x          vs4,    o0,     AO              // load a0, a1
-
        lxvw4x          vs5,    o16,    AO              // load a2, a3
-
        lxvw4x          vs6,    o32,    AO              // load a4, a5
-
        lxvw4x          vs7,    o48,    AO              // load a6, a7
 
-
        addi            AO,     AO,     64
 
-       lxvw4x          vs24,   o0,     BO              //  load b0, b1
-
-       xxspltw         vs16,   vs24,   0
-       xxspltw         vs17,   vs24,   1
-       xxspltw         vs18,   vs24,   2
-       xxspltw         vs19,   vs24,   3
+       lxvw4x          vs16,   o0,     BO              //  load b0_r
+       lxvw4x          vs17,   o16,    BO              //  load b0_i
+       lxvw4x          vs18,   o32,    BO              //  load b1_r
+       lxvw4x          vs19,   o48,    BO              //  load b1_i
 
+       addi            BO,     BO,     64
 
-       addi            BO,     BO,     16
 
 
        xvmulsp         vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
@@ -3608,26 +3452,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
        lxvw4x          vs4,    o0,     AO              // load a0, a1
-
        lxvw4x          vs5,    o16,    AO              // load a2, a3
-
        lxvw4x          vs6,    o32,    AO              // load a4, a5
-
        lxvw4x          vs7,    o48,    AO              // load a6, a7
 
-
        addi            AO,     AO,     64
 
-       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+       lxvw4x          vs16,   o0,     BO              //  load b0_r
+       lxvw4x          vs17,   o16,    BO              //  load b0_i
+       lxvw4x          vs18,   o32,    BO              //  load b1_r
+       lxvw4x          vs19,   o48,    BO              //  load b1_i
 
-       xxspltw         vs16,   vs24,   0
-       xxspltw         vs17,   vs24,   1
-       xxspltw         vs18,   vs24,   2
-       xxspltw         vs19,   vs24,   3
+       addi            BO,     BO,     64
 
 
-       addi            BO,     BO,     16
-
 
        xvmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
        xvmaddasp       vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
@@ -3654,25 +3492,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
        lxvw4x          vs0,    o0,     AO              // load a0, a1
-
        lxvw4x          vs1,    o16,    AO              // load a2, a3
-
        lxvw4x          vs2,    o32,    AO              // load a4, a5
-
        lxvw4x          vs3,    o48,    AO              // load a6, a7
 
-
        addi            AO,     AO,     64
 
-       lxvw4x          vs24,   o0,     BO              //  load b0, b1
-
-       xxspltw         vs8,    vs24,   0
-       xxspltw         vs9,    vs24,   1
-       xxspltw         vs10,   vs24,   2
-       xxspltw         vs11,   vs24,   3
+       lxvw4x          vs8,    o0,     BO              //  load b0_r
+       lxvw4x          vs9,    o16,    BO              //  load b0_i
+       lxvw4x          vs10,   o32,    BO              //  load b1_r
+       lxvw4x          vs11,   o48,    BO              //  load b1_i
 
+       addi            BO,     BO,     64
 
-       addi            BO,     BO,     16
 
 
        xvmaddasp       vs32,   vs4,    vs16            // a4_r*b0_r, a4_i*b0_r, a5_r*b0_r, a5_i*b0_r
@@ -3724,25 +3556,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
        lxvw4x          vs0,    o0,     AO              // load a0, a1
-
        lxvw4x          vs1,    o16,    AO              // load a2, a3
-
        lxvw4x          vs2,    o32,    AO              // load a4, a5
-
        lxvw4x          vs3,    o48,    AO              // load a6, a7
 
-
        addi            AO,     AO,     64
 
-       lxvw4x          vs24,   o0,     BO              //  load b0, b1
-
-       xxspltw         vs8,    vs24,   0
-       xxspltw         vs9,    vs24,   1
-       xxspltw         vs10,   vs24,   2
-       xxspltw         vs11,   vs24,   3
+       lxvw4x          vs8,    o0,     BO              //  load b0_r
+       lxvw4x          vs9,    o16,    BO              //  load b0_i
+       lxvw4x          vs10,   o32,    BO              //  load b1_r
+       lxvw4x          vs11,   o48,    BO              //  load b1_i
 
+       addi            BO,     BO,     64
 
-       addi            BO,     BO,     16
 
 
        xvmulsp         vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
@@ -3770,26 +3596,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
        lxvw4x          vs0,    o0,     AO              // load a0, a1
-
        lxvw4x          vs1,    o16,    AO              // load a2, a3
-
        lxvw4x          vs2,    o32,    AO              // load a4, a5
-
        lxvw4x          vs3,    o48,    AO              // load a6, a7
 
-
        addi            AO,     AO,     64
 
-       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+       lxvw4x          vs8,    o0,     BO              //  load b0_r
+       lxvw4x          vs9,    o16,    BO              //  load b0_i
+       lxvw4x          vs10,   o32,    BO              //  load b1_r
+       lxvw4x          vs11,   o48,    BO              //  load b1_i
 
-       xxspltw         vs8,    vs24,   0
-       xxspltw         vs9,    vs24,   1
-       xxspltw         vs10,   vs24,   2
-       xxspltw         vs11,   vs24,   3
+       addi            BO,     BO,     64
 
 
-       addi            BO,     BO,     16
-
 
        xvmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
        xvmaddasp       vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
@@ -3815,6 +3635,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro SAVE2x8
 
        mr              T1,     CO
+       xxlxor          vs24,   vs24,   vs24
 
 // N=0
 
@@ -3871,7 +3692,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
        xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       xxlxor          vs24,   vs24,   vs24
        xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
        xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
        xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
@@ -3937,7 +3757,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
        xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       xxlxor          vs24,   vs24,   vs24
        xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
        xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
        xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
@@ -4003,7 +3822,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
        xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       xxlxor          vs24,   vs24,   vs24
        xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
        xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
        xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
@@ -4069,7 +3887,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
        xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       xxlxor          vs24,   vs24,   vs24
        xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
        xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
        xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
@@ -4141,7 +3958,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
        xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       xxlxor          vs24,   vs24,   vs24
        xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
        xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
        xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
@@ -4207,7 +4023,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
        xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       xxlxor          vs24,   vs24,   vs24
        xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
        xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
        xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
@@ -4273,7 +4088,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
        xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       xxlxor          vs24,   vs24,   vs24
        xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
        xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
        xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
@@ -4339,7 +4153,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
        xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       xxlxor          vs24,   vs24,   vs24
        xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
        xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
        xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
@@ -4367,44 +4180,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro LOAD2x4_1
 
        lxvw4x          vs0,    o0,     AO              // load a0, a1
-
        lxvw4x          vs1,    o16,    AO              // load a2, a3
 
-
        addi            AO,     AO,     32
 
-       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+       lxvw4x          vs8,    o0,     BO              //  load b0_r
+       lxvw4x          vs9,    o16,    BO              //  load b0_i
+       lxvw4x          vs10,   o32,    BO              //  load b1_r
+       lxvw4x          vs11,   o48,    BO              //  load b1_i
 
-       xxspltw         vs8,    vs24,   0
-       xxspltw         vs9,    vs24,   1
-       xxspltw         vs10,   vs24,   2
-       xxspltw         vs11,   vs24,   3
+       addi            BO,     BO,     64
 
 
-       addi            BO,     BO,     16
-
 .endm
 
 .macro KERNEL2x4_I1
 
 
        lxvw4x          vs4,    o0,     AO              // load a0, a1
-
        lxvw4x          vs5,    o16,    AO              // load a2, a3
 
-
        addi            AO,     AO,     32
 
-       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+       lxvw4x          vs16,   o0,     BO              //  load b0_r
+       lxvw4x          vs17,   o16,    BO              //  load b0_i
+       lxvw4x          vs18,   o32,    BO              //  load b1_r
+       lxvw4x          vs19,   o48,    BO              //  load b1_i
 
-       xxspltw         vs16,   vs24,   0
-       xxspltw         vs17,   vs24,   1
-       xxspltw         vs18,   vs24,   2
-       xxspltw         vs19,   vs24,   3
+       addi            BO,     BO,     64
 
 
-       addi            BO,     BO,     16
-
 
        xvmulsp         vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
        xvmulsp         vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
@@ -4423,22 +4228,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
        lxvw4x          vs4,    o0,     AO              // load a0, a1
-
        lxvw4x          vs5,    o16,    AO              // load a2, a3
 
-
        addi            AO,     AO,     32
 
-       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+       lxvw4x          vs16,   o0,     BO              //  load b0_r
+       lxvw4x          vs17,   o16,    BO              //  load b0_i
+       lxvw4x          vs18,   o32,    BO              //  load b1_r
+       lxvw4x          vs19,   o48,    BO              //  load b1_i
 
-       xxspltw         vs16,   vs24,   0
-       xxspltw         vs17,   vs24,   1
-       xxspltw         vs18,   vs24,   2
-       xxspltw         vs19,   vs24,   3
+       addi            BO,     BO,     64
 
 
-       addi            BO,     BO,     16
-
 
        xvmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
        xvmaddasp       vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
@@ -4457,22 +4258,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
        lxvw4x          vs0,    o0,     AO              // load a0, a1
-
        lxvw4x          vs1,    o16,    AO              // load a2, a3
 
-
        addi            AO,     AO,     32
 
-       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+       lxvw4x          vs8,    o0,     BO              //  load b0_r
+       lxvw4x          vs9,    o16,    BO              //  load b0_i
+       lxvw4x          vs10,   o32,    BO              //  load b1_r
+       lxvw4x          vs11,   o48,    BO              //  load b1_i
 
-       xxspltw         vs8,    vs24,   0
-       xxspltw         vs9,    vs24,   1
-       xxspltw         vs10,   vs24,   2
-       xxspltw         vs11,   vs24,   3
+       addi            BO,     BO,     64
 
 
-       addi            BO,     BO,     16
-
 
        xvmaddasp       vs32,   vs4,    vs16            // a4_r*b0_r, a4_i*b0_r, a5_r*b0_r, a5_i*b0_r
        xvmaddasp       vs33,   vs4,    vs17            // a4_r*b0_i, a4_i*b0_i, a5_r*b0_i, a5_i*b0_i
@@ -4507,22 +4304,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
        lxvw4x          vs0,    o0,     AO              // load a0, a1
-
        lxvw4x          vs1,    o16,    AO              // load a2, a3
 
-
        addi            AO,     AO,     32
 
-       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+       lxvw4x          vs8,    o0,     BO              //  load b0_r
+       lxvw4x          vs9,    o16,    BO              //  load b0_i
+       lxvw4x          vs10,   o32,    BO              //  load b1_r
+       lxvw4x          vs11,   o48,    BO              //  load b1_i
 
-       xxspltw         vs8,    vs24,   0
-       xxspltw         vs9,    vs24,   1
-       xxspltw         vs10,   vs24,   2
-       xxspltw         vs11,   vs24,   3
+       addi            BO,     BO,     64
 
 
-       addi            BO,     BO,     16
-
 
        xvmulsp         vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
        xvmulsp         vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
@@ -4541,22 +4334,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
        lxvw4x          vs0,    o0,     AO              // load a0, a1
-
        lxvw4x          vs1,    o16,    AO              // load a2, a3
 
-
        addi            AO,     AO,     32
 
-       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+       lxvw4x          vs8,    o0,     BO              //  load b0_r
+       lxvw4x          vs9,    o16,    BO              //  load b0_i
+       lxvw4x          vs10,   o32,    BO              //  load b1_r
+       lxvw4x          vs11,   o48,    BO              //  load b1_i
 
-       xxspltw         vs8,    vs24,   0
-       xxspltw         vs9,    vs24,   1
-       xxspltw         vs10,   vs24,   2
-       xxspltw         vs11,   vs24,   3
+       addi            BO,     BO,     64
 
 
-       addi            BO,     BO,     16
-
 
        xvmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
        xvmaddasp       vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
@@ -4574,6 +4363,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro SAVE2x4
 
        mr              T1,     CO
+       xxlxor          vs24,   vs24,   vs24
 
 // N=0
 
@@ -4630,7 +4420,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
        xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       xxlxor          vs24,   vs24,   vs24
        xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
        xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
        xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
@@ -4696,7 +4485,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
        xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       xxlxor          vs24,   vs24,   vs24
        xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
        xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
        xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
@@ -4768,7 +4556,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
        xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       xxlxor          vs24,   vs24,   vs24
        xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
        xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
        xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
@@ -4834,7 +4621,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
        xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       xxlxor          vs24,   vs24,   vs24
        xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
        xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
        xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
@@ -4863,18 +4649,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        lxvw4x          vs0,    o0,     AO              // load a0, a1
 
-
        addi            AO,     AO,     16
 
-       lxvw4x          vs24,   o0,     BO              //  load b0, b1
-
-       xxspltw         vs8,    vs24,   0
-       xxspltw         vs9,    vs24,   1
-       xxspltw         vs10,   vs24,   2
-       xxspltw         vs11,   vs24,   3
+       lxvw4x          vs8,    o0,     BO              //  load b0_r
+       lxvw4x          vs9,    o16,    BO              //  load b0_i
+       lxvw4x          vs10,   o32,    BO              //  load b1_r
+       lxvw4x          vs11,   o48,    BO              //  load b1_i
 
+       addi            BO,     BO,     64
 
-       addi            BO,     BO,     16
 
 .endm
 
@@ -4883,19 +4666,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        lxvw4x          vs4,    o0,     AO              // load a0, a1
 
-
        addi            AO,     AO,     16
 
-       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+       lxvw4x          vs16,   o0,     BO              //  load b0_r
+       lxvw4x          vs17,   o16,    BO              //  load b0_i
+       lxvw4x          vs18,   o32,    BO              //  load b1_r
+       lxvw4x          vs19,   o48,    BO              //  load b1_i
 
-       xxspltw         vs16,   vs24,   0
-       xxspltw         vs17,   vs24,   1
-       xxspltw         vs18,   vs24,   2
-       xxspltw         vs19,   vs24,   3
+       addi            BO,     BO,     64
 
 
-       addi            BO,     BO,     16
-
 
        xvmulsp         vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
        xvmulsp         vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
@@ -4911,18 +4691,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        lxvw4x          vs4,    o0,     AO              // load a0, a1
 
-
        addi            AO,     AO,     16
 
-       lxvw4x          vs24,   o0,     BO              //  load b0, b1
-
-       xxspltw         vs16,   vs24,   0
-       xxspltw         vs17,   vs24,   1
-       xxspltw         vs18,   vs24,   2
-       xxspltw         vs19,   vs24,   3
+       lxvw4x          vs16,   o0,     BO              //  load b0_r
+       lxvw4x          vs17,   o16,    BO              //  load b0_i
+       lxvw4x          vs18,   o32,    BO              //  load b1_r
+       lxvw4x          vs19,   o48,    BO              //  load b1_i
 
+       addi            BO,     BO,     64
 
-       addi            BO,     BO,     16
 
 
        xvmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
@@ -4939,18 +4716,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        lxvw4x          vs0,    o0,     AO              // load a0, a1
 
-
        addi            AO,     AO,     16
 
-       lxvw4x          vs24,   o0,     BO              //  load b0, b1
-
-       xxspltw         vs8,    vs24,   0
-       xxspltw         vs9,    vs24,   1
-       xxspltw         vs10,   vs24,   2
-       xxspltw         vs11,   vs24,   3
+       lxvw4x          vs8,    o0,     BO              //  load b0_r
+       lxvw4x          vs9,    o16,    BO              //  load b0_i
+       lxvw4x          vs10,   o32,    BO              //  load b1_r
+       lxvw4x          vs11,   o48,    BO              //  load b1_i
 
+       addi            BO,     BO,     64
 
-       addi            BO,     BO,     16
 
 
        xvmaddasp       vs32,   vs4,    vs16            // a4_r*b0_r, a4_i*b0_r, a5_r*b0_r, a5_i*b0_r
@@ -4979,19 +4753,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        lxvw4x          vs0,    o0,     AO              // load a0, a1
 
-
        addi            AO,     AO,     16
 
-       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+       lxvw4x          vs8,    o0,     BO              //  load b0_r
+       lxvw4x          vs9,    o16,    BO              //  load b0_i
+       lxvw4x          vs10,   o32,    BO              //  load b1_r
+       lxvw4x          vs11,   o48,    BO              //  load b1_i
 
-       xxspltw         vs8,    vs24,   0
-       xxspltw         vs9,    vs24,   1
-       xxspltw         vs10,   vs24,   2
-       xxspltw         vs11,   vs24,   3
+       addi            BO,     BO,     64
 
 
-       addi            BO,     BO,     16
-
 
        xvmulsp         vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
        xvmulsp         vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
@@ -5007,18 +4778,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        lxvw4x          vs0,    o0,     AO              // load a0, a1
 
-
        addi            AO,     AO,     16
 
-       lxvw4x          vs24,   o0,     BO              //  load b0, b1
-
-       xxspltw         vs8,    vs24,   0
-       xxspltw         vs9,    vs24,   1
-       xxspltw         vs10,   vs24,   2
-       xxspltw         vs11,   vs24,   3
+       lxvw4x          vs8,    o0,     BO              //  load b0_r
+       lxvw4x          vs9,    o16,    BO              //  load b0_i
+       lxvw4x          vs10,   o32,    BO              //  load b1_r
+       lxvw4x          vs11,   o48,    BO              //  load b1_i
 
+       addi            BO,     BO,     64
 
-       addi            BO,     BO,     16
 
 
        xvmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
@@ -5033,6 +4801,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro SAVE2x2
 
        mr              T1,     CO
+       xxlxor          vs24,   vs24,   vs24
 
 // N=0
 
@@ -5089,7 +4858,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
        xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       xxlxor          vs24,   vs24,   vs24
        xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
        xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
        xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
@@ -5161,7 +4929,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
        xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       xxlxor          vs24,   vs24,   vs24
        xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
        xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
        xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
@@ -5193,17 +4960,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     8
 
-       mr              T1,     BO
+       lxsspx          vs8,    o0,     BO              //  load b0_r
+       lxsspx          vs9,    o16,    BO              //  load b0_i
+       lxsspx          vs10,   o32,    BO              //  load b1_r
+       lxsspx          vs11,   o48,    BO              //  load b1_i
 
-       lxsspx          vs8,    o0,     T1              //  load b0_r
-       lxsspx          vs9,    o4,     T1              //  load b0_i
+       addi            BO,     BO,     64
 
-       addi            T1,     T1,8
-
-       lxsspx          vs10,   o0,     T1              //  load b1_r
-       lxsspx          vs11,   o4,     T1              //  load b1_i
-
-       addi            BO,     BO,     16
 
 .endm
 
@@ -5215,17 +4978,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     8
 
-       mr              T1,     BO
-
-       lxsspx          vs16,   o0,     T1              //  load b0_r
-       lxsspx          vs17,   o4,     T1              //  load b0_i
+       lxsspx          vs16,   o0,     BO              //  load b0_r
+       lxsspx          vs17,   o16,    BO              //  load b0_i
+       lxsspx          vs18,   o32,    BO              //  load b1_r
+       lxsspx          vs19,   o48,    BO              //  load b1_i
 
-       addi            T1,     T1,8
+       addi            BO,     BO,     64
 
-       lxsspx          vs18,   o0,     T1              //  load b1_r
-       lxsspx          vs19,   o4,     T1              //  load b1_i
-
-       addi            BO,     BO,     16
 
 
        xsmuldp         vs32,   vs0,    vs8             // a0_r*b0_r
@@ -5249,17 +5008,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     8
 
-       mr              T1,     BO
-
-       lxsspx          vs16,   o0,     T1              //  load b0_r
-       lxsspx          vs17,   o4,     T1              //  load b0_i
-
-       addi            T1,     T1,8
+       lxsspx          vs16,   o0,     BO              //  load b0_r
+       lxsspx          vs17,   o16,    BO              //  load b0_i
+       lxsspx          vs18,   o32,    BO              //  load b1_r
+       lxsspx          vs19,   o48,    BO              //  load b1_i
 
-       lxsspx          vs18,   o0,     T1              //  load b1_r
-       lxsspx          vs19,   o4,     T1              //  load b1_i
+       addi            BO,     BO,     64
 
-       addi            BO,     BO,     16
 
 
        xsmaddadp       vs32,   vs0,    vs8             // a0_r*b0_r
@@ -5283,17 +5038,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     8
 
-       mr              T1,     BO
+       lxsspx          vs8,    o0,     BO              //  load b0_r
+       lxsspx          vs9,    o16,    BO              //  load b0_i
+       lxsspx          vs10,   o32,    BO              //  load b1_r
+       lxsspx          vs11,   o48,    BO              //  load b1_i
 
-       lxsspx          vs8,    o0,     T1              //  load b0_r
-       lxsspx          vs9,    o4,     T1              //  load b0_i
+       addi            BO,     BO,     64
 
-       addi            T1,     T1,8
-
-       lxsspx          vs10,   o0,     T1              //  load b1_r
-       lxsspx          vs11,   o4,     T1              //  load b1_i
-
-       addi            BO,     BO,     16
 
 
        xsmaddadp       vs32,   vs4,    vs16            // a4_r*b0_r
@@ -5333,17 +5084,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     8
 
-       mr              T1,     BO
-
-       lxsspx          vs8,    o0,     T1              //  load b0_r
-       lxsspx          vs9,    o4,     T1              //  load b0_i
+       lxsspx          vs8,    o0,     BO              //  load b0_r
+       lxsspx          vs9,    o16,    BO              //  load b0_i
+       lxsspx          vs10,   o32,    BO              //  load b1_r
+       lxsspx          vs11,   o48,    BO              //  load b1_i
 
-       addi            T1,     T1,8
+       addi            BO,     BO,     64
 
-       lxsspx          vs10,   o0,     T1              //  load b1_r
-       lxsspx          vs11,   o4,     T1              //  load b1_i
-
-       addi            BO,     BO,     16
 
 
        xsmuldp         vs32,   vs0,    vs8             // a0_r*b0_r
@@ -5367,17 +5114,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     8
 
-       mr              T1,     BO
-
-       lxsspx          vs8,    o0,     T1              //  load b0_r
-       lxsspx          vs9,    o4,     T1              //  load b0_i
-
-       addi            T1,     T1,8
+       lxsspx          vs8,    o0,     BO              //  load b0_r
+       lxsspx          vs9,    o16,    BO              //  load b0_i
+       lxsspx          vs10,   o32,    BO              //  load b1_r
+       lxsspx          vs11,   o48,    BO              //  load b1_i
 
-       lxsspx          vs10,   o0,     T1              //  load b1_r
-       lxsspx          vs11,   o4,     T1              //  load b1_i
+       addi            BO,     BO,     64
 
-       addi            BO,     BO,     16
 
 
        xsmaddadp       vs32,   vs0,    vs8             // a0_r*b0_r
@@ -5396,6 +5139,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro SAVE2x1
 
        mr              T1,     CO
+       xxlxor          vs24,   vs24,   vs24
 
 // N=0
 
@@ -5492,27 +5236,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro LOAD1x8_1
 
        lxvw4x          vs0,    o0,     AO              // load a0, a1
-
        lxvw4x          vs1,    o16,    AO              // load a2, a3
-
        lxvw4x          vs2,    o32,    AO              // load a4, a5
-
        lxvw4x          vs3,    o48,    AO              // load a6, a7
 
-
        addi            AO,     AO,     64
 
-       lxvw4x          vs24,   o0,     BO              //  load b0, b1
-
+       lxvw4x          vs8,    o0,     BO              //  load b0_r
+       lxvw4x          vs9,    o16,    BO              //  load b0_i
 
-
-       xxspltw         vs8,    vs24,   0
-       xxspltw         vs9,    vs24,   1
-       xxspltw         vs10,   vs24,   2
-       xxspltw         vs11,   vs24,   3
-
-
-       addi            BO,     BO,     8
+       addi            BO,     BO,     32
 
 .endm
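In the 1xN macros only b0_r and b0_i exist per k step, so the splatted panel shrinks to 32 bytes per k and BO now advances by 32 instead of the old 8 packed bytes. A quick sketch of the per-k B stride as a function of the column count, assuming the same 16-bytes-per-splatted-scalar layout (illustrative only):

    #include <stdio.h>

    /* Bytes consumed from the splatted B buffer per k step:
     * n complex values, 2 scalars each, 16 bytes per splatted scalar. */
    static int b_stride(int n) { return n * 2 * 16; }

    int main(void) {
        /* matches the addi BO,BO,... totals: 128 for N=4, 64 for N=2, 32 for N=1 */
        printf("N=4: %d  N=2: %d  N=1: %d\n", b_stride(4), b_stride(2), b_stride(1));
        return 0;
    }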
 
@@ -5520,27 +5253,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
        lxvw4x          vs4,    o0,     AO              // load a0, a1
-
        lxvw4x          vs5,    o16,    AO              // load a2, a3
-
        lxvw4x          vs6,    o32,    AO              // load a4, a5
-
        lxvw4x          vs7,    o48,    AO              // load a6, a7
 
-
        addi            AO,     AO,     64
 
-       lxvw4x          vs24,   o0,     BO              //  load b0, b1
-
-
-
-       xxspltw         vs16,   vs24,   0
-       xxspltw         vs17,   vs24,   1
-       xxspltw         vs18,   vs24,   2
-       xxspltw         vs19,   vs24,   3
-
+       lxvw4x          vs16,   o0,     BO              //  load b0_r
+       lxvw4x          vs17,   o16,    BO              //  load b0_i
 
-       addi            BO,     BO,     8
+       addi            BO,     BO,     32
 
 
        xvmulsp         vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
@@ -5559,27 +5281,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
        lxvw4x          vs4,    o0,     AO              // load a0, a1
-
        lxvw4x          vs5,    o16,    AO              // load a2, a3
-
        lxvw4x          vs6,    o32,    AO              // load a4, a5
-
        lxvw4x          vs7,    o48,    AO              // load a6, a7
 
-
        addi            AO,     AO,     64
 
-       lxvw4x          vs24,   o0,     BO              //  load b0, b1
-
+       lxvw4x          vs16,   o0,     BO              //  load b0_r
+       lxvw4x          vs17,   o16,    BO              //  load b0_i
 
-
-       xxspltw         vs16,   vs24,   0
-       xxspltw         vs17,   vs24,   1
-       xxspltw         vs18,   vs24,   2
-       xxspltw         vs19,   vs24,   3
-
-
-       addi            BO,     BO,     8
+       addi            BO,     BO,     32
 
 
        xvmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
@@ -5598,27 +5309,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
        lxvw4x          vs0,    o0,     AO              // load a0, a1
-
        lxvw4x          vs1,    o16,    AO              // load a2, a3
-
        lxvw4x          vs2,    o32,    AO              // load a4, a5
-
        lxvw4x          vs3,    o48,    AO              // load a6, a7
 
-
        addi            AO,     AO,     64
 
-       lxvw4x          vs24,   o0,     BO              //  load b0, b1
-
-
-
-       xxspltw         vs8,    vs24,   0
-       xxspltw         vs9,    vs24,   1
-       xxspltw         vs10,   vs24,   2
-       xxspltw         vs11,   vs24,   3
-
+       lxvw4x          vs8,    o0,     BO              //  load b0_r
+       lxvw4x          vs9,    o16,    BO              //  load b0_i
 
-       addi            BO,     BO,     8
+       addi            BO,     BO,     32
 
 
        xvmaddasp       vs32,   vs4,    vs16            // a4_r*b0_r, a4_i*b0_r, a5_r*b0_r, a5_i*b0_r
@@ -5652,27 +5352,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
        lxvw4x          vs0,    o0,     AO              // load a0, a1
-
        lxvw4x          vs1,    o16,    AO              // load a2, a3
-
        lxvw4x          vs2,    o32,    AO              // load a4, a5
-
        lxvw4x          vs3,    o48,    AO              // load a6, a7
 
-
        addi            AO,     AO,     64
 
-       lxvw4x          vs24,   o0,     BO              //  load b0, b1
-
+       lxvw4x          vs8,    o0,     BO              //  load b0_r
+       lxvw4x          vs9,    o16,    BO              //  load b0_i
 
-
-       xxspltw         vs8,    vs24,   0
-       xxspltw         vs9,    vs24,   1
-       xxspltw         vs10,   vs24,   2
-       xxspltw         vs11,   vs24,   3
-
-
-       addi            BO,     BO,     8
+       addi            BO,     BO,     32
 
 
        xvmulsp         vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
@@ -5691,27 +5380,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
        lxvw4x          vs0,    o0,     AO              // load a0, a1
-
        lxvw4x          vs1,    o16,    AO              // load a2, a3
-
        lxvw4x          vs2,    o32,    AO              // load a4, a5
-
        lxvw4x          vs3,    o48,    AO              // load a6, a7
 
-
        addi            AO,     AO,     64
 
-       lxvw4x          vs24,   o0,     BO              //  load b0, b1
-
-
-
-       xxspltw         vs8,    vs24,   0
-       xxspltw         vs9,    vs24,   1
-       xxspltw         vs10,   vs24,   2
-       xxspltw         vs11,   vs24,   3
-
+       lxvw4x          vs8,    o0,     BO              //  load b0_r
+       lxvw4x          vs9,    o16,    BO              //  load b0_i
 
-       addi            BO,     BO,     8
+       addi            BO,     BO,     32
 
 
        xvmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
@@ -5729,6 +5407,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro SAVE1x8
 
        mr              T1,     CO
+       xxlxor          vs24,   vs24,   vs24
 
 // N=0
 
@@ -5785,7 +5464,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
        xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       xxlxor          vs24,   vs24,   vs24
        xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
        xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
        xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
@@ -5851,7 +5529,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
        xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       xxlxor          vs24,   vs24,   vs24
        xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
        xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
        xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
@@ -5917,7 +5594,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
        xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       xxlxor          vs24,   vs24,   vs24
        xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
        xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
        xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
@@ -5983,7 +5659,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
        xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       xxlxor          vs24,   vs24,   vs24
        xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
        xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
        xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
@@ -6011,23 +5686,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro LOAD1x4_1
 
        lxvw4x          vs0,    o0,     AO              // load a0, a1
-
        lxvw4x          vs1,    o16,    AO              // load a2, a3
 
-
        addi            AO,     AO,     32
 
-       lxvw4x          vs24,   o0,     BO              //  load b0, b1
-
-
-
-       xxspltw         vs8,    vs24,   0
-       xxspltw         vs9,    vs24,   1
-       xxspltw         vs10,   vs24,   2
-       xxspltw         vs11,   vs24,   3
-
+       lxvw4x          vs8,    o0,     BO              //  load b0_r
+       lxvw4x          vs9,    o16,    BO              //  load b0_i
 
-       addi            BO,     BO,     8
+       addi            BO,     BO,     32
 
 .endm
 
@@ -6035,23 +5701,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
        lxvw4x          vs4,    o0,     AO              // load a0, a1
-
        lxvw4x          vs5,    o16,    AO              // load a2, a3
 
-
        addi            AO,     AO,     32
 
-       lxvw4x          vs24,   o0,     BO              //  load b0, b1
-
-
-
-       xxspltw         vs16,   vs24,   0
-       xxspltw         vs17,   vs24,   1
-       xxspltw         vs18,   vs24,   2
-       xxspltw         vs19,   vs24,   3
+       lxvw4x          vs16,   o0,     BO              //  load b0_r
+       lxvw4x          vs17,   o16,    BO              //  load b0_i
 
-
-       addi            BO,     BO,     8
+       addi            BO,     BO,     32
 
 
        xvmulsp         vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
@@ -6066,23 +5723,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
        lxvw4x          vs4,    o0,     AO              // load a0, a1
-
        lxvw4x          vs5,    o16,    AO              // load a2, a3
 
-
        addi            AO,     AO,     32
 
-       lxvw4x          vs24,   o0,     BO              //  load b0, b1
-
-
+       lxvw4x          vs16,   o0,     BO              //  load b0_r
+       lxvw4x          vs17,   o16,    BO              //  load b0_i
 
-       xxspltw         vs16,   vs24,   0
-       xxspltw         vs17,   vs24,   1
-       xxspltw         vs18,   vs24,   2
-       xxspltw         vs19,   vs24,   3
-
-
-       addi            BO,     BO,     8
+       addi            BO,     BO,     32
 
 
        xvmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
@@ -6097,23 +5745,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
        lxvw4x          vs0,    o0,     AO              // load a0, a1
-
        lxvw4x          vs1,    o16,    AO              // load a2, a3
 
-
        addi            AO,     AO,     32
 
-       lxvw4x          vs24,   o0,     BO              //  load b0, b1
-
+       lxvw4x          vs8,    o0,     BO              //  load b0_r
+       lxvw4x          vs9,    o16,    BO              //  load b0_i
 
-
-       xxspltw         vs8,    vs24,   0
-       xxspltw         vs9,    vs24,   1
-       xxspltw         vs10,   vs24,   2
-       xxspltw         vs11,   vs24,   3
-
-
-       addi            BO,     BO,     8
+       addi            BO,     BO,     32
 
 
        xvmaddasp       vs32,   vs4,    vs16            // a4_r*b0_r, a4_i*b0_r, a5_r*b0_r, a5_i*b0_r
@@ -6139,23 +5778,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
        lxvw4x          vs0,    o0,     AO              // load a0, a1
-
        lxvw4x          vs1,    o16,    AO              // load a2, a3
 
-
        addi            AO,     AO,     32
 
-       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+       lxvw4x          vs8,    o0,     BO              //  load b0_r
+       lxvw4x          vs9,    o16,    BO              //  load b0_i
 
-
-
-       xxspltw         vs8,    vs24,   0
-       xxspltw         vs9,    vs24,   1
-       xxspltw         vs10,   vs24,   2
-       xxspltw         vs11,   vs24,   3
-
-
-       addi            BO,     BO,     8
+       addi            BO,     BO,     32
 
 
        xvmulsp         vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
@@ -6170,23 +5800,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
        lxvw4x          vs0,    o0,     AO              // load a0, a1
-
        lxvw4x          vs1,    o16,    AO              // load a2, a3
 
-
        addi            AO,     AO,     32
 
-       lxvw4x          vs24,   o0,     BO              //  load b0, b1
-
-
+       lxvw4x          vs8,    o0,     BO              //  load b0_r
+       lxvw4x          vs9,    o16,    BO              //  load b0_i
 
-       xxspltw         vs8,    vs24,   0
-       xxspltw         vs9,    vs24,   1
-       xxspltw         vs10,   vs24,   2
-       xxspltw         vs11,   vs24,   3
-
-
-       addi            BO,     BO,     8
+       addi            BO,     BO,     32
 
 
        xvmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
@@ -6200,6 +5821,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro SAVE1x4
 
        mr              T1,     CO
+       xxlxor          vs24,   vs24,   vs24
 
 // N=0
 
@@ -6256,7 +5878,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
        xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       xxlxor          vs24,   vs24,   vs24
        xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
        xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
        xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
@@ -6322,7 +5943,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
        xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       xxlxor          vs24,   vs24,   vs24
        xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
        xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
        xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
@@ -6351,20 +5971,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        lxvw4x          vs0,    o0,     AO              // load a0, a1
 
-
        addi            AO,     AO,     16
 
-       lxvw4x          vs24,   o0,     BO              //  load b0, b1
-
-
+       lxvw4x          vs8,    o0,     BO              //  load b0_r
+       lxvw4x          vs9,    o16,    BO              //  load b0_i
 
-       xxspltw         vs8,    vs24,   0
-       xxspltw         vs9,    vs24,   1
-       xxspltw         vs10,   vs24,   2
-       xxspltw         vs11,   vs24,   3
-
-
-       addi            BO,     BO,     8
+       addi            BO,     BO,     32
 
 .endm
 
@@ -6373,20 +5985,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        lxvw4x          vs4,    o0,     AO              // load a0, a1
 
-
        addi            AO,     AO,     16
 
-       lxvw4x          vs24,   o0,     BO              //  load b0, b1
-
-
+       lxvw4x          vs16,   o0,     BO              //  load b0_r
+       lxvw4x          vs17,   o16,    BO              //  load b0_i
 
-       xxspltw         vs16,   vs24,   0
-       xxspltw         vs17,   vs24,   1
-       xxspltw         vs18,   vs24,   2
-       xxspltw         vs19,   vs24,   3
-
-
-       addi            BO,     BO,     8
+       addi            BO,     BO,     32
 
 
        xvmulsp         vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
@@ -6400,20 +6004,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        lxvw4x          vs4,    o0,     AO              // load a0, a1
 
-
        addi            AO,     AO,     16
 
-       lxvw4x          vs24,   o0,     BO              //  load b0, b1
-
-
+       lxvw4x          vs16,   o0,     BO              //  load b0_r
+       lxvw4x          vs17,   o16,    BO              //  load b0_i
 
-       xxspltw         vs16,   vs24,   0
-       xxspltw         vs17,   vs24,   1
-       xxspltw         vs18,   vs24,   2
-       xxspltw         vs19,   vs24,   3
-
-
-       addi            BO,     BO,     8
+       addi            BO,     BO,     32
 
 
        xvmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
@@ -6427,20 +6023,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        lxvw4x          vs0,    o0,     AO              // load a0, a1
 
-
        addi            AO,     AO,     16
 
-       lxvw4x          vs24,   o0,     BO              //  load b0, b1
-
-
+       lxvw4x          vs8,    o0,     BO              //  load b0_r
+       lxvw4x          vs9,    o16,    BO              //  load b0_i
 
-       xxspltw         vs8,    vs24,   0
-       xxspltw         vs9,    vs24,   1
-       xxspltw         vs10,   vs24,   2
-       xxspltw         vs11,   vs24,   3
-
-
-       addi            BO,     BO,     8
+       addi            BO,     BO,     32
 
 
        xvmaddasp       vs32,   vs4,    vs16            // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r
@@ -6463,20 +6051,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        lxvw4x          vs0,    o0,     AO              // load a0, a1
 
-
        addi            AO,     AO,     16
 
-       lxvw4x          vs24,   o0,     BO              //  load b0, b1
-
-
+       lxvw4x          vs8,    o0,     BO              //  load b0_r
+       lxvw4x          vs9,    o16,    BO              //  load b0_i
 
-       xxspltw         vs8,    vs24,   0
-       xxspltw         vs9,    vs24,   1
-       xxspltw         vs10,   vs24,   2
-       xxspltw         vs11,   vs24,   3
-
-
-       addi            BO,     BO,     8
+       addi            BO,     BO,     32
 
 
        xvmulsp         vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
@@ -6490,20 +6070,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        lxvw4x          vs0,    o0,     AO              // load a0, a1
 
-
        addi            AO,     AO,     16
 
-       lxvw4x          vs24,   o0,     BO              //  load b0, b1
-
-
+       lxvw4x          vs8,    o0,     BO              //  load b0_r
+       lxvw4x          vs9,    o16,    BO              //  load b0_i
 
-       xxspltw         vs8,    vs24,   0
-       xxspltw         vs9,    vs24,   1
-       xxspltw         vs10,   vs24,   2
-       xxspltw         vs11,   vs24,   3
-
-
-       addi            BO,     BO,     8
+       addi            BO,     BO,     32
 
 
        xvmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
@@ -6515,6 +6087,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro SAVE1x2
 
        mr              T1,     CO
+       xxlxor          vs24,   vs24,   vs24
 
 // N=0
 
@@ -6571,7 +6144,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
        xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       xxlxor          vs24,   vs24,   vs24
        xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
        xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
        xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
@@ -6603,12 +6175,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     8
 
-       mr              T1,     BO
-
-       lxsspx          vs8,    o0,     T1              //  load b0_r
-       lxsspx          vs9,    o4,     T1              //  load b0_i
+       lxsspx          vs8,    o0,     BO              //  load b0_r
+       lxsspx          vs9,    o16,    BO              //  load b0_i
 
-       addi            BO,     BO,     8
+       addi            BO,     BO,     32
 
 .endm
 
@@ -6620,12 +6190,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     8
 
-       mr              T1,     BO
-
-       lxsspx          vs16,   o0,     T1              //  load b0_r
-       lxsspx          vs17,   o4,     T1              //  load b0_i
+       lxsspx          vs16,   o0,     BO              //  load b0_r
+       lxsspx          vs17,   o16,    BO              //  load b0_i
 
-       addi            BO,     BO,     8
+       addi            BO,     BO,     32
 
 
        xsmuldp         vs32,   vs0,    vs8             // a0_r*b0_r
@@ -6644,12 +6212,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     8
 
-       mr              T1,     BO
+       lxsspx          vs16,   o0,     BO              //  load b0_r
+       lxsspx          vs17,   o16,    BO              //  load b0_i
 
-       lxsspx          vs16,   o0,     T1              //  load b0_r
-       lxsspx          vs17,   o4,     T1              //  load b0_i
-
-       addi            BO,     BO,     8
+       addi            BO,     BO,     32
 
 
        xsmaddadp       vs32,   vs0,    vs8             // a0_r*b0_r
@@ -6668,12 +6234,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     8
 
-       mr              T1,     BO
-
-       lxsspx          vs8,    o0,     T1              //  load b0_r
-       lxsspx          vs9,    o4,     T1              //  load b0_i
+       lxsspx          vs8,    o0,     BO              //  load b0_r
+       lxsspx          vs9,    o16,    BO              //  load b0_i
 
-       addi            BO,     BO,     8
+       addi            BO,     BO,     32
 
 
        xsmaddadp       vs32,   vs4,    vs16            // a4_r*b0_r
@@ -6703,12 +6267,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     8
 
-       mr              T1,     BO
-
-       lxsspx          vs8,    o0,     T1              //  load b0_r
-       lxsspx          vs9,    o4,     T1              //  load b0_i
+       lxsspx          vs8,    o0,     BO              //  load b0_r
+       lxsspx          vs9,    o16,    BO              //  load b0_i
 
-       addi            BO,     BO,     8
+       addi            BO,     BO,     32
 
 
        xsmuldp         vs32,   vs0,    vs8             // a0_r*b0_r
@@ -6727,12 +6289,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     8
 
-       mr              T1,     BO
+       lxsspx          vs8,    o0,     BO              //  load b0_r
+       lxsspx          vs9,    o16,    BO              //  load b0_i
 
-       lxsspx          vs8,    o0,     T1              //  load b0_r
-       lxsspx          vs9,    o4,     T1              //  load b0_i
-
-       addi            BO,     BO,     8
+       addi            BO,     BO,     32
 
 
        xsmaddadp       vs32,   vs0,    vs8             // a0_r*b0_r
@@ -6746,6 +6306,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro SAVE1x1
 
        mr              T1,     CO
+       xxlxor          vs24,   vs24,   vs24
 
 // N=0
 
index b202114..460a387 100644 (file)
@@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 /**************************************************************************************
-* 2016/04/03 Werner Saar (wernsaar@googlemail.com)
+* 2016/04/04 Werner Saar (wernsaar@googlemail.com)
 *       BLASTEST               : OK
 *       CTEST                  : OK
 *       TEST                   : OK
@@ -275,7 +275,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 #endif
 
-#include "cgemm_macros_8x4_power8.S"
+#include "ctrmm_macros_8x4_power8.S"
 
        cmpwi   cr0, M, 0
        ble     L999_H1
index 3e50646..9ab2585 100644 (file)
@@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 /**************************************************************************************
-* 2016/04/03 Werner Saar (wernsaar@googlemail.com)
+* 2016/04/04 Werner Saar (wernsaar@googlemail.com)
 *       BLASTEST               : OK
 *       CTEST                  : OK
 *       TEST                   : OK
@@ -83,15 +83,22 @@ CTRMM_L4x8_BEGIN:
 
 CTRMM_L4x8_LOOP_START:
 
+       dcbt            AO, PRE
+       dcbt            BO, PRE
        LOAD4x8_1
        KERNEL4x8_I1
+       dcbt            AO, PRE
        KERNEL4x8_2
        KERNEL4x8_1
+       dcbt            AO, PRE
        KERNEL4x8_2
 
        KERNEL4x8_1
+       dcbt            AO, PRE
        KERNEL4x8_2
        KERNEL4x8_1
+       dcbt            AO, PRE
+       dcbt            BO, PRE
        KERNEL4x8_2
 
        addic.          L,      L,      -2
@@ -102,13 +109,18 @@ CTRMM_L4x8_LOOP_START:
 CTRMM_L4x8_LOOP:
 
        KERNEL4x8_1
+       dcbt            AO, PRE
        KERNEL4x8_2
        KERNEL4x8_1
+       dcbt            AO, PRE
        KERNEL4x8_2
 
        KERNEL4x8_1
+       dcbt            AO, PRE
        KERNEL4x8_2
        KERNEL4x8_1
+       dcbt            AO, PRE
+       dcbt            BO, PRE
        KERNEL4x8_2
 
        addic.          L,      L,      -1
@@ -117,8 +129,10 @@ CTRMM_L4x8_LOOP:
 CTRMM_L4x8_LOOP_END:
 
        KERNEL4x8_1
+       dcbt            AO, PRE
        KERNEL4x8_2
        KERNEL4x8_1
+       dcbt            AO, PRE
        KERNEL4x8_2
 
        KERNEL4x8_1
diff --git a/kernel/power/ctrmm_macros_8x4_power8.S b/kernel/power/ctrmm_macros_8x4_power8.S
new file mode 100644 (file)
index 0000000..48a2125
--- /dev/null
@@ -0,0 +1,6794 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/04 Werner Saar (wernsaar@googlemail.com)
+*       BLASTEST               : OK
+*       CTEST                  : OK
+*       TEST                   : OK
+*       LAPACK-TEST            : OK
+**************************************************************************************/
+
+#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
+
+       #define XSFADD_R1   xsadddp
+       #define XSFADD_R2   xssubdp
+       #define XSFADD_I1   xsadddp
+       #define XSFADD_I2   xsadddp
+       #define XVFADD_R1   xvaddsp
+       #define XVFADD_R2   xvsubsp
+       #define XVFADD_I1   xvaddsp
+       #define XVFADD_I2   xvaddsp
+
+#elif  defined(CN) || defined(CT) || defined(RN) || defined(RT)
+
+       #define XSFADD_R1   xsadddp
+       #define XSFADD_R2   xsadddp
+       #define XSFADD_I1   xssubdp
+       #define XSFADD_I2   xsadddp
+       #define XVFADD_R1   xvaddsp
+       #define XVFADD_R2   xvaddsp
+       #define XVFADD_I1   xvsubsp
+       #define XVFADD_I2   xvaddsp
+
+#elif  defined(NC) || defined(TC) || defined(NR) || defined(TR)
+
+       #define XSFADD_R1   xsadddp
+       #define XSFADD_R2   xsadddp
+       #define XSFADD_I1   xsadddp
+       #define XSFADD_I2   xssubdp
+       #define XVFADD_R1   xvaddsp
+       #define XVFADD_R2   xvaddsp
+       #define XVFADD_I1   xvaddsp
+       #define XVFADD_I2   xvsubsp
+
+#else             // CC || CR || RC || RR
+
+       #define XSFADD_R1   xsadddp
+       #define XSFADD_R2   xssubdp
+       #define XSFADD_I1   xssubdp
+       #define XSFADD_I2   xssubdp
+       #define XVFADD_R1   xvaddsp
+       #define XVFADD_R2   xvsubsp
+       #define XVFADD_I1   xvsubsp
+       #define XVFADD_I2   xvsubsp
+
+#endif
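+
+// Complex product reminder: (a_r + i*a_i)*(b_r + i*b_i) has real part
+// a_r*b_r -/+ a_i*b_i and imaginary part a_r*b_i +/- a_i*b_r, with the
+// signs depending on whether A and/or B is conjugated.  The XSFADD_* and
+// XVFADD_* aliases above select those signs once per transpose/conjugation
+// variant, so the SAVE* macros below only have to be written once.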
+
+/**********************************************************************************************
+* Macros for N=4 and M=8
+**********************************************************************************************/
+
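+// Register usage in the 4x8 macros (as set up by the loads below): vs0-vs3
+// and vs4-vs7 hold two buffered copies of eight complex A elements a0..a7
+// (each stored as an r,i pair), vs8-vs15 and vs16-vs23 hold b0..b3 with the
+// real and imaginary parts splatted across the vector, and vs32-vs63
+// accumulate the partial products.
+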
+.macro LOAD4x8_1
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs1,    o16,    AO              // load a2, a3
+
+       lxvw4x          vs2,    o32,    AO              // load a4, a5
+
+       lxvw4x          vs3,    o48,    AO              // load a6, a7
+
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+       lxvw4x          vs25,   o16,    BO              //  load b2, b3
+
+       xxspltw         vs12,   vs25,   0
+       xxspltw         vs13,   vs25,   1
+       xxspltw         vs14,   vs25,   2
+       xxspltw         vs15,   vs25,   3
+
+
+       addi            BO,     BO,     32
+
+.endm
+
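+// The K loop is software pipelined: KERNEL4x8_I1 initializes the accumulators
+// with xvmulsp on the first A/B set while fetching the second set,
+// KERNEL4x8_1 and KERNEL4x8_2 then alternate between the two sets, and
+// KERNEL4x8_E2 drains the last prefetched set without any further loads.
+// KERNEL4x8_SUBI1/SUB1 process the leftover K iterations one at a time.
+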
+.macro KERNEL4x8_I1
+
+
+       lxvw4x          vs4,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs5,    o16,    AO              // load a2, a3
+
+       lxvw4x          vs6,    o32,    AO              // load a4, a5
+
+       lxvw4x          vs7,    o48,    AO              // load a6, a7
+
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs16,   vs24,   0
+       xxspltw         vs17,   vs24,   1
+       xxspltw         vs18,   vs24,   2
+       xxspltw         vs19,   vs24,   3
+
+       lxvw4x          vs25,   o16,    BO              //  load b2, b3
+
+       xxspltw         vs20,   vs25,   0
+       xxspltw         vs21,   vs25,   1
+       xxspltw         vs22,   vs25,   2
+       xxspltw         vs23,   vs25,   3
+
+
+       addi            BO,     BO,     32
+
+
+       xvmulsp         vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmulsp         vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmulsp         vs34,   vs1,    vs8             // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmulsp         vs35,   vs1,    vs9             // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+       xvmulsp         vs36,   vs2,    vs8             // a4_r*b0_r, a4_i*b0_r, a5_r*b0_r, a5_i*b0_r
+       xvmulsp         vs37,   vs2,    vs9             // a4_r*b0_i, a4_i*b0_i, a5_r*b0_i, a5_i*b0_i
+       xvmulsp         vs38,   vs3,    vs8             // a6_r*b0_r, a6_i*b0_r, a7_r*b0_r, a7_i*b0_r
+       xvmulsp         vs39,   vs3,    vs9             // a6_r*b0_i, a6_i*b0_i, a7_r*b0_i, a7_i*b0_i
+
+       xvmulsp         vs40,   vs0,    vs10            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmulsp         vs41,   vs0,    vs11            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+       xvmulsp         vs42,   vs1,    vs10            // a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+       xvmulsp         vs43,   vs1,    vs11            // a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+       xvmulsp         vs44,   vs2,    vs10            // a4_r*b1_r, a4_i*b1_r, a5_r*b1_r, a5_i*b1_r
+       xvmulsp         vs45,   vs2,    vs11            // a4_r*b1_i, a4_i*b1_i, a5_r*b1_i, a5_i*b1_i
+       xvmulsp         vs46,   vs3,    vs10            // a6_r*b1_r, a6_i*b1_r, a7_r*b1_r, a7_i*b1_r
+       xvmulsp         vs47,   vs3,    vs11            // a6_r*b1_i, a6_i*b1_i, a7_r*b1_i, a7_i*b1_i
+
+       xvmulsp         vs48,   vs0,    vs12            // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+       xvmulsp         vs49,   vs0,    vs13            // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+       xvmulsp         vs50,   vs1,    vs12            // a2_r*b2_r, a2_i*b2_r, a3_r*b2_r, a3_i*b2_r
+       xvmulsp         vs51,   vs1,    vs13            // a2_r*b2_i, a2_i*b2_i, a3_r*b2_i, a3_i*b2_i
+       xvmulsp         vs52,   vs2,    vs12            // a4_r*b2_r, a4_i*b2_r, a5_r*b2_r, a5_i*b2_r
+       xvmulsp         vs53,   vs2,    vs13            // a4_r*b2_i, a4_i*b2_i, a5_r*b2_i, a5_i*b2_i
+       xvmulsp         vs54,   vs3,    vs12            // a6_r*b2_r, a6_i*b2_r, a7_r*b2_r, a7_i*b2_r
+       xvmulsp         vs55,   vs3,    vs13            // a6_r*b2_i, a6_i*b2_i, a7_r*b2_i, a7_i*b2_i
+
+       xvmulsp         vs56,   vs0,    vs14            // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+       xvmulsp         vs57,   vs0,    vs15            // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+       xvmulsp         vs58,   vs1,    vs14            // a2_r*b3_r, a2_i*b3_r, a3_r*b3_r, a3_i*b3_r
+       xvmulsp         vs59,   vs1,    vs15            // a2_r*b3_i, a2_i*b3_i, a3_r*b3_i, a3_i*b3_i
+       xvmulsp         vs60,   vs2,    vs14            // a4_r*b3_r, a4_i*b3_r, a5_r*b3_r, a5_i*b3_r
+       xvmulsp         vs61,   vs2,    vs15            // a4_r*b3_i, a4_i*b3_i, a5_r*b3_i, a5_i*b3_i
+       xvmulsp         vs62,   vs3,    vs14            // a6_r*b3_r, a6_i*b3_r, a7_r*b3_r, a7_i*b3_r
+       xvmulsp         vs63,   vs3,    vs15            // a6_r*b3_i, a6_i*b3_i, a7_r*b3_i, a7_i*b3_i
+
+
+.endm
+
+.macro KERNEL4x8_1
+
+
+       lxvw4x          vs4,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs5,    o16,    AO              // load a2, a3
+
+       lxvw4x          vs6,    o32,    AO              // load a4, a5
+
+       lxvw4x          vs7,    o48,    AO              // load a6, a7
+
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs16,   vs24,   0
+       xxspltw         vs17,   vs24,   1
+       xxspltw         vs18,   vs24,   2
+       xxspltw         vs19,   vs24,   3
+
+       lxvw4x          vs25,   o16,    BO              //  load b2, b3
+
+       xxspltw         vs20,   vs25,   0
+       xxspltw         vs21,   vs25,   1
+       xxspltw         vs22,   vs25,   2
+       xxspltw         vs23,   vs25,   3
+
+
+       addi            BO,     BO,     32
+
+
+       xvmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmaddasp       vs34,   vs1,    vs8             // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmaddasp       vs35,   vs1,    vs9             // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+       xvmaddasp       vs36,   vs2,    vs8             // a4_r*b0_r, a4_i*b0_r, a5_r*b0_r, a5_i*b0_r
+       xvmaddasp       vs37,   vs2,    vs9             // a4_r*b0_i, a4_i*b0_i, a5_r*b0_i, a5_i*b0_i
+       xvmaddasp       vs38,   vs3,    vs8             // a6_r*b0_r, a6_i*b0_r, a7_r*b0_r, a7_i*b0_r
+       xvmaddasp       vs39,   vs3,    vs9             // a6_r*b0_i, a6_i*b0_i, a7_r*b0_i, a7_i*b0_i
+
+       xvmaddasp       vs40,   vs0,    vs10            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmaddasp       vs41,   vs0,    vs11            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+       xvmaddasp       vs42,   vs1,    vs10            // a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+       xvmaddasp       vs43,   vs1,    vs11            // a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+       xvmaddasp       vs44,   vs2,    vs10            // a4_r*b1_r, a4_i*b1_r, a5_r*b1_r, a5_i*b1_r
+       xvmaddasp       vs45,   vs2,    vs11            // a4_r*b1_i, a4_i*b1_i, a5_r*b1_i, a5_i*b1_i
+       xvmaddasp       vs46,   vs3,    vs10            // a6_r*b1_r, a6_i*b1_r, a7_r*b1_r, a7_i*b1_r
+       xvmaddasp       vs47,   vs3,    vs11            // a6_r*b1_i, a6_i*b1_i, a7_r*b1_i, a7_i*b1_i
+
+       xvmaddasp       vs48,   vs0,    vs12            // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+       xvmaddasp       vs49,   vs0,    vs13            // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+       xvmaddasp       vs50,   vs1,    vs12            // a2_r*b2_r, a2_i*b2_r, a3_r*b2_r, a3_i*b2_r
+       xvmaddasp       vs51,   vs1,    vs13            // a2_r*b2_i, a2_i*b2_i, a3_r*b2_i, a3_i*b2_i
+       xvmaddasp       vs52,   vs2,    vs12            // a4_r*b2_r, a4_i*b2_r, a5_r*b2_r, a5_i*b2_r
+       xvmaddasp       vs53,   vs2,    vs13            // a4_r*b2_i, a4_i*b2_i, a5_r*b2_i, a5_i*b2_i
+       xvmaddasp       vs54,   vs3,    vs12            // a6_r*b2_r, a6_i*b2_r, a7_r*b2_r, a7_i*b2_r
+       xvmaddasp       vs55,   vs3,    vs13            // a6_r*b2_i, a6_i*b2_i, a7_r*b2_i, a7_i*b2_i
+
+       xvmaddasp       vs56,   vs0,    vs14            // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+       xvmaddasp       vs57,   vs0,    vs15            // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+       xvmaddasp       vs58,   vs1,    vs14            // a2_r*b3_r, a2_i*b3_r, a3_r*b3_r, a3_i*b3_r
+       xvmaddasp       vs59,   vs1,    vs15            // a2_r*b3_i, a2_i*b3_i, a3_r*b3_i, a3_i*b3_i
+       xvmaddasp       vs60,   vs2,    vs14            // a4_r*b3_r, a4_i*b3_r, a5_r*b3_r, a5_i*b3_r
+       xvmaddasp       vs61,   vs2,    vs15            // a4_r*b3_i, a4_i*b3_i, a5_r*b3_i, a5_i*b3_i
+       xvmaddasp       vs62,   vs3,    vs14            // a6_r*b3_r, a6_i*b3_r, a7_r*b3_r, a7_i*b3_r
+       xvmaddasp       vs63,   vs3,    vs15            // a6_r*b3_i, a6_i*b3_i, a7_r*b3_i, a7_i*b3_i
+
+
+.endm
+
+.macro KERNEL4x8_2
+
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs1,    o16,    AO              // load a2, a3
+
+       lxvw4x          vs2,    o32,    AO              // load a4, a5
+
+       lxvw4x          vs3,    o48,    AO              // load a6, a7
+
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+       lxvw4x          vs25,   o16,    BO              //  load b2, b3
+
+       xxspltw         vs12,   vs25,   0
+       xxspltw         vs13,   vs25,   1
+       xxspltw         vs14,   vs25,   2
+       xxspltw         vs15,   vs25,   3
+
+
+       addi            BO,     BO,     32
+
+
+       xvmaddasp       vs32,   vs4,    vs16            // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs4,    vs17            // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmaddasp       vs34,   vs5,    vs16            // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmaddasp       vs35,   vs5,    vs17            // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+       xvmaddasp       vs36,   vs6,    vs16            // a4_r*b0_r, a4_i*b0_r, a5_r*b0_r, a5_i*b0_r
+       xvmaddasp       vs37,   vs6,    vs17            // a4_r*b0_i, a4_i*b0_i, a5_r*b0_i, a5_i*b0_i
+       xvmaddasp       vs38,   vs7,    vs16            // a6_r*b0_r, a6_i*b0_r, a7_r*b0_r, a7_i*b0_r
+       xvmaddasp       vs39,   vs7,    vs17            // a6_r*b0_i, a6_i*b0_i, a7_r*b0_i, a7_i*b0_i
+
+       xvmaddasp       vs40,   vs4,    vs18            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmaddasp       vs41,   vs4,    vs19            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+       xvmaddasp       vs42,   vs5,    vs18            // a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+       xvmaddasp       vs43,   vs5,    vs19            // a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+       xvmaddasp       vs44,   vs6,    vs18            // a4_r*b1_r, a4_i*b1_r, a5_r*b1_r, a5_i*b1_r
+       xvmaddasp       vs45,   vs6,    vs19            // a4_r*b1_i, a4_i*b1_i, a5_r*b1_i, a5_i*b1_i
+       xvmaddasp       vs46,   vs7,    vs18            // a6_r*b1_r, a6_i*b1_r, a7_r*b1_r, a7_i*b1_r
+       xvmaddasp       vs47,   vs7,    vs19            // a6_r*b1_i, a6_i*b1_i, a7_r*b1_i, a7_i*b1_i
+
+       xvmaddasp       vs48,   vs4,    vs20            // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+       xvmaddasp       vs49,   vs4,    vs21            // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+       xvmaddasp       vs50,   vs5,    vs20            // a2_r*b2_r, a2_i*b2_r, a3_r*b2_r, a3_i*b2_r
+       xvmaddasp       vs51,   vs5,    vs21            // a2_r*b2_i, a2_i*b2_i, a3_r*b2_i, a3_i*b2_i
+       xvmaddasp       vs52,   vs6,    vs20            // a4_r*b2_r, a4_i*b2_r, a5_r*b2_r, a5_i*b2_r
+       xvmaddasp       vs53,   vs6,    vs21            // a4_r*b2_i, a4_i*b2_i, a5_r*b2_i, a5_i*b2_i
+       xvmaddasp       vs54,   vs7,    vs20            // a6_r*b2_r, a6_i*b2_r, a7_r*b2_r, a7_i*b2_r
+       xvmaddasp       vs55,   vs7,    vs21            // a6_r*b2_i, a6_i*b2_i, a7_r*b2_i, a7_i*b2_i
+
+       xvmaddasp       vs56,   vs4,    vs22            // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+       xvmaddasp       vs57,   vs4,    vs23            // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+       xvmaddasp       vs58,   vs5,    vs22            // a2_r*b3_r, a2_i*b3_r, a3_r*b3_r, a3_i*b3_r
+       xvmaddasp       vs59,   vs5,    vs23            // a2_r*b3_i, a2_i*b3_i, a3_r*b3_i, a3_i*b3_i
+       xvmaddasp       vs60,   vs6,    vs22            // a4_r*b3_r, a4_i*b3_r, a5_r*b3_r, a5_i*b3_r
+       xvmaddasp       vs61,   vs6,    vs23            // a4_r*b3_i, a4_i*b3_i, a5_r*b3_i, a5_i*b3_i
+       xvmaddasp       vs62,   vs7,    vs22            // a6_r*b3_r, a6_i*b3_r, a7_r*b3_r, a7_i*b3_r
+       xvmaddasp       vs63,   vs7,    vs23            // a6_r*b3_i, a6_i*b3_i, a7_r*b3_i, a7_i*b3_i
+
+
+.endm
+
+.macro KERNEL4x8_E2
+
+
+       xvmaddasp       vs32,   vs4,    vs16            // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs4,    vs17            // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmaddasp       vs34,   vs5,    vs16            // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmaddasp       vs35,   vs5,    vs17            // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+       xvmaddasp       vs36,   vs6,    vs16            // a4_r*b0_r, a4_i*b0_r, a5_r*b0_r, a5_i*b0_r
+       xvmaddasp       vs37,   vs6,    vs17            // a4_r*b0_i, a4_i*b0_i, a5_r*b0_i, a5_i*b0_i
+       xvmaddasp       vs38,   vs7,    vs16            // a6_r*b0_r, a6_i*b0_r, a7_r*b0_r, a7_i*b0_r
+       xvmaddasp       vs39,   vs7,    vs17            // a6_r*b0_i, a6_i*b0_i, a7_r*b0_i, a7_i*b0_i
+
+       xvmaddasp       vs40,   vs4,    vs18            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmaddasp       vs41,   vs4,    vs19            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+       xvmaddasp       vs42,   vs5,    vs18            // a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+       xvmaddasp       vs43,   vs5,    vs19            // a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+       xvmaddasp       vs44,   vs6,    vs18            // a4_r*b1_r, a4_i*b1_r, a5_r*b1_r, a5_i*b1_r
+       xvmaddasp       vs45,   vs6,    vs19            // a4_r*b1_i, a4_i*b1_i, a5_r*b1_i, a5_i*b1_i
+       xvmaddasp       vs46,   vs7,    vs18            // a6_r*b1_r, a6_i*b1_r, a7_r*b1_r, a7_i*b1_r
+       xvmaddasp       vs47,   vs7,    vs19            // a6_r*b1_i, a6_i*b1_i, a7_r*b1_i, a7_i*b1_i
+
+       xvmaddasp       vs48,   vs4,    vs20            // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+       xvmaddasp       vs49,   vs4,    vs21            // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+       xvmaddasp       vs50,   vs5,    vs20            // a2_r*b2_r, a2_i*b2_r, a3_r*b2_r, a3_i*b2_r
+       xvmaddasp       vs51,   vs5,    vs21            // a2_r*b2_i, a2_i*b2_i, a3_r*b2_i, a3_i*b2_i
+       xvmaddasp       vs52,   vs6,    vs20            // a4_r*b2_r, a4_i*b2_r, a5_r*b2_r, a5_i*b2_r
+       xvmaddasp       vs53,   vs6,    vs21            // a4_r*b2_i, a4_i*b2_i, a5_r*b2_i, a5_i*b2_i
+       xvmaddasp       vs54,   vs7,    vs20            // a6_r*b2_r, a6_i*b2_r, a7_r*b2_r, a7_i*b2_r
+       xvmaddasp       vs55,   vs7,    vs21            // a6_r*b2_i, a6_i*b2_i, a7_r*b2_i, a7_i*b2_i
+
+       xvmaddasp       vs56,   vs4,    vs22            // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+       xvmaddasp       vs57,   vs4,    vs23            // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+       xvmaddasp       vs58,   vs5,    vs22            // a2_r*b3_r, a2_i*b3_r, a3_r*b3_r, a3_i*b3_r
+       xvmaddasp       vs59,   vs5,    vs23            // a2_r*b3_i, a2_i*b3_i, a3_r*b3_i, a3_i*b3_i
+       xvmaddasp       vs60,   vs6,    vs22            // a4_r*b3_r, a4_i*b3_r, a5_r*b3_r, a5_i*b3_r
+       xvmaddasp       vs61,   vs6,    vs23            // a4_r*b3_i, a4_i*b3_i, a5_r*b3_i, a5_i*b3_i
+       xvmaddasp       vs62,   vs7,    vs22            // a6_r*b3_r, a6_i*b3_r, a7_r*b3_r, a7_i*b3_r
+       xvmaddasp       vs63,   vs7,    vs23            // a6_r*b3_i, a6_i*b3_i, a7_r*b3_i, a7_i*b3_i
+
+
+.endm
+
+.macro KERNEL4x8_SUBI1
+
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs1,    o16,    AO              // load a2, a3
+
+       lxvw4x          vs2,    o32,    AO              // load a4, a5
+
+       lxvw4x          vs3,    o48,    AO              // load a6, a7
+
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+       lxvw4x          vs25,   o16,    BO              //  load b2, b3
+
+       xxspltw         vs12,   vs25,   0
+       xxspltw         vs13,   vs25,   1
+       xxspltw         vs14,   vs25,   2
+       xxspltw         vs15,   vs25,   3
+
+
+       addi            BO,     BO,     32
+
+
+       xvmulsp         vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmulsp         vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmulsp         vs34,   vs1,    vs8             // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmulsp         vs35,   vs1,    vs9             // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+       xvmulsp         vs36,   vs2,    vs8             // a4_r*b0_r, a4_i*b0_r, a5_r*b0_r, a5_i*b0_r
+       xvmulsp         vs37,   vs2,    vs9             // a4_r*b0_i, a4_i*b0_i, a5_r*b0_i, a5_i*b0_i
+       xvmulsp         vs38,   vs3,    vs8             // a6_r*b0_r, a6_i*b0_r, a7_r*b0_r, a7_i*b0_r
+       xvmulsp         vs39,   vs3,    vs9             // a6_r*b0_i, a6_i*b0_i, a7_r*b0_i, a7_i*b0_i
+
+       xvmulsp         vs40,   vs0,    vs10            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmulsp         vs41,   vs0,    vs11            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+       xvmulsp         vs42,   vs1,    vs10            // a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+       xvmulsp         vs43,   vs1,    vs11            // a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+       xvmulsp         vs44,   vs2,    vs10            // a4_r*b1_r, a4_i*b1_r, a5_r*b1_r, a5_i*b1_r
+       xvmulsp         vs45,   vs2,    vs11            // a4_r*b1_i, a4_i*b1_i, a5_r*b1_i, a5_i*b1_i
+       xvmulsp         vs46,   vs3,    vs10            // a6_r*b1_r, a6_i*b1_r, a7_r*b1_r, a7_i*b1_r
+       xvmulsp         vs47,   vs3,    vs11            // a6_r*b1_i, a6_i*b1_i, a7_r*b1_i, a7_i*b1_i
+
+       xvmulsp         vs48,   vs0,    vs12            // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+       xvmulsp         vs49,   vs0,    vs13            // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+       xvmulsp         vs50,   vs1,    vs12            // a2_r*b2_r, a2_i*b2_r, a3_r*b2_r, a3_i*b2_r
+       xvmulsp         vs51,   vs1,    vs13            // a2_r*b2_i, a2_i*b2_i, a3_r*b2_i, a3_i*b2_i
+       xvmulsp         vs52,   vs2,    vs12            // a4_r*b2_r, a4_i*b2_r, a5_r*b2_r, a5_i*b2_r
+       xvmulsp         vs53,   vs2,    vs13            // a4_r*b2_i, a4_i*b2_i, a5_r*b2_i, a5_i*b2_i
+       xvmulsp         vs54,   vs3,    vs12            // a6_r*b2_r, a6_i*b2_r, a7_r*b2_r, a7_i*b2_r
+       xvmulsp         vs55,   vs3,    vs13            // a6_r*b2_i, a6_i*b2_i, a7_r*b2_i, a7_i*b2_i
+
+       xvmulsp         vs56,   vs0,    vs14            // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+       xvmulsp         vs57,   vs0,    vs15            // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+       xvmulsp         vs58,   vs1,    vs14            // a2_r*b3_r, a2_i*b3_r, a3_r*b3_r, a3_i*b3_r
+       xvmulsp         vs59,   vs1,    vs15            // a2_r*b3_i, a2_i*b3_i, a3_r*b3_i, a3_i*b3_i
+       xvmulsp         vs60,   vs2,    vs14            // a4_r*b3_r, a4_i*b3_r, a5_r*b3_r, a5_i*b3_r
+       xvmulsp         vs61,   vs2,    vs15            // a4_r*b3_i, a4_i*b3_i, a5_r*b3_i, a5_i*b3_i
+       xvmulsp         vs62,   vs3,    vs14            // a6_r*b3_r, a6_i*b3_r, a7_r*b3_r, a7_i*b3_r
+       xvmulsp         vs63,   vs3,    vs15            // a6_r*b3_i, a6_i*b3_i, a7_r*b3_i, a7_i*b3_i
+
+
+.endm
+
+.macro KERNEL4x8_SUB1
+
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs1,    o16,    AO              // load a2, a3
+
+       lxvw4x          vs2,    o32,    AO              // load a4, a5
+
+       lxvw4x          vs3,    o48,    AO              // load a6, a7
+
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+       lxvw4x          vs25,   o16,    BO              //  load b2, b3
+
+       xxspltw         vs12,   vs25,   0
+       xxspltw         vs13,   vs25,   1
+       xxspltw         vs14,   vs25,   2
+       xxspltw         vs15,   vs25,   3
+
+
+       addi            BO,     BO,     32
+
+
+       xvmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmaddasp       vs34,   vs1,    vs8             // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmaddasp       vs35,   vs1,    vs9             // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+       xvmaddasp       vs36,   vs2,    vs8             // a4_r*b0_r, a4_i*b0_r, a5_r*b0_r, a5_i*b0_r
+       xvmaddasp       vs37,   vs2,    vs9             // a4_r*b0_i, a4_i*b0_i, a5_r*b0_i, a5_i*b0_i
+       xvmaddasp       vs38,   vs3,    vs8             // a6_r*b0_r, a6_i*b0_r, a7_r*b0_r, a7_i*b0_r
+       xvmaddasp       vs39,   vs3,    vs9             // a6_r*b0_i, a6_i*b0_i, a7_r*b0_i, a7_i*b0_i
+
+       xvmaddasp       vs40,   vs0,    vs10            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmaddasp       vs41,   vs0,    vs11            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+       xvmaddasp       vs42,   vs1,    vs10            // a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+       xvmaddasp       vs43,   vs1,    vs11            // a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+       xvmaddasp       vs44,   vs2,    vs10            // a4_r*b1_r, a4_i*b1_r, a5_r*b1_r, a5_i*b1_r
+       xvmaddasp       vs45,   vs2,    vs11            // a4_r*b1_i, a4_i*b1_i, a5_r*b1_i, a5_i*b1_i
+       xvmaddasp       vs46,   vs3,    vs10            // a6_r*b1_r, a6_i*b1_r, a7_r*b1_r, a7_i*b1_r
+       xvmaddasp       vs47,   vs3,    vs11            // a6_r*b1_i, a6_i*b1_i, a7_r*b1_i, a7_i*b1_i
+
+       xvmaddasp       vs48,   vs0,    vs12            // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+       xvmaddasp       vs49,   vs0,    vs13            // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+       xvmaddasp       vs50,   vs1,    vs12            // a2_r*b2_r, a2_i*b2_r, a3_r*b2_r, a3_i*b2_r
+       xvmaddasp       vs51,   vs1,    vs13            // a2_r*b2_i, a2_i*b2_i, a3_r*b2_i, a3_i*b2_i
+       xvmaddasp       vs52,   vs2,    vs12            // a4_r*b2_r, a4_i*b2_r, a5_r*b2_r, a5_i*b2_r
+       xvmaddasp       vs53,   vs2,    vs13            // a4_r*b2_i, a4_i*b2_i, a5_r*b2_i, a5_i*b2_i
+       xvmaddasp       vs54,   vs3,    vs12            // a6_r*b2_r, a6_i*b2_r, a7_r*b2_r, a7_i*b2_r
+       xvmaddasp       vs55,   vs3,    vs13            // a6_r*b2_i, a6_i*b2_i, a7_r*b2_i, a7_i*b2_i
+
+       xvmaddasp       vs56,   vs0,    vs14            // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+       xvmaddasp       vs57,   vs0,    vs15            // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+       xvmaddasp       vs58,   vs1,    vs14            // a2_r*b3_r, a2_i*b3_r, a3_r*b3_r, a3_i*b3_r
+       xvmaddasp       vs59,   vs1,    vs15            // a2_r*b3_i, a2_i*b3_i, a3_r*b3_i, a3_i*b3_i
+       xvmaddasp       vs60,   vs2,    vs14            // a4_r*b3_r, a4_i*b3_r, a5_r*b3_r, a5_i*b3_r
+       xvmaddasp       vs61,   vs2,    vs15            // a4_r*b3_i, a4_i*b3_i, a5_r*b3_i, a5_i*b3_i
+       xvmaddasp       vs62,   vs3,    vs14            // a6_r*b3_r, a6_i*b3_r, a7_r*b3_r, a7_i*b3_r
+       xvmaddasp       vs63,   vs3,    vs15            // a6_r*b3_i, a6_i*b3_i, a7_r*b3_i, a7_i*b3_i
+
+
+.endm
+
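+// SAVE4x8: for each of the four columns, split every accumulator into its
+// per-lane partial products, combine them into real/imaginary sums via the
+// XVFADD_* aliases, scale by alpha (alpha_sr/alpha_si), add the result to C
+// (for the plain GEMM path; with TRMMKERNEL defined C is overwritten) and
+// store it back.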
+.macro SAVE4x8
+
+       mr              T1,     CO
+
+// N=0
+
+       mr              T2,     T1
+
+// N=0 M=0
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+
+       xxspltw         vs8,    vs32,   0
+       xxspltw         vs9,    vs32,   1
+       xxspltw         vs10,   vs32,   2
+       xxspltw         vs11,   vs32,   3
+
+
+       xxspltw         vs12,   vs33,   0
+       xxspltw         vs13,   vs33,   1
+       xxspltw         vs14,   vs33,   2
+       xxspltw         vs15,   vs33,   3
+
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
+
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
+
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+
+// N=0 M=2
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+
+       xxspltw         vs8,    vs34,   0
+       xxspltw         vs9,    vs34,   1
+       xxspltw         vs10,   vs34,   2
+       xxspltw         vs11,   vs34,   3
+
+
+       xxspltw         vs12,   vs35,   0
+       xxspltw         vs13,   vs35,   1
+       xxspltw         vs14,   vs35,   2
+       xxspltw         vs15,   vs35,   3
+
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
+
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
+
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+
+// N=0 M=4
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+
+       xxspltw         vs8,    vs36,   0
+       xxspltw         vs9,    vs36,   1
+       xxspltw         vs10,   vs36,   2
+       xxspltw         vs11,   vs36,   3
+
+
+       xxspltw         vs12,   vs37,   0
+       xxspltw         vs13,   vs37,   1
+       xxspltw         vs14,   vs37,   2
+       xxspltw         vs15,   vs37,   3
+
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
+
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
+
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+
+// N=0 M=6
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+
+       xxspltw         vs8,    vs38,   0
+       xxspltw         vs9,    vs38,   1
+       xxspltw         vs10,   vs38,   2
+       xxspltw         vs11,   vs38,   3
+
+
+       xxspltw         vs12,   vs39,   0
+       xxspltw         vs13,   vs39,   1
+       xxspltw         vs14,   vs39,   2
+       xxspltw         vs15,   vs39,   3
+
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
+
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
+
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+       add             T1,     T1,     LDC
+
+
+// N=1
+
+       mr              T2,     T1
+
+// N=1 M=0
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+
+       xxspltw         vs8,    vs40,   0
+       xxspltw         vs9,    vs40,   1
+       xxspltw         vs10,   vs40,   2
+       xxspltw         vs11,   vs40,   3
+
+
+       xxspltw         vs12,   vs41,   0
+       xxspltw         vs13,   vs41,   1
+       xxspltw         vs14,   vs41,   2
+       xxspltw         vs15,   vs41,   3
+
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
+
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
+
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+
+// N=1 M=2
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+
+       xxspltw         vs8,    vs42,   0
+       xxspltw         vs9,    vs42,   1
+       xxspltw         vs10,   vs42,   2
+       xxspltw         vs11,   vs42,   3
+
+
+       xxspltw         vs12,   vs43,   0
+       xxspltw         vs13,   vs43,   1
+       xxspltw         vs14,   vs43,   2
+       xxspltw         vs15,   vs43,   3
+
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
+
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
+
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+
+// N=1 M=4
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+
+       xxspltw         vs8,    vs44,   0
+       xxspltw         vs9,    vs44,   1
+       xxspltw         vs10,   vs44,   2
+       xxspltw         vs11,   vs44,   3
+
+
+       xxspltw         vs12,   vs45,   0
+       xxspltw         vs13,   vs45,   1
+       xxspltw         vs14,   vs45,   2
+       xxspltw         vs15,   vs45,   3
+
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
+
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
+
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+
+// N=1 M=6
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+
+       xxspltw         vs8,    vs46,   0
+       xxspltw         vs9,    vs46,   1
+       xxspltw         vs10,   vs46,   2
+       xxspltw         vs11,   vs46,   3
+
+
+       xxspltw         vs12,   vs47,   0
+       xxspltw         vs13,   vs47,   1
+       xxspltw         vs14,   vs47,   2
+       xxspltw         vs15,   vs47,   3
+
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
+
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
+
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+       add             T1,     T1,     LDC
+
+
+// N=2
+
+       mr              T2,     T1
+
+// N=2 M=0
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+
+       xxspltw         vs8,    vs48,   0
+       xxspltw         vs9,    vs48,   1
+       xxspltw         vs10,   vs48,   2
+       xxspltw         vs11,   vs48,   3
+
+
+       xxspltw         vs12,   vs49,   0
+       xxspltw         vs13,   vs49,   1
+       xxspltw         vs14,   vs49,   2
+       xxspltw         vs15,   vs49,   3
+
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
+
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
+
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+
+// N=2 M=2
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+
+       xxspltw         vs8,    vs50,   0
+       xxspltw         vs9,    vs50,   1
+       xxspltw         vs10,   vs50,   2
+       xxspltw         vs11,   vs50,   3
+
+
+       xxspltw         vs12,   vs51,   0
+       xxspltw         vs13,   vs51,   1
+       xxspltw         vs14,   vs51,   2
+       xxspltw         vs15,   vs51,   3
+
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
+
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
+
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+
+// N=2 M=4
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+
+       xxspltw         vs8,    vs52,   0
+       xxspltw         vs9,    vs52,   1
+       xxspltw         vs10,   vs52,   2
+       xxspltw         vs11,   vs52,   3
+
+
+       xxspltw         vs12,   vs53,   0
+       xxspltw         vs13,   vs53,   1
+       xxspltw         vs14,   vs53,   2
+       xxspltw         vs15,   vs53,   3
+
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
+
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
+
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+
+// N=2 M=6
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+
+       xxspltw         vs8,    vs54,   0
+       xxspltw         vs9,    vs54,   1
+       xxspltw         vs10,   vs54,   2
+       xxspltw         vs11,   vs54,   3
+
+
+       xxspltw         vs12,   vs55,   0
+       xxspltw         vs13,   vs55,   1
+       xxspltw         vs14,   vs55,   2
+       xxspltw         vs15,   vs55,   3
+
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
+
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
+
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+       add             T1,     T1,     LDC
+
+
+// N=3
+
+       mr              T2,     T1
+
+// N=3 M=0
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+
+       xxspltw         vs8,    vs56,   0
+       xxspltw         vs9,    vs56,   1
+       xxspltw         vs10,   vs56,   2
+       xxspltw         vs11,   vs56,   3
+
+
+       xxspltw         vs12,   vs57,   0
+       xxspltw         vs13,   vs57,   1
+       xxspltw         vs14,   vs57,   2
+       xxspltw         vs15,   vs57,   3
+
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
+
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
+
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+
+// N=3 M=2
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+
+       xxspltw         vs8,    vs58,   0
+       xxspltw         vs9,    vs58,   1
+       xxspltw         vs10,   vs58,   2
+       xxspltw         vs11,   vs58,   3
+
+
+       xxspltw         vs12,   vs59,   0
+       xxspltw         vs13,   vs59,   1
+       xxspltw         vs14,   vs59,   2
+       xxspltw         vs15,   vs59,   3
+
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
+
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
+
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+
+// N=3 M=4
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+
+       xxspltw         vs8,    vs60,   0
+       xxspltw         vs9,    vs60,   1
+       xxspltw         vs10,   vs60,   2
+       xxspltw         vs11,   vs60,   3
+
+
+       xxspltw         vs12,   vs61,   0
+       xxspltw         vs13,   vs61,   1
+       xxspltw         vs14,   vs61,   2
+       xxspltw         vs15,   vs61,   3
+
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
+
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
+
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+
+// N=3 M=6
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+
+       xxspltw         vs8,    vs62,   0
+       xxspltw         vs9,    vs62,   1
+       xxspltw         vs10,   vs62,   2
+       xxspltw         vs11,   vs62,   3
+
+
+       xxspltw         vs12,   vs63,   0
+       xxspltw         vs13,   vs63,   1
+       xxspltw         vs14,   vs63,   2
+       xxspltw         vs15,   vs63,   3
+
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
+
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
+
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     64
+
+.endm
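+
+/* Reference note (sketch only, not executed): each "N=x M=y" block in the SAVE
+ * macro above applies complex alpha to the accumulated sums before updating C.
+ * With acc_r/acc_i denoting the per-element real/imaginary accumulators
+ * (vs4..vs7) and alpha held in alpha_sr/alpha_si, the update is, in
+ * illustrative C terms:
+ *
+ *     c_r += acc_r * alpha_r - acc_i * alpha_i;
+ *     c_i += acc_r * alpha_i + acc_i * alpha_r;
+ *
+ * For TRMMKERNEL builds C is not read back, so the result is stored directly.
+ */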
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=4
+**********************************************************************************************/
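+
+// Register usage in the 4x4 macros below (summary):
+//   vs0,vs1 and vs4,vs5     - two alternating copies of the four complex A elements a0..a3
+//   vs8-vs15 and vs16-vs23  - the four complex B values of one k step, each real and
+//                             imaginary part splatted across a full vector by xxspltw
+//   vs32-vs47               - accumulators: for every B value there are four registers,
+//                             one per combination of A half (a0/a1 vs. a2/a3) and
+//                             B part (real vs. imaginary)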
+
+.macro LOAD4x4_1
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs1,    o16,    AO              // load a2, a3
+
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+       lxvw4x          vs25,   o16,    BO              //  load b2, b3
+
+       xxspltw         vs12,   vs25,   0
+       xxspltw         vs13,   vs25,   1
+       xxspltw         vs14,   vs25,   2
+       xxspltw         vs15,   vs25,   3
+
+
+       addi            BO,     BO,     32
+
+.endm
+
+.macro KERNEL4x4_I1
+
+
+       lxvw4x          vs4,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs5,    o16,    AO              // load a2, a3
+
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs16,   vs24,   0
+       xxspltw         vs17,   vs24,   1
+       xxspltw         vs18,   vs24,   2
+       xxspltw         vs19,   vs24,   3
+
+       lxvw4x          vs25,   o16,    BO              //  load b2, b3
+
+       xxspltw         vs20,   vs25,   0
+       xxspltw         vs21,   vs25,   1
+       xxspltw         vs22,   vs25,   2
+       xxspltw         vs23,   vs25,   3
+
+
+       addi            BO,     BO,     32
+
+
+       xvmulsp         vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmulsp         vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmulsp         vs34,   vs1,    vs8             // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmulsp         vs35,   vs1,    vs9             // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+
+       xvmulsp         vs36,   vs0,    vs10            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmulsp         vs37,   vs0,    vs11            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+       xvmulsp         vs38,   vs1,    vs10            // a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+       xvmulsp         vs39,   vs1,    vs11            // a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+
+       xvmulsp         vs40,   vs0,    vs12            // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+       xvmulsp         vs41,   vs0,    vs13            // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+       xvmulsp         vs42,   vs1,    vs12            // a2_r*b2_r, a2_i*b2_r, a3_r*b2_r, a3_i*b2_r
+       xvmulsp         vs43,   vs1,    vs13            // a2_r*b2_i, a2_i*b2_i, a3_r*b2_i, a3_i*b2_i
+
+       xvmulsp         vs44,   vs0,    vs14            // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+       xvmulsp         vs45,   vs0,    vs15            // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+       xvmulsp         vs46,   vs1,    vs14            // a2_r*b3_r, a2_i*b3_r, a3_r*b3_r, a3_i*b3_r
+       xvmulsp         vs47,   vs1,    vs15            // a2_r*b3_i, a2_i*b3_i, a3_r*b3_i, a3_i*b3_i
+
+
+.endm
+
+.macro KERNEL4x4_1
+
+
+       lxvw4x          vs4,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs5,    o16,    AO              // load a2, a3
+
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs16,   vs24,   0
+       xxspltw         vs17,   vs24,   1
+       xxspltw         vs18,   vs24,   2
+       xxspltw         vs19,   vs24,   3
+
+       lxvw4x          vs25,   o16,    BO              //  load b2, b3
+
+       xxspltw         vs20,   vs25,   0
+       xxspltw         vs21,   vs25,   1
+       xxspltw         vs22,   vs25,   2
+       xxspltw         vs23,   vs25,   3
+
+
+       addi            BO,     BO,     32
+
+
+       xvmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmaddasp       vs34,   vs1,    vs8             // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmaddasp       vs35,   vs1,    vs9             // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+
+       xvmaddasp       vs36,   vs0,    vs10            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmaddasp       vs37,   vs0,    vs11            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+       xvmaddasp       vs38,   vs1,    vs10            // a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+       xvmaddasp       vs39,   vs1,    vs11            // a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+
+       xvmaddasp       vs40,   vs0,    vs12            // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+       xvmaddasp       vs41,   vs0,    vs13            // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+       xvmaddasp       vs42,   vs1,    vs12            // a2_r*b2_r, a2_i*b2_r, a3_r*b2_r, a3_i*b2_r
+       xvmaddasp       vs43,   vs1,    vs13            // a2_r*b2_i, a2_i*b2_i, a3_r*b2_i, a3_i*b2_i
+
+       xvmaddasp       vs44,   vs0,    vs14            // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+       xvmaddasp       vs45,   vs0,    vs15            // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+       xvmaddasp       vs46,   vs1,    vs14            // a2_r*b3_r, a2_i*b3_r, a3_r*b3_r, a3_i*b3_r
+       xvmaddasp       vs47,   vs1,    vs15            // a2_r*b3_i, a2_i*b3_i, a3_r*b3_i, a3_i*b3_i
+
+
+.endm
+
+.macro KERNEL4x4_2
+
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs1,    o16,    AO              // load a2, a3
+
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+       lxvw4x          vs25,   o16,    BO              //  load b2, b3
+
+       xxspltw         vs12,   vs25,   0
+       xxspltw         vs13,   vs25,   1
+       xxspltw         vs14,   vs25,   2
+       xxspltw         vs15,   vs25,   3
+
+
+       addi            BO,     BO,     32
+
+
+       xvmaddasp       vs32,   vs4,    vs16            // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs4,    vs17            // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmaddasp       vs34,   vs5,    vs16            // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmaddasp       vs35,   vs5,    vs17            // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+
+       xvmaddasp       vs36,   vs4,    vs18            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmaddasp       vs37,   vs4,    vs19            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+       xvmaddasp       vs38,   vs5,    vs18            // a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+       xvmaddasp       vs39,   vs5,    vs19            // a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+
+       xvmaddasp       vs40,   vs4,    vs20            // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+       xvmaddasp       vs41,   vs4,    vs21            // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+       xvmaddasp       vs42,   vs5,    vs20            // a2_r*b2_r, a2_i*b2_r, a3_r*b2_r, a3_i*b2_r
+       xvmaddasp       vs43,   vs5,    vs21            // a2_r*b2_i, a2_i*b2_i, a3_r*b2_i, a3_i*b2_i
+
+       xvmaddasp       vs44,   vs4,    vs22            // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+       xvmaddasp       vs45,   vs4,    vs23            // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+       xvmaddasp       vs46,   vs5,    vs22            // a2_r*b3_r, a2_i*b3_r, a3_r*b3_r, a3_i*b3_r
+       xvmaddasp       vs47,   vs5,    vs23            // a2_r*b3_i, a2_i*b3_i, a3_r*b3_i, a3_i*b3_i
+
+
+.endm
+
+.macro KERNEL4x4_E2
+
+
+       xvmaddasp       vs32,   vs4,    vs16            // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs4,    vs17            // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmaddasp       vs34,   vs5,    vs16            // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmaddasp       vs35,   vs5,    vs17            // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+
+       xvmaddasp       vs36,   vs4,    vs18            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmaddasp       vs37,   vs4,    vs19            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+       xvmaddasp       vs38,   vs5,    vs18            // a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+       xvmaddasp       vs39,   vs5,    vs19            // a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+
+       xvmaddasp       vs40,   vs4,    vs20            // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+       xvmaddasp       vs41,   vs4,    vs21            // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+       xvmaddasp       vs42,   vs5,    vs20            // a2_r*b2_r, a2_i*b2_r, a3_r*b2_r, a3_i*b2_r
+       xvmaddasp       vs43,   vs5,    vs21            // a2_r*b2_i, a2_i*b2_i, a3_r*b2_i, a3_i*b2_i
+
+       xvmaddasp       vs44,   vs4,    vs22            // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+       xvmaddasp       vs45,   vs4,    vs23            // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+       xvmaddasp       vs46,   vs5,    vs22            // a2_r*b3_r, a2_i*b3_r, a3_r*b3_r, a3_i*b3_r
+       xvmaddasp       vs47,   vs5,    vs23            // a2_r*b3_i, a2_i*b3_i, a3_r*b3_i, a3_i*b3_i
+
+
+.endm
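+
+// Pipelining note: LOAD4x4_1 preloads the first k step into vs0/vs1 and vs8-vs15;
+// KERNEL4x4_I1 then initializes the accumulators with xvmulsp while fetching the
+// next step into vs4/vs5 and vs16-vs23; KERNEL4x4_1 and KERNEL4x4_2 alternate
+// between the two register sets so that loads overlap the multiply-adds;
+// KERNEL4x4_E2 drains the last prefetched set without issuing further loads.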
+
+.macro KERNEL4x4_SUBI1
+
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs1,    o16,    AO              // load a2, a3
+
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+       lxvw4x          vs25,   o16,    BO              //  load b2, b3
+
+       xxspltw         vs12,   vs25,   0
+       xxspltw         vs13,   vs25,   1
+       xxspltw         vs14,   vs25,   2
+       xxspltw         vs15,   vs25,   3
+
+
+       addi            BO,     BO,     32
+
+
+       xvmulsp         vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmulsp         vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmulsp         vs34,   vs1,    vs8             // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmulsp         vs35,   vs1,    vs9             // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+
+       xvmulsp         vs36,   vs0,    vs10            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmulsp         vs37,   vs0,    vs11            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+       xvmulsp         vs38,   vs1,    vs10            // a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+       xvmulsp         vs39,   vs1,    vs11            // a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+
+       xvmulsp         vs40,   vs0,    vs12            // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+       xvmulsp         vs41,   vs0,    vs13            // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+       xvmulsp         vs42,   vs1,    vs12            // a2_r*b2_r, a2_i*b2_r, a3_r*b2_r, a3_i*b2_r
+       xvmulsp         vs43,   vs1,    vs13            // a2_r*b2_i, a2_i*b2_i, a3_r*b2_i, a3_i*b2_i
+
+       xvmulsp         vs44,   vs0,    vs14            // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+       xvmulsp         vs45,   vs0,    vs15            // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+       xvmulsp         vs46,   vs1,    vs14            // a2_r*b3_r, a2_i*b3_r, a3_r*b3_r, a3_i*b3_r
+       xvmulsp         vs47,   vs1,    vs15            // a2_r*b3_i, a2_i*b3_i, a3_r*b3_i, a3_i*b3_i
+
+
+.endm
+
+.macro KERNEL4x4_SUB1
+
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs1,    o16,    AO              // load a2, a3
+
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+       lxvw4x          vs25,   o16,    BO              //  load b2, b3
+
+       xxspltw         vs12,   vs25,   0
+       xxspltw         vs13,   vs25,   1
+       xxspltw         vs14,   vs25,   2
+       xxspltw         vs15,   vs25,   3
+
+
+       addi            BO,     BO,     32
+
+
+       xvmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmaddasp       vs34,   vs1,    vs8             // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmaddasp       vs35,   vs1,    vs9             // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+
+       xvmaddasp       vs36,   vs0,    vs10            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmaddasp       vs37,   vs0,    vs11            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+       xvmaddasp       vs38,   vs1,    vs10            // a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+       xvmaddasp       vs39,   vs1,    vs11            // a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+
+       xvmaddasp       vs40,   vs0,    vs12            // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+       xvmaddasp       vs41,   vs0,    vs13            // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+       xvmaddasp       vs42,   vs1,    vs12            // a2_r*b2_r, a2_i*b2_r, a3_r*b2_r, a3_i*b2_r
+       xvmaddasp       vs43,   vs1,    vs13            // a2_r*b2_i, a2_i*b2_i, a3_r*b2_i, a3_i*b2_i
+
+       xvmaddasp       vs44,   vs0,    vs14            // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+       xvmaddasp       vs45,   vs0,    vs15            // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+       xvmaddasp       vs46,   vs1,    vs14            // a2_r*b3_r, a2_i*b3_r, a3_r*b3_r, a3_i*b3_r
+       xvmaddasp       vs47,   vs1,    vs15            // a2_r*b3_i, a2_i*b3_i, a3_r*b3_i, a3_i*b3_i
+
+
+.endm
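+
+// KERNEL4x4_SUBI1 and KERNEL4x4_SUB1 are the non-pipelined single-step variants
+// (typically used for the K remainder): SUBI1 initializes the accumulators with
+// xvmulsp, SUB1 accumulates with xvmaddasp, and both load A and B within the macro.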
+
+.macro SAVE4x4
+
+       mr              T1,     CO
+
+// N=0
+
+       mr              T2,     T1
+
+// N=0 M=0
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+
+       xxspltw         vs8,    vs32,   0
+       xxspltw         vs9,    vs32,   1
+       xxspltw         vs10,   vs32,   2
+       xxspltw         vs11,   vs32,   3
+
+
+       xxspltw         vs12,   vs33,   0
+       xxspltw         vs13,   vs33,   1
+       xxspltw         vs14,   vs33,   2
+       xxspltw         vs15,   vs33,   3
+
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
+
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
+
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+
+// N=0 M=2
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+
+       xxspltw         vs8,    vs34,   0
+       xxspltw         vs9,    vs34,   1
+       xxspltw         vs10,   vs34,   2
+       xxspltw         vs11,   vs34,   3
+
+
+       xxspltw         vs12,   vs35,   0
+       xxspltw         vs13,   vs35,   1
+       xxspltw         vs14,   vs35,   2
+       xxspltw         vs15,   vs35,   3
+
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
+
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
+
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+       add             T1,     T1,     LDC
+
+
+// N=1
+
+       mr              T2,     T1
+
+// N=1 M=0
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+
+       xxspltw         vs8,    vs36,   0
+       xxspltw         vs9,    vs36,   1
+       xxspltw         vs10,   vs36,   2
+       xxspltw         vs11,   vs36,   3
+
+
+       xxspltw         vs12,   vs37,   0
+       xxspltw         vs13,   vs37,   1
+       xxspltw         vs14,   vs37,   2
+       xxspltw         vs15,   vs37,   3
+
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
+
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
+
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+
+// N=1 M=2
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+
+       xxspltw         vs8,    vs38,   0
+       xxspltw         vs9,    vs38,   1
+       xxspltw         vs10,   vs38,   2
+       xxspltw         vs11,   vs38,   3
+
+
+       xxspltw         vs12,   vs39,   0
+       xxspltw         vs13,   vs39,   1
+       xxspltw         vs14,   vs39,   2
+       xxspltw         vs15,   vs39,   3
+
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
+
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
+
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+       add             T1,     T1,     LDC
+
+
+// N=2
+
+       mr              T2,     T1
+
+// N=2 M=0
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+
+       xxspltw         vs8,    vs40,   0
+       xxspltw         vs9,    vs40,   1
+       xxspltw         vs10,   vs40,   2
+       xxspltw         vs11,   vs40,   3
+
+
+       xxspltw         vs12,   vs41,   0
+       xxspltw         vs13,   vs41,   1
+       xxspltw         vs14,   vs41,   2
+       xxspltw         vs15,   vs41,   3
+
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
+
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
+
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+
+// N=2 M=2
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+
+       xxspltw         vs8,    vs42,   0
+       xxspltw         vs9,    vs42,   1
+       xxspltw         vs10,   vs42,   2
+       xxspltw         vs11,   vs42,   3
+
+
+       xxspltw         vs12,   vs43,   0
+       xxspltw         vs13,   vs43,   1
+       xxspltw         vs14,   vs43,   2
+       xxspltw         vs15,   vs43,   3
+
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
+
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
+
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+       add             T1,     T1,     LDC
+
+
+// N=3
+
+       mr              T2,     T1
+
+// N=3 M=0
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+
+       xxspltw         vs8,    vs44,   0
+       xxspltw         vs9,    vs44,   1
+       xxspltw         vs10,   vs44,   2
+       xxspltw         vs11,   vs44,   3
+
+
+       xxspltw         vs12,   vs45,   0
+       xxspltw         vs13,   vs45,   1
+       xxspltw         vs14,   vs45,   2
+       xxspltw         vs15,   vs45,   3
+
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
+
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
+
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+
+// N=3 M=2
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+
+       xxspltw         vs8,    vs46,   0
+       xxspltw         vs9,    vs46,   1
+       xxspltw         vs10,   vs46,   2
+       xxspltw         vs11,   vs46,   3
+
+
+       xxspltw         vs12,   vs47,   0
+       xxspltw         vs13,   vs47,   1
+       xxspltw         vs14,   vs47,   2
+       xxspltw         vs15,   vs47,   3
+
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
+
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
+
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     32
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=2
+**********************************************************************************************/
+
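+// Each K iteration loads one vector of A (two complex singles a0,a1) and two
+// vectors of B (four complex singles b0..b3).  The real and imaginary part of
+// every B element is splat across all four lanes, so the eight accumulators
+// vs32..vs39 collect the a*b_r and a*b_i partial products of the four columns;
+// SAVE4x2 recombines them and applies alpha after the K loop.
+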
+.macro LOAD4x2_1
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+       lxvw4x          vs25,   o16,    BO              //  load b2, b3
+
+       xxspltw         vs12,   vs25,   0
+       xxspltw         vs13,   vs25,   1
+       xxspltw         vs14,   vs25,   2
+       xxspltw         vs15,   vs25,   3
+
+
+       addi            BO,     BO,     32
+
+.endm
+
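+// The KERNEL4x2_* macros use the same software pipelining as the larger tiles:
+// _I1 initializes the accumulators with xvmulsp while prefetching the next A/B
+// values into the second register set (vs4, vs16..vs23), _1 and _2 alternate
+// between the two register sets, _E2 only drains the last prefetched set, and
+// _SUBI1/_SUB1 handle the K remainder without double buffering.
+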
+.macro KERNEL4x2_I1
+
+
+       lxvw4x          vs4,    o0,     AO              // load a0, a1
+
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs16,   vs24,   0
+       xxspltw         vs17,   vs24,   1
+       xxspltw         vs18,   vs24,   2
+       xxspltw         vs19,   vs24,   3
+
+       lxvw4x          vs25,   o16,    BO              //  load b2, b3
+
+       xxspltw         vs20,   vs25,   0
+       xxspltw         vs21,   vs25,   1
+       xxspltw         vs22,   vs25,   2
+       xxspltw         vs23,   vs25,   3
+
+
+       addi            BO,     BO,     32
+
+
+       xvmulsp         vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmulsp         vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+       xvmulsp         vs34,   vs0,    vs10            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmulsp         vs35,   vs0,    vs11            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+       xvmulsp         vs36,   vs0,    vs12            // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+       xvmulsp         vs37,   vs0,    vs13            // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+
+       xvmulsp         vs38,   vs0,    vs14            // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+       xvmulsp         vs39,   vs0,    vs15            // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+
+
+.endm
+
+.macro KERNEL4x2_1
+
+
+       lxvw4x          vs4,    o0,     AO              // load a0, a1
+
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs16,   vs24,   0
+       xxspltw         vs17,   vs24,   1
+       xxspltw         vs18,   vs24,   2
+       xxspltw         vs19,   vs24,   3
+
+       lxvw4x          vs25,   o16,    BO              //  load b2, b3
+
+       xxspltw         vs20,   vs25,   0
+       xxspltw         vs21,   vs25,   1
+       xxspltw         vs22,   vs25,   2
+       xxspltw         vs23,   vs25,   3
+
+
+       addi            BO,     BO,     32
+
+
+       xvmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+       xvmaddasp       vs34,   vs0,    vs10            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmaddasp       vs35,   vs0,    vs11            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+       xvmaddasp       vs36,   vs0,    vs12            // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+       xvmaddasp       vs37,   vs0,    vs13            // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+
+       xvmaddasp       vs38,   vs0,    vs14            // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+       xvmaddasp       vs39,   vs0,    vs15            // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+
+
+.endm
+
+.macro KERNEL4x2_2
+
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+       lxvw4x          vs25,   o16,    BO              //  load b2, b3
+
+       xxspltw         vs12,   vs25,   0
+       xxspltw         vs13,   vs25,   1
+       xxspltw         vs14,   vs25,   2
+       xxspltw         vs15,   vs25,   3
+
+
+       addi            BO,     BO,     32
+
+
+       xvmaddasp       vs32,   vs4,    vs16            // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs4,    vs17            // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+       xvmaddasp       vs34,   vs4,    vs18            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmaddasp       vs35,   vs4,    vs19            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+       xvmaddasp       vs36,   vs4,    vs20            // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+       xvmaddasp       vs37,   vs4,    vs21            // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+
+       xvmaddasp       vs38,   vs4,    vs22            // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+       xvmaddasp       vs39,   vs4,    vs23            // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+
+
+.endm
+
+.macro KERNEL4x2_E2
+
+
+       xvmaddasp       vs32,   vs4,    vs16            // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs4,    vs17            // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+       xvmaddasp       vs34,   vs4,    vs18            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmaddasp       vs35,   vs4,    vs19            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+       xvmaddasp       vs36,   vs4,    vs20            // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+       xvmaddasp       vs37,   vs4,    vs21            // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+
+       xvmaddasp       vs38,   vs4,    vs22            // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+       xvmaddasp       vs39,   vs4,    vs23            // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+
+
+.endm
+
+.macro KERNEL4x2_SUBI1
+
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+       lxvw4x          vs25,   o16,    BO              //  load b2, b3
+
+       xxspltw         vs12,   vs25,   0
+       xxspltw         vs13,   vs25,   1
+       xxspltw         vs14,   vs25,   2
+       xxspltw         vs15,   vs25,   3
+
+
+       addi            BO,     BO,     32
+
+
+       xvmulsp         vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmulsp         vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+       xvmulsp         vs34,   vs0,    vs10            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmulsp         vs35,   vs0,    vs11            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+       xvmulsp         vs36,   vs0,    vs12            // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+       xvmulsp         vs37,   vs0,    vs13            // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+
+       xvmulsp         vs38,   vs0,    vs14            // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+       xvmulsp         vs39,   vs0,    vs15            // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+
+
+.endm
+
+.macro KERNEL4x2_SUB1
+
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+       lxvw4x          vs25,   o16,    BO              //  load b2, b3
+
+       xxspltw         vs12,   vs25,   0
+       xxspltw         vs13,   vs25,   1
+       xxspltw         vs14,   vs25,   2
+       xxspltw         vs15,   vs25,   3
+
+
+       addi            BO,     BO,     32
+
+
+       xvmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+       xvmaddasp       vs34,   vs0,    vs10            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmaddasp       vs35,   vs0,    vs11            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+       xvmaddasp       vs36,   vs0,    vs12            // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+       xvmaddasp       vs37,   vs0,    vs13            // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+
+       xvmaddasp       vs38,   vs0,    vs14            // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+       xvmaddasp       vs39,   vs0,    vs15            // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+
+
+.endm
+
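+// SAVE4x2 walks the four columns of the 2x4 tile: the XVFADD_R*/XVFADD_I*
+// macros (expected to be defined earlier in this file, selecting add or
+// subtract per conjugation variant) merge the partial products, the result is
+// scaled by the complex alpha in alpha_sr/alpha_si, packed back into one
+// vector with xxsldwi and added to the old C values unless TRMMKERNEL is set.
+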
+.macro SAVE4x2
+
+       mr              T1,     CO
+
+// N=0
+
+       mr              T2,     T1
+
+// N=0 M=0
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+
+       xxspltw         vs8,    vs32,   0
+       xxspltw         vs9,    vs32,   1
+       xxspltw         vs10,   vs32,   2
+       xxspltw         vs11,   vs32,   3
+
+
+       xxspltw         vs12,   vs33,   0
+       xxspltw         vs13,   vs33,   1
+       xxspltw         vs14,   vs33,   2
+       xxspltw         vs15,   vs33,   3
+
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
+
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
+
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+       add             T1,     T1,     LDC
+
+
+// N=1
+
+       mr              T2,     T1
+
+// N=1 M=0
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+
+       xxspltw         vs8,    vs34,   0
+       xxspltw         vs9,    vs34,   1
+       xxspltw         vs10,   vs34,   2
+       xxspltw         vs11,   vs34,   3
+
+
+       xxspltw         vs12,   vs35,   0
+       xxspltw         vs13,   vs35,   1
+       xxspltw         vs14,   vs35,   2
+       xxspltw         vs15,   vs35,   3
+
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
+
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
+
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+       add             T1,     T1,     LDC
+
+
+// N=2
+
+       mr              T2,     T1
+
+// N=2 M=0
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+
+       xxspltw         vs8,    vs36,   0
+       xxspltw         vs9,    vs36,   1
+       xxspltw         vs10,   vs36,   2
+       xxspltw         vs11,   vs36,   3
+
+
+       xxspltw         vs12,   vs37,   0
+       xxspltw         vs13,   vs37,   1
+       xxspltw         vs14,   vs37,   2
+       xxspltw         vs15,   vs37,   3
+
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
+
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
+
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+       add             T1,     T1,     LDC
+
+
+// N=3
+
+       mr              T2,     T1
+
+// N=3 M=0
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+
+       xxspltw         vs8,    vs38,   0
+       xxspltw         vs9,    vs38,   1
+       xxspltw         vs10,   vs38,   2
+       xxspltw         vs11,   vs38,   3
+
+
+       xxspltw         vs12,   vs39,   0
+       xxspltw         vs13,   vs39,   1
+       xxspltw         vs14,   vs39,   2
+       xxspltw         vs15,   vs39,   3
+
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
+
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
+
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     16
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=1
+**********************************************************************************************/
+
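+// The M=1 path works on a single complex element, so it uses scalar loads
+// (lxsspx) and scalar double-precision multiply/FMA (xsmuldp/xsmaddadp)
+// instead of vector instructions; vs32..vs47 hold the four partial products
+// (r*r, i*i, r*i, i*r) for each of the four B columns.
+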
+.macro LOAD4x1_1
+
+       lxsspx          vs0,    o0,     AO              // load a0_r
+       lxsspx          vs1,    o4,     AO              // load a0_i
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1              //  load b0_r
+       lxsspx          vs9,    o4,     T1              //  load b0_i
+
+       addi            T1,     T1,8
+
+       lxsspx          vs10,   o0,     T1              //  load b1_r
+       lxsspx          vs11,   o4,     T1              //  load b1_i
+
+       addi            T1,     T1,8
+
+       lxsspx          vs12,   o0,     T1              //  load b2_r
+       lxsspx          vs13,   o4,     T1              //  load b2_i
+
+       addi            T1,     T1,8
+
+       lxsspx          vs14,   o0,     T1              //  load b3_r
+       lxsspx          vs15,   o4,     T1              //  load b3_i
+
+       addi            BO,     BO,     32
+
+.endm
+
+.macro KERNEL4x1_I1
+
+
+       lxsspx          vs4,    o0,     AO              // load a0_r
+       lxsspx          vs5,    o4,     AO              // load a0_i
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs16,   o0,     T1              //  load b0_r
+       lxsspx          vs17,   o4,     T1              //  load b0_i
+
+       addi            T1,     T1,8
+
+       lxsspx          vs18,   o0,     T1              //  load b1_r
+       lxsspx          vs19,   o4,     T1              //  load b1_i
+
+       addi            T1,     T1,8
+
+       lxsspx          vs20,   o0,     T1              //  load b2_r
+       lxsspx          vs21,   o4,     T1              //  load b2_i
+
+       addi            T1,     T1,8
+
+       lxsspx          vs22,   o0,     T1              //  load b3_r
+       lxsspx          vs23,   o4,     T1              //  load b3_i
+
+       addi            BO,     BO,     32
+
+
+       xsmuldp         vs32,   vs0,    vs8             // a0_r*b0_r
+       xsmuldp         vs33,   vs1,    vs9             // a0_i*b0_i
+       xsmuldp         vs34,   vs0,    vs9             // a0_r*b0_i
+       xsmuldp         vs35,   vs1,    vs8             // a0_i*b0_r
+
+       xsmuldp         vs36,   vs0,    vs10            // a0_r*b1_r
+       xsmuldp         vs37,   vs1,    vs11            // a0_i*b1_i
+       xsmuldp         vs38,   vs0,    vs11            // a0_r*b1_i
+       xsmuldp         vs39,   vs1,    vs10            // a0_i*b1_r
+
+       xsmuldp         vs40,   vs0,    vs12            // a0_r*b2_r
+       xsmuldp         vs41,   vs1,    vs13            // a0_i*b2_i
+       xsmuldp         vs42,   vs0,    vs13            // a0_r*b2_i
+       xsmuldp         vs43,   vs1,    vs12            // a0_i*b2_r
+
+       xsmuldp         vs44,   vs0,    vs14            // a0_r*b3_r
+       xsmuldp         vs45,   vs1,    vs15            // a0_i*b3_i
+       xsmuldp         vs46,   vs0,    vs15            // a0_r*b3_i
+       xsmuldp         vs47,   vs1,    vs14            // a0_i*b3_r
+
+
+.endm
+
+.macro KERNEL4x1_1
+
+
+       lxsspx          vs4,    o0,     AO              // load a0_r
+       lxsspx          vs5,    o4,     AO              // load a0_i
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs16,   o0,     T1              //  load b0_r
+       lxsspx          vs17,   o4,     T1              //  load b0_i
+
+       addi            T1,     T1,8
+
+       lxsspx          vs18,   o0,     T1              //  load b1_r
+       lxsspx          vs19,   o4,     T1              //  load b1_i
+
+       addi            T1,     T1,8
+
+       lxsspx          vs20,   o0,     T1              //  load b2_r
+       lxsspx          vs21,   o4,     T1              //  load b2_i
+
+       addi            T1,     T1,8
+
+       lxsspx          vs22,   o0,     T1              //  load b3_r
+       lxsspx          vs23,   o4,     T1              //  load b3_i
+
+       addi            BO,     BO,     32
+
+
+       xsmaddadp       vs32,   vs0,    vs8             // a0_r*b0_r
+       xsmaddadp       vs33,   vs1,    vs9             // a0_i*b0_i
+       xsmaddadp       vs34,   vs0,    vs9             // a0_r*b0_i
+       xsmaddadp       vs35,   vs1,    vs8             // a0_i*b0_r
+
+       xsmaddadp       vs36,   vs0,    vs10            // a0_r*b1_r
+       xsmaddadp       vs37,   vs1,    vs11            // a0_i*b1_i
+       xsmaddadp       vs38,   vs0,    vs11            // a0_r*b1_i
+       xsmaddadp       vs39,   vs1,    vs10            // a0_i*b1_r
+
+       xsmaddadp       vs40,   vs0,    vs12            // a0_r*b2_r
+       xsmaddadp       vs41,   vs1,    vs13            // a0_i*b2_i
+       xsmaddadp       vs42,   vs0,    vs13            // a0_r*b2_i
+       xsmaddadp       vs43,   vs1,    vs12            // a0_i*b2_r
+
+       xsmaddadp       vs44,   vs0,    vs14            // a0_r*b3_r
+       xsmaddadp       vs45,   vs1,    vs15            // a0_i*b3_i
+       xsmaddadp       vs46,   vs0,    vs15            // a0_r*b3_i
+       xsmaddadp       vs47,   vs1,    vs14            // a0_i*b3_r
+
+
+.endm
+
+.macro KERNEL4x1_2
+
+
+       lxsspx          vs0,    o0,     AO              // load a0_r
+       lxsspx          vs1,    o4,     AO              // load a0_i
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1              //  load b0_r
+       lxsspx          vs9,    o4,     T1              //  load b0_i
+
+       addi            T1,     T1,8
+
+       lxsspx          vs10,   o0,     T1              //  load b1_r
+       lxsspx          vs11,   o4,     T1              //  load b1_i
+
+       addi            T1,     T1,8
+
+       lxsspx          vs12,   o0,     T1              //  load b2_r
+       lxsspx          vs13,   o4,     T1              //  load b2_i
+
+       addi            T1,     T1,8
+
+       lxsspx          vs14,   o0,     T1              //  load b3_r
+       lxsspx          vs15,   o4,     T1              //  load b3_i
+
+       addi            BO,     BO,     32
+
+
+       xsmaddadp       vs32,   vs4,    vs16            // a4_r*b0_r
+       xsmaddadp       vs33,   vs5,    vs17            // a4_i*b0_i
+       xsmaddadp       vs34,   vs4,    vs17            // a4_r*b0_i
+       xsmaddadp       vs35,   vs5,    vs16            // a4_i*b0_r
+
+       xsmaddadp       vs36,   vs4,    vs18            // a4_r*b1_r
+       xsmaddadp       vs37,   vs5,    vs19            // a4_i*b1_i
+       xsmaddadp       vs38,   vs4,    vs19            // a4_r*b1_i
+       xsmaddadp       vs39,   vs5,    vs18            // a4_i*b1_r
+
+       xsmaddadp       vs40,   vs4,    vs20            // a4_r*b2_r
+       xsmaddadp       vs41,   vs5,    vs21            // a4_i*b2_i
+       xsmaddadp       vs42,   vs4,    vs21            // a4_r*b2_i
+       xsmaddadp       vs43,   vs5,    vs20            // a4_i*b2_r
+
+       xsmaddadp       vs44,   vs4,    vs22            // a4_r*b3_r
+       xsmaddadp       vs45,   vs5,    vs23            // a4_i*b3_i
+       xsmaddadp       vs46,   vs4,    vs23            // a4_r*b3_i
+       xsmaddadp       vs47,   vs5,    vs22            // a4_i*b3_r
+
+
+.endm
+
+.macro KERNEL4x1_E2
+
+
+       xsmaddadp       vs32,   vs4,    vs16            // a4_r*b0_r
+       xsmaddadp       vs33,   vs5,    vs17            // a4_i*b0_i
+       xsmaddadp       vs34,   vs4,    vs17            // a4_r*b0_i
+       xsmaddadp       vs35,   vs5,    vs16            // a4_i*b0_r
+
+       xsmaddadp       vs36,   vs4,    vs18            // a4_r*b1_r
+       xsmaddadp       vs37,   vs5,    vs19            // a4_i*b1_i
+       xsmaddadp       vs38,   vs4,    vs19            // a4_r*b1_i
+       xsmaddadp       vs39,   vs5,    vs18            // a4_i*b1_r
+
+       xsmaddadp       vs40,   vs4,    vs20            // a4_r*b2_r
+       xsmaddadp       vs41,   vs5,    vs21            // a4_i*b2_i
+       xsmaddadp       vs42,   vs4,    vs21            // a4_r*b2_i
+       xsmaddadp       vs43,   vs5,    vs20            // a4_i*b2_r
+
+       xsmaddadp       vs44,   vs4,    vs22            // a4_r*b3_r
+       xsmaddadp       vs45,   vs5,    vs23            // a4_i*b3_i
+       xsmaddadp       vs46,   vs4,    vs23            // a4_r*b3_i
+       xsmaddadp       vs47,   vs5,    vs22            // a4_i*b3_r
+
+
+.endm
+
+.macro KERNEL4x1_SUBI1
+
+
+       lxsspx          vs0,    o0,     AO              // load a0_r
+       lxsspx          vs1,    o4,     AO              // load a0_i
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1              //  load b0_r
+       lxsspx          vs9,    o4,     T1              //  load b0_i
+
+       addi            T1,     T1,8
+
+       lxsspx          vs10,   o0,     T1              //  load b1_r
+       lxsspx          vs11,   o4,     T1              //  load b1_i
+
+       addi            T1,     T1,8
+
+       lxsspx          vs12,   o0,     T1              //  load b2_r
+       lxsspx          vs13,   o4,     T1              //  load b2_i
+
+       addi            T1,     T1,8
+
+       lxsspx          vs14,   o0,     T1              //  load b3_r
+       lxsspx          vs15,   o4,     T1              //  load b3_i
+
+       addi            BO,     BO,     32
+
+
+       xsmuldp         vs32,   vs0,    vs8             // a0_r*b0_r
+       xsmuldp         vs33,   vs1,    vs9             // a0_i*b0_i
+       xsmuldp         vs34,   vs0,    vs9             // a0_r*b0_i
+       xsmuldp         vs35,   vs1,    vs8             // a0_i*b0_r
+
+       xsmuldp         vs36,   vs0,    vs10            // a0_r*b1_r
+       xsmuldp         vs37,   vs1,    vs11            // a0_i*b1_i
+       xsmuldp         vs38,   vs0,    vs11            // a0_r*b1_i
+       xsmuldp         vs39,   vs1,    vs10            // a0_i*b1_r
+
+       xsmuldp         vs40,   vs0,    vs12            // a0_r*b2_r
+       xsmuldp         vs41,   vs1,    vs13            // a0_i*b2_i
+       xsmuldp         vs42,   vs0,    vs13            // a0_r*b2_i
+       xsmuldp         vs43,   vs1,    vs12            // a0_i*b2_r
+
+       xsmuldp         vs44,   vs0,    vs14            // a0_r*b3_r
+       xsmuldp         vs45,   vs1,    vs15            // a0_i*b3_i
+       xsmuldp         vs46,   vs0,    vs15            // a0_r*b3_i
+       xsmuldp         vs47,   vs1,    vs14            // a0_i*b3_r
+
+
+.endm
+
+.macro KERNEL4x1_SUB1
+
+
+       lxsspx          vs0,    o0,     AO              // load a0_r
+       lxsspx          vs1,    o4,     AO              // load a0_i
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1              //  load b0_r
+       lxsspx          vs9,    o4,     T1              //  load b0_i
+
+       addi            T1,     T1,8
+
+       lxsspx          vs10,   o0,     T1              //  load b1_r
+       lxsspx          vs11,   o4,     T1              //  load b1_i
+
+       addi            T1,     T1,8
+
+       lxsspx          vs12,   o0,     T1              //  load b2_r
+       lxsspx          vs13,   o4,     T1              //  load b2_i
+
+       addi            T1,     T1,8
+
+       lxsspx          vs14,   o0,     T1              //  load b3_r
+       lxsspx          vs15,   o4,     T1              //  load b3_i
+
+       addi            BO,     BO,     32
+
+
+       xsmaddadp       vs32,   vs0,    vs8             // a0_r*b0_r
+       xsmaddadp       vs33,   vs1,    vs9             // a0_i*b0_i
+       xsmaddadp       vs34,   vs0,    vs9             // a0_r*b0_i
+       xsmaddadp       vs35,   vs1,    vs8             // a0_i*b0_r
+
+       xsmaddadp       vs36,   vs0,    vs10            // a0_r*b1_r
+       xsmaddadp       vs37,   vs1,    vs11            // a0_i*b1_i
+       xsmaddadp       vs38,   vs0,    vs11            // a0_r*b1_i
+       xsmaddadp       vs39,   vs1,    vs10            // a0_i*b1_r
+
+       xsmaddadp       vs40,   vs0,    vs12            // a0_r*b2_r
+       xsmaddadp       vs41,   vs1,    vs13            // a0_i*b2_i
+       xsmaddadp       vs42,   vs0,    vs13            // a0_r*b2_i
+       xsmaddadp       vs43,   vs1,    vs12            // a0_i*b2_r
+
+       xsmaddadp       vs44,   vs0,    vs14            // a0_r*b3_r
+       xsmaddadp       vs45,   vs1,    vs15            // a0_i*b3_i
+       xsmaddadp       vs46,   vs0,    vs15            // a0_r*b3_i
+       xsmaddadp       vs47,   vs1,    vs14            // a0_i*b3_r
+
+
+.endm
+
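+// SAVE4x1 merges the scalar partial products with the XSFADD_* macros (the
+// scalar counterparts of XVFADD_*, expected to be defined earlier in this
+// file), applies alpha via alpha_dr/alpha_di and stores c0_r/c0_i for each of
+// the four columns.
+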
+.macro SAVE4x1
+
+       mr              T1,     CO
+
+// N=0
+
+       mr              T2,     T1
+
+// N=0 M=0
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+
+#ifndef TRMMKERNEL
+       lxsspx          vs0,    o0,     T2      // load c0_r
+       lxsspx          vs1,    o4,     T2      // load c0_i
+#else
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+#endif
+
+       XSFADD_R1       vs4,    vs4,    vs32            // add a0_r * b0_r
+       XSFADD_I1       vs5,    vs5,    vs35            // add a0_r * b0_i
+
+       XSFADD_R2       vs4,    vs4,    vs33            // add a0_i * b0_i
+       XSFADD_I2       vs5,    vs5,    vs34            // add a0_i * b0_r
+
+       xsmuldp         vs16,   vs4,    alpha_dr                // r0_r * alpha_r
+       xsmuldp         vs17,   vs5,    alpha_di                // r0_i * alpha_i
+       xsmuldp         vs18,   vs4,    alpha_di                // r0_r * alpha_i
+       xsmuldp         vs19,   vs5,    alpha_dr                // r0_i * alpha_r
+
+       xssubdp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsadddp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsadddp         vs0,    vs0,    vs20
+       xsadddp         vs1,    vs1,    vs21
+
+
+       stxsspx         vs0,    o0,     T2      // store c0_r
+       stxsspx         vs1,    o4,     T2      // store c0_i
+
+       addi            T2,     T2,     8
+       add             T1,     T1,     LDC
+
+
+// N=1
+
+       mr              T2,     T1
+
+// N=1 M=0
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+
+#ifndef TRMMKERNEL
+       lxsspx          vs0,    o0,     T2      // load c0_r
+       lxsspx          vs1,    o4,     T2      // load c0_i
+#else
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+#endif
+
+       XSFADD_R1       vs4,    vs4,    vs36            // add a0_r * b0_r
+       XSFADD_I1       vs5,    vs5,    vs39            // add a0_r * b0_i
+
+       XSFADD_R2       vs4,    vs4,    vs37            // add a0_i * b0_i
+       XSFADD_I2       vs5,    vs5,    vs38            // add a0_i * b0_r
+
+       xsmuldp         vs16,   vs4,    alpha_dr                // r0_r * alpha_r
+       xsmuldp         vs17,   vs5,    alpha_di                // r0_i * alpha_i
+       xsmuldp         vs18,   vs4,    alpha_di                // r0_r * alpha_i
+       xsmuldp         vs19,   vs5,    alpha_dr                // r0_i * alpha_r
+
+       xssubdp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsadddp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsadddp         vs0,    vs0,    vs20
+       xsadddp         vs1,    vs1,    vs21
+
+
+       stxsspx         vs0,    o0,     T2      // store c0_r
+       stxsspx         vs1,    o4,     T2      // store c0_i
+
+       addi            T2,     T2,     8
+       add             T1,     T1,     LDC
+
+
+// N=2
+
+       mr              T2,     T1
+
+// N=2 M=0
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+
+#ifndef TRMMKERNEL
+       lxsspx          vs0,    o0,     T2      // load c0_r
+       lxsspx          vs1,    o4,     T2      // load c0_i
+#else
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+#endif
+
+       XSFADD_R1       vs4,    vs4,    vs40            // add a0_r * b0_r
+       XSFADD_I1       vs5,    vs5,    vs43            // add a0_r * b0_i
+
+       XSFADD_R2       vs4,    vs4,    vs41            // add a0_i * b0_i
+       XSFADD_I2       vs5,    vs5,    vs42            // add a0_i * b0_r
+
+       xsmuldp         vs16,   vs4,    alpha_dr                // r0_r * alpha_r
+       xsmuldp         vs17,   vs5,    alpha_di                // r0_i * alpha_i
+       xsmuldp         vs18,   vs4,    alpha_di                // r0_r * alpha_i
+       xsmuldp         vs19,   vs5,    alpha_dr                // r0_i * alpha_r
+
+       xssubdp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsadddp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsadddp         vs0,    vs0,    vs20
+       xsadddp         vs1,    vs1,    vs21
+
+
+       stxsspx         vs0,    o0,     T2      // store c0_r
+       stxsspx         vs1,    o4,     T2      // store c0_i
+
+       addi            T2,     T2,     8
+       add             T1,     T1,     LDC
+
+
+// N=3
+
+       mr              T2,     T1
+
+// N=3 M=0
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+
+#ifndef TRMMKERNEL
+       lxsspx          vs0,    o0,     T2      // load c0_r
+       lxsspx          vs1,    o4,     T2      // load c0_i
+#else
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+#endif
+
+       XSFADD_R1       vs4,    vs4,    vs44            // add a0_r * b0_r
+       XSFADD_I1       vs5,    vs5,    vs47            // add a0_r * b0_i
+
+       XSFADD_R2       vs4,    vs4,    vs45            // add a0_i * b0_i
+       XSFADD_I2       vs5,    vs5,    vs46            // add a0_i * b0_r
+
+       xsmuldp         vs16,   vs4,    alpha_dr                // r0_r * alpha_r
+       xsmuldp         vs17,   vs5,    alpha_di                // r0_i * alpha_i
+       xsmuldp         vs18,   vs4,    alpha_di                // r0_r * alpha_i
+       xsmuldp         vs19,   vs5,    alpha_dr                // r0_i * alpha_r
+
+       xssubdp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsadddp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsadddp         vs0,    vs0,    vs20
+       xsadddp         vs1,    vs1,    vs21
+
+
+       stxsspx         vs0,    o0,     T2      // store c0_r
+       stxsspx         vs1,    o4,     T2      // store c0_i
+
+       addi            T2,     T2,     8
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     8
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=8
+**********************************************************************************************/
+
+.macro LOAD2x8_1
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs1,    o16,    AO              // load a2, a3
+
+       lxvw4x          vs2,    o32,    AO              // load a4, a5
+
+       lxvw4x          vs3,    o48,    AO              // load a6, a7
+
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+
+       addi            BO,     BO,     16
+
+.endm
+
+.macro KERNEL2x8_I1
+
+
+       lxvw4x          vs4,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs5,    o16,    AO              // load a2, a3
+
+       lxvw4x          vs6,    o32,    AO              // load a4, a5
+
+       lxvw4x          vs7,    o48,    AO              // load a6, a7
+
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs16,   vs24,   0
+       xxspltw         vs17,   vs24,   1
+       xxspltw         vs18,   vs24,   2
+       xxspltw         vs19,   vs24,   3
+
+
+       addi            BO,     BO,     16
+
+
+       xvmulsp         vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmulsp         vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmulsp         vs34,   vs1,    vs8             // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmulsp         vs35,   vs1,    vs9             // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+       xvmulsp         vs36,   vs2,    vs8             // a4_r*b0_r, a4_i*b0_r, a5_r*b0_r, a5_i*b0_r
+       xvmulsp         vs37,   vs2,    vs9             // a4_r*b0_i, a4_i*b0_i, a5_r*b0_i, a5_i*b0_i
+       xvmulsp         vs38,   vs3,    vs8             // a6_r*b0_r, a6_i*b0_r, a7_r*b0_r, a7_i*b0_r
+       xvmulsp         vs39,   vs3,    vs9             // a6_r*b0_i, a6_i*b0_i, a7_r*b0_i, a7_i*b0_i
+
+       xvmulsp         vs40,   vs0,    vs10            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmulsp         vs41,   vs0,    vs11            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+       xvmulsp         vs42,   vs1,    vs10            // a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+       xvmulsp         vs43,   vs1,    vs11            // a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+       xvmulsp         vs44,   vs2,    vs10            // a4_r*b1_r, a4_i*b1_r, a5_r*b1_r, a5_i*b1_r
+       xvmulsp         vs45,   vs2,    vs11            // a4_r*b1_i, a4_i*b1_i, a5_r*b1_i, a5_i*b1_i
+       xvmulsp         vs46,   vs3,    vs10            // a6_r*b1_r, a6_i*b1_r, a7_r*b1_r, a7_i*b1_r
+       xvmulsp         vs47,   vs3,    vs11            // a6_r*b1_i, a6_i*b1_i, a7_r*b1_i, a7_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x8_1
+
+
+       lxvw4x          vs4,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs5,    o16,    AO              // load a2, a3
+
+       lxvw4x          vs6,    o32,    AO              // load a4, a5
+
+       lxvw4x          vs7,    o48,    AO              // load a6, a7
+
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs16,   vs24,   0
+       xxspltw         vs17,   vs24,   1
+       xxspltw         vs18,   vs24,   2
+       xxspltw         vs19,   vs24,   3
+
+
+       addi            BO,     BO,     16
+
+
+       xvmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmaddasp       vs34,   vs1,    vs8             // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmaddasp       vs35,   vs1,    vs9             // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+       xvmaddasp       vs36,   vs2,    vs8             // a4_r*b0_r, a4_i*b0_r, a5_r*b0_r, a5_i*b0_r
+       xvmaddasp       vs37,   vs2,    vs9             // a4_r*b0_i, a4_i*b0_i, a5_r*b0_i, a5_i*b0_i
+       xvmaddasp       vs38,   vs3,    vs8             // a6_r*b0_r, a6_i*b0_r, a7_r*b0_r, a7_i*b0_r
+       xvmaddasp       vs39,   vs3,    vs9             // a6_r*b0_i, a6_i*b0_i, a7_r*b0_i, a7_i*b0_i
+
+       xvmaddasp       vs40,   vs0,    vs10            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmaddasp       vs41,   vs0,    vs11            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+       xvmaddasp       vs42,   vs1,    vs10            // a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+       xvmaddasp       vs43,   vs1,    vs11            // a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+       xvmaddasp       vs44,   vs2,    vs10            // a4_r*b1_r, a4_i*b1_r, a5_r*b1_r, a5_i*b1_r
+       xvmaddasp       vs45,   vs2,    vs11            // a4_r*b1_i, a4_i*b1_i, a5_r*b1_i, a5_i*b1_i
+       xvmaddasp       vs46,   vs3,    vs10            // a6_r*b1_r, a6_i*b1_r, a7_r*b1_r, a7_i*b1_r
+       xvmaddasp       vs47,   vs3,    vs11            // a6_r*b1_i, a6_i*b1_i, a7_r*b1_i, a7_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x8_2
+
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs1,    o16,    AO              // load a2, a3
+
+       lxvw4x          vs2,    o32,    AO              // load a4, a5
+
+       lxvw4x          vs3,    o48,    AO              // load a6, a7
+
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+
+       addi            BO,     BO,     16
+
+
+       xvmaddasp       vs32,   vs4,    vs16            // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs4,    vs17            // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmaddasp       vs34,   vs5,    vs16            // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmaddasp       vs35,   vs5,    vs17            // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+       xvmaddasp       vs36,   vs6,    vs16            // a4_r*b0_r, a4_i*b0_r, a5_r*b0_r, a5_i*b0_r
+       xvmaddasp       vs37,   vs6,    vs17            // a4_r*b0_i, a4_i*b0_i, a5_r*b0_i, a5_i*b0_i
+       xvmaddasp       vs38,   vs7,    vs16            // a6_r*b0_r, a6_i*b0_r, a7_r*b0_r, a7_i*b0_r
+       xvmaddasp       vs39,   vs7,    vs17            // a6_r*b0_i, a6_i*b0_i, a7_r*b0_i, a7_i*b0_i
+
+       xvmaddasp       vs40,   vs4,    vs18            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmaddasp       vs41,   vs4,    vs19            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+       xvmaddasp       vs42,   vs5,    vs18            // a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+       xvmaddasp       vs43,   vs5,    vs19            // a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+       xvmaddasp       vs44,   vs6,    vs18            // a4_r*b1_r, a4_i*b1_r, a5_r*b1_r, a5_i*b1_r
+       xvmaddasp       vs45,   vs6,    vs19            // a4_r*b1_i, a4_i*b1_i, a5_r*b1_i, a5_i*b1_i
+       xvmaddasp       vs46,   vs7,    vs18            // a6_r*b1_r, a6_i*b1_r, a7_r*b1_r, a7_i*b1_r
+       xvmaddasp       vs47,   vs7,    vs19            // a6_r*b1_i, a6_i*b1_i, a7_r*b1_i, a7_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x8_E2
+
+
+       xvmaddasp       vs32,   vs4,    vs16            // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs4,    vs17            // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmaddasp       vs34,   vs5,    vs16            // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmaddasp       vs35,   vs5,    vs17            // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+       xvmaddasp       vs36,   vs6,    vs16            // a4_r*b0_r, a4_i*b0_r, a5_r*b0_r, a5_i*b0_r
+       xvmaddasp       vs37,   vs6,    vs17            // a4_r*b0_i, a4_i*b0_i, a5_r*b0_i, a5_i*b0_i
+       xvmaddasp       vs38,   vs7,    vs16            // a6_r*b0_r, a6_i*b0_r, a7_r*b0_r, a7_i*b0_r
+       xvmaddasp       vs39,   vs7,    vs17            // a6_r*b0_i, a6_i*b0_i, a7_r*b0_i, a7_i*b0_i
+
+       xvmaddasp       vs40,   vs4,    vs18            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmaddasp       vs41,   vs4,    vs19            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+       xvmaddasp       vs42,   vs5,    vs18            // a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+       xvmaddasp       vs43,   vs5,    vs19            // a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+       xvmaddasp       vs44,   vs6,    vs18            // a4_r*b1_r, a4_i*b1_r, a5_r*b1_r, a5_i*b1_r
+       xvmaddasp       vs45,   vs6,    vs19            // a4_r*b1_i, a4_i*b1_i, a5_r*b1_i, a5_i*b1_i
+       xvmaddasp       vs46,   vs7,    vs18            // a6_r*b1_r, a6_i*b1_r, a7_r*b1_r, a7_i*b1_r
+       xvmaddasp       vs47,   vs7,    vs19            // a6_r*b1_i, a6_i*b1_i, a7_r*b1_i, a7_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x8_SUBI1
+
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs1,    o16,    AO              // load a2, a3
+
+       lxvw4x          vs2,    o32,    AO              // load a4, a5
+
+       lxvw4x          vs3,    o48,    AO              // load a6, a7
+
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+
+       addi            BO,     BO,     16
+
+
+       xvmulsp         vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmulsp         vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmulsp         vs34,   vs1,    vs8             // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmulsp         vs35,   vs1,    vs9             // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+       xvmulsp         vs36,   vs2,    vs8             // a4_r*b0_r, a4_i*b0_r, a5_r*b0_r, a5_i*b0_r
+       xvmulsp         vs37,   vs2,    vs9             // a4_r*b0_i, a4_i*b0_i, a5_r*b0_i, a5_i*b0_i
+       xvmulsp         vs38,   vs3,    vs8             // a6_r*b0_r, a6_i*b0_r, a7_r*b0_r, a7_i*b0_r
+       xvmulsp         vs39,   vs3,    vs9             // a6_r*b0_i, a6_i*b0_i, a7_r*b0_i, a7_i*b0_i
+
+       xvmulsp         vs40,   vs0,    vs10            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmulsp         vs41,   vs0,    vs11            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+       xvmulsp         vs42,   vs1,    vs10            // a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+       xvmulsp         vs43,   vs1,    vs11            // a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+       xvmulsp         vs44,   vs2,    vs10            // a4_r*b1_r, a4_i*b1_r, a5_r*b1_r, a5_i*b1_r
+       xvmulsp         vs45,   vs2,    vs11            // a4_r*b1_i, a4_i*b1_i, a5_r*b1_i, a5_i*b1_i
+       xvmulsp         vs46,   vs3,    vs10            // a6_r*b1_r, a6_i*b1_r, a7_r*b1_r, a7_i*b1_r
+       xvmulsp         vs47,   vs3,    vs11            // a6_r*b1_i, a6_i*b1_i, a7_r*b1_i, a7_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x8_SUB1
+
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs1,    o16,    AO              // load a2, a3
+
+       lxvw4x          vs2,    o32,    AO              // load a4, a5
+
+       lxvw4x          vs3,    o48,    AO              // load a6, a7
+
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+
+       addi            BO,     BO,     16
+
+
+       xvmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmaddasp       vs34,   vs1,    vs8             // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmaddasp       vs35,   vs1,    vs9             // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+       xvmaddasp       vs36,   vs2,    vs8             // a4_r*b0_r, a4_i*b0_r, a5_r*b0_r, a5_i*b0_r
+       xvmaddasp       vs37,   vs2,    vs9             // a4_r*b0_i, a4_i*b0_i, a5_r*b0_i, a5_i*b0_i
+       xvmaddasp       vs38,   vs3,    vs8             // a6_r*b0_r, a6_i*b0_r, a7_r*b0_r, a7_i*b0_r
+       xvmaddasp       vs39,   vs3,    vs9             // a6_r*b0_i, a6_i*b0_i, a7_r*b0_i, a7_i*b0_i
+
+       xvmaddasp       vs40,   vs0,    vs10            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmaddasp       vs41,   vs0,    vs11            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+       xvmaddasp       vs42,   vs1,    vs10            // a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+       xvmaddasp       vs43,   vs1,    vs11            // a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+       xvmaddasp       vs44,   vs2,    vs10            // a4_r*b1_r, a4_i*b1_r, a5_r*b1_r, a5_i*b1_r
+       xvmaddasp       vs45,   vs2,    vs11            // a4_r*b1_i, a4_i*b1_i, a5_r*b1_i, a5_i*b1_i
+       xvmaddasp       vs46,   vs3,    vs10            // a6_r*b1_r, a6_i*b1_r, a7_r*b1_r, a7_i*b1_r
+       xvmaddasp       vs47,   vs3,    vs11            // a6_r*b1_i, a6_i*b1_i, a7_r*b1_i, a7_i*b1_i
+
+
+.endm
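+
+// A note on the accumulator layout the KERNEL2x8_* macros build up before SAVE2x8:
+// each k-step multiplies every word of the 8-element A tile by the splatted real
+// part and by the splatted imaginary part of each of the two B elements, keeping
+// the products in separate registers (vs32..vs47). A plain-C reference model of one
+// such step is sketched below; the function and array names are illustrative only.
+//
+//   /* One k-step of the 2x8 micro-kernel, kept in split accumulators the way
+//    * vs32..vs47 are: acc_br collects a[x]*b{n}_r, acc_bi collects a[x]*b{n}_i. */
+//   static void kernel2x8_step(const float a[16],  /* a0_r,a0_i, ... ,a7_r,a7_i */
+//                              const float b[4],   /* b0_r,b0_i,b1_r,b1_i       */
+//                              float acc_br[2][16],
+//                              float acc_bi[2][16])
+//   {
+//       for (int n = 0; n < 2; n++)                /* two B elements            */
+//           for (int x = 0; x < 16; x++) {         /* 16 words of the A tile    */
+//               acc_br[n][x] += a[x] * b[2*n + 0]; /* product with splatted b_r */
+//               acc_bi[n][x] += a[x] * b[2*n + 1]; /* product with splatted b_i */
+//           }
+//   }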
+
+.macro SAVE2x8
+
+       mr              T1,     CO
+
+// N=0
+
+       mr              T2,     T1
+
+// N=0 M=0
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+
+       xxspltw         vs8,    vs32,   0
+       xxspltw         vs9,    vs32,   1
+       xxspltw         vs10,   vs32,   2
+       xxspltw         vs11,   vs32,   3
+
+
+       xxspltw         vs12,   vs33,   0
+       xxspltw         vs13,   vs33,   1
+       xxspltw         vs14,   vs33,   2
+       xxspltw         vs15,   vs33,   3
+
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
+
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
+
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+
+// N=0 M=2
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+
+       xxspltw         vs8,    vs34,   0
+       xxspltw         vs9,    vs34,   1
+       xxspltw         vs10,   vs34,   2
+       xxspltw         vs11,   vs34,   3
+
+
+       xxspltw         vs12,   vs35,   0
+       xxspltw         vs13,   vs35,   1
+       xxspltw         vs14,   vs35,   2
+       xxspltw         vs15,   vs35,   3
+
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
+
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
+
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+
+// N=0 M=4
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+
+       xxspltw         vs8,    vs36,   0
+       xxspltw         vs9,    vs36,   1
+       xxspltw         vs10,   vs36,   2
+       xxspltw         vs11,   vs36,   3
+
+
+       xxspltw         vs12,   vs37,   0
+       xxspltw         vs13,   vs37,   1
+       xxspltw         vs14,   vs37,   2
+       xxspltw         vs15,   vs37,   3
+
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
+
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
+
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+
+// N=0 M=6
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+
+       xxspltw         vs8,    vs38,   0
+       xxspltw         vs9,    vs38,   1
+       xxspltw         vs10,   vs38,   2
+       xxspltw         vs11,   vs38,   3
+
+
+       xxspltw         vs12,   vs39,   0
+       xxspltw         vs13,   vs39,   1
+       xxspltw         vs14,   vs39,   2
+       xxspltw         vs15,   vs39,   3
+
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
+
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
+
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+       add             T1,     T1,     LDC
+
+
+// N=1
+
+       mr              T2,     T1
+
+// N=1 M=0
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+
+       xxspltw         vs8,    vs40,   0
+       xxspltw         vs9,    vs40,   1
+       xxspltw         vs10,   vs40,   2
+       xxspltw         vs11,   vs40,   3
+
+
+       xxspltw         vs12,   vs41,   0
+       xxspltw         vs13,   vs41,   1
+       xxspltw         vs14,   vs41,   2
+       xxspltw         vs15,   vs41,   3
+
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
+
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
+
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+
+// N=1 M=2
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+
+       xxspltw         vs8,    vs42,   0
+       xxspltw         vs9,    vs42,   1
+       xxspltw         vs10,   vs42,   2
+       xxspltw         vs11,   vs42,   3
+
+
+       xxspltw         vs12,   vs43,   0
+       xxspltw         vs13,   vs43,   1
+       xxspltw         vs14,   vs43,   2
+       xxspltw         vs15,   vs43,   3
+
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
+
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
+
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+
+// N=1 M=4
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+
+       xxspltw         vs8,    vs44,   0
+       xxspltw         vs9,    vs44,   1
+       xxspltw         vs10,   vs44,   2
+       xxspltw         vs11,   vs44,   3
+
+
+       xxspltw         vs12,   vs45,   0
+       xxspltw         vs13,   vs45,   1
+       xxspltw         vs14,   vs45,   2
+       xxspltw         vs15,   vs45,   3
+
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
+
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
+
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+
+// N=1 M=6
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+
+       xxspltw         vs8,    vs46,   0
+       xxspltw         vs9,    vs46,   1
+       xxspltw         vs10,   vs46,   2
+       xxspltw         vs11,   vs46,   3
+
+
+       xxspltw         vs12,   vs47,   0
+       xxspltw         vs13,   vs47,   1
+       xxspltw         vs14,   vs47,   2
+       xxspltw         vs15,   vs47,   3
+
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
+
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
+
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     64
+
+.endm
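+
+// SAVE2x8 above reduces those split accumulators pair by pair: for each pair of A
+// elements it rebuilds the real and imaginary parts of the dot product (the add/sub
+// pattern of XVFADD_R1/R2/I1/I2 depends on the conjugation variant defined elsewhere
+// in this file), applies alpha, and uses xxsldwi to slide the four scalars back into
+// one 16-byte vector before the store. For the plain, non-conjugated case this is
+// equivalent to the C sketch below (illustrative names; assumes the NN definitions
+// of the XVFADD_* macros).
+//
+//   /* One "M=..." block of SAVE2x8, non-conjugated case: vs32/vs33 correspond
+//    * to acc_br/acc_bi of one A pair for one B element.                      */
+//   static void save_pair(float c[4],             /* c0_r,c0_i,c1_r,c1_i */
+//                         const float acc_br[4],  /* a*_r*b_r, a*_i*b_r  */
+//                         const float acc_bi[4],  /* a*_r*b_i, a*_i*b_i  */
+//                         float alpha_r, float alpha_i)
+//   {
+//       for (int m = 0; m < 2; m++) {
+//           float rr = acc_br[2*m + 0] - acc_bi[2*m + 1];  /* ar*br - ai*bi */
+//           float ri = acc_bi[2*m + 0] + acc_br[2*m + 1];  /* ar*bi + ai*br */
+//           c[2*m + 0] += rr * alpha_r - ri * alpha_i;
+//           c[2*m + 1] += rr * alpha_i + ri * alpha_r;
+//       }
+//   }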
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=4
+**********************************************************************************************/
+
+.macro LOAD2x4_1
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs1,    o16,    AO              // load a2, a3
+
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+
+       addi            BO,     BO,     16
+
+.endm
+
+.macro KERNEL2x4_I1
+
+
+       lxvw4x          vs4,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs5,    o16,    AO              // load a2, a3
+
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs16,   vs24,   0
+       xxspltw         vs17,   vs24,   1
+       xxspltw         vs18,   vs24,   2
+       xxspltw         vs19,   vs24,   3
+
+
+       addi            BO,     BO,     16
+
+
+       xvmulsp         vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmulsp         vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmulsp         vs34,   vs1,    vs8             // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmulsp         vs35,   vs1,    vs9             // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+
+       xvmulsp         vs36,   vs0,    vs10            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmulsp         vs37,   vs0,    vs11            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+       xvmulsp         vs38,   vs1,    vs10            // a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+       xvmulsp         vs39,   vs1,    vs11            // a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x4_1
+
+
+       lxvw4x          vs4,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs5,    o16,    AO              // load a2, a3
+
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs16,   vs24,   0
+       xxspltw         vs17,   vs24,   1
+       xxspltw         vs18,   vs24,   2
+       xxspltw         vs19,   vs24,   3
+
+
+       addi            BO,     BO,     16
+
+
+       xvmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmaddasp       vs34,   vs1,    vs8             // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmaddasp       vs35,   vs1,    vs9             // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+
+       xvmaddasp       vs36,   vs0,    vs10            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmaddasp       vs37,   vs0,    vs11            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+       xvmaddasp       vs38,   vs1,    vs10            // a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+       xvmaddasp       vs39,   vs1,    vs11            // a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x4_2
+
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs1,    o16,    AO              // load a2, a3
+
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+
+       addi            BO,     BO,     16
+
+
+       xvmaddasp       vs32,   vs4,    vs16            // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs4,    vs17            // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmaddasp       vs34,   vs5,    vs16            // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmaddasp       vs35,   vs5,    vs17            // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+
+       xvmaddasp       vs36,   vs4,    vs18            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmaddasp       vs37,   vs4,    vs19            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+       xvmaddasp       vs38,   vs5,    vs18            // a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+       xvmaddasp       vs39,   vs5,    vs19            // a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x4_E2
+
+
+       xvmaddasp       vs32,   vs4,    vs16            // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs4,    vs17            // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmaddasp       vs34,   vs5,    vs16            // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmaddasp       vs35,   vs5,    vs17            // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+
+       xvmaddasp       vs36,   vs4,    vs18            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmaddasp       vs37,   vs4,    vs19            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+       xvmaddasp       vs38,   vs5,    vs18            // a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+       xvmaddasp       vs39,   vs5,    vs19            // a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x4_SUBI1
+
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs1,    o16,    AO              // load a2, a3
+
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+
+       addi            BO,     BO,     16
+
+
+       xvmulsp         vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmulsp         vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmulsp         vs34,   vs1,    vs8             // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmulsp         vs35,   vs1,    vs9             // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+
+       xvmulsp         vs36,   vs0,    vs10            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmulsp         vs37,   vs0,    vs11            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+       xvmulsp         vs38,   vs1,    vs10            // a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+       xvmulsp         vs39,   vs1,    vs11            // a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x4_SUB1
+
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs1,    o16,    AO              // load a2, a3
+
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+
+       addi            BO,     BO,     16
+
+
+       xvmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmaddasp       vs34,   vs1,    vs8             // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmaddasp       vs35,   vs1,    vs9             // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+
+       xvmaddasp       vs36,   vs0,    vs10            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmaddasp       vs37,   vs0,    vs11            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+       xvmaddasp       vs38,   vs1,    vs10            // a2_r*b1_r, a2_i*b1_r, a3_r*b1_r, a3_i*b1_r
+       xvmaddasp       vs39,   vs1,    vs11            // a2_r*b1_i, a2_i*b1_i, a3_r*b1_i, a3_i*b1_i
+
+
+.endm
+
+.macro SAVE2x4
+
+       mr              T1,     CO
+
+// N=0
+
+       mr              T2,     T1
+
+// N=0 M=0
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+
+       xxspltw         vs8,    vs32,   0
+       xxspltw         vs9,    vs32,   1
+       xxspltw         vs10,   vs32,   2
+       xxspltw         vs11,   vs32,   3
+
+
+       xxspltw         vs12,   vs33,   0
+       xxspltw         vs13,   vs33,   1
+       xxspltw         vs14,   vs33,   2
+       xxspltw         vs15,   vs33,   3
+
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
+
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
+
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+
+// N=0 M=2
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+
+       xxspltw         vs8,    vs34,   0
+       xxspltw         vs9,    vs34,   1
+       xxspltw         vs10,   vs34,   2
+       xxspltw         vs11,   vs34,   3
+
+
+       xxspltw         vs12,   vs35,   0
+       xxspltw         vs13,   vs35,   1
+       xxspltw         vs14,   vs35,   2
+       xxspltw         vs15,   vs35,   3
+
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
+
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
+
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+       add             T1,     T1,     LDC
+
+
+// N=1
+
+       mr              T2,     T1
+
+// N=1 M=0
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+
+       xxspltw         vs8,    vs36,   0
+       xxspltw         vs9,    vs36,   1
+       xxspltw         vs10,   vs36,   2
+       xxspltw         vs11,   vs36,   3
+
+
+       xxspltw         vs12,   vs37,   0
+       xxspltw         vs13,   vs37,   1
+       xxspltw         vs14,   vs37,   2
+       xxspltw         vs15,   vs37,   3
+
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
+
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
+
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+
+// N=1 M=2
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+
+       xxspltw         vs8,    vs38,   0
+       xxspltw         vs9,    vs38,   1
+       xxspltw         vs10,   vs38,   2
+       xxspltw         vs11,   vs38,   3
+
+
+       xxspltw         vs12,   vs39,   0
+       xxspltw         vs13,   vs39,   1
+       xxspltw         vs14,   vs39,   2
+       xxspltw         vs15,   vs39,   3
+
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
+
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
+
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     32
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=2
+**********************************************************************************************/
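+
+// A minimal sketch of what the 2x2 micro-kernel below accumulates per K
+// step, assuming each lxvw4x vector holds two complex singles
+// {x0_r, x0_i, x1_r, x1_i} as in the load comments:
+//
+//   vs32 += {a0,a1} * b0_r     vs33 += {a0,a1} * b0_i      // column 0
+//   vs34 += {a0,a1} * b1_r     vs35 += {a0,a1} * b1_i      // column 1
+//
+// Real and imaginary parts of B stay in separate accumulators; the complex
+// recombination and the alpha scaling are deferred to SAVE2x2.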
+
+.macro LOAD2x2_1
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+
+       addi            BO,     BO,     16
+
+.endm
+
+.macro KERNEL2x2_I1
+
+
+       lxvw4x          vs4,    o0,     AO              // load a0, a1
+
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs16,   vs24,   0
+       xxspltw         vs17,   vs24,   1
+       xxspltw         vs18,   vs24,   2
+       xxspltw         vs19,   vs24,   3
+
+
+       addi            BO,     BO,     16
+
+
+       xvmulsp         vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmulsp         vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+       xvmulsp         vs34,   vs0,    vs10            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmulsp         vs35,   vs0,    vs11            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x2_1
+
+
+       lxvw4x          vs4,    o0,     AO              // load a0, a1
+
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs16,   vs24,   0
+       xxspltw         vs17,   vs24,   1
+       xxspltw         vs18,   vs24,   2
+       xxspltw         vs19,   vs24,   3
+
+
+       addi            BO,     BO,     16
+
+
+       xvmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+       xvmaddasp       vs34,   vs0,    vs10            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmaddasp       vs35,   vs0,    vs11            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x2_2
+
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+
+       addi            BO,     BO,     16
+
+
+       xvmaddasp       vs32,   vs4,    vs16            // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs4,    vs17            // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+       xvmaddasp       vs34,   vs4,    vs18            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmaddasp       vs35,   vs4,    vs19            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x2_E2
+
+
+       xvmaddasp       vs32,   vs4,    vs16            // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs4,    vs17            // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+       xvmaddasp       vs34,   vs4,    vs18            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmaddasp       vs35,   vs4,    vs19            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x2_SUBI1
+
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+
+       addi            BO,     BO,     16
+
+
+       xvmulsp         vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmulsp         vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+       xvmulsp         vs34,   vs0,    vs10            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmulsp         vs35,   vs0,    vs11            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+
+.endm
+
+.macro KERNEL2x2_SUB1
+
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+
+       addi            BO,     BO,     16
+
+
+       xvmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+       xvmaddasp       vs34,   vs0,    vs10            // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+       xvmaddasp       vs35,   vs0,    vs11            // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+
+.endm
+
+.macro SAVE2x2
+
+       mr              T1,     CO
+
+// N=0
+
+       mr              T2,     T1
+
+// N=0 M=0
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+
+       xxspltw         vs8,    vs32,   0
+       xxspltw         vs9,    vs32,   1
+       xxspltw         vs10,   vs32,   2
+       xxspltw         vs11,   vs32,   3
+
+
+       xxspltw         vs12,   vs33,   0
+       xxspltw         vs13,   vs33,   1
+       xxspltw         vs14,   vs33,   2
+       xxspltw         vs15,   vs33,   3
+
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
+
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
+
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+       add             T1,     T1,     LDC
+
+
+// N=1
+
+       mr              T2,     T1
+
+// N=1 M=0
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+
+       xxspltw         vs8,    vs34,   0
+       xxspltw         vs9,    vs34,   1
+       xxspltw         vs10,   vs34,   2
+       xxspltw         vs11,   vs34,   3
+
+
+       xxspltw         vs12,   vs35,   0
+       xxspltw         vs13,   vs35,   1
+       xxspltw         vs14,   vs35,   2
+       xxspltw         vs15,   vs35,   3
+
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
+
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
+
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     16
+
+.endm
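+
+// The vector SAVE macros all follow the same per-sub-block pattern; roughly
+// (a sketch, with acc(x) the accumulators built by the kernels and the
+// XVFADD_* helpers, defined outside these macros, supplying the signs):
+//
+//   r_r  = acc(a_r*b_r) -/+ acc(a_i*b_i)
+//   r_i  = acc(a_i*b_r) +/- acc(a_r*b_i)
+//   c_r += r_r*alpha_r - r_i*alpha_i
+//   c_i += r_r*alpha_i + r_i*alpha_r
+//
+// The xxsldwi shifts only move {r0_r, r0_i, r1_r, r1_i} back into one
+// vector before the final add to C.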
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=1
+**********************************************************************************************/
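+
+// For M=1 the kernels below work on scalars: lxsspx loads one single float
+// (widened to double in the register), and the four partial products per
+// column (a_r*b_r, a_i*b_i, a_r*b_i, a_i*b_r) are kept in separate
+// accumulators so SAVE2x1 can combine them with either sign.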
+
+.macro LOAD2x1_1
+
+       lxsspx          vs0,    o0,     AO              // load a0_r
+       lxsspx          vs1,    o4,     AO              // load a0_i
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1              //  load b0_r
+       lxsspx          vs9,    o4,     T1              //  load b0_i
+
+       addi            T1,     T1,8
+
+       lxsspx          vs10,   o0,     T1              //  load b1_r
+       lxsspx          vs11,   o4,     T1              //  load b1_i
+
+       addi            BO,     BO,     16
+
+.endm
+
+.macro KERNEL2x1_I1
+
+
+       lxsspx          vs4,    o0,     AO              // load a0_r
+       lxsspx          vs5,    o4,     AO              // load a0_i
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs16,   o0,     T1              //  load b0_r
+       lxsspx          vs17,   o4,     T1              //  load b0_i
+
+       addi            T1,     T1,8
+
+       lxsspx          vs18,   o0,     T1              //  load b1_r
+       lxsspx          vs19,   o4,     T1              //  load b1_i
+
+       addi            BO,     BO,     16
+
+
+       xsmuldp         vs32,   vs0,    vs8             // a0_r*b0_r
+       xsmuldp         vs33,   vs1,    vs9             // a0_i*b0_i
+       xsmuldp         vs34,   vs0,    vs9             // a0_r*b0_i
+       xsmuldp         vs35,   vs1,    vs8             // a0_i*b0_r
+
+       xsmuldp         vs36,   vs0,    vs10            // a0_r*b1_r
+       xsmuldp         vs37,   vs1,    vs11            // a0_i*b1_i
+       xsmuldp         vs38,   vs0,    vs11            // a0_r*b1_i
+       xsmuldp         vs39,   vs1,    vs10            // a0_i*b1_r
+
+
+.endm
+
+.macro KERNEL2x1_1
+
+
+       lxsspx          vs4,    o0,     AO              // load a0_r
+       lxsspx          vs5,    o4,     AO              // load a0_i
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs16,   o0,     T1              //  load b0_r
+       lxsspx          vs17,   o4,     T1              //  load b0_i
+
+       addi            T1,     T1,8
+
+       lxsspx          vs18,   o0,     T1              //  load b1_r
+       lxsspx          vs19,   o4,     T1              //  load b1_i
+
+       addi            BO,     BO,     16
+
+
+       xsmaddadp       vs32,   vs0,    vs8             // a0_r*b0_r
+       xsmaddadp       vs33,   vs1,    vs9             // a0_i*b0_i
+       xsmaddadp       vs34,   vs0,    vs9             // a0_r*b0_i
+       xsmaddadp       vs35,   vs1,    vs8             // a0_i*b0_r
+
+       xsmaddadp       vs36,   vs0,    vs10            // a0_r*b1_r
+       xsmaddadp       vs37,   vs1,    vs11            // a0_i*b1_i
+       xsmaddadp       vs38,   vs0,    vs11            // a0_r*b1_i
+       xsmaddadp       vs39,   vs1,    vs10            // a0_i*b1_r
+
+
+.endm
+
+.macro KERNEL2x1_2
+
+
+       lxsspx          vs0,    o0,     AO              // load a0_r
+       lxsspx          vs1,    o4,     AO              // load a0_i
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1              //  load b0_r
+       lxsspx          vs9,    o4,     T1              //  load b0_i
+
+       addi            T1,     T1,8
+
+       lxsspx          vs10,   o0,     T1              //  load b1_r
+       lxsspx          vs11,   o4,     T1              //  load b1_i
+
+       addi            BO,     BO,     16
+
+
+       xsmaddadp       vs32,   vs4,    vs16            // a0_r*b0_r
+       xsmaddadp       vs33,   vs5,    vs17            // a0_i*b0_i
+       xsmaddadp       vs34,   vs4,    vs17            // a0_r*b0_i
+       xsmaddadp       vs35,   vs5,    vs16            // a0_i*b0_r
+
+       xsmaddadp       vs36,   vs4,    vs18            // a0_r*b1_r
+       xsmaddadp       vs37,   vs5,    vs19            // a0_i*b1_i
+       xsmaddadp       vs38,   vs4,    vs19            // a0_r*b1_i
+       xsmaddadp       vs39,   vs5,    vs18            // a0_i*b1_r
+
+
+.endm
+
+.macro KERNEL2x1_E2
+
+
+       xsmaddadp       vs32,   vs4,    vs16            // a0_r*b0_r
+       xsmaddadp       vs33,   vs5,    vs17            // a0_i*b0_i
+       xsmaddadp       vs34,   vs4,    vs17            // a0_r*b0_i
+       xsmaddadp       vs35,   vs5,    vs16            // a0_i*b0_r
+
+       xsmaddadp       vs36,   vs4,    vs18            // a0_r*b1_r
+       xsmaddadp       vs37,   vs5,    vs19            // a0_i*b1_i
+       xsmaddadp       vs38,   vs4,    vs19            // a0_r*b1_i
+       xsmaddadp       vs39,   vs5,    vs18            // a0_i*b1_r
+
+
+.endm
+
+.macro KERNEL2x1_SUBI1
+
+
+       lxsspx          vs0,    o0,     AO              // load a0_r
+       lxsspx          vs1,    o4,     AO              // load a0_i
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1              //  load b0_r
+       lxsspx          vs9,    o4,     T1              //  load b0_i
+
+       addi            T1,     T1,8
+
+       lxsspx          vs10,   o0,     T1              //  load b1_r
+       lxsspx          vs11,   o4,     T1              //  load b1_i
+
+       addi            BO,     BO,     16
+
+
+       xsmuldp         vs32,   vs0,    vs8             // a0_r*b0_r
+       xsmuldp         vs33,   vs1,    vs9             // a0_i*b0_i
+       xsmuldp         vs34,   vs0,    vs9             // a0_r*b0_i
+       xsmuldp         vs35,   vs1,    vs8             // a0_i*b0_r
+
+       xsmuldp         vs36,   vs0,    vs10            // a0_r*b1_r
+       xsmuldp         vs37,   vs1,    vs11            // a0_i*b1_i
+       xsmuldp         vs38,   vs0,    vs11            // a0_r*b1_i
+       xsmuldp         vs39,   vs1,    vs10            // a0_i*b1_r
+
+
+.endm
+
+.macro KERNEL2x1_SUB1
+
+
+       lxsspx          vs0,    o0,     AO              // load a0_r
+       lxsspx          vs1,    o4,     AO              // load a0_i
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1              //  load b0_r
+       lxsspx          vs9,    o4,     T1              //  load b0_i
+
+       addi            T1,     T1,8
+
+       lxsspx          vs10,   o0,     T1              //  load b1_r
+       lxsspx          vs11,   o4,     T1              //  load b1_i
+
+       addi            BO,     BO,     16
+
+
+       xsmaddadp       vs32,   vs0,    vs8             // a0_r*b0_r
+       xsmaddadp       vs33,   vs1,    vs9             // a0_i*b0_i
+       xsmaddadp       vs34,   vs0,    vs9             // a0_r*b0_i
+       xsmaddadp       vs35,   vs1,    vs8             // a0_i*b0_r
+
+       xsmaddadp       vs36,   vs0,    vs10            // a0_r*b1_r
+       xsmaddadp       vs37,   vs1,    vs11            // a0_i*b1_i
+       xsmaddadp       vs38,   vs0,    vs11            // a0_r*b1_i
+       xsmaddadp       vs39,   vs1,    vs10            // a0_i*b1_r
+
+
+.endm
+
+.macro SAVE2x1
+
+       mr              T1,     CO
+
+// N=0
+
+       mr              T2,     T1
+
+// N=0 M=0
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+
+#ifndef TRMMKERNEL
+       lxsspx          vs0,    o0,     T2      // load c0_r
+       lxsspx          vs1,    o4,     T2      // load c0_i
+#else
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+#endif
+
+       XSFADD_R1       vs4,    vs4,    vs32            // add a0_r * b0_r
+       XSFADD_I1       vs5,    vs5,    vs35            // add a0_r * b0_i
+
+       XSFADD_R2       vs4,    vs4,    vs33            // add a0_i * b0_i
+       XSFADD_I2       vs5,    vs5,    vs34            // add a0_i * b0_r
+
+       xsmuldp         vs16,   vs4,    alpha_dr                // r0_r * alpha_r
+       xsmuldp         vs17,   vs5,    alpha_di                // r0_i * alpha_i
+       xsmuldp         vs18,   vs4,    alpha_di                // r0_r * alpha_i
+       xsmuldp         vs19,   vs5,    alpha_dr                // r0_i * alpha_r
+
+       xssubdp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsadddp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsadddp         vs0,    vs0,    vs20
+       xsadddp         vs1,    vs1,    vs21
+
+
+       stxsspx         vs0,    o0,     T2      // store c0_r
+       stxsspx         vs1,    o4,     T2      // store c0_i
+
+       addi            T2,     T2,     8
+       add             T1,     T1,     LDC
+
+
+// N=1
+
+       mr              T2,     T1
+
+// N=1 M=0
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+
+#ifndef TRMMKERNEL
+       lxsspx          vs0,    o0,     T2      // load c0_r
+       lxsspx          vs1,    o4,     T2      // load c0_i
+#else
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+#endif
+
+       XSFADD_R1       vs4,    vs4,    vs36            // add a0_r * b0_r
+       XSFADD_I1       vs5,    vs5,    vs39            // add a0_r * b0_i
+
+       XSFADD_R2       vs4,    vs4,    vs37            // add a0_i * b0_i
+       XSFADD_I2       vs5,    vs5,    vs38            // add a0_i * b0_r
+
+       xsmuldp         vs16,   vs4,    alpha_dr                // r0_r * alpha_r
+       xsmuldp         vs17,   vs5,    alpha_di                // r0_i * alpha_i
+       xsmuldp         vs18,   vs4,    alpha_di                // r0_r * alpha_i
+       xsmuldp         vs19,   vs5,    alpha_dr                // r0_i * alpha_r
+
+       xssubdp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsadddp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsadddp         vs0,    vs0,    vs20
+       xsadddp         vs1,    vs1,    vs21
+
+
+       stxsspx         vs0,    o0,     T2      // store c0_r
+       stxsspx         vs1,    o4,     T2      // store c0_i
+
+       addi            T2,     T2,     8
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     8
+
+.endm
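+
+// SAVE2x1 in plain form, per column (a sketch; XSFADD_* supply the signs):
+//
+//   r_r  = a_r*b_r -/+ a_i*b_i
+//   r_i  = a_i*b_r +/- a_r*b_i
+//   c_r += r_r*alpha_r - r_i*alpha_i
+//   c_i += r_r*alpha_i + r_i*alpha_r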
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=8
+**********************************************************************************************/
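+
+// For N=1 only one column of B is consumed: each step still reads 16 bytes
+// at BO (so b0_r, b0_i sit in the first two words of vs24), but only the b0
+// splats are used and BO advances by just 8 bytes.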
+
+.macro LOAD1x8_1
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs1,    o16,    AO              // load a2, a3
+
+       lxvw4x          vs2,    o32,    AO              // load a4, a5
+
+       lxvw4x          vs3,    o48,    AO              // load a6, a7
+
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+
+       addi            BO,     BO,     8
+
+.endm
+
+.macro KERNEL1x8_I1
+
+
+       lxvw4x          vs4,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs5,    o16,    AO              // load a2, a3
+
+       lxvw4x          vs6,    o32,    AO              // load a4, a5
+
+       lxvw4x          vs7,    o48,    AO              // load a6, a7
+
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+
+
+       xxspltw         vs16,   vs24,   0
+       xxspltw         vs17,   vs24,   1
+       xxspltw         vs18,   vs24,   2
+       xxspltw         vs19,   vs24,   3
+
+
+       addi            BO,     BO,     8
+
+
+       xvmulsp         vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmulsp         vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmulsp         vs34,   vs1,    vs8             // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmulsp         vs35,   vs1,    vs9             // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+       xvmulsp         vs36,   vs2,    vs8             // a4_r*b0_r, a4_i*b0_r, a5_r*b0_r, a5_i*b0_r
+       xvmulsp         vs37,   vs2,    vs9             // a4_r*b0_i, a4_i*b0_i, a5_r*b0_i, a5_i*b0_i
+       xvmulsp         vs38,   vs3,    vs8             // a6_r*b0_r, a6_i*b0_r, a7_r*b0_r, a7_i*b0_r
+       xvmulsp         vs39,   vs3,    vs9             // a6_r*b0_i, a6_i*b0_i, a7_r*b0_i, a7_i*b0_i
+
+
+.endm
+
+.macro KERNEL1x8_1
+
+
+       lxvw4x          vs4,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs5,    o16,    AO              // load a2, a3
+
+       lxvw4x          vs6,    o32,    AO              // load a4, a5
+
+       lxvw4x          vs7,    o48,    AO              // load a6, a7
+
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+
+
+       xxspltw         vs16,   vs24,   0
+       xxspltw         vs17,   vs24,   1
+       xxspltw         vs18,   vs24,   2
+       xxspltw         vs19,   vs24,   3
+
+
+       addi            BO,     BO,     8
+
+
+       xvmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmaddasp       vs34,   vs1,    vs8             // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmaddasp       vs35,   vs1,    vs9             // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+       xvmaddasp       vs36,   vs2,    vs8             // a4_r*b0_r, a4_i*b0_r, a5_r*b0_r, a5_i*b0_r
+       xvmaddasp       vs37,   vs2,    vs9             // a4_r*b0_i, a4_i*b0_i, a5_r*b0_i, a5_i*b0_i
+       xvmaddasp       vs38,   vs3,    vs8             // a6_r*b0_r, a6_i*b0_r, a7_r*b0_r, a7_i*b0_r
+       xvmaddasp       vs39,   vs3,    vs9             // a6_r*b0_i, a6_i*b0_i, a7_r*b0_i, a7_i*b0_i
+
+
+.endm
+
+.macro KERNEL1x8_2
+
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs1,    o16,    AO              // load a2, a3
+
+       lxvw4x          vs2,    o32,    AO              // load a4, a5
+
+       lxvw4x          vs3,    o48,    AO              // load a6, a7
+
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+
+       addi            BO,     BO,     8
+
+
+       xvmaddasp       vs32,   vs4,    vs16            // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs4,    vs17            // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmaddasp       vs34,   vs5,    vs16            // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmaddasp       vs35,   vs5,    vs17            // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+       xvmaddasp       vs36,   vs6,    vs16            // a4_r*b0_r, a4_i*b0_r, a5_r*b0_r, a5_i*b0_r
+       xvmaddasp       vs37,   vs6,    vs17            // a4_r*b0_i, a4_i*b0_i, a5_r*b0_i, a5_i*b0_i
+       xvmaddasp       vs38,   vs7,    vs16            // a6_r*b0_r, a6_i*b0_r, a7_r*b0_r, a7_i*b0_r
+       xvmaddasp       vs39,   vs7,    vs17            // a6_r*b0_i, a6_i*b0_i, a7_r*b0_i, a7_i*b0_i
+
+
+.endm
+
+.macro KERNEL1x8_E2
+
+
+       xvmaddasp       vs32,   vs4,    vs16            // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs4,    vs17            // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmaddasp       vs34,   vs5,    vs16            // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmaddasp       vs35,   vs5,    vs17            // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+       xvmaddasp       vs36,   vs6,    vs16            // a4_r*b0_r, a4_i*b0_r, a5_r*b0_r, a5_i*b0_r
+       xvmaddasp       vs37,   vs6,    vs17            // a4_r*b0_i, a4_i*b0_i, a5_r*b0_i, a5_i*b0_i
+       xvmaddasp       vs38,   vs7,    vs16            // a6_r*b0_r, a6_i*b0_r, a7_r*b0_r, a7_i*b0_r
+       xvmaddasp       vs39,   vs7,    vs17            // a6_r*b0_i, a6_i*b0_i, a7_r*b0_i, a7_i*b0_i
+
+
+.endm
+
+.macro KERNEL1x8_SUBI1
+
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs1,    o16,    AO              // load a2, a3
+
+       lxvw4x          vs2,    o32,    AO              // load a4, a5
+
+       lxvw4x          vs3,    o48,    AO              // load a6, a7
+
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+
+       addi            BO,     BO,     8
+
+
+       xvmulsp         vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmulsp         vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmulsp         vs34,   vs1,    vs8             // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmulsp         vs35,   vs1,    vs9             // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+       xvmulsp         vs36,   vs2,    vs8             // a4_r*b0_r, a4_i*b0_r, a5_r*b0_r, a5_i*b0_r
+       xvmulsp         vs37,   vs2,    vs9             // a4_r*b0_i, a4_i*b0_i, a5_r*b0_i, a5_i*b0_i
+       xvmulsp         vs38,   vs3,    vs8             // a6_r*b0_r, a6_i*b0_r, a7_r*b0_r, a7_i*b0_r
+       xvmulsp         vs39,   vs3,    vs9             // a6_r*b0_i, a6_i*b0_i, a7_r*b0_i, a7_i*b0_i
+
+
+.endm
+
+.macro KERNEL1x8_SUB1
+
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs1,    o16,    AO              // load a2, a3
+
+       lxvw4x          vs2,    o32,    AO              // load a4, a5
+
+       lxvw4x          vs3,    o48,    AO              // load a6, a7
+
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+
+       addi            BO,     BO,     8
+
+
+       xvmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmaddasp       vs34,   vs1,    vs8             // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmaddasp       vs35,   vs1,    vs9             // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+       xvmaddasp       vs36,   vs2,    vs8             // a4_r*b0_r, a4_i*b0_r, a5_r*b0_r, a5_i*b0_r
+       xvmaddasp       vs37,   vs2,    vs9             // a4_r*b0_i, a4_i*b0_i, a5_r*b0_i, a5_i*b0_i
+       xvmaddasp       vs38,   vs3,    vs8             // a6_r*b0_r, a6_i*b0_r, a7_r*b0_r, a7_i*b0_r
+       xvmaddasp       vs39,   vs3,    vs9             // a6_r*b0_i, a6_i*b0_i, a7_r*b0_i, a7_i*b0_i
+
+
+.endm
+
+.macro SAVE1x8
+
+       mr              T1,     CO
+
+// N=0
+
+       mr              T2,     T1
+
+// N=0 M=0
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+
+       xxspltw         vs8,    vs32,   0
+       xxspltw         vs9,    vs32,   1
+       xxspltw         vs10,   vs32,   2
+       xxspltw         vs11,   vs32,   3
+
+
+       xxspltw         vs12,   vs33,   0
+       xxspltw         vs13,   vs33,   1
+       xxspltw         vs14,   vs33,   2
+       xxspltw         vs15,   vs33,   3
+
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
+
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
+
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+
+// N=0 M=2
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+
+       xxspltw         vs8,    vs34,   0
+       xxspltw         vs9,    vs34,   1
+       xxspltw         vs10,   vs34,   2
+       xxspltw         vs11,   vs34,   3
+
+
+       xxspltw         vs12,   vs35,   0
+       xxspltw         vs13,   vs35,   1
+       xxspltw         vs14,   vs35,   2
+       xxspltw         vs15,   vs35,   3
+
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
+
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
+
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+
+// N=0 M=4
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+
+       xxspltw         vs8,    vs36,   0
+       xxspltw         vs9,    vs36,   1
+       xxspltw         vs10,   vs36,   2
+       xxspltw         vs11,   vs36,   3
+
+
+       xxspltw         vs12,   vs37,   0
+       xxspltw         vs13,   vs37,   1
+       xxspltw         vs14,   vs37,   2
+       xxspltw         vs15,   vs37,   3
+
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
+
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
+
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+
+// N=0 M=6
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+
+       xxspltw         vs8,    vs38,   0
+       xxspltw         vs9,    vs38,   1
+       xxspltw         vs10,   vs38,   2
+       xxspltw         vs11,   vs38,   3
+
+
+       xxspltw         vs12,   vs39,   0
+       xxspltw         vs13,   vs39,   1
+       xxspltw         vs14,   vs39,   2
+       xxspltw         vs15,   vs39,   3
+
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
+
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
+
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     64
+
+.endm
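+
+// SAVE1x8 walks the single output row in four 16-byte sub-blocks
+// (vs32/33, vs34/35, vs36/37, vs38/39) and then advances CO by 64 bytes,
+// i.e. eight complex singles.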
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=4
+**********************************************************************************************/
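+
+// Same scheme as the 1x8 macros above, with two A vectors (a0..a3) per step.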
+
+.macro LOAD1x4_1
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs1,    o16,    AO              // load a2, a3
+
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+
+       addi            BO,     BO,     8
+
+.endm
+
+.macro KERNEL1x4_I1
+
+
+       lxvw4x          vs4,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs5,    o16,    AO              // load a2, a3
+
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+
+
+       xxspltw         vs16,   vs24,   0
+       xxspltw         vs17,   vs24,   1
+       xxspltw         vs18,   vs24,   2
+       xxspltw         vs19,   vs24,   3
+
+
+       addi            BO,     BO,     8
+
+
+       xvmulsp         vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmulsp         vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmulsp         vs34,   vs1,    vs8             // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmulsp         vs35,   vs1,    vs9             // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+
+
+.endm
+
+.macro KERNEL1x4_1
+
+
+       lxvw4x          vs4,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs5,    o16,    AO              // load a2, a3
+
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+
+
+       xxspltw         vs16,   vs24,   0
+       xxspltw         vs17,   vs24,   1
+       xxspltw         vs18,   vs24,   2
+       xxspltw         vs19,   vs24,   3
+
+
+       addi            BO,     BO,     8
+
+
+       xvmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmaddasp       vs34,   vs1,    vs8             // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmaddasp       vs35,   vs1,    vs9             // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+
+
+.endm
+
+.macro KERNEL1x4_2
+
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs1,    o16,    AO              // load a2, a3
+
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+
+       addi            BO,     BO,     8
+
+
+       xvmaddasp       vs32,   vs4,    vs16            // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs4,    vs17            // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmaddasp       vs34,   vs5,    vs16            // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmaddasp       vs35,   vs5,    vs17            // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+
+
+.endm
+
+.macro KERNEL1x4_E2
+
+
+       xvmaddasp       vs32,   vs4,    vs16            // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs4,    vs17            // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmaddasp       vs34,   vs5,    vs16            // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmaddasp       vs35,   vs5,    vs17            // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+
+
+.endm
+
+.macro KERNEL1x4_SUBI1
+
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs1,    o16,    AO              // load a2, a3
+
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+
+       addi            BO,     BO,     8
+
+
+       xvmulsp         vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmulsp         vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmulsp         vs34,   vs1,    vs8             // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmulsp         vs35,   vs1,    vs9             // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+
+
+.endm
+
+.macro KERNEL1x4_SUB1
+
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+       lxvw4x          vs1,    o16,    AO              // load a2, a3
+
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+
+       addi            BO,     BO,     8
+
+
+       xvmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmaddasp       vs34,   vs1,    vs8             // a2_r*b0_r, a2_i*b0_r, a3_r*b0_r, a3_i*b0_r
+       xvmaddasp       vs35,   vs1,    vs9             // a2_r*b0_i, a2_i*b0_i, a3_r*b0_i, a3_i*b0_i
+
+
+.endm
+
+.macro SAVE1x4
+
+       mr              T1,     CO
+
+// N=0
+
+       mr              T2,     T1
+
+// N=0 M=0
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+
+       xxspltw         vs8,    vs32,   0
+       xxspltw         vs9,    vs32,   1
+       xxspltw         vs10,   vs32,   2
+       xxspltw         vs11,   vs32,   3
+
+
+       xxspltw         vs12,   vs33,   0
+       xxspltw         vs13,   vs33,   1
+       xxspltw         vs14,   vs33,   2
+       xxspltw         vs15,   vs33,   3
+
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
+
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
+
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+
+// N=0 M=2
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+
+       xxspltw         vs8,    vs34,   0
+       xxspltw         vs9,    vs34,   1
+       xxspltw         vs10,   vs34,   2
+       xxspltw         vs11,   vs34,   3
+
+
+       xxspltw         vs12,   vs35,   0
+       xxspltw         vs13,   vs35,   1
+       xxspltw         vs14,   vs35,   2
+       xxspltw         vs15,   vs35,   3
+
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
+
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
+
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     32
+
+.endm
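+
+// Note on the SAVE sequence above: each accumulated element is multiplied by the
+// complex alpha,
+//   res_r = r_r*alpha_r - r_i*alpha_i
+//   res_i = r_r*alpha_i + r_i*alpha_r
+// then xxsldwi places res_r/res_i back into interleaved (r,i) lanes so a single
+// xvaddsp updates the C tile (loaded for GEMM, zero-initialized under TRMMKERNEL).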
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=2
+**********************************************************************************************/
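+
+// Register roles in the 1x2 micro-kernel: vs0/vs4 hold the two complex A elements
+// of the current/next K step, vs8-vs11 and vs16-vs19 hold the word-splatted B
+// element, and vs32/vs33 accumulate the a*b0_r and a*b0_i lanes respectively.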
+
+.macro LOAD1x2_1
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+
+       addi            BO,     BO,     8
+
+.endm
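+
+// Note: lxvw4x fetches 16 bytes of B, but only the first complex element is used
+// here (the FMAs below consume the vs8/vs9 splats; vs10/vs11 appear unused), so
+// BO only advances by 8 bytes per K step.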
+
+.macro KERNEL1x2_I1
+
+
+       lxvw4x          vs4,    o0,     AO              // load a0, a1
+
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+
+
+       xxspltw         vs16,   vs24,   0
+       xxspltw         vs17,   vs24,   1
+       xxspltw         vs18,   vs24,   2
+       xxspltw         vs19,   vs24,   3
+
+
+       addi            BO,     BO,     8
+
+
+       xvmulsp         vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmulsp         vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+
+.endm
+
+.macro KERNEL1x2_1
+
+
+       lxvw4x          vs4,    o0,     AO              // load a0, a1
+
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+
+
+       xxspltw         vs16,   vs24,   0
+       xxspltw         vs17,   vs24,   1
+       xxspltw         vs18,   vs24,   2
+       xxspltw         vs19,   vs24,   3
+
+
+       addi            BO,     BO,     8
+
+
+       xvmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+
+.endm
+
+.macro KERNEL1x2_2
+
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+
+       addi            BO,     BO,     8
+
+
+	xvmaddasp	vs32,	vs4,	vs16		// a4_r*b0_r, a4_i*b0_r, a5_r*b0_r, a5_i*b0_r
+	xvmaddasp	vs33,	vs4,	vs17		// a4_r*b0_i, a4_i*b0_i, a5_r*b0_i, a5_i*b0_i
+
+
+.endm
+
+.macro KERNEL1x2_E2
+
+
+	xvmaddasp	vs32,	vs4,	vs16		// a4_r*b0_r, a4_i*b0_r, a5_r*b0_r, a5_i*b0_r
+	xvmaddasp	vs33,	vs4,	vs17		// a4_r*b0_i, a4_i*b0_i, a5_r*b0_i, a5_i*b0_i
+
+
+.endm
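+
+// Pipeline structure: KERNEL1x2_I1 starts the accumulators with xvmulsp while
+// prefetching the second register set; KERNEL1x2_1 and KERNEL1x2_2 alternate
+// between the two sets so the next loads overlap the current FMAs; KERNEL1x2_E2
+// issues the trailing FMAs without further loads.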
+
+.macro KERNEL1x2_SUBI1
+
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+
+       addi            BO,     BO,     8
+
+
+       xvmulsp         vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmulsp         vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+
+.endm
+
+.macro KERNEL1x2_SUB1
+
+
+       lxvw4x          vs0,    o0,     AO              // load a0, a1
+
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
+
+
+       addi            BO,     BO,     8
+
+
+       xvmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+
+.endm
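+
+// K-remainder handling: KERNEL1x2_SUBI1 initializes the accumulators with xvmulsp
+// (used when no unrolled iterations ran), while KERNEL1x2_SUB1 accumulates with
+// xvmaddasp; both consume a single K step.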
+
+.macro SAVE1x2
+
+       mr              T1,     CO
+
+// N=0
+
+       mr              T2,     T1
+
+// N=0 M=0
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+       xxlxor          vs6,    vs6,    vs6
+       xxlxor          vs7,    vs7,    vs7
+
+#ifndef TRMMKERNEL
+       lxvw4x          vs0,    o0,     T2      // c0, c1
+#else
+       xxlxor          vs0,    vs0,    vs0
+#endif
+
+
+       xxspltw         vs8,    vs32,   0
+       xxspltw         vs9,    vs32,   1
+       xxspltw         vs10,   vs32,   2
+       xxspltw         vs11,   vs32,   3
+
+
+       xxspltw         vs12,   vs33,   0
+       xxspltw         vs13,   vs33,   1
+       xxspltw         vs14,   vs33,   2
+       xxspltw         vs15,   vs33,   3
+
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
+
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
+
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
+       xvaddsp         vs0,    vs0,    vs1
+
+
+       stxvw4x         vs0,    o0,     T2      // c0, c1
+
+       addi            T2,     T2,     16
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     16
+
+.endm
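+
+// SAVE1x2 splats the accumulated lanes, combines the partial products with the
+// XVFADD_* helpers, applies the complex alpha as above and adds the result to the
+// 16-byte C tile (or to zero under TRMMKERNEL) before storing; CO advances by two
+// complex single-precision elements.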
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=1
+**********************************************************************************************/
+
+.macro LOAD1x1_1
+
+       lxsspx          vs0,    o0,     AO              // load a0_r
+       lxsspx          vs1,    o4,     AO              // load a0_i
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1              //  load b0_r
+       lxsspx          vs9,    o4,     T1              //  load b0_i
+
+       addi            BO,     BO,     8
+
+.endm
+
+.macro KERNEL1x1_I1
+
+
+       lxsspx          vs4,    o0,     AO              // load a0_r
+       lxsspx          vs5,    o4,     AO              // load a0_i
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs16,   o0,     T1              //  load b0_r
+       lxsspx          vs17,   o4,     T1              //  load b0_i
+
+       addi            BO,     BO,     8
+
+
+       xsmuldp         vs32,   vs0,    vs8             // a0_r*b0_r
+       xsmuldp         vs33,   vs1,    vs9             // a0_i*b0_i
+       xsmuldp         vs34,   vs0,    vs9             // a0_r*b0_i
+       xsmuldp         vs35,   vs1,    vs8             // a0_i*b0_r
+
+
+.endm
+
+.macro KERNEL1x1_1
+
+
+       lxsspx          vs4,    o0,     AO              // load a0_r
+       lxsspx          vs5,    o4,     AO              // load a0_i
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs16,   o0,     T1              //  load b0_r
+       lxsspx          vs17,   o4,     T1              //  load b0_i
+
+       addi            BO,     BO,     8
+
+
+       xsmaddadp       vs32,   vs0,    vs8             // a0_r*b0_r
+       xsmaddadp       vs33,   vs1,    vs9             // a0_i*b0_i
+       xsmaddadp       vs34,   vs0,    vs9             // a0_r*b0_i
+       xsmaddadp       vs35,   vs1,    vs8             // a0_i*b0_r
+
+
+.endm
+
+.macro KERNEL1x1_2
+
+
+       lxsspx          vs0,    o0,     AO              // load a0_r
+       lxsspx          vs1,    o4,     AO              // load a0_i
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1              //  load b0_r
+       lxsspx          vs9,    o4,     T1              //  load b0_i
+
+       addi            BO,     BO,     8
+
+
+       xsmaddadp       vs32,   vs4,    vs16            // a4_r*b0_r
+       xsmaddadp       vs33,   vs5,    vs17            // a4_i*b0_i
+       xsmaddadp       vs34,   vs4,    vs17            // a4_r*b0_i
+       xsmaddadp       vs35,   vs5,    vs16            // a4_i*b0_r
+
+
+.endm
+
+.macro KERNEL1x1_E2
+
+
+       xsmaddadp       vs32,   vs4,    vs16            // a4_r*b0_r
+       xsmaddadp       vs33,   vs5,    vs17            // a4_i*b0_i
+       xsmaddadp       vs34,   vs4,    vs17            // a4_r*b0_i
+       xsmaddadp       vs35,   vs5,    vs16            // a4_i*b0_r
+
+
+.endm
+
+.macro KERNEL1x1_SUBI1
+
+
+       lxsspx          vs0,    o0,     AO              // load a0_r
+       lxsspx          vs1,    o4,     AO              // load a0_i
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1              //  load b0_r
+       lxsspx          vs9,    o4,     T1              //  load b0_i
+
+       addi            BO,     BO,     8
+
+
+       xsmuldp         vs32,   vs0,    vs8             // a0_r*b0_r
+       xsmuldp         vs33,   vs1,    vs9             // a0_i*b0_i
+       xsmuldp         vs34,   vs0,    vs9             // a0_r*b0_i
+       xsmuldp         vs35,   vs1,    vs8             // a0_i*b0_r
+
+
+.endm
+
+.macro KERNEL1x1_SUB1
+
+
+       lxsspx          vs0,    o0,     AO              // load a0_r
+       lxsspx          vs1,    o4,     AO              // load a0_i
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1              //  load b0_r
+       lxsspx          vs9,    o4,     T1              //  load b0_i
+
+       addi            BO,     BO,     8
+
+
+       xsmaddadp       vs32,   vs0,    vs8             // a0_r*b0_r
+       xsmaddadp       vs33,   vs1,    vs9             // a0_i*b0_i
+       xsmaddadp       vs34,   vs0,    vs9             // a0_r*b0_i
+       xsmaddadp       vs35,   vs1,    vs8             // a0_i*b0_r
+
+
+.endm
+
+.macro SAVE1x1
+
+       mr              T1,     CO
+
+// N=0
+
+       mr              T2,     T1
+
+// N=0 M=0
+
+       xxlxor          vs4,    vs4,    vs4
+       xxlxor          vs5,    vs5,    vs5
+
+#ifndef TRMMKERNEL
+       lxsspx          vs0,    o0,     T2      // load c0_r
+       lxsspx          vs1,    o4,     T2      // load c0_i
+#else
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+#endif
+
+	XSFADD_R1	vs4,	vs4,	vs32		// add a0_r * b0_r
+	XSFADD_I1	vs5,	vs5,	vs35		// add a0_i * b0_r
+
+	XSFADD_R2	vs4,	vs4,	vs33		// add a0_i * b0_i
+	XSFADD_I2	vs5,	vs5,	vs34		// add a0_r * b0_i
+
+       xsmuldp         vs16,   vs4,    alpha_dr                // r0_r * alpha_r
+       xsmuldp         vs17,   vs5,    alpha_di                // r0_i * alpha_i
+       xsmuldp         vs18,   vs4,    alpha_di                // r0_r * alpha_i
+       xsmuldp         vs19,   vs5,    alpha_dr                // r0_i * alpha_r
+
+       xssubdp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsadddp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+
+       xsadddp         vs0,    vs0,    vs20
+       xsadddp         vs1,    vs1,    vs21
+
+
+       stxsspx         vs0,    o0,     T2      // store c0_r
+       stxsspx         vs1,    o4,     T2      // store c0_i
+
+       addi            T2,     T2,     8
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     8
+
+.endm
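+
+// SAVE1x1 uses the scalar xs*dp operations and the alpha_dr/alpha_di registers
+// (presumably scalar copies of alpha prepared in the kernel prologue, alongside
+// alpha_sr/alpha_si); the real and imaginary results are written back with
+// separate stxsspx stores.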
+
diff --git a/param.h b/param.h
index fb344cd..d01c992 100644 (file)
--- a/param.h
+++ b/param.h
@@ -1979,7 +1979,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #define SGEMM_DEFAULT_P  960
 #define DGEMM_DEFAULT_P  480
-#define CGEMM_DEFAULT_P  480
+#define CGEMM_DEFAULT_P  720
 #define ZGEMM_DEFAULT_P  240
 
 #define SGEMM_DEFAULT_Q  720