updated zgemm and ztrmm kernels for POWER8
author    Werner Saar <wernsaar@googlemail.com>
          Fri, 8 Apr 2016 07:05:37 +0000 (09:05 +0200)
committer Werner Saar <wernsaar@googlemail.com>
          Fri, 8 Apr 2016 07:05:37 +0000 (09:05 +0200)
kernel/power/zgemm_kernel_8x2_power8.S
kernel/power/zgemm_logic_8x2_power8.S
kernel/power/zgemm_macros_8x2_power8.S
kernel/power/ztrmm_kernel_8x2_power8.S
kernel/power/ztrmm_macros_8x2_power8.S [new file with mode: 0644]
param.h

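The substance of the change, as the hunks below show, is that the B panel is now staged into a page-aligned scratch buffer (BBUFFER) carved out of a much larger stack frame: the new ZGEMM_L2_COPYB/ZGEMM_L1_COPYB loops duplicate every real and every imaginary scalar of B into a full 16-byte vector, so the compute macros can replace the per-iteration lxvdsx splat loads with plain lxvd2x loads. A minimal C sketch of that staging step, for orientation only (the function and argument names are placeholders, not part of the kernel):

    /* Sketch of what ZGEMM_Lx_COPYB does: each double of B (b_r, b_i, ...)
     * is written twice, so a later 16-byte vector load already yields the
     * splatted pair {b, b} without needing lxvdsx in the inner loop. */
    static void stage_b_panel(const double *B, double *bbuffer,
                              long k, long ncols)
    {
        long n = k * ncols;                  /* complex B elements in the panel */
        for (long i = 0; i < n; i++) {
            double re = B[2 * i];            /* b0_r (lxvdsx vs4)            */
            double im = B[2 * i + 1];        /* b0_i (lxvdsx vs5)            */
            bbuffer[4 * i + 0] = re;         /* stxvd2x vs4 -> { b_r, b_r }  */
            bbuffer[4 * i + 1] = re;
            bbuffer[4 * i + 2] = im;         /* stxvd2x vs5 -> { b_i, b_i }  */
            bbuffer[4 * i + 3] = im;
        }
    }

This duplication is also why the B offsets in the macros double (o0/o16/o32/o48 instead of o0/o8/o16/o24) and why BO now advances by 64 bytes per step instead of 32.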
diff --git a/kernel/power/zgemm_kernel_8x2_power8.S b/kernel/power/zgemm_kernel_8x2_power8.S
index a7665f7..336b13b 100644
@@ -1,38 +1,3 @@
-/***************************************************************************
-Copyright (c) 2013-2016, The OpenBLAS Project
-All rights reserved.
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-2. Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in
-the documentation and/or other materials provided with the
-distribution.
-3. Neither the name of the OpenBLAS project nor the names of
-its contributors may be used to endorse or promote products
-derived from this software without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
-USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*****************************************************************************/
-
-/**************************************************************************************
-* 2016/03/05 Werner Saar (wernsaar@googlemail.com)
-*       BLASTEST               : OK
-*       CTEST                  : OK
-*       TEST                   : OK
-*       LAPACK-TEST            : OK
-**************************************************************************************/
-
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
 /* All rights reserved.                                              */
@@ -82,7 +47,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 #ifdef __64BIT__
-#define STACKSIZE 320
+#define STACKSIZE 32000
 #define ALPHA_R_SP 296(SP)
 #define ALPHA_I_SP 304(SP)
 #define FZERO  312(SP)
@@ -133,11 +98,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define alpha_r vs30
 #define alpha_i vs31
 
+
+#define FRAMEPOINTER r12
+
+#define BBUFFER r14
+
 #define L      r15
 #define ALPHA  r16
 #define o24    r17
 #define T2     r19
-#define KK     r20
+#define BBO    r20
 #define        o8      r21
 #define        I       r22
 #define J      r23
@@ -156,8 +126,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        PROLOGUE
        PROFCODE
 
-       addi    SP, SP, -STACKSIZE
-       li      r0, 0
+       mr      FRAMEPOINTER, SP
+        addi    SP, SP, -STACKSIZE
+        addi    SP, SP, -STACKSIZE
+        addi    SP, SP, -STACKSIZE
+        addi    SP, SP, -STACKSIZE
+        li      r0, 0
 
        stfd    f14,    0(SP)
        stfd    f15,    8(SP)
@@ -200,6 +174,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        std     r17,  256(SP)
        std     r16,  264(SP)
        std     r15,  272(SP)
+       std     r14,  280(SP)
 #else
        stw     r31,  144(SP)
        stw     r30,  148(SP)
@@ -226,37 +201,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #ifdef linux
 #ifdef __64BIT__
-       ld      LDC, FRAMESLOT(0) + STACKSIZE(SP)
+       ld      LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
 #endif
 #endif
 
 #if defined(_AIX) || defined(__APPLE__)
 #ifdef __64BIT__
-       ld      LDC, FRAMESLOT(0) + STACKSIZE(SP)
+       ld      LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
 #else
 #ifdef DOUBLE
-       lwz     B,   FRAMESLOT(0) + STACKSIZE(SP)
-       lwz     C,   FRAMESLOT(1) + STACKSIZE(SP)
-       lwz     LDC, FRAMESLOT(2) + STACKSIZE(SP)
+       lwz     B,   FRAMESLOT(0) + 0(FRAMEPOINTER)
+       lwz     C,   FRAMESLOT(1) + 0(FRAMEPOINTER)
+       lwz     LDC, FRAMESLOT(2) + 0(FRAMEPOINTER)
 #else
-       lwz     LDC, FRAMESLOT(0) + STACKSIZE(SP)
+       lwz     LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
 #endif
 #endif
 #endif
 
 #ifdef TRMMKERNEL
 #if defined(linux) && defined(__64BIT__)
-       ld      OFFSET,  FRAMESLOT(1) + STACKSIZE(SP)
+       ld      OFFSET,  FRAMESLOT(1) + 0(FRAMEPOINTER)
 #endif
 
 #if defined(_AIX) || defined(__APPLE__)
 #ifdef __64BIT__
-       ld      OFFSET,  FRAMESLOT(1) + STACKSIZE(SP)
+       ld      OFFSET,  FRAMESLOT(1) + 0(FRAMEPOINTER)
 #else
 #ifdef DOUBLE
-       lwz     OFFSET,  FRAMESLOT(3) + STACKSIZE(SP)
+       lwz     OFFSET,  FRAMESLOT(3) + 0(FRAMEPOINTER)
 #else
-       lwz     OFFSET,  FRAMESLOT(1) + STACKSIZE(SP)
+       lwz     OFFSET,  FRAMESLOT(1) + 0(FRAMEPOINTER)
 #endif
 #endif
 #endif
@@ -268,34 +243,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "zgemm_macros_8x2_power8.S"
 
        cmpwi   cr0, M, 0
-       ble     .L999
+       ble     L999
        cmpwi   cr0, N, 0
-       ble     .L999
+       ble     L999
        cmpwi   cr0, K, 0
-       ble     .L999
+       ble     L999
 
        slwi    LDC, LDC, ZBASE_SHIFT
-       li      PRE, 256 
+       li      PRE, 384 
        li      o8  , 8
        li      o16 , 16
        li      o24 , 24
        li      o32 , 32
        li      o48 , 48
 
+        addi    BBUFFER, SP, 512+4096
+        li      T1, -4096
+        and     BBUFFER, BBUFFER, T1
+
 #ifdef __64BIT__
        addi    ALPHA, SP, 296
 #else
        addi    ALPHA, SP, 224
 #endif
 
-       lxvdsx  alpha_r, 0, ALPHA
-       lxvdsx  alpha_i, o8, ALPHA
+       lxsdx   alpha_r, 0, ALPHA
+       lxsdx   alpha_i, o8, ALPHA
 
-       .align 5
+       .align 4
 
 #include "zgemm_logic_8x2_power8.S"
 
-.L999:
+L999:
        addi    r3, 0, 0
 
        lfd     f14,    0(SP)
@@ -339,6 +318,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld      r17,  256(SP)
        ld      r16,  264(SP)
        ld      r15,  272(SP)
+       ld      r14,  280(SP)
 #else
        lwz     r31,  144(SP)
        lwz     r30,  148(SP)
@@ -360,6 +340,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
        addi    SP, SP, STACKSIZE
+       addi    SP, SP, STACKSIZE
+       addi    SP, SP, STACKSIZE
+       addi    SP, SP, STACKSIZE
 
        blr
 
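For reference, the BBUFFER setup in the prologue above (addi BBUFFER, SP, 512+4096; li T1, -4096; and BBUFFER, BBUFFER, T1) simply rounds an address 512 bytes above the new stack pointer up to a 4 KiB boundary inside the enlarged frame. A literal C rendering of that arithmetic, assuming 64-bit addresses (the names here are illustrative):

    #include <stdint.h>

    /* Mirrors the prologue: add 512 + 4096 to SP, then clear the low
     * 12 bits (AND with -4096) to land on a 4 KiB-aligned BBUFFER. */
    static inline uintptr_t bbuffer_from_sp(uintptr_t sp)
    {
        return (sp + 512 + 4096) & ~(uintptr_t)4095;
    }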
diff --git a/kernel/power/zgemm_logic_8x2_power8.S b/kernel/power/zgemm_logic_8x2_power8.S
index 5fcade5..96612da 100644
        srawi.          J,      N,      1
-       ble             .LZGEMM_L2_END
+       ble             ZGEMM_L2_END
+
+ZGEMM_L2_BEGIN:
+
+       mr              BO,     B
+       mr              BBO,    BBUFFER
+       slwi            T1,     K,      1
+
+ZGEMM_L2_COPYB:
+
+       lxvdsx          vs4,    o0,     BO              // b0_r
+       lxvdsx          vs5,    o8,     BO              // b0_i
+       addi            BO,     BO,     16
+       stxvd2x         vs4,    o0,     BBO
+       stxvd2x         vs5,    o16,    BBO
+       addic.          T1,     T1,     -1
+       addi            BBO,    BBO,    32
+
+       bge             ZGEMM_L2_COPYB
 
-.LZGEMM_L2_BEGIN:
 
        mr              CO,     C
        mr              AO,     A
        slwi            T1,     LDC     ,       1
        add             C,      C,      T1
        srawi.          I,      M,      3
-       ble             .LZGEMM_L2x8_END
+       ble             ZGEMM_L2x8_END
 
-.LZGEMM_L2x8_BEGIN:
+ZGEMM_L2x8_BEGIN:
 
 
-       mr              BO,     B
+       mr              BO,     BBUFFER
        srawi.          L,      K,      3
-       ble             .LZGEMM_L2x8_SUB0
+       ble             ZGEMM_L2x8_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LZGEMM_L2x8_SUB4
+       ble             ZGEMM_L2x8_SUB4
 
-.LZGEMM_L2x8_LOOP_START:
+ZGEMM_L2x8_LOOP_START:
 
        dcbt            AO,     PRE
+       dcbt            BO,     PRE
        LOAD2x8_1
        dcbt            AO,     PRE
        KERNEL2x8_I1
        dcbt            AO,     PRE
+       dcbt            BO,     PRE
        KERNEL2x8_2
        dcbt            AO,     PRE
        KERNEL2x8_1
        dcbt            AO,     PRE
+       dcbt            BO,     PRE
        KERNEL2x8_2
 
        dcbt            AO,     PRE
        KERNEL2x8_1
        dcbt            AO,     PRE
+       dcbt            BO,     PRE
        KERNEL2x8_2
        dcbt            AO,     PRE
        KERNEL2x8_1
        dcbt            AO,     PRE
+       dcbt            BO,     PRE
        KERNEL2x8_2
 
        addic.          L,      L,      -2
-       ble             .LZGEMM_L2x8_LOOP_END
+       ble             ZGEMM_L2x8_LOOP_END
 
        .align 5
 
-.LZGEMM_L2x8_LOOP:
+ZGEMM_L2x8_LOOP:
 
        dcbt            AO,     PRE
        KERNEL2x8_1
        dcbt            AO,     PRE
+       dcbt            BO,     PRE
        KERNEL2x8_2
        dcbt            AO,     PRE
        KERNEL2x8_1
        dcbt            AO,     PRE
+       dcbt            BO,     PRE
        KERNEL2x8_2
 
        dcbt            AO,     PRE
        KERNEL2x8_1
        dcbt            AO,     PRE
+       dcbt            BO,     PRE
        KERNEL2x8_2
        dcbt            AO,     PRE
        KERNEL2x8_1
        dcbt            AO,     PRE
+       dcbt            BO,     PRE
        KERNEL2x8_2
 
        addic.          L,      L,      -1
-       bgt             .LZGEMM_L2x8_LOOP
+       bgt             ZGEMM_L2x8_LOOP
 
-.LZGEMM_L2x8_LOOP_END:
+ZGEMM_L2x8_LOOP_END:
 
        dcbt            AO,     PRE
        KERNEL2x8_1
        dcbt            AO,     PRE
+       dcbt            BO,     PRE
        KERNEL2x8_2
        dcbt            AO,     PRE
        KERNEL2x8_1
        dcbt            AO,     PRE
+       dcbt            BO,     PRE
        KERNEL2x8_2
 
        dcbt            AO,     PRE
        KERNEL2x8_1
        KERNEL2x8_E2
 
-       b               .LZGEMM_L2x8_SUB1
+       b               ZGEMM_L2x8_SUB1
 
-.LZGEMM_L2x8_SUB4:
+ZGEMM_L2x8_SUB4:
 
        dcbt            AO,     PRE
        KERNEL2x8_SUBI1
        KERNEL2x8_SUB1
        KERNEL2x8_SUB1
 
-       b               .LZGEMM_L2x8_SUB1
+       b               ZGEMM_L2x8_SUB1
 
-.LZGEMM_L2x8_SUB0:
+ZGEMM_L2x8_SUB0:
 
        andi.           L,      K,      7
 
        KERNEL2x8_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LZGEMM_L2x8_SAVE
-       b               .LZGEMM_L2x8_SUB2
+       ble             ZGEMM_L2x8_SAVE
+       b               ZGEMM_L2x8_SUB2
 
-.LZGEMM_L2x8_SUB1:
+ZGEMM_L2x8_SUB1:
 
        andi.           L,      K,      7
-       ble             .LZGEMM_L2x8_SAVE
+       ble             ZGEMM_L2x8_SAVE
 
-.LZGEMM_L2x8_SUB2:
+ZGEMM_L2x8_SUB2:
 
        KERNEL2x8_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LZGEMM_L2x8_SUB2
+       bgt             ZGEMM_L2x8_SUB2
 
-.LZGEMM_L2x8_SAVE:
+ZGEMM_L2x8_SAVE:
 
        SAVE2x8
 
        addic.          I,      I,      -1
-       bgt             .LZGEMM_L2x8_BEGIN
+       bgt             ZGEMM_L2x8_BEGIN
 
-.LZGEMM_L2x8_END:
+ZGEMM_L2x8_END:
 
-.LZGEMM_L2x4_BEGIN:
+ZGEMM_L2x4_BEGIN:
 
        andi.           T2,     M,      7
-       ble             .LZGEMM_L2x1_END
+       ble             ZGEMM_L2x1_END
 
        andi.           T1,     M,      4
-       ble             .LZGEMM_L2x4_END
-       mr              BO,     B
+       ble             ZGEMM_L2x4_END
+       mr              BO,     BBUFFER
        srawi.          L,      K,      3
-       ble             .LZGEMM_L2x4_SUB0
+       ble             ZGEMM_L2x4_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LZGEMM_L2x4_SUB4
+       ble             ZGEMM_L2x4_SUB4
 
-.LZGEMM_L2x4_LOOP_START:
+ZGEMM_L2x4_LOOP_START:
 
        LOAD2x4_1
        KERNEL2x4_I1
        KERNEL2x4_2
 
        addic.          L,      L,      -2
-       ble             .LZGEMM_L2x4_LOOP_END
+       ble             ZGEMM_L2x4_LOOP_END
 
        .align 5
 
-.LZGEMM_L2x4_LOOP:
+ZGEMM_L2x4_LOOP:
 
        KERNEL2x4_1
        KERNEL2x4_2
        KERNEL2x4_2
 
        addic.          L,      L,      -1
-       bgt             .LZGEMM_L2x4_LOOP
+       bgt             ZGEMM_L2x4_LOOP
 
-.LZGEMM_L2x4_LOOP_END:
+ZGEMM_L2x4_LOOP_END:
 
        KERNEL2x4_1
        KERNEL2x4_2
        KERNEL2x4_1
        KERNEL2x4_E2
 
-       b               .LZGEMM_L2x4_SUB1
+       b               ZGEMM_L2x4_SUB1
 
-.LZGEMM_L2x4_SUB4:
+ZGEMM_L2x4_SUB4:
 
        KERNEL2x4_SUBI1
        KERNEL2x4_SUB1
        KERNEL2x4_SUB1
        KERNEL2x4_SUB1
 
-       b               .LZGEMM_L2x4_SUB1
+       b               ZGEMM_L2x4_SUB1
 
-.LZGEMM_L2x4_SUB0:
+ZGEMM_L2x4_SUB0:
 
        andi.           L,      K,      7
 
        KERNEL2x4_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LZGEMM_L2x4_SAVE
-       b               .LZGEMM_L2x4_SUB2
+       ble             ZGEMM_L2x4_SAVE
+       b               ZGEMM_L2x4_SUB2
 
-.LZGEMM_L2x4_SUB1:
+ZGEMM_L2x4_SUB1:
 
        andi.           L,      K,      7
-       ble             .LZGEMM_L2x4_SAVE
+       ble             ZGEMM_L2x4_SAVE
 
-.LZGEMM_L2x4_SUB2:
+ZGEMM_L2x4_SUB2:
 
        KERNEL2x4_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LZGEMM_L2x4_SUB2
+       bgt             ZGEMM_L2x4_SUB2
 
-.LZGEMM_L2x4_SAVE:
+ZGEMM_L2x4_SAVE:
 
        SAVE2x4
 
-.LZGEMM_L2x4_END:
+ZGEMM_L2x4_END:
 
-.LZGEMM_L2x2_BEGIN:
+ZGEMM_L2x2_BEGIN:
 
 
        andi.           T1,     M,      2
-       ble             .LZGEMM_L2x2_END
-       mr              BO,     B
+       ble             ZGEMM_L2x2_END
+       mr              BO,     BBUFFER
        srawi.          L,      K,      3
-       ble             .LZGEMM_L2x2_SUB0
+       ble             ZGEMM_L2x2_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LZGEMM_L2x2_SUB4
+       ble             ZGEMM_L2x2_SUB4
 
-.LZGEMM_L2x2_LOOP_START:
+ZGEMM_L2x2_LOOP_START:
 
        LOAD2x2_1
        KERNEL2x2_I1
        KERNEL2x2_2
 
        addic.          L,      L,      -2
-       ble             .LZGEMM_L2x2_LOOP_END
+       ble             ZGEMM_L2x2_LOOP_END
 
        .align 5
 
-.LZGEMM_L2x2_LOOP:
+ZGEMM_L2x2_LOOP:
 
        KERNEL2x2_1
        KERNEL2x2_2
        KERNEL2x2_2
 
        addic.          L,      L,      -1
-       bgt             .LZGEMM_L2x2_LOOP
+       bgt             ZGEMM_L2x2_LOOP
 
-.LZGEMM_L2x2_LOOP_END:
+ZGEMM_L2x2_LOOP_END:
 
        KERNEL2x2_1
        KERNEL2x2_2
        KERNEL2x2_1
        KERNEL2x2_E2
 
-       b               .LZGEMM_L2x2_SUB1
+       b               ZGEMM_L2x2_SUB1
 
-.LZGEMM_L2x2_SUB4:
+ZGEMM_L2x2_SUB4:
 
        KERNEL2x2_SUBI1
        KERNEL2x2_SUB1
        KERNEL2x2_SUB1
        KERNEL2x2_SUB1
 
-       b               .LZGEMM_L2x2_SUB1
+       b               ZGEMM_L2x2_SUB1
 
-.LZGEMM_L2x2_SUB0:
+ZGEMM_L2x2_SUB0:
 
        andi.           L,      K,      7
 
        KERNEL2x2_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LZGEMM_L2x2_SAVE
-       b               .LZGEMM_L2x2_SUB2
+       ble             ZGEMM_L2x2_SAVE
+       b               ZGEMM_L2x2_SUB2
 
-.LZGEMM_L2x2_SUB1:
+ZGEMM_L2x2_SUB1:
 
        andi.           L,      K,      7
-       ble             .LZGEMM_L2x2_SAVE
+       ble             ZGEMM_L2x2_SAVE
 
-.LZGEMM_L2x2_SUB2:
+ZGEMM_L2x2_SUB2:
 
        KERNEL2x2_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LZGEMM_L2x2_SUB2
+       bgt             ZGEMM_L2x2_SUB2
 
-.LZGEMM_L2x2_SAVE:
+ZGEMM_L2x2_SAVE:
 
        SAVE2x2
 
-.LZGEMM_L2x2_END:
+ZGEMM_L2x2_END:
 
-.LZGEMM_L2x1_BEGIN:
+ZGEMM_L2x1_BEGIN:
 
 
        andi.           T1,     M,      1
-       ble             .LZGEMM_L2x1_END
-       mr              BO,     B
+       ble             ZGEMM_L2x1_END
+       mr              BO,     BBUFFER
        srawi.          L,      K,      3
-       ble             .LZGEMM_L2x1_SUB0
+       ble             ZGEMM_L2x1_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LZGEMM_L2x1_SUB4
+       ble             ZGEMM_L2x1_SUB4
 
-.LZGEMM_L2x1_LOOP_START:
+ZGEMM_L2x1_LOOP_START:
 
        LOAD2x1_1
        KERNEL2x1_I1
        KERNEL2x1_2
 
        addic.          L,      L,      -2
-       ble             .LZGEMM_L2x1_LOOP_END
+       ble             ZGEMM_L2x1_LOOP_END
 
        .align 5
 
-.LZGEMM_L2x1_LOOP:
+ZGEMM_L2x1_LOOP:
 
        KERNEL2x1_1
        KERNEL2x1_2
        KERNEL2x1_2
 
        addic.          L,      L,      -1
-       bgt             .LZGEMM_L2x1_LOOP
+       bgt             ZGEMM_L2x1_LOOP
 
-.LZGEMM_L2x1_LOOP_END:
+ZGEMM_L2x1_LOOP_END:
 
        KERNEL2x1_1
        KERNEL2x1_2
        KERNEL2x1_1
        KERNEL2x1_E2
 
-       b               .LZGEMM_L2x1_SUB1
+       b               ZGEMM_L2x1_SUB1
 
-.LZGEMM_L2x1_SUB4:
+ZGEMM_L2x1_SUB4:
 
        KERNEL2x1_SUBI1
        KERNEL2x1_SUB1
        KERNEL2x1_SUB1
        KERNEL2x1_SUB1
 
-       b               .LZGEMM_L2x1_SUB1
+       b               ZGEMM_L2x1_SUB1
 
-.LZGEMM_L2x1_SUB0:
+ZGEMM_L2x1_SUB0:
 
        andi.           L,      K,      7
 
        KERNEL2x1_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LZGEMM_L2x1_SAVE
-       b               .LZGEMM_L2x1_SUB2
+       ble             ZGEMM_L2x1_SAVE
+       b               ZGEMM_L2x1_SUB2
 
-.LZGEMM_L2x1_SUB1:
+ZGEMM_L2x1_SUB1:
 
        andi.           L,      K,      7
-       ble             .LZGEMM_L2x1_SAVE
+       ble             ZGEMM_L2x1_SAVE
 
-.LZGEMM_L2x1_SUB2:
+ZGEMM_L2x1_SUB2:
 
        KERNEL2x1_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LZGEMM_L2x1_SUB2
+       bgt             ZGEMM_L2x1_SUB2
 
-.LZGEMM_L2x1_SAVE:
+ZGEMM_L2x1_SAVE:
 
        SAVE2x1
 
-.LZGEMM_L2x1_END:
+ZGEMM_L2x1_END:
 
        slwi            T1,     K,      5
        add             B,      B,      T1
 
        addic.          J,      J,      -1
-       bgt             .LZGEMM_L2_BEGIN
+       bgt             ZGEMM_L2_BEGIN
 
        andi.           T2,     N,      1
-       ble             .L999
+       ble             L999
 
-.LZGEMM_L2_END:
+ZGEMM_L2_END:
 
-       b               .LZGEMM_L1_BEGIN
+       b               ZGEMM_L1_BEGIN
 
-.L999_H1:
+L999_H1:
 
-       b               .L999
+       b               L999
+
+ZGEMM_L1_BEGIN:
+
+       mr              BO,     B
+       mr              BBO,    BBUFFER
+       slwi            T1,     K,      0
+
+ZGEMM_L1_COPYB:
+
+       lxvdsx          vs4,    o0,     BO              // b0_r
+       lxvdsx          vs5,    o8,     BO              // b0_i
+       addi            BO,     BO,     16
+       stxvd2x         vs4,    o0,     BBO
+       stxvd2x         vs5,    o16,    BBO
+       addic.          T1,     T1,     -1
+       addi            BBO,    BBO,    32
+
+       bge             ZGEMM_L1_COPYB
 
-.LZGEMM_L1_BEGIN:
 
        andi.           T1,     N,      1
-       ble             .LZGEMM_L1_END
+       ble             ZGEMM_L1_END
        mr              CO,     C
        mr              AO,     A
        srawi.          I,      M,      3
-       ble             .LZGEMM_L1x8_END
+       ble             ZGEMM_L1x8_END
 
-.LZGEMM_L1x8_BEGIN:
+ZGEMM_L1x8_BEGIN:
 
 
-       mr              BO,     B
+       mr              BO,     BBUFFER
        srawi.          L,      K,      3
-       ble             .LZGEMM_L1x8_SUB0
+       ble             ZGEMM_L1x8_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LZGEMM_L1x8_SUB4
+       ble             ZGEMM_L1x8_SUB4
 
-.LZGEMM_L1x8_LOOP_START:
+ZGEMM_L1x8_LOOP_START:
 
        dcbt            AO,     PRE
        LOAD1x8_1
        KERNEL1x8_2
 
        addic.          L,      L,      -2
-       ble             .LZGEMM_L1x8_LOOP_END
+       ble             ZGEMM_L1x8_LOOP_END
 
        .align 5
 
-.LZGEMM_L1x8_LOOP:
+ZGEMM_L1x8_LOOP:
 
        dcbt            AO,     PRE
        KERNEL1x8_1
        KERNEL1x8_2
 
        addic.          L,      L,      -1
-       bgt             .LZGEMM_L1x8_LOOP
+       bgt             ZGEMM_L1x8_LOOP
 
-.LZGEMM_L1x8_LOOP_END:
+ZGEMM_L1x8_LOOP_END:
 
        dcbt            AO,     PRE
        KERNEL1x8_1
        KERNEL1x8_1
        KERNEL1x8_E2
 
-       b               .LZGEMM_L1x8_SUB1
+       b               ZGEMM_L1x8_SUB1
 
-.LZGEMM_L1x8_SUB4:
+ZGEMM_L1x8_SUB4:
 
        dcbt            AO,     PRE
        KERNEL1x8_SUBI1
        KERNEL1x8_SUB1
        KERNEL1x8_SUB1
 
-       b               .LZGEMM_L1x8_SUB1
+       b               ZGEMM_L1x8_SUB1
 
-.LZGEMM_L1x8_SUB0:
+ZGEMM_L1x8_SUB0:
 
        andi.           L,      K,      7
 
        KERNEL1x8_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LZGEMM_L1x8_SAVE
-       b               .LZGEMM_L1x8_SUB2
+       ble             ZGEMM_L1x8_SAVE
+       b               ZGEMM_L1x8_SUB2
 
-.LZGEMM_L1x8_SUB1:
+ZGEMM_L1x8_SUB1:
 
        andi.           L,      K,      7
-       ble             .LZGEMM_L1x8_SAVE
+       ble             ZGEMM_L1x8_SAVE
 
-.LZGEMM_L1x8_SUB2:
+ZGEMM_L1x8_SUB2:
 
        KERNEL1x8_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LZGEMM_L1x8_SUB2
+       bgt             ZGEMM_L1x8_SUB2
 
-.LZGEMM_L1x8_SAVE:
+ZGEMM_L1x8_SAVE:
 
        SAVE1x8
 
        addic.          I,      I,      -1
-       bgt             .LZGEMM_L1x8_BEGIN
+       bgt             ZGEMM_L1x8_BEGIN
 
-.LZGEMM_L1x8_END:
+ZGEMM_L1x8_END:
 
-.LZGEMM_L1x4_BEGIN:
+ZGEMM_L1x4_BEGIN:
 
        andi.           T2,     M,      7
-       ble             .LZGEMM_L1x1_END
+       ble             ZGEMM_L1x1_END
 
        andi.           T1,     M,      4
-       ble             .LZGEMM_L1x4_END
-       mr              BO,     B
+       ble             ZGEMM_L1x4_END
+       mr              BO,     BBUFFER
        srawi.          L,      K,      3
-       ble             .LZGEMM_L1x4_SUB0
+       ble             ZGEMM_L1x4_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LZGEMM_L1x4_SUB4
+       ble             ZGEMM_L1x4_SUB4
 
-.LZGEMM_L1x4_LOOP_START:
+ZGEMM_L1x4_LOOP_START:
 
        LOAD1x4_1
        KERNEL1x4_I1
        KERNEL1x4_2
 
        addic.          L,      L,      -2
-       ble             .LZGEMM_L1x4_LOOP_END
+       ble             ZGEMM_L1x4_LOOP_END
 
        .align 5
 
-.LZGEMM_L1x4_LOOP:
+ZGEMM_L1x4_LOOP:
 
        KERNEL1x4_1
        KERNEL1x4_2
        KERNEL1x4_2
 
        addic.          L,      L,      -1
-       bgt             .LZGEMM_L1x4_LOOP
+       bgt             ZGEMM_L1x4_LOOP
 
-.LZGEMM_L1x4_LOOP_END:
+ZGEMM_L1x4_LOOP_END:
 
        KERNEL1x4_1
        KERNEL1x4_2
        KERNEL1x4_1
        KERNEL1x4_E2
 
-       b               .LZGEMM_L1x4_SUB1
+       b               ZGEMM_L1x4_SUB1
 
-.LZGEMM_L1x4_SUB4:
+ZGEMM_L1x4_SUB4:
 
        KERNEL1x4_SUBI1
        KERNEL1x4_SUB1
        KERNEL1x4_SUB1
        KERNEL1x4_SUB1
 
-       b               .LZGEMM_L1x4_SUB1
+       b               ZGEMM_L1x4_SUB1
 
-.LZGEMM_L1x4_SUB0:
+ZGEMM_L1x4_SUB0:
 
        andi.           L,      K,      7
 
        KERNEL1x4_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LZGEMM_L1x4_SAVE
-       b               .LZGEMM_L1x4_SUB2
+       ble             ZGEMM_L1x4_SAVE
+       b               ZGEMM_L1x4_SUB2
 
-.LZGEMM_L1x4_SUB1:
+ZGEMM_L1x4_SUB1:
 
        andi.           L,      K,      7
-       ble             .LZGEMM_L1x4_SAVE
+       ble             ZGEMM_L1x4_SAVE
 
-.LZGEMM_L1x4_SUB2:
+ZGEMM_L1x4_SUB2:
 
        KERNEL1x4_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LZGEMM_L1x4_SUB2
+       bgt             ZGEMM_L1x4_SUB2
 
-.LZGEMM_L1x4_SAVE:
+ZGEMM_L1x4_SAVE:
 
        SAVE1x4
 
-.LZGEMM_L1x4_END:
+ZGEMM_L1x4_END:
 
-.LZGEMM_L1x2_BEGIN:
+ZGEMM_L1x2_BEGIN:
 
 
        andi.           T1,     M,      2
-       ble             .LZGEMM_L1x2_END
-       mr              BO,     B
+       ble             ZGEMM_L1x2_END
+       mr              BO,     BBUFFER
        srawi.          L,      K,      3
-       ble             .LZGEMM_L1x2_SUB0
+       ble             ZGEMM_L1x2_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LZGEMM_L1x2_SUB4
+       ble             ZGEMM_L1x2_SUB4
 
-.LZGEMM_L1x2_LOOP_START:
+ZGEMM_L1x2_LOOP_START:
 
        LOAD1x2_1
        KERNEL1x2_I1
        KERNEL1x2_2
 
        addic.          L,      L,      -2
-       ble             .LZGEMM_L1x2_LOOP_END
+       ble             ZGEMM_L1x2_LOOP_END
 
        .align 5
 
-.LZGEMM_L1x2_LOOP:
+ZGEMM_L1x2_LOOP:
 
        KERNEL1x2_1
        KERNEL1x2_2
        KERNEL1x2_2
 
        addic.          L,      L,      -1
-       bgt             .LZGEMM_L1x2_LOOP
+       bgt             ZGEMM_L1x2_LOOP
 
-.LZGEMM_L1x2_LOOP_END:
+ZGEMM_L1x2_LOOP_END:
 
        KERNEL1x2_1
        KERNEL1x2_2
        KERNEL1x2_1
        KERNEL1x2_E2
 
-       b               .LZGEMM_L1x2_SUB1
+       b               ZGEMM_L1x2_SUB1
 
-.LZGEMM_L1x2_SUB4:
+ZGEMM_L1x2_SUB4:
 
        KERNEL1x2_SUBI1
        KERNEL1x2_SUB1
        KERNEL1x2_SUB1
        KERNEL1x2_SUB1
 
-       b               .LZGEMM_L1x2_SUB1
+       b               ZGEMM_L1x2_SUB1
 
-.LZGEMM_L1x2_SUB0:
+ZGEMM_L1x2_SUB0:
 
        andi.           L,      K,      7
 
        KERNEL1x2_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LZGEMM_L1x2_SAVE
-       b               .LZGEMM_L1x2_SUB2
+       ble             ZGEMM_L1x2_SAVE
+       b               ZGEMM_L1x2_SUB2
 
-.LZGEMM_L1x2_SUB1:
+ZGEMM_L1x2_SUB1:
 
        andi.           L,      K,      7
-       ble             .LZGEMM_L1x2_SAVE
+       ble             ZGEMM_L1x2_SAVE
 
-.LZGEMM_L1x2_SUB2:
+ZGEMM_L1x2_SUB2:
 
        KERNEL1x2_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LZGEMM_L1x2_SUB2
+       bgt             ZGEMM_L1x2_SUB2
 
-.LZGEMM_L1x2_SAVE:
+ZGEMM_L1x2_SAVE:
 
        SAVE1x2
 
-.LZGEMM_L1x2_END:
+ZGEMM_L1x2_END:
 
-.LZGEMM_L1x1_BEGIN:
+ZGEMM_L1x1_BEGIN:
 
 
        andi.           T1,     M,      1
-       ble             .LZGEMM_L1x1_END
-       mr              BO,     B
+       ble             ZGEMM_L1x1_END
+       mr              BO,     BBUFFER
        srawi.          L,      K,      3
-       ble             .LZGEMM_L1x1_SUB0
+       ble             ZGEMM_L1x1_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LZGEMM_L1x1_SUB4
+       ble             ZGEMM_L1x1_SUB4
 
-.LZGEMM_L1x1_LOOP_START:
+ZGEMM_L1x1_LOOP_START:
 
        LOAD1x1_1
        KERNEL1x1_I1
        KERNEL1x1_2
 
        addic.          L,      L,      -2
-       ble             .LZGEMM_L1x1_LOOP_END
+       ble             ZGEMM_L1x1_LOOP_END
 
        .align 5
 
-.LZGEMM_L1x1_LOOP:
+ZGEMM_L1x1_LOOP:
 
        KERNEL1x1_1
        KERNEL1x1_2
        KERNEL1x1_2
 
        addic.          L,      L,      -1
-       bgt             .LZGEMM_L1x1_LOOP
+       bgt             ZGEMM_L1x1_LOOP
 
-.LZGEMM_L1x1_LOOP_END:
+ZGEMM_L1x1_LOOP_END:
 
        KERNEL1x1_1
        KERNEL1x1_2
        KERNEL1x1_1
        KERNEL1x1_E2
 
-       b               .LZGEMM_L1x1_SUB1
+       b               ZGEMM_L1x1_SUB1
 
-.LZGEMM_L1x1_SUB4:
+ZGEMM_L1x1_SUB4:
 
        KERNEL1x1_SUBI1
        KERNEL1x1_SUB1
        KERNEL1x1_SUB1
        KERNEL1x1_SUB1
 
-       b               .LZGEMM_L1x1_SUB1
+       b               ZGEMM_L1x1_SUB1
 
-.LZGEMM_L1x1_SUB0:
+ZGEMM_L1x1_SUB0:
 
        andi.           L,      K,      7
 
        KERNEL1x1_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LZGEMM_L1x1_SAVE
-       b               .LZGEMM_L1x1_SUB2
+       ble             ZGEMM_L1x1_SAVE
+       b               ZGEMM_L1x1_SUB2
 
-.LZGEMM_L1x1_SUB1:
+ZGEMM_L1x1_SUB1:
 
        andi.           L,      K,      7
-       ble             .LZGEMM_L1x1_SAVE
+       ble             ZGEMM_L1x1_SAVE
 
-.LZGEMM_L1x1_SUB2:
+ZGEMM_L1x1_SUB2:
 
        KERNEL1x1_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LZGEMM_L1x1_SUB2
+       bgt             ZGEMM_L1x1_SUB2
 
-.LZGEMM_L1x1_SAVE:
+ZGEMM_L1x1_SAVE:
 
        SAVE1x1
 
-.LZGEMM_L1x1_END:
+ZGEMM_L1x1_END:
 
-.LZGEMM_L1_END:
+ZGEMM_L1_END:
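The macro changes in the next file follow directly from the staged buffer: every multiply-accumulate needs the same B scalar in both lanes of a vector register, and since BBUFFER already stores {b_r, b_r} and {b_i, b_i} back to back, plain lxvd2x loads (at offsets o0/o16/o32/o48, advancing BO by 64) deliver exactly what lxvdsx used to produce by splatting. A scalar C sketch of the lane arithmetic behind one accumulator pair (illustrative names, not kernel code):

    /* One step of the complex FMA as the asm comments describe it:
     * acc_r gets {real*real, imag*real}, acc_i gets {real*imag, imag*imag}. */
    static void zfma_lanes(double acc_r[2], double acc_i[2],
                           const double a[2],    /* { a_r, a_i } from A       */
                           const double br[2],   /* { b_r, b_r } from BBUFFER */
                           const double bi[2])   /* { b_i, b_i } from BBUFFER */
    {
        acc_r[0] += a[0] * br[0];   /* real*real */
        acc_r[1] += a[1] * br[1];   /* imag*real */
        acc_i[0] += a[0] * bi[0];   /* real*imag */
        acc_i[1] += a[1] * bi[1];   /* imag*imag */
    }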
diff --git a/kernel/power/zgemm_macros_8x2_power8.S b/kernel/power/zgemm_macros_8x2_power8.S
index 701ec65..a0fbb2e 100644
@@ -1,39 +1,3 @@
-/***************************************************************************
-Copyright (c) 2013-2016, The OpenBLAS Project
-All rights reserved.
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-2. Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in
-the documentation and/or other materials provided with the
-distribution.
-3. Neither the name of the OpenBLAS project nor the names of
-its contributors may be used to endorse or promote products
-derived from this software without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
-USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*****************************************************************************/
-
-/**************************************************************************************
-* 2016/03/05 Werner Saar (wernsaar@googlemail.com)
-*       BLASTEST               : OK
-*       CTEST                  : OK
-*       TEST                   : OK
-*       LAPACK-TEST            : OK
-**************************************************************************************/
-
-
 #if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
 
        #define XSFADD_R1       xsadddp
@@ -70,12 +34,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 .macro LOAD2x8_1
 
-       lxvdsx          vs16,   o0,     BO              // load real part from B
-       lxvdsx          vs17,   o8,     BO              // load imag part from B
-       lxvdsx          vs18,   o16,    BO              // load real part from B
-       lxvdsx          vs19,   o24,    BO              // load imag part from B
+       lxvd2x          vs16,   o0,     BO              // load real part from B
+       lxvd2x          vs17,   o16,    BO              // load imag part from B
+       lxvd2x          vs18,   o32,    BO              // load real part from B
+       lxvd2x          vs19,   o48,    BO              // load imag part from B
 
-       addi            BO,     BO,     32
+       addi            BO,     BO,     64
 
        lxvd2x          vs0,    o0,     AO              // load real,imag from A
        lxvd2x          vs1,    o16,    AO              // load real,imag from A
@@ -110,12 +74,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     64
 
-       lxvdsx          vs20,   o0,     BO              // load real part from B
-       lxvdsx          vs21,   o8,     BO              // load imag part from B
-       lxvdsx          vs22,   o16,    BO              // load real part from B
-       lxvdsx          vs23,   o24,    BO              // load imag part from B
+       lxvd2x          vs20,   o0,     BO              // load real part from B
+       lxvd2x          vs21,   o16,    BO              // load imag part from B
+       lxvd2x          vs22,   o32,    BO              // load real part from B
+       lxvd2x          vs23,   o48,    BO              // load imag part from B
 
-       addi            BO,     BO,     32
+       addi            BO,     BO,     64
 
        xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
        xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
@@ -156,36 +120,41 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 .macro KERNEL2x8_1
 
+       lxvd2x          vs8,    o0,     AO              // load real,imag from A
+       lxvd2x          vs9,    o16,    AO              // load real,imag from A
+       lxvd2x          vs10,   o32,    AO              // load real,imag from A
+       lxvd2x          vs11,   o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvd2x          vs12,   o0,     AO              // load real,imag from A
+       lxvd2x          vs13,   o16,    AO              // load real,imag from A
+       lxvd2x          vs14,   o32,    AO              // load real,imag from A
+       lxvd2x          vs15,   o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvd2x          vs20,   o0,     BO              // load real part from B
+       lxvd2x          vs21,   o16,    BO              // load imag part from B
+       lxvd2x          vs22,   o32,    BO              // load real part from B
+       lxvd2x          vs23,   o48,    BO              // load imag part from B
+
+       addi            BO,     BO,     64
 
        xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
        xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
        xvmaddadp       vs34,   vs1,    vs16            // real*real, imag*real
        xvmaddadp       vs35,   vs1,    vs17            // real*imag, imag*imag
-
-       lxvdsx          vs22,   o16,    BO              // load real part from B
-       lxvdsx          vs23,   o24,    BO              // load imag part from B
-
        xvmaddadp       vs36,   vs2,    vs16            // real*real, imag*real
        xvmaddadp       vs37,   vs2,    vs17            // real*imag, imag*imag
        xvmaddadp       vs38,   vs3,    vs16            // real*real, imag*real
        xvmaddadp       vs39,   vs3,    vs17            // real*imag, imag*imag
-
-       lxvd2x          vs8,    o0,     AO              // load real,imag from A
-       lxvd2x          vs9,    o16,    AO              // load real,imag from A
-
        xvmaddadp       vs40,   vs4,    vs16            // real*real, imag*real
        xvmaddadp       vs41,   vs4,    vs17            // real*imag, imag*imag
        xvmaddadp       vs42,   vs5,    vs16            // real*real, imag*real
        xvmaddadp       vs43,   vs5,    vs17            // real*imag, imag*imag
-
-       lxvd2x          vs10,   o32,    AO              // load real,imag from A
-       lxvd2x          vs11,   o48,    AO              // load real,imag from A
-
        xvmaddadp       vs44,   vs6,    vs16            // real*real, imag*real
        xvmaddadp       vs45,   vs6,    vs17            // real*imag, imag*imag
-
-       addi            AO,     AO,     64
-
        xvmaddadp       vs46,   vs7,    vs16            // real*real, imag*real
        xvmaddadp       vs47,   vs7,    vs17            // real*imag, imag*imag
 
@@ -193,101 +162,79 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvmaddadp       vs49,   vs0,    vs19            // real*imag, imag*imag
        xvmaddadp       vs50,   vs1,    vs18            // real*real, imag*real
        xvmaddadp       vs51,   vs1,    vs19            // real*imag, imag*imag
-
-       lxvd2x          vs12,   o0,     AO              // load real,imag from A
-       lxvd2x          vs13,   o16,    AO              // load real,imag from A
-
        xvmaddadp       vs52,   vs2,    vs18            // real*real, imag*real
        xvmaddadp       vs53,   vs2,    vs19            // real*imag, imag*imag
        xvmaddadp       vs54,   vs3,    vs18            // real*real, imag*real
        xvmaddadp       vs55,   vs3,    vs19            // real*imag, imag*imag
-
-       lxvd2x          vs14,   o32,    AO              // load real,imag from A
-       lxvd2x          vs15,   o48,    AO              // load real,imag from A
-
        xvmaddadp       vs56,   vs4,    vs18            // real*real, imag*real
        xvmaddadp       vs57,   vs4,    vs19            // real*imag, imag*imag
        xvmaddadp       vs58,   vs5,    vs18            // real*real, imag*real
        xvmaddadp       vs59,   vs5,    vs19            // real*imag, imag*imag
-
-       lxvdsx          vs20,   o0,     BO              // load real part from B
-       lxvdsx          vs21,   o8,     BO              // load imag part from B
-
        xvmaddadp       vs60,   vs6,    vs18            // real*real, imag*real
        xvmaddadp       vs61,   vs6,    vs19            // real*imag, imag*imag
        xvmaddadp       vs62,   vs7,    vs18            // real*real, imag*real
        xvmaddadp       vs63,   vs7,    vs19            // real*imag, imag*imag
 
-       addi            AO,     AO,     64
-       addi            BO,     BO,     32
 
 .endm
 
 .macro KERNEL2x8_2
 
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+       lxvd2x          vs1,    o16,    AO              // load real,imag from A
+       lxvd2x          vs2,    o32,    AO              // load real,imag from A
+       lxvd2x          vs3,    o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvd2x          vs4,    o0,     AO              // load real,imag from A
+       lxvd2x          vs5,    o16,    AO              // load real,imag from A
+       lxvd2x          vs6,    o32,    AO              // load real,imag from A
+       lxvd2x          vs7,    o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvd2x          vs16,   o0,     BO              // load real part from B
+       lxvd2x          vs17,   o16,    BO              // load imag part from B
+       lxvd2x          vs18,   o32,    BO              // load real part from B
+       lxvd2x          vs19,   o48,    BO              // load imag part from B
+
+       addi            BO,     BO,     64
 
        xvmaddadp       vs32,   vs8,    vs20            // real*real, imag*real
        xvmaddadp       vs33,   vs8,    vs21            // real*imag, imag*imag
        xvmaddadp       vs34,   vs9,    vs20            // real*real, imag*real
        xvmaddadp       vs35,   vs9,    vs21            // real*imag, imag*imag
-
-       lxvdsx          vs16,   o0,     BO              // load real part from B
-       lxvdsx          vs17,   o8,     BO              // load imag part from B
-
        xvmaddadp       vs36,   vs10,   vs20            // real*real, imag*real
        xvmaddadp       vs37,   vs10,   vs21            // real*imag, imag*imag
        xvmaddadp       vs38,   vs11,   vs20            // real*real, imag*real
        xvmaddadp       vs39,   vs11,   vs21            // real*imag, imag*imag
-
-       lxvd2x          vs0,    o0,     AO              // load real,imag from A
-       lxvd2x          vs1,    o16,    AO              // load real,imag from A
-
        xvmaddadp       vs40,   vs12,   vs20            // real*real, imag*real
        xvmaddadp       vs41,   vs12,   vs21            // real*imag, imag*imag
        xvmaddadp       vs42,   vs13,   vs20            // real*real, imag*real
        xvmaddadp       vs43,   vs13,   vs21            // real*imag, imag*imag
-
-       lxvd2x          vs2,    o32,    AO              // load real,imag from A
-       lxvd2x          vs3,    o48,    AO              // load real,imag from A
-
        xvmaddadp       vs44,   vs14,   vs20            // real*real, imag*real
        xvmaddadp       vs45,   vs14,   vs21            // real*imag, imag*imag
        xvmaddadp       vs46,   vs15,   vs20            // real*real, imag*real
        xvmaddadp       vs47,   vs15,   vs21            // real*imag, imag*imag
 
-       addi            AO,     AO,     64
-
        xvmaddadp       vs48,   vs8,    vs22            // real*real, imag*real
        xvmaddadp       vs49,   vs8,    vs23            // real*imag, imag*imag
        xvmaddadp       vs50,   vs9,    vs22            // real*real, imag*real
        xvmaddadp       vs51,   vs9,    vs23            // real*imag, imag*imag
-
-       lxvd2x          vs4,    o0,     AO              // load real,imag from A
-       lxvd2x          vs5,    o16,    AO              // load real,imag from A
-
        xvmaddadp       vs52,   vs10,   vs22            // real*real, imag*real
        xvmaddadp       vs53,   vs10,   vs23            // real*imag, imag*imag
        xvmaddadp       vs54,   vs11,   vs22            // real*real, imag*real
        xvmaddadp       vs55,   vs11,   vs23            // real*imag, imag*imag
-
-       lxvd2x          vs6,    o32,    AO              // load real,imag from A
-       lxvd2x          vs7,    o48,    AO              // load real,imag from A
-
        xvmaddadp       vs56,   vs12,   vs22            // real*real, imag*real
        xvmaddadp       vs57,   vs12,   vs23            // real*imag, imag*imag
        xvmaddadp       vs58,   vs13,   vs22            // real*real, imag*real
        xvmaddadp       vs59,   vs13,   vs23            // real*imag, imag*imag
-
-       lxvdsx          vs18,   o16,    BO              // load real part from B
-       lxvdsx          vs19,   o24,    BO              // load imag part from B
-
        xvmaddadp       vs60,   vs14,   vs22            // real*real, imag*real
        xvmaddadp       vs61,   vs14,   vs23            // real*imag, imag*imag
        xvmaddadp       vs62,   vs15,   vs22            // real*real, imag*real
        xvmaddadp       vs63,   vs15,   vs23            // real*imag, imag*imag
 
-       addi            AO,     AO,     64
-       addi            BO,     BO,     32
 
 .endm
 
@@ -347,12 +294,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     64
 
-       lxvdsx          vs16,   o0,     BO              // load real part from B
-       lxvdsx          vs17,   o8,     BO              // load imag part from B
-       lxvdsx          vs18,   o16,    BO              // load real part from B
-       lxvdsx          vs19,   o24,    BO              // load imag part from B
+       lxvd2x          vs16,   o0,     BO              // load real part from B
+       lxvd2x          vs17,   o16,    BO              // load imag part from B
+       lxvd2x          vs18,   o32,    BO              // load real part from B
+       lxvd2x          vs19,   o48,    BO              // load imag part from B
 
-       addi            BO,     BO,     32
+       addi            BO,     BO,     64
 
        xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
        xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
@@ -407,12 +354,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     64
 
-       lxvdsx          vs16,   o0,     BO              // load real part from B
-       lxvdsx          vs17,   o8,     BO              // load imag part from B
-       lxvdsx          vs18,   o16,    BO              // load real part from B
-       lxvdsx          vs19,   o24,    BO              // load imag part from B
+       lxvd2x          vs16,   o0,     BO              // load real part from B
+       lxvd2x          vs17,   o16,    BO              // load imag part from B
+       lxvd2x          vs18,   o32,    BO              // load real part from B
+       lxvd2x          vs19,   o48,    BO              // load imag part from B
 
-       addi            BO,     BO,     32
+       addi            BO,     BO,     64
 
        xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
        xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
@@ -927,12 +874,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 .macro LOAD2x4_1
 
-       lxvdsx          vs16,   o0,     BO              // load real part from B
-       lxvdsx          vs17,   o8,     BO              // load imag part from B
-       lxvdsx          vs18,   o16,    BO              // load real part from B
-       lxvdsx          vs19,   o24,    BO              // load imag part from B
+       lxvd2x          vs16,   o0,     BO              // load real part from B
+       lxvd2x          vs17,   o16,    BO              // load imag part from B
+       lxvd2x          vs18,   o32,    BO              // load real part from B
+       lxvd2x          vs19,   o48,    BO              // load imag part from B
 
-       addi            BO,     BO,     32
+       addi            BO,     BO,     64
 
        lxvd2x          vs0,    o0,     AO              // load real,imag from A
        lxvd2x          vs1,    o16,    AO              // load real,imag from A
@@ -953,12 +900,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     64
 
-       lxvdsx          vs20,   o0,     BO              // load real part from B
-       lxvdsx          vs21,   o8,     BO              // load imag part from B
-       lxvdsx          vs22,   o16,    BO              // load real part from B
-       lxvdsx          vs23,   o24,    BO              // load imag part from B
+       lxvd2x          vs20,   o0,     BO              // load real part from B
+       lxvd2x          vs21,   o16,    BO              // load imag part from B
+       lxvd2x          vs22,   o32,    BO              // load real part from B
+       lxvd2x          vs23,   o48,    BO              // load imag part from B
 
-       addi            BO,     BO,     32
+       addi            BO,     BO,     64
 
        xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
        xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
@@ -990,12 +937,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     64
 
-       lxvdsx          vs20,   o0,     BO              // load real part from B
-       lxvdsx          vs21,   o8,     BO              // load imag part from B
-       lxvdsx          vs22,   o16,    BO              // load real part from B
-       lxvdsx          vs23,   o24,    BO              // load imag part from B
+       lxvd2x          vs20,   o0,     BO              // load real part from B
+       lxvd2x          vs21,   o16,    BO              // load imag part from B
+       lxvd2x          vs22,   o32,    BO              // load real part from B
+       lxvd2x          vs23,   o48,    BO              // load imag part from B
 
-       addi            BO,     BO,     32
+       addi            BO,     BO,     64
 
        xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
        xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
@@ -1027,12 +974,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     64
 
-       lxvdsx          vs16,   o0,     BO              // load real part from B
-       lxvdsx          vs17,   o8,     BO              // load imag part from B
-       lxvdsx          vs18,   o16,    BO              // load real part from B
-       lxvdsx          vs19,   o24,    BO              // load imag part from B
+       lxvd2x          vs16,   o0,     BO              // load real part from B
+       lxvd2x          vs17,   o16,    BO              // load imag part from B
+       lxvd2x          vs18,   o32,    BO              // load real part from B
+       lxvd2x          vs19,   o48,    BO              // load imag part from B
 
-       addi            BO,     BO,     32
+       addi            BO,     BO,     64
 
        xvmaddadp       vs32,   vs8,    vs20            // real*real, imag*real
        xvmaddadp       vs33,   vs8,    vs21            // real*imag, imag*imag
@@ -1088,12 +1035,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     64
 
-       lxvdsx          vs16,   o0,     BO              // load real part from B
-       lxvdsx          vs17,   o8,     BO              // load imag part from B
-       lxvdsx          vs18,   o16,    BO              // load real part from B
-       lxvdsx          vs19,   o24,    BO              // load imag part from B
+       lxvd2x          vs16,   o0,     BO              // load real part from B
+       lxvd2x          vs17,   o16,    BO              // load imag part from B
+       lxvd2x          vs18,   o32,    BO              // load real part from B
+       lxvd2x          vs19,   o48,    BO              // load imag part from B
 
-       addi            BO,     BO,     32
+       addi            BO,     BO,     64
 
        xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
        xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
@@ -1125,12 +1072,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     64
 
-       lxvdsx          vs16,   o0,     BO              // load real part from B
-       lxvdsx          vs17,   o8,     BO              // load imag part from B
-       lxvdsx          vs18,   o16,    BO              // load real part from B
-       lxvdsx          vs19,   o24,    BO              // load imag part from B
+       lxvd2x          vs16,   o0,     BO              // load real part from B
+       lxvd2x          vs17,   o16,    BO              // load imag part from B
+       lxvd2x          vs18,   o32,    BO              // load real part from B
+       lxvd2x          vs19,   o48,    BO              // load imag part from B
 
-       addi            BO,     BO,     32
+       addi            BO,     BO,     64
 
        xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
        xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
@@ -1410,12 +1357,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 .macro LOAD2x2_1
 
-       lxvdsx          vs16,   o0,     BO              // load real part from B
-       lxvdsx          vs17,   o8,     BO              // load imag part from B
-       lxvdsx          vs18,   o16,    BO              // load real part from B
-       lxvdsx          vs19,   o24,    BO              // load imag part from B
+       lxvd2x          vs16,   o0,     BO              // load real part from B
+       lxvd2x          vs17,   o16,    BO              // load imag part from B
+       lxvd2x          vs18,   o32,    BO              // load real part from B
+       lxvd2x          vs19,   o48,    BO              // load imag part from B
 
-       addi            BO,     BO,     32
+       addi            BO,     BO,     64
 
        lxvd2x          vs0,    o0,     AO              // load real,imag from A
        lxvd2x          vs1,    o16,    AO              // load real,imag from A
@@ -1432,12 +1379,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     32
 
-       lxvdsx          vs20,   o0,     BO              // load real part from B
-       lxvdsx          vs21,   o8,     BO              // load imag part from B
-       lxvdsx          vs22,   o16,    BO              // load real part from B
-       lxvdsx          vs23,   o24,    BO              // load imag part from B
+       lxvd2x          vs20,   o0,     BO              // load real part from B
+       lxvd2x          vs21,   o16,    BO              // load imag part from B
+       lxvd2x          vs22,   o32,    BO              // load real part from B
+       lxvd2x          vs23,   o48,    BO              // load imag part from B
 
-       addi            BO,     BO,     32
+       addi            BO,     BO,     64
 
        xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
        xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
@@ -1459,12 +1406,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     32
 
-       lxvdsx          vs20,   o0,     BO              // load real part from B
-       lxvdsx          vs21,   o8,     BO              // load imag part from B
-       lxvdsx          vs22,   o16,    BO              // load real part from B
-       lxvdsx          vs23,   o24,    BO              // load imag part from B
+       lxvd2x          vs20,   o0,     BO              // load real part from B
+       lxvd2x          vs21,   o16,    BO              // load imag part from B
+       lxvd2x          vs22,   o32,    BO              // load real part from B
+       lxvd2x          vs23,   o48,    BO              // load imag part from B
 
-       addi            BO,     BO,     32
+       addi            BO,     BO,     64
 
        xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
        xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
@@ -1486,12 +1433,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     32
 
-       lxvdsx          vs16,   o0,     BO              // load real part from B
-       lxvdsx          vs17,   o8,     BO              // load imag part from B
-       lxvdsx          vs18,   o16,    BO              // load real part from B
-       lxvdsx          vs19,   o24,    BO              // load imag part from B
+       lxvd2x          vs16,   o0,     BO              // load real part from B
+       lxvd2x          vs17,   o16,    BO              // load imag part from B
+       lxvd2x          vs18,   o32,    BO              // load real part from B
+       lxvd2x          vs19,   o48,    BO              // load imag part from B
 
-       addi            BO,     BO,     32
+       addi            BO,     BO,     64
 
        xvmaddadp       vs32,   vs8,    vs20            // real*real, imag*real
        xvmaddadp       vs33,   vs8,    vs21            // real*imag, imag*imag
@@ -1529,12 +1476,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     32
 
-       lxvdsx          vs16,   o0,     BO              // load real part from B
-       lxvdsx          vs17,   o8,     BO              // load imag part from B
-       lxvdsx          vs18,   o16,    BO              // load real part from B
-       lxvdsx          vs19,   o24,    BO              // load imag part from B
+       lxvd2x          vs16,   o0,     BO              // load real part from B
+       lxvd2x          vs17,   o16,    BO              // load imag part from B
+       lxvd2x          vs18,   o32,    BO              // load real part from B
+       lxvd2x          vs19,   o48,    BO              // load imag part from B
 
-       addi            BO,     BO,     32
+       addi            BO,     BO,     64
 
        xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
        xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
@@ -1556,12 +1503,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     32
 
-       lxvdsx          vs16,   o0,     BO              // load real part from B
-       lxvdsx          vs17,   o8,     BO              // load imag part from B
-       lxvdsx          vs18,   o16,    BO              // load real part from B
-       lxvdsx          vs19,   o24,    BO              // load imag part from B
+       lxvd2x          vs16,   o0,     BO              // load real part from B
+       lxvd2x          vs17,   o16,    BO              // load imag part from B
+       lxvd2x          vs18,   o32,    BO              // load real part from B
+       lxvd2x          vs19,   o48,    BO              // load imag part from B
 
-       addi            BO,     BO,     32
+       addi            BO,     BO,     64
 
        xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
        xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
@@ -1725,12 +1672,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 .macro LOAD2x1_1
 
-       lxvdsx          vs16,   o0,     BO              // load real part from B
-       lxvdsx          vs17,   o8,     BO              // load imag part from B
-       lxvdsx          vs18,   o16,    BO              // load real part from B
-       lxvdsx          vs19,   o24,    BO              // load imag part from B
+       lxvd2x          vs16,   o0,     BO              // load real part from B
+       lxvd2x          vs17,   o16,    BO              // load imag part from B
+       lxvd2x          vs18,   o32,    BO              // load real part from B
+       lxvd2x          vs19,   o48,    BO              // load imag part from B
 
-       addi            BO,     BO,     32
+       addi            BO,     BO,     64
 
        lxvd2x          vs0,    o0,     AO              // load real,imag from A
 
@@ -1745,12 +1692,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     16
 
-       lxvdsx          vs20,   o0,     BO              // load real part from B
-       lxvdsx          vs21,   o8,     BO              // load imag part from B
-       lxvdsx          vs22,   o16,    BO              // load real part from B
-       lxvdsx          vs23,   o24,    BO              // load imag part from B
+       lxvd2x          vs20,   o0,     BO              // load real part from B
+       lxvd2x          vs21,   o16,    BO              // load imag part from B
+       lxvd2x          vs22,   o32,    BO              // load real part from B
+       lxvd2x          vs23,   o48,    BO              // load imag part from B
 
-       addi            BO,     BO,     32
+       addi            BO,     BO,     64
 
        xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
        xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
@@ -1767,12 +1714,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     16
 
-       lxvdsx          vs20,   o0,     BO              // load real part from B
-       lxvdsx          vs21,   o8,     BO              // load imag part from B
-       lxvdsx          vs22,   o16,    BO              // load real part from B
-       lxvdsx          vs23,   o24,    BO              // load imag part from B
+       lxvd2x          vs20,   o0,     BO              // load real part from B
+       lxvd2x          vs21,   o16,    BO              // load imag part from B
+       lxvd2x          vs22,   o32,    BO              // load real part from B
+       lxvd2x          vs23,   o48,    BO              // load imag part from B
 
-       addi            BO,     BO,     32
+       addi            BO,     BO,     64
 
        xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
        xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
@@ -1789,12 +1736,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     16
 
-       lxvdsx          vs16,   o0,     BO              // load real part from B
-       lxvdsx          vs17,   o8,     BO              // load imag part from B
-       lxvdsx          vs18,   o16,    BO              // load real part from B
-       lxvdsx          vs19,   o24,    BO              // load imag part from B
+       lxvd2x          vs16,   o0,     BO              // load real part from B
+       lxvd2x          vs17,   o16,    BO              // load imag part from B
+       lxvd2x          vs18,   o32,    BO              // load real part from B
+       lxvd2x          vs19,   o48,    BO              // load imag part from B
 
-       addi            BO,     BO,     32
+       addi            BO,     BO,     64
 
        xvmaddadp       vs32,   vs8,    vs20            // real*real, imag*real
        xvmaddadp       vs33,   vs8,    vs21            // real*imag, imag*imag
@@ -1823,12 +1770,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     16
 
-       lxvdsx          vs16,   o0,     BO              // load real part from B
-       lxvdsx          vs17,   o8,     BO              // load imag part from B
-       lxvdsx          vs18,   o16,    BO              // load real part from B
-       lxvdsx          vs19,   o24,    BO              // load imag part from B
+       lxvd2x          vs16,   o0,     BO              // load real part from B
+       lxvd2x          vs17,   o16,    BO              // load imag part from B
+       lxvd2x          vs18,   o32,    BO              // load real part from B
+       lxvd2x          vs19,   o48,    BO              // load imag part from B
 
-       addi            BO,     BO,     32
+       addi            BO,     BO,     64
 
        xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
        xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
@@ -1845,12 +1792,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     16
 
-       lxvdsx          vs16,   o0,     BO              // load real part from B
-       lxvdsx          vs17,   o8,     BO              // load imag part from B
-       lxvdsx          vs18,   o16,    BO              // load real part from B
-       lxvdsx          vs19,   o24,    BO              // load imag part from B
+       lxvd2x          vs16,   o0,     BO              // load real part from B
+       lxvd2x          vs17,   o16,    BO              // load imag part from B
+       lxvd2x          vs18,   o32,    BO              // load real part from B
+       lxvd2x          vs19,   o48,    BO              // load imag part from B
 
-       addi            BO,     BO,     32
+       addi            BO,     BO,     64
 
        xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
        xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
@@ -1956,10 +1903,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 .macro LOAD1x8_1
 
-       lxvdsx          vs16,   o0,     BO              // load real part from B
-       lxvdsx          vs17,   o8,     BO              // load imag part from B
+       lxvd2x          vs16,   o0,     BO              // load real part from B
+       lxvd2x          vs17,   o16,    BO              // load imag part from B
 
-       addi            BO,     BO,     16
+       addi            BO,     BO,     32
 
        lxvd2x          vs0,    o0,     AO              // load real,imag from A
        lxvd2x          vs1,    o16,    AO              // load real,imag from A
@@ -1994,10 +1941,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     64
 
-       lxvdsx          vs20,   o0,     BO              // load real part from B
-       lxvdsx          vs21,   o8,     BO              // load imag part from B
+       lxvd2x          vs20,   o0,     BO              // load real part from B
+       lxvd2x          vs21,   o16,    BO              // load imag part from B
 
-       addi            BO,     BO,     16
+       addi            BO,     BO,     32
 
        xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
        xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
@@ -2035,10 +1982,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     64
 
-       lxvdsx          vs20,   o0,     BO              // load real part from B
-       lxvdsx          vs21,   o8,     BO              // load imag part from B
+       lxvd2x          vs20,   o0,     BO              // load real part from B
+       lxvd2x          vs21,   o16,    BO              // load imag part from B
 
-       addi            BO,     BO,     16
+       addi            BO,     BO,     32
 
        xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
        xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
@@ -2076,10 +2023,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     64
 
-       lxvdsx          vs16,   o0,     BO              // load real part from B
-       lxvdsx          vs17,   o8,     BO              // load imag part from B
+       lxvd2x          vs16,   o0,     BO              // load real part from B
+       lxvd2x          vs17,   o16,    BO              // load imag part from B
 
-       addi            BO,     BO,     16
+       addi            BO,     BO,     32
 
        xvmaddadp       vs32,   vs8,    vs20            // real*real, imag*real
        xvmaddadp       vs33,   vs8,    vs21            // real*imag, imag*imag
@@ -2140,10 +2087,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     64
 
-       lxvdsx          vs16,   o0,     BO              // load real part from B
-       lxvdsx          vs17,   o8,     BO              // load imag part from B
+       lxvd2x          vs16,   o0,     BO              // load real part from B
+       lxvd2x          vs17,   o16,    BO              // load imag part from B
 
-       addi            BO,     BO,     16
+       addi            BO,     BO,     32
 
        xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
        xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
@@ -2181,10 +2128,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     64
 
-       lxvdsx          vs16,   o0,     BO              // load real part from B
-       lxvdsx          vs17,   o8,     BO              // load imag part from B
+       lxvd2x          vs16,   o0,     BO              // load real part from B
+       lxvd2x          vs17,   o16,    BO              // load imag part from B
 
-       addi            BO,     BO,     16
+       addi            BO,     BO,     32
 
        xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
        xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
@@ -2452,10 +2399,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 .macro LOAD1x4_1
 
-       lxvdsx          vs16,   o0,     BO              // load real part from B
-       lxvdsx          vs17,   o8,     BO              // load imag part from B
+       lxvd2x          vs16,   o0,     BO              // load real part from B
+       lxvd2x          vs17,   o16,    BO              // load imag part from B
 
-       addi            BO,     BO,     16
+       addi            BO,     BO,     32
 
        lxvd2x          vs0,    o0,     AO              // load real,imag from A
        lxvd2x          vs1,    o16,    AO              // load real,imag from A
@@ -2476,10 +2423,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     64
 
-       lxvdsx          vs20,   o0,     BO              // load real part from B
-       lxvdsx          vs21,   o8,     BO              // load imag part from B
+       lxvd2x          vs20,   o0,     BO              // load real part from B
+       lxvd2x          vs21,   o16,    BO              // load imag part from B
 
-       addi            BO,     BO,     16
+       addi            BO,     BO,     32
 
        xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
        xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
@@ -2502,10 +2449,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     64
 
-       lxvdsx          vs20,   o0,     BO              // load real part from B
-       lxvdsx          vs21,   o8,     BO              // load imag part from B
+       lxvd2x          vs20,   o0,     BO              // load real part from B
+       lxvd2x          vs21,   o16,    BO              // load imag part from B
 
-       addi            BO,     BO,     16
+       addi            BO,     BO,     32
 
        xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
        xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
@@ -2528,10 +2475,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     64
 
-       lxvdsx          vs16,   o0,     BO              // load real part from B
-       lxvdsx          vs17,   o8,     BO              // load imag part from B
+       lxvd2x          vs16,   o0,     BO              // load real part from B
+       lxvd2x          vs17,   o16,    BO              // load imag part from B
 
-       addi            BO,     BO,     16
+       addi            BO,     BO,     32
 
        xvmaddadp       vs32,   vs8,    vs20            // real*real, imag*real
        xvmaddadp       vs33,   vs8,    vs21            // real*imag, imag*imag
@@ -2569,10 +2516,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     64
 
-       lxvdsx          vs16,   o0,     BO              // load real part from B
-       lxvdsx          vs17,   o8,     BO              // load imag part from B
+       lxvd2x          vs16,   o0,     BO              // load real part from B
+       lxvd2x          vs17,   o16,    BO              // load imag part from B
 
-       addi            BO,     BO,     16
+       addi            BO,     BO,     32
 
        xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
        xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
@@ -2595,10 +2542,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     64
 
-       lxvdsx          vs16,   o0,     BO              // load real part from B
-       lxvdsx          vs17,   o8,     BO              // load imag part from B
+       lxvd2x          vs16,   o0,     BO              // load real part from B
+       lxvd2x          vs17,   o16,    BO              // load imag part from B
 
-       addi            BO,     BO,     16
+       addi            BO,     BO,     32
 
        xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
        xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
@@ -2748,10 +2695,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 .macro LOAD1x2_1
 
-       lxvdsx          vs16,   o0,     BO              // load real part from B
-       lxvdsx          vs17,   o8,     BO              // load imag part from B
+       lxvd2x          vs16,   o0,     BO              // load real part from B
+       lxvd2x          vs17,   o16,    BO              // load imag part from B
 
-       addi            BO,     BO,     16
+       addi            BO,     BO,     32
 
        lxvd2x          vs0,    o0,     AO              // load real,imag from A
        lxvd2x          vs1,    o16,    AO              // load real,imag from A
@@ -2768,10 +2715,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     32
 
-       lxvdsx          vs20,   o0,     BO              // load real part from B
-       lxvdsx          vs21,   o8,     BO              // load imag part from B
+       lxvd2x          vs20,   o0,     BO              // load real part from B
+       lxvd2x          vs21,   o16,    BO              // load imag part from B
 
-       addi            BO,     BO,     16
+       addi            BO,     BO,     32
 
        xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
        xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
@@ -2788,10 +2735,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     32
 
-       lxvdsx          vs20,   o0,     BO              // load real part from B
-       lxvdsx          vs21,   o8,     BO              // load imag part from B
+       lxvd2x          vs20,   o0,     BO              // load real part from B
+       lxvd2x          vs21,   o16,    BO              // load imag part from B
 
-       addi            BO,     BO,     16
+       addi            BO,     BO,     32
 
        xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
        xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
@@ -2808,10 +2755,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     32
 
-       lxvdsx          vs16,   o0,     BO              // load real part from B
-       lxvdsx          vs17,   o8,     BO              // load imag part from B
+       lxvd2x          vs16,   o0,     BO              // load real part from B
+       lxvd2x          vs17,   o16,    BO              // load imag part from B
 
-       addi            BO,     BO,     16
+       addi            BO,     BO,     32
 
        xvmaddadp       vs32,   vs8,    vs20            // real*real, imag*real
        xvmaddadp       vs33,   vs8,    vs21            // real*imag, imag*imag
@@ -2839,10 +2786,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     32
 
-       lxvdsx          vs16,   o0,     BO              // load real part from B
-       lxvdsx          vs17,   o8,     BO              // load imag part from B
+       lxvd2x          vs16,   o0,     BO              // load real part from B
+       lxvd2x          vs17,   o16,    BO              // load imag part from B
 
-       addi            BO,     BO,     16
+       addi            BO,     BO,     32
 
        xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
        xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
@@ -2859,10 +2806,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     32
 
-       lxvdsx          vs16,   o0,     BO              // load real part from B
-       lxvdsx          vs17,   o8,     BO              // load imag part from B
+       lxvd2x          vs16,   o0,     BO              // load real part from B
+       lxvd2x          vs17,   o16,    BO              // load imag part from B
 
-       addi            BO,     BO,     16
+       addi            BO,     BO,     32
 
        xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
        xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
@@ -2954,10 +2901,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 .macro LOAD1x1_1
 
-       lxvdsx          vs16,   o0,     BO              // load real part from B
-       lxvdsx          vs17,   o8,     BO              // load imag part from B
+       lxvd2x          vs16,   o0,     BO              // load real part from B
+       lxvd2x          vs17,   o16,    BO              // load imag part from B
 
-       addi            BO,     BO,     16
+       addi            BO,     BO,     32
 
        lxvd2x          vs0,    o0,     AO              // load real,imag from A
 
@@ -2972,10 +2919,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     16
 
-       lxvdsx          vs20,   o0,     BO              // load real part from B
-       lxvdsx          vs21,   o8,     BO              // load imag part from B
+       lxvd2x          vs20,   o0,     BO              // load real part from B
+       lxvd2x          vs21,   o16,    BO              // load imag part from B
 
-       addi            BO,     BO,     16
+       addi            BO,     BO,     32
 
        xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
        xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
@@ -2989,10 +2936,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     16
 
-       lxvdsx          vs20,   o0,     BO              // load real part from B
-       lxvdsx          vs21,   o8,     BO              // load imag part from B
+       lxvd2x          vs20,   o0,     BO              // load real part from B
+       lxvd2x          vs21,   o16,    BO              // load imag part from B
 
-       addi            BO,     BO,     16
+       addi            BO,     BO,     32
 
        xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
        xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
@@ -3006,10 +2953,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     16
 
-       lxvdsx          vs16,   o0,     BO              // load real part from B
-       lxvdsx          vs17,   o8,     BO              // load imag part from B
+       lxvd2x          vs16,   o0,     BO              // load real part from B
+       lxvd2x          vs17,   o16,    BO              // load imag part from B
 
-       addi            BO,     BO,     16
+       addi            BO,     BO,     32
 
        xvmaddadp       vs32,   vs8,    vs20            // real*real, imag*real
        xvmaddadp       vs33,   vs8,    vs21            // real*imag, imag*imag
@@ -3032,10 +2979,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     16
 
-       lxvdsx          vs16,   o0,     BO              // load real part from B
-       lxvdsx          vs17,   o8,     BO              // load imag part from B
+       lxvd2x          vs16,   o0,     BO              // load real part from B
+       lxvd2x          vs17,   o16,    BO              // load imag part from B
 
-       addi            BO,     BO,     16
+       addi            BO,     BO,     32
 
        xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
        xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
@@ -3049,10 +2996,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     16
 
-       lxvdsx          vs16,   o0,     BO              // load real part from B
-       lxvdsx          vs17,   o8,     BO              // load imag part from B
+       lxvd2x          vs16,   o0,     BO              // load real part from B
+       lxvd2x          vs17,   o16,    BO              // load imag part from B
 
-       addi            BO,     BO,     16
+       addi            BO,     BO,     32
 
        xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
        xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
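
The hunks above replace every splat load from B (lxvdsx at offsets o0/o8/o16/o24, with BO advanced by 16 or 32 bytes) by a plain vector load (lxvd2x at offsets o0/o16/o32/o48, with BO advanced by 32 or 64 bytes) while leaving the arithmetic untouched. That is only equivalent if B is no longer read in packed form but from a buffer in which every real and every imaginary scalar already occupies a full 16-byte slot, i.e. is stored twice. The copy step that builds such a buffer is not part of these hunks; the following is only a minimal C sketch of the assumed expansion, and the name expand_b_panel is hypothetical:

    #include <stddef.h>

    /* src: k complex doubles packed as re0, im0, re1, im1, ...
     * dst: expanded buffer re0, re0, im0, im0, re1, re1, ... so that one
     *      lxvd2x per scalar yields the same splatted value that lxvdsx
     *      used to produce at load time.                                 */
    static void expand_b_panel(const double *src, double *dst, size_t k_complex)
    {
        for (size_t i = 0; i < 2 * k_complex; i++) {
            dst[2 * i]     = src[i];
            dst[2 * i + 1] = src[i];
        }
    }

With that layout the doubled offsets and the doubled BO increments simply walk the expanded buffer at twice the old stride.
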
index 8b95376..0cfe613 100644 (file)
@@ -271,7 +271,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 #endif
 
-#include "zgemm_macros_8x2_power8.S"
+#include "ztrmm_macros_8x2_power8.S"
 
        cmpwi   cr0, M, 0
        ble     .L999
diff --git a/kernel/power/ztrmm_macros_8x2_power8.S b/kernel/power/ztrmm_macros_8x2_power8.S
new file mode 100644 (file)
index 0000000..701ec65
--- /dev/null
@@ -0,0 +1,3110 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/05 Werner Saar (wernsaar@googlemail.com)
+*       BLASTEST               : OK
+*       CTEST                  : OK
+*       TEST                   : OK
+*       LAPACK-TEST            : OK
+**************************************************************************************/
+
+
+#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
+
+       #define XSFADD_R1       xsadddp
+       #define XSFADD_R2       xssubdp
+       #define XSFADD_I1       xsadddp
+       #define XSFADD_I2       xsadddp
+
+#elif  defined(CN) || defined(CT) || defined(RN) || defined(RT)
+
+       #define XSFADD_R1       xsadddp
+       #define XSFADD_R2       xsadddp
+       #define XSFADD_I1       xssubdp
+       #define XSFADD_I2       xsadddp
+
+#elif  defined(NC) || defined(TC) || defined(NR) || defined(TR)
+
+       #define XSFADD_R1       xsadddp
+       #define XSFADD_R2       xsadddp
+       #define XSFADD_I1       xsadddp
+       #define XSFADD_I2       xssubdp
+
+#else          // CC || CR || RC || RR
+
+       #define XSFADD_R1       xsadddp
+       #define XSFADD_R2       xssubdp
+       #define XSFADD_I1       xssubdp
+       #define XSFADD_I2       xssubdp
+
+#endif
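+
+/* The selection above encodes the sign pattern of the complex product for
+ * each variant.  With a = realA, b = imagA, c = realB, d = imagB, the SAVE
+ * macros below accumulate the real part from a*c (XSFADD_R1) and b*d
+ * (XSFADD_R2) and the imaginary part from b*c (XSFADD_I1) and a*d
+ * (XSFADD_I2); NN/NT/TN/TT therefore compute real = a*c - b*d and
+ * imag = b*c + a*d, while the conjugating variants flip the sign of the
+ * corresponding term. */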
+
+/**********************************************************************************************
+* Macros for N=2 and M=8
+**********************************************************************************************/
+
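+/* The KERNEL2x8 macros form a two-stage software pipeline: *_I1 starts the
+ * accumulation with xvmuldp while prefetching the next A and B values into
+ * vs8-vs15 / vs20-vs23, *_1 and *_2 alternate between the two register sets
+ * so that the loads for one step overlap with the multiply-adds of the
+ * other, and *_E2 drains the pipeline without issuing further loads.
+ * *_SUBI1 and *_SUB1 are single, self-contained steps: *_SUBI1 initializes
+ * the accumulators with xvmuldp, *_SUB1 adds one step with xvmaddadp. */
+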
+.macro LOAD2x8_1
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+       lxvdsx          vs18,   o16,    BO              // load real part from B
+       lxvdsx          vs19,   o24,    BO              // load imag part from B
+
+       addi            BO,     BO,     32
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+       lxvd2x          vs1,    o16,    AO              // load real,imag from A
+       lxvd2x          vs2,    o32,    AO              // load real,imag from A
+       lxvd2x          vs3,    o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvd2x          vs4,    o0,     AO              // load real,imag from A
+       lxvd2x          vs5,    o16,    AO              // load real,imag from A
+       lxvd2x          vs6,    o32,    AO              // load real,imag from A
+       lxvd2x          vs7,    o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+
+.endm
+
+.macro KERNEL2x8_I1
+
+       lxvd2x          vs8,    o0,     AO              // load real,imag from A
+       lxvd2x          vs9,    o16,    AO              // load real,imag from A
+       lxvd2x          vs10,   o32,    AO              // load real,imag from A
+       lxvd2x          vs11,   o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvd2x          vs12,   o0,     AO              // load real,imag from A
+       lxvd2x          vs13,   o16,    AO              // load real,imag from A
+       lxvd2x          vs14,   o32,    AO              // load real,imag from A
+       lxvd2x          vs15,   o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvdsx          vs20,   o0,     BO              // load real part from B
+       lxvdsx          vs21,   o8,     BO              // load imag part from B
+       lxvdsx          vs22,   o16,    BO              // load real part from B
+       lxvdsx          vs23,   o24,    BO              // load imag part from B
+
+       addi            BO,     BO,     32
+
+       xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
+       xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
+       xvmuldp         vs34,   vs1,    vs16            // real*real, imag*real
+       xvmuldp         vs35,   vs1,    vs17            // real*imag, imag*imag
+       xvmuldp         vs36,   vs2,    vs16            // real*real, imag*real
+       xvmuldp         vs37,   vs2,    vs17            // real*imag, imag*imag
+       xvmuldp         vs38,   vs3,    vs16            // real*real, imag*real
+       xvmuldp         vs39,   vs3,    vs17            // real*imag, imag*imag
+       xvmuldp         vs40,   vs4,    vs16            // real*real, imag*real
+       xvmuldp         vs41,   vs4,    vs17            // real*imag, imag*imag
+       xvmuldp         vs42,   vs5,    vs16            // real*real, imag*real
+       xvmuldp         vs43,   vs5,    vs17            // real*imag, imag*imag
+       xvmuldp         vs44,   vs6,    vs16            // real*real, imag*real
+       xvmuldp         vs45,   vs6,    vs17            // real*imag, imag*imag
+       xvmuldp         vs46,   vs7,    vs16            // real*real, imag*real
+       xvmuldp         vs47,   vs7,    vs17            // real*imag, imag*imag
+
+       xvmuldp         vs48,   vs0,    vs18            // real*real, imag*real
+       xvmuldp         vs49,   vs0,    vs19            // real*imag, imag*imag
+       xvmuldp         vs50,   vs1,    vs18            // real*real, imag*real
+       xvmuldp         vs51,   vs1,    vs19            // real*imag, imag*imag
+       xvmuldp         vs52,   vs2,    vs18            // real*real, imag*real
+       xvmuldp         vs53,   vs2,    vs19            // real*imag, imag*imag
+       xvmuldp         vs54,   vs3,    vs18            // real*real, imag*real
+       xvmuldp         vs55,   vs3,    vs19            // real*imag, imag*imag
+       xvmuldp         vs56,   vs4,    vs18            // real*real, imag*real
+       xvmuldp         vs57,   vs4,    vs19            // real*imag, imag*imag
+       xvmuldp         vs58,   vs5,    vs18            // real*real, imag*real
+       xvmuldp         vs59,   vs5,    vs19            // real*imag, imag*imag
+       xvmuldp         vs60,   vs6,    vs18            // real*real, imag*real
+       xvmuldp         vs61,   vs6,    vs19            // real*imag, imag*imag
+       xvmuldp         vs62,   vs7,    vs18            // real*real, imag*real
+       xvmuldp         vs63,   vs7,    vs19            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x8_1
+
+
+       xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
+       xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs34,   vs1,    vs16            // real*real, imag*real
+       xvmaddadp       vs35,   vs1,    vs17            // real*imag, imag*imag
+
+       lxvdsx          vs22,   o16,    BO              // load real part from B
+       lxvdsx          vs23,   o24,    BO              // load imag part from B
+
+       xvmaddadp       vs36,   vs2,    vs16            // real*real, imag*real
+       xvmaddadp       vs37,   vs2,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs38,   vs3,    vs16            // real*real, imag*real
+       xvmaddadp       vs39,   vs3,    vs17            // real*imag, imag*imag
+
+       lxvd2x          vs8,    o0,     AO              // load real,imag from A
+       lxvd2x          vs9,    o16,    AO              // load real,imag from A
+
+       xvmaddadp       vs40,   vs4,    vs16            // real*real, imag*real
+       xvmaddadp       vs41,   vs4,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs42,   vs5,    vs16            // real*real, imag*real
+       xvmaddadp       vs43,   vs5,    vs17            // real*imag, imag*imag
+
+       lxvd2x          vs10,   o32,    AO              // load real,imag from A
+       lxvd2x          vs11,   o48,    AO              // load real,imag from A
+
+       xvmaddadp       vs44,   vs6,    vs16            // real*real, imag*real
+       xvmaddadp       vs45,   vs6,    vs17            // real*imag, imag*imag
+
+       addi            AO,     AO,     64
+
+       xvmaddadp       vs46,   vs7,    vs16            // real*real, imag*real
+       xvmaddadp       vs47,   vs7,    vs17            // real*imag, imag*imag
+
+       xvmaddadp       vs48,   vs0,    vs18            // real*real, imag*real
+       xvmaddadp       vs49,   vs0,    vs19            // real*imag, imag*imag
+       xvmaddadp       vs50,   vs1,    vs18            // real*real, imag*real
+       xvmaddadp       vs51,   vs1,    vs19            // real*imag, imag*imag
+
+       lxvd2x          vs12,   o0,     AO              // load real,imag from A
+       lxvd2x          vs13,   o16,    AO              // load real,imag from A
+
+       xvmaddadp       vs52,   vs2,    vs18            // real*real, imag*real
+       xvmaddadp       vs53,   vs2,    vs19            // real*imag, imag*imag
+       xvmaddadp       vs54,   vs3,    vs18            // real*real, imag*real
+       xvmaddadp       vs55,   vs3,    vs19            // real*imag, imag*imag
+
+       lxvd2x          vs14,   o32,    AO              // load real,imag from A
+       lxvd2x          vs15,   o48,    AO              // load real,imag from A
+
+       xvmaddadp       vs56,   vs4,    vs18            // real*real, imag*real
+       xvmaddadp       vs57,   vs4,    vs19            // real*imag, imag*imag
+       xvmaddadp       vs58,   vs5,    vs18            // real*real, imag*real
+       xvmaddadp       vs59,   vs5,    vs19            // real*imag, imag*imag
+
+       lxvdsx          vs20,   o0,     BO              // load real part from B
+       lxvdsx          vs21,   o8,     BO              // load imag part from B
+
+       xvmaddadp       vs60,   vs6,    vs18            // real*real, imag*real
+       xvmaddadp       vs61,   vs6,    vs19            // real*imag, imag*imag
+       xvmaddadp       vs62,   vs7,    vs18            // real*real, imag*real
+       xvmaddadp       vs63,   vs7,    vs19            // real*imag, imag*imag
+
+       addi            AO,     AO,     64
+       addi            BO,     BO,     32
+
+.endm
+
+.macro KERNEL2x8_2
+
+
+       xvmaddadp       vs32,   vs8,    vs20            // real*real, imag*real
+       xvmaddadp       vs33,   vs8,    vs21            // real*imag, imag*imag
+       xvmaddadp       vs34,   vs9,    vs20            // real*real, imag*real
+       xvmaddadp       vs35,   vs9,    vs21            // real*imag, imag*imag
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+
+       xvmaddadp       vs36,   vs10,   vs20            // real*real, imag*real
+       xvmaddadp       vs37,   vs10,   vs21            // real*imag, imag*imag
+       xvmaddadp       vs38,   vs11,   vs20            // real*real, imag*real
+       xvmaddadp       vs39,   vs11,   vs21            // real*imag, imag*imag
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+       lxvd2x          vs1,    o16,    AO              // load real,imag from A
+
+       xvmaddadp       vs40,   vs12,   vs20            // real*real, imag*real
+       xvmaddadp       vs41,   vs12,   vs21            // real*imag, imag*imag
+       xvmaddadp       vs42,   vs13,   vs20            // real*real, imag*real
+       xvmaddadp       vs43,   vs13,   vs21            // real*imag, imag*imag
+
+       lxvd2x          vs2,    o32,    AO              // load real,imag from A
+       lxvd2x          vs3,    o48,    AO              // load real,imag from A
+
+       xvmaddadp       vs44,   vs14,   vs20            // real*real, imag*real
+       xvmaddadp       vs45,   vs14,   vs21            // real*imag, imag*imag
+       xvmaddadp       vs46,   vs15,   vs20            // real*real, imag*real
+       xvmaddadp       vs47,   vs15,   vs21            // real*imag, imag*imag
+
+       addi            AO,     AO,     64
+
+       xvmaddadp       vs48,   vs8,    vs22            // real*real, imag*real
+       xvmaddadp       vs49,   vs8,    vs23            // real*imag, imag*imag
+       xvmaddadp       vs50,   vs9,    vs22            // real*real, imag*real
+       xvmaddadp       vs51,   vs9,    vs23            // real*imag, imag*imag
+
+       lxvd2x          vs4,    o0,     AO              // load real,imag from A
+       lxvd2x          vs5,    o16,    AO              // load real,imag from A
+
+       xvmaddadp       vs52,   vs10,   vs22            // real*real, imag*real
+       xvmaddadp       vs53,   vs10,   vs23            // real*imag, imag*imag
+       xvmaddadp       vs54,   vs11,   vs22            // real*real, imag*real
+       xvmaddadp       vs55,   vs11,   vs23            // real*imag, imag*imag
+
+       lxvd2x          vs6,    o32,    AO              // load real,imag from A
+       lxvd2x          vs7,    o48,    AO              // load real,imag from A
+
+       xvmaddadp       vs56,   vs12,   vs22            // real*real, imag*real
+       xvmaddadp       vs57,   vs12,   vs23            // real*imag, imag*imag
+       xvmaddadp       vs58,   vs13,   vs22            // real*real, imag*real
+       xvmaddadp       vs59,   vs13,   vs23            // real*imag, imag*imag
+
+       lxvdsx          vs18,   o16,    BO              // load real part from B
+       lxvdsx          vs19,   o24,    BO              // load imag part from B
+
+       xvmaddadp       vs60,   vs14,   vs22            // real*real, imag*real
+       xvmaddadp       vs61,   vs14,   vs23            // real*imag, imag*imag
+       xvmaddadp       vs62,   vs15,   vs22            // real*real, imag*real
+       xvmaddadp       vs63,   vs15,   vs23            // real*imag, imag*imag
+
+       addi            AO,     AO,     64
+       addi            BO,     BO,     32
+
+.endm
+
+.macro KERNEL2x8_E2
+
+
+       xvmaddadp       vs32,   vs8,    vs20            // real*real, imag*real
+       xvmaddadp       vs33,   vs8,    vs21            // real*imag, imag*imag
+       xvmaddadp       vs34,   vs9,    vs20            // real*real, imag*real
+       xvmaddadp       vs35,   vs9,    vs21            // real*imag, imag*imag
+       xvmaddadp       vs36,   vs10,   vs20            // real*real, imag*real
+       xvmaddadp       vs37,   vs10,   vs21            // real*imag, imag*imag
+       xvmaddadp       vs38,   vs11,   vs20            // real*real, imag*real
+       xvmaddadp       vs39,   vs11,   vs21            // real*imag, imag*imag
+       xvmaddadp       vs40,   vs12,   vs20            // real*real, imag*real
+       xvmaddadp       vs41,   vs12,   vs21            // real*imag, imag*imag
+       xvmaddadp       vs42,   vs13,   vs20            // real*real, imag*real
+       xvmaddadp       vs43,   vs13,   vs21            // real*imag, imag*imag
+       xvmaddadp       vs44,   vs14,   vs20            // real*real, imag*real
+       xvmaddadp       vs45,   vs14,   vs21            // real*imag, imag*imag
+       xvmaddadp       vs46,   vs15,   vs20            // real*real, imag*real
+       xvmaddadp       vs47,   vs15,   vs21            // real*imag, imag*imag
+
+       xvmaddadp       vs48,   vs8,    vs22            // real*real, imag*real
+       xvmaddadp       vs49,   vs8,    vs23            // real*imag, imag*imag
+       xvmaddadp       vs50,   vs9,    vs22            // real*real, imag*real
+       xvmaddadp       vs51,   vs9,    vs23            // real*imag, imag*imag
+       xvmaddadp       vs52,   vs10,   vs22            // real*real, imag*real
+       xvmaddadp       vs53,   vs10,   vs23            // real*imag, imag*imag
+       xvmaddadp       vs54,   vs11,   vs22            // real*real, imag*real
+       xvmaddadp       vs55,   vs11,   vs23            // real*imag, imag*imag
+       xvmaddadp       vs56,   vs12,   vs22            // real*real, imag*real
+       xvmaddadp       vs57,   vs12,   vs23            // real*imag, imag*imag
+       xvmaddadp       vs58,   vs13,   vs22            // real*real, imag*real
+       xvmaddadp       vs59,   vs13,   vs23            // real*imag, imag*imag
+       xvmaddadp       vs60,   vs14,   vs22            // real*real, imag*real
+       xvmaddadp       vs61,   vs14,   vs23            // real*imag, imag*imag
+       xvmaddadp       vs62,   vs15,   vs22            // real*real, imag*real
+       xvmaddadp       vs63,   vs15,   vs23            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x8_SUBI1
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+       lxvd2x          vs1,    o16,    AO              // load real,imag from A
+       lxvd2x          vs2,    o32,    AO              // load real,imag from A
+       lxvd2x          vs3,    o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvd2x          vs4,    o0,     AO              // load real,imag from A
+       lxvd2x          vs5,    o16,    AO              // load real,imag from A
+       lxvd2x          vs6,    o32,    AO              // load real,imag from A
+       lxvd2x          vs7,    o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+       lxvdsx          vs18,   o16,    BO              // load real part from B
+       lxvdsx          vs19,   o24,    BO              // load imag part from B
+
+       addi            BO,     BO,     32
+
+       xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
+       xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
+       xvmuldp         vs34,   vs1,    vs16            // real*real, imag*real
+       xvmuldp         vs35,   vs1,    vs17            // real*imag, imag*imag
+       xvmuldp         vs36,   vs2,    vs16            // real*real, imag*real
+       xvmuldp         vs37,   vs2,    vs17            // real*imag, imag*imag
+       xvmuldp         vs38,   vs3,    vs16            // real*real, imag*real
+       xvmuldp         vs39,   vs3,    vs17            // real*imag, imag*imag
+       xvmuldp         vs40,   vs4,    vs16            // real*real, imag*real
+       xvmuldp         vs41,   vs4,    vs17            // real*imag, imag*imag
+       xvmuldp         vs42,   vs5,    vs16            // real*real, imag*real
+       xvmuldp         vs43,   vs5,    vs17            // real*imag, imag*imag
+       xvmuldp         vs44,   vs6,    vs16            // real*real, imag*real
+       xvmuldp         vs45,   vs6,    vs17            // real*imag, imag*imag
+       xvmuldp         vs46,   vs7,    vs16            // real*real, imag*real
+       xvmuldp         vs47,   vs7,    vs17            // real*imag, imag*imag
+
+       xvmuldp         vs48,   vs0,    vs18            // real*real, imag*real
+       xvmuldp         vs49,   vs0,    vs19            // real*imag, imag*imag
+       xvmuldp         vs50,   vs1,    vs18            // real*real, imag*real
+       xvmuldp         vs51,   vs1,    vs19            // real*imag, imag*imag
+       xvmuldp         vs52,   vs2,    vs18            // real*real, imag*real
+       xvmuldp         vs53,   vs2,    vs19            // real*imag, imag*imag
+       xvmuldp         vs54,   vs3,    vs18            // real*real, imag*real
+       xvmuldp         vs55,   vs3,    vs19            // real*imag, imag*imag
+       xvmuldp         vs56,   vs4,    vs18            // real*real, imag*real
+       xvmuldp         vs57,   vs4,    vs19            // real*imag, imag*imag
+       xvmuldp         vs58,   vs5,    vs18            // real*real, imag*real
+       xvmuldp         vs59,   vs5,    vs19            // real*imag, imag*imag
+       xvmuldp         vs60,   vs6,    vs18            // real*real, imag*real
+       xvmuldp         vs61,   vs6,    vs19            // real*imag, imag*imag
+       xvmuldp         vs62,   vs7,    vs18            // real*real, imag*real
+       xvmuldp         vs63,   vs7,    vs19            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x8_SUB1
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+       lxvd2x          vs1,    o16,    AO              // load real,imag from A
+       lxvd2x          vs2,    o32,    AO              // load real,imag from A
+       lxvd2x          vs3,    o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvd2x          vs4,    o0,     AO              // load real,imag from A
+       lxvd2x          vs5,    o16,    AO              // load real,imag from A
+       lxvd2x          vs6,    o32,    AO              // load real,imag from A
+       lxvd2x          vs7,    o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+       lxvdsx          vs18,   o16,    BO              // load real part from B
+       lxvdsx          vs19,   o24,    BO              // load imag part from B
+
+       addi            BO,     BO,     32
+
+       xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
+       xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs34,   vs1,    vs16            // real*real, imag*real
+       xvmaddadp       vs35,   vs1,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs36,   vs2,    vs16            // real*real, imag*real
+       xvmaddadp       vs37,   vs2,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs38,   vs3,    vs16            // real*real, imag*real
+       xvmaddadp       vs39,   vs3,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs40,   vs4,    vs16            // real*real, imag*real
+       xvmaddadp       vs41,   vs4,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs42,   vs5,    vs16            // real*real, imag*real
+       xvmaddadp       vs43,   vs5,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs44,   vs6,    vs16            // real*real, imag*real
+       xvmaddadp       vs45,   vs6,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs46,   vs7,    vs16            // real*real, imag*real
+       xvmaddadp       vs47,   vs7,    vs17            // real*imag, imag*imag
+
+       xvmaddadp       vs48,   vs0,    vs18            // real*real, imag*real
+       xvmaddadp       vs49,   vs0,    vs19            // real*imag, imag*imag
+       xvmaddadp       vs50,   vs1,    vs18            // real*real, imag*real
+       xvmaddadp       vs51,   vs1,    vs19            // real*imag, imag*imag
+       xvmaddadp       vs52,   vs2,    vs18            // real*real, imag*real
+       xvmaddadp       vs53,   vs2,    vs19            // real*imag, imag*imag
+       xvmaddadp       vs54,   vs3,    vs18            // real*real, imag*real
+       xvmaddadp       vs55,   vs3,    vs19            // real*imag, imag*imag
+       xvmaddadp       vs56,   vs4,    vs18            // real*real, imag*real
+       xvmaddadp       vs57,   vs4,    vs19            // real*imag, imag*imag
+       xvmaddadp       vs58,   vs5,    vs18            // real*real, imag*real
+       xvmaddadp       vs59,   vs5,    vs19            // real*imag, imag*imag
+       xvmaddadp       vs60,   vs6,    vs18            // real*real, imag*real
+       xvmaddadp       vs61,   vs6,    vs19            // real*imag, imag*imag
+       xvmaddadp       vs62,   vs7,    vs18            // real*real, imag*real
+       xvmaddadp       vs63,   vs7,    vs19            // real*imag, imag*imag
+
+
+.endm
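+
+/* SAVE2x8 reduces each accumulator pair (vs32/vs33 ... vs62/vs63) to one
+ * complex value: the xxswapd/XSFADD sequence combines the partial products
+ * with the signs selected above, the xsmuldp/xssubdp/xsadddp sequence
+ * applies alpha (re*alpha_r - im*alpha_i, re*alpha_i + im*alpha_r), and
+ * xxpermdi repacks real and imaginary part into one vector.  When
+ * TRMMKERNEL is defined the existing C values are not loaded, so the
+ * results overwrite C instead of being accumulated into it. */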
+
+.macro SAVE2x8
+
+
+       mr              T1,     CO
+       addi            T2,     T1,     64
+
+#ifndef TRMMKERNEL
+
+       lxvd2x          vs16,   o0,     T1
+       lxvd2x          vs17,   o16,    T1
+       lxvd2x          vs18,   o32,    T1
+       lxvd2x          vs19,   o48,    T1
+       lxvd2x          vs20,   o0,     T2
+       lxvd2x          vs21,   o16,    T2
+       lxvd2x          vs22,   o32,    T2
+       lxvd2x          vs23,   o48,    T2
+
+#endif
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs33,   vs33                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs32            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs33            // imagA*imagB
+
+       xxswapd         vs32,   vs32                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs33,   vs33                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs32            // imagA*realB
+       XSFADD_I2       vs1,    vs1,    vs33            // realA*imagB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs8,    vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs35,   vs35                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs34            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs35            // imagA*imagB
+
+       xxswapd         vs34,   vs34                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs35,   vs35                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs34            // imagA*realB
+       XSFADD_I2       vs1,    vs1,    vs35            // realA*imagB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs9,    vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs37,   vs37                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs36            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs37            // imagA*imagB
+
+       xxswapd         vs36,   vs36                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs37,   vs37                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs36            // imagA*realB
+       XSFADD_I2       vs1,    vs1,    vs37            // realA*imagB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs10,   vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs39,   vs39                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs38            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs39            // imagA*imagB
+
+       xxswapd         vs38,   vs38                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs39,   vs39                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs38            // imagA*realB
+       XSFADD_I2       vs1,    vs1,    vs39            // realA*imagB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs11,   vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs41,   vs41                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs40            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs41            // imagA*imagB
+
+       xxswapd         vs40,   vs40                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs41,   vs41                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs40            // imagA*realB
+       XSFADD_I2       vs1,    vs1,    vs41            // realA*imagB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs12,   vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs43,   vs43                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs42            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs43            // imagA*imagB
+
+       xxswapd         vs42,   vs42                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs43,   vs43                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs42            // realA*imagB
+       XSFADD_I2       vs1,    vs1,    vs43            // imagA*realB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs13,   vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs45,   vs45                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs44            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs45            // imagA*imagB
+
+       xxswapd         vs44,   vs44                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs45,   vs45                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs44            // realA*imagB
+       XSFADD_I2       vs1,    vs1,    vs45            // imagA*realB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs14,   vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs47,   vs47                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs46            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs47            // imagA*imagB
+
+       xxswapd         vs46,   vs46                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs47,   vs47                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs46            // realA*imagB
+       XSFADD_I2       vs1,    vs1,    vs47            // imagA*realB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs15,   vs2,    vs3,    0       // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+       xvadddp         vs8,    vs8,    vs16
+       xvadddp         vs9,    vs9,    vs17
+       xvadddp         vs10,   vs10,   vs18
+       xvadddp         vs11,   vs11,   vs19
+       xvadddp         vs12,   vs12,   vs20
+       xvadddp         vs13,   vs13,   vs21
+       xvadddp         vs14,   vs14,   vs22
+       xvadddp         vs15,   vs15,   vs23
+
+#endif
+
+       stxvd2x         vs8,    o0,     T1
+       stxvd2x         vs9,    o16,    T1
+       stxvd2x         vs10,   o32,    T1
+       stxvd2x         vs11,   o48,    T1
+       stxvd2x         vs12,   o0,     T2
+       stxvd2x         vs13,   o16,    T2
+       stxvd2x         vs14,   o32,    T2
+       stxvd2x         vs15,   o48,    T2
+
+       add             T1,     T1,     LDC
+       add             T2,     T2,     LDC
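+       // second column of C for this block: the same reduction is applied to
+       // vs48..vs63 below and the results are written at CO+LDC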
+
+#ifndef TRMMKERNEL
+
+       lxvd2x          vs16,   o0,     T1
+       lxvd2x          vs17,   o16,    T1
+       lxvd2x          vs18,   o32,    T1
+       lxvd2x          vs19,   o48,    T1
+       lxvd2x          vs20,   o0,     T2
+       lxvd2x          vs21,   o16,    T2
+       lxvd2x          vs22,   o32,    T2
+       lxvd2x          vs23,   o48,    T2
+
+#endif
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs49,   vs49                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs48            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs49            // imagA*imagB
+
+       xxswapd         vs48,   vs48                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs49,   vs49                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs48            // realA*imagB
+       XSFADD_I2       vs1,    vs1,    vs49            // imagA*realB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs8,    vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs51,   vs51                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs50            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs51            // imagA*imagB
+
+       xxswapd         vs50,   vs50                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs51,   vs51                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs50            // realA*imagB
+       XSFADD_I2       vs1,    vs1,    vs51            // imagA*realB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs9,    vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs53,   vs53                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs52            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs53            // imagA*imagB
+
+       xxswapd         vs52,   vs52                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs53,   vs53                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs52            // realA*imagB
+       XSFADD_I2       vs1,    vs1,    vs53            // imagA*realB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs10,   vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs55,   vs55                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs54            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs55            // imagA*imagB
+
+       xxswapd         vs54,   vs54                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs55,   vs55                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs54            // realA*imagB
+       XSFADD_I2       vs1,    vs1,    vs55            // imagA*realB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs11,   vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs57,   vs57                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs56            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs57            // imagA*imagB
+
+       xxswapd         vs56,   vs56                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs57,   vs57                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs56            // realA*imagB
+       XSFADD_I2       vs1,    vs1,    vs57            // imagA*realB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs12,   vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs59,   vs59                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs58            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs59            // imagA*imagB
+
+       xxswapd         vs58,   vs58                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs59,   vs59                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs58            // realA*imagB
+       XSFADD_I2       vs1,    vs1,    vs59            // imagA*realB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs13,   vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs61,   vs61                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs60            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs61            // imagA*imagB
+
+       xxswapd         vs60,   vs60                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs61,   vs61                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs60            // realA*imagB
+       XSFADD_I2       vs1,    vs1,    vs61            // imagA*realB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs14,   vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs63,   vs63                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs62            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs63            // imagA*imagB
+
+       xxswapd         vs62,   vs62                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs63,   vs63                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs62            // realA*imagB
+       XSFADD_I2       vs1,    vs1,    vs63            // imagA*realB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs15,   vs2,    vs3,    0       // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+       xvadddp         vs8,    vs8,    vs16
+       xvadddp         vs9,    vs9,    vs17
+       xvadddp         vs10,   vs10,   vs18
+       xvadddp         vs11,   vs11,   vs19
+       xvadddp         vs12,   vs12,   vs20
+       xvadddp         vs13,   vs13,   vs21
+       xvadddp         vs14,   vs14,   vs22
+       xvadddp         vs15,   vs15,   vs23
+
+#endif
+
+       stxvd2x         vs8,    o0,     T1
+       stxvd2x         vs9,    o16,    T1
+       stxvd2x         vs10,   o32,    T1
+       stxvd2x         vs11,   o48,    T1
+       stxvd2x         vs12,   o0,     T2
+       stxvd2x         vs13,   o16,    T2
+       stxvd2x         vs14,   o32,    T2
+       stxvd2x         vs15,   o48,    T2
+
+       add             T1,     T1,     LDC
+       add             T2,     T2,     LDC
+       addi            CO,     CO,     128
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=4
+**********************************************************************************************/
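+
+// The 2x4 macros compute, per K step, the update (sketch in C-like notation,
+// ignoring conjugation handling):
+//   for (i = 0; i < 4; i++)
+//     for (j = 0; j < 2; j++)
+//       acc[i][j] += A[i] * B[j];   // complex multiply-accumulate
+// LOAD2x4_1 preloads the operands, KERNEL2x4_I1/_1/_2/_E2 form the
+// software-pipelined main loop, KERNEL2x4_SUBI1/_SUB1 handle the K remainder,
+// and SAVE2x4 applies alpha and writes the 4x2 result block to C.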
+
+.macro LOAD2x4_1
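+
+       // preload 2 complex elements of B as real/imag splats (vs16-vs19)
+       // and 4 complex elements of A (vs0-vs3) for the pipelined loop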
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+       lxvdsx          vs18,   o16,    BO              // load real part from B
+       lxvdsx          vs19,   o24,    BO              // load imag part from B
+
+       addi            BO,     BO,     32
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+       lxvd2x          vs1,    o16,    AO              // load real,imag from A
+       lxvd2x          vs2,    o32,    AO              // load real,imag from A
+       lxvd2x          vs3,    o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+
+.endm
+
+.macro KERNEL2x4_I1
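+
+       // first pipelined step: start the next loads into vs8-vs11/vs20-vs23
+       // and initialize the accumulators vs32-vs47 with xvmuldp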
+
+       lxvd2x          vs8,    o0,     AO              // load real,imag from A
+       lxvd2x          vs9,    o16,    AO              // load real,imag from A
+       lxvd2x          vs10,   o32,    AO              // load real,imag from A
+       lxvd2x          vs11,   o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvdsx          vs20,   o0,     BO              // load real part from B
+       lxvdsx          vs21,   o8,     BO              // load imag part from B
+       lxvdsx          vs22,   o16,    BO              // load real part from B
+       lxvdsx          vs23,   o24,    BO              // load imag part from B
+
+       addi            BO,     BO,     32
+
+       xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
+       xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
+       xvmuldp         vs34,   vs1,    vs16            // real*real, imag*real
+       xvmuldp         vs35,   vs1,    vs17            // real*imag, imag*imag
+       xvmuldp         vs36,   vs2,    vs16            // real*real, imag*real
+       xvmuldp         vs37,   vs2,    vs17            // real*imag, imag*imag
+       xvmuldp         vs38,   vs3,    vs16            // real*real, imag*real
+       xvmuldp         vs39,   vs3,    vs17            // real*imag, imag*imag
+
+       xvmuldp         vs40,   vs0,    vs18            // real*real, imag*real
+       xvmuldp         vs41,   vs0,    vs19            // real*imag, imag*imag
+       xvmuldp         vs42,   vs1,    vs18            // real*real, imag*real
+       xvmuldp         vs43,   vs1,    vs19            // real*imag, imag*imag
+       xvmuldp         vs44,   vs2,    vs18            // real*real, imag*real
+       xvmuldp         vs45,   vs2,    vs19            // real*imag, imag*imag
+       xvmuldp         vs46,   vs3,    vs18            // real*real, imag*real
+       xvmuldp         vs47,   vs3,    vs19            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x4_1
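+
+       // even pipelined step: accumulate from vs0-vs3/vs16-vs19 while
+       // loading the next operands into vs8-vs11/vs20-vs23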
+
+       lxvd2x          vs8,    o0,     AO              // load real,imag from A
+       lxvd2x          vs9,    o16,    AO              // load real,imag from A
+       lxvd2x          vs10,   o32,    AO              // load real,imag from A
+       lxvd2x          vs11,   o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvdsx          vs20,   o0,     BO              // load real part from B
+       lxvdsx          vs21,   o8,     BO              // load imag part from B
+       lxvdsx          vs22,   o16,    BO              // load real part from B
+       lxvdsx          vs23,   o24,    BO              // load imag part from B
+
+       addi            BO,     BO,     32
+
+       xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
+       xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs34,   vs1,    vs16            // real*real, imag*real
+       xvmaddadp       vs35,   vs1,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs36,   vs2,    vs16            // real*real, imag*real
+       xvmaddadp       vs37,   vs2,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs38,   vs3,    vs16            // real*real, imag*real
+       xvmaddadp       vs39,   vs3,    vs17            // real*imag, imag*imag
+
+       xvmaddadp       vs40,   vs0,    vs18            // real*real, imag*real
+       xvmaddadp       vs41,   vs0,    vs19            // real*imag, imag*imag
+       xvmaddadp       vs42,   vs1,    vs18            // real*real, imag*real
+       xvmaddadp       vs43,   vs1,    vs19            // real*imag, imag*imag
+       xvmaddadp       vs44,   vs2,    vs18            // real*real, imag*real
+       xvmaddadp       vs45,   vs2,    vs19            // real*imag, imag*imag
+       xvmaddadp       vs46,   vs3,    vs18            // real*real, imag*real
+       xvmaddadp       vs47,   vs3,    vs19            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x4_2
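+
+       // odd pipelined step: accumulate from vs8-vs11/vs20-vs23 while
+       // reloading vs0-vs3/vs16-vs19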
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+       lxvd2x          vs1,    o16,    AO              // load real,imag from A
+       lxvd2x          vs2,    o32,    AO              // load real,imag from A
+       lxvd2x          vs3,    o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+       lxvdsx          vs18,   o16,    BO              // load real part from B
+       lxvdsx          vs19,   o24,    BO              // load imag part from B
+
+       addi            BO,     BO,     32
+
+       xvmaddadp       vs32,   vs8,    vs20            // real*real, imag*real
+       xvmaddadp       vs33,   vs8,    vs21            // real*imag, imag*imag
+       xvmaddadp       vs34,   vs9,    vs20            // real*real, imag*real
+       xvmaddadp       vs35,   vs9,    vs21            // real*imag, imag*imag
+       xvmaddadp       vs36,   vs10,   vs20            // real*real, imag*real
+       xvmaddadp       vs37,   vs10,   vs21            // real*imag, imag*imag
+       xvmaddadp       vs38,   vs11,   vs20            // real*real, imag*real
+       xvmaddadp       vs39,   vs11,   vs21            // real*imag, imag*imag
+
+       xvmaddadp       vs40,   vs8,    vs22            // real*real, imag*real
+       xvmaddadp       vs41,   vs8,    vs23            // real*imag, imag*imag
+       xvmaddadp       vs42,   vs9,    vs22            // real*real, imag*real
+       xvmaddadp       vs43,   vs9,    vs23            // real*imag, imag*imag
+       xvmaddadp       vs44,   vs10,   vs22            // real*real, imag*real
+       xvmaddadp       vs45,   vs10,   vs23            // real*imag, imag*imag
+       xvmaddadp       vs46,   vs11,   vs22            // real*real, imag*real
+       xvmaddadp       vs47,   vs11,   vs23            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x4_E2
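+
+       // loop drain: final accumulation from vs8-vs11/vs20-vs23, no new loads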
+
+
+       xvmaddadp       vs32,   vs8,    vs20            // real*real, imag*real
+       xvmaddadp       vs33,   vs8,    vs21            // real*imag, imag*imag
+       xvmaddadp       vs34,   vs9,    vs20            // real*real, imag*real
+       xvmaddadp       vs35,   vs9,    vs21            // real*imag, imag*imag
+       xvmaddadp       vs36,   vs10,   vs20            // real*real, imag*real
+       xvmaddadp       vs37,   vs10,   vs21            // real*imag, imag*imag
+       xvmaddadp       vs38,   vs11,   vs20            // real*real, imag*real
+       xvmaddadp       vs39,   vs11,   vs21            // real*imag, imag*imag
+
+       xvmaddadp       vs40,   vs8,    vs22            // real*real, imag*real
+       xvmaddadp       vs41,   vs8,    vs23            // real*imag, imag*imag
+       xvmaddadp       vs42,   vs9,    vs22            // real*real, imag*real
+       xvmaddadp       vs43,   vs9,    vs23            // real*imag, imag*imag
+       xvmaddadp       vs44,   vs10,   vs22            // real*real, imag*real
+       xvmaddadp       vs45,   vs10,   vs23            // real*imag, imag*imag
+       xvmaddadp       vs46,   vs11,   vs22            // real*real, imag*real
+       xvmaddadp       vs47,   vs11,   vs23            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x4_SUBI1
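+
+       // stand-alone first iteration for the K tail: load operands and
+       // initialize the accumulators with xvmuldp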
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+       lxvd2x          vs1,    o16,    AO              // load real,imag from A
+       lxvd2x          vs2,    o32,    AO              // load real,imag from A
+       lxvd2x          vs3,    o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+       lxvdsx          vs18,   o16,    BO              // load real part from B
+       lxvdsx          vs19,   o24,    BO              // load imag part from B
+
+       addi            BO,     BO,     32
+
+       xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
+       xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
+       xvmuldp         vs34,   vs1,    vs16            // real*real, imag*real
+       xvmuldp         vs35,   vs1,    vs17            // real*imag, imag*imag
+       xvmuldp         vs36,   vs2,    vs16            // real*real, imag*real
+       xvmuldp         vs37,   vs2,    vs17            // real*imag, imag*imag
+       xvmuldp         vs38,   vs3,    vs16            // real*real, imag*real
+       xvmuldp         vs39,   vs3,    vs17            // real*imag, imag*imag
+
+       xvmuldp         vs40,   vs0,    vs18            // real*real, imag*real
+       xvmuldp         vs41,   vs0,    vs19            // real*imag, imag*imag
+       xvmuldp         vs42,   vs1,    vs18            // real*real, imag*real
+       xvmuldp         vs43,   vs1,    vs19            // real*imag, imag*imag
+       xvmuldp         vs44,   vs2,    vs18            // real*real, imag*real
+       xvmuldp         vs45,   vs2,    vs19            // real*imag, imag*imag
+       xvmuldp         vs46,   vs3,    vs18            // real*real, imag*real
+       xvmuldp         vs47,   vs3,    vs19            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x4_SUB1
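+
+       // stand-alone iteration for the K tail: load operands and accumulate
+       // with xvmaddadp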
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+       lxvd2x          vs1,    o16,    AO              // load real,imag from A
+       lxvd2x          vs2,    o32,    AO              // load real,imag from A
+       lxvd2x          vs3,    o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+       lxvdsx          vs18,   o16,    BO              // load real part from B
+       lxvdsx          vs19,   o24,    BO              // load imag part from B
+
+       addi            BO,     BO,     32
+
+       xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
+       xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs34,   vs1,    vs16            // real*real, imag*real
+       xvmaddadp       vs35,   vs1,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs36,   vs2,    vs16            // real*real, imag*real
+       xvmaddadp       vs37,   vs2,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs38,   vs3,    vs16            // real*real, imag*real
+       xvmaddadp       vs39,   vs3,    vs17            // real*imag, imag*imag
+
+       xvmaddadp       vs40,   vs0,    vs18            // real*real, imag*real
+       xvmaddadp       vs41,   vs0,    vs19            // real*imag, imag*imag
+       xvmaddadp       vs42,   vs1,    vs18            // real*real, imag*real
+       xvmaddadp       vs43,   vs1,    vs19            // real*imag, imag*imag
+       xvmaddadp       vs44,   vs2,    vs18            // real*real, imag*real
+       xvmaddadp       vs45,   vs2,    vs19            // real*imag, imag*imag
+       xvmaddadp       vs46,   vs3,    vs18            // real*real, imag*real
+       xvmaddadp       vs47,   vs3,    vs19            // real*imag, imag*imag
+
+
+.endm
+
+.macro SAVE2x4
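+
+       // reduce each accumulator pair (vs32/vs33, vs34/vs35, ...) to one
+       // complex result; the XSFADD_* macros select the signs for the
+       // conjugation variant. The real/imag sums are then scaled by alpha
+       // (real*alpha_r - imag*alpha_i, real*alpha_i + imag*alpha_r).
+       // Unless TRMMKERNEL is defined, the existing C values are added
+       // before the store; CO advances by 64 bytes (4 complex doubles).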
+
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+
+       lxvd2x          vs16,   o0,     T1
+       lxvd2x          vs17,   o16,    T1
+       lxvd2x          vs18,   o32,    T1
+       lxvd2x          vs19,   o48,    T1
+
+#endif
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs33,   vs33                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs32            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs33            // imagA*imagB
+
+       xxswapd         vs32,   vs32                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs33,   vs33                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs32            // realA*imagB
+       XSFADD_I2       vs1,    vs1,    vs33            // imagA*realB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs8,    vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs35,   vs35                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs34            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs35            // imagA*imagB
+
+       xxswapd         vs34,   vs34                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs35,   vs35                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs34            // realA*imagB
+       XSFADD_I2       vs1,    vs1,    vs35            // imagA*realB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs9,    vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs37,   vs37                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs36            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs37            // imagA*imagB
+
+       xxswapd         vs36,   vs36                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs37,   vs37                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs36            // realA*imagB
+       XSFADD_I2       vs1,    vs1,    vs37            // imagA*realB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs10,   vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs39,   vs39                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs38            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs39            // imagA*imagB
+
+       xxswapd         vs38,   vs38                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs39,   vs39                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs38            // realA*imagB
+       XSFADD_I2       vs1,    vs1,    vs39            // imagA*realB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs11,   vs2,    vs3,    0       // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+       xvadddp         vs8,    vs8,    vs16
+       xvadddp         vs9,    vs9,    vs17
+       xvadddp         vs10,   vs10,   vs18
+       xvadddp         vs11,   vs11,   vs19
+
+#endif
+
+       stxvd2x         vs8,    o0,     T1
+       stxvd2x         vs9,    o16,    T1
+       stxvd2x         vs10,   o32,    T1
+       stxvd2x         vs11,   o48,    T1
+
+       add             T1,     T1,     LDC
+
+#ifndef TRMMKERNEL
+
+       lxvd2x          vs16,   o0,     T1
+       lxvd2x          vs17,   o16,    T1
+       lxvd2x          vs18,   o32,    T1
+       lxvd2x          vs19,   o48,    T1
+
+#endif
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs41,   vs41                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs40            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs41            // imagA*imagB
+
+       xxswapd         vs40,   vs40                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs41,   vs41                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs40            // realA*imagB
+       XSFADD_I2       vs1,    vs1,    vs41            // imagA*realB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs8,    vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs43,   vs43                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs42            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs43            // imagA*imagB
+
+       xxswapd         vs42,   vs42                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs43,   vs43                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs42            // realA*imagB
+       XSFADD_I2       vs1,    vs1,    vs43            // imagA*realB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs9,    vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs45,   vs45                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs44            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs45            // imagA*imagB
+
+       xxswapd         vs44,   vs44                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs45,   vs45                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs44            // realA*imagB
+       XSFADD_I2       vs1,    vs1,    vs45            // imagA*realB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs10,   vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs47,   vs47                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs46            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs47            // imagA*imagB
+
+       xxswapd         vs46,   vs46                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs47,   vs47                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs46            // realA*imagB
+       XSFADD_I2       vs1,    vs1,    vs47            // imagA*realB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs11,   vs2,    vs3,    0       // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+       xvadddp         vs8,    vs8,    vs16
+       xvadddp         vs9,    vs9,    vs17
+       xvadddp         vs10,   vs10,   vs18
+       xvadddp         vs11,   vs11,   vs19
+
+#endif
+
+       stxvd2x         vs8,    o0,     T1
+       stxvd2x         vs9,    o16,    T1
+       stxvd2x         vs10,   o32,    T1
+       stxvd2x         vs11,   o48,    T1
+
+       add             T1,     T1,     LDC
+       addi            CO,     CO,     64
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=2
+**********************************************************************************************/
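+
+// The 2x2 macros mirror the 2x4 group, operating on 2 complex elements of A
+// per step (vs0-vs1 / vs8-vs9) and accumulating into vs32-vs39.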
+
+.macro LOAD2x2_1
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+       lxvdsx          vs18,   o16,    BO              // load real part from B
+       lxvdsx          vs19,   o24,    BO              // load imag part from B
+
+       addi            BO,     BO,     32
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+       lxvd2x          vs1,    o16,    AO              // load real,imag from A
+
+       addi            AO,     AO,     32
+
+
+.endm
+
+.macro KERNEL2x2_I1
+
+       lxvd2x          vs8,    o0,     AO              // load real,imag from A
+       lxvd2x          vs9,    o16,    AO              // load real,imag from A
+
+       addi            AO,     AO,     32
+
+       lxvdsx          vs20,   o0,     BO              // load real part from B
+       lxvdsx          vs21,   o8,     BO              // load imag part from B
+       lxvdsx          vs22,   o16,    BO              // load real part from B
+       lxvdsx          vs23,   o24,    BO              // load imag part from B
+
+       addi            BO,     BO,     32
+
+       xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
+       xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
+       xvmuldp         vs34,   vs1,    vs16            // real*real, imag*real
+       xvmuldp         vs35,   vs1,    vs17            // real*imag, imag*imag
+
+       xvmuldp         vs36,   vs0,    vs18            // real*real, imag*real
+       xvmuldp         vs37,   vs0,    vs19            // real*imag, imag*imag
+       xvmuldp         vs38,   vs1,    vs18            // real*real, imag*real
+       xvmuldp         vs39,   vs1,    vs19            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x2_1
+
+       lxvd2x          vs8,    o0,     AO              // load real,imag from A
+       lxvd2x          vs9,    o16,    AO              // load real,imag from A
+
+       addi            AO,     AO,     32
+
+       lxvdsx          vs20,   o0,     BO              // load real part from B
+       lxvdsx          vs21,   o8,     BO              // load imag part from B
+       lxvdsx          vs22,   o16,    BO              // load real part from B
+       lxvdsx          vs23,   o24,    BO              // load imag part from B
+
+       addi            BO,     BO,     32
+
+       xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
+       xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs34,   vs1,    vs16            // real*real, imag*real
+       xvmaddadp       vs35,   vs1,    vs17            // real*imag, imag*imag
+
+       xvmaddadp       vs36,   vs0,    vs18            // real*real, imag*real
+       xvmaddadp       vs37,   vs0,    vs19            // real*imag, imag*imag
+       xvmaddadp       vs38,   vs1,    vs18            // real*real, imag*real
+       xvmaddadp       vs39,   vs1,    vs19            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x2_2
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+       lxvd2x          vs1,    o16,    AO              // load real,imag from A
+
+       addi            AO,     AO,     32
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+       lxvdsx          vs18,   o16,    BO              // load real part from B
+       lxvdsx          vs19,   o24,    BO              // load imag part from B
+
+       addi            BO,     BO,     32
+
+       xvmaddadp       vs32,   vs8,    vs20            // real*real, imag*real
+       xvmaddadp       vs33,   vs8,    vs21            // real*imag, imag*imag
+       xvmaddadp       vs34,   vs9,    vs20            // real*real, imag*real
+       xvmaddadp       vs35,   vs9,    vs21            // real*imag, imag*imag
+
+       xvmaddadp       vs36,   vs8,    vs22            // real*real, imag*real
+       xvmaddadp       vs37,   vs8,    vs23            // real*imag, imag*imag
+       xvmaddadp       vs38,   vs9,    vs22            // real*real, imag*real
+       xvmaddadp       vs39,   vs9,    vs23            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x2_E2
+
+
+       xvmaddadp       vs32,   vs8,    vs20            // real*real, imag*real
+       xvmaddadp       vs33,   vs8,    vs21            // real*imag, imag*imag
+       xvmaddadp       vs34,   vs9,    vs20            // real*real, imag*real
+       xvmaddadp       vs35,   vs9,    vs21            // real*imag, imag*imag
+
+       xvmaddadp       vs36,   vs8,    vs22            // real*real, imag*real
+       xvmaddadp       vs37,   vs8,    vs23            // real*imag, imag*imag
+       xvmaddadp       vs38,   vs9,    vs22            // real*real, imag*real
+       xvmaddadp       vs39,   vs9,    vs23            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x2_SUBI1
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+       lxvd2x          vs1,    o16,    AO              // load real,imag from A
+
+       addi            AO,     AO,     32
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+       lxvdsx          vs18,   o16,    BO              // load real part from B
+       lxvdsx          vs19,   o24,    BO              // load imag part from B
+
+       addi            BO,     BO,     32
+
+       xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
+       xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
+       xvmuldp         vs34,   vs1,    vs16            // real*real, imag*real
+       xvmuldp         vs35,   vs1,    vs17            // real*imag, imag*imag
+
+       xvmuldp         vs36,   vs0,    vs18            // real*real, imag*real
+       xvmuldp         vs37,   vs0,    vs19            // real*imag, imag*imag
+       xvmuldp         vs38,   vs1,    vs18            // real*real, imag*real
+       xvmuldp         vs39,   vs1,    vs19            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x2_SUB1
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+       lxvd2x          vs1,    o16,    AO              // load real,imag from A
+
+       addi            AO,     AO,     32
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+       lxvdsx          vs18,   o16,    BO              // load real part from B
+       lxvdsx          vs19,   o24,    BO              // load imag part from B
+
+       addi            BO,     BO,     32
+
+       xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
+       xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs34,   vs1,    vs16            // real*real, imag*real
+       xvmaddadp       vs35,   vs1,    vs17            // real*imag, imag*imag
+
+       xvmaddadp       vs36,   vs0,    vs18            // real*real, imag*real
+       xvmaddadp       vs37,   vs0,    vs19            // real*imag, imag*imag
+       xvmaddadp       vs38,   vs1,    vs18            // real*real, imag*real
+       xvmaddadp       vs39,   vs1,    vs19            // real*imag, imag*imag
+
+
+.endm
+
+.macro SAVE2x2
+
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+
+       lxvd2x          vs16,   o0,     T1
+       lxvd2x          vs17,   o16,    T1
+
+#endif
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs33,   vs33                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs32            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs33            // imagA*imagB
+
+       xxswapd         vs32,   vs32                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs33,   vs33                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs32            // realA*imagB
+       XSFADD_I2       vs1,    vs1,    vs33            // imagA*realB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs8,    vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs35,   vs35                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs34            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs35            // imagA*imagB
+
+       xxswapd         vs34,   vs34                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs35,   vs35                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs34            // realA*imagB
+       XSFADD_I2       vs1,    vs1,    vs35            // imagA*realB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs9,    vs2,    vs3,    0       // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+       xvadddp         vs8,    vs8,    vs16
+       xvadddp         vs9,    vs9,    vs17
+
+#endif
+
+       stxvd2x         vs8,    o0,     T1
+       stxvd2x         vs9,    o16,    T1
+
+       add             T1,     T1,     LDC
+
+#ifndef TRMMKERNEL
+
+       lxvd2x          vs16,   o0,     T1
+       lxvd2x          vs17,   o16,    T1
+
+#endif
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs37,   vs37                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs36            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs37            // imagA*imagB
+
+       xxswapd         vs36,   vs36                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs37,   vs37                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs36            // realA*imagB
+       XSFADD_I2       vs1,    vs1,    vs37            // imagA*realB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs8,    vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs39,   vs39                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs38            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs39            // imagA*imagB
+
+       xxswapd         vs38,   vs38                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs39,   vs39                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs38            // realA*imagB
+       XSFADD_I2       vs1,    vs1,    vs39            // imagA*realB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs9,    vs2,    vs3,    0       // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+       xvadddp         vs8,    vs8,    vs16
+       xvadddp         vs9,    vs9,    vs17
+
+#endif
+
+       stxvd2x         vs8,    o0,     T1
+       stxvd2x         vs9,    o16,    T1
+
+       add             T1,     T1,     LDC
+       addi            CO,     CO,     32
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=1
+**********************************************************************************************/
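+
+// The 2x1 macros handle a single complex element of A per step (vs0 / vs8),
+// accumulating into vs32-vs35.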
+
+.macro LOAD2x1_1
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+       lxvdsx          vs18,   o16,    BO              // load real part from B
+       lxvdsx          vs19,   o24,    BO              // load imag part from B
+
+       addi            BO,     BO,     32
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+
+       addi            AO,     AO,     16
+
+
+.endm
+
+.macro KERNEL2x1_I1
+
+       lxvd2x          vs8,    o0,     AO              // load real,imag from A
+
+       addi            AO,     AO,     16
+
+       lxvdsx          vs20,   o0,     BO              // load real part from B
+       lxvdsx          vs21,   o8,     BO              // load imag part from B
+       lxvdsx          vs22,   o16,    BO              // load real part from B
+       lxvdsx          vs23,   o24,    BO              // load imag part from B
+
+       addi            BO,     BO,     32
+
+       xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
+       xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
+
+       xvmuldp         vs34,   vs0,    vs18            // real*real, imag*real
+       xvmuldp         vs35,   vs0,    vs19            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x1_1
+
+       lxvd2x          vs8,    o0,     AO              // load real,imag from A
+
+       addi            AO,     AO,     16
+
+       lxvdsx          vs20,   o0,     BO              // load real part from B
+       lxvdsx          vs21,   o8,     BO              // load imag part from B
+       lxvdsx          vs22,   o16,    BO              // load real part from B
+       lxvdsx          vs23,   o24,    BO              // load imag part from B
+
+       addi            BO,     BO,     32
+
+       xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
+       xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
+
+       xvmaddadp       vs34,   vs0,    vs18            // real*real, imag*real
+       xvmaddadp       vs35,   vs0,    vs19            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x1_2
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+
+       addi            AO,     AO,     16
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+       lxvdsx          vs18,   o16,    BO              // load real part from B
+       lxvdsx          vs19,   o24,    BO              // load imag part from B
+
+       addi            BO,     BO,     32
+
+       xvmaddadp       vs32,   vs8,    vs20            // real*real, imag*real
+       xvmaddadp       vs33,   vs8,    vs21            // real*imag, imag*imag
+
+       xvmaddadp       vs34,   vs8,    vs22            // real*real, imag*real
+       xvmaddadp       vs35,   vs8,    vs23            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x1_E2
+
+
+       xvmaddadp       vs32,   vs8,    vs20            // real*real, imag*real
+       xvmaddadp       vs33,   vs8,    vs21            // real*imag, imag*imag
+
+       xvmaddadp       vs34,   vs8,    vs22            // real*real, imag*real
+       xvmaddadp       vs35,   vs8,    vs23            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x1_SUBI1
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+
+       addi            AO,     AO,     16
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+       lxvdsx          vs18,   o16,    BO              // load real part from B
+       lxvdsx          vs19,   o24,    BO              // load imag part from B
+
+       addi            BO,     BO,     32
+
+       xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
+       xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
+
+       xvmuldp         vs34,   vs0,    vs18            // real*real, imag*real
+       xvmuldp         vs35,   vs0,    vs19            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x1_SUB1
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+
+       addi            AO,     AO,     16
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+       lxvdsx          vs18,   o16,    BO              // load real part from B
+       lxvdsx          vs19,   o24,    BO              // load imag part from B
+
+       addi            BO,     BO,     32
+
+       xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
+       xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
+
+       xvmaddadp       vs34,   vs0,    vs18            // real*real, imag*real
+       xvmaddadp       vs35,   vs0,    vs19            // real*imag, imag*imag
+
+
+.endm
+
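+// SAVE sketch (assumption: XSFADD_R1/R2/I1/I2 are defined by the including
+// kernel as xsadddp or xssubdp according to the conjugation variant): each
+// accumulator pair holds [ar*br, ai*br] and [ar*bi, ai*bi]; the xxswapd /
+// XSFADD sequence reduces them to scalar parts, roughly
+//     re = ar*br +/- ai*bi         im = ar*bi +/- ai*br
+// which are then scaled by alpha and merged:
+//     c_re = re*alpha_r - im*alpha_i
+//     c_im = re*alpha_i + im*alpha_r
+// and, unless TRMMKERNEL is defined, added to the values already in C.
+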
+.macro SAVE2x1
+
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+
+       lxvd2x          vs16,   o0,     T1
+
+#endif
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs33,   vs33                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs32            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs33            // imagA*imagB
+
+       xxswapd         vs32,   vs32                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs33,   vs33                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs32            // imagA*realB
+       XSFADD_I2       vs1,    vs1,    vs33            // realA*imagB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs8,    vs2,    vs3,    0       // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+       xvadddp         vs8,    vs8,    vs16
+
+#endif
+
+       stxvd2x         vs8,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+#ifndef TRMMKERNEL
+
+       lxvd2x          vs16,   o0,     T1
+
+#endif
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs35,   vs35                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs34            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs35            // imagA*imagB
+
+       xxswapd         vs34,   vs34                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs35,   vs35                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs34            // imagA*realB
+       XSFADD_I2       vs1,    vs1,    vs35            // realA*imagB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs8,    vs2,    vs3,    0       // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+       xvadddp         vs8,    vs8,    vs16
+
+#endif
+
+       stxvd2x         vs8,    o0,     T1
+
+       add             T1,     T1,     LDC
+       addi            CO,     CO,     16
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=8
+**********************************************************************************************/
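+
+// With N=1 only one column of B is live: a single real/imag splat pair
+// (vs16/vs17, or vs20/vs21 in the pipelined copy) multiplies eight complex
+// elements of A (vs0..vs7 / vs8..vs15) into the accumulators vs32..vs47.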
+
+.macro LOAD1x8_1
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+
+       addi            BO,     BO,     16
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+       lxvd2x          vs1,    o16,    AO              // load real,imag from A
+       lxvd2x          vs2,    o32,    AO              // load real,imag from A
+       lxvd2x          vs3,    o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvd2x          vs4,    o0,     AO              // load real,imag from A
+       lxvd2x          vs5,    o16,    AO              // load real,imag from A
+       lxvd2x          vs6,    o32,    AO              // load real,imag from A
+       lxvd2x          vs7,    o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+
+.endm
+
+.macro KERNEL1x8_I1
+
+       lxvd2x          vs8,    o0,     AO              // load real,imag from A
+       lxvd2x          vs9,    o16,    AO              // load real,imag from A
+       lxvd2x          vs10,   o32,    AO              // load real,imag from A
+       lxvd2x          vs11,   o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvd2x          vs12,   o0,     AO              // load real,imag from A
+       lxvd2x          vs13,   o16,    AO              // load real,imag from A
+       lxvd2x          vs14,   o32,    AO              // load real,imag from A
+       lxvd2x          vs15,   o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvdsx          vs20,   o0,     BO              // load real part from B
+       lxvdsx          vs21,   o8,     BO              // load imag part from B
+
+       addi            BO,     BO,     16
+
+       xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
+       xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
+       xvmuldp         vs34,   vs1,    vs16            // real*real, imag*real
+       xvmuldp         vs35,   vs1,    vs17            // real*imag, imag*imag
+       xvmuldp         vs36,   vs2,    vs16            // real*real, imag*real
+       xvmuldp         vs37,   vs2,    vs17            // real*imag, imag*imag
+       xvmuldp         vs38,   vs3,    vs16            // real*real, imag*real
+       xvmuldp         vs39,   vs3,    vs17            // real*imag, imag*imag
+       xvmuldp         vs40,   vs4,    vs16            // real*real, imag*real
+       xvmuldp         vs41,   vs4,    vs17            // real*imag, imag*imag
+       xvmuldp         vs42,   vs5,    vs16            // real*real, imag*real
+       xvmuldp         vs43,   vs5,    vs17            // real*imag, imag*imag
+       xvmuldp         vs44,   vs6,    vs16            // real*real, imag*real
+       xvmuldp         vs45,   vs6,    vs17            // real*imag, imag*imag
+       xvmuldp         vs46,   vs7,    vs16            // real*real, imag*real
+       xvmuldp         vs47,   vs7,    vs17            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x8_1
+
+       lxvd2x          vs8,    o0,     AO              // load real,imag from A
+       lxvd2x          vs9,    o16,    AO              // load real,imag from A
+       lxvd2x          vs10,   o32,    AO              // load real,imag from A
+       lxvd2x          vs11,   o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvd2x          vs12,   o0,     AO              // load real,imag from A
+       lxvd2x          vs13,   o16,    AO              // load real,imag from A
+       lxvd2x          vs14,   o32,    AO              // load real,imag from A
+       lxvd2x          vs15,   o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvdsx          vs20,   o0,     BO              // load real part from B
+       lxvdsx          vs21,   o8,     BO              // load imag part from B
+
+       addi            BO,     BO,     16
+
+       xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
+       xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs34,   vs1,    vs16            // real*real, imag*real
+       xvmaddadp       vs35,   vs1,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs36,   vs2,    vs16            // real*real, imag*real
+       xvmaddadp       vs37,   vs2,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs38,   vs3,    vs16            // real*real, imag*real
+       xvmaddadp       vs39,   vs3,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs40,   vs4,    vs16            // real*real, imag*real
+       xvmaddadp       vs41,   vs4,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs42,   vs5,    vs16            // real*real, imag*real
+       xvmaddadp       vs43,   vs5,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs44,   vs6,    vs16            // real*real, imag*real
+       xvmaddadp       vs45,   vs6,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs46,   vs7,    vs16            // real*real, imag*real
+       xvmaddadp       vs47,   vs7,    vs17            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x8_2
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+       lxvd2x          vs1,    o16,    AO              // load real,imag from A
+       lxvd2x          vs2,    o32,    AO              // load real,imag from A
+       lxvd2x          vs3,    o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvd2x          vs4,    o0,     AO              // load real,imag from A
+       lxvd2x          vs5,    o16,    AO              // load real,imag from A
+       lxvd2x          vs6,    o32,    AO              // load real,imag from A
+       lxvd2x          vs7,    o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+
+       addi            BO,     BO,     16
+
+       xvmaddadp       vs32,   vs8,    vs20            // real*real, imag*real
+       xvmaddadp       vs33,   vs8,    vs21            // real*imag, imag*imag
+       xvmaddadp       vs34,   vs9,    vs20            // real*real, imag*real
+       xvmaddadp       vs35,   vs9,    vs21            // real*imag, imag*imag
+       xvmaddadp       vs36,   vs10,   vs20            // real*real, imag*real
+       xvmaddadp       vs37,   vs10,   vs21            // real*imag, imag*imag
+       xvmaddadp       vs38,   vs11,   vs20            // real*real, imag*real
+       xvmaddadp       vs39,   vs11,   vs21            // real*imag, imag*imag
+       xvmaddadp       vs40,   vs12,   vs20            // real*real, imag*real
+       xvmaddadp       vs41,   vs12,   vs21            // real*imag, imag*imag
+       xvmaddadp       vs42,   vs13,   vs20            // real*real, imag*real
+       xvmaddadp       vs43,   vs13,   vs21            // real*imag, imag*imag
+       xvmaddadp       vs44,   vs14,   vs20            // real*real, imag*real
+       xvmaddadp       vs45,   vs14,   vs21            // real*imag, imag*imag
+       xvmaddadp       vs46,   vs15,   vs20            // real*real, imag*real
+       xvmaddadp       vs47,   vs15,   vs21            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x8_E2
+
+
+       xvmaddadp       vs32,   vs8,    vs20            // real*real, imag*real
+       xvmaddadp       vs33,   vs8,    vs21            // real*imag, imag*imag
+       xvmaddadp       vs34,   vs9,    vs20            // real*real, imag*real
+       xvmaddadp       vs35,   vs9,    vs21            // real*imag, imag*imag
+       xvmaddadp       vs36,   vs10,   vs20            // real*real, imag*real
+       xvmaddadp       vs37,   vs10,   vs21            // real*imag, imag*imag
+       xvmaddadp       vs38,   vs11,   vs20            // real*real, imag*real
+       xvmaddadp       vs39,   vs11,   vs21            // real*imag, imag*imag
+       xvmaddadp       vs40,   vs12,   vs20            // real*real, imag*real
+       xvmaddadp       vs41,   vs12,   vs21            // real*imag, imag*imag
+       xvmaddadp       vs42,   vs13,   vs20            // real*real, imag*real
+       xvmaddadp       vs43,   vs13,   vs21            // real*imag, imag*imag
+       xvmaddadp       vs44,   vs14,   vs20            // real*real, imag*real
+       xvmaddadp       vs45,   vs14,   vs21            // real*imag, imag*imag
+       xvmaddadp       vs46,   vs15,   vs20            // real*real, imag*real
+       xvmaddadp       vs47,   vs15,   vs21            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x8_SUBI1
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+       lxvd2x          vs1,    o16,    AO              // load real,imag from A
+       lxvd2x          vs2,    o32,    AO              // load real,imag from A
+       lxvd2x          vs3,    o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvd2x          vs4,    o0,     AO              // load real,imag from A
+       lxvd2x          vs5,    o16,    AO              // load real,imag from A
+       lxvd2x          vs6,    o32,    AO              // load real,imag from A
+       lxvd2x          vs7,    o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+
+       addi            BO,     BO,     16
+
+       xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
+       xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
+       xvmuldp         vs34,   vs1,    vs16            // real*real, imag*real
+       xvmuldp         vs35,   vs1,    vs17            // real*imag, imag*imag
+       xvmuldp         vs36,   vs2,    vs16            // real*real, imag*real
+       xvmuldp         vs37,   vs2,    vs17            // real*imag, imag*imag
+       xvmuldp         vs38,   vs3,    vs16            // real*real, imag*real
+       xvmuldp         vs39,   vs3,    vs17            // real*imag, imag*imag
+       xvmuldp         vs40,   vs4,    vs16            // real*real, imag*real
+       xvmuldp         vs41,   vs4,    vs17            // real*imag, imag*imag
+       xvmuldp         vs42,   vs5,    vs16            // real*real, imag*real
+       xvmuldp         vs43,   vs5,    vs17            // real*imag, imag*imag
+       xvmuldp         vs44,   vs6,    vs16            // real*real, imag*real
+       xvmuldp         vs45,   vs6,    vs17            // real*imag, imag*imag
+       xvmuldp         vs46,   vs7,    vs16            // real*real, imag*real
+       xvmuldp         vs47,   vs7,    vs17            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x8_SUB1
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+       lxvd2x          vs1,    o16,    AO              // load real,imag from A
+       lxvd2x          vs2,    o32,    AO              // load real,imag from A
+       lxvd2x          vs3,    o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvd2x          vs4,    o0,     AO              // load real,imag from A
+       lxvd2x          vs5,    o16,    AO              // load real,imag from A
+       lxvd2x          vs6,    o32,    AO              // load real,imag from A
+       lxvd2x          vs7,    o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+
+       addi            BO,     BO,     16
+
+       xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
+       xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs34,   vs1,    vs16            // real*real, imag*real
+       xvmaddadp       vs35,   vs1,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs36,   vs2,    vs16            // real*real, imag*real
+       xvmaddadp       vs37,   vs2,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs38,   vs3,    vs16            // real*real, imag*real
+       xvmaddadp       vs39,   vs3,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs40,   vs4,    vs16            // real*real, imag*real
+       xvmaddadp       vs41,   vs4,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs42,   vs5,    vs16            // real*real, imag*real
+       xvmaddadp       vs43,   vs5,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs44,   vs6,    vs16            // real*real, imag*real
+       xvmaddadp       vs45,   vs6,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs46,   vs7,    vs16            // real*real, imag*real
+       xvmaddadp       vs47,   vs7,    vs17            // real*imag, imag*imag
+
+
+.endm
+
+.macro SAVE1x8
+
+
+       mr              T1,     CO
+       addi            T2,     T1,     64
+
+#ifndef TRMMKERNEL
+
+       lxvd2x          vs16,   o0,     T1
+       lxvd2x          vs17,   o16,    T1
+       lxvd2x          vs18,   o32,    T1
+       lxvd2x          vs19,   o48,    T1
+       lxvd2x          vs20,   o0,     T2
+       lxvd2x          vs21,   o16,    T2
+       lxvd2x          vs22,   o32,    T2
+       lxvd2x          vs23,   o48,    T2
+
+#endif
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs33,   vs33                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs32            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs33            // imagA*imagB
+
+       xxswapd         vs32,   vs32                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs33,   vs33                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs32            // imagA*realB
+       XSFADD_I2       vs1,    vs1,    vs33            // realA*imagB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs8,    vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs35,   vs35                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs34            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs35            // imagA*imagB
+
+       xxswapd         vs34,   vs34                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs35,   vs35                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs34            // imagA*realB
+       XSFADD_I2       vs1,    vs1,    vs35            // realA*imagB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs9,    vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs37,   vs37                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs36            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs37            // imagA*imagB
+
+       xxswapd         vs36,   vs36                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs37,   vs37                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs36            // imagA*realB
+       XSFADD_I2       vs1,    vs1,    vs37            // realA*imagB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs10,   vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs39,   vs39                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs38            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs39            // imagA*imagB
+
+       xxswapd         vs38,   vs38                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs39,   vs39                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs38            // imagA*realB
+       XSFADD_I2       vs1,    vs1,    vs39            // realA*imagB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs11,   vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs41,   vs41                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs40            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs41            // imagA*imagB
+
+       xxswapd         vs40,   vs40                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs41,   vs41                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs40            // imagA*realB
+       XSFADD_I2       vs1,    vs1,    vs41            // realA*imagB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs12,   vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs43,   vs43                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs42            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs43            // imagA*imagB
+
+       xxswapd         vs42,   vs42                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs43,   vs43                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs42            // imagA*realB
+       XSFADD_I2       vs1,    vs1,    vs43            // realA*imagB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs13,   vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs45,   vs45                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs44            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs45            // imagA*imagB
+
+       xxswapd         vs44,   vs44                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs45,   vs45                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs44            // imagA*realB
+       XSFADD_I2       vs1,    vs1,    vs45            // realA*imagB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs14,   vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs47,   vs47                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs46            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs47            // imagA*imagB
+
+       xxswapd         vs46,   vs46                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs47,   vs47                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs46            // imagA*realB
+       XSFADD_I2       vs1,    vs1,    vs47            // realA*imagB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs15,   vs2,    vs3,    0       // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+       xvadddp         vs8,    vs8,    vs16
+       xvadddp         vs9,    vs9,    vs17
+       xvadddp         vs10,   vs10,   vs18
+       xvadddp         vs11,   vs11,   vs19
+       xvadddp         vs12,   vs12,   vs20
+       xvadddp         vs13,   vs13,   vs21
+       xvadddp         vs14,   vs14,   vs22
+       xvadddp         vs15,   vs15,   vs23
+
+#endif
+
+       stxvd2x         vs8,    o0,     T1
+       stxvd2x         vs9,    o16,    T1
+       stxvd2x         vs10,   o32,    T1
+       stxvd2x         vs11,   o48,    T1
+       stxvd2x         vs12,   o0,     T2
+       stxvd2x         vs13,   o16,    T2
+       stxvd2x         vs14,   o32,    T2
+       stxvd2x         vs15,   o48,    T2
+
+       add             T1,     T1,     LDC
+       add             T2,     T2,     LDC
+       addi            CO,     CO,     128
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=4
+**********************************************************************************************/
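+
+// The M=4, M=2 and M=1 variants below repeat the N=1 scheme with fewer A
+// vectors and accumulators (vs32..vs39, vs32..vs35 and vs32..vs33,
+// respectively); only the load widths and the C update in SAVE* change.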
+
+.macro LOAD1x4_1
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+
+       addi            BO,     BO,     16
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+       lxvd2x          vs1,    o16,    AO              // load real,imag from A
+       lxvd2x          vs2,    o32,    AO              // load real,imag from A
+       lxvd2x          vs3,    o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+
+.endm
+
+.macro KERNEL1x4_I1
+
+       lxvd2x          vs8,    o0,     AO              // load real,imag from A
+       lxvd2x          vs9,    o16,    AO              // load real,imag from A
+       lxvd2x          vs10,   o32,    AO              // load real,imag from A
+       lxvd2x          vs11,   o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvdsx          vs20,   o0,     BO              // load real part from B
+       lxvdsx          vs21,   o8,     BO              // load imag part from B
+
+       addi            BO,     BO,     16
+
+       xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
+       xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
+       xvmuldp         vs34,   vs1,    vs16            // real*real, imag*real
+       xvmuldp         vs35,   vs1,    vs17            // real*imag, imag*imag
+       xvmuldp         vs36,   vs2,    vs16            // real*real, imag*real
+       xvmuldp         vs37,   vs2,    vs17            // real*imag, imag*imag
+       xvmuldp         vs38,   vs3,    vs16            // real*real, imag*real
+       xvmuldp         vs39,   vs3,    vs17            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x4_1
+
+       lxvd2x          vs8,    o0,     AO              // load real,imag from A
+       lxvd2x          vs9,    o16,    AO              // load real,imag from A
+       lxvd2x          vs10,   o32,    AO              // load real,imag from A
+       lxvd2x          vs11,   o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvdsx          vs20,   o0,     BO              // load real part from B
+       lxvdsx          vs21,   o8,     BO              // load imag part from B
+
+       addi            BO,     BO,     16
+
+       xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
+       xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs34,   vs1,    vs16            // real*real, imag*real
+       xvmaddadp       vs35,   vs1,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs36,   vs2,    vs16            // real*real, imag*real
+       xvmaddadp       vs37,   vs2,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs38,   vs3,    vs16            // real*real, imag*real
+       xvmaddadp       vs39,   vs3,    vs17            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x4_2
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+       lxvd2x          vs1,    o16,    AO              // load real,imag from A
+       lxvd2x          vs2,    o32,    AO              // load real,imag from A
+       lxvd2x          vs3,    o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+
+       addi            BO,     BO,     16
+
+       xvmaddadp       vs32,   vs8,    vs20            // real*real, imag*real
+       xvmaddadp       vs33,   vs8,    vs21            // real*imag, imag*imag
+       xvmaddadp       vs34,   vs9,    vs20            // real*real, imag*real
+       xvmaddadp       vs35,   vs9,    vs21            // real*imag, imag*imag
+       xvmaddadp       vs36,   vs10,   vs20            // real*real, imag*real
+       xvmaddadp       vs37,   vs10,   vs21            // real*imag, imag*imag
+       xvmaddadp       vs38,   vs11,   vs20            // real*real, imag*real
+       xvmaddadp       vs39,   vs11,   vs21            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x4_E2
+
+
+       xvmaddadp       vs32,   vs8,    vs20            // real*real, imag*real
+       xvmaddadp       vs33,   vs8,    vs21            // real*imag, imag*imag
+       xvmaddadp       vs34,   vs9,    vs20            // real*real, imag*real
+       xvmaddadp       vs35,   vs9,    vs21            // real*imag, imag*imag
+       xvmaddadp       vs36,   vs10,   vs20            // real*real, imag*real
+       xvmaddadp       vs37,   vs10,   vs21            // real*imag, imag*imag
+       xvmaddadp       vs38,   vs11,   vs20            // real*real, imag*real
+       xvmaddadp       vs39,   vs11,   vs21            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x4_SUBI1
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+       lxvd2x          vs1,    o16,    AO              // load real,imag from A
+       lxvd2x          vs2,    o32,    AO              // load real,imag from A
+       lxvd2x          vs3,    o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+
+       addi            BO,     BO,     16
+
+       xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
+       xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
+       xvmuldp         vs34,   vs1,    vs16            // real*real, imag*real
+       xvmuldp         vs35,   vs1,    vs17            // real*imag, imag*imag
+       xvmuldp         vs36,   vs2,    vs16            // real*real, imag*real
+       xvmuldp         vs37,   vs2,    vs17            // real*imag, imag*imag
+       xvmuldp         vs38,   vs3,    vs16            // real*real, imag*real
+       xvmuldp         vs39,   vs3,    vs17            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x4_SUB1
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+       lxvd2x          vs1,    o16,    AO              // load real,imag from A
+       lxvd2x          vs2,    o32,    AO              // load real,imag from A
+       lxvd2x          vs3,    o48,    AO              // load real,imag from A
+
+       addi            AO,     AO,     64
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+
+       addi            BO,     BO,     16
+
+       xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
+       xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs34,   vs1,    vs16            // real*real, imag*real
+       xvmaddadp       vs35,   vs1,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs36,   vs2,    vs16            // real*real, imag*real
+       xvmaddadp       vs37,   vs2,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs38,   vs3,    vs16            // real*real, imag*real
+       xvmaddadp       vs39,   vs3,    vs17            // real*imag, imag*imag
+
+
+.endm
+
+.macro SAVE1x4
+
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+
+       lxvd2x          vs16,   o0,     T1
+       lxvd2x          vs17,   o16,    T1
+       lxvd2x          vs18,   o32,    T1
+       lxvd2x          vs19,   o48,    T1
+
+#endif
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs33,   vs33                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs32            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs33            // imagA*imagB
+
+       xxswapd         vs32,   vs32                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs33,   vs33                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs32            // imagA*realB
+       XSFADD_I2       vs1,    vs1,    vs33            // realA*imagB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs8,    vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs35,   vs35                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs34            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs35            // imagA*imagB
+
+       xxswapd         vs34,   vs34                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs35,   vs35                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs34            // imagA*realB
+       XSFADD_I2       vs1,    vs1,    vs35            // realA*imagB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs9,    vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs37,   vs37                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs36            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs37            // imagA*imagB
+
+       xxswapd         vs36,   vs36                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs37,   vs37                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs36            // imagA*realB
+       XSFADD_I2       vs1,    vs1,    vs37            // realA*imagB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs10,   vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs39,   vs39                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs38            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs39            // imagA*imagB
+
+       xxswapd         vs38,   vs38                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs39,   vs39                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs38            // imagA*realB
+       XSFADD_I2       vs1,    vs1,    vs39            // realA*imagB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs11,   vs2,    vs3,    0       // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+       xvadddp         vs8,    vs8,    vs16
+       xvadddp         vs9,    vs9,    vs17
+       xvadddp         vs10,   vs10,   vs18
+       xvadddp         vs11,   vs11,   vs19
+
+#endif
+
+       stxvd2x         vs8,    o0,     T1
+       stxvd2x         vs9,    o16,    T1
+       stxvd2x         vs10,   o32,    T1
+       stxvd2x         vs11,   o48,    T1
+
+       add             T1,     T1,     LDC
+       addi            CO,     CO,     64
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=2
+**********************************************************************************************/
+
+.macro LOAD1x2_1
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+
+       addi            BO,     BO,     16
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+       lxvd2x          vs1,    o16,    AO              // load real,imag from A
+
+       addi            AO,     AO,     32
+
+
+.endm
+
+.macro KERNEL1x2_I1
+
+       lxvd2x          vs8,    o0,     AO              // load real,imag from A
+       lxvd2x          vs9,    o16,    AO              // load real,imag from A
+
+       addi            AO,     AO,     32
+
+       lxvdsx          vs20,   o0,     BO              // load real part from B
+       lxvdsx          vs21,   o8,     BO              // load imag part from B
+
+       addi            BO,     BO,     16
+
+       xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
+       xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
+       xvmuldp         vs34,   vs1,    vs16            // real*real, imag*real
+       xvmuldp         vs35,   vs1,    vs17            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x2_1
+
+       lxvd2x          vs8,    o0,     AO              // load real,imag from A
+       lxvd2x          vs9,    o16,    AO              // load real,imag from A
+
+       addi            AO,     AO,     32
+
+       lxvdsx          vs20,   o0,     BO              // load real part from B
+       lxvdsx          vs21,   o8,     BO              // load imag part from B
+
+       addi            BO,     BO,     16
+
+       xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
+       xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs34,   vs1,    vs16            // real*real, imag*real
+       xvmaddadp       vs35,   vs1,    vs17            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x2_2
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+       lxvd2x          vs1,    o16,    AO              // load real,imag from A
+
+       addi            AO,     AO,     32
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+
+       addi            BO,     BO,     16
+
+       xvmaddadp       vs32,   vs8,    vs20            // real*real, imag*real
+       xvmaddadp       vs33,   vs8,    vs21            // real*imag, imag*imag
+       xvmaddadp       vs34,   vs9,    vs20            // real*real, imag*real
+       xvmaddadp       vs35,   vs9,    vs21            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x2_E2
+
+
+       xvmaddadp       vs32,   vs8,    vs20            // real*real, imag*real
+       xvmaddadp       vs33,   vs8,    vs21            // real*imag, imag*imag
+       xvmaddadp       vs34,   vs9,    vs20            // real*real, imag*real
+       xvmaddadp       vs35,   vs9,    vs21            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x2_SUBI1
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+       lxvd2x          vs1,    o16,    AO              // load real,imag from A
+
+       addi            AO,     AO,     32
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+
+       addi            BO,     BO,     16
+
+       xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
+       xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
+       xvmuldp         vs34,   vs1,    vs16            // real*real, imag*real
+       xvmuldp         vs35,   vs1,    vs17            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x2_SUB1
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+       lxvd2x          vs1,    o16,    AO              // load real,imag from A
+
+       addi            AO,     AO,     32
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+
+       addi            BO,     BO,     16
+
+       xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
+       xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
+       xvmaddadp       vs34,   vs1,    vs16            // real*real, imag*real
+       xvmaddadp       vs35,   vs1,    vs17            // real*imag, imag*imag
+
+
+.endm
+
+.macro SAVE1x2
+
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+
+       lxvd2x          vs16,   o0,     T1
+       lxvd2x          vs17,   o16,    T1
+
+#endif
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs33,   vs33                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs32            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs33            // imagA*imagB
+
+       xxswapd         vs32,   vs32                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs33,   vs33                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs32            // imagA*realB
+       XSFADD_I2       vs1,    vs1,    vs33            // realA*imagB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs8,    vs2,    vs3,    0       // merge real and imag part
+
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs35,   vs35                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs34            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs35            // imagA*imagB
+
+       xxswapd         vs34,   vs34                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs35,   vs35                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs34            // imagA*realB
+       XSFADD_I2       vs1,    vs1,    vs35            // realA*imagB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs9,    vs2,    vs3,    0       // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+       xvadddp         vs8,    vs8,    vs16
+       xvadddp         vs9,    vs9,    vs17
+
+#endif
+
+       stxvd2x         vs8,    o0,     T1
+       stxvd2x         vs9,    o16,    T1
+
+       add             T1,     T1,     LDC
+       addi            CO,     CO,     32
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=1
+**********************************************************************************************/
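+
+// Illustrative reference model of the 1x1 micro-tile in C (a sketch only, not
+// part of the kernel; conjugation variants and the TRMM path are omitted, and
+// the function name is hypothetical):
+//
+//     #include <complex.h>
+//     void zgemm_1x1_ref(long K, double complex alpha,
+//                        const double complex *A, const double complex *B,
+//                        double complex *C) {
+//         double complex acc = 0.0;
+//         for (long k = 0; k < K; k++)
+//             acc += A[k] * B[k];     // KERNEL1x1_* accumulate into vs32/vs33
+//         C[0] += alpha * acc;        // SAVE1x1; TRMM stores alpha*acc instead
+//     }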
+
+.macro LOAD1x1_1
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+
+       addi            BO,     BO,     16
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+
+       addi            AO,     AO,     16
+
+
+.endm
+
+.macro KERNEL1x1_I1
+
+       lxvd2x          vs8,    o0,     AO              // load real,imag from A
+
+       addi            AO,     AO,     16
+
+       lxvdsx          vs20,   o0,     BO              // load real part from B
+       lxvdsx          vs21,   o8,     BO              // load imag part from B
+
+       addi            BO,     BO,     16
+
+       xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
+       xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x1_1
+
+       lxvd2x          vs8,    o0,     AO              // load real,imag from A
+
+       addi            AO,     AO,     16
+
+       lxvdsx          vs20,   o0,     BO              // load real part from B
+       lxvdsx          vs21,   o8,     BO              // load imag part from B
+
+       addi            BO,     BO,     16
+
+       xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
+       xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x1_2
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+
+       addi            AO,     AO,     16
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+
+       addi            BO,     BO,     16
+
+       xvmaddadp       vs32,   vs8,    vs20            // real*real, imag*real
+       xvmaddadp       vs33,   vs8,    vs21            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x1_E2
+
+
+       xvmaddadp       vs32,   vs8,    vs20            // real*real, imag*real
+       xvmaddadp       vs33,   vs8,    vs21            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x1_SUBI1
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+
+       addi            AO,     AO,     16
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+
+       addi            BO,     BO,     16
+
+       xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
+       xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x1_SUB1
+
+       lxvd2x          vs0,    o0,     AO              // load real,imag from A
+
+       addi            AO,     AO,     16
+
+       lxvdsx          vs16,   o0,     BO              // load real part from B
+       lxvdsx          vs17,   o8,     BO              // load imag part from B
+
+       addi            BO,     BO,     16
+
+       xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
+       xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
+
+
+.endm
+
+.macro SAVE1x1
+
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+
+       lxvd2x          vs16,   o0,     T1
+
+#endif
+
+
+       xxlxor          vs0,    vs0,    vs0
+       xxlxor          vs1,    vs1,    vs1
+       xxswapd         vs33,   vs33                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+       XSFADD_R1       vs0,    vs0,    vs32            // realA*realB
+       XSFADD_R2       vs0,    vs0,    vs33            // imagA*imagB
+
+       xxswapd         vs32,   vs32                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
+       xxswapd         vs33,   vs33                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+       XSFADD_I1       vs1,    vs1,    vs32            // imagA*realB
+       XSFADD_I2       vs1,    vs1,    vs33            // realA*imagB
+
+       xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
+       xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
+       xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
+       xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
+
+       xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
+       xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
+       xxpermdi        vs8,    vs2,    vs3,    0       // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+       xvadddp         vs8,    vs8,    vs16
+
+#endif
+
+       stxvd2x         vs8,    o0,     T1
+
+       add             T1,     T1,     LDC
+       addi            CO,     CO,     16
+
+.endm
+
diff --git a/param.h b/param.h
index 2efd9b2..a6ead4b 100644 (file)
--- a/param.h
+++ b/param.h
@@ -1980,7 +1980,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SGEMM_DEFAULT_P  960
 #define DGEMM_DEFAULT_P  480
 #define CGEMM_DEFAULT_P  720
-#define ZGEMM_DEFAULT_P  240
+#define ZGEMM_DEFAULT_P  480
 
 #define SGEMM_DEFAULT_Q  720
 #define DGEMM_DEFAULT_Q  720
@@ -1990,7 +1990,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SGEMM_DEFAULT_R 21600
 #define DGEMM_DEFAULT_R 14400
 #define CGEMM_DEFAULT_R 16200
-#define ZGEMM_DEFAULT_R 14400
+#define ZGEMM_DEFAULT_R 21600
 
 #define SYMV_P  8