updated sgemm and strmm kernels for POWER8
author    Werner Saar <wernsaar@googlemail.com>
Sat, 2 Apr 2016 15:16:36 +0000 (17:16 +0200)
committer Werner Saar <wernsaar@googlemail.com>
Sat, 2 Apr 2016 15:16:36 +0000 (17:16 +0200)
kernel/power/sgemm_kernel_16x8_power8.S
kernel/power/sgemm_logic_16x8_power8.S
kernel/power/sgemm_macros_16x8_power8.S
kernel/power/strmm_kernel_16x8_power8.S
kernel/power/strmm_logic_16x8_power8.S
kernel/power/strmm_macros_16x8_power8.S [new file with mode: 0644]
param.h

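Note on the main functional change (visible in the new SGEMM_Lx_COPYB loops in the diff below): B is no longer read directly by the compute kernels. Before each outer loop, B is expanded into a scratch area (BBUFFER, placed 128 KiB below A, i.e. 256 << 9 bytes) in which every 32-bit element is broadcast across a full 16-byte vector slot via lxvw4x + xxspltw + stxvw4x, so the kernels can load already-broadcast B operands with plain vector loads. The alpha scalar is stored four times for the same reason, so it is available both as a scalar (alpha_r) and as a vector (alpha_vr). The following C model is only an illustrative sketch of what one COPYB pass does; the function name and types are hypothetical and not part of the commit.

    #include <stddef.h>

    /* Hypothetical C model of one SGEMM_Lx_COPYB pass (illustration only):
     * each float of B is replicated into four consecutive lanes of BBUFFER,
     * mirroring the lxvw4x + xxspltw + stxvw4x sequence in the assembly.   */
    static void copy_b_splat(const float *b, float *bbuffer, size_t nelems)
    {
            for (size_t i = 0; i < nelems; i++)
                    for (int lane = 0; lane < 4; lane++)
                            bbuffer[4 * i + lane] = b[i];   /* 4 bytes in -> 16 bytes out */
    }

For the 8-column loop, nelems would correspond to K*8 source scalars (the assembly counts this in T1 and decrements by 8 per iteration, writing 128 bytes of BBUFFER each pass).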
index 031f342..c2dc1f6 100644 (file)
@@ -26,11 +26,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 /**************************************************************************************
-* 2016/03/18 Werner Saar (wernsaar@googlemail.com)
+* 2016/04/02 Werner Saar (wernsaar@googlemail.com)
 *       BLASTEST               : OK
 *       CTEST                  : OK
 *       TEST                   : OK
-*       LAPACK-TEST            : OK
+*       LAPACK-TEST            : OK
 **************************************************************************************/
 
 /*********************************************************************/
@@ -128,17 +128,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 #define alpha_r vs30
+#define alpha_vr vs31
 
 #define o0     0
 
-#define TBUFFER r14
+#define BBUFFER r14
 #define o4     r15
 #define o12    r16
 #define o8     r17
 #define L      r18
 #define T1     r19
 #define KK     r20
-#define BB     r21
+#define BBO    r21
 #define        I       r22
 #define J      r23
 #define AO     r24
@@ -256,11 +257,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
        cmpwi   cr0, M, 0
-       ble     .L999_H1
+       ble     L999_H1
        cmpwi   cr0, N, 0
-       ble     .L999_H1
+       ble     L999_H1
        cmpwi   cr0, K, 0
-       ble     .L999_H1
+       ble     L999_H1
 
        li      PRE, 256 
        li      o4 , 4
@@ -269,18 +270,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        li      o16, 16
        li      o32, 32
        li      o48, 48
-       addi    TBUFFER, SP, 320
+
+       li      T1, 256
+       slwi    T1, T1, 9               // 131072
+       sub     BBUFFER, A, T1          // temp buffer for B unrolled
 
         addi    T1, SP, 300
-        stfs    f1, 0(T1)
+        stxsspx    f1, o0 , T1
+        stxsspx    f1, o4 , T1
+        stxsspx    f1, o8 , T1
+        stxsspx    f1, o12 , T1
 
-        lxsspx  alpha_r, 0, T1
+       lxsspx     alpha_r,  o0, T1
+        lxvw4x     alpha_vr, o0, T1
 
 
 
 #include "sgemm_logic_16x8_power8.S"
 
-.L999:
+L999:
        addi    r3, 0, 0
 
        lfd     f14,    0(SP)
index 0ae6413..06bb79e 100644 (file)
@@ -26,94 +26,149 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 /**************************************************************************************
-* 2016/03/18 Werner Saar (wernsaar@googlemail.com)
+* 2016/04/02 Werner Saar (wernsaar@googlemail.com)
 *       BLASTEST               : OK
 *       CTEST                  : OK
 *       TEST                   : OK
-*       LAPACK-TEST            : OK
+*       LAPACK-TEST            : OK
 **************************************************************************************/
 
        srawi.          J,      N,      3
-       ble             .LSGEMM_L8_END
+       ble             SGEMM_L8_END
 
-.LSGEMM_L8_BEGIN:
+SGEMM_L8_BEGIN:
+
+       mr              BO,     B
+       mr              BBO,    BBUFFER
+       slwi            T1,     K,      3
+
+SGEMM_L8_COPYB:
+       dcbtst          BBO,    PRE
+
+       lxvw4x          vs3,    o0,     BO
+       lxvw4x          vs11,   o16,    BO
+       xxspltw         vs4,    vs3,    0
+       xxspltw         vs5,    vs3,    1
+       xxspltw         vs6,    vs3,    2
+       xxspltw         vs7,    vs3,    3
+       xxspltw         vs12,   vs11,   0
+       xxspltw         vs13,   vs11,   1
+       xxspltw         vs14,   vs11,   2
+       xxspltw         vs15,   vs11,   3
+       stxvw4x         vs4,    o0,     BBO
+       stxvw4x         vs5,    o16,    BBO
+       stxvw4x         vs6,    o32,    BBO
+       stxvw4x         vs7,    o48,    BBO
+       addi            BO,     BO,     32
+       addi            BBO,    BBO,    64
+       stxvw4x         vs12,   o0,     BBO
+       stxvw4x         vs13,   o16,    BBO
+       stxvw4x         vs14,   o32,    BBO
+       stxvw4x         vs15,   o48,    BBO
+       addic.          T1,     T1,     -8
+       addi            BBO,    BBO,    64
+
+       bge             SGEMM_L8_COPYB
 
        mr              CO,     C
        mr              AO,     A
        slwi            T1,     LDC     ,       3
        add             C,      C,      T1
        srawi.          I,      M,      4
-       ble             .LSGEMM_L8x16_END
+       ble             SGEMM_L8x16_END
 
-.LSGEMM_L8x16_BEGIN:
+SGEMM_L8x16_BEGIN:
 
 
-       mr              BO,     B
+       mr              BO,     BBUFFER
        srawi.          L,      K,      3
-       ble             .LSGEMM_L8x16_SUB0
+       ble             SGEMM_L8x16_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LSGEMM_L8x16_SUB4
+       ble             SGEMM_L8x16_SUB4
 
-.LSGEMM_L8x16_LOOP_START:
+SGEMM_L8x16_LOOP_START:
 
        dcbt            AO,     PRE
+       dcbt            BO,     PRE
        LOAD8x16_1
+       dcbt            BO,     PRE
        KERNEL8x16_I1
+       dcbt            BO,     PRE
        dcbt            AO,     PRE
        KERNEL8x16_2
+       dcbt            BO,     PRE
        KERNEL8x16_1
+       dcbt            BO,     PRE
        dcbt            AO,     PRE
        KERNEL8x16_2
 
+       dcbt            BO,     PRE
        KERNEL8x16_1
+       dcbt            BO,     PRE
        dcbt            AO,     PRE
        KERNEL8x16_2
+       dcbt            BO,     PRE
        KERNEL8x16_1
+       dcbt            BO,     PRE
        dcbt            AO,     PRE
        KERNEL8x16_2
 
        addic.          L,      L,      -2
-       ble             .LSGEMM_L8x16_LOOP_END
+       ble             SGEMM_L8x16_LOOP_END
 
        .align 5
 
-.LSGEMM_L8x16_LOOP:
+SGEMM_L8x16_LOOP:
 
+       dcbt            BO,     PRE
        KERNEL8x16_1
+       dcbt            BO,     PRE
        dcbt            AO,     PRE
        KERNEL8x16_2
+       dcbt            BO,     PRE
        KERNEL8x16_1
+       dcbt            BO,     PRE
        dcbt            AO,     PRE
        KERNEL8x16_2
 
+       dcbt            BO,     PRE
        KERNEL8x16_1
+       dcbt            BO,     PRE
        dcbt            AO,     PRE
        KERNEL8x16_2
+       dcbt            BO,     PRE
        KERNEL8x16_1
+       dcbt            BO,     PRE
        dcbt            AO,     PRE
        KERNEL8x16_2
 
        addic.          L,      L,      -1
-       bgt             .LSGEMM_L8x16_LOOP
+       bgt             SGEMM_L8x16_LOOP
 
-.LSGEMM_L8x16_LOOP_END:
+SGEMM_L8x16_LOOP_END:
 
+       dcbt            BO,     PRE
        KERNEL8x16_1
+       dcbt            BO,     PRE
        dcbt            AO,     PRE
        KERNEL8x16_2
+       dcbt            BO,     PRE
        KERNEL8x16_1
+       dcbt            BO,     PRE
        dcbt            AO,     PRE
        KERNEL8x16_2
 
+       dcbt            BO,     PRE
        KERNEL8x16_1
+       dcbt            BO,     PRE
        dcbt            AO,     PRE
        KERNEL8x16_2
        KERNEL8x16_1
        KERNEL8x16_E2
 
-       b               .LSGEMM_L8x16_SUB1
+       b               SGEMM_L8x16_SUB1
 
-.LSGEMM_L8x16_SUB4:
+SGEMM_L8x16_SUB4:
 
        dcbt            AO,     PRE
        KERNEL8x16_SUBI1
@@ -127,53 +182,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL8x16_SUB1
        KERNEL8x16_SUB1
 
-       b               .LSGEMM_L8x16_SUB1
+       b               SGEMM_L8x16_SUB1
 
-.LSGEMM_L8x16_SUB0:
+SGEMM_L8x16_SUB0:
 
        andi.           L,      K,      7
 
        KERNEL8x16_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LSGEMM_L8x16_SAVE
-       b               .LSGEMM_L8x16_SUB2
+       ble             SGEMM_L8x16_SAVE
+       b               SGEMM_L8x16_SUB2
 
-.LSGEMM_L8x16_SUB1:
+SGEMM_L8x16_SUB1:
 
        andi.           L,      K,      7
-       ble             .LSGEMM_L8x16_SAVE
+       ble             SGEMM_L8x16_SAVE
 
-.LSGEMM_L8x16_SUB2:
+SGEMM_L8x16_SUB2:
 
        KERNEL8x16_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LSGEMM_L8x16_SUB2
+       bgt             SGEMM_L8x16_SUB2
 
-.LSGEMM_L8x16_SAVE:
+SGEMM_L8x16_SAVE:
 
        SAVE8x16
 
        addic.          I,      I,      -1
-       bgt             .LSGEMM_L8x16_BEGIN
+       bgt             SGEMM_L8x16_BEGIN
 
-.LSGEMM_L8x16_END:
+SGEMM_L8x16_END:
 
-.LSGEMM_L8x8_BEGIN:
+SGEMM_L8x8_BEGIN:
 
        andi.           T2,     M,      15
-       ble             .LSGEMM_L8x1_END
+       ble             SGEMM_L8x1_END
 
        andi.           T1,     M,      8
-       ble             .LSGEMM_L8x8_END
-       mr              BO,     B
+       ble             SGEMM_L8x8_END
+       mr              BO,     BBUFFER
        srawi.          L,      K,      3
-       ble             .LSGEMM_L8x8_SUB0
+       ble             SGEMM_L8x8_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LSGEMM_L8x8_SUB4
+       ble             SGEMM_L8x8_SUB4
 
-.LSGEMM_L8x8_LOOP_START:
+SGEMM_L8x8_LOOP_START:
 
        LOAD8x8_1
        KERNEL8x8_I1
@@ -187,11 +242,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL8x8_2
 
        addic.          L,      L,      -2
-       ble             .LSGEMM_L8x8_LOOP_END
+       ble             SGEMM_L8x8_LOOP_END
 
        .align 5
 
-.LSGEMM_L8x8_LOOP:
+SGEMM_L8x8_LOOP:
 
        KERNEL8x8_1
        KERNEL8x8_2
@@ -204,9 +259,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL8x8_2
 
        addic.          L,      L,      -1
-       bgt             .LSGEMM_L8x8_LOOP
+       bgt             SGEMM_L8x8_LOOP
 
-.LSGEMM_L8x8_LOOP_END:
+SGEMM_L8x8_LOOP_END:
 
        KERNEL8x8_1
        KERNEL8x8_2
@@ -218,9 +273,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL8x8_1
        KERNEL8x8_E2
 
-       b               .LSGEMM_L8x8_SUB1
+       b               SGEMM_L8x8_SUB1
 
-.LSGEMM_L8x8_SUB4:
+SGEMM_L8x8_SUB4:
 
        KERNEL8x8_SUBI1
        KERNEL8x8_SUB1
@@ -232,48 +287,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL8x8_SUB1
        KERNEL8x8_SUB1
 
-       b               .LSGEMM_L8x8_SUB1
+       b               SGEMM_L8x8_SUB1
 
-.LSGEMM_L8x8_SUB0:
+SGEMM_L8x8_SUB0:
 
        andi.           L,      K,      7
 
        KERNEL8x8_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LSGEMM_L8x8_SAVE
-       b               .LSGEMM_L8x8_SUB2
+       ble             SGEMM_L8x8_SAVE
+       b               SGEMM_L8x8_SUB2
 
-.LSGEMM_L8x8_SUB1:
+SGEMM_L8x8_SUB1:
 
        andi.           L,      K,      7
-       ble             .LSGEMM_L8x8_SAVE
+       ble             SGEMM_L8x8_SAVE
 
-.LSGEMM_L8x8_SUB2:
+SGEMM_L8x8_SUB2:
 
        KERNEL8x8_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LSGEMM_L8x8_SUB2
+       bgt             SGEMM_L8x8_SUB2
 
-.LSGEMM_L8x8_SAVE:
+SGEMM_L8x8_SAVE:
 
        SAVE8x8
 
-.LSGEMM_L8x8_END:
+SGEMM_L8x8_END:
 
-.LSGEMM_L8x4_BEGIN:
+SGEMM_L8x4_BEGIN:
 
 
        andi.           T1,     M,      4
-       ble             .LSGEMM_L8x4_END
-       mr              BO,     B
+       ble             SGEMM_L8x4_END
+       mr              BO,     BBUFFER
        srawi.          L,      K,      3
-       ble             .LSGEMM_L8x4_SUB0
+       ble             SGEMM_L8x4_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LSGEMM_L8x4_SUB4
+       ble             SGEMM_L8x4_SUB4
 
-.LSGEMM_L8x4_LOOP_START:
+SGEMM_L8x4_LOOP_START:
 
        LOAD8x4_1
        KERNEL8x4_I1
@@ -287,11 +342,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL8x4_2
 
        addic.          L,      L,      -2
-       ble             .LSGEMM_L8x4_LOOP_END
+       ble             SGEMM_L8x4_LOOP_END
 
        .align 5
 
-.LSGEMM_L8x4_LOOP:
+SGEMM_L8x4_LOOP:
 
        KERNEL8x4_1
        KERNEL8x4_2
@@ -304,9 +359,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL8x4_2
 
        addic.          L,      L,      -1
-       bgt             .LSGEMM_L8x4_LOOP
+       bgt             SGEMM_L8x4_LOOP
 
-.LSGEMM_L8x4_LOOP_END:
+SGEMM_L8x4_LOOP_END:
 
        KERNEL8x4_1
        KERNEL8x4_2
@@ -318,9 +373,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL8x4_1
        KERNEL8x4_E2
 
-       b               .LSGEMM_L8x4_SUB1
+       b               SGEMM_L8x4_SUB1
 
-.LSGEMM_L8x4_SUB4:
+SGEMM_L8x4_SUB4:
 
        KERNEL8x4_SUBI1
        KERNEL8x4_SUB1
@@ -332,48 +387,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL8x4_SUB1
        KERNEL8x4_SUB1
 
-       b               .LSGEMM_L8x4_SUB1
+       b               SGEMM_L8x4_SUB1
 
-.LSGEMM_L8x4_SUB0:
+SGEMM_L8x4_SUB0:
 
        andi.           L,      K,      7
 
        KERNEL8x4_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LSGEMM_L8x4_SAVE
-       b               .LSGEMM_L8x4_SUB2
+       ble             SGEMM_L8x4_SAVE
+       b               SGEMM_L8x4_SUB2
 
-.LSGEMM_L8x4_SUB1:
+SGEMM_L8x4_SUB1:
 
        andi.           L,      K,      7
-       ble             .LSGEMM_L8x4_SAVE
+       ble             SGEMM_L8x4_SAVE
 
-.LSGEMM_L8x4_SUB2:
+SGEMM_L8x4_SUB2:
 
        KERNEL8x4_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LSGEMM_L8x4_SUB2
+       bgt             SGEMM_L8x4_SUB2
 
-.LSGEMM_L8x4_SAVE:
+SGEMM_L8x4_SAVE:
 
        SAVE8x4
 
-.LSGEMM_L8x4_END:
+SGEMM_L8x4_END:
 
-.LSGEMM_L8x2_BEGIN:
+SGEMM_L8x2_BEGIN:
 
 
        andi.           T1,     M,      2
-       ble             .LSGEMM_L8x2_END
-       mr              BO,     B
+       ble             SGEMM_L8x2_END
+       mr              BO,     BBUFFER
        srawi.          L,      K,      3
-       ble             .LSGEMM_L8x2_SUB0
+       ble             SGEMM_L8x2_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LSGEMM_L8x2_SUB4
+       ble             SGEMM_L8x2_SUB4
 
-.LSGEMM_L8x2_LOOP_START:
+SGEMM_L8x2_LOOP_START:
 
        LOAD8x2_1
        KERNEL8x2_I1
@@ -387,11 +442,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL8x2_2
 
        addic.          L,      L,      -2
-       ble             .LSGEMM_L8x2_LOOP_END
+       ble             SGEMM_L8x2_LOOP_END
 
        .align 5
 
-.LSGEMM_L8x2_LOOP:
+SGEMM_L8x2_LOOP:
 
        KERNEL8x2_1
        KERNEL8x2_2
@@ -404,9 +459,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL8x2_2
 
        addic.          L,      L,      -1
-       bgt             .LSGEMM_L8x2_LOOP
+       bgt             SGEMM_L8x2_LOOP
 
-.LSGEMM_L8x2_LOOP_END:
+SGEMM_L8x2_LOOP_END:
 
        KERNEL8x2_1
        KERNEL8x2_2
@@ -418,9 +473,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL8x2_1
        KERNEL8x2_E2
 
-       b               .LSGEMM_L8x2_SUB1
+       b               SGEMM_L8x2_SUB1
 
-.LSGEMM_L8x2_SUB4:
+SGEMM_L8x2_SUB4:
 
        KERNEL8x2_SUBI1
        KERNEL8x2_SUB1
@@ -432,48 +487,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL8x2_SUB1
        KERNEL8x2_SUB1
 
-       b               .LSGEMM_L8x2_SUB1
+       b               SGEMM_L8x2_SUB1
 
-.LSGEMM_L8x2_SUB0:
+SGEMM_L8x2_SUB0:
 
        andi.           L,      K,      7
 
        KERNEL8x2_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LSGEMM_L8x2_SAVE
-       b               .LSGEMM_L8x2_SUB2
+       ble             SGEMM_L8x2_SAVE
+       b               SGEMM_L8x2_SUB2
 
-.LSGEMM_L8x2_SUB1:
+SGEMM_L8x2_SUB1:
 
        andi.           L,      K,      7
-       ble             .LSGEMM_L8x2_SAVE
+       ble             SGEMM_L8x2_SAVE
 
-.LSGEMM_L8x2_SUB2:
+SGEMM_L8x2_SUB2:
 
        KERNEL8x2_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LSGEMM_L8x2_SUB2
+       bgt             SGEMM_L8x2_SUB2
 
-.LSGEMM_L8x2_SAVE:
+SGEMM_L8x2_SAVE:
 
        SAVE8x2
 
-.LSGEMM_L8x2_END:
+SGEMM_L8x2_END:
 
-.LSGEMM_L8x1_BEGIN:
+SGEMM_L8x1_BEGIN:
 
 
        andi.           T1,     M,      1
-       ble             .LSGEMM_L8x1_END
-       mr              BO,     B
+       ble             SGEMM_L8x1_END
+       mr              BO,     BBUFFER
        srawi.          L,      K,      3
-       ble             .LSGEMM_L8x1_SUB0
+       ble             SGEMM_L8x1_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LSGEMM_L8x1_SUB4
+       ble             SGEMM_L8x1_SUB4
 
-.LSGEMM_L8x1_LOOP_START:
+SGEMM_L8x1_LOOP_START:
 
        LOAD8x1_1
        KERNEL8x1_I1
@@ -487,11 +542,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL8x1_2
 
        addic.          L,      L,      -2
-       ble             .LSGEMM_L8x1_LOOP_END
+       ble             SGEMM_L8x1_LOOP_END
 
        .align 5
 
-.LSGEMM_L8x1_LOOP:
+SGEMM_L8x1_LOOP:
 
        KERNEL8x1_1
        KERNEL8x1_2
@@ -504,9 +559,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL8x1_2
 
        addic.          L,      L,      -1
-       bgt             .LSGEMM_L8x1_LOOP
+       bgt             SGEMM_L8x1_LOOP
 
-.LSGEMM_L8x1_LOOP_END:
+SGEMM_L8x1_LOOP_END:
 
        KERNEL8x1_1
        KERNEL8x1_2
@@ -518,9 +573,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL8x1_1
        KERNEL8x1_E2
 
-       b               .LSGEMM_L8x1_SUB1
+       b               SGEMM_L8x1_SUB1
 
-.LSGEMM_L8x1_SUB4:
+SGEMM_L8x1_SUB4:
 
        KERNEL8x1_SUBI1
        KERNEL8x1_SUB1
@@ -532,74 +587,106 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL8x1_SUB1
        KERNEL8x1_SUB1
 
-       b               .LSGEMM_L8x1_SUB1
+       b               SGEMM_L8x1_SUB1
 
-.LSGEMM_L8x1_SUB0:
+SGEMM_L8x1_SUB0:
 
        andi.           L,      K,      7
 
        KERNEL8x1_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LSGEMM_L8x1_SAVE
-       b               .LSGEMM_L8x1_SUB2
+       ble             SGEMM_L8x1_SAVE
+       b               SGEMM_L8x1_SUB2
 
-.LSGEMM_L8x1_SUB1:
+SGEMM_L8x1_SUB1:
 
        andi.           L,      K,      7
-       ble             .LSGEMM_L8x1_SAVE
+       ble             SGEMM_L8x1_SAVE
 
-.LSGEMM_L8x1_SUB2:
+SGEMM_L8x1_SUB2:
 
        KERNEL8x1_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LSGEMM_L8x1_SUB2
+       bgt             SGEMM_L8x1_SUB2
 
-.LSGEMM_L8x1_SAVE:
+SGEMM_L8x1_SAVE:
 
        SAVE8x1
 
-.LSGEMM_L8x1_END:
+SGEMM_L8x1_END:
 
        slwi            T1,     K,      5
        add             B,      B,      T1
 
        addic.          J,      J,      -1
-       bgt             .LSGEMM_L8_BEGIN
+       bgt             SGEMM_L8_BEGIN
 
        andi.           T2,     N,      7
-       ble             .L999
+       ble             L999
+
+SGEMM_L8_END:
 
-.LSGEMM_L8_END:
+       b               SGEMM_L4_BEGIN
 
-       b               .LSGEMM_L4_BEGIN
+L999_H1:
 
-.L999_H1:
+       b               L999
 
-       b               .L999
+SGEMM_L4_BEGIN:
 
-.LSGEMM_L4_BEGIN:
+       mr              BO,     B
+       mr              BBO,    BBUFFER
+       slwi            T1,     K,      2
+
+SGEMM_L4_COPYB:
+       dcbtst          BBO,    PRE
+
+       lxvw4x          vs3,    o0,     BO
+       lxvw4x          vs11,   o16,    BO
+       xxspltw         vs4,    vs3,    0
+       xxspltw         vs5,    vs3,    1
+       xxspltw         vs6,    vs3,    2
+       xxspltw         vs7,    vs3,    3
+       xxspltw         vs12,   vs11,   0
+       xxspltw         vs13,   vs11,   1
+       xxspltw         vs14,   vs11,   2
+       xxspltw         vs15,   vs11,   3
+       stxvw4x         vs4,    o0,     BBO
+       stxvw4x         vs5,    o16,    BBO
+       stxvw4x         vs6,    o32,    BBO
+       stxvw4x         vs7,    o48,    BBO
+       addi            BO,     BO,     32
+       addi            BBO,    BBO,    64
+       stxvw4x         vs12,   o0,     BBO
+       stxvw4x         vs13,   o16,    BBO
+       stxvw4x         vs14,   o32,    BBO
+       stxvw4x         vs15,   o48,    BBO
+       addic.          T1,     T1,     -8
+       addi            BBO,    BBO,    64
+
+       bge             SGEMM_L4_COPYB
 
        andi.           T1,     N,      4
-       ble             .LSGEMM_L4_END
+       ble             SGEMM_L4_END
        mr              CO,     C
        mr              AO,     A
        slwi            T1,     LDC     ,       2
        add             C,      C,      T1
        srawi.          I,      M,      4
-       ble             .LSGEMM_L4x16_END
+       ble             SGEMM_L4x16_END
 
-.LSGEMM_L4x16_BEGIN:
+SGEMM_L4x16_BEGIN:
 
 
-       mr              BO,     B
+       mr              BO,     BBUFFER
        srawi.          L,      K,      3
-       ble             .LSGEMM_L4x16_SUB0
+       ble             SGEMM_L4x16_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LSGEMM_L4x16_SUB4
+       ble             SGEMM_L4x16_SUB4
 
-.LSGEMM_L4x16_LOOP_START:
+SGEMM_L4x16_LOOP_START:
 
        dcbt            AO,     PRE
        LOAD4x16_1
@@ -618,11 +705,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x16_2
 
        addic.          L,      L,      -2
-       ble             .LSGEMM_L4x16_LOOP_END
+       ble             SGEMM_L4x16_LOOP_END
 
        .align 5
 
-.LSGEMM_L4x16_LOOP:
+SGEMM_L4x16_LOOP:
 
        KERNEL4x16_1
        dcbt            AO,     PRE
@@ -639,9 +726,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x16_2
 
        addic.          L,      L,      -1
-       bgt             .LSGEMM_L4x16_LOOP
+       bgt             SGEMM_L4x16_LOOP
 
-.LSGEMM_L4x16_LOOP_END:
+SGEMM_L4x16_LOOP_END:
 
        KERNEL4x16_1
        dcbt            AO,     PRE
@@ -656,9 +743,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x16_1
        KERNEL4x16_E2
 
-       b               .LSGEMM_L4x16_SUB1
+       b               SGEMM_L4x16_SUB1
 
-.LSGEMM_L4x16_SUB4:
+SGEMM_L4x16_SUB4:
 
        dcbt            AO,     PRE
        KERNEL4x16_SUBI1
@@ -672,53 +759,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x16_SUB1
        KERNEL4x16_SUB1
 
-       b               .LSGEMM_L4x16_SUB1
+       b               SGEMM_L4x16_SUB1
 
-.LSGEMM_L4x16_SUB0:
+SGEMM_L4x16_SUB0:
 
        andi.           L,      K,      7
 
        KERNEL4x16_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LSGEMM_L4x16_SAVE
-       b               .LSGEMM_L4x16_SUB2
+       ble             SGEMM_L4x16_SAVE
+       b               SGEMM_L4x16_SUB2
 
-.LSGEMM_L4x16_SUB1:
+SGEMM_L4x16_SUB1:
 
        andi.           L,      K,      7
-       ble             .LSGEMM_L4x16_SAVE
+       ble             SGEMM_L4x16_SAVE
 
-.LSGEMM_L4x16_SUB2:
+SGEMM_L4x16_SUB2:
 
        KERNEL4x16_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LSGEMM_L4x16_SUB2
+       bgt             SGEMM_L4x16_SUB2
 
-.LSGEMM_L4x16_SAVE:
+SGEMM_L4x16_SAVE:
 
        SAVE4x16
 
        addic.          I,      I,      -1
-       bgt             .LSGEMM_L4x16_BEGIN
+       bgt             SGEMM_L4x16_BEGIN
 
-.LSGEMM_L4x16_END:
+SGEMM_L4x16_END:
 
-.LSGEMM_L4x8_BEGIN:
+SGEMM_L4x8_BEGIN:
 
        andi.           T2,     M,      15
-       ble             .LSGEMM_L4x1_END
+       ble             SGEMM_L4x1_END
 
        andi.           T1,     M,      8
-       ble             .LSGEMM_L4x8_END
-       mr              BO,     B
+       ble             SGEMM_L4x8_END
+       mr              BO,     BBUFFER
        srawi.          L,      K,      3
-       ble             .LSGEMM_L4x8_SUB0
+       ble             SGEMM_L4x8_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LSGEMM_L4x8_SUB4
+       ble             SGEMM_L4x8_SUB4
 
-.LSGEMM_L4x8_LOOP_START:
+SGEMM_L4x8_LOOP_START:
 
        LOAD4x8_1
        KERNEL4x8_I1
@@ -732,11 +819,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x8_2
 
        addic.          L,      L,      -2
-       ble             .LSGEMM_L4x8_LOOP_END
+       ble             SGEMM_L4x8_LOOP_END
 
        .align 5
 
-.LSGEMM_L4x8_LOOP:
+SGEMM_L4x8_LOOP:
 
        KERNEL4x8_1
        KERNEL4x8_2
@@ -749,9 +836,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x8_2
 
        addic.          L,      L,      -1
-       bgt             .LSGEMM_L4x8_LOOP
+       bgt             SGEMM_L4x8_LOOP
 
-.LSGEMM_L4x8_LOOP_END:
+SGEMM_L4x8_LOOP_END:
 
        KERNEL4x8_1
        KERNEL4x8_2
@@ -763,9 +850,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x8_1
        KERNEL4x8_E2
 
-       b               .LSGEMM_L4x8_SUB1
+       b               SGEMM_L4x8_SUB1
 
-.LSGEMM_L4x8_SUB4:
+SGEMM_L4x8_SUB4:
 
        KERNEL4x8_SUBI1
        KERNEL4x8_SUB1
@@ -777,48 +864,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x8_SUB1
        KERNEL4x8_SUB1
 
-       b               .LSGEMM_L4x8_SUB1
+       b               SGEMM_L4x8_SUB1
 
-.LSGEMM_L4x8_SUB0:
+SGEMM_L4x8_SUB0:
 
        andi.           L,      K,      7
 
        KERNEL4x8_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LSGEMM_L4x8_SAVE
-       b               .LSGEMM_L4x8_SUB2
+       ble             SGEMM_L4x8_SAVE
+       b               SGEMM_L4x8_SUB2
 
-.LSGEMM_L4x8_SUB1:
+SGEMM_L4x8_SUB1:
 
        andi.           L,      K,      7
-       ble             .LSGEMM_L4x8_SAVE
+       ble             SGEMM_L4x8_SAVE
 
-.LSGEMM_L4x8_SUB2:
+SGEMM_L4x8_SUB2:
 
        KERNEL4x8_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LSGEMM_L4x8_SUB2
+       bgt             SGEMM_L4x8_SUB2
 
-.LSGEMM_L4x8_SAVE:
+SGEMM_L4x8_SAVE:
 
        SAVE4x8
 
-.LSGEMM_L4x8_END:
+SGEMM_L4x8_END:
 
-.LSGEMM_L4x4_BEGIN:
+SGEMM_L4x4_BEGIN:
 
 
        andi.           T1,     M,      4
-       ble             .LSGEMM_L4x4_END
-       mr              BO,     B
+       ble             SGEMM_L4x4_END
+       mr              BO,     BBUFFER
        srawi.          L,      K,      3
-       ble             .LSGEMM_L4x4_SUB0
+       ble             SGEMM_L4x4_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LSGEMM_L4x4_SUB4
+       ble             SGEMM_L4x4_SUB4
 
-.LSGEMM_L4x4_LOOP_START:
+SGEMM_L4x4_LOOP_START:
 
        LOAD4x4_1
        KERNEL4x4_I1
@@ -832,11 +919,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x4_2
 
        addic.          L,      L,      -2
-       ble             .LSGEMM_L4x4_LOOP_END
+       ble             SGEMM_L4x4_LOOP_END
 
        .align 5
 
-.LSGEMM_L4x4_LOOP:
+SGEMM_L4x4_LOOP:
 
        KERNEL4x4_1
        KERNEL4x4_2
@@ -849,9 +936,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x4_2
 
        addic.          L,      L,      -1
-       bgt             .LSGEMM_L4x4_LOOP
+       bgt             SGEMM_L4x4_LOOP
 
-.LSGEMM_L4x4_LOOP_END:
+SGEMM_L4x4_LOOP_END:
 
        KERNEL4x4_1
        KERNEL4x4_2
@@ -863,9 +950,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x4_1
        KERNEL4x4_E2
 
-       b               .LSGEMM_L4x4_SUB1
+       b               SGEMM_L4x4_SUB1
 
-.LSGEMM_L4x4_SUB4:
+SGEMM_L4x4_SUB4:
 
        KERNEL4x4_SUBI1
        KERNEL4x4_SUB1
@@ -877,48 +964,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x4_SUB1
        KERNEL4x4_SUB1
 
-       b               .LSGEMM_L4x4_SUB1
+       b               SGEMM_L4x4_SUB1
 
-.LSGEMM_L4x4_SUB0:
+SGEMM_L4x4_SUB0:
 
        andi.           L,      K,      7
 
        KERNEL4x4_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LSGEMM_L4x4_SAVE
-       b               .LSGEMM_L4x4_SUB2
+       ble             SGEMM_L4x4_SAVE
+       b               SGEMM_L4x4_SUB2
 
-.LSGEMM_L4x4_SUB1:
+SGEMM_L4x4_SUB1:
 
        andi.           L,      K,      7
-       ble             .LSGEMM_L4x4_SAVE
+       ble             SGEMM_L4x4_SAVE
 
-.LSGEMM_L4x4_SUB2:
+SGEMM_L4x4_SUB2:
 
        KERNEL4x4_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LSGEMM_L4x4_SUB2
+       bgt             SGEMM_L4x4_SUB2
 
-.LSGEMM_L4x4_SAVE:
+SGEMM_L4x4_SAVE:
 
        SAVE4x4
 
-.LSGEMM_L4x4_END:
+SGEMM_L4x4_END:
 
-.LSGEMM_L4x2_BEGIN:
+SGEMM_L4x2_BEGIN:
 
 
        andi.           T1,     M,      2
-       ble             .LSGEMM_L4x2_END
-       mr              BO,     B
+       ble             SGEMM_L4x2_END
+       mr              BO,     BBUFFER
        srawi.          L,      K,      3
-       ble             .LSGEMM_L4x2_SUB0
+       ble             SGEMM_L4x2_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LSGEMM_L4x2_SUB4
+       ble             SGEMM_L4x2_SUB4
 
-.LSGEMM_L4x2_LOOP_START:
+SGEMM_L4x2_LOOP_START:
 
        LOAD4x2_1
        KERNEL4x2_I1
@@ -932,11 +1019,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x2_2
 
        addic.          L,      L,      -2
-       ble             .LSGEMM_L4x2_LOOP_END
+       ble             SGEMM_L4x2_LOOP_END
 
        .align 5
 
-.LSGEMM_L4x2_LOOP:
+SGEMM_L4x2_LOOP:
 
        KERNEL4x2_1
        KERNEL4x2_2
@@ -949,9 +1036,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x2_2
 
        addic.          L,      L,      -1
-       bgt             .LSGEMM_L4x2_LOOP
+       bgt             SGEMM_L4x2_LOOP
 
-.LSGEMM_L4x2_LOOP_END:
+SGEMM_L4x2_LOOP_END:
 
        KERNEL4x2_1
        KERNEL4x2_2
@@ -963,9 +1050,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x2_1
        KERNEL4x2_E2
 
-       b               .LSGEMM_L4x2_SUB1
+       b               SGEMM_L4x2_SUB1
 
-.LSGEMM_L4x2_SUB4:
+SGEMM_L4x2_SUB4:
 
        KERNEL4x2_SUBI1
        KERNEL4x2_SUB1
@@ -977,48 +1064,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x2_SUB1
        KERNEL4x2_SUB1
 
-       b               .LSGEMM_L4x2_SUB1
+       b               SGEMM_L4x2_SUB1
 
-.LSGEMM_L4x2_SUB0:
+SGEMM_L4x2_SUB0:
 
        andi.           L,      K,      7
 
        KERNEL4x2_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LSGEMM_L4x2_SAVE
-       b               .LSGEMM_L4x2_SUB2
+       ble             SGEMM_L4x2_SAVE
+       b               SGEMM_L4x2_SUB2
 
-.LSGEMM_L4x2_SUB1:
+SGEMM_L4x2_SUB1:
 
        andi.           L,      K,      7
-       ble             .LSGEMM_L4x2_SAVE
+       ble             SGEMM_L4x2_SAVE
 
-.LSGEMM_L4x2_SUB2:
+SGEMM_L4x2_SUB2:
 
        KERNEL4x2_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LSGEMM_L4x2_SUB2
+       bgt             SGEMM_L4x2_SUB2
 
-.LSGEMM_L4x2_SAVE:
+SGEMM_L4x2_SAVE:
 
        SAVE4x2
 
-.LSGEMM_L4x2_END:
+SGEMM_L4x2_END:
 
-.LSGEMM_L4x1_BEGIN:
+SGEMM_L4x1_BEGIN:
 
 
        andi.           T1,     M,      1
-       ble             .LSGEMM_L4x1_END
-       mr              BO,     B
+       ble             SGEMM_L4x1_END
+       mr              BO,     BBUFFER
        srawi.          L,      K,      3
-       ble             .LSGEMM_L4x1_SUB0
+       ble             SGEMM_L4x1_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LSGEMM_L4x1_SUB4
+       ble             SGEMM_L4x1_SUB4
 
-.LSGEMM_L4x1_LOOP_START:
+SGEMM_L4x1_LOOP_START:
 
        LOAD4x1_1
        KERNEL4x1_I1
@@ -1032,11 +1119,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x1_2
 
        addic.          L,      L,      -2
-       ble             .LSGEMM_L4x1_LOOP_END
+       ble             SGEMM_L4x1_LOOP_END
 
        .align 5
 
-.LSGEMM_L4x1_LOOP:
+SGEMM_L4x1_LOOP:
 
        KERNEL4x1_1
        KERNEL4x1_2
@@ -1049,9 +1136,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x1_2
 
        addic.          L,      L,      -1
-       bgt             .LSGEMM_L4x1_LOOP
+       bgt             SGEMM_L4x1_LOOP
 
-.LSGEMM_L4x1_LOOP_END:
+SGEMM_L4x1_LOOP_END:
 
        KERNEL4x1_1
        KERNEL4x1_2
@@ -1063,9 +1150,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x1_1
        KERNEL4x1_E2
 
-       b               .LSGEMM_L4x1_SUB1
+       b               SGEMM_L4x1_SUB1
 
-.LSGEMM_L4x1_SUB4:
+SGEMM_L4x1_SUB4:
 
        KERNEL4x1_SUBI1
        KERNEL4x1_SUB1
@@ -1077,61 +1164,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x1_SUB1
        KERNEL4x1_SUB1
 
-       b               .LSGEMM_L4x1_SUB1
+       b               SGEMM_L4x1_SUB1
 
-.LSGEMM_L4x1_SUB0:
+SGEMM_L4x1_SUB0:
 
        andi.           L,      K,      7
 
        KERNEL4x1_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LSGEMM_L4x1_SAVE
-       b               .LSGEMM_L4x1_SUB2
+       ble             SGEMM_L4x1_SAVE
+       b               SGEMM_L4x1_SUB2
 
-.LSGEMM_L4x1_SUB1:
+SGEMM_L4x1_SUB1:
 
        andi.           L,      K,      7
-       ble             .LSGEMM_L4x1_SAVE
+       ble             SGEMM_L4x1_SAVE
 
-.LSGEMM_L4x1_SUB2:
+SGEMM_L4x1_SUB2:
 
        KERNEL4x1_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LSGEMM_L4x1_SUB2
+       bgt             SGEMM_L4x1_SUB2
 
-.LSGEMM_L4x1_SAVE:
+SGEMM_L4x1_SAVE:
 
        SAVE4x1
 
-.LSGEMM_L4x1_END:
+SGEMM_L4x1_END:
 
        slwi            T1,     K,      4
        add             B,      B,      T1
 
-.LSGEMM_L4_END:
-.LSGEMM_L2_BEGIN:
+SGEMM_L4_END:
+SGEMM_L2_BEGIN:
+
+       mr              BO,     B
+       mr              BBO,    BBUFFER
+       slwi            T1,     K,      1
+
+SGEMM_L2_COPYB:
+       dcbtst          BBO,    PRE
+
+       lxvw4x          vs3,    o0,     BO
+       lxvw4x          vs11,   o16,    BO
+       xxspltw         vs4,    vs3,    0
+       xxspltw         vs5,    vs3,    1
+       xxspltw         vs6,    vs3,    2
+       xxspltw         vs7,    vs3,    3
+       xxspltw         vs12,   vs11,   0
+       xxspltw         vs13,   vs11,   1
+       xxspltw         vs14,   vs11,   2
+       xxspltw         vs15,   vs11,   3
+       stxvw4x         vs4,    o0,     BBO
+       stxvw4x         vs5,    o16,    BBO
+       stxvw4x         vs6,    o32,    BBO
+       stxvw4x         vs7,    o48,    BBO
+       addi            BO,     BO,     32
+       addi            BBO,    BBO,    64
+       stxvw4x         vs12,   o0,     BBO
+       stxvw4x         vs13,   o16,    BBO
+       stxvw4x         vs14,   o32,    BBO
+       stxvw4x         vs15,   o48,    BBO
+       addic.          T1,     T1,     -8
+       addi            BBO,    BBO,    64
+
+       bge             SGEMM_L2_COPYB
 
        andi.           T1,     N,      2
-       ble             .LSGEMM_L2_END
+       ble             SGEMM_L2_END
        mr              CO,     C
        mr              AO,     A
        slwi            T1,     LDC     ,       1
        add             C,      C,      T1
        srawi.          I,      M,      4
-       ble             .LSGEMM_L2x16_END
+       ble             SGEMM_L2x16_END
 
-.LSGEMM_L2x16_BEGIN:
+SGEMM_L2x16_BEGIN:
 
 
-       mr              BO,     B
+       mr              BO,     BBUFFER
        srawi.          L,      K,      3
-       ble             .LSGEMM_L2x16_SUB0
+       ble             SGEMM_L2x16_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LSGEMM_L2x16_SUB4
+       ble             SGEMM_L2x16_SUB4
 
-.LSGEMM_L2x16_LOOP_START:
+SGEMM_L2x16_LOOP_START:
 
        dcbt            AO,     PRE
        LOAD2x16_1
@@ -1150,11 +1269,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x16_2
 
        addic.          L,      L,      -2
-       ble             .LSGEMM_L2x16_LOOP_END
+       ble             SGEMM_L2x16_LOOP_END
 
        .align 5
 
-.LSGEMM_L2x16_LOOP:
+SGEMM_L2x16_LOOP:
 
        KERNEL2x16_1
        dcbt            AO,     PRE
@@ -1171,9 +1290,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x16_2
 
        addic.          L,      L,      -1
-       bgt             .LSGEMM_L2x16_LOOP
+       bgt             SGEMM_L2x16_LOOP
 
-.LSGEMM_L2x16_LOOP_END:
+SGEMM_L2x16_LOOP_END:
 
        KERNEL2x16_1
        dcbt            AO,     PRE
@@ -1188,9 +1307,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x16_1
        KERNEL2x16_E2
 
-       b               .LSGEMM_L2x16_SUB1
+       b               SGEMM_L2x16_SUB1
 
-.LSGEMM_L2x16_SUB4:
+SGEMM_L2x16_SUB4:
 
        dcbt            AO,     PRE
        KERNEL2x16_SUBI1
@@ -1204,53 +1323,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x16_SUB1
        KERNEL2x16_SUB1
 
-       b               .LSGEMM_L2x16_SUB1
+       b               SGEMM_L2x16_SUB1
 
-.LSGEMM_L2x16_SUB0:
+SGEMM_L2x16_SUB0:
 
        andi.           L,      K,      7
 
        KERNEL2x16_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LSGEMM_L2x16_SAVE
-       b               .LSGEMM_L2x16_SUB2
+       ble             SGEMM_L2x16_SAVE
+       b               SGEMM_L2x16_SUB2
 
-.LSGEMM_L2x16_SUB1:
+SGEMM_L2x16_SUB1:
 
        andi.           L,      K,      7
-       ble             .LSGEMM_L2x16_SAVE
+       ble             SGEMM_L2x16_SAVE
 
-.LSGEMM_L2x16_SUB2:
+SGEMM_L2x16_SUB2:
 
        KERNEL2x16_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LSGEMM_L2x16_SUB2
+       bgt             SGEMM_L2x16_SUB2
 
-.LSGEMM_L2x16_SAVE:
+SGEMM_L2x16_SAVE:
 
        SAVE2x16
 
        addic.          I,      I,      -1
-       bgt             .LSGEMM_L2x16_BEGIN
+       bgt             SGEMM_L2x16_BEGIN
 
-.LSGEMM_L2x16_END:
+SGEMM_L2x16_END:
 
-.LSGEMM_L2x8_BEGIN:
+SGEMM_L2x8_BEGIN:
 
        andi.           T2,     M,      15
-       ble             .LSGEMM_L2x1_END
+       ble             SGEMM_L2x1_END
 
        andi.           T1,     M,      8
-       ble             .LSGEMM_L2x8_END
-       mr              BO,     B
+       ble             SGEMM_L2x8_END
+       mr              BO,     BBUFFER
        srawi.          L,      K,      3
-       ble             .LSGEMM_L2x8_SUB0
+       ble             SGEMM_L2x8_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LSGEMM_L2x8_SUB4
+       ble             SGEMM_L2x8_SUB4
 
-.LSGEMM_L2x8_LOOP_START:
+SGEMM_L2x8_LOOP_START:
 
        LOAD2x8_1
        KERNEL2x8_I1
@@ -1264,11 +1383,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x8_2
 
        addic.          L,      L,      -2
-       ble             .LSGEMM_L2x8_LOOP_END
+       ble             SGEMM_L2x8_LOOP_END
 
        .align 5
 
-.LSGEMM_L2x8_LOOP:
+SGEMM_L2x8_LOOP:
 
        KERNEL2x8_1
        KERNEL2x8_2
@@ -1281,9 +1400,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x8_2
 
        addic.          L,      L,      -1
-       bgt             .LSGEMM_L2x8_LOOP
+       bgt             SGEMM_L2x8_LOOP
 
-.LSGEMM_L2x8_LOOP_END:
+SGEMM_L2x8_LOOP_END:
 
        KERNEL2x8_1
        KERNEL2x8_2
@@ -1295,9 +1414,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x8_1
        KERNEL2x8_E2
 
-       b               .LSGEMM_L2x8_SUB1
+       b               SGEMM_L2x8_SUB1
 
-.LSGEMM_L2x8_SUB4:
+SGEMM_L2x8_SUB4:
 
        KERNEL2x8_SUBI1
        KERNEL2x8_SUB1
@@ -1309,48 +1428,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x8_SUB1
        KERNEL2x8_SUB1
 
-       b               .LSGEMM_L2x8_SUB1
+       b               SGEMM_L2x8_SUB1
 
-.LSGEMM_L2x8_SUB0:
+SGEMM_L2x8_SUB0:
 
        andi.           L,      K,      7
 
        KERNEL2x8_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LSGEMM_L2x8_SAVE
-       b               .LSGEMM_L2x8_SUB2
+       ble             SGEMM_L2x8_SAVE
+       b               SGEMM_L2x8_SUB2
 
-.LSGEMM_L2x8_SUB1:
+SGEMM_L2x8_SUB1:
 
        andi.           L,      K,      7
-       ble             .LSGEMM_L2x8_SAVE
+       ble             SGEMM_L2x8_SAVE
 
-.LSGEMM_L2x8_SUB2:
+SGEMM_L2x8_SUB2:
 
        KERNEL2x8_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LSGEMM_L2x8_SUB2
+       bgt             SGEMM_L2x8_SUB2
 
-.LSGEMM_L2x8_SAVE:
+SGEMM_L2x8_SAVE:
 
        SAVE2x8
 
-.LSGEMM_L2x8_END:
+SGEMM_L2x8_END:
 
-.LSGEMM_L2x4_BEGIN:
+SGEMM_L2x4_BEGIN:
 
 
        andi.           T1,     M,      4
-       ble             .LSGEMM_L2x4_END
-       mr              BO,     B
+       ble             SGEMM_L2x4_END
+       mr              BO,     BBUFFER
        srawi.          L,      K,      3
-       ble             .LSGEMM_L2x4_SUB0
+       ble             SGEMM_L2x4_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LSGEMM_L2x4_SUB4
+       ble             SGEMM_L2x4_SUB4
 
-.LSGEMM_L2x4_LOOP_START:
+SGEMM_L2x4_LOOP_START:
 
        LOAD2x4_1
        KERNEL2x4_I1
@@ -1364,11 +1483,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x4_2
 
        addic.          L,      L,      -2
-       ble             .LSGEMM_L2x4_LOOP_END
+       ble             SGEMM_L2x4_LOOP_END
 
        .align 5
 
-.LSGEMM_L2x4_LOOP:
+SGEMM_L2x4_LOOP:
 
        KERNEL2x4_1
        KERNEL2x4_2
@@ -1381,9 +1500,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x4_2
 
        addic.          L,      L,      -1
-       bgt             .LSGEMM_L2x4_LOOP
+       bgt             SGEMM_L2x4_LOOP
 
-.LSGEMM_L2x4_LOOP_END:
+SGEMM_L2x4_LOOP_END:
 
        KERNEL2x4_1
        KERNEL2x4_2
@@ -1395,9 +1514,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x4_1
        KERNEL2x4_E2
 
-       b               .LSGEMM_L2x4_SUB1
+       b               SGEMM_L2x4_SUB1
 
-.LSGEMM_L2x4_SUB4:
+SGEMM_L2x4_SUB4:
 
        KERNEL2x4_SUBI1
        KERNEL2x4_SUB1
@@ -1409,48 +1528,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x4_SUB1
        KERNEL2x4_SUB1
 
-       b               .LSGEMM_L2x4_SUB1
+       b               SGEMM_L2x4_SUB1
 
-.LSGEMM_L2x4_SUB0:
+SGEMM_L2x4_SUB0:
 
        andi.           L,      K,      7
 
        KERNEL2x4_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LSGEMM_L2x4_SAVE
-       b               .LSGEMM_L2x4_SUB2
+       ble             SGEMM_L2x4_SAVE
+       b               SGEMM_L2x4_SUB2
 
-.LSGEMM_L2x4_SUB1:
+SGEMM_L2x4_SUB1:
 
        andi.           L,      K,      7
-       ble             .LSGEMM_L2x4_SAVE
+       ble             SGEMM_L2x4_SAVE
 
-.LSGEMM_L2x4_SUB2:
+SGEMM_L2x4_SUB2:
 
        KERNEL2x4_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LSGEMM_L2x4_SUB2
+       bgt             SGEMM_L2x4_SUB2
 
-.LSGEMM_L2x4_SAVE:
+SGEMM_L2x4_SAVE:
 
        SAVE2x4
 
-.LSGEMM_L2x4_END:
+SGEMM_L2x4_END:
 
-.LSGEMM_L2x2_BEGIN:
+SGEMM_L2x2_BEGIN:
 
 
        andi.           T1,     M,      2
-       ble             .LSGEMM_L2x2_END
-       mr              BO,     B
+       ble             SGEMM_L2x2_END
+       mr              BO,     BBUFFER
        srawi.          L,      K,      3
-       ble             .LSGEMM_L2x2_SUB0
+       ble             SGEMM_L2x2_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LSGEMM_L2x2_SUB4
+       ble             SGEMM_L2x2_SUB4
 
-.LSGEMM_L2x2_LOOP_START:
+SGEMM_L2x2_LOOP_START:
 
        LOAD2x2_1
        KERNEL2x2_I1
@@ -1464,11 +1583,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x2_2
 
        addic.          L,      L,      -2
-       ble             .LSGEMM_L2x2_LOOP_END
+       ble             SGEMM_L2x2_LOOP_END
 
        .align 5
 
-.LSGEMM_L2x2_LOOP:
+SGEMM_L2x2_LOOP:
 
        KERNEL2x2_1
        KERNEL2x2_2
@@ -1481,9 +1600,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x2_2
 
        addic.          L,      L,      -1
-       bgt             .LSGEMM_L2x2_LOOP
+       bgt             SGEMM_L2x2_LOOP
 
-.LSGEMM_L2x2_LOOP_END:
+SGEMM_L2x2_LOOP_END:
 
        KERNEL2x2_1
        KERNEL2x2_2
@@ -1495,9 +1614,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x2_1
        KERNEL2x2_E2
 
-       b               .LSGEMM_L2x2_SUB1
+       b               SGEMM_L2x2_SUB1
 
-.LSGEMM_L2x2_SUB4:
+SGEMM_L2x2_SUB4:
 
        KERNEL2x2_SUBI1
        KERNEL2x2_SUB1
@@ -1509,48 +1628,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x2_SUB1
        KERNEL2x2_SUB1
 
-       b               .LSGEMM_L2x2_SUB1
+       b               SGEMM_L2x2_SUB1
 
-.LSGEMM_L2x2_SUB0:
+SGEMM_L2x2_SUB0:
 
        andi.           L,      K,      7
 
        KERNEL2x2_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LSGEMM_L2x2_SAVE
-       b               .LSGEMM_L2x2_SUB2
+       ble             SGEMM_L2x2_SAVE
+       b               SGEMM_L2x2_SUB2
 
-.LSGEMM_L2x2_SUB1:
+SGEMM_L2x2_SUB1:
 
        andi.           L,      K,      7
-       ble             .LSGEMM_L2x2_SAVE
+       ble             SGEMM_L2x2_SAVE
 
-.LSGEMM_L2x2_SUB2:
+SGEMM_L2x2_SUB2:
 
        KERNEL2x2_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LSGEMM_L2x2_SUB2
+       bgt             SGEMM_L2x2_SUB2
 
-.LSGEMM_L2x2_SAVE:
+SGEMM_L2x2_SAVE:
 
        SAVE2x2
 
-.LSGEMM_L2x2_END:
+SGEMM_L2x2_END:
 
-.LSGEMM_L2x1_BEGIN:
+SGEMM_L2x1_BEGIN:
 
 
        andi.           T1,     M,      1
-       ble             .LSGEMM_L2x1_END
-       mr              BO,     B
+       ble             SGEMM_L2x1_END
+       mr              BO,     BBUFFER
        srawi.          L,      K,      3
-       ble             .LSGEMM_L2x1_SUB0
+       ble             SGEMM_L2x1_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LSGEMM_L2x1_SUB4
+       ble             SGEMM_L2x1_SUB4
 
-.LSGEMM_L2x1_LOOP_START:
+SGEMM_L2x1_LOOP_START:
 
        LOAD2x1_1
        KERNEL2x1_I1
@@ -1564,11 +1683,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x1_2
 
        addic.          L,      L,      -2
-       ble             .LSGEMM_L2x1_LOOP_END
+       ble             SGEMM_L2x1_LOOP_END
 
        .align 5
 
-.LSGEMM_L2x1_LOOP:
+SGEMM_L2x1_LOOP:
 
        KERNEL2x1_1
        KERNEL2x1_2
@@ -1581,9 +1700,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x1_2
 
        addic.          L,      L,      -1
-       bgt             .LSGEMM_L2x1_LOOP
+       bgt             SGEMM_L2x1_LOOP
 
-.LSGEMM_L2x1_LOOP_END:
+SGEMM_L2x1_LOOP_END:
 
        KERNEL2x1_1
        KERNEL2x1_2
@@ -1595,9 +1714,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x1_1
        KERNEL2x1_E2
 
-       b               .LSGEMM_L2x1_SUB1
+       b               SGEMM_L2x1_SUB1
 
-.LSGEMM_L2x1_SUB4:
+SGEMM_L2x1_SUB4:
 
        KERNEL2x1_SUBI1
        KERNEL2x1_SUB1
@@ -1609,59 +1728,91 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x1_SUB1
        KERNEL2x1_SUB1
 
-       b               .LSGEMM_L2x1_SUB1
+       b               SGEMM_L2x1_SUB1
 
-.LSGEMM_L2x1_SUB0:
+SGEMM_L2x1_SUB0:
 
        andi.           L,      K,      7
 
        KERNEL2x1_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LSGEMM_L2x1_SAVE
-       b               .LSGEMM_L2x1_SUB2
+       ble             SGEMM_L2x1_SAVE
+       b               SGEMM_L2x1_SUB2
 
-.LSGEMM_L2x1_SUB1:
+SGEMM_L2x1_SUB1:
 
        andi.           L,      K,      7
-       ble             .LSGEMM_L2x1_SAVE
+       ble             SGEMM_L2x1_SAVE
 
-.LSGEMM_L2x1_SUB2:
+SGEMM_L2x1_SUB2:
 
        KERNEL2x1_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LSGEMM_L2x1_SUB2
+       bgt             SGEMM_L2x1_SUB2
 
-.LSGEMM_L2x1_SAVE:
+SGEMM_L2x1_SAVE:
 
        SAVE2x1
 
-.LSGEMM_L2x1_END:
+SGEMM_L2x1_END:
 
        slwi            T1,     K,      3
        add             B,      B,      T1
 
-.LSGEMM_L2_END:
-.LSGEMM_L1_BEGIN:
+SGEMM_L2_END:
+SGEMM_L1_BEGIN:
+
+       mr              BO,     B
+       mr              BBO,    BBUFFER
+       slwi            T1,     K,      0
+
+SGEMM_L1_COPYB:
+       dcbtst          BBO,    PRE
+
+       lxvw4x          vs3,    o0,     BO
+       lxvw4x          vs11,   o16,    BO
+       xxspltw         vs4,    vs3,    0
+       xxspltw         vs5,    vs3,    1
+       xxspltw         vs6,    vs3,    2
+       xxspltw         vs7,    vs3,    3
+       xxspltw         vs12,   vs11,   0
+       xxspltw         vs13,   vs11,   1
+       xxspltw         vs14,   vs11,   2
+       xxspltw         vs15,   vs11,   3
+       stxvw4x         vs4,    o0,     BBO
+       stxvw4x         vs5,    o16,    BBO
+       stxvw4x         vs6,    o32,    BBO
+       stxvw4x         vs7,    o48,    BBO
+       addi            BO,     BO,     32
+       addi            BBO,    BBO,    64
+       stxvw4x         vs12,   o0,     BBO
+       stxvw4x         vs13,   o16,    BBO
+       stxvw4x         vs14,   o32,    BBO
+       stxvw4x         vs15,   o48,    BBO
+       addic.          T1,     T1,     -8
+       addi            BBO,    BBO,    64
+
+       bge             SGEMM_L1_COPYB
 
        andi.           T1,     N,      1
-       ble             .LSGEMM_L1_END
+       ble             SGEMM_L1_END
        mr              CO,     C
        mr              AO,     A
        srawi.          I,      M,      4
-       ble             .LSGEMM_L1x16_END
+       ble             SGEMM_L1x16_END
 
-.LSGEMM_L1x16_BEGIN:
+SGEMM_L1x16_BEGIN:
 
 
-       mr              BO,     B
+       mr              BO,     BBUFFER
        srawi.          L,      K,      3
-       ble             .LSGEMM_L1x16_SUB0
+       ble             SGEMM_L1x16_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LSGEMM_L1x16_SUB4
+       ble             SGEMM_L1x16_SUB4
 
-.LSGEMM_L1x16_LOOP_START:
+SGEMM_L1x16_LOOP_START:
 
        dcbt            AO,     PRE
        LOAD1x16_1
@@ -1680,11 +1831,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x16_2
 
        addic.          L,      L,      -2
-       ble             .LSGEMM_L1x16_LOOP_END
+       ble             SGEMM_L1x16_LOOP_END
 
        .align 5
 
-.LSGEMM_L1x16_LOOP:
+SGEMM_L1x16_LOOP:
 
        KERNEL1x16_1
        dcbt            AO,     PRE
@@ -1701,9 +1852,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x16_2
 
        addic.          L,      L,      -1
-       bgt             .LSGEMM_L1x16_LOOP
+       bgt             SGEMM_L1x16_LOOP
 
-.LSGEMM_L1x16_LOOP_END:
+SGEMM_L1x16_LOOP_END:
 
        KERNEL1x16_1
        dcbt            AO,     PRE
@@ -1718,9 +1869,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x16_1
        KERNEL1x16_E2
 
-       b               .LSGEMM_L1x16_SUB1
+       b               SGEMM_L1x16_SUB1
 
-.LSGEMM_L1x16_SUB4:
+SGEMM_L1x16_SUB4:
 
        dcbt            AO,     PRE
        KERNEL1x16_SUBI1
@@ -1734,53 +1885,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x16_SUB1
        KERNEL1x16_SUB1
 
-       b               .LSGEMM_L1x16_SUB1
+       b               SGEMM_L1x16_SUB1
 
-.LSGEMM_L1x16_SUB0:
+SGEMM_L1x16_SUB0:
 
        andi.           L,      K,      7
 
        KERNEL1x16_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LSGEMM_L1x16_SAVE
-       b               .LSGEMM_L1x16_SUB2
+       ble             SGEMM_L1x16_SAVE
+       b               SGEMM_L1x16_SUB2
 
-.LSGEMM_L1x16_SUB1:
+SGEMM_L1x16_SUB1:
 
        andi.           L,      K,      7
-       ble             .LSGEMM_L1x16_SAVE
+       ble             SGEMM_L1x16_SAVE
 
-.LSGEMM_L1x16_SUB2:
+SGEMM_L1x16_SUB2:
 
        KERNEL1x16_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LSGEMM_L1x16_SUB2
+       bgt             SGEMM_L1x16_SUB2
 
-.LSGEMM_L1x16_SAVE:
+SGEMM_L1x16_SAVE:
 
        SAVE1x16
 
        addic.          I,      I,      -1
-       bgt             .LSGEMM_L1x16_BEGIN
+       bgt             SGEMM_L1x16_BEGIN
 
-.LSGEMM_L1x16_END:
+SGEMM_L1x16_END:
 
-.LSGEMM_L1x8_BEGIN:
+SGEMM_L1x8_BEGIN:
 
        andi.           T2,     M,      15
-       ble             .LSGEMM_L1x1_END
+       ble             SGEMM_L1x1_END
 
        andi.           T1,     M,      8
-       ble             .LSGEMM_L1x8_END
-       mr              BO,     B
+       ble             SGEMM_L1x8_END
+       mr              BO,     BBUFFER
        srawi.          L,      K,      3
-       ble             .LSGEMM_L1x8_SUB0
+       ble             SGEMM_L1x8_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LSGEMM_L1x8_SUB4
+       ble             SGEMM_L1x8_SUB4
 
-.LSGEMM_L1x8_LOOP_START:
+SGEMM_L1x8_LOOP_START:
 
        LOAD1x8_1
        KERNEL1x8_I1
@@ -1794,11 +1945,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x8_2
 
        addic.          L,      L,      -2
-       ble             .LSGEMM_L1x8_LOOP_END
+       ble             SGEMM_L1x8_LOOP_END
 
        .align 5
 
-.LSGEMM_L1x8_LOOP:
+SGEMM_L1x8_LOOP:
 
        KERNEL1x8_1
        KERNEL1x8_2
@@ -1811,9 +1962,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x8_2
 
        addic.          L,      L,      -1
-       bgt             .LSGEMM_L1x8_LOOP
+       bgt             SGEMM_L1x8_LOOP
 
-.LSGEMM_L1x8_LOOP_END:
+SGEMM_L1x8_LOOP_END:
 
        KERNEL1x8_1
        KERNEL1x8_2
@@ -1825,9 +1976,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x8_1
        KERNEL1x8_E2
 
-       b               .LSGEMM_L1x8_SUB1
+       b               SGEMM_L1x8_SUB1
 
-.LSGEMM_L1x8_SUB4:
+SGEMM_L1x8_SUB4:
 
        KERNEL1x8_SUBI1
        KERNEL1x8_SUB1
@@ -1839,48 +1990,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x8_SUB1
        KERNEL1x8_SUB1
 
-       b               .LSGEMM_L1x8_SUB1
+       b               SGEMM_L1x8_SUB1
 
-.LSGEMM_L1x8_SUB0:
+SGEMM_L1x8_SUB0:
 
        andi.           L,      K,      7
 
        KERNEL1x8_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LSGEMM_L1x8_SAVE
-       b               .LSGEMM_L1x8_SUB2
+       ble             SGEMM_L1x8_SAVE
+       b               SGEMM_L1x8_SUB2
 
-.LSGEMM_L1x8_SUB1:
+SGEMM_L1x8_SUB1:
 
        andi.           L,      K,      7
-       ble             .LSGEMM_L1x8_SAVE
+       ble             SGEMM_L1x8_SAVE
 
-.LSGEMM_L1x8_SUB2:
+SGEMM_L1x8_SUB2:
 
        KERNEL1x8_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LSGEMM_L1x8_SUB2
+       bgt             SGEMM_L1x8_SUB2
 
-.LSGEMM_L1x8_SAVE:
+SGEMM_L1x8_SAVE:
 
        SAVE1x8
 
-.LSGEMM_L1x8_END:
+SGEMM_L1x8_END:
 
-.LSGEMM_L1x4_BEGIN:
+SGEMM_L1x4_BEGIN:
 
 
        andi.           T1,     M,      4
-       ble             .LSGEMM_L1x4_END
-       mr              BO,     B
+       ble             SGEMM_L1x4_END
+       mr              BO,     BBUFFER
        srawi.          L,      K,      3
-       ble             .LSGEMM_L1x4_SUB0
+       ble             SGEMM_L1x4_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LSGEMM_L1x4_SUB4
+       ble             SGEMM_L1x4_SUB4
 
-.LSGEMM_L1x4_LOOP_START:
+SGEMM_L1x4_LOOP_START:
 
        LOAD1x4_1
        KERNEL1x4_I1
@@ -1894,11 +2045,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x4_2
 
        addic.          L,      L,      -2
-       ble             .LSGEMM_L1x4_LOOP_END
+       ble             SGEMM_L1x4_LOOP_END
 
        .align 5
 
-.LSGEMM_L1x4_LOOP:
+SGEMM_L1x4_LOOP:
 
        KERNEL1x4_1
        KERNEL1x4_2
@@ -1911,9 +2062,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x4_2
 
        addic.          L,      L,      -1
-       bgt             .LSGEMM_L1x4_LOOP
+       bgt             SGEMM_L1x4_LOOP
 
-.LSGEMM_L1x4_LOOP_END:
+SGEMM_L1x4_LOOP_END:
 
        KERNEL1x4_1
        KERNEL1x4_2
@@ -1925,9 +2076,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x4_1
        KERNEL1x4_E2
 
-       b               .LSGEMM_L1x4_SUB1
+       b               SGEMM_L1x4_SUB1
 
-.LSGEMM_L1x4_SUB4:
+SGEMM_L1x4_SUB4:
 
        KERNEL1x4_SUBI1
        KERNEL1x4_SUB1
@@ -1939,48 +2090,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x4_SUB1
        KERNEL1x4_SUB1
 
-       b               .LSGEMM_L1x4_SUB1
+       b               SGEMM_L1x4_SUB1
 
-.LSGEMM_L1x4_SUB0:
+SGEMM_L1x4_SUB0:
 
        andi.           L,      K,      7
 
        KERNEL1x4_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LSGEMM_L1x4_SAVE
-       b               .LSGEMM_L1x4_SUB2
+       ble             SGEMM_L1x4_SAVE
+       b               SGEMM_L1x4_SUB2
 
-.LSGEMM_L1x4_SUB1:
+SGEMM_L1x4_SUB1:
 
        andi.           L,      K,      7
-       ble             .LSGEMM_L1x4_SAVE
+       ble             SGEMM_L1x4_SAVE
 
-.LSGEMM_L1x4_SUB2:
+SGEMM_L1x4_SUB2:
 
        KERNEL1x4_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LSGEMM_L1x4_SUB2
+       bgt             SGEMM_L1x4_SUB2
 
-.LSGEMM_L1x4_SAVE:
+SGEMM_L1x4_SAVE:
 
        SAVE1x4
 
-.LSGEMM_L1x4_END:
+SGEMM_L1x4_END:
 
-.LSGEMM_L1x2_BEGIN:
+SGEMM_L1x2_BEGIN:
 
 
        andi.           T1,     M,      2
-       ble             .LSGEMM_L1x2_END
-       mr              BO,     B
+       ble             SGEMM_L1x2_END
+       mr              BO,     BBUFFER
        srawi.          L,      K,      3
-       ble             .LSGEMM_L1x2_SUB0
+       ble             SGEMM_L1x2_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LSGEMM_L1x2_SUB4
+       ble             SGEMM_L1x2_SUB4
 
-.LSGEMM_L1x2_LOOP_START:
+SGEMM_L1x2_LOOP_START:
 
        LOAD1x2_1
        KERNEL1x2_I1
@@ -1994,11 +2145,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x2_2
 
        addic.          L,      L,      -2
-       ble             .LSGEMM_L1x2_LOOP_END
+       ble             SGEMM_L1x2_LOOP_END
 
        .align 5
 
-.LSGEMM_L1x2_LOOP:
+SGEMM_L1x2_LOOP:
 
        KERNEL1x2_1
        KERNEL1x2_2
@@ -2011,9 +2162,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x2_2
 
        addic.          L,      L,      -1
-       bgt             .LSGEMM_L1x2_LOOP
+       bgt             SGEMM_L1x2_LOOP
 
-.LSGEMM_L1x2_LOOP_END:
+SGEMM_L1x2_LOOP_END:
 
        KERNEL1x2_1
        KERNEL1x2_2
@@ -2025,9 +2176,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x2_1
        KERNEL1x2_E2
 
-       b               .LSGEMM_L1x2_SUB1
+       b               SGEMM_L1x2_SUB1
 
-.LSGEMM_L1x2_SUB4:
+SGEMM_L1x2_SUB4:
 
        KERNEL1x2_SUBI1
        KERNEL1x2_SUB1
@@ -2039,48 +2190,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x2_SUB1
        KERNEL1x2_SUB1
 
-       b               .LSGEMM_L1x2_SUB1
+       b               SGEMM_L1x2_SUB1
 
-.LSGEMM_L1x2_SUB0:
+SGEMM_L1x2_SUB0:
 
        andi.           L,      K,      7
 
        KERNEL1x2_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LSGEMM_L1x2_SAVE
-       b               .LSGEMM_L1x2_SUB2
+       ble             SGEMM_L1x2_SAVE
+       b               SGEMM_L1x2_SUB2
 
-.LSGEMM_L1x2_SUB1:
+SGEMM_L1x2_SUB1:
 
        andi.           L,      K,      7
-       ble             .LSGEMM_L1x2_SAVE
+       ble             SGEMM_L1x2_SAVE
 
-.LSGEMM_L1x2_SUB2:
+SGEMM_L1x2_SUB2:
 
        KERNEL1x2_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LSGEMM_L1x2_SUB2
+       bgt             SGEMM_L1x2_SUB2
 
-.LSGEMM_L1x2_SAVE:
+SGEMM_L1x2_SAVE:
 
        SAVE1x2
 
-.LSGEMM_L1x2_END:
+SGEMM_L1x2_END:
 
-.LSGEMM_L1x1_BEGIN:
+SGEMM_L1x1_BEGIN:
 
 
        andi.           T1,     M,      1
-       ble             .LSGEMM_L1x1_END
-       mr              BO,     B
+       ble             SGEMM_L1x1_END
+       mr              BO,     BBUFFER
        srawi.          L,      K,      3
-       ble             .LSGEMM_L1x1_SUB0
+       ble             SGEMM_L1x1_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LSGEMM_L1x1_SUB4
+       ble             SGEMM_L1x1_SUB4
 
-.LSGEMM_L1x1_LOOP_START:
+SGEMM_L1x1_LOOP_START:
 
        LOAD1x1_1
        KERNEL1x1_I1
@@ -2094,11 +2245,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x1_2
 
        addic.          L,      L,      -2
-       ble             .LSGEMM_L1x1_LOOP_END
+       ble             SGEMM_L1x1_LOOP_END
 
        .align 5
 
-.LSGEMM_L1x1_LOOP:
+SGEMM_L1x1_LOOP:
 
        KERNEL1x1_1
        KERNEL1x1_2
@@ -2111,9 +2262,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x1_2
 
        addic.          L,      L,      -1
-       bgt             .LSGEMM_L1x1_LOOP
+       bgt             SGEMM_L1x1_LOOP
 
-.LSGEMM_L1x1_LOOP_END:
+SGEMM_L1x1_LOOP_END:
 
        KERNEL1x1_1
        KERNEL1x1_2
@@ -2125,9 +2276,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x1_1
        KERNEL1x1_E2
 
-       b               .LSGEMM_L1x1_SUB1
+       b               SGEMM_L1x1_SUB1
 
-.LSGEMM_L1x1_SUB4:
+SGEMM_L1x1_SUB4:
 
        KERNEL1x1_SUBI1
        KERNEL1x1_SUB1
@@ -2139,34 +2290,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x1_SUB1
        KERNEL1x1_SUB1
 
-       b               .LSGEMM_L1x1_SUB1
+       b               SGEMM_L1x1_SUB1
 
-.LSGEMM_L1x1_SUB0:
+SGEMM_L1x1_SUB0:
 
        andi.           L,      K,      7
 
        KERNEL1x1_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LSGEMM_L1x1_SAVE
-       b               .LSGEMM_L1x1_SUB2
+       ble             SGEMM_L1x1_SAVE
+       b               SGEMM_L1x1_SUB2
 
-.LSGEMM_L1x1_SUB1:
+SGEMM_L1x1_SUB1:
 
        andi.           L,      K,      7
-       ble             .LSGEMM_L1x1_SAVE
+       ble             SGEMM_L1x1_SAVE
 
-.LSGEMM_L1x1_SUB2:
+SGEMM_L1x1_SUB2:
 
        KERNEL1x1_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LSGEMM_L1x1_SUB2
+       bgt             SGEMM_L1x1_SUB2
 
-.LSGEMM_L1x1_SAVE:
+SGEMM_L1x1_SAVE:
 
        SAVE1x1
 
-.LSGEMM_L1x1_END:
+SGEMM_L1x1_END:
 
-.LSGEMM_L1_END:
+SGEMM_L1_END:
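
Note on the tail loops above: the 1xN paths now initialize BO from BBUFFER instead of B, so they read the same pre-expanded copy of B as the wider kernels. A minimal C sketch of the layout this assumes (the buffer is presumably filled by a copy loop earlier in the logic file; the function and names here are illustrative only, not part of the patch):

    /* Each scalar of the B panel is replicated into a full 4-float lane
     * group, so the macros can use plain lxvw4x vector loads instead of
     * the old xxspltw word splats. */
    static void expand_b_panel(const float *b, float *bbuffer, int kn)
    {
        for (int i = 0; i < kn; i++) {
            bbuffer[4 * i + 0] = b[i];
            bbuffer[4 * i + 1] = b[i];
            bbuffer[4 * i + 2] = b[i];
            bbuffer[4 * i + 3] = b[i];
        }
    }
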
index a2d36c0..71dc529 100644 (file)
@@ -26,13 +26,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 /**************************************************************************************
-* 2016/03/18 Werner Saar (wernsaar@googlemail.com)
+* 2016/04/02 Werner Saar (wernsaar@googlemail.com)
 *       BLASTEST               : OK
 *       CTEST                  : OK
 *       TEST                   : OK
-*       LAPACK-TEST            : OK
+*       LAPACK-TEST            : OK
 **************************************************************************************/
 
+
 /**********************************************************************************************
 * Macros for N=8 and M=16
 **********************************************************************************************/
@@ -46,21 +47,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     64
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs8,    vs28,   0
-       xxspltw         vs9,    vs28,   1
-       xxspltw         vs10,   vs28,   2
-       xxspltw         vs11,   vs28,   3
+       lxvw4x          vs8,    o0,     T1
+       lxvw4x          vs9,    o16,    T1
+       lxvw4x          vs10,   o32,    T1
+       lxvw4x          vs11,   o48,    T1
 
-       lxvw4x          vs29,   o16,    BO
+       addi            T1,     T1,     64
 
-       xxspltw         vs12,   vs29,   0
-       xxspltw         vs13,   vs29,   1
-       xxspltw         vs14,   vs29,   2
-       xxspltw         vs15,   vs29,   3
+       lxvw4x          vs12,   o0,     T1
+       lxvw4x          vs13,   o16,    T1
+       lxvw4x          vs14,   o32,    T1
+       lxvw4x          vs15,   o48,    T1
 
-       addi            BO,     BO,     32
+       addi            BO,     BO,     128
 
 .endm
 
@@ -74,21 +75,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     64
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs16,   vs28,   0
-       xxspltw         vs17,   vs28,   1
-       xxspltw         vs18,   vs28,   2
-       xxspltw         vs19,   vs28,   3
+       lxvw4x          vs16,   o0,     T1
+       lxvw4x          vs17,   o16,    T1
+       lxvw4x          vs18,   o32,    T1
+       lxvw4x          vs19,   o48,    T1
 
-       lxvw4x          vs29,   o16,    BO
+       addi            T1,     T1,     64
 
-       xxspltw         vs20,   vs29,   0
-       xxspltw         vs21,   vs29,   1
-       xxspltw         vs22,   vs29,   2
-       xxspltw         vs23,   vs29,   3
+       lxvw4x          vs20,   o0,     T1
+       lxvw4x          vs21,   o16,    T1
+       lxvw4x          vs22,   o32,    T1
+       lxvw4x          vs23,   o48,    T1
 
-       addi            BO,     BO,     32
+       addi            BO,     BO,     128
 
 
        xvmulsp         vs32,   vs0,    vs8
@@ -136,42 +137,51 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 .macro KERNEL8x16_1
 
+
+       lxvw4x          vs4,    o0,     AO
+       lxvw4x          vs5,    o16,    AO
+       lxvw4x          vs6,    o32,    AO
+       lxvw4x          vs7,    o48,    AO
+
+       addi            AO,     AO,     64
+
+       mr              T1,     BO
+
+       lxvw4x          vs16,   o0,     T1
+       lxvw4x          vs17,   o16,    T1
+       lxvw4x          vs18,   o32,    T1
+       lxvw4x          vs19,   o48,    T1
+
+       addi            T1,     T1,     64
+
+       lxvw4x          vs20,   o0,     T1
+       lxvw4x          vs21,   o16,    T1
+       lxvw4x          vs22,   o32,    T1
+       lxvw4x          vs23,   o48,    T1
+
+       addi            BO,     BO,     128
+
+
        xvmaddasp       vs32,   vs0,    vs8
        xvmaddasp       vs33,   vs1,    vs8
-       lxvw4x          vs28,   o0,     BO
-       lxvw4x          vs4,    o0,     AO
        xvmaddasp       vs34,   vs2,    vs8
        xvmaddasp       vs35,   vs3,    vs8
 
        xvmaddasp       vs36,   vs0,    vs9
        xvmaddasp       vs37,   vs1,    vs9
-       lxvw4x          vs29,   o16,    BO
-       lxvw4x          vs5,    o16,    AO
        xvmaddasp       vs38,   vs2,    vs9
        xvmaddasp       vs39,   vs3,    vs9
 
        xvmaddasp       vs40,   vs0,    vs10
        xvmaddasp       vs41,   vs1,    vs10
-       lxvw4x          vs6,    o32,    AO
-       lxvw4x          vs7,    o48,    AO
        xvmaddasp       vs42,   vs2,    vs10
        xvmaddasp       vs43,   vs3,    vs10
 
-       xxspltw         vs16,   vs28,   0
-       xxspltw         vs17,   vs28,   1
-       xxspltw         vs18,   vs28,   2
-       xxspltw         vs19,   vs28,   3
-
        xvmaddasp       vs44,   vs0,    vs11
        xvmaddasp       vs45,   vs1,    vs11
        xvmaddasp       vs46,   vs2,    vs11
        xvmaddasp       vs47,   vs3,    vs11
 
-       xxspltw         vs20,   vs29,   0
-       xxspltw         vs21,   vs29,   1
-       xxspltw         vs22,   vs29,   2
-       xxspltw         vs23,   vs29,   3
-
        xvmaddasp       vs48,   vs0,    vs12
        xvmaddasp       vs49,   vs1,    vs12
        xvmaddasp       vs50,   vs2,    vs12
@@ -184,8 +194,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        xvmaddasp       vs56,   vs0,    vs14
        xvmaddasp       vs57,   vs1,    vs14
-       addi            AO,     AO,     64
-       addi            BO,     BO,     32
        xvmaddasp       vs58,   vs2,    vs14
        xvmaddasp       vs59,   vs3,    vs14
 
@@ -199,47 +207,51 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 .macro KERNEL8x16_2
 
-       xvmaddasp       vs32,   vs4,    vs16
-       xvmaddasp       vs33,   vs5,    vs16
 
-       lxvw4x          vs28,   o0,     BO
        lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+       lxvw4x          vs2,    o32,    AO
+       lxvw4x          vs3,    o48,    AO
+
+       addi            AO,     AO,     64
+
+       mr              T1,     BO
+
+       lxvw4x          vs8,    o0,     T1
+       lxvw4x          vs9,    o16,    T1
+       lxvw4x          vs10,   o32,    T1
+       lxvw4x          vs11,   o48,    T1
+
+       addi            T1,     T1,     64
+
+       lxvw4x          vs12,   o0,     T1
+       lxvw4x          vs13,   o16,    T1
+       lxvw4x          vs14,   o32,    T1
+       lxvw4x          vs15,   o48,    T1
 
+       addi            BO,     BO,     128
+
+
+       xvmaddasp       vs32,   vs4,    vs16
+       xvmaddasp       vs33,   vs5,    vs16
        xvmaddasp       vs34,   vs6,    vs16
        xvmaddasp       vs35,   vs7,    vs16
 
        xvmaddasp       vs36,   vs4,    vs17
        xvmaddasp       vs37,   vs5,    vs17
-
-       lxvw4x          vs29,   o16,    BO
-       lxvw4x          vs1,    o16,    AO
-
        xvmaddasp       vs38,   vs6,    vs17
        xvmaddasp       vs39,   vs7,    vs17
 
-       lxvw4x          vs2,    o32,    AO
-       lxvw4x          vs3,    o48,    AO
-
        xvmaddasp       vs40,   vs4,    vs18
        xvmaddasp       vs41,   vs5,    vs18
        xvmaddasp       vs42,   vs6,    vs18
        xvmaddasp       vs43,   vs7,    vs18
 
-       xxspltw         vs8,    vs28,   0
-       xxspltw         vs9,    vs28,   1
-       xxspltw         vs10,   vs28,   2
-       xxspltw         vs11,   vs28,   3
-
        xvmaddasp       vs44,   vs4,    vs19
        xvmaddasp       vs45,   vs5,    vs19
        xvmaddasp       vs46,   vs6,    vs19
        xvmaddasp       vs47,   vs7,    vs19
 
-       xxspltw         vs12,   vs29,   0
-       xxspltw         vs13,   vs29,   1
-       xxspltw         vs14,   vs29,   2
-       xxspltw         vs15,   vs29,   3
-
        xvmaddasp       vs48,   vs4,    vs20
        xvmaddasp       vs49,   vs5,    vs20
        xvmaddasp       vs50,   vs6,    vs20
@@ -257,8 +269,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        xvmaddasp       vs60,   vs4,    vs23
        xvmaddasp       vs61,   vs5,    vs23
-       addi            AO,     AO,     64
-       addi            BO,     BO,     32
        xvmaddasp       vs62,   vs6,    vs23
        xvmaddasp       vs63,   vs7,    vs23
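
For orientation: KERNEL8x16_1/KERNEL8x16_2 above keep their ping-pong structure (one macro loads vs4-vs7 and vs16-vs23 while finishing the multiplies on the set its partner loaded, and vice versa), but the loads are now grouped at the top of each macro and B comes from the expanded buffer. A rough C sketch of the load-ahead idea, reduced to a single accumulator (illustrative only, not literal kernel code):

    static float dot_double_buffered(const float *a, const float *b, int k)
    {
        if (k == 0) return 0.0f;
        float acc = 0.0f;
        float a_cur = a[0], b_cur = b[0];        /* preload (LOAD8x16_1)    */
        for (int i = 1; i < k; i++) {
            float a_next = a[i], b_next = b[i];  /* load the next operands  */
            acc += a_cur * b_cur;                /* FMA on the loaded set   */
            a_cur = a_next; b_cur = b_next;      /* swap buffers            */
        }
        return acc + a_cur * b_cur;              /* drain (KERNEL8x16_E2)   */
    }
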
 
@@ -321,21 +331,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     64
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs8,    vs28,   0
-       xxspltw         vs9,    vs28,   1
-       xxspltw         vs10,   vs28,   2
-       xxspltw         vs11,   vs28,   3
+       lxvw4x          vs8,    o0,     T1
+       lxvw4x          vs9,    o16,    T1
+       lxvw4x          vs10,   o32,    T1
+       lxvw4x          vs11,   o48,    T1
 
-       lxvw4x          vs29,   o16,    BO
+       addi            T1,     T1,     64
 
-       xxspltw         vs12,   vs29,   0
-       xxspltw         vs13,   vs29,   1
-       xxspltw         vs14,   vs29,   2
-       xxspltw         vs15,   vs29,   3
+       lxvw4x          vs12,   o0,     T1
+       lxvw4x          vs13,   o16,    T1
+       lxvw4x          vs14,   o32,    T1
+       lxvw4x          vs15,   o48,    T1
 
-       addi            BO,     BO,     32
+       addi            BO,     BO,     128
 
 
        xvmulsp         vs32,   vs0,    vs8
@@ -391,21 +401,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     64
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs8,    vs28,   0
-       xxspltw         vs9,    vs28,   1
-       xxspltw         vs10,   vs28,   2
-       xxspltw         vs11,   vs28,   3
+       lxvw4x          vs8,    o0,     T1
+       lxvw4x          vs9,    o16,    T1
+       lxvw4x          vs10,   o32,    T1
+       lxvw4x          vs11,   o48,    T1
 
-       lxvw4x          vs29,   o16,    BO
+       addi            T1,     T1,     64
 
-       xxspltw         vs12,   vs29,   0
-       xxspltw         vs13,   vs29,   1
-       xxspltw         vs14,   vs29,   2
-       xxspltw         vs15,   vs29,   3
+       lxvw4x          vs12,   o0,     T1
+       lxvw4x          vs13,   o16,    T1
+       lxvw4x          vs14,   o32,    T1
+       lxvw4x          vs15,   o48,    T1
 
-       addi            BO,     BO,     32
+       addi            BO,     BO,     128
 
 
        xvmaddasp       vs32,   vs0,    vs8
@@ -464,106 +474,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-
-       stxvw4x         vs32,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
 #ifdef TRMMKERNEL
-       lxvw4x          vs0,    o0,     TBUFFER
+       xvmulsp         vs0,    vs32,   alpha_vr
+       xvmulsp         vs1,    vs33,   alpha_vr
+       xvmulsp         vs2,    vs34,   alpha_vr
+       xvmulsp         vs3,    vs35,   alpha_vr
 #else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs0,    vs0,    vs28
+       xvmaddasp       vs0,    vs32,   alpha_vr
+       xvmaddasp       vs1,    vs33,   alpha_vr
+       xvmaddasp       vs2,    vs34,   alpha_vr
+       xvmaddasp       vs3,    vs35,   alpha_vr
 #endif
 
-       stxvw4x         vs33,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
-#ifdef TRMMKERNEL
-       lxvw4x          vs1,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs1,    vs1,    vs28
-#endif
-
-       stxvw4x         vs34,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
-#ifdef TRMMKERNEL
-       lxvw4x          vs2,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs2,    vs2,    vs28
-#endif
-
-       stxvw4x         vs35,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
-#ifdef TRMMKERNEL
-       lxvw4x          vs3,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs3,    vs3,    vs28
-#endif
-
-
-
-
        stxvw4x         vs0,    o0,     T1
        stxvw4x         vs1,    o16,    T1
        stxvw4x         vs2,    o32,    T1
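
The save path above now applies alpha with full-vector xvmulsp/xvmaddasp against alpha_vr instead of spilling each accumulator to TBUFFER and rescaling it scalar by scalar. Per 4-float accumulator it computes, roughly (illustrative C, names are ad hoc):

    static void save4(float *c, const float acc[4], float alpha, int trmm)
    {
        for (int i = 0; i < 4; i++)
            c[i] = trmm ? alpha * acc[i]          /* TRMM: xvmulsp            */
                        : c[i] + alpha * acc[i];  /* GEMM: xvmaddasp into C   */
    }
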
@@ -581,106 +503,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-
-       stxvw4x         vs36,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
 #ifdef TRMMKERNEL
-       lxvw4x          vs0,    o0,     TBUFFER
+       xvmulsp         vs0,    vs36,   alpha_vr
+       xvmulsp         vs1,    vs37,   alpha_vr
+       xvmulsp         vs2,    vs38,   alpha_vr
+       xvmulsp         vs3,    vs39,   alpha_vr
 #else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs0,    vs0,    vs28
+       xvmaddasp       vs0,    vs36,   alpha_vr
+       xvmaddasp       vs1,    vs37,   alpha_vr
+       xvmaddasp       vs2,    vs38,   alpha_vr
+       xvmaddasp       vs3,    vs39,   alpha_vr
 #endif
 
-       stxvw4x         vs37,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
-#ifdef TRMMKERNEL
-       lxvw4x          vs1,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs1,    vs1,    vs28
-#endif
-
-       stxvw4x         vs38,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
-#ifdef TRMMKERNEL
-       lxvw4x          vs2,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs2,    vs2,    vs28
-#endif
-
-       stxvw4x         vs39,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
-#ifdef TRMMKERNEL
-       lxvw4x          vs3,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs3,    vs3,    vs28
-#endif
-
-
-
-
        stxvw4x         vs0,    o0,     T1
        stxvw4x         vs1,    o16,    T1
        stxvw4x         vs2,    o32,    T1
@@ -698,106 +532,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-
-       stxvw4x         vs40,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
-#ifdef TRMMKERNEL
-       lxvw4x          vs0,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs0,    vs0,    vs28
-#endif
-
-       stxvw4x         vs41,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
-#ifdef TRMMKERNEL
-       lxvw4x          vs1,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs1,    vs1,    vs28
-#endif
-
-       stxvw4x         vs42,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
-#ifdef TRMMKERNEL
-       lxvw4x          vs2,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs2,    vs2,    vs28
-#endif
-
-       stxvw4x         vs43,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
 #ifdef TRMMKERNEL
-       lxvw4x          vs3,    o0,     TBUFFER
+       xvmulsp         vs0,    vs40,   alpha_vr
+       xvmulsp         vs1,    vs41,   alpha_vr
+       xvmulsp         vs2,    vs42,   alpha_vr
+       xvmulsp         vs3,    vs43,   alpha_vr
 #else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs3,    vs3,    vs28
+       xvmaddasp       vs0,    vs40,   alpha_vr
+       xvmaddasp       vs1,    vs41,   alpha_vr
+       xvmaddasp       vs2,    vs42,   alpha_vr
+       xvmaddasp       vs3,    vs43,   alpha_vr
 #endif
 
-
-
-
        stxvw4x         vs0,    o0,     T1
        stxvw4x         vs1,    o16,    T1
        stxvw4x         vs2,    o32,    T1
@@ -815,106 +561,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-
-       stxvw4x         vs44,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
-#ifdef TRMMKERNEL
-       lxvw4x          vs0,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs0,    vs0,    vs28
-#endif
-
-       stxvw4x         vs45,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
-#ifdef TRMMKERNEL
-       lxvw4x          vs1,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs1,    vs1,    vs28
-#endif
-
-       stxvw4x         vs46,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
-#ifdef TRMMKERNEL
-       lxvw4x          vs2,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs2,    vs2,    vs28
-#endif
-
-       stxvw4x         vs47,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
 #ifdef TRMMKERNEL
-       lxvw4x          vs3,    o0,     TBUFFER
+       xvmulsp         vs0,    vs44,   alpha_vr
+       xvmulsp         vs1,    vs45,   alpha_vr
+       xvmulsp         vs2,    vs46,   alpha_vr
+       xvmulsp         vs3,    vs47,   alpha_vr
 #else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs3,    vs3,    vs28
+       xvmaddasp       vs0,    vs44,   alpha_vr
+       xvmaddasp       vs1,    vs45,   alpha_vr
+       xvmaddasp       vs2,    vs46,   alpha_vr
+       xvmaddasp       vs3,    vs47,   alpha_vr
 #endif
 
-
-
-
        stxvw4x         vs0,    o0,     T1
        stxvw4x         vs1,    o16,    T1
        stxvw4x         vs2,    o32,    T1
@@ -932,106 +590,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-
-       stxvw4x         vs48,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
 #ifdef TRMMKERNEL
-       lxvw4x          vs0,    o0,     TBUFFER
+       xvmulsp         vs0,    vs48,   alpha_vr
+       xvmulsp         vs1,    vs49,   alpha_vr
+       xvmulsp         vs2,    vs50,   alpha_vr
+       xvmulsp         vs3,    vs51,   alpha_vr
 #else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs0,    vs0,    vs28
+       xvmaddasp       vs0,    vs48,   alpha_vr
+       xvmaddasp       vs1,    vs49,   alpha_vr
+       xvmaddasp       vs2,    vs50,   alpha_vr
+       xvmaddasp       vs3,    vs51,   alpha_vr
 #endif
 
-       stxvw4x         vs49,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
-#ifdef TRMMKERNEL
-       lxvw4x          vs1,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs1,    vs1,    vs28
-#endif
-
-       stxvw4x         vs50,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
-#ifdef TRMMKERNEL
-       lxvw4x          vs2,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs2,    vs2,    vs28
-#endif
-
-       stxvw4x         vs51,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
-#ifdef TRMMKERNEL
-       lxvw4x          vs3,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs3,    vs3,    vs28
-#endif
-
-
-
-
        stxvw4x         vs0,    o0,     T1
        stxvw4x         vs1,    o16,    T1
        stxvw4x         vs2,    o32,    T1
@@ -1049,106 +619,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-
-       stxvw4x         vs52,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
 #ifdef TRMMKERNEL
-       lxvw4x          vs0,    o0,     TBUFFER
+       xvmulsp         vs0,    vs52,   alpha_vr
+       xvmulsp         vs1,    vs53,   alpha_vr
+       xvmulsp         vs2,    vs54,   alpha_vr
+       xvmulsp         vs3,    vs55,   alpha_vr
 #else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs0,    vs0,    vs28
+       xvmaddasp       vs0,    vs52,   alpha_vr
+       xvmaddasp       vs1,    vs53,   alpha_vr
+       xvmaddasp       vs2,    vs54,   alpha_vr
+       xvmaddasp       vs3,    vs55,   alpha_vr
 #endif
 
-       stxvw4x         vs53,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
-#ifdef TRMMKERNEL
-       lxvw4x          vs1,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs1,    vs1,    vs28
-#endif
-
-       stxvw4x         vs54,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
-#ifdef TRMMKERNEL
-       lxvw4x          vs2,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs2,    vs2,    vs28
-#endif
-
-       stxvw4x         vs55,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
-#ifdef TRMMKERNEL
-       lxvw4x          vs3,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs3,    vs3,    vs28
-#endif
-
-
-
-
        stxvw4x         vs0,    o0,     T1
        stxvw4x         vs1,    o16,    T1
        stxvw4x         vs2,    o32,    T1
@@ -1166,106 +648,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-
-       stxvw4x         vs56,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
-#ifdef TRMMKERNEL
-       lxvw4x          vs0,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs0,    vs0,    vs28
-#endif
-
-       stxvw4x         vs57,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
-#ifdef TRMMKERNEL
-       lxvw4x          vs1,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs1,    vs1,    vs28
-#endif
-
-       stxvw4x         vs58,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
-#ifdef TRMMKERNEL
-       lxvw4x          vs2,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs2,    vs2,    vs28
-#endif
-
-       stxvw4x         vs59,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
 #ifdef TRMMKERNEL
-       lxvw4x          vs3,    o0,     TBUFFER
+       xvmulsp         vs0,    vs56,   alpha_vr
+       xvmulsp         vs1,    vs57,   alpha_vr
+       xvmulsp         vs2,    vs58,   alpha_vr
+       xvmulsp         vs3,    vs59,   alpha_vr
 #else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs3,    vs3,    vs28
+       xvmaddasp       vs0,    vs56,   alpha_vr
+       xvmaddasp       vs1,    vs57,   alpha_vr
+       xvmaddasp       vs2,    vs58,   alpha_vr
+       xvmaddasp       vs3,    vs59,   alpha_vr
 #endif
 
-
-
-
        stxvw4x         vs0,    o0,     T1
        stxvw4x         vs1,    o16,    T1
        stxvw4x         vs2,    o32,    T1
@@ -1283,106 +677,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-
-       stxvw4x         vs60,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
-#ifdef TRMMKERNEL
-       lxvw4x          vs0,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs0,    vs0,    vs28
-#endif
-
-       stxvw4x         vs61,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
-#ifdef TRMMKERNEL
-       lxvw4x          vs1,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs1,    vs1,    vs28
-#endif
-
-       stxvw4x         vs62,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
-#ifdef TRMMKERNEL
-       lxvw4x          vs2,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs2,    vs2,    vs28
-#endif
-
-       stxvw4x         vs63,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
 #ifdef TRMMKERNEL
-       lxvw4x          vs3,    o0,     TBUFFER
+       xvmulsp         vs0,    vs60,   alpha_vr
+       xvmulsp         vs1,    vs61,   alpha_vr
+       xvmulsp         vs2,    vs62,   alpha_vr
+       xvmulsp         vs3,    vs63,   alpha_vr
 #else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs3,    vs3,    vs28
+       xvmaddasp       vs0,    vs60,   alpha_vr
+       xvmaddasp       vs1,    vs61,   alpha_vr
+       xvmaddasp       vs2,    vs62,   alpha_vr
+       xvmaddasp       vs3,    vs63,   alpha_vr
 #endif
 
-
-
-
        stxvw4x         vs0,    o0,     T1
        stxvw4x         vs1,    o16,    T1
        stxvw4x         vs2,    o32,    T1
@@ -1406,21 +712,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     32
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs8,    vs28,   0
-       xxspltw         vs9,    vs28,   1
-       xxspltw         vs10,   vs28,   2
-       xxspltw         vs11,   vs28,   3
+       lxvw4x          vs8,    o0,     T1
+       lxvw4x          vs9,    o16,    T1
+       lxvw4x          vs10,   o32,    T1
+       lxvw4x          vs11,   o48,    T1
 
-       lxvw4x          vs29,   o16,    BO
+       addi            T1,     T1,     64
 
-       xxspltw         vs12,   vs29,   0
-       xxspltw         vs13,   vs29,   1
-       xxspltw         vs14,   vs29,   2
-       xxspltw         vs15,   vs29,   3
+       lxvw4x          vs12,   o0,     T1
+       lxvw4x          vs13,   o16,    T1
+       lxvw4x          vs14,   o32,    T1
+       lxvw4x          vs15,   o48,    T1
 
-       addi            BO,     BO,     32
+       addi            BO,     BO,     128
 
 .endm
 
@@ -1432,21 +738,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     32
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs16,   vs28,   0
-       xxspltw         vs17,   vs28,   1
-       xxspltw         vs18,   vs28,   2
-       xxspltw         vs19,   vs28,   3
+       lxvw4x          vs16,   o0,     T1
+       lxvw4x          vs17,   o16,    T1
+       lxvw4x          vs18,   o32,    T1
+       lxvw4x          vs19,   o48,    T1
 
-       lxvw4x          vs29,   o16,    BO
+       addi            T1,     T1,     64
 
-       xxspltw         vs20,   vs29,   0
-       xxspltw         vs21,   vs29,   1
-       xxspltw         vs22,   vs29,   2
-       xxspltw         vs23,   vs29,   3
+       lxvw4x          vs20,   o0,     T1
+       lxvw4x          vs21,   o16,    T1
+       lxvw4x          vs22,   o32,    T1
+       lxvw4x          vs23,   o48,    T1
 
-       addi            BO,     BO,     32
+       addi            BO,     BO,     128
 
 
        xvmulsp         vs32,   vs0,    vs8
@@ -1484,21 +790,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     32
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs16,   vs28,   0
-       xxspltw         vs17,   vs28,   1
-       xxspltw         vs18,   vs28,   2
-       xxspltw         vs19,   vs28,   3
+       lxvw4x          vs16,   o0,     T1
+       lxvw4x          vs17,   o16,    T1
+       lxvw4x          vs18,   o32,    T1
+       lxvw4x          vs19,   o48,    T1
 
-       lxvw4x          vs29,   o16,    BO
+       addi            T1,     T1,     64
 
-       xxspltw         vs20,   vs29,   0
-       xxspltw         vs21,   vs29,   1
-       xxspltw         vs22,   vs29,   2
-       xxspltw         vs23,   vs29,   3
+       lxvw4x          vs20,   o0,     T1
+       lxvw4x          vs21,   o16,    T1
+       lxvw4x          vs22,   o32,    T1
+       lxvw4x          vs23,   o48,    T1
 
-       addi            BO,     BO,     32
+       addi            BO,     BO,     128
 
 
        xvmaddasp       vs32,   vs0,    vs8
@@ -1536,21 +842,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     32
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs8,    vs28,   0
-       xxspltw         vs9,    vs28,   1
-       xxspltw         vs10,   vs28,   2
-       xxspltw         vs11,   vs28,   3
+       lxvw4x          vs8,    o0,     T1
+       lxvw4x          vs9,    o16,    T1
+       lxvw4x          vs10,   o32,    T1
+       lxvw4x          vs11,   o48,    T1
 
-       lxvw4x          vs29,   o16,    BO
+       addi            T1,     T1,     64
 
-       xxspltw         vs12,   vs29,   0
-       xxspltw         vs13,   vs29,   1
-       xxspltw         vs14,   vs29,   2
-       xxspltw         vs15,   vs29,   3
+       lxvw4x          vs12,   o0,     T1
+       lxvw4x          vs13,   o16,    T1
+       lxvw4x          vs14,   o32,    T1
+       lxvw4x          vs15,   o48,    T1
 
-       addi            BO,     BO,     32
+       addi            BO,     BO,     128
 
 
        xvmaddasp       vs32,   vs4,    vs16
@@ -1618,21 +924,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     32
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs8,    vs28,   0
-       xxspltw         vs9,    vs28,   1
-       xxspltw         vs10,   vs28,   2
-       xxspltw         vs11,   vs28,   3
+       lxvw4x          vs8,    o0,     T1
+       lxvw4x          vs9,    o16,    T1
+       lxvw4x          vs10,   o32,    T1
+       lxvw4x          vs11,   o48,    T1
 
-       lxvw4x          vs29,   o16,    BO
+       addi            T1,     T1,     64
 
-       xxspltw         vs12,   vs29,   0
-       xxspltw         vs13,   vs29,   1
-       xxspltw         vs14,   vs29,   2
-       xxspltw         vs15,   vs29,   3
+       lxvw4x          vs12,   o0,     T1
+       lxvw4x          vs13,   o16,    T1
+       lxvw4x          vs14,   o32,    T1
+       lxvw4x          vs15,   o48,    T1
 
-       addi            BO,     BO,     32
+       addi            BO,     BO,     128
 
 
        xvmulsp         vs32,   vs0,    vs8
@@ -1670,21 +976,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     32
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs8,    vs28,   0
-       xxspltw         vs9,    vs28,   1
-       xxspltw         vs10,   vs28,   2
-       xxspltw         vs11,   vs28,   3
+       lxvw4x          vs8,    o0,     T1
+       lxvw4x          vs9,    o16,    T1
+       lxvw4x          vs10,   o32,    T1
+       lxvw4x          vs11,   o48,    T1
 
-       lxvw4x          vs29,   o16,    BO
+       addi            T1,     T1,     64
 
-       xxspltw         vs12,   vs29,   0
-       xxspltw         vs13,   vs29,   1
-       xxspltw         vs14,   vs29,   2
-       xxspltw         vs15,   vs29,   3
+       lxvw4x          vs12,   o0,     T1
+       lxvw4x          vs13,   o16,    T1
+       lxvw4x          vs14,   o32,    T1
+       lxvw4x          vs15,   o48,    T1
 
-       addi            BO,     BO,     32
+       addi            BO,     BO,     128
 
 
        xvmaddasp       vs32,   vs0,    vs8
@@ -1725,58 +1031,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-
-       stxvw4x         vs32,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
-#ifdef TRMMKERNEL
-       lxvw4x          vs0,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs0,    vs0,    vs28
-#endif
-
-       stxvw4x         vs33,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
 #ifdef TRMMKERNEL
-       lxvw4x          vs1,    o0,     TBUFFER
+       xvmulsp         vs0,    vs32,   alpha_vr
+       xvmulsp         vs1,    vs33,   alpha_vr
 #else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs1,    vs1,    vs28
+       xvmaddasp       vs0,    vs32,   alpha_vr
+       xvmaddasp       vs1,    vs33,   alpha_vr
 #endif
 
-
-
-
        stxvw4x         vs0,    o0,     T1
        stxvw4x         vs1,    o16,    T1
 
@@ -1790,58 +1052,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-
-       stxvw4x         vs34,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
 #ifdef TRMMKERNEL
-       lxvw4x          vs0,    o0,     TBUFFER
+       xvmulsp         vs0,    vs34,   alpha_vr
+       xvmulsp         vs1,    vs35,   alpha_vr
 #else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs0,    vs0,    vs28
+       xvmaddasp       vs0,    vs34,   alpha_vr
+       xvmaddasp       vs1,    vs35,   alpha_vr
 #endif
 
-       stxvw4x         vs35,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
-#ifdef TRMMKERNEL
-       lxvw4x          vs1,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs1,    vs1,    vs28
-#endif
-
-
-
-
        stxvw4x         vs0,    o0,     T1
        stxvw4x         vs1,    o16,    T1
 
@@ -1855,58 +1073,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-
-       stxvw4x         vs36,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
 #ifdef TRMMKERNEL
-       lxvw4x          vs0,    o0,     TBUFFER
+       xvmulsp         vs0,    vs36,   alpha_vr
+       xvmulsp         vs1,    vs37,   alpha_vr
 #else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs0,    vs0,    vs28
+       xvmaddasp       vs0,    vs36,   alpha_vr
+       xvmaddasp       vs1,    vs37,   alpha_vr
 #endif
 
-       stxvw4x         vs37,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
-#ifdef TRMMKERNEL
-       lxvw4x          vs1,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs1,    vs1,    vs28
-#endif
-
-
-
-
        stxvw4x         vs0,    o0,     T1
        stxvw4x         vs1,    o16,    T1
 
@@ -1920,58 +1094,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-
-       stxvw4x         vs38,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
-#ifdef TRMMKERNEL
-       lxvw4x          vs0,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs0,    vs0,    vs28
-#endif
-
-       stxvw4x         vs39,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
 #ifdef TRMMKERNEL
-       lxvw4x          vs1,    o0,     TBUFFER
+       xvmulsp         vs0,    vs38,   alpha_vr
+       xvmulsp         vs1,    vs39,   alpha_vr
 #else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs1,    vs1,    vs28
+       xvmaddasp       vs0,    vs38,   alpha_vr
+       xvmaddasp       vs1,    vs39,   alpha_vr
 #endif
 
-
-
-
        stxvw4x         vs0,    o0,     T1
        stxvw4x         vs1,    o16,    T1
 
@@ -1985,58 +1115,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-
-       stxvw4x         vs40,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
-#ifdef TRMMKERNEL
-       lxvw4x          vs0,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs0,    vs0,    vs28
-#endif
-
-       stxvw4x         vs41,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
 #ifdef TRMMKERNEL
-       lxvw4x          vs1,    o0,     TBUFFER
+       xvmulsp         vs0,    vs40,   alpha_vr
+       xvmulsp         vs1,    vs41,   alpha_vr
 #else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs1,    vs1,    vs28
+       xvmaddasp       vs0,    vs40,   alpha_vr
+       xvmaddasp       vs1,    vs41,   alpha_vr
 #endif
 
-
-
-
        stxvw4x         vs0,    o0,     T1
        stxvw4x         vs1,    o16,    T1
 
@@ -2050,58 +1136,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-
-       stxvw4x         vs42,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
-#ifdef TRMMKERNEL
-       lxvw4x          vs0,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs0,    vs0,    vs28
-#endif
-
-       stxvw4x         vs43,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
 #ifdef TRMMKERNEL
-       lxvw4x          vs1,    o0,     TBUFFER
+       xvmulsp         vs0,    vs42,   alpha_vr
+       xvmulsp         vs1,    vs43,   alpha_vr
 #else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs1,    vs1,    vs28
+       xvmaddasp       vs0,    vs42,   alpha_vr
+       xvmaddasp       vs1,    vs43,   alpha_vr
 #endif
 
-
-
-
        stxvw4x         vs0,    o0,     T1
        stxvw4x         vs1,    o16,    T1
 
@@ -2115,58 +1157,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-
-       stxvw4x         vs44,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
-#ifdef TRMMKERNEL
-       lxvw4x          vs0,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs0,    vs0,    vs28
-#endif
-
-       stxvw4x         vs45,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
 #ifdef TRMMKERNEL
-       lxvw4x          vs1,    o0,     TBUFFER
+       xvmulsp         vs0,    vs44,   alpha_vr
+       xvmulsp         vs1,    vs45,   alpha_vr
 #else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs1,    vs1,    vs28
+       xvmaddasp       vs0,    vs44,   alpha_vr
+       xvmaddasp       vs1,    vs45,   alpha_vr
 #endif
 
-
-
-
        stxvw4x         vs0,    o0,     T1
        stxvw4x         vs1,    o16,    T1
 
@@ -2180,58 +1178,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-
-       stxvw4x         vs46,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
 #ifdef TRMMKERNEL
-       lxvw4x          vs0,    o0,     TBUFFER
+       xvmulsp         vs0,    vs46,   alpha_vr
+       xvmulsp         vs1,    vs47,   alpha_vr
 #else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs0,    vs0,    vs28
+       xvmaddasp       vs0,    vs46,   alpha_vr
+       xvmaddasp       vs1,    vs47,   alpha_vr
 #endif
 
-       stxvw4x         vs47,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
-#ifdef TRMMKERNEL
-       lxvw4x          vs1,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs1,    vs1,    vs28
-#endif
-
-
-
-
        stxvw4x         vs0,    o0,     T1
        stxvw4x         vs1,    o16,    T1
 
@@ -2252,21 +1206,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     16
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs8,    vs28,   0
-       xxspltw         vs9,    vs28,   1
-       xxspltw         vs10,   vs28,   2
-       xxspltw         vs11,   vs28,   3
+       lxvw4x          vs8,    o0,     T1
+       lxvw4x          vs9,    o16,    T1
+       lxvw4x          vs10,   o32,    T1
+       lxvw4x          vs11,   o48,    T1
 
-       lxvw4x          vs29,   o16,    BO
+       addi            T1,     T1,     64
 
-       xxspltw         vs12,   vs29,   0
-       xxspltw         vs13,   vs29,   1
-       xxspltw         vs14,   vs29,   2
-       xxspltw         vs15,   vs29,   3
+       lxvw4x          vs12,   o0,     T1
+       lxvw4x          vs13,   o16,    T1
+       lxvw4x          vs14,   o32,    T1
+       lxvw4x          vs15,   o48,    T1
 
-       addi            BO,     BO,     32
+       addi            BO,     BO,     128
 
 .endm
 
@@ -2277,21 +1231,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     16
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs16,   vs28,   0
-       xxspltw         vs17,   vs28,   1
-       xxspltw         vs18,   vs28,   2
-       xxspltw         vs19,   vs28,   3
+       lxvw4x          vs16,   o0,     T1
+       lxvw4x          vs17,   o16,    T1
+       lxvw4x          vs18,   o32,    T1
+       lxvw4x          vs19,   o48,    T1
 
-       lxvw4x          vs29,   o16,    BO
+       addi            T1,     T1,     64
 
-       xxspltw         vs20,   vs29,   0
-       xxspltw         vs21,   vs29,   1
-       xxspltw         vs22,   vs29,   2
-       xxspltw         vs23,   vs29,   3
+       lxvw4x          vs20,   o0,     T1
+       lxvw4x          vs21,   o16,    T1
+       lxvw4x          vs22,   o32,    T1
+       lxvw4x          vs23,   o48,    T1
 
-       addi            BO,     BO,     32
+       addi            BO,     BO,     128
 
 
        xvmulsp         vs32,   vs0,    vs8
@@ -2320,21 +1274,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     16
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs16,   vs28,   0
-       xxspltw         vs17,   vs28,   1
-       xxspltw         vs18,   vs28,   2
-       xxspltw         vs19,   vs28,   3
+       lxvw4x          vs16,   o0,     T1
+       lxvw4x          vs17,   o16,    T1
+       lxvw4x          vs18,   o32,    T1
+       lxvw4x          vs19,   o48,    T1
 
-       lxvw4x          vs29,   o16,    BO
+       addi            T1,     T1,     64
 
-       xxspltw         vs20,   vs29,   0
-       xxspltw         vs21,   vs29,   1
-       xxspltw         vs22,   vs29,   2
-       xxspltw         vs23,   vs29,   3
+       lxvw4x          vs20,   o0,     T1
+       lxvw4x          vs21,   o16,    T1
+       lxvw4x          vs22,   o32,    T1
+       lxvw4x          vs23,   o48,    T1
 
-       addi            BO,     BO,     32
+       addi            BO,     BO,     128
 
 
        xvmaddasp       vs32,   vs0,    vs8
@@ -2363,21 +1317,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     16
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs8,    vs28,   0
-       xxspltw         vs9,    vs28,   1
-       xxspltw         vs10,   vs28,   2
-       xxspltw         vs11,   vs28,   3
+       lxvw4x          vs8,    o0,     T1
+       lxvw4x          vs9,    o16,    T1
+       lxvw4x          vs10,   o32,    T1
+       lxvw4x          vs11,   o48,    T1
 
-       lxvw4x          vs29,   o16,    BO
+       addi            T1,     T1,     64
 
-       xxspltw         vs12,   vs29,   0
-       xxspltw         vs13,   vs29,   1
-       xxspltw         vs14,   vs29,   2
-       xxspltw         vs15,   vs29,   3
+       lxvw4x          vs12,   o0,     T1
+       lxvw4x          vs13,   o16,    T1
+       lxvw4x          vs14,   o32,    T1
+       lxvw4x          vs15,   o48,    T1
 
-       addi            BO,     BO,     32
+       addi            BO,     BO,     128
 
 
        xvmaddasp       vs32,   vs4,    vs16
@@ -2428,21 +1382,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     16
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs8,    vs28,   0
-       xxspltw         vs9,    vs28,   1
-       xxspltw         vs10,   vs28,   2
-       xxspltw         vs11,   vs28,   3
+       lxvw4x          vs8,    o0,     T1
+       lxvw4x          vs9,    o16,    T1
+       lxvw4x          vs10,   o32,    T1
+       lxvw4x          vs11,   o48,    T1
 
-       lxvw4x          vs29,   o16,    BO
+       addi            T1,     T1,     64
 
-       xxspltw         vs12,   vs29,   0
-       xxspltw         vs13,   vs29,   1
-       xxspltw         vs14,   vs29,   2
-       xxspltw         vs15,   vs29,   3
+       lxvw4x          vs12,   o0,     T1
+       lxvw4x          vs13,   o16,    T1
+       lxvw4x          vs14,   o32,    T1
+       lxvw4x          vs15,   o48,    T1
 
-       addi            BO,     BO,     32
+       addi            BO,     BO,     128
 
 
        xvmulsp         vs32,   vs0,    vs8
@@ -2471,21 +1425,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     16
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs8,    vs28,   0
-       xxspltw         vs9,    vs28,   1
-       xxspltw         vs10,   vs28,   2
-       xxspltw         vs11,   vs28,   3
+       lxvw4x          vs8,    o0,     T1
+       lxvw4x          vs9,    o16,    T1
+       lxvw4x          vs10,   o32,    T1
+       lxvw4x          vs11,   o48,    T1
 
-       lxvw4x          vs29,   o16,    BO
+       addi            T1,     T1,     64
 
-       xxspltw         vs12,   vs29,   0
-       xxspltw         vs13,   vs29,   1
-       xxspltw         vs14,   vs29,   2
-       xxspltw         vs15,   vs29,   3
+       lxvw4x          vs12,   o0,     T1
+       lxvw4x          vs13,   o16,    T1
+       lxvw4x          vs14,   o32,    T1
+       lxvw4x          vs15,   o48,    T1
 
-       addi            BO,     BO,     32
+       addi            BO,     BO,     128
 
 
        xvmaddasp       vs32,   vs0,    vs8
@@ -2517,34 +1471,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-
-       stxvw4x         vs32,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
 #ifdef TRMMKERNEL
-       lxvw4x          vs0,    o0,     TBUFFER
+       xvmulsp         vs0,    vs32,   alpha_vr
 #else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs0,    vs0,    vs28
+       xvmaddasp       vs0,    vs32,   alpha_vr
 #endif
 
-
-
-
        stxvw4x         vs0,    o0,     T1
 
        add             T1,     T1,     LDC
@@ -2556,34 +1488,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-
-       stxvw4x         vs33,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
 #ifdef TRMMKERNEL
-       lxvw4x          vs0,    o0,     TBUFFER
+       xvmulsp         vs0,    vs33,   alpha_vr
 #else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs0,    vs0,    vs28
+       xvmaddasp       vs0,    vs33,   alpha_vr
 #endif
 
-
-
-
        stxvw4x         vs0,    o0,     T1
 
        add             T1,     T1,     LDC
@@ -2595,34 +1505,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-
-       stxvw4x         vs34,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
 #ifdef TRMMKERNEL
-       lxvw4x          vs0,    o0,     TBUFFER
+       xvmulsp         vs0,    vs34,   alpha_vr
 #else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs0,    vs0,    vs28
+       xvmaddasp       vs0,    vs34,   alpha_vr
 #endif
 
-
-
-
        stxvw4x         vs0,    o0,     T1
 
        add             T1,     T1,     LDC
@@ -2634,34 +1522,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-
-       stxvw4x         vs35,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
 #ifdef TRMMKERNEL
-       lxvw4x          vs0,    o0,     TBUFFER
+       xvmulsp         vs0,    vs35,   alpha_vr
 #else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs0,    vs0,    vs28
+       xvmaddasp       vs0,    vs35,   alpha_vr
 #endif
 
-
-
-
        stxvw4x         vs0,    o0,     T1
 
        add             T1,     T1,     LDC
@@ -2673,33 +1539,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-
-       stxvw4x         vs36,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
 #ifdef TRMMKERNEL
-       lxvw4x          vs0,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs0,    vs0,    vs28
-#endif
-
-
-
+       xvmulsp         vs0,    vs36,   alpha_vr
+#else
+       xvmaddasp       vs0,    vs36,   alpha_vr
+#endif
 
        stxvw4x         vs0,    o0,     T1
 
@@ -2712,34 +1556,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-
-       stxvw4x         vs37,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
 #ifdef TRMMKERNEL
-       lxvw4x          vs0,    o0,     TBUFFER
+       xvmulsp         vs0,    vs37,   alpha_vr
 #else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs0,    vs0,    vs28
+       xvmaddasp       vs0,    vs37,   alpha_vr
 #endif
 
-
-
-
        stxvw4x         vs0,    o0,     T1
 
        add             T1,     T1,     LDC
@@ -2751,34 +1573,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-
-       stxvw4x         vs38,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
 #ifdef TRMMKERNEL
-       lxvw4x          vs0,    o0,     TBUFFER
+       xvmulsp         vs0,    vs38,   alpha_vr
 #else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs0,    vs0,    vs28
+       xvmaddasp       vs0,    vs38,   alpha_vr
 #endif
 
-
-
-
        stxvw4x         vs0,    o0,     T1
 
        add             T1,     T1,     LDC
@@ -2790,34 +1590,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-
-       stxvw4x         vs39,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
 #ifdef TRMMKERNEL
-       lxvw4x          vs0,    o0,     TBUFFER
+       xvmulsp         vs0,    vs39,   alpha_vr
 #else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs0,    vs0,    vs28
+       xvmaddasp       vs0,    vs39,   alpha_vr
 #endif
 
-
-
-
        stxvw4x         vs0,    o0,     T1
 
        add             T1,     T1,     LDC
@@ -2841,18 +1619,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              T1,     BO
 
        lxsspx          vs8,    o0,     T1
-       lxsspx          vs9,    o4,     T1
-       lxsspx          vs10,   o8,     T1
-       lxsspx          vs11,   o12,    T1
+       lxsspx          vs9,    o16,    T1
+       lxsspx          vs10,   o32,    T1
+       lxsspx          vs11,   o48,    T1
 
-       addi            T1,     T1,     16
+       addi            T1,     T1,     64
 
        lxsspx          vs12,   o0,     T1
-       lxsspx          vs13,   o4,     T1
-       lxsspx          vs14,   o8,     T1
-       lxsspx          vs15,   o12,    T1
+       lxsspx          vs13,   o16,    T1
+       lxsspx          vs14,   o32,    T1
+       lxsspx          vs15,   o48,    T1
 
-       addi            BO,     BO,     32
+
+       addi            BO,     BO,     128
 
 .endm
 
@@ -2867,43 +1646,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              T1,     BO
 
        lxsspx          vs16,   o0,     T1
-       lxsspx          vs17,   o4,     T1
-       lxsspx          vs18,   o8,     T1
-       lxsspx          vs19,   o12,    T1
+       lxsspx          vs17,   o16,    T1
+       lxsspx          vs18,   o32,    T1
+       lxsspx          vs19,   o48,    T1
 
-       addi            T1,     T1,     16
+       addi            T1,     T1,     64
 
        lxsspx          vs20,   o0,     T1
-       lxsspx          vs21,   o4,     T1
-       lxsspx          vs22,   o8,     T1
-       lxsspx          vs23,   o12,    T1
+       lxsspx          vs21,   o16,    T1
+       lxsspx          vs22,   o32,    T1
+       lxsspx          vs23,   o48,    T1
 
-       addi            BO,     BO,     32
+
+       addi            BO,     BO,     128
 
 
-       xsmulsp         vs32,   vs0,    vs8
-       xsmulsp         vs33,   vs1,    vs8
+       xsmuldp         vs32,   vs0,    vs8
+       xsmuldp         vs33,   vs1,    vs8
 
-       xsmulsp         vs34,   vs0,    vs9
-       xsmulsp         vs35,   vs1,    vs9
+       xsmuldp         vs34,   vs0,    vs9
+       xsmuldp         vs35,   vs1,    vs9
 
-       xsmulsp         vs36,   vs0,    vs10
-       xsmulsp         vs37,   vs1,    vs10
+       xsmuldp         vs36,   vs0,    vs10
+       xsmuldp         vs37,   vs1,    vs10
 
-       xsmulsp         vs38,   vs0,    vs11
-       xsmulsp         vs39,   vs1,    vs11
+       xsmuldp         vs38,   vs0,    vs11
+       xsmuldp         vs39,   vs1,    vs11
 
-       xsmulsp         vs40,   vs0,    vs12
-       xsmulsp         vs41,   vs1,    vs12
+       xsmuldp         vs40,   vs0,    vs12
+       xsmuldp         vs41,   vs1,    vs12
 
-       xsmulsp         vs42,   vs0,    vs13
-       xsmulsp         vs43,   vs1,    vs13
+       xsmuldp         vs42,   vs0,    vs13
+       xsmuldp         vs43,   vs1,    vs13
 
-       xsmulsp         vs44,   vs0,    vs14
-       xsmulsp         vs45,   vs1,    vs14
+       xsmuldp         vs44,   vs0,    vs14
+       xsmuldp         vs45,   vs1,    vs14
 
-       xsmulsp         vs46,   vs0,    vs15
-       xsmulsp         vs47,   vs1,    vs15
+       xsmuldp         vs46,   vs0,    vs15
+       xsmuldp         vs47,   vs1,    vs15
 
 
 .endm
@@ -2919,43 +1699,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              T1,     BO
 
        lxsspx          vs16,   o0,     T1
-       lxsspx          vs17,   o4,     T1
-       lxsspx          vs18,   o8,     T1
-       lxsspx          vs19,   o12,    T1
+       lxsspx          vs17,   o16,    T1
+       lxsspx          vs18,   o32,    T1
+       lxsspx          vs19,   o48,    T1
 
-       addi            T1,     T1,     16
+       addi            T1,     T1,     64
 
        lxsspx          vs20,   o0,     T1
-       lxsspx          vs21,   o4,     T1
-       lxsspx          vs22,   o8,     T1
-       lxsspx          vs23,   o12,    T1
+       lxsspx          vs21,   o16,    T1
+       lxsspx          vs22,   o32,    T1
+       lxsspx          vs23,   o48,    T1
 
-       addi            BO,     BO,     32
+
+       addi            BO,     BO,     128
 
 
-       xsmaddasp       vs32,   vs0,    vs8
-       xsmaddasp       vs33,   vs1,    vs8
+       xsmaddadp       vs32,   vs0,    vs8
+       xsmaddadp       vs33,   vs1,    vs8
 
-       xsmaddasp       vs34,   vs0,    vs9
-       xsmaddasp       vs35,   vs1,    vs9
+       xsmaddadp       vs34,   vs0,    vs9
+       xsmaddadp       vs35,   vs1,    vs9
 
-       xsmaddasp       vs36,   vs0,    vs10
-       xsmaddasp       vs37,   vs1,    vs10
+       xsmaddadp       vs36,   vs0,    vs10
+       xsmaddadp       vs37,   vs1,    vs10
 
-       xsmaddasp       vs38,   vs0,    vs11
-       xsmaddasp       vs39,   vs1,    vs11
+       xsmaddadp       vs38,   vs0,    vs11
+       xsmaddadp       vs39,   vs1,    vs11
 
-       xsmaddasp       vs40,   vs0,    vs12
-       xsmaddasp       vs41,   vs1,    vs12
+       xsmaddadp       vs40,   vs0,    vs12
+       xsmaddadp       vs41,   vs1,    vs12
 
-       xsmaddasp       vs42,   vs0,    vs13
-       xsmaddasp       vs43,   vs1,    vs13
+       xsmaddadp       vs42,   vs0,    vs13
+       xsmaddadp       vs43,   vs1,    vs13
 
-       xsmaddasp       vs44,   vs0,    vs14
-       xsmaddasp       vs45,   vs1,    vs14
+       xsmaddadp       vs44,   vs0,    vs14
+       xsmaddadp       vs45,   vs1,    vs14
 
-       xsmaddasp       vs46,   vs0,    vs15
-       xsmaddasp       vs47,   vs1,    vs15
+       xsmaddadp       vs46,   vs0,    vs15
+       xsmaddadp       vs47,   vs1,    vs15
 
 
 .endm
@@ -2971,43 +1752,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              T1,     BO
 
        lxsspx          vs8,    o0,     T1
-       lxsspx          vs9,    o4,     T1
-       lxsspx          vs10,   o8,     T1
-       lxsspx          vs11,   o12,    T1
+       lxsspx          vs9,    o16,    T1
+       lxsspx          vs10,   o32,    T1
+       lxsspx          vs11,   o48,    T1
 
-       addi            T1,     T1,     16
+       addi            T1,     T1,     64
 
        lxsspx          vs12,   o0,     T1
-       lxsspx          vs13,   o4,     T1
-       lxsspx          vs14,   o8,     T1
-       lxsspx          vs15,   o12,    T1
+       lxsspx          vs13,   o16,    T1
+       lxsspx          vs14,   o32,    T1
+       lxsspx          vs15,   o48,    T1
 
-       addi            BO,     BO,     32
+
+       addi            BO,     BO,     128
 
 
-       xsmaddasp       vs32,   vs4,    vs16
-       xsmaddasp       vs33,   vs5,    vs16
+       xsmaddadp       vs32,   vs4,    vs16
+       xsmaddadp       vs33,   vs5,    vs16
 
-       xsmaddasp       vs34,   vs4,    vs17
-       xsmaddasp       vs35,   vs5,    vs17
+       xsmaddadp       vs34,   vs4,    vs17
+       xsmaddadp       vs35,   vs5,    vs17
 
-       xsmaddasp       vs36,   vs4,    vs18
-       xsmaddasp       vs37,   vs5,    vs18
+       xsmaddadp       vs36,   vs4,    vs18
+       xsmaddadp       vs37,   vs5,    vs18
 
-       xsmaddasp       vs38,   vs4,    vs19
-       xsmaddasp       vs39,   vs5,    vs19
+       xsmaddadp       vs38,   vs4,    vs19
+       xsmaddadp       vs39,   vs5,    vs19
 
-       xsmaddasp       vs40,   vs4,    vs20
-       xsmaddasp       vs41,   vs5,    vs20
+       xsmaddadp       vs40,   vs4,    vs20
+       xsmaddadp       vs41,   vs5,    vs20
 
-       xsmaddasp       vs42,   vs4,    vs21
-       xsmaddasp       vs43,   vs5,    vs21
+       xsmaddadp       vs42,   vs4,    vs21
+       xsmaddadp       vs43,   vs5,    vs21
 
-       xsmaddasp       vs44,   vs4,    vs22
-       xsmaddasp       vs45,   vs5,    vs22
+       xsmaddadp       vs44,   vs4,    vs22
+       xsmaddadp       vs45,   vs5,    vs22
 
-       xsmaddasp       vs46,   vs4,    vs23
-       xsmaddasp       vs47,   vs5,    vs23
+       xsmaddadp       vs46,   vs4,    vs23
+       xsmaddadp       vs47,   vs5,    vs23
 
 
 .endm
@@ -3015,29 +1797,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro KERNEL8x2_E2
 
 
-       xsmaddasp       vs32,   vs4,    vs16
-       xsmaddasp       vs33,   vs5,    vs16
+       xsmaddadp       vs32,   vs4,    vs16
+       xsmaddadp       vs33,   vs5,    vs16
 
-       xsmaddasp       vs34,   vs4,    vs17
-       xsmaddasp       vs35,   vs5,    vs17
+       xsmaddadp       vs34,   vs4,    vs17
+       xsmaddadp       vs35,   vs5,    vs17
 
-       xsmaddasp       vs36,   vs4,    vs18
-       xsmaddasp       vs37,   vs5,    vs18
+       xsmaddadp       vs36,   vs4,    vs18
+       xsmaddadp       vs37,   vs5,    vs18
 
-       xsmaddasp       vs38,   vs4,    vs19
-       xsmaddasp       vs39,   vs5,    vs19
+       xsmaddadp       vs38,   vs4,    vs19
+       xsmaddadp       vs39,   vs5,    vs19
 
-       xsmaddasp       vs40,   vs4,    vs20
-       xsmaddasp       vs41,   vs5,    vs20
+       xsmaddadp       vs40,   vs4,    vs20
+       xsmaddadp       vs41,   vs5,    vs20
 
-       xsmaddasp       vs42,   vs4,    vs21
-       xsmaddasp       vs43,   vs5,    vs21
+       xsmaddadp       vs42,   vs4,    vs21
+       xsmaddadp       vs43,   vs5,    vs21
 
-       xsmaddasp       vs44,   vs4,    vs22
-       xsmaddasp       vs45,   vs5,    vs22
+       xsmaddadp       vs44,   vs4,    vs22
+       xsmaddadp       vs45,   vs5,    vs22
 
-       xsmaddasp       vs46,   vs4,    vs23
-       xsmaddasp       vs47,   vs5,    vs23
+       xsmaddadp       vs46,   vs4,    vs23
+       xsmaddadp       vs47,   vs5,    vs23
 
 
 .endm
@@ -3053,43 +1835,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              T1,     BO
 
        lxsspx          vs8,    o0,     T1
-       lxsspx          vs9,    o4,     T1
-       lxsspx          vs10,   o8,     T1
-       lxsspx          vs11,   o12,    T1
+       lxsspx          vs9,    o16,    T1
+       lxsspx          vs10,   o32,    T1
+       lxsspx          vs11,   o48,    T1
 
-       addi            T1,     T1,     16
+       addi            T1,     T1,     64
 
        lxsspx          vs12,   o0,     T1
-       lxsspx          vs13,   o4,     T1
-       lxsspx          vs14,   o8,     T1
-       lxsspx          vs15,   o12,    T1
+       lxsspx          vs13,   o16,    T1
+       lxsspx          vs14,   o32,    T1
+       lxsspx          vs15,   o48,    T1
 
-       addi            BO,     BO,     32
+
+       addi            BO,     BO,     128
 
 
-       xsmulsp         vs32,   vs0,    vs8
-       xsmulsp         vs33,   vs1,    vs8
+       xsmuldp         vs32,   vs0,    vs8
+       xsmuldp         vs33,   vs1,    vs8
 
-       xsmulsp         vs34,   vs0,    vs9
-       xsmulsp         vs35,   vs1,    vs9
+       xsmuldp         vs34,   vs0,    vs9
+       xsmuldp         vs35,   vs1,    vs9
 
-       xsmulsp         vs36,   vs0,    vs10
-       xsmulsp         vs37,   vs1,    vs10
+       xsmuldp         vs36,   vs0,    vs10
+       xsmuldp         vs37,   vs1,    vs10
 
-       xsmulsp         vs38,   vs0,    vs11
-       xsmulsp         vs39,   vs1,    vs11
+       xsmuldp         vs38,   vs0,    vs11
+       xsmuldp         vs39,   vs1,    vs11
 
-       xsmulsp         vs40,   vs0,    vs12
-       xsmulsp         vs41,   vs1,    vs12
+       xsmuldp         vs40,   vs0,    vs12
+       xsmuldp         vs41,   vs1,    vs12
 
-       xsmulsp         vs42,   vs0,    vs13
-       xsmulsp         vs43,   vs1,    vs13
+       xsmuldp         vs42,   vs0,    vs13
+       xsmuldp         vs43,   vs1,    vs13
 
-       xsmulsp         vs44,   vs0,    vs14
-       xsmulsp         vs45,   vs1,    vs14
+       xsmuldp         vs44,   vs0,    vs14
+       xsmuldp         vs45,   vs1,    vs14
 
-       xsmulsp         vs46,   vs0,    vs15
-       xsmulsp         vs47,   vs1,    vs15
+       xsmuldp         vs46,   vs0,    vs15
+       xsmuldp         vs47,   vs1,    vs15
 
 
 .endm
@@ -3105,43 +1888,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              T1,     BO
 
        lxsspx          vs8,    o0,     T1
-       lxsspx          vs9,    o4,     T1
-       lxsspx          vs10,   o8,     T1
-       lxsspx          vs11,   o12,    T1
+       lxsspx          vs9,    o16,    T1
+       lxsspx          vs10,   o32,    T1
+       lxsspx          vs11,   o48,    T1
 
-       addi            T1,     T1,     16
+       addi            T1,     T1,     64
 
        lxsspx          vs12,   o0,     T1
-       lxsspx          vs13,   o4,     T1
-       lxsspx          vs14,   o8,     T1
-       lxsspx          vs15,   o12,    T1
+       lxsspx          vs13,   o16,    T1
+       lxsspx          vs14,   o32,    T1
+       lxsspx          vs15,   o48,    T1
 
-       addi            BO,     BO,     32
+
+       addi            BO,     BO,     128
 
 
-       xsmaddasp       vs32,   vs0,    vs8
-       xsmaddasp       vs33,   vs1,    vs8
+       xsmaddadp       vs32,   vs0,    vs8
+       xsmaddadp       vs33,   vs1,    vs8
 
-       xsmaddasp       vs34,   vs0,    vs9
-       xsmaddasp       vs35,   vs1,    vs9
+       xsmaddadp       vs34,   vs0,    vs9
+       xsmaddadp       vs35,   vs1,    vs9
 
-       xsmaddasp       vs36,   vs0,    vs10
-       xsmaddasp       vs37,   vs1,    vs10
+       xsmaddadp       vs36,   vs0,    vs10
+       xsmaddadp       vs37,   vs1,    vs10
 
-       xsmaddasp       vs38,   vs0,    vs11
-       xsmaddasp       vs39,   vs1,    vs11
+       xsmaddadp       vs38,   vs0,    vs11
+       xsmaddadp       vs39,   vs1,    vs11
 
-       xsmaddasp       vs40,   vs0,    vs12
-       xsmaddasp       vs41,   vs1,    vs12
+       xsmaddadp       vs40,   vs0,    vs12
+       xsmaddadp       vs41,   vs1,    vs12
 
-       xsmaddasp       vs42,   vs0,    vs13
-       xsmaddasp       vs43,   vs1,    vs13
+       xsmaddadp       vs42,   vs0,    vs13
+       xsmaddadp       vs43,   vs1,    vs13
 
-       xsmaddasp       vs44,   vs0,    vs14
-       xsmaddasp       vs45,   vs1,    vs14
+       xsmaddadp       vs44,   vs0,    vs14
+       xsmaddadp       vs45,   vs1,    vs14
 
-       xsmaddasp       vs46,   vs0,    vs15
-       xsmaddasp       vs47,   vs1,    vs15
+       xsmaddadp       vs46,   vs0,    vs15
+       xsmaddadp       vs47,   vs1,    vs15
 
 
 .endm
@@ -3158,17 +1942,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 #ifdef TRMMKERNEL
-
-       xsmulsp         vs0,    vs32,   alpha_r
-       xsmulsp         vs1,    vs33,   alpha_r
-
+       xsmuldp         vs0,    vs32,   alpha_r
+       xsmuldp         vs1,    vs33,   alpha_r
 #else
-
-       xsmulsp         vs28,   vs32,   alpha_r
-       xsaddsp         vs0,    vs0,    vs28
-       xsmulsp         vs28,   vs33,   alpha_r
-       xsaddsp         vs1,    vs1,    vs28
-
+       xsmaddadp       vs0,    vs32,   alpha_r
+       xsmaddadp       vs1,    vs33,   alpha_r
 #endif
 
        stxsspx         vs0,    o0,     T1
@@ -3185,17 +1963,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 #ifdef TRMMKERNEL
-
-       xsmulsp         vs0,    vs34,   alpha_r
-       xsmulsp         vs1,    vs35,   alpha_r
-
+       xsmuldp         vs0,    vs34,   alpha_r
+       xsmuldp         vs1,    vs35,   alpha_r
 #else
-
-       xsmulsp         vs28,   vs34,   alpha_r
-       xsaddsp         vs0,    vs0,    vs28
-       xsmulsp         vs28,   vs35,   alpha_r
-       xsaddsp         vs1,    vs1,    vs28
-
+       xsmaddadp       vs0,    vs34,   alpha_r
+       xsmaddadp       vs1,    vs35,   alpha_r
 #endif
 
        stxsspx         vs0,    o0,     T1
@@ -3212,17 +1984,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 #ifdef TRMMKERNEL
-
-       xsmulsp         vs0,    vs36,   alpha_r
-       xsmulsp         vs1,    vs37,   alpha_r
-
+       xsmuldp         vs0,    vs36,   alpha_r
+       xsmuldp         vs1,    vs37,   alpha_r
 #else
-
-       xsmulsp         vs28,   vs36,   alpha_r
-       xsaddsp         vs0,    vs0,    vs28
-       xsmulsp         vs28,   vs37,   alpha_r
-       xsaddsp         vs1,    vs1,    vs28
-
+       xsmaddadp       vs0,    vs36,   alpha_r
+       xsmaddadp       vs1,    vs37,   alpha_r
 #endif
 
        stxsspx         vs0,    o0,     T1
@@ -3239,17 +2005,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 #ifdef TRMMKERNEL
-
-       xsmulsp         vs0,    vs38,   alpha_r
-       xsmulsp         vs1,    vs39,   alpha_r
-
+       xsmuldp         vs0,    vs38,   alpha_r
+       xsmuldp         vs1,    vs39,   alpha_r
 #else
-
-       xsmulsp         vs28,   vs38,   alpha_r
-       xsaddsp         vs0,    vs0,    vs28
-       xsmulsp         vs28,   vs39,   alpha_r
-       xsaddsp         vs1,    vs1,    vs28
-
+       xsmaddadp       vs0,    vs38,   alpha_r
+       xsmaddadp       vs1,    vs39,   alpha_r
 #endif
 
        stxsspx         vs0,    o0,     T1
@@ -3266,17 +2026,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 #ifdef TRMMKERNEL
-
-       xsmulsp         vs0,    vs40,   alpha_r
-       xsmulsp         vs1,    vs41,   alpha_r
-
+       xsmuldp         vs0,    vs40,   alpha_r
+       xsmuldp         vs1,    vs41,   alpha_r
 #else
-
-       xsmulsp         vs28,   vs40,   alpha_r
-       xsaddsp         vs0,    vs0,    vs28
-       xsmulsp         vs28,   vs41,   alpha_r
-       xsaddsp         vs1,    vs1,    vs28
-
+       xsmaddadp       vs0,    vs40,   alpha_r
+       xsmaddadp       vs1,    vs41,   alpha_r
 #endif
 
        stxsspx         vs0,    o0,     T1
@@ -3293,17 +2047,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 #ifdef TRMMKERNEL
-
-       xsmulsp         vs0,    vs42,   alpha_r
-       xsmulsp         vs1,    vs43,   alpha_r
-
+       xsmuldp         vs0,    vs42,   alpha_r
+       xsmuldp         vs1,    vs43,   alpha_r
 #else
-
-       xsmulsp         vs28,   vs42,   alpha_r
-       xsaddsp         vs0,    vs0,    vs28
-       xsmulsp         vs28,   vs43,   alpha_r
-       xsaddsp         vs1,    vs1,    vs28
-
+       xsmaddadp       vs0,    vs42,   alpha_r
+       xsmaddadp       vs1,    vs43,   alpha_r
 #endif
 
        stxsspx         vs0,    o0,     T1
@@ -3320,17 +2068,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 #ifdef TRMMKERNEL
-
-       xsmulsp         vs0,    vs44,   alpha_r
-       xsmulsp         vs1,    vs45,   alpha_r
-
+       xsmuldp         vs0,    vs44,   alpha_r
+       xsmuldp         vs1,    vs45,   alpha_r
 #else
-
-       xsmulsp         vs28,   vs44,   alpha_r
-       xsaddsp         vs0,    vs0,    vs28
-       xsmulsp         vs28,   vs45,   alpha_r
-       xsaddsp         vs1,    vs1,    vs28
-
+       xsmaddadp       vs0,    vs44,   alpha_r
+       xsmaddadp       vs1,    vs45,   alpha_r
 #endif
 
        stxsspx         vs0,    o0,     T1
@@ -3347,17 +2089,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 #ifdef TRMMKERNEL
-
-       xsmulsp         vs0,    vs46,   alpha_r
-       xsmulsp         vs1,    vs47,   alpha_r
-
+       xsmuldp         vs0,    vs46,   alpha_r
+       xsmuldp         vs1,    vs47,   alpha_r
 #else
-
-       xsmulsp         vs28,   vs46,   alpha_r
-       xsaddsp         vs0,    vs0,    vs28
-       xsmulsp         vs28,   vs47,   alpha_r
-       xsaddsp         vs1,    vs1,    vs28
-
+       xsmaddadp       vs0,    vs46,   alpha_r
+       xsmaddadp       vs1,    vs47,   alpha_r
 #endif
 
        stxsspx         vs0,    o0,     T1
@@ -3383,18 +2119,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              T1,     BO
 
        lxsspx          vs8,    o0,     T1
-       lxsspx          vs9,    o4,     T1
-       lxsspx          vs10,   o8,     T1
-       lxsspx          vs11,   o12,    T1
+       lxsspx          vs9,    o16,    T1
+       lxsspx          vs10,   o32,    T1
+       lxsspx          vs11,   o48,    T1
 
-       addi            T1,     T1,     16
+       addi            T1,     T1,     64
 
        lxsspx          vs12,   o0,     T1
-       lxsspx          vs13,   o4,     T1
-       lxsspx          vs14,   o8,     T1
-       lxsspx          vs15,   o12,    T1
+       lxsspx          vs13,   o16,    T1
+       lxsspx          vs14,   o32,    T1
+       lxsspx          vs15,   o48,    T1
 
-       addi            BO,     BO,     32
+
+       addi            BO,     BO,     128
 
 .endm
 
@@ -3408,35 +2145,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              T1,     BO
 
        lxsspx          vs16,   o0,     T1
-       lxsspx          vs17,   o4,     T1
-       lxsspx          vs18,   o8,     T1
-       lxsspx          vs19,   o12,    T1
+       lxsspx          vs17,   o16,    T1
+       lxsspx          vs18,   o32,    T1
+       lxsspx          vs19,   o48,    T1
 
-       addi            T1,     T1,     16
+       addi            T1,     T1,     64
 
        lxsspx          vs20,   o0,     T1
-       lxsspx          vs21,   o4,     T1
-       lxsspx          vs22,   o8,     T1
-       lxsspx          vs23,   o12,    T1
+       lxsspx          vs21,   o16,    T1
+       lxsspx          vs22,   o32,    T1
+       lxsspx          vs23,   o48,    T1
 
-       addi            BO,     BO,     32
+
+       addi            BO,     BO,     128
 
 
-       xsmulsp         vs32,   vs0,    vs8
+       xsmuldp         vs32,   vs0,    vs8
 
-       xsmulsp         vs33,   vs0,    vs9
+       xsmuldp         vs33,   vs0,    vs9
 
-       xsmulsp         vs34,   vs0,    vs10
+       xsmuldp         vs34,   vs0,    vs10
 
-       xsmulsp         vs35,   vs0,    vs11
+       xsmuldp         vs35,   vs0,    vs11
 
-       xsmulsp         vs36,   vs0,    vs12
+       xsmuldp         vs36,   vs0,    vs12
 
-       xsmulsp         vs37,   vs0,    vs13
+       xsmuldp         vs37,   vs0,    vs13
 
-       xsmulsp         vs38,   vs0,    vs14
+       xsmuldp         vs38,   vs0,    vs14
 
-       xsmulsp         vs39,   vs0,    vs15
+       xsmuldp         vs39,   vs0,    vs15
 
 
 .endm
@@ -3451,35 +2189,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              T1,     BO
 
        lxsspx          vs16,   o0,     T1
-       lxsspx          vs17,   o4,     T1
-       lxsspx          vs18,   o8,     T1
-       lxsspx          vs19,   o12,    T1
+       lxsspx          vs17,   o16,    T1
+       lxsspx          vs18,   o32,    T1
+       lxsspx          vs19,   o48,    T1
 
-       addi            T1,     T1,     16
+       addi            T1,     T1,     64
 
        lxsspx          vs20,   o0,     T1
-       lxsspx          vs21,   o4,     T1
-       lxsspx          vs22,   o8,     T1
-       lxsspx          vs23,   o12,    T1
+       lxsspx          vs21,   o16,    T1
+       lxsspx          vs22,   o32,    T1
+       lxsspx          vs23,   o48,    T1
 
-       addi            BO,     BO,     32
+
+       addi            BO,     BO,     128
 
 
-       xsmaddasp       vs32,   vs0,    vs8
+       xsmaddadp       vs32,   vs0,    vs8
 
-       xsmaddasp       vs33,   vs0,    vs9
+       xsmaddadp       vs33,   vs0,    vs9
 
-       xsmaddasp       vs34,   vs0,    vs10
+       xsmaddadp       vs34,   vs0,    vs10
 
-       xsmaddasp       vs35,   vs0,    vs11
+       xsmaddadp       vs35,   vs0,    vs11
 
-       xsmaddasp       vs36,   vs0,    vs12
+       xsmaddadp       vs36,   vs0,    vs12
 
-       xsmaddasp       vs37,   vs0,    vs13
+       xsmaddadp       vs37,   vs0,    vs13
 
-       xsmaddasp       vs38,   vs0,    vs14
+       xsmaddadp       vs38,   vs0,    vs14
 
-       xsmaddasp       vs39,   vs0,    vs15
+       xsmaddadp       vs39,   vs0,    vs15
 
 
 .endm
@@ -3494,35 +2233,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              T1,     BO
 
        lxsspx          vs8,    o0,     T1
-       lxsspx          vs9,    o4,     T1
-       lxsspx          vs10,   o8,     T1
-       lxsspx          vs11,   o12,    T1
+       lxsspx          vs9,    o16,    T1
+       lxsspx          vs10,   o32,    T1
+       lxsspx          vs11,   o48,    T1
 
-       addi            T1,     T1,     16
+       addi            T1,     T1,     64
 
        lxsspx          vs12,   o0,     T1
-       lxsspx          vs13,   o4,     T1
-       lxsspx          vs14,   o8,     T1
-       lxsspx          vs15,   o12,    T1
+       lxsspx          vs13,   o16,    T1
+       lxsspx          vs14,   o32,    T1
+       lxsspx          vs15,   o48,    T1
 
-       addi            BO,     BO,     32
+
+       addi            BO,     BO,     128
 
 
-       xsmaddasp       vs32,   vs4,    vs16
+       xsmaddadp       vs32,   vs4,    vs16
 
-       xsmaddasp       vs33,   vs4,    vs17
+       xsmaddadp       vs33,   vs4,    vs17
 
-       xsmaddasp       vs34,   vs4,    vs18
+       xsmaddadp       vs34,   vs4,    vs18
 
-       xsmaddasp       vs35,   vs4,    vs19
+       xsmaddadp       vs35,   vs4,    vs19
 
-       xsmaddasp       vs36,   vs4,    vs20
+       xsmaddadp       vs36,   vs4,    vs20
 
-       xsmaddasp       vs37,   vs4,    vs21
+       xsmaddadp       vs37,   vs4,    vs21
 
-       xsmaddasp       vs38,   vs4,    vs22
+       xsmaddadp       vs38,   vs4,    vs22
 
-       xsmaddasp       vs39,   vs4,    vs23
+       xsmaddadp       vs39,   vs4,    vs23
 
 
 .endm
@@ -3530,21 +2270,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro KERNEL8x1_E2
 
 
-       xsmaddasp       vs32,   vs4,    vs16
+       xsmaddadp       vs32,   vs4,    vs16
 
-       xsmaddasp       vs33,   vs4,    vs17
+       xsmaddadp       vs33,   vs4,    vs17
 
-       xsmaddasp       vs34,   vs4,    vs18
+       xsmaddadp       vs34,   vs4,    vs18
 
-       xsmaddasp       vs35,   vs4,    vs19
+       xsmaddadp       vs35,   vs4,    vs19
 
-       xsmaddasp       vs36,   vs4,    vs20
+       xsmaddadp       vs36,   vs4,    vs20
 
-       xsmaddasp       vs37,   vs4,    vs21
+       xsmaddadp       vs37,   vs4,    vs21
 
-       xsmaddasp       vs38,   vs4,    vs22
+       xsmaddadp       vs38,   vs4,    vs22
 
-       xsmaddasp       vs39,   vs4,    vs23
+       xsmaddadp       vs39,   vs4,    vs23
 
 
 .endm
@@ -3559,35 +2299,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              T1,     BO
 
        lxsspx          vs8,    o0,     T1
-       lxsspx          vs9,    o4,     T1
-       lxsspx          vs10,   o8,     T1
-       lxsspx          vs11,   o12,    T1
+       lxsspx          vs9,    o16,    T1
+       lxsspx          vs10,   o32,    T1
+       lxsspx          vs11,   o48,    T1
 
-       addi            T1,     T1,     16
+       addi            T1,     T1,     64
 
        lxsspx          vs12,   o0,     T1
-       lxsspx          vs13,   o4,     T1
-       lxsspx          vs14,   o8,     T1
-       lxsspx          vs15,   o12,    T1
+       lxsspx          vs13,   o16,    T1
+       lxsspx          vs14,   o32,    T1
+       lxsspx          vs15,   o48,    T1
 
-       addi            BO,     BO,     32
+
+       addi            BO,     BO,     128
 
 
-       xsmulsp         vs32,   vs0,    vs8
+       xsmuldp         vs32,   vs0,    vs8
 
-       xsmulsp         vs33,   vs0,    vs9
+       xsmuldp         vs33,   vs0,    vs9
 
-       xsmulsp         vs34,   vs0,    vs10
+       xsmuldp         vs34,   vs0,    vs10
 
-       xsmulsp         vs35,   vs0,    vs11
+       xsmuldp         vs35,   vs0,    vs11
 
-       xsmulsp         vs36,   vs0,    vs12
+       xsmuldp         vs36,   vs0,    vs12
 
-       xsmulsp         vs37,   vs0,    vs13
+       xsmuldp         vs37,   vs0,    vs13
 
-       xsmulsp         vs38,   vs0,    vs14
+       xsmuldp         vs38,   vs0,    vs14
 
-       xsmulsp         vs39,   vs0,    vs15
+       xsmuldp         vs39,   vs0,    vs15
 
 
 .endm
@@ -3602,35 +2343,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              T1,     BO
 
        lxsspx          vs8,    o0,     T1
-       lxsspx          vs9,    o4,     T1
-       lxsspx          vs10,   o8,     T1
-       lxsspx          vs11,   o12,    T1
+       lxsspx          vs9,    o16,    T1
+       lxsspx          vs10,   o32,    T1
+       lxsspx          vs11,   o48,    T1
 
-       addi            T1,     T1,     16
+       addi            T1,     T1,     64
 
        lxsspx          vs12,   o0,     T1
-       lxsspx          vs13,   o4,     T1
-       lxsspx          vs14,   o8,     T1
-       lxsspx          vs15,   o12,    T1
+       lxsspx          vs13,   o16,    T1
+       lxsspx          vs14,   o32,    T1
+       lxsspx          vs15,   o48,    T1
 
-       addi            BO,     BO,     32
+
+       addi            BO,     BO,     128
 
 
-       xsmaddasp       vs32,   vs0,    vs8
+       xsmaddadp       vs32,   vs0,    vs8
 
-       xsmaddasp       vs33,   vs0,    vs9
+       xsmaddadp       vs33,   vs0,    vs9
 
-       xsmaddasp       vs34,   vs0,    vs10
+       xsmaddadp       vs34,   vs0,    vs10
 
-       xsmaddasp       vs35,   vs0,    vs11
+       xsmaddadp       vs35,   vs0,    vs11
 
-       xsmaddasp       vs36,   vs0,    vs12
+       xsmaddadp       vs36,   vs0,    vs12
 
-       xsmaddasp       vs37,   vs0,    vs13
+       xsmaddadp       vs37,   vs0,    vs13
 
-       xsmaddasp       vs38,   vs0,    vs14
+       xsmaddadp       vs38,   vs0,    vs14
 
-       xsmaddasp       vs39,   vs0,    vs15
+       xsmaddadp       vs39,   vs0,    vs15
 
 
 .endm
@@ -3646,14 +2388,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 #ifdef TRMMKERNEL
-
-       xsmulsp         vs0,    vs32,   alpha_r
-
+       xsmuldp         vs0,    vs32,   alpha_r
 #else
-
-       xsmulsp         vs28,   vs32,   alpha_r
-       xsaddsp         vs0,    vs0,    vs28
-
+       xsmaddadp       vs0,    vs32,   alpha_r
 #endif
 
        stxsspx         vs0,    o0,     T1
@@ -3668,14 +2405,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 #ifdef TRMMKERNEL
-
-       xsmulsp         vs0,    vs33,   alpha_r
-
+       xsmuldp         vs0,    vs33,   alpha_r
 #else
-
-       xsmulsp         vs28,   vs33,   alpha_r
-       xsaddsp         vs0,    vs0,    vs28
-
+       xsmaddadp       vs0,    vs33,   alpha_r
 #endif
 
        stxsspx         vs0,    o0,     T1
@@ -3690,14 +2422,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 #ifdef TRMMKERNEL
-
-       xsmulsp         vs0,    vs34,   alpha_r
-
+       xsmuldp         vs0,    vs34,   alpha_r
 #else
-
-       xsmulsp         vs28,   vs34,   alpha_r
-       xsaddsp         vs0,    vs0,    vs28
-
+       xsmaddadp       vs0,    vs34,   alpha_r
 #endif
 
        stxsspx         vs0,    o0,     T1
@@ -3712,14 +2439,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 #ifdef TRMMKERNEL
-
-       xsmulsp         vs0,    vs35,   alpha_r
-
+       xsmuldp         vs0,    vs35,   alpha_r
 #else
-
-       xsmulsp         vs28,   vs35,   alpha_r
-       xsaddsp         vs0,    vs0,    vs28
-
+       xsmaddadp       vs0,    vs35,   alpha_r
 #endif
 
        stxsspx         vs0,    o0,     T1
@@ -3734,14 +2456,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 #ifdef TRMMKERNEL
-
-       xsmulsp         vs0,    vs36,   alpha_r
-
+       xsmuldp         vs0,    vs36,   alpha_r
 #else
-
-       xsmulsp         vs28,   vs36,   alpha_r
-       xsaddsp         vs0,    vs0,    vs28
-
+       xsmaddadp       vs0,    vs36,   alpha_r
 #endif
 
        stxsspx         vs0,    o0,     T1
@@ -3756,14 +2473,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 #ifdef TRMMKERNEL
-
-       xsmulsp         vs0,    vs37,   alpha_r
-
+       xsmuldp         vs0,    vs37,   alpha_r
 #else
-
-       xsmulsp         vs28,   vs37,   alpha_r
-       xsaddsp         vs0,    vs0,    vs28
-
+       xsmaddadp       vs0,    vs37,   alpha_r
 #endif
 
        stxsspx         vs0,    o0,     T1
@@ -3778,14 +2490,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 #ifdef TRMMKERNEL
-
-       xsmulsp         vs0,    vs38,   alpha_r
-
+       xsmuldp         vs0,    vs38,   alpha_r
 #else
-
-       xsmulsp         vs28,   vs38,   alpha_r
-       xsaddsp         vs0,    vs0,    vs28
-
+       xsmaddadp       vs0,    vs38,   alpha_r
 #endif
 
        stxsspx         vs0,    o0,     T1
@@ -3800,14 +2507,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 #ifdef TRMMKERNEL
-
-       xsmulsp         vs0,    vs39,   alpha_r
-
+       xsmuldp         vs0,    vs39,   alpha_r
 #else
-
-       xsmulsp         vs28,   vs39,   alpha_r
-       xsaddsp         vs0,    vs0,    vs28
-
+       xsmaddadp       vs0,    vs39,   alpha_r
 #endif
 
        stxsspx         vs0,    o0,     T1
@@ -3832,14 +2534,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     64
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs8,    vs28,   0
-       xxspltw         vs9,    vs28,   1
-       xxspltw         vs10,   vs28,   2
-       xxspltw         vs11,   vs28,   3
+       lxvw4x          vs8,    o0,     T1
+       lxvw4x          vs9,    o16,    T1
+       lxvw4x          vs10,   o32,    T1
+       lxvw4x          vs11,   o48,    T1
 
-       addi            BO,     BO,     16
+       addi            BO,     BO,     64
 
 .endm
 
@@ -3853,14 +2555,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     64
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs16,   vs28,   0
-       xxspltw         vs17,   vs28,   1
-       xxspltw         vs18,   vs28,   2
-       xxspltw         vs19,   vs28,   3
+       lxvw4x          vs16,   o0,     T1
+       lxvw4x          vs17,   o16,    T1
+       lxvw4x          vs18,   o32,    T1
+       lxvw4x          vs19,   o48,    T1
 
-       addi            BO,     BO,     16
+       addi            BO,     BO,     64
 
 
        xvmulsp         vs32,   vs0,    vs8
@@ -3896,14 +2598,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     64
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs16,   vs28,   0
-       xxspltw         vs17,   vs28,   1
-       xxspltw         vs18,   vs28,   2
-       xxspltw         vs19,   vs28,   3
+       lxvw4x          vs16,   o0,     T1
+       lxvw4x          vs17,   o16,    T1
+       lxvw4x          vs18,   o32,    T1
+       lxvw4x          vs19,   o48,    T1
 
-       addi            BO,     BO,     16
+       addi            BO,     BO,     64
 
 
        xvmaddasp       vs32,   vs0,    vs8
@@ -3939,14 +2641,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     64
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs8,    vs28,   0
-       xxspltw         vs9,    vs28,   1
-       xxspltw         vs10,   vs28,   2
-       xxspltw         vs11,   vs28,   3
+       lxvw4x          vs8,    o0,     T1
+       lxvw4x          vs9,    o16,    T1
+       lxvw4x          vs10,   o32,    T1
+       lxvw4x          vs11,   o48,    T1
 
-       addi            BO,     BO,     16
+       addi            BO,     BO,     64
 
 
        xvmaddasp       vs32,   vs4,    vs16
@@ -4008,14 +2710,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     64
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs8,    vs28,   0
-       xxspltw         vs9,    vs28,   1
-       xxspltw         vs10,   vs28,   2
-       xxspltw         vs11,   vs28,   3
+       lxvw4x          vs8,    o0,     T1
+       lxvw4x          vs9,    o16,    T1
+       lxvw4x          vs10,   o32,    T1
+       lxvw4x          vs11,   o48,    T1
 
-       addi            BO,     BO,     16
+       addi            BO,     BO,     64
 
 
        xvmulsp         vs32,   vs0,    vs8
@@ -4051,14 +2753,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     64
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs8,    vs28,   0
-       xxspltw         vs9,    vs28,   1
-       xxspltw         vs10,   vs28,   2
-       xxspltw         vs11,   vs28,   3
+       lxvw4x          vs8,    o0,     T1
+       lxvw4x          vs9,    o16,    T1
+       lxvw4x          vs10,   o32,    T1
+       lxvw4x          vs11,   o48,    T1
 
-       addi            BO,     BO,     16
+       addi            BO,     BO,     64
 
 
        xvmaddasp       vs32,   vs0,    vs8
@@ -4097,106 +2799,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-
-       stxvw4x         vs32,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
-#ifdef TRMMKERNEL
-       lxvw4x          vs0,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs0,    vs0,    vs28
-#endif
-
-       stxvw4x         vs33,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
-#ifdef TRMMKERNEL
-       lxvw4x          vs1,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs1,    vs1,    vs28
-#endif
-
-       stxvw4x         vs34,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
-#ifdef TRMMKERNEL
-       lxvw4x          vs2,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs2,    vs2,    vs28
-#endif
-
-       stxvw4x         vs35,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
 #ifdef TRMMKERNEL
-       lxvw4x          vs3,    o0,     TBUFFER
+       xvmulsp         vs0,    vs32,   alpha_vr
+       xvmulsp         vs1,    vs33,   alpha_vr
+       xvmulsp         vs2,    vs34,   alpha_vr
+       xvmulsp         vs3,    vs35,   alpha_vr
 #else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs3,    vs3,    vs28
+       xvmaddasp       vs0,    vs32,   alpha_vr
+       xvmaddasp       vs1,    vs33,   alpha_vr
+       xvmaddasp       vs2,    vs34,   alpha_vr
+       xvmaddasp       vs3,    vs35,   alpha_vr
 #endif
 
-
-
-
        stxvw4x         vs0,    o0,     T1
        stxvw4x         vs1,    o16,    T1
        stxvw4x         vs2,    o32,    T1
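
With alpha also available splatted across a vector register (alpha_vr) in addition to the scalar alpha_r, the store path above reduces to one xvmulsp per four columns in the TRMM case (C is overwritten) or one xvmaddasp in the GEMM case (C is accumulated), instead of the former scalar round trip through a temporary buffer. A C sketch of the per-tile update these instructions implement, with illustrative names only:

/* Illustrative only: the update one vector save step performs for four
 * adjacent result elements. acc holds the accumulated dot products,
 * c points at the destination elements, alpha is the scaling factor. */
static void save_tile4(float *c, const float *acc, float alpha, int trmm)
{
    for (int i = 0; i < 4; i++) {
        if (trmm)
            c[i] = alpha * acc[i];          /* TRMM kernel: C is overwritten   */
        else
            c[i] = c[i] + alpha * acc[i];   /* GEMM kernel: C is accumulated   */
    }
}
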
@@ -4214,106 +2828,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-
-       stxvw4x         vs36,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
-#ifdef TRMMKERNEL
-       lxvw4x          vs0,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs0,    vs0,    vs28
-#endif
-
-       stxvw4x         vs37,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
-#ifdef TRMMKERNEL
-       lxvw4x          vs1,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs1,    vs1,    vs28
-#endif
-
-       stxvw4x         vs38,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
-#ifdef TRMMKERNEL
-       lxvw4x          vs2,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs2,    vs2,    vs28
-#endif
-
-       stxvw4x         vs39,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
 #ifdef TRMMKERNEL
-       lxvw4x          vs3,    o0,     TBUFFER
+       xvmulsp         vs0,    vs36,   alpha_vr
+       xvmulsp         vs1,    vs37,   alpha_vr
+       xvmulsp         vs2,    vs38,   alpha_vr
+       xvmulsp         vs3,    vs39,   alpha_vr
 #else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs3,    vs3,    vs28
+       xvmaddasp       vs0,    vs36,   alpha_vr
+       xvmaddasp       vs1,    vs37,   alpha_vr
+       xvmaddasp       vs2,    vs38,   alpha_vr
+       xvmaddasp       vs3,    vs39,   alpha_vr
 #endif
 
-
-
-
        stxvw4x         vs0,    o0,     T1
        stxvw4x         vs1,    o16,    T1
        stxvw4x         vs2,    o32,    T1
@@ -4331,106 +2857,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-
-       stxvw4x         vs40,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
-#ifdef TRMMKERNEL
-       lxvw4x          vs0,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs0,    vs0,    vs28
-#endif
-
-       stxvw4x         vs41,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
-#ifdef TRMMKERNEL
-       lxvw4x          vs1,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs1,    vs1,    vs28
-#endif
-
-       stxvw4x         vs42,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
-#ifdef TRMMKERNEL
-       lxvw4x          vs2,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs2,    vs2,    vs28
-#endif
-
-       stxvw4x         vs43,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
 #ifdef TRMMKERNEL
-       lxvw4x          vs3,    o0,     TBUFFER
+       xvmulsp         vs0,    vs40,   alpha_vr
+       xvmulsp         vs1,    vs41,   alpha_vr
+       xvmulsp         vs2,    vs42,   alpha_vr
+       xvmulsp         vs3,    vs43,   alpha_vr
 #else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs3,    vs3,    vs28
+       xvmaddasp       vs0,    vs40,   alpha_vr
+       xvmaddasp       vs1,    vs41,   alpha_vr
+       xvmaddasp       vs2,    vs42,   alpha_vr
+       xvmaddasp       vs3,    vs43,   alpha_vr
 #endif
 
-
-
-
        stxvw4x         vs0,    o0,     T1
        stxvw4x         vs1,    o16,    T1
        stxvw4x         vs2,    o32,    T1
@@ -4448,105 +2886,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-
-       stxvw4x         vs44,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
-#ifdef TRMMKERNEL
-       lxvw4x          vs0,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs0,    vs0,    vs28
-#endif
-
-       stxvw4x         vs45,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
-#ifdef TRMMKERNEL
-       lxvw4x          vs1,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs1,    vs1,    vs28
-#endif
-
-       stxvw4x         vs46,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
-#ifdef TRMMKERNEL
-       lxvw4x          vs2,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs2,    vs2,    vs28
-#endif
-
-       stxvw4x         vs47,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
 #ifdef TRMMKERNEL
-       lxvw4x          vs3,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs3,    vs3,    vs28
-#endif
-
-
-
+       xvmulsp         vs0,    vs44,   alpha_vr
+       xvmulsp         vs1,    vs45,   alpha_vr
+       xvmulsp         vs2,    vs46,   alpha_vr
+       xvmulsp         vs3,    vs47,   alpha_vr
+#else
+       xvmaddasp       vs0,    vs44,   alpha_vr
+       xvmaddasp       vs1,    vs45,   alpha_vr
+       xvmaddasp       vs2,    vs46,   alpha_vr
+       xvmaddasp       vs3,    vs47,   alpha_vr
+#endif
 
        stxvw4x         vs0,    o0,     T1
        stxvw4x         vs1,    o16,    T1
@@ -4571,14 +2921,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     32
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs8,    vs28,   0
-       xxspltw         vs9,    vs28,   1
-       xxspltw         vs10,   vs28,   2
-       xxspltw         vs11,   vs28,   3
+       lxvw4x          vs8,    o0,     T1
+       lxvw4x          vs9,    o16,    T1
+       lxvw4x          vs10,   o32,    T1
+       lxvw4x          vs11,   o48,    T1
 
-       addi            BO,     BO,     16
+       addi            BO,     BO,     64
 
 .endm
 
@@ -4590,14 +2940,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     32
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs16,   vs28,   0
-       xxspltw         vs17,   vs28,   1
-       xxspltw         vs18,   vs28,   2
-       xxspltw         vs19,   vs28,   3
+       lxvw4x          vs16,   o0,     T1
+       lxvw4x          vs17,   o16,    T1
+       lxvw4x          vs18,   o32,    T1
+       lxvw4x          vs19,   o48,    T1
 
-       addi            BO,     BO,     16
+       addi            BO,     BO,     64
 
 
        xvmulsp         vs32,   vs0,    vs8
@@ -4623,14 +2973,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     32
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs16,   vs28,   0
-       xxspltw         vs17,   vs28,   1
-       xxspltw         vs18,   vs28,   2
-       xxspltw         vs19,   vs28,   3
+       lxvw4x          vs16,   o0,     T1
+       lxvw4x          vs17,   o16,    T1
+       lxvw4x          vs18,   o32,    T1
+       lxvw4x          vs19,   o48,    T1
 
-       addi            BO,     BO,     16
+       addi            BO,     BO,     64
 
 
        xvmaddasp       vs32,   vs0,    vs8
@@ -4656,14 +3006,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     32
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs8,    vs28,   0
-       xxspltw         vs9,    vs28,   1
-       xxspltw         vs10,   vs28,   2
-       xxspltw         vs11,   vs28,   3
+       lxvw4x          vs8,    o0,     T1
+       lxvw4x          vs9,    o16,    T1
+       lxvw4x          vs10,   o32,    T1
+       lxvw4x          vs11,   o48,    T1
 
-       addi            BO,     BO,     16
+       addi            BO,     BO,     64
 
 
        xvmaddasp       vs32,   vs4,    vs16
@@ -4707,14 +3057,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     32
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs8,    vs28,   0
-       xxspltw         vs9,    vs28,   1
-       xxspltw         vs10,   vs28,   2
-       xxspltw         vs11,   vs28,   3
+       lxvw4x          vs8,    o0,     T1
+       lxvw4x          vs9,    o16,    T1
+       lxvw4x          vs10,   o32,    T1
+       lxvw4x          vs11,   o48,    T1
 
-       addi            BO,     BO,     16
+       addi            BO,     BO,     64
 
 
        xvmulsp         vs32,   vs0,    vs8
@@ -4740,14 +3090,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     32
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs8,    vs28,   0
-       xxspltw         vs9,    vs28,   1
-       xxspltw         vs10,   vs28,   2
-       xxspltw         vs11,   vs28,   3
+       lxvw4x          vs8,    o0,     T1
+       lxvw4x          vs9,    o16,    T1
+       lxvw4x          vs10,   o32,    T1
+       lxvw4x          vs11,   o48,    T1
 
-       addi            BO,     BO,     16
+       addi            BO,     BO,     64
 
 
        xvmaddasp       vs32,   vs0,    vs8
@@ -4776,58 +3126,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-
-       stxvw4x         vs32,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
-#ifdef TRMMKERNEL
-       lxvw4x          vs0,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs0,    vs0,    vs28
-#endif
-
-       stxvw4x         vs33,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
 #ifdef TRMMKERNEL
-       lxvw4x          vs1,    o0,     TBUFFER
+       xvmulsp         vs0,    vs32,   alpha_vr
+       xvmulsp         vs1,    vs33,   alpha_vr
 #else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs1,    vs1,    vs28
+       xvmaddasp       vs0,    vs32,   alpha_vr
+       xvmaddasp       vs1,    vs33,   alpha_vr
 #endif
 
-
-
-
        stxvw4x         vs0,    o0,     T1
        stxvw4x         vs1,    o16,    T1
 
@@ -4841,58 +3147,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-
-       stxvw4x         vs34,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
-#ifdef TRMMKERNEL
-       lxvw4x          vs0,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs0,    vs0,    vs28
-#endif
-
-       stxvw4x         vs35,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
 #ifdef TRMMKERNEL
-       lxvw4x          vs1,    o0,     TBUFFER
+       xvmulsp         vs0,    vs34,   alpha_vr
+       xvmulsp         vs1,    vs35,   alpha_vr
 #else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs1,    vs1,    vs28
+       xvmaddasp       vs0,    vs34,   alpha_vr
+       xvmaddasp       vs1,    vs35,   alpha_vr
 #endif
 
-
-
-
        stxvw4x         vs0,    o0,     T1
        stxvw4x         vs1,    o16,    T1
 
@@ -4906,58 +3168,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-
-       stxvw4x         vs36,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
-#ifdef TRMMKERNEL
-       lxvw4x          vs0,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs0,    vs0,    vs28
-#endif
-
-       stxvw4x         vs37,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
 #ifdef TRMMKERNEL
-       lxvw4x          vs1,    o0,     TBUFFER
+       xvmulsp         vs0,    vs36,   alpha_vr
+       xvmulsp         vs1,    vs37,   alpha_vr
 #else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs1,    vs1,    vs28
+       xvmaddasp       vs0,    vs36,   alpha_vr
+       xvmaddasp       vs1,    vs37,   alpha_vr
 #endif
 
-
-
-
        stxvw4x         vs0,    o0,     T1
        stxvw4x         vs1,    o16,    T1
 
@@ -4971,58 +3189,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-
-       stxvw4x         vs38,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
-#ifdef TRMMKERNEL
-       lxvw4x          vs0,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs0,    vs0,    vs28
-#endif
-
-       stxvw4x         vs39,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
 #ifdef TRMMKERNEL
-       lxvw4x          vs1,    o0,     TBUFFER
+       xvmulsp         vs0,    vs38,   alpha_vr
+       xvmulsp         vs1,    vs39,   alpha_vr
 #else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs1,    vs1,    vs28
+       xvmaddasp       vs0,    vs38,   alpha_vr
+       xvmaddasp       vs1,    vs39,   alpha_vr
 #endif
 
-
-
-
        stxvw4x         vs0,    o0,     T1
        stxvw4x         vs1,    o16,    T1
 
@@ -5043,14 +3217,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     16
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs8,    vs28,   0
-       xxspltw         vs9,    vs28,   1
-       xxspltw         vs10,   vs28,   2
-       xxspltw         vs11,   vs28,   3
+       lxvw4x          vs8,    o0,     T1
+       lxvw4x          vs9,    o16,    T1
+       lxvw4x          vs10,   o32,    T1
+       lxvw4x          vs11,   o48,    T1
 
-       addi            BO,     BO,     16
+       addi            BO,     BO,     64
 
 .endm
 
@@ -5061,14 +3235,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     16
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs16,   vs28,   0
-       xxspltw         vs17,   vs28,   1
-       xxspltw         vs18,   vs28,   2
-       xxspltw         vs19,   vs28,   3
+       lxvw4x          vs16,   o0,     T1
+       lxvw4x          vs17,   o16,    T1
+       lxvw4x          vs18,   o32,    T1
+       lxvw4x          vs19,   o48,    T1
 
-       addi            BO,     BO,     16
+       addi            BO,     BO,     64
 
 
        xvmulsp         vs32,   vs0,    vs8
@@ -5089,14 +3263,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     16
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs16,   vs28,   0
-       xxspltw         vs17,   vs28,   1
-       xxspltw         vs18,   vs28,   2
-       xxspltw         vs19,   vs28,   3
+       lxvw4x          vs16,   o0,     T1
+       lxvw4x          vs17,   o16,    T1
+       lxvw4x          vs18,   o32,    T1
+       lxvw4x          vs19,   o48,    T1
 
-       addi            BO,     BO,     16
+       addi            BO,     BO,     64
 
 
        xvmaddasp       vs32,   vs0,    vs8
@@ -5117,14 +3291,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     16
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs8,    vs28,   0
-       xxspltw         vs9,    vs28,   1
-       xxspltw         vs10,   vs28,   2
-       xxspltw         vs11,   vs28,   3
+       lxvw4x          vs8,    o0,     T1
+       lxvw4x          vs9,    o16,    T1
+       lxvw4x          vs10,   o32,    T1
+       lxvw4x          vs11,   o48,    T1
 
-       addi            BO,     BO,     16
+       addi            BO,     BO,     64
 
 
        xvmaddasp       vs32,   vs4,    vs16
@@ -5159,14 +3333,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     16
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs8,    vs28,   0
-       xxspltw         vs9,    vs28,   1
-       xxspltw         vs10,   vs28,   2
-       xxspltw         vs11,   vs28,   3
+       lxvw4x          vs8,    o0,     T1
+       lxvw4x          vs9,    o16,    T1
+       lxvw4x          vs10,   o32,    T1
+       lxvw4x          vs11,   o48,    T1
 
-       addi            BO,     BO,     16
+       addi            BO,     BO,     64
 
 
        xvmulsp         vs32,   vs0,    vs8
@@ -5187,14 +3361,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     16
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs8,    vs28,   0
-       xxspltw         vs9,    vs28,   1
-       xxspltw         vs10,   vs28,   2
-       xxspltw         vs11,   vs28,   3
+       lxvw4x          vs8,    o0,     T1
+       lxvw4x          vs9,    o16,    T1
+       lxvw4x          vs10,   o32,    T1
+       lxvw4x          vs11,   o48,    T1
 
-       addi            BO,     BO,     16
+       addi            BO,     BO,     64
 
 
        xvmaddasp       vs32,   vs0,    vs8
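
For the narrower 4x4 blocks, the kernels above load one 4-float strip of A (AO advances by 16) and four pre-splatted B vectors, and the xvmaddasp accumulation they feed amounts to a rank-1 update of a 4x4 accumulator tile. A hedged C sketch of that inner step (array names are illustrative, not taken from the source):

/* Illustrative sketch of one 4x4 micro-step: a[0..3] is one column strip of A,
 * b[0..3] the already-splatted B values, acc the accumulators that the vector
 * multiply-add instructions keep in VSX registers across the K loop. */
static void micro_step_4x4(const float a[4], const float b[4], float acc[4][4])
{
    for (int j = 0; j < 4; j++)
        for (int i = 0; i < 4; i++)
            acc[j][i] += a[i] * b[j];
}
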
@@ -5218,34 +3392,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-
-       stxvw4x         vs32,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
 #ifdef TRMMKERNEL
-       lxvw4x          vs0,    o0,     TBUFFER
+       xvmulsp         vs0,    vs32,   alpha_vr
 #else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs0,    vs0,    vs28
+       xvmaddasp       vs0,    vs32,   alpha_vr
 #endif
 
-
-
-
        stxvw4x         vs0,    o0,     T1
 
        add             T1,     T1,     LDC
@@ -5257,34 +3409,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-
-       stxvw4x         vs33,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
 #ifdef TRMMKERNEL
-       lxvw4x          vs0,    o0,     TBUFFER
+       xvmulsp         vs0,    vs33,   alpha_vr
 #else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs0,    vs0,    vs28
+       xvmaddasp       vs0,    vs33,   alpha_vr
 #endif
 
-
-
-
        stxvw4x         vs0,    o0,     T1
 
        add             T1,     T1,     LDC
@@ -5296,34 +3426,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-
-       stxvw4x         vs34,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
 #ifdef TRMMKERNEL
-       lxvw4x          vs0,    o0,     TBUFFER
+       xvmulsp         vs0,    vs34,   alpha_vr
 #else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs0,    vs0,    vs28
+       xvmaddasp       vs0,    vs34,   alpha_vr
 #endif
 
-
-
-
        stxvw4x         vs0,    o0,     T1
 
        add             T1,     T1,     LDC
@@ -5335,34 +3443,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-
-       stxvw4x         vs35,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
 #ifdef TRMMKERNEL
-       lxvw4x          vs0,    o0,     TBUFFER
+       xvmulsp         vs0,    vs35,   alpha_vr
 #else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs0,    vs0,    vs28
+       xvmaddasp       vs0,    vs35,   alpha_vr
 #endif
 
-
-
-
        stxvw4x         vs0,    o0,     T1
 
        add             T1,     T1,     LDC
@@ -5386,11 +3472,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              T1,     BO
 
        lxsspx          vs8,    o0,     T1
-       lxsspx          vs9,    o4,     T1
-       lxsspx          vs10,   o8,     T1
-       lxsspx          vs11,   o12,    T1
+       lxsspx          vs9,    o16,    T1
+       lxsspx          vs10,   o32,    T1
+       lxsspx          vs11,   o48,    T1
 
-       addi            BO,     BO,     16
+
+       addi            BO,     BO,     64
 
 .endm
 
@@ -5405,24 +3492,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              T1,     BO
 
        lxsspx          vs16,   o0,     T1
-       lxsspx          vs17,   o4,     T1
-       lxsspx          vs18,   o8,     T1
-       lxsspx          vs19,   o12,    T1
+       lxsspx          vs17,   o16,    T1
+       lxsspx          vs18,   o32,    T1
+       lxsspx          vs19,   o48,    T1
 
-       addi            BO,     BO,     16
+
+       addi            BO,     BO,     64
 
 
-       xsmulsp         vs32,   vs0,    vs8
-       xsmulsp         vs33,   vs1,    vs8
+       xsmuldp         vs32,   vs0,    vs8
+       xsmuldp         vs33,   vs1,    vs8
 
-       xsmulsp         vs34,   vs0,    vs9
-       xsmulsp         vs35,   vs1,    vs9
+       xsmuldp         vs34,   vs0,    vs9
+       xsmuldp         vs35,   vs1,    vs9
 
-       xsmulsp         vs36,   vs0,    vs10
-       xsmulsp         vs37,   vs1,    vs10
+       xsmuldp         vs36,   vs0,    vs10
+       xsmuldp         vs37,   vs1,    vs10
 
-       xsmulsp         vs38,   vs0,    vs11
-       xsmulsp         vs39,   vs1,    vs11
+       xsmuldp         vs38,   vs0,    vs11
+       xsmuldp         vs39,   vs1,    vs11
 
 
 .endm
@@ -5438,24 +3526,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              T1,     BO
 
        lxsspx          vs16,   o0,     T1
-       lxsspx          vs17,   o4,     T1
-       lxsspx          vs18,   o8,     T1
-       lxsspx          vs19,   o12,    T1
+       lxsspx          vs17,   o16,    T1
+       lxsspx          vs18,   o32,    T1
+       lxsspx          vs19,   o48,    T1
 
-       addi            BO,     BO,     16
+
+       addi            BO,     BO,     64
 
 
-       xsmaddasp       vs32,   vs0,    vs8
-       xsmaddasp       vs33,   vs1,    vs8
+       xsmaddadp       vs32,   vs0,    vs8
+       xsmaddadp       vs33,   vs1,    vs8
 
-       xsmaddasp       vs34,   vs0,    vs9
-       xsmaddasp       vs35,   vs1,    vs9
+       xsmaddadp       vs34,   vs0,    vs9
+       xsmaddadp       vs35,   vs1,    vs9
 
-       xsmaddasp       vs36,   vs0,    vs10
-       xsmaddasp       vs37,   vs1,    vs10
+       xsmaddadp       vs36,   vs0,    vs10
+       xsmaddadp       vs37,   vs1,    vs10
 
-       xsmaddasp       vs38,   vs0,    vs11
-       xsmaddasp       vs39,   vs1,    vs11
+       xsmaddadp       vs38,   vs0,    vs11
+       xsmaddadp       vs39,   vs1,    vs11
 
 
 .endm
@@ -5471,24 +3560,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              T1,     BO
 
        lxsspx          vs8,    o0,     T1
-       lxsspx          vs9,    o4,     T1
-       lxsspx          vs10,   o8,     T1
-       lxsspx          vs11,   o12,    T1
+       lxsspx          vs9,    o16,    T1
+       lxsspx          vs10,   o32,    T1
+       lxsspx          vs11,   o48,    T1
 
-       addi            BO,     BO,     16
+
+       addi            BO,     BO,     64
 
 
-       xsmaddasp       vs32,   vs4,    vs16
-       xsmaddasp       vs33,   vs5,    vs16
+       xsmaddadp       vs32,   vs4,    vs16
+       xsmaddadp       vs33,   vs5,    vs16
 
-       xsmaddasp       vs34,   vs4,    vs17
-       xsmaddasp       vs35,   vs5,    vs17
+       xsmaddadp       vs34,   vs4,    vs17
+       xsmaddadp       vs35,   vs5,    vs17
 
-       xsmaddasp       vs36,   vs4,    vs18
-       xsmaddasp       vs37,   vs5,    vs18
+       xsmaddadp       vs36,   vs4,    vs18
+       xsmaddadp       vs37,   vs5,    vs18
 
-       xsmaddasp       vs38,   vs4,    vs19
-       xsmaddasp       vs39,   vs5,    vs19
+       xsmaddadp       vs38,   vs4,    vs19
+       xsmaddadp       vs39,   vs5,    vs19
 
 
 .endm
@@ -5496,17 +3586,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro KERNEL4x2_E2
 
 
-       xsmaddasp       vs32,   vs4,    vs16
-       xsmaddasp       vs33,   vs5,    vs16
+       xsmaddadp       vs32,   vs4,    vs16
+       xsmaddadp       vs33,   vs5,    vs16
 
-       xsmaddasp       vs34,   vs4,    vs17
-       xsmaddasp       vs35,   vs5,    vs17
+       xsmaddadp       vs34,   vs4,    vs17
+       xsmaddadp       vs35,   vs5,    vs17
 
-       xsmaddasp       vs36,   vs4,    vs18
-       xsmaddasp       vs37,   vs5,    vs18
+       xsmaddadp       vs36,   vs4,    vs18
+       xsmaddadp       vs37,   vs5,    vs18
 
-       xsmaddasp       vs38,   vs4,    vs19
-       xsmaddasp       vs39,   vs5,    vs19
+       xsmaddadp       vs38,   vs4,    vs19
+       xsmaddadp       vs39,   vs5,    vs19
 
 
 .endm
@@ -5522,24 +3612,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              T1,     BO
 
        lxsspx          vs8,    o0,     T1
-       lxsspx          vs9,    o4,     T1
-       lxsspx          vs10,   o8,     T1
-       lxsspx          vs11,   o12,    T1
+       lxsspx          vs9,    o16,    T1
+       lxsspx          vs10,   o32,    T1
+       lxsspx          vs11,   o48,    T1
 
-       addi            BO,     BO,     16
+
+       addi            BO,     BO,     64
 
 
-       xsmulsp         vs32,   vs0,    vs8
-       xsmulsp         vs33,   vs1,    vs8
+       xsmuldp         vs32,   vs0,    vs8
+       xsmuldp         vs33,   vs1,    vs8
 
-       xsmulsp         vs34,   vs0,    vs9
-       xsmulsp         vs35,   vs1,    vs9
+       xsmuldp         vs34,   vs0,    vs9
+       xsmuldp         vs35,   vs1,    vs9
 
-       xsmulsp         vs36,   vs0,    vs10
-       xsmulsp         vs37,   vs1,    vs10
+       xsmuldp         vs36,   vs0,    vs10
+       xsmuldp         vs37,   vs1,    vs10
 
-       xsmulsp         vs38,   vs0,    vs11
-       xsmulsp         vs39,   vs1,    vs11
+       xsmuldp         vs38,   vs0,    vs11
+       xsmuldp         vs39,   vs1,    vs11
 
 
 .endm
@@ -5555,24 +3646,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              T1,     BO
 
        lxsspx          vs8,    o0,     T1
-       lxsspx          vs9,    o4,     T1
-       lxsspx          vs10,   o8,     T1
-       lxsspx          vs11,   o12,    T1
+       lxsspx          vs9,    o16,    T1
+       lxsspx          vs10,   o32,    T1
+       lxsspx          vs11,   o48,    T1
 
-       addi            BO,     BO,     16
+
+       addi            BO,     BO,     64
 
 
-       xsmaddasp       vs32,   vs0,    vs8
-       xsmaddasp       vs33,   vs1,    vs8
+       xsmaddadp       vs32,   vs0,    vs8
+       xsmaddadp       vs33,   vs1,    vs8
 
-       xsmaddasp       vs34,   vs0,    vs9
-       xsmaddasp       vs35,   vs1,    vs9
+       xsmaddadp       vs34,   vs0,    vs9
+       xsmaddadp       vs35,   vs1,    vs9
 
-       xsmaddasp       vs36,   vs0,    vs10
-       xsmaddasp       vs37,   vs1,    vs10
+       xsmaddadp       vs36,   vs0,    vs10
+       xsmaddadp       vs37,   vs1,    vs10
 
-       xsmaddasp       vs38,   vs0,    vs11
-       xsmaddasp       vs39,   vs1,    vs11
+       xsmaddadp       vs38,   vs0,    vs11
+       xsmaddadp       vs39,   vs1,    vs11
 
 
 .endm
@@ -5589,17 +3681,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 #ifdef TRMMKERNEL
-
-       xsmulsp         vs0,    vs32,   alpha_r
-       xsmulsp         vs1,    vs33,   alpha_r
-
+       xsmuldp         vs0,    vs32,   alpha_r
+       xsmuldp         vs1,    vs33,   alpha_r
 #else
-
-       xsmulsp         vs28,   vs32,   alpha_r
-       xsaddsp         vs0,    vs0,    vs28
-       xsmulsp         vs28,   vs33,   alpha_r
-       xsaddsp         vs1,    vs1,    vs28
-
+       xsmaddadp       vs0,    vs32,   alpha_r
+       xsmaddadp       vs1,    vs33,   alpha_r
 #endif
 
        stxsspx         vs0,    o0,     T1
@@ -5616,17 +3702,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 #ifdef TRMMKERNEL
-
-       xsmulsp         vs0,    vs34,   alpha_r
-       xsmulsp         vs1,    vs35,   alpha_r
-
+       xsmuldp         vs0,    vs34,   alpha_r
+       xsmuldp         vs1,    vs35,   alpha_r
 #else
-
-       xsmulsp         vs28,   vs34,   alpha_r
-       xsaddsp         vs0,    vs0,    vs28
-       xsmulsp         vs28,   vs35,   alpha_r
-       xsaddsp         vs1,    vs1,    vs28
-
+       xsmaddadp       vs0,    vs34,   alpha_r
+       xsmaddadp       vs1,    vs35,   alpha_r
 #endif
 
        stxsspx         vs0,    o0,     T1
@@ -5643,17 +3723,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 #ifdef TRMMKERNEL
-
-       xsmulsp         vs0,    vs36,   alpha_r
-       xsmulsp         vs1,    vs37,   alpha_r
-
+       xsmuldp         vs0,    vs36,   alpha_r
+       xsmuldp         vs1,    vs37,   alpha_r
 #else
-
-       xsmulsp         vs28,   vs36,   alpha_r
-       xsaddsp         vs0,    vs0,    vs28
-       xsmulsp         vs28,   vs37,   alpha_r
-       xsaddsp         vs1,    vs1,    vs28
-
+       xsmaddadp       vs0,    vs36,   alpha_r
+       xsmaddadp       vs1,    vs37,   alpha_r
 #endif
 
        stxsspx         vs0,    o0,     T1
@@ -5670,17 +3744,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 #ifdef TRMMKERNEL
-
-       xsmulsp         vs0,    vs38,   alpha_r
-       xsmulsp         vs1,    vs39,   alpha_r
-
+       xsmuldp         vs0,    vs38,   alpha_r
+       xsmuldp         vs1,    vs39,   alpha_r
 #else
-
-       xsmulsp         vs28,   vs38,   alpha_r
-       xsaddsp         vs0,    vs0,    vs28
-       xsmulsp         vs28,   vs39,   alpha_r
-       xsaddsp         vs1,    vs1,    vs28
-
+       xsmaddadp       vs0,    vs38,   alpha_r
+       xsmaddadp       vs1,    vs39,   alpha_r
 #endif
 
        stxsspx         vs0,    o0,     T1
@@ -5706,11 +3774,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              T1,     BO
 
        lxsspx          vs8,    o0,     T1
-       lxsspx          vs9,    o4,     T1
-       lxsspx          vs10,   o8,     T1
-       lxsspx          vs11,   o12,    T1
+       lxsspx          vs9,    o16,    T1
+       lxsspx          vs10,   o32,    T1
+       lxsspx          vs11,   o48,    T1
 
-       addi            BO,     BO,     16
+
+       addi            BO,     BO,     64
 
 .endm
 
@@ -5724,20 +3793,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              T1,     BO
 
        lxsspx          vs16,   o0,     T1
-       lxsspx          vs17,   o4,     T1
-       lxsspx          vs18,   o8,     T1
-       lxsspx          vs19,   o12,    T1
+       lxsspx          vs17,   o16,    T1
+       lxsspx          vs18,   o32,    T1
+       lxsspx          vs19,   o48,    T1
 
-       addi            BO,     BO,     16
+
+       addi            BO,     BO,     64
 
 
-       xsmulsp         vs32,   vs0,    vs8
+       xsmuldp         vs32,   vs0,    vs8
 
-       xsmulsp         vs33,   vs0,    vs9
+       xsmuldp         vs33,   vs0,    vs9
 
-       xsmulsp         vs34,   vs0,    vs10
+       xsmuldp         vs34,   vs0,    vs10
 
-       xsmulsp         vs35,   vs0,    vs11
+       xsmuldp         vs35,   vs0,    vs11
 
 
 .endm
@@ -5752,20 +3822,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              T1,     BO
 
        lxsspx          vs16,   o0,     T1
-       lxsspx          vs17,   o4,     T1
-       lxsspx          vs18,   o8,     T1
-       lxsspx          vs19,   o12,    T1
+       lxsspx          vs17,   o16,    T1
+       lxsspx          vs18,   o32,    T1
+       lxsspx          vs19,   o48,    T1
 
-       addi            BO,     BO,     16
+
+       addi            BO,     BO,     64
 
 
-       xsmaddasp       vs32,   vs0,    vs8
+       xsmaddadp       vs32,   vs0,    vs8
 
-       xsmaddasp       vs33,   vs0,    vs9
+       xsmaddadp       vs33,   vs0,    vs9
 
-       xsmaddasp       vs34,   vs0,    vs10
+       xsmaddadp       vs34,   vs0,    vs10
 
-       xsmaddasp       vs35,   vs0,    vs11
+       xsmaddadp       vs35,   vs0,    vs11
 
 
 .endm
@@ -5780,20 +3851,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              T1,     BO
 
        lxsspx          vs8,    o0,     T1
-       lxsspx          vs9,    o4,     T1
-       lxsspx          vs10,   o8,     T1
-       lxsspx          vs11,   o12,    T1
+       lxsspx          vs9,    o16,    T1
+       lxsspx          vs10,   o32,    T1
+       lxsspx          vs11,   o48,    T1
 
-       addi            BO,     BO,     16
+
+       addi            BO,     BO,     64
 
 
-       xsmaddasp       vs32,   vs4,    vs16
+       xsmaddadp       vs32,   vs4,    vs16
 
-       xsmaddasp       vs33,   vs4,    vs17
+       xsmaddadp       vs33,   vs4,    vs17
 
-       xsmaddasp       vs34,   vs4,    vs18
+       xsmaddadp       vs34,   vs4,    vs18
 
-       xsmaddasp       vs35,   vs4,    vs19
+       xsmaddadp       vs35,   vs4,    vs19
 
 
 .endm
@@ -5801,13 +3873,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro KERNEL4x1_E2
 
 
-       xsmaddasp       vs32,   vs4,    vs16
+       xsmaddadp       vs32,   vs4,    vs16
 
-       xsmaddasp       vs33,   vs4,    vs17
+       xsmaddadp       vs33,   vs4,    vs17
 
-       xsmaddasp       vs34,   vs4,    vs18
+       xsmaddadp       vs34,   vs4,    vs18
 
-       xsmaddasp       vs35,   vs4,    vs19
+       xsmaddadp       vs35,   vs4,    vs19
 
 
 .endm
@@ -5822,20 +3894,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              T1,     BO
 
        lxsspx          vs8,    o0,     T1
-       lxsspx          vs9,    o4,     T1
-       lxsspx          vs10,   o8,     T1
-       lxsspx          vs11,   o12,    T1
+       lxsspx          vs9,    o16,    T1
+       lxsspx          vs10,   o32,    T1
+       lxsspx          vs11,   o48,    T1
 
-       addi            BO,     BO,     16
+
+       addi            BO,     BO,     64
 
 
-       xsmulsp         vs32,   vs0,    vs8
+       xsmuldp         vs32,   vs0,    vs8
 
-       xsmulsp         vs33,   vs0,    vs9
+       xsmuldp         vs33,   vs0,    vs9
 
-       xsmulsp         vs34,   vs0,    vs10
+       xsmuldp         vs34,   vs0,    vs10
 
-       xsmulsp         vs35,   vs0,    vs11
+       xsmuldp         vs35,   vs0,    vs11
 
 
 .endm
@@ -5850,20 +3923,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              T1,     BO
 
        lxsspx          vs8,    o0,     T1
-       lxsspx          vs9,    o4,     T1
-       lxsspx          vs10,   o8,     T1
-       lxsspx          vs11,   o12,    T1
+       lxsspx          vs9,    o16,    T1
+       lxsspx          vs10,   o32,    T1
+       lxsspx          vs11,   o48,    T1
 
-       addi            BO,     BO,     16
+
+       addi            BO,     BO,     64
 
 
-       xsmaddasp       vs32,   vs0,    vs8
+       xsmaddadp       vs32,   vs0,    vs8
 
-       xsmaddasp       vs33,   vs0,    vs9
+       xsmaddadp       vs33,   vs0,    vs9
 
-       xsmaddasp       vs34,   vs0,    vs10
+       xsmaddadp       vs34,   vs0,    vs10
 
-       xsmaddasp       vs35,   vs0,    vs11
+       xsmaddadp       vs35,   vs0,    vs11
 
 
 .endm
@@ -5879,14 +3953,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 #ifdef TRMMKERNEL
-
-       xsmulsp         vs0,    vs32,   alpha_r
-
+       xsmuldp         vs0,    vs32,   alpha_r
 #else
-
-       xsmulsp         vs28,   vs32,   alpha_r
-       xsaddsp         vs0,    vs0,    vs28
-
+       xsmaddadp       vs0,    vs32,   alpha_r
 #endif
 
        stxsspx         vs0,    o0,     T1
@@ -5901,14 +3970,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 #ifdef TRMMKERNEL
-
-       xsmulsp         vs0,    vs33,   alpha_r
-
+       xsmuldp         vs0,    vs33,   alpha_r
 #else
-
-       xsmulsp         vs28,   vs33,   alpha_r
-       xsaddsp         vs0,    vs0,    vs28
-
+       xsmaddadp       vs0,    vs33,   alpha_r
 #endif
 
        stxsspx         vs0,    o0,     T1
@@ -5923,14 +3987,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 #ifdef TRMMKERNEL
-
-       xsmulsp         vs0,    vs34,   alpha_r
-
+       xsmuldp         vs0,    vs34,   alpha_r
 #else
-
-       xsmulsp         vs28,   vs34,   alpha_r
-       xsaddsp         vs0,    vs0,    vs28
-
+       xsmaddadp       vs0,    vs34,   alpha_r
 #endif
 
        stxsspx         vs0,    o0,     T1
@@ -5945,14 +4004,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 #ifdef TRMMKERNEL
-
-       xsmulsp         vs0,    vs35,   alpha_r
-
+       xsmuldp         vs0,    vs35,   alpha_r
 #else
-
-       xsmulsp         vs28,   vs35,   alpha_r
-       xsaddsp         vs0,    vs0,    vs28
-
+       xsmaddadp       vs0,    vs35,   alpha_r
 #endif
 
        stxsspx         vs0,    o0,     T1
@@ -5977,12 +4031,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     64
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs8,    vs28,   0
-       xxspltw         vs9,    vs28,   1
+       lxvw4x          vs8,    o0,     T1
+       lxvw4x          vs9,    o16,    T1
 
-       addi            BO,     BO,     8
+       addi            BO,     BO,     32
 
 .endm
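
The B-side loads change shape as well: the old code fetched one packed word group from BO and broadcast its lanes with xxspltw, while the new code reads whole 16-byte vectors at o0/o16 (and o32/o48 in the wider kernels) and advances BO four times as far, which implies B has been pre-expanded so that each element already fills all four lanes of a vector. A plain-C sketch of that pre-broadcast layout (an inference from the new offsets, not code taken from the commit):

    #include <stddef.h>

    /* Replicate each packed B element into all four float lanes of a 16-byte
     * slot, so the kernel can lxvw4x it directly instead of xxspltw-ing. */
    static void expand_b_panel(const float *b_packed, float *b_bcast, size_t n)
    {
        for (size_t k = 0; k < n; k++)
            for (int lane = 0; lane < 4; lane++)
                b_bcast[4 * k + lane] = b_packed[k];
    }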
 
@@ -5996,12 +4050,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     64
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs16,   vs28,   0
-       xxspltw         vs17,   vs28,   1
+       lxvw4x          vs16,   o0,     T1
+       lxvw4x          vs17,   o16,    T1
 
-       addi            BO,     BO,     8
+       addi            BO,     BO,     32
 
 
        xvmulsp         vs32,   vs0,    vs8
@@ -6027,12 +4081,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     64
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs16,   vs28,   0
-       xxspltw         vs17,   vs28,   1
+       lxvw4x          vs16,   o0,     T1
+       lxvw4x          vs17,   o16,    T1
 
-       addi            BO,     BO,     8
+       addi            BO,     BO,     32
 
 
        xvmaddasp       vs32,   vs0,    vs8
@@ -6058,12 +4112,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     64
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs8,    vs28,   0
-       xxspltw         vs9,    vs28,   1
+       lxvw4x          vs8,    o0,     T1
+       lxvw4x          vs9,    o16,    T1
 
-       addi            BO,     BO,     8
+       addi            BO,     BO,     32
 
 
        xvmaddasp       vs32,   vs4,    vs16
@@ -6105,12 +4159,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     64
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs8,    vs28,   0
-       xxspltw         vs9,    vs28,   1
+       lxvw4x          vs8,    o0,     T1
+       lxvw4x          vs9,    o16,    T1
 
-       addi            BO,     BO,     8
+       addi            BO,     BO,     32
 
 
        xvmulsp         vs32,   vs0,    vs8
@@ -6136,12 +4190,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     64
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs8,    vs28,   0
-       xxspltw         vs9,    vs28,   1
+       lxvw4x          vs8,    o0,     T1
+       lxvw4x          vs9,    o16,    T1
 
-       addi            BO,     BO,     8
+       addi            BO,     BO,     32
 
 
        xvmaddasp       vs32,   vs0,    vs8
@@ -6170,223 +4224,47 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-
-       stxvw4x         vs32,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
-#ifdef TRMMKERNEL
-       lxvw4x          vs0,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs0,    vs0,    vs28
-#endif
-
-       stxvw4x         vs33,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
-#ifdef TRMMKERNEL
-       lxvw4x          vs1,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs1,    vs1,    vs28
-#endif
-
-       stxvw4x         vs34,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
-#ifdef TRMMKERNEL
-       lxvw4x          vs2,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs2,    vs2,    vs28
-#endif
-
-       stxvw4x         vs35,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
-#ifdef TRMMKERNEL
-       lxvw4x          vs3,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs3,    vs3,    vs28
-#endif
-
-
-
-
-       stxvw4x         vs0,    o0,     T1
-       stxvw4x         vs1,    o16,    T1
-       stxvw4x         vs2,    o32,    T1
-       stxvw4x         vs3,    o48,    T1
-
-       add             T1,     T1,     LDC
-
-
-#ifndef TRMMKERNEL
-
-       lxvw4x          vs0,    o0,     T1
-       lxvw4x          vs1,    o16,    T1
-       lxvw4x          vs2,    o32,    T1
-       lxvw4x          vs3,    o48,    T1
-
-#endif
-
-
-       stxvw4x         vs36,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
-#ifdef TRMMKERNEL
-       lxvw4x          vs0,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs0,    vs0,    vs28
-#endif
-
-       stxvw4x         vs37,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
 #ifdef TRMMKERNEL
-       lxvw4x          vs1,    o0,     TBUFFER
+       xvmulsp         vs0,    vs32,   alpha_vr
+       xvmulsp         vs1,    vs33,   alpha_vr
+       xvmulsp         vs2,    vs34,   alpha_vr
+       xvmulsp         vs3,    vs35,   alpha_vr
 #else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs1,    vs1,    vs28
+       xvmaddasp       vs0,    vs32,   alpha_vr
+       xvmaddasp       vs1,    vs33,   alpha_vr
+       xvmaddasp       vs2,    vs34,   alpha_vr
+       xvmaddasp       vs3,    vs35,   alpha_vr
 #endif
 
-       stxvw4x         vs38,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+       stxvw4x         vs2,    o32,    T1
+       stxvw4x         vs3,    o48,    T1
 
-#ifdef TRMMKERNEL
-       lxvw4x          vs2,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs2,    vs2,    vs28
-#endif
+       add             T1,     T1,     LDC
 
-       stxvw4x         vs39,   o0,     TBUFFER
 
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
+#ifndef TRMMKERNEL
 
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+       lxvw4x          vs2,    o32,    T1
+       lxvw4x          vs3,    o48,    T1
 
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
+#endif
 
 #ifdef TRMMKERNEL
-       lxvw4x          vs3,    o0,     TBUFFER
+       xvmulsp         vs0,    vs36,   alpha_vr
+       xvmulsp         vs1,    vs37,   alpha_vr
+       xvmulsp         vs2,    vs38,   alpha_vr
+       xvmulsp         vs3,    vs39,   alpha_vr
 #else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs3,    vs3,    vs28
+       xvmaddasp       vs0,    vs36,   alpha_vr
+       xvmaddasp       vs1,    vs37,   alpha_vr
+       xvmaddasp       vs2,    vs38,   alpha_vr
+       xvmaddasp       vs3,    vs39,   alpha_vr
 #endif
 
-
-
-
        stxvw4x         vs0,    o0,     T1
        stxvw4x         vs1,    o16,    T1
        stxvw4x         vs2,    o32,    T1
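
The SAVE paths account for most of the deleted bulk: instead of spilling each accumulator vector to TBUFFER, rescaling it four scalars at a time and reloading it, the new code scales the whole vector with one xvmulsp (TRMMKERNEL) or folds it into C with one xvmaddasp against alpha_vr, the broadcast copy of alpha. A plain-C stand-in for one 4-lane store step (lane count and names are illustrative):

    /* Scale a 4-lane accumulator by alpha and merge it into C in one pass,
     * the way xvmulsp / xvmaddasp with alpha_vr replaces the old TBUFFER
     * round trip.  Illustrative only. */
    static void save_lanes4(float c[4], const float acc[4], float alpha, int trmm)
    {
        for (int lane = 0; lane < 4; lane++)
            c[lane] = trmm ? alpha * acc[lane]              /* TRMMKERNEL path      */
                           : c[lane] + alpha * acc[lane];   /* GEMM: C += alpha*acc */
    }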
@@ -6410,12 +4288,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     32
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs8,    vs28,   0
-       xxspltw         vs9,    vs28,   1
+       lxvw4x          vs8,    o0,     T1
+       lxvw4x          vs9,    o16,    T1
 
-       addi            BO,     BO,     8
+       addi            BO,     BO,     32
 
 .endm
 
@@ -6427,12 +4305,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     32
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs16,   vs28,   0
-       xxspltw         vs17,   vs28,   1
+       lxvw4x          vs16,   o0,     T1
+       lxvw4x          vs17,   o16,    T1
 
-       addi            BO,     BO,     8
+       addi            BO,     BO,     32
 
 
        xvmulsp         vs32,   vs0,    vs8
@@ -6452,12 +4330,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     32
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs16,   vs28,   0
-       xxspltw         vs17,   vs28,   1
+       lxvw4x          vs16,   o0,     T1
+       lxvw4x          vs17,   o16,    T1
 
-       addi            BO,     BO,     8
+       addi            BO,     BO,     32
 
 
        xvmaddasp       vs32,   vs0,    vs8
@@ -6477,12 +4355,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     32
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs8,    vs28,   0
-       xxspltw         vs9,    vs28,   1
+       lxvw4x          vs8,    o0,     T1
+       lxvw4x          vs9,    o16,    T1
 
-       addi            BO,     BO,     8
+       addi            BO,     BO,     32
 
 
        xvmaddasp       vs32,   vs4,    vs16
@@ -6514,12 +4392,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     32
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs8,    vs28,   0
-       xxspltw         vs9,    vs28,   1
+       lxvw4x          vs8,    o0,     T1
+       lxvw4x          vs9,    o16,    T1
 
-       addi            BO,     BO,     8
+       addi            BO,     BO,     32
 
 
        xvmulsp         vs32,   vs0,    vs8
@@ -6539,12 +4417,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     32
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs8,    vs28,   0
-       xxspltw         vs9,    vs28,   1
+       lxvw4x          vs8,    o0,     T1
+       lxvw4x          vs9,    o16,    T1
 
-       addi            BO,     BO,     8
+       addi            BO,     BO,     32
 
 
        xvmaddasp       vs32,   vs0,    vs8
@@ -6567,58 +4445,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-
-       stxvw4x         vs32,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
-#ifdef TRMMKERNEL
-       lxvw4x          vs0,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs0,    vs0,    vs28
-#endif
-
-       stxvw4x         vs33,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
 #ifdef TRMMKERNEL
-       lxvw4x          vs1,    o0,     TBUFFER
+       xvmulsp         vs0,    vs32,   alpha_vr
+       xvmulsp         vs1,    vs33,   alpha_vr
 #else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs1,    vs1,    vs28
+       xvmaddasp       vs0,    vs32,   alpha_vr
+       xvmaddasp       vs1,    vs33,   alpha_vr
 #endif
 
-
-
-
        stxvw4x         vs0,    o0,     T1
        stxvw4x         vs1,    o16,    T1
 
@@ -6632,58 +4466,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-
-       stxvw4x         vs34,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
-#ifdef TRMMKERNEL
-       lxvw4x          vs0,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs0,    vs0,    vs28
-#endif
-
-       stxvw4x         vs35,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
 #ifdef TRMMKERNEL
-       lxvw4x          vs1,    o0,     TBUFFER
+       xvmulsp         vs0,    vs34,   alpha_vr
+       xvmulsp         vs1,    vs35,   alpha_vr
 #else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs1,    vs1,    vs28
+       xvmaddasp       vs0,    vs34,   alpha_vr
+       xvmaddasp       vs1,    vs35,   alpha_vr
 #endif
 
-
-
-
        stxvw4x         vs0,    o0,     T1
        stxvw4x         vs1,    o16,    T1
 
@@ -6704,12 +4494,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     16
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs8,    vs28,   0
-       xxspltw         vs9,    vs28,   1
+       lxvw4x          vs8,    o0,     T1
+       lxvw4x          vs9,    o16,    T1
 
-       addi            BO,     BO,     8
+       addi            BO,     BO,     32
 
 .endm
 
@@ -6720,12 +4510,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     16
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs16,   vs28,   0
-       xxspltw         vs17,   vs28,   1
+       lxvw4x          vs16,   o0,     T1
+       lxvw4x          vs17,   o16,    T1
 
-       addi            BO,     BO,     8
+       addi            BO,     BO,     32
 
 
        xvmulsp         vs32,   vs0,    vs8
@@ -6742,12 +4532,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     16
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs16,   vs28,   0
-       xxspltw         vs17,   vs28,   1
+       lxvw4x          vs16,   o0,     T1
+       lxvw4x          vs17,   o16,    T1
 
-       addi            BO,     BO,     8
+       addi            BO,     BO,     32
 
 
        xvmaddasp       vs32,   vs0,    vs8
@@ -6764,12 +4554,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     16
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs8,    vs28,   0
-       xxspltw         vs9,    vs28,   1
+       lxvw4x          vs8,    o0,     T1
+       lxvw4x          vs9,    o16,    T1
 
-       addi            BO,     BO,     8
+       addi            BO,     BO,     32
 
 
        xvmaddasp       vs32,   vs4,    vs16
@@ -6796,12 +4586,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     16
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs8,    vs28,   0
-       xxspltw         vs9,    vs28,   1
+       lxvw4x          vs8,    o0,     T1
+       lxvw4x          vs9,    o16,    T1
 
-       addi            BO,     BO,     8
+       addi            BO,     BO,     32
 
 
        xvmulsp         vs32,   vs0,    vs8
@@ -6818,12 +4608,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     16
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs8,    vs28,   0
-       xxspltw         vs9,    vs28,   1
+       lxvw4x          vs8,    o0,     T1
+       lxvw4x          vs9,    o16,    T1
 
-       addi            BO,     BO,     8
+       addi            BO,     BO,     32
 
 
        xvmaddasp       vs32,   vs0,    vs8
@@ -6843,34 +4633,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-
-       stxvw4x         vs32,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
 #ifdef TRMMKERNEL
-       lxvw4x          vs0,    o0,     TBUFFER
+       xvmulsp         vs0,    vs32,   alpha_vr
 #else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs0,    vs0,    vs28
+       xvmaddasp       vs0,    vs32,   alpha_vr
 #endif
 
-
-
-
        stxvw4x         vs0,    o0,     T1
 
        add             T1,     T1,     LDC
@@ -6882,34 +4650,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-
-       stxvw4x         vs33,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
 #ifdef TRMMKERNEL
-       lxvw4x          vs0,    o0,     TBUFFER
+       xvmulsp         vs0,    vs33,   alpha_vr
 #else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs0,    vs0,    vs28
+       xvmaddasp       vs0,    vs33,   alpha_vr
 #endif
 
-
-
-
        stxvw4x         vs0,    o0,     T1
 
        add             T1,     T1,     LDC
@@ -6933,9 +4679,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              T1,     BO
 
        lxsspx          vs8,    o0,     T1
-       lxsspx          vs9,    o4,     T1
+       lxsspx          vs9,    o16,    T1
 
-       addi            BO,     BO,     8
+
+       addi            BO,     BO,     32
 
 .endm
 
@@ -6950,16 +4697,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              T1,     BO
 
        lxsspx          vs16,   o0,     T1
-       lxsspx          vs17,   o4,     T1
+       lxsspx          vs17,   o16,    T1
 
-       addi            BO,     BO,     8
+
+       addi            BO,     BO,     32
 
 
-       xsmulsp         vs32,   vs0,    vs8
-       xsmulsp         vs33,   vs1,    vs8
+       xsmuldp         vs32,   vs0,    vs8
+       xsmuldp         vs33,   vs1,    vs8
 
-       xsmulsp         vs34,   vs0,    vs9
-       xsmulsp         vs35,   vs1,    vs9
+       xsmuldp         vs34,   vs0,    vs9
+       xsmuldp         vs35,   vs1,    vs9
 
 
 .endm
@@ -6975,16 +4723,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              T1,     BO
 
        lxsspx          vs16,   o0,     T1
-       lxsspx          vs17,   o4,     T1
+       lxsspx          vs17,   o16,    T1
 
-       addi            BO,     BO,     8
+
+       addi            BO,     BO,     32
 
 
-       xsmaddasp       vs32,   vs0,    vs8
-       xsmaddasp       vs33,   vs1,    vs8
+       xsmaddadp       vs32,   vs0,    vs8
+       xsmaddadp       vs33,   vs1,    vs8
 
-       xsmaddasp       vs34,   vs0,    vs9
-       xsmaddasp       vs35,   vs1,    vs9
+       xsmaddadp       vs34,   vs0,    vs9
+       xsmaddadp       vs35,   vs1,    vs9
 
 
 .endm
@@ -7000,16 +4749,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              T1,     BO
 
        lxsspx          vs8,    o0,     T1
-       lxsspx          vs9,    o4,     T1
+       lxsspx          vs9,    o16,    T1
 
-       addi            BO,     BO,     8
+
+       addi            BO,     BO,     32
 
 
-       xsmaddasp       vs32,   vs4,    vs16
-       xsmaddasp       vs33,   vs5,    vs16
+       xsmaddadp       vs32,   vs4,    vs16
+       xsmaddadp       vs33,   vs5,    vs16
 
-       xsmaddasp       vs34,   vs4,    vs17
-       xsmaddasp       vs35,   vs5,    vs17
+       xsmaddadp       vs34,   vs4,    vs17
+       xsmaddadp       vs35,   vs5,    vs17
 
 
 .endm
@@ -7017,11 +4767,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro KERNEL2x2_E2
 
 
-       xsmaddasp       vs32,   vs4,    vs16
-       xsmaddasp       vs33,   vs5,    vs16
+       xsmaddadp       vs32,   vs4,    vs16
+       xsmaddadp       vs33,   vs5,    vs16
 
-       xsmaddasp       vs34,   vs4,    vs17
-       xsmaddasp       vs35,   vs5,    vs17
+       xsmaddadp       vs34,   vs4,    vs17
+       xsmaddadp       vs35,   vs5,    vs17
 
 
 .endm
@@ -7037,16 +4787,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              T1,     BO
 
        lxsspx          vs8,    o0,     T1
-       lxsspx          vs9,    o4,     T1
+       lxsspx          vs9,    o16,    T1
 
-       addi            BO,     BO,     8
+
+       addi            BO,     BO,     32
 
 
-       xsmulsp         vs32,   vs0,    vs8
-       xsmulsp         vs33,   vs1,    vs8
+       xsmuldp         vs32,   vs0,    vs8
+       xsmuldp         vs33,   vs1,    vs8
 
-       xsmulsp         vs34,   vs0,    vs9
-       xsmulsp         vs35,   vs1,    vs9
+       xsmuldp         vs34,   vs0,    vs9
+       xsmuldp         vs35,   vs1,    vs9
 
 
 .endm
@@ -7062,16 +4813,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              T1,     BO
 
        lxsspx          vs8,    o0,     T1
-       lxsspx          vs9,    o4,     T1
+       lxsspx          vs9,    o16,    T1
 
-       addi            BO,     BO,     8
+
+       addi            BO,     BO,     32
 
 
-       xsmaddasp       vs32,   vs0,    vs8
-       xsmaddasp       vs33,   vs1,    vs8
+       xsmaddadp       vs32,   vs0,    vs8
+       xsmaddadp       vs33,   vs1,    vs8
 
-       xsmaddasp       vs34,   vs0,    vs9
-       xsmaddasp       vs35,   vs1,    vs9
+       xsmaddadp       vs34,   vs0,    vs9
+       xsmaddadp       vs35,   vs1,    vs9
 
 
 .endm
@@ -7088,17 +4840,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 #ifdef TRMMKERNEL
-
-       xsmulsp         vs0,    vs32,   alpha_r
-       xsmulsp         vs1,    vs33,   alpha_r
-
+       xsmuldp         vs0,    vs32,   alpha_r
+       xsmuldp         vs1,    vs33,   alpha_r
 #else
-
-       xsmulsp         vs28,   vs32,   alpha_r
-       xsaddsp         vs0,    vs0,    vs28
-       xsmulsp         vs28,   vs33,   alpha_r
-       xsaddsp         vs1,    vs1,    vs28
-
+       xsmaddadp       vs0,    vs32,   alpha_r
+       xsmaddadp       vs1,    vs33,   alpha_r
 #endif
 
        stxsspx         vs0,    o0,     T1
@@ -7115,17 +4861,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 #ifdef TRMMKERNEL
-
-       xsmulsp         vs0,    vs34,   alpha_r
-       xsmulsp         vs1,    vs35,   alpha_r
-
+       xsmuldp         vs0,    vs34,   alpha_r
+       xsmuldp         vs1,    vs35,   alpha_r
 #else
-
-       xsmulsp         vs28,   vs34,   alpha_r
-       xsaddsp         vs0,    vs0,    vs28
-       xsmulsp         vs28,   vs35,   alpha_r
-       xsaddsp         vs1,    vs1,    vs28
-
+       xsmaddadp       vs0,    vs34,   alpha_r
+       xsmaddadp       vs1,    vs35,   alpha_r
 #endif
 
        stxsspx         vs0,    o0,     T1
@@ -7151,9 +4891,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              T1,     BO
 
        lxsspx          vs8,    o0,     T1
-       lxsspx          vs9,    o4,     T1
+       lxsspx          vs9,    o16,    T1
 
-       addi            BO,     BO,     8
+
+       addi            BO,     BO,     32
 
 .endm
 
@@ -7167,14 +4908,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              T1,     BO
 
        lxsspx          vs16,   o0,     T1
-       lxsspx          vs17,   o4,     T1
+       lxsspx          vs17,   o16,    T1
 
-       addi            BO,     BO,     8
+
+       addi            BO,     BO,     32
 
 
-       xsmulsp         vs32,   vs0,    vs8
+       xsmuldp         vs32,   vs0,    vs8
 
-       xsmulsp         vs33,   vs0,    vs9
+       xsmuldp         vs33,   vs0,    vs9
 
 
 .endm
@@ -7189,14 +4931,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              T1,     BO
 
        lxsspx          vs16,   o0,     T1
-       lxsspx          vs17,   o4,     T1
+       lxsspx          vs17,   o16,    T1
 
-       addi            BO,     BO,     8
+
+       addi            BO,     BO,     32
 
 
-       xsmaddasp       vs32,   vs0,    vs8
+       xsmaddadp       vs32,   vs0,    vs8
 
-       xsmaddasp       vs33,   vs0,    vs9
+       xsmaddadp       vs33,   vs0,    vs9
 
 
 .endm
@@ -7211,14 +4954,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              T1,     BO
 
        lxsspx          vs8,    o0,     T1
-       lxsspx          vs9,    o4,     T1
+       lxsspx          vs9,    o16,    T1
 
-       addi            BO,     BO,     8
+
+       addi            BO,     BO,     32
 
 
-       xsmaddasp       vs32,   vs4,    vs16
+       xsmaddadp       vs32,   vs4,    vs16
 
-       xsmaddasp       vs33,   vs4,    vs17
+       xsmaddadp       vs33,   vs4,    vs17
 
 
 .endm
@@ -7226,9 +4970,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro KERNEL2x1_E2
 
 
-       xsmaddasp       vs32,   vs4,    vs16
+       xsmaddadp       vs32,   vs4,    vs16
 
-       xsmaddasp       vs33,   vs4,    vs17
+       xsmaddadp       vs33,   vs4,    vs17
 
 
 .endm
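
For the narrow scalar tails the same pattern holds end to end: the accumulator stays in double-precision register format across the whole k loop and is only rounded back to single precision at the stxsspx in the SAVE macro. An illustrative C reduction to a single dot product (not the kernel's code):

    /* lxsspx widens each float to double in the register, xsmaddadp
     * accumulates in double, and stxsspx rounds once at the final store. */
    static void sgemm_dot_1x1(float *c, const float *a, const float *b,
                              int k, float alpha)
    {
        double acc = 0.0;
        for (int i = 0; i < k; i++)
            acc += (double)a[i] * (double)b[i];             /* xsmaddadp steps      */
        *c = (float)((double)*c + (double)alpha * acc);     /* SAVE: C += alpha*acc */
    }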
@@ -7243,14 +4987,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              T1,     BO
 
        lxsspx          vs8,    o0,     T1
-       lxsspx          vs9,    o4,     T1
+       lxsspx          vs9,    o16,    T1
 
-       addi            BO,     BO,     8
+
+       addi            BO,     BO,     32
 
 
-       xsmulsp         vs32,   vs0,    vs8
+       xsmuldp         vs32,   vs0,    vs8
 
-       xsmulsp         vs33,   vs0,    vs9
+       xsmuldp         vs33,   vs0,    vs9
 
 
 .endm
@@ -7265,14 +5010,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              T1,     BO
 
        lxsspx          vs8,    o0,     T1
-       lxsspx          vs9,    o4,     T1
+       lxsspx          vs9,    o16,    T1
 
-       addi            BO,     BO,     8
+
+       addi            BO,     BO,     32
 
 
-       xsmaddasp       vs32,   vs0,    vs8
+       xsmaddadp       vs32,   vs0,    vs8
 
-       xsmaddasp       vs33,   vs0,    vs9
+       xsmaddadp       vs33,   vs0,    vs9
 
 
 .endm
@@ -7288,14 +5034,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 #ifdef TRMMKERNEL
-
-       xsmulsp         vs0,    vs32,   alpha_r
-
+       xsmuldp         vs0,    vs32,   alpha_r
 #else
-
-       xsmulsp         vs28,   vs32,   alpha_r
-       xsaddsp         vs0,    vs0,    vs28
-
+       xsmaddadp       vs0,    vs32,   alpha_r
 #endif
 
        stxsspx         vs0,    o0,     T1
@@ -7310,14 +5051,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 #ifdef TRMMKERNEL
-
-       xsmulsp         vs0,    vs33,   alpha_r
-
+       xsmuldp         vs0,    vs33,   alpha_r
 #else
-
-       xsmulsp         vs28,   vs33,   alpha_r
-       xsaddsp         vs0,    vs0,    vs28
-
+       xsmaddadp       vs0,    vs33,   alpha_r
 #endif
 
        stxsspx         vs0,    o0,     T1
@@ -7342,11 +5078,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     64
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs8,    vs28,   0
+       lxvw4x          vs8,    o0,     T1
 
-       addi            BO,     BO,     4
+       addi            BO,     BO,     16
 
 .endm
 
@@ -7360,11 +5096,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     64
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs16,   vs28,   0
+       lxvw4x          vs16,   o0,     T1
 
-       addi            BO,     BO,     4
+       addi            BO,     BO,     16
 
 
        xvmulsp         vs32,   vs0,    vs8
@@ -7385,11 +5121,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     64
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs16,   vs28,   0
+       lxvw4x          vs16,   o0,     T1
 
-       addi            BO,     BO,     4
+       addi            BO,     BO,     16
 
 
        xvmaddasp       vs32,   vs0,    vs8
@@ -7410,11 +5146,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     64
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs8,    vs28,   0
+       lxvw4x          vs8,    o0,     T1
 
-       addi            BO,     BO,     4
+       addi            BO,     BO,     16
 
 
        xvmaddasp       vs32,   vs4,    vs16
@@ -7446,11 +5182,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     64
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs8,    vs28,   0
+       lxvw4x          vs8,    o0,     T1
 
-       addi            BO,     BO,     4
+       addi            BO,     BO,     16
 
 
        xvmulsp         vs32,   vs0,    vs8
@@ -7471,11 +5207,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     64
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs8,    vs28,   0
+       lxvw4x          vs8,    o0,     T1
 
-       addi            BO,     BO,     4
+       addi            BO,     BO,     16
 
 
        xvmaddasp       vs32,   vs0,    vs8
@@ -7499,106 +5235,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-
-       stxvw4x         vs32,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
-#ifdef TRMMKERNEL
-       lxvw4x          vs0,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs0,    vs0,    vs28
-#endif
-
-       stxvw4x         vs33,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
-#ifdef TRMMKERNEL
-       lxvw4x          vs1,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs1,    vs1,    vs28
-#endif
-
-       stxvw4x         vs34,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
-#ifdef TRMMKERNEL
-       lxvw4x          vs2,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs2,    vs2,    vs28
-#endif
-
-       stxvw4x         vs35,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
 #ifdef TRMMKERNEL
-       lxvw4x          vs3,    o0,     TBUFFER
+       xvmulsp         vs0,    vs32,   alpha_vr
+       xvmulsp         vs1,    vs33,   alpha_vr
+       xvmulsp         vs2,    vs34,   alpha_vr
+       xvmulsp         vs3,    vs35,   alpha_vr
 #else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs3,    vs3,    vs28
+       xvmaddasp       vs0,    vs32,   alpha_vr
+       xvmaddasp       vs1,    vs33,   alpha_vr
+       xvmaddasp       vs2,    vs34,   alpha_vr
+       xvmaddasp       vs3,    vs35,   alpha_vr
 #endif
 
-
-
-
        stxvw4x         vs0,    o0,     T1
        stxvw4x         vs1,    o16,    T1
        stxvw4x         vs2,    o32,    T1
@@ -7622,11 +5270,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     32
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs8,    vs28,   0
+       lxvw4x          vs8,    o0,     T1
 
-       addi            BO,     BO,     4
+       addi            BO,     BO,     16
 
 .endm
 
@@ -7638,11 +5286,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     32
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs16,   vs28,   0
+       lxvw4x          vs16,   o0,     T1
 
-       addi            BO,     BO,     4
+       addi            BO,     BO,     16
 
 
        xvmulsp         vs32,   vs0,    vs8
@@ -7659,11 +5307,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     32
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs16,   vs28,   0
+       lxvw4x          vs16,   o0,     T1
 
-       addi            BO,     BO,     4
+       addi            BO,     BO,     16
 
 
        xvmaddasp       vs32,   vs0,    vs8
@@ -7680,11 +5328,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     32
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs8,    vs28,   0
+       lxvw4x          vs8,    o0,     T1
 
-       addi            BO,     BO,     4
+       addi            BO,     BO,     16
 
 
        xvmaddasp       vs32,   vs4,    vs16
@@ -7710,11 +5358,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     32
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs8,    vs28,   0
+       lxvw4x          vs8,    o0,     T1
 
-       addi            BO,     BO,     4
+       addi            BO,     BO,     16
 
 
        xvmulsp         vs32,   vs0,    vs8
@@ -7731,11 +5379,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     32
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs8,    vs28,   0
+       lxvw4x          vs8,    o0,     T1
 
-       addi            BO,     BO,     4
+       addi            BO,     BO,     16
 
 
        xvmaddasp       vs32,   vs0,    vs8
@@ -7755,58 +5403,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-
-       stxvw4x         vs32,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
-#ifdef TRMMKERNEL
-       lxvw4x          vs0,    o0,     TBUFFER
-#else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs0,    vs0,    vs28
-#endif
-
-       stxvw4x         vs33,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
 #ifdef TRMMKERNEL
-       lxvw4x          vs1,    o0,     TBUFFER
+       xvmulsp         vs0,    vs32,   alpha_vr
+       xvmulsp         vs1,    vs33,   alpha_vr
 #else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs1,    vs1,    vs28
+       xvmaddasp       vs0,    vs32,   alpha_vr
+       xvmaddasp       vs1,    vs33,   alpha_vr
 #endif
 
-
-
-
        stxvw4x         vs0,    o0,     T1
        stxvw4x         vs1,    o16,    T1
 
@@ -7827,11 +5431,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     16
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs8,    vs28,   0
+       lxvw4x          vs8,    o0,     T1
 
-       addi            BO,     BO,     4
+       addi            BO,     BO,     16
 
 .endm
 
@@ -7842,11 +5446,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     16
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs16,   vs28,   0
+       lxvw4x          vs16,   o0,     T1
 
-       addi            BO,     BO,     4
+       addi            BO,     BO,     16
 
 
        xvmulsp         vs32,   vs0,    vs8
@@ -7861,11 +5465,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     16
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs16,   vs28,   0
+       lxvw4x          vs16,   o0,     T1
 
-       addi            BO,     BO,     4
+       addi            BO,     BO,     16
 
 
        xvmaddasp       vs32,   vs0,    vs8
@@ -7880,11 +5484,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     16
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs8,    vs28,   0
+       lxvw4x          vs8,    o0,     T1
 
-       addi            BO,     BO,     4
+       addi            BO,     BO,     16
 
 
        xvmaddasp       vs32,   vs4,    vs16
@@ -7907,11 +5511,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     16
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs8,    vs28,   0
+       lxvw4x          vs8,    o0,     T1
 
-       addi            BO,     BO,     4
+       addi            BO,     BO,     16
 
 
        xvmulsp         vs32,   vs0,    vs8
@@ -7926,11 +5530,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        addi            AO,     AO,     16
 
-       lxvw4x          vs28,   o0,     BO
+       mr              T1,     BO
 
-       xxspltw         vs8,    vs28,   0
+       lxvw4x          vs8,    o0,     T1
 
-       addi            BO,     BO,     4
+       addi            BO,     BO,     16
 
 
        xvmaddasp       vs32,   vs0,    vs8
@@ -7948,34 +5552,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-
-       stxvw4x         vs32,   o0,     TBUFFER
-
-       lxsspx          vs4,    o0,     TBUFFER
-       lxsspx          vs5,    o4,     TBUFFER
-       lxsspx          vs6,    o8,     TBUFFER
-       lxsspx          vs7,    o12,    TBUFFER
-
-       xsmulsp         vs4,    vs4,    alpha_r
-       xsmulsp         vs5,    vs5,    alpha_r
-       xsmulsp         vs6,    vs6,    alpha_r
-       xsmulsp         vs7,    vs7,    alpha_r
-
-       stxsspx         vs4,    o0,     TBUFFER
-       stxsspx         vs5,    o4,     TBUFFER
-       stxsspx         vs6,    o8,     TBUFFER
-       stxsspx         vs7,    o12,    TBUFFER
-
 #ifdef TRMMKERNEL
-       lxvw4x          vs0,    o0,     TBUFFER
+       xvmulsp         vs0,    vs32,   alpha_vr
 #else
-       lxvw4x          vs28,   o0,     TBUFFER
-       xvaddsp         vs0,    vs0,    vs28
+       xvmaddasp       vs0,    vs32,   alpha_vr
 #endif
 
-
-
-
        stxvw4x         vs0,    o0,     T1
 
        add             T1,     T1,     LDC
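
The hunk above shows the new SAVE path in full: instead of spilling the accumulator to TBUFFER, scaling each element with scalar xsmulsp and reloading, the result vector is combined with alpha_vr directly, xvmulsp when TRMMKERNEL overwrites C and xvmaddasp when alpha*AB is folded into the existing C. A minimal C sketch of what one 4-float vector of the store now computes; acc, c and trmm are illustrative names, not symbols from the source:

#include <stddef.h>

/* TRMMKERNEL:  c[i] = alpha * acc[i]          (xvmulsp   vs0, vs32, alpha_vr)
 * otherwise:   c[i] = c[i] + alpha * acc[i]   (xvmaddasp vs0, vs32, alpha_vr) */
static void save_vec4(float *c, const float *acc, float alpha, int trmm)
{
    for (size_t i = 0; i < 4; i++)
        c[i] = trmm ? alpha * acc[i]
                    : c[i] + alpha * acc[i];
}
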
@@ -8000,7 +5582,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        lxsspx          vs8,    o0,     T1
 
-       addi            BO,     BO,     4
+
+       addi            BO,     BO,     16
 
 .endm
 
@@ -8016,11 +5599,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        lxsspx          vs16,   o0,     T1
 
-       addi            BO,     BO,     4
+
+       addi            BO,     BO,     16
 
 
-       xsmulsp         vs32,   vs0,    vs8
-       xsmulsp         vs33,   vs1,    vs8
+       xsmuldp         vs32,   vs0,    vs8
+       xsmuldp         vs33,   vs1,    vs8
 
 
 .endm
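
In the 1x2 and 1x1 kernels the scalar arithmetic moves from the single-precision VSX forms (xsmulsp, xsmaddasp) to the double-precision ones (xsmuldp, xsmaddadp). This appears safe because lxsspx already widens the loaded float to double format in the register and stxsspx rounds back to single precision on store, so only the accumulation itself runs in double precision. A rough C analogue of that behaviour, assuming a plain accumulate-and-store pattern:

#include <stddef.h>

/* Floats are widened on load, accumulated in double, and rounded back
 * to single precision once on the final store. */
static float acc_in_double(const float *a, const float *b, size_t k)
{
    double acc = 0.0;
    for (size_t i = 0; i < k; i++)
        acc += (double)a[i] * (double)b[i];
    return (float)acc;
}
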
@@ -8037,11 +5621,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        lxsspx          vs16,   o0,     T1
 
-       addi            BO,     BO,     4
+
+       addi            BO,     BO,     16
 
 
-       xsmaddasp       vs32,   vs0,    vs8
-       xsmaddasp       vs33,   vs1,    vs8
+       xsmaddadp       vs32,   vs0,    vs8
+       xsmaddadp       vs33,   vs1,    vs8
 
 
 .endm
@@ -8058,11 +5643,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        lxsspx          vs8,    o0,     T1
 
-       addi            BO,     BO,     4
+
+       addi            BO,     BO,     16
 
 
-       xsmaddasp       vs32,   vs4,    vs16
-       xsmaddasp       vs33,   vs5,    vs16
+       xsmaddadp       vs32,   vs4,    vs16
+       xsmaddadp       vs33,   vs5,    vs16
 
 
 .endm
@@ -8070,8 +5656,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro KERNEL1x2_E2
 
 
-       xsmaddasp       vs32,   vs4,    vs16
-       xsmaddasp       vs33,   vs5,    vs16
+       xsmaddadp       vs32,   vs4,    vs16
+       xsmaddadp       vs33,   vs5,    vs16
 
 
 .endm
@@ -8088,11 +5674,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        lxsspx          vs8,    o0,     T1
 
-       addi            BO,     BO,     4
+
+       addi            BO,     BO,     16
 
 
-       xsmulsp         vs32,   vs0,    vs8
-       xsmulsp         vs33,   vs1,    vs8
+       xsmuldp         vs32,   vs0,    vs8
+       xsmuldp         vs33,   vs1,    vs8
 
 
 .endm
@@ -8109,11 +5696,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        lxsspx          vs8,    o0,     T1
 
-       addi            BO,     BO,     4
+
+       addi            BO,     BO,     16
 
 
-       xsmaddasp       vs32,   vs0,    vs8
-       xsmaddasp       vs33,   vs1,    vs8
+       xsmaddadp       vs32,   vs0,    vs8
+       xsmaddadp       vs33,   vs1,    vs8
 
 
 .endm
@@ -8130,17 +5718,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 #ifdef TRMMKERNEL
-
-       xsmulsp         vs0,    vs32,   alpha_r
-       xsmulsp         vs1,    vs33,   alpha_r
-
+       xsmuldp         vs0,    vs32,   alpha_r
+       xsmuldp         vs1,    vs33,   alpha_r
 #else
-
-       xsmulsp         vs28,   vs32,   alpha_r
-       xsaddsp         vs0,    vs0,    vs28
-       xsmulsp         vs28,   vs33,   alpha_r
-       xsaddsp         vs1,    vs1,    vs28
-
+       xsmaddadp       vs0,    vs32,   alpha_r
+       xsmaddadp       vs1,    vs33,   alpha_r
 #endif
 
        stxsspx         vs0,    o0,     T1
@@ -8167,7 +5749,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        lxsspx          vs8,    o0,     T1
 
-       addi            BO,     BO,     4
+
+       addi            BO,     BO,     16
 
 .endm
 
@@ -8182,10 +5765,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        lxsspx          vs16,   o0,     T1
 
-       addi            BO,     BO,     4
+
+       addi            BO,     BO,     16
 
 
-       xsmulsp         vs32,   vs0,    vs8
+       xsmuldp         vs32,   vs0,    vs8
 
 
 .endm
@@ -8201,10 +5785,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        lxsspx          vs16,   o0,     T1
 
-       addi            BO,     BO,     4
+
+       addi            BO,     BO,     16
 
 
-       xsmaddasp       vs32,   vs0,    vs8
+       xsmaddadp       vs32,   vs0,    vs8
 
 
 .endm
@@ -8220,10 +5805,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        lxsspx          vs8,    o0,     T1
 
-       addi            BO,     BO,     4
+
+       addi            BO,     BO,     16
 
 
-       xsmaddasp       vs32,   vs4,    vs16
+       xsmaddadp       vs32,   vs4,    vs16
 
 
 .endm
@@ -8231,7 +5817,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro KERNEL1x1_E2
 
 
-       xsmaddasp       vs32,   vs4,    vs16
+       xsmaddadp       vs32,   vs4,    vs16
 
 
 .endm
@@ -8247,10 +5833,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        lxsspx          vs8,    o0,     T1
 
-       addi            BO,     BO,     4
+
+       addi            BO,     BO,     16
 
 
-       xsmulsp         vs32,   vs0,    vs8
+       xsmuldp         vs32,   vs0,    vs8
 
 
 .endm
@@ -8266,10 +5853,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        lxsspx          vs8,    o0,     T1
 
-       addi            BO,     BO,     4
+
+       addi            BO,     BO,     16
 
 
-       xsmaddasp       vs32,   vs0,    vs8
+       xsmaddadp       vs32,   vs0,    vs8
 
 
 .endm
@@ -8285,14 +5873,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 #ifdef TRMMKERNEL
-
-       xsmulsp         vs0,    vs32,   alpha_r
-
+       xsmuldp         vs0,    vs32,   alpha_r
 #else
-
-       xsmulsp         vs28,   vs32,   alpha_r
-       xsaddsp         vs0,    vs0,    vs28
-
+       xsmaddadp       vs0,    vs32,   alpha_r
 #endif
 
        stxsspx         vs0,    o0,     T1
index 5e607c5..f756d5d 100644 (file)
@@ -26,11 +26,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 /**************************************************************************************
-* 2016/03/18 Werner Saar (wernsaar@googlemail.com)
+* 2016/04/02 Werner Saar (wernsaar@googlemail.com)
 *       BLASTEST               : OK
 *       CTEST                  : OK
 *       TEST                   : OK
-*       LAPACK-TEST            : OK
+*       LAPACK-TEST            : OK
 **************************************************************************************/
 
 /*********************************************************************/
@@ -128,6 +128,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 #define alpha_r vs30
+#define alpha_vr vs31
 
 #define o0     0
 
@@ -152,7 +153,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define PRE    r30
 #define T2     r31
 
-#include "sgemm_macros_16x8_power8.S"
+#include "strmm_macros_16x8_power8.S"
 
 
 #ifndef NEEDPARAM
@@ -264,11 +265,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
        cmpwi   cr0, M, 0
-       ble     .L999_H1
+       ble     L999_H1
        cmpwi   cr0, N, 0
-       ble     .L999_H1
+       ble     L999_H1
        cmpwi   cr0, K, 0
-       ble     .L999_H1
+       ble     L999_H1
 
        li      PRE, 256 
        li      o4 , 4
@@ -280,16 +281,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        addi    TBUFFER, SP, 320
 
         addi    T1, SP, 300
-        stfs    f1, 0(T1)
-
-        lxsspx  alpha_r, 0, T1
+        stxsspx    f1, o0 , T1
+        stxsspx    f1, o4 , T1
+        stxsspx    f1, o8 , T1
+        stxsspx    f1, o12 , T1
 
+       lxsspx     alpha_r, o0,  T1
+        lxvw4x     alpha_vr, o0, T1
 
 
 
 #include "strmm_logic_16x8_power8.S"
 
-.L999:
+L999:
        addi    r3, 0, 0
 
        lfd     f14,    0(SP)
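
The alpha setup at the top of the kernel now builds both a scalar and a vector copy: f1 is stored four times to consecutive words at SP+300 with stxsspx, then read back once as alpha_r (lxsspx) and once as the 4-wide alpha_vr (lxvw4x), i.e. a splat done through memory. A trivial C sketch of the same idea; splat_alpha is an illustrative name:

/* Store alpha to four consecutive 4-byte slots (offsets 0, 4, 8, 12) so a
 * single vector load sees the value replicated in every lane. */
static void splat_alpha(float alpha, float out[4])
{
    for (int i = 0; i < 4; i++)
        out[i] = alpha;
}
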
index 8ec11f1..fb2d3f9 100644 (file)
@@ -26,18 +26,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 /**************************************************************************************
-* 2016/03/18 Werner Saar (wernsaar@googlemail.com)
+* 2016/04/02 Werner Saar (wernsaar@googlemail.com)
 *       BLASTEST               : OK
 *       CTEST                  : OK
 *       TEST                   : OK
-*       LAPACK-TEST            : OK
+*       LAPACK-TEST            : OK
 **************************************************************************************/
 
-
        srawi.          J,      N,      3
-       ble             .LSTRMM_L8_END
+       ble             STRMM_L8_END
 
-.LSTRMM_L8_BEGIN:
+STRMM_L8_BEGIN:
 
        mr              CO,     C
        mr              AO,     A
@@ -49,9 +48,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
        srawi.          I,      M,      4
-       ble             .LSTRMM_L8x16_END
+       ble             STRMM_L8x16_END
 
-.LSTRMM_L8x16_BEGIN:
+STRMM_L8x16_BEGIN:
 
 
 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@@ -78,11 +77,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              KKK,    T1
        mr              K1,     T1
        srawi.          L,      K1,     3                               // KTEMP / 8 -> L
-       ble             .LSTRMM_L8x16_SUB0
+       ble             STRMM_L8x16_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LSTRMM_L8x16_SUB4
+       ble             STRMM_L8x16_SUB4
 
-.LSTRMM_L8x16_LOOP_START:
+STRMM_L8x16_LOOP_START:
 
        dcbt            AO,     PRE
        LOAD8x16_1
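
The dcbt AO, PRE preceding the 16-wide loads is unchanged context here, but it is what the loop relies on: with PRE set to 256 earlier in the kernel file, it asks the cache to fetch the A panel one 256-byte stride ahead of the current position. A hedged C analogue using the GCC/Clang prefetch builtin:

/* Touch the A panel PRE bytes ahead of the current load position
 * (read access, high expected temporal locality). */
static void prefetch_a(const float *ao, long pre_bytes)
{
    __builtin_prefetch((const char *)ao + pre_bytes, 0, 3);
}
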
@@ -105,11 +104,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL8x16_2
 
        addic.          L,      L,      -2
-       ble             .LSTRMM_L8x16_LOOP_END
+       ble             STRMM_L8x16_LOOP_END
 
        .align 5
 
-.LSTRMM_L8x16_LOOP:
+STRMM_L8x16_LOOP:
 
        dcbt            AO,     PRE
        KERNEL8x16_1
@@ -130,9 +129,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL8x16_2
 
        addic.          L,      L,      -1
-       bgt             .LSTRMM_L8x16_LOOP
+       bgt             STRMM_L8x16_LOOP
 
-.LSTRMM_L8x16_LOOP_END:
+STRMM_L8x16_LOOP_END:
 
        dcbt            AO,     PRE
        KERNEL8x16_1
@@ -151,9 +150,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL8x16_1
        KERNEL8x16_E2
 
-       b               .LSTRMM_L8x16_SUB1
+       b               STRMM_L8x16_SUB1
 
-.LSTRMM_L8x16_SUB4:
+STRMM_L8x16_SUB4:
 
        dcbt            AO,     PRE
        KERNEL8x16_SUBI1
@@ -169,31 +168,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL8x16_SUB1
        KERNEL8x16_SUB1
 
-       b               .LSTRMM_L8x16_SUB1
+       b               STRMM_L8x16_SUB1
 
-.LSTRMM_L8x16_SUB0:
+STRMM_L8x16_SUB0:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
 
        KERNEL8x16_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LSTRMM_L8x16_SAVE
-       b               .LSTRMM_L8x16_SUB2
+       ble             STRMM_L8x16_SAVE
+       b               STRMM_L8x16_SUB2
 
-.LSTRMM_L8x16_SUB1:
+STRMM_L8x16_SUB1:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
-       ble             .LSTRMM_L8x16_SAVE
+       ble             STRMM_L8x16_SAVE
 
-.LSTRMM_L8x16_SUB2:
+STRMM_L8x16_SUB2:
 
        KERNEL8x16_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LSTRMM_L8x16_SUB2
+       bgt             STRMM_L8x16_SUB2
 
-.LSTRMM_L8x16_SAVE:
+STRMM_L8x16_SAVE:
 
        SAVE8x16
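
Apart from the label rename, every STRMM_LNxM block in this logic file keeps the same K schedule: srawi. L, K1, 3 selects the number of eight-step unrolled passes (apparently software-pipelined through the _I1/_1/_2/_E2 macro variants), and andi. L, K1, 7 drives the single-step SUB2 tail before SAVE. A compact C sketch of that blocking, where kernel() stands in for the KERNELNxM_* macros and is purely illustrative:

/* K1 = 8 * (K1 >> 3) + (K1 & 7): unrolled main passes plus a scalar tail. */
static void k_loop(long k1, void (*kernel)(void))
{
    long npasses = k1 >> 3;     /* srawi. L, K1, 3 */
    long ntail   = k1 & 7;      /* andi.  L, K1, 7 */
    for (long i = 0; i < npasses; i++)
        for (int u = 0; u < 8; u++)
            kernel();
    for (long i = 0; i < ntail; i++)
        kernel();
}
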
 
@@ -211,16 +210,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
        addic.          I,      I,      -1
-       bgt             .LSTRMM_L8x16_BEGIN
+       bgt             STRMM_L8x16_BEGIN
 
-.LSTRMM_L8x16_END:
+STRMM_L8x16_END:
 
-.LSTRMM_L8x8_BEGIN:
+STRMM_L8x8_BEGIN:
        andi.           T2,     M,      15
-       ble             .LSTRMM_L8x1_END
+       ble             STRMM_L8x1_END
 
        andi.           T1,     M,      8
-       ble             .LSTRMM_L8x8_END
+       ble             STRMM_L8x8_END
 
 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
        mr              BO,     B                                       // B -> BO
@@ -246,11 +245,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              KKK,    T1
        mr              K1,     T1
        srawi.          L,      K1,     3                               // KTEMP / 8 -> L
-       ble             .LSTRMM_L8x8_SUB0
+       ble             STRMM_L8x8_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LSTRMM_L8x8_SUB4
+       ble             STRMM_L8x8_SUB4
 
-.LSTRMM_L8x8_LOOP_START:
+STRMM_L8x8_LOOP_START:
 
        LOAD8x8_1
        KERNEL8x8_I1
@@ -264,11 +263,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL8x8_2
 
        addic.          L,      L,      -2
-       ble             .LSTRMM_L8x8_LOOP_END
+       ble             STRMM_L8x8_LOOP_END
 
        .align 5
 
-.LSTRMM_L8x8_LOOP:
+STRMM_L8x8_LOOP:
 
        KERNEL8x8_1
        KERNEL8x8_2
@@ -281,9 +280,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL8x8_2
 
        addic.          L,      L,      -1
-       bgt             .LSTRMM_L8x8_LOOP
+       bgt             STRMM_L8x8_LOOP
 
-.LSTRMM_L8x8_LOOP_END:
+STRMM_L8x8_LOOP_END:
 
        KERNEL8x8_1
        KERNEL8x8_2
@@ -295,9 +294,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL8x8_1
        KERNEL8x8_E2
 
-       b               .LSTRMM_L8x8_SUB1
+       b               STRMM_L8x8_SUB1
 
-.LSTRMM_L8x8_SUB4:
+STRMM_L8x8_SUB4:
 
        KERNEL8x8_SUBI1
        KERNEL8x8_SUB1
@@ -309,31 +308,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL8x8_SUB1
        KERNEL8x8_SUB1
 
-       b               .LSTRMM_L8x8_SUB1
+       b               STRMM_L8x8_SUB1
 
-.LSTRMM_L8x8_SUB0:
+STRMM_L8x8_SUB0:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
 
        KERNEL8x8_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LSTRMM_L8x8_SAVE
-       b               .LSTRMM_L8x8_SUB2
+       ble             STRMM_L8x8_SAVE
+       b               STRMM_L8x8_SUB2
 
-.LSTRMM_L8x8_SUB1:
+STRMM_L8x8_SUB1:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
-       ble             .LSTRMM_L8x8_SAVE
+       ble             STRMM_L8x8_SAVE
 
-.LSTRMM_L8x8_SUB2:
+STRMM_L8x8_SUB2:
 
        KERNEL8x8_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LSTRMM_L8x8_SUB2
+       bgt             STRMM_L8x8_SUB2
 
-.LSTRMM_L8x8_SAVE:
+STRMM_L8x8_SAVE:
 
        SAVE8x8
 
@@ -350,12 +349,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 
-.LSTRMM_L8x8_END:
+STRMM_L8x8_END:
 
-.LSTRMM_L8x4_BEGIN:
+STRMM_L8x4_BEGIN:
 
        andi.           T1,     M,      4
-       ble             .LSTRMM_L8x4_END
+       ble             STRMM_L8x4_END
 
 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
        mr              BO,     B                                       // B -> BO
@@ -381,11 +380,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              KKK,    T1
        mr              K1,     T1
        srawi.          L,      K1,     3                               // KTEMP / 8 -> L
-       ble             .LSTRMM_L8x4_SUB0
+       ble             STRMM_L8x4_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LSTRMM_L8x4_SUB4
+       ble             STRMM_L8x4_SUB4
 
-.LSTRMM_L8x4_LOOP_START:
+STRMM_L8x4_LOOP_START:
 
        LOAD8x4_1
        KERNEL8x4_I1
@@ -399,11 +398,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL8x4_2
 
        addic.          L,      L,      -2
-       ble             .LSTRMM_L8x4_LOOP_END
+       ble             STRMM_L8x4_LOOP_END
 
        .align 5
 
-.LSTRMM_L8x4_LOOP:
+STRMM_L8x4_LOOP:
 
        KERNEL8x4_1
        KERNEL8x4_2
@@ -416,9 +415,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL8x4_2
 
        addic.          L,      L,      -1
-       bgt             .LSTRMM_L8x4_LOOP
+       bgt             STRMM_L8x4_LOOP
 
-.LSTRMM_L8x4_LOOP_END:
+STRMM_L8x4_LOOP_END:
 
        KERNEL8x4_1
        KERNEL8x4_2
@@ -430,9 +429,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL8x4_1
        KERNEL8x4_E2
 
-       b               .LSTRMM_L8x4_SUB1
+       b               STRMM_L8x4_SUB1
 
-.LSTRMM_L8x4_SUB4:
+STRMM_L8x4_SUB4:
 
        KERNEL8x4_SUBI1
        KERNEL8x4_SUB1
@@ -444,31 +443,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL8x4_SUB1
        KERNEL8x4_SUB1
 
-       b               .LSTRMM_L8x4_SUB1
+       b               STRMM_L8x4_SUB1
 
-.LSTRMM_L8x4_SUB0:
+STRMM_L8x4_SUB0:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
 
        KERNEL8x4_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LSTRMM_L8x4_SAVE
-       b               .LSTRMM_L8x4_SUB2
+       ble             STRMM_L8x4_SAVE
+       b               STRMM_L8x4_SUB2
 
-.LSTRMM_L8x4_SUB1:
+STRMM_L8x4_SUB1:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
-       ble             .LSTRMM_L8x4_SAVE
+       ble             STRMM_L8x4_SAVE
 
-.LSTRMM_L8x4_SUB2:
+STRMM_L8x4_SUB2:
 
        KERNEL8x4_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LSTRMM_L8x4_SUB2
+       bgt             STRMM_L8x4_SUB2
 
-.LSTRMM_L8x4_SAVE:
+STRMM_L8x4_SAVE:
 
        SAVE8x4
 
@@ -485,12 +484,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 
-.LSTRMM_L8x4_END:
+STRMM_L8x4_END:
 
-.LSTRMM_L8x2_BEGIN:
+STRMM_L8x2_BEGIN:
 
        andi.           T1,     M,      2
-       ble             .LSTRMM_L8x2_END
+       ble             STRMM_L8x2_END
 
 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
        mr              BO,     B                                       // B -> BO
@@ -516,11 +515,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              KKK,    T1
        mr              K1,     T1
        srawi.          L,      K1,     3                               // KTEMP / 8 -> L
-       ble             .LSTRMM_L8x2_SUB0
+       ble             STRMM_L8x2_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LSTRMM_L8x2_SUB4
+       ble             STRMM_L8x2_SUB4
 
-.LSTRMM_L8x2_LOOP_START:
+STRMM_L8x2_LOOP_START:
 
        LOAD8x2_1
        KERNEL8x2_I1
@@ -534,11 +533,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL8x2_2
 
        addic.          L,      L,      -2
-       ble             .LSTRMM_L8x2_LOOP_END
+       ble             STRMM_L8x2_LOOP_END
 
        .align 5
 
-.LSTRMM_L8x2_LOOP:
+STRMM_L8x2_LOOP:
 
        KERNEL8x2_1
        KERNEL8x2_2
@@ -551,9 +550,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL8x2_2
 
        addic.          L,      L,      -1
-       bgt             .LSTRMM_L8x2_LOOP
+       bgt             STRMM_L8x2_LOOP
 
-.LSTRMM_L8x2_LOOP_END:
+STRMM_L8x2_LOOP_END:
 
        KERNEL8x2_1
        KERNEL8x2_2
@@ -565,9 +564,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL8x2_1
        KERNEL8x2_E2
 
-       b               .LSTRMM_L8x2_SUB1
+       b               STRMM_L8x2_SUB1
 
-.LSTRMM_L8x2_SUB4:
+STRMM_L8x2_SUB4:
 
        KERNEL8x2_SUBI1
        KERNEL8x2_SUB1
@@ -579,31 +578,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL8x2_SUB1
        KERNEL8x2_SUB1
 
-       b               .LSTRMM_L8x2_SUB1
+       b               STRMM_L8x2_SUB1
 
-.LSTRMM_L8x2_SUB0:
+STRMM_L8x2_SUB0:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
 
        KERNEL8x2_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LSTRMM_L8x2_SAVE
-       b               .LSTRMM_L8x2_SUB2
+       ble             STRMM_L8x2_SAVE
+       b               STRMM_L8x2_SUB2
 
-.LSTRMM_L8x2_SUB1:
+STRMM_L8x2_SUB1:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
-       ble             .LSTRMM_L8x2_SAVE
+       ble             STRMM_L8x2_SAVE
 
-.LSTRMM_L8x2_SUB2:
+STRMM_L8x2_SUB2:
 
        KERNEL8x2_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LSTRMM_L8x2_SUB2
+       bgt             STRMM_L8x2_SUB2
 
-.LSTRMM_L8x2_SAVE:
+STRMM_L8x2_SAVE:
 
        SAVE8x2
 
@@ -620,12 +619,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 
-.LSTRMM_L8x2_END:
+STRMM_L8x2_END:
 
-.LSTRMM_L8x1_BEGIN:
+STRMM_L8x1_BEGIN:
 
        andi.           T1,     M,      1
-       ble             .LSTRMM_L8x1_END
+       ble             STRMM_L8x1_END
 
 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
        mr              BO,     B                                       // B -> BO
@@ -651,11 +650,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              KKK,    T1
        mr              K1,     T1
        srawi.          L,      K1,     3                               // KTEMP / 8 -> L
-       ble             .LSTRMM_L8x1_SUB0
+       ble             STRMM_L8x1_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LSTRMM_L8x1_SUB4
+       ble             STRMM_L8x1_SUB4
 
-.LSTRMM_L8x1_LOOP_START:
+STRMM_L8x1_LOOP_START:
 
        LOAD8x1_1
        KERNEL8x1_I1
@@ -669,11 +668,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL8x1_2
 
        addic.          L,      L,      -2
-       ble             .LSTRMM_L8x1_LOOP_END
+       ble             STRMM_L8x1_LOOP_END
 
        .align 5
 
-.LSTRMM_L8x1_LOOP:
+STRMM_L8x1_LOOP:
 
        KERNEL8x1_1
        KERNEL8x1_2
@@ -686,9 +685,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL8x1_2
 
        addic.          L,      L,      -1
-       bgt             .LSTRMM_L8x1_LOOP
+       bgt             STRMM_L8x1_LOOP
 
-.LSTRMM_L8x1_LOOP_END:
+STRMM_L8x1_LOOP_END:
 
        KERNEL8x1_1
        KERNEL8x1_2
@@ -700,9 +699,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL8x1_1
        KERNEL8x1_E2
 
-       b               .LSTRMM_L8x1_SUB1
+       b               STRMM_L8x1_SUB1
 
-.LSTRMM_L8x1_SUB4:
+STRMM_L8x1_SUB4:
 
        KERNEL8x1_SUBI1
        KERNEL8x1_SUB1
@@ -714,31 +713,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL8x1_SUB1
        KERNEL8x1_SUB1
 
-       b               .LSTRMM_L8x1_SUB1
+       b               STRMM_L8x1_SUB1
 
-.LSTRMM_L8x1_SUB0:
+STRMM_L8x1_SUB0:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
 
        KERNEL8x1_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LSTRMM_L8x1_SAVE
-       b               .LSTRMM_L8x1_SUB2
+       ble             STRMM_L8x1_SAVE
+       b               STRMM_L8x1_SUB2
 
-.LSTRMM_L8x1_SUB1:
+STRMM_L8x1_SUB1:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
-       ble             .LSTRMM_L8x1_SAVE
+       ble             STRMM_L8x1_SAVE
 
-.LSTRMM_L8x1_SUB2:
+STRMM_L8x1_SUB2:
 
        KERNEL8x1_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LSTRMM_L8x1_SUB2
+       bgt             STRMM_L8x1_SUB2
 
-.LSTRMM_L8x1_SAVE:
+STRMM_L8x1_SAVE:
 
        SAVE8x1
 
@@ -755,7 +754,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 
-.LSTRMM_L8x1_END:
+STRMM_L8x1_END:
 
        slwi            T1,     K,      5
        add             B,      B,      T1
@@ -766,23 +765,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
        addic.          J,      J,      -1
-       bgt             .LSTRMM_L8_BEGIN
+       bgt             STRMM_L8_BEGIN
 
        andi.           T2,     N,      7
-       ble             .L999
+       ble             L999
 
-.LSTRMM_L8_END:
+STRMM_L8_END:
 
-       b               .LSTRMM_L4_BEGIN
+       b               STRMM_L4_BEGIN
 
-.L999_H1:
+L999_H1:
 
-       b               .L999
+       b               L999
 
-.LSTRMM_L4_BEGIN:
+STRMM_L4_BEGIN:
 
        andi.           T1,     N,      4
-       ble             .LSTRMM_L4_END
+       ble             STRMM_L4_END
        mr              CO,     C
        mr              AO,     A
        slwi            T1,     LDC     ,       2
@@ -793,9 +792,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
        srawi.          I,      M,      4
-       ble             .LSTRMM_L4x16_END
+       ble             STRMM_L4x16_END
 
-.LSTRMM_L4x16_BEGIN:
+STRMM_L4x16_BEGIN:
 
 
 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@@ -822,11 +821,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              KKK,    T1
        mr              K1,     T1
        srawi.          L,      K1,     3                               // KTEMP / 8 -> L
-       ble             .LSTRMM_L4x16_SUB0
+       ble             STRMM_L4x16_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LSTRMM_L4x16_SUB4
+       ble             STRMM_L4x16_SUB4
 
-.LSTRMM_L4x16_LOOP_START:
+STRMM_L4x16_LOOP_START:
 
        dcbt            AO,     PRE
        LOAD4x16_1
@@ -849,11 +848,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x16_2
 
        addic.          L,      L,      -2
-       ble             .LSTRMM_L4x16_LOOP_END
+       ble             STRMM_L4x16_LOOP_END
 
        .align 5
 
-.LSTRMM_L4x16_LOOP:
+STRMM_L4x16_LOOP:
 
        dcbt            AO,     PRE
        KERNEL4x16_1
@@ -874,9 +873,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x16_2
 
        addic.          L,      L,      -1
-       bgt             .LSTRMM_L4x16_LOOP
+       bgt             STRMM_L4x16_LOOP
 
-.LSTRMM_L4x16_LOOP_END:
+STRMM_L4x16_LOOP_END:
 
        dcbt            AO,     PRE
        KERNEL4x16_1
@@ -895,9 +894,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x16_1
        KERNEL4x16_E2
 
-       b               .LSTRMM_L4x16_SUB1
+       b               STRMM_L4x16_SUB1
 
-.LSTRMM_L4x16_SUB4:
+STRMM_L4x16_SUB4:
 
        dcbt            AO,     PRE
        KERNEL4x16_SUBI1
@@ -913,31 +912,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x16_SUB1
        KERNEL4x16_SUB1
 
-       b               .LSTRMM_L4x16_SUB1
+       b               STRMM_L4x16_SUB1
 
-.LSTRMM_L4x16_SUB0:
+STRMM_L4x16_SUB0:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
 
        KERNEL4x16_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LSTRMM_L4x16_SAVE
-       b               .LSTRMM_L4x16_SUB2
+       ble             STRMM_L4x16_SAVE
+       b               STRMM_L4x16_SUB2
 
-.LSTRMM_L4x16_SUB1:
+STRMM_L4x16_SUB1:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
-       ble             .LSTRMM_L4x16_SAVE
+       ble             STRMM_L4x16_SAVE
 
-.LSTRMM_L4x16_SUB2:
+STRMM_L4x16_SUB2:
 
        KERNEL4x16_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LSTRMM_L4x16_SUB2
+       bgt             STRMM_L4x16_SUB2
 
-.LSTRMM_L4x16_SAVE:
+STRMM_L4x16_SAVE:
 
        SAVE4x16
 
@@ -955,16 +954,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
        addic.          I,      I,      -1
-       bgt             .LSTRMM_L4x16_BEGIN
+       bgt             STRMM_L4x16_BEGIN
 
-.LSTRMM_L4x16_END:
+STRMM_L4x16_END:
 
-.LSTRMM_L4x8_BEGIN:
+STRMM_L4x8_BEGIN:
        andi.           T2,     M,      15
-       ble             .LSTRMM_L4x1_END
+       ble             STRMM_L4x1_END
 
        andi.           T1,     M,      8
-       ble             .LSTRMM_L4x8_END
+       ble             STRMM_L4x8_END
 
 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
        mr              BO,     B                                       // B -> BO
@@ -990,11 +989,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              KKK,    T1
        mr              K1,     T1
        srawi.          L,      K1,     3                               // KTEMP / 8 -> L
-       ble             .LSTRMM_L4x8_SUB0
+       ble             STRMM_L4x8_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LSTRMM_L4x8_SUB4
+       ble             STRMM_L4x8_SUB4
 
-.LSTRMM_L4x8_LOOP_START:
+STRMM_L4x8_LOOP_START:
 
        LOAD4x8_1
        KERNEL4x8_I1
@@ -1008,11 +1007,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x8_2
 
        addic.          L,      L,      -2
-       ble             .LSTRMM_L4x8_LOOP_END
+       ble             STRMM_L4x8_LOOP_END
 
        .align 5
 
-.LSTRMM_L4x8_LOOP:
+STRMM_L4x8_LOOP:
 
        KERNEL4x8_1
        KERNEL4x8_2
@@ -1025,9 +1024,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x8_2
 
        addic.          L,      L,      -1
-       bgt             .LSTRMM_L4x8_LOOP
+       bgt             STRMM_L4x8_LOOP
 
-.LSTRMM_L4x8_LOOP_END:
+STRMM_L4x8_LOOP_END:
 
        KERNEL4x8_1
        KERNEL4x8_2
@@ -1039,9 +1038,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x8_1
        KERNEL4x8_E2
 
-       b               .LSTRMM_L4x8_SUB1
+       b               STRMM_L4x8_SUB1
 
-.LSTRMM_L4x8_SUB4:
+STRMM_L4x8_SUB4:
 
        KERNEL4x8_SUBI1
        KERNEL4x8_SUB1
@@ -1053,31 +1052,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x8_SUB1
        KERNEL4x8_SUB1
 
-       b               .LSTRMM_L4x8_SUB1
+       b               STRMM_L4x8_SUB1
 
-.LSTRMM_L4x8_SUB0:
+STRMM_L4x8_SUB0:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
 
        KERNEL4x8_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LSTRMM_L4x8_SAVE
-       b               .LSTRMM_L4x8_SUB2
+       ble             STRMM_L4x8_SAVE
+       b               STRMM_L4x8_SUB2
 
-.LSTRMM_L4x8_SUB1:
+STRMM_L4x8_SUB1:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
-       ble             .LSTRMM_L4x8_SAVE
+       ble             STRMM_L4x8_SAVE
 
-.LSTRMM_L4x8_SUB2:
+STRMM_L4x8_SUB2:
 
        KERNEL4x8_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LSTRMM_L4x8_SUB2
+       bgt             STRMM_L4x8_SUB2
 
-.LSTRMM_L4x8_SAVE:
+STRMM_L4x8_SAVE:
 
        SAVE4x8
 
@@ -1094,12 +1093,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 
-.LSTRMM_L4x8_END:
+STRMM_L4x8_END:
 
-.LSTRMM_L4x4_BEGIN:
+STRMM_L4x4_BEGIN:
 
        andi.           T1,     M,      4
-       ble             .LSTRMM_L4x4_END
+       ble             STRMM_L4x4_END
 
 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
        mr              BO,     B                                       // B -> BO
@@ -1125,11 +1124,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              KKK,    T1
        mr              K1,     T1
        srawi.          L,      K1,     3                               // KTEMP / 8 -> L
-       ble             .LSTRMM_L4x4_SUB0
+       ble             STRMM_L4x4_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LSTRMM_L4x4_SUB4
+       ble             STRMM_L4x4_SUB4
 
-.LSTRMM_L4x4_LOOP_START:
+STRMM_L4x4_LOOP_START:
 
        LOAD4x4_1
        KERNEL4x4_I1
@@ -1143,11 +1142,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x4_2
 
        addic.          L,      L,      -2
-       ble             .LSTRMM_L4x4_LOOP_END
+       ble             STRMM_L4x4_LOOP_END
 
        .align 5
 
-.LSTRMM_L4x4_LOOP:
+STRMM_L4x4_LOOP:
 
        KERNEL4x4_1
        KERNEL4x4_2
@@ -1160,9 +1159,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x4_2
 
        addic.          L,      L,      -1
-       bgt             .LSTRMM_L4x4_LOOP
+       bgt             STRMM_L4x4_LOOP
 
-.LSTRMM_L4x4_LOOP_END:
+STRMM_L4x4_LOOP_END:
 
        KERNEL4x4_1
        KERNEL4x4_2
@@ -1174,9 +1173,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x4_1
        KERNEL4x4_E2
 
-       b               .LSTRMM_L4x4_SUB1
+       b               STRMM_L4x4_SUB1
 
-.LSTRMM_L4x4_SUB4:
+STRMM_L4x4_SUB4:
 
        KERNEL4x4_SUBI1
        KERNEL4x4_SUB1
@@ -1188,31 +1187,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x4_SUB1
        KERNEL4x4_SUB1
 
-       b               .LSTRMM_L4x4_SUB1
+       b               STRMM_L4x4_SUB1
 
-.LSTRMM_L4x4_SUB0:
+STRMM_L4x4_SUB0:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
 
        KERNEL4x4_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LSTRMM_L4x4_SAVE
-       b               .LSTRMM_L4x4_SUB2
+       ble             STRMM_L4x4_SAVE
+       b               STRMM_L4x4_SUB2
 
-.LSTRMM_L4x4_SUB1:
+STRMM_L4x4_SUB1:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
-       ble             .LSTRMM_L4x4_SAVE
+       ble             STRMM_L4x4_SAVE
 
-.LSTRMM_L4x4_SUB2:
+STRMM_L4x4_SUB2:
 
        KERNEL4x4_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LSTRMM_L4x4_SUB2
+       bgt             STRMM_L4x4_SUB2
 
-.LSTRMM_L4x4_SAVE:
+STRMM_L4x4_SAVE:
 
        SAVE4x4
 
@@ -1229,12 +1228,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 
-.LSTRMM_L4x4_END:
+STRMM_L4x4_END:
 
-.LSTRMM_L4x2_BEGIN:
+STRMM_L4x2_BEGIN:
 
        andi.           T1,     M,      2
-       ble             .LSTRMM_L4x2_END
+       ble             STRMM_L4x2_END
 
 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
        mr              BO,     B                                       // B -> BO
@@ -1260,11 +1259,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              KKK,    T1
        mr              K1,     T1
        srawi.          L,      K1,     3                               // KTEMP / 8 -> L
-       ble             .LSTRMM_L4x2_SUB0
+       ble             STRMM_L4x2_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LSTRMM_L4x2_SUB4
+       ble             STRMM_L4x2_SUB4
 
-.LSTRMM_L4x2_LOOP_START:
+STRMM_L4x2_LOOP_START:
 
        LOAD4x2_1
        KERNEL4x2_I1
@@ -1278,11 +1277,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x2_2
 
        addic.          L,      L,      -2
-       ble             .LSTRMM_L4x2_LOOP_END
+       ble             STRMM_L4x2_LOOP_END
 
        .align 5
 
-.LSTRMM_L4x2_LOOP:
+STRMM_L4x2_LOOP:
 
        KERNEL4x2_1
        KERNEL4x2_2
@@ -1295,9 +1294,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x2_2
 
        addic.          L,      L,      -1
-       bgt             .LSTRMM_L4x2_LOOP
+       bgt             STRMM_L4x2_LOOP
 
-.LSTRMM_L4x2_LOOP_END:
+STRMM_L4x2_LOOP_END:
 
        KERNEL4x2_1
        KERNEL4x2_2
@@ -1309,9 +1308,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x2_1
        KERNEL4x2_E2
 
-       b               .LSTRMM_L4x2_SUB1
+       b               STRMM_L4x2_SUB1
 
-.LSTRMM_L4x2_SUB4:
+STRMM_L4x2_SUB4:
 
        KERNEL4x2_SUBI1
        KERNEL4x2_SUB1
@@ -1323,31 +1322,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x2_SUB1
        KERNEL4x2_SUB1
 
-       b               .LSTRMM_L4x2_SUB1
+       b               STRMM_L4x2_SUB1
 
-.LSTRMM_L4x2_SUB0:
+STRMM_L4x2_SUB0:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
 
        KERNEL4x2_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LSTRMM_L4x2_SAVE
-       b               .LSTRMM_L4x2_SUB2
+       ble             STRMM_L4x2_SAVE
+       b               STRMM_L4x2_SUB2
 
-.LSTRMM_L4x2_SUB1:
+STRMM_L4x2_SUB1:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
-       ble             .LSTRMM_L4x2_SAVE
+       ble             STRMM_L4x2_SAVE
 
-.LSTRMM_L4x2_SUB2:
+STRMM_L4x2_SUB2:
 
        KERNEL4x2_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LSTRMM_L4x2_SUB2
+       bgt             STRMM_L4x2_SUB2
 
-.LSTRMM_L4x2_SAVE:
+STRMM_L4x2_SAVE:
 
        SAVE4x2
 
@@ -1364,12 +1363,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 
-.LSTRMM_L4x2_END:
+STRMM_L4x2_END:
 
-.LSTRMM_L4x1_BEGIN:
+STRMM_L4x1_BEGIN:
 
        andi.           T1,     M,      1
-       ble             .LSTRMM_L4x1_END
+       ble             STRMM_L4x1_END
 
 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
        mr              BO,     B                                       // B -> BO
@@ -1395,11 +1394,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              KKK,    T1
        mr              K1,     T1
        srawi.          L,      K1,     3                               // KTEMP / 8 -> L
-       ble             .LSTRMM_L4x1_SUB0
+       ble             STRMM_L4x1_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LSTRMM_L4x1_SUB4
+       ble             STRMM_L4x1_SUB4
 
-.LSTRMM_L4x1_LOOP_START:
+STRMM_L4x1_LOOP_START:
 
        LOAD4x1_1
        KERNEL4x1_I1
@@ -1413,11 +1412,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x1_2
 
        addic.          L,      L,      -2
-       ble             .LSTRMM_L4x1_LOOP_END
+       ble             STRMM_L4x1_LOOP_END
 
        .align 5
 
-.LSTRMM_L4x1_LOOP:
+STRMM_L4x1_LOOP:
 
        KERNEL4x1_1
        KERNEL4x1_2
@@ -1430,9 +1429,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x1_2
 
        addic.          L,      L,      -1
-       bgt             .LSTRMM_L4x1_LOOP
+       bgt             STRMM_L4x1_LOOP
 
-.LSTRMM_L4x1_LOOP_END:
+STRMM_L4x1_LOOP_END:
 
        KERNEL4x1_1
        KERNEL4x1_2
@@ -1444,9 +1443,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x1_1
        KERNEL4x1_E2
 
-       b               .LSTRMM_L4x1_SUB1
+       b               STRMM_L4x1_SUB1
 
-.LSTRMM_L4x1_SUB4:
+STRMM_L4x1_SUB4:
 
        KERNEL4x1_SUBI1
        KERNEL4x1_SUB1
@@ -1458,31 +1457,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x1_SUB1
        KERNEL4x1_SUB1
 
-       b               .LSTRMM_L4x1_SUB1
+       b               STRMM_L4x1_SUB1
 
-.LSTRMM_L4x1_SUB0:
+STRMM_L4x1_SUB0:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
 
        KERNEL4x1_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LSTRMM_L4x1_SAVE
-       b               .LSTRMM_L4x1_SUB2
+       ble             STRMM_L4x1_SAVE
+       b               STRMM_L4x1_SUB2
 
-.LSTRMM_L4x1_SUB1:
+STRMM_L4x1_SUB1:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
-       ble             .LSTRMM_L4x1_SAVE
+       ble             STRMM_L4x1_SAVE
 
-.LSTRMM_L4x1_SUB2:
+STRMM_L4x1_SUB2:
 
        KERNEL4x1_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LSTRMM_L4x1_SUB2
+       bgt             STRMM_L4x1_SUB2
 
-.LSTRMM_L4x1_SAVE:
+STRMM_L4x1_SAVE:
 
        SAVE4x1
 
@@ -1499,7 +1498,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 
-.LSTRMM_L4x1_END:
+STRMM_L4x1_END:
 
        slwi            T1,     K,      4
        add             B,      B,      T1
@@ -1509,11 +1508,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 
-.LSTRMM_L4_END:
-.LSTRMM_L2_BEGIN:
+STRMM_L4_END:
+STRMM_L2_BEGIN:
 
        andi.           T1,     N,      2
-       ble             .LSTRMM_L2_END
+       ble             STRMM_L2_END
        mr              CO,     C
        mr              AO,     A
        slwi            T1,     LDC     ,       1
@@ -1524,9 +1523,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
        srawi.          I,      M,      4
-       ble             .LSTRMM_L2x16_END
+       ble             STRMM_L2x16_END
 
-.LSTRMM_L2x16_BEGIN:
+STRMM_L2x16_BEGIN:
 
 
 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@@ -1553,11 +1552,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              KKK,    T1
        mr              K1,     T1
        srawi.          L,      K1,     3                               // KTEMP / 8 -> L
-       ble             .LSTRMM_L2x16_SUB0
+       ble             STRMM_L2x16_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LSTRMM_L2x16_SUB4
+       ble             STRMM_L2x16_SUB4
 
-.LSTRMM_L2x16_LOOP_START:
+STRMM_L2x16_LOOP_START:
 
        dcbt            AO,     PRE
        LOAD2x16_1
@@ -1580,11 +1579,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x16_2
 
        addic.          L,      L,      -2
-       ble             .LSTRMM_L2x16_LOOP_END
+       ble             STRMM_L2x16_LOOP_END
 
        .align 5
 
-.LSTRMM_L2x16_LOOP:
+STRMM_L2x16_LOOP:
 
        dcbt            AO,     PRE
        KERNEL2x16_1
@@ -1605,9 +1604,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x16_2
 
        addic.          L,      L,      -1
-       bgt             .LSTRMM_L2x16_LOOP
+       bgt             STRMM_L2x16_LOOP
 
-.LSTRMM_L2x16_LOOP_END:
+STRMM_L2x16_LOOP_END:
 
        dcbt            AO,     PRE
        KERNEL2x16_1
@@ -1626,9 +1625,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x16_1
        KERNEL2x16_E2
 
-       b               .LSTRMM_L2x16_SUB1
+       b               STRMM_L2x16_SUB1
 
-.LSTRMM_L2x16_SUB4:
+STRMM_L2x16_SUB4:
 
        dcbt            AO,     PRE
        KERNEL2x16_SUBI1
@@ -1644,31 +1643,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x16_SUB1
        KERNEL2x16_SUB1
 
-       b               .LSTRMM_L2x16_SUB1
+       b               STRMM_L2x16_SUB1
 
-.LSTRMM_L2x16_SUB0:
+STRMM_L2x16_SUB0:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
 
        KERNEL2x16_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LSTRMM_L2x16_SAVE
-       b               .LSTRMM_L2x16_SUB2
+       ble             STRMM_L2x16_SAVE
+       b               STRMM_L2x16_SUB2
 
-.LSTRMM_L2x16_SUB1:
+STRMM_L2x16_SUB1:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
-       ble             .LSTRMM_L2x16_SAVE
+       ble             STRMM_L2x16_SAVE
 
-.LSTRMM_L2x16_SUB2:
+STRMM_L2x16_SUB2:
 
        KERNEL2x16_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LSTRMM_L2x16_SUB2
+       bgt             STRMM_L2x16_SUB2
 
-.LSTRMM_L2x16_SAVE:
+STRMM_L2x16_SAVE:
 
        SAVE2x16
 
@@ -1686,16 +1685,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
        addic.          I,      I,      -1
-       bgt             .LSTRMM_L2x16_BEGIN
+       bgt             STRMM_L2x16_BEGIN
 
-.LSTRMM_L2x16_END:
+STRMM_L2x16_END:
 
-.LSTRMM_L2x8_BEGIN:
+STRMM_L2x8_BEGIN:
        andi.           T2,     M,      15
-       ble             .LSTRMM_L2x1_END
+       ble             STRMM_L2x1_END
 
        andi.           T1,     M,      8
-       ble             .LSTRMM_L2x8_END
+       ble             STRMM_L2x8_END
 
 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
        mr              BO,     B                                       // B -> BO
@@ -1721,11 +1720,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              KKK,    T1
        mr              K1,     T1
        srawi.          L,      K1,     3                               // KTEMP / 8 -> L
-       ble             .LSTRMM_L2x8_SUB0
+       ble             STRMM_L2x8_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LSTRMM_L2x8_SUB4
+       ble             STRMM_L2x8_SUB4
 
-.LSTRMM_L2x8_LOOP_START:
+STRMM_L2x8_LOOP_START:
 
        LOAD2x8_1
        KERNEL2x8_I1
@@ -1739,11 +1738,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x8_2
 
        addic.          L,      L,      -2
-       ble             .LSTRMM_L2x8_LOOP_END
+       ble             STRMM_L2x8_LOOP_END
 
        .align 5
 
-.LSTRMM_L2x8_LOOP:
+STRMM_L2x8_LOOP:
 
        KERNEL2x8_1
        KERNEL2x8_2
@@ -1756,9 +1755,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x8_2
 
        addic.          L,      L,      -1
-       bgt             .LSTRMM_L2x8_LOOP
+       bgt             STRMM_L2x8_LOOP
 
-.LSTRMM_L2x8_LOOP_END:
+STRMM_L2x8_LOOP_END:
 
        KERNEL2x8_1
        KERNEL2x8_2
@@ -1770,9 +1769,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x8_1
        KERNEL2x8_E2
 
-       b               .LSTRMM_L2x8_SUB1
+       b               STRMM_L2x8_SUB1
 
-.LSTRMM_L2x8_SUB4:
+STRMM_L2x8_SUB4:
 
        KERNEL2x8_SUBI1
        KERNEL2x8_SUB1
@@ -1784,31 +1783,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x8_SUB1
        KERNEL2x8_SUB1
 
-       b               .LSTRMM_L2x8_SUB1
+       b               STRMM_L2x8_SUB1
 
-.LSTRMM_L2x8_SUB0:
+STRMM_L2x8_SUB0:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
 
        KERNEL2x8_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LSTRMM_L2x8_SAVE
-       b               .LSTRMM_L2x8_SUB2
+       ble             STRMM_L2x8_SAVE
+       b               STRMM_L2x8_SUB2
 
-.LSTRMM_L2x8_SUB1:
+STRMM_L2x8_SUB1:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
-       ble             .LSTRMM_L2x8_SAVE
+       ble             STRMM_L2x8_SAVE
 
-.LSTRMM_L2x8_SUB2:
+STRMM_L2x8_SUB2:
 
        KERNEL2x8_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LSTRMM_L2x8_SUB2
+       bgt             STRMM_L2x8_SUB2
 
-.LSTRMM_L2x8_SAVE:
+STRMM_L2x8_SAVE:
 
        SAVE2x8
 
@@ -1825,12 +1824,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 
-.LSTRMM_L2x8_END:
+STRMM_L2x8_END:
 
-.LSTRMM_L2x4_BEGIN:
+STRMM_L2x4_BEGIN:
 
        andi.           T1,     M,      4
-       ble             .LSTRMM_L2x4_END
+       ble             STRMM_L2x4_END
 
 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
        mr              BO,     B                                       // B -> BO
@@ -1856,11 +1855,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              KKK,    T1
        mr              K1,     T1
        srawi.          L,      K1,     3                               // KTEMP / 8 -> L
-       ble             .LSTRMM_L2x4_SUB0
+       ble             STRMM_L2x4_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LSTRMM_L2x4_SUB4
+       ble             STRMM_L2x4_SUB4
 
-.LSTRMM_L2x4_LOOP_START:
+STRMM_L2x4_LOOP_START:
 
        LOAD2x4_1
        KERNEL2x4_I1
@@ -1874,11 +1873,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x4_2
 
        addic.          L,      L,      -2
-       ble             .LSTRMM_L2x4_LOOP_END
+       ble             STRMM_L2x4_LOOP_END
 
        .align 5
 
-.LSTRMM_L2x4_LOOP:
+STRMM_L2x4_LOOP:
 
        KERNEL2x4_1
        KERNEL2x4_2
@@ -1891,9 +1890,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x4_2
 
        addic.          L,      L,      -1
-       bgt             .LSTRMM_L2x4_LOOP
+       bgt             STRMM_L2x4_LOOP
 
-.LSTRMM_L2x4_LOOP_END:
+STRMM_L2x4_LOOP_END:
 
        KERNEL2x4_1
        KERNEL2x4_2
@@ -1905,9 +1904,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x4_1
        KERNEL2x4_E2
 
-       b               .LSTRMM_L2x4_SUB1
+       b               STRMM_L2x4_SUB1
 
-.LSTRMM_L2x4_SUB4:
+STRMM_L2x4_SUB4:
 
        KERNEL2x4_SUBI1
        KERNEL2x4_SUB1
@@ -1919,31 +1918,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x4_SUB1
        KERNEL2x4_SUB1
 
-       b               .LSTRMM_L2x4_SUB1
+       b               STRMM_L2x4_SUB1
 
-.LSTRMM_L2x4_SUB0:
+STRMM_L2x4_SUB0:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
 
        KERNEL2x4_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LSTRMM_L2x4_SAVE
-       b               .LSTRMM_L2x4_SUB2
+       ble             STRMM_L2x4_SAVE
+       b               STRMM_L2x4_SUB2
 
-.LSTRMM_L2x4_SUB1:
+STRMM_L2x4_SUB1:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
-       ble             .LSTRMM_L2x4_SAVE
+       ble             STRMM_L2x4_SAVE
 
-.LSTRMM_L2x4_SUB2:
+STRMM_L2x4_SUB2:
 
        KERNEL2x4_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LSTRMM_L2x4_SUB2
+       bgt             STRMM_L2x4_SUB2
 
-.LSTRMM_L2x4_SAVE:
+STRMM_L2x4_SAVE:
 
        SAVE2x4
 
@@ -1960,12 +1959,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 
-.LSTRMM_L2x4_END:
+STRMM_L2x4_END:
 
-.LSTRMM_L2x2_BEGIN:
+STRMM_L2x2_BEGIN:
 
        andi.           T1,     M,      2
-       ble             .LSTRMM_L2x2_END
+       ble             STRMM_L2x2_END
 
 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
        mr              BO,     B                                       // B -> BO
@@ -1991,11 +1990,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              KKK,    T1
        mr              K1,     T1
        srawi.          L,      K1,     3                               // KTEMP / 8 -> L
-       ble             .LSTRMM_L2x2_SUB0
+       ble             STRMM_L2x2_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LSTRMM_L2x2_SUB4
+       ble             STRMM_L2x2_SUB4
 
-.LSTRMM_L2x2_LOOP_START:
+STRMM_L2x2_LOOP_START:
 
        LOAD2x2_1
        KERNEL2x2_I1
@@ -2009,11 +2008,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x2_2
 
        addic.          L,      L,      -2
-       ble             .LSTRMM_L2x2_LOOP_END
+       ble             STRMM_L2x2_LOOP_END
 
        .align 5
 
-.LSTRMM_L2x2_LOOP:
+STRMM_L2x2_LOOP:
 
        KERNEL2x2_1
        KERNEL2x2_2
@@ -2026,9 +2025,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x2_2
 
        addic.          L,      L,      -1
-       bgt             .LSTRMM_L2x2_LOOP
+       bgt             STRMM_L2x2_LOOP
 
-.LSTRMM_L2x2_LOOP_END:
+STRMM_L2x2_LOOP_END:
 
        KERNEL2x2_1
        KERNEL2x2_2
@@ -2040,9 +2039,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x2_1
        KERNEL2x2_E2
 
-       b               .LSTRMM_L2x2_SUB1
+       b               STRMM_L2x2_SUB1
 
-.LSTRMM_L2x2_SUB4:
+STRMM_L2x2_SUB4:
 
        KERNEL2x2_SUBI1
        KERNEL2x2_SUB1
@@ -2054,31 +2053,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x2_SUB1
        KERNEL2x2_SUB1
 
-       b               .LSTRMM_L2x2_SUB1
+       b               STRMM_L2x2_SUB1
 
-.LSTRMM_L2x2_SUB0:
+STRMM_L2x2_SUB0:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
 
        KERNEL2x2_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LSTRMM_L2x2_SAVE
-       b               .LSTRMM_L2x2_SUB2
+       ble             STRMM_L2x2_SAVE
+       b               STRMM_L2x2_SUB2
 
-.LSTRMM_L2x2_SUB1:
+STRMM_L2x2_SUB1:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
-       ble             .LSTRMM_L2x2_SAVE
+       ble             STRMM_L2x2_SAVE
 
-.LSTRMM_L2x2_SUB2:
+STRMM_L2x2_SUB2:
 
        KERNEL2x2_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LSTRMM_L2x2_SUB2
+       bgt             STRMM_L2x2_SUB2
 
-.LSTRMM_L2x2_SAVE:
+STRMM_L2x2_SAVE:
 
        SAVE2x2
 
@@ -2095,12 +2094,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 
-.LSTRMM_L2x2_END:
+STRMM_L2x2_END:
 
-.LSTRMM_L2x1_BEGIN:
+STRMM_L2x1_BEGIN:
 
        andi.           T1,     M,      1
-       ble             .LSTRMM_L2x1_END
+       ble             STRMM_L2x1_END
 
 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
        mr              BO,     B                                       // B -> BO
@@ -2126,11 +2125,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              KKK,    T1
        mr              K1,     T1
        srawi.          L,      K1,     3                               // KTEMP / 8 -> L
-       ble             .LSTRMM_L2x1_SUB0
+       ble             STRMM_L2x1_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LSTRMM_L2x1_SUB4
+       ble             STRMM_L2x1_SUB4
 
-.LSTRMM_L2x1_LOOP_START:
+STRMM_L2x1_LOOP_START:
 
        LOAD2x1_1
        KERNEL2x1_I1
@@ -2144,11 +2143,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x1_2
 
        addic.          L,      L,      -2
-       ble             .LSTRMM_L2x1_LOOP_END
+       ble             STRMM_L2x1_LOOP_END
 
        .align 5
 
-.LSTRMM_L2x1_LOOP:
+STRMM_L2x1_LOOP:
 
        KERNEL2x1_1
        KERNEL2x1_2
@@ -2161,9 +2160,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x1_2
 
        addic.          L,      L,      -1
-       bgt             .LSTRMM_L2x1_LOOP
+       bgt             STRMM_L2x1_LOOP
 
-.LSTRMM_L2x1_LOOP_END:
+STRMM_L2x1_LOOP_END:
 
        KERNEL2x1_1
        KERNEL2x1_2
@@ -2175,9 +2174,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x1_1
        KERNEL2x1_E2
 
-       b               .LSTRMM_L2x1_SUB1
+       b               STRMM_L2x1_SUB1
 
-.LSTRMM_L2x1_SUB4:
+STRMM_L2x1_SUB4:
 
        KERNEL2x1_SUBI1
        KERNEL2x1_SUB1
@@ -2189,31 +2188,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x1_SUB1
        KERNEL2x1_SUB1
 
-       b               .LSTRMM_L2x1_SUB1
+       b               STRMM_L2x1_SUB1
 
-.LSTRMM_L2x1_SUB0:
+STRMM_L2x1_SUB0:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
 
        KERNEL2x1_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LSTRMM_L2x1_SAVE
-       b               .LSTRMM_L2x1_SUB2
+       ble             STRMM_L2x1_SAVE
+       b               STRMM_L2x1_SUB2
 
-.LSTRMM_L2x1_SUB1:
+STRMM_L2x1_SUB1:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
-       ble             .LSTRMM_L2x1_SAVE
+       ble             STRMM_L2x1_SAVE
 
-.LSTRMM_L2x1_SUB2:
+STRMM_L2x1_SUB2:
 
        KERNEL2x1_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LSTRMM_L2x1_SUB2
+       bgt             STRMM_L2x1_SUB2
 
-.LSTRMM_L2x1_SAVE:
+STRMM_L2x1_SAVE:
 
        SAVE2x1
 
@@ -2230,7 +2229,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 
-.LSTRMM_L2x1_END:
+STRMM_L2x1_END:
 
        slwi            T1,     K,      3
        add             B,      B,      T1
@@ -2240,11 +2239,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 
-.LSTRMM_L2_END:
-.LSTRMM_L1_BEGIN:
+STRMM_L2_END:
+STRMM_L1_BEGIN:
 
        andi.           T1,     N,      1
-       ble             .LSTRMM_L1_END
+       ble             STRMM_L1_END
        mr              CO,     C
        mr              AO,     A
 
@@ -2253,9 +2252,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
        srawi.          I,      M,      4
-       ble             .LSTRMM_L1x16_END
+       ble             STRMM_L1x16_END
 
-.LSTRMM_L1x16_BEGIN:
+STRMM_L1x16_BEGIN:
 
 
 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@@ -2282,11 +2281,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              KKK,    T1
        mr              K1,     T1
        srawi.          L,      K1,     3                               // KTEMP / 8 -> L
-       ble             .LSTRMM_L1x16_SUB0
+       ble             STRMM_L1x16_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LSTRMM_L1x16_SUB4
+       ble             STRMM_L1x16_SUB4
 
-.LSTRMM_L1x16_LOOP_START:
+STRMM_L1x16_LOOP_START:
 
        dcbt            AO,     PRE
        LOAD1x16_1
@@ -2309,11 +2308,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x16_2
 
        addic.          L,      L,      -2
-       ble             .LSTRMM_L1x16_LOOP_END
+       ble             STRMM_L1x16_LOOP_END
 
        .align 5
 
-.LSTRMM_L1x16_LOOP:
+STRMM_L1x16_LOOP:
 
        dcbt            AO,     PRE
        KERNEL1x16_1
@@ -2334,9 +2333,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x16_2
 
        addic.          L,      L,      -1
-       bgt             .LSTRMM_L1x16_LOOP
+       bgt             STRMM_L1x16_LOOP
 
-.LSTRMM_L1x16_LOOP_END:
+STRMM_L1x16_LOOP_END:
 
        dcbt            AO,     PRE
        KERNEL1x16_1
@@ -2355,9 +2354,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x16_1
        KERNEL1x16_E2
 
-       b               .LSTRMM_L1x16_SUB1
+       b               STRMM_L1x16_SUB1
 
-.LSTRMM_L1x16_SUB4:
+STRMM_L1x16_SUB4:
 
        dcbt            AO,     PRE
        KERNEL1x16_SUBI1
@@ -2373,31 +2372,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x16_SUB1
        KERNEL1x16_SUB1
 
-       b               .LSTRMM_L1x16_SUB1
+       b               STRMM_L1x16_SUB1
 
-.LSTRMM_L1x16_SUB0:
+STRMM_L1x16_SUB0:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
 
        KERNEL1x16_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LSTRMM_L1x16_SAVE
-       b               .LSTRMM_L1x16_SUB2
+       ble             STRMM_L1x16_SAVE
+       b               STRMM_L1x16_SUB2
 
-.LSTRMM_L1x16_SUB1:
+STRMM_L1x16_SUB1:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
-       ble             .LSTRMM_L1x16_SAVE
+       ble             STRMM_L1x16_SAVE
 
-.LSTRMM_L1x16_SUB2:
+STRMM_L1x16_SUB2:
 
        KERNEL1x16_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LSTRMM_L1x16_SUB2
+       bgt             STRMM_L1x16_SUB2
 
-.LSTRMM_L1x16_SAVE:
+STRMM_L1x16_SAVE:
 
        SAVE1x16
 
@@ -2415,16 +2414,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
        addic.          I,      I,      -1
-       bgt             .LSTRMM_L1x16_BEGIN
+       bgt             STRMM_L1x16_BEGIN
 
-.LSTRMM_L1x16_END:
+STRMM_L1x16_END:
 
-.LSTRMM_L1x8_BEGIN:
+STRMM_L1x8_BEGIN:
        andi.           T2,     M,      15
-       ble             .LSTRMM_L1x1_END
+       ble             STRMM_L1x1_END
 
        andi.           T1,     M,      8
-       ble             .LSTRMM_L1x8_END
+       ble             STRMM_L1x8_END
 
 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
        mr              BO,     B                                       // B -> BO
@@ -2450,11 +2449,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              KKK,    T1
        mr              K1,     T1
        srawi.          L,      K1,     3                               // KTEMP / 8 -> L
-       ble             .LSTRMM_L1x8_SUB0
+       ble             STRMM_L1x8_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LSTRMM_L1x8_SUB4
+       ble             STRMM_L1x8_SUB4
 
-.LSTRMM_L1x8_LOOP_START:
+STRMM_L1x8_LOOP_START:
 
        LOAD1x8_1
        KERNEL1x8_I1
@@ -2468,11 +2467,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x8_2
 
        addic.          L,      L,      -2
-       ble             .LSTRMM_L1x8_LOOP_END
+       ble             STRMM_L1x8_LOOP_END
 
        .align 5
 
-.LSTRMM_L1x8_LOOP:
+STRMM_L1x8_LOOP:
 
        KERNEL1x8_1
        KERNEL1x8_2
@@ -2485,9 +2484,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x8_2
 
        addic.          L,      L,      -1
-       bgt             .LSTRMM_L1x8_LOOP
+       bgt             STRMM_L1x8_LOOP
 
-.LSTRMM_L1x8_LOOP_END:
+STRMM_L1x8_LOOP_END:
 
        KERNEL1x8_1
        KERNEL1x8_2
@@ -2499,9 +2498,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x8_1
        KERNEL1x8_E2
 
-       b               .LSTRMM_L1x8_SUB1
+       b               STRMM_L1x8_SUB1
 
-.LSTRMM_L1x8_SUB4:
+STRMM_L1x8_SUB4:
 
        KERNEL1x8_SUBI1
        KERNEL1x8_SUB1
@@ -2513,31 +2512,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x8_SUB1
        KERNEL1x8_SUB1
 
-       b               .LSTRMM_L1x8_SUB1
+       b               STRMM_L1x8_SUB1
 
-.LSTRMM_L1x8_SUB0:
+STRMM_L1x8_SUB0:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
 
        KERNEL1x8_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LSTRMM_L1x8_SAVE
-       b               .LSTRMM_L1x8_SUB2
+       ble             STRMM_L1x8_SAVE
+       b               STRMM_L1x8_SUB2
 
-.LSTRMM_L1x8_SUB1:
+STRMM_L1x8_SUB1:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
-       ble             .LSTRMM_L1x8_SAVE
+       ble             STRMM_L1x8_SAVE
 
-.LSTRMM_L1x8_SUB2:
+STRMM_L1x8_SUB2:
 
        KERNEL1x8_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LSTRMM_L1x8_SUB2
+       bgt             STRMM_L1x8_SUB2
 
-.LSTRMM_L1x8_SAVE:
+STRMM_L1x8_SAVE:
 
        SAVE1x8
 
@@ -2554,12 +2553,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 
-.LSTRMM_L1x8_END:
+STRMM_L1x8_END:
 
-.LSTRMM_L1x4_BEGIN:
+STRMM_L1x4_BEGIN:
 
        andi.           T1,     M,      4
-       ble             .LSTRMM_L1x4_END
+       ble             STRMM_L1x4_END
 
 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
        mr              BO,     B                                       // B -> BO
@@ -2585,11 +2584,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              KKK,    T1
        mr              K1,     T1
        srawi.          L,      K1,     3                               // KTEMP / 8 -> L
-       ble             .LSTRMM_L1x4_SUB0
+       ble             STRMM_L1x4_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LSTRMM_L1x4_SUB4
+       ble             STRMM_L1x4_SUB4
 
-.LSTRMM_L1x4_LOOP_START:
+STRMM_L1x4_LOOP_START:
 
        LOAD1x4_1
        KERNEL1x4_I1
@@ -2603,11 +2602,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x4_2
 
        addic.          L,      L,      -2
-       ble             .LSTRMM_L1x4_LOOP_END
+       ble             STRMM_L1x4_LOOP_END
 
        .align 5
 
-.LSTRMM_L1x4_LOOP:
+STRMM_L1x4_LOOP:
 
        KERNEL1x4_1
        KERNEL1x4_2
@@ -2620,9 +2619,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x4_2
 
        addic.          L,      L,      -1
-       bgt             .LSTRMM_L1x4_LOOP
+       bgt             STRMM_L1x4_LOOP
 
-.LSTRMM_L1x4_LOOP_END:
+STRMM_L1x4_LOOP_END:
 
        KERNEL1x4_1
        KERNEL1x4_2
@@ -2634,9 +2633,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x4_1
        KERNEL1x4_E2
 
-       b               .LSTRMM_L1x4_SUB1
+       b               STRMM_L1x4_SUB1
 
-.LSTRMM_L1x4_SUB4:
+STRMM_L1x4_SUB4:
 
        KERNEL1x4_SUBI1
        KERNEL1x4_SUB1
@@ -2648,31 +2647,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x4_SUB1
        KERNEL1x4_SUB1
 
-       b               .LSTRMM_L1x4_SUB1
+       b               STRMM_L1x4_SUB1
 
-.LSTRMM_L1x4_SUB0:
+STRMM_L1x4_SUB0:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
 
        KERNEL1x4_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LSTRMM_L1x4_SAVE
-       b               .LSTRMM_L1x4_SUB2
+       ble             STRMM_L1x4_SAVE
+       b               STRMM_L1x4_SUB2
 
-.LSTRMM_L1x4_SUB1:
+STRMM_L1x4_SUB1:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
-       ble             .LSTRMM_L1x4_SAVE
+       ble             STRMM_L1x4_SAVE
 
-.LSTRMM_L1x4_SUB2:
+STRMM_L1x4_SUB2:
 
        KERNEL1x4_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LSTRMM_L1x4_SUB2
+       bgt             STRMM_L1x4_SUB2
 
-.LSTRMM_L1x4_SAVE:
+STRMM_L1x4_SAVE:
 
        SAVE1x4
 
@@ -2689,12 +2688,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 
-.LSTRMM_L1x4_END:
+STRMM_L1x4_END:
 
-.LSTRMM_L1x2_BEGIN:
+STRMM_L1x2_BEGIN:
 
        andi.           T1,     M,      2
-       ble             .LSTRMM_L1x2_END
+       ble             STRMM_L1x2_END
 
 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
        mr              BO,     B                                       // B -> BO
@@ -2720,11 +2719,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              KKK,    T1
        mr              K1,     T1
        srawi.          L,      K1,     3                               // KTEMP / 8 -> L
-       ble             .LSTRMM_L1x2_SUB0
+       ble             STRMM_L1x2_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LSTRMM_L1x2_SUB4
+       ble             STRMM_L1x2_SUB4
 
-.LSTRMM_L1x2_LOOP_START:
+STRMM_L1x2_LOOP_START:
 
        LOAD1x2_1
        KERNEL1x2_I1
@@ -2738,11 +2737,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x2_2
 
        addic.          L,      L,      -2
-       ble             .LSTRMM_L1x2_LOOP_END
+       ble             STRMM_L1x2_LOOP_END
 
        .align 5
 
-.LSTRMM_L1x2_LOOP:
+STRMM_L1x2_LOOP:
 
        KERNEL1x2_1
        KERNEL1x2_2
@@ -2755,9 +2754,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x2_2
 
        addic.          L,      L,      -1
-       bgt             .LSTRMM_L1x2_LOOP
+       bgt             STRMM_L1x2_LOOP
 
-.LSTRMM_L1x2_LOOP_END:
+STRMM_L1x2_LOOP_END:
 
        KERNEL1x2_1
        KERNEL1x2_2
@@ -2769,9 +2768,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x2_1
        KERNEL1x2_E2
 
-       b               .LSTRMM_L1x2_SUB1
+       b               STRMM_L1x2_SUB1
 
-.LSTRMM_L1x2_SUB4:
+STRMM_L1x2_SUB4:
 
        KERNEL1x2_SUBI1
        KERNEL1x2_SUB1
@@ -2783,31 +2782,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x2_SUB1
        KERNEL1x2_SUB1
 
-       b               .LSTRMM_L1x2_SUB1
+       b               STRMM_L1x2_SUB1
 
-.LSTRMM_L1x2_SUB0:
+STRMM_L1x2_SUB0:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
 
        KERNEL1x2_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LSTRMM_L1x2_SAVE
-       b               .LSTRMM_L1x2_SUB2
+       ble             STRMM_L1x2_SAVE
+       b               STRMM_L1x2_SUB2
 
-.LSTRMM_L1x2_SUB1:
+STRMM_L1x2_SUB1:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
-       ble             .LSTRMM_L1x2_SAVE
+       ble             STRMM_L1x2_SAVE
 
-.LSTRMM_L1x2_SUB2:
+STRMM_L1x2_SUB2:
 
        KERNEL1x2_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LSTRMM_L1x2_SUB2
+       bgt             STRMM_L1x2_SUB2
 
-.LSTRMM_L1x2_SAVE:
+STRMM_L1x2_SAVE:
 
        SAVE1x2
 
@@ -2824,12 +2823,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 
-.LSTRMM_L1x2_END:
+STRMM_L1x2_END:
 
-.LSTRMM_L1x1_BEGIN:
+STRMM_L1x1_BEGIN:
 
        andi.           T1,     M,      1
-       ble             .LSTRMM_L1x1_END
+       ble             STRMM_L1x1_END
 
 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
        mr              BO,     B                                       // B -> BO
@@ -2855,11 +2854,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              KKK,    T1
        mr              K1,     T1
        srawi.          L,      K1,     3                               // KTEMP / 8 -> L
-       ble             .LSTRMM_L1x1_SUB0
+       ble             STRMM_L1x1_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LSTRMM_L1x1_SUB4
+       ble             STRMM_L1x1_SUB4
 
-.LSTRMM_L1x1_LOOP_START:
+STRMM_L1x1_LOOP_START:
 
        LOAD1x1_1
        KERNEL1x1_I1
@@ -2873,11 +2872,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x1_2
 
        addic.          L,      L,      -2
-       ble             .LSTRMM_L1x1_LOOP_END
+       ble             STRMM_L1x1_LOOP_END
 
        .align 5
 
-.LSTRMM_L1x1_LOOP:
+STRMM_L1x1_LOOP:
 
        KERNEL1x1_1
        KERNEL1x1_2
@@ -2890,9 +2889,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x1_2
 
        addic.          L,      L,      -1
-       bgt             .LSTRMM_L1x1_LOOP
+       bgt             STRMM_L1x1_LOOP
 
-.LSTRMM_L1x1_LOOP_END:
+STRMM_L1x1_LOOP_END:
 
        KERNEL1x1_1
        KERNEL1x1_2
@@ -2904,9 +2903,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x1_1
        KERNEL1x1_E2
 
-       b               .LSTRMM_L1x1_SUB1
+       b               STRMM_L1x1_SUB1
 
-.LSTRMM_L1x1_SUB4:
+STRMM_L1x1_SUB4:
 
        KERNEL1x1_SUBI1
        KERNEL1x1_SUB1
@@ -2918,31 +2917,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x1_SUB1
        KERNEL1x1_SUB1
 
-       b               .LSTRMM_L1x1_SUB1
+       b               STRMM_L1x1_SUB1
 
-.LSTRMM_L1x1_SUB0:
+STRMM_L1x1_SUB0:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
 
        KERNEL1x1_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LSTRMM_L1x1_SAVE
-       b               .LSTRMM_L1x1_SUB2
+       ble             STRMM_L1x1_SAVE
+       b               STRMM_L1x1_SUB2
 
-.LSTRMM_L1x1_SUB1:
+STRMM_L1x1_SUB1:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
-       ble             .LSTRMM_L1x1_SAVE
+       ble             STRMM_L1x1_SAVE
 
-.LSTRMM_L1x1_SUB2:
+STRMM_L1x1_SUB2:
 
        KERNEL1x1_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LSTRMM_L1x1_SUB2
+       bgt             STRMM_L1x1_SUB2
 
-.LSTRMM_L1x1_SAVE:
+STRMM_L1x1_SAVE:
 
        SAVE1x1
 
@@ -2959,11 +2958,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 
-.LSTRMM_L1x1_END:
+STRMM_L1x1_END:
 
 #if !defined(LEFT)
        addi            KK,     KK,     1                                       // KK += Number of values in B
 #endif
 
 
-.LSTRMM_L1_END:
+STRMM_L1_END:
diff --git a/kernel/power/strmm_macros_16x8_power8.S b/kernel/power/strmm_macros_16x8_power8.S
new file mode 100644 (file)
index 0000000..27bc1e8
--- /dev/null
+++ b/kernel/power/strmm_macros_16x8_power8.S
@@ -0,0 +1,5840 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/02 Werner Saar (wernsaar@googlemail.com)
+*       BLASTEST               : OK
+*       CTEST                  : OK
+*       TEST                   : OK
+*       LAPACK-TEST            : OK
+**************************************************************************************/
+
+
+/**********************************************************************************************
+* Macros for N=8 and M=16
+**********************************************************************************************/
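+
+// One k-iteration of the 16x8 micro-kernel below:
+//   - four lxvw4x loads fetch 16 floats of A (vs0-vs3), AO advances by 64 bytes
+//   - two lxvw4x loads fetch 8 floats of B; xxspltw broadcasts each element
+//     into its own vector (vs8-vs15), BO advances by 32 bytes
+//   - 32 xvmaddasp (xvmulsp on the first iteration) accumulate the 16x8
+//     result tile in vs32-vs63
+// The _1/_2 macro pair double-buffers A in vs0-vs3/vs4-vs7 and the B splats
+// in vs8-vs15/vs16-vs23, so the loads for the next step are issued ahead of
+// the multiply-adds of the current one.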
+
+.macro LOAD8x16_1
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+       lxvw4x          vs2,    o32,    AO
+       lxvw4x          vs3,    o48,    AO
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+       xxspltw         vs10,   vs28,   2
+       xxspltw         vs11,   vs28,   3
+
+       lxvw4x          vs29,   o16,    BO
+
+       xxspltw         vs12,   vs29,   0
+       xxspltw         vs13,   vs29,   1
+       xxspltw         vs14,   vs29,   2
+       xxspltw         vs15,   vs29,   3
+
+       addi            BO,     BO,     32
+
+.endm
+
+.macro KERNEL8x16_I1
+
+
+       lxvw4x          vs4,    o0,     AO
+       lxvw4x          vs5,    o16,    AO
+       lxvw4x          vs6,    o32,    AO
+       lxvw4x          vs7,    o48,    AO
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs16,   vs28,   0
+       xxspltw         vs17,   vs28,   1
+       xxspltw         vs18,   vs28,   2
+       xxspltw         vs19,   vs28,   3
+
+       lxvw4x          vs29,   o16,    BO
+
+       xxspltw         vs20,   vs29,   0
+       xxspltw         vs21,   vs29,   1
+       xxspltw         vs22,   vs29,   2
+       xxspltw         vs23,   vs29,   3
+
+       addi            BO,     BO,     32
+
+
+       xvmulsp         vs32,   vs0,    vs8
+       xvmulsp         vs33,   vs1,    vs8
+       xvmulsp         vs34,   vs2,    vs8
+       xvmulsp         vs35,   vs3,    vs8
+
+       xvmulsp         vs36,   vs0,    vs9
+       xvmulsp         vs37,   vs1,    vs9
+       xvmulsp         vs38,   vs2,    vs9
+       xvmulsp         vs39,   vs3,    vs9
+
+       xvmulsp         vs40,   vs0,    vs10
+       xvmulsp         vs41,   vs1,    vs10
+       xvmulsp         vs42,   vs2,    vs10
+       xvmulsp         vs43,   vs3,    vs10
+
+       xvmulsp         vs44,   vs0,    vs11
+       xvmulsp         vs45,   vs1,    vs11
+       xvmulsp         vs46,   vs2,    vs11
+       xvmulsp         vs47,   vs3,    vs11
+
+       xvmulsp         vs48,   vs0,    vs12
+       xvmulsp         vs49,   vs1,    vs12
+       xvmulsp         vs50,   vs2,    vs12
+       xvmulsp         vs51,   vs3,    vs12
+
+       xvmulsp         vs52,   vs0,    vs13
+       xvmulsp         vs53,   vs1,    vs13
+       xvmulsp         vs54,   vs2,    vs13
+       xvmulsp         vs55,   vs3,    vs13
+
+       xvmulsp         vs56,   vs0,    vs14
+       xvmulsp         vs57,   vs1,    vs14
+       xvmulsp         vs58,   vs2,    vs14
+       xvmulsp         vs59,   vs3,    vs14
+
+       xvmulsp         vs60,   vs0,    vs15
+       xvmulsp         vs61,   vs1,    vs15
+       xvmulsp         vs62,   vs2,    vs15
+       xvmulsp         vs63,   vs3,    vs15
+
+
+.endm
+
+.macro KERNEL8x16_1
+
+
+       lxvw4x          vs4,    o0,     AO
+       lxvw4x          vs5,    o16,    AO
+       lxvw4x          vs6,    o32,    AO
+       lxvw4x          vs7,    o48,    AO
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs16,   vs28,   0
+       xxspltw         vs17,   vs28,   1
+       xxspltw         vs18,   vs28,   2
+       xxspltw         vs19,   vs28,   3
+
+       lxvw4x          vs29,   o16,    BO
+
+       xxspltw         vs20,   vs29,   0
+       xxspltw         vs21,   vs29,   1
+       xxspltw         vs22,   vs29,   2
+       xxspltw         vs23,   vs29,   3
+
+       addi            BO,     BO,     32
+
+
+       xvmaddasp       vs32,   vs0,    vs8
+       xvmaddasp       vs33,   vs1,    vs8
+       xvmaddasp       vs34,   vs2,    vs8
+       xvmaddasp       vs35,   vs3,    vs8
+
+       xvmaddasp       vs36,   vs0,    vs9
+       xvmaddasp       vs37,   vs1,    vs9
+       xvmaddasp       vs38,   vs2,    vs9
+       xvmaddasp       vs39,   vs3,    vs9
+
+       xvmaddasp       vs40,   vs0,    vs10
+       xvmaddasp       vs41,   vs1,    vs10
+       xvmaddasp       vs42,   vs2,    vs10
+       xvmaddasp       vs43,   vs3,    vs10
+
+       xvmaddasp       vs44,   vs0,    vs11
+       xvmaddasp       vs45,   vs1,    vs11
+       xvmaddasp       vs46,   vs2,    vs11
+       xvmaddasp       vs47,   vs3,    vs11
+
+       xvmaddasp       vs48,   vs0,    vs12
+       xvmaddasp       vs49,   vs1,    vs12
+       xvmaddasp       vs50,   vs2,    vs12
+       xvmaddasp       vs51,   vs3,    vs12
+
+       xvmaddasp       vs52,   vs0,    vs13
+       xvmaddasp       vs53,   vs1,    vs13
+       xvmaddasp       vs54,   vs2,    vs13
+       xvmaddasp       vs55,   vs3,    vs13
+
+       xvmaddasp       vs56,   vs0,    vs14
+       xvmaddasp       vs57,   vs1,    vs14
+       xvmaddasp       vs58,   vs2,    vs14
+       xvmaddasp       vs59,   vs3,    vs14
+
+       xvmaddasp       vs60,   vs0,    vs15
+       xvmaddasp       vs61,   vs1,    vs15
+       xvmaddasp       vs62,   vs2,    vs15
+       xvmaddasp       vs63,   vs3,    vs15
+
+
+.endm
+
+.macro KERNEL8x16_2
+
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+       lxvw4x          vs2,    o32,    AO
+       lxvw4x          vs3,    o48,    AO
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+       xxspltw         vs10,   vs28,   2
+       xxspltw         vs11,   vs28,   3
+
+       lxvw4x          vs29,   o16,    BO
+
+       xxspltw         vs12,   vs29,   0
+       xxspltw         vs13,   vs29,   1
+       xxspltw         vs14,   vs29,   2
+       xxspltw         vs15,   vs29,   3
+
+       addi            BO,     BO,     32
+
+
+       xvmaddasp       vs32,   vs4,    vs16
+       xvmaddasp       vs33,   vs5,    vs16
+       xvmaddasp       vs34,   vs6,    vs16
+       xvmaddasp       vs35,   vs7,    vs16
+
+       xvmaddasp       vs36,   vs4,    vs17
+       xvmaddasp       vs37,   vs5,    vs17
+       xvmaddasp       vs38,   vs6,    vs17
+       xvmaddasp       vs39,   vs7,    vs17
+
+       xvmaddasp       vs40,   vs4,    vs18
+       xvmaddasp       vs41,   vs5,    vs18
+       xvmaddasp       vs42,   vs6,    vs18
+       xvmaddasp       vs43,   vs7,    vs18
+
+       xvmaddasp       vs44,   vs4,    vs19
+       xvmaddasp       vs45,   vs5,    vs19
+       xvmaddasp       vs46,   vs6,    vs19
+       xvmaddasp       vs47,   vs7,    vs19
+
+       xvmaddasp       vs48,   vs4,    vs20
+       xvmaddasp       vs49,   vs5,    vs20
+       xvmaddasp       vs50,   vs6,    vs20
+       xvmaddasp       vs51,   vs7,    vs20
+
+       xvmaddasp       vs52,   vs4,    vs21
+       xvmaddasp       vs53,   vs5,    vs21
+       xvmaddasp       vs54,   vs6,    vs21
+       xvmaddasp       vs55,   vs7,    vs21
+
+       xvmaddasp       vs56,   vs4,    vs22
+       xvmaddasp       vs57,   vs5,    vs22
+       xvmaddasp       vs58,   vs6,    vs22
+       xvmaddasp       vs59,   vs7,    vs22
+
+       xvmaddasp       vs60,   vs4,    vs23
+       xvmaddasp       vs61,   vs5,    vs23
+       xvmaddasp       vs62,   vs6,    vs23
+       xvmaddasp       vs63,   vs7,    vs23
+
+
+.endm
+
+.macro KERNEL8x16_E2
+
+
+       xvmaddasp       vs32,   vs4,    vs16
+       xvmaddasp       vs33,   vs5,    vs16
+       xvmaddasp       vs34,   vs6,    vs16
+       xvmaddasp       vs35,   vs7,    vs16
+
+       xvmaddasp       vs36,   vs4,    vs17
+       xvmaddasp       vs37,   vs5,    vs17
+       xvmaddasp       vs38,   vs6,    vs17
+       xvmaddasp       vs39,   vs7,    vs17
+
+       xvmaddasp       vs40,   vs4,    vs18
+       xvmaddasp       vs41,   vs5,    vs18
+       xvmaddasp       vs42,   vs6,    vs18
+       xvmaddasp       vs43,   vs7,    vs18
+
+       xvmaddasp       vs44,   vs4,    vs19
+       xvmaddasp       vs45,   vs5,    vs19
+       xvmaddasp       vs46,   vs6,    vs19
+       xvmaddasp       vs47,   vs7,    vs19
+
+       xvmaddasp       vs48,   vs4,    vs20
+       xvmaddasp       vs49,   vs5,    vs20
+       xvmaddasp       vs50,   vs6,    vs20
+       xvmaddasp       vs51,   vs7,    vs20
+
+       xvmaddasp       vs52,   vs4,    vs21
+       xvmaddasp       vs53,   vs5,    vs21
+       xvmaddasp       vs54,   vs6,    vs21
+       xvmaddasp       vs55,   vs7,    vs21
+
+       xvmaddasp       vs56,   vs4,    vs22
+       xvmaddasp       vs57,   vs5,    vs22
+       xvmaddasp       vs58,   vs6,    vs22
+       xvmaddasp       vs59,   vs7,    vs22
+
+       xvmaddasp       vs60,   vs4,    vs23
+       xvmaddasp       vs61,   vs5,    vs23
+       xvmaddasp       vs62,   vs6,    vs23
+       xvmaddasp       vs63,   vs7,    vs23
+
+
+.endm
+
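+// Tail macros for the remaining K & 7 iterations: KERNEL8x16_SUBI1 starts the
+// accumulators with xvmulsp, KERNEL8x16_SUB1 keeps accumulating with xvmaddasp.
+// KERNEL8x16_E2 above drains the last double-buffered step without issuing
+// further loads.
+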
+.macro KERNEL8x16_SUBI1
+
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+       lxvw4x          vs2,    o32,    AO
+       lxvw4x          vs3,    o48,    AO
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+       xxspltw         vs10,   vs28,   2
+       xxspltw         vs11,   vs28,   3
+
+       lxvw4x          vs29,   o16,    BO
+
+       xxspltw         vs12,   vs29,   0
+       xxspltw         vs13,   vs29,   1
+       xxspltw         vs14,   vs29,   2
+       xxspltw         vs15,   vs29,   3
+
+       addi            BO,     BO,     32
+
+
+       xvmulsp         vs32,   vs0,    vs8
+       xvmulsp         vs33,   vs1,    vs8
+       xvmulsp         vs34,   vs2,    vs8
+       xvmulsp         vs35,   vs3,    vs8
+
+       xvmulsp         vs36,   vs0,    vs9
+       xvmulsp         vs37,   vs1,    vs9
+       xvmulsp         vs38,   vs2,    vs9
+       xvmulsp         vs39,   vs3,    vs9
+
+       xvmulsp         vs40,   vs0,    vs10
+       xvmulsp         vs41,   vs1,    vs10
+       xvmulsp         vs42,   vs2,    vs10
+       xvmulsp         vs43,   vs3,    vs10
+
+       xvmulsp         vs44,   vs0,    vs11
+       xvmulsp         vs45,   vs1,    vs11
+       xvmulsp         vs46,   vs2,    vs11
+       xvmulsp         vs47,   vs3,    vs11
+
+       xvmulsp         vs48,   vs0,    vs12
+       xvmulsp         vs49,   vs1,    vs12
+       xvmulsp         vs50,   vs2,    vs12
+       xvmulsp         vs51,   vs3,    vs12
+
+       xvmulsp         vs52,   vs0,    vs13
+       xvmulsp         vs53,   vs1,    vs13
+       xvmulsp         vs54,   vs2,    vs13
+       xvmulsp         vs55,   vs3,    vs13
+
+       xvmulsp         vs56,   vs0,    vs14
+       xvmulsp         vs57,   vs1,    vs14
+       xvmulsp         vs58,   vs2,    vs14
+       xvmulsp         vs59,   vs3,    vs14
+
+       xvmulsp         vs60,   vs0,    vs15
+       xvmulsp         vs61,   vs1,    vs15
+       xvmulsp         vs62,   vs2,    vs15
+       xvmulsp         vs63,   vs3,    vs15
+
+
+.endm
+
+.macro KERNEL8x16_SUB1
+
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+       lxvw4x          vs2,    o32,    AO
+       lxvw4x          vs3,    o48,    AO
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+       xxspltw         vs10,   vs28,   2
+       xxspltw         vs11,   vs28,   3
+
+       lxvw4x          vs29,   o16,    BO
+
+       xxspltw         vs12,   vs29,   0
+       xxspltw         vs13,   vs29,   1
+       xxspltw         vs14,   vs29,   2
+       xxspltw         vs15,   vs29,   3
+
+       addi            BO,     BO,     32
+
+
+       xvmaddasp       vs32,   vs0,    vs8
+       xvmaddasp       vs33,   vs1,    vs8
+       xvmaddasp       vs34,   vs2,    vs8
+       xvmaddasp       vs35,   vs3,    vs8
+
+       xvmaddasp       vs36,   vs0,    vs9
+       xvmaddasp       vs37,   vs1,    vs9
+       xvmaddasp       vs38,   vs2,    vs9
+       xvmaddasp       vs39,   vs3,    vs9
+
+       xvmaddasp       vs40,   vs0,    vs10
+       xvmaddasp       vs41,   vs1,    vs10
+       xvmaddasp       vs42,   vs2,    vs10
+       xvmaddasp       vs43,   vs3,    vs10
+
+       xvmaddasp       vs44,   vs0,    vs11
+       xvmaddasp       vs45,   vs1,    vs11
+       xvmaddasp       vs46,   vs2,    vs11
+       xvmaddasp       vs47,   vs3,    vs11
+
+       xvmaddasp       vs48,   vs0,    vs12
+       xvmaddasp       vs49,   vs1,    vs12
+       xvmaddasp       vs50,   vs2,    vs12
+       xvmaddasp       vs51,   vs3,    vs12
+
+       xvmaddasp       vs52,   vs0,    vs13
+       xvmaddasp       vs53,   vs1,    vs13
+       xvmaddasp       vs54,   vs2,    vs13
+       xvmaddasp       vs55,   vs3,    vs13
+
+       xvmaddasp       vs56,   vs0,    vs14
+       xvmaddasp       vs57,   vs1,    vs14
+       xvmaddasp       vs58,   vs2,    vs14
+       xvmaddasp       vs59,   vs3,    vs14
+
+       xvmaddasp       vs60,   vs0,    vs15
+       xvmaddasp       vs61,   vs1,    vs15
+       xvmaddasp       vs62,   vs2,    vs15
+       xvmaddasp       vs63,   vs3,    vs15
+
+
+.endm
+
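+// SAVE8x16 writes the 16x8 tile: for each of the 8 columns of C (stepping T1
+// by LDC) it scales four accumulator vectors by alpha_vr. When TRMMKERNEL is
+// defined the scaled product overwrites C (xvmulsp); otherwise the existing
+// C values are loaded first and alpha*AB is added to them (xvmaddasp).
+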
+.macro SAVE8x16
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+       lxvw4x          vs2,    o32,    T1
+       lxvw4x          vs3,    o48,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xvmulsp         vs0,    vs32,   alpha_vr
+       xvmulsp         vs1,    vs33,   alpha_vr
+       xvmulsp         vs2,    vs34,   alpha_vr
+       xvmulsp         vs3,    vs35,   alpha_vr
+#else
+       xvmaddasp       vs0,    vs32,   alpha_vr
+       xvmaddasp       vs1,    vs33,   alpha_vr
+       xvmaddasp       vs2,    vs34,   alpha_vr
+       xvmaddasp       vs3,    vs35,   alpha_vr
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+       stxvw4x         vs2,    o32,    T1
+       stxvw4x         vs3,    o48,    T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+       lxvw4x          vs2,    o32,    T1
+       lxvw4x          vs3,    o48,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xvmulsp         vs0,    vs36,   alpha_vr
+       xvmulsp         vs1,    vs37,   alpha_vr
+       xvmulsp         vs2,    vs38,   alpha_vr
+       xvmulsp         vs3,    vs39,   alpha_vr
+#else
+       xvmaddasp       vs0,    vs36,   alpha_vr
+       xvmaddasp       vs1,    vs37,   alpha_vr
+       xvmaddasp       vs2,    vs38,   alpha_vr
+       xvmaddasp       vs3,    vs39,   alpha_vr
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+       stxvw4x         vs2,    o32,    T1
+       stxvw4x         vs3,    o48,    T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+       lxvw4x          vs2,    o32,    T1
+       lxvw4x          vs3,    o48,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xvmulsp         vs0,    vs40,   alpha_vr
+       xvmulsp         vs1,    vs41,   alpha_vr
+       xvmulsp         vs2,    vs42,   alpha_vr
+       xvmulsp         vs3,    vs43,   alpha_vr
+#else
+       xvmaddasp       vs0,    vs40,   alpha_vr
+       xvmaddasp       vs1,    vs41,   alpha_vr
+       xvmaddasp       vs2,    vs42,   alpha_vr
+       xvmaddasp       vs3,    vs43,   alpha_vr
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+       stxvw4x         vs2,    o32,    T1
+       stxvw4x         vs3,    o48,    T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+       lxvw4x          vs2,    o32,    T1
+       lxvw4x          vs3,    o48,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xvmulsp         vs0,    vs44,   alpha_vr
+       xvmulsp         vs1,    vs45,   alpha_vr
+       xvmulsp         vs2,    vs46,   alpha_vr
+       xvmulsp         vs3,    vs47,   alpha_vr
+#else
+       xvmaddasp       vs0,    vs44,   alpha_vr
+       xvmaddasp       vs1,    vs45,   alpha_vr
+       xvmaddasp       vs2,    vs46,   alpha_vr
+       xvmaddasp       vs3,    vs47,   alpha_vr
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+       stxvw4x         vs2,    o32,    T1
+       stxvw4x         vs3,    o48,    T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+       lxvw4x          vs2,    o32,    T1
+       lxvw4x          vs3,    o48,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xvmulsp         vs0,    vs48,   alpha_vr
+       xvmulsp         vs1,    vs49,   alpha_vr
+       xvmulsp         vs2,    vs50,   alpha_vr
+       xvmulsp         vs3,    vs51,   alpha_vr
+#else
+       xvmaddasp       vs0,    vs48,   alpha_vr
+       xvmaddasp       vs1,    vs49,   alpha_vr
+       xvmaddasp       vs2,    vs50,   alpha_vr
+       xvmaddasp       vs3,    vs51,   alpha_vr
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+       stxvw4x         vs2,    o32,    T1
+       stxvw4x         vs3,    o48,    T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+       lxvw4x          vs2,    o32,    T1
+       lxvw4x          vs3,    o48,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xvmulsp         vs0,    vs52,   alpha_vr
+       xvmulsp         vs1,    vs53,   alpha_vr
+       xvmulsp         vs2,    vs54,   alpha_vr
+       xvmulsp         vs3,    vs55,   alpha_vr
+#else
+       xvmaddasp       vs0,    vs52,   alpha_vr
+       xvmaddasp       vs1,    vs53,   alpha_vr
+       xvmaddasp       vs2,    vs54,   alpha_vr
+       xvmaddasp       vs3,    vs55,   alpha_vr
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+       stxvw4x         vs2,    o32,    T1
+       stxvw4x         vs3,    o48,    T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+       lxvw4x          vs2,    o32,    T1
+       lxvw4x          vs3,    o48,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xvmulsp         vs0,    vs56,   alpha_vr
+       xvmulsp         vs1,    vs57,   alpha_vr
+       xvmulsp         vs2,    vs58,   alpha_vr
+       xvmulsp         vs3,    vs59,   alpha_vr
+#else
+       xvmaddasp       vs0,    vs56,   alpha_vr
+       xvmaddasp       vs1,    vs57,   alpha_vr
+       xvmaddasp       vs2,    vs58,   alpha_vr
+       xvmaddasp       vs3,    vs59,   alpha_vr
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+       stxvw4x         vs2,    o32,    T1
+       stxvw4x         vs3,    o48,    T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+       lxvw4x          vs2,    o32,    T1
+       lxvw4x          vs3,    o48,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xvmulsp         vs0,    vs60,   alpha_vr
+       xvmulsp         vs1,    vs61,   alpha_vr
+       xvmulsp         vs2,    vs62,   alpha_vr
+       xvmulsp         vs3,    vs63,   alpha_vr
+#else
+       xvmaddasp       vs0,    vs60,   alpha_vr
+       xvmaddasp       vs1,    vs61,   alpha_vr
+       xvmaddasp       vs2,    vs62,   alpha_vr
+       xvmaddasp       vs3,    vs63,   alpha_vr
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+       stxvw4x         vs2,    o32,    T1
+       stxvw4x         vs3,    o48,    T1
+
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     64
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=8 and M=8
+**********************************************************************************************/
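+
+// Same scheme as the 16x8 block, but each k-step loads only two A vectors
+// (8 floats) and accumulates the 8x8 tile in vs32-vs47; SAVE8x8 stores two
+// vectors per column of C.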
+
+.macro LOAD8x8_1
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+       xxspltw         vs10,   vs28,   2
+       xxspltw         vs11,   vs28,   3
+
+       lxvw4x          vs29,   o16,    BO
+
+       xxspltw         vs12,   vs29,   0
+       xxspltw         vs13,   vs29,   1
+       xxspltw         vs14,   vs29,   2
+       xxspltw         vs15,   vs29,   3
+
+       addi            BO,     BO,     32
+
+.endm
+
+.macro KERNEL8x8_I1
+
+
+       lxvw4x          vs4,    o0,     AO
+       lxvw4x          vs5,    o16,    AO
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs16,   vs28,   0
+       xxspltw         vs17,   vs28,   1
+       xxspltw         vs18,   vs28,   2
+       xxspltw         vs19,   vs28,   3
+
+       lxvw4x          vs29,   o16,    BO
+
+       xxspltw         vs20,   vs29,   0
+       xxspltw         vs21,   vs29,   1
+       xxspltw         vs22,   vs29,   2
+       xxspltw         vs23,   vs29,   3
+
+       addi            BO,     BO,     32
+
+
+       xvmulsp         vs32,   vs0,    vs8
+       xvmulsp         vs33,   vs1,    vs8
+
+       xvmulsp         vs34,   vs0,    vs9
+       xvmulsp         vs35,   vs1,    vs9
+
+       xvmulsp         vs36,   vs0,    vs10
+       xvmulsp         vs37,   vs1,    vs10
+
+       xvmulsp         vs38,   vs0,    vs11
+       xvmulsp         vs39,   vs1,    vs11
+
+       xvmulsp         vs40,   vs0,    vs12
+       xvmulsp         vs41,   vs1,    vs12
+
+       xvmulsp         vs42,   vs0,    vs13
+       xvmulsp         vs43,   vs1,    vs13
+
+       xvmulsp         vs44,   vs0,    vs14
+       xvmulsp         vs45,   vs1,    vs14
+
+       xvmulsp         vs46,   vs0,    vs15
+       xvmulsp         vs47,   vs1,    vs15
+
+
+.endm
+
+.macro KERNEL8x8_1
+
+
+       lxvw4x          vs4,    o0,     AO
+       lxvw4x          vs5,    o16,    AO
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs16,   vs28,   0
+       xxspltw         vs17,   vs28,   1
+       xxspltw         vs18,   vs28,   2
+       xxspltw         vs19,   vs28,   3
+
+       lxvw4x          vs29,   o16,    BO
+
+       xxspltw         vs20,   vs29,   0
+       xxspltw         vs21,   vs29,   1
+       xxspltw         vs22,   vs29,   2
+       xxspltw         vs23,   vs29,   3
+
+       addi            BO,     BO,     32
+
+
+       xvmaddasp       vs32,   vs0,    vs8
+       xvmaddasp       vs33,   vs1,    vs8
+
+       xvmaddasp       vs34,   vs0,    vs9
+       xvmaddasp       vs35,   vs1,    vs9
+
+       xvmaddasp       vs36,   vs0,    vs10
+       xvmaddasp       vs37,   vs1,    vs10
+
+       xvmaddasp       vs38,   vs0,    vs11
+       xvmaddasp       vs39,   vs1,    vs11
+
+       xvmaddasp       vs40,   vs0,    vs12
+       xvmaddasp       vs41,   vs1,    vs12
+
+       xvmaddasp       vs42,   vs0,    vs13
+       xvmaddasp       vs43,   vs1,    vs13
+
+       xvmaddasp       vs44,   vs0,    vs14
+       xvmaddasp       vs45,   vs1,    vs14
+
+       xvmaddasp       vs46,   vs0,    vs15
+       xvmaddasp       vs47,   vs1,    vs15
+
+
+.endm
+
+.macro KERNEL8x8_2
+
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+       xxspltw         vs10,   vs28,   2
+       xxspltw         vs11,   vs28,   3
+
+       lxvw4x          vs29,   o16,    BO
+
+       xxspltw         vs12,   vs29,   0
+       xxspltw         vs13,   vs29,   1
+       xxspltw         vs14,   vs29,   2
+       xxspltw         vs15,   vs29,   3
+
+       addi            BO,     BO,     32
+
+
+       xvmaddasp       vs32,   vs4,    vs16
+       xvmaddasp       vs33,   vs5,    vs16
+
+       xvmaddasp       vs34,   vs4,    vs17
+       xvmaddasp       vs35,   vs5,    vs17
+
+       xvmaddasp       vs36,   vs4,    vs18
+       xvmaddasp       vs37,   vs5,    vs18
+
+       xvmaddasp       vs38,   vs4,    vs19
+       xvmaddasp       vs39,   vs5,    vs19
+
+       xvmaddasp       vs40,   vs4,    vs20
+       xvmaddasp       vs41,   vs5,    vs20
+
+       xvmaddasp       vs42,   vs4,    vs21
+       xvmaddasp       vs43,   vs5,    vs21
+
+       xvmaddasp       vs44,   vs4,    vs22
+       xvmaddasp       vs45,   vs5,    vs22
+
+       xvmaddasp       vs46,   vs4,    vs23
+       xvmaddasp       vs47,   vs5,    vs23
+
+
+.endm
+
+.macro KERNEL8x8_E2
+
+
+       xvmaddasp       vs32,   vs4,    vs16
+       xvmaddasp       vs33,   vs5,    vs16
+
+       xvmaddasp       vs34,   vs4,    vs17
+       xvmaddasp       vs35,   vs5,    vs17
+
+       xvmaddasp       vs36,   vs4,    vs18
+       xvmaddasp       vs37,   vs5,    vs18
+
+       xvmaddasp       vs38,   vs4,    vs19
+       xvmaddasp       vs39,   vs5,    vs19
+
+       xvmaddasp       vs40,   vs4,    vs20
+       xvmaddasp       vs41,   vs5,    vs20
+
+       xvmaddasp       vs42,   vs4,    vs21
+       xvmaddasp       vs43,   vs5,    vs21
+
+       xvmaddasp       vs44,   vs4,    vs22
+       xvmaddasp       vs45,   vs5,    vs22
+
+       xvmaddasp       vs46,   vs4,    vs23
+       xvmaddasp       vs47,   vs5,    vs23
+
+
+.endm
+
+.macro KERNEL8x8_SUBI1
+
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+       xxspltw         vs10,   vs28,   2
+       xxspltw         vs11,   vs28,   3
+
+       lxvw4x          vs29,   o16,    BO
+
+       xxspltw         vs12,   vs29,   0
+       xxspltw         vs13,   vs29,   1
+       xxspltw         vs14,   vs29,   2
+       xxspltw         vs15,   vs29,   3
+
+       addi            BO,     BO,     32
+
+
+       xvmulsp         vs32,   vs0,    vs8
+       xvmulsp         vs33,   vs1,    vs8
+
+       xvmulsp         vs34,   vs0,    vs9
+       xvmulsp         vs35,   vs1,    vs9
+
+       xvmulsp         vs36,   vs0,    vs10
+       xvmulsp         vs37,   vs1,    vs10
+
+       xvmulsp         vs38,   vs0,    vs11
+       xvmulsp         vs39,   vs1,    vs11
+
+       xvmulsp         vs40,   vs0,    vs12
+       xvmulsp         vs41,   vs1,    vs12
+
+       xvmulsp         vs42,   vs0,    vs13
+       xvmulsp         vs43,   vs1,    vs13
+
+       xvmulsp         vs44,   vs0,    vs14
+       xvmulsp         vs45,   vs1,    vs14
+
+       xvmulsp         vs46,   vs0,    vs15
+       xvmulsp         vs47,   vs1,    vs15
+
+
+.endm
+
+.macro KERNEL8x8_SUB1
+
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+       xxspltw         vs10,   vs28,   2
+       xxspltw         vs11,   vs28,   3
+
+       lxvw4x          vs29,   o16,    BO
+
+       xxspltw         vs12,   vs29,   0
+       xxspltw         vs13,   vs29,   1
+       xxspltw         vs14,   vs29,   2
+       xxspltw         vs15,   vs29,   3
+
+       addi            BO,     BO,     32
+
+
+       xvmaddasp       vs32,   vs0,    vs8
+       xvmaddasp       vs33,   vs1,    vs8
+
+       xvmaddasp       vs34,   vs0,    vs9
+       xvmaddasp       vs35,   vs1,    vs9
+
+       xvmaddasp       vs36,   vs0,    vs10
+       xvmaddasp       vs37,   vs1,    vs10
+
+       xvmaddasp       vs38,   vs0,    vs11
+       xvmaddasp       vs39,   vs1,    vs11
+
+       xvmaddasp       vs40,   vs0,    vs12
+       xvmaddasp       vs41,   vs1,    vs12
+
+       xvmaddasp       vs42,   vs0,    vs13
+       xvmaddasp       vs43,   vs1,    vs13
+
+       xvmaddasp       vs44,   vs0,    vs14
+       xvmaddasp       vs45,   vs1,    vs14
+
+       xvmaddasp       vs46,   vs0,    vs15
+       xvmaddasp       vs47,   vs1,    vs15
+
+
+.endm
+
+.macro SAVE8x8
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xvmulsp         vs0,    vs32,   alpha_vr
+       xvmulsp         vs1,    vs33,   alpha_vr
+#else
+       xvmaddasp       vs0,    vs32,   alpha_vr
+       xvmaddasp       vs1,    vs33,   alpha_vr
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xvmulsp         vs0,    vs34,   alpha_vr
+       xvmulsp         vs1,    vs35,   alpha_vr
+#else
+       xvmaddasp       vs0,    vs34,   alpha_vr
+       xvmaddasp       vs1,    vs35,   alpha_vr
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xvmulsp         vs0,    vs36,   alpha_vr
+       xvmulsp         vs1,    vs37,   alpha_vr
+#else
+       xvmaddasp       vs0,    vs36,   alpha_vr
+       xvmaddasp       vs1,    vs37,   alpha_vr
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xvmulsp         vs0,    vs38,   alpha_vr
+       xvmulsp         vs1,    vs39,   alpha_vr
+#else
+       xvmaddasp       vs0,    vs38,   alpha_vr
+       xvmaddasp       vs1,    vs39,   alpha_vr
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xvmulsp         vs0,    vs40,   alpha_vr
+       xvmulsp         vs1,    vs41,   alpha_vr
+#else
+       xvmaddasp       vs0,    vs40,   alpha_vr
+       xvmaddasp       vs1,    vs41,   alpha_vr
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xvmulsp         vs0,    vs42,   alpha_vr
+       xvmulsp         vs1,    vs43,   alpha_vr
+#else
+       xvmaddasp       vs0,    vs42,   alpha_vr
+       xvmaddasp       vs1,    vs43,   alpha_vr
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xvmulsp         vs0,    vs44,   alpha_vr
+       xvmulsp         vs1,    vs45,   alpha_vr
+#else
+       xvmaddasp       vs0,    vs44,   alpha_vr
+       xvmaddasp       vs1,    vs45,   alpha_vr
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xvmulsp         vs0,    vs46,   alpha_vr
+       xvmulsp         vs1,    vs47,   alpha_vr
+#else
+       xvmaddasp       vs0,    vs46,   alpha_vr
+       xvmaddasp       vs1,    vs47,   alpha_vr
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     32
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=8 and M=4
+**********************************************************************************************/
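+
+// 4x8 variant: a single A vector per k-step, accumulators vs32-vs39, and one
+// vector of C stored per column in SAVE8x4.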
+
+.macro LOAD8x4_1
+
+       lxvw4x          vs0,    o0,     AO
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+       xxspltw         vs10,   vs28,   2
+       xxspltw         vs11,   vs28,   3
+
+       lxvw4x          vs29,   o16,    BO
+
+       xxspltw         vs12,   vs29,   0
+       xxspltw         vs13,   vs29,   1
+       xxspltw         vs14,   vs29,   2
+       xxspltw         vs15,   vs29,   3
+
+       addi            BO,     BO,     32
+
+.endm
+
+.macro KERNEL8x4_I1
+
+
+       lxvw4x          vs4,    o0,     AO
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs16,   vs28,   0
+       xxspltw         vs17,   vs28,   1
+       xxspltw         vs18,   vs28,   2
+       xxspltw         vs19,   vs28,   3
+
+       lxvw4x          vs29,   o16,    BO
+
+       xxspltw         vs20,   vs29,   0
+       xxspltw         vs21,   vs29,   1
+       xxspltw         vs22,   vs29,   2
+       xxspltw         vs23,   vs29,   3
+
+       addi            BO,     BO,     32
+
+
+       xvmulsp         vs32,   vs0,    vs8
+
+       xvmulsp         vs33,   vs0,    vs9
+
+       xvmulsp         vs34,   vs0,    vs10
+
+       xvmulsp         vs35,   vs0,    vs11
+
+       xvmulsp         vs36,   vs0,    vs12
+
+       xvmulsp         vs37,   vs0,    vs13
+
+       xvmulsp         vs38,   vs0,    vs14
+
+       xvmulsp         vs39,   vs0,    vs15
+
+
+.endm
+
+.macro KERNEL8x4_1
+
+
+       lxvw4x          vs4,    o0,     AO
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs16,   vs28,   0
+       xxspltw         vs17,   vs28,   1
+       xxspltw         vs18,   vs28,   2
+       xxspltw         vs19,   vs28,   3
+
+       lxvw4x          vs29,   o16,    BO
+
+       xxspltw         vs20,   vs29,   0
+       xxspltw         vs21,   vs29,   1
+       xxspltw         vs22,   vs29,   2
+       xxspltw         vs23,   vs29,   3
+
+       addi            BO,     BO,     32
+
+
+       xvmaddasp       vs32,   vs0,    vs8
+
+       xvmaddasp       vs33,   vs0,    vs9
+
+       xvmaddasp       vs34,   vs0,    vs10
+
+       xvmaddasp       vs35,   vs0,    vs11
+
+       xvmaddasp       vs36,   vs0,    vs12
+
+       xvmaddasp       vs37,   vs0,    vs13
+
+       xvmaddasp       vs38,   vs0,    vs14
+
+       xvmaddasp       vs39,   vs0,    vs15
+
+
+.endm
+
+.macro KERNEL8x4_2
+
+
+       lxvw4x          vs0,    o0,     AO
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+       xxspltw         vs10,   vs28,   2
+       xxspltw         vs11,   vs28,   3
+
+       lxvw4x          vs29,   o16,    BO
+
+       xxspltw         vs12,   vs29,   0
+       xxspltw         vs13,   vs29,   1
+       xxspltw         vs14,   vs29,   2
+       xxspltw         vs15,   vs29,   3
+
+       addi            BO,     BO,     32
+
+
+       xvmaddasp       vs32,   vs4,    vs16
+
+       xvmaddasp       vs33,   vs4,    vs17
+
+       xvmaddasp       vs34,   vs4,    vs18
+
+       xvmaddasp       vs35,   vs4,    vs19
+
+       xvmaddasp       vs36,   vs4,    vs20
+
+       xvmaddasp       vs37,   vs4,    vs21
+
+       xvmaddasp       vs38,   vs4,    vs22
+
+       xvmaddasp       vs39,   vs4,    vs23
+
+
+.endm
+
+.macro KERNEL8x4_E2
+
+
+       xvmaddasp       vs32,   vs4,    vs16
+
+       xvmaddasp       vs33,   vs4,    vs17
+
+       xvmaddasp       vs34,   vs4,    vs18
+
+       xvmaddasp       vs35,   vs4,    vs19
+
+       xvmaddasp       vs36,   vs4,    vs20
+
+       xvmaddasp       vs37,   vs4,    vs21
+
+       xvmaddasp       vs38,   vs4,    vs22
+
+       xvmaddasp       vs39,   vs4,    vs23
+
+
+.endm
+
+.macro KERNEL8x4_SUBI1
+
+
+       lxvw4x          vs0,    o0,     AO
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+       xxspltw         vs10,   vs28,   2
+       xxspltw         vs11,   vs28,   3
+
+       lxvw4x          vs29,   o16,    BO
+
+       xxspltw         vs12,   vs29,   0
+       xxspltw         vs13,   vs29,   1
+       xxspltw         vs14,   vs29,   2
+       xxspltw         vs15,   vs29,   3
+
+       addi            BO,     BO,     32
+
+
+       xvmulsp         vs32,   vs0,    vs8
+
+       xvmulsp         vs33,   vs0,    vs9
+
+       xvmulsp         vs34,   vs0,    vs10
+
+       xvmulsp         vs35,   vs0,    vs11
+
+       xvmulsp         vs36,   vs0,    vs12
+
+       xvmulsp         vs37,   vs0,    vs13
+
+       xvmulsp         vs38,   vs0,    vs14
+
+       xvmulsp         vs39,   vs0,    vs15
+
+
+.endm
+
+.macro KERNEL8x4_SUB1
+
+
+       lxvw4x          vs0,    o0,     AO
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+       xxspltw         vs10,   vs28,   2
+       xxspltw         vs11,   vs28,   3
+
+       lxvw4x          vs29,   o16,    BO
+
+       xxspltw         vs12,   vs29,   0
+       xxspltw         vs13,   vs29,   1
+       xxspltw         vs14,   vs29,   2
+       xxspltw         vs15,   vs29,   3
+
+       addi            BO,     BO,     32
+
+
+       xvmaddasp       vs32,   vs0,    vs8
+
+       xvmaddasp       vs33,   vs0,    vs9
+
+       xvmaddasp       vs34,   vs0,    vs10
+
+       xvmaddasp       vs35,   vs0,    vs11
+
+       xvmaddasp       vs36,   vs0,    vs12
+
+       xvmaddasp       vs37,   vs0,    vs13
+
+       xvmaddasp       vs38,   vs0,    vs14
+
+       xvmaddasp       vs39,   vs0,    vs15
+
+
+.endm
+
+.macro SAVE8x4
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xvmulsp         vs0,    vs32,   alpha_vr
+#else
+       xvmaddasp       vs0,    vs32,   alpha_vr
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xvmulsp         vs0,    vs33,   alpha_vr
+#else
+       xvmaddasp       vs0,    vs33,   alpha_vr
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xvmulsp         vs0,    vs34,   alpha_vr
+#else
+       xvmaddasp       vs0,    vs34,   alpha_vr
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xvmulsp         vs0,    vs35,   alpha_vr
+#else
+       xvmaddasp       vs0,    vs35,   alpha_vr
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xvmulsp         vs0,    vs36,   alpha_vr
+#else
+       xvmaddasp       vs0,    vs36,   alpha_vr
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xvmulsp         vs0,    vs37,   alpha_vr
+#else
+       xvmaddasp       vs0,    vs37,   alpha_vr
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xvmulsp         vs0,    vs38,   alpha_vr
+#else
+       xvmaddasp       vs0,    vs38,   alpha_vr
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xvmulsp         vs0,    vs39,   alpha_vr
+#else
+       xvmaddasp       vs0,    vs39,   alpha_vr
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     16
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=8 and M=2
+**********************************************************************************************/
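+
+/* Same pipelining scheme as the 8x4 group, but with M=2 the A values are
+*  loaded as scalars (lxsspx) and the 16 accumulators vs32-vs47 are built
+*  with scalar double-precision FMAs (xsmaddadp); SAVE8x2 therefore scales
+*  with the scalar alpha_r instead of alpha_vr. */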
+
+.macro LOAD8x2_1
+
+       lxsspx          vs0,    o0,     AO
+       lxsspx          vs1,    o4,     AO
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+       lxsspx          vs9,    o4,     T1
+       lxsspx          vs10,   o8,     T1
+       lxsspx          vs11,   o12,    T1
+
+       addi            T1,     T1,     16
+
+       lxsspx          vs12,   o0,     T1
+       lxsspx          vs13,   o4,     T1
+       lxsspx          vs14,   o8,     T1
+       lxsspx          vs15,   o12,    T1
+
+       addi            BO,     BO,     32
+
+.endm
+
+.macro KERNEL8x2_I1
+
+
+       lxsspx          vs4,    o0,     AO
+       lxsspx          vs5,    o4,     AO
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs16,   o0,     T1
+       lxsspx          vs17,   o4,     T1
+       lxsspx          vs18,   o8,     T1
+       lxsspx          vs19,   o12,    T1
+
+       addi            T1,     T1,     16
+
+       lxsspx          vs20,   o0,     T1
+       lxsspx          vs21,   o4,     T1
+       lxsspx          vs22,   o8,     T1
+       lxsspx          vs23,   o12,    T1
+
+       addi            BO,     BO,     32
+
+
+       xsmuldp         vs32,   vs0,    vs8
+       xsmuldp         vs33,   vs1,    vs8
+
+       xsmuldp         vs34,   vs0,    vs9
+       xsmuldp         vs35,   vs1,    vs9
+
+       xsmuldp         vs36,   vs0,    vs10
+       xsmuldp         vs37,   vs1,    vs10
+
+       xsmuldp         vs38,   vs0,    vs11
+       xsmuldp         vs39,   vs1,    vs11
+
+       xsmuldp         vs40,   vs0,    vs12
+       xsmuldp         vs41,   vs1,    vs12
+
+       xsmuldp         vs42,   vs0,    vs13
+       xsmuldp         vs43,   vs1,    vs13
+
+       xsmuldp         vs44,   vs0,    vs14
+       xsmuldp         vs45,   vs1,    vs14
+
+       xsmuldp         vs46,   vs0,    vs15
+       xsmuldp         vs47,   vs1,    vs15
+
+
+.endm
+
+.macro KERNEL8x2_1
+
+
+       lxsspx          vs4,    o0,     AO
+       lxsspx          vs5,    o4,     AO
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs16,   o0,     T1
+       lxsspx          vs17,   o4,     T1
+       lxsspx          vs18,   o8,     T1
+       lxsspx          vs19,   o12,    T1
+
+       addi            T1,     T1,     16
+
+       lxsspx          vs20,   o0,     T1
+       lxsspx          vs21,   o4,     T1
+       lxsspx          vs22,   o8,     T1
+       lxsspx          vs23,   o12,    T1
+
+       addi            BO,     BO,     32
+
+
+       xsmaddadp       vs32,   vs0,    vs8
+       xsmaddadp       vs33,   vs1,    vs8
+
+       xsmaddadp       vs34,   vs0,    vs9
+       xsmaddadp       vs35,   vs1,    vs9
+
+       xsmaddadp       vs36,   vs0,    vs10
+       xsmaddadp       vs37,   vs1,    vs10
+
+       xsmaddadp       vs38,   vs0,    vs11
+       xsmaddadp       vs39,   vs1,    vs11
+
+       xsmaddadp       vs40,   vs0,    vs12
+       xsmaddadp       vs41,   vs1,    vs12
+
+       xsmaddadp       vs42,   vs0,    vs13
+       xsmaddadp       vs43,   vs1,    vs13
+
+       xsmaddadp       vs44,   vs0,    vs14
+       xsmaddadp       vs45,   vs1,    vs14
+
+       xsmaddadp       vs46,   vs0,    vs15
+       xsmaddadp       vs47,   vs1,    vs15
+
+
+.endm
+
+.macro KERNEL8x2_2
+
+
+       lxsspx          vs0,    o0,     AO
+       lxsspx          vs1,    o4,     AO
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+       lxsspx          vs9,    o4,     T1
+       lxsspx          vs10,   o8,     T1
+       lxsspx          vs11,   o12,    T1
+
+       addi            T1,     T1,     16
+
+       lxsspx          vs12,   o0,     T1
+       lxsspx          vs13,   o4,     T1
+       lxsspx          vs14,   o8,     T1
+       lxsspx          vs15,   o12,    T1
+
+       addi            BO,     BO,     32
+
+
+       xsmaddadp       vs32,   vs4,    vs16
+       xsmaddadp       vs33,   vs5,    vs16
+
+       xsmaddadp       vs34,   vs4,    vs17
+       xsmaddadp       vs35,   vs5,    vs17
+
+       xsmaddadp       vs36,   vs4,    vs18
+       xsmaddadp       vs37,   vs5,    vs18
+
+       xsmaddadp       vs38,   vs4,    vs19
+       xsmaddadp       vs39,   vs5,    vs19
+
+       xsmaddadp       vs40,   vs4,    vs20
+       xsmaddadp       vs41,   vs5,    vs20
+
+       xsmaddadp       vs42,   vs4,    vs21
+       xsmaddadp       vs43,   vs5,    vs21
+
+       xsmaddadp       vs44,   vs4,    vs22
+       xsmaddadp       vs45,   vs5,    vs22
+
+       xsmaddadp       vs46,   vs4,    vs23
+       xsmaddadp       vs47,   vs5,    vs23
+
+
+.endm
+
+.macro KERNEL8x2_E2
+
+
+       xsmaddadp       vs32,   vs4,    vs16
+       xsmaddadp       vs33,   vs5,    vs16
+
+       xsmaddadp       vs34,   vs4,    vs17
+       xsmaddadp       vs35,   vs5,    vs17
+
+       xsmaddadp       vs36,   vs4,    vs18
+       xsmaddadp       vs37,   vs5,    vs18
+
+       xsmaddadp       vs38,   vs4,    vs19
+       xsmaddadp       vs39,   vs5,    vs19
+
+       xsmaddadp       vs40,   vs4,    vs20
+       xsmaddadp       vs41,   vs5,    vs20
+
+       xsmaddadp       vs42,   vs4,    vs21
+       xsmaddadp       vs43,   vs5,    vs21
+
+       xsmaddadp       vs44,   vs4,    vs22
+       xsmaddadp       vs45,   vs5,    vs22
+
+       xsmaddadp       vs46,   vs4,    vs23
+       xsmaddadp       vs47,   vs5,    vs23
+
+
+.endm
+
+.macro KERNEL8x2_SUBI1
+
+
+       lxsspx          vs0,    o0,     AO
+       lxsspx          vs1,    o4,     AO
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+       lxsspx          vs9,    o4,     T1
+       lxsspx          vs10,   o8,     T1
+       lxsspx          vs11,   o12,    T1
+
+       addi            T1,     T1,     16
+
+       lxsspx          vs12,   o0,     T1
+       lxsspx          vs13,   o4,     T1
+       lxsspx          vs14,   o8,     T1
+       lxsspx          vs15,   o12,    T1
+
+       addi            BO,     BO,     32
+
+
+       xsmuldp         vs32,   vs0,    vs8
+       xsmuldp         vs33,   vs1,    vs8
+
+       xsmuldp         vs34,   vs0,    vs9
+       xsmuldp         vs35,   vs1,    vs9
+
+       xsmuldp         vs36,   vs0,    vs10
+       xsmuldp         vs37,   vs1,    vs10
+
+       xsmuldp         vs38,   vs0,    vs11
+       xsmuldp         vs39,   vs1,    vs11
+
+       xsmuldp         vs40,   vs0,    vs12
+       xsmuldp         vs41,   vs1,    vs12
+
+       xsmuldp         vs42,   vs0,    vs13
+       xsmuldp         vs43,   vs1,    vs13
+
+       xsmuldp         vs44,   vs0,    vs14
+       xsmuldp         vs45,   vs1,    vs14
+
+       xsmuldp         vs46,   vs0,    vs15
+       xsmuldp         vs47,   vs1,    vs15
+
+
+.endm
+
+.macro KERNEL8x2_SUB1
+
+
+       lxsspx          vs0,    o0,     AO
+       lxsspx          vs1,    o4,     AO
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+       lxsspx          vs9,    o4,     T1
+       lxsspx          vs10,   o8,     T1
+       lxsspx          vs11,   o12,    T1
+
+       addi            T1,     T1,     16
+
+       lxsspx          vs12,   o0,     T1
+       lxsspx          vs13,   o4,     T1
+       lxsspx          vs14,   o8,     T1
+       lxsspx          vs15,   o12,    T1
+
+       addi            BO,     BO,     32
+
+
+       xsmaddadp       vs32,   vs0,    vs8
+       xsmaddadp       vs33,   vs1,    vs8
+
+       xsmaddadp       vs34,   vs0,    vs9
+       xsmaddadp       vs35,   vs1,    vs9
+
+       xsmaddadp       vs36,   vs0,    vs10
+       xsmaddadp       vs37,   vs1,    vs10
+
+       xsmaddadp       vs38,   vs0,    vs11
+       xsmaddadp       vs39,   vs1,    vs11
+
+       xsmaddadp       vs40,   vs0,    vs12
+       xsmaddadp       vs41,   vs1,    vs12
+
+       xsmaddadp       vs42,   vs0,    vs13
+       xsmaddadp       vs43,   vs1,    vs13
+
+       xsmaddadp       vs44,   vs0,    vs14
+       xsmaddadp       vs45,   vs1,    vs14
+
+       xsmaddadp       vs46,   vs0,    vs15
+       xsmaddadp       vs47,   vs1,    vs15
+
+
+.endm
+
+.macro SAVE8x2
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+       lxsspx          vs1,    o4,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xsmuldp         vs0,    vs32,   alpha_r
+       xsmuldp         vs1,    vs33,   alpha_r
+#else
+       xsmaddadp       vs0,    vs32,   alpha_r
+       xsmaddadp       vs1,    vs33,   alpha_r
+#endif
+
+       stxsspx         vs0,    o0,     T1
+       stxsspx         vs1,    o4,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+       lxsspx          vs1,    o4,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xsmuldp         vs0,    vs34,   alpha_r
+       xsmuldp         vs1,    vs35,   alpha_r
+#else
+       xsmaddadp       vs0,    vs34,   alpha_r
+       xsmaddadp       vs1,    vs35,   alpha_r
+#endif
+
+       stxsspx         vs0,    o0,     T1
+       stxsspx         vs1,    o4,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+       lxsspx          vs1,    o4,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xsmuldp         vs0,    vs36,   alpha_r
+       xsmuldp         vs1,    vs37,   alpha_r
+#else
+       xsmaddadp       vs0,    vs36,   alpha_r
+       xsmaddadp       vs1,    vs37,   alpha_r
+#endif
+
+       stxsspx         vs0,    o0,     T1
+       stxsspx         vs1,    o4,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+       lxsspx          vs1,    o4,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xsmuldp         vs0,    vs38,   alpha_r
+       xsmuldp         vs1,    vs39,   alpha_r
+#else
+       xsmaddadp       vs0,    vs38,   alpha_r
+       xsmaddadp       vs1,    vs39,   alpha_r
+#endif
+
+       stxsspx         vs0,    o0,     T1
+       stxsspx         vs1,    o4,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+       lxsspx          vs1,    o4,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xsmuldp         vs0,    vs40,   alpha_r
+       xsmuldp         vs1,    vs41,   alpha_r
+#else
+       xsmaddadp       vs0,    vs40,   alpha_r
+       xsmaddadp       vs1,    vs41,   alpha_r
+#endif
+
+       stxsspx         vs0,    o0,     T1
+       stxsspx         vs1,    o4,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+       lxsspx          vs1,    o4,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xsmuldp         vs0,    vs42,   alpha_r
+       xsmuldp         vs1,    vs43,   alpha_r
+#else
+       xsmaddadp       vs0,    vs42,   alpha_r
+       xsmaddadp       vs1,    vs43,   alpha_r
+#endif
+
+       stxsspx         vs0,    o0,     T1
+       stxsspx         vs1,    o4,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+       lxsspx          vs1,    o4,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xsmuldp         vs0,    vs44,   alpha_r
+       xsmuldp         vs1,    vs45,   alpha_r
+#else
+       xsmaddadp       vs0,    vs44,   alpha_r
+       xsmaddadp       vs1,    vs45,   alpha_r
+#endif
+
+       stxsspx         vs0,    o0,     T1
+       stxsspx         vs1,    o4,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+       lxsspx          vs1,    o4,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xsmuldp         vs0,    vs46,   alpha_r
+       xsmuldp         vs1,    vs47,   alpha_r
+#else
+       xsmaddadp       vs0,    vs46,   alpha_r
+       xsmaddadp       vs1,    vs47,   alpha_r
+#endif
+
+       stxsspx         vs0,    o0,     T1
+       stxsspx         vs1,    o4,     T1
+
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     8
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=8 and M=1
+**********************************************************************************************/
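+
+/* M=1 variant: one A scalar per K step against eight B scalars, giving
+*  the eight accumulators vs32-vs39; all arithmetic is scalar (xsmuldp /
+*  xsmaddadp) and SAVE8x1 applies alpha_r. */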
+
+.macro LOAD8x1_1
+
+       lxsspx          vs0,    o0,     AO
+
+       addi            AO,     AO,     4
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+       lxsspx          vs9,    o4,     T1
+       lxsspx          vs10,   o8,     T1
+       lxsspx          vs11,   o12,    T1
+
+       addi            T1,     T1,     16
+
+       lxsspx          vs12,   o0,     T1
+       lxsspx          vs13,   o4,     T1
+       lxsspx          vs14,   o8,     T1
+       lxsspx          vs15,   o12,    T1
+
+       addi            BO,     BO,     32
+
+.endm
+
+.macro KERNEL8x1_I1
+
+
+       lxsspx          vs4,    o0,     AO
+
+       addi            AO,     AO,     4
+
+       mr              T1,     BO
+
+       lxsspx          vs16,   o0,     T1
+       lxsspx          vs17,   o4,     T1
+       lxsspx          vs18,   o8,     T1
+       lxsspx          vs19,   o12,    T1
+
+       addi            T1,     T1,     16
+
+       lxsspx          vs20,   o0,     T1
+       lxsspx          vs21,   o4,     T1
+       lxsspx          vs22,   o8,     T1
+       lxsspx          vs23,   o12,    T1
+
+       addi            BO,     BO,     32
+
+
+       xsmuldp         vs32,   vs0,    vs8
+
+       xsmuldp         vs33,   vs0,    vs9
+
+       xsmuldp         vs34,   vs0,    vs10
+
+       xsmuldp         vs35,   vs0,    vs11
+
+       xsmuldp         vs36,   vs0,    vs12
+
+       xsmuldp         vs37,   vs0,    vs13
+
+       xsmuldp         vs38,   vs0,    vs14
+
+       xsmuldp         vs39,   vs0,    vs15
+
+
+.endm
+
+.macro KERNEL8x1_1
+
+
+       lxsspx          vs4,    o0,     AO
+
+       addi            AO,     AO,     4
+
+       mr              T1,     BO
+
+       lxsspx          vs16,   o0,     T1
+       lxsspx          vs17,   o4,     T1
+       lxsspx          vs18,   o8,     T1
+       lxsspx          vs19,   o12,    T1
+
+       addi            T1,     T1,     16
+
+       lxsspx          vs20,   o0,     T1
+       lxsspx          vs21,   o4,     T1
+       lxsspx          vs22,   o8,     T1
+       lxsspx          vs23,   o12,    T1
+
+       addi            BO,     BO,     32
+
+
+       xsmaddadp       vs32,   vs0,    vs8
+
+       xsmaddadp       vs33,   vs0,    vs9
+
+       xsmaddadp       vs34,   vs0,    vs10
+
+       xsmaddadp       vs35,   vs0,    vs11
+
+       xsmaddadp       vs36,   vs0,    vs12
+
+       xsmaddadp       vs37,   vs0,    vs13
+
+       xsmaddadp       vs38,   vs0,    vs14
+
+       xsmaddadp       vs39,   vs0,    vs15
+
+
+.endm
+
+.macro KERNEL8x1_2
+
+
+       lxsspx          vs0,    o0,     AO
+
+       addi            AO,     AO,     4
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+       lxsspx          vs9,    o4,     T1
+       lxsspx          vs10,   o8,     T1
+       lxsspx          vs11,   o12,    T1
+
+       addi            T1,     T1,     16
+
+       lxsspx          vs12,   o0,     T1
+       lxsspx          vs13,   o4,     T1
+       lxsspx          vs14,   o8,     T1
+       lxsspx          vs15,   o12,    T1
+
+       addi            BO,     BO,     32
+
+
+       xsmaddadp       vs32,   vs4,    vs16
+
+       xsmaddadp       vs33,   vs4,    vs17
+
+       xsmaddadp       vs34,   vs4,    vs18
+
+       xsmaddadp       vs35,   vs4,    vs19
+
+       xsmaddadp       vs36,   vs4,    vs20
+
+       xsmaddadp       vs37,   vs4,    vs21
+
+       xsmaddadp       vs38,   vs4,    vs22
+
+       xsmaddadp       vs39,   vs4,    vs23
+
+
+.endm
+
+.macro KERNEL8x1_E2
+
+
+       xsmaddadp       vs32,   vs4,    vs16
+
+       xsmaddadp       vs33,   vs4,    vs17
+
+       xsmaddadp       vs34,   vs4,    vs18
+
+       xsmaddadp       vs35,   vs4,    vs19
+
+       xsmaddadp       vs36,   vs4,    vs20
+
+       xsmaddadp       vs37,   vs4,    vs21
+
+       xsmaddadp       vs38,   vs4,    vs22
+
+       xsmaddadp       vs39,   vs4,    vs23
+
+
+.endm
+
+.macro KERNEL8x1_SUBI1
+
+
+       lxsspx          vs0,    o0,     AO
+
+       addi            AO,     AO,     4
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+       lxsspx          vs9,    o4,     T1
+       lxsspx          vs10,   o8,     T1
+       lxsspx          vs11,   o12,    T1
+
+       addi            T1,     T1,     16
+
+       lxsspx          vs12,   o0,     T1
+       lxsspx          vs13,   o4,     T1
+       lxsspx          vs14,   o8,     T1
+       lxsspx          vs15,   o12,    T1
+
+       addi            BO,     BO,     32
+
+
+       xsmuldp         vs32,   vs0,    vs8
+
+       xsmuldp         vs33,   vs0,    vs9
+
+       xsmuldp         vs34,   vs0,    vs10
+
+       xsmuldp         vs35,   vs0,    vs11
+
+       xsmuldp         vs36,   vs0,    vs12
+
+       xsmuldp         vs37,   vs0,    vs13
+
+       xsmuldp         vs38,   vs0,    vs14
+
+       xsmuldp         vs39,   vs0,    vs15
+
+
+.endm
+
+.macro KERNEL8x1_SUB1
+
+
+       lxsspx          vs0,    o0,     AO
+
+       addi            AO,     AO,     4
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+       lxsspx          vs9,    o4,     T1
+       lxsspx          vs10,   o8,     T1
+       lxsspx          vs11,   o12,    T1
+
+       addi            T1,     T1,     16
+
+       lxsspx          vs12,   o0,     T1
+       lxsspx          vs13,   o4,     T1
+       lxsspx          vs14,   o8,     T1
+       lxsspx          vs15,   o12,    T1
+
+       addi            BO,     BO,     32
+
+
+       xsmaddadp       vs32,   vs0,    vs8
+
+       xsmaddadp       vs33,   vs0,    vs9
+
+       xsmaddadp       vs34,   vs0,    vs10
+
+       xsmaddadp       vs35,   vs0,    vs11
+
+       xsmaddadp       vs36,   vs0,    vs12
+
+       xsmaddadp       vs37,   vs0,    vs13
+
+       xsmaddadp       vs38,   vs0,    vs14
+
+       xsmaddadp       vs39,   vs0,    vs15
+
+
+.endm
+
+.macro SAVE8x1
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xsmuldp         vs0,    vs32,   alpha_r
+#else
+       xsmaddadp       vs0,    vs32,   alpha_r
+#endif
+
+       stxsspx         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xsmuldp         vs0,    vs33,   alpha_r
+#else
+       xsmaddadp       vs0,    vs33,   alpha_r
+#endif
+
+       stxsspx         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xsmuldp         vs0,    vs34,   alpha_r
+#else
+       xsmaddadp       vs0,    vs34,   alpha_r
+#endif
+
+       stxsspx         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xsmuldp         vs0,    vs35,   alpha_r
+#else
+       xsmaddadp       vs0,    vs35,   alpha_r
+#endif
+
+       stxsspx         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xsmuldp         vs0,    vs36,   alpha_r
+#else
+       xsmaddadp       vs0,    vs36,   alpha_r
+#endif
+
+       stxsspx         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xsmuldp         vs0,    vs37,   alpha_r
+#else
+       xsmaddadp       vs0,    vs37,   alpha_r
+#endif
+
+       stxsspx         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xsmuldp         vs0,    vs38,   alpha_r
+#else
+       xsmaddadp       vs0,    vs38,   alpha_r
+#endif
+
+       stxsspx         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xsmuldp         vs0,    vs39,   alpha_r
+#else
+       xsmaddadp       vs0,    vs39,   alpha_r
+#endif
+
+       stxsspx         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     4
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=16
+**********************************************************************************************/
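+
+/* For N=4 only one lxvw4x of B is needed per K step (splatted into
+*  vs8-vs11 or vs16-vs19). With M=16, four A vectors (vs0-vs3 / vs4-vs7)
+*  feed the 16 accumulators vs32-vs47, and SAVE4x16 writes four vectors
+*  per column of C. The _I1/_1/_2/_E2/SUBI1/SUB1 roles are the same as in
+*  the N=8 groups above. */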
+
+.macro LOAD4x16_1
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+       lxvw4x          vs2,    o32,    AO
+       lxvw4x          vs3,    o48,    AO
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+       xxspltw         vs10,   vs28,   2
+       xxspltw         vs11,   vs28,   3
+
+       addi            BO,     BO,     16
+
+.endm
+
+.macro KERNEL4x16_I1
+
+
+       lxvw4x          vs4,    o0,     AO
+       lxvw4x          vs5,    o16,    AO
+       lxvw4x          vs6,    o32,    AO
+       lxvw4x          vs7,    o48,    AO
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs16,   vs28,   0
+       xxspltw         vs17,   vs28,   1
+       xxspltw         vs18,   vs28,   2
+       xxspltw         vs19,   vs28,   3
+
+       addi            BO,     BO,     16
+
+
+       xvmulsp         vs32,   vs0,    vs8
+       xvmulsp         vs33,   vs1,    vs8
+       xvmulsp         vs34,   vs2,    vs8
+       xvmulsp         vs35,   vs3,    vs8
+
+       xvmulsp         vs36,   vs0,    vs9
+       xvmulsp         vs37,   vs1,    vs9
+       xvmulsp         vs38,   vs2,    vs9
+       xvmulsp         vs39,   vs3,    vs9
+
+       xvmulsp         vs40,   vs0,    vs10
+       xvmulsp         vs41,   vs1,    vs10
+       xvmulsp         vs42,   vs2,    vs10
+       xvmulsp         vs43,   vs3,    vs10
+
+       xvmulsp         vs44,   vs0,    vs11
+       xvmulsp         vs45,   vs1,    vs11
+       xvmulsp         vs46,   vs2,    vs11
+       xvmulsp         vs47,   vs3,    vs11
+
+
+.endm
+
+.macro KERNEL4x16_1
+
+
+       lxvw4x          vs4,    o0,     AO
+       lxvw4x          vs5,    o16,    AO
+       lxvw4x          vs6,    o32,    AO
+       lxvw4x          vs7,    o48,    AO
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs16,   vs28,   0
+       xxspltw         vs17,   vs28,   1
+       xxspltw         vs18,   vs28,   2
+       xxspltw         vs19,   vs28,   3
+
+       addi            BO,     BO,     16
+
+
+       xvmaddasp       vs32,   vs0,    vs8
+       xvmaddasp       vs33,   vs1,    vs8
+       xvmaddasp       vs34,   vs2,    vs8
+       xvmaddasp       vs35,   vs3,    vs8
+
+       xvmaddasp       vs36,   vs0,    vs9
+       xvmaddasp       vs37,   vs1,    vs9
+       xvmaddasp       vs38,   vs2,    vs9
+       xvmaddasp       vs39,   vs3,    vs9
+
+       xvmaddasp       vs40,   vs0,    vs10
+       xvmaddasp       vs41,   vs1,    vs10
+       xvmaddasp       vs42,   vs2,    vs10
+       xvmaddasp       vs43,   vs3,    vs10
+
+       xvmaddasp       vs44,   vs0,    vs11
+       xvmaddasp       vs45,   vs1,    vs11
+       xvmaddasp       vs46,   vs2,    vs11
+       xvmaddasp       vs47,   vs3,    vs11
+
+
+.endm
+
+.macro KERNEL4x16_2
+
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+       lxvw4x          vs2,    o32,    AO
+       lxvw4x          vs3,    o48,    AO
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+       xxspltw         vs10,   vs28,   2
+       xxspltw         vs11,   vs28,   3
+
+       addi            BO,     BO,     16
+
+
+       xvmaddasp       vs32,   vs4,    vs16
+       xvmaddasp       vs33,   vs5,    vs16
+       xvmaddasp       vs34,   vs6,    vs16
+       xvmaddasp       vs35,   vs7,    vs16
+
+       xvmaddasp       vs36,   vs4,    vs17
+       xvmaddasp       vs37,   vs5,    vs17
+       xvmaddasp       vs38,   vs6,    vs17
+       xvmaddasp       vs39,   vs7,    vs17
+
+       xvmaddasp       vs40,   vs4,    vs18
+       xvmaddasp       vs41,   vs5,    vs18
+       xvmaddasp       vs42,   vs6,    vs18
+       xvmaddasp       vs43,   vs7,    vs18
+
+       xvmaddasp       vs44,   vs4,    vs19
+       xvmaddasp       vs45,   vs5,    vs19
+       xvmaddasp       vs46,   vs6,    vs19
+       xvmaddasp       vs47,   vs7,    vs19
+
+
+.endm
+
+.macro KERNEL4x16_E2
+
+
+       xvmaddasp       vs32,   vs4,    vs16
+       xvmaddasp       vs33,   vs5,    vs16
+       xvmaddasp       vs34,   vs6,    vs16
+       xvmaddasp       vs35,   vs7,    vs16
+
+       xvmaddasp       vs36,   vs4,    vs17
+       xvmaddasp       vs37,   vs5,    vs17
+       xvmaddasp       vs38,   vs6,    vs17
+       xvmaddasp       vs39,   vs7,    vs17
+
+       xvmaddasp       vs40,   vs4,    vs18
+       xvmaddasp       vs41,   vs5,    vs18
+       xvmaddasp       vs42,   vs6,    vs18
+       xvmaddasp       vs43,   vs7,    vs18
+
+       xvmaddasp       vs44,   vs4,    vs19
+       xvmaddasp       vs45,   vs5,    vs19
+       xvmaddasp       vs46,   vs6,    vs19
+       xvmaddasp       vs47,   vs7,    vs19
+
+
+.endm
+
+.macro KERNEL4x16_SUBI1
+
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+       lxvw4x          vs2,    o32,    AO
+       lxvw4x          vs3,    o48,    AO
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+       xxspltw         vs10,   vs28,   2
+       xxspltw         vs11,   vs28,   3
+
+       addi            BO,     BO,     16
+
+
+       xvmulsp         vs32,   vs0,    vs8
+       xvmulsp         vs33,   vs1,    vs8
+       xvmulsp         vs34,   vs2,    vs8
+       xvmulsp         vs35,   vs3,    vs8
+
+       xvmulsp         vs36,   vs0,    vs9
+       xvmulsp         vs37,   vs1,    vs9
+       xvmulsp         vs38,   vs2,    vs9
+       xvmulsp         vs39,   vs3,    vs9
+
+       xvmulsp         vs40,   vs0,    vs10
+       xvmulsp         vs41,   vs1,    vs10
+       xvmulsp         vs42,   vs2,    vs10
+       xvmulsp         vs43,   vs3,    vs10
+
+       xvmulsp         vs44,   vs0,    vs11
+       xvmulsp         vs45,   vs1,    vs11
+       xvmulsp         vs46,   vs2,    vs11
+       xvmulsp         vs47,   vs3,    vs11
+
+
+.endm
+
+.macro KERNEL4x16_SUB1
+
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+       lxvw4x          vs2,    o32,    AO
+       lxvw4x          vs3,    o48,    AO
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+       xxspltw         vs10,   vs28,   2
+       xxspltw         vs11,   vs28,   3
+
+       addi            BO,     BO,     16
+
+
+       xvmaddasp       vs32,   vs0,    vs8
+       xvmaddasp       vs33,   vs1,    vs8
+       xvmaddasp       vs34,   vs2,    vs8
+       xvmaddasp       vs35,   vs3,    vs8
+
+       xvmaddasp       vs36,   vs0,    vs9
+       xvmaddasp       vs37,   vs1,    vs9
+       xvmaddasp       vs38,   vs2,    vs9
+       xvmaddasp       vs39,   vs3,    vs9
+
+       xvmaddasp       vs40,   vs0,    vs10
+       xvmaddasp       vs41,   vs1,    vs10
+       xvmaddasp       vs42,   vs2,    vs10
+       xvmaddasp       vs43,   vs3,    vs10
+
+       xvmaddasp       vs44,   vs0,    vs11
+       xvmaddasp       vs45,   vs1,    vs11
+       xvmaddasp       vs46,   vs2,    vs11
+       xvmaddasp       vs47,   vs3,    vs11
+
+
+.endm
+
+.macro SAVE4x16
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+       lxvw4x          vs2,    o32,    T1
+       lxvw4x          vs3,    o48,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xvmulsp         vs0,    vs32,   alpha_vr
+       xvmulsp         vs1,    vs33,   alpha_vr
+       xvmulsp         vs2,    vs34,   alpha_vr
+       xvmulsp         vs3,    vs35,   alpha_vr
+#else
+       xvmaddasp       vs0,    vs32,   alpha_vr
+       xvmaddasp       vs1,    vs33,   alpha_vr
+       xvmaddasp       vs2,    vs34,   alpha_vr
+       xvmaddasp       vs3,    vs35,   alpha_vr
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+       stxvw4x         vs2,    o32,    T1
+       stxvw4x         vs3,    o48,    T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+       lxvw4x          vs2,    o32,    T1
+       lxvw4x          vs3,    o48,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xvmulsp         vs0,    vs36,   alpha_vr
+       xvmulsp         vs1,    vs37,   alpha_vr
+       xvmulsp         vs2,    vs38,   alpha_vr
+       xvmulsp         vs3,    vs39,   alpha_vr
+#else
+       xvmaddasp       vs0,    vs36,   alpha_vr
+       xvmaddasp       vs1,    vs37,   alpha_vr
+       xvmaddasp       vs2,    vs38,   alpha_vr
+       xvmaddasp       vs3,    vs39,   alpha_vr
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+       stxvw4x         vs2,    o32,    T1
+       stxvw4x         vs3,    o48,    T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+       lxvw4x          vs2,    o32,    T1
+       lxvw4x          vs3,    o48,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xvmulsp         vs0,    vs40,   alpha_vr
+       xvmulsp         vs1,    vs41,   alpha_vr
+       xvmulsp         vs2,    vs42,   alpha_vr
+       xvmulsp         vs3,    vs43,   alpha_vr
+#else
+       xvmaddasp       vs0,    vs40,   alpha_vr
+       xvmaddasp       vs1,    vs41,   alpha_vr
+       xvmaddasp       vs2,    vs42,   alpha_vr
+       xvmaddasp       vs3,    vs43,   alpha_vr
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+       stxvw4x         vs2,    o32,    T1
+       stxvw4x         vs3,    o48,    T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+       lxvw4x          vs2,    o32,    T1
+       lxvw4x          vs3,    o48,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xvmulsp         vs0,    vs44,   alpha_vr
+       xvmulsp         vs1,    vs45,   alpha_vr
+       xvmulsp         vs2,    vs46,   alpha_vr
+       xvmulsp         vs3,    vs47,   alpha_vr
+#else
+       xvmaddasp       vs0,    vs44,   alpha_vr
+       xvmaddasp       vs1,    vs45,   alpha_vr
+       xvmaddasp       vs2,    vs46,   alpha_vr
+       xvmaddasp       vs3,    vs47,   alpha_vr
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+       stxvw4x         vs2,    o32,    T1
+       stxvw4x         vs3,    o48,    T1
+
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     64
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=8
+**********************************************************************************************/
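+
+/* N=4, M=8: two A vectors (vs0-vs1 / vs4-vs5) against the four splatted B
+*  values, accumulating in vs32-vs39; SAVE4x8 stores two vectors per
+*  column of C. */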
+
+.macro LOAD4x8_1
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+       xxspltw         vs10,   vs28,   2
+       xxspltw         vs11,   vs28,   3
+
+       addi            BO,     BO,     16
+
+.endm
+
+.macro KERNEL4x8_I1
+
+
+       lxvw4x          vs4,    o0,     AO
+       lxvw4x          vs5,    o16,    AO
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs16,   vs28,   0
+       xxspltw         vs17,   vs28,   1
+       xxspltw         vs18,   vs28,   2
+       xxspltw         vs19,   vs28,   3
+
+       addi            BO,     BO,     16
+
+
+       xvmulsp         vs32,   vs0,    vs8
+       xvmulsp         vs33,   vs1,    vs8
+
+       xvmulsp         vs34,   vs0,    vs9
+       xvmulsp         vs35,   vs1,    vs9
+
+       xvmulsp         vs36,   vs0,    vs10
+       xvmulsp         vs37,   vs1,    vs10
+
+       xvmulsp         vs38,   vs0,    vs11
+       xvmulsp         vs39,   vs1,    vs11
+
+
+.endm
+
+.macro KERNEL4x8_1
+
+
+       lxvw4x          vs4,    o0,     AO
+       lxvw4x          vs5,    o16,    AO
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs16,   vs28,   0
+       xxspltw         vs17,   vs28,   1
+       xxspltw         vs18,   vs28,   2
+       xxspltw         vs19,   vs28,   3
+
+       addi            BO,     BO,     16
+
+
+       xvmaddasp       vs32,   vs0,    vs8
+       xvmaddasp       vs33,   vs1,    vs8
+
+       xvmaddasp       vs34,   vs0,    vs9
+       xvmaddasp       vs35,   vs1,    vs9
+
+       xvmaddasp       vs36,   vs0,    vs10
+       xvmaddasp       vs37,   vs1,    vs10
+
+       xvmaddasp       vs38,   vs0,    vs11
+       xvmaddasp       vs39,   vs1,    vs11
+
+
+.endm
+
+.macro KERNEL4x8_2
+
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+       xxspltw         vs10,   vs28,   2
+       xxspltw         vs11,   vs28,   3
+
+       addi            BO,     BO,     16
+
+
+       xvmaddasp       vs32,   vs4,    vs16
+       xvmaddasp       vs33,   vs5,    vs16
+
+       xvmaddasp       vs34,   vs4,    vs17
+       xvmaddasp       vs35,   vs5,    vs17
+
+       xvmaddasp       vs36,   vs4,    vs18
+       xvmaddasp       vs37,   vs5,    vs18
+
+       xvmaddasp       vs38,   vs4,    vs19
+       xvmaddasp       vs39,   vs5,    vs19
+
+
+.endm
+
+.macro KERNEL4x8_E2
+
+
+       xvmaddasp       vs32,   vs4,    vs16
+       xvmaddasp       vs33,   vs5,    vs16
+
+       xvmaddasp       vs34,   vs4,    vs17
+       xvmaddasp       vs35,   vs5,    vs17
+
+       xvmaddasp       vs36,   vs4,    vs18
+       xvmaddasp       vs37,   vs5,    vs18
+
+       xvmaddasp       vs38,   vs4,    vs19
+       xvmaddasp       vs39,   vs5,    vs19
+
+
+.endm
+
+.macro KERNEL4x8_SUBI1
+
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+       xxspltw         vs10,   vs28,   2
+       xxspltw         vs11,   vs28,   3
+
+       addi            BO,     BO,     16
+
+
+       xvmulsp         vs32,   vs0,    vs8
+       xvmulsp         vs33,   vs1,    vs8
+
+       xvmulsp         vs34,   vs0,    vs9
+       xvmulsp         vs35,   vs1,    vs9
+
+       xvmulsp         vs36,   vs0,    vs10
+       xvmulsp         vs37,   vs1,    vs10
+
+       xvmulsp         vs38,   vs0,    vs11
+       xvmulsp         vs39,   vs1,    vs11
+
+
+.endm
+
+.macro KERNEL4x8_SUB1
+
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+       xxspltw         vs10,   vs28,   2
+       xxspltw         vs11,   vs28,   3
+
+       addi            BO,     BO,     16
+
+
+       xvmaddasp       vs32,   vs0,    vs8
+       xvmaddasp       vs33,   vs1,    vs8
+
+       xvmaddasp       vs34,   vs0,    vs9
+       xvmaddasp       vs35,   vs1,    vs9
+
+       xvmaddasp       vs36,   vs0,    vs10
+       xvmaddasp       vs37,   vs1,    vs10
+
+       xvmaddasp       vs38,   vs0,    vs11
+       xvmaddasp       vs39,   vs1,    vs11
+
+
+.endm
+
+.macro SAVE4x8
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xvmulsp         vs0,    vs32,   alpha_vr
+       xvmulsp         vs1,    vs33,   alpha_vr
+#else
+       xvmaddasp       vs0,    vs32,   alpha_vr
+       xvmaddasp       vs1,    vs33,   alpha_vr
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xvmulsp         vs0,    vs34,   alpha_vr
+       xvmulsp         vs1,    vs35,   alpha_vr
+#else
+       xvmaddasp       vs0,    vs34,   alpha_vr
+       xvmaddasp       vs1,    vs35,   alpha_vr
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xvmulsp         vs0,    vs36,   alpha_vr
+       xvmulsp         vs1,    vs37,   alpha_vr
+#else
+       xvmaddasp       vs0,    vs36,   alpha_vr
+       xvmaddasp       vs1,    vs37,   alpha_vr
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xvmulsp         vs0,    vs38,   alpha_vr
+       xvmulsp         vs1,    vs39,   alpha_vr
+#else
+       xvmaddasp       vs0,    vs38,   alpha_vr
+       xvmaddasp       vs1,    vs39,   alpha_vr
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     32
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=4
+**********************************************************************************************/
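+
+/* N=4, M=4: a single A vector against the four splatted B values,
+*  accumulating in vs32-vs35; SAVE4x4 stores one vector per column. */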
+
+.macro LOAD4x4_1
+
+       lxvw4x          vs0,    o0,     AO
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+       xxspltw         vs10,   vs28,   2
+       xxspltw         vs11,   vs28,   3
+
+       addi            BO,     BO,     16
+
+.endm
+
+.macro KERNEL4x4_I1
+
+
+       lxvw4x          vs4,    o0,     AO
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs16,   vs28,   0
+       xxspltw         vs17,   vs28,   1
+       xxspltw         vs18,   vs28,   2
+       xxspltw         vs19,   vs28,   3
+
+       addi            BO,     BO,     16
+
+
+       xvmulsp         vs32,   vs0,    vs8
+
+       xvmulsp         vs33,   vs0,    vs9
+
+       xvmulsp         vs34,   vs0,    vs10
+
+       xvmulsp         vs35,   vs0,    vs11
+
+
+.endm
+
+.macro KERNEL4x4_1
+
+
+       lxvw4x          vs4,    o0,     AO
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs16,   vs28,   0
+       xxspltw         vs17,   vs28,   1
+       xxspltw         vs18,   vs28,   2
+       xxspltw         vs19,   vs28,   3
+
+       addi            BO,     BO,     16
+
+
+       xvmaddasp       vs32,   vs0,    vs8
+
+       xvmaddasp       vs33,   vs0,    vs9
+
+       xvmaddasp       vs34,   vs0,    vs10
+
+       xvmaddasp       vs35,   vs0,    vs11
+
+
+.endm
+
+.macro KERNEL4x4_2
+
+
+       lxvw4x          vs0,    o0,     AO
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+       xxspltw         vs10,   vs28,   2
+       xxspltw         vs11,   vs28,   3
+
+       addi            BO,     BO,     16
+
+
+       xvmaddasp       vs32,   vs4,    vs16
+
+       xvmaddasp       vs33,   vs4,    vs17
+
+       xvmaddasp       vs34,   vs4,    vs18
+
+       xvmaddasp       vs35,   vs4,    vs19
+
+
+.endm
+
+.macro KERNEL4x4_E2
+
+
+       xvmaddasp       vs32,   vs4,    vs16
+
+       xvmaddasp       vs33,   vs4,    vs17
+
+       xvmaddasp       vs34,   vs4,    vs18
+
+       xvmaddasp       vs35,   vs4,    vs19
+
+
+.endm
+
+.macro KERNEL4x4_SUBI1
+
+
+       lxvw4x          vs0,    o0,     AO
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+       xxspltw         vs10,   vs28,   2
+       xxspltw         vs11,   vs28,   3
+
+       addi            BO,     BO,     16
+
+
+       xvmulsp         vs32,   vs0,    vs8
+
+       xvmulsp         vs33,   vs0,    vs9
+
+       xvmulsp         vs34,   vs0,    vs10
+
+       xvmulsp         vs35,   vs0,    vs11
+
+
+.endm
+
+.macro KERNEL4x4_SUB1
+
+
+       lxvw4x          vs0,    o0,     AO
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+       xxspltw         vs10,   vs28,   2
+       xxspltw         vs11,   vs28,   3
+
+       addi            BO,     BO,     16
+
+
+       xvmaddasp       vs32,   vs0,    vs8
+
+       xvmaddasp       vs33,   vs0,    vs9
+
+       xvmaddasp       vs34,   vs0,    vs10
+
+       xvmaddasp       vs35,   vs0,    vs11
+
+
+.endm
+
+.macro SAVE4x4
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xvmulsp         vs0,    vs32,   alpha_vr
+#else
+       xvmaddasp       vs0,    vs32,   alpha_vr
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xvmulsp         vs0,    vs33,   alpha_vr
+#else
+       xvmaddasp       vs0,    vs33,   alpha_vr
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xvmulsp         vs0,    vs34,   alpha_vr
+#else
+       xvmaddasp       vs0,    vs34,   alpha_vr
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xvmulsp         vs0,    vs35,   alpha_vr
+#else
+       xvmaddasp       vs0,    vs35,   alpha_vr
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     16
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=2
+**********************************************************************************************/
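+
+/* N=4, M=2: scalar loads and scalar double-precision FMAs as in the 8x2
+*  group, with eight accumulators vs32-vs39 and alpha_r in SAVE4x2. */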
+
+.macro LOAD4x2_1
+
+       lxsspx          vs0,    o0,     AO
+       lxsspx          vs1,    o4,     AO
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+       lxsspx          vs9,    o4,     T1
+       lxsspx          vs10,   o8,     T1
+       lxsspx          vs11,   o12,    T1
+
+       addi            BO,     BO,     16
+
+.endm
+
+.macro KERNEL4x2_I1
+
+
+       lxsspx          vs4,    o0,     AO
+       lxsspx          vs5,    o4,     AO
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs16,   o0,     T1
+       lxsspx          vs17,   o4,     T1
+       lxsspx          vs18,   o8,     T1
+       lxsspx          vs19,   o12,    T1
+
+       addi            BO,     BO,     16
+
+
+       xsmuldp         vs32,   vs0,    vs8
+       xsmuldp         vs33,   vs1,    vs8
+
+       xsmuldp         vs34,   vs0,    vs9
+       xsmuldp         vs35,   vs1,    vs9
+
+       xsmuldp         vs36,   vs0,    vs10
+       xsmuldp         vs37,   vs1,    vs10
+
+       xsmuldp         vs38,   vs0,    vs11
+       xsmuldp         vs39,   vs1,    vs11
+
+
+.endm
+
+.macro KERNEL4x2_1
+
+
+       lxsspx          vs4,    o0,     AO
+       lxsspx          vs5,    o4,     AO
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs16,   o0,     T1
+       lxsspx          vs17,   o4,     T1
+       lxsspx          vs18,   o8,     T1
+       lxsspx          vs19,   o12,    T1
+
+       addi            BO,     BO,     16
+
+
+       xsmaddadp       vs32,   vs0,    vs8
+       xsmaddadp       vs33,   vs1,    vs8
+
+       xsmaddadp       vs34,   vs0,    vs9
+       xsmaddadp       vs35,   vs1,    vs9
+
+       xsmaddadp       vs36,   vs0,    vs10
+       xsmaddadp       vs37,   vs1,    vs10
+
+       xsmaddadp       vs38,   vs0,    vs11
+       xsmaddadp       vs39,   vs1,    vs11
+
+
+.endm
+
+.macro KERNEL4x2_2
+
+
+       lxsspx          vs0,    o0,     AO
+       lxsspx          vs1,    o4,     AO
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+       lxsspx          vs9,    o4,     T1
+       lxsspx          vs10,   o8,     T1
+       lxsspx          vs11,   o12,    T1
+
+       addi            BO,     BO,     16
+
+
+       xsmaddadp       vs32,   vs4,    vs16
+       xsmaddadp       vs33,   vs5,    vs16
+
+       xsmaddadp       vs34,   vs4,    vs17
+       xsmaddadp       vs35,   vs5,    vs17
+
+       xsmaddadp       vs36,   vs4,    vs18
+       xsmaddadp       vs37,   vs5,    vs18
+
+       xsmaddadp       vs38,   vs4,    vs19
+       xsmaddadp       vs39,   vs5,    vs19
+
+
+.endm
+
+.macro KERNEL4x2_E2
+
+
+       xsmaddadp       vs32,   vs4,    vs16
+       xsmaddadp       vs33,   vs5,    vs16
+
+       xsmaddadp       vs34,   vs4,    vs17
+       xsmaddadp       vs35,   vs5,    vs17
+
+       xsmaddadp       vs36,   vs4,    vs18
+       xsmaddadp       vs37,   vs5,    vs18
+
+       xsmaddadp       vs38,   vs4,    vs19
+       xsmaddadp       vs39,   vs5,    vs19
+
+
+.endm
+
+.macro KERNEL4x2_SUBI1
+
+
+       lxsspx          vs0,    o0,     AO
+       lxsspx          vs1,    o4,     AO
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+       lxsspx          vs9,    o4,     T1
+       lxsspx          vs10,   o8,     T1
+       lxsspx          vs11,   o12,    T1
+
+       addi            BO,     BO,     16
+
+
+       xsmuldp         vs32,   vs0,    vs8
+       xsmuldp         vs33,   vs1,    vs8
+
+       xsmuldp         vs34,   vs0,    vs9
+       xsmuldp         vs35,   vs1,    vs9
+
+       xsmuldp         vs36,   vs0,    vs10
+       xsmuldp         vs37,   vs1,    vs10
+
+       xsmuldp         vs38,   vs0,    vs11
+       xsmuldp         vs39,   vs1,    vs11
+
+
+.endm
+
+.macro KERNEL4x2_SUB1
+
+
+       lxsspx          vs0,    o0,     AO
+       lxsspx          vs1,    o4,     AO
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+       lxsspx          vs9,    o4,     T1
+       lxsspx          vs10,   o8,     T1
+       lxsspx          vs11,   o12,    T1
+
+       addi            BO,     BO,     16
+
+
+       xsmaddadp       vs32,   vs0,    vs8
+       xsmaddadp       vs33,   vs1,    vs8
+
+       xsmaddadp       vs34,   vs0,    vs9
+       xsmaddadp       vs35,   vs1,    vs9
+
+       xsmaddadp       vs36,   vs0,    vs10
+       xsmaddadp       vs37,   vs1,    vs10
+
+       xsmaddadp       vs38,   vs0,    vs11
+       xsmaddadp       vs39,   vs1,    vs11
+
+
+.endm
+
+.macro SAVE4x2
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+       lxsspx          vs1,    o4,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xsmuldp         vs0,    vs32,   alpha_r
+       xsmuldp         vs1,    vs33,   alpha_r
+#else
+       xsmaddadp       vs0,    vs32,   alpha_r
+       xsmaddadp       vs1,    vs33,   alpha_r
+#endif
+
+       stxsspx         vs0,    o0,     T1
+       stxsspx         vs1,    o4,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+       lxsspx          vs1,    o4,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xsmuldp         vs0,    vs34,   alpha_r
+       xsmuldp         vs1,    vs35,   alpha_r
+#else
+       xsmaddadp       vs0,    vs34,   alpha_r
+       xsmaddadp       vs1,    vs35,   alpha_r
+#endif
+
+       stxsspx         vs0,    o0,     T1
+       stxsspx         vs1,    o4,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+       lxsspx          vs1,    o4,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xsmuldp         vs0,    vs36,   alpha_r
+       xsmuldp         vs1,    vs37,   alpha_r
+#else
+       xsmaddadp       vs0,    vs36,   alpha_r
+       xsmaddadp       vs1,    vs37,   alpha_r
+#endif
+
+       stxsspx         vs0,    o0,     T1
+       stxsspx         vs1,    o4,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+       lxsspx          vs1,    o4,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xsmuldp         vs0,    vs38,   alpha_r
+       xsmuldp         vs1,    vs39,   alpha_r
+#else
+       xsmaddadp       vs0,    vs38,   alpha_r
+       xsmaddadp       vs1,    vs39,   alpha_r
+#endif
+
+       stxsspx         vs0,    o0,     T1
+       stxsspx         vs1,    o4,     T1
+
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     8
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=1
+**********************************************************************************************/
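+
+/* N=4, M=1: one A scalar against four B scalars, accumulators vs32-vs35,
+*  scalar arithmetic throughout; SAVE4x1 applies alpha_r and steps through
+*  the four columns by LDC. */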
+
+.macro LOAD4x1_1
+
+       lxsspx          vs0,    o0,     AO
+
+       addi            AO,     AO,     4
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+       lxsspx          vs9,    o4,     T1
+       lxsspx          vs10,   o8,     T1
+       lxsspx          vs11,   o12,    T1
+
+       addi            BO,     BO,     16
+
+.endm
+
+.macro KERNEL4x1_I1
+
+
+       lxsspx          vs4,    o0,     AO
+
+       addi            AO,     AO,     4
+
+       mr              T1,     BO
+
+       lxsspx          vs16,   o0,     T1
+       lxsspx          vs17,   o4,     T1
+       lxsspx          vs18,   o8,     T1
+       lxsspx          vs19,   o12,    T1
+
+       addi            BO,     BO,     16
+
+
+       xsmuldp         vs32,   vs0,    vs8
+
+       xsmuldp         vs33,   vs0,    vs9
+
+       xsmuldp         vs34,   vs0,    vs10
+
+       xsmuldp         vs35,   vs0,    vs11
+
+
+.endm
+
+.macro KERNEL4x1_1
+
+
+       lxsspx          vs4,    o0,     AO
+
+       addi            AO,     AO,     4
+
+       mr              T1,     BO
+
+       lxsspx          vs16,   o0,     T1
+       lxsspx          vs17,   o4,     T1
+       lxsspx          vs18,   o8,     T1
+       lxsspx          vs19,   o12,    T1
+
+       addi            BO,     BO,     16
+
+
+       xsmaddadp       vs32,   vs0,    vs8
+
+       xsmaddadp       vs33,   vs0,    vs9
+
+       xsmaddadp       vs34,   vs0,    vs10
+
+       xsmaddadp       vs35,   vs0,    vs11
+
+
+.endm
+
+.macro KERNEL4x1_2
+
+
+       lxsspx          vs0,    o0,     AO
+
+       addi            AO,     AO,     4
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+       lxsspx          vs9,    o4,     T1
+       lxsspx          vs10,   o8,     T1
+       lxsspx          vs11,   o12,    T1
+
+       addi            BO,     BO,     16
+
+
+       xsmaddadp       vs32,   vs4,    vs16
+
+       xsmaddadp       vs33,   vs4,    vs17
+
+       xsmaddadp       vs34,   vs4,    vs18
+
+       xsmaddadp       vs35,   vs4,    vs19
+
+
+.endm
+
+.macro KERNEL4x1_E2
+
+
+       xsmaddadp       vs32,   vs4,    vs16
+
+       xsmaddadp       vs33,   vs4,    vs17
+
+       xsmaddadp       vs34,   vs4,    vs18
+
+       xsmaddadp       vs35,   vs4,    vs19
+
+
+.endm
+
+.macro KERNEL4x1_SUBI1
+
+
+       lxsspx          vs0,    o0,     AO
+
+       addi            AO,     AO,     4
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+       lxsspx          vs9,    o4,     T1
+       lxsspx          vs10,   o8,     T1
+       lxsspx          vs11,   o12,    T1
+
+       addi            BO,     BO,     16
+
+
+       xsmuldp         vs32,   vs0,    vs8
+
+       xsmuldp         vs33,   vs0,    vs9
+
+       xsmuldp         vs34,   vs0,    vs10
+
+       xsmuldp         vs35,   vs0,    vs11
+
+
+.endm
+
+.macro KERNEL4x1_SUB1
+
+
+       lxsspx          vs0,    o0,     AO
+
+       addi            AO,     AO,     4
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+       lxsspx          vs9,    o4,     T1
+       lxsspx          vs10,   o8,     T1
+       lxsspx          vs11,   o12,    T1
+
+       addi            BO,     BO,     16
+
+
+       xsmaddadp       vs32,   vs0,    vs8
+
+       xsmaddadp       vs33,   vs0,    vs9
+
+       xsmaddadp       vs34,   vs0,    vs10
+
+       xsmaddadp       vs35,   vs0,    vs11
+
+
+.endm
+
+.macro SAVE4x1
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xsmuldp         vs0,    vs32,   alpha_r
+#else
+       xsmaddadp       vs0,    vs32,   alpha_r
+#endif
+
+       stxsspx         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xsmuldp         vs0,    vs33,   alpha_r
+#else
+       xsmaddadp       vs0,    vs33,   alpha_r
+#endif
+
+       stxsspx         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xsmuldp         vs0,    vs34,   alpha_r
+#else
+       xsmaddadp       vs0,    vs34,   alpha_r
+#endif
+
+       stxsspx         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xsmuldp         vs0,    vs35,   alpha_r
+#else
+       xsmaddadp       vs0,    vs35,   alpha_r
+#endif
+
+       stxsspx         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     4
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=16
+**********************************************************************************************/
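+
+/* Note: the N=2 vector kernels load A in 16-byte lxvw4x chunks and broadcast
+ * each of the two B values of the current K step across a vector register
+ * with xxspltw (vs8/vs9, respectively vs16/vs17). The lxvw4x on BO reads a
+ * full 16 bytes, but only the first two words are consumed and BO advances
+ * by 8. */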
+
+.macro LOAD2x16_1
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+       lxvw4x          vs2,    o32,    AO
+       lxvw4x          vs3,    o48,    AO
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+
+       addi            BO,     BO,     8
+
+.endm
+
+.macro KERNEL2x16_I1
+
+
+       lxvw4x          vs4,    o0,     AO
+       lxvw4x          vs5,    o16,    AO
+       lxvw4x          vs6,    o32,    AO
+       lxvw4x          vs7,    o48,    AO
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs16,   vs28,   0
+       xxspltw         vs17,   vs28,   1
+
+       addi            BO,     BO,     8
+
+
+       xvmulsp         vs32,   vs0,    vs8
+       xvmulsp         vs33,   vs1,    vs8
+       xvmulsp         vs34,   vs2,    vs8
+       xvmulsp         vs35,   vs3,    vs8
+
+       xvmulsp         vs36,   vs0,    vs9
+       xvmulsp         vs37,   vs1,    vs9
+       xvmulsp         vs38,   vs2,    vs9
+       xvmulsp         vs39,   vs3,    vs9
+
+
+.endm
+
+.macro KERNEL2x16_1
+
+
+       lxvw4x          vs4,    o0,     AO
+       lxvw4x          vs5,    o16,    AO
+       lxvw4x          vs6,    o32,    AO
+       lxvw4x          vs7,    o48,    AO
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs16,   vs28,   0
+       xxspltw         vs17,   vs28,   1
+
+       addi            BO,     BO,     8
+
+
+       xvmaddasp       vs32,   vs0,    vs8
+       xvmaddasp       vs33,   vs1,    vs8
+       xvmaddasp       vs34,   vs2,    vs8
+       xvmaddasp       vs35,   vs3,    vs8
+
+       xvmaddasp       vs36,   vs0,    vs9
+       xvmaddasp       vs37,   vs1,    vs9
+       xvmaddasp       vs38,   vs2,    vs9
+       xvmaddasp       vs39,   vs3,    vs9
+
+
+.endm
+
+.macro KERNEL2x16_2
+
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+       lxvw4x          vs2,    o32,    AO
+       lxvw4x          vs3,    o48,    AO
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+
+       addi            BO,     BO,     8
+
+
+       xvmaddasp       vs32,   vs4,    vs16
+       xvmaddasp       vs33,   vs5,    vs16
+       xvmaddasp       vs34,   vs6,    vs16
+       xvmaddasp       vs35,   vs7,    vs16
+
+       xvmaddasp       vs36,   vs4,    vs17
+       xvmaddasp       vs37,   vs5,    vs17
+       xvmaddasp       vs38,   vs6,    vs17
+       xvmaddasp       vs39,   vs7,    vs17
+
+
+.endm
+
+.macro KERNEL2x16_E2
+
+
+       xvmaddasp       vs32,   vs4,    vs16
+       xvmaddasp       vs33,   vs5,    vs16
+       xvmaddasp       vs34,   vs6,    vs16
+       xvmaddasp       vs35,   vs7,    vs16
+
+       xvmaddasp       vs36,   vs4,    vs17
+       xvmaddasp       vs37,   vs5,    vs17
+       xvmaddasp       vs38,   vs6,    vs17
+       xvmaddasp       vs39,   vs7,    vs17
+
+
+.endm
+
+.macro KERNEL2x16_SUBI1
+
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+       lxvw4x          vs2,    o32,    AO
+       lxvw4x          vs3,    o48,    AO
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+
+       addi            BO,     BO,     8
+
+
+       xvmulsp         vs32,   vs0,    vs8
+       xvmulsp         vs33,   vs1,    vs8
+       xvmulsp         vs34,   vs2,    vs8
+       xvmulsp         vs35,   vs3,    vs8
+
+       xvmulsp         vs36,   vs0,    vs9
+       xvmulsp         vs37,   vs1,    vs9
+       xvmulsp         vs38,   vs2,    vs9
+       xvmulsp         vs39,   vs3,    vs9
+
+
+.endm
+
+.macro KERNEL2x16_SUB1
+
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+       lxvw4x          vs2,    o32,    AO
+       lxvw4x          vs3,    o48,    AO
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+
+       addi            BO,     BO,     8
+
+
+       xvmaddasp       vs32,   vs0,    vs8
+       xvmaddasp       vs33,   vs1,    vs8
+       xvmaddasp       vs34,   vs2,    vs8
+       xvmaddasp       vs35,   vs3,    vs8
+
+       xvmaddasp       vs36,   vs0,    vs9
+       xvmaddasp       vs37,   vs1,    vs9
+       xvmaddasp       vs38,   vs2,    vs9
+       xvmaddasp       vs39,   vs3,    vs9
+
+
+.endm
+
+.macro SAVE2x16
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+       lxvw4x          vs2,    o32,    T1
+       lxvw4x          vs3,    o48,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xvmulsp         vs0,    vs32,   alpha_vr
+       xvmulsp         vs1,    vs33,   alpha_vr
+       xvmulsp         vs2,    vs34,   alpha_vr
+       xvmulsp         vs3,    vs35,   alpha_vr
+#else
+       xvmaddasp       vs0,    vs32,   alpha_vr
+       xvmaddasp       vs1,    vs33,   alpha_vr
+       xvmaddasp       vs2,    vs34,   alpha_vr
+       xvmaddasp       vs3,    vs35,   alpha_vr
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+       stxvw4x         vs2,    o32,    T1
+       stxvw4x         vs3,    o48,    T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+       lxvw4x          vs2,    o32,    T1
+       lxvw4x          vs3,    o48,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xvmulsp         vs0,    vs36,   alpha_vr
+       xvmulsp         vs1,    vs37,   alpha_vr
+       xvmulsp         vs2,    vs38,   alpha_vr
+       xvmulsp         vs3,    vs39,   alpha_vr
+#else
+       xvmaddasp       vs0,    vs36,   alpha_vr
+       xvmaddasp       vs1,    vs37,   alpha_vr
+       xvmaddasp       vs2,    vs38,   alpha_vr
+       xvmaddasp       vs3,    vs39,   alpha_vr
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+       stxvw4x         vs2,    o32,    T1
+       stxvw4x         vs3,    o48,    T1
+
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     64
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=8
+**********************************************************************************************/
+
+.macro LOAD2x8_1
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+
+       addi            BO,     BO,     8
+
+.endm
+
+.macro KERNEL2x8_I1
+
+
+       lxvw4x          vs4,    o0,     AO
+       lxvw4x          vs5,    o16,    AO
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs16,   vs28,   0
+       xxspltw         vs17,   vs28,   1
+
+       addi            BO,     BO,     8
+
+
+       xvmulsp         vs32,   vs0,    vs8
+       xvmulsp         vs33,   vs1,    vs8
+
+       xvmulsp         vs34,   vs0,    vs9
+       xvmulsp         vs35,   vs1,    vs9
+
+
+.endm
+
+.macro KERNEL2x8_1
+
+
+       lxvw4x          vs4,    o0,     AO
+       lxvw4x          vs5,    o16,    AO
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs16,   vs28,   0
+       xxspltw         vs17,   vs28,   1
+
+       addi            BO,     BO,     8
+
+
+       xvmaddasp       vs32,   vs0,    vs8
+       xvmaddasp       vs33,   vs1,    vs8
+
+       xvmaddasp       vs34,   vs0,    vs9
+       xvmaddasp       vs35,   vs1,    vs9
+
+
+.endm
+
+.macro KERNEL2x8_2
+
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+
+       addi            BO,     BO,     8
+
+
+       xvmaddasp       vs32,   vs4,    vs16
+       xvmaddasp       vs33,   vs5,    vs16
+
+       xvmaddasp       vs34,   vs4,    vs17
+       xvmaddasp       vs35,   vs5,    vs17
+
+
+.endm
+
+.macro KERNEL2x8_E2
+
+
+       xvmaddasp       vs32,   vs4,    vs16
+       xvmaddasp       vs33,   vs5,    vs16
+
+       xvmaddasp       vs34,   vs4,    vs17
+       xvmaddasp       vs35,   vs5,    vs17
+
+
+.endm
+
+.macro KERNEL2x8_SUBI1
+
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+
+       addi            BO,     BO,     8
+
+
+       xvmulsp         vs32,   vs0,    vs8
+       xvmulsp         vs33,   vs1,    vs8
+
+       xvmulsp         vs34,   vs0,    vs9
+       xvmulsp         vs35,   vs1,    vs9
+
+
+.endm
+
+.macro KERNEL2x8_SUB1
+
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+
+       addi            BO,     BO,     8
+
+
+       xvmaddasp       vs32,   vs0,    vs8
+       xvmaddasp       vs33,   vs1,    vs8
+
+       xvmaddasp       vs34,   vs0,    vs9
+       xvmaddasp       vs35,   vs1,    vs9
+
+
+.endm
+
+.macro SAVE2x8
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xvmulsp         vs0,    vs32,   alpha_vr
+       xvmulsp         vs1,    vs33,   alpha_vr
+#else
+       xvmaddasp       vs0,    vs32,   alpha_vr
+       xvmaddasp       vs1,    vs33,   alpha_vr
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xvmulsp         vs0,    vs34,   alpha_vr
+       xvmulsp         vs1,    vs35,   alpha_vr
+#else
+       xvmaddasp       vs0,    vs34,   alpha_vr
+       xvmaddasp       vs1,    vs35,   alpha_vr
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     32
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=4
+**********************************************************************************************/
+
+.macro LOAD2x4_1
+
+       lxvw4x          vs0,    o0,     AO
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+
+       addi            BO,     BO,     8
+
+.endm
+
+.macro KERNEL2x4_I1
+
+
+       lxvw4x          vs4,    o0,     AO
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs16,   vs28,   0
+       xxspltw         vs17,   vs28,   1
+
+       addi            BO,     BO,     8
+
+
+       xvmulsp         vs32,   vs0,    vs8
+
+       xvmulsp         vs33,   vs0,    vs9
+
+
+.endm
+
+.macro KERNEL2x4_1
+
+
+       lxvw4x          vs4,    o0,     AO
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs16,   vs28,   0
+       xxspltw         vs17,   vs28,   1
+
+       addi            BO,     BO,     8
+
+
+       xvmaddasp       vs32,   vs0,    vs8
+
+       xvmaddasp       vs33,   vs0,    vs9
+
+
+.endm
+
+.macro KERNEL2x4_2
+
+
+       lxvw4x          vs0,    o0,     AO
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+
+       addi            BO,     BO,     8
+
+
+       xvmaddasp       vs32,   vs4,    vs16
+
+       xvmaddasp       vs33,   vs4,    vs17
+
+
+.endm
+
+.macro KERNEL2x4_E2
+
+
+       xvmaddasp       vs32,   vs4,    vs16
+
+       xvmaddasp       vs33,   vs4,    vs17
+
+
+.endm
+
+.macro KERNEL2x4_SUBI1
+
+
+       lxvw4x          vs0,    o0,     AO
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+
+       addi            BO,     BO,     8
+
+
+       xvmulsp         vs32,   vs0,    vs8
+
+       xvmulsp         vs33,   vs0,    vs9
+
+
+.endm
+
+.macro KERNEL2x4_SUB1
+
+
+       lxvw4x          vs0,    o0,     AO
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+       xxspltw         vs9,    vs28,   1
+
+       addi            BO,     BO,     8
+
+
+       xvmaddasp       vs32,   vs0,    vs8
+
+       xvmaddasp       vs33,   vs0,    vs9
+
+
+.endm
+
+.macro SAVE2x4
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xvmulsp         vs0,    vs32,   alpha_vr
+#else
+       xvmaddasp       vs0,    vs32,   alpha_vr
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xvmulsp         vs0,    vs33,   alpha_vr
+#else
+       xvmaddasp       vs0,    vs33,   alpha_vr
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     16
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=2
+**********************************************************************************************/
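+
+/* Note: for the M=2 and M=1 tails the kernels switch to scalar VSX: lxsspx
+ * loads single floats (converted to double precision in the register),
+ * xsmuldp / xsmaddadp accumulate in double precision, and the SAVE macros use
+ * the scalar alpha_r rather than the vector alpha_vr of the wider tiles. */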
+
+.macro LOAD2x2_1
+
+       lxsspx          vs0,    o0,     AO
+       lxsspx          vs1,    o4,     AO
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+       lxsspx          vs9,    o4,     T1
+
+       addi            BO,     BO,     8
+
+.endm
+
+.macro KERNEL2x2_I1
+
+
+       lxsspx          vs4,    o0,     AO
+       lxsspx          vs5,    o4,     AO
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs16,   o0,     T1
+       lxsspx          vs17,   o4,     T1
+
+       addi            BO,     BO,     8
+
+
+       xsmuldp         vs32,   vs0,    vs8
+       xsmuldp         vs33,   vs1,    vs8
+
+       xsmuldp         vs34,   vs0,    vs9
+       xsmuldp         vs35,   vs1,    vs9
+
+
+.endm
+
+.macro KERNEL2x2_1
+
+
+       lxsspx          vs4,    o0,     AO
+       lxsspx          vs5,    o4,     AO
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs16,   o0,     T1
+       lxsspx          vs17,   o4,     T1
+
+       addi            BO,     BO,     8
+
+
+       xsmaddadp       vs32,   vs0,    vs8
+       xsmaddadp       vs33,   vs1,    vs8
+
+       xsmaddadp       vs34,   vs0,    vs9
+       xsmaddadp       vs35,   vs1,    vs9
+
+
+.endm
+
+.macro KERNEL2x2_2
+
+
+       lxsspx          vs0,    o0,     AO
+       lxsspx          vs1,    o4,     AO
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+       lxsspx          vs9,    o4,     T1
+
+       addi            BO,     BO,     8
+
+
+       xsmaddadp       vs32,   vs4,    vs16
+       xsmaddadp       vs33,   vs5,    vs16
+
+       xsmaddadp       vs34,   vs4,    vs17
+       xsmaddadp       vs35,   vs5,    vs17
+
+
+.endm
+
+.macro KERNEL2x2_E2
+
+
+       xsmaddadp       vs32,   vs4,    vs16
+       xsmaddadp       vs33,   vs5,    vs16
+
+       xsmaddadp       vs34,   vs4,    vs17
+       xsmaddadp       vs35,   vs5,    vs17
+
+
+.endm
+
+.macro KERNEL2x2_SUBI1
+
+
+       lxsspx          vs0,    o0,     AO
+       lxsspx          vs1,    o4,     AO
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+       lxsspx          vs9,    o4,     T1
+
+       addi            BO,     BO,     8
+
+
+       xsmuldp         vs32,   vs0,    vs8
+       xsmuldp         vs33,   vs1,    vs8
+
+       xsmuldp         vs34,   vs0,    vs9
+       xsmuldp         vs35,   vs1,    vs9
+
+
+.endm
+
+.macro KERNEL2x2_SUB1
+
+
+       lxsspx          vs0,    o0,     AO
+       lxsspx          vs1,    o4,     AO
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+       lxsspx          vs9,    o4,     T1
+
+       addi            BO,     BO,     8
+
+
+       xsmaddadp       vs32,   vs0,    vs8
+       xsmaddadp       vs33,   vs1,    vs8
+
+       xsmaddadp       vs34,   vs0,    vs9
+       xsmaddadp       vs35,   vs1,    vs9
+
+
+.endm
+
+.macro SAVE2x2
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+       lxsspx          vs1,    o4,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xsmuldp         vs0,    vs32,   alpha_r
+       xsmuldp         vs1,    vs33,   alpha_r
+#else
+       xsmaddadp       vs0,    vs32,   alpha_r
+       xsmaddadp       vs1,    vs33,   alpha_r
+#endif
+
+       stxsspx         vs0,    o0,     T1
+       stxsspx         vs1,    o4,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+       lxsspx          vs1,    o4,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xsmuldp         vs0,    vs34,   alpha_r
+       xsmuldp         vs1,    vs35,   alpha_r
+#else
+       xsmaddadp       vs0,    vs34,   alpha_r
+       xsmaddadp       vs1,    vs35,   alpha_r
+#endif
+
+       stxsspx         vs0,    o0,     T1
+       stxsspx         vs1,    o4,     T1
+
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     8
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=1
+**********************************************************************************************/
+
+.macro LOAD2x1_1
+
+       lxsspx          vs0,    o0,     AO
+
+       addi            AO,     AO,     4
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+       lxsspx          vs9,    o4,     T1
+
+       addi            BO,     BO,     8
+
+.endm
+
+.macro KERNEL2x1_I1
+
+
+       lxsspx          vs4,    o0,     AO
+
+       addi            AO,     AO,     4
+
+       mr              T1,     BO
+
+       lxsspx          vs16,   o0,     T1
+       lxsspx          vs17,   o4,     T1
+
+       addi            BO,     BO,     8
+
+
+       xsmuldp         vs32,   vs0,    vs8
+
+       xsmuldp         vs33,   vs0,    vs9
+
+
+.endm
+
+.macro KERNEL2x1_1
+
+
+       lxsspx          vs4,    o0,     AO
+
+       addi            AO,     AO,     4
+
+       mr              T1,     BO
+
+       lxsspx          vs16,   o0,     T1
+       lxsspx          vs17,   o4,     T1
+
+       addi            BO,     BO,     8
+
+
+       xsmaddadp       vs32,   vs0,    vs8
+
+       xsmaddadp       vs33,   vs0,    vs9
+
+
+.endm
+
+.macro KERNEL2x1_2
+
+
+       lxsspx          vs0,    o0,     AO
+
+       addi            AO,     AO,     4
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+       lxsspx          vs9,    o4,     T1
+
+       addi            BO,     BO,     8
+
+
+       xsmaddadp       vs32,   vs4,    vs16
+
+       xsmaddadp       vs33,   vs4,    vs17
+
+
+.endm
+
+.macro KERNEL2x1_E2
+
+
+       xsmaddadp       vs32,   vs4,    vs16
+
+       xsmaddadp       vs33,   vs4,    vs17
+
+
+.endm
+
+.macro KERNEL2x1_SUBI1
+
+
+       lxsspx          vs0,    o0,     AO
+
+       addi            AO,     AO,     4
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+       lxsspx          vs9,    o4,     T1
+
+       addi            BO,     BO,     8
+
+
+       xsmuldp         vs32,   vs0,    vs8
+
+       xsmuldp         vs33,   vs0,    vs9
+
+
+.endm
+
+.macro KERNEL2x1_SUB1
+
+
+       lxsspx          vs0,    o0,     AO
+
+       addi            AO,     AO,     4
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+       lxsspx          vs9,    o4,     T1
+
+       addi            BO,     BO,     8
+
+
+       xsmaddadp       vs32,   vs0,    vs8
+
+       xsmaddadp       vs33,   vs0,    vs9
+
+
+.endm
+
+.macro SAVE2x1
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xsmuldp         vs0,    vs32,   alpha_r
+#else
+       xsmaddadp       vs0,    vs32,   alpha_r
+#endif
+
+       stxsspx         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xsmuldp         vs0,    vs33,   alpha_r
+#else
+       xsmaddadp       vs0,    vs33,   alpha_r
+#endif
+
+       stxsspx         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     4
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=16
+**********************************************************************************************/
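+
+/* Note: the N=1 vector kernels broadcast a single B value per K step (xxspltw
+ * of word 0 of vs28) and advance BO by 4 bytes, updating one column of C. */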
+
+.macro LOAD1x16_1
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+       lxvw4x          vs2,    o32,    AO
+       lxvw4x          vs3,    o48,    AO
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+
+       addi            BO,     BO,     4
+
+.endm
+
+.macro KERNEL1x16_I1
+
+
+       lxvw4x          vs4,    o0,     AO
+       lxvw4x          vs5,    o16,    AO
+       lxvw4x          vs6,    o32,    AO
+       lxvw4x          vs7,    o48,    AO
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs16,   vs28,   0
+
+       addi            BO,     BO,     4
+
+
+       xvmulsp         vs32,   vs0,    vs8
+       xvmulsp         vs33,   vs1,    vs8
+       xvmulsp         vs34,   vs2,    vs8
+       xvmulsp         vs35,   vs3,    vs8
+
+
+.endm
+
+.macro KERNEL1x16_1
+
+
+       lxvw4x          vs4,    o0,     AO
+       lxvw4x          vs5,    o16,    AO
+       lxvw4x          vs6,    o32,    AO
+       lxvw4x          vs7,    o48,    AO
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs16,   vs28,   0
+
+       addi            BO,     BO,     4
+
+
+       xvmaddasp       vs32,   vs0,    vs8
+       xvmaddasp       vs33,   vs1,    vs8
+       xvmaddasp       vs34,   vs2,    vs8
+       xvmaddasp       vs35,   vs3,    vs8
+
+
+.endm
+
+.macro KERNEL1x16_2
+
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+       lxvw4x          vs2,    o32,    AO
+       lxvw4x          vs3,    o48,    AO
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+
+       addi            BO,     BO,     4
+
+
+       xvmaddasp       vs32,   vs4,    vs16
+       xvmaddasp       vs33,   vs5,    vs16
+       xvmaddasp       vs34,   vs6,    vs16
+       xvmaddasp       vs35,   vs7,    vs16
+
+
+.endm
+
+.macro KERNEL1x16_E2
+
+
+       xvmaddasp       vs32,   vs4,    vs16
+       xvmaddasp       vs33,   vs5,    vs16
+       xvmaddasp       vs34,   vs6,    vs16
+       xvmaddasp       vs35,   vs7,    vs16
+
+
+.endm
+
+.macro KERNEL1x16_SUBI1
+
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+       lxvw4x          vs2,    o32,    AO
+       lxvw4x          vs3,    o48,    AO
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+
+       addi            BO,     BO,     4
+
+
+       xvmulsp         vs32,   vs0,    vs8
+       xvmulsp         vs33,   vs1,    vs8
+       xvmulsp         vs34,   vs2,    vs8
+       xvmulsp         vs35,   vs3,    vs8
+
+
+.endm
+
+.macro KERNEL1x16_SUB1
+
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+       lxvw4x          vs2,    o32,    AO
+       lxvw4x          vs3,    o48,    AO
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+
+       addi            BO,     BO,     4
+
+
+       xvmaddasp       vs32,   vs0,    vs8
+       xvmaddasp       vs33,   vs1,    vs8
+       xvmaddasp       vs34,   vs2,    vs8
+       xvmaddasp       vs35,   vs3,    vs8
+
+
+.endm
+
+.macro SAVE1x16
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+       lxvw4x          vs2,    o32,    T1
+       lxvw4x          vs3,    o48,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xvmulsp         vs0,    vs32,   alpha_vr
+       xvmulsp         vs1,    vs33,   alpha_vr
+       xvmulsp         vs2,    vs34,   alpha_vr
+       xvmulsp         vs3,    vs35,   alpha_vr
+#else
+       xvmaddasp       vs0,    vs32,   alpha_vr
+       xvmaddasp       vs1,    vs33,   alpha_vr
+       xvmaddasp       vs2,    vs34,   alpha_vr
+       xvmaddasp       vs3,    vs35,   alpha_vr
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+       stxvw4x         vs2,    o32,    T1
+       stxvw4x         vs3,    o48,    T1
+
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     64
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=8
+**********************************************************************************************/
+
+.macro LOAD1x8_1
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+
+       addi            BO,     BO,     4
+
+.endm
+
+.macro KERNEL1x8_I1
+
+
+       lxvw4x          vs4,    o0,     AO
+       lxvw4x          vs5,    o16,    AO
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs16,   vs28,   0
+
+       addi            BO,     BO,     4
+
+
+       xvmulsp         vs32,   vs0,    vs8
+       xvmulsp         vs33,   vs1,    vs8
+
+
+.endm
+
+.macro KERNEL1x8_1
+
+
+       lxvw4x          vs4,    o0,     AO
+       lxvw4x          vs5,    o16,    AO
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs16,   vs28,   0
+
+       addi            BO,     BO,     4
+
+
+       xvmaddasp       vs32,   vs0,    vs8
+       xvmaddasp       vs33,   vs1,    vs8
+
+
+.endm
+
+.macro KERNEL1x8_2
+
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+
+       addi            BO,     BO,     4
+
+
+       xvmaddasp       vs32,   vs4,    vs16
+       xvmaddasp       vs33,   vs5,    vs16
+
+
+.endm
+
+.macro KERNEL1x8_E2
+
+
+       xvmaddasp       vs32,   vs4,    vs16
+       xvmaddasp       vs33,   vs5,    vs16
+
+
+.endm
+
+.macro KERNEL1x8_SUBI1
+
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+
+       addi            BO,     BO,     4
+
+
+       xvmulsp         vs32,   vs0,    vs8
+       xvmulsp         vs33,   vs1,    vs8
+
+
+.endm
+
+.macro KERNEL1x8_SUB1
+
+
+       lxvw4x          vs0,    o0,     AO
+       lxvw4x          vs1,    o16,    AO
+
+       addi            AO,     AO,     32
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+
+       addi            BO,     BO,     4
+
+
+       xvmaddasp       vs32,   vs0,    vs8
+       xvmaddasp       vs33,   vs1,    vs8
+
+
+.endm
+
+.macro SAVE1x8
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+       lxvw4x          vs1,    o16,    T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xvmulsp         vs0,    vs32,   alpha_vr
+       xvmulsp         vs1,    vs33,   alpha_vr
+#else
+       xvmaddasp       vs0,    vs32,   alpha_vr
+       xvmaddasp       vs1,    vs33,   alpha_vr
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+       stxvw4x         vs1,    o16,    T1
+
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     32
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=4
+**********************************************************************************************/
+
+.macro LOAD1x4_1
+
+       lxvw4x          vs0,    o0,     AO
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+
+       addi            BO,     BO,     4
+
+.endm
+
+.macro KERNEL1x4_I1
+
+
+       lxvw4x          vs4,    o0,     AO
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs16,   vs28,   0
+
+       addi            BO,     BO,     4
+
+
+       xvmulsp         vs32,   vs0,    vs8
+
+
+.endm
+
+.macro KERNEL1x4_1
+
+
+       lxvw4x          vs4,    o0,     AO
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs16,   vs28,   0
+
+       addi            BO,     BO,     4
+
+
+       xvmaddasp       vs32,   vs0,    vs8
+
+
+.endm
+
+.macro KERNEL1x4_2
+
+
+       lxvw4x          vs0,    o0,     AO
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+
+       addi            BO,     BO,     4
+
+
+       xvmaddasp       vs32,   vs4,    vs16
+
+
+.endm
+
+.macro KERNEL1x4_E2
+
+
+       xvmaddasp       vs32,   vs4,    vs16
+
+
+.endm
+
+.macro KERNEL1x4_SUBI1
+
+
+       lxvw4x          vs0,    o0,     AO
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+
+       addi            BO,     BO,     4
+
+
+       xvmulsp         vs32,   vs0,    vs8
+
+
+.endm
+
+.macro KERNEL1x4_SUB1
+
+
+       lxvw4x          vs0,    o0,     AO
+
+       addi            AO,     AO,     16
+
+       lxvw4x          vs28,   o0,     BO
+
+       xxspltw         vs8,    vs28,   0
+
+       addi            BO,     BO,     4
+
+
+       xvmaddasp       vs32,   vs0,    vs8
+
+
+.endm
+
+.macro SAVE1x4
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+
+       lxvw4x          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xvmulsp         vs0,    vs32,   alpha_vr
+#else
+       xvmaddasp       vs0,    vs32,   alpha_vr
+#endif
+
+       stxvw4x         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     16
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=2
+**********************************************************************************************/
+
+.macro LOAD1x2_1
+
+       lxsspx          vs0,    o0,     AO
+       lxsspx          vs1,    o4,     AO
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+
+       addi            BO,     BO,     4
+
+.endm
+
+.macro KERNEL1x2_I1
+
+
+       lxsspx          vs4,    o0,     AO
+       lxsspx          vs5,    o4,     AO
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs16,   o0,     T1
+
+       addi            BO,     BO,     4
+
+
+       xsmuldp         vs32,   vs0,    vs8
+       xsmuldp         vs33,   vs1,    vs8
+
+
+.endm
+
+.macro KERNEL1x2_1
+
+
+       lxsspx          vs4,    o0,     AO
+       lxsspx          vs5,    o4,     AO
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs16,   o0,     T1
+
+       addi            BO,     BO,     4
+
+
+       xsmaddadp       vs32,   vs0,    vs8
+       xsmaddadp       vs33,   vs1,    vs8
+
+
+.endm
+
+.macro KERNEL1x2_2
+
+
+       lxsspx          vs0,    o0,     AO
+       lxsspx          vs1,    o4,     AO
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+
+       addi            BO,     BO,     4
+
+
+       xsmaddadp       vs32,   vs4,    vs16
+       xsmaddadp       vs33,   vs5,    vs16
+
+
+.endm
+
+.macro KERNEL1x2_E2
+
+
+       xsmaddadp       vs32,   vs4,    vs16
+       xsmaddadp       vs33,   vs5,    vs16
+
+
+.endm
+
+.macro KERNEL1x2_SUBI1
+
+
+       lxsspx          vs0,    o0,     AO
+       lxsspx          vs1,    o4,     AO
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+
+       addi            BO,     BO,     4
+
+
+       xsmuldp         vs32,   vs0,    vs8
+       xsmuldp         vs33,   vs1,    vs8
+
+
+.endm
+
+.macro KERNEL1x2_SUB1
+
+
+       lxsspx          vs0,    o0,     AO
+       lxsspx          vs1,    o4,     AO
+
+       addi            AO,     AO,     8
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+
+       addi            BO,     BO,     4
+
+
+       xsmaddadp       vs32,   vs0,    vs8
+       xsmaddadp       vs33,   vs1,    vs8
+
+
+.endm
+
+.macro SAVE1x2
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+       lxsspx          vs1,    o4,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xsmuldp         vs0,    vs32,   alpha_r
+       xsmuldp         vs1,    vs33,   alpha_r
+#else
+       xsmaddadp       vs0,    vs32,   alpha_r
+       xsmaddadp       vs1,    vs33,   alpha_r
+#endif
+
+       stxsspx         vs0,    o0,     T1
+       stxsspx         vs1,    o4,     T1
+
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     8
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=1
+**********************************************************************************************/
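+
+/* Note: 1x1 remainder case - a single scalar FMA per K step into vs32;
+ * SAVE1x1 applies alpha and advances CO by one float (4 bytes). */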
+
+.macro LOAD1x1_1
+
+       lxsspx          vs0,    o0,     AO
+
+       addi            AO,     AO,     4
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+
+       addi            BO,     BO,     4
+
+.endm
+
+.macro KERNEL1x1_I1
+
+
+       lxsspx          vs4,    o0,     AO
+
+       addi            AO,     AO,     4
+
+       mr              T1,     BO
+
+       lxsspx          vs16,   o0,     T1
+
+       addi            BO,     BO,     4
+
+
+       xsmuldp         vs32,   vs0,    vs8
+
+
+.endm
+
+.macro KERNEL1x1_1
+
+
+       lxsspx          vs4,    o0,     AO
+
+       addi            AO,     AO,     4
+
+       mr              T1,     BO
+
+       lxsspx          vs16,   o0,     T1
+
+       addi            BO,     BO,     4
+
+
+       xsmaddadp       vs32,   vs0,    vs8
+
+
+.endm
+
+.macro KERNEL1x1_2
+
+
+       lxsspx          vs0,    o0,     AO
+
+       addi            AO,     AO,     4
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+
+       addi            BO,     BO,     4
+
+
+       xsmaddadp       vs32,   vs4,    vs16
+
+
+.endm
+
+.macro KERNEL1x1_E2
+
+
+       xsmaddadp       vs32,   vs4,    vs16
+
+
+.endm
+
+.macro KERNEL1x1_SUBI1
+
+
+       lxsspx          vs0,    o0,     AO
+
+       addi            AO,     AO,     4
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+
+       addi            BO,     BO,     4
+
+
+       xsmuldp         vs32,   vs0,    vs8
+
+
+.endm
+
+.macro KERNEL1x1_SUB1
+
+
+       lxsspx          vs0,    o0,     AO
+
+       addi            AO,     AO,     4
+
+       mr              T1,     BO
+
+       lxsspx          vs8,    o0,     T1
+
+       addi            BO,     BO,     4
+
+
+       xsmaddadp       vs32,   vs0,    vs8
+
+
+.endm
+
+.macro SAVE1x1
+
+       mr              T1,     CO
+
+#ifndef TRMMKERNEL
+
+       lxsspx          vs0,    o0,     T1
+
+#endif
+
+#ifdef TRMMKERNEL
+       xsmuldp         vs0,    vs32,   alpha_r
+#else
+       xsmaddadp       vs0,    vs32,   alpha_r
+#endif
+
+       stxsspx         vs0,    o0,     T1
+
+       add             T1,     T1,     LDC
+
+       addi            CO,     CO,     4
+
+.endm
+
diff --git a/param.h b/param.h
index 370d10b..fb344cd 100644 (file)
--- a/param.h
+++ b/param.h
@@ -1964,7 +1964,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SNUMOPT                16
 #define DNUMOPT                8
 
-#define GEMM_DEFAULT_OFFSET_A  384
+#define GEMM_DEFAULT_OFFSET_A 131072 
 #define GEMM_DEFAULT_OFFSET_B 1024
 #define GEMM_DEFAULT_ALIGN 0x03fffUL
 
@@ -1977,17 +1977,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define ZGEMM_DEFAULT_UNROLL_M 8
 #define ZGEMM_DEFAULT_UNROLL_N 2
 
-#define SGEMM_DEFAULT_P  480
+#define SGEMM_DEFAULT_P  960
 #define DGEMM_DEFAULT_P  480
 #define CGEMM_DEFAULT_P  480
 #define ZGEMM_DEFAULT_P  240
 
-#define SGEMM_DEFAULT_Q  1440
+#define SGEMM_DEFAULT_Q  720
 #define DGEMM_DEFAULT_Q  720
 #define CGEMM_DEFAULT_Q  720
 #define ZGEMM_DEFAULT_Q  360
 
-#define SGEMM_DEFAULT_R 28800
+#define SGEMM_DEFAULT_R 14400
 #define DGEMM_DEFAULT_R 14400
 #define CGEMM_DEFAULT_R 14400
 #define ZGEMM_DEFAULT_R 7200