updated cgemm and ctrmm kernels for POWER8
author     Werner Saar <wernsaar@googlemail.com>
Sun, 3 Apr 2016 12:30:49 +0000 (14:30 +0200)
committer  Werner Saar <wernsaar@googlemail.com>
Sun, 3 Apr 2016 12:30:49 +0000 (14:30 +0200)
kernel/power/cgemm_kernel_8x4_power8.S
kernel/power/cgemm_logic_8x4_power8.S
kernel/power/cgemm_macros_8x4_power8.S
kernel/power/ctrmm_kernel_8x4_power8.S
kernel/power/ctrmm_logic_8x4_power8.S

kernel/power/cgemm_kernel_8x4_power8.S
index f732c81..a7e7066 100644
@@ -26,11 +26,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 /**************************************************************************************
-* 2016/03/18 Werner Saar (wernsaar@googlemail.com)
+* 2016/04/03 Werner Saar (wernsaar@googlemail.com)
 *       BLASTEST               : OK
 *       CTEST                  : OK
 *       TEST                   : OK
-*       LAPACK-TEST            : OK
+*       LAPACK-TEST            : OK
 **************************************************************************************/
 
 /*********************************************************************/
@@ -130,10 +130,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 #define o0     0
-#define alpha_r vs30
-#define alpha_i vs31
 
-#define TBUFFER        r14
+#define alpha_dr vs28
+#define alpha_di vs29
+#define alpha_sr vs30
+#define alpha_si vs31
+
+
+#define NOTUSED        r14
 #define L      r15
 #define o12    r16
 #define o4     r17
@@ -271,21 +275,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "cgemm_macros_8x4_power8.S"
 
        cmpwi   cr0, M, 0
-       ble     .L999_H1
+       ble     L999_H1
        cmpwi   cr0, N, 0
-       ble     .L999_H1
+       ble     L999_H1
        cmpwi   cr0, K, 0
-       ble     .L999_H1
+       ble     L999_H1
 
        slwi    LDC, LDC, ZBASE_SHIFT
-       li      PRE, 256 
+       li      PRE, 384 
        li      o4  , 4
        li      o8  , 8
        li      o12 , 12
        li      o16 , 16
        li      o32 , 32
        li      o48 , 48
-       addi    TBUFFER, SP, 360
        
 
 #ifdef __64BIT__
@@ -294,14 +297,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        addi    T1 , SP, 224
 #endif
 
-       lxsspx  alpha_r, 0, T1
-       lxsspx  alpha_i, o8, T1
+       stxsspx vs1,  0, T1
+        lxsspx  alpha_dr, 0, T1
+       stxsspx vs2,  o8  , T1
+        lxsspx  alpha_di, o8, T1
+        addi    T1, SP, 360
+        li      T2, 0
+
+        stw             T2, 0(T1)
+        stw             T2, 4(T1)
+        stw             T2, 8(T1)
+        stxsspx         alpha_dr, o12, T1
+        lxvw4x          alpha_sr, o0 , T1
+        addi            T1, T1, 16
+
+        stw             T2, 0(T1)
+        stw             T2, 4(T1)
+        stw             T2, 8(T1)
+        stxsspx         alpha_di, o12, T1
+        lxvw4x          alpha_si, o0 , T1
 
        .align 5
 
 #include "cgemm_logic_8x4_power8.S"
 
-.L999:
+L999:
        addi    r3, 0, 0
 
        lfd     f14,    0(SP)
kernel/power/cgemm_logic_8x4_power8.S
index 51a0631..851a09a 100644
@@ -26,38 +26,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 /**************************************************************************************
-* 2016/03/18 Werner Saar (wernsaar@googlemail.com)
+* 2016/04/03 Werner Saar (wernsaar@googlemail.com)
 *       BLASTEST               : OK
 *       CTEST                  : OK
 *       TEST                   : OK
-*       LAPACK-TEST            : OK
+*       LAPACK-TEST            : OK
 **************************************************************************************/
 
-
        srawi.          J,      N,      2
-       ble             .LCGEMM_L4_END
+       ble             CGEMM_L4_END
 
-.LCGEMM_L4_BEGIN:
+CGEMM_L4_BEGIN:
 
        mr              CO,     C
        mr              AO,     A
        slwi            T1,     LDC     ,       2
        add             C,      C,      T1
        srawi.          I,      M,      3
-       ble             .LCGEMM_L4x8_END
+       ble             CGEMM_L4x8_END
 
-.LCGEMM_L4x8_BEGIN:
+CGEMM_L4x8_BEGIN:
 
 
        mr              BO,     B
        srawi.          L,      K,      3
-       ble             .LCGEMM_L4x8_SUB0
+       ble             CGEMM_L4x8_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LCGEMM_L4x8_SUB4
+       ble             CGEMM_L4x8_SUB4
 
-.LCGEMM_L4x8_LOOP_START:
+CGEMM_L4x8_LOOP_START:
 
        dcbt            AO,     PRE
+       dcbt            BO,     PRE
        LOAD4x8_1
        KERNEL4x8_I1
        dcbt            AO,     PRE
@@ -68,17 +68,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        KERNEL4x8_1
        dcbt            AO,     PRE
+       dcbt            BO,     PRE
        KERNEL4x8_2
        KERNEL4x8_1
        dcbt            AO,     PRE
        KERNEL4x8_2
 
        addic.          L,      L,      -2
-       ble             .LCGEMM_L4x8_LOOP_END
+       ble             CGEMM_L4x8_LOOP_END
 
        .align 5
 
-.LCGEMM_L4x8_LOOP:
+CGEMM_L4x8_LOOP:
 
        KERNEL4x8_1
        dcbt            AO,     PRE
@@ -89,15 +90,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        KERNEL4x8_1
        dcbt            AO,     PRE
+       dcbt            BO,     PRE
        KERNEL4x8_2
        KERNEL4x8_1
        dcbt            AO,     PRE
        KERNEL4x8_2
 
        addic.          L,      L,      -1
-       bgt             .LCGEMM_L4x8_LOOP
+       bgt             CGEMM_L4x8_LOOP
 
-.LCGEMM_L4x8_LOOP_END:
+CGEMM_L4x8_LOOP_END:
 
        KERNEL4x8_1
        dcbt            AO,     PRE
@@ -112,9 +114,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x8_1
        KERNEL4x8_E2
 
-       b               .LCGEMM_L4x8_SUB1
+       b               CGEMM_L4x8_SUB1
 
-.LCGEMM_L4x8_SUB4:
+CGEMM_L4x8_SUB4:
 
        KERNEL4x8_SUBI1
        KERNEL4x8_SUB1
@@ -126,53 +128,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x8_SUB1
        KERNEL4x8_SUB1
 
-       b               .LCGEMM_L4x8_SUB1
+       b               CGEMM_L4x8_SUB1
 
-.LCGEMM_L4x8_SUB0:
+CGEMM_L4x8_SUB0:
 
        andi.           L,      K,      7
 
        KERNEL4x8_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LCGEMM_L4x8_SAVE
-       b               .LCGEMM_L4x8_SUB2
+       ble             CGEMM_L4x8_SAVE
+       b               CGEMM_L4x8_SUB2
 
-.LCGEMM_L4x8_SUB1:
+CGEMM_L4x8_SUB1:
 
        andi.           L,      K,      7
-       ble             .LCGEMM_L4x8_SAVE
+       ble             CGEMM_L4x8_SAVE
 
-.LCGEMM_L4x8_SUB2:
+CGEMM_L4x8_SUB2:
 
        KERNEL4x8_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LCGEMM_L4x8_SUB2
+       bgt             CGEMM_L4x8_SUB2
 
-.LCGEMM_L4x8_SAVE:
+CGEMM_L4x8_SAVE:
 
        SAVE4x8
 
        addic.          I,      I,      -1
-       bgt             .LCGEMM_L4x8_BEGIN
+       bgt             CGEMM_L4x8_BEGIN
 
-.LCGEMM_L4x8_END:
+CGEMM_L4x8_END:
 
-.LCGEMM_L4x4_BEGIN:
+CGEMM_L4x4_BEGIN:
 
        andi.           T2,     M,      7
-       ble             .LCGEMM_L4x1_END
+       ble             CGEMM_L4x1_END
 
        andi.           T1,     M,      4
-       ble             .LCGEMM_L4x4_END
+       ble             CGEMM_L4x4_END
        mr              BO,     B
        srawi.          L,      K,      3
-       ble             .LCGEMM_L4x4_SUB0
+       ble             CGEMM_L4x4_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LCGEMM_L4x4_SUB4
+       ble             CGEMM_L4x4_SUB4
 
-.LCGEMM_L4x4_LOOP_START:
+CGEMM_L4x4_LOOP_START:
 
        LOAD4x4_1
        KERNEL4x4_I1
@@ -186,11 +188,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x4_2
 
        addic.          L,      L,      -2
-       ble             .LCGEMM_L4x4_LOOP_END
+       ble             CGEMM_L4x4_LOOP_END
 
        .align 5
 
-.LCGEMM_L4x4_LOOP:
+CGEMM_L4x4_LOOP:
 
        KERNEL4x4_1
        KERNEL4x4_2
@@ -203,9 +205,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x4_2
 
        addic.          L,      L,      -1
-       bgt             .LCGEMM_L4x4_LOOP
+       bgt             CGEMM_L4x4_LOOP
 
-.LCGEMM_L4x4_LOOP_END:
+CGEMM_L4x4_LOOP_END:
 
        KERNEL4x4_1
        KERNEL4x4_2
@@ -217,9 +219,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x4_1
        KERNEL4x4_E2
 
-       b               .LCGEMM_L4x4_SUB1
+       b               CGEMM_L4x4_SUB1
 
-.LCGEMM_L4x4_SUB4:
+CGEMM_L4x4_SUB4:
 
        KERNEL4x4_SUBI1
        KERNEL4x4_SUB1
@@ -231,48 +233,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x4_SUB1
        KERNEL4x4_SUB1
 
-       b               .LCGEMM_L4x4_SUB1
+       b               CGEMM_L4x4_SUB1
 
-.LCGEMM_L4x4_SUB0:
+CGEMM_L4x4_SUB0:
 
        andi.           L,      K,      7
 
        KERNEL4x4_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LCGEMM_L4x4_SAVE
-       b               .LCGEMM_L4x4_SUB2
+       ble             CGEMM_L4x4_SAVE
+       b               CGEMM_L4x4_SUB2
 
-.LCGEMM_L4x4_SUB1:
+CGEMM_L4x4_SUB1:
 
        andi.           L,      K,      7
-       ble             .LCGEMM_L4x4_SAVE
+       ble             CGEMM_L4x4_SAVE
 
-.LCGEMM_L4x4_SUB2:
+CGEMM_L4x4_SUB2:
 
        KERNEL4x4_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LCGEMM_L4x4_SUB2
+       bgt             CGEMM_L4x4_SUB2
 
-.LCGEMM_L4x4_SAVE:
+CGEMM_L4x4_SAVE:
 
        SAVE4x4
 
-.LCGEMM_L4x4_END:
+CGEMM_L4x4_END:
 
-.LCGEMM_L4x2_BEGIN:
+CGEMM_L4x2_BEGIN:
 
 
        andi.           T1,     M,      2
-       ble             .LCGEMM_L4x2_END
+       ble             CGEMM_L4x2_END
        mr              BO,     B
        srawi.          L,      K,      3
-       ble             .LCGEMM_L4x2_SUB0
+       ble             CGEMM_L4x2_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LCGEMM_L4x2_SUB4
+       ble             CGEMM_L4x2_SUB4
 
-.LCGEMM_L4x2_LOOP_START:
+CGEMM_L4x2_LOOP_START:
 
        LOAD4x2_1
        KERNEL4x2_I1
@@ -286,11 +288,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x2_2
 
        addic.          L,      L,      -2
-       ble             .LCGEMM_L4x2_LOOP_END
+       ble             CGEMM_L4x2_LOOP_END
 
        .align 5
 
-.LCGEMM_L4x2_LOOP:
+CGEMM_L4x2_LOOP:
 
        KERNEL4x2_1
        KERNEL4x2_2
@@ -303,9 +305,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x2_2
 
        addic.          L,      L,      -1
-       bgt             .LCGEMM_L4x2_LOOP
+       bgt             CGEMM_L4x2_LOOP
 
-.LCGEMM_L4x2_LOOP_END:
+CGEMM_L4x2_LOOP_END:
 
        KERNEL4x2_1
        KERNEL4x2_2
@@ -317,9 +319,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x2_1
        KERNEL4x2_E2
 
-       b               .LCGEMM_L4x2_SUB1
+       b               CGEMM_L4x2_SUB1
 
-.LCGEMM_L4x2_SUB4:
+CGEMM_L4x2_SUB4:
 
        KERNEL4x2_SUBI1
        KERNEL4x2_SUB1
@@ -331,48 +333,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x2_SUB1
        KERNEL4x2_SUB1
 
-       b               .LCGEMM_L4x2_SUB1
+       b               CGEMM_L4x2_SUB1
 
-.LCGEMM_L4x2_SUB0:
+CGEMM_L4x2_SUB0:
 
        andi.           L,      K,      7
 
        KERNEL4x2_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LCGEMM_L4x2_SAVE
-       b               .LCGEMM_L4x2_SUB2
+       ble             CGEMM_L4x2_SAVE
+       b               CGEMM_L4x2_SUB2
 
-.LCGEMM_L4x2_SUB1:
+CGEMM_L4x2_SUB1:
 
        andi.           L,      K,      7
-       ble             .LCGEMM_L4x2_SAVE
+       ble             CGEMM_L4x2_SAVE
 
-.LCGEMM_L4x2_SUB2:
+CGEMM_L4x2_SUB2:
 
        KERNEL4x2_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LCGEMM_L4x2_SUB2
+       bgt             CGEMM_L4x2_SUB2
 
-.LCGEMM_L4x2_SAVE:
+CGEMM_L4x2_SAVE:
 
        SAVE4x2
 
-.LCGEMM_L4x2_END:
+CGEMM_L4x2_END:
 
-.LCGEMM_L4x1_BEGIN:
+CGEMM_L4x1_BEGIN:
 
 
        andi.           T1,     M,      1
-       ble             .LCGEMM_L4x1_END
+       ble             CGEMM_L4x1_END
        mr              BO,     B
        srawi.          L,      K,      3
-       ble             .LCGEMM_L4x1_SUB0
+       ble             CGEMM_L4x1_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LCGEMM_L4x1_SUB4
+       ble             CGEMM_L4x1_SUB4
 
-.LCGEMM_L4x1_LOOP_START:
+CGEMM_L4x1_LOOP_START:
 
        LOAD4x1_1
        KERNEL4x1_I1
@@ -386,11 +388,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x1_2
 
        addic.          L,      L,      -2
-       ble             .LCGEMM_L4x1_LOOP_END
+       ble             CGEMM_L4x1_LOOP_END
 
        .align 5
 
-.LCGEMM_L4x1_LOOP:
+CGEMM_L4x1_LOOP:
 
        KERNEL4x1_1
        KERNEL4x1_2
@@ -403,9 +405,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x1_2
 
        addic.          L,      L,      -1
-       bgt             .LCGEMM_L4x1_LOOP
+       bgt             CGEMM_L4x1_LOOP
 
-.LCGEMM_L4x1_LOOP_END:
+CGEMM_L4x1_LOOP_END:
 
        KERNEL4x1_1
        KERNEL4x1_2
@@ -417,9 +419,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x1_1
        KERNEL4x1_E2
 
-       b               .LCGEMM_L4x1_SUB1
+       b               CGEMM_L4x1_SUB1
 
-.LCGEMM_L4x1_SUB4:
+CGEMM_L4x1_SUB4:
 
        KERNEL4x1_SUBI1
        KERNEL4x1_SUB1
@@ -431,74 +433,74 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x1_SUB1
        KERNEL4x1_SUB1
 
-       b               .LCGEMM_L4x1_SUB1
+       b               CGEMM_L4x1_SUB1
 
-.LCGEMM_L4x1_SUB0:
+CGEMM_L4x1_SUB0:
 
        andi.           L,      K,      7
 
        KERNEL4x1_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LCGEMM_L4x1_SAVE
-       b               .LCGEMM_L4x1_SUB2
+       ble             CGEMM_L4x1_SAVE
+       b               CGEMM_L4x1_SUB2
 
-.LCGEMM_L4x1_SUB1:
+CGEMM_L4x1_SUB1:
 
        andi.           L,      K,      7
-       ble             .LCGEMM_L4x1_SAVE
+       ble             CGEMM_L4x1_SAVE
 
-.LCGEMM_L4x1_SUB2:
+CGEMM_L4x1_SUB2:
 
        KERNEL4x1_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LCGEMM_L4x1_SUB2
+       bgt             CGEMM_L4x1_SUB2
 
-.LCGEMM_L4x1_SAVE:
+CGEMM_L4x1_SAVE:
 
        SAVE4x1
 
-.LCGEMM_L4x1_END:
+CGEMM_L4x1_END:
 
        slwi            T1,     K,      5
        add             B,      B,      T1
 
        addic.          J,      J,      -1
-       bgt             .LCGEMM_L4_BEGIN
+       bgt             CGEMM_L4_BEGIN
 
        andi.           T2,     N,      3
-       ble             .L999_H2
+       ble             L999_H2
 
-.LCGEMM_L4_END:
+CGEMM_L4_END:
 
-       b               .LCGEMM_L2_BEGIN
+       b               CGEMM_L2_BEGIN
 
-.L999_H1:
+L999_H1:
 
-       b               .L999_H2
+       b               L999_H2
 
-.LCGEMM_L2_BEGIN:
+CGEMM_L2_BEGIN:
 
        andi.           T1,     N,      2
-       ble             .LCGEMM_L2_END
+       ble             CGEMM_L2_END
        mr              CO,     C
        mr              AO,     A
        slwi            T1,     LDC     ,       1
        add             C,      C,      T1
        srawi.          I,      M,      3
-       ble             .LCGEMM_L2x8_END
+       ble             CGEMM_L2x8_END
 
-.LCGEMM_L2x8_BEGIN:
+CGEMM_L2x8_BEGIN:
 
 
        mr              BO,     B
        srawi.          L,      K,      3
-       ble             .LCGEMM_L2x8_SUB0
+       ble             CGEMM_L2x8_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LCGEMM_L2x8_SUB4
+       ble             CGEMM_L2x8_SUB4
 
-.LCGEMM_L2x8_LOOP_START:
+CGEMM_L2x8_LOOP_START:
 
        dcbt            AO,     PRE
        LOAD2x8_1
@@ -517,11 +519,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x8_2
 
        addic.          L,      L,      -2
-       ble             .LCGEMM_L2x8_LOOP_END
+       ble             CGEMM_L2x8_LOOP_END
 
        .align 5
 
-.LCGEMM_L2x8_LOOP:
+CGEMM_L2x8_LOOP:
 
        KERNEL2x8_1
        dcbt            AO,     PRE
@@ -538,9 +540,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x8_2
 
        addic.          L,      L,      -1
-       bgt             .LCGEMM_L2x8_LOOP
+       bgt             CGEMM_L2x8_LOOP
 
-.LCGEMM_L2x8_LOOP_END:
+CGEMM_L2x8_LOOP_END:
 
        KERNEL2x8_1
        dcbt            AO,     PRE
@@ -555,9 +557,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x8_1
        KERNEL2x8_E2
 
-       b               .LCGEMM_L2x8_SUB1
+       b               CGEMM_L2x8_SUB1
 
-.LCGEMM_L2x8_SUB4:
+CGEMM_L2x8_SUB4:
 
        KERNEL2x8_SUBI1
        KERNEL2x8_SUB1
@@ -569,53 +571,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x8_SUB1
        KERNEL2x8_SUB1
 
-       b               .LCGEMM_L2x8_SUB1
+       b               CGEMM_L2x8_SUB1
 
-.LCGEMM_L2x8_SUB0:
+CGEMM_L2x8_SUB0:
 
        andi.           L,      K,      7
 
        KERNEL2x8_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LCGEMM_L2x8_SAVE
-       b               .LCGEMM_L2x8_SUB2
+       ble             CGEMM_L2x8_SAVE
+       b               CGEMM_L2x8_SUB2
 
-.LCGEMM_L2x8_SUB1:
+CGEMM_L2x8_SUB1:
 
        andi.           L,      K,      7
-       ble             .LCGEMM_L2x8_SAVE
+       ble             CGEMM_L2x8_SAVE
 
-.LCGEMM_L2x8_SUB2:
+CGEMM_L2x8_SUB2:
 
        KERNEL2x8_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LCGEMM_L2x8_SUB2
+       bgt             CGEMM_L2x8_SUB2
 
-.LCGEMM_L2x8_SAVE:
+CGEMM_L2x8_SAVE:
 
        SAVE2x8
 
        addic.          I,      I,      -1
-       bgt             .LCGEMM_L2x8_BEGIN
+       bgt             CGEMM_L2x8_BEGIN
 
-.LCGEMM_L2x8_END:
+CGEMM_L2x8_END:
 
-.LCGEMM_L2x4_BEGIN:
+CGEMM_L2x4_BEGIN:
 
        andi.           T2,     M,      7
-       ble             .LCGEMM_L2x1_END
+       ble             CGEMM_L2x1_END
 
        andi.           T1,     M,      4
-       ble             .LCGEMM_L2x4_END
+       ble             CGEMM_L2x4_END
        mr              BO,     B
        srawi.          L,      K,      3
-       ble             .LCGEMM_L2x4_SUB0
+       ble             CGEMM_L2x4_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LCGEMM_L2x4_SUB4
+       ble             CGEMM_L2x4_SUB4
 
-.LCGEMM_L2x4_LOOP_START:
+CGEMM_L2x4_LOOP_START:
 
        LOAD2x4_1
        KERNEL2x4_I1
@@ -629,11 +631,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x4_2
 
        addic.          L,      L,      -2
-       ble             .LCGEMM_L2x4_LOOP_END
+       ble             CGEMM_L2x4_LOOP_END
 
        .align 5
 
-.LCGEMM_L2x4_LOOP:
+CGEMM_L2x4_LOOP:
 
        KERNEL2x4_1
        KERNEL2x4_2
@@ -646,9 +648,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x4_2
 
        addic.          L,      L,      -1
-       bgt             .LCGEMM_L2x4_LOOP
+       bgt             CGEMM_L2x4_LOOP
 
-.LCGEMM_L2x4_LOOP_END:
+CGEMM_L2x4_LOOP_END:
 
        KERNEL2x4_1
        KERNEL2x4_2
@@ -660,9 +662,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x4_1
        KERNEL2x4_E2
 
-       b               .LCGEMM_L2x4_SUB1
+       b               CGEMM_L2x4_SUB1
 
-.LCGEMM_L2x4_SUB4:
+CGEMM_L2x4_SUB4:
 
        KERNEL2x4_SUBI1
        KERNEL2x4_SUB1
@@ -674,48 +676,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x4_SUB1
        KERNEL2x4_SUB1
 
-       b               .LCGEMM_L2x4_SUB1
+       b               CGEMM_L2x4_SUB1
 
-.LCGEMM_L2x4_SUB0:
+CGEMM_L2x4_SUB0:
 
        andi.           L,      K,      7
 
        KERNEL2x4_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LCGEMM_L2x4_SAVE
-       b               .LCGEMM_L2x4_SUB2
+       ble             CGEMM_L2x4_SAVE
+       b               CGEMM_L2x4_SUB2
 
-.LCGEMM_L2x4_SUB1:
+CGEMM_L2x4_SUB1:
 
        andi.           L,      K,      7
-       ble             .LCGEMM_L2x4_SAVE
+       ble             CGEMM_L2x4_SAVE
 
-.LCGEMM_L2x4_SUB2:
+CGEMM_L2x4_SUB2:
 
        KERNEL2x4_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LCGEMM_L2x4_SUB2
+       bgt             CGEMM_L2x4_SUB2
 
-.LCGEMM_L2x4_SAVE:
+CGEMM_L2x4_SAVE:
 
        SAVE2x4
 
-.LCGEMM_L2x4_END:
+CGEMM_L2x4_END:
 
-.LCGEMM_L2x2_BEGIN:
+CGEMM_L2x2_BEGIN:
 
 
        andi.           T1,     M,      2
-       ble             .LCGEMM_L2x2_END
+       ble             CGEMM_L2x2_END
        mr              BO,     B
        srawi.          L,      K,      3
-       ble             .LCGEMM_L2x2_SUB0
+       ble             CGEMM_L2x2_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LCGEMM_L2x2_SUB4
+       ble             CGEMM_L2x2_SUB4
 
-.LCGEMM_L2x2_LOOP_START:
+CGEMM_L2x2_LOOP_START:
 
        LOAD2x2_1
        KERNEL2x2_I1
@@ -729,11 +731,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x2_2
 
        addic.          L,      L,      -2
-       ble             .LCGEMM_L2x2_LOOP_END
+       ble             CGEMM_L2x2_LOOP_END
 
        .align 5
 
-.LCGEMM_L2x2_LOOP:
+CGEMM_L2x2_LOOP:
 
        KERNEL2x2_1
        KERNEL2x2_2
@@ -746,9 +748,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x2_2
 
        addic.          L,      L,      -1
-       bgt             .LCGEMM_L2x2_LOOP
+       bgt             CGEMM_L2x2_LOOP
 
-.LCGEMM_L2x2_LOOP_END:
+CGEMM_L2x2_LOOP_END:
 
        KERNEL2x2_1
        KERNEL2x2_2
@@ -760,9 +762,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x2_1
        KERNEL2x2_E2
 
-       b               .LCGEMM_L2x2_SUB1
+       b               CGEMM_L2x2_SUB1
 
-.LCGEMM_L2x2_SUB4:
+CGEMM_L2x2_SUB4:
 
        KERNEL2x2_SUBI1
        KERNEL2x2_SUB1
@@ -774,48 +776,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x2_SUB1
        KERNEL2x2_SUB1
 
-       b               .LCGEMM_L2x2_SUB1
+       b               CGEMM_L2x2_SUB1
 
-.LCGEMM_L2x2_SUB0:
+CGEMM_L2x2_SUB0:
 
        andi.           L,      K,      7
 
        KERNEL2x2_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LCGEMM_L2x2_SAVE
-       b               .LCGEMM_L2x2_SUB2
+       ble             CGEMM_L2x2_SAVE
+       b               CGEMM_L2x2_SUB2
 
-.LCGEMM_L2x2_SUB1:
+CGEMM_L2x2_SUB1:
 
        andi.           L,      K,      7
-       ble             .LCGEMM_L2x2_SAVE
+       ble             CGEMM_L2x2_SAVE
 
-.LCGEMM_L2x2_SUB2:
+CGEMM_L2x2_SUB2:
 
        KERNEL2x2_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LCGEMM_L2x2_SUB2
+       bgt             CGEMM_L2x2_SUB2
 
-.LCGEMM_L2x2_SAVE:
+CGEMM_L2x2_SAVE:
 
        SAVE2x2
 
-.LCGEMM_L2x2_END:
+CGEMM_L2x2_END:
 
-.LCGEMM_L2x1_BEGIN:
+CGEMM_L2x1_BEGIN:
 
 
        andi.           T1,     M,      1
-       ble             .LCGEMM_L2x1_END
+       ble             CGEMM_L2x1_END
        mr              BO,     B
        srawi.          L,      K,      3
-       ble             .LCGEMM_L2x1_SUB0
+       ble             CGEMM_L2x1_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LCGEMM_L2x1_SUB4
+       ble             CGEMM_L2x1_SUB4
 
-.LCGEMM_L2x1_LOOP_START:
+CGEMM_L2x1_LOOP_START:
 
        LOAD2x1_1
        KERNEL2x1_I1
@@ -829,11 +831,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x1_2
 
        addic.          L,      L,      -2
-       ble             .LCGEMM_L2x1_LOOP_END
+       ble             CGEMM_L2x1_LOOP_END
 
        .align 5
 
-.LCGEMM_L2x1_LOOP:
+CGEMM_L2x1_LOOP:
 
        KERNEL2x1_1
        KERNEL2x1_2
@@ -846,9 +848,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x1_2
 
        addic.          L,      L,      -1
-       bgt             .LCGEMM_L2x1_LOOP
+       bgt             CGEMM_L2x1_LOOP
 
-.LCGEMM_L2x1_LOOP_END:
+CGEMM_L2x1_LOOP_END:
 
        KERNEL2x1_1
        KERNEL2x1_2
@@ -860,9 +862,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x1_1
        KERNEL2x1_E2
 
-       b               .LCGEMM_L2x1_SUB1
+       b               CGEMM_L2x1_SUB1
 
-.LCGEMM_L2x1_SUB4:
+CGEMM_L2x1_SUB4:
 
        KERNEL2x1_SUBI1
        KERNEL2x1_SUB1
@@ -874,66 +876,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x1_SUB1
        KERNEL2x1_SUB1
 
-       b               .LCGEMM_L2x1_SUB1
+       b               CGEMM_L2x1_SUB1
 
-.LCGEMM_L2x1_SUB0:
+CGEMM_L2x1_SUB0:
 
        andi.           L,      K,      7
 
        KERNEL2x1_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LCGEMM_L2x1_SAVE
-       b               .LCGEMM_L2x1_SUB2
+       ble             CGEMM_L2x1_SAVE
+       b               CGEMM_L2x1_SUB2
 
-.LCGEMM_L2x1_SUB1:
+CGEMM_L2x1_SUB1:
 
        andi.           L,      K,      7
-       ble             .LCGEMM_L2x1_SAVE
+       ble             CGEMM_L2x1_SAVE
 
-.LCGEMM_L2x1_SUB2:
+CGEMM_L2x1_SUB2:
 
        KERNEL2x1_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LCGEMM_L2x1_SUB2
+       bgt             CGEMM_L2x1_SUB2
 
-.LCGEMM_L2x1_SAVE:
+CGEMM_L2x1_SAVE:
 
        SAVE2x1
 
-.LCGEMM_L2x1_END:
+CGEMM_L2x1_END:
 
        slwi            T1,     K,      4
        add             B,      B,      T1
 
-.LCGEMM_L2_END:
+CGEMM_L2_END:
 
-       b               .LCGEMM_L1_BEGIN
+       b               CGEMM_L1_BEGIN
 
-.L999_H2:
+L999_H2:
 
-       b               .L999
+       b               L999
 
-.LCGEMM_L1_BEGIN:
+CGEMM_L1_BEGIN:
 
        andi.           T1,     N,      1
-       ble             .LCGEMM_L1_END
+       ble             CGEMM_L1_END
        mr              CO,     C
        mr              AO,     A
        srawi.          I,      M,      3
-       ble             .LCGEMM_L1x8_END
+       ble             CGEMM_L1x8_END
 
-.LCGEMM_L1x8_BEGIN:
+CGEMM_L1x8_BEGIN:
 
 
        mr              BO,     B
        srawi.          L,      K,      3
-       ble             .LCGEMM_L1x8_SUB0
+       ble             CGEMM_L1x8_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LCGEMM_L1x8_SUB4
+       ble             CGEMM_L1x8_SUB4
 
-.LCGEMM_L1x8_LOOP_START:
+CGEMM_L1x8_LOOP_START:
 
        dcbt            AO,     PRE
        LOAD1x8_1
@@ -952,11 +954,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x8_2
 
        addic.          L,      L,      -2
-       ble             .LCGEMM_L1x8_LOOP_END
+       ble             CGEMM_L1x8_LOOP_END
 
        .align 5
 
-.LCGEMM_L1x8_LOOP:
+CGEMM_L1x8_LOOP:
 
        KERNEL1x8_1
        dcbt            AO,     PRE
@@ -973,9 +975,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x8_2
 
        addic.          L,      L,      -1
-       bgt             .LCGEMM_L1x8_LOOP
+       bgt             CGEMM_L1x8_LOOP
 
-.LCGEMM_L1x8_LOOP_END:
+CGEMM_L1x8_LOOP_END:
 
        KERNEL1x8_1
        dcbt            AO,     PRE
@@ -990,9 +992,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x8_1
        KERNEL1x8_E2
 
-       b               .LCGEMM_L1x8_SUB1
+       b               CGEMM_L1x8_SUB1
 
-.LCGEMM_L1x8_SUB4:
+CGEMM_L1x8_SUB4:
 
        KERNEL1x8_SUBI1
        KERNEL1x8_SUB1
@@ -1004,53 +1006,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x8_SUB1
        KERNEL1x8_SUB1
 
-       b               .LCGEMM_L1x8_SUB1
+       b               CGEMM_L1x8_SUB1
 
-.LCGEMM_L1x8_SUB0:
+CGEMM_L1x8_SUB0:
 
        andi.           L,      K,      7
 
        KERNEL1x8_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LCGEMM_L1x8_SAVE
-       b               .LCGEMM_L1x8_SUB2
+       ble             CGEMM_L1x8_SAVE
+       b               CGEMM_L1x8_SUB2
 
-.LCGEMM_L1x8_SUB1:
+CGEMM_L1x8_SUB1:
 
        andi.           L,      K,      7
-       ble             .LCGEMM_L1x8_SAVE
+       ble             CGEMM_L1x8_SAVE
 
-.LCGEMM_L1x8_SUB2:
+CGEMM_L1x8_SUB2:
 
        KERNEL1x8_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LCGEMM_L1x8_SUB2
+       bgt             CGEMM_L1x8_SUB2
 
-.LCGEMM_L1x8_SAVE:
+CGEMM_L1x8_SAVE:
 
        SAVE1x8
 
        addic.          I,      I,      -1
-       bgt             .LCGEMM_L1x8_BEGIN
+       bgt             CGEMM_L1x8_BEGIN
 
-.LCGEMM_L1x8_END:
+CGEMM_L1x8_END:
 
-.LCGEMM_L1x4_BEGIN:
+CGEMM_L1x4_BEGIN:
 
        andi.           T2,     M,      7
-       ble             .LCGEMM_L1x1_END
+       ble             CGEMM_L1x1_END
 
        andi.           T1,     M,      4
-       ble             .LCGEMM_L1x4_END
+       ble             CGEMM_L1x4_END
        mr              BO,     B
        srawi.          L,      K,      3
-       ble             .LCGEMM_L1x4_SUB0
+       ble             CGEMM_L1x4_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LCGEMM_L1x4_SUB4
+       ble             CGEMM_L1x4_SUB4
 
-.LCGEMM_L1x4_LOOP_START:
+CGEMM_L1x4_LOOP_START:
 
        LOAD1x4_1
        KERNEL1x4_I1
@@ -1064,11 +1066,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x4_2
 
        addic.          L,      L,      -2
-       ble             .LCGEMM_L1x4_LOOP_END
+       ble             CGEMM_L1x4_LOOP_END
 
        .align 5
 
-.LCGEMM_L1x4_LOOP:
+CGEMM_L1x4_LOOP:
 
        KERNEL1x4_1
        KERNEL1x4_2
@@ -1081,9 +1083,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x4_2
 
        addic.          L,      L,      -1
-       bgt             .LCGEMM_L1x4_LOOP
+       bgt             CGEMM_L1x4_LOOP
 
-.LCGEMM_L1x4_LOOP_END:
+CGEMM_L1x4_LOOP_END:
 
        KERNEL1x4_1
        KERNEL1x4_2
@@ -1095,9 +1097,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x4_1
        KERNEL1x4_E2
 
-       b               .LCGEMM_L1x4_SUB1
+       b               CGEMM_L1x4_SUB1
 
-.LCGEMM_L1x4_SUB4:
+CGEMM_L1x4_SUB4:
 
        KERNEL1x4_SUBI1
        KERNEL1x4_SUB1
@@ -1109,48 +1111,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x4_SUB1
        KERNEL1x4_SUB1
 
-       b               .LCGEMM_L1x4_SUB1
+       b               CGEMM_L1x4_SUB1
 
-.LCGEMM_L1x4_SUB0:
+CGEMM_L1x4_SUB0:
 
        andi.           L,      K,      7
 
        KERNEL1x4_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LCGEMM_L1x4_SAVE
-       b               .LCGEMM_L1x4_SUB2
+       ble             CGEMM_L1x4_SAVE
+       b               CGEMM_L1x4_SUB2
 
-.LCGEMM_L1x4_SUB1:
+CGEMM_L1x4_SUB1:
 
        andi.           L,      K,      7
-       ble             .LCGEMM_L1x4_SAVE
+       ble             CGEMM_L1x4_SAVE
 
-.LCGEMM_L1x4_SUB2:
+CGEMM_L1x4_SUB2:
 
        KERNEL1x4_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LCGEMM_L1x4_SUB2
+       bgt             CGEMM_L1x4_SUB2
 
-.LCGEMM_L1x4_SAVE:
+CGEMM_L1x4_SAVE:
 
        SAVE1x4
 
-.LCGEMM_L1x4_END:
+CGEMM_L1x4_END:
 
-.LCGEMM_L1x2_BEGIN:
+CGEMM_L1x2_BEGIN:
 
 
        andi.           T1,     M,      2
-       ble             .LCGEMM_L1x2_END
+       ble             CGEMM_L1x2_END
        mr              BO,     B
        srawi.          L,      K,      3
-       ble             .LCGEMM_L1x2_SUB0
+       ble             CGEMM_L1x2_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LCGEMM_L1x2_SUB4
+       ble             CGEMM_L1x2_SUB4
 
-.LCGEMM_L1x2_LOOP_START:
+CGEMM_L1x2_LOOP_START:
 
        LOAD1x2_1
        KERNEL1x2_I1
@@ -1164,11 +1166,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x2_2
 
        addic.          L,      L,      -2
-       ble             .LCGEMM_L1x2_LOOP_END
+       ble             CGEMM_L1x2_LOOP_END
 
        .align 5
 
-.LCGEMM_L1x2_LOOP:
+CGEMM_L1x2_LOOP:
 
        KERNEL1x2_1
        KERNEL1x2_2
@@ -1181,9 +1183,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x2_2
 
        addic.          L,      L,      -1
-       bgt             .LCGEMM_L1x2_LOOP
+       bgt             CGEMM_L1x2_LOOP
 
-.LCGEMM_L1x2_LOOP_END:
+CGEMM_L1x2_LOOP_END:
 
        KERNEL1x2_1
        KERNEL1x2_2
@@ -1195,9 +1197,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x2_1
        KERNEL1x2_E2
 
-       b               .LCGEMM_L1x2_SUB1
+       b               CGEMM_L1x2_SUB1
 
-.LCGEMM_L1x2_SUB4:
+CGEMM_L1x2_SUB4:
 
        KERNEL1x2_SUBI1
        KERNEL1x2_SUB1
@@ -1209,48 +1211,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x2_SUB1
        KERNEL1x2_SUB1
 
-       b               .LCGEMM_L1x2_SUB1
+       b               CGEMM_L1x2_SUB1
 
-.LCGEMM_L1x2_SUB0:
+CGEMM_L1x2_SUB0:
 
        andi.           L,      K,      7
 
        KERNEL1x2_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LCGEMM_L1x2_SAVE
-       b               .LCGEMM_L1x2_SUB2
+       ble             CGEMM_L1x2_SAVE
+       b               CGEMM_L1x2_SUB2
 
-.LCGEMM_L1x2_SUB1:
+CGEMM_L1x2_SUB1:
 
        andi.           L,      K,      7
-       ble             .LCGEMM_L1x2_SAVE
+       ble             CGEMM_L1x2_SAVE
 
-.LCGEMM_L1x2_SUB2:
+CGEMM_L1x2_SUB2:
 
        KERNEL1x2_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LCGEMM_L1x2_SUB2
+       bgt             CGEMM_L1x2_SUB2
 
-.LCGEMM_L1x2_SAVE:
+CGEMM_L1x2_SAVE:
 
        SAVE1x2
 
-.LCGEMM_L1x2_END:
+CGEMM_L1x2_END:
 
-.LCGEMM_L1x1_BEGIN:
+CGEMM_L1x1_BEGIN:
 
 
        andi.           T1,     M,      1
-       ble             .LCGEMM_L1x1_END
+       ble             CGEMM_L1x1_END
        mr              BO,     B
        srawi.          L,      K,      3
-       ble             .LCGEMM_L1x1_SUB0
+       ble             CGEMM_L1x1_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LCGEMM_L1x1_SUB4
+       ble             CGEMM_L1x1_SUB4
 
-.LCGEMM_L1x1_LOOP_START:
+CGEMM_L1x1_LOOP_START:
 
        LOAD1x1_1
        KERNEL1x1_I1
@@ -1264,11 +1266,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x1_2
 
        addic.          L,      L,      -2
-       ble             .LCGEMM_L1x1_LOOP_END
+       ble             CGEMM_L1x1_LOOP_END
 
        .align 5
 
-.LCGEMM_L1x1_LOOP:
+CGEMM_L1x1_LOOP:
 
        KERNEL1x1_1
        KERNEL1x1_2
@@ -1281,9 +1283,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x1_2
 
        addic.          L,      L,      -1
-       bgt             .LCGEMM_L1x1_LOOP
+       bgt             CGEMM_L1x1_LOOP
 
-.LCGEMM_L1x1_LOOP_END:
+CGEMM_L1x1_LOOP_END:
 
        KERNEL1x1_1
        KERNEL1x1_2
@@ -1295,9 +1297,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x1_1
        KERNEL1x1_E2
 
-       b               .LCGEMM_L1x1_SUB1
+       b               CGEMM_L1x1_SUB1
 
-.LCGEMM_L1x1_SUB4:
+CGEMM_L1x1_SUB4:
 
        KERNEL1x1_SUBI1
        KERNEL1x1_SUB1
@@ -1309,34 +1311,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x1_SUB1
        KERNEL1x1_SUB1
 
-       b               .LCGEMM_L1x1_SUB1
+       b               CGEMM_L1x1_SUB1
 
-.LCGEMM_L1x1_SUB0:
+CGEMM_L1x1_SUB0:
 
        andi.           L,      K,      7
 
        KERNEL1x1_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LCGEMM_L1x1_SAVE
-       b               .LCGEMM_L1x1_SUB2
+       ble             CGEMM_L1x1_SAVE
+       b               CGEMM_L1x1_SUB2
 
-.LCGEMM_L1x1_SUB1:
+CGEMM_L1x1_SUB1:
 
        andi.           L,      K,      7
-       ble             .LCGEMM_L1x1_SAVE
+       ble             CGEMM_L1x1_SAVE
 
-.LCGEMM_L1x1_SUB2:
+CGEMM_L1x1_SUB2:
 
        KERNEL1x1_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LCGEMM_L1x1_SUB2
+       bgt             CGEMM_L1x1_SUB2
 
-.LCGEMM_L1x1_SAVE:
+CGEMM_L1x1_SAVE:
 
        SAVE1x1
 
-.LCGEMM_L1x1_END:
+CGEMM_L1x1_END:
 
-.LCGEMM_L1_END:
+CGEMM_L1_END:
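
For orientation: the label structure above (CGEMM_L4_*, CGEMM_L2_*, CGEMM_L1_* with their 8/4/2/1-row variants) mirrors a blocked loop nest in which N is consumed in strips of 4, 2 and 1 columns, M in panels of 8, 4, 2 and 1 rows, and K is unrolled by 8 with a remainder loop (andi. L, K, 7). A minimal C sketch of that control flow follows, using placeholder names for the KERNEL*/SAVE* macros; these are not real functions in OpenBLAS.

    static void kernel_4x8(void) { /* one k-step of the 4x8 complex micro-kernel */ }
    static void save_4x8(void)   { /* SAVE4x8: scale by alpha and store to C     */ }

    static void cgemm_l4x8_blocking(int M, int N, int K)
    {
        for (int j = 0; j < (N >> 2); j++) {         /* CGEMM_L4_BEGIN: 4-column strip   */
            for (int i = 0; i < (M >> 3); i++) {     /* CGEMM_L4x8_BEGIN: 8-row panel    */
                for (int l = 0; l < (K >> 3); l++)   /* srawi. L, K, 3 : K unrolled by 8 */
                    for (int u = 0; u < 8; u++)
                        kernel_4x8();
                for (int l = 0; l < (K & 7); l++)    /* andi. L, K, 7  : K remainder     */
                    kernel_4x8();
                save_4x8();
            }
            /* M remainder panels of 4, 2 and 1 rows: CGEMM_L4x4_*, L4x2_*, L4x1_* */
        }
        /* N remainder strips of 2 and 1 columns: CGEMM_L2_* and CGEMM_L1_* */
    }
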
kernel/power/cgemm_macros_8x4_power8.S
index 2085d37..48a2125 100644
@@ -26,40 +26,56 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 /**************************************************************************************
-* 2016/03/18 Werner Saar (wernsaar@googlemail.com)
+* 2016/04/04 Werner Saar (wernsaar@googlemail.com)
 *       BLASTEST               : OK
 *       CTEST                  : OK
 *       TEST                   : OK
-*       LAPACK-TEST            : OK
+*       LAPACK-TEST            : OK
 **************************************************************************************/
 
 #if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
 
-       #define XSFADD_R1   xsaddsp
-       #define XSFADD_R2   xssubsp
-       #define XSFADD_I1   xsaddsp
-       #define XSFADD_I2   xsaddsp
+       #define XSFADD_R1   xsadddp
+       #define XSFADD_R2   xssubdp
+       #define XSFADD_I1   xsadddp
+       #define XSFADD_I2   xsadddp
+       #define XVFADD_R1   xvaddsp
+       #define XVFADD_R2   xvsubsp
+       #define XVFADD_I1   xvaddsp
+       #define XVFADD_I2   xvaddsp
 
 #elif  defined(CN) || defined(CT) || defined(RN) || defined(RT)
 
-       #define XSFADD_R1   xsaddsp
-       #define XSFADD_R2   xsaddsp
-       #define XSFADD_I1   xssubsp
-       #define XSFADD_I2   xsaddsp
+       #define XSFADD_R1   xsadddp
+       #define XSFADD_R2   xsadddp
+       #define XSFADD_I1   xssubdp
+       #define XSFADD_I2   xsadddp
+       #define XVFADD_R1   xvaddsp
+       #define XVFADD_R2   xvaddsp
+       #define XVFADD_I1   xvsubsp
+       #define XVFADD_I2   xvaddsp
 
 #elif  defined(NC) || defined(TC) || defined(NR) || defined(TR)
 
-       #define XSFADD_R1   xsaddsp
-       #define XSFADD_R2   xsaddsp
-       #define XSFADD_I1   xsaddsp
-       #define XSFADD_I2   xssubsp
+       #define XSFADD_R1   xsadddp
+       #define XSFADD_R2   xsadddp
+       #define XSFADD_I1   xsadddp
+       #define XSFADD_I2   xssubdp
+       #define XVFADD_R1   xvaddsp
+       #define XVFADD_R2   xvaddsp
+       #define XVFADD_I1   xvaddsp
+       #define XVFADD_I2   xvsubsp
 
 #else             // CC || CR || RC || RR
 
-       #define XSFADD_R1   xsaddsp
-       #define XSFADD_R2   xssubsp
-       #define XSFADD_I1   xssubsp
-       #define XSFADD_I2   xssubsp
+       #define XSFADD_R1   xsadddp
+       #define XSFADD_R2   xssubdp
+       #define XSFADD_I1   xssubdp
+       #define XSFADD_I2   xssubdp
+       #define XVFADD_R1   xvaddsp
+       #define XVFADD_R2   xvsubsp
+       #define XVFADD_I1   xvsubsp
+       #define XVFADD_I2   xvsubsp
 
 #endif
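
For reference, the sign choices in the four groups above correspond to the four conjugation classes of the complex product accumulated in the save macros (the first letter of a case name appears to refer to A and the second to B, with C/R denoting the conjugated forms). With a = a_r + i*a_i and b = b_r + i*b_i:

\[
\begin{aligned}
\text{NN/NT/TN/TT } (a\,b):\quad & \operatorname{Re} = a_r b_r - a_i b_i, & \operatorname{Im} &= a_r b_i + a_i b_r,\\
\text{CN/CT/RN/RT } (\bar a\,b):\quad & \operatorname{Re} = a_r b_r + a_i b_i, & \operatorname{Im} &= a_r b_i - a_i b_r,\\
\text{NC/TC/NR/TR } (a\,\bar b):\quad & \operatorname{Re} = a_r b_r + a_i b_i, & \operatorname{Im} &= a_i b_r - a_r b_i,\\
\text{CC/CR/RC/RR } (\bar a\,\bar b):\quad & \operatorname{Re} = a_r b_r - a_i b_i, & \operatorname{Im} &= -(a_r b_i + a_i b_r).
\end{aligned}
\]

R1 and R2 accumulate the a_r*b_r and a_i*b_i terms of the real part, I1 and I2 the a_i*b_r and a_r*b_i terms of the imaginary part; the xs* (scalar, double-precision) and xv* (vector, single-precision) variants apply the same signs.
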
 
@@ -172,24 +188,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 .macro KERNEL4x8_1
 
-       xvmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
-       xvmaddasp       vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
 
-       lxvw4x          vs24,   o0,     BO              //  load b0, b1
        lxvw4x          vs4,    o0,     AO              // load a0, a1
 
-       xvmaddasp       vs34,   vs1,    vs8             // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r
-       xvmaddasp       vs35,   vs1,    vs9             // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i
-
-       lxvw4x          vs25,   o16,    BO              //  load b2, b3
        lxvw4x          vs5,    o16,    AO              // load a2, a3
 
-       xvmaddasp       vs36,   vs2,    vs8             // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r
-       xvmaddasp       vs37,   vs2,    vs9             // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i
-
        lxvw4x          vs6,    o32,    AO              // load a4, a5
+
        lxvw4x          vs7,    o48,    AO              // load a6, a7
 
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs16,   vs24,   0
+       xxspltw         vs17,   vs24,   1
+       xxspltw         vs18,   vs24,   2
+       xxspltw         vs19,   vs24,   3
+
+       lxvw4x          vs25,   o16,    BO              //  load b2, b3
+
+       xxspltw         vs20,   vs25,   0
+       xxspltw         vs21,   vs25,   1
+       xxspltw         vs22,   vs25,   2
+       xxspltw         vs23,   vs25,   3
+
+
+       addi            BO,     BO,     32
+
+
+       xvmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs0,    vs9             // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmaddasp       vs34,   vs1,    vs8             // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs35,   vs1,    vs9             // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmaddasp       vs36,   vs2,    vs8             // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs37,   vs2,    vs9             // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i
        xvmaddasp       vs38,   vs3,    vs8             // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r
        xvmaddasp       vs39,   vs3,    vs9             // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i
 
@@ -211,47 +245,56 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvmaddasp       vs54,   vs3,    vs12            // a3_r*b2_r, a3_i*b2_r, a1_r*b2_r, a1_i*b2_r
        xvmaddasp       vs55,   vs3,    vs13            // a3_r*b2_i, a3_i*b2_i, a1_r*b2_i, a1_i*b2_i
 
-       xxspltw         vs16,   vs24,   0
-       xxspltw         vs17,   vs24,   1
-       xxspltw         vs18,   vs24,   2
-       xxspltw         vs19,   vs24,   3
-
        xvmaddasp       vs56,   vs0,    vs14            // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
        xvmaddasp       vs57,   vs0,    vs15            // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
        xvmaddasp       vs58,   vs1,    vs14            // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r
        xvmaddasp       vs59,   vs1,    vs15            // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i
-       addi            BO,     BO,     32
        xvmaddasp       vs60,   vs2,    vs14            // a2_r*b3_r, a2_i*b3_r, a1_r*b3_r, a1_i*b3_r
        xvmaddasp       vs61,   vs2,    vs15            // a2_r*b3_i, a2_i*b3_i, a1_r*b3_i, a1_i*b3_i
-       addi            AO,     AO,     64
        xvmaddasp       vs62,   vs3,    vs14            // a3_r*b3_r, a3_i*b3_r, a1_r*b3_r, a1_i*b3_r
        xvmaddasp       vs63,   vs3,    vs15            // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i
 
-       xxspltw         vs20,   vs25,   0
-       xxspltw         vs21,   vs25,   1
-       xxspltw         vs22,   vs25,   2
-       xxspltw         vs23,   vs25,   3
 
 .endm
 
 .macro KERNEL4x8_2
 
-       xvmaddasp       vs32,   vs4,    vs16            // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r
-       xvmaddasp       vs33,   vs4,    vs17            // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i
 
-       lxvw4x          vs24,   o0,     BO              //  load b0, b1
        lxvw4x          vs0,    o0,     AO              // load a0, a1
 
-       xvmaddasp       vs34,   vs5,    vs16            // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r
-       xvmaddasp       vs35,   vs5,    vs17            // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       lxvw4x          vs1,    o16,    AO              // load a2, a3
+
+       lxvw4x          vs2,    o32,    AO              // load a4, a5
+
+       lxvw4x          vs3,    o48,    AO              // load a6, a7
+
+
+       addi            AO,     AO,     64
+
+       lxvw4x          vs24,   o0,     BO              //  load b0, b1
+
+       xxspltw         vs8,    vs24,   0
+       xxspltw         vs9,    vs24,   1
+       xxspltw         vs10,   vs24,   2
+       xxspltw         vs11,   vs24,   3
 
        lxvw4x          vs25,   o16,    BO              //  load b2, b3
-       lxvw4x          vs1,    o16,    AO              // load a2, a3
 
+       xxspltw         vs12,   vs25,   0
+       xxspltw         vs13,   vs25,   1
+       xxspltw         vs14,   vs25,   2
+       xxspltw         vs15,   vs25,   3
+
+
+       addi            BO,     BO,     32
+
+
+       xvmaddasp       vs32,   vs4,    vs16            // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs33,   vs4,    vs17            // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i
+       xvmaddasp       vs34,   vs5,    vs16            // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r
+       xvmaddasp       vs35,   vs5,    vs17            // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i
        xvmaddasp       vs36,   vs6,    vs16            // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r
        xvmaddasp       vs37,   vs6,    vs17            // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i
-       lxvw4x          vs2,    o32,    AO              // load a4, a5
-       lxvw4x          vs3,    o48,    AO              // load a6, a7
        xvmaddasp       vs38,   vs7,    vs16            // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r
        xvmaddasp       vs39,   vs7,    vs17            // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i
 
@@ -273,26 +316,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xvmaddasp       vs54,   vs7,    vs20            // a7_r*b2_r, a7_i*b2_r, a1_r*b2_r, a1_i*b2_r
        xvmaddasp       vs55,   vs7,    vs21            // a7_r*b2_i, a7_i*b2_i, a1_r*b2_i, a1_i*b2_i
 
-       xxspltw         vs8,    vs24,   0
-       xxspltw         vs9,    vs24,   1
-       xxspltw         vs10,   vs24,   2
-       xxspltw         vs11,   vs24,   3
-
        xvmaddasp       vs56,   vs4,    vs22            // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r
        xvmaddasp       vs57,   vs4,    vs23            // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i
        xvmaddasp       vs58,   vs5,    vs22            // a5_r*b3_r, a5_i*b3_r, a1_r*b3_r, a1_i*b3_r
        xvmaddasp       vs59,   vs5,    vs23            // a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i
-       addi            AO,     AO,     64
        xvmaddasp       vs60,   vs6,    vs22            // a6_r*b3_r, a6_i*b3_r, a1_r*b3_r, a1_i*b3_r
        xvmaddasp       vs61,   vs6,    vs23            // a6_r*b3_i, a6_i*b3_i, a1_r*b3_i, a1_i*b3_i
-       addi            BO,     BO,     32
        xvmaddasp       vs62,   vs7,    vs22            // a7_r*b3_r, a7_i*b3_r, a1_r*b3_r, a1_i*b3_r
        xvmaddasp       vs63,   vs7,    vs23            // a7_r*b3_i, a7_i*b3_i, a1_r*b3_i, a1_i*b3_i
 
-       xxspltw         vs12,   vs25,   0
-       xxspltw         vs13,   vs25,   1
-       xxspltw         vs14,   vs25,   2
-       xxspltw         vs15,   vs25,   3
 
 .endm
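
KERNEL4x8_1 and KERNEL4x8_2 form a two-phase software pipeline: each phase loads the A panel and splats the B values for the next phase into the alternate register set (vs4-vs7/vs16-vs23 versus vs0-vs3/vs8-vs15), then issues the xvmaddasp FMAs on the registers filled by the previous phase. A rough scalar C sketch of the pattern, illustrative only and not the actual kernel:

    /* Two-phase pipelined accumulation: load the "next" operands, then
     * multiply-accumulate with the "current" ones, alternating sets.
     * a and b each hold 2*steps elements, steps >= 1. */
    static float dot_pipelined(const float *a, const float *b, int steps)
    {
        float acc_even = 0.0f, acc_odd = 0.0f;
        float a_cur = a[0], b_cur = b[0];                     /* LOAD4x8_1: prime set 0 */
        for (int l = 0; l < steps; l++) {
            float a_nxt = a[2 * l + 1], b_nxt = b[2 * l + 1]; /* _1: load set 1         */
            acc_even += a_cur * b_cur;                        /*     FMA on set 0       */
            if (l + 1 < steps) {                              /* last pair: no reload   */
                a_cur = a[2 * l + 2];                         /* _2: load set 0         */
                b_cur = b[2 * l + 2];
            }
            acc_odd += a_nxt * b_nxt;                         /*     FMA on set 1       */
        }
        return acc_even + acc_odd;                            /* combined during SAVE   */
    }

This is why the loops in cgemm_logic_8x4_power8.S always pair KERNEL4x8_1 with KERNEL4x8_2 and close with KERNEL4x8_1 followed by KERNEL4x8_E2, which presumably drains the last prefetched set without loading further.
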
 
@@ -501,51 +533,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xxlxor          vs0,    vs0,    vs0
 #endif
 
-       stxvw4x         vs32,   o0,     TBUFFER
 
-       lxsspx          vs8,    o0,     TBUFFER
-       lxsspx          vs9,    o4,     TBUFFER
-       lxsspx          vs10,   o8,     TBUFFER
-       lxsspx          vs11,   o12,    TBUFFER
+       xxspltw         vs8,    vs32,   0
+       xxspltw         vs9,    vs32,   1
+       xxspltw         vs10,   vs32,   2
+       xxspltw         vs11,   vs32,   3
 
-       stxvw4x         vs33,   o0,     TBUFFER
 
-       lxsspx          vs12,   o0,     TBUFFER
-       lxsspx          vs13,   o4,     TBUFFER
-       lxsspx          vs14,   o8,     TBUFFER
-       lxsspx          vs15,   o12,    TBUFFER
+       xxspltw         vs12,   vs33,   0
+       xxspltw         vs13,   vs33,   1
+       xxspltw         vs14,   vs33,   2
+       xxspltw         vs15,   vs33,   3
 
-       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
-       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
-       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
-       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
 
-       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
-       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
-       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
-       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
 
-       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
-       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
-       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
-       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
 
-       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
-       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
 
-       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
-       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
-       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
-       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
 
-       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
-       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
-       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
-       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
-       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
-       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
        xvaddsp         vs0,    vs0,    vs1
 
 
@@ -566,51 +599,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xxlxor          vs0,    vs0,    vs0
 #endif
 
-       stxvw4x         vs34,   o0,     TBUFFER
 
-       lxsspx          vs8,    o0,     TBUFFER
-       lxsspx          vs9,    o4,     TBUFFER
-       lxsspx          vs10,   o8,     TBUFFER
-       lxsspx          vs11,   o12,    TBUFFER
+       xxspltw         vs8,    vs34,   0
+       xxspltw         vs9,    vs34,   1
+       xxspltw         vs10,   vs34,   2
+       xxspltw         vs11,   vs34,   3
 
-       stxvw4x         vs35,   o0,     TBUFFER
 
-       lxsspx          vs12,   o0,     TBUFFER
-       lxsspx          vs13,   o4,     TBUFFER
-       lxsspx          vs14,   o8,     TBUFFER
-       lxsspx          vs15,   o12,    TBUFFER
+       xxspltw         vs12,   vs35,   0
+       xxspltw         vs13,   vs35,   1
+       xxspltw         vs14,   vs35,   2
+       xxspltw         vs15,   vs35,   3
 
-       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
-       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
-       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
-       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
 
-       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
-       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
-       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
-       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
 
-       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
-       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
-       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
-       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
 
-       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
-       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
 
-       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
-       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
-       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
-       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
 
-       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
-       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
-       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
-       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
-       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
-       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
        xvaddsp         vs0,    vs0,    vs1
 
 
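For reference: in both the old scalar epilogue (xsmulsp/xssubsp/xsaddsp on single values) and the new vector epilogue (xvmulsp/xvsubsp/xvaddsp on four lanes at once), each accumulated result is scaled by the complex alpha as out_r = r_r*alpha_r - r_i*alpha_i and out_i = r_r*alpha_i + r_i*alpha_r, exactly as the inline comments state. A rough C sketch of that per-element computation, with illustrative names only (not part of the patch):

    #include <complex.h>

    /* Scale one accumulated complex result (r_r, r_i) by alpha = alpha_r + i*alpha_i.
     * Mirrors the xvmulsp/xvsubsp/xvaddsp comments above, one lane at a time. */
    static inline float _Complex scale_by_alpha(float r_r, float r_i,
                                                float alpha_r, float alpha_i)
    {
        float out_r = r_r * alpha_r - r_i * alpha_i;   /* r_r*alpha_r - r_i*alpha_i */
        float out_i = r_r * alpha_i + r_i * alpha_r;   /* r_r*alpha_i + r_i*alpha_r */
        return out_r + out_i * I;
    }
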
@@ -631,51 +665,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xxlxor          vs0,    vs0,    vs0
 #endif
 
-       stxvw4x         vs36,   o0,     TBUFFER
 
-       lxsspx          vs8,    o0,     TBUFFER
-       lxsspx          vs9,    o4,     TBUFFER
-       lxsspx          vs10,   o8,     TBUFFER
-       lxsspx          vs11,   o12,    TBUFFER
+       xxspltw         vs8,    vs36,   0
+       xxspltw         vs9,    vs36,   1
+       xxspltw         vs10,   vs36,   2
+       xxspltw         vs11,   vs36,   3
 
-       stxvw4x         vs37,   o0,     TBUFFER
 
-       lxsspx          vs12,   o0,     TBUFFER
-       lxsspx          vs13,   o4,     TBUFFER
-       lxsspx          vs14,   o8,     TBUFFER
-       lxsspx          vs15,   o12,    TBUFFER
+       xxspltw         vs12,   vs37,   0
+       xxspltw         vs13,   vs37,   1
+       xxspltw         vs14,   vs37,   2
+       xxspltw         vs15,   vs37,   3
 
-       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
-       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
-       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
-       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
 
-       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
-       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
-       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
-       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
 
-       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
-       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
-       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
-       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
 
-       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
-       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
 
-       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
-       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
-       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
-       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
 
-       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
-       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
-       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
-       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
-       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
-       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
        xvaddsp         vs0,    vs0,    vs1
 
 
@@ -696,51 +731,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xxlxor          vs0,    vs0,    vs0
 #endif
 
-       stxvw4x         vs38,   o0,     TBUFFER
 
-       lxsspx          vs8,    o0,     TBUFFER
-       lxsspx          vs9,    o4,     TBUFFER
-       lxsspx          vs10,   o8,     TBUFFER
-       lxsspx          vs11,   o12,    TBUFFER
+       xxspltw         vs8,    vs38,   0
+       xxspltw         vs9,    vs38,   1
+       xxspltw         vs10,   vs38,   2
+       xxspltw         vs11,   vs38,   3
 
-       stxvw4x         vs39,   o0,     TBUFFER
 
-       lxsspx          vs12,   o0,     TBUFFER
-       lxsspx          vs13,   o4,     TBUFFER
-       lxsspx          vs14,   o8,     TBUFFER
-       lxsspx          vs15,   o12,    TBUFFER
+       xxspltw         vs12,   vs39,   0
+       xxspltw         vs13,   vs39,   1
+       xxspltw         vs14,   vs39,   2
+       xxspltw         vs15,   vs39,   3
 
-       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
-       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
-       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
-       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
 
-       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
-       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
-       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
-       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
 
-       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
-       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
-       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
-       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
 
-       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
-       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
 
-       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
-       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
-       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
-       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
 
-       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
-       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
-       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
-       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
-       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
-       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
        xvaddsp         vs0,    vs0,    vs1
 
 
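The replacement repeated in each of these hunks also drops the memory round trip through the old scratch buffer: instead of storing four scalar results and reloading them with lxvw4x, the new code shifts each result into its own word with xxsldwi against a zeroed register and sums the vectors with xvaddsp. A rough scalar model of that merge, with illustrative names only and lane order taken from the comments in the patch:

    /* Model of the xxsldwi/xvaddsp merge: place r0_r, r0_i, r1_r, r1_i into
     * separate words of zeroed 4-word vectors and add them, instead of
     * bouncing the values through a scratch buffer in memory. */
    static void merge_lanes(float out[4], float r0_r, float r0_i,
                            float r1_r, float r1_i)
    {
        float v[4][4] = {{0.0f}};             /* four zeroed 4-word vectors */
        v[0][0] = r0_r;                       /* word 0 <- r0_r */
        v[1][1] = r0_i;                       /* word 1 <- r0_i */
        v[2][2] = r1_r;                       /* word 2 <- r1_r */
        v[3][3] = r1_i;                       /* word 3 <- r1_i */
        for (int w = 0; w < 4; w++)           /* the xvaddsp chain */
            out[w] = v[0][w] + v[1][w] + v[2][w] + v[3][w];
    }
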
@@ -767,51 +803,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xxlxor          vs0,    vs0,    vs0
 #endif
 
-       stxvw4x         vs40,   o0,     TBUFFER
 
-       lxsspx          vs8,    o0,     TBUFFER
-       lxsspx          vs9,    o4,     TBUFFER
-       lxsspx          vs10,   o8,     TBUFFER
-       lxsspx          vs11,   o12,    TBUFFER
+       xxspltw         vs8,    vs40,   0
+       xxspltw         vs9,    vs40,   1
+       xxspltw         vs10,   vs40,   2
+       xxspltw         vs11,   vs40,   3
 
-       stxvw4x         vs41,   o0,     TBUFFER
 
-       lxsspx          vs12,   o0,     TBUFFER
-       lxsspx          vs13,   o4,     TBUFFER
-       lxsspx          vs14,   o8,     TBUFFER
-       lxsspx          vs15,   o12,    TBUFFER
+       xxspltw         vs12,   vs41,   0
+       xxspltw         vs13,   vs41,   1
+       xxspltw         vs14,   vs41,   2
+       xxspltw         vs15,   vs41,   3
 
-       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
-       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
-       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
-       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
 
-       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
-       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
-       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
-       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
 
-       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
-       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
-       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
-       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
 
-       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
-       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
 
-       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
-       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
-       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
-       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
 
-       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
-       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
-       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
-       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
-       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
-       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
        xvaddsp         vs0,    vs0,    vs1
 
 
@@ -832,51 +869,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xxlxor          vs0,    vs0,    vs0
 #endif
 
-       stxvw4x         vs42,   o0,     TBUFFER
 
-       lxsspx          vs8,    o0,     TBUFFER
-       lxsspx          vs9,    o4,     TBUFFER
-       lxsspx          vs10,   o8,     TBUFFER
-       lxsspx          vs11,   o12,    TBUFFER
+       xxspltw         vs8,    vs42,   0
+       xxspltw         vs9,    vs42,   1
+       xxspltw         vs10,   vs42,   2
+       xxspltw         vs11,   vs42,   3
 
-       stxvw4x         vs43,   o0,     TBUFFER
 
-       lxsspx          vs12,   o0,     TBUFFER
-       lxsspx          vs13,   o4,     TBUFFER
-       lxsspx          vs14,   o8,     TBUFFER
-       lxsspx          vs15,   o12,    TBUFFER
+       xxspltw         vs12,   vs43,   0
+       xxspltw         vs13,   vs43,   1
+       xxspltw         vs14,   vs43,   2
+       xxspltw         vs15,   vs43,   3
 
-       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
-       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
-       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
-       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
 
-       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
-       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
-       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
-       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
 
-       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
-       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
-       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
-       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
 
-       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
-       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
 
-       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
-       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
-       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
-       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
 
-       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
-       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
-       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
-       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
-       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
-       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
        xvaddsp         vs0,    vs0,    vs1
 
 
@@ -897,51 +935,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xxlxor          vs0,    vs0,    vs0
 #endif
 
-       stxvw4x         vs44,   o0,     TBUFFER
 
-       lxsspx          vs8,    o0,     TBUFFER
-       lxsspx          vs9,    o4,     TBUFFER
-       lxsspx          vs10,   o8,     TBUFFER
-       lxsspx          vs11,   o12,    TBUFFER
+       xxspltw         vs8,    vs44,   0
+       xxspltw         vs9,    vs44,   1
+       xxspltw         vs10,   vs44,   2
+       xxspltw         vs11,   vs44,   3
 
-       stxvw4x         vs45,   o0,     TBUFFER
 
-       lxsspx          vs12,   o0,     TBUFFER
-       lxsspx          vs13,   o4,     TBUFFER
-       lxsspx          vs14,   o8,     TBUFFER
-       lxsspx          vs15,   o12,    TBUFFER
+       xxspltw         vs12,   vs45,   0
+       xxspltw         vs13,   vs45,   1
+       xxspltw         vs14,   vs45,   2
+       xxspltw         vs15,   vs45,   3
 
-       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
-       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
-       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
-       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
 
-       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
-       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
-       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
-       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
 
-       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
-       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
-       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
-       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
 
-       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
-       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
 
-       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
-       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
-       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
-       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
 
-       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
-       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
-       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
-       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
-       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
-       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
        xvaddsp         vs0,    vs0,    vs1
 
 
@@ -962,51 +1001,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xxlxor          vs0,    vs0,    vs0
 #endif
 
-       stxvw4x         vs46,   o0,     TBUFFER
 
-       lxsspx          vs8,    o0,     TBUFFER
-       lxsspx          vs9,    o4,     TBUFFER
-       lxsspx          vs10,   o8,     TBUFFER
-       lxsspx          vs11,   o12,    TBUFFER
+       xxspltw         vs8,    vs46,   0
+       xxspltw         vs9,    vs46,   1
+       xxspltw         vs10,   vs46,   2
+       xxspltw         vs11,   vs46,   3
 
-       stxvw4x         vs47,   o0,     TBUFFER
 
-       lxsspx          vs12,   o0,     TBUFFER
-       lxsspx          vs13,   o4,     TBUFFER
-       lxsspx          vs14,   o8,     TBUFFER
-       lxsspx          vs15,   o12,    TBUFFER
+       xxspltw         vs12,   vs47,   0
+       xxspltw         vs13,   vs47,   1
+       xxspltw         vs14,   vs47,   2
+       xxspltw         vs15,   vs47,   3
 
-       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
-       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
-       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
-       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
 
-       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
-       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
-       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
-       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
 
-       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
-       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
-       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
-       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
 
-       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
-       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
 
-       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
-       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
-       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
-       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
 
-       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
-       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
-       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
-       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
-       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
-       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
        xvaddsp         vs0,    vs0,    vs1
 
 
@@ -1033,51 +1073,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xxlxor          vs0,    vs0,    vs0
 #endif
 
-       stxvw4x         vs48,   o0,     TBUFFER
 
-       lxsspx          vs8,    o0,     TBUFFER
-       lxsspx          vs9,    o4,     TBUFFER
-       lxsspx          vs10,   o8,     TBUFFER
-       lxsspx          vs11,   o12,    TBUFFER
+       xxspltw         vs8,    vs48,   0
+       xxspltw         vs9,    vs48,   1
+       xxspltw         vs10,   vs48,   2
+       xxspltw         vs11,   vs48,   3
 
-       stxvw4x         vs49,   o0,     TBUFFER
 
-       lxsspx          vs12,   o0,     TBUFFER
-       lxsspx          vs13,   o4,     TBUFFER
-       lxsspx          vs14,   o8,     TBUFFER
-       lxsspx          vs15,   o12,    TBUFFER
+       xxspltw         vs12,   vs49,   0
+       xxspltw         vs13,   vs49,   1
+       xxspltw         vs14,   vs49,   2
+       xxspltw         vs15,   vs49,   3
 
-       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
-       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
-       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
-       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
 
-       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
-       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
-       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
-       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
 
-       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
-       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
-       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
-       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
 
-       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
-       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
 
-       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
-       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
-       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
-       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
 
-       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
-       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
-       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
-       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
-       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
-       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
        xvaddsp         vs0,    vs0,    vs1
 
 
@@ -1098,51 +1139,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xxlxor          vs0,    vs0,    vs0
 #endif
 
-       stxvw4x         vs50,   o0,     TBUFFER
 
-       lxsspx          vs8,    o0,     TBUFFER
-       lxsspx          vs9,    o4,     TBUFFER
-       lxsspx          vs10,   o8,     TBUFFER
-       lxsspx          vs11,   o12,    TBUFFER
+       xxspltw         vs8,    vs50,   0
+       xxspltw         vs9,    vs50,   1
+       xxspltw         vs10,   vs50,   2
+       xxspltw         vs11,   vs50,   3
 
-       stxvw4x         vs51,   o0,     TBUFFER
 
-       lxsspx          vs12,   o0,     TBUFFER
-       lxsspx          vs13,   o4,     TBUFFER
-       lxsspx          vs14,   o8,     TBUFFER
-       lxsspx          vs15,   o12,    TBUFFER
+       xxspltw         vs12,   vs51,   0
+       xxspltw         vs13,   vs51,   1
+       xxspltw         vs14,   vs51,   2
+       xxspltw         vs15,   vs51,   3
 
-       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
-       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
-       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
-       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
 
-       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
-       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
-       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
-       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
 
-       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
-       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
-       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
-       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
 
-       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
-       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
 
-       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
-       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
-       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
-       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
 
-       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
-       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
-       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
-       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
-       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
-       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
        xvaddsp         vs0,    vs0,    vs1
 
 
@@ -1163,51 +1205,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xxlxor          vs0,    vs0,    vs0
 #endif
 
-       stxvw4x         vs52,   o0,     TBUFFER
 
-       lxsspx          vs8,    o0,     TBUFFER
-       lxsspx          vs9,    o4,     TBUFFER
-       lxsspx          vs10,   o8,     TBUFFER
-       lxsspx          vs11,   o12,    TBUFFER
+       xxspltw         vs8,    vs52,   0
+       xxspltw         vs9,    vs52,   1
+       xxspltw         vs10,   vs52,   2
+       xxspltw         vs11,   vs52,   3
 
-       stxvw4x         vs53,   o0,     TBUFFER
 
-       lxsspx          vs12,   o0,     TBUFFER
-       lxsspx          vs13,   o4,     TBUFFER
-       lxsspx          vs14,   o8,     TBUFFER
-       lxsspx          vs15,   o12,    TBUFFER
+       xxspltw         vs12,   vs53,   0
+       xxspltw         vs13,   vs53,   1
+       xxspltw         vs14,   vs53,   2
+       xxspltw         vs15,   vs53,   3
 
-       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
-       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
-       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
-       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
 
-       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
-       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
-       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
-       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
 
-       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
-       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
-       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
-       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
 
-       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
-       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
 
-       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
-       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
-       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
-       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
 
-       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
-       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
-       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
-       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
-       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
-       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
        xvaddsp         vs0,    vs0,    vs1
 
 
@@ -1228,51 +1271,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xxlxor          vs0,    vs0,    vs0
 #endif
 
-       stxvw4x         vs54,   o0,     TBUFFER
 
-       lxsspx          vs8,    o0,     TBUFFER
-       lxsspx          vs9,    o4,     TBUFFER
-       lxsspx          vs10,   o8,     TBUFFER
-       lxsspx          vs11,   o12,    TBUFFER
+       xxspltw         vs8,    vs54,   0
+       xxspltw         vs9,    vs54,   1
+       xxspltw         vs10,   vs54,   2
+       xxspltw         vs11,   vs54,   3
 
-       stxvw4x         vs55,   o0,     TBUFFER
 
-       lxsspx          vs12,   o0,     TBUFFER
-       lxsspx          vs13,   o4,     TBUFFER
-       lxsspx          vs14,   o8,     TBUFFER
-       lxsspx          vs15,   o12,    TBUFFER
+       xxspltw         vs12,   vs55,   0
+       xxspltw         vs13,   vs55,   1
+       xxspltw         vs14,   vs55,   2
+       xxspltw         vs15,   vs55,   3
 
-       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
-       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
-       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
-       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
 
-       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
-       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
-       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
-       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
 
-       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
-       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
-       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
-       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
 
-       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
-       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
 
-       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
-       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
-       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
-       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
 
-       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
-       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
-       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
-       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
-       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
-       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
        xvaddsp         vs0,    vs0,    vs1
 
 
@@ -1299,51 +1343,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xxlxor          vs0,    vs0,    vs0
 #endif
 
-       stxvw4x         vs56,   o0,     TBUFFER
 
-       lxsspx          vs8,    o0,     TBUFFER
-       lxsspx          vs9,    o4,     TBUFFER
-       lxsspx          vs10,   o8,     TBUFFER
-       lxsspx          vs11,   o12,    TBUFFER
+       xxspltw         vs8,    vs56,   0
+       xxspltw         vs9,    vs56,   1
+       xxspltw         vs10,   vs56,   2
+       xxspltw         vs11,   vs56,   3
 
-       stxvw4x         vs57,   o0,     TBUFFER
 
-       lxsspx          vs12,   o0,     TBUFFER
-       lxsspx          vs13,   o4,     TBUFFER
-       lxsspx          vs14,   o8,     TBUFFER
-       lxsspx          vs15,   o12,    TBUFFER
+       xxspltw         vs12,   vs57,   0
+       xxspltw         vs13,   vs57,   1
+       xxspltw         vs14,   vs57,   2
+       xxspltw         vs15,   vs57,   3
 
-       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
-       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
-       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
-       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
 
-       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
-       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
-       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
-       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
 
-       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
-       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
-       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
-       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
 
-       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
-       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
 
-       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
-       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
-       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
-       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
 
-       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
-       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
-       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
-       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
-       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
-       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
        xvaddsp         vs0,    vs0,    vs1
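
Note on the pattern above (editorial, not part of the commit): the old path spilled each accumulator pair to the TBUFFER scratch area with stxvw4x, reloaded the four words as scalars with lxsspx, and applied the alpha scaling with the scalar xsmulsp/xssubsp/xsaddsp forms; the new path stays in registers, splatting each word with xxspltw and using the vector xvmulsp/xvsubsp/xvaddsp forms instead. Per the comments in the block, the arithmetic itself is unchanged: the accumulated value r = (r_r, r_i) is scaled by alpha = (alpha_r, alpha_i) and added to the C tile. A minimal C model of that per-element math follows; the function name and signature are hypothetical, not kernel code.

	/* Minimal C model of the complex alpha scaling done above; the
	 * helper and its name are illustrative only, not from the kernel. */
	#include <complex.h>

	static inline float complex
	scale_and_accumulate(float complex c, float complex r,
	                     float alpha_r, float alpha_i)
	{
	        float r_r = crealf(r), r_i = cimagf(r);

	        /* xvsubsp: r_r*alpha_r - r_i*alpha_i   (real part)
	         * xvaddsp: r_r*alpha_i + r_i*alpha_r   (imaginary part) */
	        float out_r = r_r * alpha_r - r_i * alpha_i;
	        float out_i = r_r * alpha_i + r_i * alpha_r;

	        return c + (out_r + out_i * I);   /* final xvaddsp vs0,vs0,vs1 */
	}

Dropping the TBUFFER round trip removes the per-tile store/reload traffic, which appears to be the point of this part of the change.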
 
 
@@ -1364,51 +1409,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xxlxor          vs0,    vs0,    vs0
 #endif
 
-       stxvw4x         vs58,   o0,     TBUFFER
 
-       lxsspx          vs8,    o0,     TBUFFER
-       lxsspx          vs9,    o4,     TBUFFER
-       lxsspx          vs10,   o8,     TBUFFER
-       lxsspx          vs11,   o12,    TBUFFER
+       xxspltw         vs8,    vs58,   0
+       xxspltw         vs9,    vs58,   1
+       xxspltw         vs10,   vs58,   2
+       xxspltw         vs11,   vs58,   3
 
-       stxvw4x         vs59,   o0,     TBUFFER
 
-       lxsspx          vs12,   o0,     TBUFFER
-       lxsspx          vs13,   o4,     TBUFFER
-       lxsspx          vs14,   o8,     TBUFFER
-       lxsspx          vs15,   o12,    TBUFFER
+       xxspltw         vs12,   vs59,   0
+       xxspltw         vs13,   vs59,   1
+       xxspltw         vs14,   vs59,   2
+       xxspltw         vs15,   vs59,   3
 
-       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
-       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
-       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
-       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
 
-       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
-       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
-       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
-       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
 
-       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
-       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
-       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
-       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
 
-       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
-       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
 
-       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
-       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
-       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
-       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
 
-       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
-       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
-       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
-       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
-       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
-       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
        xvaddsp         vs0,    vs0,    vs1
 
 
@@ -1429,51 +1475,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xxlxor          vs0,    vs0,    vs0
 #endif
 
-       stxvw4x         vs60,   o0,     TBUFFER
 
-       lxsspx          vs8,    o0,     TBUFFER
-       lxsspx          vs9,    o4,     TBUFFER
-       lxsspx          vs10,   o8,     TBUFFER
-       lxsspx          vs11,   o12,    TBUFFER
+       xxspltw         vs8,    vs60,   0
+       xxspltw         vs9,    vs60,   1
+       xxspltw         vs10,   vs60,   2
+       xxspltw         vs11,   vs60,   3
 
-       stxvw4x         vs61,   o0,     TBUFFER
 
-       lxsspx          vs12,   o0,     TBUFFER
-       lxsspx          vs13,   o4,     TBUFFER
-       lxsspx          vs14,   o8,     TBUFFER
-       lxsspx          vs15,   o12,    TBUFFER
+       xxspltw         vs12,   vs61,   0
+       xxspltw         vs13,   vs61,   1
+       xxspltw         vs14,   vs61,   2
+       xxspltw         vs15,   vs61,   3
 
-       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
-       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
-       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
-       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
 
-       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
-       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
-       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
-       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
 
-       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
-       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
-       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
-       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
 
-       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
-       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
 
-       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
-       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
-       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
-       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
 
-       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
-       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
-       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
-       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
-       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
-       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
        xvaddsp         vs0,    vs0,    vs1
 
 
@@ -1494,51 +1541,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xxlxor          vs0,    vs0,    vs0
 #endif
 
-       stxvw4x         vs62,   o0,     TBUFFER
 
-       lxsspx          vs8,    o0,     TBUFFER
-       lxsspx          vs9,    o4,     TBUFFER
-       lxsspx          vs10,   o8,     TBUFFER
-       lxsspx          vs11,   o12,    TBUFFER
+       xxspltw         vs8,    vs62,   0
+       xxspltw         vs9,    vs62,   1
+       xxspltw         vs10,   vs62,   2
+       xxspltw         vs11,   vs62,   3
 
-       stxvw4x         vs63,   o0,     TBUFFER
 
-       lxsspx          vs12,   o0,     TBUFFER
-       lxsspx          vs13,   o4,     TBUFFER
-       lxsspx          vs14,   o8,     TBUFFER
-       lxsspx          vs15,   o12,    TBUFFER
+       xxspltw         vs12,   vs63,   0
+       xxspltw         vs13,   vs63,   1
+       xxspltw         vs14,   vs63,   2
+       xxspltw         vs15,   vs63,   3
 
-       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
-       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
-       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
-       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
 
-       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
-       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
-       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
-       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
 
-       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
-       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
-       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
-       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
 
-       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
-       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
 
-       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
-       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
-       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
-       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
 
-       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
-       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
-       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
-       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
-       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
-       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
        xvaddsp         vs0,    vs0,    vs1
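
The final stxsspx/lxvw4x round trip through TBUFFER is likewise replaced by an in-register repack: vs24 is zeroed with xxlxor, each scaled word is shifted into its own lane with xxsldwi, and the xvaddsp chain merges them into a single vector holding r0_r, r0_i, r1_r, r1_i before it is added to vs0. A small C sketch of that merge, following the lane assignments given in the comments (the function and names are illustrative, not from the kernel):

	/* Hypothetical C sketch of the in-register repack that replaces the
	 * old store/reload through TBUFFER. */
	static void pack_lanes(float out[4],
	                       float r0_r, float r0_i, float r1_r, float r1_i)
	{
	        float lane0[4] = { r0_r, 0, 0, 0 };   /* xxsldwi vs20,vs20,vs24,3 */
	        float lane1[4] = { 0, r0_i, 0, 0 };   /* xxsldwi vs21,vs21,vs24,2 */
	        float lane2[4] = { 0, 0, r1_r, 0 };   /* xxsldwi vs22,vs22,vs24,1 */
	        float lane3[4] = { 0, 0, 0, r1_i };   /* xxsldwi vs23,vs23,vs24,0 */

	        for (int k = 0; k < 4; k++)           /* xvaddsp merge chain */
	                out[k] = lane0[k] + lane1[k] + lane2[k] + lane3[k];
	}

Per the comments, each shifted register carries its meaningful value in exactly one lane, so the adds effectively act as a lane-wise select; the same sequence is repeated for every accumulator pair in the hunks that follow.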
 
 
@@ -1886,51 +1934,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xxlxor          vs0,    vs0,    vs0
 #endif
 
-       stxvw4x         vs32,   o0,     TBUFFER
 
-       lxsspx          vs8,    o0,     TBUFFER
-       lxsspx          vs9,    o4,     TBUFFER
-       lxsspx          vs10,   o8,     TBUFFER
-       lxsspx          vs11,   o12,    TBUFFER
+       xxspltw         vs8,    vs32,   0
+       xxspltw         vs9,    vs32,   1
+       xxspltw         vs10,   vs32,   2
+       xxspltw         vs11,   vs32,   3
 
-       stxvw4x         vs33,   o0,     TBUFFER
 
-       lxsspx          vs12,   o0,     TBUFFER
-       lxsspx          vs13,   o4,     TBUFFER
-       lxsspx          vs14,   o8,     TBUFFER
-       lxsspx          vs15,   o12,    TBUFFER
+       xxspltw         vs12,   vs33,   0
+       xxspltw         vs13,   vs33,   1
+       xxspltw         vs14,   vs33,   2
+       xxspltw         vs15,   vs33,   3
 
-       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
-       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
-       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
-       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
 
-       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
-       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
-       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
-       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
 
-       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
-       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
-       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
-       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
 
-       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
-       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
 
-       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
-       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
-       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
-       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
 
-       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
-       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
-       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
-       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
-       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
-       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
        xvaddsp         vs0,    vs0,    vs1
 
 
@@ -1951,51 +2000,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xxlxor          vs0,    vs0,    vs0
 #endif
 
-       stxvw4x         vs34,   o0,     TBUFFER
 
-       lxsspx          vs8,    o0,     TBUFFER
-       lxsspx          vs9,    o4,     TBUFFER
-       lxsspx          vs10,   o8,     TBUFFER
-       lxsspx          vs11,   o12,    TBUFFER
+       xxspltw         vs8,    vs34,   0
+       xxspltw         vs9,    vs34,   1
+       xxspltw         vs10,   vs34,   2
+       xxspltw         vs11,   vs34,   3
 
-       stxvw4x         vs35,   o0,     TBUFFER
 
-       lxsspx          vs12,   o0,     TBUFFER
-       lxsspx          vs13,   o4,     TBUFFER
-       lxsspx          vs14,   o8,     TBUFFER
-       lxsspx          vs15,   o12,    TBUFFER
+       xxspltw         vs12,   vs35,   0
+       xxspltw         vs13,   vs35,   1
+       xxspltw         vs14,   vs35,   2
+       xxspltw         vs15,   vs35,   3
 
-       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
-       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
-       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
-       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
 
-       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
-       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
-       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
-       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
 
-       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
-       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
-       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
-       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
 
-       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
-       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
 
-       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
-       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
-       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
-       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
 
-       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
-       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
-       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
-       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
-       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
-       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
        xvaddsp         vs0,    vs0,    vs1
 
 
@@ -2022,51 +2072,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xxlxor          vs0,    vs0,    vs0
 #endif
 
-       stxvw4x         vs36,   o0,     TBUFFER
 
-       lxsspx          vs8,    o0,     TBUFFER
-       lxsspx          vs9,    o4,     TBUFFER
-       lxsspx          vs10,   o8,     TBUFFER
-       lxsspx          vs11,   o12,    TBUFFER
+       xxspltw         vs8,    vs36,   0
+       xxspltw         vs9,    vs36,   1
+       xxspltw         vs10,   vs36,   2
+       xxspltw         vs11,   vs36,   3
 
-       stxvw4x         vs37,   o0,     TBUFFER
 
-       lxsspx          vs12,   o0,     TBUFFER
-       lxsspx          vs13,   o4,     TBUFFER
-       lxsspx          vs14,   o8,     TBUFFER
-       lxsspx          vs15,   o12,    TBUFFER
+       xxspltw         vs12,   vs37,   0
+       xxspltw         vs13,   vs37,   1
+       xxspltw         vs14,   vs37,   2
+       xxspltw         vs15,   vs37,   3
 
-       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
-       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
-       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
-       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
 
-       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
-       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
-       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
-       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
 
-       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
-       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
-       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
-       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
 
-       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
-       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
 
-       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
-       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
-       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
-       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
 
-       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
-       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
-       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
-       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
-       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
-       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
        xvaddsp         vs0,    vs0,    vs1
 
 
@@ -2087,51 +2138,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xxlxor          vs0,    vs0,    vs0
 #endif
 
-       stxvw4x         vs38,   o0,     TBUFFER
 
-       lxsspx          vs8,    o0,     TBUFFER
-       lxsspx          vs9,    o4,     TBUFFER
-       lxsspx          vs10,   o8,     TBUFFER
-       lxsspx          vs11,   o12,    TBUFFER
+       xxspltw         vs8,    vs38,   0
+       xxspltw         vs9,    vs38,   1
+       xxspltw         vs10,   vs38,   2
+       xxspltw         vs11,   vs38,   3
 
-       stxvw4x         vs39,   o0,     TBUFFER
 
-       lxsspx          vs12,   o0,     TBUFFER
-       lxsspx          vs13,   o4,     TBUFFER
-       lxsspx          vs14,   o8,     TBUFFER
-       lxsspx          vs15,   o12,    TBUFFER
+       xxspltw         vs12,   vs39,   0
+       xxspltw         vs13,   vs39,   1
+       xxspltw         vs14,   vs39,   2
+       xxspltw         vs15,   vs39,   3
 
-       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
-       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
-       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
-       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
 
-       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
-       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
-       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
-       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
 
-       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
-       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
-       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
-       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
 
-       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
-       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
 
-       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
-       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
-       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
-       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
 
-       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
-       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
-       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
-       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
-       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
-       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
        xvaddsp         vs0,    vs0,    vs1
 
 
@@ -2158,51 +2210,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xxlxor          vs0,    vs0,    vs0
 #endif
 
-       stxvw4x         vs40,   o0,     TBUFFER
 
-       lxsspx          vs8,    o0,     TBUFFER
-       lxsspx          vs9,    o4,     TBUFFER
-       lxsspx          vs10,   o8,     TBUFFER
-       lxsspx          vs11,   o12,    TBUFFER
+       xxspltw         vs8,    vs40,   0
+       xxspltw         vs9,    vs40,   1
+       xxspltw         vs10,   vs40,   2
+       xxspltw         vs11,   vs40,   3
 
-       stxvw4x         vs41,   o0,     TBUFFER
 
-       lxsspx          vs12,   o0,     TBUFFER
-       lxsspx          vs13,   o4,     TBUFFER
-       lxsspx          vs14,   o8,     TBUFFER
-       lxsspx          vs15,   o12,    TBUFFER
+       xxspltw         vs12,   vs41,   0
+       xxspltw         vs13,   vs41,   1
+       xxspltw         vs14,   vs41,   2
+       xxspltw         vs15,   vs41,   3
 
-       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
-       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
-       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
-       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
 
-       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
-       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
-       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
-       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
 
-       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
-       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
-       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
-       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
 
-       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
-       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
 
-       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
-       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
-       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
-       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
 
-       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
-       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
-       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
-       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
-       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
-       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
        xvaddsp         vs0,    vs0,    vs1
 
 
@@ -2223,51 +2276,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xxlxor          vs0,    vs0,    vs0
 #endif
 
-       stxvw4x         vs42,   o0,     TBUFFER
 
-       lxsspx          vs8,    o0,     TBUFFER
-       lxsspx          vs9,    o4,     TBUFFER
-       lxsspx          vs10,   o8,     TBUFFER
-       lxsspx          vs11,   o12,    TBUFFER
+       xxspltw         vs8,    vs42,   0
+       xxspltw         vs9,    vs42,   1
+       xxspltw         vs10,   vs42,   2
+       xxspltw         vs11,   vs42,   3
 
-       stxvw4x         vs43,   o0,     TBUFFER
 
-       lxsspx          vs12,   o0,     TBUFFER
-       lxsspx          vs13,   o4,     TBUFFER
-       lxsspx          vs14,   o8,     TBUFFER
-       lxsspx          vs15,   o12,    TBUFFER
+       xxspltw         vs12,   vs43,   0
+       xxspltw         vs13,   vs43,   1
+       xxspltw         vs14,   vs43,   2
+       xxspltw         vs15,   vs43,   3
 
-       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
-       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
-       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
-       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
 
-       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
-       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
-       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
-       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
 
-       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
-       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
-       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
-       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
 
-       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
-       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
 
-       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
-       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
-       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
-       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
 
-       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
-       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
-       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
-       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
-       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
-       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
        xvaddsp         vs0,    vs0,    vs1
 
 
@@ -2294,51 +2348,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xxlxor          vs0,    vs0,    vs0
 #endif
 
-       stxvw4x         vs44,   o0,     TBUFFER
 
-       lxsspx          vs8,    o0,     TBUFFER
-       lxsspx          vs9,    o4,     TBUFFER
-       lxsspx          vs10,   o8,     TBUFFER
-       lxsspx          vs11,   o12,    TBUFFER
+       xxspltw         vs8,    vs44,   0
+       xxspltw         vs9,    vs44,   1
+       xxspltw         vs10,   vs44,   2
+       xxspltw         vs11,   vs44,   3
 
-       stxvw4x         vs45,   o0,     TBUFFER
 
-       lxsspx          vs12,   o0,     TBUFFER
-       lxsspx          vs13,   o4,     TBUFFER
-       lxsspx          vs14,   o8,     TBUFFER
-       lxsspx          vs15,   o12,    TBUFFER
+       xxspltw         vs12,   vs45,   0
+       xxspltw         vs13,   vs45,   1
+       xxspltw         vs14,   vs45,   2
+       xxspltw         vs15,   vs45,   3
 
-       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
-       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
-       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
-       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
 
-       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
-       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
-       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
-       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
 
-       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
-       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
-       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
-       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
 
-       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
-       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
 
-       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
-       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
-       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
-       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
 
-       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
-       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
-       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
-       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
-       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
-       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
        xvaddsp         vs0,    vs0,    vs1
 
 
@@ -2359,51 +2414,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xxlxor          vs0,    vs0,    vs0
 #endif
 
-       stxvw4x         vs46,   o0,     TBUFFER
 
-       lxsspx          vs8,    o0,     TBUFFER
-       lxsspx          vs9,    o4,     TBUFFER
-       lxsspx          vs10,   o8,     TBUFFER
-       lxsspx          vs11,   o12,    TBUFFER
+       xxspltw         vs8,    vs46,   0
+       xxspltw         vs9,    vs46,   1
+       xxspltw         vs10,   vs46,   2
+       xxspltw         vs11,   vs46,   3
 
-       stxvw4x         vs47,   o0,     TBUFFER
 
-       lxsspx          vs12,   o0,     TBUFFER
-       lxsspx          vs13,   o4,     TBUFFER
-       lxsspx          vs14,   o8,     TBUFFER
-       lxsspx          vs15,   o12,    TBUFFER
+       xxspltw         vs12,   vs47,   0
+       xxspltw         vs13,   vs47,   1
+       xxspltw         vs14,   vs47,   2
+       xxspltw         vs15,   vs47,   3
 
-       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
-       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
-       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
-       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
 
-       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
-       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
-       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
-       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
 
-       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
-       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
-       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
-       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
 
-       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
-       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
 
-       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
-       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
-       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
-       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
 
-       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
-       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
-       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
-       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
-       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
-       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
        xvaddsp         vs0,    vs0,    vs1
 
 
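Note on the save path above: the partial results are no longer bounced through the old TBUFFER scratch area. xxspltw broadcasts each accumulator lane into a full vector, the complex alpha is applied 4-wide with xvmulsp/xvsubsp/xvaddsp, and xxsldwi shifts the four scalars into their target word lanes so two xvaddsp merges rebuild the packed (r0_r, r0_i, r1_r, r1_i) vector entirely in registers. The arithmetic itself is just complex scaling of the accumulated products by alpha before adding into C. A minimal C sketch of that computation (illustrative only; the function and variable names are invented, and the CONJ/NR/NC variants selected by the preprocessor guards are ignored):

        #include <stdio.h>

        /* Scale one accumulated complex product r = (r_r, r_i) by
         * alpha = (alpha_r, alpha_i) and add it to the C entry.
         * This is the scalar-wise equivalent of the xvmulsp /
         * xvsubsp / xvaddsp sequence above. */
        static void scale_and_accumulate(float *c_r, float *c_i,
                                         float r_r, float r_i,
                                         float alpha_r, float alpha_i)
        {
            *c_r += r_r * alpha_r - r_i * alpha_i;
            *c_i += r_r * alpha_i + r_i * alpha_r;
        }

        int main(void)
        {
            float c_r = 1.0f, c_i = 2.0f;            /* existing C value */
            scale_and_accumulate(&c_r, &c_i,
                                 3.0f, 4.0f,         /* accumulated r0   */
                                 0.5f, -1.0f);       /* alpha            */
            printf("%f %f\n", c_r, c_i);             /* 6.500000 1.000000 */
            return 0;
        }

The same pattern repeats for vs34/vs35, vs36/vs37 and vs38/vs39 in the hunks that follow.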
@@ -2691,51 +2747,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xxlxor          vs0,    vs0,    vs0
 #endif
 
-       stxvw4x         vs32,   o0,     TBUFFER
 
-       lxsspx          vs8,    o0,     TBUFFER
-       lxsspx          vs9,    o4,     TBUFFER
-       lxsspx          vs10,   o8,     TBUFFER
-       lxsspx          vs11,   o12,    TBUFFER
+       xxspltw         vs8,    vs32,   0
+       xxspltw         vs9,    vs32,   1
+       xxspltw         vs10,   vs32,   2
+       xxspltw         vs11,   vs32,   3
 
-       stxvw4x         vs33,   o0,     TBUFFER
 
-       lxsspx          vs12,   o0,     TBUFFER
-       lxsspx          vs13,   o4,     TBUFFER
-       lxsspx          vs14,   o8,     TBUFFER
-       lxsspx          vs15,   o12,    TBUFFER
+       xxspltw         vs12,   vs33,   0
+       xxspltw         vs13,   vs33,   1
+       xxspltw         vs14,   vs33,   2
+       xxspltw         vs15,   vs33,   3
 
-       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
-       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
-       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
-       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
 
-       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
-       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
-       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
-       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
 
-       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
-       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
-       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
-       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
 
-       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
-       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
 
-       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
-       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
-       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
-       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
 
-       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
-       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
-       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
-       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
-       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
-       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
        xvaddsp         vs0,    vs0,    vs1
 
 
@@ -2762,51 +2819,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xxlxor          vs0,    vs0,    vs0
 #endif
 
-       stxvw4x         vs34,   o0,     TBUFFER
 
-       lxsspx          vs8,    o0,     TBUFFER
-       lxsspx          vs9,    o4,     TBUFFER
-       lxsspx          vs10,   o8,     TBUFFER
-       lxsspx          vs11,   o12,    TBUFFER
+       xxspltw         vs8,    vs34,   0
+       xxspltw         vs9,    vs34,   1
+       xxspltw         vs10,   vs34,   2
+       xxspltw         vs11,   vs34,   3
 
-       stxvw4x         vs35,   o0,     TBUFFER
 
-       lxsspx          vs12,   o0,     TBUFFER
-       lxsspx          vs13,   o4,     TBUFFER
-       lxsspx          vs14,   o8,     TBUFFER
-       lxsspx          vs15,   o12,    TBUFFER
+       xxspltw         vs12,   vs35,   0
+       xxspltw         vs13,   vs35,   1
+       xxspltw         vs14,   vs35,   2
+       xxspltw         vs15,   vs35,   3
 
-       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
-       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
-       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
-       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
 
-       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
-       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
-       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
-       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
 
-       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
-       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
-       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
-       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
 
-       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
-       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
 
-       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
-       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
-       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
-       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
 
-       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
-       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
-       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
-       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
-       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
-       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
        xvaddsp         vs0,    vs0,    vs1
 
 
@@ -2833,51 +2891,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xxlxor          vs0,    vs0,    vs0
 #endif
 
-       stxvw4x         vs36,   o0,     TBUFFER
 
-       lxsspx          vs8,    o0,     TBUFFER
-       lxsspx          vs9,    o4,     TBUFFER
-       lxsspx          vs10,   o8,     TBUFFER
-       lxsspx          vs11,   o12,    TBUFFER
+       xxspltw         vs8,    vs36,   0
+       xxspltw         vs9,    vs36,   1
+       xxspltw         vs10,   vs36,   2
+       xxspltw         vs11,   vs36,   3
 
-       stxvw4x         vs37,   o0,     TBUFFER
 
-       lxsspx          vs12,   o0,     TBUFFER
-       lxsspx          vs13,   o4,     TBUFFER
-       lxsspx          vs14,   o8,     TBUFFER
-       lxsspx          vs15,   o12,    TBUFFER
+       xxspltw         vs12,   vs37,   0
+       xxspltw         vs13,   vs37,   1
+       xxspltw         vs14,   vs37,   2
+       xxspltw         vs15,   vs37,   3
 
-       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
-       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
-       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
-       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
 
-       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
-       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
-       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
-       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
 
-       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
-       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
-       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
-       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
 
-       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
-       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
 
-       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
-       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
-       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
-       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
 
-       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
-       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
-       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
-       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
-       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
-       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
        xvaddsp         vs0,    vs0,    vs1
 
 
@@ -2904,51 +2963,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xxlxor          vs0,    vs0,    vs0
 #endif
 
-       stxvw4x         vs38,   o0,     TBUFFER
 
-       lxsspx          vs8,    o0,     TBUFFER
-       lxsspx          vs9,    o4,     TBUFFER
-       lxsspx          vs10,   o8,     TBUFFER
-       lxsspx          vs11,   o12,    TBUFFER
+       xxspltw         vs8,    vs38,   0
+       xxspltw         vs9,    vs38,   1
+       xxspltw         vs10,   vs38,   2
+       xxspltw         vs11,   vs38,   3
 
-       stxvw4x         vs39,   o0,     TBUFFER
 
-       lxsspx          vs12,   o0,     TBUFFER
-       lxsspx          vs13,   o4,     TBUFFER
-       lxsspx          vs14,   o8,     TBUFFER
-       lxsspx          vs15,   o12,    TBUFFER
+       xxspltw         vs12,   vs39,   0
+       xxspltw         vs13,   vs39,   1
+       xxspltw         vs14,   vs39,   2
+       xxspltw         vs15,   vs39,   3
 
-       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
-       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
-       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
-       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
 
-       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
-       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
-       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
-       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
 
-       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
-       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
-       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
-       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
 
-       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
-       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
 
-       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
-       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
-       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
-       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
 
-       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
-       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
-       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
-       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
-       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
-       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
        xvaddsp         vs0,    vs0,    vs1
 
 
@@ -3028,25 +3088,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        addi            BO,     BO,     32
 
 
-       xsmulsp         vs32,   vs0,    vs8             // a0_r*b0_r
-       xsmulsp         vs33,   vs1,    vs9             // a0_i*b0_i
-       xsmulsp         vs34,   vs0,    vs9             // a0_r*b0_i
-       xsmulsp         vs35,   vs1,    vs8             // a0_i*b0_r
+       xsmuldp         vs32,   vs0,    vs8             // a0_r*b0_r
+       xsmuldp         vs33,   vs1,    vs9             // a0_i*b0_i
+       xsmuldp         vs34,   vs0,    vs9             // a0_r*b0_i
+       xsmuldp         vs35,   vs1,    vs8             // a0_i*b0_r
 
-       xsmulsp         vs36,   vs0,    vs10            // a0_r*b1_r
-       xsmulsp         vs37,   vs1,    vs11            // a0_i*b1_i
-       xsmulsp         vs38,   vs0,    vs11            // a0_r*b1_i
-       xsmulsp         vs39,   vs1,    vs10            // a0_i*b1_r
+       xsmuldp         vs36,   vs0,    vs10            // a0_r*b1_r
+       xsmuldp         vs37,   vs1,    vs11            // a0_i*b1_i
+       xsmuldp         vs38,   vs0,    vs11            // a0_r*b1_i
+       xsmuldp         vs39,   vs1,    vs10            // a0_i*b1_r
 
-       xsmulsp         vs40,   vs0,    vs12            // a0_r*b2_r
-       xsmulsp         vs41,   vs1,    vs13            // a0_i*b2_i
-       xsmulsp         vs42,   vs0,    vs13            // a0_r*b2_i
-       xsmulsp         vs43,   vs1,    vs12            // a0_i*b2_r
+       xsmuldp         vs40,   vs0,    vs12            // a0_r*b2_r
+       xsmuldp         vs41,   vs1,    vs13            // a0_i*b2_i
+       xsmuldp         vs42,   vs0,    vs13            // a0_r*b2_i
+       xsmuldp         vs43,   vs1,    vs12            // a0_i*b2_r
 
-       xsmulsp         vs44,   vs0,    vs14            // a0_r*b3_r
-       xsmulsp         vs45,   vs1,    vs15            // a0_i*b3_i
-       xsmulsp         vs46,   vs0,    vs15            // a0_r*b3_i
-       xsmulsp         vs47,   vs1,    vs14            // a0_i*b3_r
+       xsmuldp         vs44,   vs0,    vs14            // a0_r*b3_r
+       xsmuldp         vs45,   vs1,    vs15            // a0_i*b3_i
+       xsmuldp         vs46,   vs0,    vs15            // a0_r*b3_i
+       xsmuldp         vs47,   vs1,    vs14            // a0_i*b3_r
 
 
 .endm
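In the 4x1 micro-kernel the accumulation stays scalar, but the xs*sp forms are replaced by their double-precision counterparts (xsmuldp, xsmaddadp). Since lxsspx already converts the single-precision operands to double in the register, the partial products a_r*b_r, a_i*b_i, a_r*b_i and a_i*b_r are accumulated in double precision and only rounded back to single when the result is eventually stored with stxsspx. A rough C model of one such accumulator group, with hypothetical names that are not taken from the kernel:

        #include <stdio.h>

        /* One K-step of the 4x1 tail: one complex A element times one complex
         * B element, with the four partial products kept separate in double
         * precision, as vs32..vs35 (and vs36..vs47 for b1..b3) are above. */
        typedef struct { double rr, ii, ri, ir; } acc_t;

        static void kernel_step(acc_t *acc, float a_r, float a_i,
                                float b_r, float b_i)
        {
            acc->rr += (double)a_r * b_r;   /* cf. xsmaddadp vs32 */
            acc->ii += (double)a_i * b_i;   /* cf. xsmaddadp vs33 */
            acc->ri += (double)a_r * b_i;   /* cf. xsmaddadp vs34 */
            acc->ir += (double)a_i * b_r;   /* cf. xsmaddadp vs35 */
        }

        int main(void)
        {
            acc_t acc = {0};
            kernel_step(&acc, 1.0f, 2.0f, 3.0f, 4.0f);
            printf("%g %g %g %g\n", acc.rr, acc.ii,
                   acc.ri, acc.ir);                  /* 3 8 4 6 */
            return 0;
        }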
@@ -3082,25 +3142,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        addi            BO,     BO,     32
 
 
-       xsmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r
-       xsmaddasp       vs33,   vs1,    vs9             // a0_i*b0_i
-       xsmaddasp       vs34,   vs0,    vs9             // a0_r*b0_i
-       xsmaddasp       vs35,   vs1,    vs8             // a0_i*b0_r
+       xsmaddadp       vs32,   vs0,    vs8             // a0_r*b0_r
+       xsmaddadp       vs33,   vs1,    vs9             // a0_i*b0_i
+       xsmaddadp       vs34,   vs0,    vs9             // a0_r*b0_i
+       xsmaddadp       vs35,   vs1,    vs8             // a0_i*b0_r
 
-       xsmaddasp       vs36,   vs0,    vs10            // a0_r*b1_r
-       xsmaddasp       vs37,   vs1,    vs11            // a0_i*b1_i
-       xsmaddasp       vs38,   vs0,    vs11            // a0_r*b1_i
-       xsmaddasp       vs39,   vs1,    vs10            // a0_i*b1_r
+       xsmaddadp       vs36,   vs0,    vs10            // a0_r*b1_r
+       xsmaddadp       vs37,   vs1,    vs11            // a0_i*b1_i
+       xsmaddadp       vs38,   vs0,    vs11            // a0_r*b1_i
+       xsmaddadp       vs39,   vs1,    vs10            // a0_i*b1_r
 
-       xsmaddasp       vs40,   vs0,    vs12            // a0_r*b2_r
-       xsmaddasp       vs41,   vs1,    vs13            // a0_i*b2_i
-       xsmaddasp       vs42,   vs0,    vs13            // a0_r*b2_i
-       xsmaddasp       vs43,   vs1,    vs12            // a0_i*b2_r
+       xsmaddadp       vs40,   vs0,    vs12            // a0_r*b2_r
+       xsmaddadp       vs41,   vs1,    vs13            // a0_i*b2_i
+       xsmaddadp       vs42,   vs0,    vs13            // a0_r*b2_i
+       xsmaddadp       vs43,   vs1,    vs12            // a0_i*b2_r
 
-       xsmaddasp       vs44,   vs0,    vs14            // a0_r*b3_r
-       xsmaddasp       vs45,   vs1,    vs15            // a0_i*b3_i
-       xsmaddasp       vs46,   vs0,    vs15            // a0_r*b3_i
-       xsmaddasp       vs47,   vs1,    vs14            // a0_i*b3_r
+       xsmaddadp       vs44,   vs0,    vs14            // a0_r*b3_r
+       xsmaddadp       vs45,   vs1,    vs15            // a0_i*b3_i
+       xsmaddadp       vs46,   vs0,    vs15            // a0_r*b3_i
+       xsmaddadp       vs47,   vs1,    vs14            // a0_i*b3_r
 
 
 .endm
@@ -3136,25 +3196,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        addi            BO,     BO,     32
 
 
-       xsmaddasp       vs32,   vs4,    vs16            // a4_r*b0_r
-       xsmaddasp       vs33,   vs5,    vs17            // a4_i*b0_i
-       xsmaddasp       vs34,   vs4,    vs17            // a4_r*b0_i
-       xsmaddasp       vs35,   vs5,    vs16            // a4_i*b0_r
+       xsmaddadp       vs32,   vs4,    vs16            // a4_r*b0_r
+       xsmaddadp       vs33,   vs5,    vs17            // a4_i*b0_i
+       xsmaddadp       vs34,   vs4,    vs17            // a4_r*b0_i
+       xsmaddadp       vs35,   vs5,    vs16            // a4_i*b0_r
 
-       xsmaddasp       vs36,   vs4,    vs18            // a4_r*b1_r
-       xsmaddasp       vs37,   vs5,    vs19            // a4_i*b1_i
-       xsmaddasp       vs38,   vs4,    vs19            // a4_r*b1_i
-       xsmaddasp       vs39,   vs5,    vs18            // a4_i*b1_r
+       xsmaddadp       vs36,   vs4,    vs18            // a4_r*b1_r
+       xsmaddadp       vs37,   vs5,    vs19            // a4_i*b1_i
+       xsmaddadp       vs38,   vs4,    vs19            // a4_r*b1_i
+       xsmaddadp       vs39,   vs5,    vs18            // a4_i*b1_r
 
-       xsmaddasp       vs40,   vs4,    vs20            // a4_r*b2_r
-       xsmaddasp       vs41,   vs5,    vs21            // a4_i*b2_i
-       xsmaddasp       vs42,   vs4,    vs21            // a4_r*b2_i
-       xsmaddasp       vs43,   vs5,    vs20            // a4_i*b2_r
+       xsmaddadp       vs40,   vs4,    vs20            // a4_r*b2_r
+       xsmaddadp       vs41,   vs5,    vs21            // a4_i*b2_i
+       xsmaddadp       vs42,   vs4,    vs21            // a4_r*b2_i
+       xsmaddadp       vs43,   vs5,    vs20            // a4_i*b2_r
 
-       xsmaddasp       vs44,   vs4,    vs22            // a4_r*b3_r
-       xsmaddasp       vs45,   vs5,    vs23            // a4_i*b3_i
-       xsmaddasp       vs46,   vs4,    vs23            // a4_r*b3_i
-       xsmaddasp       vs47,   vs5,    vs22            // a4_i*b3_r
+       xsmaddadp       vs44,   vs4,    vs22            // a4_r*b3_r
+       xsmaddadp       vs45,   vs5,    vs23            // a4_i*b3_i
+       xsmaddadp       vs46,   vs4,    vs23            // a4_r*b3_i
+       xsmaddadp       vs47,   vs5,    vs22            // a4_i*b3_r
 
 
 .endm
@@ -3162,25 +3222,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro KERNEL4x1_E2
 
 
-       xsmaddasp       vs32,   vs4,    vs16            // a4_r*b0_r
-       xsmaddasp       vs33,   vs5,    vs17            // a4_i*b0_i
-       xsmaddasp       vs34,   vs4,    vs17            // a4_r*b0_i
-       xsmaddasp       vs35,   vs5,    vs16            // a4_i*b0_r
+       xsmaddadp       vs32,   vs4,    vs16            // a4_r*b0_r
+       xsmaddadp       vs33,   vs5,    vs17            // a4_i*b0_i
+       xsmaddadp       vs34,   vs4,    vs17            // a4_r*b0_i
+       xsmaddadp       vs35,   vs5,    vs16            // a4_i*b0_r
 
-       xsmaddasp       vs36,   vs4,    vs18            // a4_r*b1_r
-       xsmaddasp       vs37,   vs5,    vs19            // a4_i*b1_i
-       xsmaddasp       vs38,   vs4,    vs19            // a4_r*b1_i
-       xsmaddasp       vs39,   vs5,    vs18            // a4_i*b1_r
+       xsmaddadp       vs36,   vs4,    vs18            // a4_r*b1_r
+       xsmaddadp       vs37,   vs5,    vs19            // a4_i*b1_i
+       xsmaddadp       vs38,   vs4,    vs19            // a4_r*b1_i
+       xsmaddadp       vs39,   vs5,    vs18            // a4_i*b1_r
 
-       xsmaddasp       vs40,   vs4,    vs20            // a4_r*b2_r
-       xsmaddasp       vs41,   vs5,    vs21            // a4_i*b2_i
-       xsmaddasp       vs42,   vs4,    vs21            // a4_r*b2_i
-       xsmaddasp       vs43,   vs5,    vs20            // a4_i*b2_r
+       xsmaddadp       vs40,   vs4,    vs20            // a4_r*b2_r
+       xsmaddadp       vs41,   vs5,    vs21            // a4_i*b2_i
+       xsmaddadp       vs42,   vs4,    vs21            // a4_r*b2_i
+       xsmaddadp       vs43,   vs5,    vs20            // a4_i*b2_r
 
-       xsmaddasp       vs44,   vs4,    vs22            // a4_r*b3_r
-       xsmaddasp       vs45,   vs5,    vs23            // a4_i*b3_i
-       xsmaddasp       vs46,   vs4,    vs23            // a4_r*b3_i
-       xsmaddasp       vs47,   vs5,    vs22            // a4_i*b3_r
+       xsmaddadp       vs44,   vs4,    vs22            // a4_r*b3_r
+       xsmaddadp       vs45,   vs5,    vs23            // a4_i*b3_i
+       xsmaddadp       vs46,   vs4,    vs23            // a4_r*b3_i
+       xsmaddadp       vs47,   vs5,    vs22            // a4_i*b3_r
 
 
 .endm
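KERNEL4x1_E2 contains only the xsmaddadp updates on vs4/vs5 and vs16..vs23, with no loads and no pointer increments: it drains a two-stage software pipeline in which one stage consumes the operands the other stage prefetched. The pairing with init and ping/pong macros is an assumption based on the register sets and the _E2 suffix; the actual control flow lives in cgemm_logic_8x4_power8.S, which is not shown here. A schematic C outline of that loop shape, purely illustrative:

        #include <stdio.h>

        /* Two-stage pipelined accumulation: each iteration loads the operands
         * for the next step while the FMA consumes the previously loaded pair;
         * the final "E2" step only consumes, matching the macro above. */
        static double fma_step(double acc, double a, double b) { return acc + a * b; }

        int main(void)
        {
            const double a[6] = {1, 2, 3, 4, 5, 6};
            const double b[6] = {6, 5, 4, 3, 2, 1};
            double acc = 0.0;
            double cur_a = a[0], cur_b = b[0];          /* initial load ("I1")      */
            for (int k = 1; k < 6; k++) {
                double next_a = a[k], next_b = b[k];    /* load ahead ("_1"/"_2")   */
                acc = fma_step(acc, cur_a, cur_b);      /* ...while using prior load */
                cur_a = next_a; cur_b = next_b;
            }
            acc = fma_step(acc, cur_a, cur_b);          /* drain, no load ("E2")    */
            printf("%g\n", acc);                        /* 56 */
            return 0;
        }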
@@ -3216,25 +3276,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        addi            BO,     BO,     32
 
 
-       xsmulsp         vs32,   vs0,    vs8             // a0_r*b0_r
-       xsmulsp         vs33,   vs1,    vs9             // a0_i*b0_i
-       xsmulsp         vs34,   vs0,    vs9             // a0_r*b0_i
-       xsmulsp         vs35,   vs1,    vs8             // a0_i*b0_r
+       xsmuldp         vs32,   vs0,    vs8             // a0_r*b0_r
+       xsmuldp         vs33,   vs1,    vs9             // a0_i*b0_i
+       xsmuldp         vs34,   vs0,    vs9             // a0_r*b0_i
+       xsmuldp         vs35,   vs1,    vs8             // a0_i*b0_r
 
-       xsmulsp         vs36,   vs0,    vs10            // a0_r*b1_r
-       xsmulsp         vs37,   vs1,    vs11            // a0_i*b1_i
-       xsmulsp         vs38,   vs0,    vs11            // a0_r*b1_i
-       xsmulsp         vs39,   vs1,    vs10            // a0_i*b1_r
+       xsmuldp         vs36,   vs0,    vs10            // a0_r*b1_r
+       xsmuldp         vs37,   vs1,    vs11            // a0_i*b1_i
+       xsmuldp         vs38,   vs0,    vs11            // a0_r*b1_i
+       xsmuldp         vs39,   vs1,    vs10            // a0_i*b1_r
 
-       xsmulsp         vs40,   vs0,    vs12            // a0_r*b2_r
-       xsmulsp         vs41,   vs1,    vs13            // a0_i*b2_i
-       xsmulsp         vs42,   vs0,    vs13            // a0_r*b2_i
-       xsmulsp         vs43,   vs1,    vs12            // a0_i*b2_r
+       xsmuldp         vs40,   vs0,    vs12            // a0_r*b2_r
+       xsmuldp         vs41,   vs1,    vs13            // a0_i*b2_i
+       xsmuldp         vs42,   vs0,    vs13            // a0_r*b2_i
+       xsmuldp         vs43,   vs1,    vs12            // a0_i*b2_r
 
-       xsmulsp         vs44,   vs0,    vs14            // a0_r*b3_r
-       xsmulsp         vs45,   vs1,    vs15            // a0_i*b3_i
-       xsmulsp         vs46,   vs0,    vs15            // a0_r*b3_i
-       xsmulsp         vs47,   vs1,    vs14            // a0_i*b3_r
+       xsmuldp         vs44,   vs0,    vs14            // a0_r*b3_r
+       xsmuldp         vs45,   vs1,    vs15            // a0_i*b3_i
+       xsmuldp         vs46,   vs0,    vs15            // a0_r*b3_i
+       xsmuldp         vs47,   vs1,    vs14            // a0_i*b3_r
 
 
 .endm
@@ -3270,25 +3330,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        addi            BO,     BO,     32
 
 
-       xsmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r
-       xsmaddasp       vs33,   vs1,    vs9             // a0_i*b0_i
-       xsmaddasp       vs34,   vs0,    vs9             // a0_r*b0_i
-       xsmaddasp       vs35,   vs1,    vs8             // a0_i*b0_r
+       xsmaddadp       vs32,   vs0,    vs8             // a0_r*b0_r
+       xsmaddadp       vs33,   vs1,    vs9             // a0_i*b0_i
+       xsmaddadp       vs34,   vs0,    vs9             // a0_r*b0_i
+       xsmaddadp       vs35,   vs1,    vs8             // a0_i*b0_r
 
-       xsmaddasp       vs36,   vs0,    vs10            // a0_r*b1_r
-       xsmaddasp       vs37,   vs1,    vs11            // a0_i*b1_i
-       xsmaddasp       vs38,   vs0,    vs11            // a0_r*b1_i
-       xsmaddasp       vs39,   vs1,    vs10            // a0_i*b1_r
+       xsmaddadp       vs36,   vs0,    vs10            // a0_r*b1_r
+       xsmaddadp       vs37,   vs1,    vs11            // a0_i*b1_i
+       xsmaddadp       vs38,   vs0,    vs11            // a0_r*b1_i
+       xsmaddadp       vs39,   vs1,    vs10            // a0_i*b1_r
 
-       xsmaddasp       vs40,   vs0,    vs12            // a0_r*b2_r
-       xsmaddasp       vs41,   vs1,    vs13            // a0_i*b2_i
-       xsmaddasp       vs42,   vs0,    vs13            // a0_r*b2_i
-       xsmaddasp       vs43,   vs1,    vs12            // a0_i*b2_r
+       xsmaddadp       vs40,   vs0,    vs12            // a0_r*b2_r
+       xsmaddadp       vs41,   vs1,    vs13            // a0_i*b2_i
+       xsmaddadp       vs42,   vs0,    vs13            // a0_r*b2_i
+       xsmaddadp       vs43,   vs1,    vs12            // a0_i*b2_r
 
-       xsmaddasp       vs44,   vs0,    vs14            // a0_r*b3_r
-       xsmaddasp       vs45,   vs1,    vs15            // a0_i*b3_i
-       xsmaddasp       vs46,   vs0,    vs15            // a0_r*b3_i
-       xsmaddasp       vs47,   vs1,    vs14            // a0_i*b3_r
+       xsmaddadp       vs44,   vs0,    vs14            // a0_r*b3_r
+       xsmaddadp       vs45,   vs1,    vs15            // a0_i*b3_i
+       xsmaddadp       vs46,   vs0,    vs15            // a0_r*b3_i
+       xsmaddadp       vs47,   vs1,    vs14            // a0_i*b3_r
 
 
 .endm
@@ -3320,16 +3380,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        XSFADD_R2       vs4,    vs4,    vs33            // add a0_i * b0_i
        XSFADD_I2       vs5,    vs5,    vs34            // add a0_i * b0_r
 
-       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
-       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
-       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
-       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+       xsmuldp         vs16,   vs4,    alpha_dr                // r0_r * alpha_r
+       xsmuldp         vs17,   vs5,    alpha_di                // r0_i * alpha_i
+       xsmuldp         vs18,   vs4,    alpha_di                // r0_r * alpha_i
+       xsmuldp         vs19,   vs5,    alpha_dr                // r0_i * alpha_r
 
-       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
-       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+       xssubdp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsadddp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
 
-       xsaddsp         vs0,    vs0,    vs20
-       xsaddsp         vs1,    vs1,    vs21
+       xsadddp         vs0,    vs0,    vs20
+       xsadddp         vs1,    vs1,    vs21
 
 
        stxsspx         vs0,    o0,     T2      // store c0_r
@@ -3362,16 +3422,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        XSFADD_R2       vs4,    vs4,    vs37            // add a0_i * b0_i
        XSFADD_I2       vs5,    vs5,    vs38            // add a0_i * b0_r
 
-       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
-       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
-       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
-       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+       xsmuldp         vs16,   vs4,    alpha_dr                // r0_r * alpha_r
+       xsmuldp         vs17,   vs5,    alpha_di                // r0_i * alpha_i
+       xsmuldp         vs18,   vs4,    alpha_di                // r0_r * alpha_i
+       xsmuldp         vs19,   vs5,    alpha_dr                // r0_i * alpha_r
 
-       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
-       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+       xssubdp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsadddp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
 
-       xsaddsp         vs0,    vs0,    vs20
-       xsaddsp         vs1,    vs1,    vs21
+       xsadddp         vs0,    vs0,    vs20
+       xsadddp         vs1,    vs1,    vs21
 
 
        stxsspx         vs0,    o0,     T2      // store c0_r
@@ -3404,16 +3464,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        XSFADD_R2       vs4,    vs4,    vs41            // add a0_i * b0_i
        XSFADD_I2       vs5,    vs5,    vs42            // add a0_i * b0_r
 
-       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
-       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
-       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
-       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+       xsmuldp         vs16,   vs4,    alpha_dr                // r0_r * alpha_r
+       xsmuldp         vs17,   vs5,    alpha_di                // r0_i * alpha_i
+       xsmuldp         vs18,   vs4,    alpha_di                // r0_r * alpha_i
+       xsmuldp         vs19,   vs5,    alpha_dr                // r0_i * alpha_r
 
-       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
-       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+       xssubdp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsadddp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
 
-       xsaddsp         vs0,    vs0,    vs20
-       xsaddsp         vs1,    vs1,    vs21
+       xsadddp         vs0,    vs0,    vs20
+       xsadddp         vs1,    vs1,    vs21
 
 
        stxsspx         vs0,    o0,     T2      // store c0_r
@@ -3446,16 +3506,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        XSFADD_R2       vs4,    vs4,    vs45            // add a0_i * b0_i
        XSFADD_I2       vs5,    vs5,    vs46            // add a0_i * b0_r
 
-       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
-       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
-       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
-       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+       xsmuldp         vs16,   vs4,    alpha_dr                // r0_r * alpha_r
+       xsmuldp         vs17,   vs5,    alpha_di                // r0_i * alpha_i
+       xsmuldp         vs18,   vs4,    alpha_di                // r0_r * alpha_i
+       xsmuldp         vs19,   vs5,    alpha_dr                // r0_i * alpha_r
 
-       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
-       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+       xssubdp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsadddp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
 
-       xsaddsp         vs0,    vs0,    vs20
-       xsaddsp         vs1,    vs1,    vs21
+       xsadddp         vs0,    vs0,    vs20
+       xsadddp         vs1,    vs1,    vs21
 
 
        stxsspx         vs0,    o0,     T2      // store c0_r
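The per-element epilogues above now apply alpha with the double-precision scalar forms (xsmuldp, xssubdp, xsadddp) against the alpha_dr/alpha_di copies, so the whole alpha multiply-add chain is evaluated in double and rounded to single precision only by the final stxsspx store. A small C illustration of that evaluation order, with invented names; for many inputs both variants agree, but the double-precision path performs a single final rounding and can differ in the last bit:

        #include <stdio.h>

        /* c += alpha * r (real part) for one complex element, evaluated two
         * ways: entirely in float, and with double intermediates rounded once
         * at the store, as the xs*dp + stxsspx sequence above does. */
        int main(void)
        {
            float c_r = 1.0f, r_r = 3.0f, r_i = 4.0f;
            float alpha_r = 0.5f, alpha_i = -1.0f;

            float  narrow = c_r + (r_r * alpha_r - r_i * alpha_i);
            double wide   = (double)c_r +
                            ((double)r_r * alpha_r - (double)r_i * alpha_i);

            /* Both print 6.500000 here; intermediate rounding can make them
             * differ for less friendly inputs. */
            printf("%f %f\n", narrow, (float)wide);
            return 0;
        }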
@@ -3773,51 +3833,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xxlxor          vs0,    vs0,    vs0
 #endif
 
-       stxvw4x         vs32,   o0,     TBUFFER
 
-       lxsspx          vs8,    o0,     TBUFFER
-       lxsspx          vs9,    o4,     TBUFFER
-       lxsspx          vs10,   o8,     TBUFFER
-       lxsspx          vs11,   o12,    TBUFFER
+       xxspltw         vs8,    vs32,   0
+       xxspltw         vs9,    vs32,   1
+       xxspltw         vs10,   vs32,   2
+       xxspltw         vs11,   vs32,   3
 
-       stxvw4x         vs33,   o0,     TBUFFER
 
-       lxsspx          vs12,   o0,     TBUFFER
-       lxsspx          vs13,   o4,     TBUFFER
-       lxsspx          vs14,   o8,     TBUFFER
-       lxsspx          vs15,   o12,    TBUFFER
+       xxspltw         vs12,   vs33,   0
+       xxspltw         vs13,   vs33,   1
+       xxspltw         vs14,   vs33,   2
+       xxspltw         vs15,   vs33,   3
 
-       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
-       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
-       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
-       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
 
-       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
-       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
-       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
-       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
 
-       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
-       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
-       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
-       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
 
-       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
-       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
 
-       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
-       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
-       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
-       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
 
-       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
-       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
-       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
-       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
-       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
-       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
        xvaddsp         vs0,    vs0,    vs1
 
 
@@ -3838,51 +3899,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xxlxor          vs0,    vs0,    vs0
 #endif
 
-       stxvw4x         vs34,   o0,     TBUFFER
 
-       lxsspx          vs8,    o0,     TBUFFER
-       lxsspx          vs9,    o4,     TBUFFER
-       lxsspx          vs10,   o8,     TBUFFER
-       lxsspx          vs11,   o12,    TBUFFER
+       xxspltw         vs8,    vs34,   0
+       xxspltw         vs9,    vs34,   1
+       xxspltw         vs10,   vs34,   2
+       xxspltw         vs11,   vs34,   3
 
-       stxvw4x         vs35,   o0,     TBUFFER
 
-       lxsspx          vs12,   o0,     TBUFFER
-       lxsspx          vs13,   o4,     TBUFFER
-       lxsspx          vs14,   o8,     TBUFFER
-       lxsspx          vs15,   o12,    TBUFFER
+       xxspltw         vs12,   vs35,   0
+       xxspltw         vs13,   vs35,   1
+       xxspltw         vs14,   vs35,   2
+       xxspltw         vs15,   vs35,   3
 
-       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
-       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
-       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
-       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
 
-       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
-       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
-       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
-       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
 
-       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
-       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
-       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
-       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
 
-       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
-       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
 
-       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
-       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
-       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
-       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
 
-       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
-       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
-       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
-       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
-       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
-       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
        xvaddsp         vs0,    vs0,    vs1
 
 
@@ -3903,51 +3965,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xxlxor          vs0,    vs0,    vs0
 #endif
 
-       stxvw4x         vs36,   o0,     TBUFFER
 
-       lxsspx          vs8,    o0,     TBUFFER
-       lxsspx          vs9,    o4,     TBUFFER
-       lxsspx          vs10,   o8,     TBUFFER
-       lxsspx          vs11,   o12,    TBUFFER
+       xxspltw         vs8,    vs36,   0
+       xxspltw         vs9,    vs36,   1
+       xxspltw         vs10,   vs36,   2
+       xxspltw         vs11,   vs36,   3
 
-       stxvw4x         vs37,   o0,     TBUFFER
 
-       lxsspx          vs12,   o0,     TBUFFER
-       lxsspx          vs13,   o4,     TBUFFER
-       lxsspx          vs14,   o8,     TBUFFER
-       lxsspx          vs15,   o12,    TBUFFER
+       xxspltw         vs12,   vs37,   0
+       xxspltw         vs13,   vs37,   1
+       xxspltw         vs14,   vs37,   2
+       xxspltw         vs15,   vs37,   3
 
-       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
-       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
-       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
-       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
 
-       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
-       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
-       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
-       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
 
-       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
-       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
-       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
-       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
 
-       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
-       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
 
-       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
-       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
-       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
-       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
 
-       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
-       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
-       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
-       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
-       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
-       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
        xvaddsp         vs0,    vs0,    vs1
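
[Editor's note, hedged sketch] The epilogue blocks in these hunks (and their repeats below for the remaining vs32..vs47 accumulator pairs) drop the old TBUFFER store/reload round trip: the partial sums are now extracted with xxspltw word splats, scaled by the alpha_sr/alpha_si vectors, and packed back into one 4-float vector with xxsldwi shifts before the final xvaddsp into the C tile. A minimal C sketch of what one such block is intended to compute per pair of complex results, going by the inline comments; names and layout are illustrative, not taken from the kernel source:

    /* Illustrative only: scale two accumulated complex results r0, r1 by the
     * complex scalar alpha and pack them as r0_r, r0_i, r1_r, r1_i, matching
     * the inline comments of the block above. */
    static void scale_and_pack(const float r0[2], const float r1[2],
                               float alpha_r, float alpha_i, float out[4])
    {
        out[0] = r0[0] * alpha_r - r0[1] * alpha_i;   /* r0_r */
        out[1] = r0[0] * alpha_i + r0[1] * alpha_r;   /* r0_i */
        out[2] = r1[0] * alpha_r - r1[1] * alpha_i;   /* r1_r */
        out[3] = r1[0] * alpha_i + r1[1] * alpha_r;   /* r1_i */
        /* The packed vector is then accumulated into the C tile, as in
         * xvaddsp vs0, vs0, vs1 above (non-TRMM path; for TRMM vs0 is zero). */
    }
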
 
 
@@ -3968,51 +4031,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xxlxor          vs0,    vs0,    vs0
 #endif
 
-       stxvw4x         vs38,   o0,     TBUFFER
 
-       lxsspx          vs8,    o0,     TBUFFER
-       lxsspx          vs9,    o4,     TBUFFER
-       lxsspx          vs10,   o8,     TBUFFER
-       lxsspx          vs11,   o12,    TBUFFER
+       xxspltw         vs8,    vs38,   0
+       xxspltw         vs9,    vs38,   1
+       xxspltw         vs10,   vs38,   2
+       xxspltw         vs11,   vs38,   3
 
-       stxvw4x         vs39,   o0,     TBUFFER
 
-       lxsspx          vs12,   o0,     TBUFFER
-       lxsspx          vs13,   o4,     TBUFFER
-       lxsspx          vs14,   o8,     TBUFFER
-       lxsspx          vs15,   o12,    TBUFFER
+       xxspltw         vs12,   vs39,   0
+       xxspltw         vs13,   vs39,   1
+       xxspltw         vs14,   vs39,   2
+       xxspltw         vs15,   vs39,   3
 
-       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
-       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
-       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
-       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
 
-       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
-       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
-       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
-       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
 
-       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
-       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
-       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
-       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
 
-       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
-       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
 
-       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
-       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
-       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
-       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
 
-       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
-       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
-       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
-       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
-       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
-       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
        xvaddsp         vs0,    vs0,    vs1
 
 
@@ -4039,51 +4103,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xxlxor          vs0,    vs0,    vs0
 #endif
 
-       stxvw4x         vs40,   o0,     TBUFFER
 
-       lxsspx          vs8,    o0,     TBUFFER
-       lxsspx          vs9,    o4,     TBUFFER
-       lxsspx          vs10,   o8,     TBUFFER
-       lxsspx          vs11,   o12,    TBUFFER
+       xxspltw         vs8,    vs40,   0
+       xxspltw         vs9,    vs40,   1
+       xxspltw         vs10,   vs40,   2
+       xxspltw         vs11,   vs40,   3
 
-       stxvw4x         vs41,   o0,     TBUFFER
 
-       lxsspx          vs12,   o0,     TBUFFER
-       lxsspx          vs13,   o4,     TBUFFER
-       lxsspx          vs14,   o8,     TBUFFER
-       lxsspx          vs15,   o12,    TBUFFER
+       xxspltw         vs12,   vs41,   0
+       xxspltw         vs13,   vs41,   1
+       xxspltw         vs14,   vs41,   2
+       xxspltw         vs15,   vs41,   3
 
-       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
-       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
-       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
-       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
 
-       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
-       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
-       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
-       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
 
-       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
-       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
-       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
-       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
 
-       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
-       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
 
-       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
-       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
-       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
-       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
 
-       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
-       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
-       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
-       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
-       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
-       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
        xvaddsp         vs0,    vs0,    vs1
 
 
@@ -4104,51 +4169,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xxlxor          vs0,    vs0,    vs0
 #endif
 
-       stxvw4x         vs42,   o0,     TBUFFER
 
-       lxsspx          vs8,    o0,     TBUFFER
-       lxsspx          vs9,    o4,     TBUFFER
-       lxsspx          vs10,   o8,     TBUFFER
-       lxsspx          vs11,   o12,    TBUFFER
+       xxspltw         vs8,    vs42,   0
+       xxspltw         vs9,    vs42,   1
+       xxspltw         vs10,   vs42,   2
+       xxspltw         vs11,   vs42,   3
 
-       stxvw4x         vs43,   o0,     TBUFFER
 
-       lxsspx          vs12,   o0,     TBUFFER
-       lxsspx          vs13,   o4,     TBUFFER
-       lxsspx          vs14,   o8,     TBUFFER
-       lxsspx          vs15,   o12,    TBUFFER
+       xxspltw         vs12,   vs43,   0
+       xxspltw         vs13,   vs43,   1
+       xxspltw         vs14,   vs43,   2
+       xxspltw         vs15,   vs43,   3
 
-       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
-       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
-       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
-       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
 
-       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
-       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
-       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
-       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
 
-       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
-       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
-       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
-       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
 
-       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
-       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
 
-       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
-       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
-       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
-       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
 
-       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
-       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
-       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
-       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
-       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
-       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
        xvaddsp         vs0,    vs0,    vs1
 
 
@@ -4169,51 +4235,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xxlxor          vs0,    vs0,    vs0
 #endif
 
-       stxvw4x         vs44,   o0,     TBUFFER
 
-       lxsspx          vs8,    o0,     TBUFFER
-       lxsspx          vs9,    o4,     TBUFFER
-       lxsspx          vs10,   o8,     TBUFFER
-       lxsspx          vs11,   o12,    TBUFFER
+       xxspltw         vs8,    vs44,   0
+       xxspltw         vs9,    vs44,   1
+       xxspltw         vs10,   vs44,   2
+       xxspltw         vs11,   vs44,   3
 
-       stxvw4x         vs45,   o0,     TBUFFER
 
-       lxsspx          vs12,   o0,     TBUFFER
-       lxsspx          vs13,   o4,     TBUFFER
-       lxsspx          vs14,   o8,     TBUFFER
-       lxsspx          vs15,   o12,    TBUFFER
+       xxspltw         vs12,   vs45,   0
+       xxspltw         vs13,   vs45,   1
+       xxspltw         vs14,   vs45,   2
+       xxspltw         vs15,   vs45,   3
 
-       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
-       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
-       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
-       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
 
-       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
-       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
-       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
-       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
 
-       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
-       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
-       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
-       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
 
-       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
-       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
 
-       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
-       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
-       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
-       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
 
-       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
-       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
-       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
-       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
-       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
-       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
        xvaddsp         vs0,    vs0,    vs1
 
 
@@ -4234,51 +4301,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xxlxor          vs0,    vs0,    vs0
 #endif
 
-       stxvw4x         vs46,   o0,     TBUFFER
 
-       lxsspx          vs8,    o0,     TBUFFER
-       lxsspx          vs9,    o4,     TBUFFER
-       lxsspx          vs10,   o8,     TBUFFER
-       lxsspx          vs11,   o12,    TBUFFER
+       xxspltw         vs8,    vs46,   0
+       xxspltw         vs9,    vs46,   1
+       xxspltw         vs10,   vs46,   2
+       xxspltw         vs11,   vs46,   3
 
-       stxvw4x         vs47,   o0,     TBUFFER
 
-       lxsspx          vs12,   o0,     TBUFFER
-       lxsspx          vs13,   o4,     TBUFFER
-       lxsspx          vs14,   o8,     TBUFFER
-       lxsspx          vs15,   o12,    TBUFFER
+       xxspltw         vs12,   vs47,   0
+       xxspltw         vs13,   vs47,   1
+       xxspltw         vs14,   vs47,   2
+       xxspltw         vs15,   vs47,   3
 
-       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
-       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
-       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
-       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
 
-       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
-       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
-       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
-       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
 
-       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
-       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
-       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
-       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
 
-       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
-       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
 
-       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
-       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
-       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
-       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
 
-       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
-       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
-       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
-       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
-       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
-       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
        xvaddsp         vs0,    vs0,    vs1
 
 
@@ -4524,51 +4592,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xxlxor          vs0,    vs0,    vs0
 #endif
 
-       stxvw4x         vs32,   o0,     TBUFFER
 
-       lxsspx          vs8,    o0,     TBUFFER
-       lxsspx          vs9,    o4,     TBUFFER
-       lxsspx          vs10,   o8,     TBUFFER
-       lxsspx          vs11,   o12,    TBUFFER
+       xxspltw         vs8,    vs32,   0
+       xxspltw         vs9,    vs32,   1
+       xxspltw         vs10,   vs32,   2
+       xxspltw         vs11,   vs32,   3
 
-       stxvw4x         vs33,   o0,     TBUFFER
 
-       lxsspx          vs12,   o0,     TBUFFER
-       lxsspx          vs13,   o4,     TBUFFER
-       lxsspx          vs14,   o8,     TBUFFER
-       lxsspx          vs15,   o12,    TBUFFER
+       xxspltw         vs12,   vs33,   0
+       xxspltw         vs13,   vs33,   1
+       xxspltw         vs14,   vs33,   2
+       xxspltw         vs15,   vs33,   3
 
-       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
-       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
-       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
-       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
 
-       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
-       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
-       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
-       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
 
-       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
-       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
-       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
-       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
 
-       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
-       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
 
-       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
-       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
-       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
-       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
 
-       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
-       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
-       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
-       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
-       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
-       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
        xvaddsp         vs0,    vs0,    vs1
 
 
@@ -4589,51 +4658,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xxlxor          vs0,    vs0,    vs0
 #endif
 
-       stxvw4x         vs34,   o0,     TBUFFER
 
-       lxsspx          vs8,    o0,     TBUFFER
-       lxsspx          vs9,    o4,     TBUFFER
-       lxsspx          vs10,   o8,     TBUFFER
-       lxsspx          vs11,   o12,    TBUFFER
+       xxspltw         vs8,    vs34,   0
+       xxspltw         vs9,    vs34,   1
+       xxspltw         vs10,   vs34,   2
+       xxspltw         vs11,   vs34,   3
 
-       stxvw4x         vs35,   o0,     TBUFFER
 
-       lxsspx          vs12,   o0,     TBUFFER
-       lxsspx          vs13,   o4,     TBUFFER
-       lxsspx          vs14,   o8,     TBUFFER
-       lxsspx          vs15,   o12,    TBUFFER
+       xxspltw         vs12,   vs35,   0
+       xxspltw         vs13,   vs35,   1
+       xxspltw         vs14,   vs35,   2
+       xxspltw         vs15,   vs35,   3
 
-       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
-       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
-       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
-       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
 
-       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
-       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
-       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
-       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
 
-       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
-       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
-       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
-       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
 
-       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
-       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
 
-       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
-       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
-       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
-       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
 
-       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
-       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
-       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
-       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
-       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
-       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
        xvaddsp         vs0,    vs0,    vs1
 
 
@@ -4660,51 +4730,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xxlxor          vs0,    vs0,    vs0
 #endif
 
-       stxvw4x         vs36,   o0,     TBUFFER
 
-       lxsspx          vs8,    o0,     TBUFFER
-       lxsspx          vs9,    o4,     TBUFFER
-       lxsspx          vs10,   o8,     TBUFFER
-       lxsspx          vs11,   o12,    TBUFFER
+       xxspltw         vs8,    vs36,   0
+       xxspltw         vs9,    vs36,   1
+       xxspltw         vs10,   vs36,   2
+       xxspltw         vs11,   vs36,   3
 
-       stxvw4x         vs37,   o0,     TBUFFER
 
-       lxsspx          vs12,   o0,     TBUFFER
-       lxsspx          vs13,   o4,     TBUFFER
-       lxsspx          vs14,   o8,     TBUFFER
-       lxsspx          vs15,   o12,    TBUFFER
+       xxspltw         vs12,   vs37,   0
+       xxspltw         vs13,   vs37,   1
+       xxspltw         vs14,   vs37,   2
+       xxspltw         vs15,   vs37,   3
 
-       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
-       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
-       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
-       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
 
-       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
-       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
-       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
-       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
 
-       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
-       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
-       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
-       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
 
-       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
-       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
 
-       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
-       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
-       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
-       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
 
-       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
-       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
-       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
-       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
-       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
-       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
        xvaddsp         vs0,    vs0,    vs1
 
 
@@ -4725,51 +4796,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xxlxor          vs0,    vs0,    vs0
 #endif
 
-       stxvw4x         vs38,   o0,     TBUFFER
 
-       lxsspx          vs8,    o0,     TBUFFER
-       lxsspx          vs9,    o4,     TBUFFER
-       lxsspx          vs10,   o8,     TBUFFER
-       lxsspx          vs11,   o12,    TBUFFER
+       xxspltw         vs8,    vs38,   0
+       xxspltw         vs9,    vs38,   1
+       xxspltw         vs10,   vs38,   2
+       xxspltw         vs11,   vs38,   3
 
-       stxvw4x         vs39,   o0,     TBUFFER
 
-       lxsspx          vs12,   o0,     TBUFFER
-       lxsspx          vs13,   o4,     TBUFFER
-       lxsspx          vs14,   o8,     TBUFFER
-       lxsspx          vs15,   o12,    TBUFFER
+       xxspltw         vs12,   vs39,   0
+       xxspltw         vs13,   vs39,   1
+       xxspltw         vs14,   vs39,   2
+       xxspltw         vs15,   vs39,   3
 
-       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
-       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
-       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
-       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
 
-       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
-       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
-       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
-       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
 
-       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
-       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
-       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
-       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
 
-       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
-       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
 
-       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
-       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
-       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
-       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
 
-       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
-       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
-       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
-       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
-       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
-       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
        xvaddsp         vs0,    vs0,    vs1
 
 
@@ -4979,51 +5051,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xxlxor          vs0,    vs0,    vs0
 #endif
 
-       stxvw4x         vs32,   o0,     TBUFFER
 
-       lxsspx          vs8,    o0,     TBUFFER
-       lxsspx          vs9,    o4,     TBUFFER
-       lxsspx          vs10,   o8,     TBUFFER
-       lxsspx          vs11,   o12,    TBUFFER
+       xxspltw         vs8,    vs32,   0
+       xxspltw         vs9,    vs32,   1
+       xxspltw         vs10,   vs32,   2
+       xxspltw         vs11,   vs32,   3
 
-       stxvw4x         vs33,   o0,     TBUFFER
 
-       lxsspx          vs12,   o0,     TBUFFER
-       lxsspx          vs13,   o4,     TBUFFER
-       lxsspx          vs14,   o8,     TBUFFER
-       lxsspx          vs15,   o12,    TBUFFER
+       xxspltw         vs12,   vs33,   0
+       xxspltw         vs13,   vs33,   1
+       xxspltw         vs14,   vs33,   2
+       xxspltw         vs15,   vs33,   3
 
-       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
-       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
-       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
-       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
 
-       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
-       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
-       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
-       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
 
-       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
-       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
-       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
-       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
 
-       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
-       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
 
-       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
-       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
-       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
-       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
 
-       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
-       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
-       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
-       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
-       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
-       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
        xvaddsp         vs0,    vs0,    vs1
 
 
@@ -5050,51 +5123,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xxlxor          vs0,    vs0,    vs0
 #endif
 
-       stxvw4x         vs34,   o0,     TBUFFER
 
-       lxsspx          vs8,    o0,     TBUFFER
-       lxsspx          vs9,    o4,     TBUFFER
-       lxsspx          vs10,   o8,     TBUFFER
-       lxsspx          vs11,   o12,    TBUFFER
+       xxspltw         vs8,    vs34,   0
+       xxspltw         vs9,    vs34,   1
+       xxspltw         vs10,   vs34,   2
+       xxspltw         vs11,   vs34,   3
 
-       stxvw4x         vs35,   o0,     TBUFFER
 
-       lxsspx          vs12,   o0,     TBUFFER
-       lxsspx          vs13,   o4,     TBUFFER
-       lxsspx          vs14,   o8,     TBUFFER
-       lxsspx          vs15,   o12,    TBUFFER
+       xxspltw         vs12,   vs35,   0
+       xxspltw         vs13,   vs35,   1
+       xxspltw         vs14,   vs35,   2
+       xxspltw         vs15,   vs35,   3
 
-       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
-       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
-       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
-       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
 
-       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
-       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
-       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
-       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
 
-       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
-       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
-       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
-       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
 
-       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
-       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
 
-       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
-       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
-       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
-       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
 
-       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
-       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
-       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
-       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
-       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
-       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
        xvaddsp         vs0,    vs0,    vs1
 
 
@@ -5154,15 +5228,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        addi            BO,     BO,     16
 
 
-       xsmulsp         vs32,   vs0,    vs8             // a0_r*b0_r
-       xsmulsp         vs33,   vs1,    vs9             // a0_i*b0_i
-       xsmulsp         vs34,   vs0,    vs9             // a0_r*b0_i
-       xsmulsp         vs35,   vs1,    vs8             // a0_i*b0_r
+       xsmuldp         vs32,   vs0,    vs8             // a0_r*b0_r
+       xsmuldp         vs33,   vs1,    vs9             // a0_i*b0_i
+       xsmuldp         vs34,   vs0,    vs9             // a0_r*b0_i
+       xsmuldp         vs35,   vs1,    vs8             // a0_i*b0_r
 
-       xsmulsp         vs36,   vs0,    vs10            // a0_r*b1_r
-       xsmulsp         vs37,   vs1,    vs11            // a0_i*b1_i
-       xsmulsp         vs38,   vs0,    vs11            // a0_r*b1_i
-       xsmulsp         vs39,   vs1,    vs10            // a0_i*b1_r
+       xsmuldp         vs36,   vs0,    vs10            // a0_r*b1_r
+       xsmuldp         vs37,   vs1,    vs11            // a0_i*b1_i
+       xsmuldp         vs38,   vs0,    vs11            // a0_r*b1_i
+       xsmuldp         vs39,   vs1,    vs10            // a0_i*b1_r
 
 
 .endm
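In the scalar 2x1 and 1x1 micro-kernels the single-precision scalar forms (xsmulsp, xsmaddasp, xssubsp, xsaddsp) are replaced by their double-precision counterparts (xsmuldp, xsmaddadp, xssubdp, xsadddp); the operands are still loaded with lxsspx and the results stored with stxsspx, so only the intermediate accumulation is widened. A minimal C sketch of the 1x1 tail under that model (assumed NN case, illustrative names, gemm-style accumulation into C):

	static void cgemm_1x1_tail(const float *a, const float *b, int k,
	                           float alpha_r, float alpha_i,
	                           float *c /* interleaved re,im */)
	{
	    double rr = 0.0, ii = 0.0, ri = 0.0, ir = 0.0;   /* vs32..vs35 */
	    for (int i = 0; i < k; i++) {
	        rr += (double)a[2*i]   * b[2*i];             /* xsmaddadp */
	        ii += (double)a[2*i+1] * b[2*i+1];
	        ri += (double)a[2*i]   * b[2*i+1];
	        ir += (double)a[2*i+1] * b[2*i];
	    }
	    double res_r = rr - ii;                          /* XSFADD_R1/R2 */
	    double res_i = ri + ir;                          /* XSFADD_I1/I2 */
	    c[0] += (float)(res_r * alpha_r - res_i * alpha_i);  /* xsmuldp/xssubdp */
	    c[1] += (float)(res_r * alpha_i + res_i * alpha_r);  /* xsmuldp/xsadddp */
	}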
@@ -5188,15 +5262,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        addi            BO,     BO,     16
 
 
-       xsmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r
-       xsmaddasp       vs33,   vs1,    vs9             // a0_i*b0_i
-       xsmaddasp       vs34,   vs0,    vs9             // a0_r*b0_i
-       xsmaddasp       vs35,   vs1,    vs8             // a0_i*b0_r
+       xsmaddadp       vs32,   vs0,    vs8             // a0_r*b0_r
+       xsmaddadp       vs33,   vs1,    vs9             // a0_i*b0_i
+       xsmaddadp       vs34,   vs0,    vs9             // a0_r*b0_i
+       xsmaddadp       vs35,   vs1,    vs8             // a0_i*b0_r
 
-       xsmaddasp       vs36,   vs0,    vs10            // a0_r*b1_r
-       xsmaddasp       vs37,   vs1,    vs11            // a0_i*b1_i
-       xsmaddasp       vs38,   vs0,    vs11            // a0_r*b1_i
-       xsmaddasp       vs39,   vs1,    vs10            // a0_i*b1_r
+       xsmaddadp       vs36,   vs0,    vs10            // a0_r*b1_r
+       xsmaddadp       vs37,   vs1,    vs11            // a0_i*b1_i
+       xsmaddadp       vs38,   vs0,    vs11            // a0_r*b1_i
+       xsmaddadp       vs39,   vs1,    vs10            // a0_i*b1_r
 
 
 .endm
@@ -5222,15 +5296,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        addi            BO,     BO,     16
 
 
-       xsmaddasp       vs32,   vs4,    vs16            // a4_r*b0_r
-       xsmaddasp       vs33,   vs5,    vs17            // a4_i*b0_i
-       xsmaddasp       vs34,   vs4,    vs17            // a4_r*b0_i
-       xsmaddasp       vs35,   vs5,    vs16            // a4_i*b0_r
+       xsmaddadp       vs32,   vs4,    vs16            // a4_r*b0_r
+       xsmaddadp       vs33,   vs5,    vs17            // a4_i*b0_i
+       xsmaddadp       vs34,   vs4,    vs17            // a4_r*b0_i
+       xsmaddadp       vs35,   vs5,    vs16            // a4_i*b0_r
 
-       xsmaddasp       vs36,   vs4,    vs18            // a4_r*b1_r
-       xsmaddasp       vs37,   vs5,    vs19            // a4_i*b1_i
-       xsmaddasp       vs38,   vs4,    vs19            // a4_r*b1_i
-       xsmaddasp       vs39,   vs5,    vs18            // a4_i*b1_r
+       xsmaddadp       vs36,   vs4,    vs18            // a4_r*b1_r
+       xsmaddadp       vs37,   vs5,    vs19            // a4_i*b1_i
+       xsmaddadp       vs38,   vs4,    vs19            // a4_r*b1_i
+       xsmaddadp       vs39,   vs5,    vs18            // a4_i*b1_r
 
 
 .endm
@@ -5238,15 +5312,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro KERNEL2x1_E2
 
 
-       xsmaddasp       vs32,   vs4,    vs16            // a4_r*b0_r
-       xsmaddasp       vs33,   vs5,    vs17            // a4_i*b0_i
-       xsmaddasp       vs34,   vs4,    vs17            // a4_r*b0_i
-       xsmaddasp       vs35,   vs5,    vs16            // a4_i*b0_r
+       xsmaddadp       vs32,   vs4,    vs16            // a4_r*b0_r
+       xsmaddadp       vs33,   vs5,    vs17            // a4_i*b0_i
+       xsmaddadp       vs34,   vs4,    vs17            // a4_r*b0_i
+       xsmaddadp       vs35,   vs5,    vs16            // a4_i*b0_r
 
-       xsmaddasp       vs36,   vs4,    vs18            // a4_r*b1_r
-       xsmaddasp       vs37,   vs5,    vs19            // a4_i*b1_i
-       xsmaddasp       vs38,   vs4,    vs19            // a4_r*b1_i
-       xsmaddasp       vs39,   vs5,    vs18            // a4_i*b1_r
+       xsmaddadp       vs36,   vs4,    vs18            // a4_r*b1_r
+       xsmaddadp       vs37,   vs5,    vs19            // a4_i*b1_i
+       xsmaddadp       vs38,   vs4,    vs19            // a4_r*b1_i
+       xsmaddadp       vs39,   vs5,    vs18            // a4_i*b1_r
 
 
 .endm
@@ -5272,15 +5346,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        addi            BO,     BO,     16
 
 
-       xsmulsp         vs32,   vs0,    vs8             // a0_r*b0_r
-       xsmulsp         vs33,   vs1,    vs9             // a0_i*b0_i
-       xsmulsp         vs34,   vs0,    vs9             // a0_r*b0_i
-       xsmulsp         vs35,   vs1,    vs8             // a0_i*b0_r
+       xsmuldp         vs32,   vs0,    vs8             // a0_r*b0_r
+       xsmuldp         vs33,   vs1,    vs9             // a0_i*b0_i
+       xsmuldp         vs34,   vs0,    vs9             // a0_r*b0_i
+       xsmuldp         vs35,   vs1,    vs8             // a0_i*b0_r
 
-       xsmulsp         vs36,   vs0,    vs10            // a0_r*b1_r
-       xsmulsp         vs37,   vs1,    vs11            // a0_i*b1_i
-       xsmulsp         vs38,   vs0,    vs11            // a0_r*b1_i
-       xsmulsp         vs39,   vs1,    vs10            // a0_i*b1_r
+       xsmuldp         vs36,   vs0,    vs10            // a0_r*b1_r
+       xsmuldp         vs37,   vs1,    vs11            // a0_i*b1_i
+       xsmuldp         vs38,   vs0,    vs11            // a0_r*b1_i
+       xsmuldp         vs39,   vs1,    vs10            // a0_i*b1_r
 
 
 .endm
@@ -5306,15 +5380,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        addi            BO,     BO,     16
 
 
-       xsmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r
-       xsmaddasp       vs33,   vs1,    vs9             // a0_i*b0_i
-       xsmaddasp       vs34,   vs0,    vs9             // a0_r*b0_i
-       xsmaddasp       vs35,   vs1,    vs8             // a0_i*b0_r
+       xsmaddadp       vs32,   vs0,    vs8             // a0_r*b0_r
+       xsmaddadp       vs33,   vs1,    vs9             // a0_i*b0_i
+       xsmaddadp       vs34,   vs0,    vs9             // a0_r*b0_i
+       xsmaddadp       vs35,   vs1,    vs8             // a0_i*b0_r
 
-       xsmaddasp       vs36,   vs0,    vs10            // a0_r*b1_r
-       xsmaddasp       vs37,   vs1,    vs11            // a0_i*b1_i
-       xsmaddasp       vs38,   vs0,    vs11            // a0_r*b1_i
-       xsmaddasp       vs39,   vs1,    vs10            // a0_i*b1_r
+       xsmaddadp       vs36,   vs0,    vs10            // a0_r*b1_r
+       xsmaddadp       vs37,   vs1,    vs11            // a0_i*b1_i
+       xsmaddadp       vs38,   vs0,    vs11            // a0_r*b1_i
+       xsmaddadp       vs39,   vs1,    vs10            // a0_i*b1_r
 
 
 .endm
@@ -5346,16 +5420,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        XSFADD_R2       vs4,    vs4,    vs33            // add a0_i * b0_i
        XSFADD_I2       vs5,    vs5,    vs34            // add a0_i * b0_r
 
-       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
-       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
-       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
-       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+       xsmuldp         vs16,   vs4,    alpha_dr                // r0_r * alpha_r
+       xsmuldp         vs17,   vs5,    alpha_di                // r0_i * alpha_i
+       xsmuldp         vs18,   vs4,    alpha_di                // r0_r * alpha_i
+       xsmuldp         vs19,   vs5,    alpha_dr                // r0_i * alpha_r
 
-       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
-       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+       xssubdp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsadddp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
 
-       xsaddsp         vs0,    vs0,    vs20
-       xsaddsp         vs1,    vs1,    vs21
+       xsadddp         vs0,    vs0,    vs20
+       xsadddp         vs1,    vs1,    vs21
 
 
        stxsspx         vs0,    o0,     T2      // store c0_r
@@ -5388,16 +5462,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        XSFADD_R2       vs4,    vs4,    vs37            // add a0_i * b0_i
        XSFADD_I2       vs5,    vs5,    vs38            // add a0_i * b0_r
 
-       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
-       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
-       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
-       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+       xsmuldp         vs16,   vs4,    alpha_dr                // r0_r * alpha_r
+       xsmuldp         vs17,   vs5,    alpha_di                // r0_i * alpha_i
+       xsmuldp         vs18,   vs4,    alpha_di                // r0_r * alpha_i
+       xsmuldp         vs19,   vs5,    alpha_dr                // r0_i * alpha_r
 
-       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
-       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+       xssubdp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsadddp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
 
-       xsaddsp         vs0,    vs0,    vs20
-       xsaddsp         vs1,    vs1,    vs21
+       xsadddp         vs0,    vs0,    vs20
+       xsadddp         vs1,    vs1,    vs21
 
 
        stxsspx         vs0,    o0,     T2      // store c0_r
@@ -5673,51 +5747,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xxlxor          vs0,    vs0,    vs0
 #endif
 
-       stxvw4x         vs32,   o0,     TBUFFER
 
-       lxsspx          vs8,    o0,     TBUFFER
-       lxsspx          vs9,    o4,     TBUFFER
-       lxsspx          vs10,   o8,     TBUFFER
-       lxsspx          vs11,   o12,    TBUFFER
+       xxspltw         vs8,    vs32,   0
+       xxspltw         vs9,    vs32,   1
+       xxspltw         vs10,   vs32,   2
+       xxspltw         vs11,   vs32,   3
 
-       stxvw4x         vs33,   o0,     TBUFFER
 
-       lxsspx          vs12,   o0,     TBUFFER
-       lxsspx          vs13,   o4,     TBUFFER
-       lxsspx          vs14,   o8,     TBUFFER
-       lxsspx          vs15,   o12,    TBUFFER
+       xxspltw         vs12,   vs33,   0
+       xxspltw         vs13,   vs33,   1
+       xxspltw         vs14,   vs33,   2
+       xxspltw         vs15,   vs33,   3
 
-       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
-       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
-       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
-       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
 
-       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
-       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
-       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
-       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
 
-       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
-       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
-       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
-       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
 
-       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
-       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
 
-       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
-       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
-       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
-       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
 
-       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
-       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
-       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
-       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
-       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
-       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
        xvaddsp         vs0,    vs0,    vs1
 
 
@@ -5738,51 +5813,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xxlxor          vs0,    vs0,    vs0
 #endif
 
-       stxvw4x         vs34,   o0,     TBUFFER
 
-       lxsspx          vs8,    o0,     TBUFFER
-       lxsspx          vs9,    o4,     TBUFFER
-       lxsspx          vs10,   o8,     TBUFFER
-       lxsspx          vs11,   o12,    TBUFFER
+       xxspltw         vs8,    vs34,   0
+       xxspltw         vs9,    vs34,   1
+       xxspltw         vs10,   vs34,   2
+       xxspltw         vs11,   vs34,   3
 
-       stxvw4x         vs35,   o0,     TBUFFER
 
-       lxsspx          vs12,   o0,     TBUFFER
-       lxsspx          vs13,   o4,     TBUFFER
-       lxsspx          vs14,   o8,     TBUFFER
-       lxsspx          vs15,   o12,    TBUFFER
+       xxspltw         vs12,   vs35,   0
+       xxspltw         vs13,   vs35,   1
+       xxspltw         vs14,   vs35,   2
+       xxspltw         vs15,   vs35,   3
 
-       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
-       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
-       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
-       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
 
-       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
-       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
-       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
-       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
 
-       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
-       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
-       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
-       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
 
-       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
-       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
 
-       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
-       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
-       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
-       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
 
-       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
-       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
-       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
-       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
-       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
-       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
        xvaddsp         vs0,    vs0,    vs1
 
 
@@ -5803,51 +5879,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xxlxor          vs0,    vs0,    vs0
 #endif
 
-       stxvw4x         vs36,   o0,     TBUFFER
 
-       lxsspx          vs8,    o0,     TBUFFER
-       lxsspx          vs9,    o4,     TBUFFER
-       lxsspx          vs10,   o8,     TBUFFER
-       lxsspx          vs11,   o12,    TBUFFER
+       xxspltw         vs8,    vs36,   0
+       xxspltw         vs9,    vs36,   1
+       xxspltw         vs10,   vs36,   2
+       xxspltw         vs11,   vs36,   3
 
-       stxvw4x         vs37,   o0,     TBUFFER
 
-       lxsspx          vs12,   o0,     TBUFFER
-       lxsspx          vs13,   o4,     TBUFFER
-       lxsspx          vs14,   o8,     TBUFFER
-       lxsspx          vs15,   o12,    TBUFFER
+       xxspltw         vs12,   vs37,   0
+       xxspltw         vs13,   vs37,   1
+       xxspltw         vs14,   vs37,   2
+       xxspltw         vs15,   vs37,   3
 
-       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
-       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
-       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
-       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
 
-       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
-       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
-       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
-       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
 
-       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
-       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
-       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
-       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
 
-       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
-       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
 
-       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
-       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
-       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
-       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
 
-       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
-       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
-       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
-       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
-       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
-       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
        xvaddsp         vs0,    vs0,    vs1
 
 
@@ -5868,51 +5945,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xxlxor          vs0,    vs0,    vs0
 #endif
 
-       stxvw4x         vs38,   o0,     TBUFFER
 
-       lxsspx          vs8,    o0,     TBUFFER
-       lxsspx          vs9,    o4,     TBUFFER
-       lxsspx          vs10,   o8,     TBUFFER
-       lxsspx          vs11,   o12,    TBUFFER
+       xxspltw         vs8,    vs38,   0
+       xxspltw         vs9,    vs38,   1
+       xxspltw         vs10,   vs38,   2
+       xxspltw         vs11,   vs38,   3
 
-       stxvw4x         vs39,   o0,     TBUFFER
 
-       lxsspx          vs12,   o0,     TBUFFER
-       lxsspx          vs13,   o4,     TBUFFER
-       lxsspx          vs14,   o8,     TBUFFER
-       lxsspx          vs15,   o12,    TBUFFER
+       xxspltw         vs12,   vs39,   0
+       xxspltw         vs13,   vs39,   1
+       xxspltw         vs14,   vs39,   2
+       xxspltw         vs15,   vs39,   3
 
-       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
-       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
-       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
-       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
 
-       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
-       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
-       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
-       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
 
-       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
-       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
-       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
-       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
 
-       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
-       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
 
-       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
-       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
-       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
-       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
 
-       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
-       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
-       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
-       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
-       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
-       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
        xvaddsp         vs0,    vs0,    vs1
 
 
@@ -6140,51 +6218,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xxlxor          vs0,    vs0,    vs0
 #endif
 
-       stxvw4x         vs32,   o0,     TBUFFER
 
-       lxsspx          vs8,    o0,     TBUFFER
-       lxsspx          vs9,    o4,     TBUFFER
-       lxsspx          vs10,   o8,     TBUFFER
-       lxsspx          vs11,   o12,    TBUFFER
+       xxspltw         vs8,    vs32,   0
+       xxspltw         vs9,    vs32,   1
+       xxspltw         vs10,   vs32,   2
+       xxspltw         vs11,   vs32,   3
 
-       stxvw4x         vs33,   o0,     TBUFFER
 
-       lxsspx          vs12,   o0,     TBUFFER
-       lxsspx          vs13,   o4,     TBUFFER
-       lxsspx          vs14,   o8,     TBUFFER
-       lxsspx          vs15,   o12,    TBUFFER
+       xxspltw         vs12,   vs33,   0
+       xxspltw         vs13,   vs33,   1
+       xxspltw         vs14,   vs33,   2
+       xxspltw         vs15,   vs33,   3
 
-       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
-       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
-       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
-       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
 
-       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
-       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
-       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
-       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
 
-       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
-       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
-       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
-       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
 
-       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
-       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
 
-       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
-       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
-       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
-       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
 
-       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
-       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
-       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
-       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
-       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
-       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
        xvaddsp         vs0,    vs0,    vs1
 
 
@@ -6205,51 +6284,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xxlxor          vs0,    vs0,    vs0
 #endif
 
-       stxvw4x         vs34,   o0,     TBUFFER
 
-       lxsspx          vs8,    o0,     TBUFFER
-       lxsspx          vs9,    o4,     TBUFFER
-       lxsspx          vs10,   o8,     TBUFFER
-       lxsspx          vs11,   o12,    TBUFFER
+       xxspltw         vs8,    vs34,   0
+       xxspltw         vs9,    vs34,   1
+       xxspltw         vs10,   vs34,   2
+       xxspltw         vs11,   vs34,   3
 
-       stxvw4x         vs35,   o0,     TBUFFER
 
-       lxsspx          vs12,   o0,     TBUFFER
-       lxsspx          vs13,   o4,     TBUFFER
-       lxsspx          vs14,   o8,     TBUFFER
-       lxsspx          vs15,   o12,    TBUFFER
+       xxspltw         vs12,   vs35,   0
+       xxspltw         vs13,   vs35,   1
+       xxspltw         vs14,   vs35,   2
+       xxspltw         vs15,   vs35,   3
 
-       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
-       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
-       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
-       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
 
-       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
-       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
-       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
-       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
 
-       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
-       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
-       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
-       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
 
-       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
-       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
 
-       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
-       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
-       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
-       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
 
-       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
-       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
-       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
-       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
-       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
-       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
        xvaddsp         vs0,    vs0,    vs1
 
 
@@ -6453,51 +6533,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        xxlxor          vs0,    vs0,    vs0
 #endif
 
-       stxvw4x         vs32,   o0,     TBUFFER
 
-       lxsspx          vs8,    o0,     TBUFFER
-       lxsspx          vs9,    o4,     TBUFFER
-       lxsspx          vs10,   o8,     TBUFFER
-       lxsspx          vs11,   o12,    TBUFFER
+       xxspltw         vs8,    vs32,   0
+       xxspltw         vs9,    vs32,   1
+       xxspltw         vs10,   vs32,   2
+       xxspltw         vs11,   vs32,   3
 
-       stxvw4x         vs33,   o0,     TBUFFER
 
-       lxsspx          vs12,   o0,     TBUFFER
-       lxsspx          vs13,   o4,     TBUFFER
-       lxsspx          vs14,   o8,     TBUFFER
-       lxsspx          vs15,   o12,    TBUFFER
+       xxspltw         vs12,   vs33,   0
+       xxspltw         vs13,   vs33,   1
+       xxspltw         vs14,   vs33,   2
+       xxspltw         vs15,   vs33,   3
 
-       XSFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
-       XSFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
-       XSFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
-       XSFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
+       XVFADD_R1       vs4,    vs4,    vs8             // add a0_r * b0_r
+       XVFADD_I2       vs5,    vs5,    vs12            // add a0_r * b0_i
+       XVFADD_R1       vs6,    vs6,    vs10            // add a1_r * b0_r
+       XVFADD_I2       vs7,    vs7,    vs14            // add a1_r * b0_i
 
-       XSFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
-       XSFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
-       XSFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
-       XSFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
+       XVFADD_R2       vs4,    vs4,    vs13            // add a0_i * b0_i
+       XVFADD_I1       vs5,    vs5,    vs9             // add a0_i * b0_r
+       XVFADD_R2       vs6,    vs6,    vs15            // add a1_i * b0_i
+       XVFADD_I1       vs7,    vs7,    vs11            // add a1_i * b0_r
 
-       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
-       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
-       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
-       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+       xvmulsp         vs16,   vs4,    alpha_sr                // r0_r * alpha_r
+       xvmulsp         vs17,   vs5,    alpha_si                // r0_i * alpha_i
+       xvmulsp         vs18,   vs4,    alpha_si                // r0_r * alpha_i
+       xvmulsp         vs19,   vs5,    alpha_sr                // r0_i * alpha_r
 
-       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
-       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+       xvsubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xvaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
 
-       xsmulsp         vs16,   vs6,    alpha_r         // r1_r * alpha_r
-       xsmulsp         vs17,   vs7,    alpha_i         // r1_i * alpha_i
-       xsmulsp         vs18,   vs6,    alpha_i         // r1_r * alpha_i
-       xsmulsp         vs19,   vs7,    alpha_r         // r1_i * alpha_r
+       xvmulsp         vs16,   vs6,    alpha_sr                // r1_r * alpha_r
+       xvmulsp         vs17,   vs7,    alpha_si                // r1_i * alpha_i
+       xvmulsp         vs18,   vs6,    alpha_si                // r1_r * alpha_i
+       xvmulsp         vs19,   vs7,    alpha_sr                // r1_i * alpha_r
 
-       xssubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
-       xsaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
+       xvsubsp         vs22,   vs16,   vs17            // r1_r * alpha_r - r1_i * alpha_i
+       xvaddsp         vs23,   vs18,   vs19            // r1_r * alpha_i + r1_i * alpha_r
 
-       stxsspx         vs20,   o0,     TBUFFER         // store r0_r
-       stxsspx         vs21,   o4,     TBUFFER         // store r0_i
-       stxsspx         vs22,   o8,     TBUFFER         // store r1_r
-       stxsspx         vs23,   o12,    TBUFFER         // store r1_i
-       lxvw4x          vs1,    o0,     TBUFFER                 // load r0_r, r0_i, r1_r, r1_i
+       xxlxor          vs24,   vs24,   vs24
+       xxsldwi         vs20,   vs20,   vs24,   3               // r0_r
+       xxsldwi         vs21,   vs21,   vs24,   2               // r0_i
+       xxsldwi         vs22,   vs22,   vs24,   1               // r1_r
+       xxsldwi         vs23,   vs23,   vs24,   0               // r1_i
+       xvaddsp         vs20,   vs20,   vs21            // r0_r, r0_i
+       xvaddsp         vs22,   vs22,   vs23            // r1_r, r1_i
+       xvaddsp         vs1,    vs20,   vs22                    // r0_r, r0_i, r1_r, r1_i
        xvaddsp         vs0,    vs0,    vs1
 
 
@@ -6547,10 +6628,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        addi            BO,     BO,     8
 
 
-       xsmulsp         vs32,   vs0,    vs8             // a0_r*b0_r
-       xsmulsp         vs33,   vs1,    vs9             // a0_i*b0_i
-       xsmulsp         vs34,   vs0,    vs9             // a0_r*b0_i
-       xsmulsp         vs35,   vs1,    vs8             // a0_i*b0_r
+       xsmuldp         vs32,   vs0,    vs8             // a0_r*b0_r
+       xsmuldp         vs33,   vs1,    vs9             // a0_i*b0_i
+       xsmuldp         vs34,   vs0,    vs9             // a0_r*b0_i
+       xsmuldp         vs35,   vs1,    vs8             // a0_i*b0_r
 
 
 .endm
@@ -6571,10 +6652,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        addi            BO,     BO,     8
 
 
-       xsmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r
-       xsmaddasp       vs33,   vs1,    vs9             // a0_i*b0_i
-       xsmaddasp       vs34,   vs0,    vs9             // a0_r*b0_i
-       xsmaddasp       vs35,   vs1,    vs8             // a0_i*b0_r
+       xsmaddadp       vs32,   vs0,    vs8             // a0_r*b0_r
+       xsmaddadp       vs33,   vs1,    vs9             // a0_i*b0_i
+       xsmaddadp       vs34,   vs0,    vs9             // a0_r*b0_i
+       xsmaddadp       vs35,   vs1,    vs8             // a0_i*b0_r
 
 
 .endm
@@ -6595,10 +6676,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        addi            BO,     BO,     8
 
 
-       xsmaddasp       vs32,   vs4,    vs16            // a4_r*b0_r
-       xsmaddasp       vs33,   vs5,    vs17            // a4_i*b0_i
-       xsmaddasp       vs34,   vs4,    vs17            // a4_r*b0_i
-       xsmaddasp       vs35,   vs5,    vs16            // a4_i*b0_r
+       xsmaddadp       vs32,   vs4,    vs16            // a4_r*b0_r
+       xsmaddadp       vs33,   vs5,    vs17            // a4_i*b0_i
+       xsmaddadp       vs34,   vs4,    vs17            // a4_r*b0_i
+       xsmaddadp       vs35,   vs5,    vs16            // a4_i*b0_r
 
 
 .endm
@@ -6606,10 +6687,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro KERNEL1x1_E2
 
 
-       xsmaddasp       vs32,   vs4,    vs16            // a4_r*b0_r
-       xsmaddasp       vs33,   vs5,    vs17            // a4_i*b0_i
-       xsmaddasp       vs34,   vs4,    vs17            // a4_r*b0_i
-       xsmaddasp       vs35,   vs5,    vs16            // a4_i*b0_r
+       xsmaddadp       vs32,   vs4,    vs16            // a4_r*b0_r
+       xsmaddadp       vs33,   vs5,    vs17            // a4_i*b0_i
+       xsmaddadp       vs34,   vs4,    vs17            // a4_r*b0_i
+       xsmaddadp       vs35,   vs5,    vs16            // a4_i*b0_r
 
 
 .endm
@@ -6630,10 +6711,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        addi            BO,     BO,     8
 
 
-       xsmulsp         vs32,   vs0,    vs8             // a0_r*b0_r
-       xsmulsp         vs33,   vs1,    vs9             // a0_i*b0_i
-       xsmulsp         vs34,   vs0,    vs9             // a0_r*b0_i
-       xsmulsp         vs35,   vs1,    vs8             // a0_i*b0_r
+       xsmuldp         vs32,   vs0,    vs8             // a0_r*b0_r
+       xsmuldp         vs33,   vs1,    vs9             // a0_i*b0_i
+       xsmuldp         vs34,   vs0,    vs9             // a0_r*b0_i
+       xsmuldp         vs35,   vs1,    vs8             // a0_i*b0_r
 
 
 .endm
@@ -6654,10 +6735,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        addi            BO,     BO,     8
 
 
-       xsmaddasp       vs32,   vs0,    vs8             // a0_r*b0_r
-       xsmaddasp       vs33,   vs1,    vs9             // a0_i*b0_i
-       xsmaddasp       vs34,   vs0,    vs9             // a0_r*b0_i
-       xsmaddasp       vs35,   vs1,    vs8             // a0_i*b0_r
+       xsmaddadp       vs32,   vs0,    vs8             // a0_r*b0_r
+       xsmaddadp       vs33,   vs1,    vs9             // a0_i*b0_i
+       xsmaddadp       vs34,   vs0,    vs9             // a0_r*b0_i
+       xsmaddadp       vs35,   vs1,    vs8             // a0_i*b0_r
 
 
 .endm
@@ -6689,16 +6770,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        XSFADD_R2       vs4,    vs4,    vs33            // add a0_i * b0_i
        XSFADD_I2       vs5,    vs5,    vs34            // add a0_i * b0_r
 
-       xsmulsp         vs16,   vs4,    alpha_r         // r0_r * alpha_r
-       xsmulsp         vs17,   vs5,    alpha_i         // r0_i * alpha_i
-       xsmulsp         vs18,   vs4,    alpha_i         // r0_r * alpha_i
-       xsmulsp         vs19,   vs5,    alpha_r         // r0_i * alpha_r
+       xsmuldp         vs16,   vs4,    alpha_dr                // r0_r * alpha_r
+       xsmuldp         vs17,   vs5,    alpha_di                // r0_i * alpha_i
+       xsmuldp         vs18,   vs4,    alpha_di                // r0_r * alpha_i
+       xsmuldp         vs19,   vs5,    alpha_dr                // r0_i * alpha_r
 
-       xssubsp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
-       xsaddsp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
+       xssubdp         vs20,   vs16,   vs17            // r0_r * alpha_r - r0_i * alpha_i
+       xsadddp         vs21,   vs18,   vs19            // r0_r * alpha_i + r0_i * alpha_r
 
-       xsaddsp         vs0,    vs0,    vs20
-       xsaddsp         vs1,    vs1,    vs21
+       xsadddp         vs0,    vs0,    vs20
+       xsadddp         vs1,    vs1,    vs21
 
 
        stxsspx         vs0,    o0,     T2      // store c0_r
index b154857..b202114 100644 (file)
@@ -26,11 +26,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 /**************************************************************************************
-* 2016/03/18 Werner Saar (wernsaar@googlemail.com)
+* 2016/04/03 Werner Saar (wernsaar@googlemail.com)
 *       BLASTEST               : OK
 *       CTEST                  : OK
 *       TEST                   : OK
-*       LAPACK-TEST            : OK
+*       LAPACK-TEST            : OK
 **************************************************************************************/
 
 /*********************************************************************/
@@ -129,18 +129,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 #define o0     0
-#define alpha_r vs30
-#define alpha_i vs31
-#define alpha_vr vs28
-#define alpha_vi vs29
 
+#define alpha_dr vs28
+#define alpha_di vs29
+#define alpha_sr vs30
+#define alpha_si vs31
 
 #define o12    r12
 #define KKK    r13
 #define K1     r14
 #define L      r15
 #define o16    r16
-#define TBUFFER        r17
+#define NOTUSED        r17
 #define T2     r19
 #define KK     r20
 #define        o8      r21
@@ -278,21 +278,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "cgemm_macros_8x4_power8.S"
 
        cmpwi   cr0, M, 0
-       ble     .L999_H1
+       ble     L999_H1
        cmpwi   cr0, N, 0
-       ble     .L999_H1
+       ble     L999_H1
        cmpwi   cr0, K, 0
-       ble     .L999_H1
+       ble     L999_H1
 
         slwi    LDC, LDC, ZBASE_SHIFT
-        li      PRE, 256
+        li      PRE, 384
         li      o4  , 4
         li      o8  , 8
         li      o12 , 12
         li      o16 , 16
         li      o32 , 32
         li      o48 , 48
-       addi    TBUFFER, SP, 360
 
 
 #ifdef __64BIT__
@@ -301,14 +300,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        addi    T1, SP, 224
 #endif
 
-       lxsspx  alpha_r, 0, T1
-       lxsspx  alpha_i, o8, T1
+        lxsspx  alpha_dr, 0, T1
+        lxsspx  alpha_di, o8, T1
+        addi    T1, SP, 360
+        li      T2, 0
+
+        stw             T2, 0(T1)
+        stw             T2, 4(T1)
+        stw             T2, 8(T1)
+        stxsspx         alpha_dr, o12, T1
+        lxvw4x          alpha_sr, o0 , T1
+        addi            T1, T1, 16
+
+        stw             T2, 0(T1)
+        stw             T2, 4(T1)
+        stw             T2, 8(T1)
+        stxsspx         alpha_di, o12, T1
+        lxvw4x          alpha_si, o0 , T1
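The scratch area at SP+360 is now touched only once, in the prologue, to build vector images of alpha: three zero words plus the scalar are written and reloaded with lxvw4x, so alpha_sr and alpha_si each carry alpha in a single word with the remaining lanes zero, and the xxsldwi-shifted partials in the rewritten save sequences can simply be added together. The per-tile TBUFFER traffic is gone, hence the register rename to NOTUSED. A small C sketch of that construction (assumed model, not the actual asm):

	#include <string.h>

	typedef struct { float w[4]; } vsx_vec;   /* stand-in for a VSX register */

	static vsx_vec make_alpha_lane(float alpha)
	{
	    float slot[4] = { 0.0f, 0.0f, 0.0f, 0.0f };   /* three stw of zero ... */
	    slot[3] = alpha;                              /* ... plus stxsspx at offset 12 */
	    vsx_vec v;
	    memcpy(&v, slot, sizeof v);                   /* lxvw4x */
	    return v;
	}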
 
        .align 5
 
 #include "ctrmm_logic_8x4_power8.S"
 
-.L999:
+L999:
        addi    r3, 0, 0
 
        lfd     f14,    0(SP)
index f9656e9..3e50646 100644 (file)
@@ -26,18 +26,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 /**************************************************************************************
-* 2016/03/18 Werner Saar (wernsaar@googlemail.com)
+* 2016/04/03 Werner Saar (wernsaar@googlemail.com)
 *       BLASTEST               : OK
 *       CTEST                  : OK
 *       TEST                   : OK
-*       LAPACK-TEST            : OK
+*       LAPACK-TEST            : OK
 **************************************************************************************/
 
-
        srawi.          J,      N,      2
-       ble             .LCTRMM_L4_END
+       ble             CTRMM_L4_END
 
-.LCTRMM_L4_BEGIN:
+CTRMM_L4_BEGIN:
 
        mr              CO,     C
        mr              AO,     A
@@ -49,9 +48,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
        srawi.          I,      M,      3
-       ble             .LCTRMM_L4x8_END
+       ble             CTRMM_L4x8_END
 
-.LCTRMM_L4x8_BEGIN:
+CTRMM_L4x8_BEGIN:
 
 
 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@@ -78,11 +77,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              KKK,    T1
        mr              K1,     T1
        srawi.          L,      K1,     3                               // KTEMP / 8 -> L
-       ble             .LCTRMM_L4x8_SUB0
+       ble             CTRMM_L4x8_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LCTRMM_L4x8_SUB4
+       ble             CTRMM_L4x8_SUB4
 
-.LCTRMM_L4x8_LOOP_START:
+CTRMM_L4x8_LOOP_START:
 
        LOAD4x8_1
        KERNEL4x8_I1
@@ -96,11 +95,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x8_2
 
        addic.          L,      L,      -2
-       ble             .LCTRMM_L4x8_LOOP_END
+       ble             CTRMM_L4x8_LOOP_END
 
        .align 5
 
-.LCTRMM_L4x8_LOOP:
+CTRMM_L4x8_LOOP:
 
        KERNEL4x8_1
        KERNEL4x8_2
@@ -113,9 +112,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x8_2
 
        addic.          L,      L,      -1
-       bgt             .LCTRMM_L4x8_LOOP
+       bgt             CTRMM_L4x8_LOOP
 
-.LCTRMM_L4x8_LOOP_END:
+CTRMM_L4x8_LOOP_END:
 
        KERNEL4x8_1
        KERNEL4x8_2
@@ -127,9 +126,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x8_1
        KERNEL4x8_E2
 
-       b               .LCTRMM_L4x8_SUB1
+       b               CTRMM_L4x8_SUB1
 
-.LCTRMM_L4x8_SUB4:
+CTRMM_L4x8_SUB4:
 
        KERNEL4x8_SUBI1
        KERNEL4x8_SUB1
@@ -141,31 +140,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x8_SUB1
        KERNEL4x8_SUB1
 
-       b               .LCTRMM_L4x8_SUB1
+       b               CTRMM_L4x8_SUB1
 
-.LCTRMM_L4x8_SUB0:
+CTRMM_L4x8_SUB0:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
 
        KERNEL4x8_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LCTRMM_L4x8_SAVE
-       b               .LCTRMM_L4x8_SUB2
+       ble             CTRMM_L4x8_SAVE
+       b               CTRMM_L4x8_SUB2
 
-.LCTRMM_L4x8_SUB1:
+CTRMM_L4x8_SUB1:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
-       ble             .LCTRMM_L4x8_SAVE
+       ble             CTRMM_L4x8_SAVE
 
-.LCTRMM_L4x8_SUB2:
+CTRMM_L4x8_SUB2:
 
        KERNEL4x8_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LCTRMM_L4x8_SUB2
+       bgt             CTRMM_L4x8_SUB2
 
-.LCTRMM_L4x8_SAVE:
+CTRMM_L4x8_SAVE:
 
        SAVE4x8
 
@@ -183,16 +182,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
        addic.          I,      I,      -1
-       bgt             .LCTRMM_L4x8_BEGIN
+       bgt             CTRMM_L4x8_BEGIN
 
-.LCTRMM_L4x8_END:
+CTRMM_L4x8_END:
 
-.LCTRMM_L4x4_BEGIN:
+CTRMM_L4x4_BEGIN:
        andi.           T2,     M,      7
-       ble             .LCTRMM_L4x1_END
+       ble             CTRMM_L4x1_END
 
        andi.           T1,     M,      4
-       ble             .LCTRMM_L4x4_END
+       ble             CTRMM_L4x4_END
 
 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
        mr              BO,     B                                       // B -> BO
@@ -218,11 +217,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              KKK,    T1
        mr              K1,     T1
        srawi.          L,      K1,     3                               // KTEMP / 8 -> L
-       ble             .LCTRMM_L4x4_SUB0
+       ble             CTRMM_L4x4_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LCTRMM_L4x4_SUB4
+       ble             CTRMM_L4x4_SUB4
 
-.LCTRMM_L4x4_LOOP_START:
+CTRMM_L4x4_LOOP_START:
 
        LOAD4x4_1
        KERNEL4x4_I1
@@ -236,11 +235,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x4_2
 
        addic.          L,      L,      -2
-       ble             .LCTRMM_L4x4_LOOP_END
+       ble             CTRMM_L4x4_LOOP_END
 
        .align 5
 
-.LCTRMM_L4x4_LOOP:
+CTRMM_L4x4_LOOP:
 
        KERNEL4x4_1
        KERNEL4x4_2
@@ -253,9 +252,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x4_2
 
        addic.          L,      L,      -1
-       bgt             .LCTRMM_L4x4_LOOP
+       bgt             CTRMM_L4x4_LOOP
 
-.LCTRMM_L4x4_LOOP_END:
+CTRMM_L4x4_LOOP_END:
 
        KERNEL4x4_1
        KERNEL4x4_2
@@ -267,9 +266,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x4_1
        KERNEL4x4_E2
 
-       b               .LCTRMM_L4x4_SUB1
+       b               CTRMM_L4x4_SUB1
 
-.LCTRMM_L4x4_SUB4:
+CTRMM_L4x4_SUB4:
 
        KERNEL4x4_SUBI1
        KERNEL4x4_SUB1
@@ -281,31 +280,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x4_SUB1
        KERNEL4x4_SUB1
 
-       b               .LCTRMM_L4x4_SUB1
+       b               CTRMM_L4x4_SUB1
 
-.LCTRMM_L4x4_SUB0:
+CTRMM_L4x4_SUB0:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
 
        KERNEL4x4_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LCTRMM_L4x4_SAVE
-       b               .LCTRMM_L4x4_SUB2
+       ble             CTRMM_L4x4_SAVE
+       b               CTRMM_L4x4_SUB2
 
-.LCTRMM_L4x4_SUB1:
+CTRMM_L4x4_SUB1:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
-       ble             .LCTRMM_L4x4_SAVE
+       ble             CTRMM_L4x4_SAVE
 
-.LCTRMM_L4x4_SUB2:
+CTRMM_L4x4_SUB2:
 
        KERNEL4x4_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LCTRMM_L4x4_SUB2
+       bgt             CTRMM_L4x4_SUB2
 
-.LCTRMM_L4x4_SAVE:
+CTRMM_L4x4_SAVE:
 
        SAVE4x4
 
@@ -322,12 +321,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 
-.LCTRMM_L4x4_END:
+CTRMM_L4x4_END:
 
-.LCTRMM_L4x2_BEGIN:
+CTRMM_L4x2_BEGIN:
 
        andi.           T1,     M,      2
-       ble             .LCTRMM_L4x2_END
+       ble             CTRMM_L4x2_END
 
 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
        mr              BO,     B                                       // B -> BO
@@ -353,11 +352,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              KKK,    T1
        mr              K1,     T1
        srawi.          L,      K1,     3                               // KTEMP / 8 -> L
-       ble             .LCTRMM_L4x2_SUB0
+       ble             CTRMM_L4x2_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LCTRMM_L4x2_SUB4
+       ble             CTRMM_L4x2_SUB4
 
-.LCTRMM_L4x2_LOOP_START:
+CTRMM_L4x2_LOOP_START:
 
        LOAD4x2_1
        KERNEL4x2_I1
@@ -371,11 +370,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x2_2
 
        addic.          L,      L,      -2
-       ble             .LCTRMM_L4x2_LOOP_END
+       ble             CTRMM_L4x2_LOOP_END
 
        .align 5
 
-.LCTRMM_L4x2_LOOP:
+CTRMM_L4x2_LOOP:
 
        KERNEL4x2_1
        KERNEL4x2_2
@@ -388,9 +387,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x2_2
 
        addic.          L,      L,      -1
-       bgt             .LCTRMM_L4x2_LOOP
+       bgt             CTRMM_L4x2_LOOP
 
-.LCTRMM_L4x2_LOOP_END:
+CTRMM_L4x2_LOOP_END:
 
        KERNEL4x2_1
        KERNEL4x2_2
@@ -402,9 +401,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x2_1
        KERNEL4x2_E2
 
-       b               .LCTRMM_L4x2_SUB1
+       b               CTRMM_L4x2_SUB1
 
-.LCTRMM_L4x2_SUB4:
+CTRMM_L4x2_SUB4:
 
        KERNEL4x2_SUBI1
        KERNEL4x2_SUB1
@@ -416,31 +415,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x2_SUB1
        KERNEL4x2_SUB1
 
-       b               .LCTRMM_L4x2_SUB1
+       b               CTRMM_L4x2_SUB1
 
-.LCTRMM_L4x2_SUB0:
+CTRMM_L4x2_SUB0:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
 
        KERNEL4x2_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LCTRMM_L4x2_SAVE
-       b               .LCTRMM_L4x2_SUB2
+       ble             CTRMM_L4x2_SAVE
+       b               CTRMM_L4x2_SUB2
 
-.LCTRMM_L4x2_SUB1:
+CTRMM_L4x2_SUB1:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
-       ble             .LCTRMM_L4x2_SAVE
+       ble             CTRMM_L4x2_SAVE
 
-.LCTRMM_L4x2_SUB2:
+CTRMM_L4x2_SUB2:
 
        KERNEL4x2_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LCTRMM_L4x2_SUB2
+       bgt             CTRMM_L4x2_SUB2
 
-.LCTRMM_L4x2_SAVE:
+CTRMM_L4x2_SAVE:
 
        SAVE4x2
 
@@ -457,12 +456,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 
-.LCTRMM_L4x2_END:
+CTRMM_L4x2_END:
 
-.LCTRMM_L4x1_BEGIN:
+CTRMM_L4x1_BEGIN:
 
        andi.           T1,     M,      1
-       ble             .LCTRMM_L4x1_END
+       ble             CTRMM_L4x1_END
 
 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
        mr              BO,     B                                       // B -> BO
@@ -488,11 +487,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              KKK,    T1
        mr              K1,     T1
        srawi.          L,      K1,     3                               // KTEMP / 8 -> L
-       ble             .LCTRMM_L4x1_SUB0
+       ble             CTRMM_L4x1_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LCTRMM_L4x1_SUB4
+       ble             CTRMM_L4x1_SUB4
 
-.LCTRMM_L4x1_LOOP_START:
+CTRMM_L4x1_LOOP_START:
 
        LOAD4x1_1
        KERNEL4x1_I1
@@ -506,11 +505,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x1_2
 
        addic.          L,      L,      -2
-       ble             .LCTRMM_L4x1_LOOP_END
+       ble             CTRMM_L4x1_LOOP_END
 
        .align 5
 
-.LCTRMM_L4x1_LOOP:
+CTRMM_L4x1_LOOP:
 
        KERNEL4x1_1
        KERNEL4x1_2
@@ -523,9 +522,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x1_2
 
        addic.          L,      L,      -1
-       bgt             .LCTRMM_L4x1_LOOP
+       bgt             CTRMM_L4x1_LOOP
 
-.LCTRMM_L4x1_LOOP_END:
+CTRMM_L4x1_LOOP_END:
 
        KERNEL4x1_1
        KERNEL4x1_2
@@ -537,9 +536,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x1_1
        KERNEL4x1_E2
 
-       b               .LCTRMM_L4x1_SUB1
+       b               CTRMM_L4x1_SUB1
 
-.LCTRMM_L4x1_SUB4:
+CTRMM_L4x1_SUB4:
 
        KERNEL4x1_SUBI1
        KERNEL4x1_SUB1
@@ -551,31 +550,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL4x1_SUB1
        KERNEL4x1_SUB1
 
-       b               .LCTRMM_L4x1_SUB1
+       b               CTRMM_L4x1_SUB1
 
-.LCTRMM_L4x1_SUB0:
+CTRMM_L4x1_SUB0:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
 
        KERNEL4x1_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LCTRMM_L4x1_SAVE
-       b               .LCTRMM_L4x1_SUB2
+       ble             CTRMM_L4x1_SAVE
+       b               CTRMM_L4x1_SUB2
 
-.LCTRMM_L4x1_SUB1:
+CTRMM_L4x1_SUB1:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
-       ble             .LCTRMM_L4x1_SAVE
+       ble             CTRMM_L4x1_SAVE
 
-.LCTRMM_L4x1_SUB2:
+CTRMM_L4x1_SUB2:
 
        KERNEL4x1_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LCTRMM_L4x1_SUB2
+       bgt             CTRMM_L4x1_SUB2
 
-.LCTRMM_L4x1_SAVE:
+CTRMM_L4x1_SAVE:
 
        SAVE4x1
 
@@ -592,7 +591,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 
-.LCTRMM_L4x1_END:
+CTRMM_L4x1_END:
 
        slwi            T1,     K,      5
        add             B,      B,      T1
@@ -603,23 +602,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
        addic.          J,      J,      -1
-       bgt             .LCTRMM_L4_BEGIN
+       bgt             CTRMM_L4_BEGIN
 
        andi.           T2,     N,      3
-       ble             .L999_H2
+       ble             L999_H2
 
-.LCTRMM_L4_END:
+CTRMM_L4_END:
 
-       b               .LCTRMM_L2_BEGIN
+       b               CTRMM_L2_BEGIN
 
-.L999_H1:
+L999_H1:
 
-       b               .L999_H2
+       b               L999_H2
 
-.LCTRMM_L2_BEGIN:
+CTRMM_L2_BEGIN:
 
        andi.           T1,     N,      2
-       ble             .LCTRMM_L2_END
+       ble             CTRMM_L2_END
        mr              CO,     C
        mr              AO,     A
        slwi            T1,     LDC     ,       1
@@ -630,9 +629,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
        srawi.          I,      M,      3
-       ble             .LCTRMM_L2x8_END
+       ble             CTRMM_L2x8_END
 
-.LCTRMM_L2x8_BEGIN:
+CTRMM_L2x8_BEGIN:
 
 
 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@@ -659,11 +658,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              KKK,    T1
        mr              K1,     T1
        srawi.          L,      K1,     3                               // KTEMP / 8 -> L
-       ble             .LCTRMM_L2x8_SUB0
+       ble             CTRMM_L2x8_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LCTRMM_L2x8_SUB4
+       ble             CTRMM_L2x8_SUB4
 
-.LCTRMM_L2x8_LOOP_START:
+CTRMM_L2x8_LOOP_START:
 
        LOAD2x8_1
        KERNEL2x8_I1
@@ -677,11 +676,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x8_2
 
        addic.          L,      L,      -2
-       ble             .LCTRMM_L2x8_LOOP_END
+       ble             CTRMM_L2x8_LOOP_END
 
        .align 5
 
-.LCTRMM_L2x8_LOOP:
+CTRMM_L2x8_LOOP:
 
        KERNEL2x8_1
        KERNEL2x8_2
@@ -694,9 +693,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x8_2
 
        addic.          L,      L,      -1
-       bgt             .LCTRMM_L2x8_LOOP
+       bgt             CTRMM_L2x8_LOOP
 
-.LCTRMM_L2x8_LOOP_END:
+CTRMM_L2x8_LOOP_END:
 
        KERNEL2x8_1
        KERNEL2x8_2
@@ -708,9 +707,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x8_1
        KERNEL2x8_E2
 
-       b               .LCTRMM_L2x8_SUB1
+       b               CTRMM_L2x8_SUB1
 
-.LCTRMM_L2x8_SUB4:
+CTRMM_L2x8_SUB4:
 
        KERNEL2x8_SUBI1
        KERNEL2x8_SUB1
@@ -722,31 +721,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x8_SUB1
        KERNEL2x8_SUB1
 
-       b               .LCTRMM_L2x8_SUB1
+       b               CTRMM_L2x8_SUB1
 
-.LCTRMM_L2x8_SUB0:
+CTRMM_L2x8_SUB0:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
 
        KERNEL2x8_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LCTRMM_L2x8_SAVE
-       b               .LCTRMM_L2x8_SUB2
+       ble             CTRMM_L2x8_SAVE
+       b               CTRMM_L2x8_SUB2
 
-.LCTRMM_L2x8_SUB1:
+CTRMM_L2x8_SUB1:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
-       ble             .LCTRMM_L2x8_SAVE
+       ble             CTRMM_L2x8_SAVE
 
-.LCTRMM_L2x8_SUB2:
+CTRMM_L2x8_SUB2:
 
        KERNEL2x8_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LCTRMM_L2x8_SUB2
+       bgt             CTRMM_L2x8_SUB2
 
-.LCTRMM_L2x8_SAVE:
+CTRMM_L2x8_SAVE:
 
        SAVE2x8
 
@@ -764,16 +763,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
        addic.          I,      I,      -1
-       bgt             .LCTRMM_L2x8_BEGIN
+       bgt             CTRMM_L2x8_BEGIN
 
-.LCTRMM_L2x8_END:
+CTRMM_L2x8_END:
 
-.LCTRMM_L2x4_BEGIN:
+CTRMM_L2x4_BEGIN:
        andi.           T2,     M,      7
-       ble             .LCTRMM_L2x1_END
+       ble             CTRMM_L2x1_END
 
        andi.           T1,     M,      4
-       ble             .LCTRMM_L2x4_END
+       ble             CTRMM_L2x4_END
 
 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
        mr              BO,     B                                       // B -> BO
@@ -799,11 +798,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              KKK,    T1
        mr              K1,     T1
        srawi.          L,      K1,     3                               // KTEMP / 8 -> L
-       ble             .LCTRMM_L2x4_SUB0
+       ble             CTRMM_L2x4_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LCTRMM_L2x4_SUB4
+       ble             CTRMM_L2x4_SUB4
 
-.LCTRMM_L2x4_LOOP_START:
+CTRMM_L2x4_LOOP_START:
 
        LOAD2x4_1
        KERNEL2x4_I1
@@ -817,11 +816,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x4_2
 
        addic.          L,      L,      -2
-       ble             .LCTRMM_L2x4_LOOP_END
+       ble             CTRMM_L2x4_LOOP_END
 
        .align 5
 
-.LCTRMM_L2x4_LOOP:
+CTRMM_L2x4_LOOP:
 
        KERNEL2x4_1
        KERNEL2x4_2
@@ -834,9 +833,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x4_2
 
        addic.          L,      L,      -1
-       bgt             .LCTRMM_L2x4_LOOP
+       bgt             CTRMM_L2x4_LOOP
 
-.LCTRMM_L2x4_LOOP_END:
+CTRMM_L2x4_LOOP_END:
 
        KERNEL2x4_1
        KERNEL2x4_2
@@ -848,9 +847,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x4_1
        KERNEL2x4_E2
 
-       b               .LCTRMM_L2x4_SUB1
+       b               CTRMM_L2x4_SUB1
 
-.LCTRMM_L2x4_SUB4:
+CTRMM_L2x4_SUB4:
 
        KERNEL2x4_SUBI1
        KERNEL2x4_SUB1
@@ -862,31 +861,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x4_SUB1
        KERNEL2x4_SUB1
 
-       b               .LCTRMM_L2x4_SUB1
+       b               CTRMM_L2x4_SUB1
 
-.LCTRMM_L2x4_SUB0:
+CTRMM_L2x4_SUB0:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
 
        KERNEL2x4_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LCTRMM_L2x4_SAVE
-       b               .LCTRMM_L2x4_SUB2
+       ble             CTRMM_L2x4_SAVE
+       b               CTRMM_L2x4_SUB2
 
-.LCTRMM_L2x4_SUB1:
+CTRMM_L2x4_SUB1:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
-       ble             .LCTRMM_L2x4_SAVE
+       ble             CTRMM_L2x4_SAVE
 
-.LCTRMM_L2x4_SUB2:
+CTRMM_L2x4_SUB2:
 
        KERNEL2x4_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LCTRMM_L2x4_SUB2
+       bgt             CTRMM_L2x4_SUB2
 
-.LCTRMM_L2x4_SAVE:
+CTRMM_L2x4_SAVE:
 
        SAVE2x4
 
@@ -903,12 +902,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 
-.LCTRMM_L2x4_END:
+CTRMM_L2x4_END:
 
-.LCTRMM_L2x2_BEGIN:
+CTRMM_L2x2_BEGIN:
 
        andi.           T1,     M,      2
-       ble             .LCTRMM_L2x2_END
+       ble             CTRMM_L2x2_END
 
 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
        mr              BO,     B                                       // B -> BO
@@ -934,11 +933,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              KKK,    T1
        mr              K1,     T1
        srawi.          L,      K1,     3                               // KTEMP / 8 -> L
-       ble             .LCTRMM_L2x2_SUB0
+       ble             CTRMM_L2x2_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LCTRMM_L2x2_SUB4
+       ble             CTRMM_L2x2_SUB4
 
-.LCTRMM_L2x2_LOOP_START:
+CTRMM_L2x2_LOOP_START:
 
        LOAD2x2_1
        KERNEL2x2_I1
@@ -952,11 +951,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x2_2
 
        addic.          L,      L,      -2
-       ble             .LCTRMM_L2x2_LOOP_END
+       ble             CTRMM_L2x2_LOOP_END
 
        .align 5
 
-.LCTRMM_L2x2_LOOP:
+CTRMM_L2x2_LOOP:
 
        KERNEL2x2_1
        KERNEL2x2_2
@@ -969,9 +968,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x2_2
 
        addic.          L,      L,      -1
-       bgt             .LCTRMM_L2x2_LOOP
+       bgt             CTRMM_L2x2_LOOP
 
-.LCTRMM_L2x2_LOOP_END:
+CTRMM_L2x2_LOOP_END:
 
        KERNEL2x2_1
        KERNEL2x2_2
@@ -983,9 +982,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x2_1
        KERNEL2x2_E2
 
-       b               .LCTRMM_L2x2_SUB1
+       b               CTRMM_L2x2_SUB1
 
-.LCTRMM_L2x2_SUB4:
+CTRMM_L2x2_SUB4:
 
        KERNEL2x2_SUBI1
        KERNEL2x2_SUB1
@@ -997,31 +996,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x2_SUB1
        KERNEL2x2_SUB1
 
-       b               .LCTRMM_L2x2_SUB1
+       b               CTRMM_L2x2_SUB1
 
-.LCTRMM_L2x2_SUB0:
+CTRMM_L2x2_SUB0:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
 
        KERNEL2x2_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LCTRMM_L2x2_SAVE
-       b               .LCTRMM_L2x2_SUB2
+       ble             CTRMM_L2x2_SAVE
+       b               CTRMM_L2x2_SUB2
 
-.LCTRMM_L2x2_SUB1:
+CTRMM_L2x2_SUB1:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
-       ble             .LCTRMM_L2x2_SAVE
+       ble             CTRMM_L2x2_SAVE
 
-.LCTRMM_L2x2_SUB2:
+CTRMM_L2x2_SUB2:
 
        KERNEL2x2_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LCTRMM_L2x2_SUB2
+       bgt             CTRMM_L2x2_SUB2
 
-.LCTRMM_L2x2_SAVE:
+CTRMM_L2x2_SAVE:
 
        SAVE2x2
 
@@ -1038,12 +1037,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 
-.LCTRMM_L2x2_END:
+CTRMM_L2x2_END:
 
-.LCTRMM_L2x1_BEGIN:
+CTRMM_L2x1_BEGIN:
 
        andi.           T1,     M,      1
-       ble             .LCTRMM_L2x1_END
+       ble             CTRMM_L2x1_END
 
 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
        mr              BO,     B                                       // B -> BO
@@ -1069,11 +1068,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              KKK,    T1
        mr              K1,     T1
        srawi.          L,      K1,     3                               // KTEMP / 8 -> L
-       ble             .LCTRMM_L2x1_SUB0
+       ble             CTRMM_L2x1_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LCTRMM_L2x1_SUB4
+       ble             CTRMM_L2x1_SUB4
 
-.LCTRMM_L2x1_LOOP_START:
+CTRMM_L2x1_LOOP_START:
 
        LOAD2x1_1
        KERNEL2x1_I1
@@ -1087,11 +1086,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x1_2
 
        addic.          L,      L,      -2
-       ble             .LCTRMM_L2x1_LOOP_END
+       ble             CTRMM_L2x1_LOOP_END
 
        .align 5
 
-.LCTRMM_L2x1_LOOP:
+CTRMM_L2x1_LOOP:
 
        KERNEL2x1_1
        KERNEL2x1_2
@@ -1104,9 +1103,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x1_2
 
        addic.          L,      L,      -1
-       bgt             .LCTRMM_L2x1_LOOP
+       bgt             CTRMM_L2x1_LOOP
 
-.LCTRMM_L2x1_LOOP_END:
+CTRMM_L2x1_LOOP_END:
 
        KERNEL2x1_1
        KERNEL2x1_2
@@ -1118,9 +1117,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x1_1
        KERNEL2x1_E2
 
-       b               .LCTRMM_L2x1_SUB1
+       b               CTRMM_L2x1_SUB1
 
-.LCTRMM_L2x1_SUB4:
+CTRMM_L2x1_SUB4:
 
        KERNEL2x1_SUBI1
        KERNEL2x1_SUB1
@@ -1132,31 +1131,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL2x1_SUB1
        KERNEL2x1_SUB1
 
-       b               .LCTRMM_L2x1_SUB1
+       b               CTRMM_L2x1_SUB1
 
-.LCTRMM_L2x1_SUB0:
+CTRMM_L2x1_SUB0:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
 
        KERNEL2x1_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LCTRMM_L2x1_SAVE
-       b               .LCTRMM_L2x1_SUB2
+       ble             CTRMM_L2x1_SAVE
+       b               CTRMM_L2x1_SUB2
 
-.LCTRMM_L2x1_SUB1:
+CTRMM_L2x1_SUB1:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
-       ble             .LCTRMM_L2x1_SAVE
+       ble             CTRMM_L2x1_SAVE
 
-.LCTRMM_L2x1_SUB2:
+CTRMM_L2x1_SUB2:
 
        KERNEL2x1_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LCTRMM_L2x1_SUB2
+       bgt             CTRMM_L2x1_SUB2
 
-.LCTRMM_L2x1_SAVE:
+CTRMM_L2x1_SAVE:
 
        SAVE2x1
 
@@ -1173,7 +1172,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 
-.LCTRMM_L2x1_END:
+CTRMM_L2x1_END:
 
        slwi            T1,     K,      4
        add             B,      B,      T1
@@ -1183,18 +1182,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 
-.LCTRMM_L2_END:
+CTRMM_L2_END:
 
-       b               .LCTRMM_L1_BEGIN
+       b               CTRMM_L1_BEGIN
 
-.L999_H2:
+L999_H2:
 
-       b               .L999
+       b               L999
 
-.LCTRMM_L1_BEGIN:
+CTRMM_L1_BEGIN:
 
        andi.           T1,     N,      1
-       ble             .LCTRMM_L1_END
+       ble             CTRMM_L1_END
        mr              CO,     C
        mr              AO,     A
 
@@ -1203,9 +1202,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
        srawi.          I,      M,      3
-       ble             .LCTRMM_L1x8_END
+       ble             CTRMM_L1x8_END
 
-.LCTRMM_L1x8_BEGIN:
+CTRMM_L1x8_BEGIN:
 
 
 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@@ -1232,11 +1231,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              KKK,    T1
        mr              K1,     T1
        srawi.          L,      K1,     3                               // KTEMP / 8 -> L
-       ble             .LCTRMM_L1x8_SUB0
+       ble             CTRMM_L1x8_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LCTRMM_L1x8_SUB4
+       ble             CTRMM_L1x8_SUB4
 
-.LCTRMM_L1x8_LOOP_START:
+CTRMM_L1x8_LOOP_START:
 
        LOAD1x8_1
        KERNEL1x8_I1
@@ -1250,11 +1249,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x8_2
 
        addic.          L,      L,      -2
-       ble             .LCTRMM_L1x8_LOOP_END
+       ble             CTRMM_L1x8_LOOP_END
 
        .align 5
 
-.LCTRMM_L1x8_LOOP:
+CTRMM_L1x8_LOOP:
 
        KERNEL1x8_1
        KERNEL1x8_2
@@ -1267,9 +1266,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x8_2
 
        addic.          L,      L,      -1
-       bgt             .LCTRMM_L1x8_LOOP
+       bgt             CTRMM_L1x8_LOOP
 
-.LCTRMM_L1x8_LOOP_END:
+CTRMM_L1x8_LOOP_END:
 
        KERNEL1x8_1
        KERNEL1x8_2
@@ -1281,9 +1280,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x8_1
        KERNEL1x8_E2
 
-       b               .LCTRMM_L1x8_SUB1
+       b               CTRMM_L1x8_SUB1
 
-.LCTRMM_L1x8_SUB4:
+CTRMM_L1x8_SUB4:
 
        KERNEL1x8_SUBI1
        KERNEL1x8_SUB1
@@ -1295,31 +1294,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x8_SUB1
        KERNEL1x8_SUB1
 
-       b               .LCTRMM_L1x8_SUB1
+       b               CTRMM_L1x8_SUB1
 
-.LCTRMM_L1x8_SUB0:
+CTRMM_L1x8_SUB0:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
 
        KERNEL1x8_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LCTRMM_L1x8_SAVE
-       b               .LCTRMM_L1x8_SUB2
+       ble             CTRMM_L1x8_SAVE
+       b               CTRMM_L1x8_SUB2
 
-.LCTRMM_L1x8_SUB1:
+CTRMM_L1x8_SUB1:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
-       ble             .LCTRMM_L1x8_SAVE
+       ble             CTRMM_L1x8_SAVE
 
-.LCTRMM_L1x8_SUB2:
+CTRMM_L1x8_SUB2:
 
        KERNEL1x8_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LCTRMM_L1x8_SUB2
+       bgt             CTRMM_L1x8_SUB2
 
-.LCTRMM_L1x8_SAVE:
+CTRMM_L1x8_SAVE:
 
        SAVE1x8
 
@@ -1337,16 +1336,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
        addic.          I,      I,      -1
-       bgt             .LCTRMM_L1x8_BEGIN
+       bgt             CTRMM_L1x8_BEGIN
 
-.LCTRMM_L1x8_END:
+CTRMM_L1x8_END:
 
-.LCTRMM_L1x4_BEGIN:
+CTRMM_L1x4_BEGIN:
        andi.           T2,     M,      7
-       ble             .LCTRMM_L1x1_END
+       ble             CTRMM_L1x1_END
 
        andi.           T1,     M,      4
-       ble             .LCTRMM_L1x4_END
+       ble             CTRMM_L1x4_END
 
 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
        mr              BO,     B                                       // B -> BO
@@ -1372,11 +1371,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              KKK,    T1
        mr              K1,     T1
        srawi.          L,      K1,     3                               // KTEMP / 8 -> L
-       ble             .LCTRMM_L1x4_SUB0
+       ble             CTRMM_L1x4_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LCTRMM_L1x4_SUB4
+       ble             CTRMM_L1x4_SUB4
 
-.LCTRMM_L1x4_LOOP_START:
+CTRMM_L1x4_LOOP_START:
 
        LOAD1x4_1
        KERNEL1x4_I1
@@ -1390,11 +1389,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x4_2
 
        addic.          L,      L,      -2
-       ble             .LCTRMM_L1x4_LOOP_END
+       ble             CTRMM_L1x4_LOOP_END
 
        .align 5
 
-.LCTRMM_L1x4_LOOP:
+CTRMM_L1x4_LOOP:
 
        KERNEL1x4_1
        KERNEL1x4_2
@@ -1407,9 +1406,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x4_2
 
        addic.          L,      L,      -1
-       bgt             .LCTRMM_L1x4_LOOP
+       bgt             CTRMM_L1x4_LOOP
 
-.LCTRMM_L1x4_LOOP_END:
+CTRMM_L1x4_LOOP_END:
 
        KERNEL1x4_1
        KERNEL1x4_2
@@ -1421,9 +1420,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x4_1
        KERNEL1x4_E2
 
-       b               .LCTRMM_L1x4_SUB1
+       b               CTRMM_L1x4_SUB1
 
-.LCTRMM_L1x4_SUB4:
+CTRMM_L1x4_SUB4:
 
        KERNEL1x4_SUBI1
        KERNEL1x4_SUB1
@@ -1435,31 +1434,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x4_SUB1
        KERNEL1x4_SUB1
 
-       b               .LCTRMM_L1x4_SUB1
+       b               CTRMM_L1x4_SUB1
 
-.LCTRMM_L1x4_SUB0:
+CTRMM_L1x4_SUB0:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
 
        KERNEL1x4_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LCTRMM_L1x4_SAVE
-       b               .LCTRMM_L1x4_SUB2
+       ble             CTRMM_L1x4_SAVE
+       b               CTRMM_L1x4_SUB2
 
-.LCTRMM_L1x4_SUB1:
+CTRMM_L1x4_SUB1:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
-       ble             .LCTRMM_L1x4_SAVE
+       ble             CTRMM_L1x4_SAVE
 
-.LCTRMM_L1x4_SUB2:
+CTRMM_L1x4_SUB2:
 
        KERNEL1x4_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LCTRMM_L1x4_SUB2
+       bgt             CTRMM_L1x4_SUB2
 
-.LCTRMM_L1x4_SAVE:
+CTRMM_L1x4_SAVE:
 
        SAVE1x4
 
@@ -1476,12 +1475,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 
-.LCTRMM_L1x4_END:
+CTRMM_L1x4_END:
 
-.LCTRMM_L1x2_BEGIN:
+CTRMM_L1x2_BEGIN:
 
        andi.           T1,     M,      2
-       ble             .LCTRMM_L1x2_END
+       ble             CTRMM_L1x2_END
 
 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
        mr              BO,     B                                       // B -> BO
@@ -1507,11 +1506,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              KKK,    T1
        mr              K1,     T1
        srawi.          L,      K1,     3                               // KTEMP / 8 -> L
-       ble             .LCTRMM_L1x2_SUB0
+       ble             CTRMM_L1x2_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LCTRMM_L1x2_SUB4
+       ble             CTRMM_L1x2_SUB4
 
-.LCTRMM_L1x2_LOOP_START:
+CTRMM_L1x2_LOOP_START:
 
        LOAD1x2_1
        KERNEL1x2_I1
@@ -1525,11 +1524,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x2_2
 
        addic.          L,      L,      -2
-       ble             .LCTRMM_L1x2_LOOP_END
+       ble             CTRMM_L1x2_LOOP_END
 
        .align 5
 
-.LCTRMM_L1x2_LOOP:
+CTRMM_L1x2_LOOP:
 
        KERNEL1x2_1
        KERNEL1x2_2
@@ -1542,9 +1541,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x2_2
 
        addic.          L,      L,      -1
-       bgt             .LCTRMM_L1x2_LOOP
+       bgt             CTRMM_L1x2_LOOP
 
-.LCTRMM_L1x2_LOOP_END:
+CTRMM_L1x2_LOOP_END:
 
        KERNEL1x2_1
        KERNEL1x2_2
@@ -1556,9 +1555,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x2_1
        KERNEL1x2_E2
 
-       b               .LCTRMM_L1x2_SUB1
+       b               CTRMM_L1x2_SUB1
 
-.LCTRMM_L1x2_SUB4:
+CTRMM_L1x2_SUB4:
 
        KERNEL1x2_SUBI1
        KERNEL1x2_SUB1
@@ -1570,31 +1569,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x2_SUB1
        KERNEL1x2_SUB1
 
-       b               .LCTRMM_L1x2_SUB1
+       b               CTRMM_L1x2_SUB1
 
-.LCTRMM_L1x2_SUB0:
+CTRMM_L1x2_SUB0:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
 
        KERNEL1x2_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LCTRMM_L1x2_SAVE
-       b               .LCTRMM_L1x2_SUB2
+       ble             CTRMM_L1x2_SAVE
+       b               CTRMM_L1x2_SUB2
 
-.LCTRMM_L1x2_SUB1:
+CTRMM_L1x2_SUB1:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
-       ble             .LCTRMM_L1x2_SAVE
+       ble             CTRMM_L1x2_SAVE
 
-.LCTRMM_L1x2_SUB2:
+CTRMM_L1x2_SUB2:
 
        KERNEL1x2_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LCTRMM_L1x2_SUB2
+       bgt             CTRMM_L1x2_SUB2
 
-.LCTRMM_L1x2_SAVE:
+CTRMM_L1x2_SAVE:
 
        SAVE1x2
 
@@ -1611,12 +1610,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 
-.LCTRMM_L1x2_END:
+CTRMM_L1x2_END:
 
-.LCTRMM_L1x1_BEGIN:
+CTRMM_L1x1_BEGIN:
 
        andi.           T1,     M,      1
-       ble             .LCTRMM_L1x1_END
+       ble             CTRMM_L1x1_END
 
 #if  (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
        mr              BO,     B                                       // B -> BO
@@ -1642,11 +1641,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        mr              KKK,    T1
        mr              K1,     T1
        srawi.          L,      K1,     3                               // KTEMP / 8 -> L
-       ble             .LCTRMM_L1x1_SUB0
+       ble             CTRMM_L1x1_SUB0
        cmpwi           cr0,    L,      1
-       ble             .LCTRMM_L1x1_SUB4
+       ble             CTRMM_L1x1_SUB4
 
-.LCTRMM_L1x1_LOOP_START:
+CTRMM_L1x1_LOOP_START:
 
        LOAD1x1_1
        KERNEL1x1_I1
@@ -1660,11 +1659,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x1_2
 
        addic.          L,      L,      -2
-       ble             .LCTRMM_L1x1_LOOP_END
+       ble             CTRMM_L1x1_LOOP_END
 
        .align 5
 
-.LCTRMM_L1x1_LOOP:
+CTRMM_L1x1_LOOP:
 
        KERNEL1x1_1
        KERNEL1x1_2
@@ -1677,9 +1676,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x1_2
 
        addic.          L,      L,      -1
-       bgt             .LCTRMM_L1x1_LOOP
+       bgt             CTRMM_L1x1_LOOP
 
-.LCTRMM_L1x1_LOOP_END:
+CTRMM_L1x1_LOOP_END:
 
        KERNEL1x1_1
        KERNEL1x1_2
@@ -1691,9 +1690,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x1_1
        KERNEL1x1_E2
 
-       b               .LCTRMM_L1x1_SUB1
+       b               CTRMM_L1x1_SUB1
 
-.LCTRMM_L1x1_SUB4:
+CTRMM_L1x1_SUB4:
 
        KERNEL1x1_SUBI1
        KERNEL1x1_SUB1
@@ -1705,31 +1704,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        KERNEL1x1_SUB1
        KERNEL1x1_SUB1
 
-       b               .LCTRMM_L1x1_SUB1
+       b               CTRMM_L1x1_SUB1
 
-.LCTRMM_L1x1_SUB0:
+CTRMM_L1x1_SUB0:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
 
        KERNEL1x1_SUBI1
 
        addic.          L,      L,      -1
-       ble             .LCTRMM_L1x1_SAVE
-       b               .LCTRMM_L1x1_SUB2
+       ble             CTRMM_L1x1_SAVE
+       b               CTRMM_L1x1_SUB2
 
-.LCTRMM_L1x1_SUB1:
+CTRMM_L1x1_SUB1:
 
        andi.           L,      K1,     7                                               // K1 & 7 -> L
-       ble             .LCTRMM_L1x1_SAVE
+       ble             CTRMM_L1x1_SAVE
 
-.LCTRMM_L1x1_SUB2:
+CTRMM_L1x1_SUB2:
 
        KERNEL1x1_SUB1
 
        addic.          L,      L,      -1
-       bgt             .LCTRMM_L1x1_SUB2
+       bgt             CTRMM_L1x1_SUB2
 
-.LCTRMM_L1x1_SAVE:
+CTRMM_L1x1_SAVE:
 
        SAVE1x1
 
@@ -1746,11 +1745,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 
-.LCTRMM_L1x1_END:
+CTRMM_L1x1_END:
 
 #if !defined(LEFT)
        addi            KK,     KK,     1                                       // KK += Number of values in B
 #endif
 
 
-.LCTRMM_L1_END:
+CTRMM_L1_END:
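
Throughout the logic file the depth loop is split the same way: srawi. L, K1, 3 yields the number of full 8-iteration blocks driven by the *_LOOP label pairs, and andi. L, K1, 7 yields the remainder handled by the *_SUB2 paths. The short C sketch below shows only that trip-count arithmetic, assuming nothing beyond the shift-and-mask visible in the diff; the kernel macros are stand-ins.

/*
 * Sketch only: reproduces the K1 blocking used by the *_LOOP / *_SUB2
 * paths (srawi. L, K1, 3 and andi. L, K1, 7).
 */
#include <stdio.h>

int main(void)
{
    int K1 = 21;              /* arbitrary demo depth                    */
    int blocks    = K1 >> 3;  /* srawi. L, K1, 3 -> full 8-step blocks   */
    int remainder = K1 & 7;   /* andi.  L, K1, 7 -> leftover iterations  */

    for (int b = 0; b < blocks; b++)
        printf("8-wide block %d (unrolled KERNEL..._1/_2 pairs)\n", b);
    for (int r = 0; r < remainder; r++)
        printf("tail iteration %d (KERNEL..._SUB1)\n", r);

    printf("total: %d\n", 8 * blocks + remainder); /* recovers K1 */
    return 0;
}
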