*****************************************************************************/
/**************************************************************************************
-* 2016/03/14 Werner Saar (wernsaar@googlemail.com)
+* 2016/03/18 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
+* LAPACK-TEST : OK
**************************************************************************************/
/**********************************************************************************************
.macro LOAD8x16_1
/*
 * LOAD8x16_1: preload step for the 8x16 kernel macros.
 *   - loads four 16-byte vectors from AO into vs0..vs3
 *   - loads two 16-byte vectors from BO into vs28/vs29 and splats each of
 *     their eight 32-bit words into its own register, vs8..vs15
 *   - advances AO by 64 bytes (4 vectors) and BO by 32 bytes (2 vectors)
 * o0/o16/o32/o48 are presumably index registers holding the byte offsets
 * 0/16/32/48; they are defined outside this view - TODO confirm.
 *
 * This span is a diff: '-' lines show the previous instruction order (both
 * BO loads issued first, pointer bumps placed between the final splats);
 * '+' lines show the new order (AO bumped right after its loads, each BO
 * load placed directly before the splats that consume it, BO bumped last).
 * The set of registers written and the pointer increments are the same in
 * both orderings.
 */
- lxvw4x vs28, o0, BO
- lxvw4x vs29, o16, BO
-
/* A operand: four consecutive vectors -> vs0..vs3 */
lxvw4x vs0, o0, AO
lxvw4x vs1, o16, AO
lxvw4x vs2, o32, AO
lxvw4x vs3, o48, AO
+ addi AO, AO, 64
+
+ lxvw4x vs28, o0, BO
+
/* replicate words 0..3 of the first B vector across vs8..vs11 */
xxspltw vs8, vs28, 0
xxspltw vs9, vs28, 1
xxspltw vs10, vs28, 2
xxspltw vs11, vs28, 3
+ lxvw4x vs29, o16, BO
+
/* replicate words 0..3 of the second B vector across vs12..vs15 */
xxspltw vs12, vs29, 0
xxspltw vs13, vs29, 1
- addi AO, AO, 64
- addi BO, BO, 32
xxspltw vs14, vs29, 2
xxspltw vs15, vs29, 3
+ addi BO, BO, 32
.endm
.macro KERNEL8x16_I1
- xvmulsp vs32, vs0, vs8
- xvmulsp vs33, vs1, vs8
lxvw4x vs4, o0, AO
lxvw4x vs5, o16, AO
+ lxvw4x vs6, o32, AO
+ lxvw4x vs7, o48, AO
- xvmulsp vs34, vs2, vs8
- xvmulsp vs35, vs3, vs8
+ addi AO, AO, 64
lxvw4x vs28, o0, BO
+
+ xxspltw vs16, vs28, 0
+ xxspltw vs17, vs28, 1
+ xxspltw vs18, vs28, 2
+ xxspltw vs19, vs28, 3
+
lxvw4x vs29, o16, BO
- xvmulsp vs36, vs0, vs9
- xvmulsp vs37, vs1, vs9
+ xxspltw vs20, vs29, 0
+ xxspltw vs21, vs29, 1
+ xxspltw vs22, vs29, 2
+ xxspltw vs23, vs29, 3
+
+ addi BO, BO, 32
- lxvw4x vs6, o32, AO
- lxvw4x vs7, o48, AO
+ xvmulsp vs32, vs0, vs8
+ xvmulsp vs33, vs1, vs8
+ xvmulsp vs34, vs2, vs8
+ xvmulsp vs35, vs3, vs8
+
+ xvmulsp vs36, vs0, vs9
+ xvmulsp vs37, vs1, vs9
xvmulsp vs38, vs2, vs9
xvmulsp vs39, vs3, vs9
xvmulsp vs54, vs2, vs13
xvmulsp vs55, vs3, vs13
- xxspltw vs16, vs28, 0
- xxspltw vs17, vs28, 1
- xxspltw vs18, vs28, 2
- xxspltw vs19, vs28, 3
-
xvmulsp vs56, vs0, vs14
xvmulsp vs57, vs1, vs14
xvmulsp vs58, vs2, vs14
xvmulsp vs59, vs3, vs14
- xxspltw vs20, vs29, 0
- xxspltw vs21, vs29, 1
- xxspltw vs22, vs29, 2
- xxspltw vs23, vs29, 3
-
xvmulsp vs60, vs0, vs15
xvmulsp vs61, vs1, vs15
-
- addi AO, AO, 64
- addi BO, BO, 32
-
xvmulsp vs62, vs2, vs15
xvmulsp vs63, vs3, vs15
xvmaddasp vs32, vs0, vs8
xvmaddasp vs33, vs1, vs8
-
+ lxvw4x vs28, o0, BO
lxvw4x vs4, o0, AO
- lxvw4x vs5, o16, AO
-
xvmaddasp vs34, vs2, vs8
xvmaddasp vs35, vs3, vs8
xvmaddasp vs36, vs0, vs9
xvmaddasp vs37, vs1, vs9
- lxvw4x vs28, o0, BO
+ lxvw4x vs29, o16, BO
+ lxvw4x vs5, o16, AO
xvmaddasp vs38, vs2, vs9
xvmaddasp vs39, vs3, vs9
xvmaddasp vs40, vs0, vs10
xvmaddasp vs41, vs1, vs10
-
lxvw4x vs6, o32, AO
lxvw4x vs7, o48, AO
-
xvmaddasp vs42, vs2, vs10
xvmaddasp vs43, vs3, vs10
+ xxspltw vs16, vs28, 0
+ xxspltw vs17, vs28, 1
+ xxspltw vs18, vs28, 2
+ xxspltw vs19, vs28, 3
+
xvmaddasp vs44, vs0, vs11
xvmaddasp vs45, vs1, vs11
-
- lxvw4x vs29, o16, BO
-
xvmaddasp vs46, vs2, vs11
xvmaddasp vs47, vs3, vs11
+ xxspltw vs20, vs29, 0
+ xxspltw vs21, vs29, 1
+ xxspltw vs22, vs29, 2
+ xxspltw vs23, vs29, 3
+
xvmaddasp vs48, vs0, vs12
xvmaddasp vs49, vs1, vs12
xvmaddasp vs50, vs2, vs12
xvmaddasp vs52, vs0, vs13
xvmaddasp vs53, vs1, vs13
-
- xxspltw vs16, vs28, 0
- xxspltw vs17, vs28, 1
-
xvmaddasp vs54, vs2, vs13
xvmaddasp vs55, vs3, vs13
xvmaddasp vs56, vs0, vs14
xvmaddasp vs57, vs1, vs14
-
- xxspltw vs18, vs28, 2
- xxspltw vs19, vs28, 3
-
+ addi AO, AO, 64
+ addi BO, BO, 32
xvmaddasp vs58, vs2, vs14
xvmaddasp vs59, vs3, vs14
- xxspltw vs20, vs29, 0
- xxspltw vs21, vs29, 1
-
xvmaddasp vs60, vs0, vs15
xvmaddasp vs61, vs1, vs15
-
- addi AO, AO, 64
- addi BO, BO, 32
-
xvmaddasp vs62, vs2, vs15
xvmaddasp vs63, vs3, vs15
- xxspltw vs22, vs29, 2
- xxspltw vs23, vs29, 3
.endm
xvmaddasp vs32, vs4, vs16
xvmaddasp vs33, vs5, vs16
+ lxvw4x vs28, o0, BO
lxvw4x vs0, o0, AO
- lxvw4x vs1, o16, AO
xvmaddasp vs34, vs6, vs16
xvmaddasp vs35, vs7, vs16
xvmaddasp vs36, vs4, vs17
xvmaddasp vs37, vs5, vs17
- lxvw4x vs28, o0, BO
+ lxvw4x vs29, o16, BO
+ lxvw4x vs1, o16, AO
xvmaddasp vs38, vs6, vs17
xvmaddasp vs39, vs7, vs17
- xvmaddasp vs40, vs4, vs18
- xvmaddasp vs41, vs5, vs18
-
lxvw4x vs2, o32, AO
lxvw4x vs3, o48, AO
+ xvmaddasp vs40, vs4, vs18
+ xvmaddasp vs41, vs5, vs18
xvmaddasp vs42, vs6, vs18
xvmaddasp vs43, vs7, vs18
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+ xxspltw vs10, vs28, 2
+ xxspltw vs11, vs28, 3
+
xvmaddasp vs44, vs4, vs19
xvmaddasp vs45, vs5, vs19
-
- lxvw4x vs29, o16, BO
-
xvmaddasp vs46, vs6, vs19
xvmaddasp vs47, vs7, vs19
+ xxspltw vs12, vs29, 0
+ xxspltw vs13, vs29, 1
+ xxspltw vs14, vs29, 2
+ xxspltw vs15, vs29, 3
+
xvmaddasp vs48, vs4, vs20
xvmaddasp vs49, vs5, vs20
xvmaddasp vs50, vs6, vs20
xvmaddasp vs52, vs4, vs21
xvmaddasp vs53, vs5, vs21
-
- xxspltw vs8, vs28, 0
- xxspltw vs9, vs28, 1
- xxspltw vs10, vs28, 2
- xxspltw vs11, vs28, 3
-
xvmaddasp vs54, vs6, vs21
xvmaddasp vs55, vs7, vs21
xvmaddasp vs56, vs4, vs22
xvmaddasp vs57, vs5, vs22
-
- xxspltw vs12, vs29, 0
- xxspltw vs13, vs29, 1
- xxspltw vs14, vs29, 2
- xxspltw vs15, vs29, 3
-
xvmaddasp vs58, vs6, vs22
xvmaddasp vs59, vs7, vs22
xvmaddasp vs60, vs4, vs23
xvmaddasp vs61, vs5, vs23
-
addi AO, AO, 64
addi BO, BO, 32
-
xvmaddasp vs62, vs6, vs23
xvmaddasp vs63, vs7, vs23
#endif
+
+ stxvw4x vs32, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
#ifdef TRMMKERNEL
+ lxvw4x vs0, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs0, vs0, vs28
+#endif
+
+ stxvw4x vs33, o0, TBUFFER
- xvmulsp vs0, vs32, alpha_vr
- xvmulsp vs1, vs33, alpha_vr
- xvmulsp vs2, vs34, alpha_vr
- xvmulsp vs3, vs35, alpha_vr
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs1, o0, TBUFFER
#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs1, vs1, vs28
+#endif
+
+ stxvw4x vs34, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
- xvmaddasp vs0, vs32, alpha_vr
- xvmaddasp vs1, vs33, alpha_vr
- xvmaddasp vs2, vs34, alpha_vr
- xvmaddasp vs3, vs35, alpha_vr
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+#ifdef TRMMKERNEL
+ lxvw4x vs2, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs2, vs2, vs28
#endif
+ stxvw4x vs35, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs3, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs3, vs3, vs28
+#endif
+
+
+
+
stxvw4x vs0, o0, T1
stxvw4x vs1, o16, T1
stxvw4x vs2, o32, T1
#endif
+
+ stxvw4x vs36, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
#ifdef TRMMKERNEL
+ lxvw4x vs0, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs0, vs0, vs28
+#endif
+
+ stxvw4x vs37, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
- xvmulsp vs0, vs36, alpha_vr
- xvmulsp vs1, vs37, alpha_vr
- xvmulsp vs2, vs38, alpha_vr
- xvmulsp vs3, vs39, alpha_vr
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+#ifdef TRMMKERNEL
+ lxvw4x vs1, o0, TBUFFER
#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs1, vs1, vs28
+#endif
+
+ stxvw4x vs38, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
- xvmaddasp vs0, vs36, alpha_vr
- xvmaddasp vs1, vs37, alpha_vr
- xvmaddasp vs2, vs38, alpha_vr
- xvmaddasp vs3, vs39, alpha_vr
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs2, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs2, vs2, vs28
#endif
+ stxvw4x vs39, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs3, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs3, vs3, vs28
+#endif
+
+
+
+
stxvw4x vs0, o0, T1
stxvw4x vs1, o16, T1
stxvw4x vs2, o32, T1
#endif
+
+ stxvw4x vs40, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs0, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs0, vs0, vs28
+#endif
+
+ stxvw4x vs41, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
#ifdef TRMMKERNEL
+ lxvw4x vs1, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs1, vs1, vs28
+#endif
+
+ stxvw4x vs42, o0, TBUFFER
- xvmulsp vs0, vs40, alpha_vr
- xvmulsp vs1, vs41, alpha_vr
- xvmulsp vs2, vs42, alpha_vr
- xvmulsp vs3, vs43, alpha_vr
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs2, o0, TBUFFER
#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs2, vs2, vs28
+#endif
+
+ stxvw4x vs43, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
- xvmaddasp vs0, vs40, alpha_vr
- xvmaddasp vs1, vs41, alpha_vr
- xvmaddasp vs2, vs42, alpha_vr
- xvmaddasp vs3, vs43, alpha_vr
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+#ifdef TRMMKERNEL
+ lxvw4x vs3, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs3, vs3, vs28
#endif
+
+
+
stxvw4x vs0, o0, T1
stxvw4x vs1, o16, T1
stxvw4x vs2, o32, T1
#endif
+
+ stxvw4x vs44, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs0, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs0, vs0, vs28
+#endif
+
+ stxvw4x vs45, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
#ifdef TRMMKERNEL
+ lxvw4x vs1, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs1, vs1, vs28
+#endif
- xvmulsp vs0, vs44, alpha_vr
- xvmulsp vs1, vs45, alpha_vr
- xvmulsp vs2, vs46, alpha_vr
- xvmulsp vs3, vs47, alpha_vr
+ stxvw4x vs46, o0, TBUFFER
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs2, o0, TBUFFER
#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs2, vs2, vs28
+#endif
+
+ stxvw4x vs47, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
- xvmaddasp vs0, vs44, alpha_vr
- xvmaddasp vs1, vs45, alpha_vr
- xvmaddasp vs2, vs46, alpha_vr
- xvmaddasp vs3, vs47, alpha_vr
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs3, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs3, vs3, vs28
#endif
+
+
+
stxvw4x vs0, o0, T1
stxvw4x vs1, o16, T1
stxvw4x vs2, o32, T1
#endif
+
+ stxvw4x vs48, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
#ifdef TRMMKERNEL
+ lxvw4x vs0, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs0, vs0, vs28
+#endif
+
+ stxvw4x vs49, o0, TBUFFER
- xvmulsp vs0, vs48, alpha_vr
- xvmulsp vs1, vs49, alpha_vr
- xvmulsp vs2, vs50, alpha_vr
- xvmulsp vs3, vs51, alpha_vr
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs1, o0, TBUFFER
#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs1, vs1, vs28
+#endif
+
+ stxvw4x vs50, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
- xvmaddasp vs0, vs48, alpha_vr
- xvmaddasp vs1, vs49, alpha_vr
- xvmaddasp vs2, vs50, alpha_vr
- xvmaddasp vs3, vs51, alpha_vr
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+#ifdef TRMMKERNEL
+ lxvw4x vs2, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs2, vs2, vs28
#endif
+ stxvw4x vs51, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs3, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs3, vs3, vs28
+#endif
+
+
+
+
stxvw4x vs0, o0, T1
stxvw4x vs1, o16, T1
stxvw4x vs2, o32, T1
#endif
+
+ stxvw4x vs52, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs0, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs0, vs0, vs28
+#endif
+
+ stxvw4x vs53, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
#ifdef TRMMKERNEL
+ lxvw4x vs1, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs1, vs1, vs28
+#endif
+
+ stxvw4x vs54, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
- xvmulsp vs0, vs52, alpha_vr
- xvmulsp vs1, vs53, alpha_vr
- xvmulsp vs2, vs54, alpha_vr
- xvmulsp vs3, vs55, alpha_vr
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+#ifdef TRMMKERNEL
+ lxvw4x vs2, o0, TBUFFER
#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs2, vs2, vs28
+#endif
+
+ stxvw4x vs55, o0, TBUFFER
- xvmaddasp vs0, vs52, alpha_vr
- xvmaddasp vs1, vs53, alpha_vr
- xvmaddasp vs2, vs54, alpha_vr
- xvmaddasp vs3, vs55, alpha_vr
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs3, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs3, vs3, vs28
#endif
+
+
+
stxvw4x vs0, o0, T1
stxvw4x vs1, o16, T1
stxvw4x vs2, o32, T1
#endif
+
+ stxvw4x vs56, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
#ifdef TRMMKERNEL
+ lxvw4x vs0, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs0, vs0, vs28
+#endif
- xvmulsp vs0, vs56, alpha_vr
- xvmulsp vs1, vs57, alpha_vr
- xvmulsp vs2, vs58, alpha_vr
- xvmulsp vs3, vs59, alpha_vr
+ stxvw4x vs57, o0, TBUFFER
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs1, o0, TBUFFER
#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs1, vs1, vs28
+#endif
+
+ stxvw4x vs58, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
- xvmaddasp vs0, vs56, alpha_vr
- xvmaddasp vs1, vs57, alpha_vr
- xvmaddasp vs2, vs58, alpha_vr
- xvmaddasp vs3, vs59, alpha_vr
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs2, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs2, vs2, vs28
#endif
+ stxvw4x vs59, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs3, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs3, vs3, vs28
+#endif
+
+
+
+
stxvw4x vs0, o0, T1
stxvw4x vs1, o16, T1
stxvw4x vs2, o32, T1
#endif
+
+ stxvw4x vs60, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs0, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs0, vs0, vs28
+#endif
+
+ stxvw4x vs61, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
#ifdef TRMMKERNEL
+ lxvw4x vs1, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs1, vs1, vs28
+#endif
+
+ stxvw4x vs62, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
- xvmulsp vs0, vs60, alpha_vr
- xvmulsp vs1, vs61, alpha_vr
- xvmulsp vs2, vs62, alpha_vr
- xvmulsp vs3, vs63, alpha_vr
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs2, o0, TBUFFER
#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs2, vs2, vs28
+#endif
- xvmaddasp vs0, vs60, alpha_vr
- xvmaddasp vs1, vs61, alpha_vr
- xvmaddasp vs2, vs62, alpha_vr
- xvmaddasp vs3, vs63, alpha_vr
+ stxvw4x vs63, o0, TBUFFER
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs3, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs3, vs3, vs28
#endif
+
+
+
stxvw4x vs0, o0, T1
stxvw4x vs1, o16, T1
stxvw4x vs2, o32, T1
#endif
+
+ stxvw4x vs32, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs0, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs0, vs0, vs28
+#endif
+
+ stxvw4x vs33, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
#ifdef TRMMKERNEL
+ lxvw4x vs1, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs1, vs1, vs28
+#endif
+
+
+
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
- xvmulsp vs0, vs32, alpha_vr
- xvmulsp vs1, vs33, alpha_vr
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+
+#endif
+
+
+ stxvw4x vs34, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs0, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs0, vs0, vs28
+#endif
+
+ stxvw4x vs35, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs1, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs1, vs1, vs28
+#endif
+
+
+
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+
+#endif
+
+
+ stxvw4x vs36, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs0, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs0, vs0, vs28
+#endif
+
+ stxvw4x vs37, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs1, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs1, vs1, vs28
+#endif
-#else
- xvmaddasp vs0, vs32, alpha_vr
- xvmaddasp vs1, vs33, alpha_vr
-#endif
stxvw4x vs0, o0, T1
stxvw4x vs1, o16, T1
#endif
-#ifdef TRMMKERNEL
-
- xvmulsp vs0, vs34, alpha_vr
- xvmulsp vs1, vs35, alpha_vr
-#else
+ stxvw4x vs38, o0, TBUFFER
- xvmaddasp vs0, vs34, alpha_vr
- xvmaddasp vs1, vs35, alpha_vr
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
-#endif
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
- stxvw4x vs0, o0, T1
- stxvw4x vs1, o16, T1
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
- add T1, T1, LDC
+#ifdef TRMMKERNEL
+ lxvw4x vs0, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs0, vs0, vs28
+#endif
+ stxvw4x vs39, o0, TBUFFER
-#ifndef TRMMKERNEL
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
- lxvw4x vs0, o0, T1
- lxvw4x vs1, o16, T1
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
-#endif
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
#ifdef TRMMKERNEL
-
- xvmulsp vs0, vs36, alpha_vr
- xvmulsp vs1, vs37, alpha_vr
-
+ lxvw4x vs1, o0, TBUFFER
#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs1, vs1, vs28
+#endif
+
- xvmaddasp vs0, vs36, alpha_vr
- xvmaddasp vs1, vs37, alpha_vr
-#endif
stxvw4x vs0, o0, T1
stxvw4x vs1, o16, T1
#endif
-#ifdef TRMMKERNEL
-
- xvmulsp vs0, vs38, alpha_vr
- xvmulsp vs1, vs39, alpha_vr
-#else
+ stxvw4x vs40, o0, TBUFFER
- xvmaddasp vs0, vs38, alpha_vr
- xvmaddasp vs1, vs39, alpha_vr
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
-#endif
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
- stxvw4x vs0, o0, T1
- stxvw4x vs1, o16, T1
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
- add T1, T1, LDC
+#ifdef TRMMKERNEL
+ lxvw4x vs0, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs0, vs0, vs28
+#endif
+ stxvw4x vs41, o0, TBUFFER
-#ifndef TRMMKERNEL
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
- lxvw4x vs0, o0, T1
- lxvw4x vs1, o16, T1
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
-#endif
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
#ifdef TRMMKERNEL
-
- xvmulsp vs0, vs40, alpha_vr
- xvmulsp vs1, vs41, alpha_vr
-
+ lxvw4x vs1, o0, TBUFFER
#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs1, vs1, vs28
+#endif
+
- xvmaddasp vs0, vs40, alpha_vr
- xvmaddasp vs1, vs41, alpha_vr
-#endif
stxvw4x vs0, o0, T1
stxvw4x vs1, o16, T1
#endif
-#ifdef TRMMKERNEL
- xvmulsp vs0, vs42, alpha_vr
- xvmulsp vs1, vs43, alpha_vr
+ stxvw4x vs42, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+#ifdef TRMMKERNEL
+ lxvw4x vs0, o0, TBUFFER
#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs0, vs0, vs28
+#endif
+
+ stxvw4x vs43, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
- xvmaddasp vs0, vs42, alpha_vr
- xvmaddasp vs1, vs43, alpha_vr
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs1, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs1, vs1, vs28
#endif
+
+
+
stxvw4x vs0, o0, T1
stxvw4x vs1, o16, T1
#endif
-#ifdef TRMMKERNEL
- xvmulsp vs0, vs44, alpha_vr
- xvmulsp vs1, vs45, alpha_vr
+ stxvw4x vs44, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs0, o0, TBUFFER
#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs0, vs0, vs28
+#endif
- xvmaddasp vs0, vs44, alpha_vr
- xvmaddasp vs1, vs45, alpha_vr
+ stxvw4x vs45, o0, TBUFFER
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs1, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs1, vs1, vs28
#endif
+
+
+
stxvw4x vs0, o0, T1
stxvw4x vs1, o16, T1
#endif
-#ifdef TRMMKERNEL
- xvmulsp vs0, vs46, alpha_vr
- xvmulsp vs1, vs47, alpha_vr
+ stxvw4x vs46, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+#ifdef TRMMKERNEL
+ lxvw4x vs0, o0, TBUFFER
#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs0, vs0, vs28
+#endif
+
+ stxvw4x vs47, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
- xvmaddasp vs0, vs46, alpha_vr
- xvmaddasp vs1, vs47, alpha_vr
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs1, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs1, vs1, vs28
#endif
+
+
+
stxvw4x vs0, o0, T1
stxvw4x vs1, o16, T1
#endif
-#ifdef TRMMKERNEL
- xvmulsp vs0, vs32, alpha_vr
+ stxvw4x vs32, o0, TBUFFER
-#else
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
- xvmaddasp vs0, vs32, alpha_vr
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs0, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs0, vs0, vs28
#endif
+
+
+
stxvw4x vs0, o0, T1
add T1, T1, LDC
#endif
-#ifdef TRMMKERNEL
- xvmulsp vs0, vs33, alpha_vr
+ stxvw4x vs33, o0, TBUFFER
-#else
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
- xvmaddasp vs0, vs33, alpha_vr
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs0, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs0, vs0, vs28
#endif
+
+
+
stxvw4x vs0, o0, T1
add T1, T1, LDC
#endif
-#ifdef TRMMKERNEL
- xvmulsp vs0, vs34, alpha_vr
+ stxvw4x vs34, o0, TBUFFER
-#else
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
- xvmaddasp vs0, vs34, alpha_vr
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs0, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs0, vs0, vs28
#endif
+
+
+
stxvw4x vs0, o0, T1
add T1, T1, LDC
#endif
-#ifdef TRMMKERNEL
- xvmulsp vs0, vs35, alpha_vr
+ stxvw4x vs35, o0, TBUFFER
-#else
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
- xvmaddasp vs0, vs35, alpha_vr
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs0, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs0, vs0, vs28
#endif
+
+
+
stxvw4x vs0, o0, T1
add T1, T1, LDC
#endif
-#ifdef TRMMKERNEL
- xvmulsp vs0, vs36, alpha_vr
+ stxvw4x vs36, o0, TBUFFER
-#else
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
- xvmaddasp vs0, vs36, alpha_vr
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs0, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs0, vs0, vs28
#endif
+
+
+
stxvw4x vs0, o0, T1
add T1, T1, LDC
#endif
-#ifdef TRMMKERNEL
- xvmulsp vs0, vs37, alpha_vr
+ stxvw4x vs37, o0, TBUFFER
-#else
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
- xvmaddasp vs0, vs37, alpha_vr
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs0, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs0, vs0, vs28
#endif
+
+
+
stxvw4x vs0, o0, T1
add T1, T1, LDC
#endif
-#ifdef TRMMKERNEL
- xvmulsp vs0, vs38, alpha_vr
+ stxvw4x vs38, o0, TBUFFER
-#else
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
- xvmaddasp vs0, vs38, alpha_vr
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs0, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs0, vs0, vs28
#endif
+
+
+
stxvw4x vs0, o0, T1
add T1, T1, LDC
#endif
-#ifdef TRMMKERNEL
- xvmulsp vs0, vs39, alpha_vr
+ stxvw4x vs39, o0, TBUFFER
-#else
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
- xvmaddasp vs0, vs39, alpha_vr
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs0, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs0, vs0, vs28
#endif
+
+
+
stxvw4x vs0, o0, T1
add T1, T1, LDC
#else
- xsmaddasp vs0, vs32, alpha_r
- xsmaddasp vs1, vs33, alpha_r
+ xsmulsp vs28, vs32, alpha_r
+ xsaddsp vs0, vs0, vs28
+ xsmulsp vs28, vs33, alpha_r
+ xsaddsp vs1, vs1, vs28
#endif
#else
- xsmaddasp vs0, vs34, alpha_r
- xsmaddasp vs1, vs35, alpha_r
+ xsmulsp vs28, vs34, alpha_r
+ xsaddsp vs0, vs0, vs28
+ xsmulsp vs28, vs35, alpha_r
+ xsaddsp vs1, vs1, vs28
#endif
#else
- xsmaddasp vs0, vs36, alpha_r
- xsmaddasp vs1, vs37, alpha_r
+ xsmulsp vs28, vs36, alpha_r
+ xsaddsp vs0, vs0, vs28
+ xsmulsp vs28, vs37, alpha_r
+ xsaddsp vs1, vs1, vs28
#endif
#else
- xsmaddasp vs0, vs38, alpha_r
- xsmaddasp vs1, vs39, alpha_r
+ xsmulsp vs28, vs38, alpha_r
+ xsaddsp vs0, vs0, vs28
+ xsmulsp vs28, vs39, alpha_r
+ xsaddsp vs1, vs1, vs28
#endif
#else
- xsmaddasp vs0, vs40, alpha_r
- xsmaddasp vs1, vs41, alpha_r
+ xsmulsp vs28, vs40, alpha_r
+ xsaddsp vs0, vs0, vs28
+ xsmulsp vs28, vs41, alpha_r
+ xsaddsp vs1, vs1, vs28
#endif
#else
- xsmaddasp vs0, vs42, alpha_r
- xsmaddasp vs1, vs43, alpha_r
+ xsmulsp vs28, vs42, alpha_r
+ xsaddsp vs0, vs0, vs28
+ xsmulsp vs28, vs43, alpha_r
+ xsaddsp vs1, vs1, vs28
#endif
#else
- xsmaddasp vs0, vs44, alpha_r
- xsmaddasp vs1, vs45, alpha_r
+ xsmulsp vs28, vs44, alpha_r
+ xsaddsp vs0, vs0, vs28
+ xsmulsp vs28, vs45, alpha_r
+ xsaddsp vs1, vs1, vs28
#endif
#else
- xsmaddasp vs0, vs46, alpha_r
- xsmaddasp vs1, vs47, alpha_r
+ xsmulsp vs28, vs46, alpha_r
+ xsaddsp vs0, vs0, vs28
+ xsmulsp vs28, vs47, alpha_r
+ xsaddsp vs1, vs1, vs28
#endif
#else
- xsmaddasp vs0, vs32, alpha_r
+ xsmulsp vs28, vs32, alpha_r
+ xsaddsp vs0, vs0, vs28
#endif
#else
- xsmaddasp vs0, vs33, alpha_r
+ xsmulsp vs28, vs33, alpha_r
+ xsaddsp vs0, vs0, vs28
#endif
#else
- xsmaddasp vs0, vs34, alpha_r
+ xsmulsp vs28, vs34, alpha_r
+ xsaddsp vs0, vs0, vs28
#endif
#else
- xsmaddasp vs0, vs35, alpha_r
+ xsmulsp vs28, vs35, alpha_r
+ xsaddsp vs0, vs0, vs28
#endif
#else
- xsmaddasp vs0, vs36, alpha_r
+ xsmulsp vs28, vs36, alpha_r
+ xsaddsp vs0, vs0, vs28
#endif
#else
- xsmaddasp vs0, vs37, alpha_r
+ xsmulsp vs28, vs37, alpha_r
+ xsaddsp vs0, vs0, vs28
#endif
#else
- xsmaddasp vs0, vs38, alpha_r
+ xsmulsp vs28, vs38, alpha_r
+ xsaddsp vs0, vs0, vs28
#endif
#else
- xsmaddasp vs0, vs39, alpha_r
+ xsmulsp vs28, vs39, alpha_r
+ xsaddsp vs0, vs0, vs28
#endif
#endif
+
+ stxvw4x vs32, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs0, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs0, vs0, vs28
+#endif
+
+ stxvw4x vs33, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
#ifdef TRMMKERNEL
+ lxvw4x vs1, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs1, vs1, vs28
+#endif
+
+ stxvw4x vs34, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
- xvmulsp vs0, vs32, alpha_vr
- xvmulsp vs1, vs33, alpha_vr
- xvmulsp vs2, vs34, alpha_vr
- xvmulsp vs3, vs35, alpha_vr
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+#ifdef TRMMKERNEL
+ lxvw4x vs2, o0, TBUFFER
#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs2, vs2, vs28
+#endif
+
+ stxvw4x vs35, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
- xvmaddasp vs0, vs32, alpha_vr
- xvmaddasp vs1, vs33, alpha_vr
- xvmaddasp vs2, vs34, alpha_vr
- xvmaddasp vs3, vs35, alpha_vr
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs3, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs3, vs3, vs28
#endif
+
+
+
stxvw4x vs0, o0, T1
stxvw4x vs1, o16, T1
stxvw4x vs2, o32, T1
#endif
+
+ stxvw4x vs36, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs0, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs0, vs0, vs28
+#endif
+
+ stxvw4x vs37, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
#ifdef TRMMKERNEL
+ lxvw4x vs1, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs1, vs1, vs28
+#endif
+
+ stxvw4x vs38, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
- xvmulsp vs0, vs36, alpha_vr
- xvmulsp vs1, vs37, alpha_vr
- xvmulsp vs2, vs38, alpha_vr
- xvmulsp vs3, vs39, alpha_vr
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+#ifdef TRMMKERNEL
+ lxvw4x vs2, o0, TBUFFER
#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs2, vs2, vs28
+#endif
+
+ stxvw4x vs39, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
- xvmaddasp vs0, vs36, alpha_vr
- xvmaddasp vs1, vs37, alpha_vr
- xvmaddasp vs2, vs38, alpha_vr
- xvmaddasp vs3, vs39, alpha_vr
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs3, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs3, vs3, vs28
#endif
+
+
+
stxvw4x vs0, o0, T1
stxvw4x vs1, o16, T1
stxvw4x vs2, o32, T1
#endif
+
+ stxvw4x vs40, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs0, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs0, vs0, vs28
+#endif
+
+ stxvw4x vs41, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
#ifdef TRMMKERNEL
+ lxvw4x vs1, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs1, vs1, vs28
+#endif
+
+ stxvw4x vs42, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
- xvmulsp vs0, vs40, alpha_vr
- xvmulsp vs1, vs41, alpha_vr
- xvmulsp vs2, vs42, alpha_vr
- xvmulsp vs3, vs43, alpha_vr
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+#ifdef TRMMKERNEL
+ lxvw4x vs2, o0, TBUFFER
#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs2, vs2, vs28
+#endif
+
+ stxvw4x vs43, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
- xvmaddasp vs0, vs40, alpha_vr
- xvmaddasp vs1, vs41, alpha_vr
- xvmaddasp vs2, vs42, alpha_vr
- xvmaddasp vs3, vs43, alpha_vr
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs3, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs3, vs3, vs28
#endif
+
+
+
stxvw4x vs0, o0, T1
stxvw4x vs1, o16, T1
stxvw4x vs2, o32, T1
stxvw4x vs3, o48, T1
- add T1, T1, LDC
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+ lxvw4x vs2, o32, T1
+ lxvw4x vs3, o48, T1
+
+#endif
+
+
+ stxvw4x vs44, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs0, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs0, vs0, vs28
+#endif
+
+ stxvw4x vs45, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs1, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs1, vs1, vs28
+#endif
+
+ stxvw4x vs46, o0, TBUFFER
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
-#ifndef TRMMKERNEL
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
- lxvw4x vs0, o0, T1
- lxvw4x vs1, o16, T1
- lxvw4x vs2, o32, T1
- lxvw4x vs3, o48, T1
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+#ifdef TRMMKERNEL
+ lxvw4x vs2, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs2, vs2, vs28
#endif
-#ifdef TRMMKERNEL
+ stxvw4x vs47, o0, TBUFFER
- xvmulsp vs0, vs44, alpha_vr
- xvmulsp vs1, vs45, alpha_vr
- xvmulsp vs2, vs46, alpha_vr
- xvmulsp vs3, vs47, alpha_vr
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
-#else
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
- xvmaddasp vs0, vs44, alpha_vr
- xvmaddasp vs1, vs45, alpha_vr
- xvmaddasp vs2, vs46, alpha_vr
- xvmaddasp vs3, vs47, alpha_vr
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+#ifdef TRMMKERNEL
+ lxvw4x vs3, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs3, vs3, vs28
#endif
+
+
+
stxvw4x vs0, o0, T1
stxvw4x vs1, o16, T1
stxvw4x vs2, o32, T1
#endif
-#ifdef TRMMKERNEL
- xvmulsp vs0, vs32, alpha_vr
- xvmulsp vs1, vs33, alpha_vr
+ stxvw4x vs32, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs0, o0, TBUFFER
#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs0, vs0, vs28
+#endif
- xvmaddasp vs0, vs32, alpha_vr
- xvmaddasp vs1, vs33, alpha_vr
+ stxvw4x vs33, o0, TBUFFER
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs1, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs1, vs1, vs28
#endif
+
+
+
stxvw4x vs0, o0, T1
stxvw4x vs1, o16, T1
#endif
-#ifdef TRMMKERNEL
- xvmulsp vs0, vs34, alpha_vr
- xvmulsp vs1, vs35, alpha_vr
+ stxvw4x vs34, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+#ifdef TRMMKERNEL
+ lxvw4x vs0, o0, TBUFFER
#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs0, vs0, vs28
+#endif
+
+ stxvw4x vs35, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
- xvmaddasp vs0, vs34, alpha_vr
- xvmaddasp vs1, vs35, alpha_vr
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs1, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs1, vs1, vs28
#endif
+
+
+
stxvw4x vs0, o0, T1
stxvw4x vs1, o16, T1
#endif
-#ifdef TRMMKERNEL
- xvmulsp vs0, vs36, alpha_vr
- xvmulsp vs1, vs37, alpha_vr
+ stxvw4x vs36, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs0, o0, TBUFFER
#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs0, vs0, vs28
+#endif
- xvmaddasp vs0, vs36, alpha_vr
- xvmaddasp vs1, vs37, alpha_vr
+ stxvw4x vs37, o0, TBUFFER
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs1, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs1, vs1, vs28
#endif
+
+
+
stxvw4x vs0, o0, T1
stxvw4x vs1, o16, T1
#endif
-#ifdef TRMMKERNEL
- xvmulsp vs0, vs38, alpha_vr
- xvmulsp vs1, vs39, alpha_vr
+ stxvw4x vs38, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+#ifdef TRMMKERNEL
+ lxvw4x vs0, o0, TBUFFER
#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs0, vs0, vs28
+#endif
+
+ stxvw4x vs39, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
- xvmaddasp vs0, vs38, alpha_vr
- xvmaddasp vs1, vs39, alpha_vr
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs1, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs1, vs1, vs28
#endif
+
+
+
stxvw4x vs0, o0, T1
stxvw4x vs1, o16, T1
#endif
-#ifdef TRMMKERNEL
- xvmulsp vs0, vs32, alpha_vr
+ stxvw4x vs32, o0, TBUFFER
-#else
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
- xvmaddasp vs0, vs32, alpha_vr
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs0, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs0, vs0, vs28
#endif
+
+
+
stxvw4x vs0, o0, T1
add T1, T1, LDC
#endif
-#ifdef TRMMKERNEL
- xvmulsp vs0, vs33, alpha_vr
+ stxvw4x vs33, o0, TBUFFER
-#else
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
- xvmaddasp vs0, vs33, alpha_vr
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs0, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs0, vs0, vs28
#endif
+
+
+
stxvw4x vs0, o0, T1
add T1, T1, LDC
#endif
-#ifdef TRMMKERNEL
- xvmulsp vs0, vs34, alpha_vr
+ stxvw4x vs34, o0, TBUFFER
-#else
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
- xvmaddasp vs0, vs34, alpha_vr
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs0, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs0, vs0, vs28
#endif
+
+
+
stxvw4x vs0, o0, T1
add T1, T1, LDC
#endif
-#ifdef TRMMKERNEL
- xvmulsp vs0, vs35, alpha_vr
+ stxvw4x vs35, o0, TBUFFER
-#else
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
- xvmaddasp vs0, vs35, alpha_vr
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs0, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs0, vs0, vs28
#endif
+
+
+
stxvw4x vs0, o0, T1
add T1, T1, LDC
#else
- xsmaddasp vs0, vs32, alpha_r
- xsmaddasp vs1, vs33, alpha_r
+ xsmulsp vs28, vs32, alpha_r
+ xsaddsp vs0, vs0, vs28
+ xsmulsp vs28, vs33, alpha_r
+ xsaddsp vs1, vs1, vs28
#endif
#else
- xsmaddasp vs0, vs34, alpha_r
- xsmaddasp vs1, vs35, alpha_r
+ xsmulsp vs28, vs34, alpha_r
+ xsaddsp vs0, vs0, vs28
+ xsmulsp vs28, vs35, alpha_r
+ xsaddsp vs1, vs1, vs28
#endif
#else
- xsmaddasp vs0, vs36, alpha_r
- xsmaddasp vs1, vs37, alpha_r
+ xsmulsp vs28, vs36, alpha_r
+ xsaddsp vs0, vs0, vs28
+ xsmulsp vs28, vs37, alpha_r
+ xsaddsp vs1, vs1, vs28
#endif
#else
- xsmaddasp vs0, vs38, alpha_r
- xsmaddasp vs1, vs39, alpha_r
+ xsmulsp vs28, vs38, alpha_r
+ xsaddsp vs0, vs0, vs28
+ xsmulsp vs28, vs39, alpha_r
+ xsaddsp vs1, vs1, vs28
#endif
#else
- xsmaddasp vs0, vs32, alpha_r
+ xsmulsp vs28, vs32, alpha_r
+ xsaddsp vs0, vs0, vs28
#endif
#else
- xsmaddasp vs0, vs33, alpha_r
+ xsmulsp vs28, vs33, alpha_r
+ xsaddsp vs0, vs0, vs28
#endif
#else
- xsmaddasp vs0, vs34, alpha_r
+ xsmulsp vs28, vs34, alpha_r
+ xsaddsp vs0, vs0, vs28
#endif
#else
- xsmaddasp vs0, vs35, alpha_r
+ xsmulsp vs28, vs35, alpha_r
+ xsaddsp vs0, vs0, vs28
#endif
#endif
+
+ stxvw4x vs32, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs0, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs0, vs0, vs28
+#endif
+
+ stxvw4x vs33, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
#ifdef TRMMKERNEL
+ lxvw4x vs1, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs1, vs1, vs28
+#endif
+
+ stxvw4x vs34, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
- xvmulsp vs0, vs32, alpha_vr
- xvmulsp vs1, vs33, alpha_vr
- xvmulsp vs2, vs34, alpha_vr
- xvmulsp vs3, vs35, alpha_vr
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+#ifdef TRMMKERNEL
+ lxvw4x vs2, o0, TBUFFER
#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs2, vs2, vs28
+#endif
+
+ stxvw4x vs35, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
- xvmaddasp vs0, vs32, alpha_vr
- xvmaddasp vs1, vs33, alpha_vr
- xvmaddasp vs2, vs34, alpha_vr
- xvmaddasp vs3, vs35, alpha_vr
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs3, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs3, vs3, vs28
#endif
+
+
+
stxvw4x vs0, o0, T1
stxvw4x vs1, o16, T1
stxvw4x vs2, o32, T1
#endif
+
+ stxvw4x vs36, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs0, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs0, vs0, vs28
+#endif
+
+ stxvw4x vs37, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
#ifdef TRMMKERNEL
+ lxvw4x vs1, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs1, vs1, vs28
+#endif
+
+ stxvw4x vs38, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
- xvmulsp vs0, vs36, alpha_vr
- xvmulsp vs1, vs37, alpha_vr
- xvmulsp vs2, vs38, alpha_vr
- xvmulsp vs3, vs39, alpha_vr
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+#ifdef TRMMKERNEL
+ lxvw4x vs2, o0, TBUFFER
#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs2, vs2, vs28
+#endif
+
+ stxvw4x vs39, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
- xvmaddasp vs0, vs36, alpha_vr
- xvmaddasp vs1, vs37, alpha_vr
- xvmaddasp vs2, vs38, alpha_vr
- xvmaddasp vs3, vs39, alpha_vr
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs3, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs3, vs3, vs28
#endif
+
+
+
stxvw4x vs0, o0, T1
stxvw4x vs1, o16, T1
stxvw4x vs2, o32, T1
#endif
-#ifdef TRMMKERNEL
- xvmulsp vs0, vs32, alpha_vr
- xvmulsp vs1, vs33, alpha_vr
+ stxvw4x vs32, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs0, o0, TBUFFER
#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs0, vs0, vs28
+#endif
- xvmaddasp vs0, vs32, alpha_vr
- xvmaddasp vs1, vs33, alpha_vr
+ stxvw4x vs33, o0, TBUFFER
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs1, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs1, vs1, vs28
#endif
+
+
+
stxvw4x vs0, o0, T1
stxvw4x vs1, o16, T1
#endif
-#ifdef TRMMKERNEL
- xvmulsp vs0, vs34, alpha_vr
- xvmulsp vs1, vs35, alpha_vr
+ stxvw4x vs34, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+#ifdef TRMMKERNEL
+ lxvw4x vs0, o0, TBUFFER
#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs0, vs0, vs28
+#endif
+
+ stxvw4x vs35, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
- xvmaddasp vs0, vs34, alpha_vr
- xvmaddasp vs1, vs35, alpha_vr
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs1, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs1, vs1, vs28
#endif
+
+
+
stxvw4x vs0, o0, T1
stxvw4x vs1, o16, T1
#endif
-#ifdef TRMMKERNEL
- xvmulsp vs0, vs32, alpha_vr
+ stxvw4x vs32, o0, TBUFFER
-#else
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
- xvmaddasp vs0, vs32, alpha_vr
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs0, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs0, vs0, vs28
#endif
+
+
+
stxvw4x vs0, o0, T1
add T1, T1, LDC
#endif
-#ifdef TRMMKERNEL
- xvmulsp vs0, vs33, alpha_vr
+ stxvw4x vs33, o0, TBUFFER
-#else
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
- xvmaddasp vs0, vs33, alpha_vr
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs0, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs0, vs0, vs28
#endif
+
+
+
stxvw4x vs0, o0, T1
add T1, T1, LDC
#else
- xsmaddasp vs0, vs32, alpha_r
- xsmaddasp vs1, vs33, alpha_r
+ xsmulsp vs28, vs32, alpha_r
+ xsaddsp vs0, vs0, vs28
+ xsmulsp vs28, vs33, alpha_r
+ xsaddsp vs1, vs1, vs28
#endif
#else
- xsmaddasp vs0, vs34, alpha_r
- xsmaddasp vs1, vs35, alpha_r
+ xsmulsp vs28, vs34, alpha_r
+ xsaddsp vs0, vs0, vs28
+ xsmulsp vs28, vs35, alpha_r
+ xsaddsp vs1, vs1, vs28
#endif
#else
- xsmaddasp vs0, vs32, alpha_r
+ xsmulsp vs28, vs32, alpha_r
+ xsaddsp vs0, vs0, vs28
#endif
#else
- xsmaddasp vs0, vs33, alpha_r
+ xsmulsp vs28, vs33, alpha_r
+ xsaddsp vs0, vs0, vs28
#endif
#endif
+
+ stxvw4x vs32, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs0, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs0, vs0, vs28
+#endif
+
+ stxvw4x vs33, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
#ifdef TRMMKERNEL
+ lxvw4x vs1, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs1, vs1, vs28
+#endif
+
+ stxvw4x vs34, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
- xvmulsp vs0, vs32, alpha_vr
- xvmulsp vs1, vs33, alpha_vr
- xvmulsp vs2, vs34, alpha_vr
- xvmulsp vs3, vs35, alpha_vr
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+#ifdef TRMMKERNEL
+ lxvw4x vs2, o0, TBUFFER
#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs2, vs2, vs28
+#endif
+
+ stxvw4x vs35, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
- xvmaddasp vs0, vs32, alpha_vr
- xvmaddasp vs1, vs33, alpha_vr
- xvmaddasp vs2, vs34, alpha_vr
- xvmaddasp vs3, vs35, alpha_vr
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs3, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs3, vs3, vs28
#endif
+
+
+
stxvw4x vs0, o0, T1
stxvw4x vs1, o16, T1
stxvw4x vs2, o32, T1
#endif
-#ifdef TRMMKERNEL
- xvmulsp vs0, vs32, alpha_vr
- xvmulsp vs1, vs33, alpha_vr
+ stxvw4x vs32, o0, TBUFFER
+
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs0, o0, TBUFFER
#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs0, vs0, vs28
+#endif
- xvmaddasp vs0, vs32, alpha_vr
- xvmaddasp vs1, vs33, alpha_vr
+ stxvw4x vs33, o0, TBUFFER
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
+
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs1, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs1, vs1, vs28
#endif
+
+
+
stxvw4x vs0, o0, T1
stxvw4x vs1, o16, T1
#endif
-#ifdef TRMMKERNEL
- xvmulsp vs0, vs32, alpha_vr
+ stxvw4x vs32, o0, TBUFFER
-#else
+ lxsspx vs4, o0, TBUFFER
+ lxsspx vs5, o4, TBUFFER
+ lxsspx vs6, o8, TBUFFER
+ lxsspx vs7, o12, TBUFFER
- xvmaddasp vs0, vs32, alpha_vr
+ xsmulsp vs4, vs4, alpha_r
+ xsmulsp vs5, vs5, alpha_r
+ xsmulsp vs6, vs6, alpha_r
+ xsmulsp vs7, vs7, alpha_r
+ stxsspx vs4, o0, TBUFFER
+ stxsspx vs5, o4, TBUFFER
+ stxsspx vs6, o8, TBUFFER
+ stxsspx vs7, o12, TBUFFER
+
+#ifdef TRMMKERNEL
+ lxvw4x vs0, o0, TBUFFER
+#else
+ lxvw4x vs28, o0, TBUFFER
+ xvaddsp vs0, vs0, vs28
#endif
+
+
+
stxvw4x vs0, o0, T1
add T1, T1, LDC
#else
- xsmaddasp vs0, vs32, alpha_r
- xsmaddasp vs1, vs33, alpha_r
+ xsmulsp vs28, vs32, alpha_r
+ xsaddsp vs0, vs0, vs28
+ xsmulsp vs28, vs33, alpha_r
+ xsaddsp vs1, vs1, vs28
#endif
#else
- xsmaddasp vs0, vs32, alpha_r
+ xsmulsp vs28, vs32, alpha_r
+ xsaddsp vs0, vs0, vs28
#endif