--- /dev/null
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/14 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+**************************************************************************************/
+
+/**********************************************************************************************
+* Macros for N=8 and M=16
+**********************************************************************************************/
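+
+/* Implementation notes (a reading of the macros below):
+*
+*   LOADNxM_1        prime the pipeline: load the first slice of A into
+*                    vs0.. and the B values for this k-step, splatted,
+*                    into vs8.. (the scalar M=1/M=2 tiles load B values
+*                    directly instead of splatting)
+*   KERNELNxM_I1     first k-step: xvmulsp/xsmulsp initializes the
+*                    accumulators (vs32 upward) while the next slice is
+*                    loaded into vs4.. / vs16..
+*   KERNELNxM_1/_2   ping-pong k-steps: FMA on one register set while
+*                    the other set is reloaded
+*   KERNELNxM_E2     drain: final FMAs, no loads
+*   KERNELNxM_SUBI1  unpipelined single k-step that initializes the
+*   KERNELNxM_SUB1   accumulators / accumulates into them
+*   SAVENxM          scale by alpha and store: plain GEMM adds into C,
+*                    TRMMKERNEL overwrites C; CO advances past the tile
+*
+* As a plain-C sketch, one LOAD/KERNEL/SAVE8x16 sequence computes the
+* following (illustrative names; the assembly keeps LDC as a byte stride,
+* while the sketch uses an element stride):
+*
+*   static void ref_sgemm_8x16(long K, float alpha, const float *A,
+*                              const float *B, float *C, long ldc)
+*   {
+*       float acc[8][16] = {{0.0f}};
+*       for (long k = 0; k < K; k++) {
+*           for (int j = 0; j < 8; j++)        // B splats vs8..vs15
+*               for (int i = 0; i < 16; i++)   // A vectors vs0..vs3
+*                   acc[j][i] += A[i] * B[j];  // xvmulsp / xvmaddasp
+*           A += 16;                           // addi AO, AO, 64
+*           B += 8;                            // addi BO, BO, 32
+*       }
+*       for (int j = 0; j < 8; j++)            // SAVE8x16: one row per LDC
+*           for (int i = 0; i < 16; i++)
+*               C[j*ldc + i] += alpha * acc[j][i];  // TRMMKERNEL: '=' only
+*   }
+*/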
+
+.macro LOAD8x16_1
+
+ lxvw4x vs28, o0, BO
+ lxvw4x vs29, o16, BO
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+ lxvw4x vs2, o32, AO
+ lxvw4x vs3, o48, AO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+ xxspltw vs10, vs28, 2
+ xxspltw vs11, vs28, 3
+
+ xxspltw vs12, vs29, 0
+ xxspltw vs13, vs29, 1
+ addi AO, AO, 64
+ addi BO, BO, 32
+ xxspltw vs14, vs29, 2
+ xxspltw vs15, vs29, 3
+
+
+.endm
+
+.macro KERNEL8x16_I1
+
+ xvmulsp vs32, vs0, vs8
+ xvmulsp vs33, vs1, vs8
+
+ lxvw4x vs4, o0, AO
+ lxvw4x vs5, o16, AO
+
+ xvmulsp vs34, vs2, vs8
+ xvmulsp vs35, vs3, vs8
+
+ lxvw4x vs28, o0, BO
+ lxvw4x vs29, o16, BO
+
+ xvmulsp vs36, vs0, vs9
+ xvmulsp vs37, vs1, vs9
+
+ lxvw4x vs6, o32, AO
+ lxvw4x vs7, o48, AO
+
+ xvmulsp vs38, vs2, vs9
+ xvmulsp vs39, vs3, vs9
+
+ xvmulsp vs40, vs0, vs10
+ xvmulsp vs41, vs1, vs10
+ xvmulsp vs42, vs2, vs10
+ xvmulsp vs43, vs3, vs10
+
+ xvmulsp vs44, vs0, vs11
+ xvmulsp vs45, vs1, vs11
+ xvmulsp vs46, vs2, vs11
+ xvmulsp vs47, vs3, vs11
+
+ xvmulsp vs48, vs0, vs12
+ xvmulsp vs49, vs1, vs12
+ xvmulsp vs50, vs2, vs12
+ xvmulsp vs51, vs3, vs12
+
+ xvmulsp vs52, vs0, vs13
+ xvmulsp vs53, vs1, vs13
+ xvmulsp vs54, vs2, vs13
+ xvmulsp vs55, vs3, vs13
+
+ xxspltw vs16, vs28, 0
+ xxspltw vs17, vs28, 1
+ xxspltw vs18, vs28, 2
+ xxspltw vs19, vs28, 3
+
+ xvmulsp vs56, vs0, vs14
+ xvmulsp vs57, vs1, vs14
+ xvmulsp vs58, vs2, vs14
+ xvmulsp vs59, vs3, vs14
+
+ xxspltw vs20, vs29, 0
+ xxspltw vs21, vs29, 1
+ xxspltw vs22, vs29, 2
+ xxspltw vs23, vs29, 3
+
+ xvmulsp vs60, vs0, vs15
+ xvmulsp vs61, vs1, vs15
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+ xvmulsp vs62, vs2, vs15
+ xvmulsp vs63, vs3, vs15
+
+
+.endm
+
+.macro KERNEL8x16_1
+
+ xvmaddasp vs32, vs0, vs8
+ xvmaddasp vs33, vs1, vs8
+
+ lxvw4x vs4, o0, AO
+ lxvw4x vs5, o16, AO
+
+ xvmaddasp vs34, vs2, vs8
+ xvmaddasp vs35, vs3, vs8
+
+ xvmaddasp vs36, vs0, vs9
+ xvmaddasp vs37, vs1, vs9
+ lxvw4x vs28, o0, BO
+ xvmaddasp vs38, vs2, vs9
+ xvmaddasp vs39, vs3, vs9
+
+ xvmaddasp vs40, vs0, vs10
+ xvmaddasp vs41, vs1, vs10
+
+ lxvw4x vs6, o32, AO
+ lxvw4x vs7, o48, AO
+
+ xvmaddasp vs42, vs2, vs10
+ xvmaddasp vs43, vs3, vs10
+
+ xvmaddasp vs44, vs0, vs11
+ xvmaddasp vs45, vs1, vs11
+
+ lxvw4x vs29, o16, BO
+
+ xvmaddasp vs46, vs2, vs11
+ xvmaddasp vs47, vs3, vs11
+
+ xvmaddasp vs48, vs0, vs12
+ xvmaddasp vs49, vs1, vs12
+ xvmaddasp vs50, vs2, vs12
+ xvmaddasp vs51, vs3, vs12
+
+ xvmaddasp vs52, vs0, vs13
+ xvmaddasp vs53, vs1, vs13
+
+ xxspltw vs16, vs28, 0
+ xxspltw vs17, vs28, 1
+
+ xvmaddasp vs54, vs2, vs13
+ xvmaddasp vs55, vs3, vs13
+
+ xvmaddasp vs56, vs0, vs14
+ xvmaddasp vs57, vs1, vs14
+
+ xxspltw vs18, vs28, 2
+ xxspltw vs19, vs28, 3
+
+ xvmaddasp vs58, vs2, vs14
+ xvmaddasp vs59, vs3, vs14
+
+ xxspltw vs20, vs29, 0
+ xxspltw vs21, vs29, 1
+
+ xvmaddasp vs60, vs0, vs15
+ xvmaddasp vs61, vs1, vs15
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+ xvmaddasp vs62, vs2, vs15
+ xvmaddasp vs63, vs3, vs15
+
+ xxspltw vs22, vs29, 2
+ xxspltw vs23, vs29, 3
+
+.endm
+
+.macro KERNEL8x16_2
+
+ xvmaddasp vs32, vs4, vs16
+ xvmaddasp vs33, vs5, vs16
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+
+ xvmaddasp vs34, vs6, vs16
+ xvmaddasp vs35, vs7, vs16
+
+ xvmaddasp vs36, vs4, vs17
+ xvmaddasp vs37, vs5, vs17
+
+ lxvw4x vs28, o0, BO
+
+ xvmaddasp vs38, vs6, vs17
+ xvmaddasp vs39, vs7, vs17
+
+ xvmaddasp vs40, vs4, vs18
+ xvmaddasp vs41, vs5, vs18
+
+ lxvw4x vs2, o32, AO
+ lxvw4x vs3, o48, AO
+
+ xvmaddasp vs42, vs6, vs18
+ xvmaddasp vs43, vs7, vs18
+
+ xvmaddasp vs44, vs4, vs19
+ xvmaddasp vs45, vs5, vs19
+
+ lxvw4x vs29, o16, BO
+
+ xvmaddasp vs46, vs6, vs19
+ xvmaddasp vs47, vs7, vs19
+
+ xvmaddasp vs48, vs4, vs20
+ xvmaddasp vs49, vs5, vs20
+ xvmaddasp vs50, vs6, vs20
+ xvmaddasp vs51, vs7, vs20
+
+ xvmaddasp vs52, vs4, vs21
+ xvmaddasp vs53, vs5, vs21
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+ xxspltw vs10, vs28, 2
+ xxspltw vs11, vs28, 3
+
+ xvmaddasp vs54, vs6, vs21
+ xvmaddasp vs55, vs7, vs21
+
+ xvmaddasp vs56, vs4, vs22
+ xvmaddasp vs57, vs5, vs22
+
+ xxspltw vs12, vs29, 0
+ xxspltw vs13, vs29, 1
+ xxspltw vs14, vs29, 2
+ xxspltw vs15, vs29, 3
+
+ xvmaddasp vs58, vs6, vs22
+ xvmaddasp vs59, vs7, vs22
+
+ xvmaddasp vs60, vs4, vs23
+ xvmaddasp vs61, vs5, vs23
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+ xvmaddasp vs62, vs6, vs23
+ xvmaddasp vs63, vs7, vs23
+
+
+.endm
+
+.macro KERNEL8x16_E2
+
+
+ xvmaddasp vs32, vs4, vs16
+ xvmaddasp vs33, vs5, vs16
+ xvmaddasp vs34, vs6, vs16
+ xvmaddasp vs35, vs7, vs16
+
+ xvmaddasp vs36, vs4, vs17
+ xvmaddasp vs37, vs5, vs17
+ xvmaddasp vs38, vs6, vs17
+ xvmaddasp vs39, vs7, vs17
+
+ xvmaddasp vs40, vs4, vs18
+ xvmaddasp vs41, vs5, vs18
+ xvmaddasp vs42, vs6, vs18
+ xvmaddasp vs43, vs7, vs18
+
+ xvmaddasp vs44, vs4, vs19
+ xvmaddasp vs45, vs5, vs19
+ xvmaddasp vs46, vs6, vs19
+ xvmaddasp vs47, vs7, vs19
+
+ xvmaddasp vs48, vs4, vs20
+ xvmaddasp vs49, vs5, vs20
+ xvmaddasp vs50, vs6, vs20
+ xvmaddasp vs51, vs7, vs20
+
+ xvmaddasp vs52, vs4, vs21
+ xvmaddasp vs53, vs5, vs21
+ xvmaddasp vs54, vs6, vs21
+ xvmaddasp vs55, vs7, vs21
+
+ xvmaddasp vs56, vs4, vs22
+ xvmaddasp vs57, vs5, vs22
+ xvmaddasp vs58, vs6, vs22
+ xvmaddasp vs59, vs7, vs22
+
+ xvmaddasp vs60, vs4, vs23
+ xvmaddasp vs61, vs5, vs23
+ xvmaddasp vs62, vs6, vs23
+ xvmaddasp vs63, vs7, vs23
+
+
+.endm
+
+.macro KERNEL8x16_SUBI1
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+ lxvw4x vs2, o32, AO
+ lxvw4x vs3, o48, AO
+
+ addi AO, AO, 64
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+ xxspltw vs10, vs28, 2
+ xxspltw vs11, vs28, 3
+
+ lxvw4x vs29, o16, BO
+
+ xxspltw vs12, vs29, 0
+ xxspltw vs13, vs29, 1
+ xxspltw vs14, vs29, 2
+ xxspltw vs15, vs29, 3
+
+ addi BO, BO, 32
+
+
+ xvmulsp vs32, vs0, vs8
+ xvmulsp vs33, vs1, vs8
+ xvmulsp vs34, vs2, vs8
+ xvmulsp vs35, vs3, vs8
+
+ xvmulsp vs36, vs0, vs9
+ xvmulsp vs37, vs1, vs9
+ xvmulsp vs38, vs2, vs9
+ xvmulsp vs39, vs3, vs9
+
+ xvmulsp vs40, vs0, vs10
+ xvmulsp vs41, vs1, vs10
+ xvmulsp vs42, vs2, vs10
+ xvmulsp vs43, vs3, vs10
+
+ xvmulsp vs44, vs0, vs11
+ xvmulsp vs45, vs1, vs11
+ xvmulsp vs46, vs2, vs11
+ xvmulsp vs47, vs3, vs11
+
+ xvmulsp vs48, vs0, vs12
+ xvmulsp vs49, vs1, vs12
+ xvmulsp vs50, vs2, vs12
+ xvmulsp vs51, vs3, vs12
+
+ xvmulsp vs52, vs0, vs13
+ xvmulsp vs53, vs1, vs13
+ xvmulsp vs54, vs2, vs13
+ xvmulsp vs55, vs3, vs13
+
+ xvmulsp vs56, vs0, vs14
+ xvmulsp vs57, vs1, vs14
+ xvmulsp vs58, vs2, vs14
+ xvmulsp vs59, vs3, vs14
+
+ xvmulsp vs60, vs0, vs15
+ xvmulsp vs61, vs1, vs15
+ xvmulsp vs62, vs2, vs15
+ xvmulsp vs63, vs3, vs15
+
+
+.endm
+
+.macro KERNEL8x16_SUB1
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+ lxvw4x vs2, o32, AO
+ lxvw4x vs3, o48, AO
+
+ addi AO, AO, 64
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+ xxspltw vs10, vs28, 2
+ xxspltw vs11, vs28, 3
+
+ lxvw4x vs29, o16, BO
+
+ xxspltw vs12, vs29, 0
+ xxspltw vs13, vs29, 1
+ xxspltw vs14, vs29, 2
+ xxspltw vs15, vs29, 3
+
+ addi BO, BO, 32
+
+
+ xvmaddasp vs32, vs0, vs8
+ xvmaddasp vs33, vs1, vs8
+ xvmaddasp vs34, vs2, vs8
+ xvmaddasp vs35, vs3, vs8
+
+ xvmaddasp vs36, vs0, vs9
+ xvmaddasp vs37, vs1, vs9
+ xvmaddasp vs38, vs2, vs9
+ xvmaddasp vs39, vs3, vs9
+
+ xvmaddasp vs40, vs0, vs10
+ xvmaddasp vs41, vs1, vs10
+ xvmaddasp vs42, vs2, vs10
+ xvmaddasp vs43, vs3, vs10
+
+ xvmaddasp vs44, vs0, vs11
+ xvmaddasp vs45, vs1, vs11
+ xvmaddasp vs46, vs2, vs11
+ xvmaddasp vs47, vs3, vs11
+
+ xvmaddasp vs48, vs0, vs12
+ xvmaddasp vs49, vs1, vs12
+ xvmaddasp vs50, vs2, vs12
+ xvmaddasp vs51, vs3, vs12
+
+ xvmaddasp vs52, vs0, vs13
+ xvmaddasp vs53, vs1, vs13
+ xvmaddasp vs54, vs2, vs13
+ xvmaddasp vs55, vs3, vs13
+
+ xvmaddasp vs56, vs0, vs14
+ xvmaddasp vs57, vs1, vs14
+ xvmaddasp vs58, vs2, vs14
+ xvmaddasp vs59, vs3, vs14
+
+ xvmaddasp vs60, vs0, vs15
+ xvmaddasp vs61, vs1, vs15
+ xvmaddasp vs62, vs2, vs15
+ xvmaddasp vs63, vs3, vs15
+
+
+.endm
+
+.macro SAVE8x16
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+ lxvw4x vs2, o32, T1
+ lxvw4x vs3, o48, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xvmulsp vs0, vs32, alpha_vr
+ xvmulsp vs1, vs33, alpha_vr
+ xvmulsp vs2, vs34, alpha_vr
+ xvmulsp vs3, vs35, alpha_vr
+
+#else
+
+ xvmaddasp vs0, vs32, alpha_vr
+ xvmaddasp vs1, vs33, alpha_vr
+ xvmaddasp vs2, vs34, alpha_vr
+ xvmaddasp vs3, vs35, alpha_vr
+
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+ stxvw4x vs2, o32, T1
+ stxvw4x vs3, o48, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+ lxvw4x vs2, o32, T1
+ lxvw4x vs3, o48, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xvmulsp vs0, vs36, alpha_vr
+ xvmulsp vs1, vs37, alpha_vr
+ xvmulsp vs2, vs38, alpha_vr
+ xvmulsp vs3, vs39, alpha_vr
+
+#else
+
+ xvmaddasp vs0, vs36, alpha_vr
+ xvmaddasp vs1, vs37, alpha_vr
+ xvmaddasp vs2, vs38, alpha_vr
+ xvmaddasp vs3, vs39, alpha_vr
+
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+ stxvw4x vs2, o32, T1
+ stxvw4x vs3, o48, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+ lxvw4x vs2, o32, T1
+ lxvw4x vs3, o48, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xvmulsp vs0, vs40, alpha_vr
+ xvmulsp vs1, vs41, alpha_vr
+ xvmulsp vs2, vs42, alpha_vr
+ xvmulsp vs3, vs43, alpha_vr
+
+#else
+
+ xvmaddasp vs0, vs40, alpha_vr
+ xvmaddasp vs1, vs41, alpha_vr
+ xvmaddasp vs2, vs42, alpha_vr
+ xvmaddasp vs3, vs43, alpha_vr
+
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+ stxvw4x vs2, o32, T1
+ stxvw4x vs3, o48, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+ lxvw4x vs2, o32, T1
+ lxvw4x vs3, o48, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xvmulsp vs0, vs44, alpha_vr
+ xvmulsp vs1, vs45, alpha_vr
+ xvmulsp vs2, vs46, alpha_vr
+ xvmulsp vs3, vs47, alpha_vr
+
+#else
+
+ xvmaddasp vs0, vs44, alpha_vr
+ xvmaddasp vs1, vs45, alpha_vr
+ xvmaddasp vs2, vs46, alpha_vr
+ xvmaddasp vs3, vs47, alpha_vr
+
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+ stxvw4x vs2, o32, T1
+ stxvw4x vs3, o48, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+ lxvw4x vs2, o32, T1
+ lxvw4x vs3, o48, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xvmulsp vs0, vs48, alpha_vr
+ xvmulsp vs1, vs49, alpha_vr
+ xvmulsp vs2, vs50, alpha_vr
+ xvmulsp vs3, vs51, alpha_vr
+
+#else
+
+ xvmaddasp vs0, vs48, alpha_vr
+ xvmaddasp vs1, vs49, alpha_vr
+ xvmaddasp vs2, vs50, alpha_vr
+ xvmaddasp vs3, vs51, alpha_vr
+
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+ stxvw4x vs2, o32, T1
+ stxvw4x vs3, o48, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+ lxvw4x vs2, o32, T1
+ lxvw4x vs3, o48, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xvmulsp vs0, vs52, alpha_vr
+ xvmulsp vs1, vs53, alpha_vr
+ xvmulsp vs2, vs54, alpha_vr
+ xvmulsp vs3, vs55, alpha_vr
+
+#else
+
+ xvmaddasp vs0, vs52, alpha_vr
+ xvmaddasp vs1, vs53, alpha_vr
+ xvmaddasp vs2, vs54, alpha_vr
+ xvmaddasp vs3, vs55, alpha_vr
+
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+ stxvw4x vs2, o32, T1
+ stxvw4x vs3, o48, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+ lxvw4x vs2, o32, T1
+ lxvw4x vs3, o48, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xvmulsp vs0, vs56, alpha_vr
+ xvmulsp vs1, vs57, alpha_vr
+ xvmulsp vs2, vs58, alpha_vr
+ xvmulsp vs3, vs59, alpha_vr
+
+#else
+
+ xvmaddasp vs0, vs56, alpha_vr
+ xvmaddasp vs1, vs57, alpha_vr
+ xvmaddasp vs2, vs58, alpha_vr
+ xvmaddasp vs3, vs59, alpha_vr
+
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+ stxvw4x vs2, o32, T1
+ stxvw4x vs3, o48, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+ lxvw4x vs2, o32, T1
+ lxvw4x vs3, o48, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xvmulsp vs0, vs60, alpha_vr
+ xvmulsp vs1, vs61, alpha_vr
+ xvmulsp vs2, vs62, alpha_vr
+ xvmulsp vs3, vs63, alpha_vr
+
+#else
+
+ xvmaddasp vs0, vs60, alpha_vr
+ xvmaddasp vs1, vs61, alpha_vr
+ xvmaddasp vs2, vs62, alpha_vr
+ xvmaddasp vs3, vs63, alpha_vr
+
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+ stxvw4x vs2, o32, T1
+ stxvw4x vs3, o48, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 64
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=8 and M=8
+**********************************************************************************************/
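+
+/* Same pipeline as 8x16 with half the A width: two A vectors per k-step
+* (vs0/vs1 and vs4/vs5) and sixteen accumulators, vs32..vs47. */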
+
+.macro LOAD8x8_1
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+
+ addi AO, AO, 32
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+ xxspltw vs10, vs28, 2
+ xxspltw vs11, vs28, 3
+
+ lxvw4x vs29, o16, BO
+
+ xxspltw vs12, vs29, 0
+ xxspltw vs13, vs29, 1
+ xxspltw vs14, vs29, 2
+ xxspltw vs15, vs29, 3
+
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL8x8_I1
+
+
+ lxvw4x vs4, o0, AO
+ lxvw4x vs5, o16, AO
+
+ addi AO, AO, 32
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs16, vs28, 0
+ xxspltw vs17, vs28, 1
+ xxspltw vs18, vs28, 2
+ xxspltw vs19, vs28, 3
+
+ lxvw4x vs29, o16, BO
+
+ xxspltw vs20, vs29, 0
+ xxspltw vs21, vs29, 1
+ xxspltw vs22, vs29, 2
+ xxspltw vs23, vs29, 3
+
+ addi BO, BO, 32
+
+
+ xvmulsp vs32, vs0, vs8
+ xvmulsp vs33, vs1, vs8
+
+ xvmulsp vs34, vs0, vs9
+ xvmulsp vs35, vs1, vs9
+
+ xvmulsp vs36, vs0, vs10
+ xvmulsp vs37, vs1, vs10
+
+ xvmulsp vs38, vs0, vs11
+ xvmulsp vs39, vs1, vs11
+
+ xvmulsp vs40, vs0, vs12
+ xvmulsp vs41, vs1, vs12
+
+ xvmulsp vs42, vs0, vs13
+ xvmulsp vs43, vs1, vs13
+
+ xvmulsp vs44, vs0, vs14
+ xvmulsp vs45, vs1, vs14
+
+ xvmulsp vs46, vs0, vs15
+ xvmulsp vs47, vs1, vs15
+
+
+.endm
+
+.macro KERNEL8x8_1
+
+
+ lxvw4x vs4, o0, AO
+ lxvw4x vs5, o16, AO
+
+ addi AO, AO, 32
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs16, vs28, 0
+ xxspltw vs17, vs28, 1
+ xxspltw vs18, vs28, 2
+ xxspltw vs19, vs28, 3
+
+ lxvw4x vs29, o16, BO
+
+ xxspltw vs20, vs29, 0
+ xxspltw vs21, vs29, 1
+ xxspltw vs22, vs29, 2
+ xxspltw vs23, vs29, 3
+
+ addi BO, BO, 32
+
+
+ xvmaddasp vs32, vs0, vs8
+ xvmaddasp vs33, vs1, vs8
+
+ xvmaddasp vs34, vs0, vs9
+ xvmaddasp vs35, vs1, vs9
+
+ xvmaddasp vs36, vs0, vs10
+ xvmaddasp vs37, vs1, vs10
+
+ xvmaddasp vs38, vs0, vs11
+ xvmaddasp vs39, vs1, vs11
+
+ xvmaddasp vs40, vs0, vs12
+ xvmaddasp vs41, vs1, vs12
+
+ xvmaddasp vs42, vs0, vs13
+ xvmaddasp vs43, vs1, vs13
+
+ xvmaddasp vs44, vs0, vs14
+ xvmaddasp vs45, vs1, vs14
+
+ xvmaddasp vs46, vs0, vs15
+ xvmaddasp vs47, vs1, vs15
+
+
+.endm
+
+.macro KERNEL8x8_2
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+
+ addi AO, AO, 32
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+ xxspltw vs10, vs28, 2
+ xxspltw vs11, vs28, 3
+
+ lxvw4x vs29, o16, BO
+
+ xxspltw vs12, vs29, 0
+ xxspltw vs13, vs29, 1
+ xxspltw vs14, vs29, 2
+ xxspltw vs15, vs29, 3
+
+ addi BO, BO, 32
+
+
+ xvmaddasp vs32, vs4, vs16
+ xvmaddasp vs33, vs5, vs16
+
+ xvmaddasp vs34, vs4, vs17
+ xvmaddasp vs35, vs5, vs17
+
+ xvmaddasp vs36, vs4, vs18
+ xvmaddasp vs37, vs5, vs18
+
+ xvmaddasp vs38, vs4, vs19
+ xvmaddasp vs39, vs5, vs19
+
+ xvmaddasp vs40, vs4, vs20
+ xvmaddasp vs41, vs5, vs20
+
+ xvmaddasp vs42, vs4, vs21
+ xvmaddasp vs43, vs5, vs21
+
+ xvmaddasp vs44, vs4, vs22
+ xvmaddasp vs45, vs5, vs22
+
+ xvmaddasp vs46, vs4, vs23
+ xvmaddasp vs47, vs5, vs23
+
+
+.endm
+
+.macro KERNEL8x8_E2
+
+
+ xvmaddasp vs32, vs4, vs16
+ xvmaddasp vs33, vs5, vs16
+
+ xvmaddasp vs34, vs4, vs17
+ xvmaddasp vs35, vs5, vs17
+
+ xvmaddasp vs36, vs4, vs18
+ xvmaddasp vs37, vs5, vs18
+
+ xvmaddasp vs38, vs4, vs19
+ xvmaddasp vs39, vs5, vs19
+
+ xvmaddasp vs40, vs4, vs20
+ xvmaddasp vs41, vs5, vs20
+
+ xvmaddasp vs42, vs4, vs21
+ xvmaddasp vs43, vs5, vs21
+
+ xvmaddasp vs44, vs4, vs22
+ xvmaddasp vs45, vs5, vs22
+
+ xvmaddasp vs46, vs4, vs23
+ xvmaddasp vs47, vs5, vs23
+
+
+.endm
+
+.macro KERNEL8x8_SUBI1
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+
+ addi AO, AO, 32
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+ xxspltw vs10, vs28, 2
+ xxspltw vs11, vs28, 3
+
+ lxvw4x vs29, o16, BO
+
+ xxspltw vs12, vs29, 0
+ xxspltw vs13, vs29, 1
+ xxspltw vs14, vs29, 2
+ xxspltw vs15, vs29, 3
+
+ addi BO, BO, 32
+
+
+ xvmulsp vs32, vs0, vs8
+ xvmulsp vs33, vs1, vs8
+
+ xvmulsp vs34, vs0, vs9
+ xvmulsp vs35, vs1, vs9
+
+ xvmulsp vs36, vs0, vs10
+ xvmulsp vs37, vs1, vs10
+
+ xvmulsp vs38, vs0, vs11
+ xvmulsp vs39, vs1, vs11
+
+ xvmulsp vs40, vs0, vs12
+ xvmulsp vs41, vs1, vs12
+
+ xvmulsp vs42, vs0, vs13
+ xvmulsp vs43, vs1, vs13
+
+ xvmulsp vs44, vs0, vs14
+ xvmulsp vs45, vs1, vs14
+
+ xvmulsp vs46, vs0, vs15
+ xvmulsp vs47, vs1, vs15
+
+
+.endm
+
+.macro KERNEL8x8_SUB1
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+
+ addi AO, AO, 32
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+ xxspltw vs10, vs28, 2
+ xxspltw vs11, vs28, 3
+
+ lxvw4x vs29, o16, BO
+
+ xxspltw vs12, vs29, 0
+ xxspltw vs13, vs29, 1
+ xxspltw vs14, vs29, 2
+ xxspltw vs15, vs29, 3
+
+ addi BO, BO, 32
+
+
+ xvmaddasp vs32, vs0, vs8
+ xvmaddasp vs33, vs1, vs8
+
+ xvmaddasp vs34, vs0, vs9
+ xvmaddasp vs35, vs1, vs9
+
+ xvmaddasp vs36, vs0, vs10
+ xvmaddasp vs37, vs1, vs10
+
+ xvmaddasp vs38, vs0, vs11
+ xvmaddasp vs39, vs1, vs11
+
+ xvmaddasp vs40, vs0, vs12
+ xvmaddasp vs41, vs1, vs12
+
+ xvmaddasp vs42, vs0, vs13
+ xvmaddasp vs43, vs1, vs13
+
+ xvmaddasp vs44, vs0, vs14
+ xvmaddasp vs45, vs1, vs14
+
+ xvmaddasp vs46, vs0, vs15
+ xvmaddasp vs47, vs1, vs15
+
+
+.endm
+
+.macro SAVE8x8
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xvmulsp vs0, vs32, alpha_vr
+ xvmulsp vs1, vs33, alpha_vr
+
+#else
+
+ xvmaddasp vs0, vs32, alpha_vr
+ xvmaddasp vs1, vs33, alpha_vr
+
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xvmulsp vs0, vs34, alpha_vr
+ xvmulsp vs1, vs35, alpha_vr
+
+#else
+
+ xvmaddasp vs0, vs34, alpha_vr
+ xvmaddasp vs1, vs35, alpha_vr
+
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xvmulsp vs0, vs36, alpha_vr
+ xvmulsp vs1, vs37, alpha_vr
+
+#else
+
+ xvmaddasp vs0, vs36, alpha_vr
+ xvmaddasp vs1, vs37, alpha_vr
+
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xvmulsp vs0, vs38, alpha_vr
+ xvmulsp vs1, vs39, alpha_vr
+
+#else
+
+ xvmaddasp vs0, vs38, alpha_vr
+ xvmaddasp vs1, vs39, alpha_vr
+
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xvmulsp vs0, vs40, alpha_vr
+ xvmulsp vs1, vs41, alpha_vr
+
+#else
+
+ xvmaddasp vs0, vs40, alpha_vr
+ xvmaddasp vs1, vs41, alpha_vr
+
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xvmulsp vs0, vs42, alpha_vr
+ xvmulsp vs1, vs43, alpha_vr
+
+#else
+
+ xvmaddasp vs0, vs42, alpha_vr
+ xvmaddasp vs1, vs43, alpha_vr
+
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xvmulsp vs0, vs44, alpha_vr
+ xvmulsp vs1, vs45, alpha_vr
+
+#else
+
+ xvmaddasp vs0, vs44, alpha_vr
+ xvmaddasp vs1, vs45, alpha_vr
+
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xvmulsp vs0, vs46, alpha_vr
+ xvmulsp vs1, vs47, alpha_vr
+
+#else
+
+ xvmaddasp vs0, vs46, alpha_vr
+ xvmaddasp vs1, vs47, alpha_vr
+
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 32
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=8 and M=4
+**********************************************************************************************/
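+
+/* A is a single 4-float vector here, giving one accumulator per B value:
+* vs32..vs39. */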
+
+.macro LOAD8x4_1
+
+ lxvw4x vs0, o0, AO
+
+ addi AO, AO, 16
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+ xxspltw vs10, vs28, 2
+ xxspltw vs11, vs28, 3
+
+ lxvw4x vs29, o16, BO
+
+ xxspltw vs12, vs29, 0
+ xxspltw vs13, vs29, 1
+ xxspltw vs14, vs29, 2
+ xxspltw vs15, vs29, 3
+
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL8x4_I1
+
+
+ lxvw4x vs4, o0, AO
+
+ addi AO, AO, 16
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs16, vs28, 0
+ xxspltw vs17, vs28, 1
+ xxspltw vs18, vs28, 2
+ xxspltw vs19, vs28, 3
+
+ lxvw4x vs29, o16, BO
+
+ xxspltw vs20, vs29, 0
+ xxspltw vs21, vs29, 1
+ xxspltw vs22, vs29, 2
+ xxspltw vs23, vs29, 3
+
+ addi BO, BO, 32
+
+
+ xvmulsp vs32, vs0, vs8
+
+ xvmulsp vs33, vs0, vs9
+
+ xvmulsp vs34, vs0, vs10
+
+ xvmulsp vs35, vs0, vs11
+
+ xvmulsp vs36, vs0, vs12
+
+ xvmulsp vs37, vs0, vs13
+
+ xvmulsp vs38, vs0, vs14
+
+ xvmulsp vs39, vs0, vs15
+
+
+.endm
+
+.macro KERNEL8x4_1
+
+
+ lxvw4x vs4, o0, AO
+
+ addi AO, AO, 16
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs16, vs28, 0
+ xxspltw vs17, vs28, 1
+ xxspltw vs18, vs28, 2
+ xxspltw vs19, vs28, 3
+
+ lxvw4x vs29, o16, BO
+
+ xxspltw vs20, vs29, 0
+ xxspltw vs21, vs29, 1
+ xxspltw vs22, vs29, 2
+ xxspltw vs23, vs29, 3
+
+ addi BO, BO, 32
+
+
+ xvmaddasp vs32, vs0, vs8
+
+ xvmaddasp vs33, vs0, vs9
+
+ xvmaddasp vs34, vs0, vs10
+
+ xvmaddasp vs35, vs0, vs11
+
+ xvmaddasp vs36, vs0, vs12
+
+ xvmaddasp vs37, vs0, vs13
+
+ xvmaddasp vs38, vs0, vs14
+
+ xvmaddasp vs39, vs0, vs15
+
+
+.endm
+
+.macro KERNEL8x4_2
+
+
+ lxvw4x vs0, o0, AO
+
+ addi AO, AO, 16
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+ xxspltw vs10, vs28, 2
+ xxspltw vs11, vs28, 3
+
+ lxvw4x vs29, o16, BO
+
+ xxspltw vs12, vs29, 0
+ xxspltw vs13, vs29, 1
+ xxspltw vs14, vs29, 2
+ xxspltw vs15, vs29, 3
+
+ addi BO, BO, 32
+
+
+ xvmaddasp vs32, vs4, vs16
+
+ xvmaddasp vs33, vs4, vs17
+
+ xvmaddasp vs34, vs4, vs18
+
+ xvmaddasp vs35, vs4, vs19
+
+ xvmaddasp vs36, vs4, vs20
+
+ xvmaddasp vs37, vs4, vs21
+
+ xvmaddasp vs38, vs4, vs22
+
+ xvmaddasp vs39, vs4, vs23
+
+
+.endm
+
+.macro KERNEL8x4_E2
+
+
+ xvmaddasp vs32, vs4, vs16
+
+ xvmaddasp vs33, vs4, vs17
+
+ xvmaddasp vs34, vs4, vs18
+
+ xvmaddasp vs35, vs4, vs19
+
+ xvmaddasp vs36, vs4, vs20
+
+ xvmaddasp vs37, vs4, vs21
+
+ xvmaddasp vs38, vs4, vs22
+
+ xvmaddasp vs39, vs4, vs23
+
+
+.endm
+
+.macro KERNEL8x4_SUBI1
+
+
+ lxvw4x vs0, o0, AO
+
+ addi AO, AO, 16
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+ xxspltw vs10, vs28, 2
+ xxspltw vs11, vs28, 3
+
+ lxvw4x vs29, o16, BO
+
+ xxspltw vs12, vs29, 0
+ xxspltw vs13, vs29, 1
+ xxspltw vs14, vs29, 2
+ xxspltw vs15, vs29, 3
+
+ addi BO, BO, 32
+
+
+ xvmulsp vs32, vs0, vs8
+
+ xvmulsp vs33, vs0, vs9
+
+ xvmulsp vs34, vs0, vs10
+
+ xvmulsp vs35, vs0, vs11
+
+ xvmulsp vs36, vs0, vs12
+
+ xvmulsp vs37, vs0, vs13
+
+ xvmulsp vs38, vs0, vs14
+
+ xvmulsp vs39, vs0, vs15
+
+
+.endm
+
+.macro KERNEL8x4_SUB1
+
+
+ lxvw4x vs0, o0, AO
+
+ addi AO, AO, 16
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+ xxspltw vs10, vs28, 2
+ xxspltw vs11, vs28, 3
+
+ lxvw4x vs29, o16, BO
+
+ xxspltw vs12, vs29, 0
+ xxspltw vs13, vs29, 1
+ xxspltw vs14, vs29, 2
+ xxspltw vs15, vs29, 3
+
+ addi BO, BO, 32
+
+
+ xvmaddasp vs32, vs0, vs8
+
+ xvmaddasp vs33, vs0, vs9
+
+ xvmaddasp vs34, vs0, vs10
+
+ xvmaddasp vs35, vs0, vs11
+
+ xvmaddasp vs36, vs0, vs12
+
+ xvmaddasp vs37, vs0, vs13
+
+ xvmaddasp vs38, vs0, vs14
+
+ xvmaddasp vs39, vs0, vs15
+
+
+.endm
+
+.macro SAVE8x4
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xvmulsp vs0, vs32, alpha_vr
+
+#else
+
+ xvmaddasp vs0, vs32, alpha_vr
+
+#endif
+
+ stxvw4x vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xvmulsp vs0, vs33, alpha_vr
+
+#else
+
+ xvmaddasp vs0, vs33, alpha_vr
+
+#endif
+
+ stxvw4x vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xvmulsp vs0, vs34, alpha_vr
+
+#else
+
+ xvmaddasp vs0, vs34, alpha_vr
+
+#endif
+
+ stxvw4x vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xvmulsp vs0, vs35, alpha_vr
+
+#else
+
+ xvmaddasp vs0, vs35, alpha_vr
+
+#endif
+
+ stxvw4x vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xvmulsp vs0, vs36, alpha_vr
+
+#else
+
+ xvmaddasp vs0, vs36, alpha_vr
+
+#endif
+
+ stxvw4x vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xvmulsp vs0, vs37, alpha_vr
+
+#else
+
+ xvmaddasp vs0, vs37, alpha_vr
+
+#endif
+
+ stxvw4x vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xvmulsp vs0, vs38, alpha_vr
+
+#else
+
+ xvmaddasp vs0, vs38, alpha_vr
+
+#endif
+
+ stxvw4x vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xvmulsp vs0, vs39, alpha_vr
+
+#else
+
+ xvmaddasp vs0, vs39, alpha_vr
+
+#endif
+
+ stxvw4x vs0, o0, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 16
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=8 and M=2
+**********************************************************************************************/
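+
+/* Below the vector width: A and B elements are loaded with the scalar
+* lxsspx and combined with scalar xsmulsp/xsmaddasp, so SAVE8x2 scales
+* with the scalar alpha_r instead of alpha_vr. */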
+
+.macro LOAD8x2_1
+
+ lxsspx vs0, o0, AO
+ lxsspx vs1, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o4, T1
+ lxsspx vs10, o8, T1
+ lxsspx vs11, o12, T1
+
+ addi T1, T1, 16
+
+ lxsspx vs12, o0, T1
+ lxsspx vs13, o4, T1
+ lxsspx vs14, o8, T1
+ lxsspx vs15, o12, T1
+
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL8x2_I1
+
+
+ lxsspx vs4, o0, AO
+ lxsspx vs5, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs16, o0, T1
+ lxsspx vs17, o4, T1
+ lxsspx vs18, o8, T1
+ lxsspx vs19, o12, T1
+
+ addi T1, T1, 16
+
+ lxsspx vs20, o0, T1
+ lxsspx vs21, o4, T1
+ lxsspx vs22, o8, T1
+ lxsspx vs23, o12, T1
+
+ addi BO, BO, 32
+
+
+ xsmulsp vs32, vs0, vs8
+ xsmulsp vs33, vs1, vs8
+
+ xsmulsp vs34, vs0, vs9
+ xsmulsp vs35, vs1, vs9
+
+ xsmulsp vs36, vs0, vs10
+ xsmulsp vs37, vs1, vs10
+
+ xsmulsp vs38, vs0, vs11
+ xsmulsp vs39, vs1, vs11
+
+ xsmulsp vs40, vs0, vs12
+ xsmulsp vs41, vs1, vs12
+
+ xsmulsp vs42, vs0, vs13
+ xsmulsp vs43, vs1, vs13
+
+ xsmulsp vs44, vs0, vs14
+ xsmulsp vs45, vs1, vs14
+
+ xsmulsp vs46, vs0, vs15
+ xsmulsp vs47, vs1, vs15
+
+
+.endm
+
+.macro KERNEL8x2_1
+
+
+ lxsspx vs4, o0, AO
+ lxsspx vs5, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs16, o0, T1
+ lxsspx vs17, o4, T1
+ lxsspx vs18, o8, T1
+ lxsspx vs19, o12, T1
+
+ addi T1, T1, 16
+
+ lxsspx vs20, o0, T1
+ lxsspx vs21, o4, T1
+ lxsspx vs22, o8, T1
+ lxsspx vs23, o12, T1
+
+ addi BO, BO, 32
+
+
+ xsmaddasp vs32, vs0, vs8
+ xsmaddasp vs33, vs1, vs8
+
+ xsmaddasp vs34, vs0, vs9
+ xsmaddasp vs35, vs1, vs9
+
+ xsmaddasp vs36, vs0, vs10
+ xsmaddasp vs37, vs1, vs10
+
+ xsmaddasp vs38, vs0, vs11
+ xsmaddasp vs39, vs1, vs11
+
+ xsmaddasp vs40, vs0, vs12
+ xsmaddasp vs41, vs1, vs12
+
+ xsmaddasp vs42, vs0, vs13
+ xsmaddasp vs43, vs1, vs13
+
+ xsmaddasp vs44, vs0, vs14
+ xsmaddasp vs45, vs1, vs14
+
+ xsmaddasp vs46, vs0, vs15
+ xsmaddasp vs47, vs1, vs15
+
+
+.endm
+
+.macro KERNEL8x2_2
+
+
+ lxsspx vs0, o0, AO
+ lxsspx vs1, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o4, T1
+ lxsspx vs10, o8, T1
+ lxsspx vs11, o12, T1
+
+ addi T1, T1, 16
+
+ lxsspx vs12, o0, T1
+ lxsspx vs13, o4, T1
+ lxsspx vs14, o8, T1
+ lxsspx vs15, o12, T1
+
+ addi BO, BO, 32
+
+
+ xsmaddasp vs32, vs4, vs16
+ xsmaddasp vs33, vs5, vs16
+
+ xsmaddasp vs34, vs4, vs17
+ xsmaddasp vs35, vs5, vs17
+
+ xsmaddasp vs36, vs4, vs18
+ xsmaddasp vs37, vs5, vs18
+
+ xsmaddasp vs38, vs4, vs19
+ xsmaddasp vs39, vs5, vs19
+
+ xsmaddasp vs40, vs4, vs20
+ xsmaddasp vs41, vs5, vs20
+
+ xsmaddasp vs42, vs4, vs21
+ xsmaddasp vs43, vs5, vs21
+
+ xsmaddasp vs44, vs4, vs22
+ xsmaddasp vs45, vs5, vs22
+
+ xsmaddasp vs46, vs4, vs23
+ xsmaddasp vs47, vs5, vs23
+
+
+.endm
+
+.macro KERNEL8x2_E2
+
+
+ xsmaddasp vs32, vs4, vs16
+ xsmaddasp vs33, vs5, vs16
+
+ xsmaddasp vs34, vs4, vs17
+ xsmaddasp vs35, vs5, vs17
+
+ xsmaddasp vs36, vs4, vs18
+ xsmaddasp vs37, vs5, vs18
+
+ xsmaddasp vs38, vs4, vs19
+ xsmaddasp vs39, vs5, vs19
+
+ xsmaddasp vs40, vs4, vs20
+ xsmaddasp vs41, vs5, vs20
+
+ xsmaddasp vs42, vs4, vs21
+ xsmaddasp vs43, vs5, vs21
+
+ xsmaddasp vs44, vs4, vs22
+ xsmaddasp vs45, vs5, vs22
+
+ xsmaddasp vs46, vs4, vs23
+ xsmaddasp vs47, vs5, vs23
+
+
+.endm
+
+.macro KERNEL8x2_SUBI1
+
+
+ lxsspx vs0, o0, AO
+ lxsspx vs1, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o4, T1
+ lxsspx vs10, o8, T1
+ lxsspx vs11, o12, T1
+
+ addi T1, T1, 16
+
+ lxsspx vs12, o0, T1
+ lxsspx vs13, o4, T1
+ lxsspx vs14, o8, T1
+ lxsspx vs15, o12, T1
+
+ addi BO, BO, 32
+
+
+ xsmulsp vs32, vs0, vs8
+ xsmulsp vs33, vs1, vs8
+
+ xsmulsp vs34, vs0, vs9
+ xsmulsp vs35, vs1, vs9
+
+ xsmulsp vs36, vs0, vs10
+ xsmulsp vs37, vs1, vs10
+
+ xsmulsp vs38, vs0, vs11
+ xsmulsp vs39, vs1, vs11
+
+ xsmulsp vs40, vs0, vs12
+ xsmulsp vs41, vs1, vs12
+
+ xsmulsp vs42, vs0, vs13
+ xsmulsp vs43, vs1, vs13
+
+ xsmulsp vs44, vs0, vs14
+ xsmulsp vs45, vs1, vs14
+
+ xsmulsp vs46, vs0, vs15
+ xsmulsp vs47, vs1, vs15
+
+
+.endm
+
+.macro KERNEL8x2_SUB1
+
+
+ lxsspx vs0, o0, AO
+ lxsspx vs1, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o4, T1
+ lxsspx vs10, o8, T1
+ lxsspx vs11, o12, T1
+
+ addi T1, T1, 16
+
+ lxsspx vs12, o0, T1
+ lxsspx vs13, o4, T1
+ lxsspx vs14, o8, T1
+ lxsspx vs15, o12, T1
+
+ addi BO, BO, 32
+
+
+ xsmaddasp vs32, vs0, vs8
+ xsmaddasp vs33, vs1, vs8
+
+ xsmaddasp vs34, vs0, vs9
+ xsmaddasp vs35, vs1, vs9
+
+ xsmaddasp vs36, vs0, vs10
+ xsmaddasp vs37, vs1, vs10
+
+ xsmaddasp vs38, vs0, vs11
+ xsmaddasp vs39, vs1, vs11
+
+ xsmaddasp vs40, vs0, vs12
+ xsmaddasp vs41, vs1, vs12
+
+ xsmaddasp vs42, vs0, vs13
+ xsmaddasp vs43, vs1, vs13
+
+ xsmaddasp vs44, vs0, vs14
+ xsmaddasp vs45, vs1, vs14
+
+ xsmaddasp vs46, vs0, vs15
+ xsmaddasp vs47, vs1, vs15
+
+
+.endm
+
+.macro SAVE8x2
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+ lxsspx vs1, o4, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xsmulsp vs0, vs32, alpha_r
+ xsmulsp vs1, vs33, alpha_r
+
+#else
+
+ xsmaddasp vs0, vs32, alpha_r
+ xsmaddasp vs1, vs33, alpha_r
+
+#endif
+
+ stxsspx vs0, o0, T1
+ stxsspx vs1, o4, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+ lxsspx vs1, o4, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xsmulsp vs0, vs34, alpha_r
+ xsmulsp vs1, vs35, alpha_r
+
+#else
+
+ xsmaddasp vs0, vs34, alpha_r
+ xsmaddasp vs1, vs35, alpha_r
+
+#endif
+
+ stxsspx vs0, o0, T1
+ stxsspx vs1, o4, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+ lxsspx vs1, o4, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xsmulsp vs0, vs36, alpha_r
+ xsmulsp vs1, vs37, alpha_r
+
+#else
+
+ xsmaddasp vs0, vs36, alpha_r
+ xsmaddasp vs1, vs37, alpha_r
+
+#endif
+
+ stxsspx vs0, o0, T1
+ stxsspx vs1, o4, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+ lxsspx vs1, o4, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xsmulsp vs0, vs38, alpha_r
+ xsmulsp vs1, vs39, alpha_r
+
+#else
+
+ xsmaddasp vs0, vs38, alpha_r
+ xsmaddasp vs1, vs39, alpha_r
+
+#endif
+
+ stxsspx vs0, o0, T1
+ stxsspx vs1, o4, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+ lxsspx vs1, o4, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xsmulsp vs0, vs40, alpha_r
+ xsmulsp vs1, vs41, alpha_r
+
+#else
+
+ xsmaddasp vs0, vs40, alpha_r
+ xsmaddasp vs1, vs41, alpha_r
+
+#endif
+
+ stxsspx vs0, o0, T1
+ stxsspx vs1, o4, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+ lxsspx vs1, o4, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xsmulsp vs0, vs42, alpha_r
+ xsmulsp vs1, vs43, alpha_r
+
+#else
+
+ xsmaddasp vs0, vs42, alpha_r
+ xsmaddasp vs1, vs43, alpha_r
+
+#endif
+
+ stxsspx vs0, o0, T1
+ stxsspx vs1, o4, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+ lxsspx vs1, o4, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xsmulsp vs0, vs44, alpha_r
+ xsmulsp vs1, vs45, alpha_r
+
+#else
+
+ xsmaddasp vs0, vs44, alpha_r
+ xsmaddasp vs1, vs45, alpha_r
+
+#endif
+
+ stxsspx vs0, o0, T1
+ stxsspx vs1, o4, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+ lxsspx vs1, o4, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xsmulsp vs0, vs46, alpha_r
+ xsmulsp vs1, vs47, alpha_r
+
+#else
+
+ xsmaddasp vs0, vs46, alpha_r
+ xsmaddasp vs1, vs47, alpha_r
+
+#endif
+
+ stxsspx vs0, o0, T1
+ stxsspx vs1, o4, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 8
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=8 and M=1
+**********************************************************************************************/
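+
+/* One A element per k-step; the eight scalar accumulators are vs32..vs39. */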
+
+.macro LOAD8x1_1
+
+ lxsspx vs0, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o4, T1
+ lxsspx vs10, o8, T1
+ lxsspx vs11, o12, T1
+
+ addi T1, T1, 16
+
+ lxsspx vs12, o0, T1
+ lxsspx vs13, o4, T1
+ lxsspx vs14, o8, T1
+ lxsspx vs15, o12, T1
+
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL8x1_I1
+
+
+ lxsspx vs4, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs16, o0, T1
+ lxsspx vs17, o4, T1
+ lxsspx vs18, o8, T1
+ lxsspx vs19, o12, T1
+
+ addi T1, T1, 16
+
+ lxsspx vs20, o0, T1
+ lxsspx vs21, o4, T1
+ lxsspx vs22, o8, T1
+ lxsspx vs23, o12, T1
+
+ addi BO, BO, 32
+
+
+ xsmulsp vs32, vs0, vs8
+
+ xsmulsp vs33, vs0, vs9
+
+ xsmulsp vs34, vs0, vs10
+
+ xsmulsp vs35, vs0, vs11
+
+ xsmulsp vs36, vs0, vs12
+
+ xsmulsp vs37, vs0, vs13
+
+ xsmulsp vs38, vs0, vs14
+
+ xsmulsp vs39, vs0, vs15
+
+
+.endm
+
+.macro KERNEL8x1_1
+
+
+ lxsspx vs4, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs16, o0, T1
+ lxsspx vs17, o4, T1
+ lxsspx vs18, o8, T1
+ lxsspx vs19, o12, T1
+
+ addi T1, T1, 16
+
+ lxsspx vs20, o0, T1
+ lxsspx vs21, o4, T1
+ lxsspx vs22, o8, T1
+ lxsspx vs23, o12, T1
+
+ addi BO, BO, 32
+
+
+ xsmaddasp vs32, vs0, vs8
+
+ xsmaddasp vs33, vs0, vs9
+
+ xsmaddasp vs34, vs0, vs10
+
+ xsmaddasp vs35, vs0, vs11
+
+ xsmaddasp vs36, vs0, vs12
+
+ xsmaddasp vs37, vs0, vs13
+
+ xsmaddasp vs38, vs0, vs14
+
+ xsmaddasp vs39, vs0, vs15
+
+
+.endm
+
+.macro KERNEL8x1_2
+
+
+ lxsspx vs0, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o4, T1
+ lxsspx vs10, o8, T1
+ lxsspx vs11, o12, T1
+
+ addi T1, T1, 16
+
+ lxsspx vs12, o0, T1
+ lxsspx vs13, o4, T1
+ lxsspx vs14, o8, T1
+ lxsspx vs15, o12, T1
+
+ addi BO, BO, 32
+
+
+ xsmaddasp vs32, vs4, vs16
+
+ xsmaddasp vs33, vs4, vs17
+
+ xsmaddasp vs34, vs4, vs18
+
+ xsmaddasp vs35, vs4, vs19
+
+ xsmaddasp vs36, vs4, vs20
+
+ xsmaddasp vs37, vs4, vs21
+
+ xsmaddasp vs38, vs4, vs22
+
+ xsmaddasp vs39, vs4, vs23
+
+
+.endm
+
+.macro KERNEL8x1_E2
+
+
+ xsmaddasp vs32, vs4, vs16
+
+ xsmaddasp vs33, vs4, vs17
+
+ xsmaddasp vs34, vs4, vs18
+
+ xsmaddasp vs35, vs4, vs19
+
+ xsmaddasp vs36, vs4, vs20
+
+ xsmaddasp vs37, vs4, vs21
+
+ xsmaddasp vs38, vs4, vs22
+
+ xsmaddasp vs39, vs4, vs23
+
+
+.endm
+
+.macro KERNEL8x1_SUBI1
+
+
+ lxsspx vs0, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o4, T1
+ lxsspx vs10, o8, T1
+ lxsspx vs11, o12, T1
+
+ addi T1, T1, 16
+
+ lxsspx vs12, o0, T1
+ lxsspx vs13, o4, T1
+ lxsspx vs14, o8, T1
+ lxsspx vs15, o12, T1
+
+ addi BO, BO, 32
+
+
+ xsmulsp vs32, vs0, vs8
+
+ xsmulsp vs33, vs0, vs9
+
+ xsmulsp vs34, vs0, vs10
+
+ xsmulsp vs35, vs0, vs11
+
+ xsmulsp vs36, vs0, vs12
+
+ xsmulsp vs37, vs0, vs13
+
+ xsmulsp vs38, vs0, vs14
+
+ xsmulsp vs39, vs0, vs15
+
+
+.endm
+
+.macro KERNEL8x1_SUB1
+
+
+ lxsspx vs0, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o4, T1
+ lxsspx vs10, o8, T1
+ lxsspx vs11, o12, T1
+
+ addi T1, T1, 16
+
+ lxsspx vs12, o0, T1
+ lxsspx vs13, o4, T1
+ lxsspx vs14, o8, T1
+ lxsspx vs15, o12, T1
+
+ addi BO, BO, 32
+
+
+ xsmaddasp vs32, vs0, vs8
+
+ xsmaddasp vs33, vs0, vs9
+
+ xsmaddasp vs34, vs0, vs10
+
+ xsmaddasp vs35, vs0, vs11
+
+ xsmaddasp vs36, vs0, vs12
+
+ xsmaddasp vs37, vs0, vs13
+
+ xsmaddasp vs38, vs0, vs14
+
+ xsmaddasp vs39, vs0, vs15
+
+
+.endm
+
+.macro SAVE8x1
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xsmulsp vs0, vs32, alpha_r
+
+#else
+
+ xsmaddasp vs0, vs32, alpha_r
+
+#endif
+
+ stxsspx vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xsmulsp vs0, vs33, alpha_r
+
+#else
+
+ xsmaddasp vs0, vs33, alpha_r
+
+#endif
+
+ stxsspx vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xsmulsp vs0, vs34, alpha_r
+
+#else
+
+ xsmaddasp vs0, vs34, alpha_r
+
+#endif
+
+ stxsspx vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xsmulsp vs0, vs35, alpha_r
+
+#else
+
+ xsmaddasp vs0, vs35, alpha_r
+
+#endif
+
+ stxsspx vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xsmulsp vs0, vs36, alpha_r
+
+#else
+
+ xsmaddasp vs0, vs36, alpha_r
+
+#endif
+
+ stxsspx vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xsmulsp vs0, vs37, alpha_r
+
+#else
+
+ xsmaddasp vs0, vs37, alpha_r
+
+#endif
+
+ stxsspx vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xsmulsp vs0, vs38, alpha_r
+
+#else
+
+ xsmaddasp vs0, vs38, alpha_r
+
+#endif
+
+ stxsspx vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xsmulsp vs0, vs39, alpha_r
+
+#else
+
+ xsmaddasp vs0, vs39, alpha_r
+
+#endif
+
+ stxsspx vs0, o0, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 4
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=16
+**********************************************************************************************/
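+
+/* N=4: each k-step consumes a single 16-byte vector of B (BO advances by
+* 16), so one splat group vs8..vs11 / vs16..vs19 feeds the sixteen
+* accumulators vs32..vs47. */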
+
+.macro LOAD4x16_1
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+ lxvw4x vs2, o32, AO
+ lxvw4x vs3, o48, AO
+
+ addi AO, AO, 64
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+ xxspltw vs10, vs28, 2
+ xxspltw vs11, vs28, 3
+
+ addi BO, BO, 16
+
+.endm
+
+.macro KERNEL4x16_I1
+
+
+ lxvw4x vs4, o0, AO
+ lxvw4x vs5, o16, AO
+ lxvw4x vs6, o32, AO
+ lxvw4x vs7, o48, AO
+
+ addi AO, AO, 64
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs16, vs28, 0
+ xxspltw vs17, vs28, 1
+ xxspltw vs18, vs28, 2
+ xxspltw vs19, vs28, 3
+
+ addi BO, BO, 16
+
+
+ xvmulsp vs32, vs0, vs8
+ xvmulsp vs33, vs1, vs8
+ xvmulsp vs34, vs2, vs8
+ xvmulsp vs35, vs3, vs8
+
+ xvmulsp vs36, vs0, vs9
+ xvmulsp vs37, vs1, vs9
+ xvmulsp vs38, vs2, vs9
+ xvmulsp vs39, vs3, vs9
+
+ xvmulsp vs40, vs0, vs10
+ xvmulsp vs41, vs1, vs10
+ xvmulsp vs42, vs2, vs10
+ xvmulsp vs43, vs3, vs10
+
+ xvmulsp vs44, vs0, vs11
+ xvmulsp vs45, vs1, vs11
+ xvmulsp vs46, vs2, vs11
+ xvmulsp vs47, vs3, vs11
+
+
+.endm
+
+.macro KERNEL4x16_1
+
+
+ lxvw4x vs4, o0, AO
+ lxvw4x vs5, o16, AO
+ lxvw4x vs6, o32, AO
+ lxvw4x vs7, o48, AO
+
+ addi AO, AO, 64
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs16, vs28, 0
+ xxspltw vs17, vs28, 1
+ xxspltw vs18, vs28, 2
+ xxspltw vs19, vs28, 3
+
+ addi BO, BO, 16
+
+
+ xvmaddasp vs32, vs0, vs8
+ xvmaddasp vs33, vs1, vs8
+ xvmaddasp vs34, vs2, vs8
+ xvmaddasp vs35, vs3, vs8
+
+ xvmaddasp vs36, vs0, vs9
+ xvmaddasp vs37, vs1, vs9
+ xvmaddasp vs38, vs2, vs9
+ xvmaddasp vs39, vs3, vs9
+
+ xvmaddasp vs40, vs0, vs10
+ xvmaddasp vs41, vs1, vs10
+ xvmaddasp vs42, vs2, vs10
+ xvmaddasp vs43, vs3, vs10
+
+ xvmaddasp vs44, vs0, vs11
+ xvmaddasp vs45, vs1, vs11
+ xvmaddasp vs46, vs2, vs11
+ xvmaddasp vs47, vs3, vs11
+
+
+.endm
+
+.macro KERNEL4x16_2
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+ lxvw4x vs2, o32, AO
+ lxvw4x vs3, o48, AO
+
+ addi AO, AO, 64
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+ xxspltw vs10, vs28, 2
+ xxspltw vs11, vs28, 3
+
+ addi BO, BO, 16
+
+
+ xvmaddasp vs32, vs4, vs16
+ xvmaddasp vs33, vs5, vs16
+ xvmaddasp vs34, vs6, vs16
+ xvmaddasp vs35, vs7, vs16
+
+ xvmaddasp vs36, vs4, vs17
+ xvmaddasp vs37, vs5, vs17
+ xvmaddasp vs38, vs6, vs17
+ xvmaddasp vs39, vs7, vs17
+
+ xvmaddasp vs40, vs4, vs18
+ xvmaddasp vs41, vs5, vs18
+ xvmaddasp vs42, vs6, vs18
+ xvmaddasp vs43, vs7, vs18
+
+ xvmaddasp vs44, vs4, vs19
+ xvmaddasp vs45, vs5, vs19
+ xvmaddasp vs46, vs6, vs19
+ xvmaddasp vs47, vs7, vs19
+
+
+.endm
+
+.macro KERNEL4x16_E2
+
+
+ xvmaddasp vs32, vs4, vs16
+ xvmaddasp vs33, vs5, vs16
+ xvmaddasp vs34, vs6, vs16
+ xvmaddasp vs35, vs7, vs16
+
+ xvmaddasp vs36, vs4, vs17
+ xvmaddasp vs37, vs5, vs17
+ xvmaddasp vs38, vs6, vs17
+ xvmaddasp vs39, vs7, vs17
+
+ xvmaddasp vs40, vs4, vs18
+ xvmaddasp vs41, vs5, vs18
+ xvmaddasp vs42, vs6, vs18
+ xvmaddasp vs43, vs7, vs18
+
+ xvmaddasp vs44, vs4, vs19
+ xvmaddasp vs45, vs5, vs19
+ xvmaddasp vs46, vs6, vs19
+ xvmaddasp vs47, vs7, vs19
+
+
+.endm
+
+.macro KERNEL4x16_SUBI1
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+ lxvw4x vs2, o32, AO
+ lxvw4x vs3, o48, AO
+
+ addi AO, AO, 64
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+ xxspltw vs10, vs28, 2
+ xxspltw vs11, vs28, 3
+
+ addi BO, BO, 16
+
+
+ xvmulsp vs32, vs0, vs8
+ xvmulsp vs33, vs1, vs8
+ xvmulsp vs34, vs2, vs8
+ xvmulsp vs35, vs3, vs8
+
+ xvmulsp vs36, vs0, vs9
+ xvmulsp vs37, vs1, vs9
+ xvmulsp vs38, vs2, vs9
+ xvmulsp vs39, vs3, vs9
+
+ xvmulsp vs40, vs0, vs10
+ xvmulsp vs41, vs1, vs10
+ xvmulsp vs42, vs2, vs10
+ xvmulsp vs43, vs3, vs10
+
+ xvmulsp vs44, vs0, vs11
+ xvmulsp vs45, vs1, vs11
+ xvmulsp vs46, vs2, vs11
+ xvmulsp vs47, vs3, vs11
+
+
+.endm
+
+.macro KERNEL4x16_SUB1
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+ lxvw4x vs2, o32, AO
+ lxvw4x vs3, o48, AO
+
+ addi AO, AO, 64
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+ xxspltw vs10, vs28, 2
+ xxspltw vs11, vs28, 3
+
+ addi BO, BO, 16
+
+
+ xvmaddasp vs32, vs0, vs8
+ xvmaddasp vs33, vs1, vs8
+ xvmaddasp vs34, vs2, vs8
+ xvmaddasp vs35, vs3, vs8
+
+ xvmaddasp vs36, vs0, vs9
+ xvmaddasp vs37, vs1, vs9
+ xvmaddasp vs38, vs2, vs9
+ xvmaddasp vs39, vs3, vs9
+
+ xvmaddasp vs40, vs0, vs10
+ xvmaddasp vs41, vs1, vs10
+ xvmaddasp vs42, vs2, vs10
+ xvmaddasp vs43, vs3, vs10
+
+ xvmaddasp vs44, vs0, vs11
+ xvmaddasp vs45, vs1, vs11
+ xvmaddasp vs46, vs2, vs11
+ xvmaddasp vs47, vs3, vs11
+
+
+.endm
+
+.macro SAVE4x16
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+ lxvw4x vs2, o32, T1
+ lxvw4x vs3, o48, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xvmulsp vs0, vs32, alpha_vr
+ xvmulsp vs1, vs33, alpha_vr
+ xvmulsp vs2, vs34, alpha_vr
+ xvmulsp vs3, vs35, alpha_vr
+
+#else
+
+ xvmaddasp vs0, vs32, alpha_vr
+ xvmaddasp vs1, vs33, alpha_vr
+ xvmaddasp vs2, vs34, alpha_vr
+ xvmaddasp vs3, vs35, alpha_vr
+
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+ stxvw4x vs2, o32, T1
+ stxvw4x vs3, o48, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+ lxvw4x vs2, o32, T1
+ lxvw4x vs3, o48, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xvmulsp vs0, vs36, alpha_vr
+ xvmulsp vs1, vs37, alpha_vr
+ xvmulsp vs2, vs38, alpha_vr
+ xvmulsp vs3, vs39, alpha_vr
+
+#else
+
+ xvmaddasp vs0, vs36, alpha_vr
+ xvmaddasp vs1, vs37, alpha_vr
+ xvmaddasp vs2, vs38, alpha_vr
+ xvmaddasp vs3, vs39, alpha_vr
+
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+ stxvw4x vs2, o32, T1
+ stxvw4x vs3, o48, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+ lxvw4x vs2, o32, T1
+ lxvw4x vs3, o48, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xvmulsp vs0, vs40, alpha_vr
+ xvmulsp vs1, vs41, alpha_vr
+ xvmulsp vs2, vs42, alpha_vr
+ xvmulsp vs3, vs43, alpha_vr
+
+#else
+
+ xvmaddasp vs0, vs40, alpha_vr
+ xvmaddasp vs1, vs41, alpha_vr
+ xvmaddasp vs2, vs42, alpha_vr
+ xvmaddasp vs3, vs43, alpha_vr
+
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+ stxvw4x vs2, o32, T1
+ stxvw4x vs3, o48, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+ lxvw4x vs2, o32, T1
+ lxvw4x vs3, o48, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xvmulsp vs0, vs44, alpha_vr
+ xvmulsp vs1, vs45, alpha_vr
+ xvmulsp vs2, vs46, alpha_vr
+ xvmulsp vs3, vs47, alpha_vr
+
+#else
+
+ xvmaddasp vs0, vs44, alpha_vr
+ xvmaddasp vs1, vs45, alpha_vr
+ xvmaddasp vs2, vs46, alpha_vr
+ xvmaddasp vs3, vs47, alpha_vr
+
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+ stxvw4x vs2, o32, T1
+ stxvw4x vs3, o48, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 64
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=8
+**********************************************************************************************/
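+
+/* As 4x16 with two A vectors per k-step; accumulators vs32..vs39. */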
+
+.macro LOAD4x8_1
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+
+ addi AO, AO, 32
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+ xxspltw vs10, vs28, 2
+ xxspltw vs11, vs28, 3
+
+ addi BO, BO, 16
+
+.endm
+
+.macro KERNEL4x8_I1
+
+
+ lxvw4x vs4, o0, AO
+ lxvw4x vs5, o16, AO
+
+ addi AO, AO, 32
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs16, vs28, 0
+ xxspltw vs17, vs28, 1
+ xxspltw vs18, vs28, 2
+ xxspltw vs19, vs28, 3
+
+ addi BO, BO, 16
+
+
+ xvmulsp vs32, vs0, vs8
+ xvmulsp vs33, vs1, vs8
+
+ xvmulsp vs34, vs0, vs9
+ xvmulsp vs35, vs1, vs9
+
+ xvmulsp vs36, vs0, vs10
+ xvmulsp vs37, vs1, vs10
+
+ xvmulsp vs38, vs0, vs11
+ xvmulsp vs39, vs1, vs11
+
+
+.endm
+
+.macro KERNEL4x8_1
+
+
+ lxvw4x vs4, o0, AO
+ lxvw4x vs5, o16, AO
+
+ addi AO, AO, 32
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs16, vs28, 0
+ xxspltw vs17, vs28, 1
+ xxspltw vs18, vs28, 2
+ xxspltw vs19, vs28, 3
+
+ addi BO, BO, 16
+
+
+ xvmaddasp vs32, vs0, vs8
+ xvmaddasp vs33, vs1, vs8
+
+ xvmaddasp vs34, vs0, vs9
+ xvmaddasp vs35, vs1, vs9
+
+ xvmaddasp vs36, vs0, vs10
+ xvmaddasp vs37, vs1, vs10
+
+ xvmaddasp vs38, vs0, vs11
+ xvmaddasp vs39, vs1, vs11
+
+
+.endm
+
+.macro KERNEL4x8_2
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+
+ addi AO, AO, 32
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+ xxspltw vs10, vs28, 2
+ xxspltw vs11, vs28, 3
+
+ addi BO, BO, 16
+
+
+ xvmaddasp vs32, vs4, vs16
+ xvmaddasp vs33, vs5, vs16
+
+ xvmaddasp vs34, vs4, vs17
+ xvmaddasp vs35, vs5, vs17
+
+ xvmaddasp vs36, vs4, vs18
+ xvmaddasp vs37, vs5, vs18
+
+ xvmaddasp vs38, vs4, vs19
+ xvmaddasp vs39, vs5, vs19
+
+
+.endm
+
+.macro KERNEL4x8_E2
+
+
+ xvmaddasp vs32, vs4, vs16
+ xvmaddasp vs33, vs5, vs16
+
+ xvmaddasp vs34, vs4, vs17
+ xvmaddasp vs35, vs5, vs17
+
+ xvmaddasp vs36, vs4, vs18
+ xvmaddasp vs37, vs5, vs18
+
+ xvmaddasp vs38, vs4, vs19
+ xvmaddasp vs39, vs5, vs19
+
+
+.endm
+
+.macro KERNEL4x8_SUBI1
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+
+ addi AO, AO, 32
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+ xxspltw vs10, vs28, 2
+ xxspltw vs11, vs28, 3
+
+ addi BO, BO, 16
+
+
+ xvmulsp vs32, vs0, vs8
+ xvmulsp vs33, vs1, vs8
+
+ xvmulsp vs34, vs0, vs9
+ xvmulsp vs35, vs1, vs9
+
+ xvmulsp vs36, vs0, vs10
+ xvmulsp vs37, vs1, vs10
+
+ xvmulsp vs38, vs0, vs11
+ xvmulsp vs39, vs1, vs11
+
+
+.endm
+
+.macro KERNEL4x8_SUB1
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+
+ addi AO, AO, 32
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+ xxspltw vs10, vs28, 2
+ xxspltw vs11, vs28, 3
+
+ addi BO, BO, 16
+
+
+ xvmaddasp vs32, vs0, vs8
+ xvmaddasp vs33, vs1, vs8
+
+ xvmaddasp vs34, vs0, vs9
+ xvmaddasp vs35, vs1, vs9
+
+ xvmaddasp vs36, vs0, vs10
+ xvmaddasp vs37, vs1, vs10
+
+ xvmaddasp vs38, vs0, vs11
+ xvmaddasp vs39, vs1, vs11
+
+
+.endm
+
+.macro SAVE4x8
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xvmulsp vs0, vs32, alpha_vr
+ xvmulsp vs1, vs33, alpha_vr
+
+#else
+
+ xvmaddasp vs0, vs32, alpha_vr
+ xvmaddasp vs1, vs33, alpha_vr
+
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xvmulsp vs0, vs34, alpha_vr
+ xvmulsp vs1, vs35, alpha_vr
+
+#else
+
+ xvmaddasp vs0, vs34, alpha_vr
+ xvmaddasp vs1, vs35, alpha_vr
+
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xvmulsp vs0, vs36, alpha_vr
+ xvmulsp vs1, vs37, alpha_vr
+
+#else
+
+ xvmaddasp vs0, vs36, alpha_vr
+ xvmaddasp vs1, vs37, alpha_vr
+
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xvmulsp vs0, vs38, alpha_vr
+ xvmulsp vs1, vs39, alpha_vr
+
+#else
+
+ xvmaddasp vs0, vs38, alpha_vr
+ xvmaddasp vs1, vs39, alpha_vr
+
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 32
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=4
+**********************************************************************************************/
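+
+/* One vector of A against one splat group of B; accumulators vs32..vs35. */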
+
+.macro LOAD4x4_1
+
+ lxvw4x vs0, o0, AO
+
+ addi AO, AO, 16
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+ xxspltw vs10, vs28, 2
+ xxspltw vs11, vs28, 3
+
+ addi BO, BO, 16
+
+.endm
+
+.macro KERNEL4x4_I1
+
+
+ lxvw4x vs4, o0, AO
+
+ addi AO, AO, 16
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs16, vs28, 0
+ xxspltw vs17, vs28, 1
+ xxspltw vs18, vs28, 2
+ xxspltw vs19, vs28, 3
+
+ addi BO, BO, 16
+
+
+ xvmulsp vs32, vs0, vs8
+
+ xvmulsp vs33, vs0, vs9
+
+ xvmulsp vs34, vs0, vs10
+
+ xvmulsp vs35, vs0, vs11
+
+
+.endm
+
+.macro KERNEL4x4_1
+
+
+ lxvw4x vs4, o0, AO
+
+ addi AO, AO, 16
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs16, vs28, 0
+ xxspltw vs17, vs28, 1
+ xxspltw vs18, vs28, 2
+ xxspltw vs19, vs28, 3
+
+ addi BO, BO, 16
+
+
+ xvmaddasp vs32, vs0, vs8
+
+ xvmaddasp vs33, vs0, vs9
+
+ xvmaddasp vs34, vs0, vs10
+
+ xvmaddasp vs35, vs0, vs11
+
+
+.endm
+
+.macro KERNEL4x4_2
+
+
+ lxvw4x vs0, o0, AO
+
+ addi AO, AO, 16
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+ xxspltw vs10, vs28, 2
+ xxspltw vs11, vs28, 3
+
+ addi BO, BO, 16
+
+
+ xvmaddasp vs32, vs4, vs16
+
+ xvmaddasp vs33, vs4, vs17
+
+ xvmaddasp vs34, vs4, vs18
+
+ xvmaddasp vs35, vs4, vs19
+
+
+.endm
+
+.macro KERNEL4x4_E2
+
+
+ xvmaddasp vs32, vs4, vs16
+
+ xvmaddasp vs33, vs4, vs17
+
+ xvmaddasp vs34, vs4, vs18
+
+ xvmaddasp vs35, vs4, vs19
+
+
+.endm
+
+.macro KERNEL4x4_SUBI1
+
+
+ lxvw4x vs0, o0, AO
+
+ addi AO, AO, 16
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+ xxspltw vs10, vs28, 2
+ xxspltw vs11, vs28, 3
+
+ addi BO, BO, 16
+
+
+ xvmulsp vs32, vs0, vs8
+
+ xvmulsp vs33, vs0, vs9
+
+ xvmulsp vs34, vs0, vs10
+
+ xvmulsp vs35, vs0, vs11
+
+
+.endm
+
+.macro KERNEL4x4_SUB1
+
+
+ lxvw4x vs0, o0, AO
+
+ addi AO, AO, 16
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+ xxspltw vs10, vs28, 2
+ xxspltw vs11, vs28, 3
+
+ addi BO, BO, 16
+
+
+ xvmaddasp vs32, vs0, vs8
+
+ xvmaddasp vs33, vs0, vs9
+
+ xvmaddasp vs34, vs0, vs10
+
+ xvmaddasp vs35, vs0, vs11
+
+
+.endm
+
+.macro SAVE4x4
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xvmulsp vs0, vs32, alpha_vr
+
+#else
+
+ xvmaddasp vs0, vs32, alpha_vr
+
+#endif
+
+ stxvw4x vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xvmulsp vs0, vs33, alpha_vr
+
+#else
+
+ xvmaddasp vs0, vs33, alpha_vr
+
+#endif
+
+ stxvw4x vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xvmulsp vs0, vs34, alpha_vr
+
+#else
+
+ xvmaddasp vs0, vs34, alpha_vr
+
+#endif
+
+ stxvw4x vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xvmulsp vs0, vs35, alpha_vr
+
+#else
+
+ xvmaddasp vs0, vs35, alpha_vr
+
+#endif
+
+ stxvw4x vs0, o0, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 16
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=2
+**********************************************************************************************/
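+
+/* Scalar path, as in the 8x2 case: lxsspx loads, xs* arithmetic, and
+* alpha_r in SAVE4x2. */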
+
+.macro LOAD4x2_1
+
+ lxsspx vs0, o0, AO
+ lxsspx vs1, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o4, T1
+ lxsspx vs10, o8, T1
+ lxsspx vs11, o12, T1
+
+ addi BO, BO, 16
+
+.endm
+
+.macro KERNEL4x2_I1
+
+
+ lxsspx vs4, o0, AO
+ lxsspx vs5, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs16, o0, T1
+ lxsspx vs17, o4, T1
+ lxsspx vs18, o8, T1
+ lxsspx vs19, o12, T1
+
+ addi BO, BO, 16
+
+
+ xsmulsp vs32, vs0, vs8
+ xsmulsp vs33, vs1, vs8
+
+ xsmulsp vs34, vs0, vs9
+ xsmulsp vs35, vs1, vs9
+
+ xsmulsp vs36, vs0, vs10
+ xsmulsp vs37, vs1, vs10
+
+ xsmulsp vs38, vs0, vs11
+ xsmulsp vs39, vs1, vs11
+
+
+.endm
+
+.macro KERNEL4x2_1
+
+
+ lxsspx vs4, o0, AO
+ lxsspx vs5, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs16, o0, T1
+ lxsspx vs17, o4, T1
+ lxsspx vs18, o8, T1
+ lxsspx vs19, o12, T1
+
+ addi BO, BO, 16
+
+
+ xsmaddasp vs32, vs0, vs8
+ xsmaddasp vs33, vs1, vs8
+
+ xsmaddasp vs34, vs0, vs9
+ xsmaddasp vs35, vs1, vs9
+
+ xsmaddasp vs36, vs0, vs10
+ xsmaddasp vs37, vs1, vs10
+
+ xsmaddasp vs38, vs0, vs11
+ xsmaddasp vs39, vs1, vs11
+
+
+.endm
+
+.macro KERNEL4x2_2
+
+
+ lxsspx vs0, o0, AO
+ lxsspx vs1, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o4, T1
+ lxsspx vs10, o8, T1
+ lxsspx vs11, o12, T1
+
+ addi BO, BO, 16
+
+
+ xsmaddasp vs32, vs4, vs16
+ xsmaddasp vs33, vs5, vs16
+
+ xsmaddasp vs34, vs4, vs17
+ xsmaddasp vs35, vs5, vs17
+
+ xsmaddasp vs36, vs4, vs18
+ xsmaddasp vs37, vs5, vs18
+
+ xsmaddasp vs38, vs4, vs19
+ xsmaddasp vs39, vs5, vs19
+
+
+.endm
+
+.macro KERNEL4x2_E2
+
+
+ xsmaddasp vs32, vs4, vs16
+ xsmaddasp vs33, vs5, vs16
+
+ xsmaddasp vs34, vs4, vs17
+ xsmaddasp vs35, vs5, vs17
+
+ xsmaddasp vs36, vs4, vs18
+ xsmaddasp vs37, vs5, vs18
+
+ xsmaddasp vs38, vs4, vs19
+ xsmaddasp vs39, vs5, vs19
+
+
+.endm
+
+.macro KERNEL4x2_SUBI1
+
+
+ lxsspx vs0, o0, AO
+ lxsspx vs1, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o4, T1
+ lxsspx vs10, o8, T1
+ lxsspx vs11, o12, T1
+
+ addi BO, BO, 16
+
+
+ xsmulsp vs32, vs0, vs8
+ xsmulsp vs33, vs1, vs8
+
+ xsmulsp vs34, vs0, vs9
+ xsmulsp vs35, vs1, vs9
+
+ xsmulsp vs36, vs0, vs10
+ xsmulsp vs37, vs1, vs10
+
+ xsmulsp vs38, vs0, vs11
+ xsmulsp vs39, vs1, vs11
+
+
+.endm
+
+.macro KERNEL4x2_SUB1
+
+
+ lxsspx vs0, o0, AO
+ lxsspx vs1, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o4, T1
+ lxsspx vs10, o8, T1
+ lxsspx vs11, o12, T1
+
+ addi BO, BO, 16
+
+
+ xsmaddasp vs32, vs0, vs8
+ xsmaddasp vs33, vs1, vs8
+
+ xsmaddasp vs34, vs0, vs9
+ xsmaddasp vs35, vs1, vs9
+
+ xsmaddasp vs36, vs0, vs10
+ xsmaddasp vs37, vs1, vs10
+
+ xsmaddasp vs38, vs0, vs11
+ xsmaddasp vs39, vs1, vs11
+
+
+.endm
+
+.macro SAVE4x2
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+ lxsspx vs1, o4, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xsmulsp vs0, vs32, alpha_r
+ xsmulsp vs1, vs33, alpha_r
+
+#else
+
+ xsmaddasp vs0, vs32, alpha_r
+ xsmaddasp vs1, vs33, alpha_r
+
+#endif
+
+ stxsspx vs0, o0, T1
+ stxsspx vs1, o4, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+ lxsspx vs1, o4, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xsmulsp vs0, vs34, alpha_r
+ xsmulsp vs1, vs35, alpha_r
+
+#else
+
+ xsmaddasp vs0, vs34, alpha_r
+ xsmaddasp vs1, vs35, alpha_r
+
+#endif
+
+ stxsspx vs0, o0, T1
+ stxsspx vs1, o4, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+ lxsspx vs1, o4, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xsmulsp vs0, vs36, alpha_r
+ xsmulsp vs1, vs37, alpha_r
+
+#else
+
+ xsmaddasp vs0, vs36, alpha_r
+ xsmaddasp vs1, vs37, alpha_r
+
+#endif
+
+ stxsspx vs0, o0, T1
+ stxsspx vs1, o4, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+ lxsspx vs1, o4, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xsmulsp vs0, vs38, alpha_r
+ xsmulsp vs1, vs39, alpha_r
+
+#else
+
+ xsmaddasp vs0, vs38, alpha_r
+ xsmaddasp vs1, vs39, alpha_r
+
+#endif
+
+ stxsspx vs0, o0, T1
+ stxsspx vs1, o4, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 8
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=1
+**********************************************************************************************/
+
+.macro LOAD4x1_1
+
+ lxsspx vs0, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o4, T1
+ lxsspx vs10, o8, T1
+ lxsspx vs11, o12, T1
+
+ addi BO, BO, 16
+
+.endm
+
+.macro KERNEL4x1_I1
+
+
+ lxsspx vs4, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs16, o0, T1
+ lxsspx vs17, o4, T1
+ lxsspx vs18, o8, T1
+ lxsspx vs19, o12, T1
+
+ addi BO, BO, 16
+
+
+ xsmulsp vs32, vs0, vs8
+
+ xsmulsp vs33, vs0, vs9
+
+ xsmulsp vs34, vs0, vs10
+
+ xsmulsp vs35, vs0, vs11
+
+
+.endm
+
+.macro KERNEL4x1_1
+
+
+ lxsspx vs4, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs16, o0, T1
+ lxsspx vs17, o4, T1
+ lxsspx vs18, o8, T1
+ lxsspx vs19, o12, T1
+
+ addi BO, BO, 16
+
+
+ xsmaddasp vs32, vs0, vs8
+
+ xsmaddasp vs33, vs0, vs9
+
+ xsmaddasp vs34, vs0, vs10
+
+ xsmaddasp vs35, vs0, vs11
+
+
+.endm
+
+.macro KERNEL4x1_2
+
+
+ lxsspx vs0, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o4, T1
+ lxsspx vs10, o8, T1
+ lxsspx vs11, o12, T1
+
+ addi BO, BO, 16
+
+
+ xsmaddasp vs32, vs4, vs16
+
+ xsmaddasp vs33, vs4, vs17
+
+ xsmaddasp vs34, vs4, vs18
+
+ xsmaddasp vs35, vs4, vs19
+
+
+.endm
+
+.macro KERNEL4x1_E2
+
+
+ xsmaddasp vs32, vs4, vs16
+
+ xsmaddasp vs33, vs4, vs17
+
+ xsmaddasp vs34, vs4, vs18
+
+ xsmaddasp vs35, vs4, vs19
+
+
+.endm
+
+.macro KERNEL4x1_SUBI1
+
+
+ lxsspx vs0, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o4, T1
+ lxsspx vs10, o8, T1
+ lxsspx vs11, o12, T1
+
+ addi BO, BO, 16
+
+
+ xsmulsp vs32, vs0, vs8
+
+ xsmulsp vs33, vs0, vs9
+
+ xsmulsp vs34, vs0, vs10
+
+ xsmulsp vs35, vs0, vs11
+
+
+.endm
+
+.macro KERNEL4x1_SUB1
+
+
+ lxsspx vs0, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o4, T1
+ lxsspx vs10, o8, T1
+ lxsspx vs11, o12, T1
+
+ addi BO, BO, 16
+
+
+ xsmaddasp vs32, vs0, vs8
+
+ xsmaddasp vs33, vs0, vs9
+
+ xsmaddasp vs34, vs0, vs10
+
+ xsmaddasp vs35, vs0, vs11
+
+
+.endm
+
+.macro SAVE4x1
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xsmulsp vs0, vs32, alpha_r
+
+#else
+
+ xsmaddasp vs0, vs32, alpha_r
+
+#endif
+
+ stxsspx vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xsmulsp vs0, vs33, alpha_r
+
+#else
+
+ xsmaddasp vs0, vs33, alpha_r
+
+#endif
+
+ stxsspx vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xsmulsp vs0, vs34, alpha_r
+
+#else
+
+ xsmaddasp vs0, vs34, alpha_r
+
+#endif
+
+ stxsspx vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xsmulsp vs0, vs35, alpha_r
+
+#else
+
+ xsmaddasp vs0, vs35, alpha_r
+
+#endif
+
+ stxsspx vs0, o0, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 4
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=16
+**********************************************************************************************/
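+
+/* In the vector kernels each lxvw4x pulls four consecutive A floats into
+   one VSX register and xxspltw broadcasts a single B float across all
+   four lanes, so every xvmaddasp advances four rows at once. The whole
+   16x2 tile is roughly (illustrative C):
+
+       void tile2x16(const float *A, const float *B, int K,
+                     float acc0[16], float acc1[16]) {
+           for (int k = 0; k < K; k++)
+               for (int i = 0; i < 16; i++) {
+                   acc0[i] += A[16*k + i] * B[2*k];      // column 0
+                   acc1[i] += A[16*k + i] * B[2*k + 1];  // column 1
+               }
+       }
+*/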
+
+.macro LOAD2x16_1
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+ lxvw4x vs2, o32, AO
+ lxvw4x vs3, o48, AO
+
+ addi AO, AO, 64
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+
+ addi BO, BO, 8
+
+.endm
+
+.macro KERNEL2x16_I1
+
+
+ lxvw4x vs4, o0, AO
+ lxvw4x vs5, o16, AO
+ lxvw4x vs6, o32, AO
+ lxvw4x vs7, o48, AO
+
+ addi AO, AO, 64
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs16, vs28, 0
+ xxspltw vs17, vs28, 1
+
+ addi BO, BO, 8
+
+
+ xvmulsp vs32, vs0, vs8
+ xvmulsp vs33, vs1, vs8
+ xvmulsp vs34, vs2, vs8
+ xvmulsp vs35, vs3, vs8
+
+ xvmulsp vs36, vs0, vs9
+ xvmulsp vs37, vs1, vs9
+ xvmulsp vs38, vs2, vs9
+ xvmulsp vs39, vs3, vs9
+
+
+.endm
+
+.macro KERNEL2x16_1
+
+
+ lxvw4x vs4, o0, AO
+ lxvw4x vs5, o16, AO
+ lxvw4x vs6, o32, AO
+ lxvw4x vs7, o48, AO
+
+ addi AO, AO, 64
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs16, vs28, 0
+ xxspltw vs17, vs28, 1
+
+ addi BO, BO, 8
+
+
+ xvmaddasp vs32, vs0, vs8
+ xvmaddasp vs33, vs1, vs8
+ xvmaddasp vs34, vs2, vs8
+ xvmaddasp vs35, vs3, vs8
+
+ xvmaddasp vs36, vs0, vs9
+ xvmaddasp vs37, vs1, vs9
+ xvmaddasp vs38, vs2, vs9
+ xvmaddasp vs39, vs3, vs9
+
+
+.endm
+
+.macro KERNEL2x16_2
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+ lxvw4x vs2, o32, AO
+ lxvw4x vs3, o48, AO
+
+ addi AO, AO, 64
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+
+ addi BO, BO, 8
+
+
+ xvmaddasp vs32, vs4, vs16
+ xvmaddasp vs33, vs5, vs16
+ xvmaddasp vs34, vs6, vs16
+ xvmaddasp vs35, vs7, vs16
+
+ xvmaddasp vs36, vs4, vs17
+ xvmaddasp vs37, vs5, vs17
+ xvmaddasp vs38, vs6, vs17
+ xvmaddasp vs39, vs7, vs17
+
+
+.endm
+
+.macro KERNEL2x16_E2
+
+
+ xvmaddasp vs32, vs4, vs16
+ xvmaddasp vs33, vs5, vs16
+ xvmaddasp vs34, vs6, vs16
+ xvmaddasp vs35, vs7, vs16
+
+ xvmaddasp vs36, vs4, vs17
+ xvmaddasp vs37, vs5, vs17
+ xvmaddasp vs38, vs6, vs17
+ xvmaddasp vs39, vs7, vs17
+
+
+.endm
+
+.macro KERNEL2x16_SUBI1
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+ lxvw4x vs2, o32, AO
+ lxvw4x vs3, o48, AO
+
+ addi AO, AO, 64
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+
+ addi BO, BO, 8
+
+
+ xvmulsp vs32, vs0, vs8
+ xvmulsp vs33, vs1, vs8
+ xvmulsp vs34, vs2, vs8
+ xvmulsp vs35, vs3, vs8
+
+ xvmulsp vs36, vs0, vs9
+ xvmulsp vs37, vs1, vs9
+ xvmulsp vs38, vs2, vs9
+ xvmulsp vs39, vs3, vs9
+
+
+.endm
+
+.macro KERNEL2x16_SUB1
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+ lxvw4x vs2, o32, AO
+ lxvw4x vs3, o48, AO
+
+ addi AO, AO, 64
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+
+ addi BO, BO, 8
+
+
+ xvmaddasp vs32, vs0, vs8
+ xvmaddasp vs33, vs1, vs8
+ xvmaddasp vs34, vs2, vs8
+ xvmaddasp vs35, vs3, vs8
+
+ xvmaddasp vs36, vs0, vs9
+ xvmaddasp vs37, vs1, vs9
+ xvmaddasp vs38, vs2, vs9
+ xvmaddasp vs39, vs3, vs9
+
+
+.endm
+
+.macro SAVE2x16
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+ lxvw4x vs2, o32, T1
+ lxvw4x vs3, o48, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xvmulsp vs0, vs32, alpha_vr
+ xvmulsp vs1, vs33, alpha_vr
+ xvmulsp vs2, vs34, alpha_vr
+ xvmulsp vs3, vs35, alpha_vr
+
+#else
+
+ xvmaddasp vs0, vs32, alpha_vr
+ xvmaddasp vs1, vs33, alpha_vr
+ xvmaddasp vs2, vs34, alpha_vr
+ xvmaddasp vs3, vs35, alpha_vr
+
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+ stxvw4x vs2, o32, T1
+ stxvw4x vs3, o48, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+ lxvw4x vs2, o32, T1
+ lxvw4x vs3, o48, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xvmulsp vs0, vs36, alpha_vr
+ xvmulsp vs1, vs37, alpha_vr
+ xvmulsp vs2, vs38, alpha_vr
+ xvmulsp vs3, vs39, alpha_vr
+
+#else
+
+ xvmaddasp vs0, vs36, alpha_vr
+ xvmaddasp vs1, vs37, alpha_vr
+ xvmaddasp vs2, vs38, alpha_vr
+ xvmaddasp vs3, vs39, alpha_vr
+
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+ stxvw4x vs2, o32, T1
+ stxvw4x vs3, o48, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 64
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=8
+**********************************************************************************************/
+
+.macro LOAD2x8_1
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+
+ addi AO, AO, 32
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+
+ addi BO, BO, 8
+
+.endm
+
+.macro KERNEL2x8_I1
+
+
+ lxvw4x vs4, o0, AO
+ lxvw4x vs5, o16, AO
+
+ addi AO, AO, 32
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs16, vs28, 0
+ xxspltw vs17, vs28, 1
+
+ addi BO, BO, 8
+
+
+ xvmulsp vs32, vs0, vs8
+ xvmulsp vs33, vs1, vs8
+
+ xvmulsp vs34, vs0, vs9
+ xvmulsp vs35, vs1, vs9
+
+
+.endm
+
+.macro KERNEL2x8_1
+
+
+ lxvw4x vs4, o0, AO
+ lxvw4x vs5, o16, AO
+
+ addi AO, AO, 32
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs16, vs28, 0
+ xxspltw vs17, vs28, 1
+
+ addi BO, BO, 8
+
+
+ xvmaddasp vs32, vs0, vs8
+ xvmaddasp vs33, vs1, vs8
+
+ xvmaddasp vs34, vs0, vs9
+ xvmaddasp vs35, vs1, vs9
+
+
+.endm
+
+.macro KERNEL2x8_2
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+
+ addi AO, AO, 32
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+
+ addi BO, BO, 8
+
+
+ xvmaddasp vs32, vs4, vs16
+ xvmaddasp vs33, vs5, vs16
+
+ xvmaddasp vs34, vs4, vs17
+ xvmaddasp vs35, vs5, vs17
+
+
+.endm
+
+.macro KERNEL2x8_E2
+
+
+ xvmaddasp vs32, vs4, vs16
+ xvmaddasp vs33, vs5, vs16
+
+ xvmaddasp vs34, vs4, vs17
+ xvmaddasp vs35, vs5, vs17
+
+
+.endm
+
+.macro KERNEL2x8_SUBI1
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+
+ addi AO, AO, 32
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+
+ addi BO, BO, 8
+
+
+ xvmulsp vs32, vs0, vs8
+ xvmulsp vs33, vs1, vs8
+
+ xvmulsp vs34, vs0, vs9
+ xvmulsp vs35, vs1, vs9
+
+
+.endm
+
+.macro KERNEL2x8_SUB1
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+
+ addi AO, AO, 32
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+
+ addi BO, BO, 8
+
+
+ xvmaddasp vs32, vs0, vs8
+ xvmaddasp vs33, vs1, vs8
+
+ xvmaddasp vs34, vs0, vs9
+ xvmaddasp vs35, vs1, vs9
+
+
+.endm
+
+.macro SAVE2x8
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xvmulsp vs0, vs32, alpha_vr
+ xvmulsp vs1, vs33, alpha_vr
+
+#else
+
+ xvmaddasp vs0, vs32, alpha_vr
+ xvmaddasp vs1, vs33, alpha_vr
+
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xvmulsp vs0, vs34, alpha_vr
+ xvmulsp vs1, vs35, alpha_vr
+
+#else
+
+ xvmaddasp vs0, vs34, alpha_vr
+ xvmaddasp vs1, vs35, alpha_vr
+
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 32
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=4
+**********************************************************************************************/
+
+.macro LOAD2x4_1
+
+ lxvw4x vs0, o0, AO
+
+ addi AO, AO, 16
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+
+ addi BO, BO, 8
+
+.endm
+
+.macro KERNEL2x4_I1
+
+
+ lxvw4x vs4, o0, AO
+
+ addi AO, AO, 16
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs16, vs28, 0
+ xxspltw vs17, vs28, 1
+
+ addi BO, BO, 8
+
+
+ xvmulsp vs32, vs0, vs8
+
+ xvmulsp vs33, vs0, vs9
+
+
+.endm
+
+.macro KERNEL2x4_1
+
+
+ lxvw4x vs4, o0, AO
+
+ addi AO, AO, 16
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs16, vs28, 0
+ xxspltw vs17, vs28, 1
+
+ addi BO, BO, 8
+
+
+ xvmaddasp vs32, vs0, vs8
+
+ xvmaddasp vs33, vs0, vs9
+
+
+.endm
+
+.macro KERNEL2x4_2
+
+
+ lxvw4x vs0, o0, AO
+
+ addi AO, AO, 16
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+
+ addi BO, BO, 8
+
+
+ xvmaddasp vs32, vs4, vs16
+
+ xvmaddasp vs33, vs4, vs17
+
+
+.endm
+
+.macro KERNEL2x4_E2
+
+
+ xvmaddasp vs32, vs4, vs16
+
+ xvmaddasp vs33, vs4, vs17
+
+
+.endm
+
+.macro KERNEL2x4_SUBI1
+
+
+ lxvw4x vs0, o0, AO
+
+ addi AO, AO, 16
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+
+ addi BO, BO, 8
+
+
+ xvmulsp vs32, vs0, vs8
+
+ xvmulsp vs33, vs0, vs9
+
+
+.endm
+
+.macro KERNEL2x4_SUB1
+
+
+ lxvw4x vs0, o0, AO
+
+ addi AO, AO, 16
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+ xxspltw vs9, vs28, 1
+
+ addi BO, BO, 8
+
+
+ xvmaddasp vs32, vs0, vs8
+
+ xvmaddasp vs33, vs0, vs9
+
+
+.endm
+
+.macro SAVE2x4
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xvmulsp vs0, vs32, alpha_vr
+
+#else
+
+ xvmaddasp vs0, vs32, alpha_vr
+
+#endif
+
+ stxvw4x vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xvmulsp vs0, vs33, alpha_vr
+
+#else
+
+ xvmaddasp vs0, vs33, alpha_vr
+
+#endif
+
+ stxvw4x vs0, o0, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 16
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=2
+**********************************************************************************************/
+
+.macro LOAD2x2_1
+
+ lxsspx vs0, o0, AO
+ lxsspx vs1, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o4, T1
+
+ addi BO, BO, 8
+
+.endm
+
+.macro KERNEL2x2_I1
+
+
+ lxsspx vs4, o0, AO
+ lxsspx vs5, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs16, o0, T1
+ lxsspx vs17, o4, T1
+
+ addi BO, BO, 8
+
+
+ xsmulsp vs32, vs0, vs8
+ xsmulsp vs33, vs1, vs8
+
+ xsmulsp vs34, vs0, vs9
+ xsmulsp vs35, vs1, vs9
+
+
+.endm
+
+.macro KERNEL2x2_1
+
+
+ lxsspx vs4, o0, AO
+ lxsspx vs5, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs16, o0, T1
+ lxsspx vs17, o4, T1
+
+ addi BO, BO, 8
+
+
+ xsmaddasp vs32, vs0, vs8
+ xsmaddasp vs33, vs1, vs8
+
+ xsmaddasp vs34, vs0, vs9
+ xsmaddasp vs35, vs1, vs9
+
+
+.endm
+
+.macro KERNEL2x2_2
+
+
+ lxsspx vs0, o0, AO
+ lxsspx vs1, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o4, T1
+
+ addi BO, BO, 8
+
+
+ xsmaddasp vs32, vs4, vs16
+ xsmaddasp vs33, vs5, vs16
+
+ xsmaddasp vs34, vs4, vs17
+ xsmaddasp vs35, vs5, vs17
+
+
+.endm
+
+.macro KERNEL2x2_E2
+
+
+ xsmaddasp vs32, vs4, vs16
+ xsmaddasp vs33, vs5, vs16
+
+ xsmaddasp vs34, vs4, vs17
+ xsmaddasp vs35, vs5, vs17
+
+
+.endm
+
+.macro KERNEL2x2_SUBI1
+
+
+ lxsspx vs0, o0, AO
+ lxsspx vs1, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o4, T1
+
+ addi BO, BO, 8
+
+
+ xsmulsp vs32, vs0, vs8
+ xsmulsp vs33, vs1, vs8
+
+ xsmulsp vs34, vs0, vs9
+ xsmulsp vs35, vs1, vs9
+
+
+.endm
+
+.macro KERNEL2x2_SUB1
+
+
+ lxsspx vs0, o0, AO
+ lxsspx vs1, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o4, T1
+
+ addi BO, BO, 8
+
+
+ xsmaddasp vs32, vs0, vs8
+ xsmaddasp vs33, vs1, vs8
+
+ xsmaddasp vs34, vs0, vs9
+ xsmaddasp vs35, vs1, vs9
+
+
+.endm
+
+.macro SAVE2x2
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+ lxsspx vs1, o4, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xsmulsp vs0, vs32, alpha_r
+ xsmulsp vs1, vs33, alpha_r
+
+#else
+
+ xsmaddasp vs0, vs32, alpha_r
+ xsmaddasp vs1, vs33, alpha_r
+
+#endif
+
+ stxsspx vs0, o0, T1
+ stxsspx vs1, o4, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+ lxsspx vs1, o4, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xsmulsp vs0, vs34, alpha_r
+ xsmulsp vs1, vs35, alpha_r
+
+#else
+
+ xsmaddasp vs0, vs34, alpha_r
+ xsmaddasp vs1, vs35, alpha_r
+
+#endif
+
+ stxsspx vs0, o0, T1
+ stxsspx vs1, o4, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 8
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=1
+**********************************************************************************************/
+
+.macro LOAD2x1_1
+
+ lxsspx vs0, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o4, T1
+
+ addi BO, BO, 8
+
+.endm
+
+.macro KERNEL2x1_I1
+
+
+ lxsspx vs4, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs16, o0, T1
+ lxsspx vs17, o4, T1
+
+ addi BO, BO, 8
+
+
+ xsmulsp vs32, vs0, vs8
+
+ xsmulsp vs33, vs0, vs9
+
+
+.endm
+
+.macro KERNEL2x1_1
+
+
+ lxsspx vs4, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs16, o0, T1
+ lxsspx vs17, o4, T1
+
+ addi BO, BO, 8
+
+
+ xsmaddasp vs32, vs0, vs8
+
+ xsmaddasp vs33, vs0, vs9
+
+
+.endm
+
+.macro KERNEL2x1_2
+
+
+ lxsspx vs0, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o4, T1
+
+ addi BO, BO, 8
+
+
+ xsmaddasp vs32, vs4, vs16
+
+ xsmaddasp vs33, vs4, vs17
+
+
+.endm
+
+.macro KERNEL2x1_E2
+
+
+ xsmaddasp vs32, vs4, vs16
+
+ xsmaddasp vs33, vs4, vs17
+
+
+.endm
+
+.macro KERNEL2x1_SUBI1
+
+
+ lxsspx vs0, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o4, T1
+
+ addi BO, BO, 8
+
+
+ xsmulsp vs32, vs0, vs8
+
+ xsmulsp vs33, vs0, vs9
+
+
+.endm
+
+.macro KERNEL2x1_SUB1
+
+
+ lxsspx vs0, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+ lxsspx vs9, o4, T1
+
+ addi BO, BO, 8
+
+
+ xsmaddasp vs32, vs0, vs8
+
+ xsmaddasp vs33, vs0, vs9
+
+
+.endm
+
+.macro SAVE2x1
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xsmulsp vs0, vs32, alpha_r
+
+#else
+
+ xsmaddasp vs0, vs32, alpha_r
+
+#endif
+
+ stxsspx vs0, o0, T1
+
+ add T1, T1, LDC
+
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xsmulsp vs0, vs33, alpha_r
+
+#else
+
+ xsmaddasp vs0, vs33, alpha_r
+
+#endif
+
+ stxsspx vs0, o0, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 4
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=16
+**********************************************************************************************/
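+
+/* With N=1 only one B float is consumed per k step: lxvw4x still fetches
+   a full 16-byte quadword, but only word 0 is broadcast and BO advances
+   by just 4 bytes. Roughly (illustrative C):
+
+       void tile1x16(const float *A, const float *B, int K, float acc[16]) {
+           for (int k = 0; k < K; k++)
+               for (int i = 0; i < 16; i++)
+                   acc[i] += A[16*k + i] * B[k];
+       }
+*/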
+
+.macro LOAD1x16_1
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+ lxvw4x vs2, o32, AO
+ lxvw4x vs3, o48, AO
+
+ addi AO, AO, 64
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+
+ addi BO, BO, 4
+
+.endm
+
+.macro KERNEL1x16_I1
+
+
+ lxvw4x vs4, o0, AO
+ lxvw4x vs5, o16, AO
+ lxvw4x vs6, o32, AO
+ lxvw4x vs7, o48, AO
+
+ addi AO, AO, 64
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs16, vs28, 0
+
+ addi BO, BO, 4
+
+
+ xvmulsp vs32, vs0, vs8
+ xvmulsp vs33, vs1, vs8
+ xvmulsp vs34, vs2, vs8
+ xvmulsp vs35, vs3, vs8
+
+
+.endm
+
+.macro KERNEL1x16_1
+
+
+ lxvw4x vs4, o0, AO
+ lxvw4x vs5, o16, AO
+ lxvw4x vs6, o32, AO
+ lxvw4x vs7, o48, AO
+
+ addi AO, AO, 64
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs16, vs28, 0
+
+ addi BO, BO, 4
+
+
+ xvmaddasp vs32, vs0, vs8
+ xvmaddasp vs33, vs1, vs8
+ xvmaddasp vs34, vs2, vs8
+ xvmaddasp vs35, vs3, vs8
+
+
+.endm
+
+.macro KERNEL1x16_2
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+ lxvw4x vs2, o32, AO
+ lxvw4x vs3, o48, AO
+
+ addi AO, AO, 64
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+
+ addi BO, BO, 4
+
+
+ xvmaddasp vs32, vs4, vs16
+ xvmaddasp vs33, vs5, vs16
+ xvmaddasp vs34, vs6, vs16
+ xvmaddasp vs35, vs7, vs16
+
+
+.endm
+
+.macro KERNEL1x16_E2
+
+
+ xvmaddasp vs32, vs4, vs16
+ xvmaddasp vs33, vs5, vs16
+ xvmaddasp vs34, vs6, vs16
+ xvmaddasp vs35, vs7, vs16
+
+
+.endm
+
+.macro KERNEL1x16_SUBI1
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+ lxvw4x vs2, o32, AO
+ lxvw4x vs3, o48, AO
+
+ addi AO, AO, 64
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+
+ addi BO, BO, 4
+
+
+ xvmulsp vs32, vs0, vs8
+ xvmulsp vs33, vs1, vs8
+ xvmulsp vs34, vs2, vs8
+ xvmulsp vs35, vs3, vs8
+
+
+.endm
+
+.macro KERNEL1x16_SUB1
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+ lxvw4x vs2, o32, AO
+ lxvw4x vs3, o48, AO
+
+ addi AO, AO, 64
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+
+ addi BO, BO, 4
+
+
+ xvmaddasp vs32, vs0, vs8
+ xvmaddasp vs33, vs1, vs8
+ xvmaddasp vs34, vs2, vs8
+ xvmaddasp vs35, vs3, vs8
+
+
+.endm
+
+.macro SAVE1x16
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+ lxvw4x vs2, o32, T1
+ lxvw4x vs3, o48, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xvmulsp vs0, vs32, alpha_vr
+ xvmulsp vs1, vs33, alpha_vr
+ xvmulsp vs2, vs34, alpha_vr
+ xvmulsp vs3, vs35, alpha_vr
+
+#else
+
+ xvmaddasp vs0, vs32, alpha_vr
+ xvmaddasp vs1, vs33, alpha_vr
+ xvmaddasp vs2, vs34, alpha_vr
+ xvmaddasp vs3, vs35, alpha_vr
+
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+ stxvw4x vs2, o32, T1
+ stxvw4x vs3, o48, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 64
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=8
+**********************************************************************************************/
+
+.macro LOAD1x8_1
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+
+ addi AO, AO, 32
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+
+ addi BO, BO, 4
+
+.endm
+
+.macro KERNEL1x8_I1
+
+
+ lxvw4x vs4, o0, AO
+ lxvw4x vs5, o16, AO
+
+ addi AO, AO, 32
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs16, vs28, 0
+
+ addi BO, BO, 4
+
+
+ xvmulsp vs32, vs0, vs8
+ xvmulsp vs33, vs1, vs8
+
+
+.endm
+
+.macro KERNEL1x8_1
+
+
+ lxvw4x vs4, o0, AO
+ lxvw4x vs5, o16, AO
+
+ addi AO, AO, 32
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs16, vs28, 0
+
+ addi BO, BO, 4
+
+
+ xvmaddasp vs32, vs0, vs8
+ xvmaddasp vs33, vs1, vs8
+
+
+.endm
+
+.macro KERNEL1x8_2
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+
+ addi AO, AO, 32
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+
+ addi BO, BO, 4
+
+
+ xvmaddasp vs32, vs4, vs16
+ xvmaddasp vs33, vs5, vs16
+
+
+.endm
+
+.macro KERNEL1x8_E2
+
+
+ xvmaddasp vs32, vs4, vs16
+ xvmaddasp vs33, vs5, vs16
+
+
+.endm
+
+.macro KERNEL1x8_SUBI1
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+
+ addi AO, AO, 32
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+
+ addi BO, BO, 4
+
+
+ xvmulsp vs32, vs0, vs8
+ xvmulsp vs33, vs1, vs8
+
+
+.endm
+
+.macro KERNEL1x8_SUB1
+
+
+ lxvw4x vs0, o0, AO
+ lxvw4x vs1, o16, AO
+
+ addi AO, AO, 32
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+
+ addi BO, BO, 4
+
+
+ xvmaddasp vs32, vs0, vs8
+ xvmaddasp vs33, vs1, vs8
+
+
+.endm
+
+.macro SAVE1x8
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+ lxvw4x vs1, o16, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xvmulsp vs0, vs32, alpha_vr
+ xvmulsp vs1, vs33, alpha_vr
+
+#else
+
+ xvmaddasp vs0, vs32, alpha_vr
+ xvmaddasp vs1, vs33, alpha_vr
+
+#endif
+
+ stxvw4x vs0, o0, T1
+ stxvw4x vs1, o16, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 32
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=4
+**********************************************************************************************/
+
+.macro LOAD1x4_1
+
+ lxvw4x vs0, o0, AO
+
+ addi AO, AO, 16
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+
+ addi BO, BO, 4
+
+.endm
+
+.macro KERNEL1x4_I1
+
+
+ lxvw4x vs4, o0, AO
+
+ addi AO, AO, 16
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs16, vs28, 0
+
+ addi BO, BO, 4
+
+
+ xvmulsp vs32, vs0, vs8
+
+
+.endm
+
+.macro KERNEL1x4_1
+
+
+ lxvw4x vs4, o0, AO
+
+ addi AO, AO, 16
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs16, vs28, 0
+
+ addi BO, BO, 4
+
+
+ xvmaddasp vs32, vs0, vs8
+
+
+.endm
+
+.macro KERNEL1x4_2
+
+
+ lxvw4x vs0, o0, AO
+
+ addi AO, AO, 16
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+
+ addi BO, BO, 4
+
+
+ xvmaddasp vs32, vs4, vs16
+
+
+.endm
+
+.macro KERNEL1x4_E2
+
+
+ xvmaddasp vs32, vs4, vs16
+
+
+.endm
+
+.macro KERNEL1x4_SUBI1
+
+
+ lxvw4x vs0, o0, AO
+
+ addi AO, AO, 16
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+
+ addi BO, BO, 4
+
+
+ xvmulsp vs32, vs0, vs8
+
+
+.endm
+
+.macro KERNEL1x4_SUB1
+
+
+ lxvw4x vs0, o0, AO
+
+ addi AO, AO, 16
+
+ lxvw4x vs28, o0, BO
+
+ xxspltw vs8, vs28, 0
+
+ addi BO, BO, 4
+
+
+ xvmaddasp vs32, vs0, vs8
+
+
+.endm
+
+.macro SAVE1x4
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvw4x vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xvmulsp vs0, vs32, alpha_vr
+
+#else
+
+ xvmaddasp vs0, vs32, alpha_vr
+
+#endif
+
+ stxvw4x vs0, o0, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 16
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=2
+**********************************************************************************************/
+
+.macro LOAD1x2_1
+
+ lxsspx vs0, o0, AO
+ lxsspx vs1, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+
+ addi BO, BO, 4
+
+.endm
+
+.macro KERNEL1x2_I1
+
+
+ lxsspx vs4, o0, AO
+ lxsspx vs5, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs16, o0, T1
+
+ addi BO, BO, 4
+
+
+ xsmulsp vs32, vs0, vs8
+ xsmulsp vs33, vs1, vs8
+
+
+.endm
+
+.macro KERNEL1x2_1
+
+
+ lxsspx vs4, o0, AO
+ lxsspx vs5, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs16, o0, T1
+
+ addi BO, BO, 4
+
+
+ xsmaddasp vs32, vs0, vs8
+ xsmaddasp vs33, vs1, vs8
+
+
+.endm
+
+.macro KERNEL1x2_2
+
+
+ lxsspx vs0, o0, AO
+ lxsspx vs1, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+
+ addi BO, BO, 4
+
+
+ xsmaddasp vs32, vs4, vs16
+ xsmaddasp vs33, vs5, vs16
+
+
+.endm
+
+.macro KERNEL1x2_E2
+
+
+ xsmaddasp vs32, vs4, vs16
+ xsmaddasp vs33, vs5, vs16
+
+
+.endm
+
+.macro KERNEL1x2_SUBI1
+
+
+ lxsspx vs0, o0, AO
+ lxsspx vs1, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+
+ addi BO, BO, 4
+
+
+ xsmulsp vs32, vs0, vs8
+ xsmulsp vs33, vs1, vs8
+
+
+.endm
+
+.macro KERNEL1x2_SUB1
+
+
+ lxsspx vs0, o0, AO
+ lxsspx vs1, o4, AO
+
+ addi AO, AO, 8
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+
+ addi BO, BO, 4
+
+
+ xsmaddasp vs32, vs0, vs8
+ xsmaddasp vs33, vs1, vs8
+
+
+.endm
+
+.macro SAVE1x2
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+ lxsspx vs1, o4, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xsmulsp vs0, vs32, alpha_r
+ xsmulsp vs1, vs33, alpha_r
+
+#else
+
+ xsmaddasp vs0, vs32, alpha_r
+ xsmaddasp vs1, vs33, alpha_r
+
+#endif
+
+ stxsspx vs0, o0, T1
+ stxsspx vs1, o4, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 8
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=1
+**********************************************************************************************/
+
+.macro LOAD1x1_1
+
+ lxsspx vs0, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+
+ addi BO, BO, 4
+
+.endm
+
+.macro KERNEL1x1_I1
+
+
+ lxsspx vs4, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs16, o0, T1
+
+ addi BO, BO, 4
+
+
+ xsmulsp vs32, vs0, vs8
+
+
+.endm
+
+.macro KERNEL1x1_1
+
+
+ lxsspx vs4, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs16, o0, T1
+
+ addi BO, BO, 4
+
+
+ xsmaddasp vs32, vs0, vs8
+
+
+.endm
+
+.macro KERNEL1x1_2
+
+
+ lxsspx vs0, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+
+ addi BO, BO, 4
+
+
+ xsmaddasp vs32, vs4, vs16
+
+
+.endm
+
+.macro KERNEL1x1_E2
+
+
+ xsmaddasp vs32, vs4, vs16
+
+
+.endm
+
+.macro KERNEL1x1_SUBI1
+
+
+ lxsspx vs0, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+
+ addi BO, BO, 4
+
+
+ xsmulsp vs32, vs0, vs8
+
+
+.endm
+
+.macro KERNEL1x1_SUB1
+
+
+ lxsspx vs0, o0, AO
+
+ addi AO, AO, 4
+
+ mr T1, BO
+
+ lxsspx vs8, o0, T1
+
+ addi BO, BO, 4
+
+
+ xsmaddasp vs32, vs0, vs8
+
+
+.endm
+
+.macro SAVE1x1
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxsspx vs0, o0, T1
+
+#endif
+
+#ifdef TRMMKERNEL
+
+ xsmulsp vs0, vs32, alpha_r
+
+#else
+
+ xsmaddasp vs0, vs32, alpha_r
+
+#endif
+
+ stxsspx vs0, o0, T1
+
+ add T1, T1, LDC
+
+ addi CO, CO, 4
+
+.endm
+
--- /dev/null
+
+
+
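+/* Driver for the N=8 column panels: J counts panels of eight columns, I
+   counts 16-row tiles within a panel, and the K dimension runs KTEMP/8
+   passes of eight pipelined kernel calls followed by KTEMP & 7 single
+   steps. A rough outline of the control flow (illustrative C, names are
+   not from this file):
+
+       for (int J = N >> 3; J > 0; J--)          // 8-column panels
+           for (int I = M >> 4; I > 0; I--) {    // 16-row tiles
+               for (int L = ktemp >> 3; L > 0; L--)
+                   eight_kernel8x16_calls();     // KERNEL8x16_1/_2 pairs
+               for (int L = ktemp & 7; L > 0; L--)
+                   kernel8x16_sub1();            // K remainder
+               save8x16();
+           }
+*/
+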
+ srawi. J, N, 3
+ ble .LSTRMM_L8_END
+
+.LSTRMM_L8_BEGIN:
+
+ mr CO, C
+ mr AO, A
+ slwi T1, LDC, 3
+ add C, C, T1
+
+#if defined(LEFT)
+ mr KK, OFFSET // OFFSET -> KK
+#endif
+
+ srawi. I, M, 4
+ ble .LSTRMM_L8x16_END
+
+.LSTRMM_L8x16_BEGIN:
+
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 5 // Number of values in B shifted
+ slwi T2, KK, 6 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 16 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 8 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
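+
+/* TRMM bookkeeping: KK marks the diagonal. The block above first steps
+   BO/AO past the KK iterations this tile must not touch (shift counts
+   are values-per-panel times 4 bytes: 8 B floats -> <<5, 16 A floats ->
+   <<6), then computes KTEMP, the number of k iterations that do
+   contribute. Depending on LEFT/TRANSA that is roughly (illustrative C):
+
+       int ktemp = far_side ? K - KK             // rest of the triangle
+                            : KK + (left ? 16 : 8);
+*/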
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LSTRMM_L8x16_SUB0
+ cmpwi cr0, L, 1
+ ble .LSTRMM_L8x16_SUB4
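+
+/* The dcbt AO, PRE hints below touch the cache block PRE bytes ahead of
+   AO, prefetching the next A tile while the current multiply-adds run. */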
+
+.LSTRMM_L8x16_LOOP_START:
+
+ dcbt AO, PRE
+ LOAD8x16_1
+ dcbt AO, PRE
+ KERNEL8x16_I1
+ dcbt AO, PRE
+ KERNEL8x16_2
+ dcbt AO, PRE
+ KERNEL8x16_1
+ dcbt AO, PRE
+ KERNEL8x16_2
+
+ dcbt AO, PRE
+ KERNEL8x16_1
+ dcbt AO, PRE
+ KERNEL8x16_2
+ dcbt AO, PRE
+ KERNEL8x16_1
+ dcbt AO, PRE
+ KERNEL8x16_2
+
+ addic. L, L, -2
+ ble .LSTRMM_L8x16_LOOP_END
+
+ .align 5
+
+.LSTRMM_L8x16_LOOP:
+
+ dcbt AO, PRE
+ KERNEL8x16_1
+ dcbt AO, PRE
+ KERNEL8x16_2
+ dcbt AO, PRE
+ KERNEL8x16_1
+ dcbt AO, PRE
+ KERNEL8x16_2
+
+ dcbt AO, PRE
+ KERNEL8x16_1
+ dcbt AO, PRE
+ KERNEL8x16_2
+ dcbt AO, PRE
+ KERNEL8x16_1
+ dcbt AO, PRE
+ KERNEL8x16_2
+
+ addic. L, L, -1
+ bgt .LSTRMM_L8x16_LOOP
+
+.LSTRMM_L8x16_LOOP_END:
+
+ dcbt AO, PRE
+ KERNEL8x16_1
+ dcbt AO, PRE
+ KERNEL8x16_2
+ dcbt AO, PRE
+ KERNEL8x16_1
+ dcbt AO, PRE
+ KERNEL8x16_2
+
+ dcbt AO, PRE
+ KERNEL8x16_1
+ dcbt AO, PRE
+ KERNEL8x16_2
+ dcbt AO, PRE
+ KERNEL8x16_1
+ KERNEL8x16_E2
+
+ b .LSTRMM_L8x16_SUB1
+
+.LSTRMM_L8x16_SUB4:
+
+ dcbt AO, PRE
+ KERNEL8x16_SUBI1
+ dcbt AO, PRE
+ KERNEL8x16_SUB1
+ dcbt AO, PRE
+ KERNEL8x16_SUB1
+ dcbt AO, PRE
+ KERNEL8x16_SUB1
+
+ KERNEL8x16_SUB1
+ KERNEL8x16_SUB1
+ KERNEL8x16_SUB1
+ KERNEL8x16_SUB1
+
+ b .LSTRMM_L8x16_SUB1
+
+.LSTRMM_L8x16_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL8x16_SUBI1
+
+ addic. L, L, -1
+ ble .LSTRMM_L8x16_SAVE
+ b .LSTRMM_L8x16_SUB2
+
+.LSTRMM_L8x16_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LSTRMM_L8x16_SAVE
+
+.LSTRMM_L8x16_SUB2:
+
+ KERNEL8x16_SUB1
+
+ addic. L, L, -1
+ bgt .LSTRMM_L8x16_SUB2
+
+.LSTRMM_L8x16_SAVE:
+
+ SAVE8x16
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
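+
+/* Post-save fixup: BO and AO are advanced past the K - KKK iterations
+   this tile skipped on the far side of the diagonal, again scaled to
+   bytes by the packed panel widths. Roughly (illustrative C, pointer
+   arithmetic in floats):
+
+       BO += (K - KKK) * 8;     // the asm shifts by 5 for bytes
+       AO += (K - KKK) * 16;    // the asm shifts by 6 for bytes
+*/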
+
+#if defined(LEFT)
+ addi KK, KK, 16 // KK += Number of values in A
+#endif
+
+
+ addic. I, I, -1
+ bgt .LSTRMM_L8x16_BEGIN
+
+.LSTRMM_L8x16_END:
+
+.LSTRMM_L8x8_BEGIN:
+ andi. T2, M, 15
+ ble .LSTRMM_L8x1_END
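+
+/* Tail dispatch: each power-of-two remainder of M is handled in turn,
+   so e.g. M % 16 == 13 runs the 8-, 4- and 1-row kernels back to back:
+
+       if (M & 8) rows8();   // illustrative names
+       if (M & 4) rows4();
+       if (M & 2) rows2();
+       if (M & 1) rows1();
+*/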
+
+ andi. T1, M, 8
+ ble .LSTRMM_L8x8_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 5 // Number of values in B shifted
+ slwi T2, KK, 5 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 8 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LSTRMM_L8x8_SUB0
+ cmpwi cr0, L, 1
+ ble .LSTRMM_L8x8_SUB4
+
+.LSTRMM_L8x8_LOOP_START:
+
+ LOAD8x8_1
+ KERNEL8x8_I1
+ KERNEL8x8_2
+ KERNEL8x8_1
+ KERNEL8x8_2
+
+ KERNEL8x8_1
+ KERNEL8x8_2
+ KERNEL8x8_1
+ KERNEL8x8_2
+
+ addic. L, L, -2
+ ble .LSTRMM_L8x8_LOOP_END
+
+ .align 5
+
+.LSTRMM_L8x8_LOOP:
+
+ KERNEL8x8_1
+ KERNEL8x8_2
+ KERNEL8x8_1
+ KERNEL8x8_2
+
+ KERNEL8x8_1
+ KERNEL8x8_2
+ KERNEL8x8_1
+ KERNEL8x8_2
+
+ addic. L, L, -1
+ bgt .LSTRMM_L8x8_LOOP
+
+.LSTRMM_L8x8_LOOP_END:
+
+ KERNEL8x8_1
+ KERNEL8x8_2
+ KERNEL8x8_1
+ KERNEL8x8_2
+
+ KERNEL8x8_1
+ KERNEL8x8_2
+ KERNEL8x8_1
+ KERNEL8x8_E2
+
+ b .LSTRMM_L8x8_SUB1
+
+.LSTRMM_L8x8_SUB4:
+
+ KERNEL8x8_SUBI1
+ KERNEL8x8_SUB1
+ KERNEL8x8_SUB1
+ KERNEL8x8_SUB1
+
+ KERNEL8x8_SUB1
+ KERNEL8x8_SUB1
+ KERNEL8x8_SUB1
+ KERNEL8x8_SUB1
+
+ b .LSTRMM_L8x8_SUB1
+
+.LSTRMM_L8x8_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL8x8_SUBI1
+
+ addic. L, L, -1
+ ble .LSTRMM_L8x8_SAVE
+ b .LSTRMM_L8x8_SUB2
+
+.LSTRMM_L8x8_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LSTRMM_L8x8_SAVE
+
+.LSTRMM_L8x8_SUB2:
+
+ KERNEL8x8_SUB1
+
+ addic. L, L, -1
+ bgt .LSTRMM_L8x8_SUB2
+
+.LSTRMM_L8x8_SAVE:
+
+ SAVE8x8
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 8 // KK += Number of values in A
+#endif
+
+
+.LSTRMM_L8x8_END:
+
+.LSTRMM_L8x4_BEGIN:
+
+ andi. T1, M, 4
+ ble .LSTRMM_L8x4_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 5 // Number of values in B shifted
+ slwi T2, KK, 4 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 8 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LSTRMM_L8x4_SUB0
+ cmpwi cr0, L, 1
+ ble .LSTRMM_L8x4_SUB4
+
+.LSTRMM_L8x4_LOOP_START:
+
+ LOAD8x4_1
+ KERNEL8x4_I1
+ KERNEL8x4_2
+ KERNEL8x4_1
+ KERNEL8x4_2
+
+ KERNEL8x4_1
+ KERNEL8x4_2
+ KERNEL8x4_1
+ KERNEL8x4_2
+
+ addic. L, L, -2
+ ble .LSTRMM_L8x4_LOOP_END
+
+ .align 5
+
+.LSTRMM_L8x4_LOOP:
+
+ KERNEL8x4_1
+ KERNEL8x4_2
+ KERNEL8x4_1
+ KERNEL8x4_2
+
+ KERNEL8x4_1
+ KERNEL8x4_2
+ KERNEL8x4_1
+ KERNEL8x4_2
+
+ addic. L, L, -1
+ bgt .LSTRMM_L8x4_LOOP
+
+.LSTRMM_L8x4_LOOP_END:
+
+ KERNEL8x4_1
+ KERNEL8x4_2
+ KERNEL8x4_1
+ KERNEL8x4_2
+
+ KERNEL8x4_1
+ KERNEL8x4_2
+ KERNEL8x4_1
+ KERNEL8x4_E2
+
+ b .LSTRMM_L8x4_SUB1
+
+.LSTRMM_L8x4_SUB4:
+
+ KERNEL8x4_SUBI1
+ KERNEL8x4_SUB1
+ KERNEL8x4_SUB1
+ KERNEL8x4_SUB1
+
+ KERNEL8x4_SUB1
+ KERNEL8x4_SUB1
+ KERNEL8x4_SUB1
+ KERNEL8x4_SUB1
+
+ b .LSTRMM_L8x4_SUB1
+
+.LSTRMM_L8x4_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL8x4_SUBI1
+
+ addic. L, L, -1
+ ble .LSTRMM_L8x4_SAVE
+ b .LSTRMM_L8x4_SUB2
+
+.LSTRMM_L8x4_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LSTRMM_L8x4_SAVE
+
+.LSTRMM_L8x4_SUB2:
+
+ KERNEL8x4_SUB1
+
+ addic. L, L, -1
+ bgt .LSTRMM_L8x4_SUB2
+
+.LSTRMM_L8x4_SAVE:
+
+ SAVE8x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 4 // KK += Number of values in A
+#endif
+
+
+.LSTRMM_L8x4_END:
+
+.LSTRMM_L8x2_BEGIN:
+
+ andi. T1, M, 2
+ ble .LSTRMM_L8x2_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 5 // Number of values in B shifted
+ slwi T2, KK, 3 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 8 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LSTRMM_L8x2_SUB0
+ cmpwi cr0, L, 1
+ ble .LSTRMM_L8x2_SUB4
+
+.LSTRMM_L8x2_LOOP_START:
+
+ LOAD8x2_1
+ KERNEL8x2_I1
+ KERNEL8x2_2
+ KERNEL8x2_1
+ KERNEL8x2_2
+
+ KERNEL8x2_1
+ KERNEL8x2_2
+ KERNEL8x2_1
+ KERNEL8x2_2
+
+ addic. L, L, -2
+ ble .LSTRMM_L8x2_LOOP_END
+
+ .align 5
+
+.LSTRMM_L8x2_LOOP:
+
+ KERNEL8x2_1
+ KERNEL8x2_2
+ KERNEL8x2_1
+ KERNEL8x2_2
+
+ KERNEL8x2_1
+ KERNEL8x2_2
+ KERNEL8x2_1
+ KERNEL8x2_2
+
+ addic. L, L, -1
+ bgt .LSTRMM_L8x2_LOOP
+
+.LSTRMM_L8x2_LOOP_END:
+
+ KERNEL8x2_1
+ KERNEL8x2_2
+ KERNEL8x2_1
+ KERNEL8x2_2
+
+ KERNEL8x2_1
+ KERNEL8x2_2
+ KERNEL8x2_1
+ KERNEL8x2_E2
+
+ b .LSTRMM_L8x2_SUB1
+
+.LSTRMM_L8x2_SUB4:
+
+ KERNEL8x2_SUBI1
+ KERNEL8x2_SUB1
+ KERNEL8x2_SUB1
+ KERNEL8x2_SUB1
+
+ KERNEL8x2_SUB1
+ KERNEL8x2_SUB1
+ KERNEL8x2_SUB1
+ KERNEL8x2_SUB1
+
+ b .LSTRMM_L8x2_SUB1
+
+.LSTRMM_L8x2_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL8x2_SUBI1
+
+ addic. L, L, -1
+ ble .LSTRMM_L8x2_SAVE
+ b .LSTRMM_L8x2_SUB2
+
+.LSTRMM_L8x2_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LSTRMM_L8x2_SAVE
+
+.LSTRMM_L8x2_SUB2:
+
+ KERNEL8x2_SUB1
+
+ addic. L, L, -1
+ bgt .LSTRMM_L8x2_SUB2
+
+.LSTRMM_L8x2_SAVE:
+
+ SAVE8x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 2 // KK += Number of values in A
+#endif
+
+
+.LSTRMM_L8x2_END:
+
+.LSTRMM_L8x1_BEGIN:
+
+ andi. T1, M, 1
+ ble .LSTRMM_L8x1_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 5 // Number of values in B shifted
+ slwi T2, KK, 2 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 8 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LSTRMM_L8x1_SUB0
+ cmpwi cr0, L, 1
+ ble .LSTRMM_L8x1_SUB4
+
+.LSTRMM_L8x1_LOOP_START:
+
+ LOAD8x1_1
+ KERNEL8x1_I1
+ KERNEL8x1_2
+ KERNEL8x1_1
+ KERNEL8x1_2
+
+ KERNEL8x1_1
+ KERNEL8x1_2
+ KERNEL8x1_1
+ KERNEL8x1_2
+
+ addic. L, L, -2
+ ble .LSTRMM_L8x1_LOOP_END
+
+ .align 5
+
+.LSTRMM_L8x1_LOOP:
+
+ KERNEL8x1_1
+ KERNEL8x1_2
+ KERNEL8x1_1
+ KERNEL8x1_2
+
+ KERNEL8x1_1
+ KERNEL8x1_2
+ KERNEL8x1_1
+ KERNEL8x1_2
+
+ addic. L, L, -1
+ bgt .LSTRMM_L8x1_LOOP
+
+.LSTRMM_L8x1_LOOP_END:
+
+ KERNEL8x1_1
+ KERNEL8x1_2
+ KERNEL8x1_1
+ KERNEL8x1_2
+
+ KERNEL8x1_1
+ KERNEL8x1_2
+ KERNEL8x1_1
+ KERNEL8x1_E2
+
+ b .LSTRMM_L8x1_SUB1
+
+.LSTRMM_L8x1_SUB4:
+
+ KERNEL8x1_SUBI1
+ KERNEL8x1_SUB1
+ KERNEL8x1_SUB1
+ KERNEL8x1_SUB1
+
+ KERNEL8x1_SUB1
+ KERNEL8x1_SUB1
+ KERNEL8x1_SUB1
+ KERNEL8x1_SUB1
+
+ b .LSTRMM_L8x1_SUB1
+
+.LSTRMM_L8x1_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL8x1_SUBI1
+
+ addic. L, L, -1
+ ble .LSTRMM_L8x1_SAVE
+ b .LSTRMM_L8x1_SUB2
+
+.LSTRMM_L8x1_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LSTRMM_L8x1_SAVE
+
+.LSTRMM_L8x1_SUB2:
+
+ KERNEL8x1_SUB1
+
+ addic. L, L, -1
+ bgt .LSTRMM_L8x1_SUB2
+
+.LSTRMM_L8x1_SAVE:
+
+ SAVE8x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 2 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 1 // KK += Number of values in A
+#endif
+
+
+.LSTRMM_L8x1_END:
+
+ slwi T1, K, 5
+ add B, B, T1
+
+#if !defined(LEFT)
+ addi KK, KK, 8 // KK += Number of values in B
+#endif
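+
+/* End of an 8-column panel: advance B past K * 8 floats (K << 5 bytes)
+   to the next panel; in the right-side case the diagonal offset KK also
+   moves along by the 8 columns just consumed. */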
+
+
+ addic. J, J, -1
+ bgt .LSTRMM_L8_BEGIN
+
+ andi. T2, N, 7
+ ble .L999
+
+.LSTRMM_L8_END:
+
+ b .LSTRMM_L4_BEGIN
+
+.L999_H1:
+
+ b .L999
+
+.LSTRMM_L4_BEGIN:
+
+ andi. T1, N, 4
+ ble .LSTRMM_L4_END
+ mr CO, C
+ mr AO, A
+ slwi T1, LDC, 2
+ add C, C, T1
+
+#if defined(LEFT)
+ mr KK, OFFSET // OFFSET -> KK
+#endif
+
+ srawi. I, M, 4
+ ble .LSTRMM_L4x16_END
+
+.LSTRMM_L4x16_BEGIN:
+
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 4 // Number of values in B shifted
+ slwi T2, KK, 6 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 16 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LSTRMM_L4x16_SUB0
+ cmpwi cr0, L, 1
+ ble .LSTRMM_L4x16_SUB4
+
+.LSTRMM_L4x16_LOOP_START:
+
+ dcbt AO, PRE
+ LOAD4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_I1
+ dcbt AO, PRE
+ KERNEL4x16_2
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+
+ addic. L, L, -2
+ ble .LSTRMM_L4x16_LOOP_END
+
+ .align 5
+
+.LSTRMM_L4x16_LOOP:
+
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+
+ addic. L, L, -1
+ bgt .LSTRMM_L4x16_LOOP
+
+.LSTRMM_L4x16_LOOP_END:
+
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+ dcbt AO, PRE
+ KERNEL4x16_1
+ KERNEL4x16_E2
+
+ b .LSTRMM_L4x16_SUB1
+
+.LSTRMM_L4x16_SUB4:
+
+ dcbt AO, PRE
+ KERNEL4x16_SUBI1
+ dcbt AO, PRE
+ KERNEL4x16_SUB1
+ dcbt AO, PRE
+ KERNEL4x16_SUB1
+ dcbt AO, PRE
+ KERNEL4x16_SUB1
+
+ KERNEL4x16_SUB1
+ KERNEL4x16_SUB1
+ KERNEL4x16_SUB1
+ KERNEL4x16_SUB1
+
+ b .LSTRMM_L4x16_SUB1
+
+.LSTRMM_L4x16_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL4x16_SUBI1
+
+ addic. L, L, -1
+ ble .LSTRMM_L4x16_SAVE
+ b .LSTRMM_L4x16_SUB2
+
+.LSTRMM_L4x16_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LSTRMM_L4x16_SAVE
+
+.LSTRMM_L4x16_SUB2:
+
+ KERNEL4x16_SUB1
+
+ addic. L, L, -1
+ bgt .LSTRMM_L4x16_SUB2
+
+.LSTRMM_L4x16_SAVE:
+
+ SAVE4x16
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 16 // KK += Number of values in A
+#endif
+
+
+ addic. I, I, -1
+ bgt .LSTRMM_L4x16_BEGIN
+
+.LSTRMM_L4x16_END:
+
+.LSTRMM_L4x8_BEGIN:
+ andi. T2, M, 15
+ ble .LSTRMM_L4x1_END
+
+ andi. T1, M, 8
+ ble .LSTRMM_L4x8_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 4 // Number of values in B shifted
+ slwi T2, KK, 5 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LSTRMM_L4x8_SUB0
+ cmpwi cr0, L, 1
+ ble .LSTRMM_L4x8_SUB4
+
+.LSTRMM_L4x8_LOOP_START:
+
+ LOAD4x8_1
+ KERNEL4x8_I1
+ KERNEL4x8_2
+ KERNEL4x8_1
+ KERNEL4x8_2
+
+ KERNEL4x8_1
+ KERNEL4x8_2
+ KERNEL4x8_1
+ KERNEL4x8_2
+
+ addic. L, L, -2
+ ble .LSTRMM_L4x8_LOOP_END
+
+ .align 5
+
+.LSTRMM_L4x8_LOOP:
+
+ KERNEL4x8_1
+ KERNEL4x8_2
+ KERNEL4x8_1
+ KERNEL4x8_2
+
+ KERNEL4x8_1
+ KERNEL4x8_2
+ KERNEL4x8_1
+ KERNEL4x8_2
+
+ addic. L, L, -1
+ bgt .LSTRMM_L4x8_LOOP
+
+.LSTRMM_L4x8_LOOP_END:
+
+ KERNEL4x8_1
+ KERNEL4x8_2
+ KERNEL4x8_1
+ KERNEL4x8_2
+
+ KERNEL4x8_1
+ KERNEL4x8_2
+ KERNEL4x8_1
+ KERNEL4x8_E2
+
+ b .LSTRMM_L4x8_SUB1
+
+.LSTRMM_L4x8_SUB4:
+
+ KERNEL4x8_SUBI1
+ KERNEL4x8_SUB1
+ KERNEL4x8_SUB1
+ KERNEL4x8_SUB1
+
+ KERNEL4x8_SUB1
+ KERNEL4x8_SUB1
+ KERNEL4x8_SUB1
+ KERNEL4x8_SUB1
+
+ b .LSTRMM_L4x8_SUB1
+
+.LSTRMM_L4x8_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL4x8_SUBI1
+
+ addic. L, L, -1
+ ble .LSTRMM_L4x8_SAVE
+ b .LSTRMM_L4x8_SUB2
+
+.LSTRMM_L4x8_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LSTRMM_L4x8_SAVE
+
+.LSTRMM_L4x8_SUB2:
+
+ KERNEL4x8_SUB1
+
+ addic. L, L, -1
+ bgt .LSTRMM_L4x8_SUB2
+
+.LSTRMM_L4x8_SAVE:
+
+ SAVE4x8
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 8 // KK += Number of values in A
+#endif
+
+
+.LSTRMM_L4x8_END:
+
+.LSTRMM_L4x4_BEGIN:
+
+ andi. T1, M, 4
+ ble .LSTRMM_L4x4_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 4 // Number of values in B shifted
+ slwi T2, KK, 4 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LSTRMM_L4x4_SUB0
+ cmpwi cr0, L, 1
+ ble .LSTRMM_L4x4_SUB4
+
+.LSTRMM_L4x4_LOOP_START:
+
+ LOAD4x4_1
+ KERNEL4x4_I1
+ KERNEL4x4_2
+ KERNEL4x4_1
+ KERNEL4x4_2
+
+ KERNEL4x4_1
+ KERNEL4x4_2
+ KERNEL4x4_1
+ KERNEL4x4_2
+
+ addic. L, L, -2
+ ble .LSTRMM_L4x4_LOOP_END
+
+ .align 5
+
+.LSTRMM_L4x4_LOOP:
+
+ KERNEL4x4_1
+ KERNEL4x4_2
+ KERNEL4x4_1
+ KERNEL4x4_2
+
+ KERNEL4x4_1
+ KERNEL4x4_2
+ KERNEL4x4_1
+ KERNEL4x4_2
+
+ addic. L, L, -1
+ bgt .LSTRMM_L4x4_LOOP
+
+.LSTRMM_L4x4_LOOP_END:
+
+ KERNEL4x4_1
+ KERNEL4x4_2
+ KERNEL4x4_1
+ KERNEL4x4_2
+
+ KERNEL4x4_1
+ KERNEL4x4_2
+ KERNEL4x4_1
+ KERNEL4x4_E2
+
+ b .LSTRMM_L4x4_SUB1
+
+.LSTRMM_L4x4_SUB4:
+
+ KERNEL4x4_SUBI1
+ KERNEL4x4_SUB1
+ KERNEL4x4_SUB1
+ KERNEL4x4_SUB1
+
+ KERNEL4x4_SUB1
+ KERNEL4x4_SUB1
+ KERNEL4x4_SUB1
+ KERNEL4x4_SUB1
+
+ b .LSTRMM_L4x4_SUB1
+
+.LSTRMM_L4x4_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL4x4_SUBI1
+
+ addic. L, L, -1
+ ble .LSTRMM_L4x4_SAVE
+ b .LSTRMM_L4x4_SUB2
+
+.LSTRMM_L4x4_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LSTRMM_L4x4_SAVE
+
+.LSTRMM_L4x4_SUB2:
+
+ KERNEL4x4_SUB1
+
+ addic. L, L, -1
+ bgt .LSTRMM_L4x4_SUB2
+
+.LSTRMM_L4x4_SAVE:
+
+ SAVE4x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 4 // KK += Number of values in A
+#endif
+
+
+.LSTRMM_L4x4_END:
+
+.LSTRMM_L4x2_BEGIN:
+
+ andi. T1, M, 2
+ ble .LSTRMM_L4x2_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 4 // Number of values in B shifted
+ slwi T2, KK, 3 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LSTRMM_L4x2_SUB0
+ cmpwi cr0, L, 1
+ ble .LSTRMM_L4x2_SUB4
+
+.LSTRMM_L4x2_LOOP_START:
+
+ LOAD4x2_1
+ KERNEL4x2_I1
+ KERNEL4x2_2
+ KERNEL4x2_1
+ KERNEL4x2_2
+
+ KERNEL4x2_1
+ KERNEL4x2_2
+ KERNEL4x2_1
+ KERNEL4x2_2
+
+ addic. L, L, -2
+ ble .LSTRMM_L4x2_LOOP_END
+
+ .align 5
+
+.LSTRMM_L4x2_LOOP:
+
+ KERNEL4x2_1
+ KERNEL4x2_2
+ KERNEL4x2_1
+ KERNEL4x2_2
+
+ KERNEL4x2_1
+ KERNEL4x2_2
+ KERNEL4x2_1
+ KERNEL4x2_2
+
+ addic. L, L, -1
+ bgt .LSTRMM_L4x2_LOOP
+
+.LSTRMM_L4x2_LOOP_END:
+
+ KERNEL4x2_1
+ KERNEL4x2_2
+ KERNEL4x2_1
+ KERNEL4x2_2
+
+ KERNEL4x2_1
+ KERNEL4x2_2
+ KERNEL4x2_1
+ KERNEL4x2_E2
+
+ b .LSTRMM_L4x2_SUB1
+
+.LSTRMM_L4x2_SUB4:
+
+ KERNEL4x2_SUBI1
+ KERNEL4x2_SUB1
+ KERNEL4x2_SUB1
+ KERNEL4x2_SUB1
+
+ KERNEL4x2_SUB1
+ KERNEL4x2_SUB1
+ KERNEL4x2_SUB1
+ KERNEL4x2_SUB1
+
+ b .LSTRMM_L4x2_SUB1
+
+.LSTRMM_L4x2_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL4x2_SUBI1
+
+ addic. L, L, -1
+ ble .LSTRMM_L4x2_SAVE
+ b .LSTRMM_L4x2_SUB2
+
+.LSTRMM_L4x2_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LSTRMM_L4x2_SAVE
+
+.LSTRMM_L4x2_SUB2:
+
+ KERNEL4x2_SUB1
+
+ addic. L, L, -1
+ bgt .LSTRMM_L4x2_SUB2
+
+.LSTRMM_L4x2_SAVE:
+
+ SAVE4x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 2 // KK += Number of values in A
+#endif
+
+
+.LSTRMM_L4x2_END:
+
+.LSTRMM_L4x1_BEGIN:
+
+ andi. T1, M, 1
+ ble .LSTRMM_L4x1_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 4 // Number of values in B shifted
+ slwi T2, KK, 2 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LSTRMM_L4x1_SUB0
+ cmpwi cr0, L, 1
+ ble .LSTRMM_L4x1_SUB4
+
+.LSTRMM_L4x1_LOOP_START:
+
+ LOAD4x1_1
+ KERNEL4x1_I1
+ KERNEL4x1_2
+ KERNEL4x1_1
+ KERNEL4x1_2
+
+ KERNEL4x1_1
+ KERNEL4x1_2
+ KERNEL4x1_1
+ KERNEL4x1_2
+
+ addic. L, L, -2
+ ble .LSTRMM_L4x1_LOOP_END
+
+ .align 5
+
+.LSTRMM_L4x1_LOOP:
+
+ KERNEL4x1_1
+ KERNEL4x1_2
+ KERNEL4x1_1
+ KERNEL4x1_2
+
+ KERNEL4x1_1
+ KERNEL4x1_2
+ KERNEL4x1_1
+ KERNEL4x1_2
+
+ addic. L, L, -1
+ bgt .LSTRMM_L4x1_LOOP
+
+.LSTRMM_L4x1_LOOP_END:
+
+ KERNEL4x1_1
+ KERNEL4x1_2
+ KERNEL4x1_1
+ KERNEL4x1_2
+
+ KERNEL4x1_1
+ KERNEL4x1_2
+ KERNEL4x1_1
+ KERNEL4x1_E2
+
+ b .LSTRMM_L4x1_SUB1
+
+.LSTRMM_L4x1_SUB4:
+
+ KERNEL4x1_SUBI1
+ KERNEL4x1_SUB1
+ KERNEL4x1_SUB1
+ KERNEL4x1_SUB1
+
+ KERNEL4x1_SUB1
+ KERNEL4x1_SUB1
+ KERNEL4x1_SUB1
+ KERNEL4x1_SUB1
+
+ b .LSTRMM_L4x1_SUB1
+
+.LSTRMM_L4x1_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL4x1_SUBI1
+
+ addic. L, L, -1
+ ble .LSTRMM_L4x1_SAVE
+ b .LSTRMM_L4x1_SUB2
+
+.LSTRMM_L4x1_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LSTRMM_L4x1_SAVE
+
+.LSTRMM_L4x1_SUB2:
+
+ KERNEL4x1_SUB1
+
+ addic. L, L, -1
+ bgt .LSTRMM_L4x1_SUB2
+
+.LSTRMM_L4x1_SAVE:
+
+ SAVE4x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 2 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 1 // KK += Number of values in A
+#endif
+
+
+.LSTRMM_L4x1_END:
+
+ slwi T1, K, 4
+ add B, B, T1
+
+#if !defined(LEFT)
+ addi KK, KK, 4 // KK += Number of values in B
+#endif
+
+
+.LSTRMM_L4_END:
+.LSTRMM_L2_BEGIN:
+
+ andi. T1, N, 2
+ ble .LSTRMM_L2_END
+ mr CO, C
+ mr AO, A
+ slwi T1, LDC, 1
+ add C, C, T1
+
+#if defined(LEFT)
+ mr KK, OFFSET // OFFSET -> KK
+#endif
+
+ srawi. I, M, 4
+ ble .LSTRMM_L2x16_END
+
+.LSTRMM_L2x16_BEGIN:
+
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 3 // Number of values in B shifted
+ slwi T2, KK, 6 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 16 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LSTRMM_L2x16_SUB0
+ cmpwi cr0, L, 1
+ ble .LSTRMM_L2x16_SUB4
+
+.LSTRMM_L2x16_LOOP_START:
+
+ dcbt AO, PRE
+ LOAD2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_I1
+ dcbt AO, PRE
+ KERNEL2x16_2
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+
+ addic. L, L, -2
+ ble .LSTRMM_L2x16_LOOP_END
+
+ .align 5
+
+.LSTRMM_L2x16_LOOP:
+
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+
+ addic. L, L, -1
+ bgt .LSTRMM_L2x16_LOOP
+
+.LSTRMM_L2x16_LOOP_END:
+
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+ dcbt AO, PRE
+ KERNEL2x16_1
+ KERNEL2x16_E2
+
+ b .LSTRMM_L2x16_SUB1
+
+.LSTRMM_L2x16_SUB4:
+
+ dcbt AO, PRE
+ KERNEL2x16_SUBI1
+ dcbt AO, PRE
+ KERNEL2x16_SUB1
+ dcbt AO, PRE
+ KERNEL2x16_SUB1
+ dcbt AO, PRE
+ KERNEL2x16_SUB1
+
+ KERNEL2x16_SUB1
+ KERNEL2x16_SUB1
+ KERNEL2x16_SUB1
+ KERNEL2x16_SUB1
+
+ b .LSTRMM_L2x16_SUB1
+
+.LSTRMM_L2x16_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL2x16_SUBI1
+
+ addic. L, L, -1
+ ble .LSTRMM_L2x16_SAVE
+ b .LSTRMM_L2x16_SUB2
+
+.LSTRMM_L2x16_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LSTRMM_L2x16_SAVE
+
+.LSTRMM_L2x16_SUB2:
+
+ KERNEL2x16_SUB1
+
+ addic. L, L, -1
+ bgt .LSTRMM_L2x16_SUB2
+
+.LSTRMM_L2x16_SAVE:
+
+ SAVE2x16
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 3 // TEMP1 * 2 values in B * 4 bytes -> TEMP2
+ slwi T1, T1, 6 // TEMP1 * 16 values in A * 4 bytes -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 (B values already consumed)
+ add AO, AO, T1 // AO += TEMP1 (A values already consumed)
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 16 // KK += Number of values in A
+#endif
+
+
+ addic. I, I, -1
+ bgt .LSTRMM_L2x16_BEGIN
+
+.LSTRMM_L2x16_END:
+
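+/**********************************************************************************************
+* M tails for the 2-column panel: if (M & 15) == 0 nothing remains and control skips to
+* .LSTRMM_L2x1_END; otherwise each of the 8/4/2/1-row cases below tests one bit of M.
+**********************************************************************************************/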
+.LSTRMM_L2x8_BEGIN:
+ andi. T2, M, 15
+ ble .LSTRMM_L2x1_END
+
+ andi. T1, M, 8
+ ble .LSTRMM_L2x8_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 3 // Number of values in B shifted
+ slwi T2, KK, 5 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LSTRMM_L2x8_SUB0
+ cmpwi cr0, L, 1
+ ble .LSTRMM_L2x8_SUB4
+
+.LSTRMM_L2x8_LOOP_START:
+
+ LOAD2x8_1
+ KERNEL2x8_I1
+ KERNEL2x8_2
+ KERNEL2x8_1
+ KERNEL2x8_2
+
+ KERNEL2x8_1
+ KERNEL2x8_2
+ KERNEL2x8_1
+ KERNEL2x8_2
+
+ addic. L, L, -2
+ ble .LSTRMM_L2x8_LOOP_END
+
+ .align 5
+
+.LSTRMM_L2x8_LOOP:
+
+ KERNEL2x8_1
+ KERNEL2x8_2
+ KERNEL2x8_1
+ KERNEL2x8_2
+
+ KERNEL2x8_1
+ KERNEL2x8_2
+ KERNEL2x8_1
+ KERNEL2x8_2
+
+ addic. L, L, -1
+ bgt .LSTRMM_L2x8_LOOP
+
+.LSTRMM_L2x8_LOOP_END:
+
+ KERNEL2x8_1
+ KERNEL2x8_2
+ KERNEL2x8_1
+ KERNEL2x8_2
+
+ KERNEL2x8_1
+ KERNEL2x8_2
+ KERNEL2x8_1
+ KERNEL2x8_E2
+
+ b .LSTRMM_L2x8_SUB1
+
+.LSTRMM_L2x8_SUB4:
+
+ KERNEL2x8_SUBI1
+ KERNEL2x8_SUB1
+ KERNEL2x8_SUB1
+ KERNEL2x8_SUB1
+
+ KERNEL2x8_SUB1
+ KERNEL2x8_SUB1
+ KERNEL2x8_SUB1
+ KERNEL2x8_SUB1
+
+ b .LSTRMM_L2x8_SUB1
+
+.LSTRMM_L2x8_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL2x8_SUBI1
+
+ addic. L, L, -1
+ ble .LSTRMM_L2x8_SAVE
+ b .LSTRMM_L2x8_SUB2
+
+.LSTRMM_L2x8_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LSTRMM_L2x8_SAVE
+
+.LSTRMM_L2x8_SUB2:
+
+ KERNEL2x8_SUB1
+
+ addic. L, L, -1
+ bgt .LSTRMM_L2x8_SUB2
+
+.LSTRMM_L2x8_SAVE:
+
+ SAVE2x8
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 3 // TEMP1 * 2 values in B * 4 bytes -> TEMP2
+ slwi T1, T1, 5 // TEMP1 * 8 values in A * 4 bytes -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 (B values already consumed)
+ add AO, AO, T1 // AO += TEMP1 (A values already consumed)
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 8 // KK += Number of values in A
+#endif
+
+
+.LSTRMM_L2x8_END:
+
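+/**********************************************************************************************
+* The 2x4, 2x2 and 2x1 tails below repeat the 2x8 structure at narrower widths; only the
+* kernel macros, pointer shifts and KK increments change.
+**********************************************************************************************/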
+.LSTRMM_L2x4_BEGIN:
+
+ andi. T1, M, 4
+ ble .LSTRMM_L2x4_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 3 // Number of values in B shifted
+ slwi T2, KK, 4 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LSTRMM_L2x4_SUB0
+ cmpwi cr0, L, 1
+ ble .LSTRMM_L2x4_SUB4
+
+.LSTRMM_L2x4_LOOP_START:
+
+ LOAD2x4_1
+ KERNEL2x4_I1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ addic. L, L, -2
+ ble .LSTRMM_L2x4_LOOP_END
+
+ .align 5
+
+.LSTRMM_L2x4_LOOP:
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ addic. L, L, -1
+ bgt .LSTRMM_L2x4_LOOP
+
+.LSTRMM_L2x4_LOOP_END:
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_E2
+
+ b .LSTRMM_L2x4_SUB1
+
+.LSTRMM_L2x4_SUB4:
+
+ KERNEL2x4_SUBI1
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+
+ b .LSTRMM_L2x4_SUB1
+
+.LSTRMM_L2x4_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL2x4_SUBI1
+
+ addic. L, L, -1
+ ble .LSTRMM_L2x4_SAVE
+ b .LSTRMM_L2x4_SUB2
+
+.LSTRMM_L2x4_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LSTRMM_L2x4_SAVE
+
+.LSTRMM_L2x4_SUB2:
+
+ KERNEL2x4_SUB1
+
+ addic. L, L, -1
+ bgt .LSTRMM_L2x4_SUB2
+
+.LSTRMM_L2x4_SAVE:
+
+ SAVE2x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 3 // TEMP1 * 2 values in B * 4 bytes -> TEMP2
+ slwi T1, T1, 4 // TEMP1 * 4 values in A * 4 bytes -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 (B values already consumed)
+ add AO, AO, T1 // AO += TEMP1 (A values already consumed)
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 4 // KK += Number of values in A
+#endif
+
+
+.LSTRMM_L2x4_END:
+
+.LSTRMM_L2x2_BEGIN:
+
+ andi. T1, M, 2
+ ble .LSTRMM_L2x2_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 3 // Number of values in B shifted
+ slwi T2, KK, 3 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LSTRMM_L2x2_SUB0
+ cmpwi cr0, L, 1
+ ble .LSTRMM_L2x2_SUB4
+
+.LSTRMM_L2x2_LOOP_START:
+
+ LOAD2x2_1
+ KERNEL2x2_I1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ addic. L, L, -2
+ ble .LSTRMM_L2x2_LOOP_END
+
+ .align 5
+
+.LSTRMM_L2x2_LOOP:
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ addic. L, L, -1
+ bgt .LSTRMM_L2x2_LOOP
+
+.LSTRMM_L2x2_LOOP_END:
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_E2
+
+ b .LSTRMM_L2x2_SUB1
+
+.LSTRMM_L2x2_SUB4:
+
+ KERNEL2x2_SUBI1
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+
+ b .LSTRMM_L2x2_SUB1
+
+.LSTRMM_L2x2_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL2x2_SUBI1
+
+ addic. L, L, -1
+ ble .LSTRMM_L2x2_SAVE
+ b .LSTRMM_L2x2_SUB2
+
+.LSTRMM_L2x2_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LSTRMM_L2x2_SAVE
+
+.LSTRMM_L2x2_SUB2:
+
+ KERNEL2x2_SUB1
+
+ addic. L, L, -1
+ bgt .LSTRMM_L2x2_SUB2
+
+.LSTRMM_L2x2_SAVE:
+
+ SAVE2x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 3 // TEMP1 * 2 values in B * 4 bytes -> TEMP2
+ slwi T1, T1, 3 // TEMP1 * 2 values in A * 4 bytes -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 (B values already consumed)
+ add AO, AO, T1 // AO += TEMP1 (A values already consumed)
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 2 // KK += Number of values in A
+#endif
+
+
+.LSTRMM_L2x2_END:
+
+.LSTRMM_L2x1_BEGIN:
+
+ andi. T1, M, 1
+ ble .LSTRMM_L2x1_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 3 // Number of values in B shifted
+ slwi T2, KK, 2 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LSTRMM_L2x1_SUB0
+ cmpwi cr0, L, 1
+ ble .LSTRMM_L2x1_SUB4
+
+.LSTRMM_L2x1_LOOP_START:
+
+ LOAD2x1_1
+ KERNEL2x1_I1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ addic. L, L, -2
+ ble .LSTRMM_L2x1_LOOP_END
+
+ .align 5
+
+.LSTRMM_L2x1_LOOP:
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ addic. L, L, -1
+ bgt .LSTRMM_L2x1_LOOP
+
+.LSTRMM_L2x1_LOOP_END:
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_E2
+
+ b .LSTRMM_L2x1_SUB1
+
+.LSTRMM_L2x1_SUB4:
+
+ KERNEL2x1_SUBI1
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+
+ b .LSTRMM_L2x1_SUB1
+
+.LSTRMM_L2x1_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL2x1_SUBI1
+
+ addic. L, L, -1
+ ble .LSTRMM_L2x1_SAVE
+ b .LSTRMM_L2x1_SUB2
+
+.LSTRMM_L2x1_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LSTRMM_L2x1_SAVE
+
+.LSTRMM_L2x1_SUB2:
+
+ KERNEL2x1_SUB1
+
+ addic. L, L, -1
+ bgt .LSTRMM_L2x1_SUB2
+
+.LSTRMM_L2x1_SAVE:
+
+ SAVE2x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 3 // TEMP1 * 2 values in B * 4 bytes -> TEMP2
+ slwi T1, T1, 2 // TEMP1 * 1 value in A * 4 bytes -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 (B values already consumed)
+ add AO, AO, T1 // AO += TEMP1 (A values already consumed)
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 1 // KK += Number of values in A
+#endif
+
+
+.LSTRMM_L2x1_END:
+
+ slwi T1, K, 3 // K * 2 values in B * 4 bytes -> T1
+ add B, B, T1 // advance B past the finished 2-column panel
+
+#if !defined(LEFT)
+ addi KK, KK, 2 // KK += Number of values in B
+#endif
+
+
+.LSTRMM_L2_END:
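+/**********************************************************************************************
+* N=1 panel: the last result column. C is not advanced here, apparently because no further
+* panel follows.
+**********************************************************************************************/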
+.LSTRMM_L1_BEGIN:
+
+ andi. T1, N, 1
+ ble .LSTRMM_L1_END
+ mr CO, C
+ mr AO, A
+
+#if defined(LEFT)
+ mr KK, OFFSET // OFFSET -> KK
+#endif
+
+ srawi. I, M, 4
+ ble .LSTRMM_L1x16_END
+
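+/**********************************************************************************************
+* 1x16 blocks: same structure as the 2x16 case, with one B value per K iteration and the
+* same dcbt prefetching of the A stream.
+**********************************************************************************************/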
+.LSTRMM_L1x16_BEGIN:
+
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 2 // Number of values in B shifted
+ slwi T2, KK, 6 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 16 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LSTRMM_L1x16_SUB0
+ cmpwi cr0, L, 1
+ ble .LSTRMM_L1x16_SUB4
+
+.LSTRMM_L1x16_LOOP_START:
+
+ dcbt AO, PRE
+ LOAD1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_I1
+ dcbt AO, PRE
+ KERNEL1x16_2
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+
+ addic. L, L, -2
+ ble .LSTRMM_L1x16_LOOP_END
+
+ .align 5
+
+.LSTRMM_L1x16_LOOP:
+
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+
+ addic. L, L, -1
+ bgt .LSTRMM_L1x16_LOOP
+
+.LSTRMM_L1x16_LOOP_END:
+
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+ dcbt AO, PRE
+ KERNEL1x16_1
+ KERNEL1x16_E2
+
+ b .LSTRMM_L1x16_SUB1
+
+.LSTRMM_L1x16_SUB4:
+
+ dcbt AO, PRE
+ KERNEL1x16_SUBI1
+ dcbt AO, PRE
+ KERNEL1x16_SUB1
+ dcbt AO, PRE
+ KERNEL1x16_SUB1
+ dcbt AO, PRE
+ KERNEL1x16_SUB1
+
+ KERNEL1x16_SUB1
+ KERNEL1x16_SUB1
+ KERNEL1x16_SUB1
+ KERNEL1x16_SUB1
+
+ b .LSTRMM_L1x16_SUB1
+
+.LSTRMM_L1x16_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL1x16_SUBI1
+
+ addic. L, L, -1
+ ble .LSTRMM_L1x16_SAVE
+ b .LSTRMM_L1x16_SUB2
+
+.LSTRMM_L1x16_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LSTRMM_L1x16_SAVE
+
+.LSTRMM_L1x16_SUB2:
+
+ KERNEL1x16_SUB1
+
+ addic. L, L, -1
+ bgt .LSTRMM_L1x16_SUB2
+
+.LSTRMM_L1x16_SAVE:
+
+ SAVE1x16
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 2 // TEMP1 * 1 value in B * 4 bytes -> TEMP2
+ slwi T1, T1, 6 // TEMP1 * 16 values in A * 4 bytes -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 (B values already consumed)
+ add AO, AO, T1 // AO += TEMP1 (A values already consumed)
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 16 // KK += Number of values in A
+#endif
+
+
+ addic. I, I, -1
+ bgt .LSTRMM_L1x16_BEGIN
+
+.LSTRMM_L1x16_END:
+
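+/**********************************************************************************************
+* M tails for the single-column panel: skip to .LSTRMM_L1x1_END when (M & 15) == 0,
+* otherwise fall through the 8/4/2/1-row cases.
+**********************************************************************************************/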
+.LSTRMM_L1x8_BEGIN:
+ andi. T2, M, 15
+ ble .LSTRMM_L1x1_END
+
+ andi. T1, M, 8
+ ble .LSTRMM_L1x8_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 2 // Number of values in B shifted
+ slwi T2, KK, 5 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LSTRMM_L1x8_SUB0
+ cmpwi cr0, L, 1
+ ble .LSTRMM_L1x8_SUB4
+
+.LSTRMM_L1x8_LOOP_START:
+
+ LOAD1x8_1
+ KERNEL1x8_I1
+ KERNEL1x8_2
+ KERNEL1x8_1
+ KERNEL1x8_2
+
+ KERNEL1x8_1
+ KERNEL1x8_2
+ KERNEL1x8_1
+ KERNEL1x8_2
+
+ addic. L, L, -2
+ ble .LSTRMM_L1x8_LOOP_END
+
+ .align 5
+
+.LSTRMM_L1x8_LOOP:
+
+ KERNEL1x8_1
+ KERNEL1x8_2
+ KERNEL1x8_1
+ KERNEL1x8_2
+
+ KERNEL1x8_1
+ KERNEL1x8_2
+ KERNEL1x8_1
+ KERNEL1x8_2
+
+ addic. L, L, -1
+ bgt .LSTRMM_L1x8_LOOP
+
+.LSTRMM_L1x8_LOOP_END:
+
+ KERNEL1x8_1
+ KERNEL1x8_2
+ KERNEL1x8_1
+ KERNEL1x8_2
+
+ KERNEL1x8_1
+ KERNEL1x8_2
+ KERNEL1x8_1
+ KERNEL1x8_E2
+
+ b .LSTRMM_L1x8_SUB1
+
+.LSTRMM_L1x8_SUB4:
+
+ KERNEL1x8_SUBI1
+ KERNEL1x8_SUB1
+ KERNEL1x8_SUB1
+ KERNEL1x8_SUB1
+
+ KERNEL1x8_SUB1
+ KERNEL1x8_SUB1
+ KERNEL1x8_SUB1
+ KERNEL1x8_SUB1
+
+ b .LSTRMM_L1x8_SUB1
+
+.LSTRMM_L1x8_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL1x8_SUBI1
+
+ addic. L, L, -1
+ ble .LSTRMM_L1x8_SAVE
+ b .LSTRMM_L1x8_SUB2
+
+.LSTRMM_L1x8_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LSTRMM_L1x8_SAVE
+
+.LSTRMM_L1x8_SUB2:
+
+ KERNEL1x8_SUB1
+
+ addic. L, L, -1
+ bgt .LSTRMM_L1x8_SUB2
+
+.LSTRMM_L1x8_SAVE:
+
+ SAVE1x8
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 2 // TEMP1 * 1 value in B * 4 bytes -> TEMP2
+ slwi T1, T1, 5 // TEMP1 * 8 values in A * 4 bytes -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 (B values already consumed)
+ add AO, AO, T1 // AO += TEMP1 (A values already consumed)
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 8 // KK += Number of values in A
+#endif
+
+
+.LSTRMM_L1x8_END:
+
+.LSTRMM_L1x4_BEGIN:
+
+ andi. T1, M, 4
+ ble .LSTRMM_L1x4_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 2 // Number of values in B shifted
+ slwi T2, KK, 4 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LSTRMM_L1x4_SUB0
+ cmpwi cr0, L, 1
+ ble .LSTRMM_L1x4_SUB4
+
+.LSTRMM_L1x4_LOOP_START:
+
+ LOAD1x4_1
+ KERNEL1x4_I1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ addic. L, L, -2
+ ble .LSTRMM_L1x4_LOOP_END
+
+ .align 5
+
+.LSTRMM_L1x4_LOOP:
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ addic. L, L, -1
+ bgt .LSTRMM_L1x4_LOOP
+
+.LSTRMM_L1x4_LOOP_END:
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_E2
+
+ b .LSTRMM_L1x4_SUB1
+
+.LSTRMM_L1x4_SUB4:
+
+ KERNEL1x4_SUBI1
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+
+ b .LSTRMM_L1x4_SUB1
+
+.LSTRMM_L1x4_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL1x4_SUBI1
+
+ addic. L, L, -1
+ ble .LSTRMM_L1x4_SAVE
+ b .LSTRMM_L1x4_SUB2
+
+.LSTRMM_L1x4_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LSTRMM_L1x4_SAVE
+
+.LSTRMM_L1x4_SUB2:
+
+ KERNEL1x4_SUB1
+
+ addic. L, L, -1
+ bgt .LSTRMM_L1x4_SUB2
+
+.LSTRMM_L1x4_SAVE:
+
+ SAVE1x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 2 // TEMP1 * 1 value in B * 4 bytes -> TEMP2
+ slwi T1, T1, 4 // TEMP1 * 4 values in A * 4 bytes -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 (B values already consumed)
+ add AO, AO, T1 // AO += TEMP1 (A values already consumed)
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 4 // KK += Number of values in A
+#endif
+
+
+.LSTRMM_L1x4_END:
+
+.LSTRMM_L1x2_BEGIN:
+
+ andi. T1, M, 2
+ ble .LSTRMM_L1x2_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 2 // Number of values in B shifted
+ slwi T2, KK, 3 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LSTRMM_L1x2_SUB0
+ cmpwi cr0, L, 1
+ ble .LSTRMM_L1x2_SUB4
+
+.LSTRMM_L1x2_LOOP_START:
+
+ LOAD1x2_1
+ KERNEL1x2_I1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ addic. L, L, -2
+ ble .LSTRMM_L1x2_LOOP_END
+
+ .align 5
+
+.LSTRMM_L1x2_LOOP:
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ addic. L, L, -1
+ bgt .LSTRMM_L1x2_LOOP
+
+.LSTRMM_L1x2_LOOP_END:
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_E2
+
+ b .LSTRMM_L1x2_SUB1
+
+.LSTRMM_L1x2_SUB4:
+
+ KERNEL1x2_SUBI1
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+
+ b .LSTRMM_L1x2_SUB1
+
+.LSTRMM_L1x2_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL1x2_SUBI1
+
+ addic. L, L, -1
+ ble .LSTRMM_L1x2_SAVE
+ b .LSTRMM_L1x2_SUB2
+
+.LSTRMM_L1x2_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LSTRMM_L1x2_SAVE
+
+.LSTRMM_L1x2_SUB2:
+
+ KERNEL1x2_SUB1
+
+ addic. L, L, -1
+ bgt .LSTRMM_L1x2_SUB2
+
+.LSTRMM_L1x2_SAVE:
+
+ SAVE1x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 2 // TEMP1 * 1 value in B * 4 bytes -> TEMP2
+ slwi T1, T1, 3 // TEMP1 * 2 values in A * 4 bytes -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 (B values already consumed)
+ add AO, AO, T1 // AO += TEMP1 (A values already consumed)
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 2 // KK += Number of values in A
+#endif
+
+
+.LSTRMM_L1x2_END:
+
+.LSTRMM_L1x1_BEGIN:
+
+ andi. T1, M, 1
+ ble .LSTRMM_L1x1_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 2 // Number of values in B shifted
+ slwi T2, KK, 2 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LSTRMM_L1x1_SUB0
+ cmpwi cr0, L, 1
+ ble .LSTRMM_L1x1_SUB4
+
+.LSTRMM_L1x1_LOOP_START:
+
+ LOAD1x1_1
+ KERNEL1x1_I1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ addic. L, L, -2
+ ble .LSTRMM_L1x1_LOOP_END
+
+ .align 5
+
+.LSTRMM_L1x1_LOOP:
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ addic. L, L, -1
+ bgt .LSTRMM_L1x1_LOOP
+
+.LSTRMM_L1x1_LOOP_END:
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_E2
+
+ b .LSTRMM_L1x1_SUB1
+
+.LSTRMM_L1x1_SUB4:
+
+ KERNEL1x1_SUBI1
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+
+ b .LSTRMM_L1x1_SUB1
+
+.LSTRMM_L1x1_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL1x1_SUBI1
+
+ addic. L, L, -1
+ ble .LSTRMM_L1x1_SAVE
+ b .LSTRMM_L1x1_SUB2
+
+.LSTRMM_L1x1_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LSTRMM_L1x1_SAVE
+
+.LSTRMM_L1x1_SUB2:
+
+ KERNEL1x1_SUB1
+
+ addic. L, L, -1
+ bgt .LSTRMM_L1x1_SUB2
+
+.LSTRMM_L1x1_SAVE:
+
+ SAVE1x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 2 // TEMP1 * 1 value in B * 4 bytes -> TEMP2
+ slwi T1, T1, 2 // TEMP1 * 1 value in A * 4 bytes -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 (B values already consumed)
+ add AO, AO, T1 // AO += TEMP1 (A values already consumed)
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 1 // KK += Number of values in A
+#endif
+
+
+.LSTRMM_L1x1_END:
+
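+/**********************************************************************************************
+* End of the N=1 panel. Unlike the N=4 and N=2 panels there is no "advance B" step here,
+* presumably because this is the last panel and B is not read again.
+**********************************************************************************************/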
+#if !defined(LEFT)
+ addi KK, KK, 1 // KK += Number of values in B
+#endif
+
+
+.LSTRMM_L1_END: