--- /dev/null
+/***************************************************************************\r
+Copyright (c) 2013-2019 The OpenBLAS Project\r
+All rights reserved.\r
+Redistribution and use in source and binary forms, with or without\r
+modification, are permitted provided that the following conditions are\r
+met:\r
+1. Redistributions of source code must retain the above copyright\r
+notice, this list of conditions and the following disclaimer.\r
+2. Redistributions in binary form must reproduce the above copyright\r
+notice, this list of conditions and the following disclaimer in\r
+the documentation and/or other materials provided with the\r
+distribution.\r
+3. Neither the name of the OpenBLAS project nor the names of\r
+its contributors may be used to endorse or promote products\r
+derived from this software without specific prior written permission.\r
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"\r
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\r
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE\r
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE\r
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\r
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\r
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\r
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\r
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE\r
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
+*****************************************************************************/\r
+ \r
+\r
+#define MY_ALIGN .align 3\r
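+/* MY_ALIGN pads to a 2^3 = 8-byte boundary so branch targets in the hot loops stay aligned */\r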
+\r
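+/* Right-sided TRMM (!LEFT) starts with off = -offset; TEMP_REG carries it into the REFRESH_* macros */\r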
+#if defined(TRMMKERNEL) && !defined(LEFT)\r
+ neg TEMP_REG, OFFSET \r
+#endif\r
+\r
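+	/* J = N/4: number of 4-column panels of B/C */\r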
+ srawi. J, N, 2\r
+ ble LDGEMM_L4_END\r
+\r
+LDGEMM_L4_BEGIN:\r
+\r
+ \r
+ li T1, 128\r
+ li T2, 256\r
+ \r
+ mr AO, A\r
+ mr CO, C\r
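+	/* CO walks this panel's output; C itself advances by 4 columns, i.e. 4*LDC bytes (LDC is assumed to be pre-scaled to bytes) */\r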
+ slwi T3, LDC , 2\r
+ add C, C, T3\r
+\r
+ \r
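+	/* touch the first two cache lines of the A panel (A+128, A+256) before streaming it */\r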
+ dcbt A, T1\r
+ dcbt A, T2\r
+#if defined(TRMMKERNEL) && defined(LEFT)\r
+ mr TEMP_REG, OFFSET /*off = offset;*/\r
+#endif \r
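+	/* I = M/16: number of 16-row tiles in this panel */\r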
+ srawi. I, M, 4\r
+ ble LDGEMM_L4x16_END\r
+\r
+ MY_ALIGN\r
+LDGEMM_L4x16_BEGIN:\r
+\r
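+	/* L = -128 is a mask: the four C column pointers are rounded down to 128-byte lines below, apparently to pre-touch the store targets */\r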
+ li L, -128\r
+\r
+\r
+ SAVE4x16_REGS\r
+\r
+#if defined(TRMMKERNEL)\r
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,16,4\r
+#else\r
+ mr BO, B\r
+#endif \r
+ \r
+ and T1, CO, L\r
+ and T2, C2, L\r
+ and T3, C3, L\r
+ and T4, C4, L\r
+\r
+ dcbt T1, r0\r
+ dcbt T2, r0\r
+ dcbt T3, r0\r
+ dcbt T4, r0\r
+ \r
+\r
+ addi T1, T1, 128\r
+ addi T2, T2, 128\r
+ addi T3, T3, 128\r
+ addi T4, T4, 128\r
+\r
+ dcbt T1, r0\r
+ dcbt T2, r0\r
+ dcbt T3, r0\r
+ dcbt T4, r0\r
+\r
+#if defined(TRMMKERNEL)\r
+ REFRESH_TEMP_BK T3,K,TEMP_REG,16,4\r
+ srawi. L, T3, 5\r
+#else\r
+ srawi. L, K, 5\r
+#endif \r
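+	/* L = K/32: the inner loop below is unrolled to 32 k-steps (16 macro calls x 2 steps each) */\r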
+ \r
+ ble LDGEMM_L4x16_SUB0\r
+\r
+\r
+ MY_ALIGN\r
+LDGEMM_L4x16_LOOP_START:\r
+\r
+ li T2, 512\r
+ \r
+ \r
+ LOAD4x16_1\r
+ ##OffsetA=128 OffsetB=32\r
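+	/* bias AO by 2048 on top of the 128 bytes just loaded, so the kernel calls below can address A with negative displacements (OffsetA = -2048) */\r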
+ addi AO,AO,2176\r
+ # addi BO,BO,32 \r
+ addic. L, L, -1\r
+\r
+ ble LDGEMM_L4x16_LOOP_END\r
+\r
+ \r
+ mtctr L\r
+\r
+ MY_ALIGN\r
+\r
+LDGEMM_L4x16_LOOP:\r
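+	/* 16 software-pipelined kernel calls, 2 k-steps each: 32 k-steps per bdnz */\r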
+\r
+ #dcbt AO, PRE\r
+ KERNEL4x16_I1_L2_2 -2048,32, 0,0\r
+ KERNEL4x16_I1_L2_2 -2048,32, 1,0\r
+ KERNEL4x16_I1_L2_2 -2048,32, 2,0\r
+ KERNEL4x16_I1_L2_2 -2048,32, 3,0\r
+ KERNEL4x16_I1_L2_2 -2048,32, 4,0\r
+ KERNEL4x16_I1_L2_2 -2048,32, 5,0 \r
+ KERNEL4x16_I1_L2_2 -2048,32, 6,0\r
+ KERNEL4x16_I1_L2_2 -2048,32, 7,0 \r
+ KERNEL4x16_I1_L2_2 -2048,32, 8,0 \r
+ KERNEL4x16_I1_L2_2 -2048,32, 9,0\r
+ KERNEL4x16_I1_L2_2 -2048,32, 10,0\r
+ KERNEL4x16_I1_L2_2 -2048,32, 11,0\r
+ KERNEL4x16_I1_L2_2 -2048,32, 12,0\r
+ KERNEL4x16_I1_L2_2 -2048,32, 13,0 \r
+ KERNEL4x16_I1_L2_2 -2048,32, 14,0 \r
+ KERNEL4x16_I1_L2_2 -2048,32, 15,1 \r
+\r
+\r
+ bdnz LDGEMM_L4x16_LOOP\r
+\r
+	MY_ALIGN\r
+LDGEMM_L4x16_LOOP_END:\r
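+	/* peeled final block: same body as the loop, but the closing .._3 call (Complete=1) stops preloading and advances AO/BO */\r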
+\r
+ KERNEL4x16_I1_L2_2 -2048,32, 0,0\r
+ KERNEL4x16_I1_L2_2 -2048,32, 1,0\r
+ KERNEL4x16_I1_L2_2 -2048,32, 2,0\r
+ KERNEL4x16_I1_L2_2 -2048,32, 3,0\r
+ KERNEL4x16_I1_L2_2 -2048,32, 4,0\r
+ KERNEL4x16_I1_L2_2 -2048,32, 5,0 \r
+ KERNEL4x16_I1_L2_2 -2048,32, 6,0\r
+ KERNEL4x16_I1_L2_2 -2048,32, 7,0 \r
+ KERNEL4x16_I1_L2_2 -2048,32, 8,0 \r
+ KERNEL4x16_I1_L2_2 -2048,32, 9,0\r
+ KERNEL4x16_I1_L2_2 -2048,32, 10,0\r
+ KERNEL4x16_I1_L2_2 -2048,32, 11,0\r
+ KERNEL4x16_I1_L2_2 -2048,32, 12,0\r
+ KERNEL4x16_I1_L2_2 -2048,32, 13,0 \r
+ KERNEL4x16_I1_L2_2 -2048,32, 14,0 \r
+ KERNEL4x16_I1_L2_3 -2048,32, 15,1 \r
+ b LDGEMM_L4x16_SUB1\r
+\r
+\r
+ MY_ALIGN\r
+LDGEMM_L4x16_SUB0:\r
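+	/* K < 32: seed the accumulators with one First=1 k-step, then finish the remainder below */\r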
+#if defined(TRMMKERNEL)\r
+ andi. L, T3, 31\r
+#else\r
+ andi. L, K, 31\r
+#endif\r
+ KERNEL4x16 1\r
+\r
+ addic. L, L, -1\r
+ ble LDGEMM_L4x16_SAVE\r
+ b LDGEMM_L4x16_SUB2\r
+ MY_ALIGN\r
+LDGEMM_L4x16_SUB1:\r
+#if defined(TRMMKERNEL)\r
+ andi. L, T3, 31\r
+#else\r
+ andi. L, K, 31\r
+#endif \r
+ ble LDGEMM_L4x16_SAVE\r
+ MY_ALIGN\r
+LDGEMM_L4x16_SUB2:\r
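+	/* consume the remaining k-steps in chunks of 16, 8, 4, 2 and 1 */\r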
+\r
+ andi. T1,L, 16\r
+ ble LDGEMM_L4x16_SUB2_8\r
+ LOAD4x16_0 \r
+ KERNEL4x16_I1_L2_2 128,32, 0,0\r
+ KERNEL4x16_I1_L2_2 128,32, 1,0\r
+ KERNEL4x16_I1_L2_2 128,32, 2,0\r
+ KERNEL4x16_I1_L2_2 128,32, 3,0\r
+ KERNEL4x16_I1_L2_2 128,32, 4,0\r
+ KERNEL4x16_I1_L2_2 128,32, 5,0 \r
+ KERNEL4x16_I1_L2_2 128,32, 6,0\r
+ KERNEL4x16_I1_L2_3 128,32, 7,1 \r
+ MY_ALIGN\r
+LDGEMM_L4x16_SUB2_8:\r
+ andi. T1,L, 8\r
+ ble LDGEMM_L4x16_SUB2_4\r
+ LOAD4x16_0\r
+ KERNEL4x16_I1_L2_2 128,32, 0,0\r
+ KERNEL4x16_I1_L2_2 128,32, 1,0\r
+ KERNEL4x16_I1_L2_2 128,32, 2,0\r
+ KERNEL4x16_I1_L2_3 128,32, 3,1\r
+ MY_ALIGN\r
+LDGEMM_L4x16_SUB2_4:\r
+ andi. T1,L, 4\r
+ ble LDGEMM_L4x16_SUB2_2 \r
+ LOAD4x16_0\r
+ KERNEL4x16_I1_L2_2 128,32, 0,0\r
+ KERNEL4x16_I1_L2_3 128,32, 1,1\r
+ MY_ALIGN \r
+LDGEMM_L4x16_SUB2_2:\r
+ andi. T1,L, 2\r
+ ble LDGEMM_L4x16_SUB2_1\r
+ LOAD4x16_0\r
+ KERNEL4x16_I1_L2_3 128,32, 0,1\r
+ MY_ALIGN\r
+LDGEMM_L4x16_SUB2_1:\r
+ andi. T1,L, 1\r
+ ble LDGEMM_L4x16_SAVE \r
+ KERNEL4x16 0\r
+# addic. L, L, -1\r
+# bgt LDGEMM_L4x16_SUB2\r
+\r
+ MY_ALIGN\r
+LDGEMM_L4x16_SAVE:\r
+ SAVE4x16\r
+#if defined(TRMMKERNEL) \r
+ REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,16,4\r
+#endif \r
+ addic. I, I, -1\r
+ bgt+ LDGEMM_L4x16_BEGIN\r
+\r
+LDGEMM_L4x16_END:\r
+\r
+LDGEMM_L4x8_BEGIN:\r
+\r
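+	/* handle the M%16 leftover rows: first a tile of 8, then 4, 2, 1 */\r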
+ andi. T2, M, 15\r
+ ble LDGEMM_L4x1_END\r
+\r
+ andi. T1, M, 8\r
+ ble LDGEMM_L4x8_END\r
+\r
+\r
+#if defined(TRMMKERNEL)\r
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4\r
+ REFRESH_TEMP_BK T3,K,TEMP_REG,8,4\r
+ srawi. L, T3, 4 \r
+#else\r
+ mr BO, B\r
+ srawi. L, K, 4 \r
+#endif \r
+ \r
+\r
+ ble LDGEMM_L4x8_SUB0\r
+\r
+LDGEMM_L4x8_LOOP_START:\r
+\r
+\r
+ LOAD4x8_1\r
+ ##OffsetA=64 OffsetB=32\r
+\r
+\r
+ addic. L, L, -1\r
+\r
+ ble LDGEMM_L4x8_LOOP_END\r
+\r
+ mtctr L\r
+ MY_ALIGN\r
+\r
+LDGEMM_L4x8_LOOP:\r
+\r
+ KERNEL4x8_I1_L2_2 64,32, 0,0\r
+ KERNEL4x8_I1_L2_2 64,32, 1,0\r
+ KERNEL4x8_I1_L2_2 64,32, 2,0\r
+ KERNEL4x8_I1_L2_2 64,32, 3,0\r
+ KERNEL4x8_I1_L2_2 64,32, 4,0\r
+ KERNEL4x8_I1_L2_2 64,32, 5,0 \r
+ KERNEL4x8_I1_L2_2 64,32, 6,0\r
+ KERNEL4x8_I1_L2_2 64,32, 7,1 \r
+\r
+ bdnz LDGEMM_L4x8_LOOP\r
+ MY_ALIGN\r
+LDGEMM_L4x8_LOOP_END:\r
+\r
+ KERNEL4x8_I1_L2_2 64,32, 0,0\r
+ KERNEL4x8_I1_L2_2 64,32, 1,0\r
+ KERNEL4x8_I1_L2_2 64,32, 2,0\r
+ KERNEL4x8_I1_L2_2 64,32, 3,0\r
+ KERNEL4x8_I1_L2_2 64,32, 4,0\r
+ KERNEL4x8_I1_L2_2 64,32, 5,0 \r
+ KERNEL4x8_I1_L2_2 64,32, 6,0\r
+ KERNEL4x8_I1_L2_3 64,32, 7,1 \r
+\r
+ b LDGEMM_L4x8_SUB1\r
+ MY_ALIGN\r
+LDGEMM_L4x8_SUB0:\r
+#if defined(TRMMKERNEL)\r
+ andi. L, T3, 15\r
+#else\r
+ andi. L, K, 15\r
+#endif\r
+ KERNEL4x8 1\r
+\r
+ addic. L, L, -1\r
+ ble LDGEMM_L4x8_SAVE\r
+ b LDGEMM_L4x8_SUB2\r
+ MY_ALIGN\r
+LDGEMM_L4x8_SUB1:\r
+#if defined(TRMMKERNEL)\r
+ andi. L, T3, 15\r
+#else\r
+ andi. L, K, 15\r
+#endif \r
+ ble LDGEMM_L4x8_SAVE\r
+ MY_ALIGN\r
+LDGEMM_L4x8_SUB2:\r
+\r
+ andi. T1,L, 8\r
+ ble LDGEMM_L4x8_SUB2_4\r
+ LOAD4x8_0\r
+ KERNEL4x8_I1_L2_2 64,32, 0,0\r
+ KERNEL4x8_I1_L2_2 64,32, 1,0\r
+ KERNEL4x8_I1_L2_2 64,32, 2,0\r
+ KERNEL4x8_I1_L2_3 64,32, 3,1\r
+ MY_ALIGN\r
+LDGEMM_L4x8_SUB2_4:\r
+ andi. T1,L, 4\r
+ ble LDGEMM_L4x8_SUB2_2 \r
+ LOAD4x8_0\r
+ KERNEL4x8_I1_L2_2 64,32, 0,0\r
+ KERNEL4x8_I1_L2_3 64,32, 1,1\r
+ MY_ALIGN \r
+LDGEMM_L4x8_SUB2_2:\r
+ andi. T1,L, 2\r
+ ble LDGEMM_L4x8_SUB2_1\r
+ LOAD4x8_0\r
+ KERNEL4x8_I1_L2_3 64,32, 0,1\r
+ MY_ALIGN\r
+LDGEMM_L4x8_SUB2_1:\r
+ andi. T1,L, 1\r
+ ble LDGEMM_L4x8_SAVE \r
+ KERNEL4x8 0\r
+ \r
+ MY_ALIGN\r
+LDGEMM_L4x8_SAVE:\r
+ SAVE4x8\r
+#if defined(TRMMKERNEL) \r
+ REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,8,4\r
+#endif \r
+LDGEMM_L4x8_END:\r
+\r
+LDGEMM_L4x4_BEGIN:\r
+\r
+\r
+ andi. T1, M, 4\r
+ ble LDGEMM_L4x4_END\r
+\r
+#if defined(TRMMKERNEL)\r
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4\r
+ REFRESH_TEMP_BK T3,K,TEMP_REG,4,4\r
+ srawi. L, T3, 3 \r
+#else\r
+ mr BO, B\r
+ srawi. L, K, 3 \r
+#endif \r
+ ble LDGEMM_L4x4_SUB0\r
+ cmpwi cr0, L, 1\r
+ ble LDGEMM_L4x4_SUB4\r
+\r
+LDGEMM_L4x4_LOOP_START:\r
+\r
+ #dcbt AO, PRE\r
+ LOAD4x4_1\r
+ KERNEL4x4_I1\r
+ KERNEL4x4_2\r
+ KERNEL4x4_1\r
+ #dcbt AO, PRE\r
+ KERNEL4x4_2\r
+\r
+ KERNEL4x4_1\r
+ KERNEL4x4_2\r
+ KERNEL4x4_1\r
+ #dcbt AO, PRE\r
+ KERNEL4x4_2\r
+\r
+ addic. L, L, -2\r
+ ble LDGEMM_L4x4_LOOP_END\r
+\r
+ MY_ALIGN\r
+\r
+LDGEMM_L4x4_LOOP:\r
+\r
+ KERNEL4x4_1\r
+ KERNEL4x4_2\r
+ KERNEL4x4_1\r
+ #dcbt AO, PRE\r
+ KERNEL4x4_2\r
+\r
+ KERNEL4x4_1\r
+ KERNEL4x4_2\r
+ KERNEL4x4_1\r
+ #dcbt AO, PRE\r
+ KERNEL4x4_2\r
+\r
+ addic. L, L, -1\r
+ bgt LDGEMM_L4x4_LOOP\r
+\r
+LDGEMM_L4x4_LOOP_END:\r
+\r
+ KERNEL4x4_1\r
+ KERNEL4x4_2\r
+ KERNEL4x4_1\r
+ KERNEL4x4_2\r
+\r
+ KERNEL4x4_1\r
+ KERNEL4x4_2\r
+ KERNEL4x4_1\r
+ KERNEL4x4_E2\r
+\r
+ b LDGEMM_L4x4_SUB1\r
+\r
+LDGEMM_L4x4_SUB4:\r
+\r
+ KERNEL4x4_SUBI1\r
+ KERNEL4x4_SUB1\r
+ KERNEL4x4_SUB1\r
+ KERNEL4x4_SUB1\r
+\r
+ KERNEL4x4_SUB1\r
+ KERNEL4x4_SUB1\r
+ KERNEL4x4_SUB1\r
+ KERNEL4x4_SUB1\r
+\r
+ b LDGEMM_L4x4_SUB1\r
+\r
+LDGEMM_L4x4_SUB0:\r
+#if defined(TRMMKERNEL)\r
+ andi. L, T3, 7\r
+#else\r
+ andi. L, K, 7\r
+#endif\r
+\r
+ KERNEL4x4_SUBI1\r
+\r
+ addic. L, L, -1\r
+ ble LDGEMM_L4x4_SAVE\r
+ b LDGEMM_L4x4_SUB2\r
+\r
+LDGEMM_L4x4_SUB1:\r
+#if defined(TRMMKERNEL)\r
+ andi. L, T3, 7\r
+#else\r
+ andi. L, K, 7\r
+#endif\r
+ ble LDGEMM_L4x4_SAVE\r
+\r
+LDGEMM_L4x4_SUB2:\r
+\r
+ KERNEL4x4_SUB1\r
+\r
+ addic. L, L, -1\r
+ bgt LDGEMM_L4x4_SUB2\r
+\r
+LDGEMM_L4x4_SAVE:\r
+\r
+ SAVE4x4\r
+#if defined(TRMMKERNEL) \r
+ REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,4,4\r
+#endif \r
+LDGEMM_L4x4_END:\r
+\r
+LDGEMM_L4x2_BEGIN:\r
+\r
+\r
+ andi. T1, M, 2\r
+ ble LDGEMM_L4x2_END\r
+#if defined(TRMMKERNEL)\r
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4\r
+ REFRESH_TEMP_BK T3,K,TEMP_REG,2,4\r
+ srawi. L, T3, 3 \r
+#else\r
+ mr BO, B\r
+ srawi. L, K, 3 \r
+#endif \r
+ ble LDGEMM_L4x2_SUB0\r
+ cmpwi cr0, L, 1\r
+ ble LDGEMM_L4x2_SUB4\r
+\r
+LDGEMM_L4x2_LOOP_START:\r
+\r
+ LOAD4x2_1\r
+ KERNEL4x2_I1\r
+ KERNEL4x2_2\r
+ KERNEL4x2_1\r
+ KERNEL4x2_2\r
+\r
+ KERNEL4x2_1\r
+ KERNEL4x2_2\r
+ KERNEL4x2_1\r
+ KERNEL4x2_2\r
+\r
+ addic. L, L, -2\r
+ ble LDGEMM_L4x2_LOOP_END\r
+\r
+ MY_ALIGN\r
+\r
+LDGEMM_L4x2_LOOP:\r
+\r
+ KERNEL4x2_1\r
+ KERNEL4x2_2\r
+ KERNEL4x2_1\r
+ KERNEL4x2_2\r
+\r
+ KERNEL4x2_1\r
+ KERNEL4x2_2\r
+ KERNEL4x2_1\r
+ KERNEL4x2_2\r
+\r
+ addic. L, L, -1\r
+ bgt LDGEMM_L4x2_LOOP\r
+\r
+LDGEMM_L4x2_LOOP_END:\r
+\r
+ KERNEL4x2_1\r
+ KERNEL4x2_2\r
+ KERNEL4x2_1\r
+ KERNEL4x2_2\r
+\r
+ KERNEL4x2_1\r
+ KERNEL4x2_2\r
+ KERNEL4x2_1\r
+ KERNEL4x2_E2\r
+\r
+ b LDGEMM_L4x2_SUB1\r
+\r
+LDGEMM_L4x2_SUB4:\r
+\r
+ KERNEL4x2_SUBI1\r
+ KERNEL4x2_SUB1\r
+ KERNEL4x2_SUB1\r
+ KERNEL4x2_SUB1\r
+\r
+ KERNEL4x2_SUB1\r
+ KERNEL4x2_SUB1\r
+ KERNEL4x2_SUB1\r
+ KERNEL4x2_SUB1\r
+\r
+ b LDGEMM_L4x2_SUB1\r
+\r
+LDGEMM_L4x2_SUB0:\r
+#if defined(TRMMKERNEL)\r
+ andi. L, T3, 7\r
+#else\r
+ andi. L, K, 7\r
+#endif\r
+\r
+ KERNEL4x2_SUBI1\r
+\r
+ addic. L, L, -1\r
+ ble LDGEMM_L4x2_SAVE\r
+ b LDGEMM_L4x2_SUB2\r
+\r
+LDGEMM_L4x2_SUB1:\r
+#if defined(TRMMKERNEL)\r
+ andi. L, T3, 7\r
+#else\r
+ andi. L, K, 7\r
+#endif\r
+ ble LDGEMM_L4x2_SAVE\r
+\r
+LDGEMM_L4x2_SUB2:\r
+\r
+ KERNEL4x2_SUB1\r
+\r
+ addic. L, L, -1\r
+ bgt LDGEMM_L4x2_SUB2\r
+\r
+LDGEMM_L4x2_SAVE:\r
+\r
+ SAVE4x2\r
+#if defined(TRMMKERNEL) \r
+ REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,2,4\r
+#endif \r
+LDGEMM_L4x2_END:\r
+\r
+LDGEMM_L4x1_BEGIN:\r
+\r
+\r
+ andi. T1, M, 1\r
+ ble LDGEMM_L4x1_END\r
+#if defined(TRMMKERNEL)\r
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4\r
+ REFRESH_TEMP_BK T3,K,TEMP_REG,1,4\r
+ srawi. L, T3, 3 \r
+#else\r
+ mr BO, B\r
+ srawi. L, K, 3 \r
+#endif \r
+ ble LDGEMM_L4x1_SUB0\r
+ cmpwi cr0, L, 1\r
+ ble LDGEMM_L4x1_SUB4\r
+\r
+LDGEMM_L4x1_LOOP_START:\r
+\r
+ LOAD4x1_1\r
+ KERNEL4x1_I1\r
+ KERNEL4x1_2\r
+ KERNEL4x1_1\r
+ KERNEL4x1_2\r
+\r
+ KERNEL4x1_1\r
+ KERNEL4x1_2\r
+ KERNEL4x1_1\r
+ KERNEL4x1_2\r
+\r
+ addic. L, L, -2\r
+ ble LDGEMM_L4x1_LOOP_END\r
+\r
+ MY_ALIGN\r
+\r
+LDGEMM_L4x1_LOOP:\r
+\r
+ KERNEL4x1_1\r
+ KERNEL4x1_2\r
+ KERNEL4x1_1\r
+ KERNEL4x1_2\r
+\r
+ KERNEL4x1_1\r
+ KERNEL4x1_2\r
+ KERNEL4x1_1\r
+ KERNEL4x1_2\r
+\r
+ addic. L, L, -1\r
+ bgt LDGEMM_L4x1_LOOP\r
+\r
+LDGEMM_L4x1_LOOP_END:\r
+\r
+ KERNEL4x1_1\r
+ KERNEL4x1_2\r
+ KERNEL4x1_1\r
+ KERNEL4x1_2\r
+\r
+ KERNEL4x1_1\r
+ KERNEL4x1_2\r
+ KERNEL4x1_1\r
+ KERNEL4x1_E2\r
+\r
+ b LDGEMM_L4x1_SUB1\r
+\r
+LDGEMM_L4x1_SUB4:\r
+\r
+ KERNEL4x1_SUBI1\r
+ KERNEL4x1_SUB1\r
+ KERNEL4x1_SUB1\r
+ KERNEL4x1_SUB1\r
+\r
+ KERNEL4x1_SUB1\r
+ KERNEL4x1_SUB1\r
+ KERNEL4x1_SUB1\r
+ KERNEL4x1_SUB1\r
+\r
+ b LDGEMM_L4x1_SUB1\r
+\r
+LDGEMM_L4x1_SUB0:\r
+#if defined(TRMMKERNEL)\r
+ andi. L, T3, 7\r
+#else\r
+ andi. L, K, 7\r
+#endif\r
+\r
+ KERNEL4x1_SUBI1\r
+\r
+ addic. L, L, -1\r
+ ble LDGEMM_L4x1_SAVE\r
+ b LDGEMM_L4x1_SUB2\r
+\r
+LDGEMM_L4x1_SUB1:\r
+#if defined(TRMMKERNEL)\r
+ andi. L, T3, 7\r
+#else\r
+ andi. L, K, 7\r
+#endif\r
+ ble LDGEMM_L4x1_SAVE\r
+\r
+LDGEMM_L4x1_SUB2:\r
+\r
+ KERNEL4x1_SUB1\r
+\r
+ addic. L, L, -1\r
+ bgt LDGEMM_L4x1_SUB2\r
+\r
+LDGEMM_L4x1_SAVE:\r
+\r
+ SAVE4x1\r
+#if defined(TRMMKERNEL) \r
+ REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,1,4\r
+#endif \r
+LDGEMM_L4x1_END:\r
+\r
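+	/* advance B past this panel: 4 columns of K doubles = K*32 bytes */\r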
+ slwi T1, K, 5\r
+ add B, B, T1\r
+#if defined(TRMMKERNEL) && !defined(LEFT)\r
+ addi TEMP_REG, TEMP_REG, 4\r
+#endif\r
+ addic. J, J, -1\r
+ bgt LDGEMM_L4_BEGIN\r
+\r
+ andi. T2, N, 3\r
+ ble .L999\r
+\r
+LDGEMM_L4_END:\r
+\r
+ b LDGEMM_L2_BEGIN\r
+\r
+.L999_H1:\r
+\r
+ b .L999\r
+\r
+LDGEMM_L2_BEGIN:\r
+\r
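+	/* N%4 pass for 2 remaining columns of B; same tile structure as the N=4 loop */\r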
+#if defined(TRMMKERNEL) && defined(LEFT)\r
+ mr TEMP_REG, OFFSET /*off = offset;*/\r
+#endif \r
+ andi. T1, N, 2\r
+ ble LDGEMM_L2_END\r
+ mr CO, C\r
+ mr AO, A\r
+ slwi T1, LDC , 1\r
+ add C, C, T1\r
+ srawi. I, M, 4\r
+ ble LDGEMM_L2x16_END\r
+\r
+LDGEMM_L2x16_BEGIN:\r
+\r
+\r
+#if defined(TRMMKERNEL)\r
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,16,2\r
+ REFRESH_TEMP_BK T3,K,TEMP_REG,16,2\r
+ srawi. L, T3, 3 \r
+#else\r
+ mr BO, B\r
+ srawi. L, K, 3 \r
+#endif \r
+ ble LDGEMM_L2x16_SUB0\r
+ cmpwi cr0, L, 1\r
+ ble LDGEMM_L2x16_SUB4\r
+\r
+LDGEMM_L2x16_LOOP_START:\r
+\r
+ #dcbt AO, PRE\r
+ LOAD2x16_1\r
+ #dcbt AO, PRE\r
+ KERNEL2x16_I1\r
+ #dcbt AO, PRE\r
+ KERNEL2x16_2\r
+ #dcbt AO, PRE\r
+ KERNEL2x16_1\r
+ #dcbt AO, PRE\r
+ KERNEL2x16_2\r
+\r
+ #dcbt AO, PRE\r
+ KERNEL2x16_1\r
+ #dcbt AO, PRE\r
+ KERNEL2x16_2\r
+ #dcbt AO, PRE\r
+ KERNEL2x16_1\r
+ #dcbt AO, PRE\r
+ KERNEL2x16_2\r
+\r
+ addic. L, L, -2\r
+ ble LDGEMM_L2x16_LOOP_END\r
+\r
+ MY_ALIGN\r
+\r
+LDGEMM_L2x16_LOOP:\r
+\r
+ #dcbt AO, PRE\r
+ KERNEL2x16_1\r
+ #dcbt AO, PRE\r
+ KERNEL2x16_2\r
+ #dcbt AO, PRE\r
+ KERNEL2x16_1\r
+ #dcbt AO, PRE\r
+ KERNEL2x16_2\r
+\r
+ #dcbt AO, PRE\r
+ KERNEL2x16_1\r
+ #dcbt AO, PRE\r
+ KERNEL2x16_2\r
+ #dcbt AO, PRE\r
+ KERNEL2x16_1\r
+ #dcbt AO, PRE\r
+ KERNEL2x16_2\r
+\r
+ addic. L, L, -1\r
+ bgt LDGEMM_L2x16_LOOP\r
+\r
+LDGEMM_L2x16_LOOP_END:\r
+\r
+ #dcbt AO, PRE\r
+ KERNEL2x16_1\r
+ #dcbt AO, PRE\r
+ KERNEL2x16_2\r
+ #dcbt AO, PRE\r
+ KERNEL2x16_1\r
+ #dcbt AO, PRE\r
+ KERNEL2x16_2\r
+\r
+ #dcbt AO, PRE\r
+ KERNEL2x16_1\r
+ #dcbt AO, PRE\r
+ KERNEL2x16_2\r
+ #dcbt AO, PRE\r
+ KERNEL2x16_1\r
+ KERNEL2x16_E2\r
+\r
+ b LDGEMM_L2x16_SUB1\r
+\r
+LDGEMM_L2x16_SUB4:\r
+\r
+ #dcbt AO, PRE\r
+ KERNEL2x16_SUBI1\r
+ #dcbt AO, PRE\r
+ KERNEL2x16_SUB1\r
+ #dcbt AO, PRE\r
+ KERNEL2x16_SUB1\r
+ #dcbt AO, PRE\r
+ KERNEL2x16_SUB1\r
+\r
+ KERNEL2x16_SUB1\r
+ KERNEL2x16_SUB1\r
+ KERNEL2x16_SUB1\r
+ KERNEL2x16_SUB1\r
+\r
+ b LDGEMM_L2x16_SUB1\r
+\r
+LDGEMM_L2x16_SUB0:\r
+#if defined(TRMMKERNEL)\r
+ andi. L, T3, 7\r
+#else\r
+ andi. L, K, 7\r
+#endif\r
+\r
+ KERNEL2x16_SUBI1\r
+\r
+ addic. L, L, -1\r
+ ble LDGEMM_L2x16_SAVE\r
+ b LDGEMM_L2x16_SUB2\r
+\r
+LDGEMM_L2x16_SUB1:\r
+#if defined(TRMMKERNEL)\r
+ andi. L, T3, 7\r
+#else\r
+ andi. L, K, 7\r
+#endif\r
+ ble LDGEMM_L2x16_SAVE\r
+\r
+LDGEMM_L2x16_SUB2:\r
+\r
+ KERNEL2x16_SUB1\r
+\r
+ addic. L, L, -1\r
+ bgt LDGEMM_L2x16_SUB2\r
+\r
+LDGEMM_L2x16_SAVE:\r
+\r
+ SAVE2x16\r
+#if defined(TRMMKERNEL) \r
+ REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,16,2\r
+#endif \r
+ addic. I, I, -1\r
+ bgt LDGEMM_L2x16_BEGIN\r
+\r
+LDGEMM_L2x16_END:\r
+\r
+LDGEMM_L2x8_BEGIN:\r
+\r
+ andi. T2, M, 15\r
+ ble LDGEMM_L2x1_END\r
+\r
+ andi. T1, M, 8\r
+ ble LDGEMM_L2x8_END\r
+#if defined(TRMMKERNEL)\r
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2\r
+ REFRESH_TEMP_BK T3,K,TEMP_REG,8,2\r
+ srawi. L, T3, 3 \r
+#else\r
+ mr BO, B\r
+ srawi. L, K, 3 \r
+#endif \r
+ ble LDGEMM_L2x8_SUB0\r
+ cmpwi cr0, L, 1\r
+ ble LDGEMM_L2x8_SUB4\r
+\r
+LDGEMM_L2x8_LOOP_START:\r
+\r
+ #dcbt AO, PRE\r
+ LOAD2x8_1\r
+ KERNEL2x8_I1\r
+ #dcbt AO, PRE\r
+ KERNEL2x8_2\r
+ KERNEL2x8_1\r
+ #dcbt AO, PRE\r
+ KERNEL2x8_2\r
+\r
+ KERNEL2x8_1\r
+ #dcbt AO, PRE\r
+ KERNEL2x8_2\r
+ KERNEL2x8_1\r
+ #dcbt AO, PRE\r
+ KERNEL2x8_2\r
+\r
+ addic. L, L, -2\r
+ ble LDGEMM_L2x8_LOOP_END\r
+\r
+ MY_ALIGN\r
+\r
+LDGEMM_L2x8_LOOP:\r
+\r
+ KERNEL2x8_1\r
+ #dcbt AO, PRE\r
+ KERNEL2x8_2\r
+ KERNEL2x8_1\r
+ #dcbt AO, PRE\r
+ KERNEL2x8_2\r
+\r
+ KERNEL2x8_1\r
+ #dcbt AO, PRE\r
+ KERNEL2x8_2\r
+ KERNEL2x8_1\r
+ #dcbt AO, PRE\r
+ KERNEL2x8_2\r
+\r
+ addic. L, L, -1\r
+ bgt LDGEMM_L2x8_LOOP\r
+\r
+LDGEMM_L2x8_LOOP_END:\r
+\r
+ KERNEL2x8_1\r
+ KERNEL2x8_2\r
+ KERNEL2x8_1\r
+ KERNEL2x8_2\r
+\r
+ KERNEL2x8_1\r
+ KERNEL2x8_2\r
+ KERNEL2x8_1\r
+ KERNEL2x8_E2\r
+\r
+ b LDGEMM_L2x8_SUB1\r
+\r
+LDGEMM_L2x8_SUB4:\r
+\r
+ KERNEL2x8_SUBI1\r
+ KERNEL2x8_SUB1\r
+ KERNEL2x8_SUB1\r
+ KERNEL2x8_SUB1\r
+\r
+ KERNEL2x8_SUB1\r
+ KERNEL2x8_SUB1\r
+ KERNEL2x8_SUB1\r
+ KERNEL2x8_SUB1\r
+\r
+ b LDGEMM_L2x8_SUB1\r
+\r
+LDGEMM_L2x8_SUB0:\r
+#if defined(TRMMKERNEL)\r
+ andi. L, T3, 7\r
+#else\r
+ andi. L, K, 7\r
+#endif\r
+\r
+ KERNEL2x8_SUBI1\r
+\r
+ addic. L, L, -1\r
+ ble LDGEMM_L2x8_SAVE\r
+ b LDGEMM_L2x8_SUB2\r
+\r
+LDGEMM_L2x8_SUB1:\r
+#if defined(TRMMKERNEL)\r
+ andi. L, T3, 7\r
+#else\r
+ andi. L, K, 7\r
+#endif\r
+ ble LDGEMM_L2x8_SAVE\r
+\r
+LDGEMM_L2x8_SUB2:\r
+\r
+ KERNEL2x8_SUB1\r
+\r
+ addic. L, L, -1\r
+ bgt LDGEMM_L2x8_SUB2\r
+\r
+LDGEMM_L2x8_SAVE:\r
+\r
+ SAVE2x8\r
+#if defined(TRMMKERNEL) \r
+ REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,8,2\r
+#endif\r
+LDGEMM_L2x8_END:\r
+\r
+LDGEMM_L2x4_BEGIN:\r
+\r
+\r
+ andi. T1, M, 4\r
+ ble LDGEMM_L2x4_END\r
+#if defined(TRMMKERNEL)\r
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2\r
+ REFRESH_TEMP_BK T3,K,TEMP_REG,4,2\r
+ srawi. L, T3, 3 \r
+#else\r
+ mr BO, B\r
+ srawi. L, K, 3 \r
+#endif \r
+ ble LDGEMM_L2x4_SUB0\r
+ cmpwi cr0, L, 1\r
+ ble LDGEMM_L2x4_SUB4\r
+\r
+LDGEMM_L2x4_LOOP_START:\r
+\r
+ LOAD2x4_1\r
+ KERNEL2x4_I1\r
+ KERNEL2x4_2\r
+ KERNEL2x4_1\r
+ KERNEL2x4_2\r
+\r
+ KERNEL2x4_1\r
+ KERNEL2x4_2\r
+ KERNEL2x4_1\r
+ KERNEL2x4_2\r
+\r
+ addic. L, L, -2\r
+ ble LDGEMM_L2x4_LOOP_END\r
+\r
+ MY_ALIGN\r
+\r
+LDGEMM_L2x4_LOOP:\r
+\r
+ KERNEL2x4_1\r
+ KERNEL2x4_2\r
+ KERNEL2x4_1\r
+ KERNEL2x4_2\r
+\r
+ KERNEL2x4_1\r
+ KERNEL2x4_2\r
+ KERNEL2x4_1\r
+ KERNEL2x4_2\r
+\r
+ addic. L, L, -1\r
+ bgt LDGEMM_L2x4_LOOP\r
+\r
+LDGEMM_L2x4_LOOP_END:\r
+\r
+ KERNEL2x4_1\r
+ KERNEL2x4_2\r
+ KERNEL2x4_1\r
+ KERNEL2x4_2\r
+\r
+ KERNEL2x4_1\r
+ KERNEL2x4_2\r
+ KERNEL2x4_1\r
+ KERNEL2x4_E2\r
+\r
+ b LDGEMM_L2x4_SUB1\r
+\r
+LDGEMM_L2x4_SUB4:\r
+\r
+ KERNEL2x4_SUBI1\r
+ KERNEL2x4_SUB1\r
+ KERNEL2x4_SUB1\r
+ KERNEL2x4_SUB1\r
+\r
+ KERNEL2x4_SUB1\r
+ KERNEL2x4_SUB1\r
+ KERNEL2x4_SUB1\r
+ KERNEL2x4_SUB1\r
+\r
+ b LDGEMM_L2x4_SUB1\r
+\r
+LDGEMM_L2x4_SUB0:\r
+#if defined(TRMMKERNEL)\r
+ andi. L, T3, 7\r
+#else\r
+ andi. L, K, 7\r
+#endif\r
+\r
+ KERNEL2x4_SUBI1\r
+\r
+ addic. L, L, -1\r
+ ble LDGEMM_L2x4_SAVE\r
+ b LDGEMM_L2x4_SUB2\r
+\r
+LDGEMM_L2x4_SUB1:\r
+#if defined(TRMMKERNEL)\r
+ andi. L, T3, 7\r
+#else\r
+ andi. L, K, 7\r
+#endif\r
+ ble LDGEMM_L2x4_SAVE\r
+\r
+LDGEMM_L2x4_SUB2:\r
+\r
+ KERNEL2x4_SUB1\r
+\r
+ addic. L, L, -1\r
+ bgt LDGEMM_L2x4_SUB2\r
+\r
+LDGEMM_L2x4_SAVE:\r
+\r
+ SAVE2x4\r
+#if defined(TRMMKERNEL) \r
+ REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,4,2\r
+#endif\r
+LDGEMM_L2x4_END:\r
+\r
+LDGEMM_L2x2_BEGIN:\r
+\r
+\r
+ andi. T1, M, 2\r
+ ble LDGEMM_L2x2_END\r
+#if defined(TRMMKERNEL)\r
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2\r
+ REFRESH_TEMP_BK T3,K,TEMP_REG,2,2\r
+ srawi. L, T3, 3 \r
+#else\r
+ mr BO, B\r
+ srawi. L, K, 3 \r
+#endif \r
+ ble LDGEMM_L2x2_SUB0\r
+ cmpwi cr0, L, 1\r
+ ble LDGEMM_L2x2_SUB4\r
+\r
+LDGEMM_L2x2_LOOP_START:\r
+\r
+ LOAD2x2_1\r
+ KERNEL2x2_I1\r
+ KERNEL2x2_2\r
+ KERNEL2x2_1\r
+ KERNEL2x2_2\r
+\r
+ KERNEL2x2_1\r
+ KERNEL2x2_2\r
+ KERNEL2x2_1\r
+ KERNEL2x2_2\r
+\r
+ addic. L, L, -2\r
+ ble LDGEMM_L2x2_LOOP_END\r
+\r
+ MY_ALIGN\r
+\r
+LDGEMM_L2x2_LOOP:\r
+\r
+ KERNEL2x2_1\r
+ KERNEL2x2_2\r
+ KERNEL2x2_1\r
+ KERNEL2x2_2\r
+\r
+ KERNEL2x2_1\r
+ KERNEL2x2_2\r
+ KERNEL2x2_1\r
+ KERNEL2x2_2\r
+\r
+ addic. L, L, -1\r
+ bgt LDGEMM_L2x2_LOOP\r
+\r
+LDGEMM_L2x2_LOOP_END:\r
+\r
+ KERNEL2x2_1\r
+ KERNEL2x2_2\r
+ KERNEL2x2_1\r
+ KERNEL2x2_2\r
+\r
+ KERNEL2x2_1\r
+ KERNEL2x2_2\r
+ KERNEL2x2_1\r
+ KERNEL2x2_E2\r
+\r
+ b LDGEMM_L2x2_SUB1\r
+\r
+LDGEMM_L2x2_SUB4:\r
+\r
+ KERNEL2x2_SUBI1\r
+ KERNEL2x2_SUB1\r
+ KERNEL2x2_SUB1\r
+ KERNEL2x2_SUB1\r
+\r
+ KERNEL2x2_SUB1\r
+ KERNEL2x2_SUB1\r
+ KERNEL2x2_SUB1\r
+ KERNEL2x2_SUB1\r
+\r
+ b LDGEMM_L2x2_SUB1\r
+\r
+LDGEMM_L2x2_SUB0:\r
+#if defined(TRMMKERNEL)\r
+ andi. L, T3, 7\r
+#else\r
+ andi. L, K, 7\r
+#endif\r
+\r
+ KERNEL2x2_SUBI1\r
+\r
+ addic. L, L, -1\r
+ ble LDGEMM_L2x2_SAVE\r
+ b LDGEMM_L2x2_SUB2\r
+\r
+LDGEMM_L2x2_SUB1:\r
+#if defined(TRMMKERNEL)\r
+ andi. L, T3, 7\r
+#else\r
+ andi. L, K, 7\r
+#endif\r
+ ble LDGEMM_L2x2_SAVE\r
+\r
+LDGEMM_L2x2_SUB2:\r
+\r
+ KERNEL2x2_SUB1\r
+\r
+ addic. L, L, -1\r
+ bgt LDGEMM_L2x2_SUB2\r
+\r
+LDGEMM_L2x2_SAVE:\r
+\r
+ SAVE2x2\r
+#if defined(TRMMKERNEL) \r
+ REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,2,2\r
+#endif\r
+LDGEMM_L2x2_END:\r
+\r
+LDGEMM_L2x1_BEGIN:\r
+\r
+\r
+ andi. T1, M, 1\r
+ ble LDGEMM_L2x1_END\r
+#if defined(TRMMKERNEL)\r
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2\r
+ REFRESH_TEMP_BK T3,K,TEMP_REG,1,2\r
+ srawi. L, T3, 3 \r
+#else\r
+ mr BO, B\r
+ srawi. L, K, 3 \r
+#endif \r
+ ble LDGEMM_L2x1_SUB0\r
+ cmpwi cr0, L, 1\r
+ ble LDGEMM_L2x1_SUB4\r
+\r
+LDGEMM_L2x1_LOOP_START:\r
+\r
+ LOAD2x1_1\r
+ KERNEL2x1_I1\r
+ KERNEL2x1_2\r
+ KERNEL2x1_1\r
+ KERNEL2x1_2\r
+\r
+ KERNEL2x1_1\r
+ KERNEL2x1_2\r
+ KERNEL2x1_1\r
+ KERNEL2x1_2\r
+\r
+ addic. L, L, -2\r
+ ble LDGEMM_L2x1_LOOP_END\r
+\r
+ MY_ALIGN\r
+\r
+LDGEMM_L2x1_LOOP:\r
+\r
+ KERNEL2x1_1\r
+ KERNEL2x1_2\r
+ KERNEL2x1_1\r
+ KERNEL2x1_2\r
+\r
+ KERNEL2x1_1\r
+ KERNEL2x1_2\r
+ KERNEL2x1_1\r
+ KERNEL2x1_2\r
+\r
+ addic. L, L, -1\r
+ bgt LDGEMM_L2x1_LOOP\r
+\r
+LDGEMM_L2x1_LOOP_END:\r
+\r
+ KERNEL2x1_1\r
+ KERNEL2x1_2\r
+ KERNEL2x1_1\r
+ KERNEL2x1_2\r
+\r
+ KERNEL2x1_1\r
+ KERNEL2x1_2\r
+ KERNEL2x1_1\r
+ KERNEL2x1_E2\r
+\r
+ b LDGEMM_L2x1_SUB1\r
+\r
+LDGEMM_L2x1_SUB4:\r
+\r
+ KERNEL2x1_SUBI1\r
+ KERNEL2x1_SUB1\r
+ KERNEL2x1_SUB1\r
+ KERNEL2x1_SUB1\r
+\r
+ KERNEL2x1_SUB1\r
+ KERNEL2x1_SUB1\r
+ KERNEL2x1_SUB1\r
+ KERNEL2x1_SUB1\r
+\r
+ b LDGEMM_L2x1_SUB1\r
+\r
+LDGEMM_L2x1_SUB0:\r
+#if defined(TRMMKERNEL)\r
+ andi. L, T3, 7\r
+#else\r
+ andi. L, K, 7\r
+#endif\r
+\r
+ KERNEL2x1_SUBI1\r
+\r
+ addic. L, L, -1\r
+ ble LDGEMM_L2x1_SAVE\r
+ b LDGEMM_L2x1_SUB2\r
+\r
+LDGEMM_L2x1_SUB1:\r
+#if defined(TRMMKERNEL)\r
+ andi. L, T3, 7\r
+#else\r
+ andi. L, K, 7\r
+#endif\r
+ ble LDGEMM_L2x1_SAVE\r
+\r
+LDGEMM_L2x1_SUB2:\r
+\r
+ KERNEL2x1_SUB1\r
+\r
+ addic. L, L, -1\r
+ bgt LDGEMM_L2x1_SUB2\r
+\r
+LDGEMM_L2x1_SAVE:\r
+\r
+ SAVE2x1\r
+#if defined(TRMMKERNEL) \r
+ REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,1,2\r
+#endif\r
+LDGEMM_L2x1_END:\r
+\r
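+	/* advance B past this panel: 2 columns of K doubles = K*16 bytes */\r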
+ slwi T1, K, 4\r
+ add B, B, T1\r
+#if defined(TRMMKERNEL) && !defined(LEFT)\r
+ addi TEMP_REG, TEMP_REG, 2\r
+#endif\r
+LDGEMM_L2_END:\r
+LDGEMM_L1_BEGIN:\r
+\r
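+	/* final pass for a single remaining column of B */\r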
+#if defined(TRMMKERNEL) && defined(LEFT)\r
+ mr TEMP_REG, OFFSET /*off = offset;*/\r
+#endif \r
+ andi. T1, N, 1\r
+ ble LDGEMM_L1_END\r
+ mr CO, C\r
+ mr AO, A\r
+ srawi. I, M, 4\r
+ ble LDGEMM_L1x16_END\r
+\r
+LDGEMM_L1x16_BEGIN:\r
+\r
+\r
+#if defined(TRMMKERNEL)\r
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,16,1\r
+ REFRESH_TEMP_BK T3,K,TEMP_REG,16,1\r
+ srawi. L, T3, 3 \r
+#else\r
+ mr BO, B\r
+ srawi. L, K, 3 \r
+#endif \r
+ ble LDGEMM_L1x16_SUB0\r
+ cmpwi cr0, L, 1\r
+ ble LDGEMM_L1x16_SUB4\r
+\r
+LDGEMM_L1x16_LOOP_START:\r
+\r
+ #dcbt AO, PRE\r
+ LOAD1x16_1\r
+ #dcbt AO, PRE\r
+ KERNEL1x16_I1\r
+ #dcbt AO, PRE\r
+ KERNEL1x16_2\r
+ #dcbt AO, PRE\r
+ KERNEL1x16_1\r
+ #dcbt AO, PRE\r
+ KERNEL1x16_2\r
+\r
+ #dcbt AO, PRE\r
+ KERNEL1x16_1\r
+ #dcbt AO, PRE\r
+ KERNEL1x16_2\r
+ #dcbt AO, PRE\r
+ KERNEL1x16_1\r
+ #dcbt AO, PRE\r
+ KERNEL1x16_2\r
+\r
+ addic. L, L, -2\r
+ ble LDGEMM_L1x16_LOOP_END\r
+\r
+ MY_ALIGN\r
+\r
+LDGEMM_L1x16_LOOP:\r
+\r
+ #dcbt AO, PRE\r
+ KERNEL1x16_1\r
+ #dcbt AO, PRE\r
+ KERNEL1x16_2\r
+ #dcbt AO, PRE\r
+ KERNEL1x16_1\r
+ #dcbt AO, PRE\r
+ KERNEL1x16_2\r
+\r
+ #dcbt AO, PRE\r
+ KERNEL1x16_1\r
+ #dcbt AO, PRE\r
+ KERNEL1x16_2\r
+ #dcbt AO, PRE\r
+ KERNEL1x16_1\r
+ #dcbt AO, PRE\r
+ KERNEL1x16_2\r
+\r
+ addic. L, L, -1\r
+ bgt LDGEMM_L1x16_LOOP\r
+\r
+LDGEMM_L1x16_LOOP_END:\r
+\r
+ #dcbt AO, PRE\r
+ KERNEL1x16_1\r
+ #dcbt AO, PRE\r
+ KERNEL1x16_2\r
+ #dcbt AO, PRE\r
+ KERNEL1x16_1\r
+ #dcbt AO, PRE\r
+ KERNEL1x16_2\r
+\r
+ #dcbt AO, PRE\r
+ KERNEL1x16_1\r
+ #dcbt AO, PRE\r
+ KERNEL1x16_2\r
+ #dcbt AO, PRE\r
+ KERNEL1x16_1\r
+ KERNEL1x16_E2\r
+\r
+ b LDGEMM_L1x16_SUB1\r
+\r
+LDGEMM_L1x16_SUB4:\r
+\r
+ #dcbt AO, PRE\r
+ KERNEL1x16_SUBI1\r
+ #dcbt AO, PRE\r
+ KERNEL1x16_SUB1\r
+ #dcbt AO, PRE\r
+ KERNEL1x16_SUB1\r
+ #dcbt AO, PRE\r
+ KERNEL1x16_SUB1\r
+\r
+ KERNEL1x16_SUB1\r
+ KERNEL1x16_SUB1\r
+ KERNEL1x16_SUB1\r
+ KERNEL1x16_SUB1\r
+\r
+ b LDGEMM_L1x16_SUB1\r
+\r
+LDGEMM_L1x16_SUB0:\r
+#if defined(TRMMKERNEL)\r
+ andi. L, T3, 7\r
+#else\r
+ andi. L, K, 7\r
+#endif\r
+\r
+ KERNEL1x16_SUBI1\r
+\r
+ addic. L, L, -1\r
+ ble LDGEMM_L1x16_SAVE\r
+ b LDGEMM_L1x16_SUB2\r
+\r
+LDGEMM_L1x16_SUB1:\r
+#if defined(TRMMKERNEL)\r
+ andi. L, T3, 7\r
+#else\r
+ andi. L, K, 7\r
+#endif\r
+ ble LDGEMM_L1x16_SAVE\r
+\r
+LDGEMM_L1x16_SUB2:\r
+\r
+ KERNEL1x16_SUB1\r
+\r
+ addic. L, L, -1\r
+ bgt LDGEMM_L1x16_SUB2\r
+\r
+LDGEMM_L1x16_SAVE:\r
+\r
+ SAVE1x16\r
+#if defined(TRMMKERNEL) \r
+ REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,16,1\r
+#endif\r
+ addic. I, I, -1\r
+ bgt LDGEMM_L1x16_BEGIN\r
+\r
+LDGEMM_L1x16_END:\r
+\r
+LDGEMM_L1x8_BEGIN:\r
+\r
+ andi. T2, M, 15\r
+ ble LDGEMM_L1x1_END\r
+\r
+ andi. T1, M, 8\r
+ ble LDGEMM_L1x8_END\r
+#if defined(TRMMKERNEL)\r
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1\r
+ REFRESH_TEMP_BK T3,K,TEMP_REG,8,1\r
+ srawi. L, T3, 3 \r
+#else\r
+ mr BO, B\r
+ srawi. L, K, 3 \r
+#endif \r
+ ble LDGEMM_L1x8_SUB0\r
+ cmpwi cr0, L, 1\r
+ ble LDGEMM_L1x8_SUB4\r
+\r
+LDGEMM_L1x8_LOOP_START:\r
+\r
+ #dcbt AO, PRE\r
+ LOAD1x8_1\r
+ KERNEL1x8_I1\r
+ #dcbt AO, PRE\r
+ KERNEL1x8_2\r
+ KERNEL1x8_1\r
+ #dcbt AO, PRE\r
+ KERNEL1x8_2\r
+\r
+ KERNEL1x8_1\r
+ #dcbt AO, PRE\r
+ KERNEL1x8_2\r
+ KERNEL1x8_1\r
+ #dcbt AO, PRE\r
+ KERNEL1x8_2\r
+\r
+ addic. L, L, -2\r
+ ble LDGEMM_L1x8_LOOP_END\r
+\r
+ MY_ALIGN\r
+\r
+LDGEMM_L1x8_LOOP:\r
+\r
+ KERNEL1x8_1\r
+ #dcbt AO, PRE\r
+ KERNEL1x8_2\r
+ KERNEL1x8_1\r
+ #dcbt AO, PRE\r
+ KERNEL1x8_2\r
+\r
+ KERNEL1x8_1\r
+ #dcbt AO, PRE\r
+ KERNEL1x8_2\r
+ KERNEL1x8_1\r
+ #dcbt AO, PRE\r
+ KERNEL1x8_2\r
+\r
+ addic. L, L, -1\r
+ bgt LDGEMM_L1x8_LOOP\r
+\r
+LDGEMM_L1x8_LOOP_END:\r
+\r
+ KERNEL1x8_1\r
+ KERNEL1x8_2\r
+ KERNEL1x8_1\r
+ KERNEL1x8_2\r
+\r
+ KERNEL1x8_1\r
+ KERNEL1x8_2\r
+ KERNEL1x8_1\r
+ KERNEL1x8_E2\r
+\r
+ b LDGEMM_L1x8_SUB1\r
+\r
+LDGEMM_L1x8_SUB4:\r
+\r
+ KERNEL1x8_SUBI1\r
+ KERNEL1x8_SUB1\r
+ KERNEL1x8_SUB1\r
+ KERNEL1x8_SUB1\r
+\r
+ KERNEL1x8_SUB1\r
+ KERNEL1x8_SUB1\r
+ KERNEL1x8_SUB1\r
+ KERNEL1x8_SUB1\r
+\r
+ b LDGEMM_L1x8_SUB1\r
+\r
+LDGEMM_L1x8_SUB0:\r
+#if defined(TRMMKERNEL)\r
+ andi. L, T3, 7\r
+#else\r
+ andi. L, K, 7\r
+#endif\r
+\r
+ KERNEL1x8_SUBI1\r
+\r
+ addic. L, L, -1\r
+ ble LDGEMM_L1x8_SAVE\r
+ b LDGEMM_L1x8_SUB2\r
+\r
+LDGEMM_L1x8_SUB1:\r
+#if defined(TRMMKERNEL)\r
+ andi. L, T3, 7\r
+#else\r
+ andi. L, K, 7\r
+#endif\r
+ ble LDGEMM_L1x8_SAVE\r
+\r
+LDGEMM_L1x8_SUB2:\r
+\r
+ KERNEL1x8_SUB1\r
+\r
+ addic. L, L, -1\r
+ bgt LDGEMM_L1x8_SUB2\r
+\r
+LDGEMM_L1x8_SAVE:\r
+\r
+ SAVE1x8\r
+#if defined(TRMMKERNEL) \r
+ REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,8,1\r
+#endif\r
+LDGEMM_L1x8_END:\r
+\r
+LDGEMM_L1x4_BEGIN:\r
+\r
+\r
+ andi. T1, M, 4\r
+ ble LDGEMM_L1x4_END\r
+#if defined(TRMMKERNEL)\r
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1\r
+ REFRESH_TEMP_BK T3,K,TEMP_REG,4,1\r
+ srawi. L, T3, 3 \r
+#else\r
+ mr BO, B\r
+ srawi. L, K, 3 \r
+#endif \r
+ ble LDGEMM_L1x4_SUB0\r
+ cmpwi cr0, L, 1\r
+ ble LDGEMM_L1x4_SUB4\r
+\r
+LDGEMM_L1x4_LOOP_START:\r
+\r
+ LOAD1x4_1\r
+ KERNEL1x4_I1\r
+ KERNEL1x4_2\r
+ KERNEL1x4_1\r
+ KERNEL1x4_2\r
+\r
+ KERNEL1x4_1\r
+ KERNEL1x4_2\r
+ KERNEL1x4_1\r
+ KERNEL1x4_2\r
+\r
+ addic. L, L, -2\r
+ ble LDGEMM_L1x4_LOOP_END\r
+\r
+ MY_ALIGN\r
+\r
+LDGEMM_L1x4_LOOP:\r
+\r
+ KERNEL1x4_1\r
+ KERNEL1x4_2\r
+ KERNEL1x4_1\r
+ KERNEL1x4_2\r
+\r
+ KERNEL1x4_1\r
+ KERNEL1x4_2\r
+ KERNEL1x4_1\r
+ KERNEL1x4_2\r
+\r
+ addic. L, L, -1\r
+ bgt LDGEMM_L1x4_LOOP\r
+\r
+LDGEMM_L1x4_LOOP_END:\r
+\r
+ KERNEL1x4_1\r
+ KERNEL1x4_2\r
+ KERNEL1x4_1\r
+ KERNEL1x4_2\r
+\r
+ KERNEL1x4_1\r
+ KERNEL1x4_2\r
+ KERNEL1x4_1\r
+ KERNEL1x4_E2\r
+\r
+ b LDGEMM_L1x4_SUB1\r
+\r
+LDGEMM_L1x4_SUB4:\r
+\r
+ KERNEL1x4_SUBI1\r
+ KERNEL1x4_SUB1\r
+ KERNEL1x4_SUB1\r
+ KERNEL1x4_SUB1\r
+\r
+ KERNEL1x4_SUB1\r
+ KERNEL1x4_SUB1\r
+ KERNEL1x4_SUB1\r
+ KERNEL1x4_SUB1\r
+\r
+ b LDGEMM_L1x4_SUB1\r
+\r
+LDGEMM_L1x4_SUB0:\r
+#if defined(TRMMKERNEL)\r
+ andi. L, T3, 7\r
+#else\r
+ andi. L, K, 7\r
+#endif\r
+\r
+ KERNEL1x4_SUBI1\r
+\r
+ addic. L, L, -1\r
+ ble LDGEMM_L1x4_SAVE\r
+ b LDGEMM_L1x4_SUB2\r
+\r
+LDGEMM_L1x4_SUB1:\r
+#if defined(TRMMKERNEL)\r
+ andi. L, T3, 7\r
+#else\r
+ andi. L, K, 7\r
+#endif\r
+ ble LDGEMM_L1x4_SAVE\r
+\r
+LDGEMM_L1x4_SUB2:\r
+\r
+ KERNEL1x4_SUB1\r
+\r
+ addic. L, L, -1\r
+ bgt LDGEMM_L1x4_SUB2\r
+\r
+LDGEMM_L1x4_SAVE:\r
+\r
+ SAVE1x4\r
+#if defined(TRMMKERNEL) \r
+ REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,4,1\r
+#endif\r
+LDGEMM_L1x4_END:\r
+\r
+LDGEMM_L1x2_BEGIN:\r
+\r
+\r
+ andi. T1, M, 2\r
+ ble LDGEMM_L1x2_END\r
+#if defined(TRMMKERNEL)\r
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1\r
+ REFRESH_TEMP_BK T3,K,TEMP_REG,2,1\r
+ srawi. L, T3, 3 \r
+#else\r
+ mr BO, B\r
+ srawi. L, K, 3 \r
+#endif \r
+ ble LDGEMM_L1x2_SUB0\r
+ cmpwi cr0, L, 1\r
+ ble LDGEMM_L1x2_SUB4\r
+\r
+LDGEMM_L1x2_LOOP_START:\r
+\r
+ LOAD1x2_1\r
+ KERNEL1x2_I1\r
+ KERNEL1x2_2\r
+ KERNEL1x2_1\r
+ KERNEL1x2_2\r
+\r
+ KERNEL1x2_1\r
+ KERNEL1x2_2\r
+ KERNEL1x2_1\r
+ KERNEL1x2_2\r
+\r
+ addic. L, L, -2\r
+ ble LDGEMM_L1x2_LOOP_END\r
+\r
+ MY_ALIGN\r
+\r
+LDGEMM_L1x2_LOOP:\r
+\r
+ KERNEL1x2_1\r
+ KERNEL1x2_2\r
+ KERNEL1x2_1\r
+ KERNEL1x2_2\r
+\r
+ KERNEL1x2_1\r
+ KERNEL1x2_2\r
+ KERNEL1x2_1\r
+ KERNEL1x2_2\r
+\r
+ addic. L, L, -1\r
+ bgt LDGEMM_L1x2_LOOP\r
+\r
+LDGEMM_L1x2_LOOP_END:\r
+\r
+ KERNEL1x2_1\r
+ KERNEL1x2_2\r
+ KERNEL1x2_1\r
+ KERNEL1x2_2\r
+\r
+ KERNEL1x2_1\r
+ KERNEL1x2_2\r
+ KERNEL1x2_1\r
+ KERNEL1x2_E2\r
+\r
+ b LDGEMM_L1x2_SUB1\r
+\r
+LDGEMM_L1x2_SUB4:\r
+\r
+ KERNEL1x2_SUBI1\r
+ KERNEL1x2_SUB1\r
+ KERNEL1x2_SUB1\r
+ KERNEL1x2_SUB1\r
+\r
+ KERNEL1x2_SUB1\r
+ KERNEL1x2_SUB1\r
+ KERNEL1x2_SUB1\r
+ KERNEL1x2_SUB1\r
+\r
+ b LDGEMM_L1x2_SUB1\r
+\r
+LDGEMM_L1x2_SUB0:\r
+#if defined(TRMMKERNEL)\r
+ andi. L, T3, 7\r
+#else\r
+ andi. L, K, 7\r
+#endif\r
+\r
+ KERNEL1x2_SUBI1\r
+\r
+ addic. L, L, -1\r
+ ble LDGEMM_L1x2_SAVE\r
+ b LDGEMM_L1x2_SUB2\r
+\r
+LDGEMM_L1x2_SUB1:\r
+#if defined(TRMMKERNEL)\r
+ andi. L, T3, 7\r
+#else\r
+ andi. L, K, 7\r
+#endif\r
+ ble LDGEMM_L1x2_SAVE\r
+\r
+LDGEMM_L1x2_SUB2:\r
+\r
+ KERNEL1x2_SUB1\r
+\r
+ addic. L, L, -1\r
+ bgt LDGEMM_L1x2_SUB2\r
+\r
+LDGEMM_L1x2_SAVE:\r
+\r
+ SAVE1x2\r
+#if defined(TRMMKERNEL) \r
+ REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,2,1\r
+#endif\r
+LDGEMM_L1x2_END:\r
+\r
+LDGEMM_L1x1_BEGIN:\r
+\r
+\r
+ andi. T1, M, 1\r
+ ble LDGEMM_L1x1_END\r
+#if defined(TRMMKERNEL)\r
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1\r
+ REFRESH_TEMP_BK T3,K,TEMP_REG,1,1\r
+ srawi. L, T3, 3 \r
+#else\r
+ mr BO, B\r
+ srawi. L, K, 3 \r
+#endif \r
+ ble LDGEMM_L1x1_SUB0\r
+ cmpwi cr0, L, 1\r
+ ble LDGEMM_L1x1_SUB4\r
+\r
+LDGEMM_L1x1_LOOP_START:\r
+\r
+ LOAD1x1_1\r
+ KERNEL1x1_I1\r
+ KERNEL1x1_2\r
+ KERNEL1x1_1\r
+ KERNEL1x1_2\r
+\r
+ KERNEL1x1_1\r
+ KERNEL1x1_2\r
+ KERNEL1x1_1\r
+ KERNEL1x1_2\r
+\r
+ addic. L, L, -2\r
+ ble LDGEMM_L1x1_LOOP_END\r
+\r
+ MY_ALIGN\r
+\r
+LDGEMM_L1x1_LOOP:\r
+\r
+ KERNEL1x1_1\r
+ KERNEL1x1_2\r
+ KERNEL1x1_1\r
+ KERNEL1x1_2\r
+\r
+ KERNEL1x1_1\r
+ KERNEL1x1_2\r
+ KERNEL1x1_1\r
+ KERNEL1x1_2\r
+\r
+ addic. L, L, -1\r
+ bgt LDGEMM_L1x1_LOOP\r
+\r
+LDGEMM_L1x1_LOOP_END:\r
+\r
+ KERNEL1x1_1\r
+ KERNEL1x1_2\r
+ KERNEL1x1_1\r
+ KERNEL1x1_2\r
+\r
+ KERNEL1x1_1\r
+ KERNEL1x1_2\r
+ KERNEL1x1_1\r
+ KERNEL1x1_E2\r
+\r
+ b LDGEMM_L1x1_SUB1\r
+\r
+LDGEMM_L1x1_SUB4:\r
+\r
+ KERNEL1x1_SUBI1\r
+ KERNEL1x1_SUB1\r
+ KERNEL1x1_SUB1\r
+ KERNEL1x1_SUB1\r
+\r
+ KERNEL1x1_SUB1\r
+ KERNEL1x1_SUB1\r
+ KERNEL1x1_SUB1\r
+ KERNEL1x1_SUB1\r
+\r
+ b LDGEMM_L1x1_SUB1\r
+\r
+LDGEMM_L1x1_SUB0:\r
+#if defined(TRMMKERNEL)\r
+ andi. L, T3, 7\r
+#else\r
+ andi. L, K, 7\r
+#endif\r
+\r
+ KERNEL1x1_SUBI1\r
+\r
+ addic. L, L, -1\r
+ ble LDGEMM_L1x1_SAVE\r
+ b LDGEMM_L1x1_SUB2\r
+\r
+LDGEMM_L1x1_SUB1:\r
+#if defined(TRMMKERNEL)\r
+ andi. L, T3, 7\r
+#else\r
+ andi. L, K, 7\r
+#endif\r
+ ble LDGEMM_L1x1_SAVE\r
+\r
+LDGEMM_L1x1_SUB2:\r
+\r
+ KERNEL1x1_SUB1\r
+\r
+ addic. L, L, -1\r
+ bgt LDGEMM_L1x1_SUB2\r
+\r
+LDGEMM_L1x1_SAVE:\r
+\r
+ SAVE1x1\r
+#if defined(TRMMKERNEL) \r
+ REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,1,1\r
+#endif\r
+LDGEMM_L1x1_END:\r
+#if defined(TRMMKERNEL) && !defined(LEFT)\r
+ addi TEMP_REG, TEMP_REG, 1\r
+#endif\r
+LDGEMM_L1_END:\r
--- /dev/null
+/***************************************************************************\r
+Copyright (c) 2013-2019, The OpenBLAS Project\r
+All rights reserved.\r
+Redistribution and use in source and binary forms, with or without\r
+modification, are permitted provided that the following conditions are\r
+met:\r
+1. Redistributions of source code must retain the above copyright\r
+notice, this list of conditions and the following disclaimer.\r
+2. Redistributions in binary form must reproduce the above copyright\r
+notice, this list of conditions and the following disclaimer in\r
+the documentation and/or other materials provided with the\r
+distribution.\r
+3. Neither the name of the OpenBLAS project nor the names of\r
+its contributors may be used to endorse or promote products\r
+derived from this software without specific prior written permission.\r
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"\r
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\r
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE\r
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE\r
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\r
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\r
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\r
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\r
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE\r
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
+*****************************************************************************/\r
+\r
+/**************************************************************************************\r
+* Abdelrauf (quickwritereader@googlemail.com)\r
+* BLASTEST : OK\r
+* CTEST : OK\r
+* TEST : OK\r
+* LAPACK-TEST : OK\r
+**************************************************************************************/\r
+\r
+/*********************************************************************\r
+* Macros for N=4, M=16 *\r
+*********************************************************************/\r
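+/*\r
+* The 4x16 tile keeps the whole accumulator set vs32-vs63 live: 16 rows x 4\r
+* columns of doubles, two per VSX register. A streams through vs0-vs7 and\r
+* vs8-vs15 (double buffered), B through vs24-vs27 and vs28-vs31, where the\r
+* odd-numbered B registers are doubleword-swapped copies made with xxpermdi.\r
+*/\r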
+.macro LOAD4x16_1\r
+ LOAD4x16 1\r
+.endm\r
+\r
+.macro LOAD4x16_0\r
+ LOAD4x16 0\r
+.endm\r
+.macro LOAD4x16 Zero\r
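+	/* load one k-step: 4 B values (vs24, vs26) plus swapped copies (vs25, vs27) and 16 A values (vs0-vs7); Zero=1 also clears the accumulators */\r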
+\r
+ lxv vs24, 0(BO)\r
+ lxv vs26, 16(BO)\r
+ xxpermdi vs25, vs24, vs24,2 \r
+ xxpermdi vs27, vs26, vs26,2\r
+\r
+ lxv vs0, 0(AO)\r
+ lxv vs1, 16(AO)\r
+ lxv vs2, 32(AO)\r
+ lxv vs3, 48(AO)\r
+ \r
+\r
+ lxv vs4, 64(AO)\r
+ lxv vs5, 80(AO)\r
+ lxv vs6, 96(AO)\r
+ lxv vs7, 112(AO)\r
+.if \Zero==1 \r
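+	/* clear all 32 accumulators */\r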
+ xxlxor vs32,vs32,vs32\r
+ xxlxor vs33,vs33,vs33\r
+ xxlxor vs34,vs34,vs34\r
+ xxlxor vs35,vs35,vs35\r
+ xxlxor vs36,vs36,vs36\r
+ xxlxor vs37,vs37,vs37\r
+ xxlxor vs38,vs38,vs38\r
+ xxlxor vs39,vs39,vs39\r
+ xxlxor vs40, vs40, vs40\r
+ xxlxor vs41, vs41, vs41\r
+ xxlxor vs42, vs42, vs42\r
+ xxlxor vs43, vs43, vs43\r
+ xxlxor vs44, vs44, vs44\r
+ xxlxor vs45, vs45, vs45\r
+ xxlxor vs46, vs46, vs46\r
+ xxlxor vs47, vs47, vs47\r
+ xxlxor vs48, vs48, vs48\r
+ xxlxor vs49, vs49, vs49\r
+ xxlxor vs50, vs50, vs50\r
+ xxlxor vs51, vs51, vs51 \r
+ xxlxor vs52, vs52, vs52\r
+ xxlxor vs53, vs53, vs53\r
+ xxlxor vs54, vs54, vs54\r
+ xxlxor vs55, vs55, vs55 \r
+ xxlxor vs56, vs56, vs56\r
+ xxlxor vs57, vs57, vs57\r
+ xxlxor vs58, vs58, vs58\r
+ xxlxor vs59, vs59, vs59 \r
+ xxlxor vs60, vs60, vs60\r
+ xxlxor vs61, vs61, vs61\r
+ xxlxor vs62, vs62, vs62\r
+ xxlxor vs63, vs63, vs63 \r
+.endif\r
+.endm\r
+\r
+ \r
+#define unit_size 8\r
+#define DISP32(ind,disp) (ind*unit_size*32+disp)\r
+#define DISP16(ind,disp) (ind*unit_size*16+disp)\r
+#define DISP8(ind,disp) (ind*unit_size*8+disp)\r
+#define DISP4(ind,disp) (ind*unit_size*4+disp)\r
+#define DISP2(ind,disp) (ind*unit_size*2+disp)\r
+#define DISP1(ind,disp) (ind*unit_size+disp)\r
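+/* unit_size is sizeof(double); DISPn(ind,disp) is the byte offset of unrolled\r
+   iteration 'ind' in a stream of n doubles per step, plus a fixed bias 'disp' */\r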
+\r
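+/* Kernel-variant naming: .._I1_L2 runs its first k-step with xvmuldp (First=1),\r
+   .._I1_L2_2 accumulates (First=0), and .._I1_L2_3 is the closing call\r
+   (Complete=1) that skips the lookahead loads and lets IsLast advance AO/BO */\r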
+.macro KERNEL4x16_L1_L2 Index,IsLast\r
+ KERNEL4x16_L1_L2_I AO,BO, 0,0,0, \Index,\IsLast,0\r
+.endm\r
+\r
+\r
+\r
+.macro KERNEL4x16_I1_L2 OffsetA,OffsetB, Index,IsLast\r
+ KERNEL4x16_L1_L2_I AO,BO,1,\OffsetA,\OffsetB,\Index,\IsLast,0\r
+.endm\r
+\r
+.macro KERNEL4x16_I1_L2_2 OffsetA,OffsetB, Index,IsLast\r
+ KERNEL4x16_L1_L2_I AO,BO, 0,\OffsetA,\OffsetB,\Index,\IsLast,0\r
+.endm\r
+\r
+.macro KERNEL4x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast\r
+ KERNEL4x16_L1_L2_I AO,BO, 0,\OffsetA,\OffsetB,\Index,\IsLast,1\r
+.endm\r
+\r
+.macro KERNEL4x16_I2_L2 AREG,BREG,OffsetA,OffsetB, Index,IsLast\r
+ KERNEL4x16_L1_L2_I \AREG,\BREG,1,\OffsetA,\OffsetB,\Index,\IsLast,0\r
+.endm\r
+\r
+.macro KERNEL4x16_I2_L2_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast\r
+ KERNEL4x16_L1_L2_I \AREG,\BREG, 0,\OffsetA,\OffsetB,\Index,\IsLast,0\r
+.endm\r
+\r
+.macro KERNEL4x16_I2_L2_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast\r
+ KERNEL4x16_L1_L2_I \AREG,\BREG, 0,\OffsetA,\OffsetB,\Index,\IsLast,1\r
+.endm\r
+\r
+.macro KERNEL4x16_L1_L2_I AREG,BREG, First, OffsetA,OffsetB, Index,IsLast ,Complete\r
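+/* Two k-steps, software pipelined: the FMAs on vs0-vs7/vs24-vs27 overlap the\r
+   loads of vs8-vs15/vs28-vs31 for the second step, whose FMAs in turn overlap\r
+   reloading vs0-vs7/vs24-vs27 for the next call (skipped when Complete=1) */\r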
+\r
+.if \First ==1\r
+ xvmuldp vs32, vs0, vs24\r
+ xvmuldp vs33, vs1, vs24\r
+ xvmuldp vs34, vs2, vs24\r
+ xvmuldp vs35, vs3, vs24\r
+.else\r
+ xvmaddadp vs32, vs0, vs24\r
+ xvmaddadp vs33, vs1, vs24\r
+ xvmaddadp vs34, vs2, vs24\r
+ xvmaddadp vs35, vs3, vs24\r
+.endif\r
+ lxv vs8, DISP32(\Index,0+\OffsetA)(\AREG)\r
+ lxv vs9, DISP32(\Index,16+\OffsetA)(\AREG)\r
+ lxv vs10, DISP32(\Index,32+\OffsetA)(\AREG)\r
+ lxv vs11, DISP32(\Index,48+\OffsetA)(\AREG)\r
+.if \First ==1\r
+ xvmuldp vs36, vs4, vs24\r
+ xvmuldp vs37, vs5, vs24\r
+ xvmuldp vs38, vs6, vs24\r
+ xvmuldp vs39, vs7, vs24\r
+.else\r
+ xvmaddadp vs36, vs4, vs24\r
+ xvmaddadp vs37, vs5, vs24\r
+ xvmaddadp vs38, vs6, vs24\r
+ xvmaddadp vs39, vs7, vs24\r
+.endif\r
+ lxv vs28, DISP8(\Index,0 +\OffsetB)(\BREG)\r
+ lxv vs30, DISP8(\Index,16 +\OffsetB)(\BREG)\r
+ xxpermdi vs29, vs28, vs28,2 \r
+ xxpermdi vs31, vs30, vs30,2\r
+.if \First ==1\r
+ xvmuldp vs40, vs0, vs25\r
+ xvmuldp vs41, vs1, vs25\r
+ xvmuldp vs42, vs2, vs25\r
+ xvmuldp vs43, vs3, vs25\r
+\r
+\r
+ xvmuldp vs44, vs4, vs25\r
+ xvmuldp vs45, vs5, vs25\r
+ xvmuldp vs46, vs6, vs25\r
+ xvmuldp vs47, vs7, vs25\r
+\r
+\r
+ xvmuldp vs48, vs0, vs26\r
+ xvmuldp vs49, vs1, vs26\r
+ xvmuldp vs50, vs2, vs26\r
+ xvmuldp vs51, vs3, vs26\r
+\r
+\r
+.else\r
+ xvmaddadp vs40, vs0, vs25\r
+ xvmaddadp vs41, vs1, vs25\r
+ xvmaddadp vs42, vs2, vs25\r
+ xvmaddadp vs43, vs3, vs25\r
+\r
+\r
+ xvmaddadp vs44, vs4, vs25\r
+ xvmaddadp vs45, vs5, vs25\r
+ xvmaddadp vs46, vs6, vs25\r
+ xvmaddadp vs47, vs7, vs25\r
+\r
+\r
+ xvmaddadp vs48, vs0, vs26\r
+ xvmaddadp vs49, vs1, vs26\r
+ xvmaddadp vs50, vs2, vs26\r
+ xvmaddadp vs51, vs3, vs26\r
+\r
+.endif\r
+ lxv vs12, DISP32(\Index,64+\OffsetA)(\AREG)\r
+ lxv vs13, DISP32(\Index,80+\OffsetA)(\AREG)\r
+.if \First ==1\r
+ xvmuldp vs52, vs4, vs26\r
+ xvmuldp vs53, vs5, vs26\r
+ xvmuldp vs54, vs6, vs26\r
+ xvmuldp vs55, vs7, vs26\r
+\r
+.else\r
+ xvmaddadp vs52, vs4, vs26\r
+ xvmaddadp vs53, vs5, vs26\r
+ xvmaddadp vs54, vs6, vs26\r
+ xvmaddadp vs55, vs7, vs26\r
+.endif\r
+ lxv vs14, DISP32(\Index,96+\OffsetA)(\AREG)\r
+ lxv vs15, DISP32(\Index,112+\OffsetA)(\AREG)\r
+.if \First ==1\r
+ xvmuldp vs56, vs0, vs27\r
+ xvmuldp vs57, vs1, vs27\r
+ xvmuldp vs58, vs2, vs27\r
+ xvmuldp vs59, vs3, vs27\r
+\r
+ \r
+\r
+ xvmuldp vs60, vs4, vs27\r
+ xvmuldp vs61, vs5, vs27\r
+ xvmuldp vs62, vs6, vs27\r
+ xvmuldp vs63, vs7, vs27\r
+\r
+.else\r
+ xvmaddadp vs56, vs0, vs27\r
+ xvmaddadp vs57, vs1, vs27\r
+ xvmaddadp vs58, vs2, vs27\r
+ xvmaddadp vs59, vs3, vs27\r
+\r
+ \r
+\r
+ xvmaddadp vs60, vs4, vs27\r
+ xvmaddadp vs61, vs5, vs27\r
+ xvmaddadp vs62, vs6, vs27\r
+ xvmaddadp vs63, vs7, vs27\r
+.endif\r
+\r
+ xvmaddadp vs32, vs8, vs28\r
+ xvmaddadp vs33, vs9, vs28\r
+ xvmaddadp vs34, vs10, vs28\r
+ xvmaddadp vs35, vs11, vs28\r
+.if \Complete==0\r
+ lxv vs0, DISP32(\Index,128+\OffsetA)(\AREG)\r
+ lxv vs1, DISP32(\Index,144+\OffsetA)(\AREG)\r
+.endif\r
+ xvmaddadp vs36, vs12, vs28\r
+ xvmaddadp vs37, vs13, vs28\r
+ xvmaddadp vs38, vs14, vs28\r
+ xvmaddadp vs39, vs15, vs28\r
+.if \Complete==0\r
+ lxv vs24, DISP8(\Index,32 +\OffsetB)(\BREG)\r
+ lxv vs26, DISP8(\Index,48 +\OffsetB)(\BREG)\r
+ xxpermdi vs25, vs24, vs24,2 \r
+ xxpermdi vs27, vs26, vs26,2\r
+.endif\r
+ xvmaddadp vs40, vs8, vs29\r
+ xvmaddadp vs41, vs9, vs29\r
+ xvmaddadp vs42, vs10, vs29\r
+ xvmaddadp vs43, vs11, vs29\r
+.if \Complete==0\r
+ lxv vs2, DISP32(\Index,160+\OffsetA)(\AREG)\r
+ lxv vs3, DISP32(\Index,176+\OffsetA)(\AREG)\r
+.endif\r
+ xvmaddadp vs44, vs12, vs29\r
+ xvmaddadp vs45, vs13, vs29\r
+ xvmaddadp vs46, vs14, vs29\r
+ xvmaddadp vs47, vs15, vs29\r
+\r
+\r
+ xvmaddadp vs48, vs8, vs30\r
+ xvmaddadp vs49, vs9, vs30\r
+ xvmaddadp vs50, vs10, vs30\r
+ xvmaddadp vs51, vs11, vs30\r
+.if \Complete==0\r
+ lxv vs4, DISP32(\Index,192+\OffsetA)(\AREG)\r
+ lxv vs5, DISP32(\Index,208+\OffsetA)(\AREG)\r
+.endif\r
+ xvmaddadp vs52, vs12, vs30\r
+ xvmaddadp vs53, vs13, vs30\r
+ xvmaddadp vs54, vs14, vs30\r
+ xvmaddadp vs55, vs15, vs30\r
+.if \Complete==0\r
+ lxv vs6, DISP32(\Index,224+\OffsetA)(\AREG)\r
+ lxv vs7, DISP32(\Index,240+\OffsetA)(\AREG)\r
+.endif\r
+ xvmaddadp vs56, vs8, vs31\r
+ xvmaddadp vs57, vs9, vs31\r
+ xvmaddadp vs58, vs10, vs31\r
+ xvmaddadp vs59, vs11, vs31\r
+ \r
+\r
+ xvmaddadp vs60, vs12, vs31\r
+ \r
+ xvmaddadp vs61, vs13, vs31\r
+ xvmaddadp vs62, vs14, vs31\r
+ \r
+ xvmaddadp vs63, vs15, vs31\r
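+	/* on the last call advance the pointers; with Complete=1 only the bytes actually consumed are added */\r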
+ .if \IsLast==1 \r
+ .if \Complete==1\r
+ addi \AREG, \AREG, DISP32(\Index,128+\OffsetA)\r
+ addi \BREG, \BREG, DISP8(\Index,32+\OffsetB)\r
+ .else\r
+ addi \AREG, \AREG, DISP32(\Index,256)\r
+ addi \BREG, \BREG, DISP8(\Index,64)\r
+ .endif\r
+ .endif\r
+ \r
+\r
+.endm\r
+\r
+ \r
+\r
+.macro KERNEL4x16 First\r
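+/* Plain single k-step used for the K remainder; First=1 overwrites the\r
+   accumulators instead of accumulating into them */\r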
+\r
+ lxv vs24, 0(BO)\r
+ lxv vs26, 16(BO)\r
+ xxpermdi vs25, vs24, vs24,2 \r
+ xxpermdi vs27, vs26, vs26,2\r
+\r
+ lxv vs0, 0(AO)\r
+ lxv vs1, 16(AO)\r
+ lxv vs2, 32(AO)\r
+ lxv vs3, 48(AO) \r
+\r
+ lxv vs4, 64(AO)\r
+ lxv vs5, 80(AO)\r
+ lxv vs6, 96(AO)\r
+ lxv vs7, 112(AO)\r
+\r
+\r
+ \r
+ addi BO, BO, 32\r
+ addi AO, AO, 128\r
+\r
+.if \First==1\r
+ xvmuldp vs32, vs0, vs24\r
+ xvmuldp vs33, vs1, vs24\r
+ xvmuldp vs34, vs2, vs24\r
+ xvmuldp vs35, vs3, vs24\r
+ xvmuldp vs36, vs4, vs24\r
+ xvmuldp vs37, vs5, vs24\r
+ xvmuldp vs38, vs6, vs24\r
+ xvmuldp vs39, vs7, vs24\r
+\r
+ xvmuldp vs40, vs0, vs25\r
+ xvmuldp vs41, vs1, vs25\r
+ xvmuldp vs42, vs2, vs25\r
+ xvmuldp vs43, vs3, vs25\r
+ xvmuldp vs44, vs4, vs25\r
+ xvmuldp vs45, vs5, vs25\r
+ xvmuldp vs46, vs6, vs25\r
+ xvmuldp vs47, vs7, vs25\r
+\r
+ xvmuldp vs48, vs0, vs26\r
+ xvmuldp vs49, vs1, vs26\r
+ xvmuldp vs50, vs2, vs26\r
+ xvmuldp vs51, vs3, vs26\r
+ xvmuldp vs52, vs4, vs26\r
+ xvmuldp vs53, vs5, vs26\r
+ xvmuldp vs54, vs6, vs26\r
+ xvmuldp vs55, vs7, vs26\r
+\r
+ xvmuldp vs56, vs0, vs27\r
+ xvmuldp vs57, vs1, vs27\r
+ xvmuldp vs58, vs2, vs27\r
+ xvmuldp vs59, vs3, vs27\r
+ xvmuldp vs60, vs4, vs27\r
+ xvmuldp vs61, vs5, vs27\r
+ xvmuldp vs62, vs6, vs27\r
+ xvmuldp vs63, vs7, vs27\r
+.else\r
+ xvmaddadp vs32, vs0, vs24\r
+ xvmaddadp vs33, vs1, vs24\r
+ xvmaddadp vs34, vs2, vs24\r
+ xvmaddadp vs35, vs3, vs24\r
+ xvmaddadp vs36, vs4, vs24\r
+ xvmaddadp vs37, vs5, vs24\r
+ xvmaddadp vs38, vs6, vs24\r
+ xvmaddadp vs39, vs7, vs24\r
+\r
+ xvmaddadp vs40, vs0, vs25\r
+ xvmaddadp vs41, vs1, vs25\r
+ xvmaddadp vs42, vs2, vs25\r
+ xvmaddadp vs43, vs3, vs25\r
+ \r
+ xvmaddadp vs44, vs4, vs25\r
+ xvmaddadp vs45, vs5, vs25\r
+ xvmaddadp vs46, vs6, vs25\r
+ xvmaddadp vs47, vs7, vs25\r
+\r
+ xvmaddadp vs48, vs0, vs26\r
+ xvmaddadp vs49, vs1, vs26\r
+ xvmaddadp vs50, vs2, vs26\r
+ xvmaddadp vs51, vs3, vs26\r
+ \r
+ xvmaddadp vs52, vs4, vs26\r
+ xvmaddadp vs53, vs5, vs26\r
+ xvmaddadp vs54, vs6, vs26\r
+ xvmaddadp vs55, vs7, vs26\r
+\r
+ xvmaddadp vs56, vs0, vs27\r
+ xvmaddadp vs57, vs1, vs27\r
+ xvmaddadp vs58, vs2, vs27\r
+ xvmaddadp vs59, vs3, vs27\r
+ xvmaddadp vs60, vs4, vs27\r
+ xvmaddadp vs61, vs5, vs27\r
+ xvmaddadp vs62, vs6, vs27\r
+ xvmaddadp vs63, vs7, vs27\r
+\r
+.endif\r
+.endm\r
+\r
+.macro SAVE4x16_REGS\r
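+	/* C2..C4 point at columns 1..3 of the C tile (LDC in bytes) */\r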
+ add C2, CO, LDC\r
+ add C3, C2, LDC\r
+ add C4, C3, LDC\r
+.endm\r
+\r
+.macro SAVE4x16\r
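+/* Merge each accumulator pair back into column order with xxpermdi (undoing the\r
+   swapped-B trick), then write out: C += alpha*AB for GEMM, C = alpha*AB for\r
+   TRMM, which never reads C */\r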
+#ifndef TRMMKERNEL\r
+ lxv vs0, 0(CO)\r
+ lxv vs2, 16(CO)\r
+ lxv vs4, 32(CO)\r
+ lxv vs6, 48(CO)\r
+#endif \r
+ xxpermdi vs8, vs40,vs32,1\r
+ xxpermdi vs9 ,vs32,vs40,1\r
+#ifndef TRMMKERNEL\r
+ lxv vs24, 64(CO)\r
+ lxv vs26, 80(CO)\r
+ lxv vs28, 96(CO)\r
+ lxv vs30, 112(CO)\r
+#endif \r
+ xxpermdi vs10, vs41,vs33,1 \r
+ xxpermdi vs11 ,vs33,vs41,1\r
+#ifndef TRMMKERNEL \r
+ lxv vs1, 0(C2)\r
+ lxv vs3, 16(C2)\r
+ lxv vs5, 32(C2)\r
+ lxv vs7, 48(C2)\r
+#endif \r
+ xxpermdi vs12, vs42,vs34,1\r
+ xxpermdi vs13 ,vs34,vs42,1\r
+#ifndef TRMMKERNEL\r
+ lxv vs25, 64(C2)\r
+ lxv vs27, 80(C2)\r
+#endif \r
+ xxpermdi vs14, vs43,vs35,1 \r
+ xxpermdi vs15 ,vs35,vs43,1 \r
+#ifndef TRMMKERNEL \r
+ lxv vs29, 96(C2)\r
+ lxv vs31, 112(C2) \r
+#endif\r
+\r
+#ifndef TRMMKERNEL\r
+ xvmaddadp vs0, vs8, alpha_r \r
+ xvmaddadp vs1, vs9, alpha_r \r
+ xvmaddadp vs2, vs10, alpha_r \r
+ xvmaddadp vs3, vs11, alpha_r \r
+#else\r
+ xvmuldp vs0, vs8, alpha_r \r
+ xvmuldp vs1, vs9, alpha_r \r
+ xvmuldp vs2, vs10, alpha_r \r
+ xvmuldp vs3, vs11, alpha_r \r
+\r
+#endif\r
+ xxpermdi vs8, vs44,vs36,1\r
+ xxpermdi vs9 ,vs36,vs44,1\r
+ xxpermdi vs10, vs45,vs37,1 \r
+ xxpermdi vs11 ,vs37,vs45,1\r
+#ifndef TRMMKERNEL\r
+ xvmaddadp vs4, vs12, alpha_r \r
+ xvmaddadp vs5, vs13, alpha_r \r
+ xvmaddadp vs6, vs14, alpha_r \r
+ xvmaddadp vs7, vs15, alpha_r \r
+#else\r
+ xvmuldp vs4, vs12, alpha_r \r
+ xvmuldp vs5, vs13, alpha_r \r
+ xvmuldp vs6, vs14, alpha_r \r
+ xvmuldp vs7, vs15, alpha_r \r
+#endif\r
+ xxpermdi vs12, vs46,vs38,1\r
+ xxpermdi vs13 ,vs38,vs46,1\r
+ xxpermdi vs14, vs47,vs39,1 \r
+ xxpermdi vs15 ,vs39,vs47,1\r
+\r
+#ifndef TRMMKERNEL \r
+ xvmaddadp vs24, vs8, alpha_r \r
+ xvmaddadp vs25, vs9, alpha_r \r
+ xvmaddadp vs26, vs10, alpha_r \r
+ xvmaddadp vs27, vs11, alpha_r \r
+\r
+ xvmaddadp vs28, vs12, alpha_r \r
+ xvmaddadp vs29, vs13, alpha_r \r
+ xvmaddadp vs30, vs14, alpha_r \r
+ xvmaddadp vs31, vs15, alpha_r \r
+#else\r
+ xvmuldp vs24, vs8, alpha_r \r
+ xvmuldp vs25, vs9, alpha_r \r
+ xvmuldp vs26, vs10, alpha_r \r
+ xvmuldp vs27, vs11, alpha_r \r
+\r
+ xvmuldp vs28, vs12, alpha_r \r
+ xvmuldp vs29, vs13, alpha_r \r
+ xvmuldp vs30, vs14, alpha_r \r
+ xvmuldp vs31, vs15, alpha_r \r
+\r
+#endif\r
+ stxv vs0, 0(CO)\r
+ stxv vs2, 16(CO)\r
+ stxv vs4, 32(CO)\r
+ stxv vs6, 48(CO)\r
+\r
+ stxv vs24, 64(CO)\r
+ stxv vs26, 80(CO)\r
+ stxv vs28, 96(CO)\r
+ stxv vs30, 112(CO)\r
+\r
+ stxv vs1, 0(C2)\r
+ stxv vs3, 16(C2)\r
+ stxv vs5, 32(C2)\r
+ stxv vs7, 48(C2)\r
+ \r
+ stxv vs25, 64(C2)\r
+ stxv vs27, 80(C2)\r
+ stxv vs29, 96(C2)\r
+ stxv vs31, 112(C2) \r
+#ifndef TRMMKERNEL\r
+ lxv vs0, 0(C3)\r
+ lxv vs2, 16(C3)\r
+ lxv vs4, 32(C3)\r
+ lxv vs6, 48(C3)\r
+#endif \r
+ xxpermdi vs8, vs56,vs48,1\r
+ xxpermdi vs9 ,vs48,vs56,1\r
+#ifndef TRMMKERNEL \r
+ lxv vs24, 64(C3)\r
+ lxv vs26, 80(C3)\r
+#endif \r
+ xxpermdi vs10, vs57,vs49,1 \r
+ xxpermdi vs11 ,vs49,vs57,1 \r
+#ifndef TRMMKERNEL \r
+ lxv vs28, 96(C3)\r
+ lxv vs30, 112(C3)\r
+#endif \r
+ xxpermdi vs12, vs58,vs50,1\r
+ xxpermdi vs13 ,vs50,vs58,1\r
+#ifndef TRMMKERNEL \r
+ lxv vs1, 0(C4)\r
+ lxv vs3, 16(C4)\r
+#endif \r
+ xxpermdi vs14, vs59,vs51,1 \r
+ xxpermdi vs15 ,vs51,vs59,1 \r
+#ifndef TRMMKERNEL \r
+ lxv vs5, 32(C4)\r
+ lxv vs7, 48(C4)\r
+\r
+ lxv vs25, 64(C4)\r
+ lxv vs27, 80(C4)\r
+ lxv vs29, 96(C4)\r
+ lxv vs31, 112(C4) \r
+#endif\r
+ \r
+#ifndef TRMMKERNEL \r
+ xvmaddadp vs0, vs8, alpha_r \r
+ xvmaddadp vs1, vs9, alpha_r \r
+ xvmaddadp vs2, vs10, alpha_r \r
+ xvmaddadp vs3, vs11, alpha_r \r
+#else\r
+ xvmuldp vs0, vs8, alpha_r \r
+ xvmuldp vs1, vs9, alpha_r \r
+ xvmuldp vs2, vs10, alpha_r \r
+ xvmuldp vs3, vs11, alpha_r \r
+\r
+#endif\r
+\r
+ xxpermdi vs8, vs60,vs52,1\r
+ xxpermdi vs9 ,vs52,vs60,1\r
+ xxpermdi vs10, vs61,vs53,1 \r
+ xxpermdi vs11 ,vs53,vs61,1\r
+#ifndef TRMMKERNEL\r
+ xvmaddadp vs4, vs12, alpha_r \r
+ xvmaddadp vs5, vs13, alpha_r \r
+ xvmaddadp vs6, vs14, alpha_r \r
+ xvmaddadp vs7, vs15, alpha_r \r
+#else\r
+ xvmuldp vs4, vs12, alpha_r \r
+ xvmuldp vs5, vs13, alpha_r \r
+ xvmuldp vs6, vs14, alpha_r \r
+ xvmuldp vs7, vs15, alpha_r \r
+#endif\r
+\r
+\r
+ xxpermdi vs12, vs62,vs54,1\r
+ xxpermdi vs13 ,vs54,vs62,1\r
+ xxpermdi vs14, vs63,vs55,1 \r
+ xxpermdi vs15 ,vs55,vs63,1\r
+#ifndef TRMMKERNEL \r
+ xvmaddadp vs24, vs8, alpha_r \r
+ xvmaddadp vs25, vs9, alpha_r \r
+ xvmaddadp vs26, vs10, alpha_r \r
+ xvmaddadp vs27, vs11, alpha_r \r
+\r
+ xvmaddadp vs28, vs12, alpha_r \r
+ xvmaddadp vs29, vs13, alpha_r \r
+ xvmaddadp vs30, vs14, alpha_r \r
+ xvmaddadp vs31, vs15, alpha_r \r
+#else\r
+ xvmuldp vs24, vs8, alpha_r \r
+ xvmuldp vs25, vs9, alpha_r \r
+ xvmuldp vs26, vs10, alpha_r \r
+ xvmuldp vs27, vs11, alpha_r \r
+\r
+ xvmuldp vs28, vs12, alpha_r \r
+ xvmuldp vs29, vs13, alpha_r \r
+ xvmuldp vs30, vs14, alpha_r \r
+ xvmuldp vs31, vs15, alpha_r \r
+#endif\r
+ stxv vs0, 0(C3)\r
+ stxv vs2, 16(C3)\r
+ stxv vs4, 32(C3)\r
+ stxv vs6, 48(C3)\r
+\r
+ stxv vs24, 64(C3)\r
+ stxv vs26, 80(C3)\r
+ stxv vs28, 96(C3)\r
+ stxv vs30, 112(C3)\r
+\r
+ stxv vs1, 0(C4)\r
+ stxv vs3, 16(C4)\r
+ stxv vs5, 32(C4)\r
+ stxv vs7, 48(C4)\r
+ \r
+ stxv vs25, 64(C4)\r
+ stxv vs27, 80(C4)\r
+ stxv vs29, 96(C4)\r
+ stxv vs31, 112(C4) \r
+\r
+ addi CO, CO, 128\r
+.endm\r
+\r
+/*********************************************************************\r
+* Macros for N=4, M=8 *\r
+*********************************************************************/\r
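+/* Same scheme as 4x16 with half the rows: A uses vs0-vs3 (vs8-vs11 for the\r
+   pipelined step) and the live accumulators are vs32-35, vs40-43, vs48-51,\r
+   vs56-59 */\r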
+\r
+.macro LOAD4x8_1\r
+ LOAD4x8 1\r
+.endm\r
+\r
+.macro LOAD4x8_0\r
+ LOAD4x8 0\r
+.endm\r
+.macro LOAD4x8 Zero\r
+\r
+ lxv vs24, 0(BO)\r
+ lxv vs26, 16(BO)\r
+ xxpermdi vs25, vs24, vs24,2 \r
+ xxpermdi vs27, vs26, vs26,2\r
+\r
+ lxv vs0, 0(AO)\r
+ lxv vs1, 16(AO)\r
+ lxv vs2, 32(AO)\r
+ lxv vs3, 48(AO)\r
+ \r
+\r
+\r
+.if \Zero==1 \r
+ xxlxor vs32,vs32,vs32\r
+ xxlxor vs33,vs33,vs33\r
+ xxlxor vs34,vs34,vs34\r
+ xxlxor vs35,vs35,vs35\r
+\r
+ xxlxor vs40, vs40, vs40\r
+ xxlxor vs41, vs41, vs41\r
+ xxlxor vs42, vs42, vs42\r
+ xxlxor vs43, vs43, vs43\r
+\r
+ xxlxor vs48, vs48, vs48\r
+ xxlxor vs49, vs49, vs49\r
+ xxlxor vs50, vs50, vs50\r
+ xxlxor vs51, vs51, vs51 \r
+\r
+ xxlxor vs56, vs56, vs56\r
+ xxlxor vs57, vs57, vs57\r
+ xxlxor vs58, vs58, vs58\r
+ xxlxor vs59, vs59, vs59 \r
+\r
+.endif\r
+.endm\r
+\r
+ \r
+ \r
+.macro KERNEL4x8_L1_L2 Index,IsLast\r
+ KERNEL4x8_L1_L2_I 0,0,0, \Index,\IsLast,0\r
+.endm\r
+\r
+\r
+\r
+.macro KERNEL4x8_I1_L2 OffsetA,OffsetB, Index,IsLast\r
+ KERNEL4x8_L1_L2_I 1,\OffsetA,\OffsetB,\Index,\IsLast,0\r
+.endm\r
+\r
+.macro KERNEL4x8_I1_L2_2 OffsetA,OffsetB, Index,IsLast\r
+ KERNEL4x8_L1_L2_I 0,\OffsetA,\OffsetB,\Index,\IsLast,0\r
+.endm\r
+\r
+.macro KERNEL4x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast\r
+ KERNEL4x8_L1_L2_I 0,\OffsetA,\OffsetB,\Index,\IsLast,1\r
+.endm\r
+\r
+.macro KERNEL4x8_L1_L2_I First, OffsetA,OffsetB, Index,IsLast ,Complete\r
+\r
+ lxv vs8, DISP16(\Index,0+\OffsetA)(AO)\r
+ lxv vs9, DISP16(\Index,16+\OffsetA)(AO)\r
+.if \First ==1\r
+ xvmuldp vs32, vs0, vs24\r
+ xvmuldp vs33, vs1, vs24\r
+ xvmuldp vs34, vs2, vs24\r
+ xvmuldp vs35, vs3, vs24\r
+.else\r
+ xvmaddadp vs32, vs0, vs24\r
+ xvmaddadp vs33, vs1, vs24\r
+ xvmaddadp vs34, vs2, vs24\r
+ xvmaddadp vs35, vs3, vs24\r
+.endif\r
+\r
+ lxv vs10, DISP16(\Index,32+\OffsetA)(AO)\r
+ lxv vs11, DISP16(\Index,48+\OffsetA)(AO)\r
+\r
+\r
+\r
+.if \First ==1\r
+ xvmuldp vs40, vs0, vs25\r
+ xvmuldp vs41, vs1, vs25\r
+ xvmuldp vs42, vs2, vs25\r
+ xvmuldp vs43, vs3, vs25\r
+\r
+\r
+ xvmuldp vs48, vs0, vs26\r
+ xvmuldp vs49, vs1, vs26\r
+ xvmuldp vs50, vs2, vs26\r
+ xvmuldp vs51, vs3, vs26\r
+\r
+\r
+.else\r
+\r
+ lxv vs28, DISP8(\Index,0 +\OffsetB)(BO)\r
+ lxv vs30, DISP8(\Index,16 +\OffsetB)(BO)\r
+\r
+ xvmaddadp vs40, vs0, vs25\r
+ xvmaddadp vs41, vs1, vs25\r
+ xvmaddadp vs42, vs2, vs25\r
+ xvmaddadp vs43, vs3, vs25\r
+\r
+\r
+ xvmaddadp vs48, vs0, vs26\r
+ xvmaddadp vs49, vs1, vs26\r
+ xvmaddadp vs50, vs2, vs26\r
+ xvmaddadp vs51, vs3, vs26\r
+\r
+.endif\r
+ xxpermdi vs29, vs28, vs28,2 \r
+ xxpermdi vs31, vs30, vs30,2\r
+.if \First ==1\r
+ xvmuldp vs56, vs0, vs27\r
+ xvmuldp vs57, vs1, vs27\r
+ xvmuldp vs58, vs2, vs27\r
+ xvmuldp vs59, vs3, vs27\r
+\r
+.else\r
+ xvmaddadp vs56, vs0, vs27\r
+ xvmaddadp vs57, vs1, vs27\r
+ xvmaddadp vs58, vs2, vs27\r
+ xvmaddadp vs59, vs3, vs27\r
+\r
+.endif\r
+\r
+ xvmaddadp vs32, vs8, vs28\r
+ xvmaddadp vs33, vs9, vs28\r
+ xvmaddadp vs34, vs10, vs28\r
+ xvmaddadp vs35, vs11, vs28\r
+.if \Complete==0\r
+ lxv vs0, DISP16(\Index,64+\OffsetA)(AO)\r
+ lxv vs1, DISP16(\Index,80+\OffsetA)(AO) \r
+.endif\r
+\r
+\r
+ xvmaddadp vs40, vs8, vs29\r
+ xvmaddadp vs41, vs9, vs29\r
+ xvmaddadp vs42, vs10, vs29\r
+ xvmaddadp vs43, vs11, vs29\r
+\r
+.if \Complete==0 \r
+ lxv vs2, DISP16(\Index,96+\OffsetA)(AO)\r
+ lxv vs3, DISP16(\Index,112+\OffsetA)(AO)\r
+.endif \r
+\r
+\r
+ xvmaddadp vs48, vs8, vs30\r
+ xvmaddadp vs49, vs9, vs30\r
+ xvmaddadp vs50, vs10, vs30\r
+ xvmaddadp vs51, vs11, vs30\r
+.if \Complete==0\r
+ lxv vs24, DISP8(\Index,32 +\OffsetB)(BO)\r
+ lxv vs26, DISP8(\Index,48 +\OffsetB)(BO) \r
+.endif\r
+ \r
+ xvmaddadp vs56, vs8, vs31\r
+ xvmaddadp vs57, vs9, vs31\r
+ xvmaddadp vs58, vs10, vs31\r
+ xvmaddadp vs59, vs11, vs31\r
+.if \Complete==0 \r
+ xxpermdi vs25, vs24, vs24,2 \r
+ xxpermdi vs27, vs26, vs26,2\r
+.endif\r
+\r
+ .if \IsLast==1 \r
+ .if \Complete==1\r
+ addi AO, AO, DISP16(\Index,64+\OffsetA)\r
+ addi BO, BO, DISP8(\Index,32+\OffsetB)\r
+ .else\r
+ addi AO, AO, DISP16(\Index,128)\r
+ addi BO, BO, DISP8(\Index,64)\r
+ .endif\r
+ .endif\r
+ \r
+\r
+.endm\r
+\r
+ \r
+\r
+.macro KERNEL4x8 First\r
+\r
+ lxv vs24, 0(BO)\r
+ lxv vs26, 16(BO)\r
+ xxpermdi vs25, vs24, vs24,2 \r
+ xxpermdi vs27, vs26, vs26,2\r
+\r
+ lxv vs0, 0(AO)\r
+ lxv vs1, 16(AO)\r
+ lxv vs2, 32(AO)\r
+ lxv vs3, 48(AO) \r
+\r
+\r
+\r
+ \r
+ addi BO, BO, 32\r
+ addi AO, AO, 64\r
+\r
+.if \First==1\r
+ xvmuldp vs32, vs0, vs24\r
+ xvmuldp vs33, vs1, vs24\r
+ xvmuldp vs34, vs2, vs24\r
+ xvmuldp vs35, vs3, vs24\r
+ \r
+\r
+ xvmuldp vs40, vs0, vs25\r
+ xvmuldp vs41, vs1, vs25\r
+ xvmuldp vs42, vs2, vs25\r
+ xvmuldp vs43, vs3, vs25\r
+ \r
+\r
+ xvmuldp vs48, vs0, vs26\r
+ xvmuldp vs49, vs1, vs26\r
+ xvmuldp vs50, vs2, vs26\r
+ xvmuldp vs51, vs3, vs26\r
+ \r
+\r
+ xvmuldp vs56, vs0, vs27\r
+ xvmuldp vs57, vs1, vs27\r
+ xvmuldp vs58, vs2, vs27\r
+ xvmuldp vs59, vs3, vs27\r
+ \r
+.else\r
+ xvmaddadp vs32, vs0, vs24\r
+ xvmaddadp vs33, vs1, vs24\r
+ xvmaddadp vs34, vs2, vs24\r
+ xvmaddadp vs35, vs3, vs24\r
+\r
+\r
+ xvmaddadp vs40, vs0, vs25\r
+ xvmaddadp vs41, vs1, vs25\r
+ xvmaddadp vs42, vs2, vs25\r
+ xvmaddadp vs43, vs3, vs25\r
+ \r
+\r
+\r
+ xvmaddadp vs48, vs0, vs26\r
+ xvmaddadp vs49, vs1, vs26\r
+ xvmaddadp vs50, vs2, vs26\r
+ xvmaddadp vs51, vs3, vs26\r
+ \r
+\r
+\r
+ xvmaddadp vs56, vs0, vs27\r
+ xvmaddadp vs57, vs1, vs27\r
+ xvmaddadp vs58, vs2, vs27\r
+ xvmaddadp vs59, vs3, vs27\r
+\r
+\r
+.endif\r
+.endm\r
+\r
+ \r
+\r
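+/* SAVE4x8: the kernels above multiplied by {b0,b1} and its swapped\r
+   copy, so each accumulator holds halves of two C columns. The\r
+   xxpermdi pairs (e.g. vs8/vs9 from vs40/vs32) recombine them into\r
+   per-column vectors; C is then updated as C += alpha*AB, or written\r
+   as C = alpha*AB for TRMM, where C is not loaded first. */\r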
+.macro SAVE4x8\r
+ add T2, CO, LDC\r
+ add T3, T2, LDC\r
+ add T4, T3, LDC\r
+#ifndef TRMMKERNEL\r
+ lxv vs0, 0(CO)\r
+ lxv vs2, 16(CO)\r
+#endif \r
+ xxpermdi vs8, vs40,vs32,1\r
+ xxpermdi vs9, vs32,vs40,1\r
+#ifndef TRMMKERNEL \r
+ lxv vs4, 32(CO)\r
+ lxv vs6, 48(CO)\r
+#endif \r
+ xxpermdi vs10, vs41,vs33,1 \r
+ xxpermdi vs11, vs33,vs41,1\r
+#ifndef TRMMKERNEL \r
+ lxv vs1, 0(T2)\r
+ lxv vs3, 16(T2)\r
+#endif \r
+ xxpermdi vs12, vs42,vs34,1\r
+ xxpermdi vs13, vs34,vs42,1\r
+#ifndef TRMMKERNEL \r
+ lxv vs5, 32(T2)\r
+ lxv vs7, 48(T2)\r
+#endif \r
+ xxpermdi vs14, vs43,vs35,1 \r
+ xxpermdi vs15, vs35,vs43,1\r
+ \r
+\r
+\r
+#ifndef TRMMKERNEL \r
+ xvmaddadp vs0, vs8, alpha_r \r
+ xvmaddadp vs1, vs9, alpha_r \r
+ xvmaddadp vs2, vs10, alpha_r \r
+ xvmaddadp vs3, vs11, alpha_r \r
+\r
+ xvmaddadp vs4, vs12, alpha_r \r
+ xvmaddadp vs5, vs13, alpha_r \r
+ xvmaddadp vs6, vs14, alpha_r \r
+ xvmaddadp vs7, vs15, alpha_r \r
+#else\r
+ xvmuldp vs0, vs8, alpha_r \r
+ xvmuldp vs1, vs9, alpha_r \r
+ xvmuldp vs2, vs10, alpha_r \r
+ xvmuldp vs3, vs11, alpha_r \r
+\r
+ xvmuldp vs4, vs12, alpha_r \r
+ xvmuldp vs5, vs13, alpha_r \r
+ xvmuldp vs6, vs14, alpha_r \r
+ xvmuldp vs7, vs15, alpha_r \r
+\r
+#endif\r
+ \r
+\r
+ stxv vs0, 0(CO)\r
+ stxv vs2, 16(CO)\r
+ stxv vs4, 32(CO)\r
+ stxv vs6, 48(CO)\r
+\r
+ \r
+ stxv vs1, 0(T2)\r
+ stxv vs3, 16(T2)\r
+ stxv vs5, 32(T2)\r
+ stxv vs7, 48(T2)\r
+ \r
+ \r
+ xxpermdi vs8, vs56,vs48,1\r
+ xxpermdi vs9, vs48,vs56,1\r
+#ifndef TRMMKERNEL \r
+ lxv vs0, 0(T3)\r
+ lxv vs2, 16(T3)\r
+#endif \r
+ xxpermdi vs10, vs57,vs49,1 \r
+ xxpermdi vs11, vs49,vs57,1\r
+#ifndef TRMMKERNEL \r
+ lxv vs4, 32(T3)\r
+ lxv vs6, 48(T3)\r
+#endif \r
+ xxpermdi vs12, vs58,vs50,1\r
+ xxpermdi vs13, vs50,vs58,1\r
+#ifndef TRMMKERNEL \r
+ lxv vs1, 0(T4)\r
+ lxv vs3, 16(T4)\r
+#endif \r
+ xxpermdi vs14, vs59,vs51,1 \r
+ xxpermdi vs15, vs51,vs59,1\r
+#ifndef TRMMKERNEL \r
+ lxv vs5, 32(T4)\r
+ lxv vs7, 48(T4)\r
+#endif\r
+\r
+#ifndef TRMMKERNEL\r
+ xvmaddadp vs0, vs8, alpha_r \r
+ xvmaddadp vs1, vs9, alpha_r \r
+ xvmaddadp vs2, vs10, alpha_r \r
+ xvmaddadp vs3, vs11, alpha_r \r
+ \r
+\r
+\r
+ xvmaddadp vs4, vs12, alpha_r \r
+ xvmaddadp vs5, vs13, alpha_r \r
+ xvmaddadp vs6, vs14, alpha_r \r
+ xvmaddadp vs7, vs15, alpha_r \r
+#else\r
+ xvmuldp vs0, vs8, alpha_r \r
+ xvmuldp vs1, vs9, alpha_r \r
+ xvmuldp vs2, vs10, alpha_r \r
+ xvmuldp vs3, vs11, alpha_r \r
+ \r
+\r
+\r
+ xvmuldp vs4, vs12, alpha_r \r
+ xvmuldp vs5, vs13, alpha_r \r
+ xvmuldp vs6, vs14, alpha_r \r
+ xvmuldp vs7, vs15, alpha_r \r
+\r
+#endif\r
+\r
+\r
+ stxv vs0, 0(T3)\r
+ stxv vs2, 16(T3)\r
+ stxv vs4, 32(T3)\r
+ stxv vs6, 48(T3)\r
+\r
+ \r
+ stxv vs1, 0(T4)\r
+ stxv vs3, 16(T4)\r
+ stxv vs5, 32(T4)\r
+ stxv vs7, 48(T4)\r
+ \r
+ \r
+\r
+ addi CO, CO, 64\r
+.endm\r
+\r
+\r
+/*********************************************************************\r
+* Macros for N=4, M=4 *\r
+*********************************************************************/\r
+\r
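+/* From here down B is splat-loaded with lxvdsx, which broadcasts one\r
+   double across both vector lanes; each xvmaddadp then updates two\r
+   consecutive elements of a single C column and the SAVE macros need\r
+   no permutes. */\r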
+.macro LOAD4x4_1\r
+\r
+ lxvd2x vs0, 0, AO\r
+ lxvd2x vs1, o16, AO\r
+\r
+ lxvdsx vs24, 0, BO\r
+ lxvdsx vs25, o8, BO\r
+ lxvdsx vs26, o16, BO\r
+ lxvdsx vs27, o24, BO\r
+\r
+ addi AO, AO, 32\r
+ addi BO, BO, 32\r
+\r
+.endm\r
+\r
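+/* Suffix convention for the kernels below: _I1 starts the software\r
+   pipeline (xvmuldp while prefetching the next A/B into vs8../vs28..),\r
+   _1 and _2 alternate between the two register sets, _E2 drains the\r
+   pipeline without loading, and _SUBI1/_SUB1 handle leftover k\r
+   iterations one at a time. */\r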
+.macro KERNEL4x4_I1\r
+\r
+ lxvd2x vs8, 0, AO\r
+ lxvd2x vs9, o16, AO\r
+\r
+ lxvdsx vs28, 0, BO\r
+ lxvdsx vs29, o8, BO\r
+ lxvdsx vs30, o16, BO\r
+ lxvdsx vs31, o24, BO\r
+\r
+ addi AO, AO, 32\r
+ addi BO, BO, 32\r
+\r
+\r
+ xvmuldp vs32, vs0, vs24\r
+ xvmuldp vs33, vs1, vs24\r
+\r
+ xvmuldp vs40, vs0, vs25\r
+ xvmuldp vs41, vs1, vs25\r
+\r
+ xvmuldp vs48, vs0, vs26\r
+ xvmuldp vs49, vs1, vs26\r
+\r
+ xvmuldp vs56, vs0, vs27\r
+ xvmuldp vs57, vs1, vs27\r
+\r
+.endm\r
+\r
+.macro KERNEL4x4_1\r
+\r
+ lxvd2x vs8, 0, AO\r
+ lxvd2x vs9, o16, AO\r
+\r
+ lxvdsx vs28, 0, BO\r
+ lxvdsx vs29, o8, BO\r
+ lxvdsx vs30, o16, BO\r
+ lxvdsx vs31, o24, BO\r
+\r
+ addi AO, AO, 32\r
+ addi BO, BO, 32\r
+\r
+\r
+ xvmaddadp vs32, vs0, vs24\r
+ xvmaddadp vs33, vs1, vs24\r
+\r
+ xvmaddadp vs40, vs0, vs25\r
+ xvmaddadp vs41, vs1, vs25\r
+\r
+ xvmaddadp vs48, vs0, vs26\r
+ xvmaddadp vs49, vs1, vs26\r
+\r
+ xvmaddadp vs56, vs0, vs27\r
+ xvmaddadp vs57, vs1, vs27\r
+\r
+.endm\r
+\r
+.macro KERNEL4x4_2\r
+\r
+ lxvd2x vs0, 0, AO\r
+ lxvd2x vs1, o16, AO\r
+\r
+ lxvdsx vs24, 0, BO\r
+ lxvdsx vs25, o8, BO\r
+ lxvdsx vs26, o16, BO\r
+ lxvdsx vs27, o24, BO\r
+\r
+ addi AO, AO, 32\r
+ addi BO, BO, 32\r
+\r
+\r
+ xvmaddadp vs32, vs8, vs28\r
+ xvmaddadp vs33, vs9, vs28\r
+\r
+ xvmaddadp vs40, vs8, vs29\r
+ xvmaddadp vs41, vs9, vs29\r
+\r
+ xvmaddadp vs48, vs8, vs30\r
+ xvmaddadp vs49, vs9, vs30\r
+\r
+ xvmaddadp vs56, vs8, vs31\r
+ xvmaddadp vs57, vs9, vs31\r
+\r
+.endm\r
+\r
+.macro KERNEL4x4_E2\r
+\r
+\r
+ xvmaddadp vs32, vs8, vs28\r
+ xvmaddadp vs33, vs9, vs28\r
+\r
+ xvmaddadp vs40, vs8, vs29\r
+ xvmaddadp vs41, vs9, vs29\r
+\r
+ xvmaddadp vs48, vs8, vs30\r
+ xvmaddadp vs49, vs9, vs30\r
+\r
+ xvmaddadp vs56, vs8, vs31\r
+ xvmaddadp vs57, vs9, vs31\r
+\r
+.endm\r
+\r
+.macro KERNEL4x4_SUBI1\r
+\r
+ lxvd2x vs0, 0, AO\r
+ lxvd2x vs1, o16, AO\r
+\r
+ lxvdsx vs24, 0, BO\r
+ lxvdsx vs25, o8, BO\r
+ lxvdsx vs26, o16, BO\r
+ lxvdsx vs27, o24, BO\r
+\r
+ addi AO, AO, 32\r
+ addi BO, BO, 32\r
+\r
+\r
+ xvmuldp vs32, vs0, vs24\r
+ xvmuldp vs33, vs1, vs24\r
+\r
+ xvmuldp vs40, vs0, vs25\r
+ xvmuldp vs41, vs1, vs25\r
+\r
+ xvmuldp vs48, vs0, vs26\r
+ xvmuldp vs49, vs1, vs26\r
+\r
+ xvmuldp vs56, vs0, vs27\r
+ xvmuldp vs57, vs1, vs27\r
+\r
+.endm\r
+\r
+.macro KERNEL4x4_SUB1\r
+\r
+ lxvd2x vs0, 0, AO\r
+ lxvd2x vs1, o16, AO\r
+\r
+ lxvdsx vs24, 0, BO\r
+ lxvdsx vs25, o8, BO\r
+ lxvdsx vs26, o16, BO\r
+ lxvdsx vs27, o24, BO\r
+\r
+ addi AO, AO, 32\r
+ addi BO, BO, 32\r
+\r
+\r
+ xvmaddadp vs32, vs0, vs24\r
+ xvmaddadp vs33, vs1, vs24\r
+\r
+ xvmaddadp vs40, vs0, vs25\r
+ xvmaddadp vs41, vs1, vs25\r
+\r
+ xvmaddadp vs48, vs0, vs26\r
+ xvmaddadp vs49, vs1, vs26\r
+\r
+ xvmaddadp vs56, vs0, vs27\r
+ xvmaddadp vs57, vs1, vs27\r
+\r
+.endm\r
+\r
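+/* SAVE4x4: walk the four C columns through T1 (stepping by LDC); for\r
+   GEMM load the old column and fuse with alpha via xvmaddadp, for\r
+   TRMM just scale the accumulators, then store and advance. */\r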
+.macro SAVE4x4\r
+\r
+ mr T1, CO\r
+\r
+#ifndef TRMMKERNEL\r
+ lxvd2x vs0, 0, T1\r
+ lxvd2x vs1, o16, T1\r
+#endif\r
+\r
+#ifndef TRMMKERNEL\r
+ xvmaddadp vs0, vs32, alpha_r\r
+ xvmaddadp vs1, vs33, alpha_r\r
+#else\r
+ xvmuldp vs0, vs32, alpha_r\r
+ xvmuldp vs1, vs33, alpha_r\r
+#endif\r
+\r
+ stxvd2x vs0, 0, T1\r
+ stxvd2x vs1, o16, T1\r
+\r
+ add T1, T1, LDC\r
+\r
+#ifndef TRMMKERNEL\r
+ lxvd2x vs8, 0, T1\r
+ lxvd2x vs9, o16, T1\r
+#endif\r
+\r
+#ifndef TRMMKERNEL\r
+ xvmaddadp vs8, vs40, alpha_r\r
+ xvmaddadp vs9, vs41, alpha_r\r
+#else\r
+ xvmuldp vs8, vs40, alpha_r\r
+ xvmuldp vs9, vs41, alpha_r\r
+#endif\r
+\r
+ stxvd2x vs8, 0, T1\r
+ stxvd2x vs9, o16, T1\r
+\r
+ add T1, T1, LDC\r
+\r
+#ifndef TRMMKERNEL\r
+ lxvd2x vs0, 0, T1\r
+ lxvd2x vs1, o16, T1\r
+#endif\r
+\r
+#ifndef TRMMKERNEL\r
+ xvmaddadp vs0, vs48, alpha_r\r
+ xvmaddadp vs1, vs49, alpha_r\r
+#else\r
+ xvmuldp vs0, vs48, alpha_r\r
+ xvmuldp vs1, vs49, alpha_r\r
+#endif\r
+\r
+ stxvd2x vs0, 0, T1\r
+ stxvd2x vs1, o16, T1\r
+\r
+ add T1, T1, LDC\r
+\r
+#ifndef TRMMKERNEL\r
+ lxvd2x vs8, 0, T1\r
+ lxvd2x vs9, o16, T1\r
+#endif\r
+\r
+#ifndef TRMMKERNEL\r
+ xvmaddadp vs8, vs56, alpha_r\r
+ xvmaddadp vs9, vs57, alpha_r\r
+#else\r
+ xvmuldp vs8, vs56, alpha_r\r
+ xvmuldp vs9, vs57, alpha_r\r
+#endif\r
+\r
+ stxvd2x vs8, 0, T1\r
+ stxvd2x vs9, o16, T1\r
+\r
+ addi CO, CO, 32\r
+\r
+.endm\r
+\r
+/*********************************************************************\r
+* Macros for N=4, M=2 *\r
+*********************************************************************/\r
+\r
+.macro LOAD4x2_1\r
+\r
+ lxvd2x vs0, 0, AO\r
+\r
+ lxvdsx vs24, 0, BO\r
+ lxvdsx vs25, o8, BO\r
+ lxvdsx vs26, o16, BO\r
+ lxvdsx vs27, o24, BO\r
+\r
+ addi AO, AO, 16\r
+ addi BO, BO, 32\r
+\r
+.endm\r
+\r
+.macro KERNEL4x2_I1\r
+\r
+ lxvd2x vs8, 0, AO\r
+\r
+ lxvdsx vs28, 0, BO\r
+ lxvdsx vs29, o8, BO\r
+ lxvdsx vs30, o16, BO\r
+ lxvdsx vs31, o24, BO\r
+\r
+ addi AO, AO, 16\r
+ addi BO, BO, 32\r
+\r
+\r
+ xvmuldp vs32, vs0, vs24\r
+\r
+ xvmuldp vs40, vs0, vs25\r
+\r
+ xvmuldp vs48, vs0, vs26\r
+\r
+ xvmuldp vs56, vs0, vs27\r
+\r
+.endm\r
+\r
+.macro KERNEL4x2_1\r
+\r
+ lxvd2x vs8, 0, AO\r
+\r
+ lxvdsx vs28, 0, BO\r
+ lxvdsx vs29, o8, BO\r
+ lxvdsx vs30, o16, BO\r
+ lxvdsx vs31, o24, BO\r
+\r
+ addi AO, AO, 16\r
+ addi BO, BO, 32\r
+\r
+\r
+ xvmaddadp vs32, vs0, vs24\r
+\r
+ xvmaddadp vs40, vs0, vs25\r
+\r
+ xvmaddadp vs48, vs0, vs26\r
+\r
+ xvmaddadp vs56, vs0, vs27\r
+\r
+.endm\r
+\r
+.macro KERNEL4x2_2\r
+\r
+ lxvd2x vs0, 0, AO\r
+\r
+ lxvdsx vs24, 0, BO\r
+ lxvdsx vs25, o8, BO\r
+ lxvdsx vs26, o16, BO\r
+ lxvdsx vs27, o24, BO\r
+\r
+ addi AO, AO, 16\r
+ addi BO, BO, 32\r
+\r
+\r
+ xvmaddadp vs32, vs8, vs28\r
+\r
+ xvmaddadp vs40, vs8, vs29\r
+\r
+ xvmaddadp vs48, vs8, vs30\r
+\r
+ xvmaddadp vs56, vs8, vs31\r
+\r
+.endm\r
+\r
+.macro KERNEL4x2_E2\r
+\r
+\r
+ xvmaddadp vs32, vs8, vs28\r
+\r
+ xvmaddadp vs40, vs8, vs29\r
+\r
+ xvmaddadp vs48, vs8, vs30\r
+\r
+ xvmaddadp vs56, vs8, vs31\r
+\r
+.endm\r
+\r
+.macro KERNEL4x2_SUBI1\r
+\r
+ lxvd2x vs0, 0, AO\r
+\r
+ lxvdsx vs24, 0, BO\r
+ lxvdsx vs25, o8, BO\r
+ lxvdsx vs26, o16, BO\r
+ lxvdsx vs27, o24, BO\r
+\r
+ addi AO, AO, 16\r
+ addi BO, BO, 32\r
+\r
+\r
+ xvmuldp vs32, vs0, vs24\r
+\r
+ xvmuldp vs40, vs0, vs25\r
+\r
+ xvmuldp vs48, vs0, vs26\r
+\r
+ xvmuldp vs56, vs0, vs27\r
+\r
+.endm\r
+\r
+.macro KERNEL4x2_SUB1\r
+\r
+ lxvd2x vs0, 0, AO\r
+\r
+ lxvdsx vs24, 0, BO\r
+ lxvdsx vs25, o8, BO\r
+ lxvdsx vs26, o16, BO\r
+ lxvdsx vs27, o24, BO\r
+\r
+ addi AO, AO, 16\r
+ addi BO, BO, 32\r
+\r
+\r
+ xvmaddadp vs32, vs0, vs24\r
+\r
+ xvmaddadp vs40, vs0, vs25\r
+\r
+ xvmaddadp vs48, vs0, vs26\r
+\r
+ xvmaddadp vs56, vs0, vs27\r
+\r
+.endm\r
+\r
+.macro SAVE4x2\r
+\r
+ mr T1, CO\r
+\r
+#ifndef TRMMKERNEL\r
+ lxvd2x vs0, 0, T1\r
+#endif\r
+\r
+#ifndef TRMMKERNEL\r
+ xvmaddadp vs0, vs32, alpha_r\r
+#else\r
+ xvmuldp vs0, vs32, alpha_r\r
+#endif\r
+\r
+ stxvd2x vs0, 0, T1\r
+\r
+ add T1, T1, LDC\r
+\r
+#ifndef TRMMKERNEL\r
+ lxvd2x vs8, 0, T1\r
+#endif\r
+\r
+#ifndef TRMMKERNEL\r
+ xvmaddadp vs8, vs40, alpha_r\r
+#else\r
+ xvmuldp vs8, vs40, alpha_r\r
+#endif\r
+\r
+ stxvd2x vs8, 0, T1\r
+\r
+ add T1, T1, LDC\r
+\r
+#ifndef TRMMKERNEL\r
+ lxvd2x vs0, 0, T1\r
+#endif\r
+\r
+#ifndef TRMMKERNEL\r
+ xvmaddadp vs0, vs48, alpha_r\r
+#else\r
+ xvmuldp vs0, vs48, alpha_r\r
+#endif\r
+\r
+ stxvd2x vs0, 0, T1\r
+\r
+ add T1, T1, LDC\r
+\r
+#ifndef TRMMKERNEL\r
+ lxvd2x vs8, 0, T1\r
+#endif\r
+\r
+#ifndef TRMMKERNEL\r
+ xvmaddadp vs8, vs56, alpha_r\r
+#else\r
+ xvmuldp vs8, vs56, alpha_r\r
+#endif\r
+\r
+ stxvd2x vs8, 0, T1\r
+\r
+ addi CO, CO, 16\r
+\r
+.endm\r
+\r
+/*********************************************************************\r
+* Macros for N=4, M=1 *\r
+*********************************************************************/\r
+\r
+.macro LOAD4x1_1\r
+\r
+ lxsdx vs0, 0, AO\r
+\r
+ lxsdx vs24, 0, BO\r
+ lxsdx vs25, o8, BO\r
+ lxsdx vs26, o16, BO\r
+ lxsdx vs27, o24, BO\r
+\r
+ addi AO, AO, 8\r
+ addi BO, BO, 32\r
+\r
+.endm\r
+\r
+.macro KERNEL4x1_I1\r
+\r
+ lxsdx vs8, 0, AO\r
+\r
+ lxsdx vs28, 0, BO\r
+ lxsdx vs29, o8, BO\r
+ lxsdx vs30, o16, BO\r
+ lxsdx vs31, o24, BO\r
+\r
+ addi AO, AO, 8\r
+ addi BO, BO, 32\r
+\r
+\r
+ xsmuldp vs32, vs0, vs24\r
+\r
+ xsmuldp vs40, vs0, vs25\r
+\r
+ xsmuldp vs48, vs0, vs26\r
+\r
+ xsmuldp vs56, vs0, vs27\r
+\r
+.endm\r
+\r
+.macro KERNEL4x1_1\r
+\r
+ lxsdx vs8, 0, AO\r
+\r
+ lxsdx vs28, 0, BO\r
+ lxsdx vs29, o8, BO\r
+ lxsdx vs30, o16, BO\r
+ lxsdx vs31, o24, BO\r
+\r
+ addi AO, AO, 8\r
+ addi BO, BO, 32\r
+\r
+\r
+ xsmaddadp vs32, vs0, vs24\r
+\r
+ xsmaddadp vs40, vs0, vs25\r
+\r
+ xsmaddadp vs48, vs0, vs26\r
+\r
+ xsmaddadp vs56, vs0, vs27\r
+\r
+.endm\r
+\r
+.macro KERNEL4x1_2\r
+\r
+ lxsdx vs0, 0, AO\r
+\r
+ lxsdx vs24, 0, BO\r
+ lxsdx vs25, o8, BO\r
+ lxsdx vs26, o16, BO\r
+ lxsdx vs27, o24, BO\r
+\r
+ addi AO, AO, 8\r
+ addi BO, BO, 32\r
+\r
+\r
+ xsmaddadp vs32, vs8, vs28\r
+\r
+ xsmaddadp vs40, vs8, vs29\r
+\r
+ xsmaddadp vs48, vs8, vs30\r
+\r
+ xsmaddadp vs56, vs8, vs31\r
+\r
+.endm\r
+\r
+.macro KERNEL4x1_E2\r
+\r
+\r
+ xsmaddadp vs32, vs8, vs28\r
+\r
+ xsmaddadp vs40, vs8, vs29\r
+\r
+ xsmaddadp vs48, vs8, vs30\r
+\r
+ xsmaddadp vs56, vs8, vs31\r
+\r
+.endm\r
+\r
+.macro KERNEL4x1_SUBI1\r
+\r
+ lxsdx vs0, 0, AO\r
+\r
+ lxsdx vs24, 0, BO\r
+ lxsdx vs25, o8, BO\r
+ lxsdx vs26, o16, BO\r
+ lxsdx vs27, o24, BO\r
+\r
+ addi AO, AO, 8\r
+ addi BO, BO, 32\r
+\r
+\r
+ xsmuldp vs32, vs0, vs24\r
+\r
+ xsmuldp vs40, vs0, vs25\r
+\r
+ xsmuldp vs48, vs0, vs26\r
+\r
+ xsmuldp vs56, vs0, vs27\r
+\r
+.endm\r
+\r
+.macro KERNEL4x1_SUB1\r
+\r
+ lxsdx vs0, 0, AO\r
+\r
+ lxsdx vs24, 0, BO\r
+ lxsdx vs25, o8, BO\r
+ lxsdx vs26, o16, BO\r
+ lxsdx vs27, o24, BO\r
+\r
+ addi AO, AO, 8\r
+ addi BO, BO, 32\r
+\r
+\r
+ xsmaddadp vs32, vs0, vs24\r
+\r
+ xsmaddadp vs40, vs0, vs25\r
+\r
+ xsmaddadp vs48, vs0, vs26\r
+\r
+ xsmaddadp vs56, vs0, vs27\r
+\r
+.endm\r
+\r
+.macro SAVE4x1\r
+\r
+ mr T1, CO\r
+\r
+#ifndef TRMMKERNEL\r
+ lxsdx vs0, 0, T1\r
+#endif\r
+\r
+#ifndef TRMMKERNEL\r
+ xsmaddadp vs0, vs32, alpha_r\r
+#else\r
+ xsmuldp vs0, vs32, alpha_r\r
+#endif\r
+\r
+ stxsdx vs0, 0, T1\r
+\r
+ add T1, T1, LDC\r
+\r
+#ifndef TRMMKERNEL\r
+ lxsdx vs8, 0, T1\r
+#endif\r
+\r
+#ifndef TRMMKERNEL\r
+ xsmaddadp vs8, vs40, alpha_r\r
+#else\r
+ xsmuldp vs8, vs40, alpha_r\r
+#endif\r
+\r
+ stxsdx vs8, 0, T1\r
+\r
+ add T1, T1, LDC\r
+\r
+#ifndef TRMMKERNEL\r
+ lxsdx vs0, 0, T1\r
+#endif\r
+\r
+#ifndef TRMMKERNEL\r
+ xsmaddadp vs0, vs48, alpha_r\r
+#else\r
+ xsmuldp vs0, vs48, alpha_r\r
+#endif\r
+\r
+ stxsdx vs0, 0, T1\r
+\r
+ add T1, T1, LDC\r
+\r
+#ifndef TRMMKERNEL\r
+ lxsdx vs8, 0, T1\r
+#endif\r
+\r
+#ifndef TRMMKERNEL\r
+ xsmaddadp vs8, vs56, alpha_r\r
+#else\r
+ xsmuldp vs8, vs56, alpha_r\r
+#endif\r
+\r
+ stxsdx vs8, 0, T1\r
+\r
+ addi CO, CO, 8\r
+\r
+.endm\r
+\r
+/*********************************************************************\r
+* Macros for N=2, M=16 *\r
+*********************************************************************/\r
+\r
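+/* The N=2 macros mirror the N=4 ones with only two B multipliers\r
+   (vs24/vs25, plus the vs28/vs29 pipeline copies), so only the\r
+   vs32.. and vs40.. accumulator banks are live. */\r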
+.macro LOAD2x16_1\r
+\r
+ lxvd2x vs0, 0, AO\r
+ lxvd2x vs1, o16, AO\r
+ lxvd2x vs2, o32, AO\r
+ lxvd2x vs3, o48, AO\r
+\r
+ lxvdsx vs24, 0, BO\r
+ lxvdsx vs25, o8, BO\r
+\r
+ addi AO, AO, 64\r
+ addi BO, BO, 16\r
+\r
+ lxvd2x vs4, 0, AO\r
+ lxvd2x vs5, o16, AO\r
+ lxvd2x vs6, o32, AO\r
+ lxvd2x vs7, o48, AO\r
+\r
+ addi AO, AO, 64\r
+\r
+.endm\r
+\r
+.macro KERNEL2x16_I1\r
+\r
+ lxvd2x vs8, 0, AO\r
+ lxvd2x vs9, o16, AO\r
+ lxvd2x vs10, o32, AO\r
+ lxvd2x vs11, o48, AO\r
+\r
+ lxvdsx vs28, 0, BO\r
+ lxvdsx vs29, o8, BO\r
+\r
+ addi AO, AO, 64\r
+ addi BO, BO, 16\r
+\r
+ lxvd2x vs12, 0, AO\r
+ lxvd2x vs13, o16, AO\r
+ lxvd2x vs14, o32, AO\r
+ lxvd2x vs15, o48, AO\r
+\r
+ addi AO, AO, 64\r
+\r
+\r
+ xvmuldp vs32, vs0, vs24\r
+ xvmuldp vs33, vs1, vs24\r
+ xvmuldp vs34, vs2, vs24\r
+ xvmuldp vs35, vs3, vs24\r
+ xvmuldp vs36, vs4, vs24\r
+ xvmuldp vs37, vs5, vs24\r
+ xvmuldp vs38, vs6, vs24\r
+ xvmuldp vs39, vs7, vs24\r
+\r
+ xvmuldp vs40, vs0, vs25\r
+ xvmuldp vs41, vs1, vs25\r
+ xvmuldp vs42, vs2, vs25\r
+ xvmuldp vs43, vs3, vs25\r
+ xvmuldp vs44, vs4, vs25\r
+ xvmuldp vs45, vs5, vs25\r
+ xvmuldp vs46, vs6, vs25\r
+ xvmuldp vs47, vs7, vs25\r
+\r
+.endm\r
+\r
+.macro KERNEL2x16_1\r
+\r
+ lxvd2x vs8, 0, AO\r
+ lxvd2x vs9, o16, AO\r
+ lxvd2x vs10, o32, AO\r
+ lxvd2x vs11, o48, AO\r
+\r
+ lxvdsx vs28, 0, BO\r
+ lxvdsx vs29, o8, BO\r
+\r
+ addi AO, AO, 64\r
+ addi BO, BO, 16\r
+\r
+ lxvd2x vs12, 0, AO\r
+ lxvd2x vs13, o16, AO\r
+ lxvd2x vs14, o32, AO\r
+ lxvd2x vs15, o48, AO\r
+\r
+ addi AO, AO, 64\r
+\r
+\r
+ xvmaddadp vs32, vs0, vs24\r
+ xvmaddadp vs33, vs1, vs24\r
+ xvmaddadp vs34, vs2, vs24\r
+ xvmaddadp vs35, vs3, vs24\r
+ xvmaddadp vs36, vs4, vs24\r
+ xvmaddadp vs37, vs5, vs24\r
+ xvmaddadp vs38, vs6, vs24\r
+ xvmaddadp vs39, vs7, vs24\r
+\r
+ xvmaddadp vs40, vs0, vs25\r
+ xvmaddadp vs41, vs1, vs25\r
+ xvmaddadp vs42, vs2, vs25\r
+ xvmaddadp vs43, vs3, vs25\r
+ xvmaddadp vs44, vs4, vs25\r
+ xvmaddadp vs45, vs5, vs25\r
+ xvmaddadp vs46, vs6, vs25\r
+ xvmaddadp vs47, vs7, vs25\r
+\r
+.endm\r
+\r
+.macro KERNEL2x16_2\r
+\r
+ lxvd2x vs0, 0, AO\r
+ lxvd2x vs1, o16, AO\r
+ lxvd2x vs2, o32, AO\r
+ lxvd2x vs3, o48, AO\r
+\r
+ lxvdsx vs24, 0, BO\r
+ lxvdsx vs25, o8, BO\r
+\r
+ addi AO, AO, 64\r
+ addi BO, BO, 16\r
+\r
+ lxvd2x vs4, 0, AO\r
+ lxvd2x vs5, o16, AO\r
+ lxvd2x vs6, o32, AO\r
+ lxvd2x vs7, o48, AO\r
+\r
+ addi AO, AO, 64\r
+\r
+\r
+ xvmaddadp vs32, vs8, vs28\r
+ xvmaddadp vs33, vs9, vs28\r
+ xvmaddadp vs34, vs10, vs28\r
+ xvmaddadp vs35, vs11, vs28\r
+ xvmaddadp vs36, vs12, vs28\r
+ xvmaddadp vs37, vs13, vs28\r
+ xvmaddadp vs38, vs14, vs28\r
+ xvmaddadp vs39, vs15, vs28\r
+\r
+ xvmaddadp vs40, vs8, vs29\r
+ xvmaddadp vs41, vs9, vs29\r
+ xvmaddadp vs42, vs10, vs29\r
+ xvmaddadp vs43, vs11, vs29\r
+ xvmaddadp vs44, vs12, vs29\r
+ xvmaddadp vs45, vs13, vs29\r
+ xvmaddadp vs46, vs14, vs29\r
+ xvmaddadp vs47, vs15, vs29\r
+\r
+.endm\r
+\r
+.macro KERNEL2x16_E2\r
+\r
+\r
+ xvmaddadp vs32, vs8, vs28\r
+ xvmaddadp vs33, vs9, vs28\r
+ xvmaddadp vs34, vs10, vs28\r
+ xvmaddadp vs35, vs11, vs28\r
+ xvmaddadp vs36, vs12, vs28\r
+ xvmaddadp vs37, vs13, vs28\r
+ xvmaddadp vs38, vs14, vs28\r
+ xvmaddadp vs39, vs15, vs28\r
+\r
+ xvmaddadp vs40, vs8, vs29\r
+ xvmaddadp vs41, vs9, vs29\r
+ xvmaddadp vs42, vs10, vs29\r
+ xvmaddadp vs43, vs11, vs29\r
+ xvmaddadp vs44, vs12, vs29\r
+ xvmaddadp vs45, vs13, vs29\r
+ xvmaddadp vs46, vs14, vs29\r
+ xvmaddadp vs47, vs15, vs29\r
+\r
+.endm\r
+\r
+.macro KERNEL2x16_SUBI1\r
+\r
+ lxvd2x vs0, 0, AO\r
+ lxvd2x vs1, o16, AO\r
+ lxvd2x vs2, o32, AO\r
+ lxvd2x vs3, o48, AO\r
+\r
+ lxvdsx vs24, 0, BO\r
+ lxvdsx vs25, o8, BO\r
+\r
+ addi AO, AO, 64\r
+ addi BO, BO, 16\r
+\r
+ lxvd2x vs4, 0, AO\r
+ lxvd2x vs5, o16, AO\r
+ lxvd2x vs6, o32, AO\r
+ lxvd2x vs7, o48, AO\r
+\r
+ addi AO, AO, 64\r
+\r
+\r
+ xvmuldp vs32, vs0, vs24\r
+ xvmuldp vs33, vs1, vs24\r
+ xvmuldp vs34, vs2, vs24\r
+ xvmuldp vs35, vs3, vs24\r
+ xvmuldp vs36, vs4, vs24\r
+ xvmuldp vs37, vs5, vs24\r
+ xvmuldp vs38, vs6, vs24\r
+ xvmuldp vs39, vs7, vs24\r
+\r
+ xvmuldp vs40, vs0, vs25\r
+ xvmuldp vs41, vs1, vs25\r
+ xvmuldp vs42, vs2, vs25\r
+ xvmuldp vs43, vs3, vs25\r
+ xvmuldp vs44, vs4, vs25\r
+ xvmuldp vs45, vs5, vs25\r
+ xvmuldp vs46, vs6, vs25\r
+ xvmuldp vs47, vs7, vs25\r
+\r
+.endm\r
+\r
+.macro KERNEL2x16_SUB1\r
+\r
+ lxvd2x vs0, 0, AO\r
+ lxvd2x vs1, o16, AO\r
+ lxvd2x vs2, o32, AO\r
+ lxvd2x vs3, o48, AO\r
+\r
+ lxvdsx vs24, 0, BO\r
+ lxvdsx vs25, o8, BO\r
+\r
+ addi AO, AO, 64\r
+ addi BO, BO, 16\r
+\r
+ lxvd2x vs4, 0, AO\r
+ lxvd2x vs5, o16, AO\r
+ lxvd2x vs6, o32, AO\r
+ lxvd2x vs7, o48, AO\r
+\r
+ addi AO, AO, 64\r
+\r
+\r
+ xvmaddadp vs32, vs0, vs24\r
+ xvmaddadp vs33, vs1, vs24\r
+ xvmaddadp vs34, vs2, vs24\r
+ xvmaddadp vs35, vs3, vs24\r
+ xvmaddadp vs36, vs4, vs24\r
+ xvmaddadp vs37, vs5, vs24\r
+ xvmaddadp vs38, vs6, vs24\r
+ xvmaddadp vs39, vs7, vs24\r
+\r
+ xvmaddadp vs40, vs0, vs25\r
+ xvmaddadp vs41, vs1, vs25\r
+ xvmaddadp vs42, vs2, vs25\r
+ xvmaddadp vs43, vs3, vs25\r
+ xvmaddadp vs44, vs4, vs25\r
+ xvmaddadp vs45, vs5, vs25\r
+ xvmaddadp vs46, vs6, vs25\r
+ xvmaddadp vs47, vs7, vs25\r
+\r
+.endm\r
+\r
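+/* SAVE2x16: a 16-double strip of a C column spans 128 bytes, but the\r
+   indexed offsets only reach o48, so T2 = CO + 64 covers the upper\r
+   half. */\r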
+.macro SAVE2x16\r
+\r
+ mr T1, CO\r
+ addi T2, T1, 64\r
+\r
+#ifndef TRMMKERNEL\r
+ lxvd2x vs0, 0, T1\r
+ lxvd2x vs1, o16, T1\r
+ lxvd2x vs2, o32, T1\r
+ lxvd2x vs3, o48, T1\r
+\r
+ lxvd2x vs4, 0, T2\r
+ lxvd2x vs5, o16, T2\r
+ lxvd2x vs6, o32, T2\r
+ lxvd2x vs7, o48, T2\r
+#endif\r
+\r
+#ifndef TRMMKERNEL\r
+ xvmaddadp vs0, vs32, alpha_r\r
+ xvmaddadp vs1, vs33, alpha_r\r
+ xvmaddadp vs2, vs34, alpha_r\r
+ xvmaddadp vs3, vs35, alpha_r\r
+ xvmaddadp vs4, vs36, alpha_r\r
+ xvmaddadp vs5, vs37, alpha_r\r
+ xvmaddadp vs6, vs38, alpha_r\r
+ xvmaddadp vs7, vs39, alpha_r\r
+#else\r
+ xvmuldp vs0, vs32, alpha_r\r
+ xvmuldp vs1, vs33, alpha_r\r
+ xvmuldp vs2, vs34, alpha_r\r
+ xvmuldp vs3, vs35, alpha_r\r
+ xvmuldp vs4, vs36, alpha_r\r
+ xvmuldp vs5, vs37, alpha_r\r
+ xvmuldp vs6, vs38, alpha_r\r
+ xvmuldp vs7, vs39, alpha_r\r
+#endif\r
+\r
+ stxvd2x vs0, 0, T1\r
+ stxvd2x vs1, o16, T1\r
+ stxvd2x vs2, o32, T1\r
+ stxvd2x vs3, o48, T1\r
+\r
+ stxvd2x vs4, 0, T2\r
+ stxvd2x vs5, o16, T2\r
+ stxvd2x vs6, o32, T2\r
+ stxvd2x vs7, o48, T2\r
+\r
+ add T1, T1, LDC\r
+ add T2, T2, LDC\r
+\r
+#ifndef TRMMKERNEL\r
+ lxvd2x vs8, 0, T1\r
+ lxvd2x vs9, o16, T1\r
+ lxvd2x vs10, o32, T1\r
+ lxvd2x vs11, o48, T1\r
+\r
+ lxvd2x vs12, 0, T2\r
+ lxvd2x vs13, o16, T2\r
+ lxvd2x vs14, o32, T2\r
+ lxvd2x vs15, o48, T2\r
+#endif\r
+\r
+#ifndef TRMMKERNEL\r
+ xvmaddadp vs8, vs40, alpha_r\r
+ xvmaddadp vs9, vs41, alpha_r\r
+ xvmaddadp vs10, vs42, alpha_r\r
+ xvmaddadp vs11, vs43, alpha_r\r
+ xvmaddadp vs12, vs44, alpha_r\r
+ xvmaddadp vs13, vs45, alpha_r\r
+ xvmaddadp vs14, vs46, alpha_r\r
+ xvmaddadp vs15, vs47, alpha_r\r
+#else\r
+ xvmuldp vs8, vs40, alpha_r\r
+ xvmuldp vs9, vs41, alpha_r\r
+ xvmuldp vs10, vs42, alpha_r\r
+ xvmuldp vs11, vs43, alpha_r\r
+ xvmuldp vs12, vs44, alpha_r\r
+ xvmuldp vs13, vs45, alpha_r\r
+ xvmuldp vs14, vs46, alpha_r\r
+ xvmuldp vs15, vs47, alpha_r\r
+#endif\r
+\r
+ stxvd2x vs8, 0, T1\r
+ stxvd2x vs9, o16, T1\r
+ stxvd2x vs10, o32, T1\r
+ stxvd2x vs11, o48, T1\r
+\r
+ stxvd2x vs12, 0, T2\r
+ stxvd2x vs13, o16, T2\r
+ stxvd2x vs14, o32, T2\r
+ stxvd2x vs15, o48, T2\r
+\r
+ addi CO, CO, 128\r
+\r
+.endm\r
+\r
+/*********************************************************************\r
+* Macros for N=2, M=8 *\r
+*********************************************************************/\r
+\r
+.macro LOAD2x8_1\r
+\r
+ lxvd2x vs0, 0, AO\r
+ lxvd2x vs1, o16, AO\r
+ lxvd2x vs2, o32, AO\r
+ lxvd2x vs3, o48, AO\r
+\r
+ lxvdsx vs24, 0, BO\r
+ lxvdsx vs25, o8, BO\r
+\r
+ addi AO, AO, 64\r
+ addi BO, BO, 16\r
+\r
+.endm\r
+\r
+.macro KERNEL2x8_I1\r
+\r
+ lxvd2x vs8, 0, AO\r
+ lxvd2x vs9, o16, AO\r
+ lxvd2x vs10, o32, AO\r
+ lxvd2x vs11, o48, AO\r
+\r
+ lxvdsx vs28, 0, BO\r
+ lxvdsx vs29, o8, BO\r
+\r
+ addi AO, AO, 64\r
+ addi BO, BO, 16\r
+\r
+\r
+ xvmuldp vs32, vs0, vs24\r
+ xvmuldp vs33, vs1, vs24\r
+ xvmuldp vs34, vs2, vs24\r
+ xvmuldp vs35, vs3, vs24\r
+\r
+ xvmuldp vs40, vs0, vs25\r
+ xvmuldp vs41, vs1, vs25\r
+ xvmuldp vs42, vs2, vs25\r
+ xvmuldp vs43, vs3, vs25\r
+\r
+.endm\r
+\r
+.macro KERNEL2x8_1\r
+\r
+ lxvd2x vs8, 0, AO\r
+ lxvd2x vs9, o16, AO\r
+ lxvd2x vs10, o32, AO\r
+ lxvd2x vs11, o48, AO\r
+\r
+ lxvdsx vs28, 0, BO\r
+ lxvdsx vs29, o8, BO\r
+\r
+ addi AO, AO, 64\r
+ addi BO, BO, 16\r
+\r
+\r
+ xvmaddadp vs32, vs0, vs24\r
+ xvmaddadp vs33, vs1, vs24\r
+ xvmaddadp vs34, vs2, vs24\r
+ xvmaddadp vs35, vs3, vs24\r
+\r
+ xvmaddadp vs40, vs0, vs25\r
+ xvmaddadp vs41, vs1, vs25\r
+ xvmaddadp vs42, vs2, vs25\r
+ xvmaddadp vs43, vs3, vs25\r
+\r
+.endm\r
+\r
+.macro KERNEL2x8_2\r
+\r
+ lxvd2x vs0, 0, AO\r
+ lxvd2x vs1, o16, AO\r
+ lxvd2x vs2, o32, AO\r
+ lxvd2x vs3, o48, AO\r
+\r
+ lxvdsx vs24, 0, BO\r
+ lxvdsx vs25, o8, BO\r
+\r
+ addi AO, AO, 64\r
+ addi BO, BO, 16\r
+\r
+\r
+ xvmaddadp vs32, vs8, vs28\r
+ xvmaddadp vs33, vs9, vs28\r
+ xvmaddadp vs34, vs10, vs28\r
+ xvmaddadp vs35, vs11, vs28\r
+\r
+ xvmaddadp vs40, vs8, vs29\r
+ xvmaddadp vs41, vs9, vs29\r
+ xvmaddadp vs42, vs10, vs29\r
+ xvmaddadp vs43, vs11, vs29\r
+\r
+.endm\r
+\r
+.macro KERNEL2x8_E2\r
+\r
+\r
+ xvmaddadp vs32, vs8, vs28\r
+ xvmaddadp vs33, vs9, vs28\r
+ xvmaddadp vs34, vs10, vs28\r
+ xvmaddadp vs35, vs11, vs28\r
+\r
+ xvmaddadp vs40, vs8, vs29\r
+ xvmaddadp vs41, vs9, vs29\r
+ xvmaddadp vs42, vs10, vs29\r
+ xvmaddadp vs43, vs11, vs29\r
+\r
+.endm\r
+\r
+.macro KERNEL2x8_SUBI1\r
+\r
+ lxvd2x vs0, 0, AO\r
+ lxvd2x vs1, o16, AO\r
+ lxvd2x vs2, o32, AO\r
+ lxvd2x vs3, o48, AO\r
+\r
+ lxvdsx vs24, 0, BO\r
+ lxvdsx vs25, o8, BO\r
+\r
+ addi AO, AO, 64\r
+ addi BO, BO, 16\r
+\r
+\r
+ xvmuldp vs32, vs0, vs24\r
+ xvmuldp vs33, vs1, vs24\r
+ xvmuldp vs34, vs2, vs24\r
+ xvmuldp vs35, vs3, vs24\r
+\r
+ xvmuldp vs40, vs0, vs25\r
+ xvmuldp vs41, vs1, vs25\r
+ xvmuldp vs42, vs2, vs25\r
+ xvmuldp vs43, vs3, vs25\r
+\r
+.endm\r
+\r
+.macro KERNEL2x8_SUB1\r
+\r
+ lxvd2x vs0, 0, AO\r
+ lxvd2x vs1, o16, AO\r
+ lxvd2x vs2, o32, AO\r
+ lxvd2x vs3, o48, AO\r
+\r
+ lxvdsx vs24, 0, BO\r
+ lxvdsx vs25, o8, BO\r
+\r
+ addi AO, AO, 64\r
+ addi BO, BO, 16\r
+\r
+\r
+ xvmaddadp vs32, vs0, vs24\r
+ xvmaddadp vs33, vs1, vs24\r
+ xvmaddadp vs34, vs2, vs24\r
+ xvmaddadp vs35, vs3, vs24\r
+\r
+ xvmaddadp vs40, vs0, vs25\r
+ xvmaddadp vs41, vs1, vs25\r
+ xvmaddadp vs42, vs2, vs25\r
+ xvmaddadp vs43, vs3, vs25\r
+\r
+.endm\r
+\r
+.macro SAVE2x8\r
+\r
+ mr T1, CO\r
+\r
+#ifndef TRMMKERNEL\r
+ lxvd2x vs0, 0, T1\r
+ lxvd2x vs1, o16, T1\r
+ lxvd2x vs2, o32, T1\r
+ lxvd2x vs3, o48, T1\r
+#endif\r
+\r
+#ifndef TRMMKERNEL\r
+ xvmaddadp vs0, vs32, alpha_r\r
+ xvmaddadp vs1, vs33, alpha_r\r
+ xvmaddadp vs2, vs34, alpha_r\r
+ xvmaddadp vs3, vs35, alpha_r\r
+#else\r
+ xvmuldp vs0, vs32, alpha_r\r
+ xvmuldp vs1, vs33, alpha_r\r
+ xvmuldp vs2, vs34, alpha_r\r
+ xvmuldp vs3, vs35, alpha_r\r
+#endif\r
+\r
+ stxvd2x vs0, 0, T1\r
+ stxvd2x vs1, o16, T1\r
+ stxvd2x vs2, o32, T1\r
+ stxvd2x vs3, o48, T1\r
+\r
+ add T1, T1, LDC\r
+\r
+#ifndef TRMMKERNEL\r
+ lxvd2x vs8, 0, T1\r
+ lxvd2x vs9, o16, T1\r
+ lxvd2x vs10, o32, T1\r
+ lxvd2x vs11, o48, T1\r
+#endif\r
+\r
+#ifndef TRMMKERNEL\r
+ xvmaddadp vs8, vs40, alpha_r\r
+ xvmaddadp vs9, vs41, alpha_r\r
+ xvmaddadp vs10, vs42, alpha_r\r
+ xvmaddadp vs11, vs43, alpha_r\r
+#else\r
+ xvmuldp vs8, vs40, alpha_r\r
+ xvmuldp vs9, vs41, alpha_r\r
+ xvmuldp vs10, vs42, alpha_r\r
+ xvmuldp vs11, vs43, alpha_r\r
+#endif\r
+\r
+ stxvd2x vs8, 0, T1\r
+ stxvd2x vs9, o16, T1\r
+ stxvd2x vs10, o32, T1\r
+ stxvd2x vs11, o48, T1\r
+\r
+ addi CO, CO, 64\r
+\r
+.endm\r
+\r
+/*********************************************************************\r
+* Macros for N=2, M=4 *\r
+*********************************************************************/\r
+\r
+.macro LOAD2x4_1\r
+\r
+ lxvd2x vs0, 0, AO\r
+ lxvd2x vs1, o16, AO\r
+\r
+ lxvdsx vs24, 0, BO\r
+ lxvdsx vs25, o8, BO\r
+\r
+ addi AO, AO, 32\r
+ addi BO, BO, 16\r
+\r
+.endm\r
+\r
+.macro KERNEL2x4_I1\r
+\r
+ lxvd2x vs8, 0, AO\r
+ lxvd2x vs9, o16, AO\r
+\r
+ lxvdsx vs28, 0, BO\r
+ lxvdsx vs29, o8, BO\r
+\r
+ addi AO, AO, 32\r
+ addi BO, BO, 16\r
+\r
+\r
+ xvmuldp vs32, vs0, vs24\r
+ xvmuldp vs33, vs1, vs24\r
+\r
+ xvmuldp vs40, vs0, vs25\r
+ xvmuldp vs41, vs1, vs25\r
+\r
+.endm\r
+\r
+.macro KERNEL2x4_1\r
+\r
+ lxvd2x vs8, 0, AO\r
+ lxvd2x vs9, o16, AO\r
+\r
+ lxvdsx vs28, 0, BO\r
+ lxvdsx vs29, o8, BO\r
+\r
+ addi AO, AO, 32\r
+ addi BO, BO, 16\r
+\r
+\r
+ xvmaddadp vs32, vs0, vs24\r
+ xvmaddadp vs33, vs1, vs24\r
+\r
+ xvmaddadp vs40, vs0, vs25\r
+ xvmaddadp vs41, vs1, vs25\r
+\r
+.endm\r
+\r
+.macro KERNEL2x4_2\r
+\r
+ lxvd2x vs0, 0, AO\r
+ lxvd2x vs1, o16, AO\r
+\r
+ lxvdsx vs24, 0, BO\r
+ lxvdsx vs25, o8, BO\r
+\r
+ addi AO, AO, 32\r
+ addi BO, BO, 16\r
+\r
+\r
+ xvmaddadp vs32, vs8, vs28\r
+ xvmaddadp vs33, vs9, vs28\r
+\r
+ xvmaddadp vs40, vs8, vs29\r
+ xvmaddadp vs41, vs9, vs29\r
+\r
+.endm\r
+\r
+.macro KERNEL2x4_E2\r
+\r
+\r
+ xvmaddadp vs32, vs8, vs28\r
+ xvmaddadp vs33, vs9, vs28\r
+\r
+ xvmaddadp vs40, vs8, vs29\r
+ xvmaddadp vs41, vs9, vs29\r
+\r
+.endm\r
+\r
+.macro KERNEL2x4_SUBI1\r
+\r
+ lxvd2x vs0, 0, AO\r
+ lxvd2x vs1, o16, AO\r
+\r
+ lxvdsx vs24, 0, BO\r
+ lxvdsx vs25, o8, BO\r
+\r
+ addi AO, AO, 32\r
+ addi BO, BO, 16\r
+\r
+\r
+ xvmuldp vs32, vs0, vs24\r
+ xvmuldp vs33, vs1, vs24\r
+\r
+ xvmuldp vs40, vs0, vs25\r
+ xvmuldp vs41, vs1, vs25\r
+\r
+.endm\r
+\r
+.macro KERNEL2x4_SUB1\r
+\r
+ lxvd2x vs0, 0, AO\r
+ lxvd2x vs1, o16, AO\r
+\r
+ lxvdsx vs24, 0, BO\r
+ lxvdsx vs25, o8, BO\r
+\r
+ addi AO, AO, 32\r
+ addi BO, BO, 16\r
+\r
+\r
+ xvmaddadp vs32, vs0, vs24\r
+ xvmaddadp vs33, vs1, vs24\r
+\r
+ xvmaddadp vs40, vs0, vs25\r
+ xvmaddadp vs41, vs1, vs25\r
+\r
+.endm\r
+\r
+.macro SAVE2x4\r
+\r
+ mr T1, CO\r
+\r
+#ifndef TRMMKERNEL\r
+ lxvd2x vs0, 0, T1\r
+ lxvd2x vs1, o16, T1\r
+#endif\r
+\r
+#ifndef TRMMKERNEL\r
+ xvmaddadp vs0, vs32, alpha_r\r
+ xvmaddadp vs1, vs33, alpha_r\r
+#else\r
+ xvmuldp vs0, vs32, alpha_r\r
+ xvmuldp vs1, vs33, alpha_r\r
+#endif\r
+\r
+ stxvd2x vs0, 0, T1\r
+ stxvd2x vs1, o16, T1\r
+\r
+ add T1, T1, LDC\r
+\r
+#ifndef TRMMKERNEL\r
+ lxvd2x vs8, 0, T1\r
+ lxvd2x vs9, o16, T1\r
+#endif\r
+\r
+#ifndef TRMMKERNEL\r
+ xvmaddadp vs8, vs40, alpha_r\r
+ xvmaddadp vs9, vs41, alpha_r\r
+#else\r
+ xvmuldp vs8, vs40, alpha_r\r
+ xvmuldp vs9, vs41, alpha_r\r
+#endif\r
+\r
+ stxvd2x vs8, 0, T1\r
+ stxvd2x vs9, o16, T1\r
+\r
+ addi CO, CO, 32\r
+\r
+.endm\r
+\r
+/*********************************************************************\r
+* Macros for N=2, M=2 *\r
+*********************************************************************/\r
+\r
+.macro LOAD2x2_1\r
+\r
+ lxvd2x vs0, 0, AO\r
+\r
+ lxvdsx vs24, 0, BO\r
+ lxvdsx vs25, o8, BO\r
+\r
+ addi AO, AO, 16\r
+ addi BO, BO, 16\r
+\r
+.endm\r
+\r
+.macro KERNEL2x2_I1\r
+\r
+ lxvd2x vs8, 0, AO\r
+\r
+ lxvdsx vs28, 0, BO\r
+ lxvdsx vs29, o8, BO\r
+\r
+ addi AO, AO, 16\r
+ addi BO, BO, 16\r
+\r
+\r
+ xvmuldp vs32, vs0, vs24\r
+\r
+ xvmuldp vs40, vs0, vs25\r
+\r
+.endm\r
+\r
+.macro KERNEL2x2_1\r
+\r
+ lxvd2x vs8, 0, AO\r
+\r
+ lxvdsx vs28, 0, BO\r
+ lxvdsx vs29, o8, BO\r
+\r
+ addi AO, AO, 16\r
+ addi BO, BO, 16\r
+\r
+\r
+ xvmaddadp vs32, vs0, vs24\r
+\r
+ xvmaddadp vs40, vs0, vs25\r
+\r
+.endm\r
+\r
+.macro KERNEL2x2_2\r
+\r
+ lxvd2x vs0, 0, AO\r
+\r
+ lxvdsx vs24, 0, BO\r
+ lxvdsx vs25, o8, BO\r
+\r
+ addi AO, AO, 16\r
+ addi BO, BO, 16\r
+\r
+\r
+ xvmaddadp vs32, vs8, vs28\r
+\r
+ xvmaddadp vs40, vs8, vs29\r
+\r
+.endm\r
+\r
+.macro KERNEL2x2_E2\r
+\r
+\r
+ xvmaddadp vs32, vs8, vs28\r
+\r
+ xvmaddadp vs40, vs8, vs29\r
+\r
+.endm\r
+\r
+.macro KERNEL2x2_SUBI1\r
+\r
+ lxvd2x vs0, 0, AO\r
+\r
+ lxvdsx vs24, 0, BO\r
+ lxvdsx vs25, o8, BO\r
+\r
+ addi AO, AO, 16\r
+ addi BO, BO, 16\r
+\r
+\r
+ xvmuldp vs32, vs0, vs24\r
+\r
+ xvmuldp vs40, vs0, vs25\r
+\r
+.endm\r
+\r
+.macro KERNEL2x2_SUB1\r
+\r
+ lxvd2x vs0, 0, AO\r
+\r
+ lxvdsx vs24, 0, BO\r
+ lxvdsx vs25, o8, BO\r
+\r
+ addi AO, AO, 16\r
+ addi BO, BO, 16\r
+\r
+\r
+ xvmaddadp vs32, vs0, vs24\r
+\r
+ xvmaddadp vs40, vs0, vs25\r
+\r
+.endm\r
+\r
+.macro SAVE2x2\r
+\r
+ mr T1, CO\r
+\r
+#ifndef TRMMKERNEL\r
+ lxvd2x vs0, 0, T1\r
+#endif\r
+\r
+#ifndef TRMMKERNEL\r
+ xvmaddadp vs0, vs32, alpha_r\r
+#else\r
+ xvmuldp vs0, vs32, alpha_r\r
+#endif\r
+\r
+ stxvd2x vs0, 0, T1\r
+\r
+ add T1, T1, LDC\r
+\r
+#ifndef TRMMKERNEL\r
+ lxvd2x vs8, 0, T1\r
+#endif\r
+\r
+#ifndef TRMMKERNEL\r
+ xvmaddadp vs8, vs40, alpha_r\r
+#else\r
+ xvmuldp vs8, vs40, alpha_r\r
+#endif\r
+\r
+ stxvd2x vs8, 0, T1\r
+\r
+ addi CO, CO, 16\r
+\r
+.endm\r
+\r
+/*********************************************************************\r
+* Macros for N=2, M=1 *\r
+*********************************************************************/\r
+\r
+.macro LOAD2x1_1\r
+\r
+ lxsdx vs0, 0, AO\r
+\r
+ lxsdx vs24, 0, BO\r
+ lxsdx vs25, o8, BO\r
+\r
+ addi AO, AO, 8\r
+ addi BO, BO, 16\r
+\r
+.endm\r
+\r
+.macro KERNEL2x1_I1\r
+\r
+ lxsdx vs8, 0, AO\r
+\r
+ lxsdx vs28, 0, BO\r
+ lxsdx vs29, o8, BO\r
+\r
+ addi AO, AO, 8\r
+ addi BO, BO, 16\r
+\r
+\r
+ xsmuldp vs32, vs0, vs24\r
+\r
+ xsmuldp vs40, vs0, vs25\r
+\r
+.endm\r
+\r
+.macro KERNEL2x1_1\r
+\r
+ lxsdx vs8, 0, AO\r
+\r
+ lxsdx vs28, 0, BO\r
+ lxsdx vs29, o8, BO\r
+\r
+ addi AO, AO, 8\r
+ addi BO, BO, 16\r
+\r
+\r
+ xsmaddadp vs32, vs0, vs24\r
+\r
+ xsmaddadp vs40, vs0, vs25\r
+\r
+.endm\r
+\r
+.macro KERNEL2x1_2\r
+\r
+ lxsdx vs0, 0, AO\r
+\r
+ lxsdx vs24, 0, BO\r
+ lxsdx vs25, o8, BO\r
+\r
+ addi AO, AO, 8\r
+ addi BO, BO, 16\r
+\r
+\r
+ xsmaddadp vs32, vs8, vs28\r
+\r
+ xsmaddadp vs40, vs8, vs29\r
+\r
+.endm\r
+\r
+.macro KERNEL2x1_E2\r
+\r
+\r
+ xsmaddadp vs32, vs8, vs28\r
+\r
+ xsmaddadp vs40, vs8, vs29\r
+\r
+.endm\r
+\r
+.macro KERNEL2x1_SUBI1\r
+\r
+ lxsdx vs0, 0, AO\r
+\r
+ lxsdx vs24, 0, BO\r
+ lxsdx vs25, o8, BO\r
+\r
+ addi AO, AO, 8\r
+ addi BO, BO, 16\r
+\r
+\r
+ xsmuldp vs32, vs0, vs24\r
+\r
+ xsmuldp vs40, vs0, vs25\r
+\r
+.endm\r
+\r
+.macro KERNEL2x1_SUB1\r
+\r
+ lxsdx vs0, 0, AO\r
+\r
+ lxsdx vs24, 0, BO\r
+ lxsdx vs25, o8, BO\r
+\r
+ addi AO, AO, 8\r
+ addi BO, BO, 16\r
+\r
+\r
+ xsmaddadp vs32, vs0, vs24\r
+\r
+ xsmaddadp vs40, vs0, vs25\r
+\r
+.endm\r
+\r
+.macro SAVE2x1\r
+\r
+ mr T1, CO\r
+\r
+#ifndef TRMMKERNEL\r
+ lxsdx vs0, 0, T1\r
+#endif\r
+\r
+#ifndef TRMMKERNEL\r
+ xsmaddadp vs0, vs32, alpha_r\r
+#else\r
+ xsmuldp vs0, vs32, alpha_r\r
+#endif\r
+\r
+ stxsdx vs0, 0, T1\r
+\r
+ add T1, T1, LDC\r
+\r
+#ifndef TRMMKERNEL\r
+ lxsdx vs8, 0, T1\r
+#endif\r
+\r
+#ifndef TRMMKERNEL\r
+ xsmaddadp vs8, vs40, alpha_r\r
+#else\r
+ xsmuldp vs8, vs40, alpha_r\r
+#endif\r
+\r
+ stxsdx vs8, 0, T1\r
+\r
+ addi CO, CO, 8\r
+\r
+.endm\r
+\r
+/*********************************************************************\r
+* Macros for N=1, M=16 *\r
+*********************************************************************/\r
+\r
+.macro LOAD1x16_1\r
+\r
+ lxvd2x vs0, 0, AO\r
+ lxvd2x vs1, o16, AO\r
+ lxvd2x vs2, o32, AO\r
+ lxvd2x vs3, o48, AO\r
+\r
+ lxvdsx vs24, 0, BO\r
+\r
+ addi AO, AO, 64\r
+ addi BO, BO, 8\r
+\r
+ lxvd2x vs4, 0, AO\r
+ lxvd2x vs5, o16, AO\r
+ lxvd2x vs6, o32, AO\r
+ lxvd2x vs7, o48, AO\r
+\r
+ addi AO, AO, 64\r
+\r
+.endm\r
+\r
+.macro KERNEL1x16_I1\r
+\r
+ lxvd2x vs8, 0, AO\r
+ lxvd2x vs9, o16, AO\r
+ lxvd2x vs10, o32, AO\r
+ lxvd2x vs11, o48, AO\r
+\r
+ lxvdsx vs28, 0, BO\r
+\r
+ addi AO, AO, 64\r
+ addi BO, BO, 8\r
+\r
+ lxvd2x vs12, 0, AO\r
+ lxvd2x vs13, o16, AO\r
+ lxvd2x vs14, o32, AO\r
+ lxvd2x vs15, o48, AO\r
+\r
+ addi AO, AO, 64\r
+\r
+\r
+ xvmuldp vs32, vs0, vs24\r
+ xvmuldp vs33, vs1, vs24\r
+ xvmuldp vs34, vs2, vs24\r
+ xvmuldp vs35, vs3, vs24\r
+ xvmuldp vs36, vs4, vs24\r
+ xvmuldp vs37, vs5, vs24\r
+ xvmuldp vs38, vs6, vs24\r
+ xvmuldp vs39, vs7, vs24\r
+\r
+.endm\r
+\r
+.macro KERNEL1x16_1\r
+\r
+ lxvd2x vs8, 0, AO\r
+ lxvd2x vs9, o16, AO\r
+ lxvd2x vs10, o32, AO\r
+ lxvd2x vs11, o48, AO\r
+\r
+ lxvdsx vs28, 0, BO\r
+\r
+ addi AO, AO, 64\r
+ addi BO, BO, 8\r
+\r
+ lxvd2x vs12, 0, AO\r
+ lxvd2x vs13, o16, AO\r
+ lxvd2x vs14, o32, AO\r
+ lxvd2x vs15, o48, AO\r
+\r
+ addi AO, AO, 64\r
+\r
+\r
+ xvmaddadp vs32, vs0, vs24\r
+ xvmaddadp vs33, vs1, vs24\r
+ xvmaddadp vs34, vs2, vs24\r
+ xvmaddadp vs35, vs3, vs24\r
+ xvmaddadp vs36, vs4, vs24\r
+ xvmaddadp vs37, vs5, vs24\r
+ xvmaddadp vs38, vs6, vs24\r
+ xvmaddadp vs39, vs7, vs24\r
+\r
+.endm\r
+\r
+.macro KERNEL1x16_2\r
+\r
+ lxvd2x vs0, 0, AO\r
+ lxvd2x vs1, o16, AO\r
+ lxvd2x vs2, o32, AO\r
+ lxvd2x vs3, o48, AO\r
+\r
+ lxvdsx vs24, 0, BO\r
+\r
+ addi AO, AO, 64\r
+ addi BO, BO, 8\r
+\r
+ lxvd2x vs4, 0, AO\r
+ lxvd2x vs5, o16, AO\r
+ lxvd2x vs6, o32, AO\r
+ lxvd2x vs7, o48, AO\r
+\r
+ addi AO, AO, 64\r
+\r
+\r
+ xvmaddadp vs32, vs8, vs28\r
+ xvmaddadp vs33, vs9, vs28\r
+ xvmaddadp vs34, vs10, vs28\r
+ xvmaddadp vs35, vs11, vs28\r
+ xvmaddadp vs36, vs12, vs28\r
+ xvmaddadp vs37, vs13, vs28\r
+ xvmaddadp vs38, vs14, vs28\r
+ xvmaddadp vs39, vs15, vs28\r
+\r
+.endm\r
+\r
+.macro KERNEL1x16_E2\r
+\r
+\r
+ xvmaddadp vs32, vs8, vs28\r
+ xvmaddadp vs33, vs9, vs28\r
+ xvmaddadp vs34, vs10, vs28\r
+ xvmaddadp vs35, vs11, vs28\r
+ xvmaddadp vs36, vs12, vs28\r
+ xvmaddadp vs37, vs13, vs28\r
+ xvmaddadp vs38, vs14, vs28\r
+ xvmaddadp vs39, vs15, vs28\r
+\r
+.endm\r
+\r
+.macro KERNEL1x16_SUBI1\r
+\r
+ lxvd2x vs0, 0, AO\r
+ lxvd2x vs1, o16, AO\r
+ lxvd2x vs2, o32, AO\r
+ lxvd2x vs3, o48, AO\r
+\r
+ lxvdsx vs24, 0, BO\r
+\r
+ addi AO, AO, 64\r
+ addi BO, BO, 8\r
+\r
+ lxvd2x vs4, 0, AO\r
+ lxvd2x vs5, o16, AO\r
+ lxvd2x vs6, o32, AO\r
+ lxvd2x vs7, o48, AO\r
+\r
+ addi AO, AO, 64\r
+\r
+\r
+ xvmuldp vs32, vs0, vs24\r
+ xvmuldp vs33, vs1, vs24\r
+ xvmuldp vs34, vs2, vs24\r
+ xvmuldp vs35, vs3, vs24\r
+ xvmuldp vs36, vs4, vs24\r
+ xvmuldp vs37, vs5, vs24\r
+ xvmuldp vs38, vs6, vs24\r
+ xvmuldp vs39, vs7, vs24\r
+\r
+.endm\r
+\r
+.macro KERNEL1x16_SUB1\r
+\r
+ lxvd2x vs0, 0, AO\r
+ lxvd2x vs1, o16, AO\r
+ lxvd2x vs2, o32, AO\r
+ lxvd2x vs3, o48, AO\r
+\r
+ lxvdsx vs24, 0, BO\r
+\r
+ addi AO, AO, 64\r
+ addi BO, BO, 8\r
+\r
+ lxvd2x vs4, 0, AO\r
+ lxvd2x vs5, o16, AO\r
+ lxvd2x vs6, o32, AO\r
+ lxvd2x vs7, o48, AO\r
+\r
+ addi AO, AO, 64\r
+\r
+\r
+ xvmaddadp vs32, vs0, vs24\r
+ xvmaddadp vs33, vs1, vs24\r
+ xvmaddadp vs34, vs2, vs24\r
+ xvmaddadp vs35, vs3, vs24\r
+ xvmaddadp vs36, vs4, vs24\r
+ xvmaddadp vs37, vs5, vs24\r
+ xvmaddadp vs38, vs6, vs24\r
+ xvmaddadp vs39, vs7, vs24\r
+\r
+.endm\r
+\r
+.macro SAVE1x16\r
+\r
+ mr T1, CO\r
+ addi T2, T1, 64\r
+\r
+#ifndef TRMMKERNEL\r
+ lxvd2x vs0, 0, T1\r
+ lxvd2x vs1, o16, T1\r
+ lxvd2x vs2, o32, T1\r
+ lxvd2x vs3, o48, T1\r
+\r
+ lxvd2x vs4, 0, T2\r
+ lxvd2x vs5, o16, T2\r
+ lxvd2x vs6, o32, T2\r
+ lxvd2x vs7, o48, T2\r
+#endif\r
+\r
+#ifndef TRMMKERNEL\r
+ xvmaddadp vs0, vs32, alpha_r\r
+ xvmaddadp vs1, vs33, alpha_r\r
+ xvmaddadp vs2, vs34, alpha_r\r
+ xvmaddadp vs3, vs35, alpha_r\r
+ xvmaddadp vs4, vs36, alpha_r\r
+ xvmaddadp vs5, vs37, alpha_r\r
+ xvmaddadp vs6, vs38, alpha_r\r
+ xvmaddadp vs7, vs39, alpha_r\r
+#else\r
+ xvmuldp vs0, vs32, alpha_r\r
+ xvmuldp vs1, vs33, alpha_r\r
+ xvmuldp vs2, vs34, alpha_r\r
+ xvmuldp vs3, vs35, alpha_r\r
+ xvmuldp vs4, vs36, alpha_r\r
+ xvmuldp vs5, vs37, alpha_r\r
+ xvmuldp vs6, vs38, alpha_r\r
+ xvmuldp vs7, vs39, alpha_r\r
+#endif\r
+\r
+ stxvd2x vs0, 0, T1\r
+ stxvd2x vs1, o16, T1\r
+ stxvd2x vs2, o32, T1\r
+ stxvd2x vs3, o48, T1\r
+\r
+ stxvd2x vs4, 0, T2\r
+ stxvd2x vs5, o16, T2\r
+ stxvd2x vs6, o32, T2\r
+ stxvd2x vs7, o48, T2\r
+\r
+ addi CO, CO, 128\r
+\r
+.endm\r
+\r
+/*********************************************************************\r
+* Macros for N=1, M=8 *\r
+*********************************************************************/\r
+\r
+.macro LOAD1x8_1\r
+\r
+ lxvd2x vs0, 0, AO\r
+ lxvd2x vs1, o16, AO\r
+ lxvd2x vs2, o32, AO\r
+ lxvd2x vs3, o48, AO\r
+\r
+ lxvdsx vs24, 0, BO\r
+\r
+ addi AO, AO, 64\r
+ addi BO, BO, 8\r
+\r
+.endm\r
+\r
+.macro KERNEL1x8_I1\r
+\r
+ lxvd2x vs8, 0, AO\r
+ lxvd2x vs9, o16, AO\r
+ lxvd2x vs10, o32, AO\r
+ lxvd2x vs11, o48, AO\r
+\r
+ lxvdsx vs28, 0, BO\r
+\r
+ addi AO, AO, 64\r
+ addi BO, BO, 8\r
+\r
+\r
+ xvmuldp vs32, vs0, vs24\r
+ xvmuldp vs33, vs1, vs24\r
+ xvmuldp vs34, vs2, vs24\r
+ xvmuldp vs35, vs3, vs24\r
+\r
+.endm\r
+\r
+.macro KERNEL1x8_1\r
+\r
+ lxvd2x vs8, 0, AO\r
+ lxvd2x vs9, o16, AO\r
+ lxvd2x vs10, o32, AO\r
+ lxvd2x vs11, o48, AO\r
+\r
+ lxvdsx vs28, 0, BO\r
+\r
+ addi AO, AO, 64\r
+ addi BO, BO, 8\r
+\r
+\r
+ xvmaddadp vs32, vs0, vs24\r
+ xvmaddadp vs33, vs1, vs24\r
+ xvmaddadp vs34, vs2, vs24\r
+ xvmaddadp vs35, vs3, vs24\r
+\r
+.endm\r
+\r
+.macro KERNEL1x8_2\r
+\r
+ lxvd2x vs0, 0, AO\r
+ lxvd2x vs1, o16, AO\r
+ lxvd2x vs2, o32, AO\r
+ lxvd2x vs3, o48, AO\r
+\r
+ lxvdsx vs24, 0, BO\r
+\r
+ addi AO, AO, 64\r
+ addi BO, BO, 8\r
+\r
+\r
+ xvmaddadp vs32, vs8, vs28\r
+ xvmaddadp vs33, vs9, vs28\r
+ xvmaddadp vs34, vs10, vs28\r
+ xvmaddadp vs35, vs11, vs28\r
+\r
+.endm\r
+\r
+.macro KERNEL1x8_E2\r
+\r
+\r
+ xvmaddadp vs32, vs8, vs28\r
+ xvmaddadp vs33, vs9, vs28\r
+ xvmaddadp vs34, vs10, vs28\r
+ xvmaddadp vs35, vs11, vs28\r
+\r
+.endm\r
+\r
+.macro KERNEL1x8_SUBI1\r
+\r
+ lxvd2x vs0, 0, AO\r
+ lxvd2x vs1, o16, AO\r
+ lxvd2x vs2, o32, AO\r
+ lxvd2x vs3, o48, AO\r
+\r
+ lxvdsx vs24, 0, BO\r
+\r
+ addi AO, AO, 64\r
+ addi BO, BO, 8\r
+\r
+\r
+ xvmuldp vs32, vs0, vs24\r
+ xvmuldp vs33, vs1, vs24\r
+ xvmuldp vs34, vs2, vs24\r
+ xvmuldp vs35, vs3, vs24\r
+\r
+.endm\r
+\r
+.macro KERNEL1x8_SUB1\r
+\r
+ lxvd2x vs0, 0, AO\r
+ lxvd2x vs1, o16, AO\r
+ lxvd2x vs2, o32, AO\r
+ lxvd2x vs3, o48, AO\r
+\r
+ lxvdsx vs24, 0, BO\r
+\r
+ addi AO, AO, 64\r
+ addi BO, BO, 8\r
+\r
+\r
+ xvmaddadp vs32, vs0, vs24\r
+ xvmaddadp vs33, vs1, vs24\r
+ xvmaddadp vs34, vs2, vs24\r
+ xvmaddadp vs35, vs3, vs24\r
+\r
+.endm\r
+\r
+.macro SAVE1x8\r
+\r
+ mr T1, CO\r
+\r
+#ifndef TRMMKERNEL\r
+ lxvd2x vs0, 0, T1\r
+ lxvd2x vs1, o16, T1\r
+ lxvd2x vs2, o32, T1\r
+ lxvd2x vs3, o48, T1\r
+#endif\r
+\r
+#ifndef TRMMKERNEL\r
+ xvmaddadp vs0, vs32, alpha_r\r
+ xvmaddadp vs1, vs33, alpha_r\r
+ xvmaddadp vs2, vs34, alpha_r\r
+ xvmaddadp vs3, vs35, alpha_r\r
+#else\r
+ xvmuldp vs0, vs32, alpha_r\r
+ xvmuldp vs1, vs33, alpha_r\r
+ xvmuldp vs2, vs34, alpha_r\r
+ xvmuldp vs3, vs35, alpha_r\r
+#endif\r
+\r
+ stxvd2x vs0, 0, T1\r
+ stxvd2x vs1, o16, T1\r
+ stxvd2x vs2, o32, T1\r
+ stxvd2x vs3, o48, T1\r
+\r
+ addi CO, CO, 64\r
+\r
+.endm\r
+\r
+/*********************************************************************\r
+* Macros for N=1, M=4 *\r
+*********************************************************************/\r
+\r
+.macro LOAD1x4_1\r
+\r
+ lxvd2x vs0, 0, AO\r
+ lxvd2x vs1, o16, AO\r
+\r
+ lxvdsx vs24, 0, BO\r
+\r
+ addi AO, AO, 32\r
+ addi BO, BO, 8\r
+\r
+.endm\r
+\r
+.macro KERNEL1x4_I1\r
+\r
+ lxvd2x vs8, 0, AO\r
+ lxvd2x vs9, o16, AO\r
+\r
+ lxvdsx vs28, 0, BO\r
+\r
+ addi AO, AO, 32\r
+ addi BO, BO, 8\r
+\r
+\r
+ xvmuldp vs32, vs0, vs24\r
+ xvmuldp vs33, vs1, vs24\r
+\r
+.endm\r
+\r
+.macro KERNEL1x4_1\r
+\r
+ lxvd2x vs8, 0, AO\r
+ lxvd2x vs9, o16, AO\r
+\r
+ lxvdsx vs28, 0, BO\r
+\r
+ addi AO, AO, 32\r
+ addi BO, BO, 8\r
+\r
+\r
+ xvmaddadp vs32, vs0, vs24\r
+ xvmaddadp vs33, vs1, vs24\r
+\r
+.endm\r
+\r
+.macro KERNEL1x4_2\r
+\r
+ lxvd2x vs0, 0, AO\r
+ lxvd2x vs1, o16, AO\r
+\r
+ lxvdsx vs24, 0, BO\r
+\r
+ addi AO, AO, 32\r
+ addi BO, BO, 8\r
+\r
+\r
+ xvmaddadp vs32, vs8, vs28\r
+ xvmaddadp vs33, vs9, vs28\r
+\r
+.endm\r
+\r
+.macro KERNEL1x4_E2\r
+\r
+\r
+ xvmaddadp vs32, vs8, vs28\r
+ xvmaddadp vs33, vs9, vs28\r
+\r
+.endm\r
+\r
+.macro KERNEL1x4_SUBI1\r
+\r
+ lxvd2x vs0, 0, AO\r
+ lxvd2x vs1, o16, AO\r
+\r
+ lxvdsx vs24, 0, BO\r
+\r
+ addi AO, AO, 32\r
+ addi BO, BO, 8\r
+\r
+\r
+ xvmuldp vs32, vs0, vs24\r
+ xvmuldp vs33, vs1, vs24\r
+\r
+.endm\r
+\r
+.macro KERNEL1x4_SUB1\r
+\r
+ lxvd2x vs0, 0, AO\r
+ lxvd2x vs1, o16, AO\r
+\r
+ lxvdsx vs24, 0, BO\r
+\r
+ addi AO, AO, 32\r
+ addi BO, BO, 8\r
+\r
+\r
+ xvmaddadp vs32, vs0, vs24\r
+ xvmaddadp vs33, vs1, vs24\r
+\r
+.endm\r
+\r
+.macro SAVE1x4\r
+\r
+ mr T1, CO\r
+\r
+#ifndef TRMMKERNEL\r
+ lxvd2x vs0, 0, T1\r
+ lxvd2x vs1, o16, T1\r
+#endif\r
+\r
+#ifndef TRMMKERNEL\r
+ xvmaddadp vs0, vs32, alpha_r\r
+ xvmaddadp vs1, vs33, alpha_r\r
+#else\r
+ xvmuldp vs0, vs32, alpha_r\r
+ xvmuldp vs1, vs33, alpha_r\r
+#endif\r
+\r
+ stxvd2x vs0, 0, T1\r
+ stxvd2x vs1, o16, T1\r
+\r
+ addi CO, CO, 32\r
+\r
+.endm\r
+\r
+/*********************************************************************\r
+* Macros for N=1, M=2 *\r
+*********************************************************************/\r
+\r
+.macro LOAD1x2_1\r
+\r
+ lxvd2x vs0, 0, AO\r
+\r
+ lxvdsx vs24, 0, BO\r
+\r
+ addi AO, AO, 16\r
+ addi BO, BO, 8\r
+\r
+.endm\r
+\r
+.macro KERNEL1x2_I1\r
+\r
+ lxvd2x vs8, 0, AO\r
+\r
+ lxvdsx vs28, 0, BO\r
+\r
+ addi AO, AO, 16\r
+ addi BO, BO, 8\r
+\r
+\r
+ xvmuldp vs32, vs0, vs24\r
+\r
+.endm\r
+\r
+.macro KERNEL1x2_1\r
+\r
+ lxvd2x vs8, 0, AO\r
+\r
+ lxvdsx vs28, 0, BO\r
+\r
+ addi AO, AO, 16\r
+ addi BO, BO, 8\r
+\r
+\r
+ xvmaddadp vs32, vs0, vs24\r
+\r
+.endm\r
+\r
+.macro KERNEL1x2_2\r
+\r
+ lxvd2x vs0, 0, AO\r
+\r
+ lxvdsx vs24, 0, BO\r
+\r
+ addi AO, AO, 16\r
+ addi BO, BO, 8\r
+\r
+\r
+ xvmaddadp vs32, vs8, vs28\r
+\r
+.endm\r
+\r
+.macro KERNEL1x2_E2\r
+\r
+\r
+ xvmaddadp vs32, vs8, vs28\r
+\r
+.endm\r
+\r
+.macro KERNEL1x2_SUBI1\r
+\r
+ lxvd2x vs0, 0, AO\r
+\r
+ lxvdsx vs24, 0, BO\r
+\r
+ addi AO, AO, 16\r
+ addi BO, BO, 8\r
+\r
+\r
+ xvmuldp vs32, vs0, vs24\r
+\r
+.endm\r
+\r
+.macro KERNEL1x2_SUB1\r
+\r
+ lxvd2x vs0, 0, AO\r
+\r
+ lxvdsx vs24, 0, BO\r
+\r
+ addi AO, AO, 16\r
+ addi BO, BO, 8\r
+\r
+\r
+ xvmaddadp vs32, vs0, vs24\r
+\r
+.endm\r
+\r
+.macro SAVE1x2\r
+\r
+ mr T1, CO\r
+\r
+#ifndef TRMMKERNEL\r
+ lxvd2x vs0, 0, T1\r
+#endif\r
+\r
+#ifndef TRMMKERNEL\r
+ xvmaddadp vs0, vs32, alpha_r\r
+#else\r
+ xvmuldp vs0, vs32, alpha_r\r
+#endif\r
+\r
+ stxvd2x vs0, 0, T1\r
+\r
+ addi CO, CO, 16\r
+\r
+.endm\r
+\r
+/*********************************************************************\r
+* Macros for N=1, M=1 *\r
+*********************************************************************/\r
+\r
+.macro LOAD1x1_1\r
+\r
+ lxsdx vs0, 0, AO\r
+\r
+ lxsdx vs24, 0, BO\r
+\r
+ addi AO, AO, 8\r
+ addi BO, BO, 8\r
+\r
+.endm\r
+\r
+.macro KERNEL1x1_I1\r
+\r
+ lxsdx vs8, 0, AO\r
+\r
+ lxsdx vs28, 0, BO\r
+\r
+ addi AO, AO, 8\r
+ addi BO, BO, 8\r
+\r
+\r
+ xsmuldp vs32, vs0, vs24\r
+\r
+.endm\r
+\r
+.macro KERNEL1x1_1\r
+\r
+ lxsdx vs8, 0, AO\r
+\r
+ lxsdx vs28, 0, BO\r
+\r
+ addi AO, AO, 8\r
+ addi BO, BO, 8\r
+\r
+\r
+ xsmaddadp vs32, vs0, vs24\r
+\r
+.endm\r
+\r
+.macro KERNEL1x1_2\r
+\r
+ lxsdx vs0, 0, AO\r
+\r
+ lxsdx vs24, 0, BO\r
+\r
+ addi AO, AO, 8\r
+ addi BO, BO, 8\r
+\r
+\r
+ xsmaddadp vs32, vs8, vs28\r
+\r
+.endm\r
+\r
+.macro KERNEL1x1_E2\r
+\r
+\r
+ xsmaddadp vs32, vs8, vs28\r
+\r
+.endm\r
+\r
+.macro KERNEL1x1_SUBI1\r
+\r
+ lxsdx vs0, 0, AO\r
+\r
+ lxsdx vs24, 0, BO\r
+\r
+ addi AO, AO, 8\r
+ addi BO, BO, 8\r
+\r
+\r
+ xsmuldp vs32, vs0, vs24\r
+\r
+.endm\r
+\r
+.macro KERNEL1x1_SUB1\r
+\r
+ lxsdx vs0, 0, AO\r
+\r
+ lxsdx vs24, 0, BO\r
+\r
+ addi AO, AO, 8\r
+ addi BO, BO, 8\r
+\r
+\r
+ xsmaddadp vs32, vs0, vs24\r
+\r
+.endm\r
+\r
+.macro SAVE1x1\r
+\r
+ mr T1, CO\r
+\r
+#ifndef TRMMKERNEL\r
+ lxsdx vs0, 0, T1\r
+#endif\r
+\r
+#ifndef TRMMKERNEL\r
+ xsmaddadp vs0, vs32, alpha_r\r
+#else\r
+ xsmuldp vs0, vs32, alpha_r\r
+#endif\r
+\r
+ stxsdx vs0, 0, T1\r
+\r
+ addi CO, CO, 8\r
+\r
+.endm\r
+\r
+\r
+\r
+\r
+/****************************TRMM POINTER REFRESH MACROS*************************/\r
+\r
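+/* SHIFT_REG: REG1 = REG2 * SHIFT_VAL * 8, i.e. an element count\r
+   scaled to a byte offset (sizeof(double) == 8) with one slwi;\r
+   e.g. SHIFT_VAL==16 yields REG2 << 7 == REG2 * 128. */\r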
+.macro SHIFT_REG REG1,REG2,SHIFT_VAL\r
+ .if \SHIFT_VAL==16 \r
+ slwi \REG1, \REG2, 7 \r
+ .elseif \SHIFT_VAL==8 \r
+ slwi \REG1, \REG2, 6 \r
+ .elseif \SHIFT_VAL==4\r
+ slwi \REG1, \REG2, 5 \r
+ .elseif \SHIFT_VAL==2\r
+ slwi \REG1, \REG2, 4 \r
+ .elseif \SHIFT_VAL==1\r
+ slwi \REG1, \REG2, 3 \r
+ .endif\r
+.endm\r
+\r
+/*\r
+//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))\r
+// ptrbb = bb;\r
+// #else\r
+// ptrba += off*C_A;\r
+// ptrbb = bb + off*C_B;\r
+// #endif\r
+*/\r
+.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B\r
+ #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))\r
+ /* ptrbb = bb;*/\r
+ mr \PTR_B,\B_VAL /* refresh BPOINT */\r
+\r
+ #else\r
+ /*\r
+ // ptrba =ptrba+ off*C_A;\r
+ // ptrbb = bb + off*C_B; \r
+ */\r
+ SHIFT_REG T4,\OFF_VAL,\C_B /* T4 = off * C_B * 8 (byte offset into B) */\r
+ SHIFT_REG T2,\OFF_VAL,\C_A /* T2 = off * C_A * 8 (byte offset into A) */\r
+ add \PTR_B, \B_VAL, T4 /* ptrbb = bb + off*C_B */\r
+ add \PTR_A, \PTR_A, T2 /* ptrba += off*C_A */\r
+ #endif \r
+.endm\r
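+\r
+/* Illustrative use (hypothetical sizes): for an 8x2 tile,\r
+   REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2\r
+   either leaves BO at B, or advances BO by off*2*8 and AO by off*8*8\r
+   bytes, depending on the LEFT/TRANSA combination. */\r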
+\r
+\r
+/*\r
+// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))\r
+// temp = bk-off;\r
+// #elif defined(LEFT)\r
+// temp = off+INCR_A; // number of values in A\r
+// #else\r
+// temp = off+INCR_B; // number of values in B\r
+// #endif\r
+*/\r
+.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B\r
+ #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))\r
+ /* temp = bk-off;*/\r
+ sub \TEMP_BK,\BK_VAL,\OFF_VAL\r
+\r
+ #elif defined(LEFT)\r
+ /* temp = off+INCR_A; // number of values in A */\r
+ addi \TEMP_BK, \OFF_VAL, \INCR_A\r
+ #else\r
+ /* temp = off+INCR_B // number of values in B*/\r
+ addi \TEMP_BK,\OFF_VAL, \INCR_B\r
+ #endif\r
+\r
+.endm\r
+/*\r
+// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))\r
+// temp = bk - off;\r
+// #ifdef LEFT\r
+// temp -= C_A; // number of values in A\r
+// #else\r
+// temp -= C_B; // number of values in B\r
+// #endif\r
+// ptrba += temp*C_A;\r
+// ptrbb += temp*C_B;\r
+// #endif\r
+\r
+// #ifdef LEFT\r
+// off += C_A; // number of values in A\r
+// #endif\r
+*/\r
+ \r
+\r
+.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B\r
+\r
+ #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))\r
+ /*temp = bk - off;*/\r
+ sub \TEMP_BK,\BK_VAL,\OFF_VAL\r
+ #ifdef LEFT\r
+ /* temp -= C_A; // number of values in A */\r
+ addi \TEMP_BK,\TEMP_BK,-\C_A\r
+ #else\r
+ /* temp -= C_B; // number of values in B */\r
+ addi \TEMP_BK,\TEMP_BK,-\C_B \r
+ #endif\r
+ /*ptrba += temp*C_A;\r
+ ptrbb += temp*C_B;*/ \r
+ SHIFT_REG T4,\TEMP_BK,\C_A\r
+ SHIFT_REG T2,\TEMP_BK,\C_B\r
+ add \PTR_A, \PTR_A, T4 /* ptrba += temp*C_A */\r
+ add \PTR_B, \PTR_B, T2 /* ptrbb += temp*C_B */\r
+\r
+ #endif\r
+\r
+ #ifdef LEFT\r
+ /* off += C_A; // number of values in A */\r
+ addi \OFF_VAL,\OFF_VAL,\C_A\r
+ #endif\r
+.endm
\ No newline at end of file