--- /dev/null
+/***************************************************************************\r
+Copyright (c) 2013-2019, The OpenBLAS Project\r
+All rights reserved.\r
+Redistribution and use in source and binary forms, with or without\r
+modification, are permitted provided that the following conditions are\r
+met:\r
+1. Redistributions of source code must retain the above copyright\r
+notice, this list of conditions and the following disclaimer.\r
+2. Redistributions in binary form must reproduce the above copyright\r
+notice, this list of conditions and the following disclaimer in\r
+the documentation and/or other materials provided with the\r
+distribution.\r
+3. Neither the name of the OpenBLAS project nor the names of\r
+its contributors may be used to endorse or promote products\r
+derived from this software without specific prior written permission.\r
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"\r
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\r
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE\r
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE\r
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\r
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\r
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\r
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\r
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE\r
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
+*****************************************************************************/\r
+\r
+/**************************************************************************************\r
+* Abdelrauf(quickwritereader@gmail.com)\r
+* BLASTEST : OK\r
+* CTEST : OK\r
+* TEST : OK\r
+* LAPACK-TEST : OK\r
+**************************************************************************************/\r
+#define MY_ALIGN .align 3\r
+b CGEMM_L4\r
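+/* Layout: the helper K-loop subroutines below are only reached via bl;\r
+   the branch above skips over them to CGEMM_L4, where the main loops begin.\r
+   N is walked in strips of 4, then 2, then 1 columns, and M in strips of\r
+   8, 4, 2 and 1 rows. */\r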
+/* MINI SUBROUTINES */ \r
+/* 4x8 MAIN 128x+2 LOOP */ \r
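+/* The LOADmxn/KERNELmxn macros are defined in the companion macros include\r
+   (not shown here). Their operands appear to be (OffsetA, OffsetB, Index,\r
+   IsLast): the byte strides of the packed A and B panels per unrolled pair\r
+   of K iterations, the unroll index, and a flag marking the final call of a\r
+   sequence. For the 4x8 tile that is 128 bytes of A (8 complex singles x 2\r
+   iterations x 8 bytes) and 64 bytes of B (4 complex singles x 2 iterations). */\r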
+\r
+\r
+CGEMM_L4x8_LMAIN_SUB:\r
+/*----------------------------------------*/ \r
+ mtctr T8\r
+ LOAD4x8_2 \r
+ MY_ALIGN\r
+CGEMM_L4x8_LOOP:\r
+/*----------------------------------------*/ \r
+ dcbt AO, PRE\r
+ dcbt BO, PRE\r
+ KERNEL4x8_L2 128,64,0,0 \r
+CGEMM_L4x8_K128:\r
+/*----------------------------------------*/ \r
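+/* Secondary entry point: the K==128/129 fast paths in CGEMM_L4x8_SUB0 bl here\r
+   to run the remainder of this unrolled body exactly once before returning. */\r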
+ KERNEL4x8_L2 128,64,1,0\r
+ dcbt AO, T2 \r
+ KERNEL4x8_L2 128,64,2,0\r
+ KERNEL4x8_L2 128,64,3,0 \r
+ dcbt AO, T3\r
+ dcbt BO, T2\r
+ KERNEL4x8_L2 128,64,4,0\r
+ KERNEL4x8_L2 128,64,5,0\r
+ dcbt AO, T4 \r
+ KERNEL4x8_L2 128,64,6,0\r
+ KERNEL4x8_L2 128,64,7,0 \r
+ dcbt AO, T5 \r
+ dcbt BO, T3\r
+ KERNEL4x8_L2 128,64,8,0\r
+ KERNEL4x8_L2 128,64,9,0\r
+ KERNEL4x8_L2 128,64,10,0\r
+ KERNEL4x8_L2 128,64,11,0 \r
+ dcbt BO, T4\r
+ KERNEL4x8_L2 128,64,12,0\r
+ KERNEL4x8_L2 128,64,13,0\r
+ KERNEL4x8_L2 128,64,14,0\r
+ KERNEL4x8_L2 128,64,15,0 \r
+ KERNEL4x8_L2 128,64,16,0\r
+ KERNEL4x8_L2 128,64,17,0 \r
+ KERNEL4x8_L2 128,64,18,0\r
+ KERNEL4x8_L2 128,64,19,0 \r
+ KERNEL4x8_L2 128,64,20,0\r
+ KERNEL4x8_L2 128,64,21,0 \r
+ KERNEL4x8_L2 128,64,22,0\r
+ KERNEL4x8_L2 128,64,23,0 \r
+ KERNEL4x8_L2 128,64,24,0\r
+ KERNEL4x8_L2 128,64,25,0\r
+ KERNEL4x8_L2 128,64,26,0\r
+ KERNEL4x8_L2 128,64,27,0 \r
+ KERNEL4x8_L2 128,64,28,0\r
+ KERNEL4x8_L2 128,64,29,0\r
+ KERNEL4x8_L2 128,64,30,0\r
+ KERNEL4x8_L2 128,64,31,0 \r
+ KERNEL4x8_L2 128,64,32,0\r
+ KERNEL4x8_L2 128,64,33,0\r
+ KERNEL4x8_L2 128,64,34,0\r
+ KERNEL4x8_L2 128,64,35,0 \r
+ KERNEL4x8_L2 128,64,36,0\r
+ KERNEL4x8_L2 128,64,37,0\r
+ KERNEL4x8_L2 128,64,38,0\r
+ KERNEL4x8_L2 128,64,39,0 \r
+ KERNEL4x8_L2 128,64,40,0\r
+ KERNEL4x8_L2 128,64,41,0\r
+ KERNEL4x8_L2 128,64,42,0\r
+ KERNEL4x8_L2 128,64,43,0 \r
+ KERNEL4x8_L2 128,64,44,0\r
+ KERNEL4x8_L2 128,64,45,0\r
+ KERNEL4x8_L2 128,64,46,0\r
+ KERNEL4x8_L2 128,64,47,0 \r
+ KERNEL4x8_L2 128,64,48,0\r
+ KERNEL4x8_L2 128,64,49,0 \r
+ KERNEL4x8_L2 128,64,50,0\r
+ KERNEL4x8_L2 128,64,51,0 \r
+ KERNEL4x8_L2 128,64,52,0\r
+ KERNEL4x8_L2 128,64,53,0 \r
+ KERNEL4x8_L2 128,64,54,0\r
+ KERNEL4x8_L2 128,64,55,0 \r
+ KERNEL4x8_L2 128,64,56,0\r
+ KERNEL4x8_L2 128,64,57,0\r
+ KERNEL4x8_L2 128,64,58,0\r
+ KERNEL4x8_L2 128,64,59,0 \r
+ KERNEL4x8_L2 128,64,60,0\r
+ KERNEL4x8_L2 128,64,61,0\r
+ KERNEL4x8_L2 128,64,62,0 \r
+ KERNEL4x8_L2 128,64,63,1 \r
+ bdnz CGEMM_L4x8_LOOP\r
+ MY_ALIGN \r
+CGEMM_L4x8_LOOP_END:\r
+/*----------------------------------------*/ \r
+ END4x8_2\r
+ blr\r
+ MY_ALIGN\r
+\r
+\r
+CGEMM_4x8_L64_SUB:\r
+/*----------------------------------------*/ \r
+ LOAD4x8_2 \r
+ dcbt AO, PRE\r
+ dcbt BO, PRE\r
+ KERNEL4x8_L2 128,64,0,0 \r
+ KERNEL4x8_L2 128,64,1,0\r
+ dcbt AO, T2 \r
+ KERNEL4x8_L2 128,64,2,0\r
+ KERNEL4x8_L2 128,64,3,0 \r
+ dcbt AO, T3\r
+ dcbt BO, T2\r
+ KERNEL4x8_L2 128,64,4,0\r
+ KERNEL4x8_L2 128,64,5,0\r
+ dcbt AO, T4 \r
+ KERNEL4x8_L2 128,64,6,0\r
+ KERNEL4x8_L2 128,64,7,0 \r
+ dcbt AO, T5 \r
+ dcbt BO, T3\r
+ KERNEL4x8_L2 128,64,8,0\r
+ KERNEL4x8_L2 128,64,9,0\r
+ KERNEL4x8_L2 128,64,10,0\r
+ KERNEL4x8_L2 128,64,11,0 \r
+ dcbt BO, T4\r
+ KERNEL4x8_L2 128,64,12,0\r
+ KERNEL4x8_L2 128,64,13,0\r
+ KERNEL4x8_L2 128,64,14,0\r
+ KERNEL4x8_L2 128,64,15,0 \r
+ KERNEL4x8_L2 128,64,16,0\r
+ KERNEL4x8_L2 128,64,17,0 \r
+ KERNEL4x8_L2 128,64,18,0\r
+ KERNEL4x8_L2 128,64,19,0 \r
+ KERNEL4x8_L2 128,64,20,0\r
+ KERNEL4x8_L2 128,64,21,0 \r
+ KERNEL4x8_L2 128,64,22,0\r
+ KERNEL4x8_L2 128,64,23,0 \r
+ KERNEL4x8_L2 128,64,24,0\r
+ KERNEL4x8_L2 128,64,25,0\r
+ KERNEL4x8_L2 128,64,26,0\r
+ KERNEL4x8_L2 128,64,27,0 \r
+ KERNEL4x8_L2 128,64,28,0\r
+ KERNEL4x8_L2 128,64,29,0\r
+ KERNEL4x8_L2 128,64,30,0\r
+ KERNEL4x8_E2 128,64,31,1\r
+ blr\r
+ MY_ALIGN\r
+\r
+\r
+CGEMM_4x8_L32_SUB:\r
+/*----------------------------------------*/ \r
+ LOAD4x8_2 \r
+ dcbt AO, PRE\r
+ dcbt BO, PRE\r
+ KERNEL4x8_L2 128,64,0,0 \r
+ KERNEL4x8_L2 128,64,1,0\r
+ dcbt AO, T2 \r
+ KERNEL4x8_L2 128,64,2,0\r
+ KERNEL4x8_L2 128,64,3,0 \r
+ dcbt AO, T3\r
+ dcbt BO, T2\r
+ KERNEL4x8_L2 128,64,4,0\r
+ KERNEL4x8_L2 128,64,5,0\r
+ dcbt AO, T4 \r
+ KERNEL4x8_L2 128,64,6,0\r
+ KERNEL4x8_L2 128,64,7,0 \r
+ dcbt AO, T5 \r
+ dcbt BO, T3\r
+ KERNEL4x8_L2 128,64,8,0\r
+ KERNEL4x8_L2 128,64,9,0\r
+ KERNEL4x8_L2 128,64,10,0\r
+ KERNEL4x8_L2 128,64,11,0 \r
+ dcbt BO, T4\r
+ KERNEL4x8_L2 128,64,12,0\r
+ KERNEL4x8_L2 128,64,13,0\r
+ KERNEL4x8_L2 128,64,14,0\r
+ KERNEL4x8_E2 128,64,15,1\r
+ blr\r
+ MY_ALIGN\r
+\r
+\r
+CGEMM_4x8_L16_SUB:\r
+/*----------------------------------------*/ \r
+ LOAD4x8_2 \r
+ dcbt AO, PRE\r
+ dcbt BO, PRE\r
+ KERNEL4x8_L2 128,64,0,0 \r
+ KERNEL4x8_L2 128,64,1,0\r
+ dcbt AO, T2 \r
+ KERNEL4x8_L2 128,64,2,0\r
+ KERNEL4x8_L2 128,64,3,0 \r
+ dcbt AO, T3\r
+ dcbt BO, T2\r
+ KERNEL4x8_L2 128,64,4,0\r
+ KERNEL4x8_L2 128,64,5,0\r
+ dcbt AO, T4 \r
+ KERNEL4x8_L2 128,64,6,0\r
+ KERNEL4x8_E2 128,64,7,1\r
+ blr\r
+ MY_ALIGN\r
+\r
+\r
+CGEMM_4x4_LMAIN_SUB:\r
+/*----------------------------------------*/ \r
+ mtctr T8\r
+ LOAD4x4_2 \r
+ MY_ALIGN\r
+CGEMM_L4x4_LOOP:\r
+/*----------------------------------------*/ \r
+ KERNEL4x4_L2 64,64,0,0\r
+CGEMM_L4x4_K32:\r
+/*----------------------------------------*/ \r
+ KERNEL4x4_L2 64,64,1,0 \r
+ KERNEL4x4_L2 64,64,2,0\r
+ KERNEL4x4_L2 64,64,3,0 \r
+ KERNEL4x4_L2 64,64,4,0\r
+ KERNEL4x4_L2 64,64,5,0 \r
+ KERNEL4x4_L2 64,64,6,0\r
+ KERNEL4x4_L2 64,64,7,0\r
+ KERNEL4x4_L2 64,64,8,0\r
+ KERNEL4x4_L2 64,64,9,0 \r
+ KERNEL4x4_L2 64,64,10,0\r
+ KERNEL4x4_L2 64,64,11,0 \r
+ KERNEL4x4_L2 64,64,12,0\r
+ KERNEL4x4_L2 64,64,13,0 \r
+ KERNEL4x4_L2 64,64,14,0\r
+ KERNEL4x4_L2 64,64,15,1 \r
+ bdnz CGEMM_L4x4_LOOP\r
+ MY_ALIGN \r
+CGEMM_L4x4_LOOP_END:\r
+/*----------------------------------------*/ \r
+ END4x4_2 \r
+ blr\r
+ MY_ALIGN\r
+\r
+\r
+CGEMM_4x4_L16_SUB:\r
+/*----------------------------------------*/ \r
+ LOAD4x4_2\r
+ KERNEL4x4_L2 64,64,0,0\r
+ KERNEL4x4_L2 64,64,1,0 \r
+ KERNEL4x4_L2 64,64,2,0\r
+ KERNEL4x4_L2 64,64,3,0 \r
+ KERNEL4x4_L2 64,64,4,0\r
+ KERNEL4x4_L2 64,64,5,0 \r
+ KERNEL4x4_L2 64,64,6,0\r
+ KERNEL4x4_E2 64,64,7,1\r
+ blr\r
+ MY_ALIGN\r
+\r
+\r
+CGEMM_4x4_L8_SUB:\r
+/*----------------------------------------*/ \r
+ LOAD4x4_2\r
+ KERNEL4x4_L2 64,64,0,0\r
+ KERNEL4x4_L2 64,64,1,0 \r
+ KERNEL4x4_L2 64,64,2,0\r
+ KERNEL4x4_E2 64,64,3,1 \r
+ blr\r
+\r
+\r
+CGEMM_4x2_LMAIN_SUB:\r
+/*----------------------------------------*/ \r
+ mtctr T8\r
+ LOAD4x2_2 \r
+ MY_ALIGN \r
+CGEMM_L4x2_LOOP:\r
+/*----------------------------------------*/ \r
+ KERNEL4x2_L2 32,64,0,0 \r
+CGEMM_L4x2_K32:\r
+/*----------------------------------------*/ \r
+ KERNEL4x2_L2 32,64,1,0 \r
+ KERNEL4x2_L2 32,64,2,0\r
+ KERNEL4x2_L2 32,64,3,0 \r
+ KERNEL4x2_L2 32,64,4,0\r
+ KERNEL4x2_L2 32,64,5,0 \r
+ KERNEL4x2_L2 32,64,6,0\r
+ KERNEL4x2_L2 32,64,7,0\r
+ KERNEL4x2_L2 32,64,8,0\r
+ KERNEL4x2_L2 32,64,9,0 \r
+ KERNEL4x2_L2 32,64,10,0\r
+ KERNEL4x2_L2 32,64,11,0 \r
+ KERNEL4x2_L2 32,64,12,0\r
+ KERNEL4x2_L2 32,64,13,0 \r
+ KERNEL4x2_L2 32,64,14,0\r
+ KERNEL4x2_L2 32,64,15,1 \r
+ bdnz CGEMM_L4x2_LOOP\r
+ MY_ALIGN \r
+\r
+\r
+CGEMM_L4x2_LOOP_END:\r
+/*----------------------------------------*/ \r
+ END4x2_2 \r
+ blr\r
+ MY_ALIGN\r
+CGEMM_4x2_L16_SUB:\r
+/*----------------------------------------*/ \r
+ LOAD4x2_2\r
+ KERNEL4x2_L2 32,64,0,0\r
+ KERNEL4x2_L2 32,64,1,0 \r
+ KERNEL4x2_L2 32,64,2,0\r
+ KERNEL4x2_L2 32,64,3,0 \r
+ KERNEL4x2_L2 32,64,4,0\r
+ KERNEL4x2_L2 32,64,5,0 \r
+ KERNEL4x2_L2 32,64,6,0\r
+ KERNEL4x2_E2 32,64,7,1\r
+ blr\r
+ MY_ALIGN\r
+CGEMM_4x2_L8_SUB:\r
+/*----------------------------------------*/ \r
+ LOAD4x2_2\r
+ KERNEL4x2_L2 32,64,0,0\r
+ KERNEL4x2_L2 32,64,1,0 \r
+ KERNEL4x2_L2 32,64,2,0\r
+ KERNEL4x2_E2 32,64,3,1 \r
+ blr\r
+\r
+\r
+CGEMM_4x1_LMAIN_SUB:\r
+/*----------------------------------------*/ \r
+ mtctr T8\r
+ LOAD4x1_2 \r
+ MY_ALIGN\r
+CGEMM_L4x1_LOOP:\r
+/*----------------------------------------*/ \r
+ KERNEL4x1_L2 16,64,0,0 \r
+CGEMM_L4x1_K32:\r
+/*----------------------------------------*/ \r
+ KERNEL4x1_L2 16,64,1,0 \r
+ KERNEL4x1_L2 16,64,2,0\r
+ KERNEL4x1_L2 16,64,3,0 \r
+ KERNEL4x1_L2 16,64,4,0\r
+ KERNEL4x1_L2 16,64,5,0 \r
+ KERNEL4x1_L2 16,64,6,0\r
+ KERNEL4x1_L2 16,64,7,0\r
+ KERNEL4x1_L2 16,64,8,0\r
+ KERNEL4x1_L2 16,64,9,0 \r
+ KERNEL4x1_L2 16,64,10,0\r
+ KERNEL4x1_L2 16,64,11,0 \r
+ KERNEL4x1_L2 16,64,12,0\r
+ KERNEL4x1_L2 16,64,13,0 \r
+ KERNEL4x1_L2 16,64,14,0\r
+ KERNEL4x1_L2 16,64,15,1 \r
+ bdnz CGEMM_L4x1_LOOP\r
+ MY_ALIGN \r
+CGEMM_L4x1_LOOP_END:\r
+/*----------------------------------------*/ \r
+ END4x1_2 \r
+ blr\r
+\r
+ MY_ALIGN\r
+CGEMM_4x1_L16_SUB:\r
+/*----------------------------------------*/ \r
+ LOAD4x1_2\r
+ KERNEL4x1_L2 16,64,0,0\r
+ KERNEL4x1_L2 16,64,1,0 \r
+ KERNEL4x1_L2 16,64,2,0\r
+ KERNEL4x1_L2 16,64,3,0 \r
+ KERNEL4x1_L2 16,64,4,0\r
+ KERNEL4x1_L2 16,64,5,0 \r
+ KERNEL4x1_L2 16,64,6,0\r
+ KERNEL4x1_E2 16,64,7,1\r
+ blr\r
+ MY_ALIGN\r
+\r
+\r
+CGEMM_4x1_L8_SUB:\r
+/*----------------------------------------*/ \r
+ LOAD4x1_2\r
+ KERNEL4x1_L2 16,64,0,0\r
+ KERNEL4x1_L2 16,64,1,0 \r
+ KERNEL4x1_L2 16,64,2,0\r
+ KERNEL4x1_E2 16,64,3,1 \r
+ blr\r
+\r
+\r
+\r
+/* MAIN LOOP BEGINS */ \r
+ MY_ALIGN\r
+\r
+\r
+CGEMM_L4:\r
+/*----------------------------------------*/ \r
+#if defined(TRMMKERNEL) && !defined(LEFT) \r
+ neg TEMP_REG, OFFSET \r
+#endif \r
+ srawi. J, N, 2\r
+ ble CGEMM_L4_END\r
+\r
+\r
+CGEMM_L4_BEGIN:\r
+/*----------------------------------------*/ \r
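+/* Per 4-column block: CO walks the current panel of C, AO restarts at A, and\r
+   C advances by T1 = 4*LDC (LDC is assumed to be pre-scaled to bytes by the\r
+   caller, as is usual for these kernels). */\r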
+ mr CO, C\r
+ slwi T1, LDC , 2 \r
+ add T2,C,LDC \r
+ mr AO, A \r
+ add C, C, T1\r
+#if defined(TRMMKERNEL) && defined(LEFT) \r
+ mr TEMP_REG, OFFSET /*off = offset;*/\r
+#endif \r
+ srawi. I, M, 3\r
+ ble CGEMM_L4x8_END\r
+ dcbt CO,r0 /*just prefetch*/\r
+ dcbt T2,r0 \r
+\r
+\r
+CGEMM_L4x8_BEGIN:\r
+/*----------------------------------------*/ \r
+#if defined(TRMMKERNEL) \r
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4\r
+#else \r
+ mr BO, B \r
+ dcbt B, r0 \r
+#endif \r
+ dcbt AO, r0\r
+#if defined(TRMMKERNEL) \r
+ REFRESH_TEMP_BK T6,K,TEMP_REG,8,4\r
+ mr T1, T6\r
+/* TEMPS FOR PREFETCH */ \r
+ li T2, 1024\r
+ li T3, 1024+512\r
+ addi T1,T1, -2\r
+/* TEMPS FOR PREFETCH */ \r
+ li T4, 2048\r
+ li T5, 2048+512 \r
+ srawi. T8, T1, 7 /* T8 = (T6-2) / 128 */\r
+#else \r
+ mr T1, K\r
+/* TEMPS FOR PREFETCH */ \r
+ li T2, 1024\r
+ li T3, 1024+512\r
+ addi T1,T1, -2\r
+/* TEMPS FOR PREFETCH */ \r
+ li T4, 2048\r
+ li T5, 2048+512 \r
+ srawi. T8, T1, 7 /* T8 = (K-2) / 128 */\r
+#endif \r
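+/* T8 = number of full 128-iteration blocks of the K loop (two iterations are\r
+   peeled into LOAD4x8_2); T2..T5 hold the byte distances used by the dcbt\r
+   prefetches inside the unrolled body. */\r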
+ ZERO4x8 \r
+ ble CGEMM_L4x8_SUB0\r
+ bl CGEMM_L4x8_LMAIN_SUB\r
+ andi. L, T1, 127\r
+ ble CGEMM_L4x8_SAVE\r
+ b CGEMM_L4x8_SUB2\r
+\r
+\r
+CGEMM_L4x8_SUB0:\r
+/*----------------------------------------*/ \r
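+/* Fast paths: when the (remaining) K is exactly 129 or 128, the AO/BO\r
+   pointers are pre-biased so the offset-addressed load macros start in the\r
+   right place, the 128-wide body is run exactly once (T8 = 1), and control\r
+   goes straight to the save code, skipping the generic tail chain below. */\r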
+#if defined(TRMMKERNEL) \r
+ andi. L, T6, 255\r
+ cmpwi T6,129\r
+#else \r
+ andi. L, K, 255\r
+ cmpwi K,129\r
+#endif \r
+ li T8,1\r
+ bne CMP4x8_128K\r
+ addi BO,BO,-32\r
+ addi AO,AO,-64 \r
+ LOAD4x8O 64,32 \r
+ END4x8_WITHOUT_ADD \r
+ LOAD4x8_2O 128, 64 \r
+ mtctr T8 \r
+ bl CGEMM_L4x8_K128 \r
+ b CGEMM_L4x8_SAVE \r
+ CMP4x8_128K:\r
+/*----------------------------------------*/ \r
+#if defined(TRMMKERNEL) \r
+ cmpwi T6,128\r
+#else \r
+ cmpwi K,128\r
+#endif \r
+ bne CGEMM_L4x8_SUB2 \r
+ MY_ALIGN \r
+ mtctr T8\r
+ addi BO,BO,-64\r
+ addi AO,AO,-128 \r
+ LOAD4x8_2O 128,64\r
+ bl CGEMM_L4x8_K128 \r
+ b CGEMM_L4x8_SAVE \r
+ MY_ALIGN\r
+\r
+\r
+CGEMM_L4x8_SUB2:\r
+/*----------------------------------------*/ \r
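+/* Generic K tail: the leftover iterations in L are consumed by testing one\r
+   bit at a time (64, 32, 16, 8, 4, 2, 1) and running the matching\r
+   fixed-size sub-block. */\r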
+ andi. T1,L, 64\r
+ ble CGEMM_L4x8_SUB2_32\r
+ bl CGEMM_4x8_L64_SUB\r
+ MY_ALIGN\r
+\r
+\r
+CGEMM_L4x8_SUB2_32:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 32\r
+ ble CGEMM_L4x8_SUB2_16 \r
+ bl CGEMM_4x8_L32_SUB\r
+ MY_ALIGN \r
+\r
+\r
+CGEMM_L4x8_SUB2_16:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 16\r
+ ble CGEMM_L4x8_SUB2_8\r
+ bl CGEMM_4x8_L16_SUB \r
+ MY_ALIGN \r
+\r
+\r
+CGEMM_L4x8_SUB2_8:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 8\r
+ ble CGEMM_L4x8_SUB2_4\r
+ LOAD4x8_2\r
+ KERNEL4x8_L2 128,64, 0,0\r
+ KERNEL4x8_L2 128,64, 1,0\r
+ KERNEL4x8_L2 128,64, 2,0\r
+ KERNEL4x8_E2 128,64, 3,1\r
+ MY_ALIGN \r
+\r
+\r
+CGEMM_L4x8_SUB2_4:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 4\r
+ ble CGEMM_L4x8_SUB2_2\r
+ LOAD4x8_2\r
+ KERNEL4x8_L2 128,64, 0,0\r
+ KERNEL4x8_E2 128,64, 1,1\r
+ MY_ALIGN\r
+\r
+\r
+CGEMM_L4x8_SUB2_2:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 2\r
+ ble CGEMM_L4x8_SUB2_1\r
+ LOAD4x8_2 \r
+ KERNEL4x8_E2 128,64, 0,1\r
+ MY_ALIGN \r
+\r
+\r
+CGEMM_L4x8_SUB2_1:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 1\r
+ ble CGEMM_L4x8_SAVE \r
+ KERNEL4x8\r
+\r
+ MY_ALIGN\r
+CGEMM_L4x8_SAVE:\r
+/*----------------------------------------*/ \r
+ addic. I, I, -1\r
+ MY_ALIGN\r
+ SAVE4x8\r
+#if defined(TRMMKERNEL) \r
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,4\r
+#endif \r
+ bgt CGEMM_L4x8_BEGIN\r
+ andi. T2, M, 7\r
+ ble CGEMM_L4x1_END\r
+ andi. T1, M, 4\r
+ ble CGEMM_L4x4_END\r
+ b CGEMM_L4x4_BEGIN\r
+ MY_ALIGN \r
+\r
+\r
+CGEMM_L4x8_END:\r
+/*----------------------------------------*/ \r
+\r
+\r
+CGEMM_L4x4_BEGIN:\r
+/*----------------------------------------*/ \r
+ andi. T2, M, 7\r
+ ble CGEMM_L4x1_END\r
+ andi. T1, M, 4\r
+ ble CGEMM_L4x4_END\r
+#if defined(TRMMKERNEL) \r
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4\r
+#else \r
+ mr BO, B \r
+#endif \r
+#if defined(TRMMKERNEL) \r
+ REFRESH_TEMP_BK T6,K,TEMP_REG,4,4\r
+ mr T1, T6 \r
+ addi T1,T1, -2 \r
+ srawi. T8, T1, 5 /* T8 = (T6-2) / 32 */\r
+#else \r
+ mr T1, K \r
+ addi T1,T1, -2\r
+ srawi. T8, T1, 5 /* T8 = (K-2) / 32 */\r
+#endif \r
+ ZERO4x4\r
+ ble CGEMM_L4x4_SUB0 \r
+ bl CGEMM_4x4_LMAIN_SUB\r
+ andi. L, T1, 31\r
+ ble CGEMM_L4x4_SAVE\r
+ b CGEMM_L4x4_SUB2\r
+\r
+\r
+CGEMM_L4x4_SUB0:\r
+/*----------------------------------------*/ \r
+#if defined(TRMMKERNEL) \r
+ andi. L, T6, 63\r
+ cmpwi T6,33\r
+#else \r
+ andi. L, K, 63\r
+ cmpwi K,33\r
+#endif \r
+ li T8,1\r
+ bne CMP4x4_32K\r
+ addi BO,BO,-32\r
+ addi AO,AO,-32 \r
+ LOAD4x4O 32,32 \r
+ END4x4_WITHOUT_ADD \r
+ LOAD4x4_2O 64, 64 \r
+ mtctr T8 \r
+ bl CGEMM_L4x4_K32 \r
+ b CGEMM_L4x4_SAVE \r
+ CMP4x4_32K:\r
+/*----------------------------------------*/ \r
+#if defined(TRMMKERNEL) \r
+ cmpwi T6,32\r
+#else \r
+ cmpwi K,32\r
+#endif \r
+ bne CGEMM_L4x4_SUB2 \r
+ MY_ALIGN \r
+ mtctr T8\r
+ addi BO,BO,-64\r
+ addi AO,AO,-64 \r
+ LOAD4x4_2O 64,64\r
+ bl CGEMM_L4x4_K32 \r
+ b CGEMM_L4x4_SAVE \r
+ MY_ALIGN \r
+\r
+\r
+CGEMM_L4x4_SUB2:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 16\r
+ ble CGEMM_L4x4_SUB2_8\r
+ bl CGEMM_4x4_L16_SUB \r
+ MY_ALIGN\r
+\r
+\r
+CGEMM_L4x4_SUB2_8:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 8\r
+ ble CGEMM_L4x4_SUB2_4\r
+ bl CGEMM_4x4_L8_SUB\r
+ MY_ALIGN \r
+\r
+\r
+CGEMM_L4x4_SUB2_4:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 4\r
+ ble CGEMM_L4x4_SUB2_2\r
+ LOAD4x4_2\r
+ KERNEL4x4_L2 64,64, 0,0\r
+ KERNEL4x4_E2 64,64, 1,1\r
+ MY_ALIGN\r
+\r
+\r
+CGEMM_L4x4_SUB2_2:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 2\r
+ ble CGEMM_L4x4_SUB2_1\r
+ LOAD4x4_2\r
+ KERNEL4x4_E2 64,64, 0,1\r
+ MY_ALIGN \r
+\r
+\r
+CGEMM_L4x4_SUB2_1:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 1\r
+ ble CGEMM_L4x4_SAVE \r
+ KERNEL4x4\r
+\r
+\r
+CGEMM_L4x4_SAVE:\r
+/*----------------------------------------*/ \r
+ SAVE4x4\r
+#if defined(TRMMKERNEL) \r
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,4\r
+#endif \r
+\r
+\r
+CGEMM_L4x4_END:\r
+/*----------------------------------------*/ \r
+\r
+\r
+CGEMM_L4x2_BEGIN:\r
+/*----------------------------------------*/ \r
+ andi. T1, M, 2\r
+ ble CGEMM_L4x2_END\r
+#if defined(TRMMKERNEL) \r
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4\r
+#else \r
+ mr BO, B \r
+#endif \r
+#if defined(TRMMKERNEL) \r
+ REFRESH_TEMP_BK T6,K,TEMP_REG,2,4\r
+ mr T1, T6 \r
+ addi T1,T1, -2 \r
+ srawi. T8, T1, 5 /* T8 = (T6-2) / 32 */\r
+#else \r
+ mr T1, K \r
+ addi T1,T1, -2\r
+ srawi. T8, T1, 5 /* T8 = (K-2) / 32 */\r
+#endif \r
+ ZERO4x2\r
+ ble CGEMM_L4x2_SUB0 \r
+ bl CGEMM_4x2_LMAIN_SUB\r
+ andi. L, T1, 31\r
+ ble CGEMM_L4x2_SAVE\r
+ b CGEMM_L4x2_SUB2\r
+\r
+\r
+CGEMM_L4x2_SUB0:\r
+/*----------------------------------------*/ \r
+#if defined(TRMMKERNEL) \r
+ andi. L, T6, 63\r
+ cmpwi T6,33\r
+#else \r
+ andi. L, K, 63\r
+ cmpwi K,33\r
+#endif \r
+ li T8,1\r
+ bne CMP4x2_32K\r
+ addi BO,BO,-32\r
+ addi AO,AO,-16 \r
+ LOAD4x2O 16,32 \r
+ END4x2_WITHOUT_ADD \r
+ LOAD4x2_2O 32, 64 \r
+ mtctr T8 \r
+ bl CGEMM_L4x2_K32 \r
+ b CGEMM_L4x2_SAVE \r
+ CMP4x2_32K:\r
+/*----------------------------------------*/ \r
+#if defined(TRMMKERNEL) \r
+ cmpwi T6,32\r
+#else \r
+ cmpwi K,32\r
+#endif \r
+ bne CGEMM_L4x2_SUB2 \r
+ MY_ALIGN \r
+ mtctr T8\r
+ addi BO,BO,-64\r
+ addi AO,AO,-32 \r
+ LOAD4x2_2O 32,64\r
+ bl CGEMM_L4x2_K32 \r
+ b CGEMM_L4x2_SAVE \r
+ MY_ALIGN \r
+\r
+\r
+CGEMM_L4x2_SUB2:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 16\r
+ ble CGEMM_L4x2_SUB2_8\r
+ bl CGEMM_4x2_L16_SUB \r
+ MY_ALIGN\r
+\r
+\r
+CGEMM_L4x2_SUB2_8:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 8\r
+ ble CGEMM_L4x2_SUB2_4\r
+ bl CGEMM_4x2_L8_SUB\r
+ MY_ALIGN \r
+\r
+\r
+CGEMM_L4x2_SUB2_4:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 4\r
+ ble CGEMM_L4x2_SUB2_2\r
+ LOAD4x2_2\r
+ KERNEL4x2_L2 32,64, 0,0\r
+ KERNEL4x2_E2 32,64, 1,1\r
+ MY_ALIGN\r
+\r
+\r
+CGEMM_L4x2_SUB2_2:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 2\r
+ ble CGEMM_L4x2_SUB2_1\r
+ LOAD4x2_2\r
+ KERNEL4x2_E2 32,64, 0,1\r
+ MY_ALIGN \r
+\r
+\r
+CGEMM_L4x2_SUB2_1:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 1\r
+ ble CGEMM_L4x2_SAVE \r
+ KERNEL4x2\r
+\r
+ MY_ALIGN\r
+CGEMM_L4x2_SAVE:\r
+/*----------------------------------------*/ \r
+ SAVE4x2\r
+#if defined(TRMMKERNEL) \r
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,4\r
+#endif \r
+\r
+\r
+CGEMM_L4x2_END:\r
+/*----------------------------------------*/ \r
+\r
+\r
+CGEMM_L4x1_BEGIN:\r
+/*----------------------------------------*/ \r
+ andi. T1, M, 1\r
+ ble CGEMM_L4x1_END\r
+#if defined(TRMMKERNEL) \r
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4\r
+#else \r
+ mr BO, B \r
+#endif \r
+#if defined(TRMMKERNEL) \r
+ REFRESH_TEMP_BK T6,K,TEMP_REG,1,4\r
+ mr T1, T6 \r
+ addi T1,T1, -2 \r
+ srawi. T8, T1, 5 /* T8 = (T6-2) / 32 */\r
+#else \r
+ mr T1, K \r
+ addi T1,T1, -2\r
+ srawi. T8, T1, 5 /* T8 = (K-2) / 32 */\r
+#endif \r
+ ZERO4x1\r
+ ble CGEMM_L4x1_SUB0 \r
+ bl CGEMM_4x1_LMAIN_SUB\r
+ andi. L, T1, 31\r
+ ble CGEMM_L4x1_SAVE\r
+ b CGEMM_L4x1_SUB2\r
+\r
+\r
+CGEMM_L4x1_SUB0:\r
+/*----------------------------------------*/ \r
+#if defined(TRMMKERNEL) \r
+ andi. L, T6, 63\r
+ cmpwi T6,33\r
+#else \r
+ andi. L, K, 63\r
+ cmpwi K,33\r
+#endif \r
+ li T8,1\r
+ bne CMP4x1_32K\r
+ addi BO,BO,-32\r
+ addi AO,AO,-8 \r
+ LOAD4x1O 8,32 \r
+ END4x1_WITHOUT_ADD \r
+ LOAD4x1_2O 16, 64 \r
+ mtctr T8 \r
+ bl CGEMM_L4x1_K32 \r
+ b CGEMM_L4x1_SAVE \r
+ CMP4x1_32K:\r
+/*----------------------------------------*/ \r
+#if defined(TRMMKERNEL) \r
+ cmpwi T6,32\r
+#else \r
+ cmpwi K,32\r
+#endif \r
+ bne CGEMM_L4x1_SUB2 \r
+ MY_ALIGN \r
+ mtctr T8\r
+ addi BO,BO,-64\r
+ addi AO,AO,-16 \r
+ LOAD4x1_2O 16,64\r
+ bl CGEMM_L4x1_K32 \r
+ b CGEMM_L4x1_SAVE \r
+ MY_ALIGN \r
+\r
+\r
+CGEMM_L4x1_SUB2:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 16\r
+ ble CGEMM_L4x1_SUB2_8\r
+ bl CGEMM_4x1_L16_SUB \r
+ MY_ALIGN\r
+\r
+\r
+CGEMM_L4x1_SUB2_8:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 8\r
+ ble CGEMM_L4x1_SUB2_4\r
+ bl CGEMM_4x1_L8_SUB\r
+ MY_ALIGN \r
+\r
+\r
+CGEMM_L4x1_SUB2_4:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 4\r
+ ble CGEMM_L4x1_SUB2_2\r
+ LOAD4x1_2\r
+ KERNEL4x1_L2 16,64, 0,0\r
+ KERNEL4x1_E2 16,64, 1,1\r
+ MY_ALIGN\r
+\r
+\r
+CGEMM_L4x1_SUB2_2:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 2\r
+ ble CGEMM_L4x1_SUB2_1\r
+ LOAD4x1_2\r
+ KERNEL4x1_E2 16,64, 0,1\r
+ MY_ALIGN \r
+\r
+\r
+CGEMM_L4x1_SUB2_1:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 1\r
+ ble CGEMM_L4x1_SAVE \r
+ KERNEL4x1\r
+\r
+ MY_ALIGN\r
+CGEMM_L4x1_SAVE:\r
+/*----------------------------------------*/ \r
+ \r
+ SAVE4x1\r
+#if defined(TRMMKERNEL) \r
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,4\r
+#endif \r
+\r
+\r
+CGEMM_L4x1_END:\r
+/*----------------------------------------*/ \r
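+/* End of one 4-column block: advance B past the packed panel just consumed\r
+   (K << 5 bytes, i.e. K rows x 4 columns x 8 bytes per complex single),\r
+   bump the TRMM offset when applicable, and loop while J > 0. */\r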
+ slwi T1, K, 5\r
+ addic. J, J, -1\r
+ add B, B, T1\r
+#if defined(TRMMKERNEL) && !defined(LEFT) \r
+ addi TEMP_REG, TEMP_REG, 4\r
+#endif \r
+ bgt CGEMM_L4_BEGIN\r
+\r
+\r
+CGEMM_L4_END:\r
+\r
+b CGEMM_L2\r
+/* MINI SUBROUTINES */ \r
+/* 2x8 MAIN 128x+2 LOOP */ \r
+\r
+\r
+CGEMM_L2x8_LMAIN_SUB:\r
+/*----------------------------------------*/ \r
+ mtctr T8\r
+ LOAD2x8_2 \r
+ MY_ALIGN\r
+CGEMM_L2x8_LOOP:\r
+/*----------------------------------------*/ \r
+ dcbt AO, PRE\r
+ dcbt BO, PRE\r
+ KERNEL2x8_L2 128,32,0,0 \r
+CGEMM_L2x8_K128:\r
+/*----------------------------------------*/ \r
+ KERNEL2x8_L2 128,32,1,0\r
+ dcbt AO, T2 \r
+ KERNEL2x8_L2 128,32,2,0\r
+ KERNEL2x8_L2 128,32,3,0 \r
+ dcbt AO, T3\r
+ dcbt BO, T2\r
+ KERNEL2x8_L2 128,32,4,0\r
+ KERNEL2x8_L2 128,32,5,0\r
+ dcbt AO, T4 \r
+ KERNEL2x8_L2 128,32,6,0\r
+ KERNEL2x8_L2 128,32,7,0 \r
+ dcbt AO, T5 \r
+ dcbt BO, T3\r
+ KERNEL2x8_L2 128,32,8,0\r
+ KERNEL2x8_L2 128,32,9,0\r
+ KERNEL2x8_L2 128,32,10,0\r
+ KERNEL2x8_L2 128,32,11,0 \r
+ dcbt BO, T4\r
+ KERNEL2x8_L2 128,32,12,0\r
+ KERNEL2x8_L2 128,32,13,0\r
+ KERNEL2x8_L2 128,32,14,0\r
+ KERNEL2x8_L2 128,32,15,0 \r
+ KERNEL2x8_L2 128,32,16,0\r
+ KERNEL2x8_L2 128,32,17,0 \r
+ KERNEL2x8_L2 128,32,18,0\r
+ KERNEL2x8_L2 128,32,19,0 \r
+ KERNEL2x8_L2 128,32,20,0\r
+ KERNEL2x8_L2 128,32,21,0 \r
+ KERNEL2x8_L2 128,32,22,0\r
+ KERNEL2x8_L2 128,32,23,0 \r
+ KERNEL2x8_L2 128,32,24,0\r
+ KERNEL2x8_L2 128,32,25,0\r
+ KERNEL2x8_L2 128,32,26,0\r
+ KERNEL2x8_L2 128,32,27,0 \r
+ KERNEL2x8_L2 128,32,28,0\r
+ KERNEL2x8_L2 128,32,29,0\r
+ KERNEL2x8_L2 128,32,30,0\r
+ KERNEL2x8_L2 128,32,31,0 \r
+ KERNEL2x8_L2 128,32,32,0\r
+ KERNEL2x8_L2 128,32,33,0\r
+ KERNEL2x8_L2 128,32,34,0\r
+ KERNEL2x8_L2 128,32,35,0 \r
+ KERNEL2x8_L2 128,32,36,0\r
+ KERNEL2x8_L2 128,32,37,0\r
+ KERNEL2x8_L2 128,32,38,0\r
+ KERNEL2x8_L2 128,32,39,0 \r
+ KERNEL2x8_L2 128,32,40,0\r
+ KERNEL2x8_L2 128,32,41,0\r
+ KERNEL2x8_L2 128,32,42,0\r
+ KERNEL2x8_L2 128,32,43,0 \r
+ KERNEL2x8_L2 128,32,44,0\r
+ KERNEL2x8_L2 128,32,45,0\r
+ KERNEL2x8_L2 128,32,46,0\r
+ KERNEL2x8_L2 128,32,47,0 \r
+ KERNEL2x8_L2 128,32,48,0\r
+ KERNEL2x8_L2 128,32,49,0 \r
+ KERNEL2x8_L2 128,32,50,0\r
+ KERNEL2x8_L2 128,32,51,0 \r
+ KERNEL2x8_L2 128,32,52,0\r
+ KERNEL2x8_L2 128,32,53,0 \r
+ KERNEL2x8_L2 128,32,54,0\r
+ KERNEL2x8_L2 128,32,55,0 \r
+ KERNEL2x8_L2 128,32,56,0\r
+ KERNEL2x8_L2 128,32,57,0\r
+ KERNEL2x8_L2 128,32,58,0\r
+ KERNEL2x8_L2 128,32,59,0 \r
+ KERNEL2x8_L2 128,32,60,0\r
+ KERNEL2x8_L2 128,32,61,0\r
+ KERNEL2x8_L2 128,32,62,0 \r
+ KERNEL2x8_L2 128,32,63,1 \r
+ bdnz CGEMM_L2x8_LOOP\r
+ MY_ALIGN \r
+CGEMM_L2x8_LOOP_END:\r
+/*----------------------------------------*/ \r
+ END2x8_2\r
+ blr\r
+ MY_ALIGN\r
+\r
+\r
+CGEMM_2x8_L64_SUB:\r
+/*----------------------------------------*/ \r
+ LOAD2x8_2 \r
+ dcbt AO, PRE\r
+ dcbt BO, PRE\r
+ KERNEL2x8_L2 128,32,0,0 \r
+ KERNEL2x8_L2 128,32,1,0\r
+ dcbt AO, T2 \r
+ KERNEL2x8_L2 128,32,2,0\r
+ KERNEL2x8_L2 128,32,3,0 \r
+ dcbt AO, T3\r
+ dcbt BO, T2\r
+ KERNEL2x8_L2 128,32,4,0\r
+ KERNEL2x8_L2 128,32,5,0\r
+ dcbt AO, T4 \r
+ KERNEL2x8_L2 128,32,6,0\r
+ KERNEL2x8_L2 128,32,7,0 \r
+ dcbt AO, T5 \r
+ dcbt BO, T3\r
+ KERNEL2x8_L2 128,32,8,0\r
+ KERNEL2x8_L2 128,32,9,0\r
+ KERNEL2x8_L2 128,32,10,0\r
+ KERNEL2x8_L2 128,32,11,0 \r
+ dcbt BO, T4\r
+ KERNEL2x8_L2 128,32,12,0\r
+ KERNEL2x8_L2 128,32,13,0\r
+ KERNEL2x8_L2 128,32,14,0\r
+ KERNEL2x8_L2 128,32,15,0 \r
+ KERNEL2x8_L2 128,32,16,0\r
+ KERNEL2x8_L2 128,32,17,0 \r
+ KERNEL2x8_L2 128,32,18,0\r
+ KERNEL2x8_L2 128,32,19,0 \r
+ KERNEL2x8_L2 128,32,20,0\r
+ KERNEL2x8_L2 128,32,21,0 \r
+ KERNEL2x8_L2 128,32,22,0\r
+ KERNEL2x8_L2 128,32,23,0 \r
+ KERNEL2x8_L2 128,32,24,0\r
+ KERNEL2x8_L2 128,32,25,0\r
+ KERNEL2x8_L2 128,32,26,0\r
+ KERNEL2x8_L2 128,32,27,0 \r
+ KERNEL2x8_L2 128,32,28,0\r
+ KERNEL2x8_L2 128,32,29,0\r
+ KERNEL2x8_L2 128,32,30,0\r
+ KERNEL2x8_E2 128,32,31,1\r
+ blr\r
+ MY_ALIGN\r
+\r
+\r
+CGEMM_2x8_L32_SUB:\r
+/*----------------------------------------*/ \r
+ LOAD2x8_2 \r
+ dcbt AO, PRE\r
+ dcbt BO, PRE\r
+ KERNEL2x8_L2 128,32,0,0 \r
+ KERNEL2x8_L2 128,32,1,0\r
+ dcbt AO, T2 \r
+ KERNEL2x8_L2 128,32,2,0\r
+ KERNEL2x8_L2 128,32,3,0 \r
+ dcbt AO, T3\r
+ dcbt BO, T2\r
+ KERNEL2x8_L2 128,32,4,0\r
+ KERNEL2x8_L2 128,32,5,0\r
+ dcbt AO, T4 \r
+ KERNEL2x8_L2 128,32,6,0\r
+ KERNEL2x8_L2 128,32,7,0 \r
+ dcbt AO, T5 \r
+ dcbt BO, T3\r
+ KERNEL2x8_L2 128,32,8,0\r
+ KERNEL2x8_L2 128,32,9,0\r
+ KERNEL2x8_L2 128,32,10,0\r
+ KERNEL2x8_L2 128,32,11,0 \r
+ dcbt BO, T4\r
+ KERNEL2x8_L2 128,32,12,0\r
+ KERNEL2x8_L2 128,32,13,0\r
+ KERNEL2x8_L2 128,32,14,0\r
+ KERNEL2x8_E2 128,32,15,1\r
+ blr\r
+ MY_ALIGN\r
+\r
+\r
+CGEMM_2x8_L16_SUB:\r
+/*----------------------------------------*/ \r
+ LOAD2x8_2 \r
+ dcbt AO, PRE\r
+ dcbt BO, PRE\r
+ KERNEL2x8_L2 128,32,0,0 \r
+ KERNEL2x8_L2 128,32,1,0\r
+ dcbt AO, T2 \r
+ KERNEL2x8_L2 128,32,2,0\r
+ KERNEL2x8_L2 128,32,3,0 \r
+ dcbt AO, T3\r
+ dcbt BO, T2\r
+ KERNEL2x8_L2 128,32,4,0\r
+ KERNEL2x8_L2 128,32,5,0\r
+ dcbt AO, T4 \r
+ KERNEL2x8_L2 128,32,6,0\r
+ KERNEL2x8_E2 128,32,7,1\r
+ blr\r
+ MY_ALIGN\r
+\r
+\r
+CGEMM_2x4_LMAIN_SUB:\r
+/*----------------------------------------*/ \r
+ mtctr T8\r
+ LOAD2x4_2 \r
+ MY_ALIGN\r
+CGEMM_L2x4_LOOP:\r
+/*----------------------------------------*/ \r
+ KERNEL2x4_L2 64,32,0,0\r
+CGEMM_L2x4_K32:\r
+/*----------------------------------------*/ \r
+ KERNEL2x4_L2 64,32,1,0 \r
+ KERNEL2x4_L2 64,32,2,0\r
+ KERNEL2x4_L2 64,32,3,0 \r
+ KERNEL2x4_L2 64,32,4,0\r
+ KERNEL2x4_L2 64,32,5,0 \r
+ KERNEL2x4_L2 64,32,6,0\r
+ KERNEL2x4_L2 64,32,7,0\r
+ KERNEL2x4_L2 64,32,8,0\r
+ KERNEL2x4_L2 64,32,9,0 \r
+ KERNEL2x4_L2 64,32,10,0\r
+ KERNEL2x4_L2 64,32,11,0 \r
+ KERNEL2x4_L2 64,32,12,0\r
+ KERNEL2x4_L2 64,32,13,0 \r
+ KERNEL2x4_L2 64,32,14,0\r
+ KERNEL2x4_L2 64,32,15,1 \r
+ bdnz CGEMM_L2x4_LOOP\r
+ MY_ALIGN \r
+CGEMM_L2x4_LOOP_END:\r
+/*----------------------------------------*/ \r
+ END2x4_2 \r
+ blr\r
+ MY_ALIGN\r
+\r
+\r
+CGEMM_2x4_L16_SUB:\r
+/*----------------------------------------*/ \r
+ LOAD2x4_2\r
+ KERNEL2x4_L2 64,32,0,0\r
+ KERNEL2x4_L2 64,32,1,0 \r
+ KERNEL2x4_L2 64,32,2,0\r
+ KERNEL2x4_L2 64,32,3,0 \r
+ KERNEL2x4_L2 64,32,4,0\r
+ KERNEL2x4_L2 64,32,5,0 \r
+ KERNEL2x4_L2 64,32,6,0\r
+ KERNEL2x4_E2 64,32,7,1\r
+ blr\r
+ MY_ALIGN\r
+\r
+\r
+CGEMM_2x4_L8_SUB:\r
+/*----------------------------------------*/ \r
+ LOAD2x4_2\r
+ KERNEL2x4_L2 64,32,0,0\r
+ KERNEL2x4_L2 64,32,1,0 \r
+ KERNEL2x4_L2 64,32,2,0\r
+ KERNEL2x4_E2 64,32,3,1 \r
+ blr\r
+\r
+\r
+CGEMM_2x2_LMAIN_SUB:\r
+/*----------------------------------------*/ \r
+ mtctr T8\r
+ LOAD2x2_2 \r
+ MY_ALIGN \r
+CGEMM_L2x2_LOOP:\r
+/*----------------------------------------*/ \r
+ KERNEL2x2_L2 32,32,0,0 \r
+CGEMM_L2x2_K32:\r
+/*----------------------------------------*/ \r
+ KERNEL2x2_L2 32,32,1,0 \r
+ KERNEL2x2_L2 32,32,2,0\r
+ KERNEL2x2_L2 32,32,3,0 \r
+ KERNEL2x2_L2 32,32,4,0\r
+ KERNEL2x2_L2 32,32,5,0 \r
+ KERNEL2x2_L2 32,32,6,0\r
+ KERNEL2x2_L2 32,32,7,0\r
+ KERNEL2x2_L2 32,32,8,0\r
+ KERNEL2x2_L2 32,32,9,0 \r
+ KERNEL2x2_L2 32,32,10,0\r
+ KERNEL2x2_L2 32,32,11,0 \r
+ KERNEL2x2_L2 32,32,12,0\r
+ KERNEL2x2_L2 32,32,13,0 \r
+ KERNEL2x2_L2 32,32,14,0\r
+ KERNEL2x2_L2 32,32,15,1 \r
+ bdnz CGEMM_L2x2_LOOP\r
+ MY_ALIGN \r
+\r
+\r
+CGEMM_L2x2_LOOP_END:\r
+/*----------------------------------------*/ \r
+ END2x2_2 \r
+ blr\r
+ MY_ALIGN\r
+CGEMM_2x2_L16_SUB:\r
+/*----------------------------------------*/ \r
+ LOAD2x2_2\r
+ KERNEL2x2_L2 32,32,0,0\r
+ KERNEL2x2_L2 32,32,1,0 \r
+ KERNEL2x2_L2 32,32,2,0\r
+ KERNEL2x2_L2 32,32,3,0 \r
+ KERNEL2x2_L2 32,32,4,0\r
+ KERNEL2x2_L2 32,32,5,0 \r
+ KERNEL2x2_L2 32,32,6,0\r
+ KERNEL2x2_E2 32,32,7,1\r
+ blr\r
+ MY_ALIGN\r
+CGEMM_2x2_L8_SUB:\r
+/*----------------------------------------*/ \r
+ LOAD2x2_2\r
+ KERNEL2x2_L2 32,32,0,0\r
+ KERNEL2x2_L2 32,32,1,0 \r
+ KERNEL2x2_L2 32,32,2,0\r
+ KERNEL2x2_E2 32,32,3,1 \r
+ blr\r
+\r
+\r
+CGEMM_2x1_LMAIN_SUB:\r
+/*----------------------------------------*/ \r
+ mtctr T8\r
+ LOAD2x1_2 \r
+ MY_ALIGN\r
+CGEMM_L2x1_LOOP:\r
+/*----------------------------------------*/ \r
+ KERNEL2x1_L2 16,32,0,0 \r
+CGEMM_L2x1_K32:\r
+/*----------------------------------------*/ \r
+ KERNEL2x1_L2 16,32,1,0 \r
+ KERNEL2x1_L2 16,32,2,0\r
+ KERNEL2x1_L2 16,32,3,0 \r
+ KERNEL2x1_L2 16,32,4,0\r
+ KERNEL2x1_L2 16,32,5,0 \r
+ KERNEL2x1_L2 16,32,6,0\r
+ KERNEL2x1_L2 16,32,7,0\r
+ KERNEL2x1_L2 16,32,8,0\r
+ KERNEL2x1_L2 16,32,9,0 \r
+ KERNEL2x1_L2 16,32,10,0\r
+ KERNEL2x1_L2 16,32,11,0 \r
+ KERNEL2x1_L2 16,32,12,0\r
+ KERNEL2x1_L2 16,32,13,0 \r
+ KERNEL2x1_L2 16,32,14,0\r
+ KERNEL2x1_L2 16,32,15,1 \r
+ bdnz CGEMM_L2x1_LOOP\r
+ MY_ALIGN \r
+CGEMM_L2x1_LOOP_END:\r
+/*----------------------------------------*/ \r
+ END2x1_2 \r
+ blr\r
+\r
+ MY_ALIGN\r
+CGEMM_2x1_L16_SUB:\r
+/*----------------------------------------*/ \r
+ LOAD2x1_2\r
+ KERNEL2x1_L2 16,32,0,0\r
+ KERNEL2x1_L2 16,32,1,0 \r
+ KERNEL2x1_L2 16,32,2,0\r
+ KERNEL2x1_L2 16,32,3,0 \r
+ KERNEL2x1_L2 16,32,4,0\r
+ KERNEL2x1_L2 16,32,5,0 \r
+ KERNEL2x1_L2 16,32,6,0\r
+ KERNEL2x1_E2 16,32,7,1\r
+ blr\r
+ MY_ALIGN\r
+\r
+\r
+CGEMM_2x1_L8_SUB:\r
+/*----------------------------------------*/ \r
+ LOAD2x1_2\r
+ KERNEL2x1_L2 16,32,0,0\r
+ KERNEL2x1_L2 16,32,1,0 \r
+ KERNEL2x1_L2 16,32,2,0\r
+ KERNEL2x1_E2 16,32,3,1 \r
+ blr\r
+\r
+\r
+\r
+/* MAIN LOOP BEGINS */ \r
+ MY_ALIGN\r
+\r
+\r
+CGEMM_L2:\r
+/*----------------------------------------*/ \r
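+/* N & 2 strip: same structure as the 4-column strip above, with a packed B\r
+   panel half as wide, so the B-offset operands are halved (e.g. 32 instead\r
+   of 64 bytes per unrolled pair for the x8 kernels). */\r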
+\r
+ andi. J, N, 2\r
+ ble CGEMM_L2_END\r
+\r
+\r
+CGEMM_L2_BEGIN:\r
+/*----------------------------------------*/ \r
+ mr CO, C\r
+ slwi T1, LDC , 1 \r
+ add T2,C,LDC \r
+ mr AO, A \r
+ add C, C, T1\r
+#if defined(TRMMKERNEL) && defined(LEFT) \r
+ mr TEMP_REG, OFFSET /*off = offset;*/\r
+#endif \r
+ srawi. I, M, 3\r
+ ble CGEMM_L2x8_END\r
+ dcbt CO,r0 /*just prefetch*/\r
+ dcbt T2,r0 \r
+\r
+\r
+CGEMM_L2x8_BEGIN:\r
+/*----------------------------------------*/ \r
+#if defined(TRMMKERNEL) \r
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2\r
+#else \r
+ mr BO, B \r
+ dcbt B, r0 \r
+#endif \r
+ dcbt AO, r0\r
+#if defined(TRMMKERNEL) \r
+ REFRESH_TEMP_BK T6,K,TEMP_REG,8,2\r
+ mr T1, T6\r
+/* TEMPS FOR PREFETCH */ \r
+ li T2, 1024\r
+ li T3, 1024+512\r
+ addi T1,T1, -2\r
+/* TEMPS FOR PREFETCH */ \r
+ li T4, 2048\r
+ li T5, 2048+512 \r
+ srawi. T8, T1, 7 /* T8 = (T6-2) / 128 */\r
+#else \r
+ mr T1, K\r
+/* TEMPS FOR PREFETCH */ \r
+ li T2, 1024\r
+ li T3, 1024+512\r
+ addi T1,T1, -2\r
+/* TEMPS FOR PREFETCH */ \r
+ li T4, 2048\r
+ li T5, 2048+512 \r
+ srawi. T8, T1, 7 /* T8 = (K-2) / 128 */\r
+#endif \r
+ ZERO2x8 \r
+ ble CGEMM_L2x8_SUB0\r
+ bl CGEMM_L2x8_LMAIN_SUB\r
+ andi. L, T1, 127\r
+ ble CGEMM_L2x8_SAVE\r
+ b CGEMM_L2x8_SUB2\r
+\r
+\r
+CGEMM_L2x8_SUB0:\r
+/*----------------------------------------*/ \r
+#if defined(TRMMKERNEL) \r
+ andi. L, T6, 255\r
+ cmpwi T6,129\r
+#else \r
+ andi. L, K, 255\r
+ cmpwi K,129\r
+#endif \r
+ li T8,1\r
+ bne CMP2x8_128K\r
+ addi BO,BO,-16\r
+ addi AO,AO,-64 \r
+ LOAD2x8O 64,16 \r
+ END2x8_WITHOUT_ADD \r
+ LOAD2x8_2O 128, 32 \r
+ mtctr T8 \r
+ bl CGEMM_L2x8_K128 \r
+ b CGEMM_L2x8_SAVE \r
+ CMP2x8_128K:\r
+/*----------------------------------------*/ \r
+#if defined(TRMMKERNEL) \r
+ cmpwi T6,128\r
+#else \r
+ cmpwi K,128\r
+#endif \r
+ bne CGEMM_L2x8_SUB2 \r
+ MY_ALIGN \r
+ mtctr T8\r
+ addi BO,BO,-32\r
+ addi AO,AO,-128 \r
+ LOAD2x8_2O 128,32\r
+ bl CGEMM_L2x8_K128 \r
+ b CGEMM_L2x8_SAVE \r
+ MY_ALIGN\r
+\r
+\r
+CGEMM_L2x8_SUB2:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 64\r
+ ble CGEMM_L2x8_SUB2_32\r
+ bl CGEMM_2x8_L64_SUB\r
+ MY_ALIGN\r
+\r
+\r
+CGEMM_L2x8_SUB2_32:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 32\r
+ ble CGEMM_L2x8_SUB2_16 \r
+ bl CGEMM_2x8_L32_SUB\r
+ MY_ALIGN \r
+\r
+\r
+CGEMM_L2x8_SUB2_16:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 16\r
+ ble CGEMM_L2x8_SUB2_8\r
+ bl CGEMM_2x8_L16_SUB \r
+ MY_ALIGN \r
+\r
+\r
+CGEMM_L2x8_SUB2_8:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 8\r
+ ble CGEMM_L2x8_SUB2_4\r
+ LOAD2x8_2\r
+ KERNEL2x8_L2 128,32, 0,0\r
+ KERNEL2x8_L2 128,32, 1,0\r
+ KERNEL2x8_L2 128,32, 2,0\r
+ KERNEL2x8_E2 128,32, 3,1\r
+ MY_ALIGN \r
+\r
+\r
+CGEMM_L2x8_SUB2_4:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 4\r
+ ble CGEMM_L2x8_SUB2_2\r
+ LOAD2x8_2\r
+ KERNEL2x8_L2 128,32, 0,0\r
+ KERNEL2x8_E2 128,32, 1,1\r
+ MY_ALIGN\r
+\r
+\r
+CGEMM_L2x8_SUB2_2:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 2\r
+ ble CGEMM_L2x8_SUB2_1\r
+ LOAD2x8_2 \r
+ KERNEL2x8_E2 128,32, 0,1\r
+ MY_ALIGN \r
+\r
+\r
+CGEMM_L2x8_SUB2_1:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 1\r
+ ble CGEMM_L2x8_SAVE \r
+ KERNEL2x8\r
+\r
+ MY_ALIGN\r
+CGEMM_L2x8_SAVE:\r
+/*----------------------------------------*/ \r
+ addic. I, I, -1\r
+ MY_ALIGN\r
+ SAVE2x8\r
+#if defined(TRMMKERNEL) \r
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,2\r
+#endif \r
+ bgt CGEMM_L2x8_BEGIN\r
+ andi. T2, M, 7\r
+ ble CGEMM_L2x1_END\r
+ andi. T1, M, 4\r
+ ble CGEMM_L2x4_END\r
+ b CGEMM_L2x4_BEGIN\r
+ MY_ALIGN \r
+\r
+\r
+CGEMM_L2x8_END:\r
+/*----------------------------------------*/ \r
+\r
+\r
+CGEMM_L2x4_BEGIN:\r
+/*----------------------------------------*/ \r
+ andi. T2, M, 7\r
+ ble CGEMM_L2x1_END\r
+ andi. T1, M, 4\r
+ ble CGEMM_L2x4_END\r
+#if defined(TRMMKERNEL) \r
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2\r
+#else \r
+ mr BO, B \r
+#endif \r
+#if defined(TRMMKERNEL) \r
+ REFRESH_TEMP_BK T6,K,TEMP_REG,4,2\r
+ mr T1, T6 \r
+ addi T1,T1, -2 \r
+ srawi. T8, T1, 5 /* T8 = (T6-2) / 32 */\r
+#else \r
+ mr T1, K \r
+ addi T1,T1, -2\r
+ srawi. T8, T1, 5 /* T8 = (K-2) / 32 */\r
+#endif \r
+ ZERO2x4\r
+ ble CGEMM_L2x4_SUB0 \r
+ bl CGEMM_2x4_LMAIN_SUB\r
+ andi. L, T1, 31\r
+ ble CGEMM_L2x4_SAVE\r
+ b CGEMM_L2x4_SUB2\r
+\r
+\r
+CGEMM_L2x4_SUB0:\r
+/*----------------------------------------*/ \r
+#if defined(TRMMKERNEL) \r
+ andi. L, T6, 63\r
+ cmpwi T6,33\r
+#else \r
+ andi. L, K, 63\r
+ cmpwi K,33\r
+#endif \r
+ li T8,1\r
+ bne CMP2x4_32K\r
+ addi BO,BO,-16\r
+ addi AO,AO,-32 \r
+ LOAD2x4O 32,16 \r
+ END2x4_WITHOUT_ADD \r
+ LOAD2x4_2O 64, 32 \r
+ mtctr T8 \r
+ bl CGEMM_L2x4_K32 \r
+ b CGEMM_L2x4_SAVE \r
+ CMP2x4_32K:\r
+/*----------------------------------------*/ \r
+#if defined(TRMMKERNEL) \r
+ cmpwi T6,32\r
+#else \r
+ cmpwi K,32\r
+#endif \r
+ bne CGEMM_L2x4_SUB2 \r
+ MY_ALIGN \r
+ mtctr T8\r
+ addi BO,BO,-32\r
+ addi AO,AO,-64 \r
+ LOAD2x4_2O 64,32\r
+ bl CGEMM_L2x4_K32 \r
+ b CGEMM_L2x4_SAVE \r
+ MY_ALIGN \r
+\r
+\r
+CGEMM_L2x4_SUB2:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 16\r
+ ble CGEMM_L2x4_SUB2_8\r
+ bl CGEMM_2x4_L16_SUB \r
+ MY_ALIGN\r
+\r
+\r
+CGEMM_L2x4_SUB2_8:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 8\r
+ ble CGEMM_L2x4_SUB2_4\r
+ bl CGEMM_2x4_L8_SUB\r
+ MY_ALIGN \r
+\r
+\r
+CGEMM_L2x4_SUB2_4:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 4\r
+ ble CGEMM_L2x4_SUB2_2\r
+ LOAD2x4_2\r
+ KERNEL2x4_L2 64,32, 0,0\r
+ KERNEL2x4_E2 64,32, 1,1\r
+ MY_ALIGN\r
+\r
+\r
+CGEMM_L2x4_SUB2_2:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 2\r
+ ble CGEMM_L2x4_SUB2_1\r
+ LOAD2x4_2\r
+ KERNEL2x4_E2 64,32, 0,1\r
+ MY_ALIGN \r
+\r
+\r
+CGEMM_L2x4_SUB2_1:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 1\r
+ ble CGEMM_L2x4_SAVE \r
+ KERNEL2x4\r
+\r
+\r
+CGEMM_L2x4_SAVE:\r
+/*----------------------------------------*/ \r
+ SAVE2x4\r
+#if defined(TRMMKERNEL) \r
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,2\r
+#endif \r
+\r
+\r
+CGEMM_L2x4_END:\r
+/*----------------------------------------*/ \r
+\r
+\r
+CGEMM_L2x2_BEGIN:\r
+/*----------------------------------------*/ \r
+ andi. T1, M, 2\r
+ ble CGEMM_L2x2_END\r
+#if defined(TRMMKERNEL) \r
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2\r
+#else \r
+ mr BO, B \r
+#endif \r
+#if defined(TRMMKERNEL) \r
+ REFRESH_TEMP_BK T6,K,TEMP_REG,2,2\r
+ mr T1, T6 \r
+ addi T1,T1, -2 \r
+ srawi. T8, T1, 5 /* T8 = (T6-2) / 32 */\r
+#else \r
+ mr T1, K \r
+ addi T1,T1, -2\r
+ srawi. T8, T1, 5 /* T8 = (K-2) / 32 */\r
+#endif \r
+ ZERO2x2\r
+ ble CGEMM_L2x2_SUB0 \r
+ bl CGEMM_2x2_LMAIN_SUB\r
+ andi. L, T1, 31\r
+ ble CGEMM_L2x2_SAVE\r
+ b CGEMM_L2x2_SUB2\r
+\r
+\r
+CGEMM_L2x2_SUB0:\r
+/*----------------------------------------*/ \r
+#if defined(TRMMKERNEL) \r
+ andi. L, T6, 63\r
+ cmpwi T6,33\r
+#else \r
+ andi. L, K, 63\r
+ cmpwi K,33\r
+#endif \r
+ li T8,1\r
+ bne CMP2x2_32K\r
+ addi BO,BO,-16\r
+ addi AO,AO,-16 \r
+ LOAD2x2O 16,16 \r
+ END2x2_WITHOUT_ADD \r
+ LOAD2x2_2O 32, 32 \r
+ mtctr T8 \r
+ bl CGEMM_L2x2_K32 \r
+ b CGEMM_L2x2_SAVE \r
+ CMP2x2_32K:\r
+/*----------------------------------------*/ \r
+#if defined(TRMMKERNEL) \r
+ cmpwi T6,32\r
+#else \r
+ cmpwi K,32\r
+#endif \r
+ bne CGEMM_L2x2_SUB2 \r
+ MY_ALIGN \r
+ mtctr T8\r
+ addi BO,BO,-32\r
+ addi AO,AO,-32 \r
+ LOAD2x2_2O 32,32\r
+ bl CGEMM_L2x2_K32 \r
+ b CGEMM_L2x2_SAVE \r
+ MY_ALIGN \r
+\r
+\r
+CGEMM_L2x2_SUB2:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 16\r
+ ble CGEMM_L2x2_SUB2_8\r
+ bl CGEMM_2x2_L16_SUB \r
+ MY_ALIGN\r
+\r
+\r
+CGEMM_L2x2_SUB2_8:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 8\r
+ ble CGEMM_L2x2_SUB2_4\r
+ bl CGEMM_2x2_L8_SUB\r
+ MY_ALIGN \r
+\r
+\r
+CGEMM_L2x2_SUB2_4:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 4\r
+ ble CGEMM_L2x2_SUB2_2\r
+ LOAD2x2_2\r
+ KERNEL2x2_L2 32,32, 0,0\r
+ KERNEL2x2_E2 32,32, 1,1\r
+ MY_ALIGN\r
+\r
+\r
+CGEMM_L2x2_SUB2_2:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 2\r
+ ble CGEMM_L2x2_SUB2_1\r
+ LOAD2x2_2\r
+ KERNEL2x2_E2 32,32, 0,1\r
+ MY_ALIGN \r
+\r
+\r
+CGEMM_L2x2_SUB2_1:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 1\r
+ ble CGEMM_L2x2_SAVE \r
+ KERNEL2x2\r
+\r
+ MY_ALIGN\r
+CGEMM_L2x2_SAVE:\r
+/*----------------------------------------*/ \r
+ SAVE2x2\r
+#if defined(TRMMKERNEL) \r
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,2\r
+#endif \r
+\r
+\r
+CGEMM_L2x2_END:\r
+/*----------------------------------------*/ \r
+\r
+\r
+CGEMM_L2x1_BEGIN:\r
+/*----------------------------------------*/ \r
+ andi. T1, M, 1\r
+ ble CGEMM_L2x1_END\r
+#if defined(TRMMKERNEL) \r
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2\r
+#else \r
+ mr BO, B \r
+#endif \r
+#if defined(TRMMKERNEL) \r
+ REFRESH_TEMP_BK T6,K,TEMP_REG,1,2\r
+ mr T1, T6 \r
+ addi T1,T1, -2 \r
+ srawi. T8, T1, 5 /* T8 = (T6-2) / 32 */\r
+#else \r
+ mr T1, K \r
+ addi T1,T1, -2\r
+ srawi. T8, T1, 5 /* T8 = (K-2) / 32 */\r
+#endif \r
+ ZERO2x1\r
+ ble CGEMM_L2x1_SUB0 \r
+ bl CGEMM_2x1_LMAIN_SUB\r
+ andi. L, T1, 31\r
+ ble CGEMM_L2x1_SAVE\r
+ b CGEMM_L2x1_SUB2\r
+\r
+\r
+CGEMM_L2x1_SUB0:\r
+/*----------------------------------------*/ \r
+#if defined(TRMMKERNEL) \r
+ andi. L, T6, 63\r
+ cmpwi T6,33\r
+#else \r
+ andi. L, K, 63\r
+ cmpwi K,33\r
+#endif \r
+ li T8,1\r
+ bne CMP2x1_32K\r
+ addi BO,BO,-16\r
+ addi AO,AO,-8 \r
+ LOAD2x1O 8,16 \r
+ END2x1_WITHOUT_ADD \r
+ LOAD2x1_2O 16, 32 \r
+ mtctr T8 \r
+ bl CGEMM_L2x1_K32 \r
+ b CGEMM_L2x1_SAVE \r
+ CMP2x1_32K:\r
+/*----------------------------------------*/ \r
+#if defined(TRMMKERNEL) \r
+ cmpwi T6,32\r
+#else \r
+ cmpwi K,32\r
+#endif \r
+ bne CGEMM_L2x1_SUB2 \r
+ MY_ALIGN \r
+ mtctr T8\r
+ addi BO,BO,-32\r
+ addi AO,AO,-16 \r
+ LOAD2x1_2O 16,32\r
+ bl CGEMM_L2x1_K32 \r
+ b CGEMM_L2x1_SAVE \r
+ MY_ALIGN \r
+\r
+\r
+CGEMM_L2x1_SUB2:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 16\r
+ ble CGEMM_L2x1_SUB2_8\r
+ bl CGEMM_2x1_L16_SUB \r
+ MY_ALIGN\r
+\r
+\r
+CGEMM_L2x1_SUB2_8:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 8\r
+ ble CGEMM_L2x1_SUB2_4\r
+ bl CGEMM_2x1_L8_SUB\r
+ MY_ALIGN \r
+\r
+\r
+CGEMM_L2x1_SUB2_4:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 4\r
+ ble CGEMM_L2x1_SUB2_2\r
+ LOAD2x1_2\r
+ KERNEL2x1_L2 16,32, 0,0\r
+ KERNEL2x1_E2 16,32, 1,1\r
+ MY_ALIGN\r
+\r
+\r
+CGEMM_L2x1_SUB2_2:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 2\r
+ ble CGEMM_L2x1_SUB2_1\r
+ LOAD2x1_2\r
+ KERNEL2x1_E2 16,32, 0,1\r
+ MY_ALIGN \r
+\r
+\r
+CGEMM_L2x1_SUB2_1:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 1\r
+ ble CGEMM_L2x1_SAVE \r
+ KERNEL2x1\r
+\r
+ MY_ALIGN\r
+CGEMM_L2x1_SAVE:\r
+/*----------------------------------------*/ \r
+ \r
+ SAVE2x1\r
+#if defined(TRMMKERNEL) \r
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,2\r
+#endif \r
+\r
+\r
+CGEMM_L2x1_END:\r
+/*----------------------------------------*/ \r
+ slwi T1, K, 4\r
+\r
+ add B, B, T1\r
+#if defined(TRMMKERNEL) && !defined(LEFT) \r
+ addi TEMP_REG, TEMP_REG, 2\r
+#endif \r
+\r
+CGEMM_L2_END:\r
+\r
+\r
+b CGEMM_L1\r
+/* MINI SUBROUTINES */ \r
+/* 1x8 MAIN 128x+2 LOOP */ \r
+\r
+\r
+CGEMM_L1x8_LMAIN_SUB:\r
+/*----------------------------------------*/ \r
+ mtctr T8\r
+ LOAD1x8_2 \r
+ MY_ALIGN\r
+CGEMM_L1x8_LOOP:\r
+/*----------------------------------------*/ \r
+ dcbt AO, PRE\r
+ dcbt BO, PRE\r
+ KERNEL1x8_L2 128,16,0,0 \r
+CGEMM_L1x8_K128:\r
+/*----------------------------------------*/ \r
+ KERNEL1x8_L2 128,16,1,0\r
+ dcbt AO, T2 \r
+ KERNEL1x8_L2 128,16,2,0\r
+ KERNEL1x8_L2 128,16,3,0 \r
+ dcbt AO, T3\r
+ dcbt BO, T2\r
+ KERNEL1x8_L2 128,16,4,0\r
+ KERNEL1x8_L2 128,16,5,0\r
+ dcbt AO, T4 \r
+ KERNEL1x8_L2 128,16,6,0\r
+ KERNEL1x8_L2 128,16,7,0 \r
+ dcbt AO, T5 \r
+ dcbt BO, T3\r
+ KERNEL1x8_L2 128,16,8,0\r
+ KERNEL1x8_L2 128,16,9,0\r
+ KERNEL1x8_L2 128,16,10,0\r
+ KERNEL1x8_L2 128,16,11,0 \r
+ dcbt BO, T4\r
+ KERNEL1x8_L2 128,16,12,0\r
+ KERNEL1x8_L2 128,16,13,0\r
+ KERNEL1x8_L2 128,16,14,0\r
+ KERNEL1x8_L2 128,16,15,0 \r
+ KERNEL1x8_L2 128,16,16,0\r
+ KERNEL1x8_L2 128,16,17,0 \r
+ KERNEL1x8_L2 128,16,18,0\r
+ KERNEL1x8_L2 128,16,19,0 \r
+ KERNEL1x8_L2 128,16,20,0\r
+ KERNEL1x8_L2 128,16,21,0 \r
+ KERNEL1x8_L2 128,16,22,0\r
+ KERNEL1x8_L2 128,16,23,0 \r
+ KERNEL1x8_L2 128,16,24,0\r
+ KERNEL1x8_L2 128,16,25,0\r
+ KERNEL1x8_L2 128,16,26,0\r
+ KERNEL1x8_L2 128,16,27,0 \r
+ KERNEL1x8_L2 128,16,28,0\r
+ KERNEL1x8_L2 128,16,29,0\r
+ KERNEL1x8_L2 128,16,30,0\r
+ KERNEL1x8_L2 128,16,31,0 \r
+ KERNEL1x8_L2 128,16,32,0\r
+ KERNEL1x8_L2 128,16,33,0\r
+ KERNEL1x8_L2 128,16,34,0\r
+ KERNEL1x8_L2 128,16,35,0 \r
+ KERNEL1x8_L2 128,16,36,0\r
+ KERNEL1x8_L2 128,16,37,0\r
+ KERNEL1x8_L2 128,16,38,0\r
+ KERNEL1x8_L2 128,16,39,0 \r
+ KERNEL1x8_L2 128,16,40,0\r
+ KERNEL1x8_L2 128,16,41,0\r
+ KERNEL1x8_L2 128,16,42,0\r
+ KERNEL1x8_L2 128,16,43,0 \r
+ KERNEL1x8_L2 128,16,44,0\r
+ KERNEL1x8_L2 128,16,45,0\r
+ KERNEL1x8_L2 128,16,46,0\r
+ KERNEL1x8_L2 128,16,47,0 \r
+ KERNEL1x8_L2 128,16,48,0\r
+ KERNEL1x8_L2 128,16,49,0 \r
+ KERNEL1x8_L2 128,16,50,0\r
+ KERNEL1x8_L2 128,16,51,0 \r
+ KERNEL1x8_L2 128,16,52,0\r
+ KERNEL1x8_L2 128,16,53,0 \r
+ KERNEL1x8_L2 128,16,54,0\r
+ KERNEL1x8_L2 128,16,55,0 \r
+ KERNEL1x8_L2 128,16,56,0\r
+ KERNEL1x8_L2 128,16,57,0\r
+ KERNEL1x8_L2 128,16,58,0\r
+ KERNEL1x8_L2 128,16,59,0 \r
+ KERNEL1x8_L2 128,16,60,0\r
+ KERNEL1x8_L2 128,16,61,0\r
+ KERNEL1x8_L2 128,16,62,0 \r
+ KERNEL1x8_L2 128,16,63,1 \r
+ bdnz CGEMM_L1x8_LOOP\r
+ MY_ALIGN \r
+CGEMM_L1x8_LOOP_END:\r
+/*----------------------------------------*/ \r
+ END1x8_2\r
+ blr\r
+ MY_ALIGN\r
+\r
+\r
+CGEMM_1x8_L64_SUB:\r
+/*----------------------------------------*/ \r
+ LOAD1x8_2 \r
+ dcbt AO, PRE\r
+ dcbt BO, PRE\r
+ KERNEL1x8_L2 128,16,0,0 \r
+ KERNEL1x8_L2 128,16,1,0\r
+ dcbt AO, T2 \r
+ KERNEL1x8_L2 128,16,2,0\r
+ KERNEL1x8_L2 128,16,3,0 \r
+ dcbt AO, T3\r
+ dcbt BO, T2\r
+ KERNEL1x8_L2 128,16,4,0\r
+ KERNEL1x8_L2 128,16,5,0\r
+ dcbt AO, T4 \r
+ KERNEL1x8_L2 128,16,6,0\r
+ KERNEL1x8_L2 128,16,7,0 \r
+ dcbt AO, T5 \r
+ dcbt BO, T3\r
+ KERNEL1x8_L2 128,16,8,0\r
+ KERNEL1x8_L2 128,16,9,0\r
+ KERNEL1x8_L2 128,16,10,0\r
+ KERNEL1x8_L2 128,16,11,0 \r
+ dcbt BO, T4\r
+ KERNEL1x8_L2 128,16,12,0\r
+ KERNEL1x8_L2 128,16,13,0\r
+ KERNEL1x8_L2 128,16,14,0\r
+ KERNEL1x8_L2 128,16,15,0 \r
+ KERNEL1x8_L2 128,16,16,0\r
+ KERNEL1x8_L2 128,16,17,0 \r
+ KERNEL1x8_L2 128,16,18,0\r
+ KERNEL1x8_L2 128,16,19,0 \r
+ KERNEL1x8_L2 128,16,20,0\r
+ KERNEL1x8_L2 128,16,21,0 \r
+ KERNEL1x8_L2 128,16,22,0\r
+ KERNEL1x8_L2 128,16,23,0 \r
+ KERNEL1x8_L2 128,16,24,0\r
+ KERNEL1x8_L2 128,16,25,0\r
+ KERNEL1x8_L2 128,16,26,0\r
+ KERNEL1x8_L2 128,16,27,0 \r
+ KERNEL1x8_L2 128,16,28,0\r
+ KERNEL1x8_L2 128,16,29,0\r
+ KERNEL1x8_L2 128,16,30,0\r
+ KERNEL1x8_E2 128,16,31,1\r
+ blr\r
+ MY_ALIGN\r
+\r
+\r
+CGEMM_1x8_L32_SUB:\r
+/*----------------------------------------*/ \r
+ LOAD1x8_2 \r
+ dcbt AO, PRE\r
+ dcbt BO, PRE\r
+ KERNEL1x8_L2 128,16,0,0 \r
+ KERNEL1x8_L2 128,16,1,0\r
+ dcbt AO, T2 \r
+ KERNEL1x8_L2 128,16,2,0\r
+ KERNEL1x8_L2 128,16,3,0 \r
+ dcbt AO, T3\r
+ dcbt BO, T2\r
+ KERNEL1x8_L2 128,16,4,0\r
+ KERNEL1x8_L2 128,16,5,0\r
+ dcbt AO, T4 \r
+ KERNEL1x8_L2 128,16,6,0\r
+ KERNEL1x8_L2 128,16,7,0 \r
+ dcbt AO, T5 \r
+ dcbt BO, T3\r
+ KERNEL1x8_L2 128,16,8,0\r
+ KERNEL1x8_L2 128,16,9,0\r
+ KERNEL1x8_L2 128,16,10,0\r
+ KERNEL1x8_L2 128,16,11,0 \r
+ dcbt BO, T4\r
+ KERNEL1x8_L2 128,16,12,0\r
+ KERNEL1x8_L2 128,16,13,0\r
+ KERNEL1x8_L2 128,16,14,0\r
+ KERNEL1x8_E2 128,16,15,1\r
+ blr\r
+ MY_ALIGN\r
+\r
+\r
+CGEMM_1x8_L16_SUB:\r
+/*----------------------------------------*/ \r
+ LOAD1x8_2 \r
+ dcbt AO, PRE\r
+ dcbt BO, PRE\r
+ KERNEL1x8_L2 128,16,0,0 \r
+ KERNEL1x8_L2 128,16,1,0\r
+ dcbt AO, T2 \r
+ KERNEL1x8_L2 128,16,2,0\r
+ KERNEL1x8_L2 128,16,3,0 \r
+ dcbt AO, T3\r
+ dcbt BO, T2\r
+ KERNEL1x8_L2 128,16,4,0\r
+ KERNEL1x8_L2 128,16,5,0\r
+ dcbt AO, T4 \r
+ KERNEL1x8_L2 128,16,6,0\r
+ KERNEL1x8_E2 128,16,7,1\r
+ blr\r
+ MY_ALIGN\r
+\r
+\r
+CGEMM_1x4_LMAIN_SUB:\r
+/*----------------------------------------*/ \r
+ mtctr T8\r
+ LOAD1x4_2 \r
+ MY_ALIGN\r
+CGEMM_L1x4_LOOP:\r
+/*----------------------------------------*/ \r
+ KERNEL1x4_L2 64,16,0,0\r
+CGEMM_L1x4_K32:\r
+/*----------------------------------------*/ \r
+ KERNEL1x4_L2 64,16,1,0 \r
+ KERNEL1x4_L2 64,16,2,0\r
+ KERNEL1x4_L2 64,16,3,0 \r
+ KERNEL1x4_L2 64,16,4,0\r
+ KERNEL1x4_L2 64,16,5,0 \r
+ KERNEL1x4_L2 64,16,6,0\r
+ KERNEL1x4_L2 64,16,7,0\r
+ KERNEL1x4_L2 64,16,8,0\r
+ KERNEL1x4_L2 64,16,9,0 \r
+ KERNEL1x4_L2 64,16,10,0\r
+ KERNEL1x4_L2 64,16,11,0 \r
+ KERNEL1x4_L2 64,16,12,0\r
+ KERNEL1x4_L2 64,16,13,0 \r
+ KERNEL1x4_L2 64,16,14,0\r
+ KERNEL1x4_L2 64,16,15,1 \r
+ bdnz CGEMM_L1x4_LOOP\r
+ MY_ALIGN \r
+CGEMM_L1x4_LOOP_END:\r
+/*----------------------------------------*/ \r
+ END1x4_2 \r
+ blr\r
+ MY_ALIGN\r
+\r
+\r
+CGEMM_1x4_L16_SUB:\r
+/*----------------------------------------*/ \r
+ LOAD1x4_2\r
+ KERNEL1x4_L2 64,16,0,0\r
+ KERNEL1x4_L2 64,16,1,0 \r
+ KERNEL1x4_L2 64,16,2,0\r
+ KERNEL1x4_L2 64,16,3,0 \r
+ KERNEL1x4_L2 64,16,4,0\r
+ KERNEL1x4_L2 64,16,5,0 \r
+ KERNEL1x4_L2 64,16,6,0\r
+ KERNEL1x4_E2 64,16,7,1\r
+ blr\r
+ MY_ALIGN\r
+\r
+\r
+CGEMM_1x4_L8_SUB:\r
+/*----------------------------------------*/ \r
+ LOAD1x4_2\r
+ KERNEL1x4_L2 64,16,0,0\r
+ KERNEL1x4_L2 64,16,1,0 \r
+ KERNEL1x4_L2 64,16,2,0\r
+ KERNEL1x4_E2 64,16,3,1 \r
+ blr\r
+\r
+\r
+CGEMM_1x2_LMAIN_SUB:\r
+/*----------------------------------------*/ \r
+ mtctr T8\r
+ LOAD1x2_2 \r
+ MY_ALIGN \r
+CGEMM_L1x2_LOOP:\r
+/*----------------------------------------*/ \r
+ KERNEL1x2_L2 32,16,0,0 \r
+CGEMM_L1x2_K32:\r
+/*----------------------------------------*/ \r
+ KERNEL1x2_L2 32,16,1,0 \r
+ KERNEL1x2_L2 32,16,2,0\r
+ KERNEL1x2_L2 32,16,3,0 \r
+ KERNEL1x2_L2 32,16,4,0\r
+ KERNEL1x2_L2 32,16,5,0 \r
+ KERNEL1x2_L2 32,16,6,0\r
+ KERNEL1x2_L2 32,16,7,0\r
+ KERNEL1x2_L2 32,16,8,0\r
+ KERNEL1x2_L2 32,16,9,0 \r
+ KERNEL1x2_L2 32,16,10,0\r
+ KERNEL1x2_L2 32,16,11,0 \r
+ KERNEL1x2_L2 32,16,12,0\r
+ KERNEL1x2_L2 32,16,13,0 \r
+ KERNEL1x2_L2 32,16,14,0\r
+ KERNEL1x2_L2 32,16,15,1 \r
+ bdnz CGEMM_L1x2_LOOP\r
+ MY_ALIGN \r
+\r
+\r
+CGEMM_L1x2_LOOP_END:\r
+/*----------------------------------------*/ \r
+ END1x2_2 \r
+ blr\r
+ MY_ALIGN\r
+CGEMM_1x2_L16_SUB:\r
+/*----------------------------------------*/ \r
+ LOAD1x2_2\r
+ KERNEL1x2_L2 32,16,0,0\r
+ KERNEL1x2_L2 32,16,1,0 \r
+ KERNEL1x2_L2 32,16,2,0\r
+ KERNEL1x2_L2 32,16,3,0 \r
+ KERNEL1x2_L2 32,16,4,0\r
+ KERNEL1x2_L2 32,16,5,0 \r
+ KERNEL1x2_L2 32,16,6,0\r
+ KERNEL1x2_E2 32,16,7,1\r
+ blr\r
+ MY_ALIGN\r
+CGEMM_1x2_L8_SUB:\r
+/*----------------------------------------*/ \r
+ LOAD1x2_2\r
+ KERNEL1x2_L2 32,16,0,0\r
+ KERNEL1x2_L2 32,16,1,0 \r
+ KERNEL1x2_L2 32,16,2,0\r
+ KERNEL1x2_E2 32,16,3,1 \r
+ blr\r
+\r
+\r
+CGEMM_1x1_LMAIN_SUB:\r
+/*----------------------------------------*/ \r
+ mtctr T8\r
+ LOAD1x1_2 \r
+ MY_ALIGN\r
+CGEMM_L1x1_LOOP:\r
+/*----------------------------------------*/ \r
+ KERNEL1x1_L2 16,16,0,0 \r
+CGEMM_L1x1_K32:\r
+/*----------------------------------------*/ \r
+ KERNEL1x1_L2 16,16,1,0 \r
+ KERNEL1x1_L2 16,16,2,0\r
+ KERNEL1x1_L2 16,16,3,0 \r
+ KERNEL1x1_L2 16,16,4,0\r
+ KERNEL1x1_L2 16,16,5,0 \r
+ KERNEL1x1_L2 16,16,6,0\r
+ KERNEL1x1_L2 16,16,7,0\r
+ KERNEL1x1_L2 16,16,8,0\r
+ KERNEL1x1_L2 16,16,9,0 \r
+ KERNEL1x1_L2 16,16,10,0\r
+ KERNEL1x1_L2 16,16,11,0 \r
+ KERNEL1x1_L2 16,16,12,0\r
+ KERNEL1x1_L2 16,16,13,0 \r
+ KERNEL1x1_L2 16,16,14,0\r
+ KERNEL1x1_L2 16,16,15,1 \r
+ bdnz CGEMM_L1x1_LOOP\r
+ MY_ALIGN \r
+CGEMM_L1x1_LOOP_END:\r
+/*----------------------------------------*/ \r
+ END1x1_2 \r
+ blr\r
+\r
+ MY_ALIGN\r
+CGEMM_1x1_L16_SUB:\r
+/*----------------------------------------*/ \r
+ LOAD1x1_2\r
+ KERNEL1x1_L2 16,16,0,0\r
+ KERNEL1x1_L2 16,16,1,0 \r
+ KERNEL1x1_L2 16,16,2,0\r
+ KERNEL1x1_L2 16,16,3,0 \r
+ KERNEL1x1_L2 16,16,4,0\r
+ KERNEL1x1_L2 16,16,5,0 \r
+ KERNEL1x1_L2 16,16,6,0\r
+ KERNEL1x1_E2 16,16,7,1\r
+ blr\r
+ MY_ALIGN\r
+\r
+\r
+CGEMM_1x1_L8_SUB:\r
+/*----------------------------------------*/ \r
+ LOAD1x1_2\r
+ KERNEL1x1_L2 16,16,0,0\r
+ KERNEL1x1_L2 16,16,1,0 \r
+ KERNEL1x1_L2 16,16,2,0\r
+ KERNEL1x1_E2 16,16,3,1 \r
+ blr\r
+\r
+\r
+\r
+/* MAIN LOOP BEGINS */ \r
+ MY_ALIGN\r
+\r
+\r
+CGEMM_L1:\r
+/*----------------------------------------*/ \r
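+/* N & 1 strip: final single-column pass; the B-offset operands halve again\r
+   (16 bytes per unrolled pair for the x8 kernels). */\r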
+\r
+ andi. J, N, 1\r
+ ble CGEMM_L1_END\r
+\r
+CGEMM_L1_BEGIN:\r
+/*----------------------------------------*/ \r
+ mr CO, C \r
+ add T2,C,LDC \r
+ mr AO, A \r
+ add C, C, T1\r
+#if defined(TRMMKERNEL) && defined(LEFT) \r
+ mr TEMP_REG, OFFSET /*off = offset;*/\r
+#endif \r
+ srawi. I, M, 3\r
+ ble CGEMM_L1x8_END\r
+ dcbt CO,r0 /*just prefetch*/\r
+ dcbt T2,r0 \r
+\r
+\r
+CGEMM_L1x8_BEGIN:\r
+/*----------------------------------------*/ \r
+#if defined(TRMMKERNEL) \r
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1\r
+#else \r
+ mr BO, B \r
+ dcbt B, r0 \r
+#endif \r
+ dcbt AO, r0\r
+#if defined(TRMMKERNEL) \r
+ REFRESH_TEMP_BK T6,K,TEMP_REG,8,1\r
+ mr T1, T6\r
+/* TEMPS FOR PREFETCH */ \r
+ li T2, 1024\r
+ li T3, 1024+512\r
+ addi T1,T1, -2\r
+/* TEMPS FOR PREFETCH */ \r
+ li T4, 2048\r
+ li T5, 2048+512 \r
+ srawi. T8, T1, 7 /* T8 = (T6-2) / 128 */\r
+#else \r
+ mr T1, K\r
+/* TEMPS FOR PREFETCH */ \r
+ li T2, 1024\r
+ li T3, 1024+512\r
+ addi T1,T1, -2\r
+/* TEMPS FOR PREFETCH */ \r
+ li T4, 2048\r
+ li T5, 2048+512 \r
+ srawi. T8, T1, 7 /* T8 = (K-2) / 128 */\r
+#endif \r
+ ZERO1x8 \r
+ ble CGEMM_L1x8_SUB0\r
+ bl CGEMM_L1x8_LMAIN_SUB\r
+ andi. L, T1, 127\r
+ ble CGEMM_L1x8_SAVE\r
+ b CGEMM_L1x8_SUB2\r
+\r
+\r
+CGEMM_L1x8_SUB0:\r
+/*----------------------------------------*/ \r
+#if defined(TRMMKERNEL) \r
+ andi. L, T6, 255\r
+ cmpwi T6,129\r
+#else \r
+ andi. L, K, 255\r
+ cmpwi K,129\r
+#endif \r
+ li T8,1\r
+ bne CMP1x8_128K\r
+ addi BO,BO,-8\r
+ addi AO,AO,-64 \r
+ LOAD1x8O 64,8 \r
+ END1x8_WITHOUT_ADD \r
+ LOAD1x8_2O 128, 16 \r
+ mtctr T8 \r
+ bl CGEMM_L1x8_K128 \r
+ b CGEMM_L1x8_SAVE \r
+ CMP1x8_128K:\r
+/*----------------------------------------*/ \r
+#if defined(TRMMKERNEL) \r
+ cmpwi T6,128\r
+#else \r
+ cmpwi K,128\r
+#endif \r
+ bne CGEMM_L1x8_SUB2 \r
+ MY_ALIGN \r
+ mtctr T8\r
+ addi BO,BO,-16\r
+ addi AO,AO,-128 \r
+ LOAD1x8_2O 128,16\r
+ bl CGEMM_L1x8_K128 \r
+ b CGEMM_L1x8_SAVE \r
+ MY_ALIGN\r
+\r
+\r
+CGEMM_L1x8_SUB2:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 64\r
+ ble CGEMM_L1x8_SUB2_32\r
+ bl CGEMM_1x8_L64_SUB\r
+ MY_ALIGN\r
+\r
+\r
+CGEMM_L1x8_SUB2_32:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 32\r
+ ble CGEMM_L1x8_SUB2_16 \r
+ bl CGEMM_1x8_L32_SUB\r
+ MY_ALIGN \r
+\r
+\r
+CGEMM_L1x8_SUB2_16:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 16\r
+ ble CGEMM_L1x8_SUB2_8\r
+ bl CGEMM_1x8_L16_SUB \r
+ MY_ALIGN \r
+\r
+\r
+CGEMM_L1x8_SUB2_8:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 8\r
+ ble CGEMM_L1x8_SUB2_4\r
+ LOAD1x8_2\r
+ KERNEL1x8_L2 128,16, 0,0\r
+ KERNEL1x8_L2 128,16, 1,0\r
+ KERNEL1x8_L2 128,16, 2,0\r
+ KERNEL1x8_E2 128,16, 3,1\r
+ MY_ALIGN \r
+\r
+\r
+CGEMM_L1x8_SUB2_4:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 4\r
+ ble CGEMM_L1x8_SUB2_2\r
+ LOAD1x8_2\r
+ KERNEL1x8_L2 128,16, 0,0\r
+ KERNEL1x8_E2 128,16, 1,1\r
+ MY_ALIGN\r
+\r
+\r
+CGEMM_L1x8_SUB2_2:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 2\r
+ ble CGEMM_L1x8_SUB2_1\r
+ LOAD1x8_2 \r
+ KERNEL1x8_E2 128,16, 0,1\r
+ MY_ALIGN \r
+\r
+\r
+CGEMM_L1x8_SUB2_1:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 1\r
+ ble CGEMM_L1x8_SAVE \r
+ KERNEL1x8\r
+\r
+ MY_ALIGN\r
+CGEMM_L1x8_SAVE:\r
+/*----------------------------------------*/ \r
+ addic. I, I, -1\r
+ MY_ALIGN\r
+ SAVE1x8\r
+#if defined(TRMMKERNEL) \r
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,1\r
+#endif \r
+ bgt CGEMM_L1x8_BEGIN\r
+ andi. T2, M, 7\r
+ ble CGEMM_L1x1_END\r
+ andi. T1, M, 4\r
+ ble CGEMM_L1x4_END\r
+ b CGEMM_L1x4_BEGIN\r
+ MY_ALIGN \r
+\r
+\r
+CGEMM_L1x8_END:\r
+/*----------------------------------------*/ \r
+\r
+\r
+CGEMM_L1x4_BEGIN:\r
+/*----------------------------------------*/ \r
+ andi. T2, M, 7\r
+ ble CGEMM_L1x1_END\r
+ andi. T1, M, 4\r
+ ble CGEMM_L1x4_END\r
+#if defined(TRMMKERNEL) \r
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1\r
+#else \r
+ mr BO, B \r
+#endif \r
+#if defined(TRMMKERNEL) \r
+ REFRESH_TEMP_BK T6,K,TEMP_REG,4,1\r
+ mr T1, T6 \r
+ addi T1,T1, -2 \r
+ srawi. T8, T1, 5 /* T8 = (T6-2) / 32 */\r
+#else \r
+ mr T1, K \r
+ addi T1,T1, -2\r
+ srawi. T8, T1, 5 /* T8 = (K-2) / 32 */\r
+#endif \r
+ ZERO1x4\r
+ ble CGEMM_L1x4_SUB0 \r
+ bl CGEMM_1x4_LMAIN_SUB\r
+ andi. L, T1, 31\r
+ ble CGEMM_L1x4_SAVE\r
+ b CGEMM_L1x4_SUB2\r
+\r
+\r
+CGEMM_L1x4_SUB0:\r
+/*----------------------------------------*/ \r
+#if defined(TRMMKERNEL) \r
+ andi. L, T6, 63\r
+ cmpwi T6,33\r
+#else \r
+ andi. L, K, 63\r
+ cmpwi K,33\r
+#endif \r
+ li T8,1\r
+ bne CMP1x4_32K\r
+ addi BO,BO,-8\r
+ addi AO,AO,-32 \r
+ LOAD1x4O 32,8 \r
+ END1x4_WITHOUT_ADD \r
+ LOAD1x4_2O 64, 16 \r
+ mtctr T8 \r
+ bl CGEMM_L1x4_K32 \r
+ b CGEMM_L1x4_SAVE \r
+ CMP1x4_32K:\r
+/*----------------------------------------*/ \r
+#if defined(TRMMKERNEL) \r
+ cmpwi T6,32\r
+#else \r
+ cmpwi K,32\r
+#endif \r
+ bne CGEMM_L1x4_SUB2 \r
+ MY_ALIGN \r
+ mtctr T8\r
+ addi BO,BO,-16\r
+ addi AO,AO,-64 \r
+ LOAD1x4_2O 64,16\r
+ bl CGEMM_L1x4_K32 \r
+ b CGEMM_L1x4_SAVE \r
+ MY_ALIGN \r
+\r
+\r
+CGEMM_L1x4_SUB2:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 16\r
+ ble CGEMM_L1x4_SUB2_8\r
+ bl CGEMM_1x4_L16_SUB \r
+ MY_ALIGN\r
+\r
+\r
+CGEMM_L1x4_SUB2_8:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 8\r
+ ble CGEMM_L1x4_SUB2_4\r
+ bl CGEMM_1x4_L8_SUB\r
+ MY_ALIGN \r
+\r
+\r
+CGEMM_L1x4_SUB2_4:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 4\r
+ ble CGEMM_L1x4_SUB2_2\r
+ LOAD1x4_2\r
+ KERNEL1x4_L2 64,16, 0,0\r
+ KERNEL1x4_E2 64,16, 1,1\r
+ MY_ALIGN\r
+\r
+\r
+CGEMM_L1x4_SUB2_2:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 2\r
+ ble CGEMM_L1x4_SUB2_1\r
+ LOAD1x4_2\r
+ KERNEL1x4_E2 64,16, 0,1\r
+ MY_ALIGN \r
+\r
+\r
+CGEMM_L1x4_SUB2_1:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 1\r
+ ble CGEMM_L1x4_SAVE \r
+ KERNEL1x4\r
+\r
+\r
+CGEMM_L1x4_SAVE:\r
+/*----------------------------------------*/ \r
+ SAVE1x4\r
+#if defined(TRMMKERNEL) \r
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,1\r
+#endif \r
+\r
+\r
+CGEMM_L1x4_END:\r
+/*----------------------------------------*/ \r
+\r
+\r
+CGEMM_L1x2_BEGIN:\r
+/*----------------------------------------*/ \r
+ andi. T1, M, 2\r
+ ble CGEMM_L1x2_END\r
+#if defined(TRMMKERNEL) \r
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1\r
+#else \r
+ mr BO, B \r
+#endif \r
+#if defined(TRMMKERNEL) \r
+ REFRESH_TEMP_BK T6,K,TEMP_REG,2,1\r
+ mr T1, T6 \r
+ addi T1,T1, -2 \r
+ srawi. T8, T1, 5 /* T8 = (T6-2) / 32 : count of unrolled 32x passes */\r
+#else \r
+ mr T1, K \r
+ addi T1,T1, -2\r
+ srawi. T8, T1, 5 /* T8 = (K-2) / 32 : count of unrolled 32x passes */\r
+#endif \r
+ ZERO1x2\r
+ ble CGEMM_L1x2_SUB0 \r
+ bl CGEMM_1x2_LMAIN_SUB\r
+ andi. L, T1, 31\r
+ ble CGEMM_L1x2_SAVE\r
+ b CGEMM_L1x2_SUB2\r
+\r
+\r
+CGEMM_L1x2_SUB0:\r
+/*----------------------------------------*/ \r
+#if defined(TRMMKERNEL) \r
+ andi. L, T6, 63\r
+ cmpwi T6,33\r
+#else \r
+ andi. L, K, 63\r
+ cmpwi K,33\r
+#endif \r
+ li T8,1\r
+ bne CMP1x2_32K\r
+ addi BO,BO,-8\r
+ addi AO,AO,-16 \r
+ LOAD1x2O 16,8 \r
+ END1x2_WITHOUT_ADD \r
+ LOAD1x2_2O 32, 16 \r
+ mtctr T8 \r
+ bl CGEMM_L1x2_K32 \r
+ b CGEMM_L1x2_SAVE \r
+ CMP1x2_32K:\r
+/*----------------------------------------*/ \r
+#if defined(TRMMKERNEL) \r
+ cmpwi T6,32\r
+#else \r
+ cmpwi K,32\r
+#endif \r
+ bne CGEMM_L1x2_SUB2 \r
+ MY_ALIGN \r
+ mtctr T8\r
+ addi BO,BO,-16\r
+ addi AO,AO,-32 \r
+ LOAD1x2_2O 32,16\r
+ bl CGEMM_L1x2_K32 \r
+ b CGEMM_L1x2_SAVE \r
+ MY_ALIGN \r
+\r
+\r
+CGEMM_L1x2_SUB2:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 16\r
+ ble CGEMM_L1x2_SUB2_8\r
+ bl CGEMM_1x2_L16_SUB \r
+ MY_ALIGN\r
+\r
+\r
+CGEMM_L1x2_SUB2_8:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 8\r
+ ble CGEMM_L1x2_SUB2_4\r
+ bl CGEMM_1x2_L8_SUB\r
+ MY_ALIGN \r
+\r
+\r
+CGEMM_L1x2_SUB2_4:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 4\r
+ ble CGEMM_L1x2_SUB2_2\r
+ LOAD1x2_2\r
+ KERNEL1x2_L2 32,16, 0,0\r
+ KERNEL1x2_E2 32,16, 1,1\r
+ MY_ALIGN\r
+\r
+\r
+CGEMM_L1x2_SUB2_2:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 2\r
+ ble CGEMM_L1x2_SUB2_1\r
+ LOAD1x2_2\r
+ KERNEL1x2_E2 32,16, 0,1\r
+ MY_ALIGN \r
+\r
+\r
+CGEMM_L1x2_SUB2_1:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 1\r
+ ble CGEMM_L1x2_SAVE \r
+ KERNEL1x2\r
+\r
+ MY_ALIGN\r
+CGEMM_L1x2_SAVE:\r
+/*----------------------------------------*/ \r
+ SAVE1x2\r
+#if defined(TRMMKERNEL) \r
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,1\r
+#endif \r
+\r
+\r
+CGEMM_L1x2_END:\r
+/*----------------------------------------*/ \r
+\r
+\r
+CGEMM_L1x1_BEGIN:\r
+/*----------------------------------------*/ \r
+ andi. T1, M, 1\r
+ ble CGEMM_L1x1_END\r
+#if defined(TRMMKERNEL) \r
+ REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1\r
+#else \r
+ mr BO, B \r
+#endif \r
+#if defined(TRMMKERNEL) \r
+ REFRESH_TEMP_BK T6,K,TEMP_REG,1,1\r
+ mr T1, T6 \r
+ addi T1,T1, -2 \r
+ srawi. T8, T1, 5 /* T8 = (T6-2) / 32 : count of unrolled 32x passes */\r
+#else \r
+ mr T1, K \r
+ addi T1,T1, -2\r
+ srawi. T8, T1, 5 /* T8 = (K-2) / 32 : count of unrolled 32x passes */\r
+#endif \r
+ ZERO1x1\r
+ ble CGEMM_L1x1_SUB0 \r
+ bl CGEMM_1x1_LMAIN_SUB\r
+ andi. L, T1, 31\r
+ ble CGEMM_L1x1_SAVE\r
+ b CGEMM_L1x1_SUB2\r
+\r
+\r
+CGEMM_L1x1_SUB0:\r
+/*----------------------------------------*/ \r
+#if defined(TRMMKERNEL) \r
+ andi. L, T6, 63\r
+ cmpwi T6,33\r
+#else \r
+ andi. L, K, 63\r
+ cmpwi K,33\r
+#endif \r
+ li T8,1\r
+ bne CMP1x1_32K\r
+ addi BO,BO,-8\r
+ addi AO,AO,-8 \r
+ LOAD1x1O 8,8 \r
+ END1x1_WITHOUT_ADD \r
+ LOAD1x1_2O 16, 16 \r
+ mtctr T8 \r
+ bl CGEMM_L1x1_K32 \r
+ b CGEMM_L1x1_SAVE \r
+ CMP1x1_32K:\r
+/*----------------------------------------*/ \r
+#if defined(TRMMKERNEL) \r
+ cmpwi T6,32\r
+#else \r
+ cmpwi K,32\r
+#endif \r
+ bne CGEMM_L1x1_SUB2 \r
+ MY_ALIGN \r
+ mtctr T8\r
+ addi BO,BO,-16\r
+ addi AO,AO,-16 \r
+ LOAD1x1_2O 16,16\r
+ bl CGEMM_L1x1_K32 \r
+ b CGEMM_L1x1_SAVE \r
+ MY_ALIGN \r
+\r
+\r
+CGEMM_L1x1_SUB2:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 16\r
+ ble CGEMM_L1x1_SUB2_8\r
+ bl CGEMM_1x1_L16_SUB \r
+ MY_ALIGN\r
+\r
+\r
+CGEMM_L1x1_SUB2_8:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 8\r
+ ble CGEMM_L1x1_SUB2_4\r
+ bl CGEMM_1x1_L8_SUB\r
+ MY_ALIGN \r
+\r
+\r
+CGEMM_L1x1_SUB2_4:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 4\r
+ ble CGEMM_L1x1_SUB2_2\r
+ LOAD1x1_2\r
+ KERNEL1x1_L2 16,16, 0,0\r
+ KERNEL1x1_E2 16,16, 1,1\r
+ MY_ALIGN\r
+\r
+\r
+CGEMM_L1x1_SUB2_2:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 2\r
+ ble CGEMM_L1x1_SUB2_1\r
+ LOAD1x1_2\r
+ KERNEL1x1_E2 16,16, 0,1\r
+ MY_ALIGN \r
+\r
+\r
+CGEMM_L1x1_SUB2_1:\r
+/*----------------------------------------*/ \r
+ andi. T1,L, 1\r
+ ble CGEMM_L1x1_SAVE \r
+ KERNEL1x1\r
+\r
+ MY_ALIGN\r
+CGEMM_L1x1_SAVE:\r
+/*----------------------------------------*/ \r
+ \r
+ SAVE1x1\r
+#if defined(TRMMKERNEL) \r
+ REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,1\r
+#endif \r
+\r
+\r
+CGEMM_L1x1_END:\r
+/*----------------------------------------*/ \r
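+/* advance B past the single processed column: K elements * 8 bytes each\r
+   (one single-precision complex) */\r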
+ slwi T1, K, 3\r
+\r
+ add B, B, T1\r
+#if defined(TRMMKERNEL) && !defined(LEFT) \r
+ addi TEMP_REG, TEMP_REG, 1\r
+#endif \r
+\r
+CGEMM_L1_END:\r
+\r
+\r
+\r
+\r
--- /dev/null
+\r
+/***************************************************************************\r
+Copyright (c) 2013-2019, The OpenBLAS Project\r
+All rights reserved.\r
+Redistribution and use in source and binary forms, with or without\r
+modification, are permitted provided that the following conditions are\r
+met:\r
+1. Redistributions of source code must retain the above copyright\r
+notice, this list of conditions and the following disclaimer.\r
+2. Redistributions in binary form must reproduce the above copyright\r
+notice, this list of conditions and the following disclaimer in\r
+the documentation and/or other materials provided with the\r
+distribution.\r
+3. Neither the name of the OpenBLAS project nor the names of\r
+its contributors may be used to endorse or promote products\r
+derived from this software without specific prior written permission.\r
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"\r
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\r
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE\r
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE\r
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\r
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\r
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\r
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\r
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE\r
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
+*****************************************************************************/\r
+\r
+/**************************************************************************************\r
+* Abdelrauf(quickwritereader@gmail.com)\r
+* BLASTEST : OK\r
+* CTEST : OK\r
+* TEST : OK\r
+* LAPACK-TEST : OK\r
+**************************************************************************************/\r
+#define unit_size 8\r
+#define DISP32(ind,disp) (ind*unit_size*32+disp)\r
+#define DISP16(ind,disp) (ind*unit_size*16+disp)\r
+#define DISP8(ind,disp) (ind*unit_size*8+disp)\r
+#define DISP4(ind,disp) (ind*unit_size*4+disp)\r
+#define DISP2(ind,disp) (ind*unit_size*2+disp)\r
+#define DISP1(ind,disp) (ind*unit_size+disp)\r
+#define DISPX(disp) (disp)\r
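+/* Displacement helpers: ind is the unrolled-iteration index, disp a byte offset\r
+   inside one iteration, and unit_size (8 bytes) is one single-precision complex\r
+   element. For example, DISP8(2,16) expands to 2*8*8+16 = 144. */\r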
+\r
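+/* Fold the plain and permuted partial sums into real/imaginary results; the\r
+   #if ladder below picks the add/subtract combination required by each\r
+   conjugation variant (NN/CN/NC/CC families) of the complex multiply. */\r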
+.macro AGGREGATE_REALS_IMAGES VSINR_OUT1,VSINR,VSINI_OUT2,VSINI\r
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT) \r
+ xvsubsp \VSINR_OUT1,\VSINR_OUT1,\VSINR\r
+ xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI \r
+#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) \r
+ xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR\r
+ xvsubsp \VSINI_OUT2,\VSINI_OUT2,\VSINI \r
+#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) \r
+ xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR\r
+ xvsubsp \VSINI_OUT2,\VSINI,\VSINI_OUT2 \r
+#else // CC || CR || RC || RR \r
+ /* {-alpha_r,-alpha_i} is assumed for alpha in this case */\r
+ /* i1*i2 - r1*r2, so alpha_r is effectively negated to fix the sign */\r
+ xvsubsp \VSINR_OUT1,\VSINR,\VSINR_OUT1\r
+ /* alpha_i is effectively negated to fix the sign */\r
+ xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI \r
+#endif\r
+.endm\r
+\r
+\r
+.macro AGGREGATE_REALS_IMAGES_A_PERMUTE VSINR_OUT1,VSINR,VSINI_OUT2,VSINI\r
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT) \r
+ xvsubsp \VSINR_OUT1,\VSINR_OUT1,\VSINR\r
+ xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI \r
+#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) \r
+ xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR\r
+ xvsubsp \VSINI_OUT2,\VSINI,\VSINI_OUT2 \r
+#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) \r
+ xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR\r
+ xvsubsp \VSINI_OUT2,\VSINI_OUT2,\VSINI \r
+#else // CC || CR || RC || RR \r
+ /* {-alpha_r,-alpha_i} is assumed for alpha in this case */\r
+ /* i1*i2 - r1*r2, so alpha_r is effectively negated to fix the sign */\r
+ xvsubsp \VSINR_OUT1,\VSINR,\VSINR_OUT1\r
+ /* alpha_i is effectively negated to fix the sign */\r
+ xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI \r
+#endif\r
+.endm\r
+ \r
+/* PART1: VSOUT1 = {i0,i1} * alpha_i (subtracted in PART2) ; VSOUT2 = {r0,r1} * alpha_i (added to in PART2) */\r
+\r
+.macro MULT_APLHA_PART1 VSINRR,VSINII,VSOUT1,VSOUT2\r
+ xvmulsp \VSOUT1,\VSINII, alpha_i \r
+ xvmulsp \VSOUT2,\VSINRR, alpha_i\r
+.endm\r
+\r
+/* PART2: VSOUT1 = {r0,r1} * alpha_r - VSOUT1 ; VSOUT2 = VSOUT2 + {i0,i1} * alpha_r */\r
+\r
+.macro MULT_APLHA_PART2 VSINRR,VSINII,VSOUT1,VSOUT2 \r
+ xvmsubasp \VSOUT1,\VSINRR, alpha_r\r
+ xvmaddasp \VSOUT2,\VSINII, alpha_r\r
+.endm\r
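+\r
+/* Taken together the two parts compute the standard complex scaling by alpha:\r
+   PART1: VSOUT1 = i*alpha_i            VSOUT2 = r*alpha_i\r
+   PART2: VSOUT1 = r*alpha_r - VSOUT1   VSOUT2 = VSOUT2 + i*alpha_r\r
+   i.e.   out_r  = r*alpha_r - i*alpha_i,   out_i = r*alpha_i + i*alpha_r */\r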
+\r
+/* macros for N=4 and M=8\r
+**********************************************************************************************/\r
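+/* The 4x8 tile keeps its partial products in vs32..vs63: half accumulate\r
+   against the B values as loaded (and their doubleword-rotated copies) and\r
+   half against real/imag-swapped copies, which SAVE4x8 later folds together.\r
+   Zero4x8 clears all of them before the K loop. */\r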
+\r
+.macro Zero4x8\r
+ xxlxor vs32, vs32, vs32\r
+ xxlxor vs33, vs33, vs33\r
+ xxlxor vs34, vs34, vs34\r
+ xxlxor vs35, vs35, vs35\r
+ xxlxor vs36, vs36, vs36\r
+ xxlxor vs37, vs37, vs37\r
+ xxlxor vs38, vs38, vs38\r
+ xxlxor vs39, vs39, vs39\r
+ xxlxor vs40, vs40, vs40\r
+ xxlxor vs41, vs41, vs41\r
+ xxlxor vs42, vs42, vs42\r
+ xxlxor vs43, vs43, vs43\r
+ xxlxor vs44, vs44, vs44\r
+ xxlxor vs45, vs45, vs45\r
+ xxlxor vs46, vs46, vs46\r
+ xxlxor vs47, vs47, vs47\r
+ xxlxor vs48, vs48, vs48\r
+ xxlxor vs49, vs49, vs49\r
+ xxlxor vs50, vs50, vs50\r
+ xxlxor vs51, vs51, vs51\r
+ xxlxor vs52, vs52, vs52\r
+ xxlxor vs53, vs53, vs53\r
+ xxlxor vs54, vs54, vs54\r
+ xxlxor vs55, vs55, vs55\r
+ xxlxor vs56, vs56, vs56\r
+ xxlxor vs57, vs57, vs57\r
+ xxlxor vs58, vs58, vs58\r
+ xxlxor vs59, vs59, vs59\r
+ xxlxor vs60, vs60, vs60\r
+ xxlxor vs61, vs61, vs61\r
+ xxlxor vs62, vs62, vs62\r
+ xxlxor vs63, vs63, vs63\r
+.endm\r
+\r
+\r
+.macro LOAD4x8 \r
+ LOAD4x8O 0,0 \r
+.endm\r
+\r
+\r
+.macro LOAD4x8O OffsetA,OffsetB\r
+ lxv vs24, (\OffsetB+0)(BO)\r
+ lxv vs28, (\OffsetB+16)(BO)\r
+ xxperm vs26, vs24, permute_mask\r
+ xxperm vs30, vs28, permute_mask \r
+ lxv vs0, (\OffsetA+0)(AO)\r
+ lxv vs1, (\OffsetA+16)(AO)\r
+ xxpermdi vs25, vs24, vs24,2 \r
+ xxpermdi vs29, vs28, vs28,2 \r
+ lxv vs2, (\OffsetA+32)(AO)\r
+ lxv vs3, (\OffsetA+48)(AO) \r
+ xxpermdi vs27, vs26, vs26,2 \r
+ xxpermdi vs31, vs30, vs30,2 \r
+.endm\r
+\r
+\r
+.macro END4x8_NORMAL\r
+ END4x8 AO,BO,64,32\r
+.endm\r
+\r
+\r
+.macro END4x8_WITHOUT_ADD\r
+ END4x8 AO,BO,0,0\r
+.endm\r
+\r
+\r
+.macro END4x8 AREG, BREG, OffsetA, OffsetB\r
+.if \OffsetB != 0\r
+ addi \BREG, \BREG, \OffsetB\r
+.endif\r
+\r
+.if \OffsetA != 0\r
+ addi \AREG, \AREG, \OffsetA\r
+.endif\r
+\r
+ xvmaddasp vs32, vs0,vs24\r
+ xvmaddasp vs33, vs1,vs24\r
+ xvmaddasp vs34, vs2,vs24 \r
+ xvmaddasp vs35, vs3,vs24 \r
+ xvmaddasp vs36, vs0,vs25\r
+ xvmaddasp vs37, vs1,vs25\r
+ xvmaddasp vs38, vs2,vs25 \r
+ xvmaddasp vs39, vs3,vs25 \r
+ xvmaddasp vs40, vs0,vs26\r
+ xvmaddasp vs41, vs1,vs26\r
+ xvmaddasp vs42, vs2,vs26 \r
+ xvmaddasp vs43, vs3,vs26\r
+ xvmaddasp vs44, vs0,vs27\r
+ xvmaddasp vs45, vs1,vs27\r
+ xvmaddasp vs46, vs2,vs27 \r
+ xvmaddasp vs47, vs3,vs27\r
+ xvmaddasp vs48, vs0,vs28\r
+ xvmaddasp vs49, vs1,vs28\r
+ xvmaddasp vs50, vs2,vs28 \r
+ xvmaddasp vs51, vs3,vs28 \r
+ xvmaddasp vs52, vs0,vs29\r
+ xvmaddasp vs53, vs1,vs29\r
+ xvmaddasp vs54, vs2,vs29 \r
+ xvmaddasp vs55, vs3,vs29\r
+ xvmaddasp vs56, vs0,vs30\r
+ xvmaddasp vs57, vs1,vs30\r
+ xvmaddasp vs58, vs2,vs30 \r
+ xvmaddasp vs59, vs3,vs30\r
+ xvmaddasp vs60, vs0,vs31\r
+ xvmaddasp vs61, vs1,vs31\r
+ xvmaddasp vs62, vs2,vs31 \r
+ xvmaddasp vs63, vs3,vs31 \r
+.endm\r
+\r
+\r
+.macro LOAD4x8_2\r
+ LOAD4x8_2O 0,0\r
+.endm\r
+ \r
+\r
+.macro LOAD4x8_2O OffsetA,OffsetB\r
+ lxv vs8, (\OffsetB)(BO)\r
+ lxv vs12, (16+\OffsetB)(BO)\r
+ lxv vs24, (32+\OffsetB)(BO)\r
+ lxv vs28, (32+16+\OffsetB)(BO)\r
+ lxv vs4, (0+\OffsetA)(AO)\r
+ lxv vs5, (16+\OffsetA)(AO)\r
+ xxperm vs10, vs8, permute_mask\r
+ xxperm vs14, vs12, permute_mask \r
+ lxv vs6, (32+\OffsetA)(AO)\r
+ lxv vs7, (48+\OffsetA)(AO) \r
+ xxpermdi vs9, vs8, vs8,2 \r
+ xxpermdi vs13, vs12, vs12,2 \r
+ lxv vs0, (64+\OffsetA)(AO)\r
+ lxv vs1, (64+16+\OffsetA)(AO) \r
+ xxpermdi vs11, vs10, vs10,2 \r
+ xxpermdi vs15, vs14, vs14,2 \r
+ lxv vs2, (64+32+\OffsetA)(AO)\r
+ lxv vs3, (64+48+\OffsetA)(AO)\r
+ xxperm vs26, vs24, permute_mask\r
+ xxperm vs30, vs28, permute_mask \r
+ xxpermdi vs25, vs24, vs24,2 \r
+ xxpermdi vs29, vs28, vs28,2 \r
+ xxpermdi vs27, vs26, vs26,2 \r
+ xxpermdi vs31, vs30, vs30,2 \r
+.endm\r
+ \r
+\r
+.macro END4x8_2 \r
+ /*for load2 offset will be 128 and 64*/\r
+ KERNEL4x8_2 AO,BO, 128,64,0 ,1,1 \r
+.endm\r
+\r
+\r
+.macro KERNEL4x8_E2 OffsetA,OffsetB, Index,IsLast \r
+ KERNEL4x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 \r
+.endm\r
+\r
+\r
+.macro KERNEL4x8_L2 OffsetA,OffsetB, Index,IsLast\r
+ KERNEL4x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 \r
+.endm\r
+\r
+\r
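+/* KERNEL4x8_2 handles two K iterations per invocation: the vs4..vs15 set feeds\r
+   the first iteration while vs0..vs3 / vs24..vs31 hold the second, and the\r
+   loads guarded by Complete==0 prefetch the following pair. Complete=1 (the\r
+   _E2 form) skips those loads for the loop tail; IsLast=1 advances AO and BO. */\r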
+.macro KERNEL4x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete\r
+ xvmaddasp vs32, vs4,vs8\r
+ xvmaddasp vs33, vs5,vs8\r
+ xvmaddasp vs48, vs4,vs12\r
+ xvmaddasp vs49, vs5,vs12\r
+ xvmaddasp vs40, vs4,vs10\r
+ xvmaddasp vs41, vs5,vs10\r
+ xvmaddasp vs56, vs4,vs14\r
+ xvmaddasp vs57, vs5,vs14\r
+ xvmaddasp vs36, vs4,vs9\r
+ xvmaddasp vs37, vs5,vs9\r
+ xvmaddasp vs52, vs4,vs13\r
+ xvmaddasp vs53, vs5,vs13\r
+ xvmaddasp vs44, vs4,vs11\r
+ xvmaddasp vs45, vs5,vs11\r
+ xvmaddasp vs60, vs4,vs15\r
+ xvmaddasp vs61, vs5,vs15\r
+.if \Complete==0 \r
+ lxv vs4, DISP16(\Index,0+\OffsetA)(\AREG)\r
+ lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG)\r
+.endif\r
+\r
+ xvmaddasp vs34, vs6,vs8 \r
+ xvmaddasp vs35, vs7,vs8 \r
+ xvmaddasp vs50, vs6,vs12\r
+ xvmaddasp vs51, vs7,vs12\r
+.if \Complete==0 \r
+ lxv vs8, DISP8(\Index,\OffsetB)(\BREG)\r
+ lxv vs12, DISP8(\Index,16+\OffsetB)(\BREG)\r
+.endif \r
+ xvmaddasp vs42, vs6,vs10\r
+ xvmaddasp vs43, vs7,vs10\r
+ xvmaddasp vs58, vs6,vs14\r
+ xvmaddasp vs59, vs7,vs14\r
+.if \Complete==0 \r
+ xxperm vs10, vs8, permute_mask\r
+ xxperm vs14, vs12, permute_mask \r
+.endif \r
+ xvmaddasp vs38, vs6,vs9 \r
+ xvmaddasp vs39, vs7,vs9 \r
+ xvmaddasp vs54, vs6,vs13\r
+ xvmaddasp vs55, vs7,vs13\r
+.if \Complete==0\r
+ xxpermdi vs9, vs8, vs8,2 \r
+ xxpermdi vs13, vs12, vs12,2 \r
+.endif \r
+ xvmaddasp vs46, vs6,vs11\r
+ xvmaddasp vs47, vs7,vs11\r
+ xvmaddasp vs62, vs6,vs15\r
+ xvmaddasp vs63, vs7,vs15\r
+.if \Complete==0\r
+ xxpermdi vs11, vs10, vs10,2 \r
+ xxpermdi vs15, vs14, vs14,2 \r
+.endif \r
+.if \Complete==0\r
+ lxv vs6, DISP16(\Index,32+\OffsetA)(\AREG)\r
+ lxv vs7, DISP16(\Index,48+\OffsetA)(\AREG) \r
+.endif \r
+ xvmaddasp vs32, vs0,vs24\r
+ xvmaddasp vs33, vs1,vs24\r
+ xvmaddasp vs48, vs0,vs28\r
+ xvmaddasp vs49, vs1,vs28\r
+ xvmaddasp vs40, vs0,vs26\r
+ xvmaddasp vs41, vs1,vs26\r
+ xvmaddasp vs56, vs0,vs30\r
+ xvmaddasp vs57, vs1,vs30\r
+ xvmaddasp vs36, vs0,vs25\r
+ xvmaddasp vs37, vs1,vs25\r
+ xvmaddasp vs52, vs0,vs29\r
+ xvmaddasp vs53, vs1,vs29\r
+ xvmaddasp vs44, vs0,vs27\r
+ xvmaddasp vs45, vs1,vs27\r
+ xvmaddasp vs60, vs0,vs31\r
+ xvmaddasp vs61, vs1,vs31 \r
+.if \Complete==0\r
+ lxv vs0, DISP16(\Index,64+\OffsetA)(\AREG)\r
+ lxv vs1, DISP16(\Index,64+16+\OffsetA)(\AREG) \r
+.endif\r
+\r
+ xvmaddasp vs34, vs2,vs24\r
+ xvmaddasp vs35, vs3,vs24 \r
+ xvmaddasp vs50, vs2,vs28\r
+ xvmaddasp vs51, vs3,vs28\r
+.if \Complete==0\r
+ lxv vs24, DISP8(\Index,32+\OffsetB)(\BREG)\r
+ lxv vs28, DISP8(\Index,32+16+\OffsetB)(\BREG)\r
+.endif \r
+ xvmaddasp vs42, vs2,vs26\r
+ xvmaddasp vs43, vs3,vs26\r
+ xvmaddasp vs58, vs2,vs30\r
+ xvmaddasp vs59, vs3,vs30\r
+.if \Complete==0\r
+ xxperm vs26, vs24, permute_mask\r
+ xxperm vs30, vs28, permute_mask \r
+.endif \r
+ xvmaddasp vs38, vs2,vs25\r
+ xvmaddasp vs39, vs3,vs25\r
+ xvmaddasp vs54, vs2,vs29\r
+ xvmaddasp vs55, vs3,vs29\r
+.if \Complete==0\r
+ xxpermdi vs25, vs24, vs24,2 \r
+ xxpermdi vs29, vs28, vs28,2 \r
+.endif \r
+ xvmaddasp vs46, vs2,vs27\r
+ xvmaddasp vs47, vs3,vs27\r
+ xvmaddasp vs62, vs2,vs31 \r
+ xvmaddasp vs63, vs3,vs31\r
+.if \Complete==0\r
+ xxpermdi vs27, vs26, vs26,2 \r
+ xxpermdi vs31, vs30, vs30,2 \r
+.endif\r
+\r
+.if \Complete==0\r
+ lxv vs2, DISP16(\Index,64+32+\OffsetA)(\AREG)\r
+ lxv vs3, DISP16(\Index,64+48+\OffsetA)(\AREG)\r
+.endif\r
+\r
+.if \IsLast==1 \r
+.if \Complete==1\r
+ addi \BREG, \BREG, DISP8(\Index,\OffsetB)\r
+ addi \AREG, \AREG, DISP16(\Index,\OffsetA) \r
+.else\r
+ addi \BREG, \BREG, DISP8(\Index,64)\r
+ addi \AREG, \AREG, DISP16(\Index,128) \r
+.endif\r
+\r
+.endif \r
+.endm\r
+\r
+\r
+.macro KERNEL4x8\r
+ LOAD4x8\r
+ END4x8 AO, BO, 64,32\r
+.endm\r
+\r
+\r
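+/* SAVE4x8 store path: permute the accumulators to line up real and imaginary\r
+   parts, fold them with AGGREGATE_REALS_IMAGES according to the conjugation\r
+   variant, scale by alpha with MULT_APLHA_PART1/2, rebuild the interleaved\r
+   r,i pairs, then add into the existing C tile (non-TRMM) or store directly\r
+   (TRMM). CO and T1..T3 address the four C columns. */\r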
+.macro SAVE4x8\r
+ add T4, LDC,LDC\r
+ add T1, CO ,LDC \r
+#ifndef TRMMKERNEL \r
+ lxv vs24 , 0(CO)\r
+ lxv vs25 , 16(CO)\r
+#endif\r
+ xxperm vs0,vs32,permute_mask\r
+ xxperm vs4,vs40,permute_mask\r
+#ifndef TRMMKERNEL \r
+ lxv vs26 , 32(CO)\r
+ lxv vs27 , 48(CO)\r
+#endif \r
+ xxperm vs1,vs33,permute_mask\r
+ xxperm vs5,vs41,permute_mask\r
+#ifndef TRMMKERNEL \r
+ lxv vs28 , 0(T1)\r
+ lxv vs29 , 16(T1)\r
+#endif \r
+ xxperm vs2,vs34,permute_mask\r
+ xxperm vs6,vs42,permute_mask\r
+#ifndef TRMMKERNEL \r
+ lxv vs30 , 32(T1)\r
+ lxv vs31 , 48(T1)\r
+#endif \r
+ xxperm vs3,vs35,permute_mask\r
+ xxperm vs7,vs43,permute_mask \r
+ add T2,CO,T4\r
+ add T3,T1,T4 \r
+ AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4\r
+ xxperm vs8,vs36,permute_mask\r
+ xxperm vs12,vs44,permute_mask\r
+ AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5\r
+ xxperm vs9,vs37,permute_mask\r
+ xxperm vs13,vs45,permute_mask\r
+ AGGREGATE_REALS_IMAGES vs34,vs2,vs42,vs6\r
+ xxperm vs10,vs38,permute_mask\r
+ xxperm vs14,vs46,permute_mask\r
+ AGGREGATE_REALS_IMAGES vs35,vs3,vs43,vs7 \r
+ xxperm vs11,vs39,permute_mask\r
+ xxperm vs15,vs47,permute_mask \r
+ AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12\r
+ xxperm vs0,vs48,permute_mask\r
+ xxperm vs4,vs56,permute_mask\r
+ AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13\r
+ xxperm vs1,vs49,permute_mask\r
+ xxperm vs5,vs57,permute_mask\r
+ AGGREGATE_REALS_IMAGES vs38,vs10,vs46,vs14\r
+ xxperm vs2,vs50,permute_mask\r
+ xxperm vs6,vs58,permute_mask\r
+ AGGREGATE_REALS_IMAGES vs39,vs11,vs47,vs15 \r
+ xxperm vs3,vs51,permute_mask\r
+ xxperm vs7,vs59,permute_mask \r
+ AGGREGATE_REALS_IMAGES vs48,vs0,vs56,vs4\r
+ xxperm vs8,vs52,permute_mask\r
+ xxperm vs12,vs60,permute_mask\r
+ AGGREGATE_REALS_IMAGES vs49,vs1,vs57,vs5\r
+ xxperm vs9,vs53,permute_mask\r
+ xxperm vs13,vs61,permute_mask\r
+ AGGREGATE_REALS_IMAGES vs50,vs2,vs58,vs6\r
+ xxperm vs10,vs54,permute_mask\r
+ xxperm vs14,vs62,permute_mask\r
+ AGGREGATE_REALS_IMAGES vs51,vs3,vs59,vs7 \r
+ xxperm vs11,vs55,permute_mask\r
+ xxperm vs15,vs63,permute_mask \r
+ AGGREGATE_REALS_IMAGES vs52,vs8,vs60,vs12\r
+ AGGREGATE_REALS_IMAGES vs53,vs9,vs61,vs13\r
+ /*VSINRR,VSINII,VSOUT1,VSOUT2*/\r
+ MULT_APLHA_PART1 vs32,vs40,vs0,vs1\r
+ AGGREGATE_REALS_IMAGES vs54,vs10,vs62,vs14\r
+ MULT_APLHA_PART1 vs33,vs41,vs2,vs3 \r
+ AGGREGATE_REALS_IMAGES vs55,vs11,vs63,vs15 \r
+ MULT_APLHA_PART1 vs34,vs42,vs4,vs5\r
+ MULT_APLHA_PART1 vs35,vs43,vs6,vs7 \r
+ MULT_APLHA_PART2 vs32,vs40,vs0,vs1 \r
+ MULT_APLHA_PART2 vs33,vs41,vs2,vs3 \r
+ MULT_APLHA_PART2 vs34,vs42,vs4,vs5\r
+ MULT_APLHA_PART2 vs35,vs43,vs6,vs7 \r
+#ifndef TRMMKERNEL \r
+ lxv vs32 , 0(T2)\r
+ lxv vs40 , 16(T2)\r
+#endif \r
+ MULT_APLHA_PART1 vs36,vs44,vs8,vs9\r
+ MULT_APLHA_PART1 vs37,vs45,vs10,vs11\r
+#ifndef TRMMKERNEL \r
+ lxv vs33 , 32(T2)\r
+ lxv vs41 , 48(T2)\r
+#endif \r
+ MULT_APLHA_PART1 vs38,vs46,vs12,vs13\r
+ MULT_APLHA_PART1 vs39,vs47,vs14,vs15\r
+#ifndef TRMMKERNEL \r
+ lxv vs34 , 0(T3)\r
+ lxv vs42 , 16(T3)\r
+#endif \r
+ MULT_APLHA_PART2 vs36,vs44,vs8,vs9\r
+ MULT_APLHA_PART2 vs37,vs45,vs10,vs11\r
+#ifndef TRMMKERNEL \r
+ lxv vs35 , 32(T3)\r
+ lxv vs43 , 48(T3)\r
+#endif \r
+ MULT_APLHA_PART2 vs38,vs46,vs12,vs13\r
+ MULT_APLHA_PART2 vs39,vs47,vs14,vs15\r
+/* reconstruct r,i pairs*/\r
+ xxperm vs0,vs1, save_permute_1\r
+ xxperm vs2,vs3, save_permute_1\r
+ xxperm vs4,vs5, save_permute_1\r
+ xxperm vs6,vs7, save_permute_1\r
+ xxperm vs8,vs9, save_permute_1\r
+ xxperm vs10,vs11, save_permute_1\r
+ xxperm vs12,vs13, save_permute_1\r
+ xxperm vs14,vs15, save_permute_1\r
+#ifndef TRMMKERNEL\r
+ /* add */\r
+ xxpermdi vs1,vs8,vs0,2\r
+ xxpermdi vs3,vs10,vs2,2\r
+ xxpermdi vs5,vs12,vs4,2\r
+ xxpermdi vs7,vs14,vs6,2\r
+ xxpermdi vs9,vs0,vs8,2\r
+ xxpermdi vs11,vs2,vs10,2 \r
+ xvaddsp vs24,vs24,vs1\r
+ xvaddsp vs25,vs25,vs3\r
+ xxpermdi vs13,vs4,vs12,2 \r
+ xxpermdi vs15,vs6,vs14,2\r
+ xvaddsp vs26,vs26,vs5\r
+ xvaddsp vs27,vs27,vs7\r
+ xvaddsp vs28,vs28,vs9\r
+ xvaddsp vs29,vs29,vs11 \r
+ xvaddsp vs30,vs30,vs13\r
+ xvaddsp vs31,vs31,vs15 \r
+#else\r
+ xxpermdi vs24,vs8,vs0,2\r
+ xxpermdi vs25,vs10,vs2,2\r
+ xxpermdi vs26,vs12,vs4,2\r
+ xxpermdi vs27,vs14,vs6,2 \r
+ xxpermdi vs28,vs0,vs8,2\r
+ xxpermdi vs29,vs2,vs10,2 \r
+ xxpermdi vs30,vs4,vs12,2 \r
+ xxpermdi vs31,vs6,vs14,2\r
+#endif\r
+ stxv vs24 , 0(CO)\r
+ stxv vs25 , 16(CO)\r
+ MULT_APLHA_PART1 vs48,vs56,vs0,vs1\r
+ MULT_APLHA_PART1 vs49,vs57,vs2,vs3\r
+ stxv vs26 , 32(CO)\r
+ stxv vs27 , 48(CO)\r
+ MULT_APLHA_PART1 vs50,vs58,vs4,vs5\r
+ MULT_APLHA_PART1 vs51,vs59,vs6,vs7\r
+ stxv vs28 , 0(T1)\r
+ stxv vs29 , 16(T1)\r
+ MULT_APLHA_PART2 vs48,vs56,vs0,vs1\r
+ MULT_APLHA_PART2 vs49,vs57,vs2,vs3\r
+ stxv vs30 , 32(T1)\r
+ stxv vs31 , 48(T1) \r
+ MULT_APLHA_PART2 vs50,vs58,vs4,vs5\r
+ MULT_APLHA_PART2 vs51,vs59,vs6,vs7\r
+ MULT_APLHA_PART1 vs52,vs60,vs8,vs9\r
+ MULT_APLHA_PART1 vs53,vs61,vs10,vs11\r
+ xxperm vs0,vs1, save_permute_1\r
+ xxperm vs2,vs3, save_permute_1\r
+ MULT_APLHA_PART1 vs54,vs62,vs12,vs13\r
+ MULT_APLHA_PART1 vs55,vs63,vs14,vs15\r
+ xxperm vs4,vs5, save_permute_1\r
+ xxperm vs6,vs7, save_permute_1\r
+ MULT_APLHA_PART2 vs52,vs60,vs8,vs9\r
+ MULT_APLHA_PART2 vs53,vs61,vs10,vs11\r
+ xxperm vs8,vs9, save_permute_1\r
+ xxperm vs10,vs11, save_permute_1\r
+ MULT_APLHA_PART2 vs54,vs62,vs12,vs13\r
+ MULT_APLHA_PART2 vs55,vs63,vs14,vs15\r
+ xxperm vs12,vs13, save_permute_1\r
+ xxperm vs14,vs15, save_permute_1\r
+#ifndef TRMMKERNEL\r
+ /* add */\r
+ xxpermdi vs1,vs8,vs0,2\r
+ xxpermdi vs3,vs10,vs2,2\r
+ xxpermdi vs5,vs12,vs4,2\r
+ xxpermdi vs7,vs14,vs6,2\r
+ xxpermdi vs9,vs0,vs8,2\r
+ xxpermdi vs11,vs2,vs10,2 \r
+ xvaddsp vs32,vs32,vs1\r
+ xvaddsp vs40,vs40,vs3\r
+ xxpermdi vs13,vs4,vs12,2 \r
+ xxpermdi vs15,vs6,vs14,2\r
+ xvaddsp vs33,vs33,vs5\r
+ xvaddsp vs41,vs41,vs7\r
+ xvaddsp vs34,vs34,vs9\r
+ xvaddsp vs42,vs42,vs11 \r
+ xvaddsp vs35,vs35,vs13\r
+ xvaddsp vs43,vs43,vs15 \r
+#else\r
+ xxpermdi vs32,vs8,vs0,2\r
+ xxpermdi vs40,vs10,vs2,2\r
+ xxpermdi vs33,vs12,vs4,2\r
+ xxpermdi vs41,vs14,vs6,2 \r
+ xxpermdi vs34,vs0,vs8,2\r
+ xxpermdi vs42,vs2,vs10,2 \r
+ xxpermdi vs35,vs4,vs12,2 \r
+ xxpermdi vs43,vs6,vs14,2\r
+#endif\r
+ stxv vs32 , 0(T2)\r
+ stxv vs40 , 16(T2)\r
+ stxv vs33 , 32(T2)\r
+ stxv vs41 , 48(T2)\r
+ stxv vs34 , 0(T3)\r
+ stxv vs42 , 16(T3)\r
+ stxv vs35 , 32(T3)\r
+ stxv vs43 , 48(T3) \r
+ addi CO, CO, 64\r
+.endm\r
+\r
+/* macros for N=4 and M=4\r
+**********************************************************************************************/\r
+\r
+.macro Zero4x4\r
+ xxlxor vs32, vs32, vs32\r
+ xxlxor vs33, vs33, vs33\r
+ xxlxor vs36, vs36, vs36\r
+ xxlxor vs37, vs37, vs37\r
+ xxlxor vs40, vs40, vs40\r
+ xxlxor vs41, vs41, vs41\r
+ xxlxor vs44, vs44, vs44\r
+ xxlxor vs45, vs45, vs45\r
+ xxlxor vs48, vs48, vs48\r
+ xxlxor vs49, vs49, vs49\r
+ xxlxor vs52, vs52, vs52\r
+ xxlxor vs53, vs53, vs53\r
+ xxlxor vs56, vs56, vs56\r
+ xxlxor vs57, vs57, vs57\r
+ xxlxor vs60, vs60, vs60\r
+ xxlxor vs61, vs61, vs61\r
+.endm\r
+\r
+\r
+.macro LOAD4x4 \r
+ LOAD4x4O 0,0 \r
+.endm\r
+\r
+\r
+.macro LOAD4x4O OffsetA,OffsetB\r
+ lxv vs24, (\OffsetB+0)(BO)\r
+ lxv vs28, (\OffsetB+16)(BO)\r
+ xxperm vs26, vs24, permute_mask\r
+ xxperm vs30, vs28, permute_mask \r
+ lxv vs0, (\OffsetA+0)(AO)\r
+ lxv vs1, (\OffsetA+16)(AO)\r
+ xxpermdi vs25, vs24, vs24,2 \r
+ xxpermdi vs29, vs28, vs28,2 \r
+ xxpermdi vs27, vs26, vs26,2 \r
+ xxpermdi vs31, vs30, vs30,2 \r
+.endm\r
+\r
+\r
+.macro END4x4_NORMAL\r
+ END4x4 AO,BO,32,32\r
+.endm\r
+\r
+\r
+.macro END4x4_WITHOUT_ADD\r
+ END4x4 AO,BO,0,0\r
+.endm\r
+\r
+\r
+.macro END4x4 AREG, BREG, OffsetA, OffsetB\r
+.if \OffsetB != 0\r
+ addi \BREG, \BREG, \OffsetB\r
+.endif\r
+\r
+.if \OffsetA != 0\r
+ addi \AREG, \AREG, \OffsetA\r
+.endif\r
+\r
+ xvmaddasp vs32, vs0,vs24\r
+ xvmaddasp vs33, vs1,vs24\r
+ xvmaddasp vs36, vs0,vs25\r
+ xvmaddasp vs37, vs1,vs25\r
+ xvmaddasp vs40, vs0,vs26\r
+ xvmaddasp vs41, vs1,vs26\r
+ xvmaddasp vs44, vs0,vs27\r
+ xvmaddasp vs45, vs1,vs27\r
+ xvmaddasp vs48, vs0,vs28\r
+ xvmaddasp vs49, vs1,vs28\r
+ xvmaddasp vs52, vs0,vs29\r
+ xvmaddasp vs53, vs1,vs29\r
+ xvmaddasp vs56, vs0,vs30\r
+ xvmaddasp vs57, vs1,vs30\r
+ xvmaddasp vs60, vs0,vs31\r
+ xvmaddasp vs61, vs1,vs31\r
+.endm\r
+\r
+\r
+.macro LOAD4x4_2\r
+ LOAD4x4_2O 0,0\r
+.endm\r
+ \r
+\r
+.macro LOAD4x4_2O OffsetA,OffsetB\r
+ lxv vs8, (\OffsetB)(BO)\r
+ lxv vs12, (16+\OffsetB)(BO)\r
+ lxv vs24, (32+\OffsetB)(BO)\r
+ lxv vs28, (32+16+\OffsetB)(BO)\r
+ lxv vs4, (0+\OffsetA)(AO)\r
+ lxv vs5, (16+\OffsetA)(AO)\r
+ xxperm vs10, vs8, permute_mask\r
+ xxperm vs14, vs12, permute_mask \r
+ xxpermdi vs9, vs8, vs8,2 \r
+ xxpermdi vs13, vs12, vs12,2 \r
+ lxv vs0, (32+\OffsetA)(AO)\r
+ lxv vs1, (32+16+\OffsetA)(AO) \r
+ xxpermdi vs11, vs10, vs10,2 \r
+ xxpermdi vs15, vs14, vs14,2 \r
+ xxperm vs26, vs24, permute_mask\r
+ xxperm vs30, vs28, permute_mask \r
+ xxpermdi vs25, vs24, vs24,2 \r
+ xxpermdi vs29, vs28, vs28,2 \r
+ xxpermdi vs27, vs26, vs26,2 \r
+ xxpermdi vs31, vs30, vs30,2 \r
+.endm\r
+\r
+\r
+.macro END4x4_2 \r
+ /*for load2 offset will be 64 and 64*/\r
+ KERNEL4x4_2 AO,BO, 64,64,0 ,1,1 \r
+.endm\r
+\r
+\r
+.macro KERNEL4x4_E2 OffsetA,OffsetB, Index,IsLast \r
+ KERNEL4x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 \r
+.endm\r
+\r
+\r
+.macro KERNEL4x4_L2 OffsetA,OffsetB, Index,IsLast\r
+ KERNEL4x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 \r
+.endm\r
+\r
+\r
+.macro KERNEL4x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete\r
+ xvmaddasp vs32, vs4,vs8\r
+ xvmaddasp vs33, vs5,vs8\r
+ xvmaddasp vs48, vs4,vs12\r
+ xvmaddasp vs49, vs5,vs12\r
+ xvmaddasp vs40, vs4,vs10\r
+ xvmaddasp vs41, vs5,vs10\r
+ xvmaddasp vs56, vs4,vs14\r
+ xvmaddasp vs57, vs5,vs14\r
+.if \Complete==0 \r
+ lxv vs8, DISP8(\Index,\OffsetB)(\BREG)\r
+ lxv vs12, DISP8(\Index,16+\OffsetB)(\BREG)\r
+.endif \r
+ xvmaddasp vs36, vs4,vs9\r
+ xvmaddasp vs37, vs5,vs9\r
+ xvmaddasp vs52, vs4,vs13\r
+ xvmaddasp vs53, vs5,vs13\r
+.if \Complete==0 \r
+ xxperm vs10, vs8, permute_mask\r
+ xxperm vs14, vs12, permute_mask \r
+.endif \r
+ xvmaddasp vs44, vs4,vs11\r
+ xvmaddasp vs45, vs5,vs11\r
+ xvmaddasp vs60, vs4,vs15\r
+ xvmaddasp vs61, vs5,vs15\r
+.if \Complete==0\r
+ xxpermdi vs9, vs8, vs8,2 \r
+ xxpermdi vs13, vs12, vs12,2 \r
+.endif \r
+.if \Complete==0 \r
+ lxv vs4, DISP8(\Index,0+\OffsetA)(\AREG)\r
+ lxv vs5, DISP8(\Index,16+\OffsetA)(\AREG)\r
+.endif\r
+\r
+.if \Complete==0\r
+ xxpermdi vs11, vs10, vs10,2 \r
+ xxpermdi vs15, vs14, vs14,2 \r
+.endif \r
+ xvmaddasp vs32, vs0,vs24\r
+ xvmaddasp vs33, vs1,vs24\r
+ xvmaddasp vs48, vs0,vs28\r
+ xvmaddasp vs49, vs1,vs28\r
+ xvmaddasp vs40, vs0,vs26\r
+ xvmaddasp vs41, vs1,vs26\r
+ xvmaddasp vs56, vs0,vs30\r
+ xvmaddasp vs57, vs1,vs30\r
+.if \Complete==0\r
+ lxv vs24, DISP8(\Index,32+\OffsetB)(\BREG)\r
+ lxv vs28, DISP8(\Index,32+16+\OffsetB)(\BREG)\r
+.endif \r
+ xvmaddasp vs36, vs0,vs25\r
+ xvmaddasp vs37, vs1,vs25\r
+ xvmaddasp vs52, vs0,vs29\r
+ xvmaddasp vs53, vs1,vs29\r
+.if \Complete==0\r
+ xxperm vs26, vs24, permute_mask\r
+ xxperm vs30, vs28, permute_mask \r
+.endif \r
+ xvmaddasp vs44, vs0,vs27\r
+ xvmaddasp vs45, vs1,vs27\r
+ xvmaddasp vs60, vs0,vs31\r
+ xvmaddasp vs61, vs1,vs31 \r
+.if \Complete==0\r
+ xxpermdi vs25, vs24, vs24,2 \r
+ xxpermdi vs29, vs28, vs28,2 \r
+.endif \r
+.if \Complete==0\r
+ lxv vs0, DISP8(\Index,32+\OffsetA)(\AREG)\r
+ lxv vs1, DISP8(\Index,32+16+\OffsetA)(\AREG) \r
+.endif\r
+\r
+.if \Complete==0\r
+ xxpermdi vs27, vs26, vs26,2 \r
+ xxpermdi vs31, vs30, vs30,2 \r
+.endif\r
+\r
+.if \IsLast==1 \r
+.if \Complete==1\r
+ addi \BREG, \BREG, DISP8(\Index,\OffsetB)\r
+ addi \AREG, \AREG, DISP8(\Index,\OffsetA) \r
+.else\r
+ addi \BREG, \BREG, DISP8(\Index,64)\r
+ addi \AREG, \AREG, DISP8(\Index,64) \r
+.endif\r
+\r
+.endif \r
+.endm\r
+\r
+\r
+.macro KERNEL4x4\r
+ LOAD4x4\r
+ END4x4 AO, BO, 32,32\r
+.endm\r
+\r
+\r
+.macro SAVE4x4\r
+ add T4, LDC,LDC\r
+ add T1, CO ,LDC \r
+#ifndef TRMMKERNEL \r
+ lxv vs24 , 0(CO)\r
+ lxv vs25 , 16(CO)\r
+#endif\r
+ add T2,CO,T4\r
+ add T3,T1,T4 \r
+#ifndef TRMMKERNEL \r
+ lxv vs26 , 0(T1)\r
+ lxv vs27 , 16(T1)\r
+#endif \r
+#ifndef TRMMKERNEL \r
+ lxv vs28 , 0(T2)\r
+ lxv vs29 , 16(T2)\r
+#endif\r
+#ifndef TRMMKERNEL \r
+ lxv vs30 , 0(T3)\r
+ lxv vs31 , 16(T3)\r
+#endif \r
+ xxperm vs0,vs32,permute_mask\r
+ xxperm vs4,vs40,permute_mask\r
+ xxperm vs1,vs33,permute_mask\r
+ xxperm vs5,vs41,permute_mask\r
+ xxperm vs8,vs36,permute_mask\r
+ xxperm vs12,vs44,permute_mask\r
+ xxperm vs9,vs37,permute_mask\r
+ xxperm vs13,vs45,permute_mask\r
+ AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4\r
+ AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5\r
+ AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12\r
+ AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13\r
+ xxperm vs0,vs48,permute_mask\r
+ xxperm vs4,vs56,permute_mask\r
+ xxperm vs1,vs49,permute_mask\r
+ xxperm vs5,vs57,permute_mask \r
+ xxperm vs8,vs52,permute_mask\r
+ xxperm vs12,vs60,permute_mask\r
+ xxperm vs9,vs53,permute_mask\r
+ xxperm vs13,vs61,permute_mask\r
+ AGGREGATE_REALS_IMAGES vs48,vs0,vs56,vs4\r
+ AGGREGATE_REALS_IMAGES vs49,vs1,vs57,vs5\r
+ AGGREGATE_REALS_IMAGES vs52,vs8,vs60,vs12\r
+ AGGREGATE_REALS_IMAGES vs53,vs9,vs61,vs13\r
+ /*VSINRR,VSINII,VSOUT1,VSOUT2*/\r
+ MULT_APLHA_PART1 vs32,vs40,vs0,vs1\r
+ MULT_APLHA_PART1 vs33,vs41,vs2,vs3 \r
+ MULT_APLHA_PART1 vs36,vs44,vs8,vs9\r
+ MULT_APLHA_PART1 vs37,vs45,vs10,vs11\r
+ MULT_APLHA_PART1 vs48,vs56,vs4,vs5\r
+ MULT_APLHA_PART1 vs49,vs57,vs6,vs7 \r
+ MULT_APLHA_PART1 vs52,vs60,vs12,vs13\r
+ MULT_APLHA_PART1 vs53,vs61,vs14,vs15\r
+ MULT_APLHA_PART2 vs32,vs40,vs0,vs1 \r
+ MULT_APLHA_PART2 vs33,vs41,vs2,vs3 \r
+ MULT_APLHA_PART2 vs36,vs44,vs8,vs9\r
+ MULT_APLHA_PART2 vs37,vs45,vs10,vs11\r
+ MULT_APLHA_PART2 vs48,vs56,vs4,vs5\r
+ MULT_APLHA_PART2 vs49,vs57,vs6,vs7 \r
+ MULT_APLHA_PART2 vs52,vs60,vs12,vs13\r
+ MULT_APLHA_PART2 vs53,vs61,vs14,vs15\r
+/* reconstruct r,i pairs*/\r
+ xxperm vs0,vs1, save_permute_1\r
+ xxperm vs2,vs3, save_permute_1\r
+ xxperm vs8,vs9, save_permute_1\r
+ xxperm vs10,vs11, save_permute_1\r
+ xxperm vs4,vs5, save_permute_1\r
+ xxperm vs6,vs7, save_permute_1\r
+ xxperm vs12,vs13, save_permute_1\r
+ xxperm vs14,vs15, save_permute_1\r
+#ifndef TRMMKERNEL\r
+ /* add */\r
+ xxpermdi vs1,vs8,vs0,2\r
+ xxpermdi vs3,vs10,vs2,2 \r
+ xxpermdi vs9,vs0,vs8,2\r
+ xxpermdi vs11,vs2,vs10,2 \r
+ xxpermdi vs5,vs12,vs4,2\r
+ xxpermdi vs7,vs14,vs6,2 \r
+ xxpermdi vs13,vs4,vs12,2\r
+ xxpermdi vs15,vs6,vs14,2 \r
+ xvaddsp vs24,vs24,vs1\r
+ xvaddsp vs25,vs25,vs3 \r
+ xvaddsp vs26,vs26,vs9\r
+ xvaddsp vs27,vs27,vs11 \r
+ xvaddsp vs28,vs28,vs5\r
+ xvaddsp vs29,vs29,vs7 \r
+ xvaddsp vs30,vs30,vs13\r
+ xvaddsp vs31,vs31,vs15 \r
+#else\r
+ xxpermdi vs24,vs8,vs0,2\r
+ xxpermdi vs25,vs10,vs2,2\r
+ xxpermdi vs26,vs0,vs8,2\r
+ xxpermdi vs27,vs2,vs10,2 \r
+ xxpermdi vs28,vs12,vs4,2\r
+ xxpermdi vs29,vs14,vs6,2 \r
+ xxpermdi vs30,vs4,vs12,2\r
+ xxpermdi vs31,vs6,vs14,2 \r
+#endif\r
+ stxv vs24 , 0(CO)\r
+ stxv vs25 , 16(CO)\r
+ stxv vs26 , 0(T1)\r
+ stxv vs27 , 16(T1)\r
+ stxv vs28 , 0(T2)\r
+ stxv vs29 , 16(T2)\r
+ stxv vs30 , 0(T3)\r
+ stxv vs31 , 16(T3) \r
+ addi CO, CO, 32\r
+.endm\r
+\r
+/* macros for N=4 and M=2\r
+**********************************************************************************************/\r
+\r
+.macro Zero4x2\r
+ xxlxor vs32, vs32, vs32\r
+ xxlxor vs33, vs33, vs33\r
+ xxlxor vs36, vs36, vs36\r
+ xxlxor vs37, vs37, vs37\r
+ xxlxor vs40, vs40, vs40\r
+ xxlxor vs41, vs41, vs41\r
+ xxlxor vs44, vs44, vs44\r
+ xxlxor vs45, vs45, vs45\r
+.endm\r
+\r
+\r
+.macro LOAD4x2 \r
+ LOAD4x2O 0,0 \r
+.endm\r
+\r
+\r
+.macro LOAD4x2O OffsetA,OffsetB\r
+ lxv vs24, (\OffsetA+0)(AO)\r
+ lxv vs0, (\OffsetB+0)(BO)\r
+ lxv vs1, (\OffsetB+16)(BO)\r
+ xxperm vs26, vs24, permute_mask \r
+ xxpermdi vs25, vs24, vs24,2 \r
+ xxpermdi vs27, vs26, vs26,2 \r
+.endm\r
+\r
+\r
+.macro END4x2_NORMAL\r
+ END4x2 AO,BO,16,32\r
+.endm\r
+\r
+\r
+.macro END4x2_WITHOUT_ADD\r
+ END4x2 AO,BO,0,0\r
+.endm\r
+\r
+\r
+.macro END4x2 AREG, BREG, OffsetA, OffsetB\r
+.if \OffsetB != 0\r
+ addi \BREG, \BREG, \OffsetB\r
+.endif\r
+\r
+.if \OffsetA != 0\r
+ addi \AREG, \AREG, \OffsetA\r
+.endif\r
+\r
+ xvmaddasp vs32, vs0,vs24\r
+ xvmaddasp vs33, vs1,vs24\r
+ xvmaddasp vs36, vs0,vs25\r
+ xvmaddasp vs37, vs1,vs25\r
+ xvmaddasp vs40, vs0,vs26\r
+ xvmaddasp vs41, vs1,vs26\r
+ xvmaddasp vs44, vs0,vs27\r
+ xvmaddasp vs45, vs1,vs27\r
+.endm\r
+\r
+\r
+.macro LOAD4x2_2\r
+ LOAD4x2_2O 0,0\r
+.endm\r
+ \r
+\r
+.macro LOAD4x2_2O OffsetA,OffsetB\r
+ lxv vs8, (\OffsetA)(AO) \r
+ lxv vs24, (16+\OffsetA)(AO) \r
+ lxv vs4, (0+\OffsetB)(BO)\r
+ lxv vs5, (16+\OffsetB)(BO)\r
+ xxperm vs10, vs8, permute_mask\r
+ xxpermdi vs9, vs8, vs8,2 \r
+ xxperm vs26, vs24, permute_mask\r
+ xxpermdi vs25, vs24, vs24,2 \r
+ lxv vs0, (32+\OffsetB)(BO)\r
+ lxv vs1, (32+16+\OffsetB)(BO) \r
+ xxpermdi vs11, vs10, vs10,2 \r
+ xxpermdi vs27, vs26, vs26,2 \r
+.endm\r
+\r
+\r
+.macro END4x2_2 \r
+ /*for load2 offset will be 32 and 64*/\r
+ KERNEL4x2_2 AO,BO, 32,64,0 ,1,1 \r
+.endm\r
+\r
+\r
+.macro KERNEL4x2_E2 OffsetA,OffsetB, Index,IsLast \r
+ KERNEL4x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 \r
+.endm\r
+\r
+\r
+.macro KERNEL4x2_L2 OffsetA,OffsetB, Index,IsLast\r
+ KERNEL4x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 \r
+.endm\r
+\r
+\r
+.macro KERNEL4x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete\r
+ xvmaddasp vs32, vs4,vs8\r
+ xvmaddasp vs33, vs5,vs8\r
+ xvmaddasp vs40, vs4,vs10\r
+ xvmaddasp vs41, vs5,vs10\r
+.if \Complete==0 \r
+ lxv vs8, DISP4(\Index,\OffsetA)(\AREG) \r
+.endif \r
+ xvmaddasp vs36, vs4,vs9\r
+ xvmaddasp vs37, vs5,vs9\r
+ xvmaddasp vs44, vs4,vs11\r
+ xvmaddasp vs45, vs5,vs11\r
+.if \Complete==0 \r
+ xxperm vs10, vs8, permute_mask \r
+ xxpermdi vs9, vs8, vs8,2 \r
+.endif \r
+.if \Complete==0 \r
+ lxv vs4, DISP8(\Index,0+\OffsetB)(\BREG)\r
+ lxv vs5, DISP8(\Index,16+\OffsetB)(\BREG)\r
+.endif\r
+\r
+.if \Complete==0\r
+ xxpermdi vs11, vs10, vs10,2 \r
+.endif \r
+ xvmaddasp vs32, vs0,vs24\r
+ xvmaddasp vs33, vs1,vs24\r
+ xvmaddasp vs40, vs0,vs26\r
+ xvmaddasp vs41, vs1,vs26\r
+.if \Complete==0\r
+ lxv vs24, DISP4(\Index,16+\OffsetA)(\AREG) \r
+.endif \r
+ xvmaddasp vs36, vs0,vs25\r
+ xvmaddasp vs37, vs1,vs25\r
+ xvmaddasp vs44, vs0,vs27\r
+ xvmaddasp vs45, vs1,vs27\r
+.if \Complete==0\r
+ xxperm vs26, vs24, permute_mask \r
+ xxpermdi vs25, vs24, vs24,2 \r
+.endif \r
+.if \Complete==0\r
+ lxv vs0, DISP8(\Index,32+\OffsetB)(\BREG)\r
+ lxv vs1, DISP8(\Index,32+16+\OffsetB)(\BREG) \r
+.endif\r
+\r
+.if \Complete==0\r
+ xxpermdi vs27, vs26, vs26,2 \r
+.endif\r
+\r
+.if \IsLast==1 \r
+.if \Complete==1\r
+ addi \AREG, \AREG, DISP4(\Index,\OffsetA) \r
+ addi \BREG, \BREG, DISP8(\Index,\OffsetB)\r
+.else\r
+ addi \AREG, \AREG, DISP4(\Index,32) \r
+ addi \BREG, \BREG, DISP8(\Index,64)\r
+.endif\r
+\r
+.endif \r
+.endm\r
+\r
+\r
+.macro KERNEL4x2\r
+ LOAD4x2\r
+ END4x2 AO, BO, 16,32\r
+.endm\r
+\r
+\r
+.macro SAVE4x2\r
+ add T4, LDC,LDC\r
+ add T1, CO ,LDC \r
+ add T2,CO,T4\r
+ add T3,T1,T4 \r
+#ifndef TRMMKERNEL \r
+ lxv vs24 , 0(CO) \r
+#endif\r
+#ifndef TRMMKERNEL \r
+ lxv vs25 , 0(T1) \r
+#endif \r
+#ifndef TRMMKERNEL \r
+ lxv vs26 , 0(T2) \r
+#endif\r
+#ifndef TRMMKERNEL \r
+ lxv vs27 , 0(T3) \r
+#endif \r
+ xxperm vs0,vs32,permute_mask\r
+ xxperm vs4,vs40,permute_mask\r
+ xxperm vs1,vs33,permute_mask\r
+ xxperm vs5,vs41,permute_mask \r
+ xxperm vs8,vs36,permute_mask\r
+ xxperm vs12,vs44,permute_mask\r
+ xxperm vs9,vs37,permute_mask\r
+ xxperm vs13,vs45,permute_mask\r
+ AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4\r
+ AGGREGATE_REALS_IMAGES_A_PERMUTE vs33,vs1,vs41,vs5\r
+ AGGREGATE_REALS_IMAGES_A_PERMUTE vs36,vs8,vs44,vs12\r
+ AGGREGATE_REALS_IMAGES_A_PERMUTE vs37,vs9,vs45,vs13\r
+ /*VSINRR,VSINII,VSOUT1,VSOUT2*/\r
+ MULT_APLHA_PART1 vs32,vs40,vs0,vs1\r
+ MULT_APLHA_PART1 vs33,vs41,vs2,vs3 \r
+ MULT_APLHA_PART1 vs36,vs44,vs8,vs9\r
+ MULT_APLHA_PART1 vs37,vs45,vs10,vs11\r
+ MULT_APLHA_PART2 vs32,vs40,vs0,vs1 \r
+ MULT_APLHA_PART2 vs33,vs41,vs2,vs3 \r
+ MULT_APLHA_PART2 vs36,vs44,vs8,vs9\r
+ MULT_APLHA_PART2 vs37,vs45,vs10,vs11\r
+/* reconstruct r,i pairs*/\r
+ xxperm vs0,vs1, save_permute_1\r
+ xxperm vs2,vs3, save_permute_1\r
+ xxperm vs8,vs9, save_permute_1\r
+ xxperm vs10,vs11, save_permute_1\r
+#ifndef TRMMKERNEL\r
+ /* add */\r
+ xxpermdi vs1,vs8,vs0,0\r
+ xxpermdi vs9,vs10,vs2,0 \r
+ xxpermdi vs3,vs0,vs8,3\r
+ xxpermdi vs11,vs2,vs10,3 \r
+ xvaddsp vs24,vs24,vs1\r
+ xvaddsp vs26,vs26,vs9 \r
+ xvaddsp vs25,vs25,vs3 \r
+ xvaddsp vs27,vs27,vs11 \r
+#else\r
+ xxpermdi vs24,vs8,vs0,0\r
+ xxpermdi vs26,vs10,vs2,0 \r
+ xxpermdi vs25,vs0,vs8,3\r
+ xxpermdi vs27,vs2,vs10,3 \r
+#endif\r
+ stxv vs24 , 0(CO) \r
+ stxv vs25 , 0(T1) \r
+ stxv vs26 , 0(T2) \r
+ stxv vs27 , 0(T3) \r
+ addi CO, CO, 16\r
+.endm\r
+\r
+/* macros for N=4 and M=1\r
+**********************************************************************************************/\r
+\r
+.macro Zero4x1\r
+ xxlxor vs32, vs32, vs32\r
+ xxlxor vs33, vs33, vs33 \r
+ xxlxor vs40, vs40, vs40\r
+ xxlxor vs41, vs41, vs41 \r
+.endm\r
+\r
+\r
+.macro LOAD4x1 \r
+ LOAD4x1O 0,0 \r
+.endm\r
+\r
+\r
+.macro LOAD4x1O OffsetA,OffsetB\r
+ lxsd v4, (\OffsetA+0)(AO) \r
+ lxv vs0, (\OffsetB+0)(BO)\r
+ lxv vs1, (\OffsetB+16)(BO)\r
+ xxspltd vs24,vs36,0\r
+ xxperm vs26, vs24, permute_mask \r
+.endm\r
+\r
+\r
+.macro END4x1_NORMAL\r
+ END4x1 AO,BO,8,32\r
+.endm\r
+\r
+\r
+.macro END4x1_WITHOUT_ADD\r
+ END4x1 AO,BO,0,0\r
+.endm\r
+\r
+\r
+.macro END4x1 AREG, BREG, OffsetA, OffsetB\r
+.if \OffsetB != 0\r
+ addi \BREG, \BREG, \OffsetB\r
+.endif\r
+\r
+.if \OffsetA != 0\r
+ addi \AREG, \AREG, \OffsetA\r
+.endif\r
+\r
+ xvmaddasp vs32, vs0,vs24\r
+ xvmaddasp vs33, vs1,vs24\r
+ xvmaddasp vs40, vs0,vs26\r
+ xvmaddasp vs41, vs1,vs26\r
+.endm\r
+\r
+\r
+.macro LOAD4x1_2\r
+ LOAD4x1_2O 0,0\r
+.endm\r
+ \r
+\r
+.macro LOAD4x1_2O OffsetA,OffsetB\r
+ lxv vs27, (\OffsetA)(AO) \r
+ xxspltd vs8,vs27,1\r
+ xxspltd vs24,vs27,0 \r
+ lxv vs4, (0+\OffsetB)(BO)\r
+ lxv vs5, (16+\OffsetB)(BO) \r
+ xxperm vs10, vs8, permute_mask \r
+ xxperm vs26, vs24, permute_mask \r
+ lxv vs0, (32+\OffsetB)(BO)\r
+ lxv vs1, (32+16+\OffsetB)(BO)\r
+.endm\r
+\r
+\r
+.macro END4x1_2 \r
+ /*for load2 offset will be 16 and 64*/\r
+ KERNEL4x1_2 AO,BO, 16,64,0 ,1,1 \r
+.endm\r
+\r
+\r
+.macro KERNEL4x1_E2 OffsetA,OffsetB, Index,IsLast \r
+ KERNEL4x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 \r
+.endm\r
+\r
+\r
+.macro KERNEL4x1_L2 OffsetA,OffsetB, Index,IsLast\r
+ KERNEL4x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 \r
+.endm\r
+\r
+\r
+.macro KERNEL4x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete\r
+ xvmaddasp vs32, vs4,vs8\r
+ xvmaddasp vs33, vs5,vs8\r
+ xvmaddasp vs40, vs4,vs10\r
+ xvmaddasp vs41, vs5,vs10\r
+.if \Complete==0 \r
+ lxv vs27, DISP2(\Index,\OffsetA)(\AREG) \r
+ xxspltd vs8,vs27,1 \r
+.endif \r
+.if \Complete==0 \r
+ lxv vs4, DISP8(\Index,0+\OffsetB)(\BREG)\r
+ lxv vs5, DISP8(\Index,16+\OffsetB)(\BREG)\r
+.endif\r
+\r
+.if \Complete==0 \r
+ xxperm vs10, vs8, permute_mask \r
+.endif \r
+ xvmaddasp vs32, vs0,vs24\r
+ xvmaddasp vs33, vs1,vs24\r
+ xvmaddasp vs40, vs0,vs26\r
+ xvmaddasp vs41, vs1,vs26\r
+.if \Complete==0 \r
+ xxspltd vs24,vs27,0 \r
+ xxperm vs26, vs24, permute_mask \r
+.endif \r
+.if \Complete==0\r
+ lxv vs0, DISP8(\Index,32+\OffsetB)(\BREG)\r
+ lxv vs1, DISP8(\Index,32+16+\OffsetB)(\BREG) \r
+.endif\r
+\r
+.if \IsLast==1 \r
+.if \Complete==1\r
+ addi \AREG, \AREG, DISP2(\Index,\OffsetA) \r
+ addi \BREG, \BREG, DISP8(\Index,\OffsetB)\r
+.else\r
+ addi \AREG, \AREG, DISP2(\Index,16) \r
+ addi \BREG, \BREG, DISP8(\Index,64)\r
+.endif\r
+\r
+.endif \r
+.endm\r
+\r
+\r
+.macro KERNEL4x1\r
+ LOAD4x1\r
+ END4x1 AO, BO, 8,32\r
+.endm\r
+\r
+\r
+.macro SAVE4x1\r
+ add T4, LDC,LDC\r
+ add T1, CO ,LDC \r
+ add T2,CO,T4\r
+ add T3,T1,T4 \r
+#ifndef TRMMKERNEL \r
+ lxsd v4 , 0(CO) \r
+#endif\r
+#ifndef TRMMKERNEL \r
+ lxsd v5 , 0(T1) \r
+#endif \r
+#ifndef TRMMKERNEL \r
+ lxsd v6 , 0(T2) \r
+#endif\r
+#ifndef TRMMKERNEL \r
+ lxsd v7 , 0(T3) \r
+#endif \r
+ xxperm vs0,vs32,permute_mask\r
+ xxperm vs4,vs40,permute_mask\r
+ xxperm vs1,vs33,permute_mask\r
+ xxperm vs5,vs41,permute_mask \r
+ AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4\r
+ AGGREGATE_REALS_IMAGES_A_PERMUTE vs33,vs1,vs41,vs5\r
+ /*VSINRR,VSINII,VSOUT1,VSOUT2*/\r
+ MULT_APLHA_PART1 vs32,vs40,vs0,vs1\r
+ MULT_APLHA_PART1 vs33,vs41,vs2,vs3 \r
+ MULT_APLHA_PART2 vs32,vs40,vs0,vs1 \r
+ MULT_APLHA_PART2 vs33,vs41,vs2,vs3 \r
+/* reconstruct r,i pairs*/\r
+ xxperm vs0,vs1, save_permute_1\r
+ xxperm vs2,vs3, save_permute_1\r
+#ifndef TRMMKERNEL\r
+ /* add */\r
+ xxspltd vs1,vs0,0\r
+ xxspltd vs3,vs0,1\r
+ xxspltd vs9,vs2,0\r
+ xxspltd vs11,vs2,1\r
+ /*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/\r
+ xvaddsp vs36,vs36,vs1\r
+ xvaddsp vs37,vs37,vs3 \r
+ xvaddsp vs38,vs38,vs9 \r
+ xvaddsp vs39,vs39,vs11 \r
+#else \r
+ /*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/\r
+ xxspltd vs36,vs0,0\r
+ xxspltd vs37,vs0,1\r
+ xxspltd vs38,vs2,0\r
+ xxspltd vs39,vs2,1\r
+#endif\r
+ stxsd v4 , 0(CO) \r
+ stxsd v5 , 0(T1) \r
+ stxsd v6 , 0(T2) \r
+ stxsd v7 , 0(T3) \r
+ addi CO, CO, 8\r
+.endm\r
+\r
+/* macros for N=2 and M=8\r
+**********************************************************************************************/\r
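+/* The 2xN (and later 1xN) macro families follow the same load/permute/FMA\r
+   pattern as the 4xN ones above, just with half as many B vectors per\r
+   iteration and correspondingly fewer accumulators. */\r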
+\r
+.macro Zero2x8\r
+ xxlxor vs32, vs32, vs32\r
+ xxlxor vs33, vs33, vs33\r
+ xxlxor vs34, vs34, vs34\r
+ xxlxor vs35, vs35, vs35\r
+ xxlxor vs36, vs36, vs36\r
+ xxlxor vs37, vs37, vs37\r
+ xxlxor vs38, vs38, vs38\r
+ xxlxor vs39, vs39, vs39\r
+ xxlxor vs40, vs40, vs40\r
+ xxlxor vs41, vs41, vs41\r
+ xxlxor vs42, vs42, vs42\r
+ xxlxor vs43, vs43, vs43\r
+ xxlxor vs44, vs44, vs44\r
+ xxlxor vs45, vs45, vs45\r
+ xxlxor vs46, vs46, vs46\r
+ xxlxor vs47, vs47, vs47\r
+.endm\r
+\r
+\r
+.macro LOAD2x8 \r
+ LOAD2x8O 0,0 \r
+.endm\r
+\r
+\r
+.macro LOAD2x8O OffsetA,OffsetB\r
+ lxv vs24, (\OffsetB+0)(BO) \r
+ xxperm vs26, vs24, permute_mask \r
+ lxv vs0, (\OffsetA+0)(AO)\r
+ lxv vs1, (\OffsetA+16)(AO)\r
+ lxv vs2, (\OffsetA+32)(AO)\r
+ lxv vs3, (\OffsetA+48)(AO) \r
+ xxpermdi vs25, vs24, vs24,2 \r
+ xxpermdi vs27, vs26, vs26,2\r
+.endm\r
+\r
+\r
+.macro END2x8_NORMAL\r
+ END2x8 AO,BO,64,16\r
+.endm\r
+\r
+\r
+.macro END2x8_WITHOUT_ADD\r
+ END2x8 AO,BO,0,0\r
+.endm\r
+\r
+\r
+.macro END2x8 AREG, BREG, OffsetA, OffsetB\r
+.if \OffsetB != 0\r
+ addi \BREG, \BREG, \OffsetB\r
+.endif\r
+\r
+.if \OffsetA != 0\r
+ addi \AREG, \AREG, \OffsetA\r
+.endif\r
+\r
+ xvmaddasp vs32, vs0,vs24\r
+ xvmaddasp vs33, vs1,vs24\r
+ xvmaddasp vs34, vs2,vs24 \r
+ xvmaddasp vs35, vs3,vs24 \r
+ xvmaddasp vs36, vs0,vs25\r
+ xvmaddasp vs37, vs1,vs25\r
+ xvmaddasp vs38, vs2,vs25 \r
+ xvmaddasp vs39, vs3,vs25 \r
+ xvmaddasp vs40, vs0,vs26\r
+ xvmaddasp vs41, vs1,vs26\r
+ xvmaddasp vs42, vs2,vs26 \r
+ xvmaddasp vs43, vs3,vs26\r
+ xvmaddasp vs44, vs0,vs27\r
+ xvmaddasp vs45, vs1,vs27\r
+ xvmaddasp vs46, vs2,vs27 \r
+ xvmaddasp vs47, vs3,vs27\r
+.endm\r
+\r
+\r
+.macro LOAD2x8_2\r
+ LOAD2x8_2O 0,0\r
+.endm\r
+ \r
+\r
+.macro LOAD2x8_2O OffsetA,OffsetB\r
+ lxv vs8, (\OffsetB)(BO)\r
+ lxv vs24, (16+\OffsetB)(BO)\r
+ lxv vs4, (0+\OffsetA)(AO)\r
+ lxv vs5, (16+\OffsetA)(AO)\r
+ xxperm vs10, vs8, permute_mask \r
+ xxperm vs26, vs24, permute_mask \r
+ lxv vs6, (32+\OffsetA)(AO)\r
+ lxv vs7, (48+\OffsetA)(AO) \r
+ lxv vs0, (64+\OffsetA)(AO)\r
+ lxv vs1, (64+16+\OffsetA)(AO) \r
+ xxpermdi vs9, vs8, vs8,2 \r
+ xxpermdi vs25, vs24, vs24,2 \r
+ lxv vs2, (64+32+\OffsetA)(AO)\r
+ lxv vs3, (64+48+\OffsetA)(AO)\r
+ xxpermdi vs11, vs10, vs10,2\r
+ xxpermdi vs27, vs26, vs26,2 \r
+.endm\r
+ \r
+\r
+.macro END2x8_2 \r
+ /*for load2 offset will be 128 and 32*/\r
+ KERNEL2x8_2 AO,BO, 128,32,0 ,1,1 \r
+.endm\r
+\r
+\r
+.macro KERNEL2x8_E2 OffsetA,OffsetB, Index,IsLast \r
+ KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 \r
+.endm\r
+\r
+\r
+.macro KERNEL2x8_L2 OffsetA,OffsetB, Index,IsLast\r
+ KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 \r
+.endm\r
+\r
+\r
+.macro KERNEL2x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete\r
+ xvmaddasp vs32, vs4,vs8\r
+ xvmaddasp vs33, vs5,vs8\r
+ xvmaddasp vs40, vs4,vs10\r
+ xvmaddasp vs41, vs5,vs10\r
+ xvmaddasp vs36, vs4,vs9\r
+ xvmaddasp vs37, vs5,vs9\r
+ xvmaddasp vs44, vs4,vs11\r
+ xvmaddasp vs45, vs5,vs11\r
+.if \Complete==0 \r
+ lxv vs4, DISP16(\Index,0+\OffsetA)(\AREG)\r
+ lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG)\r
+.endif\r
+\r
+ xvmaddasp vs34, vs6,vs8 \r
+ xvmaddasp vs35, vs7,vs8\r
+.if \Complete==0 \r
+ lxv vs8, DISP4(\Index,\OffsetB)(\BREG)\r
+.endif \r
+ xvmaddasp vs42, vs6,vs10\r
+ xvmaddasp vs43, vs7,vs10\r
+ xvmaddasp vs38, vs6,vs9 \r
+ xvmaddasp vs39, vs7,vs9\r
+.if \Complete==0\r
+ xxperm vs10, vs8, permute_mask \r
+ xxpermdi vs9, vs8, vs8,2 \r
+.endif \r
+ xvmaddasp vs46, vs6,vs11\r
+ xvmaddasp vs47, vs7,vs11\r
+.if \Complete==0\r
+ xxpermdi vs11, vs10, vs10,2 \r
+.endif \r
+.if \Complete==0\r
+ lxv vs6, DISP16(\Index,32+\OffsetA)(\AREG)\r
+ lxv vs7, DISP16(\Index,48+\OffsetA)(\AREG) \r
+.endif \r
+ xvmaddasp vs32, vs0,vs24\r
+ xvmaddasp vs33, vs1,vs24\r
+ xvmaddasp vs40, vs0,vs26\r
+ xvmaddasp vs41, vs1,vs26\r
+ xvmaddasp vs36, vs0,vs25\r
+ xvmaddasp vs37, vs1,vs25\r
+ xvmaddasp vs44, vs0,vs27\r
+ xvmaddasp vs45, vs1,vs27\r
+.if \Complete==0\r
+ lxv vs0, DISP16(\Index,64+\OffsetA)(\AREG)\r
+ lxv vs1, DISP16(\Index,64+16+\OffsetA)(\AREG) \r
+.endif\r
+\r
+ xvmaddasp vs34, vs2,vs24\r
+ xvmaddasp vs35, vs3,vs24 \r
+.if \Complete==0\r
+ lxv vs24, DISP4(\Index,16+\OffsetB)(\BREG)\r
+.endif \r
+ xvmaddasp vs42, vs2,vs26\r
+ xvmaddasp vs43, vs3,vs26\r
+ xvmaddasp vs38, vs2,vs25\r
+ xvmaddasp vs39, vs3,vs25\r
+.if \Complete==0\r
+ xxperm vs26, vs24, permute_mask \r
+ xxpermdi vs25, vs24, vs24,2 \r
+.endif \r
+ xvmaddasp vs46, vs2,vs27\r
+ xvmaddasp vs47, vs3,vs27\r
+.if \Complete==0\r
+ xxpermdi vs27, vs26, vs26,2 \r
+.endif\r
+\r
+.if \Complete==0\r
+ lxv vs2, DISP16(\Index,64+32+\OffsetA)(\AREG)\r
+ lxv vs3, DISP16(\Index,64+48+\OffsetA)(\AREG)\r
+.endif\r
+\r
+.if \IsLast==1 \r
+.if \Complete==1\r
+ addi \BREG, \BREG, DISP4(\Index,\OffsetB)\r
+ addi \AREG, \AREG, DISP16(\Index,\OffsetA) \r
+.else\r
+ addi \BREG, \BREG, DISP4(\Index,32)\r
+ addi \AREG, \AREG, DISP16(\Index,128) \r
+.endif\r
+\r
+.endif \r
+.endm\r
+\r
+\r
+.macro KERNEL2x8\r
+ LOAD2x8\r
+ END2x8 AO, BO, 64,16\r
+.endm\r
+\r
+\r
+.macro SAVE2x8\r
+ add T1, CO ,LDC \r
+#ifndef TRMMKERNEL \r
+ lxv vs24 , 0(CO)\r
+ lxv vs25 , 16(CO)\r
+#endif\r
+ xxperm vs0,vs32,permute_mask\r
+ xxperm vs4,vs40,permute_mask\r
+#ifndef TRMMKERNEL \r
+ lxv vs26 , 32(CO)\r
+ lxv vs27 , 48(CO)\r
+#endif \r
+ xxperm vs1,vs33,permute_mask\r
+ xxperm vs5,vs41,permute_mask\r
+#ifndef TRMMKERNEL \r
+ lxv vs28 , 0(T1)\r
+ lxv vs29 , 16(T1)\r
+#endif \r
+ xxperm vs2,vs34,permute_mask\r
+ xxperm vs6,vs42,permute_mask\r
+#ifndef TRMMKERNEL \r
+ lxv vs30 , 32(T1)\r
+ lxv vs31 , 48(T1)\r
+#endif \r
+ xxperm vs3,vs35,permute_mask\r
+ xxperm vs7,vs43,permute_mask \r
+ add T2,CO,T4\r
+ add T3,T1,T4 \r
+ AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4\r
+ xxperm vs8,vs36,permute_mask\r
+ xxperm vs12,vs44,permute_mask\r
+ AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5\r
+ xxperm vs9,vs37,permute_mask\r
+ xxperm vs13,vs45,permute_mask\r
+ AGGREGATE_REALS_IMAGES vs34,vs2,vs42,vs6\r
+ xxperm vs10,vs38,permute_mask\r
+ xxperm vs14,vs46,permute_mask\r
+ AGGREGATE_REALS_IMAGES vs35,vs3,vs43,vs7 \r
+ xxperm vs11,vs39,permute_mask\r
+ xxperm vs15,vs47,permute_mask \r
+ AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12\r
+ AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13\r
+ AGGREGATE_REALS_IMAGES vs38,vs10,vs46,vs14\r
+ AGGREGATE_REALS_IMAGES vs39,vs11,vs47,vs15 \r
+ /*VSINRR,VSINII,VSOUT1,VSOUT2*/\r
+ MULT_APLHA_PART1 vs32,vs40,vs0,vs1 \r
+ MULT_APLHA_PART1 vs33,vs41,vs2,vs3 \r
+ MULT_APLHA_PART1 vs34,vs42,vs4,vs5\r
+ MULT_APLHA_PART1 vs35,vs43,vs6,vs7 \r
+ MULT_APLHA_PART2 vs32,vs40,vs0,vs1 \r
+ MULT_APLHA_PART2 vs33,vs41,vs2,vs3 \r
+ MULT_APLHA_PART2 vs34,vs42,vs4,vs5\r
+ MULT_APLHA_PART2 vs35,vs43,vs6,vs7 \r
+ MULT_APLHA_PART1 vs36,vs44,vs8,vs9\r
+ MULT_APLHA_PART1 vs37,vs45,vs10,vs11\r
+ MULT_APLHA_PART1 vs38,vs46,vs12,vs13\r
+ MULT_APLHA_PART1 vs39,vs47,vs14,vs15\r
+ MULT_APLHA_PART2 vs36,vs44,vs8,vs9\r
+ MULT_APLHA_PART2 vs37,vs45,vs10,vs11\r
+ MULT_APLHA_PART2 vs38,vs46,vs12,vs13\r
+ MULT_APLHA_PART2 vs39,vs47,vs14,vs15\r
+/* reconstruct r,i pairs*/\r
+ xxperm vs0,vs1, save_permute_1\r
+ xxperm vs2,vs3, save_permute_1\r
+ xxperm vs4,vs5, save_permute_1\r
+ xxperm vs6,vs7, save_permute_1\r
+ xxperm vs8,vs9, save_permute_1\r
+ xxperm vs10,vs11, save_permute_1\r
+ xxperm vs12,vs13, save_permute_1\r
+ xxperm vs14,vs15, save_permute_1\r
+#ifndef TRMMKERNEL\r
+ /* add */\r
+ xxpermdi vs1,vs8,vs0,2\r
+ xxpermdi vs3,vs10,vs2,2\r
+ xxpermdi vs5,vs12,vs4,2\r
+ xxpermdi vs7,vs14,vs6,2\r
+ xxpermdi vs9,vs0,vs8,2\r
+ xxpermdi vs11,vs2,vs10,2 \r
+ xvaddsp vs24,vs24,vs1\r
+ xvaddsp vs25,vs25,vs3\r
+ xxpermdi vs13,vs4,vs12,2 \r
+ xxpermdi vs15,vs6,vs14,2\r
+ xvaddsp vs26,vs26,vs5\r
+ xvaddsp vs27,vs27,vs7\r
+ xvaddsp vs28,vs28,vs9\r
+ xvaddsp vs29,vs29,vs11 \r
+ xvaddsp vs30,vs30,vs13\r
+ xvaddsp vs31,vs31,vs15 \r
+#else\r
+ xxpermdi vs24,vs8,vs0,2\r
+ xxpermdi vs25,vs10,vs2,2\r
+ xxpermdi vs26,vs12,vs4,2\r
+ xxpermdi vs27,vs14,vs6,2 \r
+ xxpermdi vs28,vs0,vs8,2\r
+ xxpermdi vs29,vs2,vs10,2 \r
+ xxpermdi vs30,vs4,vs12,2 \r
+ xxpermdi vs31,vs6,vs14,2\r
+#endif\r
+ stxv vs24 , 0(CO)\r
+ stxv vs25 , 16(CO) \r
+ stxv vs26 , 32(CO)\r
+ stxv vs27 , 48(CO) \r
+ stxv vs28 , 0(T1)\r
+ stxv vs29 , 16(T1) \r
+ stxv vs30 , 32(T1)\r
+ stxv vs31 , 48(T1) \r
+ addi CO, CO, 64\r
+.endm\r
+\r
+/* macros for N=2 and M=4\r
+**********************************************************************************************/\r
+\r
+.macro Zero2x4\r
+ xxlxor vs32, vs32, vs32\r
+ xxlxor vs33, vs33, vs33\r
+ xxlxor vs36, vs36, vs36\r
+ xxlxor vs37, vs37, vs37\r
+ xxlxor vs40, vs40, vs40\r
+ xxlxor vs41, vs41, vs41\r
+ xxlxor vs44, vs44, vs44\r
+ xxlxor vs45, vs45, vs45\r
+.endm\r
+\r
+\r
+.macro LOAD2x4 \r
+ LOAD2x4O 0,0 \r
+.endm\r
+\r
+\r
+.macro LOAD2x4O OffsetA,OffsetB\r
+ lxv vs24, (\OffsetB+0)(BO)\r
+ lxv vs0, (\OffsetA+0)(AO)\r
+ lxv vs1, (\OffsetA+16)(AO)\r
+ xxperm vs26, vs24, permute_mask \r
+ xxpermdi vs25, vs24, vs24,2 \r
+ xxpermdi vs27, vs26, vs26,2 \r
+.endm\r
+\r
+\r
+.macro END2x4_NORMAL\r
+ END2x4 AO,BO,32,16\r
+.endm\r
+\r
+\r
+.macro END2x4_WITHOUT_ADD\r
+ END2x4 AO,BO,0,0\r
+.endm\r
+\r
+\r
+.macro END2x4 AREG, BREG, OffsetA, OffsetB\r
+.if \OffsetB != 0\r
+ addi \BREG, \BREG, \OffsetB\r
+.endif\r
+\r
+.if \OffsetA != 0\r
+ addi \AREG, \AREG, \OffsetA\r
+.endif\r
+\r
+ xvmaddasp vs32, vs0,vs24\r
+ xvmaddasp vs33, vs1,vs24\r
+ xvmaddasp vs36, vs0,vs25\r
+ xvmaddasp vs37, vs1,vs25\r
+ xvmaddasp vs40, vs0,vs26\r
+ xvmaddasp vs41, vs1,vs26\r
+ xvmaddasp vs44, vs0,vs27\r
+ xvmaddasp vs45, vs1,vs27\r
+.endm\r
+\r
+\r
+.macro LOAD2x4_2\r
+ LOAD2x4_2O 0,0\r
+.endm\r
+ \r
+\r
+.macro LOAD2x4_2O OffsetA,OffsetB\r
+ lxv vs8, (\OffsetB)(BO)\r
+ lxv vs24, (16+\OffsetB)(BO)\r
+ lxv vs4, (0+\OffsetA)(AO)\r
+ lxv vs5, (16+\OffsetA)(AO)\r
+ xxperm vs10, vs8, permute_mask\r
+ xxperm vs26, vs24, permute_mask\r
+ xxpermdi vs9, vs8, vs8,2 \r
+ xxpermdi vs25, vs24, vs24,2 \r
+ lxv vs0, (32+\OffsetA)(AO)\r
+ lxv vs1, (32+16+\OffsetA)(AO) \r
+ xxpermdi vs11, vs10, vs10,2 \r
+ xxpermdi vs27, vs26, vs26,2 \r
+.endm\r
+\r
+\r
+.macro END2x4_2 \r
+ /*for load2 offset will be 64 and 32*/\r
+ KERNEL2x4_2 AO,BO, 64,32,0 ,1,1 \r
+.endm\r
+\r
+\r
+.macro KERNEL2x4_E2 OffsetA,OffsetB, Index,IsLast \r
+ KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 \r
+.endm\r
+\r
+\r
+.macro KERNEL2x4_L2 OffsetA,OffsetB, Index,IsLast\r
+ KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 \r
+.endm\r
+\r
+\r
+.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete\r
+ xvmaddasp vs32, vs4,vs8\r
+ xvmaddasp vs33, vs5,vs8\r
+ xvmaddasp vs40, vs4,vs10\r
+ xvmaddasp vs41, vs5,vs10\r
+.if \Complete==0 \r
+ lxv vs8, DISP4(\Index,\OffsetB)(\BREG)\r
+.endif \r
+ xvmaddasp vs36, vs4,vs9\r
+ xvmaddasp vs37, vs5,vs9\r
+ xvmaddasp vs44, vs4,vs11\r
+ xvmaddasp vs45, vs5,vs11\r
+.if \Complete==0\r
+ xxperm vs10, vs8, permute_mask \r
+ xxpermdi vs9, vs8, vs8,2 \r
+.endif \r
+.if \Complete==0 \r
+ lxv vs4, DISP8(\Index,0+\OffsetA)(\AREG)\r
+ lxv vs5, DISP8(\Index,16+\OffsetA)(\AREG)\r
+.endif\r
+\r
+.if \Complete==0\r
+ xxpermdi vs11, vs10, vs10,2 \r
+.endif \r
+ xvmaddasp vs32, vs0,vs24\r
+ xvmaddasp vs33, vs1,vs24\r
+ xvmaddasp vs40, vs0,vs26\r
+ xvmaddasp vs41, vs1,vs26\r
+.if \Complete==0\r
+ lxv vs24, DISP4(\Index,16+\OffsetB)(\BREG)\r
+.endif \r
+ xvmaddasp vs36, vs0,vs25\r
+ xvmaddasp vs37, vs1,vs25\r
+ xvmaddasp vs44, vs0,vs27\r
+ xvmaddasp vs45, vs1,vs27\r
+.if \Complete==0\r
+ xxperm vs26, vs24, permute_mask\r
+ xxpermdi vs25, vs24, vs24,2 \r
+.endif \r
+.if \Complete==0\r
+ lxv vs0, DISP8(\Index,32+\OffsetA)(\AREG)\r
+ lxv vs1, DISP8(\Index,32+16+\OffsetA)(\AREG) \r
+.endif\r
+\r
+.if \Complete==0\r
+ xxpermdi vs27, vs26, vs26,2 \r
+.endif\r
+\r
+.if \IsLast==1 \r
+.if \Complete==1\r
+ addi \BREG, \BREG, DISP4(\Index,\OffsetB)\r
+ addi \AREG, \AREG, DISP8(\Index,\OffsetA) \r
+.else\r
+ addi \BREG, \BREG, DISP4(\Index,32)\r
+ addi \AREG, \AREG, DISP8(\Index,64) \r
+.endif\r
+\r
+.endif \r
+.endm\r
+\r
+\r
+.macro KERNEL2x4\r
+ LOAD2x4\r
+ END2x4 AO, BO, 32,16\r
+.endm\r
+\r
+\r
+.macro SAVE2x4\r
+ add T1, CO ,LDC \r
+#ifndef TRMMKERNEL \r
+ lxv vs24 , 0(CO)\r
+ lxv vs25 , 16(CO)\r
+#endif\r
+#ifndef TRMMKERNEL \r
+ lxv vs26 , 0(T1)\r
+ lxv vs27 , 16(T1)\r
+#endif \r
+ xxperm vs0,vs32,permute_mask\r
+ xxperm vs4,vs40,permute_mask\r
+ xxperm vs1,vs33,permute_mask\r
+ xxperm vs5,vs41,permute_mask\r
+ xxperm vs8,vs36,permute_mask\r
+ xxperm vs12,vs44,permute_mask\r
+ xxperm vs9,vs37,permute_mask\r
+ xxperm vs13,vs45,permute_mask\r
+ AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4\r
+ AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5\r
+ AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12\r
+ AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13\r
+ /*VSINRR,VSINII,VSOUT1,VSOUT2*/\r
+ MULT_APLHA_PART1 vs32,vs40,vs0,vs1\r
+ MULT_APLHA_PART1 vs33,vs41,vs2,vs3 \r
+ MULT_APLHA_PART1 vs36,vs44,vs8,vs9\r
+ MULT_APLHA_PART1 vs37,vs45,vs10,vs11\r
+ MULT_APLHA_PART2 vs32,vs40,vs0,vs1 \r
+ MULT_APLHA_PART2 vs33,vs41,vs2,vs3 \r
+ MULT_APLHA_PART2 vs36,vs44,vs8,vs9\r
+ MULT_APLHA_PART2 vs37,vs45,vs10,vs11\r
+/* reconstruct r,i pairs*/\r
+ xxperm vs0,vs1, save_permute_1\r
+ xxperm vs2,vs3, save_permute_1\r
+ xxperm vs8,vs9, save_permute_1\r
+ xxperm vs10,vs11, save_permute_1\r
+#ifndef TRMMKERNEL\r
+ /* add */\r
+ xxpermdi vs1,vs8,vs0,2\r
+ xxpermdi vs3,vs10,vs2,2 \r
+ xxpermdi vs9,vs0,vs8,2\r
+ xxpermdi vs11,vs2,vs10,2 \r
+ xvaddsp vs24,vs24,vs1\r
+ xvaddsp vs25,vs25,vs3 \r
+ xvaddsp vs26,vs26,vs9\r
+ xvaddsp vs27,vs27,vs11 \r
+#else\r
+ xxpermdi vs24,vs8,vs0,2\r
+ xxpermdi vs25,vs10,vs2,2\r
+ xxpermdi vs26,vs0,vs8,2\r
+ xxpermdi vs27,vs2,vs10,2 \r
+#endif\r
+ stxv vs24 , 0(CO)\r
+ stxv vs25 , 16(CO)\r
+ stxv vs26 , 0(T1)\r
+ stxv vs27 , 16(T1)\r
+ addi CO, CO, 32\r
+.endm\r
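+\r
+/* All SAVEnxm macros implement the same per-element update; a sketch for the\r
+   non-conjugated case (the permute and alpha details are in the\r
+   AGGREGATE_REALS_IMAGES and MULT_APLHA_PART1/2 helpers used above):\r
+\r
+   //  res = sum_k a[k]*b[k],  with\r
+   //  (xr + i*xi)*(yr + i*yi) = (xr*yr - xi*yi) + i*(xr*yi + xi*yr)\r
+   //  C   = C + alpha*res          (normal path)\r
+   //  C   =     alpha*res          (TRMMKERNEL path, C is not loaded)\r
+*/\r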
+\r
+/* macros for N=2 and M=2\r
+**********************************************************************************************/\r
+\r
+.macro Zero2x2\r
+ xxlxor vs32, vs32, vs32\r
+ xxlxor vs36, vs36, vs36\r
+ xxlxor vs40, vs40, vs40\r
+ xxlxor vs44, vs44, vs44\r
+.endm\r
+\r
+\r
+.macro LOAD2x2 \r
+ LOAD2x2O 0,0 \r
+.endm\r
+\r
+\r
+.macro LOAD2x2O OffsetA,OffsetB\r
+ lxv vs24, (\OffsetA+0)(AO)\r
+ lxv vs0, (\OffsetB+0)(BO)\r
+ xxperm vs26, vs24, permute_mask \r
+ xxpermdi vs25, vs24, vs24,2 \r
+ xxpermdi vs27, vs26, vs26,2 \r
+.endm\r
+\r
+\r
+.macro END2x2_NORMAL\r
+ END2x2 AO,BO,16,16\r
+.endm\r
+\r
+\r
+.macro END2x2_WITHOUT_ADD\r
+ END2x2 AO,BO,0,0\r
+.endm\r
+\r
+\r
+.macro END2x2 AREG, BREG, OffsetA, OffsetB\r
+.if \OffsetB != 0\r
+ addi \BREG, \BREG, \OffsetB\r
+.endif\r
+\r
+.if \OffsetA != 0\r
+ addi \AREG, \AREG, \OffsetA\r
+.endif\r
+\r
+ xvmaddasp vs32, vs0,vs24\r
+ xvmaddasp vs36, vs0,vs25\r
+ xvmaddasp vs40, vs0,vs26\r
+ xvmaddasp vs44, vs0,vs27\r
+.endm\r
+\r
+\r
+.macro LOAD2x2_2\r
+ LOAD2x2_2O 0,0\r
+.endm\r
+ \r
+\r
+.macro LOAD2x2_2O OffsetA,OffsetB\r
+ lxv vs8, (\OffsetA)(AO) \r
+ lxv vs24, (16+\OffsetA)(AO) \r
+ lxv vs4, (0+\OffsetB)(BO)\r
+ lxv vs0, (16+\OffsetB)(BO)\r
+ xxperm vs10, vs8, permute_mask\r
+ xxpermdi vs9, vs8, vs8,2 \r
+ xxperm vs26, vs24, permute_mask\r
+ xxpermdi vs25, vs24, vs24,2 \r
+ xxpermdi vs11, vs10, vs10,2 \r
+ xxpermdi vs27, vs26, vs26,2 \r
+.endm\r
+\r
+\r
+.macro END2x2_2 \r
+ /* for the x2 (two k-step) unroll, the offsets are 32 (A) and 32 (B) */\r
+ KERNEL2x2_2 AO,BO, 32,32,0 ,1,1 \r
+.endm\r
+\r
+\r
+.macro KERNEL2x2_E2 OffsetA,OffsetB, Index,IsLast \r
+ KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 \r
+.endm\r
+\r
+\r
+.macro KERNEL2x2_L2 OffsetA,OffsetB, Index,IsLast\r
+ KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 \r
+.endm\r
+\r
+\r
+.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete\r
+ xvmaddasp vs32, vs4,vs8\r
+ xvmaddasp vs40, vs4,vs10\r
+.if \Complete==0 \r
+ lxv vs8, DISP4(\Index,\OffsetA)(\AREG) \r
+.endif \r
+ xvmaddasp vs36, vs4,vs9\r
+ xvmaddasp vs44, vs4,vs11\r
+.if \Complete==0 \r
+ xxperm vs10, vs8, permute_mask \r
+ xxpermdi vs9, vs8, vs8,2 \r
+.endif \r
+.if \Complete==0 \r
+ lxv vs4, DISP4(\Index,0+\OffsetB)(\BREG)\r
+.endif\r
+\r
+.if \Complete==0\r
+ xxpermdi vs11, vs10, vs10,2 \r
+.endif \r
+ xvmaddasp vs32, vs0,vs24\r
+ xvmaddasp vs40, vs0,vs26\r
+.if \Complete==0\r
+ lxv vs24, DISP4(\Index,16+\OffsetA)(\AREG) \r
+.endif \r
+ xvmaddasp vs36, vs0,vs25\r
+ xvmaddasp vs44, vs0,vs27\r
+.if \Complete==0\r
+ xxperm vs26, vs24, permute_mask \r
+ xxpermdi vs25, vs24, vs24,2 \r
+.endif \r
+.if \Complete==0\r
+ lxv vs0, DISP4(\Index,16+\OffsetB)(\BREG)\r
+.endif\r
+\r
+.if \Complete==0\r
+ xxpermdi vs27, vs26, vs26,2 \r
+.endif\r
+\r
+.if \IsLast==1 \r
+.if \Complete==1\r
+ addi \AREG, \AREG, DISP4(\Index,\OffsetA) \r
+ addi \BREG, \BREG, DISP4(\Index,\OffsetB)\r
+.else\r
+ addi \AREG, \AREG, DISP4(\Index,32) \r
+ addi \BREG, \BREG, DISP4(\Index,32)\r
+.endif\r
+\r
+.endif \r
+.endm\r
+\r
+\r
+.macro KERNEL2x2\r
+ LOAD2x2\r
+ END2x2 AO, BO, 16,16\r
+.endm\r
+\r
+\r
+.macro SAVE2x2\r
+ add T1, CO ,LDC \r
+#ifndef TRMMKERNEL \r
+ lxv vs24 , 0(CO) \r
+#endif\r
+#ifndef TRMMKERNEL \r
+ lxv vs26 , 0(T1) \r
+#endif \r
+ xxperm vs0,vs32,permute_mask\r
+ xxperm vs4,vs40,permute_mask\r
+ xxperm vs8,vs36,permute_mask\r
+ xxperm vs12,vs44,permute_mask\r
+ AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4\r
+ AGGREGATE_REALS_IMAGES_A_PERMUTE vs36,vs8,vs44,vs12\r
+ /*VSINRR,VSINII,VSOUT1,VSOUT2*/\r
+ MULT_APLHA_PART1 vs32,vs40,vs0,vs1\r
+ MULT_APLHA_PART1 vs36,vs44,vs8,vs9\r
+ MULT_APLHA_PART2 vs32,vs40,vs0,vs1 \r
+ MULT_APLHA_PART2 vs36,vs44,vs8,vs9\r
+/* reconstruct r,i pairs*/\r
+ xxperm vs0,vs1, save_permute_1\r
+ xxperm vs8,vs9, save_permute_1\r
+#ifndef TRMMKERNEL\r
+ /* add */\r
+ xxpermdi vs1,vs8,vs0,0\r
+ xxpermdi vs9,vs0,vs8,3 \r
+ xvaddsp vs24,vs24,vs1\r
+ xvaddsp vs26,vs26,vs9 \r
+#else\r
+ xxpermdi vs24,vs8,vs0,0\r
+ xxpermdi vs26,vs0,vs8,3 \r
+#endif\r
+ stxv vs24 , 0(CO) \r
+ stxv vs26 , 0(T1)\r
+ addi CO, CO, 16\r
+.endm\r
+\r
+/* macros for N=2 and M=1\r
+**********************************************************************************************/\r
+\r
+.macro Zero2x1\r
+ xxlxor vs32, vs32, vs32\r
+ xxlxor vs40, vs40, vs40\r
+.endm\r
+\r
+\r
+.macro LOAD2x1 \r
+ LOAD2x1O 0,0 \r
+.endm\r
+\r
+\r
+.macro LOAD2x1O OffsetA,OffsetB\r
+ lxsd v4, (\OffsetA+0)(AO) \r
+ lxv vs0, (\OffsetB+0)(BO)\r
+ xxspltd vs24,vs36,0\r
+ xxperm vs26, vs24, permute_mask \r
+.endm\r
+\r
+\r
+.macro END2x1_NORMAL\r
+ END2x1 AO,BO,8,16\r
+.endm\r
+\r
+\r
+.macro END2x1_WITHOUT_ADD\r
+ END2x1 AO,BO,0,0\r
+.endm\r
+\r
+\r
+.macro END2x1 AREG, BREG, OffsetA, OffsetB\r
+.if \OffsetB != 0\r
+ addi \BREG, \BREG, \OffsetB\r
+.endif\r
+\r
+.if \OffsetA != 0\r
+ addi \AREG, \AREG, \OffsetA\r
+.endif\r
+\r
+ xvmaddasp vs32, vs0,vs24\r
+ xvmaddasp vs40, vs0,vs26\r
+.endm\r
+\r
+\r
+.macro LOAD2x1_2\r
+ LOAD2x1_2O 0,0\r
+.endm\r
+ \r
+\r
+.macro LOAD2x1_2O OffsetA,OffsetB\r
+ lxv vs27, (\OffsetA)(AO) \r
+ lxv vs4, (0+\OffsetB)(BO)\r
+ lxv vs0, (16+\OffsetB)(BO)\r
+ xxspltd vs8,vs27,1\r
+ xxspltd vs24,vs27,0 \r
+ xxperm vs10, vs8, permute_mask \r
+ xxperm vs26, vs24, permute_mask \r
+.endm\r
+\r
+\r
+.macro END2x1_2 \r
+ /* for the x2 (two k-step) unroll, the offsets are 16 (A) and 32 (B) */\r
+ KERNEL2x1_2 AO,BO, 16,32,0 ,1,1 \r
+.endm\r
+\r
+\r
+.macro KERNEL2x1_E2 OffsetA,OffsetB, Index,IsLast \r
+ KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 \r
+.endm\r
+\r
+\r
+.macro KERNEL2x1_L2 OffsetA,OffsetB, Index,IsLast\r
+ KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 \r
+.endm\r
+\r
+\r
+.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete\r
+ xvmaddasp vs32, vs4,vs8\r
+ xvmaddasp vs40, vs4,vs10\r
+.if \Complete==0 \r
+ lxv vs27, DISP2(\Index,\OffsetA)(\AREG) \r
+ xxspltd vs8,vs27,1 \r
+.endif \r
+.if \Complete==0 \r
+ lxv vs4, DISP4(\Index,0+\OffsetB)(\BREG)\r
+.endif\r
+\r
+.if \Complete==0 \r
+ xxperm vs10, vs8, permute_mask \r
+.endif \r
+ xvmaddasp vs32, vs0,vs24\r
+ xvmaddasp vs40, vs0,vs26\r
+.if \Complete==0 \r
+ xxspltd vs24,vs27,0 \r
+ xxperm vs26, vs24, permute_mask \r
+.endif \r
+.if \Complete==0\r
+ lxv vs0, DISP4(\Index,16+\OffsetB)(\BREG)\r
+.endif\r
+\r
+.if \IsLast==1 \r
+.if \Complete==1\r
+ addi \AREG, \AREG, DISP2(\Index,\OffsetA) \r
+ addi \BREG, \BREG, DISP4(\Index,\OffsetB)\r
+.else\r
+ addi \AREG, \AREG, DISP2(\Index,16) \r
+ addi \BREG, \BREG, DISP4(\Index,32)\r
+.endif\r
+\r
+.endif \r
+.endm\r
+\r
+\r
+.macro KERNEL2x1\r
+ LOAD2x1\r
+ END2x1 AO, BO, 8,16\r
+.endm\r
+\r
+\r
+.macro SAVE2x1\r
+ add T1, CO ,LDC \r
+#ifndef TRMMKERNEL \r
+ lxsd v4 , 0(CO) \r
+#endif\r
+#ifndef TRMMKERNEL \r
+ lxsd v5 , 0(T1) \r
+#endif \r
+ xxperm vs0,vs32,permute_mask\r
+ xxperm vs4,vs40,permute_mask\r
+ AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4\r
+ AGGREGATE_REALS_IMAGES_A_PERMUTE vs33,vs1,vs41,vs5\r
+ /*VSINRR,VSINII,VSOUT1,VSOUT2*/\r
+ MULT_APLHA_PART1 vs32,vs40,vs0,vs1 \r
+ MULT_APLHA_PART2 vs32,vs40,vs0,vs1 \r
+/* reconstruct r,i pairs*/\r
+ xxperm vs0,vs1, save_permute_1 \r
+#ifndef TRMMKERNEL\r
+ /* add */\r
+ xxspltd vs1,vs0,0\r
+ xxspltd vs3,vs0,1\r
+ /*--v4==vs36 v5==vs37---*/\r
+ xvaddsp vs36,vs36,vs1\r
+ xvaddsp vs37,vs37,vs3 \r
+#else \r
+ /*--v4==vs36 v5==vs37---*/\r
+ xxspltd vs36,vs0,0\r
+ xxspltd vs37,vs0,1\r
+#endif\r
+ stxsd v4 , 0(CO) \r
+ stxsd v5 , 0(T1) \r
+ addi CO, CO, 8\r
+.endm\r
+\r
+/* macros for N=1 and M=8\r
+**********************************************************************************************/\r
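+\r
+/* In the x2-unrolled N=1 kernels (M>=2) a single lxv of B already holds the\r
+   two k-steps; xxspltd then broadcasts each 64-bit half (one complex value)\r
+   into its own register (vs8 / vs24) so the FMA pattern of the wider kernels\r
+   can be reused unchanged.  The 1x1 kernel instead keeps both k-steps in the\r
+   two halves of its accumulators and folds them in SAVE1x1. */\r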
+\r
+.macro Zero1x8\r
+ xxlxor vs32, vs32, vs32\r
+ xxlxor vs33, vs33, vs33\r
+ xxlxor vs34, vs34, vs34\r
+ xxlxor vs35, vs35, vs35\r
+ xxlxor vs40, vs40, vs40\r
+ xxlxor vs41, vs41, vs41\r
+ xxlxor vs42, vs42, vs42\r
+ xxlxor vs43, vs43, vs43\r
+.endm\r
+\r
+\r
+.macro LOAD1x8 \r
+ LOAD1x8O 0,0 \r
+.endm\r
+\r
+\r
+.macro LOAD1x8O OffsetA,OffsetB\r
+ lxsd v4, (\OffsetB+0)(BO) \r
+ lxv vs0, (\OffsetA+0)(AO)\r
+ lxv vs1, (\OffsetA+16)(AO)\r
+ lxv vs2, (\OffsetA+32)(AO)\r
+ lxv vs3, (\OffsetA+48)(AO) \r
+ xxspltd vs24,vs36,0\r
+ xxperm vs26, vs24, permute_mask \r
+.endm\r
+\r
+\r
+.macro END1x8_NORMAL\r
+ END1x8 AO,BO,64,8\r
+.endm\r
+\r
+\r
+.macro END1x8_WITHOUT_ADD\r
+ END1x8 AO,BO,0,0\r
+.endm\r
+\r
+\r
+.macro END1x8 AREG, BREG, OffsetA, OffsetB\r
+.if \OffsetB != 0\r
+ addi \BREG, \BREG, \OffsetB\r
+.endif\r
+\r
+.if \OffsetA != 0\r
+ addi \AREG, \AREG, \OffsetA\r
+.endif\r
+\r
+ xvmaddasp vs32, vs0,vs24\r
+ xvmaddasp vs33, vs1,vs24\r
+ xvmaddasp vs34, vs2,vs24 \r
+ xvmaddasp vs35, vs3,vs24 \r
+ xvmaddasp vs40, vs0,vs26\r
+ xvmaddasp vs41, vs1,vs26\r
+ xvmaddasp vs42, vs2,vs26 \r
+ xvmaddasp vs43, vs3,vs26\r
+.endm\r
+\r
+\r
+.macro LOAD1x8_2\r
+ LOAD1x8_2O 0,0\r
+.endm\r
+ \r
+\r
+.macro LOAD1x8_2O OffsetA,OffsetB\r
+ lxv vs27, (\OffsetB)(BO)\r
+ lxv vs4, (0+\OffsetA)(AO)\r
+ lxv vs5, (16+\OffsetA)(AO)\r
+ xxspltd vs8,vs27,1\r
+ xxspltd vs24,vs27,0 \r
+ lxv vs6, (32+\OffsetA)(AO)\r
+ lxv vs7, (48+\OffsetA)(AO) \r
+ lxv vs0, (64+\OffsetA)(AO)\r
+ lxv vs1, (64+16+\OffsetA)(AO) \r
+ lxv vs2, (64+32+\OffsetA)(AO)\r
+ lxv vs3, (64+48+\OffsetA)(AO)\r
+ xxperm vs10, vs8, permute_mask \r
+ xxperm vs26, vs24, permute_mask \r
+.endm\r
+ \r
+\r
+.macro END1x8_2 \r
+ /* for the x2 (two k-step) unroll, the offsets are 128 (A) and 16 (B) */\r
+ KERNEL1x8_2 AO,BO, 128,16,0 ,1,1 \r
+.endm\r
+\r
+\r
+.macro KERNEL1x8_E2 OffsetA,OffsetB, Index,IsLast \r
+ KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 \r
+.endm\r
+\r
+\r
+.macro KERNEL1x8_L2 OffsetA,OffsetB, Index,IsLast\r
+ KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 \r
+.endm\r
+\r
+\r
+.macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete\r
+.if \Complete==0 \r
+ lxv vs27, DISP2(\Index,\OffsetB)(\BREG)\r
+.endif \r
+ xvmaddasp vs32, vs4,vs8\r
+ xvmaddasp vs33, vs5,vs8\r
+ xvmaddasp vs40, vs4,vs10\r
+ xvmaddasp vs41, vs5,vs10\r
+.if \Complete==0 \r
+ lxv vs4, DISP16(\Index,0+\OffsetA)(\AREG)\r
+ lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG)\r
+.endif\r
+\r
+ xvmaddasp vs34, vs6,vs8 \r
+ xvmaddasp vs35, vs7,vs8\r
+ xvmaddasp vs42, vs6,vs10\r
+ xvmaddasp vs43, vs7,vs10\r
+.if \Complete==0\r
+ lxv vs6, DISP16(\Index,32+\OffsetA)(\AREG)\r
+ lxv vs7, DISP16(\Index,48+\OffsetA)(\AREG) \r
+.endif \r
+.if \Complete==0 \r
+ xxspltd vs8,vs27,1 \r
+ xxperm vs10, vs8, permute_mask \r
+.endif \r
+ xvmaddasp vs32, vs0,vs24\r
+ xvmaddasp vs33, vs1,vs24\r
+ xvmaddasp vs40, vs0,vs26\r
+ xvmaddasp vs41, vs1,vs26\r
+.if \Complete==0\r
+ lxv vs0, DISP16(\Index,64+\OffsetA)(\AREG)\r
+ lxv vs1, DISP16(\Index,64+16+\OffsetA)(\AREG) \r
+.endif\r
+\r
+ xvmaddasp vs34, vs2,vs24\r
+ xvmaddasp vs35, vs3,vs24 \r
+ xvmaddasp vs42, vs2,vs26\r
+ xvmaddasp vs43, vs3,vs26\r
+.if \Complete==0\r
+ xxspltd vs24,vs27,0 \r
+ xxperm vs26, vs24, permute_mask \r
+.endif \r
+.if \Complete==0\r
+ lxv vs2, DISP16(\Index,64+32+\OffsetA)(\AREG)\r
+ lxv vs3, DISP16(\Index,64+48+\OffsetA)(\AREG)\r
+.endif\r
+\r
+.if \IsLast==1 \r
+.if \Complete==1\r
+ addi \BREG, \BREG, DISP2(\Index,\OffsetB)\r
+ addi \AREG, \AREG, DISP16(\Index,\OffsetA) \r
+.else\r
+ addi \BREG, \BREG, DISP2(\Index,16)\r
+ addi \AREG, \AREG, DISP16(\Index,128) \r
+.endif\r
+\r
+.endif \r
+.endm\r
+\r
+\r
+.macro KERNEL1x8\r
+ LOAD1x8\r
+ END1x8 AO, BO, 64,8\r
+.endm\r
+\r
+\r
+.macro SAVE1x8\r
+#ifndef TRMMKERNEL \r
+ lxv vs24 , 0(CO)\r
+ lxv vs25 , 16(CO)\r
+#endif\r
+ xxperm vs0,vs32,permute_mask\r
+ xxperm vs4,vs40,permute_mask\r
+#ifndef TRMMKERNEL \r
+ lxv vs26 , 32(CO)\r
+ lxv vs27 , 48(CO)\r
+#endif \r
+ xxperm vs1,vs33,permute_mask\r
+ xxperm vs5,vs41,permute_mask\r
+ xxperm vs2,vs34,permute_mask\r
+ xxperm vs6,vs42,permute_mask\r
+ xxperm vs3,vs35,permute_mask\r
+ xxperm vs7,vs43,permute_mask \r
+ AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4\r
+ AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5\r
+ AGGREGATE_REALS_IMAGES vs34,vs2,vs42,vs6\r
+ AGGREGATE_REALS_IMAGES vs35,vs3,vs43,vs7 \r
+ /* build the doubleword-swapped (inner-reversed) save_permute mask in vs28 */\r
+ xxpermdi vs28,save_permute_1,save_permute_1,2\r
+ /*VSINRR,VSINII,VSOUT1,VSOUT2*/\r
+ MULT_APLHA_PART1 vs32,vs40,vs0,vs1 \r
+ MULT_APLHA_PART1 vs33,vs41,vs2,vs3 \r
+ MULT_APLHA_PART1 vs34,vs42,vs4,vs5\r
+ MULT_APLHA_PART1 vs35,vs43,vs6,vs7 \r
+ MULT_APLHA_PART2 vs32,vs40,vs0,vs1 \r
+ MULT_APLHA_PART2 vs33,vs41,vs2,vs3 \r
+ MULT_APLHA_PART2 vs34,vs42,vs4,vs5\r
+ MULT_APLHA_PART2 vs35,vs43,vs6,vs7 \r
+/* reconstruct r,i pairs*/\r
+ xxperm vs0,vs1, vs28\r
+ xxperm vs2,vs3, vs28\r
+ xxperm vs4,vs5, vs28\r
+ xxperm vs6,vs7, vs28 \r
+#ifndef TRMMKERNEL\r
+ /* add */\r
+ xvaddsp vs24,vs24,vs0\r
+ xvaddsp vs25,vs25,vs2\r
+ xvaddsp vs26,vs26,vs4\r
+ xvaddsp vs27,vs27,vs6\r
+ stxv vs24 , 0(CO)\r
+ stxv vs25 , 16(CO) \r
+ stxv vs26 , 32(CO)\r
+ stxv vs27 , 48(CO) \r
+#else\r
+/* TRMMKERNEL: store alpha*A*B directly, C is not loaded */\r
+ stxv vs0 , 0(CO)\r
+ stxv vs2 , 16(CO) \r
+ stxv vs4 , 32(CO)\r
+ stxv vs6 , 48(CO) \r
+#endif\r
+ addi CO, CO, 64\r
+.endm\r
+\r
+/* macros for N=1 and M=4\r
+**********************************************************************************************/\r
+\r
+.macro Zero1x4\r
+ xxlxor vs32, vs32, vs32\r
+ xxlxor vs33, vs33, vs33\r
+ xxlxor vs40, vs40, vs40\r
+ xxlxor vs41, vs41, vs41\r
+.endm\r
+\r
+\r
+.macro LOAD1x4 \r
+ LOAD1x4O 0,0 \r
+.endm\r
+\r
+\r
+.macro LOAD1x4O OffsetA,OffsetB\r
+ lxsd v4, (\OffsetB+0)(BO) \r
+ lxv vs0, (\OffsetA+0)(AO)\r
+ lxv vs1, (\OffsetA+16)(AO)\r
+ xxspltd vs24,vs36,0\r
+ xxperm vs26, vs24, permute_mask \r
+.endm\r
+\r
+\r
+.macro END1x4_NORMAL\r
+ END1x4 AO,BO,32,8\r
+.endm\r
+\r
+\r
+.macro END1x4_WITHOUT_ADD\r
+ END1x4 AO,BO,0,0\r
+.endm\r
+\r
+\r
+.macro END1x4 AREG, BREG, OffsetA, OffsetB\r
+.if \OffsetB != 0\r
+ addi \BREG, \BREG, \OffsetB\r
+.endif\r
+\r
+.if \OffsetA != 0\r
+ addi \AREG, \AREG, \OffsetA\r
+.endif\r
+\r
+ xvmaddasp vs32, vs0,vs24\r
+ xvmaddasp vs33, vs1,vs24\r
+ xvmaddasp vs40, vs0,vs26\r
+ xvmaddasp vs41, vs1,vs26\r
+.endm\r
+\r
+\r
+.macro LOAD1x4_2\r
+ LOAD1x4_2O 0,0\r
+.endm\r
+ \r
+\r
+.macro LOAD1x4_2O OffsetA,OffsetB\r
+ lxv vs27, (\OffsetB)(BO)\r
+ lxv vs4, (0+\OffsetA)(AO)\r
+ lxv vs5, (16+\OffsetA)(AO)\r
+ xxspltd vs8,vs27,1\r
+ xxspltd vs24,vs27,0 \r
+ lxv vs0, (32+\OffsetA)(AO)\r
+ lxv vs1, (32+16+\OffsetA)(AO) \r
+ xxperm vs10, vs8, permute_mask \r
+ xxperm vs26, vs24, permute_mask \r
+.endm\r
+ \r
+\r
+.macro END1x4_2 \r
+ /* for the x2 (two k-step) unroll, the offsets are 64 (A) and 16 (B) */\r
+ KERNEL1x4_2 AO,BO, 64,16,0 ,1,1 \r
+.endm\r
+\r
+\r
+.macro KERNEL1x4_E2 OffsetA,OffsetB, Index,IsLast \r
+ KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 \r
+.endm\r
+\r
+\r
+.macro KERNEL1x4_L2 OffsetA,OffsetB, Index,IsLast\r
+ KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 \r
+.endm\r
+\r
+\r
+.macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete\r
+.if \Complete==0 \r
+ lxv vs27, DISP2(\Index,\OffsetB)(\BREG)\r
+.endif \r
+ xvmaddasp vs32, vs4,vs8\r
+ xvmaddasp vs33, vs5,vs8\r
+ xvmaddasp vs40, vs4,vs10\r
+ xvmaddasp vs41, vs5,vs10\r
+.if \Complete==0 \r
+ lxv vs4, DISP8(\Index,0+\OffsetA)(\AREG)\r
+ lxv vs5, DISP8(\Index,16+\OffsetA)(\AREG)\r
+.endif\r
+\r
+.if \Complete==0 \r
+ xxspltd vs8,vs27,1 \r
+ xxperm vs10, vs8, permute_mask \r
+.endif \r
+ xvmaddasp vs32, vs0,vs24\r
+ xvmaddasp vs33, vs1,vs24\r
+ xvmaddasp vs40, vs0,vs26\r
+ xvmaddasp vs41, vs1,vs26\r
+.if \Complete==0\r
+ lxv vs0, DISP8(\Index,32+\OffsetA)(\AREG)\r
+ lxv vs1, DISP8(\Index,32+16+\OffsetA)(\AREG) \r
+.endif\r
+\r
+.if \Complete==0\r
+ xxspltd vs24,vs27,0 \r
+ xxperm vs26, vs24, permute_mask \r
+.endif \r
+.if \IsLast==1 \r
+.if \Complete==1\r
+ addi \BREG, \BREG, DISP2(\Index,\OffsetB)\r
+ addi \AREG, \AREG, DISP8(\Index,\OffsetA) \r
+.else\r
+ addi \BREG, \BREG, DISP2(\Index,16)\r
+ addi \AREG, \AREG, DISP8(\Index,64) \r
+.endif\r
+\r
+.endif \r
+.endm\r
+\r
+\r
+.macro KERNEL1x4\r
+ LOAD1x4\r
+ END1x4 AO, BO, 32,8\r
+.endm\r
+\r
+\r
+.macro SAVE1x4\r
+#ifndef TRMMKERNEL \r
+ lxv vs24 , 0(CO)\r
+ lxv vs25 , 16(CO)\r
+#endif\r
+ xxperm vs0,vs32,permute_mask\r
+ xxperm vs4,vs40,permute_mask\r
+ xxperm vs1,vs33,permute_mask\r
+ xxperm vs5,vs41,permute_mask\r
+ AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4\r
+ AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5\r
+ /* build the doubleword-swapped (inner-reversed) save_permute mask in vs28 */\r
+ xxpermdi vs28,save_permute_1,save_permute_1,2\r
+ /*VSINRR,VSINII,VSOUT1,VSOUT2*/\r
+ MULT_APLHA_PART1 vs32,vs40,vs0,vs1 \r
+ MULT_APLHA_PART1 vs33,vs41,vs2,vs3 \r
+ MULT_APLHA_PART2 vs32,vs40,vs0,vs1 \r
+ MULT_APLHA_PART2 vs33,vs41,vs2,vs3 \r
+/* reconstruct r,i pairs*/\r
+ xxperm vs0,vs1, vs28\r
+ xxperm vs2,vs3, vs28\r
+#ifndef TRMMKERNEL\r
+ /* add */\r
+ xvaddsp vs24,vs24,vs0\r
+ xvaddsp vs25,vs25,vs2\r
+ stxv vs24 , 0(CO)\r
+ stxv vs25 , 16(CO) \r
+#else\r
+/* TRMMKERNEL: store alpha*A*B directly, C is not loaded */\r
+ stxv vs0 , 0(CO)\r
+ stxv vs2 , 16(CO) \r
+#endif\r
+ addi CO, CO, 32\r
+.endm\r
+\r
+/* macros for N=1 and M=2\r
+**********************************************************************************************/\r
+\r
+.macro Zero1x2\r
+ xxlxor vs32, vs32, vs32\r
+ xxlxor vs40, vs40, vs40\r
+.endm\r
+\r
+\r
+.macro LOAD1x2 \r
+ LOAD1x2O 0,0 \r
+.endm\r
+\r
+\r
+.macro LOAD1x2O OffsetA,OffsetB\r
+ lxsd v4, (\OffsetB+0)(BO) \r
+ lxv vs0, (\OffsetA+0)(AO)\r
+ xxspltd vs24,vs36,0\r
+ xxperm vs26, vs24, permute_mask \r
+.endm\r
+\r
+\r
+.macro END1x2_NORMAL\r
+ END1x2 AO,BO,16,8\r
+.endm\r
+\r
+\r
+.macro END1x2_WITHOUT_ADD\r
+ END1x2 AO,BO,0,0\r
+.endm\r
+\r
+\r
+.macro END1x2 AREG, BREG, OffsetA, OffsetB\r
+.if \OffsetB != 0\r
+ addi \BREG, \BREG, \OffsetB\r
+.endif\r
+\r
+.if \OffsetA != 0\r
+ addi \AREG, \AREG, \OffsetA\r
+.endif\r
+\r
+ xvmaddasp vs32, vs0,vs24\r
+ xvmaddasp vs40, vs0,vs26\r
+.endm\r
+\r
+\r
+.macro LOAD1x2_2\r
+ LOAD1x2_2O 0,0\r
+.endm\r
+ \r
+\r
+.macro LOAD1x2_2O OffsetA,OffsetB\r
+ lxv vs27, (\OffsetB)(BO)\r
+ lxv vs4, (0+\OffsetA)(AO)\r
+ lxv vs0, (16+\OffsetA)(AO)\r
+ xxspltd vs8,vs27,1\r
+ xxspltd vs24,vs27,0 \r
+ xxperm vs10, vs8, permute_mask \r
+ xxperm vs26, vs24, permute_mask \r
+.endm\r
+ \r
+\r
+.macro END1x2_2 \r
+ /* for the x2 (two k-step) unroll, the offsets are 32 (A) and 16 (B) */\r
+ KERNEL1x2_2 AO,BO, 32,16,0 ,1,1 \r
+.endm\r
+\r
+\r
+.macro KERNEL1x2_E2 OffsetA,OffsetB, Index,IsLast \r
+ KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 \r
+.endm\r
+\r
+\r
+.macro KERNEL1x2_L2 OffsetA,OffsetB, Index,IsLast\r
+ KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 \r
+.endm\r
+\r
+\r
+.macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete\r
+.if \Complete==0 \r
+ lxv vs27, DISP2(\Index,\OffsetB)(\BREG)\r
+.endif \r
+ xvmaddasp vs32, vs4,vs8\r
+ xvmaddasp vs40, vs4,vs10\r
+.if \Complete==0 \r
+ lxv vs4, DISP4(\Index,0+\OffsetA)(\AREG)\r
+.endif\r
+\r
+.if \Complete==0 \r
+ xxspltd vs8,vs27,1 \r
+ xxperm vs10, vs8, permute_mask \r
+.endif \r
+ xvmaddasp vs32, vs0,vs24\r
+ xvmaddasp vs40, vs0,vs26\r
+.if \Complete==0\r
+ lxv vs0, DISP4(\Index,16+\OffsetA)(\AREG)\r
+.endif\r
+\r
+.if \Complete==0\r
+ xxspltd vs24,vs27,0 \r
+ xxperm vs26, vs24, permute_mask \r
+.endif \r
+.if \IsLast==1 \r
+.if \Complete==1\r
+ addi \BREG, \BREG, DISP2(\Index,\OffsetB)\r
+ addi \AREG, \AREG, DISP4(\Index,\OffsetA) \r
+.else\r
+ addi \BREG, \BREG, DISP2(\Index,16)\r
+ addi \AREG, \AREG, DISP4(\Index,32) \r
+.endif\r
+\r
+.endif \r
+.endm\r
+\r
+\r
+.macro KERNEL1x2\r
+ LOAD1x2\r
+ END1x2 AO, BO, 16,8\r
+.endm\r
+\r
+\r
+.macro SAVE1x2\r
+#ifndef TRMMKERNEL \r
+ lxv vs24 , 0(CO)\r
+#endif\r
+ xxperm vs0,vs32,permute_mask\r
+ xxperm vs4,vs40,permute_mask\r
+ AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4\r
+ /* build the doubleword-swapped (inner-reversed) save_permute mask in vs28 */\r
+ xxpermdi vs28,save_permute_1,save_permute_1,2\r
+ /*VSINRR,VSINII,VSOUT1,VSOUT2*/\r
+ MULT_APLHA_PART1 vs32,vs40,vs0,vs1 \r
+ MULT_APLHA_PART2 vs32,vs40,vs0,vs1 \r
+/* reconstruct r,i pairs*/\r
+ xxperm vs0,vs1, vs28\r
+#ifndef TRMMKERNEL\r
+ /* add */\r
+ xvaddsp vs24,vs24,vs0\r
+ stxv vs24 , 0(CO)\r
+#else\r
+/* TRMMKERNEL: store alpha*A*B directly, C is not loaded */\r
+ stxv vs0 , 0(CO)\r
+#endif\r
+ addi CO, CO, 16\r
+.endm\r
+\r
+/* macros for N=1 and M=1\r
+**********************************************************************************************/\r
+.macro Zero1x1\r
+ xxlxor vs32, vs32, vs32\r
+ xxlxor vs40, vs40, vs40\r
+.endm\r
+\r
+\r
+.macro LOAD1x1 \r
+ LOAD1x1O 0,0 \r
+.endm\r
+\r
+\r
+.macro LOAD1x1O OffsetA,OffsetB\r
+ lxsd v4, (\OffsetB+0)(BO) \r
+ lxsd v5, (\OffsetA+0)(AO)\r
+ xxperm vs38, vs36, permute_mask \r
+.endm\r
+\r
+\r
+.macro END1x1_NORMAL\r
+ END1x1 AO,BO,8,8\r
+.endm\r
+\r
+\r
+.macro END1x1_WITHOUT_ADD\r
+ END1x1 AO,BO,0,0\r
+.endm\r
+\r
+\r
+.macro END1x1 AREG, BREG, OffsetA, OffsetB\r
+.if \OffsetB != 0\r
+ addi \BREG, \BREG, \OffsetB\r
+.endif\r
+\r
+.if \OffsetA != 0\r
+ addi \AREG, \AREG, \OffsetA\r
+.endif\r
+\r
+ xvmaddasp vs32, vs37,vs36\r
+ xvmaddasp vs40, vs37,vs38\r
+.endm\r
+\r
+\r
+.macro LOAD1x1_2\r
+ LOAD1x1_2O 0,0\r
+.endm\r
+ \r
+\r
+.macro LOAD1x1_2O OffsetA,OffsetB\r
+ lxv vs8, (\OffsetB)(BO)\r
+ lxv vs4, (0+\OffsetA)(AO) \r
+ xxperm vs10, vs8, permute_mask \r
+.endm\r
+ \r
+\r
+.macro END1x1_2 \r
+ /* for the x2 (two k-step) unroll, the offsets are 16 (A) and 16 (B) */\r
+ KERNEL1x1_2 AO,BO, 16,16,0 ,1,1 \r
+.endm\r
+\r
+\r
+.macro KERNEL1x1_E2 OffsetA,OffsetB, Index,IsLast \r
+ KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 \r
+.endm\r
+\r
+\r
+.macro KERNEL1x1_L2 OffsetA,OffsetB, Index,IsLast\r
+ KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 \r
+.endm\r
+\r
+\r
+.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete\r
+ \r
+ xvmaddasp vs32, vs4,vs8\r
+ xvmaddasp vs40, vs4,vs10\r
+.if \Complete==0 \r
+ lxv vs8, DISP2(\Index,\OffsetB)(\BREG)\r
+ lxv vs4, DISP2(\Index,\OffsetA)(\AREG)\r
+ xxperm vs10, vs8, permute_mask \r
+.endif\r
+\r
+.if \IsLast==1 \r
+.if \Complete==1\r
+ addi \BREG, \BREG, DISP2(\Index,\OffsetB)\r
+ addi \AREG, \AREG, DISP2(\Index,\OffsetA) \r
+.else\r
+ addi \BREG, \BREG, DISP2(\Index,16)\r
+ addi \AREG, \AREG, DISP2(\Index,16) \r
+.endif\r
+\r
+.endif \r
+.endm\r
+\r
+\r
+.macro KERNEL1x1\r
+ LOAD1x1\r
+ END1x1 AO, BO, 8,8\r
+.endm\r
+\r
+\r
+.macro SAVE1x1\r
+#ifndef TRMMKERNEL \r
+ lxsd v4 , 0(CO)\r
+#endif\r
+ /* fold the two 64-bit halves of the x2-unroll accumulators */\r
+ xxpermdi vs33,vs32,vs32,2\r
+ xxpermdi vs41,vs40,vs40,2 \r
+ xvaddsp vs32,vs32,vs33\r
+ xvaddsp vs40,vs40,vs41\r
+\r
+ xxperm vs0,vs32,permute_mask\r
+ xxperm vs4,vs40,permute_mask\r
+ AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4\r
+ /* build the doubleword-swapped (inner-reversed) save_permute mask in vs28 */\r
+ xxpermdi vs28,save_permute_1,save_permute_1,2\r
+ /*VSINRR,VSINII,VSOUT1,VSOUT2*/\r
+ MULT_APLHA_PART1 vs32,vs40,vs37,vs1 \r
+ MULT_APLHA_PART2 vs32,vs40,vs37,vs1 \r
+\r
+/* reconstruct r,i pairs*/\r
+ xxperm vs37,vs1, vs28 \r
+\r
+#ifndef TRMMKERNEL\r
+ /* add */\r
+ xvaddsp vs36,vs36,vs37\r
+ stxsd v4 , 0(CO)\r
+#else\r
+\r
+/* vs37 is v5 */\r
+ stxsd v5 , 0(CO)\r
+#endif\r
+ addi CO, CO, 8\r
+.endm\r
+\r
+ \r
+ \r
+\r
+/****************************TRMM POINTER REFRESH MACROS*************************/\r
+\r
+\r
+.macro SHIFT_REG REG1,REG2,SHIFT_VAL\r
+ .if \SHIFT_VAL==16 \r
+ slwi \REG1, \REG2, 7 \r
+ .elseif \SHIFT_VAL==8 \r
+ slwi \REG1, \REG2, 6 \r
+ .elseif \SHIFT_VAL==4\r
+ slwi \REG1, \REG2, 5 \r
+ .elseif \SHIFT_VAL==2\r
+ slwi \REG1, \REG2, 4 \r
+ .elseif \SHIFT_VAL==1\r
+ slwi \REG1, \REG2, 3 \r
+ .endif\r
+.endm\r
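+\r
+/* SHIFT_VAL is a count of single-precision complex elements (8 bytes each),\r
+   so SHIFT_REG multiplies by SHIFT_VAL*8: e.g. 16 elements -> slwi 7\r
+   (x128 bytes), 1 element -> slwi 3 (x8 bytes). */\r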
+\r
+/*\r
+//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))\r
+// ptrbb = bb;\r
+// #else\r
+// ptrba += off*8;\r
+// ptrbb = bb + off*4;\r
+// #endif\r
+*/\r
+.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B\r
+ #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))\r
+ /* ptrbb = bb;*/\r
+ mr \PTR_B,\B_VAL /* refresh BPOINT */\r
+\r
+ #else\r
+ /*\r
+ // ptrba =ptrba+ off*C_A;\r
+ // ptrbb = bb + off*C_B; \r
+ */\r
+ SHIFT_REG T4,\OFF_VAL,\C_B /* T4 = off*C_B*8, byte offset into B */\r
+ SHIFT_REG T2,\OFF_VAL,\C_A /* T2 = off*C_A*8, byte offset into A */\r
+ add \PTR_B, \B_VAL , T4 /* ptrbb = bb + off*C_B */\r
+ add \PTR_A, \PTR_A, T2 /* ptrba += off*C_A */\r
+ #endif \r
+.endm\r
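+\r
+/* Illustrative expansion (the real call sites are in the TRMM paths of the\r
+   main loops elsewhere in this file): for an 8x4 tile and the\r
+   non-"ptrbb = bb" case,\r
+   //   ptrbb = bb    + off*4*8   // 4 complex floats of B per k-step\r
+   //   ptrba = ptrba + off*8*8   // 8 complex floats of A per k-step\r
+   with the two byte offsets formed in T4 and T2 by SHIFT_REG. */\r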
+\r
+\r
+/*\r
+// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))\r
+// temp = bk-off;\r
+// #elif defined(LEFT)\r
+// temp = off+8; // number of values in A\r
+// #else\r
+// temp = off+4; // number of values in B\r
+// #endif\r
+*/\r
+.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B\r
+ #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))\r
+ /* temp = bk-off;*/\r
+ sub \TEMP_BK,\BK_VAL,\OFF_VAL\r
+\r
+ #elif defined(LEFT)\r
+ /* temp = off+INCR_A; // number of values in A */\r
+ addi \TEMP_BK, \OFF_VAL, \INCR_A\r
+ #else\r
+ /* temp = off+INCR_B // number of values in B*/\r
+ addi \TEMP_BK,\OFF_VAL, \INCR_B\r
+ #endif\r
+\r
+.endm\r
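+\r
+/* Example: with both LEFT and TRANSA defined, the first condition is false\r
+   and TEMP_BK = off + INCR_A (e.g. off + 8 when INCR_A is 8); TEMP_BK is\r
+   then used as the K loop count for that tile. */\r
+\r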
+/*\r
+// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))\r
+// temp = bk - off;\r
+// #ifdef LEFT\r
+// temp -= 8; // number of values in A\r
+// #else\r
+// temp -= 4; // number of values in B\r
+// #endif\r
+// ptrba += temp*8;\r
+// ptrbb += temp*4;\r
+// #endif\r
+\r
+// #ifdef LEFT\r
+// off += 8; // number of values in A\r
+// #endif\r
+*/\r
+ \r
+\r
+.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B\r
+\r
+ #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))\r
+ /*temp = bk - off;*/\r
+ sub \TEMP_BK,\BK_VAL,\OFF_VAL\r
+ #ifdef LEFT\r
+ /*temp -= 8; // number of values in A*/\r
+ addi \TEMP_BK,\TEMP_BK,-\C_A\r
+ #else\r
+ /*temp -= 4; // number of values in B*/\r
+ addi \TEMP_BK,\TEMP_BK,-\C_B \r
+ #endif\r
+ /*ptrba += temp*C_A;\r
+ ptrbb += temp*C_B;*/ \r
+ SHIFT_REG T4,\TEMP_BK,\C_A\r
+ SHIFT_REG T2,\TEMP_BK,\C_B\r
+ add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ \r
+ add \PTR_B, \PTR_B,T2 \r
+\r
+ #endif\r
+\r
+ #ifdef LEFT\r
+ /*off += 8; // number of values in A*/\r
+ addi \OFF_VAL,\OFF_VAL,\C_A\r
+ #endif\r
+.endm
\ No newline at end of file