--- /dev/null
+/***************************************************************************\r
+Copyright (c) 2013-2019, The OpenBLAS Project\r
+All rights reserved.\r
+Redistribution and use in source and binary forms, with or without\r
+modification, are permitted provided that the following conditions are\r
+met:\r
+1. Redistributions of source code must retain the above copyright\r
+notice, this list of conditions and the following disclaimer.\r
+2. Redistributions in binary form must reproduce the above copyright\r
+notice, this list of conditions and the following disclaimer in\r
+the documentation and/or other materials provided with the\r
+distribution.\r
+3. Neither the name of the OpenBLAS project nor the names of\r
+its contributors may be used to endorse or promote products\r
+derived from this software without specific prior written permission.\r
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"\r
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\r
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE\r
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE\r
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\r
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\r
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\r
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\r
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE\r
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
+*****************************************************************************/\r
+#define MY_ALIGN .align 3\r
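+/* .align 3 pads to a 2^3 = 8-byte boundary, so the hot-loop labels below\r
+   start on aligned addresses. */\r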
+\r
+ srawi. J, N, 1\r
+ ble ZGEMM_L2_END\r
+\r
+ZGEMM_L2_BEGIN:\r
+\r
+ mr BO, B\r
+ mr BBO, BBUFFER\r
+ srawi. T1, K, 2\r
+ ble ZGEMM_L2_COPYB1\r
+\r
+ZGEMM_L2_COPYB8:\r
+\r
+ addi T2, PRE, 128\r
+ dcbt BO, PRE\r
+ dcbtst BBO, PRE\r
+ dcbtst BBO, T2\r
+ ZCOPYB_8\r
+ addic. T1, T1, -1\r
+\r
+ bgt ZGEMM_L2_COPYB8\r
+\r
+ZGEMM_L2_COPYB1:\r
+\r
+ andi. T1, K, 3\r
+ ble ZGEMM_L2_COPYB_END\r
+\r
+ZGEMM_L2_COPYB_LOOP:\r
+\r
+ ZCOPYB_2\r
+ addic. T1, T1, -1\r
+\r
+ bgt ZGEMM_L2_COPYB_LOOP\r
+\r
+ZGEMM_L2_COPYB_END:\r
+\r
+ mr CO, C\r
+ mr AO, A\r
+ slwi T1, LDC , 1\r
+ add C, C, T1\r
+ srawi. I, M, 3\r
+ ble ZGEMM_L2x8_END\r
+\r
+ZGEMM_L2x8_BEGIN:\r
+\r
+\r
+ mr BO, BBUFFER\r
+ mr T1, K\r
+ addi T1,T1, -1\r
+ srawi. L, T1, 5 /* L = (K-1) / 32 */\r
+ ZERO2x8 \r
+ ble ZGEMM_L2x8_SUB0\r
+ \r
+\r
+ZGEMM_L2x8_LOOP_START:\r
+\r
+ LOAD2x8 0 \r
+ li T2, 1024\r
+ li T3, 1024+512\r
+ li T4, 2048\r
+ li T5, 2048+512\r
+ mtctr L\r
+\r
+ MY_ALIGN\r
+ZGEMM_L2x8_LOOP:\r
+ dcbt AO, PRE\r
+ dcbt BO, PRE\r
+ KERNEL2x8_L 128,64,0,0\r
+ KERNEL2x8_L 128,64,1,0\r
+ dcbt AO, T2 \r
+ KERNEL2x8_L 128,64,2,0\r
+ KERNEL2x8_L 128,64,3,0 \r
+ dcbt AO, T3\r
+ dcbt BO, T2\r
+ KERNEL2x8_L 128,64,4,0\r
+ KERNEL2x8_L 128,64,5,0\r
+ dcbt AO, T4 \r
+ KERNEL2x8_L 128,64,6,0\r
+ KERNEL2x8_L 128,64,7,0 \r
+ dcbt AO, T5 \r
+ dcbt BO, T3\r
+ KERNEL2x8_L 128,64,8,0\r
+ KERNEL2x8_L 128,64,9,0\r
+ KERNEL2x8_L 128,64,10,0\r
+ KERNEL2x8_L 128,64,11,0 \r
+ dcbt BO, T4\r
+ KERNEL2x8_L 128,64,12,0\r
+ KERNEL2x8_L 128,64,13,0\r
+ KERNEL2x8_L 128,64,14,0\r
+ KERNEL2x8_L 128,64,15,1 \r
+ bdnz ZGEMM_L2x8_LOOP\r
+ MY_ALIGN \r
+ZGEMM_L2x8_LOOP_END:\r
+ END2x8 AO, BO, 128, 64 \r
+ \r
+ b ZGEMM_L2x8_SUB1\r
+ \r
+ZGEMM_L2x8_SUB0:\r
+\r
+ andi. L, K, 63\r
+ \r
+ b ZGEMM_L2x8_SUB2\r
+\r
+ZGEMM_L2x8_SUB1:\r
+\r
+ andi. L, T1, 31\r
+ ble ZGEMM_L2x8_SAVE\r
+\r
+ZGEMM_L2x8_SUB2:\r
+ srawi. T1,L, 3\r
+ ble ZGEMM_L2x8_SUB2_4\r
+ mtctr T1\r
+ MY_ALIGN\r
+ZGEMM_L2x8_SUB2_LOOP:\r
+ LOAD2x8 0 \r
+ KERNEL2x8_L 128,64, 0,0\r
+ KERNEL2x8_L 128,64, 1,0\r
+ KERNEL2x8_L 128,64, 2,0\r
+ KERNEL2x8_E 128,64, 3,1\r
+ bdnz ZGEMM_L2x8_SUB2_LOOP \r
+ MY_ALIGN \r
+ZGEMM_L2x8_SUB2_4:\r
+ andi. T1,L, 4\r
+ ble ZGEMM_L2x8_SUB2_2\r
+ LOAD2x8 0 \r
+ KERNEL2x8_L 128,64, 0,0\r
+ KERNEL2x8_E 128,64, 1,1\r
+ MY_ALIGN\r
+ZGEMM_L2x8_SUB2_2:\r
+ andi. T1,L, 2\r
+ ble ZGEMM_L2x8_SUB2_1\r
+ LOAD2x8 0 \r
+ KERNEL2x8_E 128,64, 0,1\r
+ MY_ALIGN \r
+ZGEMM_L2x8_SUB2_1:\r
+ andi. T1,L, 1\r
+ ble ZGEMM_L2x8_SAVE \r
+ KERNEL2x8 \r
+\r
+ZGEMM_L2x8_SAVE:\r
+\r
+ SAVE2x8\r
+\r
+ addic. I, I, -1\r
+ bgt ZGEMM_L2x8_BEGIN\r
+\r
+ZGEMM_L2x8_END:\r
+\r
+ZGEMM_L2x4_BEGIN:\r
+\r
+ andi. T2, M, 7\r
+ ble ZGEMM_L2x1_END\r
+\r
+ andi. T1, M, 4\r
+ ble ZGEMM_L2x4_END\r
+ mr BO, BBUFFER\r
+ mr T1, K\r
+ addi T1,T1, -1\r
+ srawi. L, T1, 4 /* L = (K-1) / 16 */\r
+ ZERO2x4 \r
+ ble ZGEMM_L2x4_SUB0 \r
+\r
+ZGEMM_L2x4_LOOP_START:\r
+ LOAD2x4 0 \r
+ mtctr L\r
+\r
+ MY_ALIGN\r
+ZGEMM_L2x4_LOOP: \r
+ KERNEL2x4_L 64,64,0,0\r
+ KERNEL2x4_L 64,64,1,0 \r
+ KERNEL2x4_L 64,64,2,0\r
+ KERNEL2x4_L 64,64,3,0 \r
+ KERNEL2x4_L 64,64,4,0\r
+ KERNEL2x4_L 64,64,5,0 \r
+ KERNEL2x4_L 64,64,6,0\r
+ KERNEL2x4_L 64,64,7,1 \r
+ bdnz ZGEMM_L2x4_LOOP\r
+ MY_ALIGN \r
+ZGEMM_L2x4_LOOP_END:\r
+ END2x4 AO, BO, 64, 64 \r
+ \r
+ b ZGEMM_L2x4_SUB1\r
+ \r
+ZGEMM_L2x4_SUB0:\r
+\r
+ andi. L, K, 31\r
+ \r
+ b ZGEMM_L2x4_SUB2\r
+\r
+ZGEMM_L2x4_SUB1:\r
+\r
+ andi. L, T1, 15\r
+ ble ZGEMM_L2x4_SAVE\r
+\r
+ZGEMM_L2x4_SUB2:\r
+ srawi. T1,L, 3\r
+ ble ZGEMM_L2x4_SUB2_4\r
+ mtctr T1\r
+ MY_ALIGN\r
+ZGEMM_L2x4_SUB2_LOOP:\r
+ LOAD2x4 0 \r
+ KERNEL2x4_L 64,64, 0,0\r
+ KERNEL2x4_L 64,64, 1,0\r
+ KERNEL2x4_L 64,64, 2,0\r
+ KERNEL2x4_E 64,64, 3,1\r
+ bdnz ZGEMM_L2x4_SUB2_LOOP \r
+ MY_ALIGN \r
+ZGEMM_L2x4_SUB2_4:\r
+ andi. T1,L, 4\r
+ ble ZGEMM_L2x4_SUB2_2\r
+ LOAD2x4 0 \r
+ KERNEL2x4_L 64,64, 0,0\r
+ KERNEL2x4_E 64,64, 1,1\r
+ MY_ALIGN\r
+ZGEMM_L2x4_SUB2_2:\r
+ andi. T1,L, 2\r
+ ble ZGEMM_L2x4_SUB2_1\r
+ LOAD2x4 0 \r
+ KERNEL2x4_E 64,64, 0,1\r
+ MY_ALIGN \r
+ZGEMM_L2x4_SUB2_1:\r
+ andi. T1,L, 1\r
+ ble ZGEMM_L2x4_SAVE \r
+ KERNEL2x4 \r
+\r
+ZGEMM_L2x4_SAVE:\r
+\r
+ SAVE2x4\r
+\r
+ZGEMM_L2x4_END:\r
+\r
+ZGEMM_L2x2_BEGIN:\r
+\r
+\r
+ andi. T1, M, 2\r
+ ble ZGEMM_L2x2_END\r
+ mr BO, BBUFFER\r
+ mr T1, K\r
+ addi T1,T1, -1\r
+ srawi. L, T1, 4 /* L = (K-1) / 16 */\r
+ ZERO2x2 \r
+ ble ZGEMM_L2x2_SUB0 \r
+\r
+ZGEMM_L2x2_LOOP_START:\r
+ LOAD2x2 0 \r
+ mtctr L\r
+\r
+ MY_ALIGN\r
+ZGEMM_L2x2_LOOP: \r
+ KERNEL2x2_L 32,64,0,0\r
+ KERNEL2x2_L 32,64,1,0 \r
+ KERNEL2x2_L 32,64,2,0\r
+ KERNEL2x2_L 32,64,3,0 \r
+ KERNEL2x2_L 32,64,4,0\r
+ KERNEL2x2_L 32,64,5,0 \r
+ KERNEL2x2_L 32,64,6,0\r
+ KERNEL2x2_L 32,64,7,1 \r
+ bdnz ZGEMM_L2x2_LOOP\r
+ MY_ALIGN \r
+ZGEMM_L2x2_LOOP_END:\r
+ END2x2 AO, BO, 32, 64 \r
+ \r
+ b ZGEMM_L2x2_SUB1\r
+ \r
+ZGEMM_L2x2_SUB0:\r
+\r
+ andi. L, K, 31\r
+ \r
+ b ZGEMM_L2x2_SUB2\r
+\r
+ZGEMM_L2x2_SUB1:\r
+\r
+ andi. L, T1, 15\r
+ ble ZGEMM_L2x2_SAVE\r
+\r
+ZGEMM_L2x2_SUB2:\r
+ srawi. T1,L, 3\r
+ ble ZGEMM_L2x2_SUB2_4\r
+ mtctr T1\r
+ MY_ALIGN\r
+ZGEMM_L2x2_SUB2_LOOP:\r
+ LOAD2x2 0 \r
+ KERNEL2x2_L 32,64, 0,0\r
+ KERNEL2x2_L 32,64, 1,0\r
+ KERNEL2x2_L 32,64, 2,0\r
+ KERNEL2x2_E 32,64, 3,1\r
+ bdnz ZGEMM_L2x2_SUB2_LOOP \r
+ MY_ALIGN \r
+ZGEMM_L2x2_SUB2_4:\r
+ andi. T1,L, 4\r
+ ble ZGEMM_L2x2_SUB2_2\r
+ LOAD2x2 0 \r
+ KERNEL2x2_L 32,64, 0,0\r
+ KERNEL2x2_E 32,64, 1,1\r
+ MY_ALIGN\r
+ZGEMM_L2x2_SUB2_2:\r
+ andi. T1,L, 2\r
+ ble ZGEMM_L2x2_SUB2_1\r
+ LOAD2x2 0 \r
+ KERNEL2x2_E 32,64, 0,1\r
+ MY_ALIGN \r
+ZGEMM_L2x2_SUB2_1:\r
+ andi. T1,L, 1\r
+ ble ZGEMM_L2x2_SAVE \r
+ KERNEL2x2 \r
+ZGEMM_L2x2_SAVE:\r
+\r
+ SAVE2x2\r
+\r
+ZGEMM_L2x2_END:\r
+\r
+ZGEMM_L2x1_BEGIN:\r
+\r
+\r
+ andi. T1, M, 1\r
+ ble ZGEMM_L2x1_END\r
+ mr BO, BBUFFER\r
+ mr T1, K\r
+ addi T1,T1, -1\r
+ srawi. L, T1, 4 /* L = (K-1) / 16 */\r
+ ZERO2x1 \r
+ ble ZGEMM_L2x1_SUB0 \r
+\r
+ZGEMM_L2x1_LOOP_START:\r
+\r
+ LOAD2x1 0 \r
+ mtctr L\r
+\r
+ MY_ALIGN\r
+ZGEMM_L2x1_LOOP: \r
+ KERNEL2x1_L 16,64,0,0\r
+ KERNEL2x1_L 16,64,1,0 \r
+ KERNEL2x1_L 16,64,2,0\r
+ KERNEL2x1_L 16,64,3,0 \r
+ KERNEL2x1_L 16,64,4,0\r
+ KERNEL2x1_L 16,64,5,0 \r
+ KERNEL2x1_L 16,64,6,0\r
+ KERNEL2x1_L 16,64,7,1 \r
+ bdnz ZGEMM_L2x1_LOOP\r
+ MY_ALIGN \r
+ZGEMM_L2x1_LOOP_END:\r
+ END2x1 AO, BO, 16, 64 \r
+ \r
+ b ZGEMM_L2x1_SUB1\r
+ \r
+ZGEMM_L2x1_SUB0:\r
+\r
+ andi. L, K, 31\r
+ \r
+ b ZGEMM_L2x1_SUB2\r
+\r
+ZGEMM_L2x1_SUB1:\r
+\r
+ andi. L, T1, 15\r
+ ble ZGEMM_L2x1_SAVE\r
+\r
+ZGEMM_L2x1_SUB2:\r
+ srawi. T1,L, 3\r
+ ble ZGEMM_L2x1_SUB2_4\r
+ mtctr T1\r
+ MY_ALIGN\r
+ZGEMM_L2x1_SUB2_LOOP:\r
+ LOAD2x1 0 \r
+ KERNEL2x1_L 16,64, 0,0\r
+ KERNEL2x1_L 16,64, 1,0\r
+ KERNEL2x1_L 16,64, 2,0\r
+ KERNEL2x1_E 16,64, 3,1\r
+ bdnz ZGEMM_L2x1_SUB2_LOOP \r
+ MY_ALIGN \r
+ZGEMM_L2x1_SUB2_4:\r
+ andi. T1,L, 4\r
+ ble ZGEMM_L2x1_SUB2_2\r
+ LOAD2x1 0 \r
+ KERNEL2x1_L 16,64, 0,0\r
+ KERNEL2x1_E 16,64, 1,1\r
+ MY_ALIGN\r
+ZGEMM_L2x1_SUB2_2:\r
+ andi. T1,L, 2\r
+ ble ZGEMM_L2x1_SUB2_1\r
+ LOAD2x1 0 \r
+ KERNEL2x1_E 16,64, 0,1\r
+ MY_ALIGN \r
+ZGEMM_L2x1_SUB2_1:\r
+ andi. T1,L, 1\r
+ ble ZGEMM_L2x1_SAVE \r
+ KERNEL2x1 \r
+\r
+ZGEMM_L2x1_SAVE:\r
+\r
+ SAVE2x1\r
+\r
+ZGEMM_L2x1_END:\r
+\r
+ slwi T1, K, 5\r
+ add B, B, T1\r
+\r
+ addic. J, J, -1\r
+ bgt ZGEMM_L2_BEGIN\r
+\r
+ andi. T2, N, 1\r
+ ble L999\r
+\r
+ZGEMM_L2_END:\r
+\r
+ b ZGEMM_L1_BEGIN\r
+\r
+L999_H1:\r
+\r
+ b L999\r
+\r
+ZGEMM_L1_BEGIN:\r
+ andi. T1, N, 1\r
+ ble ZGEMM_L1_END\r
+\r
+ mr BO, B\r
+ mr BBO, BBUFFER \r
+ srawi. T1, K, 3 /* this time K/8 */\r
+ ble ZGEMM_L1_COPYB1\r
+\r
+ZGEMM_L1_COPYB8:\r
+\r
+ addi T2, PRE, 128\r
+ dcbt BO, PRE\r
+ dcbtst BBO, PRE\r
+ dcbtst BBO, T2\r
+ ZCOPYB_8\r
+ addic. T1, T1, -1\r
+\r
+ bgt ZGEMM_L1_COPYB8\r
+\r
+ZGEMM_L1_COPYB1:\r
+\r
+ andi. T1, K, 7\r
+ ble ZGEMM_L1_COPYB_END\r
+\r
+ZGEMM_L1_COPYB_LOOP:\r
+\r
+ ZCOPYB_1\r
+ addic. T1, T1, -1\r
+\r
+ bgt ZGEMM_L1_COPYB_LOOP\r
+\r
+ZGEMM_L1_COPYB_END:\r
+\r
+ mr CO, C\r
+ mr AO, A\r
+ srawi. I, M, 3\r
+ ble ZGEMM_L1x8_END\r
+\r
+ZGEMM_L1x8_BEGIN:\r
+\r
+\r
+ mr BO, BBUFFER\r
+ mr T1, K\r
+ addi T1,T1, -1\r
+ srawi. L, T1, 5 /* L = (K-1) / 32 */\r
+ ZERO1x8 \r
+ ble ZGEMM_L1x8_SUB0\r
+ \r
+\r
+ZGEMM_L1x8_LOOP_START:\r
+\r
+ LOAD1x8 0 \r
+ li T2, 1024\r
+ li T3, 1024+512\r
+ li T4, 2048\r
+ li T5, 2048+512\r
+ mtctr L\r
+\r
+ MY_ALIGN\r
+ZGEMM_L1x8_LOOP:\r
+ dcbt AO, PRE\r
+ dcbt BO, PRE\r
+ KERNEL1x8_L 128,32,0,0\r
+ KERNEL1x8_L 128,32,1,0\r
+ dcbt AO, T2 \r
+ KERNEL1x8_L 128,32,2,0\r
+ KERNEL1x8_L 128,32,3,0 \r
+ dcbt AO, T3\r
+ dcbt BO, T2\r
+ KERNEL1x8_L 128,32,4,0\r
+ KERNEL1x8_L 128,32,5,0\r
+ dcbt AO, T4 \r
+ KERNEL1x8_L 128,32,6,0\r
+ KERNEL1x8_L 128,32,7,0 \r
+ dcbt AO, T5 \r
+ dcbt BO, T3\r
+ KERNEL1x8_L 128,32,8,0\r
+ KERNEL1x8_L 128,32,9,0\r
+ KERNEL1x8_L 128,32,10,0\r
+ KERNEL1x8_L 128,32,11,0 \r
+ dcbt BO, T4\r
+ KERNEL1x8_L 128,32,12,0\r
+ KERNEL1x8_L 128,32,13,0\r
+ KERNEL1x8_L 128,32,14,0\r
+ KERNEL1x8_L 128,32,15,1 \r
+ bdnz ZGEMM_L1x8_LOOP\r
+ MY_ALIGN \r
+ZGEMM_L1x8_LOOP_END:\r
+ END1x8 AO, BO, 128, 32 \r
+ \r
+ b ZGEMM_L1x8_SUB1\r
+ \r
+ZGEMM_L1x8_SUB0:\r
+\r
+ andi. L, K, 63\r
+ \r
+ b ZGEMM_L1x8_SUB2\r
+\r
+ZGEMM_L1x8_SUB1:\r
+\r
+ andi. L, T1, 31\r
+ ble ZGEMM_L1x8_SAVE\r
+\r
+ZGEMM_L1x8_SUB2:\r
+ srawi. T1,L, 3\r
+ ble ZGEMM_L1x8_SUB2_4\r
+ mtctr T1\r
+ MY_ALIGN\r
+ZGEMM_L1x8_SUB2_LOOP:\r
+ LOAD1x8 0 \r
+ KERNEL1x8_L 128,32, 0,0\r
+ KERNEL1x8_L 128,32, 1,0\r
+ KERNEL1x8_L 128,32, 2,0\r
+ KERNEL1x8_E 128,32, 3,1\r
+ bdnz ZGEMM_L1x8_SUB2_LOOP \r
+ MY_ALIGN \r
+ZGEMM_L1x8_SUB2_4:\r
+ andi. T1,L, 4\r
+ ble ZGEMM_L1x8_SUB2_2\r
+ LOAD1x8 0 \r
+ KERNEL1x8_L 128,32, 0,0\r
+ KERNEL1x8_E 128,32, 1,1\r
+ MY_ALIGN\r
+ZGEMM_L1x8_SUB2_2:\r
+ andi. T1,L, 2\r
+ ble ZGEMM_L1x8_SUB2_1\r
+ LOAD1x8 0 \r
+ KERNEL1x8_E 128,32, 0,1\r
+ MY_ALIGN \r
+ZGEMM_L1x8_SUB2_1:\r
+ andi. T1,L, 1\r
+ ble ZGEMM_L1x8_SAVE \r
+ KERNEL1x8 \r
+\r
+ZGEMM_L1x8_SAVE:\r
+\r
+ SAVE1x8\r
+\r
+ addic. I, I, -1\r
+ bgt ZGEMM_L1x8_BEGIN\r
+\r
+ZGEMM_L1x8_END:\r
+\r
+ZGEMM_L1x4_BEGIN:\r
+\r
+ andi. T2, M, 7\r
+ ble ZGEMM_L1x1_END\r
+\r
+ andi. T1, M, 4\r
+ ble ZGEMM_L1x4_END\r
+ mr BO, BBUFFER\r
+ mr T1, K\r
+ addi T1,T1, -1\r
+ srawi. L, T1, 5 /* L = (K-1) / 32 */\r
+ ZERO1x4 \r
+ ble ZGEMM_L1x4_SUB0 \r
+\r
+ZGEMM_L1x4_LOOP_START:\r
+ LOAD1x4 0 \r
+ mtctr L\r
+\r
+ MY_ALIGN\r
+ZGEMM_L1x4_LOOP: \r
+ KERNEL1x4_L 64,32,0,0\r
+ KERNEL1x4_L 64,32,1,0 \r
+ KERNEL1x4_L 64,32,2,0\r
+ KERNEL1x4_L 64,32,3,0 \r
+ KERNEL1x4_L 64,32,4,0\r
+ KERNEL1x4_L 64,32,5,0 \r
+ KERNEL1x4_L 64,32,6,0\r
+ KERNEL1x4_L 64,32,7,0 \r
+ KERNEL1x4_L 64,32,8,0\r
+ KERNEL1x4_L 64,32,9,0\r
+ KERNEL1x4_L 64,32,10,0\r
+ KERNEL1x4_L 64,32,11,0 \r
+ KERNEL1x4_L 64,32,12,0\r
+ KERNEL1x4_L 64,32,13,0\r
+ KERNEL1x4_L 64,32,14,0\r
+ KERNEL1x4_L 64,32,15,1 \r
+ bdnz ZGEMM_L1x4_LOOP\r
+ MY_ALIGN \r
+ZGEMM_L1x4_LOOP_END:\r
+ END1x4 AO, BO, 64, 32 \r
+ \r
+ b ZGEMM_L1x4_SUB1\r
+ \r
+ZGEMM_L1x4_SUB0:\r
+\r
+ andi. L, K, 63\r
+ \r
+ b ZGEMM_L1x4_SUB2\r
+\r
+ZGEMM_L1x4_SUB1:\r
+\r
+ andi. L, T1, 31\r
+ ble ZGEMM_L1x4_SAVE\r
+\r
+ZGEMM_L1x4_SUB2:\r
+ srawi. T1,L, 3\r
+ ble ZGEMM_L1x4_SUB2_4\r
+ mtctr T1\r
+ MY_ALIGN\r
+ZGEMM_L1x4_SUB2_LOOP:\r
+ LOAD1x4 0 \r
+ KERNEL1x4_L 64,32, 0,0\r
+ KERNEL1x4_L 64,32, 1,0\r
+ KERNEL1x4_L 64,32, 2,0\r
+ KERNEL1x4_E 64,32, 3,1\r
+ bdnz ZGEMM_L1x4_SUB2_LOOP \r
+ MY_ALIGN \r
+ZGEMM_L1x4_SUB2_4:\r
+ andi. T1,L, 4\r
+ ble ZGEMM_L1x4_SUB2_2\r
+ LOAD1x4 0 \r
+ KERNEL1x4_L 64,32, 0,0\r
+ KERNEL1x4_E 64,32, 1,1\r
+ MY_ALIGN\r
+ZGEMM_L1x4_SUB2_2:\r
+ andi. T1,L, 2\r
+ ble ZGEMM_L1x4_SUB2_1\r
+ LOAD1x4 0 \r
+ KERNEL1x4_E 64,32, 0,1\r
+ MY_ALIGN \r
+ZGEMM_L1x4_SUB2_1:\r
+ andi. T1,L, 1\r
+ ble ZGEMM_L1x4_SAVE \r
+ KERNEL1x4 \r
+\r
+ZGEMM_L1x4_SAVE:\r
+\r
+ SAVE1x4\r
+\r
+ZGEMM_L1x4_END:\r
+\r
+ZGEMM_L1x2_BEGIN:\r
+\r
+\r
+ andi. T1, M, 2\r
+ ble ZGEMM_L1x2_END\r
+ mr BO, BBUFFER\r
+ mr T1, K\r
+ addi T1,T1, -1\r
+ srawi. L, T1, 5 /* L = (K-1) / 32 */\r
+ ZERO1x2 \r
+ ble ZGEMM_L1x2_SUB0 \r
+\r
+ZGEMM_L1x2_LOOP_START:\r
+ LOAD1x2 0 \r
+ mtctr L\r
+\r
+ MY_ALIGN\r
+ZGEMM_L1x2_LOOP: \r
+ KERNEL1x2_L 32,32,0,0\r
+ KERNEL1x2_L 32,32,1,0 \r
+ KERNEL1x2_L 32,32,2,0\r
+ KERNEL1x2_L 32,32,3,0 \r
+ KERNEL1x2_L 32,32,4,0\r
+ KERNEL1x2_L 32,32,5,0 \r
+ KERNEL1x2_L 32,32,6,0\r
+ KERNEL1x2_L 32,32,7,0 \r
+ KERNEL1x2_L 32,32,8,0\r
+ KERNEL1x2_L 32,32,9,0\r
+ KERNEL1x2_L 32,32,10,0\r
+ KERNEL1x2_L 32,32,11,0 \r
+ KERNEL1x2_L 32,32,12,0\r
+ KERNEL1x2_L 32,32,13,0\r
+ KERNEL1x2_L 32,32,14,0\r
+ KERNEL1x2_L 32,32,15,1 \r
+ bdnz ZGEMM_L1x2_LOOP\r
+ MY_ALIGN \r
+ZGEMM_L1x2_LOOP_END:\r
+ END1x2 AO, BO, 32, 32 \r
+ \r
+ b ZGEMM_L1x2_SUB1\r
+ \r
+ZGEMM_L1x2_SUB0:\r
+\r
+ andi. L, K, 63\r
+ \r
+ b ZGEMM_L1x2_SUB2\r
+\r
+ZGEMM_L1x2_SUB1:\r
+\r
+ andi. L, T1, 31\r
+ ble ZGEMM_L1x2_SAVE\r
+\r
+ZGEMM_L1x2_SUB2:\r
+ srawi. T1,L, 3\r
+ ble ZGEMM_L1x2_SUB2_4\r
+ mtctr T1\r
+ MY_ALIGN\r
+ZGEMM_L1x2_SUB2_LOOP:\r
+ LOAD1x2 0 \r
+ KERNEL1x2_L 32,32, 0,0\r
+ KERNEL1x2_L 32,32, 1,0\r
+ KERNEL1x2_L 32,32, 2,0\r
+ KERNEL1x2_E 32,32, 3,1\r
+ bdnz ZGEMM_L1x2_SUB2_LOOP \r
+ MY_ALIGN \r
+ZGEMM_L1x2_SUB2_4:\r
+ andi. T1,L, 4\r
+ ble ZGEMM_L1x2_SUB2_2\r
+ LOAD1x2 0 \r
+ KERNEL1x2_L 32,32, 0,0\r
+ KERNEL1x2_E 32,32, 1,1\r
+ MY_ALIGN\r
+ZGEMM_L1x2_SUB2_2:\r
+ andi. T1,L, 2\r
+ ble ZGEMM_L1x2_SUB2_1\r
+ LOAD1x2 0 \r
+ KERNEL1x2_E 32,32, 0,1\r
+ MY_ALIGN \r
+ZGEMM_L1x2_SUB2_1:\r
+ andi. T1,L, 1\r
+ ble ZGEMM_L1x2_SAVE \r
+ KERNEL1x2 \r
+ZGEMM_L1x2_SAVE:\r
+\r
+ SAVE1x2\r
+\r
+ZGEMM_L1x2_END:\r
+\r
+ZGEMM_L1x1_BEGIN:\r
+\r
+\r
+ andi. T1, M, 1\r
+ ble ZGEMM_L1x1_END\r
+ mr BO, BBUFFER\r
+ mr T1, K\r
+ addi T1,T1, -1\r
+ srawi. L, T1, 5 /* L = (K-1) / 32 */\r
+ ZERO1x1 \r
+ ble ZGEMM_L1x1_SUB0 \r
+\r
+ZGEMM_L1x1_LOOP_START:\r
+\r
+ LOAD1x1 0 \r
+ mtctr L\r
+\r
+ MY_ALIGN\r
+ZGEMM_L1x1_LOOP: \r
+ KERNEL1x1_L 16,32,0,0\r
+ KERNEL1x1_L 16,32,1,0 \r
+ KERNEL1x1_L 16,32,2,0\r
+ KERNEL1x1_L 16,32,3,0 \r
+ KERNEL1x1_L 16,32,4,0\r
+ KERNEL1x1_L 16,32,5,0 \r
+ KERNEL1x1_L 16,32,6,0\r
+ KERNEL1x1_L 16,32,7,0 \r
+ KERNEL1x1_L 16,32,8,0\r
+ KERNEL1x1_L 16,32,9,0\r
+ KERNEL1x1_L 16,32,10,0\r
+ KERNEL1x1_L 16,32,11,0 \r
+ KERNEL1x1_L 16,32,12,0\r
+ KERNEL1x1_L 16,32,13,0\r
+ KERNEL1x1_L 16,32,14,0\r
+ KERNEL1x1_L 16,32,15,1 \r
+ bdnz ZGEMM_L1x1_LOOP\r
+ MY_ALIGN \r
+ZGEMM_L1x1_LOOP_END:\r
+ END1x1 AO, BO, 16, 32 \r
+ \r
+ b ZGEMM_L1x1_SUB1\r
+ \r
+ZGEMM_L1x1_SUB0:\r
+\r
+ andi. L, K, 63\r
+ \r
+ b ZGEMM_L1x1_SUB2\r
+\r
+ZGEMM_L1x1_SUB1:\r
+\r
+ andi. L, T1, 31\r
+ ble ZGEMM_L1x1_SAVE\r
+\r
+ZGEMM_L1x1_SUB2:\r
+ srawi. T1,L, 3\r
+ ble ZGEMM_L1x1_SUB2_4\r
+ mtctr T1\r
+ MY_ALIGN\r
+ZGEMM_L1x1_SUB2_LOOP:\r
+ LOAD1x1 0 \r
+ KERNEL1x1_L 16,32, 0,0\r
+ KERNEL1x1_L 16,32, 1,0\r
+ KERNEL1x1_L 16,32, 2,0\r
+ KERNEL1x1_E 16,32, 3,1\r
+ bdnz ZGEMM_L1x1_SUB2_LOOP \r
+ MY_ALIGN \r
+ZGEMM_L1x1_SUB2_4:\r
+ andi. T1,L, 4\r
+ ble ZGEMM_L1x1_SUB2_2\r
+ LOAD1x1 0 \r
+ KERNEL1x1_L 16,32, 0,0\r
+ KERNEL1x1_E 16,32, 1,1\r
+ MY_ALIGN\r
+ZGEMM_L1x1_SUB2_2:\r
+ andi. T1,L, 2\r
+ ble ZGEMM_L1x1_SUB2_1\r
+ LOAD1x1 0 \r
+ KERNEL1x1_E 16,32, 0,1\r
+ MY_ALIGN \r
+ZGEMM_L1x1_SUB2_1:\r
+ andi. T1,L, 1\r
+ ble ZGEMM_L1x1_SAVE \r
+ KERNEL1x1 \r
+\r
+ZGEMM_L1x1_SAVE:\r
+\r
+ SAVE1x1\r
+\r
+ZGEMM_L1x1_END:\r
+\r
+ZGEMM_L1_END:\r
--- /dev/null
+/***************************************************************************\r
+Copyright (c) 2013-2019, The OpenBLAS Project\r
+All rights reserved.\r
+Redistribution and use in source and binary forms, with or without\r
+modification, are permitted provided that the following conditions are\r
+met:\r
+1. Redistributions of source code must retain the above copyright\r
+notice, this list of conditions and the following disclaimer.\r
+2. Redistributions in binary form must reproduce the above copyright\r
+notice, this list of conditions and the following disclaimer in\r
+the documentation and/or other materials provided with the\r
+distribution.\r
+3. Neither the name of the OpenBLAS project nor the names of\r
+its contributors may be used to endorse or promote products\r
+derived from this software without specific prior written permission.\r
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"\r
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\r
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE\r
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE\r
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\r
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\r
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\r
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\r
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE\r
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
+*****************************************************************************/\r
+\r
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)\r
+\r
+ #define XSFADD_R1 xsadddp\r
+ #define XSFADD_R2 xssubdp\r
+ #define XSFADD_I1 xsadddp\r
+ #define XSFADD_I2 xsadddp\r
+\r
+#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)\r
+\r
+ #define XSFADD_R1 xsadddp\r
+ #define XSFADD_R2 xsadddp\r
+ #define XSFADD_I1 xssubdp\r
+ #define XSFADD_I2 xsadddp\r
+\r
+#elif defined(NC) || defined(TC) || defined(NR) || defined(TR)\r
+\r
+ #define XSFADD_R1 xsadddp\r
+ #define XSFADD_R2 xsadddp\r
+ #define XSFADD_I1 xsadddp\r
+ #define XSFADD_I2 xssubdp\r
+\r
+#else // CC || CR || RC || RR\r
+\r
+ #define XSFADD_R1 xsadddp\r
+ #define XSFADD_R2 xssubdp\r
+ #define XSFADD_I1 xssubdp\r
+ #define XSFADD_I2 xssubdp\r
+\r
+#endif\r
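+\r
+/* Sign selection sketch (derived from the accumulators below): with\r
+   a = ar + i*ai from A and b = br + i*bi from B, the kernels accumulate\r
+   the elementwise products a*br and a*bi, and XSFADD_R1/R2, XSFADD_I1/I2\r
+   combine them per conjugation class:\r
+\r
+       re = ar*br -/+ ai*bi   (R2 = xssubdp for the NN/NT/TN/TT and\r
+                               CC/CR/RC/RR classes, xsadddp otherwise)\r
+       im = ai*br +/- ar*bi   (I1 or I2 flips to xssubdp on the\r
+                               conjugated side)\r
+\r
+   e.g. conj(a)*b (the CN/CT/RN/RT class) needs re = ar*br + ai*bi and\r
+   im = ar*bi - ai*br, hence XSFADD_R2 = xsadddp and XSFADD_I1 = xssubdp. */\r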
+\r
+.macro AGGREGATE_INTO_COMPLEX FIRST_V, SECOND_V, OUTPUT_V\r
+ AGGREGATE_INTO_COMPLEX_INNER \FIRST_V, \SECOND_V, \OUTPUT_V, vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7\r
+.endm\r
+\r
+.macro AGGREGATE_INTO_COMPLEX_INNER FIRST_V, SECOND_V, OUTPUT_V ,TEMP1,TEMP2,TEMP3,TEMP4,TEMP5,TEMP6,TEMP7,TEMP8\r
+ xxlxor \TEMP1, \TEMP1, \TEMP1\r
+ xxlxor \TEMP2, \TEMP2, \TEMP2\r
+ \r
+ xxswapd \SECOND_V, \SECOND_V // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB \r
+\r
+ XSFADD_I1 \TEMP2, \TEMP2, \FIRST_V // imagA*realB\r
+ XSFADD_I2 \TEMP2, \TEMP2, \SECOND_V // realA*imagB\r
+\r
+ xxswapd \FIRST_V, \FIRST_V //imagA*realB, realA*realB -> realA*realB, imagA*realB \r
+ xxswapd \SECOND_V, \SECOND_V // reverse to original imagA*imagB, realA*imagB \r
+\r
+ XSFADD_R1 \TEMP1, \TEMP1, \FIRST_V // realA*realB\r
+ XSFADD_R2 \TEMP1, \TEMP1, \SECOND_V // imagA*imagB\r
+\r
+ xsmuldp \TEMP3, \TEMP2, alpha_i // imag*alpha_i\r
+ xsmuldp \TEMP4, \TEMP2, alpha_r // imag*alpha_r \r
+ xsmuldp \TEMP5, \TEMP1, alpha_r // real*alpha_r \r
+ xsmuldp \TEMP6, \TEMP1, alpha_i // real*alpha_i\r
+\r
+ xssubdp \TEMP7, \TEMP5, \TEMP3 // real*alpha_r - imag*alpha_i\r
+ xsadddp \TEMP8, \TEMP6, \TEMP4 // real*alpha_i + imag*alpha_r\r
+ xxpermdi \OUTPUT_V, \TEMP8, \TEMP7, 0 // merge real and imag part\r
+.endm\r
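+\r
+/* Net effect: with t = (TEMP1, TEMP2) holding the real and imaginary\r
+   parts of the accumulated product sum, OUTPUT_V receives alpha*t:\r
+\r
+       out_r = TEMP1*alpha_r - TEMP2*alpha_i\r
+       out_i = TEMP1*alpha_i + TEMP2*alpha_r\r
+\r
+   and the final xxpermdi packs (out_i : out_r) back into the doubleword\r
+   order that lxv/stxv use for a complex double in memory. */\r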
+\r
+/**********************************************************************************************\r
+* Macros for N=2 and M=8\r
+**********************************************************************************************/\r
+\r
+#define unit_size 16\r
+#define DISP32(ind,disp) (ind*unit_size*32+disp)\r
+#define DISP16(ind,disp) (ind*unit_size*16+disp)\r
+#define DISP8(ind,disp) (ind*unit_size*8+disp)\r
+#define DISP4(ind,disp) (ind*unit_size*4+disp)\r
+#define DISP2(ind,disp) (ind*unit_size*2+disp)\r
+#define DISP1(ind,disp) (ind*unit_size+disp)\r
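+\r
+/* unit_size is sizeof(double complex) = 16 bytes, so DISPn(ind,disp)\r
+   = ind*n*16 + disp is the byte offset of unroll step `ind` when each\r
+   step covers n complex elements; e.g. DISP16(1,32) = 256 + 32 = 288. */\r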
+\r
+.macro Zero2x8\r
+ xxlxor vs32, vs32, vs32\r
+ xxlxor vs33, vs33, vs33\r
+ xxlxor vs34, vs34, vs34\r
+ xxlxor vs35, vs35, vs35\r
+ xxlxor vs36, vs36, vs36\r
+ xxlxor vs37, vs37, vs37\r
+ xxlxor vs38, vs38, vs38\r
+ xxlxor vs39, vs39, vs39\r
+ xxlxor vs40, vs40, vs40\r
+ xxlxor vs41, vs41, vs41\r
+ xxlxor vs42, vs42, vs42\r
+ xxlxor vs43, vs43, vs43\r
+ xxlxor vs44, vs44, vs44\r
+ xxlxor vs45, vs45, vs45\r
+ xxlxor vs46, vs46, vs46\r
+ xxlxor vs47, vs47, vs47\r
+ xxlxor vs48, vs48, vs48\r
+ xxlxor vs49, vs49, vs49\r
+ xxlxor vs50, vs50, vs50\r
+ xxlxor vs51, vs51, vs51 \r
+ xxlxor vs52, vs52, vs52\r
+ xxlxor vs53, vs53, vs53\r
+ xxlxor vs54, vs54, vs54\r
+ xxlxor vs55, vs55, vs55 \r
+ xxlxor vs56, vs56, vs56\r
+ xxlxor vs57, vs57, vs57\r
+ xxlxor vs58, vs58, vs58\r
+ xxlxor vs59, vs59, vs59 \r
+ xxlxor vs60, vs60, vs60\r
+ xxlxor vs61, vs61, vs61\r
+ xxlxor vs62, vs62, vs62\r
+ xxlxor vs63, vs63, vs63 \r
+.endm\r
+\r
+.macro LOAD2x8 Zero\r
+\r
+ lxv vs16, 0(BO) // load real part from B\r
+ lxv vs17, 16(BO) // load imag part from B\r
+ lxv vs18, 32(BO) // load real part from B\r
+ lxv vs19, 48(BO) // load imag part from B\r
+\r
+ lxv vs0, 0(AO) // load real,imag from A\r
+ lxv vs1, 16(AO) // load real,imag from A\r
+ lxv vs2, 32(AO) // load real,imag from A\r
+ lxv vs3, 48(AO) // load real,imag from A\r
+\r
+ lxv vs4, 64(AO) // load real,imag from A\r
+ lxv vs5, 80(AO) // load real,imag from A\r
+ lxv vs6, 96(AO) // load real,imag from A\r
+ lxv vs7, 112(AO) // load real,imag from A\r
+\r
+.if \Zero==1\r
+ Zero2x8 \r
+.endif\r
+\r
+.endm\r
+\r
+.macro END2x8_NORMAL\r
+ END2x8 AO,BO,128,64\r
+.endm\r
+\r
+.macro END2x8 AREG, BREG, OffsetA, OffsetB\r
+\r
+.if \OffsetB != 0 \r
+ addi \BREG, \BREG, \OffsetB \r
+.endif\r
+.if \OffsetA != 0 \r
+ addi \AREG, \AREG, \OffsetA \r
+.endif \r
+\r
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real\r
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag\r
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real\r
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag\r
+ xvmaddadp vs36, vs2, vs16 // real*real, imag*real\r
+ xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag\r
+ xvmaddadp vs38, vs3, vs16 // real*real, imag*real\r
+ xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag\r
+ xvmaddadp vs40, vs4, vs16 // real*real, imag*real\r
+ xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag\r
+ xvmaddadp vs42, vs5, vs16 // real*real, imag*real\r
+ xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag\r
+ xvmaddadp vs44, vs6, vs16 // real*real, imag*real\r
+ xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag\r
+ xvmaddadp vs46, vs7, vs16 // real*real, imag*real\r
+ xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag\r
+\r
+ xvmaddadp vs48, vs0, vs18 // real*real, imag*real\r
+ xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag\r
+ xvmaddadp vs50, vs1, vs18 // real*real, imag*real\r
+ xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag\r
+ xvmaddadp vs52, vs2, vs18 // real*real, imag*real\r
+ xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag\r
+ xvmaddadp vs54, vs3, vs18 // real*real, imag*real\r
+ xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag\r
+ xvmaddadp vs56, vs4, vs18 // real*real, imag*real\r
+ xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag\r
+ xvmaddadp vs58, vs5, vs18 // real*real, imag*real\r
+ xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag\r
+ xvmaddadp vs60, vs6, vs18 // real*real, imag*real\r
+ xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag\r
+ xvmaddadp vs62, vs7, vs18 // real*real, imag*real\r
+ xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag\r
+\r
+.endm\r
+\r
+.macro KERNEL2x8_L OffsetA,OffsetB, Index,IsLast \r
+ KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0\r
+.endm\r
+\r
+.macro KERNEL2x8_E OffsetA,OffsetB, Index,IsLast \r
+ KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1\r
+.endm\r
+\r
+.macro KERNEL2x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete\r
+\r
+ lxv vs8, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs9, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs10, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs11, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A \r
+\r
+ lxv vs12, DISP16(\Index, 64 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs13, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs14, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs15, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A\r
+\r
+ lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B\r
+ lxv vs21, DISP8(\Index,16+\OffsetB)(\BREG) // load imag part from B\r
+ lxv vs22, DISP8(\Index,32+\OffsetB)(\BREG) // load real part from B\r
+ lxv vs23, DISP8(\Index,48+\OffsetB)(\BREG) // load imag part from B\r
+\r
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real\r
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag\r
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real\r
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag\r
+ xvmaddadp vs36, vs2, vs16 // real*real, imag*real\r
+ xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag\r
+ xvmaddadp vs38, vs3, vs16 // real*real, imag*real\r
+ xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag\r
+ xvmaddadp vs40, vs4, vs16 // real*real, imag*real\r
+ xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag\r
+ xvmaddadp vs42, vs5, vs16 // real*real, imag*real\r
+ xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag\r
+ xvmaddadp vs44, vs6, vs16 // real*real, imag*real\r
+ xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag\r
+ xvmaddadp vs46, vs7, vs16 // real*real, imag*real\r
+ xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag\r
+\r
+ xvmaddadp vs48, vs0, vs18 // real*real, imag*real\r
+ xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag\r
+ xvmaddadp vs50, vs1, vs18 // real*real, imag*real\r
+ xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag\r
+ xvmaddadp vs52, vs2, vs18 // real*real, imag*real\r
+ xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag\r
+ xvmaddadp vs54, vs3, vs18 // real*real, imag*real\r
+ xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag\r
+ xvmaddadp vs56, vs4, vs18 // real*real, imag*real\r
+ xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag\r
+ xvmaddadp vs58, vs5, vs18 // real*real, imag*real\r
+ xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag\r
+ xvmaddadp vs60, vs6, vs18 // real*real, imag*real\r
+ xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag\r
+ xvmaddadp vs62, vs7, vs18 // real*real, imag*real\r
+ xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag\r
+\r
+.if \Complete==0\r
+ lxv vs0, DISP16(\Index,128 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs1, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs2, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs3, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A\r
+\r
+ lxv vs4, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs5, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs6, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs7, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A\r
+\r
+ lxv vs16, DISP8(\Index, 64+\OffsetB)(\BREG) // load real part from B\r
+ lxv vs17, DISP8(\Index,64+16+\OffsetB)(\BREG) // load imag part from B\r
+ lxv vs18, DISP8(\Index,64+32+\OffsetB)(\BREG) // load real part from B\r
+ lxv vs19, DISP8(\Index,64+48+\OffsetB)(\BREG) // load imag part from B\r
+.endif\r
+\r
+.if \IsLast==1 \r
+.if \Complete==1\r
+ addi \AREG, \AREG, DISP16(\Index,128+\OffsetA)\r
+ addi \BREG, \BREG, DISP8(\Index,64+\OffsetB)\r
+.else \r
+ addi \AREG, \AREG, DISP16(\Index,256)\r
+ addi \BREG, \BREG, DISP8(\Index,128)\r
+.endif\r
+.endif \r
+\r
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real\r
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag\r
+ xvmaddadp vs34, vs9, vs20 // real*real, imag*real\r
+ xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag\r
+ xvmaddadp vs36, vs10, vs20 // real*real, imag*real\r
+ xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag\r
+ xvmaddadp vs38, vs11, vs20 // real*real, imag*real\r
+ xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag\r
+ xvmaddadp vs40, vs12, vs20 // real*real, imag*real\r
+ xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag\r
+ xvmaddadp vs42, vs13, vs20 // real*real, imag*real\r
+ xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag\r
+ xvmaddadp vs44, vs14, vs20 // real*real, imag*real\r
+ xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag\r
+ xvmaddadp vs46, vs15, vs20 // real*real, imag*real\r
+ xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag\r
+\r
+ xvmaddadp vs48, vs8, vs22 // real*real, imag*real\r
+ xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag\r
+ xvmaddadp vs50, vs9, vs22 // real*real, imag*real\r
+ xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag\r
+ xvmaddadp vs52, vs10, vs22 // real*real, imag*real\r
+ xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag\r
+ xvmaddadp vs54, vs11, vs22 // real*real, imag*real\r
+ xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag\r
+ xvmaddadp vs56, vs12, vs22 // real*real, imag*real\r
+ xvmaddadp vs57, vs12, vs23 // real*imag, imag*imag\r
+ xvmaddadp vs58, vs13, vs22 // real*real, imag*real\r
+ xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag\r
+ xvmaddadp vs60, vs14, vs22 // real*real, imag*real\r
+ xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag\r
+ xvmaddadp vs62, vs15, vs22 // real*real, imag*real\r
+ xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag\r
+\r
+.endm\r
+\r
+.macro KERNEL2x8 \r
+ LOAD2x8 0\r
+ END2x8 AO, BO, 128,64 \r
+.endm\r
+\r
+.macro SAVE2x8\r
+\r
+ mr T1, CO\r
+ addi T2, T1, 64\r
+\r
+#ifndef TRMMKERNEL\r
+\r
+ lxv vs16, 0(T1)\r
+ lxv vs17, 16(T1)\r
+ lxv vs18, 32(T1)\r
+ lxv vs19, 48(T1)\r
+ lxv vs20, 0(T2)\r
+ lxv vs21, 16(T2)\r
+ lxv vs22, 32(T2)\r
+ lxv vs23, 48(T2)\r
+\r
+#endif\r
+\r
+ AGGREGATE_INTO_COMPLEX vs32,vs33,vs8\r
+ AGGREGATE_INTO_COMPLEX vs34,vs35,vs9\r
+ AGGREGATE_INTO_COMPLEX vs36,vs37,vs10\r
+ AGGREGATE_INTO_COMPLEX vs38,vs39,vs11\r
+ AGGREGATE_INTO_COMPLEX vs40,vs41,vs12\r
+ AGGREGATE_INTO_COMPLEX vs42,vs43,vs13\r
+ AGGREGATE_INTO_COMPLEX vs44,vs45,vs14\r
+ AGGREGATE_INTO_COMPLEX vs46,vs47,vs15\r
+\r
+#ifndef TRMMKERNEL\r
+\r
+ xvadddp vs8, vs8, vs16\r
+ xvadddp vs9, vs9, vs17\r
+ xvadddp vs10, vs10, vs18\r
+ xvadddp vs11, vs11, vs19\r
+ xvadddp vs12, vs12, vs20\r
+ xvadddp vs13, vs13, vs21\r
+ xvadddp vs14, vs14, vs22\r
+ xvadddp vs15, vs15, vs23\r
+\r
+#endif\r
+\r
+ stxv vs8, 0(T1)\r
+ stxv vs9, 16(T1)\r
+ stxv vs10, 32(T1)\r
+ stxv vs11, 48(T1)\r
+ stxv vs12, 0(T2)\r
+ stxv vs13, 16(T2)\r
+ stxv vs14, 32(T2)\r
+ stxv vs15, 48(T2)\r
+\r
+ add T1, T1, LDC\r
+ add T2, T2, LDC\r
+\r
+#ifndef TRMMKERNEL\r
+\r
+ lxv vs16, 0(T1)\r
+ lxv vs17, 16(T1)\r
+ lxv vs18, 32(T1)\r
+ lxv vs19, 48(T1)\r
+ lxv vs20, 0(T2)\r
+ lxv vs21, 16(T2)\r
+ lxv vs22, 32(T2)\r
+ lxv vs23, 48(T2)\r
+\r
+#endif\r
+\r
+ AGGREGATE_INTO_COMPLEX vs48,vs49,vs8\r
+ AGGREGATE_INTO_COMPLEX vs50,vs51,vs9\r
+ AGGREGATE_INTO_COMPLEX vs52,vs53,vs10\r
+ AGGREGATE_INTO_COMPLEX vs54,vs55,vs11\r
+ AGGREGATE_INTO_COMPLEX vs56,vs57,vs12\r
+ AGGREGATE_INTO_COMPLEX vs58,vs59,vs13\r
+ AGGREGATE_INTO_COMPLEX vs60,vs61,vs14\r
+ AGGREGATE_INTO_COMPLEX vs62,vs63,vs15\r
+\r
+#ifndef TRMMKERNEL\r
+\r
+ xvadddp vs8, vs8, vs16\r
+ xvadddp vs9, vs9, vs17\r
+ xvadddp vs10, vs10, vs18\r
+ xvadddp vs11, vs11, vs19\r
+ xvadddp vs12, vs12, vs20\r
+ xvadddp vs13, vs13, vs21\r
+ xvadddp vs14, vs14, vs22\r
+ xvadddp vs15, vs15, vs23\r
+\r
+#endif\r
+\r
+ stxv vs8, 0(T1)\r
+ stxv vs9, 16(T1)\r
+ stxv vs10, 32(T1)\r
+ stxv vs11, 48(T1)\r
+ stxv vs12, 0(T2)\r
+ stxv vs13, 16(T2)\r
+ stxv vs14, 32(T2)\r
+ stxv vs15, 48(T2)\r
+ \r
+ addi CO, CO, 128\r
+\r
+.endm\r
+\r
+/**********************************************************************************************\r
+* Macros for N=2 and M=4\r
+**********************************************************************************************/\r
+\r
+.macro Zero2x4\r
+ xxlxor vs32, vs32, vs32\r
+ xxlxor vs33, vs33, vs33\r
+ xxlxor vs34, vs34, vs34\r
+ xxlxor vs35, vs35, vs35\r
+ xxlxor vs36, vs36, vs36\r
+ xxlxor vs37, vs37, vs37\r
+ xxlxor vs38, vs38, vs38\r
+ xxlxor vs39, vs39, vs39\r
+ xxlxor vs40, vs40, vs40\r
+ xxlxor vs41, vs41, vs41\r
+ xxlxor vs42, vs42, vs42\r
+ xxlxor vs43, vs43, vs43\r
+ xxlxor vs44, vs44, vs44\r
+ xxlxor vs45, vs45, vs45\r
+ xxlxor vs46, vs46, vs46\r
+ xxlxor vs47, vs47, vs47 \r
+.endm\r
+\r
+.macro LOAD2x4 Zero\r
+\r
+ lxv vs16, 0(BO) // load real part from B\r
+ lxv vs17, 16(BO) // load imag part from B\r
+ lxv vs18, 32(BO) // load real part from B\r
+ lxv vs19, 48(BO) // load imag part from B\r
+\r
+ lxv vs0, 0(AO) // load real,imag from A\r
+ lxv vs1, 16(AO) // load real,imag from A\r
+ lxv vs2, 32(AO) // load real,imag from A\r
+ lxv vs3, 48(AO) // load real,imag from A\r
+ \r
+.if \Zero==1\r
+ Zero2x4 \r
+.endif\r
+\r
+.endm\r
+\r
+.macro END2x4_NORMAL\r
+ END2x4 AO,BO,64,64\r
+.endm\r
+\r
+.macro END2x4 AREG, BREG, OffsetA, OffsetB\r
+\r
+.if \OffsetB != 0 \r
+ addi \BREG, \BREG, \OffsetB \r
+.endif\r
+.if \OffsetA != 0 \r
+ addi \AREG, \AREG, \OffsetA \r
+.endif \r
+\r
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real\r
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag\r
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real\r
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag\r
+ xvmaddadp vs36, vs2, vs16 // real*real, imag*real\r
+ xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag\r
+ xvmaddadp vs38, vs3, vs16 // real*real, imag*real\r
+ xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag\r
+\r
+ xvmaddadp vs40, vs0, vs18 // real*real, imag*real\r
+ xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag\r
+ xvmaddadp vs42, vs1, vs18 // real*real, imag*real\r
+ xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag\r
+ xvmaddadp vs44, vs2, vs18 // real*real, imag*real\r
+ xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag\r
+ xvmaddadp vs46, vs3, vs18 // real*real, imag*real\r
+ xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag\r
+\r
+.endm\r
+\r
+.macro KERNEL2x4_L OffsetA,OffsetB, Index,IsLast \r
+ KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0\r
+.endm\r
+\r
+.macro KERNEL2x4_E OffsetA,OffsetB, Index,IsLast \r
+ KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1\r
+.endm\r
+\r
+.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete\r
+\r
+ lxv vs8, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs9, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs10, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs11, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A\r
+\r
+ lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B\r
+ lxv vs21, DISP8(\Index,16+\OffsetB)(\BREG) // load imag part from B\r
+ lxv vs22, DISP8(\Index,32+\OffsetB)(\BREG) // load real part from B\r
+ lxv vs23, DISP8(\Index,48+\OffsetB)(\BREG) // load imag part from B\r
+\r
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real\r
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag\r
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real\r
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag\r
+ xvmaddadp vs36, vs2, vs16 // real*real, imag*real\r
+ xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag\r
+ xvmaddadp vs38, vs3, vs16 // real*real, imag*real\r
+ xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag\r
+\r
+ xvmaddadp vs40, vs0, vs18 // real*real, imag*real\r
+ xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag\r
+ xvmaddadp vs42, vs1, vs18 // real*real, imag*real\r
+ xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag\r
+ xvmaddadp vs44, vs2, vs18 // real*real, imag*real\r
+ xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag\r
+ xvmaddadp vs46, vs3, vs18 // real*real, imag*real\r
+ xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag\r
+\r
+.if \Complete==0\r
+ lxv vs0, DISP8(\Index,64+ \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs1, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs2, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs3, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A \r
+\r
+ lxv vs16, DISP8(\Index, 64+\OffsetB)(\BREG) // load real part from B\r
+ lxv vs17, DISP8(\Index,64+16+\OffsetB)(\BREG) // load imag part from B\r
+ lxv vs18, DISP8(\Index,64+32+\OffsetB)(\BREG) // load real part from B\r
+ lxv vs19, DISP8(\Index,64+48+\OffsetB)(\BREG) // load imag part from B\r
+.endif\r
+\r
+.if \IsLast==1 \r
+.if \Complete==1\r
+ addi \AREG, \AREG, DISP8(\Index,64+\OffsetA)\r
+ addi \BREG, \BREG, DISP8(\Index,64+\OffsetB)\r
+.else \r
+ addi \AREG, \AREG, DISP8(\Index,128)\r
+ addi \BREG, \BREG, DISP8(\Index,128)\r
+.endif\r
+.endif \r
+\r
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real\r
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag\r
+ xvmaddadp vs34, vs9, vs20 // real*real, imag*real\r
+ xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag\r
+ xvmaddadp vs36, vs10, vs20 // real*real, imag*real\r
+ xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag\r
+ xvmaddadp vs38, vs11, vs20 // real*real, imag*real\r
+ xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag\r
+ \r
+ xvmaddadp vs40, vs8, vs22 // real*real, imag*real\r
+ xvmaddadp vs41, vs8, vs23 // real*imag, imag*imag\r
+ xvmaddadp vs42, vs9, vs22 // real*real, imag*real\r
+ xvmaddadp vs43, vs9, vs23 // real*imag, imag*imag\r
+ xvmaddadp vs44, vs10, vs22 // real*real, imag*real\r
+ xvmaddadp vs45, vs10, vs23 // real*imag, imag*imag\r
+ xvmaddadp vs46, vs11, vs22 // real*real, imag*real\r
+ xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag\r
+\r
+.endm\r
+\r
+.macro KERNEL2x4 \r
+ LOAD2x4 0\r
+ END2x4 AO, BO, 64,64 \r
+.endm\r
+\r
+.macro SAVE2x4\r
+\r
+ mr T1, CO\r
+\r
+#ifndef TRMMKERNEL\r
+\r
+ lxv vs16, 0(T1)\r
+ lxv vs17, 16(T1)\r
+ lxv vs18, 32(T1)\r
+ lxv vs19, 48(T1)\r
+\r
+#endif\r
+\r
+ AGGREGATE_INTO_COMPLEX vs32,vs33,vs8\r
+ AGGREGATE_INTO_COMPLEX vs34,vs35,vs9\r
+ AGGREGATE_INTO_COMPLEX vs36,vs37,vs10\r
+ AGGREGATE_INTO_COMPLEX vs38,vs39,vs11\r
+\r
+#ifndef TRMMKERNEL\r
+\r
+ xvadddp vs8, vs8, vs16\r
+ xvadddp vs9, vs9, vs17\r
+ xvadddp vs10, vs10, vs18\r
+ xvadddp vs11, vs11, vs19\r
+\r
+#endif\r
+\r
+ stxv vs8, 0(T1)\r
+ stxv vs9, 16(T1)\r
+ stxv vs10, 32(T1)\r
+ stxv vs11, 48(T1)\r
+\r
+ add T1, T1, LDC\r
+\r
+#ifndef TRMMKERNEL\r
+\r
+ lxv vs16, 0(T1)\r
+ lxv vs17, 16(T1)\r
+ lxv vs18, 32(T1)\r
+ lxv vs19, 48(T1)\r
+\r
+#endif\r
+\r
+ AGGREGATE_INTO_COMPLEX vs40,vs41,vs8\r
+ AGGREGATE_INTO_COMPLEX vs42,vs43,vs9\r
+ AGGREGATE_INTO_COMPLEX vs44,vs45,vs10\r
+ AGGREGATE_INTO_COMPLEX vs46,vs47,vs11\r
+\r
+#ifndef TRMMKERNEL\r
+\r
+ xvadddp vs8, vs8, vs16\r
+ xvadddp vs9, vs9, vs17\r
+ xvadddp vs10, vs10, vs18\r
+ xvadddp vs11, vs11, vs19\r
+\r
+#endif\r
+\r
+ stxv vs8, 0(T1)\r
+ stxv vs9, 16(T1)\r
+ stxv vs10, 32(T1)\r
+ stxv vs11, 48(T1)\r
+ \r
+ addi CO, CO, 64\r
+\r
+.endm\r
+\r
+/**********************************************************************************************\r
+* Macros for N=2 and M=2\r
+**********************************************************************************************/\r
+\r
+.macro Zero2x2\r
+ xxlxor vs32, vs32, vs32\r
+ xxlxor vs33, vs33, vs33\r
+ xxlxor vs34, vs34, vs34\r
+ xxlxor vs35, vs35, vs35\r
+ xxlxor vs36, vs36, vs36\r
+ xxlxor vs37, vs37, vs37\r
+ xxlxor vs38, vs38, vs38\r
+ xxlxor vs39, vs39, vs39 \r
+.endm\r
+\r
+.macro LOAD2x2 Zero\r
+\r
+ lxv vs16, 0(BO) // load real part from B\r
+ lxv vs17, 16(BO) // load imag part from B\r
+ lxv vs18, 32(BO) // load real part from B\r
+ lxv vs19, 48(BO) // load imag part from B\r
+\r
+ lxv vs0, 0(AO) // load real,imag from A\r
+ lxv vs1, 16(AO) // load real,imag from A \r
+ \r
+.if \Zero==1\r
+ Zero2x2 \r
+.endif\r
+\r
+.endm\r
+\r
+.macro END2x2_NORMAL\r
+ END2x2 AO,BO,32,64\r
+.endm\r
+\r
+.macro END2x2 AREG, BREG, OffsetA, OffsetB\r
+\r
+.if \OffsetB != 0 \r
+ addi \BREG, \BREG, \OffsetB \r
+.endif\r
+.if \OffsetA != 0 \r
+ addi \AREG, \AREG, \OffsetA \r
+.endif \r
+\r
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real\r
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag\r
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real\r
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag \r
+\r
+ xvmaddadp vs36, vs0, vs18 // real*real, imag*real\r
+ xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag\r
+ xvmaddadp vs38, vs1, vs18 // real*real, imag*real\r
+ xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag \r
+ \r
+.endm\r
+\r
+.macro KERNEL2x2_L OffsetA,OffsetB, Index,IsLast \r
+ KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0\r
+.endm\r
+\r
+.macro KERNEL2x2_E OffsetA,OffsetB, Index,IsLast \r
+ KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1\r
+.endm\r
+\r
+.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete\r
+\r
+ lxv vs8, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs9, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A\r
+\r
+ lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B\r
+ lxv vs21, DISP8(\Index,16+\OffsetB)(\BREG) // load imag part from B\r
+ lxv vs22, DISP8(\Index,32+\OffsetB)(\BREG) // load real part from B\r
+ lxv vs23, DISP8(\Index,48+\OffsetB)(\BREG) // load imag part from B\r
+\r
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real\r
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag\r
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real\r
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag \r
+\r
+ xvmaddadp vs36, vs0, vs18 // real*real, imag*real\r
+ xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag\r
+ xvmaddadp vs38, vs1, vs18 // real*real, imag*real\r
+ xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag \r
+\r
+.if \Complete==0\r
+ lxv vs0, DISP4(\Index,32 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs1, DISP4(\Index,48+ \OffsetA)(\AREG) // load real,imag from A \r
+\r
+ lxv vs16, DISP8(\Index, 64+\OffsetB)(\BREG) // load real part from B\r
+ lxv vs17, DISP8(\Index,64+16+\OffsetB)(\BREG) // load imag part from B\r
+ lxv vs18, DISP8(\Index,64+32+\OffsetB)(\BREG) // load real part from B\r
+ lxv vs19, DISP8(\Index,64+48+\OffsetB)(\BREG) // load imag part from B\r
+.endif\r
+\r
+.if \IsLast==1 \r
+.if \Complete==1\r
+ addi \AREG, \AREG, DISP4(\Index,32+\OffsetA)\r
+ addi \BREG, \BREG, DISP8(\Index,64+\OffsetB)\r
+.else \r
+ addi \AREG, \AREG, DISP4(\Index,64)\r
+ addi \BREG, \BREG, DISP8(\Index,128)\r
+.endif\r
+.endif \r
+\r
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real\r
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag\r
+ xvmaddadp vs34, vs9, vs20 // real*real, imag*real\r
+ xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag \r
+ \r
+ xvmaddadp vs36, vs8, vs22 // real*real, imag*real\r
+ xvmaddadp vs37, vs8, vs23 // real*imag, imag*imag\r
+ xvmaddadp vs38, vs9, vs22 // real*real, imag*real\r
+ xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag \r
+ \r
+.endm\r
+\r
+.macro KERNEL2x2 \r
+ LOAD2x2 0\r
+ END2x2 AO, BO, 32,64 \r
+.endm\r
+\r
+.macro SAVE2x2\r
+\r
+ mr T1, CO\r
+\r
+#ifndef TRMMKERNEL\r
+\r
+ lxv vs16, 0(T1)\r
+ lxv vs17, 16(T1)\r
+\r
+#endif\r
+\r
+ AGGREGATE_INTO_COMPLEX vs32,vs33,vs8\r
+ AGGREGATE_INTO_COMPLEX vs34,vs35,vs9 \r
+\r
+#ifndef TRMMKERNEL\r
+\r
+ xvadddp vs8, vs8, vs16\r
+ xvadddp vs9, vs9, vs17\r
+\r
+#endif\r
+\r
+ stxv vs8, 0(T1)\r
+ stxv vs9, 16(T1)\r
+\r
+ add T1, T1, LDC\r
+\r
+#ifndef TRMMKERNEL\r
+\r
+ lxv vs16, 0(T1)\r
+ lxv vs17, 16(T1)\r
+\r
+#endif\r
+\r
+ AGGREGATE_INTO_COMPLEX vs36,vs37,vs8\r
+ AGGREGATE_INTO_COMPLEX vs38,vs39,vs9\r
+\r
+#ifndef TRMMKERNEL\r
+\r
+ xvadddp vs8, vs8, vs16\r
+ xvadddp vs9, vs9, vs17\r
+\r
+#endif\r
+\r
+ stxv vs8, 0(T1)\r
+ stxv vs9, 16(T1)\r
+ \r
+ addi CO, CO, 32\r
+\r
+.endm\r
+\r
+/**********************************************************************************************\r
+* Macros for N=2 and M=1\r
+**********************************************************************************************/\r
+\r
+.macro Zero2x1\r
+ xxlxor vs32, vs32, vs32\r
+ xxlxor vs33, vs33, vs33\r
+ xxlxor vs34, vs34, vs34\r
+ xxlxor vs35, vs35, vs35 \r
+.endm\r
+\r
+.macro LOAD2x1 Zero\r
+ lxv vs0, 0(AO) // load real,imag from A \r
+\r
+ lxv vs16, 0(BO) // load real part from B\r
+ lxv vs17, 16(BO) // load imag part from B\r
+ lxv vs18, 32(BO) // load real part from B\r
+ lxv vs19, 48(BO) // load imag part from B\r
+\r
+.if \Zero==1\r
+ Zero2x1 \r
+.endif\r
+\r
+.endm\r
+\r
+.macro END2x1_NORMAL\r
+ END2x1 AO,BO,16,64\r
+.endm\r
+\r
+.macro END2x1 AREG, BREG, OffsetA, OffsetB\r
+\r
+.if \OffsetA != 0 \r
+ addi \AREG, \AREG, \OffsetA \r
+.endif \r
+.if \OffsetB != 0 \r
+ addi \BREG, \BREG, \OffsetB \r
+.endif\r
+\r
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real\r
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag \r
+\r
+ xvmaddadp vs34, vs0, vs18 // real*real, imag*real\r
+ xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag \r
+ \r
+.endm\r
+\r
+.macro KERNEL2x1_L OffsetA,OffsetB, Index,IsLast \r
+ KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0\r
+.endm\r
+\r
+.macro KERNEL2x1_E OffsetA,OffsetB, Index,IsLast \r
+ KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1\r
+.endm\r
+\r
+.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete\r
+\r
+ lxv vs8, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A\r
+\r
+ lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B\r
+ lxv vs21, DISP8(\Index,16+\OffsetB)(\BREG) // load imag part from B\r
+ lxv vs22, DISP8(\Index,32+\OffsetB)(\BREG) // load real part from B\r
+ lxv vs23, DISP8(\Index,48+\OffsetB)(\BREG) // load imag part from B\r
+\r
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real\r
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag \r
+\r
+ xvmaddadp vs34, vs0, vs18 // real*real, imag*real\r
+ xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag \r
+\r
+.if \Complete==0\r
+ lxv vs0, DISP2(\Index,16 + \OffsetA)(\AREG) // load real,imag from A \r
+\r
+ lxv vs16, DISP8(\Index, 64+\OffsetB)(\BREG) // load real part from B\r
+ lxv vs17, DISP8(\Index,64+16+\OffsetB)(\BREG) // load imag part from B\r
+ lxv vs18, DISP8(\Index,64+32+\OffsetB)(\BREG) // load real part from B\r
+ lxv vs19, DISP8(\Index,64+48+\OffsetB)(\BREG) // load imag part from B\r
+.endif\r
+\r
+.if \IsLast==1 \r
+.if \Complete==1\r
+ addi \AREG, \AREG, DISP2(\Index,16+\OffsetA)\r
+ addi \BREG, \BREG, DISP8(\Index,64+\OffsetB)\r
+.else \r
+ addi \AREG, \AREG, DISP2(\Index,32)\r
+ addi \BREG, \BREG, DISP8(\Index,128)\r
+.endif\r
+.endif \r
+\r
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real\r
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag \r
+ \r
+ xvmaddadp vs34, vs8, vs22 // real*real, imag*real\r
+ xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag \r
+ \r
+.endm\r
+\r
+.macro KERNEL2x1 \r
+ LOAD2x1 0\r
+ END2x1 AO, BO, 16,64 \r
+.endm\r
+\r
+.macro SAVE2x1\r
+\r
+ mr T1, CO\r
+#ifndef TRMMKERNEL\r
+ lxv vs16, 0(T1)\r
+#endif\r
+ AGGREGATE_INTO_COMPLEX vs32,vs33,vs8\r
+\r
+#ifndef TRMMKERNEL\r
+ xvadddp vs8, vs8, vs16\r
+#endif\r
+\r
+ stxv vs8, 0(T1)\r
+\r
+ add T1, T1, LDC\r
+\r
+#ifndef TRMMKERNEL\r
+ lxv vs16, 0(T1)\r
+#endif\r
+\r
+ AGGREGATE_INTO_COMPLEX vs34,vs35,vs8\r
+\r
+#ifndef TRMMKERNEL\r
+ xvadddp vs8, vs8, vs16\r
+#endif\r
+\r
+ stxv vs8, 0(T1)\r
+\r
+ addi CO, CO, 16\r
+\r
+.endm\r
+\r
+/**********************************************************************************************\r
+* Macros for N=1 and M=8\r
+**********************************************************************************************/\r
+.macro Zero1x8\r
+ xxlxor vs32, vs32, vs32\r
+ xxlxor vs33, vs33, vs33\r
+ xxlxor vs34, vs34, vs34\r
+ xxlxor vs35, vs35, vs35\r
+ xxlxor vs36, vs36, vs36\r
+ xxlxor vs37, vs37, vs37\r
+ xxlxor vs38, vs38, vs38\r
+ xxlxor vs39, vs39, vs39\r
+ xxlxor vs40, vs40, vs40\r
+ xxlxor vs41, vs41, vs41\r
+ xxlxor vs42, vs42, vs42\r
+ xxlxor vs43, vs43, vs43\r
+ xxlxor vs44, vs44, vs44\r
+ xxlxor vs45, vs45, vs45\r
+ xxlxor vs46, vs46, vs46\r
+ xxlxor vs47, vs47, vs47 \r
+.endm\r
+\r
+.macro LOAD1x8 Zero\r
+\r
+ lxv vs16, 0(BO) // load real part from B\r
+ lxv vs17, 16(BO) // load imag part from B \r
+\r
+ lxv vs0, 0(AO) // load real,imag from A\r
+ lxv vs1, 16(AO) // load real,imag from A\r
+ lxv vs2, 32(AO) // load real,imag from A\r
+ lxv vs3, 48(AO) // load real,imag from A\r
+\r
+ lxv vs4, 64(AO) // load real,imag from A\r
+ lxv vs5, 80(AO) // load real,imag from A\r
+ lxv vs6, 96(AO) // load real,imag from A\r
+ lxv vs7, 112(AO) // load real,imag from A\r
+\r
+.if \Zero==1\r
+ Zero1x8 \r
+.endif\r
+\r
+.endm\r
+\r
+.macro END1x8_NORMAL\r
+ END1x8 AO,BO,128,32\r
+.endm\r
+\r
+.macro END1x8 AREG, BREG, OffsetA, OffsetB\r
+\r
+.if \OffsetB != 0 \r
+ addi \BREG, \BREG, \OffsetB \r
+.endif\r
+.if \OffsetA != 0 \r
+ addi \AREG, \AREG, \OffsetA \r
+.endif \r
+\r
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real\r
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag\r
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real\r
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag\r
+ xvmaddadp vs36, vs2, vs16 // real*real, imag*real\r
+ xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag\r
+ xvmaddadp vs38, vs3, vs16 // real*real, imag*real\r
+ xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag\r
+ xvmaddadp vs40, vs4, vs16 // real*real, imag*real\r
+ xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag\r
+ xvmaddadp vs42, vs5, vs16 // real*real, imag*real\r
+ xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag\r
+ xvmaddadp vs44, vs6, vs16 // real*real, imag*real\r
+ xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag\r
+ xvmaddadp vs46, vs7, vs16 // real*real, imag*real\r
+ xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag\r
+\r
+.endm\r
+\r
+.macro KERNEL1x8_L OffsetA,OffsetB, Index,IsLast \r
+ KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0\r
+.endm\r
+\r
+.macro KERNEL1x8_E OffsetA,OffsetB, Index,IsLast \r
+ KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1\r
+.endm\r
+\r
+.macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete\r
+\r
+ lxv vs8, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs9, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs10, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs11, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A \r
+\r
+ lxv vs12, DISP16(\Index, 64 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs13, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs14, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs15, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A\r
+\r
+ lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real part from B\r
+ lxv vs21, DISP4(\Index,16+\OffsetB)(\BREG) // load imag part from B \r
+\r
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real\r
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag\r
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real\r
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag\r
+ xvmaddadp vs36, vs2, vs16 // real*real, imag*real\r
+ xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag\r
+ xvmaddadp vs38, vs3, vs16 // real*real, imag*real\r
+ xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag\r
+ xvmaddadp vs40, vs4, vs16 // real*real, imag*real\r
+ xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag\r
+ xvmaddadp vs42, vs5, vs16 // real*real, imag*real\r
+ xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag\r
+ xvmaddadp vs44, vs6, vs16 // real*real, imag*real\r
+ xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag\r
+ xvmaddadp vs46, vs7, vs16 // real*real, imag*real\r
+ xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag\r
+\r
+.if \Complete==0\r
+ lxv vs0, DISP16(\Index,128 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs1, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs2, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs3, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A\r
+\r
+ lxv vs4, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs5, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs6, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs7, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A\r
+\r
+ lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real part from B\r
+ lxv vs17, DISP4(\Index,48+\OffsetB)(\BREG) // load imag part from B \r
+.endif\r
+\r
+.if \IsLast==1 \r
+.if \Complete==1\r
+ addi \AREG, \AREG, DISP16(\Index,128+\OffsetA)\r
+ addi \BREG, \BREG, DISP4(\Index,32+\OffsetB)\r
+.else \r
+ addi \AREG, \AREG, DISP16(\Index,256)\r
+ addi \BREG, \BREG, DISP4(\Index,64)\r
+.endif\r
+.endif \r
+\r
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real\r
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag\r
+ xvmaddadp vs34, vs9, vs20 // real*real, imag*real\r
+ xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag\r
+ xvmaddadp vs36, vs10, vs20 // real*real, imag*real\r
+ xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag\r
+ xvmaddadp vs38, vs11, vs20 // real*real, imag*real\r
+ xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag\r
+ xvmaddadp vs40, vs12, vs20 // real*real, imag*real\r
+ xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag\r
+ xvmaddadp vs42, vs13, vs20 // real*real, imag*real\r
+ xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag\r
+ xvmaddadp vs44, vs14, vs20 // real*real, imag*real\r
+ xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag\r
+ xvmaddadp vs46, vs15, vs20 // real*real, imag*real\r
+ xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag\r
+\r
+.endm\r
+\r
+.macro KERNEL1x8 \r
+ LOAD1x8 0\r
+ END1x8 AO, BO, 128,32 \r
+.endm\r
+\r
+.macro SAVE1x8\r
+\r
+ mr T1, CO\r
+ addi T2, T1, 64\r
+\r
+#ifndef TRMMKERNEL\r
+\r
+ lxv vs16, 0(T1)\r
+ lxv vs17, 16(T1)\r
+ lxv vs18, 32(T1)\r
+ lxv vs19, 48(T1)\r
+ lxv vs20, 0(T2)\r
+ lxv vs21, 16(T2)\r
+ lxv vs22, 32(T2)\r
+ lxv vs23, 48(T2)\r
+\r
+#endif\r
+\r
+ AGGREGATE_INTO_COMPLEX vs32,vs33,vs8\r
+ AGGREGATE_INTO_COMPLEX vs34,vs35,vs9\r
+ AGGREGATE_INTO_COMPLEX vs36,vs37,vs10\r
+ AGGREGATE_INTO_COMPLEX vs38,vs39,vs11\r
+ AGGREGATE_INTO_COMPLEX vs40,vs41,vs12\r
+ AGGREGATE_INTO_COMPLEX vs42,vs43,vs13\r
+ AGGREGATE_INTO_COMPLEX vs44,vs45,vs14\r
+ AGGREGATE_INTO_COMPLEX vs46,vs47,vs15\r
+\r
+#ifndef TRMMKERNEL\r
+\r
+ xvadddp vs8, vs8, vs16\r
+ xvadddp vs9, vs9, vs17\r
+ xvadddp vs10, vs10, vs18\r
+ xvadddp vs11, vs11, vs19\r
+ xvadddp vs12, vs12, vs20\r
+ xvadddp vs13, vs13, vs21\r
+ xvadddp vs14, vs14, vs22\r
+ xvadddp vs15, vs15, vs23\r
+\r
+#endif\r
+\r
+ stxv vs8, 0(T1)\r
+ stxv vs9, 16(T1)\r
+ stxv vs10, 32(T1)\r
+ stxv vs11, 48(T1)\r
+ stxv vs12, 0(T2)\r
+ stxv vs13, 16(T2)\r
+ stxv vs14, 32(T2)\r
+ stxv vs15, 48(T2)\r
+\r
+ addi CO, CO, 128\r
+\r
+.endm\r
+\r
+/**********************************************************************************************\r
+* Macros for N=1 and M=4\r
+**********************************************************************************************/\r
+\r
+.macro Zero1x4\r
+ xxlxor vs32, vs32, vs32\r
+ xxlxor vs33, vs33, vs33\r
+ xxlxor vs34, vs34, vs34\r
+ xxlxor vs35, vs35, vs35\r
+ xxlxor vs36, vs36, vs36\r
+ xxlxor vs37, vs37, vs37\r
+ xxlxor vs38, vs38, vs38\r
+ xxlxor vs39, vs39, vs39 \r
+.endm\r
+\r
+.macro LOAD1x4 Zero\r
+\r
+ lxv vs16, 0(BO) // load real part from B\r
+ lxv vs17, 16(BO) // load imag part from B \r
+\r
+ lxv vs0, 0(AO) // load real,imag from A\r
+ lxv vs1, 16(AO) // load real,imag from A\r
+ lxv vs2, 32(AO) // load real,imag from A\r
+ lxv vs3, 48(AO) // load real,imag from A\r
+ \r
+.if \Zero==1\r
+ Zero1x4 \r
+.endif\r
+\r
+.endm\r
+\r
+.macro END1x4_NORMAL\r
+ END1x4 AO,BO,64,32\r
+.endm\r
+\r
+.macro END1x4 AREG, BREG, OffsetA, OffsetB\r
+\r
+.if \OffsetB != 0 \r
+ addi \BREG, \BREG, \OffsetB \r
+.endif\r
+.if \OffsetA != 0 \r
+ addi \AREG, \AREG, \OffsetA \r
+.endif \r
+\r
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real\r
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag\r
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real\r
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag\r
+ xvmaddadp vs36, vs2, vs16 // real*real, imag*real\r
+ xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag\r
+ xvmaddadp vs38, vs3, vs16 // real*real, imag*real\r
+ xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag\r
+\r
+.endm\r
+\r
+.macro KERNEL1x4_L OffsetA,OffsetB, Index,IsLast \r
+ KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0\r
+.endm\r
+\r
+.macro KERNEL1x4_E OffsetA,OffsetB, Index,IsLast \r
+ KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1\r
+.endm\r
+\r
+.macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete\r
+\r
+ lxv vs8, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs9, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs10, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs11, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A\r
+\r
+ lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real part from B\r
+ lxv vs21, DISP4(\Index,16+\OffsetB)(\BREG) // load imag part from B \r
+\r
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real\r
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag\r
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real\r
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag\r
+ xvmaddadp vs36, vs2, vs16 // real*real, imag*real\r
+ xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag\r
+ xvmaddadp vs38, vs3, vs16 // real*real, imag*real\r
+ xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag\r
+\r
+.if \Complete==0\r
+ lxv vs0, DISP8(\Index,64+ \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs1, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs2, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs3, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A \r
+\r
+ lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real part from B\r
+ lxv vs17, DISP4(\Index,32+16+\OffsetB)(\BREG) // load imag part from B \r
+.endif\r
+\r
+.if \IsLast==1 \r
+.if \Complete==1\r
+ addi \AREG, \AREG, DISP8(\Index,64+\OffsetA)\r
+ addi \BREG, \BREG, DISP4(\Index,32+\OffsetB)\r
+.else \r
+ addi \AREG, \AREG, DISP8(\Index,128)\r
+ addi \BREG, \BREG, DISP4(\Index,64)\r
+.endif\r
+.endif \r
+\r
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real\r
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag\r
+ xvmaddadp vs34, vs9, vs20 // real*real, imag*real\r
+ xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag\r
+ xvmaddadp vs36, vs10, vs20 // real*real, imag*real\r
+ xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag\r
+ xvmaddadp vs38, vs11, vs20 // real*real, imag*real\r
+ xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag\r
+\r
+.endm\r
+\r
+.macro KERNEL1x4 \r
+ LOAD1x4 0\r
+ END1x4 AO, BO, 64,32 \r
+.endm\r
+\r
+.macro SAVE1x4\r
+\r
+ mr T1, CO\r
+\r
+#ifndef TRMMKERNEL\r
+\r
+ lxv vs16, 0(T1)\r
+ lxv vs17, 16(T1)\r
+ lxv vs18, 32(T1)\r
+ lxv vs19, 48(T1)\r
+\r
+#endif\r
+\r
+ AGGREGATE_INTO_COMPLEX vs32,vs33,vs8\r
+ AGGREGATE_INTO_COMPLEX vs34,vs35,vs9\r
+ AGGREGATE_INTO_COMPLEX vs36,vs37,vs10\r
+ AGGREGATE_INTO_COMPLEX vs38,vs39,vs11\r
+\r
+#ifndef TRMMKERNEL\r
+\r
+ xvadddp vs8, vs8, vs16\r
+ xvadddp vs9, vs9, vs17\r
+ xvadddp vs10, vs10, vs18\r
+ xvadddp vs11, vs11, vs19\r
+\r
+#endif\r
+\r
+ stxv vs8, 0(T1)\r
+ stxv vs9, 16(T1)\r
+ stxv vs10, 32(T1)\r
+ stxv vs11, 48(T1) \r
+ \r
+ addi CO, CO, 64\r
+\r
+.endm\r
+\r
+/**********************************************************************************************\r
+* Macros for N=1 and M=2\r
+**********************************************************************************************/\r
+\r
+.macro Zero1x2\r
+ xxlxor vs32, vs32, vs32\r
+ xxlxor vs33, vs33, vs33\r
+ xxlxor vs34, vs34, vs34\r
+ xxlxor vs35, vs35, vs35 \r
+.endm\r
+\r
+.macro LOAD1x2 Zero\r
+\r
+ lxv vs16, 0(BO) // load real part from B\r
+ lxv vs17, 16(BO) // load imag part from B \r
+\r
+ lxv vs0, 0(AO) // load real,imag from A\r
+ lxv vs1, 16(AO) // load real,imag from A \r
+ \r
+.if \Zero==1\r
+ Zero1x2 \r
+.endif\r
+\r
+.endm\r
+\r
+.macro END1x2_NORMAL\r
+ END1x2 AO,BO,32,32\r
+.endm\r
+\r
+.macro END1x2 AREG, BREG, OffsetA, OffsetB\r
+\r
+.if \OffsetB != 0 \r
+ addi \BREG, \BREG, \OffsetB \r
+.endif\r
+.if \OffsetA != 0 \r
+ addi \AREG, \AREG, \OffsetA \r
+.endif \r
+\r
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real\r
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag\r
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real\r
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag\r
+ \r
+.endm\r
+\r
+.macro KERNEL1x2_L OffsetA,OffsetB, Index,IsLast \r
+ KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0\r
+.endm\r
+\r
+.macro KERNEL1x2_E OffsetA,OffsetB, Index,IsLast \r
+ KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1\r
+.endm\r
+\r
+.macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete\r
+\r
+ lxv vs8, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs9, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A\r
+\r
+ lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real part from B\r
+ lxv vs21, DISP4(\Index,16+\OffsetB)(\BREG) // load imag part from B \r
+\r
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real\r
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag\r
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real\r
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag \r
+.if \Complete==0\r
+ lxv vs0, DISP4(\Index,32 + \OffsetA)(\AREG) // load real,imag from A\r
+ lxv vs1, DISP4(\Index,48+ \OffsetA)(\AREG) // load real,imag from A \r
+\r
+ lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real part from B\r
+ lxv vs17, DISP4(\Index,32+16+\OffsetB)(\BREG) // load imag part from B \r
+.endif\r
+\r
+.if \IsLast==1 \r
+.if \Complete==1\r
+ addi \AREG, \AREG, DISP4(\Index,32+\OffsetA)\r
+ addi \BREG, \BREG, DISP4(\Index,32+\OffsetB)\r
+.else \r
+ addi \AREG, \AREG, DISP4(\Index,64)\r
+ addi \BREG, \BREG, DISP4(\Index,64)\r
+.endif\r
+.endif \r
+\r
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real\r
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag\r
+ xvmaddadp vs34, vs9, vs20 // real*real, imag*real\r
+ xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag\r
+\r
+.endm\r
+\r
+.macro KERNEL1x2 \r
+ LOAD1x2 0\r
+ END1x2 AO, BO, 32,32 \r
+.endm\r
+\r
+.macro SAVE1x2\r
+\r
+ mr T1, CO\r
+\r
+#ifndef TRMMKERNEL\r
+\r
+ lxv vs16, 0(T1)\r
+ lxv vs17, 16(T1)\r
+\r
+#endif\r
+\r
+ AGGREGATE_INTO_COMPLEX vs32,vs33,vs8\r
+ AGGREGATE_INTO_COMPLEX vs34,vs35,vs9 \r
+\r
+#ifndef TRMMKERNEL\r
+\r
+ xvadddp vs8, vs8, vs16\r
+ xvadddp vs9, vs9, vs17\r
+\r
+#endif\r
+\r
+ stxv vs8, 0(T1)\r
+ stxv vs9, 16(T1)\r
+\r
+ addi CO, CO, 32\r
+\r
+.endm\r
+\r
+/**********************************************************************************************\r
+* Macros for N=1 and M=1\r
+**********************************************************************************************/\r
+\r
+.macro Zero1x1\r
+ xxlxor vs32, vs32, vs32\r
+ xxlxor vs33, vs33, vs33 \r
+.endm\r
+\r
+.macro LOAD1x1 Zero\r
+ lxv vs0, 0(AO) // load real,imag from A \r
+\r
+ lxv vs16, 0(BO) // load real part from B\r
+ lxv vs17, 16(BO) // load imag part from B \r
+\r
+.if \Zero==1\r
+ Zero1x1 \r
+.endif\r
+\r
+.endm\r
+\r
+.macro END1x1_NORMAL\r
+ END1x1 AO,BO,16,32\r
+.endm\r
+\r
+.macro END1x1 AREG, BREG, OffsetA, OffsetB\r
+\r
+.if \OffsetA != 0 \r
+ addi \AREG, \AREG, \OffsetA \r
+.endif \r
+.if \OffsetB != 0 \r
+ addi \BREG, \BREG, \OffsetB \r
+.endif\r
+\r
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real\r
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag \r
+\r
+.endm\r
+\r
+.macro KERNEL1x1_L OffsetA,OffsetB, Index,IsLast \r
+ KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0\r
+.endm\r
+\r
+.macro KERNEL1x1_E OffsetA,OffsetB, Index,IsLast \r
+ KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1\r
+.endm\r
+\r
+.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete\r
+\r
+ lxv vs8, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A\r
+\r
+ lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real part from B\r
+ lxv vs21, DISP4(\Index,16+\OffsetB)(\BREG) // load imag part from B \r
+\r
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real\r
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag \r
+\r
+.if \Complete==0\r
+ lxv vs0, DISP2(\Index,16 + \OffsetA)(\AREG) // load real,imag from A \r
+\r
+ lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real part from B\r
+ lxv vs17, DISP4(\Index,32+16+\OffsetB)(\BREG) // load imag part from B \r
+.endif\r
+\r
+\r
+.if \IsLast==1 \r
+.if \Complete==1\r
+ addi \AREG, \AREG, DISP2(\Index,16+\OffsetA)\r
+ addi \BREG, \BREG, DISP4(\Index,32+\OffsetB)\r
+.else \r
+ addi \AREG, \AREG, DISP2(\Index,32)\r
+ addi \BREG, \BREG, DISP4(\Index,64)\r
+.endif\r
+.endif\r
+ \r
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real\r
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag \r
+\r
+.endm\r
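+\r
+/* Even in the 1x1 case B advances by DISP4 (32 bytes per k) while A\r
+   advances by DISP2 (16 bytes per k): the ZCOPYB macros below store\r
+   every B element as two full splatted vectors, one for the real\r
+   part and one for the imaginary part. */\r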
+\r
+.macro KERNEL1x1 \r
+ LOAD1x1 0\r
+ END1x1 AO, BO, 16,32 \r
+\r
+.endm \r
+\r
+.macro SAVE1x1\r
+\r
+ mr T1, CO\r
+#ifndef TRMMKERNEL\r
+ lxv vs16, 0(T1)\r
+#endif\r
+ AGGREGATE_INTO_COMPLEX vs32,vs33,vs8\r
+\r
+#ifndef TRMMKERNEL\r
+ xvadddp vs8, vs8, vs16\r
+#endif\r
+\r
+ stxv vs8, 0(T1)\r
+\r
+ addi CO, CO, 16\r
+\r
+.endm\r
+\r
+\r
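+/* The ZCOPYB_* macros widen B into BBUFFER ahead of the compute\r
+   loops: each complex double (r, i) read from B is stored as two\r
+   splatted vectors (r, r) and (i, i), so the kernels can feed\r
+   xvmaddadp directly without per-iteration permutes.  A rough C\r
+   equivalent (illustrative only; zcopyb is a hypothetical helper):\r
+\r
+       // b: n complex doubles; bb receives 4 doubles per element\r
+       static void zcopyb(double *bb, const double *b, int n)\r
+       {\r
+           for (int k = 0; k < n; k++) {\r
+               bb[4*k + 0] = b[2*k + 0]; // real part, splatted\r
+               bb[4*k + 1] = b[2*k + 0];\r
+               bb[4*k + 2] = b[2*k + 1]; // imag part, splatted\r
+               bb[4*k + 3] = b[2*k + 1];\r
+           }\r
+       }\r
+*/\r
+\r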
+.macro ZCOPYB_2\r
+\r
+ lxv vs32, 0(BO)\r
+ lxv vs33, 16(BO) \r
+ addi BO, BO, 32\r
+ xxspltd vs40, vs32, 1\r
+ xxspltd vs41, vs32, 0 \r
+ xxspltd vs42, vs33, 1\r
+ xxspltd vs43, vs33, 0\r
+\r
+ stxv vs40, 0(BBO)\r
+ stxv vs41, 16(BBO)\r
+ stxv vs42, 32(BBO)\r
+ stxv vs43, 48(BBO)\r
+ addi BBO, BBO, 64\r
+\r
+.endm\r
+\r
+.macro ZCOPYB_1\r
+\r
+ lxv vs32, 0(BO) \r
+ addi BO, BO, 16\r
+ xxspltd vs40, vs32, 1\r
+ xxspltd vs41, vs32, 0 \r
+ stxv vs40, 0(BBO)\r
+ stxv vs41, 16(BBO)\r
+\r
+ addi BBO, BBO, 32\r
+\r
+.endm\r
+\r
+.macro ZCOPYB_8\r
+\r
+ lxv vs32, 0(BO)\r
+ lxv vs33, 16(BO)\r
+ lxv vs34, 32(BO)\r
+ lxv vs35, 48(BO) \r
+\r
+ lxv vs36, 64+0(BO)\r
+ lxv vs37, 64+16(BO)\r
+ lxv vs38, 64+32(BO)\r
+ lxv vs39, 64+48(BO) \r
+ addi BO, BO, 128\r
+ xxspltd vs40, vs32, 1\r
+ xxspltd vs41, vs32, 0\r
+ xxspltd vs42, vs33, 1\r
+ xxspltd vs43, vs33, 0\r
+ xxspltd vs44, vs34, 1\r
+ xxspltd vs45, vs34, 0\r
+ xxspltd vs46, vs35, 1\r
+ xxspltd vs47, vs35, 0 \r
+\r
+ xxspltd vs48, vs36, 1\r
+ xxspltd vs49, vs36, 0\r
+ xxspltd vs50, vs37, 1\r
+ xxspltd vs51, vs37, 0\r
+ xxspltd vs52, vs38, 1\r
+ xxspltd vs53, vs38, 0\r
+ xxspltd vs54, vs39, 1\r
+ xxspltd vs55, vs39, 0\r
+\r
+ stxv vs40, 0(BBO)\r
+ stxv vs41, 16(BBO)\r
+ stxv vs42, 32(BBO)\r
+ stxv vs43, 48(BBO) \r
+\r
+ stxv vs44, 64+0(BBO)\r
+ stxv vs45, 64+16(BBO)\r
+ stxv vs46, 64+32(BBO)\r
+ stxv vs47, 64+48(BBO) \r
+\r
+ stxv vs48, 128+0(BBO)\r
+ stxv vs49, 128+16(BBO)\r
+ stxv vs50, 128+32(BBO)\r
+ stxv vs51, 128+48(BBO)\r
+\r
+ stxv vs52, 192+0(BBO)\r
+ stxv vs53, 192+16(BBO)\r
+ stxv vs54, 192+32(BBO)\r
+ stxv vs55, 192+48(BBO)\r
+ addi BBO, BBO, 256\r
+\r
+.endm\r
+\r