--- /dev/null
+/*****************************************************************************
+ Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the ISCAS nor the names of its contributors may
+be used to endorse or promote products derived from this software
+without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ **********************************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
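+/* Complex copy (packing) routine used by the Sandy Bridge complex GEMM
+   kernels.  src is treated as col lines of row complex elements each,
+   successive lines lying srcdim complex elements (2*srcdim FLOATs) apart.
+   Lines are packed four at a time: within each 4-line panel, element i of
+   the four lines is stored contiguously for i = 0 .. row-1; the leftover
+   2- and 1-line groups and the 2/1-element tails along each line are
+   handled by the remainder blocks below.  Judging by the KERNEL file in
+   this patch, this is likely the 4-wide n-copy variant
+   (zgemm_ncopy_4_sandy.c).  */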
+int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
+{
+ BLASLONG i,j;
+ BLASLONG idx=0;
+ BLASLONG ii;
+ FLOAT *src0,*src1,*src2,*src3,*dest0;
+ for (j=0; j<col/4; j+=1)
+ {
+ src0 = src;
+ src1 = src0+2*srcdim;
+ src2 = src1+2*srcdim;
+ src3 = src2+2*srcdim;
+ src = src3+2*srcdim;
+ dest0 = dest;
+ ii = (row<<3);
+ dest = dest+ii;
+ for (i=0; i<row/4; i+=1)
+ {
+ dest0[0] = src0[0];
+ dest0[1] = src0[1];
+ dest0[2] = src1[0];
+ dest0[3] = src1[1];
+ dest0[4] = src2[0];
+ dest0[5] = src2[1];
+ dest0[6] = src3[0];
+ dest0[7] = src3[1];
+ dest0[8] = src0[2];
+ dest0[9] = src0[3];
+ dest0[10] = src1[2];
+ dest0[11] = src1[3];
+ dest0[12] = src2[2];
+ dest0[13] = src2[3];
+ dest0[14] = src3[2];
+ dest0[15] = src3[3];
+ dest0[16] = src0[4];
+ dest0[17] = src0[5];
+ dest0[18] = src1[4];
+ dest0[19] = src1[5];
+ dest0[20] = src2[4];
+ dest0[21] = src2[5];
+ dest0[22] = src3[4];
+ dest0[23] = src3[5];
+ dest0[24] = src0[6];
+ dest0[25] = src0[7];
+ dest0[26] = src1[6];
+ dest0[27] = src1[7];
+ dest0[28] = src2[6];
+ dest0[29] = src2[7];
+ dest0[30] = src3[6];
+ dest0[31] = src3[7];
+ src0 = src0+8;
+ src1 = src1+8;
+ src2 = src2+8;
+ src3 = src3+8;
+ ii = (4<<3);
+ dest0 = dest0+ii;
+ }
+ if (row&2)
+ {
+ dest0[0] = src0[0];
+ dest0[1] = src0[1];
+ dest0[2] = src1[0];
+ dest0[3] = src1[1];
+ dest0[4] = src2[0];
+ dest0[5] = src2[1];
+ dest0[6] = src3[0];
+ dest0[7] = src3[1];
+ dest0[8] = src0[2];
+ dest0[9] = src0[3];
+ dest0[10] = src1[2];
+ dest0[11] = src1[3];
+ dest0[12] = src2[2];
+ dest0[13] = src2[3];
+ dest0[14] = src3[2];
+ dest0[15] = src3[3];
+ src0 = src0+4;
+ src1 = src1+4;
+ src2 = src2+4;
+ src3 = src3+4;
+ ii = (2<<3);
+ dest0 = dest0+ii;
+ }
+ if (row&1)
+ {
+ dest0[0] = src0[0];
+ dest0[1] = src0[1];
+ dest0[2] = src1[0];
+ dest0[3] = src1[1];
+ dest0[4] = src2[0];
+ dest0[5] = src2[1];
+ dest0[6] = src3[0];
+ dest0[7] = src3[1];
+ src0 = src0+2;
+ src1 = src1+2;
+ src2 = src2+2;
+ src3 = src3+2;
+ ii = (1<<3);
+ dest0 = dest0+ii;
+ }
+ }
+ if (col&2)
+ {
+ src0 = src;
+ src1 = src0+2*srcdim;
+ src = src1+2*srcdim;
+ dest0 = dest;
+ ii = (row<<2);
+ dest = dest+ii;
+ for (i=0; i<row/4; i+=1)
+ {
+ dest0[0] = src0[0];
+ dest0[1] = src0[1];
+ dest0[2] = src1[0];
+ dest0[3] = src1[1];
+ dest0[4] = src0[2];
+ dest0[5] = src0[3];
+ dest0[6] = src1[2];
+ dest0[7] = src1[3];
+ dest0[8] = src0[4];
+ dest0[9] = src0[5];
+ dest0[10] = src1[4];
+ dest0[11] = src1[5];
+ dest0[12] = src0[6];
+ dest0[13] = src0[7];
+ dest0[14] = src1[6];
+ dest0[15] = src1[7];
+ src0 = src0+8;
+ src1 = src1+8;
+ ii = (4<<2);
+ dest0 = dest0+ii;
+ }
+ if (row&2)
+ {
+ dest0[0] = src0[0];
+ dest0[1] = src0[1];
+ dest0[2] = src1[0];
+ dest0[3] = src1[1];
+ dest0[4] = src0[2];
+ dest0[5] = src0[3];
+ dest0[6] = src1[2];
+ dest0[7] = src1[3];
+ src0 = src0+4;
+ src1 = src1+4;
+ ii = (2<<2);
+ dest0 = dest0+ii;
+ }
+ if (row&1)
+ {
+ dest0[0] = src0[0];
+ dest0[1] = src0[1];
+ dest0[2] = src1[0];
+ dest0[3] = src1[1];
+ src0 = src0+2;
+ src1 = src1+2;
+ ii = (1<<2);
+ dest0 = dest0+ii;
+ }
+ }
+ if (col&1)
+ {
+ src0 = src;
+ src = src0+2*srcdim;
+ dest0 = dest;
+ ii = (row<<1);
+ dest = dest+ii;
+ for (i=0; i<row/4; i+=1)
+ {
+ dest0[0] = src0[0];
+ dest0[1] = src0[1];
+ dest0[2] = src0[2];
+ dest0[3] = src0[3];
+ dest0[4] = src0[4];
+ dest0[5] = src0[5];
+ dest0[6] = src0[6];
+ dest0[7] = src0[7];
+ src0 = src0+8;
+ ii = (4<<1);
+ dest0 = dest0+ii;
+ }
+ if (row&2)
+ {
+ dest0[0] = src0[0];
+ dest0[1] = src0[1];
+ dest0[2] = src0[2];
+ dest0[3] = src0[3];
+ src0 = src0+4;
+ ii = (2<<1);
+ dest0 = dest0+ii;
+ }
+ if (row&1)
+ {
+ dest0[0] = src0[0];
+ dest0[1] = src0[1];
+ src0 = src0+2;
+ ii = (1<<1);
+ dest0 = dest0+ii;
+ }
+ }
+ return 0;
+}
--- /dev/null
+/*****************************************************************************
+ Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the ISCAS nor the names of its contributors may
+be used to endorse or promote products derived from this software
+without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ **********************************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
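+/* Same packing scheme as the previous routine, but with 8-line panels:
+   element i of eight consecutive lines is stored contiguously, followed by
+   4-, 2- and 1-line tails.  Likely the 8-wide n-copy variant
+   (zgemm_ncopy_8_sandy.c) referenced from the KERNEL file in this patch.  */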
+int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
+{
+ BLASLONG i,j;
+ BLASLONG idx=0;
+ BLASLONG ii;
+ FLOAT *src0,*src1,*src2,*src3,*src4,*src5,*src6,*src7,*dest0;
+ for (j=0; j<col/8; j+=1)
+ {
+ src0 = src;
+ src1 = src0+2*srcdim;
+ src2 = src1+2*srcdim;
+ src3 = src2+2*srcdim;
+ src4 = src3+2*srcdim;
+ src5 = src4+2*srcdim;
+ src6 = src5+2*srcdim;
+ src7 = src6+2*srcdim;
+ src = src7+2*srcdim;
+ dest0 = dest;
+ ii = (row<<4);
+ dest = dest+ii;
+ for (i=0; i<row/4; i+=1)
+ {
+ dest0[0] = src0[0];
+ dest0[1] = src0[1];
+ dest0[2] = src1[0];
+ dest0[3] = src1[1];
+ dest0[4] = src2[0];
+ dest0[5] = src2[1];
+ dest0[6] = src3[0];
+ dest0[7] = src3[1];
+ dest0[8] = src4[0];
+ dest0[9] = src4[1];
+ dest0[10] = src5[0];
+ dest0[11] = src5[1];
+ dest0[12] = src6[0];
+ dest0[13] = src6[1];
+ dest0[14] = src7[0];
+ dest0[15] = src7[1];
+ dest0[16] = src0[2];
+ dest0[17] = src0[3];
+ dest0[18] = src1[2];
+ dest0[19] = src1[3];
+ dest0[20] = src2[2];
+ dest0[21] = src2[3];
+ dest0[22] = src3[2];
+ dest0[23] = src3[3];
+ dest0[24] = src4[2];
+ dest0[25] = src4[3];
+ dest0[26] = src5[2];
+ dest0[27] = src5[3];
+ dest0[28] = src6[2];
+ dest0[29] = src6[3];
+ dest0[30] = src7[2];
+ dest0[31] = src7[3];
+ dest0[32] = src0[4];
+ dest0[33] = src0[5];
+ dest0[34] = src1[4];
+ dest0[35] = src1[5];
+ dest0[36] = src2[4];
+ dest0[37] = src2[5];
+ dest0[38] = src3[4];
+ dest0[39] = src3[5];
+ dest0[40] = src4[4];
+ dest0[41] = src4[5];
+ dest0[42] = src5[4];
+ dest0[43] = src5[5];
+ dest0[44] = src6[4];
+ dest0[45] = src6[5];
+ dest0[46] = src7[4];
+ dest0[47] = src7[5];
+ dest0[48] = src0[6];
+ dest0[49] = src0[7];
+ dest0[50] = src1[6];
+ dest0[51] = src1[7];
+ dest0[52] = src2[6];
+ dest0[53] = src2[7];
+ dest0[54] = src3[6];
+ dest0[55] = src3[7];
+ dest0[56] = src4[6];
+ dest0[57] = src4[7];
+ dest0[58] = src5[6];
+ dest0[59] = src5[7];
+ dest0[60] = src6[6];
+ dest0[61] = src6[7];
+ dest0[62] = src7[6];
+ dest0[63] = src7[7];
+ src0 = src0+8;
+ src1 = src1+8;
+ src2 = src2+8;
+ src3 = src3+8;
+ src4 = src4+8;
+ src5 = src5+8;
+ src6 = src6+8;
+ src7 = src7+8;
+ ii = (4<<4);
+ dest0 = dest0+ii;
+ }
+ if (row&2)
+ {
+ dest0[0] = src0[0];
+ dest0[1] = src0[1];
+ dest0[2] = src1[0];
+ dest0[3] = src1[1];
+ dest0[4] = src2[0];
+ dest0[5] = src2[1];
+ dest0[6] = src3[0];
+ dest0[7] = src3[1];
+ dest0[8] = src4[0];
+ dest0[9] = src4[1];
+ dest0[10] = src5[0];
+ dest0[11] = src5[1];
+ dest0[12] = src6[0];
+ dest0[13] = src6[1];
+ dest0[14] = src7[0];
+ dest0[15] = src7[1];
+ dest0[16] = src0[2];
+ dest0[17] = src0[3];
+ dest0[18] = src1[2];
+ dest0[19] = src1[3];
+ dest0[20] = src2[2];
+ dest0[21] = src2[3];
+ dest0[22] = src3[2];
+ dest0[23] = src3[3];
+ dest0[24] = src4[2];
+ dest0[25] = src4[3];
+ dest0[26] = src5[2];
+ dest0[27] = src5[3];
+ dest0[28] = src6[2];
+ dest0[29] = src6[3];
+ dest0[30] = src7[2];
+ dest0[31] = src7[3];
+ src0 = src0+4;
+ src1 = src1+4;
+ src2 = src2+4;
+ src3 = src3+4;
+ src4 = src4+4;
+ src5 = src5+4;
+ src6 = src6+4;
+ src7 = src7+4;
+ ii = (2<<4);
+ dest0 = dest0+ii;
+ }
+ if (row&1)
+ {
+ dest0[0] = src0[0];
+ dest0[1] = src0[1];
+ dest0[2] = src1[0];
+ dest0[3] = src1[1];
+ dest0[4] = src2[0];
+ dest0[5] = src2[1];
+ dest0[6] = src3[0];
+ dest0[7] = src3[1];
+ dest0[8] = src4[0];
+ dest0[9] = src4[1];
+ dest0[10] = src5[0];
+ dest0[11] = src5[1];
+ dest0[12] = src6[0];
+ dest0[13] = src6[1];
+ dest0[14] = src7[0];
+ dest0[15] = src7[1];
+ src0 = src0+2;
+ src1 = src1+2;
+ src2 = src2+2;
+ src3 = src3+2;
+ src4 = src4+2;
+ src5 = src5+2;
+ src6 = src6+2;
+ src7 = src7+2;
+ ii = (1<<4);
+ dest0 = dest0+ii;
+ }
+ }
+ if (col&4)
+ {
+ src0 = src;
+ src1 = src0+2*srcdim;
+ src2 = src1+2*srcdim;
+ src3 = src2+2*srcdim;
+ src = src3+2*srcdim;
+ dest0 = dest;
+ ii = (row<<3);
+ dest = dest+ii;
+ for (i=0; i<row/4; i+=1)
+ {
+ dest0[0] = src0[0];
+ dest0[1] = src0[1];
+ dest0[2] = src1[0];
+ dest0[3] = src1[1];
+ dest0[4] = src2[0];
+ dest0[5] = src2[1];
+ dest0[6] = src3[0];
+ dest0[7] = src3[1];
+ dest0[8] = src0[2];
+ dest0[9] = src0[3];
+ dest0[10] = src1[2];
+ dest0[11] = src1[3];
+ dest0[12] = src2[2];
+ dest0[13] = src2[3];
+ dest0[14] = src3[2];
+ dest0[15] = src3[3];
+ dest0[16] = src0[4];
+ dest0[17] = src0[5];
+ dest0[18] = src1[4];
+ dest0[19] = src1[5];
+ dest0[20] = src2[4];
+ dest0[21] = src2[5];
+ dest0[22] = src3[4];
+ dest0[23] = src3[5];
+ dest0[24] = src0[6];
+ dest0[25] = src0[7];
+ dest0[26] = src1[6];
+ dest0[27] = src1[7];
+ dest0[28] = src2[6];
+ dest0[29] = src2[7];
+ dest0[30] = src3[6];
+ dest0[31] = src3[7];
+ src0 = src0+8;
+ src1 = src1+8;
+ src2 = src2+8;
+ src3 = src3+8;
+ ii = (4<<3);
+ dest0 = dest0+ii;
+ }
+ if (row&2)
+ {
+ dest0[0] = src0[0];
+ dest0[1] = src0[1];
+ dest0[2] = src1[0];
+ dest0[3] = src1[1];
+ dest0[4] = src2[0];
+ dest0[5] = src2[1];
+ dest0[6] = src3[0];
+ dest0[7] = src3[1];
+ dest0[8] = src0[2];
+ dest0[9] = src0[3];
+ dest0[10] = src1[2];
+ dest0[11] = src1[3];
+ dest0[12] = src2[2];
+ dest0[13] = src2[3];
+ dest0[14] = src3[2];
+ dest0[15] = src3[3];
+ src0 = src0+4;
+ src1 = src1+4;
+ src2 = src2+4;
+ src3 = src3+4;
+ ii = (2<<3);
+ dest0 = dest0+ii;
+ }
+ if (row&1)
+ {
+ dest0[0] = src0[0];
+ dest0[1] = src0[1];
+ dest0[2] = src1[0];
+ dest0[3] = src1[1];
+ dest0[4] = src2[0];
+ dest0[5] = src2[1];
+ dest0[6] = src3[0];
+ dest0[7] = src3[1];
+ src0 = src0+2;
+ src1 = src1+2;
+ src2 = src2+2;
+ src3 = src3+2;
+ ii = (1<<3);
+ dest0 = dest0+ii;
+ }
+ }
+ if (col&2)
+ {
+ src0 = src;
+ src1 = src0+2*srcdim;
+ src = src1+2*srcdim;
+ dest0 = dest;
+ ii = (row<<2);
+ dest = dest+ii;
+ for (i=0; i<row/4; i+=1)
+ {
+ dest0[0] = src0[0];
+ dest0[1] = src0[1];
+ dest0[2] = src1[0];
+ dest0[3] = src1[1];
+ dest0[4] = src0[2];
+ dest0[5] = src0[3];
+ dest0[6] = src1[2];
+ dest0[7] = src1[3];
+ dest0[8] = src0[4];
+ dest0[9] = src0[5];
+ dest0[10] = src1[4];
+ dest0[11] = src1[5];
+ dest0[12] = src0[6];
+ dest0[13] = src0[7];
+ dest0[14] = src1[6];
+ dest0[15] = src1[7];
+ src0 = src0+8;
+ src1 = src1+8;
+ ii = (4<<2);
+ dest0 = dest0+ii;
+ }
+ if (row&2)
+ {
+ dest0[0] = src0[0];
+ dest0[1] = src0[1];
+ dest0[2] = src1[0];
+ dest0[3] = src1[1];
+ dest0[4] = src0[2];
+ dest0[5] = src0[3];
+ dest0[6] = src1[2];
+ dest0[7] = src1[3];
+ src0 = src0+4;
+ src1 = src1+4;
+ ii = (2<<2);
+ dest0 = dest0+ii;
+ }
+ if (row&1)
+ {
+ dest0[0] = src0[0];
+ dest0[1] = src0[1];
+ dest0[2] = src1[0];
+ dest0[3] = src1[1];
+ src0 = src0+2;
+ src1 = src1+2;
+ ii = (1<<2);
+ dest0 = dest0+ii;
+ }
+ }
+ if (col&1)
+ {
+ src0 = src;
+ src = src0+2*srcdim;
+ dest0 = dest;
+ ii = (row<<1);
+ dest = dest+ii;
+ for (i=0; i<row/4; i+=1)
+ {
+ dest0[0] = src0[0];
+ dest0[1] = src0[1];
+ dest0[2] = src0[2];
+ dest0[3] = src0[3];
+ dest0[4] = src0[4];
+ dest0[5] = src0[5];
+ dest0[6] = src0[6];
+ dest0[7] = src0[7];
+ src0 = src0+8;
+ ii = (4<<1);
+ dest0 = dest0+ii;
+ }
+ if (row&2)
+ {
+ dest0[0] = src0[0];
+ dest0[1] = src0[1];
+ dest0[2] = src0[2];
+ dest0[3] = src0[3];
+ src0 = src0+4;
+ ii = (2<<1);
+ dest0 = dest0+ii;
+ }
+ if (row&1)
+ {
+ dest0[0] = src0[0];
+ dest0[1] = src0[1];
+ src0 = src0+2;
+ ii = (1<<1);
+ dest0 = dest0+ii;
+ }
+ }
+ return 0;
+}
--- /dev/null
+/*****************************************************************************
+ Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the ISCAS nor the names of its contributors may
+be used to endorse or promote products derived from this software
+without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ **********************************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
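+/* Transposed packing for the Sandy Bridge complex GEMM kernels: lines of
+   col complex elements (spaced srcdim complex elements apart) are taken
+   four at a time and cut into runs of four consecutive elements, giving
+   4x4 complex tiles.  Within each group of four elements the tiles from
+   successive 4-line groups lie back to back, successive element groups are
+   4*row complex elements apart, and dest2/dest1 point at the tail regions
+   that receive the leftover 2- and 1-element columns.  Likely the 4-wide
+   t-copy variant (zgemm_tcopy_4_sandy.c).  */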
+int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
+{
+ BLASLONG i,j;
+ BLASLONG idx=0;
+ BLASLONG ii;
+ FLOAT *src0,*src1,*src2,*src3,*dest0;
+ FLOAT *dest1,*dest2;
+ ii = col&-4;
+ ii = ii*(2*row);
+ dest2 = dest+ii;
+ ii = col&-2;
+ ii = ii*(2*row);
+ dest1 = dest+ii;
+ for (j=0; j<row/4; j+=1)
+ {
+ src0 = src;
+ src1 = src0+2*srcdim;
+ src2 = src1+2*srcdim;
+ src3 = src2+2*srcdim;
+ src = src3+2*srcdim;
+ dest0 = dest;
+ ii = (4<<3);
+ dest = dest+ii;
+ for (i=0; i<col/4; i+=1)
+ {
+ dest0[0] = src0[0];
+ dest0[1] = src0[1];
+ dest0[2] = src0[2];
+ dest0[3] = src0[3];
+ dest0[4] = src0[4];
+ dest0[5] = src0[5];
+ dest0[6] = src0[6];
+ dest0[7] = src0[7];
+ dest0[8] = src1[0];
+ dest0[9] = src1[1];
+ dest0[10] = src1[2];
+ dest0[11] = src1[3];
+ dest0[12] = src1[4];
+ dest0[13] = src1[5];
+ dest0[14] = src1[6];
+ dest0[15] = src1[7];
+ dest0[16] = src2[0];
+ dest0[17] = src2[1];
+ dest0[18] = src2[2];
+ dest0[19] = src2[3];
+ dest0[20] = src2[4];
+ dest0[21] = src2[5];
+ dest0[22] = src2[6];
+ dest0[23] = src2[7];
+ dest0[24] = src3[0];
+ dest0[25] = src3[1];
+ dest0[26] = src3[2];
+ dest0[27] = src3[3];
+ dest0[28] = src3[4];
+ dest0[29] = src3[5];
+ dest0[30] = src3[6];
+ dest0[31] = src3[7];
+ src0 = src0+8;
+ src1 = src1+8;
+ src2 = src2+8;
+ src3 = src3+8;
+ ii = (row<<3);
+ dest0 = dest0+ii;
+ }
+ if (col&2)
+ {
+ dest2[0] = src0[0];
+ dest2[1] = src0[1];
+ dest2[2] = src0[2];
+ dest2[3] = src0[3];
+ dest2[4] = src1[0];
+ dest2[5] = src1[1];
+ dest2[6] = src1[2];
+ dest2[7] = src1[3];
+ dest2[8] = src2[0];
+ dest2[9] = src2[1];
+ dest2[10] = src2[2];
+ dest2[11] = src2[3];
+ dest2[12] = src3[0];
+ dest2[13] = src3[1];
+ dest2[14] = src3[2];
+ dest2[15] = src3[3];
+ src0 = src0+4;
+ src1 = src1+4;
+ src2 = src2+4;
+ src3 = src3+4;
+ dest2 = dest2+16;
+ }
+ if (col&1)
+ {
+ dest1[0] = src0[0];
+ dest1[1] = src0[1];
+ dest1[2] = src1[0];
+ dest1[3] = src1[1];
+ dest1[4] = src2[0];
+ dest1[5] = src2[1];
+ dest1[6] = src3[0];
+ dest1[7] = src3[1];
+ src0 = src0+2;
+ src1 = src1+2;
+ src2 = src2+2;
+ src3 = src3+2;
+ dest1 = dest1+8;
+ }
+ }
+ if (row&2)
+ {
+ src0 = src;
+ src1 = src0+2*srcdim;
+ src = src1+2*srcdim;
+ dest0 = dest;
+ ii = (2<<3);
+ dest = dest+ii;
+ for (i=0; i<col/4; i+=1)
+ {
+ dest0[0] = src0[0];
+ dest0[1] = src0[1];
+ dest0[2] = src0[2];
+ dest0[3] = src0[3];
+ dest0[4] = src0[4];
+ dest0[5] = src0[5];
+ dest0[6] = src0[6];
+ dest0[7] = src0[7];
+ dest0[8] = src1[0];
+ dest0[9] = src1[1];
+ dest0[10] = src1[2];
+ dest0[11] = src1[3];
+ dest0[12] = src1[4];
+ dest0[13] = src1[5];
+ dest0[14] = src1[6];
+ dest0[15] = src1[7];
+ src0 = src0+8;
+ src1 = src1+8;
+ ii = (row<<3);
+ dest0 = dest0+ii;
+ }
+ if (col&2)
+ {
+ dest2[0] = src0[0];
+ dest2[1] = src0[1];
+ dest2[2] = src0[2];
+ dest2[3] = src0[3];
+ dest2[4] = src1[0];
+ dest2[5] = src1[1];
+ dest2[6] = src1[2];
+ dest2[7] = src1[3];
+ src0 = src0+4;
+ src1 = src1+4;
+ dest2 = dest2+8;
+ }
+ if (col&1)
+ {
+ dest1[0] = src0[0];
+ dest1[1] = src0[1];
+ dest1[2] = src1[0];
+ dest1[3] = src1[1];
+ src0 = src0+2;
+ src1 = src1+2;
+ dest1 = dest1+4;
+ }
+ }
+ if (row&1)
+ {
+ src0 = src;
+ src = src0+2*srcdim;
+ dest0 = dest;
+ ii = (1<<3);
+ dest = dest+ii;
+ for (i=0; i<col/4; i+=1)
+ {
+ dest0[0] = src0[0];
+ dest0[1] = src0[1];
+ dest0[2] = src0[2];
+ dest0[3] = src0[3];
+ dest0[4] = src0[4];
+ dest0[5] = src0[5];
+ dest0[6] = src0[6];
+ dest0[7] = src0[7];
+ src0 = src0+8;
+ ii = (row<<3);
+ dest0 = dest0+ii;
+ }
+ if (col&2)
+ {
+ dest2[0] = src0[0];
+ dest2[1] = src0[1];
+ dest2[2] = src0[2];
+ dest2[3] = src0[3];
+ src0 = src0+4;
+ dest2 = dest2+4;
+ }
+ if (col&1)
+ {
+ dest1[0] = src0[0];
+ dest1[1] = src0[1];
+ src0 = src0+2;
+ dest1 = dest1+2;
+ }
+ }
+
+ return 0;
+}
--- /dev/null
+/*****************************************************************************
+ Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the ISCAS nor the names of its contributors may
+be used to endorse or promote products derived from this software
+without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ **********************************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
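+/* 8-wide variant of the preceding transposed packing: each group of four
+   lines is cut into runs of eight consecutive complex elements, and the
+   dest4/dest2/dest1 pointers collect the 4-, 2- and 1-element column tails
+   at the end of the buffer.  Likely the 8-wide t-copy variant
+   (zgemm_tcopy_8_sandy.c).  */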
+int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest)
+{
+ BLASLONG i,j;
+ BLASLONG idx=0;
+ BLASLONG ii;
+ FLOAT *src0,*src1,*src2,*src3,*dest0;
+ FLOAT *dest1,*dest2,*dest4;
+ ii = col&-8;
+ ii = ii*(2*row);
+ dest4 = dest+ii;
+ ii = col&-4;
+ ii = ii*(2*row);
+ dest2 = dest+ii;
+ ii = col&-2;
+ ii = ii*(2*row);
+ dest1 = dest+ii;
+ for (j=0; j<row/4; j+=1)
+ {
+ src0 = src;
+ src1 = src0+2*srcdim;
+ src2 = src1+2*srcdim;
+ src3 = src2+2*srcdim;
+ src = src3+2*srcdim;
+ dest0 = dest;
+ ii = (4<<4);
+ dest = dest+ii;
+ for (i=0; i<col/8; i+=1)
+ {
+ dest0[0] = src0[0];
+ dest0[1] = src0[1];
+ dest0[2] = src0[2];
+ dest0[3] = src0[3];
+ dest0[4] = src0[4];
+ dest0[5] = src0[5];
+ dest0[6] = src0[6];
+ dest0[7] = src0[7];
+ dest0[8] = src0[8];
+ dest0[9] = src0[9];
+ dest0[10] = src0[10];
+ dest0[11] = src0[11];
+ dest0[12] = src0[12];
+ dest0[13] = src0[13];
+ dest0[14] = src0[14];
+ dest0[15] = src0[15];
+ dest0[16] = src1[0];
+ dest0[17] = src1[1];
+ dest0[18] = src1[2];
+ dest0[19] = src1[3];
+ dest0[20] = src1[4];
+ dest0[21] = src1[5];
+ dest0[22] = src1[6];
+ dest0[23] = src1[7];
+ dest0[24] = src1[8];
+ dest0[25] = src1[9];
+ dest0[26] = src1[10];
+ dest0[27] = src1[11];
+ dest0[28] = src1[12];
+ dest0[29] = src1[13];
+ dest0[30] = src1[14];
+ dest0[31] = src1[15];
+ dest0[32] = src2[0];
+ dest0[33] = src2[1];
+ dest0[34] = src2[2];
+ dest0[35] = src2[3];
+ dest0[36] = src2[4];
+ dest0[37] = src2[5];
+ dest0[38] = src2[6];
+ dest0[39] = src2[7];
+ dest0[40] = src2[8];
+ dest0[41] = src2[9];
+ dest0[42] = src2[10];
+ dest0[43] = src2[11];
+ dest0[44] = src2[12];
+ dest0[45] = src2[13];
+ dest0[46] = src2[14];
+ dest0[47] = src2[15];
+ dest0[48] = src3[0];
+ dest0[49] = src3[1];
+ dest0[50] = src3[2];
+ dest0[51] = src3[3];
+ dest0[52] = src3[4];
+ dest0[53] = src3[5];
+ dest0[54] = src3[6];
+ dest0[55] = src3[7];
+ dest0[56] = src3[8];
+ dest0[57] = src3[9];
+ dest0[58] = src3[10];
+ dest0[59] = src3[11];
+ dest0[60] = src3[12];
+ dest0[61] = src3[13];
+ dest0[62] = src3[14];
+ dest0[63] = src3[15];
+ src0 = src0+16;
+ src1 = src1+16;
+ src2 = src2+16;
+ src3 = src3+16;
+ ii = (row<<4);
+ dest0 = dest0+ii;
+ }
+ if (col&4)
+ {
+ dest4[0] = src0[0];
+ dest4[1] = src0[1];
+ dest4[2] = src0[2];
+ dest4[3] = src0[3];
+ dest4[4] = src0[4];
+ dest4[5] = src0[5];
+ dest4[6] = src0[6];
+ dest4[7] = src0[7];
+ dest4[8] = src1[0];
+ dest4[9] = src1[1];
+ dest4[10] = src1[2];
+ dest4[11] = src1[3];
+ dest4[12] = src1[4];
+ dest4[13] = src1[5];
+ dest4[14] = src1[6];
+ dest4[15] = src1[7];
+ dest4[16] = src2[0];
+ dest4[17] = src2[1];
+ dest4[18] = src2[2];
+ dest4[19] = src2[3];
+ dest4[20] = src2[4];
+ dest4[21] = src2[5];
+ dest4[22] = src2[6];
+ dest4[23] = src2[7];
+ dest4[24] = src3[0];
+ dest4[25] = src3[1];
+ dest4[26] = src3[2];
+ dest4[27] = src3[3];
+ dest4[28] = src3[4];
+ dest4[29] = src3[5];
+ dest4[30] = src3[6];
+ dest4[31] = src3[7];
+ src0 = src0+8;
+ src1 = src1+8;
+ src2 = src2+8;
+ src3 = src3+8;
+ dest4 = dest4+32;
+ }
+ if (col&2)
+ {
+ dest2[0] = src0[0];
+ dest2[1] = src0[1];
+ dest2[2] = src0[2];
+ dest2[3] = src0[3];
+ dest2[4] = src1[0];
+ dest2[5] = src1[1];
+ dest2[6] = src1[2];
+ dest2[7] = src1[3];
+ dest2[8] = src2[0];
+ dest2[9] = src2[1];
+ dest2[10] = src2[2];
+ dest2[11] = src2[3];
+ dest2[12] = src3[0];
+ dest2[13] = src3[1];
+ dest2[14] = src3[2];
+ dest2[15] = src3[3];
+ src0 = src0+4;
+ src1 = src1+4;
+ src2 = src2+4;
+ src3 = src3+4;
+ dest2 = dest2+16;
+ }
+ if (col&1)
+ {
+ dest1[0] = src0[0];
+ dest1[1] = src0[1];
+ dest1[2] = src1[0];
+ dest1[3] = src1[1];
+ dest1[4] = src2[0];
+ dest1[5] = src2[1];
+ dest1[6] = src3[0];
+ dest1[7] = src3[1];
+ src0 = src0+2;
+ src1 = src1+2;
+ src2 = src2+2;
+ src3 = src3+2;
+ dest1 = dest1+8;
+ }
+ }
+ if (row&2)
+ {
+ src0 = src;
+ src1 = src0+2*srcdim;
+ src = src1+2*srcdim;
+ dest0 = dest;
+ ii = (2<<4);
+ dest = dest+ii;
+ for (i=0; i<col/8; i+=1)
+ {
+ dest0[0] = src0[0];
+ dest0[1] = src0[1];
+ dest0[2] = src0[2];
+ dest0[3] = src0[3];
+ dest0[4] = src0[4];
+ dest0[5] = src0[5];
+ dest0[6] = src0[6];
+ dest0[7] = src0[7];
+ dest0[8] = src0[8];
+ dest0[9] = src0[9];
+ dest0[10] = src0[10];
+ dest0[11] = src0[11];
+ dest0[12] = src0[12];
+ dest0[13] = src0[13];
+ dest0[14] = src0[14];
+ dest0[15] = src0[15];
+ dest0[16] = src1[0];
+ dest0[17] = src1[1];
+ dest0[18] = src1[2];
+ dest0[19] = src1[3];
+ dest0[20] = src1[4];
+ dest0[21] = src1[5];
+ dest0[22] = src1[6];
+ dest0[23] = src1[7];
+ dest0[24] = src1[8];
+ dest0[25] = src1[9];
+ dest0[26] = src1[10];
+ dest0[27] = src1[11];
+ dest0[28] = src1[12];
+ dest0[29] = src1[13];
+ dest0[30] = src1[14];
+ dest0[31] = src1[15];
+ src0 = src0+16;
+ src1 = src1+16;
+ ii = (row<<4);
+ dest0 = dest0+ii;
+ }
+ if (col&4)
+ {
+ dest4[0] = src0[0];
+ dest4[1] = src0[1];
+ dest4[2] = src0[2];
+ dest4[3] = src0[3];
+ dest4[4] = src0[4];
+ dest4[5] = src0[5];
+ dest4[6] = src0[6];
+ dest4[7] = src0[7];
+ dest4[8] = src1[0];
+ dest4[9] = src1[1];
+ dest4[10] = src1[2];
+ dest4[11] = src1[3];
+ dest4[12] = src1[4];
+ dest4[13] = src1[5];
+ dest4[14] = src1[6];
+ dest4[15] = src1[7];
+ src0 = src0+8;
+ src1 = src1+8;
+ dest4 = dest4+16;
+ }
+ if (col&2)
+ {
+ dest2[0] = src0[0];
+ dest2[1] = src0[1];
+ dest2[2] = src0[2];
+ dest2[3] = src0[3];
+ dest2[4] = src1[0];
+ dest2[5] = src1[1];
+ dest2[6] = src1[2];
+ dest2[7] = src1[3];
+ src0 = src0+4;
+ src1 = src1+4;
+ dest2 = dest2+8;
+ }
+ if (col&1)
+ {
+ dest1[0] = src0[0];
+ dest1[1] = src0[1];
+ dest1[2] = src1[0];
+ dest1[3] = src1[1];
+ src0 = src0+2;
+ src1 = src1+2;
+ dest1 = dest1+4;
+ }
+ }
+ if (row&1)
+ {
+ src0 = src;
+ src = src0+2*srcdim;
+ dest0 = dest;
+ ii = (1<<4);
+ dest = dest+ii;
+ for (i=0; i<col/8; i+=1)
+ {
+ dest0[0] = src0[0];
+ dest0[1] = src0[1];
+ dest0[2] = src0[2];
+ dest0[3] = src0[3];
+ dest0[4] = src0[4];
+ dest0[5] = src0[5];
+ dest0[6] = src0[6];
+ dest0[7] = src0[7];
+ dest0[8] = src0[8];
+ dest0[9] = src0[9];
+ dest0[10] = src0[10];
+ dest0[11] = src0[11];
+ dest0[12] = src0[12];
+ dest0[13] = src0[13];
+ dest0[14] = src0[14];
+ dest0[15] = src0[15];
+ src0 = src0+16;
+ ii = (row<<4);
+ dest0 = dest0+ii;
+ }
+ if (col&4)
+ {
+ dest4[0] = src0[0];
+ dest4[1] = src0[1];
+ dest4[2] = src0[2];
+ dest4[3] = src0[3];
+ dest4[4] = src0[4];
+ dest4[5] = src0[5];
+ dest4[6] = src0[6];
+ dest4[7] = src0[7];
+ src0 = src0+8;
+ dest4 = dest4+8;
+ }
+ if (col&2)
+ {
+ dest2[0] = src0[0];
+ dest2[1] = src0[1];
+ dest2[2] = src0[2];
+ dest2[3] = src0[3];
+ src0 = src0+4;
+ dest2 = dest2+4;
+ }
+ if (col&1)
+ {
+ dest1[0] = src0[0];
+ dest1[1] = src0[1];
+ src0 = src0+2;
+ dest1 = dest1+2;
+ }
+ }
+ return 0;
+}
-SGEMMKERNEL = gemm_kernel_4x8_nehalem.S
-SGEMMINCOPY = gemm_ncopy_4.S
-SGEMMITCOPY = gemm_tcopy_4.S
+SGEMMKERNEL = sgemm_kernel_8x8_sandy.S
+SGEMMINCOPY =
+SGEMMITCOPY =
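+# No separate in-copy routines for the 8x8 SGEMM kernel: presumably a single
+# packing shape serves both operands, so the in-copy entries and their
+# objects are left empty.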
SGEMMONCOPY = ../generic/gemm_ncopy_8.c
SGEMMOTCOPY = ../generic/gemm_tcopy_8.c
-SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
-SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
+SGEMMINCOPYOBJ =
+SGEMMITCOPYOBJ =
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
-DGEMMKERNEL = gemm_kernel_2x8_nehalem.S
-DGEMMINCOPY = dgemm_ncopy_2.S
-DGEMMITCOPY = dgemm_tcopy_2.S
-DGEMMONCOPY = dgemm_ncopy_8.S
-DGEMMOTCOPY = dgemm_tcopy_8.S
+DGEMMKERNEL = dgemm_kernel_4x8_sandy.S
+DGEMMINCOPY = ../generic/gemm_ncopy_8.c
+DGEMMITCOPY = ../generic/gemm_tcopy_8.c
+#DGEMMONCOPY = gemm_ncopy_4.S
+DGEMMONCOPY = ../generic/gemm_ncopy_4.c
+DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
+#DGEMMOTCOPY = gemm_tcopy_4.S
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
-CGEMMKERNEL = zgemm_kernel_2x4_nehalem.S
-CGEMMINCOPY = zgemm_ncopy_2.S
-CGEMMITCOPY = zgemm_tcopy_2.S
-CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
-CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
+#CGEMMKERNEL = zgemm_kernel_2x4_nehalem.S
+CGEMMKERNEL = cgemm_kernel_4x8_sandy.S
+CGEMMINCOPY = ../generic/zgemm_ncopy_8_sandy.c
+CGEMMITCOPY = ../generic/zgemm_tcopy_8_sandy.c
+CGEMMONCOPY = ../generic/zgemm_ncopy_4_sandy.c
+CGEMMOTCOPY = ../generic/zgemm_tcopy_4_sandy.c
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
-ZGEMMKERNEL = zgemm_kernel_1x4_nehalem.S
-ZGEMMINCOPY = zgemm_ncopy_1.S
-ZGEMMITCOPY = zgemm_tcopy_1.S
+#ZGEMMKERNEL = zgemm_kernel_1x4_nehalem.S
+ZGEMMKERNEL = zgemm_kernel_4x4_sandy.S
+ZGEMMINCOPY =
+ZGEMMITCOPY =
ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
-ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
-ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
+ZGEMMINCOPYOBJ =
+ZGEMMITCOPYOBJ =
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
-STRSMKERNEL_LN = trsm_kernel_LN_4x8_nehalem.S
-STRSMKERNEL_LT = trsm_kernel_LT_4x8_nehalem.S
-STRSMKERNEL_RN = trsm_kernel_LT_4x8_nehalem.S
-STRSMKERNEL_RT = trsm_kernel_RT_4x8_nehalem.S
-
-DTRSMKERNEL_LN = trsm_kernel_LN_2x8_nehalem.S
-DTRSMKERNEL_LT = trsm_kernel_LT_2x8_nehalem.S
-DTRSMKERNEL_RN = trsm_kernel_LT_2x8_nehalem.S
-DTRSMKERNEL_RT = trsm_kernel_RT_2x8_nehalem.S
-
-CTRSMKERNEL_LN = ztrsm_kernel_LN_2x4_nehalem.S
-CTRSMKERNEL_LT = ztrsm_kernel_LT_2x4_nehalem.S
-CTRSMKERNEL_RN = ztrsm_kernel_LT_2x4_nehalem.S
-CTRSMKERNEL_RT = ztrsm_kernel_RT_2x4_nehalem.S
-
-ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x4_nehalem.S
-ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x4_nehalem.S
-ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x4_nehalem.S
-ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x4_nehalem.S
+#STRSMKERNEL_LN = trsm_kernel_LN_4x8_nehalem.S
+#STRSMKERNEL_LT = trsm_kernel_LT_4x8_nehalem.S
+#STRSMKERNEL_RN = trsm_kernel_LT_4x8_nehalem.S
+#STRSMKERNEL_RT = trsm_kernel_RT_4x8_nehalem.S
+
+#DTRSMKERNEL_LN = trsm_kernel_LN_2x8_nehalem.S
+#DTRSMKERNEL_LT = trsm_kernel_LT_2x8_nehalem.S
+#DTRSMKERNEL_RN = trsm_kernel_LT_2x8_nehalem.S
+#DTRSMKERNEL_RT = trsm_kernel_RT_2x8_nehalem.S
+
+#CTRSMKERNEL_LN = ztrsm_kernel_LN_2x4_nehalem.S
+#CTRSMKERNEL_LT = ztrsm_kernel_LT_2x4_nehalem.S
+#CTRSMKERNEL_RN = ztrsm_kernel_LT_2x4_nehalem.S
+#CTRSMKERNEL_RT = ztrsm_kernel_RT_2x4_nehalem.S
+
+#ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x4_nehalem.S
+#ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x4_nehalem.S
+#ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x4_nehalem.S
+#ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x4_nehalem.S
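+# Use the portable generic C TRSM kernels in place of the Nehalem assembly
+# kernels commented out above.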
+STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+
CGEMM3MKERNEL = zgemm3m_kernel_4x8_nehalem.S
ZGEMM3MKERNEL = zgemm3m_kernel_2x8_nehalem.S
--- /dev/null
+/*****************************************************************************
+ Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the ISCAS nor the names of its contributors may
+be used to endorse or promote products derived from this software
+without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ **********************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
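+/* Single-precision complex GEMM kernel for Sandy Bridge (AVX).  The main
+   loop computes C in 8x4 panels (Rm = 8, Rn = 4) with the k loop unrolled
+   four times; judging by the KERNEL file in this patch, this is
+   cgemm_kernel_4x8_sandy.S.  */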
+#define old_bm %rdi
+#define old_bn %rsi
+#define old_bk %rdx
+
+#define bm %r13
+#define bn %r14
+#define bk %r15
+
+#define ALPHA %xmm0
+#define ba %rcx
+#define bb %r8
+#define C %r9
+#define ldc %r10
+
+#define i %r11
+#define k %rax
+
+#define ptrba %rdi
+#define ptrbb %rsi
+#define C0 %rbx
+#define C1 %rbp
+
+#define prebb %r12
+
+#ifndef WINDOWS_ABI
+
+#define STACKSIZE 128
+
+#define old_ldc 8+STACKSIZE(%rsp)
+#define old_offset 16+STACKSIZE(%rsp)
+
+#define MEMALPHA_R 48(%rsp)
+#define MEMALPHA_I 56(%rsp)
+#define j 64(%rsp)
+#define OFFSET 72(%rsp)
+#define kk 80(%rsp)
+#define kkk 88(%rsp)
+
+#else
+
+#define STACKSIZE 512
+
+#define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
+#define OLD_A 48 + STACKSIZE(%rsp)
+#define OLD_B 56 + STACKSIZE(%rsp)
+#define OLD_C 64 + STACKSIZE(%rsp)
+#define old_ldc 72 + STACKSIZE(%rsp)
+#define old_offset 80 + STACKSIZE(%rsp)
+
+#define MEMALPHA_R 224(%rsp)
+#define MEMALPHA_I 232(%rsp)
+#define j 240(%rsp)
+#define OFFSET 248(%rsp)
+#define kk 256(%rsp)
+#define kkk 264(%rsp)
+
+#endif
+
+#define PREFETCH0 prefetcht0
+#define PREFETCH1 prefetcht0
+#define PREFETCH2 prefetcht0
+#define PRESIZE 64
+
+#define xvec0 %xmm0
+#define xvec1 %xmm1
+#define xvec2 %xmm2
+#define xvec3 %xmm3
+#define xvec4 %xmm4
+#define xvec5 %xmm5
+#define xvec6 %xmm6
+#define xvec7 %xmm7
+#define xvec8 %xmm8
+#define xvec9 %xmm9
+#define xvec10 %xmm10
+#define xvec11 %xmm11
+#define xvec12 %xmm12
+#define xvec13 %xmm13
+#define xvec14 %xmm14
+#define xvec15 %xmm15
+
+#define yvec0 %ymm0
+#define yvec1 %ymm1
+#define yvec2 %ymm2
+#define yvec3 %ymm3
+#define yvec4 %ymm4
+#define yvec5 %ymm5
+#define yvec6 %ymm6
+#define yvec7 %ymm7
+#define yvec8 %ymm8
+#define yvec9 %ymm9
+#define yvec10 %ymm10
+#define yvec11 %ymm11
+#define yvec12 %ymm12
+#define yvec13 %ymm13
+#define yvec14 %ymm14
+#define yvec15 %ymm15
+
+#define LEAQ leaq
+#define ADDQ addq
+#define MULQ imulq
+#define SARQ sarq
+#define SALQ salq
+#define ANDQ andq
+#define SUBQ subq
+#define DECQ decq
+#define JG jg
+#define JLE jle
+#define TEST testq
+#define OR orq
+#define JNE jne
+#define JMP jmp
+#define NOP
+#define XOR xorpd
+#define MOVQ movq
+
+#define XOR_SY vxorps
+#define XOR_DY vxorpd
+#define XOR_SX xorps
+#define XOR_DX xorpd
+
+#define LD_SY vmovaps
+#define LD_DY vmovapd
+#define LD_SX movaps
+#define LD_DX movapd
+#define LDL_SX movlps
+#define LDL_SY vmovlps
+#define LDH_SX movhps
+#define LDH_SY vmovhps
+
+#define ST_SY vmovaps
+#define ST_DY vmovapd
+#define ST_SX movaps
+#define ST_DX movapd
+#define STL_SX movlps
+#define STL_SY vmovlps
+#define STH_SX movhps
+#define STH_SY vmovhps
+
+#define EDUP_SY vmovsldup
+#define ODUP_SY vmovshdup
+#define EDUP_SX movsldup
+#define ODUP_SX movshdup
+#define EDUP_DY vmovddup
+
+#define ADD_SY vaddps
+#define ADD_DY vaddpd
+#define ADD_SX addps
+#define ADD_DX addpd
+#define SUB_DY vsubpd
+#define SUB_SY vsubps
+#define SUB_DX subpd
+#define SUB_SX subps
+
+#define ADDSUB_DY vaddsubpd
+#define ADDSUB_DX addsubpd
+#define ADDSUB_SY vaddsubps
+#define ADDSUB_SX addsubps
+
+#define MUL_SY vmulps
+#define MUL_DY vmulpd
+#define MUL_SX mulps
+#define MUL_DX mulpd
+
+#define SHUF_SY vperm2f128
+#define SHUF_DY vperm2f128
+#define SHUF_DX pshufd
+#define SHUF_SX pshufd
+
+#define VPERMILP_SY vpermilps
+#define VPERMILP_SX vpermilps
+#define VPERMILP_DY vpermilpd
+
+#define BROAD_SY vbroadcastss
+#define BROAD_DY vbroadcastsd
+#define BROAD_SX vbroadcastss
+#define BROAD_DX movddup
+
+#define MOV_SY vmovaps
+#define MOV_DY vmovapd
+#define MOV_SX movaps
+#define MOV_DX movapd
+
+#define REVS_SY vshufps
+#define REVS_DY vshufpd
+#define REVS_SX shufps
+#define REVS_DX movsd
+
+#define EXTRA_SY vextractf128
+#define EXTRA_DY vextractf128
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+#define ADD1_SY ADD_SY
+#define ADD2_SY ADDSUB_SY
+#define ADD1_SX ADD_SX
+#define ADD2_SX ADDSUB_SX
+#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
+#define ADD1_SY SUB_SY
+#define ADD2_SY ADDSUB_SY
+#define ADD1_SX SUB_SX
+#define ADD2_SX ADDSUB_SX
+#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
+#define ADD1_SY SUB_SY
+#define ADD2_SY ADDSUB_SY
+#define ADD1_SX SUB_SX
+#define ADD2_SX ADDSUB_SX
+#else
+#define ADD1_SY ADD_SY
+#define ADD2_SY ADDSUB_SY
+#define ADD1_SX ADD_SX
+#define ADD2_SX ADDSUB_SX
+#endif
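+/* ADD1/ADD2 select plain add, subtract or add/sub so that the real and
+   imaginary partial products get the correct sign for the conjugation
+   combination being compiled (NN/NT/..., NR/..., RN/...); the remaining
+   sign fix-up is applied once after the k loop.  */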
+
+PROLOGUE
+
+subq $STACKSIZE, %rsp;
+movq %rbx, 0(%rsp);
+movq %rbp, 8(%rsp);
+movq %r12, 16(%rsp);
+movq %r13, 24(%rsp);
+movq %r14, 32(%rsp);
+movq %r15, 40(%rsp);
+
+#ifdef WINDOWS_ABI
+ movq %rdi, 48(%rsp)
+ movq %rsi, 56(%rsp)
+ movups %xmm6, 64(%rsp)
+ movups %xmm7, 80(%rsp)
+ movups %xmm8, 96(%rsp)
+ movups %xmm9, 112(%rsp)
+ movups %xmm10, 128(%rsp)
+ movups %xmm11, 144(%rsp)
+ movups %xmm12, 160(%rsp)
+ movups %xmm13, 176(%rsp)
+ movups %xmm14, 192(%rsp)
+ movups %xmm15, 208(%rsp)
+
+ movq ARG1, old_bm
+ movq ARG2, old_bn
+ movq ARG3, old_bk
+ movq OLD_A, ba
+ movq OLD_B, bb
+ movq OLD_C, C
+ movq old_ldc, ldc
+#ifdef TRMMKERNEL
+ movq old_offset, %r11
+#endif
+ movaps %xmm3, %xmm0
+ movsd OLD_ALPHA_I, %xmm1
+#else
+movq old_ldc, ldc
+#ifdef TRMMKERNEL
+movq old_offset, %r11;
+#endif
+#endif
+
+vmovlps %xmm0, MEMALPHA_R
+vmovlps %xmm1, MEMALPHA_I
+movq old_bm, bm
+movq old_bn, bn
+movq old_bk, bk
+salq $ZBASE_SHIFT, ldc
+#ifdef TRMMKERNEL
+movq %r11, OFFSET
+#ifndef LEFT
+negq %r11;
+#endif
+movq %r11, kk;
+#endif
+
+MOVQ bn,j;
+SARQ $2,j; # Rn = 4
+JLE .L0_loopE;
+.align 32;
+.L0_bodyB:;
+#if defined(TRMMKERNEL) && defined(LEFT)
+MOVQ OFFSET, %rax;
+MOVQ %rax, kk;
+#endif
+MOVQ C,C0;
+LEAQ (C,ldc,2),C1;
+MOVQ bk, k;
+SALQ $5, k;
+LEAQ (bb, k, 1), prebb; # Rn=4, SIZE=4 COMPLEX=2
+MOVQ ba,ptrba;
+MOVQ bm,i;
+SARQ $3,i; # Rm = 8
+JLE .L1_loopE;
+.align 32;
+.L1_bodyB:;
+#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bb,ptrbb;
+#else
+MOVQ bb, ptrbb;
+MOVQ kk, %rax;
+SALQ $ZBASE_SHIFT, %rax;
+LEAQ (ptrba, %rax, 8), ptrba;
+LEAQ (ptrbb, %rax, 4), ptrbb;
+#endif
+# Initialize result registers
+PREFETCH0 0*SIZE(prebb);
+XOR_SY yvec15, yvec15, yvec15;
+PREFETCH0 16*SIZE(prebb);
+ADDQ $32*SIZE, prebb;
+XOR_SY yvec14, yvec14, yvec14;
+PREFETCH2 3*SIZE(C0);
+XOR_SY yvec13, yvec13, yvec13;
+PREFETCH2 3*SIZE(C0, ldc, 1);
+XOR_SY yvec12, yvec12, yvec12;
+PREFETCH2 3*SIZE(C1);
+EDUP_SY 0*SIZE(ptrbb), yvec2; # Br0, Br1, Br2, Br3
+PREFETCH2 3*SIZE(C1, ldc, 1);
+XOR_SY yvec11, yvec11, yvec11;
+XOR_SY yvec10, yvec10, yvec10;
+LD_SY 0*SIZE(ptrba), yvec0; # Ar0, Ai0, Ar1, Ai1..
+XOR_SY yvec9, yvec9, yvec9;
+XOR_SY yvec8, yvec8, yvec8;
+VPERMILP_SY $0x4e, yvec2, yvec3; # Br2, Br3, Br0, Br1
+#ifndef TRMMKERNEL
+MOVQ bk,k;
+#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kk, %rax;
+MOVQ %rax, kkk;
+#else
+MOVQ kk, %rax;
+#ifdef LEFT
+ADDQ $8, %rax;
+#else
+ADDQ $4, %rax;
+#endif
+MOVQ %rax, kkk;
+#endif
+SARQ $2,k; # Unroll 4 times
+JLE .L2_loopE;
+.align 32;
+.L2_bodyB:;
+# Computing kernel
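+# Each unroll step multiplies the A panel (and a copy with real/imaginary
+# parts swapped by VPERMILP) by the duplicated real parts (EDUP) and
+# imaginary parts (ODUP) of B, accumulating with ADD1/ADD2 so the signs
+# match the selected conjugation case.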
+
+######### Unroll 1 ##################
+PREFETCH0 PRESIZE*SIZE(ptrba);
+LD_SY 8*SIZE(ptrba), yvec1; # Ar4, Ai4, Ar5, Ai5..
+MUL_SY yvec0, yvec2, yvec6;
+SHUF_SY $0x03, yvec2, yvec2, yvec4; # Br1, Br0, Br3, Br2
+MUL_SY yvec0, yvec3, yvec7;
+SHUF_SY $0x03, yvec3, yvec3, yvec5; # Br3, Br2, Br1, Br0
+ADD1_SY yvec6, yvec15, yvec15;
+ADD1_SY yvec7, yvec13, yvec13;
+
+MUL_SY yvec1, yvec2, yvec6;
+ODUP_SY 0*SIZE(ptrbb), yvec2; # Bi0, Bi1, Bi2, Bi3
+MUL_SY yvec1, yvec3, yvec7;
+VPERMILP_SY $0x4e, yvec2, yvec3; # Bi2, Bi3, Bi0, Bi1
+ADD1_SY yvec6, yvec14, yvec14;
+ADD1_SY yvec7, yvec12, yvec12;
+
+MUL_SY yvec0, yvec4, yvec6;
+MUL_SY yvec0, yvec5, yvec7;
+VPERMILP_SY $0xb1, yvec0, yvec0; # Ai0, Ar0, Ai1, Ar1..
+ADD1_SY yvec6, yvec11, yvec11;
+ADD1_SY yvec7, yvec9, yvec9;
+
+MUL_SY yvec1, yvec4, yvec6;
+SHUF_SY $0x03, yvec2, yvec2, yvec4; # Br1, Br0, Br3, Br2
+MUL_SY yvec1, yvec5, yvec7;
+SHUF_SY $0x03, yvec3, yvec3, yvec5; # Br3, Br2, Br1, Br0
+ADD1_SY yvec6, yvec10, yvec10;
+ADD1_SY yvec7, yvec8, yvec8;
+
+VPERMILP_SY $0xb1, yvec1, yvec1;
+MUL_SY yvec0, yvec2, yvec6;
+MUL_SY yvec0, yvec3, yvec7;
+ADD2_SY yvec6, yvec15, yvec15;
+ADD2_SY yvec7, yvec13, yvec13;
+
+MUL_SY yvec1, yvec2, yvec6;
+EDUP_SY 8*SIZE(ptrbb), yvec2;
+MUL_SY yvec1, yvec3, yvec7;
+VPERMILP_SY $0x4e, yvec2, yvec3;
+ADD2_SY yvec6, yvec14, yvec14;
+ADD2_SY yvec7, yvec12, yvec12;
+
+MUL_SY yvec0, yvec4, yvec6;
+MUL_SY yvec0, yvec5, yvec7;
+LD_SY 16*SIZE(ptrba), yvec0;
+ADD2_SY yvec6, yvec11, yvec11;
+ADD2_SY yvec7, yvec9, yvec9;
+
+MUL_SY yvec1, yvec4, yvec6;
+MUL_SY yvec1, yvec5, yvec7;
+ADD2_SY yvec6, yvec10, yvec10;
+ADD2_SY yvec7, yvec8, yvec8;
+
+######### Unroll 2 ##################
+PREFETCH0 (PRESIZE+16)*SIZE(ptrba);
+LD_SY 24*SIZE(ptrba), yvec1; # Ar4, Ai4, Ar5, Ai5..
+MUL_SY yvec0, yvec2, yvec6;
+SHUF_SY $0x03, yvec2, yvec2, yvec4; # Br1, Br0, Br3, Br2
+MUL_SY yvec0, yvec3, yvec7;
+SHUF_SY $0x03, yvec3, yvec3, yvec5; # Br3, Br2, Br1, Br0
+ADD1_SY yvec6, yvec15, yvec15;
+ADD1_SY yvec7, yvec13, yvec13;
+
+MUL_SY yvec1, yvec2, yvec6;
+ODUP_SY 8*SIZE(ptrbb), yvec2; # Bi0, Bi1, Bi2, Bi3
+MUL_SY yvec1, yvec3, yvec7;
+VPERMILP_SY $0x4e, yvec2, yvec3; # Bi2, Bi3, Bi0, Bi1
+ADD1_SY yvec6, yvec14, yvec14;
+ADD1_SY yvec7, yvec12, yvec12;
+
+MUL_SY yvec0, yvec4, yvec6;
+MUL_SY yvec0, yvec5, yvec7;
+VPERMILP_SY $0xb1, yvec0, yvec0; # Ai0, Ar0, Ai1, Ar1..
+ADD1_SY yvec6, yvec11, yvec11;
+ADD1_SY yvec7, yvec9, yvec9;
+
+MUL_SY yvec1, yvec4, yvec6;
+SHUF_SY $0x03, yvec2, yvec2, yvec4; # Br1, Br0, Br3, Br2
+MUL_SY yvec1, yvec5, yvec7;
+SHUF_SY $0x03, yvec3, yvec3, yvec5; # Br3, Br2, Br1, Br0
+ADD1_SY yvec6, yvec10, yvec10;
+ADD1_SY yvec7, yvec8, yvec8;
+
+VPERMILP_SY $0xb1, yvec1, yvec1;
+MUL_SY yvec0, yvec2, yvec6;
+MUL_SY yvec0, yvec3, yvec7;
+ADD2_SY yvec6, yvec15, yvec15;
+ADD2_SY yvec7, yvec13, yvec13;
+
+MUL_SY yvec1, yvec2, yvec6;
+EDUP_SY 16*SIZE(ptrbb), yvec2;
+MUL_SY yvec1, yvec3, yvec7;
+VPERMILP_SY $0x4e, yvec2, yvec3;
+ADD2_SY yvec6, yvec14, yvec14;
+ADD2_SY yvec7, yvec12, yvec12;
+
+MUL_SY yvec0, yvec4, yvec6;
+MUL_SY yvec0, yvec5, yvec7;
+LD_SY 32*SIZE(ptrba), yvec0;
+ADD2_SY yvec6, yvec11, yvec11;
+ADD2_SY yvec7, yvec9, yvec9;
+
+MUL_SY yvec1, yvec4, yvec6;
+MUL_SY yvec1, yvec5, yvec7;
+ADD2_SY yvec6, yvec10, yvec10;
+ADD2_SY yvec7, yvec8, yvec8;
+
+######### Unroll 3 ##################
+PREFETCH0 (PRESIZE+32)*SIZE(ptrba);
+LD_SY 40*SIZE(ptrba), yvec1; # Ar4, Ai4, Ar5, Ai5..
+MUL_SY yvec0, yvec2, yvec6;
+SHUF_SY $0x03, yvec2, yvec2, yvec4; # Br1, Br0, Br3, Br2
+MUL_SY yvec0, yvec3, yvec7;
+SHUF_SY $0x03, yvec3, yvec3, yvec5; # Br3, Br2, Br1, Br0
+ADD1_SY yvec6, yvec15, yvec15;
+ADD1_SY yvec7, yvec13, yvec13;
+
+MUL_SY yvec1, yvec2, yvec6;
+ODUP_SY 16*SIZE(ptrbb), yvec2; # Bi0, Bi1, Bi2, Bi3
+MUL_SY yvec1, yvec3, yvec7;
+VPERMILP_SY $0x4e, yvec2, yvec3; # Bi2, Bi3, Bi0, Bi1
+ADD1_SY yvec6, yvec14, yvec14;
+ADD1_SY yvec7, yvec12, yvec12;
+
+MUL_SY yvec0, yvec4, yvec6;
+MUL_SY yvec0, yvec5, yvec7;
+VPERMILP_SY $0xb1, yvec0, yvec0; # Ai0, Ar0, Ai1, Ar1..
+ADD1_SY yvec6, yvec11, yvec11;
+ADD1_SY yvec7, yvec9, yvec9;
+
+MUL_SY yvec1, yvec4, yvec6;
+SHUF_SY $0x03, yvec2, yvec2, yvec4; # Br1, Br0, Br3, Br2
+MUL_SY yvec1, yvec5, yvec7;
+SHUF_SY $0x03, yvec3, yvec3, yvec5; # Br3, Br2, Br1, Br0
+ADD1_SY yvec6, yvec10, yvec10;
+ADD1_SY yvec7, yvec8, yvec8;
+
+VPERMILP_SY $0xb1, yvec1, yvec1;
+MUL_SY yvec0, yvec2, yvec6;
+MUL_SY yvec0, yvec3, yvec7;
+ADD2_SY yvec6, yvec15, yvec15;
+ADD2_SY yvec7, yvec13, yvec13;
+
+MUL_SY yvec1, yvec2, yvec6;
+EDUP_SY 24*SIZE(ptrbb), yvec2;
+MUL_SY yvec1, yvec3, yvec7;
+VPERMILP_SY $0x4e, yvec2, yvec3;
+ADD2_SY yvec6, yvec14, yvec14;
+ADD2_SY yvec7, yvec12, yvec12;
+
+MUL_SY yvec0, yvec4, yvec6;
+MUL_SY yvec0, yvec5, yvec7;
+LD_SY 48*SIZE(ptrba), yvec0;
+ADD2_SY yvec6, yvec11, yvec11;
+ADD2_SY yvec7, yvec9, yvec9;
+
+MUL_SY yvec1, yvec4, yvec6;
+MUL_SY yvec1, yvec5, yvec7;
+ADD2_SY yvec6, yvec10, yvec10;
+ADD2_SY yvec7, yvec8, yvec8;
+
+######### Unroll 4 ##################
+PREFETCH0 (PRESIZE+48)*SIZE(ptrba);
+LD_SY 56*SIZE(ptrba), yvec1; # Ar4, Ai4, Ar5, Ai5..
+MUL_SY yvec0, yvec2, yvec6;
+SHUF_SY $0x03, yvec2, yvec2, yvec4; # Br1, Br0, Br3, Br2
+MUL_SY yvec0, yvec3, yvec7;
+SHUF_SY $0x03, yvec3, yvec3, yvec5; # Br3, Br2, Br1, Br0
+ADDQ $64*SIZE, ptrba;
+ADD1_SY yvec6, yvec15, yvec15;
+ADD1_SY yvec7, yvec13, yvec13;
+
+MUL_SY yvec1, yvec2, yvec6;
+ODUP_SY 24*SIZE(ptrbb), yvec2; # Bi0, Bi1, Bi2, Bi3
+MUL_SY yvec1, yvec3, yvec7;
+VPERMILP_SY $0x4e, yvec2, yvec3; # Bi2, Bi3, Bi0, Bi1
+ADDQ $32*SIZE, ptrbb;
+ADD1_SY yvec6, yvec14, yvec14;
+ADD1_SY yvec7, yvec12, yvec12;
+
+MUL_SY yvec0, yvec4, yvec6;
+MUL_SY yvec0, yvec5, yvec7;
+VPERMILP_SY $0xb1, yvec0, yvec0; # Ai0, Ar0, Ai1, Ar1..
+ADD1_SY yvec6, yvec11, yvec11;
+ADD1_SY yvec7, yvec9, yvec9;
+
+MUL_SY yvec1, yvec4, yvec6;
+SHUF_SY $0x03, yvec2, yvec2, yvec4; # Br1, Br0, Br3, Br2
+MUL_SY yvec1, yvec5, yvec7;
+SHUF_SY $0x03, yvec3, yvec3, yvec5; # Br3, Br2, Br1, Br0
+ADD1_SY yvec6, yvec10, yvec10;
+ADD1_SY yvec7, yvec8, yvec8;
+
+VPERMILP_SY $0xb1, yvec1, yvec1;
+MUL_SY yvec0, yvec2, yvec6;
+MUL_SY yvec0, yvec3, yvec7;
+ADD2_SY yvec6, yvec15, yvec15;
+ADD2_SY yvec7, yvec13, yvec13;
+
+MUL_SY yvec1, yvec2, yvec6;
+EDUP_SY 0*SIZE(ptrbb), yvec2;
+MUL_SY yvec1, yvec3, yvec7;
+VPERMILP_SY $0x4e, yvec2, yvec3;
+ADD2_SY yvec6, yvec14, yvec14;
+ADD2_SY yvec7, yvec12, yvec12;
+
+MUL_SY yvec0, yvec4, yvec6;
+MUL_SY yvec0, yvec5, yvec7;
+LD_SY 0*SIZE(ptrba), yvec0;
+ADD2_SY yvec6, yvec11, yvec11;
+ADD2_SY yvec7, yvec9, yvec9;
+
+MUL_SY yvec1, yvec4, yvec6;
+MUL_SY yvec1, yvec5, yvec7;
+ADD2_SY yvec6, yvec10, yvec10;
+ADD2_SY yvec7, yvec8, yvec8;
+.L2_bodyE:;
+DECQ k;
+JG .L2_bodyB;
+.align 64;
+.L2_loopE:;
+#ifndef TRMMKERNEL
+TEST $2, bk;
+#else
+TEST $2, kkk;
+#endif
+JLE .L3_loopE;
+.align 64
+.L3_loopB:
+######### Unroll 1 ##################
+PREFETCH0 PRESIZE*SIZE(ptrba)
+LD_SY 8*SIZE(ptrba), yvec1; # Ar4, Ai4, Ar5, Ai5..
+MUL_SY yvec0, yvec2, yvec6;
+MUL_SY yvec0, yvec3, yvec7;
+SHUF_SY $0x03, yvec2, yvec2, yvec4; # Br1, Br0, Br3, Br2
+ADD1_SY yvec6, yvec15, yvec15;
+ADD1_SY yvec7, yvec13, yvec13;
+
+MUL_SY yvec1, yvec2, yvec6;
+MUL_SY yvec1, yvec3, yvec7;
+SHUF_SY $0x03, yvec3, yvec3, yvec5; # Br3, Br2, Br1, Br0
+ADD1_SY yvec6, yvec14, yvec14;
+ADD1_SY yvec7, yvec12, yvec12;
+
+ODUP_SY 0*SIZE(ptrbb), yvec2; # Bi0, Bi1, Bi2, Bi3
+MUL_SY yvec0, yvec4, yvec6;
+MUL_SY yvec0, yvec5, yvec7;
+VPERMILP_SY $0x4e, yvec2, yvec3; # Bi2, Bi3, Bi0, Bi1
+ADD1_SY yvec6, yvec11, yvec11;
+ADD1_SY yvec7, yvec9, yvec9;
+
+MUL_SY yvec1, yvec4, yvec6;
+MUL_SY yvec1, yvec5, yvec7;
+VPERMILP_SY $0xb1, yvec0, yvec0; # Ai0, Ar0, Ai1, Ar1..
+ADD1_SY yvec6, yvec10, yvec10;
+ADD1_SY yvec7, yvec8, yvec8;
+
+VPERMILP_SY $0xb1, yvec1, yvec1;
+MUL_SY yvec0, yvec2, yvec6;
+MUL_SY yvec0, yvec3, yvec7;
+SHUF_SY $0x03, yvec2, yvec2, yvec4; # Br1, Br0, Br3, Br2
+ADD2_SY yvec6, yvec15, yvec15;
+ADD2_SY yvec7, yvec13, yvec13;
+
+MUL_SY yvec1, yvec2, yvec6;
+MUL_SY yvec1, yvec3, yvec7;
+SHUF_SY $0x03, yvec3, yvec3, yvec5; # Br3, Br2, Br1, Br0
+ADD2_SY yvec6, yvec14, yvec14;
+ADD2_SY yvec7, yvec12, yvec12;
+
+EDUP_SY 8*SIZE(ptrbb), yvec2;
+MUL_SY yvec0, yvec4, yvec6;
+MUL_SY yvec0, yvec5, yvec7;
+VPERMILP_SY $0x4e, yvec2, yvec3;
+ADD2_SY yvec6, yvec11, yvec11;
+ADD2_SY yvec7, yvec9, yvec9;
+
+LD_SY 16*SIZE(ptrba), yvec0;
+MUL_SY yvec1, yvec4, yvec6;
+MUL_SY yvec1, yvec5, yvec7;
+ADD2_SY yvec6, yvec10, yvec10;
+ADD2_SY yvec7, yvec8, yvec8;
+
+######### Unroll 2 ##################
+PREFETCH0 (PRESIZE+16)*SIZE(ptrba)
+LD_SY 24*SIZE(ptrba), yvec1; # Ar4, Ai4, Ar5, Ai5..
+MUL_SY yvec0, yvec2, yvec6;
+MUL_SY yvec0, yvec3, yvec7;
+ADDQ $32*SIZE, ptrba
+SHUF_SY $0x03, yvec2, yvec2, yvec4; # Br1, Br0, Br3, Br2
+ADD1_SY yvec6, yvec15, yvec15;
+ADD1_SY yvec7, yvec13, yvec13;
+
+MUL_SY yvec1, yvec2, yvec6;
+MUL_SY yvec1, yvec3, yvec7;
+SHUF_SY $0x03, yvec3, yvec3, yvec5; # Br3, Br2, Br1, Br0
+ADD1_SY yvec6, yvec14, yvec14;
+ADD1_SY yvec7, yvec12, yvec12;
+
+ODUP_SY 8*SIZE(ptrbb), yvec2; # Bi0, Bi1, Bi2, Bi3
+MUL_SY yvec0, yvec4, yvec6;
+MUL_SY yvec0, yvec5, yvec7;
+ADDQ $16*SIZE, ptrbb;
+VPERMILP_SY $0x4e, yvec2, yvec3; # Bi2, Bi3, Bi0, Bi1
+ADD1_SY yvec6, yvec11, yvec11;
+ADD1_SY yvec7, yvec9, yvec9;
+
+MUL_SY yvec1, yvec4, yvec6;
+MUL_SY yvec1, yvec5, yvec7;
+VPERMILP_SY $0xb1, yvec0, yvec0; # Ai0, Ar0, Ai1, Ar1..
+ADD1_SY yvec6, yvec10, yvec10;
+ADD1_SY yvec7, yvec8, yvec8;
+
+VPERMILP_SY $0xb1, yvec1, yvec1;
+MUL_SY yvec0, yvec2, yvec6;
+MUL_SY yvec0, yvec3, yvec7;
+SHUF_SY $0x03, yvec2, yvec2, yvec4; # Br1, Br0, Br3, Br2
+ADD2_SY yvec6, yvec15, yvec15;
+ADD2_SY yvec7, yvec13, yvec13;
+
+MUL_SY yvec1, yvec2, yvec6;
+MUL_SY yvec1, yvec3, yvec7;
+SHUF_SY $0x03, yvec3, yvec3, yvec5; # Br3, Br2, Br1, Br0
+ADD2_SY yvec6, yvec14, yvec14;
+ADD2_SY yvec7, yvec12, yvec12;
+
+EDUP_SY 0*SIZE(ptrbb), yvec2;
+MUL_SY yvec0, yvec4, yvec6;
+MUL_SY yvec0, yvec5, yvec7;
+VPERMILP_SY $0x4e, yvec2, yvec3;
+ADD2_SY yvec6, yvec11, yvec11;
+ADD2_SY yvec7, yvec9, yvec9;
+
+LD_SY 0*SIZE(ptrba), yvec0;
+MUL_SY yvec1, yvec4, yvec6;
+MUL_SY yvec1, yvec5, yvec7;
+ADD2_SY yvec6, yvec10, yvec10;
+ADD2_SY yvec7, yvec8, yvec8;
+.L3_loopE:
+#ifndef TRMMKERNEL
+TEST $1, bk;
+#else
+TEST $1, kkk;
+#endif
+JLE .L4_loopE;
+.align 64
+.L4_loopB:;
+######### Unroll 1 ##################
+PREFETCH0 PRESIZE*SIZE(ptrba)
+LD_SY 8*SIZE(ptrba), yvec1; # Ar4, Ai4, Ar5, Ai5..
+MUL_SY yvec0, yvec2, yvec6;
+MUL_SY yvec0, yvec3, yvec7;
+ADDQ $16*SIZE, ptrba;
+SHUF_SY $0x03, yvec2, yvec2, yvec4; # Br1, Br0, Br3, Br2
+ADD1_SY yvec6, yvec15, yvec15;
+ADD1_SY yvec7, yvec13, yvec13;
+
+MUL_SY yvec1, yvec2, yvec6;
+MUL_SY yvec1, yvec3, yvec7;
+SHUF_SY $0x03, yvec3, yvec3, yvec5; # Br3, Br2, Br1, Br0
+ADD1_SY yvec6, yvec14, yvec14;
+ADD1_SY yvec7, yvec12, yvec12;
+
+ODUP_SY 0*SIZE(ptrbb), yvec2; # Bi0, Bi1, Bi2, Bi3
+MUL_SY yvec0, yvec4, yvec6;
+MUL_SY yvec0, yvec5, yvec7;
+ADDQ $8*SIZE, ptrbb;
+VPERMILP_SY $0x4e, yvec2, yvec3; # Bi2, Bi3, Bi0, Bi1
+ADD1_SY yvec6, yvec11, yvec11;
+ADD1_SY yvec7, yvec9, yvec9;
+
+MUL_SY yvec1, yvec4, yvec6;
+MUL_SY yvec1, yvec5, yvec7;
+VPERMILP_SY $0xb1, yvec0, yvec0; # Ai0, Ar0, Ai1, Ar1..
+ADD1_SY yvec6, yvec10, yvec10;
+ADD1_SY yvec7, yvec8, yvec8;
+
+VPERMILP_SY $0xb1, yvec1, yvec1;
+MUL_SY yvec0, yvec2, yvec6;
+MUL_SY yvec0, yvec3, yvec7;
+SHUF_SY $0x03, yvec2, yvec2, yvec4; # Br1, Br0, Br3, Br2
+ADD2_SY yvec6, yvec15, yvec15;
+ADD2_SY yvec7, yvec13, yvec13;
+
+MUL_SY yvec1, yvec2, yvec6;
+ADD2_SY yvec6, yvec14, yvec14;
+SHUF_SY $0x03, yvec3, yvec3, yvec5; # Br3, Br2, Br1, Br0
+MUL_SY yvec1, yvec3, yvec7;
+ADD2_SY yvec7, yvec12, yvec12;
+
+MUL_SY yvec0, yvec4, yvec6;
+MUL_SY yvec0, yvec5, yvec7;
+VPERMILP_SY $0x4e, yvec2, yvec3;
+ADD2_SY yvec6, yvec11, yvec11;
+ADD2_SY yvec7, yvec9, yvec9;
+
+MUL_SY yvec1, yvec4, yvec6;
+MUL_SY yvec1, yvec5, yvec7;
+ADD2_SY yvec6, yvec10, yvec10;
+ADD2_SY yvec7, yvec8, yvec8;
+
+.L4_loopE:;
+#### Fix up accumulator signs for the conjugation variants ####
+XOR_SY yvec7, yvec7, yvec7;
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ADDSUB_SY yvec15, yvec7, yvec15;
+ADDSUB_SY yvec14, yvec7, yvec14;
+ADDSUB_SY yvec13, yvec7, yvec13;
+ADDSUB_SY yvec12, yvec7, yvec12;
+ADDSUB_SY yvec11, yvec7, yvec11;
+ADDSUB_SY yvec10, yvec7, yvec10;
+ADDSUB_SY yvec9, yvec7, yvec9;
+ADDSUB_SY yvec8, yvec7, yvec8;
+#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
+SUB_SY yvec15, yvec7, yvec15;
+SUB_SY yvec14, yvec7, yvec14;
+SUB_SY yvec13, yvec7, yvec13;
+SUB_SY yvec12, yvec7, yvec12;
+SUB_SY yvec11, yvec7, yvec11;
+SUB_SY yvec10, yvec7, yvec10;
+SUB_SY yvec9, yvec7, yvec9;
+SUB_SY yvec8, yvec7, yvec8;
+#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
+VPERMILP_SY $0xb1, yvec15, yvec15;
+VPERMILP_SY $0xb1, yvec14, yvec14;
+VPERMILP_SY $0xb1, yvec13, yvec13;
+VPERMILP_SY $0xb1, yvec12, yvec12;
+VPERMILP_SY $0xb1, yvec11, yvec11;
+VPERMILP_SY $0xb1, yvec10, yvec10;
+VPERMILP_SY $0xb1, yvec9, yvec9;
+VPERMILP_SY $0xb1, yvec8, yvec8;
+ADDSUB_SY yvec15, yvec7, yvec15;
+ADDSUB_SY yvec14, yvec7, yvec14;
+ADDSUB_SY yvec13, yvec7, yvec13;
+ADDSUB_SY yvec12, yvec7, yvec12;
+ADDSUB_SY yvec11, yvec7, yvec11;
+ADDSUB_SY yvec10, yvec7, yvec10;
+ADDSUB_SY yvec9, yvec7, yvec9;
+ADDSUB_SY yvec8, yvec7, yvec8;
+VPERMILP_SY $0xb1, yvec15, yvec15;
+VPERMILP_SY $0xb1, yvec14, yvec14;
+VPERMILP_SY $0xb1, yvec13, yvec13;
+VPERMILP_SY $0xb1, yvec12, yvec12;
+VPERMILP_SY $0xb1, yvec11, yvec11;
+VPERMILP_SY $0xb1, yvec10, yvec10;
+VPERMILP_SY $0xb1, yvec9, yvec9;
+VPERMILP_SY $0xb1, yvec8, yvec8;
+#endif
+#### Load Alpha ####
+BROAD_SY MEMALPHA_R,yvec7;
+BROAD_SY MEMALPHA_I,yvec6;
+#### Multiply Alpha ####
+VPERMILP_SY $0xb1,yvec15, yvec5;
+MUL_SY yvec15, yvec7, yvec15;
+MUL_SY yvec5, yvec6, yvec5;
+ADDSUB_SY yvec5, yvec15, yvec15;
+VPERMILP_SY $0xb1,yvec14, yvec4;
+MUL_SY yvec14, yvec7, yvec14;
+MUL_SY yvec4, yvec6, yvec4;
+ADDSUB_SY yvec4, yvec14, yvec14;
+VPERMILP_SY $0xb1,yvec13, yvec3;
+MUL_SY yvec13, yvec7, yvec13;
+MUL_SY yvec3, yvec6, yvec3;
+ADDSUB_SY yvec3, yvec13, yvec13;
+VPERMILP_SY $0xb1,yvec12, yvec2;
+MUL_SY yvec12, yvec7, yvec12;
+MUL_SY yvec2, yvec6, yvec2;
+ADDSUB_SY yvec2, yvec12, yvec12;
+VPERMILP_SY $0xb1,yvec11, yvec1;
+MUL_SY yvec11, yvec7, yvec11;
+MUL_SY yvec1, yvec6, yvec1;
+ADDSUB_SY yvec1, yvec11, yvec11;
+VPERMILP_SY $0xb1,yvec10, yvec0;
+MUL_SY yvec10, yvec7, yvec10;
+MUL_SY yvec0, yvec6, yvec0;
+ADDSUB_SY yvec0, yvec10, yvec10;
+VPERMILP_SY $0xb1,yvec9, yvec5;
+MUL_SY yvec9, yvec7, yvec9;
+MUL_SY yvec5, yvec6, yvec5;
+ADDSUB_SY yvec5, yvec9, yvec9;
+VPERMILP_SY $0xb1,yvec8, yvec4;
+MUL_SY yvec8, yvec7, yvec8;
+MUL_SY yvec4, yvec6, yvec4;
+ADDSUB_SY yvec4, yvec8, yvec8;
+#### Shuffle Results ####
+MOV_SY yvec15,yvec7;
+REVS_SY $0xe4,yvec13,yvec15,yvec15;
+REVS_SY $0xe4,yvec7,yvec13,yvec13;
+MOV_SY yvec14,yvec7;
+REVS_SY $0xe4,yvec12,yvec14,yvec14;
+REVS_SY $0xe4,yvec7,yvec12,yvec12;
+MOV_SY yvec11,yvec7;
+REVS_SY $0xe4,yvec9,yvec11,yvec11;
+REVS_SY $0xe4,yvec7,yvec9,yvec9;
+MOV_SY yvec10,yvec7;
+REVS_SY $0xe4,yvec8,yvec10,yvec10;
+REVS_SY $0xe4,yvec7,yvec8,yvec8;
+#### Store Back ####
+#### Testing alignment ####
+MOVQ C0, %rax;
+OR ldc, %rax;
+TEST $15, %rax;
+JNE .L4_loopEx;
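+# Aligned path: C0 and ldc are both 16-byte aligned, so aligned 128-bit
+# stores (ST_SY) can be used; the unaligned case is handled at .L4_loopEx
+# with vmovlps/vmovhps pairs (STL_SY/STH_SY).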
+.align 32
+EXTRA_SY $1,yvec15,xvec7;
+EXTRA_SY $1,yvec14,xvec6;
+EXTRA_SY $1,yvec13,xvec5;
+EXTRA_SY $1,yvec12,xvec4;
+EXTRA_SY $1,yvec11,xvec3;
+EXTRA_SY $1,yvec10,xvec2;
+EXTRA_SY $1,yvec9,xvec1;
+EXTRA_SY $1,yvec8,xvec0;
+#ifndef TRMMKERNEL
+ADD_SY 0*SIZE(C0),xvec15, xvec15;
+ADD_SY 4*SIZE(C1),xvec7, xvec7;
+ADD_SY 8*SIZE(C0),xvec14, xvec14;
+ADD_SY 12*SIZE(C1),xvec6, xvec6;
+ADD_SY 0*SIZE(C0,ldc,1),xvec13, xvec13;
+ADD_SY 4*SIZE(C1,ldc,1),xvec5, xvec5;
+ADD_SY 8*SIZE(C0,ldc,1),xvec12, xvec12;
+ADD_SY 12*SIZE(C1,ldc,1),xvec4, xvec4;
+ADD_SY 0*SIZE(C1),xvec11, xvec11;
+ADD_SY 4*SIZE(C0),xvec3, xvec3;
+ADD_SY 8*SIZE(C1),xvec10, xvec10;
+ADD_SY 12*SIZE(C0),xvec2, xvec2;
+ADD_SY 0*SIZE(C1,ldc,1),xvec9, xvec9;
+ADD_SY 4*SIZE(C0,ldc,1),xvec1, xvec1;
+ADD_SY 8*SIZE(C1,ldc,1),xvec8, xvec8;
+ADD_SY 12*SIZE(C0,ldc,1),xvec0, xvec0;
+#endif
+ST_SY xvec15,0*SIZE(C0);
+ST_SY xvec7,4*SIZE(C1);
+ST_SY xvec14,8*SIZE(C0);
+ST_SY xvec6,12*SIZE(C1);
+ST_SY xvec13,0*SIZE(C0,ldc,1);
+ST_SY xvec5,4*SIZE(C1,ldc,1);
+ST_SY xvec12,8*SIZE(C0,ldc,1);
+ST_SY xvec4,12*SIZE(C1,ldc,1);
+ST_SY xvec11,0*SIZE(C1);
+ST_SY xvec3,4*SIZE(C0);
+ST_SY xvec10,8*SIZE(C1);
+ST_SY xvec2,12*SIZE(C0);
+ST_SY xvec9,0*SIZE(C1,ldc,1);
+ST_SY xvec1,4*SIZE(C0,ldc,1);
+ST_SY xvec8,8*SIZE(C1,ldc,1);
+ST_SY xvec0,12*SIZE(C0,ldc,1);
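+# TRMM bookkeeping: step ptrba/ptrbb past the K values skipped for this tile
+# (bk - kkk iterations, times 8 resp. 4 complex elements) and, in the LEFT
+# case, advance kk by the M-unroll of 8.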
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+SALQ $ZBASE_SHIFT, %rax;
+LEAQ (ptrba, %rax, 8), ptrba;
+LEAQ (ptrbb, %rax, 4), ptrbb;
+#endif
+#if defined(TRMMKERNEL) && defined(LEFT)
+ADDQ $8, kk;
+#endif
+ADDQ $16*SIZE,C0;
+ADDQ $16*SIZE,C1;
+.L1_bodyE:;
+DECQ i;
+JG .L1_bodyB;
+JMP .L1_loopE;
+.align 32
+.L4_loopEx:
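+# Unaligned write-back path: the same results are added to and stored into C
+# via 64-bit LDL/LDH and STL/STH halves, so no 16-byte alignment is required.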
+EXTRA_SY $1, yvec15, xvec7;
+#ifndef TRMMKERNEL
+LDL_SY 0*SIZE(C0), xvec6, xvec6;
+LDH_SY 2*SIZE(C0), xvec6, xvec6;
+ADD_SY xvec6, xvec15, xvec15;
+#endif
+STL_SY xvec15, 0*SIZE(C0);
+STH_SY xvec15, 2*SIZE(C0);
+#ifndef TRMMKERNEL
+LDL_SY 4*SIZE(C1), xvec5, xvec5;
+LDH_SY 6*SIZE(C1), xvec5, xvec5;
+ADD_SY xvec5, xvec7, xvec7;
+#endif
+STL_SY xvec7, 4*SIZE(C1);
+STH_SY xvec7, 6*SIZE(C1);
+
+EXTRA_SY $1, yvec14, xvec6;
+#ifndef TRMMKERNEL
+LDL_SY 8*SIZE(C0), xvec5, xvec5;
+LDH_SY 10*SIZE(C0), xvec5, xvec5;
+ADD_SY xvec5, xvec14, xvec14;
+#endif
+STL_SY xvec14, 8*SIZE(C0);
+STH_SY xvec14, 10*SIZE(C0);
+#ifndef TRMMKERNEL
+LDL_SY 12*SIZE(C1), xvec4, xvec4;
+LDH_SY 14*SIZE(C1), xvec4, xvec4;
+ADD_SY xvec4, xvec6, xvec6;
+#endif
+STL_SY xvec6, 12*SIZE(C1);
+STH_SY xvec6, 14*SIZE(C1);
+
+EXTRA_SY $1, yvec13, xvec5;
+#ifndef TRMMKERNEL
+LDL_SY 0*SIZE(C0, ldc, 1), xvec4, xvec4;
+LDH_SY 2*SIZE(C0, ldc, 1), xvec4, xvec4;
+ADD_SY xvec4, xvec13, xvec13;
+#endif
+STL_SY xvec13, 0*SIZE(C0, ldc, 1);
+STH_SY xvec13, 2*SIZE(C0, ldc, 1);
+#ifndef TRMMKERNEL
+LDL_SY 4*SIZE(C1, ldc, 1), xvec3, xvec3;
+LDH_SY 6*SIZE(C1, ldc, 1), xvec3, xvec3;
+ADD_SY xvec3, xvec5, xvec5;
+#endif
+STL_SY xvec5, 4*SIZE(C1, ldc, 1);
+STH_SY xvec5, 6*SIZE(C1, ldc, 1);

+
+EXTRA_SY $1, yvec12, xvec4;
+#ifndef TRMMKERNEL
+LDL_SY 8*SIZE(C0, ldc, 1), xvec3, xvec3;
+LDH_SY 10*SIZE(C0, ldc, 1), xvec3, xvec3;
+ADD_SY xvec3, xvec12, xvec12;
+#endif
+STL_SY xvec12, 8*SIZE(C0, ldc, 1);
+STH_SY xvec12, 10*SIZE(C0, ldc, 1);
+#ifndef TRMMKERNEL
+LDL_SY 12*SIZE(C1, ldc, 1), xvec2, xvec2;
+LDH_SY 14*SIZE(C1, ldc, 1), xvec2, xvec2;
+ADD_SY xvec2, xvec4, xvec4;
+#endif
+STL_SY xvec4, 12*SIZE(C1, ldc, 1);
+STH_SY xvec4, 14*SIZE(C1, ldc, 1);
+
+EXTRA_SY $1, yvec11, xvec3;
+#ifndef TRMMKERNEL
+LDL_SY 0*SIZE(C1), xvec2, xvec2;
+LDH_SY 2*SIZE(C1), xvec2, xvec2;
+ADD_SY xvec2, xvec11, xvec11;
+#endif
+STL_SY xvec11, 0*SIZE(C1);
+STH_SY xvec11, 2*SIZE(C1);
+#ifndef TRMMKERNEL
+LDL_SY 4*SIZE(C0), xvec1, xvec1;
+LDH_SY 6*SIZE(C0), xvec1, xvec1;
+ADD_SY xvec1, xvec3, xvec3;
+#endif
+STL_SY xvec3, 4*SIZE(C0);
+STH_SY xvec3, 6*SIZE(C0);
+
+EXTRA_SY $1, yvec10, xvec2;
+#ifndef TRMMKERNEL
+LDL_SY 8*SIZE(C1), xvec1, xvec1;
+LDH_SY 10*SIZE(C1), xvec1, xvec1;
+ADD_SY xvec1, xvec10, xvec10;
+#endif
+STL_SY xvec10, 8*SIZE(C1);
+STH_SY xvec10, 10*SIZE(C1);
+#ifndef TRMMKERNEL
+LDL_SY 12*SIZE(C0), xvec0, xvec0;
+LDH_SY 14*SIZE(C0), xvec0, xvec0;
+ADD_SY xvec0, xvec2, xvec2;
+#endif
+STL_SY xvec2, 12*SIZE(C0);
+STH_SY xvec2, 14*SIZE(C0);
+
+EXTRA_SY $1, yvec9, xvec1;
+#ifndef TRMMKERNEL
+LDL_SY 0*SIZE(C1, ldc, 1), xvec7, xvec7;
+LDH_SY 2*SIZE(C1, ldc, 1), xvec7, xvec7;
+ADD_SY xvec7, xvec9, xvec9;
+#endif
+STL_SY xvec9, 0*SIZE(C1, ldc, 1);
+STH_SY xvec9, 2*SIZE(C1, ldc, 1);
+#ifndef TRMMKERNEL
+LDL_SY 4*SIZE(C0, ldc, 1), xvec6, xvec6;
+LDH_SY 6*SIZE(C0, ldc, 1), xvec6, xvec6;
+ADD_SY xvec6, xvec1, xvec1;
+#endif
+STL_SY xvec1, 4*SIZE(C0, ldc, 1);
+STH_SY xvec1, 6*SIZE(C0, ldc, 1);
+
+EXTRA_SY $1, yvec8, xvec0;
+#ifndef TRMMKERNEL
+LDL_SY 8*SIZE(C1, ldc, 1), xvec6, xvec6;
+LDH_SY 10*SIZE(C1, ldc, 1), xvec6, xvec6;
+ADD_SY xvec6, xvec8, xvec8;
+#endif
+STL_SY xvec8, 8*SIZE(C1, ldc, 1);
+STH_SY xvec8, 10*SIZE(C1, ldc, 1);
+#ifndef TRMMKERNEL
+LDL_SY 12*SIZE(C0, ldc, 1), xvec5, xvec5;
+LDH_SY 14*SIZE(C0, ldc, 1), xvec5, xvec5;
+ADD_SY xvec5, xvec0, xvec0;
+#endif
+STL_SY xvec0, 12*SIZE(C0, ldc, 1);
+STH_SY xvec0, 14*SIZE(C0, ldc, 1);
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+SALQ $ZBASE_SHIFT, %rax;
+LEAQ (ptrba, %rax, 8), ptrba;
+LEAQ (ptrbb, %rax, 4), ptrbb;
+#endif
+#if defined(TRMMKERNEL) && defined(LEFT)
+ADDQ $8, kk;
+#endif
+ADDQ $16*SIZE, C0;
+ADDQ $16*SIZE, C1;
+DECQ i;
+JG .L1_bodyB;
+.align 32;
+.L1_loopE:;
+TEST $4, bm;
+JLE .L5_loopE;
+.align 32
+.L5_bodyB:
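+# M&4 tile of the 4-column panel: 4 complex rows of A against 4 columns of B.
+# For TRMM, ptrba/ptrbb are first advanced past the first kk iterations of the
+# packed panels so only the part required by the triangular operand is used.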
+#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bb,ptrbb;
+#else
+MOVQ bb, ptrbb;
+MOVQ kk, %rax;
+SALQ $ZBASE_SHIFT, %rax;
+LEAQ (ptrba, %rax, 4), ptrba;
+LEAQ (ptrbb, %rax, 4), ptrbb;
+#endif
+XOR_SY yvec15, yvec15, yvec15;
+XOR_SY yvec13, yvec13, yvec13;
+XOR_SY yvec11, yvec11, yvec11;
+XOR_SY yvec9, yvec9, yvec9;
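+# Pick the K trip count: the full bk for plain GEMM, or the TRMM-restricted
+# count kkk (either bk - kk, or kk + 4; the M- and N-unroll are both 4 for
+# this tile, so the LEFT and non-LEFT branches add the same amount).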
+#ifndef TRMMKERNEL
+MOVQ bk,k;
+#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kk, %rax;
+MOVQ %rax, kkk;
+#else
+MOVQ kk, %rax;
+#ifdef LEFT
+ADDQ $4, %rax;
+#else
+ADDQ $4, %rax;
+#endif
+MOVQ %rax, kkk;
+#endif
+SARQ $2, k;
+JLE .L8_loopE;
+.align 32
+.L8_bodyB:
+#### Unroll times 1 ####
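+# Core update for the 4x4 tile: yvec0 holds four complex elements of A and
+# yvec1 their real/imag-swapped copy; EDUP_SY/ODUP_SY duplicate the real and
+# imaginary parts of B, with VPERMILP/SHUF producing the column-rotated copies.
+# ADD1 accumulates the a*b_real products and ADD2 the a_swapped*b_imag products;
+# the post-loop fix-up and alpha scaling combine them into full complex results.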
+LD_SY 0*SIZE(ptrba), yvec0;
+VPERMILP_SY $0xb1, yvec0, yvec1;
+EDUP_SY 0*SIZE(ptrbb), yvec2;
+VPERMILP_SY $0x4e, yvec2, yvec3;
+MUL_SY yvec0, yvec2, yvec6;
+ADD1_SY yvec6, yvec15, yvec15;
+SHUF_SY $0x03, yvec2, yvec2, yvec4;
+MUL_SY yvec0, yvec3, yvec7;
+ADD1_SY yvec7, yvec13, yvec13;
+
+ODUP_SY 0*SIZE(ptrbb), yvec2;
+SHUF_SY $0x03, yvec3, yvec3, yvec5;
+MUL_SY yvec0, yvec4, yvec6;
+ADD1_SY yvec6, yvec11, yvec11;
+VPERMILP_SY $0x4e, yvec2, yvec3;
+MUL_SY yvec0, yvec5, yvec7;
+ADD1_SY yvec7, yvec9, yvec9;
+
+MUL_SY yvec1, yvec2, yvec6;
+ADD2_SY yvec6, yvec15, yvec15;
+SHUF_SY $0x03, yvec2, yvec2, yvec4;
+MUL_SY yvec1, yvec3, yvec7;
+ADD2_SY yvec7, yvec13, yvec13;
+
+SHUF_SY $0x03, yvec3, yvec3, yvec5;
+MUL_SY yvec1, yvec4, yvec6;
+ADD2_SY yvec6, yvec11, yvec11;
+MUL_SY yvec1, yvec5, yvec7;
+ADD2_SY yvec7, yvec9, yvec9;
+
+#### Unroll time 2 ####
+LD_SY 8*SIZE(ptrba), yvec0;
+VPERMILP_SY $0xb1, yvec0, yvec1;
+EDUP_SY 8*SIZE(ptrbb), yvec2;
+VPERMILP_SY $0x4e, yvec2, yvec3;
+MUL_SY yvec0, yvec2, yvec6;
+ADD1_SY yvec6, yvec15, yvec15;
+SHUF_SY $0x03, yvec2, yvec2, yvec4;
+MUL_SY yvec0, yvec3, yvec7;
+ADD1_SY yvec7, yvec13, yvec13;
+
+ODUP_SY 8*SIZE(ptrbb), yvec2;
+SHUF_SY $0x03, yvec3, yvec3, yvec5;
+MUL_SY yvec0, yvec4, yvec6;
+ADD1_SY yvec6, yvec11, yvec11;
+VPERMILP_SY $0x4e, yvec2, yvec3;
+MUL_SY yvec0, yvec5, yvec7;
+ADD1_SY yvec7, yvec9, yvec9;
+
+MUL_SY yvec1, yvec2, yvec6;
+ADD2_SY yvec6, yvec15, yvec15;
+SHUF_SY $0x03, yvec2, yvec2, yvec4;
+MUL_SY yvec1, yvec3, yvec7;
+ADD2_SY yvec7, yvec13, yvec13;
+
+SHUF_SY $0x03, yvec3, yvec3, yvec5;
+MUL_SY yvec1, yvec4, yvec6;
+ADD2_SY yvec6, yvec11, yvec11;
+MUL_SY yvec1, yvec5, yvec7;
+ADD2_SY yvec7, yvec9, yvec9;
+
+#### Unroll time 3 ####
+LD_SY 16*SIZE(ptrba), yvec0;
+VPERMILP_SY $0xb1, yvec0, yvec1;
+EDUP_SY 16*SIZE(ptrbb), yvec2;
+VPERMILP_SY $0x4e, yvec2, yvec3;
+MUL_SY yvec0, yvec2, yvec6;
+ADD1_SY yvec6, yvec15, yvec15;
+SHUF_SY $0x03, yvec2, yvec2, yvec4;
+MUL_SY yvec0, yvec3, yvec7;
+ADD1_SY yvec7, yvec13, yvec13;
+
+ODUP_SY 16*SIZE(ptrbb), yvec2;
+SHUF_SY $0x03, yvec3, yvec3, yvec5;
+MUL_SY yvec0, yvec4, yvec6;
+ADD1_SY yvec6, yvec11, yvec11;
+VPERMILP_SY $0x4e, yvec2, yvec3;
+MUL_SY yvec0, yvec5, yvec7;
+ADD1_SY yvec7, yvec9, yvec9;
+
+MUL_SY yvec1, yvec2, yvec6;
+ADD2_SY yvec6, yvec15, yvec15;
+SHUF_SY $0x03, yvec2, yvec2, yvec4;
+MUL_SY yvec1, yvec3, yvec7;
+ADD2_SY yvec7, yvec13, yvec13;
+
+SHUF_SY $0x03, yvec3, yvec3, yvec5;
+MUL_SY yvec1, yvec4, yvec6;
+ADD2_SY yvec6, yvec11, yvec11;
+MUL_SY yvec1, yvec5, yvec7;
+ADD2_SY yvec7, yvec9, yvec9;
+
+#### Unroll time 4 ####
+LD_SY 24*SIZE(ptrba), yvec0;
+VPERMILP_SY $0xb1, yvec0, yvec1;
+EDUP_SY 24*SIZE(ptrbb), yvec2;
+VPERMILP_SY $0x4e, yvec2, yvec3;
+MUL_SY yvec0, yvec2, yvec6;
+ADD1_SY yvec6, yvec15, yvec15;
+SHUF_SY $0x03, yvec2, yvec2, yvec4;
+MUL_SY yvec0, yvec3, yvec7;
+ADD1_SY yvec7, yvec13, yvec13;
+
+ODUP_SY 24*SIZE(ptrbb), yvec2;
+SHUF_SY $0x03, yvec3, yvec3, yvec5;
+MUL_SY yvec0, yvec4, yvec6;
+ADD1_SY yvec6, yvec11, yvec11;
+VPERMILP_SY $0x4e, yvec2, yvec3;
+MUL_SY yvec0, yvec5, yvec7;
+ADD1_SY yvec7, yvec9, yvec9;
+
+MUL_SY yvec1, yvec2, yvec6;
+ADD2_SY yvec6, yvec15, yvec15;
+SHUF_SY $0x03, yvec2, yvec2, yvec4;
+MUL_SY yvec1, yvec3, yvec7;
+ADD2_SY yvec7, yvec13, yvec13;
+
+SHUF_SY $0x03, yvec3, yvec3, yvec5;
+MUL_SY yvec1, yvec4, yvec6;
+ADD2_SY yvec6, yvec11, yvec11;
+MUL_SY yvec1, yvec5, yvec7;
+ADD2_SY yvec7, yvec9, yvec9;
+ADDQ $32*SIZE, ptrba;
+ADDQ $32*SIZE, ptrbb;
+DECQ k;
+JG .L8_bodyB;
+.align 32
+.L8_loopE:
+#ifndef TRMMKERNEL
+TEST $2, bk;
+#else
+TEST $2, kkk;
+#endif
+JLE .L9_loopE;
+.align 32
+.L9_bodyB:
+#### Unroll times 1 ####
+LD_SY 0*SIZE(ptrba), yvec0;
+VPERMILP_SY $0xb1, yvec0, yvec1;
+EDUP_SY 0*SIZE(ptrbb), yvec2;
+VPERMILP_SY $0x4e, yvec2, yvec3;
+MUL_SY yvec0, yvec2, yvec6;
+ADD1_SY yvec6, yvec15, yvec15;
+SHUF_SY $0x03, yvec2, yvec2, yvec4;
+MUL_SY yvec0, yvec3, yvec7;
+ADD1_SY yvec7, yvec13, yvec13;
+
+ODUP_SY 0*SIZE(ptrbb), yvec2;
+SHUF_SY $0x03, yvec3, yvec3, yvec5;
+MUL_SY yvec0, yvec4, yvec6;
+ADD1_SY yvec6, yvec11, yvec11;
+VPERMILP_SY $0x4e, yvec2, yvec3;
+MUL_SY yvec0, yvec5, yvec7;
+ADD1_SY yvec7, yvec9, yvec9;
+
+MUL_SY yvec1, yvec2, yvec6;
+ADD2_SY yvec6, yvec15, yvec15;
+SHUF_SY $0x03, yvec2, yvec2, yvec4;
+MUL_SY yvec1, yvec3, yvec7;
+ADD2_SY yvec7, yvec13, yvec13;
+
+SHUF_SY $0x03, yvec3, yvec3, yvec5;
+MUL_SY yvec1, yvec4, yvec6;
+ADD2_SY yvec6, yvec11, yvec11;
+MUL_SY yvec1, yvec5, yvec7;
+ADD2_SY yvec7, yvec9, yvec9;
+
+#### Unroll time 2 ####
+LD_SY 8*SIZE(ptrba), yvec0;
+VPERMILP_SY $0xb1, yvec0, yvec1;
+EDUP_SY 8*SIZE(ptrbb), yvec2;
+VPERMILP_SY $0x4e, yvec2, yvec3;
+MUL_SY yvec0, yvec2, yvec6;
+ADD1_SY yvec6, yvec15, yvec15;
+SHUF_SY $0x03, yvec2, yvec2, yvec4;
+MUL_SY yvec0, yvec3, yvec7;
+ADD1_SY yvec7, yvec13, yvec13;
+
+ODUP_SY 8*SIZE(ptrbb), yvec2;
+SHUF_SY $0x03, yvec3, yvec3, yvec5;
+MUL_SY yvec0, yvec4, yvec6;
+ADD1_SY yvec6, yvec11, yvec11;
+VPERMILP_SY $0x4e, yvec2, yvec3;
+MUL_SY yvec0, yvec5, yvec7;
+ADD1_SY yvec7, yvec9, yvec9;
+
+MUL_SY yvec1, yvec2, yvec6;
+ADD2_SY yvec6, yvec15, yvec15;
+SHUF_SY $0x03, yvec2, yvec2, yvec4;
+MUL_SY yvec1, yvec3, yvec7;
+ADD2_SY yvec7, yvec13, yvec13;
+
+SHUF_SY $0x03, yvec3, yvec3, yvec5;
+MUL_SY yvec1, yvec4, yvec6;
+ADD2_SY yvec6, yvec11, yvec11;
+MUL_SY yvec1, yvec5, yvec7;
+ADD2_SY yvec7, yvec9, yvec9;
+ADDQ $16*SIZE, ptrba;
+ADDQ $16*SIZE, ptrbb;
+
+.L9_loopE:
+#ifndef TRMMKERNEL
+TEST $1, bk;
+#else
+TEST $1, kkk;
+#endif
+JLE .L10_loopE;
+.align 32
+.L10_bodyB:
+#### Unroll times 1 ####
+LD_SY 0*SIZE(ptrba), yvec0;
+VPERMILP_SY $0xb1, yvec0, yvec1;
+EDUP_SY 0*SIZE(ptrbb), yvec2;
+VPERMILP_SY $0x4e, yvec2, yvec3;
+MUL_SY yvec0, yvec2, yvec6;
+ADD1_SY yvec6, yvec15, yvec15;
+SHUF_SY $0x03, yvec2, yvec2, yvec4;
+MUL_SY yvec0, yvec3, yvec7;
+ADD1_SY yvec7, yvec13, yvec13;
+
+ODUP_SY 0*SIZE(ptrbb), yvec2;
+SHUF_SY $0x03, yvec3, yvec3, yvec5;
+MUL_SY yvec0, yvec4, yvec6;
+ADD1_SY yvec6, yvec11, yvec11;
+VPERMILP_SY $0x4e, yvec2, yvec3;
+MUL_SY yvec0, yvec5, yvec7;
+ADD1_SY yvec7, yvec9, yvec9;
+
+MUL_SY yvec1, yvec2, yvec6;
+ADD2_SY yvec6, yvec15, yvec15;
+SHUF_SY $0x03, yvec2, yvec2, yvec4;
+MUL_SY yvec1, yvec3, yvec7;
+ADD2_SY yvec7, yvec13, yvec13;
+
+SHUF_SY $0x03, yvec3, yvec3, yvec5;
+MUL_SY yvec1, yvec4, yvec6;
+ADD2_SY yvec6, yvec11, yvec11;
+MUL_SY yvec1, yvec5, yvec7;
+ADD2_SY yvec7, yvec9, yvec9;
+ADDQ $8*SIZE, ptrba;
+ADDQ $8*SIZE, ptrbb;
+
+.L10_loopE:
+#### Handle ####
+XOR_SY yvec7, yvec7, yvec7;
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ADDSUB_SY yvec15, yvec7, yvec15;
+ADDSUB_SY yvec13, yvec7, yvec13;
+ADDSUB_SY yvec11, yvec7, yvec11;
+ADDSUB_SY yvec9, yvec7, yvec9;
+#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
+SUB_SY yvec15, yvec7, yvec15;
+SUB_SY yvec13, yvec7, yvec13;
+SUB_SY yvec11, yvec7, yvec11;
+SUB_SY yvec9, yvec7, yvec9;
+#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
+VPERMILP_SY $0xb1, yvec15, yvec15;
+VPERMILP_SY $0xb1, yvec13, yvec13;
+VPERMILP_SY $0xb1, yvec11, yvec11;
+VPERMILP_SY $0xb1, yvec9, yvec9;
+ADDSUB_SY yvec15, yvec7, yvec15;
+ADDSUB_SY yvec13, yvec7, yvec13;
+ADDSUB_SY yvec11, yvec7, yvec11;
+ADDSUB_SY yvec9, yvec7, yvec9;
+VPERMILP_SY $0xb1, yvec15, yvec15;
+VPERMILP_SY $0xb1, yvec13, yvec13;
+VPERMILP_SY $0xb1, yvec11, yvec11;
+VPERMILP_SY $0xb1, yvec9, yvec9;
+#endif
+##### Load Alpha ####
+BROAD_SY MEMALPHA_R,yvec7;
+BROAD_SY MEMALPHA_I,yvec6;
+##### Multiply Alpha ####
+VPERMILP_SY $0xb1,yvec15, yvec5;
+MUL_SY yvec15, yvec7, yvec15;
+MUL_SY yvec5, yvec6, yvec5;
+ADDSUB_SY yvec5, yvec15, yvec15;
+VPERMILP_SY $0xb1,yvec13, yvec3;
+MUL_SY yvec13, yvec7, yvec13;
+MUL_SY yvec3, yvec6, yvec3;
+ADDSUB_SY yvec3, yvec13, yvec13;
+VPERMILP_SY $0xb1,yvec11, yvec1;
+MUL_SY yvec11, yvec7, yvec11;
+MUL_SY yvec1, yvec6, yvec1;
+ADDSUB_SY yvec1, yvec11, yvec11;
+VPERMILP_SY $0xb1,yvec9, yvec5;
+MUL_SY yvec9, yvec7, yvec9;
+MUL_SY yvec5, yvec6, yvec5;
+ADDSUB_SY yvec5, yvec9, yvec9;
+#### Shuffle Results ####
+MOV_SY yvec15,yvec7;
+REVS_SY $0xe4,yvec13,yvec15,yvec15;
+REVS_SY $0xe4,yvec7,yvec13,yvec13;
+MOV_SY yvec11,yvec7;
+REVS_SY $0xe4,yvec9,yvec11,yvec11;
+REVS_SY $0xe4,yvec7,yvec9,yvec9;
+#### Writing back ####
+EXTRA_SY $1, yvec15, xvec7;
+#ifndef TRMMKERNEL
+LDL_SX 0*SIZE(C0), xvec6;
+LDH_SX 2*SIZE(C0), xvec6;
+ADD_SX xvec6, xvec15;
+#endif
+STL_SX xvec15, 0*SIZE(C0);
+STH_SX xvec15, 2*SIZE(C0);
+#ifndef TRMMKERNEL
+LDL_SX 4*SIZE(C1), xvec4;
+LDH_SX 6*SIZE(C1), xvec4;
+ADD_SX xvec4, xvec7;
+#endif
+STL_SX xvec7, 4*SIZE(C1);
+STH_SX xvec7, 6*SIZE(C1);
+
+EXTRA_SY $1, yvec13, xvec5;
+#ifndef TRMMKERNEL
+LDL_SX 0*SIZE(C0, ldc, 1), xvec4;
+LDH_SX 2*SIZE(C0, ldc, 1), xvec4;
+ADD_SX xvec4, xvec13;
+#endif
+STL_SX xvec13, 0*SIZE(C0, ldc, 1);
+STH_SX xvec13, 2*SIZE(C0, ldc, 1);
+#ifndef TRMMKERNEL
+LDL_SX 4*SIZE(C1, ldc, 1), xvec2;
+LDH_SX 6*SIZE(C1, ldc, 1), xvec2;
+ADD_SX xvec2, xvec5;
+#endif
+STL_SX xvec5, 4*SIZE(C1, ldc, 1);
+STH_SX xvec5, 6*SIZE(C1, ldc, 1);
+
+EXTRA_SY $1, yvec11, xvec3;
+#ifndef TRMMKERNEL
+LDL_SX 0*SIZE(C1), xvec2;
+LDH_SX 2*SIZE(C1), xvec2;
+ADD_SX xvec2, xvec11;
+#endif
+STL_SX xvec11, 0*SIZE(C1);
+STH_SX xvec11, 2*SIZE(C1);
+#ifndef TRMMKERNEL
+LDL_SX 4*SIZE(C0), xvec0;
+LDH_SX 6*SIZE(C0), xvec0;
+ADD_SX xvec0, xvec3;
+#endif
+STL_SX xvec3, 4*SIZE(C0);
+STH_SX xvec3, 6*SIZE(C0);
+
+EXTRA_SY $1, yvec9, xvec1;
+#ifndef TRMMKERNEL
+LDL_SX 0*SIZE(C1, ldc, 1), xvec0;
+LDH_SX 2*SIZE(C1, ldc, 1), xvec0;
+ADD_SX xvec0, xvec9;
+#endif
+STL_SX xvec9, 0*SIZE(C1, ldc, 1);
+STH_SX xvec9, 2*SIZE(C1, ldc, 1);
+#ifndef TRMMKERNEL
+LDL_SX 4*SIZE(C0, ldc, 1), xvec6;
+LDH_SX 6*SIZE(C0, ldc, 1), xvec6;
+ADD_SX xvec6, xvec1;
+#endif
+STL_SX xvec1, 4*SIZE(C0, ldc, 1);
+STH_SX xvec1, 6*SIZE(C0, ldc, 1);
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+SALQ $ZBASE_SHIFT, %rax;
+LEAQ (ptrba, %rax, 4), ptrba;
+LEAQ (ptrbb, %rax, 4), ptrbb;
+#endif
+#if defined(TRMMKERNEL) && defined(LEFT)
+ADDQ $4, kk;
+#endif
+
+ADDQ $8*SIZE, C0;
+ADDQ $8*SIZE, C1;
+
+.L5_loopE:
+TEST $2, bm;
+JLE .L6_loopE;
+.align 32
+.L6_bodyB:
+#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bb,ptrbb;
+#else
+MOVQ bb, ptrbb;
+MOVQ kk, %rax;
+SALQ $ZBASE_SHIFT, %rax;
+LEAQ (ptrba, %rax, 2), ptrba;
+LEAQ (ptrbb, %rax, 4), ptrbb;
+#endif
+#### Initialize Results Registers ####
+XOR_SY yvec15, yvec15, yvec15;
+XOR_SY yvec14, yvec14, yvec14;
+XOR_SY yvec13, yvec13, yvec13;
+XOR_SY yvec12, yvec12, yvec12;
+#ifndef TRMMKERNEL
+MOVQ bk,k;
+#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kk, %rax;
+MOVQ %rax, kkk;
+#else
+MOVQ kk, %rax;
+#ifdef LEFT
+ADDQ $2, %rax;
+#else
+ADDQ $4, %rax;
+#endif
+MOVQ %rax, kkk;
+#endif
+SARQ $2, k;
+JLE .L11_loopE;
+.align 32
+.L11_bodyB:
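+# M&2 tile: each xvec0 carries two complex elements of A (see the inline
+# ar/ai comments); the four columns of B are consumed as two EDUP/ODUP pairs
+# (offsets 0 and 4), accumulating into xvec15/xvec14 and xvec13/xvec12.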
+LD_SX 0*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2
+EDUP_SX 0*SIZE(ptrbb), xvec2; # br1, br1, br2, br2
+SHUF_SX $0x4e, xvec2, xvec3; # br3, br3, br4, br4
+MUL_SX xvec0, xvec2;
+ADD1_SX xvec2, xvec15;
+MUL_SX xvec0, xvec3;
+ADD1_SX xvec3, xvec14;
+
+EDUP_SX 4*SIZE(ptrbb), xvec4;
+SHUF_SX $0x4e, xvec4, xvec5;
+MUL_SX xvec0, xvec4;
+ADD1_SX xvec4, xvec13;
+MUL_SX xvec0, xvec5;
+ADD1_SX xvec5, xvec12;
+
+SHUF_SX $0xb1, xvec0, xvec1;
+ODUP_SX 0*SIZE(ptrbb), xvec2;
+SHUF_SX $0x4e, xvec2, xvec3;
+MUL_SX xvec1, xvec2;
+ADD2_SX xvec2, xvec15;
+MUL_SX xvec1, xvec3;
+ADD2_SX xvec3, xvec14;
+
+ODUP_SX 4*SIZE(ptrbb), xvec4;
+SHUF_SX $0x4e, xvec4, xvec5;
+MUL_SX xvec1, xvec4;
+ADD2_SX xvec4, xvec13;
+MUL_SX xvec1, xvec5;
+ADD2_SX xvec5, xvec12;
+
+LD_SX 4*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2
+EDUP_SX 8*SIZE(ptrbb), xvec2; # br1, br1, br2, br2
+SHUF_SX $0x4e, xvec2, xvec3; # br3, br3, br4, br4
+MUL_SX xvec0, xvec2;
+ADD1_SX xvec2, xvec15;
+MUL_SX xvec0, xvec3;
+ADD1_SX xvec3, xvec14;
+
+EDUP_SX 12*SIZE(ptrbb), xvec4;
+SHUF_SX $0x4e, xvec4, xvec5;
+MUL_SX xvec0, xvec4;
+ADD1_SX xvec4, xvec13;
+MUL_SX xvec0, xvec5;
+ADD1_SX xvec5, xvec12;
+
+SHUF_SX $0xb1, xvec0, xvec1;
+ODUP_SX 8*SIZE(ptrbb), xvec2;
+SHUF_SX $0x4e, xvec2, xvec3;
+MUL_SX xvec1, xvec2;
+ADD2_SX xvec2, xvec15;
+MUL_SX xvec1, xvec3;
+ADD2_SX xvec3, xvec14;
+
+ODUP_SX 12*SIZE(ptrbb), xvec4;
+SHUF_SX $0x4e, xvec4, xvec5;
+MUL_SX xvec1, xvec4;
+ADD2_SX xvec4, xvec13;
+MUL_SX xvec1, xvec5;
+ADD2_SX xvec5, xvec12;
+
+LD_SX 8*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2
+EDUP_SX 16*SIZE(ptrbb), xvec2; # br1, br1, br2, br2
+SHUF_SX $0x4e, xvec2, xvec3; # br3, br3, br4, br4
+MUL_SX xvec0, xvec2;
+ADD1_SX xvec2, xvec15;
+MUL_SX xvec0, xvec3;
+ADD1_SX xvec3, xvec14;
+
+EDUP_SX 20*SIZE(ptrbb), xvec4;
+SHUF_SX $0x4e, xvec4, xvec5;
+MUL_SX xvec0, xvec4;
+ADD1_SX xvec4, xvec13;
+MUL_SX xvec0, xvec5;
+ADD1_SX xvec5, xvec12;
+
+SHUF_SX $0xb1, xvec0, xvec1;
+ODUP_SX 16*SIZE(ptrbb), xvec2;
+SHUF_SX $0x4e, xvec2, xvec3;
+MUL_SX xvec1, xvec2;
+ADD2_SX xvec2, xvec15;
+MUL_SX xvec1, xvec3;
+ADD2_SX xvec3, xvec14;
+
+ODUP_SX 20*SIZE(ptrbb), xvec4;
+SHUF_SX $0x4e, xvec4, xvec5;
+MUL_SX xvec1, xvec4;
+ADD2_SX xvec4, xvec13;
+MUL_SX xvec1, xvec5;
+ADD2_SX xvec5, xvec12;
+
+LD_SX 12*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2
+EDUP_SX 24*SIZE(ptrbb), xvec2; # br1, br1, br2, br2
+SHUF_SX $0x4e, xvec2, xvec3; # br3, br3, br4, br4
+MUL_SX xvec0, xvec2;
+ADD1_SX xvec2, xvec15;
+MUL_SX xvec0, xvec3;
+ADD1_SX xvec3, xvec14;
+
+EDUP_SX 28*SIZE(ptrbb), xvec4;
+SHUF_SX $0x4e, xvec4, xvec5;
+MUL_SX xvec0, xvec4;
+ADD1_SX xvec4, xvec13;
+MUL_SX xvec0, xvec5;
+ADD1_SX xvec5, xvec12;
+
+SHUF_SX $0xb1, xvec0, xvec1;
+ODUP_SX 24*SIZE(ptrbb), xvec2;
+SHUF_SX $0x4e, xvec2, xvec3;
+MUL_SX xvec1, xvec2;
+ADD2_SX xvec2, xvec15;
+MUL_SX xvec1, xvec3;
+ADD2_SX xvec3, xvec14;
+
+ODUP_SX 28*SIZE(ptrbb), xvec4;
+SHUF_SX $0x4e, xvec4, xvec5;
+MUL_SX xvec1, xvec4;
+ADD2_SX xvec4, xvec13;
+MUL_SX xvec1, xvec5;
+ADD2_SX xvec5, xvec12;
+ADDQ $16*SIZE, ptrba;
+ADDQ $32*SIZE, ptrbb;
+DECQ k;
+JG .L11_bodyB;
+.align 32
+.L11_loopE:
+#ifndef TRMMKERNEL
+TEST $2, bk;
+#else
+TEST $2, kkk;
+#endif
+JLE .L12_loopE;
+.align 32
+.L12_bodyB:
+LD_SX 0*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2
+EDUP_SX 0*SIZE(ptrbb), xvec2; # br1, br1, br2, br2
+SHUF_SX $0x4e, xvec2, xvec3; # br3, br3, br4, br4
+MUL_SX xvec0, xvec2;
+ADD1_SX xvec2, xvec15;
+MUL_SX xvec0, xvec3;
+ADD1_SX xvec3, xvec14;
+
+EDUP_SX 4*SIZE(ptrbb), xvec4;
+SHUF_SX $0x4e, xvec4, xvec5;
+MUL_SX xvec0, xvec4;
+ADD1_SX xvec4, xvec13;
+MUL_SX xvec0, xvec5;
+ADD1_SX xvec5, xvec12;
+
+SHUF_SX $0xb1, xvec0, xvec1;
+ODUP_SX 0*SIZE(ptrbb), xvec2;
+SHUF_SX $0x4e, xvec2, xvec3;
+MUL_SX xvec1, xvec2;
+ADD2_SX xvec2, xvec15;
+MUL_SX xvec1, xvec3;
+ADD2_SX xvec3, xvec14;
+
+ODUP_SX 4*SIZE(ptrbb), xvec4;
+SHUF_SX $0x4e, xvec4, xvec5;
+MUL_SX xvec1, xvec4;
+ADD2_SX xvec4, xvec13;
+MUL_SX xvec1, xvec5;
+ADD2_SX xvec5, xvec12;
+
+LD_SX 4*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2
+EDUP_SX 8*SIZE(ptrbb), xvec2; # br1, br1, br2, br2
+SHUF_SX $0x4e, xvec2, xvec3; # br3, br3, br4, br4
+MUL_SX xvec0, xvec2;
+ADD1_SX xvec2, xvec15;
+MUL_SX xvec0, xvec3;
+ADD1_SX xvec3, xvec14;
+
+EDUP_SX 12*SIZE(ptrbb), xvec4;
+SHUF_SX $0x4e, xvec4, xvec5;
+MUL_SX xvec0, xvec4;
+ADD1_SX xvec4, xvec13;
+MUL_SX xvec0, xvec5;
+ADD1_SX xvec5, xvec12;
+
+SHUF_SX $0xb1, xvec0, xvec1;
+ODUP_SX 8*SIZE(ptrbb), xvec2;
+SHUF_SX $0x4e, xvec2, xvec3;
+MUL_SX xvec1, xvec2;
+ADD2_SX xvec2, xvec15;
+MUL_SX xvec1, xvec3;
+ADD2_SX xvec3, xvec14;
+
+ODUP_SX 12*SIZE(ptrbb), xvec4;
+SHUF_SX $0x4e, xvec4, xvec5;
+MUL_SX xvec1, xvec4;
+ADD2_SX xvec4, xvec13;
+MUL_SX xvec1, xvec5;
+ADD2_SX xvec5, xvec12;
+ADDQ $8*SIZE, ptrba;
+ADDQ $16*SIZE, ptrbb;
+
+.L12_loopE:
+#ifndef TRMMKERNEL
+TEST $1, bk;
+#else
+TEST $1, kkk;
+#endif
+JLE .L13_loopE;
+.align 32
+.L13_bodyB:
+LD_SX 0*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2
+EDUP_SX 0*SIZE(ptrbb), xvec2; # br1, br1, br2, br2
+SHUF_SX $0x4e, xvec2, xvec3; # br3, br3, br4, br4
+MUL_SX xvec0, xvec2;
+ADD1_SX xvec2, xvec15;
+MUL_SX xvec0, xvec3;
+ADD1_SX xvec3, xvec14;
+
+EDUP_SX 4*SIZE(ptrbb), xvec4;
+SHUF_SX $0x4e, xvec4, xvec5;
+MUL_SX xvec0, xvec4;
+ADD1_SX xvec4, xvec13;
+MUL_SX xvec0, xvec5;
+ADD1_SX xvec5, xvec12;
+
+SHUF_SX $0xb1, xvec0, xvec1;
+ODUP_SX 0*SIZE(ptrbb), xvec2;
+SHUF_SX $0x4e, xvec2, xvec3;
+MUL_SX xvec1, xvec2;
+ADD2_SX xvec2, xvec15;
+MUL_SX xvec1, xvec3;
+ADD2_SX xvec3, xvec14;
+
+ODUP_SX 4*SIZE(ptrbb), xvec4;
+SHUF_SX $0x4e, xvec4, xvec5;
+MUL_SX xvec1, xvec4;
+ADD2_SX xvec4, xvec13;
+MUL_SX xvec1, xvec5;
+ADD2_SX xvec5, xvec12;
+ADDQ $4*SIZE, ptrba;
+ADDQ $8*SIZE, ptrbb;
+
+.L13_loopE:
+#### Handle ####
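+# Same conjugation fix-up as the AVX path above, done through the zeroed xvec7
+# scratch register because the two-operand SSE macros overwrite their
+# destination.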
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+XOR_SY yvec7, yvec7, yvec7;
+ADDSUB_SX xvec15, xvec7;
+MOV_SX xvec7, xvec15;
+XOR_SY yvec7, yvec7, yvec7;
+ADDSUB_SX xvec14, xvec7;
+MOV_SX xvec7, xvec14;
+XOR_SY yvec7, yvec7, yvec7;
+ADDSUB_SX xvec13, xvec7;
+MOV_SX xvec7, xvec13;
+XOR_SY yvec7, yvec7, yvec7;
+ADDSUB_SX xvec12, xvec7;
+MOV_SX xvec7, xvec12;
+#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
+XOR_SY yvec7, yvec7, yvec7;
+SUB_SX xvec15, xvec7;
+MOV_SX xvec7, xvec15;
+XOR_SY yvec7, yvec7, yvec7;
+SUB_SX xvec14, xvec7;
+MOV_SX xvec7, xvec14;
+XOR_SY yvec7, yvec7, yvec7;
+SUB_SX xvec13, xvec7;
+MOV_SX xvec7, xvec13;
+XOR_SY yvec7, yvec7, yvec7;
+SUB_SX xvec12, xvec7;
+MOV_SX xvec7, xvec12;
+#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
+SHUF_SX $0xb1, xvec15, xvec15;
+SHUF_SX $0xb1, xvec14, xvec14;
+SHUF_SX $0xb1, xvec13, xvec13;
+SHUF_SX $0xb1, xvec12, xvec12;
+XOR_SY yvec7, yvec7, yvec7;
+ADDSUB_SX xvec15, xvec7;
+MOV_SX xvec7, xvec15;
+XOR_SY yvec7, yvec7, yvec7;
+ADDSUB_SX xvec14, xvec7;
+MOV_SX xvec7, xvec14;
+XOR_SY yvec7, yvec7, yvec7;
+ADDSUB_SX xvec13, xvec7;
+MOV_SX xvec7, xvec13;
+XOR_SY yvec7, yvec7, yvec7;
+ADDSUB_SX xvec12, xvec7;
+MOV_SX xvec7, xvec12;
+SHUF_SX $0xb1, xvec15, xvec15;
+SHUF_SX $0xb1, xvec14, xvec14;
+SHUF_SX $0xb1, xvec13, xvec13;
+SHUF_SX $0xb1, xvec12, xvec12;
+#endif
+##### Load Alpha ####
+BROAD_SX MEMALPHA_R,xvec7;
+BROAD_SX MEMALPHA_I,xvec6;
+##### Multiply Alpha ####
+VPERMILP_SX $0xb1,xvec15, xvec5;
+MUL_SX xvec7, xvec15;
+MUL_SX xvec6, xvec5;
+ADDSUB_SX xvec5, xvec15;
+VPERMILP_SX $0xb1,xvec14, xvec4;
+MUL_SX xvec7, xvec14;
+MUL_SX xvec6, xvec4;
+ADDSUB_SX xvec4, xvec14;
+VPERMILP_SX $0xb1,xvec13, xvec3;
+MUL_SX xvec7, xvec13;
+MUL_SX xvec6, xvec3;
+ADDSUB_SX xvec3, xvec13;
+VPERMILP_SX $0xb1,xvec12, xvec2;
+MUL_SX xvec7, xvec12;
+MUL_SX xvec6, xvec2;
+ADDSUB_SX xvec2, xvec12;
+#### Writing back ####
+#ifndef TRMMKERNEL
+LDL_SX 0*SIZE(C0), xvec0;
+LDH_SX 2*SIZE(C0, ldc, 1), xvec0;
+LDL_SX 0*SIZE(C0, ldc, 1), xvec1;
+LDH_SX 2*SIZE(C0), xvec1;
+LDL_SX 0*SIZE(C1), xvec2;
+LDH_SX 2*SIZE(C1, ldc, 1), xvec2;
+LDL_SX 0*SIZE(C1, ldc, 1), xvec3;
+LDH_SX 2*SIZE(C1), xvec3;
+ADD_SX xvec0, xvec15;
+ADD_SX xvec1, xvec14;
+ADD_SX xvec2, xvec13;
+ADD_SX xvec3, xvec12;
+#endif
+STL_SX xvec15, 0*SIZE(C0);
+STH_SX xvec15, 2*SIZE(C0, ldc, 1);
+STL_SX xvec14, 0*SIZE(C0, ldc, 1);
+STH_SX xvec14, 2*SIZE(C0);
+STL_SX xvec13, 0*SIZE(C1);
+STH_SX xvec13, 2*SIZE(C1, ldc, 1);
+STL_SX xvec12, 0*SIZE(C1, ldc, 1);
+STH_SX xvec12, 2*SIZE(C1);
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+SALQ $ZBASE_SHIFT, %rax;
+LEAQ (ptrba, %rax, 2), ptrba;
+LEAQ (ptrbb, %rax, 4), ptrbb;
+#endif
+#if defined(TRMMKERNEL) && defined(LEFT)
+ADDQ $2, kk;
+#endif
+
+ADDQ $4*SIZE, C0;
+ADDQ $4*SIZE, C1;
+
+.L6_loopE:
+TEST $1, bm;
+JLE .L7_loopE;
+.align 32
+.L7_bodyB:
+#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bb,ptrbb;
+#else
+MOVQ bb, ptrbb;
+MOVQ kk, %rax;
+SALQ $ZBASE_SHIFT, %rax;
+ADDQ %rax, ptrba;
+LEAQ (ptrbb, %rax, 4), ptrbb;
+#endif
+XOR_SY yvec15, yvec15, yvec15;
+XOR_SY yvec14, yvec14, yvec14;
+#ifndef TRMMKERNEL
+MOVQ bk,k;
+#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kk, %rax;
+MOVQ %rax, kkk;
+#else
+MOVQ kk, %rax;
+#ifdef LEFT
+ADDQ $1, %rax;
+#else
+ADDQ $4, %rax;
+#endif
+MOVQ %rax, kkk;
+#endif
+SARQ $2, k;
+JLE .L14_loopE;
+.align 32
+.L14_bodyB:
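+# M&1 tile: BROAD_SX splats the real part of the single A element and
+# multiplies it against four B values (two columns per xmm); the splatted
+# imaginary part is then multiplied against the pair-swapped B for the cross
+# terms.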
+BROAD_SX 0*SIZE(ptrba), xvec0;
+LD_SX 0*SIZE(ptrbb), xvec2;
+SHUF_SX $0xb1, xvec2, xvec3;
+MUL_SX xvec0, xvec2;
+ADD1_SX xvec2, xvec15;
+
+LD_SX 4*SIZE(ptrbb), xvec4;
+SHUF_SX $0xb1, xvec4, xvec5;
+MUL_SX xvec0, xvec4;
+ADD1_SX xvec4, xvec14;
+
+BROAD_SX 1*SIZE(ptrba), xvec1;
+MUL_SX xvec1, xvec3;
+ADD2_SX xvec3, xvec15;
+MUL_SX xvec1, xvec5;
+ADD2_SX xvec5, xvec14;
+
+BROAD_SX 2*SIZE(ptrba), xvec0;
+LD_SX 8*SIZE(ptrbb), xvec2;
+SHUF_SX $0xb1, xvec2, xvec3;
+MUL_SX xvec0, xvec2;
+ADD1_SX xvec2, xvec15;
+
+LD_SX 12*SIZE(ptrbb), xvec4;
+SHUF_SX $0xb1, xvec4, xvec5;
+MUL_SX xvec0, xvec4;
+ADD1_SX xvec4, xvec14;
+
+BROAD_SX 3*SIZE(ptrba), xvec1;
+MUL_SX xvec1, xvec3;
+ADD2_SX xvec3, xvec15;
+MUL_SX xvec1, xvec5;
+ADD2_SX xvec5, xvec14;
+
+BROAD_SX 4*SIZE(ptrba), xvec0;
+LD_SX 16*SIZE(ptrbb), xvec2;
+SHUF_SX $0xb1, xvec2, xvec3;
+MUL_SX xvec0, xvec2;
+ADD1_SX xvec2, xvec15;
+
+LD_SX 20*SIZE(ptrbb), xvec4;
+SHUF_SX $0xb1, xvec4, xvec5;
+MUL_SX xvec0, xvec4;
+ADD1_SX xvec4, xvec14;
+
+BROAD_SX 5*SIZE(ptrba), xvec1;
+MUL_SX xvec1, xvec3;
+ADD2_SX xvec3, xvec15;
+MUL_SX xvec1, xvec5;
+ADD2_SX xvec5, xvec14;
+
+BROAD_SX 6*SIZE(ptrba), xvec0;
+LD_SX 24*SIZE(ptrbb), xvec2;
+SHUF_SX $0xb1, xvec2, xvec3;
+MUL_SX xvec0, xvec2;
+ADD1_SX xvec2, xvec15;
+
+LD_SX 28*SIZE(ptrbb), xvec4;
+SHUF_SX $0xb1, xvec4, xvec5;
+MUL_SX xvec0, xvec4;
+ADD1_SX xvec4, xvec14;
+
+BROAD_SX 7*SIZE(ptrba), xvec1;
+MUL_SX xvec1, xvec3;
+ADD2_SX xvec3, xvec15;
+MUL_SX xvec1, xvec5;
+ADD2_SX xvec5, xvec14;
+ADDQ $8*SIZE, ptrba;
+ADDQ $32*SIZE, ptrbb;
+DECQ k;
+JG .L14_bodyB;
+.align 32
+.L14_loopE:
+#ifndef TRMMKERNEL
+TEST $2, bk;
+#else
+TEST $2, kkk;
+#endif
+JLE .L15_loopE;
+.align 32
+.L15_bodyB:
+BROAD_SX 0*SIZE(ptrba), xvec0;
+LD_SX 0*SIZE(ptrbb), xvec2;
+SHUF_SX $0xb1, xvec2, xvec3;
+MUL_SX xvec0, xvec2;
+ADD1_SX xvec2, xvec15;
+
+LD_SX 4*SIZE(ptrbb), xvec4;
+SHUF_SX $0xb1, xvec4, xvec5;
+MUL_SX xvec0, xvec4;
+ADD1_SX xvec4, xvec14;
+
+BROAD_SX 1*SIZE(ptrba), xvec1;
+MUL_SX xvec1, xvec3;
+ADD2_SX xvec3, xvec15;
+MUL_SX xvec1, xvec5;
+ADD2_SX xvec5, xvec14;
+
+BROAD_SX 2*SIZE(ptrba), xvec0;
+LD_SX 8*SIZE(ptrbb), xvec2;
+SHUF_SX $0xb1, xvec2, xvec3;
+MUL_SX xvec0, xvec2;
+ADD1_SX xvec2, xvec15;
+
+LD_SX 12*SIZE(ptrbb), xvec4;
+SHUF_SX $0xb1, xvec4, xvec5;
+MUL_SX xvec0, xvec4;
+ADD1_SX xvec4, xvec14;
+
+BROAD_SX 3*SIZE(ptrba), xvec1;
+MUL_SX xvec1, xvec3;
+ADD2_SX xvec3, xvec15;
+MUL_SX xvec1, xvec5;
+ADD2_SX xvec5, xvec14;
+ADDQ $4*SIZE, ptrba;
+ADDQ $16*SIZE, ptrbb;
+
+.L15_loopE:
+#ifndef TRMMKERNEL
+TEST $1, bk;
+#else
+TEST $1, kkk;
+#endif
+JLE .L16_loopE;
+.align 32
+.L16_bodyB:
+BROAD_SX 0*SIZE(ptrba), xvec0;
+LD_SX 0*SIZE(ptrbb), xvec2;
+SHUF_SX $0xb1, xvec2, xvec3;
+MUL_SX xvec0, xvec2;
+ADD1_SX xvec2, xvec15;
+
+LD_SX 4*SIZE(ptrbb), xvec4;
+SHUF_SX $0xb1, xvec4, xvec5;
+MUL_SX xvec0, xvec4;
+ADD1_SX xvec4, xvec14;
+
+BROAD_SX 1*SIZE(ptrba), xvec1;
+MUL_SX xvec1, xvec3;
+ADD2_SX xvec3, xvec15;
+MUL_SX xvec1, xvec5;
+ADD2_SX xvec5, xvec14;
+ADDQ $2*SIZE, ptrba;
+ADDQ $8*SIZE, ptrbb;
+
+.L16_loopE:
+#### Handle ####
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+XOR_SY yvec7, yvec7, yvec7;
+ADDSUB_SX xvec15, xvec7;
+MOV_SX xvec7, xvec15;
+XOR_SY yvec7, yvec7, yvec7;
+ADDSUB_SX xvec14, xvec7;
+MOV_SX xvec7, xvec14;
+#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
+XOR_SY yvec7, yvec7, yvec7;
+SUB_SX xvec15, xvec7;
+MOV_SX xvec7, xvec15;
+XOR_SY yvec7, yvec7, yvec7;
+SUB_SX xvec14, xvec7;
+MOV_SX xvec7, xvec14;
+#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
+SHUF_SX $0xb1, xvec15, xvec15;
+SHUF_SX $0xb1, xvec14, xvec14;
+XOR_SY yvec7, yvec7, yvec7;
+ADDSUB_SX xvec15, xvec7;
+MOV_SX xvec7, xvec15;
+XOR_SY yvec7, yvec7, yvec7;
+ADDSUB_SX xvec14, xvec7;
+MOV_SX xvec7, xvec14;
+SHUF_SX $0xb1, xvec15, xvec15;
+SHUF_SX $0xb1, xvec14, xvec14;
+#endif
+##### Load Alpha ####
+BROAD_SX MEMALPHA_R,xvec7;
+BROAD_SX MEMALPHA_I,xvec6;
+##### Multiply Alpha ####
+VPERMILP_SX $0xb1,xvec15, xvec5;
+MUL_SX xvec7, xvec15;
+MUL_SX xvec6, xvec5;
+ADDSUB_SX xvec5, xvec15;
+VPERMILP_SX $0xb1,xvec14, xvec4;
+MUL_SX xvec7, xvec14;
+MUL_SX xvec6, xvec4;
+ADDSUB_SX xvec4, xvec14;
+#### Writing back ####
+#ifndef TRMMKERNEL
+LDL_SX 0*SIZE(C0), xvec0;
+LDH_SX 0*SIZE(C0, ldc, 1), xvec0;
+LDL_SX 0*SIZE(C1), xvec1;
+LDH_SX 0*SIZE(C1, ldc, 1), xvec1;
+ADD_SX xvec0, xvec15;
+ADD_SX xvec1, xvec14;
+#endif
+STL_SX xvec15, 0*SIZE(C0);
+STH_SX xvec15, 0*SIZE(C0, ldc, 1);
+STL_SX xvec14, 0*SIZE(C1);
+STH_SX xvec14, 0*SIZE(C1, ldc, 1);
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+SALQ $ZBASE_SHIFT, %rax;
+ADDQ %rax, ptrba;
+LEAQ (ptrbb, %rax, 4), ptrbb;
+#endif
+#if defined(TRMMKERNEL) && defined(LEFT)
+ADDQ $1, kk;
+#endif
+
+ADDQ $2*SIZE, C0;
+ADDQ $2*SIZE, C1;
+.L7_loopE:
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ADDQ $4, kk;
+#endif
+MOVQ bk,k;
+SALQ $5,k;
+ADDQ k,bb;
+LEAQ (C,ldc,4),C;
+.L0_bodyE:;
+DECQ j;
+JG .L0_bodyB;
+.align 32;
+.L0_loopE:;
+TEST $2, bn;
+JLE .L20_loopE;
+.align 32
+.L20_bodyB:
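+# Remaining two columns of B (bn & 2): the same tiling cascade as above, now
+# with progressively smaller row tiles against a 2-column panel.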
+#if defined(TRMMKERNEL) && defined(LEFT)
+MOVQ OFFSET, %rax;
+MOVQ %rax, kk;
+#endif
+MOVQ C, C0;
+LEAQ (C, ldc, 1), C1;
+MOVQ ba, ptrba;
+MOVQ bm, i;
+SARQ $3, i;
+JLE .L21_loopE;
+.align 32
+.L21_bodyB:
+#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bb,ptrbb;
+#else
+MOVQ bb, ptrbb;
+MOVQ kk, %rax;
+SALQ $ZBASE_SHIFT, %rax;
+LEAQ (ptrba, %rax, 8), ptrba;
+LEAQ (ptrbb, %rax, 2), ptrbb;
+#endif
+XOR_SY yvec15, yvec15, yvec15;
+XOR_SY yvec14, yvec14, yvec14;
+XOR_SY yvec13, yvec13, yvec13;
+XOR_SY yvec12, yvec12, yvec12;
+XOR_SY yvec11, yvec11, yvec11;
+XOR_SY yvec10, yvec10, yvec10;
+XOR_SY yvec9, yvec9, yvec9;
+XOR_SY yvec8, yvec8, yvec8;
+#ifndef TRMMKERNEL
+MOVQ bk,k;
+#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kk, %rax;
+MOVQ %rax, kkk;
+#else
+MOVQ kk, %rax;
+#ifdef LEFT
+ADDQ $8, %rax;
+#else
+ADDQ $2, %rax;
+#endif
+MOVQ %rax, kkk;
+#endif
+SARQ $2, k;
+JLE .L211_loopE;
+.align 32
+.L211_bodyB:
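+# 8x2 inner loop: xvec4/xvec5 hold the duplicated real/imaginary parts of the
+# two B values for this K step and xvec6/xvec7 their column-swapped copies;
+# each 4-float slice of A (two complex elements) is multiplied against all
+# four, accumulating into the pairs 15/11, 14/10, 13/9 and 12/8.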
+EDUP_SX 0*SIZE(ptrbb), xvec4;
+ODUP_SX 0*SIZE(ptrbb), xvec5;
+SHUF_SX $0x4e, xvec4, xvec6;
+SHUF_SX $0x4e, xvec5, xvec7;
+
+LD_SX 0*SIZE(ptrba), xvec0;
+MOV_SX xvec0, xvec1;
+MUL_SX xvec4, xvec0;
+ADD1_SX xvec0, xvec15;
+SHUF_SX $0xb1, xvec1, xvec2;
+MUL_SX xvec6, xvec1;
+ADD1_SX xvec1, xvec11;
+
+MOV_SX xvec2, xvec3;
+MUL_SX xvec5, xvec2;
+ADD2_SX xvec2, xvec15;
+MUL_SX xvec7, xvec3;
+ADD2_SX xvec3, xvec11;
+
+LD_SX 4*SIZE(ptrba), xvec0;
+MOV_SX xvec0, xvec1;
+MUL_SX xvec4, xvec0;
+ADD1_SX xvec0, xvec14;
+SHUF_SX $0xb1, xvec1, xvec2;
+MUL_SX xvec6, xvec1;
+ADD1_SX xvec1, xvec10;
+
+MOV_SX xvec2, xvec3;
+MUL_SX xvec5, xvec2;
+ADD2_SX xvec2, xvec14;
+MUL_SX xvec7, xvec3;
+ADD2_SX xvec3, xvec10;
+
+LD_SX 8*SIZE(ptrba), xvec0;
+MOV_SX xvec0, xvec1;
+MUL_SX xvec4, xvec0;
+ADD1_SX xvec0, xvec13;
+SHUF_SX $0xb1, xvec1, xvec2;
+MUL_SX xvec6, xvec1;
+ADD1_SX xvec1, xvec9;
+
+MOV_SX xvec2, xvec3;
+MUL_SX xvec5, xvec2;
+ADD2_SX xvec2, xvec13;
+MUL_SX xvec7, xvec3;
+ADD2_SX xvec3, xvec9;
+
+LD_SX 12*SIZE(ptrba), xvec0;
+MOV_SX xvec0, xvec1;
+MUL_SX xvec4, xvec0;
+ADD1_SX xvec0, xvec12;
+SHUF_SX $0xb1, xvec1, xvec2;
+MUL_SX xvec6, xvec1;
+ADD1_SX xvec1, xvec8;
+
+MOV_SX xvec2, xvec3;
+MUL_SX xvec5, xvec2;
+ADD2_SX xvec2, xvec12;
+MUL_SX xvec7, xvec3;
+ADD2_SX xvec3, xvec8;
+
+EDUP_SX 4*SIZE(ptrbb), xvec4;
+ODUP_SX 4*SIZE(ptrbb), xvec5;
+SHUF_SX $0x4e, xvec4, xvec6;
+SHUF_SX $0x4e, xvec5, xvec7;
+
+LD_SX 16*SIZE(ptrba), xvec0;
+MOV_SX xvec0, xvec1;
+MUL_SX xvec4, xvec0;
+ADD1_SX xvec0, xvec15;
+SHUF_SX $0xb1, xvec1, xvec2;
+MUL_SX xvec6, xvec1;
+ADD1_SX xvec1, xvec11;
+
+MOV_SX xvec2, xvec3;
+MUL_SX xvec5, xvec2;
+ADD2_SX xvec2, xvec15;
+MUL_SX xvec7, xvec3;
+ADD2_SX xvec3, xvec11;
+
+LD_SX 20*SIZE(ptrba), xvec0;
+MOV_SX xvec0, xvec1;
+MUL_SX xvec4, xvec0;
+ADD1_SX xvec0, xvec14;
+SHUF_SX $0xb1, xvec1, xvec2;
+MUL_SX xvec6, xvec1;
+ADD1_SX xvec1, xvec10;
+
+MOV_SX xvec2, xvec3;
+MUL_SX xvec5, xvec2;
+ADD2_SX xvec2, xvec14;
+MUL_SX xvec7, xvec3;
+ADD2_SX xvec3, xvec10;
+
+LD_SX 24*SIZE(ptrba), xvec0;
+MOV_SX xvec0, xvec1;
+MUL_SX xvec4, xvec0;
+ADD1_SX xvec0, xvec13;
+SHUF_SX $0xb1, xvec1, xvec2;
+MUL_SX xvec6, xvec1;
+ADD1_SX xvec1, xvec9;
+
+MOV_SX xvec2, xvec3;
+MUL_SX xvec5, xvec2;
+ADD2_SX xvec2, xvec13;
+MUL_SX xvec7, xvec3;
+ADD2_SX xvec3, xvec9;
+
+LD_SX 28*SIZE(ptrba), xvec0;
+MOV_SX xvec0, xvec1;
+MUL_SX xvec4, xvec0;
+ADD1_SX xvec0, xvec12;
+SHUF_SX $0xb1, xvec1, xvec2;
+MUL_SX xvec6, xvec1;
+ADD1_SX xvec1, xvec8;
+
+MOV_SX xvec2, xvec3;
+MUL_SX xvec5, xvec2;
+ADD2_SX xvec2, xvec12;
+MUL_SX xvec7, xvec3;
+ADD2_SX xvec3, xvec8;
+
+EDUP_SX 8*SIZE(ptrbb), xvec4;
+ODUP_SX 8*SIZE(ptrbb), xvec5;
+SHUF_SX $0x4e, xvec4, xvec6;
+SHUF_SX $0x4e, xvec5, xvec7;
+
+LD_SX 32*SIZE(ptrba), xvec0;
+MOV_SX xvec0, xvec1;
+MUL_SX xvec4, xvec0;
+ADD1_SX xvec0, xvec15;
+SHUF_SX $0xb1, xvec1, xvec2;
+MUL_SX xvec6, xvec1;
+ADD1_SX xvec1, xvec11;
+
+MOV_SX xvec2, xvec3;
+MUL_SX xvec5, xvec2;
+ADD2_SX xvec2, xvec15;
+MUL_SX xvec7, xvec3;
+ADD2_SX xvec3, xvec11;
+
+LD_SX 36*SIZE(ptrba), xvec0;
+MOV_SX xvec0, xvec1;
+MUL_SX xvec4, xvec0;
+ADD1_SX xvec0, xvec14;
+SHUF_SX $0xb1, xvec1, xvec2;
+MUL_SX xvec6, xvec1;
+ADD1_SX xvec1, xvec10;
+
+MOV_SX xvec2, xvec3;
+MUL_SX xvec5, xvec2;
+ADD2_SX xvec2, xvec14;
+MUL_SX xvec7, xvec3;
+ADD2_SX xvec3, xvec10;
+
+LD_SX 40*SIZE(ptrba), xvec0;
+MOV_SX xvec0, xvec1;
+MUL_SX xvec4, xvec0;
+ADD1_SX xvec0, xvec13;
+SHUF_SX $0xb1, xvec1, xvec2;
+MUL_SX xvec6, xvec1;
+ADD1_SX xvec1, xvec9;
+
+MOV_SX xvec2, xvec3;
+MUL_SX xvec5, xvec2;
+ADD2_SX xvec2, xvec13;
+MUL_SX xvec7, xvec3;
+ADD2_SX xvec3, xvec9;
+
+LD_SX 44*SIZE(ptrba), xvec0;
+MOV_SX xvec0, xvec1;
+MUL_SX xvec4, xvec0;
+ADD1_SX xvec0, xvec12;
+SHUF_SX $0xb1, xvec1, xvec2;
+MUL_SX xvec6, xvec1;
+ADD1_SX xvec1, xvec8;
+
+MOV_SX xvec2, xvec3;
+MUL_SX xvec5, xvec2;
+ADD2_SX xvec2, xvec12;
+MUL_SX xvec7, xvec3;
+ADD2_SX xvec3, xvec8;
+
+EDUP_SX 12*SIZE(ptrbb), xvec4;
+ODUP_SX 12*SIZE(ptrbb), xvec5;
+SHUF_SX $0x4e, xvec4, xvec6;
+SHUF_SX $0x4e, xvec5, xvec7;
+
+LD_SX 48*SIZE(ptrba), xvec0;
+MOV_SX xvec0, xvec1;
+MUL_SX xvec4, xvec0;
+ADD1_SX xvec0, xvec15;
+SHUF_SX $0xb1, xvec1, xvec2;
+MUL_SX xvec6, xvec1;
+ADD1_SX xvec1, xvec11;
+
+MOV_SX xvec2, xvec3;
+MUL_SX xvec5, xvec2;
+ADD2_SX xvec2, xvec15;
+MUL_SX xvec7, xvec3;
+ADD2_SX xvec3, xvec11;
+
+LD_SX 52*SIZE(ptrba), xvec0;
+MOV_SX xvec0, xvec1;
+MUL_SX xvec4, xvec0;
+ADD1_SX xvec0, xvec14;
+SHUF_SX $0xb1, xvec1, xvec2;
+MUL_SX xvec6, xvec1;
+ADD1_SX xvec1, xvec10;
+
+MOV_SX xvec2, xvec3;
+MUL_SX xvec5, xvec2;
+ADD2_SX xvec2, xvec14;
+MUL_SX xvec7, xvec3;
+ADD2_SX xvec3, xvec10;
+
+LD_SX 56*SIZE(ptrba), xvec0;
+MOV_SX xvec0, xvec1;
+MUL_SX xvec4, xvec0;
+ADD1_SX xvec0, xvec13;
+SHUF_SX $0xb1, xvec1, xvec2;
+MUL_SX xvec6, xvec1;
+ADD1_SX xvec1, xvec9;
+
+MOV_SX xvec2, xvec3;
+MUL_SX xvec5, xvec2;
+ADD2_SX xvec2, xvec13;
+MUL_SX xvec7, xvec3;
+ADD2_SX xvec3, xvec9;
+
+LD_SX 60*SIZE(ptrba), xvec0;
+MOV_SX xvec0, xvec1;
+MUL_SX xvec4, xvec0;
+ADD1_SX xvec0, xvec12;
+SHUF_SX $0xb1, xvec1, xvec2;
+MUL_SX xvec6, xvec1;
+ADD1_SX xvec1, xvec8;
+
+MOV_SX xvec2, xvec3;
+MUL_SX xvec5, xvec2;
+ADD2_SX xvec2, xvec12;
+MUL_SX xvec7, xvec3;
+ADD2_SX xvec3, xvec8;
+ADDQ $64*SIZE, ptrba;
+ADDQ $16*SIZE, ptrbb;
+DECQ k;
+JG .L211_bodyB;
+.align 32
+.L211_loopE:
+#ifndef TRMMKERNEL
+TEST $2, bk;
+#else
+TEST $2, kkk;
+#endif
+JLE .L212_loopE;
+.align 32
+.L212_bodyB:
+EDUP_SX 0*SIZE(ptrbb), xvec4;
+ODUP_SX 0*SIZE(ptrbb), xvec5;
+SHUF_SX $0x4e, xvec4, xvec6;
+SHUF_SX $0x4e, xvec5, xvec7;
+
+LD_SX 0*SIZE(ptrba), xvec0;
+MOV_SX xvec0, xvec1;
+MUL_SX xvec4, xvec0;
+ADD1_SX xvec0, xvec15;
+SHUF_SX $0xb1, xvec1, xvec2;
+MUL_SX xvec6, xvec1;
+ADD1_SX xvec1, xvec11;
+
+MOV_SX xvec2, xvec3;
+MUL_SX xvec5, xvec2;
+ADD2_SX xvec2, xvec15;
+MUL_SX xvec7, xvec3;
+ADD2_SX xvec3, xvec11;
+
+LD_SX 4*SIZE(ptrba), xvec0;
+MOV_SX xvec0, xvec1;
+MUL_SX xvec4, xvec0;
+ADD1_SX xvec0, xvec14;
+SHUF_SX $0xb1, xvec1, xvec2;
+MUL_SX xvec6, xvec1;
+ADD1_SX xvec1, xvec10;
+
+MOV_SX xvec2, xvec3;
+MUL_SX xvec5, xvec2;
+ADD2_SX xvec2, xvec14;
+MUL_SX xvec7, xvec3;
+ADD2_SX xvec3, xvec10;
+
+LD_SX 8*SIZE(ptrba), xvec0;
+MOV_SX xvec0, xvec1;
+MUL_SX xvec4, xvec0;
+ADD1_SX xvec0, xvec13;
+SHUF_SX $0xb1, xvec1, xvec2;
+MUL_SX xvec6, xvec1;
+ADD1_SX xvec1, xvec9;
+
+MOV_SX xvec2, xvec3;
+MUL_SX xvec5, xvec2;
+ADD2_SX xvec2, xvec13;
+MUL_SX xvec7, xvec3;
+ADD2_SX xvec3, xvec9;
+
+LD_SX 12*SIZE(ptrba), xvec0;
+MOV_SX xvec0, xvec1;
+MUL_SX xvec4, xvec0;
+ADD1_SX xvec0, xvec12;
+SHUF_SX $0xb1, xvec1, xvec2;
+MUL_SX xvec6, xvec1;
+ADD1_SX xvec1, xvec8;
+
+MOV_SX xvec2, xvec3;
+MUL_SX xvec5, xvec2;
+ADD2_SX xvec2, xvec12;
+MUL_SX xvec7, xvec3;
+ADD2_SX xvec3, xvec8;
+
+EDUP_SX 4*SIZE(ptrbb), xvec4;
+ODUP_SX 4*SIZE(ptrbb), xvec5;
+SHUF_SX $0x4e, xvec4, xvec6;
+SHUF_SX $0x4e, xvec5, xvec7;
+
+LD_SX 16*SIZE(ptrba), xvec0;
+MOV_SX xvec0, xvec1;
+MUL_SX xvec4, xvec0;
+ADD1_SX xvec0, xvec15;
+SHUF_SX $0xb1, xvec1, xvec2;
+MUL_SX xvec6, xvec1;
+ADD1_SX xvec1, xvec11;
+
+MOV_SX xvec2, xvec3;
+MUL_SX xvec5, xvec2;
+ADD2_SX xvec2, xvec15;
+MUL_SX xvec7, xvec3;
+ADD2_SX xvec3, xvec11;
+
+LD_SX 20*SIZE(ptrba), xvec0;
+MOV_SX xvec0, xvec1;
+MUL_SX xvec4, xvec0;
+ADD1_SX xvec0, xvec14;
+SHUF_SX $0xb1, xvec1, xvec2;
+MUL_SX xvec6, xvec1;
+ADD1_SX xvec1, xvec10;
+
+MOV_SX xvec2, xvec3;
+MUL_SX xvec5, xvec2;
+ADD2_SX xvec2, xvec14;
+MUL_SX xvec7, xvec3;
+ADD2_SX xvec3, xvec10;
+
+LD_SX 24*SIZE(ptrba), xvec0;
+MOV_SX xvec0, xvec1;
+MUL_SX xvec4, xvec0;
+ADD1_SX xvec0, xvec13;
+SHUF_SX $0xb1, xvec1, xvec2;
+MUL_SX xvec6, xvec1;
+ADD1_SX xvec1, xvec9;
+
+MOV_SX xvec2, xvec3;
+MUL_SX xvec5, xvec2;
+ADD2_SX xvec2, xvec13;
+MUL_SX xvec7, xvec3;
+ADD2_SX xvec3, xvec9;
+
+LD_SX 28*SIZE(ptrba), xvec0;
+MOV_SX xvec0, xvec1;
+MUL_SX xvec4, xvec0;
+ADD1_SX xvec0, xvec12;
+SHUF_SX $0xb1, xvec1, xvec2;
+MUL_SX xvec6, xvec1;
+ADD1_SX xvec1, xvec8;
+
+MOV_SX xvec2, xvec3;
+MUL_SX xvec5, xvec2;
+ADD2_SX xvec2, xvec12;
+MUL_SX xvec7, xvec3;
+ADD2_SX xvec3, xvec8;
+ADDQ $32*SIZE, ptrba;
+ADDQ $8*SIZE, ptrbb;
+
+.L212_loopE:
+#ifndef TRMMKERNEL
+TEST $1, bk;
+#else
+TEST $1, kkk;
+#endif
+JLE .L213_loopE;
+.align 32
+.L213_bodyB:
+EDUP_SX 0*SIZE(ptrbb), xvec4;
+ODUP_SX 0*SIZE(ptrbb), xvec5;
+SHUF_SX $0x4e, xvec4, xvec6;
+SHUF_SX $0x4e, xvec5, xvec7;
+
+LD_SX 0*SIZE(ptrba), xvec0;
+MOV_SX xvec0, xvec1;
+MUL_SX xvec4, xvec0;
+ADD1_SX xvec0, xvec15;
+SHUF_SX $0xb1, xvec1, xvec2;
+MUL_SX xvec6, xvec1;
+ADD1_SX xvec1, xvec11;
+
+MOV_SX xvec2, xvec3;
+MUL_SX xvec5, xvec2;
+ADD2_SX xvec2, xvec15;
+MUL_SX xvec7, xvec3;
+ADD2_SX xvec3, xvec11;
+
+LD_SX 4*SIZE(ptrba), xvec0;
+MOV_SX xvec0, xvec1;
+MUL_SX xvec4, xvec0;
+ADD1_SX xvec0, xvec14;
+SHUF_SX $0xb1, xvec1, xvec2;
+MUL_SX xvec6, xvec1;
+ADD1_SX xvec1, xvec10;
+
+MOV_SX xvec2, xvec3;
+MUL_SX xvec5, xvec2;
+ADD2_SX xvec2, xvec14;
+MUL_SX xvec7, xvec3;
+ADD2_SX xvec3, xvec10;
+
+LD_SX 8*SIZE(ptrba), xvec0;
+MOV_SX xvec0, xvec1;
+MUL_SX xvec4, xvec0;
+ADD1_SX xvec0, xvec13;
+SHUF_SX $0xb1, xvec1, xvec2;
+MUL_SX xvec6, xvec1;
+ADD1_SX xvec1, xvec9;
+
+MOV_SX xvec2, xvec3;
+MUL_SX xvec5, xvec2;
+ADD2_SX xvec2, xvec13;
+MUL_SX xvec7, xvec3;
+ADD2_SX xvec3, xvec9;
+
+LD_SX 12*SIZE(ptrba), xvec0;
+MOV_SX xvec0, xvec1;
+MUL_SX xvec4, xvec0;
+ADD1_SX xvec0, xvec12;
+SHUF_SX $0xb1, xvec1, xvec2;
+MUL_SX xvec6, xvec1;
+ADD1_SX xvec1, xvec8;
+
+MOV_SX xvec2, xvec3;
+MUL_SX xvec5, xvec2;
+ADD2_SX xvec2, xvec12;
+MUL_SX xvec7, xvec3;
+ADD2_SX xvec3, xvec8;
+ADDQ $16*SIZE, ptrba;
+ADDQ $4*SIZE, ptrbb;
+
+.L213_loopE:
+#### Handle ####
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+XOR_SY yvec7, yvec7, yvec7;
+ADDSUB_SX xvec15, xvec7;
+MOV_SX xvec7, xvec15;
+XOR_SY yvec7, yvec7, yvec7;
+ADDSUB_SX xvec14, xvec7;
+MOV_SX xvec7, xvec14;
+XOR_SY yvec7, yvec7, yvec7;
+ADDSUB_SX xvec13, xvec7;
+MOV_SX xvec7, xvec13;
+XOR_SY yvec7, yvec7, yvec7;
+ADDSUB_SX xvec12, xvec7;
+MOV_SX xvec7, xvec12;
+XOR_SY yvec7, yvec7, yvec7;
+ADDSUB_SX xvec11, xvec7;
+MOV_SX xvec7, xvec11;
+XOR_SY yvec7, yvec7, yvec7;
+ADDSUB_SX xvec10, xvec7;
+MOV_SX xvec7, xvec10;
+XOR_SY yvec7, yvec7, yvec7;
+ADDSUB_SX xvec9, xvec7;
+MOV_SX xvec7, xvec9;
+XOR_SY yvec7, yvec7, yvec7;
+ADDSUB_SX xvec8, xvec7;
+MOV_SX xvec7, xvec8;
+#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
+XOR_SY yvec7, yvec7, yvec7;
+SUB_SX xvec15, xvec7;
+MOV_SX xvec7, xvec15;
+XOR_SY yvec7, yvec7, yvec7;
+SUB_SX xvec14, xvec7;
+MOV_SX xvec7, xvec14;
+XOR_SY yvec7, yvec7, yvec7;
+SUB_SX xvec13, xvec7;
+MOV_SX xvec7, xvec13;
+XOR_SY yvec7, yvec7, yvec7;
+SUB_SX xvec12, xvec7;
+MOV_SX xvec7, xvec12;
+XOR_SY yvec7, yvec7, yvec7;
+SUB_SX xvec11, xvec7;
+MOV_SX xvec7, xvec11;
+XOR_SY yvec7, yvec7, yvec7;
+SUB_SX xvec10, xvec7;
+MOV_SX xvec7, xvec10;
+XOR_SY yvec7, yvec7, yvec7;
+SUB_SX xvec9, xvec7;
+MOV_SX xvec7, xvec9;
+XOR_SY yvec7, yvec7, yvec7;
+SUB_SX xvec8, xvec7;
+MOV_SX xvec7, xvec8;
+#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
+SHUF_SX $0xb1, xvec15, xvec15;
+SHUF_SX $0xb1, xvec14, xvec14;
+SHUF_SX $0xb1, xvec13, xvec13;
+SHUF_SX $0xb1, xvec12, xvec12;
+SHUF_SX $0xb1, xvec11, xvec11;
+SHUF_SX $0xb1, xvec10, xvec10;
+SHUF_SX $0xb1, xvec9, xvec9;
+SHUF_SX $0xb1, xvec8, xvec8;
+XOR_SY yvec7, yvec7, yvec7;
+ADDSUB_SX xvec15, xvec7;
+MOV_SX xvec7, xvec15;
+XOR_SY yvec7, yvec7, yvec7;
+ADDSUB_SX xvec14, xvec7;
+MOV_SX xvec7, xvec14;
+XOR_SY yvec7, yvec7, yvec7;
+ADDSUB_SX xvec13, xvec7;
+MOV_SX xvec7, xvec13;
+XOR_SY yvec7, yvec7, yvec7;
+ADDSUB_SX xvec12, xvec7;
+MOV_SX xvec7, xvec12;
+XOR_SY yvec7, yvec7, yvec7;
+ADDSUB_SX xvec11, xvec7;
+MOV_SX xvec7, xvec11;
+XOR_SY yvec7, yvec7, yvec7;
+ADDSUB_SX xvec10, xvec7;
+MOV_SX xvec7, xvec10;
+XOR_SY yvec7, yvec7, yvec7;
+ADDSUB_SX xvec9, xvec7;
+MOV_SX xvec7, xvec9;
+XOR_SY yvec7, yvec7, yvec7;
+ADDSUB_SX xvec8, xvec7;
+MOV_SX xvec7, xvec8;
+SHUF_SX $0xb1, xvec15, xvec15;
+SHUF_SX $0xb1, xvec14, xvec14;
+SHUF_SX $0xb1, xvec13, xvec13;
+SHUF_SX $0xb1, xvec12, xvec12;
+SHUF_SX $0xb1, xvec11, xvec11;
+SHUF_SX $0xb1, xvec10, xvec10;
+SHUF_SX $0xb1, xvec9, xvec9;
+SHUF_SX $0xb1, xvec8, xvec8;
+#endif
+#### Load Alpha ####
+BROAD_SX MEMALPHA_R, xvec7;
+BROAD_SX MEMALPHA_I, xvec6;
+#### Multiply Alpha ####
+VPERMILP_SX $0xb1,xvec15, xvec5;
+MUL_SX xvec7, xvec15;
+MUL_SX xvec6, xvec5;
+ADDSUB_SX xvec5, xvec15;
+VPERMILP_SX $0xb1,xvec14, xvec4;
+MUL_SX xvec7, xvec14;
+MUL_SX xvec6, xvec4;
+ADDSUB_SX xvec4, xvec14;
+VPERMILP_SX $0xb1,xvec13, xvec3;
+MUL_SX xvec7, xvec13;
+MUL_SX xvec6, xvec3;
+ADDSUB_SX xvec3, xvec13;
+VPERMILP_SX $0xb1,xvec12, xvec2;
+MUL_SX xvec7, xvec12;
+MUL_SX xvec6, xvec2;
+ADDSUB_SX xvec2, xvec12;
+VPERMILP_SX $0xb1,xvec11, xvec1;
+MUL_SX xvec7, xvec11;
+MUL_SX xvec6, xvec1;
+ADDSUB_SX xvec1, xvec11;
+VPERMILP_SX $0xb1,xvec10, xvec0;
+MUL_SX xvec7, xvec10;
+MUL_SX xvec6, xvec0;
+ADDSUB_SX xvec0, xvec10;
+VPERMILP_SX $0xb1,xvec9, xvec5;
+MUL_SX xvec7, xvec9;
+MUL_SX xvec6, xvec5;
+ADDSUB_SX xvec5, xvec9;
+VPERMILP_SX $0xb1,xvec8, xvec4;
+MUL_SX xvec7, xvec8;
+MUL_SX xvec6, xvec4;
+ADDSUB_SX xvec4, xvec8;
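+# Results for the 8x2 tile are written crosswise: the low 64 bits of each of
+# xvec15..12 update C0 and the high 64 bits update C1 (and vice versa for
+# xvec11..8), matching the element order produced by the accumulation above.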
+#ifndef TRMMKERNEL
+LDL_SX 0*SIZE(C0), xvec0;
+LDH_SX 2*SIZE(C1), xvec0;
+LDL_SX 4*SIZE(C0), xvec1;
+LDH_SX 6*SIZE(C1), xvec1;
+LDL_SX 8*SIZE(C0), xvec2;
+LDH_SX 10*SIZE(C1), xvec2;
+LDL_SX 12*SIZE(C0), xvec3;
+LDH_SX 14*SIZE(C1), xvec3;
+ADD_SX xvec0, xvec15;
+ADD_SX xvec1, xvec14;
+ADD_SX xvec2, xvec13;
+ADD_SX xvec3, xvec12;
+#endif
+STL_SX xvec15, 0*SIZE(C0);
+STH_SX xvec15, 2*SIZE(C1);
+STL_SX xvec14, 4*SIZE(C0);
+STH_SX xvec14, 6*SIZE(C1);
+STL_SX xvec13, 8*SIZE(C0);
+STH_SX xvec13, 10*SIZE(C1);
+STL_SX xvec12, 12*SIZE(C0);
+STH_SX xvec12, 14*SIZE(C1);
+#ifndef TRMMKERNEL
+LDL_SX 0*SIZE(C1), xvec4;
+LDH_SX 2*SIZE(C0), xvec4;
+LDL_SX 4*SIZE(C1), xvec5;
+LDH_SX 6*SIZE(C0), xvec5;
+LDL_SX 8*SIZE(C1), xvec6;
+LDH_SX 10*SIZE(C0), xvec6;
+LDL_SX 12*SIZE(C1), xvec7;
+LDH_SX 14*SIZE(C0), xvec7;
+ADD_SX xvec4, xvec11;
+ADD_SX xvec5, xvec10;
+ADD_SX xvec6, xvec9;
+ADD_SX xvec7, xvec8;
+#endif
+STL_SX xvec11, 0*SIZE(C1);
+STH_SX xvec11, 2*SIZE(C0);
+STL_SX xvec10, 4*SIZE(C1);
+STH_SX xvec10, 6*SIZE(C0);
+STL_SX xvec9, 8*SIZE(C1);
+STH_SX xvec9, 10*SIZE(C0);
+STL_SX xvec8, 12*SIZE(C1);
+STH_SX xvec8, 14*SIZE(C0);
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+SALQ $ZBASE_SHIFT, %rax;
+LEAQ (ptrba, %rax, 8), ptrba;
+LEAQ (ptrbb, %rax, 2), ptrbb;
+#endif
+#if defined(TRMMKERNEL) && defined(LEFT)
+ADDQ $8, kk;
+#endif
+
+ADDQ $16*SIZE, C0;
+ADDQ $16*SIZE, C1;
+DECQ i;
+JG .L21_bodyB;
+.align 32
+.L21_loopE:
+TEST $4, bm;
+JLE .L22_loopE;
+.align 32
+.L22_bodyB:
+#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bb,ptrbb;
+#else
+MOVQ bb, ptrbb;
+MOVQ kk, %rax;
+SALQ $ZBASE_SHIFT, %rax;
+LEAQ (ptrba, %rax, 4), ptrba;
+LEAQ (ptrbb, %rax, 2), ptrbb;
+#endif
+XOR_SY yvec15, yvec15, yvec15;
+XOR_SY yvec14, yvec14, yvec14;
+XOR_SY yvec11, yvec11, yvec11;
+XOR_SY yvec10, yvec10, yvec10;
+#ifndef TRMMKERNEL
+MOVQ bk,k;
+#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kk, %rax;
+MOVQ %rax, kkk;
+#else
+MOVQ kk, %rax;
+#ifdef LEFT
+ADDQ $4, %rax;
+#else
+ADDQ $2, %rax;
+#endif
+MOVQ %rax, kkk;
+#endif
+
+SARQ $2, k;
+JLE .L221_loopE;
+.align 32
+.L221_bodyB:
+EDUP_SX 0*SIZE(ptrbb), xvec4;
+ODUP_SX 0*SIZE(ptrbb), xvec5;
+SHUF_SX $0x4e, xvec4, xvec6;
+SHUF_SX $0x4e, xvec5, xvec7;
+
+LD_SX 0*SIZE(ptrba), xvec0;
+MOV_SX xvec0, xvec1;
+MUL_SX xvec4, xvec0;
+ADD1_SX xvec0, xvec15;
+SHUF_SX $0xb1, xvec1, xvec2;
+MUL_SX xvec6, xvec1;
+ADD1_SX xvec1, xvec11;
+
+MOV_SX xvec2, xvec3;
+MUL_SX xvec5, xvec2;
+ADD2_SX xvec2, xvec15;
+MUL_SX xvec7, xvec3;
+ADD2_SX xvec3, xvec11;
+
+LD_SX 4*SIZE(ptrba), xvec0;
+MOV_SX xvec0, xvec1;
+MUL_SX xvec4, xvec0;
+ADD1_SX xvec0, xvec14;
+SHUF_SX $0xb1, xvec1, xvec2;
+MUL_SX xvec6, xvec1;
+ADD1_SX xvec1, xvec10;
+
+MOV_SX xvec2, xvec3;
+MUL_SX xvec5, xvec2;
+ADD2_SX xvec2, xvec14;
+MUL_SX xvec7, xvec3;
+ADD2_SX xvec3, xvec10;
+
+#### Unroll 2 #####
+EDUP_SX 4*SIZE(ptrbb), xvec4;
+ODUP_SX 4*SIZE(ptrbb), xvec5;
+SHUF_SX $0x4e, xvec4, xvec6;
+SHUF_SX $0x4e, xvec5, xvec7;
+
+LD_SX 8*SIZE(ptrba), xvec0;
+MOV_SX xvec0, xvec1;
+MUL_SX xvec4, xvec0;
+ADD1_SX xvec0, xvec15;
+SHUF_SX $0xb1, xvec1, xvec2;
+MUL_SX xvec6, xvec1;
+ADD1_SX xvec1, xvec11;
+
+MOV_SX xvec2, xvec3;
+MUL_SX xvec5, xvec2;
+ADD2_SX xvec2, xvec15;
+MUL_SX xvec7, xvec3;
+ADD2_SX xvec3, xvec11;
+
+LD_SX 12*SIZE(ptrba), xvec0;
+MOV_SX xvec0, xvec1;
+MUL_SX xvec4, xvec0;
+ADD1_SX xvec0, xvec14;
+SHUF_SX $0xb1, xvec1, xvec2;
+MUL_SX xvec6, xvec1;
+ADD1_SX xvec1, xvec10;
+
+MOV_SX xvec2, xvec3;
+MUL_SX xvec5, xvec2;
+ADD2_SX xvec2, xvec14;
+MUL_SX xvec7, xvec3;
+ADD2_SX xvec3, xvec10;
+
+#### Unroll 3 ####
+EDUP_SX 8*SIZE(ptrbb), xvec4;
+ODUP_SX 8*SIZE(ptrbb), xvec5;
+SHUF_SX $0x4e, xvec4, xvec6;
+SHUF_SX $0x4e, xvec5, xvec7;
+
+LD_SX 16*SIZE(ptrba), xvec0;
+MOV_SX xvec0, xvec1;
+MUL_SX xvec4, xvec0;
+ADD1_SX xvec0, xvec15;
+SHUF_SX $0xb1, xvec1, xvec2;
+MUL_SX xvec6, xvec1;
+ADD1_SX xvec1, xvec11;
+
+MOV_SX xvec2, xvec3;
+MUL_SX xvec5, xvec2;
+ADD2_SX xvec2, xvec15;
+MUL_SX xvec7, xvec3;
+ADD2_SX xvec3, xvec11;
+
+LD_SX 20*SIZE(ptrba), xvec0;
+MOV_SX xvec0, xvec1;
+MUL_SX xvec4, xvec0;
+ADD1_SX xvec0, xvec14;
+SHUF_SX $0xb1, xvec1, xvec2;
+MUL_SX xvec6, xvec1;
+ADD1_SX xvec1, xvec10;
+
+MOV_SX xvec2, xvec3;
+MUL_SX xvec5, xvec2;
+ADD2_SX xvec2, xvec14;
+MUL_SX xvec7, xvec3;
+ADD2_SX xvec3, xvec10;
+
+#### Unroll 4 ####
+EDUP_SX 12*SIZE(ptrbb), xvec4;
+ODUP_SX 12*SIZE(ptrbb), xvec5;
+SHUF_SX $0x4e, xvec4, xvec6;
+SHUF_SX $0x4e, xvec5, xvec7;
+
+LD_SX 24*SIZE(ptrba), xvec0;
+MOV_SX xvec0, xvec1;
+MUL_SX xvec4, xvec0;
+ADD1_SX xvec0, xvec15;
+SHUF_SX $0xb1, xvec1, xvec2;
+MUL_SX xvec6, xvec1;
+ADD1_SX xvec1, xvec11;
+
+MOV_SX xvec2, xvec3;
+MUL_SX xvec5, xvec2;
+ADD2_SX xvec2, xvec15;
+MUL_SX xvec7, xvec3;
+ADD2_SX xvec3, xvec11;
+
+LD_SX 28*SIZE(ptrba), xvec0;
+MOV_SX xvec0, xvec1;
+MUL_SX xvec4, xvec0;
+ADD1_SX xvec0, xvec14;
+SHUF_SX $0xb1, xvec1, xvec2;
+MUL_SX xvec6, xvec1;
+ADD1_SX xvec1, xvec10;
+
+MOV_SX xvec2, xvec3;
+MUL_SX xvec5, xvec2;
+ADD2_SX xvec2, xvec14;
+MUL_SX xvec7, xvec3;
+ADD2_SX xvec3, xvec10;
+ADDQ $32*SIZE, ptrba;
+ADDQ $16*SIZE, ptrbb;
+DECQ k;
+JG .L221_bodyB;
+.align 32
+.L221_loopE:
+#ifndef TRMMKERNEL
+TEST $2, bk;
+#else
+TEST $2, kkk;
+#endif
+JLE .L222_loopE;
+.align 32
+.L222_bodyB:
+EDUP_SX 0*SIZE(ptrbb), xvec4;
+ODUP_SX 0*SIZE(ptrbb), xvec5;
+SHUF_SX $0x4e, xvec4, xvec6;
+SHUF_SX $0x4e, xvec5, xvec7;
+
+LD_SX 0*SIZE(ptrba), xvec0;
+MOV_SX xvec0, xvec1;
+MUL_SX xvec4, xvec0;
+ADD1_SX xvec0, xvec15;
+SHUF_SX $0xb1, xvec1, xvec2;
+MUL_SX xvec6, xvec1;
+ADD1_SX xvec1, xvec11;
+
+MOV_SX xvec2, xvec3;
+MUL_SX xvec5, xvec2;
+ADD2_SX xvec2, xvec15;
+MUL_SX xvec7, xvec3;
+ADD2_SX xvec3, xvec11;
+
+LD_SX 4*SIZE(ptrba), xvec0;
+MOV_SX xvec0, xvec1;
+MUL_SX xvec4, xvec0;
+ADD1_SX xvec0, xvec14;
+SHUF_SX $0xb1, xvec1, xvec2;
+MUL_SX xvec6, xvec1;
+ADD1_SX xvec1, xvec10;
+
+MOV_SX xvec2, xvec3;
+MUL_SX xvec5, xvec2;
+ADD2_SX xvec2, xvec14;
+MUL_SX xvec7, xvec3;
+ADD2_SX xvec3, xvec10;
+
+#### Unroll 2 #####
+EDUP_SX 4*SIZE(ptrbb), xvec4;
+ODUP_SX 4*SIZE(ptrbb), xvec5;
+SHUF_SX $0x4e, xvec4, xvec6;
+SHUF_SX $0x4e, xvec5, xvec7;
+
+LD_SX 8*SIZE(ptrba), xvec0;
+MOV_SX xvec0, xvec1;
+MUL_SX xvec4, xvec0;
+ADD1_SX xvec0, xvec15;
+SHUF_SX $0xb1, xvec1, xvec2;
+MUL_SX xvec6, xvec1;
+ADD1_SX xvec1, xvec11;
+
+MOV_SX xvec2, xvec3;
+MUL_SX xvec5, xvec2;
+ADD2_SX xvec2, xvec15;
+MUL_SX xvec7, xvec3;
+ADD2_SX xvec3, xvec11;
+
+LD_SX 12*SIZE(ptrba), xvec0;
+MOV_SX xvec0, xvec1;
+MUL_SX xvec4, xvec0;
+ADD1_SX xvec0, xvec14;
+SHUF_SX $0xb1, xvec1, xvec2;
+MUL_SX xvec6, xvec1;
+ADD1_SX xvec1, xvec10;
+
+MOV_SX xvec2, xvec3;
+MUL_SX xvec5, xvec2;
+ADD2_SX xvec2, xvec14;
+MUL_SX xvec7, xvec3;
+ADD2_SX xvec3, xvec10;
+ADDQ $16*SIZE, ptrba;
+ADDQ $8*SIZE, ptrbb;
+
+.L222_loopE:
+#ifndef TRMMKERNEL
+TEST $1, bk;
+#else
+TEST $1, kkk;
+#endif
+JLE .L223_loopE;
+.align 32
+.L223_bodyB:
+EDUP_SX 0*SIZE(ptrbb), xvec4;
+ODUP_SX 0*SIZE(ptrbb), xvec5;
+SHUF_SX $0x4e, xvec4, xvec6;
+SHUF_SX $0x4e, xvec5, xvec7;
+
+LD_SX 0*SIZE(ptrba), xvec0;
+MOV_SX xvec0, xvec1;
+MUL_SX xvec4, xvec0;
+ADD1_SX xvec0, xvec15;
+SHUF_SX $0xb1, xvec1, xvec2;
+MUL_SX xvec6, xvec1;
+ADD1_SX xvec1, xvec11;
+
+MOV_SX xvec2, xvec3;
+MUL_SX xvec5, xvec2;
+ADD2_SX xvec2, xvec15;
+MUL_SX xvec7, xvec3;
+ADD2_SX xvec3, xvec11;
+
+LD_SX 4*SIZE(ptrba), xvec0;
+MOV_SX xvec0, xvec1;
+MUL_SX xvec4, xvec0;
+ADD1_SX xvec0, xvec14;
+SHUF_SX $0xb1, xvec1, xvec2;
+MUL_SX xvec6, xvec1;
+ADD1_SX xvec1, xvec10;
+
+MOV_SX xvec2, xvec3;
+MUL_SX xvec5, xvec2;
+ADD2_SX xvec2, xvec14;
+MUL_SX xvec7, xvec3;
+ADD2_SX xvec3, xvec10;
+ADDQ $8*SIZE, ptrba;
+ADDQ $4*SIZE, ptrbb;
+
+.L223_loopE:
+#### Handle ####
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+XOR_SY yvec7, yvec7, yvec7;
+ADDSUB_SX xvec15, xvec7;
+MOV_SX xvec7, xvec15;
+XOR_SY yvec7, yvec7, yvec7;
+ADDSUB_SX xvec14, xvec7;
+MOV_SX xvec7, xvec14;
+XOR_SY yvec7, yvec7, yvec7;
+ADDSUB_SX xvec11, xvec7;
+MOV_SX xvec7, xvec11;
+XOR_SY yvec7, yvec7, yvec7;
+ADDSUB_SX xvec10, xvec7;
+MOV_SX xvec7, xvec10;
+#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
+XOR_SY yvec7, yvec7, yvec7;
+SUB_SX xvec15, xvec7;
+MOV_SX xvec7, xvec15;
+XOR_SY yvec7, yvec7, yvec7;
+SUB_SX xvec14, xvec7;
+MOV_SX xvec7, xvec14;
+XOR_SY yvec7, yvec7, yvec7;
+SUB_SX xvec11, xvec7;
+MOV_SX xvec7, xvec11;
+XOR_SY yvec7, yvec7, yvec7;
+SUB_SX xvec10, xvec7;
+MOV_SX xvec7, xvec10;
+#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
+SHUF_SX $0xb1, xvec15, xvec15;
+SHUF_SX $0xb1, xvec14, xvec14;
+SHUF_SX $0xb1, xvec11, xvec11;
+SHUF_SX $0xb1, xvec10, xvec10;
+XOR_SY yvec7, yvec7, yvec7;
+ADDSUB_SX xvec15, xvec7;
+MOV_SX xvec7, xvec15;
+XOR_SY yvec7, yvec7, yvec7;
+ADDSUB_SX xvec14, xvec7;
+MOV_SX xvec7, xvec14;
+XOR_SY yvec7, yvec7, yvec7;
+ADDSUB_SX xvec11, xvec7;
+MOV_SX xvec7, xvec11;
+XOR_SY yvec7, yvec7, yvec7;
+ADDSUB_SX xvec10, xvec7;
+MOV_SX xvec7, xvec10;
+SHUF_SX $0xb1, xvec15, xvec15;
+SHUF_SX $0xb1, xvec14, xvec14;
+SHUF_SX $0xb1, xvec11, xvec11;
+SHUF_SX $0xb1, xvec10, xvec10;
+#endif
+#### Load Alpha ####
+BROAD_SX MEMALPHA_R, xvec7;
+BROAD_SX MEMALPHA_I, xvec6;
+#### Multiply Alpha ####
+VPERMILP_SX $0xb1,xvec15, xvec5;
+MUL_SX xvec7, xvec15;
+MUL_SX xvec6, xvec5;
+ADDSUB_SX xvec5, xvec15;
+VPERMILP_SX $0xb1,xvec14, xvec4;
+MUL_SX xvec7, xvec14;
+MUL_SX xvec6, xvec4;
+ADDSUB_SX xvec4, xvec14;
+VPERMILP_SX $0xb1,xvec11, xvec1;
+MUL_SX xvec7, xvec11;
+MUL_SX xvec6, xvec1;
+ADDSUB_SX xvec1, xvec11;
+VPERMILP_SX $0xb1,xvec10, xvec0;
+MUL_SX xvec7, xvec10;
+MUL_SX xvec6, xvec0;
+ADDSUB_SX xvec0, xvec10;
+#ifndef TRMMKERNEL
+LDL_SX 0*SIZE(C0), xvec0;
+LDH_SX 2*SIZE(C1), xvec0;
+LDL_SX 4*SIZE(C0), xvec1;
+LDH_SX 6*SIZE(C1), xvec1;
+ADD_SX xvec0, xvec15;
+ADD_SX xvec1, xvec14;
+#endif
+STL_SX xvec15, 0*SIZE(C0);
+STH_SX xvec15, 2*SIZE(C1);
+STL_SX xvec14, 4*SIZE(C0);
+STH_SX xvec14, 6*SIZE(C1);
+#ifndef TRMMKERNEL
+LDL_SX 0*SIZE(C1), xvec4;
+LDH_SX 2*SIZE(C0), xvec4;
+LDL_SX 4*SIZE(C1), xvec5;
+LDH_SX 6*SIZE(C0), xvec5;
+ADD_SX xvec4, xvec11;
+ADD_SX xvec5, xvec10;
+#endif
+STL_SX xvec11, 0*SIZE(C1);
+STH_SX xvec11, 2*SIZE(C0);
+STL_SX xvec10, 4*SIZE(C1);
+STH_SX xvec10, 6*SIZE(C0);
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+SALQ $ZBASE_SHIFT, %rax;
+LEAQ (ptrba, %rax, 4), ptrba;
+LEAQ (ptrbb, %rax, 2), ptrbb;
+#endif
+#if defined(TRMMKERNEL) && defined(LEFT)
+ADDQ $4, kk;
+#endif
+
+ADDQ $8*SIZE, C0;
+ADDQ $8*SIZE, C1;
+
+.L22_loopE:
+TEST $2, bm;
+JLE .L23_loopE;
+.align 32
+.L23_bodyB:
+#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bb,ptrbb;
+#else
+MOVQ bb, ptrbb;
+MOVQ kk, %rax;
+SALQ $ZBASE_SHIFT, %rax;
+LEAQ (ptrba, %rax, 2), ptrba;
+LEAQ (ptrbb, %rax, 2), ptrbb;
+#endif
+XOR_SY yvec15, yvec15, yvec15;
+XOR_SY yvec11, yvec11, yvec11;
+#ifndef TRMMKERNEL
+MOVQ bk,k;
+#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kk, %rax;
+MOVQ %rax, kkk;
+#else
+MOVQ kk, %rax;
+#ifdef LEFT
+ADDQ $2, %rax;
+#else
+ADDQ $2, %rax;
+#endif
+MOVQ %rax, kkk;
+#endif
+SARQ $2, k;
+JLE .L231_loopE;
+.align 32
+.L231_bodyB:
+EDUP_SX 0*SIZE(ptrbb), xvec4;
+ODUP_SX 0*SIZE(ptrbb), xvec5;
+SHUF_SX $0x4e, xvec4, xvec6;
+SHUF_SX $0x4e, xvec5, xvec7;
+
+LD_SX 0*SIZE(ptrba), xvec0;
+MOV_SX xvec0, xvec1;
+MUL_SX xvec4, xvec0;
+ADD1_SX xvec0, xvec15;
+SHUF_SX $0xb1, xvec1, xvec2;
+MUL_SX xvec6, xvec1;
+ADD1_SX xvec1, xvec11;
+
+MOV_SX xvec2, xvec3;
+MUL_SX xvec5, xvec2;
+ADD2_SX xvec2, xvec15;
+MUL_SX xvec7, xvec3;
+ADD2_SX xvec3, xvec11;
+
+#### Unroll 2 ####
+EDUP_SX 4*SIZE(ptrbb), xvec4;
+ODUP_SX 4*SIZE(ptrbb), xvec5;
+SHUF_SX $0x4e, xvec4, xvec6;
+SHUF_SX $0x4e, xvec5, xvec7;
+
+LD_SX 4*SIZE(ptrba), xvec0;
+MOV_SX xvec0, xvec1;
+MUL_SX xvec4, xvec0;
+ADD1_SX xvec0, xvec15;
+SHUF_SX $0xb1, xvec1, xvec2;
+MUL_SX xvec6, xvec1;
+ADD1_SX xvec1, xvec11;
+
+MOV_SX xvec2, xvec3;
+MUL_SX xvec5, xvec2;
+ADD2_SX xvec2, xvec15;
+MUL_SX xvec7, xvec3;
+ADD2_SX xvec3, xvec11;
+
+#### Unroll 3 ####
+EDUP_SX 8*SIZE(ptrbb), xvec4;
+ODUP_SX 8*SIZE(ptrbb), xvec5;
+SHUF_SX $0x4e, xvec4, xvec6;
+SHUF_SX $0x4e, xvec5, xvec7;
+
+LD_SX 8*SIZE(ptrba), xvec0;
+MOV_SX xvec0, xvec1;
+MUL_SX xvec4, xvec0;
+ADD1_SX xvec0, xvec15;
+SHUF_SX $0xb1, xvec1, xvec2;
+MUL_SX xvec6, xvec1;
+ADD1_SX xvec1, xvec11;
+
+MOV_SX xvec2, xvec3;
+MUL_SX xvec5, xvec2;
+ADD2_SX xvec2, xvec15;
+MUL_SX xvec7, xvec3;
+ADD2_SX xvec3, xvec11;
+
+#### Unroll 4 ####
+EDUP_SX 12*SIZE(ptrbb), xvec4;
+ODUP_SX 12*SIZE(ptrbb), xvec5;
+SHUF_SX $0x4e, xvec4, xvec6;
+SHUF_SX $0x4e, xvec5, xvec7;
+
+LD_SX 12*SIZE(ptrba), xvec0;
+MOV_SX xvec0, xvec1;
+MUL_SX xvec4, xvec0;
+ADD1_SX xvec0, xvec15;
+SHUF_SX $0xb1, xvec1, xvec2;
+MUL_SX xvec6, xvec1;
+ADD1_SX xvec1, xvec11;
+
+MOV_SX xvec2, xvec3;
+MUL_SX xvec5, xvec2;
+ADD2_SX xvec2, xvec15;
+MUL_SX xvec7, xvec3;
+ADD2_SX xvec3, xvec11;
+ADDQ $16*SIZE, ptrba;
+ADDQ $16*SIZE, ptrbb;
+DECQ k;
+JG .L231_bodyB;
+.align 32
+.L231_loopE:
+#ifndef TRMMKERNEL
+TEST $2, bk;
+#else
+TEST $2, kkk;
+#endif
+JLE .L232_loopE;
+.align 32
+.L232_bodyB:
+EDUP_SX 0*SIZE(ptrbb), xvec4;
+ODUP_SX 0*SIZE(ptrbb), xvec5;
+SHUF_SX $0x4e, xvec4, xvec6;
+SHUF_SX $0x4e, xvec5, xvec7;
+
+LD_SX 0*SIZE(ptrba), xvec0;
+MOV_SX xvec0, xvec1;
+MUL_SX xvec4, xvec0;
+ADD1_SX xvec0, xvec15;
+SHUF_SX $0xb1, xvec1, xvec2;
+MUL_SX xvec6, xvec1;
+ADD1_SX xvec1, xvec11;
+
+MOV_SX xvec2, xvec3;
+MUL_SX xvec5, xvec2;
+ADD2_SX xvec2, xvec15;
+MUL_SX xvec7, xvec3;
+ADD2_SX xvec3, xvec11;
+
+#### Unroll 2 ####
+EDUP_SX 4*SIZE(ptrbb), xvec4;
+ODUP_SX 4*SIZE(ptrbb), xvec5;
+SHUF_SX $0x4e, xvec4, xvec6;
+SHUF_SX $0x4e, xvec5, xvec7;
+
+LD_SX 4*SIZE(ptrba), xvec0;
+MOV_SX xvec0, xvec1;
+MUL_SX xvec4, xvec0;
+ADD1_SX xvec0, xvec15;
+SHUF_SX $0xb1, xvec1, xvec2;
+MUL_SX xvec6, xvec1;
+ADD1_SX xvec1, xvec11;
+
+MOV_SX xvec2, xvec3;
+MUL_SX xvec5, xvec2;
+ADD2_SX xvec2, xvec15;
+MUL_SX xvec7, xvec3;
+ADD2_SX xvec3, xvec11;
+ADDQ $8*SIZE, ptrba;
+ADDQ $8*SIZE, ptrbb;
+
+.L232_loopE:
+#ifndef TRMMKERNEL
+TEST $1, bk;
+#else
+TEST $1, kkk;
+#endif
+JLE .L233_loopE;
+.align 32
+.L233_bodyB:
+EDUP_SX 0*SIZE(ptrbb), xvec4;
+ODUP_SX 0*SIZE(ptrbb), xvec5;
+SHUF_SX $0x4e, xvec4, xvec6;
+SHUF_SX $0x4e, xvec5, xvec7;
+
+LD_SX 0*SIZE(ptrba), xvec0;
+MOV_SX xvec0, xvec1;
+MUL_SX xvec4, xvec0;
+ADD1_SX xvec0, xvec15;
+SHUF_SX $0xb1, xvec1, xvec2;
+MUL_SX xvec6, xvec1;
+ADD1_SX xvec1, xvec11;
+
+MOV_SX xvec2, xvec3;
+MUL_SX xvec5, xvec2;
+ADD2_SX xvec2, xvec15;
+MUL_SX xvec7, xvec3;
+ADD2_SX xvec3, xvec11;
+ADDQ $4*SIZE, ptrba;
+ADDQ $4*SIZE, ptrbb;
+
+.L233_loopE:
+#### Handle ####
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+XOR_SY yvec7, yvec7, yvec7;
+ADDSUB_SX xvec15, xvec7;
+MOV_SX xvec7, xvec15;
+XOR_SY yvec7, yvec7, yvec7;
+ADDSUB_SX xvec11, xvec7;
+MOV_SX xvec7, xvec11;
+#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
+XOR_SY yvec7, yvec7, yvec7;
+SUB_SX xvec15, xvec7;
+MOV_SX xvec7, xvec15;
+XOR_SY yvec7, yvec7, yvec7;
+SUB_SX xvec11, xvec7;
+MOV_SX xvec7, xvec11;
+#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
+SHUF_SX $0xb1, xvec15, xvec15;
+SHUF_SX $0xb1, xvec11, xvec11;
+XOR_SY yvec7, yvec7, yvec7;
+ADDSUB_SX xvec15, xvec7;
+MOV_SX xvec7, xvec15;
+XOR_SY yvec7, yvec7, yvec7;
+ADDSUB_SX xvec11, xvec7;
+MOV_SX xvec7, xvec11;
+SHUF_SX $0xb1, xvec15, xvec15;
+SHUF_SX $0xb1, xvec11, xvec11;
+#endif
+#### Multiply Alpha ####
+BROAD_SX MEMALPHA_R, xvec7;
+BROAD_SX MEMALPHA_I, xvec6;
+#### Writing back ####
+VPERMILP_SX $0xb1,xvec15, xvec5;
+MUL_SX xvec7, xvec15;
+MUL_SX xvec6, xvec5;
+ADDSUB_SX xvec5, xvec15;
+VPERMILP_SX $0xb1,xvec11, xvec1;
+MUL_SX xvec7, xvec11;
+MUL_SX xvec6, xvec1;
+ADDSUB_SX xvec1, xvec11;
+#ifndef TRMMKERNEL
+LDL_SX 0*SIZE(C0), xvec0;
+LDH_SX 2*SIZE(C1), xvec0;
+ADD_SX xvec0, xvec15;
+#endif
+STL_SX xvec15, 0*SIZE(C0);
+STH_SX xvec15, 2*SIZE(C1);
+#ifndef TRMMKERNEL
+LDL_SX 0*SIZE(C1), xvec4;
+LDH_SX 2*SIZE(C0), xvec4;
+ADD_SX xvec4, xvec11;
+#endif
+STL_SX xvec11, 0*SIZE(C1);
+STH_SX xvec11, 2*SIZE(C0);
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+SALQ $ZBASE_SHIFT, %rax;
+LEAQ (ptrba, %rax, 2), ptrba;
+LEAQ (ptrbb, %rax, 2), ptrbb;
+#endif
+#if defined(TRMMKERNEL) && defined(LEFT)
+ADDQ $2, kk;
+#endif
+
+ADDQ $4*SIZE, C0;
+ADDQ $4*SIZE, C1;
+
+.L23_loopE:
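+# M&1 remainder: a single row against the 2-column panel; A is broadcast one
+# real/imaginary component at a time and the 1x2 result is kept in xvec15.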
+TEST $1, bm;
+JLE .L24_loopE;
+.align 32
+.L24_bodyB:
+#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bb,ptrbb;
+#else
+MOVQ bb, ptrbb;
+MOVQ kk, %rax;
+SALQ $ZBASE_SHIFT, %rax;
+ADDQ %rax, ptrba;
+LEAQ (ptrbb, %rax, 2), ptrbb;
+#endif
+XOR_SY yvec15, yvec15, yvec15;
+#ifndef TRMMKERNEL
+MOVQ bk,k;
+#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kk, %rax;
+MOVQ %rax, kkk;
+#else
+MOVQ kk, %rax;
+#ifdef LEFT
+ADDQ $1, %rax;
+#else
+ADDQ $2, %rax;
+#endif
+MOVQ %rax, kkk;
+#endif
+SARQ $2, k;
+JLE .L241_loopE;
+.align 32
+.L241_bodyB:
+BROAD_SX 0*SIZE(ptrba), xvec0;
+LD_SX 0*SIZE(ptrbb), xvec2;
+SHUF_SX $0xb1, xvec2, xvec3;
+MUL_SX xvec0, xvec2;
+ADD1_SX xvec2, xvec15;
+
+BROAD_SX 1*SIZE(ptrba), xvec1;
+MUL_SX xvec1, xvec3;
+ADD2_SX xvec3, xvec15;
+
+BROAD_SX 2*SIZE(ptrba), xvec0;
+LD_SX 4*SIZE(ptrbb), xvec2;
+SHUF_SX $0xb1, xvec2, xvec3;
+MUL_SX xvec0, xvec2;
+ADD1_SX xvec2, xvec15;
+
+BROAD_SX 3*SIZE(ptrba), xvec1;
+MUL_SX xvec1, xvec3;
+ADD2_SX xvec3, xvec15;
+
+BROAD_SX 4*SIZE(ptrba), xvec0;
+LD_SX 8*SIZE(ptrbb), xvec2;
+SHUF_SX $0xb1, xvec2, xvec3;
+MUL_SX xvec0, xvec2;
+ADD1_SX xvec2, xvec15;
+
+BROAD_SX 5*SIZE(ptrba), xvec1;
+MUL_SX xvec1, xvec3;
+ADD2_SX xvec3, xvec15;
+
+BROAD_SX 6*SIZE(ptrba), xvec0;
+LD_SX 12*SIZE(ptrbb), xvec2;
+SHUF_SX $0xb1, xvec2, xvec3;
+MUL_SX xvec0, xvec2;
+ADD1_SX xvec2, xvec15;
+
+BROAD_SX 7*SIZE(ptrba), xvec1;
+MUL_SX xvec1, xvec3;
+ADD2_SX xvec3, xvec15;
+ADDQ $8*SIZE, ptrba;
+ADDQ $16*SIZE, ptrbb;
+DECQ k;
+JG .L241_bodyB;
+.align 32
+.L241_loopE:
+#ifndef TRMMKERNEL
+TEST $2, bk;
+#else
+TEST $2, kkk;
+#endif
+JLE .L242_loopE;
+.align 32
+.L242_bodyB:
+BROAD_SX 0*SIZE(ptrba), xvec0;
+LD_SX 0*SIZE(ptrbb), xvec2;
+SHUF_SX $0xb1, xvec2, xvec3;
+MUL_SX xvec0, xvec2;
+ADD1_SX xvec2, xvec15;
+
+BROAD_SX 1*SIZE(ptrba), xvec1;
+MUL_SX xvec1, xvec3;
+ADD2_SX xvec3, xvec15;
+
+BROAD_SX 2*SIZE(ptrba), xvec0;
+LD_SX 4*SIZE(ptrbb), xvec2;
+SHUF_SX $0xb1, xvec2, xvec3;
+MUL_SX xvec0, xvec2;
+ADD1_SX xvec2, xvec15;
+
+BROAD_SX 3*SIZE(ptrba), xvec1;
+MUL_SX xvec1, xvec3;
+ADD2_SX xvec3, xvec15;
+ADDQ $4*SIZE, ptrba;
+ADDQ $8*SIZE, ptrbb;
+
+.L242_loopE:
+#ifndef TRMMKERNEL
+TEST $1, bk;
+#else
+TEST $1, kkk;
+#endif
+JLE .L243_loopE;
+.align 32
+.L243_bodyB:
+BROAD_SX 0*SIZE(ptrba), xvec0;
+LD_SX 0*SIZE(ptrbb), xvec2;
+SHUF_SX $0xb1, xvec2, xvec3;
+MUL_SX xvec0, xvec2;
+ADD1_SX xvec2, xvec15;
+
+BROAD_SX 1*SIZE(ptrba), xvec1;
+MUL_SX xvec1, xvec3;
+ADD2_SX xvec3, xvec15;
+ADDQ $2*SIZE, ptrba;
+ADDQ $4*SIZE, ptrbb;
+
+.L243_loopE:
+#### Handle ####
+XOR_SY yvec7, yvec7, yvec7;
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ADDSUB_SX xvec15, xvec7;
+MOV_SX xvec7, xvec15;
+#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
+SUB_SX xvec15, xvec7;
+MOV_SX xvec7, xvec15;
+#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
+SHUF_SX $0xb1, xvec15, xvec15;
+ADDSUB_SX xvec15, xvec7;
+MOV_SX xvec7, xvec15;
+SHUF_SX $0xb1, xvec15, xvec15;
+#endif
+##### Load Alpha ####
+BROAD_SX MEMALPHA_R,xvec7;
+BROAD_SX MEMALPHA_I,xvec6;
+##### Multiply Alpha ####
+VPERMILP_SX $0xb1,xvec15, xvec5;
+MUL_SX xvec7, xvec15;
+MUL_SX xvec6, xvec5;
+ADDSUB_SX xvec5, xvec15;
+#### Writing back ####
+#ifndef TRMMKERNEL
+LDL_SX 0*SIZE(C0), xvec0;
+LDH_SX 0*SIZE(C1), xvec0;
+ADD_SX xvec0, xvec15;
+#endif
+STL_SX xvec15, 0*SIZE(C0);
+STH_SX xvec15, 0*SIZE(C1);
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+SALQ $ZBASE_SHIFT, %rax;
+ADDQ %rax, ptrba;
+LEAQ (ptrbb, %rax, 2), ptrbb;
+#endif
+#if defined(TRMMKERNEL) && defined(LEFT)
+ADDQ $1, kk;
+#endif
+ADDQ $2*SIZE, C0;
+ADDQ $2*SIZE, C1;
+.L24_loopE:
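+# End of the 2-column panel: for TRMM with the triangular factor on the right
+# (no LEFT) advance kk by the two columns handled, then move bb past the packed
+# panel (bk*2 complex values) and C forward by two columns.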
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ADDQ $2, kk;
+#endif
+MOVQ bk, k;
+SALQ $4, k;
+ADDQ k, bb;
+LEAQ (C, ldc, 2), C;
+.L20_loopE:
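+# N&1: one remaining column of C, handled with the same M = 8/4/2/1 cascade
+# below; only C0 is written.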
+TEST $1, bn;
+JLE .L30_loopE;
+.align 32
+.L30_bodyB:
+#if defined(TRMMKERNEL) && defined(LEFT)
+MOVQ OFFSET, %rax;
+MOVQ %rax, kk;
+#endif
+MOVQ C, C0;
+MOVQ ba, ptrba;
+MOVQ bm, i;
+SARQ $3, i;
+JLE .L31_loopE;
+.align 32
+.L31_bodyB:
+MOVQ bb, ptrbb;
+#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bb,ptrbb;
+#else
+MOVQ bb, ptrbb;
+MOVQ kk, %rax;
+SALQ $ZBASE_SHIFT, %rax;
+LEAQ (ptrba, %rax, 8), ptrba;
+ADDQ %rax, ptrbb;
+#endif
+XOR_SY yvec15, yvec15, yvec15;
+XOR_SY yvec14, yvec14, yvec14;
+MOVQ bk, k;
+#ifndef TRMMKERNEL
+MOVQ bk,k;
+#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kk, %rax;
+MOVQ %rax, kkk;
+#else
+MOVQ kk, %rax;
+#ifdef LEFT
+ADDQ $8, %rax;
+#else
+ADDQ $1, %rax;
+#endif
+MOVQ %rax, kkk;
+#endif
+SARQ $2, k;
+JLE .L311_loopE;
+.align 32
+.L311_bodyB:
+#### Unroll 1 ####
+LD_SY 0*SIZE(ptrba), yvec0;
+LD_SY 8*SIZE(ptrba), yvec1;
+BROAD_SY 0*SIZE(ptrbb), yvec2;
+MUL_SY yvec0, yvec2, yvec6;
+ADD1_SY yvec6, yvec15, yvec15;
+MUL_SY yvec1, yvec2, yvec7;
+ADD1_SY yvec7, yvec14, yvec14;
+
+BROAD_SY 1*SIZE(ptrbb), yvec3;
+VPERMILP_SY $0xb1, yvec0, yvec4;
+VPERMILP_SY $0xb1, yvec1, yvec5;
+MUL_SY yvec4, yvec3, yvec6;
+ADD2_SY yvec6, yvec15, yvec15;
+MUL_SY yvec5, yvec3, yvec7;
+ADD2_SY yvec7, yvec14, yvec14;
+
+#### Unroll 2 ####
+LD_SY 16*SIZE(ptrba), yvec0;
+LD_SY 24*SIZE(ptrba), yvec1;
+BROAD_SY 2*SIZE(ptrbb), yvec2;
+MUL_SY yvec0, yvec2, yvec6;
+ADD1_SY yvec6, yvec15, yvec15;
+MUL_SY yvec1, yvec2, yvec7;
+ADD1_SY yvec7, yvec14, yvec14;
+
+BROAD_SY 3*SIZE(ptrbb), yvec3;
+VPERMILP_SY $0xb1, yvec0, yvec4;
+VPERMILP_SY $0xb1, yvec1, yvec5;
+MUL_SY yvec4, yvec3, yvec6;
+ADD2_SY yvec6, yvec15, yvec15;
+MUL_SY yvec5, yvec3, yvec7;
+ADD2_SY yvec7, yvec14, yvec14;
+
+#### Unroll 3 ####
+LD_SY 32*SIZE(ptrba), yvec0;
+LD_SY 40*SIZE(ptrba), yvec1;
+BROAD_SY 4*SIZE(ptrbb), yvec2;
+MUL_SY yvec0, yvec2, yvec6;
+ADD1_SY yvec6, yvec15, yvec15;
+MUL_SY yvec1, yvec2, yvec7;
+ADD1_SY yvec7, yvec14, yvec14;
+
+BROAD_SY 5*SIZE(ptrbb), yvec3;
+VPERMILP_SY $0xb1, yvec0, yvec4;
+VPERMILP_SY $0xb1, yvec1, yvec5;
+MUL_SY yvec4, yvec3, yvec6;
+ADD2_SY yvec6, yvec15, yvec15;
+MUL_SY yvec5, yvec3, yvec7;
+ADD2_SY yvec7, yvec14, yvec14;
+
+#### Unroll 4 ####
+LD_SY 48*SIZE(ptrba), yvec0;
+LD_SY 56*SIZE(ptrba), yvec1;
+BROAD_SY 6*SIZE(ptrbb), yvec2;
+MUL_SY yvec0, yvec2, yvec6;
+ADD1_SY yvec6, yvec15, yvec15;
+MUL_SY yvec1, yvec2, yvec7;
+ADD1_SY yvec7, yvec14, yvec14;
+
+BROAD_SY 7*SIZE(ptrbb), yvec3;
+VPERMILP_SY $0xb1, yvec0, yvec4;
+VPERMILP_SY $0xb1, yvec1, yvec5;
+MUL_SY yvec4, yvec3, yvec6;
+ADD2_SY yvec6, yvec15, yvec15;
+MUL_SY yvec5, yvec3, yvec7;
+ADD2_SY yvec7, yvec14, yvec14;
+ADDQ $64*SIZE, ptrba;
+ADDQ $8*SIZE, ptrbb;
+DECQ k;
+JG .L311_bodyB;
+.align 32
+.L311_loopE:
+#ifndef TRMMKERNEL
+TEST $2, bk;
+#else
+TEST $2, kkk;
+#endif
+JLE .L312_loopE;
+.align 32
+.L312_bodyB:
+#### Unroll 1 ####
+LD_SY 0*SIZE(ptrba), yvec0;
+LD_SY 8*SIZE(ptrba), yvec1;
+BROAD_SY 0*SIZE(ptrbb), yvec2;
+MUL_SY yvec0, yvec2, yvec6;
+ADD1_SY yvec6, yvec15, yvec15;
+MUL_SY yvec1, yvec2, yvec7;
+ADD1_SY yvec7, yvec14, yvec14;
+
+BROAD_SY 1*SIZE(ptrbb), yvec3;
+VPERMILP_SY $0xb1, yvec0, yvec4;
+VPERMILP_SY $0xb1, yvec1, yvec5;
+MUL_SY yvec4, yvec3, yvec6;
+ADD2_SY yvec6, yvec15, yvec15;
+MUL_SY yvec5, yvec3, yvec7;
+ADD2_SY yvec7, yvec14, yvec14;
+
+#### Unroll 2 ####
+LD_SY 16*SIZE(ptrba), yvec0;
+LD_SY 24*SIZE(ptrba), yvec1;
+BROAD_SY 2*SIZE(ptrbb), yvec2;
+MUL_SY yvec0, yvec2, yvec6;
+ADD1_SY yvec6, yvec15, yvec15;
+MUL_SY yvec1, yvec2, yvec7;
+ADD1_SY yvec7, yvec14, yvec14;
+
+BROAD_SY 3*SIZE(ptrbb), yvec3;
+VPERMILP_SY $0xb1, yvec0, yvec4;
+VPERMILP_SY $0xb1, yvec1, yvec5;
+MUL_SY yvec4, yvec3, yvec6;
+ADD2_SY yvec6, yvec15, yvec15;
+MUL_SY yvec5, yvec3, yvec7;
+ADD2_SY yvec7, yvec14, yvec14;
+ADDQ $32*SIZE, ptrba;
+ADDQ $4*SIZE, ptrbb;
+
+.L312_loopE:
+#ifndef TRMMKERNEL
+TEST $1, bk;
+#else
+TEST $1, kkk;
+#endif
+JLE .L313_loopE;
+.align 32
+.L313_bodyB:
+#### Unroll 1 ####
+LD_SY 0*SIZE(ptrba), yvec0;
+LD_SY 8*SIZE(ptrba), yvec1;
+BROAD_SY 0*SIZE(ptrbb), yvec2;
+MUL_SY yvec0, yvec2, yvec6;
+ADD1_SY yvec6, yvec15, yvec15;
+MUL_SY yvec1, yvec2, yvec7;
+ADD1_SY yvec7, yvec14, yvec14;
+
+BROAD_SY 1*SIZE(ptrbb), yvec3;
+VPERMILP_SY $0xb1, yvec0, yvec4;
+VPERMILP_SY $0xb1, yvec1, yvec5;
+MUL_SY yvec4, yvec3, yvec6;
+ADD2_SY yvec6, yvec15, yvec15;
+MUL_SY yvec5, yvec3, yvec7;
+ADD2_SY yvec7, yvec14, yvec14;
+ADDQ $16*SIZE, ptrba;
+ADDQ $2*SIZE, ptrbb;
+
+.L313_loopE:
+#### Handle ####
+XOR_SY yvec7, yvec7, yvec7;
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ADDSUB_SY yvec15, yvec7, yvec15;
+ADDSUB_SY yvec14, yvec7, yvec14;
+#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
+SUB_SY yvec15, yvec7, yvec15;
+SUB_SY yvec14, yvec7, yvec14;
+#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
+VPERMILP_SY $0xb1, yvec15, yvec15;
+VPERMILP_SY $0xb1, yvec14, yvec14;
+ADDSUB_SY yvec15, yvec7, yvec15;
+ADDSUB_SY yvec14, yvec7, yvec14;
+VPERMILP_SY $0xb1, yvec15, yvec15;
+VPERMILP_SY $0xb1, yvec14, yvec14;
+#endif
+##### Load Alpha ####
+BROAD_SY MEMALPHA_R,yvec7;
+BROAD_SY MEMALPHA_I,yvec6;
+##### Multiply Alpha ####
+VPERMILP_SY $0xb1,yvec15, yvec5;
+MUL_SY yvec15, yvec7, yvec15;
+MUL_SY yvec5, yvec6, yvec5;
+ADDSUB_SY yvec5, yvec15, yvec15;
+VPERMILP_SY $0xb1,yvec14, yvec4;
+MUL_SY yvec14, yvec7, yvec14;
+MUL_SY yvec4, yvec6, yvec4;
+ADDSUB_SY yvec4, yvec14, yvec14;
+#### Writing back ####
+EXTRA_SY $1, yvec15, xvec7;
+EXTRA_SY $1, yvec14, xvec6;
+#ifndef TRMMKERNEL
+LDL_SX 0*SIZE(C0), xvec0;
+LDH_SX 2*SIZE(C0), xvec0;
+LDL_SX 4*SIZE(C0), xvec1;
+LDH_SX 6*SIZE(C0), xvec1;
+LDL_SX 8*SIZE(C0), xvec2;
+LDH_SX 10*SIZE(C0), xvec2;
+LDL_SX 12*SIZE(C0), xvec3;
+LDH_SX 14*SIZE(C0), xvec3;
+ADD_SX xvec0, xvec15;
+ADD_SX xvec1, xvec7;
+ADD_SX xvec2, xvec14;
+ADD_SX xvec3, xvec6;
+#endif
+STL_SX xvec15, 0*SIZE(C0);
+STH_SX xvec15, 2*SIZE(C0);
+STL_SX xvec7, 4*SIZE(C0);
+STH_SX xvec7, 6*SIZE(C0);
+STL_SX xvec14, 8*SIZE(C0);
+STH_SX xvec14, 10*SIZE(C0);
+STL_SX xvec6, 12*SIZE(C0);
+STH_SX xvec6, 14*SIZE(C0);
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+SALQ $ZBASE_SHIFT, %rax;
+LEAQ (ptrba, %rax, 8), ptrba;
+ADDQ %rax, ptrbb;
+#endif
+#if defined(TRMMKERNEL) && defined(LEFT)
+ADDQ $8, kk;
+#endif
+ADDQ $16*SIZE, C0;
+DECQ i;
+JG .L31_bodyB;
+.align 32
+.L31_loopE:
+TEST $4, bm;
+JLE .L32_loopE;
+.align 32
+.L32_bodyB:
+#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bb,ptrbb;
+#else
+MOVQ bb, ptrbb;
+MOVQ kk, %rax;
+SALQ $ZBASE_SHIFT, %rax;
+LEAQ (ptrba, %rax, 4), ptrba;
+ADDQ %rax, ptrbb;
+#endif
+XOR_SY yvec15, yvec15, yvec15;
+#ifndef TRMMKERNEL
+MOVQ bk,k;
+#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kk, %rax;
+MOVQ %rax, kkk;
+#else
+MOVQ kk, %rax;
+#ifdef LEFT
+ADDQ $4, %rax;
+#else
+ADDQ $1, %rax;
+#endif
+MOVQ %rax, kkk;
+#endif
+SARQ $2, k;
+JLE .L321_loopE;
+.align 32
+.L321_bodyB:
+#### Unroll 1 ####
+LD_SY 0*SIZE(ptrba), yvec0;
+BROAD_SY 0*SIZE(ptrbb), yvec2;
+MUL_SY yvec0, yvec2, yvec6;
+ADD1_SY yvec6, yvec15, yvec15;
+
+BROAD_SY 1*SIZE(ptrbb), yvec3;
+VPERMILP_SY $0xb1, yvec0, yvec4;
+MUL_SY yvec4, yvec3, yvec6;
+ADD2_SY yvec6, yvec15, yvec15;
+
+#### Unroll 2 ####
+LD_SY 8*SIZE(ptrba), yvec0;
+BROAD_SY 2*SIZE(ptrbb), yvec2;
+MUL_SY yvec0, yvec2, yvec6;
+ADD1_SY yvec6, yvec15, yvec15;
+
+BROAD_SY 3*SIZE(ptrbb), yvec3;
+VPERMILP_SY $0xb1, yvec0, yvec4;
+MUL_SY yvec4, yvec3, yvec6;
+ADD2_SY yvec6, yvec15, yvec15;
+
+#### Unroll 3 ####
+LD_SY 16*SIZE(ptrba), yvec0;
+BROAD_SY 4*SIZE(ptrbb), yvec2;
+MUL_SY yvec0, yvec2, yvec6;
+ADD1_SY yvec6, yvec15, yvec15;
+
+BROAD_SY 5*SIZE(ptrbb), yvec3;
+VPERMILP_SY $0xb1, yvec0, yvec4;
+MUL_SY yvec4, yvec3, yvec6;
+ADD2_SY yvec6, yvec15, yvec15;
+
+#### Unroll 4 ####
+LD_SY 24*SIZE(ptrba), yvec0;
+BROAD_SY 6*SIZE(ptrbb), yvec2;
+MUL_SY yvec0, yvec2, yvec6;
+ADD1_SY yvec6, yvec15, yvec15;
+
+BROAD_SY 7*SIZE(ptrbb), yvec3;
+VPERMILP_SY $0xb1, yvec0, yvec4;
+MUL_SY yvec4, yvec3, yvec6;
+ADD2_SY yvec6, yvec15, yvec15;
+ADDQ $32*SIZE, ptrba;
+ADDQ $8*SIZE, ptrbb;
+DECQ k;
+JG .L321_bodyB;
+.align 32
+.L321_loopE:
+#ifndef TRMMKERNEL
+TEST $2, bk;
+#else
+TEST $2, kkk;
+#endif
+JLE .L322_loopE;
+.align 32
+.L322_bodyB:
+#### Unroll 1 ####
+LD_SY 0*SIZE(ptrba), yvec0;
+BROAD_SY 0*SIZE(ptrbb), yvec2;
+MUL_SY yvec0, yvec2, yvec6;
+ADD1_SY yvec6, yvec15, yvec15;
+
+BROAD_SY 1*SIZE(ptrbb), yvec3;
+VPERMILP_SY $0xb1, yvec0, yvec4;
+MUL_SY yvec4, yvec3, yvec6;
+ADD2_SY yvec6, yvec15, yvec15;
+
+#### Unroll 2 ####
+LD_SY 8*SIZE(ptrba), yvec0;
+BROAD_SY 2*SIZE(ptrbb), yvec2;
+MUL_SY yvec0, yvec2, yvec6;
+ADD1_SY yvec6, yvec15, yvec15;
+
+BROAD_SY 3*SIZE(ptrbb), yvec3;
+VPERMILP_SY $0xb1, yvec0, yvec4;
+MUL_SY yvec4, yvec3, yvec6;
+ADD2_SY yvec6, yvec15, yvec15;
+ADDQ $16*SIZE, ptrba;
+ADDQ $4*SIZE, ptrbb;
+
+.L322_loopE:
+#ifndef TRMMKERNEL
+TEST $1, bk;
+#else
+TEST $1, kkk;
+#endif
+JLE .L323_loopE;
+.align 32
+.L323_bodyB:
+#### Unroll 1 ####
+LD_SY 0*SIZE(ptrba), yvec0;
+BROAD_SY 0*SIZE(ptrbb), yvec2;
+MUL_SY yvec0, yvec2, yvec6;
+ADD1_SY yvec6, yvec15, yvec15;
+
+BROAD_SY 1*SIZE(ptrbb), yvec3;
+VPERMILP_SY $0xb1, yvec0, yvec4;
+MUL_SY yvec4, yvec3, yvec6;
+ADD2_SY yvec6, yvec15, yvec15;
+ADDQ $8*SIZE, ptrba;
+ADDQ $2*SIZE, ptrbb;
+
+.L323_loopE:
+#### Handle ####
+XOR_SY yvec7, yvec7, yvec7;
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ADDSUB_SY yvec15, yvec7, yvec15;
+#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
+SUB_SY yvec15, yvec7, yvec15;
+#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
+VPERMILP_SY $0xb1, yvec15, yvec15;
+ADDSUB_SY yvec15, yvec7, yvec15;
+VPERMILP_SY $0xb1, yvec15, yvec15;
+#endif
+##### Load Alpha ####
+BROAD_SY MEMALPHA_R,yvec7;
+BROAD_SY MEMALPHA_I,yvec6;
+##### Multiply Alpha ####
+VPERMILP_SY $0xb1,yvec15, yvec5;
+MUL_SY yvec15, yvec7, yvec15;
+MUL_SY yvec5, yvec6, yvec5;
+ADDSUB_SY yvec5, yvec15, yvec15;
+#### Writing back ####
+EXTRA_SY $1, yvec15, xvec7;
+#ifndef TRMMKERNEL
+LDL_SX 0*SIZE(C0), xvec0;
+LDH_SX 2*SIZE(C0), xvec0;
+LDL_SX 4*SIZE(C0), xvec1;
+LDH_SX 6*SIZE(C0), xvec1;
+ADD_SX xvec0, xvec15;
+ADD_SX xvec1, xvec7;
+#endif
+STL_SX xvec15, 0*SIZE(C0);
+STH_SX xvec15, 2*SIZE(C0);
+STL_SX xvec7, 4*SIZE(C0);
+STH_SX xvec7, 6*SIZE(C0);
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+SALQ $ZBASE_SHIFT, %rax;
+LEAQ (ptrba, %rax, 4), ptrba;
+ADDQ %rax, ptrbb;
+#endif
+#if defined(TRMMKERNEL) && defined(LEFT)
+ADDQ $4, kk;
+#endif
+ADDQ $8*SIZE, C0;
+
+.L32_loopE:
+TEST $2, bm;
+JLE .L33_loopE;
+.align 32
+.L33_bodyB:
+#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bb,ptrbb;
+#else
+MOVQ bb, ptrbb;
+MOVQ kk, %rax;
+SALQ $ZBASE_SHIFT, %rax;
+LEAQ (ptrba, %rax, 2), ptrba;
+ADDQ %rax, ptrbb;
+#endif
+XOR_SY yvec15, yvec15, yvec15;
+#ifndef TRMMKERNEL
+MOVQ bk,k;
+#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kk, %rax;
+MOVQ %rax, kkk;
+#else
+MOVQ kk, %rax;
+#ifdef LEFT
+ADDQ $2, %rax;
+#else
+ADDQ $1, %rax;
+#endif
+MOVQ %rax, kkk;
+#endif
+SARQ $2, k;
+JLE .L331_loopE;
+.align 32
+.L331_bodyB:
+#### Unroll 1 ####
+LD_SX 0*SIZE(ptrba), xvec0;
+BROAD_SX 0*SIZE(ptrbb), xvec2;
+MUL_SX xvec0, xvec2;
+ADD1_SX xvec2, xvec15;
+SHUF_SX $0xb1, xvec0, xvec1;
+BROAD_SX 1*SIZE(ptrbb), xvec3;
+MUL_SX xvec1, xvec3;
+ADD2_SX xvec3, xvec15;
+
+#### Unroll 2 ####
+LD_SX 4*SIZE(ptrba), xvec0;
+BROAD_SX 2*SIZE(ptrbb), xvec2;
+MUL_SX xvec0, xvec2;
+ADD1_SX xvec2, xvec15;
+SHUF_SX $0xb1, xvec0, xvec1;
+BROAD_SX 3*SIZE(ptrbb), xvec3;
+MUL_SX xvec1, xvec3;
+ADD2_SX xvec3, xvec15;
+
+#### Unroll 3 ####
+LD_SX 8*SIZE(ptrba), xvec0;
+BROAD_SX 4*SIZE(ptrbb), xvec2;
+MUL_SX xvec0, xvec2;
+ADD1_SX xvec2, xvec15;
+SHUF_SX $0xb1, xvec0, xvec1;
+BROAD_SX 5*SIZE(ptrbb), xvec3;
+MUL_SX xvec1, xvec3;
+ADD2_SX xvec3, xvec15;
+
+#### Unroll 4 ####
+LD_SX 12*SIZE(ptrba), xvec0;
+BROAD_SX 6*SIZE(ptrbb), xvec2;
+MUL_SX xvec0, xvec2;
+ADD1_SX xvec2, xvec15;
+SHUF_SX $0xb1, xvec0, xvec1;
+BROAD_SX 7*SIZE(ptrbb), xvec3;
+MUL_SX xvec1, xvec3;
+ADD2_SX xvec3, xvec15;
+ADDQ $16*SIZE, ptrba;
+ADDQ $8*SIZE, ptrbb;
+DECQ k;
+JG .L331_bodyB;
+.align 32
+.L331_loopE:
+#ifndef TRMMKERNEL
+TEST $2, bk;
+#else
+TEST $2, kkk;
+#endif
+JLE .L332_loopE;
+.align 32
+.L332_bodyB:
+#### Unroll 1 ####
+LD_SX 0*SIZE(ptrba), xvec0;
+BROAD_SX 0*SIZE(ptrbb), xvec2;
+MUL_SX xvec0, xvec2;
+ADD1_SX xvec2, xvec15;
+SHUF_SX $0xb1, xvec0, xvec1;
+BROAD_SX 1*SIZE(ptrbb), xvec3;
+MUL_SX xvec1, xvec3;
+ADD2_SX xvec3, xvec15;
+
+#### Unroll 2 ####
+LD_SX 4*SIZE(ptrba), xvec0;
+BROAD_SX 2*SIZE(ptrbb), xvec2;
+MUL_SX xvec0, xvec2;
+ADD1_SX xvec2, xvec15;
+SHUF_SX $0xb1, xvec0, xvec1;
+BROAD_SX 3*SIZE(ptrbb), xvec3;
+MUL_SX xvec1, xvec3;
+ADD2_SX xvec3, xvec15;
+ADDQ $8*SIZE, ptrba;
+ADDQ $4*SIZE, ptrbb;
+
+.L332_loopE:
+#ifndef TRMMKERNEL
+TEST $1, bk;
+#else
+TEST $1, kkk;
+#endif
+JLE .L333_loopE;
+.align 32
+.L333_bodyB:
+#### Unroll 1 ####
+LD_SX 0*SIZE(ptrba), xvec0;
+BROAD_SX 0*SIZE(ptrbb), xvec2;
+MUL_SX xvec0, xvec2;
+ADD1_SX xvec2, xvec15;
+SHUF_SX $0xb1, xvec0, xvec1;
+BROAD_SX 1*SIZE(ptrbb), xvec3;
+MUL_SX xvec1, xvec3;
+ADD2_SX xvec3, xvec15;
+ADDQ $4*SIZE, ptrba;
+ADDQ $2*SIZE, ptrbb;
+
+.L333_loopE:
+#### Handle ####
+XOR_SY yvec7, yvec7, yvec7;
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ADDSUB_SX xvec15, xvec7;
+MOV_SX xvec7, xvec15;
+#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
+SUB_SX xvec15, xvec7;
+MOV_SX xvec7, xvec15;
+#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
+SHUF_SX $0xb1, xvec15, xvec15;
+ADDSUB_SX xvec15, xvec7;
+MOV_SX xvec7, xvec15;
+SHUF_SX $0xb1, xvec15, xvec15;
+#endif
+#### Multiply Alpha ####
+BROAD_SX MEMALPHA_R, xvec7;
+BROAD_SX MEMALPHA_I, xvec6;
+#### Writing back ####
+VPERMILP_SX $0xb1,xvec15, xvec5;
+MUL_SX xvec7, xvec15;
+MUL_SX xvec6, xvec5;
+ADDSUB_SX xvec5, xvec15;
+#ifndef TRMMKERNEL
+LDL_SX 0*SIZE(C0), xvec0;
+LDH_SX 2*SIZE(C0), xvec0;
+ADD_SX xvec0, xvec15;
+#endif
+STL_SX xvec15, 0*SIZE(C0);
+STH_SX xvec15, 2*SIZE(C0);
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+SALQ $ZBASE_SHIFT, %rax;
+LEAQ (ptrba, %rax, 2), ptrba;
+ADDQ %rax, ptrbb;
+#endif
+#if defined(TRMMKERNEL) && defined(LEFT)
+ADDQ $2, kk;
+#endif
+ADDQ $4*SIZE, C0;
+
+.L33_loopE:
+TEST $1, bm;
+JLE .L34_loopE;
+.align 32
+.L34_bodyB:
+#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bb,ptrbb;
+#else
+MOVQ bb, ptrbb;
+MOVQ kk, %rax;
+SALQ $ZBASE_SHIFT, %rax;
+ADDQ %rax, ptrba;
+ADDQ %rax, ptrbb;
+#endif
+XOR_SY yvec15, yvec15, yvec15;
+#ifndef TRMMKERNEL
+MOVQ bk,k;
+#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kk, %rax;
+MOVQ %rax, kkk;
+#else
+MOVQ kk, %rax;
+#ifdef LEFT
+ADDQ $1, %rax;
+#else
+ADDQ $1, %rax;
+#endif
+MOVQ %rax, kkk;
+#endif
+SARQ $2, k;
+JLE .L341_loopE;
+.align 32
+.L341_bodyB:
+LD_SX 0*SIZE(ptrba), xvec0;
+LD_SX 0*SIZE(ptrbb), xvec2;
+SHUF_SX $0xa0, xvec2, xvec3;
+MUL_SX xvec0, xvec3;
+ADD1_SX xvec3, xvec15;
+SHUF_SX $0xb1, xvec0, xvec1;
+SHUF_SX $0xf5, xvec2, xvec4;
+MUL_SX xvec1, xvec4;
+ADD2_SX xvec4, xvec15;
+
+LD_SX 4*SIZE(ptrba), xvec0;
+LD_SX 4*SIZE(ptrbb), xvec2;
+SHUF_SX $0xa0, xvec2, xvec3;
+MUL_SX xvec0, xvec3;
+ADD1_SX xvec3, xvec15;
+SHUF_SX $0xb1, xvec0, xvec1;
+SHUF_SX $0xf5, xvec2, xvec4;
+MUL_SX xvec1, xvec4;
+ADD2_SX xvec4, xvec15;
+ADDQ $8*SIZE, ptrba;
+ADDQ $8*SIZE, ptrbb;
+DECQ k;
+JG .L341_bodyB;
+.align 32
+.L341_loopE:
+#ifndef TRMMKERNEL
+TEST $2, bk;
+#else
+TEST $2, kkk;
+#endif
+JLE .L342_loopE;
+.align 32
+.L342_bodyB:
+LD_SX 0*SIZE(ptrba), xvec0;
+LD_SX 0*SIZE(ptrbb), xvec2;
+SHUF_SX $0xa0, xvec2, xvec3;
+MUL_SX xvec0, xvec3;
+ADD1_SX xvec3, xvec15;
+SHUF_SX $0xb1, xvec0, xvec1;
+SHUF_SX $0xf5, xvec2, xvec4;
+MUL_SX xvec1, xvec4;
+ADD2_SX xvec4, xvec15;
+ADDQ $4*SIZE, ptrba;
+ADDQ $4*SIZE, ptrbb;
+
+.L342_loopE:
+#ifndef TRMMKERNEL
+TEST $1, bk;
+#else
+TEST $1, kkk;
+#endif
+JLE .L343_loopE;
+.align 32
+.L343_bodyB:
+XOR_SY yvec0, yvec0, yvec0;
+XOR_SY yvec2, yvec2, yvec2;
+LDL_SX 0*SIZE(ptrba), xvec0;
+LDL_SX 0*SIZE(ptrbb), xvec2;
+SHUF_SX $0xe0, xvec2, xvec3;
+MUL_SX xvec0, xvec3;
+ADD1_SX xvec3, xvec15;
+SHUF_SX $0xe1, xvec0, xvec1;
+SHUF_SX $0xe5, xvec2, xvec4;
+MUL_SX xvec1, xvec4;
+ADD2_SX xvec4, xvec15;
+ADDQ $2*SIZE, ptrba;
+ADDQ $2*SIZE, ptrbb;
+
+.L343_loopE:
+#### Handle ####
+XOR_SY yvec7, yvec7, yvec7;
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ADDSUB_SX xvec15, xvec7;
+MOV_SX xvec7, xvec15;
+#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
+SUB_SX xvec15, xvec7;
+MOV_SX xvec7, xvec15;
+#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
+SHUF_SX $0xb1, xvec15, xvec15;
+ADDSUB_SX xvec15, xvec7;
+MOV_SX xvec7, xvec15;
+SHUF_SX $0xb1, xvec15, xvec15;
+#endif
+BROAD_SX MEMALPHA_R, xvec7;
+BROAD_SX MEMALPHA_I, xvec6;
+VPERMILP_SX $0xb1, xvec15, xvec5;
+MUL_SX xvec7, xvec15;
+MUL_SX xvec6, xvec5;
+ADDSUB_SX xvec5, xvec15;
+SHUF_SX $0x44, xvec15, xvec14;
+SHUF_SX $0xee, xvec15, xvec13;
+ADD_SX xvec13, xvec14;
+#ifndef TRMMKERNEL
+LDL_SX 0*SIZE(C0), xvec0;
+ADD_SX xvec0, xvec14;
+#endif
+STL_SX xvec14, 0*SIZE(C0);
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+SALQ $ZBASE_SHIFT, %rax;
+ADDQ %rax, ptrba;
+ADDQ %rax, ptrbb;
+#endif
+#if defined(TRMMKERNEL) && defined(LEFT)
+ADDQ $1, kk;
+#endif
+ADDQ $2*SIZE, C0;
+
+.L34_loopE:
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ADDQ $1, kk;
+#endif
+
+MOVQ bk, k;
+SALQ $3, k;
+ADDQ k, bb;
+ADDQ ldc, C;
+.L30_loopE:
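+# Epilogue: restore the callee-saved registers spilled in the prologue (plus
+# rdi/rsi and xmm6-xmm15 under the Windows ABI), release the stack frame and return.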
+movq 0(%rsp), %rbx;
+movq 8(%rsp), %rbp;
+movq 16(%rsp), %r12;
+movq 24(%rsp), %r13;
+movq 32(%rsp), %r14;
+movq 40(%rsp), %r15;
+
+#ifdef WINDOWS_ABI
+ movq 48(%rsp), %rdi
+ movq 56(%rsp), %rsi
+ movups 64(%rsp), %xmm6
+ movups 80(%rsp), %xmm7
+ movups 96(%rsp), %xmm8
+ movups 112(%rsp), %xmm9
+ movups 128(%rsp), %xmm10
+ movups 144(%rsp), %xmm11
+ movups 160(%rsp), %xmm12
+ movups 176(%rsp), %xmm13
+ movups 192(%rsp), %xmm14
+ movups 208(%rsp), %xmm15
+#endif
+
+addq $STACKSIZE, %rsp;
+ret
+
+EPILOGUE
--- /dev/null
+/*****************************************************************************
+ Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the ISCAS nor the names of its contributors may
+be used to endorse or promote products derived from this software
+without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ **********************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define old_bm %rdi
+#define old_bn %rsi
+#define old_bk %rdx
+
+#define bm %r13
+#define bn %r14
+#define bk %r15
+
+#define ALPHA %xmm0
+#define ba %rcx
+#define bb %r8
+#define C %r9
+#define ldc %r10
+
+#define i %r11
+#define k %rax
+
+#define ptrba %rdi
+#define ptrbb %rsi
+#define C0 %rbx
+#define C1 %rbp
+
+#define prebb %r12
+
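+# Stack-frame layout: incoming stack arguments (ldc, offset and, on Windows, the
+# A/B/C pointers) are addressed above the frame, while alpha, the column counter j
+# and the TRMM counters OFFSET/kk/kkk live in scratch slots inside it; the frame
+# size and slot offsets differ between the SysV and Windows ABIs.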
+#ifndef WINDOWS_ABI
+
+#define STACKSIZE 128
+
+#define old_ldc 8+STACKSIZE(%rsp)
+#define old_offset 16+STACKSIZE(%rsp)
+#define MEMALPHA 48(%rsp)
+#define j 56(%rsp)
+#define OFFSET 64(%rsp)
+#define kk 72(%rsp)
+#define kkk 80(%rsp)
+
+#else
+
+#define STACKSIZE 512
+
+#define OLD_A 40 + STACKSIZE(%rsp)
+#define OLD_B 48 + STACKSIZE(%rsp)
+#define OLD_C 56 + STACKSIZE(%rsp)
+#define old_ldc 64 + STACKSIZE(%rsp)
+#define old_offset 72 + STACKSIZE(%rsp)
+
+#define MEMALPHA 224(%rsp)
+#define j 232(%rsp)
+#define OFFSET 240(%rsp)
+#define kk 248(%rsp)
+#define kkk 256(%rsp)
+
+#endif
+
+#define PREFETCH0 prefetcht0
+#define PREFETCH1 prefetcht0
+#define PREFETCH2 prefetcht2
+
+#define xvec0 %xmm0
+#define xvec1 %xmm1
+#define xvec2 %xmm2
+#define xvec3 %xmm3
+#define xvec4 %xmm4
+#define xvec5 %xmm5
+#define xvec6 %xmm6
+#define xvec7 %xmm7
+#define xvec8 %xmm8
+#define xvec9 %xmm9
+#define xvec10 %xmm10
+#define xvec11 %xmm11
+#define xvec12 %xmm12
+#define xvec13 %xmm13
+#define xvec14 %xmm14
+#define xvec15 %xmm15
+
+#define yvec0 %ymm0
+#define yvec1 %ymm1
+#define yvec2 %ymm2
+#define yvec3 %ymm3
+#define yvec4 %ymm4
+#define yvec5 %ymm5
+#define yvec6 %ymm6
+#define yvec7 %ymm7
+#define yvec8 %ymm8
+#define yvec9 %ymm9
+#define yvec10 %ymm10
+#define yvec11 %ymm11
+#define yvec12 %ymm12
+#define yvec13 %ymm13
+#define yvec14 %ymm14
+#define yvec15 %ymm15
+
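+# Mnemonic aliases: plain integer ops first, then _SX/_DX names that expand to
+# two-operand SSE instructions on xmm registers and _SY/_DY names that expand to
+# three-operand AVX instructions on ymm registers, so the kernel body can be
+# written with size/precision-neutral names.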
+#define LEAQ leaq
+#define ADDQ addq
+#define MULQ imulq
+#define SARQ sarq
+#define SALQ salq
+#define ANDQ andq
+#define SUBQ subq
+#define DECQ decq
+#define JG jg
+#define JLE jle
+#define TEST testq
+#define OR orq
+#define JNE jne
+#define NOP
+#define XOR xorpd
+#define MOVQ movq
+
+#define XOR_SY vxorps
+#define XOR_DY vxorpd
+#define XOR_SX xorps
+#define XOR_DX xorpd
+
+#define LD_SY vmovaps
+#define LD_DY vmovapd
+#define LD_SX movaps
+#define LD_DX movapd
+#define LDL_DX movlpd
+#define LDL_DY vmovlpd
+#define LDH_DX movhpd
+#define LDH_DY vmovhpd
+
+#define ST_SY vmovaps
+#define ST_DY vmovapd
+#define ST_SX movaps
+#define ST_DX movapd
+#define STL_DX movlpd
+#define STH_DX movhpd
+
+#define EDUP_SY vmovsldup
+#define ODUP_SY vmovshdup
+#define EDUP_DY vmovddup
+
+#define ADD_SY vaddps
+#define ADD_DY vaddpd
+#define ADD_SX addps
+#define ADD_DX addpd
+
+#define ADD1_DY vaddpd
+#define ADD2_DY vaddpd
+#define ADDSUB_DY vaddsubpd
+#define ADDSUB_SY vaddsubps
+
+#define MUL_SY vmulps
+#define MUL_DY vmulpd
+#define MUL_SX mulps
+#define MUL_DX mulpd
+
+#define SHUF_SY vperm2f128
+#define SHUF_DY vperm2f128
+#define SHUF_DX pshufd
+
+#define VPERMILP_SY vpermilps
+#define VPERMILP_DY vpermilpd
+
+#define BROAD_SY vbroadcastss
+#define BROAD_DY vbroadcastsd
+#define BROAD_SX
+#define BROAD_DX movddup
+
+#define MOV_SY vmovaps
+#define MOV_DY vmovapd
+#define MOV_SX movaps
+#define MOV_DX movapd
+
+#define REVS_SY vshufps
+#define REVS_DY vshufpd
+#define REVS_SX shufps
+#define REVS_DX movsd
+
+#define EXTRA_SY vextractf128
+#define EXTRA_DY vextractf128
+
+PROLOGUE
+
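+# Prologue: reserve a fixed stack frame, spill the callee-saved GPRs (plus rdi/rsi
+# and xmm6-xmm15 under the Windows ABI) and pick up the arguments that arrive on
+# the stack.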
+subq $STACKSIZE, %rsp;
+movq %rbx, 0(%rsp);
+movq %rbp, 8(%rsp);
+movq %r12, 16(%rsp);
+movq %r13, 24(%rsp);
+movq %r14, 32(%rsp);
+movq %r15, 40(%rsp);
+
+#ifdef WINDOWS_ABI
+ movq %rdi, 48(%rsp)
+ movq %rsi, 56(%rsp)
+ movups %xmm6, 64(%rsp)
+ movups %xmm7, 80(%rsp)
+ movups %xmm8, 96(%rsp)
+ movups %xmm9, 112(%rsp)
+ movups %xmm10, 128(%rsp)
+ movups %xmm11, 144(%rsp)
+ movups %xmm12, 160(%rsp)
+ movups %xmm13, 176(%rsp)
+ movups %xmm14, 192(%rsp)
+ movups %xmm15, 208(%rsp)
+
+ movq ARG1, old_bm
+ movq ARG2, old_bn
+ movq ARG3, old_bk
+ movq OLD_A, ba
+ movq OLD_B, bb
+ movq OLD_C, C
+ movq old_ldc, ldc
+#ifdef TRMMKERNEL
+ movq old_offset, %r11
+#endif
+#else
+
+movq old_ldc, ldc
+#ifdef TRMMKERNEL
+movq old_offset, %r11
+#endif
+#endif
+
+vmovlps ALPHA, MEMALPHA
+movq old_bm, bm
+movq old_bn, bn
+movq old_bk, bk
+leaq (, ldc, SIZE), ldc
+#ifdef TRMMKERNEL
+movq %r11, OFFSET
+#ifndef LEFT
+negq %r11;
+#endif
+movq %r11, kk
+#endif
+
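+# Outer loop over 4-column panels of B: j = bn >> 2.  C0/C1 point at the first and
+# third columns of the current panel and prebb runs ahead of the packed B block
+# for prefetching.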
+MOVQ bn,j;
+SARQ $2,j; # Rn = 4
+JLE .L0_loopE;
+.align 32;
+.L0_bodyB:;
+#if defined(TRMMKERNEL) && defined(LEFT)
+MOVQ OFFSET, %rax;
+MOVQ %rax, kk;
+#endif
+
+MOVQ C,C0;
+LEAQ (C,ldc,2),C1;
+MOVQ bk, k;
+SALQ $5, k;
+LEAQ (bb, k, 1), prebb;
+MOVQ ba,ptrba;
+MOVQ bm,i;
+SARQ $3,i; # Rm = 8
+JLE .L1_loopE;
+.align 32;
+.L1_bodyB:;
+#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bb, ptrbb;
+#else
+MOVQ bb, ptrbb;
+MOVQ kk, %rax;
+LEAQ (, %rax, SIZE), %rax;
+LEAQ (ptrba, %rax, 8), ptrba;
+LEAQ (ptrbb, %rax, 4), ptrbb;
+#endif
+#### Initial Results Register ####
+PREFETCH2 0*SIZE(prebb);
+XOR_DY yvec15, yvec15, yvec15;
+PREFETCH2 8*SIZE(prebb);
+XOR_DY yvec14, yvec14, yvec14;
+XOR_DY yvec13, yvec13, yvec13;
+ADDQ $16*SIZE, prebb
+XOR_DY yvec12, yvec12, yvec12;
+PREFETCH0 3*SIZE(C0)
+LD_DY 0*SIZE(ptrbb), yvec2;
+PREFETCH0 3*SIZE(C0, ldc, 1)
+XOR_DY yvec11, yvec11, yvec11;
+PREFETCH0 3*SIZE(C1)
+XOR_DY yvec10, yvec10, yvec10;
+PREFETCH0 3*SIZE(C1, ldc, 1)
+LD_DY 0*SIZE(ptrba), yvec0;
+XOR_DY yvec9, yvec9, yvec9;
+XOR_DY yvec8, yvec8, yvec8;
+VPERMILP_DY $0x05, yvec2, yvec3;
+#ifndef TRMMKERNEL
+MOVQ bk,k;
+#elif (defined(LEFT) && !defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kk, %rax;
+MOVQ %rax, kkk;
+#else
+MOVQ kk, %rax;
+#ifdef LEFT
+ADDQ $8, %rax;
+#else
+ADDQ $4, %rax;
+#endif
+MOVQ %rax, kkk;
+#endif
+SARQ $2,k;
+JLE .L2_loopE;
+.align 32;
+.L2_bodyB:;
+# Computing kernel
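+# Main microkernel body: each pass consumes four k iterations.  Per iteration,
+# eight doubles of packed A (two ymm loads) are multiplied against rotated copies
+# of four B values (VPERMILP_DY / SHUF_DY), updating the eight accumulators
+# yvec8-yvec15, while PREFETCH0 pulls upcoming A data into the cache.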
+
+#### Unroll times 1 ####
+LD_DY 4*SIZE(ptrba), yvec1;
+MUL_DY yvec0, yvec2, yvec6;
+SHUF_DY $0x03, yvec2, yvec2, yvec4;
+MUL_DY yvec0, yvec3, yvec7;
+SHUF_DY $0x03, yvec3, yvec3, yvec5;
+ADD_DY yvec15, yvec6, yvec15;
+ADD_DY yvec13, yvec7, yvec13;
+
+PREFETCH0 64*SIZE(ptrba)
+MUL_DY yvec1, yvec2, yvec6;
+LD_DY 4*SIZE(ptrbb), yvec2;
+MUL_DY yvec1, yvec3, yvec7;
+VPERMILP_DY $0x05, yvec2, yvec3;
+ADD_DY yvec14, yvec6, yvec14;
+ADD_DY yvec12, yvec7, yvec12;
+
+MUL_DY yvec0, yvec4, yvec6;
+MUL_DY yvec0, yvec5, yvec7;
+LD_DY 8*SIZE(ptrba), yvec0;
+ADD_DY yvec11, yvec6, yvec11;
+ADD_DY yvec9, yvec7, yvec9;
+
+MUL_DY yvec1, yvec4, yvec6;
+MUL_DY yvec1, yvec5, yvec7;
+ADD_DY yvec10, yvec6, yvec10;
+ADD_DY yvec8, yvec7, yvec8;
+
+#### Unroll times 2 ####
+LD_DY 12*SIZE(ptrba), yvec1;
+MUL_DY yvec0, yvec2, yvec6;
+SHUF_DY $0x03, yvec2, yvec2, yvec4;
+MUL_DY yvec0, yvec3, yvec7;
+SHUF_DY $0x03, yvec3, yvec3, yvec5;
+ADD_DY yvec15, yvec6, yvec15;
+ADD_DY yvec13, yvec7, yvec13;
+
+PREFETCH0 72*SIZE(ptrba)
+MUL_DY yvec1, yvec2, yvec6;
+LD_DY 8*SIZE(ptrbb), yvec2;
+MUL_DY yvec1, yvec3, yvec7;
+VPERMILP_DY $0x05, yvec2, yvec3;
+ADD_DY yvec14, yvec6, yvec14;
+ADD_DY yvec12, yvec7, yvec12;
+
+MUL_DY yvec0, yvec4, yvec6;
+MUL_DY yvec0, yvec5, yvec7;
+LD_DY 16*SIZE(ptrba), yvec0;
+ADD_DY yvec11, yvec6, yvec11;
+ADD_DY yvec9, yvec7, yvec9;
+
+MUL_DY yvec1, yvec4, yvec6;
+MUL_DY yvec1, yvec5, yvec7;
+ADD_DY yvec10, yvec6, yvec10;
+ADD_DY yvec8, yvec7, yvec8;
+
+#### Unroll times 3 ####
+LD_DY 20*SIZE(ptrba), yvec1;
+MUL_DY yvec0, yvec2, yvec6;
+SHUF_DY $0x03, yvec2, yvec2, yvec4;
+MUL_DY yvec0, yvec3, yvec7;
+SHUF_DY $0x03, yvec3, yvec3, yvec5;
+ADD_DY yvec15, yvec6, yvec15;
+ADD_DY yvec13, yvec7, yvec13;
+
+PREFETCH0 80*SIZE(ptrba)
+MUL_DY yvec1, yvec2, yvec6;
+LD_DY 12*SIZE(ptrbb), yvec2;
+ADDQ $16*SIZE, ptrbb;
+MUL_DY yvec1, yvec3, yvec7;
+VPERMILP_DY $0x05, yvec2, yvec3;
+ADD_DY yvec14, yvec6, yvec14;
+ADD_DY yvec12, yvec7, yvec12;
+
+MUL_DY yvec0, yvec4, yvec6;
+MUL_DY yvec0, yvec5, yvec7;
+LD_DY 24*SIZE(ptrba), yvec0;
+ADD_DY yvec11, yvec6, yvec11;
+ADD_DY yvec9, yvec7, yvec9;
+
+MUL_DY yvec1, yvec4, yvec6;
+MUL_DY yvec1, yvec5, yvec7;
+ADD_DY yvec10, yvec6, yvec10;
+ADD_DY yvec8, yvec7, yvec8;
+
+#### Unroll times 4 ####
+LD_DY 28*SIZE(ptrba), yvec1;
+MUL_DY yvec0, yvec2, yvec6;
+SHUF_DY $0x03, yvec2, yvec2, yvec4;
+MUL_DY yvec0, yvec3, yvec7;
+SHUF_DY $0x03, yvec3, yvec3, yvec5;
+ADDQ $32*SIZE, ptrba;
+ADD_DY yvec15, yvec6, yvec15;
+ADD_DY yvec13, yvec7, yvec13;
+
+PREFETCH0 88*SIZE(ptrba)
+MUL_DY yvec1, yvec2, yvec6;
+LD_DY 0*SIZE(ptrbb), yvec2;
+MUL_DY yvec1, yvec3, yvec7;
+VPERMILP_DY $0x05, yvec2, yvec3;
+ADD_DY yvec14, yvec6, yvec14;
+ADD_DY yvec12, yvec7, yvec12;
+
+MUL_DY yvec0, yvec4, yvec6;
+MUL_DY yvec0, yvec5, yvec7;
+LD_DY 0*SIZE(ptrba), yvec0;
+ADD_DY yvec11, yvec6, yvec11;
+ADD_DY yvec9, yvec7, yvec9;
+
+MUL_DY yvec1, yvec4, yvec6;
+MUL_DY yvec1, yvec5, yvec7;
+ADD_DY yvec10, yvec6, yvec10;
+ADD_DY yvec8, yvec7, yvec8;
+.L2_bodyE:;
+DECQ k;
+JG .L2_bodyB;
+.align 64;
+.L2_loopE:;
+PREFETCH2 0*SIZE(prebb);
+ADDQ $8*SIZE, prebb;
+#ifndef TRMMKERNEL
+TEST $2, bk;
+#else
+MOVQ kkk, %rax;
+TEST $2, %rax;
+#endif
+JLE .L3_loopE;
+.align 64
+.L3_bodyB:
+#### Unroll times 1 ####
+PREFETCH0 64*SIZE(ptrba)
+LD_DY 4*SIZE(ptrba), yvec1;
+MUL_DY yvec0, yvec2, yvec6;
+SHUF_DY $0x03, yvec2, yvec2, yvec4;
+MUL_DY yvec0, yvec3, yvec7;
+SHUF_DY $0x03, yvec3, yvec3, yvec5;
+ADD_DY yvec15, yvec6, yvec15;
+ADD_DY yvec13, yvec7, yvec13;
+
+MUL_DY yvec1, yvec2, yvec6;
+LD_DY 4*SIZE(ptrbb), yvec2;
+ADDQ $8*SIZE, ptrbb;
+MUL_DY yvec1, yvec3, yvec7;
+VPERMILP_DY $0x05, yvec2, yvec3;
+ADD_DY yvec14, yvec6, yvec14;
+ADD_DY yvec12, yvec7, yvec12;
+
+MUL_DY yvec0, yvec4, yvec6;
+MUL_DY yvec0, yvec5, yvec7;
+LD_DY 8*SIZE(ptrba), yvec0;
+ADD_DY yvec11, yvec6, yvec11;
+ADD_DY yvec9, yvec7, yvec9;
+
+MUL_DY yvec1, yvec4, yvec6;
+MUL_DY yvec1, yvec5, yvec7;
+ADD_DY yvec10, yvec6, yvec10;
+ADD_DY yvec8, yvec7, yvec8;
+
+#### Unroll times 2 ####
+PREFETCH0 72*SIZE(ptrba)
+LD_DY 12*SIZE(ptrba), yvec1;
+MUL_DY yvec0, yvec2, yvec6;
+SHUF_DY $0x03, yvec2, yvec2, yvec4;
+MUL_DY yvec0, yvec3, yvec7;
+SHUF_DY $0x03, yvec3, yvec3, yvec5;
+ADDQ $16*SIZE, ptrba;
+ADD_DY yvec15, yvec6, yvec15;
+ADD_DY yvec13, yvec7, yvec13;
+
+MUL_DY yvec1, yvec2, yvec6;
+LD_DY 0*SIZE(ptrbb), yvec2;
+MUL_DY yvec1, yvec3, yvec7;
+VPERMILP_DY $0x05, yvec2, yvec3;
+ADD_DY yvec14, yvec6, yvec14;
+ADD_DY yvec12, yvec7, yvec12;
+
+MUL_DY yvec0, yvec4, yvec6;
+MUL_DY yvec0, yvec5, yvec7;
+LD_DY 0*SIZE(ptrba), yvec0;
+ADD_DY yvec11, yvec6, yvec11;
+ADD_DY yvec9, yvec7, yvec9;
+
+MUL_DY yvec1, yvec4, yvec6;
+MUL_DY yvec1, yvec5, yvec7;
+ADD_DY yvec10, yvec6, yvec10;
+ADD_DY yvec8, yvec7, yvec8;
+
+.L3_loopE:
+PREFETCH2 0*SIZE(prebb);
+ADDQ $8*SIZE, prebb
+#ifndef TRMMKERNEL
+TEST $1, bk;
+#else
+MOVQ kkk, %rax;
+TEST $1, %rax;
+#endif
+JLE .L4_loopE;
+.align 64
+.L4_bodyB:;
+#### Unroll times 1 ####
+PREFETCH0 64*SIZE(ptrba)
+LD_DY 4*SIZE(ptrba), yvec1;
+MUL_DY yvec0, yvec2, yvec6;
+SHUF_DY $0x03, yvec2, yvec2, yvec4;
+MUL_DY yvec0, yvec3, yvec7;
+SHUF_DY $0x03, yvec3, yvec3, yvec5;
+ADDQ $8*SIZE, ptrba;
+ADD_DY yvec15, yvec6, yvec15;
+ADD_DY yvec13, yvec7, yvec13;
+
+MUL_DY yvec1, yvec2, yvec6;
+MUL_DY yvec1, yvec3, yvec7;
+ADDQ $4*SIZE, ptrbb;
+ADD_DY yvec14, yvec6, yvec14;
+ADD_DY yvec12, yvec7, yvec12;
+
+MUL_DY yvec0, yvec4, yvec6;
+MUL_DY yvec0, yvec5, yvec7;
+ADD_DY yvec11, yvec6, yvec11;
+ADD_DY yvec9, yvec7, yvec9;
+
+MUL_DY yvec1, yvec4, yvec6;
+MUL_DY yvec1, yvec5, yvec7;
+ADD_DY yvec10, yvec6, yvec10;
+ADD_DY yvec8, yvec7, yvec8;
+
+.L4_loopE:;
+#### Load Alpha ####
+BROAD_DY MEMALPHA,yvec7;
+#### Multiply Alpha ####
+MUL_DY yvec7,yvec15,yvec15;
+MUL_DY yvec7,yvec14,yvec14;
+MUL_DY yvec7,yvec13,yvec13;
+MUL_DY yvec7,yvec12,yvec12;
+MUL_DY yvec7,yvec11,yvec11;
+MUL_DY yvec7,yvec10,yvec10;
+MUL_DY yvec7,yvec9,yvec9;
+MUL_DY yvec7,yvec8,yvec8;
+#### Reverse the Results ####
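+# Re-permute the accumulators: each REVS_DY (vshufpd $0x0a) pair swaps the
+# odd-indexed elements between yvec15/yvec13, yvec14/yvec12, yvec11/yvec9 and
+# yvec10/yvec8 so that every register lines up with the C locations stored below.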
+MOV_DY yvec15,yvec7;
+REVS_DY $0x0a,yvec13,yvec15,yvec15;
+REVS_DY $0x0a,yvec7,yvec13,yvec13;
+MOV_DY yvec14,yvec7;
+REVS_DY $0x0a,yvec12,yvec14,yvec14;
+REVS_DY $0x0a,yvec7,yvec12,yvec12;
+MOV_DY yvec11,yvec7;
+REVS_DY $0x0a,yvec9,yvec11,yvec11;
+REVS_DY $0x0a,yvec7,yvec9,yvec9;
+MOV_DY yvec10,yvec7;
+REVS_DY $0x0a,yvec8,yvec10,yvec10;
+REVS_DY $0x0a,yvec7,yvec8,yvec8;
+#### Testing alignment ####
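+# When either C0 or ldc is not 16-byte aligned, branch to .L4_loopEx, which writes
+# C back with 64-bit low/high moves instead of the aligned 128-bit stores used on
+# the fast path.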
+MOVQ C0, %rax;
+OR ldc, %rax;
+TEST $15, %rax;
+JNE .L4_loopEx; # Unaligned write-back path
+.align 32
+#### Writing Back ####
+EXTRA_DY $1,yvec15,xvec7;
+EXTRA_DY $1,yvec14,xvec6;
+EXTRA_DY $1,yvec13,xvec5;
+EXTRA_DY $1,yvec12,xvec4;
+EXTRA_DY $1,yvec11,xvec3;
+EXTRA_DY $1,yvec10,xvec2;
+EXTRA_DY $1,yvec9,xvec1;
+EXTRA_DY $1,yvec8,xvec0;
+#ifndef TRMMKERNEL
+ADD_DY 0*SIZE(C0),xvec15,xvec15;
+ADD_DY 2*SIZE(C1),xvec7,xvec7;
+ADD_DY 4*SIZE(C0),xvec14,xvec14;
+ADD_DY 6*SIZE(C1),xvec6,xvec6;
+ADD_DY 0*SIZE(C0,ldc,1),xvec13,xvec13;
+ADD_DY 2*SIZE(C1,ldc,1),xvec5,xvec5;
+ADD_DY 4*SIZE(C0,ldc,1),xvec12,xvec12;
+ADD_DY 6*SIZE(C1,ldc,1),xvec4,xvec4;
+ADD_DY 0*SIZE(C1),xvec11,xvec11;
+ADD_DY 2*SIZE(C0),xvec3,xvec3;
+ADD_DY 4*SIZE(C1),xvec10,xvec10;
+ADD_DY 6*SIZE(C0),xvec2,xvec2;
+ADD_DY 0*SIZE(C1,ldc,1),xvec9,xvec9;
+ADD_DY 2*SIZE(C0,ldc,1),xvec1,xvec1;
+ADD_DY 4*SIZE(C1,ldc,1),xvec8,xvec8;
+ADD_DY 6*SIZE(C0,ldc,1),xvec0,xvec0;
+#endif
+ST_DY xvec15, 0*SIZE(C0);
+ST_DY xvec7, 2*SIZE(C1);
+ST_DY xvec14, 4*SIZE(C0);
+ST_DY xvec6, 6*SIZE(C1);
+ST_DY xvec13, 0*SIZE(C0,ldc,1);
+ST_DY xvec5, 2*SIZE(C1,ldc,1);
+ST_DY xvec12, 4*SIZE(C0,ldc,1);
+ST_DY xvec4, 6*SIZE(C1,ldc,1);
+ST_DY xvec11, 0*SIZE(C1);
+ST_DY xvec3, 2*SIZE(C0);
+ST_DY xvec10, 4*SIZE(C1);
+ST_DY xvec2, 6*SIZE(C0);
+ST_DY xvec9, 0*SIZE(C1,ldc,1);
+ST_DY xvec1, 2*SIZE(C0,ldc,1);
+ST_DY xvec8, 4*SIZE(C1,ldc,1);
+ST_DY xvec0, 6*SIZE(C0,ldc,1);
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+LEAQ (, %rax, SIZE), %rax;
+LEAQ (ptrba, %rax, 8), ptrba;
+LEAQ (ptrbb, %rax, 4), ptrbb;
+#endif
+#if defined(TRMMKERNEL) && defined(LEFT)
+ADDQ $8, kk
+#endif
+ADDQ $8*SIZE,C0;
+ADDQ $8*SIZE,C1;
+.L1_bodyE:;
+DECQ i;
+JG .L1_bodyB;
+JMP .L1_loopE;
+.align 32;
+.L4_loopEx:;
+EXTRA_DY $1, yvec15, xvec7;
+#ifndef TRMMKERNEL
+LDL_DY 0*SIZE(C0), xvec6, xvec6;
+LDH_DY 1*SIZE(C0), xvec6, xvec6;
+ADD_DY xvec6, xvec15, xvec15;
+LDL_DY 2*SIZE(C1), xvec5, xvec5;
+LDH_DY 3*SIZE(C1), xvec5, xvec5;
+ADD_DY xvec5, xvec7, xvec7;
+#endif
+STL_DX xvec15, 0*SIZE(C0);
+STH_DX xvec15, 1*SIZE(C0);
+STL_DX xvec7, 2*SIZE(C1);
+STH_DX xvec7, 3*SIZE(C1);
+
+EXTRA_DY $1, yvec14, xvec4;
+#ifndef TRMMKERNEL
+LDL_DY 4*SIZE(C0), xvec3, xvec3;
+LDH_DY 5*SIZE(C0), xvec3, xvec3;
+ADD_DY xvec3, xvec14, xvec14;
+LDL_DY 6*SIZE(C1), xvec2, xvec2;
+LDH_DY 7*SIZE(C1), xvec2, xvec2;
+ADD_DY xvec2, xvec4, xvec4;
+#endif
+STL_DX xvec14, 4*SIZE(C0);
+STH_DX xvec14, 5*SIZE(C0);
+STL_DX xvec4, 6*SIZE(C1);
+STH_DX xvec4, 7*SIZE(C1);
+
+EXTRA_DY $1, yvec13, xvec7;
+#ifndef TRMMKERNEL
+LDL_DY 0*SIZE(C0, ldc, 1), xvec6, xvec6;
+LDH_DY 1*SIZE(C0, ldc, 1), xvec6, xvec6;
+ADD_DY xvec6, xvec13, xvec13;
+LDL_DY 2*SIZE(C1, ldc, 1), xvec5, xvec5;
+LDH_DY 3*SIZE(C1, ldc, 1), xvec5, xvec5;
+ADD_DY xvec5, xvec7, xvec7;
+#endif
+STL_DX xvec13, 0*SIZE(C0, ldc, 1);
+STH_DX xvec13, 1*SIZE(C0, ldc, 1);
+STL_DX xvec7, 2*SIZE(C1, ldc, 1);
+STH_DX xvec7, 3*SIZE(C1, ldc, 1);
+
+EXTRA_DY $1, yvec12, xvec4;
+#ifndef TRMMKERNEL
+LDL_DY 4*SIZE(C0, ldc, 1), xvec3, xvec3;
+LDH_DY 5*SIZE(C0, ldc, 1), xvec3, xvec3;
+ADD_DY xvec3, xvec12, xvec12;
+LDL_DY 6*SIZE(C1, ldc, 1), xvec2, xvec2;
+LDH_DY 7*SIZE(C1, ldc, 1), xvec2, xvec2;
+ADD_DY xvec2, xvec4, xvec4;
+#endif
+STL_DX xvec12, 4*SIZE(C0, ldc, 1);
+STH_DX xvec12, 5*SIZE(C0, ldc ,1);
+STL_DX xvec4, 6*SIZE(C1, ldc, 1);
+STH_DX xvec4, 7*SIZE(C1, ldc, 1);
+
+EXTRA_DY $1, yvec11, xvec7;
+#ifndef TRMMKERNEL
+LDL_DY 0*SIZE(C1), xvec6, xvec6;
+LDH_DY 1*SIZE(C1), xvec6, xvec6;
+ADD_DY xvec6, xvec11, xvec11;
+LDL_DY 2*SIZE(C0), xvec5, xvec5;
+LDH_DY 3*SIZE(C0), xvec5, xvec5;
+ADD_DY xvec5, xvec7, xvec7;
+#endif
+STL_DX xvec11, 0*SIZE(C1);
+STH_DX xvec11, 1*SIZE(C1);
+STL_DX xvec7, 2*SIZE(C0);
+STH_DX xvec7, 3*SIZE(C0);
+
+EXTRA_DY $1, yvec10, xvec4;
+#ifndef TRMMKERNEL
+LDL_DY 4*SIZE(C1), xvec3, xvec3;
+LDH_DY 5*SIZE(C1), xvec3, xvec3;
+ADD_DY xvec3, xvec10, xvec10;
+LDL_DY 6*SIZE(C0), xvec2, xvec2;
+LDH_DY 7*SIZE(C0), xvec2, xvec2;
+ADD_DY xvec2, xvec4, xvec4;
+#endif
+STL_DX xvec10, 4*SIZE(C1);
+STH_DX xvec10, 5*SIZE(C1);
+STL_DX xvec4, 6*SIZE(C0);
+STH_DX xvec4, 7*SIZE(C0);
+
+EXTRA_DY $1, yvec9, xvec7;
+#ifndef TRMMKERNEL
+LDL_DY 0*SIZE(C1, ldc, 1), xvec6, xvec6;
+LDH_DY 1*SIZE(C1, ldc, 1), xvec6, xvec6;
+ADD_DY xvec6, xvec9, xvec9;
+LDL_DY 2*SIZE(C0, ldc, 1), xvec5, xvec5;
+LDH_DY 3*SIZE(C0, ldc ,1), xvec5, xvec5;
+ADD_DY xvec5, xvec7, xvec7;
+#endif
+STL_DX xvec9, 0*SIZE(C1, ldc, 1);
+STH_DX xvec9, 1*SIZE(C1, ldc, 1);
+STL_DX xvec7, 2*SIZE(C0, ldc, 1);
+STH_DX xvec7, 3*SIZE(C0, ldc, 1);
+
+EXTRA_DY $1, yvec8, xvec4;
+#ifndef TRMMKERNEL
+LDL_DY 4*SIZE(C1, ldc, 1), xvec3, xvec3;
+LDH_DY 5*SIZE(C1, ldc, 1), xvec3, xvec3;
+ADD_DY xvec3, xvec8, xvec8;
+LDL_DY 6*SIZE(C0, ldc, 1), xvec2, xvec2;
+LDH_DY 7*SIZE(C0, ldc, 1), xvec2, xvec2;
+ADD_DY xvec2, xvec4, xvec4;
+#endif
+STL_DX xvec8, 4*SIZE(C1, ldc, 1);
+STH_DX xvec8, 5*SIZE(C1, ldc, 1);
+STL_DX xvec4, 6*SIZE(C0, ldc, 1);
+STH_DX xvec4, 7*SIZE(C0, ldc, 1);
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+LEAQ (, %rax, SIZE), %rax;
+LEAQ (ptrba, %rax, 8), ptrba;
+LEAQ (ptrbb, %rax, 4), ptrbb;
+#endif
+#if defined(TRMMKERNEL) && defined(LEFT)
+ADDQ $8, kk
+#endif
+
+ADDQ $8*SIZE, C0;
+ADDQ $8*SIZE, C1;
+DECQ i;
+JG .L1_bodyB;
+.align 32
+.L1_loopE:;
+TEST $4, bm; # Rm = 4
+JLE .L5_loopE;
+.align 32
+.L5_bodyB:;
+#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bb, ptrbb;
+#else
+MOVQ bb, ptrbb;
+MOVQ kk, %rax;
+LEAQ (, %rax, SIZE), %rax;
+LEAQ (ptrba, %rax, 4), ptrba;
+LEAQ (ptrbb, %rax, 4), ptrbb;
+#endif
+#### Initial Results Register ####
+XOR_DY yvec15, yvec15, yvec15;
+XOR_DY yvec13, yvec13, yvec13;
+LD_DY 0*SIZE(ptrbb), yvec2;
+XOR_DY yvec11, yvec11, yvec11;
+XOR_DY yvec9, yvec9, yvec9;
+LD_DY 0*SIZE(ptrba), yvec0;
+VPERMILP_DY $0x05, yvec2, yvec3;
+#ifndef TRMMKERNEL
+MOVQ bk, k;
+#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kk, %rax;
+MOVQ %rax, kkk;
+#else
+MOVQ kk, %rax;
+#ifdef LEFT
+ADDQ $4, %rax;
+#else
+ADDQ $4, %rax;
+#endif
+MOVQ %rax, kkk;
+#endif
+SARQ $2, k;
+JLE .L6_loopE;
+.align 32;
+.L6_bodyB:;
+# Computing kernel
+
+#### Unroll time 1 ####
+LD_DY 4*SIZE(ptrba), yvec1;
+MUL_DY yvec0, yvec2, yvec6;
+ADD_DY yvec15, yvec6, yvec15;
+SHUF_DY $0x03, yvec2, yvec2, yvec4;
+MUL_DY yvec0, yvec3, yvec7;
+ADD_DY yvec13, yvec7, yvec13;
+SHUF_DY $0x03, yvec3, yvec3, yvec5;
+
+LD_DY 4*SIZE(ptrbb), yvec2;
+MUL_DY yvec0, yvec4, yvec6;
+ADD_DY yvec11, yvec6, yvec11;
+VPERMILP_DY $0x05, yvec2, yvec3;
+MUL_DY yvec0, yvec5, yvec7;
+ADD_DY yvec9, yvec7, yvec9;
+
+#### Unroll time 2 ####
+LD_DY 8*SIZE(ptrba), yvec0;
+MUL_DY yvec1, yvec2, yvec6;
+ADD_DY yvec15, yvec6, yvec15;
+SHUF_DY $0x03, yvec2, yvec2, yvec4;
+MUL_DY yvec1, yvec3, yvec7;
+ADD_DY yvec13, yvec7, yvec13;
+SHUF_DY $0x03, yvec3, yvec3, yvec5;
+
+LD_DY 8*SIZE(ptrbb), yvec2;
+MUL_DY yvec1, yvec4, yvec6;
+ADD_DY yvec11, yvec6, yvec11;
+VPERMILP_DY $0x05, yvec2, yvec3;
+MUL_DY yvec1, yvec5, yvec7;
+ADD_DY yvec9, yvec7, yvec9;
+
+#### Unroll time 3 ####
+LD_DY 12*SIZE(ptrba), yvec1;
+MUL_DY yvec0, yvec2, yvec6;
+ADD_DY yvec15, yvec6, yvec15;
+SHUF_DY $0x03, yvec2, yvec2, yvec4;
+ADDQ $16*SIZE, ptrba;
+MUL_DY yvec0, yvec3, yvec7;
+ADD_DY yvec13, yvec7, yvec13;
+SHUF_DY $0x03, yvec3, yvec3, yvec5;
+
+LD_DY 12*SIZE(ptrbb), yvec2;
+MUL_DY yvec0, yvec4, yvec6;
+ADD_DY yvec11, yvec6, yvec11;
+VPERMILP_DY $0x05, yvec2, yvec3;
+ADDQ $16*SIZE, ptrbb;
+MUL_DY yvec0, yvec5, yvec7;
+ADD_DY yvec9, yvec7, yvec9;
+
+#### Unroll time 4 ####
+LD_DY 0*SIZE(ptrba), yvec0;
+MUL_DY yvec1, yvec2, yvec6;
+ADD_DY yvec15, yvec6, yvec15;
+SHUF_DY $0x03, yvec2, yvec2, yvec4;
+MUL_DY yvec1, yvec3, yvec7;
+ADD_DY yvec13, yvec7, yvec13;
+SHUF_DY $0x03, yvec3, yvec3, yvec5;
+
+LD_DY 0*SIZE(ptrbb), yvec2;
+MUL_DY yvec1, yvec4, yvec6;
+ADD_DY yvec11, yvec6, yvec11;
+VPERMILP_DY $0x05, yvec2, yvec3;
+MUL_DY yvec1, yvec5, yvec7;
+ADD_DY yvec9, yvec7, yvec9;
+DECQ k;
+JG .L6_bodyB;
+.align 32
+.L6_loopE:;
+#ifndef TRMMKERNEL
+TEST $2, bk;
+#else
+MOVQ kkk, %rax;
+TEST $2, %rax;
+#endif
+JLE .L7_loopE;
+.align 32
+.L7_bodyB:;
+#### Unroll time 1 ####
+LD_DY 4*SIZE(ptrba), yvec1;
+MUL_DY yvec0, yvec2, yvec6;
+ADD_DY yvec15, yvec6, yvec15;
+SHUF_DY $0x03, yvec2, yvec2, yvec4;
+ADDQ $8*SIZE, ptrba;
+MUL_DY yvec0, yvec3, yvec7;
+ADD_DY yvec13, yvec7, yvec13;
+SHUF_DY $0x03, yvec3, yvec3, yvec5;
+
+LD_DY 4*SIZE(ptrbb), yvec2;
+MUL_DY yvec0, yvec4, yvec6;
+ADD_DY yvec11, yvec6, yvec11;
+VPERMILP_DY $0x05, yvec2, yvec3;
+ADDQ $8*SIZE, ptrbb;
+MUL_DY yvec0, yvec5, yvec7;
+ADD_DY yvec9, yvec7, yvec9;
+
+#### Unroll time 2 ####
+LD_DY 0*SIZE(ptrba), yvec0;
+MUL_DY yvec1, yvec2, yvec6;
+ADD_DY yvec15, yvec6, yvec15;
+SHUF_DY $0x03, yvec2, yvec2, yvec4;
+MUL_DY yvec1, yvec3, yvec7;
+ADD_DY yvec13, yvec7, yvec13;
+SHUF_DY $0x03, yvec3, yvec3, yvec5;
+
+LD_DY 0*SIZE(ptrbb), yvec2;
+MUL_DY yvec1, yvec4, yvec6;
+ADD_DY yvec11, yvec6, yvec11;
+VPERMILP_DY $0x05, yvec2, yvec3;
+MUL_DY yvec1, yvec5, yvec7;
+ADD_DY yvec9, yvec7, yvec9;
+
+.L7_loopE:;
+#ifndef TRMMKERNEL
+TEST $1, bk
+#else
+MOVQ kkk, %rax;
+TEST $1, %rax;
+#endif
+JLE .L8_loopE;
+.align 32
+.L8_bodyB:;
+#### Unroll time 1 ####
+MUL_DY yvec0, yvec2, yvec6;
+ADD_DY yvec15, yvec6, yvec15;
+SHUF_DY $0x03, yvec2, yvec2, yvec4;
+ADDQ $4*SIZE, ptrba;
+MUL_DY yvec0, yvec3, yvec7;
+ADD_DY yvec13, yvec7, yvec13;
+SHUF_DY $0x03, yvec3, yvec3, yvec5;
+
+MUL_DY yvec0, yvec4, yvec6;
+ADD_DY yvec11, yvec6, yvec11;
+ADDQ $4*SIZE, ptrbb;
+MUL_DY yvec0, yvec5, yvec7;
+ADD_DY yvec9, yvec7, yvec9;
+
+.L8_loopE:;
+#### Load Alpha ####
+BROAD_DY MEMALPHA, yvec7;
+#### Multiply Alpha ####
+MUL_DY yvec7,yvec15,yvec15;
+MUL_DY yvec7,yvec13,yvec13;
+MUL_DY yvec7,yvec11,yvec11;
+MUL_DY yvec7,yvec9,yvec9;
+#### Reverse the Results ####
+MOV_DY yvec15, yvec7;
+REVS_DY $0x0a,yvec13,yvec15,yvec15;
+REVS_DY $0x0a,yvec7,yvec13,yvec13;
+MOV_DY yvec11,yvec7;
+REVS_DY $0x0a,yvec9,yvec11,yvec11;
+REVS_DY $0x0a,yvec7,yvec9,yvec9;
+#### Testing alignment ####
+MOVQ C0, %rax;
+OR ldc, %rax;
+TEST $15, %rax;
+JNE .L8_loopEx; # Unaligned write-back path
+.align 32
+#### Writing Back ####
+EXTRA_DY $1,yvec15,xvec7;
+EXTRA_DY $1,yvec13,xvec5;
+EXTRA_DY $1,yvec11,xvec3;
+EXTRA_DY $1,yvec9,xvec1;
+#ifndef TRMMKERNEL
+ADD_DX 0*SIZE(C0),xvec15;
+ADD_DX 2*SIZE(C1),xvec7;
+ADD_DX 0*SIZE(C0,ldc,1),xvec13;
+ADD_DX 2*SIZE(C1,ldc,1),xvec5;
+ADD_DX 0*SIZE(C1),xvec11;
+ADD_DX 2*SIZE(C0),xvec3;
+ADD_DX 0*SIZE(C1,ldc,1),xvec9;
+ADD_DX 2*SIZE(C0,ldc,1),xvec1;
+#endif
+ST_DX xvec15, 0*SIZE(C0);
+ST_DX xvec7, 2*SIZE(C1);
+ST_DX xvec13, 0*SIZE(C0,ldc,1);
+ST_DX xvec5, 2*SIZE(C1,ldc,1);
+ST_DX xvec11, 0*SIZE(C1);
+ST_DX xvec3, 2*SIZE(C0);
+ST_DX xvec9, 0*SIZE(C1,ldc,1);
+ST_DX xvec1, 2*SIZE(C0,ldc,1);
+#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+LEAQ (, %rax, SIZE), %rax;
+LEAQ (ptrba, %rax, 4), ptrba;
+LEAQ (ptrbb, %rax, 4), ptrbb;
+#endif
+#if defined(TRMMKERNEL)&&defined(LEFT)
+ADDQ $4, kk
+#endif
+ADDQ $4*SIZE, C0;
+ADDQ $4*SIZE, C1;
+JMP .L5_loopE;
+.align 32
+.L8_loopEx:;
+EXTRA_DY $1,yvec15,xvec7;
+EXTRA_DY $1,yvec13,xvec5;
+EXTRA_DY $1,yvec11,xvec3;
+EXTRA_DY $1,yvec9,xvec1;
+#ifndef TRMMKERNEL
+LDL_DX 0*SIZE(C0), xvec14;
+LDH_DX 1*SIZE(C0), xvec14;
+LDL_DX 0*SIZE(C0, ldc, 1), xvec12;
+LDH_DX 1*SIZE(C0, ldc, 1), xvec12;
+LDL_DX 0*SIZE(C1), xvec10;
+LDH_DX 1*SIZE(C1), xvec10;
+LDL_DX 0*SIZE(C1, ldc, 1), xvec8;
+LDH_DX 1*SIZE(C1, ldc, 1), xvec8;
+ADD_DX xvec14, xvec15;
+ADD_DX xvec12, xvec13;
+ADD_DX xvec10, xvec11;
+ADD_DX xvec8, xvec9;
+#endif
+STL_DX xvec15, 0*SIZE(C0);
+STH_DX xvec15, 1*SIZE(C0);
+STL_DX xvec13, 0*SIZE(C0, ldc, 1);
+STH_DX xvec13, 1*SIZE(C0, ldc, 1);
+STL_DX xvec11, 0*SIZE(C1);
+STH_DX xvec11, 1*SIZE(C1);
+STL_DX xvec9, 0*SIZE(C1, ldc, 1);
+STH_DX xvec9, 1*SIZE(C1, ldc, 1);
+#ifndef TRMMKERNEL
+LDL_DX 2*SIZE(C0), xvec0;
+LDH_DX 3*SIZE(C0), xvec0;
+LDL_DX 2*SIZE(C0, ldc, 1), xvec2;
+LDH_DX 3*SIZE(C0, ldc, 1), xvec2;
+LDL_DX 2*SIZE(C1), xvec4;
+LDH_DX 3*SIZE(C1), xvec4;
+LDL_DX 2*SIZE(C1, ldc, 1), xvec6;
+LDH_DX 3*SIZE(C1, ldc, 1), xvec6;
+ADD_DX xvec0, xvec3;
+ADD_DX xvec2, xvec1;
+ADD_DX xvec4, xvec7;
+ADD_DX xvec6, xvec5;
+#endif
+STL_DX xvec3, 2*SIZE(C0);
+STH_DX xvec3, 3*SIZE(C0);
+STL_DX xvec1, 2*SIZE(C0, ldc, 1);
+STH_DX xvec1, 3*SIZE(C0, ldc, 1);
+STL_DX xvec7, 2*SIZE(C1);
+STH_DX xvec7, 3*SIZE(C1);
+STL_DX xvec5, 2*SIZE(C1, ldc, 1);
+STH_DX xvec5, 3*SIZE(C1, ldc, 1);
+#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+LEAQ (, %rax, SIZE), %rax;
+LEAQ (ptrba, %rax, 4), ptrba;
+LEAQ (ptrbb, %rax, 4), ptrbb;
+#endif
+#if defined(TRMMKERNEL)&&defined(LEFT)
+ADDQ $4, kk
+#endif
+
+ADDQ $4*SIZE, C0;
+ADDQ $4*SIZE, C1;
+.L5_loopE:;
+TEST $2, bm;
+JLE .L9_loopE;
+.align 32
+.L9_bodyB:;
+#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bb, ptrbb;
+#else
+MOVQ bb, ptrbb;
+MOVQ kk, %rax;
+LEAQ (, %rax, SIZE), %rax;
+LEAQ (ptrba, %rax, 2), ptrba;
+LEAQ (ptrbb, %rax, 4), ptrbb
+#endif
+#### Initial Results Register ####
+LD_DX 0*SIZE(ptrbb), xvec2;
+XOR_DY yvec15, yvec15, yvec15;
+LD_DX 2*SIZE(ptrbb), xvec3;
+XOR_DY yvec13, yvec13, yvec13;
+LD_DX 0*SIZE(ptrba), xvec0;
+XOR_DY yvec11, yvec11, yvec11;
+SHUF_DX $0x4e, xvec2, xvec4;
+XOR_DY yvec9, yvec9, yvec9;
+#ifndef TRMMKERNEL
+MOVQ bk, k;
+#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kk, %rax;
+MOVQ %rax, kkk;
+#else
+MOVQ kk, %rax;
+#ifdef LEFT
+ADDQ $2, %rax;
+#else
+ADDQ $4, %rax;
+#endif
+MOVQ %rax, kkk;
+#endif
+SARQ $2, k;
+JLE .L10_loopE;
+.align 32;
+.L10_bodyB:;
+# Computing kernel
+
+#### Unroll time 1 ####
+LD_DX 4*SIZE(ptrbb), xvec6;
+SHUF_DX $0x4e, xvec3, xvec5;
+MUL_DX xvec0, xvec2;
+ADD_DX xvec2, xvec15;
+
+LD_DX 6*SIZE(ptrbb), xvec7;
+MUL_DX xvec0, xvec3;
+ADD_DX xvec3, xvec11;
+
+LD_DX 2*SIZE(ptrba), xvec1;
+MUL_DX xvec0, xvec4;
+ADD_DX xvec4, xvec13;
+SHUF_DX $0x4e, xvec6, xvec4;
+MUL_DX xvec0, xvec5;
+ADD_DX xvec5, xvec9;
+
+#### Unroll time 2 ####
+LD_DX 8*SIZE(ptrbb), xvec2;
+SHUF_DX $0x4e, xvec7, xvec5;
+MUL_DX xvec1, xvec6;
+ADD_DX xvec6, xvec15;
+
+LD_DX 10*SIZE(ptrbb), xvec3;
+MUL_DX xvec1, xvec7;
+ADD_DX xvec7, xvec11;
+
+LD_DX 4*SIZE(ptrba), xvec0;
+MUL_DX xvec1, xvec4;
+ADD_DX xvec4, xvec13;
+SHUF_DX $0x4e, xvec2, xvec4;
+MUL_DX xvec1, xvec5;
+ADD_DX xvec5, xvec9;
+
+#### Unroll time 3 ####
+LD_DX 12*SIZE(ptrbb), xvec6;
+SHUF_DX $0x4e, xvec3, xvec5;
+MUL_DX xvec0, xvec2;
+ADD_DX xvec2, xvec15;
+
+LD_DX 14*SIZE(ptrbb), xvec7;
+MUL_DX xvec0, xvec3;
+ADD_DX xvec3, xvec11;
+ADDQ $16*SIZE, ptrbb;
+
+LD_DX 6*SIZE(ptrba), xvec1;
+MUL_DX xvec0, xvec4;
+ADD_DX xvec4, xvec13;
+SHUF_DX $0x4e, xvec6, xvec4;
+ADDQ $8*SIZE, ptrba;
+MUL_DX xvec0, xvec5;
+ADD_DX xvec5, xvec9;
+
+#### Unroll time 4 ####
+LD_DX 0*SIZE(ptrbb), xvec2;
+SHUF_DX $0x4e, xvec7, xvec5;
+MUL_DX xvec1, xvec6;
+ADD_DX xvec6, xvec15;
+
+LD_DX 2*SIZE(ptrbb), xvec3;
+MUL_DX xvec1, xvec7;
+ADD_DX xvec7, xvec11;
+
+LD_DX 0*SIZE(ptrba), xvec0;
+MUL_DX xvec1, xvec4;
+ADD_DX xvec4, xvec13;
+SHUF_DX $0x4e, xvec2, xvec4;
+MUL_DX xvec1, xvec5;
+ADD_DX xvec5, xvec9;
+DECQ k;
+JG .L10_bodyB;
+.align 32
+.L10_loopE:;
+#ifndef TRMMKERNEL
+TEST $2, bk
+#else
+MOVQ kkk, %rax;
+TEST $2, %rax;
+#endif
+JLE .L11_loopE;
+.align 32
+.L11_bodyB:;
+#### Unroll time 1 ####
+LD_DX 4*SIZE(ptrbb), xvec6;
+SHUF_DX $0x4e, xvec3, xvec5;
+MUL_DX xvec0, xvec2;
+ADD_DX xvec2, xvec15;
+
+LD_DX 6*SIZE(ptrbb), xvec7;
+MUL_DX xvec0, xvec3;
+ADD_DX xvec3, xvec11;
+ADDQ $8*SIZE, ptrbb;
+
+LD_DX 2*SIZE(ptrba), xvec1;
+MUL_DX xvec0, xvec4;
+ADD_DX xvec4, xvec13;
+SHUF_DX $0x4e, xvec6, xvec4;
+ADDQ $4*SIZE, ptrba;
+
+MUL_DX xvec0, xvec5;
+ADD_DX xvec5, xvec9;
+
+#### Unroll time 2 ####
+LD_DX 0*SIZE(ptrbb), xvec2;
+SHUF_DX $0x4e, xvec7, xvec5;
+MUL_DX xvec1, xvec6;
+ADD_DX xvec6, xvec15;
+
+LD_DX 2*SIZE(ptrbb), xvec3;
+MUL_DX xvec1, xvec7;
+ADD_DX xvec7, xvec11;
+
+LD_DX 0*SIZE(ptrba), xvec0;
+MUL_DX xvec1, xvec4;
+ADD_DX xvec4, xvec13;
+SHUF_DX $0x4e, xvec2, xvec4;
+MUL_DX xvec1, xvec5;
+ADD_DX xvec5, xvec9;
+
+.L11_loopE:;
+#ifndef TRMMKERNEL
+TEST $1, bk
+#else
+MOVQ kkk, %rax;
+TEST $1, %rax;
+#endif
+JLE .L12_loopE;
+.align 32
+.L12_bodyB:;
+SHUF_DX $0x4e, xvec3, xvec5;
+MUL_DX xvec0, xvec2;
+ADD_DX xvec2, xvec15;
+ADDQ $4*SIZE, ptrbb;
+
+MUL_DX xvec0, xvec3;
+ADD_DX xvec3, xvec11;
+ADDQ $2*SIZE, ptrba;
+
+MUL_DX xvec0, xvec4;
+ADD_DX xvec4, xvec13;
+
+MUL_DX xvec0, xvec5;
+ADD_DX xvec5, xvec9;
+
+.L12_loopE:;
+#### Load Alpha ####
+BROAD_DX MEMALPHA, xvec7;
+#### Multiply Alpha ####
+MUL_DX xvec7, xvec15;
+MUL_DX xvec7, xvec13;
+MUL_DX xvec7, xvec11;
+MUL_DX xvec7, xvec9;
+#### Reverse the Results ####
+MOV_DX xvec15, xvec6;
+REVS_DX xvec13, xvec15;
+REVS_DX xvec6, xvec13;
+MOV_DX xvec11, xvec6;
+REVS_DX xvec9, xvec11;
+REVS_DX xvec6, xvec9;
+#### Testing Alignment ####
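+# If C0 or ldc is not 16-byte aligned, branch to the unaligned write-back path at .L12_loopEx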
+MOVQ C0, %rax;
+OR ldc, %rax;
+TEST $15, %rax;
+JNE .L12_loopEx;
+.align 32
+#### Writing Back ####
+#ifndef TRMMKERNEL
+ADD_DX 0*SIZE(C0), xvec13;
+ADD_DX 0*SIZE(C0, ldc, 1), xvec15;
+ADD_DX 0*SIZE(C1), xvec9;
+ADD_DX 0*SIZE(C1, ldc, 1), xvec11;
+#endif
+ST_DX xvec13, 0*SIZE(C0);
+ST_DX xvec15, 0*SIZE(C0, ldc, 1);
+ST_DX xvec9, 0*SIZE(C1);
+ST_DX xvec11, 0*SIZE(C1, ldc, 1);
+#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+LEAQ (,%rax, SIZE), %rax;
+LEAQ (ptrba, %rax, 2), ptrba;
+LEAQ (ptrbb, %rax, 4), ptrbb;
+#endif
+#if defined(TRMMKERNEL) && defined(LEFT)
+ADDQ $2, kk
+#endif
+ADDQ $2*SIZE, C0
+ADDQ $2*SIZE, C1
+JMP .L9_loopE;
+.align 32
+.L12_loopEx:
+#ifndef TRMMKERNEL
+LDL_DX 0*SIZE(C0), xvec14;
+LDH_DX 1*SIZE(C0), xvec14;
+LDL_DX 0*SIZE(C0, ldc, 1), xvec12;
+LDH_DX 1*SIZE(C0, ldc, 1), xvec12;
+LDL_DX 0*SIZE(C1), xvec10;
+LDH_DX 1*SIZE(C1), xvec10;
+LDL_DX 0*SIZE(C1, ldc, 1), xvec8;
+LDH_DX 1*SIZE(C1, ldc, 1), xvec8;
+ADD_DX xvec14, xvec13;
+ADD_DX xvec12, xvec15;
+ADD_DX xvec10, xvec9;
+ADD_DX xvec8, xvec11;
+#endif
+STL_DX xvec13, 0*SIZE(C0);
+STH_DX xvec13, 1*SIZE(C0);
+STL_DX xvec15, 0*SIZE(C0, ldc, 1);
+STH_DX xvec15, 1*SIZE(C0, ldc, 1);
+STL_DX xvec9, 0*SIZE(C1);
+STH_DX xvec9, 1*SIZE(C1);
+STL_DX xvec11, 0*SIZE(C1, ldc, 1);
+STH_DX xvec11, 1*SIZE(C1, ldc, 1);
+#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+LEAQ (,%rax, SIZE), %rax;
+LEAQ (ptrba, %rax, 2), ptrba;
+LEAQ (ptrbb, %rax, 4), ptrbb;
+#endif
+#if defined(TRMMKERNEL) && defined(LEFT)
+ADDQ $2, kk
+#endif
+ADDQ $2*SIZE, C0;
+ADDQ $2*SIZE, C1;
+.L9_loopE:;
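+# M-remainder (bm & 1): one row of A against the 4-column panel, a 1x4 strip of C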
+TEST $1, bm
+JLE .L13_loopE;
+.align 32
+.L13_bodyB:;
+#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bb, ptrbb;
+#else
+MOVQ bb, ptrbb;
+MOVQ kk, %rax;
+LEAQ (,%rax, SIZE), %rax;
+ADDQ %rax, ptrba;
+LEAQ (ptrbb, %rax, 4), ptrbb;
+#endif
+#### Initial Results Register ####
+XOR_DY yvec15, yvec15, yvec15;
+#ifndef TRMMKERNEL
+MOVQ bk, k;
+#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kk, %rax;
+MOVQ %rax, kkk;
+#else
+MOVQ kk, %rax;
+#ifdef LEFT
+ADDQ $1, %rax;
+#else
+ADDQ $4, %rax;
+#endif
+MOVQ %rax, kkk;
+#endif
+SARQ $2, k;
+JLE .L14_loopE;
+.align 32
+.L14_bodyB:;
+BROAD_DY 0*SIZE(ptrba), yvec0;
+LD_DY 0*SIZE(ptrbb), yvec2;
+MUL_DY yvec0, yvec2, yvec6;
+ADD_DY yvec15, yvec6, yvec15;
+
+BROAD_DY 1*SIZE(ptrba), yvec1;
+LD_DY 4*SIZE(ptrbb), yvec3;
+MUL_DY yvec1, yvec3, yvec7;
+ADD_DY yvec15, yvec7, yvec15;
+
+BROAD_DY 2*SIZE(ptrba), yvec0;
+LD_DY 8*SIZE(ptrbb), yvec2;
+MUL_DY yvec0, yvec2, yvec6;
+ADD_DY yvec15, yvec6, yvec15;
+
+BROAD_DY 3*SIZE(ptrba), yvec1;
+LD_DY 12*SIZE(ptrbb), yvec3;
+MUL_DY yvec1, yvec3, yvec7;
+ADD_DY yvec15, yvec7, yvec15;
+ADDQ $4*SIZE, ptrba;
+ADDQ $16*SIZE, ptrbb;
+DECQ k;
+JG .L14_bodyB;
+.align 32
+.L14_loopE:
+#ifndef TRMMKERNEL
+TEST $2, bk;
+#else
+MOVQ kkk, %rax;
+TEST $2, %rax;
+#endif
+JLE .L15_loopE;
+.align 32
+.L15_bodyB:
+BROAD_DY 0*SIZE(ptrba), yvec0;
+LD_DY 0*SIZE(ptrbb), yvec2;
+MUL_DY yvec0, yvec2, yvec6;
+ADD_DY yvec15, yvec6, yvec15;
+
+BROAD_DY 1*SIZE(ptrba), yvec1;
+LD_DY 4*SIZE(ptrbb), yvec3;
+MUL_DY yvec1, yvec3, yvec7;
+ADD_DY yvec15, yvec7, yvec15;
+ADDQ $2*SIZE, ptrba;
+ADDQ $8*SIZE, ptrbb;
+.L15_loopE:;
+#ifndef TRMMKERNEL
+TEST $1, bk;
+#else
+MOVQ kkk, %rax;
+TEST $1, %rax;
+#endif
+JLE .L16_loopE;
+.align 32
+.L16_bodyB:;
+BROAD_DY 0*SIZE(ptrba), yvec0;
+LD_DY 0*SIZE(ptrbb), yvec2;
+MUL_DY yvec0, yvec2, yvec6;
+ADD_DY yvec15, yvec6, yvec15;
+ADDQ $1*SIZE, ptrba;
+ADDQ $4*SIZE, ptrbb;
+
+.L16_loopE:
+#### Load Alpha ####
+BROAD_DY MEMALPHA, yvec7;
+#### Multiply Alpha ####
+MUL_DY yvec15, yvec7, yvec15;
+#### Writing Back ####
+EXTRA_DY $1, yvec15, xvec7;
+#ifndef TRMMKERNEL
+LDL_DX 0*SIZE(C0), xvec0;
+LDH_DX 0*SIZE(C0, ldc, 1), xvec0;
+LDL_DX 0*SIZE(C1), xvec1;
+LDH_DX 0*SIZE(C1, ldc, 1), xvec1;
+ADD_DX xvec0, xvec15;
+ADD_DX xvec1, xvec7;
+#endif
+STL_DX xvec15, 0*SIZE(C0);
+STH_DX xvec15, 0*SIZE(C0, ldc, 1);
+STL_DX xvec7, 0*SIZE(C1);
+STH_DX xvec7, 0*SIZE(C1, ldc, 1);
+#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+LEAQ (,%rax, SIZE), %rax;
+ADDQ %rax, ptrba;
+LEAQ (ptrbb, %rax, 4), ptrbb;
+#endif
+#if defined(TRMMKERNEL)&&defined(LEFT)
+ADDQ $1, kk
+#endif
+ADDQ $1*SIZE, C0
+ADDQ $1*SIZE, C1
+.L13_loopE:;
+#if defined(TRMMKERNEL)&&!defined(LEFT)
+ADDQ $4, kk
+#endif
+MOVQ bk,k;
+SALQ $5,k;
+ADDQ k,bb;
+LEAQ (C,ldc,4),C;
+.L0_bodyE:;
+DECQ j;
+JG .L0_bodyB;
+.align 32;
+.L0_loopE:;
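+# N-remainder (bn & 2): process the last two columns of B/C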
+TEST $2, bn;
+JLE .L20_loopE;
+.align 32;
+.L20_loopB:;
+#if defined(TRMMKERNEL) && defined(LEFT)
+MOVQ OFFSET, %rax;
+MOVQ %rax, kk
+#endif
+MOVQ C, C0;
+LEAQ (C, ldc, 1), C1;
+MOVQ ba, ptrba;
+MOVQ bm, i;
+SARQ $3, i; # Rm = 8
+JLE .L21_loopE;
+.align 32;
+.L21_bodyB:;
+#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bb, ptrbb;
+#else
+MOVQ bb, ptrbb;
+MOVQ kk, %rax;
+LEAQ (, %rax, SIZE), %rax;
+LEAQ (ptrba, %rax, 8), ptrba;
+LEAQ (ptrbb, %rax, 2), ptrbb;
+#endif
+#### Initial Results Register ####
+XOR_DY yvec15, yvec15, yvec15;
+XOR_DY yvec14, yvec14, yvec14;
+XOR_DY yvec13, yvec13, yvec13;
+XOR_DY yvec12, yvec12, yvec12;
+XOR_DY yvec11, yvec11, yvec11;
+XOR_DY yvec10, yvec10, yvec10;
+XOR_DY yvec9, yvec9, yvec9;
+XOR_DY yvec8, yvec8, yvec8;
+#ifndef TRMMKERNEL
+MOVQ bk, k;
+#elif (defined(LEFT) && !defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kk, %rax;
+MOVQ %rax, kkk;
+#else
+MOVQ kk, %rax;
+#ifdef LEFT
+ADDQ $8, %rax;
+#else
+ADDQ $2, %rax;
+#endif
+MOVQ %rax, kkk;
+#endif
+SARQ $2, k;
+JLE .L211_loopE;
+.align 32;
+.L211_bodyB:
+# Computing kernel
+#### Unroll time 1 ####
+LD_DX 0*SIZE(ptrba), xvec0;
+LD_DX 0*SIZE(ptrbb), xvec4;
+MOV_DX xvec4, xvec5;
+MUL_DX xvec0, xvec4;
+ADD_DX xvec4, xvec15;
+
+LD_DX 2*SIZE(ptrba), xvec1;
+MOV_DX xvec5, xvec6;
+MUL_DX xvec1, xvec5;
+ADD_DX xvec5, xvec14;
+
+LD_DX 4*SIZE(ptrba), xvec2;
+MOV_DX xvec6, xvec7;
+MUL_DX xvec2, xvec6;
+ADD_DX xvec6, xvec13;
+
+LD_DX 6*SIZE(ptrba), xvec3;
+SHUF_DX $0x4e, xvec7, xvec4;
+MUL_DX xvec3, xvec7;
+ADD_DX xvec7, xvec12;
+
+MOV_DX xvec4, xvec5;
+MUL_DX xvec0, xvec4;
+ADD_DX xvec4, xvec11;
+
+MOV_DX xvec5, xvec6;
+MUL_DX xvec1, xvec5;
+ADD_DX xvec5, xvec10;
+
+MOV_DX xvec6, xvec7;
+MUL_DX xvec2, xvec6;
+ADD_DX xvec6, xvec9;
+
+MUL_DX xvec3, xvec7;
+ADD_DX xvec7, xvec8;
+
+#### Unroll time 2 ####
+LD_DX 8*SIZE(ptrba), xvec0;
+LD_DX 2*SIZE(ptrbb), xvec4;
+MOV_DX xvec4, xvec5;
+MUL_DX xvec0, xvec4;
+ADD_DX xvec4, xvec15;
+
+LD_DX 10*SIZE(ptrba), xvec1;
+MOV_DX xvec5, xvec6;
+MUL_DX xvec1, xvec5;
+ADD_DX xvec5, xvec14;
+
+LD_DX 12*SIZE(ptrba), xvec2;
+MOV_DX xvec6, xvec7;
+MUL_DX xvec2, xvec6;
+ADD_DX xvec6, xvec13;
+
+LD_DX 14*SIZE(ptrba), xvec3;
+SHUF_DX $0x4e, xvec7, xvec4;
+MUL_DX xvec3, xvec7;
+ADD_DX xvec7, xvec12;
+
+MOV_DX xvec4, xvec5;
+MUL_DX xvec0, xvec4;
+ADD_DX xvec4, xvec11;
+
+MOV_DX xvec5, xvec6;
+MUL_DX xvec1, xvec5;
+ADD_DX xvec5, xvec10;
+
+MOV_DX xvec6, xvec7;
+MUL_DX xvec2, xvec6;
+ADD_DX xvec6, xvec9;
+
+MUL_DX xvec3, xvec7;
+ADD_DX xvec7, xvec8;
+
+#### Unroll time 3 ####
+LD_DX 16*SIZE(ptrba), xvec0;
+LD_DX 4*SIZE(ptrbb), xvec4;
+MOV_DX xvec4, xvec5;
+MUL_DX xvec0, xvec4;
+ADD_DX xvec4, xvec15;
+
+LD_DX 18*SIZE(ptrba), xvec1;
+MOV_DX xvec5, xvec6;
+MUL_DX xvec1, xvec5;
+ADD_DX xvec5, xvec14;
+
+LD_DX 20*SIZE(ptrba), xvec2;
+MOV_DX xvec6, xvec7;
+MUL_DX xvec2, xvec6;
+ADD_DX xvec6, xvec13;
+
+LD_DX 22*SIZE(ptrba), xvec3;
+SHUF_DX $0x4e, xvec7, xvec4;
+MUL_DX xvec3, xvec7;
+ADD_DX xvec7, xvec12;
+
+MOV_DX xvec4, xvec5;
+MUL_DX xvec0, xvec4;
+ADD_DX xvec4, xvec11;
+
+MOV_DX xvec5, xvec6;
+MUL_DX xvec1, xvec5;
+ADD_DX xvec5, xvec10;
+
+MOV_DX xvec6, xvec7;
+MUL_DX xvec2, xvec6;
+ADD_DX xvec6, xvec9;
+
+MUL_DX xvec3, xvec7;
+ADD_DX xvec7, xvec8;
+
+#### Unroll time 4 ####
+LD_DX 24*SIZE(ptrba), xvec0;
+LD_DX 6*SIZE(ptrbb), xvec4;
+MOV_DX xvec4, xvec5;
+MUL_DX xvec0, xvec4;
+ADD_DX xvec4, xvec15;
+ADDQ $8*SIZE, ptrbb;
+
+LD_DX 26*SIZE(ptrba), xvec1;
+MOV_DX xvec5, xvec6;
+MUL_DX xvec1, xvec5;
+ADD_DX xvec5, xvec14;
+
+LD_DX 28*SIZE(ptrba), xvec2;
+MOV_DX xvec6, xvec7;
+MUL_DX xvec2, xvec6;
+ADD_DX xvec6, xvec13;
+
+LD_DX 30*SIZE(ptrba), xvec3;
+SHUF_DX $0x4e, xvec7, xvec4;
+MUL_DX xvec3, xvec7;
+ADD_DX xvec7, xvec12;
+ADDQ $32*SIZE, ptrba;
+
+MOV_DX xvec4, xvec5;
+MUL_DX xvec0, xvec4;
+ADD_DX xvec4, xvec11;
+
+MOV_DX xvec5, xvec6;
+MUL_DX xvec1, xvec5;
+ADD_DX xvec5, xvec10;
+
+MOV_DX xvec6, xvec7;
+MUL_DX xvec2, xvec6;
+ADD_DX xvec6, xvec9;
+
+MUL_DX xvec3, xvec7;
+ADD_DX xvec7, xvec8;
+DECQ k;
+JG .L211_bodyB;
+.align 32
+.L211_loopE:
+#ifndef TRMMKERNEL
+TEST $2, bk;
+#else
+MOVQ kkk, %rax;
+TEST $2, %rax;
+#endif
+JLE .L212_loopE;
+.align 32;
+.L212_bodyB:
+# Computing kernel
+#### Unroll time 1 ####
+LD_DX 0*SIZE(ptrba), xvec0;
+LD_DX 0*SIZE(ptrbb), xvec4;
+MOV_DX xvec4, xvec5;
+MUL_DX xvec0, xvec4;
+ADD_DX xvec4, xvec15;
+
+LD_DX 2*SIZE(ptrba), xvec1;
+MOV_DX xvec5, xvec6;
+MUL_DX xvec1, xvec5;
+ADD_DX xvec5, xvec14;
+
+LD_DX 4*SIZE(ptrba), xvec2;
+MOV_DX xvec6, xvec7;
+MUL_DX xvec2, xvec6;
+ADD_DX xvec6, xvec13;
+
+LD_DX 6*SIZE(ptrba), xvec3;
+SHUF_DX $0x4e, xvec7, xvec4;
+MUL_DX xvec3, xvec7;
+ADD_DX xvec7, xvec12;
+
+MOV_DX xvec4, xvec5;
+MUL_DX xvec0, xvec4;
+ADD_DX xvec4, xvec11;
+
+MOV_DX xvec5, xvec6;
+MUL_DX xvec1, xvec5;
+ADD_DX xvec5, xvec10;
+
+MOV_DX xvec6, xvec7;
+MUL_DX xvec2, xvec6;
+ADD_DX xvec6, xvec9;
+
+MUL_DX xvec3, xvec7;
+ADD_DX xvec7, xvec8;
+
+#### Unroll time 2 ####
+LD_DX 8*SIZE(ptrba), xvec0;
+LD_DX 2*SIZE(ptrbb), xvec4;
+MOV_DX xvec4, xvec5;
+MUL_DX xvec0, xvec4;
+ADD_DX xvec4, xvec15;
+ADDQ $4*SIZE, ptrbb;
+
+LD_DX 10*SIZE(ptrba), xvec1;
+MOV_DX xvec5, xvec6;
+MUL_DX xvec1, xvec5;
+ADD_DX xvec5, xvec14;
+
+LD_DX 12*SIZE(ptrba), xvec2;
+MOV_DX xvec6, xvec7;
+MUL_DX xvec2, xvec6;
+ADD_DX xvec6, xvec13;
+
+LD_DX 14*SIZE(ptrba), xvec3;
+SHUF_DX $0x4e, xvec7, xvec4;
+MUL_DX xvec3, xvec7;
+ADD_DX xvec7, xvec12;
+ADDQ $16*SIZE, ptrba;
+
+MOV_DX xvec4, xvec5;
+MUL_DX xvec0, xvec4;
+ADD_DX xvec4, xvec11;
+
+MOV_DX xvec5, xvec6;
+MUL_DX xvec1, xvec5;
+ADD_DX xvec5, xvec10;
+
+MOV_DX xvec6, xvec7;
+MUL_DX xvec2, xvec6;
+ADD_DX xvec6, xvec9;
+
+MUL_DX xvec3, xvec7;
+ADD_DX xvec7, xvec8;
+
+.L212_loopE:
+#ifndef TRMMKERNEL
+TEST $1, bk;
+#else
+MOVQ kkk, %rax;
+TEST $1, %rax;
+#endif
+JLE .L213_loopE;
+.align 32
+.L213_bodyB:
+#### Unroll time 1 ####
+LD_DX 0*SIZE(ptrba), xvec0;
+LD_DX 0*SIZE(ptrbb), xvec4;
+MOV_DX xvec4, xvec5;
+MUL_DX xvec0, xvec4;
+ADD_DX xvec4, xvec15;
+ADDQ $2*SIZE, ptrbb;
+
+LD_DX 2*SIZE(ptrba), xvec1;
+MOV_DX xvec5, xvec6;
+MUL_DX xvec1, xvec5;
+ADD_DX xvec5, xvec14;
+
+LD_DX 4*SIZE(ptrba), xvec2;
+MOV_DX xvec6, xvec7;
+MUL_DX xvec2, xvec6;
+ADD_DX xvec6, xvec13;
+
+LD_DX 6*SIZE(ptrba), xvec3;
+SHUF_DX $0x4e, xvec7, xvec4;
+MUL_DX xvec3, xvec7;
+ADD_DX xvec7, xvec12;
+ADDQ $8*SIZE, ptrba;
+
+MOV_DX xvec4, xvec5;
+MUL_DX xvec0, xvec4;
+ADD_DX xvec4, xvec11;
+
+MOV_DX xvec5, xvec6;
+MUL_DX xvec1, xvec5;
+ADD_DX xvec5, xvec10;
+
+MOV_DX xvec6, xvec7;
+MUL_DX xvec2, xvec6;
+ADD_DX xvec6, xvec9;
+
+MUL_DX xvec3, xvec7;
+ADD_DX xvec7, xvec8;
+
+.L213_loopE:
+#### Multiply Alpha ####
+BROAD_DX MEMALPHA, xvec7;
+MUL_DX xvec7, xvec15;
+MUL_DX xvec7, xvec14;
+MUL_DX xvec7, xvec13;
+MUL_DX xvec7, xvec12;
+MUL_DX xvec7, xvec11;
+MUL_DX xvec7, xvec10;
+MUL_DX xvec7, xvec9;
+MUL_DX xvec7, xvec8;
+#### Reverse ####
+MOV_DX xvec15, xvec6;
+REVS_DX xvec11, xvec15;
+REVS_DX xvec6, xvec11;
+MOV_DX xvec14, xvec6;
+REVS_DX xvec10, xvec14;
+REVS_DX xvec6, xvec10;
+MOV_DX xvec13, xvec6;
+REVS_DX xvec9, xvec13;
+REVS_DX xvec6, xvec9;
+MOV_DX xvec12, xvec6;
+REVS_DX xvec8, xvec12;
+REVS_DX xvec6, xvec8;
+#### Testing Alignment ####
+MOVQ C0, %rax;
+OR ldc, %rax;
+TEST $15, %rax;
+JNE .L213_loopEx;
+.align 32
+#### Writing Back ####
+#ifndef TRMMKERNEL
+ADD_DX 0*SIZE(C0), xvec11;
+ADD_DX 2*SIZE(C0), xvec10;
+ADD_DX 4*SIZE(C0), xvec9;
+ADD_DX 6*SIZE(C0), xvec8;
+ADD_DX 0*SIZE(C1), xvec15;
+ADD_DX 2*SIZE(C1), xvec14;
+ADD_DX 4*SIZE(C1), xvec13;
+ADD_DX 6*SIZE(C1), xvec12;
+#endif
+ST_DX xvec11, 0*SIZE(C0);
+ST_DX xvec10, 2*SIZE(C0);
+ST_DX xvec9, 4*SIZE(C0);
+ST_DX xvec8, 6*SIZE(C0);
+ST_DX xvec15, 0*SIZE(C1);
+ST_DX xvec14, 2*SIZE(C1);
+ST_DX xvec13, 4*SIZE(C1);
+ST_DX xvec12, 6*SIZE(C1);
+#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+LEAQ (,%rax, SIZE), %rax;
+LEAQ (ptrba, %rax, 8), ptrba;
+LEAQ (ptrbb, %rax, 2), ptrbb;
+#endif
+#if defined(TRMMKERNEL) && defined(LEFT)
+ADDQ $8, kk
+#endif
+ADDQ $8*SIZE, C0;
+ADDQ $8*SIZE, C1;
+DECQ i;
+JG .L21_bodyB;
+JMP .L21_loopE;
+.align 32
+.L213_loopEx:;
+#ifndef TRMMKERNEL
+LDL_DX 0*SIZE(C0), xvec0;
+LDH_DX 1*SIZE(C0), xvec0;
+LDL_DX 2*SIZE(C0), xvec1;
+LDH_DX 3*SIZE(C0), xvec1;
+LDL_DX 4*SIZE(C0), xvec2;
+LDH_DX 5*SIZE(C0), xvec2;
+LDL_DX 6*SIZE(C0), xvec3;
+LDH_DX 7*SIZE(C0), xvec3;
+ADD_DX xvec0, xvec11;
+ADD_DX xvec1, xvec10;
+ADD_DX xvec2, xvec9;
+ADD_DX xvec3, xvec8;
+#endif
+STL_DX xvec11, 0*SIZE(C0);
+STH_DX xvec11, 1*SIZE(C0);
+STL_DX xvec10, 2*SIZE(C0);
+STH_DX xvec10, 3*SIZE(C0);
+STL_DX xvec9, 4*SIZE(C0);
+STH_DX xvec9, 5*SIZE(C0);
+STL_DX xvec8, 6*SIZE(C0);
+STH_DX xvec8, 7*SIZE(C0);
+#ifndef TRMMKERNEL
+LDL_DX 0*SIZE(C1), xvec4;
+LDH_DX 1*SIZE(C1), xvec4;
+LDL_DX 2*SIZE(C1), xvec5;
+LDH_DX 3*SIZE(C1), xvec5;
+LDL_DX 4*SIZE(C1), xvec6;
+LDH_DX 5*SIZE(C1), xvec6;
+LDL_DX 6*SIZE(C1), xvec7;
+LDH_DX 7*SIZE(C1), xvec7;
+ADD_DX xvec4, xvec15;
+ADD_DX xvec5, xvec14;
+ADD_DX xvec6, xvec13;
+ADD_DX xvec7, xvec12;
+#endif
+STL_DX xvec15, 0*SIZE(C1);
+STH_DX xvec15, 1*SIZE(C1);
+STL_DX xvec14, 2*SIZE(C1);
+STH_DX xvec14, 3*SIZE(C1);
+STL_DX xvec13, 4*SIZE(C1);
+STH_DX xvec13, 5*SIZE(C1);
+STL_DX xvec12, 6*SIZE(C1);
+STH_DX xvec12, 7*SIZE(C1);
+#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+LEAQ (,%rax, SIZE), %rax;
+LEAQ (ptrba, %rax, 8), ptrba;
+LEAQ (ptrbb, %rax, 2), ptrbb;
+#endif
+#if defined(TRMMKERNEL) && defined(LEFT)
+ADDQ $8, kk
+#endif
+ADDQ $8*SIZE, C0;
+ADDQ $8*SIZE, C1;
+DECQ i;
+JG .L21_bodyB;
+.L21_loopE:;
+TEST $4, bm; # Rm = 4
+JLE .L22_loopE;
+.align 32;
+.L22_bodyB:;
+#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bb, ptrbb;
+#else
+MOVQ bb, ptrbb;
+MOVQ kk, %rax;
+LEAQ (,%rax, SIZE), %rax;
+LEAQ (ptrba, %rax, 4), ptrba;
+LEAQ (ptrbb, %rax, 2), ptrbb;
+#endif
+#### Initial Results Register ####
+XOR_DY yvec15, yvec15, yvec15;
+XOR_DY yvec14, yvec14, yvec14;
+XOR_DY yvec11, yvec11, yvec11;
+XOR_DY yvec10, yvec10, yvec10;
+#ifndef TRMMKERNEL
+MOVQ bk, k;
+#elif (defined(LEFT) && !defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kk, %rax;
+MOVQ %rax, kkk;
+#else
+MOVQ kk, %rax;
+#ifdef LEFT
+ADDQ $4, %rax;
+#else
+ADDQ $2, %rax;
+#endif
+MOVQ %rax, kkk;
+#endif
+SARQ $2, k;
+JLE .L221_loopE;
+.align 32
+.L221_bodyB:;
+# Computing kernel
+#### Unroll time 1 ####
+LD_DX 0*SIZE(ptrba), xvec0;
+LD_DX 0*SIZE(ptrbb), xvec4;
+MOV_DX xvec4, xvec5;
+MUL_DX xvec0, xvec4;
+ADD_DX xvec4, xvec15;
+
+LD_DX 2*SIZE(ptrba), xvec1;
+SHUF_DX $0x4e, xvec5, xvec4;
+MUL_DX xvec1, xvec5;
+ADD_DX xvec5, xvec14;
+
+MOV_DX xvec4, xvec5;
+MUL_DX xvec0, xvec4;
+ADD_DX xvec4, xvec11;
+
+MUL_DX xvec1, xvec5;
+ADD_DX xvec5, xvec10;
+
+#### Unroll time 2 ####
+LD_DX 4*SIZE(ptrba), xvec0;
+LD_DX 2*SIZE(ptrbb), xvec4;
+MOV_DX xvec4, xvec5;
+MUL_DX xvec0, xvec4;
+ADD_DX xvec4, xvec15;
+
+LD_DX 6*SIZE(ptrba), xvec1;
+SHUF_DX $0x4e, xvec5, xvec4;
+MUL_DX xvec1, xvec5;
+ADD_DX xvec5, xvec14;
+
+MOV_DX xvec4, xvec5;
+MUL_DX xvec0, xvec4;
+ADD_DX xvec4, xvec11;
+
+MUL_DX xvec1, xvec5;
+ADD_DX xvec5, xvec10;
+
+#### Unroll time 3 ####
+LD_DX 8*SIZE(ptrba), xvec0;
+LD_DX 4*SIZE(ptrbb), xvec4;
+MOV_DX xvec4, xvec5;
+MUL_DX xvec0, xvec4;
+ADD_DX xvec4, xvec15;
+
+LD_DX 10*SIZE(ptrba), xvec1;
+SHUF_DX $0x4e, xvec5, xvec4;
+MUL_DX xvec1, xvec5;
+ADD_DX xvec5, xvec14;
+
+MOV_DX xvec4, xvec5;
+MUL_DX xvec0, xvec4;
+ADD_DX xvec4, xvec11;
+
+MUL_DX xvec1, xvec5;
+ADD_DX xvec5, xvec10;
+
+#### Unroll time 4 ####
+LD_DX 12*SIZE(ptrba), xvec0;
+LD_DX 6*SIZE(ptrbb), xvec4;
+MOV_DX xvec4, xvec5;
+MUL_DX xvec0, xvec4;
+ADD_DX xvec4, xvec15;
+ADDQ $8*SIZE, ptrbb;
+
+LD_DX 14*SIZE(ptrba), xvec1;
+SHUF_DX $0x4e, xvec5, xvec4;
+MUL_DX xvec1, xvec5;
+ADD_DX xvec5, xvec14;
+ADDQ $16*SIZE, ptrba;
+
+MOV_DX xvec4, xvec5;
+MUL_DX xvec0, xvec4;
+ADD_DX xvec4, xvec11;
+
+MUL_DX xvec1, xvec5;
+ADD_DX xvec5, xvec10;
+DECQ k;
+JG .L221_bodyB;
+.align 32
+.L221_loopE:;
+#ifndef TRMMKERNEL
+TEST $2, bk;
+#else
+MOVQ kkk, %rax;
+TEST $2, %rax;
+#endif
+JLE .L222_loopE;
+.align 32
+.L222_bodyB:
+#### Unroll time 1 ####
+LD_DX 0*SIZE(ptrba), xvec0;
+LD_DX 0*SIZE(ptrbb), xvec4;
+MOV_DX xvec4, xvec5;
+MUL_DX xvec0, xvec4;
+ADD_DX xvec4, xvec15;
+
+LD_DX 2*SIZE(ptrba), xvec1;
+SHUF_DX $0x4e, xvec5, xvec4;
+MUL_DX xvec1, xvec5;
+ADD_DX xvec5, xvec14;
+
+MOV_DX xvec4, xvec5;
+MUL_DX xvec0, xvec4;
+ADD_DX xvec4, xvec11;
+
+MUL_DX xvec1, xvec5;
+ADD_DX xvec5, xvec10;
+
+#### Unroll time 2 ####
+LD_DX 4*SIZE(ptrba), xvec0;
+LD_DX 2*SIZE(ptrbb), xvec4;
+MOV_DX xvec4, xvec5;
+MUL_DX xvec0, xvec4;
+ADD_DX xvec4, xvec15;
+ADDQ $4*SIZE, ptrbb;
+
+LD_DX 6*SIZE(ptrba), xvec1;
+SHUF_DX $0x4e, xvec5, xvec4;
+MUL_DX xvec1, xvec5;
+ADD_DX xvec5, xvec14;
+ADDQ $8*SIZE, ptrba;
+MOV_DX xvec4, xvec5;
+MUL_DX xvec0, xvec4;
+ADD_DX xvec4, xvec11;
+
+MUL_DX xvec1, xvec5;
+ADD_DX xvec5, xvec10;
+
+.L222_loopE:
+#ifndef TRMMKERNEL
+TEST $1, bk
+#else
+MOVQ kkk, %rax;
+TEST $1, %rax;
+#endif
+JLE .L223_loopE;
+.align 32
+.L223_bodyB:
+#### Unroll time 1 ####
+LD_DX 0*SIZE(ptrba), xvec0;
+LD_DX 0*SIZE(ptrbb), xvec4;
+MOV_DX xvec4, xvec5;
+MUL_DX xvec0, xvec4;
+ADD_DX xvec4, xvec15;
+ADDQ $2*SIZE, ptrbb;
+
+LD_DX 2*SIZE(ptrba), xvec1;
+SHUF_DX $0x4e, xvec5, xvec4;
+MUL_DX xvec1, xvec5;
+ADD_DX xvec5, xvec14;
+ADDQ $4*SIZE, ptrba;
+
+MOV_DX xvec4, xvec5;
+MUL_DX xvec0, xvec4;
+ADD_DX xvec4, xvec11;
+
+MUL_DX xvec1, xvec5;
+ADD_DX xvec5, xvec10;
+
+.L223_loopE:
+#### Multiply Alpha ####
+BROAD_DX MEMALPHA, xvec7;
+MUL_DX xvec7, xvec15;
+MUL_DX xvec7, xvec14;
+MUL_DX xvec7, xvec11;
+MUL_DX xvec7, xvec10;
+#### Reverse ####
+MOV_DX xvec15, xvec6;
+REVS_DX xvec11, xvec15;
+REVS_DX xvec6, xvec11;
+MOV_DX xvec14, xvec6;
+REVS_DX xvec10, xvec14;
+REVS_DX xvec6, xvec10;
+#### Testing Alignment ####
+MOVQ C0, %rax;
+OR ldc, %rax;
+TEST $15, %rax;
+JNE .L223_loopEx;
+.align 32
+#### Writing Back ####
+#ifndef TRMMKERNEL
+ADD_DX 0*SIZE(C0), xvec11;
+ADD_DX 2*SIZE(C0), xvec10;
+ADD_DX 0*SIZE(C1), xvec15;
+ADD_DX 2*SIZE(C1), xvec14;
+#endif
+ST_DX xvec11, 0*SIZE(C0);
+ST_DX xvec10, 2*SIZE(C0);
+ST_DX xvec15, 0*SIZE(C1);
+ST_DX xvec14, 2*SIZE(C1);
+#if (defined(TRMMKERNEL)&& defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&& !defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+LEAQ (,%rax, SIZE), %rax;
+LEAQ (ptrba, %rax, 4), ptrba;
+LEAQ (ptrbb, %rax, 2), ptrbb;
+#endif
+#if defined(TRMMKERNEL) && defined(LEFT)
+ADDQ $4, kk
+#endif
+ADDQ $4*SIZE, C0;
+ADDQ $4*SIZE, C1;
+JMP .L22_loopE;
+.align 32
+.L223_loopEx:;
+#ifndef TRMMKERNEL
+LDL_DX 0*SIZE(C0), xvec0;
+LDH_DX 1*SIZE(C0), xvec0;
+LDL_DX 2*SIZE(C0), xvec1;
+LDH_DX 3*SIZE(C0), xvec1;
+ADD_DX xvec0, xvec11;
+ADD_DX xvec1, xvec10;
+#endif
+STL_DX xvec11, 0*SIZE(C0);
+STH_DX xvec11, 1*SIZE(C0);
+STL_DX xvec10, 2*SIZE(C0);
+STH_DX xvec10, 3*SIZE(C0);
+#ifndef TRMMKERNEL
+LDL_DX 0*SIZE(C1), xvec4;
+LDH_DX 1*SIZE(C1), xvec4;
+LDL_DX 2*SIZE(C1), xvec5;
+LDH_DX 3*SIZE(C1), xvec5;
+ADD_DX xvec4, xvec15;
+ADD_DX xvec5, xvec14;
+#endif
+STL_DX xvec15, 0*SIZE(C1);
+STH_DX xvec15, 1*SIZE(C1);
+STL_DX xvec14, 2*SIZE(C1);
+STH_DX xvec14, 3*SIZE(C1);
+#if (defined(TRMMKERNEL)&& defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&& !defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+LEAQ (,%rax, SIZE), %rax;
+LEAQ (ptrba, %rax, 4), ptrba;
+LEAQ (ptrbb, %rax, 2), ptrbb;
+#endif
+#if defined(TRMMKERNEL) && defined(LEFT)
+ADDQ $4, kk
+#endif
+ADDQ $4*SIZE, C0;
+ADDQ $4*SIZE, C1;
+.L22_loopE:;
+TEST $2, bm; # Rm = 2
+JLE .L23_loopE;
+.align 32;
+.L23_bodyB:
+#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bb, ptrbb;
+#else
+MOVQ bb, ptrbb;
+MOVQ kk, %rax;
+LEAQ (,%rax, SIZE), %rax;
+LEAQ (ptrba, %rax, 2), ptrba;
+LEAQ (ptrbb, %rax, 2), ptrbb;
+#endif
+XOR_DY yvec15, yvec15, yvec15;
+XOR_DY yvec11, yvec11, yvec11;
+#ifndef TRMMKERNEL
+MOVQ bk, k;
+#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kk, %rax;
+MOVQ %rax, kkk;
+#else
+MOVQ kk, %rax;
+#ifdef LEFT
+ADDQ $2, %rax;
+#else
+ADDQ $2, %rax;
+#endif
+MOVQ %rax, kkk;
+#endif
+SARQ $2, k;
+JLE .L231_loopE;
+.align 32
+.L231_bodyB:
+# Computing kernel
+#### Unroll time 1 ####
+LD_DX 0*SIZE(ptrba), xvec0;
+LD_DX 0*SIZE(ptrbb), xvec4;
+SHUF_DX $0x4e, xvec4, xvec5;
+MUL_DX xvec0, xvec4;
+ADD_DX xvec4, xvec15;
+
+MUL_DX xvec0, xvec5;
+ADD_DX xvec5, xvec11;
+#### Unroll time 2 ####
+LD_DX 2*SIZE(ptrba), xvec0;
+LD_DX 2*SIZE(ptrbb), xvec4;
+SHUF_DX $0x4e, xvec4, xvec5;
+MUL_DX xvec0, xvec4;
+ADD_DX xvec4, xvec15;
+
+MUL_DX xvec0, xvec5;
+ADD_DX xvec5, xvec11;
+#### Unroll time 3 ####
+LD_DX 4*SIZE(ptrba), xvec0;
+LD_DX 4*SIZE(ptrbb), xvec4;
+SHUF_DX $0x4e, xvec4, xvec5;
+MUL_DX xvec0, xvec4;
+ADD_DX xvec4, xvec15;
+
+MUL_DX xvec0, xvec5;
+ADD_DX xvec5, xvec11;
+#### Unroll time 4 ####
+LD_DX 6*SIZE(ptrba), xvec0;
+LD_DX 6*SIZE(ptrbb), xvec4;
+SHUF_DX $0x4e, xvec4, xvec5;
+MUL_DX xvec0, xvec4;
+ADD_DX xvec4, xvec15;
+ADDQ $8*SIZE, ptrba;
+MUL_DX xvec0, xvec5;
+ADD_DX xvec5, xvec11;
+ADDQ $8*SIZE, ptrbb;
+DECQ k;
+JG .L231_bodyB;
+.align 32
+.L231_loopE:
+#ifndef TRMMKERNEL
+TEST $2, bk;
+#else
+MOVQ kkk, %rax;
+TEST $2, %rax;
+#endif
+JLE .L232_loopE;
+.align 32
+.L232_bodyB:
+#### Unroll time 1 ####
+LD_DX 0*SIZE(ptrba), xvec0;
+LD_DX 0*SIZE(ptrbb), xvec4;
+SHUF_DX $0x4e, xvec4, xvec5;
+MUL_DX xvec0, xvec4;
+ADD_DX xvec4, xvec15;
+
+MUL_DX xvec0, xvec5;
+ADD_DX xvec5, xvec11;
+#### Unroll time 2 ####
+LD_DX 2*SIZE(ptrba), xvec0;
+LD_DX 2*SIZE(ptrbb), xvec4;
+SHUF_DX $0x4e, xvec4, xvec5;
+MUL_DX xvec0, xvec4;
+ADD_DX xvec4, xvec15;
+ADDQ $4*SIZE, ptrba;
+MUL_DX xvec0, xvec5;
+ADD_DX xvec5, xvec11;
+ADDQ $4*SIZE, ptrbb;
+.L232_loopE:
+#ifndef TRMMKERNEL
+TEST $1, bk;
+#else
+MOVQ kkk, %rax;
+TEST $1, %rax;
+#endif
+JLE .L233_loopE;
+.align 32
+.L233_bodyB:
+#### Unroll time 1 ####
+LD_DX 0*SIZE(ptrba), xvec0;
+LD_DX 0*SIZE(ptrbb), xvec4;
+SHUF_DX $0x4e, xvec4, xvec5;
+MUL_DX xvec0, xvec4;
+ADD_DX xvec4, xvec15;
+ADDQ $2*SIZE, ptrba;
+MUL_DX xvec0, xvec5;
+ADD_DX xvec5, xvec11;
+ADDQ $2*SIZE, ptrbb;
+.L233_loopE:
+#### Multiply Alpha ####
+BROAD_DX MEMALPHA, xvec7;
+MUL_DX xvec7, xvec15;
+MUL_DX xvec7, xvec11;
+#### Reverse ####
+MOV_DX xvec15, xvec6;
+REVS_DX xvec11, xvec15;
+REVS_DX xvec6, xvec11;
+#### Testing Alignment ####
+MOVQ C0, %rax;
+OR ldc, %rax;
+TEST $15, %rax;
+JNE .L233_loopEx;
+.align 32
+#### Writing Back ####
+#ifndef TRMMKERNEL
+ADD_DX 0*SIZE(C0), xvec11;
+ADD_DX 0*SIZE(C1), xvec15;
+#endif
+ST_DX xvec11, 0*SIZE(C0);
+ST_DX xvec15, 0*SIZE(C1);
+#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+LEAQ (,%rax, SIZE), %rax;
+LEAQ (ptrba, %rax, 2), ptrba;
+LEAQ (ptrbb, %rax, 2), ptrbb;
+#endif
+#if defined(TRMMKERNEL) && defined(LEFT)
+ADDQ $2, kk;
+#endif
+ADDQ $2*SIZE, C0;
+ADDQ $2*SIZE, C1;
+JMP .L23_loopE;
+.align 32
+.L233_loopEx:;
+#ifndef TRMMKERNEL
+LDL_DX 0*SIZE(C0), xvec0;
+LDH_DX 1*SIZE(C0), xvec0;
+ADD_DX xvec0, xvec11;
+#endif
+STL_DX xvec11, 0*SIZE(C0);
+STH_DX xvec11, 1*SIZE(C0);
+#ifndef TRMMKERNEL
+LDL_DX 0*SIZE(C1), xvec4;
+LDH_DX 1*SIZE(C1), xvec4;
+ADD_DX xvec4, xvec15;
+#endif
+STL_DX xvec15, 0*SIZE(C1);
+STH_DX xvec15, 1*SIZE(C1);
+#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+LEAQ (,%rax, SIZE), %rax;
+LEAQ (ptrba, %rax, 2), ptrba;
+LEAQ (ptrbb, %rax, 2), ptrbb;
+#endif
+#if defined(TRMMKERNEL) && defined(LEFT)
+ADDQ $2, kk;
+#endif
+ADDQ $2*SIZE, C0;
+ADDQ $2*SIZE, C1;
+.L23_loopE:
+TEST $1, bm; # Rm = 1
+JLE .L24_loopE;
+.align 32;
+.L24_bodyB:
+#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bb, ptrbb;
+#else
+MOVQ bb, ptrbb;
+MOVQ kk, %rax;
+LEAQ (, %rax, SIZE), %rax;
+ADDQ %rax, ptrba;
+LEAQ (ptrbb, %rax, 2), ptrbb;
+#endif
+XOR_DY yvec15, yvec15, yvec15;
+#ifndef TRMMKERNEL
+MOVQ bk, k;
+#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kk, %rax;
+MOVQ %rax, kkk;
+#else
+MOVQ kk, %rax;
+#ifdef LEFT
+ADDQ $1, %rax;
+#else
+ADDQ $2, %rax;
+#endif
+MOVQ %rax, kkk;
+#endif
+SARQ $2, k;
+JLE .L241_loopE;
+.align 32
+.L241_bodyB:
+BROAD_DX 0*SIZE(ptrba), xvec0;
+LD_DX 0*SIZE(ptrbb), xvec2;
+MUL_DX xvec0, xvec2;
+ADD_DX xvec2, xvec15;
+
+BROAD_DX 1*SIZE(ptrba), xvec1;
+LD_DX 2*SIZE(ptrbb), xvec3;
+MUL_DX xvec1, xvec3;
+ADD_DX xvec3, xvec15;
+
+BROAD_DX 2*SIZE(ptrba), xvec0;
+LD_DX 4*SIZE(ptrbb), xvec2;
+MUL_DX xvec0, xvec2;
+ADD_DX xvec2, xvec15;
+
+BROAD_DX 3*SIZE(ptrba), xvec1;
+LD_DX 6*SIZE(ptrbb), xvec3;
+MUL_DX xvec1, xvec3;
+ADD_DX xvec3, xvec15;
+ADDQ $4*SIZE, ptrba;
+ADDQ $8*SIZE, ptrbb;
+DECQ k;
+JG .L241_bodyB;
+.align 32
+.L241_loopE:
+#ifndef TRMMKERNEL
+TEST $2, bk;
+#else
+MOVQ kkk, %rax;
+TEST $2, %rax;
+#endif
+JLE .L242_loopE;
+.align 32
+.L242_bodyB:
+BROAD_DX 0*SIZE(ptrba), xvec0;
+LD_DX 0*SIZE(ptrbb), xvec2;
+MUL_DX xvec0, xvec2;
+ADD_DX xvec2, xvec15;
+
+BROAD_DX 1*SIZE(ptrba), xvec1;
+LD_DX 2*SIZE(ptrbb), xvec3;
+MUL_DX xvec1, xvec3;
+ADD_DX xvec3, xvec15;
+ADDQ $2*SIZE, ptrba;
+ADDQ $4*SIZE, ptrbb;
+.L242_loopE:
+#ifndef TRMMKERNEL
+TEST $1, bk;
+#else
+MOVQ kkk, %rax;
+TEST $1, %rax;
+#endif
+JLE .L243_loopE;
+.align 32
+.L243_bodyB:
+BROAD_DX 0*SIZE(ptrba), xvec0;
+LD_DX 0*SIZE(ptrbb), xvec2;
+MUL_DX xvec0, xvec2;
+ADD_DX xvec2, xvec15;
+ADDQ $1*SIZE, ptrba;
+ADDQ $2*SIZE, ptrbb;
+
+.L243_loopE:
+BROAD_DX MEMALPHA, xvec7;
+MUL_DX xvec7, xvec15;
+#ifndef TRMMKERNEL
+LDL_DX 0*SIZE(C0), xvec0;
+LDH_DX 0*SIZE(C1), xvec0;
+ADD_DX xvec0, xvec15;
+#endif
+STL_DX xvec15, 0*SIZE(C0);
+STH_DX xvec15, 0*SIZE(C1);
+#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+LEAQ (,%rax, SIZE), %rax;
+ADDQ %rax, ptrba;
+LEAQ (ptrbb, %rax, 2), ptrbb;
+#endif
+#if defined(TRMMKERNEL) && defined(LEFT)
+ADDQ $1, kk;
+#endif
+ADDQ $1*SIZE, C0;
+ADDQ $1*SIZE, C1;
+.L24_loopE:
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ADDQ $2, kk;
+#endif
+MOVQ bk, k;
+SALQ $4, k;
+ADDQ k, bb;
+LEAQ (C, ldc, 2), C;
+.L20_loopE:;
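+# N-remainder (bn & 1): process the final column of B/C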
+TEST $1, bn; # Rn = 1
+JLE .L30_loopE;
+.align 32
+.L30_bodyB:
+#if defined(TRMMKERNEL)&&defined(LEFT)
+MOVQ OFFSET, %rax;
+MOVQ %rax, kk;
+#endif
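+# Loop over 8-row blocks of A for the single remaining column (Rm = 8), accumulating in yvec15/yvec14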
+MOVQ C, C0;
+MOVQ ba, ptrba;
+MOVQ bm, i;
+SARQ $3, i;
+JLE .L31_loopE;
+.align 32
+.L31_bodyB:
+#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bb, ptrbb;
+#else
+MOVQ bb, ptrbb;
+MOVQ kk, %rax
+LEAQ (, %rax, SIZE), %rax;
+LEAQ (ptrba, %rax, 8), ptrba;
+ADDQ %rax, ptrbb;
+#endif
+#### Initial Results Register ####
+XOR_DY yvec15, yvec15, yvec15;
+XOR_DY yvec14, yvec14, yvec14;
+#ifndef TRMMKERNEL
+MOVQ bk, k;
+#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kk, %rax;
+MOVQ %rax, kkk;
+#else
+MOVQ kk, %rax;
+#ifdef LEFT
+ADDQ $8, %rax;
+#else
+ADDQ $1, %rax;
+#endif
+MOVQ %rax, kkk;
+#endif
+SARQ $2, k;
+JLE .L311_loopE;
+.align 32
+.L311_bodyB:
+#### Unroll time 1 ####
+LD_DY 0*SIZE(ptrba), yvec0;
+LD_DY 4*SIZE(ptrba), yvec1;
+BROAD_DY 0*SIZE(ptrbb), yvec2;
+MUL_DY yvec2, yvec0, yvec0;
+ADD_DY yvec0, yvec15, yvec15;
+MUL_DY yvec2, yvec1, yvec1;
+ADD_DY yvec1, yvec14, yvec14;
+
+#### Unroll time 2 ####
+LD_DY 8*SIZE(ptrba), yvec3;
+LD_DY 12*SIZE(ptrba), yvec4;
+BROAD_DY 1*SIZE(ptrbb), yvec5;
+MUL_DY yvec5, yvec3, yvec3;
+ADD_DY yvec3, yvec15, yvec15;
+MUL_DY yvec5, yvec4, yvec4
+ADD_DY yvec4, yvec14, yvec14;
+
+#### Unroll time 3 ####
+LD_DY 16*SIZE(ptrba), yvec0;
+LD_DY 20*SIZE(ptrba), yvec1;
+BROAD_DY 2*SIZE(ptrbb), yvec2;
+MUL_DY yvec2, yvec0, yvec0;
+ADD_DY yvec0, yvec15, yvec15;
+MUL_DY yvec2, yvec1, yvec1;
+ADD_DY yvec1, yvec14, yvec14;
+
+#### Unroll time 4 ####
+LD_DY 24*SIZE(ptrba), yvec3;
+LD_DY 28*SIZE(ptrba), yvec4;
+BROAD_DY 3*SIZE(ptrbb), yvec5;
+MUL_DY yvec5, yvec3, yvec3;
+ADD_DY yvec3, yvec15, yvec15;
+ADDQ $32*SIZE, ptrba;
+MUL_DY yvec5, yvec4, yvec4;
+ADD_DY yvec4, yvec14, yvec14;
+ADDQ $4*SIZE, ptrbb;
+DECQ k;
+JG .L311_bodyB;
+.align 32
+.L311_loopE:
+#ifndef TRMMKERNEL
+TEST $2, bk;
+#else
+MOVQ kkk, %rax;
+TEST $2, %rax;
+#endif
+JLE .L312_loopE;
+.align 32
+.L312_bodyB:
+#### Unroll time 1 ####
+LD_DY 0*SIZE(ptrba), yvec0;
+LD_DY 4*SIZE(ptrba), yvec1;
+BROAD_DY 0*SIZE(ptrbb), yvec2;
+MUL_DY yvec2, yvec0, yvec0;
+ADD_DY yvec0, yvec15, yvec15;
+MUL_DY yvec2, yvec1, yvec1;
+ADD_DY yvec1, yvec14, yvec14;
+
+#### Unroll time 2 ####
+LD_DY 8*SIZE(ptrba), yvec3;
+LD_DY 12*SIZE(ptrba), yvec4;
+BROAD_DY 1*SIZE(ptrbb), yvec5;
+MUL_DY yvec5, yvec3, yvec3;
+ADD_DY yvec3, yvec15, yvec15;
+ADDQ $16*SIZE, ptrba;
+MUL_DY yvec5, yvec4, yvec4
+ADD_DY yvec4, yvec14, yvec14;
+ADDQ $2*SIZE, ptrbb;
+
+.L312_loopE:
+#ifndef TRMMKERNEL
+TEST $1, bk;
+#else
+MOVQ kkk, %rax;
+TEST $1, %rax;
+#endif
+JLE .L313_loopE;
+.align 32
+.L313_bodyB:
+#### Unroll time 1 ####
+LD_DY 0*SIZE(ptrba), yvec0;
+LD_DY 4*SIZE(ptrba), yvec1;
+BROAD_DY 0*SIZE(ptrbb), yvec2;
+MUL_DY yvec2, yvec0, yvec0;
+ADD_DY yvec0, yvec15, yvec15;
+ADDQ $8*SIZE, ptrba;
+MUL_DY yvec2, yvec1, yvec1;
+ADD_DY yvec1, yvec14, yvec14;
+ADDQ $1*SIZE, ptrbb;
+
+.L313_loopE:
+#### Multiply Alpha ####
+BROAD_DY MEMALPHA, yvec7;
+MUL_DY yvec7, yvec15, yvec15;
+MUL_DY yvec7, yvec14, yvec14;
+#### Testing Alignment ####
+MOVQ C0, %rax;
+OR ldc, %rax;
+TEST $15, %rax;
+JNE .L313_loopEx;
+.align 32
+#### Writing Back ####
+EXTRA_DY $1, yvec15, xvec13;
+EXTRA_DY $1, yvec14, xvec12;
+#ifndef TRMMKERNEL
+ADD_DX 0*SIZE(C0), xvec15;
+ADD_DX 2*SIZE(C0), xvec13;
+ADD_DX 4*SIZE(C0), xvec14;
+ADD_DX 6*SIZE(C0), xvec12;
+#endif
+ST_DX xvec15, 0*SIZE(C0);
+ST_DX xvec13, 2*SIZE(C0);
+ST_DX xvec14, 4*SIZE(C0);
+ST_DX xvec12, 6*SIZE(C0);
+#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+LEAQ (,%rax, SIZE), %rax;
+LEAQ (ptrba, %rax, 8), ptrba;
+ADDQ %rax, ptrbb;
+#endif
+#if defined(TRMMKERNEL)&&defined(LEFT)
+ADDQ $8, kk;
+#endif
+ADDQ $8*SIZE, C0;
+DECQ i;
+JG .L31_bodyB;
+JMP .L31_loopE;
+.align 32
+.L313_loopEx:
+EXTRA_DY $1, yvec15, xvec13;
+EXTRA_DY $1, yvec14, xvec12;
+#ifndef TRMMKERNEL
+LDL_DX 0*SIZE(C0), xvec11;
+LDH_DX 1*SIZE(C0), xvec11;
+LDL_DX 2*SIZE(C0), xvec10;
+LDH_DX 3*SIZE(C0), xvec10;
+LDL_DX 4*SIZE(C0), xvec9;
+LDH_DX 5*SIZE(C0), xvec9;
+LDL_DX 6*SIZE(C0), xvec8;
+LDH_DX 7*SIZE(C0), xvec8;
+ADD_DX xvec11, xvec15;
+ADD_DX xvec10, xvec13;
+ADD_DX xvec9, xvec14;
+ADD_DX xvec8, xvec12;
+#endif
+STL_DX xvec15, 0*SIZE(C0);
+STH_DX xvec15, 1*SIZE(C0);
+STL_DX xvec13, 2*SIZE(C0);
+STH_DX xvec13, 3*SIZE(C0);
+STL_DX xvec14, 4*SIZE(C0);
+STH_DX xvec14, 5*SIZE(C0);
+STL_DX xvec12, 6*SIZE(C0);
+STH_DX xvec12, 7*SIZE(C0);
+#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+LEAQ (,%rax, SIZE), %rax;
+LEAQ (ptrba, %rax, 8), ptrba;
+ADDQ %rax, ptrbb;
+#endif
+#if defined(TRMMKERNEL)&&defined(LEFT)
+ADDQ $8, kk;
+#endif
+ADDQ $8*SIZE, C0;
+DECQ i;
+JG .L31_bodyB;
+.L31_loopE:
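+# M-remainder (bm & 4): four rows against the final column (Rm = 4)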
+TEST $4, bm
+JLE .L32_loopE;
+.align 32
+.L32_bodyB:
+#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bb, ptrbb;
+#else
+MOVQ bb, ptrbb;
+MOVQ kk, %rax;
+LEAQ (,%rax, SIZE), %rax;
+LEAQ (ptrba, %rax, 4), ptrba;
+ADDQ %rax, ptrbb;
+#endif
+#### Initial Results Register ####
+XOR_DY yvec15, yvec15, yvec15;
+#ifndef TRMMKERNEL
+MOVQ bk, k;
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kk, %rax;
+MOVQ %rax, kkk;
+#else
+MOVQ kk, %rax;
+#ifdef LEFT
+ADDQ $4, %rax;
+#else
+ADDQ $1, %rax;
+#endif
+MOVQ %rax, kkk
+#endif
+SARQ $2, k;
+JLE .L321_loopE;
+.align 32
+.L321_bodyB:
+LD_DY 0*SIZE(ptrba), yvec0;
+BROAD_DY 0*SIZE(ptrbb), yvec1;
+MUL_DY yvec0, yvec1, yvec1;
+ADD_DY yvec1, yvec15, yvec15;
+
+LD_DY 4*SIZE(ptrba), yvec2;
+BROAD_DY 1*SIZE(ptrbb), yvec3;
+MUL_DY yvec2, yvec3, yvec3;
+ADD_DY yvec3, yvec15, yvec15;
+
+LD_DY 8*SIZE(ptrba), yvec4;
+BROAD_DY 2*SIZE(ptrbb), yvec5;
+MUL_DY yvec4, yvec5, yvec5;
+ADD_DY yvec5, yvec15, yvec15;
+
+LD_DY 12*SIZE(ptrba), yvec6;
+BROAD_DY 3*SIZE(ptrbb), yvec7;
+MUL_DY yvec6, yvec7, yvec7;
+ADD_DY yvec7, yvec15, yvec15;
+ADDQ $16*SIZE, ptrba;
+ADDQ $4*SIZE, ptrbb;
+DECQ k;
+JG .L321_bodyB;
+.align 32
+.L321_loopE:
+#ifndef TRMMKERNEL
+TEST $2, bk;
+#else
+MOVQ kkk, %rax;
+TEST $2, %rax;
+#endif
+JLE .L322_loopE;
+.align 32
+.L322_bodyB:
+LD_DY 0*SIZE(ptrba), yvec0;
+BROAD_DY 0*SIZE(ptrbb), yvec1;
+MUL_DY yvec0, yvec1, yvec1;
+ADD_DY yvec1, yvec15, yvec15;
+
+LD_DY 4*SIZE(ptrba), yvec2;
+BROAD_DY 1*SIZE(ptrbb), yvec3;
+MUL_DY yvec2, yvec3, yvec3;
+ADD_DY yvec3, yvec15, yvec15;
+ADDQ $8*SIZE, ptrba;
+ADDQ $2*SIZE, ptrbb;
+
+.L322_loopE:
+#ifndef TRMMKERNEL
+TEST $1, bk;
+#else
+MOVQ kkk, %rax;
+TEST $1, %rax;
+#endif
+JLE .L323_loopE;
+.align 32
+.L323_bodyB:
+LD_DY 0*SIZE(ptrba), yvec0;
+BROAD_DY 0*SIZE(ptrbb), yvec1;
+MUL_DY yvec0, yvec1, yvec1;
+ADD_DY yvec1, yvec15, yvec15;
+ADDQ $4*SIZE, ptrba;
+ADDQ $1*SIZE, ptrbb;
+
+.L323_loopE:
+#### Multiply Alpha ####
+BROAD_DY MEMALPHA, yvec7;
+MUL_DY yvec7, yvec15, yvec15;
+#### Testing Alignment ####
+MOVQ C0, %rax;
+OR ldc, %rax;
+TEST $15, %rax;
+JNE .L323_loopEx;
+.align 32
+#### Writing Back ####
+EXTRA_DY $1, yvec15, xvec14;
+#ifndef TRMMKERNEL
+ADD_DX 0*SIZE(C0), xvec15;
+ADD_DX 2*SIZE(C0), xvec14;
+#endif
+ST_DX xvec15, 0*SIZE(C0);
+ST_DX xvec14, 2*SIZE(C0);
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+LEAQ (, %rax, SIZE), %rax;
+LEAQ (ptrba, %rax, 4), ptrba;
+ADDQ %rax, ptrbb;
+#endif
+#if defined(TRMMKERNEL) && defined(LEFT)
+ADDQ $4, kk
+#endif
+ADDQ $4*SIZE, C0;
+JMP .L32_loopE;
+.align 32
+.L323_loopEx:
+#### Writing Back ####
+EXTRA_DY $1, yvec15, xvec14;
+#ifndef TRMMKERNEL
+LDL_DX 0*SIZE(C0), xvec13;
+LDH_DX 1*SIZE(C0), xvec13;
+LDL_DX 2*SIZE(C0), xvec12;
+LDH_DX 3*SIZE(C0), xvec12;
+ADD_DX xvec13, xvec15;
+ADD_DX xvec12, xvec14;
+#endif
+STL_DX xvec15, 0*SIZE(C0);
+STH_DX xvec15, 1*SIZE(C0);
+STL_DX xvec14, 2*SIZE(C0);
+STH_DX xvec14, 3*SIZE(C0);
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+LEAQ (, %rax, SIZE), %rax;
+LEAQ (ptrba, %rax, 4), ptrba;
+ADDQ %rax, ptrbb;
+#endif
+#if defined(TRMMKERNEL) && defined(LEFT)
+ADDQ $4, kk
+#endif
+ADDQ $4*SIZE, C0;
+.L32_loopE:
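+# M-remainder (bm & 2): two rows against the final column (Rm = 2)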
+TEST $2, bm
+JLE .L33_loopE;
+.align 32
+.L33_bodyB:
+#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+MOVQ bb, ptrbb;
+#else
+MOVQ bb, ptrbb;
+MOVQ kk, %rax
+LEAQ (, %rax, SIZE), %rax
+LEAQ (ptrba, %rax, 2), ptrba
+ADDQ %rax, ptrbb;
+#endif
+#### Initial Result ####
+XOR_DY yvec15, yvec15, yvec15;
+#ifndef TRMMKERNEL
+MOVQ bk, k;
+#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kk, %rax;
+MOVQ %rax, kkk;
+#else
+MOVQ kk, %rax;
+#ifdef LEFT
+ADDQ $2, %rax;
+#else
+ADDQ $1, %rax;
+#endif
+MOVQ %rax, kkk;
+#endif
+SARQ $2, k;
+JLE .L331_loopE;
+.align 32
+.L331_bodyB:
+LD_DX 0*SIZE(ptrba), xvec0;
+BROAD_DX 0*SIZE(ptrbb), xvec2;
+MUL_DX xvec0, xvec2;
+ADD_DX xvec2, xvec15;
+
+LD_DX 2*SIZE(ptrba), xvec1;
+BROAD_DX 1*SIZE(ptrbb), xvec3;
+MUL_DX xvec1, xvec3;
+ADD_DX xvec3, xvec15;
+
+LD_DX 4*SIZE(ptrba), xvec4;
+BROAD_DX 2*SIZE(ptrbb), xvec5;
+MUL_DX xvec4, xvec5;
+ADD_DX xvec5, xvec15;
+
+LD_DX 6*SIZE(ptrba), xvec6;
+BROAD_DX 3*SIZE(ptrbb), xvec7;
+MUL_DX xvec6, xvec7;
+ADD_DX xvec7, xvec15;
+ADDQ $8*SIZE, ptrba;
+ADDQ $4*SIZE, ptrbb;
+DECQ k;
+JG .L331_bodyB;
+.align 32
+.L331_loopE:
+#ifndef TRMMKERNEL
+TEST $2,bk;
+#else
+MOVQ kkk, %rax;
+TEST $2, %rax
+#endif
+JLE .L332_loopE;
+.align 32
+.L332_bodyB:
+LD_DX 0*SIZE(ptrba), xvec0;
+BROAD_DX 0*SIZE(ptrbb), xvec2;
+MUL_DX xvec0, xvec2;
+ADD_DX xvec2, xvec15;
+
+LD_DX 2*SIZE(ptrba), xvec1;
+BROAD_DX 1*SIZE(ptrbb), xvec3;
+MUL_DX xvec1, xvec3;
+ADD_DX xvec3, xvec15;
+ADDQ $4*SIZE, ptrba;
+ADDQ $2*SIZE, ptrbb;
+.L332_loopE:
+#ifndef TRMMKERNEL
+TEST $1, bk;
+#else
+MOVQ kkk, %rax;
+TEST $1, %rax;
+#endif
+JLE .L333_loopE;
+.align 32
+.L333_bodyB:
+LD_DX 0*SIZE(ptrba), xvec0;
+BROAD_DX 0*SIZE(ptrbb), xvec2;
+MUL_DX xvec0, xvec2;
+ADD_DX xvec2, xvec15;
+ADDQ $2*SIZE, ptrba;
+ADDQ $1*SIZE, ptrbb;
+.L333_loopE:
+#### Multiply Alpha ####
+BROAD_DX MEMALPHA, xvec7;
+MUL_DX xvec7, xvec15;
+#ifndef TRMMKERNEL
+LDL_DX 0*SIZE(C0), xvec14;
+LDH_DX 1*SIZE(C0), xvec14;
+ADD_DX xvec14, xvec15;
+#endif
+STL_DX xvec15, 0*SIZE(C0);
+STH_DX xvec15, 1*SIZE(C0);
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+LEAQ (,%rax, SIZE), %rax;
+LEAQ (ptrba, %rax, 2), ptrba;
+ADDQ %rax, ptrbb;
+#endif
+#if defined(TRMMKERNEL) && defined(LEFT)
+addq $2, kk
+#endif
+ADDQ $2*SIZE, C0;
+.L33_loopE:
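+# M-remainder (bm & 1): scalar 1x1 tail using movsd/mulsd/addsd (Rm = 1)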
+TEST $1, bm
+JLE .L34_loopE;
+.align 32
+.L34_bodyB:
+#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+MOVQ bb, ptrbb;
+#else
+MOVQ bb, ptrbb;
+MOVQ kk, %rax;
+LEAQ (, %rax, SIZE), %rax;
+ADDQ %rax, ptrba;
+ADDQ %rax, ptrbb;
+#endif
+XOR_DY yvec15, yvec15, yvec15;
+#ifndef TRMMKERNEL
+MOVQ bk, k;
+#elif (defined(LEFT)&& !defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kk, %rax;
+MOVQ %rax, kkk;
+#else
+MOVQ kk, %rax;
+#ifdef LEFT
+ADDQ $1, %rax;
+#else
+ADDQ $1, %rax;
+#endif
+MOVQ %rax, kkk;
+#endif
+SARQ $2, k;
+JLE .L341_loopE;
+.align 32
+.L341_bodyB:
+movsd 0*SIZE(ptrba), xvec0;
+movsd 0*SIZE(ptrbb), xvec1;
+mulsd xvec0, xvec1;
+addsd xvec1, xvec15;
+
+movsd 1*SIZE(ptrba), xvec0;
+movsd 1*SIZE(ptrbb), xvec1;
+mulsd xvec0, xvec1;
+addsd xvec1, xvec15;
+
+movsd 2*SIZE(ptrba), xvec0;
+movsd 2*SIZE(ptrbb), xvec1;
+mulsd xvec0, xvec1;
+addsd xvec1, xvec15;
+
+movsd 3*SIZE(ptrba), xvec0;
+movsd 3*SIZE(ptrbb), xvec1;
+mulsd xvec0, xvec1;
+addsd xvec1, xvec15;
+addq $4*SIZE, ptrba;
+addq $4*SIZE, ptrbb;
+decq k;
+JG .L341_bodyB;
+.align 32
+.L341_loopE:
+#ifndef TRMMKERNEL
+TEST $2, bk;
+#else
+MOVQ kkk, %rax;
+TEST $2, %rax;
+#endif
+JLE .L342_loopE;
+.align 32
+.L342_bodyB:
+movsd 0*SIZE(ptrba), xvec0;
+movsd 0*SIZE(ptrbb), xvec1;
+mulsd xvec0, xvec1;
+addsd xvec1, xvec15;
+
+movsd 1*SIZE(ptrba), xvec0;
+movsd 1*SIZE(ptrbb), xvec1;
+mulsd xvec0, xvec1;
+addsd xvec1, xvec15;
+addq $2*SIZE, ptrba;
+addq $2*SIZE, ptrbb;
+
+.L342_loopE:
+#ifndef TRMMKERNEL
+TEST $1, bk
+#else
+MOVQ kkk, %rax;
+TEST $1, %rax;
+#endif
+JLE .L343_loopE;
+.align 32
+.L343_bodyB:
+movsd 0*SIZE(ptrba), xvec0;
+movsd 0*SIZE(ptrbb), xvec1;
+mulsd xvec0, xvec1;
+addsd xvec1, xvec15;
+addq $1*SIZE, ptrba;
+addq $1*SIZE, ptrbb;
+
+.L343_loopE:
+#### Writing Back ####
+movsd MEMALPHA, xvec7;
+mulsd xvec7, xvec15;
+#ifndef TRMMKERNEL
+movsd 0*SIZE(C0), xvec0;
+addsd xvec0, xvec15;
+#endif
+movsd xvec15, 0*SIZE(C0);
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+LEAQ (,%rax, SIZE), %rax;
+ADDQ %rax, ptrba;
+ADDQ %rax, ptrbb;
+#endif
+#if defined(TRMMKERNEL) && defined(LEFT)
+addq $1, kk
+#endif
+addq $1*SIZE, C0;
+.L34_loopE:
+MOVQ bk, k
+SALQ $3, k;
+ADDQ k, bb;
+LEAQ (C, ldc, 1), C;
+
+.L30_loopE:
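+# Epilogue: restore the saved registers (plus rdi, rsi and xmm6-xmm15 under WINDOWS_ABI), free the stack frame and return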
+movq 0(%rsp), %rbx;
+movq 8(%rsp), %rbp;
+movq 16(%rsp), %r12;
+movq 24(%rsp), %r13;
+movq 32(%rsp), %r14;
+movq 40(%rsp), %r15;
+#ifdef WINDOWS_ABI
+ movq 48(%rsp), %rdi
+ movq 56(%rsp), %rsi
+ movups 64(%rsp), %xmm6
+ movups 80(%rsp), %xmm7
+ movups 96(%rsp), %xmm8
+ movups 112(%rsp), %xmm9
+ movups 128(%rsp), %xmm10
+ movups 144(%rsp), %xmm11
+ movups 160(%rsp), %xmm12
+ movups 176(%rsp), %xmm13
+ movups 192(%rsp), %xmm14
+ movups 208(%rsp), %xmm15
+#endif
+addq $STACKSIZE, %rsp;
+ret
+
+EPILOGUE
--- /dev/null
+/*****************************************************************************
+ Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the ISCAS nor the names of its contributors may
+be used to endorse or promote products derived from this software
+without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ **********************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define old_bm %rdi
+#define old_bn %rsi
+#define old_bk %rdx
+
+#define bm %r13
+#define bn %r14
+#define bk %r15
+
+#define ALPHA %xmm0
+#define ba %rcx
+#define bb %r8
+#define C %r9
+#define ldc %r10
+
+#define i %r11
+#define k %rax
+
+#define ptrba %rdi
+#define ptrbb %rsi
+#define C0 %rbx
+#define C1 %rbp
+
+#define prebb %r12
+
+#ifndef WINDOWS_ABI
+
+#define STACKSIZE 128
+
+#define old_ldc 8+STACKSIZE(%rsp)
+#define old_offset 16+STACKSIZE(%rsp)
+#define MEMALPHA 48(%rsp)
+#define j 56(%rsp)
+#define OFFSET 64(%rsp)
+#define kk 72(%rsp)
+#define kkk 80(%rsp)
+
+#else
+
+#define STACKSIZE 512
+
+#define OLD_A 40 + STACKSIZE(%rsp)
+#define OLD_B 48 + STACKSIZE(%rsp)
+#define OLD_C 56 + STACKSIZE(%rsp)
+#define old_ldc 64 + STACKSIZE(%rsp)
+#define old_offset 72 + STACKSIZE(%rsp)
+
+#define MEMALPHA 224(%rsp)
+#define j 232(%rsp)
+#define OFFSET 240(%rsp)
+#define kk 248(%rsp)
+#define kkk 256(%rsp)
+
+#endif
+
+#define PREFETCH0 prefetcht0
+#define PREFETCH1 prefetcht0
+#define PREFETCH2 prefetcht2
+#define PRESIZE 80
+
+#define xvec0 %xmm0
+#define xvec1 %xmm1
+#define xvec2 %xmm2
+#define xvec3 %xmm3
+#define xvec4 %xmm4
+#define xvec5 %xmm5
+#define xvec6 %xmm6
+#define xvec7 %xmm7
+#define xvec8 %xmm8
+#define xvec9 %xmm9
+#define xvec10 %xmm10
+#define xvec11 %xmm11
+#define xvec12 %xmm12
+#define xvec13 %xmm13
+#define xvec14 %xmm14
+#define xvec15 %xmm15
+
+#define yvec0 %ymm0
+#define yvec1 %ymm1
+#define yvec2 %ymm2
+#define yvec3 %ymm3
+#define yvec4 %ymm4
+#define yvec5 %ymm5
+#define yvec6 %ymm6
+#define yvec7 %ymm7
+#define yvec8 %ymm8
+#define yvec9 %ymm9
+#define yvec10 %ymm10
+#define yvec11 %ymm11
+#define yvec12 %ymm12
+#define yvec13 %ymm13
+#define yvec14 %ymm14
+#define yvec15 %ymm15
+
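+# Instruction aliases: roughly, the S/D suffix selects the single/double-precision variant and X/Y selects the SSE (xmm) vs. VEX/AVX encoding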
+#define LEAQ leaq
+#define ADDQ addq
+#define MULQ imulq
+#define SARQ sarq
+#define SALQ salq
+#define ANDQ andq
+#define SUBQ subq
+#define DECQ decq
+#define JG jg
+#define JLE jle
+#define TEST testq
+#define OR orq
+#define JNE jne
+#define JMP jmp
+#define NOP
+#define XOR xorpd
+#define MOVQ movq
+
+#define XOR_SY vxorps
+#define XOR_DY vxorpd
+#define XOR_SX xorps
+#define XOR_DX xorpd
+
+#define LD_SY vmovaps
+#define LD_DY vmovapd
+#define LD_SX movaps
+#define LD_DX movapd
+#define LDL_SX movlps
+#define LDL_SY vmovlps
+#define LDH_SX movhps
+#define LDH_SY vmovhps
+
+#define ST_SY vmovaps
+#define ST_DY vmovapd
+#define ST_SX movaps
+#define ST_DX movapd
+#define STL_SX movlps
+#define STL_SY vmovlps
+#define STH_SX movhps
+#define STH_SY vmovhps
+
+#define EDUP_SY vmovsldup
+#define ODUP_SY vmovshdup
+#define EDUP_SX movsldup
+#define ODUP_SX movshdup
+#define EDUP_DY vmovddup
+
+#define ADD_SY vaddps
+#define ADD_DY vaddpd
+#define ADD_SX addps
+#define ADD_DX addpd
+
+#define ADD1_DY vaddpd
+#define ADD2_DY vaddpd
+#define ADDSUB_DY vaddsubpd
+#define ADDSUB_SY vaddsubps
+
+#define MUL_SY vmulps
+#define MUL_DY vmulpd
+#define MUL_SX mulps
+#define MUL_DX mulpd
+
+#define SHUF_SY vperm2f128
+#define SHUF_DY vperm2f128
+#define SHUF_DX pshufd
+#define SHUF_SX pshufd
+
+#define VPERMILP_SY vpermilps
+#define VPERMILP_SX vpermilps
+#define VPERMILP_DY vpermilpd
+
+#define BROAD_SY vbroadcastss
+#define BROAD_DY vbroadcastsd
+#define BROAD_SX vbroadcastss
+#define BROAD_DX movddup
+
+#define MOV_SY vmovaps
+#define MOV_DY vmovapd
+#define MOV_SX movaps
+#define MOV_DX movapd
+
+#define REVS_SY vshufps
+#define REVS_DY vshufpd
+#define REVS_SX shufps
+#define REVS_DX movsd
+
+#define EXTRA_SY vextractf128
+#define EXTRA_DY vextractf128
+
+
+PROLOGUE
+
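+# Reserve the stack frame and spill the callee-saved registers used below (plus rdi, rsi and xmm6-xmm15 under WINDOWS_ABI)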
+subq $STACKSIZE, %rsp;
+movq %rbx, 0(%rsp);
+movq %rbp, 8(%rsp);
+movq %r12, 16(%rsp);
+movq %r13, 24(%rsp);
+movq %r14, 32(%rsp);
+movq %r15, 40(%rsp);
+
+#ifdef WINDOWS_ABI
+ movq %rdi, 48(%rsp)
+ movq %rsi, 56(%rsp)
+ movups %xmm6, 64(%rsp)
+ movups %xmm7, 80(%rsp)
+ movups %xmm8, 96(%rsp)
+ movups %xmm9, 112(%rsp)
+ movups %xmm10, 128(%rsp)
+ movups %xmm11, 144(%rsp)
+ movups %xmm12, 160(%rsp)
+ movups %xmm13, 176(%rsp)
+ movups %xmm14, 192(%rsp)
+ movups %xmm15, 208(%rsp)
+
+ movq ARG1, old_bm
+ movq ARG2, old_bn
+ movq ARG3, old_bk
+ movq OLD_A, ba
+ movq OLD_B, bb
+ movq OLD_C, C
+ movq old_ldc, ldc
+#ifdef TRMMKERNEL
+ movq old_offset, %r11
+#endif
+#else
+
+movq old_ldc, ldc
+#ifdef TRMMKERNEL
+movq old_offset, %r11
+#endif
+#endif
+
+vmovlps ALPHA, MEMALPHA
+movq old_bm, bm
+movq old_bn, bn
+movq old_bk, bk
+leaq (, ldc, SIZE), ldc
+#ifdef TRMMKERNEL
+movq %r11, OFFSET
+#ifndef LEFT
+negq %r11;
+#endif
+movq %r11, kk
+#endif
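+# Outer loop over 8-column panels of B (j = bn >> 3); each iteration writes an 8-column strip of C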
+MOVQ bn,j;
+SARQ $3,j;
+JLE .L0_loopE;
+.align 16;
+.L0_bodyB:;
+#if defined(TRMMKERNEL) && defined(LEFT)
+MOVQ OFFSET, %rax;
+MOVQ %rax, kk;
+#endif
+
+MOVQ C,C0;
+LEAQ (C,ldc,4),C1;
+MOVQ bk, k;
+SALQ $5, k;
+LEAQ (bb, k, 1), prebb;
+MOVQ ba,ptrba;
+MOVQ bm,i;
+SARQ $3,i;
+JLE .L1_loopE;
+.align 16;
+.L1_bodyB:;
+#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bb, ptrbb;
+#else
+MOVQ bb, ptrbb;
+MOVQ kk, %rax;
+LEAQ (, %rax, SIZE), %rax;
+LEAQ (ptrba, %rax, 8), ptrba;
+LEAQ (ptrbb, %rax, 8), ptrbb;
+#endif
+#### Initial Results Register ####
+XOR_SY yvec15, yvec15, yvec15;
+PREFETCH0 0*SIZE(prebb);
+XOR_SY yvec14, yvec14, yvec14;
+PREFETCH0 16*SIZE(prebb);
+XOR_SY yvec13, yvec13, yvec13;
+PREFETCH0 32*SIZE(prebb);
+XOR_SY yvec12, yvec12, yvec12;
+ADDQ $48*SIZE, prebb;
+EDUP_SY 0*SIZE(ptrbb), yvec2;
+LEAQ (ldc, ldc, 2), %rax;
+PREFETCH2 7*SIZE(C0);
+PREFETCH2 7*SIZE(C1);
+XOR_SY yvec11, yvec11, yvec11;
+XOR_SY yvec10, yvec10, yvec10;
+PREFETCH2 7*SIZE(C0, ldc, 1);
+PREFETCH2 7*SIZE(C1, ldc, 1);
+LD_SY 0*SIZE(ptrba), yvec0;
+XOR_SY yvec9, yvec9, yvec9;
+PREFETCH2 7*SIZE(C0, ldc, 2);
+PREFETCH2 7*SIZE(C1, ldc, 2);
+XOR_SY yvec8, yvec8, yvec8;
+VPERMILP_SY $0x4e, yvec2, yvec3;
+PREFETCH2 7*SIZE(C0, %rax, 1);
+PREFETCH2 7*SIZE(C1, %rax, 1);
+#ifndef TRMMKERNEL
+MOVQ bk,k;
+#elif (defined(LEFT) && !defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kk, %rax;
+MOVQ %rax, kkk;
+#else
+MOVQ kk, %rax;
+#ifdef LEFT
+ADDQ $8, %rax;
+#else
+ADDQ $8, %rax;
+#endif
+MOVQ %rax, kkk;
+#endif
+SARQ $2,k;
+JLE .L2_loopE;
+.align 16;
+.L2_bodyB:;
+# Computing kernel
+
+#### Unroll times 1 ####
+PREFETCH0 PRESIZE*SIZE(ptrba);
+MUL_SY yvec0, yvec2, yvec6;
+SHUF_SY $0x03, yvec2, yvec2, yvec4;
+ODUP_SY 0*SIZE(ptrbb), yvec2
+MUL_SY yvec0, yvec3, yvec7;
+SHUF_SY $0x03, yvec3, yvec3, yvec5
+ADD_SY yvec15, yvec6, yvec15
+ADD_SY yvec13, yvec7, yvec13;
+
+LD_SY 8*SIZE(ptrba), yvec1;
+VPERMILP_SY $0x4e, yvec2, yvec3;
+MUL_SY yvec0, yvec4, yvec6;
+MUL_SY yvec0, yvec5, yvec7;
+ADD_SY yvec11, yvec6, yvec11;
+ADD_SY yvec9, yvec7, yvec9;
+
+MUL_SY yvec0, yvec2, yvec6;
+SHUF_SY $0x03, yvec2, yvec2, yvec4;
+EDUP_SY 8*SIZE(ptrbb), yvec2;
+MUL_SY yvec0, yvec3, yvec7;
+SHUF_SY $0x03, yvec3, yvec3, yvec5;
+ADD_SY yvec14, yvec6, yvec14;
+ADD_SY yvec12, yvec7, yvec12;
+
+VPERMILP_SY $0x4e, yvec2, yvec3;
+MUL_SY yvec0, yvec4, yvec6;
+MUL_SY yvec0, yvec5, yvec7;
+ADD_SY yvec10, yvec6, yvec10;
+ADD_SY yvec8, yvec7, yvec8;
+
+#### Unroll times 2 ####
+MUL_SY yvec1, yvec2, yvec6;
+SHUF_SY $0x03, yvec2, yvec2, yvec4;
+ODUP_SY 8*SIZE(ptrbb), yvec2
+MUL_SY yvec1, yvec3, yvec7;
+SHUF_SY $0x03, yvec3, yvec3, yvec5
+ADD_SY yvec15, yvec6, yvec15
+ADD_SY yvec13, yvec7, yvec13;
+
+LD_SY 16*SIZE(ptrba), yvec0;
+VPERMILP_SY $0x4e, yvec2, yvec3;
+MUL_SY yvec1, yvec4, yvec6;
+MUL_SY yvec1, yvec5, yvec7;
+ADD_SY yvec11, yvec6, yvec11;
+ADD_SY yvec9, yvec7, yvec9;
+
+MUL_SY yvec1, yvec2, yvec6;
+SHUF_SY $0x03, yvec2, yvec2, yvec4;
+EDUP_SY 16*SIZE(ptrbb), yvec2;
+MUL_SY yvec1, yvec3, yvec7;
+SHUF_SY $0x03, yvec3, yvec3, yvec5;
+ADD_SY yvec14, yvec6, yvec14;
+ADD_SY yvec12, yvec7, yvec12;
+
+VPERMILP_SY $0x4e, yvec2, yvec3;
+MUL_SY yvec1, yvec4, yvec6;
+MUL_SY yvec1, yvec5, yvec7;
+ADD_SY yvec10, yvec6, yvec10;
+ADD_SY yvec8, yvec7, yvec8;
+
+#### Unroll times 3 ####
+PREFETCH0 (PRESIZE+16)*SIZE(ptrba);
+MUL_SY yvec0, yvec2, yvec6;
+SHUF_SY $0x03, yvec2, yvec2, yvec4;
+ODUP_SY 16*SIZE(ptrbb), yvec2
+MUL_SY yvec0, yvec3, yvec7;
+SHUF_SY $0x03, yvec3, yvec3, yvec5
+ADD_SY yvec15, yvec6, yvec15
+ADD_SY yvec13, yvec7, yvec13;
+
+LD_SY 24*SIZE(ptrba), yvec1;
+VPERMILP_SY $0x4e, yvec2, yvec3;
+MUL_SY yvec0, yvec4, yvec6;
+MUL_SY yvec0, yvec5, yvec7;
+ADD_SY yvec11, yvec6, yvec11;
+ADD_SY yvec9, yvec7, yvec9;
+ADDQ $32*SIZE, ptrba;
+
+MUL_SY yvec0, yvec2, yvec6;
+SHUF_SY $0x03, yvec2, yvec2, yvec4;
+MUL_SY yvec0, yvec3, yvec7;
+SHUF_SY $0x03, yvec3, yvec3, yvec5;
+EDUP_SY 24*SIZE(ptrbb), yvec2;
+ADD_SY yvec14, yvec6, yvec14;
+ADD_SY yvec12, yvec7, yvec12;
+
+VPERMILP_SY $0x4e, yvec2, yvec3;
+MUL_SY yvec0, yvec4, yvec6;
+MUL_SY yvec0, yvec5, yvec7;
+ADD_SY yvec10, yvec6, yvec10;
+ADD_SY yvec8, yvec7, yvec8;
+
+#### Unroll times 4 ####
+MUL_SY yvec1, yvec2, yvec6;
+SHUF_SY $0x03, yvec2, yvec2, yvec4;
+ODUP_SY 24*SIZE(ptrbb), yvec2
+MUL_SY yvec1, yvec3, yvec7;
+SHUF_SY $0x03, yvec3, yvec3, yvec5
+ADDQ $32*SIZE, ptrbb;
+ADD_SY yvec15, yvec6, yvec15
+ADD_SY yvec13, yvec7, yvec13;
+
+LD_SY 0*SIZE(ptrba), yvec0;
+VPERMILP_SY $0x4e, yvec2, yvec3;
+MUL_SY yvec1, yvec4, yvec6;
+MUL_SY yvec1, yvec5, yvec7;
+ADD_SY yvec11, yvec6, yvec11;
+ADD_SY yvec9, yvec7, yvec9;
+
+MUL_SY yvec1, yvec2, yvec6;
+SHUF_SY $0x03, yvec2, yvec2, yvec4;
+EDUP_SY 0*SIZE(ptrbb), yvec2;
+MUL_SY yvec1, yvec3, yvec7;
+SHUF_SY $0x03, yvec3, yvec3, yvec5;
+ADD_SY yvec14, yvec6, yvec14;
+ADD_SY yvec12, yvec7, yvec12;
+
+VPERMILP_SY $0x4e, yvec2, yvec3;
+MUL_SY yvec1, yvec4, yvec6;
+MUL_SY yvec1, yvec5, yvec7;
+ADD_SY yvec10, yvec6, yvec10;
+ADD_SY yvec8, yvec7, yvec8;
+.L2_bodyE:;
+DECQ k;
+JG .L2_bodyB;
+.align 64;
+.L2_loopE:;
+#ifndef TRMMKERNEL
+TEST $2, bk;
+#else
+TEST $2, kkk;
+#endif
+JLE .L3_loopE;
+.align 64
+.L3_loopB:
+#### Unroll times 1 ####
+MUL_SY yvec0, yvec2, yvec6;
+SHUF_SY $0x03, yvec2, yvec2, yvec4;
+ODUP_SY 0*SIZE(ptrbb), yvec2
+MUL_SY yvec0, yvec3, yvec7;
+SHUF_SY $0x03, yvec3, yvec3, yvec5
+ADD_SY yvec15, yvec6, yvec15
+ADD_SY yvec13, yvec7, yvec13;
+
+LD_SY 8*SIZE(ptrba), yvec1;
+VPERMILP_SY $0x4e, yvec2, yvec3;
+MUL_SY yvec0, yvec4, yvec6;
+MUL_SY yvec0, yvec5, yvec7;
+ADDQ $16*SIZE, ptrba;
+ADD_SY yvec11, yvec6, yvec11;
+ADD_SY yvec9, yvec7, yvec9;
+
+MUL_SY yvec0, yvec2, yvec6;
+SHUF_SY $0x03, yvec2, yvec2, yvec4;
+EDUP_SY 8*SIZE(ptrbb), yvec2;
+MUL_SY yvec0, yvec3, yvec7;
+SHUF_SY $0x03, yvec3, yvec3, yvec5;
+ADD_SY yvec14, yvec6, yvec14;
+ADD_SY yvec12, yvec7, yvec12;
+
+VPERMILP_SY $0x4e, yvec2, yvec3;
+MUL_SY yvec0, yvec4, yvec6;
+MUL_SY yvec0, yvec5, yvec7;
+ADD_SY yvec10, yvec6, yvec10;
+ADD_SY yvec8, yvec7, yvec8;
+
+#### Unroll times 2 ####
+MUL_SY yvec1, yvec2, yvec6;
+SHUF_SY $0x03, yvec2, yvec2, yvec4;
+ODUP_SY 8*SIZE(ptrbb), yvec2
+MUL_SY yvec1, yvec3, yvec7;
+SHUF_SY $0x03, yvec3, yvec3, yvec5
+ADDQ $16*SIZE, ptrbb
+ADD_SY yvec15, yvec6, yvec15
+ADD_SY yvec13, yvec7, yvec13;
+
+LD_SY 0*SIZE(ptrba), yvec0;
+VPERMILP_SY $0x4e, yvec2, yvec3;
+MUL_SY yvec1, yvec4, yvec6;
+MUL_SY yvec1, yvec5, yvec7;
+ADD_SY yvec11, yvec6, yvec11;
+ADD_SY yvec9, yvec7, yvec9;
+
+MUL_SY yvec1, yvec2, yvec6;
+SHUF_SY $0x03, yvec2, yvec2, yvec4;
+EDUP_SY 0*SIZE(ptrbb), yvec2;
+MUL_SY yvec1, yvec3, yvec7;
+SHUF_SY $0x03, yvec3, yvec3, yvec5;
+ADD_SY yvec14, yvec6, yvec14;
+ADD_SY yvec12, yvec7, yvec12;
+
+VPERMILP_SY $0x4e, yvec2, yvec3;
+MUL_SY yvec1, yvec4, yvec6;
+MUL_SY yvec1, yvec5, yvec7;
+ADD_SY yvec10, yvec6, yvec10;
+ADD_SY yvec8, yvec7, yvec8;
+.L3_loopE:
+#ifndef TRMMKERNEL
+TEST $1, bk;
+#else
+TEST $1, kkk;
+#endif
+JLE .L4_loopE;
+.align 64
+.L4_loopB:;
+#### Unroll times 1 ####
+MUL_SY yvec0, yvec2, yvec6;
+SHUF_SY $0x03, yvec2, yvec2, yvec4;
+ODUP_SY 0*SIZE(ptrbb), yvec2
+MUL_SY yvec0, yvec3, yvec7;
+SHUF_SY $0x03, yvec3, yvec3, yvec5
+ADDQ $8*SIZE, ptrba;
+ADD_SY yvec15, yvec6, yvec15
+ADD_SY yvec13, yvec7, yvec13;
+
+VPERMILP_SY $0x4e, yvec2, yvec3;
+MUL_SY yvec0, yvec4, yvec6;
+MUL_SY yvec0, yvec5, yvec7;
+ADDQ $8*SIZE, ptrbb;
+ADD_SY yvec11, yvec6, yvec11;
+ADD_SY yvec9, yvec7, yvec9;
+
+MUL_SY yvec0, yvec2, yvec6;
+SHUF_SY $0x03, yvec2, yvec2, yvec4;
+MUL_SY yvec0, yvec3, yvec7;
+SHUF_SY $0x03, yvec3, yvec3, yvec5;
+ADD_SY yvec14, yvec6, yvec14;
+ADD_SY yvec12, yvec7, yvec12;
+
+MUL_SY yvec0, yvec4, yvec6;
+MUL_SY yvec0, yvec5, yvec7;
+ADD_SY yvec10, yvec6, yvec10;
+ADD_SY yvec8, yvec7, yvec8;
+
+.L4_loopE:;
+#### Load Alpha ####
+BROAD_SY MEMALPHA,yvec7;
+MUL_SY yvec7,yvec15,yvec15;
+MUL_SY yvec7,yvec14,yvec14;
+MUL_SY yvec7,yvec13,yvec13;
+MUL_SY yvec7,yvec12,yvec12;
+MUL_SY yvec7,yvec11,yvec11;
+MUL_SY yvec7,yvec10,yvec10;
+MUL_SY yvec7,yvec9,yvec9;
+MUL_SY yvec7,yvec8,yvec8;
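+#### Reorder Results ####
+# REVS_SY appears to swap element groups between the paired accumulators
+# (15/13, 14/12, 11/9, 10/8), undoing the rotation used while accumulating so
+# each register again holds a contiguous slice of C before the write-back.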
+MOV_SY yvec15,yvec7;
+REVS_SY $0xe4,yvec13,yvec15,yvec15;
+REVS_SY $0xe4,yvec7,yvec13,yvec13;
+MOV_SY yvec14,yvec7;
+REVS_SY $0xe4,yvec12,yvec14,yvec14;
+REVS_SY $0xe4,yvec7,yvec12,yvec12;
+MOV_SY yvec11,yvec7;
+REVS_SY $0xe4,yvec9,yvec11,yvec11;
+REVS_SY $0xe4,yvec7,yvec9,yvec9;
+MOV_SY yvec10,yvec7;
+REVS_SY $0xe4,yvec8,yvec10,yvec10;
+REVS_SY $0xe4,yvec7,yvec8,yvec8;
+#### Testing Alignment ####
+MOVQ C0, %rax;
+OR ldc, %rax;
+TEST $15, %rax;
+JNE .L4_loopEx;
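+# Fast path: C0 and ldc are 16-byte aligned, so the tile is written back with
+# full-vector ADD_SY/ST_SY; otherwise .L4_loopEx below falls back to split
+# low/high (LDL/LDH, STL/STH) accesses.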
+.align 16
+LEAQ (ldc,ldc,2),%rax;
+EXTRA_SY $1,yvec15,xvec7;
+EXTRA_SY $1,yvec14,xvec6;
+EXTRA_SY $1,yvec13,xvec5;
+EXTRA_SY $1,yvec12,xvec4;
+EXTRA_SY $1,yvec11,xvec3;
+EXTRA_SY $1,yvec10,xvec2;
+EXTRA_SY $1,yvec9,xvec1;
+EXTRA_SY $1,yvec8,xvec0;
+#ifndef TRMMKERNEL
+ADD_SY 0*SIZE(C0), xvec15, xvec15;
+ADD_SY 4*SIZE(C1), xvec7, xvec7;
+ADD_SY 0*SIZE(C0,ldc,1), xvec14, xvec14;
+ADD_SY 4*SIZE(C1,ldc,1), xvec6, xvec6;
+ADD_SY 0*SIZE(C0,ldc,2), xvec13, xvec13;
+ADD_SY 4*SIZE(C1,ldc,2), xvec5, xvec5;
+ADD_SY 0*SIZE(C0,%rax,1), xvec12, xvec12;
+ADD_SY 4*SIZE(C1,%rax,1), xvec4, xvec4;
+ADD_SY 0*SIZE(C1), xvec11, xvec11;
+ADD_SY 4*SIZE(C0), xvec3, xvec3;
+ADD_SY 0*SIZE(C1,ldc,1), xvec10, xvec10;
+ADD_SY 4*SIZE(C0,ldc,1), xvec2, xvec2;
+ADD_SY 0*SIZE(C1,ldc,2), xvec9, xvec9;
+ADD_SY 4*SIZE(C0,ldc,2), xvec1, xvec1;
+ADD_SY 0*SIZE(C1,%rax,1), xvec8, xvec8;
+ADD_SY 4*SIZE(C0,%rax,1), xvec0, xvec0;
+#endif
+ST_SY xvec15,0*SIZE(C0);
+ST_SY xvec7,4*SIZE(C1);
+ST_SY xvec14,0*SIZE(C0,ldc,1);
+ST_SY xvec6,4*SIZE(C1,ldc,1);
+ST_SY xvec13,0*SIZE(C0,ldc,2);
+ST_SY xvec5,4*SIZE(C1,ldc,2);
+ST_SY xvec12,0*SIZE(C0,%rax,1);
+ST_SY xvec4,4*SIZE(C1,%rax,1);
+ST_SY xvec11,0*SIZE(C1);
+ST_SY xvec3,4*SIZE(C0);
+ST_SY xvec10,0*SIZE(C1,ldc,1);
+ST_SY xvec2,4*SIZE(C0,ldc,1);
+ST_SY xvec9,0*SIZE(C1,ldc,2);
+ST_SY xvec1,4*SIZE(C0,ldc,2);
+ST_SY xvec8,0*SIZE(C1,%rax,1);
+ST_SY xvec0,4*SIZE(C0,%rax,1);
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+LEAQ (, %rax, SIZE), %rax;
+LEAQ (ptrba, %rax, 8), ptrba;
+LEAQ (ptrbb, %rax, 8), ptrbb;
+#endif
+#if defined(TRMMKERNEL) && defined(LEFT)
+ADDQ $8, kk
+#endif
+ADDQ $8*SIZE,C0;
+ADDQ $8*SIZE,C1;
+.L1_bodyE:;
+DECQ i;
+JG .L1_bodyB;
+JMP .L1_loopE;
+.align 16;
+.L4_loopEx:
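+# Unaligned write-back of the 8x8 tile: each accumulator is combined with C
+# through low/high half loads and stores.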
+LEAQ (ldc,ldc,2),%rax;
+EXTRA_SY $1, yvec15, xvec7;
+#ifndef TRMMKERNEL
+LDL_SY 0*SIZE(C0), xvec6, xvec6;
+LDH_SY 2*SIZE(C0), xvec6, xvec6;
+ADD_SY xvec6, xvec15, xvec15;
+#endif
+STL_SY xvec15, 0*SIZE(C0);
+STH_SY xvec15, 2*SIZE(C0);
+#ifndef TRMMKERNEL
+LDL_SY 4*SIZE(C1), xvec5, xvec5;
+LDH_SY 6*SIZE(C1), xvec5, xvec5;
+ADD_SY xvec5, xvec7, xvec7;
+#endif
+STL_SY xvec7, 4*SIZE(C1);
+STH_SY xvec7, 6*SIZE(C1);
+
+EXTRA_SY $1, yvec14, xvec6;
+#ifndef TRMMKERNEL
+LDL_SY 0*SIZE(C0, ldc, 1), xvec5, xvec5;
+LDH_SY 2*SIZE(C0, ldc, 1), xvec5, xvec5;
+ADD_SY xvec5, xvec14, xvec14;
+#endif
+STL_SY xvec14, 0*SIZE(C0, ldc, 1);
+STH_SY xvec14, 2*SIZE(C0, ldc, 1);
+#ifndef TRMMKERNEL
+LDL_SY 4*SIZE(C1, ldc, 1), xvec4, xvec4;
+LDH_SY 6*SIZE(C1, ldc, 1), xvec4, xvec4;
+ADD_SY xvec4, xvec6, xvec6;
+#endif
+STL_SY xvec6, 4*SIZE(C1, ldc, 1);
+STH_SY xvec6, 6*SIZE(C1, ldc, 1);
+
+EXTRA_SY $1, yvec13, xvec5;
+#ifndef TRMMKERNEL
+LDL_SY 0*SIZE(C0, ldc, 2), xvec4, xvec4;
+LDH_SY 2*SIZE(C0, ldc, 2), xvec4, xvec4;
+ADD_SY xvec4, xvec13, xvec13;
+#endif
+STL_SY xvec13, 0*SIZE(C0, ldc, 2);
+STH_SY xvec13, 2*SIZE(C0, ldc, 2);
+#ifndef TRMMKERNEL
+LDL_SY 4*SIZE(C1, ldc, 2), xvec3, xvec3;
+LDH_SY 6*SIZE(C1, ldc, 2), xvec3, xvec3;
+ADD_SY xvec3, xvec5, xvec5;
+#endif
+STL_SY xvec5, 4*SIZE(C1, ldc, 2);
+STH_SY xvec5, 6*SIZE(C1, ldc, 2);
+
+EXTRA_SY $1, yvec12, xvec4;
+#ifndef TRMMKERNEL
+LDL_SY 0*SIZE(C0, %rax, 1), xvec3, xvec3;
+LDH_SY 2*SIZE(C0, %rax, 1), xvec3, xvec3;
+ADD_SY xvec3, xvec12, xvec12;
+#endif
+STL_SY xvec12, 0*SIZE(C0, %rax, 1);
+STH_SY xvec12, 2*SIZE(C0, %rax, 1);
+#ifndef TRMMKERNEL
+LDL_SY 4*SIZE(C1, %rax, 1), xvec2, xvec2;
+LDH_SY 6*SIZE(C1, %rax, 1), xvec2, xvec2;
+ADD_SY xvec2, xvec4, xvec4;
+#endif
+STL_SY xvec4, 4*SIZE(C1, %rax, 1);
+STH_SY xvec4, 6*SIZE(C1, %rax, 1);
+
+EXTRA_SY $1, yvec11, xvec3;
+#ifndef TRMMKERNEL
+LDL_SY 0*SIZE(C1), xvec2, xvec2;
+LDH_SY 2*SIZE(C1), xvec2, xvec2;
+ADD_SY xvec2, xvec11, xvec11;
+#endif
+STL_SY xvec11, 0*SIZE(C1);
+STH_SY xvec11, 2*SIZE(C1);
+#ifndef TRMMKERNEL
+LDL_SY 4*SIZE(C0), xvec1, xvec1;
+LDH_SY 6*SIZE(C0), xvec1, xvec1;
+ADD_SY xvec1, xvec3, xvec3;
+#endif
+STL_SY xvec3, 4*SIZE(C0);
+STH_SY xvec3, 6*SIZE(C0);
+
+EXTRA_SY $1, yvec10, xvec2;
+#ifndef TRMMKERNEL
+LDL_SY 0*SIZE(C1, ldc, 1), xvec1, xvec1;
+LDH_SY 2*SIZE(C1, ldc, 1), xvec1, xvec1;
+ADD_SY xvec1, xvec10, xvec10;
+#endif
+STL_SY xvec10, 0*SIZE(C1, ldc, 1);
+STH_SY xvec10, 2*SIZE(C1, ldc, 1);
+#ifndef TRMMKERNEL
+LDL_SY 4*SIZE(C0, ldc, 1), xvec0, xvec0;
+LDH_SY 6*SIZE(C0, ldc, 1), xvec0, xvec0;
+ADD_SY xvec0, xvec2, xvec2;
+#endif
+STL_SY xvec2, 4*SIZE(C0, ldc, 1);
+STH_SY xvec2, 6*SIZE(C0, ldc, 1);
+
+EXTRA_SY $1, yvec9, xvec1;
+#ifndef TRMMKERNEL
+LDL_SY 0*SIZE(C1, ldc, 2), xvec0, xvec0;
+LDH_SY 2*SIZE(C1, ldc, 2), xvec0, xvec0;
+ADD_SY xvec0, xvec9, xvec9;
+#endif
+STL_SY xvec9, 0*SIZE(C1, ldc, 2);
+STH_SY xvec9, 2*SIZE(C1, ldc, 2);
+#ifndef TRMMKERNEL
+LDL_SY 4*SIZE(C0, ldc, 2), xvec7, xvec7;
+LDH_SY 6*SIZE(C0, ldc, 2), xvec7, xvec7;
+ADD_SY xvec7, xvec1, xvec1;
+#endif
+STL_SY xvec1, 4*SIZE(C0, ldc, 2);
+STH_SY xvec1, 6*SIZE(C0, ldc, 2);
+
+EXTRA_SY $1, yvec8, xvec0;
+#ifndef TRMMKERNEL
+LDL_SY 0*SIZE(C1, %rax, 1), xvec6, xvec6;
+LDH_SY 2*SIZE(C1, %rax, 1), xvec6, xvec6;
+ADD_SY xvec6, xvec8, xvec8;
+#endif
+STL_SY xvec8, 0*SIZE(C1, %rax, 1);
+STH_SY xvec8, 2*SIZE(C1, %rax, 1);
+#ifndef TRMMKERNEL
+LDL_SY 4*SIZE(C0, %rax, 1), xvec5, xvec5;
+LDH_SY 6*SIZE(C0, %rax, 1), xvec5, xvec5;
+ADD_SY xvec5, xvec0, xvec0;
+#endif
+STL_SY xvec0, 4*SIZE(C0, %rax, 1);
+STH_SY xvec0, 6*SIZE(C0, %rax, 1);
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+LEAQ (, %rax, SIZE), %rax;
+LEAQ (ptrba, %rax, 8), ptrba;
+LEAQ (ptrbb, %rax, 8), ptrbb;
+#endif
+#if defined(TRMMKERNEL) && defined(LEFT)
+ADDQ $8, kk
+#endif
+ADDQ $8*SIZE, C0;
+ADDQ $8*SIZE, C1;
+DECQ i;
+JG .L1_bodyB;
+.align 16
+.L1_loopE:;
+TEST $4, bm;
+JLE .L5_loopE;
+.align 16
+.L5_bodyB:
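+# M-remainder: 4 rows of A against the current 8 columns of B (4x8 tile),
+# computed with 128-bit xvec registers; xvec8-xvec15 hold the results.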
+#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bb, ptrbb;
+#else
+MOVQ bb, ptrbb;
+MOVQ kk, %rax;
+LEAQ (, %rax, SIZE), %rax;
+LEAQ (ptrba, %rax, 4), ptrba;
+LEAQ (ptrbb, %rax, 8), ptrbb;
+#endif
+#### Initial Results Register ####
+XOR_SY yvec15, yvec15, yvec15;
+XOR_SY yvec14, yvec14, yvec14;
+XOR_SY yvec13, yvec13, yvec13;
+XOR_SY yvec12, yvec12, yvec12;
+LD_SX 0*SIZE(ptrba), xvec0;
+XOR_SY yvec11, yvec11, yvec11;
+XOR_SY yvec10, yvec10, yvec10;
+EDUP_SX 0*SIZE(ptrbb), xvec2;
+XOR_SY yvec9, yvec9, yvec9;
+XOR_SY yvec8, yvec8, yvec8;
+ODUP_SX 0*SIZE(ptrbb), xvec3;
+#ifndef TRMMKERNEL
+MOVQ bk, k;
+#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kk, %rax;
+MOVQ %rax, kkk;
+#else
+MOVQ kk, %rax;
+#ifdef LEFT
+ADDQ $4, %rax;
+#else
+ADDQ $8, %rax;
+#endif
+MOVQ %rax, kkk;
+#endif
+SARQ $2, k;
+JLE .L8_loopE;
+.align 16
+.L8_bodyB:
+
+#### Unroll time 1 ####
+SHUF_SX $0x4e, xvec2, xvec4;
+MUL_SX xvec0, xvec2;
+ADD_SX xvec2, xvec15;
+SHUF_SX $0x4e, xvec3, xvec5;
+MUL_SX xvec0, xvec3;
+ADD_SX xvec3, xvec14;
+
+EDUP_SX 4*SIZE(ptrbb), xvec2;
+MUL_SX xvec0, xvec4;
+ADD_SX xvec4, xvec13;
+ODUP_SX 4*SIZE(ptrbb), xvec3;
+MUL_SX xvec0, xvec5;
+ADD_SX xvec5, xvec12;
+
+LD_SX 4*SIZE(ptrba), xvec1;
+SHUF_SX $0x4e, xvec2, xvec4;
+MUL_SX xvec0, xvec2;
+ADD_SX xvec2, xvec11;
+SHUF_SX $0x4e, xvec3, xvec5;
+MUL_SX xvec0, xvec3;
+ADD_SX xvec3, xvec10;
+
+EDUP_SX 8*SIZE(ptrbb), xvec2;
+MUL_SX xvec0, xvec4;
+ADD_SX xvec4, xvec9;
+ODUP_SX 8*SIZE(ptrbb), xvec3;
+MUL_SX xvec0, xvec5;
+ADD_SX xvec5, xvec8;
+
+#### Unroll time 2 ####
+SHUF_SX $0x4e, xvec2, xvec4;
+MUL_SX xvec1, xvec2;
+ADD_SX xvec2, xvec15;
+SHUF_SX $0x4e, xvec3, xvec5;
+MUL_SX xvec1, xvec3;
+ADD_SX xvec3, xvec14;
+
+EDUP_SX 12*SIZE(ptrbb), xvec2;
+MUL_SX xvec1, xvec4;
+ADD_SX xvec4, xvec13;
+ODUP_SX 12*SIZE(ptrbb), xvec3;
+MUL_SX xvec1, xvec5;
+ADD_SX xvec5, xvec12;
+
+LD_SX 8*SIZE(ptrba), xvec0;
+SHUF_SX $0x4e, xvec2, xvec4;
+MUL_SX xvec1, xvec2;
+ADD_SX xvec2, xvec11;
+SHUF_SX $0x4e, xvec3, xvec5;
+MUL_SX xvec1, xvec3;
+ADD_SX xvec3, xvec10;
+
+EDUP_SX 16*SIZE(ptrbb), xvec2;
+MUL_SX xvec1, xvec4;
+ADD_SX xvec4, xvec9;
+ODUP_SX 16*SIZE(ptrbb), xvec3;
+MUL_SX xvec1, xvec5;
+ADD_SX xvec5, xvec8;
+
+#### Unroll time 3 ####
+SHUF_SX $0x4e, xvec2, xvec4;
+MUL_SX xvec0, xvec2;
+ADD_SX xvec2, xvec15;
+SHUF_SX $0x4e, xvec3, xvec5;
+MUL_SX xvec0, xvec3;
+ADD_SX xvec3, xvec14;
+
+EDUP_SX 20*SIZE(ptrbb), xvec2;
+MUL_SX xvec0, xvec4;
+ADD_SX xvec4, xvec13;
+ODUP_SX 20*SIZE(ptrbb), xvec3;
+MUL_SX xvec0, xvec5;
+ADD_SX xvec5, xvec12;
+
+LD_SX 12*SIZE(ptrba), xvec1;
+SHUF_SX $0x4e, xvec2, xvec4;
+MUL_SX xvec0, xvec2;
+ADD_SX xvec2, xvec11;
+SHUF_SX $0x4e, xvec3, xvec5;
+MUL_SX xvec0, xvec3;
+ADD_SX xvec3, xvec10;
+
+EDUP_SX 24*SIZE(ptrbb), xvec2;
+MUL_SX xvec0, xvec4;
+ADD_SX xvec4, xvec9;
+ODUP_SX 24*SIZE(ptrbb), xvec3;
+MUL_SX xvec0, xvec5;
+ADD_SX xvec5, xvec8;
+ADDQ $16*SIZE, ptrba;
+
+#### Unroll time 4 ####
+SHUF_SX $0x4e, xvec2, xvec4;
+MUL_SX xvec1, xvec2;
+ADD_SX xvec2, xvec15;
+SHUF_SX $0x4e, xvec3, xvec5;
+MUL_SX xvec1, xvec3;
+ADD_SX xvec3, xvec14;
+
+EDUP_SX 28*SIZE(ptrbb), xvec2;
+MUL_SX xvec1, xvec4;
+ADD_SX xvec4, xvec13;
+ODUP_SX 28*SIZE(ptrbb), xvec3;
+MUL_SX xvec1, xvec5;
+ADD_SX xvec5, xvec12;
+ADDQ $32*SIZE, ptrbb;
+
+LD_SX 0*SIZE(ptrba), xvec0;
+SHUF_SX $0x4e, xvec2, xvec4;
+MUL_SX xvec1, xvec2;
+ADD_SX xvec2, xvec11;
+SHUF_SX $0x4e, xvec3, xvec5;
+MUL_SX xvec1, xvec3;
+ADD_SX xvec3, xvec10;
+
+EDUP_SX 0*SIZE(ptrbb), xvec2;
+MUL_SX xvec1, xvec4;
+ADD_SX xvec4, xvec9;
+ODUP_SX 0*SIZE(ptrbb), xvec3;
+MUL_SX xvec1, xvec5;
+ADD_SX xvec5, xvec8;
+DECQ k;
+JG .L8_bodyB;
+.align 16
+.L8_loopE:
+#ifndef TRMMKERNEL
+TEST $2, bk;
+#else
+TEST $2, kkk;
+#endif
+JLE .L9_loopE;
+.align 16
+.L9_bodyB:
+#### Unroll time 1 ####
+SHUF_SX $0x4e, xvec2, xvec4;
+MUL_SX xvec0, xvec2;
+ADD_SX xvec2, xvec15;
+SHUF_SX $0x4e, xvec3, xvec5;
+MUL_SX xvec0, xvec3;
+ADD_SX xvec3, xvec14;
+
+EDUP_SX 4*SIZE(ptrbb), xvec2;
+MUL_SX xvec0, xvec4;
+ADD_SX xvec4, xvec13;
+ODUP_SX 4*SIZE(ptrbb), xvec3;
+MUL_SX xvec0, xvec5;
+ADD_SX xvec5, xvec12;
+
+LD_SX 4*SIZE(ptrba), xvec1;
+SHUF_SX $0x4e, xvec2, xvec4;
+MUL_SX xvec0, xvec2;
+ADD_SX xvec2, xvec11;
+SHUF_SX $0x4e, xvec3, xvec5;
+MUL_SX xvec0, xvec3;
+ADD_SX xvec3, xvec10;
+
+EDUP_SX 8*SIZE(ptrbb), xvec2;
+MUL_SX xvec0, xvec4;
+ADD_SX xvec4, xvec9;
+ODUP_SX 8*SIZE(ptrbb), xvec3;
+MUL_SX xvec0, xvec5;
+ADD_SX xvec5, xvec8;
+
+#### Unroll time 2 ####
+ADDQ $8*SIZE, ptrba;
+SHUF_SX $0x4e, xvec2, xvec4;
+MUL_SX xvec1, xvec2;
+ADD_SX xvec2, xvec15;
+SHUF_SX $0x4e, xvec3, xvec5;
+MUL_SX xvec1, xvec3;
+ADD_SX xvec3, xvec14;
+
+EDUP_SX 12*SIZE(ptrbb), xvec2;
+MUL_SX xvec1, xvec4;
+ADD_SX xvec4, xvec13;
+ODUP_SX 12*SIZE(ptrbb), xvec3;
+MUL_SX xvec1, xvec5;
+ADD_SX xvec5, xvec12;
+ADDQ $16*SIZE, ptrbb;
+
+LD_SX 0*SIZE(ptrba), xvec0;
+SHUF_SX $0x4e, xvec2, xvec4;
+MUL_SX xvec1, xvec2;
+ADD_SX xvec2, xvec11;
+SHUF_SX $0x4e, xvec3, xvec5;
+MUL_SX xvec1, xvec3;
+ADD_SX xvec3, xvec10;
+
+EDUP_SX 0*SIZE(ptrbb), xvec2;
+MUL_SX xvec1, xvec4;
+ADD_SX xvec4, xvec9;
+ODUP_SX 0*SIZE(ptrbb), xvec3;
+MUL_SX xvec1, xvec5;
+ADD_SX xvec5, xvec8;
+
+.L9_loopE:
+#ifndef TRMMKERNEL
+TEST $1, bk;
+#else
+TEST $1, kkk;
+#endif
+JLE .L10_loopE;
+.align 16
+.L10_bodyB:
+#### Unroll time 1 ####
+SHUF_SX $0x4e, xvec2, xvec4;
+MUL_SX xvec0, xvec2;
+ADD_SX xvec2, xvec15;
+SHUF_SX $0x4e, xvec3, xvec5;
+MUL_SX xvec0, xvec3;
+ADD_SX xvec3, xvec14;
+ADDQ $4*SIZE, ptrba;
+
+EDUP_SX 4*SIZE(ptrbb), xvec2;
+MUL_SX xvec0, xvec4;
+ADD_SX xvec4, xvec13;
+ODUP_SX 4*SIZE(ptrbb), xvec3;
+MUL_SX xvec0, xvec5;
+ADD_SX xvec5, xvec12;
+ADDQ $8*SIZE, ptrbb;
+
+SHUF_SX $0x4e, xvec2, xvec4;
+MUL_SX xvec0, xvec2;
+ADD_SX xvec2, xvec11;
+SHUF_SX $0x4e, xvec3, xvec5;
+MUL_SX xvec0, xvec3;
+ADD_SX xvec3, xvec10;
+
+MUL_SX xvec0, xvec4;
+ADD_SX xvec4, xvec9;
+MUL_SX xvec0, xvec5;
+ADD_SX xvec5, xvec8;
+
+.L10_loopE:
+#### Multiply Alpha ####
+BROAD_SX MEMALPHA, xvec7;
+MUL_SX xvec7, xvec15;
+MUL_SX xvec7, xvec14;
+MUL_SX xvec7, xvec13;
+MUL_SX xvec7, xvec12;
+MUL_SX xvec7, xvec11;
+MUL_SX xvec7, xvec10;
+MUL_SX xvec7, xvec9;
+MUL_SX xvec7, xvec8;
+#### Reverse Result ####
+MOV_SX xvec15, xvec7;
+REVS_SX $0xe4, xvec13, xvec15;
+REVS_SX $0xe4, xvec7, xvec13;
+MOV_SX xvec14, xvec7;
+REVS_SX $0xe4, xvec12, xvec14;
+REVS_SX $0xe4, xvec7, xvec12;
+MOV_SX xvec11, xvec7;
+REVS_SX $0xe4, xvec9, xvec11;
+REVS_SX $0xe4, xvec7, xvec9;
+MOV_SX xvec10, xvec7;
+REVS_SX $0xe4, xvec8, xvec10;
+REVS_SX $0xe4, xvec7, xvec8;
+#### Testing Alignment ####
+MOVQ C0, %rax;
+OR ldc, %rax;
+TEST $15, %rax;
+JNE .L10_loopEx;
+.align 16
+LEAQ (ldc,ldc,2),%rax;
+#ifndef TRMMKERNEL
+ADD_SX 0*SIZE(C0), xvec15;
+ADD_SX 0*SIZE(C0, ldc, 1), xvec14;
+ADD_SX 0*SIZE(C0, ldc, 2), xvec13;
+ADD_SX 0*SIZE(C0, %rax, 1), xvec12;
+ADD_SX 0*SIZE(C1), xvec11;
+ADD_SX 0*SIZE(C1, ldc, 1), xvec10;
+ADD_SX 0*SIZE(C1, ldc, 2), xvec9;
+ADD_SX 0*SIZE(C1, %rax, 1), xvec8;
+#endif
+ST_SX xvec15, 0*SIZE(C0);
+ST_SX xvec14, 0*SIZE(C0, ldc, 1);
+ST_SX xvec13, 0*SIZE(C0, ldc, 2);
+ST_SX xvec12, 0*SIZE(C0, %rax, 1);
+ST_SX xvec11, 0*SIZE(C1);
+ST_SX xvec10, 0*SIZE(C1, ldc, 1);
+ST_SX xvec9, 0*SIZE(C1, ldc, 2);
+ST_SX xvec8, 0*SIZE(C1, %rax, 1);
+#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+LEAQ (, %rax, SIZE), %rax;
+LEAQ (ptrba, %rax, 4), ptrba;
+LEAQ (ptrbb, %rax, 8), ptrbb;
+#endif
+#if defined(TRMMKERNEL)&&defined(LEFT)
+ADDQ $4, kk
+#endif
+ADDQ $4*SIZE, C0;
+ADDQ $4*SIZE, C1;
+JMP .L5_loopE;
+.align 16
+.L10_loopEx:
+LEAQ (ldc,ldc,2),%rax;
+#ifndef TRMMKERNEL
+LDL_SX 0*SIZE(C0), xvec7;
+LDH_SX 2*SIZE(C0), xvec7;
+LDL_SX 0*SIZE(C0, ldc, 1), xvec6;
+LDH_SX 2*SIZE(C0, ldc, 1), xvec6;
+LDL_SX 0*SIZE(C0, ldc, 2), xvec5;
+LDH_SX 2*SIZE(C0, ldc, 2), xvec5;
+LDL_SX 0*SIZE(C0, %rax, 1), xvec4;
+LDH_SX 2*SIZE(C0, %rax, 1), xvec4;
+LDL_SX 0*SIZE(C1), xvec3;
+LDH_SX 2*SIZE(C1), xvec3;
+LDL_SX 0*SIZE(C1, ldc, 1), xvec2;
+LDH_SX 2*SIZE(C1, ldc, 1), xvec2;
+LDL_SX 0*SIZE(C1, ldc, 2), xvec1;
+LDH_SX 2*SIZE(C1, ldc, 2), xvec1;
+LDL_SX 0*SIZE(C1, %rax, 1), xvec0;
+LDH_SX 2*SIZE(C1, %rax, 1), xvec0;
+ADD_SX xvec7, xvec15;
+ADD_SX xvec6, xvec14;
+ADD_SX xvec5, xvec13;
+ADD_SX xvec4, xvec12;
+ADD_SX xvec3, xvec11;
+ADD_SX xvec2, xvec10;
+ADD_SX xvec1, xvec9;
+ADD_SX xvec0, xvec8;
+#endif
+STL_SX xvec15, 0*SIZE(C0);
+STH_SX xvec15, 2*SIZE(C0);
+STL_SX xvec14, 0*SIZE(C0, ldc, 1);
+STH_SX xvec14, 2*SIZE(C0, ldc, 1);
+STL_SX xvec13, 0*SIZE(C0, ldc, 2);
+STH_SX xvec13, 2*SIZE(C0, ldc, 2);
+STL_SX xvec12, 0*SIZE(C0, %rax, 1);
+STH_SX xvec12, 2*SIZE(C0, %rax, 1);
+STL_SX xvec11, 0*SIZE(C1);
+STH_SX xvec11, 2*SIZE(C1);
+STL_SX xvec10, 0*SIZE(C1, ldc, 1);
+STH_SX xvec10, 2*SIZE(C1, ldc, 1);
+STL_SX xvec9, 0*SIZE(C1, ldc, 2);
+STH_SX xvec9, 2*SIZE(C1, ldc, 2);
+STL_SX xvec8, 0*SIZE(C1, %rax, 1);
+STH_SX xvec8, 2*SIZE(C1, %rax, 1);
+#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+LEAQ (, %rax, SIZE), %rax;
+LEAQ (ptrba, %rax, 4), ptrba;
+LEAQ (ptrbb, %rax, 8), ptrbb;
+#endif
+#if defined(TRMMKERNEL)&&defined(LEFT)
+ADDQ $4, kk
+#endif
+ADDQ $4*SIZE, C0;
+ADDQ $4*SIZE, C1;
+.L5_loopE:
+TEST $2, bm;
+JLE .L6_loopE;
+.align 16
+.L6_bodyB:
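+# M-remainder: 2 rows of A against the 8 columns of B; xvec12-xvec15 each
+# accumulate a 2x2 block of the result.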
+#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bb, ptrbb;
+#else
+MOVQ bb, ptrbb;
+MOVQ kk, %rax;
+LEAQ (, %rax, SIZE), %rax;
+LEAQ (ptrba, %rax, 2), ptrba;
+LEAQ (ptrbb, %rax, 8), ptrbb
+#endif
+#### Initial Results Register ####
+XOR_SY yvec15, yvec15, yvec15;
+XOR_SY yvec14, yvec14, yvec14;
+XOR_SY yvec13, yvec13, yvec13;
+XOR_SY yvec12, yvec12, yvec12;
+MOVQ bk, k;
+#ifndef TRMMKERNEL
+MOVQ bk, k;
+#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kk, %rax;
+MOVQ %rax, kkk;
+#else
+MOVQ kk, %rax;
+#ifdef LEFT
+ADDQ $2, %rax;
+#else
+ADDQ $8, %rax;
+#endif
+MOVQ %rax, kkk;
+#endif
+SARQ $2, k;
+JLE .L11_loopE;
+.align 16
+.L11_bodyB:
+#### Computing kernel ####
+LD_SX 0*SIZE(ptrba), xvec0; # a1, a2, a3, a4
+SHUF_SX $0x44, xvec0, xvec1; # a1, a2, a1, a2
+EDUP_SX 0*SIZE(ptrbb), xvec2;
+ODUP_SX 0*SIZE(ptrbb), xvec3;
+MUL_SX xvec1, xvec2;
+ADD_SX xvec2, xvec15;
+MUL_SX xvec1, xvec3;
+ADD_SX xvec3, xvec14;
+
+EDUP_SX 4*SIZE(ptrbb), xvec4;
+ODUP_SX 4*SIZE(ptrbb), xvec5;
+MUL_SX xvec1, xvec4;
+ADD_SX xvec4, xvec13;
+MUL_SX xvec1, xvec5;
+ADD_SX xvec5, xvec12;
+
+SHUF_SX $0xee, xvec0, xvec6;
+EDUP_SX 8*SIZE(ptrbb), xvec2;
+ODUP_SX 8*SIZE(ptrbb), xvec3;
+MUL_SX xvec6, xvec2;
+ADD_SX xvec2, xvec15;
+MUL_SX xvec6, xvec3;
+ADD_SX xvec3, xvec14;
+
+EDUP_SX 12*SIZE(ptrbb), xvec4;
+ODUP_SX 12*SIZE(ptrbb), xvec5;
+MUL_SX xvec6, xvec4;
+ADD_SX xvec4, xvec13;
+MUL_SX xvec6, xvec5;
+ADD_SX xvec5, xvec12;
+
+LD_SX 4*SIZE(ptrba), xvec0;
+SHUF_SX $0x44, xvec0, xvec1;
+EDUP_SX 16*SIZE(ptrbb), xvec2;
+ODUP_SX 16*SIZE(ptrbb), xvec3;
+MUL_SX xvec1, xvec2;
+ADD_SX xvec2, xvec15;
+MUL_SX xvec1, xvec3;
+ADD_SX xvec3, xvec14;
+
+EDUP_SX 20*SIZE(ptrbb), xvec4;
+ODUP_SX 20*SIZE(ptrbb), xvec5;
+MUL_SX xvec1, xvec4;
+ADD_SX xvec4, xvec13;
+MUL_SX xvec1, xvec5;
+ADD_SX xvec5, xvec12;
+
+SHUF_SX $0xee, xvec0, xvec6;
+EDUP_SX 24*SIZE(ptrbb), xvec2;
+ODUP_SX 24*SIZE(ptrbb), xvec3;
+MUL_SX xvec6, xvec2;
+ADD_SX xvec2, xvec15;
+MUL_SX xvec6, xvec3;
+ADD_SX xvec3, xvec14;
+
+EDUP_SX 28*SIZE(ptrbb), xvec4;
+ODUP_SX 28*SIZE(ptrbb), xvec5;
+MUL_SX xvec6, xvec4;
+ADD_SX xvec4, xvec13;
+MUL_SX xvec6, xvec5;
+ADD_SX xvec5, xvec12;
+
+ADDQ $8*SIZE, ptrba;
+ADDQ $32*SIZE, ptrbb;
+DECQ k;
+JG .L11_bodyB;
+.align 16
+.L11_loopE:
+#ifndef TRMMKERNEL
+TEST $2, bk;
+#else
+TEST $2, kkk;
+#endif
+JLE .L12_loopE;
+.align 16
+.L12_bodyB:
+LD_SX 0*SIZE(ptrba), xvec0; # a1, a2, a3, a4
+SHUF_SX $0x44, xvec0, xvec1; # a1, a2, a1, a2
+EDUP_SX 0*SIZE(ptrbb), xvec2;
+ODUP_SX 0*SIZE(ptrbb), xvec3;
+MUL_SX xvec1, xvec2;
+ADD_SX xvec2, xvec15;
+MUL_SX xvec1, xvec3;
+ADD_SX xvec3, xvec14;
+
+EDUP_SX 4*SIZE(ptrbb), xvec4;
+ODUP_SX 4*SIZE(ptrbb), xvec5;
+MUL_SX xvec1, xvec4;
+ADD_SX xvec4, xvec13;
+MUL_SX xvec1, xvec5;
+ADD_SX xvec5, xvec12;
+
+SHUF_SX $0xee, xvec0, xvec6;
+EDUP_SX 8*SIZE(ptrbb), xvec2;
+ODUP_SX 8*SIZE(ptrbb), xvec3;
+MUL_SX xvec6, xvec2;
+ADD_SX xvec2, xvec15;
+MUL_SX xvec6, xvec3;
+ADD_SX xvec3, xvec14;
+
+EDUP_SX 12*SIZE(ptrbb), xvec4;
+ODUP_SX 12*SIZE(ptrbb), xvec5;
+MUL_SX xvec6, xvec4;
+ADD_SX xvec4, xvec13;
+MUL_SX xvec6, xvec5;
+ADD_SX xvec5, xvec12;
+ADDQ $4*SIZE, ptrba;
+ADDQ $16*SIZE, ptrbb;
+
+.L12_loopE:
+#ifndef TRMMKERNEL
+TEST $1, bk;
+#else
+TEST $1, kkk;
+#endif
+JLE .L13_loopE;
+.align 16
+.L13_bodyB:
+LD_SX 0*SIZE(ptrba), xvec0; # a1, a2, a3, a4
+SHUF_SX $0x44, xvec0, xvec1; # a1, a2, a1, a2
+EDUP_SX 0*SIZE(ptrbb), xvec2;
+ODUP_SX 0*SIZE(ptrbb), xvec3;
+MUL_SX xvec1, xvec2;
+ADD_SX xvec2, xvec15;
+MUL_SX xvec1, xvec3;
+ADD_SX xvec3, xvec14;
+
+EDUP_SX 4*SIZE(ptrbb), xvec4;
+ODUP_SX 4*SIZE(ptrbb), xvec5;
+MUL_SX xvec1, xvec4;
+ADD_SX xvec4, xvec13;
+MUL_SX xvec1, xvec5;
+ADD_SX xvec5, xvec12;
+ADDQ $2*SIZE, ptrba;
+ADDQ $8*SIZE, ptrbb;
+.L13_loopE:
+LEAQ (ldc,ldc,2),%rax;
+#### Multiply Alpha ####
+BROAD_SX MEMALPHA, xvec7;
+MUL_SX xvec7, xvec15;
+MUL_SX xvec7, xvec14;
+MUL_SX xvec7, xvec13;
+MUL_SX xvec7, xvec12;
+#ifndef TRMMKERNEL
+LDL_SX 0*SIZE(C0), xvec11;
+LDH_SX 0*SIZE(C0, ldc, 2), xvec11;
+LDL_SX 0*SIZE(C0, ldc, 1), xvec10;
+LDH_SX 0*SIZE(C0, %rax, 1), xvec10;
+LDL_SX 0*SIZE(C1), xvec9;
+LDH_SX 0*SIZE(C1, ldc, 2), xvec9;
+LDL_SX 0*SIZE(C1, ldc, 1), xvec8;
+LDH_SX 0*SIZE(C1, %rax, 1), xvec8;
+ADD_SX xvec11, xvec15;
+ADD_SX xvec10, xvec14;
+ADD_SX xvec9, xvec13;
+ADD_SX xvec8, xvec12;
+#endif
+STL_SX xvec15, 0*SIZE(C0);
+STH_SX xvec15, 0*SIZE(C0, ldc, 2);
+STL_SX xvec14, 0*SIZE(C0, ldc, 1);
+STH_SX xvec14, 0*SIZE(C0, %rax, 1);
+STL_SX xvec13, 0*SIZE(C1);
+STH_SX xvec13, 0*SIZE(C1, ldc, 2);
+STL_SX xvec12, 0*SIZE(C1, ldc, 1);
+STH_SX xvec12, 0*SIZE(C1, %rax, 1);
+#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+LEAQ (,%rax, SIZE), %rax;
+LEAQ (ptrba, %rax, 2), ptrba;
+LEAQ (ptrbb, %rax, 8), ptrbb;
+#endif
+#if defined(TRMMKERNEL) && defined(LEFT)
+ADDQ $2, kk
+#endif
+ADDQ $2*SIZE, C0;
+ADDQ $2*SIZE, C1;
+#### Writing Back ####
+.L6_loopE:
+TEST $1, bm;
+JLE .L7_loopE;
+.align 16
+.L7_bodyB:
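+# M-remainder: a single row of A against the 8 columns of B; xvec15/xvec14
+# accumulate four columns each and the results are stored with scalar movss.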
+#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bb, ptrbb;
+#else
+MOVQ bb, ptrbb;
+MOVQ kk, %rax;
+LEAQ (,%rax, SIZE), %rax;
+ADDQ %rax, ptrba;
+LEAQ (ptrbb, %rax, 8), ptrbb;
+#endif
+#### Initial ####
+XOR_SY yvec15, yvec15, yvec15;
+XOR_SY yvec14, yvec14, yvec14;
+MOVQ bk, k;
+#ifndef TRMMKERNEL
+MOVQ bk, k;
+#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kk, %rax;
+MOVQ %rax, kkk;
+#else
+MOVQ kk, %rax;
+#ifdef LEFT
+ADDQ $1, %rax;
+#else
+ADDQ $8, %rax;
+#endif
+MOVQ %rax, kkk;
+#endif
+SARQ $2, k;
+JLE .L14_loopE;
+.align 16
+.L14_bodyB:
+BROAD_SX 0*SIZE(ptrba), xvec0;
+LD_SX 0*SIZE(ptrbb), xvec2;
+MUL_SX xvec0, xvec2;
+ADD_SX xvec2, xvec15;
+LD_SX 4*SIZE(ptrbb), xvec3;
+MUL_SX xvec0, xvec3;
+ADD_SX xvec3, xvec14;
+
+BROAD_SX 1*SIZE(ptrba), xvec1;
+LD_SX 8*SIZE(ptrbb), xvec4;
+MUL_SX xvec1, xvec4;
+ADD_SX xvec4, xvec15;
+LD_SX 12*SIZE(ptrbb), xvec5;
+MUL_SX xvec1, xvec5;
+ADD_SX xvec5, xvec14;
+
+BROAD_SX 2*SIZE(ptrba), xvec0;
+LD_SX 16*SIZE(ptrbb), xvec2;
+MUL_SX xvec0, xvec2;
+ADD_SX xvec2, xvec15;
+LD_SX 20*SIZE(ptrbb), xvec3;
+MUL_SX xvec0, xvec3;
+ADD_SX xvec3, xvec14;
+
+BROAD_SX 3*SIZE(ptrba), xvec1;
+LD_SX 24*SIZE(ptrbb), xvec4;
+MUL_SX xvec1, xvec4;
+ADD_SX xvec4, xvec15;
+LD_SX 28*SIZE(ptrbb), xvec5;
+MUL_SX xvec1, xvec5;
+ADD_SX xvec5, xvec14;
+
+ADDQ $4*SIZE, ptrba;
+ADDQ $32*SIZE, ptrbb;
+DECQ k;
+JG .L14_bodyB;
+.align 16
+.L14_loopE:
+#ifndef TRMMKERNEL
+TEST $2, bk;
+#else
+TEST $2, kkk;
+#endif
+JLE .L15_loopE;
+.align 16
+.L15_bodyB:
+BROAD_SX 0*SIZE(ptrba), xvec0;
+LD_SX 0*SIZE(ptrbb), xvec2;
+MUL_SX xvec0, xvec2;
+ADD_SX xvec2, xvec15;
+LD_SX 4*SIZE(ptrbb), xvec3;
+MUL_SX xvec0, xvec3;
+ADD_SX xvec3, xvec14;
+
+BROAD_SX 1*SIZE(ptrba), xvec1;
+LD_SX 8*SIZE(ptrbb), xvec4;
+MUL_SX xvec1, xvec4;
+ADD_SX xvec4, xvec15;
+LD_SX 12*SIZE(ptrbb), xvec5;
+MUL_SX xvec1, xvec5;
+ADD_SX xvec5, xvec14;
+ADDQ $2*SIZE, ptrba;
+ADDQ $16*SIZE, ptrbb;
+
+.L15_loopE:
+#ifndef TRMMKERNEL
+TEST $1, bk;
+#else
+TEST $1, kkk;
+#endif
+JLE .L16_loopE;
+.align 16
+.L16_bodyB:
+BROAD_SX 0*SIZE(ptrba), xvec0;
+LD_SX 0*SIZE(ptrbb), xvec2;
+MUL_SX xvec0, xvec2;
+ADD_SX xvec2, xvec15;
+LD_SX 4*SIZE(ptrbb), xvec3;
+MUL_SX xvec0, xvec3;
+ADD_SX xvec3, xvec14;
+ADDQ $1, ptrba;
+ADDQ $4, ptrbb;
+
+.L16_loopE:
+BROAD_SX MEMALPHA, xvec7;
+MUL_SX xvec7, xvec15;
+MUL_SX xvec7, xvec14;
+
+LEAQ (ldc,ldc,2),%rax;
+SHUF_SX $0xff, xvec15, xvec13;
+SHUF_SX $0xaa, xvec15, xvec12;
+SHUF_SX $0x55, xvec15, xvec11;
+SHUF_SX $0x00, xvec15, xvec10;
+
+#ifndef TRMMKERNEL
+addss 0*SIZE(C0), xvec10;
+addss 0*SIZE(C0, ldc, 1), xvec11;
+addss 0*SIZE(C0, ldc, 2), xvec12;
+addss 0*SIZE(C0, %rax, 1), xvec13;
+#endif
+movss xvec10, 0*SIZE(C0);
+movss xvec11, 0*SIZE(C0, ldc, 1);
+movss xvec12, 0*SIZE(C0, ldc, 2);
+movss xvec13, 0*SIZE(C0, %rax, 1);
+
+SHUF_SX $0xff, xvec14, xvec9;
+SHUF_SX $0xaa, xvec14, xvec8;
+SHUF_SX $0x55, xvec14, xvec7;
+SHUF_SX $0x00, xvec14, xvec6;
+
+#ifndef TRMMKERNEL
+addss 0*SIZE(C1), xvec6;
+addss 0*SIZE(C1, ldc, 1), xvec7;
+addss 0*SIZE(C1, ldc, 2), xvec8;
+addss 0*SIZE(C1, %rax, 1), xvec9;
+#endif
+movss xvec6, 0*SIZE(C1);
+movss xvec7, 0*SIZE(C1, ldc, 1);
+movss xvec8, 0*SIZE(C1, ldc, 2);
+movss xvec9, 0*SIZE(C1, %rax, 1);
+#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+LEAQ (,%rax, SIZE), %rax;
+ADDQ %rax, ptrba;
+LEAQ (ptrbb, %rax, 8), ptrbb;
+#endif
+#if defined(TRMMKERNEL)&&defined(LEFT)
+ADDQ $1, kk
+#endif
+ADDQ $1*SIZE, C0;
+ADDQ $1*SIZE, C1;
+#### Writing Back ####
+.L7_loopE:
+#if defined(TRMMKERNEL)&&!defined(LEFT)
+ADDQ $8, kk
+#endif
+MOVQ bk,k;
+SALQ $5,k;
+ADDQ k,bb;
+LEAQ (C,ldc,8),C;
+.L0_bodyE:;
+DECQ j;
+JG .L0_bodyB;
+.align 16;
+.L0_loopE:;
+TEST $4, bn; # Rn = 4
+JLE .L20_loopE;
+.align 16;
+.L20_bodyB:
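+# N-remainder: block of 4 columns of B. C0 points at the first column and C1
+# at the third (C + 2*ldc); the M loop below repeats the 8/4/2/1-row split.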
+#if defined(TRMMKERNEL) && defined(LEFT)
+MOVQ OFFSET, %rax;
+MOVQ %rax, kk;
+#endif
+
+MOVQ C, C0;
+LEAQ (C, ldc, 2), C1;
+MOVQ ba, ptrba;
+MOVQ bm, i;
+SARQ $3, i;
+JLE .L21_loopE;
+.align 16
+.L21_bodyB:
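+# 8x4 tile: xvec0/xvec1 hold eight packed A values per k step, xvec2-xvec7
+# carry duplicated and rotated copies of the four B values, and xvec8-xvec15
+# are the accumulators.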
+#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bb, ptrbb;
+#else
+MOVQ bb, ptrbb;
+MOVQ kk, %rax;
+LEAQ (, %rax, SIZE), %rax;
+LEAQ (ptrba, %rax, 8), ptrba;
+LEAQ (ptrbb, %rax, 4), ptrbb;
+#endif
+#### Initial ####
+XOR_SY yvec15, yvec15, yvec15;
+XOR_SY yvec14, yvec14, yvec14;
+XOR_SY yvec13, yvec13, yvec13;
+XOR_SY yvec12, yvec12, yvec12;
+EDUP_SX 0*SIZE(ptrbb), xvec2;
+XOR_SY yvec11, yvec11, yvec11;
+XOR_SY yvec10, yvec10, yvec10;
+LD_SX 0*SIZE(ptrba), xvec0;
+XOR_SY yvec9, yvec9, yvec9;
+XOR_SY yvec8, yvec8, yvec8;
+LD_SX 4*SIZE(ptrba), xvec1;
+#ifndef TRMMKERNEL
+MOVQ bk,k;
+#elif (defined(LEFT) && !defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kk, %rax;
+MOVQ %rax, kkk;
+#else
+MOVQ kk, %rax;
+#ifdef LEFT
+ADDQ $8, %rax;
+#else
+ADDQ $4, %rax;
+#endif
+MOVQ %rax, kkk;
+#endif
+SARQ $2,k;
+JLE .L211_loopE;
+.align 16
+.L211_bodyB:
+#### Unroll time 1 ####
+ODUP_SX 0*SIZE(ptrbb), xvec3;
+SHUF_SX $0x4e, xvec2, xvec4;
+MOV_SX xvec2, xvec6;
+MUL_SX xvec0, xvec2;
+ADD_SX xvec2, xvec15;
+MUL_SX xvec1, xvec6;
+ADD_SX xvec6, xvec14;
+
+SHUF_SX $0x4e, xvec3, xvec5;
+MOV_SX xvec3, xvec7;
+MUL_SX xvec0, xvec3;
+ADD_SX xvec3, xvec13;
+MUL_SX xvec1, xvec7;
+ADD_SX xvec7, xvec12;
+
+EDUP_SX 4*SIZE(ptrbb), xvec2;
+MOV_SX xvec4, xvec6;
+MUL_SX xvec0, xvec4;
+ADD_SX xvec4, xvec11;
+MUL_SX xvec1, xvec6;
+ADD_SX xvec6, xvec10;
+
+MOV_SX xvec5, xvec7;
+MUL_SX xvec0, xvec5;
+ADD_SX xvec5, xvec9;
+LD_SX 8*SIZE(ptrba), xvec0;
+MUL_SX xvec1, xvec7;
+ADD_SX xvec7, xvec8;
+LD_SX 12*SIZE(ptrba), xvec1;
+
+#### Unroll time 2 ####
+ODUP_SX 4*SIZE(ptrbb), xvec3;
+SHUF_SX $0x4e, xvec2, xvec4;
+MOV_SX xvec2, xvec6;
+MUL_SX xvec0, xvec2;
+ADD_SX xvec2, xvec15;
+MUL_SX xvec1, xvec6;
+ADD_SX xvec6, xvec14;
+
+SHUF_SX $0x4e, xvec3, xvec5;
+MOV_SX xvec3, xvec7;
+MUL_SX xvec0, xvec3;
+ADD_SX xvec3, xvec13;
+MUL_SX xvec1, xvec7;
+ADD_SX xvec7, xvec12;
+
+EDUP_SX 8*SIZE(ptrbb), xvec2;
+MOV_SX xvec4, xvec6;
+MUL_SX xvec0, xvec4;
+ADD_SX xvec4, xvec11;
+MUL_SX xvec1, xvec6;
+ADD_SX xvec6, xvec10;
+
+MOV_SX xvec5, xvec7;
+MUL_SX xvec0, xvec5;
+ADD_SX xvec5, xvec9;
+LD_SX 16*SIZE(ptrba), xvec0;
+MUL_SX xvec1, xvec7;
+ADD_SX xvec7, xvec8;
+LD_SX 20*SIZE(ptrba), xvec1;
+
+#### Unroll time 3 ####
+ODUP_SX 8*SIZE(ptrbb), xvec3;
+SHUF_SX $0x4e, xvec2, xvec4;
+MOV_SX xvec2, xvec6;
+MUL_SX xvec0, xvec2;
+ADD_SX xvec2, xvec15;
+MUL_SX xvec1, xvec6;
+ADD_SX xvec6, xvec14;
+
+SHUF_SX $0x4e, xvec3, xvec5;
+MOV_SX xvec3, xvec7;
+MUL_SX xvec0, xvec3;
+ADD_SX xvec3, xvec13;
+MUL_SX xvec1, xvec7;
+ADD_SX xvec7, xvec12;
+
+EDUP_SX 12*SIZE(ptrbb), xvec2;
+MOV_SX xvec4, xvec6;
+ADDQ $16*SIZE, ptrbb;
+
+MUL_SX xvec0, xvec4;
+ADD_SX xvec4, xvec11;
+MUL_SX xvec1, xvec6;
+ADD_SX xvec6, xvec10;
+
+MOV_SX xvec5, xvec7;
+MUL_SX xvec0, xvec5;
+ADD_SX xvec5, xvec9;
+LD_SX 24*SIZE(ptrba), xvec0;
+
+MUL_SX xvec1, xvec7;
+ADD_SX xvec7, xvec8;
+LD_SX 28*SIZE(ptrba), xvec1;
+ADDQ $32*SIZE, ptrba;
+
+#### Unroll time 4 ####
+ODUP_SX -4*SIZE(ptrbb), xvec3;
+SHUF_SX $0x4e, xvec2, xvec4;
+MOV_SX xvec2, xvec6;
+MUL_SX xvec0, xvec2;
+ADD_SX xvec2, xvec15;
+MUL_SX xvec1, xvec6;
+ADD_SX xvec6, xvec14;
+
+SHUF_SX $0x4e, xvec3, xvec5;
+MOV_SX xvec3, xvec7;
+MUL_SX xvec0, xvec3;
+ADD_SX xvec3, xvec13;
+MUL_SX xvec1, xvec7;
+ADD_SX xvec7, xvec12;
+
+EDUP_SX 0*SIZE(ptrbb), xvec2;
+MOV_SX xvec4, xvec6;
+MUL_SX xvec0, xvec4;
+ADD_SX xvec4, xvec11;
+MUL_SX xvec1, xvec6;
+ADD_SX xvec6, xvec10;
+
+MOV_SX xvec5, xvec7;
+MUL_SX xvec0, xvec5;
+ADD_SX xvec5, xvec9;
+LD_SX 0*SIZE(ptrba), xvec0;
+
+MUL_SX xvec1, xvec7;
+ADD_SX xvec7, xvec8;
+LD_SX 4*SIZE(ptrba), xvec1;
+DECQ k;
+JG .L211_bodyB;
+.align 16
+.L211_loopE:
+#ifndef TRMMKERNEL
+TEST $2, bk
+#else
+TEST $2, kkk;
+#endif
+JLE .L212_loopE;
+.align 16
+.L212_bodyB:
+#### Unroll time 1 ####
+ODUP_SX 0*SIZE(ptrbb), xvec3;
+SHUF_SX $0x4e, xvec2, xvec4;
+MOV_SX xvec2, xvec6;
+MUL_SX xvec0, xvec2;
+ADD_SX xvec2, xvec15;
+MUL_SX xvec1, xvec6;
+ADD_SX xvec6, xvec14;
+
+SHUF_SX $0x4e, xvec3, xvec5;
+MOV_SX xvec3, xvec7;
+MUL_SX xvec0, xvec3;
+ADD_SX xvec3, xvec13;
+MUL_SX xvec1, xvec7;
+ADD_SX xvec7, xvec12;
+
+EDUP_SX 4*SIZE(ptrbb), xvec2;
+MOV_SX xvec4, xvec6;
+MUL_SX xvec0, xvec4;
+ADD_SX xvec4, xvec11;
+ADDQ $8*SIZE, ptrbb;
+
+MUL_SX xvec1, xvec6;
+ADD_SX xvec6, xvec10;
+MOV_SX xvec5, xvec7;
+MUL_SX xvec0, xvec5;
+ADD_SX xvec5, xvec9;
+LD_SX 8*SIZE(ptrba), xvec0;
+
+MUL_SX xvec1, xvec7;
+ADD_SX xvec7, xvec8;
+LD_SX 12*SIZE(ptrba), xvec1;
+ADDQ $16*SIZE, ptrba;
+
+#### Unroll time 2 ####
+ODUP_SX -4*SIZE(ptrbb), xvec3;
+SHUF_SX $0x4e, xvec2, xvec4;
+MOV_SX xvec2, xvec6;
+MUL_SX xvec0, xvec2;
+ADD_SX xvec2, xvec15;
+MUL_SX xvec1, xvec6;
+ADD_SX xvec6, xvec14;
+
+SHUF_SX $0x4e, xvec3, xvec5;
+MOV_SX xvec3, xvec7;
+MUL_SX xvec0, xvec3;
+ADD_SX xvec3, xvec13;
+MUL_SX xvec1, xvec7;
+ADD_SX xvec7, xvec12;
+
+EDUP_SX 0*SIZE(ptrbb), xvec2;
+MOV_SX xvec4, xvec6;
+MUL_SX xvec0, xvec4;
+ADD_SX xvec4, xvec11;
+MUL_SX xvec1, xvec6;
+ADD_SX xvec6, xvec10;
+
+MOV_SX xvec5, xvec7;
+MUL_SX xvec0, xvec5;
+ADD_SX xvec5, xvec9;
+LD_SX 0*SIZE(ptrba), xvec0;
+MUL_SX xvec1, xvec7;
+ADD_SX xvec7, xvec8;
+LD_SX 4*SIZE(ptrba), xvec1;
+
+.L212_loopE:
+#ifndef TRMMKERNEL
+TEST $1, bk;
+#else
+TEST $1, kkk;
+#endif
+JLE .L213_loopE;
+.align 16
+.L213_bodyB:
+ODUP_SX 0*SIZE(ptrbb), xvec3;
+SHUF_SX $0x4e, xvec2, xvec4;
+MOV_SX xvec2, xvec6;
+MUL_SX xvec0, xvec2;
+ADD_SX xvec2, xvec15;
+ADDQ $4*SIZE, ptrbb;
+
+SHUF_SX $0x4e, xvec3, xvec5;
+MUL_SX xvec1, xvec6;
+ADD_SX xvec6, xvec14;
+MOV_SX xvec3, xvec7;
+MUL_SX xvec0, xvec3;
+ADD_SX xvec3, xvec13;
+
+MUL_SX xvec1, xvec7;
+ADD_SX xvec7, xvec12;
+MOV_SX xvec4, xvec6;
+ADDQ $8*SIZE, ptrba;
+
+MUL_SX xvec0, xvec4;
+ADD_SX xvec4, xvec11;
+MUL_SX xvec1, xvec6;
+ADD_SX xvec6, xvec10;
+
+MOV_SX xvec5, xvec7;
+MUL_SX xvec0, xvec5;
+ADD_SX xvec5, xvec9;
+MUL_SX xvec1, xvec7;
+ADD_SX xvec7, xvec8;
+
+.L213_loopE:
+#### Multiply Alpha ####
+BROAD_SX MEMALPHA, xvec7;
+MUL_SX xvec7, xvec15;
+MUL_SX xvec7, xvec14;
+MUL_SX xvec7, xvec13;
+MUL_SX xvec7, xvec12;
+MUL_SX xvec7, xvec11;
+MUL_SX xvec7, xvec10;
+MUL_SX xvec7, xvec9;
+MUL_SX xvec7, xvec8;
+#### Writing Back ####
+#ifndef TRMMKERNEL
+LDL_SX 0*SIZE(C0), xvec0;
+LDH_SX 2*SIZE(C1), xvec0;
+LDL_SX 4*SIZE(C0), xvec1;
+LDH_SX 6*SIZE(C1), xvec1;
+LDL_SX 0*SIZE(C0, ldc, 1), xvec2;
+LDH_SX 2*SIZE(C1, ldc, 1), xvec2;
+LDL_SX 4*SIZE(C0, ldc, 1), xvec3;
+LDH_SX 6*SIZE(C1, ldc, 1), xvec3;
+LDL_SX 0*SIZE(C1), xvec4;
+LDH_SX 2*SIZE(C0), xvec4;
+LDL_SX 4*SIZE(C1), xvec5;
+LDH_SX 6*SIZE(C0), xvec5;
+LDL_SX 0*SIZE(C1, ldc, 1), xvec6;
+LDH_SX 2*SIZE(C0, ldc, 1), xvec6;
+LDL_SX 4*SIZE(C1, ldc, 1), xvec7;
+LDH_SX 6*SIZE(C0, ldc, 1), xvec7;
+ADD_SX xvec0, xvec15;
+ADD_SX xvec1, xvec14;
+ADD_SX xvec2, xvec13;
+ADD_SX xvec3, xvec12;
+ADD_SX xvec4, xvec11;
+ADD_SX xvec5, xvec10;
+ADD_SX xvec6, xvec9;
+ADD_SX xvec7, xvec8;
+#endif
+STL_SX xvec15, 0*SIZE(C0);
+STH_SX xvec15, 2*SIZE(C1);
+STL_SX xvec14, 4*SIZE(C0);
+STH_SX xvec14, 6*SIZE(C1);
+STL_SX xvec13, 0*SIZE(C0, ldc, 1);
+STH_SX xvec13, 2*SIZE(C1, ldc, 1);
+STL_SX xvec12, 4*SIZE(C0, ldc, 1);
+STH_SX xvec12, 6*SIZE(C1, ldc, 1);
+STL_SX xvec11, 0*SIZE(C1);
+STH_SX xvec11, 2*SIZE(C0);
+STL_SX xvec10, 4*SIZE(C1);
+STH_SX xvec10, 6*SIZE(C0);
+STL_SX xvec9, 0*SIZE(C1, ldc, 1);
+STH_SX xvec9, 2*SIZE(C0, ldc, 1);
+STL_SX xvec8, 4*SIZE(C1, ldc, 1);
+STH_SX xvec8, 6*SIZE(C0, ldc, 1);
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+LEAQ (, %rax, SIZE), %rax;
+LEAQ (ptrba, %rax, 8), ptrba;
+LEAQ (ptrbb, %rax, 4), ptrbb;
+#endif
+#if defined(TRMMKERNEL) && defined(LEFT)
+ADDQ $8, kk
+#endif
+ADDQ $8*SIZE, C0;
+ADDQ $8*SIZE, C1;
+DECQ i;
+JG .L21_bodyB;
+.align 16
+.L21_loopE:
+TEST $4, bm;
+JLE .L22_loopE;
+.align 16
+.L22_bodyB:
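+# 4x4 tile: four A values against four B values per k step, accumulated in
+# xvec12-xvec15.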
+#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bb, ptrbb;
+#else
+MOVQ bb, ptrbb;
+MOVQ kk, %rax;
+LEAQ (, %rax, SIZE), %rax;
+LEAQ (ptrba, %rax, 4), ptrba;
+LEAQ (ptrbb, %rax, 4), ptrbb;
+#endif
+#### Initial Results ####
+XOR_SY yvec15, yvec15, yvec15;
+XOR_SY yvec14, yvec14, yvec14;
+XOR_SY yvec13, yvec13, yvec13;
+XOR_SY yvec12, yvec12, yvec12;
+#ifndef TRMMKERNEL
+MOVQ bk, k;
+#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kk, %rax;
+MOVQ %rax, kkk;
+#else
+MOVQ kk, %rax;
+#ifdef LEFT
+ADDQ $4, %rax;
+#else
+ADDQ $4, %rax;
+#endif
+MOVQ %rax, kkk;
+#endif
+SARQ $2, k;
+JLE .L221_loopE;
+.align 16
+.L221_bodyB:
+LD_SX 0*SIZE(ptrba), xvec0;
+EDUP_SX 0*SIZE(ptrbb), xvec2;
+ODUP_SX 0*SIZE(ptrbb), xvec3;
+
+SHUF_SX $0x4e, xvec2, xvec4;
+MUL_SX xvec0, xvec2;
+ADD_SX xvec2, xvec15;
+SHUF_SX $0x4e, xvec3, xvec5;
+MUL_SX xvec0, xvec3;
+ADD_SX xvec3, xvec14;
+
+MUL_SX xvec0, xvec4;
+ADD_SX xvec4, xvec13;
+MUL_SX xvec0, xvec5;
+ADD_SX xvec5, xvec12;
+
+LD_SX 4*SIZE(ptrba), xvec1;
+EDUP_SX 4*SIZE(ptrbb), xvec2;
+ODUP_SX 4*SIZE(ptrbb), xvec3;
+
+SHUF_SX $0x4e, xvec2, xvec4;
+MUL_SX xvec1, xvec2;
+ADD_SX xvec2, xvec15;
+SHUF_SX $0x4e, xvec3, xvec5;
+MUL_SX xvec1, xvec3;
+ADD_SX xvec3, xvec14;
+
+MUL_SX xvec1, xvec4;
+ADD_SX xvec4, xvec13;
+MUL_SX xvec1, xvec5;
+ADD_SX xvec5, xvec12;
+
+LD_SX 8*SIZE(ptrba), xvec0;
+EDUP_SX 8*SIZE(ptrbb), xvec2;
+ODUP_SX 8*SIZE(ptrbb), xvec3;
+
+SHUF_SX $0x4e, xvec2, xvec4;
+MUL_SX xvec0, xvec2;
+ADD_SX xvec2, xvec15;
+SHUF_SX $0x4e, xvec3, xvec5;
+MUL_SX xvec0, xvec3;
+ADD_SX xvec3, xvec14;
+
+MUL_SX xvec0, xvec4;
+ADD_SX xvec4, xvec13;
+MUL_SX xvec0, xvec5;
+ADD_SX xvec5, xvec12;
+
+LD_SX 12*SIZE(ptrba), xvec1;
+EDUP_SX 12*SIZE(ptrbb), xvec2;
+ODUP_SX 12*SIZE(ptrbb), xvec3;
+
+SHUF_SX $0x4e, xvec2, xvec4;
+MUL_SX xvec1, xvec2;
+ADD_SX xvec2, xvec15
+SHUF_SX $0x4e, xvec3, xvec5;
+MUL_SX xvec1, xvec3;
+ADD_SX xvec3, xvec14;
+
+MUL_SX xvec1, xvec4;
+ADD_SX xvec4, xvec13;
+MUL_SX xvec1, xvec5;
+ADD_SX xvec5, xvec12;
+ADDQ $16*SIZE, ptrba;
+ADDQ $16*SIZE, ptrbb;
+
+DECQ k;
+JG .L221_bodyB;
+.align 16
+.L221_loopE:
+#ifndef TRMMKERNEL
+TEST $2, bk;
+#else
+TEST $2, kkk;
+#endif
+JLE .L222_loopE;
+.align 16
+.L222_bodyB:
+LD_SX 0*SIZE(ptrba), xvec0;
+EDUP_SX 0*SIZE(ptrbb), xvec2;
+ODUP_SX 0*SIZE(ptrbb), xvec3;
+
+SHUF_SX $0x4e, xvec2, xvec4;
+MUL_SX xvec0, xvec2;
+ADD_SX xvec2, xvec15;
+SHUF_SX $0x4e, xvec3, xvec5;
+MUL_SX xvec0, xvec3;
+ADD_SX xvec3, xvec14;
+
+MUL_SX xvec0, xvec4;
+ADD_SX xvec4, xvec13;
+MUL_SX xvec0, xvec5;
+ADD_SX xvec5, xvec12;
+
+LD_SX 4*SIZE(ptrba), xvec1;
+EDUP_SX 4*SIZE(ptrbb), xvec2;
+ODUP_SX 4*SIZE(ptrbb), xvec3;
+
+SHUF_SX $0x4e, xvec2, xvec4;
+MUL_SX xvec1, xvec2;
+ADD_SX xvec2, xvec15;
+SHUF_SX $0x4e, xvec3, xvec5;
+MUL_SX xvec1, xvec3;
+ADD_SX xvec3, xvec14;
+
+MUL_SX xvec1, xvec4;
+ADD_SX xvec4, xvec13
+MUL_SX xvec1, xvec5;
+ADD_SX xvec5, xvec12;
+ADDQ $8*SIZE, ptrba;
+ADDQ $8*SIZE, ptrbb;
+.L222_loopE:
+#ifndef TRMMKERNEL
+TEST $1, bk;
+#else
+TEST $1, kkk;
+#endif
+JLE .L223_loopE;
+.align 16
+.L223_bodyB:
+LD_SX 0*SIZE(ptrba), xvec0;
+EDUP_SX 0*SIZE(ptrbb), xvec2;
+ODUP_SX 0*SIZE(ptrbb), xvec3;
+
+SHUF_SX $0x4e, xvec2, xvec4;
+MUL_SX xvec0, xvec2;
+ADD_SX xvec2, xvec15;
+SHUF_SX $0x4e, xvec3, xvec5;
+MUL_SX xvec0, xvec3;
+ADD_SX xvec3, xvec14;
+
+MUL_SX xvec0, xvec4;
+ADD_SX xvec4, xvec13;
+MUL_SX xvec0, xvec5;
+ADD_SX xvec5, xvec12;
+ADDQ $4*SIZE, ptrba;
+ADDQ $4*SIZE, ptrbb;
+.L223_loopE:
+#### Multiply Alpha ####
+BROAD_SX MEMALPHA, xvec7;
+MUL_SX xvec7, xvec15;
+MUL_SX xvec7, xvec14;
+MUL_SX xvec7, xvec13;
+MUL_SX xvec7, xvec12;
+#### Writing back ####
+#ifndef TRMMKERNEL
+LDL_SX 0*SIZE(C0), xvec0;
+LDH_SX 2*SIZE(C1), xvec0;
+LDL_SX 0*SIZE(C0, ldc, 1), xvec1;
+LDH_SX 2*SIZE(C1, ldc, 1), xvec1;
+LDL_SX 0*SIZE(C1), xvec2;
+LDH_SX 2*SIZE(C0), xvec2;
+LDL_SX 0*SIZE(C1, ldc, 1), xvec3;
+LDH_SX 2*SIZE(C0, ldc, 1), xvec3;
+ADD_SX xvec0, xvec15;
+ADD_SX xvec1, xvec14;
+ADD_SX xvec2, xvec13;
+ADD_SX xvec3, xvec12;
+#endif
+STL_SX xvec15, 0*SIZE(C0);
+STH_SX xvec15, 2*SIZE(C1);
+STL_SX xvec14, 0*SIZE(C0, ldc, 1);
+STH_SX xvec14, 2*SIZE(C1, ldc, 1);
+STL_SX xvec13, 0*SIZE(C1);
+STH_SX xvec13, 2*SIZE(C0);
+STL_SX xvec12, 0*SIZE(C1, ldc, 1);
+STH_SX xvec12, 2*SIZE(C0, ldc, 1);
+#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+LEAQ (, %rax, SIZE), %rax;
+LEAQ (ptrba, %rax, 4), ptrba;
+LEAQ (ptrbb, %rax, 4), ptrbb;
+#endif
+#if defined(TRMMKERNEL)&&defined(LEFT)
+ADDQ $4, kk
+#endif
+ADDQ $4*SIZE, C0;
+ADDQ $4*SIZE, C1;
+.L22_loopE:
+TEST $2, bm;
+JLE .L23_loopE;
+.align 16
+.L23_bodyB:
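+# 2x4 tile: SHUF_SX $0x44/$0xee pick one pair of A values per k step and the
+# products against the four B values are accumulated in xvec12-xvec15.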
+#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bb, ptrbb;
+#else
+MOVQ bb, ptrbb;
+MOVQ kk, %rax;
+LEAQ (, %rax, SIZE), %rax;
+LEAQ (ptrba, %rax, 2), ptrba;
+LEAQ (ptrbb, %rax, 4), ptrbb
+#endif
+#### Initial ####
+XOR_SY yvec15, yvec15, yvec15;
+XOR_SY yvec14, yvec14, yvec14;
+#ifndef TRMMKERNEL
+MOVQ bk, k;
+#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kk, %rax;
+MOVQ %rax, kkk;
+#else
+MOVQ kk, %rax;
+#ifdef LEFT
+ADDQ $2, %rax;
+#else
+ADDQ $4, %rax;
+#endif
+MOVQ %rax, kkk;
+#endif
+SARQ $2, k;
+JLE .L231_loopE;
+.align 16
+.L231_bodyB:
+LD_SX 0*SIZE(ptrba), xvec0;
+EDUP_SX 0*SIZE(ptrbb), xvec4;
+ODUP_SX 0*SIZE(ptrbb), xvec5;
+SHUF_SX $0x44, xvec0, xvec1;
+MUL_SX xvec1, xvec4;
+ADD_SX xvec4, xvec15;
+MUL_SX xvec1, xvec5;
+ADD_SX xvec5, xvec14;
+
+SHUF_SX $0xee, xvec0, xvec2;
+EDUP_SX 4*SIZE(ptrbb), xvec6;
+ODUP_SX 4*SIZE(ptrbb), xvec7;
+MUL_SX xvec2, xvec6;
+ADD_SX xvec6, xvec15;
+MUL_SX xvec2, xvec7;
+ADD_SX xvec7, xvec14;
+
+LD_SX 4*SIZE(ptrba), xvec0;
+EDUP_SX 8*SIZE(ptrbb), xvec4;
+ODUP_SX 8*SIZE(ptrbb), xvec5;
+SHUF_SX $0x44, xvec0, xvec1;
+MUL_SX xvec1, xvec4;
+ADD_SX xvec4, xvec15;
+MUL_SX xvec1, xvec5;
+ADD_SX xvec5, xvec14;
+
+SHUF_SX $0xee, xvec0, xvec2;
+EDUP_SX 12*SIZE(ptrbb), xvec6;
+ODUP_SX 12*SIZE(ptrbb), xvec7;
+MUL_SX xvec2, xvec6;
+ADD_SX xvec6, xvec15;
+MUL_SX xvec2, xvec7;
+ADD_SX xvec7, xvec14;
+
+ADDQ $8*SIZE, ptrba;
+ADDQ $16*SIZE, ptrbb;
+DECQ k;
+JG .L231_bodyB;
+.align 16
+.L231_loopE:
+#ifndef TRMMKERNEL
+TEST $2, bk;
+#else
+TEST $2, kkk;
+#endif
+JLE .L232_loopE;
+.align 16
+.L232_bodyB:
+LD_SX 0*SIZE(ptrba), xvec0;
+EDUP_SX 0*SIZE(ptrbb), xvec4;
+ODUP_SX 0*SIZE(ptrbb), xvec5;
+SHUF_SX $0x44, xvec0, xvec1;
+MUL_SX xvec1, xvec4;
+ADD_SX xvec4, xvec15;
+MUL_SX xvec1, xvec5;
+ADD_SX xvec5, xvec14;
+
+SHUF_SX $0xee, xvec0, xvec2;
+EDUP_SX 4*SIZE(ptrbb), xvec6;
+ODUP_SX 4*SIZE(ptrbb), xvec7;
+MUL_SX xvec2, xvec6;
+ADD_SX xvec6, xvec15;
+MUL_SX xvec2, xvec7;
+ADD_SX xvec7, xvec14;
+
+ADDQ $4*SIZE, ptrba;
+ADDQ $8*SIZE, ptrbb;
+.L232_loopE:
+#ifndef TRMMKERNEL
+TEST $1, bk;
+#else
+TEST $1, kkk;
+#endif
+JLE .L233_loopE;
+.align 16
+.L233_bodyB:
+LD_SX 0*SIZE(ptrba), xvec0;
+EDUP_SX 0*SIZE(ptrbb), xvec4;
+ODUP_SX 0*SIZE(ptrbb), xvec5;
+SHUF_SX $0x44, xvec0, xvec1;
+MUL_SX xvec1, xvec4;
+ADD_SX xvec4, xvec15;
+MUL_SX xvec1, xvec5;
+ADD_SX xvec5, xvec14;
+
+ADDQ $2*SIZE, ptrba;
+ADDQ $4*SIZE, ptrbb;
+.L233_loopE:
+#### Multiply Alpha ####
+BROAD_SY MEMALPHA, yvec7;
+MUL_SY xvec7, xvec15, xvec15;
+MUL_SY xvec7, xvec14, xvec14;
+#### Writing Back ####
+SHUF_SX $0xee, xvec15, xvec13;
+SHUF_SX $0xee, xvec14, xvec12;
+#ifndef TRMMKERNEL
+ADD_SY 0*SIZE(C0), xvec15, xvec15;
+ADD_SY 0*SIZE(C0, ldc, 1), xvec14, xvec14;
+ADD_SY 0*SIZE(C1), xvec13, xvec13;
+ADD_SY 0*SIZE(C1, ldc, 1), xvec12, xvec12;
+#endif
+STL_SY xvec15, 0*SIZE(C0);
+STL_SY xvec14, 0*SIZE(C0, ldc, 1);
+STL_SY xvec13, 0*SIZE(C1);
+STL_SY xvec12, 0*SIZE(C1, ldc, 1);
+#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+LEAQ (,%rax, SIZE), %rax;
+LEAQ (ptrba, %rax, 2), ptrba;
+LEAQ (ptrbb, %rax, 4), ptrbb;
+#endif
+#if defined(TRMMKERNEL) && defined(LEFT)
+ADDQ $2, kk
+#endif
+ADDQ $2*SIZE, C0;
+ADDQ $2*SIZE, C1;
+.L23_loopE:
+TEST $1, bm;
+JLE .L24_loopE;
+.align 16
+.L24_bodyB:
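+# 1x4 tile: a single A value is broadcast against the four B values; xvec15
+# accumulates and the final scalars are written back with movss.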
+#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bb, ptrbb;
+#else
+MOVQ bb, ptrbb;
+MOVQ kk, %rax;
+LEAQ (,%rax, SIZE), %rax;
+ADDQ %rax, ptrba;
+LEAQ (ptrbb, %rax, 4), ptrbb;
+#endif
+#### Initial ####
+XOR_SY yvec15, yvec15, yvec15;
+#ifndef TRMMKERNEL
+MOVQ bk, k;
+#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kk, %rax;
+MOVQ %rax, kkk;
+#else
+MOVQ kk, %rax;
+#ifdef LEFT
+ADDQ $1, %rax;
+#else
+ADDQ $4, %rax;
+#endif
+MOVQ %rax, kkk;
+#endif
+SARQ $2, k;
+JLE .L241_loopE;
+.align 16
+.L241_bodyB:
+BROAD_SX 0*SIZE(ptrba), xvec0;
+LD_SX 0*SIZE(ptrbb), xvec1;
+MUL_SX xvec0, xvec1;
+ADD_SX xvec1, xvec15;
+
+BROAD_SX 1*SIZE(ptrba), xvec2;
+LD_SX 4*SIZE(ptrbb), xvec3;
+MUL_SX xvec2, xvec3;
+ADD_SX xvec3, xvec15;
+
+BROAD_SX 2*SIZE(ptrba), xvec4;
+LD_SX 8*SIZE(ptrbb), xvec5;
+MUL_SX xvec4, xvec5;
+ADD_SX xvec5, xvec15;
+
+BROAD_SX 3*SIZE(ptrba), xvec6;
+LD_SX 12*SIZE(ptrbb), xvec7;
+MUL_SX xvec6, xvec7;
+ADD_SX xvec7, xvec15;
+ADDQ $4*SIZE, ptrba;
+ADDQ $16*SIZE, ptrbb;
+DECQ k;
+JG .L241_bodyB;
+.align 16
+.L241_loopE:
+#ifndef TRMMKERNEL
+TEST $2, bk;
+#else
+TEST $2, kkk;
+#endif
+JLE .L242_loopE;
+.align 16
+.L242_bodyB:
+BROAD_SX 0*SIZE(ptrba), xvec0;
+LD_SX 0*SIZE(ptrbb), xvec1;
+MUL_SX xvec0, xvec1;
+ADD_SX xvec1, xvec15;
+
+BROAD_SX 1*SIZE(ptrba), xvec2;
+LD_SX 4*SIZE(ptrbb), xvec3;
+MUL_SX xvec2, xvec3;
+ADD_SX xvec3, xvec15;
+ADDQ $2*SIZE, ptrba;
+ADDQ $8*SIZE, ptrbb;
+
+.L242_loopE:
+#ifndef TRMMKERNEL
+TEST $1, bk;
+#else
+TEST $1, kkk;
+#endif
+JLE .L243_loopE;
+.align 16;
+.L243_bodyB:
+BROAD_SX 0*SIZE(ptrba), xvec0;
+LD_SX 0*SIZE(ptrbb), xvec1;
+MUL_SX xvec0, xvec1;
+ADD_SX xvec1, xvec15;
+ADDQ $1*SIZE, ptrba;
+ADDQ $4*SIZE, ptrbb;
+.L243_loopE:
+#### Multiply Alpha ####
+BROAD_SX MEMALPHA, xvec7;
+MUL_SX xvec7, xvec15;
+SHUF_SX $0xff, xvec15, xvec14;
+SHUF_SX $0xaa, xvec15, xvec13;
+SHUF_SX $0x55, xvec15, xvec12;
+SHUF_SX $0x00, xvec15, xvec11;
+
+#ifndef TRMMKERNEL
+addss 0*SIZE(C0), xvec11;
+addss 0*SIZE(C0, ldc, 1), xvec12;
+addss 0*SIZE(C1), xvec13;
+addss 0*SIZE(C1, ldc, 1), xvec14;
+#endif
+
+movss xvec11, 0*SIZE(C0);
+movss xvec12, 0*SIZE(C0, ldc, 1);
+movss xvec13, 0*SIZE(C1);
+movss xvec14, 0*SIZE(C1, ldc, 1);
+#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+LEAQ (,%rax, SIZE), %rax;
+ADDQ %rax, ptrba;
+LEAQ (ptrbb, %rax, 4), ptrbb;
+#endif
+#if defined(TRMMKERNEL)&&defined(LEFT)
+ADDQ $1, kk
+#endif
+ADDQ $1*SIZE, C0;
+ADDQ $1*SIZE, C1;
+.L24_loopE:
+#if defined(TRMMKERNEL)&&!defined(LEFT)
+ADDQ $4, kk
+#endif
+MOVQ bk, k;
+SALQ $4, k;
+ADDQ k, bb;
+LEAQ (C, ldc, 4), C;
+.L20_loopE:
+TEST $2, bn;
+JLE .L30_loopE;
+.align 16
+.L30_bodyB:
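+# N-remainder: block of 2 columns of B; C0 = C and C1 = C + ldc.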
+#if defined(TRMMKERNEL) && defined(LEFT)
+MOVQ OFFSET, %rax;
+MOVQ %rax, kk
+#endif
+MOVQ C, C0;
+LEAQ (C, ldc, 1), C1;
+MOVQ ba, ptrba;
+MOVQ bm, i;
+SARQ $3, i;
+JLE .L31_loopE;
+.align 16
+.L31_bodyB:
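+# 8x2 tile: eight A values per k step; the two B values are spread across an
+# xvec with SHUF_SX $0x50/$0xfa and accumulated into xvec12-xvec15.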
+#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bb, ptrbb;
+#else
+MOVQ bb, ptrbb;
+MOVQ kk, %rax;
+LEAQ (, %rax, SIZE), %rax;
+LEAQ (ptrba, %rax, 8), ptrba;
+LEAQ (ptrbb, %rax, 2), ptrbb;
+#endif
+#### Initial ####
+XOR_SY yvec15, yvec15, yvec15;
+XOR_SY yvec14, yvec14, yvec14;
+XOR_SY yvec13, yvec13, yvec13;
+XOR_SY yvec12, yvec12, yvec12;
+#ifndef TRMMKERNEL
+MOVQ bk, k;
+#elif (defined(LEFT) && !defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kk, %rax;
+MOVQ %rax, kkk;
+#else
+MOVQ kk, %rax;
+#ifdef LEFT
+ADDQ $8, %rax;
+#else
+ADDQ $2, %rax;
+#endif
+MOVQ %rax, kkk;
+#endif
+SARQ $2, k;
+JLE .L311_loopE;
+.align 16
+.L311_bodyB:
+LD_SX 0*SIZE(ptrbb), xvec2;
+SHUF_SX $0x50, xvec2, xvec3;
+LD_SX 0*SIZE(ptrba), xvec0;
+LD_SX 4*SIZE(ptrba), xvec1;
+
+MOV_SX xvec3, xvec4;
+MUL_SX xvec0, xvec3;
+ADD_SX xvec3, xvec15;
+SHUF_SX $0x4e, xvec4, xvec5;
+MUL_SX xvec1, xvec4;
+ADD_SX xvec4, xvec14;
+
+MOV_SX xvec5, xvec6;
+MUL_SX xvec0, xvec5;
+ADD_SX xvec5, xvec13;
+MUL_SX xvec1, xvec6;
+ADD_SX xvec6, xvec12;
+
+SHUF_SX $0xfa, xvec2, xvec3;
+LD_SX 8*SIZE(ptrba), xvec0;
+LD_SX 12*SIZE(ptrba), xvec1;
+
+MOV_SX xvec3, xvec4;
+MUL_SX xvec0, xvec3;
+ADD_SX xvec3, xvec15;
+SHUF_SX $0x4e, xvec4, xvec5;
+MUL_SX xvec1, xvec4;
+ADD_SX xvec4, xvec14;
+
+MOV_SX xvec5, xvec6;
+MUL_SX xvec0, xvec5;
+ADD_SX xvec5, xvec13;
+MUL_SX xvec1, xvec6;
+ADD_SX xvec6, xvec12;
+
+LD_SX 4*SIZE(ptrbb), xvec2;
+SHUF_SX $0x50, xvec2, xvec3;
+LD_SX 16*SIZE(ptrba), xvec0;
+LD_SX 20*SIZE(ptrba), xvec1;
+
+MOV_SX xvec3, xvec4;
+MUL_SX xvec0, xvec3;
+ADD_SX xvec3, xvec15;
+SHUF_SX $0x4e, xvec4, xvec5;
+MUL_SX xvec1, xvec4;
+ADD_SX xvec4, xvec14;
+
+MOV_SX xvec5, xvec6;
+MUL_SX xvec0, xvec5;
+ADD_SX xvec5, xvec13;
+MUL_SX xvec1, xvec6;
+ADD_SX xvec6, xvec12;
+
+SHUF_SX $0xfa, xvec2, xvec3;
+LD_SX 24*SIZE(ptrba), xvec0;
+LD_SX 28*SIZE(ptrba), xvec1;
+
+MOV_SX xvec3, xvec4;
+MUL_SX xvec0, xvec3;
+ADD_SX xvec3, xvec15;
+SHUF_SX $0x4e, xvec4, xvec5;
+MUL_SX xvec1, xvec4;
+ADD_SX xvec4, xvec14;
+
+MOV_SX xvec5, xvec6;
+MUL_SX xvec0, xvec5;
+ADD_SX xvec5, xvec13;
+MUL_SX xvec1, xvec6;
+ADD_SX xvec6, xvec12;
+
+ADDQ $32*SIZE, ptrba;
+ADDQ $8*SIZE, ptrbb;
+DECQ k;
+JG .L311_bodyB;
+.align 16
+.L311_loopE:
+#ifndef TRMMKERNEL
+TEST $2, bk;
+#else
+TEST $2, kkk;
+#endif
+JLE .L312_loopE;
+.align 16
+.L312_bodyB:
+LD_SX 0*SIZE(ptrbb), xvec2;
+SHUF_SX $0x50, xvec2, xvec3;
+LD_SX 0*SIZE(ptrba), xvec0;
+LD_SX 4*SIZE(ptrba), xvec1;
+
+MOV_SX xvec3, xvec4;
+MUL_SX xvec0, xvec3;
+ADD_SX xvec3, xvec15;
+SHUF_SX $0x4e, xvec4, xvec5;
+MUL_SX xvec1, xvec4;
+ADD_SX xvec4, xvec14;
+
+MOV_SX xvec5, xvec6;
+MUL_SX xvec0, xvec5;
+ADD_SX xvec5, xvec13;
+MUL_SX xvec1, xvec6;
+ADD_SX xvec6, xvec12;
+
+SHUF_SX $0xfa, xvec2, xvec3;
+LD_SX 8*SIZE(ptrba), xvec0;
+LD_SX 12*SIZE(ptrba), xvec1;
+
+MOV_SX xvec3, xvec4;
+MUL_SX xvec0, xvec3;
+ADD_SX xvec3, xvec15;
+SHUF_SX $0x4e, xvec4, xvec5;
+MUL_SX xvec1, xvec4;
+ADD_SX xvec4, xvec14;
+
+MOV_SX xvec5, xvec6;
+MUL_SX xvec0, xvec5;
+ADD_SX xvec5, xvec13;
+MUL_SX xvec1, xvec6;
+ADD_SX xvec6, xvec12;
+ADDQ $16*SIZE, ptrba;
+ADDQ $4*SIZE, ptrbb;
+
+.L312_loopE:
+#ifndef TRMMKERNEL
+TEST $1, bk;
+#else
+TEST $1, kkk;
+#endif
+JLE .L313_loopE;
+.align 16
+.L313_bodyB:
+LD_SX 0*SIZE(ptrbb), xvec2;
+SHUF_SX $0x50, xvec2, xvec3;
+LD_SX 0*SIZE(ptrba), xvec0;
+LD_SX 4*SIZE(ptrba), xvec1;
+
+MOV_SX xvec3, xvec4;
+MUL_SX xvec0, xvec3;
+ADD_SX xvec3, xvec15;
+SHUF_SX $0x4e, xvec4, xvec5;
+MUL_SX xvec1, xvec4;
+ADD_SX xvec4, xvec14;
+
+MOV_SX xvec5, xvec6;
+MUL_SX xvec0, xvec5;
+ADD_SX xvec5, xvec13;
+MUL_SX xvec1, xvec6;
+ADD_SX xvec6, xvec12;
+ADDQ $8*SIZE, ptrba;
+ADDQ $2*SIZE, ptrbb;
+
+.L313_loopE:
+BROAD_SX MEMALPHA, xvec7;
+MUL_SX xvec7, xvec15;
+MUL_SX xvec7, xvec14;
+MUL_SX xvec7, xvec13;
+MUL_SX xvec7, xvec12;
+#### Writing Back ####
+#ifndef TRMMKERNEL
+LDL_SX 0*SIZE(C0), xvec0;
+LDH_SX 2*SIZE(C1), xvec0;
+LDL_SX 4*SIZE(C0), xvec1;
+LDH_SX 6*SIZE(C1), xvec1;
+LDL_SX 0*SIZE(C1), xvec2;
+LDH_SX 2*SIZE(C0), xvec2;
+LDL_SX 4*SIZE(C1), xvec3;
+LDH_SX 6*SIZE(C0), xvec3;
+ADD_SX xvec0, xvec15;
+ADD_SX xvec1, xvec14;
+ADD_SX xvec2, xvec13;
+ADD_SX xvec3, xvec12;
+#endif
+STL_SX xvec15, 0*SIZE(C0);
+STH_SX xvec15, 2*SIZE(C1);
+STL_SX xvec14, 4*SIZE(C0);
+STH_SX xvec14, 6*SIZE(C1);
+STL_SX xvec13, 0*SIZE(C1);
+STH_SX xvec13, 2*SIZE(C0);
+STL_SX xvec12, 4*SIZE(C1);
+STH_SX xvec12, 6*SIZE(C0);
+#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+LEAQ (,%rax, SIZE), %rax;
+LEAQ (ptrba, %rax, 8), ptrba;
+LEAQ (ptrbb, %rax, 2), ptrbb;
+#endif
+#if defined(TRMMKERNEL) && defined(LEFT)
+ADDQ $8, kk
+#endif
+ADDQ $8*SIZE, C0;
+ADDQ $8*SIZE, C1;
+DECQ i;
+JG .L31_bodyB;
+.align 16
+.L31_loopE:
+TEST $4, bm;
+JLE .L32_loopE;
+.align 16
+.L32_bodyB:
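+# 4x2 tile: four A values per k step against the two B values; xvec14/xvec15
+# accumulate.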
+#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bb, ptrbb;
+#else
+MOVQ bb, ptrbb;
+MOVQ kk, %rax;
+LEAQ (,%rax, SIZE), %rax;
+LEAQ (ptrba, %rax, 4), ptrba;
+LEAQ (ptrbb, %rax, 2), ptrbb;
+#endif
+#### Initial ####
+XOR_SY yvec15, yvec15, yvec15;
+XOR_SY yvec14, yvec14, yvec14;
+#ifndef TRMMKERNEL
+MOVQ bk, k;
+#elif (defined(LEFT) && !defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kk, %rax;
+MOVQ %rax, kkk;
+#else
+MOVQ kk, %rax;
+#ifdef LEFT
+ADDQ $4, %rax;
+#else
+ADDQ $2, %rax;
+#endif
+MOVQ %rax, kkk;
+#endif
+SARQ $2, k;
+JLE .L321_loopE;
+.align 16
+.L321_bodyB:
+LD_SX 0*SIZE(ptrba), xvec0;
+LD_SX 0*SIZE(ptrbb), xvec2;
+SHUF_SX $0x50, xvec2, xvec3;
+SHUF_SX $0x05, xvec2, xvec4;
+MUL_SX xvec0, xvec3;
+ADD_SX xvec3, xvec15;
+MUL_SX xvec0, xvec4;
+ADD_SX xvec4, xvec14;
+
+LD_SX 4*SIZE(ptrba), xvec0;
+SHUF_SX $0xfa, xvec2, xvec5;
+SHUF_SX $0xaf, xvec2, xvec6;
+MUL_SX xvec0, xvec5;
+ADD_SX xvec5, xvec15;
+MUL_SX xvec0, xvec6;
+ADD_SX xvec6, xvec14;
+
+LD_SX 8*SIZE(ptrba), xvec0;
+LD_SX 4*SIZE(ptrbb), xvec2;
+SHUF_SX $0x50, xvec2, xvec3;
+SHUF_SX $0x05, xvec2, xvec4;
+MUL_SX xvec0, xvec3;
+ADD_SX xvec3, xvec15;
+MUL_SX xvec0, xvec4;
+ADD_SX xvec4, xvec14;
+
+LD_SX 12*SIZE(ptrba), xvec0;
+SHUF_SX $0xfa, xvec2, xvec5;
+SHUF_SX $0xaf, xvec2, xvec6;
+MUL_SX xvec0, xvec5;
+ADD_SX xvec5, xvec15;
+MUL_SX xvec0, xvec6;
+ADD_SX xvec6, xvec14;
+
+ADDQ $16*SIZE, ptrba;
+ADDQ $8*SIZE, ptrbb;
+DECQ k;
+JG .L321_bodyB;
+.align 16
+.L321_loopE:
+#ifndef TRMMKERNEL
+TEST $2, bk;
+#else
+TEST $2, kkk;
+#endif
+JLE .L322_loopE;
+.align 16
+.L322_bodyB:
+LD_SX 0*SIZE(ptrba), xvec0;
+LD_SX 0*SIZE(ptrbb), xvec2;
+SHUF_SX $0x50, xvec2, xvec3;
+SHUF_SX $0x05, xvec2, xvec4;
+MUL_SX xvec0, xvec3;
+ADD_SX xvec3, xvec15;
+MUL_SX xvec0, xvec4;
+ADD_SX xvec4, xvec14;
+
+LD_SX 4*SIZE(ptrba), xvec0;
+SHUF_SX $0xfa, xvec2, xvec5;
+SHUF_SX $0xaf, xvec2, xvec6;
+MUL_SX xvec0, xvec5;
+ADD_SX xvec5, xvec15;
+MUL_SX xvec0, xvec6;
+ADD_SX xvec6, xvec14;
+ADDQ $8*SIZE, ptrba;
+ADDQ $4*SIZE, ptrbb;
+
+.L322_loopE:
+#ifndef TRMMKERNEL
+TEST $1, bk;
+#else
+TEST $1, kkk;
+#endif
+JLE .L323_loopE;
+.align 16
+.L323_bodyB:
+LD_SX 0*SIZE(ptrba), xvec0;
+LD_SX 0*SIZE(ptrbb), xvec2;
+SHUF_SX $0x50, xvec2, xvec3;
+SHUF_SX $0x05, xvec2, xvec4;
+MUL_SX xvec0, xvec3;
+ADD_SX xvec3, xvec15;
+MUL_SX xvec0, xvec4;
+ADD_SX xvec4, xvec14;
+ADDQ $4*SIZE, ptrba;
+ADDQ $2*SIZE, ptrbb;
+
+.L323_loopE:
+BROAD_SX MEMALPHA, xvec7;
+MUL_SX xvec7, xvec15;
+MUL_SX xvec7, xvec14;
+#### Writing back ####
+#ifndef TRMMKERNEL
+LDL_SX 0*SIZE(C0), xvec0;
+LDH_SX 2*SIZE(C1), xvec0;
+LDL_SX 0*SIZE(C1), xvec1;
+LDH_SX 2*SIZE(C0), xvec1;
+ADD_SX xvec0, xvec15;
+ADD_SX xvec1, xvec14;
+#endif
+STL_SX xvec15, 0*SIZE(C0);
+STH_SX xvec15, 2*SIZE(C1);
+STL_SX xvec14, 0*SIZE(C1);
+STH_SX xvec14, 2*SIZE(C0);
+#if (defined(TRMMKERNEL)&& defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&& !defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+LEAQ (,%rax, SIZE), %rax;
+LEAQ (ptrba, %rax, 4), ptrba;
+LEAQ (ptrbb, %rax, 2), ptrbb;
+#endif
+#if defined(TRMMKERNEL) && defined(LEFT)
+ADDQ $4, kk
+#endif
+ADDQ $4*SIZE, C0;
+ADDQ $4*SIZE, C1;
+.L32_loopE:
+TEST $2, bm;
+JLE .L33_loopE;
+.align 16
+.L33_bodyB:
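+# 2x2 tile: xvec14/xvec15 accumulate; their halves are folded together after
+# the alpha scaling below.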
+#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bb, ptrbb;
+#else
+MOVQ bb, ptrbb;
+MOVQ kk, %rax;
+LEAQ (,%rax, SIZE), %rax;
+LEAQ (ptrba, %rax, 2), ptrba;
+LEAQ (ptrbb, %rax, 2), ptrbb;
+#endif
+#### Initial ####
+XOR_SY yvec15, yvec15, yvec15;
+XOR_SY yvec14, yvec14, yvec14;
+XOR_SY yvec13, yvec13, yvec13;
+XOR_SY yvec12, yvec12, yvec12;
+#ifndef TRMMKERNEL
+MOVQ bk, k;
+#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kk, %rax;
+MOVQ %rax, kkk;
+#else
+MOVQ kk, %rax;
+#ifdef LEFT
+ADDQ $2, %rax;
+#else
+ADDQ $2, %rax;
+#endif
+MOVQ %rax, kkk;
+#endif
+SARQ $2, k;
+JLE .L331_loopE;
+.align 16
+.L331_bodyB:
+LD_SX 0*SIZE(ptrba), xvec0; # a0, a1, a2, a3
+EDUP_SX 0*SIZE(ptrbb), xvec2; # b0, b0, b2, b2
+ODUP_SX 0*SIZE(ptrbb), xvec3; # b1, b1, b3, b3
+
+MUL_SX xvec0, xvec2; # c00, c10
+ADD_SX xvec2, xvec15;
+MUL_SX xvec0, xvec3; # C01, c11
+ADD_SX xvec3, xvec14;
+
+LD_SX 4*SIZE(ptrba), xvec0;
+EDUP_SX 4*SIZE(ptrbb), xvec2;
+ODUP_SX 4*SIZE(ptrbb), xvec3;
+
+MUL_SX xvec0, xvec2;
+ADD_SX xvec2, xvec15;
+MUL_SX xvec0, xvec3;
+ADD_SX xvec3, xvec14;
+ADDQ $8*SIZE, ptrba;
+ADDQ $8*SIZE, ptrbb;
+DECQ k;
+JG .L331_bodyB;
+.align 16
+.L331_loopE:
+#ifndef TRMMKERNEL
+TEST $2, bk;
+#else
+TEST $2, kkk;
+#endif
+JLE .L332_loopE;
+.align 16
+.L332_bodyB:
+LD_SX 0*SIZE(ptrba), xvec0; # a0, a1, a2, a3
+EDUP_SX 0*SIZE(ptrbb), xvec2; # b0, b0, b2, b2
+ODUP_SX 0*SIZE(ptrbb), xvec3; # b1, b1, b3, b3
+
+MUL_SX xvec0, xvec2; # c00, c10
+ADD_SX xvec2, xvec15;
+MUL_SX xvec0, xvec3; # C01, c11
+ADD_SX xvec3, xvec14;
+
+ADDQ $4*SIZE, ptrba;
+ADDQ $4*SIZE, ptrbb;
+
+.L332_loopE:
+#ifndef TRMMKERNEL
+TEST $1, bk;
+#else
+TEST $1, kkk;
+#endif
+JLE .L333_loopE;
+.align 16
+.L333_bodyB:
+movss 0*SIZE(ptrba), xvec0;
+movss 1*SIZE(ptrba), xvec1;
+movss 0*SIZE(ptrbb), xvec2;
+XOR_SY yvec3, yvec3, yvec3;
+movss xvec2, xvec3;
+mulss xvec0, xvec2;
+addss xvec2, xvec15;
+mulss xvec1, xvec3;
+SHUF_SX $0xe1, xvec3, xvec4;
+ADD_SX xvec4, xvec15;
+
+movss 1*SIZE(ptrbb), xvec5;
+XOR_SY yvec6, yvec6, yvec6;
+movss xvec5, xvec6;
+mulss xvec0, xvec5;
+addss xvec5, xvec14;
+mulss xvec1, xvec6;
+SHUF_SX $0xe1, xvec6, xvec7;
+ADD_SX xvec7, xvec14
+
+ADDQ $2*SIZE, ptrba;
+ADDQ $2*SIZE, ptrbb;
+.L333_loopE:
+BROAD_SX MEMALPHA, xvec7;
+MUL_SX xvec7, xvec15;
+MUL_SX xvec7, xvec14;
+SHUF_SX $0xee, xvec15, xvec13;
+SHUF_SX $0xee, xvec14, xvec12;
+SHUF_SX $0x44, xvec15, xvec11;
+SHUF_SX $0x44, xvec14, xvec10;
+ADD_SX xvec13, xvec11;
+ADD_SX xvec12, xvec10;
+
+#ifndef TRMMKERNEL
+LDL_SX 0*SIZE(C0), xvec0;
+LDL_SX 0*SIZE(C1), xvec1;
+ADD_SX xvec0, xvec11;
+ADD_SX xvec1, xvec10;
+#endif
+STL_SX xvec11, 0*SIZE(C0);
+STL_SX xvec10, 0*SIZE(C1);
+#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+LEAQ (,%rax, SIZE), %rax;
+LEAQ (ptrba, %rax, 2), ptrba;
+LEAQ (ptrbb, %rax, 2), ptrbb;
+#endif
+#if defined(TRMMKERNEL) && defined(LEFT)
+ADDQ $2, kk;
+#endif
+ADDQ $2*SIZE, C0;
+ADDQ $2*SIZE, C1;
+#### Writing Back ####
+.L33_loopE:
+TEST $1, bm;
+JLE .L34_loopE;
+.align 16
+.L34_bodyB:
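+# 1x2 tile: plain scalar movss/mulss/addss loop; xvec15 and xvec14 hold the
+# two running dot products.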
+#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bb, ptrbb;
+#else
+MOVQ bb, ptrbb;
+MOVQ kk, %rax;
+LEAQ (, %rax, SIZE), %rax;
+ADDQ %rax, ptrba;
+LEAQ (ptrbb, %rax, 2), ptrbb;
+#endif
+#### Initial ####
+XOR_SY yvec15, yvec15, yvec15;
+XOR_SY yvec14, yvec14, yvec14;
+#ifndef TRMMKERNEL
+MOVQ bk, k;
+#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kk, %rax;
+MOVQ %rax, kkk;
+#else
+MOVQ kk, %rax;
+#ifdef LEFT
+ADDQ $1, %rax;
+#else
+ADDQ $2, %rax;
+#endif
+MOVQ %rax, kkk;
+#endif
+SARQ $2, k;
+JLE .L341_loopE;
+.align 16
+.L341_bodyB:
+movss 0*SIZE(ptrba), xvec0;
+movss 0*SIZE(ptrbb), xvec1;
+mulss xvec0, xvec1;
+addss xvec1, xvec15;
+
+movss 1*SIZE(ptrbb), xvec2;
+mulss xvec0, xvec2;
+addss xvec2, xvec14;
+
+movss 1*SIZE(ptrba), xvec0;
+movss 2*SIZE(ptrbb), xvec1;
+mulss xvec0, xvec1;
+addss xvec1, xvec15;
+
+movss 3*SIZE(ptrbb), xvec2;
+mulss xvec0, xvec2;
+addss xvec2, xvec14;
+
+movss 2*SIZE(ptrba), xvec0;
+movss 4*SIZE(ptrbb), xvec1;
+mulss xvec0, xvec1;
+addss xvec1, xvec15;
+
+movss 5*SIZE(ptrbb), xvec2;
+mulss xvec0, xvec2;
+addss xvec2, xvec14;
+
+movss 3*SIZE(ptrba), xvec0;
+movss 6*SIZE(ptrbb), xvec1;
+mulss xvec0, xvec1;
+addss xvec1, xvec15;
+
+movss 7*SIZE(ptrbb), xvec2;
+mulss xvec0, xvec2;
+addss xvec2, xvec14;
+
+addq $4*SIZE, ptrba;
+addq $8*SIZE, ptrbb;
+decq k;
+jg .L341_bodyB;
+.align 16
+.L341_loopE:
+#ifndef TRMMKERNEL
+TEST $2, bk;
+#else
+TEST $2, kkk;
+#endif
+JLE .L342_loopE;
+.align 16
+.L342_bodyB:
+movss 0*SIZE(ptrba), xvec0;
+movss 0*SIZE(ptrbb), xvec1;
+mulss xvec0, xvec1;
+addss xvec1, xvec15;
+
+movss 1*SIZE(ptrbb), xvec2;
+mulss xvec0, xvec2;
+addss xvec2, xvec14;
+
+movss 1*SIZE(ptrba), xvec0;
+movss 2*SIZE(ptrbb), xvec1;
+mulss xvec0, xvec1;
+addss xvec1, xvec15;
+
+movss 3*SIZE(ptrbb), xvec2;
+mulss xvec0, xvec2;
+addss xvec2, xvec14;
+addq $2*SIZE, ptrba;
+addq $4*SIZE, ptrbb;
+.L342_loopE:
+#ifndef TRMMKERNEL
+TEST $1, bk;
+#else
+TEST $1, kkk;
+#endif
+JLE .L343_loopE;
+.align 16
+.L343_bodyB:
+movss 0*SIZE(ptrba), xvec0;
+movss 0*SIZE(ptrbb), xvec1;
+mulss xvec0, xvec1;
+addss xvec1, xvec15;
+
+movss 1*SIZE(ptrbb), xvec2;
+mulss xvec0, xvec2;
+addss xvec2, xvec14;
+addq $1*SIZE, ptrba;
+addq $2*SIZE, ptrbb;
+
+.L343_loopE:
+#### Writing back ####
+movss MEMALPHA, xvec7;
+mulss xvec7, xvec15;
+mulss xvec7, xvec14;
+movss 0*SIZE(C0), xvec0;
+movss 0*SIZE(C1), xvec1;
+#ifndef TRMMKERNEL
+addss xvec0, xvec15;
+addss xvec1, xvec14;
+#endif
+movss xvec15, 0*SIZE(C0);
+movss xvec14, 0*SIZE(C1);
+#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+LEAQ (,%rax, SIZE), %rax;
+ADDQ %rax, ptrba;
+LEAQ (ptrbb, %rax, 2), ptrbb;
+#endif
+#if defined(TRMMKERNEL) && defined(LEFT)
+ADDQ $1, kk;
+#endif
+addq $1*SIZE, C0;
+addq $1*SIZE, C1;
+.L34_loopE:
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ADDQ $2, kk;
+#endif
+MOVQ bk, k;
+SALQ $3, k;
+ADDQ k, bb;
+LEAQ (C, ldc, 2), C;
+.L30_loopE:
+TEST $1, bn;
+JLE .L40_loopE;
+.align 16
+.L40_bodyB:
+#if defined(TRMMKERNEL)&&defined(LEFT)
+MOVQ OFFSET, %rax;
+MOVQ %rax, kk;
+#endif
+MOVQ C, C0;
+MOVQ ba, ptrba;
+MOVQ bm, i;
+SARQ $3, i;
+JLE .L41_loopE;
+.align 16
+.L41_bodyB:
+#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bb, ptrbb;
+#else
+MOVQ bb, ptrbb;
+MOVQ kk, %rax;
+LEAQ (, %rax, SIZE), %rax;
+LEAQ (ptrba, %rax, 8), ptrba;
+ADDQ %rax, ptrbb;
+#endif
+#### initial ####
+XOR_SY yvec15, yvec15, yvec15;
+#ifndef TRMMKERNEL
+MOVQ bk, k;
+#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kk, %rax;
+MOVQ %rax, kkk;
+#else
+MOVQ kk, %rax;
+#ifdef LEFT
+ADDQ $8, %rax;
+#else
+ADDQ $1, %rax;
+#endif
+MOVQ %rax, kkk;
+#endif
+SARQ $2, k;
+JLE .L411_loopE;
+.align 16
+.L411_bodyB:
+LD_SY 0*SIZE(ptrba), yvec0;
+BROAD_SY 0*SIZE(ptrbb), yvec1;
+MUL_SY yvec0, yvec1, yvec2;
+ADD_SY yvec2, yvec15, yvec15;
+
+LD_SY 8*SIZE(ptrba), yvec0;
+BROAD_SY 1*SIZE(ptrbb), yvec1;
+MUL_SY yvec0, yvec1, yvec2;
+ADD_SY yvec2, yvec15, yvec15;
+
+LD_SY 16*SIZE(ptrba), yvec0;
+BROAD_SY 2*SIZE(ptrbb), yvec1;
+MUL_SY yvec0, yvec1, yvec2;
+ADD_SY yvec2, yvec15, yvec15;
+
+LD_SY 24*SIZE(ptrba), yvec0;
+BROAD_SY 3*SIZE(ptrbb), yvec1;
+MUL_SY yvec0, yvec1, yvec2;
+ADD_SY yvec2, yvec15, yvec15;
+
+ADDQ $32*SIZE, ptrba;
+ADDQ $4*SIZE, ptrbb;
+DECQ k;
+JG .L411_bodyB;
+.align 16
+.L411_loopE:
+#ifndef TRMMKERNEL
+TEST $2, bk;
+#else
+TEST $2, kkk;
+#endif
+JLE .L412_loopE;
+.align 16
+.L412_bodyB:
+LD_SY 0*SIZE(ptrba), yvec0;
+BROAD_SY 0*SIZE(ptrbb), yvec1;
+MUL_SY yvec0, yvec1, yvec2;
+ADD_SY yvec2, yvec15, yvec15;
+
+LD_SY 8*SIZE(ptrba), yvec0;
+BROAD_SY 1*SIZE(ptrbb), yvec1;
+MUL_SY yvec0, yvec1, yvec2;
+ADD_SY yvec2, yvec15, yvec15;
+
+ADDQ $16*SIZE, ptrba;
+ADDQ $2*SIZE, ptrbb;
+.L412_loopE:
+#ifndef TRMMKERNEL
+TEST $1, bk;
+#else
+TEST $1, kkk;
+#endif
+JLE .L413_loopE;
+.align 16
+.L413_bodyB:
+LD_SY 0*SIZE(ptrba), yvec0;
+BROAD_SY 0*SIZE(ptrbb), yvec1;
+MUL_SY yvec0, yvec1, yvec2;
+ADD_SY yvec2, yvec15, yvec15;
+ADDQ $8*SIZE, ptrba;
+ADDQ $1*SIZE, ptrbb;
+.L413_loopE:
+#### Writing ####
+BROAD_SY MEMALPHA, yvec7;
+MUL_SY yvec7, yvec15, yvec15;
+EXTRA_SY $1, yvec15, xvec14;
+SHUF_SX $0x44, xvec15, xvec13;
+SHUF_SX $0xee, xvec15, xvec12;
+SHUF_SX $0x44, xvec14, xvec11;
+SHUF_SX $0xee, xvec14, xvec10;
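+# yvec15 held eight partial sums for this 8x1 block; the extract and
+# shuffles above split it into four 2-element pieces (xvec13, xvec12,
+# xvec11, xvec10) for the 64-bit stores below.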
+#ifndef TRMMKERNEL
+LDL_SX 0*SIZE(C0), xvec0;
+LDL_SX 2*SIZE(C0), xvec1;
+LDL_SX 4*SIZE(C0), xvec2;
+LDL_SX 6*SIZE(C0), xvec3;
+ADD_SX xvec0, xvec13;
+ADD_SX xvec1, xvec12;
+ADD_SX xvec2, xvec11;
+ADD_SX xvec3, xvec10;
+#endif
+STL_SX xvec13, 0*SIZE(C0);
+STL_SX xvec12, 2*SIZE(C0);
+STL_SX xvec11, 4*SIZE(C0);
+STL_SX xvec10, 6*SIZE(C0);
+#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+LEAQ (,%rax, SIZE), %rax;
+LEAQ (ptrba, %rax, 8), ptrba;
+ADDQ %rax, ptrbb;
+#endif
+#if defined(TRMMKERNEL)&&defined(LEFT)
+ADDQ $8, kk;
+#endif
+ADDQ $8*SIZE, C0;
+DECQ i;
+JG .L41_bodyB;
+.align 16
+.L41_loopE:
+TEST $4, bm;
+JLE .L42_loopE;
+.align 16
+.L42_bodyB:
+#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bb, ptrbb;
+#else
+MOVQ bb, ptrbb;
+MOVQ kk, %rax;
+LEAQ (,%rax, SIZE), %rax;
+LEAQ (ptrba, %rax, 4), ptrba;
+ADDQ %rax, ptrbb;
+#endif
+XOR_SY yvec15, yvec15, yvec15;
+#ifndef TRMMKERNEL
+MOVQ bk, k;
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kk, %rax;
+MOVQ %rax, kkk;
+#else
+MOVQ kk, %rax;
+#ifdef LEFT
+ADDQ $4, %rax;
+#else
+ADDQ $1, %rax;
+#endif
+MOVQ %rax, kkk;
+#endif
+SARQ $2, k;
+JLE .L421_loopE;
+.align 16
+.L421_bodyB:
+LD_SX 0*SIZE(ptrba), xvec0;
+BROAD_SX 0*SIZE(ptrbb), xvec1;
+MUL_SX xvec0, xvec1;
+ADD_SX xvec1, xvec15;
+
+LD_SX 4*SIZE(ptrba), xvec0;
+BROAD_SX 1*SIZE(ptrbb), xvec1;
+MUL_SX xvec0, xvec1;
+ADD_SX xvec1, xvec15;
+
+LD_SX 8*SIZE(ptrba), xvec0;
+BROAD_SX 2*SIZE(ptrbb), xvec1;
+MUL_SX xvec0, xvec1;
+ADD_SX xvec1, xvec15;
+
+LD_SX 12*SIZE(ptrba), xvec0;
+BROAD_SX 3*SIZE(ptrbb), xvec1;
+MUL_SX xvec0, xvec1;
+ADD_SX xvec1, xvec15;
+ADDQ $16*SIZE, ptrba;
+ADDQ $4*SIZE, ptrbb;
+DECQ k;
+JG .L421_bodyB;
+.align 16
+.L421_loopE:
+#ifndef TRMMKERNEL
+TEST $2, bk;
+#else
+TEST $2, kkk;
+#endif
+JLE .L422_loopE;
+.align 16
+.L422_bodyB:
+LD_SX 0*SIZE(ptrba), xvec0;
+BROAD_SX 0*SIZE(ptrbb), xvec1;
+MUL_SX xvec0, xvec1;
+ADD_SX xvec1, xvec15;
+
+LD_SX 4*SIZE(ptrba), xvec0;
+BROAD_SX 1*SIZE(ptrbb), xvec1;
+MUL_SX xvec0, xvec1;
+ADD_SX xvec1, xvec15;
+
+ADDQ $8*SIZE, ptrba;
+ADDQ $2*SIZE, ptrbb;
+.L422_loopE:
+#ifndef TRMMKERNEL
+TEST $1, bk;
+#else
+TEST $1, kkk;
+#endif
+JLE .L423_loopE;
+.align 16
+.L423_bodyB:
+LD_SX 0*SIZE(ptrba), xvec0;
+BROAD_SX 0*SIZE(ptrbb), xvec1;
+MUL_SX xvec0, xvec1;
+ADD_SX xvec1, xvec15;
+ADDQ $4*SIZE, ptrba;
+ADDQ $1*SIZE, ptrbb;
+
+.L423_loopE:
+#### Writing back ####
+BROAD_SX MEMALPHA, xvec7;
+MUL_SX xvec7, xvec15;
+#ifndef TRMMKERNEL
+LDL_SX 0*SIZE(C0), xvec0;
+LDH_SX 2*SIZE(C0), xvec0;
+ADD_SX xvec0, xvec15;
+#endif
+STL_SX xvec15, 0*SIZE(C0);
+STH_SX xvec15, 2*SIZE(C0);
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+LEAQ (, %rax, SIZE), %rax;
+LEAQ (ptrba, %rax, 4), ptrba;
+ADDQ %rax, ptrbb;
+#endif
+#if defined(TRMMKERNEL) && defined(LEFT)
+ADDQ $4, kk;
+#endif
+ADDQ $4*SIZE, C0;
+
+.L42_loopE:
+TEST $2, bm;
+JLE .L43_loopE;
+.align 16
+.L43_bodyB:
+#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+MOVQ bb, ptrbb;
+#else
+MOVQ bb, ptrbb;
+MOVQ kk, %rax;
+LEAQ (, %rax, SIZE), %rax;
+LEAQ (ptrba, %rax, 2), ptrba;
+ADDQ %rax, ptrbb;
+#endif
+XOR_SY yvec15, yvec15, yvec15;
+XOR_SY yvec14, yvec14, yvec14;
+#ifndef TRMMKERNEL
+MOVQ bk, k;
+#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kk, %rax;
+MOVQ %rax, kkk;
+#else
+MOVQ kk, %rax;
+#ifdef LEFT
+ADDQ $2, %rax;
+#else
+ADDQ $1, %rax;
+#endif
+MOVQ %rax, kkk;
+#endif
+SARQ $2, k;
+JLE .L431_loopE;
+.align 16
+.L431_bodyB:
+movss 0*SIZE(ptrba), xvec0;
+movss 1*SIZE(ptrba), xvec1;
+movss 0*SIZE(ptrbb), xvec2;
+mulss xvec2, xvec0;
+addss xvec0, xvec15;
+mulss xvec2, xvec1;
+addss xvec1, xvec14;
+
+movss 2*SIZE(ptrba), xvec3;
+movss 3*SIZE(ptrba), xvec4;
+movss 1*SIZE(ptrbb), xvec5;
+mulss xvec5, xvec3;
+addss xvec3, xvec15;
+mulss xvec5, xvec4;
+addss xvec4, xvec14;
+
+movss 4*SIZE(ptrba), xvec0;
+movss 5*SIZE(ptrba), xvec1;
+movss 2*SIZE(ptrbb), xvec2;
+mulss xvec2, xvec0;
+addss xvec0, xvec15;
+mulss xvec2, xvec1;
+addss xvec1, xvec14;
+
+movss 6*SIZE(ptrba), xvec3;
+movss 7*SIZE(ptrba), xvec4;
+movss 3*SIZE(ptrbb), xvec5;
+mulss xvec5, xvec3;
+addss xvec3, xvec15;
+mulss xvec5, xvec4;
+addss xvec4, xvec14;
+addq $8*SIZE, ptrba;
+addq $4*SIZE, ptrbb;
+decq k;
+JG .L431_bodyB;
+.align 16
+.L431_loopE:
+#ifndef TRMMKERNEL
+TEST $2, bk;
+#else
+TEST $2, kkk;
+#endif
+JLE .L432_loopE;
+.align 16
+.L432_bodyB:
+movss 0*SIZE(ptrba), xvec0;
+movss 1*SIZE(ptrba), xvec1;
+movss 0*SIZE(ptrbb), xvec2;
+mulss xvec2, xvec0;
+addss xvec0, xvec15;
+mulss xvec2, xvec1;
+addss xvec1, xvec14;
+
+movss 2*SIZE(ptrba), xvec3;
+movss 3*SIZE(ptrba), xvec4;
+movss 1*SIZE(ptrbb), xvec5;
+mulss xvec5, xvec3;
+addss xvec3, xvec15;
+mulss xvec5, xvec4;
+addss xvec4, xvec14;
+addq $4*SIZE, ptrba;
+addq $2*SIZE, ptrbb;
+
+.L432_loopE:
+#ifndef TRMMKERNEL
+TEST $1, bk;
+#else
+TEST $1, kkk;
+#endif
+JLE .L433_loopE;
+.align 16
+.L433_bodyB:
+movss 0*SIZE(ptrba), xvec0;
+movss 1*SIZE(ptrba), xvec1;
+movss 0*SIZE(ptrbb), xvec2;
+mulss xvec2, xvec0;
+addss xvec0, xvec15;
+mulss xvec2, xvec1;
+addss xvec1, xvec14;
+addq $2*SIZE, ptrba;
+addq $1*SIZE, ptrbb;
+
+.L433_loopE:
+#### Writing Back ####
+movss MEMALPHA, xvec7;
+mulss xvec7, xvec15;
+mulss xvec7, xvec14;
+
+#ifndef TRMMKERNEL
+addss 0*SIZE(C0), xvec15;
+addss 1*SIZE(C0), xvec14;
+#endif
+movss xvec15, 0*SIZE(C0);
+movss xvec14, 1*SIZE(C0);
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+LEAQ (,%rax, SIZE), %rax;
+LEAQ (ptrba, %rax, 2), ptrba;
+ADDQ %rax, ptrbb;
+#endif
+#if defined(TRMMKERNEL) && defined(LEFT)
+addq $2, kk;
+#endif
+addq $2*SIZE, C0;
+
+.L43_loopE:
+TEST $1, bm;
+JLE .L44_loopE;
+.align 16
+.L44_bodyB:
+#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+MOVQ bb, ptrbb;
+#else
+MOVQ bb, ptrbb;
+MOVQ kk, %rax;
+LEAQ (, %rax, SIZE), %rax;
+ADDQ %rax, ptrba;
+ADDQ %rax, ptrbb;
+#endif
+XOR_SY yvec15, yvec15, yvec15;
+#ifndef TRMMKERNEL
+MOVQ bk, k;
+#elif (defined(LEFT)&& !defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kk, %rax;
+MOVQ %rax, kkk;
+#else
+MOVQ kk, %rax;
+#ifdef LEFT
+ADDQ $1, %rax;
+#else
+ADDQ $1, %rax;
+#endif
+MOVQ %rax, kkk;
+#endif
+SARQ $2, k;
+JLE .L441_loopE;
+.align 16
+.L441_bodyB:
+movss 0*SIZE(ptrba), xvec0;
+movss 0*SIZE(ptrbb), xvec1;
+mulss xvec0, xvec1;
+addss xvec1, xvec15;
+
+movss 1*SIZE(ptrba), xvec0;
+movss 1*SIZE(ptrbb), xvec1;
+mulss xvec0, xvec1;
+addss xvec1, xvec15;
+
+movss 2*SIZE(ptrba), xvec0;
+movss 2*SIZE(ptrbb), xvec1;
+mulss xvec0, xvec1;
+addss xvec1, xvec15;
+
+movss 3*SIZE(ptrba), xvec0;
+movss 3*SIZE(ptrbb), xvec1;
+mulss xvec0, xvec1;
+addss xvec1, xvec15;
+addq $4*SIZE, ptrba;
+addq $4*SIZE, ptrbb;
+decq k;
+JG .L441_bodyB;
+.align 16
+.L441_loopE:
+#ifndef TRMMKERNEL
+TEST $2, bk;
+#else
+TEST $2, kkk;
+#endif
+JLE .L442_loopE;
+.align 16
+.L442_bodyB:
+movss 0*SIZE(ptrba), xvec0;
+movss 0*SIZE(ptrbb), xvec1;
+mulss xvec0, xvec1;
+addss xvec1, xvec15;
+
+movss 1*SIZE(ptrba), xvec0;
+movss 1*SIZE(ptrbb), xvec1;
+mulss xvec0, xvec1;
+addss xvec1, xvec15;
+addq $2*SIZE, ptrba;
+addq $2*SIZE, ptrbb;
+
+.L442_loopE:
+#ifndef TRMMKERNEL
+TEST $1, bk;
+#else
+TEST $1, kkk;
+#endif
+JLE .L443_loopE;
+.align 16
+.L443_bodyB:
+movss 0*SIZE(ptrba), xvec0;
+movss 0*SIZE(ptrbb), xvec1;
+mulss xvec0, xvec1;
+addss xvec1, xvec15;
+addq $1*SIZE, ptrba;
+addq $1*SIZE, ptrbb;
+
+.L443_loopE:
+#### Writing Back ####
+movss MEMALPHA, xvec7;
+mulss xvec7, xvec15;
+#ifndef TRMMKERNEL
+addss 0*SIZE(C0), xvec15;
+#endif
+movss xvec15, 0*SIZE(C0);
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+LEAQ (,%rax, SIZE), %rax;
+ADDQ %rax, ptrba;
+ADDQ %rax, ptrbb;
+#endif
+#if defined(TRMMKERNEL) && defined(LEFT)
+addq $1, kk;
+#endif
+addq $1*SIZE, C0;
+
+.L44_loopE:
+MOVQ bk, k;
+SALQ $2, k;
+ADDQ k, bb;
+ADDQ ldc, C;
+.L40_loopE:
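+# Epilogue: restore callee-saved registers (plus rdi/rsi and xmm6-xmm15
+# under WINDOWS_ABI), release the stack frame, and return.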
+movq 0(%rsp), %rbx;
+movq 8(%rsp), %rbp;
+movq 16(%rsp), %r12;
+movq 24(%rsp), %r13;
+movq 32(%rsp), %r14;
+movq 40(%rsp), %r15;
+#ifdef WINDOWS_ABI
+ movq 48(%rsp), %rdi
+ movq 56(%rsp), %rsi
+ movups 64(%rsp), %xmm6
+ movups 80(%rsp), %xmm7
+ movups 96(%rsp), %xmm8
+ movups 112(%rsp), %xmm9
+ movups 128(%rsp), %xmm10
+ movups 144(%rsp), %xmm11
+ movups 160(%rsp), %xmm12
+ movups 176(%rsp), %xmm13
+ movups 192(%rsp), %xmm14
+ movups 208(%rsp), %xmm15
+#endif
+addq $STACKSIZE, %rsp;
+ret
+
+EPILOGUE
--- /dev/null
+/*****************************************************************************
+ Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the ISCAS nor the names of its contributors may
+be used to endorse or promote products derived from this software
+without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ **********************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define old_bm %rdi
+#define old_bn %rsi
+#define old_bk %rdx
+
+#define bm %r13
+#define bn %r14
+#define bk %r15
+
+#define ALPHA %xmm0
+#define ba %rcx
+#define bb %r8
+#define C %r9
+#define ldc %r10
+
+#define i %r11
+#define k %rax
+
+#define ptrba %rdi
+#define ptrbb %rsi
+#define C0 %rbx
+#define C1 %rbp
+
+#define prebb %r12
+
+#ifndef WINDOWS_ABI
+
+#define STACKSIZE 128
+
+#define old_ldc 8+STACKSIZE(%rsp)
+#define old_offset 16+STACKSIZE(%rsp)
+
+#define MEMALPHA_R 48(%rsp)
+#define MEMALPHA_I 56(%rsp)
+#define j 64(%rsp)
+#define OFFSET 72(%rsp)
+#define kk 80(%rsp)
+#define kkk 88(%rsp)
+
+#else
+#define STACKSIZE 512
+
+#define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
+#define OLD_A 48 + STACKSIZE(%rsp)
+#define OLD_B 56 + STACKSIZE(%rsp)
+#define OLD_C 64 + STACKSIZE(%rsp)
+#define old_ldc 72 + STACKSIZE(%rsp)
+#define old_offset 80 + STACKSIZE(%rsp)
+
+#define MEMALPHA_R 224(%rsp)
+#define MEMALPHA_I 232(%rsp)
+#define j 240(%rsp)
+#define OFFSET 248(%rsp)
+#define kk 256(%rsp)
+#define kkk 264(%rsp)
+
+#endif
+
+#define PREFETCH0 prefetcht0
+#define PREFETCH1 prefetcht0
+#define PREFETCH2 prefetcht0
+#define PRESIZE 64
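+# PRESIZE is the software prefetch distance, in elements, ahead of ptrba.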
+
+#define xvec0 %xmm0
+#define xvec1 %xmm1
+#define xvec2 %xmm2
+#define xvec3 %xmm3
+#define xvec4 %xmm4
+#define xvec5 %xmm5
+#define xvec6 %xmm6
+#define xvec7 %xmm7
+#define xvec8 %xmm8
+#define xvec9 %xmm9
+#define xvec10 %xmm10
+#define xvec11 %xmm11
+#define xvec12 %xmm12
+#define xvec13 %xmm13
+#define xvec14 %xmm14
+#define xvec15 %xmm15
+
+#define yvec0 %ymm0
+#define yvec1 %ymm1
+#define yvec2 %ymm2
+#define yvec3 %ymm3
+#define yvec4 %ymm4
+#define yvec5 %ymm5
+#define yvec6 %ymm6
+#define yvec7 %ymm7
+#define yvec8 %ymm8
+#define yvec9 %ymm9
+#define yvec10 %ymm10
+#define yvec11 %ymm11
+#define yvec12 %ymm12
+#define yvec13 %ymm13
+#define yvec14 %ymm14
+#define yvec15 %ymm15
+
+#define LEAQ leaq
+#define ADDQ addq
+#define MULQ imulq
+#define SARQ sarq
+#define SALQ salq
+#define ANDQ andq
+#define SUBQ subq
+#define DECQ decq
+#define JG jg
+#define JLE jle
+#define TEST testq
+#define OR orq
+#define JNE jne
+#define JMP jmp
+#define NOP
+#define XOR xorpd
+
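+# SIMD mnemonic aliases: the *_SX/*_DX names are 128-bit SSE single/double
+# precision ops, the *_SY/*_DY names their VEX/AVX (mostly 256-bit) forms.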
+#define XOR_SY vxorps
+#define XOR_DY vxorpd
+#define XOR_SX xorps
+#define XOR_DX xorpd
+
+#define LD_SY vmovaps
+#define LD_DY vmovapd
+#define LD_SX movaps
+#define LD_DX movapd
+#define LDL_DY vmovlpd
+#define LDL_DX movlpd
+#define LDH_DY vmovhpd
+#define LDH_DX movhpd
+
+#define ST_SY vmovaps
+#define ST_DY vmovapd
+#define ST_SX movaps
+#define ST_DX movapd
+#define STL_DY vmovlpd
+#define STL_DX movlpd
+#define STH_DY vmovhpd
+#define STH_DX movhpd
+
+#define EDUP_SY vmovsldup
+#define ODUP_SY vmovshdup
+#define EDUP_SX movsldup
+#define ODUP_SX movshdup
+#define EDUP_DY vmovddup
+
+#define ADD_SY vaddps
+#define ADD_DY vaddpd
+#define ADD_SX addps
+#define ADD_DX addpd
+#define SUB_DY vsubpd
+#define SUB_DX subpd
+
+#define ADDSUB_DY vaddsubpd
+#define ADDSUB_DX addsubpd
+#define ADDSUB_SY vaddsubps
+
+#define MUL_SY vmulps
+#define MUL_DY vmulpd
+#define MUL_SX mulps
+#define MUL_DX mulpd
+
+#define SHUF_SY vperm2f128
+#define SHUF_DY vperm2f128
+#define SHUF_DX pshufd
+#define SHUF_SX pshufd
+
+#define VPERMILP_SY vpermilps
+#define VPERMILP_SX vpermilps
+#define VPERMILP_DY vpermilpd
+
+#define BROAD_SY vbroadcastss
+#define BROAD_DY vbroadcastsd
+#define BROAD_SX vbroadcastss
+#define BROAD_DX movddup
+
+#define MOV_SY vmovaps
+#define MOV_DY vmovapd
+#define MOV_SX movaps
+#define MOV_DX movapd
+
+#define REVS_SY vshufps
+#define REVS_DY vshufpd
+#define REVS_SX shufps
+#define REVS_DX movsd
+
+#define EXTRA_DY vextractf128
+
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+#define ADD1_DX ADD_DX
+#define ADD1_DY ADD_DY
+#define ADD2_DY ADDSUB_DY
+#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
+#define ADD1_DX SUB_DX
+#define ADD1_DY SUB_DY
+#define ADD2_DY ADDSUB_DY
+#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
+#define ADD1_DX SUB_DX
+#define ADD1_DY SUB_DY
+#define ADD2_DY ADDSUB_DY
+#else
+#define ADD1_DX ADD_DX
+#define ADD1_DY ADD_DY
+#define ADD2_DY ADDSUB_DY
+#endif
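+# ADD1 accumulates the products taken with the duplicated real parts of B,
+# while ADD2 (vaddsubpd) folds in the products taken with the imaginary
+# parts, leaving interleaved (real, imag) partial sums in each accumulator;
+# whether ADD1 adds or subtracts selects the conjugation variant.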
+
+PROLOGUE
+
+subq $STACKSIZE, %rsp;
+movq %rbx, 0(%rsp);
+movq %rbp, 8(%rsp);
+movq %r12, 16(%rsp);
+movq %r13, 24(%rsp);
+movq %r14, 32(%rsp);
+movq %r15, 40(%rsp);
+
+#ifdef WINDOWS_ABI
+ movq %rdi, 48(%rsp)
+ movq %rsi, 56(%rsp)
+ movups %xmm6, 64(%rsp)
+ movups %xmm7, 80(%rsp)
+ movups %xmm8, 96(%rsp)
+ movups %xmm9, 112(%rsp)
+ movups %xmm10, 128(%rsp)
+ movups %xmm11, 144(%rsp)
+ movups %xmm12, 160(%rsp)
+ movups %xmm13, 176(%rsp)
+ movups %xmm14, 192(%rsp)
+ movups %xmm15, 208(%rsp)
+
+ movq ARG1, old_bm
+ movq ARG2, old_bn
+ movq ARG3, old_bk
+ movq OLD_A, ba
+ movq OLD_B, bb
+ movq OLD_C, C
+ movq old_ldc, ldc
+#ifdef TRMMKERNEL
+ movq old_offset, %r11
+#endif
+ movaps %xmm3, %xmm0
+ movsd OLD_ALPHA_I, %xmm1
+#else
+
+movq old_ldc, ldc
+#ifdef TRMMKERNEL
+movq old_offset, %r11;
+#endif
+#endif
+
+vmovlps %xmm0, MEMALPHA_R
+vmovlps %xmm1, MEMALPHA_I
+movq old_bm, bm
+movq old_bn, bn
+movq old_bk, bk
+salq $ZBASE_SHIFT, ldc
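+# ldc is scaled from complex elements to bytes (1 << ZBASE_SHIFT bytes each)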
+#ifdef TRMMKERNEL
+movq %r11, OFFSET
+#ifndef LEFT
+negq %r11;
+#endif
+movq %r11, kk;
+#endif
+
+MOVQ bn,j;
+SARQ $2,j; # Rn = 4
+JLE .L0_loopE;
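+# j counts 4-column panels of C; narrower column remainders are handled
+# after .L0_loopE.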
+.align 32;
+.L0_bodyB:;
+#if defined(TRMMKERNEL) && defined(LEFT)
+MOVQ OFFSET, %rax;
+MOVQ %rax, kk;
+#endif
+MOVQ C,C0;
+LEAQ (C,ldc,2),C1;
+MOVQ bk, k;
+SALQ $6, k;
+LEAQ (bb, k, 1), prebb; # Rn=4 SIZE=8 COMPLEX=2
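+# prebb starts bk*64 bytes past bb, i.e. just past the packed B panel for
+# these four columns, so the prefetches below appear to warm the next panel.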
+MOVQ ba,ptrba;
+MOVQ bm,i;
+SARQ $2,i; # Rm = 4
+JLE .L1_loopE;
+.align 32;
+.L1_bodyB:;
+#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bb,ptrbb;
+#else
+MOVQ bb, ptrbb;
+MOVQ kk, %rax;
+SALQ $ZBASE_SHIFT, %rax;
+LEAQ (ptrba, %rax, 4), ptrba;
+LEAQ (ptrbb, %rax, 4), ptrbb;
+#endif
+
+PREFETCH0 0*SIZE(prebb);
+PREFETCH0 8*SIZE(prebb);
+PREFETCH0 16*SIZE(prebb);
+ADDQ $24*SIZE, prebb;
+# Initial Results Register
+XOR_DY yvec15, yvec15, yvec15;
+XOR_DY yvec14, yvec14, yvec14;
+EDUP_DY 0*SIZE(ptrbb), yvec2; # Br1, Br1, Br2, Br2
+XOR_DY yvec13, yvec13, yvec13;
+XOR_DY yvec12, yvec12, yvec12;
+EDUP_DY 4*SIZE(ptrbb), yvec3; # Br3, Br3, Br4, Br4
+PREFETCH2 3*SIZE(C0);
+PREFETCH2 3*SIZE(C1);
+XOR_DY yvec11, yvec11, yvec11;
+XOR_DY yvec10, yvec10, yvec10;
+LD_DY 0*SIZE(ptrba), yvec0; # Ar1, Ai1, Ar2, Ai2
+PREFETCH2 7*SIZE(C0, ldc, 1);
+PREFETCH2 7*SIZE(C1, ldc, 1);
+XOR_DY yvec9, yvec9, yvec9;
+XOR_DY yvec8, yvec8, yvec8;
+#ifndef TRMMKERNEL
+MOVQ bk,k;
+#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kk, %rax;
+MOVQ %rax, kkk;
+#else
+MOVQ kk, %rax;
+#ifdef LEFT
+ADDQ $4, %rax;
+#else
+ADDQ $4, %rax;
+#endif
+MOVQ %rax, kkk;
+#endif
+SARQ $2,k; # Unroll 4 times
+JLE .L2_loopE;
+.align 32;
+.L2_bodyB:;
+#### Computing kernel ####
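+# Each unroll step below handles one k iteration of the 4x4 complex block:
+# vmovddup at even offsets of ptrbb gives duplicated B real parts (yvec2,
+# yvec3), the same loads at odd offsets give the imaginary parts, and A is
+# used as loaded for the ADD1 pass and with (r, i) swapped (vpermilpd $0x05)
+# for the ADD2 pass.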
+
+#### Unroll time 1 ####
+LD_DY 4*SIZE(ptrba), yvec1;
+MUL_DY yvec0, yvec2, yvec6;
+SHUF_DY $0x03, yvec2, yvec2, yvec4; # Br2, Br2, Br1, Br1
+MUL_DY yvec0, yvec3, yvec7;
+SHUF_DY $0x03, yvec3, yvec3, yvec5; # Br4, Br4, Br3, Br3
+ADD1_DY yvec6, yvec15, yvec15;
+ADD1_DY yvec7, yvec11, yvec11;
+
+PREFETCH0 PRESIZE*SIZE(ptrba);
+MUL_DY yvec1, yvec2, yvec6;
+EDUP_DY 1*SIZE(ptrbb), yvec2; # Bi1, Bi1, Bi2, Bi2
+MUL_DY yvec1, yvec3, yvec7;
+EDUP_DY 5*SIZE(ptrbb), yvec3; # Bi3, Bi3, Bi4, Bi4
+ADD1_DY yvec6, yvec14, yvec14;
+ADD1_DY yvec7, yvec10, yvec10;
+
+MUL_DY yvec0, yvec4, yvec6;
+MUL_DY yvec0, yvec5, yvec7;
+VPERMILP_DY $0x05, yvec0, yvec0; # Ai1, Ar1, Ai2, Ar2
+ADD1_DY yvec6, yvec13, yvec13;
+ADD1_DY yvec7, yvec9, yvec9;
+
+MUL_DY yvec1, yvec4, yvec6;
+SHUF_DY $0x03, yvec2, yvec2, yvec4; # Bi2, Bi2, Bi1, Bi1
+MUL_DY yvec1, yvec5, yvec7;
+SHUF_DY $0x03, yvec3, yvec3, yvec5; # Bi4, Bi4, Bi3, Bi3
+ADD1_DY yvec6, yvec12, yvec12;
+ADD1_DY yvec7, yvec8, yvec8;
+
+VPERMILP_DY $0x05, yvec1, yvec1; # Ai3, Ar3, Ai4, Ar4
+MUL_DY yvec0, yvec2, yvec6;
+MUL_DY yvec0, yvec3, yvec7;
+ADD2_DY yvec6, yvec15, yvec15;
+ADD2_DY yvec7, yvec11, yvec11;
+
+MUL_DY yvec1, yvec2, yvec6;
+EDUP_DY 8*SIZE(ptrbb), yvec2;
+MUL_DY yvec1, yvec3, yvec7;
+EDUP_DY 12*SIZE(ptrbb), yvec3;
+ADD2_DY yvec6, yvec14, yvec14;
+ADD2_DY yvec7, yvec10, yvec10;
+
+MUL_DY yvec0, yvec4, yvec6;
+MUL_DY yvec0, yvec5, yvec7;
+LD_DY 8*SIZE(ptrba), yvec0;
+ADD2_DY yvec6, yvec13, yvec13;
+ADD2_DY yvec7, yvec9, yvec9;
+
+MUL_DY yvec1, yvec4, yvec6;
+MUL_DY yvec1, yvec5, yvec7;
+ADD2_DY yvec6, yvec12, yvec12;
+ADD2_DY yvec7, yvec8, yvec8;
+
+#### Unroll time 2 ####
+LD_DY 12*SIZE(ptrba), yvec1;
+MUL_DY yvec0, yvec2, yvec6;
+SHUF_DY $0x03, yvec2, yvec2, yvec4;
+MUL_DY yvec0, yvec3, yvec7;
+SHUF_DY $0x03, yvec3, yvec3, yvec5; # Br4, Br4, Br3, Br3
+ADD1_DY yvec6, yvec15, yvec15;
+ADD1_DY yvec7, yvec11, yvec11;
+
+PREFETCH0 (PRESIZE+8)*SIZE(ptrba);
+MUL_DY yvec1, yvec2, yvec6;
+EDUP_DY 9*SIZE(ptrbb), yvec2; # Bi1, Bi1, Bi2, Bi2
+MUL_DY yvec1, yvec3, yvec7;
+EDUP_DY 13*SIZE(ptrbb), yvec3; # Bi3, Bi3, Bi4, Bi4
+ADD1_DY yvec6, yvec14, yvec14;
+ADD1_DY yvec7, yvec10, yvec10;
+
+MUL_DY yvec0, yvec4, yvec6;
+MUL_DY yvec0, yvec5, yvec7;
+VPERMILP_DY $0x05, yvec0, yvec0; # Ai1, Ar1, Ai2, Ar2
+ADD1_DY yvec6, yvec13, yvec13;
+ADD1_DY yvec7, yvec9, yvec9;
+
+MUL_DY yvec1, yvec4, yvec6;
+SHUF_DY $0x03, yvec2, yvec2, yvec4; # Bi2, Bi2, Bi1, Bi1
+MUL_DY yvec1, yvec5, yvec7;
+SHUF_DY $0x03, yvec3, yvec3, yvec5; # Bi4, Bi4, Bi3, Bi3
+ADD1_DY yvec6, yvec12, yvec12;
+ADD1_DY yvec7, yvec8, yvec8;
+
+VPERMILP_DY $0x05, yvec1, yvec1; # Ai3, Ar3, Ai4, Ar4
+MUL_DY yvec0, yvec2, yvec6;
+MUL_DY yvec0, yvec3, yvec7;
+ADD2_DY yvec6, yvec15, yvec15;
+ADD2_DY yvec7, yvec11, yvec11;
+
+MUL_DY yvec1, yvec2, yvec6;
+EDUP_DY 16*SIZE(ptrbb), yvec2;
+MUL_DY yvec1, yvec3, yvec7;
+EDUP_DY 20*SIZE(ptrbb), yvec3;
+ADD2_DY yvec6, yvec14, yvec14;
+ADD2_DY yvec7, yvec10, yvec10;
+
+MUL_DY yvec0, yvec4, yvec6;
+MUL_DY yvec0, yvec5, yvec7;
+LD_DY 16*SIZE(ptrba), yvec0;
+ADD2_DY yvec6, yvec13, yvec13;
+ADD2_DY yvec7, yvec9, yvec9;
+
+MUL_DY yvec1, yvec4, yvec6;
+MUL_DY yvec1, yvec5, yvec7;
+ADD2_DY yvec6, yvec12, yvec12;
+ADD2_DY yvec7, yvec8, yvec8;
+
+#### Unroll time 3 ####
+LD_DY 20*SIZE(ptrba), yvec1;
+MUL_DY yvec0, yvec2, yvec6;
+SHUF_DY $0x03, yvec2, yvec2, yvec4;
+MUL_DY yvec0, yvec3, yvec7;
+SHUF_DY $0x03, yvec3, yvec3, yvec5; # Br4, Br4, Br3, Br3
+ADD1_DY yvec6, yvec15, yvec15;
+ADD1_DY yvec7, yvec11, yvec11;
+
+PREFETCH0 (PRESIZE+16)*SIZE(ptrba);
+MUL_DY yvec1, yvec2, yvec6;
+EDUP_DY 17*SIZE(ptrbb), yvec2; # Bi1, Bi1, Bi2, Bi2
+MUL_DY yvec1, yvec3, yvec7;
+EDUP_DY 21*SIZE(ptrbb), yvec3; # Bi3, Bi3, Bi4, Bi4
+ADD1_DY yvec6, yvec14, yvec14;
+ADD1_DY yvec7, yvec10, yvec10;
+
+MUL_DY yvec0, yvec4, yvec6;
+MUL_DY yvec0, yvec5, yvec7;
+VPERMILP_DY $0x05, yvec0, yvec0; # Ai1, Ar1, Ai2, Ar2
+ADD1_DY yvec6, yvec13, yvec13;
+ADD1_DY yvec7, yvec9, yvec9;
+
+MUL_DY yvec1, yvec4, yvec6;
+SHUF_DY $0x03, yvec2, yvec2, yvec4; # Bi2, Bi2, Bi1, Bi1
+MUL_DY yvec1, yvec5, yvec7;
+SHUF_DY $0x03, yvec3, yvec3, yvec5; # Bi4, Bi4, Bi3, Bi3
+ADD1_DY yvec6, yvec12, yvec12;
+ADD1_DY yvec7, yvec8, yvec8;
+
+VPERMILP_DY $0x05, yvec1, yvec1; # Ai3, Ar3, Ai4, Ar4
+MUL_DY yvec0, yvec2, yvec6;
+MUL_DY yvec0, yvec3, yvec7;
+ADD2_DY yvec6, yvec15, yvec15;
+ADD2_DY yvec7, yvec11, yvec11;
+
+MUL_DY yvec1, yvec2, yvec6;
+EDUP_DY 24*SIZE(ptrbb), yvec2;
+MUL_DY yvec1, yvec3, yvec7;
+EDUP_DY 28*SIZE(ptrbb), yvec3;
+ADD2_DY yvec6, yvec14, yvec14;
+ADD2_DY yvec7, yvec10, yvec10;
+
+MUL_DY yvec0, yvec4, yvec6;
+MUL_DY yvec0, yvec5, yvec7;
+LD_DY 24*SIZE(ptrba), yvec0;
+ADD2_DY yvec6, yvec13, yvec13;
+ADD2_DY yvec7, yvec9, yvec9;
+
+MUL_DY yvec1, yvec4, yvec6;
+MUL_DY yvec1, yvec5, yvec7;
+ADD2_DY yvec6, yvec12, yvec12;
+ADD2_DY yvec7, yvec8, yvec8;
+
+#### Unroll time 4 ####
+LD_DY 28*SIZE(ptrba), yvec1;
+MUL_DY yvec0, yvec2, yvec6;
+SHUF_DY $0x03, yvec2, yvec2, yvec4;
+MUL_DY yvec0, yvec3, yvec7;
+SHUF_DY $0x03, yvec3, yvec3, yvec5; # Br4, Br4, Br3, Br3
+ADDQ $32*SIZE, ptrba;
+ADD1_DY yvec6, yvec15, yvec15;
+ADD1_DY yvec7, yvec11, yvec11;
+
+PREFETCH0 (PRESIZE+24)*SIZE(ptrba);
+MUL_DY yvec1, yvec2, yvec6;
+EDUP_DY 25*SIZE(ptrbb), yvec2; # Bi1, Bi1, Bi2, Bi2
+MUL_DY yvec1, yvec3, yvec7;
+EDUP_DY 29*SIZE(ptrbb), yvec3; # Bi3, Bi3, Bi4, Bi4
+ADD1_DY yvec6, yvec14, yvec14;
+ADD1_DY yvec7, yvec10, yvec10;
+
+MUL_DY yvec0, yvec4, yvec6;
+MUL_DY yvec0, yvec5, yvec7;
+VPERMILP_DY $0x05, yvec0, yvec0; # Ai1, Ar1, Ai2, Ar2
+ADDQ $32*SIZE, ptrbb;
+ADD1_DY yvec6, yvec13, yvec13;
+ADD1_DY yvec7, yvec9, yvec9;
+
+MUL_DY yvec1, yvec4, yvec6;
+SHUF_DY $0x03, yvec2, yvec2, yvec4; # Bi2, Bi2, Bi1, Bi1
+MUL_DY yvec1, yvec5, yvec7;
+SHUF_DY $0x03, yvec3, yvec3, yvec5; # Bi4, Bi4, Bi3, Bi3
+ADD1_DY yvec6, yvec12, yvec12;
+ADD1_DY yvec7, yvec8, yvec8;
+
+VPERMILP_DY $0x05, yvec1, yvec1; # Ai3, Ar3, Ai4, Ar4
+MUL_DY yvec0, yvec2, yvec6;
+MUL_DY yvec0, yvec3, yvec7;
+ADD2_DY yvec6, yvec15, yvec15;
+ADD2_DY yvec7, yvec11, yvec11;
+
+MUL_DY yvec1, yvec2, yvec6;
+EDUP_DY 0*SIZE(ptrbb), yvec2;
+MUL_DY yvec1, yvec3, yvec7;
+EDUP_DY 4*SIZE(ptrbb), yvec3;
+ADD2_DY yvec6, yvec14, yvec14;
+ADD2_DY yvec7, yvec10, yvec10;
+
+MUL_DY yvec0, yvec4, yvec6;
+MUL_DY yvec0, yvec5, yvec7;
+LD_DY 0*SIZE(ptrba), yvec0;
+ADD2_DY yvec6, yvec13, yvec13;
+ADD2_DY yvec7, yvec9, yvec9;
+
+MUL_DY yvec1, yvec4, yvec6;
+MUL_DY yvec1, yvec5, yvec7;
+ADD2_DY yvec6, yvec12, yvec12;
+ADD2_DY yvec7, yvec8, yvec8;
+DECQ k;
+JG .L2_bodyB;
+.align 64;
+.L2_loopE:;
+#ifndef TRMMKERNEL
+TEST $2, bk;
+#else
+TEST $2, kkk;
+#endif
+JLE .L3_loopE;
+.align 64
+.L3_bodyB:
+#### Unroll time 1 ####
+LD_DY 4*SIZE(ptrba), yvec1;
+MUL_DY yvec0, yvec2, yvec6;
+SHUF_DY $0x03, yvec2, yvec2, yvec4; # Br2, Br2, Br1, Br1
+MUL_DY yvec0, yvec3, yvec7;
+SHUF_DY $0x03, yvec3, yvec3, yvec5; # Br4, Br4, Br3, Br3
+ADD1_DY yvec6, yvec15, yvec15;
+ADD1_DY yvec7, yvec11, yvec11;
+
+PREFETCH0 PRESIZE*SIZE(ptrba);
+MUL_DY yvec1, yvec2, yvec6;
+EDUP_DY 1*SIZE(ptrbb), yvec2; # Bi1, Bi1, Bi2, Bi2
+MUL_DY yvec1, yvec3, yvec7;
+EDUP_DY 5*SIZE(ptrbb), yvec3; # Bi3, Bi3, Bi4, Bi4
+ADD1_DY yvec6, yvec14, yvec14;
+ADD1_DY yvec7, yvec10, yvec10;
+
+MUL_DY yvec0, yvec4, yvec6;
+MUL_DY yvec0, yvec5, yvec7;
+VPERMILP_DY $0x05, yvec0, yvec0; # Ai1, Ar1, Ai2, Ar2
+ADD1_DY yvec6, yvec13, yvec13;
+ADD1_DY yvec7, yvec9, yvec9;
+
+MUL_DY yvec1, yvec4, yvec6;
+SHUF_DY $0x03, yvec2, yvec2, yvec4; # Bi2, Bi2, Bi1, Bi1
+MUL_DY yvec1, yvec5, yvec7;
+SHUF_DY $0x03, yvec3, yvec3, yvec5; # Bi4, Bi4, Bi3, Bi3
+ADD1_DY yvec6, yvec12, yvec12;
+ADD1_DY yvec7, yvec8, yvec8;
+
+VPERMILP_DY $0x05, yvec1, yvec1; # Ai3, Ar3, Ai4, Ar4
+MUL_DY yvec0, yvec2, yvec6;
+MUL_DY yvec0, yvec3, yvec7;
+ADD2_DY yvec6, yvec15, yvec15;
+ADD2_DY yvec7, yvec11, yvec11;
+
+MUL_DY yvec1, yvec2, yvec6;
+EDUP_DY 8*SIZE(ptrbb), yvec2;
+MUL_DY yvec1, yvec3, yvec7;
+EDUP_DY 12*SIZE(ptrbb), yvec3;
+ADD2_DY yvec6, yvec14, yvec14;
+ADD2_DY yvec7, yvec10, yvec10;
+
+MUL_DY yvec0, yvec4, yvec6;
+MUL_DY yvec0, yvec5, yvec7;
+LD_DY 8*SIZE(ptrba), yvec0;
+ADD2_DY yvec6, yvec13, yvec13;
+ADD2_DY yvec7, yvec9, yvec9;
+
+MUL_DY yvec1, yvec4, yvec6;
+MUL_DY yvec1, yvec5, yvec7;
+ADD2_DY yvec6, yvec12, yvec12;
+ADD2_DY yvec7, yvec8, yvec8;
+
+#### Unroll time 2 ####
+LD_DY 12*SIZE(ptrba), yvec1;
+MUL_DY yvec0, yvec2, yvec6;
+SHUF_DY $0x03, yvec2, yvec2, yvec4;
+MUL_DY yvec0, yvec3, yvec7;
+SHUF_DY $0x03, yvec3, yvec3, yvec5; # Br4, Br4, Br3, Br3
+ADDQ $16*SIZE, ptrba;
+ADD1_DY yvec6, yvec15, yvec15;
+ADD1_DY yvec7, yvec11, yvec11;
+
+PREFETCH0 (PRESIZE+8)*SIZE(ptrba);
+MUL_DY yvec1, yvec2, yvec6;
+EDUP_DY 9*SIZE(ptrbb), yvec2; # Bi1, Bi1, Bi2, Bi2
+MUL_DY yvec1, yvec3, yvec7;
+EDUP_DY 13*SIZE(ptrbb), yvec3; # Bi3, Bi3, Bi4, Bi4
+ADD1_DY yvec6, yvec14, yvec14;
+ADD1_DY yvec7, yvec10, yvec10;
+
+MUL_DY yvec0, yvec4, yvec6;
+MUL_DY yvec0, yvec5, yvec7;
+VPERMILP_DY $0x05, yvec0, yvec0; # Ai1, Ar1, Ai2, Ar2
+ADDQ $16*SIZE, ptrbb;
+ADD1_DY yvec6, yvec13, yvec13;
+ADD1_DY yvec7, yvec9, yvec9;
+
+MUL_DY yvec1, yvec4, yvec6;
+SHUF_DY $0x03, yvec2, yvec2, yvec4; # Bi2, Bi2, Bi1, Bi1
+MUL_DY yvec1, yvec5, yvec7;
+SHUF_DY $0x03, yvec3, yvec3, yvec5; # Bi4, Bi4, Bi3, Bi3
+ADD1_DY yvec6, yvec12, yvec12;
+ADD1_DY yvec7, yvec8, yvec8;
+
+VPERMILP_DY $0x05, yvec1, yvec1; # Ai3, Ar3, Ai4, Ar4
+MUL_DY yvec0, yvec2, yvec6;
+MUL_DY yvec0, yvec3, yvec7;
+ADD2_DY yvec6, yvec15, yvec15;
+ADD2_DY yvec7, yvec11, yvec11;
+
+MUL_DY yvec1, yvec2, yvec6;
+EDUP_DY 0*SIZE(ptrbb), yvec2;
+MUL_DY yvec1, yvec3, yvec7;
+EDUP_DY 4*SIZE(ptrbb), yvec3;
+ADD2_DY yvec6, yvec14, yvec14;
+ADD2_DY yvec7, yvec10, yvec10;
+
+MUL_DY yvec0, yvec4, yvec6;
+MUL_DY yvec0, yvec5, yvec7;
+LD_DY 0*SIZE(ptrba), yvec0;
+ADD2_DY yvec6, yvec13, yvec13;
+ADD2_DY yvec7, yvec9, yvec9;
+
+MUL_DY yvec1, yvec4, yvec6;
+MUL_DY yvec1, yvec5, yvec7;
+ADD2_DY yvec6, yvec12, yvec12;
+ADD2_DY yvec7, yvec8, yvec8;
+.L3_loopE:;
+#ifndef TRMMKERNEL
+TEST $1, bk;
+#else
+TEST $1, kkk;
+#endif
+JLE .L4_loopE;
+.align 64
+.L4_loopB:;
+#### Unroll time 1 ####
+PREFETCH0 PRESIZE*SIZE(ptrba);
+LD_DY 4*SIZE(ptrba), yvec1;
+MUL_DY yvec0, yvec2, yvec6;
+SHUF_DY $0x03, yvec2, yvec2, yvec4;
+MUL_DY yvec0, yvec3, yvec7;
+SHUF_DY $0x03, yvec3, yvec3, yvec5; # Br4, Br4, Br3, Br3
+ADDQ $8*SIZE, ptrba;
+ADD1_DY yvec6, yvec15, yvec15;
+ADD1_DY yvec7, yvec11, yvec11;
+
+MUL_DY yvec1, yvec2, yvec6;
+EDUP_DY 1*SIZE(ptrbb), yvec2; # Bi1, Bi1, Bi2, Bi2
+MUL_DY yvec1, yvec3, yvec7;
+EDUP_DY 5*SIZE(ptrbb), yvec3; # Bi3, Bi3, Bi4, Bi4
+ADD1_DY yvec6, yvec14, yvec14;
+ADD1_DY yvec7, yvec10, yvec10;
+
+MUL_DY yvec0, yvec4, yvec6;
+MUL_DY yvec0, yvec5, yvec7;
+VPERMILP_DY $0x05, yvec0, yvec0; # Ai1, Ar1, Ai2, Ar2
+ADDQ $8*SIZE, ptrbb;
+ADD1_DY yvec6, yvec13, yvec13;
+ADD1_DY yvec7, yvec9, yvec9;
+
+MUL_DY yvec1, yvec4, yvec6;
+SHUF_DY $0x03, yvec2, yvec2, yvec4; # Bi2, Bi2, Bi1, Bi1
+MUL_DY yvec1, yvec5, yvec7;
+SHUF_DY $0x03, yvec3, yvec3, yvec5; # Bi4, Bi4, Bi3, Bi3
+ADD1_DY yvec6, yvec12, yvec12;
+ADD1_DY yvec7, yvec8, yvec8;
+
+VPERMILP_DY $0x05, yvec1, yvec1; # Ai3, Ar3, Ai4, Ar4
+MUL_DY yvec0, yvec2, yvec6;
+MUL_DY yvec0, yvec3, yvec7;
+ADD2_DY yvec6, yvec15, yvec15;
+ADD2_DY yvec7, yvec11, yvec11;
+
+MUL_DY yvec1, yvec2, yvec6;
+MUL_DY yvec1, yvec3, yvec7;
+ADD2_DY yvec6, yvec14, yvec14;
+ADD2_DY yvec7, yvec10, yvec10;
+
+MUL_DY yvec0, yvec4, yvec6;
+MUL_DY yvec0, yvec5, yvec7;
+ADD2_DY yvec6, yvec13, yvec13;
+ADD2_DY yvec7, yvec9, yvec9;
+
+MUL_DY yvec1, yvec4, yvec6;
+MUL_DY yvec1, yvec5, yvec7;
+ADD2_DY yvec6, yvec12, yvec12;
+ADD2_DY yvec7, yvec8, yvec8;
+.L4_loopE:;
+#### Handle ####
+XOR_DY yvec7, yvec7, yvec7;
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ADDSUB_DY yvec15, yvec7, yvec15;
+ADDSUB_DY yvec14, yvec7, yvec14;
+ADDSUB_DY yvec13, yvec7, yvec13;
+ADDSUB_DY yvec12, yvec7, yvec12;
+ADDSUB_DY yvec11, yvec7, yvec11;
+ADDSUB_DY yvec10, yvec7, yvec10;
+ADDSUB_DY yvec9, yvec7, yvec9;
+ADDSUB_DY yvec8, yvec7, yvec8;
+#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
+SUB_DY yvec15, yvec7, yvec15;
+SUB_DY yvec14, yvec7, yvec14;
+SUB_DY yvec13, yvec7, yvec13;
+SUB_DY yvec12, yvec7, yvec12;
+SUB_DY yvec11, yvec7, yvec11;
+SUB_DY yvec10, yvec7, yvec10;
+SUB_DY yvec9, yvec7, yvec9;
+SUB_DY yvec8, yvec7, yvec8;
+#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
+VPERMILP_DY $0x05, yvec15, yvec15;
+VPERMILP_DY $0x05, yvec14, yvec14;
+VPERMILP_DY $0x05, yvec13, yvec13;
+VPERMILP_DY $0x05, yvec12, yvec12;
+VPERMILP_DY $0x05, yvec11, yvec11;
+VPERMILP_DY $0x05, yvec10, yvec10;
+VPERMILP_DY $0x05, yvec9, yvec9;
+VPERMILP_DY $0x05, yvec8, yvec8;
+ADDSUB_DY yvec15, yvec7, yvec15;
+ADDSUB_DY yvec14, yvec7, yvec14;
+ADDSUB_DY yvec13, yvec7, yvec13;
+ADDSUB_DY yvec12, yvec7, yvec12;
+ADDSUB_DY yvec11, yvec7, yvec11;
+ADDSUB_DY yvec10, yvec7, yvec10;
+ADDSUB_DY yvec9, yvec7, yvec9;
+ADDSUB_DY yvec8, yvec7, yvec8;
+VPERMILP_DY $0x05, yvec15, yvec15;
+VPERMILP_DY $0x05, yvec14, yvec14;
+VPERMILP_DY $0x05, yvec13, yvec13;
+VPERMILP_DY $0x05, yvec12, yvec12;
+VPERMILP_DY $0x05, yvec11, yvec11;
+VPERMILP_DY $0x05, yvec10, yvec10;
+VPERMILP_DY $0x05, yvec9, yvec9;
+VPERMILP_DY $0x05, yvec8, yvec8;
+#endif
+#### Load Alpha ####
+BROAD_DY MEMALPHA_R,yvec7;
+BROAD_DY MEMALPHA_I,yvec6;
+#### Multiply Alpha ####
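+# For each accumulator y = (r, i, ...): y*alpha_r combined with
+# swap(y)*alpha_i via vaddsubpd yields (r*alpha_r - i*alpha_i,
+# i*alpha_r + r*alpha_i), i.e. the complex product with alpha.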
+VPERMILP_DY $0x05, yvec15, yvec5;
+MUL_DY yvec7, yvec15, yvec15;
+MUL_DY yvec6, yvec5, yvec5;
+ADDSUB_DY yvec5, yvec15, yvec15;
+VPERMILP_DY $0x05, yvec14, yvec4;
+MUL_DY yvec7, yvec14, yvec14;
+MUL_DY yvec6, yvec4, yvec4;
+ADDSUB_DY yvec4, yvec14, yvec14;
+VPERMILP_DY $0x05, yvec13, yvec3;
+MUL_DY yvec7, yvec13, yvec13;
+MUL_DY yvec6, yvec3, yvec3;
+ADDSUB_DY yvec3, yvec13, yvec13;
+VPERMILP_DY $0x05,yvec12, yvec2;
+MUL_DY yvec7, yvec12, yvec12;
+MUL_DY yvec6, yvec2, yvec2;
+ADDSUB_DY yvec2, yvec12, yvec12;
+VPERMILP_DY $0x05, yvec11, yvec1;
+MUL_DY yvec7, yvec11, yvec11;
+MUL_DY yvec6, yvec1, yvec1;
+ADDSUB_DY yvec1, yvec11, yvec11;
+VPERMILP_DY $0x05,yvec10, yvec0;
+MUL_DY yvec7, yvec10, yvec10;
+MUL_DY yvec6, yvec0, yvec0;
+ADDSUB_DY yvec0, yvec10, yvec10;
+VPERMILP_DY $0x05, yvec9, yvec5;
+MUL_DY yvec7, yvec9, yvec9;
+MUL_DY yvec6, yvec5, yvec5;
+ADDSUB_DY yvec5, yvec9, yvec9;
+VPERMILP_DY $0x05, yvec8, yvec4;
+MUL_DY yvec7, yvec8, yvec8;
+MUL_DY yvec6, yvec4, yvec4;
+ADDSUB_DY yvec4, yvec8, yvec8;
+#### Testing Alignment ####
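+# C0 | ldc tests whether both the C pointer and the byte stride are 16-byte
+# aligned; if not, branch to .L4_loopEx, which stores with vmovlpd/vmovhpd
+# instead of aligned vmovapd.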
+MOVQ C0, %rax;
+OR ldc, %rax;
+TEST $15, %rax;
+JNE .L4_loopEx;
+.align 32
+#### Store Back ####
+EXTRA_DY $1,yvec15,xvec7;
+EXTRA_DY $1,yvec14,xvec6;
+EXTRA_DY $1,yvec13,xvec5;
+EXTRA_DY $1,yvec12,xvec4;
+EXTRA_DY $1,yvec11,xvec3;
+EXTRA_DY $1,yvec10,xvec2;
+EXTRA_DY $1,yvec9,xvec1;
+EXTRA_DY $1,yvec8,xvec0;
+#ifndef TRMMKERNEL
+ADD_DY 0*SIZE(C0),xvec15, xvec15;
+ADD_DY 2*SIZE(C0,ldc,1), xvec7, xvec7;
+ADD_DY 4*SIZE(C0),xvec14, xvec14;
+ADD_DY 6*SIZE(C0,ldc,1),xvec6, xvec6;
+ADD_DY 0*SIZE(C0,ldc,1),xvec13, xvec13;
+ADD_DY 2*SIZE(C0),xvec5, xvec5;
+ADD_DY 4*SIZE(C0,ldc,1),xvec12, xvec12;
+ADD_DY 6*SIZE(C0),xvec4, xvec4;
+ADD_DY 0*SIZE(C1),xvec11, xvec11;
+ADD_DY 2*SIZE(C1,ldc,1),xvec3, xvec3;
+ADD_DY 4*SIZE(C1),xvec10, xvec10;
+ADD_DY 6*SIZE(C1,ldc,1),xvec2, xvec2;
+ADD_DY 0*SIZE(C1,ldc,1),xvec9, xvec9;
+ADD_DY 2*SIZE(C1),xvec1, xvec1;
+ADD_DY 4*SIZE(C1,ldc,1),xvec8, xvec8;
+ADD_DY 6*SIZE(C1),xvec0, xvec0;
+#endif
+ST_DY xvec15,0*SIZE(C0);
+ST_DY xvec7,2*SIZE(C0,ldc,1);
+ST_DY xvec14,4*SIZE(C0);
+ST_DY xvec6,6*SIZE(C0,ldc,1);
+ST_DY xvec13,0*SIZE(C0,ldc,1);
+ST_DY xvec5,2*SIZE(C0);
+ST_DY xvec12,4*SIZE(C0,ldc,1);
+ST_DY xvec4,6*SIZE(C0);
+ST_DY xvec11,0*SIZE(C1);
+ST_DY xvec3,2*SIZE(C1,ldc,1);
+ST_DY xvec10,4*SIZE(C1);
+ST_DY xvec2,6*SIZE(C1,ldc,1);
+ST_DY xvec9,0*SIZE(C1,ldc,1);
+ST_DY xvec1,2*SIZE(C1);
+ST_DY xvec8,4*SIZE(C1,ldc,1);
+ST_DY xvec0,6*SIZE(C1);
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+SALQ $ZBASE_SHIFT, %rax;
+LEAQ (ptrba, %rax, 4), ptrba;
+LEAQ (ptrbb, %rax, 4), ptrbb;
+#endif
+#if defined(TRMMKERNEL) && defined(LEFT)
+ADDQ $4, kk;
+#endif
+ADDQ $8*SIZE,C0;
+ADDQ $8*SIZE,C1;
+.L1_bodyE:;
+DECQ i;
+JG .L1_bodyB;
+JMP .L1_loopE;
+.align 32
+.L4_loopEx:
+EXTRA_DY $1, yvec15, xvec7;
+EXTRA_DY $1, yvec14, xvec6;
+#ifndef TRMMKERNEL
+LDL_DY 0*SIZE(C0), xvec0, xvec0;
+LDH_DY 1*SIZE(C0), xvec0, xvec0;
+LDL_DY 2*SIZE(C0, ldc, 1), xvec1, xvec1;
+LDH_DY 3*SIZE(C0, ldc, 1), xvec1, xvec1;
+LDL_DY 4*SIZE(C0), xvec2, xvec2;
+LDH_DY 5*SIZE(C0), xvec2, xvec2;
+LDL_DY 6*SIZE(C0, ldc, 1), xvec3, xvec3;
+LDH_DY 7*SIZE(C0, ldc, 1), xvec3, xvec3;
+ADD_DY xvec0, xvec15, xvec15;
+ADD_DY xvec1, xvec7, xvec7;
+ADD_DY xvec2, xvec14, xvec14;
+ADD_DY xvec3, xvec6, xvec6;
+#endif
+STL_DY xvec15, 0*SIZE(C0);
+STH_DY xvec15, 1*SIZE(C0);
+STL_DY xvec7, 2*SIZE(C0, ldc, 1);
+STH_DY xvec7, 3*SIZE(C0, ldc, 1);
+STL_DY xvec14, 4*SIZE(C0);
+STH_DY xvec14, 5*SIZE(C0);
+STL_DY xvec6, 6*SIZE(C0, ldc, 1);
+STH_DY xvec6, 7*SIZE(C0, ldc, 1);
+EXTRA_DY $1, yvec13, xvec5;
+EXTRA_DY $1, yvec12, xvec4;
+#ifndef TRMMKERNEL
+LDL_DY 0*SIZE(C0, ldc, 1), xvec3, xvec3;
+LDH_DY 1*SIZE(C0, ldc, 1), xvec3, xvec3;
+LDL_DY 2*SIZE(C0), xvec2, xvec2;
+LDH_DY 3*SIZE(C0), xvec2, xvec2;
+LDL_DY 4*SIZE(C0, ldc, 1), xvec1, xvec1;
+LDH_DY 5*SIZE(C0, ldc, 1), xvec1, xvec1;
+LDL_DY 6*SIZE(C0), xvec0, xvec0;
+LDH_DY 7*SIZE(C0), xvec0, xvec0;
+ADD_DY xvec3, xvec13, xvec13;
+ADD_DY xvec2, xvec5, xvec5;
+ADD_DY xvec1, xvec12, xvec12;
+ADD_DY xvec0, xvec4, xvec4;
+#endif
+STL_DY xvec13, 0*SIZE(C0, ldc, 1);
+STH_DY xvec13, 1*SIZE(C0, ldc, 1);
+STL_DY xvec5, 2*SIZE(C0);
+STH_DY xvec5, 3*SIZE(C0);
+STL_DY xvec12, 4*SIZE(C0, ldc, 1);
+STH_DY xvec12, 5*SIZE(C0, ldc, 1);
+STL_DY xvec4, 6*SIZE(C0);
+STH_DY xvec4, 7*SIZE(C0);
+EXTRA_DY $1, yvec11, xvec3;
+EXTRA_DY $1, yvec10, xvec2;
+#ifndef TRMMKERNEL
+LDL_DY 0*SIZE(C1), xvec7, xvec7;
+LDH_DY 1*SIZE(C1), xvec7, xvec7;
+LDL_DY 2*SIZE(C1, ldc, 1), xvec6, xvec6;
+LDH_DY 3*SIZE(C1, ldc, 1), xvec6, xvec6;
+LDL_DY 4*SIZE(C1), xvec5, xvec5;
+LDH_DY 5*SIZE(C1), xvec5, xvec5;
+LDL_DY 6*SIZE(C1, ldc, 1), xvec4, xvec4;
+LDH_DY 7*SIZE(C1, ldc, 1), xvec4, xvec4;
+ADD_DY xvec7, xvec11, xvec11;
+ADD_DY xvec6, xvec3, xvec3;
+ADD_DY xvec5, xvec10, xvec10;
+ADD_DY xvec4, xvec2, xvec2;
+#endif
+STL_DY xvec11, 0*SIZE(C1);
+STH_DY xvec11, 1*SIZE(C1);
+STL_DY xvec3, 2*SIZE(C1, ldc, 1);
+STH_DY xvec3, 3*SIZE(C1, ldc, 1);
+STL_DY xvec10, 4*SIZE(C1);
+STH_DY xvec10, 5*SIZE(C1);
+STL_DY xvec2, 6*SIZE(C1, ldc, 1);
+STH_DY xvec2, 7*SIZE(C1, ldc, 1);
+EXTRA_DY $1, yvec9, xvec1;
+EXTRA_DY $1, yvec8, xvec0;
+#ifndef TRMMKERNEL
+LDL_DY 0*SIZE(C1, ldc, 1), xvec5, xvec5;
+LDH_DY 1*SIZE(C1, ldc, 1), xvec5, xvec5;
+LDL_DY 2*SIZE(C1), xvec4, xvec4;
+LDH_DY 3*SIZE(C1), xvec4, xvec4;
+LDL_DY 4*SIZE(C1, ldc, 1), xvec3, xvec3;
+LDH_DY 5*SIZE(C1, ldc, 1), xvec3, xvec3;
+LDL_DY 6*SIZE(C1), xvec2, xvec2;
+LDH_DY 7*SIZE(C1), xvec2, xvec2;
+ADD_DY xvec5, xvec9, xvec9;
+ADD_DY xvec4, xvec1, xvec1;
+ADD_DY xvec3, xvec8, xvec8;
+ADD_DY xvec2, xvec0, xvec0;
+#endif
+STL_DY xvec9, 0*SIZE(C1, ldc, 1);
+STH_DY xvec9, 1*SIZE(C1, ldc, 1);
+STL_DY xvec1, 2*SIZE(C1);
+STH_DY xvec1, 3*SIZE(C1);
+STL_DY xvec8, 4*SIZE(C1, ldc, 1);
+STH_DY xvec8, 5*SIZE(C1, ldc, 1);
+STL_DY xvec0, 6*SIZE(C1);
+STH_DY xvec0, 7*SIZE(C1);
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+SALQ $ZBASE_SHIFT, %rax;
+LEAQ (ptrba, %rax, 4), ptrba;
+LEAQ (ptrbb, %rax, 4), ptrbb;
+#endif
+#if defined(TRMMKERNEL) && defined(LEFT)
+ADDQ $4, kk;
+#endif
+ADDQ $8*SIZE, C0;
+ADDQ $8*SIZE, C1;
+DECQ i;
+JG .L1_bodyB;
+.align 32;
+.L1_loopE:;
+TEST $2, bm;
+JLE .L5_loopE;
+.align 32
+.L5_bodyB:
+#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bb,ptrbb;
+#else
+MOVQ bb, ptrbb;
+MOVQ kk, %rax;
+SALQ $ZBASE_SHIFT, %rax;
+LEAQ (ptrba, %rax, 2), ptrba;
+LEAQ (ptrbb, %rax, 4), ptrbb;
+#endif
+XOR_DY yvec15, yvec15, yvec15;
+XOR_DY yvec14, yvec14, yvec14;
+XOR_DY yvec13, yvec13, yvec13;
+XOR_DY yvec12, yvec12, yvec12;
+#ifndef TRMMKERNEL
+MOVQ bk,k;
+#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kk, %rax;
+MOVQ %rax, kkk;
+#else
+MOVQ kk, %rax;
+#ifdef LEFT
+ADDQ $2, %rax;
+#else
+ADDQ $4, %rax;
+#endif
+MOVQ %rax, kkk;
+#endif
+SARQ $2, k;
+JLE .L7_loopE;
+.align 32
+.L7_bodyB:
+#### Compute kernel ####
+#### Unroll times 1 ####
+LD_DY 0*SIZE(ptrba), yvec0;
+EDUP_DY 0*SIZE(ptrbb), yvec2;
+EDUP_DY 4*SIZE(ptrbb), yvec3;
+
+MUL_DY yvec0, yvec2, yvec6;
+ADD1_DY yvec6, yvec15, yvec15;
+SHUF_DY $0x03, yvec2, yvec2, yvec4;
+MUL_DY yvec0, yvec3, yvec7;
+ADD1_DY yvec7, yvec14, yvec14;
+SHUF_DY $0x03, yvec3, yvec3, yvec5;
+
+MUL_DY yvec0, yvec4, yvec6;
+ADD1_DY yvec6, yvec13, yvec13;
+EDUP_DY 1*SIZE(ptrbb), yvec2;
+MUL_DY yvec0, yvec5, yvec7;
+ADD1_DY yvec7 ,yvec12, yvec12;
+EDUP_DY 5*SIZE(ptrbb), yvec3;
+
+VPERMILP_DY $0x05, yvec0, yvec0;
+MUL_DY yvec0, yvec2, yvec6;
+ADD2_DY yvec6, yvec15, yvec15;
+SHUF_DY $0x03, yvec2, yvec2, yvec4;
+MUL_DY yvec0, yvec3, yvec7;
+ADD2_DY yvec7, yvec14, yvec14;
+SHUF_DY $0x03, yvec3, yvec3, yvec5;
+
+MUL_DY yvec0, yvec4, yvec6;
+ADD2_DY yvec6, yvec13, yvec13;
+MUL_DY yvec0, yvec5, yvec7;
+ADD2_DY yvec7, yvec12, yvec12;
+
+#### Unroll time 2 ####
+LD_DY 4*SIZE(ptrba), yvec0;
+EDUP_DY 8*SIZE(ptrbb), yvec2;
+EDUP_DY 12*SIZE(ptrbb), yvec3;
+
+MUL_DY yvec0, yvec2, yvec6;
+ADD1_DY yvec6, yvec15, yvec15;
+SHUF_DY $0x03, yvec2, yvec2, yvec4;
+MUL_DY yvec0, yvec3, yvec7;
+ADD1_DY yvec7, yvec14, yvec14;
+SHUF_DY $0x03, yvec3, yvec3, yvec5;
+
+MUL_DY yvec0, yvec4, yvec6;
+ADD1_DY yvec6, yvec13, yvec13;
+EDUP_DY 9*SIZE(ptrbb), yvec2;
+MUL_DY yvec0, yvec5, yvec7;
+ADD1_DY yvec7 ,yvec12, yvec12;
+EDUP_DY 13*SIZE(ptrbb), yvec3;
+
+VPERMILP_DY $0x05, yvec0, yvec0;
+MUL_DY yvec0, yvec2, yvec6;
+ADD2_DY yvec6, yvec15, yvec15;
+SHUF_DY $0x03, yvec2, yvec2, yvec4;
+MUL_DY yvec0, yvec3, yvec7;
+ADD2_DY yvec7, yvec14, yvec14;
+SHUF_DY $0x03, yvec3, yvec3, yvec5;
+
+MUL_DY yvec0, yvec4, yvec6;
+ADD2_DY yvec6, yvec13, yvec13;
+MUL_DY yvec0, yvec5, yvec7;
+ADD2_DY yvec7, yvec12, yvec12;
+
+#### Unroll time 3 ####
+LD_DY 8*SIZE(ptrba), yvec0;
+EDUP_DY 16*SIZE(ptrbb), yvec2;
+EDUP_DY 20*SIZE(ptrbb), yvec3;
+
+MUL_DY yvec0, yvec2, yvec6;
+ADD1_DY yvec6, yvec15, yvec15;
+SHUF_DY $0x03, yvec2, yvec2, yvec4;
+MUL_DY yvec0, yvec3, yvec7;
+ADD1_DY yvec7, yvec14, yvec14;
+SHUF_DY $0x03, yvec3, yvec3, yvec5;
+
+MUL_DY yvec0, yvec4, yvec6;
+ADD1_DY yvec6, yvec13, yvec13;
+EDUP_DY 17*SIZE(ptrbb), yvec2;
+MUL_DY yvec0, yvec5, yvec7;
+ADD1_DY yvec7 ,yvec12, yvec12;
+EDUP_DY 21*SIZE(ptrbb), yvec3;
+
+VPERMILP_DY $0x05, yvec0, yvec0;
+MUL_DY yvec0, yvec2, yvec6;
+ADD2_DY yvec6, yvec15, yvec15;
+SHUF_DY $0x03, yvec2, yvec2, yvec4;
+MUL_DY yvec0, yvec3, yvec7;
+ADD2_DY yvec7, yvec14, yvec14;
+SHUF_DY $0x03, yvec3, yvec3, yvec5;
+
+MUL_DY yvec0, yvec4, yvec6;
+ADD2_DY yvec6, yvec13, yvec13;
+MUL_DY yvec0, yvec5, yvec7;
+ADD2_DY yvec7, yvec12, yvec12;
+
+#### Unroll time 4 ####
+LD_DY 12*SIZE(ptrba), yvec0;
+EDUP_DY 24*SIZE(ptrbb), yvec2;
+EDUP_DY 28*SIZE(ptrbb), yvec3;
+
+MUL_DY yvec0, yvec2, yvec6;
+ADD1_DY yvec6, yvec15, yvec15;
+SHUF_DY $0x03, yvec2, yvec2, yvec4;
+MUL_DY yvec0, yvec3, yvec7;
+ADD1_DY yvec7, yvec14, yvec14;
+SHUF_DY $0x03, yvec3, yvec3, yvec5;
+
+MUL_DY yvec0, yvec4, yvec6;
+ADD1_DY yvec6, yvec13, yvec13;
+EDUP_DY 25*SIZE(ptrbb), yvec2;
+MUL_DY yvec0, yvec5, yvec7;
+ADD1_DY yvec7 ,yvec12, yvec12;
+EDUP_DY 29*SIZE(ptrbb), yvec3;
+
+VPERMILP_DY $0x05, yvec0, yvec0;
+MUL_DY yvec0, yvec2, yvec6;
+ADD2_DY yvec6, yvec15, yvec15;
+SHUF_DY $0x03, yvec2, yvec2, yvec4;
+MUL_DY yvec0, yvec3, yvec7;
+ADD2_DY yvec7, yvec14, yvec14;
+SHUF_DY $0x03, yvec3, yvec3, yvec5;
+
+MUL_DY yvec0, yvec4, yvec6;
+ADD2_DY yvec6, yvec13, yvec13;
+ADDQ $16*SIZE, ptrba;
+MUL_DY yvec0, yvec5, yvec7;
+ADD2_DY yvec7, yvec12, yvec12;
+ADDQ $32*SIZE, ptrbb;
+DECQ k;
+JG .L7_bodyB;
+.align 32
+.L7_loopE:
+#ifndef TRMMKERNEL
+TEST $2, bk;
+#else
+TEST $2, kkk;
+#endif
+JLE .L8_loopE;
+.align 32
+.L8_bodyB:
+#### Unroll times 1 ####
+LD_DY 0*SIZE(ptrba), yvec0;
+EDUP_DY 0*SIZE(ptrbb), yvec2;
+EDUP_DY 4*SIZE(ptrbb), yvec3;
+
+MUL_DY yvec0, yvec2, yvec6;
+ADD1_DY yvec6, yvec15, yvec15;
+SHUF_DY $0x03, yvec2, yvec2, yvec4;
+MUL_DY yvec0, yvec3, yvec7;
+ADD1_DY yvec7, yvec14, yvec14;
+SHUF_DY $0x03, yvec3, yvec3, yvec5;
+
+MUL_DY yvec0, yvec4, yvec6;
+ADD1_DY yvec6, yvec13, yvec13;
+EDUP_DY 1*SIZE(ptrbb), yvec2;
+MUL_DY yvec0, yvec5, yvec7;
+ADD1_DY yvec7 ,yvec12, yvec12;
+EDUP_DY 5*SIZE(ptrbb), yvec3;
+
+VPERMILP_DY $0x05, yvec0, yvec0;
+MUL_DY yvec0, yvec2, yvec6;
+ADD2_DY yvec6, yvec15, yvec15;
+SHUF_DY $0x03, yvec2, yvec2, yvec4;
+MUL_DY yvec0, yvec3, yvec7;
+ADD2_DY yvec7, yvec14, yvec14;
+SHUF_DY $0x03, yvec3, yvec3, yvec5;
+
+MUL_DY yvec0, yvec4, yvec6;
+ADD2_DY yvec6, yvec13, yvec13;
+MUL_DY yvec0, yvec5, yvec7;
+ADD2_DY yvec7, yvec12, yvec12;
+
+#### Unroll time 2 ####
+LD_DY 4*SIZE(ptrba), yvec0;
+EDUP_DY 8*SIZE(ptrbb), yvec2;
+EDUP_DY 12*SIZE(ptrbb), yvec3;
+
+MUL_DY yvec0, yvec2, yvec6;
+ADD1_DY yvec6, yvec15, yvec15;
+SHUF_DY $0x03, yvec2, yvec2, yvec4;
+MUL_DY yvec0, yvec3, yvec7;
+ADD1_DY yvec7, yvec14, yvec14;
+SHUF_DY $0x03, yvec3, yvec3, yvec5;
+
+MUL_DY yvec0, yvec4, yvec6;
+ADD1_DY yvec6, yvec13, yvec13;
+EDUP_DY 9*SIZE(ptrbb), yvec2;
+MUL_DY yvec0, yvec5, yvec7;
+ADD1_DY yvec7 ,yvec12, yvec12;
+EDUP_DY 13*SIZE(ptrbb), yvec3;
+
+VPERMILP_DY $0x05, yvec0, yvec0;
+MUL_DY yvec0, yvec2, yvec6;
+ADD2_DY yvec6, yvec15, yvec15;
+SHUF_DY $0x03, yvec2, yvec2, yvec4;
+MUL_DY yvec0, yvec3, yvec7;
+ADD2_DY yvec7, yvec14, yvec14;
+SHUF_DY $0x03, yvec3, yvec3, yvec5;
+
+MUL_DY yvec0, yvec4, yvec6;
+ADD2_DY yvec6, yvec13, yvec13;
+ADDQ $8*SIZE, ptrba;
+MUL_DY yvec0, yvec5, yvec7;
+ADD2_DY yvec7, yvec12, yvec12;
+ADDQ $16*SIZE, ptrbb;
+.L8_loopE:
+#ifndef TRMMKERNEL
+TEST $1, bk;
+#else
+TEST $1, kkk;
+#endif
+JLE .L9_loopE;
+.align 32
+.L9_bodyB:
+#### Unroll times 1 ####
+LD_DY 0*SIZE(ptrba), yvec0;
+EDUP_DY 0*SIZE(ptrbb), yvec2;
+EDUP_DY 4*SIZE(ptrbb), yvec3;
+
+MUL_DY yvec0, yvec2, yvec6;
+ADD1_DY yvec6, yvec15, yvec15;
+SHUF_DY $0x03, yvec2, yvec2, yvec4;
+MUL_DY yvec0, yvec3, yvec7;
+ADD1_DY yvec7, yvec14, yvec14;
+SHUF_DY $0x03, yvec3, yvec3, yvec5;
+
+MUL_DY yvec0, yvec4, yvec6;
+ADD1_DY yvec6, yvec13, yvec13;
+EDUP_DY 1*SIZE(ptrbb), yvec2;
+MUL_DY yvec0, yvec5, yvec7;
+ADD1_DY yvec7 ,yvec12, yvec12;
+EDUP_DY 5*SIZE(ptrbb), yvec3;
+
+VPERMILP_DY $0x05, yvec0, yvec0;
+MUL_DY yvec0, yvec2, yvec6;
+ADD2_DY yvec6, yvec15, yvec15;
+SHUF_DY $0x03, yvec2, yvec2, yvec4;
+MUL_DY yvec0, yvec3, yvec7;
+ADD2_DY yvec7, yvec14, yvec14;
+SHUF_DY $0x03, yvec3, yvec3, yvec5;
+
+MUL_DY yvec0, yvec4, yvec6;
+ADD2_DY yvec6, yvec13, yvec13;
+MUL_DY yvec0, yvec5, yvec7;
+ADD2_DY yvec7, yvec12, yvec12;
+ADDQ $4*SIZE, ptrba;
+ADDQ $8*SIZE, ptrbb;
+
+.L9_loopE:
+#### Handle ####
+XOR_DY yvec7, yvec7, yvec7;
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ADDSUB_DY yvec15, yvec7, yvec15;
+ADDSUB_DY yvec14, yvec7, yvec14;
+ADDSUB_DY yvec13, yvec7, yvec13;
+ADDSUB_DY yvec12, yvec7, yvec12;
+#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
+SUB_DY yvec15, yvec7, yvec15;
+SUB_DY yvec14, yvec7, yvec14;
+SUB_DY yvec13, yvec7, yvec13;
+SUB_DY yvec12, yvec7, yvec12;
+#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
+VPERMILP_DY $0x05, yvec15, yvec15;
+VPERMILP_DY $0x05, yvec14, yvec14;
+VPERMILP_DY $0x05, yvec13, yvec13;
+VPERMILP_DY $0x05, yvec12, yvec12;
+ADDSUB_DY yvec15, yvec7, yvec15;
+ADDSUB_DY yvec14, yvec7, yvec14;
+ADDSUB_DY yvec13, yvec7, yvec13;
+ADDSUB_DY yvec12, yvec7, yvec12;
+VPERMILP_DY $0x05, yvec15, yvec15;
+VPERMILP_DY $0x05, yvec14, yvec14;
+VPERMILP_DY $0x05, yvec13, yvec13;
+VPERMILP_DY $0x05, yvec12, yvec12;
+#endif
+#### Load Alpha ####
+BROAD_DY MEMALPHA_R, yvec7;
+BROAD_DY MEMALPHA_I, yvec6;
+#### Multiply Alpha ####
+VPERMILP_DY $0x05, yvec15, yvec5;
+MUL_DY yvec7, yvec15, yvec15;
+MUL_DY yvec6, yvec5, yvec5;
+ADD2_DY yvec5, yvec15, yvec15;
+VPERMILP_DY $0x05, yvec14, yvec4;
+MUL_DY yvec7, yvec14, yvec14;
+MUL_DY yvec6, yvec4, yvec4;
+ADD2_DY yvec4, yvec14, yvec14;
+VPERMILP_DY $0x05, yvec13, yvec3;
+MUL_DY yvec7, yvec13, yvec13;
+MUL_DY yvec6, yvec3, yvec3;
+ADD2_DY yvec3, yvec13, yvec13;
+VPERMILP_DY $0x05,yvec12, yvec2;
+MUL_DY yvec7, yvec12, yvec12;
+MUL_DY yvec6, yvec2, yvec2;
+ADD2_DY yvec2, yvec12, yvec12;
+#### Testing Alignment ####
+MOVQ C0, %rax;
+OR ldc, %rax;
+TEST $15, %rax;
+JNE .L9_loopEx;
+.align 32
+#### Writing back ####
+EXTRA_DY $1, yvec15, xvec7;
+EXTRA_DY $1, yvec14, xvec6;
+EXTRA_DY $1, yvec13, xvec5;
+EXTRA_DY $1, yvec12, xvec4;
+#ifndef TRMMKERNEL
+ADD_DX 0*SIZE(C0), xvec15;
+ADD_DX 2*SIZE(C0, ldc, 1), xvec7;
+ADD_DX 0*SIZE(C0, ldc, 1), xvec13;
+ADD_DX 2*SIZE(C0), xvec5;
+ADD_DX 0*SIZE(C1), xvec14;
+ADD_DX 2*SIZE(C1, ldc, 1), xvec6;
+ADD_DX 0*SIZE(C1, ldc, 1), xvec12;
+ADD_DX 2*SIZE(C1), xvec4;
+#endif
+ST_DX xvec15, 0*SIZE(C0);
+ST_DX xvec7, 2*SIZE(C0, ldc, 1);
+ST_DX xvec13, 0*SIZE(C0, ldc, 1);
+ST_DX xvec5, 2*SIZE(C0);
+ST_DX xvec14, 0*SIZE(C1);
+ST_DX xvec6, 2*SIZE(C1, ldc, 1);
+ST_DX xvec12, 0*SIZE(C1, ldc, 1);
+ST_DX xvec4, 2*SIZE(C1);
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+SALQ $ZBASE_SHIFT, %rax;
+LEAQ (ptrba, %rax, 2), ptrba;
+LEAQ (ptrbb, %rax, 4), ptrbb;
+#endif
+#if defined(TRMMKERNEL) && defined(LEFT)
+ADDQ $2, kk;
+#endif
+ADDQ $4*SIZE, C0;
+ADDQ $4*SIZE, C1;
+JMP .L5_loopE;
+.align 32
+.L9_loopEx:
+EXTRA_DY $1, yvec15, xvec7;
+EXTRA_DY $1, yvec14, xvec6;
+EXTRA_DY $1, yvec13, xvec5;
+EXTRA_DY $1, yvec12, xvec4;
+#ifndef TRMMKERNEL
+LDL_DX 0*SIZE(C0), xvec0;
+LDH_DX 1*SIZE(C0), xvec0;
+LDL_DX 2*SIZE(C0, ldc, 1), xvec1;
+LDH_DX 3*SIZE(C0, ldc, 1), xvec1;
+LDL_DX 0*SIZE(C0, ldc, 1), xvec2;
+LDH_DX 1*SIZE(C0, ldc, 1), xvec2;
+LDL_DX 2*SIZE(C0), xvec3;
+LDH_DX 3*SIZE(C0), xvec3;
+ADD_DX xvec0, xvec15;
+ADD_DX xvec1, xvec7;
+ADD_DX xvec2, xvec13;
+ADD_DX xvec3, xvec5;
+#endif
+STL_DX xvec15, 0*SIZE(C0);
+STH_DX xvec15, 1*SIZE(C0);
+STL_DX xvec7, 2*SIZE(C0, ldc, 1);
+STH_DX xvec7, 3*SIZE(C0, ldc, 1);
+STL_DX xvec13, 0*SIZE(C0, ldc, 1);
+STH_DX xvec13, 1*SIZE(C0, ldc, 1);
+STL_DX xvec5, 2*SIZE(C0);
+STH_DX xvec5, 3*SIZE(C0);
+#ifndef TRMMKERNEL
+LDL_DX 0*SIZE(C1), xvec0;
+LDH_DX 1*SIZE(C1), xvec0;
+LDL_DX 2*SIZE(C1, ldc, 1), xvec1;
+LDH_DX 3*SIZE(C1, ldc, 1), xvec1;
+LDL_DX 0*SIZE(C1, ldc, 1), xvec2;
+LDH_DX 1*SIZE(C1, ldc, 1), xvec2;
+LDL_DX 2*SIZE(C1), xvec3;
+LDH_DX 3*SIZE(C1), xvec3;
+ADD_DX xvec0, xvec14;
+ADD_DX xvec1, xvec6;
+ADD_DX xvec2, xvec12;
+ADD_DX xvec3, xvec4;
+#endif
+STL_DX xvec14, 0*SIZE(C1);
+STH_DX xvec14, 1*SIZE(C1);
+STL_DX xvec6, 2*SIZE(C1, ldc, 1);
+STH_DX xvec6, 3*SIZE(C1, ldc, 1);
+STL_DX xvec12, 0*SIZE(C1, ldc, 1);
+STH_DX xvec12, 1*SIZE(C1, ldc, 1);
+STL_DX xvec4, 2*SIZE(C1);
+STH_DX xvec4, 3*SIZE(C1);
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+SALQ $ZBASE_SHIFT, %rax;
+LEAQ (ptrba, %rax, 2), ptrba;
+LEAQ (ptrbb, %rax, 4), ptrbb;
+#endif
+#if defined(TRMMKERNEL) && defined(LEFT)
+ADDQ $2, kk;
+#endif
+ADDQ $4*SIZE, C0;
+ADDQ $4*SIZE, C1;
+.L5_loopE:
+TEST $1, bm;
+JLE .L6_loopE;
+.align 32
+.L6_bodyB:
+#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bb,ptrbb;
+#else
+MOVQ bb, ptrbb;
+MOVQ kk, %rax;
+SALQ $ZBASE_SHIFT, %rax;
+ADDQ %rax, ptrba;
+LEAQ (ptrbb, %rax, 4), ptrbb;
+#endif
+XOR_DY yvec15, yvec15, yvec15;
+XOR_DY yvec14, yvec14, yvec14;
+#ifndef TRMMKERNEL
+MOVQ bk,k;
+#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kk, %rax;
+MOVQ %rax, kkk;
+#else
+MOVQ kk, %rax;
+#ifdef LEFT
+ADDQ $1, %rax;
+#else
+ADDQ $4, %rax;
+#endif
+MOVQ %rax, kkk;
+#endif
+SARQ $2, k;
+JLE .L10_loopE;
+.align 32
+.L10_bodyB:
+LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i
+EDUP_DY 0*SIZE(ptrbb), yvec2;
+EDUP_DY 4*SIZE(ptrbb), yvec3;
+
+SHUF_DY $0x20, yvec0, yvec0, yvec1;
+MUL_DY yvec1, yvec2, yvec6;
+ADD1_DY yvec6, yvec15, yvec15;
+MUL_DY yvec1, yvec3, yvec7;
+ADD1_DY yvec7, yvec14, yvec14;
+
+VPERMILP_DY $0x05, yvec1, yvec4;
+EDUP_DY 1*SIZE(ptrbb), yvec2;
+EDUP_DY 5*SIZE(ptrbb), yvec3;
+MUL_DY yvec4, yvec2, yvec6;
+ADD2_DY yvec6, yvec15, yvec15;
+MUL_DY yvec4, yvec3, yvec7;
+ADD2_DY yvec7, yvec14, yvec14;
+
+SHUF_DY $0x31, yvec0, yvec0, yvec1;
+EDUP_DY 8*SIZE(ptrbb), yvec2;
+EDUP_DY 12*SIZE(ptrbb), yvec3;
+
+MUL_DY yvec1, yvec2, yvec6;
+ADD1_DY yvec6, yvec15, yvec15;
+MUL_DY yvec1, yvec3, yvec7;
+ADD1_DY yvec7, yvec14, yvec14;
+
+VPERMILP_DY $0x05, yvec1, yvec4;
+EDUP_DY 9*SIZE(ptrbb), yvec2;
+EDUP_DY 13*SIZE(ptrbb), yvec3;
+MUL_DY yvec4, yvec2, yvec6;
+ADD2_DY yvec6, yvec15, yvec15;
+MUL_DY yvec4, yvec3, yvec7;
+ADD2_DY yvec7, yvec14, yvec14;
+
+LD_DY 4*SIZE(ptrba), yvec0;
+EDUP_DY 16*SIZE(ptrbb), yvec2;
+EDUP_DY 20*SIZE(ptrbb), yvec3;
+
+SHUF_DY $0x20, yvec0, yvec0, yvec1;
+MUL_DY yvec1, yvec2, yvec6;
+ADD1_DY yvec6, yvec15, yvec15;
+MUL_DY yvec1, yvec3, yvec7;
+ADD1_DY yvec7, yvec14, yvec14;
+
+VPERMILP_DY $0x05, yvec1, yvec4;
+EDUP_DY 17*SIZE(ptrbb), yvec2;
+EDUP_DY 21*SIZE(ptrbb), yvec3;
+MUL_DY yvec4, yvec2, yvec6;
+ADD2_DY yvec6, yvec15, yvec15;
+MUL_DY yvec4, yvec3, yvec7;
+ADD2_DY yvec7, yvec14, yvec14;
+
+SHUF_DY $0x31, yvec0, yvec0, yvec1;
+EDUP_DY 24*SIZE(ptrbb), yvec2;
+EDUP_DY 28*SIZE(ptrbb), yvec3;
+MUL_DY yvec1, yvec2, yvec6;
+ADD1_DY yvec6, yvec15, yvec15;
+MUL_DY yvec1, yvec3, yvec7;
+ADD1_DY yvec7, yvec14, yvec14;
+
+VPERMILP_DY $0x05, yvec1, yvec4;
+EDUP_DY 25*SIZE(ptrbb), yvec2;
+EDUP_DY 29*SIZE(ptrbb), yvec3;
+MUL_DY yvec4, yvec2, yvec6;
+ADD2_DY yvec6, yvec15, yvec15;
+MUL_DY yvec4, yvec3, yvec7;
+ADD2_DY yvec7, yvec14, yvec14;
+ADDQ $8*SIZE, ptrba;
+ADDQ $32*SIZE, ptrbb;
+DECQ k;
+JG .L10_bodyB;
+.align 32
+.L10_loopE:
+#ifndef TRMMKERNEL
+TEST $2, bk;
+#else
+TEST $2, kkk;
+#endif
+JLE .L11_loopE;
+.align 32
+.L11_bodyB:
+LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i
+EDUP_DY 0*SIZE(ptrbb), yvec2;
+EDUP_DY 4*SIZE(ptrbb), yvec3;
+
+SHUF_DY $0x20, yvec0, yvec0, yvec1;
+MUL_DY yvec1, yvec2, yvec6;
+ADD1_DY yvec6, yvec15, yvec15;
+MUL_DY yvec1, yvec3, yvec7;
+ADD1_DY yvec7, yvec14, yvec14;
+
+VPERMILP_DY $0x05, yvec1, yvec4;
+EDUP_DY 1*SIZE(ptrbb), yvec2;
+EDUP_DY 5*SIZE(ptrbb), yvec3;
+MUL_DY yvec4, yvec2, yvec6;
+ADD2_DY yvec6, yvec15, yvec15;
+MUL_DY yvec4, yvec3, yvec7;
+ADD2_DY yvec7, yvec14, yvec14;
+
+SHUF_DY $0x31, yvec0, yvec0, yvec1;
+EDUP_DY 8*SIZE(ptrbb), yvec2;
+EDUP_DY 12*SIZE(ptrbb), yvec3;
+
+MUL_DY yvec1, yvec2, yvec6;
+ADD1_DY yvec6, yvec15, yvec15;
+MUL_DY yvec1, yvec3, yvec7;
+ADD1_DY yvec7, yvec14, yvec14;
+
+VPERMILP_DY $0x05, yvec1, yvec4;
+EDUP_DY 9*SIZE(ptrbb), yvec2;
+EDUP_DY 13*SIZE(ptrbb), yvec3;
+MUL_DY yvec4, yvec2, yvec6;
+ADD2_DY yvec6, yvec15, yvec15;
+MUL_DY yvec4, yvec3, yvec7;
+ADD2_DY yvec7, yvec14, yvec14;
+ADDQ $4*SIZE, ptrba;
+ADDQ $16*SIZE, ptrbb;
+
+.L11_loopE:
+#ifndef TRMMKERNEL
+TEST $1, bk;
+#else
+TEST $1, kkk;
+#endif
+JLE .L12_loopE;
+.align 32
+.L12_bodyB:
+LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i
+EDUP_DY 0*SIZE(ptrbb), yvec2;
+EDUP_DY 4*SIZE(ptrbb), yvec3;
+
+SHUF_DY $0x20, yvec0, yvec0, yvec1;
+MUL_DY yvec1, yvec2, yvec6;
+ADD1_DY yvec6, yvec15, yvec15;
+MUL_DY yvec1, yvec3, yvec7;
+ADD1_DY yvec7, yvec14, yvec14;
+
+VPERMILP_DY $0x05, yvec1, yvec4;
+EDUP_DY 1*SIZE(ptrbb), yvec2;
+EDUP_DY 5*SIZE(ptrbb), yvec3;
+MUL_DY yvec4, yvec2, yvec6;
+ADD2_DY yvec6, yvec15, yvec15;
+MUL_DY yvec4, yvec3, yvec7;
+ADD2_DY yvec7, yvec14, yvec14;
+ADDQ $2*SIZE, ptrba;
+ADDQ $8*SIZE, ptrbb;
+
+.L12_loopE:
+#### Handle ####
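+# Sign handling for the complex accumulators: depending on which transpose/conjugate
+# variant this kernel is built as (the RN/NR/RR/... macros below), the partial sums
+# are combined with ADDSUB, negated with SUB, or swapped, combined and swapped back.
+# The same scheme is repeated in every "Handle" section of this file.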
+XOR_DY yvec7, yvec7, yvec7;
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ADDSUB_DY yvec15, yvec7, yvec15;
+ADDSUB_DY yvec14, yvec7, yvec14;
+#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
+SUB_DY yvec15, yvec7, yvec15;
+SUB_DY yvec14, yvec7, yvec14;
+#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
+VPERMILP_DY $0x05, yvec15, yvec15;
+VPERMILP_DY $0x05, yvec14, yvec14;
+ADDSUB_DY yvec15, yvec7, yvec15;
+ADDSUB_DY yvec14, yvec7, yvec14;
+VPERMILP_DY $0x05, yvec15, yvec15;
+VPERMILP_DY $0x05, yvec14, yvec14;
+#endif
+#### Multiply Alpha ####
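+# Scale by the complex alpha: the accumulator is multiplied by the broadcast real
+# part, the lane-swapped copy by the broadcast imaginary part, and the two are
+# combined with ADD2.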
+BROAD_DY MEMALPHA_R, yvec7;
+BROAD_DY MEMALPHA_I, yvec6;
+VPERMILP_DY $0x05, yvec15, yvec5;
+MUL_DY yvec7, yvec15, yvec15;
+MUL_DY yvec6, yvec5, yvec5;
+ADD2_DY yvec5, yvec15, yvec15;
+VPERMILP_DY $0x05, yvec14, yvec4;
+MUL_DY yvec7, yvec14, yvec14;
+MUL_DY yvec6, yvec4, yvec4;
+ADD2_DY yvec4, yvec14, yvec14;
+#### Writing Back ####
+EXTRA_DY $1, yvec15, xvec7;
+EXTRA_DY $1, yvec14, xvec6;
+#ifndef TRMMKERNEL
+LDL_DX 0*SIZE(C0), xvec0;
+LDH_DX 1*SIZE(C0), xvec0;
+LDL_DX 0*SIZE(C0, ldc, 1), xvec1;
+LDH_DX 1*SIZE(C0, ldc, 1), xvec1;
+LDL_DX 0*SIZE(C1), xvec2;
+LDH_DX 1*SIZE(C1), xvec2;
+LDL_DX 0*SIZE(C1, ldc, 1), xvec3;
+LDH_DX 1*SIZE(C1, ldc, 1), xvec3;
+ADD_DX xvec0, xvec15;
+ADD_DX xvec1, xvec7;
+ADD_DX xvec2, xvec14;
+ADD_DX xvec3, xvec6;
+#endif
+STL_DX xvec15, 0*SIZE(C0);
+STH_DX xvec15, 1*SIZE(C0);
+STL_DX xvec7, 0*SIZE(C0, ldc, 1);
+STH_DX xvec7, 1*SIZE(C0, ldc, 1);
+STL_DX xvec14, 0*SIZE(C1);
+STH_DX xvec14, 1*SIZE(C1);
+STL_DX xvec6, 0*SIZE(C1, ldc, 1);
+STH_DX xvec6, 1*SIZE(C1, ldc, 1);
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+SALQ $ZBASE_SHIFT, %rax;
+ADDQ %rax, ptrba;
+LEAQ (ptrbb, %rax, 4), ptrbb;
+#endif
+#if defined(TRMMKERNEL) && defined(LEFT)
+ADDQ $1, kk;
+#endif
+ADDQ $2*SIZE, C0;
+ADDQ $2*SIZE, C1;
+.L6_loopE:
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ADDQ $4, kk;
+#endif
+MOVQ bk,k;
+SALQ $6,k;
+ADDQ k,bb;
+LEAQ (C,ldc,4),C;
+.L0_bodyE:;
+DECQ j;
+JG .L0_bodyB;
+.align 32;
+.L0_loopE:;
+TEST $2, bn;
+JLE .L20_loopE;
+.align 32
+.L20_bodyB:
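+#### bn & 2: process the remaining two columns of C (C0 and C1) ####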
+#if defined(TRMMKERNEL) && defined(LEFT)
+MOVQ OFFSET, %rax;
+MOVQ %rax, kk;
+#endif
+MOVQ C, C0;
+LEAQ (C, ldc, 1), C1;
+MOVQ ba, ptrba;
+MOVQ bm, i;
+SARQ $2, i;
+JLE .L21_loopE;
+.align 32
+.L21_bodyB:
+#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bb,ptrbb;
+#else
+MOVQ bb, ptrbb;
+MOVQ kk, %rax;
+SALQ $ZBASE_SHIFT, %rax;
+LEAQ (ptrba, %rax, 4), ptrba;
+LEAQ (ptrbb, %rax, 2), ptrbb;
+#endif
+XOR_DY yvec15, yvec15, yvec15;
+XOR_DY yvec14, yvec14, yvec14;
+XOR_DY yvec13, yvec13, yvec13;
+XOR_DY yvec12, yvec12, yvec12;
+#ifndef TRMMKERNEL
+MOVQ bk,k;
+#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kk, %rax;
+MOVQ %rax, kkk;
+#else
+MOVQ kk, %rax;
+#ifdef LEFT
+ADDQ $4, %rax;
+#else
+ADDQ $2, %rax;
+#endif
+MOVQ %rax, kkk;
+#endif
+SARQ $2, k;
+JLE .L211_loopE;
+.align 32
+.L211_bodyB:
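+#### 4x2 micro-kernel: 4 rows of A x 2 columns of B per k step, k loop unrolled 4x; accumulators yvec15-yvec12 ####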
+#### Unroll time 1 ####
+EDUP_DY 0*SIZE(ptrbb), yvec2;
+LD_DY 0*SIZE(ptrba), yvec0;
+MUL_DY yvec0, yvec2, yvec6;
+ADD1_DY yvec6, yvec15, yvec15;
+SHUF_DY $0x03, yvec2, yvec2, yvec4;
+LD_DY 4*SIZE(ptrba), yvec1;
+MUL_DY yvec1, yvec2, yvec7;
+ADD1_DY yvec7, yvec14, yvec14;
+
+EDUP_DY 1*SIZE(ptrbb), yvec3;
+MUL_DY yvec0, yvec4, yvec6;
+ADD1_DY yvec6, yvec13, yvec13;
+VPERMILP_DY $0x05, yvec0, yvec0;
+MUL_DY yvec1, yvec4, yvec7;
+ADD1_DY yvec7, yvec12, yvec12;
+VPERMILP_DY $0x05, yvec1, yvec1;
+
+MUL_DY yvec0, yvec3, yvec6;
+ADD2_DY yvec6, yvec15, yvec15;
+SHUF_DY $0x03, yvec3, yvec3, yvec5;
+MUL_DY yvec1, yvec3, yvec7;
+ADD2_DY yvec7, yvec14, yvec14;
+
+MUL_DY yvec0, yvec5, yvec6;
+ADD2_DY yvec6, yvec13, yvec13;
+MUL_DY yvec1, yvec5, yvec7;
+ADD2_DY yvec7, yvec12, yvec12;
+
+#### Unroll time 2 ####
+EDUP_DY 4*SIZE(ptrbb), yvec2;
+LD_DY 8*SIZE(ptrba), yvec0;
+MUL_DY yvec0, yvec2, yvec6;
+ADD1_DY yvec6, yvec15, yvec15;
+SHUF_DY $0x03, yvec2, yvec2, yvec4;
+LD_DY 12*SIZE(ptrba), yvec1;
+MUL_DY yvec1, yvec2, yvec7;
+ADD1_DY yvec7, yvec14, yvec14;
+
+EDUP_DY 5*SIZE(ptrbb), yvec3;
+MUL_DY yvec0, yvec4, yvec6;
+ADD1_DY yvec6, yvec13, yvec13;
+VPERMILP_DY $0x05, yvec0, yvec0;
+MUL_DY yvec1, yvec4, yvec7;
+ADD1_DY yvec7, yvec12, yvec12;
+VPERMILP_DY $0x05, yvec1, yvec1;
+
+MUL_DY yvec0, yvec3, yvec6;
+ADD2_DY yvec6, yvec15, yvec15;
+SHUF_DY $0x03, yvec3, yvec3, yvec5;
+MUL_DY yvec1, yvec3, yvec7;
+ADD2_DY yvec7, yvec14, yvec14;
+
+MUL_DY yvec0, yvec5, yvec6;
+ADD2_DY yvec6, yvec13, yvec13;
+MUL_DY yvec1, yvec5, yvec7;
+ADD2_DY yvec7, yvec12, yvec12;
+
+#### Unroll time 3 ####
+EDUP_DY 8*SIZE(ptrbb), yvec2;
+LD_DY 16*SIZE(ptrba), yvec0;
+MUL_DY yvec0, yvec2, yvec6;
+ADD1_DY yvec6, yvec15, yvec15;
+SHUF_DY $0x03, yvec2, yvec2, yvec4;
+LD_DY 20*SIZE(ptrba), yvec1;
+MUL_DY yvec1, yvec2, yvec7;
+ADD1_DY yvec7, yvec14, yvec14;
+
+EDUP_DY 9*SIZE(ptrbb), yvec3;
+MUL_DY yvec0, yvec4, yvec6;
+ADD1_DY yvec6, yvec13, yvec13;
+VPERMILP_DY $0x05, yvec0, yvec0;
+MUL_DY yvec1, yvec4, yvec7;
+ADD1_DY yvec7, yvec12, yvec12;
+VPERMILP_DY $0x05, yvec1, yvec1;
+
+MUL_DY yvec0, yvec3, yvec6;
+ADD2_DY yvec6, yvec15, yvec15;
+SHUF_DY $0x03, yvec3, yvec3, yvec5;
+MUL_DY yvec1, yvec3, yvec7;
+ADD2_DY yvec7, yvec14, yvec14;
+
+MUL_DY yvec0, yvec5, yvec6;
+ADD2_DY yvec6, yvec13, yvec13;
+MUL_DY yvec1, yvec5, yvec7;
+ADD2_DY yvec7, yvec12, yvec12;
+
+#### Unroll time 4 ####
+EDUP_DY 12*SIZE(ptrbb), yvec2;
+LD_DY 24*SIZE(ptrba), yvec0;
+MUL_DY yvec0, yvec2, yvec6;
+ADD1_DY yvec6, yvec15, yvec15;
+SHUF_DY $0x03, yvec2, yvec2, yvec4;
+LD_DY 28*SIZE(ptrba), yvec1;
+MUL_DY yvec1, yvec2, yvec7;
+ADD1_DY yvec7, yvec14, yvec14;
+
+EDUP_DY 13*SIZE(ptrbb), yvec3;
+MUL_DY yvec0, yvec4, yvec6;
+ADD1_DY yvec6, yvec13, yvec13;
+VPERMILP_DY $0x05, yvec0, yvec0;
+MUL_DY yvec1, yvec4, yvec7;
+ADD1_DY yvec7, yvec12, yvec12;
+VPERMILP_DY $0x05, yvec1, yvec1;
+
+MUL_DY yvec0, yvec3, yvec6;
+ADD2_DY yvec6, yvec15, yvec15;
+SHUF_DY $0x03, yvec3, yvec3, yvec5;
+MUL_DY yvec1, yvec3, yvec7;
+ADD2_DY yvec7, yvec14, yvec14;
+ADDQ $16*SIZE, ptrbb;
+
+MUL_DY yvec0, yvec5, yvec6;
+ADD2_DY yvec6, yvec13, yvec13;
+MUL_DY yvec1, yvec5, yvec7;
+ADD2_DY yvec7, yvec12, yvec12;
+ADDQ $32*SIZE, ptrba;
+DECQ k;
+JG .L211_bodyB;
+.align 32
+.L211_loopE:
+#ifndef TRMMKERNEL
+TEST $2, bk;
+#else
+TEST $2, kkk;
+#endif
+JLE .L212_loopE;
+.align 32
+.L212_bodyB:
+#### Unroll time 1 ####
+EDUP_DY 0*SIZE(ptrbb), yvec2;
+LD_DY 0*SIZE(ptrba), yvec0;
+MUL_DY yvec0, yvec2, yvec6;
+ADD1_DY yvec6, yvec15, yvec15;
+SHUF_DY $0x03, yvec2, yvec2, yvec4;
+LD_DY 4*SIZE(ptrba), yvec1;
+MUL_DY yvec1, yvec2, yvec7;
+ADD1_DY yvec7, yvec14, yvec14;
+
+EDUP_DY 1*SIZE(ptrbb), yvec3;
+MUL_DY yvec0, yvec4, yvec6;
+ADD1_DY yvec6, yvec13, yvec13;
+VPERMILP_DY $0x05, yvec0, yvec0;
+MUL_DY yvec1, yvec4, yvec7;
+ADD1_DY yvec7, yvec12, yvec12;
+VPERMILP_DY $0x05, yvec1, yvec1;
+
+MUL_DY yvec0, yvec3, yvec6;
+ADD2_DY yvec6, yvec15, yvec15;
+SHUF_DY $0x03, yvec3, yvec3, yvec5;
+MUL_DY yvec1, yvec3, yvec7;
+ADD2_DY yvec7, yvec14, yvec14;
+
+MUL_DY yvec0, yvec5, yvec6;
+ADD2_DY yvec6, yvec13, yvec13;
+MUL_DY yvec1, yvec5, yvec7;
+ADD2_DY yvec7, yvec12, yvec12;
+
+#### Unroll time 2 ####
+EDUP_DY 4*SIZE(ptrbb), yvec2;
+LD_DY 8*SIZE(ptrba), yvec0;
+MUL_DY yvec0, yvec2, yvec6;
+ADD1_DY yvec6, yvec15, yvec15;
+SHUF_DY $0x03, yvec2, yvec2, yvec4;
+LD_DY 12*SIZE(ptrba), yvec1;
+MUL_DY yvec1, yvec2, yvec7;
+ADD1_DY yvec7, yvec14, yvec14;
+
+EDUP_DY 5*SIZE(ptrbb), yvec3;
+MUL_DY yvec0, yvec4, yvec6;
+ADD1_DY yvec6, yvec13, yvec13;
+VPERMILP_DY $0x05, yvec0, yvec0;
+MUL_DY yvec1, yvec4, yvec7;
+ADD1_DY yvec7, yvec12, yvec12;
+VPERMILP_DY $0x05, yvec1, yvec1;
+
+MUL_DY yvec0, yvec3, yvec6;
+ADD2_DY yvec6, yvec15, yvec15;
+SHUF_DY $0x03, yvec3, yvec3, yvec5;
+MUL_DY yvec1, yvec3, yvec7;
+ADD2_DY yvec7, yvec14, yvec14;
+
+MUL_DY yvec0, yvec5, yvec6;
+ADD2_DY yvec6, yvec13, yvec13;
+MUL_DY yvec1, yvec5, yvec7;
+ADD2_DY yvec7, yvec12, yvec12;
+
+ADDQ $8*SIZE, ptrbb;
+ADDQ $16*SIZE, ptrba;
+
+.L212_loopE:
+#ifndef TRMMKERNEL
+TEST $1, bk;
+#else
+TEST $1, kkk;
+#endif
+JLE .L213_loopE;
+.align 32
+.L213_bodyB:
+#### Unroll time 1 ####
+EDUP_DY 0*SIZE(ptrbb), yvec2;
+LD_DY 0*SIZE(ptrba), yvec0;
+MUL_DY yvec0, yvec2, yvec6;
+ADD1_DY yvec6, yvec15, yvec15;
+SHUF_DY $0x03, yvec2, yvec2, yvec4;
+LD_DY 4*SIZE(ptrba), yvec1;
+MUL_DY yvec1, yvec2, yvec7;
+ADD1_DY yvec7, yvec14, yvec14;
+
+EDUP_DY 1*SIZE(ptrbb), yvec3;
+MUL_DY yvec0, yvec4, yvec6;
+ADD1_DY yvec6, yvec13, yvec13;
+VPERMILP_DY $0x05, yvec0, yvec0;
+MUL_DY yvec1, yvec4, yvec7;
+ADD1_DY yvec7, yvec12, yvec12;
+VPERMILP_DY $0x05, yvec1, yvec1;
+
+MUL_DY yvec0, yvec3, yvec6;
+ADD2_DY yvec6, yvec15, yvec15;
+SHUF_DY $0x03, yvec3, yvec3, yvec5;
+MUL_DY yvec1, yvec3, yvec7;
+ADD2_DY yvec7, yvec14, yvec14;
+
+MUL_DY yvec0, yvec5, yvec6;
+ADD2_DY yvec6, yvec13, yvec13;
+MUL_DY yvec1, yvec5, yvec7;
+ADD2_DY yvec7, yvec12, yvec12;
+ADDQ $4*SIZE, ptrbb;
+ADDQ $8*SIZE, ptrba;
+
+.L213_loopE:
+#### Handle ####
+XOR_DY yvec7, yvec7, yvec7;
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ADDSUB_DY yvec15, yvec7, yvec15;
+ADDSUB_DY yvec14, yvec7, yvec14;
+ADDSUB_DY yvec13, yvec7, yvec13;
+ADDSUB_DY yvec12, yvec7, yvec12;
+#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
+SUB_DY yvec15, yvec7, yvec15;
+SUB_DY yvec14, yvec7, yvec14;
+SUB_DY yvec13, yvec7, yvec13;
+SUB_DY yvec12, yvec7, yvec12;
+#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
+VPERMILP_DY $0x05, yvec15, yvec15;
+VPERMILP_DY $0x05, yvec14, yvec14;
+VPERMILP_DY $0x05, yvec13, yvec13;
+VPERMILP_DY $0x05, yvec12, yvec12;
+ADDSUB_DY yvec15, yvec7, yvec15;
+ADDSUB_DY yvec14, yvec7, yvec14;
+ADDSUB_DY yvec13, yvec7, yvec13;
+ADDSUB_DY yvec12, yvec7, yvec12;
+VPERMILP_DY $0x05, yvec15, yvec15;
+VPERMILP_DY $0x05, yvec14, yvec14;
+VPERMILP_DY $0x05, yvec13, yvec13;
+VPERMILP_DY $0x05, yvec12, yvec12;
+#endif
+#### Load Alpha ####
+BROAD_DY MEMALPHA_R,yvec7;
+BROAD_DY MEMALPHA_I,yvec6;
+#### Multiply Alpha ####
+VPERMILP_DY $0x05, yvec15, yvec5;
+MUL_DY yvec7, yvec15, yvec15;
+MUL_DY yvec6, yvec5, yvec5;
+ADD2_DY yvec5, yvec15, yvec15;
+VPERMILP_DY $0x05, yvec14, yvec4;
+MUL_DY yvec7, yvec14, yvec14;
+MUL_DY yvec6, yvec4, yvec4;
+ADD2_DY yvec4, yvec14, yvec14;
+VPERMILP_DY $0x05, yvec13, yvec3;
+MUL_DY yvec7, yvec13, yvec13;
+MUL_DY yvec6, yvec3, yvec3;
+ADD2_DY yvec3, yvec13, yvec13;
+VPERMILP_DY $0x05,yvec12, yvec2;
+MUL_DY yvec7, yvec12, yvec12;
+MUL_DY yvec6, yvec2, yvec2;
+ADD2_DY yvec2, yvec12, yvec12;
+EXTRA_DY $1, yvec15, xvec7;
+EXTRA_DY $1, yvec14, xvec6;
+EXTRA_DY $1, yvec13, xvec5;
+EXTRA_DY $1, yvec12, xvec4;
+#### Testing Alignment ####
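+# If both C0 and ldc are 16-byte aligned, take the aligned path below, which adds
+# and stores C with full 128-bit accesses; otherwise branch to .L213_loopEx and use
+# split 64-bit (low/high) loads and stores.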
+MOVQ C0, %rax;
+OR ldc, %rax;
+TEST $15, %rax;
+JNE .L213_loopEx;
+.align 32
+#### Writing back ####
+#ifndef TRMMKERNEL
+ADD_DX 0*SIZE(C0),xvec15;
+ADD_DX 2*SIZE(C1),xvec7;
+ADD_DX 4*SIZE(C0),xvec14;
+ADD_DX 6*SIZE(C1),xvec6;
+ADD_DX 0*SIZE(C1),xvec13;
+ADD_DX 2*SIZE(C0),xvec5;
+ADD_DX 4*SIZE(C1),xvec12;
+ADD_DX 6*SIZE(C0),xvec4;
+#endif
+ST_DX xvec15,0*SIZE(C0);
+ST_DX xvec7,2*SIZE(C1);
+ST_DX xvec14,4*SIZE(C0);
+ST_DX xvec6,6*SIZE(C1);
+ST_DX xvec13,0*SIZE(C1);
+ST_DX xvec5,2*SIZE(C0);
+ST_DX xvec12,4*SIZE(C1);
+ST_DX xvec4,6*SIZE(C0);
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+SALQ $ZBASE_SHIFT, %rax;
+LEAQ (ptrba, %rax, 4), ptrba;
+LEAQ (ptrbb, %rax, 2), ptrbb;
+#endif
+#if defined(TRMMKERNEL) && defined(LEFT)
+ADDQ $4, kk;
+#endif
+ADDQ $8*SIZE, C0;
+ADDQ $8*SIZE, C1;
+DECQ i;
+JG .L21_bodyB;
+JMP .L21_loopE;
+.align 32
+.L213_loopEx:
+#ifndef TRMMKERNEL
+LDL_DX 0*SIZE(C0), xvec0;
+LDH_DX 1*SIZE(C0), xvec0;
+LDL_DX 2*SIZE(C1), xvec1;
+LDH_DX 3*SIZE(C1), xvec1;
+LDL_DX 4*SIZE(C0), xvec2;
+LDH_DX 5*SIZE(C0), xvec2;
+LDL_DX 6*SIZE(C1), xvec3;
+LDH_DX 7*SIZE(C1), xvec3;
+ADD_DX xvec0, xvec15;
+ADD_DX xvec1, xvec7;
+ADD_DX xvec2, xvec14;
+ADD_DX xvec3, xvec6;
+#endif
+STL_DX xvec15, 0*SIZE(C0);
+STH_DX xvec15, 1*SIZE(C0);
+STL_DX xvec7, 2*SIZE(C1);
+STH_DX xvec7, 3*SIZE(C1);
+STL_DX xvec14, 4*SIZE(C0);
+STH_DX xvec14, 5*SIZE(C0);
+STL_DX xvec6, 6*SIZE(C1);
+STH_DX xvec6, 7*SIZE(C1);
+#ifndef TRMMKERNEL
+LDL_DX 0*SIZE(C1), xvec3;
+LDH_DX 1*SIZE(C1), xvec3;
+LDL_DX 2*SIZE(C0), xvec2;
+LDH_DX 3*SIZE(C0), xvec2;
+LDL_DX 4*SIZE(C1), xvec1;
+LDH_DX 5*SIZE(C1), xvec1;
+LDL_DX 6*SIZE(C0), xvec0;
+LDH_DX 7*SIZE(C0), xvec0;
+ADD_DX xvec3, xvec13;
+ADD_DX xvec2, xvec5;
+ADD_DX xvec1, xvec12;
+ADD_DX xvec0, xvec4;
+#endif
+STL_DX xvec13, 0*SIZE(C1);
+STH_DX xvec13, 1*SIZE(C1);
+STL_DX xvec5, 2*SIZE(C0);
+STH_DX xvec5, 3*SIZE(C0);
+STL_DX xvec12, 4*SIZE(C1);
+STH_DX xvec12, 5*SIZE(C1);
+STL_DX xvec4, 6*SIZE(C0);
+STH_DX xvec4, 7*SIZE(C0);
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+SALQ $ZBASE_SHIFT, %rax;
+LEAQ (ptrba, %rax, 4), ptrba;
+LEAQ (ptrbb, %rax, 2), ptrbb;
+#endif
+#if defined(TRMMKERNEL) && defined(LEFT)
+ADDQ $4, kk;
+#endif
+ADDQ $8*SIZE, C0;
+ADDQ $8*SIZE, C1;
+DECQ i;
+JG .L21_bodyB;
+.align 32
+.L21_loopE:
+TEST $2, bm;
+JLE .L22_loopE;
+.align 32
+.L22_bodyB:
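+#### 2x2 block: 2 rows x 2 columns accumulated in yvec15/yvec13 ####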
+#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bb,ptrbb;
+#else
+MOVQ bb, ptrbb;
+MOVQ kk, %rax;
+SALQ $ZBASE_SHIFT, %rax;
+LEAQ (ptrba, %rax, 2), ptrba;
+LEAQ (ptrbb, %rax, 2), ptrbb;
+#endif
+XOR_DY yvec15, yvec15, yvec15;
+XOR_DY yvec13, yvec13, yvec13;
+#ifndef TRMMKERNEL
+MOVQ bk,k;
+#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kk, %rax;
+MOVQ %rax, kkk;
+#else
+MOVQ kk, %rax;
+#ifdef LEFT
+ADDQ $2, %rax;
+#else
+ADDQ $2, %rax;
+#endif
+MOVQ %rax, kkk;
+#endif
+SARQ $2, k;
+JLE .L221_loopE;
+.align 32
+.L221_bodyB:
+#### Unroll time 1 ####
+EDUP_DY 0*SIZE(ptrbb), yvec2;
+LD_DY 0*SIZE(ptrba), yvec0;
+SHUF_DY $0x03, yvec2, yvec2, yvec4;
+MUL_DY yvec0, yvec2, yvec6;
+ADD1_DY yvec6, yvec15, yvec15;
+
+EDUP_DY 1*SIZE(ptrbb), yvec3;
+MUL_DY yvec0, yvec4, yvec6;
+ADD1_DY yvec6, yvec13, yvec13;
+VPERMILP_DY $0x05, yvec0, yvec0;
+
+MUL_DY yvec0, yvec3, yvec6;
+ADD2_DY yvec6, yvec15, yvec15;
+SHUF_DY $0x03, yvec3, yvec3, yvec5;
+
+MUL_DY yvec0, yvec5, yvec6;
+ADD2_DY yvec6, yvec13, yvec13;
+
+#### Unroll time 2 ####
+EDUP_DY 4*SIZE(ptrbb), yvec2;
+LD_DY 4*SIZE(ptrba), yvec0;
+SHUF_DY $0x03, yvec2, yvec2, yvec4;
+MUL_DY yvec0, yvec2, yvec6;
+ADD1_DY yvec6, yvec15, yvec15;
+
+EDUP_DY 5*SIZE(ptrbb), yvec3;
+MUL_DY yvec0, yvec4, yvec6;
+ADD1_DY yvec6, yvec13, yvec13;
+VPERMILP_DY $0x05, yvec0, yvec0;
+
+MUL_DY yvec0, yvec3, yvec6;
+ADD2_DY yvec6, yvec15, yvec15;
+SHUF_DY $0x03, yvec3, yvec3, yvec5;
+
+MUL_DY yvec0, yvec5, yvec6;
+ADD2_DY yvec6, yvec13, yvec13;
+
+#### Unroll time 3 ####
+EDUP_DY 8*SIZE(ptrbb), yvec2;
+LD_DY 8*SIZE(ptrba), yvec0;
+MUL_DY yvec0, yvec2, yvec6;
+ADD1_DY yvec6, yvec15, yvec15;
+SHUF_DY $0x03, yvec2, yvec2, yvec4;
+
+EDUP_DY 9*SIZE(ptrbb), yvec3;
+MUL_DY yvec0, yvec4, yvec6;
+ADD1_DY yvec6, yvec13, yvec13;
+VPERMILP_DY $0x05, yvec0, yvec0;
+
+MUL_DY yvec0, yvec3, yvec6;
+ADD2_DY yvec6, yvec15, yvec15;
+SHUF_DY $0x03, yvec3, yvec3, yvec5;
+
+MUL_DY yvec0, yvec5, yvec6;
+ADD2_DY yvec6, yvec13, yvec13;
+
+#### Unroll time 4 ####
+EDUP_DY 12*SIZE(ptrbb), yvec2;
+LD_DY 12*SIZE(ptrba), yvec0;
+MUL_DY yvec0, yvec2, yvec6;
+ADD1_DY yvec6, yvec15, yvec15;
+SHUF_DY $0x03, yvec2, yvec2, yvec4;
+
+EDUP_DY 13*SIZE(ptrbb), yvec3;
+MUL_DY yvec0, yvec4, yvec6;
+ADD1_DY yvec6, yvec13, yvec13;
+VPERMILP_DY $0x05, yvec0, yvec0;
+
+MUL_DY yvec0, yvec3, yvec6;
+ADD2_DY yvec6, yvec15, yvec15;
+SHUF_DY $0x03, yvec3, yvec3, yvec5;
+ADDQ $16*SIZE, ptrbb;
+
+MUL_DY yvec0, yvec5, yvec6;
+ADD2_DY yvec6, yvec13, yvec13;
+ADDQ $16*SIZE, ptrba;
+DECQ k;
+JG .L221_bodyB;
+.align 32
+.L221_loopE:
+#ifndef TRMMKERNEL
+TEST $2, bk;
+#else
+TEST $2, kkk;
+#endif
+JLE .L222_loopE;
+.align 32
+.L222_bodyB:
+#### Unroll time 1 ####
+EDUP_DY 0*SIZE(ptrbb), yvec2;
+LD_DY 0*SIZE(ptrba), yvec0;
+SHUF_DY $0x03, yvec2, yvec2, yvec4;
+MUL_DY yvec0, yvec2, yvec6;
+ADD1_DY yvec6, yvec15, yvec15;
+
+EDUP_DY 1*SIZE(ptrbb), yvec3;
+MUL_DY yvec0, yvec4, yvec6;
+ADD1_DY yvec6, yvec13, yvec13;
+VPERMILP_DY $0x05, yvec0, yvec0;
+
+MUL_DY yvec0, yvec3, yvec6;
+ADD2_DY yvec6, yvec15, yvec15;
+SHUF_DY $0x03, yvec3, yvec3, yvec5;
+
+MUL_DY yvec0, yvec5, yvec6;
+ADD2_DY yvec6, yvec13, yvec13;
+
+#### Unroll time 2 ####
+EDUP_DY 4*SIZE(ptrbb), yvec2;
+LD_DY 4*SIZE(ptrba), yvec0;
+SHUF_DY $0x03, yvec2, yvec2, yvec4;
+MUL_DY yvec0, yvec2, yvec6;
+ADD1_DY yvec6, yvec15, yvec15;
+
+EDUP_DY 5*SIZE(ptrbb), yvec3;
+MUL_DY yvec0, yvec4, yvec6;
+ADD1_DY yvec6, yvec13, yvec13;
+VPERMILP_DY $0x05, yvec0, yvec0;
+
+MUL_DY yvec0, yvec3, yvec6;
+ADD2_DY yvec6, yvec15, yvec15;
+SHUF_DY $0x03, yvec3, yvec3, yvec5;
+
+MUL_DY yvec0, yvec5, yvec6;
+ADD2_DY yvec6, yvec13, yvec13;
+ADDQ $8*SIZE, ptrba;
+ADDQ $8*SIZE, ptrbb;
+
+.L222_loopE:
+#ifndef TRMMKERNEL
+TEST $1, bk;
+#else
+TEST $1, kkk;
+#endif
+JLE .L223_loopE;
+.align 32
+.L223_bodyB:
+#### Unroll time 1 ####
+EDUP_DY 0*SIZE(ptrbb), yvec2;
+LD_DY 0*SIZE(ptrba), yvec0;
+SHUF_DY $0x03, yvec2, yvec2, yvec4;
+MUL_DY yvec0, yvec2, yvec6;
+ADD1_DY yvec6, yvec15, yvec15;
+
+EDUP_DY 1*SIZE(ptrbb), yvec3;
+MUL_DY yvec0, yvec4, yvec6;
+ADD1_DY yvec6, yvec13, yvec13;
+VPERMILP_DY $0x05, yvec0, yvec0;
+
+MUL_DY yvec0, yvec3, yvec6;
+ADD2_DY yvec6, yvec15, yvec15;
+SHUF_DY $0x03, yvec3, yvec3, yvec5;
+
+MUL_DY yvec0, yvec5, yvec6;
+ADD2_DY yvec6, yvec13, yvec13;
+ADDQ $4*SIZE, ptrba;
+ADDQ $4*SIZE, ptrbb;
+
+.L223_loopE:
+#### Handle ####
+XOR_DY yvec7, yvec7, yvec7;
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ADDSUB_DY yvec15, yvec7, yvec15;
+ADDSUB_DY yvec13, yvec7, yvec13;
+#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
+SUB_DY yvec15, yvec7, yvec15;
+SUB_DY yvec13, yvec7, yvec13;
+#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
+VPERMILP_DY $0x05, yvec15, yvec15;
+VPERMILP_DY $0x05, yvec13, yvec13;
+ADDSUB_DY yvec15, yvec7, yvec15;
+ADDSUB_DY yvec13, yvec7, yvec13;
+VPERMILP_DY $0x05, yvec15, yvec15;
+VPERMILP_DY $0x05, yvec13, yvec13;
+#endif
+
+#### Load Alpha ####
+BROAD_DY MEMALPHA_R,yvec7;
+BROAD_DY MEMALPHA_I,yvec6;
+#### Multiply Alpha ####
+VPERMILP_DY $0x05, yvec15, yvec5;
+MUL_DY yvec7, yvec15, yvec15;
+MUL_DY yvec6, yvec5, yvec5;
+ADD2_DY yvec5, yvec15, yvec15;
+VPERMILP_DY $0x05, yvec13, yvec3;
+MUL_DY yvec7, yvec13, yvec13;
+MUL_DY yvec6, yvec3, yvec3;
+ADD2_DY yvec3, yvec13, yvec13;
+EXTRA_DY $1, yvec15, xvec7;
+EXTRA_DY $1, yvec13, xvec5;
+#### Write back ####
+#ifndef TRMMKERNEL
+LDL_DX 0*SIZE(C0), xvec0;
+LDH_DX 1*SIZE(C0), xvec0;
+LDL_DX 2*SIZE(C1), xvec1;
+LDH_DX 3*SIZE(C1), xvec1;
+LDL_DX 0*SIZE(C1), xvec2;
+LDH_DX 1*SIZE(C1), xvec2;
+LDL_DX 2*SIZE(C0), xvec3;
+LDH_DX 3*SIZE(C0), xvec3;
+ADD_DX xvec0, xvec15;
+ADD_DX xvec1, xvec7;
+ADD_DX xvec2, xvec13;
+ADD_DX xvec3, xvec5;
+#endif
+STL_DX xvec15, 0*SIZE(C0);
+STH_DX xvec15, 1*SIZE(C0);
+STL_DX xvec7, 2*SIZE(C1);
+STH_DX xvec7, 3*SIZE(C1);
+STL_DX xvec13, 0*SIZE(C1);
+STH_DX xvec13, 1*SIZE(C1);
+STL_DX xvec5, 2*SIZE(C0);
+STH_DX xvec5, 3*SIZE(C0);
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+SALQ $ZBASE_SHIFT, %rax;
+LEAQ (ptrba, %rax, 2), ptrba;
+LEAQ (ptrbb, %rax, 2), ptrbb;
+#endif
+#if defined(TRMMKERNEL) && defined(LEFT)
+ADDQ $2, kk;
+#endif
+ADDQ $4*SIZE, C0;
+ADDQ $4*SIZE, C1;
+
+.L22_loopE:
+TEST $1, bm;
+JLE .L23_loopE;
+.align 32
+.L23_bodyB:
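+#### 1x2 block: a single row against two columns, accumulated in yvec15 ####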
+#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bb,ptrbb;
+#else
+MOVQ bb, ptrbb;
+MOVQ kk, %rax;
+SALQ $ZBASE_SHIFT, %rax;
+ADDQ %rax, ptrba;
+LEAQ (ptrbb, %rax, 2), ptrbb;
+#endif
+XOR_DY yvec15, yvec15, yvec15;
+#ifndef TRMMKERNEL
+MOVQ bk,k;
+#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kk, %rax;
+MOVQ %rax, kkk;
+#else
+MOVQ kk, %rax;
+#ifdef LEFT
+ADDQ $1, %rax;
+#else
+ADDQ $2, %rax;
+#endif
+MOVQ %rax, kkk;
+#endif
+SARQ $2, k;
+JLE .L231_loopE;
+.align 32
+.L231_bodyB:
+LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i
+EDUP_DY 0*SIZE(ptrbb), yvec2;
+
+SHUF_DY $0x20, yvec0, yvec0, yvec1;
+MUL_DY yvec1, yvec2, yvec6;
+ADD1_DY yvec6, yvec15, yvec15;
+
+VPERMILP_DY $0x05, yvec1, yvec4;
+EDUP_DY 1*SIZE(ptrbb), yvec2;
+MUL_DY yvec4, yvec2, yvec6;
+ADD2_DY yvec6, yvec15, yvec15;
+
+SHUF_DY $0x31, yvec0, yvec0, yvec1;
+EDUP_DY 4*SIZE(ptrbb), yvec2;
+
+MUL_DY yvec1, yvec2, yvec6;
+ADD1_DY yvec6, yvec15, yvec15;
+
+VPERMILP_DY $0x05, yvec1, yvec4;
+EDUP_DY 5*SIZE(ptrbb), yvec2;
+MUL_DY yvec4, yvec2, yvec6;
+ADD2_DY yvec6, yvec15, yvec15;
+
+LD_DY 4*SIZE(ptrba), yvec0;
+EDUP_DY 8*SIZE(ptrbb), yvec2;
+
+SHUF_DY $0x20, yvec0, yvec0, yvec1;
+MUL_DY yvec1, yvec2, yvec6;
+ADD1_DY yvec6, yvec15, yvec15;
+
+VPERMILP_DY $0x05, yvec1, yvec4;
+EDUP_DY 9*SIZE(ptrbb), yvec2;
+MUL_DY yvec4, yvec2, yvec6;
+ADD2_DY yvec6, yvec15, yvec15;
+
+SHUF_DY $0x31, yvec0, yvec0, yvec1;
+EDUP_DY 12*SIZE(ptrbb), yvec2;
+MUL_DY yvec1, yvec2, yvec6;
+ADD1_DY yvec6, yvec15, yvec15;
+
+VPERMILP_DY $0x05, yvec1, yvec4;
+EDUP_DY 13*SIZE(ptrbb), yvec2;
+MUL_DY yvec4, yvec2, yvec6;
+ADD2_DY yvec6, yvec15, yvec15;
+ADDQ $8*SIZE, ptrba;
+ADDQ $16*SIZE, ptrbb;
+DECQ k;
+JG .L231_bodyB;
+.align 32
+.L231_loopE:
+#ifndef TRMMKERNEL
+TEST $2, bk;
+#else
+TEST $2, kkk;
+#endif
+JLE .L232_loopE;
+.align 32
+.L232_bodyB:
+LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i
+EDUP_DY 0*SIZE(ptrbb), yvec2;
+
+SHUF_DY $0x20, yvec0, yvec0, yvec1;
+MUL_DY yvec1, yvec2, yvec6;
+ADD1_DY yvec6, yvec15, yvec15;
+
+VPERMILP_DY $0x05, yvec1, yvec4;
+EDUP_DY 1*SIZE(ptrbb), yvec2;
+MUL_DY yvec4, yvec2, yvec6;
+ADD2_DY yvec6, yvec15, yvec15;
+
+SHUF_DY $0x31, yvec0, yvec0, yvec1;
+EDUP_DY 4*SIZE(ptrbb), yvec2;
+
+MUL_DY yvec1, yvec2, yvec6;
+ADD1_DY yvec6, yvec15, yvec15;
+
+VPERMILP_DY $0x05, yvec1, yvec4;
+EDUP_DY 5*SIZE(ptrbb), yvec2;
+MUL_DY yvec4, yvec2, yvec6;
+ADD2_DY yvec6, yvec15, yvec15;
+ADDQ $4*SIZE, ptrba;
+ADDQ $8*SIZE, ptrbb;
+
+.L232_loopE:
+#ifndef TRMMKERNEL
+TEST $1, bk;
+#else
+TEST $1, kkk;
+#endif
+JLE .L233_loopE;
+.align 32
+.L233_bodyB:
+LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i
+EDUP_DY 0*SIZE(ptrbb), yvec2;
+
+SHUF_DY $0x20, yvec0, yvec0, yvec1;
+MUL_DY yvec1, yvec2, yvec6;
+ADD1_DY yvec6, yvec15, yvec15;
+
+VPERMILP_DY $0x05, yvec1, yvec4;
+EDUP_DY 1*SIZE(ptrbb), yvec2;
+MUL_DY yvec4, yvec2, yvec6;
+ADD2_DY yvec6, yvec15, yvec15;
+ADDQ $2*SIZE, ptrba;
+ADDQ $4*SIZE, ptrbb;
+
+.L233_loopE:
+#### Handle ####
+XOR_DY yvec7, yvec7, yvec7;
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ADDSUB_DY yvec15, yvec7, yvec15;
+#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
+SUB_DY yvec15, yvec7, yvec15;
+#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
+VPERMILP_DY $0x05, yvec15, yvec15;
+ADDSUB_DY yvec15, yvec7, yvec15;
+VPERMILP_DY $0x05, yvec15, yvec15;
+#endif
+
+#### Multiply Alpha ####
+BROAD_DY MEMALPHA_R, yvec7;
+BROAD_DY MEMALPHA_I, yvec6;
+#### Writing Back ####
+VPERMILP_DY $0x05, yvec15, yvec5;
+MUL_DY yvec7, yvec15, yvec15;
+MUL_DY yvec6, yvec5, yvec5;
+ADD2_DY yvec5, yvec15, yvec15;
+EXTRA_DY $1, yvec15, xvec7;
+#### Writing Back ####
+#ifndef TRMMKERNEL
+LDL_DX 0*SIZE(C0), xvec0;
+LDH_DX 1*SIZE(C0), xvec0;
+LDL_DX 0*SIZE(C1), xvec1;
+LDH_DX 1*SIZE(C1), xvec1;
+ADD_DX xvec0, xvec15;
+ADD_DX xvec1, xvec7;
+#endif
+STL_DX xvec15, 0*SIZE(C0);
+STH_DX xvec15, 1*SIZE(C0);
+STL_DX xvec7, 0*SIZE(C1);
+STH_DX xvec7, 1*SIZE(C1);
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+SALQ $ZBASE_SHIFT, %rax;
+ADDQ %rax, ptrba;
+LEAQ (ptrbb, %rax, 2), ptrbb;
+#endif
+#if defined(TRMMKERNEL) && defined(LEFT)
+ADDQ $1, kk;
+#endif
+ADDQ $2*SIZE, C0;
+ADDQ $2*SIZE, C1;
+.L23_loopE:
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ADDQ $2, kk;
+#endif
+MOVQ bk, k;
+SALQ $5, k;
+ADDQ k, bb;
+LEAQ (C, ldc, 2), C;
+.L20_loopE:
+TEST $1, bn;
+JLE .L30_loopE;
+.align 32
+.L30_bodyB:
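+#### bn & 1: process the last remaining column of C ####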
+#if defined(TRMMKERNEL) && defined(LEFT)
+MOVQ OFFSET, %rax;
+MOVQ %rax, kk;
+#endif
+MOVQ ba, ptrba;
+MOVQ C, C0;
+MOVQ bm, i;
+SARQ $2, i;
+JLE .L31_loopE;
+.align 32
+.L31_bodyB:
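+#### 4x1 block: 4 rows x 1 column, accumulated in yvec15/yvec14 ####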
+#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bb,ptrbb;
+#else
+MOVQ bb, ptrbb;
+MOVQ kk, %rax;
+SALQ $ZBASE_SHIFT, %rax;
+LEAQ (ptrba, %rax, 4), ptrba;
+ADDQ %rax, ptrbb;
+#endif
+XOR_DY yvec15, yvec15, yvec15;
+XOR_DY yvec14, yvec14, yvec14;
+#ifndef TRMMKERNEL
+MOVQ bk,k;
+#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kk, %rax;
+MOVQ %rax, kkk;
+#else
+MOVQ kk, %rax;
+#ifdef LEFT
+ADDQ $4, %rax;
+#else
+ADDQ $1, %rax;
+#endif
+MOVQ %rax, kkk;
+#endif
+SARQ $2, k;
+JLE .L311_loopE;
+.align 32
+.L311_bodyB:
+LD_DY 0*SIZE(ptrba), yvec0;
+BROAD_DY 0*SIZE(ptrbb), yvec2;
+MUL_DY yvec0, yvec2, yvec6;
+ADD1_DY yvec6, yvec15, yvec15;
+
+LD_DY 4*SIZE(ptrba), yvec1;
+MUL_DY yvec1, yvec2, yvec7;
+ADD1_DY yvec7, yvec14, yvec14;
+
+VPERMILP_DY $0x05, yvec0, yvec4;
+BROAD_DY 1*SIZE(ptrbb), yvec3;
+MUL_DY yvec4, yvec3, yvec6;
+ADD2_DY yvec6, yvec15, yvec15;
+
+VPERMILP_DY $0x05, yvec1, yvec5;
+MUL_DY yvec5, yvec3, yvec7;
+ADD2_DY yvec7, yvec14, yvec14;
+
+LD_DY 8*SIZE(ptrba), yvec0;
+BROAD_DY 2*SIZE(ptrbb), yvec2;
+MUL_DY yvec0, yvec2, yvec6;
+ADD1_DY yvec6, yvec15, yvec15;
+
+LD_DY 12*SIZE(ptrba), yvec1;
+MUL_DY yvec1, yvec2, yvec7;
+ADD1_DY yvec7, yvec14, yvec14;
+
+VPERMILP_DY $0x05, yvec0, yvec4;
+BROAD_DY 3*SIZE(ptrbb), yvec3;
+MUL_DY yvec4, yvec3, yvec6;
+ADD2_DY yvec6, yvec15, yvec15;
+
+VPERMILP_DY $0x05, yvec1, yvec5;
+MUL_DY yvec5, yvec3, yvec7;
+ADD2_DY yvec7, yvec14, yvec14;
+
+LD_DY 16*SIZE(ptrba), yvec0;
+BROAD_DY 4*SIZE(ptrbb), yvec2;
+MUL_DY yvec0, yvec2, yvec6;
+ADD1_DY yvec6, yvec15, yvec15;
+
+LD_DY 20*SIZE(ptrba), yvec1;
+MUL_DY yvec1, yvec2, yvec7;
+ADD1_DY yvec7, yvec14, yvec14;
+
+VPERMILP_DY $0x05, yvec0, yvec4;
+BROAD_DY 5*SIZE(ptrbb), yvec3;
+MUL_DY yvec4, yvec3, yvec6;
+ADD2_DY yvec6, yvec15, yvec15;
+
+VPERMILP_DY $0x05, yvec1, yvec5;
+MUL_DY yvec5, yvec3, yvec7;
+ADD2_DY yvec7, yvec14, yvec14;
+
+LD_DY 24*SIZE(ptrba), yvec0;
+BROAD_DY 6*SIZE(ptrbb), yvec2;
+MUL_DY yvec0, yvec2, yvec6;
+ADD1_DY yvec6, yvec15, yvec15;
+
+LD_DY 28*SIZE(ptrba), yvec1;
+MUL_DY yvec1, yvec2, yvec7;
+ADD1_DY yvec7, yvec14, yvec14;
+
+VPERMILP_DY $0x05, yvec0, yvec4;
+BROAD_DY 7*SIZE(ptrbb), yvec3;
+MUL_DY yvec4, yvec3, yvec6;
+ADD2_DY yvec6, yvec15, yvec15;
+
+VPERMILP_DY $0x05, yvec1, yvec5;
+MUL_DY yvec5, yvec3, yvec7;
+ADD2_DY yvec7, yvec14, yvec14;
+ADDQ $32*SIZE, ptrba;
+ADDQ $8*SIZE, ptrbb;
+DECQ k;
+JG .L311_bodyB;
+.align 32
+.L311_loopE:
+#ifndef TRMMKERNEL
+TEST $2, bk;
+#else
+TEST $2, kkk;
+#endif
+JLE .L312_loopE;
+.align 32
+.L312_bodyB:
+LD_DY 0*SIZE(ptrba), yvec0;
+BROAD_DY 0*SIZE(ptrbb), yvec2;
+MUL_DY yvec0, yvec2, yvec6;
+ADD1_DY yvec6, yvec15, yvec15;
+
+LD_DY 4*SIZE(ptrba), yvec1;
+MUL_DY yvec1, yvec2, yvec7;
+ADD1_DY yvec7, yvec14, yvec14;
+
+VPERMILP_DY $0x05, yvec0, yvec4;
+BROAD_DY 1*SIZE(ptrbb), yvec3;
+MUL_DY yvec4, yvec3, yvec6;
+ADD2_DY yvec6, yvec15, yvec15;
+
+VPERMILP_DY $0x05, yvec1, yvec5;
+MUL_DY yvec5, yvec3, yvec7;
+ADD2_DY yvec7, yvec14, yvec14;
+
+LD_DY 8*SIZE(ptrba), yvec0;
+BROAD_DY 2*SIZE(ptrbb), yvec2;
+MUL_DY yvec0, yvec2, yvec6;
+ADD1_DY yvec6, yvec15, yvec15;
+
+LD_DY 12*SIZE(ptrba), yvec1;
+MUL_DY yvec1, yvec2, yvec7;
+ADD1_DY yvec7, yvec14, yvec14;
+
+VPERMILP_DY $0x05, yvec0, yvec4;
+BROAD_DY 3*SIZE(ptrbb), yvec3;
+MUL_DY yvec4, yvec3, yvec6;
+ADD2_DY yvec6, yvec15, yvec15;
+
+VPERMILP_DY $0x05, yvec1, yvec5;
+MUL_DY yvec5, yvec3, yvec7;
+ADD2_DY yvec7, yvec14, yvec14;
+ADDQ $16*SIZE, ptrba;
+ADDQ $4*SIZE, ptrbb;
+
+.L312_loopE:
+#ifndef TRMMKERNEL
+TEST $1, bk;
+#else
+TEST $1, kkk;
+#endif
+JLE .L313_loopE;
+.align 32
+.L313_bodyB:
+LD_DY 0*SIZE(ptrba), yvec0;
+BROAD_DY 0*SIZE(ptrbb), yvec2;
+MUL_DY yvec0, yvec2, yvec6;
+ADD1_DY yvec6, yvec15, yvec15;
+
+LD_DY 4*SIZE(ptrba), yvec1;
+MUL_DY yvec1, yvec2, yvec7;
+ADD1_DY yvec7, yvec14, yvec14;
+
+VPERMILP_DY $0x05, yvec0, yvec4;
+BROAD_DY 1*SIZE(ptrbb), yvec3;
+MUL_DY yvec4, yvec3, yvec6;
+ADD2_DY yvec6, yvec15, yvec15;
+
+VPERMILP_DY $0x05, yvec1, yvec5;
+MUL_DY yvec5, yvec3, yvec7;
+ADD2_DY yvec7, yvec14, yvec14;
+ADDQ $8*SIZE, ptrba;
+ADDQ $2*SIZE, ptrbb;
+
+.L313_loopE:
+#### Handle ####
+XOR_DY yvec7, yvec7, yvec7;
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ADDSUB_DY yvec15, yvec7, yvec15;
+ADDSUB_DY yvec14, yvec7, yvec14;
+#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
+SUB_DY yvec15, yvec7, yvec15;
+SUB_DY yvec14, yvec7, yvec14;
+#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
+VPERMILP_DY $0x05, yvec15, yvec15;
+VPERMILP_DY $0x05, yvec14, yvec14;
+ADDSUB_DY yvec15, yvec7, yvec15;
+ADDSUB_DY yvec14, yvec7, yvec14;
+VPERMILP_DY $0x05, yvec15, yvec15;
+VPERMILP_DY $0x05, yvec14, yvec14;
+#endif
+
+#### Load Alpha ####
+BROAD_DY MEMALPHA_R,yvec7;
+BROAD_DY MEMALPHA_I,yvec6;
+#### Multiply Alpha ####
+VPERMILP_DY $0x05, yvec15, yvec5;
+MUL_DY yvec7, yvec15, yvec15;
+MUL_DY yvec6, yvec5, yvec5;
+ADD2_DY yvec5, yvec15, yvec15;
+VPERMILP_DY $0x05, yvec14, yvec4;
+MUL_DY yvec7, yvec14, yvec14;
+MUL_DY yvec6, yvec4, yvec4;
+ADD2_DY yvec4, yvec14, yvec14;
+EXTRA_DY $1, yvec15, xvec7;
+EXTRA_DY $1, yvec14, xvec6;
+#### Writing Back ####
+#ifndef TRMMKERNEL
+LDL_DX 0*SIZE(C0), xvec0;
+LDH_DX 1*SIZE(C0), xvec0;
+LDL_DX 2*SIZE(C0), xvec1;
+LDH_DX 3*SIZE(C0), xvec1;
+LDL_DX 4*SIZE(C0), xvec2;
+LDH_DX 5*SIZE(C0), xvec2;
+LDL_DX 6*SIZE(C0), xvec3;
+LDH_DX 7*SIZE(C0), xvec3;
+ADD_DX xvec0, xvec15;
+ADD_DX xvec1, xvec7;
+ADD_DX xvec2, xvec14;
+ADD_DX xvec3, xvec6;
+#endif
+STL_DX xvec15, 0*SIZE(C0);
+STH_DX xvec15, 1*SIZE(C0);
+STL_DX xvec7, 2*SIZE(C0);
+STH_DX xvec7, 3*SIZE(C0);
+STL_DX xvec14, 4*SIZE(C0);
+STH_DX xvec14, 5*SIZE(C0);
+STL_DX xvec6, 6*SIZE(C0);
+STH_DX xvec6, 7*SIZE(C0);
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+SALQ $ZBASE_SHIFT, %rax;
+LEAQ (ptrba, %rax, 4), ptrba;
+ADDQ %rax, ptrbb;
+#endif
+#if defined(TRMMKERNEL) && defined(LEFT)
+ADDQ $4, kk;
+#endif
+ADDQ $8*SIZE, C0;
+DECQ i;
+JG .L31_bodyB;
+.align 32
+.L31_loopE:
+TEST $2, bm;
+JLE .L32_loopE;
+.align 32
+.L32_bodyB:
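+#### 2x1 block: 2 rows x 1 column, accumulated in yvec15 ####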
+#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bb,ptrbb;
+#else
+MOVQ bb, ptrbb;
+MOVQ kk, %rax;
+SALQ $ZBASE_SHIFT, %rax;
+LEAQ (ptrba, %rax, 2), ptrba;
+ADDQ %rax, ptrbb;
+#endif
+XOR_DY yvec15, yvec15, yvec15;
+#ifndef TRMMKERNEL
+MOVQ bk,k;
+#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kk, %rax;
+MOVQ %rax, kkk;
+#else
+MOVQ kk, %rax;
+#ifdef LEFT
+ADDQ $2, %rax;
+#else
+ADDQ $1, %rax;
+#endif
+MOVQ %rax, kkk;
+#endif
+SARQ $2, k;
+JLE .L321_loopE;
+.align 32
+.L321_bodyB:
+LD_DY 0*SIZE(ptrba), yvec0;
+BROAD_DY 0*SIZE(ptrbb), yvec2;
+MUL_DY yvec0, yvec2, yvec6;
+ADD1_DY yvec6, yvec15, yvec15;
+VPERMILP_DY $0x05, yvec0, yvec1;
+BROAD_DY 1*SIZE(ptrbb), yvec3;
+MUL_DY yvec1, yvec3, yvec7;
+ADD2_DY yvec7, yvec15, yvec15;
+
+LD_DY 4*SIZE(ptrba), yvec0;
+BROAD_DY 2*SIZE(ptrbb), yvec2;
+MUL_DY yvec0, yvec2, yvec6;
+ADD1_DY yvec6, yvec15, yvec15;
+VPERMILP_DY $0x05, yvec0, yvec1;
+BROAD_DY 3*SIZE(ptrbb), yvec3;
+MUL_DY yvec1, yvec3, yvec7;
+ADD2_DY yvec7, yvec15, yvec15;
+
+LD_DY 8*SIZE(ptrba), yvec0;
+BROAD_DY 4*SIZE(ptrbb), yvec2;
+MUL_DY yvec0, yvec2, yvec6;
+ADD1_DY yvec6, yvec15, yvec15;
+VPERMILP_DY $0x05, yvec0, yvec1;
+BROAD_DY 5*SIZE(ptrbb), yvec3;
+MUL_DY yvec1, yvec3, yvec7;
+ADD2_DY yvec7, yvec15, yvec15;
+
+LD_DY 12*SIZE(ptrba), yvec0;
+BROAD_DY 6*SIZE(ptrbb), yvec2;
+MUL_DY yvec0, yvec2, yvec6;
+ADD1_DY yvec6, yvec15, yvec15;
+VPERMILP_DY $0x05, yvec0, yvec1;
+BROAD_DY 7*SIZE(ptrbb), yvec3;
+MUL_DY yvec1, yvec3, yvec7;
+ADD2_DY yvec7, yvec15, yvec15;
+ADDQ $16*SIZE, ptrba;
+ADDQ $8*SIZE, ptrbb;
+DECQ k;
+JG .L321_bodyB;
+.align 32
+.L321_loopE:
+#ifndef TRMMKERNEL
+TEST $2, bk;
+#else
+TEST $2, kkk;
+#endif
+JLE .L322_loopE;
+.align 32
+.L322_bodyB:
+LD_DY 0*SIZE(ptrba), yvec0;
+BROAD_DY 0*SIZE(ptrbb), yvec2;
+MUL_DY yvec0, yvec2, yvec6;
+ADD1_DY yvec6, yvec15, yvec15;
+VPERMILP_DY $0x05, yvec0, yvec1;
+BROAD_DY 1*SIZE(ptrbb), yvec3;
+MUL_DY yvec1, yvec3, yvec7;
+ADD2_DY yvec7, yvec15, yvec15;
+
+LD_DY 4*SIZE(ptrba), yvec0;
+BROAD_DY 2*SIZE(ptrbb), yvec2;
+MUL_DY yvec0, yvec2, yvec6;
+ADD1_DY yvec6, yvec15, yvec15;
+VPERMILP_DY $0x05, yvec0, yvec1;
+BROAD_DY 3*SIZE(ptrbb), yvec3;
+MUL_DY yvec1, yvec3, yvec7;
+ADD2_DY yvec7, yvec15, yvec15;
+ADDQ $8*SIZE, ptrba;
+ADDQ $4*SIZE, ptrbb;
+
+.L322_loopE:
+#ifndef TRMMKERNEL
+TEST $1, bk;
+#else
+TEST $1, kkk;
+#endif
+JLE .L323_loopE;
+.align 32
+.L323_bodyB:
+LD_DY 0*SIZE(ptrba), yvec0;
+BROAD_DY 0*SIZE(ptrbb), yvec2;
+MUL_DY yvec0, yvec2, yvec6;
+ADD1_DY yvec6, yvec15, yvec15;
+VPERMILP_DY $0x05, yvec0, yvec1;
+BROAD_DY 1*SIZE(ptrbb), yvec3;
+MUL_DY yvec1, yvec3, yvec7;
+ADD2_DY yvec7, yvec15, yvec15;
+ADDQ $4*SIZE, ptrba;
+ADDQ $2*SIZE, ptrbb;
+.L323_loopE:
+#### Handle ####
+XOR_DY yvec7, yvec7, yvec7;
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ADDSUB_DY yvec15, yvec7, yvec15;
+#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
+SUB_DY yvec15, yvec7, yvec15;
+#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
+VPERMILP_DY $0x05, yvec15, yvec15;
+ADDSUB_DY yvec15, yvec7, yvec15;
+VPERMILP_DY $0x05, yvec15, yvec15;
+#endif
+
+#### Load Alpha ####
+BROAD_DY MEMALPHA_R,yvec7;
+BROAD_DY MEMALPHA_I,yvec6;
+#### Multiply Alpha ####
+VPERMILP_DY $0x05, yvec15, yvec5;
+MUL_DY yvec7, yvec15, yvec15;
+MUL_DY yvec6, yvec5, yvec5;
+ADD2_DY yvec5, yvec15, yvec15;
+EXTRA_DY $1, yvec15, xvec7;
+#### Writing Back ####
+#ifndef TRMMKERNEL
+LDL_DX 0*SIZE(C0), xvec0;
+LDH_DX 1*SIZE(C0), xvec0;
+LDL_DX 2*SIZE(C0), xvec1;
+LDH_DX 3*SIZE(C0), xvec1;
+ADD_DX xvec0, xvec15;
+ADD_DX xvec1, xvec7;
+#endif
+STL_DX xvec15, 0*SIZE(C0);
+STH_DX xvec15, 1*SIZE(C0);
+STL_DX xvec7, 2*SIZE(C0);
+STH_DX xvec7, 3*SIZE(C0);
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+SALQ $ZBASE_SHIFT, %rax;
+LEAQ (ptrba, %rax, 2), ptrba;
+ADDQ %rax, ptrbb;
+#endif
+#if defined(TRMMKERNEL) && defined(LEFT)
+ADDQ $2, kk;
+#endif
+ADDQ $4*SIZE, C0;
+.L32_loopE:
+TEST $1, bm;
+JLE .L33_loopE;
+.align 32
+.L33_bodyB:
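+#### 1x1 block: a single complex element, computed with 128-bit (xvec) operations ####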
+#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
+MOVQ bb,ptrbb;
+#else
+MOVQ bb, ptrbb;
+MOVQ kk, %rax;
+SALQ $ZBASE_SHIFT, %rax;
+ADDQ %rax, ptrba;
+ADDQ %rax, ptrbb;
+#endif
+XOR_DY yvec15, yvec15, yvec15;
+#ifndef TRMMKERNEL
+MOVQ bk,k;
+#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kk, %rax;
+MOVQ %rax, kkk;
+#else
+MOVQ kk, %rax;
+#ifdef LEFT
+ADDQ $1, %rax;
+#else
+ADDQ $1, %rax;
+#endif
+MOVQ %rax, kkk;
+#endif
+SARQ $2, k;
+JLE .L331_loopE;
+.align 32
+.L331_bodyB:
+LD_DX 0*SIZE(ptrba), xvec0;
+BROAD_DX 0*SIZE(ptrbb), xvec2;
+MUL_DX xvec0, xvec2;
+ADD1_DX xvec2, xvec15;
+
+SHUF_DX $0x4e, xvec0, xvec1;
+BROAD_DX 1*SIZE(ptrbb), xvec3;
+MUL_DX xvec1, xvec3;
+ADDSUB_DX xvec3, xvec15;
+
+LD_DX 2*SIZE(ptrba), xvec0;
+BROAD_DX 2*SIZE(ptrbb), xvec2;
+MUL_DX xvec0, xvec2;
+ADD1_DX xvec2, xvec15;
+
+SHUF_DX $0x4e, xvec0, xvec1;
+BROAD_DX 3*SIZE(ptrbb), xvec3;
+MUL_DX xvec1, xvec3;
+ADDSUB_DX xvec3, xvec15;
+
+LD_DX 4*SIZE(ptrba), xvec0;
+BROAD_DX 4*SIZE(ptrbb), xvec2;
+MUL_DX xvec0, xvec2;
+ADD1_DX xvec2, xvec15;
+
+SHUF_DX $0x4e, xvec0, xvec1;
+BROAD_DX 5*SIZE(ptrbb), xvec3;
+MUL_DX xvec1, xvec3;
+ADDSUB_DX xvec3, xvec15;
+
+LD_DX 6*SIZE(ptrba), xvec0;
+BROAD_DX 6*SIZE(ptrbb), xvec2;
+MUL_DX xvec0, xvec2;
+ADD1_DX xvec2, xvec15;
+
+SHUF_DX $0x4e, xvec0, xvec1;
+BROAD_DX 7*SIZE(ptrbb), xvec3;
+MUL_DX xvec1, xvec3;
+ADDSUB_DX xvec3, xvec15;
+ADDQ $8*SIZE, ptrba;
+ADDQ $8*SIZE, ptrbb;
+DECQ k;
+JG .L331_bodyB;
+.align 32
+.L331_loopE:
+#ifndef TRMMKERNEL
+TEST $2, bk;
+#else
+TEST $2, kkk;
+#endif
+JLE .L332_loopE;
+.align 32
+.L332_bodyB:
+LD_DX 0*SIZE(ptrba), xvec0;
+BROAD_DX 0*SIZE(ptrbb), xvec2;
+MUL_DX xvec0, xvec2;
+ADD1_DX xvec2, xvec15;
+
+SHUF_DX $0x4e, xvec0, xvec1;
+BROAD_DX 1*SIZE(ptrbb), xvec3;
+MUL_DX xvec1, xvec3;
+ADDSUB_DX xvec3, xvec15;
+
+LD_DX 2*SIZE(ptrba), xvec0;
+BROAD_DX 2*SIZE(ptrbb), xvec2;
+MUL_DX xvec0, xvec2;
+ADD1_DX xvec2, xvec15;
+
+SHUF_DX $0x4e, xvec0, xvec1;
+BROAD_DX 3*SIZE(ptrbb), xvec3;
+MUL_DX xvec1, xvec3;
+ADDSUB_DX xvec3, xvec15;
+ADDQ $4*SIZE, ptrba;
+ADDQ $4*SIZE, ptrbb;
+
+.L332_loopE:
+#ifndef TRMMKERNEL
+TEST $1, bk;
+#else
+TEST $1, kkk;
+#endif
+JLE .L333_loopE;
+.align 32
+.L333_bodyB:
+LD_DX 0*SIZE(ptrba), xvec0;
+BROAD_DX 0*SIZE(ptrbb), xvec2;
+MUL_DX xvec0, xvec2;
+ADD1_DX xvec2, xvec15;
+
+SHUF_DX $0x4e, xvec0, xvec1;
+BROAD_DX 1*SIZE(ptrbb), xvec3;
+MUL_DX xvec1, xvec3;
+ADDSUB_DX xvec3, xvec15;
+ADDQ $2*SIZE, ptrba;
+ADDQ $2*SIZE, ptrbb;
+
+.L333_loopE:
+#### Handle ####
+XOR_DY yvec7, yvec7, yvec7;
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ADDSUB_DX xvec15, xvec7;
+MOV_DX xvec7, xvec15;
+#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
+SUB_DX xvec15, xvec7;
+MOV_DX xvec7, xvec15;
+#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
+SHUF_DX $0x4e, xvec15, xvec15;
+ADDSUB_DX xvec15, xvec7;
+MOV_DX xvec7, xvec15;
+SHUF_DX $0x4e, xvec15, xvec15;
+#endif
+
+#### Load Alpha ####
+BROAD_DX MEMALPHA_R,xvec7;
+BROAD_DX MEMALPHA_I,xvec6;
+#### Multiply Alpha ####
+SHUF_DX $0x4e, xvec15, xvec5;
+MUL_DX xvec7, xvec15;
+MUL_DX xvec6, xvec5;
+ADDSUB_DX xvec5, xvec15;
+#### Writing back ####
+#ifndef TRMMKERNEL
+LDL_DX 0*SIZE(C0), xvec0;
+LDH_DX 1*SIZE(C0), xvec0;
+ADD_DX xvec0, xvec15;
+#endif
+STL_DX xvec15, 0*SIZE(C0);
+STH_DX xvec15, 1*SIZE(C0);
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+MOVQ bk, %rax;
+SUBQ kkk, %rax;
+SALQ $ZBASE_SHIFT, %rax;
+ADDQ %rax, ptrba;
+ADDQ %rax, ptrbb;
+#endif
+#if defined(TRMMKERNEL) && defined(LEFT)
+ADDQ $1, kk;
+#endif
+ADDQ $2*SIZE, C0;
+.L33_loopE:
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ADDQ $1, kk;
+#endif
+MOVQ bk, k;
+SALQ $4, k; # one complex(double) column: 2*SIZE = 16 bytes per k
+ADDQ k, bb;
+LEAQ (C, ldc, 1), C;
+.L30_loopE:
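+# Epilogue: restore callee-saved registers (and, on Windows, rdi/rsi/xmm6-xmm15)
+# from the stack frame, release it and return.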
+movq 0(%rsp), %rbx;
+movq 8(%rsp), %rbp;
+movq 16(%rsp), %r12;
+movq 24(%rsp), %r13;
+movq 32(%rsp), %r14;
+movq 40(%rsp), %r15;
+
+#ifdef WINDOWS_ABI
+ movq 48(%rsp), %rdi
+ movq 56(%rsp), %rsi
+ movups 64(%rsp), %xmm6
+ movups 80(%rsp), %xmm7
+ movups 96(%rsp), %xmm8
+ movups 112(%rsp), %xmm9
+ movups 128(%rsp), %xmm10
+ movups 144(%rsp), %xmm11
+ movups 160(%rsp), %xmm12
+ movups 176(%rsp), %xmm13
+ movups 192(%rsp), %xmm14
+ movups 208(%rsp), %xmm15
+#endif
+
+
+addq $STACKSIZE, %rsp;
+ret
+
+EPILOGUE
/*****************************************************************************
-Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
+Copyright (c) 2011, Lab of Parallel Software and Computational Science,ISCAS
All rights reserved.
Redistribution and use in source and binary forms, with or without
#endif
-#define SGEMM_DEFAULT_R sgemm_r
-#define QGEMM_DEFAULT_R qgemm_r
-#define DGEMM_DEFAULT_R dgemm_r
-#define CGEMM_DEFAULT_R cgemm_r
-#define ZGEMM_DEFAULT_R zgemm_r
-#define XGEMM_DEFAULT_R xgemm_r
-
-#define SYMV_P 16
-#define HAVE_EXCLUSIVE_CACHE
-
-#define GEMM_THREAD gemm_thread_mn
-
-#endif
-
-#if defined(BOBCAT)
-
-#define SNUMOPT 8
-#define DNUMOPT 4
-
-#define GEMM_DEFAULT_OFFSET_A 64
-#define GEMM_DEFAULT_OFFSET_B 832
-#define GEMM_DEFAULT_ALIGN 0x0fffUL
-
-#define SGEMM_DEFAULT_UNROLL_N 4
-#define DGEMM_DEFAULT_UNROLL_N 4
-#define QGEMM_DEFAULT_UNROLL_N 2
-#define CGEMM_DEFAULT_UNROLL_N 2
-#define ZGEMM_DEFAULT_UNROLL_N 2
-#define XGEMM_DEFAULT_UNROLL_N 1
-
-#ifdef ARCH_X86
-#define SGEMM_DEFAULT_UNROLL_M 4
-#define DGEMM_DEFAULT_UNROLL_M 2
-#define QGEMM_DEFAULT_UNROLL_M 2
-#define CGEMM_DEFAULT_UNROLL_M 2
-#define ZGEMM_DEFAULT_UNROLL_M 1
-#define XGEMM_DEFAULT_UNROLL_M 1
-#else
-#define SGEMM_DEFAULT_UNROLL_M 8
-#define DGEMM_DEFAULT_UNROLL_M 4
-#define QGEMM_DEFAULT_UNROLL_M 2
-#define CGEMM_DEFAULT_UNROLL_M 4
-#define ZGEMM_DEFAULT_UNROLL_M 2
-#define XGEMM_DEFAULT_UNROLL_M 1
-#endif
-
-
-#define SGEMM_DEFAULT_P 448
-#define DGEMM_DEFAULT_P 224
-#define QGEMM_DEFAULT_P 112
-#define CGEMM_DEFAULT_P 224
-#define ZGEMM_DEFAULT_P 112
-#define XGEMM_DEFAULT_P 56
-
-#define SGEMM_DEFAULT_Q 224
-#define DGEMM_DEFAULT_Q 224
-#define QGEMM_DEFAULT_Q 224
-#define CGEMM_DEFAULT_Q 224
-#define ZGEMM_DEFAULT_Q 224
-#define XGEMM_DEFAULT_Q 224
-
-
#define SGEMM_DEFAULT_R sgemm_r
#define QGEMM_DEFAULT_R qgemm_r
#define DGEMM_DEFAULT_R dgemm_r
#define SNUMOPT 8
#define DNUMOPT 4
-#define GEMM_DEFAULT_OFFSET_A 32
+#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x03fffUL
#ifdef ARCH_X86
#define SGEMM_DEFAULT_UNROLL_M 4
-#define DGEMM_DEFAULT_UNROLL_M 2
+#define DGEMM_DEFAULT_UNROLL_M 8
#define QGEMM_DEFAULT_UNROLL_M 2
#define CGEMM_DEFAULT_UNROLL_M 2
#define ZGEMM_DEFAULT_UNROLL_M 1
#define XGEMM_DEFAULT_UNROLL_M 1
#define SGEMM_DEFAULT_UNROLL_N 4
-#define DGEMM_DEFAULT_UNROLL_N 4
+#define DGEMM_DEFAULT_UNROLL_N 8
#define QGEMM_DEFAULT_UNROLL_N 2
#define CGEMM_DEFAULT_UNROLL_N 2
#define ZGEMM_DEFAULT_UNROLL_N 2
#define XGEMM_DEFAULT_UNROLL_N 1
#else
-#define SGEMM_DEFAULT_UNROLL_M 4
-#define DGEMM_DEFAULT_UNROLL_M 2
+#define SGEMM_DEFAULT_UNROLL_M 8
+#define DGEMM_DEFAULT_UNROLL_M 8
#define QGEMM_DEFAULT_UNROLL_M 2
-#define CGEMM_DEFAULT_UNROLL_M 2
-#define ZGEMM_DEFAULT_UNROLL_M 1
+#define CGEMM_DEFAULT_UNROLL_M 8
+#define ZGEMM_DEFAULT_UNROLL_M 4
#define XGEMM_DEFAULT_UNROLL_M 1
#define SGEMM_DEFAULT_UNROLL_N 8
-#define DGEMM_DEFAULT_UNROLL_N 8
+#define DGEMM_DEFAULT_UNROLL_N 4
#define QGEMM_DEFAULT_UNROLL_N 2
#define CGEMM_DEFAULT_UNROLL_N 4
#define ZGEMM_DEFAULT_UNROLL_N 4
#define XGEMM_DEFAULT_UNROLL_N 1
#endif
-#define SGEMM_DEFAULT_P 504
+#define SGEMM_DEFAULT_P 512
#define SGEMM_DEFAULT_R sgemm_r
+//#define SGEMM_DEFAULT_R 1024
-#define DGEMM_DEFAULT_P 504
+#define DGEMM_DEFAULT_P 512
#define DGEMM_DEFAULT_R dgemm_r
+//#define DGEMM_DEFAULT_R 1024
#define QGEMM_DEFAULT_P 504
#define QGEMM_DEFAULT_R qgemm_r
-#define CGEMM_DEFAULT_P 252
-#define CGEMM_DEFAULT_R cgemm_r
+#define CGEMM_DEFAULT_P 128
+//#define CGEMM_DEFAULT_R cgemm_r
+#define CGEMM_DEFAULT_R 1024
-#define ZGEMM_DEFAULT_P 252
+#define ZGEMM_DEFAULT_P 512
#define ZGEMM_DEFAULT_R zgemm_r
+//#define ZGEMM_DEFAULT_R 1024
#define XGEMM_DEFAULT_P 252
#define XGEMM_DEFAULT_R xgemm_r
-#define SGEMM_DEFAULT_Q 512
+#define SGEMM_DEFAULT_Q 256
#define DGEMM_DEFAULT_Q 256
#define QGEMM_DEFAULT_Q 128
-#define CGEMM_DEFAULT_Q 512
-#define ZGEMM_DEFAULT_Q 256
+#define CGEMM_DEFAULT_Q 256
+#define ZGEMM_DEFAULT_Q 192
#define XGEMM_DEFAULT_Q 128
#define GETRF_FACTOR 0.72