ZSWAPKERNEL = zswap.c
#
-#SGEMVNKERNEL = ../arm/gemv_n.c
+SGEMVNKERNEL = sgemv_n.c
DGEMVNKERNEL = dgemv_n.c
-#CGEMVNKERNEL = ../arm/zgemv_n.c
+CGEMVNKERNEL = cgemv_n.c
ZGEMVNKERNEL = zgemv_n_4.c
#
-#SGEMVTKERNEL = ../arm/gemv_t.c
+SGEMVTKERNEL = sgemv_t.c
DGEMVTKERNEL = dgemv_t.c
-#CGEMVTKERNEL = ../arm/zgemv_t.c
+CGEMVTKERNEL = cgemv_t.c
ZGEMVTKERNEL = zgemv_t_4.c
--- /dev/null
+/***************************************************************************\r
+Copyright (c) 2019, The OpenBLAS Project\r
+All rights reserved.\r
+Redistribution and use in source and binary forms, with or without\r
+modification, are permitted provided that the following conditions are\r
+met:\r
+1. Redistributions of source code must retain the above copyright\r
+notice, this list of conditions and the following disclaimer.\r
+2. Redistributions in binary form must reproduce the above copyright\r
+notice, this list of conditions and the following disclaimer in\r
+the documentation and/or other materials provided with the\r
+distribution.\r
+3. Neither the name of the OpenBLAS project nor the names of\r
+its contributors may be used to endorse or promote products\r
+derived from this software without specific prior written permission.\r
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"\r
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\r
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE\r
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE\r
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\r
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\r
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\r
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\r
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE\r
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
+ *****************************************************************************/\r
+\r
+#include <stdlib.h>\r
+#include <stdio.h>\r
+#include "common.h" \r
+#include <altivec.h> \r
+#define NBMAX 1024\r
+\r
+\r
+static const unsigned char swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};\r
+\r
+ \r
+/* No-transpose CGEMV micro-kernel over 4 columns:
+ *   y[0..n-1] += A[:,0] * x[0] + A[:,1] * x[1] + A[:,2] * x[2] + A[:,3] * x[3]
+ * with interleaved (re,im) complex storage.  lda is the column stride in
+ * FLOAT units (already doubled by the caller).  alpha is NOT applied here;
+ * the caller scales via add_y() afterwards.
+ * Assumes n is a multiple of 4 (the driver rounds the block size to 4) and
+ * that ap/x/y are suitably aligned for VSX vector loads -- TODO confirm
+ * alignment guarantee from the calling convention. */
+static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) {
+ 
+ FLOAT *a0, *a1, *a2, *a3;
+ a0 = ap;
+ a1 = ap + lda;
+ a2 = a1 + lda;
+ a3 = a2 + lda;
+ /* swap_mask exchanges the real/imag halves of each complex pair inside a
+  * vector: (r0,i0,r1,i1) -> (i0,r0,i1,r1). */
+ __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr);
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ /* Plain complex multiply: splat Re(x[k]) into vxk_r and (-Im,+Im) into
+  * vxk_i, so va*vx_r + swapped(va)*vx_i yields (re,im) of a*x per lane
+  * pair. */
+ register __vector float vx0_r = {x[0], x[0],x[0], x[0]};
+ register __vector float vx0_i = {-x[1], x[1],-x[1], x[1]};
+ register __vector float vx1_r = {x[2], x[2],x[2], x[2]};
+ register __vector float vx1_i = {-x[3], x[3],-x[3], x[3]};
+ register __vector float vx2_r = {x[4], x[4],x[4], x[4]};
+ register __vector float vx2_i = {-x[5], x[5],-x[5], x[5]};
+ register __vector float vx3_r = {x[6], x[6],x[6], x[6]};
+ register __vector float vx3_i = {-x[7], x[7],-x[7], x[7]};
+#else
+ /* Conjugated variant: sign pattern moved onto the real part instead. */
+ register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]};
+ register __vector float vx0_i = {x[1], x[1],x[1], x[1]};
+ register __vector float vx1_r = {x[2], -x[2],x[2], -x[2]};
+ register __vector float vx1_i = {x[3], x[3],x[3], x[3]};
+ register __vector float vx2_r = {x[4], -x[4],x[4], -x[4]};
+ register __vector float vx2_i = {x[5], x[5],x[5], x[5]};
+ register __vector float vx3_r = {x[6], -x[6],x[6], -x[6]};
+ register __vector float vx3_i = {x[7], x[7],x[7], x[7]};
+#endif
+ register __vector float *vy = (__vector float *) y;
+ register __vector float *vptr_a0 = (__vector float *) a0;
+ register __vector float *vptr_a1 = (__vector float *) a1;
+ register __vector float *vptr_a2 = (__vector float *) a2;
+ register __vector float *vptr_a3 = (__vector float *) a3; 
+ BLASLONG i = 0; 
+ /* Each vector holds 2 complex elements; i steps by 2 vectors, i.e. 4
+  * complex rows per iteration -- hence the n % 4 == 0 requirement. */
+ for (;i< n / 2; i+=2) {
+ register __vector float vy_0 = vy[i];
+ register __vector float vy_1 = vy[i + 1];
+ register __vector float va0 = vptr_a0[i];
+ register __vector float va1 = vptr_a1[i];
+ register __vector float va2 = vptr_a2[i];
+ register __vector float va3 = vptr_a3[i];
+ register __vector float va0_1 = vptr_a0[i + 1];
+ register __vector float va1_1 = vptr_a1[i + 1];
+ register __vector float va2_1 = vptr_a2[i + 1];
+ register __vector float va3_1 = vptr_a3[i + 1];
+
+ /* real-part products (a * splat of x real component) */
+ vy_0 += va0*vx0_r + va1*vx1_r + va2*vx2_r + va3*vx3_r;
+ vy_1 += va0_1*vx0_r + va1_1*vx1_r + va2_1*vx2_r + va3_1*vx3_r;
+ /* swap (re,im) of A so the second FMA pass forms the cross terms */
+ va0 = vec_perm(va0, va0,swap_mask);
+ va0_1 = vec_perm(va0_1, va0_1,swap_mask);
+ va1 = vec_perm(va1, va1,swap_mask);
+ va1_1 = vec_perm(va1_1, va1_1,swap_mask);
+ va2 = vec_perm(va2, va2,swap_mask);
+ va2_1 = vec_perm(va2_1, va2_1,swap_mask);
+ va3 = vec_perm(va3, va3,swap_mask);
+ va3_1 = vec_perm(va3_1, va3_1,swap_mask);
+ vy_0 += va0*vx0_i + va1*vx1_i + va2*vx2_i + va3*vx3_i;
+ vy_1 += va0_1*vx0_i + va1_1*vx1_i + va2_1*vx2_i + va3_1*vx3_i;
+
+ vy[i] = vy_0;
+ vy[i + 1] = vy_1;
+ }
+
+} 
+ \r
+\r
+\r
+/* No-transpose CGEMV micro-kernel over 2 columns:
+ *   y[0..n-1] += A[:,0] * x[0] + A[:,1] * x[1]
+ * Same storage/sign conventions as cgemv_kernel_4x4; alpha applied later by
+ * add_y().  n is assumed to be a multiple of 4. */
+static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) {
+ 
+ FLOAT *a0, *a1;
+ a0 = ap;
+ a1 = ap + lda; 
+ /* swaps the real/imag halves of each complex pair within a vector */
+ __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr);
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ /* plain multiply: negate imag lane on the "i" splat */
+ register __vector float vx0_r = {x[0], x[0],x[0], x[0]};
+ register __vector float vx0_i = {-x[1], x[1],-x[1], x[1]};
+ register __vector float vx1_r = {x[2], x[2],x[2], x[2]};
+ register __vector float vx1_i = {-x[3], x[3],-x[3], x[3]}; 
+#else
+ /* conjugated multiply: negate alternate lanes of the "r" splat instead */
+ register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]};
+ register __vector float vx0_i = {x[1], x[1],x[1], x[1]};
+ register __vector float vx1_r = {x[2], -x[2],x[2], -x[2]};
+ register __vector float vx1_i = {x[3], x[3],x[3], x[3]}; 
+#endif
+ register __vector float *vy = (__vector float *) y;
+ register __vector float *vptr_a0 = (__vector float *) a0;
+ register __vector float *vptr_a1 = (__vector float *) a1; 
+ BLASLONG i = 0; 
+ /* 2 vectors = 4 complex rows per iteration */
+ for (;i< n / 2; i+=2) {
+ register __vector float vy_0 = vy[i];
+ register __vector float vy_1 = vy[i + 1];
+ register __vector float va0 = vptr_a0[i];
+ register __vector float va1 = vptr_a1[i]; 
+ register __vector float va0_1 = vptr_a0[i + 1];
+ register __vector float va1_1 = vptr_a1[i + 1]; 
+ register __vector float va0x = vec_perm(va0, va0,swap_mask);
+ register __vector float va0x_1 = vec_perm(va0_1, va0_1,swap_mask);
+ register __vector float va1x = vec_perm(va1, va1,swap_mask);
+ register __vector float va1x_1 = vec_perm(va1_1, va1_1,swap_mask);
+ vy_0 += va0*vx0_r + va1*vx1_r + va0x*vx0_i + va1x*vx1_i;
+ vy_1 += va0_1*vx0_r + va1_1*vx1_r + va0x_1*vx0_i + va1x_1*vx1_i; 
+
+ vy[i] = vy_0;
+ vy[i + 1] = vy_1;
+ }
+
+}
+\r
+ \r
+\r
+/* No-transpose CGEMV micro-kernel over a single column:
+ *   y[0..n-1] += A[:,0] * x[0]
+ * Same conventions as cgemv_kernel_4x4; n assumed to be a multiple of 4. */
+static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) {
+
+ /* swaps the real/imag halves of each complex pair within a vector */
+ __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr);
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ register __vector float vx0_r = {x[0], x[0],x[0], x[0]};
+ register __vector float vx0_i = {-x[1], x[1],-x[1], x[1]}; 
+#else
+ register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]};
+ register __vector float vx0_i = {x[1], x[1],x[1], x[1]}; 
+#endif
+ register __vector float *vy = (__vector float *) y;
+ register __vector float *vptr_a0 = (__vector float *) ap; 
+ BLASLONG i = 0; 
+ /* 2 vectors = 4 complex rows per iteration */
+ for (;i< n / 2; i+=2) {
+ register __vector float vy_0 = vy[i];
+ register __vector float vy_1 = vy[i + 1];
+ register __vector float va0 = vptr_a0[i];
+ register __vector float va0_1 = vptr_a0[i + 1]; 
+ register __vector float va0x = vec_perm(va0, va0,swap_mask);
+ register __vector float va0x_1 = vec_perm(va0_1, va0_1,swap_mask);
+ vy_0 += va0*vx0_r + va0x*vx0_i;
+ vy_1 += va0_1*vx0_r + va0x_1*vx0_i; 
+
+ vy[i] = vy_0;
+ vy[i + 1] = vy_1;
+ }
+}
+\r
+\r
+\r
+\r
+/* Scale the unscaled accumulator buffer by alpha and add it into y:
+ *   dest[k] += alpha * src[k]   (complex multiply, XCONJ conjugates alpha)
+ * src is contiguous interleaved complex; dest has stride inc_dest in FLOAT
+ * units (inc_dest == 2 means contiguous).  The contiguous case is
+ * vectorized and, like the kernels, assumes n is a multiple of 4. */
+static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) {
+ BLASLONG i;
+
+
+ if (inc_dest != 2) {
+ /* strided destination: plain scalar complex multiply-accumulate */
+ FLOAT temp_r;
+ FLOAT temp_i;
+ for ( i=0; i<n; i++ )
+ {
+#if !defined(XCONJ) 
+ temp_r = alpha_r * src[0] - alpha_i * src[1];
+ temp_i = alpha_r * src[1] + alpha_i * src[0];
+#else
+ /* conj(alpha) * src */
+ temp_r = alpha_r * src[0] + alpha_i * src[1];
+ temp_i = -alpha_r * src[1] + alpha_i * src[0];
+#endif
+
+ *dest += temp_r;
+ *(dest+1) += temp_i;
+
+ src+=2;
+ dest += inc_dest;
+ }
+ return;
+ } else {
+ /* swaps the real/imag halves of each complex pair within a vector */
+ __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr);
+#if !defined(XCONJ) 
+
+ register __vector float valpha_r = {alpha_r, alpha_r, alpha_r, alpha_r};
+ register __vector float valpha_i = {-alpha_i, alpha_i, -alpha_i, alpha_i};
+
+#else
+ register __vector float valpha_r = {alpha_r, -alpha_r, alpha_r, -alpha_r};
+ register __vector float valpha_i = {alpha_i, alpha_i, alpha_i, alpha_i};
+#endif
+
+ register __vector float *vptr_src = (__vector float *) src;
+ register __vector float *vptr_y = (__vector float *) dest; 
+ /* 2 vectors = 4 complex elements per iteration */
+ for (i = 0; i < n/2; i += 2 ){
+
+ register __vector float vy_0 = vptr_y[i];
+ register __vector float vy_1 = vptr_y[i +1]; 
+
+ register __vector float vsrc = vptr_src[i];
+ register __vector float vsrc_1 = vptr_src[i + 1]; 
+ register __vector float vsrcx = vec_perm(vsrc, vsrc, swap_mask);
+ register __vector float vsrcx_1 = vec_perm(vsrc_1, vsrc_1, swap_mask);
+
+ vy_0 += vsrc*valpha_r + vsrcx*valpha_i;
+ vy_1 += vsrc_1*valpha_r + vsrcx_1*valpha_i; 
+ vptr_y[i] = vy_0;
+ vptr_y[i+1 ] = vy_1; 
+
+ }
+ 
+ }
+ return;
+}
+\r
+\r
+\r
+/* CGEMV no-transpose driver:  y := alpha * A * x + y  (complex single).
+ * m rows, n columns; A column-major with leading dimension lda (in complex
+ * elements on entry); inc_x/inc_y are complex strides.  dummy1 is unused
+ * (standard OpenBLAS kernel signature).  buffer is caller-provided scratch,
+ * used here as an unscaled accumulator for one row-block at a time
+ * (presumably sized for at least NBMAX complex elements -- verify against
+ * the interface layer).  Always returns 0. */
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT * buffer) {
+ BLASLONG i;
+ FLOAT *a_ptr;
+ FLOAT *x_ptr;
+ FLOAT *y_ptr;
+
+ BLASLONG n1;
+ BLASLONG m1;
+ BLASLONG m2;
+ BLASLONG m3;
+ BLASLONG n2;
+
+ FLOAT xbuffer[8], *ybuffer;
+
+ if (m < 1) return (0);
+ if (n < 1) return (0);
+
+ ybuffer = buffer;
+
+ /* convert complex strides to FLOAT strides (2 floats per element) */
+ inc_x *= 2;
+ inc_y *= 2;
+ lda *= 2;
+
+ /* column blocking: n1 groups of 4 columns, n2 leftover columns */
+ n1 = n / 4;
+ n2 = n % 4;
+
+ /* row blocking: m1+m2 rows handled by the vector kernels in NBMAX-sized
+  * chunks (all multiples of 4); m3 = 1..3 tail rows handled scalar below */
+ m3 = m % 4;
+ m1 = m - (m % 4);
+ m2 = (m % NBMAX) - (m % 4);
+
+ y_ptr = y;
+
+ BLASLONG NB = NBMAX;
+
+ /* iterate over row blocks; the last (partial) block shrinks NB to m2,
+  * which also terminates the loop */
+ while (NB == NBMAX) {
+
+ m1 -= NB;
+ if (m1 < 0) {
+ if (m2 == 0) break;
+ NB = m2;
+ }
+
+ a_ptr = a;
+
+ x_ptr = x; 
+
+ /* accumulate this row block unscaled into ybuffer, then apply alpha
+  * once via add_y() at the end of the block */
+ memset(ybuffer, 0, NB * 2*sizeof(FLOAT)); 
+
+ if (inc_x == 2) {
+ /* contiguous x: feed the kernels directly */
+
+ for (i = 0; i < n1; i++) {
+ cgemv_kernel_4x4(NB, lda, a_ptr, x_ptr, ybuffer);
+
+ a_ptr += lda << 2;
+ x_ptr += 8;
+ }
+
+ if (n2 & 2) {
+ cgemv_kernel_4x2(NB, lda, a_ptr, x_ptr, ybuffer);
+ x_ptr += 4;
+ a_ptr += 2 * lda;
+
+ }
+
+ if (n2 & 1) {
+ cgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer);
+ x_ptr += 2;
+ a_ptr += lda;
+
+ }
+ } else {
+ /* strided x: gather 4 complex elements into xbuffer per kernel call */
+
+ for (i = 0; i < n1; i++) {
+
+ xbuffer[0] = x_ptr[0];
+ xbuffer[1] = x_ptr[1];
+ x_ptr += inc_x;
+ xbuffer[2] = x_ptr[0];
+ xbuffer[3] = x_ptr[1];
+ x_ptr += inc_x;
+ xbuffer[4] = x_ptr[0];
+ xbuffer[5] = x_ptr[1];
+ x_ptr += inc_x;
+ xbuffer[6] = x_ptr[0];
+ xbuffer[7] = x_ptr[1];
+ x_ptr += inc_x;
+
+ cgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer);
+
+ a_ptr += lda << 2;
+ }
+
+ for (i = 0; i < n2; i++) {
+ xbuffer[0] = x_ptr[0];
+ xbuffer[1] = x_ptr[1];
+ x_ptr += inc_x;
+ cgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer);
+ a_ptr += lda;
+
+ }
+
+ }
+
+ /* y[block] += alpha * ybuffer */
+ add_y(NB, ybuffer, y_ptr, inc_y, alpha_r, alpha_i);
+ a += 2 * NB;
+ y_ptr += NB * inc_y;
+ }
+
+ if (m3 == 0) return (0);
+
+ /* scalar handling of the last m3 (1..3) rows; each row is a dot product
+  * of a row of A with x, scaled by alpha on the way into y */
+ if (m3 == 1) {
+ a_ptr = a;
+ x_ptr = x;
+ FLOAT temp_r = 0.0;
+ FLOAT temp_i = 0.0;
+
+ /* lda == 2 here means the original (complex) lda was 1, i.e. the row
+  * of A is contiguous -- allows the unrolled-by-2 fast path */
+ if (lda == 2 && inc_x == 2) {
+
+ for (i = 0; i < (n & -2); i += 2) {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
+ temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
+ temp_r += a_ptr[2] * x_ptr[2] - a_ptr[3] * x_ptr[3];
+ temp_i += a_ptr[2] * x_ptr[3] + a_ptr[3] * x_ptr[2];
+#else
+ /* conj(a) * x */
+ temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
+ temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
+ temp_r += a_ptr[2] * x_ptr[2] + a_ptr[3] * x_ptr[3];
+ temp_i += a_ptr[2] * x_ptr[3] - a_ptr[3] * x_ptr[2];
+#endif
+
+ a_ptr += 4;
+ x_ptr += 4;
+ }
+
+ for (; i < n; i++) {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
+ temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
+#else
+ temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
+ temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
+#endif
+
+ a_ptr += 2;
+ x_ptr += 2;
+ }
+
+ } else {
+
+ for (i = 0; i < n; i++) {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
+ temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
+#else
+ temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
+ temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
+#endif
+
+ a_ptr += lda;
+ x_ptr += inc_x;
+ }
+
+ }
+#if !defined(XCONJ) 
+ y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i;
+ y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r;
+#else
+ /* conj(alpha) * temp */
+ y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i;
+ y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r;
+#endif
+ return (0);
+ }
+
+ if (m3 == 2) {
+ a_ptr = a;
+ x_ptr = x;
+ FLOAT temp_r0 = 0.0;
+ FLOAT temp_i0 = 0.0;
+ FLOAT temp_r1 = 0.0;
+ FLOAT temp_i1 = 0.0;
+
+ /* lda == 4: original complex lda was 2, rows are packed back-to-back */
+ if (lda == 4 && inc_x == 2) {
+
+ for (i = 0; i < (n & -2); i += 2) {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+
+ temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
+ temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
+ temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
+ temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
+
+ temp_r0 += a_ptr[4] * x_ptr[2] - a_ptr[5] * x_ptr[3];
+ temp_i0 += a_ptr[4] * x_ptr[3] + a_ptr[5] * x_ptr[2];
+ temp_r1 += a_ptr[6] * x_ptr[2] - a_ptr[7] * x_ptr[3];
+ temp_i1 += a_ptr[6] * x_ptr[3] + a_ptr[7] * x_ptr[2];
+#else
+ temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
+ temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
+ temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
+ temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
+
+ temp_r0 += a_ptr[4] * x_ptr[2] + a_ptr[5] * x_ptr[3];
+ temp_i0 += a_ptr[4] * x_ptr[3] - a_ptr[5] * x_ptr[2];
+ temp_r1 += a_ptr[6] * x_ptr[2] + a_ptr[7] * x_ptr[3];
+ temp_i1 += a_ptr[6] * x_ptr[3] - a_ptr[7] * x_ptr[2];
+#endif
+
+ a_ptr += 8;
+ x_ptr += 4;
+ }
+
+ for (; i < n; i++) {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
+ temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
+ temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
+ temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
+#else
+ temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
+ temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
+ temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
+ temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
+#endif
+
+ a_ptr += 4;
+ x_ptr += 2;
+ }
+
+ } else {
+
+ for (i = 0; i < n; i++) {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
+ temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
+ temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
+ temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
+#else
+ temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
+ temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
+ temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
+ temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
+#endif
+
+ a_ptr += lda;
+ x_ptr += inc_x;
+ }
+
+ }
+#if !defined(XCONJ) 
+ y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
+ y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
+ y_ptr += inc_y;
+ y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1;
+ y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1;
+#else
+ y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
+ y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
+ y_ptr += inc_y;
+ y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1;
+ y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1;
+#endif
+ return (0);
+ }
+
+ if (m3 == 3) {
+ a_ptr = a;
+ x_ptr = x;
+ FLOAT temp_r0 = 0.0;
+ FLOAT temp_i0 = 0.0;
+ FLOAT temp_r1 = 0.0;
+ FLOAT temp_i1 = 0.0;
+ FLOAT temp_r2 = 0.0;
+ FLOAT temp_i2 = 0.0;
+
+ /* lda == 6: original complex lda was 3, rows are packed back-to-back */
+ if (lda == 6 && inc_x == 2) {
+
+ for (i = 0; i < n; i++) {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
+ temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
+ temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
+ temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
+ temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1];
+ temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0];
+#else
+ temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
+ temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
+ temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
+ temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
+ temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1];
+ temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0];
+#endif
+
+ a_ptr += 6;
+ x_ptr += 2;
+ }
+
+ } else {
+
+ for (i = 0; i < n; i++) {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
+ temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
+ temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
+ temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
+ temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1];
+ temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0];
+#else
+ temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
+ temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
+ temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
+ temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
+ temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1];
+ temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0];
+#endif
+
+ a_ptr += lda;
+ x_ptr += inc_x;
+ }
+
+ }
+#if !defined(XCONJ) 
+ y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
+ y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
+ y_ptr += inc_y;
+ y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1;
+ y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1;
+ y_ptr += inc_y;
+ y_ptr[0] += alpha_r * temp_r2 - alpha_i * temp_i2;
+ y_ptr[1] += alpha_r * temp_i2 + alpha_i * temp_r2;
+#else
+ y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
+ y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
+ y_ptr += inc_y;
+ y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1;
+ y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1;
+ y_ptr += inc_y;
+ y_ptr[0] += alpha_r * temp_r2 + alpha_i * temp_i2;
+ y_ptr[1] -= alpha_r * temp_i2 - alpha_i * temp_r2;
+#endif
+ return (0);
+ }
+
+ return (0);
+}
+\r
--- /dev/null
+/***************************************************************************\r
+Copyright (c) 2019, The OpenBLAS Project\r
+All rights reserved.\r
+Redistribution and use in source and binary forms, with or without\r
+modification, are permitted provided that the following conditions are\r
+met:\r
+1. Redistributions of source code must retain the above copyright\r
+notice, this list of conditions and the following disclaimer.\r
+2. Redistributions in binary form must reproduce the above copyright\r
+notice, this list of conditions and the following disclaimer in\r
+the documentation and/or other materials provided with the\r
+distribution.\r
+3. Neither the name of the OpenBLAS project nor the names of\r
+its contributors may be used to endorse or promote products\r
+derived from this software without specific prior written permission.\r
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"\r
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\r
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE\r
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE\r
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\r
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\r
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\r
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\r
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE\r
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
+ *****************************************************************************/\r
+\r
+#include "common.h"\r
+\r
+#define NBMAX 1024 \r
+#include <altivec.h> \r
+static const unsigned char swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};\r
+\r
+/* Transpose CGEMV micro-kernel over 4 rows of A^T (4 columns of stored A):
+ *   y[j] += alpha * dot(A[:,j], x)   for j = 0..3
+ * Partial sums are kept in vector lanes: the "_p" accumulators hold the
+ * re*re / im*im lane products and the "_r" accumulators hold the cross
+ * (re*im / im*re) products; the CONJ-dependent signs are applied during the
+ * horizontal reduction below.  n is assumed to be a multiple of 4; alpha is
+ * applied here (unlike the gemv_n kernels). */
+static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
+ BLASLONG i;
+ FLOAT *a0, *a1, *a2, *a3;
+ a0 = ap;
+ a1 = ap + lda;
+ a2 = a1 + lda;
+ a3 = a2 + lda;
+ /* swaps the real/imag halves of each complex pair within a vector */
+ __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr);
+ //p for positive(real*real,image*image,real*real,image*image) r for image (real*image,image*real,real*image,image*real)
+ register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0};
+ register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0};
+ register __vector float vtemp1_p = {0.0, 0.0,0.0,0.0};
+ register __vector float vtemp1_r = {0.0, 0.0,0.0,0.0};
+ register __vector float vtemp2_p = {0.0, 0.0,0.0,0.0};
+ register __vector float vtemp2_r = {0.0, 0.0,0.0,0.0};
+ register __vector float vtemp3_p = {0.0, 0.0,0.0,0.0};
+ register __vector float vtemp3_r = {0.0, 0.0,0.0,0.0};
+ __vector float* va0 = (__vector float*) a0;
+ __vector float* va1 = (__vector float*) a1;
+ __vector float* va2 = (__vector float*) a2;
+ __vector float* va3 = (__vector float*) a3;
+ __vector float* v_x = (__vector float*) x;
+
+ /* 2 vectors = 4 complex elements of x per iteration */
+ for (i = 0; i < n / 2; i+=2) {
+ register __vector float vx_0 = v_x[i]; 
+ register __vector float vx_1 = v_x[i+1]; 
+ register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask);
+ register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask);
+
+ vtemp0_p += vx_0*va0[i] + vx_1*va0[i+1] ;
+ vtemp0_r += vxr_0*va0[i] + vxr_1*va0[i+1]; 
+ vtemp1_p += vx_0*va1[i] + vx_1*va1[i+1];
+ vtemp1_r += vxr_0*va1[i] + vxr_1*va1[i+1]; 
+ vtemp2_p += vx_0*va2[i] + vx_1*va2[i+1];
+ vtemp2_r += vxr_0*va2[i] + vxr_1*va2[i+1]; 
+ vtemp3_p += vx_0*va3[i] + vx_1*va3[i+1];
+ vtemp3_r += vxr_0*va3[i] + vxr_1*va3[i+1]; 
+
+ }
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ /* horizontal reduction, plain multiply: re = sum(rr - ii),
+  * im = sum(ri + ir) */
+
+ register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - vtemp0_p[3];
+ register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1] + vtemp0_r[2] + vtemp0_r[3];
+
+ register FLOAT temp_r1 = vtemp1_p[0] - vtemp1_p[1] + vtemp1_p[2] - vtemp1_p[3];
+ register FLOAT temp_i1 = vtemp1_r[0] + vtemp1_r[1] + vtemp1_r[2] + vtemp1_r[3];
+
+ register FLOAT temp_r2 = vtemp2_p[0] - vtemp2_p[1] + vtemp2_p[2] - vtemp2_p[3];
+ register FLOAT temp_i2 = vtemp2_r[0] + vtemp2_r[1] + vtemp2_r[2] + vtemp2_r[3];
+
+ register FLOAT temp_r3 = vtemp3_p[0] - vtemp3_p[1] + vtemp3_p[2] - vtemp3_p[3];
+ register FLOAT temp_i3 = vtemp3_r[0] + vtemp3_r[1] + vtemp3_r[2] + vtemp3_r[3];
+
+#else
+ /* conjugated reduction: re = sum(rr + ii), im = sum(ri - ir) */
+ register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1] + vtemp0_p[2] + vtemp0_p[3];
+ register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1] + vtemp0_r[2] - vtemp0_r[3];
+
+ register FLOAT temp_r1 = vtemp1_p[0] + vtemp1_p[1] + vtemp1_p[2] + vtemp1_p[3];
+ register FLOAT temp_i1 = vtemp1_r[0] - vtemp1_r[1] + vtemp1_r[2] - vtemp1_r[3];
+
+ register FLOAT temp_r2 = vtemp2_p[0] + vtemp2_p[1] + vtemp2_p[2] + vtemp2_p[3];
+ register FLOAT temp_i2 = vtemp2_r[0] - vtemp2_r[1] + vtemp2_r[2] - vtemp2_r[3];
+
+ register FLOAT temp_r3 = vtemp3_p[0] + vtemp3_p[1] + vtemp3_p[2] + vtemp3_p[3];
+ register FLOAT temp_i3 = vtemp3_r[0] - vtemp3_r[1] + vtemp3_r[2] - vtemp3_r[3];
+
+#endif 
+
+#if !defined(XCONJ)
+ /* y += alpha * temp */
+
+ y[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
+ y[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
+ y[2] += alpha_r * temp_r1 - alpha_i * temp_i1;
+ y[3] += alpha_r * temp_i1 + alpha_i * temp_r1;
+ y[4] += alpha_r * temp_r2 - alpha_i * temp_i2;
+ y[5] += alpha_r * temp_i2 + alpha_i * temp_r2;
+ y[6] += alpha_r * temp_r3 - alpha_i * temp_i3;
+ y[7] += alpha_r * temp_i3 + alpha_i * temp_r3;
+
+#else
+ /* y += conj(alpha) * temp */
+
+ y[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
+ y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
+ y[2] += alpha_r * temp_r1 + alpha_i * temp_i1;
+ y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1;
+ y[4] += alpha_r * temp_r2 + alpha_i * temp_i2;
+ y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2;
+ y[6] += alpha_r * temp_r3 + alpha_i * temp_i3;
+ y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3;
+
+#endif
+
+}
+ \r
+\r
+/* Transpose CGEMV micro-kernel over 2 columns of stored A:
+ *   y[j] += alpha * dot(A[:,j], x)   for j = 0..1
+ * Same accumulation scheme as the 4x4 kernel above ("_p" holds rr/ii lane
+ * products, "_r" holds the cross terms); n assumed multiple of 4. */
+static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
+ BLASLONG i;
+ FLOAT *a0, *a1;
+ a0 = ap;
+ a1 = ap + lda; 
+ /* swaps the real/imag halves of each complex pair within a vector */
+ __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr);
+ //p for positive(real*real,image*image,real*real,image*image) r for image (real*image,image*real,real*image,image*real)
+ register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0};
+ register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0};
+ register __vector float vtemp1_p = {0.0, 0.0,0.0,0.0};
+ register __vector float vtemp1_r = {0.0, 0.0,0.0,0.0}; 
+ __vector float* va0 = (__vector float*) a0;
+ __vector float* va1 = (__vector float*) a1; 
+ __vector float* v_x = (__vector float*) x;
+
+ /* 2 vectors = 4 complex elements of x per iteration */
+ for (i = 0; i < n / 2; i+=2) {
+ register __vector float vx_0 = v_x[i]; 
+ register __vector float vx_1 = v_x[i+1]; 
+ register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask);
+ register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask);
+
+ vtemp0_p += vx_0*va0[i] + vx_1*va0[i+1] ;
+ vtemp0_r += vxr_0*va0[i] + vxr_1*va0[i+1]; 
+ vtemp1_p += vx_0*va1[i] + vx_1*va1[i+1];
+ vtemp1_r += vxr_0*va1[i] + vxr_1*va1[i+1]; 
+
+ }
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ /* plain reduction: re = sum(rr - ii), im = sum(ri + ir) */
+
+ register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - vtemp0_p[3];
+ register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1] + vtemp0_r[2] + vtemp0_r[3];
+
+ register FLOAT temp_r1 = vtemp1_p[0] - vtemp1_p[1] + vtemp1_p[2] - vtemp1_p[3];
+ register FLOAT temp_i1 = vtemp1_r[0] + vtemp1_r[1] + vtemp1_r[2] + vtemp1_r[3];
+ 
+
+#else
+ /* conjugated reduction: re = sum(rr + ii), im = sum(ri - ir) */
+ register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1] + vtemp0_p[2] + vtemp0_p[3];
+ register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1] + vtemp0_r[2] - vtemp0_r[3];
+
+ register FLOAT temp_r1 = vtemp1_p[0] + vtemp1_p[1] + vtemp1_p[2] + vtemp1_p[3];
+ register FLOAT temp_i1 = vtemp1_r[0] - vtemp1_r[1] + vtemp1_r[2] - vtemp1_r[3]; 
+
+#endif 
+
+#if !defined(XCONJ)
+ /* y += alpha * temp */
+
+ y[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
+ y[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
+ y[2] += alpha_r * temp_r1 - alpha_i * temp_i1;
+ y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; 
+
+#else
+ /* y += conj(alpha) * temp */
+
+ y[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
+ y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
+ y[2] += alpha_r * temp_r1 + alpha_i * temp_i1;
+ y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; 
+
+#endif
+ 
+}
+ \r
+\r
+/* Transpose CGEMV micro-kernel for one column of stored A:
+ *   y[0] += alpha * dot(A[:,0], x)
+ * Same accumulation scheme as the 4x4/4x2 kernels; n assumed multiple of 4. */
+static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
+ BLASLONG i; 
+ /* swaps the real/imag halves of each complex pair within a vector */
+ __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr);
+ //p for positive(real*real,image*image,real*real,image*image) r for image (real*image,image*real,real*image,image*real)
+ register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0};
+ register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0}; 
+ __vector float* va0 = (__vector float*) ap; 
+ __vector float* v_x = (__vector float*) x;
+
+ /* 2 vectors = 4 complex elements per iteration */
+ for (i = 0; i < n / 2; i+=2) {
+ register __vector float vx_0 = v_x[i]; 
+ register __vector float vx_1 = v_x[i+1]; 
+ register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask);
+ register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask);
+
+ vtemp0_p += vx_0*va0[i] + vx_1*va0[i+1] ;
+ vtemp0_r += vxr_0*va0[i] + vxr_1*va0[i+1]; 
+
+ }
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ /* plain reduction: re = sum(rr - ii), im = sum(ri + ir) */
+
+ register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - vtemp0_p[3];
+ register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1] + vtemp0_r[2] + vtemp0_r[3]; 
+
+#else
+ /* conjugated reduction: re = sum(rr + ii), im = sum(ri - ir) */
+ register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1] + vtemp0_p[2] + vtemp0_p[3];
+ register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1] + vtemp0_r[2] - vtemp0_r[3]; 
+
+#endif 
+
+#if !defined(XCONJ)
+ /* y += alpha * temp */
+
+ y[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
+ y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; 
+
+#else
+ /* y += conj(alpha) * temp */
+
+ y[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
+ y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; 
+
+#endif
+
+
+}
+ \r
+/* Pack n complex elements from src (stride inc_src in FLOAT units) into the
+ * contiguous interleaved buffer dest, so the vector kernels can assume
+ * unit-stride x. */
+static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) {
+ BLASLONG i;
+ for (i = 0; i < n; i++) {
+ *dest = *src;          /* real part */
+ *(dest + 1) = *(src + 1); /* imaginary part */
+ dest += 2;
+ src += inc_src;
+ }
+}
+\r
+/* Complex GEMV, transposed form: for each of the n columns j,
+       y[j] += alpha * dot(op(A(:,j)), op2(x))
+   where the conjugation applied to A and to x is selected by the
+   CONJ / XCONJ preprocessor macros (the four BLAS variants N/T/C/R
+   are generated from this one source).
+   The m rows are processed in blocks of at most NBMAX complex
+   elements; x is packed into `buffer` when it is not contiguous. */
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) {
+    BLASLONG i;
+    BLASLONG j;
+    FLOAT *a_ptr;
+    FLOAT *x_ptr;
+    FLOAT *y_ptr;
+
+    BLASLONG n1;
+    BLASLONG m1;
+    BLASLONG m2;
+    BLASLONG m3;
+    BLASLONG n2;
+
+    FLOAT ybuffer[8], *xbuffer;
+
+    if (m < 1) return (0);
+    if (n < 1) return (0);
+
+    /* Convert complex-element strides into FLOAT strides: one complex
+       element is two FLOATs. From here on inc_x == 2 means "contiguous". */
+    inc_x <<= 1;
+    inc_y <<= 1;
+    lda <<= 1;
+
+    xbuffer = buffer;
+
+    n1 = n >> 2;    /* groups of four columns */
+    n2 = n & 3;     /* leftover columns */
+
+    m3 = m & 3;     /* leftover rows, handled by the scalar tail below */
+    m1 = m - m3;
+    m2 = (m & (NBMAX - 1)) - m3;  /* size of the last partial row block */
+
+    BLASLONG NB = NBMAX;
+
+    while (NB == NBMAX) {
+
+        m1 -= NB;
+        if (m1 < 0) {
+            if (m2 == 0) break;   /* row count was an exact multiple of NBMAX */
+            NB = m2;              /* final, shorter block */
+        }
+
+        y_ptr = y;
+        a_ptr = a;
+        x_ptr = x;
+
+        /* Pack x into the contiguous buffer unless it already is. */
+        if (inc_x != 2)
+            copy_x(NB, x_ptr, xbuffer, inc_x);
+        else
+            xbuffer = x_ptr;
+
+        if (inc_y == 2) {
+
+            /* y is contiguous: the kernels accumulate directly into y. */
+            for (i = 0; i < n1; i++) {
+                cgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i);
+                a_ptr += lda << 2;
+                y_ptr += 8;
+
+            }
+
+            if (n2 & 2) {
+                cgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i);
+                a_ptr += lda << 1;
+                y_ptr += 4;
+
+            }
+
+            if (n2 & 1) {
+                cgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i);
+                a_ptr += lda;
+                y_ptr += 2;
+
+            }
+
+        } else {
+
+            /* Strided y: accumulate into ybuffer, then scatter-add into y. */
+            for (i = 0; i < n1; i++) {
+                memset(ybuffer, 0, sizeof (ybuffer));
+                cgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i);
+
+                a_ptr += lda << 2;
+
+                y_ptr[0] += ybuffer[0];
+                y_ptr[1] += ybuffer[1];
+                y_ptr += inc_y;
+                y_ptr[0] += ybuffer[2];
+                y_ptr[1] += ybuffer[3];
+                y_ptr += inc_y;
+                y_ptr[0] += ybuffer[4];
+                y_ptr[1] += ybuffer[5];
+                y_ptr += inc_y;
+                y_ptr[0] += ybuffer[6];
+                y_ptr[1] += ybuffer[7];
+                y_ptr += inc_y;
+
+            }
+
+            for (i = 0; i < n2; i++) {
+                memset(ybuffer, 0, sizeof (ybuffer));
+                cgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i);
+                a_ptr += lda;
+                y_ptr[0] += ybuffer[0];
+                y_ptr[1] += ybuffer[1];
+                y_ptr += inc_y;
+
+            }
+
+        }
+        a += 2 * NB;       /* advance A down by NB rows (FLOAT units) */
+        x += NB * inc_x;   /* advance x by NB complex elements */
+    }
+
+    if (m3 == 0) return (0);
+
+    /* Scalar tail: the remaining m3 (1..3) rows, over every column. */
+    x_ptr = x;
+    j = 0;
+    a_ptr = a;
+    y_ptr = y;
+
+    if (m3 == 3) {
+
+        FLOAT temp_r;
+        FLOAT temp_i;
+        FLOAT x0 = x_ptr[0];
+        FLOAT x1 = x_ptr[1];
+        x_ptr += inc_x;
+        FLOAT x2 = x_ptr[0];
+        FLOAT x3 = x_ptr[1];
+        x_ptr += inc_x;
+        FLOAT x4 = x_ptr[0];
+        FLOAT x5 = x_ptr[1];
+        while (j < n) {
+/* First branch: A not conjugated in the product (plain or doubly-conjugated). */
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+            temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
+            temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
+            temp_r += a_ptr[2] * x2 - a_ptr[3] * x3;
+            temp_i += a_ptr[2] * x3 + a_ptr[3] * x2;
+            temp_r += a_ptr[4] * x4 - a_ptr[5] * x5;
+            temp_i += a_ptr[4] * x5 + a_ptr[5] * x4;
+#else
+
+            temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
+            temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
+            temp_r += a_ptr[2] * x2 + a_ptr[3] * x3;
+            temp_i += a_ptr[2] * x3 - a_ptr[3] * x2;
+            temp_r += a_ptr[4] * x4 + a_ptr[5] * x5;
+            temp_i += a_ptr[4] * x5 - a_ptr[5] * x4;
+#endif
+
+/* Apply alpha; XCONJ selects the conjugated update of y. */
+#if !defined(XCONJ) 
+            y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i;
+            y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r;
+#else
+            y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i;
+            y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r;
+#endif
+
+            a_ptr += lda;
+            y_ptr += inc_y;
+            j++;
+        }
+        return (0);
+    }
+
+    if (m3 == 2) {
+
+        FLOAT temp_r;
+        FLOAT temp_i;
+        FLOAT temp_r1;
+        FLOAT temp_i1;
+        FLOAT x0 = x_ptr[0];
+        FLOAT x1 = x_ptr[1];
+        x_ptr += inc_x;
+        FLOAT x2 = x_ptr[0];
+        FLOAT x3 = x_ptr[1];
+
+        /* Columns two at a time, then the odd one below. */
+        while (j < (n & -2)) {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+            temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
+            temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
+            temp_r += a_ptr[2] * x2 - a_ptr[3] * x3;
+            temp_i += a_ptr[2] * x3 + a_ptr[3] * x2;
+            a_ptr += lda;
+            temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1;
+            temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0;
+            temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3;
+            temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2;
+#else
+
+            temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
+            temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
+            temp_r += a_ptr[2] * x2 + a_ptr[3] * x3;
+            temp_i += a_ptr[2] * x3 - a_ptr[3] * x2;
+            a_ptr += lda;
+            temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1;
+            temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0;
+            temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3;
+            temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2;
+#endif
+
+#if !defined(XCONJ) 
+            y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i;
+            y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r;
+            y_ptr += inc_y;
+            y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1;
+            y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1;
+#else
+            y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i;
+            y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r;
+            y_ptr += inc_y;
+            y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1;
+            y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1;
+#endif
+
+            a_ptr += lda;
+            y_ptr += inc_y;
+            j += 2;
+        }
+
+        while (j < n) {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+            temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
+            temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
+            temp_r += a_ptr[2] * x2 - a_ptr[3] * x3;
+            temp_i += a_ptr[2] * x3 + a_ptr[3] * x2;
+#else
+
+            temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
+            temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
+            temp_r += a_ptr[2] * x2 + a_ptr[3] * x3;
+            temp_i += a_ptr[2] * x3 - a_ptr[3] * x2;
+#endif
+
+#if !defined(XCONJ) 
+            y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i;
+            y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r;
+#else
+            y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i;
+            y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r;
+#endif
+
+            a_ptr += lda;
+            y_ptr += inc_y;
+            j++;
+        }
+
+        return (0);
+    }
+
+    if (m3 == 1) {
+
+        FLOAT temp_r;
+        FLOAT temp_i;
+        FLOAT temp_r1;
+        FLOAT temp_i1;
+        FLOAT x0 = x_ptr[0];
+        FLOAT x1 = x_ptr[1];
+
+        /* Columns two at a time, then the odd one below. */
+        while (j < (n & -2)) {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+            temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
+            temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
+            a_ptr += lda;
+            temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1;
+            temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0;
+#else
+
+            temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
+            temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
+            a_ptr += lda;
+            temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1;
+            temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0;
+#endif
+
+#if !defined(XCONJ) 
+            y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i;
+            y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r;
+            y_ptr += inc_y;
+            y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1;
+            y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1;
+#else
+            y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i;
+            y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r;
+            y_ptr += inc_y;
+            y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1;
+            y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1;
+#endif
+
+            a_ptr += lda;
+            y_ptr += inc_y;
+            j += 2;
+        }
+
+        while (j < n) {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+            temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
+            temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
+#else
+
+            temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
+            temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
+#endif
+
+#if !defined(XCONJ) 
+            y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i;
+            y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r;
+#else
+            y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i;
+            y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r;
+#endif
+
+            a_ptr += lda;
+            y_ptr += inc_y;
+            j++;
+        }
+        return (0);
+    }
+
+    return (0);
+
+}
+\r
#include "common.h"
-#define NBMAX 8192
-#define PREFETCH 1
+#define NBMAX 1024
+//#define PREFETCH 1
#include <altivec.h>
#define HAVE_KERNEL4x8_ASM 1
#endif\r
#define CABS1(x,i) ABS(x[i])+ABS(x[i+1])\r
\r
+#define USE_MASK_PERMUTATIONS 1 //with this type of permutation gcc output a little faster code\r
\r
+#if !defined(USE_MASK_PERMUTATIONS)\r
+\r
+/* Merge the even-indexed word elements of a and b (Power ISA vmrgew):
+   result = { a[0], b[0], a[2], b[2] }. Written as inline asm, presumably
+   because older compilers lacked a builtin for this instruction — confirm
+   before replacing with vec_mergee(). */
+static inline __attribute__((always_inline)) __vector float mvec_mergee(__vector float a,__vector float b ){
+  __vector float result;
+  __asm__ ( 
+    "vmrgew %0,%1,%2;\n" 
+    : "=v" (result) 
+    : "v" (a), 
+      "v" (b) 
+    : );
+  return result;
+}
+\r
+/* Merge the odd-indexed word elements of a and b (Power ISA vmrgow):
+   result = { a[1], b[1], a[3], b[3] }. Counterpart of mvec_mergee. */
+static inline __attribute__((always_inline)) __vector float mvec_mergeo(__vector float a,__vector float b ){
+  __vector float result;
+  __asm__ ( 
+    "vmrgow %0,%1,%2;\n" 
+    : "=v" (result) 
+    : "v" (a), 
+      "v" (b) 
+    : );
+  return result;
+}
+\r
+#endif\r
\r
- \r
/**\r
* Find maximum index \r
* Warning: requirements n>0 and n % 32 == 0\r
\r
BLASLONG index;\r
BLASLONG i;\r
+#if defined(USE_MASK_PERMUTATIONS) \r
register __vector unsigned int static_index0 = {0,1,2,3};\r
+#else\r
+ register __vector unsigned int static_index0 = {2,0,3,1};\r
+#endif \r
register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register\r
register __vector unsigned int temp1= temp0<<1; //{8,8,8,8}\r
- register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7};\r
- register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11};\r
- register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15};\r
+ register __vector unsigned int static_index1=static_index0 +temp0; \r
+ register __vector unsigned int static_index2=static_index0 +temp1; \r
+ register __vector unsigned int static_index3=static_index1 +temp1; \r
temp0=vec_xor(temp0,temp0);\r
temp1=temp1 <<1 ; //{16,16,16,16}\r
register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32}\r
register __vector float quadruple_values={0,0,0,0};\r
\r
register __vector float * v_ptrx=(__vector float *)x;\r
+#if defined(USE_MASK_PERMUTATIONS) \r
register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27}; \r
register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; \r
- for(; i<n; i+=32){\r
+#endif \r
+ for(; i<n; i+=32 ){\r
//absolute temporary complex vectors\r
register __vector float v0=vec_abs(v_ptrx[0]);\r
register __vector float v1=vec_abs(v_ptrx[1]);\r
register __vector float v7=vec_abs(v_ptrx[7]);\r
\r
//pack complex real and imaginary parts together to sum real+image\r
+#if defined(USE_MASK_PERMUTATIONS) \r
register __vector float t1=vec_perm(v0,v1,real_pack_mask);\r
- register __vector float ti=vec_perm(v0,v1,image_pack_mask); \r
+ register __vector float ti=vec_perm(v0,v1,image_pack_mask); \r
+ \r
v0=t1+ti; //sum quadruple real with quadruple image\r
register __vector float t2=vec_perm(v2,v3,real_pack_mask);\r
register __vector float ti2=vec_perm(v2,v3,image_pack_mask); \r
t2=vec_perm(v6,v7,real_pack_mask);\r
ti2=vec_perm(v6,v7,image_pack_mask); \r
v3=t2+ti2;\r
+#else\r
+ register __vector float t1=mvec_mergee(v0,v1);\r
+ register __vector float ti=mvec_mergeo(v0,v1); \r
+ \r
+ v0=t1+ti; //sum quadruple real with quadruple image\r
+ register __vector float t2= mvec_mergee(v2,v3);\r
+ register __vector float ti2=mvec_mergeo(v2,v3); \r
+ v1=t2+ti2;\r
+ t1=mvec_mergee(v4,v5);\r
+ ti=mvec_mergeo(v4,v5); \r
+ v2=t1+ti; //sum\r
+ t2=mvec_mergee(v6,v7);\r
+ ti2=mvec_mergeo(v6,v7); \r
+ v3=t2+ti2;\r
+\r
+#endif\r
// now we have 16 summed elements . lets compare them\r
v_ptrx+=8;\r
register __vector bool int r1=vec_cmpgt(v1,v0);\r
v7=vec_abs(v_ptrx[7]);\r
\r
//pack complex real and imaginary parts together to sum real+image\r
+#if defined(USE_MASK_PERMUTATIONS) \r
t1=vec_perm(v0,v1,real_pack_mask);\r
- ti=vec_perm(v0,v1,image_pack_mask); \r
+ ti=vec_perm(v0,v1,image_pack_mask); \r
+ \r
v0=t1+ti; //sum quadruple real with quadruple image\r
t2=vec_perm(v2,v3,real_pack_mask);\r
ti2=vec_perm(v2,v3,image_pack_mask); \r
t2=vec_perm(v6,v7,real_pack_mask);\r
ti2=vec_perm(v6,v7,image_pack_mask); \r
v3=t2+ti2;\r
+#else\r
+ t1=mvec_mergee(v0,v1);\r
+ ti=mvec_mergeo(v0,v1); \r
+ \r
+ v0=t1+ti; //sum quadruple real with quadruple image\r
+ t2=mvec_mergee(v2,v3);\r
+ ti2=mvec_mergeo(v2,v3); \r
+ v1=t2+ti2;\r
+ t1=mvec_mergee(v4,v5);\r
+ ti=mvec_mergeo(v4,v5); \r
+ v2=t1+ti; //sum\r
+ t2=mvec_mergee(v6,v7);\r
+ ti2=mvec_mergeo(v6,v7); \r
+ v3=t2+ti2;\r
+\r
+#endif\r
// now we have 16 summed elements {from 16 to 31} . lets compare them\r
v_ptrx+=8;\r
r1=vec_cmpgt(v1,v0);\r
--- /dev/null
+/***************************************************************************\r
+Copyright (c) 2019, The OpenBLAS Project\r
+All rights reserved.\r
+Redistribution and use in source and binary forms, with or without\r
+modification, are permitted provided that the following conditions are\r
+met:\r
+1. Redistributions of source code must retain the above copyright\r
+notice, this list of conditions and the following disclaimer.\r
+2. Redistributions in binary form must reproduce the above copyright\r
+notice, this list of conditions and the following disclaimer in\r
+the documentation and/or other materials provided with the\r
+distribution.\r
+3. Neither the name of the OpenBLAS project nor the names of\r
+its contributors may be used to endorse or promote products\r
+derived from this software without specific prior written permission.\r
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"\r
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\r
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE\r
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE\r
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\r
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\r
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\r
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\r
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE\r
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
+*****************************************************************************/\r
+\r
+\r
+#include "common.h"\r
+\r
+#define NBMAX 2048\r
+\r
+/* GEMV-N micro kernel: y(0:n) += alpha * A(:,j) * x[j] for 8 columns.
+   ap[0..3] point to the first four columns; the next four live at +lda4.
+   Assumes n is a multiple of 4 (guaranteed by the 4x-blocked caller) and
+   that direct __vector float loads are valid for these pointers —
+   NOTE(review): i.e. 16-byte alignment of a/y; confirm for the target. */
+static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
+{
+
+    BLASLONG i;
+    FLOAT *a0,*a1,*a2,*a3,*b0,*b1,*b2,*b3;  
+    FLOAT x0,x1,x2,x3,x4,x5,x6,x7;
+    a0 = ap[0];
+    a1 = ap[1];
+    a2 = ap[2];
+    a3 = ap[3];  
+    b0 = a0 + lda4 ;
+    b1 = a1 + lda4 ;
+    b2 = a2 + lda4 ;
+    b3 = a3 + lda4 ;
+    /* Fold alpha into the broadcast x values once, outside the loop. */
+    x0 = xo[0] * *alpha;
+    x1 = xo[1] * *alpha;
+    x2 = xo[2] * *alpha;
+    x3 = xo[3] * *alpha;
+    x4 = xo[4] * *alpha;
+    x5 = xo[5] * *alpha;
+    x6 = xo[6] * *alpha;
+    x7 = xo[7] * *alpha;
+    __vector float* va0 = (__vector float*)a0;
+    __vector float* va1 = (__vector float*)a1;
+    __vector float* va2 = (__vector float*)a2;
+    __vector float* va3 = (__vector float*)a3;
+    __vector float* vb0 = (__vector float*)b0;
+    __vector float* vb1 = (__vector float*)b1;
+    __vector float* vb2 = (__vector float*)b2;
+    __vector float* vb3 = (__vector float*)b3;  
+    
+    __vector float v_x0 = {x0,x0,x0,x0};
+    __vector float v_x1 = {x1,x1,x1,x1};
+    __vector float v_x2 = {x2,x2,x2,x2};
+    __vector float v_x3 = {x3,x3,x3,x3};
+    __vector float v_x4 = {x4,x4,x4,x4};
+    __vector float v_x5 = {x5,x5,x5,x5};
+    __vector float v_x6 = {x6,x6,x6,x6};
+    __vector float v_x7 = {x7,x7,x7,x7};
+    __vector float* v_y =(__vector float*)y;   
+    
+    /* Four rows of y per iteration. */
+    for ( i=0; i< n/4; i++)
+    {
+        register __vector float vy=v_y[i];
+        vy += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ;    
+        vy += v_x4 * vb0[i] + v_x5 * vb1[i] + v_x6 * vb2[i] + v_x7 * vb3[i] ;
+        v_y[i] =vy;    
+    }
+
+}
+ \r
+/* GEMV-N micro kernel: y(0:n) += alpha * A(:,j) * x[j] for 4 columns
+   (ap[0..3]). Requires n to be a multiple of 4. */
+static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
+{
+    BLASLONG i;
+    FLOAT x0,x1,x2,x3;
+    /* Pre-scale the broadcast x values by alpha. */
+    x0 = xo[0] * *alpha;
+    x1 = xo[1] * *alpha;
+    x2 = xo[2] * *alpha;
+    x3 = xo[3] * *alpha;
+    __vector float v_x0 = {x0,x0,x0,x0};
+    __vector float v_x1 = {x1,x1,x1,x1};
+    __vector float v_x2 = {x2,x2,x2,x2};
+    __vector float v_x3 = {x3,x3,x3,x3};
+    __vector float* v_y =(__vector float*)y;   
+    __vector float* va0 = (__vector float*)ap[0];
+    __vector float* va1 = (__vector float*)ap[1];
+    __vector float* va2 = (__vector float*)ap[2];
+    __vector float* va3 = (__vector float*)ap[3];  
+    
+    for ( i=0; i< n/4; i++ )
+    {
+        register __vector float vy=v_y[i];
+        vy += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ;    
+        v_y[i] =vy;    
+    }
+
+}  
+\r
+/* GEMV-N micro kernel: y(0:n) += alpha * (x[0]*A(:,0) + x[1]*A(:,1))
+   for the 2 columns ap[0], ap[1]. Requires n to be a multiple of 4. */
+static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+
+    BLASLONG i;
+    FLOAT x0,x1;
+    x0 = x[0] * *alpha;
+    x1 = x[1] * *alpha;   
+    __vector float v_x0 = {x0,x0,x0,x0};
+    __vector float v_x1 = {x1,x1,x1,x1};   
+    __vector float* v_y =(__vector float*)y;   
+    __vector float* va0 = (__vector float*)ap[0];
+    __vector float* va1 = (__vector float*)ap[1];   
+    
+    for ( i=0; i< n/4; i++ )
+    {   
+        v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] ;    
+    }
+
+}  
+ \r
+ \r
+/* GEMV-N micro kernel: y(0:n) += alpha * x[0] * A(:,0) for the single
+   column ap. Requires n to be a multiple of 4. */
+static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+
+    BLASLONG i;
+    FLOAT x0 ;
+    x0 = x[0] * *alpha;   
+    __vector float v_x0 = {x0,x0,x0,x0};   
+    __vector float* v_y =(__vector float*)y;   
+    __vector float* va0 = (__vector float*)ap;   
+    
+    for ( i=0; i< n/4; i++ )
+    {   
+        v_y[i] += v_x0 * va0[i] ;    
+    }
+
+}
+ \r
+/* Scatter-add: fold n contiguous values from src into the strided
+   destination vector dest (stride inc_dest in FLOAT units). */
+static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
+{
+    BLASLONG k;
+
+    for (k = n; k > 0; k--) {
+        *dest += *src++;
+        dest += inc_dest;
+    }
+}
+\r
+/* Single-precision GEMV, non-transposed form: y(0:m) += alpha * A * x(0:n).
+   Rows are processed in blocks of at most NBMAX; when y is strided the
+   block result is accumulated in `buffer` and scatter-added by add_y();
+   when x is strided it is packed four elements at a time into xbuffer.
+   The m%4 leftover rows are handled by the scalar tails at the bottom.
+   Fix: the ybuffer memset used a hard-coded element size of 4 bytes;
+   use sizeof(FLOAT) so the clear stays correct whatever FLOAT is. */
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+{
+    BLASLONG i;
+    FLOAT *a_ptr;
+    FLOAT *x_ptr;
+    FLOAT *y_ptr;
+    FLOAT *ap[4];
+    BLASLONG n1;
+    BLASLONG m1;
+    BLASLONG m2;
+    BLASLONG m3;
+    BLASLONG n2;
+    BLASLONG lda4 = lda << 2;
+    BLASLONG lda8 = lda << 3;
+    FLOAT xbuffer[8],*ybuffer;
+
+    if ( m < 1 ) return(0);
+    if ( n < 1 ) return(0);
+
+    ybuffer = buffer;
+    
+    /* Contiguous x allows the 8-column kernel; strided x is packed in
+       groups of four, so columns are consumed four at a time. */
+    if ( inc_x == 1 )
+    {
+        n1 = n >> 3 ;
+        n2 = n & 7 ;
+    }
+    else
+    {
+        n1 = n >> 2 ;
+        n2 = n & 3 ;
+
+    }
+    
+    m3 = m & 3 ;          /* leftover rows for the scalar tails */
+    m1 = m & -4 ;
+    m2 = (m & (NBMAX-1)) - m3 ;   /* size of the last partial block */
+
+
+    y_ptr = y;
+
+    BLASLONG NB = NBMAX;
+
+    while ( NB == NBMAX )
+    {
+        
+        m1 -= NB;
+        if ( m1 < 0)
+        {
+            if ( m2 == 0 ) break;    
+            NB = m2;
+        }
+        
+        a_ptr = a;
+        x_ptr = x;
+        
+        ap[0] = a_ptr;
+        ap[1] = a_ptr + lda;
+        ap[2] = ap[1] + lda;
+        ap[3] = ap[2] + lda;
+
+        if ( inc_y != 1 )
+            /* was NB*4: element size must not be hard-coded */
+            memset(ybuffer,0,NB*sizeof(FLOAT));
+        else
+            ybuffer = y_ptr;
+
+        if ( inc_x == 1 )
+        {
+
+
+            for( i = 0; i < n1 ; i++)
+            {
+                sgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4,&alpha);
+                ap[0] += lda8; 
+                ap[1] += lda8; 
+                ap[2] += lda8; 
+                ap[3] += lda8; 
+                a_ptr += lda8;
+                x_ptr += 8;    
+            }
+
+
+            if ( n2 & 4 )
+            {
+                sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha);
+                ap[0] += lda4; 
+                ap[1] += lda4; 
+                ap[2] += lda4; 
+                ap[3] += lda4; 
+                a_ptr += lda4;
+                x_ptr += 4;    
+            }
+
+            if ( n2 & 2 )
+            {
+                sgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha);
+                a_ptr += lda*2;
+                x_ptr += 2;    
+            }
+
+
+            if ( n2 & 1 )
+            {
+                sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha);    
+                a_ptr += lda;
+                x_ptr += 1;    
+            }
+
+
+        }
+        else
+        {
+            /* Strided x: pack four elements, then run the 4-column kernel. */
+            for( i = 0; i < n1 ; i++)
+            {
+                xbuffer[0] = x_ptr[0];
+                x_ptr += inc_x;    
+                xbuffer[1] = x_ptr[0];
+                x_ptr += inc_x;    
+                xbuffer[2] = x_ptr[0];
+                x_ptr += inc_x;    
+                xbuffer[3] = x_ptr[0];
+                x_ptr += inc_x;    
+                sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha);
+                ap[0] += lda4; 
+                ap[1] += lda4; 
+                ap[2] += lda4; 
+                ap[3] += lda4; 
+                a_ptr += lda4;
+            }
+
+            for( i = 0; i < n2 ; i++)
+            {
+                xbuffer[0] = x_ptr[0];
+                x_ptr += inc_x;    
+                sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha);
+                a_ptr += lda;
+
+            }
+
+        }
+
+        a += NB;
+        if ( inc_y != 1 )
+        {
+            add_y(NB,ybuffer,y_ptr,inc_y);
+            y_ptr += NB * inc_y;
+        }
+        else
+            y_ptr += NB ;
+
+    }
+
+    if ( m3 == 0 ) return(0);
+
+    /* Scalar tails: 3, 2 or 1 leftover rows over all n columns. */
+    if ( m3 == 3 )
+    {
+        a_ptr = a;
+        x_ptr = x;
+        FLOAT temp0 = 0.0;
+        FLOAT temp1 = 0.0;
+        FLOAT temp2 = 0.0;
+        if ( lda == 3 && inc_x ==1 )
+        {
+
+            for( i = 0; i < ( n & -4 ); i+=4 )
+            {
+
+                temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1];
+                temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1];
+                temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1];
+
+                temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3];
+                temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3];
+                temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3];
+
+                a_ptr += 12;
+                x_ptr += 4;
+            }
+
+            for( ; i < n; i++ )
+            {
+                temp0 += a_ptr[0] * x_ptr[0];
+                temp1 += a_ptr[1] * x_ptr[0];
+                temp2 += a_ptr[2] * x_ptr[0];
+                a_ptr += 3;
+                x_ptr ++;
+            }
+
+        }
+        else
+        {
+
+            for( i = 0; i < n; i++ )
+            {
+                temp0 += a_ptr[0] * x_ptr[0];
+                temp1 += a_ptr[1] * x_ptr[0];
+                temp2 += a_ptr[2] * x_ptr[0];
+                a_ptr += lda;
+                x_ptr += inc_x;
+
+
+            }
+
+        }
+        y_ptr[0] += alpha * temp0;
+        y_ptr += inc_y;
+        y_ptr[0] += alpha * temp1;
+        y_ptr += inc_y;
+        y_ptr[0] += alpha * temp2;
+        return(0);
+    }
+
+
+    if ( m3 == 2 )
+    {
+        a_ptr = a;
+        x_ptr = x;
+        FLOAT temp0 = 0.0;
+        FLOAT temp1 = 0.0;
+        if ( lda == 2 && inc_x ==1 )
+        {
+
+            for( i = 0; i < (n & -4) ; i+=4 )
+            {
+                temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1];
+                temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1];
+                temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3];
+                temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3];
+                a_ptr += 8;
+                x_ptr += 4;
+
+            }
+
+
+            for( ; i < n; i++ )
+            {
+                temp0 += a_ptr[0] * x_ptr[0];
+                temp1 += a_ptr[1] * x_ptr[0];
+                a_ptr += 2;
+                x_ptr ++;
+            }
+
+        }
+        else
+        {
+
+            for( i = 0; i < n; i++ )
+            {
+                temp0 += a_ptr[0] * x_ptr[0];
+                temp1 += a_ptr[1] * x_ptr[0];
+                a_ptr += lda;
+                x_ptr += inc_x;
+
+
+            }
+
+        }
+        y_ptr[0] += alpha * temp0;
+        y_ptr += inc_y;
+        y_ptr[0] += alpha * temp1;
+        return(0);
+    }
+
+    if ( m3 == 1 )
+    {
+        a_ptr = a;
+        x_ptr = x;
+        FLOAT temp = 0.0;
+        if ( lda == 1 && inc_x ==1 )
+        {
+
+            for( i = 0; i < (n & -4); i+=4 )
+            {
+                temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3];
+        
+            }
+
+            for( ; i < n; i++ )
+            {
+                temp += a_ptr[i] * x_ptr[i];
+            }
+
+        }
+        else
+        {
+
+            for( i = 0; i < n; i++ )
+            {
+                temp += a_ptr[0] * x_ptr[0];
+                a_ptr += lda;
+                x_ptr += inc_x;
+            }
+
+        }
+        y_ptr[0] += alpha * temp;
+        return(0);
+    }
+
+
+    return(0);
+}
+
+
+\r
+\r
--- /dev/null
+/***************************************************************************\r
+Copyright (c) 2019, The OpenBLAS Project\r
+All rights reserved.\r
+Redistribution and use in source and binary forms, with or without\r
+modification, are permitted provided that the following conditions are\r
+met:\r
+1. Redistributions of source code must retain the above copyright\r
+notice, this list of conditions and the following disclaimer.\r
+2. Redistributions in binary form must reproduce the above copyright\r
+notice, this list of conditions and the following disclaimer in\r
+the documentation and/or other materials provided with the\r
+distribution.\r
+3. Neither the name of the OpenBLAS project nor the names of\r
+its contributors may be used to endorse or promote products\r
+derived from this software without specific prior written permission.\r
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"\r
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\r
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE\r
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE\r
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\r
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\r
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\r
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\r
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE\r
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
+ *****************************************************************************/\r
+\r
+#include "common.h"\r
+\r
+#define NBMAX 2048\r
+\r
+#include <altivec.h> \r
+ \r
+/* GEMV-T micro kernel: y[j] += alpha * dot(A(:,j), x(0:n)) for 8
+   consecutive columns starting at ap (column stride lda). Each dot
+   product is accumulated in a 4-lane vector and horizontally summed
+   at the end. Requires n to be a multiple of 4. */
+static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) {
+    BLASLONG i; 
+    FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7;
+    __vector float *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x;
+    register __vector float temp0 = {0,0,0,0};
+    register __vector float temp1 = {0,0,0,0};
+    register __vector float temp2 = {0,0,0,0};
+    register __vector float temp3 = {0,0,0,0};
+    register __vector float temp4 = {0,0,0,0};
+    register __vector float temp5 = {0,0,0,0};
+    register __vector float temp6 = {0,0,0,0};
+    register __vector float temp7 = {0,0,0,0};
+
+    a0 = ap;
+    a1 = ap + lda;
+    a2 = a1 + lda;
+    a3 = a2 + lda;
+    a4 = a3 + lda;
+    a5 = a4 + lda;
+    a6 = a5 + lda;
+    a7 = a6 + lda;
+    va0 = (__vector float*) a0;
+    va1 = (__vector float*) a1;
+    va2 = (__vector float*) a2;
+    va3 = (__vector float*) a3;
+    va4 = (__vector float*) a4;
+    va5 = (__vector float*) a5;
+    va6 = (__vector float*) a6;
+    va7 = (__vector float*) a7;
+    v_x = (__vector float*) x;
+    
+    
+    for (i = 0; i < n/4; i ++) {
+        temp0 += v_x[i] * va0[i];
+        temp1 += v_x[i] * va1[i];
+        temp2 += v_x[i] * va2[i];
+        temp3 += v_x[i] * va3[i];
+        temp4 += v_x[i] * va4[i];
+        temp5 += v_x[i] * va5[i];
+        temp6 += v_x[i] * va6[i];
+        temp7 += v_x[i] * va7[i];    
+    }
+    
+    /* Horizontal sums, scaled by alpha, accumulated into y. */
+    y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]);
+    y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]);
+    y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]);
+    y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]);
+
+    y[4] += alpha * (temp4[0] + temp4[1]+temp4[2] + temp4[3]);
+    y[5] += alpha * (temp5[0] + temp5[1]+temp5[2] + temp5[3]);
+    y[6] += alpha * (temp6[0] + temp6[1]+temp6[2] + temp6[3]);
+    y[7] += alpha * (temp7[0] + temp7[1]+temp7[2] + temp7[3]);
+
+}
+ \r
+\r
+/* GEMV-T micro kernel: y[j] += alpha * dot(A(:,j), x(0:n)) for 4
+   consecutive columns starting at ap. Requires n to be a multiple of 4. */
+static void sgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) {
+    BLASLONG i = 0;
+    FLOAT *a0, *a1, *a2, *a3;
+    a0 = ap;
+    a1 = ap + lda;
+    a2 = a1 + lda;
+    a3 = a2 + lda;
+    __vector float* va0 = (__vector float*) a0;
+    __vector float* va1 = (__vector float*) a1;
+    __vector float* va2 = (__vector float*) a2;
+    __vector float* va3 = (__vector float*) a3;
+    __vector float* v_x = (__vector float*) x;
+    register __vector float temp0 = {0,0,0,0};
+    register __vector float temp1 = {0,0,0,0};
+    register __vector float temp2 = {0,0,0,0};
+    register __vector float temp3 = {0,0,0,0};   
+
+    for (i = 0; i < n / 4; i ++) {
+        temp0 += v_x[i] * va0[i];
+        temp1 += v_x[i] * va1[i];
+        temp2 += v_x[i] * va2[i];
+        temp3 += v_x[i] * va3[i];    
+    }
+    
+    /* Horizontal sums, scaled by alpha, accumulated into y. */
+    y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]);
+    y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]);
+    y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]);
+    y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]);
+
+}
+ \r
+\r
+/* GEMV-T micro kernel for 2 columns; unlike the other kernels it writes
+   a strided y directly: results land in y[0] and y[inc_y].
+   Requires n to be a multiple of 4. */
+static void sgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) {
+
+    BLASLONG i;
+    FLOAT *a0, *a1;
+    a0 = ap;
+    a1 = ap + lda;
+    __vector float* va0 = (__vector float*) a0;
+    __vector float* va1 = (__vector float*) a1;
+    __vector float* v_x = (__vector float*) x;
+    __vector float temp0 = {0,0,0,0};
+    __vector float temp1 = {0,0,0,0};
+    for (i = 0; i < n / 4; i ++) {
+        temp0 += v_x[i] * va0[i];
+        temp1 += v_x[i] * va1[i];
+    }
+
+
+
+    y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]);
+    y[inc_y] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]);   
+}
+\r
+/* GEMV-T micro kernel for a single column:
+   y[0] += alpha * dot(ap(0:n), x(0:n)). Requires n to be a multiple of 4. */
+static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) {
+
+    BLASLONG i;
+    FLOAT *a0;
+    a0 = ap;
+    __vector float* va0 = (__vector float*) a0;
+    __vector float* v_x = (__vector float*) x;
+    __vector float temp0 = {0,0,0,0};
+    for (i = 0; i < n / 4; i ++) {
+        temp0 += v_x[i] * va0[i] ;
+    }
+
+    y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]);
+
+}
+\r
+/* Pack n strided elements of src (stride inc_src) into the contiguous
+   buffer dest. */
+static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) {
+    BLASLONG k;
+    for (k = 0; k < n; k++) {
+        dest[k] = *src;
+        src += inc_src;
+    }
+}
+\r
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) {\r
+ BLASLONG i;\r
+ BLASLONG j;\r
+ FLOAT *a_ptr;\r
+ FLOAT *x_ptr;\r
+ FLOAT *y_ptr;\r
+\r
+ BLASLONG n1;\r
+ BLASLONG m1;\r
+ BLASLONG m2;\r
+ BLASLONG m3;\r
+ BLASLONG n2;\r
+\r
+ FLOAT ybuffer[8], *xbuffer;\r
+\r
+ if (m < 1) return (0);\r
+ if (n < 1) return (0);\r
+\r
+ xbuffer = buffer;\r
+\r
+ n1 = n >> 3;\r
+ n2 = n & 7;\r
+\r
+ m3 = m & 3;\r
+ m1 = m - m3;\r
+ m2 = (m & (NBMAX - 1)) - m3;\r
+\r
+ BLASLONG NB = NBMAX;\r
+\r
+ while (NB == NBMAX) {\r
+\r
+ m1 -= NB;\r
+ if (m1 < 0) {\r
+ if (m2 == 0) break;\r
+ NB = m2;\r
+ }\r
+\r
+ y_ptr = y;\r
+ a_ptr = a;\r
+ x_ptr = x;\r
+\r
+ if (inc_x != 1)\r
+ copy_x(NB, x_ptr, xbuffer, inc_x);\r
+ else\r
+ xbuffer = x_ptr;\r
+\r
+ BLASLONG lda8 = lda << 3;\r
+\r
+\r
+ if (inc_y == 1) {\r
+\r
+ for (i = 0; i < n1; i++) {\r
+ \r
+ sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha);\r
+ \r
+ y_ptr += 8;\r
+ a_ptr += lda8;\r
+ \r
+ }\r
+\r
+ } else {\r
+ \r
+ for (i = 0; i < n1; i++) {\r
+ ybuffer[0] = 0;\r
+ ybuffer[1] = 0;\r
+ ybuffer[2] = 0;\r
+ ybuffer[3] = 0;\r
+ ybuffer[4] = 0;\r
+ ybuffer[5] = 0;\r
+ ybuffer[6] = 0;\r
+ ybuffer[7] = 0;\r
+ sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha);\r
+\r
+ \r
+\r
+ *y_ptr += ybuffer[0];\r
+ y_ptr += inc_y;\r
+ *y_ptr += ybuffer[1];\r
+ y_ptr += inc_y;\r
+ *y_ptr += ybuffer[2];\r
+ y_ptr += inc_y;\r
+ *y_ptr += ybuffer[3];\r
+ y_ptr += inc_y;\r
+\r
+ *y_ptr += ybuffer[4];\r
+ y_ptr += inc_y;\r
+ *y_ptr += ybuffer[5];\r
+ y_ptr += inc_y;\r
+ *y_ptr += ybuffer[6];\r
+ y_ptr += inc_y;\r
+ *y_ptr += ybuffer[7];\r
+ y_ptr += inc_y;\r
+\r
+ a_ptr += lda8;\r
+ }\r
+\r
+ }\r
+\r
+\r
+ if (n2 & 4) {\r
+ ybuffer[0] = 0;\r
+ ybuffer[1] = 0;\r
+ ybuffer[2] = 0;\r
+ ybuffer[3] = 0;\r
+ sgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha);\r
+\r
+ a_ptr += lda<<2;\r
+\r
+ *y_ptr += ybuffer[0];\r
+ y_ptr += inc_y;\r
+ *y_ptr += ybuffer[1];\r
+ y_ptr += inc_y;\r
+ *y_ptr += ybuffer[2];\r
+ y_ptr += inc_y;\r
+ *y_ptr += ybuffer[3];\r
+ y_ptr += inc_y;\r
+ }\r
+\r
+ if (n2 & 2) {\r
+ sgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y);\r
+ a_ptr += lda << 1;\r
+ y_ptr += 2 * inc_y;\r
+\r
+ }\r
+\r
+ if (n2 & 1) {\r
+ sgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha);\r
+ a_ptr += lda;\r
+ y_ptr += inc_y;\r
+\r
+ }\r
+\r
+ a += NB;\r
+ x += NB * inc_x;\r
+\r
+\r
+ }\r
+\r
+ if (m3 == 0) return (0);\r
+\r
+ x_ptr = x;\r
+ a_ptr = a;\r
+ if (m3 == 3) {\r
+ FLOAT xtemp0 = *x_ptr * alpha;\r
+ x_ptr += inc_x;\r
+ FLOAT xtemp1 = *x_ptr * alpha;\r
+ x_ptr += inc_x;\r
+ FLOAT xtemp2 = *x_ptr * alpha;\r
+\r
+ FLOAT *aj = a_ptr;\r
+ y_ptr = y;\r
+\r
+ if (lda == 3 && inc_y == 1) {\r
+\r
+ for (j = 0; j < (n & -4); j += 4) {\r
+\r
+ y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2;\r
+ y_ptr[j + 1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2;\r
+ y_ptr[j + 2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2;\r
+ y_ptr[j + 3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2;\r
+ aj += 12;\r
+ }\r
+\r
+ for (; j < n; j++) {\r
+ y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2;\r
+ aj += 3;\r
+ }\r
+\r
+ } else {\r
+\r
+ if (inc_y == 1) {\r
+\r
+ BLASLONG register lda2 = lda << 1;\r
+ BLASLONG register lda4 = lda << 2;\r
+ BLASLONG register lda3 = lda2 + lda;\r
+\r
+ for (j = 0; j < (n & -4); j += 4) {\r
+\r
+ y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2;\r
+ y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2;\r
+ y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2;\r
+ y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2;\r
+ aj += lda4;\r
+ }\r
+\r
+ for (; j < n; j++) {\r
+\r
+ y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2;\r
+ aj += lda;\r
+ }\r
+\r
+ } else {\r
+\r
+ for (j = 0; j < n; j++) {\r
+ *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2;\r
+ y_ptr += inc_y;\r
+ aj += lda;\r
+ }\r
+\r
+ }\r
+\r
+ }\r
+ return (0);\r
+ }\r
+\r
+ if (m3 == 2) {\r
+ FLOAT xtemp0 = *x_ptr * alpha;\r
+ x_ptr += inc_x;\r
+ FLOAT xtemp1 = *x_ptr * alpha;\r
+\r
+ FLOAT *aj = a_ptr;\r
+ y_ptr = y;\r
+\r
+ if (lda == 2 && inc_y == 1) {\r
+\r
+ for (j = 0; j < (n & -4); j += 4) {\r
+ y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1;\r
+ y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1;\r
+ y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1;\r
+ y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1;\r
+ aj += 8;\r
+\r
+ }\r
+\r
+ for (; j < n; j++) {\r
+ y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1;\r
+ aj += 2;\r
+ }\r
+\r
+ } else {\r
+ if (inc_y == 1) {\r
+\r
+ BLASLONG register lda2 = lda << 1;\r
+ BLASLONG register lda4 = lda << 2;\r
+ BLASLONG register lda3 = lda2 + lda;\r
+\r
+ for (j = 0; j < (n & -4); j += 4) {\r
+\r
+ y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1;\r
+ y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1;\r
+ y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1;\r
+ y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1;\r
+ aj += lda4;\r
+ }\r
+\r
+ for (; j < n; j++) {\r
+\r
+ y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1;\r
+ aj += lda;\r
+ }\r
+\r
+ } else {\r
+ for (j = 0; j < n; j++) {\r
+ *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1;\r
+ y_ptr += inc_y;\r
+ aj += lda;\r
+ }\r
+ }\r
+\r
+ }\r
+ return (0);\r
+\r
+ }\r
+\r
+ FLOAT xtemp = *x_ptr * alpha;\r
+ FLOAT *aj = a_ptr;\r
+ y_ptr = y;\r
+ if (lda == 1 && inc_y == 1) {\r
+ for (j = 0; j < (n & -4); j += 4) {\r
+ y_ptr[j] += aj[j] * xtemp;\r
+ y_ptr[j + 1] += aj[j + 1] * xtemp;\r
+ y_ptr[j + 2] += aj[j + 2] * xtemp;\r
+ y_ptr[j + 3] += aj[j + 3] * xtemp;\r
+ }\r
+ for (; j < n; j++) {\r
+ y_ptr[j] += aj[j] * xtemp;\r
+ }\r
+\r
+\r
+ } else {\r
+ if (inc_y == 1) {\r
+\r
+ BLASLONG register lda2 = lda << 1;\r
+ BLASLONG register lda4 = lda << 2;\r
+ BLASLONG register lda3 = lda2 + lda;\r
+ for (j = 0; j < (n & -4); j += 4) {\r
+ y_ptr[j] += *aj * xtemp;\r
+ y_ptr[j + 1] += *(aj + lda) * xtemp;\r
+ y_ptr[j + 2] += *(aj + lda2) * xtemp;\r
+ y_ptr[j + 3] += *(aj + lda3) * xtemp;\r
+ aj += lda4;\r
+ }\r
+\r
+ for (; j < n; j++) {\r
+ y_ptr[j] += *aj * xtemp;\r
+ aj += lda;\r
+ }\r
+\r
+ } else {\r
+ for (j = 0; j < n; j++) {\r
+ *y_ptr += *aj * xtemp;\r
+ y_ptr += inc_y;\r
+ aj += lda;\r
+ }\r
+\r
+ }\r
+ }\r
+\r
+ return (0);\r
+\r
+}\r
+\r
--- /dev/null
+/***************************************************************************\r
+Copyright (c) 2019, The OpenBLAS Project\r
+All rights reserved.\r
+Redistribution and use in source and binary forms, with or without\r
+modification, are permitted provided that the following conditions are\r
+met:\r
+1. Redistributions of source code must retain the above copyright\r
+notice, this list of conditions and the following disclaimer.\r
+2. Redistributions in binary form must reproduce the above copyright\r
+notice, this list of conditions and the following disclaimer in\r
+the documentation and/or other materials provided with the\r
+distribution.\r
+3. Neither the name of the OpenBLAS project nor the names of\r
+its contributors may be used to endorse or promote products\r
+derived from this software without specific prior written permission.\r
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"\r
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\r
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE\r
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE\r
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\r
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\r
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\r
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\r
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE\r
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
+ *****************************************************************************/\r
+\r
+#include "common.h"\r
+#include <stdio.h>\r
+#define NBMAX 2048\r
+\r
+#include <altivec.h> \r
+ \r
+/* Accumulate into y[0..7] the alpha-scaled dot products of x with 8\r
+ vectors of A spaced lda apart (a0..a7), each of length n floats.\r
+ The loop consumes two 4-float VSX vectors per iteration, so n must be\r
+ a multiple of 8 -- the caller always passes NB, which is. Pointers\r
+ are reinterpreted as __vector float, so ap and x are assumed to be\r
+ 16-byte aligned (standard for OpenBLAS buffers) -- TODO confirm. */\r
+static void sgemv_kernel_8x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) {\r
+ BLASLONG i; \r
+ FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7;\r
+ __vector float *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x;\r
+ /* one 4-lane partial-sum accumulator per output element */\r
+ register __vector float temp0 = {0,0,0,0};\r
+ register __vector float temp1 = {0,0,0,0};\r
+ register __vector float temp2 = {0,0,0,0};\r
+ register __vector float temp3 = {0,0,0,0};\r
+ register __vector float temp4 = {0,0,0,0};\r
+ register __vector float temp5 = {0,0,0,0};\r
+ register __vector float temp6 = {0,0,0,0};\r
+ register __vector float temp7 = {0,0,0,0};\r
+\r
+ a0 = ap;\r
+ a1 = ap + lda;\r
+ a2 = a1 + lda;\r
+ a3 = a2 + lda;\r
+ a4 = a3 + lda;\r
+ a5 = a4 + lda;\r
+ a6 = a5 + lda;\r
+ a7 = a6 + lda;\r
+ va0 = (__vector float*) a0;\r
+ va1 = (__vector float*) a1;\r
+ va2 = (__vector float*) a2;\r
+ va3 = (__vector float*) a3;\r
+ va4 = (__vector float*) a4;\r
+ va5 = (__vector float*) a5;\r
+ va6 = (__vector float*) a6;\r
+ va7 = (__vector float*) a7;\r
+ v_x = (__vector float*) x;\r
+ \r
+ \r
+ /* unrolled by 2 vectors (8 floats) per iteration */\r
+ for (i = 0; i < n/4; i +=2) {\r
+ register __vector float vx1=v_x[i] ; \r
+ register __vector float vx2=v_x[i+1] ; \r
+ register __vector float va0_1=va0[i] ; \r
+ register __vector float va0_2=va0[i+1] ; \r
+ register __vector float va1_1=va1[i] ; \r
+ register __vector float va1_2=va1[i+1] ; \r
+ register __vector float va2_1=va2[i] ; \r
+ register __vector float va2_2=va2[i+1] ; \r
+ register __vector float va3_1=va3[i] ; \r
+ register __vector float va3_2=va3[i+1] ; \r
+ register __vector float va4_1=va4[i] ; \r
+ register __vector float va4_2=va4[i+1] ;\r
+ register __vector float va5_1=va5[i] ; \r
+ register __vector float va5_2=va5[i+1] ; \r
+ register __vector float va6_1=va6[i] ; \r
+ register __vector float va6_2=va6[i+1] ; \r
+ register __vector float va7_1=va7[i] ; \r
+ register __vector float va7_2=va7[i+1] ; \r
+ temp0 += vx1* va0_1 + vx2 * va0_2;\r
+ temp1 += vx1* va1_1 + vx2 * va1_2;\r
+ temp2 += vx1* va2_1 + vx2 * va2_2;\r
+ temp3 += vx1* va3_1 + vx2 * va3_2;\r
+ temp4 += vx1* va4_1 + vx2 * va4_2;\r
+ temp5 += vx1* va5_1 + vx2 * va5_2;\r
+ temp6 += vx1* va6_1 + vx2 * va6_2;\r
+ temp7 += vx1* va7_1 + vx2 * va7_2; \r
+ }\r
+ \r
+ \r
+ /* horizontal-add each accumulator, scale by alpha, update y */\r
+ y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]);\r
+ y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]);\r
+ y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]);\r
+ y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]);\r
+\r
+ y[4] += alpha * (temp4[0] + temp4[1]+temp4[2] + temp4[3]);\r
+ y[5] += alpha * (temp5[0] + temp5[1]+temp5[2] + temp5[3]);\r
+ y[6] += alpha * (temp6[0] + temp6[1]+temp6[2] + temp6[3]);\r
+ y[7] += alpha * (temp7[0] + temp7[1]+temp7[2] + temp7[3]);\r
+\r
+}\r
+ \r
+\r
+/* Same as sgemv_kernel_8x8 but for 4 vectors of A: accumulates\r
+ alpha * dot(x, a0..a3) into y[0..3]. n must be a multiple of 8\r
+ (the loop advances two 4-float vectors at a time). */\r
+static void sgemv_kernel_8x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) {\r
+ BLASLONG i = 0;\r
+ FLOAT *a0, *a1, *a2, *a3;\r
+ a0 = ap;\r
+ a1 = ap + lda;\r
+ a2 = a1 + lda;\r
+ a3 = a2 + lda;\r
+ __vector float* va0 = (__vector float*) a0;\r
+ __vector float* va1 = (__vector float*) a1;\r
+ __vector float* va2 = (__vector float*) a2;\r
+ __vector float* va3 = (__vector float*) a3;\r
+ __vector float* v_x = (__vector float*) x;\r
+ register __vector float temp0 = {0,0,0,0};\r
+ register __vector float temp1 = {0,0,0,0};\r
+ register __vector float temp2 = {0,0,0,0};\r
+ register __vector float temp3 = {0,0,0,0}; \r
+\r
+ for (i = 0; i < n / 4; i +=2) {\r
+ temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1];\r
+ temp1 += v_x[i] * va1[i] + v_x[i+1] * va1[i+1];\r
+ temp2 += v_x[i] * va2[i] + v_x[i+1] * va2[i+1];\r
+ temp3 += v_x[i] * va3[i] + v_x[i+1] * va3[i+1]; \r
+ }\r
+ \r
+ /* horizontal reduction of the 4-lane partial sums */\r
+ y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]);\r
+ y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]);\r
+ y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]);\r
+ y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]);\r
+\r
+}\r
+ \r
+\r
+/* Two-vector variant: accumulates alpha * dot(x, a0) into y[0] and\r
+ alpha * dot(x, a1) into y[inc_y]. Unlike the 8x8/8x4 kernels this one\r
+ takes inc_y directly, so the caller can pass a strided y without a\r
+ bounce buffer. n must be a multiple of 8. */\r
+static void sgemv_kernel_8x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) {\r
+\r
+ BLASLONG i;\r
+ FLOAT *a0, *a1;\r
+ a0 = ap;\r
+ a1 = ap + lda;\r
+ __vector float* va0 = (__vector float*) a0;\r
+ __vector float* va1 = (__vector float*) a1;\r
+ __vector float* v_x = (__vector float*) x;\r
+ __vector float temp0 = {0,0,0,0};\r
+ __vector float temp1 = {0,0,0,0};\r
+ for (i = 0; i < n / 4; i +=2) {\r
+ temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1];\r
+ temp1 += v_x[i] * va1[i] + v_x[i+1] * va1[i+1]; \r
+ }\r
+\r
+\r
+\r
+ y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]);\r
+ y[inc_y] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); \r
+}\r
+\r
+/* Single-vector variant: y[0] += alpha * dot(x, ap) over n floats.\r
+ n must be a multiple of 8 (two 4-float vectors per iteration). */\r
+static void sgemv_kernel_8x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) {\r
+\r
+ BLASLONG i;\r
+ FLOAT *a0;\r
+ a0 = ap;\r
+ __vector float* va0 = (__vector float*) a0;\r
+ __vector float* v_x = (__vector float*) x;\r
+ __vector float temp0 = {0,0,0,0};\r
+ for (i = 0; i < n / 4; i +=2) {\r
+ temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1]; \r
+ }\r
+ y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]);\r
+\r
+}\r
+ \r
+\r
+/* Gather n elements from a strided source vector (stride inc_src)\r
+ into a contiguous destination buffer, so the vector kernels can\r
+ load x with unit stride. */\r
+static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) {\r
+ BLASLONG i;\r
+ for (i = 0; i < n; i++) {\r
+ *dest++ = *src;\r
+ src += inc_src;\r
+ }\r
+}\r
+\r
+/* Driver for the transposed sgemv kernel (presumably built as\r
+ SGEMVTKERNEL per the accompanying Makefile change -- verify):\r
+ y(0:n-1) += alpha * A' * x(0:m-1), with column stride lda and element\r
+ strides inc_x / inc_y. Strategy: walk A in row panels of NBMAX so the\r
+ x segment stays cache-resident; within a panel, process columns in\r
+ groups of 8/4/2/1 via the VSX kernels; the last m & 7 rows (which the\r
+ vector kernels cannot handle) are finished with scalar code below.\r
+ dummy1 is unused; buffer is caller-provided scratch for gathering a\r
+ strided x. Returns 0 always. */\r
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) {\r
+ BLASLONG i;\r
+ BLASLONG j;\r
+ FLOAT *a_ptr;\r
+ FLOAT *x_ptr;\r
+ FLOAT *y_ptr;\r
+\r
+ BLASLONG n1;\r
+ BLASLONG m1;\r
+ BLASLONG m2;\r
+ BLASLONG m3;\r
+ BLASLONG n2;\r
+\r
+ FLOAT ybuffer[8], *xbuffer;\r
+\r
+ if (m < 1) return (0);\r
+ if (n < 1) return (0);\r
+\r
+ xbuffer = buffer;\r
+\r
+ /* n1 = number of 8-column groups, n2 = leftover columns (0..7) */\r
+ n1 = n >> 3;\r
+ n2 = n & 7;\r
+\r
+ /* m3 = leftover rows for the scalar tail; m1 rows go through the\r
+ vector kernels; m2 = size of the last (partial) panel, a multiple\r
+ of 8 since m3 strips off m & 7 */\r
+ m3 = m & 7;\r
+ m1 = m - m3;\r
+ m2 = (m & (NBMAX - 1)) - m3;\r
+ \r
+ BLASLONG NB = NBMAX;\r
+\r
+ /* process row panels; the final pass shrinks NB to m2 (exiting the\r
+ loop afterwards since NB != NBMAX) */\r
+ while (NB == NBMAX) {\r
+\r
+ m1 -= NB;\r
+ if (m1 < 0) {\r
+ if (m2 == 0) break;\r
+ NB = m2;\r
+ }\r
+\r
+ y_ptr = y;\r
+ a_ptr = a;\r
+ x_ptr = x;\r
+\r
+ /* kernels need unit-stride x: gather into scratch if strided */\r
+ if (inc_x != 1)\r
+ copy_x(NB, x_ptr, xbuffer, inc_x);\r
+ else\r
+ xbuffer = x_ptr;\r
+\r
+ BLASLONG lda8 = lda << 3;\r
+\r
+ \r
+ if (inc_y == 1) {\r
+\r
+ /* contiguous y: kernels update y_ptr[0..7] in place */\r
+ for (i = 0; i < n1; i++) {\r
+ \r
+ sgemv_kernel_8x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha);\r
+ \r
+ y_ptr += 8;\r
+ a_ptr += lda8;\r
+ \r
+ }\r
+\r
+ } else {\r
+ \r
+ /* strided y: accumulate into ybuffer, then scatter */\r
+ for (i = 0; i < n1; i++) {\r
+ ybuffer[0] = 0;\r
+ ybuffer[1] = 0;\r
+ ybuffer[2] = 0;\r
+ ybuffer[3] = 0;\r
+ ybuffer[4] = 0;\r
+ ybuffer[5] = 0;\r
+ ybuffer[6] = 0;\r
+ ybuffer[7] = 0;\r
+ sgemv_kernel_8x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha);\r
+\r
+ \r
+\r
+ *y_ptr += ybuffer[0];\r
+ y_ptr += inc_y;\r
+ *y_ptr += ybuffer[1];\r
+ y_ptr += inc_y;\r
+ *y_ptr += ybuffer[2];\r
+ y_ptr += inc_y;\r
+ *y_ptr += ybuffer[3];\r
+ y_ptr += inc_y;\r
+\r
+ *y_ptr += ybuffer[4];\r
+ y_ptr += inc_y;\r
+ *y_ptr += ybuffer[5];\r
+ y_ptr += inc_y;\r
+ *y_ptr += ybuffer[6];\r
+ y_ptr += inc_y;\r
+ *y_ptr += ybuffer[7];\r
+ y_ptr += inc_y;\r
+\r
+ a_ptr += lda8;\r
+ }\r
+\r
+ }\r
+\r
+\r
+ /* leftover columns of this panel: 4, then 2, then 1 */\r
+ if (n2 & 4) {\r
+ ybuffer[0] = 0;\r
+ ybuffer[1] = 0;\r
+ ybuffer[2] = 0;\r
+ ybuffer[3] = 0;\r
+ sgemv_kernel_8x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha);\r
+\r
+ a_ptr += lda<<2;\r
+\r
+ *y_ptr += ybuffer[0];\r
+ y_ptr += inc_y;\r
+ *y_ptr += ybuffer[1];\r
+ y_ptr += inc_y;\r
+ *y_ptr += ybuffer[2];\r
+ y_ptr += inc_y;\r
+ *y_ptr += ybuffer[3];\r
+ y_ptr += inc_y;\r
+ }\r
+\r
+ if (n2 & 2) {\r
+ /* 8x2 kernel handles strided y itself via inc_y */\r
+ sgemv_kernel_8x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y);\r
+ a_ptr += lda << 1;\r
+ y_ptr += 2 * inc_y;\r
+\r
+ }\r
+\r
+ if (n2 & 1) {\r
+ sgemv_kernel_8x1(NB, a_ptr, xbuffer, y_ptr, alpha);\r
+ a_ptr += lda;\r
+ y_ptr += inc_y;\r
+\r
+ }\r
+\r
+ /* advance to the next row panel */\r
+ a += NB;\r
+ x += NB * inc_x;\r
+\r
+\r
+ }\r
+\r
+ if (m3 == 0) return (0);\r
+\r
+ /* scalar tail: the remaining m3 (1..7) rows, processed as 4+2+1.\r
+ Each branch folds alpha into per-row x temporaries and sweeps all\r
+ n columns, with fast paths for packed A (lda == row count) and\r
+ contiguous y. */\r
+ x_ptr = x;\r
+ a_ptr = a;\r
+ if (m3 & 4) {\r
+ FLOAT xtemp0 = *x_ptr * alpha;\r
+ x_ptr += inc_x;\r
+ FLOAT xtemp1 = *x_ptr * alpha;\r
+ x_ptr += inc_x;\r
+ FLOAT xtemp2 = *x_ptr * alpha;\r
+ x_ptr += inc_x;\r
+ FLOAT xtemp3 = *x_ptr * alpha;\r
+ x_ptr += inc_x;\r
+ FLOAT *aj = a_ptr;\r
+ y_ptr = y;\r
+ if (lda == 4 && inc_y == 1) {\r
+\r
+ for (j = 0; j < (n & -4); j += 4) {\r
+ y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2 + aj[3] * xtemp3;\r
+ y_ptr[j + 1] += aj[4] * xtemp0 + aj[5] * xtemp1 + aj[6] * xtemp2 + aj[7] * xtemp3;\r
+ y_ptr[j + 2] += aj[8] * xtemp0 + aj[9] * xtemp1 + aj[10] * xtemp2 + aj[11] * xtemp3;\r
+ y_ptr[j + 3] += aj[12] * xtemp0 + aj[13] * xtemp1 + aj[14] * xtemp2 + aj[15] * xtemp3;\r
+ aj += 16;\r
+\r
+ }\r
+\r
+ for (; j < n; j++) {\r
+ y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2 + aj[3] * xtemp3;\r
+ aj += 4;\r
+ }\r
+\r
+ } else if (inc_y == 1) {\r
+ \r
+ BLASLONG register lda2 = lda << 1;\r
+ BLASLONG register lda4 = lda << 2;\r
+ BLASLONG register lda3 = lda2 + lda;\r
+\r
+ for (j = 0; j < (n & -4); j += 4) {\r
+\r
+ y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2 + *(aj + 3) * xtemp3;\r
+ y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2 + *(aj + lda +3) * xtemp3;\r
+ y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2 + *(aj + lda2 +3) * xtemp3;\r
+ y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2 + *(aj + lda3+3) * xtemp3;\r
+ aj += lda4;\r
+ }\r
+\r
+ for (; j < n; j++) {\r
+\r
+ y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2+*(aj + 3) * xtemp3;\r
+ aj += lda;\r
+ }\r
+\r
+ } else {\r
+\r
+ for (j = 0; j < n; j++) {\r
+ *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2+ *(aj + 3) * xtemp3;\r
+ y_ptr += inc_y;\r
+ aj += lda;\r
+ }\r
+\r
+ } \r
+ if (m3==4) return (0);\r
+ /* step past the 4 rows just handled */\r
+ a_ptr += 4; \r
+ }\r
+\r
+ if (m3 & 2 ) {\r
+ \r
+ FLOAT xtemp0 = *x_ptr * alpha;\r
+ x_ptr += inc_x;\r
+ FLOAT xtemp1 = *x_ptr * alpha;\r
+ x_ptr += inc_x;\r
+ FLOAT *aj = a_ptr;\r
+ y_ptr = y;\r
+\r
+ if (lda == 2 && inc_y == 1) {\r
+\r
+ for (j = 0; j < (n & -4); j += 4) {\r
+ y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1;\r
+ y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1;\r
+ y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1;\r
+ y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1;\r
+ aj += 8;\r
+\r
+ }\r
+\r
+ for (; j < n; j++) {\r
+ y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1;\r
+ aj += 2;\r
+ }\r
+\r
+ } else {\r
+ if (inc_y == 1) {\r
+\r
+ BLASLONG register lda2 = lda << 1;\r
+ BLASLONG register lda4 = lda << 2;\r
+ BLASLONG register lda3 = lda2 + lda;\r
+\r
+ for (j = 0; j < (n & -4); j += 4) {\r
+\r
+ y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1;\r
+ y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1;\r
+ y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1;\r
+ y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1;\r
+ aj += lda4;\r
+ }\r
+\r
+ for (; j < n; j++) {\r
+\r
+ y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1;\r
+ aj += lda;\r
+ }\r
+\r
+ } else {\r
+ for (j = 0; j < n; j++) {\r
+ *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1;\r
+ y_ptr += inc_y;\r
+ aj += lda;\r
+ }\r
+ }\r
+\r
+ } \r
+ if (m3==2) return (0);\r
+ /* step past the 2 rows just handled */\r
+ a_ptr += 2; \r
+ }\r
+ if (m3 & 1) {\r
+ \r
+ FLOAT xtemp = *x_ptr * alpha;\r
+ x_ptr += inc_x;\r
+ FLOAT *aj = a_ptr;\r
+ y_ptr = y;\r
+ if (lda == 1 && inc_y == 1) {\r
+ for (j = 0; j < (n & -4); j += 4) {\r
+ y_ptr[j] += aj[j] * xtemp;\r
+ y_ptr[j + 1] += aj[j + 1] * xtemp;\r
+ y_ptr[j + 2] += aj[j + 2] * xtemp;\r
+ y_ptr[j + 3] += aj[j + 3] * xtemp;\r
+ }\r
+ for (; j < n; j++) {\r
+ y_ptr[j] += aj[j] * xtemp;\r
+ }\r
+\r
+\r
+ } else {\r
+ if (inc_y == 1) {\r
+\r
+ BLASLONG register lda2 = lda << 1;\r
+ BLASLONG register lda4 = lda << 2;\r
+ BLASLONG register lda3 = lda2 + lda;\r
+ for (j = 0; j < (n & -4); j += 4) {\r
+ y_ptr[j] += *aj * xtemp;\r
+ y_ptr[j + 1] += *(aj + lda) * xtemp;\r
+ y_ptr[j + 2] += *(aj + lda2) * xtemp;\r
+ y_ptr[j + 3] += *(aj + lda3) * xtemp;\r
+ aj += lda4;\r
+ }\r
+\r
+ for (; j < n; j++) {\r
+ y_ptr[j] += *aj * xtemp;\r
+ aj += lda;\r
+ }\r
+\r
+ } else {\r
+ for (j = 0; j < n; j++) {\r
+ *y_ptr += *aj * xtemp;\r
+ y_ptr += inc_y;\r
+ aj += lda;\r
+ }\r
+\r
+ }\r
+ \r
+ }\r
+ a_ptr += 1; \r
+ }\r
+ return (0);\r
+\r
+}\r
+\r
register __vector double va0_2 = vptr_a0[i + 2];
register __vector double va0_3 = vptr_a0[i + 3];
- vy_0 += va0*vx0_r;
- vy_1 += va0_1*vx0_r;
- vy_2 += va0_2*vx0_r;
- vy_3 += va0_3*vx0_r;
-
- va0 = vec_xxpermdi(va0, va0, 2);
- va0_1 = vec_xxpermdi(va0_1, va0_1, 2);
- va0_2 = vec_xxpermdi(va0_2, va0_2, 2);
- va0_3 = vec_xxpermdi(va0_3, va0_3, 2);
-
- vy_0 += va0*vx0_i;
- vy_1 += va0_1*vx0_i;
- vy_2 += va0_2*vx0_i;
- vy_3 += va0_3*vx0_i;
+ register __vector double va0x = vec_xxpermdi(va0, va0, 2);
+ register __vector double va0x_1 = vec_xxpermdi(va0_1, va0_1, 2);
+ register __vector double va0x_2 = vec_xxpermdi(va0_2, va0_2, 2);
+ register __vector double va0x_3 = vec_xxpermdi(va0_3, va0_3, 2);
+ vy_0 += va0*vx0_r + va0x*vx0_i;
+ vy_1 += va0_1*vx0_r + va0x_1*vx0_i;
+ vy_2 += va0_2*vx0_r + va0x_2*vx0_i;
+ vy_3 += va0_3*vx0_r + va0x_3*vx0_i;
vy[i] = vy_0;
vy[i + 1] = vy_1;
i = 0;
n = n << 1;
while (i < n) {
-// __builtin_prefetch(&x[i]);
-// __builtin_prefetch(&a0[i]);
-// __builtin_prefetch(&a1[i]);
-// __builtin_prefetch(&a2[i]);
-// __builtin_prefetch(&a3[i]);
+
register __vector double vx_0 = *(__vector double*) (&x[i]);
register __vector double vx_1 = *(__vector double*) (&x[i + 2]);
register __vector double vx_2 = *(__vector double*) (&x[i + 4]);
-rm -f *.o $(UTESTBIN)
libs:
-