/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#include "common.h"
#include "macros_msa.h"
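
/* Single-precision complex GEMV kernel for MIPS MSA, non-transposed:
 * accumulates y += alpha * A * x for column-major A (beta scaling of y,
 * if any, is applied by the caller).  The OP0..OP4 macros expand to
 * += or -= so that one code body serves all four CONJ/XCONJ conjugation
 * variants: OP3/OP4 select the signs used when scaling x by alpha, and
 * OP0..OP2 the signs used when accumulating column products into y. */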

#undef OP0
#undef OP1
#undef OP2
#undef OP3
#undef OP4

#if !defined(XCONJ)
    #define OP3  -=
    #define OP4  +=
#else
    #define OP3  +=
    #define OP4  -=
#endif

#if !defined(CONJ)
    #if !defined(XCONJ)
        #define OP0  -=
        #define OP1  +=
        #define OP2  +=
    #else
        #define OP0  +=
        #define OP1  +=
        #define OP2  -=
    #endif
#else
    #if !defined(XCONJ)
        #define OP0  +=
        #define OP1  -=
        #define OP2  -=
    #else
        #define OP0  -=
        #define OP1  -=
        #define OP2  +=
    #endif
#endif

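/* Compute macros, named CGEMV_N_<rows>x<cols>: each accumulates <cols>
 * alpha-scaled columns of A into <rows> elements of y.  The vector
 * variants load interleaved complex data into the t registers, then
 * PCKEVOD_W2_SP splits each pair of vectors into a real plane (even
 * words) and an imaginary plane (odd words) before the multiply-add
 * steps; the 1xN variants handle the scalar row remainder. */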
#define CGEMV_N_8x4()                        \
    LD_SP4(pa0 + k, 4, t0, t1, t2, t3);      \
    LD_SP4(pa1 + k, 4, t4, t5, t6, t7);      \
    LD_SP4(pa2 + k, 4, t8, t9, t10, t11);    \
    LD_SP4(pa3 + k, 4, t12, t13, t14, t15);  \
                                             \
    PCKEVOD_W2_SP(t1, t0, src0r, src0i);     \
    PCKEVOD_W2_SP(t3, t2, src1r, src1i);     \
    PCKEVOD_W2_SP(t5, t4, src2r, src2i);     \
    PCKEVOD_W2_SP(t7, t6, src3r, src3i);     \
    PCKEVOD_W2_SP(t9, t8, src4r, src4i);     \
    PCKEVOD_W2_SP(t11, t10, src5r, src5i);   \
    PCKEVOD_W2_SP(t13, t12, src6r, src6i);   \
    PCKEVOD_W2_SP(t15, t14, src7r, src7i);   \
                                             \
    y0r += tp0r * src0r;                     \
    y1r += tp0r * src1r;                     \
    y0r += tp1r * src2r;                     \
    y1r += tp1r * src3r;                     \
    y0r += tp2r * src4r;                     \
    y1r += tp2r * src5r;                     \
    y0r += tp3r * src6r;                     \
    y1r += tp3r * src7r;                     \
                                             \
    y0r OP0 tp0i * src0i;                    \
    y1r OP0 tp0i * src1i;                    \
    y0r OP0 tp1i * src2i;                    \
    y1r OP0 tp1i * src3i;                    \
    y0r OP0 tp2i * src4i;                    \
    y1r OP0 tp2i * src5i;                    \
    y0r OP0 tp3i * src6i;                    \
    y1r OP0 tp3i * src7i;                    \
                                             \
    y0i OP1 tp0r * src0i;                    \
    y1i OP1 tp0r * src1i;                    \
    y0i OP1 tp1r * src2i;                    \
    y1i OP1 tp1r * src3i;                    \
    y0i OP1 tp2r * src4i;                    \
    y1i OP1 tp2r * src5i;                    \
    y0i OP1 tp3r * src6i;                    \
    y1i OP1 tp3r * src7i;                    \
                                             \
    y0i OP2 tp0i * src0r;                    \
    y1i OP2 tp0i * src1r;                    \
    y0i OP2 tp1i * src2r;                    \
    y1i OP2 tp1i * src3r;                    \
    y0i OP2 tp2i * src4r;                    \
    y1i OP2 tp2i * src5r;                    \
    y0i OP2 tp3i * src6r;                    \
    y1i OP2 tp3i * src7r;

#define CGEMV_N_4x4()                       \
    LD_SP2(pa0 + k, 4, t0, t1);             \
    LD_SP2(pa1 + k, 4, t4, t5);             \
    LD_SP2(pa2 + k, 4, t8, t9);             \
    LD_SP2(pa3 + k, 4, t12, t13);           \
                                            \
    PCKEVOD_W2_SP(t1, t0, src0r, src0i);    \
    PCKEVOD_W2_SP(t5, t4, src2r, src2i);    \
    PCKEVOD_W2_SP(t9, t8, src4r, src4i);    \
    PCKEVOD_W2_SP(t13, t12, src6r, src6i);  \
                                            \
    y0r += tp0r * src0r;                    \
    y0r += tp1r * src2r;                    \
    y0r += tp2r * src4r;                    \
    y0r += tp3r * src6r;                    \
                                            \
    y0r OP0 tp0i * src0i;                   \
    y0r OP0 tp1i * src2i;                   \
    y0r OP0 tp2i * src4i;                   \
    y0r OP0 tp3i * src6i;                   \
                                            \
    y0i OP1 tp0r * src0i;                   \
    y0i OP1 tp1r * src2i;                   \
    y0i OP1 tp2r * src4i;                   \
    y0i OP1 tp3r * src6i;                   \
                                            \
    y0i OP2 tp0i * src0r;                   \
    y0i OP2 tp1i * src2r;                   \
    y0i OP2 tp2i * src4r;                   \
    y0i OP2 tp3i * src6r;

#define CGEMV_N_1x4()               \
    res0 = y[0 * inc_y2];           \
    res1 = y[0 * inc_y2 + 1];       \
                                    \
    res0  += temp0_r * pa0[k];      \
    res0 OP0 temp0_i * pa0[k + 1];  \
    res0  += temp1_r * pa1[k];      \
    res0 OP0 temp1_i * pa1[k + 1];  \
    res0  += temp2_r * pa2[k];      \
    res0 OP0 temp2_i * pa2[k + 1];  \
    res0  += temp3_r * pa3[k];      \
    res0 OP0 temp3_i * pa3[k + 1];  \
                                    \
    res1 OP1 temp0_r * pa0[k + 1];  \
    res1 OP2 temp0_i * pa0[k];      \
    res1 OP1 temp1_r * pa1[k + 1];  \
    res1 OP2 temp1_i * pa1[k];      \
    res1 OP1 temp2_r * pa2[k + 1];  \
    res1 OP2 temp2_i * pa2[k];      \
    res1 OP1 temp3_r * pa3[k + 1];  \
    res1 OP2 temp3_i * pa3[k];      \
                                    \
    y[0 * inc_y2]     = res0;       \
    y[0 * inc_y2 + 1] = res1;

#define CGEMV_N_8x2()                     \
    LD_SP4(pa0 + k, 4, t0, t1, t2, t3);   \
    LD_SP4(pa1 + k, 4, t4, t5, t6, t7);   \
                                          \
    PCKEVOD_W2_SP(t1, t0, src0r, src0i);  \
    PCKEVOD_W2_SP(t3, t2, src1r, src1i);  \
    PCKEVOD_W2_SP(t5, t4, src2r, src2i);  \
    PCKEVOD_W2_SP(t7, t6, src3r, src3i);  \
                                          \
    y0r += tp0r * src0r;                  \
    y1r += tp0r * src1r;                  \
    y0r += tp1r * src2r;                  \
    y1r += tp1r * src3r;                  \
                                          \
    y0r OP0 tp0i * src0i;                 \
    y1r OP0 tp0i * src1i;                 \
    y0r OP0 tp1i * src2i;                 \
    y1r OP0 tp1i * src3i;                 \
                                          \
    y0i OP1 tp0r * src0i;                 \
    y1i OP1 tp0r * src1i;                 \
    y0i OP1 tp1r * src2i;                 \
    y1i OP1 tp1r * src3i;                 \
                                          \
    y0i OP2 tp0i * src0r;                 \
    y1i OP2 tp0i * src1r;                 \
    y0i OP2 tp1i * src2r;                 \
    y1i OP2 tp1i * src3r;

#define CGEMV_N_4x2()                     \
    LD_SP2(pa0 + k, 4, t0, t1);           \
    LD_SP2(pa1 + k, 4, t4, t5);           \
                                          \
    PCKEVOD_W2_SP(t1, t0, src0r, src0i);  \
    PCKEVOD_W2_SP(t5, t4, src2r, src2i);  \
                                          \
    y0r += tp0r * src0r;                  \
    y0r += tp1r * src2r;                  \
                                          \
    y0r OP0 tp0i * src0i;                 \
    y0r OP0 tp1i * src2i;                 \
                                          \
    y0i OP1 tp0r * src0i;                 \
    y0i OP1 tp1r * src2i;                 \
                                          \
    y0i OP2 tp0i * src0r;                 \
    y0i OP2 tp1i * src2r;

#define CGEMV_N_1x2()               \
    res0 = y[0 * inc_y2];           \
    res1 = y[0 * inc_y2 + 1];       \
                                    \
    res0  += temp0_r * pa0[k];      \
    res0 OP0 temp0_i * pa0[k + 1];  \
    res0  += temp1_r * pa1[k];      \
    res0 OP0 temp1_i * pa1[k + 1];  \
                                    \
    res1 OP1 temp0_r * pa0[k + 1];  \
    res1 OP2 temp0_i * pa0[k];      \
    res1 OP1 temp1_r * pa1[k + 1];  \
    res1 OP2 temp1_i * pa1[k];      \
                                    \
    y[0 * inc_y2]     = res0;       \
    y[0 * inc_y2 + 1] = res1;

#define CGEMV_N_1x1()              \
    res0 = y[0 * inc_y2];          \
    res1 = y[0 * inc_y2 + 1];      \
                                   \
    res0  += temp_r * pa0[k];      \
    res0 OP0 temp_i * pa0[k + 1];  \
                                   \
    res1 OP1 temp_r * pa0[k + 1];  \
    res1 OP2 temp_i * pa0[k];      \
                                   \
    y[0 * inc_y2]     = res0;      \
    y[0 * inc_y2 + 1] = res1;

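/* Load a group of x elements and scale them by alpha.  tp4r/tp4i receive
 * the scaled real and imaginary parts; SPLATI_W4_SP then broadcasts each
 * of the four lanes into tp0r..tp3r / tp0i..tp3i so one scaled x element
 * can multiply a whole column vector of A.  The GP variant gathers
 * strided x elements one 32-bit word at a time with __msa_insert_w. */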
#define CLOAD_X4_SCALE_VECTOR()                  \
    LD_SP2(x, 4, x0, x1);                        \
                                                 \
    PCKEVOD_W2_SP(x1, x0, x0r, x0i);             \
                                                 \
    tp4r   = alphar * x0r;                       \
    tp4r OP3 alphai * x0i;                       \
    tp4i   = alphar * x0i;                       \
    tp4i OP4 alphai * x0r;                       \
                                                 \
    SPLATI_W4_SP(tp4r, tp0r, tp1r, tp2r, tp3r);  \
    SPLATI_W4_SP(tp4i, tp0i, tp1i, tp2i, tp3i);

#define CLOAD_X4_SCALE_GP()                                                          \
    x0r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2)));      \
    x0r = (v4f32) __msa_insert_w((v4i32) x0r,  1, *((int *) (x + 1 * inc_x2)));      \
    x0r = (v4f32) __msa_insert_w((v4i32) x0r,  2, *((int *) (x + 2 * inc_x2)));      \
    x0r = (v4f32) __msa_insert_w((v4i32) x0r,  3, *((int *) (x + 3 * inc_x2)));      \
    x0i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2 + 1)));  \
    x0i = (v4f32) __msa_insert_w((v4i32) x0i,  1, *((int *) (x + 1 * inc_x2 + 1)));  \
    x0i = (v4f32) __msa_insert_w((v4i32) x0i,  2, *((int *) (x + 2 * inc_x2 + 1)));  \
    x0i = (v4f32) __msa_insert_w((v4i32) x0i,  3, *((int *) (x + 3 * inc_x2 + 1)));  \
                                                                                     \
    tp4r   = alphar * x0r;                                                           \
    tp4r OP3 alphai * x0i;                                                           \
    tp4i   = alphar * x0i;                                                           \
    tp4i OP4 alphai * x0r;                                                           \
                                                                                     \
    SPLATI_W4_SP(tp4r, tp0r, tp1r, tp2r, tp3r);                                      \
    SPLATI_W4_SP(tp4i, tp0i, tp1i, tp2i, tp3i);

#define CLOAD_X2_SCALE_GP()                        \
    temp0_r   = alpha_r * x[0 * inc_x2];           \
    temp0_r OP3 alpha_i * x[0 * inc_x2 + 1];       \
    temp0_i   = alpha_r * x[0 * inc_x2 + 1];       \
    temp0_i OP4 alpha_i * x[0 * inc_x2];           \
                                                   \
    temp1_r   = alpha_r * x[1 * inc_x2];           \
    temp1_r OP3 alpha_i * x[1 * inc_x2 + 1];       \
    temp1_i   = alpha_r * x[1 * inc_x2 + 1];       \
    temp1_i OP4 alpha_i * x[1 * inc_x2];           \
                                                   \
    tp0r = (v4f32) COPY_FLOAT_TO_VECTOR(temp0_r);  \
    tp0i = (v4f32) COPY_FLOAT_TO_VECTOR(temp0_i);  \
    tp1r = (v4f32) COPY_FLOAT_TO_VECTOR(temp1_r);  \
    tp1i = (v4f32) COPY_FLOAT_TO_VECTOR(temp1_i);

#define CLOAD_X1_SCALE_GP()                  \
    temp_r   = alpha_r * x[0 * inc_x2];      \
    temp_r OP3 alpha_i * x[0 * inc_x2 + 1];  \
    temp_i   = alpha_r * x[0 * inc_x2 + 1];  \
    temp_i OP4 alpha_i * x[0 * inc_x2];

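/* Vector load/store of y for unit complex stride: PCKEVOD_W2_SP splits
 * the interleaved complex values into real/imag planes on load, and
 * ILVRL_W2_SP re-interleaves them before storing back. */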
#define CLOAD_Y8_VECTOR()             \
    LD_SP4(y, 4, y0, y1, y2, y3);     \
    PCKEVOD_W2_SP(y1, y0, y0r, y0i);  \
    PCKEVOD_W2_SP(y3, y2, y1r, y1i);

#define CLOAD_Y4_VECTOR()             \
    LD_SP2(y, 4, y0, y1);             \
    PCKEVOD_W2_SP(y1, y0, y0r, y0i);

#define CSTORE_Y8_VECTOR()          \
    ILVRL_W2_SP(y0i, y0r, y0, y1);  \
    ILVRL_W2_SP(y1i, y1r, y2, y3);  \
    ST_SP4(y0, y1, y2, y3, y, 4);

#define CSTORE_Y4_VECTOR()          \
    ILVRL_W2_SP(y0i, y0r, y0, y1);  \
    ST_SP2(y0, y1, y, 4);

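/* GP (general-purpose register) variants for strided y: gather and
 * scatter one 32-bit float at a time via __msa_insert_w/__msa_copy_s_w.
 * tp0r serves only as an already-initialized source vector for the
 * first insert into each destination register. */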
#define CLOAD_Y8_GP()                                                               \
    y0r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 0 * inc_y2)));      \
    y0r = (v4f32) __msa_insert_w((v4i32) y0r,  1, *((int *)(y + 1 * inc_y2)));      \
    y0r = (v4f32) __msa_insert_w((v4i32) y0r,  2, *((int *)(y + 2 * inc_y2)));      \
    y0r = (v4f32) __msa_insert_w((v4i32) y0r,  3, *((int *)(y + 3 * inc_y2)));      \
    y1r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 4 * inc_y2)));      \
    y1r = (v4f32) __msa_insert_w((v4i32) y1r,  1, *((int *)(y + 5 * inc_y2)));      \
    y1r = (v4f32) __msa_insert_w((v4i32) y1r,  2, *((int *)(y + 6 * inc_y2)));      \
    y1r = (v4f32) __msa_insert_w((v4i32) y1r,  3, *((int *)(y + 7 * inc_y2)));      \
    y0i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 0 * inc_y2 + 1)));  \
    y0i = (v4f32) __msa_insert_w((v4i32) y0i,  1, *((int *)(y + 1 * inc_y2 + 1)));  \
    y0i = (v4f32) __msa_insert_w((v4i32) y0i,  2, *((int *)(y + 2 * inc_y2 + 1)));  \
    y0i = (v4f32) __msa_insert_w((v4i32) y0i,  3, *((int *)(y + 3 * inc_y2 + 1)));  \
    y1i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 4 * inc_y2 + 1)));  \
    y1i = (v4f32) __msa_insert_w((v4i32) y1i,  1, *((int *)(y + 5 * inc_y2 + 1)));  \
    y1i = (v4f32) __msa_insert_w((v4i32) y1i,  2, *((int *)(y + 6 * inc_y2 + 1)));  \
    y1i = (v4f32) __msa_insert_w((v4i32) y1i,  3, *((int *)(y + 7 * inc_y2 + 1)));

#define CLOAD_Y4_GP()                                                               \
    y0r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 0 * inc_y2)));      \
    y0r = (v4f32) __msa_insert_w((v4i32) y0r,  1, *((int *)(y + 1 * inc_y2)));      \
    y0r = (v4f32) __msa_insert_w((v4i32) y0r,  2, *((int *)(y + 2 * inc_y2)));      \
    y0r = (v4f32) __msa_insert_w((v4i32) y0r,  3, *((int *)(y + 3 * inc_y2)));      \
    y0i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 0 * inc_y2 + 1)));  \
    y0i = (v4f32) __msa_insert_w((v4i32) y0i,  1, *((int *)(y + 1 * inc_y2 + 1)));  \
    y0i = (v4f32) __msa_insert_w((v4i32) y0i,  2, *((int *)(y + 2 * inc_y2 + 1)));  \
    y0i = (v4f32) __msa_insert_w((v4i32) y0i,  3, *((int *)(y + 3 * inc_y2 + 1)));

#define CSTORE_Y8_GP()                                                \
    *((int *)(y + 0 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 0);      \
    *((int *)(y + 1 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 1);      \
    *((int *)(y + 2 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 2);      \
    *((int *)(y + 3 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 3);      \
    *((int *)(y + 4 * inc_y2)) = __msa_copy_s_w((v4i32) y1r, 0);      \
    *((int *)(y + 5 * inc_y2)) = __msa_copy_s_w((v4i32) y1r, 1);      \
    *((int *)(y + 6 * inc_y2)) = __msa_copy_s_w((v4i32) y1r, 2);      \
    *((int *)(y + 7 * inc_y2)) = __msa_copy_s_w((v4i32) y1r, 3);      \
    *((int *)(y + 0 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 0);  \
    *((int *)(y + 1 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 1);  \
    *((int *)(y + 2 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 2);  \
    *((int *)(y + 3 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 3);  \
    *((int *)(y + 4 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y1i, 0);  \
    *((int *)(y + 5 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y1i, 1);  \
    *((int *)(y + 6 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y1i, 2);  \
    *((int *)(y + 7 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y1i, 3);

#define CSTORE_Y4_GP()                                                \
    *((int *)(y + 0 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 0);      \
    *((int *)(y + 1 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 1);      \
    *((int *)(y + 2 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 2);      \
    *((int *)(y + 3 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 3);      \
    *((int *)(y + 0 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 0);  \
    *((int *)(y + 1 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 1);  \
    *((int *)(y + 2 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 2);  \
    *((int *)(y + 3 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 3);

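/* Driver loop: columns of A are consumed in blocks of 4, then 2, then 1
 * (selected by n >> 2, n & 2, n & 1); within each column block, rows of
 * y are processed 8 and then 4 at a time with MSA vectors, with scalar
 * code for the remaining m & 3 rows.  The 8-row loop prefetches ahead
 * in all four active columns of A. */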
#define CGEMV_N_MSA()                         \
    for (j = (n >> 2); j--;)                  \
    {                                         \
        CLOAD_X4_SCALE();                     \
                                              \
        k = 0;                                \
        k_pref = pref_offset;                 \
        y = y_org;                            \
                                              \
        for (i = (m >> 3); i--;)              \
        {                                     \
            PREFETCH(pa0 + k_pref + 16 + 0);  \
            PREFETCH(pa0 + k_pref + 16 + 8);  \
            PREFETCH(pa1 + k_pref + 16 + 0);  \
            PREFETCH(pa1 + k_pref + 16 + 8);  \
            PREFETCH(pa2 + k_pref + 16 + 0);  \
            PREFETCH(pa2 + k_pref + 16 + 8);  \
            PREFETCH(pa3 + k_pref + 16 + 0);  \
            PREFETCH(pa3 + k_pref + 16 + 8);  \
                                              \
            CLOAD_Y8();                       \
            CGEMV_N_8x4();                    \
            CSTORE_Y8();                      \
                                              \
            k += 2 * 8;                       \
            k_pref += 2 * 8;                  \
            y += inc_y2 * 8;                  \
        }                                     \
                                              \
        if (m & 4)                            \
        {                                     \
            CLOAD_Y4();                       \
            CGEMV_N_4x4();                    \
            CSTORE_Y4();                      \
                                              \
            k += 2 * 4;                       \
            y += inc_y2 * 4;                  \
        }                                     \
                                              \
        if (m & 3)                            \
        {                                     \
            temp0_r = tp4r[0];                \
            temp1_r = tp4r[1];                \
            temp2_r = tp4r[2];                \
            temp3_r = tp4r[3];                \
                                              \
            temp0_i = tp4i[0];                \
            temp1_i = tp4i[1];                \
            temp2_i = tp4i[2];                \
            temp3_i = tp4i[3];                \
                                              \
            for (i = (m & 3); i--;)           \
            {                                 \
                CGEMV_N_1x4();                \
                                              \
                k += 2;                       \
                y += inc_y2;                  \
            }                                 \
        }                                     \
                                              \
        pa0 += 4 * lda2;                      \
        pa1 += 4 * lda2;                      \
        pa2 += 4 * lda2;                      \
        pa3 += 4 * lda2;                      \
                                              \
        x += 4 * inc_x2;                      \
    }                                         \
                                              \
    if (n & 2)                                \
    {                                         \
        CLOAD_X2_SCALE();                     \
                                              \
        k = 0;                                \
        y = y_org;                            \
                                              \
        for (i = (m >> 3); i--;)              \
        {                                     \
            CLOAD_Y8();                       \
            CGEMV_N_8x2();                    \
            CSTORE_Y8();                      \
                                              \
            k += 2 * 8;                       \
            y += inc_y2 * 8;                  \
        }                                     \
                                              \
        if (m & 4)                            \
        {                                     \
            CLOAD_Y4();                       \
            CGEMV_N_4x2();                    \
            CSTORE_Y4();                      \
                                              \
            k += 2 * 4;                       \
            y += inc_y2 * 4;                  \
        }                                     \
                                              \
        for (i = (m & 3); i--;)               \
        {                                     \
            CGEMV_N_1x2();                    \
                                              \
            k += 2;                           \
            y += inc_y2;                      \
        }                                     \
                                              \
        pa0 += 2 * lda2;                      \
        pa1 += 2 * lda2;                      \
                                              \
        x += 2 * inc_x2;                      \
    }                                         \
                                              \
    if (n & 1)                                \
    {                                         \
        CLOAD_X1_SCALE();                     \
                                              \
        k = 0;                                \
        y = y_org;                            \
                                              \
        for (i = m; i--;)                     \
        {                                     \
            CGEMV_N_1x1();                    \
                                              \
            k += 2;                           \
            y += inc_y2;                      \
        }                                     \
                                              \
        pa0 += lda2;                          \
        x += inc_x2;                          \
    }

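/* GEMV kernel entry point: m rows, n columns.  The lda2/inc_x2/inc_y2
 * parameters arrive as strides in complex elements and are doubled below
 * so they count FLOATs; dummy1 and buffer are unused here. */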
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
          FLOAT *A, BLASLONG lda2, FLOAT *x, BLASLONG inc_x2, FLOAT *y,
          BLASLONG inc_y2, FLOAT *buffer)
{
    BLASLONG i, j, k, k_pref, pref_offset;
    FLOAT *y_org = y;
    FLOAT *pa0, *pa1, *pa2, *pa3;
    FLOAT temp_r, temp_i, res0, res1, temp0_r;
    FLOAT temp0_i, temp1_r, temp1_i, temp2_r, temp2_i, temp3_r, temp3_i;
    v4f32 alphar, alphai;
    v4f32 x0, x1, y0, y1, y2, y3, x0r, x0i, y0r, y1r, y0i, y1i;
    v4f32 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
    v4f32 src0r, src1r, src2r, src3r, src4r, src5r, src6r, src7r;
    v4f32 src0i, src1i, src2i, src3i, src4i, src5i, src6i, src7i;
    v4f32 tp0r, tp1r, tp2r, tp3r, tp4r, tp0i, tp1i, tp2i, tp3i, tp4i;

    lda2 = 2 * lda2;
    inc_x2 = 2 * inc_x2;
    inc_y2 = 2 * inc_y2;

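    /* Distance (in FLOATs) from A to the next L1 cache-line boundary,
     * used to bias the PREFETCH addresses.  For an already line-aligned
     * A this yields a full line rather than 0, which merely shifts the
     * hint by one line and does not affect correctness. */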
    pref_offset = (uintptr_t)A & (L1_DATA_LINESIZE - 1);
    pref_offset = L1_DATA_LINESIZE - pref_offset;
    pref_offset = pref_offset / sizeof(FLOAT);

    pa0 = A;
    pa1 = A + lda2;
    pa2 = A + 2 * lda2;
    pa3 = A + 3 * lda2;

    alphar = COPY_FLOAT_TO_VECTOR(alpha_r);
    alphai = COPY_FLOAT_TO_VECTOR(alpha_i);

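    /* Pick a specialization: unit complex strides allow full vector
     * loads/stores of x and y; otherwise the element-wise GP gather and
     * scatter variants are substituted by redefining the helper macros. */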
    if ((2 == inc_x2) && (2 == inc_y2))
    {
        #define CLOAD_X4_SCALE  CLOAD_X4_SCALE_VECTOR
        #define CLOAD_X2_SCALE  CLOAD_X2_SCALE_GP
        #define CLOAD_X1_SCALE  CLOAD_X1_SCALE_GP
        #define CLOAD_Y8        CLOAD_Y8_VECTOR
        #define CLOAD_Y4        CLOAD_Y4_VECTOR
        #define CSTORE_Y8       CSTORE_Y8_VECTOR
        #define CSTORE_Y4       CSTORE_Y4_VECTOR

        CGEMV_N_MSA();

        #undef CLOAD_X4_SCALE
        #undef CLOAD_X2_SCALE
        #undef CLOAD_X1_SCALE
        #undef CLOAD_Y8
        #undef CLOAD_Y4
        #undef CSTORE_Y8
        #undef CSTORE_Y4
    }
    else if (2 == inc_x2)
    {
        #define CLOAD_X4_SCALE  CLOAD_X4_SCALE_VECTOR
        #define CLOAD_X2_SCALE  CLOAD_X2_SCALE_GP
        #define CLOAD_X1_SCALE  CLOAD_X1_SCALE_GP
        #define CLOAD_Y8        CLOAD_Y8_GP
        #define CLOAD_Y4        CLOAD_Y4_GP
        #define CSTORE_Y8       CSTORE_Y8_GP
        #define CSTORE_Y4       CSTORE_Y4_GP

        CGEMV_N_MSA();

        #undef CLOAD_X4_SCALE
        #undef CLOAD_X2_SCALE
        #undef CLOAD_X1_SCALE
        #undef CLOAD_Y8
        #undef CLOAD_Y4
        #undef CSTORE_Y8
        #undef CSTORE_Y4
    }
    else if (2 == inc_y2)
    {
        #define CLOAD_X4_SCALE  CLOAD_X4_SCALE_GP
        #define CLOAD_X2_SCALE  CLOAD_X2_SCALE_GP
        #define CLOAD_X1_SCALE  CLOAD_X1_SCALE_GP
        #define CLOAD_Y8        CLOAD_Y8_VECTOR
        #define CLOAD_Y4        CLOAD_Y4_VECTOR
        #define CSTORE_Y8       CSTORE_Y8_VECTOR
        #define CSTORE_Y4       CSTORE_Y4_VECTOR

        CGEMV_N_MSA();

        #undef CLOAD_X4_SCALE
        #undef CLOAD_X2_SCALE
        #undef CLOAD_X1_SCALE
        #undef CLOAD_Y8
        #undef CLOAD_Y4
        #undef CSTORE_Y8
        #undef CSTORE_Y4
    }
    else
    {
        #define CLOAD_X4_SCALE  CLOAD_X4_SCALE_GP
        #define CLOAD_X2_SCALE  CLOAD_X2_SCALE_GP
        #define CLOAD_X1_SCALE  CLOAD_X1_SCALE_GP
        #define CLOAD_Y8        CLOAD_Y8_GP
        #define CLOAD_Y4        CLOAD_Y4_GP
        #define CSTORE_Y8       CSTORE_Y8_GP
        #define CSTORE_Y4       CSTORE_Y4_GP

        CGEMV_N_MSA();

        #undef CLOAD_X4_SCALE
        #undef CLOAD_X2_SCALE
        #undef CLOAD_X1_SCALE
        #undef CLOAD_Y8
        #undef CLOAD_Y4
        #undef CSTORE_Y8
        #undef CSTORE_Y4
    }
    return 0;
}

#undef OP0
#undef OP1
#undef OP2
#undef OP3
#undef OP4