fix build error
[platform/upstream/openblas.git] / kernel / mips / zgemv_n_msa.c
1 /*******************************************************************************
2 Copyright (c) 2016, The OpenBLAS Project
3 All rights reserved.
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are
6 met:
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the
12 distribution.
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *******************************************************************************/
27
28 #include "common.h"
29 #include "macros_msa.h"
30
/*
 * Sign-selection operators used by every kernel macro in this file.
 *
 * Scaling of x by alpha (tp = alpha-scaled x):
 *     tp_r  = alpha_r * x_r;   tp_r OP3 alpha_i * x_i;
 *     tp_i  = alpha_r * x_i;   tp_i OP4 alpha_i * x_r;
 *   - without XCONJ this is the ordinary complex product alpha * x;
 *   - with XCONJ the signs flip, which is conj(alpha) * x.
 *
 * Accumulation of one matrix element a into y:
 *     y_r +=  tp_r * a_r;      y_r OP0 tp_i * a_i;
 *     y_i OP1 tp_r * a_i;      y_i OP2 tp_i * a_r;
 *
 *     CONJ  XCONJ   OP0 OP1 OP2   resulting update
 *      no    no      -   +   +    y += tp * a
 *      no    yes     +   +   -    y += conj(tp) * a
 *      yes   no      +   -   +    y += tp * conj(a)
 *      yes   yes     -   -   -    y += conj(tp) * conj(a)
 *
 * A valid complex product requires OP1 == OP2 whenever OP0 is "-=" and
 * OP1 != OP2 whenever OP0 is "+=".  The two CONJ rows previously had OP2
 * swapped ("-=" / "+="), which does not correspond to any product of tp and
 * a (or their conjugates); they are corrected here.
 *
 * The line-number prefixes left over from the web source view are removed
 * so that the directives are valid preprocessor lines.
 */
#undef OP0
#undef OP1
#undef OP2
#undef OP3
#undef OP4

#if !defined(XCONJ)
    #define OP3  -=
    #define OP4  +=
#else
    #define OP3  +=
    #define OP4  -=
#endif

#if !defined(CONJ)
    #if !defined(XCONJ)
        #define OP0  -=
        #define OP1  +=
        #define OP2  +=
    #else
        #define OP0  +=
        #define OP1  +=
        #define OP2  -=
    #endif
#else
    #if !defined(XCONJ)
        #define OP0  +=
        #define OP1  -=
        #define OP2  +=
    #else
        #define OP0  -=
        #define OP1  -=
        #define OP2  -=
    #endif
#endif
66
/*
 * ZGEMV_N_4x4: accumulate the contribution of four matrix columns
 * (pa0..pa3, read at FLOAT offset k) into the split accumulators
 * y0r/y1r (real parts) and y0i/y1i (imaginary parts).
 *
 * Four vectors are loaded from each column and split by PCKEVOD_D2_DP into
 * srcNr / srcNi (helper from macros_msa.h; presumably it de-interleaves
 * real and imaginary lanes -- confirm there).  tp0..tp3 (r/i) are the
 * alpha-scaled x values for the four columns, prepared by a ZLOAD_X4_SCALE
 * macro.  Signs of the cross terms come from OP0/OP1/OP2.
 *
 * Uses caller locals: pa0..pa3, k, t0..t15, src0..src7 (r/i), tp0..tp3 (r/i),
 * y0r, y1r, y0i, y1i.
 */
67 #define ZGEMV_N_4x4()                        \
68     LD_DP4(pa0 + k, 2, t0, t1, t2, t3);      \
69     LD_DP4(pa1 + k, 2, t4, t5, t6, t7);      \
70     LD_DP4(pa2 + k, 2, t8, t9, t10, t11);    \
71     LD_DP4(pa3 + k, 2, t12, t13, t14, t15);  \
72                                              \
73     PCKEVOD_D2_DP(t1, t0, src0r, src0i);     \
74     PCKEVOD_D2_DP(t3, t2, src1r, src1i);     \
75     PCKEVOD_D2_DP(t5, t4, src2r, src2i);     \
76     PCKEVOD_D2_DP(t7, t6, src3r, src3i);     \
77     PCKEVOD_D2_DP(t9, t8, src4r, src4i);     \
78     PCKEVOD_D2_DP(t11, t10, src5r, src5i);   \
79     PCKEVOD_D2_DP(t13, t12, src6r, src6i);   \
80     PCKEVOD_D2_DP(t15, t14, src7r, src7i);   \
81                                              \
82     y0r += tp0r * src0r;                     \
83     y1r += tp0r * src1r;                     \
84     y0r += tp1r * src2r;                     \
85     y1r += tp1r * src3r;                     \
86     y0r += tp2r * src4r;                     \
87     y1r += tp2r * src5r;                     \
88     y0r += tp3r * src6r;                     \
89     y1r += tp3r * src7r;                     \
90                                              \
91     y0r OP0 tp0i * src0i;                    \
92     y1r OP0 tp0i * src1i;                    \
93     y0r OP0 tp1i * src2i;                    \
94     y1r OP0 tp1i * src3i;                    \
95     y0r OP0 tp2i * src4i;                    \
96     y1r OP0 tp2i * src5i;                    \
97     y0r OP0 tp3i * src6i;                    \
98     y1r OP0 tp3i * src7i;                    \
99                                              \
100     y0i OP1 tp0r * src0i;                    \
101     y1i OP1 tp0r * src1i;                    \
102     y0i OP1 tp1r * src2i;                    \
103     y1i OP1 tp1r * src3i;                    \
104     y0i OP1 tp2r * src4i;                    \
105     y1i OP1 tp2r * src5i;                    \
106     y0i OP1 tp3r * src6i;                    \
107     y1i OP1 tp3r * src7i;                    \
108                                              \
109     y0i OP2 tp0i * src0r;                    \
110     y1i OP2 tp0i * src1r;                    \
111     y0i OP2 tp1i * src2r;                    \
112     y1i OP2 tp1i * src3r;                    \
113     y0i OP2 tp2i * src4r;                    \
114     y1i OP2 tp2i * src5r;                    \
115     y0i OP2 tp3i * src6r;                    \
116     y1i OP2 tp3i * src7r;                    \
117
/*
 * ZGEMV_N_2x4: same update as ZGEMV_N_4x4 but for a single accumulator pair
 * (y0r / y0i): two vectors are loaded from each of the four columns
 * pa0..pa3 at offset k and only src0/src2/src4/src6 are used.
 */
118 #define ZGEMV_N_2x4()                       \
119     LD_DP2(pa0 + k, 2, t0, t1);             \
120     LD_DP2(pa1 + k, 2, t4, t5);             \
121     LD_DP2(pa2 + k, 2, t8, t9);             \
122     LD_DP2(pa3 + k, 2, t12, t13);           \
123                                             \
124     PCKEVOD_D2_DP(t1, t0, src0r, src0i);    \
125     PCKEVOD_D2_DP(t5, t4, src2r, src2i);    \
126     PCKEVOD_D2_DP(t9, t8, src4r, src4i);    \
127     PCKEVOD_D2_DP(t13, t12, src6r, src6i);  \
128                                             \
129     y0r += tp0r * src0r;                    \
130     y0r += tp1r * src2r;                    \
131     y0r += tp2r * src4r;                    \
132     y0r += tp3r * src6r;                    \
133                                             \
134     y0r OP0 tp0i * src0i;                   \
135     y0r OP0 tp1i * src2i;                   \
136     y0r OP0 tp2i * src4i;                   \
137     y0r OP0 tp3i * src6i;                   \
138                                             \
139     y0i OP1 tp0r * src0i;                   \
140     y0i OP1 tp1r * src2i;                   \
141     y0i OP1 tp2r * src4i;                   \
142     y0i OP1 tp3r * src6i;                   \
143                                             \
144     y0i OP2 tp0i * src0r;                   \
145     y0i OP2 tp1i * src2r;                   \
146     y0i OP2 tp2i * src4r;                   \
147     y0i OP2 tp3i * src6r;                   \
148
/*
 * ZGEMV_N_1x4: scalar tail for one complex element of y and four columns.
 * Reads y[0] (real) and y[1] (imaginary), adds the products of
 * temp0..temp3 (r/i) with the complex entries pa0..pa3[k], pa0..pa3[k + 1],
 * and writes the pair back.  The index "0 * inc_y2" is always 0.
 */
149 #define ZGEMV_N_1x4()               \
150     res0 = y[0 * inc_y2];           \
151     res1 = y[0 * inc_y2 + 1];       \
152                                     \
153     res0  += temp0_r * pa0[k];      \
154     res0 OP0 temp0_i * pa0[k + 1];  \
155     res0  += temp1_r * pa1[k];      \
156     res0 OP0 temp1_i * pa1[k + 1];  \
157     res0  += temp2_r * pa2[k];      \
158     res0 OP0 temp2_i * pa2[k + 1];  \
159     res0  += temp3_r * pa3[k];      \
160     res0 OP0 temp3_i * pa3[k + 1];  \
161                                     \
162     res1 OP1 temp0_r * pa0[k + 1];  \
163     res1 OP2 temp0_i * pa0[k];      \
164     res1 OP1 temp1_r * pa1[k + 1];  \
165     res1 OP2 temp1_i * pa1[k];      \
166     res1 OP1 temp2_r * pa2[k + 1];  \
167     res1 OP2 temp2_i * pa2[k];      \
168     res1 OP1 temp3_r * pa3[k + 1];  \
169     res1 OP2 temp3_i * pa3[k];      \
170                                     \
171     y[0 * inc_y2]     = res0;       \
172     y[0 * inc_y2 + 1] = res1;       \
173
/*
 * ZGEMV_N_4x2: two-column variant of ZGEMV_N_4x4.  Loads four vectors from
 * each of pa0 and pa1 at offset k, splits them into src0..src3 (r/i) and
 * accumulates into y0r/y1r and y0i/y1i using tp0 and tp1 (r/i).
 */
174 #define ZGEMV_N_4x2()                     \
175     LD_DP4(pa0 + k, 2, t0, t1, t2, t3);   \
176     LD_DP4(pa1 + k, 2, t4, t5, t6, t7);   \
177                                           \
178     PCKEVOD_D2_DP(t1, t0, src0r, src0i);  \
179     PCKEVOD_D2_DP(t3, t2, src1r, src1i);  \
180     PCKEVOD_D2_DP(t5, t4, src2r, src2i);  \
181     PCKEVOD_D2_DP(t7, t6, src3r, src3i);  \
182                                           \
183     y0r += tp0r * src0r;                  \
184     y1r += tp0r * src1r;                  \
185     y0r += tp1r * src2r;                  \
186     y1r += tp1r * src3r;                  \
187                                           \
188     y0r OP0 tp0i * src0i;                 \
189     y1r OP0 tp0i * src1i;                 \
190     y0r OP0 tp1i * src2i;                 \
191     y1r OP0 tp1i * src3i;                 \
192                                           \
193     y0i OP1 tp0r * src0i;                 \
194     y1i OP1 tp0r * src1i;                 \
195     y0i OP1 tp1r * src2i;                 \
196     y1i OP1 tp1r * src3i;                 \
197                                           \
198     y0i OP2 tp0i * src0r;                 \
199     y1i OP2 tp0i * src1r;                 \
200     y0i OP2 tp1i * src2r;                 \
201     y1i OP2 tp1i * src3r;                 \
202
/*
 * ZGEMV_N_2x2: two columns (pa0, pa1), one accumulator pair (y0r / y0i).
 * Loads two vectors per column at offset k and accumulates with tp0 and
 * tp1 (r/i).
 */
203 #define ZGEMV_N_2x2()                     \
204     LD_DP2(pa0 + k, 2, t0, t1);           \
205     LD_DP2(pa1 + k, 2, t4, t5);           \
206                                           \
207     PCKEVOD_D2_DP(t1, t0, src0r, src0i);  \
208     PCKEVOD_D2_DP(t5, t4, src2r, src2i);  \
209                                           \
210     y0r += tp0r * src0r;                  \
211     y0r += tp1r * src2r;                  \
212                                           \
213     y0r OP0 tp0i * src0i;                 \
214     y0r OP0 tp1i * src2i;                 \
215                                           \
216     y0i OP1 tp0r * src0i;                 \
217     y0i OP1 tp1r * src2i;                 \
218                                           \
219     y0i OP2 tp0i * src0r;                 \
220     y0i OP2 tp1i * src2r;                 \
221
/*
 * ZGEMV_N_1x2: scalar tail for one complex element of y and two columns.
 * Reads y[0] / y[1], adds the products of temp0 and temp1 (r/i) with the
 * complex entries at pa0[k], pa0[k + 1] and pa1[k], pa1[k + 1], and stores
 * the pair back.
 */
222 #define ZGEMV_N_1x2()               \
223     res0 = y[0 * inc_y2];           \
224     res1 = y[0 * inc_y2 + 1];       \
225                                     \
226     res0  += temp0_r * pa0[k];      \
227     res0 OP0 temp0_i * pa0[k + 1];  \
228     res0  += temp1_r * pa1[k];      \
229     res0 OP0 temp1_i * pa1[k + 1];  \
230                                     \
231     res1 OP1 temp0_r * pa0[k + 1];  \
232     res1 OP2 temp0_i * pa0[k];      \
233     res1 OP1 temp1_r * pa1[k + 1];  \
234     res1 OP2 temp1_i * pa1[k];      \
235                                     \
236     y[0 * inc_y2]     = res0;       \
237     y[0 * inc_y2 + 1] = res1;       \
238
/*
 * ZGEMV_N_4x1: single-column variant.  Loads four vectors from pa0 at
 * offset k, splits them into src0/src1 (r/i) and accumulates into y0r/y1r
 * and y0i/y1i using tp0r / tp0i only.
 */
239 #define ZGEMV_N_4x1()                     \
240     LD_DP4(pa0 + k, 2, t0, t1, t2, t3);   \
241                                           \
242     PCKEVOD_D2_DP(t1, t0, src0r, src0i);  \
243     PCKEVOD_D2_DP(t3, t2, src1r, src1i);  \
244                                           \
245     y0r += tp0r * src0r;                  \
246     y1r += tp0r * src1r;                  \
247                                           \
248     y0r OP0 tp0i * src0i;                 \
249     y1r OP0 tp0i * src1i;                 \
250                                           \
251     y0i OP1 tp0r * src0i;                 \
252     y1i OP1 tp0r * src1i;                 \
253                                           \
254     y0i OP2 tp0i * src0r;                 \
255     y1i OP2 tp0i * src1r;                 \
256
/*
 * ZGEMV_N_2x1: single column, single accumulator pair.  Loads two vectors
 * from pa0 at offset k, splits them into src0r / src0i and accumulates into
 * y0r / y0i using tp0r / tp0i.
 */
257 #define ZGEMV_N_2x1()                     \
258     LD_DP2(pa0 + k, 2, t0, t1);           \
259                                           \
260     PCKEVOD_D2_DP(t1, t0, src0r, src0i);  \
261                                           \
262     y0r += tp0r * src0r;                  \
263     y0r OP0 tp0i * src0i;                 \
264     y0i OP1 tp0r * src0i;                 \
265     y0i OP2 tp0i * src0r;                 \
266
/*
 * ZGEMV_N_1x1: scalar tail for one complex element of y and one column.
 * Reads y[0] / y[1], adds the product of temp0_r / temp0_i with the complex
 * entry pa0[k], pa0[k + 1], and stores the pair back.
 */
267 #define ZGEMV_N_1x1()               \
268     res0 = y[0 * inc_y2];           \
269     res1 = y[0 * inc_y2 + 1];       \
270                                     \
271     res0  += temp0_r * pa0[k];      \
272     res0 OP0 temp0_i * pa0[k + 1];  \
273                                     \
274     res1 OP1 temp0_r * pa0[k + 1];  \
275     res1 OP2 temp0_i * pa0[k];      \
276                                     \
277     y[0 * inc_y2]     = res0;       \
278     y[0 * inc_y2 + 1] = res1;       \
279
/*
 * ZLOAD_X4_SCALE_VECTOR: load four vectors from x with vector loads, split
 * them into x0r/x0i and x1r/x1i, and scale by alpha (alphar / alphai) into
 * tp4r/tp4i and tp5r/tp5i; OP3/OP4 select the signs of the cross terms.
 * SPLATI_D2_DP (macros_msa.h) then fills tp0..tp3 (r/i) from tp4/tp5 --
 * presumably broadcasting each lane into its own vector; confirm there.
 * Selected by CNAME when inc_x2 == 2.
 */
280 #define ZLOAD_X4_SCALE_VECTOR()       \
281     LD_DP4(x, 2, x0, x1, x2, x3);     \
282                                       \
283     PCKEVOD_D2_DP(x1, x0, x0r, x0i);  \
284     PCKEVOD_D2_DP(x3, x2, x1r, x1i);  \
285                                       \
286     tp4r   = alphar * x0r;            \
287     tp4r OP3 alphai * x0i;            \
288     tp4i   = alphar * x0i;            \
289     tp4i OP4 alphai * x0r;            \
290                                       \
291     tp5r   = alphar * x1r;            \
292     tp5r OP3 alphai * x1i;            \
293     tp5i   = alphar * x1i;            \
294     tp5i OP4 alphai * x1r;            \
295                                       \
296     SPLATI_D2_DP(tp4r, tp0r, tp1r);   \
297     SPLATI_D2_DP(tp5r, tp2r, tp3r);   \
298     SPLATI_D2_DP(tp4i, tp0i, tp1i);   \
299     SPLATI_D2_DP(tp5i, tp2i, tp3i);   \
300
/*
 * ZLOAD_X2_SCALE_VECTOR: two-vector version of ZLOAD_X4_SCALE_VECTOR.
 * Loads x0, x1 from x, splits them into x0r / x0i, scales by alpha into
 * tp4r / tp4i and fills tp0 and tp1 (r/i) via SPLATI_D2_DP.
 */
301 #define ZLOAD_X2_SCALE_VECTOR()       \
302     LD_DP2(x, 2, x0, x1);             \
303                                       \
304     PCKEVOD_D2_DP(x1, x0, x0r, x0i);  \
305                                       \
306     tp4r   = alphar * x0r;            \
307     tp4r OP3 alphai * x0i;            \
308     tp4i   = alphar * x0i;            \
309     tp4i OP4 alphai * x0r;            \
310                                       \
311     SPLATI_D2_DP(tp4r, tp0r, tp1r);   \
312     SPLATI_D2_DP(tp4i, tp0i, tp1i);   \
313
/*
 * ZLOAD_X4_SCALE_GP: gather four strided complex elements of x through
 * general-purpose loads and scale them by alpha.
 *
 * Each 64-bit double is read as a raw 64-bit integer and inserted into lane
 * 0 or 1 of a vector with __msa_insert_d; tp0r only serves as the initial
 * vector operand, both of its lanes are overwritten.  Real parts go to
 * x0r / x1r, imaginary parts to x0i / x1i.  The scaled values land in
 * tp4 / tp5 (r/i) and are then distributed to tp0..tp3 (r/i) by
 * SPLATI_D2_DP.
 *
 * The raw loads use "long long" (the 64-bit element type taken by
 * __msa_insert_d) instead of BLASLONG, whose width depends on the build
 * configuration and is not guaranteed to be 64 bits.
 */
#define ZLOAD_X4_SCALE_GP()                                                                \
    x0r = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *)(x + 0 * inc_x2)));       \
    x0r = (v2f64) __msa_insert_d((v2i64) x0r,  1, *((long long *)(x + 1 * inc_x2)));       \
    x1r = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *)(x + 2 * inc_x2)));       \
    x1r = (v2f64) __msa_insert_d((v2i64) x1r,  1, *((long long *)(x + 3 * inc_x2)));       \
    x0i = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *)(x + 0 * inc_x2 + 1)));   \
    x0i = (v2f64) __msa_insert_d((v2i64) x0i,  1, *((long long *)(x + 1 * inc_x2 + 1)));   \
    x1i = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *)(x + 2 * inc_x2 + 1)));   \
    x1i = (v2f64) __msa_insert_d((v2i64) x1i,  1, *((long long *)(x + 3 * inc_x2 + 1)));   \
                                                                                           \
    tp4r   = alphar * x0r;                                                                 \
    tp4r OP3 alphai * x0i;                                                                 \
    tp4i   = alphar * x0i;                                                                 \
    tp4i OP4 alphai * x0r;                                                                 \
                                                                                           \
    tp5r   = alphar * x1r;                                                                 \
    tp5r OP3 alphai * x1i;                                                                 \
    tp5i   = alphar * x1i;                                                                 \
    tp5i OP4 alphai * x1r;                                                                 \
                                                                                           \
    SPLATI_D2_DP(tp4r, tp0r, tp1r);                                                        \
    SPLATI_D2_DP(tp5r, tp2r, tp3r);                                                        \
    SPLATI_D2_DP(tp4i, tp0i, tp1i);                                                        \
    SPLATI_D2_DP(tp5i, tp2i, tp3i);

/*
 * ZLOAD_X2_SCALE_GP: two-element version of ZLOAD_X4_SCALE_GP.  Gathers two
 * strided complex elements of x into x0r / x0i, scales them by alpha into
 * tp4r / tp4i and distributes the result to tp0 and tp1 (r/i).
 */
#define ZLOAD_X2_SCALE_GP()                                                                \
    x0r = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *)(x + 0 * inc_x2)));       \
    x0r = (v2f64) __msa_insert_d((v2i64) x0r,  1, *((long long *)(x + 1 * inc_x2)));       \
    x0i = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *)(x + 0 * inc_x2 + 1)));   \
    x0i = (v2f64) __msa_insert_d((v2i64) x0i,  1, *((long long *)(x + 1 * inc_x2 + 1)));   \
                                                                                           \
    tp4r   = alphar * x0r;                                                                 \
    tp4r OP3 alphai * x0i;                                                                 \
    tp4i   = alphar * x0i;                                                                 \
    tp4i OP4 alphai * x0r;                                                                 \
                                                                                           \
    SPLATI_D2_DP(tp4r, tp0r, tp1r);                                                        \
    SPLATI_D2_DP(tp4i, tp0i, tp1i);

/*
 * ZLOAD_X1_SCALE_GP: scale a single complex element of x (x[0], x[1]) by
 * the scalar alpha_r / alpha_i into temp0_r / temp0_i, with OP3/OP4
 * selecting the cross-term signs, then place both results in the vectors
 * tp0r / tp0i via COPY_DOUBLE_TO_VECTOR (macros_msa.h; presumably a
 * broadcast to every lane -- confirm there).  Used for all four stride
 * combinations in CNAME.
 */
353 #define ZLOAD_X1_SCALE_GP()                         \
354     temp0_r   = alpha_r * x[0 * inc_x2];            \
355     temp0_r OP3 alpha_i * x[0 * inc_x2 + 1];        \
356     temp0_i   = alpha_r * x[0 * inc_x2 + 1];        \
357     temp0_i OP4 alpha_i * x[0 * inc_x2];            \
358                                                     \
359     tp0r = (v2f64) COPY_DOUBLE_TO_VECTOR(temp0_r);  \
360     tp0i = (v2f64) COPY_DOUBLE_TO_VECTOR(temp0_i);  \
361
/*
 * Vector load/store helpers for y, selected by CNAME when inc_y2 == 2.
 *
 * ZLOAD_Y4_VECTOR / ZLOAD_Y2_VECTOR load four / two vectors from y and
 * split them into the accumulators y0r/y0i (and y1r/y1i) with
 * PCKEVOD_D2_DP.
 * ZSTORE_Y4_VECTOR / ZSTORE_Y2_VECTOR recombine the accumulators with
 * ILVRL_D2_DP into y0..y3 (or y0, y1) and store them back to y.
 * LD_DP*, ST_DP*, PCKEVOD_D2_DP and ILVRL_D2_DP come from macros_msa.h.
 */
362 #define ZLOAD_Y4_VECTOR()             \
363     LD_DP4(y, 2, y0, y1, y2, y3);     \
364     PCKEVOD_D2_DP(y1, y0, y0r, y0i);  \
365     PCKEVOD_D2_DP(y3, y2, y1r, y1i);  \
366
367 #define ZLOAD_Y2_VECTOR()             \
368     LD_DP2(y, 2, y0, y1);             \
369     PCKEVOD_D2_DP(y1, y0, y0r, y0i);  \
370
371 #define ZSTORE_Y4_VECTOR()          \
372     ILVRL_D2_DP(y0i, y0r, y0, y1);  \
373     ILVRL_D2_DP(y1i, y1r, y2, y3);  \
374     ST_DP4(y0, y1, y2, y3, y, 2);   \
375
376 #define ZSTORE_Y2_VECTOR()          \
377     ILVRL_D2_DP(y0i, y0r, y0, y1);  \
378     ST_DP2(y0, y1, y, 2);           \
379
/*
 * General-purpose (strided) load/store helpers for y, selected by CNAME
 * when inc_y2 != 2.
 *
 * The loads read each 64-bit double of y as a raw 64-bit integer and insert
 * it into lane 0 or 1 of an accumulator with __msa_insert_d; tp0r is only
 * the initial vector operand and both of its lanes are overwritten.  The
 * stores extract each lane with __msa_copy_s_d and write the raw 64 bits
 * back to the same positions.
 *
 * The raw accesses use "long long" (the 64-bit element type of the MSA
 * doubleword insert/copy intrinsics) instead of BLASLONG, whose width
 * depends on the build configuration and is not guaranteed to be 64 bits.
 */

/* Gather four strided complex elements of y into y0r/y1r and y0i/y1i. */
#define ZLOAD_Y4_GP()                                                                      \
    y0r = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *)(y +  0 * inc_y2)));      \
    y0r = (v2f64) __msa_insert_d((v2i64) y0r,  1, *((long long *)(y +  1 * inc_y2)));      \
    y1r = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *)(y +  2 * inc_y2)));      \
    y1r = (v2f64) __msa_insert_d((v2i64) y1r,  1, *((long long *)(y +  3 * inc_y2)));      \
    y0i = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *)(y +  0 * inc_y2 + 1)));  \
    y0i = (v2f64) __msa_insert_d((v2i64) y0i,  1, *((long long *)(y +  1 * inc_y2 + 1)));  \
    y1i = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *)(y +  2 * inc_y2 + 1)));  \
    y1i = (v2f64) __msa_insert_d((v2i64) y1i,  1, *((long long *)(y +  3 * inc_y2 + 1)));

/* Gather two strided complex elements of y into y0r / y0i. */
#define ZLOAD_Y2_GP()                                                                      \
    y0r = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *)(y +  0 * inc_y2)));      \
    y0r = (v2f64) __msa_insert_d((v2i64) y0r,  1, *((long long *)(y +  1 * inc_y2)));      \
    y0i = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *)(y +  0 * inc_y2 + 1)));  \
    y0i = (v2f64) __msa_insert_d((v2i64) y0i,  1, *((long long *)(y +  1 * inc_y2 + 1)));

/* Scatter y0r/y1r and y0i/y1i back to four strided complex elements of y. */
#define ZSTORE_Y4_GP()                                                      \
    *((long long *)(y + 0 * inc_y2)) = __msa_copy_s_d((v2i64) y0r, 0);      \
    *((long long *)(y + 1 * inc_y2)) = __msa_copy_s_d((v2i64) y0r, 1);      \
    *((long long *)(y + 2 * inc_y2)) = __msa_copy_s_d((v2i64) y1r, 0);      \
    *((long long *)(y + 3 * inc_y2)) = __msa_copy_s_d((v2i64) y1r, 1);      \
    *((long long *)(y + 0 * inc_y2 + 1)) = __msa_copy_s_d((v2i64) y0i, 0);  \
    *((long long *)(y + 1 * inc_y2 + 1)) = __msa_copy_s_d((v2i64) y0i, 1);  \
    *((long long *)(y + 2 * inc_y2 + 1)) = __msa_copy_s_d((v2i64) y1i, 0);  \
    *((long long *)(y + 3 * inc_y2 + 1)) = __msa_copy_s_d((v2i64) y1i, 1);

/* Scatter y0r / y0i back to two strided complex elements of y. */
#define ZSTORE_Y2_GP()                                                      \
    *((long long *)(y + 0 * inc_y2)) = __msa_copy_s_d((v2i64) y0r, 0);      \
    *((long long *)(y + 1 * inc_y2)) = __msa_copy_s_d((v2i64) y0r, 1);      \
    *((long long *)(y + 0 * inc_y2 + 1)) = __msa_copy_s_d((v2i64) y0i, 0);  \
    *((long long *)(y + 1 * inc_y2 + 1)) = __msa_copy_s_d((v2i64) y0i, 1);

/*
 * ZGEMV_N_MSA: the complete blocked update, expanded once per stride
 * specialization inside CNAME.
 *
 * At the point of expansion the macros ZLOAD_X4_SCALE, ZLOAD_X2_SCALE,
 * ZLOAD_X1_SCALE, ZLOAD_Y4, ZLOAD_Y2, ZSTORE_Y4 and ZSTORE_Y2 must be
 * defined (CNAME maps them to the _VECTOR or _GP variants).
 *
 * Structure:
 *   - columns are consumed in groups of four (n >> 2 iterations), then an
 *     optional group of two (n & 2) and an optional single column (n & 1);
 *   - for every column group the x values are loaded and scaled once, y is
 *     rewound to y_org and k to 0, and the rows are processed four at a time
 *     (m >> 2), then two (m & 2), then one (m & 1);
 *   - the single-row tails of the 4- and 2-column groups copy the scaled x
 *     values out of the lanes of tp4 / tp5 into scalar temps before calling
 *     the scalar kernels; the 1-column tail reuses temp0_r / temp0_i set by
 *     ZLOAD_X1_SCALE;
 *   - PREFETCH is issued only in the four-row loop of the four-column group,
 *     at offset k_pref + 8 (and + 4 more) from each column pointer;
 *   - after each column group the column pointers and x are advanced past
 *     the columns just consumed.
 */
412 #define ZGEMV_N_MSA()                        \
413     for (j = (n >> 2); j--;)                 \
414     {                                        \
415         ZLOAD_X4_SCALE()                     \
416                                              \
417         k = 0;                               \
418         k_pref = pref_offset;                \
419         y = y_org;                           \
420                                              \
421         for (i = (m >> 2); i--;)             \
422         {                                    \
423             PREFETCH(pa0 + k_pref + 8 + 0);  \
424             PREFETCH(pa0 + k_pref + 8 + 4);  \
425             PREFETCH(pa1 + k_pref + 8 + 0);  \
426             PREFETCH(pa1 + k_pref + 8 + 4);  \
427             PREFETCH(pa2 + k_pref + 8 + 0);  \
428             PREFETCH(pa2 + k_pref + 8 + 4);  \
429             PREFETCH(pa3 + k_pref + 8 + 0);  \
430             PREFETCH(pa3 + k_pref + 8 + 4);  \
431                                              \
432             ZLOAD_Y4()                       \
433             ZGEMV_N_4x4()                    \
434             ZSTORE_Y4()                      \
435                                              \
436             k += 2 * 4;                      \
437             k_pref += 2 * 4;                 \
438             y += inc_y2 * 4;                 \
439         }                                    \
440                                              \
441         if (m & 2)                           \
442         {                                    \
443             ZLOAD_Y2()                       \
444             ZGEMV_N_2x4()                    \
445             ZSTORE_Y2()                      \
446                                              \
447             k += 2 * 2;                      \
448             y += inc_y2 * 2;                 \
449         }                                    \
450                                              \
451         if (m & 1)                           \
452         {                                    \
453             temp0_r = tp4r[0];               \
454             temp1_r = tp4r[1];               \
455             temp2_r = tp5r[0];               \
456             temp3_r = tp5r[1];               \
457                                              \
458             temp0_i = tp4i[0];               \
459             temp1_i = tp4i[1];               \
460             temp2_i = tp5i[0];               \
461             temp3_i = tp5i[1];               \
462                                              \
463             ZGEMV_N_1x4()                    \
464             k += 2;                          \
465             y += inc_y2;                     \
466         }                                    \
467                                              \
468         pa0 += 4 * lda2;                     \
469         pa1 += 4 * lda2;                     \
470         pa2 += 4 * lda2;                     \
471         pa3 += 4 * lda2;                     \
472                                              \
473         x += 4 * inc_x2;                     \
474     }                                        \
475                                              \
476     if (n & 2)                               \
477     {                                        \
478         ZLOAD_X2_SCALE()                     \
479                                              \
480         k = 0;                               \
481         y = y_org;                           \
482                                              \
483         for (i = (m >> 2); i--;)             \
484         {                                    \
485             ZLOAD_Y4()                       \
486             ZGEMV_N_4x2()                    \
487             ZSTORE_Y4()                      \
488                                              \
489             k += 2 * 4;                      \
490             y += inc_y2 * 4;                 \
491         }                                    \
492                                              \
493         if (m & 2)                           \
494         {                                    \
495             ZLOAD_Y2()                       \
496             ZGEMV_N_2x2()                    \
497             ZSTORE_Y2()                      \
498                                              \
499             k += 2 * 2;                      \
500             y += inc_y2 * 2;                 \
501         }                                    \
502                                              \
503         if (m & 1)                           \
504         {                                    \
505             temp0_r = tp4r[0];               \
506             temp1_r = tp4r[1];               \
507                                              \
508             temp0_i = tp4i[0];               \
509             temp1_i = tp4i[1];               \
510                                              \
511             ZGEMV_N_1x2()                    \
512                                              \
513             k += 2;                          \
514             y += inc_y2;                     \
515         }                                    \
516                                              \
517         pa0 += 2 * lda2;                     \
518         pa1 += 2 * lda2;                     \
519                                              \
520         x += 2 * inc_x2;                     \
521     }                                        \
522                                              \
523     if (n & 1)                               \
524     {                                        \
525         ZLOAD_X1_SCALE()                     \
526                                              \
527         k = 0;                               \
528         y = y_org;                           \
529                                              \
530         for (i = (m >> 2); i--;)             \
531         {                                    \
532             ZLOAD_Y4()                       \
533             ZGEMV_N_4x1()                    \
534             ZSTORE_Y4()                      \
535                                              \
536             k += 2 * 4;                      \
537             y += inc_y2 * 4;                 \
538         }                                    \
539                                              \
540         if (m & 2)                           \
541         {                                    \
542             ZLOAD_Y2()                       \
543             ZGEMV_N_2x1()                    \
544             ZSTORE_Y2()                      \
545                                              \
546             k += 2 * 2;                      \
547             y += inc_y2 * 2;                 \
548         }                                    \
549                                              \
550         if (m & 1)                           \
551         {                                    \
552             ZGEMV_N_1x1()                    \
553                                              \
554             k += 2;                          \
555             y += inc_y2;                     \
556         }                                    \
557                                              \
558         pa0 += lda2;                         \
559         x += inc_x2;                         \
560     }                                        \
561
/*
 * CNAME: kernel entry point.  Adds to y, in place, the contribution of the
 * alpha-scaled vector x multiplied by the columns of A, using the
 * ZGEMV_N_MSA driver macro specialized for the strides of x and y.
 *
 * m, n          - m bounds the inner loops that walk y and the rows of a
 *                 column; n bounds the outer loops that walk x and the
 *                 columns of A.
 * dummy1        - not referenced.
 * alpha_r/_i    - real and imaginary parts of the scalar applied to x.
 * A, lda2       - matrix data and distance between consecutive columns.
 * x, inc_x2     - input vector and its stride.
 * y, inc_y2     - vector that is read, updated and written back.
 * buffer        - not referenced.
 *
 * lda2, inc_x2 and inc_y2 are doubled on entry (presumably converting from
 * complex-element units to FLOAT units -- confirm against the callers), so
 * the value 2 in the stride tests below means "elements are adjacent".
 *
 * Always returns 0.
 */
562 int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
563           FLOAT *A, BLASLONG lda2, FLOAT *x, BLASLONG inc_x2, FLOAT *y,
564           BLASLONG inc_y2, FLOAT *buffer)
565 {
566     BLASLONG i, j, k, k_pref, pref_offset;
567     FLOAT *y_org = y;
568     FLOAT *pa0, *pa1, *pa2, *pa3;
569     FLOAT temp0_r, temp1_r, temp2_r, temp3_r, temp0_i, temp1_i, temp2_i;
570     FLOAT temp3_i, res0, res1;
571     v2f64 alphar, alphai;
572     v2f64 x0, x1, x2, x3, y0, y1, y2, y3;
573     v2f64 x0r, x1r, x0i, x1i, y0r, y1r, y0i, y1i;
574     v2f64 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
575     v2f64 src0r, src1r, src2r, src3r, src4r, src5r, src6r, src7r;
576     v2f64 src0i, src1i, src2i, src3i, src4i, src5i, src6i, src7i;
577     v2f64 tp0r, tp1r, tp2r, tp3r, tp4r, tp5r, tp0i, tp1i, tp2i, tp3i, tp4i, tp5i;
578
579     lda2   = 2 * lda2;
580     inc_x2 = 2 * inc_x2;
581     inc_y2 = 2 * inc_y2;
582
    /* Number of FLOATs from A up to the next L1_DATA_LINESIZE boundary (a
       whole line when A is already aligned).  The mask assumes the line size
       is a power of two.  Used as the starting prefetch offset. */
583     pref_offset = (uintptr_t)A & (L1_DATA_LINESIZE - 1);
584     pref_offset = L1_DATA_LINESIZE - pref_offset;
585     pref_offset = pref_offset / sizeof(FLOAT);
586
    /* Pointers to four consecutive columns of A. */
587     pa0 = A;
588     pa1 = A + lda2;
589     pa2 = A + 2 * lda2;
590     pa3 = A + 3 * lda2;
591
592     alphar = COPY_DOUBLE_TO_VECTOR(alpha_r);
593     alphai = COPY_DOUBLE_TO_VECTOR(alpha_i);
594
    /* x and y both adjacent: vector loads for x, vector loads/stores for y. */
595     if ((2 == inc_x2) && (2 == inc_y2))
596     {
597         #define ZLOAD_X4_SCALE  ZLOAD_X4_SCALE_VECTOR
598         #define ZLOAD_X2_SCALE  ZLOAD_X2_SCALE_VECTOR
599         #define ZLOAD_X1_SCALE  ZLOAD_X1_SCALE_GP
600         #define ZLOAD_Y4        ZLOAD_Y4_VECTOR
601         #define ZLOAD_Y2        ZLOAD_Y2_VECTOR
602         #define ZSTORE_Y4       ZSTORE_Y4_VECTOR
603         #define ZSTORE_Y2       ZSTORE_Y2_VECTOR
604
605         ZGEMV_N_MSA();
606
607         #undef ZLOAD_X4_SCALE
608         #undef ZLOAD_X2_SCALE
609         #undef ZLOAD_X1_SCALE
610         #undef ZLOAD_Y4
611         #undef ZLOAD_Y2
612         #undef ZSTORE_Y4
613         #undef ZSTORE_Y2
614     }
    /* Only x adjacent: vector loads for x, element-wise access for y. */
615     else if (2 == inc_x2)
616     {
617         #define ZLOAD_X4_SCALE  ZLOAD_X4_SCALE_VECTOR
618         #define ZLOAD_X2_SCALE  ZLOAD_X2_SCALE_VECTOR
619         #define ZLOAD_X1_SCALE  ZLOAD_X1_SCALE_GP
620         #define ZLOAD_Y4        ZLOAD_Y4_GP
621         #define ZLOAD_Y2        ZLOAD_Y2_GP
622         #define ZSTORE_Y4       ZSTORE_Y4_GP
623         #define ZSTORE_Y2       ZSTORE_Y2_GP
624
625         ZGEMV_N_MSA();
626
627         #undef ZLOAD_X4_SCALE
628         #undef ZLOAD_X2_SCALE
629         #undef ZLOAD_X1_SCALE
630         #undef ZLOAD_Y4
631         #undef ZLOAD_Y2
632         #undef ZSTORE_Y4
633         #undef ZSTORE_Y2
634     }
    /* Only y adjacent: element-wise access for x, vector loads/stores for y. */
635     else if (2 == inc_y2)
636     {
637         #define ZLOAD_X4_SCALE  ZLOAD_X4_SCALE_GP
638         #define ZLOAD_X2_SCALE  ZLOAD_X2_SCALE_GP
639         #define ZLOAD_X1_SCALE  ZLOAD_X1_SCALE_GP
640         #define ZLOAD_Y4        ZLOAD_Y4_VECTOR
641         #define ZLOAD_Y2        ZLOAD_Y2_VECTOR
642         #define ZSTORE_Y4       ZSTORE_Y4_VECTOR
643         #define ZSTORE_Y2       ZSTORE_Y2_VECTOR
644
645         ZGEMV_N_MSA();
646
647         #undef ZLOAD_X4_SCALE
648         #undef ZLOAD_X2_SCALE
649         #undef ZLOAD_X1_SCALE
650         #undef ZLOAD_Y4
651         #undef ZLOAD_Y2
652         #undef ZSTORE_Y4
653         #undef ZSTORE_Y2
654     }
    /* Neither adjacent: element-wise access for both x and y. */
655     else
656     {
657         #define ZLOAD_X4_SCALE  ZLOAD_X4_SCALE_GP
658         #define ZLOAD_X2_SCALE  ZLOAD_X2_SCALE_GP
659         #define ZLOAD_X1_SCALE  ZLOAD_X1_SCALE_GP
660         #define ZLOAD_Y4        ZLOAD_Y4_GP
661         #define ZLOAD_Y2        ZLOAD_Y2_GP
662         #define ZSTORE_Y4       ZSTORE_Y4_GP
663         #define ZSTORE_Y2       ZSTORE_Y2_GP
664
665         ZGEMV_N_MSA();
666
667         #undef ZLOAD_X4_SCALE
668         #undef ZLOAD_X2_SCALE
669         #undef ZLOAD_X1_SCALE
670         #undef ZLOAD_Y4
671         #undef ZLOAD_Y2
672         #undef ZSTORE_Y4
673         #undef ZSTORE_Y2
674     }
675     return(0);
676 }
677
/* Remove the sign-selection operator macros defined at the top of the file. */
678 #undef OP0
679 #undef OP1
680 #undef OP2
681 #undef OP3
682 #undef OP4