fix build error
[platform/upstream/openblas.git] / kernel / mips / zgemv_t_msa.c
1 /*******************************************************************************
2 Copyright (c) 2016, The OpenBLAS Project
3 All rights reserved.
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are
6 met:
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the
12 distribution.
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *******************************************************************************/
27
28 #include "common.h"
29 #include "macros_msa.h"
30
/*
 * Compound-assignment operators used by the accumulation macros in this file.
 *
 * Any earlier definitions of OP0..OP4 are discarded first; only OP0, OP1 and
 * OP2 are redefined here.
 *
 *   CONJ and XCONJ both defined, or both undefined:
 *       OP0 is "-=", OP1 is "+=", OP2 is "+=".
 *   exactly one of CONJ / XCONJ defined:
 *       OP0 is "+=", OP1 is "+=", OP2 is "-=".
 *
 * In the macros below OP0 applies the (imag * imag) product to a real
 * accumulator, OP1 applies (real * imag) to an imaginary accumulator and OP2
 * applies (imag * real) to an imaginary accumulator.  OP1 is "+=" in both
 * branches.
 */
31 #undef OP0
32 #undef OP1
33 #undef OP2
34 #undef OP3
35 #undef OP4
36
37 #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
38     #define OP0  -=
39     #define OP1  +=
40     #define OP2  +=
41 #else
42     #define OP0  +=
43     #define OP1  +=
44     #define OP2  -=
45 #endif
46
/*
 * ZGEMV_T_8x1: accumulate one 8-element slice of a column into tp0r / tp0i.
 *
 * Loads t0..t3 from pa0 and t4..t7 from pa0 + 8 with LD_DP4, then splits each
 * vector pair into src<k>r / src<k>i with PCKEVOD_D2_DP (both helpers come
 * from macros_msa.h; presumably the even lanes are the real parts and the odd
 * lanes the imaginary parts - confirm there).
 *
 * For each pair k the following four updates are issued, in the pair order
 * 0, 2, 1, 3:
 *     tp0r +=  src<k>r * x<k>r
 *     tp0i OP1 src<k>r * x<k>i
 *     tp0r OP0 src<k>i * x<k>i
 *     tp0i OP2 src<k>i * x<k>r
 *
 * The expansion site must provide pa0, t0..t7, src0r..src3i, tp0r, tp0i and
 * already-loaded x0r..x3r / x0i..x3i.  Neither pa0 nor x is advanced here.
 * The statement order determines floating-point rounding; keep it as is.
 */
47 #define ZGEMV_T_8x1()                     \
48     LD_DP4(pa0, 2, t0, t1, t2, t3);       \
49     LD_DP4(pa0 + 8, 2, t4, t5, t6, t7);   \
50                                           \
51     PCKEVOD_D2_DP(t1, t0, src0r, src0i);  \
52     PCKEVOD_D2_DP(t3, t2, src1r, src1i);  \
53     PCKEVOD_D2_DP(t5, t4, src2r, src2i);  \
54     PCKEVOD_D2_DP(t7, t6, src3r, src3i);  \
55                                           \
56     tp0r += src0r * x0r;                  \
57     tp0i OP1 src0r * x0i;                 \
58     tp0r OP0 src0i * x0i;                 \
59     tp0i OP2 src0i * x0r;                 \
60                                           \
61     tp0r += src2r * x2r;                  \
62     tp0i OP1 src2r * x2i;                 \
63     tp0r OP0 src2i * x2i;                 \
64     tp0i OP2 src2i * x2r;                 \
65                                           \
66     tp0r += src1r * x1r;                  \
67     tp0i OP1 src1r * x1i;                 \
68     tp0r OP0 src1i * x1i;                 \
69     tp0i OP2 src1i * x1r;                 \
70                                           \
71     tp0r += src3r * x3r;                  \
72     tp0i OP1 src3r * x3i;                 \
73     tp0r OP0 src3i * x3i;                 \
74     tp0i OP2 src3i * x3r;                 \
75
/*
 * ZGEMV_T_4x1: accumulate one 4-element slice of a column into tp0r / tp0i.
 *
 * Loads t0..t3 from pa0 with LD_DP4 and splits them into src0r/src0i and
 * src1r/src1i with PCKEVOD_D2_DP (helpers from macros_msa.h; presumably
 * real = even lanes, imaginary = odd lanes - confirm there).
 *
 * Unlike ZGEMV_T_8x1, all four tp0r updates are issued first
 * (+= real*real for pairs 0 and 1, then OP0 imag*imag for pairs 0 and 1),
 * followed by the four tp0i updates (OP1 real*imag, then OP2 imag*real).
 *
 * The expansion site must provide pa0, t0..t3, src0r..src1i, tp0r, tp0i and
 * already-loaded x0r, x0i, x1r, x1i.  Neither pa0 nor x is advanced here.
 */
76 #define ZGEMV_T_4x1()                     \
77     LD_DP4(pa0, 2, t0, t1, t2, t3);       \
78                                           \
79     PCKEVOD_D2_DP(t1, t0, src0r, src0i);  \
80     PCKEVOD_D2_DP(t3, t2, src1r, src1i);  \
81                                           \
82     tp0r += src0r * x0r;                  \
83     tp0r += src1r * x1r;                  \
84     tp0r OP0 src0i * x0i;                 \
85     tp0r OP0 src1i * x1i;                 \
86                                           \
87     tp0i OP1 src0r * x0i;                 \
88     tp0i OP1 src1r * x1i;                 \
89     tp0i OP2 src0i * x0r;                 \
90     tp0i OP2 src1i * x1r;                 \
91
/*
 * ZGEMV_T_2x1: accumulate one 2-element slice of a column into tp0r / tp0i.
 *
 * Loads t0 and t1 from pa0 with LD_DP2, splits them into src0r / src0i with
 * PCKEVOD_D2_DP (helpers from macros_msa.h; presumably real = even lanes,
 * imaginary = odd lanes - confirm there), then applies:
 *     tp0r +=  src0r * x0r
 *     tp0r OP0 src0i * x0i
 *     tp0i OP1 src0r * x0i
 *     tp0i OP2 src0i * x0r
 *
 * The expansion site must provide pa0, t0, t1, src0r, src0i, tp0r, tp0i and
 * already-loaded x0r / x0i.  Neither pa0 nor x is advanced here.
 */
92 #define ZGEMV_T_2x1()                     \
93     LD_DP2(pa0, 2, t0, t1);               \
94                                           \
95     PCKEVOD_D2_DP(t1, t0, src0r, src0i);  \
96                                           \
97     tp0r += src0r * x0r;                  \
98     tp0r OP0 src0i * x0i;                 \
99                                           \
100     tp0i OP1 src0r * x0i;                 \
101     tp0i OP2 src0i * x0r;                 \
102
/*
 * ZGEMV_T_1x1: scalar accumulation of a single complex product.
 *
 * Treats pa0[0] / pa0[1] and x[0] / x[1] as the real / imaginary parts of one
 * element each (the index "0 * inc_x2" is always 0) and updates the scalar
 * accumulators:
 *     temp0r +=  pa0[0] * x[0]
 *     temp0r OP0 pa0[1] * x[1]
 *     temp0i OP1 pa0[0] * x[1]
 *     temp0i OP2 pa0[1] * x[0]
 *
 * Neither pa0 nor x is advanced here.
 */
103 #define ZGEMV_T_1x1()                       \
104     temp0r  += pa0[0] * x[0 * inc_x2];      \
105     temp0r OP0 pa0[1] * x[0 * inc_x2 + 1];  \
106                                             \
107     temp0i OP1 pa0[0] * x[0 * inc_x2 + 1];  \
108     temp0i OP2 pa0[1] * x[0 * inc_x2];      \
109
/*
 * ZSCALE_STORE_Y1_GP: add the alpha-scaled accumulator to one element of y.
 *
 * Reads y[0] / y[1] into res0r / res0i (the index "0 * inc_y2" is always 0),
 * applies
 *     res0r +=  alphar * temp0r
 *     res0r OP0 alphai * temp0i
 *     res0i OP1 alphar * temp0i
 *     res0i OP2 alphai * temp0r
 * and writes res0r / res0i back to y[0] / y[1].
 *
 * The same OP0 / OP1 / OP2 operators as in the accumulation macros are used
 * for the scaling step.  y is not advanced here.
 */
110 #define ZSCALE_STORE_Y1_GP()    \
111     res0r = y[0 * inc_y2];      \
112     res0i = y[0 * inc_y2 + 1];  \
113                                 \
114     res0r  += alphar * temp0r;  \
115     res0r OP0 alphai * temp0i;  \
116                                 \
117     res0i OP1 alphar * temp0i;  \
118     res0i OP2 alphai * temp0r;  \
119                                 \
120     y[0 * inc_y2] = res0r;      \
121     y[0 * inc_y2 + 1] = res0i;  \
122
/*
 * ZLOAD_X8_VECTOR: load x with vector loads and split it for ZGEMV_T_8x1.
 *
 * Loads x0..x3 from x and x4..x7 from x + 8 with LD_DP4, then splits each
 * vector pair into x<k>r / x<k>i (k = 0..3) with PCKEVOD_D2_DP.  inc_x2 is not
 * used, so the addresses are fixed offsets from x; presumably this variant is
 * meant for unit-stride x - confirm at the call site.
 */
123 #define ZLOAD_X8_VECTOR()             \
124     LD_DP4(x, 2, x0, x1, x2, x3);     \
125     LD_DP4(x + 8, 2, x4, x5, x6, x7); \
126                                       \
127     PCKEVOD_D2_DP(x1, x0, x0r, x0i);  \
128     PCKEVOD_D2_DP(x3, x2, x1r, x1i);  \
129     PCKEVOD_D2_DP(x5, x4, x2r, x2i);  \
130     PCKEVOD_D2_DP(x7, x6, x3r, x3i);  \
131
/*
 * ZLOAD_X4_VECTOR: load x with vector loads and split it for ZGEMV_T_4x1.
 *
 * Loads x0..x3 from x with LD_DP4 and splits them into x0r/x0i and x1r/x1i
 * with PCKEVOD_D2_DP.  inc_x2 is not used; presumably this variant is meant
 * for unit-stride x - confirm at the call site.
 */
132 #define ZLOAD_X4_VECTOR()             \
133     LD_DP4(x, 2, x0, x1, x2, x3);     \
134     PCKEVOD_D2_DP(x1, x0, x0r, x0i);  \
135     PCKEVOD_D2_DP(x3, x2, x1r, x1i);  \
136
/*
 * ZLOAD_X2_VECTOR: load x with vector loads and split it for ZGEMV_T_2x1.
 *
 * Loads x0 and x1 from x with LD_DP2 and splits them into x0r / x0i with
 * PCKEVOD_D2_DP.  inc_x2 is not used; presumably this variant is meant for
 * unit-stride x - confirm at the call site.
 */
137 #define ZLOAD_X2_VECTOR()             \
138     LD_DP2(x, 2, x0, x1);             \
139     PCKEVOD_D2_DP(x1, x0, x0r, x0i);  \
140
/*
 * ZLOAD_X8_GP: gather eight strided elements of x into x0r..x3r / x0i..x3i.
 *
 * Element k is read from x + k * inc_x2 (real part) and x + k * inc_x2 + 1
 * (imaginary part).  Each destination vector is assembled with two
 * __msa_insert_d calls: lane 0 is written into a copy of tp0r, lane 1 into
 * the result of that, so both lanes are replaced.  tp0r serves only as the
 * starting operand and is not itself assigned.
 *
 * Resulting layout: x<j>r = { re(2j), re(2j+1) }, x<j>i = { im(2j), im(2j+1) }
 * for j = 0..3.
 *
 * NOTE(review): each value is fetched by dereferencing the address through a
 * "long long *" cast, i.e. the 64-bit pattern is moved as an integer; this
 * relies on the compiler tolerating that type pun - confirm build flags.
 */
141 #define ZLOAD_X8_GP()                                                                      \
142     x0r = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 0 * inc_x2)));      \
143     x0r = (v2f64) __msa_insert_d((v2i64) x0r,  1, *((long long *) (x + 1 * inc_x2)));      \
144     x1r = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 2 * inc_x2)));      \
145     x1r = (v2f64) __msa_insert_d((v2i64) x1r,  1, *((long long *) (x + 3 * inc_x2)));      \
146     x2r = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 4 * inc_x2)));      \
147     x2r = (v2f64) __msa_insert_d((v2i64) x2r,  1, *((long long *) (x + 5 * inc_x2)));      \
148     x3r = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 6 * inc_x2)));      \
149     x3r = (v2f64) __msa_insert_d((v2i64) x3r,  1, *((long long *) (x + 7 * inc_x2)));      \
150     x0i = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 0 * inc_x2 + 1)));  \
151     x0i = (v2f64) __msa_insert_d((v2i64) x0i,  1, *((long long *) (x + 1 * inc_x2 + 1)));  \
152     x1i = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 2 * inc_x2 + 1)));  \
153     x1i = (v2f64) __msa_insert_d((v2i64) x1i,  1, *((long long *) (x + 3 * inc_x2 + 1)));  \
154     x2i = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 4 * inc_x2 + 1)));  \
155     x2i = (v2f64) __msa_insert_d((v2i64) x2i,  1, *((long long *) (x + 5 * inc_x2 + 1)));  \
156     x3i = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 6 * inc_x2 + 1)));  \
157     x3i = (v2f64) __msa_insert_d((v2i64) x3i,  1, *((long long *) (x + 7 * inc_x2 + 1)));  \
158
/*
 * ZLOAD_X4_GP: gather four strided elements of x into x0r, x1r, x0i, x1i.
 *
 * Element k is read from x + k * inc_x2 (real part) and x + k * inc_x2 + 1
 * (imaginary part).  Each destination vector is assembled with two
 * __msa_insert_d calls (lane 0 into a copy of tp0r, then lane 1), so both
 * lanes are replaced and tp0r itself is not assigned.
 *
 * Resulting layout: x<j>r = { re(2j), re(2j+1) }, x<j>i = { im(2j), im(2j+1) }
 * for j = 0..1.
 *
 * NOTE(review): values are fetched through a "long long *" cast of the
 * address (integer move of the 64-bit pattern) - confirm this type pun is
 * acceptable under the project's build flags.
 */
159 #define ZLOAD_X4_GP()                                                                      \
160     x0r = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 0 * inc_x2)));      \
161     x0r = (v2f64) __msa_insert_d((v2i64) x0r,  1, *((long long *) (x + 1 * inc_x2)));      \
162     x1r = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 2 * inc_x2)));      \
163     x1r = (v2f64) __msa_insert_d((v2i64) x1r,  1, *((long long *) (x + 3 * inc_x2)));      \
164     x0i = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 0 * inc_x2 + 1)));  \
165     x0i = (v2f64) __msa_insert_d((v2i64) x0i,  1, *((long long *) (x + 1 * inc_x2 + 1)));  \
166     x1i = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 2 * inc_x2 + 1)));  \
167     x1i = (v2f64) __msa_insert_d((v2i64) x1i,  1, *((long long *) (x + 3 * inc_x2 + 1)));  \
168
/*
 * ZLOAD_X2_GP: gather two strided elements of x into x0r and x0i.
 *
 * Real parts are read from x + 0 * inc_x2 and x + 1 * inc_x2, imaginary parts
 * from the following slot (+ 1).  Each destination vector is assembled with
 * two __msa_insert_d calls (lane 0 into a copy of tp0r, then lane 1), so both
 * lanes are replaced and tp0r itself is not assigned.
 *
 * Resulting layout: x0r = { re(0), re(1) }, x0i = { im(0), im(1) }.
 *
 * NOTE(review): values are fetched through a "long long *" cast of the
 * address (integer move of the 64-bit pattern) - confirm this type pun is
 * acceptable under the project's build flags.
 */
169 #define ZLOAD_X2_GP()                                                                      \
170     x0r = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 0 * inc_x2)));      \
171     x0r = (v2f64) __msa_insert_d((v2i64) x0r,  1, *((long long *) (x + 1 * inc_x2)));      \
172     x0i = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 0 * inc_x2 + 1)));  \
173     x0i = (v2f64) __msa_insert_d((v2i64) x0i,  1, *((long long *) (x + 1 * inc_x2 + 1)));  \
174
175 #define ZGEMV_T_MSA()                                                   \
176     for (j = n; j--;)                                                   \
177     {                                                                   \
178         tp0r = zero;                                                    \
179         tp0i = zero;                                                    \
180         tp1r = zero;                                                    \
181         tp1i = zero;                                                    \
182         tp2r = zero;                                                    \
183         tp2i = zero;                                                    \
184         tp3r = zero;                                                    \
185         tp3i = zero;                                                    \
186                                                                         \
187         pa0 = A;                                                        \
188         x = srcx_org;                                                   \
189                                                                         \
190         if (m >> 4)                                                     \
191         {                                                               \
192             x0 = LD_DP(x);                                              \
193             x1 = LD_DP(x + 1 * inc_x2);                                 \
194             t0 = LD_DP(pa0);                                            \
195             t1 = LD_DP(pa0 + 2);                                        \
196                                                                         \
197             x4 = LD_DP(x + 4 * inc_x2);                                 \
198             x5 = LD_DP(x + 5 * inc_x2);                                 \
199             t4 = LD_DP(pa0 + 8);                                        \
200             t5 = LD_DP(pa0 + 10);                                       \
201                                                                         \
202             for (i = (m >> 4) - 1; i--;)                                \
203             {                                                           \
204                 pa0_pref = pa0 + pref_offset;                           \
205                                                                         \
206                 PREFETCH(pa0_pref + 36);                                \
207                 PREFETCH(pa0_pref + 44);                                \
208                 PREFETCH(pa0_pref + 48);                                \
209                 PREFETCH(pa0_pref + 52);                                \
210                 PREFETCH(pa0_pref + 56);                                \
211                 PREFETCH(pa0_pref + 60);                                \
212                 PREFETCH(pa0_pref + 64);                                \
213                 PREFETCH(pa0_pref + 72);                                \
214                                                                         \
215                 x0r = (v2f64) __msa_pckev_d((v2i64) x1, (v2i64) x0);    \
216                 x0i = (v2f64) __msa_pckod_d((v2i64) x1, (v2i64) x0);    \
217                 src0r = (v2f64) __msa_pckev_d((v2i64) t1, (v2i64) t0);  \
218                 src0i = (v2f64) __msa_pckod_d((v2i64) t1, (v2i64) t0);  \
219                                                                         \
220                 tp0r += src0r * x0r;                                    \
221                 x2 = LD_DP(x + 2 * inc_x2);                             \
222                 x2r = (v2f64) __msa_pckev_d((v2i64) x5, (v2i64) x4);    \
223                                                                         \
224                 tp0i OP1 src0r * x0i;                                   \
225                 x3 = LD_DP(x + 3 * inc_x2);                             \
226                 x2i = (v2f64) __msa_pckod_d((v2i64) x5, (v2i64) x4);    \
227                                                                         \
228                 tp1r OP0 src0i * x0i;                                   \
229                 t2 = LD_DP(pa0 + 4);                                    \
230                 src2r = (v2f64) __msa_pckev_d((v2i64) t5, (v2i64) t4);  \
231                                                                         \
232                 tp1i OP2 src0i * x0r;                                   \
233                 t3 = LD_DP(pa0 + 6);                                    \
234                 src2i = (v2f64) __msa_pckod_d((v2i64) t5, (v2i64) t4);  \
235                                                                         \
236                 tp2r += src2r * x2r;                                    \
237                 x6 = LD_DP(x + 6 * inc_x2);                             \
238                                                                         \
239                 tp2i OP1 src2r * x2i;                                   \
240                 x7 = LD_DP(x + 7 * inc_x2);                             \
241                                                                         \
242                 tp3r OP0 src2i * x2i;                                   \
243                 t6 = LD_DP(pa0 + 12);                                   \
244                                                                         \
245                 tp3i OP2 src2i * x2r;                                   \
246                 t7 = LD_DP(pa0 + 14);                                   \
247                                                                         \
248                 x1r = (v2f64) __msa_pckev_d((v2i64) x3, (v2i64) x2);    \
249                 x1i = (v2f64) __msa_pckod_d((v2i64) x3, (v2i64) x2);    \
250                 src1r = (v2f64) __msa_pckev_d((v2i64) t3, (v2i64) t2);  \
251                 src1i = (v2f64) __msa_pckod_d((v2i64) t3, (v2i64) t2);  \
252                                                                         \
253                 tp0r += src1r * x1r;                                    \
254                 x0 = LD_DP(x +  8 * inc_x2);                            \
255                 x3r = (v2f64) __msa_pckev_d((v2i64) x7, (v2i64) x6);    \
256                                                                         \
257                 tp0i OP1 src1r * x1i;                                   \
258                 x1 = LD_DP(x +  9 * inc_x2);                            \
259                 x3i = (v2f64) __msa_pckod_d((v2i64) x7, (v2i64) x6);    \
260                                                                         \
261                 tp1r OP0 src1i * x1i;                                   \
262                 t0 = LD_DP(pa0 + 16);                                   \
263                 src3r = (v2f64) __msa_pckev_d((v2i64) t7, (v2i64) t6);  \
264                                                                         \
265                 tp1i OP2 src1i * x1r;                                   \
266                 t1 = LD_DP(pa0 + 18);                                   \
267                 src3i = (v2f64) __msa_pckod_d((v2i64) t7, (v2i64) t6);  \
268                                                                         \
269                 tp2r += src3r * x3r;                                    \
270                 x4 = LD_DP(x + 12 * inc_x2);                            \
271                                                                         \
272                 tp2i OP1 src3r * x3i;                                   \
273                 x5 = LD_DP(x + 13 * inc_x2);                            \
274                                                                         \
275                 tp3r OP0 src3i * x3i;                                   \
276                 t4 = LD_DP(pa0 + 24);                                   \
277                                                                         \
278                 tp3i OP2 src3i * x3r;                                   \
279                 t5 = LD_DP(pa0 + 26);                                   \
280                                                                         \
281                 x0r = (v2f64) __msa_pckev_d((v2i64) x1, (v2i64) x0);    \
282                 x0i = (v2f64) __msa_pckod_d((v2i64) x1, (v2i64) x0);    \
283                 src0r = (v2f64) __msa_pckev_d((v2i64) t1, (v2i64) t0);  \
284                 src0i = (v2f64) __msa_pckod_d((v2i64) t1, (v2i64) t0);  \
285                                                                         \
286                 tp0r += src0r * x0r;                                    \
287                 x2 = LD_DP(x + 10 * inc_x2);                            \
288                 x2r = (v2f64) __msa_pckev_d((v2i64) x5, (v2i64) x4);    \
289                                                                         \
290                 tp0i OP1 src0r * x0i;                                   \
291                 x3 = LD_DP(x + 11 * inc_x2);                            \
292                 x2i = (v2f64) __msa_pckod_d((v2i64) x5, (v2i64) x4);    \
293                                                                         \
294                 tp1r OP0 src0i * x0i;                                   \
295                 t2 = LD_DP(pa0 + 20);                                   \
296                 src2r = (v2f64) __msa_pckev_d((v2i64) t5, (v2i64) t4);  \
297                                                                         \
298                 tp1i OP2 src0i * x0r;                                   \
299                 t3 = LD_DP(pa0 + 22);                                   \
300                 src2i = (v2f64) __msa_pckod_d((v2i64) t5, (v2i64) t4);  \
301                                                                         \
302                 tp2r += src2r * x2r;                                    \
303                 x6 = LD_DP(x + 14 * inc_x2);                            \
304                                                                         \
305                 tp2i OP1 src2r * x2i;                                   \
306                 x7 = LD_DP(x + 15 * inc_x2);                            \
307                                                                         \
308                 tp3r OP0 src2i * x2i;                                   \
309                 t6 = LD_DP(pa0 + 28);                                   \
310                                                                         \
311                 tp3i OP2 src2i * x2r;                                   \
312                 t7 = LD_DP(pa0 + 30);                                   \
313                                                                         \
314                 x1r = (v2f64) __msa_pckev_d((v2i64) x3, (v2i64) x2);    \
315                 x1i = (v2f64) __msa_pckod_d((v2i64) x3, (v2i64) x2);    \
316                 src1r = (v2f64) __msa_pckev_d((v2i64) t3, (v2i64) t2);  \
317                 src1i = (v2f64) __msa_pckod_d((v2i64) t3, (v2i64) t2);  \
318                                                                         \
319                 tp0r += src1r * x1r;                                    \
320                 x0 = LD_DP(x + inc_x2 * 16);                            \
321                 x3r = (v2f64) __msa_pckev_d((v2i64) x7, (v2i64) x6);    \
322                                                                         \
323                 tp0i OP1 src1r * x1i;                                   \
324                 x1 = LD_DP(x + inc_x2 * 16 + 1 * inc_x2);               \
325                 x3i = (v2f64) __msa_pckod_d((v2i64) x7, (v2i64) x6);    \
326                                                                         \
327                 tp1r OP0 src1i * x1i;                                   \
328                 t0 = LD_DP(pa0 + 2 * 16);                               \
329                 src3r = (v2f64) __msa_pckev_d((v2i64) t7, (v2i64) t6);  \
330                                                                         \
331                 tp1i OP2 src1i * x1r;                                   \
332                 t1 = LD_DP(pa0 + 2 * 16 + 2);                           \
333                 src3i = (v2f64) __msa_pckod_d((v2i64) t7, (v2i64) t6);  \
334                                                                         \
335                 tp2r += src3r * x3r;                                    \
336                 x4 = LD_DP(x + inc_x2 * 16 + 4 * inc_x2);               \
337                                                                         \
338                 tp2i OP1 src3r * x3i;                                   \
339                 x5 = LD_DP(x + inc_x2 * 16 + 5 * inc_x2);               \
340                                                                         \
341                 tp3r OP0 src3i * x3i;                                   \
342                 t4 = LD_DP(pa0 + 2 * 16 + 8);                           \
343                                                                         \
344                 tp3i OP2 src3i * x3r;                                   \
345                 t5 = LD_DP(pa0 + 2 * 16 + 10);                          \
346                                                                         \
347                 pa0 += 2 * 16;                                          \
348                 x += inc_x2 * 16;                                       \
349             }                                                           \
350                                                                         \
351             x0r = (v2f64) __msa_pckev_d((v2i64) x1, (v2i64) x0);        \
352             x0i = (v2f64) __msa_pckod_d((v2i64) x1, (v2i64) x0);        \
353             src0r = (v2f64) __msa_pckev_d((v2i64) t1, (v2i64) t0);      \
354             src0i = (v2f64) __msa_pckod_d((v2i64) t1, (v2i64) t0);      \
355                                                                         \
356             tp0r += src0r * x0r;                                        \
357             x2 = LD_DP(x + 2 * inc_x2);                                 \
358             x2r = (v2f64) __msa_pckev_d((v2i64) x5, (v2i64) x4);        \
359                                                                         \
360             tp0i OP1 src0r * x0i;                                       \
361             x3 = LD_DP(x + 3 * inc_x2);                                 \
362             x2i = (v2f64) __msa_pckod_d((v2i64) x5, (v2i64) x4);        \
363                                                                         \
364             tp1r OP0 src0i * x0i;                                       \
365             t2 = LD_DP(pa0 + 4);                                        \
366             src2r = (v2f64) __msa_pckev_d((v2i64) t5, (v2i64) t4);      \
367                                                                         \
368             tp1i OP2 src0i * x0r;                                       \
369             t3 = LD_DP(pa0 + 6);                                        \
370             src2i = (v2f64) __msa_pckod_d((v2i64) t5, (v2i64) t4);      \
371                                                                         \
372             tp2r += src2r * x2r;                                        \
373             x6 = LD_DP(x + 6 * inc_x2);                                 \
374                                                                         \
375             tp2i OP1 src2r * x2i;                                       \
376             x7 = LD_DP(x + 7 * inc_x2);                                 \
377                                                                         \
378             tp3r OP0 src2i * x2i;                                       \
379             t6 = LD_DP(pa0 + 12);                                       \
380                                                                         \
381             tp3i OP2 src2i * x2r;                                       \
382             t7 = LD_DP(pa0 + 14);                                       \
383                                                                         \
384             x1r = (v2f64) __msa_pckev_d((v2i64) x3, (v2i64) x2);        \
385             x1i = (v2f64) __msa_pckod_d((v2i64) x3, (v2i64) x2);        \
386             src1r = (v2f64) __msa_pckev_d((v2i64) t3, (v2i64) t2);      \
387             src1i = (v2f64) __msa_pckod_d((v2i64) t3, (v2i64) t2);      \
388                                                                         \
389             tp0r += src1r * x1r;                                        \
390             x0 = LD_DP(x +  8 * inc_x2);                                \
391             x3r = (v2f64) __msa_pckev_d((v2i64) x7, (v2i64) x6);        \
392                                                                         \
393             tp0i OP1 src1r * x1i;                                       \
394             x1 = LD_DP(x +  9 * inc_x2);                                \
395             x3i = (v2f64) __msa_pckod_d((v2i64) x7, (v2i64) x6);        \
396                                                                         \
397             tp1r OP0 src1i * x1i;                                       \
398             t0 = LD_DP(pa0 + 16);                                       \
399             src3r = (v2f64) __msa_pckev_d((v2i64) t7, (v2i64) t6);      \
400                                                                         \
401             tp1i OP2 src1i * x1r;                                       \
402             t1 = LD_DP(pa0 + 18);                                       \
403             src3i = (v2f64) __msa_pckod_d((v2i64) t7, (v2i64) t6);      \
404                                                                         \
405             tp2r += src3r * x3r;                                        \
406             x4 = LD_DP(x + 12 * inc_x2);                                \
407                                                                         \
408             tp2i OP1 src3r * x3i;                                       \
409             x5 = LD_DP(x + 13 * inc_x2);                                \
410                                                                         \
411             tp3r OP0 src3i * x3i;                                       \
412             t4 = LD_DP(pa0 + 24);                                       \
413                                                                         \
414             tp3i OP2 src3i * x3r;                                       \
415             t5 = LD_DP(pa0 + 26);                                       \
416                                                                         \
417             x0r = (v2f64) __msa_pckev_d((v2i64) x1, (v2i64) x0);        \
418             x0i = (v2f64) __msa_pckod_d((v2i64) x1, (v2i64) x0);        \
419             src0r = (v2f64) __msa_pckev_d((v2i64) t1, (v2i64) t0);      \
420             src0i = (v2f64) __msa_pckod_d((v2i64) t1, (v2i64) t0);      \
421                                                                         \
422             tp0r += src0r * x0r;                                        \
423             x2 = LD_DP(x + 10 * inc_x2);                                \
424             x2r = (v2f64) __msa_pckev_d((v2i64) x5, (v2i64) x4);        \
425                                                                         \
426             tp0i OP1 src0r * x0i;                                       \
427             x3 = LD_DP(x + 11 * inc_x2);                                \
428             x2i = (v2f64) __msa_pckod_d((v2i64) x5, (v2i64) x4);        \
429                                                                         \
430             tp1r OP0 src0i * x0i;                                       \
431             t2 = LD_DP(pa0 + 20);                                       \
432             src2r = (v2f64) __msa_pckev_d((v2i64) t5, (v2i64) t4);      \
433                                                                         \
434             tp1i OP2 src0i * x0r;                                       \
435             t3 = LD_DP(pa0 + 22);                                       \
436             src2i = (v2f64) __msa_pckod_d((v2i64) t5, (v2i64) t4);      \
437                                                                         \
438             tp2r += src2r * x2r;                                        \
439             x6 = LD_DP(x + 14 * inc_x2);                                \
440                                                                         \
441             tp2i OP1 src2r * x2i;                                       \
442             x7 = LD_DP(x + 15 * inc_x2);                                \
443                                                                         \
444             tp3r OP0 src2i * x2i;                                       \
445             t6 = LD_DP(pa0 + 28);                                       \
446                                                                         \
447             tp3i OP2 src2i * x2r;                                       \
448             t7 = LD_DP(pa0 + 30);                                       \
449                                                                         \
450             x1r = (v2f64) __msa_pckev_d((v2i64) x3, (v2i64) x2);        \
451             x1i = (v2f64) __msa_pckod_d((v2i64) x3, (v2i64) x2);        \
452             src1r = (v2f64) __msa_pckev_d((v2i64) t3, (v2i64) t2);      \
453             src1i = (v2f64) __msa_pckod_d((v2i64) t3, (v2i64) t2);      \
454                                                                         \
455             tp0r += src1r * x1r;                                        \
456             x3r = (v2f64) __msa_pckev_d((v2i64) x7, (v2i64) x6);        \
457                                                                         \
458             tp0i OP1 src1r * x1i;                                       \
459             x3i = (v2f64) __msa_pckod_d((v2i64) x7, (v2i64) x6);        \
460                                                                         \
461             tp1r OP0 src1i * x1i;                                       \
462             src3r = (v2f64) __msa_pckev_d((v2i64) t7, (v2i64) t6);      \
463                                                                         \
464             tp1i OP2 src1i * x1r;                                       \
465             src3i = (v2f64) __msa_pckod_d((v2i64) t7, (v2i64) t6);      \
466                                                                         \
467             tp2r += src3r * x3r;                                        \
468             tp2i OP1 src3r * x3i;                                       \
469             tp3r OP0 src3i * x3i;                                       \
470             tp3i OP2 src3i * x3r;                                       \
471                                                                         \
472             pa0 += 2 * 16;                                              \
473             x += inc_x2 * 16;                                           \
474                                                                         \
475             tp0r += tp1r + tp2r + tp3r;                                 \
476             tp0i += tp1i + tp2i + tp3i;                                 \
477         }                                                               \
478                                                                         \
479         if (m & 8)                                                      \
480         {                                                               \
481             ZLOAD_X8();                                                 \
482             ZGEMV_T_8x1();                                              \
483                                                                         \
484             pa0 += 2 * 8;                                               \
485             x += inc_x2 * 8;                                            \
486         }                                                               \
487                                                                         \
488         if (m & 4)                                                      \
489         {                                                               \
490             ZLOAD_X4();                                                 \
491             ZGEMV_T_4x1();                                              \
492                                                                         \
493             pa0 += 2 * 4;                                               \
494             x += inc_x2 * 4;                                            \
495         }                                                               \
496                                                                         \
497         if (m & 2)                                                      \
498         {                                                               \
499             ZLOAD_X2();                                                 \
500             ZGEMV_T_2x1();                                              \
501                                                                         \
502             pa0 += 2 * 2;                                               \
503             x += inc_x2 * 2;                                            \
504         }                                                               \
505                                                                         \
506         temp0r = tp0r[0] + tp0r[1];                                     \
507         temp0i = tp0i[0] + tp0i[1];                                     \
508                                                                         \
509         if (m & 1)                                                      \
510         {                                                               \
511             ZGEMV_T_1x1();                                              \
512                                                                         \
513             pa0 += 2;                                                   \
514             x += inc_x2;                                                \
515         }                                                               \
516                                                                         \
517         ZSCALE_STORE_Y1_GP();                                           \
518                                                                         \
519         A += lda2;                                                      \
520         y += inc_y2;                                                    \
521     }                                                                   \
522
523 int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alphar, FLOAT alphai,
524           FLOAT *A, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y,
525           BLASLONG inc_y, FLOAT *buffer)
526 {
527     BLASLONG i, j, pref_offset;
528     BLASLONG inc_x2, inc_y2, lda2;
529     FLOAT *pa0, *pa0_pref;
530     FLOAT *srcx_org = x;
531     FLOAT temp0r, temp0i;
532     FLOAT res0r, res0i;
533     v2f64 zero = {0};
534     v2f64 x0, x1, x2, x3, x0r, x1r, x0i, x1i;
535     v2f64 x4, x5, x6, x7, x2r, x3r, x2i, x3i;
536     v2f64 t0, t1, t2, t3, t4, t5, t6, t7;
537     v2f64 src0r, src1r, src2r, src3r;
538     v2f64 src0i, src1i, src2i, src3i;
539     v2f64 tp0r, tp1r, tp2r, tp3r, tp0i, tp1i, tp2i, tp3i;
540
541     lda2 = 2 * lda;
542
543     inc_x2 = 2 * inc_x;
544     inc_y2 = 2 * inc_y;
545
546     pref_offset = (uintptr_t)A & L1_DATA_LINESIZE;
547     pref_offset = L1_DATA_LINESIZE - pref_offset;
548     pref_offset = pref_offset / sizeof(FLOAT);
549
550     if (2 == inc_x2)
551     {
552         #define ZLOAD_X8  ZLOAD_X8_VECTOR
553         #define ZLOAD_X4  ZLOAD_X4_VECTOR
554         #define ZLOAD_X2  ZLOAD_X2_VECTOR
555
556         ZGEMV_T_MSA();
557
558         #undef ZLOAD_X8
559         #undef ZLOAD_X4
560         #undef ZLOAD_X2
561     }
562     else
563     {
564         #define ZLOAD_X8  ZLOAD_X8_GP
565         #define ZLOAD_X4  ZLOAD_X4_GP
566         #define ZLOAD_X2  ZLOAD_X2_GP
567
568         ZGEMV_T_MSA();
569
570         #undef ZLOAD_X8
571         #undef ZLOAD_X4
572         #undef ZLOAD_X2
573     }
574     return(0);
575 }
576
577 #undef OP0
578 #undef OP1
579 #undef OP2