/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#include "common.h"
#include "macros_msa.h"
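
/* Double-precision GEMV kernel for MIPS MSA (128-bit SIMD), non-transposed
   case: y := alpha * A * x + y, where A is an m-by-n column-major matrix
   with leading dimension lda. A scalar sketch of what the vector code
   below computes (illustrative only, not compiled here):

       for (j = 0; j < n; j++)
       {
           FLOAT temp = alpha * x[j * inc_x];
           for (i = 0; i < m; i++)
               y[i * inc_y] += temp * A[i + j * lda];
       }

   The MSA version walks A in blocks of up to eight columns by eight rows,
   two doubles per v2f64 register. */

/* DGEMV_N_8x8: accumulate an 8-row by 8-column block. t0..t31 hold eight
   consecutive rows of each of eight columns (four v2f64 per column, loaded
   from pa0..pa7 at row offset k), tp0..tp7 hold the corresponding scaled x
   values splat across both lanes, and y0..y3 accumulate eight elements
   of y. */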

#define DGEMV_N_8x8()                        \
{                                            \
    LD_DP4(pa0 + k, 2, t0, t1, t2, t3);      \
    LD_DP4(pa1 + k, 2, t4, t5, t6, t7);      \
    LD_DP4(pa2 + k, 2, t8, t9, t10, t11);    \
    LD_DP4(pa3 + k, 2, t12, t13, t14, t15);  \
    LD_DP4(pa4 + k, 2, t16, t17, t18, t19);  \
    LD_DP4(pa5 + k, 2, t20, t21, t22, t23);  \
    LD_DP4(pa6 + k, 2, t24, t25, t26, t27);  \
    LD_DP4(pa7 + k, 2, t28, t29, t30, t31);  \
                                             \
    y0 += tp0 * t0;                          \
    y1 += tp0 * t1;                          \
    y2 += tp0 * t2;                          \
    y3 += tp0 * t3;                          \
                                             \
    y0 += tp1 * t4;                          \
    y1 += tp1 * t5;                          \
    y2 += tp1 * t6;                          \
    y3 += tp1 * t7;                          \
                                             \
    y0 += tp2 * t8;                          \
    y1 += tp2 * t9;                          \
    y2 += tp2 * t10;                         \
    y3 += tp2 * t11;                         \
                                             \
    y0 += tp3 * t12;                         \
    y1 += tp3 * t13;                         \
    y2 += tp3 * t14;                         \
    y3 += tp3 * t15;                         \
                                             \
    y0 += tp4 * t16;                         \
    y1 += tp4 * t17;                         \
    y2 += tp4 * t18;                         \
    y3 += tp4 * t19;                         \
                                             \
    y0 += tp5 * t20;                         \
    y1 += tp5 * t21;                         \
    y2 += tp5 * t22;                         \
    y3 += tp5 * t23;                         \
                                             \
    y0 += tp6 * t24;                         \
    y1 += tp6 * t25;                         \
    y2 += tp6 * t26;                         \
    y3 += tp6 * t27;                         \
                                             \
    y0 += tp7 * t28;                         \
    y1 += tp7 * t29;                         \
    y2 += tp7 * t30;                         \
    y3 += tp7 * t31;                         \
}

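/* DGEMV_N_4x8: as above, for a 4-row by 8-column block (accumulators y0
   and y1 only). */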
#define DGEMV_N_4x8()              \
{                                  \
    LD_DP2(pa0 + k, 2, t0, t1);    \
    LD_DP2(pa1 + k, 2, t4, t5);    \
    LD_DP2(pa2 + k, 2, t8, t9);    \
    LD_DP2(pa3 + k, 2, t12, t13);  \
    LD_DP2(pa4 + k, 2, t16, t17);  \
    LD_DP2(pa5 + k, 2, t20, t21);  \
    LD_DP2(pa6 + k, 2, t24, t25);  \
    LD_DP2(pa7 + k, 2, t28, t29);  \
                                   \
    y0 += tp0 * t0;                \
    y1 += tp0 * t1;                \
                                   \
    y0 += tp1 * t4;                \
    y1 += tp1 * t5;                \
                                   \
    y0 += tp2 * t8;                \
    y1 += tp2 * t9;                \
                                   \
    y0 += tp3 * t12;               \
    y1 += tp3 * t13;               \
                                   \
    y0 += tp4 * t16;               \
    y1 += tp4 * t17;               \
                                   \
    y0 += tp5 * t20;               \
    y1 += tp5 * t21;               \
                                   \
    y0 += tp6 * t24;               \
    y1 += tp6 * t25;               \
                                   \
    y0 += tp7 * t28;               \
    y1 += tp7 * t29;               \
}

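/* DGEMV_N_8x4: 8-row by 4-column block, columns pa0..pa3. */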
#define DGEMV_N_8x4()                        \
{                                            \
    LD_DP4(pa0 + k, 2, t0, t1, t2, t3);      \
    LD_DP4(pa1 + k, 2, t4, t5, t6, t7);      \
    LD_DP4(pa2 + k, 2, t8, t9, t10, t11);    \
    LD_DP4(pa3 + k, 2, t12, t13, t14, t15);  \
                                             \
    y0 += tp0 * t0;                          \
    y1 += tp0 * t1;                          \
    y2 += tp0 * t2;                          \
    y3 += tp0 * t3;                          \
                                             \
    y0 += tp1 * t4;                          \
    y1 += tp1 * t5;                          \
    y2 += tp1 * t6;                          \
    y3 += tp1 * t7;                          \
                                             \
    y0 += tp2 * t8;                          \
    y1 += tp2 * t9;                          \
    y2 += tp2 * t10;                         \
    y3 += tp2 * t11;                         \
                                             \
    y0 += tp3 * t12;                         \
    y1 += tp3 * t13;                         \
    y2 += tp3 * t14;                         \
    y3 += tp3 * t15;                         \
}

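/* DGEMV_N_4x4: 4-row by 4-column block. */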
#define DGEMV_N_4x4()              \
{                                  \
    LD_DP2(pa0 + k, 2, t0, t1);    \
    LD_DP2(pa1 + k, 2, t4, t5);    \
    LD_DP2(pa2 + k, 2, t8, t9);    \
    LD_DP2(pa3 + k, 2, t12, t13);  \
                                   \
    y0 += tp0 * t0;                \
    y1 += tp0 * t1;                \
                                   \
    y0 += tp1 * t4;                \
    y1 += tp1 * t5;                \
                                   \
    y0 += tp2 * t8;                \
    y1 += tp2 * t9;                \
                                   \
    y0 += tp3 * t12;               \
    y1 += tp3 * t13;               \
}

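/* DGEMV_N_8x2: 8-row by 2-column block, columns pa0 and pa1. */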
#define DGEMV_N_8x2()                    \
{                                        \
    LD_DP4(pa0 + k, 2, t0, t1, t2, t3);  \
    LD_DP4(pa1 + k, 2, t4, t5, t6, t7);  \
                                         \
    y0 += tp0 * t0;                      \
    y1 += tp0 * t1;                      \
    y2 += tp0 * t2;                      \
    y3 += tp0 * t3;                      \
                                         \
    y0 += tp1 * t4;                      \
    y1 += tp1 * t5;                      \
    y2 += tp1 * t6;                      \
    y3 += tp1 * t7;                      \
}

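/* DGEMV_N_4x2: 4-row by 2-column block. */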
#define DGEMV_N_4x2()            \
{                                \
    LD_DP2(pa0 + k, 2, t0, t1);  \
    LD_DP2(pa1 + k, 2, t4, t5);  \
                                 \
    y0 += tp0 * t0;              \
    y1 += tp0 * t1;              \
                                 \
    y0 += tp1 * t4;              \
    y1 += tp1 * t5;              \
}

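/* Scale eight (resp. four) elements of x by alpha and broadcast each
   product into a v2f64. The _GP variants read x element by element and
   therefore work for any inc_x. */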
#define DLOAD_X8_SCALE_GP()              \
    temp0 = alpha * x[0 * inc_x];        \
    temp1 = alpha * x[1 * inc_x];        \
    temp2 = alpha * x[2 * inc_x];        \
    temp3 = alpha * x[3 * inc_x];        \
    temp4 = alpha * x[4 * inc_x];        \
    temp5 = alpha * x[5 * inc_x];        \
    temp6 = alpha * x[6 * inc_x];        \
    temp7 = alpha * x[7 * inc_x];        \
                                         \
    tp0 = COPY_DOUBLE_TO_VECTOR(temp0);  \
    tp1 = COPY_DOUBLE_TO_VECTOR(temp1);  \
    tp2 = COPY_DOUBLE_TO_VECTOR(temp2);  \
    tp3 = COPY_DOUBLE_TO_VECTOR(temp3);  \
    tp4 = COPY_DOUBLE_TO_VECTOR(temp4);  \
    tp5 = COPY_DOUBLE_TO_VECTOR(temp5);  \
    tp6 = COPY_DOUBLE_TO_VECTOR(temp6);  \
    tp7 = COPY_DOUBLE_TO_VECTOR(temp7);

#define DLOAD_X4_SCALE_GP()              \
    temp0 = alpha * x[0 * inc_x];        \
    temp1 = alpha * x[1 * inc_x];        \
    temp2 = alpha * x[2 * inc_x];        \
    temp3 = alpha * x[3 * inc_x];        \
                                         \
    tp0 = COPY_DOUBLE_TO_VECTOR(temp0);  \
    tp1 = COPY_DOUBLE_TO_VECTOR(temp1);  \
    tp2 = COPY_DOUBLE_TO_VECTOR(temp2);  \
    tp3 = COPY_DOUBLE_TO_VECTOR(temp3);

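/* Vector variants of the x loads for the unit-stride case (inc_x == 1):
   load contiguously, scale by v_alpha, then splat each lane into its own
   register. */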
#define DLOAD_X8_SCALE_VECTOR()    \
    LD_DP4(x, 2, x0, x1, x2, x3);  \
                                   \
    x0 = x0 * v_alpha;             \
    x1 = x1 * v_alpha;             \
    x2 = x2 * v_alpha;             \
    x3 = x3 * v_alpha;             \
                                   \
    SPLATI_D2_DP(x0, tp0, tp1);    \
    SPLATI_D2_DP(x1, tp2, tp3);    \
    SPLATI_D2_DP(x2, tp4, tp5);    \
    SPLATI_D2_DP(x3, tp6, tp7);

#define DLOAD_X4_SCALE_VECTOR()  \
    LD_DP2(x, 2, x0, x1);        \
                                 \
    x0 = x0 * v_alpha;           \
    x1 = x1 * v_alpha;           \
                                 \
    SPLATI_D2_DP(x0, tp0, tp1);  \
    SPLATI_D2_DP(x1, tp2, tp3);

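/* Gather eight (resp. four) elements of y into vector registers for any
   inc_y; each double is moved bitwise through a long long insert. The
   _VECTOR variants below handle the contiguous case with plain loads. */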
#define DLOAD_Y8_GP()                                                              \
    y0 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 0 * inc_y)));  \
    y0 = (v2f64) __msa_insert_d((v2i64) y0,  1, *((long long *)(y + 1 * inc_y)));  \
    y1 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 2 * inc_y)));  \
    y1 = (v2f64) __msa_insert_d((v2i64) y1,  1, *((long long *)(y + 3 * inc_y)));  \
    y2 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 4 * inc_y)));  \
    y2 = (v2f64) __msa_insert_d((v2i64) y2,  1, *((long long *)(y + 5 * inc_y)));  \
    y3 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 6 * inc_y)));  \
    y3 = (v2f64) __msa_insert_d((v2i64) y3,  1, *((long long *)(y + 7 * inc_y)));

#define DLOAD_Y4_GP()                                                              \
    y0 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 0 * inc_y)));  \
    y0 = (v2f64) __msa_insert_d((v2i64) y0,  1, *((long long *)(y + 1 * inc_y)));  \
    y1 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 2 * inc_y)));  \
    y1 = (v2f64) __msa_insert_d((v2i64) y1,  1, *((long long *)(y + 3 * inc_y)));

#define DLOAD_Y8_VECTOR()  LD_DP4(y, 2, y0, y1, y2, y3);
#define DLOAD_Y4_VECTOR()  LD_DP2(y, 2, y0, y1);

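/* Matching stores: scatter the accumulators back element by element for
   strided y, or store contiguously when inc_y == 1. */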
#define DSTORE_Y8_GP()                                                \
    *((long long *)(y + 0 * inc_y)) = __msa_copy_s_d((v2i64) y0, 0);  \
    *((long long *)(y + 1 * inc_y)) = __msa_copy_s_d((v2i64) y0, 1);  \
    *((long long *)(y + 2 * inc_y)) = __msa_copy_s_d((v2i64) y1, 0);  \
    *((long long *)(y + 3 * inc_y)) = __msa_copy_s_d((v2i64) y1, 1);  \
    *((long long *)(y + 4 * inc_y)) = __msa_copy_s_d((v2i64) y2, 0);  \
    *((long long *)(y + 5 * inc_y)) = __msa_copy_s_d((v2i64) y2, 1);  \
    *((long long *)(y + 6 * inc_y)) = __msa_copy_s_d((v2i64) y3, 0);  \
    *((long long *)(y + 7 * inc_y)) = __msa_copy_s_d((v2i64) y3, 1);

#define DSTORE_Y4_GP()                                                \
    *((long long *)(y + 0 * inc_y)) = __msa_copy_s_d((v2i64) y0, 0);  \
    *((long long *)(y + 1 * inc_y)) = __msa_copy_s_d((v2i64) y0, 1);  \
    *((long long *)(y + 2 * inc_y)) = __msa_copy_s_d((v2i64) y1, 0);  \
    *((long long *)(y + 3 * inc_y)) = __msa_copy_s_d((v2i64) y1, 1);

#define DSTORE_Y8_VECTOR()  ST_DP4(y0, y1, y2, y3, y, 2);
#define DSTORE_Y4_VECTOR()  ST_DP2(y0, y1, y, 2);

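/* Driver for the whole operation: sweep A in panels of eight columns,
   each panel in strips of eight rows, then mop up the 4-row and scalar
   row tails, followed by the 4-, 2- and 1-column tails. The DLOAD_ and
   DSTORE_ names must be bound to either the _GP or the _VECTOR variants
   before this macro is expanded. */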
#define DGEMV_N_MSA()                        \
    for (j = (n >> 3); j--;)                 \
    {                                        \
        DLOAD_X8_SCALE();                    \
                                             \
        k = 0;                               \
        y = y_org;                           \
                                             \
        for (i = (m >> 3); i--;)             \
        {                                    \
            DLOAD_Y8();                      \
            DGEMV_N_8x8();                   \
            DSTORE_Y8();                     \
                                             \
            y += 8 * inc_y;                  \
            k += 8;                          \
        }                                    \
                                             \
        if (m & 4)                           \
        {                                    \
            DLOAD_Y4();                      \
            DGEMV_N_4x8();                   \
            DSTORE_Y4();                     \
                                             \
            y += 4 * inc_y;                  \
            k += 4;                          \
        }                                    \
                                             \
        if (m & 3)                           \
        {                                    \
            temp0 = alpha * x[0 * inc_x];    \
            temp1 = alpha * x[1 * inc_x];    \
            temp2 = alpha * x[2 * inc_x];    \
            temp3 = alpha * x[3 * inc_x];    \
            temp4 = alpha * x[4 * inc_x];    \
            temp5 = alpha * x[5 * inc_x];    \
            temp6 = alpha * x[6 * inc_x];    \
            temp7 = alpha * x[7 * inc_x];    \
                                             \
            for (i = (m & 3); i--;)          \
            {                                \
                temp = y[0];                 \
                temp += temp0 * pa0[k];      \
                temp += temp1 * pa1[k];      \
                temp += temp2 * pa2[k];      \
                temp += temp3 * pa3[k];      \
                temp += temp4 * pa4[k];      \
                temp += temp5 * pa5[k];      \
                temp += temp6 * pa6[k];      \
                temp += temp7 * pa7[k];      \
                y[0] = temp;                 \
                                             \
                y += inc_y;                  \
                k++;                         \
            }                                \
        }                                    \
        pa0 += 8 * lda;                      \
        pa1 += 8 * lda;                      \
        pa2 += 8 * lda;                      \
        pa3 += 8 * lda;                      \
        pa4 += 8 * lda;                      \
        pa5 += 8 * lda;                      \
        pa6 += 8 * lda;                      \
        pa7 += 8 * lda;                      \
                                             \
        x += 8 * inc_x;                      \
    }                                        \
                                             \
    if (n & 4)                               \
    {                                        \
        DLOAD_X4_SCALE();                    \
                                             \
        k = 0;                               \
        y = y_org;                           \
                                             \
        for (i = (m >> 3); i--;)             \
        {                                    \
            DLOAD_Y8();                      \
            DGEMV_N_8x4();                   \
            DSTORE_Y8();                     \
                                             \
            y += 8 * inc_y;                  \
            k += 8;                          \
        }                                    \
                                             \
        if (m & 4)                           \
        {                                    \
            DLOAD_Y4();                      \
            DGEMV_N_4x4();                   \
            DSTORE_Y4();                     \
                                             \
            y += 4 * inc_y;                  \
            k += 4;                          \
        }                                    \
                                             \
        if (m & 3)                           \
        {                                    \
            temp0 = alpha * x[0 * inc_x];    \
            temp1 = alpha * x[1 * inc_x];    \
            temp2 = alpha * x[2 * inc_x];    \
            temp3 = alpha * x[3 * inc_x];    \
                                             \
            for (i = (m & 3); i--;)          \
            {                                \
                temp = y[0];                 \
                temp += temp0 * pa0[k];      \
                temp += temp1 * pa1[k];      \
                temp += temp2 * pa2[k];      \
                temp += temp3 * pa3[k];      \
                y[0] = temp;                 \
                                             \
                y += inc_y;                  \
                k++;                         \
            }                                \
        }                                    \
                                             \
        pa0 += 4 * lda;                      \
        pa1 += 4 * lda;                      \
        pa2 += 4 * lda;                      \
        pa3 += 4 * lda;                      \
                                             \
        x += 4 * inc_x;                      \
    }                                        \
                                             \
    if (n & 2)                               \
    {                                        \
        temp0 = alpha * x[0 * inc_x];        \
        temp1 = alpha * x[1 * inc_x];        \
                                             \
        tp0 = COPY_DOUBLE_TO_VECTOR(temp0);  \
        tp1 = COPY_DOUBLE_TO_VECTOR(temp1);  \
                                             \
        k = 0;                               \
        y = y_org;                           \
                                             \
        for (i = (m >> 3); i--;)             \
        {                                    \
            DLOAD_Y8();                      \
            DGEMV_N_8x2();                   \
            DSTORE_Y8();                     \
                                             \
            y += 8 * inc_y;                  \
            k += 8;                          \
        }                                    \
                                             \
        if (m & 4)                           \
        {                                    \
            DLOAD_Y4();                      \
            DGEMV_N_4x2();                   \
            DSTORE_Y4();                     \
                                             \
            y += 4 * inc_y;                  \
            k += 4;                          \
        }                                    \
                                             \
        if (m & 3)                           \
        {                                    \
            temp0 = alpha * x[0 * inc_x];    \
            temp1 = alpha * x[1 * inc_x];    \
                                             \
            for (i = (m & 3); i--;)          \
            {                                \
                temp = y[0];                 \
                temp += temp0 * pa0[k];      \
                temp += temp1 * pa1[k];      \
                y[0] = temp;                 \
                                             \
                y += inc_y;                  \
                k++;                         \
            }                                \
        }                                    \
                                             \
        pa0 += 2 * lda;                      \
        pa1 += 2 * lda;                      \
                                             \
        x += 2 * inc_x;                      \
    }                                        \
                                             \
    if (n & 1)                               \
    {                                        \
        temp = alpha * x[0];                 \
                                             \
        k = 0;                               \
        y = y_org;                           \
                                             \
        for (i = m; i--;)                    \
        {                                    \
            y[0] += temp * pa0[k];           \
            y += inc_y;                      \
            k++;                             \
        }                                    \
    }

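/* Entry point: computes y := alpha * A * x + y. Each of the four
   inc_x/inc_y combinations binds the load/store helpers to the
   unit-stride (_VECTOR) or strided (_GP) variants and expands the driver
   once, so the inner loops carry no stride checks. */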
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A,
          BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
          FLOAT *buffer)
{
    BLASLONG i, j, k;
    FLOAT *y_org = y;
    FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7;
    FLOAT temp, temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
    v2f64 v_alpha;
    v2f64 x0, x1, x2, x3, y0, y1, y2, y3;
    v2f64 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
    v2f64 t16, t17, t18, t19, t20, t21, t22, t23, t24, t25, t26, t27, t28, t29;
    v2f64 t30, t31, tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;

    v_alpha = COPY_DOUBLE_TO_VECTOR(alpha);

    pa0 = A;
    pa1 = A + lda;
    pa2 = A + 2 * lda;
    pa3 = A + 3 * lda;
    pa4 = A + 4 * lda;
    pa5 = A + 5 * lda;
    pa6 = A + 6 * lda;
    pa7 = A + 7 * lda;

    if ((1 == inc_x) && (1 == inc_y))
    {
        #define DLOAD_X8_SCALE   DLOAD_X8_SCALE_VECTOR
        #define DLOAD_X4_SCALE   DLOAD_X4_SCALE_VECTOR
        #define DLOAD_Y8   DLOAD_Y8_VECTOR
        #define DLOAD_Y4   DLOAD_Y4_VECTOR
        #define DSTORE_Y8  DSTORE_Y8_VECTOR
        #define DSTORE_Y4  DSTORE_Y4_VECTOR

        DGEMV_N_MSA();

        #undef DLOAD_X8_SCALE
        #undef DLOAD_X4_SCALE
        #undef DLOAD_Y8
        #undef DLOAD_Y4
        #undef DSTORE_Y8
        #undef DSTORE_Y4
    }
    else if (1 == inc_y)
    {
        #define DLOAD_X8_SCALE   DLOAD_X8_SCALE_GP
        #define DLOAD_X4_SCALE   DLOAD_X4_SCALE_GP
        #define DLOAD_Y8   DLOAD_Y8_VECTOR
        #define DLOAD_Y4   DLOAD_Y4_VECTOR
        #define DSTORE_Y8  DSTORE_Y8_VECTOR
        #define DSTORE_Y4  DSTORE_Y4_VECTOR

        DGEMV_N_MSA();

        #undef DLOAD_X8_SCALE
        #undef DLOAD_X4_SCALE
        #undef DLOAD_Y8
        #undef DLOAD_Y4
        #undef DSTORE_Y8
        #undef DSTORE_Y4
    }
    else if (1 == inc_x)
    {
        #define DLOAD_X8_SCALE   DLOAD_X8_SCALE_VECTOR
        #define DLOAD_X4_SCALE   DLOAD_X4_SCALE_VECTOR
        #define DLOAD_Y8   DLOAD_Y8_GP
        #define DLOAD_Y4   DLOAD_Y4_GP
        #define DSTORE_Y8  DSTORE_Y8_GP
        #define DSTORE_Y4  DSTORE_Y4_GP

        DGEMV_N_MSA();

        #undef DLOAD_X8_SCALE
        #undef DLOAD_X4_SCALE
        #undef DLOAD_Y8
        #undef DLOAD_Y4
        #undef DSTORE_Y8
        #undef DSTORE_Y4
    }
    else
    {
        #define DLOAD_X8_SCALE   DLOAD_X8_SCALE_GP
        #define DLOAD_X4_SCALE   DLOAD_X4_SCALE_GP
        #define DLOAD_Y8   DLOAD_Y8_GP
        #define DLOAD_Y4   DLOAD_Y4_GP
        #define DSTORE_Y8  DSTORE_Y8_GP
        #define DSTORE_Y4  DSTORE_Y4_GP

        DGEMV_N_MSA();

        #undef DLOAD_X8_SCALE
        #undef DLOAD_X4_SCALE
        #undef DLOAD_Y8
        #undef DLOAD_Y4
        #undef DSTORE_Y8
        #undef DSTORE_Y4
    }

    return(0);
}