/* fix build error */
/* [platform/upstream/openblas.git] / kernel / mips / dgemv_t_msa.c */
/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#include "common.h"
#include "macros_msa.h"

31 #define DGEMV_T_8x8()                        \
32 {                                            \
33     LD_DP4(pa0 + k, 2, t0, t1, t2, t3);      \
34     LD_DP4(pa1 + k, 2, t4, t5, t6, t7);      \
35     LD_DP4(pa2 + k, 2, t8, t9, t10, t11);    \
36     LD_DP4(pa3 + k, 2, t12, t13, t14, t15);  \
37     LD_DP4(pa4 + k, 2, t16, t17, t18, t19);  \
38     LD_DP4(pa5 + k, 2, t20, t21, t22, t23);  \
39     LD_DP4(pa6 + k, 2, t24, t25, t26, t27);  \
40     LD_DP4(pa7 + k, 2, t28, t29, t30, t31);  \
41                                              \
42     tp0 += x0 * t0;                          \
43     tp0 += x1 * t1;                          \
44     tp0 += x2 * t2;                          \
45     tp0 += x3 * t3;                          \
46                                              \
47     tp1 += x0 * t4;                          \
48     tp1 += x1 * t5;                          \
49     tp1 += x2 * t6;                          \
50     tp1 += x3 * t7;                          \
51                                              \
52     tp2 += x0 * t8;                          \
53     tp2 += x1 * t9;                          \
54     tp2 += x2 * t10;                         \
55     tp2 += x3 * t11;                         \
56                                              \
57     tp3 += x0 * t12;                         \
58     tp3 += x1 * t13;                         \
59     tp3 += x2 * t14;                         \
60     tp3 += x3 * t15;                         \
61                                              \
62     tp4 += x0 * t16;                         \
63     tp4 += x1 * t17;                         \
64     tp4 += x2 * t18;                         \
65     tp4 += x3 * t19;                         \
66                                              \
67     tp5 += x0 * t20;                         \
68     tp5 += x1 * t21;                         \
69     tp5 += x2 * t22;                         \
70     tp5 += x3 * t23;                         \
71                                              \
72     tp6 += x0 * t24;                         \
73     tp6 += x1 * t25;                         \
74     tp6 += x2 * t26;                         \
75     tp6 += x3 * t27;                         \
76                                              \
77     tp7 += x0 * t28;                         \
78     tp7 += x1 * t29;                         \
79     tp7 += x2 * t30;                         \
80     tp7 += x3 * t31;                         \
81 }
82
83 #define DGEMV_T_8x4()              \
84 {                                  \
85     LD_DP2(pa0 + k, 2, t0, t1);    \
86     LD_DP2(pa1 + k, 2, t4, t5);    \
87     LD_DP2(pa2 + k, 2, t8, t9);    \
88     LD_DP2(pa3 + k, 2, t12, t13);  \
89     LD_DP2(pa4 + k, 2, t16, t17);  \
90     LD_DP2(pa5 + k, 2, t20, t21);  \
91     LD_DP2(pa6 + k, 2, t24, t25);  \
92     LD_DP2(pa7 + k, 2, t28, t29);  \
93                                    \
94     tp0 += x0 * t0;                \
95     tp0 += x1 * t1;                \
96                                    \
97     tp1 += x0 * t4;                \
98     tp1 += x1 * t5;                \
99                                    \
100     tp2 += x0 * t8;                \
101     tp2 += x1 * t9;                \
102                                    \
103     tp3 += x0 * t12;               \
104     tp3 += x1 * t13;               \
105                                    \
106     tp4 += x0 * t16;               \
107     tp4 += x1 * t17;               \
108                                    \
109     tp5 += x0 * t20;               \
110     tp5 += x1 * t21;               \
111                                    \
112     tp6 += x0 * t24;               \
113     tp6 += x1 * t25;               \
114                                    \
115     tp7 += x0 * t28;               \
116     tp7 += x1 * t29;               \
117 }
118
119 #define DGEMV_T_8x2()      \
120 {                          \
121     t0  = LD_DP(pa0 + k);  \
122     t4  = LD_DP(pa1 + k);  \
123     t8  = LD_DP(pa2 + k);  \
124     t12 = LD_DP(pa3 + k);  \
125     t16 = LD_DP(pa4 + k);  \
126     t20 = LD_DP(pa5 + k);  \
127     t24 = LD_DP(pa6 + k);  \
128     t28 = LD_DP(pa7 + k);  \
129                            \
130     tp0 += x0 * t0;        \
131     tp1 += x0 * t4;        \
132     tp2 += x0 * t8;        \
133     tp3 += x0 * t12;       \
134     tp4 += x0 * t16;       \
135     tp5 += x0 * t20;       \
136     tp6 += x0 * t24;       \
137     tp7 += x0 * t28;       \
138 }
139
140 #define DGEMV_T_4x8()                        \
141 {                                            \
142     LD_DP4(pa0 + k, 2, t0, t1, t2, t3);      \
143     LD_DP4(pa1 + k, 2, t4, t5, t6, t7);      \
144     LD_DP4(pa2 + k, 2, t8, t9, t10, t11);    \
145     LD_DP4(pa3 + k, 2, t12, t13, t14, t15);  \
146                                              \
147     tp0 += x0 * t0;                          \
148     tp0 += x1 * t1;                          \
149     tp0 += x2 * t2;                          \
150     tp0 += x3 * t3;                          \
151                                              \
152     tp1 += x0 * t4;                          \
153     tp1 += x1 * t5;                          \
154     tp1 += x2 * t6;                          \
155     tp1 += x3 * t7;                          \
156                                              \
157     tp2 += x0 * t8;                          \
158     tp2 += x1 * t9;                          \
159     tp2 += x2 * t10;                         \
160     tp2 += x3 * t11;                         \
161                                              \
162     tp3 += x0 * t12;                         \
163     tp3 += x1 * t13;                         \
164     tp3 += x2 * t14;                         \
165     tp3 += x3 * t15;                         \
166 }
167
168 #define DGEMV_T_4x4()              \
169 {                                  \
170     LD_DP2(pa0 + k, 2, t0, t1);    \
171     LD_DP2(pa1 + k, 2, t4, t5);    \
172     LD_DP2(pa2 + k, 2, t8, t9);    \
173     LD_DP2(pa3 + k, 2, t12, t13);  \
174                                    \
175     tp0 += x0 * t0;                \
176     tp0 += x1 * t1;                \
177                                    \
178     tp1 += x0 * t4;                \
179     tp1 += x1 * t5;                \
180                                    \
181     tp2 += x0 * t8;                \
182     tp2 += x1 * t9;                \
183                                    \
184     tp3 += x0 * t12;               \
185     tp3 += x1 * t13;               \
186 }
187
188 #define DGEMV_T_4x2()      \
189 {                          \
190     t0  = LD_DP(pa0 + k);  \
191     t4  = LD_DP(pa1 + k);  \
192     t8  = LD_DP(pa2 + k);  \
193     t12 = LD_DP(pa3 + k);  \
194                            \
195     tp0 += x0 * t0;        \
196     tp1 += x0 * t4;        \
197     tp2 += x0 * t8;        \
198     tp3 += x0 * t12;       \
199 }
200
201 #define DGEMV_T_2x8()                    \
202 {                                        \
203     LD_DP4(pa0 + k, 2, t0, t1, t2, t3);  \
204     LD_DP4(pa1 + k, 2, t4, t5, t6, t7);  \
205                                          \
206     tp0 += x0 * t0;                      \
207     tp0 += x1 * t1;                      \
208     tp0 += x2 * t2;                      \
209     tp0 += x3 * t3;                      \
210                                          \
211     tp1 += x0 * t4;                      \
212     tp1 += x1 * t5;                      \
213     tp1 += x2 * t6;                      \
214     tp1 += x3 * t7;                      \
215 }
216
217 #define DGEMV_T_2x4()            \
218 {                                \
219     LD_DP2(pa0 + k, 2, t0, t1);  \
220     LD_DP2(pa1 + k, 2, t4, t5);  \
221                                  \
222     tp0 += x0 * t0;              \
223     tp0 += x1 * t1;              \
224                                  \
225     tp1 += x0 * t4;              \
226     tp1 += x1 * t5;              \
227 }
228
229 #define DGEMV_T_2x2()     \
230 {                         \
231     t0 = LD_DP(pa0 + k);  \
232     t4 = LD_DP(pa1 + k);  \
233                           \
234     tp0 += x0 * t0;       \
235     tp1 += x0 * t4;       \
236 }
237
238 #define DLOAD_X8_GP()                                                              \
239     x0 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 0 * inc_x)));  \
240     x0 = (v2f64) __msa_insert_d((v2i64) x0,  1, *((long long *)(x + 1 * inc_x)));  \
241     x1 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 2 * inc_x)));  \
242     x1 = (v2f64) __msa_insert_d((v2i64) x1,  1, *((long long *)(x + 3 * inc_x)));  \
243     x2 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 4 * inc_x)));  \
244     x2 = (v2f64) __msa_insert_d((v2i64) x2,  1, *((long long *)(x + 5 * inc_x)));  \
245     x3 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 6 * inc_x)));  \
246     x3 = (v2f64) __msa_insert_d((v2i64) x3,  1, *((long long *)(x + 7 * inc_x)));  \
247
248 #define DLOAD_X4_GP()                                                              \
249     x0 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 0 * inc_x)));  \
250     x0 = (v2f64) __msa_insert_d((v2i64) x0,  1, *((long long *)(x + 1 * inc_x)));  \
251     x1 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 2 * inc_x)));  \
252     x1 = (v2f64) __msa_insert_d((v2i64) x1,  1, *((long long *)(x + 3 * inc_x)));  \
253
254 #define DLOAD_X2_GP()                                                               \
255     x0 = (v2f64) __msa_insert_d((v2i64) tp0,  0, *((long long *)(x + 0 * inc_x)));  \
256     x0 = (v2f64) __msa_insert_d((v2i64) x0, 1, *((long long *)(x + 1 * inc_x)));    \
257
258 #define DLOAD_X8_VECTOR()  LD_DP4(x, 2, x0, x1, x2, x3);
259 #define DLOAD_X4_VECTOR()  LD_DP2(x, 2, x0, x1);
260 #define DLOAD_X2_VECTOR()  x0 = LD_DP(x);
261
262 #define DGEMV_T_MSA()                   \
263     for (j = (n >> 3); j--;)            \
264     {                                   \
265         tp0 = zero;                     \
266         tp1 = zero;                     \
267         tp2 = zero;                     \
268         tp3 = zero;                     \
269         tp4 = zero;                     \
270         tp5 = zero;                     \
271         tp6 = zero;                     \
272         tp7 = zero;                     \
273                                         \
274         k = 0;                          \
275         x = srcx_org;                   \
276                                         \
277         for (i = (m >> 3); i--;)        \
278         {                               \
279             DLOAD_X8();                 \
280             DGEMV_T_8x8();              \
281                                         \
282             x += 8 * inc_x;             \
283             k += 8;                     \
284         }                               \
285                                         \
286         if (m & 4)                      \
287         {                               \
288             DLOAD_X4();                 \
289             DGEMV_T_8x4();              \
290                                         \
291             x += 4 * inc_x;             \
292             k += 4;                     \
293         }                               \
294                                         \
295         if (m & 2)                      \
296         {                               \
297             DLOAD_X2();                 \
298             DGEMV_T_8x2();              \
299                                         \
300             x += 2 * inc_x;             \
301             k += 2;                     \
302         }                               \
303                                         \
304         ILVRL_D2_DP(tp1, tp0, t0, t4);  \
305         ILVRL_D2_DP(tp3, tp2, t1, t5);  \
306         ILVRL_D2_DP(tp5, tp4, t2, t6);  \
307         ILVRL_D2_DP(tp7, tp6, t3, t7);  \
308         ADD2(t0, t4, t1, t5, t0, t1);   \
309         ADD2(t2, t6, t3, t7, t2, t3);   \
310                                         \
311         temp0 = t0[0];                  \
312         temp1 = t0[1];                  \
313         temp2 = t1[0];                  \
314         temp3 = t1[1];                  \
315         temp4 = t2[0];                  \
316         temp5 = t2[1];                  \
317         temp6 = t3[0];                  \
318         temp7 = t3[1];                  \
319                                         \
320         if (m & 1)                      \
321         {                               \
322             temp0 += pa0[k] * x[0];     \
323             temp1 += pa1[k] * x[0];     \
324             temp2 += pa2[k] * x[0];     \
325             temp3 += pa3[k] * x[0];     \
326             temp4 += pa4[k] * x[0];     \
327             temp5 += pa5[k] * x[0];     \
328             temp6 += pa6[k] * x[0];     \
329             temp7 += pa7[k] * x[0];     \
330                                         \
331             x += inc_x;                 \
332             k++;                        \
333         }                               \
334                                         \
335         res0 = y[0 * inc_y];            \
336         res1 = y[1 * inc_y];            \
337         res2 = y[2 * inc_y];            \
338         res3 = y[3 * inc_y];            \
339         res4 = y[4 * inc_y];            \
340         res5 = y[5 * inc_y];            \
341         res6 = y[6 * inc_y];            \
342         res7 = y[7 * inc_y];            \
343                                         \
344         res0 += alpha * temp0;          \
345         res1 += alpha * temp1;          \
346         res2 += alpha * temp2;          \
347         res3 += alpha * temp3;          \
348         res4 += alpha * temp4;          \
349         res5 += alpha * temp5;          \
350         res6 += alpha * temp6;          \
351         res7 += alpha * temp7;          \
352                                         \
353         y[0 * inc_y] = res0;            \
354         y[1 * inc_y] = res1;            \
355         y[2 * inc_y] = res2;            \
356         y[3 * inc_y] = res3;            \
357         y[4 * inc_y] = res4;            \
358         y[5 * inc_y] = res5;            \
359         y[6 * inc_y] = res6;            \
360         y[7 * inc_y] = res7;            \
361                                         \
362         y += 8 * inc_y;                 \
363                                         \
364         pa0 += 8 * lda;                 \
365         pa1 += 8 * lda;                 \
366         pa2 += 8 * lda;                 \
367         pa3 += 8 * lda;                 \
368         pa4 += 8 * lda;                 \
369         pa5 += 8 * lda;                 \
370         pa6 += 8 * lda;                 \
371         pa7 += 8 * lda;                 \
372     }                                   \
373                                         \
374     if (n & 4)                          \
375     {                                   \
376         tp0 = zero;                     \
377         tp1 = zero;                     \
378         tp2 = zero;                     \
379         tp3 = zero;                     \
380                                         \
381         k = 0;                          \
382         x = srcx_org;                   \
383                                         \
384         for (i = (m >> 3); i--;)        \
385         {                               \
386             DLOAD_X8();                 \
387             DGEMV_T_4x8();              \
388                                         \
389             x += 8 * inc_x;             \
390             k += 8;                     \
391         }                               \
392                                         \
393         if (m & 4)                      \
394         {                               \
395             DLOAD_X4();                 \
396             DGEMV_T_4x4();              \
397                                         \
398             x += 4 * inc_x;             \
399             k += 4;                     \
400         }                               \
401                                         \
402         if (m & 2)                      \
403         {                               \
404             DLOAD_X2();                 \
405             DGEMV_T_4x2();              \
406                                         \
407             x += 2 * inc_x;             \
408             k += 2;                     \
409         }                               \
410                                         \
411         ILVRL_D2_DP(tp1, tp0, t0, t4);  \
412         ILVRL_D2_DP(tp3, tp2, t1, t5);  \
413         ADD2(t0, t4, t1, t5, t0, t1);   \
414                                         \
415         temp0 = t0[0];                  \
416         temp1 = t0[1];                  \
417         temp2 = t1[0];                  \
418         temp3 = t1[1];                  \
419                                         \
420         if (m & 1)                      \
421         {                               \
422             temp0 += pa0[k] * x[0];     \
423             temp1 += pa1[k] * x[0];     \
424             temp2 += pa2[k] * x[0];     \
425             temp3 += pa3[k] * x[0];     \
426                                         \
427             x += inc_x;                 \
428             k++;                        \
429         }                               \
430                                         \
431         res0 = y[0 * inc_y];            \
432         res1 = y[1 * inc_y];            \
433         res2 = y[2 * inc_y];            \
434         res3 = y[3 * inc_y];            \
435                                         \
436         res0 += alpha * temp0;          \
437         res1 += alpha * temp1;          \
438         res2 += alpha * temp2;          \
439         res3 += alpha * temp3;          \
440                                         \
441         y[0 * inc_y] = res0;            \
442         y[1 * inc_y] = res1;            \
443         y[2 * inc_y] = res2;            \
444         y[3 * inc_y] = res3;            \
445                                         \
446         y += 4 * inc_y;                 \
447                                         \
448         pa0 += 4 * lda;                 \
449         pa1 += 4 * lda;                 \
450         pa2 += 4 * lda;                 \
451         pa3 += 4 * lda;                 \
452     }                                   \
453                                         \
454     if (n & 2)                          \
455     {                                   \
456         tp0 = zero;                     \
457         tp1 = zero;                     \
458                                         \
459         k = 0;                          \
460         x = srcx_org;                   \
461                                         \
462         for (i = (m >> 3); i--;)        \
463         {                               \
464             DLOAD_X8();                 \
465             DGEMV_T_2x8();              \
466                                         \
467             x += 8 * inc_x;             \
468             k += 8;                     \
469         }                               \
470                                         \
471         if (m & 4)                      \
472         {                               \
473             DLOAD_X4();                 \
474             DGEMV_T_2x4();              \
475                                         \
476             x += 4 * inc_x;             \
477             k += 4;                     \
478         }                               \
479                                         \
480         if (m & 2)                      \
481         {                               \
482             DLOAD_X2();                 \
483             DGEMV_T_2x2();              \
484                                         \
485             x += 2 * inc_x;             \
486             k += 2;                     \
487         }                               \
488                                         \
489         ILVRL_D2_DP(tp1, tp0, t0, t4);  \
490                                         \
491         t0 += t4;                       \
492                                         \
493         temp0 = t0[0];                  \
494         temp1 = t0[1];                  \
495                                         \
496         if (m & 1)                      \
497         {                               \
498             temp0 += pa0[k] * x[0];     \
499             temp1 += pa1[k] * x[0];     \
500             x += inc_x;                 \
501             k++;                        \
502         }                               \
503                                         \
504         res0 = y[0 * inc_y];            \
505         res1 = y[1 * inc_y];            \
506                                         \
507         res0 += alpha * temp0;          \
508         res1 += alpha * temp1;          \
509                                         \
510         y[0 * inc_y] = res0;            \
511         y[1 * inc_y] = res1;            \
512                                         \
513         y += 2 * inc_y;                 \
514                                         \
515         pa0 += 2 * lda;                 \
516         pa1 += 2 * lda;                 \
517     }                                   \
518                                         \
519     if (n & 1)                          \
520     {                                   \
521         temp0 = 0.0;                    \
522                                         \
523         k = 0;                          \
524         x = srcx_org;                   \
525                                         \
526         for (i = m; i--;)               \
527         {                               \
528             temp0 += pa0[k] * x[0];     \
529             x += inc_x;                 \
530             k++;                        \
531         }                               \
532                                         \
533         y[0] += alpha * temp0;          \
534         y += inc_y;                     \
535         pa0 += lda;                     \
536     }
537
538
539 int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A,
540           BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
541           FLOAT *buffer)
542 {
543     BLASLONG i, j, k;
544     FLOAT *srcx_org = x;
545     FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7;
546     FLOAT temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
547     FLOAT res0, res1, res2, res3, res4, res5, res6, res7;
548     v2f64 x0, x1, x2, x3;
549     v2f64 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
550     v2f64 t16, t17, t18, t19, t20, t21, t22, t23, t24, t25, t26, t27, t28, t29;
551     v2f64 t30, t31, tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
552     v2f64 zero = {0};
553
554     pa0 = A + 0 * lda;
555     pa1 = A + 1 * lda;
556     pa2 = A + 2 * lda;
557     pa3 = A + 3 * lda;
558     pa4 = A + 4 * lda;
559     pa5 = A + 5 * lda;
560     pa6 = A + 6 * lda;
561     pa7 = A + 7 * lda;
562
563     if (1 == inc_x)
564     {
565         #define DLOAD_X8  DLOAD_X8_VECTOR
566         #define DLOAD_X4  DLOAD_X4_VECTOR
567         #define DLOAD_X2  DLOAD_X2_VECTOR
568
569         DGEMV_T_MSA();
570
571         #undef DLOAD_X8
572         #undef DLOAD_X4
573         #undef DLOAD_X2
574     }
575     else
576     {
577         #define DLOAD_X8  DLOAD_X8_GP
578         #define DLOAD_X4  DLOAD_X4_GP
579         #define DLOAD_X2  DLOAD_X2_GP
580
581         DGEMV_T_MSA();
582
583         #undef DLOAD_X8
584         #undef DLOAD_X4
585         #undef DLOAD_X2
586     }
587
588     return(0);
589 }