/* fix build error */
/* [platform/upstream/openblas.git] / kernel / mips / sgemv_n_msa.c */
/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#include "common.h"
#include "macros_msa.h"

31 #define SGEMV_N_8x8()              \
32 {                                  \
33     LD_SP2(pa0 + k, 4, t0, t1);    \
34     LD_SP2(pa1 + k, 4, t2, t3);    \
35     LD_SP2(pa2 + k, 4, t4, t5);    \
36     LD_SP2(pa3 + k, 4, t6, t7);    \
37     LD_SP2(pa4 + k, 4, t8, t9);    \
38     LD_SP2(pa5 + k, 4, t10, t11);  \
39     LD_SP2(pa6 + k, 4, t12, t13);  \
40     LD_SP2(pa7 + k, 4, t14, t15);  \
41                                    \
42     y0 += tp0 * t0;                \
43     y1 += tp0 * t1;                \
44                                    \
45     y0 += tp1 * t2;                \
46     y1 += tp1 * t3;                \
47                                    \
48     y0 += tp2 * t4;                \
49     y1 += tp2 * t5;                \
50                                    \
51     y0 += tp3 * t6;                \
52     y1 += tp3 * t7;                \
53                                    \
54     y0 += tp4 * t8;                \
55     y1 += tp4 * t9;                \
56                                    \
57     y0 += tp5 * t10;               \
58     y1 += tp5 * t11;               \
59                                    \
60     y0 += tp6 * t12;               \
61     y1 += tp6 * t13;               \
62                                    \
63     y0 += tp7 * t14;               \
64     y1 += tp7 * t15;               \
65 }
66
67 #define SGEMV_N_4x8()      \
68 {                          \
69     t0  = LD_SP(pa0 + k);  \
70     t2  = LD_SP(pa1 + k);  \
71     t4  = LD_SP(pa2 + k);  \
72     t6  = LD_SP(pa3 + k);  \
73     t8  = LD_SP(pa4 + k);  \
74     t10 = LD_SP(pa5 + k);  \
75     t12 = LD_SP(pa6 + k);  \
76     t14 = LD_SP(pa7 + k);  \
77                            \
78     y0 += tp0 * t0;        \
79     y0 += tp1 * t2;        \
80     y0 += tp2 * t4;        \
81     y0 += tp3 * t6;        \
82     y0 += tp4 * t8;        \
83     y0 += tp5 * t10;       \
84     y0 += tp6 * t12;       \
85     y0 += tp7 * t14;       \
86 }
87
88 #define SGEMV_N_8x4()            \
89 {                                \
90     LD_SP2(pa0 + k, 4, t0, t1);  \
91     LD_SP2(pa1 + k, 4, t2, t3);  \
92     LD_SP2(pa2 + k, 4, t4, t5);  \
93     LD_SP2(pa3 + k, 4, t6, t7);  \
94                                  \
95     y0 += tp0 * t0;              \
96     y1 += tp0 * t1;              \
97                                  \
98     y0 += tp1 * t2;              \
99     y1 += tp1 * t3;              \
100                                  \
101     y0 += tp2 * t4;              \
102     y1 += tp2 * t5;              \
103                                  \
104     y0 += tp3 * t6;              \
105     y1 += tp3 * t7;              \
106 }
107
108 #define SGEMV_N_4x4()      \
109 {                          \
110     t0  = LD_SP(pa0 + k);  \
111     t2  = LD_SP(pa1 + k);  \
112     t4  = LD_SP(pa2 + k);  \
113     t6  = LD_SP(pa3 + k);  \
114                            \
115     y0 += tp0 * t0;        \
116     y0 += tp1 * t2;        \
117     y0 += tp2 * t4;        \
118     y0 += tp3 * t6;        \
119 }
120
121 #define SGEMV_N_8x2()            \
122 {                                \
123     LD_SP2(pa0 + k, 4, t0, t1);  \
124     LD_SP2(pa1 + k, 4, t2, t3);  \
125                                  \
126     y0 += tp0 * t0;              \
127     y1 += tp0 * t1;              \
128                                  \
129     y0 += tp1 * t2;              \
130     y1 += tp1 * t3;              \
131 }
132
133 #define SGEMV_N_4x2()      \
134 {                          \
135     t0  = LD_SP(pa0 + k);  \
136     t2  = LD_SP(pa1 + k);  \
137                            \
138     y0 += tp0 * t0;        \
139     y0 += tp1 * t2;        \
140 }
141
142 #define SLOAD_X8_SCALE_GP()             \
143     temp0 = alpha * x[0 * inc_x];       \
144     temp1 = alpha * x[1 * inc_x];       \
145     temp2 = alpha * x[2 * inc_x];       \
146     temp3 = alpha * x[3 * inc_x];       \
147     temp4 = alpha * x[4 * inc_x];       \
148     temp5 = alpha * x[5 * inc_x];       \
149     temp6 = alpha * x[6 * inc_x];       \
150     temp7 = alpha * x[7 * inc_x];       \
151                                         \
152     tp0 = COPY_FLOAT_TO_VECTOR(temp0);  \
153     tp1 = COPY_FLOAT_TO_VECTOR(temp1);  \
154     tp2 = COPY_FLOAT_TO_VECTOR(temp2);  \
155     tp3 = COPY_FLOAT_TO_VECTOR(temp3);  \
156     tp4 = COPY_FLOAT_TO_VECTOR(temp4);  \
157     tp5 = COPY_FLOAT_TO_VECTOR(temp5);  \
158     tp6 = COPY_FLOAT_TO_VECTOR(temp6);  \
159     tp7 = COPY_FLOAT_TO_VECTOR(temp7);  \
160
161 #define SLOAD_X4_SCALE_GP()             \
162     temp0 = alpha * x[0 * inc_x];       \
163     temp1 = alpha * x[1 * inc_x];       \
164     temp2 = alpha * x[2 * inc_x];       \
165     temp3 = alpha * x[3 * inc_x];       \
166                                         \
167     tp0 = COPY_FLOAT_TO_VECTOR(temp0);  \
168     tp1 = COPY_FLOAT_TO_VECTOR(temp1);  \
169     tp2 = COPY_FLOAT_TO_VECTOR(temp2);  \
170     tp3 = COPY_FLOAT_TO_VECTOR(temp3);  \
171
172 #define SLOAD_X8_SCALE_VECTOR()            \
173     LD_SP2(x, 4, x0, x1);                  \
174                                            \
175     x0 = x0 * v_alpha;                     \
176     x1 = x1 * v_alpha;                     \
177                                            \
178     SPLATI_W4_SP(x0, tp0, tp1, tp2, tp3);  \
179     SPLATI_W4_SP(x1, tp4, tp5, tp6, tp7);  \
180
181 #define SLOAD_X4_SCALE_VECTOR()            \
182     x0 = LD_SP(x);                         \
183     x0 = x0 * v_alpha;                     \
184     SPLATI_W4_SP(x0, tp0, tp1, tp2, tp3);  \
185
186 #define SLOAD_Y8_GP()                                                        \
187     y0 = (v4f32) __msa_insert_w((v4i32) tp0, 0, *((int *)(y + 0 * inc_y)));  \
188     y0 = (v4f32) __msa_insert_w((v4i32) y0,  1, *((int *)(y + 1 * inc_y)));  \
189     y0 = (v4f32) __msa_insert_w((v4i32) y0,  2, *((int *)(y + 2 * inc_y)));  \
190     y0 = (v4f32) __msa_insert_w((v4i32) y0,  3, *((int *)(y + 3 * inc_y)));  \
191     y1 = (v4f32) __msa_insert_w((v4i32) tp0, 0, *((int *)(y + 4 * inc_y)));  \
192     y1 = (v4f32) __msa_insert_w((v4i32) y1,  1, *((int *)(y + 5 * inc_y)));  \
193     y1 = (v4f32) __msa_insert_w((v4i32) y1,  2, *((int *)(y + 6 * inc_y)));  \
194     y1 = (v4f32) __msa_insert_w((v4i32) y1,  3, *((int *)(y + 7 * inc_y)));  \
195
196 #define SLOAD_Y4_GP()                                                        \
197     y0 = (v4f32) __msa_insert_w((v4i32) tp0, 0, *((int *)(y + 0 * inc_y)));  \
198     y0 = (v4f32) __msa_insert_w((v4i32) y0,  1, *((int *)(y + 1 * inc_y)));  \
199     y0 = (v4f32) __msa_insert_w((v4i32) y0,  2, *((int *)(y + 2 * inc_y)));  \
200     y0 = (v4f32) __msa_insert_w((v4i32) y0,  3, *((int *)(y + 3 * inc_y)));  \
201
202 #define SLOAD_Y8_VECTOR()  LD_SP2(y, 4, y0, y1);
203 #define SLOAD_Y4_VECTOR()  y0 = LD_SP(y);
204
205 #define SSTORE_Y8_GP()                                          \
206     *((int *)(y + 0 * inc_y)) = __msa_copy_s_w((v4i32) y0, 0);  \
207     *((int *)(y + 1 * inc_y)) = __msa_copy_s_w((v4i32) y0, 1);  \
208     *((int *)(y + 2 * inc_y)) = __msa_copy_s_w((v4i32) y0, 2);  \
209     *((int *)(y + 3 * inc_y)) = __msa_copy_s_w((v4i32) y0, 3);  \
210     *((int *)(y + 4 * inc_y)) = __msa_copy_s_w((v4i32) y1, 0);  \
211     *((int *)(y + 5 * inc_y)) = __msa_copy_s_w((v4i32) y1, 1);  \
212     *((int *)(y + 6 * inc_y)) = __msa_copy_s_w((v4i32) y1, 2);  \
213     *((int *)(y + 7 * inc_y)) = __msa_copy_s_w((v4i32) y1, 3);  \
214
215 #define SSTORE_Y4_GP()                                          \
216     *((int *)(y + 0 * inc_y)) = __msa_copy_s_w((v4i32) y0, 0);  \
217     *((int *)(y + 1 * inc_y)) = __msa_copy_s_w((v4i32) y0, 1);  \
218     *((int *)(y + 2 * inc_y)) = __msa_copy_s_w((v4i32) y0, 2);  \
219     *((int *)(y + 3 * inc_y)) = __msa_copy_s_w((v4i32) y0, 3);  \
220
221 #define SSTORE_Y8_VECTOR()  ST_SP2(y0, y1, y, 4);
222 #define SSTORE_Y4_VECTOR()  ST_SP(y0, y);
223
224 #define SGEMV_N_MSA()                       \
225     for (j = (n >> 3); j--;)                \
226     {                                       \
227         SLOAD_X8_SCALE();                   \
228                                             \
229         k = 0;                              \
230         y = y_org;                          \
231                                             \
232         for (i = (m >> 3); i--;)            \
233         {                                   \
234             SLOAD_Y8();                     \
235             SGEMV_N_8x8();                  \
236             SSTORE_Y8();                    \
237                                             \
238             y += 8 * inc_y;                 \
239             k += 8;                         \
240         }                                   \
241                                             \
242         if (m & 4)                          \
243         {                                   \
244             SLOAD_Y4();                     \
245             SGEMV_N_4x8();                  \
246             SSTORE_Y4();                    \
247                                             \
248             y += 4 * inc_y;                 \
249             k += 4;                         \
250         }                                   \
251                                             \
252         if (m & 3)                          \
253         {                                   \
254             temp0 = alpha * x[0 * inc_x];   \
255             temp1 = alpha * x[1 * inc_x];   \
256             temp2 = alpha * x[2 * inc_x];   \
257             temp3 = alpha * x[3 * inc_x];   \
258             temp4 = alpha * x[4 * inc_x];   \
259             temp5 = alpha * x[5 * inc_x];   \
260             temp6 = alpha * x[6 * inc_x];   \
261             temp7 = alpha * x[7 * inc_x];   \
262                                             \
263             for (i = (m & 3); i--;)         \
264             {                               \
265                 temp = y[0];                \
266                 temp += temp0 * pa0[k];     \
267                 temp += temp1 * pa1[k];     \
268                 temp += temp2 * pa2[k];     \
269                 temp += temp3 * pa3[k];     \
270                 temp += temp4 * pa4[k];     \
271                 temp += temp5 * pa5[k];     \
272                 temp += temp6 * pa6[k];     \
273                 temp += temp7 * pa7[k];     \
274                 y[0] = temp;                \
275                                             \
276                 y += inc_y;                 \
277                 k++;                        \
278             }                               \
279         }                                   \
280         pa0 += 8 * lda;                     \
281         pa1 += 8 * lda;                     \
282         pa2 += 8 * lda;                     \
283         pa3 += 8 * lda;                     \
284         pa4 += 8 * lda;                     \
285         pa5 += 8 * lda;                     \
286         pa6 += 8 * lda;                     \
287         pa7 += 8 * lda;                     \
288                                             \
289         x += 8 * inc_x;                     \
290     }                                       \
291                                             \
292     if (n & 4)                              \
293     {                                       \
294         SLOAD_X4_SCALE();                   \
295                                             \
296         k = 0;                              \
297         y = y_org;                          \
298                                             \
299         for (i = (m >> 3); i--;)            \
300         {                                   \
301             SLOAD_Y8();                     \
302             SGEMV_N_8x4();                  \
303             SSTORE_Y8();                    \
304                                             \
305             y += 8 * inc_y;                 \
306             k += 8;                         \
307         }                                   \
308                                             \
309         if (m & 4)                          \
310         {                                   \
311             SLOAD_Y4();                     \
312             SGEMV_N_4x4();                  \
313             SSTORE_Y4();                    \
314                                             \
315             y += 4 * inc_y;                 \
316             k += 4;                         \
317         }                                   \
318                                             \
319         if (m & 3)                          \
320         {                                   \
321             temp0 = alpha * x[0 * inc_x];   \
322             temp1 = alpha * x[1 * inc_x];   \
323             temp2 = alpha * x[2 * inc_x];   \
324             temp3 = alpha * x[3 * inc_x];   \
325                                             \
326             for (i = (m & 3); i--;)         \
327             {                               \
328                 temp = y[0];                \
329                 temp += temp0 * pa0[k];     \
330                 temp += temp1 * pa1[k];     \
331                 temp += temp2 * pa2[k];     \
332                 temp += temp3 * pa3[k];     \
333                 y[0] = temp;                \
334                                             \
335                 y += inc_y;                 \
336                 k++;                        \
337             }                               \
338         }                                   \
339                                             \
340         pa0 += 4 * lda;                     \
341         pa1 += 4 * lda;                     \
342         pa2 += 4 * lda;                     \
343         pa3 += 4 * lda;                     \
344                                             \
345         x += 4 * inc_x;                     \
346     }                                       \
347                                             \
348     if (n & 2)                              \
349     {                                       \
350         temp0 = alpha * x[0 * inc_x];       \
351         temp1 = alpha * x[1 * inc_x];       \
352                                             \
353         tp0 = COPY_FLOAT_TO_VECTOR(temp0);  \
354         tp1 = COPY_FLOAT_TO_VECTOR(temp1);  \
355                                             \
356         k = 0;                              \
357         y = y_org;                          \
358                                             \
359         for (i = (m >> 3); i--;)            \
360         {                                   \
361             SLOAD_Y8();                     \
362             SGEMV_N_8x2();                  \
363             SSTORE_Y8();                    \
364                                             \
365             y += 8 * inc_y;                 \
366             k += 8;                         \
367         }                                   \
368                                             \
369         if (m & 4)                          \
370         {                                   \
371             SLOAD_Y4();                     \
372             SGEMV_N_4x2();                  \
373             SSTORE_Y4();                    \
374                                             \
375             y += 4 * inc_y;                 \
376             k += 4;                         \
377         }                                   \
378                                             \
379         if (m & 3)                          \
380         {                                   \
381             temp0 = alpha * x[0 * inc_x];   \
382             temp1 = alpha * x[1 * inc_x];   \
383                                             \
384             for (i = (m & 3); i--;)         \
385             {                               \
386                 temp = y[0];                \
387                 temp += temp0 * pa0[k];     \
388                 temp += temp1 * pa1[k];     \
389                 y[0] = temp;                \
390                                             \
391                 y += inc_y;                 \
392                 k++;                        \
393             }                               \
394         }                                   \
395                                             \
396         pa0 += 2 * lda;                     \
397         pa1 += 2 * lda;                     \
398                                             \
399         x += 2 * inc_x;                     \
400     }                                       \
401                                             \
402     if (n & 1)                              \
403     {                                       \
404         temp = alpha * x[0];                \
405                                             \
406         k = 0;                              \
407         y = y_org;                          \
408                                             \
409         for (i = m; i--;)                   \
410         {                                   \
411            y[0] += temp * pa0[k];           \
412                                             \
413            y += inc_y;                      \
414            k++;                             \
415         }                                   \
416     }                                       \
417
418 int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A,
419           BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
420           FLOAT *buffer)
421 {
422     BLASLONG i, j, k;
423     FLOAT *y_org = y;
424     FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7;
425     FLOAT temp, temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
426     v4f32 v_alpha, x0, x1, y0, y1;
427     v4f32 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
428     v4f32 tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
429
430     v_alpha = COPY_FLOAT_TO_VECTOR(alpha);
431
432     pa0 = A;
433     pa1 = A + lda;
434     pa2 = A + 2 * lda;
435     pa3 = A + 3 * lda;
436     pa4 = A + 4 * lda;
437     pa5 = A + 5 * lda;
438     pa6 = A + 6 * lda;
439     pa7 = A + 7 * lda;
440
441     if ((1 == inc_x) && (1 == inc_y))
442     {
443         #define SLOAD_X8_SCALE   SLOAD_X8_SCALE_VECTOR
444         #define SLOAD_X4_SCALE   SLOAD_X4_SCALE_VECTOR
445         #define SLOAD_Y8   SLOAD_Y8_VECTOR
446         #define SLOAD_Y4   SLOAD_Y4_VECTOR
447         #define SSTORE_Y8  SSTORE_Y8_VECTOR
448         #define SSTORE_Y4  SSTORE_Y4_VECTOR
449
450         SGEMV_N_MSA();
451
452         #undef SLOAD_X8_SCALE
453         #undef SLOAD_X4_SCALE
454         #undef SLOAD_Y8
455         #undef SLOAD_Y4
456         #undef SSTORE_Y8
457         #undef SSTORE_Y4
458     }
459     else if (1 == inc_y)
460     {
461         #define SLOAD_X8_SCALE   SLOAD_X8_SCALE_GP
462         #define SLOAD_X4_SCALE   SLOAD_X4_SCALE_GP
463         #define SLOAD_Y8   SLOAD_Y8_VECTOR
464         #define SLOAD_Y4   SLOAD_Y4_VECTOR
465         #define SSTORE_Y8  SSTORE_Y8_VECTOR
466         #define SSTORE_Y4  SSTORE_Y4_VECTOR
467
468         SGEMV_N_MSA();
469
470         #undef SLOAD_X8_SCALE
471         #undef SLOAD_X4_SCALE
472         #undef SLOAD_Y8
473         #undef SLOAD_Y4
474         #undef SSTORE_Y8
475         #undef SSTORE_Y4
476     }
477     else if (1 == inc_x)
478     {
479         #define SLOAD_X8_SCALE   SLOAD_X8_SCALE_VECTOR
480         #define SLOAD_X4_SCALE   SLOAD_X4_SCALE_VECTOR
481         #define SLOAD_Y8   SLOAD_Y8_GP
482         #define SLOAD_Y4   SLOAD_Y4_GP
483         #define SSTORE_Y8  SSTORE_Y8_GP
484         #define SSTORE_Y4  SSTORE_Y4_GP
485
486         SGEMV_N_MSA();
487
488         #undef SLOAD_X8_SCALE
489         #undef SLOAD_X4_SCALE
490         #undef SLOAD_Y8
491         #undef SLOAD_Y4
492         #undef SSTORE_Y8
493         #undef SSTORE_Y4
494     }
495     else
496     {
497         #define SLOAD_X8_SCALE   SLOAD_X8_SCALE_GP
498         #define SLOAD_X4_SCALE   SLOAD_X4_SCALE_GP
499         #define SLOAD_Y8   SLOAD_Y8_GP
500         #define SLOAD_Y4   SLOAD_Y4_GP
501         #define SSTORE_Y8  SSTORE_Y8_GP
502         #define SSTORE_Y4  SSTORE_Y4_GP
503
504         SGEMV_N_MSA();
505
506         #undef SLOAD_X8_SCALE
507         #undef SLOAD_X4_SCALE
508         #undef SLOAD_Y8
509         #undef SLOAD_Y4
510         #undef SSTORE_Y8
511         #undef SSTORE_Y4
512     }
513
514     return(0);
515 }