1 /*******************************************************************************
2 Copyright (c) 2016, The OpenBLAS Project
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *******************************************************************************/
29 #include "macros_msa.h"
31 #define SGEMV_T_8x8() \
33 LD_SP2(pa0 + k, 4, t0, t1); \
34 LD_SP2(pa1 + k, 4, t2, t3); \
35 LD_SP2(pa2 + k, 4, t4, t5); \
36 LD_SP2(pa3 + k, 4, t6, t7); \
37 LD_SP2(pa4 + k, 4, t8, t9); \
38 LD_SP2(pa5 + k, 4, t10, t11); \
39 LD_SP2(pa6 + k, 4, t12, t13); \
40 LD_SP2(pa7 + k, 4, t14, t15); \
67 #define SGEMV_T_8x4() \
69 t0 = LD_SP(pa0 + k); \
70 t2 = LD_SP(pa1 + k); \
71 t4 = LD_SP(pa2 + k); \
72 t6 = LD_SP(pa3 + k); \
73 t8 = LD_SP(pa4 + k); \
74 t10 = LD_SP(pa5 + k); \
75 t12 = LD_SP(pa6 + k); \
76 t14 = LD_SP(pa7 + k); \
88 #define SGEMV_T_4x8() \
90 LD_SP2(pa0 + k, 4, t0, t1); \
91 LD_SP2(pa1 + k, 4, t2, t3); \
92 LD_SP2(pa2 + k, 4, t4, t5); \
93 LD_SP2(pa3 + k, 4, t6, t7); \
108 #define SGEMV_T_4x4() \
110 t0 = LD_SP(pa0 + k); \
111 t2 = LD_SP(pa1 + k); \
112 t4 = LD_SP(pa2 + k); \
113 t6 = LD_SP(pa3 + k); \
121 #define SGEMV_T_2x8() \
123 LD_SP2(pa0 + k, 4, t0, t1); \
124 LD_SP2(pa1 + k, 4, t2, t3); \
133 #define SGEMV_T_2x4() \
135 t0 = LD_SP(pa0 + k); \
136 t2 = LD_SP(pa1 + k); \
142 #define SLOAD_X8_GP() \
143 x0 = (v4f32) __msa_insert_w((v4i32) tp0, 0, *((int *)(x + 0 * inc_x))); \
144 x0 = (v4f32) __msa_insert_w((v4i32) x0, 1, *((int *)(x + 1 * inc_x))); \
145 x0 = (v4f32) __msa_insert_w((v4i32) x0, 2, *((int *)(x + 2 * inc_x))); \
146 x0 = (v4f32) __msa_insert_w((v4i32) x0, 3, *((int *)(x + 3 * inc_x))); \
147 x1 = (v4f32) __msa_insert_w((v4i32) tp0, 0, *((int *)(x + 4 * inc_x))); \
148 x1 = (v4f32) __msa_insert_w((v4i32) x1, 1, *((int *)(x + 5 * inc_x))); \
149 x1 = (v4f32) __msa_insert_w((v4i32) x1, 2, *((int *)(x + 6 * inc_x))); \
150 x1 = (v4f32) __msa_insert_w((v4i32) x1, 3, *((int *)(x + 7 * inc_x))); \
152 #define SLOAD_X4_GP() \
153 x0 = (v4f32) __msa_insert_w((v4i32) tp0, 0, *((int *)(x + 0 * inc_x))); \
154 x0 = (v4f32) __msa_insert_w((v4i32) x0, 1, *((int *)(x + 1 * inc_x))); \
155 x0 = (v4f32) __msa_insert_w((v4i32) x0, 2, *((int *)(x + 2 * inc_x))); \
156 x0 = (v4f32) __msa_insert_w((v4i32) x0, 3, *((int *)(x + 3 * inc_x))); \
/* Unit-stride variant of the 8-element x load: pulls 8 consecutive floats from
   x into vector registers x0 and x1 via LD_SP2 (defined in macros_msa.h;
   presumably two 4-float vector loads at offsets 0 and 4 — confirm there).
   The strided counterpart is SLOAD_X8_GP, which inserts elements one at a
   time honoring inc_x. */
#define SLOAD_X8_VECTOR() LD_SP2(x, 4, x0, x1);
/* Unit-stride variant of the 4-element x load: one 4-float vector load of x
   into x0 (LD_SP from macros_msa.h). The strided counterpart is
   SLOAD_X4_GP, which inserts elements individually honoring inc_x. */
#define SLOAD_X4_VECTOR() x0 = LD_SP(x);
161 #define SGEMV_T_MSA() \
162 for (j = (n >> 3); j--;) \
176 for (i = (m >> 3); i--;) \
194 TRANSPOSE4x4_SP_SP(tp0, tp1, tp2, tp3, \
195 tp0, tp1, tp2, tp3); \
196 TRANSPOSE4x4_SP_SP(tp4, tp5, tp6, tp7, \
197 tp4, tp5, tp6, tp7); \
214 for (i = (m & 3); i--;) \
216 temp0 += pa0[k] * x[0]; \
217 temp1 += pa1[k] * x[0]; \
218 temp2 += pa2[k] * x[0]; \
219 temp3 += pa3[k] * x[0]; \
220 temp4 += pa4[k] * x[0]; \
221 temp5 += pa5[k] * x[0]; \
222 temp6 += pa6[k] * x[0]; \
223 temp7 += pa7[k] * x[0]; \
229 res0 = y[0 * inc_y]; \
230 res1 = y[1 * inc_y]; \
231 res2 = y[2 * inc_y]; \
232 res3 = y[3 * inc_y]; \
233 res4 = y[4 * inc_y]; \
234 res5 = y[5 * inc_y]; \
235 res6 = y[6 * inc_y]; \
236 res7 = y[7 * inc_y]; \
238 res0 += alpha * temp0; \
239 res1 += alpha * temp1; \
240 res2 += alpha * temp2; \
241 res3 += alpha * temp3; \
242 res4 += alpha * temp4; \
243 res5 += alpha * temp5; \
244 res6 += alpha * temp6; \
245 res7 += alpha * temp7; \
247 y[0 * inc_y] = res0; \
248 y[1 * inc_y] = res1; \
249 y[2 * inc_y] = res2; \
250 y[3 * inc_y] = res3; \
251 y[4 * inc_y] = res4; \
252 y[5 * inc_y] = res5; \
253 y[6 * inc_y] = res6; \
254 y[7 * inc_y] = res7; \
278 for (i = (m >> 3); i--;) \
296 TRANSPOSE4x4_SP_SP(tp0, tp1, tp2, tp3, \
297 tp0, tp1, tp2, tp3); \
307 for (i = (m & 3); i--;) \
309 temp0 += pa0[k] * x[0]; \
310 temp1 += pa1[k] * x[0]; \
311 temp2 += pa2[k] * x[0]; \
312 temp3 += pa3[k] * x[0]; \
318 res0 = y[0 * inc_y]; \
319 res1 = y[1 * inc_y]; \
320 res2 = y[2 * inc_y]; \
321 res3 = y[3 * inc_y]; \
323 res0 += alpha * temp0; \
324 res1 += alpha * temp1; \
325 res2 += alpha * temp2; \
326 res3 += alpha * temp3; \
328 y[0 * inc_y] = res0; \
329 y[1 * inc_y] = res1; \
330 y[2 * inc_y] = res2; \
331 y[3 * inc_y] = res3; \
349 for (i = (m >> 3); i--;) \
367 ILVRL_W2_SP(tp1, tp0, tp2, tp3); \
371 temp0 = tp2[0] + tp2[2]; \
372 temp1 = tp2[1] + tp2[3]; \
374 for (i = (m & 3); i--;) \
376 temp0 += pa0[k] * x[0]; \
377 temp1 += pa1[k] * x[0]; \
383 res0 = y[0 * inc_y]; \
384 res1 = y[1 * inc_y]; \
386 res0 += alpha * temp0; \
387 res1 += alpha * temp1; \
389 y[0 * inc_y] = res0; \
390 y[1 * inc_y] = res1; \
407 temp0 += pa0[k] * x[0]; \
413 y[0] += alpha * temp0; \
418 int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A,
419 BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
424 FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7;
425 FLOAT temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
426 FLOAT res0, res1, res2, res3, res4, res5, res6, res7;
428 v4f32 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
429 v4f32 tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
443 #define SLOAD_X8 SLOAD_X8_VECTOR
444 #define SLOAD_X4 SLOAD_X4_VECTOR
453 #define SLOAD_X8 SLOAD_X8_GP
454 #define SLOAD_X4 SLOAD_X4_GP