1 /*******************************************************************************
2 Copyright (c) 2016, The OpenBLAS Project
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *******************************************************************************/
29 #include "macros_msa.h"
35 #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
/*
 * CGEMV_T_8x4: one vectorized inner-product step for the transposed complex
 * GEMV, processing 8 complex elements of x (already split into real/imag
 * v4f32 pairs x0r/x0i and x1r/x1i) against 4 matrix columns pa0..pa3 at
 * element offset k.  Results accumulate into the per-column vector
 * accumulators tp0{r,i}..tp3{r,i}.
 *
 * PCKEVOD_W2_SP de-interleaves packed complex data into separate real and
 * imaginary vectors (helper from macros_msa.h — presumably pack-even /
 * pack-odd words).  OP0/OP1/OP2 are '+='/'-=' tokens selected by the
 * CONJ/XCONJ configuration (see the #if chain earlier in this file), so one
 * body serves all four conjugation variants.
 */
#define CGEMV_T_8x4()                            \
    LD_SP4(pa0 + k, 4, t0, t1, t2, t3);          \
    LD_SP4(pa1 + k, 4, t4, t5, t6, t7);          \
    LD_SP4(pa2 + k, 4, t8, t9, t10, t11);        \
    LD_SP4(pa3 + k, 4, t12, t13, t14, t15);      \
    PCKEVOD_W2_SP(t1, t0, src0r, src0i);         \
    PCKEVOD_W2_SP(t3, t2, src1r, src1i);         \
    PCKEVOD_W2_SP(t5, t4, src2r, src2i);         \
    PCKEVOD_W2_SP(t7, t6, src3r, src3i);         \
    PCKEVOD_W2_SP(t9, t8, src4r, src4i);         \
    PCKEVOD_W2_SP(t11, t10, src5r, src5i);       \
    PCKEVOD_W2_SP(t13, t12, src6r, src6i);       \
    PCKEVOD_W2_SP(t15, t14, src7r, src7i);       \
    /* real parts: re*re combined with im*im per conjugation variant */ \
    tp0r += src0r * x0r;                         \
    tp0r += src1r * x1r;                         \
    tp0r OP0 src0i * x0i;                        \
    tp0r OP0 src1i * x1i;                        \
    tp1r += src2r * x0r;                         \
    tp1r += src3r * x1r;                         \
    tp1r OP0 src2i * x0i;                        \
    tp1r OP0 src3i * x1i;                        \
    tp2r += src4r * x0r;                         \
    tp2r += src5r * x1r;                         \
    tp2r OP0 src4i * x0i;                        \
    tp2r OP0 src5i * x1i;                        \
    tp3r += src6r * x0r;                         \
    tp3r += src7r * x1r;                         \
    tp3r OP0 src6i * x0i;                        \
    tp3r OP0 src7i * x1i;                        \
    /* imaginary parts: re*im and im*re cross terms */ \
    tp0i OP1 src0r * x0i;                        \
    tp0i OP1 src1r * x1i;                        \
    tp0i OP2 src0i * x0r;                        \
    tp0i OP2 src1i * x1r;                        \
    tp1i OP1 src2r * x0i;                        \
    tp1i OP1 src3r * x1i;                        \
    tp1i OP2 src2i * x0r;                        \
    tp1i OP2 src3i * x1r;                        \
    tp2i OP1 src4r * x0i;                        \
    tp2i OP1 src5r * x1i;                        \
    tp2i OP2 src4i * x0r;                        \
    tp2i OP2 src5i * x1r;                        \
    tp3i OP1 src6r * x0i;                        \
    tp3i OP1 src7r * x1i;                        \
    tp3i OP2 src6i * x0r;                        \
    tp3i OP2 src7i * x1r;                        \

/*
 * CGEMV_T_8x2: same 8-complex-element step as CGEMV_T_8x4 but for only 2
 * matrix columns (pa0, pa1), accumulating into tp0{r,i} and tp1{r,i}.
 * See CGEMV_T_8x4 for the role of PCKEVOD_W2_SP and the OP0/OP1/OP2 tokens.
 */
#define CGEMV_T_8x2()                            \
    LD_SP4(pa0 + k, 4, t0, t1, t2, t3);          \
    LD_SP4(pa1 + k, 4, t4, t5, t6, t7);          \
    PCKEVOD_W2_SP(t1, t0, src0r, src0i);         \
    PCKEVOD_W2_SP(t3, t2, src1r, src1i);         \
    PCKEVOD_W2_SP(t5, t4, src2r, src2i);         \
    PCKEVOD_W2_SP(t7, t6, src3r, src3i);         \
    tp0r += src0r * x0r;                         \
    tp0r += src1r * x1r;                         \
    tp0r OP0 src0i * x0i;                        \
    tp0r OP0 src1i * x1i;                        \
    tp1r += src2r * x0r;                         \
    tp1r += src3r * x1r;                         \
    tp1r OP0 src2i * x0i;                        \
    tp1r OP0 src3i * x1i;                        \
    tp0i OP1 src0r * x0i;                        \
    tp0i OP1 src1r * x1i;                        \
    tp0i OP2 src0i * x0r;                        \
    tp0i OP2 src1i * x1r;                        \
    tp1i OP1 src2r * x0i;                        \
    tp1i OP1 src3r * x1i;                        \
    tp1i OP2 src2i * x0r;                        \
    tp1i OP2 src3i * x1r;                        \

/*
 * CGEMV_T_8x1: 8-complex-element step for a single matrix column pa0,
 * accumulating into tp0r/tp0i.  See CGEMV_T_8x4 for details.
 */
#define CGEMV_T_8x1()                            \
    LD_SP4(pa0 + k, 4, t0, t1, t2, t3);          \
    PCKEVOD_W2_SP(t1, t0, src0r, src0i);         \
    PCKEVOD_W2_SP(t3, t2, src1r, src1i);         \
    tp0r += src0r * x0r;                         \
    tp0r += src1r * x1r;                         \
    tp0r OP0 src0i * x0i;                        \
    tp0r OP0 src1i * x1i;                        \
    tp0i OP1 src0r * x0i;                        \
    tp0i OP1 src1r * x1i;                        \
    tp0i OP2 src0i * x0r;                        \
    tp0i OP2 src1i * x1r;                        \

/*
 * CGEMV_T_4x4: 4-complex-element step (x split into a single x0r/x0i pair)
 * against 4 matrix columns pa0..pa3, accumulating into tp0{r,i}..tp3{r,i}.
 * Half-width variant of CGEMV_T_8x4; see that macro for details.
 */
#define CGEMV_T_4x4()                            \
    LD_SP2(pa0 + k, 4, t0, t1);                  \
    LD_SP2(pa1 + k, 4, t4, t5);                  \
    LD_SP2(pa2 + k, 4, t8, t9);                  \
    LD_SP2(pa3 + k, 4, t12, t13);                \
    PCKEVOD_W2_SP(t1, t0, src0r, src0i);         \
    PCKEVOD_W2_SP(t5, t4, src2r, src2i);         \
    PCKEVOD_W2_SP(t9, t8, src4r, src4i);         \
    PCKEVOD_W2_SP(t13, t12, src6r, src6i);       \
    tp0r += src0r * x0r;                         \
    tp0r OP0 src0i * x0i;                        \
    tp1r += src2r * x0r;                         \
    tp1r OP0 src2i * x0i;                        \
    tp2r += src4r * x0r;                         \
    tp2r OP0 src4i * x0i;                        \
    tp3r += src6r * x0r;                         \
    tp3r OP0 src6i * x0i;                        \
    tp0i OP1 src0r * x0i;                        \
    tp0i OP2 src0i * x0r;                        \
    tp1i OP1 src2r * x0i;                        \
    tp1i OP2 src2i * x0r;                        \
    tp2i OP1 src4r * x0i;                        \
    tp2i OP2 src4i * x0r;                        \
    tp3i OP1 src6r * x0i;                        \
    tp3i OP2 src6i * x0r;                        \

/*
 * CGEMV_T_4x2: 4-complex-element step against 2 matrix columns (pa0, pa1),
 * accumulating into tp0{r,i} and tp1{r,i}.  See CGEMV_T_8x4 for details.
 */
#define CGEMV_T_4x2()                            \
    LD_SP2(pa0 + k, 4, t0, t1);                  \
    LD_SP2(pa1 + k, 4, t4, t5);                  \
    PCKEVOD_W2_SP(t1, t0, src0r, src0i);         \
    PCKEVOD_W2_SP(t5, t4, src2r, src2i);         \
    tp0r += src0r * x0r;                         \
    tp0r OP0 src0i * x0i;                        \
    tp1r += src2r * x0r;                         \
    tp1r OP0 src2i * x0i;                        \
    tp0i OP1 src0r * x0i;                        \
    tp0i OP2 src0i * x0r;                        \
    tp1i OP1 src2r * x0i;                        \
    tp1i OP2 src2i * x0r;                        \

/*
 * CGEMV_T_4x1: 4-complex-element step against a single matrix column pa0,
 * accumulating into tp0r/tp0i.  See CGEMV_T_8x4 for details.
 */
#define CGEMV_T_4x1()                            \
    LD_SP2(pa0 + k, 4, t0, t1);                  \
    PCKEVOD_W2_SP(t1, t0, src0r, src0i);         \
    tp0r += src0r * x0r;                         \
    tp0r OP0 src0i * x0i;                        \
    tp0i OP1 src0r * x0i;                        \
    tp0i OP2 src0i * x0r;                        \

/*
 * CGEMV_T_1x4: scalar tail step — one complex element of x (at the current
 * x pointer; x[0]/x[1] are its real/imag parts, inc_x2 is the stride in
 * FLOAT units) against 4 matrix columns pa0..pa3, accumulating into the
 * scalar temporaries temp0{r,i}..temp3{r,i}.  OP0/OP1/OP2 select the
 * add/subtract pattern for the active conjugation variant.
 */
#define CGEMV_T_1x4()                            \
    temp0r += pa0[k + 0] * x[0 * inc_x2];        \
    temp0r OP0 pa0[k + 1] * x[0 * inc_x2 + 1];   \
    temp1r += pa1[k + 0] * x[0 * inc_x2];        \
    temp1r OP0 pa1[k + 1] * x[0 * inc_x2 + 1];   \
    temp2r += pa2[k + 0] * x[0 * inc_x2];        \
    temp2r OP0 pa2[k + 1] * x[0 * inc_x2 + 1];   \
    temp3r += pa3[k + 0] * x[0 * inc_x2];        \
    temp3r OP0 pa3[k + 1] * x[0 * inc_x2 + 1];   \
    temp0i OP1 pa0[k + 0] * x[0 * inc_x2 + 1];   \
    temp0i OP2 pa0[k + 1] * x[0 * inc_x2];       \
    temp1i OP1 pa1[k + 0] * x[0 * inc_x2 + 1];   \
    temp1i OP2 pa1[k + 1] * x[0 * inc_x2];       \
    temp2i OP1 pa2[k + 0] * x[0 * inc_x2 + 1];   \
    temp2i OP2 pa2[k + 1] * x[0 * inc_x2];       \
    temp3i OP1 pa3[k + 0] * x[0 * inc_x2 + 1];   \
    temp3i OP2 pa3[k + 1] * x[0 * inc_x2];       \

/*
 * CGEMV_T_1x2: scalar tail step — one complex element of x against 2 matrix
 * columns (pa0, pa1), accumulating into temp0{r,i} and temp1{r,i}.
 * See CGEMV_T_1x4 for details.
 */
#define CGEMV_T_1x2()                            \
    temp0r += pa0[k + 0] * x[0 * inc_x2];        \
    temp0r OP0 pa0[k + 1] * x[0 * inc_x2 + 1];   \
    temp1r += pa1[k + 0] * x[0 * inc_x2];        \
    temp1r OP0 pa1[k + 1] * x[0 * inc_x2 + 1];   \
    temp0i OP1 pa0[k + 0] * x[0 * inc_x2 + 1];   \
    temp0i OP2 pa0[k + 1] * x[0 * inc_x2];       \
    temp1i OP1 pa1[k + 0] * x[0 * inc_x2 + 1];   \
    temp1i OP2 pa1[k + 1] * x[0 * inc_x2];       \

/*
 * CGEMV_T_1x1: scalar tail step — one complex element of x against a single
 * matrix column pa0, accumulating into temp0r/temp0i.
 * See CGEMV_T_1x4 for details.
 */
#define CGEMV_T_1x1()                            \
    temp0r += pa0[k + 0] * x[0 * inc_x2];        \
    temp0r OP0 pa0[k + 1] * x[0 * inc_x2 + 1];   \
    temp0i OP1 pa0[k + 0] * x[0 * inc_x2 + 1];   \
    temp0i OP2 pa0[k + 1] * x[0 * inc_x2];       \

/*
 * CSCALE_STORE_Y4_GP: y update for 4 output elements via general-purpose
 * (strided) accesses.  Loads 4 complex elements of y (stride inc_y2 in
 * FLOAT units), adds alpha * temp (complex multiply, with OP0/OP1/OP2
 * choosing the sign pattern for the active conjugation variant), and stores
 * the results back — i.e. y[j] += alpha * temp[j] for j = 0..3.
 */
#define CSCALE_STORE_Y4_GP()                     \
    res0r = y[0 * inc_y2];                       \
    res1r = y[1 * inc_y2];                       \
    res2r = y[2 * inc_y2];                       \
    res3r = y[3 * inc_y2];                       \
    res0i = y[0 * inc_y2 + 1];                   \
    res1i = y[1 * inc_y2 + 1];                   \
    res2i = y[2 * inc_y2 + 1];                   \
    res3i = y[3 * inc_y2 + 1];                   \
    res0r += alphar * temp0r;                    \
    res0r OP0 alphai * temp0i;                   \
    res1r += alphar * temp1r;                    \
    res1r OP0 alphai * temp1i;                   \
    res2r += alphar * temp2r;                    \
    res2r OP0 alphai * temp2i;                   \
    res3r += alphar * temp3r;                    \
    res3r OP0 alphai * temp3i;                   \
    res0i OP1 alphar * temp0i;                   \
    res0i OP2 alphai * temp0r;                   \
    res1i OP1 alphar * temp1i;                   \
    res1i OP2 alphai * temp1r;                   \
    res2i OP1 alphar * temp2i;                   \
    res2i OP2 alphai * temp2r;                   \
    res3i OP1 alphar * temp3i;                   \
    res3i OP2 alphai * temp3r;                   \
    y[0 * inc_y2] = res0r;                       \
    y[1 * inc_y2] = res1r;                       \
    y[2 * inc_y2] = res2r;                       \
    y[3 * inc_y2] = res3r;                       \
    y[0 * inc_y2 + 1] = res0i;                   \
    y[1 * inc_y2 + 1] = res1i;                   \
    y[2 * inc_y2 + 1] = res2i;                   \
    y[3 * inc_y2 + 1] = res3i;                   \

/*
 * CSCALE_STORE_Y2_GP: strided y update for 2 output elements —
 * y[j] += alpha * temp[j] for j = 0..1.  See CSCALE_STORE_Y4_GP.
 */
#define CSCALE_STORE_Y2_GP()                     \
    res0r = y[0 * inc_y2];                       \
    res1r = y[1 * inc_y2];                       \
    res0i = y[0 * inc_y2 + 1];                   \
    res1i = y[1 * inc_y2 + 1];                   \
    res0r += alphar * temp0r;                    \
    res0r OP0 alphai * temp0i;                   \
    res1r += alphar * temp1r;                    \
    res1r OP0 alphai * temp1i;                   \
    res0i OP1 alphar * temp0i;                   \
    res0i OP2 alphai * temp0r;                   \
    res1i OP1 alphar * temp1i;                   \
    res1i OP2 alphai * temp1r;                   \
    y[0 * inc_y2] = res0r;                       \
    y[1 * inc_y2] = res1r;                       \
    y[0 * inc_y2 + 1] = res0i;                   \
    y[1 * inc_y2 + 1] = res1i;                   \

/*
 * CSCALE_STORE_Y1_GP: strided y update for a single output element —
 * y[0] += alpha * temp0.  See CSCALE_STORE_Y4_GP.
 */
#define CSCALE_STORE_Y1_GP()                     \
    res0r = y[0 * inc_y2];                       \
    res0i = y[0 * inc_y2 + 1];                   \
    res0r += alphar * temp0r;                    \
    res0r OP0 alphai * temp0i;                   \
    res0i OP1 alphar * temp0i;                   \
    res0i OP2 alphai * temp0r;                   \
    y[0 * inc_y2] = res0r;                       \
    y[0 * inc_y2 + 1] = res0i;                   \

/*
 * CLOAD_X8_VECTOR: contiguous (unit-stride) load of 8 complex floats from x,
 * de-interleaved into real vectors x0r/x1r and imaginary vectors x0i/x1i.
 * Used when inc_x allows vector loads; CLOAD_X8_GP is the strided fallback.
 */
#define CLOAD_X8_VECTOR()                        \
    LD_SP4(x, 4, x0, x1, x2, x3);                \
    PCKEVOD_W2_SP(x1, x0, x0r, x0i);             \
    PCKEVOD_W2_SP(x3, x2, x1r, x1i);             \

/*
 * CLOAD_X4_VECTOR: contiguous load of 4 complex floats from x,
 * de-interleaved into x0r (real parts) and x0i (imaginary parts).
 */
#define CLOAD_X4_VECTOR()                        \
    LD_SP2(x, 4, x0, x1);                        \
    PCKEVOD_W2_SP(x1, x0, x0r, x0i);             \

/*
 * CLOAD_X8_GP: strided gather of 8 complex floats from x (stride inc_x2 in
 * FLOAT units) into real vectors x0r/x1r and imaginary vectors x0i/x1i.
 * Each float is reinterpreted as an int and placed lane-by-lane with
 * __msa_insert_w (bit-exact move, no numeric conversion).  tp0r is only the
 * source operand for the first insert of each vector; all four of its lanes
 * are overwritten, so its prior value does not matter.
 * NOTE(review): the int-pointer access to float data relies on the MSA
 * insert idiom used throughout these kernels; assumes FLOAT is 32-bit here.
 */
#define CLOAD_X8_GP()                                                               \
    x0r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2)));     \
    x0r = (v4f32) __msa_insert_w((v4i32) x0r,  1, *((int *) (x + 1 * inc_x2)));     \
    x0r = (v4f32) __msa_insert_w((v4i32) x0r,  2, *((int *) (x + 2 * inc_x2)));     \
    x0r = (v4f32) __msa_insert_w((v4i32) x0r,  3, *((int *) (x + 3 * inc_x2)));     \
    x1r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 4 * inc_x2)));     \
    x1r = (v4f32) __msa_insert_w((v4i32) x1r,  1, *((int *) (x + 5 * inc_x2)));     \
    x1r = (v4f32) __msa_insert_w((v4i32) x1r,  2, *((int *) (x + 6 * inc_x2)));     \
    x1r = (v4f32) __msa_insert_w((v4i32) x1r,  3, *((int *) (x + 7 * inc_x2)));     \
    x0i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2 + 1))); \
    x0i = (v4f32) __msa_insert_w((v4i32) x0i,  1, *((int *) (x + 1 * inc_x2 + 1))); \
    x0i = (v4f32) __msa_insert_w((v4i32) x0i,  2, *((int *) (x + 2 * inc_x2 + 1))); \
    x0i = (v4f32) __msa_insert_w((v4i32) x0i,  3, *((int *) (x + 3 * inc_x2 + 1))); \
    x1i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 4 * inc_x2 + 1))); \
    x1i = (v4f32) __msa_insert_w((v4i32) x1i,  1, *((int *) (x + 5 * inc_x2 + 1))); \
    x1i = (v4f32) __msa_insert_w((v4i32) x1i,  2, *((int *) (x + 6 * inc_x2 + 1))); \
    x1i = (v4f32) __msa_insert_w((v4i32) x1i,  3, *((int *) (x + 7 * inc_x2 + 1))); \

/*
 * CLOAD_X4_GP: strided gather of 4 complex floats from x into x0r (real
 * parts) and x0i (imaginary parts).  See CLOAD_X8_GP for the insert idiom.
 */
#define CLOAD_X4_GP()                                                               \
    x0r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2)));     \
    x0r = (v4f32) __msa_insert_w((v4i32) x0r,  1, *((int *) (x + 1 * inc_x2)));     \
    x0r = (v4f32) __msa_insert_w((v4i32) x0r,  2, *((int *) (x + 2 * inc_x2)));     \
    x0r = (v4f32) __msa_insert_w((v4i32) x0r,  3, *((int *) (x + 3 * inc_x2)));     \
    x0i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2 + 1))); \
    x0i = (v4f32) __msa_insert_w((v4i32) x0i,  1, *((int *) (x + 1 * inc_x2 + 1))); \
    x0i = (v4f32) __msa_insert_w((v4i32) x0i,  2, *((int *) (x + 2 * inc_x2 + 1))); \
    x0i = (v4f32) __msa_insert_w((v4i32) x0i,  3, *((int *) (x + 3 * inc_x2 + 1))); \

/*
 * CGEMV_T_MSA: driver for the transposed complex GEMV.  Processes output
 * elements 4 at a time (outer j loop over n >> 2), then appears to handle
 * 2-column and 1-column remainders; each pass runs a vectorized m >> 3 inner
 * loop with software prefetch, a scalar m & 3 tail, a transpose/horizontal
 * reduction of the vector accumulators, and a CSCALE_STORE_Y*_GP update.
 *
 * NOTE(review): this excerpt omits many interior lines of the macro (braces,
 * CLOAD_X*/CGEMV_T_* invocations, pointer advances, reductions) — the
 * original-file line numbers jump, e.g. 380-398 are missing.  The lines
 * below are reproduced exactly as given; consult the full file before
 * editing this macro.
 */
#define CGEMV_T_MSA()                            \
    for (j = (n >> 2); j--;)                     \
    tp0r = tp1r = tp2r = tp3r = zero;            \
    tp0i = tp1i = tp2i = tp3i = zero;            \
    k_pref = pref_offset;                        \
    for (i = (m >> 3); i--;)                     \
    PREFETCH(pa0 + k_pref + 16 + 0);             \
    PREFETCH(pa0 + k_pref + 16 + 8);             \
    PREFETCH(pa1 + k_pref + 16 + 0);             \
    PREFETCH(pa1 + k_pref + 16 + 8);             \
    PREFETCH(pa2 + k_pref + 16 + 0);             \
    PREFETCH(pa2 + k_pref + 16 + 8);             \
    PREFETCH(pa3 + k_pref + 16 + 0);             \
    PREFETCH(pa3 + k_pref + 16 + 8);             \
    TRANSPOSE4x4_SP_SP(tp0r, tp1r, tp2r, tp3r,   \
                       tp0r, tp1r, tp2r, tp3r);  \
    TRANSPOSE4x4_SP_SP(tp0i, tp1i, tp2i, tp3i,   \
                       tp0i, tp1i, tp2i, tp3i);  \
    for (i = (m & 3); i--;)                      \
    CSCALE_STORE_Y4_GP();                        \
    tp0r = tp1r = zero;                          \
    tp0i = tp1i = zero;                          \
    for (i = (m >> 3); i--;)                     \
    TRANSPOSE4x4_SP_SP(tp0r, tp1r, tp0i, tp1i,   \
                       tp0r, tp1r, tp0i, tp1i);  \
    for (i = (m & 3); i--;)                      \
    CSCALE_STORE_Y2_GP();                        \
    for (i = (m >> 3); i--;)                     \
    ILVRL_W2_SP(tp0i, tp0r, t0, t1);             \
    temp0r = t0[0] + t0[2];                      \
    temp0i = t0[1] + t0[3];                      \
    for (i = (m & 3); i--;)                      \
    CSCALE_STORE_Y1_GP();                        \

541 int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alphar, FLOAT alphai,
542 FLOAT *A, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y,
543 BLASLONG inc_y, FLOAT *buffer)
545 BLASLONG i, j, k, k_pref, pref_offset;
546 FLOAT *pa0, *pa1, *pa2, *pa3;
548 FLOAT temp0r, temp0i, temp2r, temp2i, temp1r, temp1i, temp3r, temp3i;
549 FLOAT res0r, res0i, res2r, res2i, res1r, res1i, res3r, res3i;
550 BLASLONG inc_x2, inc_y2, lda2;
552 v4f32 x0, x1, x2, x3, x0r, x1r, x0i, x1i;
553 v4f32 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
554 v4f32 src0r, src1r, src2r, src3r, src4r, src5r, src6r, src7r;
555 v4f32 src0i, src1i, src2i, src3i, src4i, src5i, src6i, src7i;
556 v4f32 tp0r, tp1r, tp2r, tp3r, tp0i, tp1i, tp2i, tp3i;
560 pref_offset = (uintptr_t)A & (L1_DATA_LINESIZE - 1);
561 pref_offset = L1_DATA_LINESIZE - pref_offset;
562 pref_offset = pref_offset / sizeof(FLOAT);
574 #define CLOAD_X8 CLOAD_X8_VECTOR
575 #define CLOAD_X4 CLOAD_X4_VECTOR
584 #define CLOAD_X8 CLOAD_X8_GP
585 #define CLOAD_X4 CLOAD_X4_GP