1 /*******************************************************************************
2 Copyright (c) 2016, The OpenBLAS Project
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are met:
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the distribution.
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *******************************************************************************/
29 #include "macros_msa.h"
37 #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
/* ZGEMV_T_8x1: fold 8 consecutive complex (double) column elements of pa0
 * into the running dot-product accumulators tp0r / tp0i.
 * Loads 16 doubles, de-interleaves real/imag lanes with PCKEVOD_D2_DP, then
 * multiply-accumulates against the pre-split x vectors x0r/x0i .. x3r/x3i.
 * OP0/OP1/OP2 expand to +=/-= chosen by the CONJ/XCONJ #if above (their
 * definitions are outside this chunk — TODO confirm) so one body covers all
 * conjugation variants. */
47 #define ZGEMV_T_8x1() \
48 LD_DP4(pa0, 2, t0, t1, t2, t3); \
49 LD_DP4(pa0 + 8, 2, t4, t5, t6, t7); \
51 PCKEVOD_D2_DP(t1, t0, src0r, src0i); \
52 PCKEVOD_D2_DP(t3, t2, src1r, src1i); \
53 PCKEVOD_D2_DP(t5, t4, src2r, src2i); \
54 PCKEVOD_D2_DP(t7, t6, src3r, src3i); \
56 tp0r += src0r * x0r; \
57 tp0i OP1 src0r * x0i; \
58 tp0r OP0 src0i * x0i; \
59 tp0i OP2 src0i * x0r; \
61 tp0r += src2r * x2r; \
62 tp0i OP1 src2r * x2i; \
63 tp0r OP0 src2i * x2i; \
64 tp0i OP2 src2i * x2r; \
66 tp0r += src1r * x1r; \
67 tp0i OP1 src1r * x1i; \
68 tp0r OP0 src1i * x1i; \
69 tp0i OP2 src1i * x1r; \
71 tp0r += src3r * x3r; \
72 tp0i OP1 src3r * x3i; \
73 tp0r OP0 src3i * x3i; \
74 tp0i OP2 src3i * x3r; \
/* ZGEMV_T_4x1: same as ZGEMV_T_8x1 but for 4 complex elements of pa0
 * (8 doubles, two PCKEVOD real/imag splits) accumulated into tp0r / tp0i
 * against x0r/x0i and x1r/x1i. */
76 #define ZGEMV_T_4x1() \
77 LD_DP4(pa0, 2, t0, t1, t2, t3); \
79 PCKEVOD_D2_DP(t1, t0, src0r, src0i); \
80 PCKEVOD_D2_DP(t3, t2, src1r, src1i); \
82 tp0r += src0r * x0r; \
83 tp0r += src1r * x1r; \
84 tp0r OP0 src0i * x0i; \
85 tp0r OP0 src1i * x1i; \
87 tp0i OP1 src0r * x0i; \
88 tp0i OP1 src1r * x1i; \
89 tp0i OP2 src0i * x0r; \
90 tp0i OP2 src1i * x1r; \
/* ZGEMV_T_2x1: 2-complex-element step — one load pair, one real/imag split,
 * one complex multiply-accumulate into tp0r / tp0i. */
92 #define ZGEMV_T_2x1() \
93 LD_DP2(pa0, 2, t0, t1); \
95 PCKEVOD_D2_DP(t1, t0, src0r, src0i); \
97 tp0r += src0r * x0r; \
98 tp0r OP0 src0i * x0i; \
100 tp0i OP1 src0r * x0i; \
101 tp0i OP2 src0i * x0r; \
/* ZGEMV_T_1x1: scalar tail — one complex multiply-accumulate of
 * pa0[0..1] (re,im) with x[0..1] into temp0r / temp0i, with the
 * OP0/OP1/OP2 signs handling the conjugation variant. */
103 #define ZGEMV_T_1x1() \
104 temp0r += pa0[0] * x[0 * inc_x2]; \
105 temp0r OP0 pa0[1] * x[0 * inc_x2 + 1]; \
107 temp0i OP1 pa0[0] * x[0 * inc_x2 + 1]; \
108 temp0i OP2 pa0[1] * x[0 * inc_x2]; \
/* ZSCALE_STORE_Y1_GP: y[0] += alpha * temp0 (complex), scalar form.
 * Reads one complex element of y, adds the alpha-scaled accumulated
 * column sum (temp0r, temp0i) with conjugation-dependent signs, and
 * writes the result back. */
110 #define ZSCALE_STORE_Y1_GP() \
111 res0r = y[0 * inc_y2]; \
112 res0i = y[0 * inc_y2 + 1]; \
114 res0r += alphar * temp0r; \
115 res0r OP0 alphai * temp0i; \
117 res0i OP1 alphar * temp0i; \
118 res0i OP2 alphai * temp0r; \
120 y[0 * inc_y2] = res0r; \
121 y[0 * inc_y2 + 1] = res0i; \
/* ZLOAD_X8_VECTOR: load 8 contiguous complex x elements (unit-stride
 * path) and split them into real vectors x0r..x3r and imaginary vectors
 * x0i..x3i via PCKEVOD_D2_DP. */
123 #define ZLOAD_X8_VECTOR() \
124 LD_DP4(x, 2, x0, x1, x2, x3); \
125 LD_DP4(x + 8, 2, x4, x5, x6, x7); \
127 PCKEVOD_D2_DP(x1, x0, x0r, x0i); \
128 PCKEVOD_D2_DP(x3, x2, x1r, x1i); \
129 PCKEVOD_D2_DP(x5, x4, x2r, x2i); \
130 PCKEVOD_D2_DP(x7, x6, x3r, x3i); \
/* ZLOAD_X4_VECTOR: unit-stride load of 4 complex x elements, split into
 * x0r/x0i and x1r/x1i. */
132 #define ZLOAD_X4_VECTOR() \
133 LD_DP4(x, 2, x0, x1, x2, x3); \
134 PCKEVOD_D2_DP(x1, x0, x0r, x0i); \
135 PCKEVOD_D2_DP(x3, x2, x1r, x1i); \
/* ZLOAD_X2_VECTOR: unit-stride load of 2 complex x elements, split into
 * x0r/x0i. */
137 #define ZLOAD_X2_VECTOR() \
138 LD_DP2(x, 2, x0, x1); \
139 PCKEVOD_D2_DP(x1, x0, x0r, x0i); \
/* ZLOAD_X8_GP: gather 8 strided complex x elements one 64-bit lane at a
 * time with __msa_insert_d (general-purpose path for inc_x != 1).
 * tp0r serves only as an arbitrary already-initialized seed vector for the
 * lane-0 insert; both lanes of each destination are overwritten, so its
 * value never leaks into the result.
 * NOTE(review): the loads read double bits through a (long long *) cast —
 * a strict-aliasing type-pun; presumably the build relies on the compiler
 * tolerating this. Verify against project build flags. */
141 #define ZLOAD_X8_GP() \
142 x0r = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 0 * inc_x2))); \
143 x0r = (v2f64) __msa_insert_d((v2i64) x0r, 1, *((long long *) (x + 1 * inc_x2))); \
144 x1r = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 2 * inc_x2))); \
145 x1r = (v2f64) __msa_insert_d((v2i64) x1r, 1, *((long long *) (x + 3 * inc_x2))); \
146 x2r = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 4 * inc_x2))); \
147 x2r = (v2f64) __msa_insert_d((v2i64) x2r, 1, *((long long *) (x + 5 * inc_x2))); \
148 x3r = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 6 * inc_x2))); \
149 x3r = (v2f64) __msa_insert_d((v2i64) x3r, 1, *((long long *) (x + 7 * inc_x2))); \
150 x0i = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 0 * inc_x2 + 1))); \
151 x0i = (v2f64) __msa_insert_d((v2i64) x0i, 1, *((long long *) (x + 1 * inc_x2 + 1))); \
152 x1i = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 2 * inc_x2 + 1))); \
153 x1i = (v2f64) __msa_insert_d((v2i64) x1i, 1, *((long long *) (x + 3 * inc_x2 + 1))); \
154 x2i = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 4 * inc_x2 + 1))); \
155 x2i = (v2f64) __msa_insert_d((v2i64) x2i, 1, *((long long *) (x + 5 * inc_x2 + 1))); \
156 x3i = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 6 * inc_x2 + 1))); \
157 x3i = (v2f64) __msa_insert_d((v2i64) x3i, 1, *((long long *) (x + 7 * inc_x2 + 1))); \
/* ZLOAD_X4_GP: strided gather of 4 complex x elements into x0r/x0i and
 * x1r/x1i (see ZLOAD_X8_GP for the lane-insert pattern; tp0r is only a
 * dummy seed for the lane-0 inserts). */
159 #define ZLOAD_X4_GP() \
160 x0r = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 0 * inc_x2))); \
161 x0r = (v2f64) __msa_insert_d((v2i64) x0r, 1, *((long long *) (x + 1 * inc_x2))); \
162 x1r = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 2 * inc_x2))); \
163 x1r = (v2f64) __msa_insert_d((v2i64) x1r, 1, *((long long *) (x + 3 * inc_x2))); \
164 x0i = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 0 * inc_x2 + 1))); \
165 x0i = (v2f64) __msa_insert_d((v2i64) x0i, 1, *((long long *) (x + 1 * inc_x2 + 1))); \
166 x1i = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 2 * inc_x2 + 1))); \
167 x1i = (v2f64) __msa_insert_d((v2i64) x1i, 1, *((long long *) (x + 3 * inc_x2 + 1))); \
/* ZLOAD_X2_GP: strided gather of 2 complex x elements into x0r/x0i. */
169 #define ZLOAD_X2_GP() \
170 x0r = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 0 * inc_x2))); \
171 x0r = (v2f64) __msa_insert_d((v2i64) x0r, 1, *((long long *) (x + 1 * inc_x2))); \
172 x0i = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 0 * inc_x2 + 1))); \
173 x0i = (v2f64) __msa_insert_d((v2i64) x0i, 1, *((long long *) (x + 1 * inc_x2 + 1))); \
/* ZGEMV_T_MSA: main kernel body — software-pipelined dot product of one
 * matrix column (pa0) with x, 16 complex elements per loop iteration.
 * Structure visible here:
 *   - priming loads of x1/x4/x5 and t1/t4/t5 before the loop;
 *   - `for (i = (m >> 4) - 1; i--;)` runs (m>>4)-1 iterations, each
 *     interleaving PREFETCH of pa0 ahead, pckev/pckod real-imag splits,
 *     multiply-accumulates into the four accumulator pairs
 *     tp0r..tp3r / tp0i..tp3i, and the loads for the NEXT group of 16
 *     (x + inc_x2*16..., pa0 + 2*16...) — i.e. loads for iteration k+1
 *     are issued during the arithmetic of iteration k;
 *   - a peeled final group of 16 after the loop that consumes the last
 *     pre-loaded data without issuing further loads;
 *   - reduction: tp1..tp3 folded into tp0, the two vector lanes summed
 *     into scalars temp0r/temp0i, then ZSCALE_STORE_Y1_GP() updates y.
 * NOTE(review): several interior lines of the original (loop braces,
 * accumulator zeroing, pa0/x pointer advances, the first x0/t0 loads)
 * are not visible in this extraction — statement order is load-latency
 * driven, so do not reorder when editing. OP0/OP1/OP2 come from the
 * CONJ/XCONJ #if above. */
175 #define ZGEMV_T_MSA() \
193 x1 = LD_DP(x + 1 * inc_x2); \
195 t1 = LD_DP(pa0 + 2); \
197 x4 = LD_DP(x + 4 * inc_x2); \
198 x5 = LD_DP(x + 5 * inc_x2); \
199 t4 = LD_DP(pa0 + 8); \
200 t5 = LD_DP(pa0 + 10); \
202 for (i = (m >> 4) - 1; i--;) \
204 pa0_pref = pa0 + pref_offset; \
206 PREFETCH(pa0_pref + 36); \
207 PREFETCH(pa0_pref + 44); \
208 PREFETCH(pa0_pref + 48); \
209 PREFETCH(pa0_pref + 52); \
210 PREFETCH(pa0_pref + 56); \
211 PREFETCH(pa0_pref + 60); \
212 PREFETCH(pa0_pref + 64); \
213 PREFETCH(pa0_pref + 72); \
215 x0r = (v2f64) __msa_pckev_d((v2i64) x1, (v2i64) x0); \
216 x0i = (v2f64) __msa_pckod_d((v2i64) x1, (v2i64) x0); \
217 src0r = (v2f64) __msa_pckev_d((v2i64) t1, (v2i64) t0); \
218 src0i = (v2f64) __msa_pckod_d((v2i64) t1, (v2i64) t0); \
220 tp0r += src0r * x0r; \
221 x2 = LD_DP(x + 2 * inc_x2); \
222 x2r = (v2f64) __msa_pckev_d((v2i64) x5, (v2i64) x4); \
224 tp0i OP1 src0r * x0i; \
225 x3 = LD_DP(x + 3 * inc_x2); \
226 x2i = (v2f64) __msa_pckod_d((v2i64) x5, (v2i64) x4); \
228 tp1r OP0 src0i * x0i; \
229 t2 = LD_DP(pa0 + 4); \
230 src2r = (v2f64) __msa_pckev_d((v2i64) t5, (v2i64) t4); \
232 tp1i OP2 src0i * x0r; \
233 t3 = LD_DP(pa0 + 6); \
234 src2i = (v2f64) __msa_pckod_d((v2i64) t5, (v2i64) t4); \
236 tp2r += src2r * x2r; \
237 x6 = LD_DP(x + 6 * inc_x2); \
239 tp2i OP1 src2r * x2i; \
240 x7 = LD_DP(x + 7 * inc_x2); \
242 tp3r OP0 src2i * x2i; \
243 t6 = LD_DP(pa0 + 12); \
245 tp3i OP2 src2i * x2r; \
246 t7 = LD_DP(pa0 + 14); \
248 x1r = (v2f64) __msa_pckev_d((v2i64) x3, (v2i64) x2); \
249 x1i = (v2f64) __msa_pckod_d((v2i64) x3, (v2i64) x2); \
250 src1r = (v2f64) __msa_pckev_d((v2i64) t3, (v2i64) t2); \
251 src1i = (v2f64) __msa_pckod_d((v2i64) t3, (v2i64) t2); \
253 tp0r += src1r * x1r; \
254 x0 = LD_DP(x + 8 * inc_x2); \
255 x3r = (v2f64) __msa_pckev_d((v2i64) x7, (v2i64) x6); \
257 tp0i OP1 src1r * x1i; \
258 x1 = LD_DP(x + 9 * inc_x2); \
259 x3i = (v2f64) __msa_pckod_d((v2i64) x7, (v2i64) x6); \
261 tp1r OP0 src1i * x1i; \
262 t0 = LD_DP(pa0 + 16); \
263 src3r = (v2f64) __msa_pckev_d((v2i64) t7, (v2i64) t6); \
265 tp1i OP2 src1i * x1r; \
266 t1 = LD_DP(pa0 + 18); \
267 src3i = (v2f64) __msa_pckod_d((v2i64) t7, (v2i64) t6); \
269 tp2r += src3r * x3r; \
270 x4 = LD_DP(x + 12 * inc_x2); \
272 tp2i OP1 src3r * x3i; \
273 x5 = LD_DP(x + 13 * inc_x2); \
275 tp3r OP0 src3i * x3i; \
276 t4 = LD_DP(pa0 + 24); \
278 tp3i OP2 src3i * x3r; \
279 t5 = LD_DP(pa0 + 26); \
281 x0r = (v2f64) __msa_pckev_d((v2i64) x1, (v2i64) x0); \
282 x0i = (v2f64) __msa_pckod_d((v2i64) x1, (v2i64) x0); \
283 src0r = (v2f64) __msa_pckev_d((v2i64) t1, (v2i64) t0); \
284 src0i = (v2f64) __msa_pckod_d((v2i64) t1, (v2i64) t0); \
286 tp0r += src0r * x0r; \
287 x2 = LD_DP(x + 10 * inc_x2); \
288 x2r = (v2f64) __msa_pckev_d((v2i64) x5, (v2i64) x4); \
290 tp0i OP1 src0r * x0i; \
291 x3 = LD_DP(x + 11 * inc_x2); \
292 x2i = (v2f64) __msa_pckod_d((v2i64) x5, (v2i64) x4); \
294 tp1r OP0 src0i * x0i; \
295 t2 = LD_DP(pa0 + 20); \
296 src2r = (v2f64) __msa_pckev_d((v2i64) t5, (v2i64) t4); \
298 tp1i OP2 src0i * x0r; \
299 t3 = LD_DP(pa0 + 22); \
300 src2i = (v2f64) __msa_pckod_d((v2i64) t5, (v2i64) t4); \
302 tp2r += src2r * x2r; \
303 x6 = LD_DP(x + 14 * inc_x2); \
305 tp2i OP1 src2r * x2i; \
306 x7 = LD_DP(x + 15 * inc_x2); \
308 tp3r OP0 src2i * x2i; \
309 t6 = LD_DP(pa0 + 28); \
311 tp3i OP2 src2i * x2r; \
312 t7 = LD_DP(pa0 + 30); \
314 x1r = (v2f64) __msa_pckev_d((v2i64) x3, (v2i64) x2); \
315 x1i = (v2f64) __msa_pckod_d((v2i64) x3, (v2i64) x2); \
316 src1r = (v2f64) __msa_pckev_d((v2i64) t3, (v2i64) t2); \
317 src1i = (v2f64) __msa_pckod_d((v2i64) t3, (v2i64) t2); \
319 tp0r += src1r * x1r; \
320 x0 = LD_DP(x + inc_x2 * 16); \
321 x3r = (v2f64) __msa_pckev_d((v2i64) x7, (v2i64) x6); \
323 tp0i OP1 src1r * x1i; \
324 x1 = LD_DP(x + inc_x2 * 16 + 1 * inc_x2); \
325 x3i = (v2f64) __msa_pckod_d((v2i64) x7, (v2i64) x6); \
327 tp1r OP0 src1i * x1i; \
328 t0 = LD_DP(pa0 + 2 * 16); \
329 src3r = (v2f64) __msa_pckev_d((v2i64) t7, (v2i64) t6); \
331 tp1i OP2 src1i * x1r; \
332 t1 = LD_DP(pa0 + 2 * 16 + 2); \
333 src3i = (v2f64) __msa_pckod_d((v2i64) t7, (v2i64) t6); \
335 tp2r += src3r * x3r; \
336 x4 = LD_DP(x + inc_x2 * 16 + 4 * inc_x2); \
338 tp2i OP1 src3r * x3i; \
339 x5 = LD_DP(x + inc_x2 * 16 + 5 * inc_x2); \
341 tp3r OP0 src3i * x3i; \
342 t4 = LD_DP(pa0 + 2 * 16 + 8); \
344 tp3i OP2 src3i * x3r; \
345 t5 = LD_DP(pa0 + 2 * 16 + 10); \
351 x0r = (v2f64) __msa_pckev_d((v2i64) x1, (v2i64) x0); \
352 x0i = (v2f64) __msa_pckod_d((v2i64) x1, (v2i64) x0); \
353 src0r = (v2f64) __msa_pckev_d((v2i64) t1, (v2i64) t0); \
354 src0i = (v2f64) __msa_pckod_d((v2i64) t1, (v2i64) t0); \
356 tp0r += src0r * x0r; \
357 x2 = LD_DP(x + 2 * inc_x2); \
358 x2r = (v2f64) __msa_pckev_d((v2i64) x5, (v2i64) x4); \
360 tp0i OP1 src0r * x0i; \
361 x3 = LD_DP(x + 3 * inc_x2); \
362 x2i = (v2f64) __msa_pckod_d((v2i64) x5, (v2i64) x4); \
364 tp1r OP0 src0i * x0i; \
365 t2 = LD_DP(pa0 + 4); \
366 src2r = (v2f64) __msa_pckev_d((v2i64) t5, (v2i64) t4); \
368 tp1i OP2 src0i * x0r; \
369 t3 = LD_DP(pa0 + 6); \
370 src2i = (v2f64) __msa_pckod_d((v2i64) t5, (v2i64) t4); \
372 tp2r += src2r * x2r; \
373 x6 = LD_DP(x + 6 * inc_x2); \
375 tp2i OP1 src2r * x2i; \
376 x7 = LD_DP(x + 7 * inc_x2); \
378 tp3r OP0 src2i * x2i; \
379 t6 = LD_DP(pa0 + 12); \
381 tp3i OP2 src2i * x2r; \
382 t7 = LD_DP(pa0 + 14); \
384 x1r = (v2f64) __msa_pckev_d((v2i64) x3, (v2i64) x2); \
385 x1i = (v2f64) __msa_pckod_d((v2i64) x3, (v2i64) x2); \
386 src1r = (v2f64) __msa_pckev_d((v2i64) t3, (v2i64) t2); \
387 src1i = (v2f64) __msa_pckod_d((v2i64) t3, (v2i64) t2); \
389 tp0r += src1r * x1r; \
390 x0 = LD_DP(x + 8 * inc_x2); \
391 x3r = (v2f64) __msa_pckev_d((v2i64) x7, (v2i64) x6); \
393 tp0i OP1 src1r * x1i; \
394 x1 = LD_DP(x + 9 * inc_x2); \
395 x3i = (v2f64) __msa_pckod_d((v2i64) x7, (v2i64) x6); \
397 tp1r OP0 src1i * x1i; \
398 t0 = LD_DP(pa0 + 16); \
399 src3r = (v2f64) __msa_pckev_d((v2i64) t7, (v2i64) t6); \
401 tp1i OP2 src1i * x1r; \
402 t1 = LD_DP(pa0 + 18); \
403 src3i = (v2f64) __msa_pckod_d((v2i64) t7, (v2i64) t6); \
405 tp2r += src3r * x3r; \
406 x4 = LD_DP(x + 12 * inc_x2); \
408 tp2i OP1 src3r * x3i; \
409 x5 = LD_DP(x + 13 * inc_x2); \
411 tp3r OP0 src3i * x3i; \
412 t4 = LD_DP(pa0 + 24); \
414 tp3i OP2 src3i * x3r; \
415 t5 = LD_DP(pa0 + 26); \
417 x0r = (v2f64) __msa_pckev_d((v2i64) x1, (v2i64) x0); \
418 x0i = (v2f64) __msa_pckod_d((v2i64) x1, (v2i64) x0); \
419 src0r = (v2f64) __msa_pckev_d((v2i64) t1, (v2i64) t0); \
420 src0i = (v2f64) __msa_pckod_d((v2i64) t1, (v2i64) t0); \
422 tp0r += src0r * x0r; \
423 x2 = LD_DP(x + 10 * inc_x2); \
424 x2r = (v2f64) __msa_pckev_d((v2i64) x5, (v2i64) x4); \
426 tp0i OP1 src0r * x0i; \
427 x3 = LD_DP(x + 11 * inc_x2); \
428 x2i = (v2f64) __msa_pckod_d((v2i64) x5, (v2i64) x4); \
430 tp1r OP0 src0i * x0i; \
431 t2 = LD_DP(pa0 + 20); \
432 src2r = (v2f64) __msa_pckev_d((v2i64) t5, (v2i64) t4); \
434 tp1i OP2 src0i * x0r; \
435 t3 = LD_DP(pa0 + 22); \
436 src2i = (v2f64) __msa_pckod_d((v2i64) t5, (v2i64) t4); \
438 tp2r += src2r * x2r; \
439 x6 = LD_DP(x + 14 * inc_x2); \
441 tp2i OP1 src2r * x2i; \
442 x7 = LD_DP(x + 15 * inc_x2); \
444 tp3r OP0 src2i * x2i; \
445 t6 = LD_DP(pa0 + 28); \
447 tp3i OP2 src2i * x2r; \
448 t7 = LD_DP(pa0 + 30); \
450 x1r = (v2f64) __msa_pckev_d((v2i64) x3, (v2i64) x2); \
451 x1i = (v2f64) __msa_pckod_d((v2i64) x3, (v2i64) x2); \
452 src1r = (v2f64) __msa_pckev_d((v2i64) t3, (v2i64) t2); \
453 src1i = (v2f64) __msa_pckod_d((v2i64) t3, (v2i64) t2); \
455 tp0r += src1r * x1r; \
456 x3r = (v2f64) __msa_pckev_d((v2i64) x7, (v2i64) x6); \
458 tp0i OP1 src1r * x1i; \
459 x3i = (v2f64) __msa_pckod_d((v2i64) x7, (v2i64) x6); \
461 tp1r OP0 src1i * x1i; \
462 src3r = (v2f64) __msa_pckev_d((v2i64) t7, (v2i64) t6); \
464 tp1i OP2 src1i * x1r; \
465 src3i = (v2f64) __msa_pckod_d((v2i64) t7, (v2i64) t6); \
467 tp2r += src3r * x3r; \
468 tp2i OP1 src3r * x3i; \
469 tp3r OP0 src3i * x3i; \
470 tp3i OP2 src3i * x3r; \
475 tp0r += tp1r + tp2r + tp3r; \
476 tp0i += tp1i + tp2i + tp3i; \
506 temp0r = tp0r[0] + tp0r[1]; \
507 temp0i = tp0i[0] + tp0i[1]; \
517 ZSCALE_STORE_Y1_GP(); \
523 int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alphar, FLOAT alphai,
524 FLOAT *A, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y,
525 BLASLONG inc_y, FLOAT *buffer)
527 BLASLONG i, j, pref_offset;
528 BLASLONG inc_x2, inc_y2, lda2;
529 FLOAT *pa0, *pa0_pref;
531 FLOAT temp0r, temp0i;
534 v2f64 x0, x1, x2, x3, x0r, x1r, x0i, x1i;
535 v2f64 x4, x5, x6, x7, x2r, x3r, x2i, x3i;
536 v2f64 t0, t1, t2, t3, t4, t5, t6, t7;
537 v2f64 src0r, src1r, src2r, src3r;
538 v2f64 src0i, src1i, src2i, src3i;
539 v2f64 tp0r, tp1r, tp2r, tp3r, tp0i, tp1i, tp2i, tp3i;
546 pref_offset = (uintptr_t)A & L1_DATA_LINESIZE;
547 pref_offset = L1_DATA_LINESIZE - pref_offset;
548 pref_offset = pref_offset / sizeof(FLOAT);
552 #define ZLOAD_X8 ZLOAD_X8_VECTOR
553 #define ZLOAD_X4 ZLOAD_X4_VECTOR
554 #define ZLOAD_X2 ZLOAD_X2_VECTOR
564 #define ZLOAD_X8 ZLOAD_X8_GP
565 #define ZLOAD_X4 ZLOAD_X4_GP
566 #define ZLOAD_X2 ZLOAD_X2_GP