1 /*******************************************************************************
2 Copyright (c) 2016, The OpenBLAS Project
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *******************************************************************************/
29 #include "macros_msa.h"
/* Multiply-accumulate a strip of 4 complex elements from each of 4 matrix
 * columns (pa0..pa3, at element offset k) into the y accumulators:
 * y0r/y1r collect real parts, y0i/y1i imaginary parts.  LD_DP4 loads the
 * interleaved (re,im) doubles; PCKEVOD_D2_DP (macros_msa.h) splits them
 * into separate real/imag vectors.  tp0r/tp0i..tp3r/tp3i are the scaled x
 * factors broadcast per column.  OP0..OP2 expand to += or -= (defined
 * elsewhere in this file, outside this view) to select the conjugation/sign
 * variant.  NOTE(review): assumes the pckev (even) lanes are the real
 * parts -- confirm against PCKEVOD_D2_DP in macros_msa.h. */
#define ZGEMV_N_4x4()                        \
    LD_DP4(pa0 + k, 2, t0, t1, t2, t3);      \
    LD_DP4(pa1 + k, 2, t4, t5, t6, t7);      \
    LD_DP4(pa2 + k, 2, t8, t9, t10, t11);    \
    LD_DP4(pa3 + k, 2, t12, t13, t14, t15);  \
    PCKEVOD_D2_DP(t1, t0, src0r, src0i);     \
    PCKEVOD_D2_DP(t3, t2, src1r, src1i);     \
    PCKEVOD_D2_DP(t5, t4, src2r, src2i);     \
    PCKEVOD_D2_DP(t7, t6, src3r, src3i);     \
    PCKEVOD_D2_DP(t9, t8, src4r, src4i);     \
    PCKEVOD_D2_DP(t11, t10, src5r, src5i);   \
    PCKEVOD_D2_DP(t13, t12, src6r, src6i);   \
    PCKEVOD_D2_DP(t15, t14, src7r, src7i);   \
    y0r += tp0r * src0r;                     \
    y1r += tp0r * src1r;                     \
    y0r += tp1r * src2r;                     \
    y1r += tp1r * src3r;                     \
    y0r += tp2r * src4r;                     \
    y1r += tp2r * src5r;                     \
    y0r += tp3r * src6r;                     \
    y1r += tp3r * src7r;                     \
    y0r OP0 tp0i * src0i;                    \
    y1r OP0 tp0i * src1i;                    \
    y0r OP0 tp1i * src2i;                    \
    y1r OP0 tp1i * src3i;                    \
    y0r OP0 tp2i * src4i;                    \
    y1r OP0 tp2i * src5i;                    \
    y0r OP0 tp3i * src6i;                    \
    y1r OP0 tp3i * src7i;                    \
    y0i OP1 tp0r * src0i;                    \
    y1i OP1 tp0r * src1i;                    \
    y0i OP1 tp1r * src2i;                    \
    y1i OP1 tp1r * src3i;                    \
    y0i OP1 tp2r * src4i;                    \
    y1i OP1 tp2r * src5i;                    \
    y0i OP1 tp3r * src6i;                    \
    y1i OP1 tp3r * src7i;                    \
    y0i OP2 tp0i * src0r;                    \
    y1i OP2 tp0i * src1r;                    \
    y0i OP2 tp1i * src2r;                    \
    y1i OP2 tp1i * src3r;                    \
    y0i OP2 tp2i * src4r;                    \
    y1i OP2 tp2i * src5r;                    \
    y0i OP2 tp3i * src6r;                    \
    y1i OP2 tp3i * src7r;                    \

/* Same pattern as ZGEMV_N_4x4 but for a strip of only 2 complex elements
 * per column (one v2f64 accumulator pair y0r/y0i), still across 4 columns
 * pa0..pa3.  OP0..OP2 are the sign operators defined elsewhere in this
 * file. */
#define ZGEMV_N_2x4()                        \
    LD_DP2(pa0 + k, 2, t0, t1);              \
    LD_DP2(pa1 + k, 2, t4, t5);              \
    LD_DP2(pa2 + k, 2, t8, t9);              \
    LD_DP2(pa3 + k, 2, t12, t13);            \
    PCKEVOD_D2_DP(t1, t0, src0r, src0i);     \
    PCKEVOD_D2_DP(t5, t4, src2r, src2i);     \
    PCKEVOD_D2_DP(t9, t8, src4r, src4i);     \
    PCKEVOD_D2_DP(t13, t12, src6r, src6i);   \
    y0r += tp0r * src0r;                     \
    y0r += tp1r * src2r;                     \
    y0r += tp2r * src4r;                     \
    y0r += tp3r * src6r;                     \
    y0r OP0 tp0i * src0i;                    \
    y0r OP0 tp1i * src2i;                    \
    y0r OP0 tp2i * src4i;                    \
    y0r OP0 tp3i * src6i;                    \
    y0i OP1 tp0r * src0i;                    \
    y0i OP1 tp1r * src2i;                    \
    y0i OP1 tp2r * src4i;                    \
    y0i OP1 tp3r * src6i;                    \
    y0i OP2 tp0i * src0r;                    \
    y0i OP2 tp1i * src2r;                    \
    y0i OP2 tp2i * src4r;                    \
    y0i OP2 tp3i * src6r;                    \

/* Scalar tail: update a single complex y element (res0 = real, res1 = imag)
 * from 4 columns pa0..pa3 at offset k, using the scalar scaled-x factors
 * temp0_r/i..temp3_r/i.  pa*[k] is the real part, pa*[k + 1] the imaginary
 * part of a matrix element.  Reads and writes y directly, so no separate
 * load/store macro is needed. */
#define ZGEMV_N_1x4()                        \
    res0 = y[0 * inc_y2];                    \
    res1 = y[0 * inc_y2 + 1];                \
    res0 += temp0_r * pa0[k];                \
    res0 OP0 temp0_i * pa0[k + 1];           \
    res0 += temp1_r * pa1[k];                \
    res0 OP0 temp1_i * pa1[k + 1];           \
    res0 += temp2_r * pa2[k];                \
    res0 OP0 temp2_i * pa2[k + 1];           \
    res0 += temp3_r * pa3[k];                \
    res0 OP0 temp3_i * pa3[k + 1];           \
    res1 OP1 temp0_r * pa0[k + 1];           \
    res1 OP2 temp0_i * pa0[k];               \
    res1 OP1 temp1_r * pa1[k + 1];           \
    res1 OP2 temp1_i * pa1[k];               \
    res1 OP1 temp2_r * pa2[k + 1];           \
    res1 OP2 temp2_i * pa2[k];               \
    res1 OP1 temp3_r * pa3[k + 1];           \
    res1 OP2 temp3_i * pa3[k];               \
    y[0 * inc_y2] = res0;                    \
    y[0 * inc_y2 + 1] = res1;                \

/* 4 complex elements per column, but only 2 columns (pa0, pa1) -- the
 * column-remainder counterpart of ZGEMV_N_4x4.  Uses the tp0*/
/* and tp1* scaled-x factors only.  OP0..OP2 select the sign variant. */
#define ZGEMV_N_4x2()                        \
    LD_DP4(pa0 + k, 2, t0, t1, t2, t3);      \
    LD_DP4(pa1 + k, 2, t4, t5, t6, t7);      \
    PCKEVOD_D2_DP(t1, t0, src0r, src0i);     \
    PCKEVOD_D2_DP(t3, t2, src1r, src1i);     \
    PCKEVOD_D2_DP(t5, t4, src2r, src2i);     \
    PCKEVOD_D2_DP(t7, t6, src3r, src3i);     \
    y0r += tp0r * src0r;                     \
    y1r += tp0r * src1r;                     \
    y0r += tp1r * src2r;                     \
    y1r += tp1r * src3r;                     \
    y0r OP0 tp0i * src0i;                    \
    y1r OP0 tp0i * src1i;                    \
    y0r OP0 tp1i * src2i;                    \
    y1r OP0 tp1i * src3i;                    \
    y0i OP1 tp0r * src0i;                    \
    y1i OP1 tp0r * src1i;                    \
    y0i OP1 tp1r * src2i;                    \
    y1i OP1 tp1r * src3i;                    \
    y0i OP2 tp0i * src0r;                    \
    y1i OP2 tp0i * src1r;                    \
    y0i OP2 tp1i * src2r;                    \
    y1i OP2 tp1i * src3r;                    \

/* 2 complex elements from each of 2 columns (pa0, pa1) into the single
 * accumulator pair y0r/y0i.  OP0..OP2 select the sign variant. */
#define ZGEMV_N_2x2()                        \
    LD_DP2(pa0 + k, 2, t0, t1);              \
    LD_DP2(pa1 + k, 2, t4, t5);              \
    PCKEVOD_D2_DP(t1, t0, src0r, src0i);     \
    PCKEVOD_D2_DP(t5, t4, src2r, src2i);     \
    y0r += tp0r * src0r;                     \
    y0r += tp1r * src2r;                     \
    y0r OP0 tp0i * src0i;                    \
    y0r OP0 tp1i * src2i;                    \
    y0i OP1 tp0r * src0i;                    \
    y0i OP1 tp1r * src2i;                    \
    y0i OP2 tp0i * src0r;                    \
    y0i OP2 tp1i * src2r;                    \

/* Scalar tail for 2 columns: update one complex y element (res0 real,
 * res1 imag) from pa0 and pa1 at offset k using the scalar factors
 * temp0_r/i and temp1_r/i. */
#define ZGEMV_N_1x2()                        \
    res0 = y[0 * inc_y2];                    \
    res1 = y[0 * inc_y2 + 1];                \
    res0 += temp0_r * pa0[k];                \
    res0 OP0 temp0_i * pa0[k + 1];           \
    res0 += temp1_r * pa1[k];                \
    res0 OP0 temp1_i * pa1[k + 1];           \
    res1 OP1 temp0_r * pa0[k + 1];           \
    res1 OP2 temp0_i * pa0[k];               \
    res1 OP1 temp1_r * pa1[k + 1];           \
    res1 OP2 temp1_i * pa1[k];               \
    y[0 * inc_y2] = res0;                    \
    y[0 * inc_y2 + 1] = res1;                \

/* 4 complex elements from a single column pa0, accumulated into
 * y0r/y1r (real) and y0i/y1i (imag) with the tp0r/tp0i factors. */
#define ZGEMV_N_4x1()                        \
    LD_DP4(pa0 + k, 2, t0, t1, t2, t3);      \
    PCKEVOD_D2_DP(t1, t0, src0r, src0i);     \
    PCKEVOD_D2_DP(t3, t2, src1r, src1i);     \
    y0r += tp0r * src0r;                     \
    y1r += tp0r * src1r;                     \
    y0r OP0 tp0i * src0i;                    \
    y1r OP0 tp0i * src1i;                    \
    y0i OP1 tp0r * src0i;                    \
    y1i OP1 tp0r * src1i;                    \
    y0i OP2 tp0i * src0r;                    \
    y1i OP2 tp0i * src1r;                    \

/* 2 complex elements from a single column pa0 into y0r/y0i. */
#define ZGEMV_N_2x1()                        \
    LD_DP2(pa0 + k, 2, t0, t1);              \
    PCKEVOD_D2_DP(t1, t0, src0r, src0i);     \
    y0r += tp0r * src0r;                     \
    y0r OP0 tp0i * src0i;                    \
    y0i OP1 tp0r * src0i;                    \
    y0i OP2 tp0i * src0r;                    \

/* Fully scalar tail: one complex y element from one column pa0 at
 * offset k, using temp0_r/temp0_i. */
#define ZGEMV_N_1x1()                        \
    res0 = y[0 * inc_y2];                    \
    res1 = y[0 * inc_y2 + 1];                \
    res0 += temp0_r * pa0[k];                \
    res0 OP0 temp0_i * pa0[k + 1];           \
    res1 OP1 temp0_r * pa0[k + 1];           \
    res1 OP2 temp0_i * pa0[k];               \
    y[0 * inc_y2] = res0;                    \
    y[0 * inc_y2 + 1] = res1;                \

/* Unit-stride path: load 4 consecutive complex x elements, split real/imag
 * lanes, scale by alpha (OP3/OP4 supply the complex-multiply signs, defined
 * elsewhere in this file), then SPLATI_D2_DP broadcasts each scaled complex
 * scalar into its own full vector (tp0r/i..tp3r/i) for use by the
 * ZGEMV_N_*x4 accumulation macros. */
#define ZLOAD_X4_SCALE_VECTOR()              \
    LD_DP4(x, 2, x0, x1, x2, x3);            \
    PCKEVOD_D2_DP(x1, x0, x0r, x0i);         \
    PCKEVOD_D2_DP(x3, x2, x1r, x1i);         \
    tp4r = alphar * x0r;                     \
    tp4r OP3 alphai * x0i;                   \
    tp4i = alphar * x0i;                     \
    tp4i OP4 alphai * x0r;                   \
    tp5r = alphar * x1r;                     \
    tp5r OP3 alphai * x1i;                   \
    tp5i = alphar * x1i;                     \
    tp5i OP4 alphai * x1r;                   \
    SPLATI_D2_DP(tp4r, tp0r, tp1r);          \
    SPLATI_D2_DP(tp5r, tp2r, tp3r);          \
    SPLATI_D2_DP(tp4i, tp0i, tp1i);          \
    SPLATI_D2_DP(tp5i, tp2i, tp3i);          \

/* Unit-stride path, 2-element variant of ZLOAD_X4_SCALE_VECTOR: load and
 * alpha-scale 2 complex x elements, broadcasting into tp0r/i and tp1r/i. */
#define ZLOAD_X2_SCALE_VECTOR()              \
    LD_DP2(x, 2, x0, x1);                    \
    PCKEVOD_D2_DP(x1, x0, x0r, x0i);         \
    tp4r = alphar * x0r;                     \
    tp4r OP3 alphai * x0i;                   \
    tp4i = alphar * x0i;                     \
    tp4i OP4 alphai * x0r;                   \
    SPLATI_D2_DP(tp4r, tp0r, tp1r);          \
    SPLATI_D2_DP(tp4i, tp0i, tp1i);          \

/* Strided (general-purpose) path of ZLOAD_X4_SCALE: gather 4 complex x
 * elements at stride inc_x2 by bit-copying each double through a BLASLONG
 * integer lane insert (tp0r in the first insert is only a don't-care source
 * register for the untouched lane).  NOTE(review): the BLASLONG* casts
 * type-pun FLOAT storage through an integer type -- assumes BLASLONG is
 * 64-bit and the build tolerates the aliasing, as is convention in these
 * MSA kernels; confirm before reusing elsewhere.  Scaling/broadcast is the
 * same as the vector variant. */
#define ZLOAD_X4_SCALE_GP()                                                           \
    x0r = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((BLASLONG *)(x + 0 * inc_x2)));      \
    x0r = (v2f64) __msa_insert_d((v2i64) x0r, 1, *((BLASLONG *)(x + 1 * inc_x2)));       \
    x1r = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((BLASLONG *)(x + 2 * inc_x2)));      \
    x1r = (v2f64) __msa_insert_d((v2i64) x1r, 1, *((BLASLONG *)(x + 3 * inc_x2)));       \
    x0i = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((BLASLONG *)(x + 0 * inc_x2 + 1)));  \
    x0i = (v2f64) __msa_insert_d((v2i64) x0i, 1, *((BLASLONG *)(x + 1 * inc_x2 + 1)));   \
    x1i = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((BLASLONG *)(x + 2 * inc_x2 + 1)));  \
    x1i = (v2f64) __msa_insert_d((v2i64) x1i, 1, *((BLASLONG *)(x + 3 * inc_x2 + 1)));   \
    tp4r = alphar * x0r;                                                                 \
    tp4r OP3 alphai * x0i;                                                               \
    tp4i = alphar * x0i;                                                                 \
    tp4i OP4 alphai * x0r;                                                               \
    tp5r = alphar * x1r;                                                                 \
    tp5r OP3 alphai * x1i;                                                               \
    tp5i = alphar * x1i;                                                                 \
    tp5i OP4 alphai * x1r;                                                               \
    SPLATI_D2_DP(tp4r, tp0r, tp1r);                                                      \
    SPLATI_D2_DP(tp5r, tp2r, tp3r);                                                      \
    SPLATI_D2_DP(tp4i, tp0i, tp1i);                                                      \
    SPLATI_D2_DP(tp5i, tp2i, tp3i);                                                      \

/* Strided gather of 2 complex x elements (see ZLOAD_X4_SCALE_GP for the
 * BLASLONG bit-copy caveat), then alpha-scale and broadcast into
 * tp0r/i and tp1r/i. */
#define ZLOAD_X2_SCALE_GP()                                                              \
    x0r = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((BLASLONG *)(x + 0 * inc_x2)));      \
    x0r = (v2f64) __msa_insert_d((v2i64) x0r, 1, *((BLASLONG *)(x + 1 * inc_x2)));       \
    x0i = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((BLASLONG *)(x + 0 * inc_x2 + 1)));  \
    x0i = (v2f64) __msa_insert_d((v2i64) x0i, 1, *((BLASLONG *)(x + 1 * inc_x2 + 1)));   \
    tp4r = alphar * x0r;                                                                 \
    tp4r OP3 alphai * x0i;                                                               \
    tp4i = alphar * x0i;                                                                 \
    tp4i OP4 alphai * x0r;                                                               \
    SPLATI_D2_DP(tp4r, tp0r, tp1r);                                                      \
    SPLATI_D2_DP(tp4i, tp0i, tp1i);                                                      \

/* Load a single complex x element (any stride), compute the alpha-scaled
 * scalar factors temp0_r/temp0_i, and also broadcast them into the vector
 * registers tp0r/tp0i for the vectorized single-column macros. */
#define ZLOAD_X1_SCALE_GP()                       \
    temp0_r = alpha_r * x[0 * inc_x2];            \
    temp0_r OP3 alpha_i * x[0 * inc_x2 + 1];      \
    temp0_i = alpha_r * x[0 * inc_x2 + 1];        \
    temp0_i OP4 alpha_i * x[0 * inc_x2];          \
    tp0r = (v2f64) COPY_DOUBLE_TO_VECTOR(temp0_r); \
    tp0i = (v2f64) COPY_DOUBLE_TO_VECTOR(temp0_i); \

/* Unit-stride load of 4 complex y elements, deinterleaved into
 * y0r/y1r (real) and y0i/y1i (imag). */
#define ZLOAD_Y4_VECTOR()                    \
    LD_DP4(y, 2, y0, y1, y2, y3);            \
    PCKEVOD_D2_DP(y1, y0, y0r, y0i);         \
    PCKEVOD_D2_DP(y3, y2, y1r, y1i);         \

/* Unit-stride load of 2 complex y elements into y0r/y0i. */
#define ZLOAD_Y2_VECTOR()                    \
    LD_DP2(y, 2, y0, y1);                    \
    PCKEVOD_D2_DP(y1, y0, y0r, y0i);         \

/* Unit-stride store of 4 complex y elements: re-interleave the split
 * real/imag accumulators (ILVRL, inverse of the PCKEVOD load) and store. */
#define ZSTORE_Y4_VECTOR()                   \
    ILVRL_D2_DP(y0i, y0r, y0, y1);           \
    ILVRL_D2_DP(y1i, y1r, y2, y3);           \
    ST_DP4(y0, y1, y2, y3, y, 2);            \

/* Unit-stride store of 2 complex y elements (re-interleave then store). */
#define ZSTORE_Y2_VECTOR()                   \
    ILVRL_D2_DP(y0i, y0r, y0, y1);           \
    ST_DP2(y0, y1, y, 2);                    \

/* Strided gather of 4 complex y elements into the split y0r/y1r/y0i/y1i
 * accumulators; doubles are bit-copied through BLASLONG integer lane
 * inserts (same aliasing caveat as ZLOAD_X4_SCALE_GP; tp0r is only a
 * don't-care source for the first insert of each register). */
#define ZLOAD_Y4_GP()                                                                    \
    y0r = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((BLASLONG *)(y + 0 * inc_y2)));      \
    y0r = (v2f64) __msa_insert_d((v2i64) y0r, 1, *((BLASLONG *)(y + 1 * inc_y2)));       \
    y1r = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((BLASLONG *)(y + 2 * inc_y2)));      \
    y1r = (v2f64) __msa_insert_d((v2i64) y1r, 1, *((BLASLONG *)(y + 3 * inc_y2)));       \
    y0i = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((BLASLONG *)(y + 0 * inc_y2 + 1)));  \
    y0i = (v2f64) __msa_insert_d((v2i64) y0i, 1, *((BLASLONG *)(y + 1 * inc_y2 + 1)));   \
    y1i = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((BLASLONG *)(y + 2 * inc_y2 + 1)));  \
    y1i = (v2f64) __msa_insert_d((v2i64) y1i, 1, *((BLASLONG *)(y + 3 * inc_y2 + 1)));   \

/* Strided gather of 2 complex y elements into y0r/y0i (bit-copy via
 * BLASLONG lane inserts, see ZLOAD_Y4_GP). */
#define ZLOAD_Y2_GP()                                                                    \
    y0r = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((BLASLONG *)(y + 0 * inc_y2)));      \
    y0r = (v2f64) __msa_insert_d((v2i64) y0r, 1, *((BLASLONG *)(y + 1 * inc_y2)));       \
    y0i = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((BLASLONG *)(y + 0 * inc_y2 + 1)));  \
    y0i = (v2f64) __msa_insert_d((v2i64) y0i, 1, *((BLASLONG *)(y + 1 * inc_y2 + 1)));   \

/* Strided scatter of 4 complex y elements: extract each 64-bit lane with
 * __msa_copy_s_d and bit-copy it back through a BLASLONG* store (inverse
 * of ZLOAD_Y4_GP, same aliasing caveat). */
#define ZSTORE_Y4_GP()                                                     \
    *((BLASLONG *)(y + 0 * inc_y2)) = __msa_copy_s_d((v2i64) y0r, 0);      \
    *((BLASLONG *)(y + 1 * inc_y2)) = __msa_copy_s_d((v2i64) y0r, 1);      \
    *((BLASLONG *)(y + 2 * inc_y2)) = __msa_copy_s_d((v2i64) y1r, 0);      \
    *((BLASLONG *)(y + 3 * inc_y2)) = __msa_copy_s_d((v2i64) y1r, 1);      \
    *((BLASLONG *)(y + 0 * inc_y2 + 1)) = __msa_copy_s_d((v2i64) y0i, 0);  \
    *((BLASLONG *)(y + 1 * inc_y2 + 1)) = __msa_copy_s_d((v2i64) y0i, 1);  \
    *((BLASLONG *)(y + 2 * inc_y2 + 1)) = __msa_copy_s_d((v2i64) y1i, 0);  \
    *((BLASLONG *)(y + 3 * inc_y2 + 1)) = __msa_copy_s_d((v2i64) y1i, 1);  \

/* Strided scatter of 2 complex y elements (see ZSTORE_Y4_GP). */
#define ZSTORE_Y2_GP()                                                     \
    *((BLASLONG *)(y + 0 * inc_y2)) = __msa_copy_s_d((v2i64) y0r, 0);      \
    *((BLASLONG *)(y + 1 * inc_y2)) = __msa_copy_s_d((v2i64) y0r, 1);      \
    *((BLASLONG *)(y + 0 * inc_y2 + 1)) = __msa_copy_s_d((v2i64) y0i, 0);  \
    *((BLASLONG *)(y + 1 * inc_y2 + 1)) = __msa_copy_s_d((v2i64) y0i, 1);  \

412 #define ZGEMV_N_MSA() \
413 for (j = (n >> 2); j--;) \
418 k_pref = pref_offset; \
421 for (i = (m >> 2); i--;) \
423 PREFETCH(pa0 + k_pref + 8 + 0); \
424 PREFETCH(pa0 + k_pref + 8 + 4); \
425 PREFETCH(pa1 + k_pref + 8 + 0); \
426 PREFETCH(pa1 + k_pref + 8 + 4); \
427 PREFETCH(pa2 + k_pref + 8 + 0); \
428 PREFETCH(pa2 + k_pref + 8 + 4); \
429 PREFETCH(pa3 + k_pref + 8 + 0); \
430 PREFETCH(pa3 + k_pref + 8 + 4); \
483 for (i = (m >> 2); i--;) \
530 for (i = (m >> 2); i--;) \
562 int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
563 FLOAT *A, BLASLONG lda2, FLOAT *x, BLASLONG inc_x2, FLOAT *y,
564 BLASLONG inc_y2, FLOAT *buffer)
566 BLASLONG i, j, k, k_pref, pref_offset;
568 FLOAT *pa0, *pa1, *pa2, *pa3;
569 FLOAT temp0_r, temp1_r, temp2_r, temp3_r, temp0_i, temp1_i, temp2_i;
570 FLOAT temp3_i, res0, res1;
571 v2f64 alphar, alphai;
572 v2f64 x0, x1, x2, x3, y0, y1, y2, y3;
573 v2f64 x0r, x1r, x0i, x1i, y0r, y1r, y0i, y1i;
574 v2f64 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
575 v2f64 src0r, src1r, src2r, src3r, src4r, src5r, src6r, src7r;
576 v2f64 src0i, src1i, src2i, src3i, src4i, src5i, src6i, src7i;
577 v2f64 tp0r, tp1r, tp2r, tp3r, tp4r, tp5r, tp0i, tp1i, tp2i, tp3i, tp4i, tp5i;
583 pref_offset = (uintptr_t)A & (L1_DATA_LINESIZE - 1);
584 pref_offset = L1_DATA_LINESIZE - pref_offset;
585 pref_offset = pref_offset / sizeof(FLOAT);
592 alphar = COPY_DOUBLE_TO_VECTOR(alpha_r);
593 alphai = COPY_DOUBLE_TO_VECTOR(alpha_i);
595 if ((2 == inc_x2) && (2 == inc_y2))
597 #define ZLOAD_X4_SCALE ZLOAD_X4_SCALE_VECTOR
598 #define ZLOAD_X2_SCALE ZLOAD_X2_SCALE_VECTOR
599 #define ZLOAD_X1_SCALE ZLOAD_X1_SCALE_GP
600 #define ZLOAD_Y4 ZLOAD_Y4_VECTOR
601 #define ZLOAD_Y2 ZLOAD_Y2_VECTOR
602 #define ZSTORE_Y4 ZSTORE_Y4_VECTOR
603 #define ZSTORE_Y2 ZSTORE_Y2_VECTOR
607 #undef ZLOAD_X4_SCALE
608 #undef ZLOAD_X2_SCALE
609 #undef ZLOAD_X1_SCALE
615 else if (2 == inc_x2)
617 #define ZLOAD_X4_SCALE ZLOAD_X4_SCALE_VECTOR
618 #define ZLOAD_X2_SCALE ZLOAD_X2_SCALE_VECTOR
619 #define ZLOAD_X1_SCALE ZLOAD_X1_SCALE_GP
620 #define ZLOAD_Y4 ZLOAD_Y4_GP
621 #define ZLOAD_Y2 ZLOAD_Y2_GP
622 #define ZSTORE_Y4 ZSTORE_Y4_GP
623 #define ZSTORE_Y2 ZSTORE_Y2_GP
627 #undef ZLOAD_X4_SCALE
628 #undef ZLOAD_X2_SCALE
629 #undef ZLOAD_X1_SCALE
635 else if (2 == inc_y2)
637 #define ZLOAD_X4_SCALE ZLOAD_X4_SCALE_GP
638 #define ZLOAD_X2_SCALE ZLOAD_X2_SCALE_GP
639 #define ZLOAD_X1_SCALE ZLOAD_X1_SCALE_GP
640 #define ZLOAD_Y4 ZLOAD_Y4_VECTOR
641 #define ZLOAD_Y2 ZLOAD_Y2_VECTOR
642 #define ZSTORE_Y4 ZSTORE_Y4_VECTOR
643 #define ZSTORE_Y2 ZSTORE_Y2_VECTOR
647 #undef ZLOAD_X4_SCALE
648 #undef ZLOAD_X2_SCALE
649 #undef ZLOAD_X1_SCALE
657 #define ZLOAD_X4_SCALE ZLOAD_X4_SCALE_GP
658 #define ZLOAD_X2_SCALE ZLOAD_X2_SCALE_GP
659 #define ZLOAD_X1_SCALE ZLOAD_X1_SCALE_GP
660 #define ZLOAD_Y4 ZLOAD_Y4_GP
661 #define ZLOAD_Y2 ZLOAD_Y2_GP
662 #define ZSTORE_Y4 ZSTORE_Y4_GP
663 #define ZSTORE_Y2 ZSTORE_Y2_GP
667 #undef ZLOAD_X4_SCALE
668 #undef ZLOAD_X2_SCALE
669 #undef ZLOAD_X1_SCALE