1 /*******************************************************************************
2 Copyright (c) 2016, The OpenBLAS Project
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *******************************************************************************/
29 #include "macros_msa.h"
/* CGEMV N-kernel: accumulate an 8-row x 4-column complex (fp32) block of A.
 * Loads 8 complex elements from each column pointer pa0..pa3 at offset k,
 * de-interleaves them into real/imaginary planes (PCKEVOD_W2_SP), then
 * multiply-accumulates against the alpha-scaled x values tp{0..3}{r,i}
 * into y0r/y0i (rows 0-3) and y1r/y1i (rows 4-7).
 * OP0/OP1/OP2 are '+=' or '-=' operators supplied by the including variant
 * file to realize the complex-multiply sign pattern.
 * NOTE(review): OP* definitions are outside this view - sign semantics
 * not verified here. */
#define CGEMV_N_8x4() \
LD_SP4(pa0 + k, 4, t0, t1, t2, t3); \
LD_SP4(pa1 + k, 4, t4, t5, t6, t7); \
LD_SP4(pa2 + k, 4, t8, t9, t10, t11); \
LD_SP4(pa3 + k, 4, t12, t13, t14, t15); \
/* split interleaved (re,im) pairs into separate real/imag vectors */ \
PCKEVOD_W2_SP(t1, t0, src0r, src0i); \
PCKEVOD_W2_SP(t3, t2, src1r, src1i); \
PCKEVOD_W2_SP(t5, t4, src2r, src2i); \
PCKEVOD_W2_SP(t7, t6, src3r, src3i); \
PCKEVOD_W2_SP(t9, t8, src4r, src4i); \
PCKEVOD_W2_SP(t11, t10, src5r, src5i); \
PCKEVOD_W2_SP(t13, t12, src6r, src6i); \
PCKEVOD_W2_SP(t15, t14, src7r, src7i); \
/* real accumulator: real(x)*real(a) terms */ \
y0r += tp0r * src0r; \
y1r += tp0r * src1r; \
y0r += tp1r * src2r; \
y1r += tp1r * src3r; \
y0r += tp2r * src4r; \
y1r += tp2r * src5r; \
y0r += tp3r * src6r; \
y1r += tp3r * src7r; \
/* real accumulator: imag(x)*imag(a) terms (sign via OP0) */ \
y0r OP0 tp0i * src0i; \
y1r OP0 tp0i * src1i; \
y0r OP0 tp1i * src2i; \
y1r OP0 tp1i * src3i; \
y0r OP0 tp2i * src4i; \
y1r OP0 tp2i * src5i; \
y0r OP0 tp3i * src6i; \
y1r OP0 tp3i * src7i; \
/* imag accumulator: real(x)*imag(a) terms (sign via OP1) */ \
y0i OP1 tp0r * src0i; \
y1i OP1 tp0r * src1i; \
y0i OP1 tp1r * src2i; \
y1i OP1 tp1r * src3i; \
y0i OP1 tp2r * src4i; \
y1i OP1 tp2r * src5i; \
y0i OP1 tp3r * src6i; \
y1i OP1 tp3r * src7i; \
/* imag accumulator: imag(x)*real(a) terms (sign via OP2) */ \
y0i OP2 tp0i * src0r; \
y1i OP2 tp0i * src1r; \
y0i OP2 tp1i * src2r; \
y1i OP2 tp1i * src3r; \
y0i OP2 tp2i * src4r; \
y1i OP2 tp2i * src5r; \
y0i OP2 tp3i * src6r; \
y1i OP2 tp3i * src7r; \
/* CGEMV N-kernel: 4-row x 4-column complex block.  Same scheme as
 * CGEMV_N_8x4 but half the row depth, so only the y0r/y0i accumulator
 * pair is updated.  OP0..OP2 select the complex-multiply signs (defined
 * by the including variant file). */
#define CGEMV_N_4x4() \
LD_SP2(pa0 + k, 4, t0, t1); \
LD_SP2(pa1 + k, 4, t4, t5); \
LD_SP2(pa2 + k, 4, t8, t9); \
LD_SP2(pa3 + k, 4, t12, t13); \
/* de-interleave into real/imag planes */ \
PCKEVOD_W2_SP(t1, t0, src0r, src0i); \
PCKEVOD_W2_SP(t5, t4, src2r, src2i); \
PCKEVOD_W2_SP(t9, t8, src4r, src4i); \
PCKEVOD_W2_SP(t13, t12, src6r, src6i); \
y0r += tp0r * src0r; \
y0r += tp1r * src2r; \
y0r += tp2r * src4r; \
y0r += tp3r * src6r; \
y0r OP0 tp0i * src0i; \
y0r OP0 tp1i * src2i; \
y0r OP0 tp2i * src4i; \
y0r OP0 tp3i * src6i; \
y0i OP1 tp0r * src0i; \
y0i OP1 tp1r * src2i; \
y0i OP1 tp2r * src4i; \
y0i OP1 tp3r * src6i; \
y0i OP2 tp0i * src0r; \
y0i OP2 tp1i * src2r; \
y0i OP2 tp2i * src4r; \
y0i OP2 tp3i * src6r; \
/* CGEMV N-kernel, scalar cleanup: one row x 4 columns.
 * res0/res1 accumulate the real/imaginary parts of y[row] directly in
 * scalar FP; temp{0..3}_{r,i} hold the alpha-scaled x values for the
 * four active columns.  OP0..OP2 select signs as in the vector kernels. */
#define CGEMV_N_1x4() \
res0 = y[0 * inc_y2]; \
res1 = y[0 * inc_y2 + 1]; \
res0 += temp0_r * pa0[k]; \
res0 OP0 temp0_i * pa0[k + 1]; \
res0 += temp1_r * pa1[k]; \
res0 OP0 temp1_i * pa1[k + 1]; \
res0 += temp2_r * pa2[k]; \
res0 OP0 temp2_i * pa2[k + 1]; \
res0 += temp3_r * pa3[k]; \
res0 OP0 temp3_i * pa3[k + 1]; \
res1 OP1 temp0_r * pa0[k + 1]; \
res1 OP2 temp0_i * pa0[k]; \
res1 OP1 temp1_r * pa1[k + 1]; \
res1 OP2 temp1_i * pa1[k]; \
res1 OP1 temp2_r * pa2[k + 1]; \
res1 OP2 temp2_i * pa2[k]; \
res1 OP1 temp3_r * pa3[k + 1]; \
res1 OP2 temp3_i * pa3[k]; \
y[0 * inc_y2] = res0; \
y[0 * inc_y2 + 1] = res1; \
/* CGEMV N-kernel: 8 rows x 2 columns (pa0, pa1 only).  Identical
 * accumulation pattern to CGEMV_N_8x4 restricted to two columns;
 * updates both accumulator pairs y0r/y0i (rows 0-3) and y1r/y1i
 * (rows 4-7). */
#define CGEMV_N_8x2() \
LD_SP4(pa0 + k, 4, t0, t1, t2, t3); \
LD_SP4(pa1 + k, 4, t4, t5, t6, t7); \
/* de-interleave into real/imag planes */ \
PCKEVOD_W2_SP(t1, t0, src0r, src0i); \
PCKEVOD_W2_SP(t3, t2, src1r, src1i); \
PCKEVOD_W2_SP(t5, t4, src2r, src2i); \
PCKEVOD_W2_SP(t7, t6, src3r, src3i); \
y0r += tp0r * src0r; \
y1r += tp0r * src1r; \
y0r += tp1r * src2r; \
y1r += tp1r * src3r; \
y0r OP0 tp0i * src0i; \
y1r OP0 tp0i * src1i; \
y0r OP0 tp1i * src2i; \
y1r OP0 tp1i * src3i; \
y0i OP1 tp0r * src0i; \
y1i OP1 tp0r * src1i; \
y0i OP1 tp1r * src2i; \
y1i OP1 tp1r * src3i; \
y0i OP2 tp0i * src0r; \
y1i OP2 tp0i * src1r; \
y0i OP2 tp1i * src2r; \
y1i OP2 tp1i * src3r; \
/* CGEMV N-kernel: 4 rows x 2 columns.  Two-column variant of
 * CGEMV_N_4x4; only the y0r/y0i accumulator pair is touched. */
#define CGEMV_N_4x2() \
LD_SP2(pa0 + k, 4, t0, t1); \
LD_SP2(pa1 + k, 4, t4, t5); \
PCKEVOD_W2_SP(t1, t0, src0r, src0i); \
PCKEVOD_W2_SP(t5, t4, src2r, src2i); \
y0r += tp0r * src0r; \
y0r += tp1r * src2r; \
y0r OP0 tp0i * src0i; \
y0r OP0 tp1i * src2i; \
y0i OP1 tp0r * src0i; \
y0i OP1 tp1r * src2i; \
y0i OP2 tp0i * src0r; \
y0i OP2 tp1i * src2r; \
/* CGEMV N-kernel, scalar cleanup: one row x 2 columns.  Two-column
 * variant of CGEMV_N_1x4; reads and writes y[row] in place. */
#define CGEMV_N_1x2() \
res0 = y[0 * inc_y2]; \
res1 = y[0 * inc_y2 + 1]; \
res0 += temp0_r * pa0[k]; \
res0 OP0 temp0_i * pa0[k + 1]; \
res0 += temp1_r * pa1[k]; \
res0 OP0 temp1_i * pa1[k + 1]; \
res1 OP1 temp0_r * pa0[k + 1]; \
res1 OP2 temp0_i * pa0[k]; \
res1 OP1 temp1_r * pa1[k + 1]; \
res1 OP2 temp1_i * pa1[k]; \
y[0 * inc_y2] = res0; \
y[0 * inc_y2 + 1] = res1; \
/* CGEMV N-kernel, scalar cleanup: one row x 1 column.
 * temp_r/temp_i hold the single alpha-scaled x element. */
#define CGEMV_N_1x1() \
res0 = y[0 * inc_y2]; \
res1 = y[0 * inc_y2 + 1]; \
res0 += temp_r * pa0[k]; \
res0 OP0 temp_i * pa0[k + 1]; \
res1 OP1 temp_r * pa0[k + 1]; \
res1 OP2 temp_i * pa0[k]; \
y[0 * inc_y2] = res0; \
y[0 * inc_y2 + 1] = res1; \
/* Load 4 contiguous complex x elements (inc_x == 1 path), scale them by
 * alpha (complex multiply, signs via OP3/OP4 from the including variant
 * file), then broadcast each scaled element into its own full vector
 * tp{0..3}{r,i} for use by the column kernels. */
#define CLOAD_X4_SCALE_VECTOR() \
LD_SP2(x, 4, x0, x1); \
PCKEVOD_W2_SP(x1, x0, x0r, x0i); \
tp4r = alphar * x0r; \
tp4r OP3 alphai * x0i; \
tp4i = alphar * x0i; \
tp4i OP4 alphai * x0r; \
/* splat lane j of tp4r/tp4i into tp{j}r/tp{j}i */ \
SPLATI_W4_SP(tp4r, tp0r, tp1r, tp2r, tp3r); \
SPLATI_W4_SP(tp4i, tp0i, tp1i, tp2i, tp3i); \
/* Strided (general inc_x) variant of CLOAD_X4_SCALE_VECTOR: gathers the
 * 4 complex x elements lane-by-lane via general-purpose inserts (each
 * float moved as a 32-bit int bit pattern), then scales by alpha and
 * broadcasts exactly as the vector path does.
 * NOTE(review): the first insert into tp0r is only to seed a defined
 * vector; all 4 lanes are overwritten before use. */
#define CLOAD_X4_SCALE_GP() \
x0r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2))); \
x0r = (v4f32) __msa_insert_w((v4i32) x0r, 1, *((int *) (x + 1 * inc_x2))); \
x0r = (v4f32) __msa_insert_w((v4i32) x0r, 2, *((int *) (x + 2 * inc_x2))); \
x0r = (v4f32) __msa_insert_w((v4i32) x0r, 3, *((int *) (x + 3 * inc_x2))); \
x0i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2 + 1))); \
x0i = (v4f32) __msa_insert_w((v4i32) x0i, 1, *((int *) (x + 1 * inc_x2 + 1))); \
x0i = (v4f32) __msa_insert_w((v4i32) x0i, 2, *((int *) (x + 2 * inc_x2 + 1))); \
x0i = (v4f32) __msa_insert_w((v4i32) x0i, 3, *((int *) (x + 3 * inc_x2 + 1))); \
tp4r = alphar * x0r; \
tp4r OP3 alphai * x0i; \
tp4i = alphar * x0i; \
tp4i OP4 alphai * x0r; \
SPLATI_W4_SP(tp4r, tp0r, tp1r, tp2r, tp3r); \
SPLATI_W4_SP(tp4i, tp0i, tp1i, tp2i, tp3i); \
/* Load 2 complex x elements (any stride) with scalar FP, scale them by
 * alpha (signs via OP3/OP4), and broadcast each real/imag scalar into a
 * full vector tp{0,1}{r,i} via COPY_FLOAT_TO_VECTOR. */
#define CLOAD_X2_SCALE_GP() \
temp0_r = alpha_r * x[0 * inc_x2]; \
temp0_r OP3 alpha_i * x[0 * inc_x2 + 1]; \
temp0_i = alpha_r * x[0 * inc_x2 + 1]; \
temp0_i OP4 alpha_i * x[0 * inc_x2]; \
temp1_r = alpha_r * x[1 * inc_x2]; \
temp1_r OP3 alpha_i * x[1 * inc_x2 + 1]; \
temp1_i = alpha_r * x[1 * inc_x2 + 1]; \
temp1_i OP4 alpha_i * x[1 * inc_x2]; \
tp0r = (v4f32) COPY_FLOAT_TO_VECTOR(temp0_r); \
tp0i = (v4f32) COPY_FLOAT_TO_VECTOR(temp0_i); \
tp1r = (v4f32) COPY_FLOAT_TO_VECTOR(temp1_r); \
tp1i = (v4f32) COPY_FLOAT_TO_VECTOR(temp1_i); \
/* Load a single complex x element and scale it by alpha into the
 * scalar pair temp_r/temp_i (signs via OP3/OP4); used by the 1x1
 * cleanup kernel, so no vector broadcast is needed. */
#define CLOAD_X1_SCALE_GP() \
temp_r = alpha_r * x[0 * inc_x2]; \
temp_r OP3 alpha_i * x[0 * inc_x2 + 1]; \
temp_i = alpha_r * x[0 * inc_x2 + 1]; \
temp_i OP4 alpha_i * x[0 * inc_x2]; \
/* Contiguous-y path: load 8 complex y elements and de-interleave into
 * real planes (y0r, y1r) and imaginary planes (y0i, y1i). */
#define CLOAD_Y8_VECTOR() \
LD_SP4(y, 4, y0, y1, y2, y3); \
PCKEVOD_W2_SP(y1, y0, y0r, y0i); \
PCKEVOD_W2_SP(y3, y2, y1r, y1i); \
/* Contiguous-y path: load 4 complex y elements and de-interleave into
 * one real plane (y0r) and one imaginary plane (y0i). */
#define CLOAD_Y4_VECTOR() \
LD_SP2(y, 4, y0, y1); \
PCKEVOD_W2_SP(y1, y0, y0r, y0i); \
/* Contiguous-y path: re-interleave the real/imag planes back into
 * (re,im) pair order and store 8 complex y elements. */
#define CSTORE_Y8_VECTOR() \
ILVRL_W2_SP(y0i, y0r, y0, y1); \
ILVRL_W2_SP(y1i, y1r, y2, y3); \
ST_SP4(y0, y1, y2, y3, y, 4); \
/* Contiguous-y path: re-interleave and store 4 complex y elements. */
#define CSTORE_Y4_VECTOR() \
ILVRL_W2_SP(y0i, y0r, y0, y1); \
ST_SP2(y0, y1, y, 4); \
/* Strided-y path: gather 8 complex y elements lane-by-lane into
 * real planes (y0r, y1r) and imaginary planes (y0i, y1i).  Each float
 * is moved as a 32-bit int bit pattern via __msa_insert_w.
 * NOTE(review): inserting into tp0r on the first lane only seeds a
 * defined vector; all lanes are overwritten. */
#define CLOAD_Y8_GP() \
y0r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 0 * inc_y2))); \
y0r = (v4f32) __msa_insert_w((v4i32) y0r, 1, *((int *)(y + 1 * inc_y2))); \
y0r = (v4f32) __msa_insert_w((v4i32) y0r, 2, *((int *)(y + 2 * inc_y2))); \
y0r = (v4f32) __msa_insert_w((v4i32) y0r, 3, *((int *)(y + 3 * inc_y2))); \
y1r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 4 * inc_y2))); \
y1r = (v4f32) __msa_insert_w((v4i32) y1r, 1, *((int *)(y + 5 * inc_y2))); \
y1r = (v4f32) __msa_insert_w((v4i32) y1r, 2, *((int *)(y + 6 * inc_y2))); \
y1r = (v4f32) __msa_insert_w((v4i32) y1r, 3, *((int *)(y + 7 * inc_y2))); \
y0i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 0 * inc_y2 + 1))); \
y0i = (v4f32) __msa_insert_w((v4i32) y0i, 1, *((int *)(y + 1 * inc_y2 + 1))); \
y0i = (v4f32) __msa_insert_w((v4i32) y0i, 2, *((int *)(y + 2 * inc_y2 + 1))); \
y0i = (v4f32) __msa_insert_w((v4i32) y0i, 3, *((int *)(y + 3 * inc_y2 + 1))); \
y1i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 4 * inc_y2 + 1))); \
y1i = (v4f32) __msa_insert_w((v4i32) y1i, 1, *((int *)(y + 5 * inc_y2 + 1))); \
y1i = (v4f32) __msa_insert_w((v4i32) y1i, 2, *((int *)(y + 6 * inc_y2 + 1))); \
y1i = (v4f32) __msa_insert_w((v4i32) y1i, 3, *((int *)(y + 7 * inc_y2 + 1))); \
/* Strided-y path: gather 4 complex y elements lane-by-lane into one
 * real plane (y0r) and one imaginary plane (y0i); 4-element analogue
 * of CLOAD_Y8_GP. */
#define CLOAD_Y4_GP() \
y0r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 0 * inc_y2))); \
y0r = (v4f32) __msa_insert_w((v4i32) y0r, 1, *((int *)(y + 1 * inc_y2))); \
y0r = (v4f32) __msa_insert_w((v4i32) y0r, 2, *((int *)(y + 2 * inc_y2))); \
y0r = (v4f32) __msa_insert_w((v4i32) y0r, 3, *((int *)(y + 3 * inc_y2))); \
y0i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 0 * inc_y2 + 1))); \
y0i = (v4f32) __msa_insert_w((v4i32) y0i, 1, *((int *)(y + 1 * inc_y2 + 1))); \
y0i = (v4f32) __msa_insert_w((v4i32) y0i, 2, *((int *)(y + 2 * inc_y2 + 1))); \
y0i = (v4f32) __msa_insert_w((v4i32) y0i, 3, *((int *)(y + 3 * inc_y2 + 1))); \
/* Strided-y path: scatter 8 complex y elements lane-by-lane, writing
 * real parts from y0r/y1r and imaginary parts from y0i/y1i.  Each
 * float lane is extracted as a 32-bit int bit pattern via
 * __msa_copy_s_w and stored through an int pointer. */
#define CSTORE_Y8_GP() \
*((int *)(y + 0 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 0); \
*((int *)(y + 1 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 1); \
*((int *)(y + 2 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 2); \
*((int *)(y + 3 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 3); \
*((int *)(y + 4 * inc_y2)) = __msa_copy_s_w((v4i32) y1r, 0); \
*((int *)(y + 5 * inc_y2)) = __msa_copy_s_w((v4i32) y1r, 1); \
*((int *)(y + 6 * inc_y2)) = __msa_copy_s_w((v4i32) y1r, 2); \
*((int *)(y + 7 * inc_y2)) = __msa_copy_s_w((v4i32) y1r, 3); \
*((int *)(y + 0 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 0); \
*((int *)(y + 1 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 1); \
*((int *)(y + 2 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 2); \
*((int *)(y + 3 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 3); \
*((int *)(y + 4 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y1i, 0); \
*((int *)(y + 5 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y1i, 1); \
*((int *)(y + 6 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y1i, 2); \
*((int *)(y + 7 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y1i, 3); \
/* Strided-y path: scatter 4 complex y elements lane-by-lane;
 * 4-element analogue of CSTORE_Y8_GP. */
#define CSTORE_Y4_GP() \
*((int *)(y + 0 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 0); \
*((int *)(y + 1 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 1); \
*((int *)(y + 2 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 2); \
*((int *)(y + 3 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 3); \
*((int *)(y + 0 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 0); \
*((int *)(y + 1 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 1); \
*((int *)(y + 2 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 2); \
*((int *)(y + 3 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 3); \
/* Top-level driver for the N-transpose CGEMV: outer loop over columns
 * of A in groups of 4, inner loops over rows in groups of 8 (vector
 * kernels) with scalar cleanup over the remaining (m & 3) rows.
 * PREFETCH warms upcoming cache lines of the four active column
 * pointers pa0..pa3; k_pref starts from the cache-line-aligned
 * pref_offset computed in CNAME.
 * NOTE(review): parts of this macro's body are not visible in this
 * view (loop bodies elided); these comments describe only the visible
 * structure - verify against the full file. */
#define CGEMV_N_MSA() \
for (j = (n >> 2); j--;) \
k_pref = pref_offset; \
for (i = (m >> 3); i--;) \
PREFETCH(pa0 + k_pref + 16 + 0); \
PREFETCH(pa0 + k_pref + 16 + 8); \
PREFETCH(pa1 + k_pref + 16 + 0); \
PREFETCH(pa1 + k_pref + 16 + 8); \
PREFETCH(pa2 + k_pref + 16 + 0); \
PREFETCH(pa2 + k_pref + 16 + 8); \
PREFETCH(pa3 + k_pref + 16 + 0); \
PREFETCH(pa3 + k_pref + 16 + 8); \
for (i = (m & 3); i--;) \
for (i = (m >> 3); i--;) \
for (i = (m & 3); i--;) \
507 int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
508 FLOAT *A, BLASLONG lda2, FLOAT *x, BLASLONG inc_x2, FLOAT *y,
509 BLASLONG inc_y2, FLOAT *buffer)
511 BLASLONG i, j, k, k_pref, pref_offset;
513 FLOAT *pa0, *pa1, *pa2, *pa3;
514 FLOAT temp_r, temp_i, res0, res1, temp0_r;
515 FLOAT temp0_i, temp1_r, temp1_i, temp2_r, temp2_i, temp3_r, temp3_i;
516 v4f32 alphar, alphai;
517 v4f32 x0, x1, y0, y1, y2, y3, x0r, x0i, y0r, y1r, y0i, y1i;
518 v4f32 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
519 v4f32 src0r, src1r, src2r, src3r, src4r, src5r, src6r, src7r;
520 v4f32 src0i, src1i, src2i, src3i, src4i, src5i, src6i, src7i;
521 v4f32 tp0r, tp1r, tp2r, tp3r, tp4r, tp0i, tp1i, tp2i, tp3i, tp4i;
527 pref_offset = (uintptr_t)A & (L1_DATA_LINESIZE - 1);
528 pref_offset = L1_DATA_LINESIZE - pref_offset;
529 pref_offset = pref_offset / sizeof(FLOAT);
536 alphar = COPY_FLOAT_TO_VECTOR(alpha_r);
537 alphai = COPY_FLOAT_TO_VECTOR(alpha_i);
539 if ((2 == inc_x2) && (2 == inc_y2))
541 #define CLOAD_X4_SCALE CLOAD_X4_SCALE_VECTOR
542 #define CLOAD_X2_SCALE CLOAD_X2_SCALE_GP
543 #define CLOAD_X1_SCALE CLOAD_X1_SCALE_GP
544 #define CLOAD_Y8 CLOAD_Y8_VECTOR
545 #define CLOAD_Y4 CLOAD_Y4_VECTOR
546 #define CSTORE_Y8 CSTORE_Y8_VECTOR
547 #define CSTORE_Y4 CSTORE_Y4_VECTOR
551 #undef CLOAD_X4_SCALE
552 #undef CLOAD_X2_SCALE
553 #undef CLOAD_X1_SCALE
559 else if (2 == inc_x2)
561 #define CLOAD_X4_SCALE CLOAD_X4_SCALE_VECTOR
562 #define CLOAD_X2_SCALE CLOAD_X2_SCALE_GP
563 #define CLOAD_X1_SCALE CLOAD_X1_SCALE_GP
564 #define CLOAD_Y8 CLOAD_Y8_GP
565 #define CLOAD_Y4 CLOAD_Y4_GP
566 #define CSTORE_Y8 CSTORE_Y8_GP
567 #define CSTORE_Y4 CSTORE_Y4_GP
571 #undef CLOAD_X4_SCALE
572 #undef CLOAD_X2_SCALE
573 #undef CLOAD_X1_SCALE
579 else if (2 == inc_y2)
581 #define CLOAD_X4_SCALE CLOAD_X4_SCALE_GP
582 #define CLOAD_X2_SCALE CLOAD_X2_SCALE_GP
583 #define CLOAD_X1_SCALE CLOAD_X1_SCALE_GP
584 #define CLOAD_Y8 CLOAD_Y8_VECTOR
585 #define CLOAD_Y4 CLOAD_Y4_VECTOR
586 #define CSTORE_Y8 CSTORE_Y8_VECTOR
587 #define CSTORE_Y4 CSTORE_Y4_VECTOR
591 #undef CLOAD_X4_SCALE
592 #undef CLOAD_X2_SCALE
593 #undef CLOAD_X1_SCALE
601 #define CLOAD_X4_SCALE CLOAD_X4_SCALE_GP
602 #define CLOAD_X2_SCALE CLOAD_X2_SCALE_GP
603 #define CLOAD_X1_SCALE CLOAD_X1_SCALE_GP
604 #define CLOAD_Y8 CLOAD_Y8_GP
605 #define CLOAD_Y4 CLOAD_Y4_GP
606 #define CSTORE_Y8 CSTORE_Y8_GP
607 #define CSTORE_Y4 CSTORE_Y4_GP
611 #undef CLOAD_X4_SCALE
612 #undef CLOAD_X2_SCALE
613 #undef CLOAD_X1_SCALE