/***************************************************************************
Copyright (c) 2013-2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define DISP32(ind, disp) (ind*unit_size*32+disp)
#define DISP16(ind, disp) (ind*unit_size*16+disp)
#define DISP8(ind, disp) (ind*unit_size*8+disp)
#define DISP4(ind, disp) (ind*unit_size*4+disp)
#define DISP2(ind, disp) (ind*unit_size*2+disp)
#define DISP1(ind, disp) (ind*unit_size+disp)
#define DISPX(disp) (disp)
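/* A hedged note on the DISP* helpers: unit_size is assumed to be the byte
   size of one single-precision complex element (8 bytes). Under that
   assumption DISP8(ind, disp) = ind*64 + disp and DISP16(ind, disp) =
   ind*128 + disp, i.e. they turn a loop index stepping over 8 (or 16)
   complex elements plus a byte offset into one displacement. */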
.macro AGGREGATE_REALS_IMAGES VSINR_OUT1, VSINR, VSINI_OUT2, VSINI
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
xvsubsp \VSINR_OUT1, \VSINR_OUT1, \VSINR
xvaddsp \VSINI_OUT2, \VSINI_OUT2, \VSINI
#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
xvaddsp \VSINR_OUT1, \VSINR_OUT1, \VSINR
xvsubsp \VSINI_OUT2, \VSINI_OUT2, \VSINI
#elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
xvaddsp \VSINR_OUT1, \VSINR_OUT1, \VSINR
xvsubsp \VSINI_OUT2, \VSINI, \VSINI_OUT2
#else // CC || CR || RC || RR
/* we will assume {-alpha_r,-alpha_i} for this case */
/* i1*i2 - r1*r2, so we negate alpha_r instead to fix the sign */
xvsubsp \VSINR_OUT1, \VSINR, \VSINR_OUT1
/* we negate alpha imaginary instead to fix the sign */
xvaddsp \VSINI_OUT2, \VSINI_OUT2, \VSINI
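/* A sketch of the sign selection above, under the identity
   (r1 + i1*I)*(r2 + i2*I) = (r1*r2 - i1*i2) + (r1*i2 + i1*r2)*I:
   OUT1 accumulates the real products and OUT2 the imaginary products,
   and each branch (no conjugation, conj(A), conj(B), both conjugated)
   only flips which operand of the add/subtract is negated. */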
.macro AGGREGATE_REALS_IMAGES_A_PERMUTE VSINR_OUT1, VSINR, VSINI_OUT2, VSINI
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
xvsubsp \VSINR_OUT1, \VSINR_OUT1, \VSINR
xvaddsp \VSINI_OUT2, \VSINI_OUT2, \VSINI
#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
xvaddsp \VSINR_OUT1, \VSINR_OUT1, \VSINR
xvsubsp \VSINI_OUT2, \VSINI, \VSINI_OUT2
#elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
xvaddsp \VSINR_OUT1, \VSINR_OUT1, \VSINR
xvsubsp \VSINI_OUT2, \VSINI_OUT2, \VSINI
#else // CC || CR || RC || RR
/* we will assume {-alpha_r,-alpha_i} for this case */
/* i1*i2 - r1*r2, so we negate alpha_r instead to fix the sign */
xvsubsp \VSINR_OUT1, \VSINR, \VSINR_OUT1
/* we negate alpha imaginary instead to fix the sign */
xvaddsp \VSINI_OUT2, \VSINI_OUT2, \VSINI
/* {i0,i1} * {alpha_i,alpha_i} [- VSOUT1]; [VSOUT2 +] {r0,r1} * {alpha_i,alpha_i} */
.macro MULT_APLHA_PART1 VSINRR, VSINII, VSOUT1, VSOUT2
xvmulsp \VSOUT1, \VSINII, alpha_i
xvmulsp \VSOUT2, \VSINRR, alpha_i
/* {r0,r1} * {alpha_r,alpha_r} - VSOUT1; VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */
.macro MULT_APLHA_PART2 VSINRR, VSINII, VSOUT1, VSOUT2
xvmsubasp \VSOUT1, \VSINRR, alpha_r
xvmaddasp \VSOUT2, \VSINII, alpha_r
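/* Taken together, PART1 followed by PART2 scale by alpha = (alpha_r, alpha_i):
     VSOUT1 = VSINRR*alpha_r - VSINII*alpha_i   (real part)
     VSOUT2 = VSINII*alpha_r + VSINRR*alpha_i   (imaginary part)
   PART1 forms the alpha_i products; PART2 folds in the alpha_r products
   via fused multiply-subtract/multiply-add. */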
.macro PERMUTE1 OUT, R1, R2, R3, R4
xxsel vs62, \R1, \R2, vs57
xxsel \OUT, \R3, \R4, vs57
xxpermdi \OUT, \OUT, vs62, 1
.macro PERMUTE2 OUT, R1, R2, R3, R4
xxsel vs62, \R2, \R1, vs57
xxsel \OUT, \R4, \R3, vs57
xxpermdi \OUT, vs62, \OUT, 1
xxperm \OUT, \OUT, permute_mask
.macro PERMUTE3 OUT, R1, R2, R3, R4
xxsel vs62, \R1, \R2, vs57
xxsel \OUT, \R3, \R4, vs57
xxpermdi \OUT, vs62, \OUT, 2
.macro PERMUTE4 OUT, R1, R2, R3, R4
xxsel vs62, \R2, \R1, vs57
xxsel \OUT, \R4, \R3, vs57
xxpermdi \OUT, \OUT, vs62, 2
xxperm \OUT, \OUT, permute_mask
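/* PERMUTE1..PERMUTE4 are four variants of one idea: xxsel with the vs57
   mask picks lanes from two result rows, xxpermdi splices the selected
   doubleword halves, and (in variants 2 and 4) a final xxperm with
   permute_mask restores lane order. They differ only in operand order and
   in which halves land in OUT. */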
xxperm vs0, vs32, permute_mask
xxperm vs4, vs40, permute_mask
xxperm vs1, vs33, permute_mask
xxperm vs5, vs41, permute_mask
xxperm vs8, vs36, permute_mask
xxperm vs12, vs44, permute_mask
xxperm vs9, vs37, permute_mask
xxperm vs13, vs45, permute_mask
AGGREGATE_REALS_IMAGES vs32, vs0, vs40, vs4
AGGREGATE_REALS_IMAGES vs33, vs1, vs41, vs5
AGGREGATE_REALS_IMAGES vs36, vs8, vs44, vs12
AGGREGATE_REALS_IMAGES vs37, vs9, vs45, vs13
xxperm vs0, vs34, permute_mask
xxperm vs4, vs42, permute_mask
xxperm vs1, vs35, permute_mask
xxperm vs5, vs43, permute_mask
xxperm vs8, vs38, permute_mask
xxperm vs12, vs46, permute_mask
xxperm vs9, vs39, permute_mask
xxperm vs13, vs47, permute_mask
AGGREGATE_REALS_IMAGES vs34, vs0, vs42, vs4
AGGREGATE_REALS_IMAGES vs35, vs1, vs43, vs5
AGGREGATE_REALS_IMAGES vs38, vs8, vs46, vs12
AGGREGATE_REALS_IMAGES vs39, vs9, vs47, vs13
.macro MULTIPLY_GROUP1
MULT_APLHA_PART1 vs32, vs40, vs0, vs1
MULT_APLHA_PART1 vs33, vs41, vs2, vs3
MULT_APLHA_PART1 vs36, vs44, vs8, vs9
MULT_APLHA_PART1 vs37, vs45, vs10, vs11
MULT_APLHA_PART2 vs32, vs40, vs0, vs1
MULT_APLHA_PART2 vs33, vs41, vs2, vs3
MULT_APLHA_PART2 vs36, vs44, vs8, vs9
MULT_APLHA_PART2 vs37, vs45, vs10, vs11
.macro MULTIPLY_GROUP2
MULT_APLHA_PART1 vs34, vs42, vs4, vs5
MULT_APLHA_PART1 vs35, vs43, vs6, vs7
MULT_APLHA_PART1 vs38, vs46, vs12, vs13
MULT_APLHA_PART1 vs39, vs47, vs14, vs15
MULT_APLHA_PART2 vs34, vs42, vs4, vs5
MULT_APLHA_PART2 vs35, vs43, vs6, vs7
MULT_APLHA_PART2 vs38, vs46, vs12, vs13
MULT_APLHA_PART2 vs39, vs47, vs14, vs15
/* reconstruct r,i pairs */
.macro RECONSTRUCT_PAIR1
xxperm vs0, vs1, save_permute_1
xxperm vs2, vs3, save_permute_1
xxperm vs8, vs9, save_permute_1
xxperm vs10, vs11, save_permute_1
.macro RECONSTRUCT_PAIR2
xxperm vs4, vs5, save_permute_1
xxperm vs6, vs7, save_permute_1
xxperm vs12, vs13, save_permute_1
xxperm vs14, vs15, save_permute_1
.macro SHUFFLE_ACC ACC, R0, R1, R2, R3, O1, O2, O3, O4
PERMUTE1 \O1, \R3, \R2, \R1, \R0
PERMUTE2 \O2, \R1, \R0, \R3, \R2
PERMUTE3 \O3, \R1, \R0, \R3, \R2
PERMUTE4 \O4, \R3, \R2, \R1, \R0
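/* A hedged reading of SHUFFLE_ACC: it fans the four rows R0..R3 of one
   accumulator out into O1..O4 via the PERMUTE variants, so the save code
   below sees real/imaginary lanes in the layout that permute_mask and
   AGGREGATE_REALS_IMAGES expect. */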
/* macros for N=4 and M=8
**********************************************************************************************/
.macro LOAD4x8O OffsetA, OffsetB
lxvp vs34, (\OffsetB+0)(BO)
lxvp vs32, (\OffsetA+0)(AO)
lxvp vs36, (\OffsetA+32)(AO)
END4x8 AO, BO, 64, 32
.macro END4x8_WITHOUT_ADD
.macro END4x8 AREG, BREG, OffsetA, OffsetB
addi \BREG, \BREG, \OffsetB
addi \AREG, \AREG, \OffsetA
.macro LOAD4x8_2O OffsetA, OffsetB
lxvp vs34, (\OffsetB)(BO)
lxvp vs38, (32+\OffsetB)(BO)
lxvp vs32, (0+\OffsetA)(AO)
lxvp vs36, (32+\OffsetA)(AO)
lxvp vs40, (64+\OffsetA)(AO)
lxvp vs42, (64+32+\OffsetA)(AO)
/* for load2, offsets will be 128 and 64 */
KERNEL4x8_2 AO, BO, 128, 64, 0, 1, 1
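/* Offset arithmetic for the unrolled-by-two kernel, assuming 8-byte complex
   singles: A advances 2 iterations x 8 elements x 8 bytes = 128 and B
   advances 2 x 4 x 8 = 64, matching the comment above. */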
.macro KERNEL4x8_E2 OffsetA, OffsetB, Index, IsLast
KERNEL4x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
.macro KERNEL4x8_L2 OffsetA, OffsetB, Index, IsLast
KERNEL4x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
.macro KERNEL4x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
lxvp vs34, DISP8(\Index, \OffsetB)(\BREG)
lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG)
lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG)
lxvp vs40, DISP16(\Index, 64+\OffsetA)(\AREG)
lxvp vs38, DISP8(\Index, 32+\OffsetB)(\BREG)
lxvp vs42, DISP16(\Index, 64+32+\OffsetA)(\AREG)
addi \BREG, \BREG, DISP8(\Index, \OffsetB)
addi \AREG, \AREG, DISP16(\Index, \OffsetA)
addi \BREG, \BREG, DISP8(\Index, 64)
addi \AREG, \AREG, DISP16(\Index, 128)
END4x8 AO, BO, 64, 32
SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44
SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45
SHUFFLE_ACC 2, vs8, vs9, vs10, vs11, vs34, vs42, vs38, vs46
SHUFFLE_ACC 3, vs12, vs13, vs14, vs15, vs35, vs43, vs39, vs47
SHUFFLE_ACC 4, vs16, vs17, vs18, vs19, vs48, vs56, vs52, vs60
SHUFFLE_ACC 5, vs20, vs21, vs22, vs23, vs49, vs16, vs53, vs61
SHUFFLE_ACC 7, vs28, vs29, vs30, vs31, vs17, vs19, vs18, vs20
SHUFFLE_ACC 6, vs24, vs25, vs26, vs27, vs50, vs58, vs54, vs21
xxperm vs2, vs34, permute_mask
xxperm vs6, vs42, permute_mask
xxperm vs3, vs35, permute_mask
xxperm vs7, vs43, permute_mask
AGGREGATE_REALS_IMAGES vs34, vs2, vs42, vs6
xxperm vs10, vs38, permute_mask
xxperm vs14, vs46, permute_mask
AGGREGATE_REALS_IMAGES vs35, vs3, vs43, vs7
xxperm vs11, vs39, permute_mask
xxperm vs15, vs47, permute_mask
xxperm vs0, vs48, permute_mask
xxperm vs4, vs56, permute_mask
xxperm vs1, vs49, permute_mask
xxperm vs5, vs16, permute_mask
AGGREGATE_REALS_IMAGES vs38, vs10, vs46, vs14
xxperm vs2, vs50, permute_mask
xxperm vs6, vs58, permute_mask
AGGREGATE_REALS_IMAGES vs39, vs11, vs47, vs15
xxperm vs3, vs17, permute_mask
xxperm vs7, vs19, permute_mask
AGGREGATE_REALS_IMAGES vs48, vs0, vs56, vs4
xxperm vs8, vs52, permute_mask
xxperm vs12, vs60, permute_mask
AGGREGATE_REALS_IMAGES vs49, vs1, vs16, vs5
xxperm vs9, vs53, permute_mask
xxperm vs13, vs61, permute_mask
AGGREGATE_REALS_IMAGES vs50, vs2, vs58, vs6
xxperm vs10, vs54, permute_mask
xxperm vs14, vs21, permute_mask
AGGREGATE_REALS_IMAGES vs17, vs3, vs19, vs7
xxperm vs11, vs18, permute_mask
xxperm vs15, vs20, permute_mask
AGGREGATE_REALS_IMAGES vs52, vs8, vs60, vs12
AGGREGATE_REALS_IMAGES vs53, vs9, vs61, vs13
/* VSINRR, VSINII, VSOUT1, VSOUT2 */
MULT_APLHA_PART1 vs32, vs40, vs0, vs1
AGGREGATE_REALS_IMAGES vs54, vs10, vs21, vs14
MULT_APLHA_PART1 vs33, vs41, vs2, vs3
AGGREGATE_REALS_IMAGES vs18, vs11, vs20, vs15
MULT_APLHA_PART1 vs34, vs42, vs4, vs5
MULT_APLHA_PART1 vs35, vs43, vs6, vs7
MULT_APLHA_PART2 vs32, vs40, vs0, vs1
MULT_APLHA_PART2 vs33, vs41, vs2, vs3
MULT_APLHA_PART2 vs34, vs42, vs4, vs5
MULT_APLHA_PART2 vs35, vs43, vs6, vs7
MULT_APLHA_PART1 vs36, vs44, vs8, vs9
MULT_APLHA_PART1 vs37, vs45, vs10, vs11
MULT_APLHA_PART1 vs38, vs46, vs12, vs13
MULT_APLHA_PART1 vs39, vs47, vs14, vs15
MULT_APLHA_PART2 vs36, vs44, vs8, vs9
MULT_APLHA_PART2 vs37, vs45, vs10, vs11
MULT_APLHA_PART2 vs38, vs46, vs12, vs13
MULT_APLHA_PART2 vs39, vs47, vs14, vs15
xxpermdi vs1, vs8, vs0, 2
xxpermdi vs3, vs10, vs2, 2
xxpermdi vs5, vs12, vs4, 2
xxpermdi vs7, vs14, vs6, 2
xxpermdi vs9, vs0, vs8, 2
xxpermdi vs11, vs2, vs10, 2
xvaddsp vs24, vs24, vs3
xvaddsp vs25, vs25, vs1
xxpermdi vs13, vs4, vs12, 2
xxpermdi vs15, vs6, vs14, 2
xvaddsp vs26, vs26, vs7
xvaddsp vs27, vs27, vs5
xvaddsp vs28, vs28, vs11
xvaddsp vs29, vs29, vs9
xvaddsp vs30, vs30, vs15
xvaddsp vs31, vs31, vs13
xxpermdi vs25, vs8, vs0, 2
xxpermdi vs24, vs10, vs2, 2
xxpermdi vs27, vs12, vs4, 2
xxpermdi vs26, vs14, vs6, 2
xxpermdi vs29, vs0, vs8, 2
xxpermdi vs28, vs2, vs10, 2
xxpermdi vs31, vs4, vs12, 2
xxpermdi vs30, vs6, vs14, 2
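/* For reference, xxpermdi XT, XA, XB, imm splices 64-bit halves (ISA element
   order): imm 0 -> {XA.dw0, XB.dw0}, 1 -> {XA.dw0, XB.dw1},
   2 -> {XA.dw1, XB.dw0}, 3 -> {XA.dw1, XB.dw1}. The save sequences above use
   imm 2 to pair the high half of one result vector with the low half of
   another while rebuilding rows of C. */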
MULT_APLHA_PART1 vs48, vs56, vs0, vs1
MULT_APLHA_PART1 vs49, vs16, vs2, vs3
MULT_APLHA_PART1 vs50, vs58, vs4, vs5
MULT_APLHA_PART1 vs17, vs19, vs6, vs7
MULT_APLHA_PART2 vs48, vs56, vs0, vs1
MULT_APLHA_PART2 vs49, vs16, vs2, vs3
MULT_APLHA_PART2 vs50, vs58, vs4, vs5
MULT_APLHA_PART2 vs17, vs19, vs6, vs7
MULT_APLHA_PART1 vs52, vs60, vs8, vs9
MULT_APLHA_PART1 vs53, vs61, vs10, vs11
MULT_APLHA_PART1 vs54, vs21, vs12, vs13
MULT_APLHA_PART1 vs18, vs20, vs14, vs15
MULT_APLHA_PART2 vs52, vs60, vs8, vs9
MULT_APLHA_PART2 vs53, vs61, vs10, vs11
MULT_APLHA_PART2 vs54, vs21, vs12, vs13
MULT_APLHA_PART2 vs18, vs20, vs14, vs15
xxpermdi vs1, vs8, vs0, 2
xxpermdi vs3, vs10, vs2, 2
xxpermdi vs5, vs12, vs4, 2
xxpermdi vs7, vs14, vs6, 2
xxpermdi vs9, vs0, vs8, 2
xxpermdi vs11, vs2, vs10, 2
xvaddsp vs32, vs32, vs3
xvaddsp vs33, vs33, vs1
xxpermdi vs13, vs4, vs12, 2
xxpermdi vs15, vs6, vs14, 2
xvaddsp vs40, vs40, vs7
xvaddsp vs41, vs41, vs5
xvaddsp vs34, vs34, vs11
xvaddsp vs35, vs35, vs9
xvaddsp vs42, vs42, vs15
xvaddsp vs43, vs43, vs13
xxpermdi vs33, vs8, vs0, 2
xxpermdi vs32, vs10, vs2, 2
xxpermdi vs41, vs12, vs4, 2
xxpermdi vs40, vs14, vs6, 2
xxpermdi vs35, vs0, vs8, 2
xxpermdi vs34, vs2, vs10, 2
xxpermdi vs43, vs4, vs12, 2
xxpermdi vs42, vs6, vs14, 2
/* macros for N=4 and M=4
**********************************************************************************************/
.macro LOAD4x4O OffsetA, OffsetB
lxvp vs34, (\OffsetB+0)(BO)
lxvp vs32, (\OffsetA+0)(AO)
END4x4 AO, BO, 32, 32
.macro END4x4_WITHOUT_ADD
.macro END4x4 AREG, BREG, OffsetA, OffsetB
addi \BREG, \BREG, \OffsetB
addi \AREG, \AREG, \OffsetA
.macro LOAD4x4_2O OffsetA, OffsetB
lxvp vs34, (\OffsetB)(BO)
lxvp vs38, (32+\OffsetB)(BO)
lxvp vs32, (0+\OffsetA)(AO)
lxvp vs36, (32+\OffsetA)(AO)
/* for load2, offsets will be 64 and 64 */
KERNEL4x4_2 AO, BO, 64, 64, 0, 1, 1
.macro KERNEL4x4_E2 OffsetA, OffsetB, Index, IsLast
KERNEL4x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
.macro KERNEL4x4_L2 OffsetA, OffsetB, Index, IsLast
KERNEL4x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
.macro KERNEL4x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
lxvp vs34, DISP8(\Index, \OffsetB)(\BREG)
lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG)
lxvp vs38, DISP8(\Index, 32+\OffsetB)(\BREG)
lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG)
addi \BREG, \BREG, DISP8(\Index, \OffsetB)
addi \AREG, \AREG, DISP8(\Index, \OffsetA)
addi \BREG, \BREG, DISP8(\Index, 64)
addi \AREG, \AREG, DISP8(\Index, 64)
END4x4 AO, BO, 32, 32
SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44
SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45
SHUFFLE_ACC 2, vs8, vs9, vs10, vs11, vs34, vs42, vs38, vs46
SHUFFLE_ACC 3, vs12, vs13, vs14, vs15, vs35, vs43, vs39, vs47
/* VSINRR, VSINII, VSOUT1, VSOUT2 */
/* reconstruct r,i pairs */
xxpermdi vs1, vs8, vs0, 2
xxpermdi vs3, vs10, vs2, 2
xxpermdi vs9, vs0, vs8, 2
xxpermdi vs11, vs2, vs10, 2
xxpermdi vs5, vs12, vs4, 2
xxpermdi vs7, vs14, vs6, 2
xxpermdi vs13, vs4, vs12, 2
xxpermdi vs15, vs6, vs14, 2
xvaddsp vs24, vs24, vs3
xvaddsp vs25, vs25, vs1
xvaddsp vs26, vs26, vs11
xvaddsp vs27, vs27, vs9
xvaddsp vs28, vs28, vs7
xvaddsp vs29, vs29, vs5
xvaddsp vs30, vs30, vs15
xvaddsp vs31, vs31, vs13
xxpermdi vs25, vs8, vs0, 2
xxpermdi vs24, vs10, vs2, 2
xxpermdi vs27, vs0, vs8, 2
xxpermdi vs26, vs2, vs10, 2
xxpermdi vs29, vs12, vs4, 2
xxpermdi vs28, vs14, vs6, 2
xxpermdi vs31, vs4, vs12, 2
xxpermdi vs30, vs6, vs14, 2
/* macros for N=4 and M=2
**********************************************************************************************/
.macro LOAD4x2O OffsetA, OffsetB
lxv vs32, (\OffsetA+0)(AO)
lxvp vs34, (\OffsetB+0)(BO)
END4x2 AO, BO, 16, 32
.macro END4x2_WITHOUT_ADD
.macro END4x2 AREG, BREG, OffsetA, OffsetB
addi \BREG, \BREG, \OffsetB
addi \AREG, \AREG, \OffsetA
.macro LOAD4x2_2O OffsetA, OffsetB
lxvp vs32, (\OffsetA)(AO)
lxvp vs34, (0+\OffsetB)(BO)
lxvp vs36, (32+\OffsetB)(BO)
/* for load2, offsets will be 32 and 64 */
KERNEL4x2_2 AO, BO, 32, 64, 0, 1, 1
.macro KERNEL4x2_E2 OffsetA, OffsetB, Index, IsLast
KERNEL4x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
.macro KERNEL4x2_L2 OffsetA, OffsetB, Index, IsLast
KERNEL4x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
.macro KERNEL4x2_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
lxvp vs34, DISP8(\Index, 0+\OffsetB)(\BREG)
lxvp vs32, DISP4(\Index, \OffsetA)(\AREG)
lxvp vs36, DISP8(\Index, 32+\OffsetB)(\BREG)
addi \AREG, \AREG, DISP4(\Index, \OffsetA)
addi \BREG, \BREG, DISP8(\Index, \OffsetB)
addi \AREG, \AREG, DISP4(\Index, 32)
addi \BREG, \BREG, DISP8(\Index, 64)
END4x2 AO, BO, 16, 32
SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44
SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45
AGGREGATE_REALS_IMAGES_A_PERMUTE vs32, vs0, vs40, vs4
AGGREGATE_REALS_IMAGES_A_PERMUTE vs33, vs1, vs41, vs5
AGGREGATE_REALS_IMAGES_A_PERMUTE vs36, vs8, vs44, vs12
AGGREGATE_REALS_IMAGES_A_PERMUTE vs37, vs9, vs45, vs13
/* VSINRR, VSINII, VSOUT1, VSOUT2 */
/* reconstruct r,i pairs */
xxpermdi vs1, vs8, vs0, 0
xxpermdi vs9, vs10, vs2, 0
xxpermdi vs3, vs0, vs8, 3
xxpermdi vs11, vs2, vs10, 3
xvaddsp vs24, vs24, vs1
xvaddsp vs26, vs26, vs9
xvaddsp vs25, vs25, vs3
xvaddsp vs27, vs27, vs11
xxpermdi vs24, vs8, vs0, 0
xxpermdi vs26, vs10, vs2, 0
xxpermdi vs25, vs0, vs8, 3
xxpermdi vs27, vs2, vs10, 3
/* macros for N=4 and M=1
**********************************************************************************************/
.macro LOAD4x1O OffsetA, OffsetB
lxsd v0, (\OffsetA+0)(AO)
lxvp vs34, (\OffsetB+0)(BO)
.macro END4x1_WITHOUT_ADD
.macro END4x1 AREG, BREG, OffsetA, OffsetB
addi \BREG, \BREG, \OffsetB
addi \AREG, \AREG, \OffsetA
.macro LOAD4x1_2O OffsetA, OffsetB
lxv vs32, (\OffsetA)(AO)
xxpermdi vs33, vs32, vs38, 0
xxpermdi vs32, vs32, vs38, 2
lxvp vs34, (0+\OffsetB)(BO)
lxvp vs36, (32+\OffsetB)(BO)
/* for load2, offsets will be 16 and 64 */
KERNEL4x1_2 AO, BO, 16, 64, 0, 1, 1
.macro KERNEL4x1_E2 OffsetA, OffsetB, Index, IsLast
KERNEL4x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
.macro KERNEL4x1_L2 OffsetA, OffsetB, Index, IsLast
KERNEL4x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
.macro KERNEL4x1_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
lxvp vs34, DISP8(\Index, 0+\OffsetB)(\BREG)
lxv vs32, DISP2(\Index, \OffsetA)(\AREG)
lxvp vs36, DISP8(\Index, 32+\OffsetB)(\BREG)
xxpermdi vs33, vs32, vs38, 0
xxpermdi vs32, vs32, vs38, 2
addi \AREG, \AREG, DISP2(\Index, \OffsetA)
addi \BREG, \BREG, DISP8(\Index, \OffsetB)
addi \AREG, \AREG, DISP2(\Index, 16)
addi \BREG, \BREG, DISP8(\Index, 64)
SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44
SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45
xxpermdi vs32, vs32, vs36, 1
xxpermdi vs40, vs40, vs44, 1
xxpermdi vs33, vs33, vs37, 1
xxpermdi vs41, vs41, vs45, 1
xxperm vs0, vs32, permute_mask
xxperm vs4, vs40, permute_mask
xxperm vs1, vs33, permute_mask
xxperm vs5, vs41, permute_mask
AGGREGATE_REALS_IMAGES_A_PERMUTE vs32, vs0, vs40, vs4
AGGREGATE_REALS_IMAGES_A_PERMUTE vs33, vs1, vs41, vs5
/* VSINRR, VSINII, VSOUT1, VSOUT2 */
MULT_APLHA_PART1 vs32, vs40, vs0, vs1
MULT_APLHA_PART1 vs33, vs41, vs2, vs3
MULT_APLHA_PART2 vs32, vs40, vs0, vs1
MULT_APLHA_PART2 vs33, vs41, vs2, vs3
/* reconstruct r,i pairs */
xxperm vs0, vs1, save_permute_1
xxperm vs2, vs3, save_permute_1
/*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/
xvaddsp vs36, vs36, vs1
xvaddsp vs37, vs37, vs3
xvaddsp vs38, vs38, vs9
xvaddsp vs39, vs39, vs11
/*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/
/* macros for N=2 and M=8
**********************************************************************************************/
.macro LOAD2x8O OffsetA, OffsetB
lxv vs34, (\OffsetB+0)(BO)
lxvp vs32, (\OffsetA+0)(AO)
lxvp vs36, (\OffsetA+32)(AO)
END2x8 AO, BO, 64, 16
.macro END2x8_WITHOUT_ADD
.macro END2x8 AREG, BREG, OffsetA, OffsetB
addi \BREG, \BREG, \OffsetB
addi \AREG, \AREG, \OffsetA
.macro LOAD2x8_2O OffsetA, OffsetB
lxvp vs34, (\OffsetB)(BO)
lxvp vs32, (0+\OffsetA)(AO)
lxvp vs36, (32+\OffsetA)(AO)
lxvp vs38, (64+\OffsetA)(AO)
lxvp vs40, (64+32+\OffsetA)(AO)
/* for load2, offsets will be 128 and 32 */
KERNEL2x8_2 AO, BO, 128, 32, 0, 1, 1
.macro KERNEL2x8_E2 OffsetA, OffsetB, Index, IsLast
KERNEL2x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
.macro KERNEL2x8_L2 OffsetA, OffsetB, Index, IsLast
KERNEL2x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
.macro KERNEL2x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
xvf32gerpp 2, 37, 35
xvf32gerpp 3, 36, 35
xvf32gerpp 0, 33, 35
xvf32gerpp 1, 32, 35
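/* xvf32gerpp acc, xa, xb is the MMA fp32 outer-product accumulate: each of
   the four floats of xa is multiplied with each of the four floats of xb
   and added into the 4x4 accumulator acc, so the four calls above feed
   accumulators 0..3 from one B vector and four A vectors. */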
lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG)
lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG)
xvf32gerpp 2, 41, 34
xvf32gerpp 3, 40, 34
xvf32gerpp 0, 39, 34
xvf32gerpp 1, 38, 34
lxvp vs34, DISP4(\Index, \OffsetB)(\BREG)
lxvp vs38, DISP16(\Index, 64+\OffsetA)(\AREG)
lxvp vs40, DISP16(\Index, 64+32+\OffsetA)(\AREG)
addi \BREG, \BREG, DISP4(\Index, \OffsetB)
addi \AREG, \AREG, DISP16(\Index, \OffsetA)
addi \BREG, \BREG, DISP4(\Index, 32)
addi \AREG, \AREG, DISP16(\Index, 128)
END2x8 AO, BO, 64, 16
SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44
SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45
SHUFFLE_ACC 2, vs8, vs9, vs10, vs11, vs34, vs42, vs38, vs46
SHUFFLE_ACC 3, vs12, vs13, vs14, vs15, vs35, vs43, vs39, vs47
/* VSINRR, VSINII, VSOUT1, VSOUT2 */
/* reconstruct r,i pairs */
xxpermdi vs1, vs8, vs0, 2
xxpermdi vs3, vs10, vs2, 2
xxpermdi vs5, vs12, vs4, 2
xxpermdi vs7, vs14, vs6, 2
xxpermdi vs9, vs0, vs8, 2
xxpermdi vs11, vs2, vs10, 2
xvaddsp vs24, vs24, vs3
xvaddsp vs25, vs25, vs1
xxpermdi vs13, vs4, vs12, 2
xxpermdi vs15, vs6, vs14, 2
xvaddsp vs26, vs26, vs7
xvaddsp vs27, vs27, vs5
xvaddsp vs28, vs28, vs11
xvaddsp vs29, vs29, vs9
xvaddsp vs30, vs30, vs15
xvaddsp vs31, vs31, vs13
xxpermdi vs25, vs8, vs0, 2
xxpermdi vs24, vs10, vs2, 2
xxpermdi vs27, vs12, vs4, 2
xxpermdi vs26, vs14, vs6, 2
xxpermdi vs29, vs0, vs8, 2
xxpermdi vs28, vs2, vs10, 2
xxpermdi vs31, vs4, vs12, 2
xxpermdi vs30, vs6, vs14, 2
/* macros for N=2 and M=4
**********************************************************************************************/
.macro LOAD2x4O OffsetA, OffsetB
lxv vs34, (\OffsetB+0)(BO)
lxvp vs32, (\OffsetA+0)(AO)
.macro END2x4_NORMAL
END2x4 AO, BO, 32, 16
.macro END2x4_WITHOUT_ADD
.macro END2x4 AREG, BREG, OffsetA, OffsetB
addi \BREG, \BREG, \OffsetB
addi \AREG, \AREG, \OffsetA
xvf32gerpp 0, 33, 34
xvf32gerpp 1, 32, 34
.macro LOAD2x4_2O OffsetA, OffsetB
lxvp vs34, (\OffsetB)(BO)
lxvp vs32, (0+\OffsetA)(AO)
lxvp vs36, (32+\OffsetA)(AO)
/* for load2, offsets will be 64 and 32 */
KERNEL2x4_2 AO, BO, 64, 32, 0, 1, 1
.macro KERNEL2x4_E2 OffsetA, OffsetB, Index, IsLast
KERNEL2x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
.macro KERNEL2x4_L2 OffsetA, OffsetB, Index, IsLast
KERNEL2x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
.macro KERNEL2x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
xvf32gerpp 0, 33, 35
xvf32gerpp 1, 32, 35
lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG)
xvf32gerpp 0, 37, 34
xvf32gerpp 1, 36, 34
lxvp vs34, DISP4(\Index, \OffsetB)(\BREG)
lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG)
addi \BREG, \BREG, DISP4(\Index, \OffsetB)
addi \AREG, \AREG, DISP8(\Index, \OffsetA)
addi \BREG, \BREG, DISP4(\Index, 32)
addi \AREG, \AREG, DISP8(\Index, 64)
END2x4 AO, BO, 32, 16
SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44
SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45
/* VSINRR, VSINII, VSOUT1, VSOUT2 */
/* reconstruct r,i pairs */
xxpermdi vs1, vs8, vs0, 2
xxpermdi vs3, vs10, vs2, 2
xxpermdi vs9, vs0, vs8, 2
xxpermdi vs11, vs2, vs10, 2
xvaddsp vs24, vs24, vs3
xvaddsp vs25, vs25, vs1
xvaddsp vs26, vs26, vs11
xvaddsp vs27, vs27, vs9
xxpermdi vs25, vs8, vs0, 2
xxpermdi vs24, vs10, vs2, 2
xxpermdi vs27, vs0, vs8, 2
xxpermdi vs26, vs2, vs10, 2
/* macros for N=2 and M=2
**********************************************************************************************/
.macro LOAD2x2O OffsetA, OffsetB
lxv vs32, (\OffsetA+0)(AO)
lxv vs34, (\OffsetB+0)(BO)
.macro END2x2_NORMAL
END2x2 AO, BO, 16, 16
.macro END2x2_WITHOUT_ADD
.macro END2x2 AREG, BREG, OffsetA, OffsetB
addi \BREG, \BREG, \OffsetB
addi \AREG, \AREG, \OffsetA
xvf32gerpp 0, 34, 32
.macro LOAD2x2_2O OffsetA, OffsetB
lxvp vs32, (\OffsetA)(AO)
lxvp vs34, (0+\OffsetB)(BO)
/* for load2, offsets will be 32 and 32 */
KERNEL2x2_2 AO, BO, 32, 32, 0, 1, 1
.macro KERNEL2x2_E2 OffsetA, OffsetB, Index, IsLast
KERNEL2x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
.macro KERNEL2x2_L2 OffsetA, OffsetB, Index, IsLast
KERNEL2x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
.macro KERNEL2x2_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
xvf32gerpp 0, 34, 32
xvf32gerpp 0, 35, 33
lxvp vs32, DISP4(\Index, \OffsetA)(\AREG)
lxvp vs34, DISP4(\Index, \OffsetB)(\BREG)
addi \AREG, \AREG, DISP4(\Index, \OffsetA)
addi \BREG, \BREG, DISP4(\Index, \OffsetB)
addi \AREG, \AREG, DISP4(\Index, 32)
addi \BREG, \BREG, DISP4(\Index, 32)
END2x2 AO, BO, 16, 16
SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44
xxperm vs0, vs32, permute_mask
xxperm vs4, vs40, permute_mask
xxperm vs8, vs36, permute_mask
xxperm vs12, vs44, permute_mask
AGGREGATE_REALS_IMAGES_A_PERMUTE vs32, vs0, vs40, vs4
AGGREGATE_REALS_IMAGES_A_PERMUTE vs36, vs8, vs44, vs12
/* VSINRR, VSINII, VSOUT1, VSOUT2 */
MULT_APLHA_PART1 vs32, vs40, vs0, vs1
MULT_APLHA_PART1 vs36, vs44, vs8, vs9
MULT_APLHA_PART2 vs32, vs40, vs0, vs1
MULT_APLHA_PART2 vs36, vs44, vs8, vs9
/* reconstruct r,i pairs */
xxperm vs0, vs1, save_permute_1
xxperm vs8, vs9, save_permute_1
xxpermdi vs1, vs8, vs0, 0
xxpermdi vs9, vs0, vs8, 3
xvaddsp vs24, vs24, vs1
xvaddsp vs26, vs26, vs9
xxpermdi vs24, vs8, vs0, 0
xxpermdi vs26, vs0, vs8, 3
/* macros for N=2 and M=1
**********************************************************************************************/
xxlxor vs32, vs32, vs32
xxlxor vs40, vs40, vs40
.macro LOAD2x1O OffsetA, OffsetB
lxsd v4, (\OffsetA+0)(AO)
lxv vs0, (\OffsetB+0)(BO)
xxspltd vs24, vs36, 0
xxperm vs26, vs24, permute_mask
.macro END2x1_NORMAL
.macro END2x1_WITHOUT_ADD
.macro END2x1 AREG, BREG, OffsetA, OffsetB
addi \BREG, \BREG, \OffsetB
addi \AREG, \AREG, \OffsetA
xvmaddasp vs32, vs0, vs24
xvmaddasp vs40, vs0, vs26
.macro LOAD2x1_2O OffsetA, OffsetB
lxv vs27, (\OffsetA)(AO)
lxvp vs4, (0+\OffsetB)(BO)
xxspltd vs8, vs27, 1
xxspltd vs24, vs27, 0
xxperm vs10, vs8, permute_mask
xxperm vs26, vs24, permute_mask
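/* xxspltd broadcasts one 64-bit half of vs27 (one complex element of A)
   across the whole vector, so vs24 and vs8 each hold one duplicated complex
   value; the xxperm with permute_mask then builds the swapped (imag, real)
   copies vs26/vs10 used by the imaginary-part FMAs in the kernel below. */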
/* for load2, offsets will be 16 and 32 */
KERNEL2x1_2 AO, BO, 16, 32, 0, 1, 1
.macro KERNEL2x1_E2 OffsetA, OffsetB, Index, IsLast
KERNEL2x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
.macro KERNEL2x1_L2 OffsetA, OffsetB, Index, IsLast
KERNEL2x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
.macro KERNEL2x1_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
xvmaddasp vs32, vs5, vs8
xvmaddasp vs40, vs5, vs10
lxv vs27, DISP2(\Index, \OffsetA)(\AREG)
xxspltd vs8, vs27, 1
xxperm vs10, vs8, permute_mask
xvmaddasp vs32, vs4, vs24
xvmaddasp vs40, vs4, vs26
xxspltd vs24, vs27, 0
xxperm vs26, vs24, permute_mask
lxvp vs4, DISP4(\Index, 0+\OffsetB)(\BREG)
addi \AREG, \AREG, DISP2(\Index, \OffsetA)
addi \BREG, \BREG, DISP4(\Index, \OffsetB)
addi \AREG, \AREG, DISP2(\Index, 16)
addi \BREG, \BREG, DISP4(\Index, 32)
END2x1 AO, BO, 8, 16
xxperm vs0, vs32, permute_mask
xxperm vs4, vs40, permute_mask
AGGREGATE_REALS_IMAGES_A_PERMUTE vs32, vs0, vs40, vs4
AGGREGATE_REALS_IMAGES_A_PERMUTE vs33, vs1, vs41, vs5
/* VSINRR, VSINII, VSOUT1, VSOUT2 */
MULT_APLHA_PART1 vs32, vs40, vs0, vs1
MULT_APLHA_PART2 vs32, vs40, vs0, vs1
/* reconstruct r,i pairs */
xxperm vs0, vs1, save_permute_1
/*--v4==vs36 v5==vs37---*/
xvaddsp vs36, vs36, vs1
xvaddsp vs37, vs37, vs3
/*--v4==vs36 v5==vs37---*/
xxspltd vs36, vs0, 0
xxspltd vs37, vs0, 1
/* macros for N=1 and M=8
**********************************************************************************************/
.macro LOAD1x8O OffsetA, OffsetB
lxsd v2, (\OffsetB+0)(BO)
lxvp vs32, (\OffsetA+0)(AO)
lxvp vs36, (\OffsetA+32)(AO)
.macro END1x8_NORMAL
.macro END1x8_WITHOUT_ADD
.macro END1x8 AREG, BREG, OffsetA, OffsetB
addi \BREG, \BREG, \OffsetB
addi \AREG, \AREG, \OffsetA
xvf32gerpp 0, 34, 33
xvf32gerpp 1, 34, 32
xvf32gerpp 2, 34, 37
xvf32gerpp 3, 34, 36
.macro LOAD1x8_2O OffsetA, OffsetB
lxv vs34, (\OffsetB)(BO)
lxvp vs32, (0+\OffsetA)(AO)
lxvp vs36, (32+\OffsetA)(AO)
xxpermdi vs35, vs34, vs42, 0
xxpermdi vs34, vs34, vs42, 2
lxvp vs38, (64+\OffsetA)(AO)
lxvp vs40, (64+32+\OffsetA)(AO)
/* for load2, offsets will be 128 and 16 */
KERNEL1x8_2 AO, BO, 128, 16, 0, 1, 1
.macro KERNEL1x8_E2 OffsetA, OffsetB, Index, IsLast
KERNEL1x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
.macro KERNEL1x8_L2 OffsetA, OffsetB, Index, IsLast
KERNEL1x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
.macro KERNEL1x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
xvf32gerpp 0, 34, 33
xvf32gerpp 1, 34, 32
lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG)
xvf32gerpp 2, 34, 37
xvf32gerpp 3, 34, 36
lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG)
xvf32gerpp 0, 35, 39
xvf32gerpp 1, 35, 38
lxvp vs38, DISP16(\Index, 64+\OffsetA)(\AREG)
xvf32gerpp 2, 35, 41
xvf32gerpp 3, 35, 40
lxv vs34, DISP2(\Index, \OffsetB)(\BREG)
xxpermdi vs35, vs34, vs42, 0
xxpermdi vs34, vs34, vs42, 2
lxvp vs40, DISP16(\Index, 64+32+\OffsetA)(\AREG)
addi \BREG, \BREG, DISP2(\Index, \OffsetB)
addi \AREG, \AREG, DISP16(\Index, \OffsetA)
addi \BREG, \BREG, DISP2(\Index, 16)
addi \AREG, \AREG, DISP16(\Index, 128)
SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44
SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45
SHUFFLE_ACC 2, vs8, vs9, vs10, vs11, vs34, vs42, vs38, vs46
SHUFFLE_ACC 3, vs12, vs13, vs14, vs15, vs35, vs43, vs39, vs47
xxpermdi vs32, vs32, vs36, 0
xxpermdi vs33, vs33, vs37, 0
xxpermdi vs34, vs34, vs38, 0
xxpermdi vs35, vs35, vs39, 0
xxpermdi vs40, vs40, vs44, 0
xxperm vs40, vs40, permute_mask
xxpermdi vs41, vs41, vs45, 0
xxperm vs41, vs41, permute_mask
xxpermdi vs42, vs42, vs46, 0
xxperm vs42, vs42, permute_mask
xxpermdi vs43, vs43, vs47, 0
xxperm vs43, vs43, permute_mask
xxperm vs0, vs32, permute_mask
xxperm vs4, vs40, permute_mask
xxperm vs1, vs33, permute_mask
xxperm vs5, vs41, permute_mask
xxperm vs2, vs34, permute_mask
xxperm vs6, vs42, permute_mask
xxperm vs3, vs35, permute_mask
xxperm vs7, vs43, permute_mask
AGGREGATE_REALS_IMAGES vs32, vs0, vs40, vs4
AGGREGATE_REALS_IMAGES vs33, vs1, vs41, vs5
AGGREGATE_REALS_IMAGES vs34, vs2, vs42, vs6
AGGREGATE_REALS_IMAGES vs35, vs3, vs43, vs7
/* inner reverse save_permute and store vs28 */
xxpermdi vs28, save_permute_1, save_permute_1, 2
/* VSINRR, VSINII, VSOUT1, VSOUT2 */
MULT_APLHA_PART1 vs32, vs40, vs0, vs1
MULT_APLHA_PART1 vs33, vs41, vs2, vs3
MULT_APLHA_PART1 vs34, vs42, vs4, vs5
MULT_APLHA_PART1 vs35, vs43, vs6, vs7
MULT_APLHA_PART2 vs32, vs40, vs0, vs1
MULT_APLHA_PART2 vs33, vs41, vs2, vs3
MULT_APLHA_PART2 vs34, vs42, vs4, vs5
MULT_APLHA_PART2 vs35, vs43, vs6, vs7
/* reconstruct r,i pairs */
xxperm vs0, vs1, vs28
xxperm vs2, vs3, vs28
xxperm vs4, vs5, vs28
xxperm vs6, vs7, vs28
xvaddsp vs24, vs24, vs2
xvaddsp vs25, vs25, vs0
xvaddsp vs26, vs26, vs6
xvaddsp vs27, vs27, vs4
/* reconstruct r,i pairs */
/* macros for N=1 and M=4
**********************************************************************************************/
.macro LOAD1x4O OffsetA, OffsetB
lxsd v2, (\OffsetB+0)(BO)
lxvp vs32, (\OffsetA+0)(AO)
.macro END1x4_NORMAL
.macro END1x4_WITHOUT_ADD
.macro END1x4 AREG, BREG, OffsetA, OffsetB
addi \BREG, \BREG, \OffsetB
addi \AREG, \AREG, \OffsetA
xvf32gerpp 0, 34, 33
xvf32gerpp 1, 34, 32
.macro LOAD1x4_2O OffsetA, OffsetB
lxv vs34, (\OffsetB)(BO)
lxvp vs32, (0+\OffsetA)(AO)
xxpermdi vs35, vs34, vs38, 0
xxpermdi vs34, vs34, vs38, 2
lxvp vs36, (32+\OffsetA)(AO)
/* for load2, offsets will be 64 and 16 */
KERNEL1x4_2 AO, BO, 64, 16, 0, 1, 1
.macro KERNEL1x4_E2 OffsetA, OffsetB, Index, IsLast
KERNEL1x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
.macro KERNEL1x4_L2 OffsetA, OffsetB, Index, IsLast
KERNEL1x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
.macro KERNEL1x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
xvf32gerpp 0, 34, 33
xvf32gerpp 1, 34, 32
lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG)
xvf32gerpp 0, 35, 37
xvf32gerpp 1, 35, 36
lxv vs34, DISP2(\Index, \OffsetB)(\BREG)
xxpermdi vs35, vs34, vs38, 0
xxpermdi vs34, vs34, vs38, 2
lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG)
addi \BREG, \BREG, DISP2(\Index, \OffsetB)
addi \AREG, \AREG, DISP8(\Index, \OffsetA)
addi \BREG, \BREG, DISP2(\Index, 16)
addi \AREG, \AREG, DISP8(\Index, 64)
SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44
SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45
xxpermdi vs32, vs32, vs36, 0
xxpermdi vs40, vs40, vs44, 0
xxpermdi vs33, vs33, vs37, 0
xxpermdi vs41, vs41, vs45, 0
xxperm vs40, vs40, permute_mask
xxperm vs41, vs41, permute_mask
xxperm vs0, vs32, permute_mask
xxperm vs4, vs40, permute_mask
xxperm vs1, vs33, permute_mask
xxperm vs5, vs41, permute_mask
AGGREGATE_REALS_IMAGES vs32, vs0, vs40, vs4
AGGREGATE_REALS_IMAGES vs33, vs1, vs41, vs5
/* inner reverse save_permute and store vs28 */
xxpermdi vs28, save_permute_1, save_permute_1, 2
/* VSINRR, VSINII, VSOUT1, VSOUT2 */
MULT_APLHA_PART1 vs32, vs40, vs0, vs1
MULT_APLHA_PART1 vs33, vs41, vs2, vs3
MULT_APLHA_PART2 vs32, vs40, vs0, vs1
MULT_APLHA_PART2 vs33, vs41, vs2, vs3
/* reconstruct r,i pairs */
xxperm vs0, vs1, vs28
xxperm vs2, vs3, vs28
xvaddsp vs24, vs24, vs2
xvaddsp vs25, vs25, vs0
/* reconstruct r,i pairs */
/* macros for N=1 and M=2
**********************************************************************************************/
xxlxor vs32, vs32, vs32
xxlxor vs40, vs40, vs40
.macro LOAD1x2O OffsetA, OffsetB
lxsd v4, (\OffsetB+0)(BO)
lxv vs0, (\OffsetA+0)(AO)
xxspltd vs24, vs36, 0
xxperm vs26, vs24, permute_mask
.macro END1x2_NORMAL
.macro END1x2_WITHOUT_ADD
.macro END1x2 AREG, BREG, OffsetA, OffsetB
addi \BREG, \BREG, \OffsetB
addi \AREG, \AREG, \OffsetA
xvmaddasp vs32, vs0, vs24
xvmaddasp vs40, vs0, vs26
.macro LOAD1x2_2O OffsetA, OffsetB
lxv vs27, (\OffsetB)(BO)
lxvp vs4, (0+\OffsetA)(AO)
xxspltd vs8, vs27, 1
xxspltd vs24, vs27, 0
xxperm vs10, vs8, permute_mask
xxperm vs26, vs24, permute_mask
/* for load2, offsets will be 32 and 16 */
KERNEL1x2_2 AO, BO, 32, 16, 0, 1, 1
.macro KERNEL1x2_E2 OffsetA, OffsetB, Index, IsLast
KERNEL1x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
.macro KERNEL1x2_L2 OffsetA, OffsetB, Index, IsLast
KERNEL1x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
.macro KERNEL1x2_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
lxv vs27, DISP2(\Index, \OffsetB)(\BREG)
xvmaddasp vs32, vs5, vs8
xvmaddasp vs40, vs5, vs10
xxspltd vs8, vs27, 1
xxperm vs10, vs8, permute_mask
xvmaddasp vs32, vs4, vs24
xvmaddasp vs40, vs4, vs26
lxvp vs4, DISP4(\Index, 0+\OffsetA)(\AREG)
xxspltd vs24, vs27, 0
xxperm vs26, vs24, permute_mask
addi \BREG, \BREG, DISP2(\Index, \OffsetB)
addi \AREG, \AREG, DISP4(\Index, \OffsetA)
addi \BREG, \BREG, DISP2(\Index, 16)
addi \AREG, \AREG, DISP4(\Index, 32)
xxperm vs0, vs32, permute_mask
xxperm vs4, vs40, permute_mask
AGGREGATE_REALS_IMAGES vs32, vs0, vs40, vs4
/* inner reverse save_permute and store vs28 */
xxpermdi vs28, save_permute_1, save_permute_1, 2
/* VSINRR, VSINII, VSOUT1, VSOUT2 */
MULT_APLHA_PART1 vs32, vs40, vs0, vs1
MULT_APLHA_PART2 vs32, vs40, vs0, vs1
/* reconstruct r,i pairs */
xxperm vs0, vs1, vs28
xvaddsp vs24, vs24, vs0
/* reconstruct r,i pairs */
/* macros for N=1 and M=1
**********************************************************************************************/
xxlxor vs32, vs32, vs32
xxlxor vs40, vs40, vs40
.macro LOAD1x1O OffsetA, OffsetB
lxsd v4, (\OffsetB+0)(BO)
lxsd v5, (\OffsetA+0)(AO)
xxperm vs38, vs36, permute_mask
.macro END1x1_NORMAL
.macro END1x1_WITHOUT_ADD
.macro END1x1 AREG, BREG, OffsetA, OffsetB
addi \BREG, \BREG, \OffsetB
addi \AREG, \AREG, \OffsetA
xvmaddasp vs32, vs37, vs36
xvmaddasp vs40, vs37, vs38
.macro LOAD1x1_2O OffsetA, OffsetB
lxv vs8, (\OffsetB)(BO)
lxv vs4, (0+\OffsetA)(AO)
xxperm vs10, vs8, permute_mask
/* for load2, offsets will be 16 and 16 */
KERNEL1x1_2 AO, BO, 16, 16, 0, 1, 1
.macro KERNEL1x1_E2 OffsetA, OffsetB, Index, IsLast
KERNEL1x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
.macro KERNEL1x1_L2 OffsetA, OffsetB, Index, IsLast
KERNEL1x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
.macro KERNEL1x1_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
xvmaddasp vs32, vs4, vs8
xvmaddasp vs40, vs4, vs10
lxv vs8, DISP2(\Index, \OffsetB)(\BREG)
lxv vs4, DISP2(\Index, \OffsetA)(\AREG)
xxperm vs10, vs8, permute_mask
addi \BREG, \BREG, DISP2(\Index, \OffsetB)
addi \AREG, \AREG, DISP2(\Index, \OffsetA)
addi \BREG, \BREG, DISP2(\Index, 16)
addi \AREG, \AREG, DISP2(\Index, 16)
xxpermdi vs33, vs32, vs32, 2
xxpermdi vs41, vs40, vs40, 2
xvaddsp vs32, vs32, vs33
xvaddsp vs40, vs40, vs41
xxperm vs0, vs32, permute_mask
xxperm vs4, vs40, permute_mask
AGGREGATE_REALS_IMAGES vs32, vs0, vs40, vs4
/* inner reverse save_permute and store vs28 */
xxpermdi vs28, save_permute_1, save_permute_1, 2
/* VSINRR, VSINII, VSOUT1, VSOUT2 */
MULT_APLHA_PART1 vs32, vs40, vs37, vs1
MULT_APLHA_PART2 vs32, vs40, vs37, vs1
/* reconstruct r,i pairs */
xxperm vs37, vs1, vs28
xvaddsp vs36, vs36, vs37
/**************************** TRMM POINTER REFRESH MACROS *************************/
.macro SHIFT_REG REG1, REG2, SHIFT_VAL
slwi \REG1, \REG2, 7
.elseif \SHIFT_VAL==8
slwi \REG1, \REG2, 6
.elseif \SHIFT_VAL==4
slwi \REG1, \REG2, 5
.elseif \SHIFT_VAL==2
slwi \REG1, \REG2, 4
.elseif \SHIFT_VAL==1
slwi \REG1, \REG2, 3
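/* The shift amounts encode SHIFT_VAL complex elements of 8 bytes each:
   SHIFT_VAL==16 -> slwi 7 (x128), 8 -> 6 (x64), 4 -> 5 (x32), 2 -> 4 (x16),
   1 -> 3 (x8); i.e. REG1 = REG2 * SHIFT_VAL * 8 bytes. */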
//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
// ptrbb = bb + off*4;
.macro REFRESH_POINTERS PTR_A, PTR_B, OFF_VAL, B_VAL, C_A, C_B
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mr \PTR_B, \B_VAL /* refresh BPOINT */
// ptrba = ptrba + off*C_A;
// ptrbb = bb + off*C_B;
SHIFT_REG T4, \OFF_VAL, \C_B /* number of values in B shifted */
SHIFT_REG T2, \OFF_VAL, \C_A /* number of values in A shifted */
add \PTR_B, \B_VAL, T4 /* add values to BO */
add \PTR_A, \PTR_A, T2 /* add values to AO */
// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
// #elif defined(LEFT)
// temp = off+8; // number of values in A
// temp = off+4; // number of values in B
.macro REFRESH_TEMP_BK TEMP_BK, BK_VAL, OFF_VAL, INCR_A, INCR_B
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
sub \TEMP_BK, \BK_VAL, \OFF_VAL
/* temp = off+INCR_A; // number of values in A */
addi \TEMP_BK, \OFF_VAL, \INCR_A
/* temp = off+INCR_B; // number of values in B */
addi \TEMP_BK, \OFF_VAL, \INCR_B
// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
// temp -= 8; // number of values in A
// temp -= 4; // number of values in B
// off += 8; // number of values in A
.macro REFRESH_AFTER_SAVE TEMP_BK, BK_VAL, OFF_VAL, PTR_B, PTR_A, C_A, C_B
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
/* temp = bk - off; */
sub \TEMP_BK, \BK_VAL, \OFF_VAL
/* temp -= 8; // number of values in A */
addi \TEMP_BK, \TEMP_BK, -\C_A
/* temp -= 4; // number of values in B */
addi \TEMP_BK, \TEMP_BK, -\C_B
/* ptrba += temp*C_A;
   ptrbb += temp*C_B; */
SHIFT_REG T4, \TEMP_BK, \C_A
SHIFT_REG T2, \TEMP_BK, \C_B
add \PTR_A, \PTR_A, T4 /* ptrba + temp*C_A */
add \PTR_B, \PTR_B, T2
/* off += 8; // number of values in A */
addi \OFF_VAL, \OFF_VAL, \C_A