// Sign selection for the four partial products of a complex multiply
// a*b with a = ar + i*ai (from A) and b = br + i*bi (from B).
// In the reduction sequences of this file the macros are applied as:
//   XSFADD_R1: real accumulator  <op>  ar*br
//   XSFADD_R2: real accumulator  <op>  ai*bi
//   XSFADD_I1: imag accumulator  <op>  ai*br
//   XSFADD_I2: imag accumulator  <op>  ar*bi
// NOTE(review): N/T presumably select a plain operand and C/R a conjugated
// one (first letter = A, second letter = B) -- the formulas below follow
// from the signs alone; confirm the naming against the build definitions.
1 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
// real += ar*br - ai*bi ; imag += ai*br + ar*bi        (a * b)
3 #define XSFADD_R1 xsadddp
4 #define XSFADD_R2 xssubdp
5 #define XSFADD_I1 xsadddp
6 #define XSFADD_I2 xsadddp
8 #elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
// real += ar*br + ai*bi ; imag += ar*bi - ai*br        (conj(a) * b)
10 #define XSFADD_R1 xsadddp
11 #define XSFADD_R2 xsadddp
12 #define XSFADD_I1 xssubdp
13 #define XSFADD_I2 xsadddp
15 #elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
// real += ar*br + ai*bi ; imag += ai*br - ar*bi        (a * conj(b))
17 #define XSFADD_R1 xsadddp
18 #define XSFADD_R2 xsadddp
19 #define XSFADD_I1 xsadddp
20 #define XSFADD_I2 xssubdp
22 #else // CC || CR || RC || RR
// real += ar*br - ai*bi ; imag += -(ai*br) - ar*bi     (conj(a) * conj(b))
24 #define XSFADD_R1 xsadddp
25 #define XSFADD_R2 xssubdp
26 #define XSFADD_I1 xssubdp
27 #define XSFADD_I2 xssubdp
31 /**********************************************************************************************
32 * Macros for N=2 and M=8
33 **********************************************************************************************/
37 lxvdsx vs16, o0, BO // load real part from B
38 lxvdsx vs17, o8, BO // load imag part from B
39 lxvdsx vs18, o16, BO // load real part from B
40 lxvdsx vs19, o24, BO // load imag part from B
44 lxvd2x vs0, o0, AO // load real,imag from A
45 lxvd2x vs1, o16, AO // load real,imag from A
46 lxvd2x vs2, o32, AO // load real,imag from A
47 lxvd2x vs3, o48, AO // load real,imag from A
51 lxvd2x vs4, o0, AO // load real,imag from A
52 lxvd2x vs5, o16, AO // load real,imag from A
53 lxvd2x vs6, o32, AO // load real,imag from A
54 lxvd2x vs7, o48, AO // load real,imag from A
63 lxvd2x vs8, o0, AO // load real,imag from A
64 lxvd2x vs9, o16, AO // load real,imag from A
65 lxvd2x vs10, o32, AO // load real,imag from A
66 lxvd2x vs11, o48, AO // load real,imag from A
70 lxvd2x vs12, o0, AO // load real,imag from A
71 lxvd2x vs13, o16, AO // load real,imag from A
72 lxvd2x vs14, o32, AO // load real,imag from A
73 lxvd2x vs15, o48, AO // load real,imag from A
77 lxvdsx vs20, o0, BO // load real part from B
78 lxvdsx vs21, o8, BO // load imag part from B
79 lxvdsx vs22, o16, BO // load real part from B
80 lxvdsx vs23, o24, BO // load imag part from B
84 xvmuldp vs32, vs0, vs16 // real*real, imag*real
85 xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
86 xvmuldp vs34, vs1, vs16 // real*real, imag*real
87 xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
88 xvmuldp vs36, vs2, vs16 // real*real, imag*real
89 xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
90 xvmuldp vs38, vs3, vs16 // real*real, imag*real
91 xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
92 xvmuldp vs40, vs4, vs16 // real*real, imag*real
93 xvmuldp vs41, vs4, vs17 // real*imag, imag*imag
94 xvmuldp vs42, vs5, vs16 // real*real, imag*real
95 xvmuldp vs43, vs5, vs17 // real*imag, imag*imag
96 xvmuldp vs44, vs6, vs16 // real*real, imag*real
97 xvmuldp vs45, vs6, vs17 // real*imag, imag*imag
98 xvmuldp vs46, vs7, vs16 // real*real, imag*real
99 xvmuldp vs47, vs7, vs17 // real*imag, imag*imag
101 xvmuldp vs48, vs0, vs18 // real*real, imag*real
102 xvmuldp vs49, vs0, vs19 // real*imag, imag*imag
103 xvmuldp vs50, vs1, vs18 // real*real, imag*real
104 xvmuldp vs51, vs1, vs19 // real*imag, imag*imag
105 xvmuldp vs52, vs2, vs18 // real*real, imag*real
106 xvmuldp vs53, vs2, vs19 // real*imag, imag*imag
107 xvmuldp vs54, vs3, vs18 // real*real, imag*real
108 xvmuldp vs55, vs3, vs19 // real*imag, imag*imag
109 xvmuldp vs56, vs4, vs18 // real*real, imag*real
110 xvmuldp vs57, vs4, vs19 // real*imag, imag*imag
111 xvmuldp vs58, vs5, vs18 // real*real, imag*real
112 xvmuldp vs59, vs5, vs19 // real*imag, imag*imag
113 xvmuldp vs60, vs6, vs18 // real*real, imag*real
114 xvmuldp vs61, vs6, vs19 // real*imag, imag*imag
115 xvmuldp vs62, vs7, vs18 // real*real, imag*real
116 xvmuldp vs63, vs7, vs19 // real*imag, imag*imag
124 xvmaddadp vs32, vs0, vs16 // real*real, imag*real
125 xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
126 xvmaddadp vs34, vs1, vs16 // real*real, imag*real
127 xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
129 lxvdsx vs22, o16, BO // load real part from B
130 lxvdsx vs23, o24, BO // load imag part from B
132 xvmaddadp vs36, vs2, vs16 // real*real, imag*real
133 xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
134 xvmaddadp vs38, vs3, vs16 // real*real, imag*real
135 xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
137 lxvd2x vs8, o0, AO // load real,imag from A
138 lxvd2x vs9, o16, AO // load real,imag from A
140 xvmaddadp vs40, vs4, vs16 // real*real, imag*real
141 xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag
142 xvmaddadp vs42, vs5, vs16 // real*real, imag*real
143 xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag
145 lxvd2x vs10, o32, AO // load real,imag from A
146 lxvd2x vs11, o48, AO // load real,imag from A
148 xvmaddadp vs44, vs6, vs16 // real*real, imag*real
149 xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag
153 xvmaddadp vs46, vs7, vs16 // real*real, imag*real
154 xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag
156 xvmaddadp vs48, vs0, vs18 // real*real, imag*real
157 xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag
158 xvmaddadp vs50, vs1, vs18 // real*real, imag*real
159 xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag
161 lxvd2x vs12, o0, AO // load real,imag from A
162 lxvd2x vs13, o16, AO // load real,imag from A
164 xvmaddadp vs52, vs2, vs18 // real*real, imag*real
165 xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag
166 xvmaddadp vs54, vs3, vs18 // real*real, imag*real
167 xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag
169 lxvd2x vs14, o32, AO // load real,imag from A
170 lxvd2x vs15, o48, AO // load real,imag from A
172 xvmaddadp vs56, vs4, vs18 // real*real, imag*real
173 xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag
174 xvmaddadp vs58, vs5, vs18 // real*real, imag*real
175 xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag
177 lxvdsx vs20, o0, BO // load real part from B
178 lxvdsx vs21, o8, BO // load imag part from B
180 xvmaddadp vs60, vs6, vs18 // real*real, imag*real
181 xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag
182 xvmaddadp vs62, vs7, vs18 // real*real, imag*real
183 xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag
193 xvmaddadp vs32, vs8, vs20 // real*real, imag*real
194 xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
195 xvmaddadp vs34, vs9, vs20 // real*real, imag*real
196 xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
198 lxvdsx vs16, o0, BO // load real part from B
199 lxvdsx vs17, o8, BO // load imag part from B
201 xvmaddadp vs36, vs10, vs20 // real*real, imag*real
202 xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
203 xvmaddadp vs38, vs11, vs20 // real*real, imag*real
204 xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
206 lxvd2x vs0, o0, AO // load real,imag from A
207 lxvd2x vs1, o16, AO // load real,imag from A
209 xvmaddadp vs40, vs12, vs20 // real*real, imag*real
210 xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag
211 xvmaddadp vs42, vs13, vs20 // real*real, imag*real
212 xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag
214 lxvd2x vs2, o32, AO // load real,imag from A
215 lxvd2x vs3, o48, AO // load real,imag from A
217 xvmaddadp vs44, vs14, vs20 // real*real, imag*real
218 xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag
219 xvmaddadp vs46, vs15, vs20 // real*real, imag*real
220 xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag
224 xvmaddadp vs48, vs8, vs22 // real*real, imag*real
225 xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag
226 xvmaddadp vs50, vs9, vs22 // real*real, imag*real
227 xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag
229 lxvd2x vs4, o0, AO // load real,imag from A
230 lxvd2x vs5, o16, AO // load real,imag from A
232 xvmaddadp vs52, vs10, vs22 // real*real, imag*real
233 xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag
234 xvmaddadp vs54, vs11, vs22 // real*real, imag*real
235 xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag
237 lxvd2x vs6, o32, AO // load real,imag from A
238 lxvd2x vs7, o48, AO // load real,imag from A
240 xvmaddadp vs56, vs12, vs22 // real*real, imag*real
241 xvmaddadp vs57, vs12, vs23 // real*imag, imag*imag
242 xvmaddadp vs58, vs13, vs22 // real*real, imag*real
243 xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag
245 lxvdsx vs18, o16, BO // load real part from B
246 lxvdsx vs19, o24, BO // load imag part from B
248 xvmaddadp vs60, vs14, vs22 // real*real, imag*real
249 xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag
250 xvmaddadp vs62, vs15, vs22 // real*real, imag*real
251 xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag
261 xvmaddadp vs32, vs8, vs20 // real*real, imag*real
262 xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
263 xvmaddadp vs34, vs9, vs20 // real*real, imag*real
264 xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
265 xvmaddadp vs36, vs10, vs20 // real*real, imag*real
266 xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
267 xvmaddadp vs38, vs11, vs20 // real*real, imag*real
268 xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
269 xvmaddadp vs40, vs12, vs20 // real*real, imag*real
270 xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag
271 xvmaddadp vs42, vs13, vs20 // real*real, imag*real
272 xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag
273 xvmaddadp vs44, vs14, vs20 // real*real, imag*real
274 xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag
275 xvmaddadp vs46, vs15, vs20 // real*real, imag*real
276 xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag
278 xvmaddadp vs48, vs8, vs22 // real*real, imag*real
279 xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag
280 xvmaddadp vs50, vs9, vs22 // real*real, imag*real
281 xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag
282 xvmaddadp vs52, vs10, vs22 // real*real, imag*real
283 xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag
284 xvmaddadp vs54, vs11, vs22 // real*real, imag*real
285 xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag
286 xvmaddadp vs56, vs12, vs22 // real*real, imag*real
287 xvmaddadp vs57, vs12, vs23 // real*imag, imag*imag
288 xvmaddadp vs58, vs13, vs22 // real*real, imag*real
289 xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag
290 xvmaddadp vs60, vs14, vs22 // real*real, imag*real
291 xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag
292 xvmaddadp vs62, vs15, vs22 // real*real, imag*real
293 xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag
// KERNEL2x8_SUBI1: initializing step of the N=2, M=8 kernel.
// Loads eight (real,imag) vectors of A into vs0-vs7 and four splatted B
// scalars into vs16-vs19, then WRITES (xvmuldp, no accumulation) the 32
// partial-product registers vs32-vs63:
//   vs32-vs47: A element k (k=0..7) times the first  B value (vs16 real, vs17 imag)
//   vs48-vs63: A element k (k=0..7) times the second B value (vs18 real, vs19 imag)
// Even-numbered results hold (realA*realB, imagA*realB), odd-numbered ones
// hold (realA*imagB, imagA*imagB).
298 .macro KERNEL2x8_SUBI1
// A elements 0..3
300 lxvd2x vs0, o0, AO // load real,imag from A
301 lxvd2x vs1, o16, AO // load real,imag from A
302 lxvd2x vs2, o32, AO // load real,imag from A
303 lxvd2x vs3, o48, AO // load real,imag from A
// A elements 4..7.
// NOTE(review): this group indexes AO with the same o0..o48 offsets as the
// group above, so AO has to be advanced between the two groups for vs4-vs7
// to receive new data -- confirm the pointer update.
307 lxvd2x vs4, o0, AO // load real,imag from A
308 lxvd2x vs5, o16, AO // load real,imag from A
309 lxvd2x vs6, o32, AO // load real,imag from A
310 lxvd2x vs7, o48, AO // load real,imag from A
// Two complex B values; each scalar is replicated into both vector lanes.
314 lxvdsx vs16, o0, BO // load real part from B
315 lxvdsx vs17, o8, BO // load imag part from B
316 lxvdsx vs18, o16, BO // load real part from B
317 lxvdsx vs19, o24, BO // load imag part from B
// Products with the first B value (vs16/vs17).
321 xvmuldp vs32, vs0, vs16 // real*real, imag*real
322 xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
323 xvmuldp vs34, vs1, vs16 // real*real, imag*real
324 xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
325 xvmuldp vs36, vs2, vs16 // real*real, imag*real
326 xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
327 xvmuldp vs38, vs3, vs16 // real*real, imag*real
328 xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
329 xvmuldp vs40, vs4, vs16 // real*real, imag*real
330 xvmuldp vs41, vs4, vs17 // real*imag, imag*imag
331 xvmuldp vs42, vs5, vs16 // real*real, imag*real
332 xvmuldp vs43, vs5, vs17 // real*imag, imag*imag
333 xvmuldp vs44, vs6, vs16 // real*real, imag*real
334 xvmuldp vs45, vs6, vs17 // real*imag, imag*imag
335 xvmuldp vs46, vs7, vs16 // real*real, imag*real
336 xvmuldp vs47, vs7, vs17 // real*imag, imag*imag
// Products with the second B value (vs18/vs19).
338 xvmuldp vs48, vs0, vs18 // real*real, imag*real
339 xvmuldp vs49, vs0, vs19 // real*imag, imag*imag
340 xvmuldp vs50, vs1, vs18 // real*real, imag*real
341 xvmuldp vs51, vs1, vs19 // real*imag, imag*imag
342 xvmuldp vs52, vs2, vs18 // real*real, imag*real
343 xvmuldp vs53, vs2, vs19 // real*imag, imag*imag
344 xvmuldp vs54, vs3, vs18 // real*real, imag*real
345 xvmuldp vs55, vs3, vs19 // real*imag, imag*imag
346 xvmuldp vs56, vs4, vs18 // real*real, imag*real
347 xvmuldp vs57, vs4, vs19 // real*imag, imag*imag
348 xvmuldp vs58, vs5, vs18 // real*real, imag*real
349 xvmuldp vs59, vs5, vs19 // real*imag, imag*imag
350 xvmuldp vs60, vs6, vs18 // real*real, imag*real
351 xvmuldp vs61, vs6, vs19 // real*imag, imag*imag
352 xvmuldp vs62, vs7, vs18 // real*real, imag*real
353 xvmuldp vs63, vs7, vs19 // real*imag, imag*imag
// KERNEL2x8_SUB1: accumulating step of the N=2, M=8 kernel.
// Same loads and register layout as the initializing variant, but every
// product is ADDED (xvmaddadp) to the existing contents of vs32-vs63, so
// those registers must already hold valid partial sums on entry:
//   vs32-vs47 += A element k (k=0..7) times the first  B value (vs16 real, vs17 imag)
//   vs48-vs63 += A element k (k=0..7) times the second B value (vs18 real, vs19 imag)
358 .macro KERNEL2x8_SUB1
// A elements 0..3
360 lxvd2x vs0, o0, AO // load real,imag from A
361 lxvd2x vs1, o16, AO // load real,imag from A
362 lxvd2x vs2, o32, AO // load real,imag from A
363 lxvd2x vs3, o48, AO // load real,imag from A
// A elements 4..7.
// NOTE(review): this group indexes AO with the same o0..o48 offsets as the
// group above, so AO has to be advanced between the two groups for vs4-vs7
// to receive new data -- confirm the pointer update.
367 lxvd2x vs4, o0, AO // load real,imag from A
368 lxvd2x vs5, o16, AO // load real,imag from A
369 lxvd2x vs6, o32, AO // load real,imag from A
370 lxvd2x vs7, o48, AO // load real,imag from A
// Two complex B values; each scalar is replicated into both vector lanes.
374 lxvdsx vs16, o0, BO // load real part from B
375 lxvdsx vs17, o8, BO // load imag part from B
376 lxvdsx vs18, o16, BO // load real part from B
377 lxvdsx vs19, o24, BO // load imag part from B
// Accumulate products with the first B value (vs16/vs17).
381 xvmaddadp vs32, vs0, vs16 // real*real, imag*real
382 xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
383 xvmaddadp vs34, vs1, vs16 // real*real, imag*real
384 xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
385 xvmaddadp vs36, vs2, vs16 // real*real, imag*real
386 xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
387 xvmaddadp vs38, vs3, vs16 // real*real, imag*real
388 xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
389 xvmaddadp vs40, vs4, vs16 // real*real, imag*real
390 xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag
391 xvmaddadp vs42, vs5, vs16 // real*real, imag*real
392 xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag
393 xvmaddadp vs44, vs6, vs16 // real*real, imag*real
394 xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag
395 xvmaddadp vs46, vs7, vs16 // real*real, imag*real
396 xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag
// Accumulate products with the second B value (vs18/vs19).
398 xvmaddadp vs48, vs0, vs18 // real*real, imag*real
399 xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag
400 xvmaddadp vs50, vs1, vs18 // real*real, imag*real
401 xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag
402 xvmaddadp vs52, vs2, vs18 // real*real, imag*real
403 xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag
404 xvmaddadp vs54, vs3, vs18 // real*real, imag*real
405 xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag
406 xvmaddadp vs56, vs4, vs18 // real*real, imag*real
407 xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag
408 xvmaddadp vs58, vs5, vs18 // real*real, imag*real
409 xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag
410 xvmaddadp vs60, vs6, vs18 // real*real, imag*real
411 xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag
412 xvmaddadp vs62, vs7, vs18 // real*real, imag*real
413 xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag
// Collapse the partial products vs32/vs33 into one alpha-scaled complex value in vs8.
// vs0 (real) and vs1 (imag) are updated in place, so they must already hold
// their starting value (presumably 0) on entry. vs32 is left swapped afterwards.
440 xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
442 XSFADD_R1 vs0, vs0, vs32 // realA*realB
443 XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
445 xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB
446 xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
448 XSFADD_I1 vs1, vs1, vs32 // imagA*realB
449 XSFADD_I2 vs1, vs1, vs33 // realA*imagB
// Complex multiply of (vs0 + i*vs1) by (alpha_r + i*alpha_i).
451 xsmuldp vs4, vs0, alpha_r // real*alpha_r
452 xsmuldp vs5, vs1, alpha_i // imag*alpha_i
453 xsmuldp vs6, vs0, alpha_i // real*alpha_i
454 xsmuldp vs7, vs1, alpha_r // imag*alpha_r
456 xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
457 xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
458 xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
// Collapse the partial products vs34/vs35 into one alpha-scaled complex value in vs9.
// vs0 (real) and vs1 (imag) are updated in place, so they must already hold
// their starting value (presumably 0) on entry. vs34 is left swapped afterwards.
464 xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
466 XSFADD_R1 vs0, vs0, vs34 // realA*realB
467 XSFADD_R2 vs0, vs0, vs35 // imagA*imagB
469 xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB
470 xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
472 XSFADD_I1 vs1, vs1, vs34 // imagA*realB
473 XSFADD_I2 vs1, vs1, vs35 // realA*imagB
// Complex multiply of (vs0 + i*vs1) by (alpha_r + i*alpha_i).
475 xsmuldp vs4, vs0, alpha_r // real*alpha_r
476 xsmuldp vs5, vs1, alpha_i // imag*alpha_i
477 xsmuldp vs6, vs0, alpha_i // real*alpha_i
478 xsmuldp vs7, vs1, alpha_r // imag*alpha_r
480 xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
481 xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
482 xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
// Collapse the partial products vs36/vs37 into one alpha-scaled complex value in vs10.
// vs0 (real) and vs1 (imag) are updated in place, so they must already hold
// their starting value (presumably 0) on entry. vs36 is left swapped afterwards.
488 xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
490 XSFADD_R1 vs0, vs0, vs36 // realA*realB
491 XSFADD_R2 vs0, vs0, vs37 // imagA*imagB
493 xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB
494 xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
496 XSFADD_I1 vs1, vs1, vs36 // imagA*realB
497 XSFADD_I2 vs1, vs1, vs37 // realA*imagB
// Complex multiply of (vs0 + i*vs1) by (alpha_r + i*alpha_i).
499 xsmuldp vs4, vs0, alpha_r // real*alpha_r
500 xsmuldp vs5, vs1, alpha_i // imag*alpha_i
501 xsmuldp vs6, vs0, alpha_i // real*alpha_i
502 xsmuldp vs7, vs1, alpha_r // imag*alpha_r
504 xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
505 xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
506 xxpermdi vs10, vs2, vs3, 0 // merge real and imag part
// Collapse the partial products vs38/vs39 into one alpha-scaled complex value in vs11.
// vs0 (real) and vs1 (imag) are updated in place, so they must already hold
// their starting value (presumably 0) on entry. vs38 is left swapped afterwards.
512 xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
514 XSFADD_R1 vs0, vs0, vs38 // realA*realB
515 XSFADD_R2 vs0, vs0, vs39 // imagA*imagB
517 xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB
518 xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
520 XSFADD_I1 vs1, vs1, vs38 // imagA*realB
521 XSFADD_I2 vs1, vs1, vs39 // realA*imagB
// Complex multiply of (vs0 + i*vs1) by (alpha_r + i*alpha_i).
523 xsmuldp vs4, vs0, alpha_r // real*alpha_r
524 xsmuldp vs5, vs1, alpha_i // imag*alpha_i
525 xsmuldp vs6, vs0, alpha_i // real*alpha_i
526 xsmuldp vs7, vs1, alpha_r // imag*alpha_r
528 xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
529 xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
530 xxpermdi vs11, vs2, vs3, 0 // merge real and imag part
// Collapse the partial products vs40/vs41 into one alpha-scaled complex value in vs12.
// vs0 (real) and vs1 (imag) are updated in place, so they must already hold
// their starting value (presumably 0) on entry. vs40 is left swapped afterwards.
536 xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
538 XSFADD_R1 vs0, vs0, vs40 // realA*realB
539 XSFADD_R2 vs0, vs0, vs41 // imagA*imagB
541 xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB
542 xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
544 XSFADD_I1 vs1, vs1, vs40 // imagA*realB
545 XSFADD_I2 vs1, vs1, vs41 // realA*imagB
// Complex multiply of (vs0 + i*vs1) by (alpha_r + i*alpha_i).
547 xsmuldp vs4, vs0, alpha_r // real*alpha_r
548 xsmuldp vs5, vs1, alpha_i // imag*alpha_i
549 xsmuldp vs6, vs0, alpha_i // real*alpha_i
550 xsmuldp vs7, vs1, alpha_r // imag*alpha_r
552 xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
553 xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
554 xxpermdi vs12, vs2, vs3, 0 // merge real and imag part
// Collapse the partial products vs42/vs43 into one alpha-scaled complex value in vs13.
// vs0 (real) and vs1 (imag) are updated in place, so they must already hold
// their starting value (presumably 0) on entry. vs42 is left swapped afterwards.
560 xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
562 XSFADD_R1 vs0, vs0, vs42 // realA*realB
563 XSFADD_R2 vs0, vs0, vs43 // imagA*imagB
565 xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB
566 xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
568 XSFADD_I1 vs1, vs1, vs42 // imagA*realB
569 XSFADD_I2 vs1, vs1, vs43 // realA*imagB
// Complex multiply of (vs0 + i*vs1) by (alpha_r + i*alpha_i).
571 xsmuldp vs4, vs0, alpha_r // real*alpha_r
572 xsmuldp vs5, vs1, alpha_i // imag*alpha_i
573 xsmuldp vs6, vs0, alpha_i // real*alpha_i
574 xsmuldp vs7, vs1, alpha_r // imag*alpha_r
576 xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
577 xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
578 xxpermdi vs13, vs2, vs3, 0 // merge real and imag part
// Collapse the partial products vs44/vs45 into one alpha-scaled complex value in vs14.
// vs0 (real) and vs1 (imag) are updated in place, so they must already hold
// their starting value (presumably 0) on entry. vs44 is left swapped afterwards.
584 xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
586 XSFADD_R1 vs0, vs0, vs44 // realA*realB
587 XSFADD_R2 vs0, vs0, vs45 // imagA*imagB
589 xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB
590 xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
592 XSFADD_I1 vs1, vs1, vs44 // imagA*realB
593 XSFADD_I2 vs1, vs1, vs45 // realA*imagB
// Complex multiply of (vs0 + i*vs1) by (alpha_r + i*alpha_i).
595 xsmuldp vs4, vs0, alpha_r // real*alpha_r
596 xsmuldp vs5, vs1, alpha_i // imag*alpha_i
597 xsmuldp vs6, vs0, alpha_i // real*alpha_i
598 xsmuldp vs7, vs1, alpha_r // imag*alpha_r
600 xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
601 xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
602 xxpermdi vs14, vs2, vs3, 0 // merge real and imag part
// Collapse the partial products vs46/vs47 into one alpha-scaled complex value in vs15.
// vs0 (real) and vs1 (imag) are updated in place, so they must already hold
// their starting value (presumably 0) on entry. vs46 is left swapped afterwards.
608 xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
610 XSFADD_R1 vs0, vs0, vs46 // realA*realB
611 XSFADD_R2 vs0, vs0, vs47 // imagA*imagB
613 xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB
614 xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
616 XSFADD_I1 vs1, vs1, vs46 // imagA*realB
617 XSFADD_I2 vs1, vs1, vs47 // realA*imagB
// Complex multiply of (vs0 + i*vs1) by (alpha_r + i*alpha_i).
619 xsmuldp vs4, vs0, alpha_r // real*alpha_r
620 xsmuldp vs5, vs1, alpha_i // imag*alpha_i
621 xsmuldp vs6, vs0, alpha_i // real*alpha_i
622 xsmuldp vs7, vs1, alpha_r // imag*alpha_r
624 xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
625 xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
626 xxpermdi vs15, vs2, vs3, 0 // merge real and imag part
631 xvadddp vs8, vs8, vs16
632 xvadddp vs9, vs9, vs17
633 xvadddp vs10, vs10, vs18
634 xvadddp vs11, vs11, vs19
635 xvadddp vs12, vs12, vs20
636 xvadddp vs13, vs13, vs21
637 xvadddp vs14, vs14, vs22
638 xvadddp vs15, vs15, vs23
644 stxvd2x vs10, o32, T1
645 stxvd2x vs11, o48, T1
647 stxvd2x vs13, o16, T2
648 stxvd2x vs14, o32, T2
649 stxvd2x vs15, o48, T2
// Collapse the partial products vs48/vs49 into one alpha-scaled complex value in vs8.
// vs0 (real) and vs1 (imag) are updated in place, so they must already hold
// their starting value (presumably 0) on entry. vs48 is left swapped afterwards.
670 xxswapd vs49, vs49 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
672 XSFADD_R1 vs0, vs0, vs48 // realA*realB
673 XSFADD_R2 vs0, vs0, vs49 // imagA*imagB
675 xxswapd vs48, vs48 // realA*realB, imagA*realB -> imagA*realB, realA*realB
676 xxswapd vs49, vs49 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
678 XSFADD_I1 vs1, vs1, vs48 // imagA*realB
679 XSFADD_I2 vs1, vs1, vs49 // realA*imagB
// Complex multiply of (vs0 + i*vs1) by (alpha_r + i*alpha_i).
681 xsmuldp vs4, vs0, alpha_r // real*alpha_r
682 xsmuldp vs5, vs1, alpha_i // imag*alpha_i
683 xsmuldp vs6, vs0, alpha_i // real*alpha_i
684 xsmuldp vs7, vs1, alpha_r // imag*alpha_r
686 xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
687 xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
688 xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
// Collapse the partial products vs50/vs51 into one alpha-scaled complex value in vs9.
// vs0 (real) and vs1 (imag) are updated in place, so they must already hold
// their starting value (presumably 0) on entry. vs50 is left swapped afterwards.
694 xxswapd vs51, vs51 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
696 XSFADD_R1 vs0, vs0, vs50 // realA*realB
697 XSFADD_R2 vs0, vs0, vs51 // imagA*imagB
699 xxswapd vs50, vs50 // realA*realB, imagA*realB -> imagA*realB, realA*realB
700 xxswapd vs51, vs51 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
702 XSFADD_I1 vs1, vs1, vs50 // imagA*realB
703 XSFADD_I2 vs1, vs1, vs51 // realA*imagB
// Complex multiply of (vs0 + i*vs1) by (alpha_r + i*alpha_i).
705 xsmuldp vs4, vs0, alpha_r // real*alpha_r
706 xsmuldp vs5, vs1, alpha_i // imag*alpha_i
707 xsmuldp vs6, vs0, alpha_i // real*alpha_i
708 xsmuldp vs7, vs1, alpha_r // imag*alpha_r
710 xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
711 xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
712 xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
// Collapse the partial products vs52/vs53 into one alpha-scaled complex value in vs10.
// vs0 (real) and vs1 (imag) are updated in place, so they must already hold
// their starting value (presumably 0) on entry. vs52 is left swapped afterwards.
718 xxswapd vs53, vs53 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
720 XSFADD_R1 vs0, vs0, vs52 // realA*realB
721 XSFADD_R2 vs0, vs0, vs53 // imagA*imagB
723 xxswapd vs52, vs52 // realA*realB, imagA*realB -> imagA*realB, realA*realB
724 xxswapd vs53, vs53 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
726 XSFADD_I1 vs1, vs1, vs52 // imagA*realB
727 XSFADD_I2 vs1, vs1, vs53 // realA*imagB
// Complex multiply of (vs0 + i*vs1) by (alpha_r + i*alpha_i).
729 xsmuldp vs4, vs0, alpha_r // real*alpha_r
730 xsmuldp vs5, vs1, alpha_i // imag*alpha_i
731 xsmuldp vs6, vs0, alpha_i // real*alpha_i
732 xsmuldp vs7, vs1, alpha_r // imag*alpha_r
734 xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
735 xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
736 xxpermdi vs10, vs2, vs3, 0 // merge real and imag part
// Collapse the partial products vs54/vs55 into one alpha-scaled complex value in vs11.
// vs0 (real) and vs1 (imag) are updated in place, so they must already hold
// their starting value (presumably 0) on entry. vs54 is left swapped afterwards.
742 xxswapd vs55, vs55 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
744 XSFADD_R1 vs0, vs0, vs54 // realA*realB
745 XSFADD_R2 vs0, vs0, vs55 // imagA*imagB
747 xxswapd vs54, vs54 // realA*realB, imagA*realB -> imagA*realB, realA*realB
748 xxswapd vs55, vs55 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
750 XSFADD_I1 vs1, vs1, vs54 // imagA*realB
751 XSFADD_I2 vs1, vs1, vs55 // realA*imagB
// Complex multiply of (vs0 + i*vs1) by (alpha_r + i*alpha_i).
753 xsmuldp vs4, vs0, alpha_r // real*alpha_r
754 xsmuldp vs5, vs1, alpha_i // imag*alpha_i
755 xsmuldp vs6, vs0, alpha_i // real*alpha_i
756 xsmuldp vs7, vs1, alpha_r // imag*alpha_r
758 xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
759 xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
760 xxpermdi vs11, vs2, vs3, 0 // merge real and imag part
// Collapse the partial products vs56/vs57 into one alpha-scaled complex value in vs12.
// vs0 (real) and vs1 (imag) are updated in place, so they must already hold
// their starting value (presumably 0) on entry. vs56 is left swapped afterwards.
766 xxswapd vs57, vs57 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
768 XSFADD_R1 vs0, vs0, vs56 // realA*realB
769 XSFADD_R2 vs0, vs0, vs57 // imagA*imagB
771 xxswapd vs56, vs56 // realA*realB, imagA*realB -> imagA*realB, realA*realB
772 xxswapd vs57, vs57 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
774 XSFADD_I1 vs1, vs1, vs56 // imagA*realB
775 XSFADD_I2 vs1, vs1, vs57 // realA*imagB
// Complex multiply of (vs0 + i*vs1) by (alpha_r + i*alpha_i).
777 xsmuldp vs4, vs0, alpha_r // real*alpha_r
778 xsmuldp vs5, vs1, alpha_i // imag*alpha_i
779 xsmuldp vs6, vs0, alpha_i // real*alpha_i
780 xsmuldp vs7, vs1, alpha_r // imag*alpha_r
782 xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
783 xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
784 xxpermdi vs12, vs2, vs3, 0 // merge real and imag part
// Collapse the partial products vs58/vs59 into one alpha-scaled complex value in vs13.
// vs0 (real) and vs1 (imag) are updated in place, so they must already hold
// their starting value (presumably 0) on entry. vs58 is left swapped afterwards.
790 xxswapd vs59, vs59 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
792 XSFADD_R1 vs0, vs0, vs58 // realA*realB
793 XSFADD_R2 vs0, vs0, vs59 // imagA*imagB
795 xxswapd vs58, vs58 // realA*realB, imagA*realB -> imagA*realB, realA*realB
796 xxswapd vs59, vs59 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
798 XSFADD_I1 vs1, vs1, vs58 // imagA*realB
799 XSFADD_I2 vs1, vs1, vs59 // realA*imagB
// Complex multiply of (vs0 + i*vs1) by (alpha_r + i*alpha_i).
801 xsmuldp vs4, vs0, alpha_r // real*alpha_r
802 xsmuldp vs5, vs1, alpha_i // imag*alpha_i
803 xsmuldp vs6, vs0, alpha_i // real*alpha_i
804 xsmuldp vs7, vs1, alpha_r // imag*alpha_r
806 xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
807 xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
808 xxpermdi vs13, vs2, vs3, 0 // merge real and imag part
// Collapse the partial products vs60/vs61 into one alpha-scaled complex value in vs14.
// vs0 (real) and vs1 (imag) are updated in place, so they must already hold
// their starting value (presumably 0) on entry. vs60 is left swapped afterwards.
814 xxswapd vs61, vs61 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
816 XSFADD_R1 vs0, vs0, vs60 // realA*realB
817 XSFADD_R2 vs0, vs0, vs61 // imagA*imagB
819 xxswapd vs60, vs60 // realA*realB, imagA*realB -> imagA*realB, realA*realB
820 xxswapd vs61, vs61 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
822 XSFADD_I1 vs1, vs1, vs60 // imagA*realB
823 XSFADD_I2 vs1, vs1, vs61 // realA*imagB
// Complex multiply of (vs0 + i*vs1) by (alpha_r + i*alpha_i).
825 xsmuldp vs4, vs0, alpha_r // real*alpha_r
826 xsmuldp vs5, vs1, alpha_i // imag*alpha_i
827 xsmuldp vs6, vs0, alpha_i // real*alpha_i
828 xsmuldp vs7, vs1, alpha_r // imag*alpha_r
830 xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
831 xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
832 xxpermdi vs14, vs2, vs3, 0 // merge real and imag part
// Collapse the partial products vs62/vs63 into one alpha-scaled complex value in vs15.
// vs0 (real) and vs1 (imag) are updated in place, so they must already hold
// their starting value (presumably 0) on entry. vs62 is left swapped afterwards.
838 xxswapd vs63, vs63 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
840 XSFADD_R1 vs0, vs0, vs62 // realA*realB
841 XSFADD_R2 vs0, vs0, vs63 // imagA*imagB
843 xxswapd vs62, vs62 // realA*realB, imagA*realB -> imagA*realB, realA*realB
844 xxswapd vs63, vs63 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
846 XSFADD_I1 vs1, vs1, vs62 // imagA*realB
847 XSFADD_I2 vs1, vs1, vs63 // realA*imagB
// Complex multiply of (vs0 + i*vs1) by (alpha_r + i*alpha_i).
849 xsmuldp vs4, vs0, alpha_r // real*alpha_r
850 xsmuldp vs5, vs1, alpha_i // imag*alpha_i
851 xsmuldp vs6, vs0, alpha_i // real*alpha_i
852 xsmuldp vs7, vs1, alpha_r // imag*alpha_r
854 xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
855 xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
856 xxpermdi vs15, vs2, vs3, 0 // merge real and imag part
861 xvadddp vs8, vs8, vs16
862 xvadddp vs9, vs9, vs17
863 xvadddp vs10, vs10, vs18
864 xvadddp vs11, vs11, vs19
865 xvadddp vs12, vs12, vs20
866 xvadddp vs13, vs13, vs21
867 xvadddp vs14, vs14, vs22
868 xvadddp vs15, vs15, vs23
874 stxvd2x vs10, o32, T1
875 stxvd2x vs11, o48, T1
877 stxvd2x vs13, o16, T2
878 stxvd2x vs14, o32, T2
879 stxvd2x vs15, o48, T2
888 /**********************************************************************************************
889 * Macros for N=2 and M=4
890 **********************************************************************************************/
894 lxvdsx vs16, o0, BO // load real part from B
895 lxvdsx vs17, o8, BO // load imag part from B
896 lxvdsx vs18, o16, BO // load real part from B
897 lxvdsx vs19, o24, BO // load imag part from B
901 lxvd2x vs0, o0, AO // load real,imag from A
902 lxvd2x vs1, o16, AO // load real,imag from A
903 lxvd2x vs2, o32, AO // load real,imag from A
904 lxvd2x vs3, o48, AO // load real,imag from A
913 lxvd2x vs8, o0, AO // load real,imag from A
914 lxvd2x vs9, o16, AO // load real,imag from A
915 lxvd2x vs10, o32, AO // load real,imag from A
916 lxvd2x vs11, o48, AO // load real,imag from A
920 lxvdsx vs20, o0, BO // load real part from B
921 lxvdsx vs21, o8, BO // load imag part from B
922 lxvdsx vs22, o16, BO // load real part from B
923 lxvdsx vs23, o24, BO // load imag part from B
927 xvmuldp vs32, vs0, vs16 // real*real, imag*real
928 xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
929 xvmuldp vs34, vs1, vs16 // real*real, imag*real
930 xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
931 xvmuldp vs36, vs2, vs16 // real*real, imag*real
932 xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
933 xvmuldp vs38, vs3, vs16 // real*real, imag*real
934 xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
936 xvmuldp vs40, vs0, vs18 // real*real, imag*real
937 xvmuldp vs41, vs0, vs19 // real*imag, imag*imag
938 xvmuldp vs42, vs1, vs18 // real*real, imag*real
939 xvmuldp vs43, vs1, vs19 // real*imag, imag*imag
940 xvmuldp vs44, vs2, vs18 // real*real, imag*real
941 xvmuldp vs45, vs2, vs19 // real*imag, imag*imag
942 xvmuldp vs46, vs3, vs18 // real*real, imag*real
943 xvmuldp vs47, vs3, vs19 // real*imag, imag*imag
950 lxvd2x vs8, o0, AO // load real,imag from A
951 lxvd2x vs9, o16, AO // load real,imag from A
952 lxvd2x vs10, o32, AO // load real,imag from A
953 lxvd2x vs11, o48, AO // load real,imag from A
957 lxvdsx vs20, o0, BO // load real part from B
958 lxvdsx vs21, o8, BO // load imag part from B
959 lxvdsx vs22, o16, BO // load real part from B
960 lxvdsx vs23, o24, BO // load imag part from B
964 xvmaddadp vs32, vs0, vs16 // real*real, imag*real
965 xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
966 xvmaddadp vs34, vs1, vs16 // real*real, imag*real
967 xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
968 xvmaddadp vs36, vs2, vs16 // real*real, imag*real
969 xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
970 xvmaddadp vs38, vs3, vs16 // real*real, imag*real
971 xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
973 xvmaddadp vs40, vs0, vs18 // real*real, imag*real
974 xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag
975 xvmaddadp vs42, vs1, vs18 // real*real, imag*real
976 xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag
977 xvmaddadp vs44, vs2, vs18 // real*real, imag*real
978 xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag
979 xvmaddadp vs46, vs3, vs18 // real*real, imag*real
980 xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag
// Pipelined N=2, M=4 step. The `.macro` line is not visible in this chunk;
// presumably KERNEL2x4_2 -- TODO confirm.
// Refills register set vs0-vs3 (A) / vs16-vs19 (B) while accumulating the
// products of the other register set, vs8-vs11 (A) x vs20-vs23 (B), into
// vs32-vs47. Pointer-advance instructions for AO/BO are not visible here.

// Refill A: four (real,imag) pairs.
987 lxvd2x vs0, o0, AO // load real,imag from A
988 lxvd2x vs1, o16, AO // load real,imag from A
989 lxvd2x vs2, o32, AO // load real,imag from A
990 lxvd2x vs3, o48, AO // load real,imag from A
// Refill B: lxvdsx replicates the loaded double into both lanes.
994 lxvdsx vs16, o0, BO // load real part from B
995 lxvdsx vs17, o8, BO // load imag part from B
996 lxvdsx vs18, o16, BO // load real part from B
997 lxvdsx vs19, o24, BO // load imag part from B
// Accumulate A (vs8-vs11) against the first B value (vs20 = real, vs21 = imag).
// Even accumulators collect A*B.real, odd accumulators collect A*B.imag.
1001 xvmaddadp vs32, vs8, vs20 // real*real, imag*real
1002 xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
1003 xvmaddadp vs34, vs9, vs20 // real*real, imag*real
1004 xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
1005 xvmaddadp vs36, vs10, vs20 // real*real, imag*real
1006 xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
1007 xvmaddadp vs38, vs11, vs20 // real*real, imag*real
1008 xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
// Same A values against the second B value (vs22 = real, vs23 = imag).
1010 xvmaddadp vs40, vs8, vs22 // real*real, imag*real
1011 xvmaddadp vs41, vs8, vs23 // real*imag, imag*imag
1012 xvmaddadp vs42, vs9, vs22 // real*real, imag*real
1013 xvmaddadp vs43, vs9, vs23 // real*imag, imag*imag
1014 xvmaddadp vs44, vs10, vs22 // real*real, imag*real
1015 xvmaddadp vs45, vs10, vs23 // real*imag, imag*imag
1016 xvmaddadp vs46, vs11, vs22 // real*real, imag*real
1017 xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag
// Pipeline drain for N=2, M=4. The `.macro` line is not visible in this chunk;
// presumably KERNEL2x4_E2 -- TODO confirm.
// Performs no loads: only accumulates the already-loaded register set
// vs8-vs11 (A) x vs20-vs23 (B) into vs32-vs47.

// First B value (vs20 = real, vs21 = imag).
1025 xvmaddadp vs32, vs8, vs20 // real*real, imag*real
1026 xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
1027 xvmaddadp vs34, vs9, vs20 // real*real, imag*real
1028 xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
1029 xvmaddadp vs36, vs10, vs20 // real*real, imag*real
1030 xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
1031 xvmaddadp vs38, vs11, vs20 // real*real, imag*real
1032 xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
// Second B value (vs22 = real, vs23 = imag).
1034 xvmaddadp vs40, vs8, vs22 // real*real, imag*real
1035 xvmaddadp vs41, vs8, vs23 // real*imag, imag*imag
1036 xvmaddadp vs42, vs9, vs22 // real*real, imag*real
1037 xvmaddadp vs43, vs9, vs23 // real*imag, imag*imag
1038 xvmaddadp vs44, vs10, vs22 // real*real, imag*real
1039 xvmaddadp vs45, vs10, vs23 // real*imag, imag*imag
1040 xvmaddadp vs46, vs11, vs22 // real*real, imag*real
1041 xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag
// KERNEL2x4_SUBI1: non-pipelined first step for N=2, M=4.
// Loads four A pairs and two B values, then INITIALISES vs32-vs47 with their
// products (xvmuldp overwrites, it does not accumulate).
// Pointer-advance instructions and the closing `.endm` are not visible in
// this chunk.
1046 .macro KERNEL2x4_SUBI1
// A: four (real,imag) pairs.
1048 lxvd2x vs0, o0, AO // load real,imag from A
1049 lxvd2x vs1, o16, AO // load real,imag from A
1050 lxvd2x vs2, o32, AO // load real,imag from A
1051 lxvd2x vs3, o48, AO // load real,imag from A
// B: each double replicated into both lanes by lxvdsx.
1055 lxvdsx vs16, o0, BO // load real part from B
1056 lxvdsx vs17, o8, BO // load imag part from B
1057 lxvdsx vs18, o16, BO // load real part from B
1058 lxvdsx vs19, o24, BO // load imag part from B
// Products against the first B value (vs16 = real, vs17 = imag).
1062 xvmuldp vs32, vs0, vs16 // real*real, imag*real
1063 xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
1064 xvmuldp vs34, vs1, vs16 // real*real, imag*real
1065 xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
1066 xvmuldp vs36, vs2, vs16 // real*real, imag*real
1067 xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
1068 xvmuldp vs38, vs3, vs16 // real*real, imag*real
1069 xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
// Products against the second B value (vs18 = real, vs19 = imag).
1071 xvmuldp vs40, vs0, vs18 // real*real, imag*real
1072 xvmuldp vs41, vs0, vs19 // real*imag, imag*imag
1073 xvmuldp vs42, vs1, vs18 // real*real, imag*real
1074 xvmuldp vs43, vs1, vs19 // real*imag, imag*imag
1075 xvmuldp vs44, vs2, vs18 // real*real, imag*real
1076 xvmuldp vs45, vs2, vs19 // real*imag, imag*imag
1077 xvmuldp vs46, vs3, vs18 // real*real, imag*real
1078 xvmuldp vs47, vs3, vs19 // real*imag, imag*imag
// KERNEL2x4_SUB1: non-pipelined accumulate step for N=2, M=4.
// Loads four A pairs and two B values, then ADDS their products to the
// running sums in vs32-vs47 (xvmaddadp: target = a*b + target).
// Pointer-advance instructions and the closing `.endm` are not visible in
// this chunk.
1083 .macro KERNEL2x4_SUB1
// A: four (real,imag) pairs.
1085 lxvd2x vs0, o0, AO // load real,imag from A
1086 lxvd2x vs1, o16, AO // load real,imag from A
1087 lxvd2x vs2, o32, AO // load real,imag from A
1088 lxvd2x vs3, o48, AO // load real,imag from A
// B: each double replicated into both lanes by lxvdsx.
1092 lxvdsx vs16, o0, BO // load real part from B
1093 lxvdsx vs17, o8, BO // load imag part from B
1094 lxvdsx vs18, o16, BO // load real part from B
1095 lxvdsx vs19, o24, BO // load imag part from B
// Accumulate against the first B value (vs16 = real, vs17 = imag).
1099 xvmaddadp vs32, vs0, vs16 // real*real, imag*real
1100 xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
1101 xvmaddadp vs34, vs1, vs16 // real*real, imag*real
1102 xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
1103 xvmaddadp vs36, vs2, vs16 // real*real, imag*real
1104 xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
1105 xvmaddadp vs38, vs3, vs16 // real*real, imag*real
1106 xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
// Accumulate against the second B value (vs18 = real, vs19 = imag).
1108 xvmaddadp vs40, vs0, vs18 // real*real, imag*real
1109 xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag
1110 xvmaddadp vs42, vs1, vs18 // real*real, imag*real
1111 xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag
1112 xvmaddadp vs44, vs2, vs18 // real*real, imag*real
1113 xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag
1114 xvmaddadp vs46, vs3, vs18 // real*real, imag*real
1115 xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag
// Write-back for the N=2, M=4 tile. The `.macro` line is not visible in this
// chunk; presumably SAVE2x4 -- TODO confirm.
// For each of the 8 result elements, the accumulator pair (even, odd) is
// reduced to one complex value, scaled by alpha, and merged into vs8-vs11.
// The four values are then added to data loaded from T1 and stored to T1.
// XSFADD_R1/R2/I1/I2 expand to xsadddp or xssubdp depending on the
// conjugation variant selected at the top of the file.
// Not visible in this chunk: the load into vs16, the store of vs8, any
// update of T1 between the two halves, and any surrounding conditionals.

// ---- First half: accumulators vs32-vs39 ----
1128 lxvd2x vs17, o16, T1 // existing data at T1 + o16
1129 lxvd2x vs18, o32, T1 // existing data at T1 + o32
1130 lxvd2x vs19, o48, T1 // existing data at T1 + o48

// Element 0: vs32/vs33 -> vs8
1135 xxlxor vs0, vs0, vs0 // vs0 = 0, real-part sum
1136 xxlxor vs1, vs1, vs1 // vs1 = 0, imag-part sum
1137 xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
1139 XSFADD_R1 vs0, vs0, vs32 // realA*realB
1140 XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
1142 xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB
1143 xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
1145 XSFADD_I1 vs1, vs1, vs32 // imagA*realB (first element after the swap above)
1146 XSFADD_I2 vs1, vs1, vs33 // realA*imagB (first element after the swap above)
// Complex multiply by alpha: (vs0 + i*vs1) * (alpha_r + i*alpha_i)
1148 xsmuldp vs4, vs0, alpha_r // real*alpha_r
1149 xsmuldp vs5, vs1, alpha_i // imag*alpha_i
1150 xsmuldp vs6, vs0, alpha_i // real*alpha_i
1151 xsmuldp vs7, vs1, alpha_r // imag*alpha_r
1153 xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
1154 xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
1155 xxpermdi vs8, vs2, vs3, 0 // merge real and imag part

// Element 1: vs34/vs35 -> vs9
1159 xxlxor vs0, vs0, vs0 // vs0 = 0, real-part sum
1160 xxlxor vs1, vs1, vs1 // vs1 = 0, imag-part sum
1161 xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
1163 XSFADD_R1 vs0, vs0, vs34 // realA*realB
1164 XSFADD_R2 vs0, vs0, vs35 // imagA*imagB
1166 xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB
1167 xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
1169 XSFADD_I1 vs1, vs1, vs34 // imagA*realB (first element after the swap above)
1170 XSFADD_I2 vs1, vs1, vs35 // realA*imagB (first element after the swap above)
1172 xsmuldp vs4, vs0, alpha_r // real*alpha_r
1173 xsmuldp vs5, vs1, alpha_i // imag*alpha_i
1174 xsmuldp vs6, vs0, alpha_i // real*alpha_i
1175 xsmuldp vs7, vs1, alpha_r // imag*alpha_r
1177 xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
1178 xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
1179 xxpermdi vs9, vs2, vs3, 0 // merge real and imag part

// Element 2: vs36/vs37 -> vs10
1183 xxlxor vs0, vs0, vs0 // vs0 = 0, real-part sum
1184 xxlxor vs1, vs1, vs1 // vs1 = 0, imag-part sum
1185 xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
1187 XSFADD_R1 vs0, vs0, vs36 // realA*realB
1188 XSFADD_R2 vs0, vs0, vs37 // imagA*imagB
1190 xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB
1191 xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
1193 XSFADD_I1 vs1, vs1, vs36 // imagA*realB (first element after the swap above)
1194 XSFADD_I2 vs1, vs1, vs37 // realA*imagB (first element after the swap above)
1196 xsmuldp vs4, vs0, alpha_r // real*alpha_r
1197 xsmuldp vs5, vs1, alpha_i // imag*alpha_i
1198 xsmuldp vs6, vs0, alpha_i // real*alpha_i
1199 xsmuldp vs7, vs1, alpha_r // imag*alpha_r
1201 xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
1202 xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
1203 xxpermdi vs10, vs2, vs3, 0 // merge real and imag part

// Element 3: vs38/vs39 -> vs11
1207 xxlxor vs0, vs0, vs0 // vs0 = 0, real-part sum
1208 xxlxor vs1, vs1, vs1 // vs1 = 0, imag-part sum
1209 xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
1211 XSFADD_R1 vs0, vs0, vs38 // realA*realB
1212 XSFADD_R2 vs0, vs0, vs39 // imagA*imagB
1214 xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB
1215 xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
1217 XSFADD_I1 vs1, vs1, vs38 // imagA*realB (first element after the swap above)
1218 XSFADD_I2 vs1, vs1, vs39 // realA*imagB (first element after the swap above)
1220 xsmuldp vs4, vs0, alpha_r // real*alpha_r
1221 xsmuldp vs5, vs1, alpha_i // imag*alpha_i
1222 xsmuldp vs6, vs0, alpha_i // real*alpha_i
1223 xsmuldp vs7, vs1, alpha_r // imag*alpha_r
1225 xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
1226 xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
1227 xxpermdi vs11, vs2, vs3, 0 // merge real and imag part

// Add the previously loaded data (vs16-vs19) and store back to T1.
1232 xvadddp vs8, vs8, vs16
1233 xvadddp vs9, vs9, vs17
1234 xvadddp vs10, vs10, vs18
1235 xvadddp vs11, vs11, vs19
1240 stxvd2x vs9, o16, T1
1241 stxvd2x vs10, o32, T1
1242 stxvd2x vs11, o48, T1

// ---- Second half: accumulators vs40-vs47 ----
1249 lxvd2x vs17, o16, T1 // existing data at T1 + o16
1250 lxvd2x vs18, o32, T1 // existing data at T1 + o32
1251 lxvd2x vs19, o48, T1 // existing data at T1 + o48

// Element 0: vs40/vs41 -> vs8
1256 xxlxor vs0, vs0, vs0 // vs0 = 0, real-part sum
1257 xxlxor vs1, vs1, vs1 // vs1 = 0, imag-part sum
1258 xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
1260 XSFADD_R1 vs0, vs0, vs40 // realA*realB
1261 XSFADD_R2 vs0, vs0, vs41 // imagA*imagB
1263 xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB
1264 xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
1266 XSFADD_I1 vs1, vs1, vs40 // imagA*realB (first element after the swap above)
1267 XSFADD_I2 vs1, vs1, vs41 // realA*imagB (first element after the swap above)
1269 xsmuldp vs4, vs0, alpha_r // real*alpha_r
1270 xsmuldp vs5, vs1, alpha_i // imag*alpha_i
1271 xsmuldp vs6, vs0, alpha_i // real*alpha_i
1272 xsmuldp vs7, vs1, alpha_r // imag*alpha_r
1274 xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
1275 xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
1276 xxpermdi vs8, vs2, vs3, 0 // merge real and imag part

// Element 1: vs42/vs43 -> vs9
1280 xxlxor vs0, vs0, vs0 // vs0 = 0, real-part sum
1281 xxlxor vs1, vs1, vs1 // vs1 = 0, imag-part sum
1282 xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
1284 XSFADD_R1 vs0, vs0, vs42 // realA*realB
1285 XSFADD_R2 vs0, vs0, vs43 // imagA*imagB
1287 xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB
1288 xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
1290 XSFADD_I1 vs1, vs1, vs42 // imagA*realB (first element after the swap above)
1291 XSFADD_I2 vs1, vs1, vs43 // realA*imagB (first element after the swap above)
1293 xsmuldp vs4, vs0, alpha_r // real*alpha_r
1294 xsmuldp vs5, vs1, alpha_i // imag*alpha_i
1295 xsmuldp vs6, vs0, alpha_i // real*alpha_i
1296 xsmuldp vs7, vs1, alpha_r // imag*alpha_r
1298 xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
1299 xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
1300 xxpermdi vs9, vs2, vs3, 0 // merge real and imag part

// Element 2: vs44/vs45 -> vs10
1304 xxlxor vs0, vs0, vs0 // vs0 = 0, real-part sum
1305 xxlxor vs1, vs1, vs1 // vs1 = 0, imag-part sum
1306 xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
1308 XSFADD_R1 vs0, vs0, vs44 // realA*realB
1309 XSFADD_R2 vs0, vs0, vs45 // imagA*imagB
1311 xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB
1312 xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
1314 XSFADD_I1 vs1, vs1, vs44 // imagA*realB (first element after the swap above)
1315 XSFADD_I2 vs1, vs1, vs45 // realA*imagB (first element after the swap above)
1317 xsmuldp vs4, vs0, alpha_r // real*alpha_r
1318 xsmuldp vs5, vs1, alpha_i // imag*alpha_i
1319 xsmuldp vs6, vs0, alpha_i // real*alpha_i
1320 xsmuldp vs7, vs1, alpha_r // imag*alpha_r
1322 xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
1323 xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
1324 xxpermdi vs10, vs2, vs3, 0 // merge real and imag part

// Element 3: vs46/vs47 -> vs11
1328 xxlxor vs0, vs0, vs0 // vs0 = 0, real-part sum
1329 xxlxor vs1, vs1, vs1 // vs1 = 0, imag-part sum
1330 xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
1332 XSFADD_R1 vs0, vs0, vs46 // realA*realB
1333 XSFADD_R2 vs0, vs0, vs47 // imagA*imagB
1335 xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB
1336 xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
1338 XSFADD_I1 vs1, vs1, vs46 // imagA*realB (first element after the swap above)
1339 XSFADD_I2 vs1, vs1, vs47 // realA*imagB (first element after the swap above)
1341 xsmuldp vs4, vs0, alpha_r // real*alpha_r
1342 xsmuldp vs5, vs1, alpha_i // imag*alpha_i
1343 xsmuldp vs6, vs0, alpha_i // real*alpha_i
1344 xsmuldp vs7, vs1, alpha_r // imag*alpha_r
1346 xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
1347 xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
1348 xxpermdi vs11, vs2, vs3, 0 // merge real and imag part

// Add the previously loaded data (vs16-vs19) and store back to T1.
1353 xvadddp vs8, vs8, vs16
1354 xvadddp vs9, vs9, vs17
1355 xvadddp vs10, vs10, vs18
1356 xvadddp vs11, vs11, vs19
1361 stxvd2x vs9, o16, T1
1362 stxvd2x vs10, o32, T1
1363 stxvd2x vs11, o48, T1
1371 /**********************************************************************************************
1372 * Macros for N=2 and M=2
1373 **********************************************************************************************/
// Initial load for the N=2, M=2 pipeline. The `.macro` line is not visible in
// this chunk; presumably LOAD2x2_1 -- TODO confirm.
// Fills register set vs0-vs1 (A) / vs16-vs19 (B); no arithmetic is done here.

// B: two complex values, each double replicated into both lanes by lxvdsx.
1377 lxvdsx vs16, o0, BO // load real part from B
1378 lxvdsx vs17, o8, BO // load imag part from B
1379 lxvdsx vs18, o16, BO // load real part from B
1380 lxvdsx vs19, o24, BO // load imag part from B
// A: two (real,imag) pairs.
1384 lxvd2x vs0, o0, AO // load real,imag from A
1385 lxvd2x vs1, o16, AO // load real,imag from A
// First pipelined step for N=2, M=2. The `.macro` line is not visible in this
// chunk; presumably KERNEL2x2_I1 -- TODO confirm.
// Loads the alternate register set vs8-vs9 (A) / vs20-vs23 (B) and
// INITIALISES vs32-vs39 with products of the set vs0-vs1 / vs16-vs19
// (xvmuldp overwrites, it does not accumulate).
1394 lxvd2x vs8, o0, AO // load real,imag from A
1395 lxvd2x vs9, o16, AO // load real,imag from A
1399 lxvdsx vs20, o0, BO // load real part from B
1400 lxvdsx vs21, o8, BO // load imag part from B
1401 lxvdsx vs22, o16, BO // load real part from B
1402 lxvdsx vs23, o24, BO // load imag part from B
// Products against the first B value (vs16 = real, vs17 = imag).
1406 xvmuldp vs32, vs0, vs16 // real*real, imag*real
1407 xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
1408 xvmuldp vs34, vs1, vs16 // real*real, imag*real
1409 xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
// Products against the second B value (vs18 = real, vs19 = imag).
1411 xvmuldp vs36, vs0, vs18 // real*real, imag*real
1412 xvmuldp vs37, vs0, vs19 // real*imag, imag*imag
1413 xvmuldp vs38, vs1, vs18 // real*real, imag*real
1414 xvmuldp vs39, vs1, vs19 // real*imag, imag*imag
// Pipelined N=2, M=2 step. The `.macro` line is not visible in this chunk;
// presumably KERNEL2x2_1 -- TODO confirm.
// Refills vs8-vs9 (A) / vs20-vs23 (B) while accumulating the products of
// vs0-vs1 (A) x vs16-vs19 (B) into vs32-vs39.
1421 lxvd2x vs8, o0, AO // load real,imag from A
1422 lxvd2x vs9, o16, AO // load real,imag from A
1426 lxvdsx vs20, o0, BO // load real part from B
1427 lxvdsx vs21, o8, BO // load imag part from B
1428 lxvdsx vs22, o16, BO // load real part from B
1429 lxvdsx vs23, o24, BO // load imag part from B
// Accumulate against the first B value (vs16 = real, vs17 = imag).
1433 xvmaddadp vs32, vs0, vs16 // real*real, imag*real
1434 xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
1435 xvmaddadp vs34, vs1, vs16 // real*real, imag*real
1436 xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
// Accumulate against the second B value (vs18 = real, vs19 = imag).
1438 xvmaddadp vs36, vs0, vs18 // real*real, imag*real
1439 xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag
1440 xvmaddadp vs38, vs1, vs18 // real*real, imag*real
1441 xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag
// Pipelined N=2, M=2 step using the other register set. The `.macro` line is
// not visible in this chunk; presumably KERNEL2x2_2 -- TODO confirm.
// Refills vs0-vs1 (A) / vs16-vs19 (B) while accumulating the products of
// vs8-vs9 (A) x vs20-vs23 (B) into vs32-vs39.
1448 lxvd2x vs0, o0, AO // load real,imag from A
1449 lxvd2x vs1, o16, AO // load real,imag from A
1453 lxvdsx vs16, o0, BO // load real part from B
1454 lxvdsx vs17, o8, BO // load imag part from B
1455 lxvdsx vs18, o16, BO // load real part from B
1456 lxvdsx vs19, o24, BO // load imag part from B
// Accumulate against the first B value (vs20 = real, vs21 = imag).
1460 xvmaddadp vs32, vs8, vs20 // real*real, imag*real
1461 xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
1462 xvmaddadp vs34, vs9, vs20 // real*real, imag*real
1463 xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
// Accumulate against the second B value (vs22 = real, vs23 = imag).
1465 xvmaddadp vs36, vs8, vs22 // real*real, imag*real
1466 xvmaddadp vs37, vs8, vs23 // real*imag, imag*imag
1467 xvmaddadp vs38, vs9, vs22 // real*real, imag*real
1468 xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag
// Pipeline drain for N=2, M=2. The `.macro` line is not visible in this chunk;
// presumably KERNEL2x2_E2 -- TODO confirm.
// Performs no loads: only accumulates vs8-vs9 (A) x vs20-vs23 (B) into
// vs32-vs39.
1476 xvmaddadp vs32, vs8, vs20 // real*real, imag*real
1477 xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
1478 xvmaddadp vs34, vs9, vs20 // real*real, imag*real
1479 xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
// Second B value (vs22 = real, vs23 = imag).
1481 xvmaddadp vs36, vs8, vs22 // real*real, imag*real
1482 xvmaddadp vs37, vs8, vs23 // real*imag, imag*imag
1483 xvmaddadp vs38, vs9, vs22 // real*real, imag*real
1484 xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag
// KERNEL2x2_SUBI1: non-pipelined first step for N=2, M=2.
// Loads two A pairs and two B values, then INITIALISES vs32-vs39 with their
// products (xvmuldp overwrites, it does not accumulate).
// Pointer-advance instructions and the closing `.endm` are not visible in
// this chunk.
1489 .macro KERNEL2x2_SUBI1
1491 lxvd2x vs0, o0, AO // load real,imag from A
1492 lxvd2x vs1, o16, AO // load real,imag from A
// B: each double replicated into both lanes by lxvdsx.
1496 lxvdsx vs16, o0, BO // load real part from B
1497 lxvdsx vs17, o8, BO // load imag part from B
1498 lxvdsx vs18, o16, BO // load real part from B
1499 lxvdsx vs19, o24, BO // load imag part from B
// Products against the first B value (vs16 = real, vs17 = imag).
1503 xvmuldp vs32, vs0, vs16 // real*real, imag*real
1504 xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
1505 xvmuldp vs34, vs1, vs16 // real*real, imag*real
1506 xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
// Products against the second B value (vs18 = real, vs19 = imag).
1508 xvmuldp vs36, vs0, vs18 // real*real, imag*real
1509 xvmuldp vs37, vs0, vs19 // real*imag, imag*imag
1510 xvmuldp vs38, vs1, vs18 // real*real, imag*real
1511 xvmuldp vs39, vs1, vs19 // real*imag, imag*imag
// KERNEL2x2_SUB1: non-pipelined accumulate step for N=2, M=2.
// Loads two A pairs and two B values, then ADDS their products to the
// running sums in vs32-vs39 (xvmaddadp: target = a*b + target).
// Pointer-advance instructions and the closing `.endm` are not visible in
// this chunk.
1516 .macro KERNEL2x2_SUB1
1518 lxvd2x vs0, o0, AO // load real,imag from A
1519 lxvd2x vs1, o16, AO // load real,imag from A
// B: each double replicated into both lanes by lxvdsx.
1523 lxvdsx vs16, o0, BO // load real part from B
1524 lxvdsx vs17, o8, BO // load imag part from B
1525 lxvdsx vs18, o16, BO // load real part from B
1526 lxvdsx vs19, o24, BO // load imag part from B
// Accumulate against the first B value (vs16 = real, vs17 = imag).
1530 xvmaddadp vs32, vs0, vs16 // real*real, imag*real
1531 xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
1532 xvmaddadp vs34, vs1, vs16 // real*real, imag*real
1533 xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
// Accumulate against the second B value (vs18 = real, vs19 = imag).
1535 xvmaddadp vs36, vs0, vs18 // real*real, imag*real
1536 xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag
1537 xvmaddadp vs38, vs1, vs18 // real*real, imag*real
1538 xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag
// Write-back for the N=2, M=2 tile. The `.macro` line is not visible in this
// chunk; presumably SAVE2x2 -- TODO confirm.
// For each of the 4 result elements, the accumulator pair (even, odd) is
// reduced to one complex value, scaled by alpha, and merged into vs8/vs9.
// The values are then added to data loaded from T1 and stored to T1.
// XSFADD_R1/R2/I1/I2 expand to xsadddp or xssubdp depending on the
// conjugation variant selected at the top of the file.
// Not visible in this chunk: the load into vs16, the store of vs8, any
// update of T1 between the two halves, and any surrounding conditionals.

// ---- First half: accumulators vs32-vs35 ----
1551 lxvd2x vs17, o16, T1 // existing data at T1 + o16

// Element 0: vs32/vs33 -> vs8
1556 xxlxor vs0, vs0, vs0 // vs0 = 0, real-part sum
1557 xxlxor vs1, vs1, vs1 // vs1 = 0, imag-part sum
1558 xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
1560 XSFADD_R1 vs0, vs0, vs32 // realA*realB
1561 XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
1563 xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB
1564 xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
1566 XSFADD_I1 vs1, vs1, vs32 // imagA*realB (first element after the swap above)
1567 XSFADD_I2 vs1, vs1, vs33 // realA*imagB (first element after the swap above)
// Complex multiply by alpha: (vs0 + i*vs1) * (alpha_r + i*alpha_i)
1569 xsmuldp vs4, vs0, alpha_r // real*alpha_r
1570 xsmuldp vs5, vs1, alpha_i // imag*alpha_i
1571 xsmuldp vs6, vs0, alpha_i // real*alpha_i
1572 xsmuldp vs7, vs1, alpha_r // imag*alpha_r
1574 xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
1575 xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
1576 xxpermdi vs8, vs2, vs3, 0 // merge real and imag part

// Element 1: vs34/vs35 -> vs9
1580 xxlxor vs0, vs0, vs0 // vs0 = 0, real-part sum
1581 xxlxor vs1, vs1, vs1 // vs1 = 0, imag-part sum
1582 xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
1584 XSFADD_R1 vs0, vs0, vs34 // realA*realB
1585 XSFADD_R2 vs0, vs0, vs35 // imagA*imagB
1587 xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB
1588 xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
1590 XSFADD_I1 vs1, vs1, vs34 // imagA*realB (first element after the swap above)
1591 XSFADD_I2 vs1, vs1, vs35 // realA*imagB (first element after the swap above)
1593 xsmuldp vs4, vs0, alpha_r // real*alpha_r
1594 xsmuldp vs5, vs1, alpha_i // imag*alpha_i
1595 xsmuldp vs6, vs0, alpha_i // real*alpha_i
1596 xsmuldp vs7, vs1, alpha_r // imag*alpha_r
1598 xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
1599 xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
1600 xxpermdi vs9, vs2, vs3, 0 // merge real and imag part

// Add the previously loaded data (vs16/vs17) and store back to T1.
1605 xvadddp vs8, vs8, vs16
1606 xvadddp vs9, vs9, vs17
1611 stxvd2x vs9, o16, T1

// ---- Second half: accumulators vs36-vs39 ----
1618 lxvd2x vs17, o16, T1 // existing data at T1 + o16

// Element 0: vs36/vs37 -> vs8
1623 xxlxor vs0, vs0, vs0 // vs0 = 0, real-part sum
1624 xxlxor vs1, vs1, vs1 // vs1 = 0, imag-part sum
1625 xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
1627 XSFADD_R1 vs0, vs0, vs36 // realA*realB
1628 XSFADD_R2 vs0, vs0, vs37 // imagA*imagB
1630 xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB
1631 xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
1633 XSFADD_I1 vs1, vs1, vs36 // imagA*realB (first element after the swap above)
1634 XSFADD_I2 vs1, vs1, vs37 // realA*imagB (first element after the swap above)
1636 xsmuldp vs4, vs0, alpha_r // real*alpha_r
1637 xsmuldp vs5, vs1, alpha_i // imag*alpha_i
1638 xsmuldp vs6, vs0, alpha_i // real*alpha_i
1639 xsmuldp vs7, vs1, alpha_r // imag*alpha_r
1641 xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
1642 xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
1643 xxpermdi vs8, vs2, vs3, 0 // merge real and imag part

// Element 1: vs38/vs39 -> vs9
1647 xxlxor vs0, vs0, vs0 // vs0 = 0, real-part sum
1648 xxlxor vs1, vs1, vs1 // vs1 = 0, imag-part sum
1649 xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
1651 XSFADD_R1 vs0, vs0, vs38 // realA*realB
1652 XSFADD_R2 vs0, vs0, vs39 // imagA*imagB
1654 xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB
1655 xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
1657 XSFADD_I1 vs1, vs1, vs38 // imagA*realB (first element after the swap above)
1658 XSFADD_I2 vs1, vs1, vs39 // realA*imagB (first element after the swap above)
1660 xsmuldp vs4, vs0, alpha_r // real*alpha_r
1661 xsmuldp vs5, vs1, alpha_i // imag*alpha_i
1662 xsmuldp vs6, vs0, alpha_i // real*alpha_i
1663 xsmuldp vs7, vs1, alpha_r // imag*alpha_r
1665 xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
1666 xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
1667 xxpermdi vs9, vs2, vs3, 0 // merge real and imag part

// Add the previously loaded data (vs16/vs17) and store back to T1.
1672 xvadddp vs8, vs8, vs16
1673 xvadddp vs9, vs9, vs17
1678 stxvd2x vs9, o16, T1
1686 /**********************************************************************************************
1687 * Macros for N=2 and M=1
1688 **********************************************************************************************/
// Initial load for the N=2, M=1 pipeline. The `.macro` line is not visible in
// this chunk; presumably LOAD2x1_1 -- TODO confirm.
// Fills register set vs0 (A) / vs16-vs19 (B); no arithmetic is done here.

// B: two complex values, each double replicated into both lanes by lxvdsx.
1692 lxvdsx vs16, o0, BO // load real part from B
1693 lxvdsx vs17, o8, BO // load imag part from B
1694 lxvdsx vs18, o16, BO // load real part from B
1695 lxvdsx vs19, o24, BO // load imag part from B
// A: one (real,imag) pair.
1699 lxvd2x vs0, o0, AO // load real,imag from A
// First pipelined step for N=2, M=1. The `.macro` line is not visible in this
// chunk; presumably KERNEL2x1_I1 -- TODO confirm.
// Loads the alternate register set vs8 (A) / vs20-vs23 (B) and INITIALISES
// vs32-vs35 with products of vs0 / vs16-vs19 (xvmuldp overwrites).
1708 lxvd2x vs8, o0, AO // load real,imag from A
1712 lxvdsx vs20, o0, BO // load real part from B
1713 lxvdsx vs21, o8, BO // load imag part from B
1714 lxvdsx vs22, o16, BO // load real part from B
1715 lxvdsx vs23, o24, BO // load imag part from B
// Products against the first B value (vs16 = real, vs17 = imag).
1719 xvmuldp vs32, vs0, vs16 // real*real, imag*real
1720 xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
// Products against the second B value (vs18 = real, vs19 = imag).
1722 xvmuldp vs34, vs0, vs18 // real*real, imag*real
1723 xvmuldp vs35, vs0, vs19 // real*imag, imag*imag
// Pipelined N=2, M=1 step. The `.macro` line is not visible in this chunk;
// presumably KERNEL2x1_1 -- TODO confirm.
// Refills vs8 (A) / vs20-vs23 (B) while accumulating the products of
// vs0 (A) x vs16-vs19 (B) into vs32-vs35.
1730 lxvd2x vs8, o0, AO // load real,imag from A
1734 lxvdsx vs20, o0, BO // load real part from B
1735 lxvdsx vs21, o8, BO // load imag part from B
1736 lxvdsx vs22, o16, BO // load real part from B
1737 lxvdsx vs23, o24, BO // load imag part from B
// Accumulate against the first B value (vs16 = real, vs17 = imag).
1741 xvmaddadp vs32, vs0, vs16 // real*real, imag*real
1742 xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
// Accumulate against the second B value (vs18 = real, vs19 = imag).
1744 xvmaddadp vs34, vs0, vs18 // real*real, imag*real
1745 xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag
// Pipelined N=2, M=1 step using the other register set. The `.macro` line is
// not visible in this chunk; presumably KERNEL2x1_2 -- TODO confirm.
// Refills vs0 (A) / vs16-vs19 (B) while accumulating the products of
// vs8 (A) x vs20-vs23 (B) into vs32-vs35.
1752 lxvd2x vs0, o0, AO // load real,imag from A
1756 lxvdsx vs16, o0, BO // load real part from B
1757 lxvdsx vs17, o8, BO // load imag part from B
1758 lxvdsx vs18, o16, BO // load real part from B
1759 lxvdsx vs19, o24, BO // load imag part from B
// Accumulate against the first B value (vs20 = real, vs21 = imag).
1763 xvmaddadp vs32, vs8, vs20 // real*real, imag*real
1764 xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
// Accumulate against the second B value (vs22 = real, vs23 = imag).
1766 xvmaddadp vs34, vs8, vs22 // real*real, imag*real
1767 xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag
// Pipeline drain for N=2, M=1. The `.macro` line is not visible in this chunk;
// presumably KERNEL2x1_E2 -- TODO confirm.
// Performs no loads: only accumulates vs8 (A) x vs20-vs23 (B) into vs32-vs35.
1775 xvmaddadp vs32, vs8, vs20 // real*real, imag*real
1776 xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
// Second B value (vs22 = real, vs23 = imag).
1778 xvmaddadp vs34, vs8, vs22 // real*real, imag*real
1779 xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag
// KERNEL2x1_SUBI1: non-pipelined first step for N=2, M=1.
// Loads one A pair and two B values, then INITIALISES vs32-vs35 with their
// products (xvmuldp overwrites, it does not accumulate).
// Pointer-advance instructions and the closing `.endm` are not visible in
// this chunk.
1784 .macro KERNEL2x1_SUBI1
1786 lxvd2x vs0, o0, AO // load real,imag from A
// B: each double replicated into both lanes by lxvdsx.
1790 lxvdsx vs16, o0, BO // load real part from B
1791 lxvdsx vs17, o8, BO // load imag part from B
1792 lxvdsx vs18, o16, BO // load real part from B
1793 lxvdsx vs19, o24, BO // load imag part from B
// Products against the first B value (vs16 = real, vs17 = imag).
1797 xvmuldp vs32, vs0, vs16 // real*real, imag*real
1798 xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
// Products against the second B value (vs18 = real, vs19 = imag).
1800 xvmuldp vs34, vs0, vs18 // real*real, imag*real
1801 xvmuldp vs35, vs0, vs19 // real*imag, imag*imag
// KERNEL2x1_SUB1: non-pipelined accumulate step for N=2, M=1.
// Loads one A pair and two B values, then ADDS their products to the running
// sums in vs32-vs35 (xvmaddadp: target = a*b + target).
// Pointer-advance instructions and the closing `.endm` are not visible in
// this chunk.
1806 .macro KERNEL2x1_SUB1
1808 lxvd2x vs0, o0, AO // load real,imag from A
// B: each double replicated into both lanes by lxvdsx.
1812 lxvdsx vs16, o0, BO // load real part from B
1813 lxvdsx vs17, o8, BO // load imag part from B
1814 lxvdsx vs18, o16, BO // load real part from B
1815 lxvdsx vs19, o24, BO // load imag part from B
// Accumulate against the first B value (vs16 = real, vs17 = imag).
1819 xvmaddadp vs32, vs0, vs16 // real*real, imag*real
1820 xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
// Accumulate against the second B value (vs18 = real, vs19 = imag).
1822 xvmaddadp vs34, vs0, vs18 // real*real, imag*real
1823 xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag
// Write-back for the N=2, M=1 tile. The `.macro` line is not visible in this
// chunk; presumably SAVE2x1 -- TODO confirm.
// Each accumulator pair is reduced to one complex value, scaled by alpha,
// merged into vs8 and then added to vs16.
// XSFADD_R1/R2/I1/I2 expand to xsadddp or xssubdp depending on the
// conjugation variant selected at the top of the file.
// Not visible in this chunk: where vs16 is loaded, where vs8 is stored, and
// any surrounding conditionals -- NOTE(review): presumably mirrors the
// T1 load/store pattern of the wider save sequences; confirm.

// ---- First result: vs32/vs33 -> vs8 ----
1840 xxlxor vs0, vs0, vs0 // vs0 = 0, real-part sum
1841 xxlxor vs1, vs1, vs1 // vs1 = 0, imag-part sum
1842 xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
1844 XSFADD_R1 vs0, vs0, vs32 // realA*realB
1845 XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
1847 xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB
1848 xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
1850 XSFADD_I1 vs1, vs1, vs32 // imagA*realB (first element after the swap above)
1851 XSFADD_I2 vs1, vs1, vs33 // realA*imagB (first element after the swap above)
// Complex multiply by alpha: (vs0 + i*vs1) * (alpha_r + i*alpha_i)
1853 xsmuldp vs4, vs0, alpha_r // real*alpha_r
1854 xsmuldp vs5, vs1, alpha_i // imag*alpha_i
1855 xsmuldp vs6, vs0, alpha_i // real*alpha_i
1856 xsmuldp vs7, vs1, alpha_r // imag*alpha_r
1858 xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
1859 xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
1860 xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
1865 xvadddp vs8, vs8, vs16 // add the value held in vs16

// ---- Second result: vs34/vs35 -> vs8 ----
1880 xxlxor vs0, vs0, vs0 // vs0 = 0, real-part sum
1881 xxlxor vs1, vs1, vs1 // vs1 = 0, imag-part sum
1882 xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
1884 XSFADD_R1 vs0, vs0, vs34 // realA*realB
1885 XSFADD_R2 vs0, vs0, vs35 // imagA*imagB
1887 xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB
1888 xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
1890 XSFADD_I1 vs1, vs1, vs34 // imagA*realB (first element after the swap above)
1891 XSFADD_I2 vs1, vs1, vs35 // realA*imagB (first element after the swap above)
1893 xsmuldp vs4, vs0, alpha_r // real*alpha_r
1894 xsmuldp vs5, vs1, alpha_i // imag*alpha_i
1895 xsmuldp vs6, vs0, alpha_i // real*alpha_i
1896 xsmuldp vs7, vs1, alpha_r // imag*alpha_r
1898 xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
1899 xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
1900 xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
1905 xvadddp vs8, vs8, vs16 // add the value held in vs16
1917 /**********************************************************************************************
1918 * Macros for N=1 and M=8
1919 **********************************************************************************************/
// Initial load for the N=1, M=8 pipeline. The `.macro` line is not visible in
// this chunk; presumably LOAD1x8_1 -- TODO confirm.
// Fills register set vs0-vs7 (A) / vs16-vs17 (B); no arithmetic is done here.

// B: one complex value, each double replicated into both lanes by lxvdsx.
1923 lxvdsx vs16, o0, BO // load real part from B
1924 lxvdsx vs17, o8, BO // load imag part from B
// A: first group of four (real,imag) pairs.
1928 lxvd2x vs0, o0, AO // load real,imag from A
1929 lxvd2x vs1, o16, AO // load real,imag from A
1930 lxvd2x vs2, o32, AO // load real,imag from A
1931 lxvd2x vs3, o48, AO // load real,imag from A
// A: second group of four pairs. It reuses offsets o0..o48, so AO is
// presumably advanced between the groups by an instruction that is not
// visible in this chunk -- TODO confirm.
1935 lxvd2x vs4, o0, AO // load real,imag from A
1936 lxvd2x vs5, o16, AO // load real,imag from A
1937 lxvd2x vs6, o32, AO // load real,imag from A
1938 lxvd2x vs7, o48, AO // load real,imag from A
// First pipelined step for N=1, M=8. The `.macro` line is not visible in this
// chunk; presumably KERNEL1x8_I1 -- TODO confirm.
// Loads the alternate register set vs8-vs15 (A) / vs20-vs21 (B) and
// INITIALISES vs32-vs47 with products of vs0-vs7 / vs16-vs17
// (xvmuldp overwrites, it does not accumulate).

// A: first group of four pairs.
1947 lxvd2x vs8, o0, AO // load real,imag from A
1948 lxvd2x vs9, o16, AO // load real,imag from A
1949 lxvd2x vs10, o32, AO // load real,imag from A
1950 lxvd2x vs11, o48, AO // load real,imag from A
// A: second group; same offsets, so AO is presumably advanced in between by
// an instruction not visible in this chunk -- TODO confirm.
1954 lxvd2x vs12, o0, AO // load real,imag from A
1955 lxvd2x vs13, o16, AO // load real,imag from A
1956 lxvd2x vs14, o32, AO // load real,imag from A
1957 lxvd2x vs15, o48, AO // load real,imag from A
// B: each double replicated into both lanes by lxvdsx.
1961 lxvdsx vs20, o0, BO // load real part from B
1962 lxvdsx vs21, o8, BO // load imag part from B
// Products of vs0-vs7 with B (vs16 = real, vs17 = imag).
// Even accumulators hold A*B.real, odd accumulators hold A*B.imag.
1966 xvmuldp vs32, vs0, vs16 // real*real, imag*real
1967 xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
1968 xvmuldp vs34, vs1, vs16 // real*real, imag*real
1969 xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
1970 xvmuldp vs36, vs2, vs16 // real*real, imag*real
1971 xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
1972 xvmuldp vs38, vs3, vs16 // real*real, imag*real
1973 xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
1974 xvmuldp vs40, vs4, vs16 // real*real, imag*real
1975 xvmuldp vs41, vs4, vs17 // real*imag, imag*imag
1976 xvmuldp vs42, vs5, vs16 // real*real, imag*real
1977 xvmuldp vs43, vs5, vs17 // real*imag, imag*imag
1978 xvmuldp vs44, vs6, vs16 // real*real, imag*real
1979 xvmuldp vs45, vs6, vs17 // real*imag, imag*imag
1980 xvmuldp vs46, vs7, vs16 // real*real, imag*real
1981 xvmuldp vs47, vs7, vs17 // real*imag, imag*imag
// Pipelined N=1, M=8 step. The `.macro` line is not visible in this chunk;
// presumably KERNEL1x8_1 -- TODO confirm.
// Refills vs8-vs15 (A) / vs20-vs21 (B) while accumulating the products of
// vs0-vs7 (A) x vs16-vs17 (B) into vs32-vs47.

// A: first group of four pairs.
1988 lxvd2x vs8, o0, AO // load real,imag from A
1989 lxvd2x vs9, o16, AO // load real,imag from A
1990 lxvd2x vs10, o32, AO // load real,imag from A
1991 lxvd2x vs11, o48, AO // load real,imag from A
// A: second group; same offsets, so AO is presumably advanced in between by
// an instruction not visible in this chunk -- TODO confirm.
1995 lxvd2x vs12, o0, AO // load real,imag from A
1996 lxvd2x vs13, o16, AO // load real,imag from A
1997 lxvd2x vs14, o32, AO // load real,imag from A
1998 lxvd2x vs15, o48, AO // load real,imag from A
// B: each double replicated into both lanes by lxvdsx.
2002 lxvdsx vs20, o0, BO // load real part from B
2003 lxvdsx vs21, o8, BO // load imag part from B
// Accumulate vs0-vs7 against B (vs16 = real, vs17 = imag).
2007 xvmaddadp vs32, vs0, vs16 // real*real, imag*real
2008 xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
2009 xvmaddadp vs34, vs1, vs16 // real*real, imag*real
2010 xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
2011 xvmaddadp vs36, vs2, vs16 // real*real, imag*real
2012 xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
2013 xvmaddadp vs38, vs3, vs16 // real*real, imag*real
2014 xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
2015 xvmaddadp vs40, vs4, vs16 // real*real, imag*real
2016 xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag
2017 xvmaddadp vs42, vs5, vs16 // real*real, imag*real
2018 xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag
2019 xvmaddadp vs44, vs6, vs16 // real*real, imag*real
2020 xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag
2021 xvmaddadp vs46, vs7, vs16 // real*real, imag*real
2022 xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag
// Pipelined N=1, M=8 step using the other register set. The `.macro` line is
// not visible in this chunk; presumably KERNEL1x8_2 -- TODO confirm.
// Refills vs0-vs7 (A) / vs16-vs17 (B) while accumulating the products of
// vs8-vs15 (A) x vs20-vs21 (B) into vs32-vs47.

// A: first group of four pairs.
2029 lxvd2x vs0, o0, AO // load real,imag from A
2030 lxvd2x vs1, o16, AO // load real,imag from A
2031 lxvd2x vs2, o32, AO // load real,imag from A
2032 lxvd2x vs3, o48, AO // load real,imag from A
// A: second group; same offsets, so AO is presumably advanced in between by
// an instruction not visible in this chunk -- TODO confirm.
2036 lxvd2x vs4, o0, AO // load real,imag from A
2037 lxvd2x vs5, o16, AO // load real,imag from A
2038 lxvd2x vs6, o32, AO // load real,imag from A
2039 lxvd2x vs7, o48, AO // load real,imag from A
// B: each double replicated into both lanes by lxvdsx.
2043 lxvdsx vs16, o0, BO // load real part from B
2044 lxvdsx vs17, o8, BO // load imag part from B
// Accumulate vs8-vs15 against B (vs20 = real, vs21 = imag).
2048 xvmaddadp vs32, vs8, vs20 // real*real, imag*real
2049 xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
2050 xvmaddadp vs34, vs9, vs20 // real*real, imag*real
2051 xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
2052 xvmaddadp vs36, vs10, vs20 // real*real, imag*real
2053 xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
2054 xvmaddadp vs38, vs11, vs20 // real*real, imag*real
2055 xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
2056 xvmaddadp vs40, vs12, vs20 // real*real, imag*real
2057 xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag
2058 xvmaddadp vs42, vs13, vs20 // real*real, imag*real
2059 xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag
2060 xvmaddadp vs44, vs14, vs20 // real*real, imag*real
2061 xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag
2062 xvmaddadp vs46, vs15, vs20 // real*real, imag*real
2063 xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag
// Pipeline drain for N=1, M=8. The `.macro` line is not visible in this chunk;
// presumably KERNEL1x8_E2 -- TODO confirm.
// Performs no loads: only accumulates vs8-vs15 (A) x vs20-vs21 (B) into
// vs32-vs47 (vs20 = real part of B, vs21 = imag part of B).
2071 xvmaddadp vs32, vs8, vs20 // real*real, imag*real
2072 xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
2073 xvmaddadp vs34, vs9, vs20 // real*real, imag*real
2074 xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
2075 xvmaddadp vs36, vs10, vs20 // real*real, imag*real
2076 xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
2077 xvmaddadp vs38, vs11, vs20 // real*real, imag*real
2078 xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
2079 xvmaddadp vs40, vs12, vs20 // real*real, imag*real
2080 xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag
2081 xvmaddadp vs42, vs13, vs20 // real*real, imag*real
2082 xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag
2083 xvmaddadp vs44, vs14, vs20 // real*real, imag*real
2084 xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag
2085 xvmaddadp vs46, vs15, vs20 // real*real, imag*real
2086 xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag
// KERNEL1x8_SUBI1: non-pipelined first step for N=1, M=8.
// Loads eight A pairs and one B value, then INITIALISES vs32-vs47 with their
// products (xvmuldp overwrites, it does not accumulate).
// Pointer-advance instructions and the closing `.endm` are not visible in
// this chunk.
2091 .macro KERNEL1x8_SUBI1
// A: first group of four pairs.
2093 lxvd2x vs0, o0, AO // load real,imag from A
2094 lxvd2x vs1, o16, AO // load real,imag from A
2095 lxvd2x vs2, o32, AO // load real,imag from A
2096 lxvd2x vs3, o48, AO // load real,imag from A
// A: second group; same offsets, so AO is presumably advanced in between by
// an instruction not visible in this chunk -- TODO confirm.
2100 lxvd2x vs4, o0, AO // load real,imag from A
2101 lxvd2x vs5, o16, AO // load real,imag from A
2102 lxvd2x vs6, o32, AO // load real,imag from A
2103 lxvd2x vs7, o48, AO // load real,imag from A
// B: each double replicated into both lanes by lxvdsx.
2107 lxvdsx vs16, o0, BO // load real part from B
2108 lxvdsx vs17, o8, BO // load imag part from B
// Even accumulators hold A*B.real, odd accumulators hold A*B.imag.
2112 xvmuldp vs32, vs0, vs16 // real*real, imag*real
2113 xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
2114 xvmuldp vs34, vs1, vs16 // real*real, imag*real
2115 xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
2116 xvmuldp vs36, vs2, vs16 // real*real, imag*real
2117 xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
2118 xvmuldp vs38, vs3, vs16 // real*real, imag*real
2119 xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
2120 xvmuldp vs40, vs4, vs16 // real*real, imag*real
2121 xvmuldp vs41, vs4, vs17 // real*imag, imag*imag
2122 xvmuldp vs42, vs5, vs16 // real*real, imag*real
2123 xvmuldp vs43, vs5, vs17 // real*imag, imag*imag
2124 xvmuldp vs44, vs6, vs16 // real*real, imag*real
2125 xvmuldp vs45, vs6, vs17 // real*imag, imag*imag
2126 xvmuldp vs46, vs7, vs16 // real*real, imag*real
2127 xvmuldp vs47, vs7, vs17 // real*imag, imag*imag
// KERNEL1x8_SUB1: non-pipelined accumulate step for N=1, M=8.
// Loads eight A pairs and one B value, then ADDS their products to the
// running sums in vs32-vs47 (xvmaddadp: target = a*b + target).
// Pointer-advance instructions and the closing `.endm` are not visible in
// this chunk.
2132 .macro KERNEL1x8_SUB1
// A: first group of four pairs.
2134 lxvd2x vs0, o0, AO // load real,imag from A
2135 lxvd2x vs1, o16, AO // load real,imag from A
2136 lxvd2x vs2, o32, AO // load real,imag from A
2137 lxvd2x vs3, o48, AO // load real,imag from A
// A: second group; same offsets, so AO is presumably advanced in between by
// an instruction not visible in this chunk -- TODO confirm.
2141 lxvd2x vs4, o0, AO // load real,imag from A
2142 lxvd2x vs5, o16, AO // load real,imag from A
2143 lxvd2x vs6, o32, AO // load real,imag from A
2144 lxvd2x vs7, o48, AO // load real,imag from A
// B: each double replicated into both lanes by lxvdsx.
2148 lxvdsx vs16, o0, BO // load real part from B
2149 lxvdsx vs17, o8, BO // load imag part from B
// Even accumulators collect A*B.real, odd accumulators collect A*B.imag.
2153 xvmaddadp vs32, vs0, vs16 // real*real, imag*real
2154 xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
2155 xvmaddadp vs34, vs1, vs16 // real*real, imag*real
2156 xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
2157 xvmaddadp vs36, vs2, vs16 // real*real, imag*real
2158 xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
2159 xvmaddadp vs38, vs3, vs16 // real*real, imag*real
2160 xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
2161 xvmaddadp vs40, vs4, vs16 // real*real, imag*real
2162 xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag
2163 xvmaddadp vs42, vs5, vs16 // real*real, imag*real
2164 xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag
2165 xvmaddadp vs44, vs6, vs16 // real*real, imag*real
2166 xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag
2167 xvmaddadp vs46, vs7, vs16 // real*real, imag*real
2168 xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag
// Write-back stage for the 1x8 tile. The enclosing .macro line is not visible
// in this extract -- NOTE(review): presumably SAVE1x8; confirm.
// For each of the eight accumulator pairs (vs32/vs33 ... vs46/vs47):
//   1. reduce the pair to a scalar real part (vs0) and imaginary part (vs1)
//      using the XSFADD_* add/sub selectors defined at the top of the file
//      (R1: realA*realB, R2: imagA*imagB, I1: imagA*realB, I2: realA*imagB),
//   2. multiply (vs0, vs1) by (alpha_r, alpha_i) as a complex product,
//   3. pack the real/imag result into one of vs8-vs15 with xxpermdi.
// The packed results are then added to the values read from T1/T2 and stored
// back to T1/T2.
// NOTE(review): the loads that fill vs16 and vs20 and the store of vs8 are not
// visible in this extract; confirm against the full file.
// Scratch registers overwritten here: vs0-vs7 (and vs8-vs23 as shown below).
2182 lxvd2x vs17, o16, T1
2183 lxvd2x vs18, o32, T1
2184 lxvd2x vs19, o48, T1
2186 lxvd2x vs21, o16, T2
2187 lxvd2x vs22, o32, T2
2188 lxvd2x vs23, o48, T2
// element 0: vs32/vs33 -> vs8
2193 xxlxor vs0, vs0, vs0
2194 xxlxor vs1, vs1, vs1
2195 xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
2197 XSFADD_R1 vs0, vs0, vs32 // realA*realB
2198 XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
2200 xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB
2201 xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
2203 XSFADD_I1 vs1, vs1, vs32 // imagA*realB
2204 XSFADD_I2 vs1, vs1, vs33 // realA*imagB
2206 xsmuldp vs4, vs0, alpha_r // real*alpha_r
2207 xsmuldp vs5, vs1, alpha_i // imag*alpha_i
2208 xsmuldp vs6, vs0, alpha_i // real*alpha_i
2209 xsmuldp vs7, vs1, alpha_r // imag*alpha_r
2211 xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
2212 xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
2213 xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
// element 1: vs34/vs35 -> vs9
2217 xxlxor vs0, vs0, vs0
2218 xxlxor vs1, vs1, vs1
2219 xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
2221 XSFADD_R1 vs0, vs0, vs34 // realA*realB
2222 XSFADD_R2 vs0, vs0, vs35 // imagA*imagB
2224 xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB
2225 xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
2227 XSFADD_I1 vs1, vs1, vs34 // imagA*realB
2228 XSFADD_I2 vs1, vs1, vs35 // realA*imagB
2230 xsmuldp vs4, vs0, alpha_r // real*alpha_r
2231 xsmuldp vs5, vs1, alpha_i // imag*alpha_i
2232 xsmuldp vs6, vs0, alpha_i // real*alpha_i
2233 xsmuldp vs7, vs1, alpha_r // imag*alpha_r
2235 xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
2236 xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
2237 xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
// element 2: vs36/vs37 -> vs10
2241 xxlxor vs0, vs0, vs0
2242 xxlxor vs1, vs1, vs1
2243 xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
2245 XSFADD_R1 vs0, vs0, vs36 // realA*realB
2246 XSFADD_R2 vs0, vs0, vs37 // imagA*imagB
2248 xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB
2249 xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
2251 XSFADD_I1 vs1, vs1, vs36 // imagA*realB
2252 XSFADD_I2 vs1, vs1, vs37 // realA*imagB
2254 xsmuldp vs4, vs0, alpha_r // real*alpha_r
2255 xsmuldp vs5, vs1, alpha_i // imag*alpha_i
2256 xsmuldp vs6, vs0, alpha_i // real*alpha_i
2257 xsmuldp vs7, vs1, alpha_r // imag*alpha_r
2259 xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
2260 xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
2261 xxpermdi vs10, vs2, vs3, 0 // merge real and imag part
// element 3: vs38/vs39 -> vs11
2265 xxlxor vs0, vs0, vs0
2266 xxlxor vs1, vs1, vs1
2267 xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
2269 XSFADD_R1 vs0, vs0, vs38 // realA*realB
2270 XSFADD_R2 vs0, vs0, vs39 // imagA*imagB
2272 xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB
2273 xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
2275 XSFADD_I1 vs1, vs1, vs38 // imagA*realB
2276 XSFADD_I2 vs1, vs1, vs39 // realA*imagB
2278 xsmuldp vs4, vs0, alpha_r // real*alpha_r
2279 xsmuldp vs5, vs1, alpha_i // imag*alpha_i
2280 xsmuldp vs6, vs0, alpha_i // real*alpha_i
2281 xsmuldp vs7, vs1, alpha_r // imag*alpha_r
2283 xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
2284 xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
2285 xxpermdi vs11, vs2, vs3, 0 // merge real and imag part
// element 4: vs40/vs41 -> vs12
2289 xxlxor vs0, vs0, vs0
2290 xxlxor vs1, vs1, vs1
2291 xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
2293 XSFADD_R1 vs0, vs0, vs40 // realA*realB
2294 XSFADD_R2 vs0, vs0, vs41 // imagA*imagB
2296 xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB
2297 xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
2299 XSFADD_I1 vs1, vs1, vs40 // imagA*realB
2300 XSFADD_I2 vs1, vs1, vs41 // realA*imagB
2302 xsmuldp vs4, vs0, alpha_r // real*alpha_r
2303 xsmuldp vs5, vs1, alpha_i // imag*alpha_i
2304 xsmuldp vs6, vs0, alpha_i // real*alpha_i
2305 xsmuldp vs7, vs1, alpha_r // imag*alpha_r
2307 xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
2308 xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
2309 xxpermdi vs12, vs2, vs3, 0 // merge real and imag part
// element 5: vs42/vs43 -> vs13
2313 xxlxor vs0, vs0, vs0
2314 xxlxor vs1, vs1, vs1
2315 xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
2317 XSFADD_R1 vs0, vs0, vs42 // realA*realB
2318 XSFADD_R2 vs0, vs0, vs43 // imagA*imagB
2320 xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB
2321 xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
2323 XSFADD_I1 vs1, vs1, vs42 // imagA*realB
2324 XSFADD_I2 vs1, vs1, vs43 // realA*imagB
2326 xsmuldp vs4, vs0, alpha_r // real*alpha_r
2327 xsmuldp vs5, vs1, alpha_i // imag*alpha_i
2328 xsmuldp vs6, vs0, alpha_i // real*alpha_i
2329 xsmuldp vs7, vs1, alpha_r // imag*alpha_r
2331 xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
2332 xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
2333 xxpermdi vs13, vs2, vs3, 0 // merge real and imag part
// element 6: vs44/vs45 -> vs14
2337 xxlxor vs0, vs0, vs0
2338 xxlxor vs1, vs1, vs1
2339 xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
2341 XSFADD_R1 vs0, vs0, vs44 // realA*realB
2342 XSFADD_R2 vs0, vs0, vs45 // imagA*imagB
2344 xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB
2345 xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
2347 XSFADD_I1 vs1, vs1, vs44 // imagA*realB
2348 XSFADD_I2 vs1, vs1, vs45 // realA*imagB
2350 xsmuldp vs4, vs0, alpha_r // real*alpha_r
2351 xsmuldp vs5, vs1, alpha_i // imag*alpha_i
2352 xsmuldp vs6, vs0, alpha_i // real*alpha_i
2353 xsmuldp vs7, vs1, alpha_r // imag*alpha_r
2355 xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
2356 xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
2357 xxpermdi vs14, vs2, vs3, 0 // merge real and imag part
// element 7: vs46/vs47 -> vs15
2361 xxlxor vs0, vs0, vs0
2362 xxlxor vs1, vs1, vs1
2363 xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
2365 XSFADD_R1 vs0, vs0, vs46 // realA*realB
2366 XSFADD_R2 vs0, vs0, vs47 // imagA*imagB
2368 xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB
2369 xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
2371 XSFADD_I1 vs1, vs1, vs46 // imagA*realB
2372 XSFADD_I2 vs1, vs1, vs47 // realA*imagB
2374 xsmuldp vs4, vs0, alpha_r // real*alpha_r
2375 xsmuldp vs5, vs1, alpha_i // imag*alpha_i
2376 xsmuldp vs6, vs0, alpha_i // real*alpha_i
2377 xsmuldp vs7, vs1, alpha_r // imag*alpha_r
2379 xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
2380 xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
2381 xxpermdi vs15, vs2, vs3, 0 // merge real and imag part
// add the scaled results to the values held in vs16-vs23
2386 xvadddp vs8, vs8, vs16
2387 xvadddp vs9, vs9, vs17
2388 xvadddp vs10, vs10, vs18
2389 xvadddp vs11, vs11, vs19
2390 xvadddp vs12, vs12, vs20
2391 xvadddp vs13, vs13, vs21
2392 xvadddp vs14, vs14, vs22
2393 xvadddp vs15, vs15, vs23
// store the updated values back through T1 (first four) and T2 (last four)
2398 stxvd2x vs9, o16, T1
2399 stxvd2x vs10, o32, T1
2400 stxvd2x vs11, o48, T1
2401 stxvd2x vs12, o0, T2
2402 stxvd2x vs13, o16, T2
2403 stxvd2x vs14, o32, T2
2404 stxvd2x vs15, o48, T2
2413 /**********************************************************************************************
2414 * Macros for N=1 and M=4
2415 **********************************************************************************************/
// Kernel stages for the N=1, M=4 tile. The .macro/.endm lines and any AO/BO
// pointer updates for these stages are not visible in this extract
// (NOTE(review): confirm names and boundaries against the full file).
// Two operand sets alternate so loads for one set can be issued while the
// other set is being multiplied:
//   set 0: A in vs0-vs3,  B real/imag in vs16/vs17
//   set 1: A in vs8-vs11, B real/imag in vs20/vs21
// Accumulators are vs32-vs39 (even: A*realB, odd: A*imagB).
//
// Stage: preload set 0.
2419 lxvdsx vs16, o0, BO // load real part from B
2420 lxvdsx vs17, o8, BO // load imag part from B
2424 lxvd2x vs0, o0, AO // load real,imag from A
2425 lxvd2x vs1, o16, AO // load real,imag from A
2426 lxvd2x vs2, o32, AO // load real,imag from A
2427 lxvd2x vs3, o48, AO // load real,imag from A
// Stage: load set 1, then initialise the accumulators from set 0 (xvmuldp
// overwrites vs32-vs39 rather than adding to them).
2436 lxvd2x vs8, o0, AO // load real,imag from A
2437 lxvd2x vs9, o16, AO // load real,imag from A
2438 lxvd2x vs10, o32, AO // load real,imag from A
2439 lxvd2x vs11, o48, AO // load real,imag from A
2443 lxvdsx vs20, o0, BO // load real part from B
2444 lxvdsx vs21, o8, BO // load imag part from B
2448 xvmuldp vs32, vs0, vs16 // real*real, imag*real
2449 xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
2450 xvmuldp vs34, vs1, vs16 // real*real, imag*real
2451 xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
2452 xvmuldp vs36, vs2, vs16 // real*real, imag*real
2453 xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
2454 xvmuldp vs38, vs3, vs16 // real*real, imag*real
2455 xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
// Stage: load set 1, accumulate with set 0.
2462 lxvd2x vs8, o0, AO // load real,imag from A
2463 lxvd2x vs9, o16, AO // load real,imag from A
2464 lxvd2x vs10, o32, AO // load real,imag from A
2465 lxvd2x vs11, o48, AO // load real,imag from A
2469 lxvdsx vs20, o0, BO // load real part from B
2470 lxvdsx vs21, o8, BO // load imag part from B
2474 xvmaddadp vs32, vs0, vs16 // real*real, imag*real
2475 xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
2476 xvmaddadp vs34, vs1, vs16 // real*real, imag*real
2477 xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
2478 xvmaddadp vs36, vs2, vs16 // real*real, imag*real
2479 xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
2480 xvmaddadp vs38, vs3, vs16 // real*real, imag*real
2481 xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
// Stage: load set 0, accumulate with set 1.
2488 lxvd2x vs0, o0, AO // load real,imag from A
2489 lxvd2x vs1, o16, AO // load real,imag from A
2490 lxvd2x vs2, o32, AO // load real,imag from A
2491 lxvd2x vs3, o48, AO // load real,imag from A
2495 lxvdsx vs16, o0, BO // load real part from B
2496 lxvdsx vs17, o8, BO // load imag part from B
2500 xvmaddadp vs32, vs8, vs20 // real*real, imag*real
2501 xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
2502 xvmaddadp vs34, vs9, vs20 // real*real, imag*real
2503 xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
2504 xvmaddadp vs36, vs10, vs20 // real*real, imag*real
2505 xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
2506 xvmaddadp vs38, vs11, vs20 // real*real, imag*real
2507 xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
// Stage: accumulate with set 1 only; no loads are visible for this stage.
2515 xvmaddadp vs32, vs8, vs20 // real*real, imag*real
2516 xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
2517 xvmaddadp vs34, vs9, vs20 // real*real, imag*real
2518 xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
2519 xvmaddadp vs36, vs10, vs20 // real*real, imag*real
2520 xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
2521 xvmaddadp vs38, vs11, vs20 // real*real, imag*real
2522 xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
// KERNEL1x4_SUBI1: initialising step for the N=1, M=4 tile.
// Loads four (real,imag) pairs of A into vs0-vs3 and the splatted real/imag
// parts of one B element into vs16/vs17, then writes the products into
// vs32-vs39 with xvmuldp, so the previous accumulator contents are discarded.
// NOTE(review): AO/BO pointer updates and the closing .endm are not visible in
// this extract.
2527 .macro KERNEL1x4_SUBI1
2529 lxvd2x vs0, o0, AO // load real,imag from A
2530 lxvd2x vs1, o16, AO // load real,imag from A
2531 lxvd2x vs2, o32, AO // load real,imag from A
2532 lxvd2x vs3, o48, AO // load real,imag from A
2536 lxvdsx vs16, o0, BO // load real part from B
2537 lxvdsx vs17, o8, BO // load imag part from B
// even accumulator = A[k]*realB, odd accumulator = A[k]*imagB
2541 xvmuldp vs32, vs0, vs16 // real*real, imag*real
2542 xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
2543 xvmuldp vs34, vs1, vs16 // real*real, imag*real
2544 xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
2545 xvmuldp vs36, vs2, vs16 // real*real, imag*real
2546 xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
2547 xvmuldp vs38, vs3, vs16 // real*real, imag*real
2548 xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
// KERNEL1x4_SUB1: accumulating step for the N=1, M=4 tile.
// Same loads as KERNEL1x4_SUBI1 (A into vs0-vs3, B real/imag splatted into
// vs16/vs17), but the products are added to vs32-vs39 with xvmaddadp instead
// of overwriting them.
// NOTE(review): AO/BO pointer updates and the closing .endm are not visible in
// this extract.
2553 .macro KERNEL1x4_SUB1
2555 lxvd2x vs0, o0, AO // load real,imag from A
2556 lxvd2x vs1, o16, AO // load real,imag from A
2557 lxvd2x vs2, o32, AO // load real,imag from A
2558 lxvd2x vs3, o48, AO // load real,imag from A
2562 lxvdsx vs16, o0, BO // load real part from B
2563 lxvdsx vs17, o8, BO // load imag part from B
// vs(32+2k) += A[k]*realB, vs(33+2k) += A[k]*imagB
2567 xvmaddadp vs32, vs0, vs16 // real*real, imag*real
2568 xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
2569 xvmaddadp vs34, vs1, vs16 // real*real, imag*real
2570 xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
2571 xvmaddadp vs36, vs2, vs16 // real*real, imag*real
2572 xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
2573 xvmaddadp vs38, vs3, vs16 // real*real, imag*real
2574 xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
// Write-back stage for the 1x4 tile. The enclosing .macro line is not visible
// in this extract -- NOTE(review): presumably SAVE1x4; confirm.
// For each of the four accumulator pairs (vs32/vs33 ... vs38/vs39):
//   1. reduce the pair to a scalar real part (vs0) and imaginary part (vs1)
//      using the XSFADD_* add/sub selectors defined at the top of the file
//      (R1: realA*realB, R2: imagA*imagB, I1: imagA*realB, I2: realA*imagB),
//   2. multiply (vs0, vs1) by (alpha_r, alpha_i) as a complex product,
//   3. pack the real/imag result into one of vs8-vs11 with xxpermdi.
// The packed results are then added to the values read from T1 and stored
// back to T1.
// NOTE(review): the load that fills vs16 and the store of vs8 are not visible
// in this extract; confirm against the full file.
2587 lxvd2x vs17, o16, T1
2588 lxvd2x vs18, o32, T1
2589 lxvd2x vs19, o48, T1
// element 0: vs32/vs33 -> vs8
2594 xxlxor vs0, vs0, vs0
2595 xxlxor vs1, vs1, vs1
2596 xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
2598 XSFADD_R1 vs0, vs0, vs32 // realA*realB
2599 XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
2601 xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB
2602 xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
2604 XSFADD_I1 vs1, vs1, vs32 // imagA*realB
2605 XSFADD_I2 vs1, vs1, vs33 // realA*imagB
2607 xsmuldp vs4, vs0, alpha_r // real*alpha_r
2608 xsmuldp vs5, vs1, alpha_i // imag*alpha_i
2609 xsmuldp vs6, vs0, alpha_i // real*alpha_i
2610 xsmuldp vs7, vs1, alpha_r // imag*alpha_r
2612 xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
2613 xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
2614 xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
// element 1: vs34/vs35 -> vs9
2618 xxlxor vs0, vs0, vs0
2619 xxlxor vs1, vs1, vs1
2620 xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
2622 XSFADD_R1 vs0, vs0, vs34 // realA*realB
2623 XSFADD_R2 vs0, vs0, vs35 // imagA*imagB
2625 xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB
2626 xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
2628 XSFADD_I1 vs1, vs1, vs34 // imagA*realB
2629 XSFADD_I2 vs1, vs1, vs35 // realA*imagB
2631 xsmuldp vs4, vs0, alpha_r // real*alpha_r
2632 xsmuldp vs5, vs1, alpha_i // imag*alpha_i
2633 xsmuldp vs6, vs0, alpha_i // real*alpha_i
2634 xsmuldp vs7, vs1, alpha_r // imag*alpha_r
2636 xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
2637 xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
2638 xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
// element 2: vs36/vs37 -> vs10
2642 xxlxor vs0, vs0, vs0
2643 xxlxor vs1, vs1, vs1
2644 xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
2646 XSFADD_R1 vs0, vs0, vs36 // realA*realB
2647 XSFADD_R2 vs0, vs0, vs37 // imagA*imagB
2649 xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB
2650 xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
2652 XSFADD_I1 vs1, vs1, vs36 // imagA*realB
2653 XSFADD_I2 vs1, vs1, vs37 // realA*imagB
2655 xsmuldp vs4, vs0, alpha_r // real*alpha_r
2656 xsmuldp vs5, vs1, alpha_i // imag*alpha_i
2657 xsmuldp vs6, vs0, alpha_i // real*alpha_i
2658 xsmuldp vs7, vs1, alpha_r // imag*alpha_r
2660 xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
2661 xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
2662 xxpermdi vs10, vs2, vs3, 0 // merge real and imag part
// element 3: vs38/vs39 -> vs11
2666 xxlxor vs0, vs0, vs0
2667 xxlxor vs1, vs1, vs1
2668 xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
2670 XSFADD_R1 vs0, vs0, vs38 // realA*realB
2671 XSFADD_R2 vs0, vs0, vs39 // imagA*imagB
2673 xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB
2674 xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
2676 XSFADD_I1 vs1, vs1, vs38 // imagA*realB
2677 XSFADD_I2 vs1, vs1, vs39 // realA*imagB
2679 xsmuldp vs4, vs0, alpha_r // real*alpha_r
2680 xsmuldp vs5, vs1, alpha_i // imag*alpha_i
2681 xsmuldp vs6, vs0, alpha_i // real*alpha_i
2682 xsmuldp vs7, vs1, alpha_r // imag*alpha_r
2684 xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
2685 xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
2686 xxpermdi vs11, vs2, vs3, 0 // merge real and imag part
// add the scaled results to the values held in vs16-vs19
2691 xvadddp vs8, vs8, vs16
2692 xvadddp vs9, vs9, vs17
2693 xvadddp vs10, vs10, vs18
2694 xvadddp vs11, vs11, vs19
// store the updated values back through T1
2699 stxvd2x vs9, o16, T1
2700 stxvd2x vs10, o32, T1
2701 stxvd2x vs11, o48, T1
2709 /**********************************************************************************************
2710 * Macros for N=1 and M=2
2711 **********************************************************************************************/
// Kernel stages for the N=1, M=2 tile. The .macro/.endm lines and any AO/BO
// pointer updates for these stages are not visible in this extract
// (NOTE(review): confirm names and boundaries against the full file).
// Two operand sets alternate:
//   set 0: A in vs0-vs1, B real/imag in vs16/vs17
//   set 1: A in vs8-vs9, B real/imag in vs20/vs21
// Accumulators are vs32-vs35 (even: A*realB, odd: A*imagB).
//
// Stage: preload set 0.
2715 lxvdsx vs16, o0, BO // load real part from B
2716 lxvdsx vs17, o8, BO // load imag part from B
2720 lxvd2x vs0, o0, AO // load real,imag from A
2721 lxvd2x vs1, o16, AO // load real,imag from A
// Stage: load set 1, then initialise the accumulators from set 0 (xvmuldp
// overwrites vs32-vs35 rather than adding to them).
2730 lxvd2x vs8, o0, AO // load real,imag from A
2731 lxvd2x vs9, o16, AO // load real,imag from A
2735 lxvdsx vs20, o0, BO // load real part from B
2736 lxvdsx vs21, o8, BO // load imag part from B
2740 xvmuldp vs32, vs0, vs16 // real*real, imag*real
2741 xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
2742 xvmuldp vs34, vs1, vs16 // real*real, imag*real
2743 xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
// Stage: load set 1, accumulate with set 0.
2750 lxvd2x vs8, o0, AO // load real,imag from A
2751 lxvd2x vs9, o16, AO // load real,imag from A
2755 lxvdsx vs20, o0, BO // load real part from B
2756 lxvdsx vs21, o8, BO // load imag part from B
2760 xvmaddadp vs32, vs0, vs16 // real*real, imag*real
2761 xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
2762 xvmaddadp vs34, vs1, vs16 // real*real, imag*real
2763 xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
// Stage: load set 0, accumulate with set 1.
2770 lxvd2x vs0, o0, AO // load real,imag from A
2771 lxvd2x vs1, o16, AO // load real,imag from A
2775 lxvdsx vs16, o0, BO // load real part from B
2776 lxvdsx vs17, o8, BO // load imag part from B
2780 xvmaddadp vs32, vs8, vs20 // real*real, imag*real
2781 xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
2782 xvmaddadp vs34, vs9, vs20 // real*real, imag*real
2783 xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
// Stage: accumulate with set 1 only; no loads are visible for this stage.
2791 xvmaddadp vs32, vs8, vs20 // real*real, imag*real
2792 xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
2793 xvmaddadp vs34, vs9, vs20 // real*real, imag*real
2794 xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
// KERNEL1x2_SUBI1: initialising step for the N=1, M=2 tile.
// Loads two (real,imag) pairs of A into vs0-vs1 and the splatted real/imag
// parts of one B element into vs16/vs17, then writes the products into
// vs32-vs35 with xvmuldp, so the previous accumulator contents are discarded.
// NOTE(review): AO/BO pointer updates and the closing .endm are not visible in
// this extract.
2799 .macro KERNEL1x2_SUBI1
2801 lxvd2x vs0, o0, AO // load real,imag from A
2802 lxvd2x vs1, o16, AO // load real,imag from A
2806 lxvdsx vs16, o0, BO // load real part from B
2807 lxvdsx vs17, o8, BO // load imag part from B
// even accumulator = A[k]*realB, odd accumulator = A[k]*imagB
2811 xvmuldp vs32, vs0, vs16 // real*real, imag*real
2812 xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
2813 xvmuldp vs34, vs1, vs16 // real*real, imag*real
2814 xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
// KERNEL1x2_SUB1: accumulating step for the N=1, M=2 tile.
// Same loads as KERNEL1x2_SUBI1 (A into vs0-vs1, B real/imag splatted into
// vs16/vs17), but the products are added to vs32-vs35 with xvmaddadp instead
// of overwriting them.
// NOTE(review): AO/BO pointer updates and the closing .endm are not visible in
// this extract.
2819 .macro KERNEL1x2_SUB1
2821 lxvd2x vs0, o0, AO // load real,imag from A
2822 lxvd2x vs1, o16, AO // load real,imag from A
2826 lxvdsx vs16, o0, BO // load real part from B
2827 lxvdsx vs17, o8, BO // load imag part from B
// vs(32+2k) += A[k]*realB, vs(33+2k) += A[k]*imagB
2831 xvmaddadp vs32, vs0, vs16 // real*real, imag*real
2832 xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
2833 xvmaddadp vs34, vs1, vs16 // real*real, imag*real
2834 xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
// Write-back stage for the 1x2 tile. The enclosing .macro line is not visible
// in this extract -- NOTE(review): presumably SAVE1x2; confirm.
// For each of the two accumulator pairs (vs32/vs33, vs34/vs35):
//   1. reduce the pair to a scalar real part (vs0) and imaginary part (vs1)
//      using the XSFADD_* add/sub selectors defined at the top of the file
//      (R1: realA*realB, R2: imagA*imagB, I1: imagA*realB, I2: realA*imagB),
//   2. multiply (vs0, vs1) by (alpha_r, alpha_i) as a complex product,
//   3. pack the real/imag result into vs8 / vs9 with xxpermdi.
// The packed results are then added to the values read from T1 and stored
// back to T1.
// NOTE(review): the load that fills vs16 and the store of vs8 are not visible
// in this extract; confirm against the full file.
2847 lxvd2x vs17, o16, T1
// element 0: vs32/vs33 -> vs8
2852 xxlxor vs0, vs0, vs0
2853 xxlxor vs1, vs1, vs1
2854 xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
2856 XSFADD_R1 vs0, vs0, vs32 // realA*realB
2857 XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
2859 xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB
2860 xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
2862 XSFADD_I1 vs1, vs1, vs32 // imagA*realB
2863 XSFADD_I2 vs1, vs1, vs33 // realA*imagB
2865 xsmuldp vs4, vs0, alpha_r // real*alpha_r
2866 xsmuldp vs5, vs1, alpha_i // imag*alpha_i
2867 xsmuldp vs6, vs0, alpha_i // real*alpha_i
2868 xsmuldp vs7, vs1, alpha_r // imag*alpha_r
2870 xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
2871 xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
2872 xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
// element 1: vs34/vs35 -> vs9
2876 xxlxor vs0, vs0, vs0
2877 xxlxor vs1, vs1, vs1
2878 xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
2880 XSFADD_R1 vs0, vs0, vs34 // realA*realB
2881 XSFADD_R2 vs0, vs0, vs35 // imagA*imagB
2883 xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB
2884 xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
2886 XSFADD_I1 vs1, vs1, vs34 // imagA*realB
2887 XSFADD_I2 vs1, vs1, vs35 // realA*imagB
2889 xsmuldp vs4, vs0, alpha_r // real*alpha_r
2890 xsmuldp vs5, vs1, alpha_i // imag*alpha_i
2891 xsmuldp vs6, vs0, alpha_i // real*alpha_i
2892 xsmuldp vs7, vs1, alpha_r // imag*alpha_r
2894 xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
2895 xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
2896 xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
// add the scaled results to the values held in vs16/vs17
2901 xvadddp vs8, vs8, vs16
2902 xvadddp vs9, vs9, vs17
// store the updated value back through T1
2907 stxvd2x vs9, o16, T1
2915 /**********************************************************************************************
2916 * Macros for N=1 and M=1
2917 **********************************************************************************************/
// Kernel stages for the N=1, M=1 tile. The .macro/.endm lines and any AO/BO
// pointer updates for these stages are not visible in this extract
// (NOTE(review): confirm names and boundaries against the full file).
// Two operand sets alternate:
//   set 0: A in vs0, B real/imag in vs16/vs17
//   set 1: A in vs8, B real/imag in vs20/vs21
// Accumulators are vs32 (A*realB) and vs33 (A*imagB).
//
// Stage: preload set 0.
2921 lxvdsx vs16, o0, BO // load real part from B
2922 lxvdsx vs17, o8, BO // load imag part from B
2926 lxvd2x vs0, o0, AO // load real,imag from A
// Stage: load set 1, then initialise the accumulators from set 0 (xvmuldp
// overwrites vs32/vs33 rather than adding to them).
2935 lxvd2x vs8, o0, AO // load real,imag from A
2939 lxvdsx vs20, o0, BO // load real part from B
2940 lxvdsx vs21, o8, BO // load imag part from B
2944 xvmuldp vs32, vs0, vs16 // real*real, imag*real
2945 xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
// Stage: load set 1, accumulate with set 0.
2952 lxvd2x vs8, o0, AO // load real,imag from A
2956 lxvdsx vs20, o0, BO // load real part from B
2957 lxvdsx vs21, o8, BO // load imag part from B
2961 xvmaddadp vs32, vs0, vs16 // real*real, imag*real
2962 xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
// Stage: load set 0, accumulate with set 1.
2969 lxvd2x vs0, o0, AO // load real,imag from A
2973 lxvdsx vs16, o0, BO // load real part from B
2974 lxvdsx vs17, o8, BO // load imag part from B
2978 xvmaddadp vs32, vs8, vs20 // real*real, imag*real
2979 xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
// Stage: accumulate with set 1 only; no loads are visible for this stage.
2987 xvmaddadp vs32, vs8, vs20 // real*real, imag*real
2988 xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
// KERNEL1x1_SUBI1: initialising step for the N=1, M=1 tile.
// Loads one (real,imag) pair of A into vs0 and the splatted real/imag parts of
// one B element into vs16/vs17, then writes the products into vs32/vs33 with
// xvmuldp, so the previous accumulator contents are discarded.
// NOTE(review): AO/BO pointer updates and the closing .endm are not visible in
// this extract.
2993 .macro KERNEL1x1_SUBI1
2995 lxvd2x vs0, o0, AO // load real,imag from A
2999 lxvdsx vs16, o0, BO // load real part from B
3000 lxvdsx vs17, o8, BO // load imag part from B
// vs32 = A*realB, vs33 = A*imagB
3004 xvmuldp vs32, vs0, vs16 // real*real, imag*real
3005 xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
// KERNEL1x1_SUB1: accumulating step for the N=1, M=1 tile.
// Same loads as KERNEL1x1_SUBI1 (A into vs0, B real/imag splatted into
// vs16/vs17), but the products are added to vs32/vs33 with xvmaddadp instead
// of overwriting them.
// NOTE(review): AO/BO pointer updates and the closing .endm are not visible in
// this extract.
3010 .macro KERNEL1x1_SUB1
3012 lxvd2x vs0, o0, AO // load real,imag from A
3016 lxvdsx vs16, o0, BO // load real part from B
3017 lxvdsx vs17, o8, BO // load imag part from B
// vs32 += A*realB, vs33 += A*imagB
3021 xvmaddadp vs32, vs0, vs16 // real*real, imag*real
3022 xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
3039 xxlxor vs0, vs0, vs0
3040 xxlxor vs1, vs1, vs1
3041 xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
3043 XSFADD_R1 vs0, vs0, vs32 // realA*realB
3044 XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
3046 xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB
3047 xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
3049 XSFADD_I1 vs1, vs1, vs32 // realA*imagB
3050 XSFADD_I2 vs1, vs1, vs33 // imagA*realB
3052 xsmuldp vs4, vs0, alpha_r // real*alpha_r
3053 xsmuldp vs5, vs1, alpha_i // imag*alpha_i
3054 xsmuldp vs6, vs0, alpha_i // real*alpha_i
3055 xsmuldp vs7, vs1, alpha_r // imag*alpha_r
3057 xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
3058 xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
3059 xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
3064 xvadddp vs8, vs8, vs16