added dgemm-, dtrmm-, zgemm- and ztrmm-kernel for power8
[platform/upstream/openblas.git] / kernel / power / zgemm_macros_8x2_power8.S
1 #if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
2
3         #define XSFADD_R1       xsadddp
4         #define XSFADD_R2       xssubdp
5         #define XSFADD_I1       xsadddp
6         #define XSFADD_I2       xsadddp
7
8 #elif  defined(CN) || defined(CT) || defined(RN) || defined(RT)
9
10         #define XSFADD_R1       xsadddp
11         #define XSFADD_R2       xsadddp
12         #define XSFADD_I1       xssubdp
13         #define XSFADD_I2       xsadddp
14
15 #elif  defined(NC) || defined(TC) || defined(NR) || defined(TR)
16
17         #define XSFADD_R1       xsadddp
18         #define XSFADD_R2       xsadddp
19         #define XSFADD_I1       xsadddp
20         #define XSFADD_I2       xssubdp
21
22 #else           // CC || CR || RC || RR
23
24         #define XSFADD_R1       xsadddp
25         #define XSFADD_R2       xssubdp
26         #define XSFADD_I1       xssubdp
27         #define XSFADD_I2       xssubdp
28
29 #endif
30
31 /**********************************************************************************************
32 * Macros for N=2 and M=8
33 **********************************************************************************************/
34
35 .macro LOAD2x8_1
36
37         lxvdsx          vs16,   o0,     BO              // load real part from B
38         lxvdsx          vs17,   o8,     BO              // load imag part from B
39         lxvdsx          vs18,   o16,    BO              // load real part from B
40         lxvdsx          vs19,   o24,    BO              // load imag part from B
41
42         addi            BO,     BO,     32
43
44         lxvd2x          vs0,    o0,     AO              // load real,imag from A
45         lxvd2x          vs1,    o16,    AO              // load real,imag from A
46         lxvd2x          vs2,    o32,    AO              // load real,imag from A
47         lxvd2x          vs3,    o48,    AO              // load real,imag from A
48
49         addi            AO,     AO,     64
50
51         lxvd2x          vs4,    o0,     AO              // load real,imag from A
52         lxvd2x          vs5,    o16,    AO              // load real,imag from A
53         lxvd2x          vs6,    o32,    AO              // load real,imag from A
54         lxvd2x          vs7,    o48,    AO              // load real,imag from A
55
56         addi            AO,     AO,     64
57
58
59 .endm
60
61 .macro KERNEL2x8_I1
62
63         lxvd2x          vs8,    o0,     AO              // load real,imag from A
64         lxvd2x          vs9,    o16,    AO              // load real,imag from A
65         lxvd2x          vs10,   o32,    AO              // load real,imag from A
66         lxvd2x          vs11,   o48,    AO              // load real,imag from A
67
68         addi            AO,     AO,     64
69
70         lxvd2x          vs12,   o0,     AO              // load real,imag from A
71         lxvd2x          vs13,   o16,    AO              // load real,imag from A
72         lxvd2x          vs14,   o32,    AO              // load real,imag from A
73         lxvd2x          vs15,   o48,    AO              // load real,imag from A
74
75         addi            AO,     AO,     64
76
77         lxvdsx          vs20,   o0,     BO              // load real part from B
78         lxvdsx          vs21,   o8,     BO              // load imag part from B
79         lxvdsx          vs22,   o16,    BO              // load real part from B
80         lxvdsx          vs23,   o24,    BO              // load imag part from B
81
82         addi            BO,     BO,     32
83
84         xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
85         xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
86         xvmuldp         vs34,   vs1,    vs16            // real*real, imag*real
87         xvmuldp         vs35,   vs1,    vs17            // real*imag, imag*imag
88         xvmuldp         vs36,   vs2,    vs16            // real*real, imag*real
89         xvmuldp         vs37,   vs2,    vs17            // real*imag, imag*imag
90         xvmuldp         vs38,   vs3,    vs16            // real*real, imag*real
91         xvmuldp         vs39,   vs3,    vs17            // real*imag, imag*imag
92         xvmuldp         vs40,   vs4,    vs16            // real*real, imag*real
93         xvmuldp         vs41,   vs4,    vs17            // real*imag, imag*imag
94         xvmuldp         vs42,   vs5,    vs16            // real*real, imag*real
95         xvmuldp         vs43,   vs5,    vs17            // real*imag, imag*imag
96         xvmuldp         vs44,   vs6,    vs16            // real*real, imag*real
97         xvmuldp         vs45,   vs6,    vs17            // real*imag, imag*imag
98         xvmuldp         vs46,   vs7,    vs16            // real*real, imag*real
99         xvmuldp         vs47,   vs7,    vs17            // real*imag, imag*imag
100
101         xvmuldp         vs48,   vs0,    vs18            // real*real, imag*real
102         xvmuldp         vs49,   vs0,    vs19            // real*imag, imag*imag
103         xvmuldp         vs50,   vs1,    vs18            // real*real, imag*real
104         xvmuldp         vs51,   vs1,    vs19            // real*imag, imag*imag
105         xvmuldp         vs52,   vs2,    vs18            // real*real, imag*real
106         xvmuldp         vs53,   vs2,    vs19            // real*imag, imag*imag
107         xvmuldp         vs54,   vs3,    vs18            // real*real, imag*real
108         xvmuldp         vs55,   vs3,    vs19            // real*imag, imag*imag
109         xvmuldp         vs56,   vs4,    vs18            // real*real, imag*real
110         xvmuldp         vs57,   vs4,    vs19            // real*imag, imag*imag
111         xvmuldp         vs58,   vs5,    vs18            // real*real, imag*real
112         xvmuldp         vs59,   vs5,    vs19            // real*imag, imag*imag
113         xvmuldp         vs60,   vs6,    vs18            // real*real, imag*real
114         xvmuldp         vs61,   vs6,    vs19            // real*imag, imag*imag
115         xvmuldp         vs62,   vs7,    vs18            // real*real, imag*real
116         xvmuldp         vs63,   vs7,    vs19            // real*imag, imag*imag
117
118
119 .endm
120
121 .macro KERNEL2x8_1
122
123
124         xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
125         xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
126         xvmaddadp       vs34,   vs1,    vs16            // real*real, imag*real
127         xvmaddadp       vs35,   vs1,    vs17            // real*imag, imag*imag
128
129         lxvdsx          vs22,   o16,    BO              // load real part from B
130         lxvdsx          vs23,   o24,    BO              // load imag part from B
131
132         xvmaddadp       vs36,   vs2,    vs16            // real*real, imag*real
133         xvmaddadp       vs37,   vs2,    vs17            // real*imag, imag*imag
134         xvmaddadp       vs38,   vs3,    vs16            // real*real, imag*real
135         xvmaddadp       vs39,   vs3,    vs17            // real*imag, imag*imag
136
137         lxvd2x          vs8,    o0,     AO              // load real,imag from A
138         lxvd2x          vs9,    o16,    AO              // load real,imag from A
139
140         xvmaddadp       vs40,   vs4,    vs16            // real*real, imag*real
141         xvmaddadp       vs41,   vs4,    vs17            // real*imag, imag*imag
142         xvmaddadp       vs42,   vs5,    vs16            // real*real, imag*real
143         xvmaddadp       vs43,   vs5,    vs17            // real*imag, imag*imag
144
145         lxvd2x          vs10,   o32,    AO              // load real,imag from A
146         lxvd2x          vs11,   o48,    AO              // load real,imag from A
147
148         xvmaddadp       vs44,   vs6,    vs16            // real*real, imag*real
149         xvmaddadp       vs45,   vs6,    vs17            // real*imag, imag*imag
150
151         addi            AO,     AO,     64
152
153         xvmaddadp       vs46,   vs7,    vs16            // real*real, imag*real
154         xvmaddadp       vs47,   vs7,    vs17            // real*imag, imag*imag
155
156         xvmaddadp       vs48,   vs0,    vs18            // real*real, imag*real
157         xvmaddadp       vs49,   vs0,    vs19            // real*imag, imag*imag
158         xvmaddadp       vs50,   vs1,    vs18            // real*real, imag*real
159         xvmaddadp       vs51,   vs1,    vs19            // real*imag, imag*imag
160
161         lxvd2x          vs12,   o0,     AO              // load real,imag from A
162         lxvd2x          vs13,   o16,    AO              // load real,imag from A
163
164         xvmaddadp       vs52,   vs2,    vs18            // real*real, imag*real
165         xvmaddadp       vs53,   vs2,    vs19            // real*imag, imag*imag
166         xvmaddadp       vs54,   vs3,    vs18            // real*real, imag*real
167         xvmaddadp       vs55,   vs3,    vs19            // real*imag, imag*imag
168
169         lxvd2x          vs14,   o32,    AO              // load real,imag from A
170         lxvd2x          vs15,   o48,    AO              // load real,imag from A
171
172         xvmaddadp       vs56,   vs4,    vs18            // real*real, imag*real
173         xvmaddadp       vs57,   vs4,    vs19            // real*imag, imag*imag
174         xvmaddadp       vs58,   vs5,    vs18            // real*real, imag*real
175         xvmaddadp       vs59,   vs5,    vs19            // real*imag, imag*imag
176
177         lxvdsx          vs20,   o0,     BO              // load real part from B
178         lxvdsx          vs21,   o8,     BO              // load imag part from B
179
180         xvmaddadp       vs60,   vs6,    vs18            // real*real, imag*real
181         xvmaddadp       vs61,   vs6,    vs19            // real*imag, imag*imag
182         xvmaddadp       vs62,   vs7,    vs18            // real*real, imag*real
183         xvmaddadp       vs63,   vs7,    vs19            // real*imag, imag*imag
184
185         addi            AO,     AO,     64
186         addi            BO,     BO,     32
187
188 .endm
189
190 .macro KERNEL2x8_2
191
192
193         xvmaddadp       vs32,   vs8,    vs20            // real*real, imag*real
194         xvmaddadp       vs33,   vs8,    vs21            // real*imag, imag*imag
195         xvmaddadp       vs34,   vs9,    vs20            // real*real, imag*real
196         xvmaddadp       vs35,   vs9,    vs21            // real*imag, imag*imag
197
198         lxvdsx          vs16,   o0,     BO              // load real part from B
199         lxvdsx          vs17,   o8,     BO              // load imag part from B
200
201         xvmaddadp       vs36,   vs10,   vs20            // real*real, imag*real
202         xvmaddadp       vs37,   vs10,   vs21            // real*imag, imag*imag
203         xvmaddadp       vs38,   vs11,   vs20            // real*real, imag*real
204         xvmaddadp       vs39,   vs11,   vs21            // real*imag, imag*imag
205
206         lxvd2x          vs0,    o0,     AO              // load real,imag from A
207         lxvd2x          vs1,    o16,    AO              // load real,imag from A
208
209         xvmaddadp       vs40,   vs12,   vs20            // real*real, imag*real
210         xvmaddadp       vs41,   vs12,   vs21            // real*imag, imag*imag
211         xvmaddadp       vs42,   vs13,   vs20            // real*real, imag*real
212         xvmaddadp       vs43,   vs13,   vs21            // real*imag, imag*imag
213
214         lxvd2x          vs2,    o32,    AO              // load real,imag from A
215         lxvd2x          vs3,    o48,    AO              // load real,imag from A
216
217         xvmaddadp       vs44,   vs14,   vs20            // real*real, imag*real
218         xvmaddadp       vs45,   vs14,   vs21            // real*imag, imag*imag
219         xvmaddadp       vs46,   vs15,   vs20            // real*real, imag*real
220         xvmaddadp       vs47,   vs15,   vs21            // real*imag, imag*imag
221
222         addi            AO,     AO,     64
223
224         xvmaddadp       vs48,   vs8,    vs22            // real*real, imag*real
225         xvmaddadp       vs49,   vs8,    vs23            // real*imag, imag*imag
226         xvmaddadp       vs50,   vs9,    vs22            // real*real, imag*real
227         xvmaddadp       vs51,   vs9,    vs23            // real*imag, imag*imag
228
229         lxvd2x          vs4,    o0,     AO              // load real,imag from A
230         lxvd2x          vs5,    o16,    AO              // load real,imag from A
231
232         xvmaddadp       vs52,   vs10,   vs22            // real*real, imag*real
233         xvmaddadp       vs53,   vs10,   vs23            // real*imag, imag*imag
234         xvmaddadp       vs54,   vs11,   vs22            // real*real, imag*real
235         xvmaddadp       vs55,   vs11,   vs23            // real*imag, imag*imag
236
237         lxvd2x          vs6,    o32,    AO              // load real,imag from A
238         lxvd2x          vs7,    o48,    AO              // load real,imag from A
239
240         xvmaddadp       vs56,   vs12,   vs22            // real*real, imag*real
241         xvmaddadp       vs57,   vs12,   vs23            // real*imag, imag*imag
242         xvmaddadp       vs58,   vs13,   vs22            // real*real, imag*real
243         xvmaddadp       vs59,   vs13,   vs23            // real*imag, imag*imag
244
245         lxvdsx          vs18,   o16,    BO              // load real part from B
246         lxvdsx          vs19,   o24,    BO              // load imag part from B
247
248         xvmaddadp       vs60,   vs14,   vs22            // real*real, imag*real
249         xvmaddadp       vs61,   vs14,   vs23            // real*imag, imag*imag
250         xvmaddadp       vs62,   vs15,   vs22            // real*real, imag*real
251         xvmaddadp       vs63,   vs15,   vs23            // real*imag, imag*imag
252
253         addi            AO,     AO,     64
254         addi            BO,     BO,     32
255
256 .endm
257
258 .macro KERNEL2x8_E2
259
260
261         xvmaddadp       vs32,   vs8,    vs20            // real*real, imag*real
262         xvmaddadp       vs33,   vs8,    vs21            // real*imag, imag*imag
263         xvmaddadp       vs34,   vs9,    vs20            // real*real, imag*real
264         xvmaddadp       vs35,   vs9,    vs21            // real*imag, imag*imag
265         xvmaddadp       vs36,   vs10,   vs20            // real*real, imag*real
266         xvmaddadp       vs37,   vs10,   vs21            // real*imag, imag*imag
267         xvmaddadp       vs38,   vs11,   vs20            // real*real, imag*real
268         xvmaddadp       vs39,   vs11,   vs21            // real*imag, imag*imag
269         xvmaddadp       vs40,   vs12,   vs20            // real*real, imag*real
270         xvmaddadp       vs41,   vs12,   vs21            // real*imag, imag*imag
271         xvmaddadp       vs42,   vs13,   vs20            // real*real, imag*real
272         xvmaddadp       vs43,   vs13,   vs21            // real*imag, imag*imag
273         xvmaddadp       vs44,   vs14,   vs20            // real*real, imag*real
274         xvmaddadp       vs45,   vs14,   vs21            // real*imag, imag*imag
275         xvmaddadp       vs46,   vs15,   vs20            // real*real, imag*real
276         xvmaddadp       vs47,   vs15,   vs21            // real*imag, imag*imag
277
278         xvmaddadp       vs48,   vs8,    vs22            // real*real, imag*real
279         xvmaddadp       vs49,   vs8,    vs23            // real*imag, imag*imag
280         xvmaddadp       vs50,   vs9,    vs22            // real*real, imag*real
281         xvmaddadp       vs51,   vs9,    vs23            // real*imag, imag*imag
282         xvmaddadp       vs52,   vs10,   vs22            // real*real, imag*real
283         xvmaddadp       vs53,   vs10,   vs23            // real*imag, imag*imag
284         xvmaddadp       vs54,   vs11,   vs22            // real*real, imag*real
285         xvmaddadp       vs55,   vs11,   vs23            // real*imag, imag*imag
286         xvmaddadp       vs56,   vs12,   vs22            // real*real, imag*real
287         xvmaddadp       vs57,   vs12,   vs23            // real*imag, imag*imag
288         xvmaddadp       vs58,   vs13,   vs22            // real*real, imag*real
289         xvmaddadp       vs59,   vs13,   vs23            // real*imag, imag*imag
290         xvmaddadp       vs60,   vs14,   vs22            // real*real, imag*real
291         xvmaddadp       vs61,   vs14,   vs23            // real*imag, imag*imag
292         xvmaddadp       vs62,   vs15,   vs22            // real*real, imag*real
293         xvmaddadp       vs63,   vs15,   vs23            // real*imag, imag*imag
294
295
296 .endm
297
298 .macro KERNEL2x8_SUBI1
299
300         lxvd2x          vs0,    o0,     AO              // load real,imag from A
301         lxvd2x          vs1,    o16,    AO              // load real,imag from A
302         lxvd2x          vs2,    o32,    AO              // load real,imag from A
303         lxvd2x          vs3,    o48,    AO              // load real,imag from A
304
305         addi            AO,     AO,     64
306
307         lxvd2x          vs4,    o0,     AO              // load real,imag from A
308         lxvd2x          vs5,    o16,    AO              // load real,imag from A
309         lxvd2x          vs6,    o32,    AO              // load real,imag from A
310         lxvd2x          vs7,    o48,    AO              // load real,imag from A
311
312         addi            AO,     AO,     64
313
314         lxvdsx          vs16,   o0,     BO              // load real part from B
315         lxvdsx          vs17,   o8,     BO              // load imag part from B
316         lxvdsx          vs18,   o16,    BO              // load real part from B
317         lxvdsx          vs19,   o24,    BO              // load imag part from B
318
319         addi            BO,     BO,     32
320
321         xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
322         xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
323         xvmuldp         vs34,   vs1,    vs16            // real*real, imag*real
324         xvmuldp         vs35,   vs1,    vs17            // real*imag, imag*imag
325         xvmuldp         vs36,   vs2,    vs16            // real*real, imag*real
326         xvmuldp         vs37,   vs2,    vs17            // real*imag, imag*imag
327         xvmuldp         vs38,   vs3,    vs16            // real*real, imag*real
328         xvmuldp         vs39,   vs3,    vs17            // real*imag, imag*imag
329         xvmuldp         vs40,   vs4,    vs16            // real*real, imag*real
330         xvmuldp         vs41,   vs4,    vs17            // real*imag, imag*imag
331         xvmuldp         vs42,   vs5,    vs16            // real*real, imag*real
332         xvmuldp         vs43,   vs5,    vs17            // real*imag, imag*imag
333         xvmuldp         vs44,   vs6,    vs16            // real*real, imag*real
334         xvmuldp         vs45,   vs6,    vs17            // real*imag, imag*imag
335         xvmuldp         vs46,   vs7,    vs16            // real*real, imag*real
336         xvmuldp         vs47,   vs7,    vs17            // real*imag, imag*imag
337
338         xvmuldp         vs48,   vs0,    vs18            // real*real, imag*real
339         xvmuldp         vs49,   vs0,    vs19            // real*imag, imag*imag
340         xvmuldp         vs50,   vs1,    vs18            // real*real, imag*real
341         xvmuldp         vs51,   vs1,    vs19            // real*imag, imag*imag
342         xvmuldp         vs52,   vs2,    vs18            // real*real, imag*real
343         xvmuldp         vs53,   vs2,    vs19            // real*imag, imag*imag
344         xvmuldp         vs54,   vs3,    vs18            // real*real, imag*real
345         xvmuldp         vs55,   vs3,    vs19            // real*imag, imag*imag
346         xvmuldp         vs56,   vs4,    vs18            // real*real, imag*real
347         xvmuldp         vs57,   vs4,    vs19            // real*imag, imag*imag
348         xvmuldp         vs58,   vs5,    vs18            // real*real, imag*real
349         xvmuldp         vs59,   vs5,    vs19            // real*imag, imag*imag
350         xvmuldp         vs60,   vs6,    vs18            // real*real, imag*real
351         xvmuldp         vs61,   vs6,    vs19            // real*imag, imag*imag
352         xvmuldp         vs62,   vs7,    vs18            // real*real, imag*real
353         xvmuldp         vs63,   vs7,    vs19            // real*imag, imag*imag
354
355
356 .endm
357
358 .macro KERNEL2x8_SUB1
359
360         lxvd2x          vs0,    o0,     AO              // load real,imag from A
361         lxvd2x          vs1,    o16,    AO              // load real,imag from A
362         lxvd2x          vs2,    o32,    AO              // load real,imag from A
363         lxvd2x          vs3,    o48,    AO              // load real,imag from A
364
365         addi            AO,     AO,     64
366
367         lxvd2x          vs4,    o0,     AO              // load real,imag from A
368         lxvd2x          vs5,    o16,    AO              // load real,imag from A
369         lxvd2x          vs6,    o32,    AO              // load real,imag from A
370         lxvd2x          vs7,    o48,    AO              // load real,imag from A
371
372         addi            AO,     AO,     64
373
374         lxvdsx          vs16,   o0,     BO              // load real part from B
375         lxvdsx          vs17,   o8,     BO              // load imag part from B
376         lxvdsx          vs18,   o16,    BO              // load real part from B
377         lxvdsx          vs19,   o24,    BO              // load imag part from B
378
379         addi            BO,     BO,     32
380
381         xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
382         xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
383         xvmaddadp       vs34,   vs1,    vs16            // real*real, imag*real
384         xvmaddadp       vs35,   vs1,    vs17            // real*imag, imag*imag
385         xvmaddadp       vs36,   vs2,    vs16            // real*real, imag*real
386         xvmaddadp       vs37,   vs2,    vs17            // real*imag, imag*imag
387         xvmaddadp       vs38,   vs3,    vs16            // real*real, imag*real
388         xvmaddadp       vs39,   vs3,    vs17            // real*imag, imag*imag
389         xvmaddadp       vs40,   vs4,    vs16            // real*real, imag*real
390         xvmaddadp       vs41,   vs4,    vs17            // real*imag, imag*imag
391         xvmaddadp       vs42,   vs5,    vs16            // real*real, imag*real
392         xvmaddadp       vs43,   vs5,    vs17            // real*imag, imag*imag
393         xvmaddadp       vs44,   vs6,    vs16            // real*real, imag*real
394         xvmaddadp       vs45,   vs6,    vs17            // real*imag, imag*imag
395         xvmaddadp       vs46,   vs7,    vs16            // real*real, imag*real
396         xvmaddadp       vs47,   vs7,    vs17            // real*imag, imag*imag
397
398         xvmaddadp       vs48,   vs0,    vs18            // real*real, imag*real
399         xvmaddadp       vs49,   vs0,    vs19            // real*imag, imag*imag
400         xvmaddadp       vs50,   vs1,    vs18            // real*real, imag*real
401         xvmaddadp       vs51,   vs1,    vs19            // real*imag, imag*imag
402         xvmaddadp       vs52,   vs2,    vs18            // real*real, imag*real
403         xvmaddadp       vs53,   vs2,    vs19            // real*imag, imag*imag
404         xvmaddadp       vs54,   vs3,    vs18            // real*real, imag*real
405         xvmaddadp       vs55,   vs3,    vs19            // real*imag, imag*imag
406         xvmaddadp       vs56,   vs4,    vs18            // real*real, imag*real
407         xvmaddadp       vs57,   vs4,    vs19            // real*imag, imag*imag
408         xvmaddadp       vs58,   vs5,    vs18            // real*real, imag*real
409         xvmaddadp       vs59,   vs5,    vs19            // real*imag, imag*imag
410         xvmaddadp       vs60,   vs6,    vs18            // real*real, imag*real
411         xvmaddadp       vs61,   vs6,    vs19            // real*imag, imag*imag
412         xvmaddadp       vs62,   vs7,    vs18            // real*real, imag*real
413         xvmaddadp       vs63,   vs7,    vs19            // real*imag, imag*imag
414
415
416 .endm
417
418 .macro SAVE2x8
419
420
421         mr              T1,     CO
422         addi            T2,     T1,     64
423
424 #ifndef TRMMKERNEL
425
426         lxvd2x          vs16,   o0,     T1
427         lxvd2x          vs17,   o16,    T1
428         lxvd2x          vs18,   o32,    T1
429         lxvd2x          vs19,   o48,    T1
430         lxvd2x          vs20,   o0,     T2
431         lxvd2x          vs21,   o16,    T2
432         lxvd2x          vs22,   o32,    T2
433         lxvd2x          vs23,   o48,    T2
434
435 #endif
436
437
438         xxlxor          vs0,    vs0,    vs0
439         xxlxor          vs1,    vs1,    vs1
440         xxswapd         vs33,   vs33                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
441
442         XSFADD_R1       vs0,    vs0,    vs32            // realA*realB
443         XSFADD_R2       vs0,    vs0,    vs33            // imagA*imagB
444
445         xxswapd         vs32,   vs32                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
446         xxswapd         vs33,   vs33                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
447
448         XSFADD_I1       vs1,    vs1,    vs32            // realA*imagB
449         XSFADD_I2       vs1,    vs1,    vs33            // imagA*realB
450
451         xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
452         xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
453         xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
454         xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
455
456         xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
457         xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
458         xxpermdi        vs8,    vs2,    vs3,    0       // merge real and imag part
459
460
461
462         xxlxor          vs0,    vs0,    vs0
463         xxlxor          vs1,    vs1,    vs1
464         xxswapd         vs35,   vs35                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
465
466         XSFADD_R1       vs0,    vs0,    vs34            // realA*realB
467         XSFADD_R2       vs0,    vs0,    vs35            // imagA*imagB
468
469         xxswapd         vs34,   vs34                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
470         xxswapd         vs35,   vs35                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
471
472         XSFADD_I1       vs1,    vs1,    vs34            // realA*imagB
473         XSFADD_I2       vs1,    vs1,    vs35            // imagA*realB
474
475         xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
476         xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
477         xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
478         xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
479
480         xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
481         xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
482         xxpermdi        vs9,    vs2,    vs3,    0       // merge real and imag part
483
484
485
486         xxlxor          vs0,    vs0,    vs0
487         xxlxor          vs1,    vs1,    vs1
488         xxswapd         vs37,   vs37                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
489
490         XSFADD_R1       vs0,    vs0,    vs36            // realA*realB
491         XSFADD_R2       vs0,    vs0,    vs37            // imagA*imagB
492
493         xxswapd         vs36,   vs36                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
494         xxswapd         vs37,   vs37                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
495
496         XSFADD_I1       vs1,    vs1,    vs36            // realA*imagB
497         XSFADD_I2       vs1,    vs1,    vs37            // imagA*realB
498
499         xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
500         xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
501         xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
502         xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
503
504         xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
505         xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
506         xxpermdi        vs10,   vs2,    vs3,    0       // merge real and imag part
507
508
509
510         xxlxor          vs0,    vs0,    vs0
511         xxlxor          vs1,    vs1,    vs1
512         xxswapd         vs39,   vs39                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
513
514         XSFADD_R1       vs0,    vs0,    vs38            // realA*realB
515         XSFADD_R2       vs0,    vs0,    vs39            // imagA*imagB
516
517         xxswapd         vs38,   vs38                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
518         xxswapd         vs39,   vs39                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
519
520         XSFADD_I1       vs1,    vs1,    vs38            // realA*imagB
521         XSFADD_I2       vs1,    vs1,    vs39            // imagA*realB
522
523         xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
524         xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
525         xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
526         xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
527
528         xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
529         xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
530         xxpermdi        vs11,   vs2,    vs3,    0       // merge real and imag part
531
532
533
534         xxlxor          vs0,    vs0,    vs0
535         xxlxor          vs1,    vs1,    vs1
536         xxswapd         vs41,   vs41                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
537
538         XSFADD_R1       vs0,    vs0,    vs40            // realA*realB
539         XSFADD_R2       vs0,    vs0,    vs41            // imagA*imagB
540
541         xxswapd         vs40,   vs40                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
542         xxswapd         vs41,   vs41                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
543
544         XSFADD_I1       vs1,    vs1,    vs40            // realA*imagB
545         XSFADD_I2       vs1,    vs1,    vs41            // imagA*realB
546
547         xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
548         xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
549         xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
550         xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
551
552         xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
553         xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
554         xxpermdi        vs12,   vs2,    vs3,    0       // merge real and imag part
555
556
557
558         xxlxor          vs0,    vs0,    vs0
559         xxlxor          vs1,    vs1,    vs1
560         xxswapd         vs43,   vs43                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
561
562         XSFADD_R1       vs0,    vs0,    vs42            // realA*realB
563         XSFADD_R2       vs0,    vs0,    vs43            // imagA*imagB
564
565         xxswapd         vs42,   vs42                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
566         xxswapd         vs43,   vs43                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
567
568         XSFADD_I1       vs1,    vs1,    vs42            // realA*imagB
569         XSFADD_I2       vs1,    vs1,    vs43            // imagA*realB
570
571         xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
572         xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
573         xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
574         xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
575
576         xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
577         xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
578         xxpermdi        vs13,   vs2,    vs3,    0       // merge real and imag part
579
580
581
582         xxlxor          vs0,    vs0,    vs0
583         xxlxor          vs1,    vs1,    vs1
584         xxswapd         vs45,   vs45                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
585
586         XSFADD_R1       vs0,    vs0,    vs44            // realA*realB
587         XSFADD_R2       vs0,    vs0,    vs45            // imagA*imagB
588
589         xxswapd         vs44,   vs44                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
590         xxswapd         vs45,   vs45                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
591
592         XSFADD_I1       vs1,    vs1,    vs44            // realA*imagB
593         XSFADD_I2       vs1,    vs1,    vs45            // imagA*realB
594
595         xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
596         xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
597         xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
598         xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
599
600         xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
601         xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
602         xxpermdi        vs14,   vs2,    vs3,    0       // merge real and imag part
603
604
605
606         xxlxor          vs0,    vs0,    vs0
607         xxlxor          vs1,    vs1,    vs1
608         xxswapd         vs47,   vs47                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
609
610         XSFADD_R1       vs0,    vs0,    vs46            // realA*realB
611         XSFADD_R2       vs0,    vs0,    vs47            // imagA*imagB
612
613         xxswapd         vs46,   vs46                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
614         xxswapd         vs47,   vs47                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
615
616         XSFADD_I1       vs1,    vs1,    vs46            // realA*imagB
617         XSFADD_I2       vs1,    vs1,    vs47            // imagA*realB
618
619         xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
620         xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
621         xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
622         xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
623
624         xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
625         xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
626         xxpermdi        vs15,   vs2,    vs3,    0       // merge real and imag part
627
628
629 #ifndef TRMMKERNEL
630
631         xvadddp         vs8,    vs8,    vs16
632         xvadddp         vs9,    vs9,    vs17
633         xvadddp         vs10,   vs10,   vs18
634         xvadddp         vs11,   vs11,   vs19
635         xvadddp         vs12,   vs12,   vs20
636         xvadddp         vs13,   vs13,   vs21
637         xvadddp         vs14,   vs14,   vs22
638         xvadddp         vs15,   vs15,   vs23
639
640 #endif
641
642         stxvd2x         vs8,    o0,     T1
643         stxvd2x         vs9,    o16,    T1
644         stxvd2x         vs10,   o32,    T1
645         stxvd2x         vs11,   o48,    T1
646         stxvd2x         vs12,   o0,     T2
647         stxvd2x         vs13,   o16,    T2
648         stxvd2x         vs14,   o32,    T2
649         stxvd2x         vs15,   o48,    T2
650
651         add             T1,     T1,     LDC
652         add             T2,     T2,     LDC
653
654 #ifndef TRMMKERNEL
655
656         lxvd2x          vs16,   o0,     T1
657         lxvd2x          vs17,   o16,    T1
658         lxvd2x          vs18,   o32,    T1
659         lxvd2x          vs19,   o48,    T1
660         lxvd2x          vs20,   o0,     T2
661         lxvd2x          vs21,   o16,    T2
662         lxvd2x          vs22,   o32,    T2
663         lxvd2x          vs23,   o48,    T2
664
665 #endif
666
667
668         xxlxor          vs0,    vs0,    vs0
669         xxlxor          vs1,    vs1,    vs1
670         xxswapd         vs49,   vs49                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
671
672         XSFADD_R1       vs0,    vs0,    vs48            // realA*realB
673         XSFADD_R2       vs0,    vs0,    vs49            // imagA*imagB
674
675         xxswapd         vs48,   vs48                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
676         xxswapd         vs49,   vs49                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
677
678         XSFADD_I1       vs1,    vs1,    vs48            // realA*imagB
679         XSFADD_I2       vs1,    vs1,    vs49            // imagA*realB
680
681         xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
682         xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
683         xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
684         xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
685
686         xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
687         xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
688         xxpermdi        vs8,    vs2,    vs3,    0       // merge real and imag part
689
690
691
692         xxlxor          vs0,    vs0,    vs0
693         xxlxor          vs1,    vs1,    vs1
694         xxswapd         vs51,   vs51                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
695
696         XSFADD_R1       vs0,    vs0,    vs50            // realA*realB
697         XSFADD_R2       vs0,    vs0,    vs51            // imagA*imagB
698
699         xxswapd         vs50,   vs50                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
700         xxswapd         vs51,   vs51                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
701
702         XSFADD_I1       vs1,    vs1,    vs50            // realA*imagB
703         XSFADD_I2       vs1,    vs1,    vs51            // imagA*realB
704
705         xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
706         xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
707         xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
708         xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
709
710         xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
711         xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
712         xxpermdi        vs9,    vs2,    vs3,    0       // merge real and imag part
713
714
715
716         xxlxor          vs0,    vs0,    vs0
717         xxlxor          vs1,    vs1,    vs1
718         xxswapd         vs53,   vs53                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
719
720         XSFADD_R1       vs0,    vs0,    vs52            // realA*realB
721         XSFADD_R2       vs0,    vs0,    vs53            // imagA*imagB
722
723         xxswapd         vs52,   vs52                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
724         xxswapd         vs53,   vs53                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
725
726         XSFADD_I1       vs1,    vs1,    vs52            // realA*imagB
727         XSFADD_I2       vs1,    vs1,    vs53            // imagA*realB
728
729         xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
730         xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
731         xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
732         xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
733
734         xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
735         xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
736         xxpermdi        vs10,   vs2,    vs3,    0       // merge real and imag part
737
738
739
740         xxlxor          vs0,    vs0,    vs0
741         xxlxor          vs1,    vs1,    vs1
742         xxswapd         vs55,   vs55                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
743
744         XSFADD_R1       vs0,    vs0,    vs54            // realA*realB
745         XSFADD_R2       vs0,    vs0,    vs55            // imagA*imagB
746
747         xxswapd         vs54,   vs54                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
748         xxswapd         vs55,   vs55                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
749
750         XSFADD_I1       vs1,    vs1,    vs54            // realA*imagB
751         XSFADD_I2       vs1,    vs1,    vs55            // imagA*realB
752
753         xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
754         xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
755         xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
756         xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
757
758         xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
759         xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
760         xxpermdi        vs11,   vs2,    vs3,    0       // merge real and imag part
761
762
763
764         xxlxor          vs0,    vs0,    vs0
765         xxlxor          vs1,    vs1,    vs1
766         xxswapd         vs57,   vs57                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
767
768         XSFADD_R1       vs0,    vs0,    vs56            // realA*realB
769         XSFADD_R2       vs0,    vs0,    vs57            // imagA*imagB
770
771         xxswapd         vs56,   vs56                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
772         xxswapd         vs57,   vs57                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
773
774         XSFADD_I1       vs1,    vs1,    vs56            // realA*imagB
775         XSFADD_I2       vs1,    vs1,    vs57            // imagA*realB
776
777         xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
778         xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
779         xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
780         xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
781
782         xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
783         xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
784         xxpermdi        vs12,   vs2,    vs3,    0       // merge real and imag part
785
786
787
788         xxlxor          vs0,    vs0,    vs0
789         xxlxor          vs1,    vs1,    vs1
790         xxswapd         vs59,   vs59                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
791
792         XSFADD_R1       vs0,    vs0,    vs58            // realA*realB
793         XSFADD_R2       vs0,    vs0,    vs59            // imagA*imagB
794
795         xxswapd         vs58,   vs58                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
796         xxswapd         vs59,   vs59                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
797
798         XSFADD_I1       vs1,    vs1,    vs58            // realA*imagB
799         XSFADD_I2       vs1,    vs1,    vs59            // imagA*realB
800
801         xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
802         xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
803         xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
804         xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
805
806         xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
807         xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
808         xxpermdi        vs13,   vs2,    vs3,    0       // merge real and imag part
809
810
811
812         xxlxor          vs0,    vs0,    vs0
813         xxlxor          vs1,    vs1,    vs1
814         xxswapd         vs61,   vs61                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
815
816         XSFADD_R1       vs0,    vs0,    vs60            // realA*realB
817         XSFADD_R2       vs0,    vs0,    vs61            // imagA*imagB
818
819         xxswapd         vs60,   vs60                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
820         xxswapd         vs61,   vs61                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
821
822         XSFADD_I1       vs1,    vs1,    vs60            // realA*imagB
823         XSFADD_I2       vs1,    vs1,    vs61            // imagA*realB
824
825         xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
826         xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
827         xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
828         xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
829
830         xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
831         xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
832         xxpermdi        vs14,   vs2,    vs3,    0       // merge real and imag part
833
834
835
836         xxlxor          vs0,    vs0,    vs0
837         xxlxor          vs1,    vs1,    vs1
838         xxswapd         vs63,   vs63                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
839
840         XSFADD_R1       vs0,    vs0,    vs62            // realA*realB
841         XSFADD_R2       vs0,    vs0,    vs63            // imagA*imagB
842
843         xxswapd         vs62,   vs62                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
844         xxswapd         vs63,   vs63                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
845
846         XSFADD_I1       vs1,    vs1,    vs62            // realA*imagB
847         XSFADD_I2       vs1,    vs1,    vs63            // imagA*realB
848
849         xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
850         xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
851         xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
852         xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
853
854         xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
855         xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
856         xxpermdi        vs15,   vs2,    vs3,    0       // merge real and imag part
857
858
859 #ifndef TRMMKERNEL
860
861         xvadddp         vs8,    vs8,    vs16
862         xvadddp         vs9,    vs9,    vs17
863         xvadddp         vs10,   vs10,   vs18
864         xvadddp         vs11,   vs11,   vs19
865         xvadddp         vs12,   vs12,   vs20
866         xvadddp         vs13,   vs13,   vs21
867         xvadddp         vs14,   vs14,   vs22
868         xvadddp         vs15,   vs15,   vs23
869
870 #endif
871
872         stxvd2x         vs8,    o0,     T1
873         stxvd2x         vs9,    o16,    T1
874         stxvd2x         vs10,   o32,    T1
875         stxvd2x         vs11,   o48,    T1
876         stxvd2x         vs12,   o0,     T2
877         stxvd2x         vs13,   o16,    T2
878         stxvd2x         vs14,   o32,    T2
879         stxvd2x         vs15,   o48,    T2
880
881         add             T1,     T1,     LDC
882         add             T2,     T2,     LDC
883         addi            CO,     CO,     128
884
885 .endm
886
887
888 /**********************************************************************************************
889 * Macros for N=2 and M=4
890 **********************************************************************************************/
891
892 .macro LOAD2x4_1
893
894         lxvdsx          vs16,   o0,     BO              // load real part from B
895         lxvdsx          vs17,   o8,     BO              // load imag part from B
896         lxvdsx          vs18,   o16,    BO              // load real part from B
897         lxvdsx          vs19,   o24,    BO              // load imag part from B
898
899         addi            BO,     BO,     32
900
901         lxvd2x          vs0,    o0,     AO              // load real,imag from A
902         lxvd2x          vs1,    o16,    AO              // load real,imag from A
903         lxvd2x          vs2,    o32,    AO              // load real,imag from A
904         lxvd2x          vs3,    o48,    AO              // load real,imag from A
905
906         addi            AO,     AO,     64
907
908
909 .endm
910
911 .macro KERNEL2x4_I1
912
913         lxvd2x          vs8,    o0,     AO              // load real,imag from A
914         lxvd2x          vs9,    o16,    AO              // load real,imag from A
915         lxvd2x          vs10,   o32,    AO              // load real,imag from A
916         lxvd2x          vs11,   o48,    AO              // load real,imag from A
917
918         addi            AO,     AO,     64
919
920         lxvdsx          vs20,   o0,     BO              // load real part from B
921         lxvdsx          vs21,   o8,     BO              // load imag part from B
922         lxvdsx          vs22,   o16,    BO              // load real part from B
923         lxvdsx          vs23,   o24,    BO              // load imag part from B
924
925         addi            BO,     BO,     32
926
927         xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
928         xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
929         xvmuldp         vs34,   vs1,    vs16            // real*real, imag*real
930         xvmuldp         vs35,   vs1,    vs17            // real*imag, imag*imag
931         xvmuldp         vs36,   vs2,    vs16            // real*real, imag*real
932         xvmuldp         vs37,   vs2,    vs17            // real*imag, imag*imag
933         xvmuldp         vs38,   vs3,    vs16            // real*real, imag*real
934         xvmuldp         vs39,   vs3,    vs17            // real*imag, imag*imag
935
936         xvmuldp         vs40,   vs0,    vs18            // real*real, imag*real
937         xvmuldp         vs41,   vs0,    vs19            // real*imag, imag*imag
938         xvmuldp         vs42,   vs1,    vs18            // real*real, imag*real
939         xvmuldp         vs43,   vs1,    vs19            // real*imag, imag*imag
940         xvmuldp         vs44,   vs2,    vs18            // real*real, imag*real
941         xvmuldp         vs45,   vs2,    vs19            // real*imag, imag*imag
942         xvmuldp         vs46,   vs3,    vs18            // real*real, imag*real
943         xvmuldp         vs47,   vs3,    vs19            // real*imag, imag*imag
944
945
946 .endm
947
948 .macro KERNEL2x4_1
949
950         lxvd2x          vs8,    o0,     AO              // load real,imag from A
951         lxvd2x          vs9,    o16,    AO              // load real,imag from A
952         lxvd2x          vs10,   o32,    AO              // load real,imag from A
953         lxvd2x          vs11,   o48,    AO              // load real,imag from A
954
955         addi            AO,     AO,     64
956
957         lxvdsx          vs20,   o0,     BO              // load real part from B
958         lxvdsx          vs21,   o8,     BO              // load imag part from B
959         lxvdsx          vs22,   o16,    BO              // load real part from B
960         lxvdsx          vs23,   o24,    BO              // load imag part from B
961
962         addi            BO,     BO,     32
963
964         xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
965         xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
966         xvmaddadp       vs34,   vs1,    vs16            // real*real, imag*real
967         xvmaddadp       vs35,   vs1,    vs17            // real*imag, imag*imag
968         xvmaddadp       vs36,   vs2,    vs16            // real*real, imag*real
969         xvmaddadp       vs37,   vs2,    vs17            // real*imag, imag*imag
970         xvmaddadp       vs38,   vs3,    vs16            // real*real, imag*real
971         xvmaddadp       vs39,   vs3,    vs17            // real*imag, imag*imag
972
973         xvmaddadp       vs40,   vs0,    vs18            // real*real, imag*real
974         xvmaddadp       vs41,   vs0,    vs19            // real*imag, imag*imag
975         xvmaddadp       vs42,   vs1,    vs18            // real*real, imag*real
976         xvmaddadp       vs43,   vs1,    vs19            // real*imag, imag*imag
977         xvmaddadp       vs44,   vs2,    vs18            // real*real, imag*real
978         xvmaddadp       vs45,   vs2,    vs19            // real*imag, imag*imag
979         xvmaddadp       vs46,   vs3,    vs18            // real*real, imag*real
980         xvmaddadp       vs47,   vs3,    vs19            // real*imag, imag*imag
981
982
983 .endm
984
985 .macro KERNEL2x4_2
986
987         lxvd2x          vs0,    o0,     AO              // load real,imag from A
988         lxvd2x          vs1,    o16,    AO              // load real,imag from A
989         lxvd2x          vs2,    o32,    AO              // load real,imag from A
990         lxvd2x          vs3,    o48,    AO              // load real,imag from A
991
992         addi            AO,     AO,     64
993
994         lxvdsx          vs16,   o0,     BO              // load real part from B
995         lxvdsx          vs17,   o8,     BO              // load imag part from B
996         lxvdsx          vs18,   o16,    BO              // load real part from B
997         lxvdsx          vs19,   o24,    BO              // load imag part from B
998
999         addi            BO,     BO,     32
1000
1001         xvmaddadp       vs32,   vs8,    vs20            // real*real, imag*real
1002         xvmaddadp       vs33,   vs8,    vs21            // real*imag, imag*imag
1003         xvmaddadp       vs34,   vs9,    vs20            // real*real, imag*real
1004         xvmaddadp       vs35,   vs9,    vs21            // real*imag, imag*imag
1005         xvmaddadp       vs36,   vs10,   vs20            // real*real, imag*real
1006         xvmaddadp       vs37,   vs10,   vs21            // real*imag, imag*imag
1007         xvmaddadp       vs38,   vs11,   vs20            // real*real, imag*real
1008         xvmaddadp       vs39,   vs11,   vs21            // real*imag, imag*imag
1009
1010         xvmaddadp       vs40,   vs8,    vs22            // real*real, imag*real
1011         xvmaddadp       vs41,   vs8,    vs23            // real*imag, imag*imag
1012         xvmaddadp       vs42,   vs9,    vs22            // real*real, imag*real
1013         xvmaddadp       vs43,   vs9,    vs23            // real*imag, imag*imag
1014         xvmaddadp       vs44,   vs10,   vs22            // real*real, imag*real
1015         xvmaddadp       vs45,   vs10,   vs23            // real*imag, imag*imag
1016         xvmaddadp       vs46,   vs11,   vs22            // real*real, imag*real
1017         xvmaddadp       vs47,   vs11,   vs23            // real*imag, imag*imag
1018
1019
1020 .endm
1021
1022 .macro KERNEL2x4_E2
1023
1024
1025         xvmaddadp       vs32,   vs8,    vs20            // real*real, imag*real
1026         xvmaddadp       vs33,   vs8,    vs21            // real*imag, imag*imag
1027         xvmaddadp       vs34,   vs9,    vs20            // real*real, imag*real
1028         xvmaddadp       vs35,   vs9,    vs21            // real*imag, imag*imag
1029         xvmaddadp       vs36,   vs10,   vs20            // real*real, imag*real
1030         xvmaddadp       vs37,   vs10,   vs21            // real*imag, imag*imag
1031         xvmaddadp       vs38,   vs11,   vs20            // real*real, imag*real
1032         xvmaddadp       vs39,   vs11,   vs21            // real*imag, imag*imag
1033
1034         xvmaddadp       vs40,   vs8,    vs22            // real*real, imag*real
1035         xvmaddadp       vs41,   vs8,    vs23            // real*imag, imag*imag
1036         xvmaddadp       vs42,   vs9,    vs22            // real*real, imag*real
1037         xvmaddadp       vs43,   vs9,    vs23            // real*imag, imag*imag
1038         xvmaddadp       vs44,   vs10,   vs22            // real*real, imag*real
1039         xvmaddadp       vs45,   vs10,   vs23            // real*imag, imag*imag
1040         xvmaddadp       vs46,   vs11,   vs22            // real*real, imag*real
1041         xvmaddadp       vs47,   vs11,   vs23            // real*imag, imag*imag
1042
1043
1044 .endm
1045
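// KERNEL2x4_SUBI1 starts a fresh accumulation (xvmuldp overwrites vs32-vs47), while
// KERNEL2x4_SUB1 adds into the existing accumulators; both consume one k iteration.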
1046 .macro KERNEL2x4_SUBI1
1047
1048         lxvd2x          vs0,    o0,     AO              // load real,imag from A
1049         lxvd2x          vs1,    o16,    AO              // load real,imag from A
1050         lxvd2x          vs2,    o32,    AO              // load real,imag from A
1051         lxvd2x          vs3,    o48,    AO              // load real,imag from A
1052
1053         addi            AO,     AO,     64
1054
1055         lxvdsx          vs16,   o0,     BO              // load real part from B
1056         lxvdsx          vs17,   o8,     BO              // load imag part from B
1057         lxvdsx          vs18,   o16,    BO              // load real part from B
1058         lxvdsx          vs19,   o24,    BO              // load imag part from B
1059
1060         addi            BO,     BO,     32
1061
1062         xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
1063         xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
1064         xvmuldp         vs34,   vs1,    vs16            // real*real, imag*real
1065         xvmuldp         vs35,   vs1,    vs17            // real*imag, imag*imag
1066         xvmuldp         vs36,   vs2,    vs16            // real*real, imag*real
1067         xvmuldp         vs37,   vs2,    vs17            // real*imag, imag*imag
1068         xvmuldp         vs38,   vs3,    vs16            // real*real, imag*real
1069         xvmuldp         vs39,   vs3,    vs17            // real*imag, imag*imag
1070
1071         xvmuldp         vs40,   vs0,    vs18            // real*real, imag*real
1072         xvmuldp         vs41,   vs0,    vs19            // real*imag, imag*imag
1073         xvmuldp         vs42,   vs1,    vs18            // real*real, imag*real
1074         xvmuldp         vs43,   vs1,    vs19            // real*imag, imag*imag
1075         xvmuldp         vs44,   vs2,    vs18            // real*real, imag*real
1076         xvmuldp         vs45,   vs2,    vs19            // real*imag, imag*imag
1077         xvmuldp         vs46,   vs3,    vs18            // real*real, imag*real
1078         xvmuldp         vs47,   vs3,    vs19            // real*imag, imag*imag
1079
1080
1081 .endm
1082
1083 .macro KERNEL2x4_SUB1
1084
1085         lxvd2x          vs0,    o0,     AO              // load real,imag from A
1086         lxvd2x          vs1,    o16,    AO              // load real,imag from A
1087         lxvd2x          vs2,    o32,    AO              // load real,imag from A
1088         lxvd2x          vs3,    o48,    AO              // load real,imag from A
1089
1090         addi            AO,     AO,     64
1091
1092         lxvdsx          vs16,   o0,     BO              // load real part from B
1093         lxvdsx          vs17,   o8,     BO              // load imag part from B
1094         lxvdsx          vs18,   o16,    BO              // load real part from B
1095         lxvdsx          vs19,   o24,    BO              // load imag part from B
1096
1097         addi            BO,     BO,     32
1098
1099         xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
1100         xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
1101         xvmaddadp       vs34,   vs1,    vs16            // real*real, imag*real
1102         xvmaddadp       vs35,   vs1,    vs17            // real*imag, imag*imag
1103         xvmaddadp       vs36,   vs2,    vs16            // real*real, imag*real
1104         xvmaddadp       vs37,   vs2,    vs17            // real*imag, imag*imag
1105         xvmaddadp       vs38,   vs3,    vs16            // real*real, imag*real
1106         xvmaddadp       vs39,   vs3,    vs17            // real*imag, imag*imag
1107
1108         xvmaddadp       vs40,   vs0,    vs18            // real*real, imag*real
1109         xvmaddadp       vs41,   vs0,    vs19            // real*imag, imag*imag
1110         xvmaddadp       vs42,   vs1,    vs18            // real*real, imag*real
1111         xvmaddadp       vs43,   vs1,    vs19            // real*imag, imag*imag
1112         xvmaddadp       vs44,   vs2,    vs18            // real*real, imag*real
1113         xvmaddadp       vs45,   vs2,    vs19            // real*imag, imag*imag
1114         xvmaddadp       vs46,   vs3,    vs18            // real*real, imag*real
1115         xvmaddadp       vs47,   vs3,    vs19            // real*imag, imag*imag
1116
1117
1118 .endm
1119
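// SAVE2x4: reduce the accumulator pairs vs32-vs47 to complex results, scale by
// alpha (alpha_r/alpha_i) and write one 2x4 block of C.  Each pair (e.g. vs32/vs33)
// holds [realA*realB, imagA*realB] and [realA*imagB, imagA*imagB]; the XSFADD_*
// macros (chosen by the conjugation case defined at the top of the file) combine the
// scalar doublewords as
//      real = realA*realB (+/-) imagA*imagB
//      imag = (+/-) imagA*realB (+/-) realA*imagB
// and the value stored is
//      C_real = real*alpha_r - imag*alpha_i
//      C_imag = real*alpha_i + imag*alpha_r
// merged back into one vector with xxpermdi.  Unless TRMMKERNEL is defined, the old
// C values (loaded into vs16-vs19) are added before the store.  T1 steps through the
// two rows by LDC (which appears to be pre-scaled to bytes); CO advances by
// 64 bytes = 4 complex doubles.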
1120 .macro SAVE2x4
1121
1122
1123         mr              T1,     CO
1124
1125 #ifndef TRMMKERNEL
1126
1127         lxvd2x          vs16,   o0,     T1
1128         lxvd2x          vs17,   o16,    T1
1129         lxvd2x          vs18,   o32,    T1
1130         lxvd2x          vs19,   o48,    T1
1131
1132 #endif
1133
1134
1135         xxlxor          vs0,    vs0,    vs0
1136         xxlxor          vs1,    vs1,    vs1
1137         xxswapd         vs33,   vs33                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
1138
1139         XSFADD_R1       vs0,    vs0,    vs32            // realA*realB
1140         XSFADD_R2       vs0,    vs0,    vs33            // imagA*imagB
1141
1142         xxswapd         vs32,   vs32                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
1143         xxswapd         vs33,   vs33                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
1144
1145         XSFADD_I1       vs1,    vs1,    vs32            // imagA*realB
1146         XSFADD_I2       vs1,    vs1,    vs33            // realA*imagB
1147
1148         xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
1149         xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
1150         xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
1151         xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
1152
1153         xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
1154         xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
1155         xxpermdi        vs8,    vs2,    vs3,    0       // merge real and imag part
1156
1157
1158
1159         xxlxor          vs0,    vs0,    vs0
1160         xxlxor          vs1,    vs1,    vs1
1161         xxswapd         vs35,   vs35                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
1162
1163         XSFADD_R1       vs0,    vs0,    vs34            // realA*realB
1164         XSFADD_R2       vs0,    vs0,    vs35            // imagA*imagB
1165
1166         xxswapd         vs34,   vs34                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
1167         xxswapd         vs35,   vs35                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
1168
1169         XSFADD_I1       vs1,    vs1,    vs34            // imagA*realB
1170         XSFADD_I2       vs1,    vs1,    vs35            // realA*imagB
1171
1172         xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
1173         xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
1174         xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
1175         xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
1176
1177         xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
1178         xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
1179         xxpermdi        vs9,    vs2,    vs3,    0       // merge real and imag part
1180
1181
1182
1183         xxlxor          vs0,    vs0,    vs0
1184         xxlxor          vs1,    vs1,    vs1
1185         xxswapd         vs37,   vs37                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
1186
1187         XSFADD_R1       vs0,    vs0,    vs36            // realA*realB
1188         XSFADD_R2       vs0,    vs0,    vs37            // imagA*imagB
1189
1190         xxswapd         vs36,   vs36                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
1191         xxswapd         vs37,   vs37                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
1192
1193         XSFADD_I1       vs1,    vs1,    vs36            // imagA*realB
1194         XSFADD_I2       vs1,    vs1,    vs37            // realA*imagB
1195
1196         xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
1197         xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
1198         xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
1199         xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
1200
1201         xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
1202         xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
1203         xxpermdi        vs10,   vs2,    vs3,    0       // merge real and imag part
1204
1205
1206
1207         xxlxor          vs0,    vs0,    vs0
1208         xxlxor          vs1,    vs1,    vs1
1209         xxswapd         vs39,   vs39                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
1210
1211         XSFADD_R1       vs0,    vs0,    vs38            // realA*realB
1212         XSFADD_R2       vs0,    vs0,    vs39            // imagA*imagB
1213
1214         xxswapd         vs38,   vs38                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
1215         xxswapd         vs39,   vs39                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
1216
1217         XSFADD_I1       vs1,    vs1,    vs38            // imagA*realB
1218         XSFADD_I2       vs1,    vs1,    vs39            // realA*imagB
1219
1220         xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
1221         xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
1222         xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
1223         xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
1224
1225         xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
1226         xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
1227         xxpermdi        vs11,   vs2,    vs3,    0       // merge real and imag part
1228
1229
1230 #ifndef TRMMKERNEL
1231
1232         xvadddp         vs8,    vs8,    vs16
1233         xvadddp         vs9,    vs9,    vs17
1234         xvadddp         vs10,   vs10,   vs18
1235         xvadddp         vs11,   vs11,   vs19
1236
1237 #endif
1238
1239         stxvd2x         vs8,    o0,     T1
1240         stxvd2x         vs9,    o16,    T1
1241         stxvd2x         vs10,   o32,    T1
1242         stxvd2x         vs11,   o48,    T1
1243
1244         add             T1,     T1,     LDC
1245
1246 #ifndef TRMMKERNEL
1247
1248         lxvd2x          vs16,   o0,     T1
1249         lxvd2x          vs17,   o16,    T1
1250         lxvd2x          vs18,   o32,    T1
1251         lxvd2x          vs19,   o48,    T1
1252
1253 #endif
1254
1255
1256         xxlxor          vs0,    vs0,    vs0
1257         xxlxor          vs1,    vs1,    vs1
1258         xxswapd         vs41,   vs41                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
1259
1260         XSFADD_R1       vs0,    vs0,    vs40            // realA*realB
1261         XSFADD_R2       vs0,    vs0,    vs41            // imagA*imagB
1262
1263         xxswapd         vs40,   vs40                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
1264         xxswapd         vs41,   vs41                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
1265
1266         XSFADD_I1       vs1,    vs1,    vs40            // imagA*realB
1267         XSFADD_I2       vs1,    vs1,    vs41            // realA*imagB
1268
1269         xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
1270         xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
1271         xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
1272         xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
1273
1274         xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
1275         xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
1276         xxpermdi        vs8,    vs2,    vs3,    0       // merge real and imag part
1277
1278
1279
1280         xxlxor          vs0,    vs0,    vs0
1281         xxlxor          vs1,    vs1,    vs1
1282         xxswapd         vs43,   vs43                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
1283
1284         XSFADD_R1       vs0,    vs0,    vs42            // realA*realB
1285         XSFADD_R2       vs0,    vs0,    vs43            // imagA*imagB
1286
1287         xxswapd         vs42,   vs42                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
1288         xxswapd         vs43,   vs43                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
1289
1290         XSFADD_I1       vs1,    vs1,    vs42            // imagA*realB
1291         XSFADD_I2       vs1,    vs1,    vs43            // realA*imagB
1292
1293         xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
1294         xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
1295         xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
1296         xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
1297
1298         xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
1299         xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
1300         xxpermdi        vs9,    vs2,    vs3,    0       // merge real and imag part
1301
1302
1303
1304         xxlxor          vs0,    vs0,    vs0
1305         xxlxor          vs1,    vs1,    vs1
1306         xxswapd         vs45,   vs45                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
1307
1308         XSFADD_R1       vs0,    vs0,    vs44            // realA*realB
1309         XSFADD_R2       vs0,    vs0,    vs45            // imagA*imagB
1310
1311         xxswapd         vs44,   vs44                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
1312         xxswapd         vs45,   vs45                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
1313
1314         XSFADD_I1       vs1,    vs1,    vs44            // imagA*realB
1315         XSFADD_I2       vs1,    vs1,    vs45            // realA*imagB
1316
1317         xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
1318         xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
1319         xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
1320         xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
1321
1322         xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
1323         xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
1324         xxpermdi        vs10,   vs2,    vs3,    0       // merge real and imag part
1325
1326
1327
1328         xxlxor          vs0,    vs0,    vs0
1329         xxlxor          vs1,    vs1,    vs1
1330         xxswapd         vs47,   vs47                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
1331
1332         XSFADD_R1       vs0,    vs0,    vs46            // realA*realB
1333         XSFADD_R2       vs0,    vs0,    vs47            // imagA*imagB
1334
1335         xxswapd         vs46,   vs46                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
1336         xxswapd         vs47,   vs47                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
1337
1338         XSFADD_I1       vs1,    vs1,    vs46            // imagA*realB
1339         XSFADD_I2       vs1,    vs1,    vs47            // realA*imagB
1340
1341         xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
1342         xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
1343         xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
1344         xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
1345
1346         xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
1347         xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
1348         xxpermdi        vs11,   vs2,    vs3,    0       // merge real and imag part
1349
1350
1351 #ifndef TRMMKERNEL
1352
1353         xvadddp         vs8,    vs8,    vs16
1354         xvadddp         vs9,    vs9,    vs17
1355         xvadddp         vs10,   vs10,   vs18
1356         xvadddp         vs11,   vs11,   vs19
1357
1358 #endif
1359
1360         stxvd2x         vs8,    o0,     T1
1361         stxvd2x         vs9,    o16,    T1
1362         stxvd2x         vs10,   o32,    T1
1363         stxvd2x         vs11,   o48,    T1
1364
1365         add             T1,     T1,     LDC
1366         addi            CO,     CO,     64
1367
1368 .endm
1369
1370
1371 /**********************************************************************************************
1372 * Macros for N=2 and M=2
1373 **********************************************************************************************/
1374
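// The M=2 macros mirror the M=4 set above, but read only two complex elements of A
// (32 bytes) per k iteration and use accumulators vs32-vs39 (four per B column).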
1375 .macro LOAD2x2_1
1376
1377         lxvdsx          vs16,   o0,     BO              // load real part from B
1378         lxvdsx          vs17,   o8,     BO              // load imag part from B
1379         lxvdsx          vs18,   o16,    BO              // load real part from B
1380         lxvdsx          vs19,   o24,    BO              // load imag part from B
1381
1382         addi            BO,     BO,     32
1383
1384         lxvd2x          vs0,    o0,     AO              // load real,imag from A
1385         lxvd2x          vs1,    o16,    AO              // load real,imag from A
1386
1387         addi            AO,     AO,     32
1388
1389
1390 .endm
1391
1392 .macro KERNEL2x2_I1
1393
1394         lxvd2x          vs8,    o0,     AO              // load real,imag from A
1395         lxvd2x          vs9,    o16,    AO              // load real,imag from A
1396
1397         addi            AO,     AO,     32
1398
1399         lxvdsx          vs20,   o0,     BO              // load real part from B
1400         lxvdsx          vs21,   o8,     BO              // load imag part from B
1401         lxvdsx          vs22,   o16,    BO              // load real part from B
1402         lxvdsx          vs23,   o24,    BO              // load imag part from B
1403
1404         addi            BO,     BO,     32
1405
1406         xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
1407         xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
1408         xvmuldp         vs34,   vs1,    vs16            // real*real, imag*real
1409         xvmuldp         vs35,   vs1,    vs17            // real*imag, imag*imag
1410
1411         xvmuldp         vs36,   vs0,    vs18            // real*real, imag*real
1412         xvmuldp         vs37,   vs0,    vs19            // real*imag, imag*imag
1413         xvmuldp         vs38,   vs1,    vs18            // real*real, imag*real
1414         xvmuldp         vs39,   vs1,    vs19            // real*imag, imag*imag
1415
1416
1417 .endm
1418
1419 .macro KERNEL2x2_1
1420
1421         lxvd2x          vs8,    o0,     AO              // load real,imag from A
1422         lxvd2x          vs9,    o16,    AO              // load real,imag from A
1423
1424         addi            AO,     AO,     32
1425
1426         lxvdsx          vs20,   o0,     BO              // load real part from B
1427         lxvdsx          vs21,   o8,     BO              // load imag part from B
1428         lxvdsx          vs22,   o16,    BO              // load real part from B
1429         lxvdsx          vs23,   o24,    BO              // load imag part from B
1430
1431         addi            BO,     BO,     32
1432
1433         xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
1434         xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
1435         xvmaddadp       vs34,   vs1,    vs16            // real*real, imag*real
1436         xvmaddadp       vs35,   vs1,    vs17            // real*imag, imag*imag
1437
1438         xvmaddadp       vs36,   vs0,    vs18            // real*real, imag*real
1439         xvmaddadp       vs37,   vs0,    vs19            // real*imag, imag*imag
1440         xvmaddadp       vs38,   vs1,    vs18            // real*real, imag*real
1441         xvmaddadp       vs39,   vs1,    vs19            // real*imag, imag*imag
1442
1443
1444 .endm
1445
1446 .macro KERNEL2x2_2
1447
1448         lxvd2x          vs0,    o0,     AO              // load real,imag from A
1449         lxvd2x          vs1,    o16,    AO              // load real,imag from A
1450
1451         addi            AO,     AO,     32
1452
1453         lxvdsx          vs16,   o0,     BO              // load real part from B
1454         lxvdsx          vs17,   o8,     BO              // load imag part from B
1455         lxvdsx          vs18,   o16,    BO              // load real part from B
1456         lxvdsx          vs19,   o24,    BO              // load imag part from B
1457
1458         addi            BO,     BO,     32
1459
1460         xvmaddadp       vs32,   vs8,    vs20            // real*real, imag*real
1461         xvmaddadp       vs33,   vs8,    vs21            // real*imag, imag*imag
1462         xvmaddadp       vs34,   vs9,    vs20            // real*real, imag*real
1463         xvmaddadp       vs35,   vs9,    vs21            // real*imag, imag*imag
1464
1465         xvmaddadp       vs36,   vs8,    vs22            // real*real, imag*real
1466         xvmaddadp       vs37,   vs8,    vs23            // real*imag, imag*imag
1467         xvmaddadp       vs38,   vs9,    vs22            // real*real, imag*real
1468         xvmaddadp       vs39,   vs9,    vs23            // real*imag, imag*imag
1469
1470
1471 .endm
1472
1473 .macro KERNEL2x2_E2
1474
1475
1476         xvmaddadp       vs32,   vs8,    vs20            // real*real, imag*real
1477         xvmaddadp       vs33,   vs8,    vs21            // real*imag, imag*imag
1478         xvmaddadp       vs34,   vs9,    vs20            // real*real, imag*real
1479         xvmaddadp       vs35,   vs9,    vs21            // real*imag, imag*imag
1480
1481         xvmaddadp       vs36,   vs8,    vs22            // real*real, imag*real
1482         xvmaddadp       vs37,   vs8,    vs23            // real*imag, imag*imag
1483         xvmaddadp       vs38,   vs9,    vs22            // real*real, imag*real
1484         xvmaddadp       vs39,   vs9,    vs23            // real*imag, imag*imag
1485
1486
1487 .endm
1488
1489 .macro KERNEL2x2_SUBI1
1490
1491         lxvd2x          vs0,    o0,     AO              // load real,imag from A
1492         lxvd2x          vs1,    o16,    AO              // load real,imag from A
1493
1494         addi            AO,     AO,     32
1495
1496         lxvdsx          vs16,   o0,     BO              // load real part from B
1497         lxvdsx          vs17,   o8,     BO              // load imag part from B
1498         lxvdsx          vs18,   o16,    BO              // load real part from B
1499         lxvdsx          vs19,   o24,    BO              // load imag part from B
1500
1501         addi            BO,     BO,     32
1502
1503         xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
1504         xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
1505         xvmuldp         vs34,   vs1,    vs16            // real*real, imag*real
1506         xvmuldp         vs35,   vs1,    vs17            // real*imag, imag*imag
1507
1508         xvmuldp         vs36,   vs0,    vs18            // real*real, imag*real
1509         xvmuldp         vs37,   vs0,    vs19            // real*imag, imag*imag
1510         xvmuldp         vs38,   vs1,    vs18            // real*real, imag*real
1511         xvmuldp         vs39,   vs1,    vs19            // real*imag, imag*imag
1512
1513
1514 .endm
1515
1516 .macro KERNEL2x2_SUB1
1517
1518         lxvd2x          vs0,    o0,     AO              // load real,imag from A
1519         lxvd2x          vs1,    o16,    AO              // load real,imag from A
1520
1521         addi            AO,     AO,     32
1522
1523         lxvdsx          vs16,   o0,     BO              // load real part from B
1524         lxvdsx          vs17,   o8,     BO              // load imag part from B
1525         lxvdsx          vs18,   o16,    BO              // load real part from B
1526         lxvdsx          vs19,   o24,    BO              // load imag part from B
1527
1528         addi            BO,     BO,     32
1529
1530         xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
1531         xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
1532         xvmaddadp       vs34,   vs1,    vs16            // real*real, imag*real
1533         xvmaddadp       vs35,   vs1,    vs17            // real*imag, imag*imag
1534
1535         xvmaddadp       vs36,   vs0,    vs18            // real*real, imag*real
1536         xvmaddadp       vs37,   vs0,    vs19            // real*imag, imag*imag
1537         xvmaddadp       vs38,   vs1,    vs18            // real*real, imag*real
1538         xvmaddadp       vs39,   vs1,    vs19            // real*imag, imag*imag
1539
1540
1541 .endm
1542
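// SAVE2x2: same per-element reduction and alpha scaling as SAVE2x4, but with only
// two complex results per row (vs8/vs9 are reused for the second row once the first
// row has been stored); CO advances by 32 bytes.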
1543 .macro SAVE2x2
1544
1545
1546         mr              T1,     CO
1547
1548 #ifndef TRMMKERNEL
1549
1550         lxvd2x          vs16,   o0,     T1
1551         lxvd2x          vs17,   o16,    T1
1552
1553 #endif
1554
1555
1556         xxlxor          vs0,    vs0,    vs0
1557         xxlxor          vs1,    vs1,    vs1
1558         xxswapd         vs33,   vs33                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
1559
1560         XSFADD_R1       vs0,    vs0,    vs32            // realA*realB
1561         XSFADD_R2       vs0,    vs0,    vs33            // imagA*imagB
1562
1563         xxswapd         vs32,   vs32                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
1564         xxswapd         vs33,   vs33                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
1565
1566         XSFADD_I1       vs1,    vs1,    vs32            // imagA*realB
1567         XSFADD_I2       vs1,    vs1,    vs33            // realA*imagB
1568
1569         xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
1570         xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
1571         xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
1572         xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
1573
1574         xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
1575         xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
1576         xxpermdi        vs8,    vs2,    vs3,    0       // merge real and imag part
1577
1578
1579
1580         xxlxor          vs0,    vs0,    vs0
1581         xxlxor          vs1,    vs1,    vs1
1582         xxswapd         vs35,   vs35                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
1583
1584         XSFADD_R1       vs0,    vs0,    vs34            // realA*realB
1585         XSFADD_R2       vs0,    vs0,    vs35            // imagA*imagB
1586
1587         xxswapd         vs34,   vs34                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
1588         xxswapd         vs35,   vs35                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
1589
1590         XSFADD_I1       vs1,    vs1,    vs34            // imagA*realB
1591         XSFADD_I2       vs1,    vs1,    vs35            // realA*imagB
1592
1593         xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
1594         xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
1595         xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
1596         xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
1597
1598         xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
1599         xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
1600         xxpermdi        vs9,    vs2,    vs3,    0       // merge real and imag part
1601
1602
1603 #ifndef TRMMKERNEL
1604
1605         xvadddp         vs8,    vs8,    vs16
1606         xvadddp         vs9,    vs9,    vs17
1607
1608 #endif
1609
1610         stxvd2x         vs8,    o0,     T1
1611         stxvd2x         vs9,    o16,    T1
1612
1613         add             T1,     T1,     LDC
1614
1615 #ifndef TRMMKERNEL
1616
1617         lxvd2x          vs16,   o0,     T1
1618         lxvd2x          vs17,   o16,    T1
1619
1620 #endif
1621
1622
1623         xxlxor          vs0,    vs0,    vs0
1624         xxlxor          vs1,    vs1,    vs1
1625         xxswapd         vs37,   vs37                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
1626
1627         XSFADD_R1       vs0,    vs0,    vs36            // realA*realB
1628         XSFADD_R2       vs0,    vs0,    vs37            // imagA*imagB
1629
1630         xxswapd         vs36,   vs36                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
1631         xxswapd         vs37,   vs37                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
1632
1633         XSFADD_I1       vs1,    vs1,    vs36            // imagA*realB
1634         XSFADD_I2       vs1,    vs1,    vs37            // realA*imagB
1635
1636         xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
1637         xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
1638         xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
1639         xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
1640
1641         xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
1642         xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
1643         xxpermdi        vs8,    vs2,    vs3,    0       // merge real and imag part
1644
1645
1646
1647         xxlxor          vs0,    vs0,    vs0
1648         xxlxor          vs1,    vs1,    vs1
1649         xxswapd         vs39,   vs39                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
1650
1651         XSFADD_R1       vs0,    vs0,    vs38            // realA*realB
1652         XSFADD_R2       vs0,    vs0,    vs39            // imagA*imagB
1653
1654         xxswapd         vs38,   vs38                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
1655         xxswapd         vs39,   vs39                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
1656
1657         XSFADD_I1       vs1,    vs1,    vs38            // imagA*realB
1658         XSFADD_I2       vs1,    vs1,    vs39            // realA*imagB
1659
1660         xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
1661         xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
1662         xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
1663         xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
1664
1665         xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
1666         xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
1667         xxpermdi        vs9,    vs2,    vs3,    0       // merge real and imag part
1668
1669
1670 #ifndef TRMMKERNEL
1671
1672         xvadddp         vs8,    vs8,    vs16
1673         xvadddp         vs9,    vs9,    vs17
1674
1675 #endif
1676
1677         stxvd2x         vs8,    o0,     T1
1678         stxvd2x         vs9,    o16,    T1
1679
1680         add             T1,     T1,     LDC
1681         addi            CO,     CO,     32
1682
1683 .endm
1684
1685
1686 /**********************************************************************************************
1687 * Macros for N=2 and M=1
1688 **********************************************************************************************/
1689
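// The M=1 macros process a single complex element of A (16 bytes) per k iteration;
// vs32/vs33 accumulate against the first B column (vs16/vs17) and vs34/vs35 against
// the second (vs18/vs19).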
1690 .macro LOAD2x1_1
1691
1692         lxvdsx          vs16,   o0,     BO              // load real part from B
1693         lxvdsx          vs17,   o8,     BO              // load imag part from B
1694         lxvdsx          vs18,   o16,    BO              // load real part from B
1695         lxvdsx          vs19,   o24,    BO              // load imag part from B
1696
1697         addi            BO,     BO,     32
1698
1699         lxvd2x          vs0,    o0,     AO              // load real,imag from A
1700
1701         addi            AO,     AO,     16
1702
1703
1704 .endm
1705
1706 .macro KERNEL2x1_I1
1707
1708         lxvd2x          vs8,    o0,     AO              // load real,imag from A
1709
1710         addi            AO,     AO,     16
1711
1712         lxvdsx          vs20,   o0,     BO              // load real part from B
1713         lxvdsx          vs21,   o8,     BO              // load imag part from B
1714         lxvdsx          vs22,   o16,    BO              // load real part from B
1715         lxvdsx          vs23,   o24,    BO              // load imag part from B
1716
1717         addi            BO,     BO,     32
1718
1719         xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
1720         xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
1721
1722         xvmuldp         vs34,   vs0,    vs18            // real*real, imag*real
1723         xvmuldp         vs35,   vs0,    vs19            // real*imag, imag*imag
1724
1725
1726 .endm
1727
1728 .macro KERNEL2x1_1
1729
1730         lxvd2x          vs8,    o0,     AO              // load real,imag from A
1731
1732         addi            AO,     AO,     16
1733
1734         lxvdsx          vs20,   o0,     BO              // load real part from B
1735         lxvdsx          vs21,   o8,     BO              // load imag part from B
1736         lxvdsx          vs22,   o16,    BO              // load real part from B
1737         lxvdsx          vs23,   o24,    BO              // load imag part from B
1738
1739         addi            BO,     BO,     32
1740
1741         xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
1742         xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
1743
1744         xvmaddadp       vs34,   vs0,    vs18            // real*real, imag*real
1745         xvmaddadp       vs35,   vs0,    vs19            // real*imag, imag*imag
1746
1747
1748 .endm
1749
1750 .macro KERNEL2x1_2
1751
1752         lxvd2x          vs0,    o0,     AO              // load real,imag from A
1753
1754         addi            AO,     AO,     16
1755
1756         lxvdsx          vs16,   o0,     BO              // load real part from B
1757         lxvdsx          vs17,   o8,     BO              // load imag part from B
1758         lxvdsx          vs18,   o16,    BO              // load real part from B
1759         lxvdsx          vs19,   o24,    BO              // load imag part from B
1760
1761         addi            BO,     BO,     32
1762
1763         xvmaddadp       vs32,   vs8,    vs20            // real*real, imag*real
1764         xvmaddadp       vs33,   vs8,    vs21            // real*imag, imag*imag
1765
1766         xvmaddadp       vs34,   vs8,    vs22            // real*real, imag*real
1767         xvmaddadp       vs35,   vs8,    vs23            // real*imag, imag*imag
1768
1769
1770 .endm
1771
1772 .macro KERNEL2x1_E2
1773
1774
1775         xvmaddadp       vs32,   vs8,    vs20            // real*real, imag*real
1776         xvmaddadp       vs33,   vs8,    vs21            // real*imag, imag*imag
1777
1778         xvmaddadp       vs34,   vs8,    vs22            // real*real, imag*real
1779         xvmaddadp       vs35,   vs8,    vs23            // real*imag, imag*imag
1780
1781
1782 .endm
1783
1784 .macro KERNEL2x1_SUBI1
1785
1786         lxvd2x          vs0,    o0,     AO              // load real,imag from A
1787
1788         addi            AO,     AO,     16
1789
1790         lxvdsx          vs16,   o0,     BO              // load real part from B
1791         lxvdsx          vs17,   o8,     BO              // load imag part from B
1792         lxvdsx          vs18,   o16,    BO              // load real part from B
1793         lxvdsx          vs19,   o24,    BO              // load imag part from B
1794
1795         addi            BO,     BO,     32
1796
1797         xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
1798         xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
1799
1800         xvmuldp         vs34,   vs0,    vs18            // real*real, imag*real
1801         xvmuldp         vs35,   vs0,    vs19            // real*imag, imag*imag
1802
1803
1804 .endm
1805
1806 .macro KERNEL2x1_SUB1
1807
1808         lxvd2x          vs0,    o0,     AO              // load real,imag from A
1809
1810         addi            AO,     AO,     16
1811
1812         lxvdsx          vs16,   o0,     BO              // load real part from B
1813         lxvdsx          vs17,   o8,     BO              // load imag part from B
1814         lxvdsx          vs18,   o16,    BO              // load real part from B
1815         lxvdsx          vs19,   o24,    BO              // load imag part from B
1816
1817         addi            BO,     BO,     32
1818
1819         xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
1820         xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
1821
1822         xvmaddadp       vs34,   vs0,    vs18            // real*real, imag*real
1823         xvmaddadp       vs35,   vs0,    vs19            // real*imag, imag*imag
1824
1825
1826 .endm
1827
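// SAVE2x1: one complex result per row; the reduction and alpha scaling are identical
// to the wider SAVE macros, and CO advances by 16 bytes (one complex double).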
1828 .macro SAVE2x1
1829
1830
1831         mr              T1,     CO
1832
1833 #ifndef TRMMKERNEL
1834
1835         lxvd2x          vs16,   o0,     T1
1836
1837 #endif
1838
1839
1840         xxlxor          vs0,    vs0,    vs0
1841         xxlxor          vs1,    vs1,    vs1
1842         xxswapd         vs33,   vs33                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
1843
1844         XSFADD_R1       vs0,    vs0,    vs32            // realA*realB
1845         XSFADD_R2       vs0,    vs0,    vs33            // imagA*imagB
1846
1847         xxswapd         vs32,   vs32                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
1848         xxswapd         vs33,   vs33                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
1849
1850         XSFADD_I1       vs1,    vs1,    vs32            // imagA*realB
1851         XSFADD_I2       vs1,    vs1,    vs33            // realA*imagB
1852
1853         xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
1854         xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
1855         xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
1856         xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
1857
1858         xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
1859         xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
1860         xxpermdi        vs8,    vs2,    vs3,    0       // merge real and imag part
1861
1862
1863 #ifndef TRMMKERNEL
1864
1865         xvadddp         vs8,    vs8,    vs16
1866
1867 #endif
1868
1869         stxvd2x         vs8,    o0,     T1
1870
1871         add             T1,     T1,     LDC
1872
1873 #ifndef TRMMKERNEL
1874
1875         lxvd2x          vs16,   o0,     T1
1876
1877 #endif
1878
1879
1880         xxlxor          vs0,    vs0,    vs0
1881         xxlxor          vs1,    vs1,    vs1
1882         xxswapd         vs35,   vs35                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
1883
1884         XSFADD_R1       vs0,    vs0,    vs34            // realA*realB
1885         XSFADD_R2       vs0,    vs0,    vs35            // imagA*imagB
1886
1887         xxswapd         vs34,   vs34                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
1888         xxswapd         vs35,   vs35                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
1889
1890         XSFADD_I1       vs1,    vs1,    vs34            // imagA*realB
1891         XSFADD_I2       vs1,    vs1,    vs35            // realA*imagB
1892
1893         xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
1894         xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
1895         xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
1896         xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
1897
1898         xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
1899         xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
1900         xxpermdi        vs8,    vs2,    vs3,    0       // merge real and imag part
1901
1902
1903 #ifndef TRMMKERNEL
1904
1905         xvadddp         vs8,    vs8,    vs16
1906
1907 #endif
1908
1909         stxvd2x         vs8,    o0,     T1
1910
1911         add             T1,     T1,     LDC
1912         addi            CO,     CO,     16
1913
1914 .endm
1915
1916
1917 /**********************************************************************************************
1918 * Macros for N=1 and M=8
1919 **********************************************************************************************/
1920
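// N=1 macros: only one (real,imag) pair is read from B per k iteration (16 bytes,
// splatted into vs16/vs17, or vs20/vs21 for the prefetched set), against eight
// complex elements of A (128 bytes); the products accumulate in vs32-vs47.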
1921 .macro LOAD1x8_1
1922
1923         lxvdsx          vs16,   o0,     BO              // load real part from B
1924         lxvdsx          vs17,   o8,     BO              // load imag part from B
1925
1926         addi            BO,     BO,     16
1927
1928         lxvd2x          vs0,    o0,     AO              // load real,imag from A
1929         lxvd2x          vs1,    o16,    AO              // load real,imag from A
1930         lxvd2x          vs2,    o32,    AO              // load real,imag from A
1931         lxvd2x          vs3,    o48,    AO              // load real,imag from A
1932
1933         addi            AO,     AO,     64
1934
1935         lxvd2x          vs4,    o0,     AO              // load real,imag from A
1936         lxvd2x          vs5,    o16,    AO              // load real,imag from A
1937         lxvd2x          vs6,    o32,    AO              // load real,imag from A
1938         lxvd2x          vs7,    o48,    AO              // load real,imag from A
1939
1940         addi            AO,     AO,     64
1941
1942
1943 .endm
1944
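// KERNEL1x8_I1 starts the accumulation with xvmuldp while already prefetching the
// next A/B set into vs8-vs15 and vs20/vs21; KERNEL1x8_1 and KERNEL1x8_2 then
// alternate between the two register sets, and KERNEL1x8_E2 drains the last set.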
1945 .macro KERNEL1x8_I1
1946
1947         lxvd2x          vs8,    o0,     AO              // load real,imag from A
1948         lxvd2x          vs9,    o16,    AO              // load real,imag from A
1949         lxvd2x          vs10,   o32,    AO              // load real,imag from A
1950         lxvd2x          vs11,   o48,    AO              // load real,imag from A
1951
1952         addi            AO,     AO,     64
1953
1954         lxvd2x          vs12,   o0,     AO              // load real,imag from A
1955         lxvd2x          vs13,   o16,    AO              // load real,imag from A
1956         lxvd2x          vs14,   o32,    AO              // load real,imag from A
1957         lxvd2x          vs15,   o48,    AO              // load real,imag from A
1958
1959         addi            AO,     AO,     64
1960
1961         lxvdsx          vs20,   o0,     BO              // load real part from B
1962         lxvdsx          vs21,   o8,     BO              // load imag part from B
1963
1964         addi            BO,     BO,     16
1965
1966         xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
1967         xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
1968         xvmuldp         vs34,   vs1,    vs16            // real*real, imag*real
1969         xvmuldp         vs35,   vs1,    vs17            // real*imag, imag*imag
1970         xvmuldp         vs36,   vs2,    vs16            // real*real, imag*real
1971         xvmuldp         vs37,   vs2,    vs17            // real*imag, imag*imag
1972         xvmuldp         vs38,   vs3,    vs16            // real*real, imag*real
1973         xvmuldp         vs39,   vs3,    vs17            // real*imag, imag*imag
1974         xvmuldp         vs40,   vs4,    vs16            // real*real, imag*real
1975         xvmuldp         vs41,   vs4,    vs17            // real*imag, imag*imag
1976         xvmuldp         vs42,   vs5,    vs16            // real*real, imag*real
1977         xvmuldp         vs43,   vs5,    vs17            // real*imag, imag*imag
1978         xvmuldp         vs44,   vs6,    vs16            // real*real, imag*real
1979         xvmuldp         vs45,   vs6,    vs17            // real*imag, imag*imag
1980         xvmuldp         vs46,   vs7,    vs16            // real*real, imag*real
1981         xvmuldp         vs47,   vs7,    vs17            // real*imag, imag*imag
1982
1983
1984 .endm
1985
1986 .macro KERNEL1x8_1
1987
1988         lxvd2x          vs8,    o0,     AO              // load real,imag from A
1989         lxvd2x          vs9,    o16,    AO              // load real,imag from A
1990         lxvd2x          vs10,   o32,    AO              // load real,imag from A
1991         lxvd2x          vs11,   o48,    AO              // load real,imag from A
1992
1993         addi            AO,     AO,     64
1994
1995         lxvd2x          vs12,   o0,     AO              // load real,imag from A
1996         lxvd2x          vs13,   o16,    AO              // load real,imag from A
1997         lxvd2x          vs14,   o32,    AO              // load real,imag from A
1998         lxvd2x          vs15,   o48,    AO              // load real,imag from A
1999
2000         addi            AO,     AO,     64
2001
2002         lxvdsx          vs20,   o0,     BO              // load real part from B
2003         lxvdsx          vs21,   o8,     BO              // load imag part from B
2004
2005         addi            BO,     BO,     16
2006
2007         xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
2008         xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
2009         xvmaddadp       vs34,   vs1,    vs16            // real*real, imag*real
2010         xvmaddadp       vs35,   vs1,    vs17            // real*imag, imag*imag
2011         xvmaddadp       vs36,   vs2,    vs16            // real*real, imag*real
2012         xvmaddadp       vs37,   vs2,    vs17            // real*imag, imag*imag
2013         xvmaddadp       vs38,   vs3,    vs16            // real*real, imag*real
2014         xvmaddadp       vs39,   vs3,    vs17            // real*imag, imag*imag
2015         xvmaddadp       vs40,   vs4,    vs16            // real*real, imag*real
2016         xvmaddadp       vs41,   vs4,    vs17            // real*imag, imag*imag
2017         xvmaddadp       vs42,   vs5,    vs16            // real*real, imag*real
2018         xvmaddadp       vs43,   vs5,    vs17            // real*imag, imag*imag
2019         xvmaddadp       vs44,   vs6,    vs16            // real*real, imag*real
2020         xvmaddadp       vs45,   vs6,    vs17            // real*imag, imag*imag
2021         xvmaddadp       vs46,   vs7,    vs16            // real*real, imag*real
2022         xvmaddadp       vs47,   vs7,    vs17            // real*imag, imag*imag
2023
2024
2025 .endm
2026
2027 .macro KERNEL1x8_2
2028
2029         lxvd2x          vs0,    o0,     AO              // load real,imag from A
2030         lxvd2x          vs1,    o16,    AO              // load real,imag from A
2031         lxvd2x          vs2,    o32,    AO              // load real,imag from A
2032         lxvd2x          vs3,    o48,    AO              // load real,imag from A
2033
2034         addi            AO,     AO,     64
2035
2036         lxvd2x          vs4,    o0,     AO              // load real,imag from A
2037         lxvd2x          vs5,    o16,    AO              // load real,imag from A
2038         lxvd2x          vs6,    o32,    AO              // load real,imag from A
2039         lxvd2x          vs7,    o48,    AO              // load real,imag from A
2040
2041         addi            AO,     AO,     64
2042
2043         lxvdsx          vs16,   o0,     BO              // load real part from B
2044         lxvdsx          vs17,   o8,     BO              // load imag part from B
2045
2046         addi            BO,     BO,     16
2047
2048         xvmaddadp       vs32,   vs8,    vs20            // real*real, imag*real
2049         xvmaddadp       vs33,   vs8,    vs21            // real*imag, imag*imag
2050         xvmaddadp       vs34,   vs9,    vs20            // real*real, imag*real
2051         xvmaddadp       vs35,   vs9,    vs21            // real*imag, imag*imag
2052         xvmaddadp       vs36,   vs10,   vs20            // real*real, imag*real
2053         xvmaddadp       vs37,   vs10,   vs21            // real*imag, imag*imag
2054         xvmaddadp       vs38,   vs11,   vs20            // real*real, imag*real
2055         xvmaddadp       vs39,   vs11,   vs21            // real*imag, imag*imag
2056         xvmaddadp       vs40,   vs12,   vs20            // real*real, imag*real
2057         xvmaddadp       vs41,   vs12,   vs21            // real*imag, imag*imag
2058         xvmaddadp       vs42,   vs13,   vs20            // real*real, imag*real
2059         xvmaddadp       vs43,   vs13,   vs21            // real*imag, imag*imag
2060         xvmaddadp       vs44,   vs14,   vs20            // real*real, imag*real
2061         xvmaddadp       vs45,   vs14,   vs21            // real*imag, imag*imag
2062         xvmaddadp       vs46,   vs15,   vs20            // real*real, imag*real
2063         xvmaddadp       vs47,   vs15,   vs21            // real*imag, imag*imag
2064
2065
2066 .endm
2067
2068 .macro KERNEL1x8_E2
2069
2070
2071         xvmaddadp       vs32,   vs8,    vs20            // real*real, imag*real
2072         xvmaddadp       vs33,   vs8,    vs21            // real*imag, imag*imag
2073         xvmaddadp       vs34,   vs9,    vs20            // real*real, imag*real
2074         xvmaddadp       vs35,   vs9,    vs21            // real*imag, imag*imag
2075         xvmaddadp       vs36,   vs10,   vs20            // real*real, imag*real
2076         xvmaddadp       vs37,   vs10,   vs21            // real*imag, imag*imag
2077         xvmaddadp       vs38,   vs11,   vs20            // real*real, imag*real
2078         xvmaddadp       vs39,   vs11,   vs21            // real*imag, imag*imag
2079         xvmaddadp       vs40,   vs12,   vs20            // real*real, imag*real
2080         xvmaddadp       vs41,   vs12,   vs21            // real*imag, imag*imag
2081         xvmaddadp       vs42,   vs13,   vs20            // real*real, imag*real
2082         xvmaddadp       vs43,   vs13,   vs21            // real*imag, imag*imag
2083         xvmaddadp       vs44,   vs14,   vs20            // real*real, imag*real
2084         xvmaddadp       vs45,   vs14,   vs21            // real*imag, imag*imag
2085         xvmaddadp       vs46,   vs15,   vs20            // real*real, imag*real
2086         xvmaddadp       vs47,   vs15,   vs21            // real*imag, imag*imag
2087
2088
2089 .endm
2090
2091 .macro KERNEL1x8_SUBI1
2092
2093         lxvd2x          vs0,    o0,     AO              // load real,imag from A
2094         lxvd2x          vs1,    o16,    AO              // load real,imag from A
2095         lxvd2x          vs2,    o32,    AO              // load real,imag from A
2096         lxvd2x          vs3,    o48,    AO              // load real,imag from A
2097
2098         addi            AO,     AO,     64
2099
2100         lxvd2x          vs4,    o0,     AO              // load real,imag from A
2101         lxvd2x          vs5,    o16,    AO              // load real,imag from A
2102         lxvd2x          vs6,    o32,    AO              // load real,imag from A
2103         lxvd2x          vs7,    o48,    AO              // load real,imag from A
2104
2105         addi            AO,     AO,     64
2106
2107         lxvdsx          vs16,   o0,     BO              // load real part from B
2108         lxvdsx          vs17,   o8,     BO              // load imag part from B
2109
2110         addi            BO,     BO,     16
2111
2112         xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
2113         xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
2114         xvmuldp         vs34,   vs1,    vs16            // real*real, imag*real
2115         xvmuldp         vs35,   vs1,    vs17            // real*imag, imag*imag
2116         xvmuldp         vs36,   vs2,    vs16            // real*real, imag*real
2117         xvmuldp         vs37,   vs2,    vs17            // real*imag, imag*imag
2118         xvmuldp         vs38,   vs3,    vs16            // real*real, imag*real
2119         xvmuldp         vs39,   vs3,    vs17            // real*imag, imag*imag
2120         xvmuldp         vs40,   vs4,    vs16            // real*real, imag*real
2121         xvmuldp         vs41,   vs4,    vs17            // real*imag, imag*imag
2122         xvmuldp         vs42,   vs5,    vs16            // real*real, imag*real
2123         xvmuldp         vs43,   vs5,    vs17            // real*imag, imag*imag
2124         xvmuldp         vs44,   vs6,    vs16            // real*real, imag*real
2125         xvmuldp         vs45,   vs6,    vs17            // real*imag, imag*imag
2126         xvmuldp         vs46,   vs7,    vs16            // real*real, imag*real
2127         xvmuldp         vs47,   vs7,    vs17            // real*imag, imag*imag
2128
2129
2130 .endm
2131
2132 .macro KERNEL1x8_SUB1
2133
2134         lxvd2x          vs0,    o0,     AO              // load real,imag from A
2135         lxvd2x          vs1,    o16,    AO              // load real,imag from A
2136         lxvd2x          vs2,    o32,    AO              // load real,imag from A
2137         lxvd2x          vs3,    o48,    AO              // load real,imag from A
2138
2139         addi            AO,     AO,     64
2140
2141         lxvd2x          vs4,    o0,     AO              // load real,imag from A
2142         lxvd2x          vs5,    o16,    AO              // load real,imag from A
2143         lxvd2x          vs6,    o32,    AO              // load real,imag from A
2144         lxvd2x          vs7,    o48,    AO              // load real,imag from A
2145
2146         addi            AO,     AO,     64
2147
2148         lxvdsx          vs16,   o0,     BO              // load real part from B
2149         lxvdsx          vs17,   o8,     BO              // load imag part from B
2150
2151         addi            BO,     BO,     16
2152
2153         xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
2154         xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
2155         xvmaddadp       vs34,   vs1,    vs16            // real*real, imag*real
2156         xvmaddadp       vs35,   vs1,    vs17            // real*imag, imag*imag
2157         xvmaddadp       vs36,   vs2,    vs16            // real*real, imag*real
2158         xvmaddadp       vs37,   vs2,    vs17            // real*imag, imag*imag
2159         xvmaddadp       vs38,   vs3,    vs16            // real*real, imag*real
2160         xvmaddadp       vs39,   vs3,    vs17            // real*imag, imag*imag
2161         xvmaddadp       vs40,   vs4,    vs16            // real*real, imag*real
2162         xvmaddadp       vs41,   vs4,    vs17            // real*imag, imag*imag
2163         xvmaddadp       vs42,   vs5,    vs16            // real*real, imag*real
2164         xvmaddadp       vs43,   vs5,    vs17            // real*imag, imag*imag
2165         xvmaddadp       vs44,   vs6,    vs16            // real*real, imag*real
2166         xvmaddadp       vs45,   vs6,    vs17            // real*imag, imag*imag
2167         xvmaddadp       vs46,   vs7,    vs16            // real*real, imag*real
2168         xvmaddadp       vs47,   vs7,    vs17            // real*imag, imag*imag
2169
2170
2171 .endm
2172
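/* SAVE1x8: reduce the eight accumulator pairs (vs32/vs33 ... vs46/vs47) into eight
 * complex results and store them at CO.  For each pair the even register holds
 * (realA*realB, imagA*realB) and the odd register (realA*imagB, imagA*imagB), so with
 * the XSFADD_* definitions selected at the top of this file:
 *     real = sum(realA*realB) -/+ sum(imagA*imagB)
 *     imag = sum(imagA*realB) +/- sum(realA*imagB)
 * (signs depend on the conjugation variant).  Each result is then scaled by alpha,
 *     out_r = real*alpha_r - imag*alpha_i
 *     out_i = real*alpha_i + imag*alpha_r
 * and, unless TRMMKERNEL is defined, added to the existing C values loaded into
 * vs16..vs23 before the final stxvd2x stores.
 */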
2173 .macro SAVE1x8
2174
2175
2176         mr              T1,     CO
2177         addi            T2,     T1,     64
2178
2179 #ifndef TRMMKERNEL
2180
2181         lxvd2x          vs16,   o0,     T1
2182         lxvd2x          vs17,   o16,    T1
2183         lxvd2x          vs18,   o32,    T1
2184         lxvd2x          vs19,   o48,    T1
2185         lxvd2x          vs20,   o0,     T2
2186         lxvd2x          vs21,   o16,    T2
2187         lxvd2x          vs22,   o32,    T2
2188         lxvd2x          vs23,   o48,    T2
2189
2190 #endif
2191
2192
2193         xxlxor          vs0,    vs0,    vs0
2194         xxlxor          vs1,    vs1,    vs1
2195         xxswapd         vs33,   vs33                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
2196
2197         XSFADD_R1       vs0,    vs0,    vs32            // realA*realB
2198         XSFADD_R2       vs0,    vs0,    vs33            // imagA*imagB
2199
2200         xxswapd         vs32,   vs32                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
2201         xxswapd         vs33,   vs33                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
2202
2203         XSFADD_I1       vs1,    vs1,    vs32            // imagA*realB
2204         XSFADD_I2       vs1,    vs1,    vs33            // realA*imagB
2205
2206         xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
2207         xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
2208         xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
2209         xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
2210
2211         xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
2212         xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
2213         xxpermdi        vs8,    vs2,    vs3,    0       // merge real and imag part
2214
2215
2216
2217         xxlxor          vs0,    vs0,    vs0
2218         xxlxor          vs1,    vs1,    vs1
2219         xxswapd         vs35,   vs35                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
2220
2221         XSFADD_R1       vs0,    vs0,    vs34            // realA*realB
2222         XSFADD_R2       vs0,    vs0,    vs35            // imagA*imagB
2223
2224         xxswapd         vs34,   vs34                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
2225         xxswapd         vs35,   vs35                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
2226
2227         XSFADD_I1       vs1,    vs1,    vs34            // imagA*realB
2228         XSFADD_I2       vs1,    vs1,    vs35            // realA*imagB
2229
2230         xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
2231         xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
2232         xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
2233         xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
2234
2235         xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
2236         xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
2237         xxpermdi        vs9,    vs2,    vs3,    0       // merge real and imag part
2238
2239
2240
2241         xxlxor          vs0,    vs0,    vs0
2242         xxlxor          vs1,    vs1,    vs1
2243         xxswapd         vs37,   vs37                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
2244
2245         XSFADD_R1       vs0,    vs0,    vs36            // realA*realB
2246         XSFADD_R2       vs0,    vs0,    vs37            // imagA*imagB
2247
2248         xxswapd         vs36,   vs36                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
2249         xxswapd         vs37,   vs37                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
2250
2251         XSFADD_I1       vs1,    vs1,    vs36            // imagA*realB
2252         XSFADD_I2       vs1,    vs1,    vs37            // realA*imagB
2253
2254         xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
2255         xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
2256         xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
2257         xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
2258
2259         xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
2260         xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
2261         xxpermdi        vs10,   vs2,    vs3,    0       // merge real and imag part
2262
2263
2264
2265         xxlxor          vs0,    vs0,    vs0
2266         xxlxor          vs1,    vs1,    vs1
2267         xxswapd         vs39,   vs39                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
2268
2269         XSFADD_R1       vs0,    vs0,    vs38            // realA*realB
2270         XSFADD_R2       vs0,    vs0,    vs39            // imagA*imagB
2271
2272         xxswapd         vs38,   vs38                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
2273         xxswapd         vs39,   vs39                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
2274
2275         XSFADD_I1       vs1,    vs1,    vs38            // imagA*realB
2276         XSFADD_I2       vs1,    vs1,    vs39            // realA*imagB
2277
2278         xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
2279         xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
2280         xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
2281         xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
2282
2283         xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
2284         xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
2285         xxpermdi        vs11,   vs2,    vs3,    0       // merge real and imag part
2286
2287
2288
2289         xxlxor          vs0,    vs0,    vs0
2290         xxlxor          vs1,    vs1,    vs1
2291         xxswapd         vs41,   vs41                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
2292
2293         XSFADD_R1       vs0,    vs0,    vs40            // realA*realB
2294         XSFADD_R2       vs0,    vs0,    vs41            // imagA*imagB
2295
2296         xxswapd         vs40,   vs40                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
2297         xxswapd         vs41,   vs41                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
2298
2299         XSFADD_I1       vs1,    vs1,    vs40            // imagA*realB
2300         XSFADD_I2       vs1,    vs1,    vs41            // realA*imagB
2301
2302         xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
2303         xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
2304         xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
2305         xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
2306
2307         xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
2308         xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
2309         xxpermdi        vs12,   vs2,    vs3,    0       // merge real and imag part
2310
2311
2312
2313         xxlxor          vs0,    vs0,    vs0
2314         xxlxor          vs1,    vs1,    vs1
2315         xxswapd         vs43,   vs43                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
2316
2317         XSFADD_R1       vs0,    vs0,    vs42            // realA*realB
2318         XSFADD_R2       vs0,    vs0,    vs43            // imagA*imagB
2319
2320         xxswapd         vs42,   vs42                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
2321         xxswapd         vs43,   vs43                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
2322
2323         XSFADD_I1       vs1,    vs1,    vs42            // imagA*realB
2324         XSFADD_I2       vs1,    vs1,    vs43            // realA*imagB
2325
2326         xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
2327         xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
2328         xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
2329         xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
2330
2331         xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
2332         xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
2333         xxpermdi        vs13,   vs2,    vs3,    0       // merge real and imag part
2334
2335
2336
2337         xxlxor          vs0,    vs0,    vs0
2338         xxlxor          vs1,    vs1,    vs1
2339         xxswapd         vs45,   vs45                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
2340
2341         XSFADD_R1       vs0,    vs0,    vs44            // realA*realB
2342         XSFADD_R2       vs0,    vs0,    vs45            // imagA*imagB
2343
2344         xxswapd         vs44,   vs44                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
2345         xxswapd         vs45,   vs45                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
2346
2347         XSFADD_I1       vs1,    vs1,    vs44            // imagA*realB
2348         XSFADD_I2       vs1,    vs1,    vs45            // realA*imagB
2349
2350         xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
2351         xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
2352         xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
2353         xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
2354
2355         xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
2356         xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
2357         xxpermdi        vs14,   vs2,    vs3,    0       // merge real and imag part
2358
2359
2360
2361         xxlxor          vs0,    vs0,    vs0
2362         xxlxor          vs1,    vs1,    vs1
2363         xxswapd         vs47,   vs47                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
2364
2365         XSFADD_R1       vs0,    vs0,    vs46            // realA*realB
2366         XSFADD_R2       vs0,    vs0,    vs47            // imagA*imagB
2367
2368         xxswapd         vs46,   vs46                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
2369         xxswapd         vs47,   vs47                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
2370
2371         XSFADD_I1       vs1,    vs1,    vs46            // imagA*realB
2372         XSFADD_I2       vs1,    vs1,    vs47            // realA*imagB
2373
2374         xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
2375         xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
2376         xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
2377         xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
2378
2379         xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
2380         xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
2381         xxpermdi        vs15,   vs2,    vs3,    0       // merge real and imag part
2382
2383
2384 #ifndef TRMMKERNEL
2385
2386         xvadddp         vs8,    vs8,    vs16
2387         xvadddp         vs9,    vs9,    vs17
2388         xvadddp         vs10,   vs10,   vs18
2389         xvadddp         vs11,   vs11,   vs19
2390         xvadddp         vs12,   vs12,   vs20
2391         xvadddp         vs13,   vs13,   vs21
2392         xvadddp         vs14,   vs14,   vs22
2393         xvadddp         vs15,   vs15,   vs23
2394
2395 #endif
2396
2397         stxvd2x         vs8,    o0,     T1
2398         stxvd2x         vs9,    o16,    T1
2399         stxvd2x         vs10,   o32,    T1
2400         stxvd2x         vs11,   o48,    T1
2401         stxvd2x         vs12,   o0,     T2
2402         stxvd2x         vs13,   o16,    T2
2403         stxvd2x         vs14,   o32,    T2
2404         stxvd2x         vs15,   o48,    T2
2405
2406         add             T1,     T1,     LDC
2407         add             T2,     T2,     LDC
2408         addi            CO,     CO,     128
2409
2410 .endm
2411
2412
2413 /**********************************************************************************************
2414 * Macros for N=1 and M=4
2415 **********************************************************************************************/
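/* Register usage in the 1x4 block below: vs0..vs3 and vs8..vs11 each hold four complex
 * elements of A, vs16/vs17 and vs20/vs21 hold the splatted real/imag parts of one B
 * element, and vs32..vs39 are the accumulators.  KERNEL1x4_1 multiplies from
 * vs0..vs3/vs16,vs17 while loading the next A/B values into vs8..vs11/vs20,vs21;
 * KERNEL1x4_2 does the converse, and KERNEL1x4_E2 only drains vs8..vs11 without
 * loading.  The surrounding kernel presumably starts with LOAD1x4_1 (or with
 * KERNEL1x4_SUBI1 for short K loops), alternates _1/_2, and finishes with _E2 and
 * SAVE1x4.
 */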
2416
2417 .macro LOAD1x4_1
2418
2419         lxvdsx          vs16,   o0,     BO              // load real part from B
2420         lxvdsx          vs17,   o8,     BO              // load imag part from B
2421
2422         addi            BO,     BO,     16
2423
2424         lxvd2x          vs0,    o0,     AO              // load real,imag from A
2425         lxvd2x          vs1,    o16,    AO              // load real,imag from A
2426         lxvd2x          vs2,    o32,    AO              // load real,imag from A
2427         lxvd2x          vs3,    o48,    AO              // load real,imag from A
2428
2429         addi            AO,     AO,     64
2430
2431
2432 .endm
2433
2434 .macro KERNEL1x4_I1
2435
2436         lxvd2x          vs8,    o0,     AO              // load real,imag from A
2437         lxvd2x          vs9,    o16,    AO              // load real,imag from A
2438         lxvd2x          vs10,   o32,    AO              // load real,imag from A
2439         lxvd2x          vs11,   o48,    AO              // load real,imag from A
2440
2441         addi            AO,     AO,     64
2442
2443         lxvdsx          vs20,   o0,     BO              // load real part from B
2444         lxvdsx          vs21,   o8,     BO              // load imag part from B
2445
2446         addi            BO,     BO,     16
2447
2448         xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
2449         xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
2450         xvmuldp         vs34,   vs1,    vs16            // real*real, imag*real
2451         xvmuldp         vs35,   vs1,    vs17            // real*imag, imag*imag
2452         xvmuldp         vs36,   vs2,    vs16            // real*real, imag*real
2453         xvmuldp         vs37,   vs2,    vs17            // real*imag, imag*imag
2454         xvmuldp         vs38,   vs3,    vs16            // real*real, imag*real
2455         xvmuldp         vs39,   vs3,    vs17            // real*imag, imag*imag
2456
2457
2458 .endm
2459
2460 .macro KERNEL1x4_1
2461
2462         lxvd2x          vs8,    o0,     AO              // load real,imag from A
2463         lxvd2x          vs9,    o16,    AO              // load real,imag from A
2464         lxvd2x          vs10,   o32,    AO              // load real,imag from A
2465         lxvd2x          vs11,   o48,    AO              // load real,imag from A
2466
2467         addi            AO,     AO,     64
2468
2469         lxvdsx          vs20,   o0,     BO              // load real part from B
2470         lxvdsx          vs21,   o8,     BO              // load imag part from B
2471
2472         addi            BO,     BO,     16
2473
2474         xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
2475         xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
2476         xvmaddadp       vs34,   vs1,    vs16            // real*real, imag*real
2477         xvmaddadp       vs35,   vs1,    vs17            // real*imag, imag*imag
2478         xvmaddadp       vs36,   vs2,    vs16            // real*real, imag*real
2479         xvmaddadp       vs37,   vs2,    vs17            // real*imag, imag*imag
2480         xvmaddadp       vs38,   vs3,    vs16            // real*real, imag*real
2481         xvmaddadp       vs39,   vs3,    vs17            // real*imag, imag*imag
2482
2483
2484 .endm
2485
2486 .macro KERNEL1x4_2
2487
2488         lxvd2x          vs0,    o0,     AO              // load real,imag from A
2489         lxvd2x          vs1,    o16,    AO              // load real,imag from A
2490         lxvd2x          vs2,    o32,    AO              // load real,imag from A
2491         lxvd2x          vs3,    o48,    AO              // load real,imag from A
2492
2493         addi            AO,     AO,     64
2494
2495         lxvdsx          vs16,   o0,     BO              // load real part from B
2496         lxvdsx          vs17,   o8,     BO              // load imag part from B
2497
2498         addi            BO,     BO,     16
2499
2500         xvmaddadp       vs32,   vs8,    vs20            // real*real, imag*real
2501         xvmaddadp       vs33,   vs8,    vs21            // real*imag, imag*imag
2502         xvmaddadp       vs34,   vs9,    vs20            // real*real, imag*real
2503         xvmaddadp       vs35,   vs9,    vs21            // real*imag, imag*imag
2504         xvmaddadp       vs36,   vs10,   vs20            // real*real, imag*real
2505         xvmaddadp       vs37,   vs10,   vs21            // real*imag, imag*imag
2506         xvmaddadp       vs38,   vs11,   vs20            // real*real, imag*real
2507         xvmaddadp       vs39,   vs11,   vs21            // real*imag, imag*imag
2508
2509
2510 .endm
2511
2512 .macro KERNEL1x4_E2
2513
2514
2515         xvmaddadp       vs32,   vs8,    vs20            // real*real, imag*real
2516         xvmaddadp       vs33,   vs8,    vs21            // real*imag, imag*imag
2517         xvmaddadp       vs34,   vs9,    vs20            // real*real, imag*real
2518         xvmaddadp       vs35,   vs9,    vs21            // real*imag, imag*imag
2519         xvmaddadp       vs36,   vs10,   vs20            // real*real, imag*real
2520         xvmaddadp       vs37,   vs10,   vs21            // real*imag, imag*imag
2521         xvmaddadp       vs38,   vs11,   vs20            // real*real, imag*real
2522         xvmaddadp       vs39,   vs11,   vs21            // real*imag, imag*imag
2523
2524
2525 .endm
2526
2527 .macro KERNEL1x4_SUBI1
2528
2529         lxvd2x          vs0,    o0,     AO              // load real,imag from A
2530         lxvd2x          vs1,    o16,    AO              // load real,imag from A
2531         lxvd2x          vs2,    o32,    AO              // load real,imag from A
2532         lxvd2x          vs3,    o48,    AO              // load real,imag from A
2533
2534         addi            AO,     AO,     64
2535
2536         lxvdsx          vs16,   o0,     BO              // load real part from B
2537         lxvdsx          vs17,   o8,     BO              // load imag part from B
2538
2539         addi            BO,     BO,     16
2540
2541         xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
2542         xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
2543         xvmuldp         vs34,   vs1,    vs16            // real*real, imag*real
2544         xvmuldp         vs35,   vs1,    vs17            // real*imag, imag*imag
2545         xvmuldp         vs36,   vs2,    vs16            // real*real, imag*real
2546         xvmuldp         vs37,   vs2,    vs17            // real*imag, imag*imag
2547         xvmuldp         vs38,   vs3,    vs16            // real*real, imag*real
2548         xvmuldp         vs39,   vs3,    vs17            // real*imag, imag*imag
2549
2550
2551 .endm
2552
2553 .macro KERNEL1x4_SUB1
2554
2555         lxvd2x          vs0,    o0,     AO              // load real,imag from A
2556         lxvd2x          vs1,    o16,    AO              // load real,imag from A
2557         lxvd2x          vs2,    o32,    AO              // load real,imag from A
2558         lxvd2x          vs3,    o48,    AO              // load real,imag from A
2559
2560         addi            AO,     AO,     64
2561
2562         lxvdsx          vs16,   o0,     BO              // load real part from B
2563         lxvdsx          vs17,   o8,     BO              // load imag part from B
2564
2565         addi            BO,     BO,     16
2566
2567         xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
2568         xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
2569         xvmaddadp       vs34,   vs1,    vs16            // real*real, imag*real
2570         xvmaddadp       vs35,   vs1,    vs17            // real*imag, imag*imag
2571         xvmaddadp       vs36,   vs2,    vs16            // real*real, imag*real
2572         xvmaddadp       vs37,   vs2,    vs17            // real*imag, imag*imag
2573         xvmaddadp       vs38,   vs3,    vs16            // real*real, imag*real
2574         xvmaddadp       vs39,   vs3,    vs17            // real*imag, imag*imag
2575
2576
2577 .endm
2578
2579 .macro SAVE1x4
2580
2581
2582         mr              T1,     CO
2583
2584 #ifndef TRMMKERNEL
2585
2586         lxvd2x          vs16,   o0,     T1
2587         lxvd2x          vs17,   o16,    T1
2588         lxvd2x          vs18,   o32,    T1
2589         lxvd2x          vs19,   o48,    T1
2590
2591 #endif
2592
2593
2594         xxlxor          vs0,    vs0,    vs0
2595         xxlxor          vs1,    vs1,    vs1
2596         xxswapd         vs33,   vs33                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
2597
2598         XSFADD_R1       vs0,    vs0,    vs32            // realA*realB
2599         XSFADD_R2       vs0,    vs0,    vs33            // imagA*imagB
2600
2601         xxswapd         vs32,   vs32                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
2602         xxswapd         vs33,   vs33                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
2603
2604         XSFADD_I1       vs1,    vs1,    vs32            // imagA*realB
2605         XSFADD_I2       vs1,    vs1,    vs33            // realA*imagB
2606
2607         xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
2608         xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
2609         xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
2610         xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
2611
2612         xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
2613         xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
2614         xxpermdi        vs8,    vs2,    vs3,    0       // merge real and imag part
2615
2616
2617
2618         xxlxor          vs0,    vs0,    vs0
2619         xxlxor          vs1,    vs1,    vs1
2620         xxswapd         vs35,   vs35                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
2621
2622         XSFADD_R1       vs0,    vs0,    vs34            // realA*realB
2623         XSFADD_R2       vs0,    vs0,    vs35            // imagA*imagB
2624
2625         xxswapd         vs34,   vs34                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
2626         xxswapd         vs35,   vs35                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
2627
2628         XSFADD_I1       vs1,    vs1,    vs34            // imagA*realB
2629         XSFADD_I2       vs1,    vs1,    vs35            // realA*imagB
2630
2631         xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
2632         xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
2633         xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
2634         xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
2635
2636         xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
2637         xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
2638         xxpermdi        vs9,    vs2,    vs3,    0       // merge real and imag part
2639
2640
2641
2642         xxlxor          vs0,    vs0,    vs0
2643         xxlxor          vs1,    vs1,    vs1
2644         xxswapd         vs37,   vs37                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
2645
2646         XSFADD_R1       vs0,    vs0,    vs36            // realA*realB
2647         XSFADD_R2       vs0,    vs0,    vs37            // imagA*imagB
2648
2649         xxswapd         vs36,   vs36                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
2650         xxswapd         vs37,   vs37                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
2651
2652         XSFADD_I1       vs1,    vs1,    vs36            // imagA*realB
2653         XSFADD_I2       vs1,    vs1,    vs37            // realA*imagB
2654
2655         xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
2656         xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
2657         xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
2658         xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
2659
2660         xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
2661         xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
2662         xxpermdi        vs10,   vs2,    vs3,    0       // merge real and imag part
2663
2664
2665
2666         xxlxor          vs0,    vs0,    vs0
2667         xxlxor          vs1,    vs1,    vs1
2668         xxswapd         vs39,   vs39                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
2669
2670         XSFADD_R1       vs0,    vs0,    vs38            // realA*realB
2671         XSFADD_R2       vs0,    vs0,    vs39            // imagA*imagB
2672
2673         xxswapd         vs38,   vs38                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
2674         xxswapd         vs39,   vs39                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
2675
2676         XSFADD_I1       vs1,    vs1,    vs38            // imagA*realB
2677         XSFADD_I2       vs1,    vs1,    vs39            // realA*imagB
2678
2679         xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
2680         xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
2681         xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
2682         xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
2683
2684         xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
2685         xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
2686         xxpermdi        vs11,   vs2,    vs3,    0       // merge real and imag part
2687
2688
2689 #ifndef TRMMKERNEL
2690
2691         xvadddp         vs8,    vs8,    vs16
2692         xvadddp         vs9,    vs9,    vs17
2693         xvadddp         vs10,   vs10,   vs18
2694         xvadddp         vs11,   vs11,   vs19
2695
2696 #endif
2697
2698         stxvd2x         vs8,    o0,     T1
2699         stxvd2x         vs9,    o16,    T1
2700         stxvd2x         vs10,   o32,    T1
2701         stxvd2x         vs11,   o48,    T1
2702
2703         add             T1,     T1,     LDC
2704         addi            CO,     CO,     64
2705
2706 .endm
2707
2708
2709 /**********************************************************************************************
2710 * Macros for N=1 and M=2
2711 **********************************************************************************************/
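/* Same scheme as the 1x4 block above, narrowed to two complex elements of A
 * (vs0/vs1, alternating with vs8/vs9) and two accumulator pairs vs32..vs35;
 * SAVE1x2 writes 32 bytes per call and advances CO accordingly.
 */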
2712
2713 .macro LOAD1x2_1
2714
2715         lxvdsx          vs16,   o0,     BO              // load real part from B
2716         lxvdsx          vs17,   o8,     BO              // load imag part from B
2717
2718         addi            BO,     BO,     16
2719
2720         lxvd2x          vs0,    o0,     AO              // load real,imag from A
2721         lxvd2x          vs1,    o16,    AO              // load real,imag from A
2722
2723         addi            AO,     AO,     32
2724
2725
2726 .endm
2727
2728 .macro KERNEL1x2_I1
2729
2730         lxvd2x          vs8,    o0,     AO              // load real,imag from A
2731         lxvd2x          vs9,    o16,    AO              // load real,imag from A
2732
2733         addi            AO,     AO,     32
2734
2735         lxvdsx          vs20,   o0,     BO              // load real part from B
2736         lxvdsx          vs21,   o8,     BO              // load imag part from B
2737
2738         addi            BO,     BO,     16
2739
2740         xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
2741         xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
2742         xvmuldp         vs34,   vs1,    vs16            // real*real, imag*real
2743         xvmuldp         vs35,   vs1,    vs17            // real*imag, imag*imag
2744
2745
2746 .endm
2747
2748 .macro KERNEL1x2_1
2749
2750         lxvd2x          vs8,    o0,     AO              // load real,imag from A
2751         lxvd2x          vs9,    o16,    AO              // load real,imag from A
2752
2753         addi            AO,     AO,     32
2754
2755         lxvdsx          vs20,   o0,     BO              // load real part from B
2756         lxvdsx          vs21,   o8,     BO              // load imag part from B
2757
2758         addi            BO,     BO,     16
2759
2760         xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
2761         xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
2762         xvmaddadp       vs34,   vs1,    vs16            // real*real, imag*real
2763         xvmaddadp       vs35,   vs1,    vs17            // real*imag, imag*imag
2764
2765
2766 .endm
2767
2768 .macro KERNEL1x2_2
2769
2770         lxvd2x          vs0,    o0,     AO              // load real,imag from A
2771         lxvd2x          vs1,    o16,    AO              // load real,imag from A
2772
2773         addi            AO,     AO,     32
2774
2775         lxvdsx          vs16,   o0,     BO              // load real part from B
2776         lxvdsx          vs17,   o8,     BO              // load imag part from B
2777
2778         addi            BO,     BO,     16
2779
2780         xvmaddadp       vs32,   vs8,    vs20            // real*real, imag*real
2781         xvmaddadp       vs33,   vs8,    vs21            // real*imag, imag*imag
2782         xvmaddadp       vs34,   vs9,    vs20            // real*real, imag*real
2783         xvmaddadp       vs35,   vs9,    vs21            // real*imag, imag*imag
2784
2785
2786 .endm
2787
2788 .macro KERNEL1x2_E2
2789
2790
2791         xvmaddadp       vs32,   vs8,    vs20            // real*real, imag*real
2792         xvmaddadp       vs33,   vs8,    vs21            // real*imag, imag*imag
2793         xvmaddadp       vs34,   vs9,    vs20            // real*real, imag*real
2794         xvmaddadp       vs35,   vs9,    vs21            // real*imag, imag*imag
2795
2796
2797 .endm
2798
2799 .macro KERNEL1x2_SUBI1
2800
2801         lxvd2x          vs0,    o0,     AO              // load real,imag from A
2802         lxvd2x          vs1,    o16,    AO              // load real,imag from A
2803
2804         addi            AO,     AO,     32
2805
2806         lxvdsx          vs16,   o0,     BO              // load real part from B
2807         lxvdsx          vs17,   o8,     BO              // load imag part from B
2808
2809         addi            BO,     BO,     16
2810
2811         xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
2812         xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
2813         xvmuldp         vs34,   vs1,    vs16            // real*real, imag*real
2814         xvmuldp         vs35,   vs1,    vs17            // real*imag, imag*imag
2815
2816
2817 .endm
2818
2819 .macro KERNEL1x2_SUB1
2820
2821         lxvd2x          vs0,    o0,     AO              // load real,imag from A
2822         lxvd2x          vs1,    o16,    AO              // load real,imag from A
2823
2824         addi            AO,     AO,     32
2825
2826         lxvdsx          vs16,   o0,     BO              // load real part from B
2827         lxvdsx          vs17,   o8,     BO              // load imag part from B
2828
2829         addi            BO,     BO,     16
2830
2831         xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
2832         xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
2833         xvmaddadp       vs34,   vs1,    vs16            // real*real, imag*real
2834         xvmaddadp       vs35,   vs1,    vs17            // real*imag, imag*imag
2835
2836
2837 .endm
2838
2839 .macro SAVE1x2
2840
2841
2842         mr              T1,     CO
2843
2844 #ifndef TRMMKERNEL
2845
2846         lxvd2x          vs16,   o0,     T1
2847         lxvd2x          vs17,   o16,    T1
2848
2849 #endif
2850
2851
2852         xxlxor          vs0,    vs0,    vs0
2853         xxlxor          vs1,    vs1,    vs1
2854         xxswapd         vs33,   vs33                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
2855
2856         XSFADD_R1       vs0,    vs0,    vs32            // realA*realB
2857         XSFADD_R2       vs0,    vs0,    vs33            // imagA*imagB
2858
2859         xxswapd         vs32,   vs32                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
2860         xxswapd         vs33,   vs33                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
2861
2862         XSFADD_I1       vs1,    vs1,    vs32            // imagA*realB
2863         XSFADD_I2       vs1,    vs1,    vs33            // realA*imagB
2864
2865         xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
2866         xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
2867         xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
2868         xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
2869
2870         xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
2871         xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
2872         xxpermdi        vs8,    vs2,    vs3,    0       // merge real and imag part
2873
2874
2875
2876         xxlxor          vs0,    vs0,    vs0
2877         xxlxor          vs1,    vs1,    vs1
2878         xxswapd         vs35,   vs35                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
2879
2880         XSFADD_R1       vs0,    vs0,    vs34            // realA*realB
2881         XSFADD_R2       vs0,    vs0,    vs35            // imagA*imagB
2882
2883         xxswapd         vs34,   vs34                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
2884         xxswapd         vs35,   vs35                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
2885
2886         XSFADD_I1       vs1,    vs1,    vs34            // imagA*realB
2887         XSFADD_I2       vs1,    vs1,    vs35            // realA*imagB
2888
2889         xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
2890         xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
2891         xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
2892         xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
2893
2894         xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
2895         xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
2896         xxpermdi        vs9,    vs2,    vs3,    0       // merge real and imag part
2897
2898
2899 #ifndef TRMMKERNEL
2900
2901         xvadddp         vs8,    vs8,    vs16
2902         xvadddp         vs9,    vs9,    vs17
2903
2904 #endif
2905
2906         stxvd2x         vs8,    o0,     T1
2907         stxvd2x         vs9,    o16,    T1
2908
2909         add             T1,     T1,     LDC
2910         addi            CO,     CO,     32
2911
2912 .endm
2913
2914
2915 /**********************************************************************************************
2916 * Macros for N=1 and M=1
2917 **********************************************************************************************/
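/* Worked example for the 1x1 case: with a = ar + i*ai from A and b = br + i*bi from B,
 * each update keeps two accumulators,
 *     vs32 += (ar*br, ai*br)      // a * splat(br)
 *     vs33 += (ar*bi, ai*bi)      // a * splat(bi)
 * and SAVE1x1 combines them into the complex result
 *     real = ar*br -/+ ai*bi,   imag = ai*br +/- ar*bi
 * (sign selection via the XSFADD_* defines for the conjugated variants), then applies
 * alpha exactly as in the wider SAVE macros.
 */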
2918
2919 .macro LOAD1x1_1
2920
2921         lxvdsx          vs16,   o0,     BO              // load real part from B
2922         lxvdsx          vs17,   o8,     BO              // load imag part from B
2923
2924         addi            BO,     BO,     16
2925
2926         lxvd2x          vs0,    o0,     AO              // load real,imag from A
2927
2928         addi            AO,     AO,     16
2929
2930
2931 .endm
2932
2933 .macro KERNEL1x1_I1
2934
2935         lxvd2x          vs8,    o0,     AO              // load real,imag from A
2936
2937         addi            AO,     AO,     16
2938
2939         lxvdsx          vs20,   o0,     BO              // load real part from B
2940         lxvdsx          vs21,   o8,     BO              // load imag part from B
2941
2942         addi            BO,     BO,     16
2943
2944         xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
2945         xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
2946
2947
2948 .endm
2949
2950 .macro KERNEL1x1_1
2951
2952         lxvd2x          vs8,    o0,     AO              // load real,imag from A
2953
2954         addi            AO,     AO,     16
2955
2956         lxvdsx          vs20,   o0,     BO              // load real part from B
2957         lxvdsx          vs21,   o8,     BO              // load imag part from B
2958
2959         addi            BO,     BO,     16
2960
2961         xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
2962         xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
2963
2964
2965 .endm
2966
2967 .macro KERNEL1x1_2
2968
2969         lxvd2x          vs0,    o0,     AO              // load real,imag from A
2970
2971         addi            AO,     AO,     16
2972
2973         lxvdsx          vs16,   o0,     BO              // load real part from B
2974         lxvdsx          vs17,   o8,     BO              // load imag part from B
2975
2976         addi            BO,     BO,     16
2977
2978         xvmaddadp       vs32,   vs8,    vs20            // real*real, imag*real
2979         xvmaddadp       vs33,   vs8,    vs21            // real*imag, imag*imag
2980
2981
2982 .endm
2983
2984 .macro KERNEL1x1_E2
2985
2986
2987         xvmaddadp       vs32,   vs8,    vs20            // real*real, imag*real
2988         xvmaddadp       vs33,   vs8,    vs21            // real*imag, imag*imag
2989
2990
2991 .endm
2992
2993 .macro KERNEL1x1_SUBI1
2994
2995         lxvd2x          vs0,    o0,     AO              // load real,imag from A
2996
2997         addi            AO,     AO,     16
2998
2999         lxvdsx          vs16,   o0,     BO              // load real part from B
3000         lxvdsx          vs17,   o8,     BO              // load imag part from B
3001
3002         addi            BO,     BO,     16
3003
3004         xvmuldp         vs32,   vs0,    vs16            // real*real, imag*real
3005         xvmuldp         vs33,   vs0,    vs17            // real*imag, imag*imag
3006
3007
3008 .endm
3009
3010 .macro KERNEL1x1_SUB1
3011
3012         lxvd2x          vs0,    o0,     AO              // load real,imag from A
3013
3014         addi            AO,     AO,     16
3015
3016         lxvdsx          vs16,   o0,     BO              // load real part from B
3017         lxvdsx          vs17,   o8,     BO              // load imag part from B
3018
3019         addi            BO,     BO,     16
3020
3021         xvmaddadp       vs32,   vs0,    vs16            // real*real, imag*real
3022         xvmaddadp       vs33,   vs0,    vs17            // real*imag, imag*imag
3023
3024
3025 .endm
3026
3027 .macro SAVE1x1
3028
3029
3030         mr              T1,     CO
3031
3032 #ifndef TRMMKERNEL
3033
3034         lxvd2x          vs16,   o0,     T1
3035
3036 #endif
3037
3038
3039         xxlxor          vs0,    vs0,    vs0
3040         xxlxor          vs1,    vs1,    vs1
3041         xxswapd         vs33,   vs33                    // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
3042
3043         XSFADD_R1       vs0,    vs0,    vs32            // realA*realB
3044         XSFADD_R2       vs0,    vs0,    vs33            // imagA*imagB
3045
3046         xxswapd         vs32,   vs32                    // realA*realB, imagA*realB -> imagA*realB, realA*realB
3047         xxswapd         vs33,   vs33                    // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
3048
3049         XSFADD_I1       vs1,    vs1,    vs32            // imagA*realB
3050         XSFADD_I2       vs1,    vs1,    vs33            // realA*imagB
3051
3052         xsmuldp         vs4,    vs0,    alpha_r         // real*alpha_r
3053         xsmuldp         vs5,    vs1,    alpha_i         // imag*alpha_i
3054         xsmuldp         vs6,    vs0,    alpha_i         // real*alpha_i
3055         xsmuldp         vs7,    vs1,    alpha_r         // imag*alpha_r
3056
3057         xssubdp         vs2,    vs4,    vs5             // real*alpha_r - imag*alpha_i
3058         xsadddp         vs3,    vs6,    vs7             // real*alpha_i + imag*alpha_r
3059         xxpermdi        vs8,    vs2,    vs3,    0       // merge real and imag part
3060
3061
3062 #ifndef TRMMKERNEL
3063
3064         xvadddp         vs8,    vs8,    vs16
3065
3066 #endif
3067
3068         stxvd2x         vs8,    o0,     T1
3069
3070         add             T1,     T1,     LDC
3071         addi            CO,     CO,     16
3072
3073 .endm
3074