1 dnl mpn_sqr_basecase for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
3 dnl Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of the GNU Lesser General Public License as published
9 dnl by the Free Software Foundation; either version 3 of the License, or (at
10 dnl your option) any later version.
12 dnl The GNU MP Library is distributed in the hope that it will be useful, but
13 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
15 dnl License for more details.
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20 include(`../config.m4')
23 C * Improve ad-hoc outer loop code and register handling. Some feed-in
24 C scheduling could improve things by several cycles per outer iteration.
25 C * In the Lam3...Lam1 code, keep accumulation operands in registers, without
26 C storing intermediates to rp.
27 C * We might want to keep 32 in a free mm register, since the register form is
28 C 3 bytes and the immediate form is 4 bytes. About 80 bytes to save.
29 C * Look into different loop alignment; we now expand the code by about 50
30 C bytes with possibly needless alignment.
31 C * Use OSP; that should solve the feed-in latency problems.
32 C * Address relative slowness for un<=3 on Pentium M. The old code is
33 C considerably faster there. (1:20/14, 2:34/32, 3:66/57)
42 PROLOGUE(mpn_sqr_basecase)
43 mov 4(%esp), %edx C rp
44 mov 8(%esp), %eax C up
45 mov 12(%esp), %ecx C un
55 L(un1): mov (%eax), %eax
61 L(un2): movd (%eax), %mm0 C un=2
62 movd (%eax), %mm2 C un=2
63 movd 4(%eax), %mm1 C un=2
64 pmuludq %mm0, %mm0 C 64b weight 0 un=2
65 pmuludq %mm1, %mm2 C 64b weight 32 un=2
66 pmuludq %mm1, %mm1 C 64b weight 64 un=2
67 movd %mm0, (%edx) C un=2
68 psrlq $32, %mm0 C 32b weight 32 un=2
69 pcmpeqd %mm7, %mm7 C un=2
70 psrlq $33, %mm7 C 0x000000007FFFFFFF un=2
71 pand %mm2, %mm7 C 31b weight 32 un=2
72 psrlq $31, %mm2 C 33b weight 65 un=2
73 psllq $1, %mm7 C 31b weight 33 un=2
74 paddq %mm7, %mm0 C un=2
75 movd %mm0, 4(%edx) C un=2
76 psrlq $32, %mm0 C un=2
77 paddq %mm2, %mm1 C un=2
78 paddq %mm0, %mm1 C un=2
79 movd %mm1, 8(%edx) C un=2
80 psrlq $32, %mm1 C un=2
81 movd %mm1, 12(%edx) C un=2
84 L(un3): movd (%eax), %mm7 C un=3
85 movd 4(%eax), %mm6 C un=3
86 pmuludq %mm7, %mm6 C un=3
87 movd 8(%eax), %mm2 C un=3
88 pmuludq %mm7, %mm2 C un=3
89 movd %mm6, 4(%edx) C un=3
90 psrlq $32, %mm6 C un=3
91 paddq %mm2, %mm6 C un=3
92 movd %mm6, 8(%edx) C un=3
93 psrlq $32, %mm6 C un=3
94 movd %mm6, 12(%edx) C un=3
95 lea 4(%edx), %edx C un=3
96 lea 4(%eax), %eax C un=3
98 L(un4): movd (%eax), %mm7 C un=4
99 movd 4(%eax), %mm6 C un=4
100 pmuludq %mm7, %mm6 C un=4
101 movd 8(%eax), %mm0 C un=4
102 pmuludq %mm7, %mm0 C un=4
103 movd 12(%eax), %mm1 C un=4
104 pmuludq %mm7, %mm1 C un=4
105 movd %mm6, 4(%edx) C un=4
106 psrlq $32, %mm6 C un=4
107 paddq %mm0, %mm6 C un=4
108 movd %mm6, 8(%edx) C un=4
109 psrlq $32, %mm6 C un=4
110 paddq %mm1, %mm6 C un=4
111 movd %mm6, 12(%edx) C un=4
112 psrlq $32, %mm6 C un=4
113 movd %mm6, 16(%edx) C un=4
114 lea 4(%edx), %edx C un=4
115 lea 4(%eax), %eax C un=4
123 lea 4(%eax), %esi C init up, up++
124 lea 4(%eax), %eax C up2++ FIXME: should fix offsets
125 lea 4(%edx), %edi C init rp, rp++
126 lea 4(%edx), %edx C rp2++
127 lea -4(%ecx), %ebx C loop count
135 movd (%eax), %mm4 C m 1
136 lea (%ebx), %ecx C inner loop count m 1
137 pmuludq %mm7, %mm4 C m 1
138 movd 4(%eax), %mm3 C m 1
139 pmuludq %mm7, %mm3 C m 1
140 movd 8(%eax), %mm0 C m 1
144 pmuludq %mm7, %mm4 C m 1
145 paddq %mm0, %mm6 C m 1
146 movd 4(%eax), %mm3 C m 1
147 movd %mm6, -8(%edx) C m 1
148 psrlq $32, %mm6 C m 1
149 pmuludq %mm7, %mm3 C m 1
150 paddq %mm1, %mm6 C m 1
151 movd 8(%eax), %mm0 C m 1
152 movd %mm6, -4(%edx) C m 1
153 psrlq $32, %mm6 C m 1
154 L(m01): pmuludq %mm7, %mm0 C m 1
155 paddq %mm4, %mm6 C m 1
156 movd 12(%eax), %mm1 C m 1
157 movd %mm6, (%edx) C m 1
158 psrlq $32, %mm6 C m 1
159 pmuludq %mm7, %mm1 C m 1
160 paddq %mm3, %mm6 C m 1
161 movd 16(%eax), %mm4 C m 1
162 movd %mm6, 4(%edx) C m 1
163 psrlq $32, %mm6 C m 1
164 lea 16(%eax), %eax C m 1
165 lea 16(%edx), %edx C m 1
168 pmuludq %mm7, %mm4 C m 1
169 paddq %mm0, %mm6 C m 1
170 movd %mm6, -8(%edx) C m 1
171 psrlq $32, %mm6 C m 1
172 paddq %mm1, %mm6 C m 1
176 movd (%eax), %mm1 C m 2
177 lea (%ebx), %ecx C inner loop count m 2
178 pmuludq %mm7, %mm1 C m 2
179 movd 4(%eax), %mm4 C m 2
180 pmuludq %mm7, %mm4 C m 2
181 movd 8(%eax), %mm3 C m 2
185 pmuludq %mm7, %mm4 C m 2
186 paddq %mm0, %mm6 C m 2
187 movd 8(%eax), %mm3 C m 2
188 movd %mm6, -4(%edx) C m 2
189 psrlq $32, %mm6 C m 2
190 L(m10): pmuludq %mm7, %mm3 C m 2
191 paddq %mm1, %mm6 C m 2
192 movd 12(%eax), %mm0 C m 2
193 movd %mm6, (%edx) C m 2
194 psrlq $32, %mm6 C m 2
195 pmuludq %mm7, %mm0 C m 2
196 paddq %mm4, %mm6 C m 2
197 movd 16(%eax), %mm1 C m 2
198 movd %mm6, 4(%edx) C m 2
199 psrlq $32, %mm6 C m 2
200 pmuludq %mm7, %mm1 C m 2
201 paddq %mm3, %mm6 C m 2
202 movd 20(%eax), %mm4 C m 2
203 movd %mm6, 8(%edx) C m 2
204 psrlq $32, %mm6 C m 2
205 lea 16(%eax), %eax C m 2
206 lea 16(%edx), %edx C m 2
209 pmuludq %mm7, %mm4 C m 2
210 paddq %mm0, %mm6 C m 2
211 movd %mm6, -4(%edx) C m 2
212 psrlq $32, %mm6 C m 2
213 paddq %mm1, %mm6 C m 2
217 movd (%eax), %mm0 C m 3
218 lea (%ebx), %ecx C inner loop count m 3
219 pmuludq %mm7, %mm0 C m 3
220 movd 4(%eax), %mm1 C m 3
221 pmuludq %mm7, %mm1 C m 3
222 movd 8(%eax), %mm4 C m 3
226 pmuludq %mm7, %mm4 C m 3
227 paddq %mm0, %mm6 C m 3
228 movd 12(%eax), %mm3 C m 3
229 movd %mm6, (%edx) C m 3
230 psrlq $32, %mm6 C m 3
231 pmuludq %mm7, %mm3 C m 3
232 paddq %mm1, %mm6 C m 3
233 movd 16(%eax), %mm0 C m 3
234 movd %mm6, 4(%edx) C m 3
235 psrlq $32, %mm6 C m 3
236 pmuludq %mm7, %mm0 C m 3
237 paddq %mm4, %mm6 C m 3
238 movd 20(%eax), %mm1 C m 3
239 movd %mm6, 8(%edx) C m 3
240 psrlq $32, %mm6 C m 3
241 pmuludq %mm7, %mm1 C m 3
242 paddq %mm3, %mm6 C m 3
243 movd 24(%eax), %mm4 C m 3
244 movd %mm6, 12(%edx) C m 3
245 psrlq $32, %mm6 C m 3
246 lea 16(%eax), %eax C m 3
247 lea 16(%edx), %edx C m 3
250 pmuludq %mm7, %mm4 C m 3
251 paddq %mm0, %mm6 C m 3
252 movd %mm6, (%edx) C m 3
253 psrlq $32, %mm6 C m 3
254 paddq %mm1, %mm6 C m 3
258 movd (%eax), %mm3 C m 0
259 lea (%ebx), %ecx C inner loop count m 0
260 pmuludq %mm7, %mm3 C m 0
261 movd 4(%eax), %mm0 C m 0
262 pmuludq %mm7, %mm0 C m 0
263 movd 8(%eax), %mm1 C m 0
267 pmuludq %mm7, %mm4 C m 0
268 paddq %mm0, %mm6 C m 0
269 movd (%eax), %mm3 C m 0
270 movd %mm6, -12(%edx) C m 0
271 psrlq $32, %mm6 C m 0
272 pmuludq %mm7, %mm3 C m 0
273 paddq %mm1, %mm6 C m 0
274 movd 4(%eax), %mm0 C m 0
275 movd %mm6, -8(%edx) C m 0
276 psrlq $32, %mm6 C m 0
277 pmuludq %mm7, %mm0 C m 0
278 paddq %mm4, %mm6 C m 0
279 movd 8(%eax), %mm1 C m 0
280 movd %mm6, -4(%edx) C m 0
281 psrlq $32, %mm6 C m 0
282 L(m00): pmuludq %mm7, %mm1 C m 0
283 paddq %mm3, %mm6 C m 0
284 movd 12(%eax), %mm4 C m 0
285 movd %mm6, (%edx) C m 0
286 psrlq $32, %mm6 C m 0
287 lea 16(%eax), %eax C m 0
288 lea 16(%edx), %edx C m 0
291 pmuludq %mm7, %mm4 C m 0
292 paddq %mm0, %mm6 C m 0
293 movd %mm6, -12(%edx) C m 0
294 psrlq $32, %mm6 C m 0
295 paddq %mm1, %mm6 C m 0
299 lea 8(%edi), %edi C rp += 2
300 movd (%esi), %mm7 C am 3
301 mov %edi, %edx C rp2 = rp am 3
302 lea 4(%esi), %esi C up++ am 3
303 lea (%esi), %eax C up2 = up am 3
304 movd (%eax), %mm0 C am 3
305 lea (%ebx), %ecx C inner loop count am 3
306 pxor %mm6, %mm6 C am 3
307 pmuludq %mm7, %mm0 C am 3
308 movd 4(%eax), %mm1 C am 3
309 movd (%edx), %mm4 C am 3
310 pmuludq %mm7, %mm1 C am 3
311 movd 8(%eax), %mm2 C am 3
312 paddq %mm0, %mm4 C am 3
313 movd 4(%edx), %mm5 C am 3
317 pmuludq %mm7, %mm2 C am 3
318 paddq %mm4, %mm6 C am 3
319 movd 12(%eax), %mm3 C am 3
320 paddq %mm1, %mm5 C am 3
321 movd 8(%edx), %mm4 C am 3
322 movd %mm6, (%edx) C am 3
323 psrlq $32, %mm6 C am 3
324 pmuludq %mm7, %mm3 C am 3
325 paddq %mm5, %mm6 C am 3
326 movd 16(%eax), %mm0 C am 3
327 paddq %mm2, %mm4 C am 3
328 movd 12(%edx), %mm5 C am 3
329 movd %mm6, 4(%edx) C am 3
330 psrlq $32, %mm6 C am 3
331 pmuludq %mm7, %mm0 C am 3
332 paddq %mm4, %mm6 C am 3
333 movd 20(%eax), %mm1 C am 3
334 paddq %mm3, %mm5 C am 3
335 movd 16(%edx), %mm4 C am 3
336 movd %mm6, 8(%edx) C am 3
337 psrlq $32, %mm6 C am 3
338 pmuludq %mm7, %mm1 C am 3
339 paddq %mm5, %mm6 C am 3
340 movd 24(%eax), %mm2 C am 3
341 paddq %mm0, %mm4 C am 3
342 movd 20(%edx), %mm5 C am 3
343 movd %mm6, 12(%edx) C am 3
344 psrlq $32, %mm6 C am 3
345 lea 16(%eax), %eax C am 3
346 lea 16(%edx), %edx C am 3
349 pmuludq %mm7, %mm2 C am 3
350 paddq %mm4, %mm6 C am 3
351 paddq %mm1, %mm5 C am 3
352 movd 8(%edx), %mm4 C am 3
353 movd %mm6, (%edx) C am 3
354 psrlq $32, %mm6 C am 3
355 paddq %mm5, %mm6 C am 3
356 paddq %mm2, %mm4 C am 3
357 L(2): movd %mm6, 4(%edx) C am 3
358 psrlq $32, %mm6 C am 3
359 paddq %mm4, %mm6 C am 3
360 movd %mm6, 8(%edx) C am 3
361 psrlq $32, %mm6 C am 3
362 movd %mm6, 12(%edx) C am 3
364 lea 8(%edi), %edi C rp += 2
365 movd (%esi), %mm7 C am 2
366 mov %edi, %edx C rp2 = rp am 2
367 lea 4(%esi), %esi C up++ am 2
368 lea (%esi), %eax C up2 = up am 2
369 movd (%eax), %mm1 C am 2
370 lea (%ebx), %ecx C inner loop count am 2
371 pxor %mm6, %mm6 C am 2
372 pmuludq %mm7, %mm1 C am 2
373 movd 4(%eax), %mm2 C am 2
374 movd (%edx), %mm5 C am 2
375 pmuludq %mm7, %mm2 C am 2
376 movd 8(%eax), %mm3 C am 2
377 paddq %mm1, %mm5 C am 2
378 movd 4(%edx), %mm4 C am 2
382 pmuludq %mm7, %mm2 C am 2
383 paddq %mm4, %mm6 C am 2
384 movd 8(%eax), %mm3 C am 2
385 paddq %mm1, %mm5 C am 2
386 movd 4(%edx), %mm4 C am 2
387 movd %mm6, -4(%edx) C am 2
388 psrlq $32, %mm6 C am 2
390 pmuludq %mm7, %mm3 C am 2
391 paddq %mm5, %mm6 C am 2
392 movd 12(%eax), %mm0 C am 2
393 paddq %mm2, %mm4 C am 2
394 movd 8(%edx), %mm5 C am 2
395 movd %mm6, (%edx) C am 2
396 psrlq $32, %mm6 C am 2
397 pmuludq %mm7, %mm0 C am 2
398 paddq %mm4, %mm6 C am 2
399 movd 16(%eax), %mm1 C am 2
400 paddq %mm3, %mm5 C am 2
401 movd 12(%edx), %mm4 C am 2
402 movd %mm6, 4(%edx) C am 2
403 psrlq $32, %mm6 C am 2
404 pmuludq %mm7, %mm1 C am 2
405 paddq %mm5, %mm6 C am 2
406 movd 20(%eax), %mm2 C am 2
407 paddq %mm0, %mm4 C am 2
408 movd 16(%edx), %mm5 C am 2
409 movd %mm6, 8(%edx) C am 2
410 psrlq $32, %mm6 C am 2
411 lea 16(%eax), %eax C am 2
412 lea 16(%edx), %edx C am 2
415 pmuludq %mm7, %mm2 C am 2
416 paddq %mm4, %mm6 C am 2
417 paddq %mm1, %mm5 C am 2
418 movd 4(%edx), %mm4 C am 2
419 movd %mm6, -4(%edx) C am 2
420 psrlq $32, %mm6 C am 2
421 paddq %mm5, %mm6 C am 2
422 paddq %mm2, %mm4 C am 2
423 L(1): movd %mm6, (%edx) C am 2
424 psrlq $32, %mm6 C am 2
425 paddq %mm4, %mm6 C am 2
426 movd %mm6, 4(%edx) C am 2
427 psrlq $32, %mm6 C am 2
428 movd %mm6, 8(%edx) C am 2
430 lea 8(%edi), %edi C rp += 2
431 movd (%esi), %mm7 C am 1
432 mov %edi, %edx C rp2 = rp am 1
433 lea 4(%esi), %esi C up++ am 1
434 lea (%esi), %eax C up2 = up am 1
435 movd (%eax), %mm2 C am 1
436 lea (%ebx), %ecx C inner loop count am 1
437 pxor %mm6, %mm6 C am 1
438 pmuludq %mm7, %mm2 C am 1
439 movd 4(%eax), %mm3 C am 1
440 movd (%edx), %mm4 C am 1
441 pmuludq %mm7, %mm3 C am 1
442 movd 8(%eax), %mm0 C am 1
443 paddq %mm2, %mm4 C am 1
444 movd 4(%edx), %mm5 C am 1
448 pmuludq %mm7, %mm2 C am 1
449 paddq %mm4, %mm6 C am 1
450 movd 4(%eax), %mm3 C am 1
451 paddq %mm1, %mm5 C am 1
452 movd (%edx), %mm4 C am 1
453 movd %mm6, -8(%edx) C am 1
454 psrlq $32, %mm6 C am 1
455 pmuludq %mm7, %mm3 C am 1
456 paddq %mm5, %mm6 C am 1
457 movd 8(%eax), %mm0 C am 1
458 paddq %mm2, %mm4 C am 1
459 movd 4(%edx), %mm5 C am 1
460 movd %mm6, -4(%edx) C am 1
461 psrlq $32, %mm6 C am 1
463 pmuludq %mm7, %mm0 C am 1
464 paddq %mm4, %mm6 C am 1
465 movd 12(%eax), %mm1 C am 1
466 paddq %mm3, %mm5 C am 1
467 movd 8(%edx), %mm4 C am 1
468 movd %mm6, (%edx) C am 1
469 psrlq $32, %mm6 C am 1
470 pmuludq %mm7, %mm1 C am 1
471 paddq %mm5, %mm6 C am 1
472 movd 16(%eax), %mm2 C am 1
473 paddq %mm0, %mm4 C am 1
474 movd 12(%edx), %mm5 C am 1
475 movd %mm6, 4(%edx) C am 1
476 psrlq $32, %mm6 C am 1
477 lea 16(%eax), %eax C am 1
478 lea 16(%edx), %edx C am 1
481 pmuludq %mm7, %mm2 C am 1
482 paddq %mm4, %mm6 C am 1
483 paddq %mm1, %mm5 C am 1
484 movd (%edx), %mm4 C am 1
485 movd %mm6, -8(%edx) C am 1
486 psrlq $32, %mm6 C am 1
487 paddq %mm5, %mm6 C am 1
488 paddq %mm2, %mm4 C am 1
489 L(0): movd %mm6, -4(%edx) C am 1
490 psrlq $32, %mm6 C am 1
491 paddq %mm4, %mm6 C am 1
492 movd %mm6, (%edx) C am 1
493 psrlq $32, %mm6 C am 1
494 movd %mm6, 4(%edx) C am 1
496 lea 8(%edi), %edi C rp += 2
497 movd (%esi), %mm7 C am 0
498 mov %edi, %edx C rp2 = rp am 0
499 lea 4(%esi), %esi C up++ am 0
500 lea (%esi), %eax C up2 = up am 0
501 movd (%eax), %mm3 C am 0
502 lea (%ebx), %ecx C inner loop count am 0
503 pxor %mm6, %mm6 C am 0
504 pmuludq %mm7, %mm3 C am 0
505 movd 4(%eax), %mm0 C am 0
506 movd (%edx), %mm5 C am 0
507 pmuludq %mm7, %mm0 C am 0
508 movd 8(%eax), %mm1 C am 0
509 paddq %mm3, %mm5 C am 0
510 movd 4(%edx), %mm4 C am 0
514 pmuludq %mm7, %mm2 C am 0
515 paddq %mm4, %mm6 C am 0
516 movd (%eax), %mm3 C am 0
517 paddq %mm1, %mm5 C am 0
518 movd -4(%edx), %mm4 C am 0
519 movd %mm6, -12(%edx) C am 0
520 psrlq $32, %mm6 C am 0
521 pmuludq %mm7, %mm3 C am 0
522 paddq %mm5, %mm6 C am 0
523 movd 4(%eax), %mm0 C am 0
524 paddq %mm2, %mm4 C am 0
525 movd (%edx), %mm5 C am 0
526 movd %mm6, -8(%edx) C am 0
527 psrlq $32, %mm6 C am 0
528 pmuludq %mm7, %mm0 C am 0
529 paddq %mm4, %mm6 C am 0
530 movd 8(%eax), %mm1 C am 0
531 paddq %mm3, %mm5 C am 0
532 movd 4(%edx), %mm4 C am 0
533 movd %mm6, -4(%edx) C am 0
534 psrlq $32, %mm6 C am 0
536 pmuludq %mm7, %mm1 C am 0
537 paddq %mm5, %mm6 C am 0
538 movd 12(%eax), %mm2 C am 0
539 paddq %mm0, %mm4 C am 0
540 movd 8(%edx), %mm5 C am 0
541 movd %mm6, (%edx) C am 0
542 psrlq $32, %mm6 C am 0
543 lea 16(%eax), %eax C am 0
544 lea 16(%edx), %edx C am 0
547 pmuludq %mm7, %mm2 C am 0
548 paddq %mm4, %mm6 C am 0
549 paddq %mm1, %mm5 C am 0
550 movd -4(%edx), %mm4 C am 0
551 movd %mm6, -12(%edx) C am 0
552 psrlq $32, %mm6 C am 0
553 paddq %mm5, %mm6 C am 0
554 paddq %mm2, %mm4 C am 0
555 L(3): movd %mm6, -8(%edx) C am 0
556 psrlq $32, %mm6 C am 0
557 paddq %mm4, %mm6 C am 0
558 movd %mm6, -4(%edx) C am 0
559 psrlq $32, %mm6 C am 0
560 movd %mm6, (%edx) C am 0
570 L(am3): C up[un-1..un-3] x up[un-4]
571 lea 8(%edx), %edx C rp2 += 2
593 movd %mm4, 12(%edx) C FIXME feed through!
596 L(am2): C up[un-1..un-2] x up[un-3]
597 lea 8(%edx), %edx C rp2 += 2
612 movd %mm4, 8(%edx) C FIXME feed through!
615 L(am1): C up[un-1] x up[un-2]
616 lea 8(%edx), %edx C rp2 += 2
626 C *** diag stuff, use elementary code for now
628 mov 4(%esp), %edx C rp
629 mov 8(%esp), %eax C up
630 mov 12(%esp), %ecx C un
633 pmuludq %mm2, %mm2 C src[0]^2
638 movd 4(%edx), %mm3 C dst[1]
643 psllq $1, %mm3 C 2*dst[1]
651 movd 4(%eax), %mm0 C src limb
655 pand %mm0, %mm1 C diagonal low
656 psrlq $32, %mm0 C diagonal high
659 psllq $1, %mm3 C 2*dst[i]
666 psllq $1, %mm3 C 2*dst[i+1]
676 movd 4(%eax), %mm0 C src[size-1]
678 pand %mm0, %mm7 C diagonal low
679 psrlq $32, %mm0 C diagonal high
681 movd 8(%edx), %mm3 C dst[2*size-2]
689 movd %mm2, 12(%edx) C dst[2*size-1]