1 dnl Intel P6 mpn_sqr_basecase -- square an mpn number.
3 dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or
8 dnl modify it under the terms of the GNU Lesser General Public License as
9 dnl published by the Free Software Foundation; either version 3 of the
10 dnl License, or (at your option) any later version.
12 dnl The GNU MP Library is distributed in the hope that it will be useful,
13 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 dnl Lesser General Public License for more details.
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20 include(`../config.m4')
23 C P6: approx 4.0 cycles per cross product, or 7.75 cycles per triangular
24 C product (measured on the speed difference between 20 and 40 limbs,
25 C which is the Karatsuba recursing range).
28 dnl These are the same as in mpn/x86/k6/sqr_basecase.asm, see that file for
29 dnl a description. The only difference here is that UNROLL_COUNT can go up
30 dnl to 64 (not 63) making SQR_TOOM2_THRESHOLD_MAX 67.
32 deflit(SQR_TOOM2_THRESHOLD_MAX, 67)
34 ifdef(`SQR_TOOM2_THRESHOLD_OVERRIDE',
35 `define(`SQR_TOOM2_THRESHOLD',SQR_TOOM2_THRESHOLD_OVERRIDE)')
37 m4_config_gmp_mparam(`SQR_TOOM2_THRESHOLD')
38 deflit(UNROLL_COUNT, eval(SQR_TOOM2_THRESHOLD-3))
41 C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
43 C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
44 C lot of function call overheads are avoided, especially when the given size is small.
47 C The code size might look a bit excessive, but not all of it is executed so
48 C it won't all get into the code cache. The 1x1, 2x2 and 3x3 special cases
49 C clearly apply only to those sizes; mid sizes like 10x10 only need part of
50 C the unrolled addmul; and big sizes like 40x40 that do use the full
51 C unrolling will at least be making good use of it, because 40x40 will take
52 C something like 7000 cycles.
54 defframe(PARAM_SIZE,12)
55 defframe(PARAM_SRC, 8)
56 defframe(PARAM_DST, 4)
60 PROLOGUE(mpn_sqr_basecase)
75 C -----------------------------------------------------------------------------
90 C -----------------------------------------------------------------------------
97 defframe(SAVE_ESI, -4)
98 defframe(SAVE_EBX, -8)
99 defframe(SAVE_EDI, -12)
100 defframe(SAVE_EBP, -16)
101 deflit(`STACK_SPACE',16)
103 subl $STACK_SPACE, %esp
104 deflit(`FRAME',STACK_SPACE)
112 movl %eax, (%ecx) C dst[0]
116 movl %edx, %ebx C dst[1]
121 movl %eax, %edi C dst[2]
125 movl %edx, %ebp C dst[3]
127 mull 4(%esi) C src[0]*src[1]
154 C -----------------------------------------------------------------------------
162 pushl %esi defframe_pushl(`SAVE_ESI')
169 C -----------------------------------------------------------------------------
180 pushl %ebp defframe_pushl(`SAVE_EBP')
181 pushl %edi defframe_pushl(`SAVE_EDI')
183 mull %eax C src[0] ^ 2
191 mull %eax C src[1] ^ 2
197 pushl %ebx defframe_pushl(`SAVE_EBX')
199 mull %eax C src[2] ^ 2
206 mull 4(%esi) C src[0] * src[1]
213 mull 8(%esi) C src[0] * src[2]
221 mull 8(%esi) C src[1] * src[2]
230 C esi zero, will be dst[5]
268 adcl %esi, %eax C no carry out of this
278 C -----------------------------------------------------------------------------
279 defframe(VAR_COUNTER,-20)
280 defframe(VAR_JMP, -24)
281 deflit(`STACK_SPACE',24)
291 deflit(`FRAME',4) dnl %esi already pushed
293 C First multiply src[0]*src[1..size-1] and store at dst[1..size].
295 subl $STACK_SPACE-FRAME, %esp
296 deflit(`FRAME',STACK_SPACE)
303 subl %edx, %ecx C -(size-1)
306 movl $0, %ebx C initial carry
308 leal (%esi,%edx,4), %esi C &src[size]
309 movl %eax, %ebp C multiplier
311 leal -4(%edi,%edx,4), %edi C &dst[size-1]
314 C This loop runs at just over 6 c/l.
319 C ecx counter, limbs, negative, -(size-1) to -1
333 movl %eax, 4(%edi,%ecx,4)
342 C Addmul src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2.
344 C The last two addmuls, which are the bottom right corner of the product
345 C triangle, are left to the end. These are src[size-3]*src[size-2,size-1]
346 C and src[size-2]*src[size-1]. If size is 4 then it's only these corner
347 C cases that need to be done.
349 C The unrolled code is the same as mpn_addmul_1(), see that routine for some
352 C VAR_COUNTER is the outer loop, running from -(size-4) to -1, inclusive.
354 C VAR_JMP is the computed jump into the unrolled code, stepped by one code
355 C chunk each outer loop.
357 dnl This is also hard-coded in the address calculation below.
358 deflit(CODE_BYTES_PER_LIMB, 15)
360 dnl With &src[size] and &dst[size-1] pointers, the displacements in the
361 dnl unrolled code fit in a byte for UNROLL_COUNT values up to 32, but above
362 dnl that an offset must be added to them.
364 ifelse(eval(UNROLL_COUNT>32),1,
365 eval((UNROLL_COUNT-32)*4),
376 movl PARAM_SIZE, %ecx
385 ifelse(OFFSET,0,,`subl $OFFSET, %esi')
391 leal L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
395 ifelse(OFFSET,0,,`subl $OFFSET, %edi')
397 C The calculated jump mustn't be before the start of the available
398 C code. This is the limit that UNROLL_COUNT puts on the src operand
399 C size, but checked here using the jump address directly.
402 `movl_text_address( L(unroll_inner_start), %eax)
406 C -----------------------------------------------------------------------------
410 C ebx high limb to store
412 C edx VAR_COUNTER, limbs, negative
413 C esi &src[size], constant
414 C edi dst ptr, second highest limb of last addmul
417 movl -12+OFFSET(%esi,%edx,4), %ebp C multiplier
418 movl %edx, VAR_COUNTER
420 movl -8+OFFSET(%esi,%edx,4), %eax C first limb of multiplicand
424 define(cmovX,`ifelse(eval(UNROLL_COUNT%2),1,`cmovz($@)',`cmovnz($@)')')
428 movl %edx, %ebx C high carry
431 movl %ecx, %edx C jump
433 movl %eax, %ecx C low carry
434 leal CODE_BYTES_PER_LIMB(%edx), %edx
436 cmovX( %ebx, %ecx) C high carry reverse
437 cmovX( %eax, %ebx) C low carry reverse
442 C Must be on an even address here so the low bit of the jump address
443 C will indicate which way around ecx/ebx should start.
447 L(unroll_inner_start):
456 C 15 code bytes each limb
457 C ecx/ebx reversed on each chunk
459 forloop(`i', UNROLL_COUNT, 1, `
460 deflit(`disp_src', eval(-i*4 + OFFSET))
461 deflit(`disp_dst', eval(disp_src))
463 m4_assert(`disp_src>=-128 && disp_src<128')
464 m4_assert(`disp_dst>=-128 && disp_dst<128')
467 Zdisp( movl, disp_src,(%esi), %eax)
469 Zdisp( addl, %ebx, disp_dst,(%edi))
474 dnl this one comes out last
475 Zdisp( movl, disp_src,(%esi), %eax)
477 Zdisp( addl, %ecx, disp_dst,(%edi))
485 addl %ebx, m4_empty_if_zero(OFFSET)(%edi)
487 movl VAR_COUNTER, %edx
490 movl %ecx, m4_empty_if_zero(OFFSET+4)(%edi)
494 jnz L(unroll_outer_top)
503 C -----------------------------------------------------------------------------
538 movl PARAM_SIZE, %ecx
549 C Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1].
551 subl $1, %ecx C size-1
552 xorl %eax, %eax C ready for final adcl, and clear carry
561 C ecx counter, size-1 to 1
562 C edx size-1 (for later use)
563 C esi src (for later use)
564 C edi dst, incrementing
577 movl %eax, 4(%edi) C dst most significant limb
578 movl (%esi), %eax C src[0]
580 leal 4(%esi,%edx,4), %esi C &src[size]
581 subl %edx, %ecx C -(size-1)
584 C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
585 C src[size-1]^2. dst[0] hasn't yet been set at all yet, and just gets the
586 C low limb of src[0]^2.
591 movl %eax, (%edi,%ecx,8) C dst[0]
597 C ecx counter, negative
603 movl (%esi,%ecx,4), %eax
608 addl %ebx, 4(%edi,%ecx,8)
609 adcl %eax, 8(%edi,%ecx,8)
619 addl %edx, 4(%edi) C dst most significant limb
628 C -----------------------------------------------------------------------------
632 addl $L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx