dnl AMD K6 mpn_sqr_basecase -- square an mpn number.

dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.

dnl This file is part of the GNU MP Library.

dnl The GNU MP Library is free software; you can redistribute it and/or
dnl modify it under the terms of the GNU Lesser General Public License as
dnl published by the Free Software Foundation; either version 3 of the
dnl License, or (at your option) any later version.

dnl The GNU MP Library is distributed in the hope that it will be useful,
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl Lesser General Public License for more details.

dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C K6: approx 4.7 cycles per cross product, or 9.2 cycles per triangular
C     product (measured on the speed difference between 17 and 33 limbs,
C     which is roughly the Karatsuba recursing range).
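C
C     As a rough consistency check on those two figures: between 17 and
C     33 limbs a square gains 33*34/2 - 17*18/2 = 408 triangular products,
C     while the equivalent full multiply would gain 33^2 - 17^2 = 800
C     cross products, and 800/408 = 1.96 matches the ratio 9.2/4.7.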
dnl SQR_TOOM2_THRESHOLD_MAX is the maximum SQR_TOOM2_THRESHOLD this
dnl code supports.  This value is used only by the tune program to know
dnl what it can go up to.  (An attempt to compile with a bigger value will
dnl trigger some m4_assert()s in the code, making the build fail.)
dnl
dnl The value is determined by requiring the displacements in the unrolled
dnl addmul to fit in single bytes.  This means a maximum UNROLL_COUNT of
dnl 63, giving a maximum SQR_TOOM2_THRESHOLD of 66.

deflit(SQR_TOOM2_THRESHOLD_MAX, 66)
dnl Allow a value from the tune program to override config.m4.

ifdef(`SQR_TOOM2_THRESHOLD_OVERRIDE',
`define(`SQR_TOOM2_THRESHOLD',SQR_TOOM2_THRESHOLD_OVERRIDE)')
dnl UNROLL_COUNT is the number of code chunks in the unrolled addmul.  The
dnl number required is determined by SQR_TOOM2_THRESHOLD, since
dnl mpn_sqr_basecase only needs to handle sizes < SQR_TOOM2_THRESHOLD.
dnl
dnl The first addmul is the biggest, and this takes the second least
dnl significant limb and multiplies it by the third least significant and
dnl up.  Hence for a maximum operand size of SQR_TOOM2_THRESHOLD-1
dnl limbs, UNROLL_COUNT needs to be SQR_TOOM2_THRESHOLD-3.

m4_config_gmp_mparam(`SQR_TOOM2_THRESHOLD')
deflit(UNROLL_COUNT, eval(SQR_TOOM2_THRESHOLD-3))
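
dnl For example, at the maximum SQR_TOOM2_THRESHOLD of 66 the biggest
dnl operand is 66-1 = 65 limbs, whose first addmul multiplies src[1] by
dnl the 65-2 = 63 limbs src[2..64], hence the maximum UNROLL_COUNT of 63.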
C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
C
C The algorithm is essentially the same as mpn/generic/sqr_basecase.c, but a
C lot of function call overheads are avoided, especially when the given size
C is small.
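C
C For reference, here is a rough C rendering of those passes, simplified
C from mpn/generic/sqr_basecase.c.  mpn_mul_1, mpn_addmul_1 and mpn_lshift
C are the real mpn routines; add_diagonal_squares is a hypothetical name
C for the final pass, spelled out in the comments before the diagonal code
C further down.  Sizes 1 to 3 are special-cased below, so take size >= 2.
C
C	void
C	sqr_basecase_sketch (mp_ptr dst, mp_srcptr src, mp_size_t size)
C	{
C	  mp_size_t  i;
C
C	  /* cross products src[i]*src[j], i<j, accumulated at dst[i+j] */
C	  dst[size] = mpn_mul_1 (dst+1, src+1, size-1, src[0]);
C	  for (i = 1; i < size-1; i++)
C	    dst[size+i] = mpn_addmul_1 (dst+2*i+1, src+i+1, size-i-1, src[i]);
C
C	  /* double them; the bit shifted out becomes dst[2*size-1] */
C	  dst[2*size-1] = mpn_lshift (dst+1, dst+1, 2*size-2, 1);
C
C	  /* add src[i]^2 at dst[2*i] for each i */
C	  add_diagonal_squares (dst, src, size);
C	}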
C The code size might look a bit excessive, but not all of it is executed
C and so won't fill up the code cache.  The 1x1, 2x2 and 3x3 special cases
C clearly apply only to those sizes; mid sizes like 10x10 only need part of
C the unrolled addmul; and big sizes like 35x35 that do need all of it will
C at least be getting value for money, because 35x35 spends something like
C 5780 cycles here.
C
C Different values of UNROLL_COUNT give slightly different speeds, between
C 9.0 and 9.2 c/tri-prod measured on the difference between 17 and 33 limbs.
C This isn't a big difference, but it's presumably some alignment effect
C which, if understood, could give a simple speedup.
defframe(PARAM_SIZE,12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)
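
C These are the usual x86 cdecl stack offsets at function entry: the
C return address sits at 0(%esp), then dst, src and size at 4, 8 and 12.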
PROLOGUE(mpn_sqr_basecase)

C -----------------------------------------------------------------------------

C -----------------------------------------------------------------------------

	movl	%eax, %ebx	C src
	mull	%edx		C src[0]*src[1]

C -----------------------------------------------------------------------------

C -----------------------------------------------------------------------------
	movl	%eax, %ebx	C src
	movl	%edx, %ecx	C dst

	mull	%eax		C src[0] ^ 2
	mull	%eax		C src[1] ^ 2
	mull	%eax		C src[2] ^ 2

	mull	%edx		C src[0] * src[1]
	mull	%edx		C src[0] * src[2]
	mull	%edx		C src[1] * src[2]
C -----------------------------------------------------------------------------

defframe(SAVE_EBX,    -4)
defframe(SAVE_ESI,    -8)
defframe(SAVE_EDI,   -12)
defframe(SAVE_EBP,   -16)
defframe(VAR_COUNTER,-20)
defframe(VAR_JMP,    -24)
deflit(STACK_SPACE, 24)
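
C The negative defframe values are offsets relative to the frame size, so
C once the "subl $STACK_SPACE, %esp" below has set FRAME to 24, SAVE_EBX
C is 24-4 = 20(%esp) and VAR_JMP is 24-24 = 0(%esp).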
C First multiply src[0]*src[1..size-1] and store at dst[1..size].
C
C A test was done calling mpn_mul_1 here to get the benefit of its unrolled
C loop, but this was only a tiny speedup; at 35 limbs it took 24 cycles off
C a 5780 cycle operation, which is not surprising since the loop here is 8
C c/l and mpn_mul_1 is 6.25 c/l.
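C
C (The per-limb figures alone would predict roughly (8-6.25)*34 = 60
C cycles saved at 35 limbs, so call and setup overheads evidently give
C back more than half of that.)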
	subl	$STACK_SPACE, %esp	deflit(`FRAME',STACK_SPACE)

	movl	(%eax), %ebp		C multiplier
	leal	-1(%ecx), %ecx		C size-1, and pad to a 16 byte boundary
C Addmul src[n]*src[n+1..size-1] at dst[2*n+1...], for each n=1..size-2.
C
C The last two addmuls, which are the bottom right corner of the product
C triangle, are left to the end.  These are src[size-3]*src[size-2,size-1]
C and src[size-2]*src[size-1].  If size is 4 then it's only these corner
C cases that need to be done.
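C
C For example, at size=6 the unrolled code handles n=1 (src[1]*src[2..5])
C and n=2 (src[2]*src[3..5]), leaving src[3]*src[4..5] and src[4]*src[5]
C to the corner code.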
C The unrolled code is the same as in mpn_addmul_1(); see that routine for
C some comments.
C
C VAR_COUNTER is the outer loop counter, running from -(size-4) to -1,
C inclusive.
C
C VAR_JMP is the computed jump into the unrolled code, stepped by one code
C chunk each outer loop.
C
C K6 doesn't do any branch prediction on indirect jumps, which is good
C actually because it's a different target each time.  The unrolled addmul
C is about 3 cycles/limb faster than a simple loop, so the 6 cycle cost of
C the indirect jump is quickly recovered.
dnl This value is also implicitly encoded in a shift and add.

deflit(CODE_BYTES_PER_LIMB, 15)
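
dnl In GNU C terms the entry point arithmetic amounts to the following
dnl (a sketch using the computed-goto extension; `chunks', standing for
dnl the number of chunks to execute, is illustrative):
dnl
dnl	void *entry = &&unroll_inner_end - chunks * CODE_BYTES_PER_LIMB;
dnl	goto *entry;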
dnl With the unmodified &src[size] and &dst[size] pointers, the
dnl displacements in the unrolled code fit in a byte for UNROLL_COUNT
dnl values up to 31.  Above that an offset must be added to them.
deflit(OFFSET,
ifelse(eval(UNROLL_COUNT>31),1,
eval((UNROLL_COUNT-31)*4),
0))
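
dnl For instance UNROLL_COUNT 63 gives OFFSET (63-31)*4 = 128, so the
dnl furthest src displacement becomes -63*4+128 = -124 and the furthest
dnl dst displacement -128, both just within signed byte range.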
	movl	PARAM_SIZE, %ecx

ifelse(OFFSET,0,,
`	subl	$OFFSET, %ebx')

ifelse(OFFSET,0,,
`	subl	$OFFSET, %edi')
	leal	L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
C The calculated jump mustn't be before the start of the available
C code.  This is the limitation UNROLL_COUNT puts on the src operand
C size, but checked here using the jump address directly.

	movl_text_address( L(unroll_inner_start), %eax)
C -----------------------------------------------------------------------------

C ebx	&src[size], constant
C edx	VAR_COUNTER, limbs, negative
C esi	high limb to store
C edi	dst ptr, high of last addmul

	movl	-12+OFFSET(%ebx,%edx,4), %ebp	C multiplier
	movl	%edx, VAR_COUNTER
	movl	-8+OFFSET(%ebx,%edx,4), %eax	C first limb of multiplicand

	movl	%edx, %esi	C high carry
	movl	%ecx, %edx	C jump

	movl	%eax, %ecx	C low carry
	leal	CODE_BYTES_PER_LIMB(%edx), %edx
C A branch-free version of this using some xors was found to be a
C touch slower than just a conditional jump, despite the jump
C switching between taken and not taken on every loop.

ifelse(eval(UNROLL_COUNT%2),0,
	jz,jnz)	L(unroll_noswap)
	movl	%esi, %eax	C high,low carry other way around

C Must be on an even address here so the low bit of the jump address
C will indicate which way around ecx/esi should start.
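C
C (Each chunk is CODE_BYTES_PER_LIMB = 15 bytes, an odd count, so from an
C even starting address the successive entry points alternate between even
C and odd addresses, which is what lets the low bit encode the swap.)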
C An attempt was made at padding here to get the end of the unrolled
C code to come out on a good alignment, to save padding before
C L(corner).  This worked, but turned out to run slower than just an
C ALIGN(2).  The reason for this is not clear; it might be related
C to the different speeds on different UNROLL_COUNTs noted above.
L(unroll_inner_start):
	C 15 code bytes each limb
	C ecx/esi swapped on each chunk

forloop(`i', UNROLL_COUNT, 1, `
	deflit(`disp_src', eval(-i*4 + OFFSET))
	deflit(`disp_dst', eval(disp_src - 4))

	m4_assert(`disp_src>=-128 && disp_src<128')
	m4_assert(`disp_dst>=-128 && disp_dst<128')

Zdisp(	movl,	disp_src,(%ebx), %eax)
Zdisp(	addl,	%esi, disp_dst,(%edi))

	dnl  this one comes out last
Zdisp(	movl,	disp_src,(%ebx), %eax)
Zdisp(	addl,	%ecx, disp_dst,(%edi))
	addl	%esi, -4+OFFSET(%edi)
	movl	VAR_COUNTER, %edx

	movl	%ecx, m4_empty_if_zero(OFFSET)(%edi)

	jnz	L(unroll_outer_top)
C -----------------------------------------------------------------------------

C -----------------------------------------------------------------------------
C Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1].
C The loop measures about 6 cycles/iteration, though it looks like it should
C decode in 5.
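C
C In mpn terms this pass is just the following (a sketch using the real
C mpn_lshift, which returns the bits shifted out):
C
C	dst[2*size-1] = mpn_lshift (dst+1, dst+1, 2*size-2, 1);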
	movl	PARAM_SIZE, %ecx
	subl	$1, %ecx		C size-1 and clear carry
	xorl	%eax, %eax		C ready for adcl

C ebx	src (for later use)
C ecx	counter, decrementing
C edx	size-1 (for later use)
C edi	dst, incrementing

	movl	%eax, 4(%edi)		C dst most significant limb
	movl	(%ebx), %eax		C src[0]
	leal	4(%ebx,%edx,4), %ebx	C &src[size]
	subl	%edx, %ecx		C -(size-1)
C -----------------------------------------------------------------------------
C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
C src[size-1]^2.  dst[0] hasn't been set at all yet, and just gets the
C low limb of src[0]^2.
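C
C In C terms this pass is roughly the following (the body of the
C hypothetical add_diagonal_squares from the sketch near the top of this
C file; umul_ppmm is GMP's double-limb multiply macro from longlong.h, and
C the explicit carry juggling is illustrative, not how the asm does it):
C
C	void
C	add_diagonal_squares (mp_ptr dst, mp_srcptr src, mp_size_t size)
C	{
C	  mp_limb_t  cy = 0, hi, lo;
C	  mp_size_t  i;
C
C	  dst[0] = 0;	/* the asm instead stores the low limb of
C			   src[0]^2 straight into dst[0] */
C	  for (i = 0; i < size; i++)
C	    {
C	      umul_ppmm (hi, lo, src[i], src[i]);	/* hi:lo = src[i]^2 */
C	      lo += cy;          hi += (lo < cy);
C	      lo += dst[2*i];    hi += (lo < dst[2*i]);	/* hi can't wrap */
C	      hi += dst[2*i+1];  cy = (hi < dst[2*i+1]);
C	      dst[2*i] = lo;
C	      dst[2*i+1] = hi;
C	    }
C	  /* cy ends up 0, since the full square fits in 2*size limbs */
C	}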
	movl	%eax, (%edi,%ecx,8)	C dst[0]

C ecx	counter, negative

	movl	(%ebx,%ecx,4), %eax
	addl	%esi, 4(%edi,%ecx,8)
	adcl	%eax, 8(%edi,%ecx,8)

	addl	%edx, 4(%edi)		C dst most significant limb
C -----------------------------------------------------------------------------

C See mpn/x86/README about old gas bugs.

	addl	$L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx