mpn/x86/k7/sqr_basecase.asm

   1 dnl  AMD K7 mpn_sqr_basecase -- square an mpn number.
   2
   3 dnl  Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
   4 dnl
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or
   8 dnl  modify it under the terms of the GNU Lesser General Public License as
   9 dnl  published by the Free Software Foundation; either version 3 of the
  10 dnl  License, or (at your option) any later version.
  11 dnl
  12 dnl  The GNU MP Library is distributed in the hope that it will be useful,
  13 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 dnl  Lesser General Public License for more details.
  16 dnl
  17 dnl  You should have received a copy of the GNU Lesser General Public License
  18 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
  19
  20 include(`../config.m4')
  21
  22
  23 C K7: approx 2.3 cycles/crossproduct, or 4.55 cycles/triangular product
  24 C     (measured on the speed difference between 25 and 50 limbs, which is
  25 C     roughly the Karatsuba recursing range).
  26
  27
  28 dnl  These are the same as mpn/x86/k6/sqr_basecase.asm, see that code for
  29 dnl  some comments.
  30
  31 deflit(SQR_TOOM2_THRESHOLD_MAX, 66)
  32
  33 ifdef(`SQR_TOOM2_THRESHOLD_OVERRIDE',
  34 `define(`SQR_TOOM2_THRESHOLD',SQR_TOOM2_THRESHOLD_OVERRIDE)')
  35
  36 m4_config_gmp_mparam(`SQR_TOOM2_THRESHOLD')
  37 deflit(UNROLL_COUNT, eval(SQR_TOOM2_THRESHOLD-3))
  38
  39
  40 C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
  41 C
  42 C With a SQR_TOOM2_THRESHOLD around 50 this code is about 1500 bytes,
  43 C which is quite a bit, but is considered good value since squares big
  44 C enough to use most of the code will be spending quite a few cycles in it.
  45
  46
  47 defframe(PARAM_SIZE,12)
  48 defframe(PARAM_SRC, 8)
  49 defframe(PARAM_DST, 4)
  50
  51         TEXT
  52         ALIGN(32)
  53 PROLOGUE(mpn_sqr_basecase)
  54 deflit(`FRAME',0)
  55
  56         movl    PARAM_SIZE, %ecx
  57         movl    PARAM_SRC, %eax
  58         cmpl    $2, %ecx
  59
  60         movl    PARAM_DST, %edx
  61         je      L(two_limbs)
  62         ja      L(three_or_more)
  63
  64
  65 C------------------------------------------------------------------------------
  66 C one limb only
  67         C eax   src
  68         C ecx   size
  69         C edx   dst
  70
  71         movl    (%eax), %eax
  72         movl    %edx, %ecx
  73
  74         mull    %eax
  75
  76         movl    %edx, 4(%ecx)
  77         movl    %eax, (%ecx)
  78         ret
  79
  80
  81 C------------------------------------------------------------------------------
  82 C
  83 C Using the read/modify/write "add"s seems to be faster than saving and
  84 C restoring registers.  Perhaps the loads for the first set hide under the
  85 C mul latency and the second gets store to load forwarding.
  86
  87         ALIGN(16)
  88 L(two_limbs):
  89         C eax   src
  90         C ebx
  91         C ecx   size
  92         C edx   dst
  93 deflit(`FRAME',0)
  94
  95         pushl   %ebx            FRAME_pushl()
  96         movl    %eax, %ebx      C src
  97         movl    (%eax), %eax
  98
  99         movl    %edx, %ecx      C dst
 100
 101         mull    %eax            C src[0]^2
 102
 103         movl    %eax, (%ecx)    C dst[0]
 104         movl    4(%ebx), %eax
 105
 106         movl    %edx, 4(%ecx)   C dst[1]
 107
 108         mull    %eax            C src[1]^2
 109
 110         movl    %eax, 8(%ecx)   C dst[2]
 111         movl    (%ebx), %eax
 112
 113         movl    %edx, 12(%ecx)  C dst[3]
 114
 115         mull    4(%ebx)         C src[0]*src[1]
 116
 117         popl    %ebx
 118
 119         addl    %eax, 4(%ecx)
 120         adcl    %edx, 8(%ecx)
 121         adcl    $0, 12(%ecx)
 122         ASSERT(nc)
 123
 124         addl    %eax, 4(%ecx)
 125         adcl    %edx, 8(%ecx)
 126         adcl    $0, 12(%ecx)
 127         ASSERT(nc)
 128
 129         ret
 130
 131
 132 C------------------------------------------------------------------------------
 133 defframe(SAVE_EBX,  -4)
 134 defframe(SAVE_ESI,  -8)
 135 defframe(SAVE_EDI, -12)
 136 defframe(SAVE_EBP, -16)
 137 deflit(STACK_SPACE, 16)
 138
 139 L(three_or_more):
 140         subl    $STACK_SPACE, %esp
 141         cmpl    $4, %ecx
 142         jae     L(four_or_more)
 143 deflit(`FRAME',STACK_SPACE)
 144
 145
 146 C------------------------------------------------------------------------------
 147 C Three limbs
 148 C
 149 C Writing out the loads and stores separately at the end of this code comes
 150 C out about 10 cycles faster than using adcls to memory.
 151
 152         C eax   src
 153         C ecx   size
 154         C edx   dst
 155
 156         movl    %ebx, SAVE_EBX
 157         movl    %eax, %ebx      C src
 158         movl    (%eax), %eax
 159
 160         movl    %edx, %ecx      C dst
 161         movl    %esi, SAVE_ESI
 162         movl    %edi, SAVE_EDI
 163
 164         mull    %eax            C src[0] ^ 2
 165
 166         movl    %eax, (%ecx)
 167         movl    4(%ebx), %eax
 168         movl    %edx, 4(%ecx)
 169
 170         mull    %eax            C src[1] ^ 2
 171
 172         movl    %eax, 8(%ecx)
 173         movl    8(%ebx), %eax
 174         movl    %edx, 12(%ecx)
 175
 176         mull    %eax            C src[2] ^ 2
 177
 178         movl    %eax, 16(%ecx)
 179         movl    (%ebx), %eax
 180         movl    %edx, 20(%ecx)
 181
 182         mull    4(%ebx)         C src[0] * src[1]
 183
 184         movl    %eax, %esi
 185         movl    (%ebx), %eax
 186         movl    %edx, %edi
 187
 188         mull    8(%ebx)         C src[0] * src[2]
 189
 190         addl    %eax, %edi
 191         movl    %ebp, SAVE_EBP
 192         movl    $0, %ebp
 193
 194         movl    4(%ebx), %eax
 195         adcl    %edx, %ebp
 196
 197         mull    8(%ebx)         C src[1] * src[2]
 198
 199         xorl    %ebx, %ebx
 200         addl    %eax, %ebp
 201
 202         adcl    $0, %edx
 203
 204         C eax
 205         C ebx   zero, will be dst[5]
 206         C ecx   dst
 207         C edx   dst[4]
 208         C esi   dst[1]
 209         C edi   dst[2]
 210         C ebp   dst[3]
 211
 212         adcl    $0, %edx
 213         addl    %esi, %esi
 214
 215         adcl    %edi, %edi
 216         movl    4(%ecx), %eax
 217
 218         adcl    %ebp, %ebp
 219
 220         adcl    %edx, %edx
 221
 222         adcl    $0, %ebx
 223         addl    %eax, %esi
 224         movl    8(%ecx), %eax
 225
 226         adcl    %eax, %edi
 227         movl    12(%ecx), %eax
 228         movl    %esi, 4(%ecx)
 229
 230         adcl    %eax, %ebp
 231         movl    16(%ecx), %eax
 232         movl    %edi, 8(%ecx)
 233
 234         movl    SAVE_ESI, %esi
 235         movl    SAVE_EDI, %edi
 236
 237         adcl    %eax, %edx
 238         movl    20(%ecx), %eax
 239         movl    %ebp, 12(%ecx)
 240
 241         adcl    %ebx, %eax
 242         ASSERT(nc)
 243         movl    SAVE_EBX, %ebx
 244         movl    SAVE_EBP, %ebp
 245
 246         movl    %edx, 16(%ecx)
 247         movl    %eax, 20(%ecx)
 248         addl    $FRAME, %esp
 249
 250         ret
 251
 252
 253 C------------------------------------------------------------------------------
 254 L(four_or_more):
 255
 256 C First multiply src[0]*src[1..size-1] and store at dst[1..size].
 257 C Further products are added in rather than stored.
 258
 259         C eax   src
 260         C ebx
 261         C ecx   size
 262         C edx   dst
 263         C esi
 264         C edi
 265         C ebp
 266
 267 defframe(`VAR_COUNTER',-20)
 268 defframe(`VAR_JMP',    -24)
 269 deflit(EXTRA_STACK_SPACE, 8)
 270
 271         movl    %ebx, SAVE_EBX
 272         movl    %edi, SAVE_EDI
 273         leal    (%edx,%ecx,4), %edi     C &dst[size]
 274
 275         movl    %esi, SAVE_ESI
 276         movl    %ebp, SAVE_EBP
 277         leal    (%eax,%ecx,4), %esi     C &src[size]
 278
 279         movl    (%eax), %ebp            C multiplier
 280         movl    $0, %ebx
 281         decl    %ecx
 282
 283         negl    %ecx
 284         subl    $EXTRA_STACK_SPACE, %esp
 285 FRAME_subl_esp(EXTRA_STACK_SPACE)
 286
 287 L(mul_1):
 288         C eax   scratch
 289         C ebx   carry
 290         C ecx   counter
 291         C edx   scratch
 292         C esi   &src[size]
 293         C edi   &dst[size]
 294         C ebp   multiplier
 295
 296         movl    (%esi,%ecx,4), %eax
 297
 298         mull    %ebp
 299
 300         addl    %ebx, %eax
 301         movl    %eax, (%edi,%ecx,4)
 302         movl    $0, %ebx
 303
 304         adcl    %edx, %ebx
 305         incl    %ecx
 306         jnz     L(mul_1)
 307
 308
 309 C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2.
 310 C
 311 C The last two products, which are the bottom right corner of the product
 312 C triangle, are left to the end.  These are src[size-3]*src[size-2,size-1]
 313 C and src[size-2]*src[size-1].  If size is 4 then it's only these corner
 314 C cases that need to be done.
 315 C
 316 C The unrolled code is the same as in mpn_addmul_1, see that routine for
 317 C some comments.
 318 C
 319 C VAR_COUNTER is the outer loop, running from -size+4 to -1, inclusive.
 320 C
 321 C VAR_JMP is the computed jump into the unrolled code, stepped by one code
 322 C chunk each outer loop.
 323 C
 324 C K7 does branch prediction on indirect jumps, which is bad since it's a
 325 C different target each time.  There seems no way to avoid this.
 326
 327 dnl  This value also hard coded in some shifts and adds
 328 deflit(CODE_BYTES_PER_LIMB, 17)
 329
 330 dnl  With the unmodified &src[size] and &dst[size] pointers, the
 331 dnl  displacements in the unrolled code fit in a byte for UNROLL_COUNT
 332 dnl  values up to 31, but above that an offset must be added to them.
 333
 334 deflit(OFFSET,
 335 ifelse(eval(UNROLL_COUNT>31),1,
 336 eval((UNROLL_COUNT-31)*4),
 337 0))
 338
 339 dnl  Because the last chunk of code is generated differently, a label placed
 340 dnl  at the end doesn't work.  Instead calculate the implied end using the
 341 dnl  start and how many chunks of code there are.
 342
 343 deflit(UNROLL_INNER_END,
 344 `L(unroll_inner_start)+eval(UNROLL_COUNT*CODE_BYTES_PER_LIMB)')
 345
 346         C eax
 347         C ebx   carry
 348         C ecx
 349         C edx
 350         C esi   &src[size]
 351         C edi   &dst[size]
 352         C ebp
 353
 354         movl    PARAM_SIZE, %ecx
 355         movl    %ebx, (%edi)
 356
 357         subl    $4, %ecx
 358         jz      L(corner)
 359
 360         negl    %ecx
 361 ifelse(OFFSET,0,,`subl  $OFFSET, %edi')
 362 ifelse(OFFSET,0,,`subl  $OFFSET, %esi')
 363
 364         movl    %ecx, %edx
 365         shll    $4, %ecx
 366
 367 ifdef(`PIC',`
 368         call    L(pic_calc)
 369 L(here):
 370 ',`
 371         leal    UNROLL_INNER_END-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
 372 ')
 373
 374
 375         C The calculated jump mustn't come out to before the start of the
 376         C code available.  This is the limit UNROLL_COUNT puts on the src
 377         C operand size, but checked here directly using the jump address.
 378         ASSERT(ae,
 379         `movl_text_address(L(unroll_inner_start), %eax)
 380         cmpl    %eax, %ecx')
 381
 382
 383 C------------------------------------------------------------------------------
 384         ALIGN(16)
 385 L(unroll_outer_top):
 386         C eax
 387         C ebx   high limb to store
 388         C ecx   VAR_JMP
 389         C edx   VAR_COUNTER, limbs, negative
 390         C esi   &src[size], constant
 391         C edi   dst ptr, high of last addmul
 392         C ebp
 393
 394         movl    -12+OFFSET(%esi,%edx,4), %ebp   C next multiplier
 395         movl    -8+OFFSET(%esi,%edx,4), %eax    C first of multiplicand
 396
 397         movl    %edx, VAR_COUNTER
 398
 399         mull    %ebp
 400
 401 define(cmovX,`ifelse(eval(UNROLL_COUNT%2),0,`cmovz($@)',`cmovnz($@)')')
 402
 403         testb   $1, %cl
 404         movl    %edx, %ebx      C high carry
 405         movl    %ecx, %edx      C jump
 406
 407         movl    %eax, %ecx      C low carry
 408         cmovX(  %ebx, %ecx)     C high carry reverse
 409         cmovX(  %eax, %ebx)     C low carry reverse
 410
 411         leal    CODE_BYTES_PER_LIMB(%edx), %eax
 412         xorl    %edx, %edx
 413         leal    4(%edi), %edi
 414
 415         movl    %eax, VAR_JMP
 416
 417         jmp     *%eax
 418
 419
 420 ifdef(`PIC',`
 421 L(pic_calc):
 422         addl    (%esp), %ecx
 423         addl    $UNROLL_INNER_END-eval(2*CODE_BYTES_PER_LIMB)-L(here), %ecx
 424         addl    %edx, %ecx
 425         ret_internal
 426 ')
 427
 428
 429         C Must be an even address to preserve the significance of the low
 430         C bit of the jump address indicating which way around ecx/ebx should
 431         C start.
 432         ALIGN(2)
 433
 434 L(unroll_inner_start):
 435         C eax   next limb
 436         C ebx   carry high
 437         C ecx   carry low
 438         C edx   scratch
 439         C esi   src
 440         C edi   dst
 441         C ebp   multiplier
 442
 443 forloop(`i', UNROLL_COUNT, 1, `
 444         deflit(`disp_src', eval(-i*4 + OFFSET))
 445         deflit(`disp_dst', eval(disp_src - 4))
 446
 447         m4_assert(`disp_src>=-128 && disp_src<128')
 448         m4_assert(`disp_dst>=-128 && disp_dst<128')
 449
 450 ifelse(eval(i%2),0,`
 451 Zdisp(  movl,   disp_src,(%esi), %eax)
 452         adcl    %edx, %ebx
 453
 454         mull    %ebp
 455
 456 Zdisp(  addl,   %ecx, disp_dst,(%edi))
 457         movl    $0, %ecx
 458
 459         adcl    %eax, %ebx
 460
 461 ',`
 462         dnl  this bit comes out last
 463 Zdisp(  movl,   disp_src,(%esi), %eax)
 464         adcl    %edx, %ecx
 465
 466         mull    %ebp
 467
 468 Zdisp(  addl,   %ebx, disp_dst,(%edi))
 469
 470 ifelse(forloop_last,0,
 471 `       movl    $0, %ebx')
 472
 473         adcl    %eax, %ecx
 474 ')
 475 ')
 476
 477         C eax   next limb
 478         C ebx   carry high
 479         C ecx   carry low
 480         C edx   scratch
 481         C esi   src
 482         C edi   dst
 483         C ebp   multiplier
 484
 485         adcl    $0, %edx
 486         addl    %ecx, -4+OFFSET(%edi)
 487         movl    VAR_JMP, %ecx
 488
 489         adcl    $0, %edx
 490
 491         movl    %edx, m4_empty_if_zero(OFFSET) (%edi)
 492         movl    VAR_COUNTER, %edx
 493
 494         incl    %edx
 495         jnz     L(unroll_outer_top)
 496
 497
 498 ifelse(OFFSET,0,,`
 499         addl    $OFFSET, %esi
 500         addl    $OFFSET, %edi
 501 ')
 502
 503
 504 C------------------------------------------------------------------------------
 505 L(corner):
 506         C esi   &src[size]
 507         C edi   &dst[2*size-5]
 508
 509         movl    -12(%esi), %ebp
 510         movl    -8(%esi), %eax
 511         movl    %eax, %ecx
 512
 513         mull    %ebp
 514
 515         addl    %eax, -4(%edi)
 516         movl    -4(%esi), %eax
 517
 518         adcl    $0, %edx
 519         movl    %edx, %ebx
 520         movl    %eax, %esi
 521
 522         mull    %ebp
 523
 524         addl    %ebx, %eax
 525
 526         adcl    $0, %edx
 527         addl    %eax, (%edi)
 528         movl    %esi, %eax
 529
 530         adcl    $0, %edx
 531         movl    %edx, %ebx
 532
 533         mull    %ecx
 534
 535         addl    %ebx, %eax
 536         movl    %eax, 4(%edi)
 537
 538         adcl    $0, %edx
 539         movl    %edx, 8(%edi)
 540
 541
 542
 543 C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
 544
 545 L(lshift_start):
 546         movl    PARAM_SIZE, %eax
 547         movl    PARAM_DST, %edi
 548         xorl    %ecx, %ecx              C clear carry
 549
 550         leal    (%edi,%eax,8), %edi
 551         notl    %eax                    C -size-1, preserve carry
 552
 553         leal    2(%eax), %eax           C -(size-1)
 554
 555 L(lshift):
 556         C eax   counter, negative
 557         C ebx
 558         C ecx
 559         C edx
 560         C esi
 561         C edi   dst, pointing just after last limb
 562         C ebp
 563
 564         rcll    -4(%edi,%eax,8)
 565         rcll    (%edi,%eax,8)
 566         incl    %eax
 567         jnz     L(lshift)
 568
 569         setc    %al
 570
 571         movl    PARAM_SRC, %esi
 572         movl    %eax, -4(%edi)          C dst most significant limb
 573
 574         movl    PARAM_SIZE, %ecx
 575
 576
 577 C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
 578 C src[size-1]^2.  dst[0] hasn't yet been set at all yet, and just gets the
 579 C low limb of src[0]^2.
 580
 581         movl    (%esi), %eax            C src[0]
 582
 583         mull    %eax
 584
 585         leal    (%esi,%ecx,4), %esi     C src point just after last limb
 586         negl    %ecx
 587
 588         movl    %eax, (%edi,%ecx,8)     C dst[0]
 589         incl    %ecx
 590
 591 L(diag):
 592         C eax   scratch
 593         C ebx   scratch
 594         C ecx   counter, negative
 595         C edx   carry
 596         C esi   src just after last limb
 597         C edi   dst just after last limb
 598         C ebp
 599
 600         movl    (%esi,%ecx,4), %eax
 601         movl    %edx, %ebx
 602
 603         mull    %eax
 604
 605         addl    %ebx, -4(%edi,%ecx,8)
 606         adcl    %eax, (%edi,%ecx,8)
 607         adcl    $0, %edx
 608
 609         incl    %ecx
 610         jnz     L(diag)
 611
 612
 613         movl    SAVE_ESI, %esi
 614         movl    SAVE_EBX, %ebx
 615
 616         addl    %edx, -4(%edi)          C dst most significant limb
 617         movl    SAVE_EDI, %edi
 618
 619         movl    SAVE_EBP, %ebp
 620         addl    $FRAME, %esp
 621
 622         ret
 623
 624 EPILOGUE()