mpn/x86/k7/mul_basecase.asm

   1 dnl  AMD K7 mpn_mul_basecase -- multiply two mpn numbers.
   2
   3 dnl  Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
   4 dnl
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or
   8 dnl  modify it under the terms of the GNU Lesser General Public License as
   9 dnl  published by the Free Software Foundation; either version 3 of the
  10 dnl  License, or (at your option) any later version.
  11 dnl
  12 dnl  The GNU MP Library is distributed in the hope that it will be useful,
  13 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 dnl  Lesser General Public License for more details.
  16 dnl
  17 dnl  You should have received a copy of the GNU Lesser General Public License
  18 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
  19
  20 include(`../config.m4')
  21
  22
  23 C K7: approx 4.42 cycles per cross product at around 20x20 limbs (32
  24 C     limbs/loop unrolling).
  25
  26
  27
  28 dnl  K7 UNROLL_COUNT cycles/product (at around 20x20)
  29 dnl           8           4.67
  30 dnl          16           4.59
  31 dnl          32           4.42
  32 dnl  Maximum possible with the current code is 32.
  33 dnl
  34 dnl  At 32 the typical 13-26 limb sizes from the Karatsuba code will get
  35 dnl  done with a straight run through a block of code, no inner loop.  Using
  36 dnl  32 gives 1k of code, but the K7 has a 64k L1 code cache.
  37 dnl  UNROLL_COUNT is the number of limbs handled per pass of L(unroll_top).
  38 deflit(UNROLL_COUNT, 32)
  39
  40
  41 C void mpn_mul_basecase (mp_ptr wp,
  42 C                        mp_srcptr xp, mp_size_t xsize,
  43 C                        mp_srcptr yp, mp_size_t ysize);
  44 C
  45 C Calculate xp,xsize multiplied by yp,ysize, storing the result in
  46 C wp,xsize+ysize.
  47 C
  48 C This routine is essentially the same as mpn/generic/mul_basecase.c, but
  49 C it's faster because it does most of the mpn_addmul_1() startup
  50 C calculations only once.  The saving is 15-25% on typical sizes coming from
  51 C the Karatsuba multiply code.
  52
  53 ifdef(`PIC',`
  54 deflit(UNROLL_THRESHOLD, 5)
  55 ',`
  56 deflit(UNROLL_THRESHOLD, 5)
  57 ')
  58 dnl  xsize >= UNROLL_THRESHOLD selects L(unroll).  Arguments, prototype order:
  59 defframe(PARAM_WP,   4)         C wp, destination of the product
  60 defframe(PARAM_XP,   8)         C xp, first source operand
  61 defframe(PARAM_XSIZE,12)        C xsize, limb count of xp
  62 defframe(PARAM_YP,   16)        C yp, second source operand
  63 defframe(PARAM_YSIZE,20)        C ysize, limb count of yp
  64
  65         TEXT
  66         ALIGN(32)
  67 PROLOGUE(mpn_mul_basecase)
  68 deflit(`FRAME',0)
  69         C Entry: fetch xsize, xp and yp[0], then dispatch on xsize.
  70         movl    PARAM_XSIZE, %ecx       C ecx = xsize
  71         movl    PARAM_YP, %eax          C eax = yp
  72
  73         movl    PARAM_XP, %edx          C edx = xp
  74         movl    (%eax), %eax    C yp low limb
  75
  76         cmpl    $2, %ecx
  77         ja      L(xsize_more_than_two)  C xsize > 2, unsigned compare
  78         je      L(two_by_something)     C xsize == 2
  79
  80
  81         C one limb by one limb
  82         C (fall through for xsize < 2; ysize is not examined on this path)
  83         mull    (%edx)                  C edx:eax = xp[0] * yp[0]
  84
  85         movl    PARAM_WP, %ecx
  86         movl    %eax, (%ecx)            C wp[0] = low limb
  87         movl    %edx, 4(%ecx)           C wp[1] = high limb
  88         ret
  89
  90
  91 C -----------------------------------------------------------------------------
  92 L(two_by_something):
  93 deflit(`FRAME',0)
  94         decl    PARAM_YSIZE     C ysize-1 in place, ZF set if ysize was 1
  95         pushl   %ebx            defframe_pushl(`SAVE_EBX')
  96         movl    %eax, %ecx      C yp low limb
  97
  98         movl    PARAM_WP, %ebx
  99         pushl   %esi            defframe_pushl(`SAVE_ESI')
 100         movl    %edx, %esi      C xp
 101
 102         movl    (%edx), %eax    C xp low limb
 103         jnz     L(two_by_two)   C ysize != 1; pushl/movl kept the decl flags
 104
 105
 106         C two limbs by one limb
 107         C (wp[0..2] = xp[0..1] * yp[0])
 108         mull    %ecx            C xp[0] * yp[0]
 109
 110         movl    %eax, (%ebx)    C wp[0]
 111         movl    4(%esi), %eax   C xp[1]
 112         movl    %edx, %esi      C carry
 113
 114         mull    %ecx            C xp[1] * yp[0]
 115
 116         addl    %eax, %esi      C low limb plus carry, sets CF
 117
 118         movl    %esi, 4(%ebx)   C wp[1]
 119         movl    SAVE_ESI, %esi  C restore; movl leaves CF for the adcl
 120
 121         adcl    $0, %edx        C fold CF into the high limb
 122
 123         movl    %edx, 8(%ebx)   C wp[2]
 124         movl    SAVE_EBX, %ebx
 125         addl    $FRAME, %esp    C drop the two pushed registers
 126
 127         ret
 128
 129
 130
 131 C -----------------------------------------------------------------------------
 132 C Could load yp earlier into another register.
 133
 134         ALIGN(16)
 135 L(two_by_two):
 136         C eax   xp low limb
 137         C ebx   wp
 138         C ecx   yp low limb
 139         C edx
 140         C esi   xp
 141         C edi
 142         C ebp
 143         C Reached from L(two_by_something) with ebx and esi already pushed.
 144 dnl  FRAME carries on from previous
 145
 146         mull    %ecx            C xp[0] * yp[0]
 147
 148         push    %edi            defframe_pushl(`SAVE_EDI')
 149         movl    %edx, %edi      C carry, for wp[1]
 150
 151         movl    %eax, (%ebx)    C wp[0] is final
 152         movl    4(%esi), %eax
 153
 154         mull    %ecx            C xp[1] * yp[0]
 155
 156         addl    %eax, %edi      C sets CF for the adcl two lines down
 157         movl    PARAM_YP, %ecx  C movl leaves CF alone
 158
 159         adcl    $0, %edx
 160         movl    4(%ecx), %ecx   C yp[1]
 161         movl    %edi, 4(%ebx)   C wp[1], partial until xp[0]*yp[1] is added
 162
 163         movl    4(%esi), %eax   C xp[1]
 164         movl    %edx, %edi      C carry, for wp[2]
 165
 166         mull    %ecx            C xp[1] * yp[1]
 167
 168         addl    %eax, %edi
 169
 170         adcl    $0, %edx
 171         movl    (%esi), %eax    C xp[0]
 172
 173         movl    %edx, %esi      C carry, for wp[3]
 174
 175         mull    %ecx            C xp[0] * yp[1]
 176
 177         addl    %eax, 4(%ebx)   C finish wp[1]
 178         adcl    %edx, %edi      C wp[2] value, with carry from wp[1]
 179         movl    %edi, 8(%ebx)   C store keeps CF for the next adcl
 180
 181         adcl    $0, %esi        C wp[3] value, with carry from wp[2]
 182         movl    SAVE_EDI, %edi
 183         movl    %esi, 12(%ebx)
 184
 185         movl    SAVE_ESI, %esi
 186         movl    SAVE_EBX, %ebx
 187         addl    $FRAME, %esp    C drop the three pushed registers
 188
 189         ret
 190
 191
 192 C -----------------------------------------------------------------------------
 193         ALIGN(16)
 194 L(xsize_more_than_two):
 195
 196 C The first limb of yp is processed with a simple mpn_mul_1 style loop
 197 C inline.  Unrolling this doesn't seem worthwhile since it's only run once
 198 C (whereas the addmul below is run ysize-1 many times).  A call to the
 199 C actual mpn_mul_1 will be slowed down by the call and parameter pushing and
 200 C popping, and doesn't seem likely to be worthwhile on the typical 13-26
 201 C limb operations the Karatsuba code calls here with.
 202
 203         C eax   yp[0]
 204         C ebx
 205         C ecx   xsize
 206         C edx   xp
 207         C esi
 208         C edi
 209         C ebp
 210
 211 dnl  FRAME doesn't carry on from previous, no pushes yet here
 212 defframe(`SAVE_EBX',-4)
 213 defframe(`SAVE_ESI',-8)
 214 defframe(`SAVE_EDI',-12)
 215 defframe(`SAVE_EBP',-16)
 216 deflit(`FRAME',0)
 217
 218         subl    $16, %esp               C room for the four SAVE_ slots
 219 deflit(`FRAME',16)
 220
 221         movl    %edi, SAVE_EDI
 222         movl    PARAM_WP, %edi
 223
 224         movl    %ebx, SAVE_EBX
 225         movl    %ebp, SAVE_EBP
 226         movl    %eax, %ebp              C multiplier yp[0]
 227
 228         movl    %esi, SAVE_ESI
 229         xorl    %ebx, %ebx              C initial carry limb
 230         leal    (%edx,%ecx,4), %esi     C xp end
 231
 232         leal    (%edi,%ecx,4), %edi     C wp end of mul1
 233         negl    %ecx                    C -xsize, counts up to zero
 234
 235
 236 L(mul1):
 237         C eax   scratch
 238         C ebx   carry
 239         C ecx   counter, negative
 240         C edx   scratch
 241         C esi   xp end
 242         C edi   wp end of mul1
 243         C ebp   multiplier
 244
 245         movl    (%esi,%ecx,4), %eax     C next xp limb
 246
 247         mull    %ebp
 248
 249         addl    %ebx, %eax              C low limb plus carry in, sets CF
 250         movl    %eax, (%edi,%ecx,4)
 251         movl    $0, %ebx                C movl, not xorl, so CF survives
 252
 253         adcl    %edx, %ebx              C carry out = high limb + CF
 254         incl    %ecx
 255         jnz     L(mul1)
 256
 257
 258         movl    PARAM_YSIZE, %edx
 259         movl    PARAM_XSIZE, %ecx
 260
 261         movl    %ebx, (%edi)            C final carry
 262         decl    %edx                    C ysize-1, ZF set if ysize is 1
 263
 264         jnz     L(ysize_more_than_one)
 265
 266         C ysize is 1, the mul1 pass was the whole product
 267         movl    SAVE_EDI, %edi
 268         movl    SAVE_EBX, %ebx
 269
 270         movl    SAVE_EBP, %ebp
 271         movl    SAVE_ESI, %esi
 272         addl    $FRAME, %esp
 273
 274         ret
 275
 276
 277 L(ysize_more_than_one):
 278         cmpl    $UNROLL_THRESHOLD, %ecx
 279         movl    PARAM_YP, %eax          C movl keeps the cmpl flags
 280
 281         jae     L(unroll)               C xsize >= UNROLL_THRESHOLD, unsigned
 282
 283
 284 C -----------------------------------------------------------------------------
 285         C simple addmul looping
 286         C
 287         C eax   yp
 288         C ebx
 289         C ecx   xsize
 290         C edx   ysize-1
 291         C esi   xp end
 292         C edi   wp end of mul1
 293         C ebp
 294
 295         leal    4(%eax,%edx,4), %ebp    C yp end
 296         negl    %ecx                    C -xsize
 297         negl    %edx
 298
 299         movl    (%esi,%ecx,4), %eax     C xp low limb
 300         movl    %edx, PARAM_YSIZE       C -(ysize-1)
 301         incl    %ecx
 302
 303         xorl    %ebx, %ebx              C initial carry
 304         movl    %ecx, PARAM_XSIZE       C -(xsize-1)
 305         movl    %ebp, PARAM_YP          C yp end, indexed by the negative counter
 306
 307         movl    (%ebp,%edx,4), %ebp     C yp second lowest limb - multiplier
 308         jmp     L(simple_outer_entry)
 309
 310
 311         C this is offset 0x121 so close enough to aligned
 312 L(simple_outer_top):
 313         C ebp   ysize counter, negative
 314
 315         movl    PARAM_YP, %edx
 316         movl    PARAM_XSIZE, %ecx       C -(xsize-1)
 317         xorl    %ebx, %ebx              C carry
 318
 319         movl    %ebp, PARAM_YSIZE
 320         addl    $4, %edi                C next position in wp
 321
 322         movl    (%edx,%ebp,4), %ebp     C yp limb - multiplier
 323         movl    -4(%esi,%ecx,4), %eax   C xp low limb
 324
 325
 326 L(simple_outer_entry):
 327
 328 L(simple_inner):
 329         C eax   xp limb
 330         C ebx   carry limb
 331         C ecx   loop counter (negative)
 332         C edx   scratch
 333         C esi   xp end
 334         C edi   wp end
 335         C ebp   multiplier
 336
 337         mull    %ebp
 338
 339         addl    %eax, %ebx              C low limb plus carry in
 340         adcl    $0, %edx
 341
 342         addl    %ebx, (%edi,%ecx,4)     C accumulate into wp
 343         movl    (%esi,%ecx,4), %eax     C fetch next xp limb, CF untouched
 344         adcl    $0, %edx
 345
 346         incl    %ecx
 347         movl    %edx, %ebx              C carry out; movl keeps ZF from incl
 348         jnz     L(simple_inner)
 349
 350         C last xp limb: its product is added at (%edi), high limb at 4(%edi)
 351         mull    %ebp
 352
 353         movl    PARAM_YSIZE, %ebp
 354         addl    %eax, %ebx
 355
 356         adcl    $0, %edx
 357         addl    %ebx, (%edi)
 358
 359         adcl    $0, %edx
 360         incl    %ebp                    C ysize counter, ZF set when done
 361
 362         movl    %edx, 4(%edi)           C store keeps ZF for the jnz
 363         jnz     L(simple_outer_top)
 364
 365
 366         movl    SAVE_EBX, %ebx
 367         movl    SAVE_ESI, %esi
 368
 369         movl    SAVE_EDI, %edi
 370         movl    SAVE_EBP, %ebp
 371         addl    $FRAME, %esp
 372
 373         ret
 374
 375
 376
 377 C -----------------------------------------------------------------------------
 378 C
 379 C The unrolled loop is the same as in mpn_addmul_1(), see that code for some
 380 C comments.
 381 C
 382 C VAR_ADJUST is the negative of how many limbs the leals in the inner loop
 383 C increment xp and wp.  This is used to adjust back xp and wp, and is
 384 C right-shifted to give an initial VAR_COUNTER at the top of the outer loop.
 385 C
 386 C VAR_COUNTER is for the unrolled loop, running from VAR_ADJUST/UNROLL_COUNT
 387 C up to -1, inclusive.
 388 C
 389 C VAR_JMP is the computed jump into the unrolled loop.
 390 C
 391 C VAR_XP_LOW is the least significant limb of xp, which is needed at the
 392 C start of the unrolled loop.
 393 C
 394 C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
 395 C inclusive.
 396 C
 397 C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
 398 C added to give the location of the next limb of yp, which is the multiplier
 399 C in the unrolled loop.
 400 C
 401 C The trick with VAR_ADJUST means it's only necessary to do one fetch in the
 402 C outer loop to take care of xp, wp and the inner loop counter.
 403
 404 defframe(VAR_COUNTER,  -20)
 405 defframe(VAR_ADJUST,   -24)
 406 defframe(VAR_JMP,      -28)
 407 defframe(VAR_XP_LOW,   -32)
 408 deflit(VAR_EXTRA_SPACE, 16)
 409         C VAR_EXTRA_SPACE is the four 4-byte VAR_ slots below the SAVE_ area.
 410
 411 L(unroll):
 412         C eax   yp
 413         C ebx
 414         C ecx   xsize
 415         C edx   ysize-1
 416         C esi   xp end
 417         C edi   wp end of mul1
 418         C ebp
 419         C (esi and edi are reloaded from the parameters just below)
 420         movl    PARAM_XP, %esi
 421         movl    4(%eax), %ebp           C multiplier (yp second limb)
 422         leal    4(%eax,%edx,4), %eax    C yp adjust for ysize indexing
 423
 424         movl    PARAM_WP, %edi
 425         movl    %eax, PARAM_YP
 426         negl    %edx
 427
 428         movl    %edx, PARAM_YSIZE       C -(ysize-1), outer loop counter
 429         leal    UNROLL_COUNT-2(%ecx), %ebx      C (xsize-1)+UNROLL_COUNT-1
 430         decl    %ecx                            C xsize-1
 431
 432         movl    (%esi), %eax            C xp low limb
 433         andl    $-UNROLL_MASK-1, %ebx   C presumably clears the low UNROLL_LOG2 bits; UNROLL_MASK is defined outside this file - TODO confirm
 434         negl    %ecx                    C -(xsize-1)
 435
 436         subl    $VAR_EXTRA_SPACE, %esp
 437 deflit(`FRAME',16+VAR_EXTRA_SPACE)
 438         negl    %ebx
 439         andl    $UNROLL_MASK, %ecx      C limb slots to skip at the loop entry
 440
 441         movl    %ebx, VAR_ADJUST
 442         movl    %ecx, %edx              C keep the skip count for the leals below
 443         shll    $4, %ecx                C 16*skip, the leal or pic_calc adds 1*skip
 444
 445         sarl    $UNROLL_LOG2, %ebx      C first VAR_COUNTER value, negative
 446
 447         C 17 code bytes per limb
 448 ifdef(`PIC',`
 449         call    L(pic_calc)
 450 L(unroll_here):
 451 ',`
 452         leal    L(unroll_entry) (%ecx,%edx,1), %ecx
 453 ')
 454         negl    %edx                    C -skip
 455
 456         movl    %eax, VAR_XP_LOW
 457         movl    %ecx, VAR_JMP           C entry address inside the unrolled code
 458         leal    4(%edi,%edx,4), %edi    C wp and xp, adjust for unrolling,
 459         leal    4(%esi,%edx,4), %esi    C  and start at second limb
 460         jmp     L(unroll_outer_entry)
 461
 462
 463 ifdef(`PIC',`
 464 L(pic_calc):
 465         C See mpn/x86/README about old gas bugs
 466         leal    (%ecx,%edx,1), %ecx
 467         addl    $L(unroll_entry)-L(unroll_here), %ecx
 468         addl    (%esp), %ecx            C add the return address pushed by the call
 469         ret_internal
 470 ')
 471
 472
 473 C --------------------------------------------------------------------------
 474         ALIGN(32)
 475 L(unroll_outer_top):
 476         C ebp   ysize counter, negative
 477         C Rewind wp and xp by VAR_ADJUST limbs, wp landing one limb further on.
 478         movl    VAR_ADJUST, %ebx
 479         movl    PARAM_YP, %edx
 480
 481         movl    VAR_XP_LOW, %eax
 482         movl    %ebp, PARAM_YSIZE       C store incremented ysize counter
 483
 484         leal    4(%edi,%ebx,4), %edi
 485         leal    (%esi,%ebx,4), %esi
 486         sarl    $UNROLL_LOG2, %ebx      C VAR_COUNTER start value for this pass
 487
 488         movl    (%edx,%ebp,4), %ebp     C yp next multiplier
 489         movl    VAR_JMP, %ecx           C entry address, only bit 0 is used here
 490
 491 L(unroll_outer_entry):
 492         mull    %ebp                    C xp low limb times multiplier
 493
 494         testb   $1, %cl         C and clear carry bit
 495         movl    %ebx, VAR_COUNTER
 496         movl    $0, %ebx                C movl, not xorl, so the testb flags survive
 497
 498         movl    $0, %ecx
 499         cmovz(  %eax, %ecx)     C eax into low carry, zero into high carry limb
 500         cmovnz( %eax, %ebx)     C entry address bit 0 set: eax goes into ebx instead
 501
 502         C Extra fetch of VAR_JMP is bad, but registers are tight
 503         jmp     *VAR_JMP
 504
 505
 506 C -----------------------------------------------------------------------------
 507         ALIGN(32)
 508 L(unroll_top):
 509         C eax   xp limb
 510         C ebx   carry high
 511         C ecx   carry low
 512         C edx   scratch
 513         C esi   xp+8
 514         C edi   wp
 515         C ebp   yp multiplier limb
 516         C
 517         C VAR_COUNTER  loop counter, negative
 518         C
 519         C 17 bytes each limb
 520         C (VAR_JMP is L(unroll_entry) plus 17 times the number of limbs skipped)
 521 L(unroll_entry):
 522         C Each forloop pass emits two limbs, with ecx and ebx swapping roles.
 523 deflit(CHUNK_COUNT,2)
 524 forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
 525         deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
 526         deflit(`disp1', eval(disp0 + 4))
 527
 528 Zdisp(  movl,   disp0,(%esi), %eax)
 529         adcl    %edx, %ebx
 530
 531         mull    %ebp
 532
 533 Zdisp(  addl,   %ecx, disp0,(%edi))
 534         movl    $0, %ecx
 535
 536         adcl    %eax, %ebx
 537
 538
 539         movl    disp1(%esi), %eax
 540         adcl    %edx, %ecx
 541
 542         mull    %ebp
 543
 544         addl    %ebx, disp1(%edi)
 545         movl    $0, %ebx
 546
 547         adcl    %eax, %ecx
 548 ')
 549
 550
 551         incl    VAR_COUNTER             C sets ZF, leaves CF from the last adcl
 552         leal    UNROLL_BYTES(%esi), %esi        C leal advances without touching flags
 553         leal    UNROLL_BYTES(%edi), %edi
 554
 555         jnz     L(unroll_top)
 556
 557
 558         C eax
 559         C ebx   zero
 560         C ecx   low
 561         C edx   high
 562         C esi
 563         C edi   wp, pointing at second last limb
 564         C ebp
 565         C
 566         C carry flag to be added to high
 567
 568 deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
 569 deflit(`disp1', eval(disp0-0 + 4))
 570
 571         movl    PARAM_YSIZE, %ebp       C outer counter; movl keeps CF
 572         adcl    $0, %edx                C add the pending carry flag to high
 573         addl    %ecx, disp0(%edi)       C add low into the second last limb
 574
 575         adcl    $0, %edx
 576         incl    %ebp                    C ZF set once the last yp limb is done
 577
 578         movl    %edx, disp1(%edi)       C store high; movl keeps ZF for the jnz
 579         jnz     L(unroll_outer_top)
 580
 581
 582         movl    SAVE_ESI, %esi
 583         movl    SAVE_EBP, %ebp
 584
 585         movl    SAVE_EDI, %edi
 586         movl    SAVE_EBX, %ebx
 587         addl    $FRAME, %esp            C release the SAVE_ and VAR_ slots
 588
 589         ret
 590
 591 EPILOGUE()