mpn/x86/k6/mul_basecase.asm

   1 dnl  AMD K6 mpn_mul_basecase -- multiply two mpn numbers.
   2
   3 dnl  Copyright 1999, 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
   4 dnl
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or
   8 dnl  modify it under the terms of the GNU Lesser General Public License as
   9 dnl  published by the Free Software Foundation; either version 3 of the
  10 dnl  License, or (at your option) any later version.
  11 dnl
  12 dnl  The GNU MP Library is distributed in the hope that it will be useful,
  13 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 dnl  Lesser General Public License for more details.
  16 dnl
  17 dnl  You should have received a copy of the GNU Lesser General Public License
  18 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
  19
  20 include(`../config.m4')
  21
  22
  23 C K6: approx 9.0 cycles per cross product on 30x30 limbs (with 16 limbs/loop
  24 C     unrolling).
  25
  26
  27
  28 dnl  K6: UNROLL_COUNT cycles/product (approx)
  29 dnl           8           9.75
  30 dnl          16           9.3
  31 dnl          32           9.3
  32 dnl  Maximum possible with the current code is 32.
  33 dnl
  34 dnl  With 16 the inner unrolled loop fits exactly in a 256 byte block, which
  35 dnl  might explain it's good performance.
  36
  37 deflit(UNROLL_COUNT, 16)
  38
  39
  40 C void mpn_mul_basecase (mp_ptr wp,
  41 C                        mp_srcptr xp, mp_size_t xsize,
  42 C                        mp_srcptr yp, mp_size_t ysize);
  43 C
  44 C Calculate xp,xsize multiplied by yp,ysize, storing the result in
  45 C wp,xsize+ysize.
  46 C
  47 C This routine is essentially the same as mpn/generic/mul_basecase.c, but
  48 C it's faster because it does most of the mpn_addmul_1() entry code only
  49 C once.  The saving is about 10-20% on typical sizes coming from the
  50 C Karatsuba multiply code.
  51 C
  52 C Enhancements:
  53 C
  54 C The mul_1 loop is about 8.5 c/l, which is slower than mpn_mul_1 at 6.25
  55 C c/l.  Could call mpn_mul_1 when ysize is big enough to make it worthwhile.
  56 C
  57 C The main unrolled addmul loop could be shared by mpn_addmul_1, using some
  58 C extra stack setups and maybe 2 or 3 wasted cycles at the end.  Code saving
  59 C would be 256 bytes.
  60
  61 ifdef(`PIC',`
  62 deflit(UNROLL_THRESHOLD, 8)
  63 ',`
  64 deflit(UNROLL_THRESHOLD, 8)
  65 ')
  66
  67 defframe(PARAM_YSIZE,20)
  68 defframe(PARAM_YP,   16)
  69 defframe(PARAM_XSIZE,12)
  70 defframe(PARAM_XP,   8)
  71 defframe(PARAM_WP,   4)
  72
  73         TEXT
  74         ALIGN(32)
  75 PROLOGUE(mpn_mul_basecase)
  76 deflit(`FRAME',0)
  77
  78         movl    PARAM_XSIZE, %ecx
  79         movl    PARAM_YP, %eax
  80
  81         movl    PARAM_XP, %edx
  82         movl    (%eax), %eax    C yp low limb
  83
  84         cmpl    $2, %ecx
  85         ja      L(xsize_more_than_two_limbs)
  86         je      L(two_by_something)
  87
  88
  89         C one limb by one limb
  90
  91         movl    (%edx), %edx    C xp low limb
  92         movl    PARAM_WP, %ecx
  93
  94         mull    %edx
  95
  96         movl    %eax, (%ecx)
  97         movl    %edx, 4(%ecx)
  98         ret
  99
 100
 101 C -----------------------------------------------------------------------------
 102 L(two_by_something):
 103         decl    PARAM_YSIZE
 104         pushl   %ebx
 105 deflit(`FRAME',4)
 106
 107         movl    PARAM_WP, %ebx
 108         pushl   %esi
 109 deflit(`FRAME',8)
 110
 111         movl    %eax, %ecx      C yp low limb
 112         movl    (%edx), %eax    C xp low limb
 113
 114         movl    %edx, %esi      C xp
 115         jnz     L(two_by_two)
 116
 117
 118         C two limbs by one limb
 119
 120         mull    %ecx
 121
 122         movl    %eax, (%ebx)
 123         movl    4(%esi), %eax
 124
 125         movl    %edx, %esi      C carry
 126
 127         mull    %ecx
 128
 129         addl    %eax, %esi
 130         movl    %esi, 4(%ebx)
 131
 132         adcl    $0, %edx
 133
 134         movl    %edx, 8(%ebx)
 135         popl    %esi
 136
 137         popl    %ebx
 138         ret
 139
 140
 141
 142 C -----------------------------------------------------------------------------
 143         ALIGN(16)
 144 L(two_by_two):
 145         C eax   xp low limb
 146         C ebx   wp
 147         C ecx   yp low limb
 148         C edx
 149         C esi   xp
 150         C edi
 151         C ebp
 152 deflit(`FRAME',8)
 153
 154         mull    %ecx            C xp[0] * yp[0]
 155
 156         push    %edi
 157 deflit(`FRAME',12)
 158         movl    %eax, (%ebx)
 159
 160         movl    4(%esi), %eax
 161         movl    %edx, %edi      C carry, for wp[1]
 162
 163         mull    %ecx            C xp[1] * yp[0]
 164
 165         addl    %eax, %edi
 166         movl    PARAM_YP, %ecx
 167
 168         adcl    $0, %edx
 169
 170         movl    %edi, 4(%ebx)
 171         movl    4(%ecx), %ecx   C yp[1]
 172
 173         movl    4(%esi), %eax   C xp[1]
 174         movl    %edx, %edi      C carry, for wp[2]
 175
 176         mull    %ecx            C xp[1] * yp[1]
 177
 178         addl    %eax, %edi
 179
 180         adcl    $0, %edx
 181
 182         movl    (%esi), %eax    C xp[0]
 183         movl    %edx, %esi      C carry, for wp[3]
 184
 185         mull    %ecx            C xp[0] * yp[1]
 186
 187         addl    %eax, 4(%ebx)
 188         adcl    %edx, %edi
 189         adcl    $0, %esi
 190
 191         movl    %edi, 8(%ebx)
 192         popl    %edi
 193
 194         movl    %esi, 12(%ebx)
 195         popl    %esi
 196
 197         popl    %ebx
 198         ret
 199
 200
 201 C -----------------------------------------------------------------------------
 202         ALIGN(16)
 203 L(xsize_more_than_two_limbs):
 204
 205 C The first limb of yp is processed with a simple mpn_mul_1 style loop
 206 C inline.  Unrolling this doesn't seem worthwhile since it's only run once
 207 C (whereas the addmul below is run ysize-1 many times).  A call to the
 208 C actual mpn_mul_1 will be slowed down by the call and parameter pushing and
 209 C popping, and doesn't seem likely to be worthwhile on the typical 10-20
 210 C limb operations the Karatsuba code calls here with.
 211
 212         C eax   yp[0]
 213         C ebx
 214         C ecx   xsize
 215         C edx   xp
 216         C esi
 217         C edi
 218         C ebp
 219 deflit(`FRAME',0)
 220
 221         pushl   %edi            defframe_pushl(SAVE_EDI)
 222         pushl   %ebp            defframe_pushl(SAVE_EBP)
 223
 224         movl    PARAM_WP, %edi
 225         pushl   %esi            defframe_pushl(SAVE_ESI)
 226
 227         movl    %eax, %ebp
 228         pushl   %ebx            defframe_pushl(SAVE_EBX)
 229
 230         leal    (%edx,%ecx,4), %ebx     C xp end
 231         xorl    %esi, %esi
 232
 233         leal    (%edi,%ecx,4), %edi     C wp end of mul1
 234         negl    %ecx
 235
 236
 237 L(mul1):
 238         C eax   scratch
 239         C ebx   xp end
 240         C ecx   counter, negative
 241         C edx   scratch
 242         C esi   carry
 243         C edi   wp end of mul1
 244         C ebp   multiplier
 245
 246         movl    (%ebx,%ecx,4), %eax
 247
 248         mull    %ebp
 249
 250         addl    %esi, %eax
 251         movl    $0, %esi
 252
 253         adcl    %edx, %esi
 254
 255         movl    %eax, (%edi,%ecx,4)
 256         incl    %ecx
 257
 258         jnz     L(mul1)
 259
 260
 261         movl    PARAM_YSIZE, %edx
 262         movl    %esi, (%edi)            C final carry
 263
 264         movl    PARAM_XSIZE, %ecx
 265         decl    %edx
 266
 267         jnz     L(ysize_more_than_one_limb)
 268
 269         popl    %ebx
 270         popl    %esi
 271         popl    %ebp
 272         popl    %edi
 273         ret
 274
 275
 276 L(ysize_more_than_one_limb):
 277         cmpl    $UNROLL_THRESHOLD, %ecx
 278         movl    PARAM_YP, %eax
 279
 280         jae     L(unroll)
 281
 282
 283 C -----------------------------------------------------------------------------
 284 C Simple addmul loop.
 285 C
 286 C Using ebx and edi pointing at the ends of their respective locations saves
 287 C a couple of instructions in the outer loop.  The inner loop is still 11
 288 C cycles, the same as the simple loop in aorsmul_1.asm.
 289
 290         C eax   yp
 291         C ebx   xp end
 292         C ecx   xsize
 293         C edx   ysize-1
 294         C esi
 295         C edi   wp end of mul1
 296         C ebp
 297
 298         movl    4(%eax), %ebp           C multiplier
 299         negl    %ecx
 300
 301         movl    %ecx, PARAM_XSIZE       C -xsize
 302         xorl    %esi, %esi              C initial carry
 303
 304         leal    4(%eax,%edx,4), %eax    C yp end
 305         negl    %edx
 306
 307         movl    %eax, PARAM_YP
 308         movl    %edx, PARAM_YSIZE
 309
 310         jmp     L(simple_outer_entry)
 311
 312
 313         C aligning here saves a couple of cycles
 314         ALIGN(16)
 315 L(simple_outer_top):
 316         C edx   ysize counter, negative
 317
 318         movl    PARAM_YP, %eax          C yp end
 319         xorl    %esi, %esi              C carry
 320
 321         movl    PARAM_XSIZE, %ecx       C -xsize
 322         movl    %edx, PARAM_YSIZE
 323
 324         movl    (%eax,%edx,4), %ebp     C yp limb multiplier
 325 L(simple_outer_entry):
 326         addl    $4, %edi
 327
 328
 329 L(simple_inner):
 330         C eax   scratch
 331         C ebx   xp end
 332         C ecx   counter, negative
 333         C edx   scratch
 334         C esi   carry
 335         C edi   wp end of this addmul
 336         C ebp   multiplier
 337
 338         movl    (%ebx,%ecx,4), %eax
 339
 340         mull    %ebp
 341
 342         addl    %esi, %eax
 343         movl    $0, %esi
 344
 345         adcl    $0, %edx
 346         addl    %eax, (%edi,%ecx,4)
 347         adcl    %edx, %esi
 348
 349         incl    %ecx
 350         jnz     L(simple_inner)
 351
 352
 353         movl    PARAM_YSIZE, %edx
 354         movl    %esi, (%edi)
 355
 356         incl    %edx
 357         jnz     L(simple_outer_top)
 358
 359
 360         popl    %ebx
 361         popl    %esi
 362         popl    %ebp
 363         popl    %edi
 364         ret
 365
 366
 367 C -----------------------------------------------------------------------------
 368 C Unrolled loop.
 369 C
 370 C The unrolled inner loop is the same as in aorsmul_1.asm, see that code for
 371 C some comments.
 372 C
 373 C VAR_COUNTER is for the inner loop, running from VAR_COUNTER_INIT down to
 374 C 0, inclusive.
 375 C
 376 C VAR_JMP is the computed jump into the unrolled loop.
 377 C
 378 C PARAM_XP and PARAM_WP get offset appropriately for where the unrolled loop
 379 C is entered.
 380 C
 381 C VAR_XP_LOW is the least significant limb of xp, which is needed at the
 382 C start of the unrolled loop.  This can't just be fetched through the xp
 383 C pointer because of the offset applied to it.
 384 C
 385 C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
 386 C inclusive.
 387 C
 388 C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
 389 C added to give the location of the next limb of yp, which is the multiplier
 390 C in the unrolled loop.
 391 C
 392 C PARAM_WP is similarly offset so that the PARAM_YSIZE counter can be added
 393 C to give the starting point in the destination for each unrolled loop (this
 394 C point is one limb upwards for each limb of yp processed).
 395 C
 396 C Having PARAM_YSIZE count negative to zero means it's not necessary to
 397 C store new values of PARAM_YP and PARAM_WP on each loop.  Those values on
 398 C the stack remain constant and on each loop an leal adjusts them with the
 399 C PARAM_YSIZE counter value.
 400
 401
 402 defframe(VAR_COUNTER,      -20)
 403 defframe(VAR_COUNTER_INIT, -24)
 404 defframe(VAR_JMP,          -28)
 405 defframe(VAR_XP_LOW,       -32)
 406 deflit(VAR_STACK_SPACE, 16)
 407
 408 dnl  For some strange reason using (%esp) instead of 0(%esp) is a touch
 409 dnl  slower in this code, hence the defframe empty-if-zero feature is
 410 dnl  disabled.
 411 dnl
 412 dnl  If VAR_COUNTER is at (%esp), the effect is worse.  In this case the
 413 dnl  unrolled loop is 255 instead of 256 bytes, but quite how this affects
 414 dnl  anything isn't clear.
 415 dnl
 416 define(`defframe_empty_if_zero_disabled',1)
 417
 418 L(unroll):
 419         C eax   yp (not used)
 420         C ebx   xp end (not used)
 421         C ecx   xsize
 422         C edx   ysize-1
 423         C esi
 424         C edi   wp end of mul1 (not used)
 425         C ebp
 426 deflit(`FRAME', 16)
 427
 428         leal    -2(%ecx), %ebp  C one limb processed at start,
 429         decl    %ecx            C and ebp is one less
 430
 431         shrl    $UNROLL_LOG2, %ebp
 432         negl    %ecx
 433
 434         subl    $VAR_STACK_SPACE, %esp
 435 deflit(`FRAME', 16+VAR_STACK_SPACE)
 436         andl    $UNROLL_MASK, %ecx
 437
 438         movl    %ecx, %esi
 439         shll    $4, %ecx
 440
 441         movl    %ebp, VAR_COUNTER_INIT
 442         negl    %esi
 443
 444         C 15 code bytes per limb
 445 ifdef(`PIC',`
 446         call    L(pic_calc)
 447 L(unroll_here):
 448 ',`
 449         leal    L(unroll_entry) (%ecx,%esi,1), %ecx
 450 ')
 451
 452         movl    PARAM_XP, %ebx
 453         movl    %ebp, VAR_COUNTER
 454
 455         movl    PARAM_WP, %edi
 456         movl    %ecx, VAR_JMP
 457
 458         movl    (%ebx), %eax
 459         leal    4(%edi,%esi,4), %edi    C wp adjust for unrolling and mul1
 460
 461         leal    (%ebx,%esi,4), %ebx     C xp adjust for unrolling
 462
 463         movl    %eax, VAR_XP_LOW
 464
 465         movl    %ebx, PARAM_XP
 466         movl    PARAM_YP, %ebx
 467
 468         leal    (%edi,%edx,4), %ecx     C wp adjust for ysize indexing
 469         movl    4(%ebx), %ebp           C multiplier (yp second limb)
 470
 471         leal    4(%ebx,%edx,4), %ebx    C yp adjust for ysize indexing
 472
 473         movl    %ecx, PARAM_WP
 474
 475         leal    1(%esi), %ecx   C adjust parity for decl %ecx above
 476
 477         movl    %ebx, PARAM_YP
 478         negl    %edx
 479
 480         movl    %edx, PARAM_YSIZE
 481         jmp     L(unroll_outer_entry)
 482
 483
 484 ifdef(`PIC',`
 485 L(pic_calc):
 486         C See mpn/x86/README about old gas bugs
 487         leal    (%ecx,%esi,1), %ecx
 488         addl    $L(unroll_entry)-L(unroll_here), %ecx
 489         addl    (%esp), %ecx
 490         ret_internal
 491 ')
 492
 493
 494 C -----------------------------------------------------------------------------
 495         C Aligning here saves a couple of cycles per loop.  Using 32 doesn't
 496         C cost any extra space, since the inner unrolled loop below is
 497         C aligned to 32.
 498         ALIGN(32)
 499 L(unroll_outer_top):
 500         C edx   ysize
 501
 502         movl    PARAM_YP, %eax
 503         movl    %edx, PARAM_YSIZE       C incremented ysize counter
 504
 505         movl    PARAM_WP, %edi
 506
 507         movl    VAR_COUNTER_INIT, %ebx
 508         movl    (%eax,%edx,4), %ebp     C next multiplier
 509
 510         movl    PARAM_XSIZE, %ecx
 511         leal    (%edi,%edx,4), %edi     C adjust wp for where we are in yp
 512
 513         movl    VAR_XP_LOW, %eax
 514         movl    %ebx, VAR_COUNTER
 515
 516 L(unroll_outer_entry):
 517         mull    %ebp
 518
 519         C using testb is a tiny bit faster than testl
 520         testb   $1, %cl
 521
 522         movl    %eax, %ecx      C low carry
 523         movl    VAR_JMP, %eax
 524
 525         movl    %edx, %esi      C high carry
 526         movl    PARAM_XP, %ebx
 527
 528         jnz     L(unroll_noswap)
 529         movl    %ecx, %esi      C high,low carry other way around
 530
 531         movl    %edx, %ecx
 532 L(unroll_noswap):
 533
 534         jmp     *%eax
 535
 536
 537
 538 C -----------------------------------------------------------------------------
 539         ALIGN(32)
 540 L(unroll_top):
 541         C eax   scratch
 542         C ebx   xp
 543         C ecx   carry low
 544         C edx   scratch
 545         C esi   carry high
 546         C edi   wp
 547         C ebp   multiplier
 548         C VAR_COUNTER  loop counter
 549         C
 550         C 15 code bytes each limb
 551
 552         leal    UNROLL_BYTES(%edi), %edi
 553
 554 L(unroll_entry):
 555 deflit(CHUNK_COUNT,2)
 556 forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
 557         deflit(`disp0', eval(i*CHUNK_COUNT*4))
 558         deflit(`disp1', eval(disp0 + 4))
 559         deflit(`disp2', eval(disp1 + 4))
 560
 561         movl    disp1(%ebx), %eax
 562         mull    %ebp
 563 Zdisp(  addl,   %ecx, disp0,(%edi))
 564         adcl    %eax, %esi
 565         movl    %edx, %ecx
 566         jadcl0( %ecx)
 567
 568         movl    disp2(%ebx), %eax
 569         mull    %ebp
 570         addl    %esi, disp1(%edi)
 571         adcl    %eax, %ecx
 572         movl    %edx, %esi
 573         jadcl0( %esi)
 574 ')
 575
 576         decl    VAR_COUNTER
 577         leal    UNROLL_BYTES(%ebx), %ebx
 578
 579         jns     L(unroll_top)
 580
 581
 582         movl    PARAM_YSIZE, %edx
 583         addl    %ecx, UNROLL_BYTES(%edi)
 584
 585         adcl    $0, %esi
 586
 587         incl    %edx
 588         movl    %esi, UNROLL_BYTES+4(%edi)
 589
 590         jnz     L(unroll_outer_top)
 591
 592
 593         movl    SAVE_ESI, %esi
 594         movl    SAVE_EBP, %ebp
 595         movl    SAVE_EDI, %edi
 596         movl    SAVE_EBX, %ebx
 597
 598         addl    $FRAME, %esp
 599         ret
 600
 601 EPILOGUE()