1 dnl AMD K7 mpn_divrem_1, mpn_divrem_1c, mpn_preinv_divrem_1 -- mpn by limb
4 dnl Copyright 1999, 2000, 2001, 2002, 2004 Free Software Foundation, Inc.
6 dnl This file is part of the GNU MP Library.
8 dnl The GNU MP Library is free software; you can redistribute it and/or
9 dnl modify it under the terms of the GNU Lesser General Public License as
10 dnl published by the Free Software Foundation; either version 3 of the
11 dnl License, or (at your option) any later version.
13 dnl The GNU MP Library is distributed in the hope that it will be useful,
14 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
15 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 dnl Lesser General Public License for more details.
18 dnl You should have received a copy of the GNU Lesser General Public License
19 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
21 include(`../config.m4')
24 C K7: 17.0 cycles/limb integer part, 15.0 cycles/limb fraction part.
27 C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
28 C mp_srcptr src, mp_size_t size,
30 C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
31 C mp_srcptr src, mp_size_t size,
32 C mp_limb_t divisor, mp_limb_t carry);
33 C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize,
34 C mp_srcptr src, mp_size_t size,
35 C mp_limb_t divisor, mp_limb_t inverse,
40 C The method and nomenclature follow part 8 of "Division by Invariant
41 C Integers using Multiplication" by Granlund and Montgomery, reference in
44 C The "and"s shown in the paper are done here with "cmov"s. "m" is written
45 C for m', and "d" for d_norm, which won't cause any confusion since it's
46 C only the normalized divisor that's of any use in the code. "b" is written
47 C for 2^N, the size of a limb, N being 32 here.
49 C The step "sdword dr = n - 2^N*d + (2^N-1-q1) * d" is instead done as
50 C "n-(q1+1)*d"; this rearrangement gives the same two-limb answer. If
51 C q1==0xFFFFFFFF, then q1+1 would overflow. We branch to a special case
52 C "q1_ff" if this occurs. Since the true quotient is either q1 or q1+1 then
53 C if q1==0xFFFFFFFF that must be the right value.
55 C For the last and second last steps q1==0xFFFFFFFF is instead handled by an
56 C sbbl to go back to 0xFFFFFFFF if an overflow occurs when adding 1. This
57 C then goes through as normal, and finds no addback required.  sbbl costs
58 C an extra cycle over what the main loop code does, but it keeps code size
59 C and complexity down.
63 C mpn_divrem_1 and mpn_preinv_divrem_1 avoid one division if the src high
64 C limb is less than the divisor. mpn_divrem_1c doesn't check for a zero
65 C carry, since in normal circumstances that will be a very rare event.
67 C The test for skipping a division is branch free (once size>=1 is tested).
68 C The store to the destination high limb is 0 when a divide is skipped, or
69 C if it's not skipped then a copy of the src high limb is used. The latter
70 C is in case src==dst.
72 C There's a small bias towards expecting xsize==0, by having code for
73 C xsize==0 in a straight line and xsize!=0 under forward jumps.
77 C If the divisor is normalized (high bit set) then a division step can
78 C always be skipped, since the high destination limb is always 0 or 1 in
79 C that case. It doesn't seem worth checking for this though, since it
80 C probably occurs infrequently, in particular note that big_base for a
81 C decimal mpn_get_str is not normalized in a 32-bit limb.
84 dnl  MUL_THRESHOLD is the value of xsize+size at which the multiply by
85 dnl  inverse method is used, rather than plain "divl"s.  Minimum value 1.
87 dnl  The inverse takes about 50 cycles to calculate, but after that the
88 dnl  multiply is 17 c/l versus division at 42 c/l.
90 dnl  At 3 limbs the mul is a touch faster than div on the integer part, and
91 dnl  even more so on the fractional part.
93 deflit(MUL_THRESHOLD, 3)

dnl  Incoming stack parameters (positive %esp-relative offsets once FRAME is
dnl  established).  PARAM_PREINV_INVERSE and PARAM_CARRY share offset 24:
dnl  they belong to different entry points (mpn_preinv_divrem_1 takes an
dnl  extra shift argument, mpn_divrem_1c a carry), so they never coexist.
96 defframe(PARAM_PREINV_SHIFT,   28)  dnl mpn_preinv_divrem_1
97 defframe(PARAM_PREINV_INVERSE, 24)  dnl mpn_preinv_divrem_1
98 defframe(PARAM_CARRY,  24)          dnl mpn_divrem_1c
99 defframe(PARAM_DIVISOR,20)
100 defframe(PARAM_SIZE,   16)
101 defframe(PARAM_SRC,    12)
102 defframe(PARAM_XSIZE,  8)
103 defframe(PARAM_DST,    4)

dnl  Locals at negative offsets, inside the STACK_SPACE area reserved by
dnl  "subl $STACK_SPACE, %esp": saved callee-used registers first, then
dnl  working variables for the multiply-by-inverse code.
105 defframe(SAVE_EBX,    -4)
106 defframe(SAVE_ESI,    -8)
107 defframe(SAVE_EDI,    -12)
108 defframe(SAVE_EBP,    -16)
110 defframe(VAR_NORM,    -20)
111 defframe(VAR_INVERSE, -24)
112 defframe(VAR_SRC,     -28)
113 defframe(VAR_DST,     -32)
114 defframe(VAR_DST_STOP,-36)

dnl  Total bytes of locals; must cover the most negative defframe above.
116 deflit(STACK_SPACE, 36)
C -----------------------------------------------------------------------------
C mpn_preinv_divrem_1 entry: the caller supplies a pre-computed inverse
C (PARAM_PREINV_INVERSE) and normalization shift (PARAM_PREINV_SHIFT), so
C the bsrl/divl inverse setup used by the other entry points is skipped.
C One division step is avoided when the src high limb is below the divisor.
C NOTE(review): the original line numbering in this excerpt is discontinuous;
C some interleaved instructions between those shown appear to be elided.
121 PROLOGUE(mpn_preinv_divrem_1)
123 	movl	PARAM_XSIZE, %ecx
125 	subl	$STACK_SPACE, %esp	FRAME_subl_esp(STACK_SPACE)
131 	movl	PARAM_SIZE, %ebx
133 	leal	8(%edx,%ecx,4), %edx	C &dst[xsize+2]
135 	movl	PARAM_DIVISOR, %ebp
137 	movl	%edx, VAR_DST_STOP	C &dst[xsize+2]
139 	xorl	%edi, %edi		C carry
141 	movl	-4(%esi,%ebx,4), %eax	C src high limb
C Branch-free test for skipping the first division: the compare sets carry
C iff high<divisor, and the cmovs plus sbbl below all key off that one flag.
148 	cmpl	%ebp, %eax		C high cmp divisor
150 	cmovc(	%eax, %edi)		C high is carry if high<divisor
151 	cmovnc(	%eax, %ecx)		C 0 if skip div, src high if not
152 					C (the latter in case src==dst)
154 	movl	%ecx, -12(%edx,%ebx,4)	C dst high limb
155 	sbbl	$0, %ebx		C skip one division if high<divisor
156 	movl	PARAM_PREINV_SHIFT, %ecx
158 	leal	-8(%edx,%ebx,4), %edx	C &dst[xsize+size]
161 	movl	%edx, VAR_DST		C &dst[xsize+size]
163 	shll	%cl, %ebp		C d normalized
C %mm7 holds the right-shift count for un-normalizing; MMX is used so the
C shift amount survives while %ecx is reused.
167 	movd	%eax, %mm7		C rshift
168 	movl	PARAM_PREINV_INVERSE, %eax
C -----------------------------------------------------------------------------
C mpn_divrem_1c entry: like mpn_divrem_1 but takes an initial carry in
C PARAM_CARRY, loaded into %edx here in place of the $0 that the plain
C entry point uses.  No check is made for a zero carry (see header notes).
C NOTE(review): lines appear elided in this excerpt between those shown.
176 PROLOGUE(mpn_divrem_1c)
178 	movl	PARAM_CARRY, %edx
179 	movl	PARAM_SIZE, %ecx
180 	subl	$STACK_SPACE, %esp
181 deflit(`FRAME',STACK_SPACE)
184 	movl	PARAM_XSIZE, %ebx
190 	movl	PARAM_DIVISOR, %ebp
195 	leal	-4(%edi,%ebx,4), %edi	C &dst[xsize-1]
201 	C offset 0xa1, close enough to aligned
C -----------------------------------------------------------------------------
C mpn_divrem_1 entry: initial carry is 0.  If size>=1 and the src high limb
C is below the divisor, that limb becomes the carry and one division is
C skipped (branch-free via cmov/sbb).  Then xsize+size is compared against
C MUL_THRESHOLD to choose between the simple divl loops and the
C multiply-by-inverse method.
202 PROLOGUE(mpn_divrem_1)
205 	movl	PARAM_SIZE, %ecx
206 	movl	$0, %edx		C initial carry (if can't skip a div)
207 	subl	$STACK_SPACE, %esp
208 deflit(`FRAME',STACK_SPACE)
214 	movl	PARAM_XSIZE, %ebx
217 	movl	PARAM_DIVISOR, %ebp
218 	orl	%ecx, %ecx		C size
222 	leal	-4(%edi,%ebx,4), %edi	C &dst[xsize-1]
224 	jz	L(no_skip_div)		C if size==0
225 	movl	-4(%esi,%ecx,4), %eax	C src high limb
C Same branch-free skip-a-division idiom as mpn_preinv_divrem_1: one cmpl
C sets carry, then cmovc/cmovnc/sbbl consume it.
228 	cmpl	%ebp, %eax		C high cmp divisor
230 	cmovc(	%eax, %edx)		C high is carry if high<divisor
231 	cmovnc(	%eax, %esi)		C 0 if skip div, src high if not
233 	movl	%esi, (%edi,%ecx,4)	C dst high limb
234 	sbbl	$0, %ecx		C size-1 if high<divisor
235 	movl	PARAM_SRC, %esi		C reload
248 	leal	(%ebx,%ecx), %eax	C size+xsize
249 	cmpl	$MUL_THRESHOLD, %eax
250 	jae	L(mul_by_inverse)
C -----------------------------------------------------------------------------
C Small-operand path: plain divl loops for the integer part (real src limbs)
C and the fraction part (implicit zero limbs).  With MUL_THRESHOLD==3 these
C loops run at most twice.
C NOTE(review): the loop bodies are only partially visible in this excerpt
C (discontinuous original line numbers); the divl instructions themselves
C appear to be among the elided lines.
253 C With MUL_THRESHOLD set to 3, the simple loops here only do 0 to 2 limbs.
254 C It'd be possible to write them out without the looping, but no speedup
257 C Using PARAM_DIVISOR instead of %ebp measures 1 cycle/loop faster on the
258 C integer part, but curiously not on the fractional part, where %ebp is a
259 C (fixed) couple of cycles faster.
262 	jz	L(divide_no_integer)
265 	C eax	scratch (quotient)
268 	C edx	scratch (remainder)
273 	movl	-4(%esi,%ecx,4), %eax
277 	movl	%eax, (%edi,%ecx,4)
279 	jnz	L(divide_integer)
282 L(divide_no_integer):
285 	jnz	L(divide_fraction)
294 	addl	$STACK_SPACE, %esp
300 	C eax	scratch (quotient)
303 	C edx	scratch (remainder)
312 	movl	%eax, -4(%edi,%ebx,4)
314 	jnz	L(divide_fraction)
320 C -----------------------------------------------------------------------------
C mul_by_inverse setup: find the normalization shift with bsrl, normalize
C the divisor, and compute the inverse m = floor((b*(b-d)-1)/d) with one
C divl.  Then pre-shift the top one or two src limbs into the n2/n10 form
C the loops expect, dispatching on how many integer-part limbs remain.
C NOTE(review): interleaved instructions are elided in this excerpt.
331 	bsrl	%ebp, %eax		C 31-l
333 	leal	12(%edi), %ebx		C &dst[xsize+2], loop dst stop
334 	leal	4(%edi,%ecx,4), %edi	C &dst[xsize+size]
337 	movl	%ebx, VAR_DST_STOP
339 	movl	%ecx, %ebx		C size
342 	movl	%edx, %edi		C carry
350 	shll	%cl, %ebp		C d normalized
356 	subl	%ebp, %edx		C (b-d)-1 giving edx:eax = b*(b-d)-1
358 	divl	%ebp			C floor (b*(b-d)-1) / d
371 	orl	%ebx, %ebx		C size
372 	movl	%eax, VAR_INVERSE
373 	leal	-12(%esi,%ebx,4), %eax	C &src[size-3]
379 	movl	8(%eax), %esi		C src high limb
382 L(start_two_or_more):
383 	movl	4(%eax), %edx		C src second highest limb
C shldl shifts the high limb left by l while pulling in bits from the limb
C below, building the normalized n2 and n10 values.
385 	shldl(	%cl, %esi, %edi)	C n2 = carry,high << l
387 	shldl(	%cl, %edx, %esi)	C n10 = high,second << l
390 	je	L(integer_two_left)
395 	shldl(	%cl, %esi, %edi)	C n2 = carry,high << l
397 	shll	%cl, %esi		C n10 = high << l
399 	jmp	L(integer_one_left)
403 C Can be here with xsize==0 if mpn_preinv_divrem_1 had size==1 and
404 C skipped a division.
406 	shll	%cl, %edi		C n2 = carry << l
407 	movl	%edi, %eax		C return value for zero_done
415 C -----------------------------------------------------------------------------
417 C The multiply by inverse loop is 17 cycles, and relies on some out-of-order
418 C execution.  The instruction scheduling is important, with various
419 C apparently equivalent forms running 1 to 5 cycles slower.
421 C A lower bound for the time would seem to be 16 cycles, based on the
422 C following successive dependencies.
434 C This chain is what the loop has already, but 16 cycles isn't achieved.
435 C K7 has enough decode, and probably enough execute (depending maybe on what
436 C a mul actually consumes), but nothing running under 17 has been found.
438 C In theory n2+n1 could be done in the sub and addback stages (by
439 C calculating both n2 and n2+n1 there), but lack of registers makes this an
440 C unlikely proposition.
442 C The jz in the loop keeps the q1+1 stage to 1 cycle.  Handling an overflow
443 C from q1+1 with an "sbbl $0, %ebx" would add a cycle to the dependent
444 C chain, and nothing better than 18 cycles has been found when using it.
445 C The jump is taken only when q1 is 0xFFFFFFFF, and on random data this will
446 C be an extremely rare event.
448 C Branch mispredictions will hit random occurrences of q1==0xFFFFFFFF, but
449 C if some special data is coming out with this always, the q1_ff special
450 C case actually runs at 15 c/l.  0x2FFF...FFFD divided by 3 is a good way to
451 C induce the q1_ff case, for speed measurements or testing.  Note that
452 C 0xFFF...FFF divided by 1 or 2 doesn't induce it.
454 C The instruction groupings and empty comments show the cycles for a naive
455 C in-order view of the code (conveniently ignoring the load latency on
456 C VAR_INVERSE).  This shows some of where the time is going, but is nonsense
457 C to the extent that out-of-order execution rearranges it.  In this case
458 C there's 19 cycles shown, but it executes at 17.
C Register roles during the loop (per the surrounding comments):
463 	C ebx	scratch (nadj, q1)
464 	C ecx	scratch (src, dst)
470 	C mm0	scratch (src qword)
471 	C mm7	rshift for normalization
C NOTE(review): loop label and several interleaved instructions are elided
C in this excerpt (original line numbering is discontinuous).
473 	cmpl	$0x80000000, %esi	C n1 as 0=c, 1=nc
477 	leal	(%ebp,%esi), %ebx
478 	cmovc(	%esi, %ebx)		C nadj = n10 + (-n1 & d), ignoring overflow
479 	sbbl	$-1, %eax		C n2+n1
481 	mull	VAR_INVERSE		C m*(n2+n1)
483 	movq	(%ecx), %mm0		C next limb and the one below it
490 	addl	%ebx, %eax		C m*(n2+n1) + nadj, low giving carry flag
491 	leal	1(%edi), %ebx		C n2+1
496 	adcl	%edx, %ebx		C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
509 	movl	VAR_DST_STOP, %eax
513 	sbbl	%edx, %edi		C n - (q1+1)*d
514 	movl	%esi, %edi		C remainder -> n2
515 	leal	(%ebp,%esi), %edx
519 	cmovc(	%edx, %edi)		C n - q1*d if underflow from using q1+1
528 L(integer_loop_done):
531 C -----------------------------------------------------------------------------
C Second-last integer-part step: same q1 = high(m*(n2+n1) + nadj + n2*b) + 1
C computation as the main loop, but q1+1 overflow is handled inline with an
C sbbl rather than the q1_ff branch.
533 C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz
534 C q1_ff special case.  This makes the code a bit smaller and simpler, and
535 C costs only 1 cycle (each).
539 	C ebx	scratch (nadj, q1)
540 	C ecx	scratch (src, dst)
C NOTE(review): some interleaved instructions are elided in this excerpt.
548 	cmpl	$0x80000000, %esi	C n1 as 0=c, 1=nc
552 	leal	(%ebp,%esi), %ebx
553 	cmovc(	%esi, %ebx)		C nadj = n10 + (-n1 & d), ignoring overflow
554 	sbbl	$-1, %eax		C n2+n1
556 	mull	VAR_INVERSE		C m*(n2+n1)
558 	movd	(%ecx), %mm0		C src low limb
560 	movl	VAR_DST_STOP, %ecx
564 	addl	%ebx, %eax		C m*(n2+n1) + nadj, low giving carry flag
565 	leal	1(%edi), %ebx		C n2+1
568 	adcl	%edx, %ebx		C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
584 	sbbl	%edx, %edi		C n - (q1+1)*d
585 	movl	%esi, %edi		C remainder -> n2
586 	leal	(%ebp,%esi), %edx
590 	cmovc(	%edx, %edi)		C n - q1*d if underflow from using q1+1
596 C -----------------------------------------------------------------------------
C Last integer-part step (one limb left), again with sbbl handling a q1+1
C overflow, followed by the q1_ff special case used by the main loop.
599 	C ebx	scratch (nadj, q1)
C NOTE(review): some interleaved instructions are elided in this excerpt.
608 	movl	VAR_DST_STOP, %ecx
609 	cmpl	$0x80000000, %esi	C n1 as 0=c, 1=nc
612 	leal	(%ebp,%esi), %ebx
613 	cmovc(	%esi, %ebx)		C nadj = n10 + (-n1 & d), ignoring overflow
614 	sbbl	$-1, %eax		C n2+n1
616 	mull	VAR_INVERSE		C m*(n2+n1)
624 	addl	%ebx, %eax		C m*(n2+n1) + nadj, low giving carry flag
625 	leal	1(%edi), %ebx		C n2+1
630 	adcl	%edx, %ebx		C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
632 	sbbl	$0, %ebx		C q1 if q1+1 overflowed
646 	sbbl	%edx, %edi		C n - (q1+1)*d
647 	movl	%esi, %edi		C remainder -> n2
648 	leal	(%ebp,%esi), %edx
650 	cmovc(	%edx, %edi)		C n - q1*d if underflow from using q1+1
672 	addl	$STACK_SPACE, %esp
680 C -----------------------------------------------------------------------------
682 C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
683 C of q*d is simply -d and the remainder n-q*d = n10+d
695 	movl	VAR_DST_STOP, %edx
699 	leal	(%ebp,%esi), %edi	C n-q*d remainder -> next n2
702 	movd	%mm0, %esi		C next n10
708 	jmp	L(integer_loop_done)
712 C -----------------------------------------------------------------------------
714 C Being the fractional part, the "source" limbs are all zero, meaning
715 C n10=0, n1=0, and hence nadj=0, leading to many instructions eliminated.
717 C The loop runs at 15 cycles.  The dependent chain is the same as the
718 C general case above, but without the n2+n1 stage (due to n1==0), so 15
719 C would seem to be the lower bound.
721 C A not entirely obvious simplification is that q1+1 never overflows a limb,
722 C and so there's no need for the sbbl $0 or jz q1_ff from the general case.
723 C q1 is the high word of m*n2+b*n2 and the following shows q1<=b-2 always.
724 C rnd() means rounding down to a multiple of d.
726 C	m*n2 + b*n2 <= m*(d-1) + b*(d-1)
727 C	            = m*d + b*d - m - b
728 C	            = floor((b(b-d)-1)/d)*d + b*d - m - b
729 C	            = rnd(b(b-d)-1) + b*d - m - b
730 C	            = rnd(b(b-d)-1 + b*d) - m - b
731 C	            = rnd(b*b-1) - m - b
734 C Unchanged from the general case is that the final quotient limb q can be
735 C either q1 or q1+1, and the q1+1 case occurs often.  This can be seen from
736 C equation 8.4 of the paper which simplifies as follows when n1==0 and
739 C	n-q1*d = (n2*k+q0*d)/b <= d + (d*d-2d)/b
741 C As before, the instruction groupings and empty comments show a naive
742 C in-order view of the code, which is made a nonsense by out of order
743 C execution.  There's 17 cycles shown, but it executes at 15.
745 C Rotating the store q and remainder->n2 instructions up to the top of the
746 C loop gets the run time down from 16 to 15.
C NOTE(review): loop label and several interleaved instructions are elided
C in this excerpt (original line numbering is discontinuous).
759 	movl	VAR_DST_STOP, %ecx	C &dst[xsize+2]
762 	subl	$8, %ecx		C &dst[xsize]
763 	jmp	L(fraction_entry)
768 	C eax	n2 carry, then scratch
769 	C ebx	scratch (nadj, q1)
770 	C ecx	dst, decrementing
776 	movl	%ebx, (%ecx)		C previous q
777 	movl	%eax, %edi		C remainder->n2
780 	mull	VAR_INVERSE		C m*n2
794 	addl	%edx, %ebx		C 1 + high(n2<<32 + m*n2) = q1+1
804 	negl	%eax			C low of n - (q1+1)*d
808 	sbbl	%edx, %edi		C high of n - (q1+1)*d, caring only about carry
809 	leal	(%ebp,%eax), %edx
811 	cmovc(	%edx, %eax)		C n - q1*d if underflow from using q1+1