1 dnl Intel Pentium-II mpn_divrem_1 -- mpn by limb division.
3 dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or
8 dnl modify it under the terms of the GNU Lesser General Public License as
9 dnl published by the Free Software Foundation; either version 3 of the
10 dnl License, or (at your option) any later version.
12 dnl The GNU MP Library is distributed in the hope that it will be useful,
13 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 dnl Lesser General Public License for more details.
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20 include(`../config.m4')
23 C P6MMX: 25.0 cycles/limb integer part, 17.5 cycles/limb fraction part.
26 C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
27 C mp_srcptr src, mp_size_t size,
29 C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
30 C mp_srcptr src, mp_size_t size,
31 C mp_limb_t divisor, mp_limb_t carry);
32 C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize,
33 C mp_srcptr src, mp_size_t size,
34 C mp_limb_t divisor, mp_limb_t inverse,
37 C This code is a lightly reworked version of mpn/x86/k7/mmx/divrem_1.asm,
38 C see that file for some comments. It's possible what's here can be improved.
41 dnl MUL_THRESHOLD is the value of xsize+size at which the multiply by
42 dnl inverse method is used, rather than plain "divl"s. Minimum value 1.
44 dnl The different speeds of the integer and fraction parts means that using
45 dnl xsize+size isn't quite right. The threshold wants to be a bit higher
46 dnl for the integer part and a bit lower for the fraction part. (Or what's
47 dnl really wanted is to speed up the integer part!)
49 dnl The threshold is set to make the integer part right. At 4 limbs the
50 dnl div and mul are about the same there, but on the fractional part the
51 dnl mul is much faster.
53 deflit(MUL_THRESHOLD, 4)
dnl  Stack frame layout.  Positive defframe() offsets are incoming stack
dnl  arguments (relative to %esp at function entry); negative offsets are
dnl  locals within the STACK_SPACE area reserved by each prologue.
56 defframe(PARAM_PREINV_SHIFT, 28) dnl mpn_preinv_divrem_1
57 defframe(PARAM_PREINV_INVERSE, 24) dnl mpn_preinv_divrem_1
dnl  PARAM_CARRY shares offset 24 with PARAM_PREINV_INVERSE: the two
dnl  parameters belong to different entry points and are never both live.
58 defframe(PARAM_CARRY, 24) dnl mpn_divrem_1c
59 defframe(PARAM_DIVISOR,20)
60 defframe(PARAM_SIZE, 16)
61 defframe(PARAM_SRC, 12)
62 defframe(PARAM_XSIZE, 8)
63 defframe(PARAM_DST, 4)
dnl  Slots for the callee-saved registers.
65 defframe(SAVE_EBX, -4)
66 defframe(SAVE_ESI, -8)
67 defframe(SAVE_EDI, -12)
68 defframe(SAVE_EBP, -16)
dnl  Locals used by the multiply-by-inverse method.
70 defframe(VAR_NORM, -20)
71 defframe(VAR_INVERSE, -24)
72 defframe(VAR_SRC, -28)
73 defframe(VAR_DST, -32)
74 defframe(VAR_DST_STOP,-36)
dnl  Total bytes reserved by each prologue; matches the lowest negative
dnl  defframe offset above.
76 deflit(STACK_SPACE, 36)
C -----------------------------------------------------------------------------
C mpn_preinv_divrem_1 entry: the caller supplies a pre-computed inverse of
C the normalized divisor and the matching shift count, so the bsrl/divl
C normalization done in mpn_divrem_1 is skipped.
C NOTE(review): this excerpt elides some instructions (the initial loads of
C %esi, %ebx and %edx, and the zeroing of %ecx before the cmovnc, are not
C visible here) -- confirm register commentary against the complete file.
81 PROLOGUE(mpn_preinv_divrem_1)
83 movl PARAM_XSIZE, %ecx
84 subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE)
93 movl PARAM_DIVISOR, %ebp
98 movl -4(%esi,%ebx,4), %eax C src high limb
99 xorl %edi, %edi C initial carry (if can't skip a div)
103 leal 8(%edx,%ecx,4), %edx C &dst[xsize+2]
106 movl %edx, VAR_DST_STOP C &dst[xsize+2]
C If the high source limb is below the divisor it serves directly as the
C initial remainder and the corresponding quotient limb is 0, saving one
C division step; the cmov pair selects both outcomes without branching.
107 cmpl %ebp, %eax C high cmp divisor
109 cmovc( %eax, %edi) C high is carry if high<divisor
111 cmovnc( %eax, %ecx) C 0 if skip div, src high if not
112 C (the latter in case src==dst)
114 movl %ecx, -12(%edx,%ebx,4) C dst high limb
C The sbbl consumes the carry still set by the cmpl above.
116 sbbl $0, %ebx C skip one division if high<divisor
117 movl PARAM_PREINV_SHIFT, %ecx
119 leal -8(%edx,%ebx,4), %edx C &dst[xsize+size]
122 movl %edx, VAR_DST C &dst[xsize+size]
124 shll %cl, %ebp C d normalized
128 movd %eax, %mm7 C rshift
129 movl PARAM_PREINV_INVERSE, %eax
C -----------------------------------------------------------------------------
C mpn_divrem_1c entry: same as mpn_divrem_1 but with an explicit initial
C carry (partial remainder) supplied by the caller in PARAM_CARRY, loaded
C here into %edx.
C NOTE(review): some instructions between the visible lines are elided in
C this excerpt (e.g. the loads of %esi and %edi).
138 PROLOGUE(mpn_divrem_1c)
140 movl PARAM_CARRY, %edx
142 movl PARAM_SIZE, %ecx
143 subl $STACK_SPACE, %esp
144 deflit(`FRAME',STACK_SPACE)
147 movl PARAM_XSIZE, %ebx
153 movl PARAM_DIVISOR, %ebp
C %edi = &dst[xsize-1], matching the same setup in mpn_divrem_1, before
C falling into the shared code.
158 leal -4(%edi,%ebx,4), %edi
164 C offset 0x31, close enough to aligned
C -----------------------------------------------------------------------------
C mpn_divrem_1 main entry.  Chooses between plain divl loops (small
C operands) and the multiply-by-inverse method once size+xsize reaches
C MUL_THRESHOLD.
165 PROLOGUE(mpn_divrem_1)
168 movl PARAM_SIZE, %ecx
169 movl $0, %edx C initial carry (if can't skip a div)
170 subl $STACK_SPACE, %esp
171 deflit(`FRAME',STACK_SPACE)
174 movl PARAM_DIVISOR, %ebp
177 movl PARAM_XSIZE, %ebx
181 orl %ecx, %ecx C size
186 leal -4(%edi,%ebx,4), %edi C &dst[xsize-1]
187 jz L(no_skip_div) C if size==0
C If src[size-1] < divisor it can serve as the initial remainder, the top
C quotient limb is 0, and one division is skipped (branch-free via cmov).
189 movl -4(%esi,%ecx,4), %eax C src high limb
191 cmpl %ebp, %eax C high cmp divisor
193 cmovc( %eax, %edx) C high is carry if high<divisor
195 cmovnc( %eax, %esi) C 0 if skip div, src high if not
196 C (the latter in case src==dst)
198 movl %esi, (%edi,%ecx,4) C dst high limb
C The sbbl consumes the carry still set by the cmpl above.
200 sbbl $0, %ecx C size-1 if high<divisor
201 movl PARAM_SRC, %esi C reload
C Method selection on the total number of quotient limbs to produce.
214 leal (%ebx,%ecx), %eax C size+xsize
215 cmpl $MUL_THRESHOLD, %eax
216 jae L(mul_by_inverse)
219 jz L(divide_no_integer)
C Plain divl loop over the integer-part limbs, working from the top down.
C NOTE(review): the divl itself and the counter update are elided in this
C excerpt.
222 C eax scratch (quotient)
225 C edx scratch (remainder)
230 movl -4(%esi,%ecx,4), %eax
234 movl %eax, (%edi,%ecx,4)
236 jnz L(divide_integer)
239 L(divide_no_integer):
242 jnz L(divide_fraction)
253 addl $STACK_SPACE, %esp
C Plain divl loop over the xsize fraction-part limbs (numerator limbs are
C implicitly zero there).
259 C eax scratch (quotient)
262 C edx scratch (remainder)
271 movl %eax, -4(%edi,%ebx,4)
273 jnz L(divide_fraction)
279 C -----------------------------------------------------------------------------
C mul_by_inverse: setup for the multiply-by-inverse division method.
C The divisor is left-shifted by l (its leading-zero count, from bsrl) so
C its high bit is set, and the 32-bit inverse floor((b*(b-d)-1)/d) with
C b=2^32 is computed with a single divl, as in the generic invert_limb.
C NOTE(review): several instructions between the visible lines are elided
C in this excerpt (e.g. the setup of edx:eax before the subl/divl pair and
C the size-based dispatch before L(start_two_or_more)).
290 leal 12(%edi), %ebx C &dst[xsize+2], loop dst stop
292 movl %ebx, VAR_DST_STOP
293 leal 4(%edi,%ecx,4), %edi C &dst[xsize+size]
296 movl %ecx, %ebx C size
298 bsrl %ebp, %ecx C 31-l
299 movl %edx, %edi C carry
301 leal 1(%ecx), %eax C 32-l
307 shll %cl, %ebp C d normalized
311 subl %ebp, %edx C (b-d)-1 giving edx:eax = b*(b-d)-1
313 divl %ebp C floor (b*(b-d)-1) / d
326 movl %eax, VAR_INVERSE
327 orl %ebx, %ebx C size
328 leal -12(%esi,%ebx,4), %eax C &src[size-3]
C Two-or-more source limbs: pre-shift the top two limbs into the working
C registers, n2 (high) in %edi and n10 (next two-limb window) in %esi.
333 movl 8(%eax), %esi C src high limb
337 L(start_two_or_more):
338 movl 4(%eax), %edx C src second highest limb
340 shldl( %cl, %esi, %edi) C n2 = carry,high << l
342 shldl( %cl, %edx, %esi) C n10 = high,second << l
345 je L(integer_two_left)
C Exactly one source limb: same pre-shift but with no second limb.
350 shldl( %cl, %esi, %edi) C n2 = carry,high << l
352 shll %cl, %esi C n10 = high << l
353 jmp L(integer_one_left)
357 C Can be here with xsize==0 if mpn_preinv_divrem_1 had size==1 and
358 C skipped a division.
360 shll %cl, %edi C n2 = carry << l
361 movl %edi, %eax C return value for zero_done
369 C -----------------------------------------------------------------------------
C Integer-part main loop: one quotient limb per iteration.  The quotient
C estimate is q1+1 = n2 + 1 + high(m*(n2+n1) + nadj) per Granlund and
C Montgomery's division-by-invariant-integers method, followed by a
C branch-free addback of d when q1+1 overshot by one.
C NOTE(review): this excerpt elides instructions between several visible
C lines (e.g. the mull computing q*d before the sbbl at "n - (q1+1)*d",
C and the quotient store) -- the remainder commentary below reflects only
C what is visible.
371 C This loop runs at about 25 cycles, which is probably sub-optimal, and
372 C certainly more than the dependent chain would suggest. A better loop, or
373 C a better rough analysis of what's possible, would be welcomed.
375 C In the current implementation, the following successively dependent
376 C micro-ops seem to exist.
388 C Lack of registers hinders explicit scheduling and it might be that the
389 C normal out of order execution isn't able to hide enough under the mul
392 C Using sarl/negl to pick out n1 for the n2+n1 stage is a touch faster than
393 C cmov (and takes one uop off the dependent chain). A sarl/andl/addl
394 C combination was tried for the addback (despite the fact it would lengthen
395 C the dependent chain) but found to be no faster.
401 C ebx scratch (nadj, q1)
402 C ecx scratch (src, dst)
408 C mm0 scratch (src qword)
409 C mm7 rshift for normalization
417 andl %eax, %ebx C -n1 & d
420 addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
421 addl %edi, %eax C n2+n1
422 movq (%ecx), %mm0 C next src limb and the one below it
424 mull VAR_INVERSE C m*(n2+n1)
434 addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
436 leal 1(%edi), %ebx C n2+1
438 adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
453 movl VAR_DST_STOP, %eax
455 sbbl %edx, %edi C n - (q1+1)*d
456 movl %esi, %edi C remainder -> n2
457 leal (%ebp,%esi), %edx
459 cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
472 L(integer_loop_done):
475 C -----------------------------------------------------------------------------
C integer_two_left: the last two numerator limbs of the integer part,
C handled outside the main loop.  Same quotient-estimate and addback
C scheme as the loop above.
C NOTE(review): the mull computing q*d and the quotient store are elided
C in this excerpt.
477 C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz
478 C q1_ff special case. This makes the code a bit smaller and simpler, and
479 C costs only 2 cycles (each).
483 C ebx scratch (nadj, q1)
484 C ecx scratch (src, dst)
499 andl %eax, %ebx C -n1 & d
502 addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
503 addl %edi, %eax C n2+n1
505 mull VAR_INVERSE C m*(n2+n1)
507 movd (%ecx), %mm0 C src low limb
509 movl VAR_DST_STOP, %ecx
515 addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
516 leal 1(%edi), %ebx C n2+1
519 adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
535 sbbl %edx, %edi C n - (q1+1)*d
536 movl %esi, %edi C remainder -> n2
537 leal (%ebp,%esi), %edx
539 cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
547 C -----------------------------------------------------------------------------
C integer_one_left: final numerator limb of the integer part.  After this
C limb, the sbbl $0 folds a q1+1 overflow back to q1 (in place of a jz to
C a q1_ff special case, as noted above integer_two_left).  The code then
C either falls into the fraction part (xsize != 0) or unwinds the frame.
C NOTE(review): the mull computing q*d, the quotient store, and the final
C remainder un-normalization/return are elided in this excerpt.
550 C ebx scratch (nadj, q1)
564 movl VAR_DST_STOP, %ecx
566 andl %eax, %ebx C -n1 & d
569 addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
570 addl %edi, %eax C n2+n1
572 mull VAR_INVERSE C m*(n2+n1)
580 addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
581 leal 1(%edi), %ebx C n2+1
586 adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
588 sbbl $0, %ebx C q1 if q1+1 overflowed
601 movl PARAM_XSIZE, %eax
603 sbbl %edx, %edi C n - (q1+1)*d
604 movl %esi, %edi C remainder -> n2
605 leal (%ebp,%esi), %edx
607 cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
616 orl %eax, %eax C xsize
630 addl $STACK_SPACE, %esp
638 C -----------------------------------------------------------------------------
C Reached from the integer loop when the q1+1 estimate wrapped to 0,
C i.e. q1 was already the maximum limb value.
640 C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
641 C of q*d is simply -d and the remainder n-q*d = n10+d
653 movl VAR_DST_STOP, %edx
658 leal (%ebp,%esi), %edi C n-q*d remainder -> next n2
661 movd %mm0, %esi C next n10
C Rejoin the main loop's common tail.
666 jmp L(integer_loop_done)
670 C -----------------------------------------------------------------------------
C Fraction-part loop: produces the xsize quotient limbs below the radix
C point.  The numerator limbs here are implicitly zero, so the nadj term
C drops out and the estimate reduces to q1+1 = n2 + 1 + high(m*n2), which
C is why this loop is faster than the integer part (17.5 vs 25 c/l).
C NOTE(review): the mull computing q*d, the loop counter/branch, and the
C final return path are elided in this excerpt.
672 C In the current implementation, the following successively dependent
673 C micro-ops seem to exist.
684 C The loop in fact runs at about 17.5 cycles. Using a sarl/andl/addl for
685 C the addback was found to be a touch slower.
699 movl VAR_DST_STOP, %ecx C &dst[xsize+2]
702 subl $8, %ecx C &dst[xsize]
707 C eax n2, then scratch
708 C ebx scratch (nadj, q1)
709 C ecx dst, decrementing
715 mull VAR_INVERSE C m*n2
727 addl %edx, %ebx C 1 + high(n2<<32 + m*n2) = q1+1
C With the low numerator limb zero, n - (q1+1)*d needs only a negate of
C the product low limb and a borrow into the high limb.
739 negl %eax C low of n - (q1+1)*d
741 sbbl %edx, %edi C high of n - (q1+1)*d, caring only about carry
742 leal (%ebp,%eax), %edx
744 cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1
747 movl %eax, %edi C remainder->n2
750 movl %ebx, (%ecx) C previous q