mpn/x86/pentium4/sse2/divrem_1.asm

   1 dnl  Intel Pentium-4 mpn_divrem_1 -- mpn by limb division.
   2
   3 dnl  Copyright 1999, 2000, 2001, 2002, 2003, 2004 Free Software Foundation,
   4 dnl  Inc.
   5 dnl
   6 dnl  This file is part of the GNU MP Library.
   7 dnl
   8 dnl  The GNU MP Library is free software; you can redistribute it and/or
   9 dnl  modify it under the terms of the GNU Lesser General Public License as
  10 dnl  published by the Free Software Foundation; either version 3 of the
  11 dnl  License, or (at your option) any later version.
  12 dnl
  13 dnl  The GNU MP Library is distributed in the hope that it will be useful,
  14 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16 dnl  Lesser General Public License for more details.
  17 dnl
  18 dnl  You should have received a copy of the GNU Lesser General Public License
  19 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
  20
  21 include(`../config.m4')
  22
  23
  24 C P4: 32 cycles/limb integer part, 30 cycles/limb fraction part.
  25
  26
  27 C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
  28 C                         mp_srcptr src, mp_size_t size,
  29 C                         mp_limb_t divisor);
  30 C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
  31 C                          mp_srcptr src, mp_size_t size,
  32 C                          mp_limb_t divisor, mp_limb_t carry);
  33 C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize,
  34 C                                mp_srcptr src, mp_size_t size,
  35 C                                mp_limb_t divisor, mp_limb_t inverse,
  36 C                                unsigned shift);
  37 C
  38 C Algorithm:
  39 C
  40 C The method and nomenclature follow part 8 of "Division by Invariant
  41 C Integers using Multiplication" by Granlund and Montgomery, reference in
  42 C gmp.texi.
  43 C
  44 C "m" is written for what is m' in the paper, and "d" for d_norm, which
  45 C won't cause any confusion since it's only the normalized divisor that's of
  46 C any use in the code.  "b" is written for 2^N, the size of a limb, N being
  47 C 32 here.
  48 C
  49 C The step "sdword dr = n - 2^N*d + (2^N-1-q1) * d" is instead done as
  50 C "n-d - q1*d".  This rearrangement gives the same two-limb answer but lets
  51 C us have just a psubq on the dependent chain.
  52 C
  53 C For reference, the way the k7 code uses "n-(q1+1)*d" would not suit here,
  54 C detecting an overflow of q1+1 when q1=0xFFFFFFFF would cost too much.
  55 C
  56 C Notes:
  57 C
  58 C mpn_divrem_1 and mpn_preinv_divrem_1 avoid one division if the src high
  59 C limb is less than the divisor.  mpn_divrem_1c doesn't check for a zero
  60 C carry, since in normal circumstances that will be a very rare event.
  61 C
  62 C The test for skipping a division is branch free (once size>=1 is tested).
  63 C The store to the destination high limb is 0 when a divide is skipped, or
  64 C if it's not skipped then a copy of the src high limb is stored.  The
  65 C latter is in case src==dst.
  66 C
  67 C There's a small bias towards expecting xsize==0, by having code for
  68 C xsize==0 in a straight line and xsize!=0 under forward jumps.
  69 C
  70 C Enhancements:
  71 C
  72 C The loop measures 32 cycles, but the dependent chain would suggest it
  73 C could be done with 30.  Not sure where to start looking for the extras.
  74 C
  75 C Alternatives:
  76 C
  77 C If the divisor is normalized (high bit set) then a division step can
  78 C always be skipped, since the high destination limb is always 0 or 1 in
  79 C that case.  It doesn't seem worth checking for this though, since it
  80 C probably occurs infrequently.
  81
  82
  83 dnl  MUL_THRESHOLD is the value of xsize+size at which the multiply by
  84 dnl  inverse method is used, rather than plain "divl"s.  Minimum value 1.
  85 dnl
  86 dnl  The inverse takes about 80-90 cycles to calculate, but after that the
  87 dnl  multiply is 32 c/l versus division at about 58 c/l.
  88 dnl
  89 dnl  At 4 limbs the div is a touch faster than the mul (and of course
  90 dnl  simpler), so start the mul from 5 limbs.
  91
  92 deflit(MUL_THRESHOLD, 5)
  93
  94
  95 defframe(PARAM_PREINV_SHIFT,   28)  dnl mpn_preinv_divrem_1
  96 defframe(PARAM_PREINV_INVERSE, 24)  dnl mpn_preinv_divrem_1
  97 defframe(PARAM_CARRY,  24)          dnl mpn_divrem_1c
  98 defframe(PARAM_DIVISOR,20)
  99 defframe(PARAM_SIZE,   16)
 100 defframe(PARAM_SRC,    12)
 101 defframe(PARAM_XSIZE,  8)
 102 defframe(PARAM_DST,    4)
 103
 104 dnl  re-use parameter space
 105 define(SAVE_ESI,`PARAM_SIZE')
 106 define(SAVE_EBP,`PARAM_SRC')
 107 define(SAVE_EDI,`PARAM_DIVISOR')
 108 define(SAVE_EBX,`PARAM_DST')
 109
 110         TEXT
 111
 112         ALIGN(16)
 113 PROLOGUE(mpn_preinv_divrem_1)
 114 deflit(`FRAME',0)
 115
 116         movl    PARAM_SIZE, %ecx
 117         xorl    %edx, %edx              C carry if can't skip a div
 118
 119         movl    %esi, SAVE_ESI
 120         movl    PARAM_SRC, %esi
 121
 122         movl    %ebp, SAVE_EBP
 123         movl    PARAM_DIVISOR, %ebp
 124
 125         movl    %edi, SAVE_EDI
 126         movl    PARAM_DST, %edi
 127
 128         movl    -4(%esi,%ecx,4), %eax   C src high limb
 129
 130         movl    %ebx, SAVE_EBX
 131         movl    PARAM_XSIZE, %ebx
 132
 133         movd    PARAM_PREINV_INVERSE, %mm4
 134
 135         movd    PARAM_PREINV_SHIFT, %mm7  C l
 136         cmpl    %ebp, %eax              C high cmp divisor
 137
 138         cmovc(  %eax, %edx)             C high is carry if high<divisor
 139         movd    %edx, %mm0              C carry
 140
 141         movd    %edx, %mm1              C carry
 142         movl    $0, %edx
 143
 144         movd    %ebp, %mm5              C d
 145         cmovnc( %eax, %edx)             C 0 if skip div, src high if not
 146                                         C (the latter in case src==dst)
 147         leal    -4(%edi,%ebx,4), %edi   C &dst[xsize-1]
 148
 149         movl    %edx, (%edi,%ecx,4)     C dst high limb
 150         sbbl    $0, %ecx                C skip one division if high<divisor
 151         movl    $32, %eax
 152
 153         subl    PARAM_PREINV_SHIFT, %eax
 154         psllq   %mm7, %mm5              C d normalized
 155         leal    (%edi,%ecx,4), %edi     C &dst[xsize+size-1]
 156         leal    -4(%esi,%ecx,4), %esi   C &src[size-1]
 157
 158         movd    %eax, %mm6              C 32-l
 159         jmp     L(start_preinv)
 160
 161 EPILOGUE()
 162
 163
 164         ALIGN(16)
 165 PROLOGUE(mpn_divrem_1c)
 166 deflit(`FRAME',0)
 167
 168         movl    PARAM_CARRY, %edx
 169
 170         movl    PARAM_SIZE, %ecx
 171
 172         movl    %esi, SAVE_ESI
 173         movl    PARAM_SRC, %esi
 174
 175         movl    %ebp, SAVE_EBP
 176         movl    PARAM_DIVISOR, %ebp
 177
 178         movl    %edi, SAVE_EDI
 179         movl    PARAM_DST, %edi
 180
 181         movl    %ebx, SAVE_EBX
 182         movl    PARAM_XSIZE, %ebx
 183
 184         leal    -4(%edi,%ebx,4), %edi   C &dst[xsize-1]
 185         jmp     L(start_1c)
 186
 187 EPILOGUE()
 188
 189
 190         ALIGN(16)
 191 PROLOGUE(mpn_divrem_1)
 192 deflit(`FRAME',0)
 193
 194         movl    PARAM_SIZE, %ecx
 195         xorl    %edx, %edx              C initial carry (if can't skip a div)
 196
 197         movl    %esi, SAVE_ESI
 198         movl    PARAM_SRC, %esi
 199
 200         movl    %ebp, SAVE_EBP
 201         movl    PARAM_DIVISOR, %ebp
 202
 203         movl    %edi, SAVE_EDI
 204         movl    PARAM_DST, %edi
 205
 206         movl    %ebx, SAVE_EBX
 207         movl    PARAM_XSIZE, %ebx
 208         leal    -4(%edi,%ebx,4), %edi   C &dst[xsize-1]
 209
 210         orl     %ecx, %ecx              C size
 211         jz      L(no_skip_div)          C if size==0
 212         movl    -4(%esi,%ecx,4), %eax   C src high limb
 213
 214         cmpl    %ebp, %eax              C high cmp divisor
 215
 216         cmovnc( %eax, %edx)             C 0 if skip div, src high if not
 217         movl    %edx, (%edi,%ecx,4)     C dst high limb
 218
 219         movl    $0, %edx
 220         cmovc(  %eax, %edx)             C high is carry if high<divisor
 221
 222         sbbl    $0, %ecx                C size-1 if high<divisor
 223 L(no_skip_div):
 224
 225
 226 L(start_1c):
 227         C eax
 228         C ebx   xsize
 229         C ecx   size
 230         C edx   carry
 231         C esi   src
 232         C edi   &dst[xsize-1]
 233         C ebp   divisor
 234
 235         leal    (%ebx,%ecx), %eax       C size+xsize
 236         leal    -4(%esi,%ecx,4), %esi   C &src[size-1]
 237         leal    (%edi,%ecx,4), %edi     C &dst[size+xsize-1]
 238
 239         cmpl    $MUL_THRESHOLD, %eax
 240         jae     L(mul_by_inverse)
 241
 242
 243         orl     %ecx, %ecx
 244         jz      L(divide_no_integer)    C if size==0
 245
 246 L(divide_integer):
 247         C eax   scratch (quotient)
 248         C ebx   xsize
 249         C ecx   counter
 250         C edx   carry
 251         C esi   src, decrementing
 252         C edi   dst, decrementing
 253         C ebp   divisor
 254
 255         movl    (%esi), %eax
 256         subl    $4, %esi
 257
 258         divl    %ebp
 259
 260         movl    %eax, (%edi)
 261         subl    $4, %edi
 262
 263         subl    $1, %ecx
 264         jnz     L(divide_integer)
 265
 266
 267 L(divide_no_integer):
 268         orl     %ebx, %ebx
 269         jnz     L(divide_fraction)      C if xsize!=0
 270
 271 L(divide_done):
 272         movl    SAVE_ESI, %esi
 273         movl    SAVE_EDI, %edi
 274         movl    SAVE_EBX, %ebx
 275         movl    SAVE_EBP, %ebp
 276         movl    %edx, %eax
 277         ret
 278
 279
 280 L(divide_fraction):
 281         C eax   scratch (quotient)
 282         C ebx   counter
 283         C ecx
 284         C edx   carry
 285         C esi
 286         C edi   dst, decrementing
 287         C ebp   divisor
 288
 289         movl    $0, %eax
 290
 291         divl    %ebp
 292
 293         movl    %eax, (%edi)
 294         subl    $4, %edi
 295
 296         subl    $1, %ebx
 297         jnz     L(divide_fraction)
 298
 299         jmp     L(divide_done)
 300
 301
 302
 303 C -----------------------------------------------------------------------------
 304
 305 L(mul_by_inverse):
 306         C eax
 307         C ebx   xsize
 308         C ecx   size
 309         C edx   carry
 310         C esi   &src[size-1]
 311         C edi   &dst[size+xsize-1]
 312         C ebp   divisor
 313
 314         bsrl    %ebp, %eax              C 31-l
 315         movd    %edx, %mm0              C carry
 316         movd    %edx, %mm1              C carry
 317         movl    %ecx, %edx              C size
 318         movl    $31, %ecx
 319
 320         C
 321
 322         xorl    %eax, %ecx              C l = leading zeros on d
 323         addl    $1, %eax
 324
 325         shll    %cl, %ebp               C d normalized
 326         movd    %ecx, %mm7              C l
 327         movl    %edx, %ecx              C size
 328
 329         movd    %eax, %mm6              C 32-l
 330         movl    $-1, %edx
 331         movl    $-1, %eax
 332
 333         C
 334
 335         subl    %ebp, %edx              C (b-d)-1 so  edx:eax = b*(b-d)-1
 336
 337         divl    %ebp                    C floor (b*(b-d)-1 / d)
 338         movd    %ebp, %mm5              C d
 339
 340         C
 341
 342         movd    %eax, %mm4              C m
 343
 344
 345 L(start_preinv):
 346         C eax   inverse
 347         C ebx   xsize
 348         C ecx   size
 349         C edx
 350         C esi   &src[size-1]
 351         C edi   &dst[size+xsize-1]
 352         C ebp
 353         C
 354         C mm0   carry
 355         C mm1   carry
 356         C mm2
 357         C mm4   m
 358         C mm5   d
 359         C mm6   31-l
 360         C mm7   l
 361
 362         psllq   %mm7, %mm0              C n2 = carry << l, for size==0
 363
 364         subl    $1, %ecx
 365         jb      L(integer_none)
 366
 367         movd    (%esi), %mm0            C src high limb
 368         punpckldq %mm1, %mm0
 369         psrlq   %mm6, %mm0              C n2 = high (carry:srchigh << l)
 370         jz      L(integer_last)
 371
 372
 373 C The dependent chain here consists of
 374 C
 375 C       2   paddd    n1+n2
 376 C       8   pmuludq  m*(n1+n2)
 377 C       2   paddq    n2:nadj + m*(n1+n2)
 378 C       2   psrlq    q1
 379 C       8   pmuludq  d*q1
 380 C       2   psubq    (n-d)-q1*d
 381 C       2   psrlq    high n-(q1+1)*d mask
 382 C       2   pand     d masked
 383 C       2   paddd    n2+d addback
 384 C       --
 385 C       30
 386 C
 387 C But it seems to run at 32 cycles, so presumably there's something else
 388 C going on.
 389
 390         ALIGN(16)
 391 L(integer_top):
 392         C eax
 393         C ebx
 394         C ecx   counter, size-1 to 0
 395         C edx
 396         C esi   src, decrementing
 397         C edi   dst, decrementing
 398         C
 399         C mm0   n2
 400         C mm4   m
 401         C mm5   d
 402         C mm6   32-l
 403         C mm7   l
 404
 405         ASSERT(b,`C n2<d
 406          movd   %mm0, %eax
 407          movd   %mm5, %edx
 408          cmpl   %edx, %eax')
 409
 410         movd    -4(%esi), %mm1          C next src limbs
 411         movd    (%esi), %mm2
 412         leal    -4(%esi), %esi
 413
 414         punpckldq %mm2, %mm1
 415         psrlq   %mm6, %mm1              C n10
 416
 417         movq    %mm1, %mm2              C n10
 418         movq    %mm1, %mm3              C n10
 419         psrad   $31, %mm1               C -n1
 420         pand    %mm5, %mm1              C -n1 & d
 421         paddd   %mm2, %mm1              C nadj = n10+(-n1&d), ignore overflow
 422
 423         psrld   $31, %mm2               C n1
 424         paddd   %mm0, %mm2              C n2+n1
 425         punpckldq %mm0, %mm1            C n2:nadj
 426
 427         pmuludq %mm4, %mm2              C m*(n2+n1)
 428
 429         C
 430
 431         paddq   %mm2, %mm1              C n2:nadj + m*(n2+n1)
 432         pxor    %mm2, %mm2              C break dependency, saves 4 cycles
 433         pcmpeqd %mm2, %mm2              C FF...FF
 434         psrlq   $63, %mm2               C 1
 435
 436         psrlq   $32, %mm1               C q1 = high(n2:nadj + m*(n2+n1))
 437
 438         paddd   %mm1, %mm2              C q1+1
 439         pmuludq %mm5, %mm1              C q1*d
 440
 441         punpckldq %mm0, %mm3            C n = n2:n10
 442         pxor    %mm0, %mm0
 443
 444         psubq   %mm5, %mm3              C n - d
 445
 446         C
 447
 448         psubq   %mm1, %mm3              C n - (q1+1)*d
 449
 450         por     %mm3, %mm0              C copy remainder -> new n2
 451         psrlq   $32, %mm3               C high n - (q1+1)*d, 0 or -1
 452
 453         ASSERT(be,`C 0 or -1
 454          movd   %mm3, %eax
 455          addl   $1, %eax
 456          cmpl   $1, %eax')
 457
 458         paddd   %mm3, %mm2              C q
 459         pand    %mm5, %mm3              C mask & d
 460
 461         paddd   %mm3, %mm0              C addback if necessary
 462         movd    %mm2, (%edi)
 463         leal    -4(%edi), %edi
 464
 465         subl    $1, %ecx
 466         ja      L(integer_top)
 467
 468
 469 L(integer_last):
 470         C eax
 471         C ebx   xsize
 472         C ecx
 473         C edx
 474         C esi   &src[0]
 475         C edi   &dst[xsize]
 476         C
 477         C mm0   n2
 478         C mm4   m
 479         C mm5   d
 480         C mm6
 481         C mm7   l
 482
 483         ASSERT(b,`C n2<d
 484          movd   %mm0, %eax
 485          movd   %mm5, %edx
 486          cmpl   %edx, %eax')
 487
 488         movd    (%esi), %mm1            C src[0]
 489         psllq   %mm7, %mm1              C n10
 490
 491         movq    %mm1, %mm2              C n10
 492         movq    %mm1, %mm3              C n10
 493         psrad   $31, %mm1               C -n1
 494         pand    %mm5, %mm1              C -n1 & d
 495         paddd   %mm2, %mm1              C nadj = n10+(-n1&d), ignore overflow
 496
 497         psrld   $31, %mm2               C n1
 498         paddd   %mm0, %mm2              C n2+n1
 499         punpckldq %mm0, %mm1            C n2:nadj
 500
 501         pmuludq %mm4, %mm2              C m*(n2+n1)
 502
 503         C
 504
 505         paddq   %mm2, %mm1              C n2:nadj + m*(n2+n1)
 506         pcmpeqd %mm2, %mm2              C FF...FF
 507         psrlq   $63, %mm2               C 1
 508
 509         psrlq   $32, %mm1               C q1 = high(n2:nadj + m*(n2+n1))
 510         paddd   %mm1, %mm2              C q1
 511
 512         pmuludq %mm5, %mm1              C q1*d
 513         punpckldq %mm0, %mm3            C n
 514         psubq   %mm5, %mm3              C n - d
 515         pxor    %mm0, %mm0
 516
 517         C
 518
 519         psubq   %mm1, %mm3              C n - (q1+1)*d
 520
 521         por     %mm3, %mm0              C remainder -> n2
 522         psrlq   $32, %mm3               C high n - (q1+1)*d, 0 or -1
 523
 524         ASSERT(be,`C 0 or -1
 525          movd   %mm3, %eax
 526          addl   $1, %eax
 527          cmpl   $1, %eax')
 528
 529         paddd   %mm3, %mm2              C q
 530         pand    %mm5, %mm3              C mask & d
 531
 532         paddd   %mm3, %mm0              C addback if necessary
 533         movd    %mm2, (%edi)
 534         leal    -4(%edi), %edi
 535
 536
 537 L(integer_none):
 538         C eax
 539         C ebx   xsize
 540
 541         orl     %ebx, %ebx
 542         jnz     L(fraction_some)        C if xsize!=0
 543
 544
 545 L(fraction_done):
 546         movl    SAVE_EBP, %ebp
 547         psrld   %mm7, %mm0              C remainder
 548
 549         movl    SAVE_EDI, %edi
 550         movd    %mm0, %eax
 551
 552         movl    SAVE_ESI, %esi
 553         movl    SAVE_EBX, %ebx
 554         emms
 555         ret
 556
 557
 558
 559 C -----------------------------------------------------------------------------
 560 C
 561
 562 L(fraction_some):
 563         C eax
 564         C ebx   xsize
 565         C ecx
 566         C edx
 567         C esi
 568         C edi   &dst[xsize-1]
 569         C ebp
 570
 571
 572 L(fraction_top):
 573         C eax
 574         C ebx   counter, xsize iterations
 575         C ecx
 576         C edx
 577         C esi   src, decrementing
 578         C edi   dst, decrementing
 579         C
 580         C mm0   n2
 581         C mm4   m
 582         C mm5   d
 583         C mm6   32-l
 584         C mm7   l
 585
 586         ASSERT(b,`C n2<d
 587          movd   %mm0, %eax
 588          movd   %mm5, %edx
 589          cmpl   %edx, %eax')
 590
 591         movq    %mm0, %mm1              C n2
 592         pmuludq %mm4, %mm0              C m*n2
 593
 594         pcmpeqd %mm2, %mm2
 595         psrlq   $63, %mm2
 596
 597         C
 598
 599         psrlq   $32, %mm0               C high(m*n2)
 600
 601         paddd   %mm1, %mm0              C q1 = high(n2:0 + m*n2)
 602
 603         paddd   %mm0, %mm2              C q1+1
 604         pmuludq %mm5, %mm0              C q1*d
 605
 606         psllq   $32, %mm1               C n = n2:0
 607         psubq   %mm5, %mm1              C n - d
 608
 609         C
 610
 611         psubq   %mm0, %mm1              C r = n - (q1+1)*d
 612         pxor    %mm0, %mm0
 613
 614         por     %mm1, %mm0              C r -> n2
 615         psrlq   $32, %mm1               C high n - (q1+1)*d, 0 or -1
 616
 617         ASSERT(be,`C 0 or -1
 618          movd   %mm1, %eax
 619          addl   $1, %eax
 620          cmpl   $1, %eax')
 621
 622         paddd   %mm1, %mm2              C q
 623         pand    %mm5, %mm1              C mask & d
 624
 625         paddd   %mm1, %mm0              C addback if necessary
 626         movd    %mm2, (%edi)
 627         leal    -4(%edi), %edi
 628
 629         subl    $1, %ebx
 630         jne     L(fraction_top)
 631
 632
 633         jmp     L(fraction_done)
 634
 635 EPILOGUE()