mpn/x86/k7/mmx/rshift.asm

   1 dnl  AMD K7 mpn_rshift -- mpn right shift.
   2
   3 dnl  Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
   4 dnl
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or
   8 dnl  modify it under the terms of the GNU Lesser General Public License as
   9 dnl  published by the Free Software Foundation; either version 3 of the
  10 dnl  License, or (at your option) any later version.
  11 dnl
  12 dnl  The GNU MP Library is distributed in the hope that it will be useful,
  13 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 dnl  Lesser General Public License for more details.
  16 dnl
  17 dnl  You should have received a copy of the GNU Lesser General Public License
  18 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
  19
  20 include(`../config.m4')
  21
  22
  23 C K7: 1.21 cycles/limb (at 16 limbs/loop).
  24
  25
  26
  27 dnl  K7: UNROLL_COUNT cycles/limb
  28 dnl           4           1.51
  29 dnl           8           1.26
  30 dnl          16           1.21
  31 dnl          32           1.2
  32 dnl  Maximum possible with the current code is 64.
     dnl
     dnl  UNROLL_COUNT is the number of limbs processed by one full pass of the
     dnl  unrolled loop at L(top): the loop body is generated as
     dnl  UNROLL_COUNT/CHUNK_COUNT chunks of CHUNK_COUNT limbs each.
     dnl
     dnl  UNROLL_MASK, UNROLL_LOG2 and UNROLL_BYTES are used further down but are
     dnl  not defined in this file; presumably the included m4 definitions derive
     dnl  them from UNROLL_COUNT (TODO confirm).
  33
  34 deflit(UNROLL_COUNT, 16)
  35
  36
  37 C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
  38 C                       unsigned shift);
  39 C
  40 C Shift src,size right by shift many bits and store the result in dst,size.
  41 C Zeros are shifted in at the left.  The bits shifted out at the right are
  42 C returned in the high bits of the return value, with zeros below them.
  43 C
  44 C This code uses 64-bit MMX operations, which makes it possible to handle
  45 C two limbs at a time, for a theoretical 1.0 cycles/limb.  Plain integer
  46 C code, on the other hand, suffers from shrd being a vector path decode and
  47 C running at 3 cycles back-to-back.
  48 C
  49 C Full speed depends on source and destination being aligned, and some hairy
  50 C setups and finish-ups are done to arrange this for the loop.
  51
     dnl  UNROLL_THRESHOLD is the smallest size, in limbs, that is sent to the
     dnl  unrolled code at L(unroll).  Sizes of two or more limbs below it use
     dnl  the simple loop at L(simple_top).  The PIC and non-PIC builds are set
     dnl  separately here but currently use the same value.
  52 ifdef(`PIC',`
  53 deflit(UNROLL_THRESHOLD, 10)
  54 ',`
  55 deflit(UNROLL_THRESHOLD, 10)
  56 ')
  57
     C Stack frame names.  PARAM_ are the four arguments of mpn_rshift and
     C SAVE_ are three 4-byte slots used to preserve edi, esi and ebx.
     C SAVE_SIZE is the number of bytes the entry code subtracts from esp to
     C make room for the SAVE_ slots.
     C
     C defframe is a macro defined outside this file; presumably its offsets are
     C relative to esp at function entry and are adjusted by the current FRAME
     C value (TODO confirm).
  58 defframe(PARAM_SHIFT,16)
  59 defframe(PARAM_SIZE, 12)
  60 defframe(PARAM_SRC,  8)
  61 defframe(PARAM_DST,  4)
  62
  63 defframe(SAVE_EDI, -4)
  64 defframe(SAVE_ESI, -8)
  65 defframe(SAVE_EBX, -12)
  66 deflit(SAVE_SIZE, 12)
  67
  68         TEXT
  69         ALIGN(32)
  70
  71 PROLOGUE(mpn_rshift)
     C Entry point.  Loads the four arguments, reserves SAVE_SIZE bytes of
     C stack for register saves, and completes a one limb operand here using
     C integer instructions only.  Sizes of two or more limbs branch to
     C L(more_than_one_limb).
     C
     C On return eax holds the bits shifted out of the low src limb, placed
     C at the top of the limb.
     C
     C FRAME is redefined after esp is moved, presumably so that the PARAM_
     C and SAVE_ offsets keep referring to the same stack locations (TODO
     C confirm against the defframe definition).
  72 deflit(`FRAME',0)
  73
  74         movl    PARAM_SIZE, %eax
  75         movl    PARAM_SRC, %edx
  76         subl    $SAVE_SIZE, %esp
  77 deflit(`FRAME',SAVE_SIZE)
  78
  79         movl    PARAM_SHIFT, %ecx
  80         movl    %edi, SAVE_EDI
  81
             C eax becomes size-1; the jnz below tests the flags from the decl.
  82         movl    PARAM_DST, %edi
  83         decl    %eax
  84         jnz     L(more_than_one_limb)
  85
             C One limb only.  eax is zero here, so the double shift leaves just
             C the low shift-many bits of the src limb at the top of eax, which is
             C the return value.  shrdl is a macro defined outside this file,
             C presumably emitting a plain shrd (TODO confirm).
  86         movl    (%edx), %edx            C src limb
  87
  88         shrdl(  %cl, %edx, %eax)        C eax was decremented to zero
  89
  90         shrl    %cl, %edx
  91
  92         movl    %edx, (%edi)            C dst limb
  93         movl    SAVE_EDI, %edi
  94         addl    $SAVE_SIZE, %esp
  95
             C No emms here, no MMX register has been touched on this path.
  96         ret
  97
  98
  99 C -----------------------------------------------------------------------------
     C Two or more limbs.  Sizes below UNROLL_THRESHOLD are handled by the simple
     C MMX loop here, one dst limb per pass; larger sizes go to L(unroll).
 100 L(more_than_one_limb):
 101         C eax   size-1
 102         C ebx
 103         C ecx   shift
 104         C edx   src
 105         C esi
 106         C edi   dst
 107         C ebp
 108
             C The compare is unsigned: size-1 >= UNROLL_THRESHOLD-1 takes the
             C unrolled code.  mm5 keeps the low src limb for the return value.
 109         movd    PARAM_SHIFT, %mm6       C rshift
 110         movd    (%edx), %mm5            C src low limb
 111         cmp     $UNROLL_THRESHOLD-1, %eax
 112
 113         jae     L(unroll)
 114         leal    (%edx,%eax,4), %edx     C &src[size-1]
 115         leal    -4(%edi,%eax,4), %edi   C &dst[size-2]
 116
             C eax is negated to -(size-1) so it can count up to zero while
             C indexing back from the end pointers in edx and edi.
 117         movd    (%edx), %mm4            C src high limb
 118         negl    %eax
 119
 120
 121 L(simple_top):
 122         C eax   loop counter, limbs, negative
 123         C ebx
 124         C ecx   shift
 125         C edx   &src[size-1]
 126         C esi
 127         C edi   &dst[size-2]
 128         C ebp
 129         C
 130         C mm0   scratch
 131         C mm4   src high limb
 132         C mm5   src low limb
 133         C mm6   shift
 134
             C Each pass loads two adjacent src limbs as one qword, shifts the
             C qword right, and stores its low 32 bits as the dst limb for the
             C lower of the two.  The store uses eax after the increment, which
             C is why edi is based one limb below edx.  The jnz tests the flags
             C from the incl; the MMX instructions in between leave them alone.
             C size-1 limbs are stored by the loop.
 135         movq    (%edx,%eax,4), %mm0
 136         incl    %eax
 137
 138         psrlq   %mm6, %mm0
 139
 140         movd    %mm0, (%edi,%eax,4)
 141         jnz     L(simple_top)
 142
 143
             C Finish: the high src limb shifted alone gives the top dst limb.
             C The low src limb is moved to the high half of mm5 and shifted
             C right, leaving the bits shifted out in the low 32 bits of mm5.
 144         psllq   $32, %mm5
 145         psrlq   %mm6, %mm4
 146
 147         psrlq   %mm6, %mm5
 148         movd    %mm4, 4(%edi)           C dst high limb
 149
 150         movd    %mm5, %eax              C return value
 151
 152         movl    SAVE_EDI, %edi
 153         addl    $SAVE_SIZE, %esp
 154         emms
 155
 156         ret
 157
 158
 159 C -----------------------------------------------------------------------------
     C Setup for the unrolled loop.  In order this code:
     C   - saves esi and ebx,
     C   - if src is 4 mod 8, produces the low dst limb separately and steps
     C     src and dst by one limb,
     C   - if dst is then 4 mod 8, produces one more dst limb separately, steps
     C     dst only, and adds 32 to the shift count to compensate,
     C   - calculates the loop count and a computed jump into the unrolled
     C     code, and biases edx and edi to match the jump target.
     C The alignment tests look at bit 2 of the pointer, so they assume src
     C and dst are at least 4-byte aligned (TODO confirm with callers).
 160         ALIGN(16)
 161 L(unroll):
 162         C eax   size-1
 163         C ebx
 164         C ecx   shift
 165         C edx   src
 166         C esi
 167         C edi   dst
 168         C ebp
 169         C
 170         C mm5   src low limb
 171         C mm6   rshift
 172
             C The jz below uses the flags from the testb; the movl and psllq
             C in between do not change them.  mm5 gets the low src limb in its
             C high half, ready to be shifted right for the return value.
 173         testb   $4, %dl
 174         movl    %esi, SAVE_ESI
 175         movl    %ebx, SAVE_EBX
 176
 177         psllq   $32, %mm5
 178         jz      L(start_src_aligned)
 179
 180
 181         C src isn't aligned, process low limb separately (marked xxx) and
 182         C step src and dst by one limb, making src aligned.
 183         C
 184         C source                  edx
 185         C --+-------+-------+-------+
 186         C           |          xxx  |
 187         C --+-------+-------+-------+
 188         C         4mod8   0mod8   4mod8
 189         C
 190         C         dest            edi
 191         C         --+-------+-------+
 192         C           |       |  xxx  |
 193         C         --+-------+-------+
 194
             C PARAM_SIZE is overwritten with size-1 so that the parity test
             C after the loop sees the count remaining once this limb is done.
             C The stepped dst is also stored to PARAM_DST, though nothing in
             C this file reads PARAM_DST again.
 195         movq    (%edx), %mm0            C src low two limbs
 196         addl    $4, %edx
 197         movl    %eax, PARAM_SIZE        C size-1
 198
 199         addl    $4, %edi
 200         decl    %eax                    C size-2 is new size-1
 201
 202         psrlq   %mm6, %mm0
 203         movl    %edi, PARAM_DST         C new dst
 204
 205         movd    %mm0, -4(%edi)
 206 L(start_src_aligned):
 207
 208
             C mm1 is the first carry qword for the loop.  mm5 is shifted right
             C here, leaving the return value in its low 32 bits.  The jz uses
             C the flags from the testl on dst.
 209         movq    (%edx), %mm1            C src low two limbs
 210         decl    %eax                    C size-2, two last limbs handled at end
 211         testl   $4, %edi
 212
 213         psrlq   %mm6, %mm5
 214         jz      L(start_dst_aligned)
 215
 216
 217         C dst isn't aligned, add 4 to make it so, and pretend the shift is
 218         C 32 bits extra.  Low limb of dst (marked xxx) handled here separately.
 219         C
 220         C          source          edx
 221         C          --+-------+-------+
 222         C            |      mm1      |
 223         C          --+-------+-------+
 224         C                  4mod8   0mod8
 225         C
 226         C  dest                    edi
 227         C  --+-------+-------+-------+
 228         C                    |  xxx  |
 229         C  --+-------+-------+-------+
 230         C          4mod8   0mod8   4mod8
 231
             C mm0 keeps an unshifted copy of mm1 so that mm1 can be restored
             C after its shifted low half has been stored.  From here on both
             C ecx and mm6 hold shift+32.
 232         movq    %mm1, %mm0
 233         psrlq   %mm6, %mm1
 234         addl    $32, %ecx               C shift+32
 235
 236         movd    %mm1, (%edi)
 237         movq    %mm0, %mm1
 238         addl    $4, %edi                C new dst
 239
 240         movd    %ecx, %mm6
 241 L(start_dst_aligned):
 242
 243
             C ecx is turned into 64-rshift, the left shift count.
             C eax is rounded down to an even limb count n.  ebx becomes
             C (n-1) >> UNROLL_LOG2, the loop counter.  eax becomes
             C 2*((-n) & UNROLL_MASK); presumably (-n) & UNROLL_MASK is the
             C number of limbs of the unrolled code to skip on the first pass
             C (TODO confirm the UNROLL_MASK definition).
 244         movq    %mm1, %mm2              C copy of src low two limbs
 245         negl    %ecx
 246         andl    $-2, %eax               C round size down to even
 247
 248         movl    %eax, %ebx
 249         negl    %eax
 250         addl    $64, %ecx
 251
 252         andl    $UNROLL_MASK, %eax
 253         decl    %ebx
 254
 255         shll    %eax
 256
 257         movd    %ecx, %mm7              C lshift = 64-rshift
 258
     C esi = L(entry) + 5*eax, which is 10 code bytes for each limb skipped.
     C The PIC form calculates the same address in L(pic_calc) from the return
     C address pushed by the call.  Both forms negate eax afterwards.
 259 ifdef(`PIC',`
 260         call    L(pic_calc)
 261 L(here):
 262 ',`
 263         leal    L(entry) (%eax,%eax,4), %esi
 264         negl    %eax
 265 ')
 266         shrl    $UNROLL_LOG2, %ebx      C loop counter
 267
             C eax is now minus twice the skipped limb count, so eax*2 backs
             C edx and edi up by 4 bytes per skipped limb, matching the
             C displacements at the jump target.  The extra 8 on edx steps over
             C the two src limbs already held in mm1 and mm2.  When UNROLL_BYTES
             C is 256 both pointers are biased by a further 128, offset by the
             C -128 in the loop displacements.
 268         leal    ifelse(UNROLL_BYTES,256,128+) 8(%edx,%eax,2), %edx
 269         leal    ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi
 270         movl    PARAM_SIZE, %eax        C for use at end
 271
 272         jmp     *%esi
 273
 274
     C PIC helper, called from the unroll setup with eax = twice the number of
     C limbs to skip.  Returns esi = address of L(entry) plus 5*eax, formed
     C from the return address on the stack (the address of L(here)) plus the
     C assemble-time difference between L(entry) and L(here).  eax is negated
     C before returning, as in the non-PIC setup.  ret_internal is a macro
     C defined outside this file.
 275 ifdef(`PIC',`
 276 L(pic_calc):
 277         C See mpn/x86/README about old gas bugs
 278         leal    (%eax,%eax,4), %esi
 279         addl    $L(entry)-L(here), %esi
 280         addl    (%esp), %esi
 281         negl    %eax
 282
 283         ret_internal
 284 ')
 285
 286
 287 C -----------------------------------------------------------------------------
     C Unrolled main loop.  Each group of six instructions loads a src qword,
     C shifts the carry qword right by rshift, shifts the new qword left by
     C lshift, ORs the two together and stores the result qword to dst.  The
     C unshifted copy of the new qword becomes the carry for the next group.
     C
     C The loop is entered by a computed jump to L(entry) plus 10 bytes for
     C each limb to be skipped on the first pass, so the instructions in the
     C generated body must keep their exact encoded sizes.  Zdisp is a macro
     C defined outside this file; presumably it forces a displacement byte to
     C be emitted even when the displacement is zero, keeping every group the
     C same size (TODO confirm).
 288         ALIGN(64)
 289 L(top):
 290         C eax   size, for use at end
     C       (size-1 if src was unaligned; only the low bit is tested at the end)
 291         C ebx   loop counter
 292         C ecx   lshift
 293         C edx   src
 294         C esi   was computed jump
 295         C edi   dst
 296         C ebp
 297         C
 298         C mm0   scratch
 299         C mm1   \ carry (alternating)
 300         C mm2   /
 301         C mm6   rshift
 302         C mm7   lshift
 303         C
 304         C 10 code bytes/limb
 305         C
 306         C The two chunks differ in whether mm1 or mm2 hold the carry.
 307         C The computed jump puts the initial carry in both mm1 and mm2.
 308
 309 L(entry):
 310 deflit(CHUNK_COUNT, 4)
 311 forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
 312         deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
 313         deflit(`disp1', eval(disp0 + 8))
 314
 315 Zdisp(  movq,   disp0,(%edx), %mm0)
 316         psrlq   %mm6, %mm2
 317
 318         movq    %mm0, %mm1
 319         psllq   %mm7, %mm0
 320
 321         por     %mm2, %mm0
 322 Zdisp(  movq,   %mm0, disp0,(%edi))
 323
 324
 325 Zdisp(  movq,   disp1,(%edx), %mm0)
 326         psrlq   %mm6, %mm1
 327
 328         movq    %mm0, %mm2
 329         psllq   %mm7, %mm0
 330
 331         por     %mm1, %mm0
 332 Zdisp(  movq,   %mm0, disp1,(%edi))
 333 ')
 334
             C Advance both pointers by one full pass.  ebx counts down and the
             C loop continues while it is still non-negative after the decl.
 335         addl    $UNROLL_BYTES, %edx
 336         addl    $UNROLL_BYTES, %edi
 337         decl    %ebx
 338
 339         jns     L(top)
 340
 341
 342 deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
 343 deflit(`disp1', eval(disp0-0 + 8))
 344
     C End of the unrolled code.  disp0 and disp1 are reset above to the
     C displacements of the first two qwords relative to edx and edi.
     C
     C al is the low byte of the value loaded from PARAM_SIZE before the loop,
     C so the testb picks between the odd and even finishes.  The jz further
     C down uses the flags from this testb; the psrlq, movl and movd in between
     C do not change them.  esi and ebx are restored here and eax gets the
     C return value prepared in mm5 during setup.
 345         testb   $1, %al
 346         psrlq   %mm6, %mm2      C wanted rshifted in all cases below
 347         movl    SAVE_ESI, %esi
 348
 349         movd    %mm5, %eax              C return value
 350
 351         movl    SAVE_EBX, %ebx
 352         jz      L(end_even)
 353
 354
 355         C Size odd, destination was aligned.
 356         C
 357         C source
 358         C       edx
 359         C +-------+---------------+--
 360         C |       |      mm2      |
 361         C +-------+---------------+--
 362         C
 363         C dest                  edi
 364         C +-------+---------------+---------------+--
 365         C |       |               |    written    |
 366         C +-------+---------------+---------------+--
 367         C
 368         C mm6 = shift
 369         C mm7 = ecx = 64-shift
 370
 371
 372         C Size odd, destination was unaligned.
 373         C
 374         C source
 375         C       edx
 376         C +-------+---------------+--
 377         C |       |      mm2      |
 378         C +-------+---------------+--
 379         C
 380         C dest          edi
 381         C +---------------+---------------+--
 382         C |               |    written    |
 383         C +---------------+---------------+--
 384         C
 385         C mm6 = shift+32
 386         C mm7 = ecx = 64-(shift+32)
 387
 388
 389         C In both cases there's one extra limb of src to fetch and combine
 390         C with mm2 to make a qword to store, and in the aligned case there's
 391         C a further extra limb of dst to be formed.
 392
 393
             C cl holds lshift.  Bit 5 of it is set when lshift is 64-shift and
             C clear when it is 64-(shift+32), for a shift count in the range
             C 1 to 31 (assumed; TODO confirm the permitted range).  So the jz
             C is taken in the unaligned case and skips the extra dst limb.
             C mm1 is the extra src limb shifted right by rshift, which becomes
             C that extra limb in the aligned case.
 394         movd    disp0(%edx), %mm0
 395         movq    %mm0, %mm1
 396
 397         psllq   %mm7, %mm0
 398         testb   $32, %cl
 399
 400         por     %mm2, %mm0
 401         psrlq   %mm6, %mm1
 402
 403         movq    %mm0, disp0(%edi)
 404         jz      L(finish_odd_unaligned)
 405
 406         movd    %mm1, disp1(%edi)
 407 L(finish_odd_unaligned):
 408
             C Common exit: restore edi, release the save area, leave MMX state.
 409         movl    SAVE_EDI, %edi
 410         addl    $SAVE_SIZE, %esp
 411         emms
 412
 413         ret
 414
 415
 416 L(end_even):
     C Even finish.  No more src limbs need fetching; the carry qword in mm2,
     C already shifted right by rshift before the branch here, supplies the
     C final one or two dst limbs.  eax already holds the return value and
     C esi and ebx have already been restored.
     C
     C Note the diagrams below label the destination qword mm3, but the code
     C stores from mm2.
 417
 418         C Size even, destination was aligned.
 419         C
 420         C source
 421         C +---------------+--
 422         C |      mm2      |
 423         C +---------------+--
 424         C
 425         C dest          edi
 426         C +---------------+---------------+--
 427         C |               |      mm3      |
 428         C +---------------+---------------+--
 429         C
 430         C mm6 = shift
 431         C mm7 = ecx = 64-shift
 432
 433
 434         C Size even, destination was unaligned.
 435         C
 436         C source
 437         C +---------------+--
 438         C |      mm2      |
 439         C +---------------+--
 440         C
 441         C dest  edi
 442         C +-------+---------------+--
 443         C |       |      mm3      |
 444         C +-------+---------------+--
 445         C
 446         C mm6 = shift+32
 447         C mm7 = 64-(shift+32)
 448
 449
 450         C The movd for the unaligned case is the same data as the movq for
 451         C the aligned case, it's just a choice between whether one or two
 452         C limbs should be written.
 453
 454
             C cl holds lshift; bit 5 is clear in the unaligned case (for a shift
             C count of 1 to 31, assumed), where the one limb movd is all that is
             C wanted.  In the aligned case the movq then rewrites that limb along
             C with the one above it.  The jz uses the flags from the testb.
 455         testb   $32, %cl
 456         movd    %mm2, disp0(%edi)
 457
 458         jz      L(end_even_unaligned)
 459
 460         movq    %mm2, disp0(%edi)
 461 L(end_even_unaligned):
 462
             C Common exit: restore edi, release the save area, leave MMX state.
 463         movl    SAVE_EDI, %edi
 464         addl    $SAVE_SIZE, %esp
 465         emms
 466
 467         ret
 468
 469 EPILOGUE()