mpn/x86/pentium/mmx/rshift.asm

   1 dnl  Intel P5 mpn_rshift -- mpn right shift.
   2
   3 dnl  Copyright 2000, 2002 Free Software Foundation, Inc.
   4 dnl
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or
   8 dnl  modify it under the terms of the GNU Lesser General Public License as
   9 dnl  published by the Free Software Foundation; either version 3 of the
  10 dnl  License, or (at your option) any later version.
  11 dnl
  12 dnl  The GNU MP Library is distributed in the hope that it will be useful,
  13 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 dnl  Lesser General Public License for more details.
  16 dnl
  17 dnl  You should have received a copy of the GNU Lesser General Public License
  18 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
  19
  20 include(`../config.m4')
  21
  22
  23 C P5: 1.75 cycles/limb.
  24
  25
  26 C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
  27 C                       unsigned shift);
  28 C
  29 C Shift src,size right by shift many bits and store the result in dst,size.
  30 C Zeros are shifted in at the left.  Return the bits shifted out at the
  31 C right.
  32 C
  33 C It takes 6 mmx instructions to process 2 limbs, making 1.5 cycles/limb,
  34 C and with a 4 limb loop and 1 cycle of loop overhead the total is 1.75 c/l.
  35 C
  36 C Full speed depends on source and destination being aligned.  Unaligned mmx
  37 C loads and stores on P5 don't pair and have a 2 cycle penalty.  Some hairy
  38 C setups and finish-ups are done to ensure alignment for the loop.
  39 C
  40 C MMX shifts work out a bit faster even for the simple loop.
  41
  42 defframe(PARAM_SHIFT,16)        C shift count in bits, 4th argument of the prototype
  43 defframe(PARAM_SIZE, 12)        C size in limbs, 3rd argument
  44 defframe(PARAM_SRC,  8)         C source pointer, 2nd argument
  45 defframe(PARAM_DST,  4)         C destination pointer, 1st argument
  46 deflit(`FRAME',0)               C redefined to 8 after the two pushl instructions in the prologue
  47 dnl  Sizes below UNROLL_THRESHOLD limbs use the simple code, the rest unroll.
  48 dnl  Minimum 5, because the unrolled loop can't handle less.
  49 deflit(UNROLL_THRESHOLD, 5)
  50
  51         TEXT
  52         ALIGN(8)
  53
  54 PROLOGUE(mpn_rshift)
  55         C Entry: fetch the arguments, then pick a code path by size.
  56         pushl   %ebx                    C ebx and edi are used as scratch and
  57         pushl   %edi                    C are both popped again before every ret
  58 deflit(`FRAME',8)
  59
  60         movl    PARAM_SIZE, %eax        C eax = size, in limbs
  61         movl    PARAM_DST, %edx         C edx = dst
  62
  63         movl    PARAM_SRC, %ebx         C ebx = src
  64         movl    PARAM_SHIFT, %ecx       C ecx = shift, in bits
  65
  66         cmp     $UNROLL_THRESHOLD, %eax
  67         jae     L(unroll)               C unsigned: size >= UNROLL_THRESHOLD
  68
  69         decl    %eax                    C eax = size-1, ZF set only if size was 1
  70         movl    (%ebx), %edi            C src low limb (movl keeps the decl flags)
  71
  72         jnz     L(simple)               C size-1 is non-zero
  73
  74         shrdl(  %cl, %edi, %eax)        C eax was decremented to zero
  75         C so eax now has just the bits shifted out of edi, at its top: the return value
  76         shrl    %cl, %edi               C edi = src[0] >> shift
  77
  78         movl    %edi, (%edx)            C dst low limb
  79         popl    %edi                    C risk of data cache bank clash
  80
  81         popl    %ebx
  82
  83         ret                             C no mmx was used on this path, so no emms
  84
  85
  86 C -----------------------------------------------------------------------------
  87         ALIGN(8)
  88 L(simple):
  89         C eax   size-1, non-zero
  90         C ebx   src
  91         C ecx   shift
  92         C edx   dst
  93         C esi
  94         C edi   src[0], loaded at entry but not read on this path
  95         C ebp
  96 deflit(`FRAME',8)
  97
  98         movd    (%ebx), %mm5            C src[0]
  99         leal    (%ebx,%eax,4), %ebx     C &src[size-1]
 100
 101         movd    %ecx, %mm6              C rshift
 102         leal    -4(%edx,%eax,4), %edx   C &dst[size-2]
 103
 104         psllq   $32, %mm5               C src[0] in the high dword, for the return value
 105         negl    %eax                    C -(size-1), counted up to zero
 106
 107
 108 C This loop is 5 or 8 cycles, with every second load unaligned and a wasted
 109 C cycle waiting for the mm0 result to be ready.  For comparison a shrdl is 4
 110 C cycles and would be 8 in a simple loop.  Using mmx helps the return value
 111 C and last limb calculations too.
 112
 113 L(simple_top):
 114         C eax   counter, limbs, negative
 115         C ebx   &src[size-1]
 116         C ecx   shift (not used in the loop, mm6 has the count)
 117         C edx   &dst[size-2]
 118         C
 119         C mm0   scratch
 120         C mm5   src[0]<<32, becomes the return value after the loop
 121         C mm6   shift
 122
 123         movq    (%ebx,%eax,4), %mm0     C current src limb and the one above it
 124         incl    %eax                    C ZF set once the last pair is loaded
 125
 126         psrlq   %mm6, %mm0              C low dword is the dst limb
 127
 128         movd    %mm0, (%edx,%eax,4)     C eax was already stepped, hence the -4 in edx
 129         jnz     L(simple_top)           C flags are still those from incl
 130
 131
 132         movd    (%ebx), %mm0            C src[size-1], high dword zero
 133         psrlq   %mm6, %mm5              C return value
 134
 135         psrlq   %mm6, %mm0              C zeros come in at the top
 136         popl    %edi
 137
 138         movd    %mm5, %eax
 139         popl    %ebx
 140
 141         movd    %mm0, 4(%edx)           C dst[size-1]
 142
 143         emms                            C leave mmx state before returning
 144
 145         ret
 146
 147
 148 C -----------------------------------------------------------------------------
 149         ALIGN(8)
 150 L(unroll):
 151         C eax   size
 152         C ebx   src
 153         C ecx   shift
 154         C edx   dst
 155         C esi
 156         C edi
 157         C ebp
 158 deflit(`FRAME',8)
 159
 160         movd    (%ebx), %mm5            C src[0]
 161         movl    $4, %edi                C mask for address bit 2 (set means 4mod8)
 162
 163         movd    %ecx, %mm6              C rshift
 164         testl   %edi, %ebx              C ZF set if bit 2 of src is clear
 165
 166         psllq   $32, %mm5               C src[0] in the high dword, for the return value
 167         jz      L(start_src_aligned)
 168
 169
 170         C src isn't aligned, process low limb separately (marked xxx) and
 171         C step src and dst by one limb, making src aligned.
 172         C
 173         C source                  ebx
 174         C --+-------+-------+-------+
 175         C           |          xxx  |
 176         C --+-------+-------+-------+
 177         C         4mod8   0mod8   4mod8
 178         C
 179         C         dest            edx
 180         C         --+-------+-------+
 181         C           |       |  xxx  |
 182         C         --+-------+-------+
 183
 184         movq    (%ebx), %mm0            C unaligned load
 185
 186         psrlq   %mm6, %mm0              C low dword is dst[0]
 187         addl    $4, %ebx                C src is now 0mod8
 188
 189         decl    %eax                    C one limb dealt with
 190
 191         movd    %mm0, (%edx)            C store dst[0]
 192         addl    $4, %edx
 193 L(start_src_aligned):
 194
 195
 196         movq    (%ebx), %mm1            C first aligned src qword
 197         testl   %edi, %edx              C ZF set if bit 2 of dst is clear
 198
 199         psrlq   %mm6, %mm5              C retval
 200         jz      L(start_dst_aligned)
 201
 202         C dst isn't aligned, add 4 to make it so, and pretend the shift is
 203         C 32 bits extra.  Low limb of dst (marked xxx) handled here
 204         C separately.
 205         C
 206         C          source          ebx
 207         C          --+-------+-------+
 208         C            |      mm1      |
 209         C          --+-------+-------+
 210         C                  4mod8   0mod8
 211         C
 212         C  dest                    edx
 213         C  --+-------+-------+-------+
 214         C                    |  xxx  |
 215         C  --+-------+-------+-------+
 216         C          4mod8   0mod8   4mod8
 217
 218         movq    %mm1, %mm0
 219         addl    $32, %ecx               C new shift
 220
 221         psrlq   %mm6, %mm0              C mm6 is still the original shift here
 222
 223         movd    %ecx, %mm6              C mm6 = shift+32 from now on
 224
 225         movd    %mm0, (%edx)            C store the xxx limb
 226         addl    $4, %edx                C dst is now 0mod8, eax is left as it is
 227 L(start_dst_aligned):
 228
 229
 230         movq    8(%ebx), %mm3           C second aligned src qword
 231         negl    %ecx
 232
 233         movq    %mm3, %mm2              C mm2 src qword
 234         addl    $64, %ecx               C ecx = 64 minus the mm6 count
 235
 236         movd    %ecx, %mm7              C lshift
 237         psrlq   %mm6, %mm1              C low part of the first dst qword
 238
 239         leal    -12(%ebx,%eax,4), %ebx  C ebx = src + 4*eax - 12 bytes
 240         leal    -20(%edx,%eax,4), %edx  C edx = dst + 4*eax - 20 bytes
 241
 242         psllq   %mm7, %mm3              C high part, from the second src qword
 243         subl    $7, %eax                C size-7
 244
 245         por     %mm1, %mm3              C mm3 ready to store
 246         negl    %eax                    C -(size-7)
 247
 248         jns     L(finish)               C eax >= 0, ie. size <= 7: skip the loop
 249
 250
 251         C This loop is the important bit, the rest is just support.  Careful
 252         C instruction scheduling achieves the claimed 1.75 c/l.  The
 253         C relevant parts of the pairing rules are:
 254         C
 255         C - mmx loads and stores execute only in the U pipe
 256         C - only one mmx shift in a pair
 257         C - wait one cycle before storing an mmx register result
 258         C - the usual address generation interlock
 259         C
 260         C Two qword calculations are slightly interleaved.  The instructions
 261         C marked "C" belong to the second qword, and the "C prev" one is for
 262         C the second qword from the previous iteration.
 263
 264         ALIGN(8)
 265 L(unroll_loop):
 266         C eax   counter, limbs, negative
 267         C ebx   src + 4*size - 12 bytes, from the leal before the loop
 268         C ecx   left shift count, same value as mm7 (not used in the loop)
 269         C edx   dst + 4*size - 20 bytes, from the leal before the loop
 270         C esi
 271         C edi   4, the alignment mask from the setup (not used in the loop)
 272         C       (src, dst and size above are as left by the alignment steps)
 273         C mm0   scratch
 274         C mm1   scratch
 275         C mm2   src qword from -8(%ebx,%eax,4)
 276         C mm3   dst qword ready to store to -8(%edx,%eax,4)
 277         C
 278         C mm5   return value
 279         C mm6   rshift
 280         C mm7   lshift
 281         C Each pass loads two src qwords and stores two dst qwords, ie. 4 limbs.
 282         movq    (%ebx,%eax,4), %mm0
 283         psrlq   %mm6, %mm2
 284
 285         movq    %mm0, %mm1
 286         psllq   %mm7, %mm0
 287
 288         movq    %mm3, -8(%edx,%eax,4)   C prev
 289         por     %mm2, %mm0
 290
 291         movq    8(%ebx,%eax,4), %mm3    C
 292         psrlq   %mm6, %mm1              C
 293
 294         movq    %mm0, (%edx,%eax,4)
 295         movq    %mm3, %mm2              C
 296
 297         psllq   %mm7, %mm3              C
 298         addl    $4, %eax
 299
 300         por     %mm1, %mm3              C
 301         js      L(unroll_loop)
 302
 303
 304 L(finish):
 305         C eax   0 to 3 representing respectively 3 to 0 limbs remaining
 306
 307         testb   $2, %al                 C bit 1 set means eax is 2 or 3 already
 308
 309         jnz     L(finish_no_two)
 310
 311         movq    (%ebx,%eax,4), %mm0     C one more src qword, handled the same
 312         psrlq   %mm6, %mm2              C way as the first qword of a loop pass
 313
 314         movq    %mm0, %mm1
 315         psllq   %mm7, %mm0
 316
 317         movq    %mm3, -8(%edx,%eax,4)   C prev
 318         por     %mm2, %mm0
 319
 320         movq    %mm1, %mm2              C becomes the src prev qword
 321         movq    %mm0, %mm3              C becomes the dst qword awaiting its store
 322
 323         addl    $2, %eax                C two limbs consumed
 324 L(finish_no_two):
 325
 326
 327         C eax   2 or 3 representing respectively 1 or 0 limbs remaining
 328         C
 329         C mm2   src prev qword, from -8(%ebx,%eax,4)
 330         C mm3   dst qword, for -8(%edx,%eax,4)
 331
 332         testb   $1, %al                 C bit 0 set means eax is 3, no limbs left
 333         popl    %edi                    C popl and movd leave the testb flags alone
 334
 335         movd    %mm5, %eax      C retval
 336         jnz     L(finish_zero)
 337
 338
 339         C One extra limb, destination was aligned.
 340         C
 341         C source                ebx
 342         C +-------+---------------+--
 343         C |       |      mm2      |
 344         C +-------+---------------+--
 345         C
 346         C dest                                  edx
 347         C +-------+---------------+---------------+--
 348         C |       |               |      mm3      |
 349         C +-------+---------------+---------------+--
 350         C
 351         C mm6 = shift
 352         C mm7 = ecx = 64-shift
 353
 354
 355         C One extra limb, destination was unaligned.
 356         C
 357         C source                ebx
 358         C +-------+---------------+--
 359         C |       |      mm2      |
 360         C +-------+---------------+--
 361         C
 362         C dest                          edx
 363         C +---------------+---------------+--
 364         C |               |      mm3      |
 365         C +---------------+---------------+--
 366         C
 367         C mm6 = shift+32
 368         C mm7 = ecx = 64-(shift+32)
 369
 370
 371         C In both cases there's one extra limb of src to fetch and combine
 372         C with mm2 to make a qword at 8(%edx), and in the aligned case
 373         C there's a further extra limb of dst to be formed.
 374
 375
 376         movd    8(%ebx), %mm0           C the extra src limb, high dword zero
 377         psrlq   %mm6, %mm2
 378
 379         movq    %mm0, %mm1
 380         psllq   %mm7, %mm0
 381
 382         movq    %mm3, (%edx)            C the pending dst qword
 383         por     %mm2, %mm0              C qword for 8(%edx)
 384
 385         psrlq   %mm6, %mm1              C top limb, wanted only if dst was aligned
 386         andl    $32, %ecx               C test bit 5 of the left shift count
 387         C ecx is 64-shift if dst was aligned, else 64-(shift+32); bit 5 tells them apart
 388         popl    %ebx                    C popl leaves the andl flags alone
 389         jz      L(finish_one_unaligned)
 390         C NOTE(review): the bit 5 test presumes shift is 1 to 31 -- confirm with callers
 391         C dst was aligned, must store one extra limb
 392         movd    %mm1, 16(%edx)
 393 L(finish_one_unaligned):
 394
 395         movq    %mm0, 8(%edx)
 396
 397         emms                            C leave mmx state before returning
 398
 399         ret
 400
 401
 402 L(finish_zero):
 403
 404         C No extra limbs, destination was aligned.
 405         C
 406         C source        ebx
 407         C +---------------+--
 408         C |      mm2      |
 409         C +---------------+--
 410         C
 411         C dest                        edx+4
 412         C +---------------+---------------+--
 413         C |               |      mm3      |
 414         C +---------------+---------------+--
 415         C
 416         C mm6 = shift
 417         C mm7 = ecx = 64-shift
 418
 419
 420         C No extra limbs, destination was unaligned.
 421         C
 422         C source        ebx
 423         C +---------------+--
 424         C |      mm2      |
 425         C +---------------+--
 426         C
 427         C dest                edx+4
 428         C +-------+---------------+--
 429         C |       |      mm3      |
 430         C +-------+---------------+--
 431         C
 432         C mm6 = shift+32
 433         C mm7 = 64-(shift+32)
 434
 435
 436         C The movd for the unaligned case is clearly the same data as the
 437         C movq for the aligned case, it's just a choice between whether one
 438         C or two limbs should be written.
 439
 440
 441         movq    %mm3, 4(%edx)           C the pending dst qword
 442         psrlq   %mm6, %mm2              C top of the result, zeros shifted in
 443
 444         movd    %mm2, 12(%edx)          C one limb, all the unaligned case needs
 445         andl    $32, %ecx               C bit 5 of the left shift count: set if dst was aligned
 446
 447         popl    %ebx                    C popl leaves the andl flags alone
 448         jz      L(finish_zero_unaligned)
 449         C NOTE(review): the bit 5 test presumes shift is 1 to 31 -- confirm with callers
 450         movq    %mm2, 12(%edx)          C aligned dst: two limbs, overwriting the movd
 451 L(finish_zero_unaligned):
 452
 453         emms                            C leave mmx state before returning
 454
 455         ret
 456
 457 EPILOGUE()