mpn/x86/pentium/mmx/lshift.asm

   1 dnl  Intel P5 mpn_lshift -- mpn left shift.
   2
   3 dnl  Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
   4 dnl
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or
   8 dnl  modify it under the terms of the GNU Lesser General Public License as
   9 dnl  published by the Free Software Foundation; either version 3 of the
  10 dnl  License, or (at your option) any later version.
  11 dnl
  12 dnl  The GNU MP Library is distributed in the hope that it will be useful,
  13 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 dnl  Lesser General Public License for more details.
  16 dnl
  17 dnl  You should have received a copy of the GNU Lesser General Public License
  18 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
  19
  20 include(`../config.m4')
  21
  22
  23 C P5: 1.75 cycles/limb.
  24
  25
  26 C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
  27 C                       unsigned shift);
  28 C
  29 C Shift src,size left by shift many bits and store the result in dst,size.
  30 C Zeros are shifted in at the right.  Return the bits shifted out at the
  31 C left.
  32 C
  33 C The comments in mpn_rshift apply here too.
  34
  35 defframe(PARAM_SHIFT,16)     C shift: 4th parameter in the prototype above
  36 defframe(PARAM_SIZE, 12)     C size:  3rd parameter
  37 defframe(PARAM_SRC,  8)      C src:   2nd parameter
  38 defframe(PARAM_DST,  4)      C dst:   1st parameter
  39 deflit(`FRAME',0)            C FRAME is 0 here; it is redefined to 8 after the two pushl's below
  40
  41 dnl  sizes >= this go to L(unroll); minimum 5, because the unrolled loop can't handle less
  42 deflit(UNROLL_THRESHOLD, 5)
  43
  44         TEXT
  45         ALIGN(8)
  46
  47 PROLOGUE(mpn_lshift)
  48         C Entry: save ebx/edi, load the four parameters, dispatch on size.
  49         pushl   %ebx
  50         pushl   %edi
  51 deflit(`FRAME',8)
  52         C FRAME is 8 from here on, matching the two pushes above.
  53         movl    PARAM_SIZE, %eax
  54         movl    PARAM_DST, %edx
  55
  56         movl    PARAM_SRC, %ebx
  57         movl    PARAM_SHIFT, %ecx
  58
  59         cmp     $UNROLL_THRESHOLD, %eax
  60         jae     L(unroll)               C size >= UNROLL_THRESHOLD (unsigned compare)
  61
  62         movl    -4(%ebx,%eax,4), %edi   C src high limb
  63         decl    %eax                    C eax = size-1; ZF from this feeds the jnz
  64
  65         jnz     L(simple)               C size-1 nonzero: more than one limb
  66         C Single limb: done here with integer shifts only, no MMX used.
  67         shldl(  %cl, %edi, %eax)        C eax was decremented to zero; it receives the bits shifted out of edi (return value)
  68
  69         shll    %cl, %edi
  70
  71         movl    %edi, (%edx)            C dst low limb
  72         popl    %edi                    C risk of data cache bank clash
  73
  74         popl    %ebx
  75
  76         ret
  77
  78
  79 C -----------------------------------------------------------------------------
  80 L(simple):
  81         C eax   size-1, nonzero (the jnz at entry got us here)
  82         C ebx   src
  83         C ecx   shift
  84         C edx   dst
  85         C esi
  86         C edi
  87         C ebp
  88 deflit(`FRAME',8)
  89         C Sizes below UNROLL_THRESHOLD with more than one limb: set up mm5/mm6/mm7 for L(simple_top).
  90         movd    (%ebx,%eax,4), %mm5     C src high limb
  91
  92         movd    %ecx, %mm6              C lshift
  93         negl    %ecx
  94
  95         psllq   %mm6, %mm5              C bits shifted out of the high limb move into the high dword
  96         addl    $32, %ecx               C ecx = 32-shift
  97
  98         movd    %ecx, %mm7              C mm7 = 32-shift, the right shift used in the loop
  99         psrlq   $32, %mm5               C retval
 100
 101
 102 L(simple_top):
 103         C eax   counter, limbs, counts down from size-1 to 1
 104         C ebx   src
 105         C ecx
 106         C edx   dst
 107         C esi
 108         C edi
 109         C
 110         C mm0   scratch
 111         C mm5   return value
 112         C mm6   shift
 113         C mm7   32-shift
 114         C Each pass forms one dst limb from two adjacent src limbs.
 115         movq    -4(%ebx,%eax,4), %mm0   C src limbs eax-1 (low dword) and eax (high dword)
 116         decl    %eax                    C sets ZF for the jnz below; the MMX ops in between leave flags alone
 117
 118         psrlq   %mm7, %mm0              C low dword = (high limb << shift) | (low limb >> (32-shift))
 119
 120         C
 121
 122         movd    %mm0, 4(%edx,%eax,4)    C dst limb at the index eax had before the decl
 123         jnz     L(simple_top)
 124
 125         C Lowest limb has no neighbour below it: shift it left on its own.
 126         movd    (%ebx), %mm0
 127
 128         movd    %mm5, %eax              C return value
 129         psllq   %mm6, %mm0
 130
 131         popl    %edi
 132         popl    %ebx
 133
 134         movd    %mm0, (%edx)            C dst low limb (low dword of the shifted value)
 135
 136         emms
 137
 138         ret
 139
 140
 141 C -----------------------------------------------------------------------------
 142         ALIGN(8)
 143 L(unroll):
 144         C eax   size
 145         C ebx   src
 146         C ecx   shift
 147         C edx   dst
 148         C esi
 149         C edi
 150         C ebp
 151 deflit(`FRAME',8)
 152         C size >= UNROLL_THRESHOLD: start the return value in mm5 and test the alignment of the end of src.
 153         movd    -4(%ebx,%eax,4), %mm5   C src high limb
 154         leal    (%ebx,%eax,4), %edi     C address just past the src high limb
 155
 156         movd    %ecx, %mm6              C lshift
 157         andl    $4, %edi                C test bit 2 of that address; zero means 0 mod 8 (assuming src is limb aligned)
 158
 159         psllq   %mm6, %mm5
 160         jz      L(start_src_aligned)    C flags are from the andl
 161
 162
 163         C src isn't aligned, process high limb separately (marked xxx) to
 164         C make it so.
 165         C
 166         C  source     -8(ebx,%eax,4)
 167         C                  |
 168         C  +-------+-------+-------+--
 169         C  |               |
 170         C  +-------+-------+-------+--
 171         C        0mod8   4mod8   0mod8
 172         C
 173         C  dest
 174         C     -4(edx,%eax,4)
 175         C          |
 176         C  +-------+-------+--
 177         C  |  xxx  |       |
 178         C  +-------+-------+--
 179
 180         movq    -8(%ebx,%eax,4), %mm0   C unaligned load of the top two src limbs
 181         C Src-unaligned fixup: produce the high dst limb alone so the remaining src ends on a qword boundary.
 182         psllq   %mm6, %mm0
 183         decl    %eax                    C one limb fewer remaining
 184
 185         psrlq   $32, %mm0               C bring the shifted high dword down to the low dword
 186
 187         C
 188
 189         movd    %mm0, (%edx,%eax,4)     C store the dst high limb (eax already decremented)
 190 L(start_src_aligned):
 191         C Fetch the top two src qwords and test the alignment of the end of dst.
 192         movq    -8(%ebx,%eax,4), %mm1   C src high qword
 193         leal    (%edx,%eax,4), %edi     C address just past the remaining dst limbs
 194
 195         andl    $4, %edi                C zero means that address is 0 mod 8 (assuming dst is limb aligned)
 196         psrlq   $32, %mm5               C return value
 197
 198         movq    -16(%ebx,%eax,4), %mm3  C src second highest qword
 199         jz      L(start_dst_aligned)    C flags are from the andl
 200
 201         C dst isn't aligned, subtract 4 to make it so, and pretend the shift
 202         C is 32 bits extra.  High limb of dst (marked xxx) handled here
 203         C separately.
 204         C
 205         C  source     -8(ebx,%eax,4)
 206         C                  |
 207         C  +-------+-------+--
 208         C  |      mm1      |
 209         C  +-------+-------+--
 210         C                0mod8   4mod8
 211         C
 212         C  dest
 213         C     -4(edx,%eax,4)
 214         C          |
 215         C  +-------+-------+-------+--
 216         C  |  xxx  |               |
 217         C  +-------+-------+-------+--
 218         C        0mod8   4mod8   0mod8
 219
 220         movq    %mm1, %mm0              C copy of the src high qword for the separate high dst limb
 221         addl    $32, %ecx               C new shift
 222
 223         psllq   %mm6, %mm0              C mm6 still holds the original shift here
 224
 225         movd    %ecx, %mm6              C mm6 = shift+32 from now on
 226         psrlq   $32, %mm0
 227
 228         C wasted cycle here waiting for %mm0
 229
 230         movd    %mm0, -4(%edx,%eax,4)   C store the dst high limb (marked xxx in the diagram)
 231         subl    $4, %edx                C qword stores below now land on 8-byte boundaries
 232 L(start_dst_aligned):
 233         C Build the first dst qword in mm3 from the top two src qwords.
 234         C mm2 keeps an unshifted copy of the second highest src qword.
 235         psllq   %mm6, %mm1
 236         negl    %ecx                    C -shift
 237
 238         addl    $64, %ecx               C 64-shift
 239         movq    %mm3, %mm2
 240
 241         movd    %ecx, %mm7              C mm7 = right shift paired with mm6
 242         subl    $8, %eax                C size-8
 243
 244         psrlq   %mm7, %mm3
 245
 246         por     %mm1, %mm3              C mm3 ready to store
 247         jc      L(finish)               C carry from the subl: fewer than 8 limbs were remaining
 248
 249
 250         C The comments in mpn_rshift apply here too.
 251
 252         ALIGN(8)
 253 L(unroll_loop):
 254         C eax   counter, limbs
 255         C ebx   src
 256         C ecx
 257         C edx   dst
 258         C esi
 259         C edi
 260         C
 261         C mm0
 262         C mm1
 263         C mm2   src qword from 16(%ebx,%eax,4)
 264         C mm3   dst qword ready to store to 24(%edx,%eax,4)
 265         C
 266         C mm5   return value
 267         C mm6   lshift
 268         C mm7   rshift
 269         C Main loop: two qword stores (four limbs) per iteration.
 270         movq    8(%ebx,%eax,4), %mm0    C src qword just below the one held in mm2
 271         psllq   %mm6, %mm2
 272
 273         movq    %mm0, %mm1              C unshifted copy, left shifted further down
 274         psrlq   %mm7, %mm0              C bits that carry up into the dst qword above
 275
 276         movq    %mm3, 24(%edx,%eax,4)   C prev
 277         por     %mm2, %mm0              C dst qword for 16(%edx,%eax,4)
 278
 279         movq    (%ebx,%eax,4), %mm3     C
 280         psllq   %mm6, %mm1              C
 281
 282         movq    %mm0, 16(%edx,%eax,4)
 283         movq    %mm3, %mm2              C
 284
 285         psrlq   %mm7, %mm3              C
 286         subl    $4, %eax                C four limbs consumed; sets carry once the counter drops below zero
 287
 288         por     %mm1, %mm3              C
 289         jnc     L(unroll_loop)          C flags are from the subl
 290
 291
 292
 293 L(finish):
 294         C eax   -4 to -1 representing respectively 0 to 3 limbs remaining
 295         C Bit 1 of al is set only for -2 and -1, that is 2 or 3 limbs remaining.
 296         testb   $2, %al
 297
 298         jz      L(finish_no_two)
 299         C Two or three limbs remain: do one more qword step, same pattern as the first half of the loop.
 300         movq    8(%ebx,%eax,4), %mm0
 301         psllq   %mm6, %mm2
 302
 303         movq    %mm0, %mm1
 304         psrlq   %mm7, %mm0
 305
 306         movq    %mm3, 24(%edx,%eax,4)   C prev
 307         por     %mm2, %mm0
 308
 309         movq    %mm1, %mm2              C re-establish mm2 = unshifted src prev qword
 310         movq    %mm0, %mm3              C re-establish mm3 = dst qword pending store
 311
 312         subl    $2, %eax
 313 L(finish_no_two):
 314
 315
 316         C eax   -4 or -3 representing respectively 0 or 1 limbs remaining
 317         C
 318         C mm2   src prev qword, from 16(%ebx,%eax,4)
 319         C mm3   dst qword, for 24(%edx,%eax,4)
 320
 321         testb   $1, %al                 C bit 0 is set only for -3, one limb remaining
 322         movd    %mm5, %eax      C retval
 323
 324         popl    %edi
 325         jz      L(finish_zero)          C flags are still from the testb
 326
 327
 328         C One extra src limb, destination was aligned.
 329         C
 330         C                 source                  ebx
 331         C                 --+---------------+-------+
 332         C                   |      mm2      |       |
 333         C                 --+---------------+-------+
 334         C
 335         C dest         edx+12           edx+4     edx
 336         C --+---------------+---------------+-------+
 337         C   |      mm3      |               |       |
 338         C --+---------------+---------------+-------+
 339         C
 340         C mm6 = shift
 341         C mm7 = ecx = 64-shift
 342
 343
 344         C One extra src limb, destination was unaligned.
 345         C
 346         C                 source                  ebx
 347         C                 --+---------------+-------+
 348         C                   |      mm2      |       |
 349         C                 --+---------------+-------+
 350         C
 351         C         dest         edx+12           edx+4
 352         C         --+---------------+---------------+
 353         C           |      mm3      |               |
 354         C         --+---------------+---------------+
 355         C
 356         C mm6 = shift+32
 357         C mm7 = ecx = 64-(shift+32)
 358
 359
 360         C In both cases there's one extra limb of src to fetch and combine
 361         C with mm2 to make a qword at 4(%edx), and in the aligned case
 362         C there's an extra limb of dst to be formed from that extra src limb
 363         C left shifted.
 364
 365
 366         movd    (%ebx), %mm0            C the one remaining src limb
 367         psllq   %mm6, %mm2
 368         C One limb left: store the pending qword, form the qword for 4(%edx), and in the aligned case the low dst limb too.
 369         movq    %mm3, 12(%edx)
 370         psllq   $32, %mm0               C place the low src limb in the high dword, zeros below it
 371
 372         movq    %mm0, %mm1
 373         psrlq   %mm7, %mm0
 374
 375         por     %mm2, %mm0
 376         psllq   %mm6, %mm1
 377
 378         movq    %mm0, 4(%edx)
 379         psrlq   $32, %mm1               C low dword = low src limb shifted left (used in the aligned case only)
 380
 381         andl    $32, %ecx               C ecx is 64-shift if dst was aligned, 64-(shift+32) if not; bit 5 tells them apart (assumes shift is 1 to 31)
 382         popl    %ebx
 383
 384         jz      L(finish_one_unaligned) C flags are from the andl
 385
 386         movd    %mm1, (%edx)            C aligned case: dst low limb
 387 L(finish_one_unaligned):
 388
 389         emms
 390
 391         ret
 392
 393
 394 L(finish_zero):
 395         C Reached from L(finish_no_two) when no src limbs remain beyond those in mm2.
 396         C No extra src limbs, destination was aligned.
 397         C
 398         C                 source          ebx
 399         C                 --+---------------+
 400         C                   |      mm2      |
 401         C                 --+---------------+
 402         C
 403         C dest          edx+8             edx
 404         C --+---------------+---------------+
 405         C   |      mm3      |               |
 406         C --+---------------+---------------+
 407         C
 408         C mm6 = shift
 409         C mm7 = ecx = 64-shift
 410
 411
 412         C No extra src limbs, destination was unaligned.
 413         C
 414         C               source            ebx
 415         C                 --+---------------+
 416         C                   |      mm2      |
 417         C                 --+---------------+
 418         C
 419         C         dest          edx+8   edx+4
 420         C         --+---------------+-------+
 421         C           |      mm3      |       |
 422         C         --+---------------+-------+
 423         C
 424         C mm6 = shift+32
 425         C mm7 = ecx = 64-(shift+32)
 426
 427
 428         C The movd for the unaligned case writes the same data to 4(%edx)
 429         C that the movq does for the aligned case.
 430
 431
 432         movq    %mm3, 8(%edx)           C store the pending dst qword
 433         andl    $32, %ecx               C bit 5 of ecx set: dst was aligned; clear: unaligned (assumes shift is 1 to 31)
 434
 435         psllq   %mm6, %mm2
 436         jz      L(finish_zero_unaligned)        C flags are from the andl
 437
 438         movq    %mm2, (%edx)            C aligned case: lowest dst qword
 439 L(finish_zero_unaligned):
 440
 441         psrlq   $32, %mm2
 442         popl    %ebx
 443
 444         movd    %mm5, %eax      C retval (eax already holds it from L(finish_no_two); this reload is redundant)
 445
 446         movd    %mm2, 4(%edx)
 447
 448         emms
 449
 450         ret
 451
 452 EPILOGUE()