1 dnl AMD K7 mpn_lshift -- mpn left shift.
3 dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or
8 dnl modify it under the terms of the GNU Lesser General Public License as
9 dnl published by the Free Software Foundation; either version 3 of the
10 dnl License, or (at your option) any later version.
12 dnl The GNU MP Library is distributed in the hope that it will be useful,
13 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 dnl Lesser General Public License for more details.
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20 include(`../config.m4')

C NOTE(review): this chunk appears to be a fragmentary extract of the GMP
C K7 mpn_lshift implementation.  Every line carries a fused decimal prefix
C that looks like an original-file line number, and the gaps in those
C numbers show that many lines are missing from this view (PROLOGUE and
C EPILOGUE, the flag-setting instructions before the conditional jumps,
C the psllq/psrlq shift cores, the loop branches, emms/ret).  Code lines
C below are left byte-for-byte untouched; only comments are added.
C Confirm anything stated here against the pristine file.

23 C K7: 1.21 cycles/limb (at 16 limbs/loop).

C Limbs processed per iteration of the unrolled loop (upper bound 64,
C per the note below).
27 dnl K7: UNROLL_COUNT cycles/limb
32 dnl Maximum possible with the current code is 64.
34 deflit(UNROLL_COUNT, 16)

C C-level contract (the prototype's continuation line carrying the
C shift parameter is not visible in this extract).
37 C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
40 C Shift src,size left by shift many bits and store the result in dst,size.
41 C Zeros are shifted in at the right. The bits shifted out at the left are
44 C The comments in mpn_rshift apply here too.

C Sizes below this threshold take the simple loop, larger sizes the
C unrolled loop.  The two identical deflits are presumably the PIC and
C non-PIC arms of an ifdef whose wrapper lines are missing from this
C extract -- TODO confirm against the full file.
47 deflit(UNROLL_THRESHOLD, 10)
49 deflit(UNROLL_THRESHOLD, 10)

C Stack-frame offsets of the four cdecl parameters (32-bit x86: return
C address at 0(%esp) on entry, arguments above it).
52 defframe(PARAM_SHIFT,16)
53 defframe(PARAM_SIZE, 12)
54 defframe(PARAM_SRC, 8)
55 defframe(PARAM_DST, 4)
C Negative offsets for the callee-saved registers pushed by the
C prologue (the PROLOGUE itself is not visible in this extract).
57 defframe(SAVE_EDI, -4)
58 defframe(SAVE_ESI, -8)
59 defframe(SAVE_EBX, -12)
71 deflit(`FRAME',SAVE_SIZE)

73 movl PARAM_SHIFT, %ecx
C Single-limb fast path falls through when this branch is not taken
C (the instruction establishing the flags is not visible here).
78 jnz L(more_than_one_limb)
C shldl shifts %eax left by %cl while filling from %edx; with %eax
C already zero this leaves exactly the bits shifted out of the one
C src limb, i.e. the function's return value.
82 shldl( %cl, %edx, %eax) C eax was decremented to zero

93 C -----------------------------------------------------------------------------
94 L(more_than_one_limb):
C mm6 carries the left-shift count for the MMX psllq work below.
103 movd PARAM_SHIFT, %mm6
104 movd (%edx,%eax,4), %mm5 C src high limb
C Selects simple loop vs unrolled loop (the conditional branch that
C consumes these flags is not visible in this extract).
105 cmp $UNROLL_THRESHOLD-1, %eax
109 movd (%edx), %mm4 C src low limb

C Simple loop: each iteration loads a qword straddling two adjacent
C src limbs and stores the high 32 bits of the shifted result (the
C psllq between this load and store is not visible here).
116 C eax loop counter, limbs
130 movq -4(%edx,%eax,4), %mm0
135 movd %mm0, 4(%edi,%eax,4)

C Simple-loop finish: store the shifted low limb, return the bits
C shifted out of the high limb, unwind the register-save area.
143 movd %mm4, (%edi) C dst low limb
145 movd %mm5, %eax C return value
148 addl $SAVE_SIZE, %esp

154 C -----------------------------------------------------------------------------
C Unrolled-loop setup.
165 C mm5 src high limb, for return value
170 leal -4(%edx,%eax,4), %edx C &src[size-2]
173 movq (%edx), %mm1 C src high qword
C Taken when the src high qword is already 8-byte aligned (the
C alignment test itself is not visible here).
175 jz L(start_src_aligned)

178 C src isn't aligned, process high limb (marked xxx) separately to
181 C source -4(edx,%eax,4)
183 C +-------+-------+-------+--
185 C +-------+-------+-------+--
188 C dest -4(edi,%eax,4)
190 C +-------+-------+--
192 C +-------+-------+--

196 movl %eax, PARAM_SIZE C size-1
199 decl %eax C size-2 is new size-1
C Store the separately-handled high limb (the shift producing the
C value in mm1 is not visible in this extract).
201 movd %mm1, 4(%edi,%eax,4)
202 movq (%edx), %mm1 C new src high qword
203 L(start_src_aligned):

206 leal -4(%edi,%eax,4), %edi C &dst[size-2]
C Moves the return-value bits into mm5's low word (the preceding
C shift applied to mm5 is not visible in this extract).
210 psrlq $32, %mm5 C return value
212 jz L(start_dst_aligned)

215 C dst isn't aligned, subtract 4 bytes to make it so, and pretend the
216 C shift is 32 bits extra. High limb of dst (marked xxx) handled
220 C +-------+-------+--
222 C +-------+-------+--
226 C +-------+-------+-------+--
228 C +-------+-------+-------+--

233 addl $32, %ecx C shift+32
241 movd %ecx, %mm6 C new lshift
242 L(start_dst_aligned):

244 decl %eax C size-2, two last limbs handled at end
245 movq %mm1, %mm2 C copy of src high qword
248 andl $-2, %eax C round size down to even

C Computed jump into the unrolled loop: esi = L(entry) + 5*eax,
C implying a fixed encoded size per skipped limb of entry code.
254 andl $UNROLL_MASK, %eax
259 movd %ecx, %mm7 C rshift = 64-lshift
265 leal L(entry) (%eax,%eax,4), %esi
267 shrl $UNROLL_LOG2, %ebx C loop counter
C When UNROLL_BYTES is 256 the pointers get a 128-byte bias,
C apparently so the loop displacements fit in signed bytes.
269 leal ifelse(UNROLL_BYTES,256,128) -8(%edx,%eax,2), %edx
270 leal ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi
271 movl PARAM_SIZE, %eax C for use at end

C Alternate (presumably PIC) computation of the same entry address,
C split into two steps to dodge old gas relocation bugs.
277 C See mpn/x86/README about old gas bugs
278 leal (%eax,%eax,4), %esi
279 addl $L(entry)-L(here), %esi

286 C -----------------------------------------------------------------------------
C Unrolled loop.  Partial register-role table (rest not visible):
289 C eax size (for use at end)
298 C mm1 \ carry (alternating, mm2 first)
305 C The two chunks differ in whether mm1 or mm2 hold the carry.
306 C The computed jump puts the initial carry in both mm1 and mm2.

C The loop body is generated by m4: UNROLL_COUNT/CHUNK_COUNT copies
C of a CHUNK_COUNT-limb chunk.  Zdisp apparently forces an explicit
C zero-displacement byte so every chunk encodes to the same size,
C keeping the computed-jump arithmetic above valid -- see the mpn/x86
C support macros.
309 deflit(CHUNK_COUNT, 4)
310 forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
311 deflit(`disp0', eval(-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
312 deflit(`disp1', eval(disp0 - 8))

314 Zdisp( movq, disp0,(%edx), %mm0)
321 Zdisp( movq, %mm0, disp0,(%edi))
324 Zdisp( movq, disp1,(%edx), %mm0)
331 Zdisp( movq, %mm0, disp1,(%edi))

C Step both pointers down one full unrolled block per iteration
C (the loop-closing branch is not visible in this extract).
334 subl $UNROLL_BYTES, %edx
335 subl $UNROLL_BYTES, %edi

C disp(n) applies the same conditional 128-byte bias as above, and
C expands to nothing when the result is zero.
342 define(`disp', `m4_empty_if_zero(eval($1 ifelse(UNROLL_BYTES,256,-128)))')

347 psllq %mm6, %mm2 C wanted left shifted in all cases below

C Finishing code: one variant per (size parity x dst alignment).
357 C Size odd, destination was aligned.
360 C --+---------------+-------+
362 C --+---------------+-------+
365 C --+---------------+---------------+-------+
367 C --+---------------+---------------+-------+
370 C mm7 = ecx = 64-shift

373 C Size odd, destination was unaligned.
376 C --+---------------+-------+
378 C --+---------------+-------+
381 C --+---------------+---------------+
383 C --+---------------+---------------+
386 C mm7 = ecx = 64-(shift+32)

389 C In both cases there's one extra limb of src to fetch and combine
390 C with mm2 to make a qword at (%edi), and in the aligned case
391 C there's an extra limb of dst to be formed from that extra src limb

394 movd disp(4) (%edx), %mm0
405 movq %mm0, disp(0) (%edi)
C Aligned case stores one extra low limb; unaligned case skips it.
406 jz L(end_odd_unaligned)
407 movd %mm1, disp(-4) (%edi)
408 L(end_odd_unaligned):

C Unwind the register-save area (the pops/emms/ret that should follow
C are not visible in this extract).
411 addl $SAVE_SIZE, %esp

419 C Size even, destination was aligned.
422 C --+---------------+
424 C --+---------------+
427 C --+---------------+---------------+
429 C --+---------------+---------------+
432 C mm7 = ecx = 64-shift

435 C Size even, destination was unaligned.
438 C --+---------------+
440 C --+---------------+
443 C --+---------------+-------+
445 C --+---------------+-------+
448 C mm7 = ecx = 64-(shift+32)

451 C The movq for the aligned case overwrites the movd for the
458 movd %mm2, disp(4) (%edi)
460 jz L(end_even_unaligned)
461 movq %mm0, disp(0) (%edi)
462 L(end_even_unaligned):

465 addl $SAVE_SIZE, %esp