mpn/x86/k6/mmx/lshift.asm

   1 dnl  AMD K6 mpn_lshift -- mpn left shift.
   2
   3 dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
   4 dnl
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or
   8 dnl  modify it under the terms of the GNU Lesser General Public License as
   9 dnl  published by the Free Software Foundation; either version 3 of the
  10 dnl  License, or (at your option) any later version.
  11 dnl
  12 dnl  The GNU MP Library is distributed in the hope that it will be useful,
  13 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 dnl  Lesser General Public License for more details.
  16 dnl
  17 dnl  You should have received a copy of the GNU Lesser General Public License
  18 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
  19
  20 include(`../config.m4')
  21
  22
  23 C K6: 3.0 cycles/limb
  24
  25
  26 C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
  27 C                       unsigned shift);
  28 C
  29 C The loop runs at 3 cycles/limb, limited by decoding and by having 3 mmx
  30 C instructions.  This is despite every second fetch being unaligned.
  31
  32
  33 defframe(PARAM_SHIFT,16)            C 4th argument: unsigned shift
  34 defframe(PARAM_SIZE, 12)            C 3rd argument: mp_size_t size
  35 defframe(PARAM_SRC,  8)             C 2nd argument: mp_srcptr src
  36 defframe(PARAM_DST,  4)             C 1st argument: mp_ptr dst
  37         C NOTE(review): the numbers look like byte offsets of each argument from the stack pointer at entry; defframe is not defined in this file -- confirm.
  38         TEXT
  39         ALIGN(32)                       C alignment request for the function entry that follows
  40
  41 PROLOGUE(mpn_lshift)
  42 deflit(`FRAME',0)
  43         C Store the size limbs at src, shifted left by shift bits, at dst; return in eax the bits shifted out of the top limb.
  44         C The 1 limb case can be done without the push %ebx, but it's then
  45         C still the same speed.  The push is left as a free helping hand for
  46         C the two_or_more code.
  47         C NOTE(review): the code appears to assume size >= 1 and 1 <= shift <= 31 -- confirm against the callers.
  48         movl    PARAM_SIZE, %eax        C eax = size
  49         pushl   %ebx                    FRAME_pushl()
  50
  51         movl    PARAM_SRC, %ebx         C ebx = src (the caller's ebx was pushed just above)
  52         decl    %eax                    C eax = size-1; ZF is set when size == 1
  53
  54         movl    PARAM_SHIFT, %ecx       C ecx = shift; movl leaves the flags from decl untouched
  55         jnz     L(two_or_more)          C taken when size-1 != 0
  56         C Single limb case: eax is already zero here.
  57         movl    (%ebx), %edx            C src limb
  58         movl    PARAM_DST, %ebx         C ebx = dst; the src pointer is no longer needed
  59
  60         shldl(  %cl, %edx, %eax)        C return value: eax (zero) shifted left, filled from the top bits of edx
  61
  62         shll    %cl, %edx               C edx = src limb << shift
  63
  64         movl    %edx, (%ebx)            C dst limb
  65         popl    %ebx                    C restore the caller's ebx
  66
  67         ret
  68
  69
  70         ALIGN(16)       C avoid offset 0x1f
  71         nop             C avoid bad cache line crossing
  72 L(two_or_more):
  73         C eax   size-1
  74         C ebx   src
  75         C ecx   shift
  76         C edx
  77         C Setup for the loop: edx = return value, mm6 = shift, mm7 = 32-shift, ecx = dst.
  78         movl    (%ebx,%eax,4), %edx     C src high limb
  79         negl    %ecx                    C ecx = -shift
  80
  81         movd    PARAM_SHIFT, %mm6       C mm6 = shift, used for the lowest limb after the loop
  82         addl    $32, %ecx               C 32-shift
  83
  84         shrl    %cl, %edx               C edx = high limb >> (32-shift), the bits shifted out
  85
  86         movd    %ecx, %mm7              C mm7 = 32-shift, the loop's shift count
  87         movl    PARAM_DST, %ecx         C ecx = dst; the 32-shift value now lives in mm7
  88
  89 L(top):
  90         C eax   counter, size-1 to 1
  91         C ebx   src
  92         C ecx   dst
  93         C edx   retval
  94         C
  95         C mm0   scratch
  96         C mm6   shift
  97         C mm7   32-shift
  98         C With i = eax at the top of a pass, the pass stores dst[i]; limbs are handled from high to low.
  99         movq    -4(%ebx,%eax,4), %mm0   C mm0 = src[i] (high dword) : src[i-1] (low dword)
 100         decl    %eax                    C eax = i-1; sets the flags tested by jnz below
 101
 102         psrlq   %mm7, %mm0              C low dword = (src[i] << shift) | (src[i-1] >> (32-shift))
 103
 104         movd    %mm0, 4(%ecx,%eax,4)    C dst[i] = low dword; the +4 compensates for the earlier decl
 105         jnz     L(top)                  C loop until eax reaches 0; movd and psrlq do not alter the flags
 106         C Lowest limb: there is no limb below it, so dst[0] = src[0] << shift.
 107
 108         movd    (%ebx), %mm0            C mm0 = src[0], upper half zero
 109         popl    %ebx                    C restore the caller's ebx; src is no longer needed
 110
 111         psllq   %mm6, %mm0              C mm0 = src[0] << shift
 112         movl    %edx, %eax              C return value computed before the loop
 113
 114         movd    %mm0, (%ecx)            C dst[0] = low 32 bits of mm0
 115
 116         emms                            C clear the MMX state before returning
 117         ret
 118
 119 EPILOGUE()