mpn/x86/pentium/lshift.asm

   1 dnl  Intel Pentium mpn_lshift -- mpn left shift.
   2
   3 dnl  Copyright 1992, 1994, 1995, 1996, 1999, 2000, 2002 Free Software
   4 dnl  Foundation, Inc.
   5 dnl
   6 dnl  This file is part of the GNU MP Library.
   7 dnl
   8 dnl  The GNU MP Library is free software; you can redistribute it and/or
   9 dnl  modify it under the terms of the GNU Lesser General Public License as
  10 dnl  published by the Free Software Foundation; either version 3 of the
  11 dnl  License, or (at your option) any later version.
  12 dnl
  13 dnl  The GNU MP Library is distributed in the hope that it will be useful,
  14 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16 dnl  Lesser General Public License for more details.
  17 dnl
  18 dnl  You should have received a copy of the GNU Lesser General Public License
  19 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
  20
  21 include(`../config.m4')
  22
  23
  24 C         cycles/limb
  25 C P5,P54:    6.0
  26 C P55:       5.375
  27
  28
  29 C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
  30 C                       unsigned shift);
  31 C
  32 C The main shift-by-N loop should run at 5.375 c/l and that's what P55 does,
  33 C but P5 and P54 run only at 6.0 c/l, which is 4 cycles lost somewhere.
  34
  35 defframe(PARAM_SHIFT,16)                C shift count
  36 defframe(PARAM_SIZE, 12)                C number of limbs
  37 defframe(PARAM_SRC,  8)                 C source pointer
  38 defframe(PARAM_DST,  4)                 C destination pointer
  39
  40         TEXT
  41         ALIGN(8)
  42 PROLOGUE(mpn_lshift)
  43 C Save the four registers that are restored before each ret.
  44         pushl   %edi
  45         pushl   %esi
  46         pushl   %ebx
  47         pushl   %ebp
  48 deflit(`FRAME',16)
  49 C Load the arguments: edi = dst, esi = src, ebp = size, ecx = shift.
  50         movl    PARAM_DST,%edi
  51         movl    PARAM_SRC,%esi
  52         movl    PARAM_SIZE,%ebp
  53         movl    PARAM_SHIFT,%ecx
  54
  55 C We can use faster code for shift-by-1 under certain conditions.
  56         cmp     $1,%ecx
  57         jne     L(normal)               C shift count is not 1
  58         leal    4(%esi),%eax            C eax = src + 1 limb
  59         cmpl    %edi,%eax
  60         jnc     L(special)              C jump if s_ptr + 1 >= res_ptr
  61         leal    (%esi,%ebp,4),%eax      C eax = src + size limbs
  62         cmpl    %eax,%edi
  63         jnc     L(special)              C jump if res_ptr >= s_ptr + size
  64 C General case: process the limbs from the most significant end downwards.
  65 L(normal):
  66         leal    -4(%edi,%ebp,4),%edi    C edi = &dst[size-1]
  67         leal    -4(%esi,%ebp,4),%esi    C esi = &src[size-1]
  68
  69         movl    (%esi),%edx             C edx = most significant src limb
  70         subl    $4,%esi
  71         xorl    %eax,%eax
  72         shldl(  %cl, %edx, %eax)        C compute carry limb
  73         pushl   %eax                    C push carry limb onto stack
  74
  75         decl    %ebp                    C ebp = size-1, limbs left to load
  76         pushl   %ebp                    C saved for the tail loop count
  77         shrl    $3,%ebp                 C ebp = number of 8-limb passes
  78         jz      L(end)
  79
  80         movl    (%edi),%eax             C fetch destination cache line
  81 C Eight limbs per pass; edx holds the limb loaded last, not yet stored.
  82         ALIGN(4)
  83 L(oop): movl    -28(%edi),%eax          C fetch destination cache line
  84         movl    %edx,%ebx
  85
  86         movl    (%esi),%eax
  87         movl    -4(%esi),%edx
  88         shldl(  %cl, %eax, %ebx)
  89         shldl(  %cl, %edx, %eax)
  90         movl    %ebx,(%edi)
  91         movl    %eax,-4(%edi)
  92
  93         movl    -8(%esi),%ebx
  94         movl    -12(%esi),%eax
  95         shldl(  %cl, %ebx, %edx)
  96         shldl(  %cl, %eax, %ebx)
  97         movl    %edx,-8(%edi)
  98         movl    %ebx,-12(%edi)
  99
 100         movl    -16(%esi),%edx
 101         movl    -20(%esi),%ebx
 102         shldl(  %cl, %edx, %eax)
 103         shldl(  %cl, %ebx, %edx)
 104         movl    %eax,-16(%edi)
 105         movl    %edx,-20(%edi)
 106
 107         movl    -24(%esi),%eax
 108         movl    -28(%esi),%edx
 109         shldl(  %cl, %eax, %ebx)
 110         shldl(  %cl, %edx, %eax)
 111         movl    %ebx,-24(%edi)
 112         movl    %eax,-28(%edi)
 113
 114         subl    $32,%esi
 115         subl    $32,%edi
 116         decl    %ebp
 117         jnz     L(oop)
 118 C Tail: the remaining (size-1) mod 8 limbs, one per pass.
 119 L(end): popl    %ebp                    C ebp = size-1
 120         andl    $7,%ebp
 121         jz      L(end2)
 122 L(oop2):
 123         movl    (%esi),%eax             C eax = next lower src limb
 124         shldl(  %cl,%eax,%edx)          C shift edx left, filling from eax
 125         movl    %edx,(%edi)
 126         movl    %eax,%edx
 127         subl    $4,%esi
 128         subl    $4,%edi
 129         decl    %ebp
 130         jnz     L(oop2)
 131
 132 L(end2):
 133         shll    %cl,%edx                C compute least significant limb
 134         movl    %edx,(%edi)             C store it
 135
 136         popl    %eax                    C pop carry limb, the return value
 137
 138         popl    %ebp
 139         popl    %ebx
 140         popl    %esi
 141         popl    %edi
 142         ret
 143
 144
 145 C We loop from the least significant end of the arrays, which is only
 146 C permissible when dst <= src + 1 limb or dst >= src + size, since the
 147 C function is documented to work for overlapping source and destination.
 148
 149 L(special):
 150         movl    (%esi),%edx             C edx = least significant src limb
 151         addl    $4,%esi
 152
 153         decl    %ebp                    C ebp = size-1, limbs left to load
 154         pushl   %ebp
 155         shrl    $3,%ebp                 C ebp = number of 8-limb passes
 156
 157         addl    %edx,%edx               C shift first limb; carry = its top bit
 158         incl    %ebp                    C incl/decl set the zero flag for ebp
 159         decl    %ebp                    C without disturbing the carry flag
 160         jz      L(Lend)
 161
 162         movl    (%edi),%eax             C fetch destination cache line
 163 C Eight limbs per pass; adcl doubles each limb and adds the carried-in bit.
 164         ALIGN(4)
 165 L(Loop):
 166         movl    28(%edi),%eax           C fetch destination cache line
 167         movl    %edx,%ebx
 168
 169         movl    (%esi),%eax
 170         movl    4(%esi),%edx
 171         adcl    %eax,%eax
 172         movl    %ebx,(%edi)
 173         adcl    %edx,%edx
 174         movl    %eax,4(%edi)
 175
 176         movl    8(%esi),%ebx
 177         movl    12(%esi),%eax
 178         adcl    %ebx,%ebx
 179         movl    %edx,8(%edi)
 180         adcl    %eax,%eax
 181         movl    %ebx,12(%edi)
 182
 183         movl    16(%esi),%edx
 184         movl    20(%esi),%ebx
 185         adcl    %edx,%edx
 186         movl    %eax,16(%edi)
 187         adcl    %ebx,%ebx
 188         movl    %edx,20(%edi)
 189
 190         movl    24(%esi),%eax
 191         movl    28(%esi),%edx
 192         adcl    %eax,%eax
 193         movl    %ebx,24(%edi)
 194         adcl    %edx,%edx
 195         movl    %eax,28(%edi)
 196
 197         leal    32(%esi),%esi           C use leal not to clobber carry
 198         leal    32(%edi),%edi
 199         decl    %ebp
 200         jnz     L(Loop)
 201
 202 L(Lend):
 203         popl    %ebp                    C ebp = size-1
 204         sbbl    %eax,%eax               C save carry in %eax
 205         andl    $7,%ebp                 C andl clears carry, hence the save
 206         jz      L(Lend2)
 207         addl    %eax,%eax               C restore carry from eax
 208 L(Loop2):
 209         movl    %edx,%ebx
 210         movl    (%esi),%edx
 211         adcl    %edx,%edx
 212         movl    %ebx,(%edi)
 213
 214         leal    4(%esi),%esi            C use leal not to clobber carry
 215         leal    4(%edi),%edi
 216         decl    %ebp
 217         jnz     L(Loop2)
 218
 219         jmp     L(L1)
 220 L(Lend2):
 221         addl    %eax,%eax               C restore carry from eax
 222 L(L1):  movl    %edx,(%edi)             C store last limb
 223
 224         sbbl    %eax,%eax               C eax = 0 or -1 from the final carry
 225         negl    %eax                    C return value: 0 or 1
 226
 227         popl    %ebp
 228         popl    %ebx
 229         popl    %esi
 230         popl    %edi
 231         ret
 232
 233 EPILOGUE()