mpn/x86/k6/k62mmx/rshift.asm

   1 dnl  AMD K6-2 mpn_rshift -- mpn right shift.
   2
   3 dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
   4 dnl
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or
   8 dnl  modify it under the terms of the GNU Lesser General Public License as
   9 dnl  published by the Free Software Foundation; either version 3 of the
  10 dnl  License, or (at your option) any later version.
  11 dnl
  12 dnl  The GNU MP Library is distributed in the hope that it will be useful,
  13 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 dnl  Lesser General Public License for more details.
  16 dnl
  17 dnl  You should have received a copy of the GNU Lesser General Public License
  18 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
  19
  20 include(`../config.m4')
  21
  22
  23 C K6-2: 1.75 cycles/limb
  24
  25
  26 C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
  27 C                       unsigned shift);
  28 C
  29
     C Stack argument offsets, in the same order as the C prototype above
     C (dst, src, size, shift).
     C NOTE(review): defframe is defined outside this file; presumably the
     C offsets are relative to %esp at function entry and are corrected by
     C the FRAME bookkeeping after each push -- confirm against the macro
     C definitions.
  30 defframe(PARAM_SHIFT,16)
  31 defframe(PARAM_SIZE, 12)
  32 defframe(PARAM_SRC,  8)
  33 defframe(PARAM_DST,  4)
  34 deflit(`FRAME',0)
  35
  36 dnl  Minimum 9, because the unrolled loop can't handle less.
  37 dnl
     dnl  The unrolled code may first emit one limb separately to align dst,
     dnl  then preloads four src limbs, and its loop body always runs at least
     dnl  once, reading four more: 1 + 4 + 4 = 9 limbs.
  38 deflit(UNROLL_THRESHOLD, 9)
  39
  40         TEXT
  41         ALIGN(32)
  42
  43 PROLOGUE(mpn_rshift)
  44 deflit(`FRAME',0)
             C Shift the size-limb number at src right by shift bits, store the
             C result at dst, and return the bits shifted out of the low limb,
             C left-justified in the return limb (low limb << (32-shift)).
             C
             C Three code paths, selected on size:
             C   size == 1                      single limb integer shift
             C   2 <= size < UNROLL_THRESHOLD   L(simple), one limb per iteration
             C   size >= UNROLL_THRESHOLD       L(unroll), four limbs per iteration
             C
             C %ebx is the only register saved and restored here.  The MMX paths
             C end with femms before returning.
             C
             C NOTE(review): nothing here checks size or shift.  Presumably
             C callers guarantee size >= 1 and 1 <= shift <= 31 (size == 0 would
             C fall into the unrolled code) -- confirm against the mpn_rshift
             C interface documentation.
  45
  46         C The 1 limb case can be done without the push %ebx, but it's then
  47         C still the same speed.  The push is left as a free helping hand for
  48         C the two_or_more code.
  49
  50         movl    PARAM_SIZE, %eax
  51         pushl   %ebx                    FRAME_pushl()
  52
  53         movl    PARAM_SRC, %ebx
  54         decl    %eax                    C size-1, sets ZF for the jnz below
  55
  56         movl    PARAM_SHIFT, %ecx       C movl leaves the flags from decl intact
  57         jnz     L(two_or_more)
  58
  59         movl    (%ebx), %edx            C src limb
  60         movl    PARAM_DST, %ebx
  61
  62         shrdl(  %cl, %edx, %eax)        C return value (eax was 0 from the decl)
  63
  64         shrl    %cl, %edx
  65
  66         movl    %edx, (%ebx)            C dst limb
  67         popl    %ebx
  68
  69         ret
  70
  71
  72 C -----------------------------------------------------------------------------
  73         ALIGN(16)       C avoid offset 0x1f
  74 L(two_or_more):
  75         C eax   size-1
  76         C ebx   src
  77         C ecx   shift
  78         C edx
  79
  80         movl    (%ebx), %edx    C src low limb
  81         negl    %ecx
  82
  83         addl    $32, %ecx       C 32-shift
  84         movd    PARAM_SHIFT, %mm6
  85
  86         shll    %cl, %edx       C retval, the bits shifted out of the low limb
  87         cmpl    $UNROLL_THRESHOLD-1, %eax
  88
  89         jae     L(unroll)       C size >= UNROLL_THRESHOLD
  90
  91
  92         C eax   size-1
  93         C ebx   src
  94         C ecx   32-shift
  95         C edx   retval
  96         C
  97         C mm6   shift
  98
  99         movl    PARAM_DST, %ecx
 100         leal    (%ebx,%eax,4), %ebx     C &src[size-1]
 101
 102         leal    -4(%ecx,%eax,4), %ecx   C &dst[size-2]
 103         negl    %eax                    C -(size-1)
 104
 105         C This loop runs at about 3 cycles/limb, which is the amount of
 106         C decoding, and this is despite every second access being unaligned.
             C
             C Each iteration loads two adjacent src limbs as one qword, shifts
             C the qword right and stores only its low dword as one dst limb.
             C The last qword loaded is src[size-2,size-1], and after the loop
             C it is stored whole to supply dst[size-1] as well.
 107
 108 L(simple):
 109         C eax   counter, -(size-1) to -1
 110         C ebx   &src[size-1]
 111         C ecx   &dst[size-2]
 112         C edx   retval
 113         C
 114         C mm0   scratch
 115         C mm6   shift
 116
 117 Zdisp(  movq,   0,(%ebx,%eax,4), %mm0)
 118         incl    %eax            C sets ZF for the jnz, MMX ops leave flags alone
 119
 120         psrlq   %mm6, %mm0
 121
 122 Zdisp(  movd,   %mm0, 0,(%ecx,%eax,4))
 123         jnz     L(simple)
 124
 125
 126         movq    %mm0, (%ecx)    C dst[size-2,size-1]
 127         movl    %edx, %eax      C retval
 128
 129         popl    %ebx
 130
 131         femms                   C clear MMX state before returning
 132         ret
 133
 134
 135 C -----------------------------------------------------------------------------
 136         ALIGN(16)
 137 L(unroll):
 138         C eax   size-1
 139         C ebx   src
 140         C ecx   32-shift
 141         C edx   retval
 142         C
 143         C mm6   shift
 144
 145         addl    $32, %ecx               C 64-shift
 146         subl    $7, %eax                C size-8
 147
 148         movd    %ecx, %mm7              C mm7 = 64-shift
 149         movl    PARAM_DST, %ecx
 150
 151         movq    (%ebx), %mm2            C src low qword
 152         leal    (%ebx,%eax,4), %ebx     C src end - 32
 153
 154         testb   $4, %cl                 C dst bit 2, leal and notl keep these flags
 155         leal    (%ecx,%eax,4), %ecx     C dst end - 32
 156
 157         notl    %eax                    C -(size-7)
 158         jz      L(dst_aligned)
 159
             C dst bit 2 is set: write one limb on its own so that the qword
             C stores in the loop fall on addresses with bit 2 clear.
             C NOTE(review): that makes them 8-byte aligned only if dst is at
             C least 4-byte aligned, which is presumably guaranteed for limb
             C pointers -- confirm.
 160         psrlq   %mm6, %mm2
 161         incl    %eax                    C -(size-8), one limb accounted for
 162
 163 Zdisp(  movd,   %mm2, 0,(%ecx,%eax,4))  C dst low limb
 164         movq    4(%ebx,%eax,4), %mm2    C new src low qword
 165 L(dst_aligned):
 166
 167         movq    12(%ebx,%eax,4), %mm0   C src second lowest qword
 168         nop     C avoid bad cache line crossing
 169
 170
 171         C This loop is the important bit, the rest is just support for it.
 172         C Four src limbs are held at the start, and four more will be read.
 173         C Four dst limbs will be written.  This schedule seems necessary for
 174         C full speed.
 175         C
 176         C The use of -(size-7) lets the loop stop when %eax becomes >= 0 and
 177         C leaves 0 to 3 which can be tested with test $1 and $2.
 178
 179 L(top):
 180         C eax   counter, -(size-7) step by +4 until >=0
             C       (starts at -(size-8) when a limb was written above to align dst)
 181         C ebx   src end - 32
 182         C ecx   dst end - 32
 183         C edx   retval
 184         C
 185         C mm0   src next qword
 186         C mm1   scratch
 187         C mm2   src prev qword
 188         C mm6   shift
 189         C mm7   64-shift
 190
 191         psrlq   %mm6, %mm2
 192         addl    $4, %eax        C sets CF and ZF for the ja at the bottom
 193
 194         movq    %mm0, %mm1
 195         psllq   %mm7, %mm0
 196
 197         por     %mm0, %mm2      C (prev >> shift) | (next << (64-shift))
 198         movq    4(%ebx,%eax,4), %mm0
 199
 200         psrlq   %mm6, %mm1
 201         movq    %mm2, -12(%ecx,%eax,4)  C first two dst limbs of this group
 202
 203         movq    %mm0, %mm2
 204         psllq   %mm7, %mm0
 205
 206         por     %mm0, %mm1
 207         movq    12(%ebx,%eax,4), %mm0
 208
 209         movq    %mm1, -4(%ecx,%eax,4)   C last two dst limbs of this group
 210         ja      L(top)          C jump if no carry and not zero
 211
 212
 213
 214         C Now have the four limbs in mm2 (low) and mm0 (high), and %eax is 0
 215         C to 3 representing respectively 3 to 0 further limbs.
 216
 217         testl   $2, %eax        C testl to avoid bad cache line crossings
 218         jnz     L(finish_nottwo)
 219
 220         C Two or three extra limbs: rshift mm2, OR it with lshifted mm0, mm0
 221         C becomes new mm2 and a new mm0 is loaded.
 222
 223         psrlq   %mm6, %mm2
 224         movq    %mm0, %mm1
 225
 226         psllq   %mm7, %mm0
 227         addl    $2, %eax        C two of the extra limbs accounted for
 228
 229         por     %mm0, %mm2
 230         movq    12(%ebx,%eax,4), %mm0
 231
 232         movq    %mm2, -4(%ecx,%eax,4)
 233         movq    %mm1, %mm2
 234 L(finish_nottwo):
 235
             C eax is now 2 or 3, meaning one or no further limb beyond the four
             C held in mm2 (low) and mm0 (high).
 236
 237         testb   $1, %al         C flags survive the MMX ops until the jnz
 238         psrlq   %mm6, %mm2
 239
 240         movq    %mm0, %mm1
 241         psllq   %mm7, %mm0
 242
 243         por     %mm0, %mm2
 244         psrlq   %mm6, %mm1
 245
 246         movq    %mm2, 4(%ecx,%eax,4)
 247         jnz     L(finish_even)
 248
 249
 250         C one further extra limb to process
 251
 252         movd    32-4(%ebx), %mm0        C src[size-1], most significant limb
 253         popl    %ebx
 254
 255         movq    %mm0, %mm2
 256         psllq   %mm7, %mm0
 257
 258         por     %mm0, %mm1
 259         psrlq   %mm6, %mm2
 260
 261         movq    %mm1, 32-12(%ecx)       C dst[size-3,size-2]
 262         movd    %mm2, 32-4(%ecx)        C dst[size-1]
 263
 264         movl    %edx, %eax              C retval
 265
 266         femms                           C clear MMX state before returning
 267         ret
 268
 269
 270         nop     C avoid bad cache line crossing
 271 L(finish_even):
 272         C no further extra limbs
 273
 274         movq    %mm1, 32-8(%ecx)        C dst[size-2,size-1]
 275         movl    %edx, %eax              C retval
 276
 277         popl    %ebx
 278
 279         femms                           C clear MMX state before returning
 280         ret
 281
 282 EPILOGUE()