mpn/x86/k6/mmx/dive_1.asm

   1 dnl  AMD K6 mpn_divexact_1 -- mpn by limb exact division.
   2
   3 dnl  Copyright 2000, 2001, 2002, 2007 Free Software Foundation, Inc.
   4 dnl
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or
   8 dnl  modify it under the terms of the GNU Lesser General Public License as
   9 dnl  published by the Free Software Foundation; either version 3 of the
  10 dnl  License, or (at your option) any later version.
  11 dnl
  12 dnl  The GNU MP Library is distributed in the hope that it will be useful,
  13 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 dnl  Lesser General Public License for more details.
  16 dnl
  17 dnl  You should have received a copy of the GNU Lesser General Public License
  18 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
  19
  20 include(`../config.m4')
  21
  22
  23 C         divisor
  24 C       odd   even
  25 C K6:   10.0  12.0  cycles/limb
  26 C K6-2: 10.0  11.5
  27
  28
  29 C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
  30 C                      mp_limb_t divisor);
  31 C
  32 C A simple divl is used for size==1.  This is about 10 cycles faster for an
  33 C odd divisor or 20 cycles for an even divisor.
  34 C
  35 C The loops are quite sensitive to code alignment, speeds should be
  36 C rechecked (odd and even divisor, pic and non-pic) if contemplating
  37 C changing anything.
  38
  39 defframe(PARAM_DIVISOR,16)
  40 defframe(PARAM_SIZE,   12)
  41 defframe(PARAM_SRC,    8)
  42 defframe(PARAM_DST,    4)
  43
  44 dnl  re-use parameter space
  45 define(VAR_INVERSE,`PARAM_DST')
  46
  47         TEXT
  48
  49         ALIGN(32)
  50 PROLOGUE(mpn_divexact_1)
  51 deflit(`FRAME',0)
  52
  53         movl    PARAM_SIZE, %ecx
  54
  55         movl    PARAM_SRC, %eax
  56         xorl    %edx, %edx
  57
  58         cmpl    $1, %ecx
  59         jnz     L(two_or_more)
  60
  61         movl    (%eax), %eax
  62
  63         divl    PARAM_DIVISOR
  64
  65         movl    PARAM_DST, %ecx
  66         movl    %eax, (%ecx)
  67
  68         ret
  69
  70
  71 L(two_or_more):
  72         movl    PARAM_DIVISOR, %eax
  73         pushl   %ebx            FRAME_pushl()
  74
  75         movl    PARAM_SRC, %ebx
  76         pushl   %ebp            FRAME_pushl()
  77
  78 L(strip_twos):
  79         shrl    %eax
  80         incl    %edx                    C will get shift+1
  81
  82         jnc     L(strip_twos)
  83         pushl   %esi            FRAME_pushl()
  84
  85         leal    1(%eax,%eax), %esi      C d without twos
  86         andl    $127, %eax              C d/2, 7 bits
  87
  88 ifdef(`PIC',`
  89         LEA(    binvert_limb_table, %ebp)
  90 Zdisp(  movzbl, 0,(%eax,%ebp), %eax)
  91 ',`
  92         movzbl  binvert_limb_table(%eax), %eax  C inv 8 bits
  93 ')
  94         pushl   %edi            FRAME_pushl()
  95
  96         leal    (%eax,%eax), %ebp       C 2*inv
  97
  98         imull   %eax, %eax              C inv*inv
  99
 100         movl    PARAM_DST, %edi
 101
 102         imull   %esi, %eax              C inv*inv*d
 103
 104         subl    %eax, %ebp              C inv = 2*inv - inv*inv*d
 105         leal    (%ebp,%ebp), %eax       C 2*inv
 106
 107         imull   %ebp, %ebp              C inv*inv
 108
 109         movl    %esi, PARAM_DIVISOR     C d without twos
 110         leal    (%ebx,%ecx,4), %ebx     C src end
 111
 112         imull   %esi, %ebp              C inv*inv*d
 113
 114         leal    (%edi,%ecx,4), %edi     C dst end
 115         negl    %ecx                    C -size
 116
 117         subl    %ebp, %eax              C inv = 2*inv - inv*inv*d
 118         subl    $1, %edx                C shift amount, and clear carry
 119
 120         ASSERT(e,`      C expect d*inv == 1 mod 2^GMP_LIMB_BITS
 121         pushl   %eax    FRAME_pushl()
 122         imull   PARAM_DIVISOR, %eax
 123         cmpl    $1, %eax
 124         popl    %eax    FRAME_popl()')
 125
 126         movl    %eax, VAR_INVERSE
 127         jnz     L(even)
 128
 129         movl    (%ebx,%ecx,4), %esi     C src low limb
 130         jmp     L(odd_entry)
 131
 132
 133         ALIGN(16)
 134         nop     C code alignment
 135 L(odd_top):
 136         C eax   scratch
 137         C ebx   src end
 138         C ecx   counter, limbs, negative
 139         C edx   inverse
 140         C esi   next limb, adjusted for carry
 141         C edi   dst end
 142         C ebp   carry bit, 0 or -1
 143
 144         imull   %edx, %esi
 145
 146         movl    PARAM_DIVISOR, %eax
 147         movl    %esi, -4(%edi,%ecx,4)
 148
 149         mull    %esi                    C carry limb in edx
 150
 151         subl    %ebp, %edx              C apply carry bit
 152         movl    (%ebx,%ecx,4), %esi
 153
 154 L(odd_entry):
 155         subl    %edx, %esi              C apply carry limb
 156         movl    VAR_INVERSE, %edx
 157
 158         sbbl    %ebp, %ebp              C 0 or -1
 159
 160         incl    %ecx
 161         jnz     L(odd_top)
 162
 163
 164         imull   %edx, %esi
 165
 166         movl    %esi, -4(%edi,%ecx,4)
 167
 168         popl    %edi
 169         popl    %esi
 170
 171         popl    %ebp
 172         popl    %ebx
 173
 174         ret
 175
 176
 177 L(even):
 178         C eax
 179         C ebx   src end
 180         C ecx   -size
 181         C edx   twos
 182         C esi
 183         C edi   dst end
 184         C ebp
 185
 186         xorl    %ebp, %ebp
 187 Zdisp(  movq,   0,(%ebx,%ecx,4), %mm0)  C src[0,1]
 188
 189         movd    %edx, %mm7
 190         movl    VAR_INVERSE, %edx
 191
 192         addl    $2, %ecx
 193         psrlq   %mm7, %mm0
 194
 195         movd    %mm0, %esi
 196         jz      L(even_two)             C if only two limbs
 197
 198
 199 C Out-of-order execution is good enough to hide the load/rshift/movd
 200 C latency.  Having imul at the top of the loop gives 11.5 c/l instead of 12,
 201 C on K6-2.  In fact there's only 11 of decode, but nothing running at 11 has
 202 C been found.  Maybe the fact every second movq is unaligned costs the extra
 203 C 0.5.
 204
 205 L(even_top):
 206         C eax   scratch
 207         C ebx   src end
 208         C ecx   counter, limbs, negative
 209         C edx   inverse
 210         C esi   next limb, adjusted for carry
 211         C edi   dst end
 212         C ebp   carry bit, 0 or -1
 213         C
 214         C mm0   scratch, source limbs
 215         C mm7   twos
 216
 217         imull   %edx, %esi
 218
 219         movl    %esi, -8(%edi,%ecx,4)
 220         movl    PARAM_DIVISOR, %eax
 221
 222         mull    %esi                    C carry limb in edx
 223
 224         movq    -4(%ebx,%ecx,4), %mm0
 225         psrlq   %mm7, %mm0
 226
 227         movd    %mm0, %esi
 228         subl    %ebp, %edx              C apply carry bit
 229
 230         subl    %edx, %esi              C apply carry limb
 231         movl    VAR_INVERSE, %edx
 232
 233         sbbl    %ebp, %ebp              C 0 or -1
 234
 235         incl    %ecx
 236         jnz     L(even_top)
 237
 238
 239 L(even_two):
 240         movd    -4(%ebx), %mm0          C src high limb
 241         psrlq   %mm7, %mm0
 242
 243         imull   %edx, %esi
 244
 245         movl    %esi, -8(%edi)
 246         movl    PARAM_DIVISOR, %eax
 247
 248         mull    %esi                    C carry limb in edx
 249
 250         movd    %mm0, %esi
 251         subl    %ebp, %edx              C apply carry bit
 252
 253         movl    VAR_INVERSE, %eax
 254         subl    %edx, %esi              C apply carry limb
 255
 256         imull   %eax, %esi
 257
 258         movl    %esi, -4(%edi)
 259
 260         popl    %edi
 261         popl    %esi
 262
 263         popl    %ebp
 264         popl    %ebx
 265
 266         emms_or_femms
 267
 268         ret
 269
 270 EPILOGUE()