mpn/x86/k6/mmx/dive_1.asm

   1 dnl  AMD K6 mpn_divexact_1 -- mpn by limb exact division.
   2
   3 dnl  Copyright 2000-2002, 2007 Free Software Foundation, Inc.
   4
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   8 dnl  it under the terms of either:
   9 dnl
  10 dnl    * the GNU Lesser General Public License as published by the Free
  11 dnl      Software Foundation; either version 3 of the License, or (at your
  12 dnl      option) any later version.
  13 dnl
  14 dnl  or
  15 dnl
  16 dnl    * the GNU General Public License as published by the Free Software
  17 dnl      Foundation; either version 2 of the License, or (at your option) any
  18 dnl      later version.
  19 dnl
  20 dnl  or both in parallel, as here.
  21 dnl
  22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  25 dnl  for more details.
  26 dnl
  27 dnl  You should have received copies of the GNU General Public License and the
  28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  29 dnl  see https://www.gnu.org/licenses/.
  30
  31 include(`../config.m4')
  32
  33
  34 C         divisor
  35 C       odd   even
  36 C K6:   10.0  12.0  cycles/limb
  37 C K6-2: 10.0  11.5
  38
  39
  40 C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
  41 C                      mp_limb_t divisor);
  42 C
  43 C A simple divl is used for size==1.  This is about 10 cycles faster for an
  44 C odd divisor or 20 cycles for an even divisor.
  45 C
  46 C The loops are quite sensitive to code alignment, speeds should be
  47 C rechecked (odd and even divisor, pic and non-pic) if contemplating
  48 C changing anything.
  49
  50 defframe(PARAM_DIVISOR,16)
  51 defframe(PARAM_SIZE,   12)
  52 defframe(PARAM_SRC,    8)
  53 defframe(PARAM_DST,    4)
  54
  55 dnl  re-use parameter space
  56 define(VAR_INVERSE,`PARAM_DST')
  57
  58         TEXT
  59
  60         ALIGN(32)
  61 PROLOGUE(mpn_divexact_1)
  62 deflit(`FRAME',0)
  63
  64         movl    PARAM_SIZE, %ecx
  65
  66         movl    PARAM_SRC, %eax
  67         xorl    %edx, %edx
  68
  69         cmpl    $1, %ecx
  70         jnz     L(two_or_more)
  71
  72         movl    (%eax), %eax
  73
  74         divl    PARAM_DIVISOR
  75
  76         movl    PARAM_DST, %ecx
  77         movl    %eax, (%ecx)
  78
  79         ret
  80
  81
  82 L(two_or_more):
  83         movl    PARAM_DIVISOR, %eax
  84         pushl   %ebx            FRAME_pushl()
  85
  86         movl    PARAM_SRC, %ebx
  87         pushl   %ebp            FRAME_pushl()
  88
  89 L(strip_twos):
  90         shrl    %eax
  91         incl    %edx                    C will get shift+1
  92
  93         jnc     L(strip_twos)
  94         pushl   %esi            FRAME_pushl()
  95
  96         leal    1(%eax,%eax), %esi      C d without twos
  97         andl    $127, %eax              C d/2, 7 bits
  98
  99 ifdef(`PIC',`
 100         LEA(    binvert_limb_table, %ebp)
 101 Zdisp(  movzbl, 0,(%eax,%ebp), %eax)
 102 ',`
 103         movzbl  binvert_limb_table(%eax), %eax  C inv 8 bits
 104 ')
 105         pushl   %edi            FRAME_pushl()
 106
 107         leal    (%eax,%eax), %ebp       C 2*inv
 108
 109         imull   %eax, %eax              C inv*inv
 110
 111         movl    PARAM_DST, %edi
 112
 113         imull   %esi, %eax              C inv*inv*d
 114
 115         subl    %eax, %ebp              C inv = 2*inv - inv*inv*d
 116         leal    (%ebp,%ebp), %eax       C 2*inv
 117
 118         imull   %ebp, %ebp              C inv*inv
 119
 120         movl    %esi, PARAM_DIVISOR     C d without twos
 121         leal    (%ebx,%ecx,4), %ebx     C src end
 122
 123         imull   %esi, %ebp              C inv*inv*d
 124
 125         leal    (%edi,%ecx,4), %edi     C dst end
 126         negl    %ecx                    C -size
 127
 128         subl    %ebp, %eax              C inv = 2*inv - inv*inv*d
 129         subl    $1, %edx                C shift amount, and clear carry
 130
 131         ASSERT(e,`      C expect d*inv == 1 mod 2^GMP_LIMB_BITS
 132         pushl   %eax    FRAME_pushl()
 133         imull   PARAM_DIVISOR, %eax
 134         cmpl    $1, %eax
 135         popl    %eax    FRAME_popl()')
 136
 137         movl    %eax, VAR_INVERSE
 138         jnz     L(even)
 139
 140         movl    (%ebx,%ecx,4), %esi     C src low limb
 141         jmp     L(odd_entry)
 142
 143
 144         ALIGN(16)
 145         nop     C code alignment
 146 L(odd_top):
 147         C eax   scratch
 148         C ebx   src end
 149         C ecx   counter, limbs, negative
 150         C edx   inverse
 151         C esi   next limb, adjusted for carry
 152         C edi   dst end
 153         C ebp   carry bit, 0 or -1
 154
 155         imull   %edx, %esi
 156
 157         movl    PARAM_DIVISOR, %eax
 158         movl    %esi, -4(%edi,%ecx,4)
 159
 160         mull    %esi                    C carry limb in edx
 161
 162         subl    %ebp, %edx              C apply carry bit
 163         movl    (%ebx,%ecx,4), %esi
 164
 165 L(odd_entry):
 166         subl    %edx, %esi              C apply carry limb
 167         movl    VAR_INVERSE, %edx
 168
 169         sbbl    %ebp, %ebp              C 0 or -1
 170
 171         incl    %ecx
 172         jnz     L(odd_top)
 173
 174
 175         imull   %edx, %esi
 176
 177         movl    %esi, -4(%edi,%ecx,4)
 178
 179         popl    %edi
 180         popl    %esi
 181
 182         popl    %ebp
 183         popl    %ebx
 184
 185         ret
 186
 187
 188 L(even):
 189         C eax
 190         C ebx   src end
 191         C ecx   -size
 192         C edx   twos
 193         C esi
 194         C edi   dst end
 195         C ebp
 196
 197         xorl    %ebp, %ebp
 198 Zdisp(  movq,   0,(%ebx,%ecx,4), %mm0)  C src[0,1]
 199
 200         movd    %edx, %mm7
 201         movl    VAR_INVERSE, %edx
 202
 203         addl    $2, %ecx
 204         psrlq   %mm7, %mm0
 205
 206         movd    %mm0, %esi
 207         jz      L(even_two)             C if only two limbs
 208
 209
 210 C Out-of-order execution is good enough to hide the load/rshift/movd
 211 C latency.  Having imul at the top of the loop gives 11.5 c/l instead of 12,
 212 C on K6-2.  In fact there's only 11 of decode, but nothing running at 11 has
 213 C been found.  Maybe the fact every second movq is unaligned costs the extra
 214 C 0.5.
 215
 216 L(even_top):
 217         C eax   scratch
 218         C ebx   src end
 219         C ecx   counter, limbs, negative
 220         C edx   inverse
 221         C esi   next limb, adjusted for carry
 222         C edi   dst end
 223         C ebp   carry bit, 0 or -1
 224         C
 225         C mm0   scratch, source limbs
 226         C mm7   twos
 227
 228         imull   %edx, %esi
 229
 230         movl    %esi, -8(%edi,%ecx,4)
 231         movl    PARAM_DIVISOR, %eax
 232
 233         mull    %esi                    C carry limb in edx
 234
 235         movq    -4(%ebx,%ecx,4), %mm0
 236         psrlq   %mm7, %mm0
 237
 238         movd    %mm0, %esi
 239         subl    %ebp, %edx              C apply carry bit
 240
 241         subl    %edx, %esi              C apply carry limb
 242         movl    VAR_INVERSE, %edx
 243
 244         sbbl    %ebp, %ebp              C 0 or -1
 245
 246         incl    %ecx
 247         jnz     L(even_top)
 248
 249
 250 L(even_two):
 251         movd    -4(%ebx), %mm0          C src high limb
 252         psrlq   %mm7, %mm0
 253
 254         imull   %edx, %esi
 255
 256         movl    %esi, -8(%edi)
 257         movl    PARAM_DIVISOR, %eax
 258
 259         mull    %esi                    C carry limb in edx
 260
 261         movd    %mm0, %esi
 262         subl    %ebp, %edx              C apply carry bit
 263
 264         movl    VAR_INVERSE, %eax
 265         subl    %edx, %esi              C apply carry limb
 266
 267         imull   %eax, %esi
 268
 269         movl    %esi, -4(%edi)
 270
 271         popl    %edi
 272         popl    %esi
 273
 274         popl    %ebp
 275         popl    %ebx
 276
 277         emms_or_femms
 278
 279         ret
 280
 281 EPILOGUE()