mpn/x86/k7/dive_1.asm

   1 dnl  AMD K7 mpn_divexact_1 -- mpn by limb exact division.
   2
   3 dnl  Copyright 2001, 2002, 2004, 2007 Free Software Foundation, Inc.
   4 dnl
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or
   8 dnl  modify it under the terms of the GNU Lesser General Public License as
   9 dnl  published by the Free Software Foundation; either version 3 of the
  10 dnl  License, or (at your option) any later version.
  11 dnl
  12 dnl  The GNU MP Library is distributed in the hope that it will be useful,
  13 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 dnl  Lesser General Public License for more details.
  16 dnl
  17 dnl  You should have received a copy of the GNU Lesser General Public License
  18 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
  19
  20 include(`../config.m4')
  21
  22
  23 C          cycles/limb
  24 C Athlon:     11.0
  25 C Hammer:      9.0
  26
  27
  28 C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
  29 C                      mp_limb_t divisor);
  30 C
  31 C The dependent chain is mul+imul+sub for 11 cycles and that speed is
  32 C achieved with no special effort.  The load and shrld latencies are hidden
  33 C by out of order execution.
  34 C
  35 C It's a touch faster on size==1 to use the mul-by-inverse than divl.
  36
  37 defframe(PARAM_DIVISOR,16)   C divisor, later overwritten with d without twos
  38 defframe(PARAM_SIZE,   12)   C size in limbs
  39 defframe(PARAM_SRC,    8)    C src pointer
  40 defframe(PARAM_DST,    4)    C dst pointer
  41         C Slots at negative offsets hold four register saves and two variables.
  42 defframe(SAVE_EBX,     -4)
  43 defframe(SAVE_ESI,     -8)
  44 defframe(SAVE_EDI,    -12)
  45 defframe(SAVE_EBP,    -16)
  46 defframe(VAR_INVERSE, -20)   C inverse of the odd part of the divisor
  47 defframe(VAR_DST_END, -24)   C dst end pointer, parked while edi is scratch
  48         C Six 4-byte slots down to offset -24, hence 24 bytes.
  49 deflit(STACK_SPACE, 24)
  50
  51         TEXT
  52
  53         ALIGN(16)
  54 PROLOGUE(mpn_divexact_1)
  55 deflit(`FRAME',0)
  56         C Strip twos from d, invert the odd part mod 2^32, then each quotient limb is (shifted src limb - carries) * inverse.
  57         movl    PARAM_DIVISOR, %eax     C d
  58         subl    $STACK_SPACE, %esp      deflit(`FRAME',STACK_SPACE)
  59         movl    $-1, %ecx               C shift count, -1 as strip_twos increments first
  60
  61         movl    %ebp, SAVE_EBP
  62         movl    PARAM_SIZE, %ebp        C size
  63
  64         movl    %esi, SAVE_ESI
  65         movl    %edi, SAVE_EDI
  66
  67         C If there's usually only one or two trailing zero bits then this
  68         C should be faster than bsfl.
  69 L(strip_twos):
  70         incl    %ecx                    C one more low zero bit counted
  71         shrl    %eax                    C low bit of d into the carry flag
  72         jnc     L(strip_twos)           C repeat until a 1 bit comes out
  73         C Now ecx = count of low zero bits and eax = odd part of d >> 1.  A zero d would never exit the loop.
  74         movl    %ebx, SAVE_EBX
  75         leal    1(%eax,%eax), %ebx      C d without twos
  76         andl    $127, %eax              C d/2, 7 bits
  77
  78 ifdef(`PIC',`
  79         LEA(    binvert_limb_table, %edx)
  80         movzbl  (%eax,%edx), %eax               C inv 8 bits
  81 ',`
  82         movzbl  binvert_limb_table(%eax), %eax  C inv 8 bits
  83 ')
  84         C Two Newton steps follow, taking the 8 bit table inverse to 16 bits and then 32.
  85         leal    (%eax,%eax), %edx       C 2*inv
  86         movl    %ebx, PARAM_DIVISOR     C d without twos
  87
  88         imull   %eax, %eax              C inv*inv
  89
  90         movl    PARAM_SRC, %esi         C src
  91         movl    PARAM_DST, %edi         C dst
  92
  93         imull   %ebx, %eax              C inv*inv*d
  94
  95         subl    %eax, %edx              C inv = 2*inv - inv*inv*d
  96         leal    (%edx,%edx), %eax       C 2*inv
  97
  98         imull   %edx, %edx              C inv*inv
  99
 100         leal    (%esi,%ebp,4), %esi     C src end
 101         leal    (%edi,%ebp,4), %edi     C dst end
 102         negl    %ebp                    C -size
 103
 104         imull   %ebx, %edx              C inv*inv*d
 105
 106         subl    %edx, %eax              C inv = 2*inv - inv*inv*d
 107
 108         ASSERT(e,`      C expect d*inv == 1 mod 2^GMP_LIMB_BITS
 109         pushl   %eax    FRAME_pushl()
 110         imull   PARAM_DIVISOR, %eax
 111         cmpl    $1, %eax
 112         popl    %eax    FRAME_popl()')
 113
 114         movl    %eax, VAR_INVERSE       C inverse goes to memory, eax carries limbs from here
 115         movl    (%esi,%ebp,4), %eax     C src[0]
 116
 117         incl    %ebp                    C -(size-1), zero if size is 1
 118         jz      L(one)
 119
 120         movl    (%esi,%ebp,4), %edx     C src[1]
 121
 122         shrdl(  %cl, %edx, %eax)        C low limb of src >> shift
 123
 124         movl    %edi, VAR_DST_END       C the loop borrows edi as scratch
 125         xorl    %ebx, %ebx              C no carry bit yet
 126         jmp     L(entry)
 127
 128         ALIGN(8)
 129 L(top):
 130         C eax   q
 131         C ebx   carry bit, 0 or 1
 132         C ecx   shift
 133         C edx   scratch, receives the carry limb from mull
 134         C esi   src end
 135         C edi   dst end, also scratch for the next src limb
 136         C ebp   counter, limbs, negative
 137
 138         mull    PARAM_DIVISOR           C carry limb in edx
 139
 140         movl    -4(%esi,%ebp,4), %eax   C current src limb
 141         movl    (%esi,%ebp,4), %edi     C next src limb, supplies the bits shifted in
 142
 143         shrdl(  %cl, %edi, %eax)
 144
 145         subl    %ebx, %eax              C apply carry bit
 146         setc    %bl                     C borrow from that becomes the new carry bit
 147         movl    VAR_DST_END, %edi       C edi back to dst end
 148
 149         subl    %edx, %eax              C apply carry limb
 150         adcl    $0, %ebx                C plus the borrow from the carry limb
 151
 152 L(entry):
 153         imull   VAR_INVERSE, %eax       C q = limb * inverse mod 2^32
 154
 155         movl    %eax, -4(%edi,%ebp,4)   C store q
 156         incl    %ebp
 157         jnz     L(top)
 158         C High limb: no limb above to shift in so plain shrl, and no carry out is computed.
 159
 160         mull    PARAM_DIVISOR           C carry limb in edx
 161
 162         movl    -4(%esi), %eax          C src high limb
 163         shrl    %cl, %eax
 164         movl    SAVE_ESI, %esi          C register restores are mixed in with the arithmetic
 165
 166         subl    %ebx, %eax              C apply carry bit
 167         movl    SAVE_EBX, %ebx
 168         movl    SAVE_EBP, %ebp
 169
 170         subl    %edx, %eax              C apply carry limb
 171
 172         imull   VAR_INVERSE, %eax
 173
 174         movl    %eax, -4(%edi)          C dst high limb
 175         movl    SAVE_EDI, %edi
 176         addl    $STACK_SPACE, %esp
 177
 178         ret
 179         C size==1 case: eax = src[0], edi = dst end, no carries involved.
 180
 181 L(one):
 182         shrl    %cl, %eax
 183         movl    SAVE_ESI, %esi
 184         movl    SAVE_EBX, %ebx
 185
 186         imull   VAR_INVERSE, %eax
 187
 188         movl    SAVE_EBP, %ebp
 189         movl    %eax, -4(%edi)
 190
 191         movl    SAVE_EDI, %edi
 192         addl    $STACK_SPACE, %esp
 193
 194         ret
 195
 196 EPILOGUE()