mpn/x86/k6/divrem_1.asm

   1 dnl  AMD K6 mpn_divrem_1 -- mpn by limb division.
   2
   3 dnl  Copyright 1999, 2000, 2001, 2002, 2003, 2007 Free Software Foundation,
   4 dnl  Inc.
   5 dnl
   6 dnl  This file is part of the GNU MP Library.
   7 dnl
   8 dnl  The GNU MP Library is free software; you can redistribute it and/or
   9 dnl  modify it under the terms of the GNU Lesser General Public License as
  10 dnl  published by the Free Software Foundation; either version 3 of the
  11 dnl  License, or (at your option) any later version.
  12 dnl
  13 dnl  The GNU MP Library is distributed in the hope that it will be useful,
  14 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16 dnl  Lesser General Public License for more details.
  17 dnl
  18 dnl  You should have received a copy of the GNU Lesser General Public License
  19 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
  20
  21 include(`../config.m4')
  22
  23
  24 C K6: 20 cycles/limb
  25
  26
  27 C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
  28 C                         mp_srcptr src, mp_size_t size, mp_limb_t divisor);
  29 C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
  30 C                          mp_srcptr src, mp_size_t size, mp_limb_t divisor,
  31 C                          mp_limb_t carry);
  32 C
  33 C The code here is basically the same as mpn/x86/divrem_1.asm, but uses loop
  34 C instead of decl+jnz, since it comes out 2 cycles/limb faster.
  35 C
  36 C A test is done to see if the high limb is less than the divisor, and if so
  37 C one less div is done.  A div is 20 cycles, so assuming high<divisor about
  38 C half the time, then this test saves half that amount.  The branch
  39 C misprediction penalty is less than that.
  40 C
  41 C Back-to-back div instructions run at 20 cycles, the same as the loop here,
  42 C so it seems there's nothing to gain by rearranging the loop.  Pairing the
  43 C mov and loop instructions was found to gain nothing.
  44 C
  45 C Enhancements:
  46 C
  47 C The low-latency K6 multiply might be thought to suit a mul-by-inverse, but
  48 C that algorithm has been found to suffer from the relatively poor carry
  49 C handling on K6 and too many auxiliary instructions.  The fractional part
  50 C however could be done at about 13 c/l, if it mattered enough.
  51
  52 defframe(PARAM_CARRY,  24)
  53 defframe(PARAM_DIVISOR,20)
  54 defframe(PARAM_SIZE,   16)
  55 defframe(PARAM_SRC,    12)
  56 defframe(PARAM_XSIZE,  8)
  57 defframe(PARAM_DST,    4)
     C The offsets above step by 4 in the same order as the arguments in the
     C prototypes (dst, xsize, src, size, divisor, carry).  Within this file
     C PARAM_CARRY is read only by mpn_divrem_1c.
     C NOTE(review): the offsets are presumably byte displacements from the
     C stack pointer at function entry, with FRAME compensating for later
     C pushes -- confirm against the defframe definition in the m4 includes.
  58
  59         TEXT
  60
  61         ALIGN(32)
  62 PROLOGUE(mpn_divrem_1c)
  63 deflit(`FRAME',0)
     C Entry point with an explicit initial carry.  It saves edi, esi, ebx and
     C ebp, loads the register set documented at L(integer_top), puts
     C PARAM_CARRY in edx as the high half of the first dividend, and then
     C jumps into the loops that live in mpn_divrem_1.  There is no ret in
     C this block; the return goes through L(done), whose four pops match the
     C four pushes made here.
     C
     C Unlike mpn_divrem_1, the high source limb is not compared against the
     C divisor here, so each of the size integer limbs gets its own div.
     C
     C NOTE(review): divl faults unless edx < divisor, so this entry appears
     C to rely on the caller passing carry < divisor -- confirm against the
     C documented contract of mpn_divrem_1c.
  64
  65         movl    PARAM_SIZE, %ecx        C ecx = size (integer loop counter)
  66         pushl   %edi            FRAME_pushl()
  67
  68         movl    PARAM_SRC, %edi         C edi = src
  69         pushl   %esi            FRAME_pushl()
  70
  71         movl    PARAM_DIVISOR, %esi     C esi = divisor
  72         pushl   %ebx            FRAME_pushl()
  73
  74         movl    PARAM_DST, %ebx         C ebx = dst
  75         pushl   %ebp            FRAME_pushl()
  76
  77         movl    PARAM_XSIZE, %ebp       C ebp = xsize
  78         orl     %ecx, %ecx              C test size, ZF set if size==0
  79
  80         movl    PARAM_CARRY, %edx       C edx = carry, movl leaves the orl flags alone
  81         jz      L(fraction)             C if size==0
  82
  83         leal    -4(%ebx,%ebp,4), %ebx   C dst one limb below integer part
  84         jmp     L(integer_top)          C load src high limb and start dividing
  85
  86 EPILOGUE()
  87
  88
  89         ALIGN(16)
  90 PROLOGUE(mpn_divrem_1)
  91 deflit(`FRAME',0)
     C Divide the size-limb number at src by the single limb divisor, then
     C continue the division for xsize further limbs using zero limbs as
     C input.  Integer quotient limbs are stored at dst[xsize] up to
     C dst[xsize+size-1], the xsize fraction limbs at dst[0] up to
     C dst[xsize-1], and the final remainder is returned in eax.
     C
     C Paths through the code:
     C   size==0         L(size_zero) zero fills the xsize limbs at dst and
     C                   returns 0.
     C   high>=divisor   the integer loop is entered at L(integer_entry) with
     C                   edx=0 and the high limb already in eax.
     C   high<divisor    a zero is stored as the top quotient limb and the high
     C                   limb becomes the initial remainder in edx, saving
     C                   one div.
     C
     C Both loops count ecx down to zero with the loop instruction and carry
     C the remainder from one div to the next in edx.
  92
  93         movl    PARAM_SIZE, %ecx        C ecx = size
  94         pushl   %edi            FRAME_pushl()
  95
  96         movl    PARAM_SRC, %edi         C edi = src
  97         pushl   %esi            FRAME_pushl()
  98
  99         movl    PARAM_DIVISOR, %esi     C esi = divisor
 100         orl     %ecx,%ecx               C test size, ZF set if size==0
 101
 102         jz      L(size_zero)            C taken with only edi and esi pushed
 103         pushl   %ebx            FRAME_pushl()
 104
 105         movl    -4(%edi,%ecx,4), %eax   C src high limb
 106         xorl    %edx, %edx              C edx = 0, initial remainder
 107
 108         movl    PARAM_DST, %ebx         C ebx = dst
 109         pushl   %ebp            FRAME_pushl()
 110
 111         movl    PARAM_XSIZE, %ebp       C ebp = xsize
 112         cmpl    %esi, %eax              C high limb versus divisor
 113
 114         leal    -4(%ebx,%ebp,4), %ebx   C dst one limb below integer part
 115         jae     L(integer_entry)        C unsigned high>=divisor, leal kept the cmpl flags
 116
 117
 118         C high<divisor, so high of dst is zero, and avoid one div
 119
 120         movl    %edx, (%ebx,%ecx,4)     C dst[xsize+size-1] = 0, edx is still zero
 121         decl    %ecx                    C one integer limb fewer, sets ZF for the jz
 122
 123         movl    %eax, %edx              C high limb is the starting remainder
 124         jz      L(fraction)             C size was 1, no integer div left
 125
 126
 127 L(integer_top):
 128         C eax   scratch (quotient)
 129         C ebx   dst+4*xsize-4
 130         C ecx   counter
 131         C edx   scratch (remainder)
 132         C esi   divisor
 133         C edi   src
 134         C ebp   xsize
 135
 136         movl    -4(%edi,%ecx,4), %eax   C eax = src[ecx-1]
 137 L(integer_entry):
 138
 139         divl    %esi                    C edx:eax / divisor, eax = quotient, edx = remainder
 140
 141         movl    %eax, (%ebx,%ecx,4)     C dst[xsize+ecx-1] = quotient limb
 142         loop    L(integer_top)          C ecx--, repeat while ecx != 0
 143
 144
 145 L(fraction):
             C ecx is zero on every path reaching this label, so the orl below
             C copies xsize into ecx and sets ZF when xsize==0.
 146         orl     %ebp, %ecx
 147         jz      L(done)                 C no fraction limbs wanted
 148
 149         movl    PARAM_DST, %ebx         C ebx = dst, start of the fraction limbs
 150
 151
 152 L(fraction_top):
 153         C eax   scratch (quotient)
 154         C ebx   dst
 155         C ecx   counter
 156         C edx   scratch (remainder)
 157         C esi   divisor
 158         C edi   (not used in this loop)
 159         C ebp   (not used in this loop)
 160
 161         xorl    %eax, %eax              C low dividend limb is zero
 162
 163         divl    %esi                    C edx:0 / divisor, eax = quotient, edx = remainder
 164
 165         movl    %eax, -4(%ebx,%ecx,4)   C dst[ecx-1] = quotient limb
 166         loop    L(fraction_top)         C ecx--, repeat while ecx != 0
 167
 168
 169 L(done):
             C Common exit for the four-push paths: restore registers in reverse
             C push order and return the last remainder.
 170         popl    %ebp
 171         movl    %edx, %eax              C return value = final remainder
 172         popl    %ebx
 173         popl    %esi
 174         popl    %edi
 175         ret
 176
 177
 178 L(size_zero):
             C Reached from mpn_divrem_1 with size==0 and only edi and esi pushed.
             C NOTE(review): FRAME is reset to 8 here presumably so the PARAM_
             C offsets account for those two pushes -- confirm against the
             C FRAME_pushl and defframe definitions.
 179 deflit(`FRAME',8)
 180         movl    PARAM_XSIZE, %ecx       C ecx = number of limbs to clear
 181         xorl    %eax, %eax              C zero to store, also the return value
 182
 183         movl    PARAM_DST, %edi         C edi = dst, the stosl destination
 184
 185         cld     C better safe than sorry, see mpn/x86/README
 186
             C rep stosl stores eax at edi, advancing edi, ecx times.
 187         rep
 188         stosl
 189
 190         popl    %esi
 191         popl    %edi
 192         ret
 193 EPILOGUE()