mpn/x86/pentium/mode1o.asm

   1 dnl  Intel Pentium mpn_modexact_1_odd -- exact division style remainder.
   2
   3 dnl  Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
   4 dnl
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or
   8 dnl  modify it under the terms of the GNU Lesser General Public License as
   9 dnl  published by the Free Software Foundation; either version 3 of the
  10 dnl  License, or (at your option) any later version.
  11 dnl
  12 dnl  The GNU MP Library is distributed in the hope that it will be useful,
  13 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 dnl  Lesser General Public License for more details.
  16 dnl
  17 dnl  You should have received a copy of the GNU Lesser General Public License
  18 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
  19
  20 include(`../config.m4')
  21
  22
  23 C P5: 23.0 cycles/limb
  24
  25
  26 C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
  27 C                               mp_limb_t divisor);
  28 C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
  29 C                                mp_limb_t divisor, mp_limb_t carry);
  30 C
  31 C There seems no way to pair up the two lone instructions in the main loop.
  32 C
  33 C The special case for size==1 saves about 20 cycles (non-PIC), making it
  34 C the same as mpn_mod_1, and in fact making modexact faster than mod_1 at
  35 C all sizes.
  36 C
  37 C Alternatives:
  38 C
  39 C Using mmx for the multiplies might be possible, with pmullw and pmulhw
  40 C having just 3 cycle latencies, but carry bit handling would probably be
  41 C complicated.
  42
  43 defframe(PARAM_CARRY,  16)
  44 defframe(PARAM_DIVISOR,12)
  45 defframe(PARAM_SIZE,   8)
  46 defframe(PARAM_SRC,    4)
  47
  48 dnl  re-using parameter space
  49 define(VAR_INVERSE,`PARAM_SIZE')
  50
  51         TEXT
  52
  53         ALIGN(16)
  54 PROLOGUE(mpn_modexact_1c_odd)
  55 deflit(`FRAME',0)
  56
  57         movl    PARAM_DIVISOR, %eax
  58         movl    PARAM_CARRY, %edx
  59
  60         jmp     L(start_1c)
  61
  62 EPILOGUE()
  63
  64         ALIGN(16)
  65 PROLOGUE(mpn_modexact_1_odd)
  66 deflit(`FRAME',0)
  67
  68         movl    PARAM_DIVISOR, %eax
  69         xorl    %edx, %edx              C carry
  70
  71 L(start_1c):
  72
  73 ifdef(`PIC',`
  74         call    L(here)         FRAME_pushl()
  75 L(here):
  76
  77         shrl    %eax                    C d/2
  78         movl    (%esp), %ecx            C eip
  79
  80         addl    $_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ecx
  81         movl    %ebx, (%esp)            C push ebx
  82
  83         andl    $127, %eax
  84         movl    PARAM_SIZE, %ebx
  85
  86         movl    binvert_limb_table@GOT(%ecx), %ecx
  87         subl    $2, %ebx
  88
  89         movb    (%eax,%ecx), %cl                        C inv 8 bits
  90         jc      L(one_limb)
  91
  92 ',`
  93 dnl non-PIC
  94         shrl    %eax                    C d/2
  95         pushl   %ebx            FRAME_pushl()
  96
  97         movl    PARAM_SIZE, %ebx
  98         andl    $127, %eax
  99
 100         subl    $2, %ebx
 101         jc      L(one_limb)
 102
 103         movb    binvert_limb_table(%eax), %cl           C inv 8 bits
 104 ')
 105
 106         movl    %ecx, %eax
 107         addl    %ecx, %ecx              C 2*inv
 108
 109         imull   %eax, %eax              C inv*inv
 110
 111         imull   PARAM_DIVISOR, %eax     C inv*inv*d
 112
 113         subl    %eax, %ecx              C inv = 2*inv - inv*inv*d
 114
 115         movl    %ecx, %eax
 116         addl    %ecx, %ecx              C 2*inv
 117
 118         imull   %eax, %eax              C inv*inv
 119
 120         imull   PARAM_DIVISOR, %eax     C inv*inv*d
 121
 122         subl    %eax, %ecx              C inv = 2*inv - inv*inv*d
 123         pushl   %esi            FRAME_pushl()
 124
 125         ASSERT(e,`      C d*inv == 1 mod 2^GMP_LIMB_BITS
 126         movl    %ecx, %eax
 127         imull   PARAM_DIVISOR, %eax
 128         cmpl    $1, %eax')
 129
 130         movl    PARAM_SRC, %esi
 131         movl    %ecx, VAR_INVERSE
 132
 133         movl    (%esi), %eax            C src[0]
 134         leal    4(%esi,%ebx,4), %esi    C &src[size-1]
 135
 136         xorl    $-1, %ebx               C -(size-1)
 137         ASSERT(nz)
 138         jmp     L(entry)
 139
 140
 141 C The use of VAR_INVERSE means only a store is needed for that value, rather
 142 C than a push and pop of say %edi.
 143
 144         ALIGN(16)
 145 L(top):
 146         C eax   scratch, low product
 147         C ebx   counter, limbs, negative
 148         C ecx   carry bit
 149         C edx   scratch, high product
 150         C esi   &src[size-1]
 151         C edi
 152         C ebp
 153
 154         mull    PARAM_DIVISOR           C h:dummy = q*d
 155
 156         movl    (%esi,%ebx,4), %eax     C src[i]
 157         subl    %ecx, %edx              C h -= -c
 158
 159 L(entry):
 160         subl    %edx, %eax              C s = src[i] - h
 161
 162         sbbl    %ecx, %ecx              C new -c (0 or -1)
 163
 164         imull   VAR_INVERSE, %eax       C q = s*i
 165
 166         incl    %ebx
 167         jnz     L(top)
 168
 169
 170         mull    PARAM_DIVISOR
 171
 172         movl    (%esi), %eax            C src high
 173         subl    %ecx, %edx              C h -= -c
 174
 175         cmpl    PARAM_DIVISOR, %eax
 176
 177         jbe     L(skip_last)
 178 deflit(FRAME_LAST,FRAME)
 179
 180
 181         subl    %edx, %eax              C s = src[i] - h
 182         popl    %esi            FRAME_popl()
 183
 184         sbbl    %ecx, %ecx              C c (0 or -1)
 185         popl    %ebx            FRAME_popl()
 186
 187         imull   VAR_INVERSE, %eax       C q = s*i
 188
 189         mull    PARAM_DIVISOR           C h:dummy = q*d
 190
 191         movl    %edx, %eax
 192
 193         subl    %ecx, %eax
 194
 195         ret
 196
 197
 198 C When high<divisor can skip last step.
 199
 200 L(skip_last):
 201 deflit(`FRAME',FRAME_LAST)
 202         C eax   src high
 203         C ebx
 204         C ecx
 205         C edx   r
 206         C esi
 207
 208         subl    %eax, %edx      C r-s
 209         popl    %esi            FRAME_popl()
 210
 211         sbbl    %eax, %eax      C -1 if underflow
 212         movl    PARAM_DIVISOR, %ebx
 213
 214         andl    %ebx, %eax      C divisor if underflow
 215         popl    %ebx            FRAME_popl()
 216
 217         addl    %edx, %eax      C addback if underflow
 218
 219         ret
 220
 221
 222 C Special case for size==1 using a division for r = c-a mod d.
 223 C Could look for a-c<d and save a division sometimes, but that doesn't seem
 224 C worth bothering about.
 225
 226 L(one_limb):
 227 deflit(`FRAME',4)
 228         C eax
 229         C ebx   size-2 (==-1)
 230         C ecx
 231         C edx   carry
 232         C esi   src end
 233         C edi
 234         C ebp
 235
 236         movl    %edx, %eax
 237         movl    PARAM_SRC, %edx
 238
 239         movl    PARAM_DIVISOR, %ecx
 240         popl    %ebx            FRAME_popl()
 241
 242         subl    (%edx), %eax            C c-a
 243
 244         sbbl    %edx, %edx
 245         decl    %ecx                    C d-1
 246
 247         andl    %ecx, %edx              C b*d+c-a if c<a, or c-a if c>=a
 248
 249         divl    PARAM_DIVISOR
 250
 251         movl    %edx, %eax
 252
 253         ret
 254
 255 EPILOGUE()