mpn/x86/k6/gcd_1.asm

   1 dnl  AMD K6 mpn_gcd_1 -- mpn by 1 gcd.
   2
   3 dnl  Copyright 2000, 2001, 2002, 2004 Free Software Foundation, Inc.
   4 dnl
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or
   8 dnl  modify it under the terms of the GNU Lesser General Public License as
   9 dnl  published by the Free Software Foundation; either version 3 of the
  10 dnl  License, or (at your option) any later version.
  11 dnl
  12 dnl  The GNU MP Library is distributed in the hope that it will be useful,
  13 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 dnl  Lesser General Public License for more details.
  16 dnl
  17 dnl  You should have received a copy of the GNU Lesser General Public License
  18 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
  19
  20 include(`../config.m4')
  21
  22
  23 C K6: 9.5 cycles/bit (approx)   1x1 gcd
  24 C     11.0 cycles/limb          Nx1 reduction (modexact_1_odd)
  25
  26
  27 C mp_limb_t mpn_gcd_1 (mp_srcptr src, mp_size_t size, mp_limb_t y);
  28 C
  29 C This code is nothing very special, but offers a speedup over what gcc 2.95
  30 C can do with mpn/generic/gcd_1.c.
  31 C
  32 C Future:
  33 C
  34 C Using a lookup table to count trailing zeros seems a touch quicker, but
  35 C after a slightly longer startup.  Might be worthwhile if an mpn_gcd_2 used
  36 C it too.
  37
  38
  39 dnl  If size==1 and x (the larger operand) is more than DIV_THRESHOLD bits
  40 dnl  bigger than y, then a division x%y is done to reduce it.
  41 dnl
  42 dnl  A divl is 20 cycles and the loop runs at about 9.5 cycles/bitpair so
  43 dnl  there should be an advantage in the divl at about 4 or 5 bits, which is
  44 dnl  what's found.
  45
  46 deflit(DIV_THRESHOLD, 5)
  47
  48
  49 defframe(PARAM_LIMB, 12)
  50 defframe(PARAM_SIZE,  8)
  51 defframe(PARAM_SRC,   4)
  52
  53         TEXT
  54         ALIGN(16)
  55
  56 PROLOGUE(mpn_gcd_1)
  57 deflit(`FRAME',0)
  58
  59         ASSERT(ne, `cmpl $0, PARAM_LIMB')
  60         ASSERT(ae, `cmpl $1, PARAM_SIZE')
  61
  62
  63         movl    PARAM_SRC, %eax
  64         pushl   %ebx                    FRAME_pushl()
  65
  66         movl    PARAM_LIMB, %edx
  67         movl    $-1, %ecx
  68
  69         movl    (%eax), %ebx            C src low limb
  70
  71         movl    %ebx, %eax              C src low limb
  72         orl     %edx, %ebx
  73
  74 L(common_twos):
  75         shrl    %ebx
  76         incl    %ecx
  77
  78         jnc     L(common_twos)          C 1/4 chance on random data
  79         shrl    %cl, %edx               C y
  80
  81         cmpl    $1, PARAM_SIZE
  82         ja      L(size_two_or_more)
  83
  84
  85         ASSERT(nz, `orl %eax, %eax')    C should have src limb != 0
  86
  87         shrl    %cl, %eax               C x
  88
  89
  90         C Swap if necessary to make x>=y.  Measures a touch quicker as a
  91         C jump than a branch free calculation.
  92         C
  93         C eax   x
  94         C ebx
  95         C ecx   common twos
  96         C edx   y
  97
  98         movl    %eax, %ebx
  99         cmpl    %eax, %edx
 100
 101         jb      L(noswap)
 102         movl    %edx, %eax
 103
 104         movl    %ebx, %edx
 105         movl    %eax, %ebx
 106 L(noswap):
 107
 108
 109         C See if it's worth reducing x with a divl.
 110         C
 111         C eax   x
 112         C ebx   x
 113         C ecx   common twos
 114         C edx   y
 115
 116         shrl    $DIV_THRESHOLD, %ebx
 117
 118         cmpl    %ebx, %edx
 119         ja      L(nodiv)
 120
 121
 122         C Reduce x to x%y.
 123         C
 124         C eax   x
 125         C ebx
 126         C ecx   common twos
 127         C edx   y
 128
 129         movl    %edx, %ebx
 130         xorl    %edx, %edx
 131
 132         divl    %ebx
 133
 134         orl     %edx, %edx      C y
 135         nop     C code alignment
 136
 137         movl    %ebx, %eax      C x
 138         jz      L(done_shll)
 139 L(nodiv):
 140
 141
 142         C eax   x
 143         C ebx
 144         C ecx   common twos
 145         C edx   y
 146         C esi
 147         C edi
 148         C ebp
 149
 150 L(strip_y):
 151         shrl    %edx
 152         jnc     L(strip_y)
 153
 154         leal    1(%edx,%edx), %edx
 155         movl    %ecx, %ebx      C common twos
 156
 157         leal    1(%eax), %ecx
 158         jmp     L(strip_x_and)
 159
 160
 161 C Calculating a %cl shift based on the low bit 0 or 1 avoids doing a branch
 162 C on a 50/50 chance of 0 or 1.  The chance of the next bit also being 0 is
 163 C only 1/4.
 164 C
 165 C A second computed %cl shift was tried, but that measured a touch slower
 166 C than branching back.
 167 C
 168 C A branch-free abs(x-y) and min(x,y) calculation was tried, but that
 169 C measured about 1 cycle/bit slower.
 170
 171         C eax   x
 172         C ebx   common twos
 173         C ecx   scratch
 174         C edx   y
 175
 176         ALIGN(4)
 177 L(swap):
 178         addl    %eax, %edx      C x-y+y = x
 179         negl    %eax            C -(x-y) = y-x
 180
 181 L(strip_x):
 182         shrl    %eax            C odd-odd = even, so always one to strip
 183         ASSERT(nz)
 184
 185 L(strip_x_leal):
 186         leal    1(%eax), %ecx
 187
 188 L(strip_x_and):
 189         andl    $1, %ecx        C (x^1)&1
 190
 191         shrl    %cl, %eax       C shift if x even
 192
 193         testb   $1, %al
 194         jz      L(strip_x)
 195
 196         ASSERT(nz,`testl $1, %eax')     C x, y odd
 197         ASSERT(nz,`testl $1, %edx')
 198
 199         subl    %edx, %eax
 200         jb      L(swap)
 201         ja      L(strip_x)
 202
 203
 204         movl    %edx, %eax
 205         movl    %ebx, %ecx
 206
 207 L(done_shll):
 208         shll    %cl, %eax
 209         popl    %ebx
 210
 211         ret
 212
 213
 214 C -----------------------------------------------------------------------------
 215 C Two or more limbs.
 216 C
 217 C x={src,size} is reduced modulo y using either a plain mod_1 style
 218 C remainder, or a modexact_1 style exact division.
 219
 220 deflit(MODEXACT_THRESHOLD, ifdef(`PIC', 4, 4))
 221
 222         ALIGN(8)
 223 L(size_two_or_more):
 224         C eax
 225         C ebx
 226         C ecx   common twos
 227         C edx   y, without common twos
 228         C esi
 229         C edi
 230         C ebp
 231
 232 deflit(FRAME_TWO_OR_MORE, FRAME)
 233
 234         pushl   %edi            defframe_pushl(SAVE_EDI)
 235         movl    PARAM_SRC, %ebx
 236
 237 L(y_twos):
 238         shrl    %edx
 239         jnc     L(y_twos)
 240
 241         movl    %ecx, %edi              C common twos
 242         movl    PARAM_SIZE, %ecx
 243
 244         pushl   %esi            defframe_pushl(SAVE_ESI)
 245         leal    1(%edx,%edx), %esi      C y (odd)
 246
 247         movl    -4(%ebx,%ecx,4), %eax   C src high limb
 248
 249         cmpl    %edx, %eax              C carry if high<divisor
 250
 251         sbbl    %edx, %edx              C -1 if high<divisor
 252
 253         addl    %edx, %ecx              C skip one limb if high<divisor
 254         andl    %eax, %edx
 255
 256         cmpl    $MODEXACT_THRESHOLD, %ecx
 257         jae     L(modexact)
 258
 259
 260 L(divide_top):
 261         C eax   scratch (quotient)
 262         C ebx   src
 263         C ecx   counter, size-1 to 1
 264         C edx   carry (remainder)
 265         C esi   divisor (odd)
 266         C edi
 267         C ebp
 268
 269         movl    -4(%ebx,%ecx,4), %eax
 270         divl    %esi
 271         loop    L(divide_top)
 272
 273
 274         movl    %edx, %eax      C x
 275         movl    %esi, %edx      C y (odd)
 276
 277         movl    %edi, %ebx      C common twos
 278         popl    %esi
 279
 280         popl    %edi
 281         leal    1(%eax), %ecx
 282
 283         orl     %eax, %eax
 284         jnz     L(strip_x_and)
 285
 286
 287         movl    %ebx, %ecx
 288         movl    %edx, %eax
 289
 290         shll    %cl, %eax
 291         popl    %ebx
 292
 293         ret
 294
 295
 296         ALIGN(8)
 297 L(modexact):
 298         C eax
 299         C ebx   src ptr
 300         C ecx   size or size-1
 301         C edx
 302         C esi   y odd
 303         C edi   common twos
 304         C ebp
 305
 306         movl    PARAM_SIZE, %eax
 307         pushl   %esi            FRAME_pushl()
 308
 309         pushl   %eax            FRAME_pushl()
 310
 311         pushl   %ebx            FRAME_pushl()
 312
 313 ifdef(`PIC',`
 314         nop     C code alignment
 315         call    L(movl_eip_ebx)
 316 L(here):
 317         addl    $_GLOBAL_OFFSET_TABLE_, %ebx
 318         call    GSYM_PREFIX`'mpn_modexact_1_odd@PLT
 319 ',`
 320         call    GSYM_PREFIX`'mpn_modexact_1_odd
 321 ')
 322
 323         movl    %esi, %edx              C y odd
 324         movl    SAVE_ESI, %esi
 325
 326         movl    %edi, %ebx              C common twos
 327         movl    SAVE_EDI, %edi
 328
 329         addl    $eval(FRAME - FRAME_TWO_OR_MORE), %esp
 330         orl     %eax, %eax
 331
 332         leal    1(%eax), %ecx
 333         jnz     L(strip_x_and)
 334
 335
 336         movl    %ebx, %ecx
 337         movl    %edx, %eax
 338
 339         shll    %cl, %eax
 340         popl    %ebx
 341
 342         ret
 343
 344
 345 ifdef(`PIC',`
 346 L(movl_eip_ebx):
 347         movl    (%esp), %ebx
 348         ret_internal
 349 ')
 350
 351 EPILOGUE()