mpn/x86/k6/aorsmul_1.asm

   1 dnl  AMD K6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
   2
   3 dnl  Copyright 1999, 2000, 2001, 2002, 2003, 2005 Free Software Foundation,
   4 dnl  Inc.
   5 dnl
   6 dnl  This file is part of the GNU MP Library.
   7 dnl
   8 dnl  The GNU MP Library is free software; you can redistribute it and/or
   9 dnl  modify it under the terms of the GNU Lesser General Public License as
  10 dnl  published by the Free Software Foundation; either version 3 of the
  11 dnl  License, or (at your option) any later version.
  12 dnl
  13 dnl  The GNU MP Library is distributed in the hope that it will be useful,
  14 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16 dnl  Lesser General Public License for more details.
  17 dnl
  18 dnl  You should have received a copy of the GNU Lesser General Public License
  19 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
  20
  21 include(`../config.m4')
  22
  23
  24 C                           cycles/limb
  25 C P5:
  26 C P6 model 0-8,10-12)            5.94
  27 C P6 model 9  (Banias)
  28 C P6 model 13 (Dothan)           5.57
  29 C P4 model 0  (Willamette)
  30 C P4 model 1  (?)
  31 C P4 model 2  (Northwood)
  32 C P4 model 3  (Prescott)
  33 C P4 model 4  (Nocona)
  34 C K6:                           7.65-8.5 (data dependent)
  35 C K7:
  36 C K8:
  37
  38
  39 dnl  K6:           large multipliers  small multipliers
  40 dnl  UNROLL_COUNT    cycles/limb       cycles/limb
  41 dnl        4             9.5              7.78
  42 dnl        8             9.0              7.78
  43 dnl       16             8.4              7.65
  44 dnl       32             8.4              8.2
  45 dnl
  46 dnl  Maximum possible unrolling with the current code is 32.
  47 dnl
  48 dnl  Unrolling to 16 limbs/loop makes the unrolled loop fit exactly in a 256
  49 dnl  byte block, which might explain the good speed at that unrolling.
  50
  51 deflit(UNROLL_COUNT, 16)
  52
  53
  54 ifdef(`OPERATION_addmul_1', `
  55         define(M4_inst,        addl)
  56         define(M4_function_1,  mpn_addmul_1)
  57         define(M4_function_1c, mpn_addmul_1c)
  58 ',`ifdef(`OPERATION_submul_1', `
  59         define(M4_inst,        subl)
  60         define(M4_function_1,  mpn_submul_1)
  61         define(M4_function_1c, mpn_submul_1c)
  62 ',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
  63 ')')')
  64
  65 MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
  66
  67
  68 C mp_limb_t mpn_addmul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
  69 C                         mp_limb_t mult);
  70 C mp_limb_t mpn_addmul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
  71 C                          mp_limb_t mult, mp_limb_t carry);
  72 C mp_limb_t mpn_submul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
  73 C                         mp_limb_t mult);
  74 C mp_limb_t mpn_submul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
  75 C                          mp_limb_t mult, mp_limb_t carry);
  76 C
  77 C The jadcl0()s in the unrolled loop makes the speed data dependent.  Small
  78 C multipliers (most significant few bits clear) result in few carry bits and
  79 C speeds up to 7.65 cycles/limb are attained.  Large multipliers (most
  80 C significant few bits set) make the carry bits 50/50 and lead to something
  81 C more like 8.4 c/l.  With adcl's both of these would be 9.3 c/l.
  82 C
  83 C It's important that the gains for jadcl0 on small multipliers don't come
  84 C at the cost of slowing down other data.  Tests on uniformly distributed
  85 C random data, designed to confound branch prediction, show about a 7%
  86 C speed-up using jadcl0 over adcl (8.93 versus 9.57 cycles/limb, with all
  87 C overheads included).
  88 C
  89 C In the simple loop, jadcl0() measures slower than adcl (11.9-14.7 versus
  90 C 11.0 cycles/limb), and hence isn't used.
  91 C
  92 C In the simple loop, note that running ecx from negative to zero and using
  93 C it as an index in the two movs wouldn't help.  It would save one
  94 C instruction (2*addl+loop becoming incl+jnz), but there's nothing unpaired
  95 C that would be collapsed by this.
  96 C
  97 C Attempts at a simpler main loop, with less unrolling, haven't yielded much
  98 C success, generally running over 9 c/l.
  99 C
 100 C
 101 C jadcl0
 102 C ------
 103 C
 104 C jadcl0() being faster than adcl $0 seems to be an artifact of two things,
 105 C firstly the instruction decoding and secondly the fact that there's a
 106 C carry bit for the jadcl0 only on average about 1/4 of the time.
 107 C
 108 C The code in the unrolled loop decodes something like the following.
 109 C
 110 C                                         decode cycles
 111 C               mull    %ebp                    2
 112 C               M4_inst %esi, disp(%edi)        1
 113 C               adcl    %eax, %ecx              2
 114 C               movl    %edx, %esi            \ 1
 115 C               jnc     1f                    /
 116 C               incl    %esi                  \ 1
 117 C       1:      movl    disp(%ebx), %eax      /
 118 C                                              ---
 119 C                                               7
 120 C
 121 C In a back-to-back style test this measures 7 with the jnc not taken, or 8
 122 C with it taken (both when correctly predicted).  This is opposite to the
 123 C measurements showing small multipliers running faster than large ones.
 124 C Don't really know why.
 125 C
 126 C It's not clear how much branch misprediction might be costing.  The K6
 127 C doco says it will be 1 to 4 cycles, but presumably it's near the low end
 128 C of that range to get the measured results.
 129 C
 130 C
 131 C In the code the two carries are more or less the preceding mul product and
 132 C the calculation is roughly
 133 C
 134 C       x*y + u*b+v
 135 C
 136 C where b=2^32 is the size of a limb, x*y is the two carry limbs, and u and
 137 C v are the two limbs it's added to (being the low of the next mul, and a
 138 C limb from the destination).
 139 C
 140 C To get a carry requires x*y+u*b+v >= b^2, which is u*b+v >= b^2-x*y, and
 141 C there are b^2-(b^2-x*y) = x*y many such values, giving a probability of
 142 C x*y/b^2.  If x, y, u and v are random and uniformly distributed between 0
 143 C and b-1, then the total probability can be summed over x and y,
 144 C
 145 C        1    b-1 b-1 x*y    1    b*(b-1)   b*(b-1)
 146 C       --- * sum sum --- = --- * ------- * ------- = 1/4
 147 C       b^2   x=0 y=1 b^2   b^4      2         2
 148 C
 149 C Actually it's a very tiny bit less than 1/4 of course.  If y is fixed,
 150 C then the probability is 1/2*y/b thus varying linearly between 0 and 1/2.
 151
 152
 153 ifdef(`PIC',`
 154 deflit(UNROLL_THRESHOLD, 9)
 155 ',`
 156 deflit(UNROLL_THRESHOLD, 6)
 157 ')
 158
 159 defframe(PARAM_CARRY,     20)
 160 defframe(PARAM_MULTIPLIER,16)
 161 defframe(PARAM_SIZE,      12)
 162 defframe(PARAM_SRC,       8)
 163 defframe(PARAM_DST,       4)
 164
 165         TEXT
 166         ALIGN(32)
 167
 168 PROLOGUE(M4_function_1c)
 169         pushl   %esi
 170 deflit(`FRAME',4)
 171         movl    PARAM_CARRY, %esi
 172         jmp     L(start_nc)
 173 EPILOGUE()
 174
 175 PROLOGUE(M4_function_1)
 176         push    %esi
 177 deflit(`FRAME',4)
 178         xorl    %esi, %esi      C initial carry
 179
 180 L(start_nc):
 181         movl    PARAM_SIZE, %ecx
 182         pushl   %ebx
 183 deflit(`FRAME',8)
 184
 185         movl    PARAM_SRC, %ebx
 186         pushl   %edi
 187 deflit(`FRAME',12)
 188
 189         cmpl    $UNROLL_THRESHOLD, %ecx
 190         movl    PARAM_DST, %edi
 191
 192         pushl   %ebp
 193 deflit(`FRAME',16)
 194         jae     L(unroll)
 195
 196
 197         C simple loop
 198
 199         movl    PARAM_MULTIPLIER, %ebp
 200
 201 L(simple):
 202         C eax   scratch
 203         C ebx   src
 204         C ecx   counter
 205         C edx   scratch
 206         C esi   carry
 207         C edi   dst
 208         C ebp   multiplier
 209
 210         movl    (%ebx), %eax
 211         addl    $4, %ebx
 212
 213         mull    %ebp
 214
 215         addl    $4, %edi
 216         addl    %esi, %eax
 217
 218         adcl    $0, %edx
 219
 220         M4_inst %eax, -4(%edi)
 221
 222         adcl    $0, %edx
 223
 224         movl    %edx, %esi
 225         loop    L(simple)
 226
 227
 228         popl    %ebp
 229         popl    %edi
 230
 231         popl    %ebx
 232         movl    %esi, %eax
 233
 234         popl    %esi
 235         ret
 236
 237
 238
 239 C -----------------------------------------------------------------------------
 240 C The unrolled loop uses a "two carry limbs" scheme.  At the top of the loop
 241 C the carries are ecx=lo, esi=hi, then they swap for each limb processed.
 242 C For the computed jump an odd size means they start one way around, an even
 243 C size the other.
 244 C
 245 C VAR_JUMP holds the computed jump temporarily because there's not enough
 246 C registers at the point of doing the mul for the initial two carry limbs.
 247 C
 248 C The add/adc for the initial carry in %esi is necessary only for the
 249 C mpn_addmul/submul_1c entry points.  Duplicating the startup code to
 250 C eliminate this for the plain mpn_add/submul_1 doesn't seem like a good
 251 C idea.
 252
 253 dnl  overlapping with parameters already fetched
 254 define(VAR_COUNTER, `PARAM_SIZE')
 255 define(VAR_JUMP,    `PARAM_DST')
 256
 257 L(unroll):
 258         C eax
 259         C ebx   src
 260         C ecx   size
 261         C edx
 262         C esi   initial carry
 263         C edi   dst
 264         C ebp
 265
 266         movl    %ecx, %edx
 267         decl    %ecx
 268
 269         subl    $2, %edx
 270         negl    %ecx
 271
 272         shrl    $UNROLL_LOG2, %edx
 273         andl    $UNROLL_MASK, %ecx
 274
 275         movl    %edx, VAR_COUNTER
 276         movl    %ecx, %edx
 277
 278         shll    $4, %edx
 279         negl    %ecx
 280
 281         C 15 code bytes per limb
 282 ifdef(`PIC',`
 283         call    L(pic_calc)
 284 L(here):
 285 ',`
 286         leal    L(entry) (%edx,%ecx,1), %edx
 287 ')
 288         movl    (%ebx), %eax            C src low limb
 289
 290         movl    PARAM_MULTIPLIER, %ebp
 291         movl    %edx, VAR_JUMP
 292
 293         mull    %ebp
 294
 295         addl    %esi, %eax      C initial carry (from _1c)
 296         jadcl0( %edx)
 297
 298
 299         leal    4(%ebx,%ecx,4), %ebx
 300         movl    %edx, %esi      C high carry
 301
 302         movl    VAR_JUMP, %edx
 303         leal    (%edi,%ecx,4), %edi
 304
 305         testl   $1, %ecx
 306         movl    %eax, %ecx      C low carry
 307
 308         jz      L(noswap)
 309         movl    %esi, %ecx      C high,low carry other way around
 310
 311         movl    %eax, %esi
 312 L(noswap):
 313
 314         jmp     *%edx
 315
 316
 317 ifdef(`PIC',`
 318 L(pic_calc):
 319         C See mpn/x86/README about old gas bugs
 320         leal    (%edx,%ecx,1), %edx
 321         addl    $L(entry)-L(here), %edx
 322         addl    (%esp), %edx
 323         ret_internal
 324 ')
 325
 326
 327 C -----------------------------------------------------------
 328         ALIGN(32)
 329 L(top):
 330 deflit(`FRAME',16)
 331         C eax   scratch
 332         C ebx   src
 333         C ecx   carry lo
 334         C edx   scratch
 335         C esi   carry hi
 336         C edi   dst
 337         C ebp   multiplier
 338         C
 339         C 15 code bytes per limb
 340
 341         leal    UNROLL_BYTES(%edi), %edi
 342
 343 L(entry):
 344 forloop(`i', 0, UNROLL_COUNT/2-1, `
 345         deflit(`disp0', eval(2*i*4))
 346         deflit(`disp1', eval(disp0 + 4))
 347
 348 Zdisp(  movl,   disp0,(%ebx), %eax)
 349         mull    %ebp
 350 Zdisp(  M4_inst,%ecx, disp0,(%edi))
 351         adcl    %eax, %esi
 352         movl    %edx, %ecx
 353         jadcl0( %ecx)
 354
 355         movl    disp1(%ebx), %eax
 356         mull    %ebp
 357         M4_inst %esi, disp1(%edi)
 358         adcl    %eax, %ecx
 359         movl    %edx, %esi
 360         jadcl0( %esi)
 361 ')
 362
 363         decl    VAR_COUNTER
 364
 365         leal    UNROLL_BYTES(%ebx), %ebx
 366         jns     L(top)
 367
 368
 369         popl    %ebp
 370         M4_inst %ecx, UNROLL_BYTES(%edi)
 371
 372         popl    %edi
 373         movl    %esi, %eax
 374
 375         popl    %ebx
 376         jadcl0( %eax)
 377
 378         popl    %esi
 379         ret
 380
 381 EPILOGUE()