mpn/x86/p6/aorsmul_1.asm

   1 dnl  Intel P6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
   2
   3 dnl  Copyright 1999, 2000, 2001, 2002, 2005 Free Software Foundation, Inc.
   4 dnl
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or
   8 dnl  modify it under the terms of the GNU Lesser General Public License as
   9 dnl  published by the Free Software Foundation; either version 3 of the
  10 dnl  License, or (at your option) any later version.
  11 dnl
  12 dnl  The GNU MP Library is distributed in the hope that it will be useful,
  13 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 dnl  Lesser General Public License for more details.
  16 dnl
  17 dnl  You should have received a copy of the GNU Lesser General Public License
  18 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
  19
  20 include(`../config.m4')
  21
  22
  23 C                           cycles/limb
  24 C P5:
  25 C P6 model 0-8,10-12             6.44
  26 C P6 model 9  (Banias)
  27 C P6 model 13 (Dothan)           6.11
  28 C P4 model 0  (Willamette)
  29 C P4 model 1  (?)
  30 C P4 model 2  (Northwood)
  31 C P4 model 3  (Prescott)
  32 C P4 model 4  (Nocona)
  33 C K6:
  34 C K7:
  35 C K8:
  36
  37
  38 dnl  P6 UNROLL_COUNT cycles/limb
  39 dnl          8           6.7
  40 dnl         16           6.35
  41 dnl         32           6.3
  42 dnl         64           6.3
  43 dnl  Maximum possible with the current code is 64.
  44 dnl  Limbs per pass of the unrolled loop.  NOTE(review): UNROLL_LOG2, UNROLL_MASK and UNROLL_BYTES are not defined in this file; presumably derived from this value elsewhere - confirm.
  45 deflit(UNROLL_COUNT, 16)
  46 dnl  Choose the operation at m4 time.  M4_inst is the instruction applied to dst in both loops; M4_function_1 and M4_function_1c name the two entry points.
  47 dnl  Exactly one of OPERATION_addmul_1 or OPERATION_submul_1 is expected; with neither defined, m4_error is invoked with the message below.
  48 ifdef(`OPERATION_addmul_1', `
  49         define(M4_inst,        addl)
  50         define(M4_function_1,  mpn_addmul_1)
  51         define(M4_function_1c, mpn_addmul_1c)
  52         define(M4_description, add it to)
  53         define(M4_desc_retval, carry)
  54 ',`ifdef(`OPERATION_submul_1', `
  55         define(M4_inst,        subl)
  56         define(M4_function_1,  mpn_submul_1)
  57         define(M4_function_1c, mpn_submul_1c)
  58         define(M4_description, subtract it from)
  59         define(M4_desc_retval, borrow)
  60 ',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
  61 ')')')
  62
  63 MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
  64
  65
  66 C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
  67 C                            mp_limb_t mult);
  68 C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
  69 C                             mp_limb_t mult, mp_limb_t carry);
  70 C
  71 C Calculate src,size multiplied by mult and M4_description dst,size.
  72 C Return the M4_desc_retval limb from the top of the result.
  73 C
  74 C This code is pretty much the same as the K6 code.  The unrolled loop is
  75 C the same, but there are just a few scheduling tweaks in the setups and the
  76 C simple loop.
  77 C
  78 C A number of variations have been tried for the unrolled loop, with one or
  79 C two carries, and with loads scheduled earlier, but nothing faster than 6
  80 C cycles/limb has been found.
  81 dnl  Sizes of at least UNROLL_THRESHOLD limbs take the unrolled loop, smaller sizes the simple loop.  PIC and non-PIC builds currently use the same value.
  82 ifdef(`PIC',`
  83 deflit(UNROLL_THRESHOLD, 5)
  84 ',`
  85 deflit(UNROLL_THRESHOLD, 5)
  86 ')
  87 dnl  Stack arguments.  PARAM_CARRY is read only by the M4_function_1c entry point.  NOTE(review): offsets presumably count from the return address, with FRAME tracking the pushes - confirm against defframe.
  88 defframe(PARAM_CARRY,     20)
  89 defframe(PARAM_MULTIPLIER,16)
  90 defframe(PARAM_SIZE,      12)
  91 defframe(PARAM_SRC,       8)
  92 defframe(PARAM_DST,       4)
  93
  94         TEXT
  95         ALIGN(32)
  96         C Entry point taking an explicit initial carry limb as the fifth argument.
  97 PROLOGUE(M4_function_1c)
  98         pushl   %ebx                    C saved here, restored on both exit paths of M4_function_1
  99 deflit(`FRAME',4)
 100         movl    PARAM_CARRY, %ebx       C ebx = caller-supplied initial carry
 101         jmp     L(start_nc)             C join the common code inside M4_function_1
 102 EPILOGUE()
 103         C Plain entry point: behaves as M4_function_1c with an initial carry of zero.
 104 PROLOGUE(M4_function_1)
 105         push    %ebx
 106 deflit(`FRAME',4)
 107         xorl    %ebx, %ebx      C initial carry
 108         C Common code for both entry points: ebx = initial carry, one dword already pushed.
 109 L(start_nc):
 110         movl    PARAM_SIZE, %ecx        C ecx = size in limbs
 111         pushl   %esi
 112 deflit(`FRAME',8)
 113
 114         movl    PARAM_SRC, %esi         C esi = src
 115         pushl   %edi
 116 deflit(`FRAME',12)
 117
 118         movl    PARAM_DST, %edi         C edi = dst
 119         pushl   %ebp
 120 deflit(`FRAME',16)
 121         cmpl    $UNROLL_THRESHOLD, %ecx C flags are consumed by the jae; the movl in between leaves them intact
 122
 123         movl    PARAM_MULTIPLIER, %ebp  C ebp = multiplier
 124         jae     L(unroll)               C unsigned test: size at least UNROLL_THRESHOLD, else fall into the simple loop
 125
 126
 127         C simple loop, one limb per pass, reached when size is below UNROLL_THRESHOLD
 128         C this is offset 0x22, so close enough to aligned
 129 L(simple):
 130         C eax   scratch
 131         C ebx   carry
 132         C ecx   counter
 133         C edx   scratch
 134         C esi   src
 135         C edi   dst
 136         C ebp   multiplier
 137         C NOTE(review): the body runs before the count is tested, so size 0 is not handled here - presumably callers pass at least one limb; confirm
 138         movl    (%esi), %eax            C next src limb
 139         addl    $4, %edi                C advance dst now; the limb is addressed as -4(%edi) below
 140
 141         mull    %ebp                    C edx:eax = limb * multiplier
 142
 143         addl    %ebx, %eax              C add the incoming carry limb to the low word
 144         adcl    $0, %edx                C and propagate into the high word
 145
 146         M4_inst %eax, -4(%edi)          C apply the low word to the dst limb; sets the carry flag
 147         movl    %edx, %ebx              C movl leaves that flag intact
 148
 149         adcl    $0, %ebx                C next carry limb = high word + flag
 150         decl    %ecx                    C sets ZF for the jnz
 151
 152         leal    4(%esi), %esi           C advance src; leal keeps the flags from decl
 153         jnz     L(simple)
 154
 155
 156         popl    %ebp
 157         popl    %edi
 158
 159         popl    %esi
 160         movl    %ebx, %eax              C return value = final carry limb
 161
 162         popl    %ebx
 163         ret
 164
 165
 166
 167 C------------------------------------------------------------------------------
 168 C VAR_JUMP holds the computed jump temporarily because there's not enough
 169 C registers when doing the mul for the initial two carry limbs.
 170 C
 171 C The add/adc for the initial carry in %ebx is necessary only for the
 172 C mpn_add/submul_1c entry points.  Duplicating the startup code to
 173 C eliminate this for the plain mpn_add/submul_1 doesn't seem like a good
 174 C idea.
 175 C Below, n denotes (-(size-1)) masked with UNROLL_MASK.  The unrolled loop is entered 15*n bytes past L(entry), and src and dst are biased by -4*n bytes to match.
 176 dnl  overlapping with parameters already fetched
 177 define(VAR_COUNTER,`PARAM_SIZE')
 178 define(VAR_JUMP,   `PARAM_DST')
 179
 180         C this is offset 0x43, so close enough to aligned
 181 L(unroll):
 182         C eax
 183         C ebx   initial carry
 184         C ecx   size
 185         C edx
 186         C esi   src
 187         C edi   dst
 188         C ebp
 189
 190         movl    %ecx, %edx
 191         decl    %ecx                    C ecx = size-1
 192
 193         subl    $2, %edx                C edx = size-2
 194         negl    %ecx                    C ecx = -(size-1)
 195
 196         shrl    $UNROLL_LOG2, %edx      C edx = (size-2) shifted right by UNROLL_LOG2
 197         andl    $UNROLL_MASK, %ecx      C ecx = n
 198
 199         movl    %edx, VAR_COUNTER       C pass counter, kept in the PARAM_SIZE stack slot
 200         movl    %ecx, %edx              C edx = n
 201
 202         C 15 code bytes per limb
 203 ifdef(`PIC',`
 204         call    L(pic_calc)             C on return edx = L(entry) + 15*n and ecx = -n
 205 L(here):
 206 ',`
 207         shll    $4, %edx                C edx = 16*n
 208         negl    %ecx                    C ecx = -n
 209
 210         leal    L(entry) (%edx,%ecx,1), %edx    C edx = L(entry) + 15*n
 211 ')
 212         movl    (%esi), %eax            C src low limb
 213
 214         movl    %edx, VAR_JUMP          C park the jump target; the mull below overwrites edx
 215         leal    ifelse(UNROLL_BYTES,256,128+) 4(%esi,%ecx,4), %esi      C esi = src + 4 - 4*n, plus 128 when UNROLL_BYTES is 256
 216
 217         mull    %ebp                    C edx:eax = low limb * multiplier
 218
 219         addl    %ebx, %eax      C initial carry (from _1c)
 220         adcl    $0, %edx
 221
 222         movl    %edx, %ebx      C high carry
 223         leal    ifelse(UNROLL_BYTES,256,128) (%edi,%ecx,4), %edi        C edi = dst - 4*n, plus 128 when UNROLL_BYTES is 256
 224
 225         movl    VAR_JUMP, %edx          C reload the jump target
 226         testl   $1, %ecx                C parity of n (ecx = -n); the flags survive the movl below
 227         movl    %eax, %ecx      C low carry
 228
 229         cmovnz( %ebx, %ecx)     C high,low carry other way around when n is odd
 230         cmovnz( %eax, %ebx)
 231
 232         jmp     *%edx                   C enter the unrolled loop part-way through
 233
 234         C PIC only: called from L(unroll) with edx = ecx = masked size offset; returns edx = address to jump to inside the unrolled loop and ecx negated.
 235 ifdef(`PIC',`
 236 L(pic_calc):
 237         shll    $4, %edx                C edx = 16 * offset
 238         negl    %ecx                    C ecx = -offset
 239
 240         C See mpn/x86/README about old gas bugs
 241         leal    (%edx,%ecx,1), %edx     C edx = 15 * offset
 242         addl    $L(entry)-L(here), %edx C plus the distance from L(here) to L(entry)
 243
 244         addl    (%esp), %edx            C plus the return address, which is L(here) since that label directly follows the call
 245
 246         ret_internal                    C NOTE(review): ret_internal is defined outside this file; presumably a plain return - confirm
 247 ')
 248
 249
 250 C -----------------------------------------------------------
 251         ALIGN(32)
 252 L(top):
 253 deflit(`FRAME',16)
 254         C eax   scratch
 255         C ebx   carry hi
 256         C ecx   carry lo
 257         C edx   scratch
 258         C esi   src
 259         C edi   dst
 260         C ebp   multiplier
 261         C
 262         C VAR_COUNTER   loop counter
 263         C
 264         C 15 code bytes per limb
 265         C Per limb: multiply the next src limb while the pending low carry limb is applied to dst; ebx and ecx swap roles on every limb.
 266         addl    $UNROLL_BYTES, %edi     C advance dst for the next pass; skipped on first entry because the computed jump lands at or after L(entry)
 267
 268 L(entry):
 269 deflit(CHUNK_COUNT,2)
 270 forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
 271         deflit(`disp0', eval(i*4*CHUNK_COUNT ifelse(UNROLL_BYTES,256,-128)))
 272         deflit(`disp1', eval(disp0 + 4))
 273
 274 Zdisp(  movl,   disp0,(%esi), %eax)     C next src limb
 275         mull    %ebp                    C edx:eax = limb * multiplier
 276 Zdisp(  M4_inst,%ecx, disp0,(%edi))     C apply pending carry limb in ecx to dst; sets the carry flag
 277         adcl    %eax, %ebx              C ebx = pending high word + new low word + flag
 278         movl    %edx, %ecx              C ecx = new high word
 279         adcl    $0, %ecx                C plus the carry out of the adcl above
 280
 281         movl    disp1(%esi), %eax       C second limb of the chunk: same steps with ebx and ecx exchanged
 282         mull    %ebp
 283         M4_inst %ebx, disp1(%edi)
 284         adcl    %eax, %ecx
 285         movl    %edx, %ebx
 286         adcl    $0, %ebx
 287 ')
 288
 289         decl    VAR_COUNTER             C count down passes; SF drives the jns
 290         leal    UNROLL_BYTES(%esi), %esi        C advance src; leal keeps the flags from decl
 291
 292         jns     L(top)                  C loop until the counter goes negative
 293
 294
 295 deflit(`disp0', eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128)))
 296
 297         M4_inst %ecx, disp0(%edi)       C apply the last pending carry limb; edi was not advanced after the final pass
 298         movl    %ebx, %eax              C return value starts as the remaining high carry
 299
 300         popl    %ebp
 301         popl    %edi
 302
 303         popl    %esi
 304         popl    %ebx
 305         adcl    $0, %eax                C fold in the flag from the final M4_inst; relies on the movl and popl in between leaving it intact
 306
 307         ret
 308
 309 EPILOGUE()
 308
 309 EPILOGUE()