mpn/x86/k6/mul_1.asm

   1 dnl  AMD K6 mpn_mul_1 -- mpn by limb multiply.
   2
   3 dnl  Copyright 1999, 2000, 2002, 2005 Free Software Foundation, Inc.
   4 dnl
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or
   8 dnl  modify it under the terms of the GNU Lesser General Public License as
   9 dnl  published by the Free Software Foundation; either version 3 of the
  10 dnl  License, or (at your option) any later version.
  11 dnl
  12 dnl  The GNU MP Library is distributed in the hope that it will be useful,
  13 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 dnl  Lesser General Public License for more details.
  16 dnl
  17 dnl  You should have received a copy of the GNU Lesser General Public License
  18 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
  19
  20 include(`../config.m4')
  21
  22
  23 C                           cycles/limb
  24 C P5:
  25 C P6 model 0-8,10-12             5.5
  26 C P6 model 9  (Banias)
  27 C P6 model 13 (Dothan)           4.87
  28 C P4 model 0  (Willamette)
  29 C P4 model 1  (?)
  30 C P4 model 2  (Northwood)
  31 C P4 model 3  (Prescott)
  32 C P4 model 4  (Nocona)
  33 C K6:                            6.25
  34 C K7:
  35 C K8:
  36
  37
  38 C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
  39 C                      mp_limb_t multiplier);
  40 C mp_limb_t mpn_mul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
  41 C                       mp_limb_t multiplier, mp_limb_t carry);
  42 C
  43 C Multiply src,size by mult and store the result in dst,size.
  44 C Return the carry limb from the top of the result.
  45 C
  46 C mpn_mul_1c() accepts an initial carry for the calculation; it is added into
  47 C the low limb of the result.
  48
  49 defframe(PARAM_CARRY,     20)
  50 defframe(PARAM_MULTIPLIER,16)
  51 defframe(PARAM_SIZE,      12)
  52 defframe(PARAM_SRC,       8)
  53 defframe(PARAM_DST,       4)
  54
  55 dnl  minimum 5: the unrolled code always does one 4-limb pass, then 1 to 4 more limbs
  56 deflit(UNROLL_THRESHOLD, 5)
  57
  58         TEXT
  59         ALIGN(32)
  60
  61 PROLOGUE(mpn_mul_1c)
     C Entry point that takes an initial carry.  It only saves esi, loads the
     C carry parameter into it and then joins the body of mpn_mul_1 at
     C L(start_nc); all the work and the return happen there.
     C
     C esi is pushed first because it is about to be overwritten; the exit
     C paths in mpn_mul_1 pop it again just before their ret.
     C NOTE(review): FRAME is presumably the count of bytes pushed so far,
     C used so that the PARAM_ offsets still resolve after the push -- confirm
     C against the defframe definition.
  62         pushl   %esi
  63 deflit(`FRAME',4)
             C esi = initial carry, the same register mpn_mul_1 clears to zero
  64         movl    PARAM_CARRY, %esi
  65         jmp     L(start_nc)
  66 EPILOGUE()
  67
  68
  69 PROLOGUE(mpn_mul_1)
     C Entry point without a carry argument: the running carry in esi starts
     C at zero.  mpn_mul_1c joins at L(start_nc) with esi already loaded.
     C
     C Sizes below UNROLL_THRESHOLD are handled by the one-limb-per-pass loop
     C at L(simple), larger sizes by the 4-way unrolled loop at L(unroll).
     C Both paths return the final carry limb in eax.
  70         push    %esi
  71 deflit(`FRAME',4)
  72         xorl    %esi, %esi      C initial carry
  73
  74 L(start_nc):
             C esi = carry in.  Load the parameters while saving ebx, edi and ebp;
             C every exit path below pops them again in reverse order.
  75         mov     PARAM_SIZE, %ecx
  76         push    %ebx
  77 FRAME_pushl()
  78
  79         movl    PARAM_SRC, %ebx
  80         push    %edi
  81 FRAME_pushl()
  82
  83         movl    PARAM_DST, %edi
  84         pushl   %ebp
  85 FRAME_pushl()
  86
  87         cmpl    $UNROLL_THRESHOLD, %ecx
  88         movl    PARAM_MULTIPLIER, %ebp
  89
             C the movl above leaves the cmpl flags intact; jae is an unsigned
             C size >= UNROLL_THRESHOLD test
  90         jae     L(unroll)
  91
  92
  93         C code offset 0x22 here, close enough to aligned
  94 L(simple):
  95         C eax   scratch
  96         C ebx   src
  97         C ecx   counter
  98         C edx   scratch
  99         C esi   carry
 100         C edi   dst
 101         C ebp   multiplier
 102         C
 103         C this loop 8 cycles/limb
             C
             C Each pass: edx:eax = src limb * multiplier, the carry is added into
             C the low word which is stored to dst, and the high word plus the
             C carry-out becomes the next carry.  esi is cleared with movl $0
             C because that leaves the carry flag from the addl for the adcl.
 104
 105         movl    (%ebx), %eax
 106         addl    $4, %ebx
 107
 108         mull    %ebp
 109
 110         addl    %esi, %eax
 111         movl    $0, %esi
 112
 113         adcl    %edx, %esi
 114
 115         movl    %eax, (%edi)
 116         addl    $4, %edi
 117
             C loop decrements ecx and branches back while it is non-zero
 118         loop    L(simple)
 119
 120
             C restore saved registers; the final carry in esi is the return value
 121         popl    %ebp
 122
 123         popl    %edi
 124         popl    %ebx
 125
 126         movl    %esi, %eax
 127         popl    %esi
 128
 129         ret
 130
 131
 132 C -----------------------------------------------------------------------------
 133 C The code for each limb is 6 cycles, with instruction decoding being the
 134 C limiting factor.  At 4 limbs/loop and 1 cycle/loop of overhead it's 6.25
 135 C cycles/limb in total.
 136 C
 137 C The secret ingredient to get 6.25 is to start the loop with the mul and
 138 C have the load/store pair at the end.  Rotating the load/store to the top
 139 C is a 0.5 c/l slowdown.  (Some address generation effect probably.)
 140 C
 141 C The whole unrolled loop fits nicely in exactly 80 bytes.
 142
 143
 144         ALIGN(16)       C already aligned to 16 here actually
 145 L(unroll):
             C Preload the first src limb, since L(top) starts with the mul.  Then
             C point ebx and edi at limb size-4 of src and dst and turn ecx into
             C the negative index 4-size, so that (reg,%ecx,4) addresses limb 0
             C and the index counts up towards zero.
 146         movl    (%ebx), %eax
 147         leal    -16(%ebx,%ecx,4), %ebx
 148
 149         leal    -16(%edi,%ecx,4), %edi
 150         subl    $4, %ecx
 151
 152         negl    %ecx
 153
 154
 155         ALIGN(16)       C one byte nop for this alignment
 156 L(top):
 157         C eax   scratch
 158         C ebx   &src[size-4]
 159         C ecx   counter, negative, stepping up by 4 per pass
 160         C edx   scratch
 161         C esi   carry
 162         C edi   &dst[size-4]
 163         C ebp   multiplier
             C
             C Four copies of the same step.  On entry to each step eax already
             C holds the src limb; the step multiplies it, adds the carry, stores
             C the low word to dst and loads the src limb for the following step.
 164
 165         mull    %ebp
 166
 167         addl    %esi, %eax
 168         movl    $0, %esi
 169
 170         adcl    %edx, %esi
 171
 172         movl    %eax, (%edi,%ecx,4)
 173         movl    4(%ebx,%ecx,4), %eax
 174
 175
 176         mull    %ebp
 177
 178         addl    %esi, %eax
 179         movl    $0, %esi
 180
 181         adcl    %edx, %esi
 182
 183         movl    %eax, 4(%edi,%ecx,4)
 184         movl    8(%ebx,%ecx,4), %eax
 185
 186
 187         mull    %ebp
 188
 189         addl    %esi, %eax
 190         movl    $0, %esi
 191
 192         adcl    %edx, %esi
 193
 194         movl    %eax, 8(%edi,%ecx,4)
 195         movl    12(%ebx,%ecx,4), %eax
 196
 197
 198         mull    %ebp
 199
 200         addl    %esi, %eax
 201         movl    $0, %esi
 202
 203         adcl    %edx, %esi
 204
 205         movl    %eax, 12(%edi,%ecx,4)
             C this load is the first limb of the next group of four, or the first
             C of the finishing limbs when the loop exits
 206         movl    16(%ebx,%ecx,4), %eax
 207
 208
             C repeat while the index is still negative after stepping by 4
 209         addl    $4, %ecx
 210         js      L(top)
 211
 212
 213
 214         C eax   next src limb
 215         C ebx   &src[size-4]
 216         C ecx   0 to 3 representing respectively 4 to 1 further limbs
 217         C edx
 218         C esi   carry
 219         C edi   &dst[size-4]
 220
             C bit 1 of ecx set means ecx is 2 or 3, so only 2 or 1 limbs remain
             C and the two-limb step below is skipped
 221         testb   $2, %cl
 222         jnz     L(finish_not_two)
 223
 224         mull    %ebp
 225
 226         addl    %esi, %eax
 227         movl    $0, %esi
 228
 229         adcl    %edx, %esi
 230
 231         movl    %eax, (%edi,%ecx,4)
 232         movl    4(%ebx,%ecx,4), %eax
 233
 234
 235         mull    %ebp
 236
 237         addl    %esi, %eax
 238         movl    $0, %esi
 239
 240         adcl    %edx, %esi
 241
 242         movl    %eax, 4(%edi,%ecx,4)
 243         movl    8(%ebx,%ecx,4), %eax
 244
             C ecx was 0 or 1 on this path, so it is now 2 or 3 here as well
 245         addl    $2, %ecx
 246 L(finish_not_two):
 247
 248
             C ecx is 2 or 3 here.  Bit 0 set (ecx 3) means just the last limb is
             C left; otherwise limb size-2 is done first with fixed offsets.
 249         testb   $1, %cl
 250         jnz     L(finish_not_one)
 251
 252         mull    %ebp
 253
 254         addl    %esi, %eax
 255         movl    $0, %esi
 256
 257         adcl    %edx, %esi
 258
 259         movl    %eax, 8(%edi)
 260         movl    12(%ebx), %eax
 261 L(finish_not_one):
 262
 263
             C Last limb, stored at dst[size-1].  The carry is folded straight into
             C edx, which becomes the return value.  The register restores are
             C interleaved with the arithmetic; popl does not change the carry
             C flag, so it can sit between the addl and the adcl.
 264         mull    %ebp
 265
 266         addl    %esi, %eax
 267         popl    %ebp
 268
 269         adcl    $0, %edx
 270
 271         movl    %eax, 12(%edi)
 272         popl    %edi
 273
 274         popl    %ebx
 275         movl    %edx, %eax
 276
 277         popl    %esi
 278
 279         ret
 280
 281 EPILOGUE()