mpn/x86/pentium/mmx/mul_1.asm

   1 dnl  Intel Pentium MMX mpn_mul_1 -- mpn by limb multiplication.
   2
   3 dnl  Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
   4 dnl
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or
   8 dnl  modify it under the terms of the GNU Lesser General Public License as
   9 dnl  published by the Free Software Foundation; either version 3 of the
  10 dnl  License, or (at your option) any later version.
  11 dnl
  12 dnl  The GNU MP Library is distributed in the hope that it will be useful,
  13 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 dnl  Lesser General Public License for more details.
  16 dnl
  17 dnl  You should have received a copy of the GNU Lesser General Public License
  18 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
  19
  20 include(`../config.m4')
  21
  22
  23 C    cycles/limb
  24 C P5:   12.0   for 32-bit multiplier
  25 C        7.0   for 16-bit multiplier
  26
  27
  28 C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
  29 C                      mp_limb_t multiplier);
  30 C
  31 C When the multiplier is 16 bits some special case MMX code is used.  Small
  32 C multipliers might arise reasonably often from mpz_mul_ui etc.  If the size
  33 C is odd there's roughly a 5 cycle penalty, so times for say size==7 and
  34 C size==8 end up being quite close.  If src isn't aligned to an 8 byte
  35 C boundary then one limb is processed separately with roughly a 5 cycle
  36 C penalty, so in that case it's say size==8 and size==9 which are close.
  37 C
  38 C Alternatives:
  39 C
  40 C MMX is not believed to be of any use for 32-bit multipliers, since for
  41 C instance the current method would just have to be more or less duplicated
  42 C for the high and low halves of the multiplier, and would probably
  43 C therefore run at about 14 cycles, which is slower than the plain integer
  44 C at 12.
  45 C
  46 C Adding the high and low MMX products using integer code seems best.  An
  47 C attempt at using paddd and carry bit propagation with pcmpgtd didn't give
  48 C any joy.  Perhaps something could be done keeping the values signed and
  49 C thereby avoiding adjustments to make pcmpgtd into an unsigned compare, or
  50 C perhaps not.
  51 C
  52 C Future:
  53 C
  54 C An mpn_mul_1c entrypoint would need a double carry out of the low result
  55 C limb in the 16-bit code, unless it could be assumed the carry fits in 16
  56 C bits, possibly as carry<multiplier, this being true of a big calculation
  57 C done piece by piece.  But let's worry about that if/when mul_1c is
  58 C actually used.
  59
  60 defframe(PARAM_MULTIPLIER,16)
  61 defframe(PARAM_SIZE,      12)
  62 defframe(PARAM_SRC,       8)
  63 defframe(PARAM_DST,       4)
  64
  65         TEXT
  66
  67         ALIGN(8)
  68 PROLOGUE(mpn_mul_1)
  69 deflit(`FRAME',0)
  70
  71         movl    PARAM_SIZE, %ecx
  72         movl    PARAM_SRC, %edx
  73
  74         cmpl    $1, %ecx
  75         jne     L(two_or_more)
  76
  77         C one limb only
  78
  79         movl    PARAM_MULTIPLIER, %eax
  80         movl    PARAM_DST, %ecx
  81
  82         mull    (%edx)
  83
  84         movl    %eax, (%ecx)
  85         movl    %edx, %eax
  86
  87         ret
  88
  89
  90 L(two_or_more):
  91         C eax   size
  92         C ebx
  93         C ecx   carry
  94         C edx
  95         C esi   src
  96         C edi
  97         C ebp
  98
  99         pushl   %esi            FRAME_pushl()
 100         pushl   %edi            FRAME_pushl()
 101
 102         movl    %edx, %esi              C src
 103         movl    PARAM_DST, %edi
 104
 105         movl    PARAM_MULTIPLIER, %eax
 106         pushl   %ebx            FRAME_pushl()
 107
 108         leal    (%esi,%ecx,4), %esi     C src end
 109         leal    (%edi,%ecx,4), %edi     C dst end
 110
 111         negl    %ecx                    C -size
 112
 113         pushl   %ebp            FRAME_pushl()
 114         cmpl    $65536, %eax
 115
 116         jb      L(small)
 117
 118
 119 L(big):
 120         xorl    %ebx, %ebx              C carry limb
 121         sarl    %ecx                    C -size/2
 122
 123         jnc     L(top)                  C with carry flag clear
 124
 125
 126         C size was odd, process one limb separately
 127
 128         mull    4(%esi,%ecx,8)          C m * src[0]
 129
 130         movl    %eax, 4(%edi,%ecx,8)
 131         incl    %ecx
 132
 133         orl     %edx, %ebx              C carry limb, and clear carry flag
 134
 135
 136 L(top):
 137         C eax
 138         C ebx   carry
 139         C ecx   counter, negative
 140         C edx
 141         C esi   src end
 142         C edi   dst end
 143         C ebp   (scratch carry)
 144
 145         adcl    $0, %ebx
 146         movl    (%esi,%ecx,8), %eax
 147
 148         mull    PARAM_MULTIPLIER
 149
 150         movl    %edx, %ebp
 151         addl    %eax, %ebx
 152
 153         adcl    $0, %ebp
 154         movl    4(%esi,%ecx,8), %eax
 155
 156         mull    PARAM_MULTIPLIER
 157
 158         movl    %ebx, (%edi,%ecx,8)
 159         addl    %ebp, %eax
 160
 161         movl    %eax, 4(%edi,%ecx,8)
 162         incl    %ecx
 163
 164         movl    %edx, %ebx
 165         jnz     L(top)
 166
 167
 168         adcl    $0, %ebx
 169         popl    %ebp
 170
 171         movl    %ebx, %eax
 172         popl    %ebx
 173
 174         popl    %edi
 175         popl    %esi
 176
 177         ret
 178
 179
 180 L(small):
 181         C Special case for 16-bit multiplier.
 182         C
 183         C eax   multiplier
 184         C ebx
 185         C ecx   -size
 186         C edx   src
 187         C esi   src end
 188         C edi   dst end
 189         C ebp   multiplier
 190
 191         C size<3 not supported here.  At size==3 we're already a couple of
 192         C cycles faster, so there's no threshold as such, just use the MMX
 193         C as soon as possible.
 194
 195         cmpl    $-3, %ecx
 196         ja      L(big)
 197
 198         movd    %eax, %mm7              C m
 199         pxor    %mm6, %mm6              C initial carry word
 200
 201         punpcklwd %mm7, %mm7            C m replicated 2 times
 202         addl    $2, %ecx                C -size+2
 203
 204         punpckldq %mm7, %mm7            C m replicated 4 times
 205         andl    $4, %edx                C test alignment, clear carry flag
 206
 207         movq    %mm7, %mm0              C m
 208         jz      L(small_entry)
 209
 210
 211         C Source is unaligned, process one limb separately.
 212         C
 213         C Plain integer code is used here, since it's smaller and is about
 214         C the same 13 cycles as an mmx block would be.
 215         C
 216         C An "addl $1,%ecx" doesn't clear the carry flag when size==3, hence
 217         C the use of separate incl and orl.
 218
 219         mull    -8(%esi,%ecx,4)         C m * src[0]
 220
 221         movl    %eax, -8(%edi,%ecx,4)   C dst[0]
 222         incl    %ecx                    C one limb processed
 223
 224         movd    %edx, %mm6              C initial carry
 225
 226         orl     %eax, %eax              C clear carry flag
 227         jmp     L(small_entry)
 228
 229
 230 C The scheduling here is quite tricky, since so many instructions have
 231 C pairing restrictions.  In particular the js won't pair with a movd, and
 232 C can't be paired with an adc since it wants flags from the inc, so
 233 C instructions are rotated to the top of the loop to find somewhere useful
 234 C for it.
 235 C
 236 C Trouble has been taken to avoid overlapping successive loop iterations,
 237 C since that would greatly increase the size of the startup and finishup
 238 C code.  Actually there's probably not much advantage to be had from
 239 C overlapping anyway, since the difficulties are mostly with pairing, not
 240 C with latencies as such.
 241 C
 242 C In the comments x represents the src data and m the multiplier (16
 243 C bits, but replicated 4 times).
 244 C
 245 C The m signs calculated in %mm3 are a loop invariant and could be held in
 246 C say %mm5, but that would save only one instruction and hence be no faster.
 247
 248 L(small_top):
 249         C eax   l.low, then l.high
 250         C ebx   (h.low)
 251         C ecx   counter, -size+2 to 0 or 1
 252         C edx   (h.high)
 253         C esi   &src[size]
 254         C edi   &dst[size]
 255         C ebp
 256         C
 257         C %mm0  (high products)
 258         C %mm1  (low products)
 259         C %mm2  (adjust for m using x signs)
 260         C %mm3  (adjust for x using m signs)
 261         C %mm4
 262         C %mm5
 263         C %mm6  h.low, then carry
 264         C %mm7  m replicated 4 times
 265
 266         movd    %mm6, %ebx              C h.low
 267         psrlq   $32, %mm1               C l.high
 268
 269         movd    %mm0, %edx              C h.high
 270         movq    %mm0, %mm6              C new c
 271
 272         adcl    %eax, %ebx
 273         incl    %ecx
 274
 275         movd    %mm1, %eax              C l.high
 276         movq    %mm7, %mm0
 277
 278         adcl    %eax, %edx
 279         movl    %ebx, -16(%edi,%ecx,4)
 280
 281         movl    %edx, -12(%edi,%ecx,4)
 282         psrlq   $32, %mm6               C c
 283
 284 L(small_entry):
 285         pmulhw  -8(%esi,%ecx,4), %mm0   C h = (x*m).high
 286         movq    %mm7, %mm1
 287
 288         pmullw  -8(%esi,%ecx,4), %mm1   C l = (x*m).low
 289         movq    %mm7, %mm3
 290
 291         movq    -8(%esi,%ecx,4), %mm2   C x
 292         psraw   $15, %mm3               C m signs
 293
 294         pand    -8(%esi,%ecx,4), %mm3   C x selected by m signs
 295         psraw   $15, %mm2               C x signs
 296
 297         paddw   %mm3, %mm0              C add x to h if m neg
 298         pand    %mm7, %mm2              C m selected by x signs
 299
 300         paddw   %mm2, %mm0              C add m to h if x neg
 301         incl    %ecx
 302
 303         movd    %mm1, %eax              C l.low
 304         punpcklwd %mm0, %mm6            C c + h.low << 16
 305
 306         psrlq   $16, %mm0               C h.high
 307         js      L(small_top)
 308
 309
 310
 311
 312         movd    %mm6, %ebx              C h.low
 313         psrlq   $32, %mm1               C l.high
 314
 315         adcl    %eax, %ebx
 316         popl    %ebp            FRAME_popl()
 317
 318         movd    %mm0, %edx              C h.high
 319         psrlq   $32, %mm0               C l.high
 320
 321         movd    %mm1, %eax              C l.high
 322
 323         adcl    %eax, %edx
 324         movl    %ebx, -12(%edi,%ecx,4)
 325
 326         movd    %mm0, %eax              C c
 327
 328         adcl    $0, %eax
 329         movl    %edx, -8(%edi,%ecx,4)
 330
 331         orl     %ecx, %ecx
 332         jnz     L(small_done)           C final %ecx==1 means even, ==0 odd
 333
 334
 335         C Size odd, one extra limb to process.
 336         C Plain integer code is used here, since it's smaller and is about
 337         C the same speed as another mmx block would be.
 338
 339         movl    %eax, %ecx
 340         movl    PARAM_MULTIPLIER, %eax
 341
 342         mull    -4(%esi)
 343
 344         addl    %ecx, %eax
 345
 346         adcl    $0, %edx
 347         movl    %eax, -4(%edi)
 348
 349         movl    %edx, %eax
 350 L(small_done):
 351         popl    %ebx
 352
 353         popl    %edi
 354         popl    %esi
 355
 356         emms
 357
 358         ret
 359
 360 EPILOGUE()