mpn/x86/sqr_basecase.asm

   1 dnl  x86 generic mpn_sqr_basecase -- square an mpn number.
   2
   3 dnl  Copyright 1999, 2000, 2002, 2003 Free Software Foundation, Inc.
   4 dnl
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   8 dnl  it under the terms of the GNU Lesser General Public License as published
   9 dnl  by the Free Software Foundation; either version 3 of the License, or (at
  10 dnl  your option) any later version.
  11 dnl
  12 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  13 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  14 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
  15 dnl  License for more details.
  16 dnl
  17 dnl  You should have received a copy of the GNU Lesser General Public License
  18 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
  19
  20
  21 include(`../config.m4')
  22
  23
  24 C     cycles/crossproduct  cycles/triangleproduct
  25 C P5:
  26 C P6:
  27 C K6:
  28 C K7:
  29 C P4:
  30
  31
  32 C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
  33 C
  34 C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
  35 C lot of function call overheads are avoided, especially when the size is
  36 C small.
  37 C
  38 C The mul1 loop is not unrolled like mul_1.asm, it doesn't seem worth the
  39 C code size to do so here.
  40 C
  41 C Enhancements:
  42 C
  43 C The addmul loop here is also not unrolled like aorsmul_1.asm and
  44 C mul_basecase.asm are.  Perhaps it should be done.  It'd add to the
  45 C complexity, but if it's worth doing in the other places then it should be
  46 C worthwhile here.
  47 C
  48 C A fully-unrolled style like other sqr_basecase.asm versions (k6, k7, p6)
  49 C might be worth considering.  That'd add quite a bit to the code size, but
  50 C only as much as is used would be dragged into L1 cache.
  51
  52 defframe(PARAM_SIZE,12)
  53 defframe(PARAM_SRC, 8)
  54 defframe(PARAM_DST, 4)
  55
  56         TEXT
  57         ALIGN(8)
  58 PROLOGUE(mpn_sqr_basecase)
  59 deflit(`FRAME',0)
  60
  61         movl    PARAM_SIZE, %edx
  62
  63         movl    PARAM_SRC, %eax
  64
  65         cmpl    $2, %edx
  66         movl    PARAM_DST, %ecx
  67
  68         je      L(two_limbs)
  69         ja      L(three_or_more)
  70
  71
  72 C -----------------------------------------------------------------------------
  73 C one limb only
  74         C eax   src
  75         C ebx
  76         C ecx   dst
  77         C edx
  78
  79         movl    (%eax), %eax
  80         mull    %eax
  81         movl    %eax, (%ecx)
  82         movl    %edx, 4(%ecx)
  83         ret
  84
  85
  86 C -----------------------------------------------------------------------------
  87         ALIGN(8)
  88 L(two_limbs):
  89         C eax   src
  90         C ebx
  91         C ecx   dst
  92         C edx
  93
  94         pushl   %ebx
  95         pushl   %ebp
  96
  97         movl    %eax, %ebx
  98         movl    (%eax), %eax
  99
 100         mull    %eax            C src[0]^2
 101
 102         pushl   %esi
 103         pushl   %edi
 104
 105         movl    %edx, %esi      C dst[1]
 106         movl    %eax, (%ecx)    C dst[0]
 107
 108         movl    4(%ebx), %eax
 109         mull    %eax            C src[1]^2
 110
 111         movl    %eax, %edi      C dst[2]
 112         movl    %edx, %ebp      C dst[3]
 113
 114         movl    (%ebx), %eax
 115         mull    4(%ebx)         C src[0]*src[1]
 116
 117         addl    %eax, %esi
 118
 119         adcl    %edx, %edi
 120
 121         adcl    $0, %ebp
 122         addl    %esi, %eax
 123
 124         adcl    %edi, %edx
 125         movl    %eax, 4(%ecx)
 126
 127         adcl    $0, %ebp
 128
 129         movl    %edx, 8(%ecx)
 130         movl    %ebp, 12(%ecx)
 131
 132         popl    %edi
 133         popl    %esi
 134
 135         popl    %ebp
 136         popl    %ebx
 137
 138         ret
 139
 140
 141 C -----------------------------------------------------------------------------
 142         ALIGN(8)
 143 L(three_or_more):
 144 deflit(`FRAME',0)
 145         C eax   src
 146         C ebx
 147         C ecx   dst
 148         C edx   size
 149
 150         pushl   %ebx    FRAME_pushl()
 151         pushl   %edi    FRAME_pushl()
 152
 153         pushl   %esi    FRAME_pushl()
 154         pushl   %ebp    FRAME_pushl()
 155
 156         leal    (%ecx,%edx,4), %edi     C &dst[size], end of this mul1
 157         leal    (%eax,%edx,4), %esi     C &src[size]
 158
 159 C First multiply src[0]*src[1..size-1] and store at dst[1..size].
 160
 161         movl    (%eax), %ebp            C src[0], multiplier
 162         movl    %edx, %ecx
 163
 164         negl    %ecx                    C -size
 165         xorl    %ebx, %ebx              C clear carry limb
 166
 167         incl    %ecx                    C -(size-1)
 168
 169 L(mul1):
 170         C eax   scratch
 171         C ebx   carry
 172         C ecx   counter, limbs, negative
 173         C edx   scratch
 174         C esi   &src[size]
 175         C edi   &dst[size]
 176         C ebp   multiplier
 177
 178         movl    (%esi,%ecx,4), %eax
 179         mull    %ebp
 180         addl    %eax, %ebx
 181         adcl    $0, %edx
 182         movl    %ebx, (%edi,%ecx,4)
 183         movl    %edx, %ebx
 184         incl    %ecx
 185         jnz     L(mul1)
 186
 187         movl    %ebx, (%edi)
 188
 189
 190         C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for
 191         C n=1..size-2.
 192         C
 193         C The last products src[size-2]*src[size-1], which is the end corner
 194         C of the product triangle, is handled separately at the end to save
 195         C looping overhead.  If size is 3 then it's only this that needs to
 196         C be done.
 197         C
 198         C In the outer loop %esi is a constant, and %edi just advances by 1
 199         C limb each time.  The size of the operation decreases by 1 limb
 200         C each time.
 201
 202         C eax
 203         C ebx   carry (needing carry flag added)
 204         C ecx
 205         C edx
 206         C esi   &src[size]
 207         C edi   &dst[size]
 208         C ebp
 209
 210         movl    PARAM_SIZE, %ecx
 211         subl    $3, %ecx
 212         jz      L(corner)
 213
 214         negl    %ecx
 215
 216 dnl  re-use parameter space
 217 define(VAR_OUTER,`PARAM_DST')
 218
 219 L(outer):
 220         C eax
 221         C ebx
 222         C ecx
 223         C edx   outer loop counter, -(size-3) to -1
 224         C esi   &src[size]
 225         C edi   dst, pointing at stored carry limb of previous loop
 226         C ebp
 227
 228         movl    %ecx, VAR_OUTER
 229         addl    $4, %edi                C advance dst end
 230
 231         movl    -8(%esi,%ecx,4), %ebp   C next multiplier
 232         subl    $1, %ecx
 233
 234         xorl    %ebx, %ebx              C initial carry limb
 235
 236 L(inner):
 237         C eax   scratch
 238         C ebx   carry (needing carry flag added)
 239         C ecx   counter, -n-1 to -1
 240         C edx   scratch
 241         C esi   &src[size]
 242         C edi   dst end of this addmul
 243         C ebp   multiplier
 244
 245         movl    (%esi,%ecx,4), %eax
 246         mull    %ebp
 247         addl    %ebx, %eax
 248         adcl    $0, %edx
 249         addl    %eax, (%edi,%ecx,4)
 250         adcl    $0, %edx
 251         movl    %edx, %ebx
 252         addl    $1, %ecx
 253         jl      L(inner)
 254
 255
 256         movl    %ebx, (%edi)
 257         movl    VAR_OUTER, %ecx
 258         incl    %ecx
 259         jnz     L(outer)
 260
 261
 262 L(corner):
 263         C esi   &src[size]
 264         C edi   &dst[2*size-3]
 265
 266         movl    -4(%esi), %eax
 267         mull    -8(%esi)                C src[size-1]*src[size-2]
 268         addl    %eax, 0(%edi)
 269         adcl    $0, %edx
 270         movl    %edx, 4(%edi)           C dst high limb
 271
 272
 273 C -----------------------------------------------------------------------------
 274 C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
 275
 276         movl    PARAM_SIZE, %eax
 277         negl    %eax
 278         addl    $1, %eax                C -(size-1) and clear carry
 279
 280 L(lshift):
 281         C eax   counter, negative
 282         C ebx   next limb
 283         C ecx
 284         C edx
 285         C esi
 286         C edi   &dst[2*size-4]
 287         C ebp
 288
 289         rcll    8(%edi,%eax,8)
 290         rcll    12(%edi,%eax,8)
 291         incl    %eax
 292         jnz     L(lshift)
 293
 294
 295         adcl    %eax, %eax              C high bit out
 296         movl    %eax, 8(%edi)           C dst most significant limb
 297
 298
 299 C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ...,
 300 C src[size-1]^2.  dst[0] hasn't yet been set at all yet, and just gets the
 301 C low limb of src[0]^2.
 302
 303         movl    PARAM_SRC, %esi
 304         movl    (%esi), %eax            C src[0]
 305         mull    %eax                    C src[0]^2
 306
 307         movl    PARAM_SIZE, %ecx
 308         leal    (%esi,%ecx,4), %esi     C src end
 309
 310         negl    %ecx                    C -size
 311         movl    %edx, %ebx              C initial carry
 312
 313         movl    %eax, 12(%edi,%ecx,8)   C dst[0]
 314         incl    %ecx                    C -(size-1)
 315
 316 L(diag):
 317         C eax   scratch (low product)
 318         C ebx   carry limb
 319         C ecx   counter, -(size-1) to -1
 320         C edx   scratch (high product)
 321         C esi   &src[size]
 322         C edi   &dst[2*size-3]
 323         C ebp   scratch (fetched dst limbs)
 324
 325         movl    (%esi,%ecx,4), %eax
 326         mull    %eax
 327
 328         addl    %ebx, 8(%edi,%ecx,8)
 329         movl    %edx, %ebx
 330
 331         adcl    %eax, 12(%edi,%ecx,8)
 332         adcl    $0, %ebx
 333
 334         incl    %ecx
 335         jnz     L(diag)
 336
 337
 338         addl    %ebx, 8(%edi)           C dst most significant limb
 339
 340         popl    %ebp
 341         popl    %esi
 342
 343         popl    %edi
 344         popl    %ebx
 345
 346         ret
 347
 348 EPILOGUE()