mpn/x86/pentium/sqr_basecase.asm

   1 dnl  Intel P5 mpn_sqr_basecase -- square an mpn number.
   2
   3 dnl  Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
   4 dnl
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or
   8 dnl  modify it under the terms of the GNU Lesser General Public License as
   9 dnl  published by the Free Software Foundation; either version 3 of the
  10 dnl  License, or (at your option) any later version.
  11 dnl
  12 dnl  The GNU MP Library is distributed in the hope that it will be useful,
  13 dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 dnl  Lesser General Public License for more details.
  16 dnl
  17 dnl  You should have received a copy of the GNU Lesser General Public License
  18 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
  19
  20 include(`../config.m4')
  21
  22
  23 C P5: approx 8 cycles per crossproduct, or 15.5 cycles per triangular
  24 C product at around 20x20 limbs.
  25
  26
  27 C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
  28 C
  29 C Calculate src,size squared, storing the result in dst,2*size.
  30 C
  31 C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
  32 C lot of function call overheads are avoided, especially when the size is
  33 C small.
  34
  35 defframe(PARAM_SIZE,12)
  36 defframe(PARAM_SRC, 8)
  37 defframe(PARAM_DST, 4)
  38
  39         TEXT
  40         ALIGN(8)
  41 PROLOGUE(mpn_sqr_basecase)
  42 deflit(`FRAME',0)
  43
  44         movl    PARAM_SIZE, %edx
  45         movl    PARAM_SRC, %eax
  46
  47         cmpl    $2, %edx
  48         movl    PARAM_DST, %ecx
  49
  50         je      L(two_limbs)
  51
  52         movl    (%eax), %eax
  53         ja      L(three_or_more)
  54
  55 C -----------------------------------------------------------------------------
  56 C one limb only
  57         C eax   src
  58         C ebx
  59         C ecx   dst
  60         C edx
  61
  62         mull    %eax
  63
  64         movl    %eax, (%ecx)
  65         movl    %edx, 4(%ecx)
  66
  67         ret
  68
  69 C -----------------------------------------------------------------------------
  70         ALIGN(8)
  71 L(two_limbs):
  72         C eax   src
  73         C ebx
  74         C ecx   dst
  75         C edx   size
  76
  77         pushl   %ebp
  78         pushl   %edi
  79
  80         pushl   %esi
  81         pushl   %ebx
  82
  83         movl    %eax, %ebx
  84         movl    (%eax), %eax
  85
  86         mull    %eax            C src[0]^2
  87
  88         movl    %eax, (%ecx)    C dst[0]
  89         movl    %edx, %esi      C dst[1]
  90
  91         movl    4(%ebx), %eax
  92
  93         mull    %eax            C src[1]^2
  94
  95         movl    %eax, %edi      C dst[2]
  96         movl    %edx, %ebp      C dst[3]
  97
  98         movl    (%ebx), %eax
  99
 100         mull    4(%ebx)         C src[0]*src[1]
 101
 102         addl    %eax, %esi
 103         popl    %ebx
 104
 105         adcl    %edx, %edi
 106
 107         adcl    $0, %ebp
 108         addl    %esi, %eax
 109
 110         adcl    %edi, %edx
 111         movl    %eax, 4(%ecx)
 112
 113         adcl    $0, %ebp
 114         popl    %esi
 115
 116         movl    %edx, 8(%ecx)
 117         movl    %ebp, 12(%ecx)
 118
 119         popl    %edi
 120         popl    %ebp
 121
 122         ret
 123
 124
 125 C -----------------------------------------------------------------------------
 126         ALIGN(8)
 127 L(three_or_more):
 128         C eax   src low limb
 129         C ebx
 130         C ecx   dst
 131         C edx   size
 132
 133         cmpl    $4, %edx
 134         pushl   %ebx
 135 deflit(`FRAME',4)
 136
 137         movl    PARAM_SRC, %ebx
 138         jae     L(four_or_more)
 139
 140
 141 C -----------------------------------------------------------------------------
 142 C three limbs
 143         C eax   src low limb
 144         C ebx   src
 145         C ecx   dst
 146         C edx   size
 147
 148         pushl   %ebp
 149         pushl   %edi
 150
 151         mull    %eax            C src[0] ^ 2
 152
 153         movl    %eax, (%ecx)
 154         movl    %edx, 4(%ecx)
 155
 156         movl    4(%ebx), %eax
 157         xorl    %ebp, %ebp
 158
 159         mull    %eax            C src[1] ^ 2
 160
 161         movl    %eax, 8(%ecx)
 162         movl    %edx, 12(%ecx)
 163
 164         movl    8(%ebx), %eax
 165         pushl   %esi            C risk of cache bank clash
 166
 167         mull    %eax            C src[2] ^ 2
 168
 169         movl    %eax, 16(%ecx)
 170         movl    %edx, 20(%ecx)
 171
 172         movl    (%ebx), %eax
 173
 174         mull    4(%ebx)         C src[0] * src[1]
 175
 176         movl    %eax, %esi
 177         movl    %edx, %edi
 178
 179         movl    (%ebx), %eax
 180
 181         mull    8(%ebx)         C src[0] * src[2]
 182
 183         addl    %eax, %edi
 184         movl    %edx, %ebp
 185
 186         adcl    $0, %ebp
 187         movl    4(%ebx), %eax
 188
 189         mull    8(%ebx)         C src[1] * src[2]
 190
 191         xorl    %ebx, %ebx
 192         addl    %eax, %ebp
 193
 194         C eax
 195         C ebx   zero, will be dst[5]
 196         C ecx   dst
 197         C edx   dst[4]
 198         C esi   dst[1]
 199         C edi   dst[2]
 200         C ebp   dst[3]
 201
 202         adcl    $0, %edx
 203         addl    %esi, %esi
 204
 205         adcl    %edi, %edi
 206
 207         adcl    %ebp, %ebp
 208
 209         adcl    %edx, %edx
 210         movl    4(%ecx), %eax
 211
 212         adcl    $0, %ebx
 213         addl    %esi, %eax
 214
 215         movl    %eax, 4(%ecx)
 216         movl    8(%ecx), %eax
 217
 218         adcl    %edi, %eax
 219         movl    12(%ecx), %esi
 220
 221         adcl    %ebp, %esi
 222         movl    16(%ecx), %edi
 223
 224         movl    %eax, 8(%ecx)
 225         movl    %esi, 12(%ecx)
 226
 227         adcl    %edx, %edi
 228         popl    %esi
 229
 230         movl    20(%ecx), %eax
 231         movl    %edi, 16(%ecx)
 232
 233         popl    %edi
 234         popl    %ebp
 235
 236         adcl    %ebx, %eax      C no carry out of this
 237         popl    %ebx
 238
 239         movl    %eax, 20(%ecx)
 240
 241         ret
 242
 243
 244 C -----------------------------------------------------------------------------
 245         ALIGN(8)
 246 L(four_or_more):
 247         C eax   src low limb
 248         C ebx   src
 249         C ecx   dst
 250         C edx   size
 251         C esi
 252         C edi
 253         C ebp
 254         C
 255         C First multiply src[0]*src[1..size-1] and store at dst[1..size].
 256
 257 deflit(`FRAME',4)
 258
 259         pushl   %edi
 260 FRAME_pushl()
 261         pushl   %esi
 262 FRAME_pushl()
 263
 264         pushl   %ebp
 265 FRAME_pushl()
 266         leal    (%ecx,%edx,4), %edi     C dst end of this mul1
 267
 268         leal    (%ebx,%edx,4), %esi     C src end
 269         movl    %ebx, %ebp              C src
 270
 271         negl    %edx                    C -size
 272         xorl    %ebx, %ebx              C clear carry limb and carry flag
 273
 274         leal    1(%edx), %ecx           C -(size-1)
 275
 276 L(mul1):
 277         C eax   scratch
 278         C ebx   carry
 279         C ecx   counter, negative
 280         C edx   scratch
 281         C esi   &src[size]
 282         C edi   &dst[size]
 283         C ebp   src
 284
 285         adcl    $0, %ebx
 286         movl    (%esi,%ecx,4), %eax
 287
 288         mull    (%ebp)
 289
 290         addl    %eax, %ebx
 291
 292         movl    %ebx, (%edi,%ecx,4)
 293         incl    %ecx
 294
 295         movl    %edx, %ebx
 296         jnz     L(mul1)
 297
 298
 299         C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for
 300         C n=1..size-2.
 301         C
 302         C The last two products, which are the end corner of the product
 303         C triangle, are handled separately to save looping overhead.  These
 304         C are src[size-3]*src[size-2,size-1] and src[size-2]*src[size-1].
 305         C If size is 4 then it's only these that need to be done.
 306         C
 307         C In the outer loop %esi is a constant, and %edi just advances by 1
 308         C limb each time.  The size of the operation decreases by 1 limb
 309         C each time.
 310
 311         C eax
 312         C ebx   carry (needing carry flag added)
 313         C ecx
 314         C edx
 315         C esi   &src[size]
 316         C edi   &dst[size]
 317         C ebp
 318
 319         adcl    $0, %ebx
 320         movl    PARAM_SIZE, %edx
 321
 322         movl    %ebx, (%edi)
 323         subl    $4, %edx
 324
 325         negl    %edx
 326         jz      L(corner)
 327
 328
 329 L(outer):
 330         C ebx   previous carry limb to store
 331         C edx   outer loop counter (negative)
 332         C esi   &src[size]
 333         C edi   dst, pointing at stored carry limb of previous loop
 334
 335         pushl   %edx                    C new outer loop counter
 336         leal    -2(%edx), %ecx
 337
 338         movl    %ebx, (%edi)
 339         addl    $4, %edi
 340
 341         addl    $4, %ebp
 342         xorl    %ebx, %ebx              C initial carry limb, clear carry flag
 343
 344 L(inner):
 345         C eax   scratch
 346         C ebx   carry (needing carry flag added)
 347         C ecx   counter, negative
 348         C edx   scratch
 349         C esi   &src[size]
 350         C edi   dst end of this addmul
 351         C ebp   &src[j]
 352
 353         adcl    $0, %ebx
 354         movl    (%esi,%ecx,4), %eax
 355
 356         mull    (%ebp)
 357
 358         addl    %ebx, %eax
 359         movl    (%edi,%ecx,4), %ebx
 360
 361         adcl    $0, %edx
 362         addl    %eax, %ebx
 363
 364         movl    %ebx, (%edi,%ecx,4)
 365         incl    %ecx
 366
 367         movl    %edx, %ebx
 368         jnz     L(inner)
 369
 370
 371         adcl    $0, %ebx
 372         popl    %edx            C outer loop counter
 373
 374         incl    %edx
 375         jnz     L(outer)
 376
 377
 378         movl    %ebx, (%edi)
 379
 380 L(corner):
 381         C esi   &src[size]
 382         C edi   &dst[2*size-4]
 383
 384         movl    -8(%esi), %eax
 385         movl    -4(%edi), %ebx          C risk of data cache bank clash here
 386
 387         mull    -12(%esi)               C src[size-2]*src[size-3]
 388
 389         addl    %eax, %ebx
 390         movl    %edx, %ecx
 391
 392         adcl    $0, %ecx
 393         movl    -4(%esi), %eax
 394
 395         mull    -12(%esi)               C src[size-1]*src[size-3]
 396
 397         addl    %ecx, %eax
 398         movl    (%edi), %ecx
 399
 400         adcl    $0, %edx
 401         movl    %ebx, -4(%edi)
 402
 403         addl    %eax, %ecx
 404         movl    %edx, %ebx
 405
 406         adcl    $0, %ebx
 407         movl    -4(%esi), %eax
 408
 409         mull    -8(%esi)                C src[size-1]*src[size-2]
 410
 411         movl    %ecx, (%edi)
 412         addl    %eax, %ebx
 413
 414         adcl    $0, %edx
 415         movl    PARAM_SIZE, %eax
 416
 417         negl    %eax
 418         movl    %ebx, 4(%edi)
 419
 420         addl    $1, %eax                C -(size-1) and clear carry
 421         movl    %edx, 8(%edi)
 422
 423
 424 C -----------------------------------------------------------------------------
 425 C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
 426
 427 L(lshift):
 428         C eax   counter, negative
 429         C ebx   next limb
 430         C ecx
 431         C edx
 432         C esi
 433         C edi   &dst[2*size-4]
 434         C ebp
 435
 436         movl    12(%edi,%eax,8), %ebx
 437
 438         rcll    %ebx
 439         movl    16(%edi,%eax,8), %ecx
 440
 441         rcll    %ecx
 442         movl    %ebx, 12(%edi,%eax,8)
 443
 444         movl    %ecx, 16(%edi,%eax,8)
 445         incl    %eax
 446
 447         jnz     L(lshift)
 448
 449
 450         adcl    %eax, %eax              C high bit out
 451         movl    PARAM_SRC, %esi
 452
 453         movl    PARAM_SIZE, %ecx        C risk of cache bank clash
 454         movl    %eax, 12(%edi)          C dst most significant limb
 455
 456
 457 C -----------------------------------------------------------------------------
 458 C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ...,
 459 C src[size-1]^2.  dst[0] hasn't yet been set at all yet, and just gets the
 460 C low limb of src[0]^2.
 461
 462         movl    (%esi), %eax            C src[0]
 463         leal    (%esi,%ecx,4), %esi     C src end
 464
 465         negl    %ecx
 466
 467         mull    %eax
 468
 469         movl    %eax, 16(%edi,%ecx,8)   C dst[0]
 470         movl    %edx, %ebx
 471
 472         addl    $1, %ecx                C size-1 and clear carry
 473
 474 L(diag):
 475         C eax   scratch (low product)
 476         C ebx   carry limb
 477         C ecx   counter, negative
 478         C edx   scratch (high product)
 479         C esi   &src[size]
 480         C edi   &dst[2*size-4]
 481         C ebp   scratch (fetched dst limbs)
 482
 483         movl    (%esi,%ecx,4), %eax
 484         adcl    $0, %ebx
 485
 486         mull    %eax
 487
 488         movl    16-4(%edi,%ecx,8), %ebp
 489
 490         addl    %ebp, %ebx
 491         movl    16(%edi,%ecx,8), %ebp
 492
 493         adcl    %eax, %ebp
 494         movl    %ebx, 16-4(%edi,%ecx,8)
 495
 496         movl    %ebp, 16(%edi,%ecx,8)
 497         incl    %ecx
 498
 499         movl    %edx, %ebx
 500         jnz     L(diag)
 501
 502
 503         adcl    $0, %edx
 504         movl    16-4(%edi), %eax        C dst most significant limb
 505
 506         addl    %eax, %edx
 507         popl    %ebp
 508
 509         movl    %edx, 16-4(%edi)
 510         popl    %esi            C risk of cache bank clash
 511
 512         popl    %edi
 513         popl    %ebx
 514
 515         ret
 516
 517 EPILOGUE()