mpn/sparc32/v9/sqr_diagonal.asm

   1 dnl  SPARC v9 32-bit mpn_sqr_diagonal.
   2
   3 dnl  Copyright 2001, 2003 Free Software Foundation, Inc.
   4
   5 dnl  This file is part of the GNU MP Library.
   6
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   8 dnl  it under the terms of the GNU Lesser General Public License as published
   9 dnl  by the Free Software Foundation; either version 3 of the License, or (at
  10 dnl  your option) any later version.
  11
  12 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  13 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  14 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
  15 dnl  License for more details.
  16
  17 dnl  You should have received a copy of the GNU Lesser General Public License
  18 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
  19
  20
  21 include(`../config.m4')
  22
  23 C INPUT PARAMETERS
  24 C rp    i0
  25 C up    i1
  26 C n     i2
  27
   28 C This code uses a very deep software pipeline, due to the need for moving data
   29 C back and forth between the integer registers and floating-point registers.
  30 C
  31 C A VIS variant of this code would make the pipeline less deep, since the
  32 C masking now done in the integer unit could take place in the floating-point
  33 C unit using the FAND instruction.  It would be possible to save several cycles
  34 C too.
  35 C
  36 C On UltraSPARC 1 and 2, this code runs at 11 cycles/limb from the Dcache and
  37 C not much slower from the Ecache.  It would perhaps be possible to shave off
   38 C one cycle, but not easily.  We cannot do better than 10 cycles/limb with the
   39 C instructions used, since we have 10 memory operations per limb.  But a VIS
  40 C variant could run three cycles faster than the corresponding non-VIS code.
  41
  42 C This is non-pipelined code showing the algorithm:
  43 C
  44 C .Loop:
  45 C       lduw    [up+0],%g4              C 00000000hhhhllll
  46 C       sllx    %g4,16,%g3              C 0000hhhhllll0000
  47 C       or      %g3,%g4,%g2             C 0000hhhhXXXXllll
  48 C       andn    %g2,%g5,%g2             C 0000hhhh0000llll
  49 C       stx     %g2,[%fp+80]
  50 C       ldd     [%fp+80],%f0
  51 C       fitod   %f0,%f4                 C hi16
  52 C       fitod   %f1,%f6                 C lo16
  53 C       ld      [up+0],%f9
  54 C       fxtod   %f8,%f2
  55 C       fmuld   %f2,%f4,%f4
  56 C       fmuld   %f2,%f6,%f6
  57 C       fdtox   %f4,%f4
  58 C       fdtox   %f6,%f6
  59 C       std     %f4,[%fp-24]
  60 C       std     %f6,[%fp-16]
  61 C       ldx     [%fp-24],%g2
  62 C       ldx     [%fp-16],%g1
  63 C       sllx    %g2,16,%g2
  64 C       add     %g2,%g1,%g1
  65 C       stw     %g1,[rp+0]
  66 C       srlx    %g1,32,%l0
  67 C       stw     %l0,[rp+4]
  68 C       add     up,4,up
  69 C       subcc   n,1,n
  70 C       bne,pt  %icc,.Loop
  71 C       add     rp,8,rp
  72
   73 define(`fanop',`fitod %f12,%f10')       dnl  A quasi nop running in the FA pipe
      C fanop reads %f12 and writes %f10; no instruction in mpn_sqr_diagonal below
      C reads %f10, so the result is discarded.
   74
   75 ASM_START()
   76
   77         TEXT
   78         ALIGN(4)
      C A 32-bit zero word kept in the text section.  mpn_sqr_diagonal loads it
      C into %f8 so that the register pair %f8:%f9 holds a limb zero-extended to
      C 64 bits, ready for fxtod.
   79 .Lnoll:
   80         .word   0
  81
   82 PROLOGUE(mpn_sqr_diagonal)
      C Squares each of the n 32-bit limbs at up and stores every 64-bit square
      C as two words at rp: low word first, high word second.
      C
      C Each limb is split into 16-bit halves in the integer unit; the limb and
      C both halves are converted to double and multiplied, giving p16 = limb*hi16
      C and p0 = limb*lo16.  Both are below 2^48 and so exact in a double.  The
      C square is rebuilt in the integer unit as (p16 << 16) + p0.
      C
      C Scratch memory: [%fp+80] and [%fp+72] alternately hold a split limb;
      C [%fp-24],[%fp-16] and [%fp-40],[%fp-32] alternately hold a p16,p0 pair.
      C
      C n is decremented and tested only after the first limb has been read, so
      C n = 0 is not handled here.
   83         save    %sp,-256,%sp            C new register window, 256-byte frame
   84
      C Load the zero word at .Lnoll into %f8: PC-relative when PIC is defined,
      C through an absolute %hi/%lo address otherwise.
   85 ifdef(`PIC',
   86 `.Lpc:  rd      %pc,%o7
   87         ld      [%o7+.Lnoll-.Lpc],%f8',
   88 `       sethi   %hi(.Lnoll),%g1
   89         ld      [%g1+%lo(.Lnoll)],%f8')
   90
   91         sethi   %hi(0xffff0000),%g5     C %g5 = mask for bits 16..31
   92         add     %i1,-8,%i1              C bias up so [%i1+8] is the first limb
   93
      C Split limb 0.  From here on the integer lduw at [%i1+8] reads further
      C ahead in up than the floating-point ld at [%i1].
   94         lduw    [%i1+8],%g4
   95         add     %i1,4,%i1               C s1_ptr++
   96         sllx    %g4,16,%g3              C 0000hhhhllll0000
   97         or      %g3,%g4,%g2             C 0000hhhhXXXXllll
   98         subcc   %i2,1,%i2
   99         bne,pt  %icc,.L_grt_1
  100         andn    %g2,%g5,%g2             C 0000hhhh0000llll
  101
      C n == 1: convert and multiply the only limb and leave its two partial
      C products in [%fp-24] (p16) and [%fp-16] (p0) for .L1 to combine.
  102         add     %i1,4,%i1               C s1_ptr++
  103         stx     %g2,[%fp+80]            C split limb to scratch memory
  104         ld      [%i1],%f9               C raw limb into low half of %f8:%f9
  105         ldd     [%fp+80],%f0            C %f0 = hi16, %f1 = lo16
  106         fxtod   %f8,%f2                 C limb as double
  107         fitod   %f0,%f4                 C hi16 as double
  108         fitod   %f1,%f6                 C lo16 as double
  109         fmuld   %f2,%f4,%f4             C limb * hi16
  110         fmuld   %f2,%f6,%f6             C limb * lo16
  111         fdtox   %f4,%f4
  112         fdtox   %f6,%f6
  113         std     %f4,[%fp-24]            C p16
  114         std     %f6,[%fp-16]            C p0
  115
      C All four wind-down pointers are set here, although .L1 only reads %l4.
  116         add     %fp, 80, %l3
  117         add     %fp, -24, %l4
  118         add     %fp, 72, %l5
  119         b       .L1
  120         add     %fp, -40, %l6
  121
      C n > 1: park split limb 0 in [%fp+80], then split limb 1.
  122 .L_grt_1:
  123         stx     %g2,[%fp+80]
  124         lduw    [%i1+8],%g4
  125         add     %i1,4,%i1               C s1_ptr++
  126         sllx    %g4,16,%g3              C 0000hhhhllll0000
  127         or      %g3,%g4,%g2             C 0000hhhhXXXXllll
  128         subcc   %i2,1,%i2
  129         bne,pt  %icc,.L_grt_2
  130         andn    %g2,%g5,%g2             C 0000hhhh0000llll
  131
      C n == 2: run both limbs through the floating-point steps, then enter the
      C wind-down at .L2.
  132         stx     %g2,[%fp+72]
  133         ld      [%i1],%f9
  134         add     %i1,4,%i1               C s1_ptr++
  135         ldd     [%fp+80],%f0
  136         fxtod   %f8,%f2
  137         fitod   %f0,%f4
  138         fitod   %f1,%f6
  139         fmuld   %f2,%f4,%f4
  140         ld      [%i1],%f9
  141         fmuld   %f2,%f6,%f6
  142         ldd     [%fp+72],%f0
  143         fdtox   %f4,%f4
  144         fdtox   %f6,%f6
  145         std     %f4,[%fp-24]
  146         fxtod   %f8,%f2
  147         std     %f6,[%fp-16]
  148         fitod   %f0,%f4
  149         fitod   %f1,%f6
  150         fmuld   %f2,%f4,%f4
  151         fmuld   %f2,%f6,%f6
  152         fdtox   %f4,%f4
  153
  154         add     %fp, 72, %l3
  155         add     %fp, -40, %l4
  156         add     %fp, 80, %l5
  157         b       .L2
  158         add     %fp, -24, %l6
  159
      C n > 2: park split limb 1 in [%fp+72], split limb 2 and start limb 0 in
      C the floating-point unit.
  160 .L_grt_2:
  161         stx     %g2,[%fp+72]
  162         lduw    [%i1+8],%g4
  163         ld      [%i1],%f9
  164         add     %i1,4,%i1               C s1_ptr++
  165         ldd     [%fp+80],%f0
  166         sllx    %g4,16,%g3              C 0000hhhhllll0000
  167         or      %g3,%g4,%g2             C 0000hhhhXXXXllll
  168         subcc   %i2,1,%i2
  169         fxtod   %f8,%f2
  170         bne,pt  %icc,.L_grt_3
  171         andn    %g2,%g5,%g2             C 0000hhhh0000llll
  172
      C n == 3: finish the floating-point work that is ready and enter the
      C wind-down at .L3.
  173         stx     %g2,[%fp+80]
  174         fitod   %f0,%f4
  175         fitod   %f1,%f6
  176         fmuld   %f2,%f4,%f4
  177         ld      [%i1],%f9
  178         fmuld   %f2,%f6,%f6
  179         add     %i1,4,%i1               C s1_ptr++
  180         ldd     [%fp+72],%f0
  181         fdtox   %f4,%f4
  182         fdtox   %f6,%f6
  183         std     %f4,[%fp-24]
  184         fxtod   %f8,%f2
  185         std     %f6,[%fp-16]
  186         fitod   %f0,%f4
  187         fitod   %f1,%f6
  188         fmuld   %f2,%f4,%f4
  189         ld      [%i1],%f9
  190         add     %fp, 80, %l3
  191         fmuld   %f2,%f6,%f6
  192         add     %fp, -24, %l4
  193         ldd     [%fp+80],%f0
  194         add     %fp, 72, %l5
  195         fdtox   %f4,%f4
  196         b       .L3
  197         add     %fp, -40, %l6
  198
      C n > 3: park split limb 2 in [%fp+80], split limb 3, and store the first
      C p16,p0 pair to [%fp-24],[%fp-16].
  199 .L_grt_3:
  200         stx     %g2,[%fp+80]
  201         fitod   %f0,%f4
  202         lduw    [%i1+8],%g4
  203         fitod   %f1,%f6
  204         fmuld   %f2,%f4,%f4
  205         ld      [%i1],%f9
  206         fmuld   %f2,%f6,%f6
  207         add     %i1,4,%i1               C s1_ptr++
  208         ldd     [%fp+72],%f0
  209         fdtox   %f4,%f4
  210         sllx    %g4,16,%g3              C 0000hhhhllll0000
  211         fdtox   %f6,%f6
  212         or      %g3,%g4,%g2             C 0000hhhhXXXXllll
  213         subcc   %i2,1,%i2
  214         std     %f4,[%fp-24]
  215         fxtod   %f8,%f2
  216         std     %f6,[%fp-16]
  217         bne,pt  %icc,.L_grt_4
  218         andn    %g2,%g5,%g2             C 0000hhhh0000llll
  219
      C n == 4: enter the wind-down at .L4.
  220         stx     %g2,[%fp+72]
  221         fitod   %f0,%f4
  222         fitod   %f1,%f6
  223         add     %fp, 72, %l3
  224         fmuld   %f2,%f4,%f4
  225         add     %fp, -40, %l4
  226         ld      [%i1],%f9
  227         fmuld   %f2,%f6,%f6
  228         add     %i1,4,%i1               C s1_ptr++
  229         ldd     [%fp+80],%f0
  230         add     %fp, 80, %l5
  231         fdtox   %f4,%f4
  232         b       .L4
  233         add     %fp, -24, %l6
  234
      C n > 4: last fill stage.  The second p16,p0 pair goes to
      C [%fp-40],[%fp-32].  n == 5 branches to .L5; larger n enters the loop.
  235 .L_grt_4:
  236         stx     %g2,[%fp+72]
  237         fitod   %f0,%f4
  238         lduw    [%i1+8],%g4
  239         fitod   %f1,%f6
  240         fmuld   %f2,%f4,%f4
  241         ld      [%i1],%f9
  242         fmuld   %f2,%f6,%f6
  243         add     %i1,4,%i1               C s1_ptr++
  244         ldd     [%fp+80],%f0
  245         fdtox   %f4,%f4
  246         sllx    %g4,16,%g3              C 0000hhhhllll0000
  247         fdtox   %f6,%f6
  248         or      %g3,%g4,%g2             C 0000hhhhXXXXllll
  249         subcc   %i2,1,%i2
  250         std     %f4,[%fp-40]
  251         fxtod   %f8,%f2
  252         std     %f6,[%fp-32]
  253         be,pn   %icc,.L5
  254         andn    %g2,%g5,%g2             C 0000hhhh0000llll
  255
  256         b,a     .Loop                   C annulled: no delay slot is executed
  257
      C Steady state.  Each half of the loop stores one finished square and
      C advances every pipeline stage by one limb.  The two halves are identical
      C except that they swap the scratch slots ([%fp+80] with [%fp+72], and
      C [%fp-24],[%fp-16] with [%fp-40],[%fp-32]) and differ in their exit branch.
      C NOTE(review): the nop and fanop fillers presumably pad each group marked
      C by a C --- line to a full issue group; confirm against the UltraSPARC
      C grouping rules before moving any instruction.
  258         .align  16
  259 C --- LOOP BEGIN
  260 .Loop:  nop
  261         nop
  262         stx     %g2,[%fp+80]
  263         fitod   %f0,%f4
  264 C ---
  265         nop
  266         nop
  267         lduw    [%i1+8],%g4
  268         fitod   %f1,%f6
  269 C ---
  270         nop
  271         nop
  272         ldx     [%fp-24],%g2            C p16
  273         fanop
  274 C ---
  275         nop
  276         nop
  277         ldx     [%fp-16],%g1            C p0
  278         fmuld   %f2,%f4,%f4
  279 C ---
  280         sllx    %g2,16,%g2              C align p16
  281         add     %i0,8,%i0               C res_ptr++
  282         ld      [%i1],%f9
  283         fmuld   %f2,%f6,%f6
  284 C ---
  285         add     %g2,%g1,%g1             C add p16 to p0 (ADD1)
  286         add     %i1,4,%i1               C s1_ptr++
  287         ldd     [%fp+72],%f0
  288         fanop
  289 C ---
  290         srlx    %g1,32,%l0
  291         nop
  292         stw     %g1,[%i0-8]
  293         fdtox   %f4,%f4
  294 C ---
  295         sllx    %g4,16,%g3              C 0000hhhhllll0000
  296         nop
  297         stw     %l0,[%i0-4]
  298         fdtox   %f6,%f6
  299 C ---
  300         or      %g3,%g4,%g2             C 0000hhhhXXXXllll
  301         subcc   %i2,1,%i2
  302         std     %f4,[%fp-24]
  303         fxtod   %f8,%f2
  304 C ---
  305         std     %f6,[%fp-16]
  306         andn    %g2,%g5,%g2             C 0000hhhh0000llll
  307         be,pn   %icc,.Lend
  308         fanop
  309 C ---  LOOP MIDDLE
  310         nop
  311         nop
  312         stx     %g2,[%fp+72]
  313         fitod   %f0,%f4
  314 C ---
  315         nop
  316         nop
  317         lduw    [%i1+8],%g4
  318         fitod   %f1,%f6
  319 C ---
  320         nop
  321         nop
  322         ldx     [%fp-40],%g2            C p16
  323         fanop
  324 C ---
  325         nop
  326         nop
  327         ldx     [%fp-32],%g1            C p0
  328         fmuld   %f2,%f4,%f4
  329 C ---
  330         sllx    %g2,16,%g2              C align p16
  331         add     %i0,8,%i0               C res_ptr++
  332         ld      [%i1],%f9
  333         fmuld   %f2,%f6,%f6
  334 C ---
  335         add     %g2,%g1,%g1             C add p16 to p0 (ADD1)
  336         add     %i1,4,%i1               C s1_ptr++
  337         ldd     [%fp+80],%f0
  338         fanop
  339 C ---
  340         srlx    %g1,32,%l0
  341         nop
  342         stw     %g1,[%i0-8]
  343         fdtox   %f4,%f4
  344 C ---
  345         sllx    %g4,16,%g3              C 0000hhhhllll0000
  346         nop
  347         stw     %l0,[%i0-4]
  348         fdtox   %f6,%f6
  349 C ---
  350         or      %g3,%g4,%g2             C 0000hhhhXXXXllll
  351         subcc   %i2,1,%i2
  352         std     %f4,[%fp-40]
  353         fxtod   %f8,%f2
  354 C ---
  355         std     %f6,[%fp-32]
  356         andn    %g2,%g5,%g2             C 0000hhhh0000llll
  357         bne,pt  %icc,.Loop
  358         fanop
  359 C --- LOOP END
  360
      C Exit after the second loop half, or directly from .L_grt_4 when n == 5:
      C the pending split limb goes to [%fp+80] and the oldest products are at
      C [%fp-24].
  361 .L5:    add     %fp, 80, %l3
  362         add     %fp, -24, %l4
  363         add     %fp, 72, %l5
  364         b       .Ltail
  365         add     %fp, -40, %l6
  366
      C Exit after the first loop half: same roles with the scratch slots swapped.
  367 .Lend:  add     %fp, 72, %l3
  368         add     %fp, -40, %l4
  369         add     %fp, 80, %l5
  370         add     %fp, -24, %l6
      C Wind-down.  %l3 = slot for the last split limb, %l5 = split-limb slot to
      C load next, %l4 and %l6 = the two product pairs drained alternately (p16 at
      C offset 0, p0 at offset 8).  .Ltail has five squares still to store;
      C .L4, .L3, .L2 and .L1 are entry points with 4, 3, 2 and 1 left.
  371 .Ltail: stx     %g2,[%l3]
  372         fitod   %f0,%f4
  373         fitod   %f1,%f6
  374         ldx     [%l4],%g2               C p16
  375         ldx     [%l4+8],%g1             C p0
  376         fmuld   %f2,%f4,%f4
  377         sllx    %g2,16,%g2              C align p16
  378         add     %i0,8,%i0               C res_ptr++
  379         ld      [%i1],%f9
  380         fmuld   %f2,%f6,%f6
  381         add     %g2,%g1,%g1             C add p16 to p0 (ADD1)
  382         add     %i1,4,%i1               C s1_ptr++
  383         ldd     [%l5],%f0
  384         srlx    %g1,32,%l0
  385         stw     %g1,[%i0-8]
  386         fdtox   %f4,%f4
  387         stw     %l0,[%i0-4]
  388 .L4:    fdtox   %f6,%f6
  389         std     %f4,[%l4]
  390         fxtod   %f8,%f2
  391         std     %f6,[%l4+8]
  392
  393         fitod   %f0,%f4
  394         fitod   %f1,%f6
  395         ldx     [%l6],%g2               C p16
  396         ldx     [%l6+8],%g1             C p0
  397         fmuld   %f2,%f4,%f4
  398         sllx    %g2,16,%g2              C align p16
  399         add     %i0,8,%i0               C res_ptr++
  400         ld      [%i1],%f9
  401         fmuld   %f2,%f6,%f6
  402         add     %g2,%g1,%g1             C add p16 to p0 (ADD1)
  403         ldd     [%l3],%f0
  404         srlx    %g1,32,%l0
  405         stw     %g1,[%i0-8]
  406         fdtox   %f4,%f4
  407         stw     %l0,[%i0-4]
  408 .L3:    fdtox   %f6,%f6
  409         std     %f4,[%l6]
  410         fxtod   %f8,%f2
  411         std     %f6,[%l6+8]
  412
  413         fitod   %f0,%f4
  414         fitod   %f1,%f6
  415         ldx     [%l4],%g2               C p16
  416         ldx     [%l4+8],%g1             C p0
  417         fmuld   %f2,%f4,%f4
  418         sllx    %g2,16,%g2              C align p16
  419         add     %i0,8,%i0               C res_ptr++
  420         fmuld   %f2,%f6,%f6
  421         add     %g2,%g1,%g1             C add p16 to p0 (ADD1)
  422         srlx    %g1,32,%l0
  423         stw     %g1,[%i0-8]
  424         fdtox   %f4,%f4
  425         stw     %l0,[%i0-4]
  426 .L2:    fdtox   %f6,%f6
  427         std     %f4,[%l4]
  428         std     %f6,[%l4+8]
  429
  430         ldx     [%l6],%g2               C p16
  431         ldx     [%l6+8],%g1             C p0
  432         sllx    %g2,16,%g2              C align p16
  433         add     %i0,8,%i0               C res_ptr++
  434         add     %g2,%g1,%g1             C add p16 to p0 (ADD1)
  435         srlx    %g1,32,%l0
  436         stw     %g1,[%i0-8]
  437         stw     %l0,[%i0-4]
  438
  439 .L1:    ldx     [%l4],%g2               C p16
  440         ldx     [%l4+8],%g1             C p0
  441         sllx    %g2,16,%g2              C align p16
  442         add     %i0,8,%i0               C res_ptr++
  443         add     %g2,%g1,%g1             C add p16 to p0 (ADD1)
  444         srlx    %g1,32,%l0
  445         stw     %g1,[%i0-8]
  446         stw     %l0,[%i0-4]
  447
  448         ret
  449         restore %g0,%g0,%o0             C delay slot: pop window, caller %o0 = 0
  450
  451 EPILOGUE(mpn_sqr_diagonal)