mpn/sparc32/v9/submul_1.asm

   1 dnl  SPARC v9 32-bit mpn_submul_1 -- Multiply a limb vector with a limb and
   2 dnl  subtract the result from a second limb vector.
   3
   4 dnl  Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc.
   5
   6 dnl  This file is part of the GNU MP Library.
   7
   8 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   9 dnl  it under the terms of the GNU Lesser General Public License as published
  10 dnl  by the Free Software Foundation; either version 3 of the License, or (at
  11 dnl  your option) any later version.
  12
  13 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  14 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  15 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
  16 dnl  License for more details.
  17
  18 dnl  You should have received a copy of the GNU Lesser General Public License
  19 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
  20
  21 include(`../config.m4')
  22
  23 C Algorithm: We use two floating-point multiplies per limb product, with the
  24 C invariant v operand split into two 16-bit pieces, and the u operand split
  25 C into 32-bit pieces.  We convert the two 48-bit products and transfer them to
  26 C the integer unit.
  27
  28 C                  cycles/limb
  29 C UltraSPARC 1&2:     6.5
  30 C UltraSPARC 3:       ?
  31
  32 C Possible optimizations:
  33 C   1. Combine 32-bit memory operations into 64-bit operations.  Since we're
  34 C      memory bandwidth limited, this could save 1.5 cycles/limb.
  35 C   2. Unroll the inner loop.  Since we already use alternate temporary areas,
  36 C      it is very straightforward to unroll, using an exit branch midways.
  37 C      Unrolling would allow deeper scheduling which could improve speed for L2
  38 C      cache case.
  39 C   3. For mpn_mul_1: Use more alternating temp areas.  The std'es and ldx'es
  40 C      aren't sufficiently apart-scheduled with just two temp areas.
  41 C   4. Specialize for particular v values.  If its upper 16 bits are zero, we
  42 C      could save many operations.
  43
  44 C INPUT PARAMETERS
  45 C rp    o0
  46 C up    o1
  47 C n     o2
  48 C v     o3
  49
  50 define(`FSIZE',224)                     C bytes by which %sp is lowered below
  51
  52 ASM_START()
     C mpn_submul_1 -- multiply the n-limb vector at up by the limb v and subtract
     C the product from the n-limb vector at rp, storing the result back at rp.
     C
     C Arguments are read from %o0 (rp), %o1 (up), %o2 (n) and %o3 (v).  No register
     C window is saved; the routine lowers %sp by FSIZE on entry and restores it in
     C the delay slot of the final retl.  The value left in %o0 is the negated high
     C half of the last limb's 64-bit difference.
     C
     C Data flow per limb:
     C   %f6 / %f8   v & 0xffff and v >> 16, each converted to double (invariant)
     C   %f10:%f11   up[i] loaded into %f11 with %f10 zero, converted into %f2
     C   %f4 / %f16  up[i] * (v & 0xffff) and up[i] * (v >> 16)
     C   %g1 / %g2   the same two products as integers (p0, p16), passed through
     C               the scratch area addressed by %o5
     C   %g3         running carry/borrow, %g5 = rp[i], %g4 = working difference
     C
     C The code before .Loop starts the software pipeline for n = 1, 2, 3, 4 and
     C n >= 5; the code from .L5 to .L1 drains it.
  53 PROLOGUE(mpn_submul_1)
  54         add     %sp, -FSIZE, %sp        C lower %sp by FSIZE (no save/restore)
  55         sethi   %hi(0xffff), %g1
  56         srl     %o3, 16, %g2            C g2 = v >> 16
  57         or      %g1, %lo(0xffff), %g1   C g1 = 0xffff
  58         and     %o3, %g1, %g1           C g1 = v & 0xffff
  59         stx     %g1, [%sp+104]          C pass both pieces of v to the FPU
  60         stx     %g2, [%sp+112]          C   through memory as 64-bit integers
  61         ldd     [%sp+104], %f6
  62         ldd     [%sp+112], %f8
  63         fxtod   %f6, %f6                C f6 = (double) (v & 0xffff)
  64         fxtod   %f8, %f8                C f8 = (double) (v >> 16)
  65         ld      [%sp+104], %f10         C zero f10 (first word of the 64-bit v & 0xffff)
  66
  67         mov     0, %g3                  C cy = 0
  68
  69 define(`fanop', `fitod %f18, %f0')      C  A quasi nop running in the FA pipe
  70
  71         add     %sp, 160, %o5           C point in scratch area
  72         and     %o5, -32, %o5           C align at 0 (mod 32) in scratch area
  73
  74         subcc   %o2, 1, %o2             C n--; flags tested by the bne below
  75         ld      [%o1], %f11             C read up[i]
  76         add     %o1, 4, %o1             C up++
  77         bne,pt  %icc, .L_two_or_more
  78         fxtod   %f10, %f2               C (delay slot) f2 = (double) up[i]
  79
     C n == 1: form both products, move them to %g2/%g1 and enter the tail at .L1
  80         fmuld   %f2, %f8, %f16
  81         fmuld   %f2, %f6, %f4
  82         fdtox   %f16, %f14
  83         fdtox   %f4, %f12
  84         std     %f14, [%o5+16]
  85         std     %f12, [%o5+24]
  86         ldx     [%o5+16], %g2           C p16
  87         ldx     [%o5+24], %g1           C p0
  88         lduw    [%o0], %g5              C read rp[i]
  89         b       .L1
  90         add     %o0, -16, %o0           C (delay slot) bias rp so .L1 stores at [%o0+16]
  91
  92         .align  16
  93 .L_two_or_more:
  94         subcc   %o2, 1, %o2
  95         ld      [%o1], %f11             C read up[i]
  96         fmuld   %f2, %f8, %f16
  97         fmuld   %f2, %f6, %f4
  98         add     %o1, 4, %o1             C up++
  99         bne,pt  %icc, .L_three_or_more
 100         fxtod   %f10, %f2               C (delay slot) f2 = (double) up[i]
 101
     C n == 2: finish both limbs' products, then enter the tail at .L2
 102         fdtox   %f16, %f14
 103         fdtox   %f4, %f12
 104         std     %f14, [%o5+16]
 105         fmuld   %f2, %f8, %f16
 106         std     %f12, [%o5+24]
 107         fmuld   %f2, %f6, %f4
 108         fdtox   %f16, %f14
 109         fdtox   %f4, %f12
 110         std     %f14, [%o5+0]
 111         std     %f12, [%o5+8]
 112         lduw    [%o0], %g5              C read rp[i]
 113         ldx     [%o5+16], %g2           C p16
 114         ldx     [%o5+24], %g1           C p0
 115         b       .L2
 116         add     %o0, -12, %o0           C (delay slot) bias rp so .L2 stores at [%o0+12]
 117
 118         .align  16
 119 .L_three_or_more:
 120         subcc   %o2, 1, %o2
 121         ld      [%o1], %f11             C read up[i]
 122         fdtox   %f16, %f14
 123         fdtox   %f4, %f12
 124         std     %f14, [%o5+16]
 125         fmuld   %f2, %f8, %f16
 126         std     %f12, [%o5+24]
 127         fmuld   %f2, %f6, %f4
 128         add     %o1, 4, %o1             C up++
 129         bne,pt  %icc, .L_four_or_more
 130         fxtod   %f10, %f2               C (delay slot) f2 = (double) up[i]
 131
     C n == 3: finish the remaining products, then enter the tail at .L3
 132         fdtox   %f16, %f14
 133         fdtox   %f4, %f12
 134         std     %f14, [%o5+0]
 135         fmuld   %f2, %f8, %f16
 136         std     %f12, [%o5+8]
 137         fmuld   %f2, %f6, %f4
 138         fdtox   %f16, %f14
 139         ldx     [%o5+16], %g2           C p16
 140         fdtox   %f4, %f12
 141         ldx     [%o5+24], %g1           C p0
 142         std     %f14, [%o5+16]
 143         std     %f12, [%o5+24]
 144         lduw    [%o0], %g5              C read rp[i]
 145         b       .L3
 146         add     %o0, -8, %o0            C (delay slot) bias rp so .L3 stores at [%o0+8]
 147
 148         .align  16
 149 .L_four_or_more:
 150         subcc   %o2, 1, %o2
 151         ld      [%o1], %f11             C read up[i]
 152         fdtox   %f16, %f14
 153         fdtox   %f4, %f12
 154         std     %f14, [%o5+0]
 155         fmuld   %f2, %f8, %f16
 156         std     %f12, [%o5+8]
 157         fmuld   %f2, %f6, %f4
 158         add     %o1, 4, %o1             C up++
 159         bne,pt  %icc, .L_five_or_more
 160         fxtod   %f10, %f2               C (delay slot) f2 = (double) up[i]
 161
     C n == 4: start the last limb's products, then enter the tail at .L4
 162         fdtox   %f16, %f14
 163         ldx     [%o5+16], %g2           C p16
 164         fdtox   %f4, %f12
 165         ldx     [%o5+24], %g1           C p0
 166         std     %f14, [%o5+16]
 167         fmuld   %f2, %f8, %f16
 168         std     %f12, [%o5+24]
 169         fmuld   %f2, %f6, %f4
 170         add     %o1, 4, %o1             C up++ (up is not read again on this path)
 171         lduw    [%o0], %g5              C read rp[i]
 172         b       .L4
 173         add     %o0, -4, %o0            C (delay slot) bias rp so .L4 stores at [%o0+4]
 174
 175         .align  16
 176 .L_five_or_more:
 177         subcc   %o2, 1, %o2
 178         ld      [%o1], %f11             C read up[i]
 179         fdtox   %f16, %f14
 180         ldx     [%o5+16], %g2           C p16
 181         fdtox   %f4, %f12
 182         ldx     [%o5+24], %g1           C p0
 183         std     %f14, [%o5+16]
 184         fmuld   %f2, %f8, %f16
 185         std     %f12, [%o5+24]
 186         fmuld   %f2, %f6, %f4
 187         add     %o1, 4, %o1             C up++
 188         lduw    [%o0], %g5              C read rp[i]
 189         bne,pt  %icc, .Loop
 190         fxtod   %f10, %f2               C (delay slot) f2 = (double) up[i]
 191         b,a     .L5                     C n == 5: skip the loop, drain from .L5
 192
 193 C BEGIN MAIN LOOP
 194         .align 16
 195 C -- 0
 196 .Loop:  sub     %g0, %g3, %g3           C cy = 0 - cy
 197         subcc   %o2, 1, %o2
 198         ld      [%o1], %f11             C read up[i]
 199         fdtox   %f16, %f14
 200 C -- 1
 201         sllx    %g2, 16, %g4            C (p16 << 16)
 202         add     %o0, 4, %o0             C rp++
 203         ldx     [%o5+0], %g2            C p16
 204         fdtox   %f4, %f12
 205 C -- 2
 206         srl     %g3, 0, %g3             C zero most significant 32 bits
 207         add     %g1, %g4, %g4           C p = p0 + (p16 << 16)
 208         ldx     [%o5+8], %g1            C p0
 209         fanop
 210 C -- 3
 211         nop
 212         add     %g3, %g4, %g4           C p += cy
 213         std     %f14, [%o5+0]
 214         fmuld   %f2, %f8, %f16
 215 C -- 4
 216         nop
 217         sub     %g5, %g4, %g4           C p = rp[i] - p
 218         std     %f12, [%o5+8]
 219         fmuld   %f2, %f6, %f4
 220 C -- 5
 221         xor     %o5, 16, %o5            C alternate scratch variables
 222         add     %o1, 4, %o1             C up++
 223         stw     %g4, [%o0-4]            C store low 32 bits of the difference
 224         fanop
 225 C -- 6
 226         srlx    %g4, 32, %g3            C new cy = high 32 bits (negated at loop top)
 227         lduw    [%o0], %g5              C read rp[i]
 228         bne,pt  %icc, .Loop
 229         fxtod   %f10, %f2               C (delay slot) f2 = (double) up[i]
 230 C END MAIN LOOP
 231
     C Wind-down: .L5 ... .L1 each finish one limb; the feed-in code above enters
     C at the label matching the number of limbs still in flight.
 232 .L5:    sub     %g0, %g3, %g3           C cy = 0 - cy
 233         fdtox   %f16, %f14
 234         sllx    %g2, 16, %g4            C (p16 << 16)
 235         ldx     [%o5+0], %g2            C p16
 236         fdtox   %f4, %f12
 237         srl     %g3, 0, %g3             C zero most significant 32 bits
 238         add     %g1, %g4, %g4           C p = p0 + (p16 << 16)
 239         ldx     [%o5+8], %g1            C p0
 240         add     %g4, %g3, %g4           C p += cy
 241         std     %f14, [%o5+0]
 242         fmuld   %f2, %f8, %f16
 243         sub     %g5, %g4, %g4           C p = rp[i] - p
 244         std     %f12, [%o5+8]
 245         fmuld   %f2, %f6, %f4
 246         xor     %o5, 16, %o5            C alternate scratch variables
 247         stw     %g4, [%o0+0]            C store low 32 bits of the difference
 248         srlx    %g4, 32, %g3            C new cy = high 32 bits (negated next)
 249         lduw    [%o0+4], %g5            C read rp[i]
 250
 251         sub     %g0, %g3, %g3           C cy = 0 - cy
 252 .L4:    fdtox   %f16, %f14
 253         sllx    %g2, 16, %g4            C (p16 << 16)
 254         ldx     [%o5+0], %g2            C p16
 255         fdtox   %f4, %f12
 256         srl     %g3, 0, %g3             C zero most significant 32 bits
 257         add     %g1, %g4, %g4           C p = p0 + (p16 << 16)
 258         ldx     [%o5+8], %g1            C p0
 259         add     %g3, %g4, %g4           C p += cy
 260         std     %f14, [%o5+0]
 261         sub     %g5, %g4, %g4           C p = rp[i] - p
 262         std     %f12, [%o5+8]
 263         xor     %o5, 16, %o5            C alternate scratch variables
 264         stw     %g4, [%o0+4]            C store low 32 bits of the difference
 265         srlx    %g4, 32, %g3            C new cy = high 32 bits (negated next)
 266         lduw    [%o0+8], %g5            C read rp[i]
 267
 268         sub     %g0, %g3, %g3           C cy = 0 - cy
 269 .L3:    sllx    %g2, 16, %g4            C (p16 << 16)
 270         ldx     [%o5+0], %g2            C p16
 271         srl     %g3, 0, %g3             C zero most significant 32 bits
 272         add     %g1, %g4, %g4           C p = p0 + (p16 << 16)
 273         ldx     [%o5+8], %g1            C p0
 274         add     %g3, %g4, %g4           C p += cy
 275         sub     %g5, %g4, %g4           C p = rp[i] - p
 276         xor     %o5, 16, %o5            C alternate scratch variables
 277         stw     %g4, [%o0+8]            C store low 32 bits of the difference
 278         srlx    %g4, 32, %g3            C new cy = high 32 bits (negated next)
 279         lduw    [%o0+12], %g5           C read rp[i]
 280
 281         sub     %g0, %g3, %g3           C cy = 0 - cy
 282 .L2:    sllx    %g2, 16, %g4            C (p16 << 16)
 283         ldx     [%o5+0], %g2            C p16
 284         srl     %g3, 0, %g3             C zero most significant 32 bits
 285         add     %g1, %g4, %g4           C p = p0 + (p16 << 16)
 286         ldx     [%o5+8], %g1            C p0
 287         add     %g3, %g4, %g4           C p += cy
 288         sub     %g5, %g4, %g4           C p = rp[i] - p
 289         stw     %g4, [%o0+12]           C store low 32 bits of the difference
 290         srlx    %g4, 32, %g3            C new cy = high 32 bits (negated next)
 291         lduw    [%o0+16], %g5           C read rp[i]
 292
 293         sub     %g0, %g3, %g3           C cy = 0 - cy
 294 .L1:    sllx    %g2, 16, %g4            C (p16 << 16)
 295         srl     %g3, 0, %g3             C zero most significant 32 bits
 296         add     %g1, %g4, %g4           C p = p0 + (p16 << 16)
 297         add     %g3, %g4, %g4           C p += cy
 298         sub     %g5, %g4, %g4           C p = rp[i] - p
 299         stw     %g4, [%o0+16]           C store low 32 bits of the last difference
 300         srlx    %g4, 32, %g3            C high 32 bits of the last difference
 301
 302         sub     %g0, %g3, %o0           C return value = 0 - (high 32 bits)
 303         retl
 304         sub     %sp, -FSIZE, %sp        C (delay slot) restore %sp
 305 EPILOGUE(mpn_submul_1)