mpn/sparc32/v9/mul_1.asm

   1 dnl  SPARC v9 32-bit mpn_mul_1 -- Multiply a limb vector with a limb and store
   2 dnl  the result in a second limb vector.
   3
   4 dnl  Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc.
   5
   6 dnl  This file is part of the GNU MP Library.
   7
   8 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   9 dnl  it under the terms of the GNU Lesser General Public License as published
  10 dnl  by the Free Software Foundation; either version 3 of the License, or (at
  11 dnl  your option) any later version.
  12
  13 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  14 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  15 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
  16 dnl  License for more details.
  17
  18 dnl  You should have received a copy of the GNU Lesser General Public License
  19 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
  20
  21 include(`../config.m4')
  22
  23 C Algorithm: We use two floating-point multiplies per limb product, with the
  24 C invariant v operand split into two 16-bit pieces, and each 32-bit u limb
  25 C used whole.  We convert the two 48-bit products to 64-bit integers and
  26 C transfer them to the integer unit.
  27
  28 C                  cycles/limb
  29 C UltraSPARC 1&2:     6.5
  30 C UltraSPARC 3:       ?
  31
  32 C Possible optimizations:
  33 C   1. Combine 32-bit memory operations into 64-bit operations.  Since we're
  34 C      memory bandwidth limited, this could save 1.5 cycles/limb.
  35 C   2. Unroll the inner loop.  Since we already use alternate temporary areas,
  36 C      it is very straightforward to unroll, using an exit branch midway.
  37 C      Unrolling would allow deeper scheduling, which could improve speed for
  38 C      the L2 cache case.
  39 C   3. For mpn_mul_1: Use more alternating temp areas.  The std and ldx
  40 C      instructions are not scheduled far enough apart with just two areas.
  41 C   4. Specialize for particular v values.  If its upper 16 bits are zero, we
  42 C      could save many operations.
  43
  44 C INPUT PARAMETERS
  45 C rp    o0
  46 C up    o1
  47 C n     o2
  48 C v     o3
  49
  50 define(`FSIZE',224)                     C bytes taken from the stack on entry
  51 C mpn_mul_1: store the n low limbs of up[0..n-1] * v at rp and return the carry limb
  52 ASM_START()
  53 PROLOGUE(mpn_mul_1)
  54         add     %sp, -FSIZE, %sp        C make scratch frame; no save, args stay in o0-o3
  55         sethi   %hi(0xffff), %g1        C start building the 0xffff mask
  56         srl     %o3, 16, %g2            C g2 = upper 16 bits of v
  57         or      %g1, %lo(0xffff), %g1   C g1 = 0xffff
  58         and     %o3, %g1, %g1           C g1 = lower 16 bits of v
  59         stx     %g1, [%sp+104]          C store both halves as 64-bit integers
  60         stx     %g2, [%sp+112]          C   so they can be loaded into FP registers
  61         ldd     [%sp+104], %f6          C f6 = lower half of v (integer form)
  62         ldd     [%sp+112], %f8          C f8 = upper half of v (integer form)
  63         fxtod   %f6, %f6                C f6 = lower half of v as a double
  64         fxtod   %f8, %f8                C f8 = upper half of v as a double
  65         ld      [%sp+104], %f10         C zero f10
  66
  67         mov     0, %g3                  C cy = 0
  68
  69 define(`fanop', `fitod %f18, %f0')      C  A quasi nop running in the FA pipe
  70
  71         add     %sp, 160, %o5           C point in scratch area
  72         and     %o5, -32, %o5           C align at 0 (mod 32) in scratch area
  73
  74         subcc   %o2, 1, %o2             C n--; zero means a single limb (n = 0 is not checked)
  75         ld      [%o1], %f11             C read up[i]
  76         add     %o1, 4, %o1             C up++
  77         bne,pt  %icc, .L_two_or_more    C more limbs to read
  78         fxtod   %f10, %f2               C (delay slot) f2 = up[i] as a double
  79
  80         fmuld   %f2, %f8, %f16          C f16 = u * upper half of v
  81         fmuld   %f2, %f6, %f4           C f4 = u * lower half of v
  82         fdtox   %f16, %f14              C f14 = p16 as a 64-bit integer
  83         fdtox   %f4, %f12               C f12 = p0 as a 64-bit integer
  84         std     %f14, [%o5+16]          C move p16 to the integer side via memory
  85         std     %f12, [%o5+24]          C move p0 to the integer side via memory
  86         ldx     [%o5+16], %g2           C p16
  87         ldx     [%o5+24], %g1           C p0
  88         b       .L1                     C n = 1: only the last wind-down stage is needed
  89         add     %o0, -16, %o0           C (delay slot) bias rp; .L1 stores at rp+16
  90
  91         .align  16
  92 .L_two_or_more:                         C reached when n >= 2
  93         subcc   %o2, 1, %o2             C n--
  94         ld      [%o1], %f11             C read up[i]
  95         fmuld   %f2, %f8, %f16          C start both products for the previous limb
  96         fmuld   %f2, %f6, %f4
  97         add     %o1, 4, %o1             C up++
  98         bne,pt  %icc, .L_three_or_more  C more limbs to read
  99         fxtod   %f10, %f2               C (delay slot) f2 = up[i] as a double
 100
 101         fdtox   %f16, %f14              C n = 2: convert and store the first pair
 102         fdtox   %f4, %f12
 103         std     %f14, [%o5+16]
 104         fmuld   %f2, %f8, %f16          C products for the last limb
 105         std     %f12, [%o5+24]
 106         fmuld   %f2, %f6, %f4
 107         fdtox   %f16, %f14
 108         fdtox   %f4, %f12
 109         std     %f14, [%o5+0]           C second pair goes to the other scratch slot
 110         std     %f12, [%o5+8]
 111         ldx     [%o5+16], %g2           C p16
 112         ldx     [%o5+24], %g1           C p0
 113         b       .L2                     C two limbs left to combine and store
 114         add     %o0, -12, %o0           C (delay slot) bias rp; .L2 stores at rp+12
 115
 116         .align  16
 117 .L_three_or_more:                       C reached when n >= 3
 118         subcc   %o2, 1, %o2             C n--
 119         ld      [%o1], %f11             C read up[i]
 120         fdtox   %f16, %f14
 121         fdtox   %f4, %f12
 122         std     %f14, [%o5+16]
 123         fmuld   %f2, %f8, %f16
 124         std     %f12, [%o5+24]
 125         fmuld   %f2, %f6, %f4
 126         add     %o1, 4, %o1             C up++
 127         bne,pt  %icc, .L_four_or_more   C more limbs to read
 128         fxtod   %f10, %f2               C (delay slot) f2 = up[i] as a double
 129
 130         fdtox   %f16, %f14              C n = 3: drain the remaining products
 131         fdtox   %f4, %f12
 132         std     %f14, [%o5+0]
 133         fmuld   %f2, %f8, %f16          C products for the last limb
 134         std     %f12, [%o5+8]
 135         fmuld   %f2, %f6, %f4
 136         fdtox   %f16, %f14
 137         ldx     [%o5+16], %g2           C p16
 138         fdtox   %f4, %f12
 139         ldx     [%o5+24], %g1           C p0
 140         std     %f14, [%o5+16]          C reuse the slot that was just loaded
 141         std     %f12, [%o5+24]
 142         b       .L3                     C three limbs left to combine and store
 143         add     %o0, -8, %o0            C (delay slot) bias rp; .L3 stores at rp+8
 144
 145         .align  16
 146 .L_four_or_more:                        C reached when n >= 4
 147         subcc   %o2, 1, %o2             C n--
 148         ld      [%o1], %f11             C read up[i]
 149         fdtox   %f16, %f14
 150         fdtox   %f4, %f12
 151         std     %f14, [%o5+0]
 152         fmuld   %f2, %f8, %f16
 153         std     %f12, [%o5+8]
 154         fmuld   %f2, %f6, %f4
 155         add     %o1, 4, %o1             C up++
 156         bne,pt  %icc, .L_five_or_more   C more limbs to read
 157         fxtod   %f10, %f2               C (delay slot) f2 = up[i] as a double
 158
 159         fdtox   %f16, %f14              C n = 4: drain the remaining products
 160         ldx     [%o5+16], %g2           C p16
 161         fdtox   %f4, %f12
 162         ldx     [%o5+24], %g1           C p0
 163         std     %f14, [%o5+16]          C reuse the slot that was just loaded
 164         fmuld   %f2, %f8, %f16          C products for the last limb
 165         std     %f12, [%o5+24]
 166         fmuld   %f2, %f6, %f4
 167         add     %o1, 4, %o1             C up++ (up is not read again on this path)
 168         b       .L4                     C four limbs left to combine and store
 169         add     %o0, -4, %o0            C (delay slot) bias rp; .L4 stores at rp+4
 170
 171         .align  16
 172 .L_five_or_more:                        C reached when n >= 5
 173         subcc   %o2, 1, %o2             C n--
 174         ld      [%o1], %f11             C read up[i]
 175         fdtox   %f16, %f14
 176         ldx     [%o5+16], %g2           C p16
 177         fdtox   %f4, %f12
 178         ldx     [%o5+24], %g1           C p0
 179         std     %f14, [%o5+16]          C reuse the slot that was just loaded
 180         fmuld   %f2, %f8, %f16
 181         std     %f12, [%o5+24]
 182         fmuld   %f2, %f6, %f4
 183         add     %o1, 4, %o1             C up++
 184         bne,pt  %icc, .Loop             C n >= 6: enter the steady-state loop
 185         fxtod   %f10, %f2               C (delay slot) f2 = up[i] as a double
 186         b,a     .L5                     C n = 5: skip the loop; .L5 stores at rp+0
 187
 188 C BEGIN MAIN LOOP
 189         .align 16
 190 C -- 0
 191 .Loop:  nop
 192         subcc   %o2, 1, %o2             C n--; tested by the bne at the loop bottom
 193         ld      [%o1], %f11             C read up[i]
 194         fdtox   %f16, %f14              C convert the pending p16 product to integer
 195 C -- 1
 196         sllx    %g2, 16, %g4            C (p16 << 16)
 197         add     %o0, 4, %o0             C rp++
 198         ldx     [%o5+0], %g2            C p16
 199         fdtox   %f4, %f12               C convert the pending p0 product to integer
 200 C -- 2
 201         nop
 202         add     %g1, %g4, %g4           C p = p0 + (p16 << 16)
 203         ldx     [%o5+8], %g1            C p0
 204         fanop
 205 C -- 3
 206         nop
 207         add     %g3, %g4, %g4           C p += cy
 208         std     %f14, [%o5+0]           C refill the slot just loaded with the new p16
 209         fmuld   %f2, %f8, %f16          C next p16 product
 210 C -- 4
 211         srlx    %g4, 32, %g3            C new cy
 212         add     %o1, 4, %o1             C up++
 213         std     %f12, [%o5+8]           C refill the slot just loaded with the new p0
 214         fmuld   %f2, %f6, %f4           C next p0 product
 215 C -- 5
 216         xor     %o5, 16, %o5            C alternate scratch variables
 217         stw     %g4, [%o0-4]            C store low 32 bits of p (rp was already advanced)
 218         bne,pt  %icc, .Loop             C loop while limbs remain to be read
 219         fxtod   %f10, %f2               C (delay slot) f2 = up[i] as a double
 220 C END MAIN LOOP
 221
 222 .L5:    fdtox   %f16, %f14              C wind-down: no more limbs are read from here on
 223         sllx    %g2, 16, %g4            C (p16 << 16)
 224         ldx     [%o5+0], %g2            C p16
 225         fdtox   %f4, %f12
 226         add     %g1, %g4, %g4           C p = p0 + (p16 << 16)
 227         ldx     [%o5+8], %g1            C p0
 228         add     %g4, %g3, %g4           C p += cy
 229         std     %f14, [%o5+0]
 230         fmuld   %f2, %f8, %f16          C products for the last limb
 231         std     %f12, [%o5+8]
 232         fmuld   %f2, %f6, %f4
 233         xor     %o5, 16, %o5            C switch to the other scratch slot
 234         stw     %g4, [%o0+0]            C store low 32 bits of p
 235         srlx    %g4, 32, %g3            C new cy
 236
 237 .L4:    fdtox   %f16, %f14              C convert the last pair of products
 238         sllx    %g2, 16, %g4            C (p16 << 16)
 239         ldx     [%o5+0], %g2            C p16
 240         fdtox   %f4, %f12
 241         add     %g1, %g4, %g4           C p = p0 + (p16 << 16)
 242         ldx     [%o5+8], %g1            C p0
 243         add     %g3, %g4, %g4           C p += cy
 244         std     %f14, [%o5+0]
 245         std     %f12, [%o5+8]
 246         xor     %o5, 16, %o5            C switch to the other scratch slot
 247         stw     %g4, [%o0+4]            C store low 32 bits of p
 248         srlx    %g4, 32, %g3            C new cy
 249
 250 .L3:    sllx    %g2, 16, %g4            C (p16 << 16)
 251         ldx     [%o5+0], %g2            C p16
 252         add     %g1, %g4, %g4           C p = p0 + (p16 << 16)
 253         ldx     [%o5+8], %g1            C p0
 254         add     %g3, %g4, %g4           C p += cy
 255         xor     %o5, 16, %o5            C switch to the other scratch slot
 256         stw     %g4, [%o0+8]            C store low 32 bits of p
 257         srlx    %g4, 32, %g3            C new cy
 258
 259 .L2:    sllx    %g2, 16, %g4            C (p16 << 16)
 260         ldx     [%o5+0], %g2            C p16
 261         add     %g1, %g4, %g4           C p = p0 + (p16 << 16)
 262         ldx     [%o5+8], %g1            C p0
 263         add     %g3, %g4, %g4           C p += cy
 264         stw     %g4, [%o0+12]           C store low 32 bits of p
 265         srlx    %g4, 32, %g3            C new cy
 266
 267 .L1:    sllx    %g2, 16, %g4            C (p16 << 16)
 268         add     %g1, %g4, %g4           C p = p0 + (p16 << 16)
 269         add     %g3, %g4, %g4           C p += cy
 270         stw     %g4, [%o0+16]           C store low 32 bits of the last p
 271         srlx    %g4, 32, %g3            C new cy
 272
 273         mov     %g3, %o0                C return value = final cy
 274         retl
 275         sub     %sp, -FSIZE, %sp        C (delay slot) release the scratch frame
 276 EPILOGUE(mpn_mul_1)