1 dnl Itanium-2 mpn_modexact_1c_odd -- mpn by 1 exact remainder.
3 dnl Contributed to the GNU project by Kevin Ryde.
5 dnl Copyright 2003, 2004, 2005 Free Software Foundation, Inc.
7 dnl This file is part of the GNU MP Library.
9 dnl The GNU MP Library is free software; you can redistribute it and/or
10 dnl modify it under the terms of the GNU Lesser General Public License as
11 dnl published by the Free Software Foundation; either version 3 of the
12 dnl License, or (at your option) any later version.
14 dnl The GNU MP Library is distributed in the hope that it will be useful,
15 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
16 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 dnl Lesser General Public License for more details.
19 dnl You should have received a copy of the GNU Lesser General Public License
20 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
22 include(`../config.m4')
30 dnl Usage: ABI32(`code')
32 dnl Emit the given code only under HAVE_ABI_32.
36 `ifdef(`HAVE_ABI_32',`$1')')
39 C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
40 C mp_limb_t divisor, mp_limb_t carry);
42 C The modexact algorithm is usually conceived as a dependent chain
45 C q = low(l * inverse)
46 C c = high(q*divisor) + (src[i]<c)
48 C but we can work the src[i]-c into an xma by calculating si=src[i]*inverse
49 C separately (off the dependent chain) and using
51 C q = low(c * inverse + si)
52 C c = high(q*divisor + c)
54 C This means the dependent chain is simply xma.l followed by xma.hu, for a
55 C total 8 cycles/limb on itanium-2.
57 C The reason xma.hu works for the new c is that the low of q*divisor is
58 C src[i]-c (being the whole purpose of the q generated, and it can be
59 C verified algebraically). If there was an underflow from src[i]-c, then
60 C there will be an overflow from (src[i]-c)+c, thereby adding 1 to the new c
61 C the same as the borrow bit (src[i]<c) gives in the first style shown.
63 C Incidentally, fcmp is not an option for treating src[i]-c, since it
64 C apparently traps to the kernel for unnormalized operands like those used
65 C and generated by ldf8 and xma. On one GNU/Linux system it took about 1200
71 C The first limb uses q = (src[0]-c) * inverse shown in the first style.
72 C This lets us get the first q as soon as the inverse is ready, without
73 C going through si=s*inverse. Basically at the start we have c and can use
74 C it while waiting for the inverse, whereas for the second and subsequent
75 C limbs it's the other way around, ie. we have the inverse and are waiting
78 C At .Lentry the first two instructions in the loop have been done already.
79 C The load of f11=src[1] at the start (predicated on size>=2), and the
80 C calculation of q by the initial different scheme.
85 C In the entry sequence, the critical path is the calculation of the
86 C inverse, so this is begun first and optimized. Apart from that, ar.lc is
87 C established nice and early so the br.cloop's should predict perfectly.
88 C And the load for the low limbs src[0] and src[1] can be initiated long
89 C ahead of where they're needed.
92 C Inverse Calculation:
94 C The initial 8-bit inverse is calculated using a table lookup. If it hits
95 C L1 (which is likely if we're called several times) then it should take a
96 C total 4 cycles, otherwise hopefully L2 for 9 cycles. This is considered
97 C the best approach, on balance. It could be done bitwise, but that would
98 C probably be about 14 cycles (2 per bit beyond the first couple). Or it
99 C could be taken from 4 bits to 8 with xmpy doubling as used beyond 8 bits,
100 C but that would be about 11 cycles.
102 C The table is not the same as binvert_limb_table, instead it's 256 bytes,
103 C designed to be indexed by the low byte of the divisor. The divisor is
104 C always odd, so the relevant data is every second byte in the table. The
105 C padding lets us use zxt1 instead of extr.u, the latter would cost an extra
106 C cycle because it must go down I0, and we're using the first I0 slot to get
107 C ip. The extra 128 bytes of padding should be insignificant compared to
108 C typical ia64 code bloat.
110 C Having the table in .text allows us to use IP-relative addressing,
111 C avoiding a fetch from ltoff.  .rodata is apparently not suitable for
112 C IP-relative use; it gets a linker relocation overflow on GNU/Linux.
117 C In the main loop, the data loads are scheduled for an L2 hit, which means
118 C 6 cycles for the data ready to use. In fact we end up 7 cycles ahead. In
119 C any case that scheduling is achieved simply by doing the load (and xmpy.l
120 C for "si") in the immediately preceding iteration.
122 C The main loop requires size >= 2, and we handle size==1 by an initial
123 C br.cloop to enter the loop only if size>1. Since ar.lc is established
124 C early, this should predict perfectly.
129 C Consideration was given to using a plain "(src[0]-c) % divisor" for
130 C size==1, but cycle counting suggests about 50 for the sort of approach
131 C taken by gcc __umodsi3, versus about 47 for the modexact. (Both assuming
132 C L1 hits for their respective fetching.)
134 C Consideration was given to a test for high<divisor and replacing the last
135 C loop iteration with instead c-=src[size-1] followed by c+=d if underflow.
136 C Branching on high<divisor wouldn't be good since a mispredict would cost
137 C more than the loop iteration saved, and the condition is of course data
138 C dependent. So the theory would be to shorten the loop count if
139 C high<divisor, and predicate extra operations at the end. That would mean
140 C a gain of 6 when high<divisor, or a cost of 2 if not.
142 C Whether such a tradeoff is a win on average depends on assumptions about
143 C how many bits in the high and the divisor. If both are uniformly
144 C distributed then high<divisor about 50% of the time. But smallish
145 C divisors (less chance of high<divisor) might be more likely from
146 C applications (mpz_divisible_ui, mpz_gcd_ui, etc). Though biggish divisors
147 C would be normal internally from say mpn/generic/perfsqr.c. On balance,
148 C for the moment, it's felt the gain is not really enough to be worth the
154 C Process two source limbs per iteration using a two-limb inverse and a
157 C ql = low (c * il + sil) quotient low limb
158 C qlc = high(c * il + sil)
159 C qh1 = low (c * ih + sih) quotient high, partial
161 C cl = high (ql * d + c) carry out of low
162 C qh = low (qlc * 1 + qh1) quotient high limb
164 C new c = high (qh * d + cl) carry out of high
166 C This would be 13 cycles/iteration, giving 6.5 cycles/limb. The two limb
167 C s*inverse as sih:sil = sh:sl * ih:il would be calculated off the dependent
168 C chain with 4 multiplies. The bigger inverse would take extra time to
169 C calculate, but a one limb iteration to handle an odd size could be done as
170 C soon as 64-bits of inverse were ready.
172 C Perhaps this could even extend to a 3 limb inverse, which might promise 17
173 C or 18 cycles for 3 limbs, giving 5.66 or 6.0 cycles/limb.
C Table of 8-bit binary inverses, indexed by the low byte of the (odd)
C divisor: the byte at odd index b is b^-1 mod 2^8, i.e. table[b]*b == 1
C mod 0x100 for every odd b (e.g. index 3 -> 0xAB, 3*0xAB = 0x201).  The
C zero bytes at even indexes are the padding described above, letting a
C plain zxt1 of the divisor low byte index the table directly.
182 data1 0,0x01, 0,0xAB, 0,0xCD, 0,0xB7, 0,0x39, 0,0xA3, 0,0xC5, 0,0xEF
183 data1 0,0xF1, 0,0x1B, 0,0x3D, 0,0xA7, 0,0x29, 0,0x13, 0,0x35, 0,0xDF
184 data1 0,0xE1, 0,0x8B, 0,0xAD, 0,0x97, 0,0x19, 0,0x83, 0,0xA5, 0,0xCF
185 data1 0,0xD1, 0,0xFB, 0,0x1D, 0,0x87, 0,0x09, 0,0xF3, 0,0x15, 0,0xBF
186 data1 0,0xC1, 0,0x6B, 0,0x8D, 0,0x77, 0,0xF9, 0,0x63, 0,0x85, 0,0xAF
187 data1 0,0xB1, 0,0xDB, 0,0xFD, 0,0x67, 0,0xE9, 0,0xD3, 0,0xF5, 0,0x9F
188 data1 0,0xA1, 0,0x4B, 0,0x6D, 0,0x57, 0,0xD9, 0,0x43, 0,0x65, 0,0x8F
189 data1 0,0x91, 0,0xBB, 0,0xDD, 0,0x47, 0,0xC9, 0,0xB3, 0,0xD5, 0,0x7F
190 data1 0,0x81, 0,0x2B, 0,0x4D, 0,0x37, 0,0xB9, 0,0x23, 0,0x45, 0,0x6F
191 data1 0,0x71, 0,0x9B, 0,0xBD, 0,0x27, 0,0xA9, 0,0x93, 0,0xB5, 0,0x5F
192 data1 0,0x61, 0,0x0B, 0,0x2D, 0,0x17, 0,0x99, 0,0x03, 0,0x25, 0,0x4F
193 data1 0,0x51, 0,0x7B, 0,0x9D, 0,0x07, 0,0x89, 0,0x73, 0,0x95, 0,0x3F
194 data1 0,0x41, 0,0xEB, 0,0x0D, 0,0xF7, 0,0x79, 0,0xE3, 0,0x05, 0,0x2F
195 data1 0,0x31, 0,0x5B, 0,0x7D, 0,0xE7, 0,0x69, 0,0x53, 0,0x75, 0,0x1F
196 data1 0,0x21, 0,0xCB, 0,0xED, 0,0xD7, 0,0x59, 0,0xC3, 0,0xE5, 0,0x0F
197 data1 0,0x11, 0,0x3B, 0,0x5D, 0,0xC7, 0,0x49, 0,0x33, 0,0x55, 0,0xFF
200 PROLOGUE(mpn_modexact_1c_odd)
209 { .mmi; add r33 = -1, r33 C M0 size-1
211 mov r15 = ip C I0 .Lhere
212 }{.mmi; setf.sig f6 = r34 C M2 divisor
213 setf.sig f9 = r35 C M3 carry
214 zxt1 r3 = r34 C I1 divisor low byte
217 { .mmi; add r3 = .Ltable-.Lhere, r3 C M0 table offset ip and index
218 sub r16 = 0, r34 C M1 -divisor
222 setf.sig f13 = r14 C M2 2 in significand
224 ABI32(` zxt4 r33 = r33') C I1 size extend
227 { .mmi; add r3 = r3, r15 C M0 table entry address
228 ABI32(` addp4 r32 = 0, r32') C M1 src extend
229 mov ar.lc = r33 C I0 size-1 loop count
230 }{.mmi; setf.sig f12 = r16 C M2 -divisor
231 setf.sig f8 = r17 C M3 -1
234 { .mmi; ld1 r3 = [r3] C M0 inverse, 8 bits
235 ldf8 f10 = [r32], 8 C M1 src[0]
236 cmp.ne p6,p0 = 0, r33 C I0 test size!=1
239 C Wait for table load.
240 C Hope for an L1 hit of 1 cycle to ALU, but could be more.
241 setf.sig f7 = r3 C M2 inverse, 8 bits
242 (p6) ldf8 f11 = [r32], 8 C M1 src[1], if size!=1
248 C f7 inverse, being calculated
249 C f8 -1, will be -inverse
257 xmpy.l f14 = f13, f7 C 2*i
258 xmpy.l f7 = f7, f7 C i*i
260 xma.l f7 = f7, f12, f14 C i*i*-d + 2*i, inverse 16 bits
263 xmpy.l f14 = f13, f7 C 2*i
264 xmpy.l f7 = f7, f7 C i*i
266 xma.l f7 = f7, f12, f14 C i*i*-d + 2*i, inverse 32 bits
269 xmpy.l f14 = f13, f7 C 2*i
270 xmpy.l f7 = f7, f7 C i*i
273 xma.l f7 = f7, f12, f14 C i*i*-d + 2*i, inverse 64 bits
274 xma.l f10 = f9, f8, f10 C sc = c * -1 + src[0]
277 xmpy.l f15 = f6, f7 ;; C divisor*inverse
278 getf.sig r31 = f15 ;;
279 cmp.eq p6,p0 = 1, r31 C should == 1
282 xmpy.l f10 = f10, f7 C q = sc * inverse
283 xmpy.l f8 = f7, f8 C -inverse = inverse * -1
284 br.cloop.sptk.few.clr .Lentry C main loop, if size > 1
287 C size==1, finish up now
288 xma.hu f9 = f10, f6, f9 C c = high(q * divisor + c)
291 getf.sig r8 = f9 C M2 return c
302 C f10 src[i] * inverse
303 C f11 scratch src[i+1]
306 ldf8 f11 = [r32], 8 C src[i+1]
311 xma.l f10 = f9, f8, f10 C q = c * -inverse + si
316 xma.hu f9 = f10, f6, f9 C c = high(q * divisor + c)
317 xmpy.l f10 = f11, f7 C si = src[i] * inverse
318 br.cloop.sptk.few.clr .Ltop
323 xma.l f10 = f9, f8, f10 C q = c * -inverse + si
326 xma.hu f9 = f10, f6, f9 C c = high(q * divisor + c)
328 getf.sig r8 = f9 C M2 return c