1 dnl Itanium-2 mpn_modexact_1c_odd -- mpn by 1 exact remainder.
3 dnl Contributed to the GNU project by Kevin Ryde.
5 dnl Copyright 2003, 2004, 2005 Free Software Foundation, Inc.
7 dnl This file is part of the GNU MP Library.
9 dnl The GNU MP Library is free software; you can redistribute it and/or
10 dnl modify it under the terms of the GNU Lesser General Public License as
11 dnl published by the Free Software Foundation; either version 3 of the
12 dnl License, or (at your option) any later version.
14 dnl The GNU MP Library is distributed in the hope that it will be useful,
15 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
16 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 dnl Lesser General Public License for more details.
19 dnl You should have received a copy of the GNU Lesser General Public License
20 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
22 include(`../config.m4')
30 dnl Usage: ABI32(`code')
32 dnl Emit the given code only under HAVE_ABI_32.
36 `ifdef(`HAVE_ABI_32',`$1')')
39 C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
40 C mp_limb_t divisor, mp_limb_t carry);
42 C The modexact algorithm is usually conceived as a dependent chain
45 C q = low(l * inverse)
46 C c = high(q*divisor) + (src[i]<c)
48 C but we can work the src[i]-c into an xma by calculating si=src[i]*inverse
49 C separately (off the dependent chain) and using
51 C q = low(c * inverse + si)
52 C c = high(q*divisor + c)
54 C This means the dependent chain is simply xma.l followed by xma.hu, for a
55 C total 8 cycles/limb on itanium-2.
57 C The reason xma.hu works for the new c is that the low of q*divisor is
58 C src[i]-c (being the whole purpose of the q generated, and it can be
59 C verified algebraically). If there was an underflow from src[i]-c, then
60 C there will be an overflow from (src[i]-c)+c, thereby adding 1 to the new c
61 C the same as the borrow bit (src[i]<c) gives in the first style shown.
63 C Incidentally, fcmp is not an option for treating src[i]-c, since it
64 C apparently traps to the kernel for unnormalized operands like those used
65 C and generated by ldf8 and xma. On one GNU/Linux system it took about 1200
71 C The first limb uses q = (src[0]-c) * inverse shown in the first style.
72 C This lets us get the first q as soon as the inverse is ready, without
73 C going through si=s*inverse. Basically at the start we have c and can use
74 C it while waiting for the inverse, whereas for the second and subsequent
75 C limbs it's the other way around, ie. we have the inverse and are waiting
78 C At .Lentry the first two instructions in the loop have been done already.
79 C The load of f11=src[1] at the start (predicated on size>=2), and the
80 C calculation of q by the initial different scheme.
85 C In the entry sequence, the critical path is the calculation of the
86 C inverse, so this is begun first and optimized. Apart from that, ar.lc is
87 C established nice and early so the br.cloop's should predict perfectly.
88 C And the load for the low limbs src[0] and src[1] can be initiated long
89 C ahead of where they're needed.
92 C Inverse Calculation:
94 C The initial 8-bit inverse is calculated using a table lookup. If it hits
95 C L1 (which is likely if we're called several times) then it should take a
96 C total 4 cycles, otherwise hopefully L2 for 9 cycles. This is considered
97 C the best approach, on balance. It could be done bitwise, but that would
98 C probably be about 14 cycles (2 per bit beyond the first couple). Or it
99 C could be taken from 4 bits to 8 with xmpy doubling as used beyond 8 bits,
100 C but that would be about 11 cycles.
102 C The table is not the same as binvert_limb_table, instead it's 256 bytes,
103 C designed to be indexed by the low byte of the divisor. The divisor is
104 C always odd, so the relevant data is every second byte in the table. The
105 C padding lets us use zxt1 instead of extr.u, the latter would cost an extra
106 C cycle because it must go down I0, and we're using the first I0 slot to get
107 C ip. The extra 128 bytes of padding should be insignificant compared to
108 C typical ia64 code bloat.
110 C Having the table in .text allows us to use IP-relative addressing,
111 C avoiding a fetch from ltoff.  .rodata is apparently not suitable for
112 C IP-relative use; it gets a linker relocation overflow on GNU/Linux.
117 C In the main loop, the data loads are scheduled for an L2 hit, which means
118 C 6 cycles for the data ready to use. In fact we end up 7 cycles ahead. In
119 C any case that scheduling is achieved simply by doing the load (and xmpy.l
120 C for "si") in the immediately preceding iteration.
122 C The main loop requires size >= 2, and we handle size==1 by an initial
123 C br.cloop to enter the loop only if size>1. Since ar.lc is established
124 C early, this should predict perfectly.
129 C Consideration was given to using a plain "(src[0]-c) % divisor" for
130 C size==1, but cycle counting suggests about 50 for the sort of approach
131 C taken by gcc __umodsi3, versus about 47 for the modexact. (Both assuming
132 C L1 hits for their respective fetching.)
134 C Consideration was given to a test for high<divisor and replacing the last
135 C loop iteration with instead c-=src[size-1] followed by c+=d if underflow.
136 C Branching on high<divisor wouldn't be good since a mispredict would cost
137 C more than the loop iteration saved, and the condition is of course data
138 C dependent. So the theory would be to shorten the loop count if
139 C high<divisor, and predicate extra operations at the end. That would mean
140 C a gain of 6 when high<divisor, or a cost of 2 if not.
142 C Whether such a tradeoff is a win on average depends on assumptions about
143 C how many bits in the high and the divisor. If both are uniformly
144 C distributed then high<divisor about 50% of the time. But smallish
145 C divisors (less chance of high<divisor) might be more likely from
146 C applications (mpz_divisible_ui, mpz_gcd_ui, etc). Though biggish divisors
147 C would be normal internally from say mpn/generic/perfsqr.c. On balance,
148 C for the moment, it's felt the gain is not really enough to be worth the
154 C Process two source limbs per iteration using a two-limb inverse and a
157 C ql = low (c * il + sil) quotient low limb
158 C qlc = high(c * il + sil)
159 C qh1 = low (c * ih + sih) quotient high, partial
161 C cl = high (ql * d + c) carry out of low
162 C qh = low (qlc * 1 + qh1) quotient high limb
164 C new c = high (qh * d + cl) carry out of high
166 C This would be 13 cycles/iteration, giving 6.5 cycles/limb. The two limb
167 C s*inverse as sih:sil = sh:sl * ih:il would be calculated off the dependent
168 C chain with 4 multiplies. The bigger inverse would take extra time to
169 C calculate, but a one limb iteration to handle an odd size could be done as
170 C soon as 64-bits of inverse were ready.
172 C Perhaps this could even extend to a 3 limb inverse, which might promise 17
173 C or 18 cycles for 3 limbs, giving 5.66 or 6.0 cycles/limb.
C Table of 8-bit binary inverses, indexed by the low byte of the (odd)
C divisor: the byte at odd index b is b^-1 mod 2^8, i.e. table[b]*b == 1
C mod 0x100 for every odd b (e.g. index 3 -> 0xAB, 3*0xAB = 0x201).  The
C zero bytes at even indexes are the padding described above, letting a
C plain zxt1 of the divisor low byte index the table directly.
182 data1 0,0x01, 0,0xAB, 0,0xCD, 0,0xB7, 0,0x39, 0,0xA3, 0,0xC5, 0,0xEF
183 data1 0,0xF1, 0,0x1B, 0,0x3D, 0,0xA7, 0,0x29, 0,0x13, 0,0x35, 0,0xDF
184 data1 0,0xE1, 0,0x8B, 0,0xAD, 0,0x97, 0,0x19, 0,0x83, 0,0xA5, 0,0xCF
185 data1 0,0xD1, 0,0xFB, 0,0x1D, 0,0x87, 0,0x09, 0,0xF3, 0,0x15, 0,0xBF
186 data1 0,0xC1, 0,0x6B, 0,0x8D, 0,0x77, 0,0xF9, 0,0x63, 0,0x85, 0,0xAF
187 data1 0,0xB1, 0,0xDB, 0,0xFD, 0,0x67, 0,0xE9, 0,0xD3, 0,0xF5, 0,0x9F
188 data1 0,0xA1, 0,0x4B, 0,0x6D, 0,0x57, 0,0xD9, 0,0x43, 0,0x65, 0,0x8F
189 data1 0,0x91, 0,0xBB, 0,0xDD, 0,0x47, 0,0xC9, 0,0xB3, 0,0xD5, 0,0x7F
190 data1 0,0x81, 0,0x2B, 0,0x4D, 0,0x37, 0,0xB9, 0,0x23, 0,0x45, 0,0x6F
191 data1 0,0x71, 0,0x9B, 0,0xBD, 0,0x27, 0,0xA9, 0,0x93, 0,0xB5, 0,0x5F
192 data1 0,0x61, 0,0x0B, 0,0x2D, 0,0x17, 0,0x99, 0,0x03, 0,0x25, 0,0x4F
193 data1 0,0x51, 0,0x7B, 0,0x9D, 0,0x07, 0,0x89, 0,0x73, 0,0x95, 0,0x3F
194 data1 0,0x41, 0,0xEB, 0,0x0D, 0,0xF7, 0,0x79, 0,0xE3, 0,0x05, 0,0x2F
195 data1 0,0x31, 0,0x5B, 0,0x7D, 0,0xE7, 0,0x69, 0,0x53, 0,0x75, 0,0x1F
196 data1 0,0x21, 0,0xCB, 0,0xED, 0,0xD7, 0,0x59, 0,0xC3, 0,0xE5, 0,0x0F
197 data1 0,0x11, 0,0x3B, 0,0x5D, 0,0xC7, 0,0x49, 0,0x33, 0,0x55, 0,0xFF
200 PROLOGUE(mpn_modexact_1c_odd)
209 { .mmi; add r33 = -1, r33 C M0 size-1
211 mov r15 = ip C I0 .Lhere
212 }{.mmi; setf.sig f6 = r34 C M2 divisor
213 setf.sig f9 = r35 C M3 carry
214 zxt1 r3 = r34 C I1 divisor low byte
217 { .mmi; add r3 = .Ltable-.Lhere, r3 C M0 table offset ip and index
218 sub r16 = 0, r34 C M1 -divisor
222 setf.sig f13 = r14 C M2 2 in significand
224 ABI32(` zxt4 r33 = r33') C I1 size extend
227 { .mmi; add r3 = r3, r15 C M0 table entry address
228 ABI32(` addp4 r32 = 0, r32') C M1 src extend
229 mov ar.lc = r33 C I0 size-1 loop count
230 }{.mmi; setf.sig f12 = r16 C M2 -divisor
231 setf.sig f8 = r17 C M3 -1
234 { .mmi; ld1 r3 = [r3] C M0 inverse, 8 bits
235 ldf8 f10 = [r32], 8 C M1 src[0]
236 cmp.ne p6,p0 = 0, r33 C I0 test size!=1
239 C Wait for table load.
240 C Hope for an L1 hit of 1 cycle to ALU, but could be more.
241 setf.sig f7 = r3 C M2 inverse, 8 bits
242 (p6) ldf8 f11 = [r32], 8 C M1 src[1], if size!=1
248 C f7 inverse, being calculated
249 C f8 -1, will be -inverse
257 xmpy.l f14 = f13, f7 C 2*i
258 xmpy.l f7 = f7, f7 C i*i
260 xma.l f7 = f7, f12, f14 C i*i*-d + 2*i, inverse 16 bits
263 xmpy.l f14 = f13, f7 C 2*i
264 xmpy.l f7 = f7, f7 C i*i
266 xma.l f7 = f7, f12, f14 C i*i*-d + 2*i, inverse 32 bits
269 xmpy.l f14 = f13, f7 C 2*i
270 xmpy.l f7 = f7, f7 C i*i
273 xma.l f7 = f7, f12, f14 C i*i*-d + 2*i, inverse 64 bits
274 xma.l f10 = f9, f8, f10 C sc = c * -1 + src[0]
277 xmpy.l f15 = f6, f7 ;; C divisor*inverse
278 getf.sig r31 = f15 ;;
279 cmp.eq p6,p0 = 1, r31 C should == 1
282 xmpy.l f10 = f10, f7 C q = sc * inverse
283 xmpy.l f8 = f7, f8 C -inverse = inverse * -1
284 br.cloop.sptk.few.clr .Lentry C main loop, if size > 1
287 C size==1, finish up now
288 xma.hu f9 = f10, f6, f9 C c = high(q * divisor + c)
291 getf.sig r8 = f9 C M2 return c
302 C f10 src[i] * inverse
303 C f11 scratch src[i+1]
306 ldf8 f11 = [r32], 8 C src[i+1]
311 xma.l f10 = f9, f8, f10 C q = c * -inverse + si
316 xma.hu f9 = f10, f6, f9 C c = high(q * divisor + c)
317 xmpy.l f10 = f11, f7 C si = src[i] * inverse
318 br.cloop.sptk.few.clr .Ltop
323 xma.l f10 = f9, f8, f10 C q = c * -inverse + si
326 xma.hu f9 = f10, f6, f9 C c = high(q * divisor + c)
328 getf.sig r8 = f9 C M2 return c