1 dnl Alpha mpn_modexact_1c_odd -- mpn exact remainder
3 dnl Copyright 2003, 2004 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or
8 dnl modify it under the terms of the GNU Lesser General Public License as
9 dnl published by the Free Software Foundation; either version 3 of the
10 dnl License, or (at your option) any later version.
12 dnl The GNU MP Library is distributed in the hope that it will be useful,
13 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 dnl Lesser General Public License for more details.
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20 include(`../config.m4')
29 C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d,
30 C                                mp_limb_t c)
32 C This code follows the "alternate" code in mpn/generic/mode1o.c,
33 C eliminating cbit+climb from the dependent chain. This leaves,
36 C 1 3 1 subq y = x - h
37 C 23 13 7 mulq q = y * inverse
38 C 23 14 7 umulh h = high (q * d)
42 C In each case, the load latency, loop control, and extra carry bit handling
43 C hide under the multiply latencies. Those latencies are long enough that
44 C we don't need to worry about alignment or pairing to squeeze out
45 C performance.
47 C For the first limb, some of the loop code is broken out and scheduled back
48 C since it can be done earlier.
50 C - The first ldq src[0] is near the start of the routine, for maximum
51 C   headstart on its latency.
53 C - The subq y=x-climb can be done without waiting for the inverse.
55 C - The mulq y*inverse is replicated after the final subq for the inverse,
56 C instead of branching to the mulq in the main loop. On ev4 a branch
57 C there would cost cycles, but we can hide them under the mulq latency.
59 C For the last limb, high<divisor is tested and if that's true a subtract
60 C and addback is done, as per the main mpn/generic/mode1o.c code. This is a
61 C data-dependent branch, but we're waiting for umulh so any penalty should
62 C hide there. The multiplies saved would be worth the cost anyway.
66 C For size==1, a plain division (done bitwise say) might be faster than
67 C calculating an inverse, the latter taking about 130 cycles on ev4 or 70 on
68 C ev5. A call to gcc __remqu might be a possibility.
71 PROLOGUE(mpn_modexact_1c_odd,gp)
78 LEA(r0, binvert_limb_table)
79 srl r18, 1, r20 C d >> 1
81 and r20, 127, r20 C idx = d>>1 & 0x7F
83 addq r0, r20, r21 C table + idx
85 ifelse(bwx_available_p,1,
86 ` ldbu r20, 0(r21) C table[idx], inverse 8 bits
88 ldq_u r20, 0(r21) C table[idx] qword
89 extbl r20, r21, r20 C table[idx], inverse 8 bits
92 mull r20, r20, r7 C i*i
93 addq r20, r20, r20 C 2*i
95 ldq r2, 0(r16) C x = s = src[0]
96 lda r17, -1(r17) C size--
97 clr r0 C initial cbit=0
99 mull r7, r18, r7 C i*i*d
101 subq r20, r7, r20 C 2*i-i*i*d, inverse 16 bits
103 mull r20, r20, r7 C i*i
104 addq r20, r20, r20 C 2*i
106 mull r7, r18, r7 C i*i*d
108 subq r20, r7, r20 C 2*i-i*i*d, inverse 32 bits
110 mulq r20, r20, r7 C i*i
111 addq r20, r20, r20 C 2*i
113 mulq r7, r18, r7 C i*i*d
114 subq r2, r19, r3 C y = x - climb
116 subq r20, r7, r20 C inv = 2*i-i*i*d, inverse 64 bits
118 ASSERT(r7, C should have d*inv==1 mod 2^64
122 mulq r3, r20, r4 C first q = y * inv
124 beq r17, L(one) C if size==1
130 C r16 src, incrementing
131 C r17 size, decrementing
136 ldq r1, 0(r16) C s = src[i]
137 subq r1, r0, r2 C x = s - cbit
138 cmpult r1, r0, r0 C new cbit = s < cbit
140 subq r2, r19, r3 C y = x - climb
142 mulq r3, r20, r4 C q = y * inv
144 cmpult r2, r19, r5 C cbit2 = x < climb
145 addq r5, r0, r0 C cbit += cbit2
146 lda r16, 8(r16) C src++
147 lda r17, -1(r17) C size--
149 umulh r4, r18, r19 C climb = q * d
150 bne r17, L(top) C while 2 or more limbs left
159 ldq r1, 0(r16) C s = src[size-1] high limb
161 cmpult r1, r18, r2 C test high<divisor
162 bne r2, L(skip) C skip if so
164 C can't skip a division, repeat loop code
166 subq r1, r0, r2 C x = s - cbit
167 cmpult r1, r0, r0 C new cbit = s < cbit
169 subq r2, r19, r3 C y = x - climb
171 mulq r3, r20, r4 C q = y * inv
173 cmpult r2, r19, r5 C cbit2 = x < climb
174 addq r5, r0, r0 C cbit += cbit2
176 umulh r4, r18, r19 C climb = q * d
178 addq r19, r0, r0 C return climb + cbit
184 C with high<divisor, the final step can be just (cbit+climb)-s and
185 C an addback of d if that underflows
187 addq r19, r0, r19 C c = climb + cbit
189 subq r19, r1, r2 C c - s
190 cmpult r19, r1, r3 C c < s
192 addq r2, r18, r0 C return c-s + divisor
194 cmoveq r3, r2, r0 C return c-s if no underflow