1 dnl Itanium-2 mpn_gcd_1 -- mpn by 1 gcd.
3 dnl Copyright 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of the GNU Lesser General Public License as published
9 dnl by the Free Software Foundation; either version 3 of the License, or (at
10 dnl your option) any later version.
12 dnl The GNU MP Library is distributed in the hope that it will be useful, but
13 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
15 dnl License for more details.
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20 include(`../config.m4')
23 C cycles/bitpair (1x1 gcd)
24 C Itanium: 14 (approx)
28 C mpn_gcd_1 (mp_srcptr xp, mp_size_t xsize, mp_limb_t y);
30 C The entry sequence is designed to expect xsize>1 and hence a modexact
31 C call. This ought to be more common than a 1x1 operation. Our critical
32 C path is thus stripping factors of 2 from y, calling modexact, then
33 C stripping factors of 2 from the x remainder returned.
35 C The common factors of 2 between x and y must be determined using the
36 C original x, not the remainder from the modexact. This is done with
37 C x_orig which is xp[0]. There's plenty of time to do this while the rest
38 C of the modexact etc is happening.
40 C It's possible xp[0] is zero. In this case the trailing zeros calculation
41 C popc((x-1)&~x) gives 63, and that's clearly no less than what y will
42 C have, making min(x_twos,y_twos) == y_twos.
44 C The main loop consists of transforming x,y to abs(x-y),min(x,y), and then
45 C stripping factors of 2 from abs(x-y). Those factors of two are
46 C determined from just y-x, without the abs(), since there's the same
47 C number of trailing zeros on n or -n in twos complement. That makes the
52 C 3 andcm (x-y-1)&~(x-y)
53 C 2 popcnt trailing zeros
54 C 3 shr.u strip abs(x-y)
58 C The selection of x-y versus y-x for abs(x-y), and the selection of the
59 C minimum of x and y, is done in parallel with the above.
61 C The algorithm takes about 0.68 iterations per bit (two N bit operands) on
62 C average, hence the final 6.3 cycles/bitpair.
64 C The loop is not as fast as one might hope, since there's extra latency
65 C from andcm going across to the `multimedia' popcnt, and vice versa from
66 C multimedia shr.u back to the integer sub.
68 C The loop branch is .sptk.clr since we usually expect a good number of
69 C iterations, and the iterations are data dependent so it's unlikely past
70 C results will predict anything much about the future.
74 C An alternate algorithm which didn't strip all twos, but instead applied
75 C tbit and predicated extr on x, and then y, was attempted. The loop was 6
76 C cycles, but the algorithm is an average 1.25 iterations per bitpair for a
77 C total 7.25 c/bp, which is slower than the current approach.
81 C Perhaps we could do something tricky by extracting a few high bits and a
82 C few low bits from the operands, and looking up a table which would give a
83 C set of predicates to control some shifts or subtracts or whatever. That
84 C could knock off multiple bits per iteration.
86 C The right shifts are a bit of a bottleneck (shr at 2 or 3 cycles, or extr
87 C only going down I0), perhaps it'd be possible to shift left instead,
88 C using add. That would mean keeping track of the lowest not-yet-zeroed
89 C bit, using some sort of mask.
93 C This code is not designed for itanium-1 and in fact doesn't run well on
94 C that chip. The loop seems to be about 21 cycles, probably because we end
95 C up with a 10 cycle replay for not forcibly scheduling the shr.u latency.
96 C Lack of branch hints might introduce a couple of bubbles too.
100 .explicit C What does this mean?
102 C HP's assembler requires these declarations for importing mpn_modexact_1c_odd
103 .global mpn_modexact_1c_odd
104 .type mpn_modexact_1c_odd,@function
115 define(y, r34) define(inputs, 3)
117 define(save_pfs, r36)
119 define(x_orig_one, r38)
120 define(y_twos, r39) define(locals, 5)
122 define(out_xsize, r41)
123 define(out_divisor, r42)
124 define(out_carry, r43) define(outputs, 4)
129 ` addp4 r9 = 0, xp_orig define(xp,r9)', C M0
130 ` define(xp,xp_orig)')
131 .save ar.pfs, save_pfs
132 alloc save_pfs = ar.pfs, inputs, locals, outputs, 0 C M2
134 mov save_rp = b0 C I0
136 add r10 = -1, y C M3 y-1
139 { .mmi; ld8 x = [xp] C M0 x = xp[0] if no modexact
140 ld8 x_orig = [xp] C M1 orig x for common twos
141 cmp.ne p6,p0 = 1, xsize C I0
142 }{ .mmi; andcm y_twos = r10, y C M2 (y-1)&~y
143 mov out_xp = xp_orig C M3
144 mov out_xsize = xsize C I1
151 popcnt y_twos = y_twos C I0 y twos
156 { .mmi; add x_orig_one = -1, x_orig C M0 orig x-1
157 shr.u out_divisor = y, y_twos C I0 y without twos
158 }{ shr.u y = y, y_twos C I1 y without twos
159 (p6) br.call.sptk.many b0 = mpn_modexact_1c_odd C if xsize>1
162 C modexact can leave x==0
163 { .mmi; cmp.eq p6,p0 = 0, x C M0 if {xp,xsize} % y == 0
164 andcm x_orig = x_orig_one, x_orig C M1 orig (x-1)&~x
165 add r9 = -1, x C I0 x-1
168 { .mmi; andcm r9 = r9, x C M0 (x-1)&~x
169 mov b0 = save_rp C I0
174 popcnt x_orig = x_orig C I0 orig x twos
176 popcnt r9 = r9 C I0 x twos
181 { cmp.lt p7,p0 = x_orig, y_twos C M0 orig x_twos < y_twos
182 shr.u x = x, r9 C I0 x odd
185 { (p7) mov y_twos = x_orig C M0 common twos
186 add r10 = -1, y C I0 y-1
187 (p6) br.dpnt.few .Ldone_y C B0 x%y==0 then result y
193 C No noticable difference in speed for the loop aligned to
199 C r38 common twos, for use at end
201 { .mmi; cmp.gtu p8,p9 = x, y C M0 x>y
202 cmp.ne p10,p0 = x, y C M1 x==y
203 sub r9 = y, x C I0 d = y - x
204 }{ .mmi; sub r10 = r10, x C M2 d-1 = y - x - 1
207 { .mmi; .pred.rel "mutex", p8, p9
208 (p8) sub x = x, y C M0 x>y use x=x-y, y unchanged
209 (p9) mov y = x C M1 y>=x use y=x
210 (p9) mov x = r9 C I0 y>=x use x=y-x
211 }{ .mmi; andcm r9 = r10, r9 C M2 (d-1)&~d
214 add r10 = -1, y C M0 new y-1
215 popcnt r9 = r9 C I0 twos on x-y
218 { shr.u x = x, r9 C I0 new x without twos
219 (p10) br.sptk.few.clr .Ltop
226 shl r8 = y, y_twos C I common factors of 2
228 mov ar.pfs = save_pfs C I0