1 dnl IA-64 mpn_divexact_1 -- mpn by limb exact division.
3 dnl Copyright 2003, 2004, 2005 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of the GNU Lesser General Public License as published
9 dnl by the Free Software Foundation; either version 3 of the License, or (at
10 dnl your option) any later version.
12 dnl The GNU MP Library is distributed in the hope that it will be useful, but
13 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
15 dnl License for more details.
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20 include(`../config.m4')
30 define(`divisor', `r35')
32 define(`lshift', `r24')
33 define(`rshift', `r25')
35 C This code is a bit messy, and not as similar to mode1o.asm as desired.
37 C The critical path during initialization is for computing the inverse of the
38 C divisor. Since odd divisors are probably common, we conditionally execute
39 C the initial count_traling_zeros code and the downshift.
41 C Possible improvement: Merge more of the feed-in code into the inverse
48 data1 0,0x01, 0,0xAB, 0,0xCD, 0,0xB7, 0,0x39, 0,0xA3, 0,0xC5, 0,0xEF
49 data1 0,0xF1, 0,0x1B, 0,0x3D, 0,0xA7, 0,0x29, 0,0x13, 0,0x35, 0,0xDF
50 data1 0,0xE1, 0,0x8B, 0,0xAD, 0,0x97, 0,0x19, 0,0x83, 0,0xA5, 0,0xCF
51 data1 0,0xD1, 0,0xFB, 0,0x1D, 0,0x87, 0,0x09, 0,0xF3, 0,0x15, 0,0xBF
52 data1 0,0xC1, 0,0x6B, 0,0x8D, 0,0x77, 0,0xF9, 0,0x63, 0,0x85, 0,0xAF
53 data1 0,0xB1, 0,0xDB, 0,0xFD, 0,0x67, 0,0xE9, 0,0xD3, 0,0xF5, 0,0x9F
54 data1 0,0xA1, 0,0x4B, 0,0x6D, 0,0x57, 0,0xD9, 0,0x43, 0,0x65, 0,0x8F
55 data1 0,0x91, 0,0xBB, 0,0xDD, 0,0x47, 0,0xC9, 0,0xB3, 0,0xD5, 0,0x7F
56 data1 0,0x81, 0,0x2B, 0,0x4D, 0,0x37, 0,0xB9, 0,0x23, 0,0x45, 0,0x6F
57 data1 0,0x71, 0,0x9B, 0,0xBD, 0,0x27, 0,0xA9, 0,0x93, 0,0xB5, 0,0x5F
58 data1 0,0x61, 0,0x0B, 0,0x2D, 0,0x17, 0,0x99, 0,0x03, 0,0x25, 0,0x4F
59 data1 0,0x51, 0,0x7B, 0,0x9D, 0,0x07, 0,0x89, 0,0x73, 0,0x95, 0,0x3F
60 data1 0,0x41, 0,0xEB, 0,0x0D, 0,0xF7, 0,0x79, 0,0xE3, 0,0x05, 0,0x2F
61 data1 0,0x31, 0,0x5B, 0,0x7D, 0,0xE7, 0,0x69, 0,0x53, 0,0x75, 0,0x1F
62 data1 0,0x21, 0,0xCB, 0,0xED, 0,0xD7, 0,0x59, 0,0xC3, 0,0xE5, 0,0x0F
63 data1 0,0x11, 0,0x3B, 0,0x5D, 0,0xC7, 0,0x49, 0,0x33, 0,0x55, 0,0xFF
66 PROLOGUE(mpn_divexact_1)
71 {.mmi; add r8 = -1, divisor C M0
73 tbit.z p8, p9 = divisor, 0 C I0
76 ` addp4 rp = 0, rp C M2 rp extend
77 addp4 up = 0, up C M3 up extend
78 sxt4 n = n') C I1 size extend
81 {.mmi; ld8 r20 = [up], 8 C M0 up[0]
82 (p8) andcm r8 = r8, divisor C M1
83 mov r15 = ip C I0 .Lhere
86 .pred.rel "mutex", p8, p9
87 (p9) mov rshift = 0 C M0
88 (p8) popcnt rshift = r8 C I0 r8 = cnt_lo_zeros(divisor)
89 cmp.eq p6, p10 = 1, n C I1
91 }{.mii; add r9 = .Ltab-.Lhere, r15 C M0
92 (p8) shr.u divisor = divisor, rshift C I0
95 }{.mmi; add n = -4, n C M0 size-1
96 (p10) ld8 r21 = [up], 8 C M1 up[1]
98 }{.mfi; setf.sig f6 = divisor C M2 divisor
99 mov f9 = f0 C M3 carry FIXME
100 zxt1 r3 = divisor C I1 divisor low byte
102 }{.mmi; add r3 = r9, r3 C M0 table offset ip and index
103 sub r16 = 0, divisor C M1 -divisor
105 }{.mmi; sub lshift = 64, rshift C M2
106 setf.sig f13 = r14 C M3 2 in significand
109 }{.mmi; ld1 r3 = [r3] C M0 inverse, 8 bits
111 mov ar.lc = n C I0 size-1 loop count
112 }{.mmi; setf.sig f12 = r16 C M2 -divisor
113 setf.sig f8 = r17 C M3 -1
114 cmp.eq p7, p0 = -2, n C I1
116 }{.mmi; setf.sig f7 = r3 C M2 inverse, 8 bits
117 cmp.eq p8, p0 = -1, n C M0
118 shr.u r23 = r20, rshift C I0
123 C f7 inverse, being calculated
124 C f8 -1, will be -inverse
130 xmpy.l f14 = f13, f7 C Newton 2*i
131 xmpy.l f7 = f7, f7 C Newton i*i
133 xma.l f7 = f7, f12, f14 C Newton i*i*-d + 2*i, 16 bits
135 setf.sig f10 = r23 C speculative, used iff n = 1
136 xmpy.l f14 = f13, f7 C Newton 2*i
137 shl r22 = r21, lshift C speculative, used iff n > 1
138 xmpy.l f7 = f7, f7 C Newton i*i
140 or r31 = r22, r23 C speculative, used iff n > 1
141 xma.l f7 = f7, f12, f14 C Newton i*i*-d + 2*i, 32 bits
142 shr.u r23 = r21, rshift C speculative, used iff n > 1
144 setf.sig f11 = r31 C speculative, used iff n > 1
145 xmpy.l f14 = f13, f7 C Newton 2*i
146 xmpy.l f7 = f7, f7 C Newton i*i
148 xma.l f7 = f7, f12, f14 C Newton i*i*-d + 2*i, 64 bits
150 (p7) br.cond.dptk .Ln2
151 (p10) br.cond.dptk .grt3
154 .Ln1: xmpy.l f12 = f10, f7 C q = ulimb * inverse
158 xmpy.l f8 = f7, f8 C -inverse = inverse * -1
159 xmpy.l f12 = f11, f7 C q = ulimb * inverse
164 ld8 r21 = [up], 8 C up[2]
165 xmpy.l f8 = f7, f8 C -inverse = inverse * -1
167 shl r22 = r21, lshift
169 xmpy.l f12 = f11, f7 C q = ulimb * inverse
172 shr.u r23 = r21, rshift
175 (p8) br.cond.dptk .Lx3 C branch for n = 3
180 .Loop: ld8 r21 = [up], 8
181 xma.l f12 = f9, f8, f10 C q = c * -inverse + si
183 .Lent: add r16 = 160, up
184 shl r22 = r21, lshift
187 xma.hu f9 = f12, f6, f9 C c = high(q * divisor + c)
188 xmpy.l f10 = f11, f7 C si = ulimb * inverse
191 shr.u r23 = r21, rshift
195 br.cloop.sptk.few.clr .Loop
198 xma.l f12 = f9, f8, f10 C q = c * -inverse + si
200 .Lx3: stf8 [rp] = f12, 8
201 xma.hu f9 = f12, f6, f9 C c = high(q * divisor + c)
202 xmpy.l f10 = f11, f7 C si = ulimb * inverse
206 xma.l f12 = f9, f8, f10 C q = c * -inverse + si
208 .Lx2: stf8 [rp] = f12, 8
209 xma.hu f9 = f12, f6, f9 C c = high(q * divisor + c)
210 xmpy.l f10 = f11, f7 C si = ulimb * inverse
212 xma.l f12 = f9, f8, f10 C q = c * -inverse + si
214 .Lx1: stf8 [rp] = f12, 8