Upload Tizen:Base source
[external/gmp.git] / mpn / alpha / bdiv_dbm1c.asm
1 dnl  Alpha mpn_bdiv_dbm1c.
2
3 dnl  Copyright 2008 Free Software Foundation, Inc.
4
5 dnl  This file is part of the GNU MP Library.
6
7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl  it under the terms of the GNU Lesser General Public License as published
9 dnl  by the Free Software Foundation; either version 3 of the License, or (at
10 dnl  your option) any later version.
11
12 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
13 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
15 dnl  License for more details.
16
17 dnl  You should have received a copy of the GNU Lesser General Public License
18 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20 include(`../config.m4')
21
22 C      cycles/limb
23 C EV4:     42
24 C EV5:     18
25 C EV6:      3
26
27 C TODO
28 C  * Try less unrolling, 2-way should give the same performance.
29 C  * Optimize feed-in and wind-down code, for speed, and perhaps further for
30 C    code size.
31 C  * This runs optimally given the algorithm; r8 is on a 3-operation recurrence
32 C    path.  We have not tried very hard to find a better algorithm.  Perhaps
33 C    it would be a good task for the GNU superoptimizer.
34
35 C INPUT PARAMETERS
36 define(`rp', `r16')     C base register for every result store
37 define(`up', `r17')     C base register for every source load
38 define(`n',  `r18')     C limb count, used as the loop counter
39 define(`bd', `r19')     C multiplier applied to every source limb
40 define(`cy', `r20')     C initial running word, copied to r8 on entry
41 C mpn_bdiv_dbm1c: walks the limbs at r17 in order; per limb u it forms the 128-bit product
42 C lo:hi = u*r19, then b = r8 < lo; r8 -= lo; stores r8 via r16; r8 -= hi + b.  Returns r8 in r0.
43 ASM_START()
44 PROLOGUE(mpn_bdiv_dbm1c)
45         mov     r20, r8         C r8 = running word, seeded from r20
46 C Dispatch on r18 & 3 to a feed-in that primes the 4-way pipeline; r18 is pre-decremented by 4.
47         ldq     r24, 0(r17)     C first source limb, loaded unconditionally
48         and     r18, 3, r28     C r28 = count mod 4
49         lda     r18, -4(r18)    C r18 = count - 4
50         beq     r28, L(b0)
51         cmpeq   r28, 1, r21
52         bne     r21, L(b1)
53         cmpeq   r28, 2, r21
54         bne     r21, L(b2)
55
56 C Count = 3 mod 4 (fall-through case).
57 L(b3):  ldq     r2, 8(r17)
58         ldq     r3, 16(r17)
59         bgt     r18, L(gt3)     C taken when the count exceeds 4
60 C Exactly 3 limbs: all products now, no loop.  r16 is biased so 32(r16) at L(cj3) is the first store slot.
61         mulq    r24, r19, r5    C U1
62         umulh   r24, r19, r21   C U1
63         mulq    r2, r19, r6     C U1
64         umulh   r2, r19, r22    C U1
65         mulq    r3, r19, r7     C U1
66         umulh   r3, r19, r23    C U1
67         lda     r16, -32(r16)
68         br      L(cj3)
69 C 7 or more limbs: three full products, fourth started, r1-r3 preloaded; first store is 0(r16), no bias.
70 L(gt3): ldq     r0, 24(r17)
71         mulq    r24, r19, r5    C U1
72         umulh   r24, r19, r21   C U1
73         ldq     r1, 32(r17)
74         mulq    r2, r19, r6     C U1
75         umulh   r2, r19, r22    C U1
76         ldq     r2, 40(r17)
77         mulq    r3, r19, r7     C U1
78         umulh   r3, r19, r23    C U1
79         ldq     r3, 48(r17)
80         lda     r18, -4(r18)
81         lda     r17, 56(r17)
82         mulq    r0, r19, r4     C U1
83         bgt     r18, L(L3)
84 C Exactly 7 limbs: nothing left to load, go straight to the wind-down.
85         br      L(cj7)
86
87 C Count = 2 mod 4.
88 L(b2):  ldq     r3, 8(r17)
89         bgt     r18, L(gt2)     C taken when the count exceeds 4
90 C Exactly 2 limbs.  r16 is biased so 40(r16) at L(cj2) is the first store slot.
91         mulq    r24, r19, r6    C U1
92         umulh   r24, r19, r22   C U1
93         mulq    r3, r19, r7     C U1
94         umulh   r3, r19, r23    C U1
95         lda     r16, -40(r16)
96         br      L(cj2)
97 C 6 or more limbs: enter at L(L2)/L(cj6), whose first store is 8(r16), hence the -8 bias.
98 L(gt2): ldq     r0, 16(r17)
99         ldq     r1, 24(r17)
100         mulq    r24, r19, r6    C U1
101         umulh   r24, r19, r22   C U1
102         ldq     r2, 32(r17)
103         mulq    r3, r19, r7     C U1
104         umulh   r3, r19, r23    C U1
105         ldq     r3, 40(r17)
106         lda     r18, -4(r18)
107         lda     r17, 48(r17)
108         mulq    r0, r19, r4     C U1
109         umulh   r0, r19, r20    C U1
110         lda     r16, -8(r16)
111         bgt     r18, L(gt6)
112 C Exactly 6 limbs.
113         mulq    r1, r19, r5     C U1
114         br      L(cj6)
115 C 10 or more limbs: fetch the next limb into r0 and join the loop.
116 L(gt6): ldq     r0, 0(r17)
117         mulq    r1, r19, r5     C U1
118         br      L(L2)
119
120 C Count = 1 mod 4.
121 L(b1):  bgt     r18, L(gt1)     C taken when the count exceeds 4
122 C Exactly 1 limb.  r16 is biased so 48(r16) at L(cj1) is the store slot.
123         mulq    r24, r19, r7    C U1
124         umulh   r24, r19, r23   C U1
125         lda     r16, -48(r16)
126         br      L(cj1)
127 C 5 or more limbs: enter at L(L1)/L(cj5), whose first store is 16(r16), hence the -16 bias.
128 L(gt1): ldq     r0, 8(r17)
129         ldq     r1, 16(r17)
130         ldq     r2, 24(r17)
131         mulq    r24, r19, r7    C U1
132         umulh   r24, r19, r23   C U1
133         ldq     r3, 32(r17)
134         lda     r18, -4(r18)
135         lda     r17, 40(r17)
136         mulq    r0, r19, r4     C U1
137         umulh   r0, r19, r20    C U1
138         lda     r16, -16(r16)
139         bgt     r18, L(gt5)
140 C Exactly 5 limbs.
141         mulq    r1, r19, r5     C U1
142         umulh   r1, r19, r21    C U1
143         mulq    r2, r19, r6     C U1
144         br      L(cj5)
145 C 9 or more limbs: fetch the next two limbs and join the loop.
146 L(gt5): ldq     r0, 0(r17)
147         mulq    r1, r19, r5     C U1
148         umulh   r1, r19, r21    C U1
149         ldq     r1, 8(r17)
150         mulq    r2, r19, r6     C U1
151         br      L(L1)
152 C NOTE(review): a count of 0 also lands at L(b0) and would process 4 limbs; presumably callers pass >= 1.
153 C Count = 0 mod 4.  Enter at L(L0)/L(cj4), whose first store is 24(r16), hence the -24 bias.
154 L(b0):  ldq     r1, 8(r17)
155         ldq     r2, 16(r17)
156         ldq     r3, 24(r17)
157         lda     r17, 32(r17)
158         lda     r16, -24(r16)
159         mulq    r24, r19, r4    C U1
160         umulh   r24, r19, r20   C U1
161         bgt     r18, L(gt4)     C taken when the count exceeds 4
162 C Exactly 4 limbs.
163         mulq    r1, r19, r5     C U1
164         umulh   r1, r19, r21    C U1
165         mulq    r2, r19, r6     C U1
166         umulh   r2, r19, r22    C U1
167         mulq    r3, r19, r7     C U1
168         br      L(cj4)
169 C 8 or more limbs: fetch the next three limbs and join the loop.
170 L(gt4): ldq     r0, 0(r17)
171         mulq    r1, r19, r5     C U1
172         umulh   r1, r19, r21    C U1
173         ldq     r1, 8(r17)
174         mulq    r2, r19, r6     C U1
175         umulh   r2, r19, r22    C U1
176         ldq     r2, 16(r17)
177         mulq    r3, r19, r7     C U1
178         br      L(L0)
179 C Each quarter of the loop retires one limb (borrow, subtract, store) while multiplying and loading later ones.
180 C *** MAIN LOOP START ***
181         ALIGN(16)
182 L(top): mulq    r0, r19, r4     C U1
183         subq    r8, r28, r8     C finish previous limb: r8 -= hi + borrow
184 L(L3):  umulh   r0, r19, r20    C U1
185         cmpult  r8, r5, r28     C r28 = borrow out of r8 - lo
186         ldq     r0, 0(r17)      C fetch a limb for a later quarter
187         subq    r8, r5, r8      C r8 -= lo
188         addq    r21, r28, r28   C r28 = hi + borrow
189         stq     r8, 0(r16)      C store result limb
190
191         mulq    r1, r19, r5     C U1
192         subq    r8, r28, r8
193 L(L2):  umulh   r1, r19, r21    C U1
194         cmpult  r8, r6, r28
195         ldq     r1, 8(r17)
196         subq    r8, r6, r8
197         addq    r22, r28, r28
198         stq     r8, 8(r16)
199
200         mulq    r2, r19, r6     C U1
201         subq    r8, r28, r8
202 L(L1):  umulh   r2, r19, r22    C U1
203         cmpult  r8, r7, r28
204         ldq     r2, 16(r17)
205         subq    r8, r7, r8
206         addq    r23, r28, r28
207         stq     r8, 16(r16)
208
209         mulq    r3, r19, r7     C U1
210         subq    r8, r28, r8
211 L(L0):  umulh   r3, r19, r23    C U1
212         cmpult  r8, r4, r28
213         ldq     r3, 24(r17)
214         subq    r8, r4, r8
215         addq    r20, r28, r28
216         stq     r8, 24(r16)
217 C Four limbs retired: step the counter and both pointers.
218         lda     r18, -4(r18)
219         lda     r17, 32(r17)
220         lda     r16, 32(r16)
221         bgt     r18, L(top)
222 C *** MAIN LOOP END ***
223 C Wind-down: same steps as the loop without loads; after the umulh at L(cj4) only subtract/store steps remain.
224         mulq    r0, r19, r4     C U1
225         subq    r8, r28, r8
226 L(cj7): umulh   r0, r19, r20    C U1
227         cmpult  r8, r5, r28
228         subq    r8, r5, r8
229         addq    r21, r28, r28
230         stq     r8, 0(r16)
231         mulq    r1, r19, r5     C U1
232         subq    r8, r28, r8
233 L(cj6): umulh   r1, r19, r21    C U1
234         cmpult  r8, r6, r28
235         subq    r8, r6, r8
236         addq    r22, r28, r28
237         stq     r8, 8(r16)
238         mulq    r2, r19, r6     C U1
239         subq    r8, r28, r8
240 L(cj5): umulh   r2, r19, r22    C U1
241         cmpult  r8, r7, r28
242         subq    r8, r7, r8
243         addq    r23, r28, r28
244         stq     r8, 16(r16)
245         mulq    r3, r19, r7     C U1
246         subq    r8, r28, r8
247 L(cj4): umulh   r3, r19, r23    C U1
248         cmpult  r8, r4, r28
249         subq    r8, r4, r8
250         addq    r20, r28, r28
251         stq     r8, 24(r16)
252         subq    r8, r28, r8
253 L(cj3): cmpult  r8, r5, r28
254         subq    r8, r5, r8
255         addq    r21, r28, r28
256         stq     r8, 32(r16)
257         subq    r8, r28, r8
258 L(cj2): cmpult  r8, r6, r28
259         subq    r8, r6, r8
260         addq    r22, r28, r28
261         stq     r8, 40(r16)
262         subq    r8, r28, r8
263 L(cj1): cmpult  r8, r7, r28
264         subq    r8, r7, r8
265         addq    r23, r28, r28
266         stq     r8, 48(r16)     C last result limb
267         subq    r8, r28, r0     C return value: final running word
268         ret     r31, (r26), 1
269
270 EPILOGUE()
271 ASM_END()