Upload Tizen:Base source
[external/gmp.git] / mpn / alpha / ev6 / mul_1.asm
1 dnl  Alpha ev6 mpn_mul_1 -- Multiply a limb vector with a limb and store the
2 dnl  result in a second limb vector.
3
4 dnl  Copyright 2000, 2001, 2005 Free Software Foundation, Inc.
5
6 dnl  This file is part of the GNU MP Library.
7
8 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9 dnl  it under the terms of the GNU Lesser General Public License as published
10 dnl  by the Free Software Foundation; either version 3 of the License, or (at
11 dnl  your option) any later version.
12
13 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
14 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
16 dnl  License for more details.
17
18 dnl  You should have received a copy of the GNU Lesser General Public License
19 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
20
21 include(`../config.m4')
22
23 C INPUT PARAMETERS
24 C res_ptr       r16
25 C s1_ptr        r17
26 C size          r18
27 C s2_limb       r19
28
29 C This code runs at 2.25 cycles/limb on EV6.
30
31 C This code was written in close cooperation with ev6 pipeline expert
32 C Steve Root.  Any errors are tege's fault, though.
33
34 C Code structure:
35
36 C  code for n < 8
37 C  code for n >= 8      code for (n mod 8)
38 C                       code for (n div 8)      feed-in code
39 C                                               8-way unrolled loop
40 C                                               wind-down code
41
42 C Some notes about unrolled loop:
43 C
44 C   r1-r8     multiplies and workup
45 C   r21-r28   multiplies and workup
46 C   r9-r12    loads
47 C   r0       -1
48 C   r20,r29,r13-r15  scramble
49 C
50 C   We're doing 7 of the 8 carry propagations with a br fixup code and 1 with a
51 C   put-the-carry-into-hi.  The idea is that these branches are very rarely
52 C   taken, and since a non-taken branch consumes no resources, that is better
53 C   than an addq.
54 C
55 C   Software pipeline: a load in cycle #09, feeds a mul in cycle #16, feeds an
56 C   add NEXT cycle #09 which feeds a store in NEXT cycle #02
57
58 C The code could use some further work:
59 C   1. Speed up really small multiplies.  The default alpha/mul_1.asm code is
60 C      faster than this for size < 3.
61 C   2. Improve feed-in code, perhaps with the equivalent of switch(n%8) unless
62 C      that is too costly.
63 C   3. Consider using 4-way unrolling, even if that runs slower.
64 C   4. Reduce register usage.  In particular, try to avoid using r29.
65
66 ASM_START()
C mpn_mul_1 -- multiply the size-limb vector at s1_ptr by s2_limb, store the
C size low result limbs at res_ptr and return the carry-out limb in r0.
C
C In:   r16 = res_ptr, r17 = s1_ptr, r18 = size, r19 = s2_limb
C Out:  r0  = carry-out (most significant) limb of the product
C
C Sizes below 8 use a simple one-limb-per-iteration loop and no stack frame.
C Sizes of 8 or more allocate a 224-byte frame, save r26, r9-r15 and r29 in
C it, handle the first (size mod 8) limbs with the same kind of simple loop,
C and process the remaining limbs in groups of 8 with software-pipelined code.
C
C There is no test for size == 0: the first limb is always loaded and
C multiplied.
C
C NOTE(review): the L0/L1/U0/U1 tags in the comments presumably name the EV6
C integer pipe each instruction is scheduled for, and #nn the multiply slot
C within the loop -- confirm against the EV6 documentation.
67 PROLOGUE(mpn_mul_1)
68         cmpult  r18,    8,      r1      C r1 = 1 if size < 8 (unsigned)
69         beq     r1,     $Large          C size >= 8: use the unrolled code
70 $Lsmall:
71         ldq     r2,0(r17)       C r2 = s1_limb
72         lda     r18,-1(r18)     C size--
73         mulq    r2,r19,r3       C r3 = prod_low
74         bic     r31,r31,r4      C clear cy_limb
75         umulh   r2,r19,r0       C r0 = prod_high
76         beq     r18,$Le1a       C jump if size was == 1
77         ldq     r2,8(r17)       C r2 = s1_limb
78         lda     r18,-1(r18)     C size--
79         stq     r3,0(r16)       C store first result limb
80         beq     r18,$Le2a       C jump if size was == 2
81         ALIGN(8)
82 $Lopa:  mulq    r2,r19,r3       C r3 = prod_low
83         addq    r4,r0,r0        C cy_limb = cy_limb + 'cy'
84         lda     r18,-1(r18)     C size--
85         umulh   r2,r19,r4       C r4 = cy_limb
86         ldq     r2,16(r17)      C r2 = s1_limb
87         lda     r17,8(r17)      C s1_ptr++
88         addq    r3,r0,r3        C r3 = cy_limb + prod_low
89         stq     r3,8(r16)       C store result limb (res_ptr not yet advanced)
90         cmpult  r3,r0,r0        C r0 = carry from (cy_limb + prod_low)
91         lda     r16,8(r16)      C res_ptr++
92         bne     r18,$Lopa       C loop while limbs remain
93
94 $Le2a:  mulq    r2,r19,r3       C r3 = prod_low
95         addq    r4,r0,r0        C cy_limb = cy_limb + 'cy'
96         umulh   r2,r19,r4       C r4 = cy_limb
97         addq    r3,r0,r3        C r3 = cy_limb + prod_low
98         cmpult  r3,r0,r0        C r0 = carry from (cy_limb + prod_low)
99         stq     r3,8(r16)       C store last result limb
100         addq    r4,r0,r0        C cy_limb = prod_high + cy
101         ret     r31,(r26),1     C return, carry limb in r0
102 $Le1a:  stq     r3,0(r16)       C size == 1: store the only result limb
103         ret     r31,(r26),1     C return, r0 = prod_high
104

C size >= 8.  Allocate a stack frame and save the registers that are
C reloaded from it before the final return.
105 $Large:
106         lda     r30,    -224(r30)       C allocate 224-byte frame
107         stq     r26,    0(r30)          C return address (r26 is reused for products)
108         stq     r9,     8(r30)          C r9-r12 are used for loads
109         stq     r10,    16(r30)
110         stq     r11,    24(r30)
111         stq     r12,    32(r30)
112         stq     r13,    40(r30)         C r13-r15 and r29 are used for sums/carries
113         stq     r14,    48(r30)
114         stq     r15,    56(r30)
115         stq     r29,    64(r30)
116
117         and     r18,    7,      r20     C count for the first loop, 0-7
118         srl     r18,    3,      r18     C count for unrolled loop
119         bis     r31,    r31,    r21     C r21 = 0, carry-in limb if first loop is skipped
120         beq     r20,    $L_8_or_more    C skip first loop
121

C size mod 8 != 0: multiply the first (size mod 8) limbs one at a time.
C On exit r21 holds the carry limb to be added into the unrolled code's
C first result limb, and r16/r17 point past the limbs handled here.
122 $L_9_or_more:
123         ldq     r2,0(r17)       C r2 = s1_limb
124         lda     r17,8(r17)      C s1_ptr++
125         lda     r20,-1(r20)     C leftover count--
126         mulq    r2,r19,r3       C r3 = prod_low
127         umulh   r2,r19,r21      C r21 = prod_high
128         beq     r20,$Le1b       C jump if size was == 1
129         bis     r31, r31, r0    C FIXME: shouldn't need this
130         ldq     r2,0(r17)       C r2 = s1_limb
131         lda     r17,8(r17)      C s1_ptr++
132         lda     r20,-1(r20)     C leftover count--
133         stq     r3,0(r16)       C store first result limb
134         lda     r16,8(r16)      C res_ptr++
135         beq     r20,$Le2b       C jump if size was == 2
136         ALIGN(8)
137 $Lopb:  mulq    r2,r19,r3       C r3 = prod_low
138         addq    r21,r0,r0       C cy_limb = cy_limb + 'cy'
139         lda     r20,-1(r20)     C leftover count--
140         umulh   r2,r19,r21      C r21 = prod_high
141         ldq     r2,0(r17)       C r2 = s1_limb
142         lda     r17,8(r17)      C s1_ptr++
143         addq    r3,r0,r3        C r3 = cy_limb + prod_low
144         stq     r3,0(r16)       C store result limb
145         cmpult  r3,r0,r0        C r0 = carry from (cy_limb + prod_low)
146         lda     r16,8(r16)      C res_ptr++
147         bne     r20,$Lopb       C loop while leftover limbs remain
148
149 $Le2b:  mulq    r2,r19,r3       C r3 = prod_low
150         addq    r21,r0,r0       C cy_limb = cy_limb + 'cy'
151         umulh   r2,r19,r21      C r21 = prod_high
152         addq    r3,r0,r3        C r3 = cy_limb + prod_low
153         cmpult  r3,r0,r0        C r0 = carry from (cy_limb + prod_low)
154         stq     r3,0(r16)       C store last leftover result limb
155         lda     r16,8(r16)      C res_ptr++
156         addq    r21,r0,r21      C cy_limb = prod_high + cy
157         br      r31,    $L_8_or_more    C continue with the groups of 8
158 $Le1b:  stq     r3,0(r16)       C only one leftover limb: store it
159         lda     r16,8(r16)      C res_ptr++
160

C Feed-in for the groups of 8.  r18 = number of groups, r21 = carry-in limb.
C Loads the 8 limbs of the first group and starts its multiplies.
161 $L_8_or_more:
162         lda     r0,     -1(r31)         C put -1 in r0, for tricky loop control
163         lda     r17,    -32(r17)        C L1 bookkeeping
164         lda     r18,    -1(r18)         C decrement count
165
166         ldq     r9,     32(r17)         C L1
167         ldq     r10,    40(r17)         C L1
168         mulq    r9,     r19,    r22     C U1 #07
169         ldq     r11,    48(r17)         C L1
170         umulh   r9,     r19,    r23     C U1 #08
171         ldq     r12,    56(r17)         C L1
172         mulq    r10,    r19,    r24     C U1 #09
173         ldq     r9,     64(r17)         C L1
174
175         lda     r17,    64(r17)         C L1 bookkeeping
176
177         umulh   r10,    r19,    r25     C U1 #11
178         mulq    r11,    r19,    r26     C U1 #12
179         umulh   r11,    r19,    r27     C U1 #13
180         mulq    r12,    r19,    r28     C U1 #14
181         ldq     r10,    8(r17)          C L1
182         umulh   r12,    r19,    r1      C U1 #15
183         ldq     r11,    16(r17)         C L1
184         mulq    r9,     r19,    r2      C U1 #16
185         ldq     r12,    24(r17)         C L1
186         umulh   r9,     r19,    r3      C U1 #17
187         addq    r21,    r22,    r13     C L1 carry-in limb + first low product
188         mulq    r10,    r19,    r4      C U1 #18
189         addq    r23,    r24,    r22     C L0 sum 2 mul's
190         cmpult  r13,    r21,    r14     C L1 carry from sum
191         bgt     r18,    $L_16_or_more   C two or more groups of 8 to do
192

C Exactly one group of 8: finish its multiplies and join the wind-down code
C at $ret0c.  r31 reads as zero, so the addq into r13 leaves it unchanged.
193         cmpult  r22,    r24,    r24     C U0 carry from sum
194         umulh   r10,    r19,    r5      C U1 #02
195         addq    r25,    r26,    r23     C U0 sum 2 mul's
196         mulq    r11,    r19,    r6      C U1 #03
197         cmpult  r23,    r26,    r25     C U0 carry from sum
198         umulh   r11,    r19,    r7      C U1 #04
199         addq    r27,    r28,    r28     C U0 sum 2 mul's
200         mulq    r12,    r19,    r8      C U1 #05
201         cmpult  r28,    r27,    r15     C L0 carry from sum
202         lda     r16,    32(r16)         C L1 bookkeeping
203         addq    r13,    r31,    r13     C U0 start carry cascade
204         umulh   r12,    r19,    r21     C U1 #06
205         br      r31,    $ret0c          C join the wind-down code
206

C First group when more groups follow.  Same work as one pass of $Loop,
C except that there are no results of a previous group to store and no
C carry from one to add (r31 is used where the loop uses r29).
C The fixups $fix4..$fix6 return to $ret4..$ret6 inside $Loop; the code from
C there on matches the code after $ret4w..$ret6w here, apart from the sense
C of the final branch, so $ret4w..$ret6w are never branch targets.
207 $L_16_or_more:
208 C ---------------------------------------------------------------
209         subq    r18,1,r18               C decrement group count
210         cmpult  r22,    r24,    r24     C U0 carry from sum
211         ldq     r9,     32(r17)         C L1
212
213         umulh   r10,    r19,    r5      C U1 #02
214         addq    r25,    r26,    r23     C U0 sum 2 mul's
215         mulq    r11,    r19,    r6      C U1 #03
216         cmpult  r23,    r26,    r25     C U0 carry from sum
217         umulh   r11,    r19,    r7      C U1 #04
218         addq    r27,    r28,    r28     C U0 sum 2 mul's
219         mulq    r12,    r19,    r8      C U1 #05
220         cmpult  r28,    r27,    r15     C L0 carry from sum
221         lda     r16,    32(r16)         C L1 bookkeeping
222         addq    r13,    r31,    r13     C U0 start carry cascade
223
224         umulh   r12,    r19,    r21     C U1 #06
225 C       beq     r13,    $fix0w          C U0
226 $ret0w: addq    r22,    r14,    r26     C L0
227         ldq     r10,    40(r17)         C L1
228
229         mulq    r9,     r19,    r22     C U1 #07
230         beq     r26,    $fix1w          C U0
231 $ret1w: addq    r23,    r24,    r27     C L0
232         ldq     r11,    48(r17)         C L1
233
234         umulh   r9,     r19,    r23     C U1 #08
235         beq     r27,    $fix2w          C U0
236 $ret2w: addq    r28,    r25,    r28     C L0
237         ldq     r12,    56(r17)         C L1
238
239         mulq    r10,    r19,    r24     C U1 #09
240         beq     r28,    $fix3w          C U0
241 $ret3w: addq    r1,     r2,     r20     C L0 sum 2 mul's
242         ldq     r9,     64(r17)         C L1
243
244         addq    r3,     r4,     r2      C L0 #10 2 mul's
245         lda     r17,    64(r17)         C L1 bookkeeping
246         cmpult  r20,    r1,     r29     C U0 carry from sum
247
248         umulh   r10,    r19,    r25     C U1 #11
249         cmpult  r2,     r4,     r4      C U0 carry from sum
250         stq     r13,    -32(r16)        C L0
251         stq     r26,    -24(r16)        C L1
252
253         mulq    r11,    r19,    r26     C U1 #12
254         addq    r5,     r6,     r14     C U0 sum 2 mul's
255         stq     r27,    -16(r16)        C L0
256         stq     r28,    -8(r16)         C L1
257
258         umulh   r11,    r19,    r27     C U1 #13
259         cmpult  r14,    r6,     r3      C U0 carry from sum
260 C could do cross-jumping here:
261 C       bra     $L_middle_of_unrolled_loop
262         mulq    r12,    r19,    r28     C U1 #14
263         addq    r7,     r3,     r5      C L0 eat carry
264         addq    r20,    r15,    r20     C U0 carry cascade
265         ldq     r10,    8(r17)          C L1
266
267         umulh   r12,    r19,    r1      C U1 #15
268         beq     r20,    $fix4           C U0
269 $ret4w: addq    r2,     r29,    r6      C L0
270         ldq     r11,    16(r17)         C L1
271
272         mulq    r9,     r19,    r2      C U1 #16
273         beq     r6,     $fix5           C U0
274 $ret5w: addq    r14,    r4,     r7      C L0
275         ldq     r12,    24(r17)         C L1
276
277         umulh   r9,     r19,    r3      C U1 #17
278         beq     r7,     $fix6           C U0
279 $ret6w: addq    r5,     r8,     r8      C L0 sum 2
280         addq    r21,    r22,    r13     C L1 sum 2 mul's
281
282         mulq    r10,    r19,    r4      C U1 #18
283         addq    r23,    r24,    r22     C L0 sum 2 mul's
284         cmpult  r13,    r21,    r14     C L1 carry from sum
285         ble     r18,    $Lend           C U0
286 C ---------------------------------------------------------------

C Main loop, one group of 8 limbs per pass.  It is entered only with
C r18 > 0.  r0 holds -1, i.e. 2^64-1 as an unsigned value, so the high half
C of r0*r18 is r18-1: the umulh at the top decrements the group count.
C
C Carry handling: each result limb is a sum of two product halves (carry
C saved with cmpult) plus the carry coming from the limb below.  Adding that
C incoming carry can only carry out when the result is zero, so each such
C add is followed by a beq to a fixup that merges the incoming carry into
C the next one.
287         ALIGN(16)
288 $Loop:
289         umulh   r0,     r18,    r18     C U1 #01 decrement r18!
290         cmpult  r8,     r5,     r29     C L0 carry from last bunch
291         cmpult  r22,    r24,    r24     C U0 carry from sum
292         ldq     r9,     32(r17)         C L1
293
294         umulh   r10,    r19,    r5      C U1 #02
295         addq    r25,    r26,    r23     C U0 sum 2 mul's
296         stq     r20,    0(r16)          C L0
297         stq     r6,     8(r16)          C L1
298
299         mulq    r11,    r19,    r6      C U1 #03
300         cmpult  r23,    r26,    r25     C U0 carry from sum
301         stq     r7,     16(r16)         C L0
302         stq     r8,     24(r16)         C L1
303
304         umulh   r11,    r19,    r7      C U1 #04
305         bis     r31,    r31,    r31     C L0 st slosh
306         bis     r31,    r31,    r31     C L1 st slosh
307         addq    r27,    r28,    r28     C U0 sum 2 mul's
308
309         mulq    r12,    r19,    r8      C U1 #05
310         cmpult  r28,    r27,    r15     C L0 carry from sum
311         lda     r16,    64(r16)         C L1 bookkeeping
312         addq    r13,    r29,    r13     C U0 start carry cascade
313
314         umulh   r12,    r19,    r21     C U1 #06
315         beq     r13,    $fix0           C U0
316 $ret0:  addq    r22,    r14,    r26     C L0
317         ldq     r10,    40(r17)         C L1
318
319         mulq    r9,     r19,    r22     C U1 #07
320         beq     r26,    $fix1           C U0
321 $ret1:  addq    r23,    r24,    r27     C L0
322         ldq     r11,    48(r17)         C L1
323
324         umulh   r9,     r19,    r23     C U1 #08
325         beq     r27,    $fix2           C U0
326 $ret2:  addq    r28,    r25,    r28     C L0
327         ldq     r12,    56(r17)         C L1
328
329         mulq    r10,    r19,    r24     C U1 #09
330         beq     r28,    $fix3           C U0
331 $ret3:  addq    r1,     r2,     r20     C L0 sum 2 mul's
332         ldq     r9,     64(r17)         C L1
333
334         addq    r3,     r4,     r2      C L0 #10 2 mul's
335         bis     r31,    r31,    r31     C U1 mul hole
336         lda     r17,    64(r17)         C L1 bookkeeping
337         cmpult  r20,    r1,     r29     C U0 carry from sum
338
339         umulh   r10,    r19,    r25     C U1 #11
340         cmpult  r2,     r4,     r4      C U0 carry from sum
341         stq     r13,    -32(r16)        C L0
342         stq     r26,    -24(r16)        C L1
343
344         mulq    r11,    r19,    r26     C U1 #12
345         addq    r5,     r6,     r14     C U0 sum 2 mul's
346         stq     r27,    -16(r16)        C L0
347         stq     r28,    -8(r16)         C L1
348
349         umulh   r11,    r19,    r27     C U1 #13
350         bis     r31,    r31,    r31     C L0 st slosh
351         bis     r31,    r31,    r31     C L1 st slosh
352         cmpult  r14,    r6,     r3      C U0 carry from sum
353 $L_middle_of_unrolled_loop:
354         mulq    r12,    r19,    r28     C U1 #14
355         addq    r7,     r3,     r5      C L0 eat carry
356         addq    r20,    r15,    r20     C U0 carry cascade
357         ldq     r10,    8(r17)          C L1
358
359         umulh   r12,    r19,    r1      C U1 #15
360         beq     r20,    $fix4           C U0
361 $ret4:  addq    r2,     r29,    r6      C L0
362         ldq     r11,    16(r17)         C L1
363
364         mulq    r9,     r19,    r2      C U1 #16
365         beq     r6,     $fix5           C U0
366 $ret5:  addq    r14,    r4,     r7      C L0
367         ldq     r12,    24(r17)         C L1
368
369         umulh   r9,     r19,    r3      C U1 #17
370         beq     r7,     $fix6           C U0
371 $ret6:  addq    r5,     r8,     r8      C L0 sum 2
372         addq    r21,    r22,    r13     C L1 sum 2 mul's
373
374         mulq    r10,    r19,    r4      C U1 #18
375         addq    r23,    r24,    r22     C L0 sum 2 mul's
376         cmpult  r13,    r21,    r14     C L1 carry from sum
377         bgt     r18,    $Loop           C U0
378 C ---------------------------------------------------------------

C Wind-down: no further loads.  Stores the results still pending from the
C previous group, finishes the multiplies and carry cascade of the last
C group, stores it, and forms the return value in r0.
379 $Lend:
380         cmpult  r8,     r5,     r29     C L0 carry from last bunch
381         cmpult  r22,    r24,    r24     C U0 carry from sum
382
383         umulh   r10,    r19,    r5      C U1 #02
384         addq    r25,    r26,    r23     C U0 sum 2 mul's
385         stq     r20,    0(r16)          C L0
386         stq     r6,     8(r16)          C L1
387
388         mulq    r11,    r19,    r6      C U1 #03
389         cmpult  r23,    r26,    r25     C U0 carry from sum
390         stq     r7,     16(r16)         C L0
391         stq     r8,     24(r16)         C L1
392
393         umulh   r11,    r19,    r7      C U1 #04
394         addq    r27,    r28,    r28     C U0 sum 2 mul's
395
396         mulq    r12,    r19,    r8      C U1 #05
397         cmpult  r28,    r27,    r15     C L0 carry from sum
398         lda     r16,    64(r16)         C L1 bookkeeping
399         addq    r13,    r29,    r13     C U0 start carry cascade
400
401         umulh   r12,    r19,    r21     C U1 #06
402         beq     r13,    $fix0c          C U0
403 $ret0c: addq    r22,    r14,    r26     C L0
404         beq     r26,    $fix1c          C U0
405 $ret1c: addq    r23,    r24,    r27     C L0
406         beq     r27,    $fix2c          C U0
407 $ret2c: addq    r28,    r25,    r28     C L0
408         beq     r28,    $fix3c          C U0
409 $ret3c: addq    r1,     r2,     r20     C L0 sum 2 mul's
410         addq    r3,     r4,     r2      C L0 #10 2 mul's
411         lda     r17,    64(r17)         C L1 bookkeeping
412         cmpult  r20,    r1,     r29     C U0 carry from sum
413         cmpult  r2,     r4,     r4      C U0 carry from sum
414         stq     r13,    -32(r16)        C L0
415         stq     r26,    -24(r16)        C L1
416         addq    r5,     r6,     r14     C U0 sum 2 mul's
417         stq     r27,    -16(r16)        C L0
418         stq     r28,    -8(r16)         C L1
419         cmpult  r14,    r6,     r3      C U0 carry from sum
420         addq    r7,     r3,     r5      C L0 eat carry
421         addq    r20,    r15,    r20     C U0 carry cascade
422         beq     r20,    $fix4c          C U0
423 $ret4c: addq    r2,     r29,    r6      C L0
424         beq     r6,     $fix5c          C U0
425 $ret5c: addq    r14,    r4,     r7      C L0
426         beq     r7,     $fix6c          C U0
427 $ret6c: addq    r5,     r8,     r8      C L0 sum 2
428         cmpult  r8,     r5,     r29     C L0 carry from last bunch
429         stq     r20,    0(r16)          C L0
430         stq     r6,     8(r16)          C L1
431         stq     r7,     16(r16)         C L0
432         stq     r8,     24(r16)         C L1
433         addq    r29,    r21,    r0      C return value = last prod_high + final carry
434
435         ldq     r26,    0(r30)          C restore the registers saved at $Large
436         ldq     r9,     8(r30)
437         ldq     r10,    16(r30)
438         ldq     r11,    24(r30)
439         ldq     r12,    32(r30)
440         ldq     r13,    40(r30)
441         ldq     r14,    48(r30)
442         ldq     r15,    56(r30)
443         ldq     r29,    64(r30)
444         lda     r30,    224(r30)        C release the stack frame
445         ret     r31,    (r26),  1       C return, carry limb in r0
446

C Carry fixups, kept out of line.  Each is reached when a result limb came
C out zero after its incoming carry was added; it merges that incoming carry
C into the carry for the next limb and branches back.  The w, plain and c
C suffixes belong to the first-group, loop and wind-down code respectively;
C $fix4..$fix6 serve both the first-group code and the loop.
447 C $fix0w:       bis     r14,    r29,    r14     C join carries
448 C       br      r31,    $ret0w
449 $fix1w: bis     r24,    r14,    r24     C join carries
450         br      r31,    $ret1w
451 $fix2w: bis     r25,    r24,    r25     C join carries
452         br      r31,    $ret2w
453 $fix3w: bis     r15,    r25,    r15     C join carries
454         br      r31,    $ret3w
455 $fix0:  bis     r14,    r29,    r14     C join carries
456         br      r31,    $ret0
457 $fix1:  bis     r24,    r14,    r24     C join carries
458         br      r31,    $ret1
459 $fix2:  bis     r25,    r24,    r25     C join carries
460         br      r31,    $ret2
461 $fix3:  bis     r15,    r25,    r15     C join carries
462         br      r31,    $ret3
463 $fix4:  bis     r29,    r15,    r29     C join carries
464         br      r31,    $ret4
465 $fix5:  bis     r4,     r29,    r4      C join carries
466         br      r31,    $ret5
467 $fix6:  addq    r5,     r4,     r5      C can't carry twice!
468         br      r31,    $ret6
469 $fix0c: bis     r14,    r29,    r14     C join carries
470         br      r31,    $ret0c
471 $fix1c: bis     r24,    r14,    r24     C join carries
472         br      r31,    $ret1c
473 $fix2c: bis     r25,    r24,    r25     C join carries
474         br      r31,    $ret2c
475 $fix3c: bis     r15,    r25,    r15     C join carries
476         br      r31,    $ret3c
477 $fix4c: bis     r29,    r15,    r29     C join carries
478         br      r31,    $ret4c
479 $fix5c: bis     r4,     r29,    r4      C join carries
480         br      r31,    $ret5c
481 $fix6c: addq    r5,     r4,     r5      C can't carry twice!
482         br      r31,    $ret6c
483
484 EPILOGUE(mpn_mul_1)
485 ASM_END()