1 dnl Alpha ev6 mpn_mul_1 -- Multiply a limb vector with a limb and store the
2 dnl result in a second limb vector.
4 dnl Copyright 2000, 2001, 2005 Free Software Foundation, Inc.
6 dnl This file is part of the GNU MP Library.
8 dnl The GNU MP Library is free software; you can redistribute it and/or modify
9 dnl it under the terms of the GNU Lesser General Public License as published
10 dnl by the Free Software Foundation; either version 3 of the License, or (at
11 dnl your option) any later version.
13 dnl The GNU MP Library is distributed in the hope that it will be useful, but
14 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16 dnl License for more details.
18 dnl You should have received a copy of the GNU Lesser General Public License
19 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
21 include(`../config.m4')
29 C This code runs at 2.25 cycles/limb on EV6.
31 C This code was written in close cooperation with ev6 pipeline expert
32 C Steve Root. Any errors are tege's fault, though.
37 C code for n > 8: code for (n mod 8), followed by
38 C code for (n div 8), which starts with feed-in code
42 C Some notes about unrolled loop:
44 C r1-r8 multiplies and workup
45 C r21-r28 multiplies and workup
48 C r20,r29,r13-r15 scramble
50 C We're doing 7 of the 8 carry propagations with a br fixup code and 1 with a
51 C put-the-carry-into-hi. The idea is that these branches are very rarely
52 C taken, and since a non-taken branch consumes no resources, that is better than an addq.
55 C Software pipeline: a load in cycle #09, feeds a mul in cycle #16, feeds an
56 C add NEXT cycle #09 which feeds a store in NEXT cycle #02
58 C The code could use some further work:
59 C 1. Speed up really small multiplies. The default alpha/mul_1.asm code is
60 C faster than this for size < 3.
61 C 2. Improve feed-in code, perhaps with the equivalent of switch(n%8) unless that is too costly.
63 C 3. Consider using 4-way unrolling, even if that runs slower.
64 C 4. Reduce register usage. In particular, try to avoid using r29.
C Limb-at-a-time path.  Register roles, as used by the instructions below:
C   r16 = res_ptr, r17 = s1_ptr, r18 = remaining size,
C   r19 = multiplier (second source of every mulq/umulh),
C   r2 = current s1 limb, r3 = low product, r0/r4 = high product and carry.
C NOTE(review): the embedded line numbers skip values in this block (78 -> 80,
C 88 -> 90, 91 -> 94, 98 -> 100); the result stores, the loop-back branch and
C the $Le1a target are presumably on those missing lines -- confirm against
C the complete file.
71 ldq r2,0(r17) C r2 = s1_limb
72 lda r18,-1(r18) C size--
73 mulq r2,r19,r3 C r3 = prod_low
C r31 is the always-zero register, so the bic below produces 0 in r4.
74 bic r31,r31,r4 C clear cy_limb
75 umulh r2,r19,r0 C r0 = prod_high
76 beq r18,$Le1a C jump if size was == 1
77 ldq r2,8(r17) C r2 = s1_limb
78 lda r18,-1(r18) C size--
80 beq r18,$Le2a C jump if size was == 2
C Loop body: multiply the limb already in r2, fold the previous high limb
C (r4) and carry (r0) into the low product, and fetch the next limb.
82 $Lopa: mulq r2,r19,r3 C r3 = prod_low
83 addq r4,r0,r0 C cy_limb = cy_limb + 'cy'
84 lda r18,-1(r18) C size--
85 umulh r2,r19,r4 C r4 = cy_limb
C r17 still points one limb behind the limb being multiplied, so the next
C limb is at displacement 16; r17 is advanced right afterwards.
86 ldq r2,16(r17) C r2 = s1_limb
87 lda r17,8(r17) C s1_ptr++
88 addq r3,r0,r3 C r3 = cy_limb + prod_low
C Unsigned "sum < addend" is 1 exactly when the addq above wrapped around.
90 cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
91 lda r16,8(r16) C res_ptr++
C Last limb: same arithmetic as the loop body, without the load, pointer and
C count updates.  The final addq leaves high product + carry in r0.
94 $Le2a: mulq r2,r19,r3 C r3 = prod_low
95 addq r4,r0,r0 C cy_limb = cy_limb + 'cy'
96 umulh r2,r19,r4 C r4 = cy_limb
97 addq r3,r0,r3 C r3 = cy_limb + prod_low
98 cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
100 addq r4,r0,r0 C cy_limb = prod_high + cy
C Split the limb count: r20 = size mod 8 is consumed one limb at a time by
C this block, r18 = size div 8 is kept as the count for the unrolled loop.
C When r20 is zero the whole block is skipped by branching to $L_8_or_more.
C Here the high product lives in r21 (not r4) and the carry in r0; the
C "size" wording in the comments below refers to the r20 count.
C NOTE(review): the embedded line numbers skip values in this block (118 ->
C 120, 120 -> 123, 132 -> 134, 143 -> 145, 146 -> 149, 153 -> 155, 156 ->
C 159); result stores, the loop-back branch and the $Le1b label are
C presumably on those missing lines -- confirm against the complete file.
117 and r18, 7, r20 C count for the first loop, 0-7
118 srl r18, 3, r18 C count for unrolled loop
120 beq r20, $L_8_or_more C skip first loop
123 ldq r2,0(r17) C r2 = s1_limb
124 lda r17,8(r17) C s1_ptr++
125 lda r20,-1(r20) C size--
126 mulq r2,r19,r3 C r3 = prod_low
127 umulh r2,r19,r21 C r21 = prod_high
128 beq r20,$Le1b C jump if size was == 1
C OR of the zero register with itself: r0 = 0, the carry-in for the first
C "addq r21,r0,r0" below.
129 bis r31, r31, r0 C FIXME: shouldn't need this
C s1_ptr is advanced immediately after every load in this block, so each
C load uses displacement 0.
130 ldq r2,0(r17) C r2 = s1_limb
131 lda r17,8(r17) C s1_ptr++
132 lda r20,-1(r20) C size--
134 lda r16,8(r16) C res_ptr++
135 beq r20,$Le2b C jump if size was == 2
C Loop body: multiply the limb in r2, fold the previous high limb (r21) and
C carry (r0) into the low product, and fetch the next limb.
137 $Lopb: mulq r2,r19,r3 C r3 = prod_low
138 addq r21,r0,r0 C cy_limb = cy_limb + 'cy'
139 lda r20,-1(r20) C size--
140 umulh r2,r19,r21 C r21 = prod_high
141 ldq r2,0(r17) C r2 = s1_limb
142 lda r17,8(r17) C s1_ptr++
143 addq r3,r0,r3 C r3 = cy_limb + prod_low
C Unsigned "sum < addend" is 1 exactly when the addq above wrapped around.
145 cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
146 lda r16,8(r16) C res_ptr++
C Last limb of this pre-loop.  The running high limb ends up in r21, which
C the code at $L_8_or_more reads in "addq r21, r22, r13".
149 $Le2b: mulq r2,r19,r3 C r3 = prod_low
150 addq r21,r0,r0 C cy_limb = cy_limb + 'cy'
151 umulh r2,r19,r21 C r21 = prod_high
152 addq r3,r0,r3 C r3 = cy_limb + prod_low
153 cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low)
155 lda r16,8(r16) C res_ptr++
156 addq r21,r0,r21 C cy_limb = prod_high + cy
C NOTE(review): this res_ptr++ follows a numbering gap; it presumably belongs
C to the $Le1b target referenced above -- confirm.
159 lda r16,8(r16) C res_ptr++
C Feed-in for the 8-way unrolled code: start the first eight multiplies so
C that later code can combine and store them.
C   r9-r12  = loaded s1 limbs, r19 = multiplier,
C   r22-r28, r1-r8, r21 = products (mulq = low half, umulh = high half).
C Each result limb is the high half of one product plus the low half of the
C next ("sum 2 mul's"); cmpult records the carry out of each such sum.
C The #NN tags are the cycle numbers of the software pipeline described in
C the header comment; U0/U1/L0/L1 presumably name the EV6 integer pipe each
C instruction is scheduled for -- see the 21264 hardware reference.
C NOTE(review): the embedded line numbers skip values here (164 -> 167, 172 ->
C 175, 175 -> 177, 180 -> 182, 191 -> 193); the loads of r9 and the second
C load of r10, among others, are presumably on those missing lines --
C confirm against the complete file.
C r0 = -1 is consumed later by "umulh r0, r18, r18", which decrements r18.
162 lda r0, -1(r31) C put -1 in r0, for tricky loop control
C Bias s1_ptr by -32: displacements 40/48/56 below then address the limbs at
C offsets 8/16/24 from the unbiased pointer.
163 lda r17, -32(r17) C L1 bookkeeping
164 lda r18, -1(r18) C decrement count
167 ldq r10, 40(r17) C L1
168 mulq r9, r19, r22 C U1 #07
169 ldq r11, 48(r17) C L1
170 umulh r9, r19, r23 C U1 #08
171 ldq r12, 56(r17) C L1
172 mulq r10, r19, r24 C U1 #09
175 lda r17, 64(r17) C L1 bookkeeping
177 umulh r10, r19, r25 C U1 #11
178 mulq r11, r19, r26 C U1 #12
179 umulh r11, r19, r27 C U1 #13
180 mulq r12, r19, r28 C U1 #14
182 umulh r12, r19, r1 C U1 #15
183 ldq r11, 16(r17) C L1
184 mulq r9, r19, r2 C U1 #16
185 ldq r12, 24(r17) C L1
186 umulh r9, r19, r3 C U1 #17
C r21 is read here before this block writes it: it is the incoming high limb
C from the code that ran before (the (size mod 8) pre-loop leaves one there).
187 addq r21, r22, r13 C L1 mov
188 mulq r10, r19, r4 C U1 #18
189 addq r23, r24, r22 C L0 sum 2 mul's
190 cmpult r13, r21, r14 C L1 carry from sum
C More than one group of eight left (r18 still > 0 after the decrement)?
191 bgt r18, $L_16_or_more
C Fall-through when the decremented count is not positive: issue the
C remaining multiplies (#02-#06) of this group.
193 cmpult r22, r24, r24 C U0 carry from sum
194 umulh r10, r19, r5 C U1 #02
195 addq r25, r26, r23 C U0 sum 2 mul's
196 mulq r11, r19, r6 C U1 #03
197 cmpult r23, r26, r25 C U0 carry from sum
198 umulh r11, r19, r7 C U1 #04
199 addq r27, r28, r28 C U0 sum 2 mul's
200 mulq r12, r19, r8 C U1 #05
201 cmpult r28, r27, r15 C L0 carry from sum
202 lda r16, 32(r16) C L1 bookkeeping
C r31 reads as zero: the first limb of the cascade has no carry-in.
203 addq r13, r31, r13 C U0 start carry cascade
204 umulh r12, r19, r21 C U1 #06
C First pass of the unrolled code (labels with a "w" suffix).  It follows the
C same multiply/sum/carry schedule as the main loop, with two visible
C differences: r16 advances by 32 rather than 64, and the first limb gets no
C carry-in (r31, which reads as zero, is added and the branch to $fix0w is
C commented out).
C The $retNw labels are presumably the points the $fixNw carry-fixup stubs
C branch back to; the branches into the stubs are not visible here.
C NOTE(review): the embedded line numbers skip values throughout this block
C (e.g. 208 -> 210 -> 213, 226 -> 227 -> 229); loads of r9, the fixup
C branches and the loop-count update are presumably on those missing lines
C -- confirm against the complete file.
208 C ---------------------------------------------------------------
210 cmpult r22, r24, r24 C U0 carry from sum
213 umulh r10, r19, r5 C U1 #02
214 addq r25, r26, r23 C U0 sum 2 mul's
215 mulq r11, r19, r6 C U1 #03
216 cmpult r23, r26, r25 C U0 carry from sum
217 umulh r11, r19, r7 C U1 #04
218 addq r27, r28, r28 C U0 sum 2 mul's
219 mulq r12, r19, r8 C U1 #05
220 cmpult r28, r27, r15 C L0 carry from sum
221 lda r16, 32(r16) C L1 bookkeeping
222 addq r13, r31, r13 C U0 start carry cascade
224 umulh r12, r19, r21 C U1 #06
225 C beq r13, $fix0w C U0
C Carry cascade: each result limb absorbs the carry flag produced by the sum
C of the limb before it (r14, then r24, then r25, ...).
226 $ret0w: addq r22, r14, r26 C L0
227 ldq r10, 40(r17) C L1
229 mulq r9, r19, r22 C U1 #07
231 $ret1w: addq r23, r24, r27 C L0
232 ldq r11, 48(r17) C L1
234 umulh r9, r19, r23 C U1 #08
236 $ret2w: addq r28, r25, r28 C L0
237 ldq r12, 56(r17) C L1
239 mulq r10, r19, r24 C U1 #09
241 $ret3w: addq r1, r2, r20 C L0 sum 2 mul's
244 addq r3, r4, r2 C L0 #10 2 mul's
245 lda r17, 64(r17) C L1 bookkeeping
246 cmpult r20, r1, r29 C U0 carry from sum
248 umulh r10, r19, r25 C U1 #11
249 cmpult r2, r4, r4 C U0 carry from sum
C Store the first four finished limbs; r16 was already advanced by 32, hence
C the negative displacements.
250 stq r13, -32(r16) C L0
251 stq r26, -24(r16) C L1
253 mulq r11, r19, r26 C U1 #12
254 addq r5, r6, r14 C U0 sum 2 mul's
255 stq r27, -16(r16) C L0
256 stq r28, -8(r16) C L1
258 umulh r11, r19, r27 C U1 #13
259 cmpult r14, r6, r3 C U0 carry from sum
260 C could do cross-jumping here:
261 C bra $L_middle_of_unrolled_loop
262 mulq r12, r19, r28 C U1 #14
C The carry out of r5+r6 is added straight into the high product r7; this
C appears to be the one "put-the-carry-into-hi" propagation mentioned in the
C header comment.
263 addq r7, r3, r5 C L0 eat carry
264 addq r20, r15, r20 C U0 carry cascade
267 umulh r12, r19, r1 C U1 #15
269 $ret4w: addq r2, r29, r6 C L0
270 ldq r11, 16(r17) C L1
272 mulq r9, r19, r2 C U1 #16
274 $ret5w: addq r14, r4, r7 C L0
275 ldq r12, 24(r17) C L1
277 umulh r9, r19, r3 C U1 #17
279 $ret6w: addq r5, r8, r8 C L0 sum 2
C Start of the next group's sums: previous group's last high product (r21)
C plus the next low product (r22).
280 addq r21, r22, r13 C L1 sum 2 mul's
282 mulq r10, r19, r4 C U1 #18
283 addq r23, r24, r22 C L0 sum 2 mul's
284 cmpult r13, r21, r14 C L1 carry from sum
C Main 8-way unrolled loop body.  Per pass, the visible code advances both
C r16 (res_ptr) and r17 (s1_ptr) by 64 bytes, issues mulq/umulh pairs for the
C limbs in r9-r12, sums "high of limb i + low of limb i+1", and chains the
C carries through r29, r14, r24, r25, r15, r4.
C The $retN labels are presumably the points the $fixN carry-fixup stubs
C branch back to; the branches into the stubs are not visible here.
C NOTE(review): the embedded line numbers skip values throughout this block
C (e.g. 286 -> 289, 291 -> 294, 295 -> 299); the loop label, loads of r9,
C four of the eight stores, the fixup branches and the loop-back branch are
C presumably on those missing lines -- confirm against the complete file.
286 C ---------------------------------------------------------------
C r0 holds -1 (all ones), so the high half of r0 * r18 is r18 - 1 for any
C nonzero r18: the loop count is decremented with a multiply, presumably to
C keep the other integer pipes free for the adds and compares.
289 umulh r0, r18, r18 C U1 #01 decrement r18!
290 cmpult r8, r5, r29 C L0 carry from last bunch
291 cmpult r22, r24, r24 C U0 carry from sum
294 umulh r10, r19, r5 C U1 #02
295 addq r25, r26, r23 C U0 sum 2 mul's
299 mulq r11, r19, r6 C U1 #03
300 cmpult r23, r26, r25 C U0 carry from sum
304 umulh r11, r19, r7 C U1 #04
C bis r31,r31,r31 only writes the zero register, i.e. it is a no-op; per the
C trailing comments these are scheduling filler.
305 bis r31, r31, r31 C L0 st slosh
306 bis r31, r31, r31 C L1 st slosh
307 addq r27, r28, r28 C U0 sum 2 mul's
309 mulq r12, r19, r8 C U1 #05
310 cmpult r28, r27, r15 C L0 carry from sum
311 lda r16, 64(r16) C L1 bookkeeping
C Unlike the first pass, the cascade starts with a real carry-in: r29, the
C carry out of the previous group's last sum.
312 addq r13, r29, r13 C U0 start carry cascade
314 umulh r12, r19, r21 C U1 #06
316 $ret0: addq r22, r14, r26 C L0
317 ldq r10, 40(r17) C L1
319 mulq r9, r19, r22 C U1 #07
321 $ret1: addq r23, r24, r27 C L0
322 ldq r11, 48(r17) C L1
324 umulh r9, r19, r23 C U1 #08
326 $ret2: addq r28, r25, r28 C L0
327 ldq r12, 56(r17) C L1
329 mulq r10, r19, r24 C U1 #09
331 $ret3: addq r1, r2, r20 C L0 sum 2 mul's
334 addq r3, r4, r2 C L0 #10 2 mul's
335 bis r31, r31, r31 C U1 mul hole
336 lda r17, 64(r17) C L1 bookkeeping
337 cmpult r20, r1, r29 C U0 carry from sum
339 umulh r10, r19, r25 C U1 #11
340 cmpult r2, r4, r4 C U0 carry from sum
C Four finished limbs are stored below r16, which was already advanced.
341 stq r13, -32(r16) C L0
342 stq r26, -24(r16) C L1
344 mulq r11, r19, r26 C U1 #12
345 addq r5, r6, r14 C U0 sum 2 mul's
346 stq r27, -16(r16) C L0
347 stq r28, -8(r16) C L1
349 umulh r11, r19, r27 C U1 #13
350 bis r31, r31, r31 C L0 st slosh
351 bis r31, r31, r31 C L1 st slosh
352 cmpult r14, r6, r3 C U0 carry from sum
C Target named by the commented-out "bra" in the first-pass code.
353 $L_middle_of_unrolled_loop:
354 mulq r12, r19, r28 C U1 #14
C The carry out of r5+r6 is added straight into the high product r7 instead
C of being propagated through a fixup branch.
355 addq r7, r3, r5 C L0 eat carry
356 addq r20, r15, r20 C U0 carry cascade
359 umulh r12, r19, r1 C U1 #15
361 $ret4: addq r2, r29, r6 C L0
362 ldq r11, 16(r17) C L1
364 mulq r9, r19, r2 C U1 #16
366 $ret5: addq r14, r4, r7 C L0
367 ldq r12, 24(r17) C L1
369 umulh r9, r19, r3 C U1 #17
371 $ret6: addq r5, r8, r8 C L0 sum 2
372 addq r21, r22, r13 C L1 sum 2 mul's
374 mulq r10, r19, r4 C U1 #18
375 addq r23, r24, r22 C L0 sum 2 mul's
376 cmpult r13, r21, r14 C L1 carry from sum
C Wind-down (labels with a "c" suffix): no further limbs are loaded in this
C block.  The last multiplies (#02-#06) are issued for limbs already in
C r10-r12; after that only the remaining sums, carry steps and stores run.
C At the end the last high product is in r21 and the last carry flag in r29.
C NOTE(review): the embedded line numbers skip values throughout this block
C (e.g. 378 -> 380, 401 -> 403 -> 405); the block's label, the fixup
C branches, four of the eight stores and whatever forms the return value
C from r21/r29 are presumably on missing or later lines -- confirm against
C the complete file.
378 C ---------------------------------------------------------------
380 cmpult r8, r5, r29 C L0 carry from last bunch
381 cmpult r22, r24, r24 C U0 carry from sum
383 umulh r10, r19, r5 C U1 #02
384 addq r25, r26, r23 C U0 sum 2 mul's
388 mulq r11, r19, r6 C U1 #03
389 cmpult r23, r26, r25 C U0 carry from sum
393 umulh r11, r19, r7 C U1 #04
394 addq r27, r28, r28 C U0 sum 2 mul's
396 mulq r12, r19, r8 C U1 #05
397 cmpult r28, r27, r15 C L0 carry from sum
398 lda r16, 64(r16) C L1 bookkeeping
399 addq r13, r29, r13 C U0 start carry cascade
401 umulh r12, r19, r21 C U1 #06
C Carry cascade for the first half of the final group.
403 $ret0c: addq r22, r14, r26 C L0
405 $ret1c: addq r23, r24, r27 C L0
407 $ret2c: addq r28, r25, r28 C L0
409 $ret3c: addq r1, r2, r20 C L0 sum 2 mul's
410 addq r3, r4, r2 C L0 #10 2 mul's
411 lda r17, 64(r17) C L1 bookkeeping
412 cmpult r20, r1, r29 C U0 carry from sum
413 cmpult r2, r4, r4 C U0 carry from sum
C r16 was already advanced by 64, hence the negative displacements.
414 stq r13, -32(r16) C L0
415 stq r26, -24(r16) C L1
416 addq r5, r6, r14 C U0 sum 2 mul's
417 stq r27, -16(r16) C L0
418 stq r28, -8(r16) C L1
419 cmpult r14, r6, r3 C U0 carry from sum
C The carry out of r5+r6 is added straight into the high product r7.
420 addq r7, r3, r5 C L0 eat carry
421 addq r20, r15, r20 C U0 carry cascade
C Carry cascade for the second half of the final group.
423 $ret4c: addq r2, r29, r6 C L0
425 $ret5c: addq r14, r4, r7 C L0
427 $ret6c: addq r5, r8, r8 C L0 sum 2
428 cmpult r8, r5, r29 C L0 carry from last bunch
C Carry fixup stubs -- the "br fixup code" of the header comment.  One family
C per copy of the unrolled body: "w" = first pass, no suffix = main loop,
C "c" = wind-down.
C Each $fix0..$fix5 stub ORs (bis) the carry flag that fed one limb into the
C carry flag that will feed the next limb, so the carry ripples one limb
C further.  $fix6/$fix6c instead add the flag r4 into r5.
C $fix0w is commented out, matching the commented-out "beq r13, $fix0w" in
C the first-pass code.
C NOTE(review): only one instruction per stub is visible and the embedded
C line numbers advance by two (447, 449, ...); each stub is presumably
C followed by a branch back to the matching $retN label, and presumably
C entered when the preceding sum came out zero -- confirm against the
C complete file.
447 C $fix0w: bis r14, r29, r14 C join carries
449 $fix1w: bis r24, r14, r24 C join carries
451 $fix2w: bis r25, r24, r25 C join carries
453 $fix3w: bis r15, r25, r15 C join carries
455 $fix0: bis r14, r29, r14 C join carries
457 $fix1: bis r24, r14, r24 C join carries
459 $fix2: bis r25, r24, r25 C join carries
461 $fix3: bis r15, r25, r15 C join carries
463 $fix4: bis r29, r15, r29 C join carries
465 $fix5: bis r4, r29, r4 C join carries
467 $fix6: addq r5, r4, r5 C can't carry twice!
469 $fix0c: bis r14, r29, r14 C join carries
471 $fix1c: bis r24, r14, r24 C join carries
473 $fix2c: bis r25, r24, r25 C join carries
475 $fix3c: bis r15, r25, r15 C join carries
477 $fix4c: bis r29, r15, r29 C join carries
479 $fix5c: bis r4, r29, r4 C join carries
481 $fix6c: addq r5, r4, r5 C can't carry twice!