1 dnl Alpha ev6 mpn_add_n -- Add two limb vectors of the same length > 0 and
2 dnl store sum in a third limb vector.
4 dnl Copyright 2000, 2003, 2005 Free Software Foundation, Inc.
6 dnl This file is part of the GNU MP Library.
8 dnl The GNU MP Library is free software; you can redistribute it and/or modify
9 dnl it under the terms of the GNU Lesser General Public License as published
10 dnl by the Free Software Foundation; either version 3 of the License, or (at
11 dnl your option) any later version.
13 dnl The GNU MP Library is distributed in the hope that it will be useful, but
14 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16 dnl License for more details.
18 dnl You should have received a copy of the GNU Lesser General Public License
19 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
21 include(`../config.m4')
33 C cy r20 (for mpn_add_nc)
36 C Finish cleaning up cy registers r22, r23 (make them use cy0/cy1)
37 C Use multi-pronged feed-in.
38 C Perform additional micro-tuning
40 C This code was written in cooperation with ev6 pipeline expert Steve Root.
42 C Pair loads and stores where possible
43 C Store pairs oct-aligned where possible (didn't need it here)
44 C Stores are delayed every third cycle
45 C Loads and stores are delayed by fills
46 C U stays still, put code there where possible (note alternation of U1 and U0)
47 C L moves because of loads and stores
48 C Note dampers in L to limit damage
50 C This odd-looking optimization expects that we are having random bits in our
51 C data, so that a pure zero result is unlikely. So we penalize the unlikely
52 C case to help the common case.
C Symbolic register names.  u0/u1 receive the limbs loaded through r17,
C v0/v1 are the addends they are summed with, and cy0/cy1 hold two of the
C four rotating carry flags (the other two are the bare registers r22, r23).
54 define(`u0', `r0') define(`u1', `r3')
55 define(`v0', `r1') define(`v1', `r4')
57 define(`cy0', `r20') define(`cy1', `r21')
C Entry and feed-in.
C The carry-in lives in cy0 (r20); it is cleared on the first instruction
C below.  Presumably mpn_add_nc joins at $entry with its carry argument
C already in r20 -- TODO confirm against the prologue lines.
C The feed-in starts the first sums so that $Loop (or $Lend) is entered with
C results pending in r5, r8, r7, their carries in r22/r23, and the next u0/u1
C already loaded.
C NOTE(review): the loads into v0/v1 and the branch that consumes the
C r19 < 5 test are not visible in this view; presumably v0/v1 come from r18.
59 MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc)
66 bis r31, r31, cy0 C clear carry in (cy0 = 0)
67 $entry: cmpult r19, 5, r22 C L1 r22 = 1 if count r19 < 5
68 ldq u1, 0(r17) C L0 get next ones
72 ldq u0, 8(r17) C L0 get next ones
74 addq u1, v1, r5 C U0 add two data
76 cmpult r5, v1, r23 C U0 did it carry (raw sum < addend)
77 ldq u1, 16(r17) C L0 get next ones
80 addq u0, v0, r8 C U1 add two data
81 addq r5, cy0, r5 C U0 carry in
83 cmpult r8, v0, r22 C U1 did it carry (r22 is reused here)
84 beq r5, $fix5f C U0 fix exact zero
85 $ret5f: ldq u0, 24(r17) C L0 get next ones
88 addq r8, r23, r8 C U1 carry from last
89 addq u1, v1, r7 C U0 add two data
91 beq r8, $fix6f C U1 fix exact zero
92 $ret6f: cmpult r7, v1, r23 C U0 did it carry
93 ldq u1, 32(r17) C L0 get next ones
96 lda r17, 40(r17) C L0 move pointer past the 5 limbs loaded
97 lda r18, 40(r18) C L1 move pointer by the same 40 bytes
100 lda r19, -13(r19) C L1 move counter (presumably 5 fed in + 8 for a loop pass)
101 blt r19, $Lend C U1 loop control: skip $Loop when count went negative
104 C Main loop. 8-way unrolled.
C Each pass subtracts 8 from the count in r19 and advances r16, r17 and r18
C by 64 bytes.
C The sums rotate through r2, r5, r8, r7 and their carry-outs through cy1,
C cy0, r22, r23.  Each carry-out is a cmpult of the raw sum against one
C addend, taken before the previous carry is added in.  Adding that carry
C (0 or 1) can itself wrap only when the final sum is exactly zero, so a beq
C sends that rare case to an out-of-line $fixN stub; the $retN label marks
C where the straight-line code continues.
C Stores trail the adds.  The -24/-16/-8 offsets are relative to r16 after
C its mid-loop bump by 64.  Loads into r31 are prefetches, and
C bis r31,r31,r31 is a no-op used for slotting.
C NOTE(review): no load of v0/v1 and no store of r2 is visible in this view.
106 $Loop: addq u0, v0, r2 C U1 add two data
107 addq r7, r22, r7 C U0 add in carry
108 stq r5, 8(r16) C L0 put an answer
109 stq r8, 16(r16) C L1 pair
111 cmpult r2, v0, cy1 C U1 did it carry
112 beq r7, $fix7 C U0 fix exact 0
113 $ret7: ldq u0, 0(r17) C L0 get next ones
116 bis r31, r31, r31 C L damp out
117 addq r2, r23, r2 C U1 carry from last
118 bis r31, r31, r31 C L moves in L !
119 addq u1, v1, r5 C U0 add two data
121 beq r2, $fix0 C U1 fix exact zero
122 $ret0: cmpult r5, v1, cy0 C U0 did it carry
123 ldq u1, 8(r17) C L0 get next ones
126 addq u0, v0, r8 C U1 add two data
127 addq r5, cy1, r5 C U0 carry from last
128 stq r7, 24(r16) C L0 store pair
131 cmpult r8, v0, r22 C U1 did it carry
132 beq r5, $fix1 C U0 fix exact zero
133 $ret1: ldq u0, 16(r17) C L0 get next ones
136 lda r16, 64(r16) C L0 move pointer (later stores use negative offsets)
137 addq r8, cy0, r8 C U1 carry from last
138 lda r19, -8(r19) C L1 move counter (8 limbs per pass)
139 addq u1, v1, r7 C U0 add two data
141 beq r8, $fix2 C U1 fix exact zero
142 $ret2: cmpult r7, v1, r23 C U0 did it carry
143 ldq u1, 24(r17) C L0 get next ones
146 addq u0, v0, r2 C U1 add two data
147 addq r7, r22, r7 C U0 add in carry
148 stq r5, -24(r16) C L0 put an answer
149 stq r8, -16(r16) C L1 pair
151 cmpult r2, v0, cy1 C U1 did it carry
152 beq r7, $fix3 C U0 fix exact 0
153 $ret3: ldq u0, 32(r17) C L0 get next ones
156 bis r31, r31, r31 C L damp out
157 addq r2, r23, r2 C U1 carry from last
158 bis r31, r31, r31 C L moves in L !
159 addq u1, v1, r5 C U0 add two data
161 beq r2, $fix4 C U1 fix exact zero
162 $ret4: cmpult r5, v1, cy0 C U0 did it carry
163 ldq u1, 40(r17) C L0 get next ones
166 addq u0, v0, r8 C U1 add two data
167 addq r5, cy1, r5 C U0 carry from last
168 stq r7, -8(r16) C L0 store pair
171 cmpult r8, v0, r22 C U1 did it carry
172 beq r5, $fix5 C U0 fix exact zero
173 $ret5: ldq u0, 48(r17) C L0 get next ones
176 ldl r31, 256(r17) C L0 prefetch (result discarded in r31)
177 addq r8, cy0, r8 C U1 carry from last
178 ldl r31, 256(r18) C L1 prefetch (result discarded in r31)
179 addq u1, v1, r7 C U0 add two data
181 beq r8, $fix6 C U1 fix exact zero
182 $ret6: cmpult r7, v1, r23 C U0 did it carry
183 ldq u1, 56(r17) C L0 get next ones
186 lda r17, 64(r17) C L0 move pointer
187 bis r31, r31, r31 C U
188 lda r18, 64(r18) C L1 move pointer
189 bge r19, $Loop C U1 loop control: repeat while count >= 0
C Wind-down.  Reached from the feed-in when the count is already negative,
C or by falling out of $Loop.  It finishes the sums that are already in
C flight using the same add / carry / fix-up pattern as the loop, but issues
C no further loads.  r16 is advanced by 48 bytes at the end.
C NOTE(review): the store of r2 and whatever follows the pointer bump are
C not visible in this view.
192 $Lend: addq u0, v0, r2 C U1 add two data
193 addq r7, r22, r7 C U0 add in carry
194 stq r5, 8(r16) C L0 put an answer
195 stq r8, 16(r16) C L1 pair
196 cmpult r2, v0, cy1 C U1 did it carry
197 beq r7, $fix7c C U0 fix exact 0
198 $ret7c: addq r2, r23, r2 C U1 carry from last
199 addq u1, v1, r5 C U0 add two data
200 beq r2, $fix0c C U1 fix exact zero
201 $ret0c: cmpult r5, v1, cy0 C U0 did it carry
202 addq r5, cy1, r5 C U0 carry from last
203 stq r7, 24(r16) C L0 store pair
205 beq r5, $fix1c C U0 fix exact zero
206 $ret1c: stq r5, 40(r16) C L0 put an answer
207 lda r16, 48(r16) C L0 move pointer
C Simple path, one limb per pass, with cy0 as the running carry.
C Unlike the unrolled loop, the carry is computed in full here: r8 is the
C carry of u1 + v1, cy0 becomes the carry of adding the old cy0 to that sum,
C and the two are ORed together (at most one of them can be set).
C $Lend0 repeats the computation for the last limb and leaves the combined
C carry in r0.  $Lret copies the carry kept in cy0 into r0.
C NOTE(review): the loads, stores, pointer updates and branches belonging to
C this part are not visible in this view.
219 $Loop0: addq u1, v1, r2 C main add
220 cmpult r2, v1, r8 C compute cy from main add
223 addq r2, cy0, r5 C carry add
227 cmpult r5, r2, cy0 C compute cy from carry add
228 lda r19, -1(r19) C decr loop cnt
229 bis r8, cy0, cy0 C combine cy from the two adds
232 $Lend0: addq u1, v1, r2 C main add
233 addq r2, cy0, r5 C carry add
234 cmpult r2, v1, r8 C compute cy from main add
235 cmpult r5, r2, cy0 C compute cy from carry add
237 bis r8, cy0, r0 C combine cy from the two adds into r0
241 $Lret: lda r0, 0(cy0) C copy carry into return register
C Out-of-line fix-ups for the beq tests in the feed-in, $Loop and $Lend.
C Each stub is reached when a sum came out exactly zero after its carry-in
C was added.  It ORs that carry-in into the carry-out that was computed from
C the raw sum, so a wrap caused by the carry-in itself is not lost.  If the
C carry-in was zero the OR changes nothing.
C NOTE(review): the branch back to the matching $ret label after each stub
C is not visible in this view.
244 $fix5f: bis r23, cy0, r23 C bring forward carry: r23 |= cy0
246 $fix6f: bis r22, r23, r22 C bring forward carry: r22 |= r23
248 $fix0: bis cy1, r23, cy1 C bring forward carry: cy1 |= r23
250 $fix1: bis cy0, cy1, cy0 C bring forward carry: cy0 |= cy1
252 $fix2: bis r22, cy0, r22 C bring forward carry: r22 |= cy0
254 $fix3: bis r23, r22, r23 C bring forward carry: r23 |= r22
256 $fix4: bis cy1, r23, cy1 C bring forward carry: cy1 |= r23
258 $fix5: bis cy1, cy0, cy0 C bring forward carry: cy0 |= cy1
260 $fix6: bis r22, cy0, r22 C bring forward carry: r22 |= cy0
262 $fix7: bis r23, r22, r23 C bring forward carry: r23 |= r22
264 $fix0c: bis cy1, r23, cy1 C bring forward carry: cy1 |= r23
266 $fix1c: bis cy0, cy1, cy0 C bring forward carry: cy0 |= cy1
268 $fix7c: bis r23, r22, r23 C bring forward carry: r23 |= r22