1 dnl Alpha ev6 mpn_sub_n -- Subtract two limb vectors of the same length > 0
2 dnl and store difference in a third limb vector.
4 dnl Copyright 2000, 2003, 2005 Free Software Foundation, Inc.
6 dnl This file is part of the GNU MP Library.
8 dnl The GNU MP Library is free software; you can redistribute it and/or modify
9 dnl it under the terms of the GNU Lesser General Public License as published
10 dnl by the Free Software Foundation; either version 3 of the License, or (at
11 dnl your option) any later version.
13 dnl The GNU MP Library is distributed in the hope that it will be useful, but
14 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16 dnl License for more details.
18 dnl You should have received a copy of the GNU Lesser General Public License
19 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
21 include(`../config.m4')
33 C cy r20 (for mpn_sub_nc)
36 C Finish cleaning up cy registers r22, r23 (make them use cy0/cy1)
37 C Use multi-pronged feed-in.
38 C Perform additional micro-tuning
40 C This code was written in cooperation with ev6 pipeline expert Steve Root.
42 C Pair loads and stores where possible
43 C Store pairs oct-aligned where possible (didn't need it here)
44 C Stores are delayed every third cycle
45 C Loads and stores are delayed by fills
46 C U stays still, put code there where possible (note alternation of U1 and U0)
47 C L moves because of loads and stores
48 C Note dampers in L to limit damage
50 C This odd-looking optimization assumes that our data consists of random
51 C bits, so that an exactly zero result is unlikely. We therefore penalize the
52 C unlikely case to help the common case.
C Register aliases used throughout the code below.
C   u0, u1   limbs of the minuend; filled by ldq from r17
C   v0, v1   limbs of the subtrahend; second operand of every subq/cmpult
C   cy0, cy1 borrow flags produced by cmpult; cy0 (r20) is also the
C            incoming borrow consumed by the first limb
C Note that u0 is r0, the register that receives the final borrow at the
C end of the routine, so r0 only carries the result after the last limb.
54 define(`u0', `r0') define(`u1', `r3')
55 define(`v0', `r1') define(`v1', `r4')
57 define(`cy0', `r20') define(`cy1', `r21')
C NOTE(review): MULFUNC_PROLOGUE is defined outside this file; presumably it
C declares which entry points this file provides -- confirm.
59 MULFUNC_PROLOGUE(mpn_sub_n mpn_sub_nc)
C Entry and feed-in for the unrolled path.
C   The first instruction zeroes cy0 (r31 reads as zero on Alpha); execution
C   that starts at the $entry label instead keeps whatever cy0 already holds.
C   Minuend limbs are read through r17.  r18 is advanced in step with r17
C   (NOTE(review): presumably the subtrahend pointer feeding v0/v1 --
C   confirm).  r19 is the remaining-limb counter.
C Per-limb scheme, used here and in the unrolled loop:
C   d    = u - v             (subq)
C   bout = u < v             (cmpult)
C   res  = d - bin           (subq; bin is the borrow of the previous limb)
C   if d == 0: bout |= bin   (done out of line in the $fix stubs)
C The last step is sufficient provided bin is 0 or 1, because d - bin can
C then wrap only when d is exactly zero.
66 bis r31, r31, cy0 C clear carry in
67 $entry: cmpult r19, 5, r22 C L1 move counter
68 ldq u1, 0(r17) C L0 get next ones
72 ldq u0, 8(r17) C L0 get next ones
C Limb 0: d = r5, bout = r23, bin = cy0, res = r24.
74 subq u1, v1, r5 C U0 sub two data
76 cmpult u1, v1, r23 C U0 did it borrow
77 ldq u1, 16(r17) C L0 get next ones
C Limb 1: d = r8, bout = r22, bin = r23, res = r25.
80 subq u0, v0, r8 C U1 sub two data
81 subq r5, cy0, r24 C U0 borrow in
83 cmpult u0, v0, r22 C U1 did it borrow
84 beq r5, $fix5f C U0 fix exact zero
85 $ret5f: ldq u0, 24(r17) C L0 get next ones
88 subq r8, r23, r25 C U1 borrow from last
C Limb 2: d = r7, bout = r23; its result is formed after this block.
89 subq u1, v1, r7 C U0 sub two data
91 beq r8, $fix6f C U1 fix exact zero
92 $ret6f: cmpult u1, v1, r23 C U0 did it borrow
93 ldq u1, 32(r17) C L0 get next ones
C Five minuend limbs (offsets 0..32) have been fetched; step past them.
96 lda r17, 40(r17) C L0 move pointer
97 lda r18, 40(r18) C L1 move pointer
C 13 = the 5 limbs fetched above + the 8 limbs one pass of $Loop fetches,
C so r19 >= 0 here means a full pass can run (NOTE(review): inferred from
C the load offsets and the unroll factor -- confirm).
100 lda r19, -13(r19) C L1 move counter
101 blt r19, $Lend C U1 loop control
104 C Main loop. 8-way unrolled.
C Each limb follows d = u - v, bout = u < v, res = d - bin.
C Register rotation within one pass:
C   differences  r2, r5, r8, r7, then r2, r5, r8, r7 again
C   borrows out  cy1, cy0, r22, r23 (same order, twice)
C   results      r24 and r25 alternately
C On entry r24/r25 hold two finished results not yet stored, r7 holds a
C difference whose borrow-in is r22, and u0/v0, u1/v1 hold the next limbs.
C A zero difference branches to the $fixN stub; the matching $retN label
C sits directly after that beq.
106 $Loop: subq u0, v0, r2 C U1 sub two data
107 stq r24, 8(r16) C L0 put an answer
108 subq r7, r22, r24 C U0 borrow from last
109 stq r25, 16(r16) C L1 pair
111 cmpult u0, v0, cy1 C U1 did it borrow
112 beq r7, $fix7 C U0 fix exact 0
113 $ret7: ldq u0, 0(r17) C L0 get next ones
C The two bis r31, r31, r31 instructions write only the zero register; per
C their own comments they are scheduling filler.
116 bis r31, r31, r31 C L damp out
117 subq r2, r23, r25 C U1 borrow from last
118 bis r31, r31, r31 C L moves in L !
119 subq u1, v1, r5 C U0 sub two data
121 beq r2, $fix0 C U1 fix exact zero
122 $ret0: cmpult u1, v1, cy0 C U0 did it borrow
123 ldq u1, 8(r17) C L0 get next ones
126 subq u0, v0, r8 C U1 sub two data
127 stq r24, 24(r16) C L0 store pair
128 subq r5, cy1, r24 C U0 borrow from last
129 stq r25, 32(r16) C L1
131 cmpult u0, v0, r22 C U1 did it borrow
132 beq r5, $fix1 C U0 fix exact zero
133 $ret1: ldq u0, 16(r17) C L0 get next ones
C r16 is bumped by 64 bytes mid-pass, so the stores that follow address the
C rest of this pass with offsets relative to the already-advanced pointer.
136 lda r16, 64(r16) C L0 move pointer
137 subq r8, cy0, r25 C U1 borrow from last
138 lda r19, -8(r19) C L1 move counter
139 subq u1, v1, r7 C U0 sub two data
141 beq r8, $fix2 C U1 fix exact zero
142 $ret2: cmpult u1, v1, r23 C U0 did it borrow
143 ldq u1, 24(r17) C L0 get next ones
146 subq u0, v0, r2 C U1 sub two data
147 stq r24, -24(r16) C L0 put an answer
148 subq r7, r22, r24 C U0 borrow from last
149 stq r25, -16(r16) C L1 pair
151 cmpult u0, v0, cy1 C U1 did it borrow
152 beq r7, $fix3 C U0 fix exact 0
153 $ret3: ldq u0, 32(r17) C L0 get next ones
156 bis r31, r31, r31 C L damp out
157 subq r2, r23, r25 C U1 borrow from last
158 bis r31, r31, r31 C L moves in L !
159 subq u1, v1, r5 C U0 sub two data
161 beq r2, $fix4 C U1 fix exact zero
162 $ret4: cmpult u1, v1, cy0 C U0 did it borrow
163 ldq u1, 40(r17) C L0 get next ones
166 subq u0, v0, r8 C U1 sub two data
167 stq r24, -8(r16) C L0 store pair
168 subq r5, cy1, r24 C U0 borrow from last
171 cmpult u0, v0, r22 C U1 did it borrow
172 beq r5, $fix5 C U0 fix exact zero
173 $ret5: ldq u0, 48(r17) C L0 get next ones
C Loads into r31 discard their data; they only touch memory 256 bytes ahead
C of each source pointer.
176 ldl r31, 256(r17) C L0 prefetch
177 subq r8, cy0, r25 C U1 borrow from last
178 ldl r31, 256(r18) C L1 prefetch
179 subq u1, v1, r7 C U0 sub two data
181 beq r8, $fix6 C U1 fix exact zero
182 $ret6: cmpult u1, v1, r23 C U0 did it borrow
183 ldq u1, 56(r17) C L0 get next ones
C Eight minuend limbs (offsets 0..56) were fetched this pass; advance both
C source pointers by 64 bytes and go round again while r19 is non-negative.
186 lda r17, 64(r17) C L0 move pointer
187 bis r31, r31, r31 C U
188 lda r18, 64(r18) C L1 move pointer
189 bge r19, $Loop C U1 loop control
C Wind-down of the unrolled path.  The same state is live here as at the
C top of $Loop: two finished results in r24/r25, a pending difference in r7
C with borrow-in r22 and borrow-out r23, and limbs in u0/v0 and u1/v1.
C Five results are stored (offsets 8..40 from r16): the two already
C finished, then r7 - r22, (u0 - v0) - r23 and (u1 - v1) - cy1.
C The zero-difference fix-ups use the dedicated stubs $fix7c, $fix0c and
C $fix1c.  After $ret1c, cy0 holds the borrow out of the last limb handled
C here.
192 $Lend: subq u0, v0, r2 C U1 sub two data
193 stq r24, 8(r16) C L0 put an answer
194 subq r7, r22, r24 C U0 borrow from last
195 stq r25, 16(r16) C L1 pair
196 cmpult u0, v0, cy1 C U1 did it borrow
197 beq r7, $fix7c C U0 fix exact 0
198 $ret7c: subq r2, r23, r25 C U1 borrow from last
199 subq u1, v1, r5 C U0 sub two data
200 beq r2, $fix0c C U1 fix exact zero
201 $ret0c: cmpult u1, v1, cy0 C U0 did it borrow
202 stq r24, 24(r16) C L0 store pair
203 subq r5, cy1, r24 C U0 borrow from last
204 stq r25, 32(r16) C L1
205 beq r5, $fix1c C U0 fix exact zero
206 $ret1c: stq r24, 40(r16) C L0 put an answer
C Destination pointer moves on by 48 bytes.
207 lda r16, 48(r16) C L0 move pointer
C One-limb-at-a-time steps.  Unlike the unrolled path, the borrow out of
C the second subtraction is computed directly:
C   r2  = u1 - v1            r8  = (u1 < v1)
C   r5  = r2 - cy0           cy0 = (r2 < cy0)
C   cy0 = r8 | cy0           borrow passed to the next limb
C r5 is the result limb for this step.
219 $Loop0: subq u1, v1, r2 C main sub
220 cmpult u1, v1, r8 C compute bw from last sub
223 subq r2, cy0, r5 C borrow sub
227 cmpult r2, cy0, cy0 C compute bw from last sub
228 lda r19, -1(r19) C decr loop cnt
229 bis r8, cy0, cy0 C combine bw from the two subs
C Final limb: same computation, but the combined borrow is written straight
C to r0 instead of cy0.
232 $Lend0: subq u1, v1, r2 C main sub
233 subq r2, cy0, r5 C borrow sub
234 cmpult u1, v1, r8 C compute bw from last sub
235 cmpult r2, cy0, cy0 C compute bw from last sub
237 bis r8, cy0, r0 C combine bw from the two subs
C Alternative exit: lda with a zero displacement copies cy0 into r0.
241 $Lret: lda r0, 0(cy0) C copy borrow into return register
C Out-of-line fix-ups for an exactly zero difference.
C Each stub is the target of the beq that directly precedes the
C like-numbered $ret label.  It ORs the borrow that came into the limb into
C the borrow going out of it: when u - v is zero, cmpult reports no borrow,
C yet subtracting an incoming borrow of 1 from zero wraps and must borrow
C from the next limb.  The destination of every bis below is the borrow-out
C register of the limb being fixed; the other source is its borrow-in.
C NOTE(review): each stub presumably resumes at its $ret label -- confirm.
C
C Feed-in stubs.
244 $fix5f: bis r23, cy0, r23 C bring forward borrow
246 $fix6f: bis r22, r23, r22 C bring forward borrow
C Main-loop stubs.
248 $fix0: bis cy1, r23, cy1 C bring forward borrow
250 $fix1: bis cy0, cy1, cy0 C bring forward borrow
252 $fix2: bis r22, cy0, r22 C bring forward borrow
254 $fix3: bis r23, r22, r23 C bring forward borrow
256 $fix4: bis cy1, r23, cy1 C bring forward borrow
258 $fix5: bis cy1, cy0, cy0 C bring forward borrow
260 $fix6: bis r22, cy0, r22 C bring forward borrow
262 $fix7: bis r23, r22, r23 C bring forward borrow
C Wind-down stubs.
264 $fix0c: bis cy1, r23, cy1 C bring forward borrow
266 $fix1c: bis cy0, cy1, cy0 C bring forward borrow
268 $fix7c: bis r23, r22, r23 C bring forward borrow