2 Copyright (c) 2013, Raspberry Pi Foundation
3 Copyright (c) 2013, RISC OS Open Ltd
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8 * Redistributions of source code must retain the above copyright
9 notice, this list of conditions and the following disclaimer.
10 * Redistributions in binary form must reproduce the above copyright
11 notice, this list of conditions and the following disclaimer in the
12 documentation and/or other materials provided with the distribution.
13 * Neither the name of the copyright holder nor the
14 names of its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
17 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
21 DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*
 * unaligned_words - shift-and-merge sequences used when the source is not
 * word-aligned relative to the destination.
 *
 * Every output word is built from two neighbouring input words: one is
 * shifted by align*8 bits, the other by 32-align*8 bits in the opposite
 * direction, and the two halves are ORed together.
 *
 * Parameters:
 *   backwards - not referenced in the lines shown here; presumably selects
 *               between the sequences that store with stmdb and those that
 *               store with stmia - TODO confirm
 *   align     - multiplied by 8 to form the shift amounts
 *   use_pld   - not referenced in the lines shown here
 *   words     - not referenced in the lines shown here; the invocations in
 *               this file pass 4 or 8
 *   r0-r8     - operands named by the invocation; the invocations in this
 *               file pass the DATn names and LAST.
 *               NOTE(review): inside the body these parameter names are used
 *               without a backslash and coincide with architectural register
 *               names - confirm the assembler mode in use substitutes them.
 */
29 .macro unaligned_words backwards, align, use_pld, words, r0, r1, r2, r3, r4, r5, r6, r7, r8
/* Single result in r1: r0 shifted left by 32-align*8, then r0 shifted right
 * by align*8 ORed in.  NOTE(review): no load or store accompanies this pair
 * in this listing - confirm against the complete file. */
32 mov r1, r0, lsl #32-align*8
34 orr r1, r1, r0, lsr #align*8
/* Mirror image of the pair above: single result in r0, built from r1 with
 * the shift directions swapped. */
37 mov r0, r1, lsr #align*8
39 orr r0, r0, r1, lsl #32-align*8
/* Two results, left-shift-first form:
 *   r2 = (r0 << (32-align*8)) | (r1 >> align*8)
 *   r1 = (r1 << (32-align*8)) | (r0 >> align*8) */
45 mov r2, r0, lsl #32-align*8
47 orr r2, r2, r1, lsr #align*8
48 mov r1, r1, lsl #32-align*8
49 orr r1, r1, r0, lsr #align*8
/* Two results, right-shift-first form:
 *   r0 = (r2 >> align*8) | (r1 << (32-align*8))
 *   r1 = (r1 >> align*8) | (r2 << (32-align*8)) */
53 mov r0, r2, lsr #align*8
55 orr r0, r0, r1, lsl #32-align*8
56 mov r1, r1, lsr #align*8
57 orr r1, r1, r2, lsl #32-align*8
/* Four results in r1-r4, left-shift-first form, working from r4 down to r1;
 * each register is shifted left in place and then takes the top bytes of
 * the next lower register.  The four words are stored below D with
 * writeback. */
63 mov r4, r0, lsl #32-align*8
65 orr r4, r4, r3, lsr #align*8
66 mov r3, r3, lsl #32-align*8
67 orr r3, r3, r2, lsr #align*8
68 mov r2, r2, lsl #32-align*8
69 orr r2, r2, r1, lsr #align*8
70 mov r1, r1, lsl #32-align*8
71 orr r1, r1, r0, lsr #align*8
72 stmdb D!, {r1, r2, r3, r4}
/* Four results in r0-r3, right-shift-first form, working from r0 up to r3;
 * r4 supplies both the first shifted-in value and the last merged-in value.
 * The four words are stored at D, which is advanced by writeback. */
75 mov r0, r4, lsr #align*8
77 orr r0, r0, r1, lsl #32-align*8
78 mov r1, r1, lsr #align*8
79 orr r1, r1, r2, lsl #32-align*8
80 mov r2, r2, lsr #align*8
81 orr r2, r2, r3, lsl #32-align*8
82 mov r3, r3, lsr #align*8
83 orr r3, r3, r4, lsl #32-align*8
84 stmia D!, {r0, r1, r2, r3}
/* Eight results in r1-r8, descending form.  The value r0 holds on entry is
 * consumed into r8 before the second load overwrites r0; the newly loaded r0
 * is only merged into r1 and is not stored, so it is still in r0 on exit
 * (presumably as the carry-in for the next invocation - confirm). */
88 ldmdb S!, {r4, r5, r6, r7}
89 mov r8, r0, lsl #32-align*8
90 ldmdb S!, {r0, r1, r2, r3}
94 orr r8, r8, r7, lsr #align*8
95 mov r7, r7, lsl #32-align*8
96 orr r7, r7, r6, lsr #align*8
97 mov r6, r6, lsl #32-align*8
98 orr r6, r6, r5, lsr #align*8
99 mov r5, r5, lsl #32-align*8
100 orr r5, r5, r4, lsr #align*8
101 mov r4, r4, lsl #32-align*8
102 orr r4, r4, r3, lsr #align*8
103 mov r3, r3, lsl #32-align*8
104 orr r3, r3, r2, lsr #align*8
105 mov r2, r2, lsl #32-align*8
106 orr r2, r2, r1, lsr #align*8
107 mov r1, r1, lsl #32-align*8
108 orr r1, r1, r0, lsr #align*8
/* D descends, so the upper four results are written first. */
109 stmdb D!, {r5, r6, r7, r8}
110 stmdb D!, {r1, r2, r3, r4}
/* Eight results in r0-r7, ascending form.  Here r8 plays the carry role: its
 * entry value is consumed into r0 before the second load overwrites r8, and
 * the newly loaded r8 is merged into r7 but not stored. */
112 ldmib S!, {r1, r2, r3, r4}
113 mov r0, r8, lsr #align*8
114 ldmib S!, {r5, r6, r7, r8}
118 orr r0, r0, r1, lsl #32-align*8
119 mov r1, r1, lsr #align*8
120 orr r1, r1, r2, lsl #32-align*8
121 mov r2, r2, lsr #align*8
122 orr r2, r2, r3, lsl #32-align*8
123 mov r3, r3, lsr #align*8
124 orr r3, r3, r4, lsl #32-align*8
125 mov r4, r4, lsr #align*8
126 orr r4, r4, r5, lsl #32-align*8
127 mov r5, r5, lsr #align*8
128 orr r5, r5, r6, lsl #32-align*8
129 mov r6, r6, lsr #align*8
130 orr r6, r6, r7, lsl #32-align*8
131 mov r7, r7, lsr #align*8
132 orr r7, r7, r8, lsl #32-align*8
133 stmia D!, {r0, r1, r2, r3}
134 stmia D!, {r4, r5, r6, r7}
/*
 * memcpy_leading_15bytes - conditionally copy 1, 2, 4 and 8 bytes, selected
 * by the low four bits of DAT2.
 *
 * DAT2 is shifted left so that two of its bits land in the N and C flags,
 * and the following loads and stores are predicated on those flags (mi = N
 * set, cs = C set).  The call site in this file describes DAT2 as the number
 * of leading bytes until the destination is aligned.
 *
 * Parameters:
 *   backwards - not referenced in the lines shown here; both descending and
 *               ascending transfer forms appear below, presumably selected
 *               by it - TODO confirm
 *   align     - not referenced in the lines shown here; presumably selects
 *               between the ldm and the paired-ldr form of the 8-byte load -
 *               TODO confirm
 *
 * DAT1 is always overwritten (it receives the shifted value).  DAT0 is
 * overwritten when a byte or word is copied, and DAT2 itself is overwritten
 * when the 8-byte case loads into it.  S and D move by writeback.
 */
139 .macro memcpy_leading_15bytes backwards, align
/* lsl #31: N = bit 0 of DAT2 (one byte), C = bit 1 of DAT2 (one halfword). */
140 movs DAT1, DAT2, lsl #31
/* Descending byte and halfword, pre-decrement with writeback. */
143 ldrmib DAT0, [S, #-1]!
144 ldrcsh DAT1, [S, #-2]!
145 strmib DAT0, [D, #-1]!
146 strcsh DAT1, [D, #-2]!
/* lsl #29: N = bit 2 of DAT2 (one word), C = bit 3 of DAT2 (two words). */
153 movs DAT1, DAT2, lsl #29
155 ldrmi DAT0, [S, #-4]!
/* Two ways of loading the same two words below S: one ldm, or two single
 * word loads filling DAT2 then DAT1. */
157 ldmcsdb S!, {DAT1, DAT2}
159 ldrcs DAT2, [S, #-4]!
160 ldrcs DAT1, [S, #-4]!
162 strmi DAT0, [D, #-4]!
163 stmcsdb D!, {DAT1, DAT2}
/* Ascending two-word transfer.  NOTE(review): the ascending byte, halfword
 * and word transfers are not present in this listing. */
167 ldmcsia S!, {DAT1, DAT2}
173 stmcsia D!, {DAT1, DAT2}
/*
 * memcpy_trailing_15bytes - conditionally copy 8, 4, 2 and 1 bytes, in that
 * order, each transfer predicated on the C flag (cs) or the N flag (mi).
 *
 * NOTE(review): the instructions that set the flags are not present in this
 * listing; from the structure, the flags are presumably refreshed between
 * the word-sized and the sub-word transfers - confirm against the complete
 * file.
 *
 * Parameters:
 *   backwards - not referenced in the lines shown here; both descending and
 *               ascending transfer forms appear below
 *   align     - not referenced in the lines shown here
 *
 * DAT0, DAT1 and DAT2 are used as scratch.
 */
177 .macro memcpy_trailing_15bytes backwards, align
/* Descending 8-byte load: either one ldm or two single word loads that fill
 * DAT1 then DAT0. */
181 ldmcsdb S!, {DAT0, DAT1}
183 ldrcs DAT1, [S, #-4]!
184 ldrcs DAT0, [S, #-4]!
/* Descending 4-byte load goes to DAT2 so that it does not disturb the pair
 * loaded above, which is stored first. */
186 ldrmi DAT2, [S, #-4]!
187 stmcsdb D!, {DAT0, DAT1}
188 strmi DAT2, [D, #-4]!
/* Ascending 8-byte transfer. */
191 ldmcsia S!, {DAT0, DAT1}
197 stmcsia D!, {DAT0, DAT1}
/* Descending halfword, then byte.  The byte load and store use an offset of
 * -1 without writeback, unlike every other access in this macro, so S and D
 * are not moved by them. */
202 ldrcsh DAT0, [S, #-2]!
203 ldrmib DAT1, [S, #-1]
204 strcsh DAT0, [D, #-2]!
205 strmib DAT1, [D, #-1]
/*
 * memcpy_long_inner_loop - bulk copy in 32-byte blocks, followed by an
 * optional 16-byte block, the trailing bytes, and the function return.
 *
 * Parameters:
 *   backwards - forwarded to unaligned_words, preload_trailing and
 *               memcpy_trailing_15bytes
 *   align     - byte offset used to step S back before the first load, and
 *               forwarded to unaligned_words and memcpy_trailing_15bytes
 *
 * NOTE(review): loop labels, branches and the conditionals choosing between
 * the ldm/stm path and the unaligned_words path are not present in this
 * listing.
 */
214 .macro memcpy_long_inner_loop backwards, align
/* Step S back by align bytes (pre-indexed, with writeback) and load the word
 * found there.  DAT0 and LAST are the operands that the 8-word sequences in
 * unaligned_words consume as their carry-in (its r0 and r8 respectively). */
217 ldr DAT0, [S, #-align]!
219 ldr LAST, [S, #-align]!
/* 32 bytes per pass, descending: one 8-register load, stored as two 4-word
 * groups with the DAT4..LAST group first. */
225 ldmdb S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST}
227 stmdb D!, {DAT4, DAT5, DAT6, LAST}
228 stmdb D!, {DAT0, DAT1, DAT2, DAT3}
/* 32 bytes per pass, ascending. */
230 ldmia S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST}
232 stmia D!, {DAT0, DAT1, DAT2, DAT3}
233 stmia D!, {DAT4, DAT5, DAT6, LAST}
/* 32 bytes per pass via shift-and-merge, with use_pld = 1. */
236 unaligned_words backwards, align, 1, 8, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7, LAST
240 /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */
241 preload_trailing backwards, S, N, OFF
/* Adds back all but 32 of the (prefetch_distance+2)*32 that memcpy subtracted
 * from N before entering the loop. */
242 add N, N, #(prefetch_distance+2)*32 - 32
/* Same 32-byte copies as above, for the remaining blocks; the shift-and-merge
 * variant is invoked with use_pld = 0 this time. */
246 ldmdb S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST}
247 stmdb D!, {DAT4, DAT5, DAT6, LAST}
248 stmdb D!, {DAT0, DAT1, DAT2, DAT3}
250 ldmia S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST}
251 stmia D!, {DAT0, DAT1, DAT2, DAT3}
252 stmia D!, {DAT4, DAT5, DAT6, LAST}
255 unaligned_words backwards, align, 0, 8, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7, LAST
/* One more 16-byte block, copied only when Z is clear (ne).  The instruction
 * that sets the flag is not present in this listing. */
262 ldmnedb S!, {DAT0, DAT1, DAT2, LAST}
263 stmnedb D!, {DAT0, DAT1, DAT2, LAST}
265 ldmneia S!, {DAT0, DAT1, DAT2, LAST}
266 stmneia D!, {DAT0, DAT1, DAT2, LAST}
270 unaligned_words backwards, align, 0, 4, DAT0, DAT1, DAT2, DAT3, LAST
273 /* Trailing words and bytes */
279 memcpy_trailing_15bytes backwards, align
/* Undo both pushes made by memcpy on the long path; loading pc from the slot
 * where lr was pushed returns to the caller. */
281 pop {DAT3, DAT4, DAT5, DAT6, DAT7}
282 pop {D, DAT1, DAT2, pc}
/*
 * memcpy_medium_inner_loop - copy 16 bytes per pass using four registers,
 * then the trailing bytes, then return.
 *
 * Parameters:
 *   backwards - forwarded to memcpy_trailing_15bytes; presumably also selects
 *               between the descending and ascending pair below - TODO confirm
 *   align     - forwarded to memcpy_trailing_15bytes
 *
 * NOTE(review): the loop label, the counter update and the branch are not
 * present in this listing, and neither is whatever sits between each load
 * and its store.
 */
285 .macro memcpy_medium_inner_loop backwards, align
/* Descending 16-byte block. */
289 ldmdb S!, {DAT0, DAT1, DAT2, LAST}
296 stmdb D!, {DAT0, DAT1, DAT2, LAST}
/* Ascending 16-byte block. */
299 ldmia S!, {DAT0, DAT1, DAT2, LAST}
306 stmia D!, {DAT0, DAT1, DAT2, LAST}
310 /* Trailing words and bytes */
313 memcpy_trailing_15bytes backwards, align
/* Restores the registers pushed at the top of memcpy; loading pc returns. */
315 pop {D, DAT1, DAT2, pc}
/*
 * memcpy_short_inner_loop - copy at most one 16-byte block (only when Z is
 * clear), then the trailing bytes, then return.
 *
 * Parameters:
 *   backwards - forwarded to memcpy_trailing_15bytes; presumably also selects
 *               between the descending and ascending forms below - TODO confirm
 *   align     - forwarded to memcpy_trailing_15bytes; presumably also selects
 *               between the ldm and the four-ldr form of the load - TODO confirm
 *
 * NOTE(review): the instruction that sets the Z flag tested by the ne
 * condition is not present in this listing.
 */
318 .macro memcpy_short_inner_loop backwards, align
/* Descending 16-byte load as a single ldm ... */
322 ldmnedb S!, {DAT0, DAT1, DAT2, LAST}
/* ... or as four single-word loads, filling LAST first so that the registers
 * end up holding the same words as after the ldm above. */
324 ldrne LAST, [S, #-4]!
325 ldrne DAT2, [S, #-4]!
326 ldrne DAT1, [S, #-4]!
327 ldrne DAT0, [S, #-4]!
329 stmnedb D!, {DAT0, DAT1, DAT2, LAST}
/* Ascending 16-byte block. */
332 ldmneia S!, {DAT0, DAT1, DAT2, LAST}
339 stmneia D!, {DAT0, DAT1, DAT2, LAST}
341 memcpy_trailing_15bytes backwards, align
/* Restores the registers pushed at the top of memcpy; loading pc returns. */
343 pop {D, DAT1, DAT2, pc}
/*
 * memcpy - body of the copy routine, parameterised on direction.
 *
 * Parameters:
 *   backwards - forwarded to every helper macro invoked below
 *
 * Per the comments in the body, three size classes are handled: a long case
 * that preloads ahead of the copy, a medium case (label 160) and a short
 * case (label 170).  D, DAT1, DAT2 and lr are pushed on entry, and the long
 * case additionally pushes DAT3-DAT7; each inner-loop macro ends by popping
 * the matching set with pc in place of lr.
 *
 * NOTE(review): the branches that select between the cases and between the
 * per-alignment inner loops are not present in this listing.
 */
346 .macro memcpy backwards
363 push {D, DAT1, DAT2, lr}
367 UNWIND( .save {D, DAT1, DAT2, lr} )
374 /* See if we're guaranteed to have at least one 16-byte aligned 16-byte write */
377 /* To preload ahead as we go, we need at least (prefetch_distance+2) 32-byte blocks */
378 cmp N, #(prefetch_distance+3)*32 - 1
/* Long case only: five more working registers are saved. */
382 push {DAT3, DAT4, DAT5, DAT6, DAT7}
386 UNWIND( .save {D, DAT1, DAT2, lr} )
387 UNWIND( .save {DAT3, DAT4, DAT5, DAT6, DAT7} )
389 /* Adjust N so that the decrement instruction can also test for
390 * inner loop termination. We want it to stop when there are
391 * (prefetch_distance+1) complete blocks to go. */
392 sub N, N, #(prefetch_distance+2)*32
393 preload_leading_step1 backwards, DAT0, S
395 /* Bug in GAS: it accepts, but mis-assembles the instruction
396 * ands DAT2, D, #60, 2
397 * which sets DAT2 to the number of leading bytes until destination is aligned and also clears C (sets borrow)
404 rsb DAT2, DAT2, #16 /* number of leading bytes until destination aligned */
/* NOTE(review): in this listing the comment opened at "Bug in GAS" is only
 * closed by the terminator on the rsb line above, which leaves that rsb
 * inside the comment - confirm against the complete file. */
406 preload_leading_step2 backwards, DAT0, S, DAT2, OFF
407 memcpy_leading_15bytes backwards, 1
408 154: /* Destination now 16-byte aligned; we have at least one prefetch as well as at least one 16-byte output block */
409 /* Prefetch offset is best selected such that it lies in the first 8 of each 32 bytes - but it's just as easy to aim for the first one */
/* Two different adjustments of OFF; presumably one per copy direction -
 * TODO confirm. */
413 sub OFF, OFF, #32*(prefetch_distance+1)
416 rsb OFF, OFF, #32*prefetch_distance
/* lsl #31 puts bit 0 of S in N and bit 1 of S in C, i.e. the source's byte
 * offset within a word; presumably used to pick one of the four inner loops
 * below, which are instantiated for align = 0..3. */
418 movs DAT0, S, lsl #31
422 memcpy_long_inner_loop backwards, 0
423 155: memcpy_long_inner_loop backwards, 1
424 156: memcpy_long_inner_loop backwards, 2
425 157: memcpy_long_inner_loop backwards, 3
430 UNWIND( .save {D, DAT1, DAT2, lr} )
432 160: /* Medium case */
433 preload_all backwards, 0, 0, S, N, DAT2, OFF
434 sub N, N, #16 /* simplifies inner loop termination */
/* NOTE(review): `align` is not a parameter of this macro, whereas the long
 * case above passes the literal 1 here - confirm what this argument resolves
 * to when assembled. */
443 memcpy_leading_15bytes backwards, align
444 164: /* Destination now 16-byte aligned; we have at least one 16-byte output block */
/* Two instantiations of the medium loop, with align = 0 and align = 1. */
447 memcpy_medium_inner_loop backwards, 0
448 140: memcpy_medium_inner_loop backwards, 1
450 170: /* Short case, less than 31 bytes, so no guarantee of at least one 16-byte block */
453 preload_all backwards, 1, 0, S, N, DAT2, LAST
467 174: /* Destination now 4-byte aligned; we have 0 or more output bytes to go */
/* Two instantiations of the short loop, with align = 0 and align = 1.  The
 * numeric label 140 is reused from the medium case. */
470 memcpy_short_inner_loop backwards, 0
471 140: memcpy_short_inner_loop backwards, 1