3 * MIPS Technologies, Inc., California.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
14 * contributors may be used to endorse or promote products derived from
15 * this software without specific prior written permission.
17 * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 #include "pixman-mips-dspr2-asm.h"
33 * This routine could be optimized for MIPS64. The current code only
34 * uses MIPS32 instructions.
/* LWHI/SWHI and LWLO/SWLO name the lwl/lwr and swl/swr variant that handles the high or the low part of a word for each byte order. */
38 # define LWHI lwl /* high part is left in big-endian */
39 # define SWHI swl /* high part is left in big-endian */
40 # define LWLO lwr /* low part is right in big-endian */
41 # define SWLO swr /* low part is right in big-endian */
43 # define LWHI lwr /* high part is right in little-endian */
44 # define SWHI swr /* high part is right in little-endian */
45 # define LWLO lwl /* low part is left in little-endian */
46 # define SWLO swl /* low part is left in little-endian */
/*
 * pixman_mips_fast_memcpy: copy a2 bytes from src (a1) to dst (a0).
 *
 * Register roles, as used by the annotated instructions below:
 *   a0 = dst pointer, advanced as data is stored
 *   a1 = src pointer, advanced in step with a0
 *   a2 = number of bytes still to copy
 *   a3 = alignment byte count, then the dst address at which a loop stops
 *   t0 = dst "past the end" address, t9 = last dst address safe for "pref 30"
 *   t8 = remainder byte count after the current chunk size
 *   v0 = return value (the dst pointer as passed in)
 *
 * Two parallel paths exist: the first handles a zero a0/a1 word-displacement,
 * the second ($unaligned, labels prefixed "ua_") handles a nonzero one.
 * Each path works down through 64-byte chunks, a 32-byte chunk, single words
 * and finally the trailing bytes.
 */
49 LEAF_MIPS32R2(pixman_mips_fast_memcpy)
53 move v0, a0 /* return value: memcpy returns the dst pointer */
55 /* Test if the src and dst are word-aligned, or can be made word-aligned */
57 andi t8, t8, 0x3 /* keep the low 2 bits: t8 is the a0/a1 word-displacement */
59 bne t8, zero, $unaligned /* nonzero displacement: take the unaligned path */
62 andi a3, a3, 0x3 /* we need to copy a3 bytes to make a0/a1 aligned */
63 beq a3, zero, $chk16w /* when a3=0 then the dst (a0) is word-aligned */
64 subu a2, a2, a3 /* now a2 is the remaining byte count (no change when a3=0) */
71 /* Now the dst/src are mutually word-aligned with word-aligned addresses */
72 $chk16w: andi t8, a2, 0x3f /* any whole 64-byte chunks? */
73 /* t8 is the byte count after 64-byte chunks */
75 beq a2, t8, $chk8w /* if a2==t8, no 64-byte chunks */
76 /* There will be at most 1 32-byte chunk after it */
77 subu a3, a2, t8 /* subtract the remainder from a2 */
78 /* Here a3 counts bytes in 16w chunks */
79 addu a3, a0, a3 /* Now a3 is the final dst after 64-byte chunks */
81 addu t0, a0, a2 /* t0 is the "past the end" address */
84 * When in the loop we exercise "pref 30, x(a0)", the a0+x should not be past
86 * This means: for x=128 the last "safe" a0 address is "t0-160"
87 * Alternatively, for x=64 the last "safe" a0 address is "t0-96"
88 * In the current version we use "pref 30, 128(a0)", so "t0-160" is the limit
90 subu t9, t0, 160 /* t9 is the "last safe pref 30, 128(a0)" address */
92 pref 0, 0(a1) /* bring the first line of src, addr 0 */
93 pref 0, 32(a1) /* bring the second line of src, addr 32 */
94 pref 0, 64(a1) /* bring the third line of src, addr 64 */
95 pref 30, 32(a0) /* safe, as we have at least 64 bytes ahead */
96 /* If a0 > t9, don't use "pref 30" at all */
98 bgtz v1, $loop16w /* skip "pref 30, 64(a0)" for too short arrays */
100 /* otherwise, start with using pref30 */
105 bgtz v1, $skip_pref30_96 /* skip "pref 30, 96(a0)" */
107 pref 30, 96(a0) /* continue setting up the dest, addr 96 */
115 pref 0, 128(a1) /* bring the next lines of src, addr 128 */
127 bgtz v1, $skip_pref30_128 /* skip "pref 30, 128(a0)" */
129 pref 30, 128(a0) /* continue setting up the dest, addr 128 */
137 pref 0, 160(a1) /* bring the next lines of src, addr 160 */
148 addiu a0, a0, 64 /* advance dest by one 64-byte chunk */
151 addiu a1, a1, 64 /* advance src by one 64-byte chunk */
154 /* Here we have src and dest word-aligned but less than 64 bytes to go */
158 andi t8, a2, 0x1f /* is there a 32-byte chunk? */
159 /* t8 is the remainder count past 32 bytes */
160 beq a2, t8, $chk1w /* when a2=t8, no 32-byte chunk */
184 andi a2, t8, 0x3 /* now a2 is the remainder past 1w chunks */
186 subu a3, t8, a2 /* a3 is count of bytes in 1w chunks */
187 addu a3, a0, a3 /* now a3 is the dst address past the 1w chunks */
189 /* copying in words (4-byte chunks) */
191 lw t3, 0(a1) /* the first t3 may be equal t0 ... optimize? */
194 bne a0, a3, $wordCopy_loop /* repeat until a0 reaches the end of the 1w chunks */
197 /* For the last (<8) bytes */
200 addu a3, a0, a2 /* a3 = a0 + a2, the dst address at which the loop stops */
205 bne a0, a3, $last8loop /* repeat until a0 reaches a3 */
216 /* got here with a3="negu a0" */
217 andi a3, a3, 0x3 /* test if the a0 is word aligned */
219 subu a2, a2, a3 /* bytes left after initial a3 bytes */
223 addu a1, a1, a3 /* a3 may be here 1, 2 or 3 */
225 addu a0, a0, a3 /* below the dst will be word aligned (NOTE1) */
227 $ua_chk16w: andi t8, a2, 0x3f /* any whole 64-byte chunks? */
228 /* t8 is the byte count after 64-byte chunks */
229 beq a2, t8, $ua_chk8w /* if a2==t8, no 64-byte chunks */
230 /* There will be at most 1 32-byte chunk after it */
231 subu a3, a2, t8 /* subtract the remainder from a2 */
232 /* Here a3 counts bytes in 16w chunks */
233 addu a3, a0, a3 /* Now a3 is the final dst after 64-byte chunks */
235 addu t0, a0, a2 /* t0 is the "past the end" address */
237 subu t9, t0, 160 /* t9 is the "last safe pref 30, 128(a0)" address */
239 pref 0, 0(a1) /* bring the first line of src, addr 0 */
240 pref 0, 32(a1) /* bring the second line of src, addr 32 */
241 pref 0, 64(a1) /* bring the third line of src, addr 64 */
242 pref 30, 32(a0) /* safe, as we have at least 64 bytes ahead */
243 /* If a0 > t9, don't use "pref 30" at all */
245 bgtz v1, $ua_loop16w /* skip "pref 30, 64(a0)" for too short arrays */
247 /* otherwise, start with using pref30 */
254 bgtz v1, $ua_skip_pref30_96 /* skip "pref 30, 96(a0)" */
256 pref 30, 96(a0) /* continue setting up the dest, addr 96 */
270 pref 0, 128(a1) /* bring the next lines of src, addr 128 */
284 bgtz v1, $ua_skip_pref30_128 /* skip "pref 30, 128(a0)" */
286 pref 30, 128(a0) /* continue setting up the dest, addr 128 */
300 pref 0, 160(a1) /* bring the next lines of src, addr 160 */
311 addiu a0, a0, 64 /* advance dest by one 64-byte chunk */
313 bne a0, a3, $ua_loop16w /* repeat until a0 reaches the end of the 64-byte chunks */
314 addiu a1, a1, 64 /* advance src by 64; NOTE(review): presumably the branch delay slot - confirm */
317 /* Here the dst is word-aligned (see NOTE1) with less than 64 bytes to go; the src is presumably not word-aligned on this path */
321 andi t8, a2, 0x1f /* is there a 32-byte chunk? */
322 /* t8 is the remainder count past 32 bytes */
323 beq a2, t8, $ua_chk1w /* when a2=t8, no 32-byte chunk */
354 andi a2, t8, 0x3 /* now a2 is the remainder past 1w chunks */
355 beq a2, t8, $ua_smallCopy /* when a2=t8, no whole word is left */
356 subu a3, t8, a2 /* a3 is count of bytes in 1w chunks */
357 addu a3, a0, a3 /* now a3 is the dst address past the 1w chunks */
359 /* copying in words (4-byte chunks) */
364 addiu a0, a0, 4 /* note: dst=a0 is word aligned here, see NOTE1 */
365 bne a0, a3, $ua_wordCopy_loop /* repeat until a0 reaches the end of the 1w chunks */
368 /* Now less than 4 bytes (value in a2) left to copy */
371 addu a3, a0, a2 /* a3 = a0 + a2, the dst address at which the loop stops */
376 bne a0, a3, $ua_smallCopy_loop /* repeat until a0 reaches a3 */
382 END(pixman_mips_fast_memcpy)