pixman/pixman-mips-memcpy-asm.S

   1 /*
   2  * Copyright (c) 2012
   3  *      MIPS Technologies, Inc., California.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  * 2. Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in the
  12  *    documentation and/or other materials provided with the distribution.
  13  * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
  14  *    contributors may be used to endorse or promote products derived from
  15  *    this software without specific prior written permission.
  16  *
  17  * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
  18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
  21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  27  * SUCH DAMAGE.
  28  */
  29
  30 #include "pixman-mips-dspr2-asm.h"
  31
  32 /*
  33  * This routine could be optimized for MIPS64. The current code only
  34  * uses MIPS32 instructions.
  35  */
  36
  37 #ifdef EB
  38 #  define LWHI  lwl             /* high part is left in big-endian */
  39 #  define SWHI  swl             /* high part is left in big-endian */
  40 #  define LWLO  lwr             /* low part is right in big-endian */
  41 #  define SWLO  swr             /* low part is right in big-endian */
  42 #else
  43 #  define LWHI  lwr             /* high part is right in little-endian */
  44 #  define SWHI  swr             /* high part is right in little-endian */
  45 #  define LWLO  lwl             /* low part is left in little-endian */
  46 #  define SWLO  swl             /* low part is left in little-endian */
  47 #endif
     /*
      * The block above gives endian-neutral names to the partial-word
      * instructions: when EB is defined the HI names map to lwl/swl and the LO
      * names to lwr/swr, otherwise the mapping is swapped.  The copy routine in
      * this file applies LWHI/SWHI at offset 0 of a possibly unaligned word and
      * LWLO at offset 3; SWLO is defined but not referenced in the code shown.
      */
  48
  49 LEAF_MIPS32R2(pixman_mips_fast_memcpy)
  50
     /*
      * pixman_mips_fast_memcpy: copies a2 bytes from a1 to a0 in ascending
      * address order.
      *
      * In:     a0 = dst, a1 = src, a2 = byte count
      * Out:    v0 = the dst pointer exactly as passed in
      * Writes: AT, v0, v1, a0-a3, t0-t9.  sp is never referenced.
      *
      * Flow:
      *   - count below 8: byte loop at $last8 only.
      *   - (a0 ^ a1) & 3 == 0: copy 0-3 head bytes so that both pointers are
      *     word-aligned, then 64-byte blocks ($loop16w), at most one 32-byte
      *     block ($chk8w), single words ($wordCopy_loop), trailing bytes ($last8).
      *   - otherwise ($unaligned): the same staging, but only dst becomes
      *     word-aligned; each source word is assembled with an LWHI/LWLO pair
      *     and written with a plain sw.
      *
      * NOTE(review): the instruction after every branch or jump is written as
      * its delay slot, which presumes LEAF_MIPS32R2 selects noreorder mode.  The
      * macro comes from the included header and is not visible here - confirm.
      * NOTE(review): slti and blez treat the count as signed, so a count with
      * bit 31 set goes to $last8 and then to leave without copying anything.
      */
  51         slti    AT, a2, 8       /* AT = 1 when the byte count is below 8 */
  52         bne     AT, zero, $last8        /* tiny copy: byte loop only */
  53         move    v0, a0  /* delay slot: memcpy returns the dst pointer */
  54
  55 /* Test if the src and dst are word-aligned, or can be made word-aligned */
  56         xor     t8, a1, a0
  57         andi    t8, t8, 0x3             /* t8 is a0/a1 word-displacement */
  58
  59         bne     t8, zero, $unaligned
  60         negu    a3, a0          /* delay slot: a3 = -a0, consumed on both paths */
  61
  62         andi    a3, a3, 0x3     /* we need to copy a3 bytes to make a0/a1 aligned */
  63         beq     a3, zero, $chk16w       /* when a3=0 then the dst (a0) is word-aligned */
  64         subu    a2, a2, a3      /* now a2 is the remaining bytes count */
  65
     /*
      * Head copy: LWHI/SWHI transfer only the bytes between the given address
      * and the next word boundary, which is exactly the a3 bytes counted above.
      */
  66         LWHI    t8, 0(a1)
  67         addu    a1, a1, a3
  68         SWHI    t8, 0(a0)
  69         addu    a0, a0, a3
  70
  71 /* Now the dst/src are mutually word-aligned with word-aligned addresses */
  72 $chk16w:        andi    t8, a2, 0x3f    /* any whole 64-byte chunks? */
  73                                 /* t8 is the byte count after 64-byte chunks */
  74
  75         beq     a2, t8, $chk8w  /* if a2==t8, no 64-byte chunks */
  76                                 /* There will be at most 1 32-byte chunk after it */
  77         subu    a3, a2, t8      /* delay slot: subtract from a2 the remainder */
  78                                 /* Here a3 counts bytes in 16w chunks */
  79         addu    a3, a0, a3      /* Now a3 is the final dst after 64-byte chunks */
  80
  81         addu    t0, a0, a2      /* t0 is the "past the end" address */
  82
  83 /*
  84  * When in the loop we exercise "pref 30, x(a0)", the a0+x should not be past
  85  * the "t0-32" address
  86  * This means: for x=128 the last "safe" a0 address is "t0-160"
  87  * Alternatively, for x=64 the last "safe" a0 address is "t0-96"
  88  * In the current version we use "pref 30, 128(a0)", so "t0-160" is the limit
  89  */
     /*
      * NOTE(review): the offsets above are worked out in 32-byte steps, so the
      * bound presumably relies on pref 30 touching no more than one 32-byte
      * line - confirm for targets with larger cache lines.
      */
  90         subu    t9, t0, 160     /* t9 is the "last safe pref 30, 128(a0)" address */
  91
  92         pref    0, 0(a1)                /* bring the first line of src, addr 0 */
  93         pref    0, 32(a1)       /* bring the second line of src, addr 32 */
  94         pref    0, 64(a1)       /* bring the third line of src, addr 64 */
  95         pref    30, 32(a0)      /* safe, as we have at least 64 bytes ahead */
  96 /* In case the a0 > t9 don't use "pref 30" at all */
  97         sgtu    v1, a0, t9      /* v1 = 1 when a0 is already past the safe limit */
  98         bgtz    v1, $loop16w    /* skip "pref 30, 64(a0)" for too short arrays */
  99         nop
 100 /* otherwise, start with using pref30 */
 101         pref    30, 64(a0)
     /*
      * Main aligned loop: 64 bytes per pass as two 32-byte halves, each half
      * loaded into t0-t7 and then stored.  v1 (refreshed at the loop bottom)
      * gates the two pref 30 instructions.
      */
 102 $loop16w:
 103         pref    0, 96(a1)
 104         lw      t0, 0(a1)
 105         bgtz    v1, $skip_pref30_96     /* skip "pref 30, 96(a0)" */
 106         lw      t1, 4(a1)       /* delay slot: executes on both paths */
 107         pref    30, 96(a0)   /* continue setting up the dest, addr 96 */
 108 $skip_pref30_96:
 109         lw      t2, 8(a1)
 110         lw      t3, 12(a1)
 111         lw      t4, 16(a1)
 112         lw      t5, 20(a1)
 113         lw      t6, 24(a1)
 114         lw      t7, 28(a1)
 115         pref    0, 128(a1)    /* bring the next lines of src, addr 128 */
 116
 117         sw      t0, 0(a0)
 118         sw      t1, 4(a0)
 119         sw      t2, 8(a0)
 120         sw      t3, 12(a0)
 121         sw      t4, 16(a0)
 122         sw      t5, 20(a0)
 123         sw      t6, 24(a0)
 124         sw      t7, 28(a0)
 125
 126         lw      t0, 32(a1)
 127         bgtz    v1, $skip_pref30_128    /* skip "pref 30, 128(a0)" */
 128         lw      t1, 36(a1)      /* delay slot: executes on both paths */
 129         pref    30, 128(a0)   /* continue setting up the dest, addr 128 */
 130 $skip_pref30_128:
 131         lw      t2, 40(a1)
 132         lw      t3, 44(a1)
 133         lw      t4, 48(a1)
 134         lw      t5, 52(a1)
 135         lw      t6, 56(a1)
 136         lw      t7, 60(a1)
 137         pref    0, 160(a1)    /* bring the next lines of src, addr 160 */
 138
 139         sw      t0, 32(a0)
 140         sw      t1, 36(a0)
 141         sw      t2, 40(a0)
 142         sw      t3, 44(a0)
 143         sw      t4, 48(a0)
 144         sw      t5, 52(a0)
 145         sw      t6, 56(a0)
 146         sw      t7, 60(a0)
 147
 148         addiu   a0, a0, 64      /* adding 64 to dest */
 149         sgtu    v1, a0, t9      /* refresh the pref 30 guard for the next pass */
 150         bne     a0, a3, $loop16w
 151         addiu   a1, a1, 64      /* delay slot: adding 64 to src */
 152         move    a2, t8          /* a2 = bytes left after the 64-byte chunks */
 153
 154 /* Here we have src and dest word-aligned but less than 64-bytes to go */
 155
 156 $chk8w:
 157         pref 0, 0x0(a1)
 158         andi    t8, a2, 0x1f    /* is there a 32-byte chunk? */
 159                                 /* the t8 is the remainder count past 32-bytes */
 160         beq     a2, t8, $chk1w  /* when a2=t8, no 32-byte chunk */
 161          nop
 162
 163         lw      t0, 0(a1)
 164         lw      t1, 4(a1)
 165         lw      t2, 8(a1)
 166         lw      t3, 12(a1)
 167         lw      t4, 16(a1)
 168         lw      t5, 20(a1)
 169         lw      t6, 24(a1)
 170         lw      t7, 28(a1)
 171         addiu   a1, a1, 32
 172
 173         sw      t0, 0(a0)
 174         sw      t1, 4(a0)
 175         sw      t2, 8(a0)
 176         sw      t3, 12(a0)
 177         sw      t4, 16(a0)
 178         sw      t5, 20(a0)
 179         sw      t6, 24(a0)
 180         sw      t7, 28(a0)
 181         addiu   a0, a0, 32
 182
 183 $chk1w:
 184         andi    a2, t8, 0x3     /* now a2 is the remainder past 1w chunks */
 185         beq     a2, t8, $last8  /* t8 below 4: no whole word left */
 186         subu    a3, t8, a2      /* delay slot: a3 is count of bytes in 1w chunks */
 187         addu    a3, a0, a3      /* now a3 is the dst address past the 1w chunks */
 188
 189 /* copying in words (4-byte chunks) */
 190 $wordCopy_loop:
 191         lw      t3, 0(a1)       /* the first t3 may be equal t0 ... optimize? */
 192         addiu   a1, a1, 4
 193         addiu   a0, a0, 4
 194         bne     a0, a3, $wordCopy_loop
 195         sw      t3, -4(a0)      /* delay slot: a0 was already advanced, hence -4 */
 196
 197 /* For the last (<8) bytes */
 198 $last8:
 199         blez    a2, leave       /* nothing left to copy */
 200         addu    a3, a0, a2      /* delay slot: a3 is the dst end (one past the last byte) */
 201 $last8loop:
 202         lb      v1, 0(a1)
 203         addiu   a1, a1, 1
 204         addiu   a0, a0, 1
 205         bne     a0, a3, $last8loop
 206         sb      v1, -1(a0)      /* delay slot: a0 was already advanced, hence -1 */
 207
 208 leave:  j       ra
 209         nop
 210
 211 /*
 212  * UNALIGNED case
 213  */
 214
 215 $unaligned:
 216         /* got here with a3="negu a0" */
 217         andi    a3, a3, 0x3     /* test if the a0 is word aligned */
 218         beqz    a3, $ua_chk16w
 219         subu    a2, a2, a3      /* delay slot: bytes left after initial a3 bytes */
 220
     /*
      * Head copy: build one full source word from the unaligned a1 with the
      * LWHI/LWLO pair, then SWHI writes only the part of it that fits between
      * a0 and the next word boundary.
      */
 221         LWHI    v1, 0(a1)
 222         LWLO    v1, 3(a1)
 223         addu    a1, a1, a3      /* a3 may be here 1, 2 or 3 */
 224         SWHI    v1, 0(a0)
 225         addu    a0, a0, a3      /* below the dst will be word aligned (NOTE1) */
 226
 227 $ua_chk16w:     andi    t8, a2, 0x3f    /* any whole 64-byte chunks? */
 228                                 /* t8 is the byte count after 64-byte chunks */
 229         beq     a2, t8, $ua_chk8w       /* if a2==t8, no 64-byte chunks */
 230                                 /* There will be at most 1 32-byte chunk after it */
 231         subu    a3, a2, t8      /* delay slot: subtract from a2 the remainder */
 232                                 /* Here a3 counts bytes in 16w chunks */
 233         addu    a3, a0, a3      /* Now a3 is the final dst after 64-byte chunks */
 234
 235         addu    t0, a0, a2      /* t0 is the "past the end" address */
 236
 237         subu    t9, t0, 160     /* t9 is the "last safe pref 30, 128(a0)" address */
 238
 239         pref    0, 0(a1)                /* bring the first line of src, addr 0 */
 240         pref    0, 32(a1)       /* bring the second line of src, addr 32 */
 241         pref    0, 64(a1)       /* bring the third line of src, addr 64 */
 242         pref    30, 32(a0)      /* safe, as we have at least 64 bytes ahead */
 243 /* In case the a0 > t9 don't use "pref 30" at all */
 244         sgtu    v1, a0, t9      /* v1 = 1 when a0 is already past the safe limit */
 245         bgtz    v1, $ua_loop16w /* skip "pref 30, 64(a0)" for too short arrays */
 246         nop
 247 /* otherwise,  start with using pref30 */
 248         pref    30, 64(a0)
     /*
      * Main unaligned loop: same shape as $loop16w, but every source word is
      * read with LWHI at offset k and LWLO at offset k+3; stores stay plain sw
      * because a0 is word-aligned here (NOTE1).
      */
 249 $ua_loop16w:
 250         pref    0, 96(a1)
 251         LWHI    t0, 0(a1)
 252         LWLO    t0, 3(a1)
 253         LWHI    t1, 4(a1)
 254         bgtz    v1, $ua_skip_pref30_96
 255         LWLO    t1, 7(a1)       /* delay slot: executes on both paths */
 256         pref    30, 96(a0)   /* continue setting up the dest, addr 96 */
 257 $ua_skip_pref30_96:
 258         LWHI    t2, 8(a1)
 259         LWLO    t2, 11(a1)
 260         LWHI    t3, 12(a1)
 261         LWLO    t3, 15(a1)
 262         LWHI    t4, 16(a1)
 263         LWLO    t4, 19(a1)
 264         LWHI    t5, 20(a1)
 265         LWLO    t5, 23(a1)
 266         LWHI    t6, 24(a1)
 267         LWLO    t6, 27(a1)
 268         LWHI    t7, 28(a1)
 269         LWLO    t7, 31(a1)
 270         pref    0, 128(a1)    /* bring the next lines of src, addr 128 */
 271
 272         sw      t0, 0(a0)
 273         sw      t1, 4(a0)
 274         sw      t2, 8(a0)
 275         sw      t3, 12(a0)
 276         sw      t4, 16(a0)
 277         sw      t5, 20(a0)
 278         sw      t6, 24(a0)
 279         sw      t7, 28(a0)
 280
 281         LWHI    t0, 32(a1)
 282         LWLO    t0, 35(a1)
 283         LWHI    t1, 36(a1)
 284         bgtz    v1, $ua_skip_pref30_128
 285         LWLO    t1, 39(a1)      /* delay slot: executes on both paths */
 286         pref    30, 128(a0)   /* continue setting up the dest, addr 128 */
 287 $ua_skip_pref30_128:
 288         LWHI    t2, 40(a1)
 289         LWLO    t2, 43(a1)
 290         LWHI    t3, 44(a1)
 291         LWLO    t3, 47(a1)
 292         LWHI    t4, 48(a1)
 293         LWLO    t4, 51(a1)
 294         LWHI    t5, 52(a1)
 295         LWLO    t5, 55(a1)
 296         LWHI    t6, 56(a1)
 297         LWLO    t6, 59(a1)
 298         LWHI    t7, 60(a1)
 299         LWLO    t7, 63(a1)
 300         pref    0, 160(a1)    /* bring the next lines of src, addr 160 */
 301
 302         sw      t0, 32(a0)
 303         sw      t1, 36(a0)
 304         sw      t2, 40(a0)
 305         sw      t3, 44(a0)
 306         sw      t4, 48(a0)
 307         sw      t5, 52(a0)
 308         sw      t6, 56(a0)
 309         sw      t7, 60(a0)
 310
 311         addiu   a0, a0, 64      /* adding 64 to dest */
 312         sgtu    v1, a0, t9      /* refresh the pref 30 guard for the next pass */
 313         bne     a0, a3, $ua_loop16w
 314         addiu   a1, a1, 64      /* delay slot: adding 64 to src */
 315         move    a2, t8          /* a2 = bytes left after the 64-byte chunks */
 316
 317 /* Here dst is word-aligned (src is not) and less than 64 bytes are left */
 318
 319 $ua_chk8w:
 320         pref 0, 0x0(a1)
 321         andi    t8, a2, 0x1f    /* is there a 32-byte chunk? */
 322                                 /* the t8 is the remainder count */
 323         beq     a2, t8, $ua_chk1w       /* when a2=t8, no 32-byte chunk */
 324
     /*
      * NOTE(review): unlike $chk8w there is no nop after the branch above, so
      * (assuming noreorder mode) the LWHI below sits in the branch delay slot
      * and also executes when the branch is taken.  It reads from 0(a1) and
      * writes only t0, which the code from $ua_chk1w onwards never reads.
      */
 325         LWHI    t0, 0(a1)
 326         LWLO    t0, 3(a1)
 327         LWHI    t1, 4(a1)
 328         LWLO    t1, 7(a1)
 329         LWHI    t2, 8(a1)
 330         LWLO    t2, 11(a1)
 331         LWHI    t3, 12(a1)
 332         LWLO    t3, 15(a1)
 333         LWHI    t4, 16(a1)
 334         LWLO    t4, 19(a1)
 335         LWHI    t5, 20(a1)
 336         LWLO    t5, 23(a1)
 337         LWHI    t6, 24(a1)
 338         LWLO    t6, 27(a1)
 339         LWHI    t7, 28(a1)
 340         LWLO    t7, 31(a1)
 341         addiu   a1, a1, 32
 342
 343         sw      t0, 0(a0)
 344         sw      t1, 4(a0)
 345         sw      t2, 8(a0)
 346         sw      t3, 12(a0)
 347         sw      t4, 16(a0)
 348         sw      t5, 20(a0)
 349         sw      t6, 24(a0)
 350         sw      t7, 28(a0)
 351         addiu   a0, a0, 32
 352
 353 $ua_chk1w:
 354         andi    a2, t8, 0x3     /* now a2 is the remainder past 1w chunks */
 355         beq     a2, t8, $ua_smallCopy   /* t8 below 4: no whole word left */
 356         subu    a3, t8, a2      /* delay slot: a3 is count of bytes in 1w chunks */
 357         addu    a3, a0, a3      /* now a3 is the dst address past the 1w chunks */
 358
 359 /* copying in words (4-byte chunks) */
 360 $ua_wordCopy_loop:
 361         LWHI    v1, 0(a1)
 362         LWLO    v1, 3(a1)
 363         addiu   a1, a1, 4
 364         addiu   a0, a0, 4               /* note: dst=a0 is word aligned here, see NOTE1 */
 365         bne     a0, a3, $ua_wordCopy_loop
 366         sw      v1, -4(a0)      /* delay slot: a0 was already advanced, hence -4 */
 367
 368 /* Now less than 4 bytes (value in a2) left to copy */
 369 $ua_smallCopy:
 370         beqz    a2, leave       /* nothing left to copy */
 371         addu    a3, a0, a2      /* delay slot: a3 is the dst end (one past the last byte) */
 372 $ua_smallCopy_loop:
 373         lb      v1, 0(a1)
 374         addiu   a1, a1, 1
 375         addiu   a0, a0, 1
 376         bne     a0, a3, $ua_smallCopy_loop
 377         sb      v1, -1(a0)      /* delay slot: a0 was already advanced, hence -1 */
 378
 379         j       ra
 380         nop
 381
 382 END(pixman_mips_fast_memcpy)