src/third_party/skia/src/opts/memset16_neon.S

   1 /***************************************************************************
   2  * Copyright (c) 2009,2010, Code Aurora Forum. All rights reserved.
   3  *
   4  * Use of this source code is governed by a BSD-style license that can be
   5  * found in the LICENSE file.
   6  ***************************************************************************/
   7
   8 /***************************************************************************
   9   Neon memset: Attempts to do a memset with Neon registers if possible,
  10      Inputs:
  11         s: The buffer to write to
  12         c: The integer data to write to the buffer
  13         n: The size_t count.
  14      Outputs:
  15
  16 ***************************************************************************/
  17
  18         .syntax unified
  19
  20         .code 32
  21         .fpu neon
  22         .align 4
  23         .globl memset16_neon
  24
  25 memset16_neon:
  26         cmp             r2, #0
  27         bxeq            lr
  28
  29         /* Keep in mind that r2 -- the count argument -- is for the
  30          * number of 16-bit items to copy.
  31          */
  32         lsl             r2, r2, #1
  33
  34         push            {r0}
  35
  36         /* If we have < 8 bytes, just do a quick loop to handle that */
  37         cmp             r2, #8
  38         bgt             memset_gt4
  39 memset_smallcopy_loop:
  40         strh            r1, [r0], #2
  41         subs            r2, r2, #2
  42         bne             memset_smallcopy_loop
  43 memset_smallcopy_done:
  44         pop             {r0}
  45         bx              lr
  46
  47 memset_gt4:
  48         /*
  49          * Duplicate the r1 lowest 16-bits across r1. The idea is to have
  50          * a register with two 16-bit-values we can copy. We do this by
  51          * duplicating lowest 16-bits of r1 to upper 16-bits.
  52          */
  53         orr             r1, r1, r1, lsl #16
  54         /*
  55          * If we're copying > 64 bytes, then we may want to get
  56          * onto a 16-byte boundary to improve speed even more.
  57          */
  58         cmp             r2, #64
  59         blt             memset_route
  60         ands            r12, r0, #0xf
  61         beq             memset_route
  62         /*
  63          * Determine the number of bytes to move forward to get to the 16-byte
  64          * boundary.  Note that this will be a multiple of 4, since we
  65          * already are word-aligned.
  66          */
  67         rsb             r12, r12, #16
  68         sub             r2, r2, r12
  69         lsls            r12, r12, #29
  70         strmi           r1, [r0], #4
  71         strcs           r1, [r0], #4
  72         strcs           r1, [r0], #4
  73         lsls            r12, r12, #2
  74         strhcs          r1, [r0], #2
  75 memset_route:
  76         /*
  77          * Decide where to route for the maximum copy sizes.  Note that we
  78          * build q0 and q1 depending on if we'll need it, so that's
  79          * interwoven here as well.
  80          */
  81         vdup.u32        d0, r1
  82         cmp             r2, #16
  83         blt             memset_8
  84         vmov            d1, d0
  85         cmp             r2, #64
  86         blt             memset_16
  87         vmov            q1, q0
  88         cmp             r2, #128
  89         blt             memset_32
  90 memset_128:
  91         mov             r12, r2, lsr #7
  92 memset_128_loop:
  93         vst1.64         {q0, q1}, [r0]!
  94         vst1.64         {q0, q1}, [r0]!
  95         vst1.64         {q0, q1}, [r0]!
  96         vst1.64         {q0, q1}, [r0]!
  97         subs            r12, r12, #1
  98         bne             memset_128_loop
  99         ands            r2, r2, #0x7f
 100         beq             memset_end
 101 memset_32:
 102         movs            r12, r2, lsr #5
 103         beq             memset_16
 104 memset_32_loop:
 105         subs            r12, r12, #1
 106         vst1.64         {q0, q1}, [r0]!
 107         bne             memset_32_loop
 108         ands            r2, r2, #0x1f
 109         beq             memset_end
 110 memset_16:
 111         movs            r12, r2, lsr #4
 112         beq             memset_8
 113 memset_16_loop:
 114         subs            r12, r12, #1
 115         vst1.32         {q0}, [r0]!
 116         bne             memset_16_loop
 117         ands            r2, r2, #0xf
 118         beq             memset_end
 119         /*
 120          * memset_8 isn't a loop, since we try to do our loops at 16
 121          * bytes and above.  We should loop there, then drop down here
 122          * to finish the <16-byte versions.  Same for memset_4 and
 123          * memset_1.
 124          */
 125 memset_8:
 126         cmp             r2, #8
 127         blt             memset_4
 128         subs            r2, r2, #8
 129         vst1.32         {d0}, [r0]!
 130 memset_4:
 131         cmp             r2, #4
 132         blt             memset_2
 133         subs            r2, r2, #4
 134         str             r1, [r0], #4
 135 memset_2:
 136         cmp             r2, #0
 137         ble             memset_end
 138         strh            r1, [r0], #2
 139 memset_end:
 140         pop             {r0}
 141         bx              lr
 142
 143         .end