src/video/arm/pixman-arm-neon-asm.S

   1 /*
   2  * Copyright © 2009 Nokia Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  21  * DEALINGS IN THE SOFTWARE.
  22  *
  23  * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
  24  */
  25
  26 /*
  27  * Copyright (c) 2018 RISC OS Open Ltd
  28  *
  29  * This software is provided 'as-is', without any express or implied
  30  * warranty.  In no event will the authors be held liable for any damages
  31  * arising from the use of this software.
  32  *
  33  * Permission is granted to anyone to use this software for any purpose,
  34  * including commercial applications, and to alter it and redistribute it
  35  * freely, subject to the following restrictions:
  36  *
  37  * 1. The origin of this software must not be misrepresented; you must not
  38  *    claim that you wrote the original software. If you use this software
  39  *    in a product, an acknowledgment in the product documentation would be
  40  *    appreciated but is not required.
  41  * 2. Altered source versions must be plainly marked as such, and must not be
  42  *    misrepresented as being the original software.
  43  * 3. This notice may not be removed or altered from any source distribution.
  44  */
  45
  46 /* Prevent the stack from becoming executable for no reason... */
  47 #if defined(__linux__) && defined(__ELF__)
  48 .section .note.GNU-stack,"",%progbits
  49 #endif
  50
  51     .text
  52     .fpu neon /* accept NEON instructions */
  53     .arch armv7a
  54     .object_arch armv4 /* architecture recorded in the object file, independent of the .arch used for assembling */
  55     .eabi_attribute 10, 0 /* suppress Tag_FP_arch */
  56     .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
  57     .arm /* ARM (A32) encodings, not Thumb */
  58     .altmacro /* alternate macro mode; NOTE(review): macros below name some parameters without a backslash (e.g. "bpp" in .if), which appears to rely on this mode */
  59     .p2align 2 /* align to 4 bytes */
  60
  61 #include "pixman-arm-asm.h"
  62 #include "pixman-arm-neon-asm.h"
  63
  64 /* Global configuration options and preferences */
  65
  66 /*
  67  * The code can optionally make use of unaligned memory accesses to improve
  68  * performance of handling leading/trailing pixels for each scanline.
  69  * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0, for
  70  * example on Linux, if unaligned memory accesses are not configured to
  71  * generate exceptions.
  72  */
  73 .set RESPECT_STRICT_ALIGNMENT, 1
  74
  75 /*
  76  * Set default prefetch type. There is a choice between the following options:
  77  *
  78  * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work
  79  * as NOP to work around some HW bugs or for whatever other reason)
  80  *
  81  * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
  82  * advanced prefetch introduces heavy overhead)
  83  *
  84  * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
  85  * which can run ARM and NEON instructions simultaneously so that extra ARM
  86  * instructions do not add (many) extra cycles, but improve prefetch efficiency)
  87  *
  88  * Note: some types of function can't support advanced prefetch and fall back
  89  *       to the simple one (those which handle 24bpp pixels)
  90  */
  91 .set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED
  92
  93 /* Prefetch distance in pixels for simple prefetch */
  94 .set PREFETCH_DISTANCE_SIMPLE, 64
  95
  96 /******************************************************************************/
  97
  98 /* We can actually do significantly better than the Pixman macros, at least for
  99  * the case of fills, by using a carefully scheduled inner loop. Cortex-A53
 100  * shows an improvement of up to 78% in ideal cases (large fills to L1 cache).
 101  */
 102
 103 .macro generate_fillrect_function name, bpp, log2Bpp
 104 /* Fill a w x h rectangle of bpp-bit pixels with one constant pixel value.
 105  * void name(int32_t w, int32_t h, uint8_t *dst, int32_t dst_stride, uint8_t src);
 106  * On entry:
 107  * a1 = width, pixels
 108  * a2 = height, rows (tested only after a row has been processed)
 109  * a3 = pointer to top-left destination pixel
 110  * a4 = stride, pixels (reduced by the width on entry to give the row-to-row skip)
 111  * [sp] = pixel value to fill with
 112  * Within the function:
 113  * v1 = width remaining
 114  * v2 = vst offset
 115  * v3 = alternate pointer
 116  * ip = data ARM register (q0 and q1 hold the same value replicated in every lane)
 117  */
 118 pixman_asm_function name
 119     vld1.\bpp   {d0[],d1[]}, [sp]   /* q0 = fill value in every lane (read before the push moves sp) */
 120     sub         a4, a1              /* a4 = stride - width */
 121     vld1.\bpp   {d2[],d3[]}, [sp]   /* q1 = same, so {q0-q1} stores 32 bytes at once */
 122     cmp         a1, #(15+64) >> \log2Bpp /* rows narrower than 15+64 bytes use the short-row code */
 123     push        {v1-v3,lr}
 124     vmov        ip, s0              /* ARM-register copy of the value for strb/strh */
 125     blo         51f                 /* flags still from the cmp: push and vmov do not change them */
 126
 127     /* Long-row case */
 128     mov         v2, #64             /* both store pointers advance 64 bytes per inner-loop pass */
 129 1:  mov         v1, a1
 130     ands        v3, a3, #15
 131     beq         2f                  /* already 16-byte aligned: no leading pixels */
 132     /* Leading pixels */
 133     rsb         v3, v3, #16  /* number of leading bytes until 16-byte aligned */
 134     sub         v1, v1, v3, lsr #\log2Bpp
 135     rbit        v3, v3       /* bit k of the byte count is now bit 31-k, ready to be shifted into C/N */
 136 .if bpp <= 16
 137 .if bpp == 8
 138     tst         a3, #1       /* bit 0 unaffected by rsb so can avoid register interlock */
 139     strneb      ip, [a3], #1
 140     tst         v3, #1<<30   /* bit 1 of the leading byte count */
 141 .else
 142     tst         a3, #2       /* bit 1 unaffected by rsb (assuming halfword alignment) so can avoid register interlock */
 143 .endif
 144     strneh      ip, [a3], #2
 145 .endif
 146     movs        v3, v3, lsl #3      /* C = bit 2 (4 bytes), N = bit 3 (8 bytes) of the leading byte count */
 147     vstmcs      a3!, {s0}
 148     vstmmi      a3!, {d0}
 149 2:  sub         v1, v1, #64 >> \log2Bpp /* simplifies inner loop termination */
 150     add         v3, a3, #32         /* second pointer covers the upper 32 bytes of each 64-byte chunk */
 151     /* Inner loop */
 152 3:  vst1.\bpp   {q0-q1}, [a3 :128], v2
 153     subs        v1, v1, #64 >> \log2Bpp
 154     vst1.\bpp   {q0-q1}, [v3 :128], v2
 155     bhs         3b                  /* loop while at least another 64 bytes remain */
 156     /* Trailing pixels */
 157 4:  movs        v1, v1, lsl #27 + \log2Bpp /* C = 32-byte bit, N = 16-byte bit of the remaining byte count */
 158     bcc         5f
 159     vst1.\bpp   {q0-q1}, [a3 :128]!
 160 5:  bpl         6f
 161     vst1.\bpp   {q0}, [a3 :128]!
 162 6:  movs        v1, v1, lsl #2      /* C = 8-byte bit, N = 4-byte bit */
 163     vstmcs      a3!, {d0}
 164     vstmmi      a3!, {s0}
 165 .if bpp <= 16
 166     movs        v1, v1, lsl #2      /* C = 2-byte bit, N = 1-byte bit */
 167     strcsh      ip, [a3], #2
 168 .if bpp == 8
 169     strmib      ip, [a3], #1
 170 .endif
 171 .endif
 172     subs        a2, a2, #1
 173     add         a3, a3, a4, lsl #\log2Bpp /* a3 is at the end of the row; skip to the start of the next */
 174     bhi         1b
 175     pop         {v1-v3,pc}
 176
 177     /* Short-row case */
 178 51: movs        v1, a1
 179 .if bpp == 8
 180     tst         a3, #3
 181     beq         53f
 182 52: subs        v1, v1, #1          /* store single bytes until a3 is word aligned */
 183     blo         57f                 /* row used up while aligning: a3 has advanced by exactly the width, so carry on with the next row rather than returning */
 184     strb        ip, [a3], #1
 185     tst         a3, #3
 186     bne         52b
 187 .elseif bpp == 16
 188     tstne       a3, #2              /* only when the width is non-zero (Z from the movs above) */
 189     subne       v1, v1, #1
 190     strneh      ip, [a3], #2
 191 .endif
 192 53: cmp         v1, #32 >> \log2Bpp /* one 32-byte store up front if at least that much remains */
 193     bcc         54f
 194     vst1.\bpp   {q0-q1}, [a3]!
 195     sub         v1, v1, #32 >> \log2Bpp
 196     /* Trailing pixels */
 197 54: movs        v1, v1, lsl #27 + \log2Bpp /* C = 32-byte bit, N = 16-byte bit of the remaining byte count */
 198     bcc         55f
 199     vst1.\bpp   {q0-q1}, [a3]!
 200 55: bpl         56f
 201     vst1.\bpp   {q0}, [a3]!
 202 56: movs        v1, v1, lsl #2      /* C = 8-byte bit, N = 4-byte bit */
 203     vstmcs      a3!, {d0}
 204     vstmmi      a3!, {s0}
 205 .if bpp <= 16
 206     movs        v1, v1, lsl #2      /* C = 2-byte bit, N = 1-byte bit */
 207     strcsh      ip, [a3], #2
 208 .if bpp == 8
 209     strmib      ip, [a3], #1
 210 .endif
 211 .endif
 212 57: subs        a2, a2, #1          /* next row; also the target of the 8bpp alignment loop when a row ends early */
 213     add         a3, a3, a4, lsl #\log2Bpp
 214     bhi         51b
 215     pop         {v1-v3,pc}
 216
 217 .endfunc
 218 .endm
 219
 220 generate_fillrect_function FillRect32ARMNEONAsm, 32, 2 /* 32 bpp; log2Bpp = 2 is the shift that turns pixel counts into byte counts */
 221 generate_fillrect_function FillRect16ARMNEONAsm, 16, 1 /* 16 bpp; shift 1 */
 222 generate_fillrect_function FillRect8ARMNEONAsm,  8,  0 /* 8 bpp; shift 0 */
 223
 224 /******************************************************************************/
 225
 226 .macro RGBtoRGBPixelAlpha_process_pixblock_head /* blend source planes d0-d2 (alpha in d3) over destination planes d4-d6 (alpha in d7); results go to d28-d31 */
 227     vmvn        d30, d3  /* get inverted source alpha */
 228     vmov        d31, d7  /* dest alpha is always unchanged */
 229     vmull.u8    q14, d0, d3   /* q14 = src plane 0 * alpha (16-bit lanes) */
 230     vmlal.u8    q14, d4, d30  /*     + dst plane 0 * inverted alpha */
 231     vmull.u8    q0, d1, d3    /* plane 1; overwrites d0/d1, d0 is no longer needed */
 232     vmlal.u8    q0, d5, d30
 233     vmull.u8    q1, d2, d3    /* plane 2; overwrites d2/d3, this is the last use of the alpha in d3 */
 234     vmlal.u8    q1, d6, d30
 235     vrshr.u16   q2, q14, #8   /* t = (x + 128) >> 8; d4/d5 are free by now */
 236     vrshr.u16   q3, q0, #8
 237     vraddhn.u16 d28, q14, q2  /* (x + t + 128) >> 8, i.e. a rounded x / 255, narrowed to bytes */
 238     vrshr.u16   q2, q1, #8
 239     vraddhn.u16 d29, q0, q3
 240     vraddhn.u16 d30, q1, q2   /* replaces the inverted alpha, which is no longer needed */
 241 .endm
 242
 243 .macro RGBtoRGBPixelAlpha_process_pixblock_tail
 244     /* nothing: the head macro already leaves the finished pixel planes in d28-d31 */
 245 .endm
 246
 247 .macro RGBtoRGBPixelAlpha_process_pixblock_tail_head /* store the previous block's result, load the next block and run the head arithmetic on it */
 248     vld4.8      {d0-d3}, [SRC]! /* next source block, split into four byte planes */
 249                                     PF add PF_X, PF_X, #8 /* NOTE(review): PF and the PF_* names come from pixman-arm-neon-asm.h (not visible here); presumably prefetch bookkeeping - confirm */
 250         vst4.8      {d28-d31}, [DST_W :128]! /* write back the planes produced by the previous head */
 251                                     PF tst PF_CTL, #0xF
 252     vld4.8      {d4-d7}, [DST_R :128]! /* next destination block */
 253                                     PF addne PF_X, PF_X, #8
 254     vmvn        d30, d3  /* get inverted source alpha */
 255     vmov        d31, d7  /* dest alpha is always unchanged */
 256     vmull.u8    q14, d0, d3 /* from here on: same arithmetic, in the same order, as the head macro */
 257                                     PF subne PF_CTL, PF_CTL, #1
 258     vmlal.u8    q14, d4, d30
 259                                     PF cmp PF_X, ORIG_W
 260     vmull.u8    q0, d1, d3
 261                                     PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
 262     vmlal.u8    q0, d5, d30
 263                                     PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
 264     vmull.u8    q1, d2, d3
 265                                     PF subge PF_X, PF_X, ORIG_W
 266     vmlal.u8    q1, d6, d30
 267                                     PF subges PF_CTL, PF_CTL, #0x10
 268     vrshr.u16   q2, q14, #8
 269                                     PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
 270     vrshr.u16   q3, q0, #8
 271                                     PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
 272     vraddhn.u16 d28, q14, q2
 273     vrshr.u16   q2, q1, #8
 274     vraddhn.u16 d29, q0, q3
 275     vraddhn.u16 d30, q1, q2
 276 .endm
 277
 278 generate_composite_function \
 279     BlitRGBtoRGBPixelAlphaARMNEONAsm, 32, 0, 32, /* function name, then presumably src / mask / dst bits per pixel - TODO confirm against generate_composite_function in pixman-arm-neon-asm.h */ \
 280     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, /* flags defined in the header; the pixblock macros load and store with vld4.8/vst4.8 byte planes */ \
 281     8, /* number of pixels, processed in a single block */ \
 282     5, /* prefetch distance */ \
 283     default_init, \
 284     default_cleanup, \
 285     RGBtoRGBPixelAlpha_process_pixblock_head, \
 286     RGBtoRGBPixelAlpha_process_pixblock_tail, \
 287     RGBtoRGBPixelAlpha_process_pixblock_tail_head /* the three pixblock macros defined above */
 288
 289  /******************************************************************************/
 290
 291 .macro ARGBto565PixelAlpha_process_pixblock_head /* source planes in d0-d2 with alpha in d3, eight 16-bit destination pixels in q2; leaves unnormalised blend sums in q13-q15 */
 292     vmvn        d6, d3           /* inverted source alpha */
 293     vshr.u8     d1, #2           /* source plane 1 reduced to 6 bits */
 294     vshr.u8     d3, #3           /* alpha reduced to 5 bits */
 295     vshr.u8     d0, #3           /* source plane 0 reduced to 5 bits */
 296     vshrn.u16   d7, q2, #3       /* destination bits 3-10 (finished two lines below the vbic) */
 297     vshrn.u16   d25, q2, #8      /* destination bits 8-15 */
 298     vbic.i16    q2, #0xe0        /* clear bits 5-7 so the low byte keeps only bits 0-4 */
 299     vshr.u8     d6, #3           /* inverted alpha reduced to 5 bits: 31 - (alpha >> 3) */
 300     vshr.u8     d7, #2           /* d7 = destination bits 5-10 (6-bit field) */
 301     vshr.u8     d2, #3           /* source plane 2 reduced to 5 bits */
 302     vmovn.u16   d24, q2          /* d24 = destination bits 0-4 */
 303     vshr.u8     d25, #3          /* d25 = destination bits 11-15 */
 304     vmull.u8    q13, d1, d3      /* q13 = 6-bit field: src * alpha ... */
 305     vmlal.u8    q13, d7, d6      /*       ... + dst * inverted alpha */
 306     vmull.u8    q14, d0, d3      /* q14 = low 5-bit field */
 307     vmlal.u8    q14, d24, d6
 308     vmull.u8    q15, d2, d3      /* q15 = high 5-bit field */
 309     vmlal.u8    q15, d25, d6
 310 .endm
 311
 312 .macro ARGBto565PixelAlpha_process_pixblock_tail /* normalise the sums in q13-q15 and pack them into eight 16-bit pixels in q14 */
 313     vsra.u16    q13, #5          /* x += x >> 5 ... */
 314     vsra.u16    q14, #5
 315     vsra.u16    q15, #5
 316     vrshr.u16   q13, #5          /* ... then (x + 16) >> 5: together a rounded division by 31 */
 317     vrshr.u16   q14, #5
 318     vrshr.u16   q15, #5
 319     vsli.u16    q14, q13, #5     /* insert the 6-bit field above q14's low 5 bits */
 320     vsli.u16    q14, q15, #11    /* insert the high 5-bit field at bits 11-15 */
 321 .endm
 322
 323 .macro ARGBto565PixelAlpha_process_pixblock_tail_head /* tail of the previous block (deeper-indented lines) interleaved with loads for the next block, then its head */
 324     vld4.8      {d0-d3}, [SRC]! /* next source block, split into four byte planes */
 325                                     PF add PF_X, PF_X, #8 /* NOTE(review): PF and the PF_* names come from pixman-arm-neon-asm.h (not visible here); presumably prefetch bookkeeping - confirm */
 326         vsra.u16    q13, #5
 327                                     PF tst PF_CTL, #0xF
 328         vsra.u16    q14, #5
 329                                     PF addne PF_X, PF_X, #8
 330         vsra.u16    q15, #5
 331                                     PF subne PF_CTL, PF_CTL, #1
 332         vrshr.u16   q13, #5
 333                                     PF cmp PF_X, ORIG_W
 334         vrshr.u16   q14, #5
 335                                     PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
 336         vrshr.u16   q15, #5
 337                                     PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
 338     vld1.8      {d4-d5}, [DST_R]! /* next eight 16-bit destination pixels into q2 */
 339                                     PF subge PF_X, PF_X, ORIG_W
 340         vsli.u16    q14, q13, #5
 341                                     PF subges PF_CTL, PF_CTL, #0x10
 342         vsli.u16    q14, q15, #11
 343                                     PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
 344         vst1.8      {q14}, [DST_W :128]! /* write the packed pixels of the previous block */
 345     vmvn        d6, d3 /* from here on: same instructions, in the same order, as the head macro */
 346     vshr.u8     d1, #2
 347     vshr.u8     d3, #3
 348     vshr.u8     d0, #3
 349     vshrn.u16   d7, q2, #3
 350     vshrn.u16   d25, q2, #8
 351     vbic.i16    q2, #0xe0
 352                                     PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
 353     vshr.u8     d6, #3
 354     vshr.u8     d7, #2
 355     vshr.u8     d2, #3
 356     vmovn.u16   d24, q2
 357     vshr.u8     d25, #3
 358     vmull.u8    q13, d1, d3
 359     vmlal.u8    q13, d7, d6
 360     vmull.u8    q14, d0, d3
 361     vmlal.u8    q14, d24, d6
 362     vmull.u8    q15, d2, d3
 363     vmlal.u8    q15, d25, d6
 364 .endm
 365
 366 generate_composite_function \
 367     BlitARGBto565PixelAlphaARMNEONAsm, 32, 0, 16, /* function name, then presumably src / mask / dst bits per pixel - TODO confirm against generate_composite_function in pixman-arm-neon-asm.h */ \
 368     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, /* flags defined in the header; the source is loaded as byte planes with vld4.8, the destination as plain 16-bit pixels */ \
 369     8, /* number of pixels, processed in a single block */ \
 370     6, /* prefetch distance */ \
 371     default_init, \
 372     default_cleanup, \
 373     ARGBto565PixelAlpha_process_pixblock_head, \
 374     ARGBto565PixelAlpha_process_pixblock_tail, \
 375     ARGBto565PixelAlpha_process_pixblock_tail_head /* the three pixblock macros defined above */