pixman/pixman-arm-neon-asm.h

   1 /*
   2  * Copyright © 2009 Nokia Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  21  * DEALINGS IN THE SOFTWARE.
  22  *
  23  * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
  24  */
  25
  26 /*
  27  * This file contains a macro ('generate_composite_function') which can
  28  * construct 2D image processing functions, based on a common template.
  29  * Any combinations of source, destination and mask images with 8bpp,
  30  * 16bpp, 24bpp, 32bpp color formats are supported.
  31  *
  32  * This macro takes care of:
  33  *  - handling of leading and trailing unaligned pixels
  34  *  - doing most of the work related to L2 cache preload
  35  *  - encouraging the use of software pipelining for better instruction
  36  *    scheduling
  37  *
  38  * The user of this macro has to provide some configuration parameters
  39  * (bit depths for the images, prefetch distance, etc.) and a set of
  40  * macros, which should implement basic code chunks responsible for
  41  * pixels processing. See 'pixman-arm-neon-asm.S' file for the usage
  42  * examples.
  43  *
  44  * TODO:
  45  *  - try overlapped pixel method (from Ian Rickards) when processing
  46  *    exactly two blocks of pixels
  47  *  - maybe add an option to do reverse scanline processing
  48  */
  49
  50 /*
  51  * Bit flags for 'generate_composite_function' macro which are used
  52  * to tune the behavior of the generated functions.
  53  */
  54 .set FLAG_DST_WRITEONLY,       0 /* Destination is only written (dst_r_bpp = 0) */
  55 .set FLAG_DST_READWRITE,       1 /* Destination is also read (dst_r_bpp = dst_w_bpp) */
  56 .set FLAG_DEINTERLEAVE_32BPP,  2 /* Sets DEINTERLEAVE_32BPP_ENABLED to 1 */
  57
  58 /*
  59  * Offset in stack where mask and source pointer/stride can be accessed
  60  * from 'init' macro. This is useful for doing special handling for solid mask.
      * The value equals the size of the ten registers saved by the
      * 'push {r4-r12, lr}' at function entry (10 * 4 bytes).
  61  */
  62 .set ARGS_STACK_OFFSET,        40
  63
  64 /*
  65  * Constants for selecting preferable prefetch type.
  66  */
  67 .set PREFETCH_TYPE_NONE,       0 /* No prefetch at all */
  68 .set PREFETCH_TYPE_SIMPLE,     1 /* A simple, fixed-distance-ahead prefetch */
  69 .set PREFETCH_TYPE_ADVANCED,   2 /* Advanced fine-grained prefetch */
  70
  71 /*
  72  * Definitions of supplementary pixld/pixst macros (for partial load/store of
  73  * pixel data).
  74  */
  75
  76 .macro pixldst1 op, elem_size, reg1, mem_operand, abits
     /*
      * Emit a single 'op'.'elem_size' transfer of register d<reg1> at
      * [mem_operand], with address writeback ('!'). A positive 'abits'
      * adds an ':abits' alignment qualifier to the address.
      */
  77 .if abits <= 0
  78     op&.&elem_size {d&reg1&}, [&mem_operand&]!
  79 .else
  80     op&.&elem_size {d&reg1&}, [&mem_operand&, :&abits&]!
  81 .endif
  82 .endm
  83
  84 .macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits
     /*
      * Emit a single 'op'.'elem_size' transfer of the register pair
      * {d<reg1>, d<reg2>} at [mem_operand], with address writeback ('!').
      * A positive 'abits' adds an ':abits' alignment qualifier.
      */
  85 .if abits <= 0
  86     op&.&elem_size {d&reg1&, d&reg2&}, [&mem_operand&]!
  87 .else
  88     op&.&elem_size {d&reg1&, d&reg2&}, [&mem_operand&, :&abits&]!
  89 .endif
  90 .endm
  91
  92 .macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits
     /*
      * Emit a single 'op'.'elem_size' transfer of the four registers
      * {d<reg1>, d<reg2>, d<reg3>, d<reg4>} at [mem_operand], with address
      * writeback ('!'). A positive 'abits' adds an ':abits' alignment qualifier.
      */
  93 .if abits <= 0
  94     op&.&elem_size {d&reg1&, d&reg2&, d&reg3&, d&reg4&}, [&mem_operand&]!
  95 .else
  96     op&.&elem_size {d&reg1&, d&reg2&, d&reg3&, d&reg4&}, [&mem_operand&, :&abits&]!
  97 .endif
  98 .endm
  99
 100 .macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits
     /*
      * Transfer one 'elem_size'-bit lane, d<reg1>[idx], at [mem_operand] with
      * address writeback. 'abits' is accepted for call-site uniformity but
      * no alignment qualifier is emitted.
      */
 101     op&.&elem_size {d&reg1&[idx]}, [&mem_operand&]!
 102 .endm
 103
 104 .macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand
     /*
      * Emit a single 'op'.'elem_size' transfer of the three registers
      * {d<reg1>, d<reg2>, d<reg3>} at [mem_operand], with address writeback.
      * No alignment qualifier is ever emitted.
      */
 105     op&.&elem_size {d&reg1&, d&reg2&, d&reg3&}, [&mem_operand&]!
 106 .endm
 107
 108 .macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand
     /*
      * Single-lane variant of the three-register transfer: moves lane 'idx'
      * of d<reg1>, d<reg2> and d<reg3> at [mem_operand], with address writeback.
      */
 109     op&.&elem_size {d&reg1&[idx], d&reg2&[idx], d&reg3&[idx]}, [&mem_operand&]!
 110 .endm
 111
 112 .macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits
     /*
      * Transfer 'numbytes' bytes with instruction 'op' between [mem_operand]
      * and the register group that starts at d<basereg>. The destination slot
      * depends on the size, so chunks of different sizes never overlap:
      *   32 bytes -> d<basereg+4> .. d<basereg+7>
      *   16 bytes -> d<basereg+2>, d<basereg+3>
      *    8 bytes -> d<basereg+1>
      *    4 bytes -> bytes 4-7 of d<basereg+0>
      *    2 bytes -> bytes 2-3 of d<basereg+0>
      *    1 byte  -> byte 1 of d<basereg+0>
      * When RESPECT_STRICT_ALIGNMENT is nonzero, the 4 and 2 byte chunks are
      * moved one 'elem_size'-bit lane at a time (covering the same bytes)
      * instead of as one wider lane. Any other size is an assembly error.
      */
 113 .if numbytes == 32
 114     pixldst4 op, elem_size, %(basereg+4), %(basereg+5), \
 115                               %(basereg+6), %(basereg+7), mem_operand, abits
 116 .elseif numbytes == 16
 117     pixldst2 op, elem_size, %(basereg+2), %(basereg+3), mem_operand, abits
 118 .elseif numbytes == 8
 119     pixldst1 op, elem_size, %(basereg+1), mem_operand, abits
 120 .elseif numbytes == 4 /* upper half of d<basereg+0> */
 121     .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32)
 122         pixldst0 op, 32, %(basereg+0), 1, mem_operand, abits
 123     .elseif elem_size == 16
 124         pixldst0 op, 16, %(basereg+0), 2, mem_operand, abits
 125         pixldst0 op, 16, %(basereg+0), 3, mem_operand, abits
 126     .else
 127         pixldst0 op, 8, %(basereg+0), 4, mem_operand, abits
 128         pixldst0 op, 8, %(basereg+0), 5, mem_operand, abits
 129         pixldst0 op, 8, %(basereg+0), 6, mem_operand, abits
 130         pixldst0 op, 8, %(basereg+0), 7, mem_operand, abits
 131     .endif
 132 .elseif numbytes == 2 /* bytes 2-3 of d<basereg+0> */
 133     .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16)
 134         pixldst0 op, 16, %(basereg+0), 1, mem_operand, abits
 135     .else
 136         pixldst0 op, 8, %(basereg+0), 2, mem_operand, abits
 137         pixldst0 op, 8, %(basereg+0), 3, mem_operand, abits
 138     .endif
 139 .elseif numbytes == 1 /* byte 1 of d<basereg+0> */
 140     pixldst0 op, 8, %(basereg+0), 1, mem_operand, abits
 141 .else
 142     .error "unsupported size: numbytes"
 143 .endif
 144 .endm
 145
 146 .macro pixld numpix, bpp, basereg, mem_operand, abits=0
     /*
      * Load 'numpix' pixels of 'bpp' bits each from [mem_operand] (the pointer
      * is advanced by the emitted loads) into the register group based at
      * d<basereg>. Nothing is emitted when 'bpp' is not positive. 'abits' is
      * passed on as the alignment qualifier where the chosen load supports it.
      *   - eight 32bpp pixels with deinterleaving enabled: one vld4.8
      *   - 24bpp: vld3.8, either whole registers (8 pixels) or single lanes
      *   - everything else: plain vld1 through 'pixldst'
      */
 147 .if bpp <= 0
 148     /* image is not used: no load */
 149 .elseif (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
 150     pixldst4 vld4, 8, %(basereg+4), %(basereg+5), \
 151                       %(basereg+6), %(basereg+7), mem_operand, abits
 152 .elseif (bpp == 24) && (numpix == 8)
 153     pixldst3 vld3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
 154 .elseif (bpp == 24) && (numpix == 4)
 155     pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
 156     pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
 157     pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
 158     pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
 159 .elseif (bpp == 24) && (numpix == 2)
 160     pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
 161     pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
 162 .elseif (bpp == 24) && (numpix == 1)
 163     pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
 164 .else
 165     pixldst %(numpix * bpp / 8), vld1, %(bpp), basereg, mem_operand, abits
 166 .endif
 167 .endm
 168
 169 .macro pixst numpix, bpp, basereg, mem_operand, abits=0
     /*
      * Store 'numpix' pixels of 'bpp' bits each from the register group based
      * at d<basereg> to [mem_operand] (the pointer is advanced by the emitted
      * stores). Nothing is emitted when 'bpp' is not positive. 'abits' is
      * passed on as the alignment qualifier where the chosen store supports it.
      *   - eight 32bpp pixels with deinterleaving enabled: one vst4.8
      *   - 24bpp: vst3.8, either whole registers (8 pixels) or single lanes
      *   - everything else: plain vst1 through 'pixldst'
      */
 170 .if bpp <= 0
 171     /* image is not used: no store */
 172 .elseif (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
 173     pixldst4 vst4, 8, %(basereg+4), %(basereg+5), \
 174                       %(basereg+6), %(basereg+7), mem_operand, abits
 175 .elseif (bpp == 24) && (numpix == 8)
 176     pixldst3 vst3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
 177 .elseif (bpp == 24) && (numpix == 4)
 178     pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
 179     pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
 180     pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
 181     pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
 182 .elseif (bpp == 24) && (numpix == 2)
 183     pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
 184     pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
 185 .elseif (bpp == 24) && (numpix == 1)
 186     pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
 187 .else
 188     pixldst %(numpix * bpp / 8), vst1, %(bpp), basereg, mem_operand, abits
 189 .endif
 190 .endm
 191
 192 .macro pixld_a numpix, bpp, basereg, mem_operand
     /*
      * Aligned flavour of 'pixld': requests an alignment qualifier equal to
      * the size of the transfer in bits (bpp * numpix), capped at 128.
      */
 193 .if (bpp * numpix) > 128
 194     pixld numpix, bpp, basereg, mem_operand, 128
 195 .else
 196     pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix)
 197 .endif
 198 .endm
 199
 200 .macro pixst_a numpix, bpp, basereg, mem_operand
     /*
      * Aligned flavour of 'pixst': requests an alignment qualifier equal to
      * the size of the transfer in bits (bpp * numpix), capped at 128.
      */
 201 .if (bpp * numpix) > 128
 202     pixst numpix, bpp, basereg, mem_operand, 128
 203 .else
 204     pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix)
 205 .endif
 206 .endm
 207
 208 /*
 209  * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X register
 210  * aliases to be defined)
      *
      * pixld1_s fills one 64-bit register d<reg1> with consecutive scaled
      * pixels: four 16-bit pixels (lanes 0..3) or two 32-bit pixels (lanes
      * 0..1). For every pixel the element index is VX >> 16 (VX presumably
      * being a 16.16 fixed-point coordinate - confirm with the caller), the
      * address is mem_operand + index * element size, and VX is then advanced
      * by UNIT_X. 'mem_operand' itself is never modified. TMP1 and TMP2 are
      * used alternately so that address computation for the next pixel is
      * interleaved with the load of the previous one. Other element sizes
      * are an assembly error.
 211  */
 212 .macro pixld1_s elem_size, reg1, mem_operand
 213 .if elem_size == 16
 214     mov     TMP1, VX, asr #16 /* index of pixel 0 */
 215     add     VX, VX, UNIT_X
 216     add     TMP1, mem_operand, TMP1, asl #1 /* address of pixel 0 */
 217     mov     TMP2, VX, asr #16 /* index of pixel 1 */
 218     add     VX, VX, UNIT_X
 219     add     TMP2, mem_operand, TMP2, asl #1 /* address of pixel 1 */
 220     vld1.16 {d&reg1&[0]}, [TMP1, :16]
 221     mov     TMP1, VX, asr #16 /* index of pixel 2 */
 222     add     VX, VX, UNIT_X
 223     add     TMP1, mem_operand, TMP1, asl #1 /* address of pixel 2 */
 224     vld1.16 {d&reg1&[1]}, [TMP2, :16]
 225     mov     TMP2, VX, asr #16 /* index of pixel 3 */
 226     add     VX, VX, UNIT_X
 227     add     TMP2, mem_operand, TMP2, asl #1 /* address of pixel 3 */
 228     vld1.16 {d&reg1&[2]}, [TMP1, :16]
 229     vld1.16 {d&reg1&[3]}, [TMP2, :16]
 230 .elseif elem_size == 32
 231     mov     TMP1, VX, asr #16 /* index of pixel 0 */
 232     add     VX, VX, UNIT_X
 233     add     TMP1, mem_operand, TMP1, asl #2 /* address of pixel 0 */
 234     mov     TMP2, VX, asr #16 /* index of pixel 1 */
 235     add     VX, VX, UNIT_X
 236     add     TMP2, mem_operand, TMP2, asl #2 /* address of pixel 1 */
 237     vld1.32 {d&reg1&[0]}, [TMP1, :32]
 238     vld1.32 {d&reg1&[1]}, [TMP2, :32]
 239 .else
 240     .error "unsupported"
 241 .endif
 242 .endm
 243
 244 .macro pixld2_s elem_size, reg1, reg2, mem_operand
     /*
      * Nearest-scaling fetch that fills two registers, d<reg1> and d<reg2>.
      * For 32-bit elements four pixels are fetched in the order 0, 2, 1, 3
      * (VX is stepped +2, -1, +2, +1 units of UNIT_X), so that the result is
      * d<reg1> = {pixel 0, pixel 1}, d<reg2> = {pixel 2, pixel 3} and VX ends
      * up advanced by 4 * UNIT_X. For any other element size the work is
      * delegated to two consecutive pixld1_s invocations.
      * Uses TMP1 and TMP2; 'mem_operand' is not modified.
      */
 245 .if elem_size == 32
 246     mov     TMP1, VX, asr #16 /* index of pixel 0 */
 247     add     VX, VX, UNIT_X, asl #1 /* VX -> pixel 2 */
 248     add     TMP1, mem_operand, TMP1, asl #2
 249     mov     TMP2, VX, asr #16 /* index of pixel 2 */
 250     sub     VX, VX, UNIT_X /* VX -> pixel 1 */
 251     add     TMP2, mem_operand, TMP2, asl #2
 252     vld1.32 {d&reg1&[0]}, [TMP1, :32]
 253     mov     TMP1, VX, asr #16 /* index of pixel 1 */
 254     add     VX, VX, UNIT_X, asl #1 /* VX -> pixel 3 */
 255     add     TMP1, mem_operand, TMP1, asl #2
 256     vld1.32 {d&reg2&[0]}, [TMP2, :32]
 257     mov     TMP2, VX, asr #16 /* index of pixel 3 */
 258     add     VX, VX, UNIT_X /* VX -> pixel 4, i.e. the next group */
 259     add     TMP2, mem_operand, TMP2, asl #2
 260     vld1.32 {d&reg1&[1]}, [TMP1, :32]
 261     vld1.32 {d&reg2&[1]}, [TMP2, :32]
 262 .else
 263     pixld1_s elem_size, reg1, mem_operand
 264     pixld1_s elem_size, reg2, mem_operand
 265 .endif
 266 .endm
 267
 268 .macro pixld0_s elem_size, reg1, idx, mem_operand
     /*
      * Nearest-scaling fetch of a single pixel into lane 'idx' of d<reg1>.
      * The element index is VX >> 16 and the address is
      * mem_operand + index * element size; VX is then advanced by UNIT_X.
      * Uses TMP1; 'mem_operand' is not modified.
      * Only 16-bit and 32-bit elements are implemented. Any other size is
      * rejected at assembly time (as pixld1_s does) rather than silently
      * expanding to nothing and leaving the lane unloaded.
      */
 269 .if elem_size == 16
 270     mov     TMP1, VX, asr #16
 271     add     VX, VX, UNIT_X
 272     add     TMP1, mem_operand, TMP1, asl #1
 273     vld1.16 {d&reg1&[idx]}, [TMP1, :16]
 274 .elseif elem_size == 32
 275     mov     TMP1, VX, asr #16
 276     add     VX, VX, UNIT_X
 277     add     TMP1, mem_operand, TMP1, asl #2
 278     vld1.32 {d&reg1&[idx]}, [TMP1, :32]
     .else
         .error "unsupported"
 279 .endif
 280 .endm
 281
 282 .macro pixld_s_internal numbytes, elem_size, basereg, mem_operand
     /*
      * Nearest-scaling counterpart of 'pixldst': fetches 'numbytes' bytes worth
      * of 'elem_size'-bit pixels into the register group based at d<basereg>,
      * using the same size-dependent slots:
      *   32 bytes -> d<basereg+4> .. d<basereg+7> (followed by pixdeinterleave)
      *   16 bytes -> d<basereg+2>, d<basereg+3>
      *    8 bytes -> d<basereg+1>
      *    4/2/1 bytes -> individual lanes of d<basereg+0>
      * Any other size is an assembly error. The actual element sizes that can
      * be fetched are decided by the pixld2_s / pixld1_s / pixld0_s helpers.
      */
 283 .if numbytes == 32
 284     pixld2_s elem_size, %(basereg+4), %(basereg+5), mem_operand
 285     pixld2_s elem_size, %(basereg+6), %(basereg+7), mem_operand
 286     pixdeinterleave elem_size, %(basereg+4)
 287 .elseif numbytes == 16
 288     pixld2_s elem_size, %(basereg+2), %(basereg+3), mem_operand
 289 .elseif numbytes == 8
 290     pixld1_s elem_size, %(basereg+1), mem_operand
 291 .elseif numbytes == 4 /* upper half of d<basereg+0> */
 292     .if elem_size == 32
 293         pixld0_s elem_size, %(basereg+0), 1, mem_operand
 294     .elseif elem_size == 16
 295         pixld0_s elem_size, %(basereg+0), 2, mem_operand
 296         pixld0_s elem_size, %(basereg+0), 3, mem_operand
 297     .else
 298         pixld0_s elem_size, %(basereg+0), 4, mem_operand
 299         pixld0_s elem_size, %(basereg+0), 5, mem_operand
 300         pixld0_s elem_size, %(basereg+0), 6, mem_operand
 301         pixld0_s elem_size, %(basereg+0), 7, mem_operand
 302     .endif
 303 .elseif numbytes == 2 /* bytes 2-3 of d<basereg+0> */
 304     .if elem_size == 16
 305         pixld0_s elem_size, %(basereg+0), 1, mem_operand
 306     .else
 307         pixld0_s elem_size, %(basereg+0), 2, mem_operand
 308         pixld0_s elem_size, %(basereg+0), 3, mem_operand
 309     .endif
 310 .elseif numbytes == 1 /* byte 1 of d<basereg+0> */
 311     pixld0_s elem_size, %(basereg+0), 1, mem_operand
 312 .else
 313     .error "unsupported size: numbytes"
 314 .endif
 315 .endm
 316
 317 .macro pixld_s numpix, bpp, basereg, mem_operand
     /*
      * Nearest-scaling load of 'numpix' pixels of 'bpp' bits each into the
      * register group based at d<basereg>. Converts the request to a byte
      * count and element size for pixld_s_internal; emits nothing when
      * 'bpp' is not positive.
      */
 318 .if bpp > 0
 319     pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand
 320 .endif
 321 .endm
 322
 323 .macro vuzp8 reg1, reg2
     /* vuzp.8 on d<reg1>, d<reg2>, with the registers given by number. */
 324     vuzp.8 d&reg1&, d&reg2&
 325 .endm
 326
 327 .macro vzip8 reg1, reg2
     /* vzip.8 on d<reg1>, d<reg2>, with the registers given by number. */
 328     vzip.8 d&reg1&, d&reg2&
 329 .endm
 330
 331 /* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
 332 .macro pixdeinterleave bpp, basereg
     /*
      * Input: d<basereg+0..3> hold eight packed 32bpp pixels (two per register).
      * Output: d<basereg+0..3> hold bytes 0, 1, 2 and 3 of all eight pixels
      * respectively (B, G, R, A planes). Expands to nothing unless bpp is 32
      * and DEINTERLEAVE_32BPP_ENABLED is nonzero.
      * The first two vuzp8 split each register pair into even bytes (B,R) and
      * odd bytes (G,A); the last two separate G from A and B from R.
      */
 333 .if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
 334     vuzp8 %(basereg+0), %(basereg+1)
 335     vuzp8 %(basereg+2), %(basereg+3)
 336     vuzp8 %(basereg+1), %(basereg+3)
 337     vuzp8 %(basereg+0), %(basereg+2)
 338 .endif
 339 .endm
 340
 341 /* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
 342 .macro pixinterleave bpp, basereg
     /*
      * Inverse of the planar layout: d<basereg+0..3> hold the B, G, R and A
      * planes of eight pixels on entry and eight packed 32bpp pixels (two per
      * register) on exit. Expands to nothing unless bpp is 32 and
      * DEINTERLEAVE_32BPP_ENABLED is nonzero.
      * The first two vzip8 build (B,R) and (G,A) byte pairs; the last two
      * merge those pairs into complete B,G,R,A pixels.
      */
 343 .if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
 344     vzip8 %(basereg+0), %(basereg+2)
 345     vzip8 %(basereg+1), %(basereg+3)
 346     vzip8 %(basereg+2), %(basereg+3)
 347     vzip8 %(basereg+0), %(basereg+1)
 348 .endif
 349 .endm
 350
 351 /*
 352  * This is a macro for implementing cache preload. The main idea is that
 353  * cache preload logic is mostly independent from the rest of pixels
 354  * processing code. It starts at the top left pixel and moves forward
 355  * across pixels and can jump across scanlines. Prefetch distance is
 356  * handled in an 'incremental' way: it starts from 0 and advances to the
 357  * optimal distance over time. After reaching optimal prefetch distance,
 358  * it is kept constant. There are some checks which prevent prefetching
 359  * unneeded pixel lines below the image (but it still can prefetch a bit
 360  * more data on the right side of the image - not a big issue and may
 361  * be actually helpful when rendering text glyphs). Additional trick is
 362  * the use of LDR instruction for prefetch instead of PLD when moving to
 363  * the next line, the point is that we have a high chance of getting TLB
 364  * miss in this case, and PLD would be useless.
 365  *
 366  * This sounds like it may introduce a noticeable overhead (when working with
 367  * fully cached data). But in reality, due to having a separate pipeline and
 368  * instruction queue for NEON unit in ARM Cortex-A8, normal ARM code can
 369  * execute simultaneously with NEON and be completely shadowed by it. Thus
 370  * we get no performance overhead at all (*). This looks like a very nice
 371  * feature of Cortex-A8, if used wisely. We don't have a hardware prefetcher,
 372  * but still can implement some rather advanced prefetch logic in software
 373  * for almost zero cost!
 374  *
 375  * (*) The overhead of the prefetcher is visible when running some trivial
 376  * pixels processing like simple copy. Anyway, having prefetch is a must
 377  * when working with the graphics data.
 378  */
 379 .macro PF a, x:vararg
     /*
      * Prefix for instructions that belong to the advanced prefetcher:
      * "PF <insn> <operands>" emits "<insn> <operands>" only when
      * PREFETCH_TYPE_CURRENT is PREFETCH_TYPE_ADVANCED and nothing otherwise.
      * 'a' is the mnemonic, 'x' collects the rest of the line.
      */
 380 .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED)
 381     a x
 382 .endif
 383 .endm
 384
 385 .macro cache_preload std_increment, boost_increment
     /*
      * One step of the advanced prefetcher (every instruction goes through PF,
      * so nothing is emitted for the other prefetch types).
      *   std_increment   - pixels always added to PF_X (skipped when 0)
      *   boost_increment - extra pixels added to PF_X while the low four bits
      *                     of PF_CTL are nonzero; each use decrements them
      * PF_CTL keeps a scanline counter in the bits above the low four; it is
      * decremented (by 0x10) each time PF_X wraps past ORIG_W.
      * The whole body is omitted when there is no source, no readable
      * destination and no mask to prefetch.
      */
 386 .if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0)
 387 .if regs_shortage
 388     PF ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */
 389 .endif
 390 .if std_increment != 0
 391     PF add PF_X, PF_X, #std_increment
 392 .endif
 393     PF tst PF_CTL, #0xF /* is the prefetch distance still being ramped up? */
 394     PF addne PF_X, PF_X, #boost_increment
 395     PF subne PF_CTL, PF_CTL, #1
 396     PF cmp PF_X, ORIG_W /* flags are consumed by the 'ge' instructions below */
 397 .if src_bpp_shift >= 0
 398     PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
 399 .endif
 400 .if dst_r_bpp != 0
 401     PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
 402 .endif
 403 .if mask_bpp_shift >= 0
 404     PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
 405 .endif
 406     PF subge PF_X, PF_X, ORIG_W /* went past the end of the line: wrap PF_X */
 407     PF subges PF_CTL, PF_CTL, #0x10 /* one line less; 'ge' now also means the counter did not go negative */
 408 .if src_bpp_shift >= 0
 409     PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! /* advance to next line by touching it with a real load */
 410 .endif
 411 .if dst_r_bpp != 0
 412     PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
 413 .endif
 414 .if mask_bpp_shift >= 0
 415     PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
 416 .endif
 417 .endif
 418 .endm
 419
 420 .macro cache_preload_simple
     /*
      * Simple prefetcher: when PREFETCH_TYPE_CURRENT is PREFETCH_TYPE_SIMPLE,
      * issue one PLD per image in use, PREFETCH_DISTANCE_SIMPLE pixels
      * (converted to bytes with the image's bpp) ahead of its current
      * pointer. Emits nothing for the other prefetch types.
      */
 421 .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE)
 422 .if src_bpp > 0
 423     pld [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)]
 424 .endif
 425 .if dst_r_bpp > 0
 426     pld [DST_R, #(PREFETCH_DISTANCE_SIMPLE * dst_r_bpp / 8)]
 427 .endif
 428 .if mask_bpp > 0
 429     pld [MASK, #(PREFETCH_DISTANCE_SIMPLE * mask_bpp / 8)]
 430 .endif
 431 .endif
 432 .endm
 433
 434 .macro fetch_mask_pixblock
     /*
      * Load one full block (pixblock_size pixels) of mask data from MASK.
      * The base register is biased down by the block size in 8-byte units
      * (pixblock_size * mask_bpp / 64), which cancels the size-dependent
      * slot offset applied inside the load macros, so a full block lands in
      * registers starting exactly at d<mask_basereg>.
      */
 435     pixld       pixblock_size, mask_bpp, \
 436                 (mask_basereg - pixblock_size * mask_bpp / 64), MASK
 437 .endm
 438
 439 /*
 440  * Macro which is used to process leading pixels until destination
 441  * pointer is properly aligned (at 16 bytes boundary). Nothing is emitted
 442  * when the destination uses a 24bpp format (the body is guarded by
      * dst_w_bpp != 24).
      *
      * The arguments are the names of the user supplied pixel processing
      * macros; only the head and tail ones are invoked here.
      *
      * Operation: if DST_R is already 16-byte aligned the whole sequence is
      * skipped. Otherwise, for each address bit (1, 2, 4, 8) that is set, the
      * matching number of source/mask/destination pixels is loaded into the
      * partial-block register slots and W is reduced accordingly; the
      * 16-byte chunk, where it is smaller than a full block, is taken
      * unconditionally. The loaded pixels are then processed as one block
      * and stored back chunk by chunk, testing the same bits on DST_W.
      * Uses numeric local labels 1 and 2.
 443  */
 444 .macro ensure_destination_ptr_alignment process_pixblock_head, \
 445                                         process_pixblock_tail, \
 446                                         process_pixblock_tail_head
 447 .if dst_w_bpp != 24
 448     tst         DST_R, #0xF
 449     beq         2f
 450
 451 .irp lowbit, 1, 2, 4, 8, 16
 452 /* a chunk of 'lowbit' bytes is handled if it holds whole pixels and is smaller than a block */
 453 .if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
 454 .if lowbit < 16 /* we don't need more than 16-byte alignment */
 455     tst         DST_R, #lowbit
 456     beq         1f
 457 .endif
 458     pixld_src   (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
 459     pixld       (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
 460 .if dst_r_bpp > 0
 461     pixld_a     (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
 462 .else
 463     add         DST_R, DST_R, #lowbit /* write-only destination: just step the read pointer */
 464 .endif
 465     PF add      PF_X, PF_X, #(lowbit * 8 / dst_w_bpp)
 466     sub         W, W, #(lowbit * 8 / dst_w_bpp)
 467 1:
 468 .endif
 469 .endr
 470     pixdeinterleave src_bpp, src_basereg
 471     pixdeinterleave mask_bpp, mask_basereg
 472     pixdeinterleave dst_r_bpp, dst_r_basereg
 473
 474     process_pixblock_head
 475     cache_preload 0, pixblock_size
 476     cache_preload_simple
 477     process_pixblock_tail
 478
 479     pixinterleave dst_w_bpp, dst_w_basereg
 480 .irp lowbit, 1, 2, 4, 8, 16
 481 .if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
 482 .if lowbit < 16 /* we don't need more than 16-byte alignment */
 483     tst         DST_W, #lowbit
 484     beq         1f
 485 .endif
 486     pixst_a     (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
 487 1:
 488 .endif
 489 .endr
 490 .endif
 491 2:
 492 .endm
 493
 494 /*
 495  * Special code for processing up to (pixblock_size - 1) remaining
 496  * trailing pixels. As SIMD processing performs operation on
 497  * pixblock_size pixels, anything smaller than this has to be loaded
 498  * and stored in a special way. Loading and storing of pixel data is
 499  * performed in such a way that we fill some 'slots' in the NEON
 500  * registers (some slots naturally are unused), then perform compositing
 501  * operation as usual. In the end, the data is taken from these 'slots'
 502  * and saved to memory.
 503  *
 504  * cache_preload_flag - allows to suppress prefetch if
 505  *                      set to 0
 506  * dst_aligned_flag   - selects whether destination buffer
 507  *                      is aligned
      *
      * The remaining arguments are the names of the user supplied pixel
      * processing macros; only the head and tail ones are invoked here.
      * W is only tested, never modified, so the same bits of W select the
      * chunks (16, 8, 4, 2, 1 pixels) for both the loads and the stores.
      * Uses numeric local labels 1 and 2.
 508  */
 509 .macro process_trailing_pixels cache_preload_flag, \
 510                                dst_aligned_flag, \
 511                                process_pixblock_head, \
 512                                process_pixblock_tail, \
 513                                process_pixblock_tail_head
 514     tst         W, #(pixblock_size - 1)
 515     beq         2f /* no partial block left */
 516 .irp chunk_size, 16, 8, 4, 2, 1
 517 .if pixblock_size > chunk_size
 518     tst         W, #chunk_size
 519     beq         1f
 520     pixld_src   chunk_size, src_bpp, src_basereg, SRC
 521     pixld       chunk_size, mask_bpp, mask_basereg, MASK
 522 .if dst_aligned_flag != 0
 523     pixld_a     chunk_size, dst_r_bpp, dst_r_basereg, DST_R
 524 .else
 525     pixld       chunk_size, dst_r_bpp, dst_r_basereg, DST_R
 526 .endif
 527 .if cache_preload_flag != 0
 528     PF add      PF_X, PF_X, #chunk_size
 529 .endif
 530 1:
 531 .endif
 532 .endr
 533     pixdeinterleave src_bpp, src_basereg
 534     pixdeinterleave mask_bpp, mask_basereg
 535     pixdeinterleave dst_r_bpp, dst_r_basereg
 536
 537     process_pixblock_head
 538 .if cache_preload_flag != 0
 539     cache_preload 0, pixblock_size
 540     cache_preload_simple
 541 .endif
 542     process_pixblock_tail
 543     pixinterleave dst_w_bpp, dst_w_basereg
 544 .irp chunk_size, 16, 8, 4, 2, 1
 545 .if pixblock_size > chunk_size
 546     tst         W, #chunk_size
 547     beq         1f
 548 .if dst_aligned_flag != 0
 549     pixst_a     chunk_size, dst_w_bpp, dst_w_basereg, DST_W
 550 .else
 551     pixst       chunk_size, dst_w_bpp, dst_w_basereg, DST_W
 552 .endif
 553 1:
 554 .endif
 555 .endr
 556 2:
 557 .endm
 558
 559 /*
 560  * Macro, which performs all the needed operations to switch to the next
 561  * scanline and start the next loop iteration unless all the scanlines
 562  * are already processed.
      *
      * start_of_loop_label - label to branch to while H stays non-negative
      *                       after being decremented
      *
      * Each pointer is moved forward by its stride and then back by the
      * scanline width. The rewind is omitted for 24bpp images, whose strides
      * have already been reduced by 3 * W in 'generate_composite_function'.
 563  */
 564 .macro advance_to_next_scanline start_of_loop_label
 565 .if regs_shortage
 566     ldrd        W, [sp] /* load W and H (width and height) from stack */
 567 .else
 568     mov         W, ORIG_W
 569 .endif
 570     add         DST_W, DST_W, DST_STRIDE, lsl #dst_bpp_shift
 571 .if src_bpp != 0
 572     add         SRC, SRC, SRC_STRIDE, lsl #src_bpp_shift
 573 .endif
 574 .if mask_bpp != 0
 575     add         MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift
 576 .endif
 577 .if (dst_w_bpp != 24)
 578     sub         DST_W, DST_W, W, lsl #dst_bpp_shift
 579 .endif
 580 .if (src_bpp != 24) && (src_bpp != 0)
 581     sub         SRC, SRC, W, lsl #src_bpp_shift
 582 .endif
 583 .if (mask_bpp != 24) && (mask_bpp != 0)
 584     sub         MASK, MASK, W, lsl #mask_bpp_shift
 585 .endif
 586     subs        H, H, #1 /* sets the flags tested by 'bge' below */
 587     mov         DST_R, DST_W /* read pointer restarts at the new line (flags untouched) */
 588 .if regs_shortage
 589     str         H, [sp, #4] /* save updated height to stack */
 590 .endif
 591     bge         start_of_loop_label
 592 .endm
 593
 594 /*
 595  * Registers are allocated in the following way by default:
 596  * d0, d1, d2, d3     - reserved for loading source pixel data
 597  * d4, d5, d6, d7     - reserved for loading destination pixel data
 598  * d24, d25, d26, d27 - reserved for loading mask pixel data
 599  * d28, d29, d30, d31 - final destination pixel data for writeback to memory
 600  */
 601 .macro generate_composite_function fname, \
 602                                    src_bpp_, \
 603                                    mask_bpp_, \
 604                                    dst_w_bpp_, \
 605                                    flags, \
 606                                    pixblock_size_, \
 607                                    prefetch_distance, \
 608                                    init, \
 609                                    cleanup, \
 610                                    process_pixblock_head, \
 611                                    process_pixblock_tail, \
 612                                    process_pixblock_tail_head, \
 613                                    dst_w_basereg_ = 28, \
 614                                    dst_r_basereg_ = 4, \
 615                                    src_basereg_   = 0, \
 616                                    mask_basereg_  = 24
 617
 618     .func fname
 619     .global fname
 620     /* For ELF format also set function visibility to hidden */
 621 #ifdef __ELF__
 622     .hidden fname
 623     .type fname, %function
 624 #endif
 625 fname:
 626     push        {r4-r12, lr}        /* save all registers */
 627
 628 /*
 629  * Select prefetch type for this function. If prefetch distance is
 630  * set to 0 or one of the color formats is 24bpp, SIMPLE prefetch
 631  * has to be used instead of ADVANCED.
 632  */
 633     .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT
 634 .if prefetch_distance == 0
 635     .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
 636 .elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \
 637         ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24))
 638     .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE
 639 .endif
 640
 641 /*
 642  * Make some macro arguments globally visible and accessible
 643  * from other macros
 644  */
 645     .set src_bpp, src_bpp_
 646     .set mask_bpp, mask_bpp_
 647     .set dst_w_bpp, dst_w_bpp_
 648     .set pixblock_size, pixblock_size_
 649     .set dst_w_basereg, dst_w_basereg_
 650     .set dst_r_basereg, dst_r_basereg_
 651     .set src_basereg, src_basereg_
 652     .set mask_basereg, mask_basereg_
 653
 654     .macro pixld_src x:vararg
 655         pixld x
 656     .endm
 657     .macro fetch_src_pixblock
 658         pixld_src   pixblock_size, src_bpp, \
 659                     (src_basereg - pixblock_size * src_bpp / 64), SRC
 660     .endm
 661 /*
 662  * Assign symbolic names to registers
 663  */
 664     W           .req        r0      /* width (is updated during processing) */
 665     H           .req        r1      /* height (is updated during processing) */
 666     DST_W       .req        r2      /* destination buffer pointer for writes */
 667     DST_STRIDE  .req        r3      /* destination image stride */
 668     SRC         .req        r4      /* source buffer pointer */
 669     SRC_STRIDE  .req        r5      /* source image stride */
 670     DST_R       .req        r6      /* destination buffer pointer for reads */
 671
 672     MASK        .req        r7      /* mask pointer */
 673     MASK_STRIDE .req        r8      /* mask stride */
 674
 675     PF_CTL      .req        r9      /* combined lines counter and prefetch */
 676                                     /* distance increment counter */
 677     PF_X        .req        r10     /* pixel index in a scanline for current */
 678                                     /* prefetch position */
 679     PF_SRC      .req        r11     /* pointer to source scanline start */
 680                                     /* for prefetch purposes */
 681     PF_DST      .req        r12     /* pointer to destination scanline start */
 682                                     /* for prefetch purposes */
 683     PF_MASK     .req        r14     /* pointer to mask scanline start */
 684                                     /* for prefetch purposes */
 685 /*
 686  * Check whether we have enough registers for all the local variables.
 687  * If we don't have enough registers, original width and height are
 688  * kept on top of stack (and 'regs_shortage' variable is set to indicate
 689  * this for the rest of code). Even if there are enough registers, the
 690  * allocation scheme may be a bit different depending on whether source
 691  * or mask is not used.
 692  */
 693 .if (PREFETCH_TYPE_CURRENT < PREFETCH_TYPE_ADVANCED)
 694     ORIG_W      .req        r10     /* saved original width */
 695     DUMMY       .req        r12     /* temporary register */
 696     .set        regs_shortage, 0
 697 .elseif mask_bpp == 0
 698     ORIG_W      .req        r7      /* saved original width */
 699     DUMMY       .req        r8      /* temporary register */
 700     .set        regs_shortage, 0
 701 .elseif src_bpp == 0
 702     ORIG_W      .req        r4      /* saved original width */
 703     DUMMY       .req        r5      /* temporary register */
 704     .set        regs_shortage, 0
 705 .else
 706     ORIG_W      .req        r1      /* saved original width */
 707     DUMMY       .req        r1      /* temporary register */
 708     .set        regs_shortage, 1
 709 .endif
 710
 711     .set mask_bpp_shift, -1
 712 .if src_bpp == 32
 713     .set src_bpp_shift, 2
 714 .elseif src_bpp == 24
 715     .set src_bpp_shift, 0
 716 .elseif src_bpp == 16
 717     .set src_bpp_shift, 1
 718 .elseif src_bpp == 8
 719     .set src_bpp_shift, 0
 720 .elseif src_bpp == 0
 721     .set src_bpp_shift, -1
 722 .else
 723     .error "requested src bpp (src_bpp) is not supported"
 724 .endif
 725 .if mask_bpp == 32
 726     .set mask_bpp_shift, 2
 727 .elseif mask_bpp == 24
 728     .set mask_bpp_shift, 0
 729 .elseif mask_bpp == 8
 730     .set mask_bpp_shift, 0
 731 .elseif mask_bpp == 0
 732     .set mask_bpp_shift, -1
 733 .else
 734     .error "requested mask bpp (mask_bpp) is not supported"
 735 .endif
 736 .if dst_w_bpp == 32
 737     .set dst_bpp_shift, 2
 738 .elseif dst_w_bpp == 24
 739     .set dst_bpp_shift, 0
 740 .elseif dst_w_bpp == 16
 741     .set dst_bpp_shift, 1
 742 .elseif dst_w_bpp == 8
 743     .set dst_bpp_shift, 0
 744 .else
 745     .error "requested dst bpp (dst_w_bpp) is not supported"
 746 .endif
 747
 748 .if (((flags) & FLAG_DST_READWRITE) != 0)
 749     .set dst_r_bpp, dst_w_bpp
 750 .else
 751     .set dst_r_bpp, 0
 752 .endif
 753 .if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
 754     .set DEINTERLEAVE_32BPP_ENABLED, 1
 755 .else
 756     .set DEINTERLEAVE_32BPP_ENABLED, 0
 757 .endif
 758
 759 .if prefetch_distance < 0 || prefetch_distance > 15
 760     .error "invalid prefetch distance (prefetch_distance)"
 761 .endif
 762
 763 .if src_bpp > 0
 764     ldr         SRC, [sp, #40]
 765 .endif
 766 .if mask_bpp > 0
 767     ldr         MASK, [sp, #48]
 768 .endif
 769     PF mov      PF_X, #0
 770 .if src_bpp > 0
 771     ldr         SRC_STRIDE, [sp, #44]
 772 .endif
 773 .if mask_bpp > 0
 774     ldr         MASK_STRIDE, [sp, #52]
 775 .endif
 776     mov         DST_R, DST_W
 777
 778 .if src_bpp == 24
 779     sub         SRC_STRIDE, SRC_STRIDE, W
 780     sub         SRC_STRIDE, SRC_STRIDE, W, lsl #1
 781 .endif
 782 .if mask_bpp == 24
 783     sub         MASK_STRIDE, MASK_STRIDE, W
 784     sub         MASK_STRIDE, MASK_STRIDE, W, lsl #1
 785 .endif
 786 .if dst_w_bpp == 24
 787     sub         DST_STRIDE, DST_STRIDE, W
 788     sub         DST_STRIDE, DST_STRIDE, W, lsl #1
 789 .endif
 790
 791 /*
 792  * Setup advanced prefetcher initial state
 793  */
 794     PF mov      PF_SRC, SRC
 795     PF mov      PF_DST, DST_R
 796     PF mov      PF_MASK, MASK
 797     /* PF_CTL = prefetch_distance | ((h - 1) << 4) */
 798     PF mov      PF_CTL, H, lsl #4
 799     PF add      PF_CTL, #(prefetch_distance - 0x10)
 800
 801     init
 802 .if regs_shortage
 803     push        {r0, r1}
 804 .endif
 805     subs        H, H, #1
 806 .if regs_shortage
 807     str         H, [sp, #4] /* save updated height to stack */
 808 .else
 809     mov         ORIG_W, W
 810 .endif
 811     blt         9f
 812     cmp         W, #(pixblock_size * 2)
 813     blt         8f
 814 /*
 815  * This is the start of the pipelined loop, which is optimized for
 816  * long scanlines
 817  */
 818 0:
 819     ensure_destination_ptr_alignment process_pixblock_head, \
 820                                      process_pixblock_tail, \
 821                                      process_pixblock_tail_head
 822
 823     /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
 824     pixld_a     pixblock_size, dst_r_bpp, \
 825                 (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
 826     fetch_src_pixblock
 827     pixld       pixblock_size, mask_bpp, \
 828                 (mask_basereg - pixblock_size * mask_bpp / 64), MASK
 829     PF add      PF_X, PF_X, #pixblock_size
 830     process_pixblock_head
 831     cache_preload 0, pixblock_size
 832     cache_preload_simple
 833     subs        W, W, #(pixblock_size * 2)
 834     blt         2f
 835 1:
 836     process_pixblock_tail_head
 837     cache_preload_simple
 838     subs        W, W, #pixblock_size
 839     bge         1b
 840 2:
 841     process_pixblock_tail
 842     pixst_a     pixblock_size, dst_w_bpp, \
 843                 (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
 844
 845     /* Process the remaining trailing pixels in the scanline */
 846     process_trailing_pixels 1, 1, \
 847                             process_pixblock_head, \
 848                             process_pixblock_tail, \
 849                             process_pixblock_tail_head
 850     advance_to_next_scanline 0b
 851
 852 .if regs_shortage
 853     pop         {r0, r1}
 854 .endif
 855     cleanup
 856     pop         {r4-r12, pc}  /* exit */
 857 /*
 858  * This is the start of the loop, designed to process images with small width
 859  * (less than pixblock_size * 2 pixels). In this case neither pipelining
 860  * nor prefetch are used.
 861  */
 862 8:
 863     /* Process exactly pixblock_size pixels if needed */
 864     tst         W, #pixblock_size
 865     beq         1f
 866     pixld       pixblock_size, dst_r_bpp, \
 867                 (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
 868     fetch_src_pixblock
 869     pixld       pixblock_size, mask_bpp, \
 870                 (mask_basereg - pixblock_size * mask_bpp / 64), MASK
 871     process_pixblock_head
 872     process_pixblock_tail
 873     pixst       pixblock_size, dst_w_bpp, \
 874                 (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
 875 1:
 876     /* Process the remaining trailing pixels in the scanline */
 877     process_trailing_pixels 0, 0, \
 878                             process_pixblock_head, \
 879                             process_pixblock_tail, \
 880                             process_pixblock_tail_head
 881     advance_to_next_scanline 8b
 882 9:
 883 .if regs_shortage
 884     pop         {r0, r1}
 885 .endif
 886     cleanup
 887     pop         {r4-r12, pc}  /* exit */
 888
 889     .purgem     fetch_src_pixblock
 890     .purgem     pixld_src
 891
 892     .unreq      SRC
 893     .unreq      MASK
 894     .unreq      DST_R
 895     .unreq      DST_W
 896     .unreq      ORIG_W
 897     .unreq      W
 898     .unreq      H
 899     .unreq      SRC_STRIDE
 900     .unreq      DST_STRIDE
 901     .unreq      MASK_STRIDE
 902     .unreq      PF_CTL
 903     .unreq      PF_X
 904     .unreq      PF_SRC
 905     .unreq      PF_DST
 906     .unreq      PF_MASK
 907     .unreq      DUMMY
 908     .endfunc
 909 .endm
 910
 911 /*
 912  * A simplified variant of function generation template for a single
 913  * scanline processing (for implementing pixman combine functions)
 914  */
 915 .macro generate_composite_function_scanline        use_nearest_scaling, \
 916                                                    fname, \
 917                                                    src_bpp_, \
 918                                                    mask_bpp_, \
 919                                                    dst_w_bpp_, \
 920                                                    flags, \
 921                                                    pixblock_size_, \
 922                                                    init, \
 923                                                    cleanup, \
 924                                                    process_pixblock_head, \
 925                                                    process_pixblock_tail, \
 926                                                    process_pixblock_tail_head, \
 927                                                    dst_w_basereg_ = 28, \
 928                                                    dst_r_basereg_ = 4, \
 929                                                    src_basereg_   = 0, \
 930                                                    mask_basereg_  = 24
 931
         /*
          * Emits one global function 'fname' that processes a single scanline.
          *
          * Arguments:
          *   use_nearest_scaling - nonzero selects the nearest scaling register
          *                         layout and entry/exit sequence, zero selects
          *                         the plain single scanline layout
          *   fname               - label/name of the generated function
          *   src_bpp_, mask_bpp_, dst_w_bpp_
          *                       - bits per pixel of source, mask and
          *                         destination; republished below as src_bpp,
          *                         mask_bpp and dst_w_bpp for the helper macros
          *   flags               - tested for FLAG_DST_READWRITE and
          *                         FLAG_DEINTERLEAVE_32BPP
          *   pixblock_size_      - number of pixels handled per block
          *   init, cleanup       - macros expanded once on entry (init) and
          *                         once before each of the two exits (cleanup)
          *   process_pixblock_head, process_pixblock_tail,
          *   process_pixblock_tail_head
          *                       - pixel processing macros, expanded here and
          *                         also forwarded to the alignment and trailing
          *                         pixels helpers
          *   dst_w_basereg_, dst_r_basereg_, src_basereg_, mask_basereg_
          *                       - base register numbers used when computing
          *                         the register argument of pixld/pixst
          */
 932     .func fname
 933     .global fname
 934     /* For ELF format also set function visibility to hidden */
 935 #ifdef __ELF__
 936     .hidden fname
 937     .type fname, %function
 938 #endif
 939 fname:
         /* The scanline variant selects PREFETCH_TYPE_NONE */
 940     .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
 941 /*
 942  * Make some macro arguments globally visible and accessible
 943  * from other macros
 944  */
 945     .set src_bpp, src_bpp_
 946     .set mask_bpp, mask_bpp_
 947     .set dst_w_bpp, dst_w_bpp_
 948     .set pixblock_size, pixblock_size_
 949     .set dst_w_basereg, dst_w_basereg_
 950     .set dst_r_basereg, dst_r_basereg_
 951     .set src_basereg, src_basereg_
 952     .set mask_basereg, mask_basereg_
 953
 954 .if use_nearest_scaling != 0
 955     /*
 956      * Assign symbolic names to registers for nearest scaling
 957      */
         /*
          * VX, UNIT_X, TMP1 and TMP2 are not referenced directly in this
          * macro; presumably they are consumed by pixld_s, which is defined
          * elsewhere - TODO confirm.
          */
 958     W           .req        r0
 959     DST_W       .req        r1
 960     SRC         .req        r2
 961     VX          .req        r3
 962     UNIT_X      .req        ip
 963     MASK        .req        lr
 964     TMP1        .req        r4
 965     TMP2        .req        r5
 966     DST_R       .req        r6
 967
         /* Source pixels are fetched through pixld_s in this variant */
 968     .macro pixld_src x:vararg
 969         pixld_s x
 970     .endm
 971
         /* Read UNIT_X from the top of the stack before the push moves sp */
 972     ldr         UNIT_X, [sp]
         /* lr is saved because it is reused as MASK; exit is 'pop {..., pc}' */
 973     push        {r4-r6, lr}
 974     .if mask_bpp != 0
         /* 16 = size of the four registers just pushed, 4 = slot after UNIT_X */
 975     ldr         MASK, [sp, #(16 + 4)]
 976     .endif
 977 .else
 978     /*
 979      * Assign symbolic names to registers
 980      */
 981     W           .req        r0      /* width (is updated during processing) */
 982     DST_W       .req        r1      /* destination buffer pointer for writes */
 983     SRC         .req        r2      /* source buffer pointer */
 984     DST_R       .req        ip      /* destination buffer pointer for reads */
 985     MASK        .req        r3      /* mask pointer */
 986
         /* Source pixels are fetched through plain pixld in this variant */
 987     .macro pixld_src x:vararg
 988         pixld x
 989     .endm
 990 .endif
 991
     /* Destination is only read back when FLAG_DST_READWRITE is set */
 992 .if (((flags) & FLAG_DST_READWRITE) != 0)
 993     .set dst_r_bpp, dst_w_bpp
 994 .else
 995     .set dst_r_bpp, 0
 996 .endif
 997 .if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
 998     .set DEINTERLEAVE_32BPP_ENABLED, 1
 999 .else
 1000     .set DEINTERLEAVE_32BPP_ENABLED, 0
 1001 .endif
 1002
          /*
           * Loads one block of source pixels through whichever pixld_src
           * flavour was selected above.
           */
 1003     .macro fetch_src_pixblock
 1004         pixld_src   pixblock_size, src_bpp, \
 1005                     (src_basereg - pixblock_size * src_bpp / 64), SRC
 1006     .endm
 1007
 1008     init
          /* Reads and writes of the destination start at the same address */
 1009     mov         DST_R, DST_W
 1010
          /*
           * Fewer than pixblock_size pixels in total: skip the alignment step
           * and the block loop, and handle everything at label 8 below.
           */
 1011     cmp         W, #pixblock_size
 1012     blt         8f
 1013
 1014     ensure_destination_ptr_alignment process_pixblock_head, \
 1015                                      process_pixblock_tail, \
 1016                                      process_pixblock_tail_head
 1017
          /* Skip the block loop when less than one whole block remains */
 1018     subs        W, W, #pixblock_size
 1019     blt         7f
 1020
 1021     /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
 1022     pixld_a     pixblock_size, dst_r_bpp, \
 1023                 (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
 1024     fetch_src_pixblock
 1025     pixld       pixblock_size, mask_bpp, \
 1026                 (mask_basereg - pixblock_size * mask_bpp / 64), MASK
 1027     process_pixblock_head
          /* No second whole block available: finish with the tail right away */
 1028     subs        W, W, #pixblock_size
 1029     blt         2f
 1030 1:
 1031     process_pixblock_tail_head
          /* Loop while the pixel count stays non-negative after the decrement */
 1032     subs        W, W, #pixblock_size
 1033     bge         1b
 1034 2:
 1035     process_pixblock_tail
 1036     pixst_a     pixblock_size, dst_w_bpp, \
 1037                 (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
 1038 7:
 1039     /* Process the remaining trailing pixels in the scanline (dst aligned) */
 1040     process_trailing_pixels 0, 1, \
 1041                             process_pixblock_head, \
 1042                             process_pixblock_tail, \
 1043                             process_pixblock_tail_head
 1044
 1045     cleanup
 1046 .if use_nearest_scaling != 0
 1047     pop         {r4-r6, pc}  /* exit */
 1048 .else
 1049     bx          lr  /* exit */
 1050 .endif
 1051 8:
 1052     /* Process the remaining trailing pixels in the scanline (dst unaligned) */
 1053     process_trailing_pixels 0, 0, \
 1054                             process_pixblock_head, \
 1055                             process_pixblock_tail, \
 1056                             process_pixblock_tail_head
 1057
 1058     cleanup
 1059
          /*
           * Second exit path. The .unreq/.purgem directives that follow emit
           * no code; they release the register aliases and the helper macros
           * defined by this expansion.
           */
 1060 .if use_nearest_scaling != 0
 1061     pop         {r4-r6, pc}  /* exit */
 1062
 1063     .unreq      DST_R
 1064     .unreq      SRC
 1065     .unreq      W
 1066     .unreq      VX
 1067     .unreq      UNIT_X
 1068     .unreq      TMP1
 1069     .unreq      TMP2
 1070     .unreq      DST_W
 1071     .unreq      MASK
 1072
 1073 .else
 1074     bx          lr  /* exit */
 1075
 1076     .unreq      SRC
 1077     .unreq      MASK
 1078     .unreq      DST_R
 1079     .unreq      DST_W
 1080     .unreq      W
 1081 .endif
 1082
 1083     .purgem     fetch_src_pixblock
 1084     .purgem     pixld_src
 1085
 1086     .endfunc
 1087 .endm
1088
 1089 .macro generate_composite_function_single_scanline x:vararg
          /* Forward all arguments with use_nearest_scaling = 0 */
 1090     generate_composite_function_scanline 0, x
 1091 .endm
1092
 1093 .macro generate_composite_function_nearest_scanline x:vararg
          /* Forward all arguments with use_nearest_scaling = 1 */
 1094     generate_composite_function_scanline 1, x
 1095 .endm
1096
1097 /* Default prologue/epilogue, nothing special needs to be done */
1098
 1099 .macro default_init
          /* Intentionally empty: expands to no instructions */
 1100 .endm
1101
 1102 .macro default_cleanup
          /* Intentionally empty: expands to no instructions */
 1103 .endm
1104
1105 /*
1106  * Prologue/epilogue variant which additionally saves/restores d8-d15
1107  * registers (they need to be saved/restored by callee according to ABI).
1108  * This is required if the code needs to use all the NEON registers.
1109  */
1110
 1111 .macro default_init_need_all_regs
          /*
           * Push d8-d15 (8 registers of 8 bytes, so sp moves down by 64).
           * Must be undone by default_cleanup_need_all_regs before returning.
           */
 1112     vpush       {d8-d15}
 1113 .endm
1114
 1115 .macro default_cleanup_need_all_regs
          /*
           * Pop d8-d15 again (sp moves up by 64); counterpart of the vpush
           * in default_init_need_all_regs.
           */
 1116     vpop        {d8-d15}
 1117 .endm
1118
1119 /******************************************************************************/
1120
1121 /*
 1122  * Conversion of 8 r5g6b5 pixels packed in 128-bit register (in)
1123  * into a planar a8r8g8b8 format (with a, r, g, b color components
1124  * stored into 64-bit registers out_a, out_r, out_g, out_b respectively).
1125  *
1126  * Warning: the conversion is destructive and the original
1127  *          value (in) is lost.
1128  */
 1129 .macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b
          /*
           * Each 16-bit lane of 'in' is RRRRRGGG GGGBBBBB (r5g6b5). Every
           * 8-bit output channel is widened by copying its own top bits into
           * the freed low bits. Alpha is set to 255. 'in' is overwritten.
           */
 1130     vshrn.u16   out_r, in,    #8  /* out_r = RRRRRGGG (bits 15..8) */
 1131     vshrn.u16   out_g, in,    #3  /* out_g = GGGGGGBB (bits 10..3) */
 1132     vsli.u16    in,    in,    #5  /* B now in bits 9..5 and in bits 4..0 */
 1133     vmov.u8     out_a, #255       /* fully opaque alpha */
 1134     vsri.u8     out_r, out_r, #5  /* low 3 bits := top 3 bits of R */
 1135     vsri.u8     out_g, out_g, #6  /* low 2 bits := top 2 bits of G */
 1136     vshrn.u16   out_b, in,    #2  /* out_b = bits 9..2: B5 then top 3 of B */
 1137 .endm
1138
 1139 .macro convert_0565_to_x888 in, out_r, out_g, out_b
          /*
           * Same instruction sequence as convert_0565_to_8888 minus the alpha
           * output: r5g6b5 lanes of 'in' are split into 8-bit r, g, b with the
           * top bits of each channel copied into its low bits. 'in' is
           * overwritten.
           */
 1140     vshrn.u16   out_r, in,    #8  /* out_r = RRRRRGGG (bits 15..8) */
 1141     vshrn.u16   out_g, in,    #3  /* out_g = GGGGGGBB (bits 10..3) */
 1142     vsli.u16    in,    in,    #5  /* B now in bits 9..5 and in bits 4..0 */
 1143     vsri.u8     out_r, out_r, #5  /* low 3 bits := top 3 bits of R */
 1144     vsri.u8     out_g, out_g, #6  /* low 2 bits := top 2 bits of G */
 1145     vshrn.u16   out_b, in,    #2  /* out_b = bits 9..2: B5 then top 3 of B */
 1146 .endm
1147
1148 /*
 1149  * Conversion from planar a8r8g8b8 format (with r, g, b color components
 1150  * in 64-bit registers in_r, in_g, in_b respectively; alpha is not used)
 1151  * into 8 r5g6b5 pixels packed in 128-bit register (out). Requires two
 1152  * temporary 128-bit registers (tmp1, tmp2)
1153  */
 1154 .macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2
          /*
           * Packs 8-bit r, g, b into 16-bit lanes by truncation: the top 5
           * bits of R, top 6 bits of G and top 5 bits of B are kept. No alpha
           * input is taken. tmp1 and tmp2 are clobbered.
           */
 1155     vshll.u8    tmp1, in_g, #8    /* G widened into the top byte of each lane */
 1156     vshll.u8    out, in_r, #8     /* R widened into the top byte of each lane */
 1157     vshll.u8    tmp2, in_b, #8    /* B widened into the top byte of each lane */
 1158     vsri.u16    out, tmp1, #5     /* keep top 5 bits (R), insert G below them */
 1159     vsri.u16    out, tmp2, #11    /* keep top 11 bits (R, G), insert top 5 of B */
 1160 .endm
1161
1162 /*
1163  * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels
1164  * returned in (out0, out1) registers pair. Requires one temporary
1165  * 64-bit register (tmp). 'out1' and 'in' may overlap, the original
1166  * value from 'in' is lost
1167  */
 1168 .macro convert_four_0565_to_x888_packed in, out0, out1, tmp
          /*
           * Each channel is first moved to the top of its 16-bit lane and
           * widened by copying its own top bits below it. The lanes are then
           * combined into a (G,B) halfword in out0 and a (0,R) halfword in
           * out1, and vzip interleaves them into 32-bit pixels. The unused top
           * byte of every output pixel is zero, because the vshr by 8 leaves
           * zeros above R.
           */
 1169     vshl.u16    out0, in,   #5  /* G top 6 bits */
 1170     vshl.u16    tmp,  in,   #11 /* B top 5 bits */
 1171     vsri.u16    in,   in,   #5  /* R is ready in top bits */
 1172     vsri.u16    out0, out0, #6  /* G is ready in top bits */
 1173     vsri.u16    tmp,  tmp,  #5  /* B is ready in top bits */
 1174     vshr.u16    out1, in,   #8  /* R is in place */
 1175     vsri.u16    out0, tmp,  #8  /* G & B is in place */
 1176     vzip.u16    out0, out1      /* everything is in place */
 1177 .endm