2 * Copyright © 2009 Nokia Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
23 * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com)
27 * This file contains implementations of NEON optimized pixel processing
28 * functions. There is no full and detailed tutorial, but some functions
29 * (those which are exposing some new or interesting features) are
30 * extensively commented and can be used as examples.
32 * You may want to have a look at the comments for following functions:
33 * - pixman_composite_over_8888_0565_asm_neon
34 * - pixman_composite_over_n_8_0565_asm_neon
37 /* Prevent the stack from becoming executable for no reason... */
38 #if defined(__linux__) && defined(__ELF__)
39 .section .note.GNU-stack,"",%progbits
46 .eabi_attribute 10, 0 /* suppress Tag_FP_arch */
47 .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
53 //#include "pixman-arm-asm.h"
54 /* Supplementary macro for setting function attributes */
55 .macro pixman_asm_function fname
60 .type fname, %function
65 //#include "pixman-private.h"
67 * The defines which are shared between C and assembly code
70 /* bilinear interpolation precision (must be < 8) */
71 #define BILINEAR_INTERPOLATION_BITS 7
72 #define BILINEAR_INTERPOLATION_RANGE (1 << BILINEAR_INTERPOLATION_BITS)
74 #include "pixman-arm-neon-asm.h"
76 /* Global configuration options and preferences */
79 * The code can optionally make use of unaligned memory accesses to improve
80 * performance of handling leading/trailing pixels for each scanline.
81 * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0 for
82 * example in linux if unaligned memory accesses are not configured to
83 * generate exceptions.
85 .set RESPECT_STRICT_ALIGNMENT, 1
88 * Set default prefetch type. There is a choice between the following options:
90 * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work
91 * as NOP to workaround some HW bugs or for whatever other reason)
93 * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
94 * advanced prefetch introduces heavy overhead)
96 * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
97 * which can run ARM and NEON instructions simultaneously so that extra ARM
98 * instructions do not add (many) extra cycles, but improve prefetch efficiency)
100 * Note: some types of function can't support advanced prefetch and fallback
101 * to simple one (those which handle 24bpp pixels)
103 .set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED
105 /* Prefetch distance in pixels for simple prefetch */
106 .set PREFETCH_DISTANCE_SIMPLE, 64
109 * Implementation of pixman_composite_over_8888_0565_asm_neon
111 * This function takes a8r8g8b8 source buffer, r5g6b5 destination buffer and
112 * performs OVER compositing operation. Function fast_composite_over_8888_0565
113 * from pixman-fast-path.c does the same in C and can be used as a reference.
115 * First we need to have some NEON assembly code which can do the actual
116 * operation on the pixels and provide it to the template macro.
118 * Template macro quite conveniently takes care of emitting all the necessary
119 * code for memory reading and writing (including quite tricky cases of
120 * handling unaligned leading/trailing pixels), so we only need to deal with
121 * the data in NEON registers.
123 * NEON registers allocation in general is recommended to be the following:
124 * d0, d1, d2, d3 - contain loaded source pixel data
125 * d4, d5, d6, d7 - contain loaded destination pixels (if they are needed)
126 * d24, d25, d26, d27 - contain loaded mask pixel data (if mask is used)
127 * d28, d29, d30, d31 - place for storing the result (destination pixels)
129 * As can be seen above, four 64-bit NEON registers are used for keeping
130 * intermediate pixel data and up to 8 pixels can be processed in one step
131 * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp).
133 * This particular function uses the following registers allocation:
134 * d0, d1, d2, d3 - contain loaded source pixel data
135 * d4, d5 - contain loaded destination pixels (they are needed)
136 * d28, d29 - place for storing the result (destination pixels)
140 * Step one. We need to have some code to do some arithmetics on pixel data.
141 * This is implemented as a pair of macros: '*_head' and '*_tail'. When used
142 * back-to-back, they take pixel data from {d0, d1, d2, d3} and {d4, d5},
143 * perform all the needed calculations and write the result to {d28, d29}.
144 * The rationale for having two macros and not just one will be explained
145 * later. In practice, any single monolithic function which does the work can
146 * be split into two parts in any arbitrary way without affecting correctness.
148 * There is one special trick here too. Common template macro can optionally
149 * make our life a bit easier by doing R, G, B, A color components
150 * deinterleaving for 32bpp pixel formats (and this feature is used in
151 * 'pixman_composite_over_8888_0565_asm_neon' function). So it means that
152 * instead of having 8 packed pixels in {d0, d1, d2, d3} registers, we
153 * actually use d0 register for blue channel (a vector of eight 8-bit
154 * values), d1 register for green, d2 for red and d3 for alpha. This
155 * simple conversion can be also done with a few NEON instructions:
157 * Packed to planar conversion:
163 * Planar to packed conversion:
169 * But pixel can be loaded directly in planar format using VLD4.8 NEON
170 * instruction. It is 1 cycle slower than VLD1.32, so this is not always
171 * desirable, that's why deinterleaving is optional.
173 * But anyway, here is the code:
177 * OK, now we got almost everything that we need. Using the above two
178 * macros, the work can be done right. But now we want to optimize
179 * it a bit. ARM Cortex-A8 is an in-order core, and benefits really
180 * a lot from good code scheduling and software pipelining.
182 * Let's construct some code, which will run in the core main loop.
183 * Some pseudo-code of the main loop will look like this:
191 * It may look a bit weird, but this setup allows to hide instruction
192 * latencies better and also utilize dual-issue capability more
193 * efficiently (make pairs of load-store and ALU instructions).
195 * So what we need now is a '*_tail_head' macro, which will be used
196 * in the core main loop. A trivial straightforward implementation
197 * of this macro would look like this:
199 * pixman_composite_over_8888_0565_process_pixblock_tail
200 * vst1.16 {d28, d29}, [DST_W, :128]!
201 * vld1.16 {d4, d5}, [DST_R, :128]!
202 * vld4.32 {d0, d1, d2, d3}, [SRC]!
203 * pixman_composite_over_8888_0565_process_pixblock_head
206 * Now it also got some VLD/VST instructions. We simply can't move from
207 * processing one block of pixels to the other one with just arithmetics.
208 * The previously processed data needs to be written to memory and new
209 * data needs to be fetched. Fortunately, this main loop does not deal
210 * with partial leading/trailing pixels and can load/store a full block
211 * of pixels in a bulk. Additionally, destination buffer is already
212 * 16 bytes aligned here (which is good for performance).
214 * New things here are DST_R, DST_W, SRC and MASK identifiers. These
215 * are the aliases for ARM registers which are used as pointers for
216 * accessing data. We maintain separate pointers for reading and writing
217 * destination buffer (DST_R and DST_W).
219 * Another new thing is 'cache_preload' macro. It is used for prefetching
220 * data into CPU L2 cache and improve performance when dealing with large
221 * images which are far larger than cache size. It uses one argument
222 * (actually two, but they need to be the same here) - number of pixels
223 * in a block. Looking into 'pixman-arm-neon-asm.h' can provide some
224 * details about this macro. Moreover, if good performance is needed
225 * the code from this macro needs to be copied into '*_tail_head' macro
226 * and mixed with the rest of code for optimal instructions scheduling.
227 * We are actually doing it below.
229 * Now after all the explanations, here is the optimized code.
230 * Different instruction streams (originating from '*_head', '*_tail'
231 * and 'cache_preload' macro) use different indentation levels for
232 * better readability. Actually taking the code from one of these
233 * indentation levels and ignoring a few VLD/VST instructions would
234 * result in exactly the code from '*_head', '*_tail' or 'cache_preload'
239 * And now the final part. We are using 'generate_composite_function' macro
240 * to put all the stuff together. We are specifying the name of the function
241 * which we want to get, number of bits per pixel for the source, mask and
242 * destination (0 if unused, like mask in this case). Next come some bit
244 * FLAG_DST_READWRITE - tells that the destination buffer is both read
245 * and written, for write-only buffer we would use
246 * FLAG_DST_WRITEONLY flag instead
247 * FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data
248 * and separate color channels for 32bpp format.
249 * The next things are:
250 * - the number of pixels processed per iteration (8 in this case, because
251 * that's the maximum what can fit into four 64-bit NEON registers).
252 * - prefetch distance, measured in pixel blocks. In this case it is 5 times
253 * by 8 pixels. That would be 40 pixels, or up to 160 bytes. Optimal
254 * prefetch distance can be selected by running some benchmarks.
256 * After that we specify some macros, these are 'default_init',
257 * 'default_cleanup' here which are empty (but it is possible to have custom
258 * init/cleanup macros to be able to save/restore some extra NEON registers
259 * like d8-d15 or do anything else) followed by
260 * 'pixman_composite_over_8888_0565_process_pixblock_head',
261 * 'pixman_composite_over_8888_0565_process_pixblock_tail' and
262 * 'pixman_composite_over_8888_0565_process_pixblock_tail_head'
263 * which we got implemented above.
265 * The last part is the NEON registers allocation scheme.
268 /******************************************************************************/
270 /******************************************************************************/
/*
 * OUT_REVERSE head: start computing dest = dest * (255 - src.alpha).
 * Deinterleaved source is in {d0-d3} (d3 = alpha), destination in
 * {d4-d7}; d24 receives the inverted alpha used as the multiplier.
 * NOTE(review): only the q10/q11 multiplies (for d6/d7) are visible in
 * this excerpt -- the matching q8/q9 multiplies for d4/d5 and the
 * closing .endm appear to be elided; confirm against the full file.
 */
271 .macro pixman_composite_out_reverse_8888_8888_process_pixblock_head
272 vmvn.8 d24, d3 /* get inverted alpha */
273 /* do alpha blending */
276 vmull.u8 q10, d24, d6
277 vmull.u8 q11, d24, d7
/*
 * OUT_REVERSE tail: take the 16-bit per-channel products in q8-q11 and
 * reduce them back to 8 bits with the standard NEON "divide by 255"
 * idiom (rounding shift right by 8, then add-high-halves-and-narrow).
 * Results land in {d28-d31}, the conventional result registers.
 * NOTE(review): the closing .endm is not visible in this excerpt.
 */
280 .macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail
281 vrshr.u16 q14, q8, #8
282 vrshr.u16 q15, q9, #8
283 vrshr.u16 q12, q10, #8
284 vrshr.u16 q13, q11, #8
285 vraddhn.u16 d28, q14, q8
286 vraddhn.u16 d29, q15, q9
287 vraddhn.u16 d30, q12, q10
288 vraddhn.u16 d31, q13, q11
291 /******************************************************************************/
/*
 * OVER head: OVER is dest = src + dest * (255 - src.alpha), so the
 * multiply stage is identical to OUT_REVERSE and is simply reused.
 * NOTE(review): the closing .endm is not visible in this excerpt.
 */
293 .macro pixman_composite_over_8888_8888_process_pixblock_head
294 pixman_composite_out_reverse_8888_8888_process_pixblock_head
/*
 * OVER tail: finish the OUT_REVERSE reduction (result in q14/q15),
 * then add the source pixels {q0, q1} with unsigned saturation to
 * complete the OVER operation.
 * NOTE(review): the closing .endm is not visible in this excerpt.
 */
297 .macro pixman_composite_over_8888_8888_process_pixblock_tail
298 pixman_composite_out_reverse_8888_8888_process_pixblock_tail
299 vqadd.u8 q14, q0, q14
300 vqadd.u8 q15, q1, q15
/*
 * Interleaved tail+head for the main loop: finishes the previous pixel
 * block (vrshr/vraddhn/vqadd), loads the next destination block, kicks
 * off the next block's multiplies, and mixes in the advanced-prefetch
 * bookkeeping ("PF" lines: per-scanline PLD and stride advancing).
 * NOTE(review): the vmull instructions here multiply by d22, but the
 * 'vmvn.8 d22, d3' that should produce that inverted alpha -- along
 * with the q8/q9 multiplies, some PF lines and the closing .endm --
 * is not visible in this excerpt; confirm against the full file.
 */
303 .macro pixman_composite_over_8888_8888_process_pixblock_tail_head
304 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
305 vrshr.u16 q14, q8, #8
306 PF add PF_X, PF_X, #8
308 vrshr.u16 q15, q9, #8
309 vrshr.u16 q12, q10, #8
310 vrshr.u16 q13, q11, #8
311 PF addne PF_X, PF_X, #8
312 PF subne PF_CTL, PF_CTL, #1
313 vraddhn.u16 d28, q14, q8
314 vraddhn.u16 d29, q15, q9
316 vraddhn.u16 d30, q12, q10
317 vraddhn.u16 d31, q13, q11
318 vqadd.u8 q14, q0, q14
319 vqadd.u8 q15, q1, q15
321 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
323 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
324 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
325 PF subge PF_X, PF_X, ORIG_W
327 PF subges PF_CTL, PF_CTL, #0x10
329 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
330 vmull.u8 q10, d22, d6
331 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
332 vmull.u8 q11, d22, d7
/*
 * Instantiate the full OVER a8r8g8b8 -> a8r8g8b8 fast path from the
 * template, using the head/tail/tail_head macros defined above.
 * NOTE(review): the default_init/default_cleanup argument lines appear
 * to be elided from this excerpt.
 */
335 generate_composite_function \
336 pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \
337 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
338 8, /* number of pixels, processed in a single block */ \
339 5, /* prefetch distance */ \
342 pixman_composite_over_8888_8888_process_pixblock_head, \
343 pixman_composite_over_8888_8888_process_pixblock_tail, \
344 pixman_composite_over_8888_8888_process_pixblock_tail_head
/*
 * Single-scanline variant of the same OVER operation (no prefetch
 * distance parameter; used for one scanline at a time, e.g. from the
 * general compositing path).
 * NOTE(review): the init/cleanup argument lines appear to be elided
 * from this excerpt.
 */
346 generate_composite_function_single_scanline \
347 pixman_composite_scanline_over_asm_neon, 32, 0, 32, \
348 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
349 8, /* number of pixels, processed in a single block */ \
352 pixman_composite_over_8888_8888_process_pixblock_head, \
353 pixman_composite_over_8888_8888_process_pixblock_tail, \
354 pixman_composite_over_8888_8888_process_pixblock_tail_head
356 /******************************************************************************/
/*
 * OVER with solid (n) source, head: the solid color and its inverted
 * alpha are prepared once by the init macro below, so the head only
 * multiplies destination channels by the cached inverted alpha (d24).
 * NOTE(review): only the q10/q11 multiplies are visible here -- the
 * q8/q9 ones for d4/d5 and the closing .endm appear to be elided.
 */
358 .macro pixman_composite_over_n_8888_process_pixblock_head
359 /* deinterleaved source pixels in {d0, d1, d2, d3} */
360 /* inverted alpha in {d24} */
361 /* destination pixels in {d4, d5, d6, d7} */
364 vmull.u8 q10, d24, d6
365 vmull.u8 q11, d24, d7
/*
 * OVER-with-solid-source tail: same divide-by-255 reduction as the
 * generic OVER tail (using q2/q3 as scratch instead of q12/q13), then
 * saturating add of the solid source {q0, q1}; result in {d28-d31}.
 * NOTE(review): the closing .endm is not visible in this excerpt.
 */
368 .macro pixman_composite_over_n_8888_process_pixblock_tail
369 vrshr.u16 q14, q8, #8
370 vrshr.u16 q15, q9, #8
371 vrshr.u16 q2, q10, #8
372 vrshr.u16 q3, q11, #8
373 vraddhn.u16 d28, q14, q8
374 vraddhn.u16 d29, q15, q9
375 vraddhn.u16 d30, q2, q10
376 vraddhn.u16 d31, q3, q11
377 vqadd.u8 q14, q0, q14
378 vqadd.u8 q15, q1, q15
/*
 * Interleaved tail+head for the solid-source OVER main loop.  The
 * inverted alpha in d24 is loop-invariant (set once in the init macro),
 * so unlike the generic 8888_8888 version no per-block vmvn is needed;
 * only destination prefetch ("PF ... PF_DST") is performed since there
 * is no source buffer to prefetch.
 * NOTE(review): several PF lines, the q8/q9 multiplies and the closing
 * .endm appear to be elided from this excerpt.
 */
381 .macro pixman_composite_over_n_8888_process_pixblock_tail_head
382 vrshr.u16 q14, q8, #8
383 vrshr.u16 q15, q9, #8
384 vrshr.u16 q2, q10, #8
385 vrshr.u16 q3, q11, #8
386 vraddhn.u16 d28, q14, q8
387 vraddhn.u16 d29, q15, q9
388 vraddhn.u16 d30, q2, q10
389 vraddhn.u16 d31, q3, q11
390 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
391 vqadd.u8 q14, q0, q14
392 PF add PF_X, PF_X, #8
394 PF addne PF_X, PF_X, #8
395 PF subne PF_CTL, PF_CTL, #1
396 vqadd.u8 q15, q1, q15
399 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
401 PF subge PF_X, PF_X, ORIG_W
402 vmull.u8 q10, d24, d6
403 PF subges PF_CTL, PF_CTL, #0x10
404 vmull.u8 q11, d24, d7
405 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
406 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
/*
 * Init for solid-source OVER: fetch the 32-bit solid color from the
 * stack-passed arguments, then cache its inverted alpha in d24 for the
 * whole run.
 * NOTE(review): the vdup instructions that broadcast the color's B/G/R/A
 * bytes into d0-d3 (between the vld1 and the vmvn) and the closing
 * .endm appear to be elided from this excerpt.
 */
409 .macro pixman_composite_over_n_8888_init
410 add DUMMY, sp, #ARGS_STACK_OFFSET
411 vld1.32 {d3[0]}, [DUMMY]
416 vmvn.8 d24, d3 /* get inverted alpha */
/*
 * Instantiate OVER with solid source -> a8r8g8b8.  Source bpp is 0
 * (no source buffer); the custom init macro loads the solid color.
 * Note the head/tail reuse the generic 8888_8888 macros while only the
 * tail_head is the specialized solid-source version.
 * NOTE(review): the default_cleanup argument line appears to be elided
 * from this excerpt.
 */
419 generate_composite_function \
420 pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \
421 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
422 8, /* number of pixels, processed in a single block */ \
423 5, /* prefetch distance */ \
424 pixman_composite_over_n_8888_init, \
426 pixman_composite_over_8888_8888_process_pixblock_head, \
427 pixman_composite_over_8888_8888_process_pixblock_tail, \
428 pixman_composite_over_n_8888_process_pixblock_tail_head
430 /******************************************************************************/
/*
 * SRC with solid source is a plain fill: no per-pixel arithmetic, so
 * the head is empty (body/.endm not visible in this excerpt).
 */
432 .macro pixman_composite_src_n_8888_process_pixblock_head
/* Empty tail for the solid fill (body/.endm not visible in this excerpt). */
435 .macro pixman_composite_src_n_8888_process_pixblock_tail
/*
 * Solid-fill main loop body: just store the pre-broadcast color block
 * {d0-d3} (prepared by the init macro) to the aligned destination.
 * NOTE(review): the closing .endm is not visible in this excerpt.
 */
438 .macro pixman_composite_src_n_8888_process_pixblock_tail_head
439 vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
/*
 * Init for the solid fill: fetch the 32-bit color from the stack-passed
 * arguments into d0[0].
 * NOTE(review): the instructions that replicate the color across
 * d0-d3 and the closing .endm appear to be elided from this excerpt.
 */
442 .macro pixman_composite_src_n_8888_init
443 add DUMMY, sp, #ARGS_STACK_OFFSET
444 vld1.32 {d0[0]}, [DUMMY]
/* Cleanup hook for the solid fill (body/.endm not visible in this excerpt). */
450 .macro pixman_composite_src_n_8888_cleanup
/*
 * Instantiate the solid fill.  Destination is write-only, prefetch is
 * disabled (0), and the base registers are remapped to 0 so that the
 * template's load/store code uses d0-d3 for everything.
 * NOTE(review): the final argument line (mask basereg) appears to be
 * elided from this excerpt.
 */
453 generate_composite_function \
454 pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \
455 FLAG_DST_WRITEONLY, \
456 8, /* number of pixels, processed in a single block */ \
457 0, /* prefetch distance */ \
458 pixman_composite_src_n_8888_init, \
459 pixman_composite_src_n_8888_cleanup, \
460 pixman_composite_src_n_8888_process_pixblock_head, \
461 pixman_composite_src_n_8888_process_pixblock_tail, \
462 pixman_composite_src_n_8888_process_pixblock_tail_head, \
463 0, /* dst_w_basereg */ \
464 0, /* dst_r_basereg */ \
465 0, /* src_basereg */ \
468 /******************************************************************************/
/*
 * Plain 32bpp copy: no per-pixel arithmetic, so the head is empty
 * (body/.endm not visible in this excerpt).
 */
470 .macro pixman_composite_src_8888_8888_process_pixblock_head
/* Empty tail for the 32bpp copy (body/.endm not visible in this excerpt). */
473 .macro pixman_composite_src_8888_8888_process_pixblock_tail
/*
 * Copy main loop body: store the block previously loaded into {d0-d3}
 * by the template's source-reading code.
 * NOTE(review): the matching vld1 of the next source block and the
 * closing .endm are not visible in this excerpt.
 */
476 .macro pixman_composite_src_8888_8888_process_pixblock_tail_head
477 vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
/*
 * Instantiate the straight 32bpp -> 32bpp copy.  Write-only destination
 * and a larger prefetch distance (10 blocks) since the loop is purely
 * memory-bound; base registers remapped to 0 as in the solid fill.
 * NOTE(review): the init/cleanup argument lines and the final mask
 * basereg line appear to be elided from this excerpt.
 */
482 generate_composite_function \
483 pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \
484 FLAG_DST_WRITEONLY, \
485 8, /* number of pixels, processed in a single block */ \
486 10, /* prefetch distance */ \
489 pixman_composite_src_8888_8888_process_pixblock_head, \
490 pixman_composite_src_8888_8888_process_pixblock_tail, \
491 pixman_composite_src_8888_8888_process_pixblock_tail_head, \
492 0, /* dst_w_basereg */ \
493 0, /* dst_r_basereg */ \
494 0, /* src_basereg */ \
497 /******************************************************************************/