2 * Copyright © 2009 Nokia Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
23 * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com)
27 * This file contains implementations of NEON optimized pixel processing
28 * functions. There is no full and detailed tutorial, but some functions
29 * (those which are exposing some new or interesting features) are
30 * extensively commented and can be used as examples.
32 * You may want to have a look at the comments for following functions:
33 * - pixman_composite_over_8888_0565_asm_neon
34 * - pixman_composite_over_n_8_0565_asm_neon
37 /* Prevent the stack from becoming executable for no reason... */
38 #if defined(__linux__) && defined(__ELF__)
39 .section .note.GNU-stack,"",%progbits
46 .eabi_attribute 10, 0 /* suppress Tag_FP_arch */
47 .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
53 //#include "pixman-arm-asm.h"
54 /* Supplementary macro for setting function attributes */
55 .macro pixman_asm_function fname
60 .type fname, %function
65 //#include "pixman-private.h"
67 * The defines which are shared between C and assembly code
70 /* bilinear interpolation precision (must be < 8) */
71 #define BILINEAR_INTERPOLATION_BITS 7
72 #define BILINEAR_INTERPOLATION_RANGE (1 << BILINEAR_INTERPOLATION_BITS)
74 #include "pixman-arm-neon-asm.h"
76 /* Global configuration options and preferences */
79 * The code can optionally make use of unaligned memory accesses to improve
80 * performance of handling leading/trailing pixels for each scanline.
81 * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0 for
82 * example in linux if unaligned memory accesses are not configured to
83 * generate exceptions.
85 .set RESPECT_STRICT_ALIGNMENT, 1
88 * Set default prefetch type. There is a choice between the following options:
90 * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work
91 * as NOP to workaround some HW bugs or for whatever other reason)
93 * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
94 * advanced prefetch introduces heavy overhead)
96 * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
97 * which can run ARM and NEON instructions simultaneously so that extra ARM
98 * instructions do not add (many) extra cycles, but improve prefetch efficiency)
100 * Note: some types of function can't support advanced prefetch and fallback
101 * to simple one (those which handle 24bpp pixels)
103 .set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED
105 /* Prefetch distance in pixels for simple prefetch */
106 .set PREFETCH_DISTANCE_SIMPLE, 64
109 * Implementation of pixman_composite_over_8888_0565_asm_neon
111 * This function takes a8r8g8b8 source buffer, r5g6b5 destination buffer and
112 * performs OVER compositing operation. Function fast_composite_over_8888_0565
113 * from pixman-fast-path.c does the same in C and can be used as a reference.
115 * First we need to have some NEON assembly code which can do the actual
116 * operation on the pixels and provide it to the template macro.
118 * Template macro quite conveniently takes care of emitting all the necessary
119 * code for memory reading and writing (including quite tricky cases of
120 * handling unaligned leading/trailing pixels), so we only need to deal with
121 * the data in NEON registers.
123 * NEON registers allocation in general is recommended to be the following:
124 * d0, d1, d2, d3 - contain loaded source pixel data
125 * d4, d5, d6, d7 - contain loaded destination pixels (if they are needed)
126 * d24, d25, d26, d27 - contain loaded mask pixel data (if mask is used)
127 * d28, d29, d30, d31 - place for storing the result (destination pixels)
129 * As can be seen above, four 64-bit NEON registers are used for keeping
130 * intermediate pixel data and up to 8 pixels can be processed in one step
131 * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp).
133 * This particular function uses the following registers allocation:
134 * d0, d1, d2, d3 - contain loaded source pixel data
135 * d4, d5 - contain loaded destination pixels (they are needed)
136 * d28, d29 - place for storing the result (destination pixels)
140 * Step one. We need to have some code to do some arithmetics on pixel data.
141 * This is implemented as a pair of macros: '*_head' and '*_tail'. When used
142 * back-to-back, they take pixel data from {d0, d1, d2, d3} and {d4, d5},
143 * perform all the needed calculations and write the result to {d28, d29}.
144 * The rationale for having two macros and not just one will be explained
145 * later. In practice, any single monolithic function which does the work can
146 * be split into two parts in any arbitrary way without affecting correctness.
148 * There is one special trick here too. Common template macro can optionally
149 * make our life a bit easier by doing R, G, B, A color components
150 * deinterleaving for 32bpp pixel formats (and this feature is used in
151 * 'pixman_composite_over_8888_0565_asm_neon' function). So it means that
152 * instead of having 8 packed pixels in {d0, d1, d2, d3} registers, we
153 * actually use d0 register for blue channel (a vector of eight 8-bit
154 * values), d1 register for green, d2 for red and d3 for alpha. This
155 * simple conversion can be also done with a few NEON instructions:
157 * Packed to planar conversion:
163 * Planar to packed conversion:
169 * But pixel can be loaded directly in planar format using VLD4.8 NEON
170 * instruction. It is 1 cycle slower than VLD1.32, so this is not always
171 * desirable, that's why deinterleaving is optional.
173 * But anyway, here is the code:
177 * OK, now we got almost everything that we need. Using the above two
178 * macros, the work can be done right. But now we want to optimize
179 * it a bit. ARM Cortex-A8 is an in-order core, and benefits really
180 * a lot from good code scheduling and software pipelining.
182 * Let's construct some code, which will run in the core main loop.
183 * Some pseudo-code of the main loop will look like this:
191 * It may look a bit weird, but this setup allows to hide instruction
192 * latencies better and also utilize dual-issue capability more
193 * efficiently (make pairs of load-store and ALU instructions).
195 * So what we need now is a '*_tail_head' macro, which will be used
196 * in the core main loop. A trivial straightforward implementation
197 * of this macro would look like this:
199 * pixman_composite_over_8888_0565_process_pixblock_tail
200 * vst1.16 {d28, d29}, [DST_W, :128]!
201 * vld1.16 {d4, d5}, [DST_R, :128]!
202 * vld4.32 {d0, d1, d2, d3}, [SRC]!
203 * pixman_composite_over_8888_0565_process_pixblock_head
206 * Now it also got some VLD/VST instructions. We simply can't move from
207 * processing one block of pixels to the other one with just arithmetics.
208 * The previously processed data needs to be written to memory and new
209 * data needs to be fetched. Fortunately, this main loop does not deal
210 * with partial leading/trailing pixels and can load/store a full block
211 * of pixels in a bulk. Additionally, destination buffer is already
212 * 16 bytes aligned here (which is good for performance).
214 * New things here are DST_R, DST_W, SRC and MASK identifiers. These
215 * are the aliases for ARM registers which are used as pointers for
216 * accessing data. We maintain separate pointers for reading and writing
217 * destination buffer (DST_R and DST_W).
219 * Another new thing is 'cache_preload' macro. It is used for prefetching
220 * data into CPU L2 cache and improve performance when dealing with large
221 * images which are far larger than cache size. It uses one argument
222 * (actually two, but they need to be the same here) - number of pixels
223 * in a block. Looking into 'pixman-arm-neon-asm.h' can provide some
224 * details about this macro. Moreover, if good performance is needed
225 * the code from this macro needs to be copied into '*_tail_head' macro
226 * and mixed with the rest of code for optimal instructions scheduling.
227 * We are actually doing it below.
229 * Now after all the explanations, here is the optimized code.
230 * Different instruction streams (originating from '*_head', '*_tail'
231 * and 'cache_preload' macro) use different indentation levels for
232 * better readability. Actually taking the code from one of these
233 * indentation levels and ignoring a few VLD/VST instructions would
234 * result in exactly the code from '*_head', '*_tail' or 'cache_preload'
239 * And now the final part. We are using 'generate_composite_function' macro
240 * to put all the stuff together. We are specifying the name of the function
241 * which we want to get, number of bits per pixel for the source, mask and
242 * destination (0 if unused, like mask in this case). Next come some bit
244 * FLAG_DST_READWRITE - tells that the destination buffer is both read
245 * and written, for write-only buffer we would use
246 * FLAG_DST_WRITEONLY flag instead
247 * FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data
248 * and separate color channels for 32bpp format.
249 * The next things are:
250 * - the number of pixels processed per iteration (8 in this case, because
251 * that's the maximum what can fit into four 64-bit NEON registers).
252 * - prefetch distance, measured in pixel blocks. In this case it is 5 times
253 * by 8 pixels. That would be 40 pixels, or up to 160 bytes. Optimal
254 * prefetch distance can be selected by running some benchmarks.
256 * After that we specify some macros, these are 'default_init',
257 * 'default_cleanup' here which are empty (but it is possible to have custom
258 * init/cleanup macros to be able to save/restore some extra NEON registers
259 * like d8-d15 or do anything else) followed by
260 * 'pixman_composite_over_8888_0565_process_pixblock_head',
261 * 'pixman_composite_over_8888_0565_process_pixblock_tail' and
262 * 'pixman_composite_over_8888_0565_process_pixblock_tail_head'
263 * which we got implemented above.
265 * The last part is the NEON registers allocation scheme.
268 /******************************************************************************/
270 /******************************************************************************/
/*
 * OUT_REVERSE head: start computing dest = dest * (255 - src.alpha).
 * Deinterleaved source is in {d0-d3} (d3 = alpha), destination in
 * {d4-d7}; d24 receives the inverted alpha used as the multiplier.
 * NOTE(review): only the q10/q11 multiplies (for d6/d7) are visible in
 * this excerpt -- the matching q8/q9 multiplies for d4/d5 and the
 * closing .endm appear to be elided; confirm against the full file.
 */
271 .macro pixman_composite_out_reverse_8888_8888_process_pixblock_head
272 vmvn.8 d24, d3 /* get inverted alpha */
273 /* do alpha blending */
276 vmull.u8 q10, d24, d6
277 vmull.u8 q11, d24, d7
/*
 * OUT_REVERSE tail: take the 16-bit per-channel products in q8-q11 and
 * reduce them back to 8 bits with the standard NEON "divide by 255"
 * idiom (rounding shift right by 8, then add-high-halves-and-narrow).
 * Results land in {d28-d31}, the conventional result registers.
 * NOTE(review): the closing .endm is not visible in this excerpt.
 */
280 .macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail
281 vrshr.u16 q14, q8, #8
282 vrshr.u16 q15, q9, #8
283 vrshr.u16 q12, q10, #8
284 vrshr.u16 q13, q11, #8
285 vraddhn.u16 d28, q14, q8
286 vraddhn.u16 d29, q15, q9
287 vraddhn.u16 d30, q12, q10
288 vraddhn.u16 d31, q13, q11
291 /******************************************************************************/
/*
 * OVER head: OVER is dest = src + dest * (255 - src.alpha), so the
 * multiply stage is identical to OUT_REVERSE and is simply reused.
 * NOTE(review): the closing .endm is not visible in this excerpt.
 */
293 .macro pixman_composite_over_8888_8888_process_pixblock_head
294 pixman_composite_out_reverse_8888_8888_process_pixblock_head
/*
 * OVER tail: finish the OUT_REVERSE reduction (result in q14/q15),
 * then add the source pixels {q0, q1} with unsigned saturation to
 * complete the OVER operation.
 * NOTE(review): the closing .endm is not visible in this excerpt.
 */
297 .macro pixman_composite_over_8888_8888_process_pixblock_tail
298 pixman_composite_out_reverse_8888_8888_process_pixblock_tail
299 vqadd.u8 q14, q0, q14
300 vqadd.u8 q15, q1, q15
/*
 * Interleaved tail+head for the main loop: finishes the previous pixel
 * block (vrshr/vraddhn/vqadd), loads the next destination block, kicks
 * off the next block's multiplies, and mixes in the advanced-prefetch
 * bookkeeping ("PF" lines: per-scanline PLD and stride advancing).
 * NOTE(review): the vmull instructions here multiply by d22, but the
 * 'vmvn.8 d22, d3' that should produce that inverted alpha -- along
 * with the q8/q9 multiplies, some PF lines and the closing .endm --
 * is not visible in this excerpt; confirm against the full file.
 */
303 .macro pixman_composite_over_8888_8888_process_pixblock_tail_head
304 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
305 vrshr.u16 q14, q8, #8
306 PF add PF_X, PF_X, #8
308 vrshr.u16 q15, q9, #8
309 vrshr.u16 q12, q10, #8
310 vrshr.u16 q13, q11, #8
311 PF addne PF_X, PF_X, #8
312 PF subne PF_CTL, PF_CTL, #1
313 vraddhn.u16 d28, q14, q8
314 vraddhn.u16 d29, q15, q9
316 vraddhn.u16 d30, q12, q10
317 vraddhn.u16 d31, q13, q11
318 vqadd.u8 q14, q0, q14
319 vqadd.u8 q15, q1, q15
321 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
323 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
324 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
325 PF subge PF_X, PF_X, ORIG_W
327 PF subges PF_CTL, PF_CTL, #0x10
329 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
330 vmull.u8 q10, d22, d6
331 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
332 vmull.u8 q11, d22, d7
/*
 * Instantiate the full OVER a8r8g8b8 -> a8r8g8b8 fast path from the
 * template, using the head/tail/tail_head macros defined above.
 * NOTE(review): the default_init/default_cleanup argument lines appear
 * to be elided from this excerpt.
 */
335 generate_composite_function \
336 pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \
337 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
338 8, /* number of pixels, processed in a single block */ \
339 5, /* prefetch distance */ \
342 pixman_composite_over_8888_8888_process_pixblock_head, \
343 pixman_composite_over_8888_8888_process_pixblock_tail, \
344 pixman_composite_over_8888_8888_process_pixblock_tail_head
/*
 * Single-scanline variant of the same OVER operation (no prefetch
 * distance parameter; used for one scanline at a time, e.g. from the
 * general compositing path).
 * NOTE(review): the init/cleanup argument lines appear to be elided
 * from this excerpt.
 */
346 generate_composite_function_single_scanline \
347 pixman_composite_scanline_over_asm_neon, 32, 0, 32, \
348 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
349 8, /* number of pixels, processed in a single block */ \
352 pixman_composite_over_8888_8888_process_pixblock_head, \
353 pixman_composite_over_8888_8888_process_pixblock_tail, \
354 pixman_composite_over_8888_8888_process_pixblock_tail_head
356 /******************************************************************************/
/*
 * OVER with solid (n) source, head: the solid color and its inverted
 * alpha are prepared once by the init macro below, so the head only
 * multiplies destination channels by the cached inverted alpha (d24).
 * NOTE(review): only the q10/q11 multiplies are visible here -- the
 * q8/q9 ones for d4/d5 and the closing .endm appear to be elided.
 */
358 .macro pixman_composite_over_n_8888_process_pixblock_head
359 /* deinterleaved source pixels in {d0, d1, d2, d3} */
360 /* inverted alpha in {d24} */
361 /* destination pixels in {d4, d5, d6, d7} */
364 vmull.u8 q10, d24, d6
365 vmull.u8 q11, d24, d7
/*
 * OVER-with-solid-source tail: same divide-by-255 reduction as the
 * generic OVER tail (using q2/q3 as scratch instead of q12/q13), then
 * saturating add of the solid source {q0, q1}; result in {d28-d31}.
 * NOTE(review): the closing .endm is not visible in this excerpt.
 */
368 .macro pixman_composite_over_n_8888_process_pixblock_tail
369 vrshr.u16 q14, q8, #8
370 vrshr.u16 q15, q9, #8
371 vrshr.u16 q2, q10, #8
372 vrshr.u16 q3, q11, #8
373 vraddhn.u16 d28, q14, q8
374 vraddhn.u16 d29, q15, q9
375 vraddhn.u16 d30, q2, q10
376 vraddhn.u16 d31, q3, q11
377 vqadd.u8 q14, q0, q14
378 vqadd.u8 q15, q1, q15
/*
 * Interleaved tail+head for the solid-source OVER main loop.  The
 * inverted alpha in d24 is loop-invariant (set once in the init macro),
 * so unlike the generic 8888_8888 version no per-block vmvn is needed;
 * only destination prefetch ("PF ... PF_DST") is performed since there
 * is no source buffer to prefetch.
 * NOTE(review): several PF lines, the q8/q9 multiplies and the closing
 * .endm appear to be elided from this excerpt.
 */
381 .macro pixman_composite_over_n_8888_process_pixblock_tail_head
382 vrshr.u16 q14, q8, #8
383 vrshr.u16 q15, q9, #8
384 vrshr.u16 q2, q10, #8
385 vrshr.u16 q3, q11, #8
386 vraddhn.u16 d28, q14, q8
387 vraddhn.u16 d29, q15, q9
388 vraddhn.u16 d30, q2, q10
389 vraddhn.u16 d31, q3, q11
390 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
391 vqadd.u8 q14, q0, q14
392 PF add PF_X, PF_X, #8
394 PF addne PF_X, PF_X, #8
395 PF subne PF_CTL, PF_CTL, #1
396 vqadd.u8 q15, q1, q15
399 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
401 PF subge PF_X, PF_X, ORIG_W
402 vmull.u8 q10, d24, d6
403 PF subges PF_CTL, PF_CTL, #0x10
404 vmull.u8 q11, d24, d7
405 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
406 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
/*
 * Init for solid-source OVER: fetch the 32-bit solid color from the
 * stack-passed arguments, then cache its inverted alpha in d24 for the
 * whole run.
 * NOTE(review): the vdup instructions that broadcast the color's B/G/R/A
 * bytes into d0-d3 (between the vld1 and the vmvn) and the closing
 * .endm appear to be elided from this excerpt.
 */
409 .macro pixman_composite_over_n_8888_init
410 add DUMMY, sp, #ARGS_STACK_OFFSET
411 vld1.32 {d3[0]}, [DUMMY]
416 vmvn.8 d24, d3 /* get inverted alpha */
/*
 * Instantiate OVER with solid source -> a8r8g8b8.  Source bpp is 0
 * (no source buffer); the custom init macro loads the solid color.
 * Note the head/tail reuse the generic 8888_8888 macros while only the
 * tail_head is the specialized solid-source version.
 * NOTE(review): the default_cleanup argument line appears to be elided
 * from this excerpt.
 */
419 generate_composite_function \
420 pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \
421 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
422 8, /* number of pixels, processed in a single block */ \
423 5, /* prefetch distance */ \
424 pixman_composite_over_n_8888_init, \
426 pixman_composite_over_8888_8888_process_pixblock_head, \
427 pixman_composite_over_8888_8888_process_pixblock_tail, \
428 pixman_composite_over_n_8888_process_pixblock_tail_head
430 /******************************************************************************/
/*
 * SRC with solid source is a plain fill: no per-pixel arithmetic, so
 * the head is empty (body/.endm not visible in this excerpt).
 */
432 .macro pixman_composite_src_n_8888_process_pixblock_head
/* Empty tail for the solid fill (body/.endm not visible in this excerpt). */
435 .macro pixman_composite_src_n_8888_process_pixblock_tail
/*
 * Solid-fill main loop body: just store the pre-broadcast color block
 * {d0-d3} (prepared by the init macro) to the aligned destination.
 * NOTE(review): the closing .endm is not visible in this excerpt.
 */
438 .macro pixman_composite_src_n_8888_process_pixblock_tail_head
439 vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
/*
 * Init for the solid fill: fetch the 32-bit color from the stack-passed
 * arguments into d0[0].
 * NOTE(review): the instructions that replicate the color across
 * d0-d3 and the closing .endm appear to be elided from this excerpt.
 */
442 .macro pixman_composite_src_n_8888_init
443 add DUMMY, sp, #ARGS_STACK_OFFSET
444 vld1.32 {d0[0]}, [DUMMY]
/* Cleanup hook for the solid fill (body/.endm not visible in this excerpt). */
450 .macro pixman_composite_src_n_8888_cleanup
/*
 * Instantiate the solid fill.  Destination is write-only, prefetch is
 * disabled (0), and the base registers are remapped to 0 so that the
 * template's load/store code uses d0-d3 for everything.
 * NOTE(review): the final argument line (mask basereg) appears to be
 * elided from this excerpt.
 */
453 generate_composite_function \
454 pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \
455 FLAG_DST_WRITEONLY, \
456 8, /* number of pixels, processed in a single block */ \
457 0, /* prefetch distance */ \
458 pixman_composite_src_n_8888_init, \
459 pixman_composite_src_n_8888_cleanup, \
460 pixman_composite_src_n_8888_process_pixblock_head, \
461 pixman_composite_src_n_8888_process_pixblock_tail, \
462 pixman_composite_src_n_8888_process_pixblock_tail_head, \
463 0, /* dst_w_basereg */ \
464 0, /* dst_r_basereg */ \
465 0, /* src_basereg */ \
468 /******************************************************************************/
/*
 * Plain 32bpp copy: no per-pixel arithmetic, so the head is empty
 * (body/.endm not visible in this excerpt).
 */
470 .macro pixman_composite_src_8888_8888_process_pixblock_head
/* Empty tail for the 32bpp copy (body/.endm not visible in this excerpt). */
473 .macro pixman_composite_src_8888_8888_process_pixblock_tail
/*
 * Copy main loop body: store the block previously loaded into {d0-d3}
 * by the template's source-reading code.
 * NOTE(review): the matching vld1 of the next source block and the
 * closing .endm are not visible in this excerpt.
 */
476 .macro pixman_composite_src_8888_8888_process_pixblock_tail_head
477 vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
/*
 * Instantiate the straight 32bpp -> 32bpp copy.  Write-only destination
 * and a larger prefetch distance (10 blocks) since the loop is purely
 * memory-bound; base registers remapped to 0 as in the solid fill.
 * NOTE(review): the init/cleanup argument lines and the final mask
 * basereg line appear to be elided from this excerpt.
 */
482 generate_composite_function \
483 pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \
484 FLAG_DST_WRITEONLY, \
485 8, /* number of pixels, processed in a single block */ \
486 10, /* prefetch distance */ \
489 pixman_composite_src_8888_8888_process_pixblock_head, \
490 pixman_composite_src_8888_8888_process_pixblock_tail, \
491 pixman_composite_src_8888_8888_process_pixblock_tail_head, \
492 0, /* dst_w_basereg */ \
493 0, /* dst_r_basereg */ \
494 0, /* src_basereg */ \
497 /******************************************************************************/