2 * Copyright © 2009 Nokia Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
23 * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com)
27 * Copyright (c) 2018 RISC OS Open Ltd
29 * This software is provided 'as-is', without any express or implied
30 * warranty. In no event will the authors be held liable for any damages
31 * arising from the use of this software.
33 * Permission is granted to anyone to use this software for any purpose,
34 * including commercial applications, and to alter it and redistribute it
35 * freely, subject to the following restrictions:
37 * 1. The origin of this software must not be misrepresented; you must not
38 * claim that you wrote the original software. If you use this software
39 * in a product, an acknowledgment in the product documentation would be
40 * appreciated but is not required.
41 * 2. Altered source versions must be plainly marked as such, and must not be
42 * misrepresented as being the original software.
43 * 3. This notice may not be removed or altered from any source distribution.
46 /* Prevent the stack from becoming executable for no reason... */
47 #if defined(__linux__) && defined(__ELF__)
48 .section .note.GNU-stack,"",%progbits
55 .eabi_attribute 10, 0 /* suppress Tag_FP_arch */
56 .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
61 #include "pixman-arm-asm.h"
62 #include "pixman-arm-neon-asm.h"
64 /* Global configuration options and preferences */
67 * The code can optionally make use of unaligned memory accesses to improve
68 * performance of handling leading/trailing pixels for each scanline.
69 * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0 for
70 * example in linux if unaligned memory accesses are not configured to
71 * generate exceptions.
73 .set RESPECT_STRICT_ALIGNMENT, 1
76 * Set default prefetch type. There is a choice between the following options:
78 * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work
79 * as NOP to workaround some HW bugs or for whatever other reason)
81 * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
82 * advanced prefetch introduces heavy overhead)
84 * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
85 * which can run ARM and NEON instructions simultaneously so that extra ARM
86 * instructions do not add (many) extra cycles, but improve prefetch efficiency)
88 * Note: some types of function can't support advanced prefetch and fall back
89 * to the simple one (those which handle 24bpp pixels)
91 .set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED
93 /* Prefetch distance in pixels for simple prefetch */
94 .set PREFETCH_DISTANCE_SIMPLE, 64
96 /******************************************************************************/
98 /* We can actually do significantly better than the Pixman macros, at least for
99 * the case of fills, by using a carefully scheduled inner loop. Cortex-A53
100 * shows an improvement of up to 78% in ideal cases (large fills to L1 cache).
/* generate_fillrect_function name, bpp, log2Bpp
 *
 * Emits one rectangle-fill routine called `name`. `bpp` is the element width
 * used for the NEON loads/stores and `log2Bpp` is the shift that converts a
 * pixel count to a byte count (the instantiations in this file pass 32/2,
 * 16/1 and 8/0). Byte quantities in the body are written as
 * "#bytes >> log2Bpp" so they can be compared with pixel counts.
 *
 * NOTE(review): the embedded line numbers in this excerpt are not contiguous;
 * the branches, several labels (e.g. 1:, 5:, 51:) and the closing .endm are
 * not visible here. Confirm every remark below against the complete file. */
103 .macro generate_fillrect_function name, bpp, log2Bpp
105 * void name(int32_t w, int32_t h, uint8_t *dst, int32_t dst_stride, uint8_t src);
109 * a3 = pointer to top-left destination pixel
110 * a4 = stride, pixels
111 * [sp] = pixel value to fill with
112 * Within the function:
113 * v1 = width remaining
115 * v3 = alternate pointer
116 * ip = data ARM register
118 pixman_asm_function name
/* NOTE(review): the prototype above spells the fill value as uint8_t, but it
 * is loaded below as a bpp-wide element; presumably the 16/32bpp variants are
 * declared with wider types on the C side - confirm. */
/* Load the fill value from the stack and replicate it into every lane of
 * d0/d1 (q0), then again into d2/d3 (q1), giving 32 bytes of fill pattern. */
119 vld1.\bpp {d0[],d1[]}, [sp]
121 vld1.\bpp {d2[],d3[]}, [sp]
/* Compare the width with (15+64) bytes expressed in pixels. The conditional
 * branch that consumes these flags is not visible in this excerpt. */
122 cmp a1, #(15+64) >> \log2Bpp
133 rsb v3, v3, #16 /* number of leading bytes until 16-byte aligned */
/* Convert the leading byte count to pixels and deduct it from the width
 * remaining. */
134 sub v1, v1, v3, lsr #\log2Bpp
138 tst a3, #1 /* bit 0 unaffected by rsb so can avoid register interlock */
142 tst a3, #2 /* bit 1 unaffected by rsb (assuming halfword alignment) so can avoid register interlock */
149 2: sub v1, v1, #64 >> \log2Bpp /* simplifies inner loop termination */
/* Inner loop body (label 3): two 32-byte stores of q0-q1, one through a3 and
 * one through v3 (the "alternate pointer"), each carrying a 128-bit alignment
 * hint and post-incremented by v2. v1 drops by 64 bytes' worth of pixels per
 * pass. The loop-closing branch is not visible in this excerpt. */
152 3: vst1.\bpp {q0-q1}, [a3 :128], v2
153 subs v1, v1, #64 >> \log2Bpp
154 vst1.\bpp {q0-q1}, [v3 :128], v2
156 /* Trailing pixels */
/* Shifting the remaining count left by 27+log2Bpp moves bit 5 of the
 * remaining byte count (32 bytes) into the carry flag and bit 4 (16 bytes)
 * into the sign flag. The 32-byte and 16-byte stores that follow are
 * presumably skipped or taken on those flags by branches not shown here. */
157 4: movs v1, v1, lsl #27 + \log2Bpp
159 vst1.\bpp {q0-q1}, [a3 :128]!
161 vst1.\bpp {q0}, [a3 :128]!
/* A further shift by 2 exposes the 8-byte bit in carry and the 4-byte bit in
 * the sign flag; the stores that use them are not visible in this excerpt. */
162 6: movs v1, v1, lsl #2
/* Advance the destination pointer by a4 pixels, scaled to bytes. */
173 add a3, a3, a4, lsl #\log2Bpp
/* Labels 53/54/56 repeat the store pattern above without the :128 alignment
 * hints. NOTE(review): presumably this is the path for narrow rows that
 * skips the alignment preamble - confirm against the missing branches. */
192 53: cmp v1, #32 >> \log2Bpp
194 vst1.\bpp {q0-q1}, [a3]!
195 sub v1, v1, #32 >> \log2Bpp
196 /* Trailing pixels */
/* Same flag trick as at label 4: carry = 32-byte bit, sign = 16-byte bit. */
197 54: movs v1, v1, lsl #27 + \log2Bpp
199 vst1.\bpp {q0-q1}, [a3]!
201 vst1.\bpp {q0}, [a3]!
/* Same as label 6: carry = 8-byte bit, sign = 4-byte bit. */
202 56: movs v1, v1, lsl #2
/* Advance the destination pointer by a4 pixels, scaled to bytes. */
213 add a3, a3, a4, lsl #\log2Bpp
/* Instantiate the fill routine for each supported pixel size. The arguments
 * are: exported symbol name, bits per pixel, log2(bytes per pixel). */
220 generate_fillrect_function FillRect32ARMNEONAsm, 32, 2
221 generate_fillrect_function FillRect16ARMNEONAsm, 16, 1
222 generate_fillrect_function FillRect8ARMNEONAsm, 8, 0
224 /******************************************************************************/
/* Head stage of the per-pixel-alpha 32bpp -> 32bpp blend for one pixel block.
 * The tail_head macro in this file loads d0-d3 from SRC and d4-d7 from DST_R
 * with vld4.8, so each d register holds one byte channel for 8 pixels; d3 is
 * the source alpha and d7 the destination alpha (see the comments below).
 *
 * NOTE(review): the embedded line numbers skip 229, 231-234, 236 and 238, so
 * the instructions that initialise q14 and that produce q0, q1, q3 and the
 * second q2 value are not visible in this excerpt; the closing .endm is also
 * missing. Confirm against the complete file. */
226 .macro RGBtoRGBPixelAlpha_process_pixblock_head
227 vmvn d30, d3 /* get inverted source alpha */
228 vmov d31, d7 /* dest alpha is always unchanged */
/* Widening multiply-accumulate: q14 += d4 * d30, i.e. the first destination
 * channel weighted by the inverted source alpha is added to whatever q14
 * already holds (presumably the source-channel * alpha product). */
230 vmlal.u8 q14, d4, d30
/* q2 = q14 / 256 with rounding; together with the rounding add-and-narrow
 * that follows, this yields the usual rounded approximation of q14 / 255,
 * narrowed to 8 bits per lane in d28. */
235 vrshr.u16 q2, q14, #8
237 vraddhn.u16 d28, q14, q2
/* Same narrowing step for the other two colour channels (results in d29 and
 * d30). */
239 vraddhn.u16 d29, q0, q3
240 vraddhn.u16 d30, q1, q2
/* Tail stage of the 32bpp -> 32bpp per-pixel-alpha blend.
 * NOTE(review): no body lines and no .endm are visible in this excerpt.
 * Presumably the tail is intentionally empty because the head macro already
 * finishes with the narrowing vraddhn steps - confirm. */
243 .macro RGBtoRGBPixelAlpha_process_pixblock_tail
/* Combined tail + head stage of the 32bpp -> 32bpp per-pixel-alpha blend.
 * It stores the previous block's result (d28-d31) while loading the next
 * block's source (d0-d3) and destination (d4-d7) pixels, and repeats the
 * head arithmetic interleaved with prefetch bookkeeping.
 *
 * Lines prefixed with PF only touch the prefetch state (PF_X, PF_CTL,
 * PF_SRC, PF_DST); the PF macro itself is defined outside this excerpt.
 *
 * NOTE(review): the embedded line numbers have gaps (e.g. 251, 256, 259-260,
 * 262, 264, 266, 270, 273), so some arithmetic and PF lines, and the closing
 * .endm, are not visible here. Confirm against the complete file. */
247 .macro RGBtoRGBPixelAlpha_process_pixblock_tail_head
/* Next 8 source pixels, de-interleaved into one d register per byte channel. */
248 vld4.8 {d0-d3}, [SRC]!
249 PF add PF_X, PF_X, #8
/* Write back the 8 pixels computed by the previous iteration. */
250 vst4.8 {d28-d31}, [DST_W :128]!
/* Next 8 destination pixels, de-interleaved the same way. */
252 vld4.8 {d4-d7}, [DST_R :128]!
253 PF addne PF_X, PF_X, #8
254 vmvn d30, d3 /* get inverted source alpha */
255 vmov d31, d7 /* dest alpha is always unchanged */
257 PF subne PF_CTL, PF_CTL, #1
/* q14 += first destination channel * inverted source alpha. */
258 vmlal.u8 q14, d4, d30
261 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
263 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
265 PF subge PF_X, PF_X, ORIG_W
267 PF subges PF_CTL, PF_CTL, #0x10
/* Rounded /256 step; combined with the vraddhn below it gives the rounded
 * approximation of q14 / 255. */
268 vrshr.u16 q2, q14, #8
/* Byte loads into DUMMY with base-register writeback: they move the prefetch
 * pointers on by one stride; the loaded value itself is not used in the
 * lines visible here. */
269 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
271 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
272 vraddhn.u16 d28, q14, q2
274 vraddhn.u16 d29, q0, q3
275 vraddhn.u16 d30, q1, q2
/* Instantiate BlitRGBtoRGBPixelAlphaARMNEONAsm from the three pixblock macros
 * of the RGBtoRGBPixelAlpha family. The destination is both read and
 * written, and 32bpp pixels are de-interleaved (see the flags).
 * NOTE(review): the numeric arguments 32, 0, 32 are presumably source bpp,
 * mask bpp and destination bpp as defined by generate_composite_function in
 * pixman-arm-neon-asm.h - confirm there. The embedded line numbers skip
 * 283-284, so two arguments of this invocation are not visible here. */
278 generate_composite_function \
279 BlitRGBtoRGBPixelAlphaARMNEONAsm, 32, 0, 32, \
280 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
281 8, /* number of pixels, processed in a single block */ \
282 5, /* prefetch distance */ \
285 RGBtoRGBPixelAlpha_process_pixblock_head, \
286 RGBtoRGBPixelAlpha_process_pixblock_tail, \
287 RGBtoRGBPixelAlpha_process_pixblock_tail_head
289 /******************************************************************************/
/* Head stage of the per-pixel-alpha 32bpp -> 16bpp blend for one pixel block.
 *
 * NOTE(review): the embedded line numbers skip 292-296, 298-306 and 308, so
 * most of this macro (including whatever sets up d6, d24, q13 and the
 * initial q14/q15 values) and its closing .endm are not visible in this
 * excerpt. Confirm every remark below against the complete file. */
291 .macro ARGBto565PixelAlpha_process_pixblock_head
/* Narrow each 16-bit lane of q2 to its high byte. q2 (d4-d5) is what the
 * tail_head macro in this file loads from DST_R, i.e. presumably eight
 * 16bpp destination pixels. */
297 vshrn.u16 d25, q2, #8
/* Widening multiply-accumulates: q14 += d24 * d6 and q15 += d25 * d6.
 * Presumably d6 holds the inverted source alpha and d24/d25 destination
 * channels - the lines establishing that are not visible here. */
307 vmlal.u8 q14, d24, d6
309 vmlal.u8 q15, d25, d6
/* Tail stage of the 32bpp -> 16bpp per-pixel-alpha blend: packs three
 * per-channel results into 16-bit lanes of q14.
 *
 * NOTE(review): the embedded line numbers skip 313-318, so the instructions
 * preparing q13/q14/q15 for packing, and the closing .endm, are not visible
 * in this excerpt. */
312 .macro ARGBto565PixelAlpha_process_pixblock_tail
/* Shift-left-and-insert: q13 goes into bits 5 and up of each lane (the low
 * 5 bits of q14 are kept), then q15 goes into bits 11 and up (the low 11
 * bits are kept), giving a 5/6/5 bit layout per 16-bit lane. */
319 vsli.u16 q14, q13, #5
320 vsli.u16 q14, q15, #11
/* Combined tail + head stage of the 32bpp -> 16bpp per-pixel-alpha blend.
 * It packs and stores the previous block's result while loading the next
 * block's source (d0-d3) and destination (d4-d5) pixels and restarting the
 * head arithmetic, interleaved with prefetch bookkeeping.
 *
 * Lines prefixed with PF only touch the prefetch state (PF_X, PF_CTL,
 * PF_SRC, PF_DST); the PF macro itself is defined outside this excerpt.
 *
 * NOTE(review): the embedded line numbers have large gaps (326-328, 330,
 * 332-334, 336, 345-349, 351, 353-360, 362), so most of the arithmetic and
 * the closing .endm are not visible here. Confirm against the complete file. */
323 .macro ARGBto565PixelAlpha_process_pixblock_tail_head
/* Next 8 source pixels, de-interleaved into one d register per byte channel. */
324 vld4.8 {d0-d3}, [SRC]!
325 PF add PF_X, PF_X, #8
329 PF addne PF_X, PF_X, #8
331 PF subne PF_CTL, PF_CTL, #1
335 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
337 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
/* Next 16 bytes of destination (q2), loaded without de-interleaving. */
338 vld1.8 {d4-d5}, [DST_R]!
339 PF subge PF_X, PF_X, ORIG_W
/* Pack the previous block: q13 into bits 5 and up, q15 into bits 11 and up
 * of each 16-bit lane of q14. */
340 vsli.u16 q14, q13, #5
341 PF subges PF_CTL, PF_CTL, #0x10
342 vsli.u16 q14, q15, #11
/* Byte load into DUMMY with base-register writeback: moves the source
 * prefetch pointer on by one stride. */
343 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
/* Store the 16 packed bytes of the previous block, with a 128-bit alignment
 * hint on the write pointer. */
344 vst1.8 {q14}, [DST_W :128]!
/* Start the next block: take the high byte of each 16-bit destination lane. */
350 vshrn.u16 d25, q2, #8
352 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
/* Widening multiply-accumulates: q14 += d24 * d6 and q15 += d25 * d6. */
361 vmlal.u8 q14, d24, d6
363 vmlal.u8 q15, d25, d6
/* Instantiate BlitARGBto565PixelAlphaARMNEONAsm from the three pixblock
 * macros of the ARGBto565PixelAlpha family. The destination is both read
 * and written, and 32bpp pixels are de-interleaved (see the flags).
 * NOTE(review): the numeric arguments 32, 0, 16 are presumably source bpp,
 * mask bpp and destination bpp as defined by generate_composite_function in
 * pixman-arm-neon-asm.h - confirm there. The embedded line numbers skip
 * 371-372, so two arguments of this invocation are not visible here. */
366 generate_composite_function \
367 BlitARGBto565PixelAlphaARMNEONAsm, 32, 0, 16, \
368 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
369 8, /* number of pixels, processed in a single block */ \
370 6, /* prefetch distance */ \
373 ARGBto565PixelAlpha_process_pixblock_head, \
374 ARGBto565PixelAlpha_process_pixblock_tail, \
375 ARGBto565PixelAlpha_process_pixblock_tail_head