2 * Copyright (C) 2011 University of Szeged
3 * Copyright (C) 2011 Zoltan Herczeg
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY UNIVERSITY OF SZEGED ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL UNIVERSITY OF SZEGED OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 #include "FEGaussianBlurNEON.h"
30 #if CPU(ARM_NEON) && COMPILER(GCC)
32 #include <wtf/Alignment.h>
// File-local byte table, declared through WTF_ALIGNED with an alignment
// argument of 16, and exposed via feGaussianBlurConstantsForNeon().
// NOTE(review): presumably this is the index vector that the all-channel
// routine loads into REMAP_NEON_ARM_Q and feeds to vtbl.8 -- confirm against
// the caller that fills in the painting-constants pointer. With a 16-byte
// table operand, indices 0/4/8/12 select the low byte of each 32-bit lane and
// index 16 is out of range (vtbl writes 0 for out-of-range indices).
36 static WTF_ALIGNED(unsigned char, s_FEGaussianBlurConstantsForNeon[], 16) = {
37 // Mapping from NEON to ARM registers.
38 0, 4, 8, 12, 16, 16, 16, 16
// Returns a pointer to the first byte of s_FEGaussianBlurConstantsForNeon so
// that code outside this file can reach the file-local (static) table.
41 unsigned char* feGaussianBlurConstantsForNeon()
43 return s_FEGaussianBlurConstantsForNeon;
// Two-level stringification: TOSTRING(x) lets the preprocessor macro-expand x
// first and then turns the result into a string literal; using ASSTRING
// directly would stringify the unexpanded argument text instead.
46 #define ASSTRING(str) #str
47 #define TOSTRING(value) ASSTRING(value)
// Byte offsets, as string literals, of the fields that the assembly routines
// read from the parameter block whose address arrives in r2. They are pasted
// into addressing expressions of the form "[r2, #<offset>]"; every field is
// read as a single 32-bit word.
49 #define STRIDE_OFFSET TOSTRING(0)
50 #define STRIDE_WIDTH_OFFSET TOSTRING(4)
51 #define STRIDE_LINE_OFFSET TOSTRING(8)
52 #define STRIDE_LINE_WIDTH_OFFSET TOSTRING(12)
53 #define REMAINING_STRIDES_OFFSET TOSTRING(16)
54 #define DISTANCE_LEFT_OFFSET TOSTRING(20)
55 #define DISTANCE_RIGHT_OFFSET TOSTRING(24)
56 #define INVERTED_KERNEL_SIZE_OFFSET TOSTRING(28)
57 #define PAINTING_CONSTANTS_OFFSET TOSTRING(32)
60 // Register allocation.
// Symbolic names for the ARM core registers shared by both assembly routines.
// NOTE(review): SOURCE_R, LEFT_R, RIGHT_R and STRIDE_R are used by the
// assembly, but their definitions are not visible in this excerpt.
62 #define DESTINATION_R "r1"
65 #define SOURCE_END_R "r4"
66 #define DESTINATION_END_R "r5"
68 #define STRIDE_WIDTH_R "r7"
69 #define STRIDE_LINE_R "r8"
70 #define SOURCE_LINE_END_R "r10"
71 #define DISTANCE_LEFT_R "r11"
72 #define DISTANCE_RIGHT_R "r12"
73 #define MAX_KERNEL_SIZE_R "lr"
// Aliases: a value that is only needed during one phase of a routine shares
// its register with a value that is only needed during another phase (for
// example, the inverted kernel size is broadcast into a NEON register before
// SOURCE_END_R is first written).
76 #define INIT_INVERTED_KERNEL_SIZE_R SOURCE_END_R
77 #define INIT_PAINTING_CONSTANTS_R DESTINATION_END_R
78 #define INIT_SUM_R LEFT_R
79 #define REMAINING_STRIDES_R SOURCE_LINE_END_R
// NEON registers. The inverted kernel size is broadcast to all lanes of q0.
// NOTE(review): SUM_Q, PIXEL_Q, PIXEL_D0, PIXEL_D1 and PIXEL_S1 are used by
// the assembly, but their definitions are not visible in this excerpt.
81 #define INVERTED_KERNEL_SIZE_Q "q0"
// 32-bit lanes of d4 and d5 in lane syntax. PIXEL_S2 ("s10") is the
// single-precision register name for the same 32 bits as d5[0]; the S-register
// spelling is what the vldr/vstr forms in CONDITIONAL_DATA_TRANSFER4 take.
86 #define PIXEL_D00 "d4[0]"
87 #define PIXEL_D01 "d4[1]"
89 #define PIXEL_D10 "d5[0]"
90 #define PIXEL_S2 "s10"
91 #define PIXEL_D11 "d5[1]"
// Scratch S register that keeps the remaining-strides count while the alpha
// routine's main loop uses REMAINING_STRIDES_R's core register for another value.
92 #define REMAINING_STRIDES_S0 "s12"
// Despite the _Q suffix this names a 64-bit D register.
94 #define REMAP_NEON_ARM_Q "d16"
// neonDrawAllChannelGaussianBlur
// Sliding-window average applied to all four bytes of each 32-bit pixel.
// On entry r1 is the destination pointer and r2 points to the parameter block
// described by the *_OFFSET macros. SOURCE_R must also be live on entry
// (NOTE(review): its register is not visible in this excerpt -- presumably the
// first argument register; confirm).
// For every outer iteration the routine builds a running per-byte sum in
// SUM_Q, writes sum * invertedKernelSize for each output pixel, then subtracts
// the pixel at LEFT_R and adds the pixel at RIGHT_R while those pointers are
// inside [SOURCE_R, SOURCE_END_R).
// Export the symbol and define its entry label.
97 ".globl " TOSTRING(neonDrawAllChannelGaussianBlur) NL
98 TOSTRING(neonDrawAllChannelGaussianBlur) ":" NL
// Save the registers restored at exit; lr is saved so it can be reused as
// MAX_KERNEL_SIZE_R and popped straight into pc on return.
99 "stmdb sp!, {r4-r8, r10, r11, lr}" NL
// Load the 32-bit parameters from the block addressed by r2.
100 "ldr " STRIDE_R ", [r2, #" STRIDE_OFFSET "]" NL
101 "ldr " STRIDE_WIDTH_R ", [r2, #" STRIDE_WIDTH_OFFSET "]" NL
102 "ldr " DISTANCE_LEFT_R ", [r2, #" DISTANCE_LEFT_OFFSET "]" NL
103 "ldr " DISTANCE_RIGHT_R ", [r2, #" DISTANCE_RIGHT_OFFSET "]" NL
104 "ldr " STRIDE_LINE_R ", [r2, #" STRIDE_LINE_OFFSET "]" NL
105 "ldr " SOURCE_LINE_END_R ", [r2, #" STRIDE_LINE_WIDTH_OFFSET "]" NL
106 "ldr " INIT_INVERTED_KERNEL_SIZE_R ", [r2, #" INVERTED_KERNEL_SIZE_OFFSET "]" NL
107 "ldr " INIT_PAINTING_CONSTANTS_R ", [r2, #" PAINTING_CONSTANTS_OFFSET "]" NL
109 // Initialize locals.
// Scale both distances by the stride so they can be added to pointers.
110 "mul " DISTANCE_LEFT_R ", " DISTANCE_LEFT_R ", " STRIDE_R NL
111 "mul " DISTANCE_RIGHT_R ", " DISTANCE_RIGHT_R ", " STRIDE_R NL
// MAX_KERNEL_SIZE_R = min(DISTANCE_RIGHT_R, STRIDE_WIDTH_R), unsigned.
112 "mov " MAX_KERNEL_SIZE_R ", " DISTANCE_RIGHT_R NL
113 "cmp " MAX_KERNEL_SIZE_R ", " STRIDE_WIDTH_R NL
114 "movcs " MAX_KERNEL_SIZE_R ", " STRIDE_WIDTH_R NL
// Turn the loaded length into an end pointer relative to SOURCE_R.
115 "add " SOURCE_LINE_END_R ", " SOURCE_LINE_END_R ", " SOURCE_R NL
// Broadcast the inverted kernel size (used as f32 by vmul.f32 below) to every lane.
116 "vdup.f32 " INVERTED_KERNEL_SIZE_Q ", " INIT_INVERTED_KERNEL_SIZE_R NL
// Load 8 bytes from the painting-constants pointer; they serve as the vtbl.8
// index vector in the blur loop.
117 "vld1.f32 { " REMAP_NEON_ARM_Q " }, [" INIT_PAINTING_CONSTANTS_R "]!" NL
119 ".allChannelMainLoop:" NL
121 // Initialize the sum variable.
122 "vmov.u32 " SUM_Q ", #0" NL
// Accumulate the pixels in [SOURCE_R, SOURCE_R + MAX_KERNEL_SIZE_R), stepping
// by STRIDE_R; skipped entirely when that range is empty.
123 "mov " INIT_SUM_R ", " SOURCE_R NL
124 "add " SOURCE_END_R ", " SOURCE_R ", " MAX_KERNEL_SIZE_R NL
125 "cmp " INIT_SUM_R ", " SOURCE_END_R NL
126 "bcs .allChannelInitSumDone" NL
127 ".allChannelInitSum:" NL
// Load one 32-bit pixel (post-incrementing by STRIDE_R) and widen it twice,
// u8 -> u16 -> u32. NOTE(review): this yields one byte per 32-bit lane of
// PIXEL_Q only if PIXEL_D0 is the low half of PIXEL_Q -- definitions not
// visible in this excerpt.
128 "vld1.u32 " PIXEL_D00 ", [" INIT_SUM_R "], " STRIDE_R NL
129 "vmovl.u8 " PIXEL_Q ", " PIXEL_D0 NL
130 "vmovl.u16 " PIXEL_Q ", " PIXEL_D0 NL
131 "vadd.u32 " SUM_Q ", " SUM_Q ", " PIXEL_Q NL
132 "cmp " INIT_SUM_R ", " SOURCE_END_R NL
133 "bcc .allChannelInitSum" NL
134 ".allChannelInitSumDone:" NL
// Set up the end pointers for this pass and the two window-edge pointers.
// SOURCE_END_R is re-purposed here: it now marks SOURCE_R + STRIDE_WIDTH_R.
137 "add " SOURCE_END_R ", " SOURCE_R ", " STRIDE_WIDTH_R NL
138 "add " DESTINATION_END_R ", " DESTINATION_R ", " STRIDE_WIDTH_R NL
139 "sub " LEFT_R ", " SOURCE_R ", " DISTANCE_LEFT_R NL
140 "add " RIGHT_R ", " SOURCE_R ", " DISTANCE_RIGHT_R NL
142 ".allChannelBlur:" NL
// Output = (float)sum * invertedKernelSize, converted back to u32 per lane.
143 "vcvt.f32.u32 " PIXEL_Q ", " SUM_Q NL
144 "vmul.f32 " PIXEL_Q ", " PIXEL_Q ", " INVERTED_KERNEL_SIZE_Q NL
145 "vcvt.u32.f32 " PIXEL_Q ", " PIXEL_Q NL
// Gather bytes from the PIXEL_D0..PIXEL_D1 pair using the index vector in
// REMAP_NEON_ARM_Q; only lane 0 of the result is stored (post-incrementing
// DESTINATION_R by STRIDE_R).
146 "vtbl.8 " PIXEL_D0 ", {" PIXEL_D0 "-" PIXEL_D1 "}, " REMAP_NEON_ARM_Q NL
147 "vst1.u32 " PIXEL_D00 ", [" DESTINATION_R "], " STRIDE_R NL
// Subtract the pixel at LEFT_R unless LEFT_R is still below SOURCE_R (unsigned).
149 "cmp " LEFT_R ", " SOURCE_R NL
150 "bcc .allChannelSkipLeft" NL
151 "vld1.u32 " PIXEL_D00 ", [" LEFT_R "]" NL
152 "vmovl.u8 " PIXEL_Q ", " PIXEL_D0 NL
153 "vmovl.u16 " PIXEL_Q ", " PIXEL_D0 NL
154 "vsub.u32 " SUM_Q ", " SUM_Q ", " PIXEL_Q NL
155 ".allChannelSkipLeft: " NL
// Add the pixel at RIGHT_R unless RIGHT_R has reached SOURCE_END_R (unsigned).
157 "cmp " RIGHT_R ", " SOURCE_END_R NL
158 "bcs .allChannelSkipRight" NL
159 "vld1.u32 " PIXEL_D00 ", [" RIGHT_R "]" NL
160 "vmovl.u8 " PIXEL_Q ", " PIXEL_D0 NL
161 "vmovl.u16 " PIXEL_Q ", " PIXEL_D0 NL
162 "vadd.u32 " SUM_Q ", " SUM_Q ", " PIXEL_Q NL
163 ".allChannelSkipRight: " NL
// Slide both window edges by one stride and loop until the destination
// pointer reaches DESTINATION_END_R.
165 "add " LEFT_R ", " LEFT_R ", " STRIDE_R NL
166 "add " RIGHT_R ", " RIGHT_R ", " STRIDE_R NL
167 "cmp " DESTINATION_R ", " DESTINATION_END_R NL
168 "bcc .allChannelBlur" NL
// Rewind DESTINATION_R by STRIDE_WIDTH_R, then step source and destination by
// STRIDE_LINE_R for the next outer iteration.
169 "sub " DESTINATION_R ", " DESTINATION_R ", " STRIDE_WIDTH_R NL
171 "add " SOURCE_R ", " SOURCE_R ", " STRIDE_LINE_R NL
172 "add " DESTINATION_R ", " DESTINATION_R ", " STRIDE_LINE_R NL
173 "cmp " SOURCE_R ", " SOURCE_LINE_END_R NL
174 "bcc .allChannelMainLoop" NL
// Restore the saved registers and return by popping the saved lr into pc.
176 "ldmia sp!, {r4-r8, r10, r11, pc}" NL
// DATA_TRANSFER4(command, base): emits four single-lane transfers using
// `command` (a load or store mnemonic) for lanes d4[0], d4[1], d5[0], d5[1].
// Each transfer post-increments `base` by STRIDE_LINE_R, so the four lanes
// map to four addresses STRIDE_LINE_R apart. The final `sub` rewinds `base`
// by 4 * STRIDE_LINE_R, leaving it unchanged overall.
179 #define DATA_TRANSFER4(command, base) \
180 command " " PIXEL_D00 ", [" base "], " STRIDE_LINE_R NL \
181 command " " PIXEL_D01 ", [" base "], " STRIDE_LINE_R NL \
182 command " " PIXEL_D10 ", [" base "], " STRIDE_LINE_R NL \
183 command " " PIXEL_D11 ", [" base "], " STRIDE_LINE_R NL \
184 "sub " base ", " base ", " STRIDE_LINE_R ", lsl #2" NL
// CONDITIONAL_DATA_TRANSFER4(command1, command2, base): variant of
// DATA_TRANSFER4 for a partial group. `command1` always transfers lane d4[0]
// and post-increments `base` by STRIDE_LINE_R. `command2` gets a "cs"
// condition suffix, so it transfers PIXEL_S1 only when REMAINING_STRIDES_R >= 2
// and PIXEL_S2 only when REMAINING_STRIDES_R >= 3 (unsigned compares). `base`
// is advanced unconditionally between the two and rewound by
// 2 * STRIDE_LINE_R at the end, leaving it unchanged overall.
// NOTE(review): PIXEL_S1's definition is not visible in this excerpt.
186 // The number of reads depends on REMAINING_STRIDES_R, but it is always >= 1 and <= 3
187 #define CONDITIONAL_DATA_TRANSFER4(command1, command2, base) \
188 command1 " " PIXEL_D00 ", [" base "], " STRIDE_LINE_R NL \
189 "cmp " REMAINING_STRIDES_R ", #2" NL \
190 command2 "cs " PIXEL_S1 ", [" base "]" NL \
191 "add " base ", " base ", " STRIDE_LINE_R NL \
192 "cmp " REMAINING_STRIDES_R ", #3" NL \
193 command2 "cs " PIXEL_S2 ", [" base "]" NL \
194 "sub " base ", " base ", " STRIDE_LINE_R ", lsl #1" NL
// neonDrawAlphaChannelGaussianBlur
// Sliding-window average applied only to bits 31:24 of each 32-bit pixel:
// loads are followed by a right shift of 24 and results are shifted left by 24
// before being stored, so the low 24 bits of every stored pixel are zero.
// On entry r1 is the destination pointer and r2 points to the parameter block
// described by the *_OFFSET macros; SOURCE_R must also be live
// (NOTE(review): its register is not visible in this excerpt -- confirm).
// The main loop handles four strides at a time, one per NEON lane; a second
// loop handles the 1-3 strides left over, as counted by the remaining-strides
// field.
// Export the symbol and define its entry label.
197 ".globl " TOSTRING(neonDrawAlphaChannelGaussianBlur) NL
198 TOSTRING(neonDrawAlphaChannelGaussianBlur) ":" NL
// Save the registers restored at exit; lr is reused as MAX_KERNEL_SIZE_R.
199 "stmdb sp!, {r4-r8, r10, r11, lr}" NL
// Load the 32-bit parameters from the block addressed by r2.
200 "ldr " STRIDE_R ", [r2, #" STRIDE_OFFSET "]" NL
201 "ldr " STRIDE_WIDTH_R ", [r2, #" STRIDE_WIDTH_OFFSET "]" NL
202 "ldr " DISTANCE_LEFT_R ", [r2, #" DISTANCE_LEFT_OFFSET "]" NL
203 "ldr " DISTANCE_RIGHT_R ", [r2, #" DISTANCE_RIGHT_OFFSET "]" NL
204 "ldr " STRIDE_LINE_R ", [r2, #" STRIDE_LINE_OFFSET "]" NL
205 "ldr " SOURCE_LINE_END_R ", [r2, #" STRIDE_LINE_WIDTH_OFFSET "]" NL
206 "ldr " INIT_INVERTED_KERNEL_SIZE_R ", [r2, #" INVERTED_KERNEL_SIZE_OFFSET "]" NL
// The remaining-strides count is parked in an S register because its core
// register (REMAINING_STRIDES_R) is SOURCE_LINE_END_R during the main loop.
207 "vldr.u32 " REMAINING_STRIDES_S0 ", [r2, #" REMAINING_STRIDES_OFFSET "]" NL
209 // Initialize locals.
// Scale both distances by the stride so they can be added to pointers.
210 "mul " DISTANCE_LEFT_R ", " DISTANCE_LEFT_R ", " STRIDE_R NL
211 "mul " DISTANCE_RIGHT_R ", " DISTANCE_RIGHT_R ", " STRIDE_R NL
// MAX_KERNEL_SIZE_R = min(DISTANCE_RIGHT_R, STRIDE_WIDTH_R), unsigned.
212 "mov " MAX_KERNEL_SIZE_R ", " DISTANCE_RIGHT_R NL
213 "cmp " MAX_KERNEL_SIZE_R ", " STRIDE_WIDTH_R NL
214 "movcs " MAX_KERNEL_SIZE_R ", " STRIDE_WIDTH_R NL
// Turn the loaded length into an end pointer relative to SOURCE_R.
215 "add " SOURCE_LINE_END_R ", " SOURCE_LINE_END_R ", " SOURCE_R NL
// Broadcast the inverted kernel size (used as f32 by vmul.f32 below) to every lane.
216 "vdup.f32 " INVERTED_KERNEL_SIZE_Q ", " INIT_INVERTED_KERNEL_SIZE_R NL
// If the end pointer equals SOURCE_R there is no full group of four; go
// straight to the leftover handling.
217 "cmp " SOURCE_LINE_END_R ", " SOURCE_R NL
218 "beq .alphaChannelEarlyLeave" NL
220 // Processing 4 strides in parallel.
222 ".alphaChannelMainLoop:" NL
224 // Initialize the sum variable.
225 "vmov.u32 " SUM_Q ", #0" NL
// Accumulate the pixels in [SOURCE_R, SOURCE_R + MAX_KERNEL_SIZE_R), stepping
// by STRIDE_R; skipped entirely when that range is empty.
226 "mov " INIT_SUM_R ", " SOURCE_R NL
227 "add " SOURCE_END_R ", " SOURCE_R ", " MAX_KERNEL_SIZE_R NL
228 "cmp " INIT_SUM_R ", " SOURCE_END_R NL
229 "bcs .alphaChannelInitSumDone" NL
230 ".alphaChannelInitSum:" NL
// Load four pixels (one per stride) and keep only bits 31:24 of each.
231 DATA_TRANSFER4("vld1.u32", INIT_SUM_R)
232 "vshr.u32 " PIXEL_Q ", " PIXEL_Q ", #24" NL
233 "vadd.u32 " SUM_Q ", " SUM_Q ", " PIXEL_Q NL
// DATA_TRANSFER4 leaves its base unchanged, so advance it explicitly.
234 "add " INIT_SUM_R ", " INIT_SUM_R ", " STRIDE_R NL
235 "cmp " INIT_SUM_R ", " SOURCE_END_R NL
236 "bcc .alphaChannelInitSum" NL
237 ".alphaChannelInitSumDone:" NL
// Set up the end pointers for this pass and the two window-edge pointers.
// SOURCE_END_R is re-purposed here: it now marks SOURCE_R + STRIDE_WIDTH_R.
240 "add " SOURCE_END_R ", " SOURCE_R ", " STRIDE_WIDTH_R NL
241 "add " DESTINATION_END_R ", " DESTINATION_R ", " STRIDE_WIDTH_R NL
242 "sub " LEFT_R ", " SOURCE_R ", " DISTANCE_LEFT_R NL
243 "add " RIGHT_R ", " SOURCE_R ", " DISTANCE_RIGHT_R NL
245 ".alphaChannelBlur:" NL
// Output = (float)sum * invertedKernelSize, converted back to u32 and moved
// into bits 31:24 of each lane before the four-lane store.
246 "vcvt.f32.u32 " PIXEL_Q ", " SUM_Q NL
247 "vmul.f32 " PIXEL_Q ", " PIXEL_Q ", " INVERTED_KERNEL_SIZE_Q NL
248 "vcvt.u32.f32 " PIXEL_Q ", " PIXEL_Q NL
249 "vshl.u32 " PIXEL_Q ", " PIXEL_Q ", #24" NL
250 DATA_TRANSFER4("vst1.u32", DESTINATION_R)
// Subtract the pixels at LEFT_R unless LEFT_R is still below SOURCE_R (unsigned).
252 "cmp " LEFT_R ", " SOURCE_R NL
253 "bcc .alphaChannelSkipLeft" NL
254 DATA_TRANSFER4("vld1.u32", LEFT_R)
255 "vshr.u32 " PIXEL_Q ", " PIXEL_Q ", #24" NL
256 "vsub.u32 " SUM_Q ", " SUM_Q ", " PIXEL_Q NL
257 ".alphaChannelSkipLeft: " NL
// Add the pixels at RIGHT_R unless RIGHT_R has reached SOURCE_END_R (unsigned).
259 "cmp " RIGHT_R ", " SOURCE_END_R NL
260 "bcs .alphaChannelSkipRight" NL
261 DATA_TRANSFER4("vld1.u32", RIGHT_R)
262 "vshr.u32 " PIXEL_Q ", " PIXEL_Q ", #24" NL
263 "vadd.u32 " SUM_Q ", " SUM_Q ", " PIXEL_Q NL
264 ".alphaChannelSkipRight: " NL
// Advance destination and both window edges by one stride; loop until the
// destination pointer reaches DESTINATION_END_R.
266 "add " DESTINATION_R ", " DESTINATION_R ", " STRIDE_R NL
267 "add " LEFT_R ", " LEFT_R ", " STRIDE_R NL
268 "add " RIGHT_R ", " RIGHT_R ", " STRIDE_R NL
269 "cmp " DESTINATION_R ", " DESTINATION_END_R NL
270 "bcc .alphaChannelBlur" NL
// Rewind DESTINATION_R by STRIDE_WIDTH_R, then step source and destination by
// 4 * STRIDE_LINE_R to reach the next group of four strides.
271 "sub " DESTINATION_R ", " DESTINATION_R ", " STRIDE_WIDTH_R NL
273 "add " SOURCE_R ", " SOURCE_R ", " STRIDE_LINE_R ", lsl #2" NL
274 "add " DESTINATION_R ", " DESTINATION_R ", " STRIDE_LINE_R ", lsl #2" NL
275 "cmp " SOURCE_R ", " SOURCE_LINE_END_R NL
276 "bcc .alphaChannelMainLoop" NL
278 // Processing the remaining strides (0 - 3).
279 ".alphaChannelEarlyLeave:" NL
// Move the parked count into its core register; this overwrites the
// SOURCE_LINE_END_R value, which is no longer needed.
280 "vmov.u32 " REMAINING_STRIDES_R ", " REMAINING_STRIDES_S0 NL
281 // Early return for 0 strides.
282 "cmp " REMAINING_STRIDES_R ", #0" NL
283 "ldmeqia sp!, {r4-r8, r10, r11, pc}" NL
285 // Initialize the sum variable.
286 "vmov.u32 " SUM_Q ", #0" NL
// Same initial accumulation as the main loop, but transferring only as many
// lanes as there are remaining strides.
287 "mov " INIT_SUM_R ", " SOURCE_R NL
288 "add " SOURCE_END_R ", " SOURCE_R ", " MAX_KERNEL_SIZE_R NL
289 "cmp " INIT_SUM_R ", " SOURCE_END_R NL
290 "bcs .alphaChannelSecondInitSumDone" NL
291 ".alphaChannelSecondInitSum:" NL
292 CONDITIONAL_DATA_TRANSFER4("vld1.u32", "vldr", INIT_SUM_R)
293 "vshr.u32 " PIXEL_Q ", " PIXEL_Q ", #24" NL
294 "vadd.u32 " SUM_Q ", " SUM_Q ", " PIXEL_Q NL
295 "add " INIT_SUM_R ", " INIT_SUM_R ", " STRIDE_R NL
296 "cmp " INIT_SUM_R ", " SOURCE_END_R NL
297 "bcc .alphaChannelSecondInitSum" NL
298 ".alphaChannelSecondInitSumDone:" NL
// Set up the end pointers and the two window-edge pointers for the final pass.
301 "add " SOURCE_END_R ", " SOURCE_R ", " STRIDE_WIDTH_R NL
302 "add " DESTINATION_END_R ", " DESTINATION_R ", " STRIDE_WIDTH_R NL
303 "sub " LEFT_R ", " SOURCE_R ", " DISTANCE_LEFT_R NL
304 "add " RIGHT_R ", " SOURCE_R ", " DISTANCE_RIGHT_R NL
306 ".alphaChannelSecondBlur:" NL
// Output = (float)sum * invertedKernelSize, moved into bits 31:24, then stored
// for the remaining strides only.
307 "vcvt.f32.u32 " PIXEL_Q ", " SUM_Q NL
308 "vmul.f32 " PIXEL_Q ", " PIXEL_Q ", " INVERTED_KERNEL_SIZE_Q NL
309 "vcvt.u32.f32 " PIXEL_Q ", " PIXEL_Q NL
310 "vshl.u32 " PIXEL_Q ", " PIXEL_Q ", #24" NL
311 CONDITIONAL_DATA_TRANSFER4("vst1.u32", "vstr", DESTINATION_R)
// Subtract the pixels at LEFT_R unless LEFT_R is still below SOURCE_R (unsigned).
313 "cmp " LEFT_R ", " SOURCE_R NL
314 "bcc .alphaChannelSecondSkipLeft" NL
315 CONDITIONAL_DATA_TRANSFER4("vld1.u32", "vldr", LEFT_R)
316 "vshr.u32 " PIXEL_Q ", " PIXEL_Q ", #24" NL
317 "vsub.u32 " SUM_Q ", " SUM_Q ", " PIXEL_Q NL
318 ".alphaChannelSecondSkipLeft: " NL
// Add the pixels at RIGHT_R unless RIGHT_R has reached SOURCE_END_R (unsigned).
320 "cmp " RIGHT_R ", " SOURCE_END_R NL
321 "bcs .alphaChannelSecondSkipRight" NL
322 CONDITIONAL_DATA_TRANSFER4("vld1.u32", "vldr", RIGHT_R)
323 "vshr.u32 " PIXEL_Q ", " PIXEL_Q ", #24" NL
324 "vadd.u32 " SUM_Q ", " SUM_Q ", " PIXEL_Q NL
325 ".alphaChannelSecondSkipRight: " NL
// Advance destination and both window edges by one stride; loop until the
// destination pointer reaches DESTINATION_END_R.
327 "add " DESTINATION_R ", " DESTINATION_R ", " STRIDE_R NL
328 "add " LEFT_R ", " LEFT_R ", " STRIDE_R NL
329 "add " RIGHT_R ", " RIGHT_R ", " STRIDE_R NL
330 "cmp " DESTINATION_R ", " DESTINATION_END_R NL
331 "bcc .alphaChannelSecondBlur" NL
// Restore the saved registers and return by popping the saved lr into pc.
333 "ldmia sp!, {r4-r8, r10, r11, pc}" NL
336 } // namespace WebCore
338 #endif // CPU(ARM_NEON) && COMPILER(GCC)