1 ; Copyright (c) 2007-2008 CSIRO
2 ; Copyright (c) 2007-2009 Xiph.Org Foundation
3 ; Copyright (c) 2013 Parrot
4 ; Written by Aurélien Zanelli
6 ; Redistribution and use in source and binary forms, with or without
7 ; modification, are permitted provided that the following conditions
10 ; - Redistributions of source code must retain the above copyright
11 ; notice, this list of conditions and the following disclaimer.
13 ; - Redistributions in binary form must reproduce the above copyright
14 ; notice, this list of conditions and the following disclaimer in the
15 ; documentation and/or other materials provided with the distribution.
17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 ; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
21 ; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
22 ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
23 ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
24 ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
25 ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
26 ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
; Assembler setup: everything that follows goes into a read-only code area
; named .text.
29 AREA |.text|, CODE, READONLY
; Include the shared ARM build options file.
; NOTE(review): presumably this is where the OPUS_ARM_MAY_HAVE_* symbols
; tested below are defined -- confirm.
31 GET celt/arm/armopts.s
; Export each public entry point only when its build option is enabled.
; NOTE(review): no ENDIF closing these IF directives is visible in this
; listing (the embedded line numbers skip 35-36 and 39-40); confirm against
; the upstream file.
33 IF OPUS_ARM_MAY_HAVE_EDSP
34 EXPORT celt_pitch_xcorr_edsp
37 IF OPUS_ARM_MAY_HAVE_NEON
38 EXPORT celt_pitch_xcorr_neon
; NEON code path: assembled only when OPUS_ARM_MAY_HAVE_NEON is set.
; NOTE(review): the line numbers embedded in this listing skip values (for
; example 44 -> 49 and 61 -> 64), so parts of this routine appear to be
; missing here. Confirm against the upstream file before relying on it.
41 IF OPUS_ARM_MAY_HAVE_NEON
43 ; Compute sum[k]=sum(x[j]*y[j+k],j=0...len-1), k=0...3
; Internal helper: no EXPORT directive in the visible part of this file names it.
44 xcorr_kernel_neon PROC
; input (celt_pitch_xcorr_neon documents the other arguments as r3 = len,
; r4 = x, r5 = y):
49 ; q0 = opus_val32 sum[4]
; output (every VMLAL below accumulates into q0):
51 ; q0 = opus_val32 sum[4]
52 ; preserved: r0-r3, r6-r11, d2, q4-q7, q9-q15
; q8 is not in the preserved list: its halves d16/d17 are written by the VEXT
; instructions below and serve as scratch.
; internal usage:
55 ; d3 = y_3|y_2|y_1|y_0
56 ; q2 = y_B|y_A|y_9|y_8|y_7|y_6|y_5|y_4
57 ; q3 = x_7|x_6|x_5|x_4|x_3|x_2|x_1|x_0
; NOTE(review): the load that the next comment refers to, and the instruction
; that sets the flags tested by the BLE, are not present in this listing.
61 ; This requires len>0 to always be valid (which we assert in the C code).
64 BLE xcorr_kernel_neon_process4
65 ; Process 8 samples at a time.
66 ; This loop loads one y value more than we actually need. Therefore we have to
67 ; stop as soon as there are 8 or fewer samples left (instead of 7), to avoid
68 ; reading past the end of the array.
69 xcorr_kernel_neon_process8
70 ; This loop has 19 total instructions (10 cycles to issue, minimum), with
71 ; - 2 cycles of ARM instructions,
72 ; - 10 cycles of load/store/byte permute instructions, and
73 ; - 9 cycles of data processing instructions.
74 ; On a Cortex A8, we dual-issue the maximum amount (9 cycles) between the
75 ; latter two categories, meaning the whole loop should run in 10 cycles per
76 ; iteration, barring cache misses.
; NOTE(review): only 17 of those 19 instructions are present in this listing.
79 VLD1.16 {d6, d7}, [r4]! ; q3 = next 8 x values; r4 advances past them
80 ; Unlike VMOV, VAND is a data processing instruction (and doesn't get
81 ; assembled to VMOV, like VORR would), so it dual-issues with the prior VLD1.
; NOTE(review): the VAND described above is not present in this listing.
85 VLD1.16 {d4, d5}, [r5]! ; q2 = next 8 y values; r5 advances past them
; Each VMLAL adds one x lane times four consecutive y values to sum[0..3].
; Each VEXT builds the y window shifted by one more element for a later VMLAL.
86 VMLAL.S16 q0, d3, d6[0] ; sum[k] += x_0*y_k
87 VEXT.16 d16, d3, d4, #1 ; d16 = y_4|y_3|y_2|y_1
88 VMLAL.S16 q0, d4, d7[0] ; sum[k] += x_4*y_(k+4)
89 VEXT.16 d17, d4, d5, #1 ; d17 = y_8|y_7|y_6|y_5
90 VMLAL.S16 q0, d16, d6[1] ; sum[k] += x_1*y_(k+1)
91 VEXT.16 d16, d3, d4, #2 ; d16 = y_5|y_4|y_3|y_2
92 VMLAL.S16 q0, d17, d7[1] ; sum[k] += x_5*y_(k+5)
93 VEXT.16 d17, d4, d5, #2 ; d17 = y_9|y_8|y_7|y_6
94 VMLAL.S16 q0, d16, d6[2] ; sum[k] += x_2*y_(k+2)
95 VEXT.16 d16, d3, d4, #3 ; d16 = y_6|y_5|y_4|y_3
96 VMLAL.S16 q0, d17, d7[2] ; sum[k] += x_6*y_(k+6)
97 VEXT.16 d17, d4, d5, #3 ; d17 = y_A|y_9|y_8|y_7
98 VMLAL.S16 q0, d16, d6[3] ; sum[k] += x_3*y_(k+3)
99 VMLAL.S16 q0, d17, d7[3] ; sum[k] += x_7*y_(k+7)
; NOTE(review): the counter update that sets the flags for this BGT is not
; present in this listing.
100 BGT xcorr_kernel_neon_process8
101 ; Process 4 samples here if we have > 4 left (still reading one extra y value).
102 xcorr_kernel_neon_process4
104 BLE xcorr_kernel_neon_process2
; NOTE(review): the loads for this step and the VAND mentioned in the next
; comment are not present in this listing.
107 ; Use VAND since it's a data processing instruction again.
; Same pattern as the 8-sample loop, for the four x lanes of d6 against the
; y window held in d4:d5.
112 VMLAL.S16 q0, d4, d6[0] ; sum[k] += d6[0]*d4[k]
113 VEXT.16 d16, d4, d5, #1 ; d16 = d4:d5 window shifted by 1 element
114 VMLAL.S16 q0, d16, d6[1] ; sum[k] += d6[1]*d16[k]
115 VEXT.16 d16, d4, d5, #2 ; d16 = d4:d5 window shifted by 2 elements
116 VMLAL.S16 q0, d16, d6[2] ; sum[k] += d6[2]*d16[k]
117 VEXT.16 d16, d4, d5, #3 ; d16 = d4:d5 window shifted by 3 elements
118 VMLAL.S16 q0, d16, d6[3] ; sum[k] += d6[3]*d16[k]
119 ; Process 2 samples here if we have > 2 left (still reading one extra y value).
120 xcorr_kernel_neon_process2
122 BLE xcorr_kernel_neon_process1
124 VLD2.16 {d6[],d7[]}, [r4]! ; two x values: first copied to every lane of d6, second to every lane of d7
125 ; Use VAND since it's a data processing instruction again.
; NOTE(review): that VAND is not present in this listing.
129 VLD1.32 {d5[]}, [r5]! ; 32 bits (two y values) copied to both halves of d5
131 VEXT.16 d16, d4, d5, #1 ; d16 = d4:d5 window shifted by 1 element
132 ; Replace bottom copy of {y5,y4} in d5 with {y3,y2} from d4, using VSRI
133 ; instead of VEXT, since it's a data-processing instruction.
; NOTE(review): the VSRI described above is not present in this listing.
135 VMLAL.S16 q0, d16, d7 ; sum[k] += d7[k]*d16[k] (every lane of d7 holds the second x)
136 ; Process 1 sample using the extra y value we loaded above.
137 xcorr_kernel_neon_process1
139 VLD1.16 {d6[]}, [r4]! ; one x value copied to every lane of d6
141 ; y[0...3] are left in d5 from prior iteration(s) (if any)
144 ; Now process 1 last sample, not reading ahead.
146 VLD1.16 {d4[]}, [r5]! ; one y value copied to every lane of d4
149 VLD1.16 {d6[]}, [r4]! ; one x value copied to every lane of d6
; NOTE(review): the multiply-accumulates for these last samples, the return
; instruction and the closing ENDP are not present in this listing.
154 ; opus_val32 celt_pitch_xcorr_neon(opus_val16 *_x, opus_val16 *_y,
155 ; opus_val32 *xcorr, int len, int max_pitch)
; Entry point exported when OPUS_ARM_MAY_HAVE_NEON is set. In the code visible
; here, the one-at-a-time tail stores each 32-bit sum through r2 and the
; running maximum of the sums is kept in q15 / d30.
; NOTE(review): the line numbers embedded in this listing skip values (for
; example 160 -> 165 and 183 -> 189), so parts of this routine, including the
; call into xcorr_kernel_neon, appear to be missing here. Confirm against the
; upstream file.
156 celt_pitch_xcorr_neon PROC
; input:
158 ; r0 = opus_val16 *_x
159 ; r1 = opus_val16 *_y
160 ; r2 = opus_val32 *xcorr
; internal usage:
165 ; r4 = opus_val16 *x (for xcorr_kernel_neon())
166 ; r5 = opus_val16 *y (for xcorr_kernel_neon())
169 ; q15 = int maxcorr[4] (q15 is not used by xcorr_kernel_neon())
; Save the registers restored by the LDMFD at the end, plus the return address.
170 STMFD sp!, {r4-r6, lr}
173 ; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
175 BLT celt_pitch_xcorr_neon_process4_done
; Main loop: four sums per iteration.
176 celt_pitch_xcorr_neon_process4
177 ; xcorr_kernel_neon parameters:
178 ; r3 = len, r4 = _x, r5 = _y, q0 = {0, 0, 0, 0}
182 ; xcorr_kernel_neon only modifies r4, r5, r12, q0...q3, and q8 (d16/d17).
183 ; So we don't save/restore any other registers.
189 VMAX.S32 q15, q15, q0 ; maxcorr[k] = max(maxcorr[k], sum[k])
190 ; if (max_pitch >= 4) goto celt_pitch_xcorr_neon_process4
; (otherwise fall through to celt_pitch_xcorr_neon_process4_done)
191 BGE celt_pitch_xcorr_neon_process4
192 ; We have less than 4 sums left to compute.
193 celt_pitch_xcorr_neon_process4_done
195 ; Reduce maxcorr to a single value
196 VMAX.S32 d30, d30, d31 ; fold the upper half of q15 into the lower half
197 VPMAX.S32 d30, d30, d30 ; pairwise max: both lanes of d30 now hold the maximum
198 ; if (max_pitch <= 0) goto celt_pitch_xcorr_neon_done
199 BLE celt_pitch_xcorr_neon_done
200 ; Now compute each remaining sum one at a time.
201 celt_pitch_xcorr_neon_process_remaining
; NOTE(review): the setup for this sum and the compare tested by the BLT are
; not present in this listing.
206 BLT celt_pitch_xcorr_neon_process_remaining4
207 ; Sum terms 8 at a time.
208 celt_pitch_xcorr_neon_process_remaining_loop8
; NOTE(review): the body of this loop is not present in this listing.
216 BGE celt_pitch_xcorr_neon_process_remaining_loop8
217 ; Sum terms 4 at a time.
218 celt_pitch_xcorr_neon_process_remaining4
220 BLT celt_pitch_xcorr_neon_process_remaining4_done
; NOTE(review): the 4-term step is not present in this listing.
227 celt_pitch_xcorr_neon_process_remaining4_done
228 ; Reduce the sum to a single value.
; NOTE(review): the reduction instructions are not present in this listing.
232 BLE celt_pitch_xcorr_neon_process_remaining_loop_done
233 ; Sum terms 1 at a time.
234 celt_pitch_xcorr_neon_process_remaining_loop1
235 VLD1.16 {d2[]}, [r4]! ; one x value copied to every lane of d2; r4 advances
236 VLD1.16 {d3[]}, [r5]! ; one y value copied to every lane of d3; r5 advances
; NOTE(review): the multiply-accumulate and the counter update for this loop
; are not present in this listing.
239 BGT celt_pitch_xcorr_neon_process_remaining_loop1
240 celt_pitch_xcorr_neon_process_remaining_loop_done
241 VST1.32 {d0[0]}, [r2]! ; *xcorr++ = lane 0 of d0
242 VMAX.S32 d30, d30, d0 ; maxcorr = max(maxcorr, d0), lane by lane
246 ; if (--max_pitch > 0) goto celt_pitch_xcorr_neon_process_remaining
247 BGT celt_pitch_xcorr_neon_process_remaining
248 celt_pitch_xcorr_neon_done
; Restore r4-r6 and return by loading the saved lr into pc.
; NOTE(review): the instruction that moves the result into the return
; register and the closing ENDP are not present in this listing.
250 LDMFD sp!, {r4-r6, pc}
; EDSP code path: assembled only when OPUS_ARM_MAY_HAVE_EDSP is set.
; NOTE(review): the line numbers embedded in this listing skip values (for
; example 259 -> 262 and 305 -> 307), so a few lines of this routine appear to
; be missing here. Confirm against the upstream file.
255 IF OPUS_ARM_MAY_HAVE_EDSP
257 ; This will get used on ARMv7 devices without NEON, so it has been optimized
258 ; to take advantage of dual-issuing where possible.
; Internal helper: accumulates x[j]*y[j+k] into four running sums (k = 0...3)
; using 16x16 signed multiply-accumulates.
259 xcorr_kernel_edsp PROC
; input (r3 = len, as read by the SUBS below):
262 ; r4 = opus_val16 *_x (must be 32-bit aligned)
263 ; r5 = opus_val16 *_y (must be 32-bit aligned)
264 ; r6...r9 = opus_val32 sum[4]
; output:
266 ; r6...r9 = opus_val32 sum[4]
; internal usage (r2 = j, the count of samples left; r2, r4 and r5 are saved
; here and restored on exit, while r10, r11, r12 and r14 are overwritten):
270 ; r12,r14 = opus_val16 x[4]
271 ; r10,r11 = opus_val16 y[4]
272 STMFD sp!, {r2,r4,r5,lr}
273 LDR r10, [r5], #4 ; Load y[0...1]
274 SUBS r2, r3, #4 ; j = len-4
275 LDR r11, [r5], #4 ; Load y[2...3]
; Skip the unrolled loop when len <= 4.
276 BLE xcorr_kernel_edsp_process4_done
277 LDR r12, [r4], #4 ; Load x[0...1]
; Unrolled loop: four x samples per iteration, 16 multiply-accumulates.
279 xcorr_kernel_edsp_process4
280 ; The multiplies must issue from pipeline 0, and can't dual-issue with each
281 ; other. Every other instruction here dual-issues with a multiply, and is
282 ; thus "free". There should be no stalls in the body of the loop.
283 SMLABB r6, r12, r10, r6 ; sum[0] = MAC16_16(sum[0],x_0,y_0)
284 LDR r14, [r4], #4 ; Load x[2...3]
285 SMLABT r7, r12, r10, r7 ; sum[1] = MAC16_16(sum[1],x_0,y_1)
286 SUBS r2, r2, #4 ; j-=4
287 SMLABB r8, r12, r11, r8 ; sum[2] = MAC16_16(sum[2],x_0,y_2)
288 SMLABT r9, r12, r11, r9 ; sum[3] = MAC16_16(sum[3],x_0,y_3)
289 SMLATT r6, r12, r10, r6 ; sum[0] = MAC16_16(sum[0],x_1,y_1)
290 LDR r10, [r5], #4 ; Load y[4...5]
291 SMLATB r7, r12, r11, r7 ; sum[1] = MAC16_16(sum[1],x_1,y_2)
292 SMLATT r8, r12, r11, r8 ; sum[2] = MAC16_16(sum[2],x_1,y_3)
293 SMLATB r9, r12, r10, r9 ; sum[3] = MAC16_16(sum[3],x_1,y_4)
; The GT condition uses the flags from the SUBS above: x for the next
; iteration is only loaded when another iteration will run.
294 LDRGT r12, [r4], #4 ; Load x[0...1]
295 SMLABB r6, r14, r11, r6 ; sum[0] = MAC16_16(sum[0],x_2,y_2)
296 SMLABT r7, r14, r11, r7 ; sum[1] = MAC16_16(sum[1],x_2,y_3)
297 SMLABB r8, r14, r10, r8 ; sum[2] = MAC16_16(sum[2],x_2,y_4)
298 SMLABT r9, r14, r10, r9 ; sum[3] = MAC16_16(sum[3],x_2,y_5)
299 SMLATT r6, r14, r11, r6 ; sum[0] = MAC16_16(sum[0],x_3,y_3)
300 LDR r11, [r5], #4 ; Load y[6...7]
301 SMLATB r7, r14, r10, r7 ; sum[1] = MAC16_16(sum[1],x_3,y_4)
302 SMLATT r8, r14, r10, r8 ; sum[2] = MAC16_16(sum[2],x_3,y_5)
303 SMLATB r9, r14, r11, r9 ; sum[3] = MAC16_16(sum[3],x_3,y_6)
304 BGT xcorr_kernel_edsp_process4
; Tail: handle the remaining samples one at a time, leaving after each one
; once j reaches zero.
305 xcorr_kernel_edsp_process4_done
; NOTE(review): the instruction that sets the flags tested by this BLE is not
; present in this listing.
307 BLE xcorr_kernel_edsp_done
308 LDRH r12, [r4], #2 ; r12 = *x++
309 SUBS r2, r2, #1 ; j--
311 SMLABB r6, r12, r10, r6 ; sum[0] = MAC16_16(sum[0],x,y_0)
312 LDRHGT r14, [r4], #2 ; r14 = *x++
313 SMLABT r7, r12, r10, r7 ; sum[1] = MAC16_16(sum[1],x,y_1)
314 SMLABB r8, r12, r11, r8 ; sum[2] = MAC16_16(sum[2],x,y_2)
315 SMLABT r9, r12, r11, r9 ; sum[3] = MAC16_16(sum[3],x,y_3)
316 BLE xcorr_kernel_edsp_done
317 SMLABT r6, r14, r10, r6 ; sum[0] = MAC16_16(sum[0],x,y_1)
318 SUBS r2, r2, #1 ; j--
319 SMLABB r7, r14, r11, r7 ; sum[1] = MAC16_16(sum[1],x,y_2)
320 LDRH r10, [r5], #2 ; r10 = y_4 = *y++
321 SMLABT r8, r14, r11, r8 ; sum[2] = MAC16_16(sum[2],x,y_3)
322 LDRHGT r12, [r4], #2 ; r12 = *x++
323 SMLABB r9, r14, r10, r9 ; sum[3] = MAC16_16(sum[3],x,y_4)
324 BLE xcorr_kernel_edsp_done
325 SMLABB r6, r12, r11, r6 ; sum[0] = MAC16_16(sum[0],tmp,y_2)
; NOTE(review): the embedded numbering skips line 326 here. The flags used by
; the LDRHGT and BLE below are presumably set by that missing line, since r2
; is overwritten with y_5 just below -- confirm.
327 SMLABT r7, r12, r11, r7 ; sum[1] = MAC16_16(sum[1],tmp,y_3)
328 LDRH r2, [r5], #2 ; r2 = y_5 = *y++
329 SMLABB r8, r12, r10, r8 ; sum[2] = MAC16_16(sum[2],tmp,y_4)
; No post-increment on the two final loads: r4 and r5 are restored by the
; LDMFD below.
330 LDRHGT r14, [r4] ; r14 = *x
331 SMLABB r9, r12, r2, r9 ; sum[3] = MAC16_16(sum[3],tmp,y_5)
332 BLE xcorr_kernel_edsp_done
333 SMLABT r6, r14, r11, r6 ; sum[0] = MAC16_16(sum[0],tmp,y_3)
334 LDRH r11, [r5] ; r11 = y_6 = *y
335 SMLABB r7, r14, r10, r7 ; sum[1] = MAC16_16(sum[1],tmp,y_4)
336 SMLABB r8, r14, r2, r8 ; sum[2] = MAC16_16(sum[2],tmp,y_5)
337 SMLABB r9, r14, r11, r9 ; sum[3] = MAC16_16(sum[3],tmp,y_6)
338 xcorr_kernel_edsp_done
; Restore r2, r4 and r5 and return by loading the saved lr into pc.
339 LDMFD sp!, {r2,r4,r5,pc}
; Entry point exported when OPUS_ARM_MAY_HAVE_EDSP is set.
; NOTE(review): presumably this has the same C signature as
; celt_pitch_xcorr_neon -- confirm.
; NOTE(review): the line numbers embedded in this listing skip values (for
; example 359 -> 366 and 420 -> 433), so most loads, compares and stores of
; this routine appear to be missing here. Confirm against the upstream file.
342 celt_pitch_xcorr_edsp PROC
; input:
344 ; r0 = opus_val16 *_x (must be 32-bit aligned)
345 ; r1 = opus_val16 *_y (only needs to be 16-bit aligned)
346 ; r2 = opus_val32 *xcorr
; internal usage (r12 = j and r14 = sum in the single-sum loops, r10/r11 =
; sum0/sum1 in the two-sum loop, per the instruction comments below):
353 ; r6 = opus_val32 sum0
354 ; r7 = opus_val32 sum1
355 ; r8 = opus_val32 sum2
356 ; r9 = opus_val32 sum3
; Save the registers restored by the LDMFD at the end, plus the return address.
359 STMFD sp!, {r4-r11, lr}
; Skip the single unaligned sum below.
; NOTE(review): the test that sets the flags for this BEQ is not present in
; this listing; presumably it checks whether y is already 32-bit aligned.
366 BEQ celt_pitch_xcorr_edsp_process1u_done
367 ; Compute one sum at the start to make y 32-bit aligned.
372 BLE celt_pitch_xcorr_edsp_process1u_loop4_done
; Four terms per iteration. The y samples are taken alternately from the top
; and bottom halves of r8 and r9.
375 celt_pitch_xcorr_edsp_process1u_loop4
377 SMLABT r14, r6, r8, r14 ; sum = MAC16_16(sum, x_0, y_0)
379 SMLATB r14, r6, r9, r14 ; sum = MAC16_16(sum, x_1, y_1)
381 SMLABT r14, r7, r9, r14 ; sum = MAC16_16(sum, x_2, y_2)
382 SUBS r12, r12, #4 ; j-=4
383 SMLATB r14, r7, r8, r14 ; sum = MAC16_16(sum, x_3, y_3)
385 BGT celt_pitch_xcorr_edsp_process1u_loop4
387 celt_pitch_xcorr_edsp_process1u_loop4_done
; Remaining terms of this sum, one per iteration.
389 celt_pitch_xcorr_edsp_process1u_loop1
392 SMLABBGE r14, r6, r8, r14 ; sum = MAC16_16(sum, *x, *y)
395 BGT celt_pitch_xcorr_edsp_process1u_loop1
; Rewind r4 by len 16-bit samples (r3 = len).
397 SUB r4, r4, r3, LSL #1
398 ; Restore and advance _y
399 SUB r5, r5, r3, LSL #1
400 ; maxcorr = max(maxcorr, sum)
; NOTE(review): the instructions implementing the maxcorr update are not
; present in this listing.
407 BLE celt_pitch_xcorr_edsp_done
408 celt_pitch_xcorr_edsp_process1u_done
409 ; if (max_pitch < 4) goto celt_pitch_xcorr_edsp_process2
411 BLT celt_pitch_xcorr_edsp_process2
; Main loop: four sums per call to xcorr_kernel_edsp.
412 celt_pitch_xcorr_edsp_process4
413 ; xcorr_kernel_edsp parameters:
414 ; r3 = len, r4 = _x, r5 = _y, r6...r9 = sum[4] = {0, 0, 0, 0}
419 BL xcorr_kernel_edsp ; xcorr_kernel_edsp(_x, _y+i, xcorr+i, len)
420 ; maxcorr = max(maxcorr, sum0, sum1, sum2, sum3)
433 BGE celt_pitch_xcorr_edsp_process4
; Two sums at once, when fewer than four sums remain.
434 celt_pitch_xcorr_edsp_process2
436 BLT celt_pitch_xcorr_edsp_process1a
438 ; {r10, r11} = {sum0, sum1} = {0, 0}
442 BLE celt_pitch_xcorr_edsp_process2_loop_done
; Four x samples per iteration, contributing to both sums.
445 celt_pitch_xcorr_edsp_process2_loop4
446 SMLABB r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_0, y_0)
448 SMLABT r11, r6, r8, r11 ; sum1 = MAC16_16(sum1, x_0, y_1)
449 SUBS r12, r12, #4 ; j-=4
450 SMLATT r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_1, y_1)
452 SMLATB r11, r6, r9, r11 ; sum1 = MAC16_16(sum1, x_1, y_2)
454 SMLABB r10, r7, r9, r10 ; sum0 = MAC16_16(sum0, x_2, y_2)
455 SMLABT r11, r7, r9, r11 ; sum1 = MAC16_16(sum1, x_2, y_3)
456 SMLATT r10, r7, r9, r10 ; sum0 = MAC16_16(sum0, x_3, y_3)
458 SMLATB r11, r7, r8, r11 ; sum1 = MAC16_16(sum1, x_3, y_4)
459 BGT celt_pitch_xcorr_edsp_process2_loop4
460 celt_pitch_xcorr_edsp_process2_loop_done
462 BLE celt_pitch_xcorr_edsp_process2_1
; Two more x samples.
465 SMLABB r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_0, y_0)
467 SMLABT r11, r6, r8, r11 ; sum1 = MAC16_16(sum1, x_0, y_1)
469 SMLATT r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_1, y_1)
471 SMLATB r11, r6, r9, r11 ; sum1 = MAC16_16(sum1, x_1, y_2)
472 celt_pitch_xcorr_edsp_process2_1
; One more x sample, then possibly a last one after the BLE.
476 SMLABB r10, r6, r8, r10 ; sum0 = MAC16_16(sum0, x_0, y_0)
478 SMLABT r11, r6, r8, r11 ; sum1 = MAC16_16(sum1, x_0, y_1)
479 BLE celt_pitch_xcorr_edsp_process2_done
481 SMLABT r10, r7, r8, r10 ; sum0 = MAC16_16(sum0, x_0, y_1)
482 SMLABB r11, r7, r9, r11 ; sum1 = MAC16_16(sum1, x_0, y_2)
483 celt_pitch_xcorr_edsp_process2_done
; Rewind r4 by len 16-bit samples (r3 = len).
485 SUB r4, r4, r3, LSL #1
486 ; Restore and advance _y
487 SUB r5, r5, r3, LSL #1
488 ; maxcorr = max(maxcorr, sum0)
493 ; maxcorr = max(maxcorr, sum1)
; NOTE(review): the instructions implementing both maxcorr updates are not
; present in this listing.
; One last sum.
499 celt_pitch_xcorr_edsp_process1a
501 BLT celt_pitch_xcorr_edsp_done
505 BLT celt_pitch_xcorr_edsp_process1a_loop_done
; Four terms per iteration. Here x and y use matching halves of the registers
; (BB/TT multiplies).
510 celt_pitch_xcorr_edsp_process1a_loop4
511 SMLABB r14, r6, r8, r14 ; sum = MAC16_16(sum, x_0, y_0)
512 SUBS r12, r12, #4 ; j-=4
513 SMLATT r14, r6, r8, r14 ; sum = MAC16_16(sum, x_1, y_1)
515 SMLABB r14, r7, r9, r14 ; sum = MAC16_16(sum, x_2, y_2)
517 SMLATT r14, r7, r9, r14 ; sum = MAC16_16(sum, x_3, y_3)
520 BGE celt_pitch_xcorr_edsp_process1a_loop4
521 celt_pitch_xcorr_edsp_process1a_loop_done
; Conditionally executed leftovers: a pair of terms, then a single term.
; NOTE(review): the instructions that set the flags for these GE conditions
; are not present in this listing.
526 SMLABBGE r14, r6, r8, r14 ; sum = MAC16_16(sum, x_0, y_0)
528 SMLATTGE r14, r6, r8, r14 ; sum = MAC16_16(sum, x_1, y_1)
533 SMLABBGE r14, r6, r8, r14 ; sum = MAC16_16(sum, *x, *y)
534 ; maxcorr = max(maxcorr, sum)
539 celt_pitch_xcorr_edsp_done
; Restore r4-r11 and return by loading the saved lr into pc.
; NOTE(review): the instruction that moves the result into the return
; register and the closing ENDP / ENDIF are not present in this listing.
540 LDMFD sp!, {r4-r11, pc}