src/third_party/opus/src/celt/arm/celt_pitch_xcorr_arm.s

   1 ; Copyright (c) 2007-2008 CSIRO
   2 ; Copyright (c) 2007-2009 Xiph.Org Foundation
   3 ; Copyright (c) 2013      Parrot
   4 ; Written by Aurélien Zanelli
   5 ;
   6 ; Redistribution and use in source and binary forms, with or without
   7 ; modification, are permitted provided that the following conditions
   8 ; are met:
   9 ;
  10 ; - Redistributions of source code must retain the above copyright
  11 ; notice, this list of conditions and the following disclaimer.
  12 ;
  13 ; - Redistributions in binary form must reproduce the above copyright
  14 ; notice, this list of conditions and the following disclaimer in the
  15 ; documentation and/or other materials provided with the distribution.
  16 ;
  17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  18 ; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
  21 ; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  22 ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  23 ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  24 ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  25 ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  26 ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  27 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  28
       ; Everything below is assembled into a read-only code section.
  29   AREA  |.text|, CODE, READONLY
  30
       ; Textually include the build options.
       ; NOTE(review): OPUS_ARM_MAY_HAVE_EDSP / OPUS_ARM_MAY_HAVE_NEON are
       ; presumably defined by this include - confirm against armopts.s.
  31   GET    celt/arm/armopts.s
  32
       ; Only the two celt_pitch_xcorr_* entry points are exported; the
       ; xcorr_kernel_* routines further down are never EXPORTed in this file.
  33 IF OPUS_ARM_MAY_HAVE_EDSP
  34   EXPORT celt_pitch_xcorr_edsp
  35 ENDIF
  36
  37 IF OPUS_ARM_MAY_HAVE_NEON
  38   EXPORT celt_pitch_xcorr_neon
  39 ENDIF
  40
  41 IF OPUS_ARM_MAY_HAVE_NEON
  42
  43 ; Compute sum[k]=sum(x[j]*y[j+k],j=0...len-1), k=0...3
  44 xcorr_kernel_neon PROC
       ; Internal NEON helper (this file does not EXPORT it). Accumulates four
       ; cross-correlation sums at once into q0:
       ;   sum[k] += x[j]*y[j+k], j=0...len-1, k=0...3
       ; The routine never touches sp and returns by copying lr into pc.
  45   ; input:
  46   ;   r3     = int         len
  47   ;   r4     = opus_val16 *x
  48   ;   r5     = opus_val16 *y
  49   ;   q0     = opus_val32  sum[4]
  50   ; output:
  51   ;   q0     = opus_val32  sum[4]
  52   ; preserved: r0-r3, r6-r11, d2, q4-q7, q9-q15
       ; modified:  r4, r5 (post-incremented by the loads), r12, d3, q2, q3,
       ;            q8 (d16/d17) and the ARM condition flags
  53   ; internal usage:
  54   ;   r12 = int j
  55   ;   d3  = y_3|y_2|y_1|y_0
  56   ;   q2  = y_B|y_A|y_9|y_8|y_7|y_6|y_5|y_4
  57   ;   q3  = x_7|x_6|x_5|x_4|x_3|x_2|x_1|x_0
  58   ;   q8  = scratch
  59   ;
  60   ; Load y[0...3]
  61   ; This requires len>0 to always be valid (which we assert in the C code).
  62   VLD1.16      {d5}, [r5]!            ; d5 = y[0...3]; y += 4
  63   SUBS         r12, r3, #8            ; j = len-8
  64   BLE xcorr_kernel_neon_process4      ; len <= 8: skip the 8-sample loop
  65 ; Process 8 samples at a time.
  66 ; This loop loads one y value more than we actually need. Therefore we have to
  67 ; stop as soon as there are 8 or fewer samples left (instead of 7), to avoid
  68 ; reading past the end of the array.
  69 xcorr_kernel_neon_process8
  70   ; This loop has 19 total instructions (10 cycles to issue, minimum), with
  71   ; - 2 cycles of ARM instructions,
  72   ; - 10 cycles of load/store/byte permute instructions, and
  73   ; - 9 cycles of data processing instructions.
  74   ; On a Cortex A8, we dual-issue the maximum amount (9 cycles) between the
  75   ; latter two categories, meaning the whole loop should run in 10 cycles per
  76   ; iteration, barring cache misses.
  77   ;
  78   ; Load x[0...7]
  79   VLD1.16      {d6, d7}, [r4]!        ; q3 = x[0...7]; x += 8
  80   ; Unlike VMOV, VAND is a data processing instruction (and doesn't get
  81   ; assembled to VMOV, like VORR would), so it dual-issues with the prior VLD1.
  82   VAND         d3, d5, d5             ; d3 = y[0...3] (copy of d5)
       ; The flags set here are consumed by the BGT at the bottom of the loop;
       ; none of the NEON instructions in between write the ARM flags.
  83   SUBS         r12, r12, #8
  84   ; Load y[4...11]
  85   VLD1.16      {d4, d5}, [r5]!        ; q2 = y[4...11]; y += 8
  86   VMLAL.S16    q0, d3, d6[0]          ; sum[k] += x_0*y[k]
  87   VEXT.16      d16, d3, d4, #1        ; d16 = y[1...4]
  88   VMLAL.S16    q0, d4, d7[0]          ; sum[k] += x_4*y[k+4]
  89   VEXT.16      d17, d4, d5, #1        ; d17 = y[5...8]
  90   VMLAL.S16    q0, d16, d6[1]         ; sum[k] += x_1*y[k+1]
  91   VEXT.16      d16, d3, d4, #2        ; d16 = y[2...5]
  92   VMLAL.S16    q0, d17, d7[1]         ; sum[k] += x_5*y[k+5]
  93   VEXT.16      d17, d4, d5, #2        ; d17 = y[6...9]
  94   VMLAL.S16    q0, d16, d6[2]         ; sum[k] += x_2*y[k+2]
  95   VEXT.16      d16, d3, d4, #3        ; d16 = y[3...6]
  96   VMLAL.S16    q0, d17, d7[2]         ; sum[k] += x_6*y[k+6]
  97   VEXT.16      d17, d4, d5, #3        ; d17 = y[7...10]
  98   VMLAL.S16    q0, d16, d6[3]         ; sum[k] += x_3*y[k+3]
  99   VMLAL.S16    q0, d17, d7[3]         ; sum[k] += x_7*y[k+7]
       ; Loop while more than 8 samples remain. On exit d5 holds the four y
       ; values that line up with the next unread x sample.
 100   BGT xcorr_kernel_neon_process8
 101 ; Process 4 samples here if we have > 4 left (still reading one extra y value).
 102 xcorr_kernel_neon_process4
 103   ADDS         r12, r12, #4           ; j = (samples left) - 4
 104   BLE xcorr_kernel_neon_process2      ; 4 or fewer left
 105   ; Load x[0...3]
 106   VLD1.16      d6, [r4]!
 107   ; Use VAND since it's a data processing instruction again.
 108   VAND         d4, d5, d5             ; d4 = y[0...3]
       ; No S suffix: the next stage recomputes the flags with its own ADDS.
 109   SUB          r12, r12, #4
 110   ; Load y[4...7]
 111   VLD1.16      d5, [r5]!
 112   VMLAL.S16    q0, d4, d6[0]          ; sum[k] += x_0*y[k]
 113   VEXT.16      d16, d4, d5, #1        ; d16 = y[1...4]
 114   VMLAL.S16    q0, d16, d6[1]         ; sum[k] += x_1*y[k+1]
 115   VEXT.16      d16, d4, d5, #2        ; d16 = y[2...5]
 116   VMLAL.S16    q0, d16, d6[2]         ; sum[k] += x_2*y[k+2]
 117   VEXT.16      d16, d4, d5, #3        ; d16 = y[3...6]
 118   VMLAL.S16    q0, d16, d6[3]         ; sum[k] += x_3*y[k+3]
 119 ; Process 2 samples here if we have > 2 left (still reading one extra y value).
 120 xcorr_kernel_neon_process2
 121   ADDS         r12, r12, #2           ; j = (samples left) - 2
 122   BLE xcorr_kernel_neon_process1      ; 2 or fewer left
 123   ; Load x[0...1]
       ; d6 = x_0 replicated to every lane, d7 = x_1 replicated to every lane.
 124   VLD2.16      {d6[],d7[]}, [r4]!
 125   ; Use VAND since it's a data processing instruction again.
 126   VAND         d4, d5, d5             ; d4 = y[0...3]
 127   SUB          r12, r12, #2
 128   ; Load y[4...5]
       ; The 32-bit pair is replicated, so d5 = y_5|y_4|y_5|y_4.
 129   VLD1.32      {d5[]}, [r5]!
 130   VMLAL.S16    q0, d4, d6             ; sum[k] += x_0*y[k]
 131   VEXT.16      d16, d4, d5, #1        ; d16 = y[1...4]
 132   ; Replace bottom copy of {y5,y4} in d5 with {y3,y2} from d4, using VSRI
 133   ; instead of VEXT, since it's a data-processing instruction.
 134   VSRI.64      d5, d4, #32            ; d5 = y_5|y_4|y_3|y_2
 135   VMLAL.S16    q0, d16, d7            ; sum[k] += x_1*y[k+1]
 136 ; Process 1 sample using the extra y value we loaded above.
 137 xcorr_kernel_neon_process1
 138   ; Load next *x
 139   VLD1.16      {d6[]}, [r4]!          ; d6 = *x++ in every lane
 140   ADDS         r12, r12, #1           ; j = (samples left) - 1
 141   ; y[0...3] are left in d5 from prior iteration(s) (if any)
 142   VMLAL.S16    q0, d5, d6             ; sum[k] += x_0*y[k]
 143   MOVLE        pc, lr                 ; return if that was the last sample
 144 ; Now process 1 last sample, not reading ahead.
 145   ; Load last *y
 146   VLD1.16      {d4[]}, [r5]!          ; d4 = y_4 in every lane
 147   VSRI.64      d4, d5, #16            ; d4 = y_4|y_3|y_2|y_1
 148   ; Load last *x
 149   VLD1.16      {d6[]}, [r4]!
 150   VMLAL.S16    q0, d4, d6             ; sum[k] += x_1*y[k+1]
 151   MOV          pc, lr                 ; return
 152   ENDP
 153
 154 ; opus_val32 celt_pitch_xcorr_neon(opus_val16 *_x, opus_val16 *_y,
 155 ;  opus_val32 *xcorr, int len, int max_pitch)
 156 celt_pitch_xcorr_neon PROC
       ; Stores xcorr[i] = sum(x[j]*y[j+i], j=0...len-1) for i=0...max_pitch-1
       ; and returns the largest stored value, or 1 if none exceeds 1 (the
       ; running maximum is initialised to 1). Sums are produced four at a time
       ; by xcorr_kernel_neon, then one at a time for the last 0...3 of them.
 157   ; input:
 158   ;   r0  = opus_val16 *_x
 159   ;   r1  = opus_val16 *_y
 160   ;   r2  = opus_val32 *xcorr
 161   ;   r3  = int         len
       ;   [sp] = int        max_pitch (fifth argument, passed on the stack)
 162   ; output:
 163   ;   r0  = int         maxcorr
 164   ; internal usage:
 165   ;   r4  = opus_val16 *x (for xcorr_kernel_neon())
 166   ;   r5  = opus_val16 *y (for xcorr_kernel_neon())
 167   ;   r6  = int         max_pitch
 168   ;   r12 = int         j
 169   ;   q15 = int         maxcorr[4] (q15 is not used by xcorr_kernel_neon())
 170   STMFD        sp!, {r4-r6, lr}       ; push 4 words (16 bytes)
 171   LDR          r6, [sp, #16]          ; r6 = max_pitch, just above the saved registers
 172   VMOV.S32     q15, #1                ; maxcorr[0...3] = 1
 173   ; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done
 174   SUBS         r6, r6, #4
 175   BLT celt_pitch_xcorr_neon_process4_done
 176 celt_pitch_xcorr_neon_process4
 177   ; xcorr_kernel_neon parameters:
 178   ; r3 = len, r4 = _x, r5 = _y, q0 = {0, 0, 0, 0}
 179   MOV          r4, r0
 180   MOV          r5, r1
 181   VEOR         q0, q0, q0
 182   ; xcorr_kernel_neon modifies r4, r5, r12, q0...q3 and the scratch q8; r4/r5
 183   ; are reloaded every iteration and nothing else live here is touched.
 184   BL xcorr_kernel_neon
 185   SUBS         r6, r6, #4             ; max_pitch -= 4; flags are used by the BGE below
 186   VST1.32      {q0}, [r2]!            ; store the four sums; xcorr += 4
 187   ; _y += 4
 188   ADD          r1, r1, #8
 189   VMAX.S32     q15, q15, q0           ; lane-wise running maximum
 190   ; if (at least 4 more sums remain) goto celt_pitch_xcorr_neon_process4
 191   BGE celt_pitch_xcorr_neon_process4
 192 ; We have less than 4 sums left to compute.
 193 celt_pitch_xcorr_neon_process4_done
 194   ADDS         r6, r6, #4             ; r6 = number of sums still to compute
 195   ; Reduce maxcorr to a single value
 196   VMAX.S32     d30, d30, d31
 197   VPMAX.S32    d30, d30, d30          ; both lanes of d30 now hold the overall maximum
 198   ; if (max_pitch <= 0) goto celt_pitch_xcorr_neon_done
 199   BLE celt_pitch_xcorr_neon_done
 200 ; Now compute each remaining sum one at a time.
 201 celt_pitch_xcorr_neon_process_remaining
 202   MOV          r4, r0
 203   MOV          r5, r1
 204   VMOV.I32     q0, #0
 205   SUBS         r12, r3, #8            ; j = len-8
 206   BLT celt_pitch_xcorr_neon_process_remaining4
 207 ; Sum terms 8 at a time.
 208 celt_pitch_xcorr_neon_process_remaining_loop8
 209   ; Load x[0...7]
 210   VLD1.16      {q1}, [r4]!
 211   ; Load y[0...7]
 212   VLD1.16      {q2}, [r5]!
 213   SUBS         r12, r12, #8
 214   VMLAL.S16    q0, d4, d2             ; four partial sums: += x[0...3]*y[0...3]
 215   VMLAL.S16    q0, d5, d3             ;                    += x[4...7]*y[4...7]
 216   BGE celt_pitch_xcorr_neon_process_remaining_loop8
 217 ; Sum terms 4 at a time.
 218 celt_pitch_xcorr_neon_process_remaining4
 219   ADDS         r12, r12, #4           ; j = (terms left) - 4
 220   BLT celt_pitch_xcorr_neon_process_remaining4_done
 221   ; Load x[0...3]
 222   VLD1.16      {d2}, [r4]!
 223   ; Load y[0...3]
 224   VLD1.16      {d3}, [r5]!
 225   SUB          r12, r12, #4
 226   VMLAL.S16    q0, d3, d2
 227 celt_pitch_xcorr_neon_process_remaining4_done
 228   ; Reduce the sum to a single value.
       ; After the pairwise-long add, d0 is one 64-bit total whose low word
       ; d0[0] is the 32-bit sum. Only d0[0] is stored and returned below, so
       ; whatever ends up in the other lanes does not matter.
 229   VADD.S32     d0, d0, d1
 230   VPADDL.S32   d0, d0
 231   ADDS         r12, r12, #4           ; j = terms left (0...3)
 232   BLE celt_pitch_xcorr_neon_process_remaining_loop_done
 233 ; Sum terms 1 at a time.
 234 celt_pitch_xcorr_neon_process_remaining_loop1
 235   VLD1.16      {d2[]}, [r4]!          ; d2 = *x++ in every lane
 236   VLD1.16      {d3[]}, [r5]!          ; d3 = *y++ in every lane
 237   SUBS         r12, r12, #1
 238   VMLAL.S16    q0, d2, d3             ; every lane += x*y; lane 0 is the one that counts
 239   BGT celt_pitch_xcorr_neon_process_remaining_loop1
 240 celt_pitch_xcorr_neon_process_remaining_loop_done
 241   VST1.32      {d0[0]}, [r2]!         ; *xcorr++ = sum
 242   VMAX.S32     d30, d30, d0           ; d30[0] = max(maxcorr, sum)
 243   SUBS         r6, r6, #1
 244   ; _y++
 245   ADD          r1, r1, #2
 246   ; if (--max_pitch > 0) goto celt_pitch_xcorr_neon_process_remaining
 247   BGT celt_pitch_xcorr_neon_process_remaining
 248 celt_pitch_xcorr_neon_done
 249   VMOV.32      r0, d30[0]             ; return maxcorr
 250   LDMFD        sp!, {r4-r6, pc}       ; restore r4-r6 and return
 251   ENDP
 252
 253 ENDIF
 254
 255 IF OPUS_ARM_MAY_HAVE_EDSP
 256
 257 ; This will get used on ARMv7 devices without NEON, so it has been optimized
 258 ; to take advantage of dual-issuing where possible.
 259 xcorr_kernel_edsp PROC
       ; Internal helper (this file does not EXPORT it). Accumulates four
       ; cross-correlation sums at once in r6...r9 using the 16x16 multiply-
       ; accumulate instructions:
       ;   sum[k] += x[j]*y[j+k], j=0...len-1, k=0...3
       ; x and y are read two samples per 32-bit LDR in the main loop.
 260   ; input:
 261   ;   r3      = int         len
 262   ;   r4      = opus_val16 *_x (must be 32-bit aligned)
 263   ;   r5      = opus_val16 *_y (must be 32-bit aligned)
 264   ;   r6...r9 = opus_val32  sum[4]
 265   ; output:
 266   ;   r6...r9 = opus_val32  sum[4]
 267   ; preserved: r0-r5
       ; clobbered: r10, r11, r12 and the flags. r14 is used as scratch; the
       ;            return address is saved on the stack and popped into pc.
 268   ; internal usage
 269   ;   r2      = int         j
 270   ;   r12,r14 = opus_val16  x[4]
 271   ;   r10,r11 = opus_val16  y[4]
       ; r2, r4 and r5 are saved so they can be restored on exit (see
       ; "preserved" above); lr is saved because r14 is reused for x values.
 272   STMFD        sp!, {r2,r4,r5,lr}
 273   LDR          r10, [r5], #4      ; Load y[0...1]
 274   SUBS         r2, r3, #4         ; j = len-4
 275   LDR          r11, [r5], #4      ; Load y[2...3]
 276   BLE xcorr_kernel_edsp_process4_done     ; len <= 4: tail code only
 277   LDR          r12, [r4], #4      ; Load x[0...1]
 278   ; Stall
 279 xcorr_kernel_edsp_process4
 280   ; The multiplies must issue from pipeline 0, and can't dual-issue with each
 281   ; other. Every other instruction here dual-issues with a multiply, and is
 282   ; thus "free". There should be no stalls in the body of the loop.
       ; Each iteration consumes four x samples. It runs while more than four
       ; samples remain; the flags from the SUBS below feed both the LDRGT and
       ; the closing BGT.
 283   SMLABB       r6, r12, r10, r6   ; sum[0] = MAC16_16(sum[0],x_0,y_0)
 284   LDR          r14, [r4], #4      ; Load x[2...3]
 285   SMLABT       r7, r12, r10, r7   ; sum[1] = MAC16_16(sum[1],x_0,y_1)
 286   SUBS         r2, r2, #4         ; j-=4
 287   SMLABB       r8, r12, r11, r8   ; sum[2] = MAC16_16(sum[2],x_0,y_2)
 288   SMLABT       r9, r12, r11, r9   ; sum[3] = MAC16_16(sum[3],x_0,y_3)
 289   SMLATT       r6, r12, r10, r6   ; sum[0] = MAC16_16(sum[0],x_1,y_1)
 290   LDR          r10, [r5], #4      ; Load y[4...5]
 291   SMLATB       r7, r12, r11, r7   ; sum[1] = MAC16_16(sum[1],x_1,y_2)
 292   SMLATT       r8, r12, r11, r8   ; sum[2] = MAC16_16(sum[2],x_1,y_3)
 293   SMLATB       r9, r12, r10, r9   ; sum[3] = MAC16_16(sum[3],x_1,y_4)
 294   LDRGT        r12, [r4], #4      ; Load x[0...1] for the next iteration
 295   SMLABB       r6, r14, r11, r6   ; sum[0] = MAC16_16(sum[0],x_2,y_2)
 296   SMLABT       r7, r14, r11, r7   ; sum[1] = MAC16_16(sum[1],x_2,y_3)
 297   SMLABB       r8, r14, r10, r8   ; sum[2] = MAC16_16(sum[2],x_2,y_4)
 298   SMLABT       r9, r14, r10, r9   ; sum[3] = MAC16_16(sum[3],x_2,y_5)
 299   SMLATT       r6, r14, r11, r6   ; sum[0] = MAC16_16(sum[0],x_3,y_3)
 300   LDR          r11, [r5], #4      ; Load y[6...7]
 301   SMLATB       r7, r14, r10, r7   ; sum[1] = MAC16_16(sum[1],x_3,y_4)
 302   SMLATT       r8, r14, r10, r8   ; sum[2] = MAC16_16(sum[2],x_3,y_5)
 303   SMLATB       r9, r14, r11, r9   ; sum[3] = MAC16_16(sum[3],x_3,y_6)
 304   BGT xcorr_kernel_edsp_process4
       ; Tail: up to four more samples, handled one at a time. r10/r11 hold the
       ; four y values that line up with the next x sample.
 305 xcorr_kernel_edsp_process4_done
 306   ADDS         r2, r2, #4         ; j = samples left
 307   BLE xcorr_kernel_edsp_done
 308   LDRH         r12, [r4], #2      ; r12 = *x++
 309   SUBS         r2, r2, #1         ; j--
 310   ; Stall
 311   SMLABB       r6, r12, r10, r6   ; sum[0] = MAC16_16(sum[0],x,y_0)
 312   LDRHGT       r14, [r4], #2      ; r14 = *x++ (only if a 2nd sample remains)
 313   SMLABT       r7, r12, r10, r7   ; sum[1] = MAC16_16(sum[1],x,y_1)
 314   SMLABB       r8, r12, r11, r8   ; sum[2] = MAC16_16(sum[2],x,y_2)
 315   SMLABT       r9, r12, r11, r9   ; sum[3] = MAC16_16(sum[3],x,y_3)
 316   BLE xcorr_kernel_edsp_done
 317   SMLABT       r6, r14, r10, r6   ; sum[0] = MAC16_16(sum[0],x,y_1)
 318   SUBS         r2, r2, #1         ; j--
 319   SMLABB       r7, r14, r11, r7   ; sum[1] = MAC16_16(sum[1],x,y_2)
 320   LDRH         r10, [r5], #2      ; r10 = y_4 = *y++
 321   SMLABT       r8, r14, r11, r8   ; sum[2] = MAC16_16(sum[2],x,y_3)
 322   LDRHGT       r12, [r4], #2      ; r12 = *x++ (only if a 3rd sample remains)
 323   SMLABB       r9, r14, r10, r9   ; sum[3] = MAC16_16(sum[3],x,y_4)
 324   BLE xcorr_kernel_edsp_done
 325   SMLABB       r6, r12, r11, r6   ; sum[0] = MAC16_16(sum[0],tmp,y_2)
 326   CMP          r2, #1             ; GT if a 4th sample remains (r2 is reused as y_5 below)
 327   SMLABT       r7, r12, r11, r7   ; sum[1] = MAC16_16(sum[1],tmp,y_3)
 328   LDRH         r2, [r5], #2       ; r2 = y_5 = *y++
 329   SMLABB       r8, r12, r10, r8   ; sum[2] = MAC16_16(sum[2],tmp,y_4)
 330   LDRHGT       r14, [r4]          ; r14 = *x
 331   SMLABB       r9, r12, r2, r9    ; sum[3] = MAC16_16(sum[3],tmp,y_5)
 332   BLE xcorr_kernel_edsp_done
 333   SMLABT       r6, r14, r11, r6   ; sum[0] = MAC16_16(sum[0],tmp,y_3)
 334   LDRH         r11, [r5]          ; r11 = y_6 = *y
 335   SMLABB       r7, r14, r10, r7   ; sum[1] = MAC16_16(sum[1],tmp,y_4)
 336   SMLABB       r8, r14, r2, r8    ; sum[2] = MAC16_16(sum[2],tmp,y_5)
 337   SMLABB       r9, r14, r11, r9   ; sum[3] = MAC16_16(sum[3],tmp,y_6)
 338 xcorr_kernel_edsp_done
       ; Restore the caller's r2, r4 and r5 and return.
 339   LDMFD        sp!, {r2,r4,r5,pc}
 340   ENDP
 341
 342 celt_pitch_xcorr_edsp PROC
       ; Stores xcorr[i] = sum(x[j]*y[j+i], j=0...len-1) for i=0...max_pitch-1
       ; and returns the largest stored value, or 1 if none exceeds 1 (maxcorr
       ; starts at 1). Stages, in order:
       ;   1. if _y is not 32-bit aligned, one sum computed on its own so that
       ;      _y+1 is aligned for everything that follows;
       ;   2. groups of four sums via xcorr_kernel_edsp;
       ;   3. one group of two sums, if at least two remain;
       ;   4. one final single sum, if one remains.
 343   ; input:
 344   ;   r0  = opus_val16 *_x (must be 32-bit aligned)
 345   ;   r1  = opus_val16 *_y (only needs to be 16-bit aligned)
 346   ;   r2  = opus_val32 *xcorr
 347   ;   r3  = int         len
       ;   [sp] = int        max_pitch (fifth argument, passed on the stack)
 348   ; output:
 349   ;   r0  = maxcorr
 350   ; internal usage
 351   ;   r4  = opus_val16 *x
 352   ;   r5  = opus_val16 *y
 353   ;   r6  = opus_val32  sum0
 354   ;   r7  = opus_val32  sum1
 355   ;   r8  = opus_val32  sum2
 356   ;   r9  = opus_val32  sum3
 357   ;   r1  = int         max_pitch
 358   ;   r12 = int         j
 359   STMFD        sp!, {r4-r11, lr}      ; push 9 words (36 bytes)
 360   MOV          r5, r1                 ; y = _y
 361   LDR          r1, [sp, #36]          ; r1 = max_pitch, just above the saved registers
 362   MOV          r4, r0                 ; x = _x
 363   TST          r5, #3                 ; is y 32-bit aligned?
 364   ; maxcorr = 1
 365   MOV          r0, #1
 366   BEQ          celt_pitch_xcorr_edsp_process1u_done
 367 ; Compute one sum at the start to make y 32-bit aligned.
 368   SUBS         r12, r3, #4            ; j = len-4
 369   ; r14 = sum = 0
 370   MOV          r14, #0
 371   LDRH         r8, [r5], #2           ; r8 = y_0 (bottom half); y is now 32-bit aligned
 372   BLE celt_pitch_xcorr_edsp_process1u_loop4_done
 373   LDR          r6, [r4], #4           ; r6 = x_1|x_0
 374   MOV          r8, r8, LSL #16        ; move y_0 to the top half, where the loop reads it
 375 celt_pitch_xcorr_edsp_process1u_loop4
 376   LDR          r9, [r5], #4           ; r9 = y_2|y_1
 377   SMLABT       r14, r6, r8, r14     ; sum = MAC16_16(sum, x_0, y_0)
 378   LDR          r7, [r4], #4           ; r7 = x_3|x_2
 379   SMLATB       r14, r6, r9, r14     ; sum = MAC16_16(sum, x_1, y_1)
 380   LDR          r8, [r5], #4           ; r8 = y_4|y_3
 381   SMLABT       r14, r7, r9, r14     ; sum = MAC16_16(sum, x_2, y_2)
 382   SUBS         r12, r12, #4         ; j-=4
 383   SMLATB       r14, r7, r8, r14     ; sum = MAC16_16(sum, x_3, y_3)
 384   LDRGT        r6, [r4], #4           ; next x_1|x_0 if looping
 385   BGT celt_pitch_xcorr_edsp_process1u_loop4
 386   MOV          r8, r8, LSR #16        ; bring the look-ahead y_4 down to the bottom half
 387 celt_pitch_xcorr_edsp_process1u_loop4_done
 388   ADDS         r12, r12, #4           ; j = samples left
       ; One sample per pass; r8 (bottom half) always holds the y value that
       ; pairs with the next x. Every instruction is conditional so that the
       ; flags from ADDS/SUBSGE steer the whole body.
 389 celt_pitch_xcorr_edsp_process1u_loop1
 390   LDRHGE       r6, [r4], #2           ; r6 = *x++
 391   ; Stall
 392   SMLABBGE     r14, r6, r8, r14    ; sum = MAC16_16(sum, *x, *y)
 393   SUBSGE       r12, r12, #1           ; j--
 394   LDRHGT       r8, [r5], #2           ; r8 = *y++ (only if another sample follows)
 395   BGT celt_pitch_xcorr_edsp_process1u_loop1
       ; Both pointers have advanced by len samples (len*2 bytes) at this point.
 396   ; Restore _x
 397   SUB          r4, r4, r3, LSL #1
 398   ; Restore and advance _y
 399   SUB          r5, r5, r3, LSL #1
 400   ; maxcorr = max(maxcorr, sum)
 401   CMP          r0, r14
 402   ADD          r5, r5, #2             ; _y++ (ADD leaves the CMP flags intact)
 403   MOVLT        r0, r14
 404   SUBS         r1, r1, #1             ; max_pitch--
 405   ; xcorr[i] = sum
 406   STR          r14, [r2], #4
 407   BLE celt_pitch_xcorr_edsp_done      ; that was the only sum requested
 408 celt_pitch_xcorr_edsp_process1u_done
 409   ; if (max_pitch < 4) goto celt_pitch_xcorr_edsp_process2
 410   SUBS         r1, r1, #4
 411   BLT celt_pitch_xcorr_edsp_process2
 412 celt_pitch_xcorr_edsp_process4
 413   ; xcorr_kernel_edsp parameters:
 414   ; r3 = len, r4 = _x, r5 = _y, r6...r9 = sum[4] = {0, 0, 0, 0}
 415   MOV          r6, #0
 416   MOV          r7, #0
 417   MOV          r8, #0
 418   MOV          r9, #0
       ; The kernel restores r2, r4 and r5 before returning, so x and y still
       ; point at the start of their windows afterwards.
 419   BL xcorr_kernel_edsp  ; xcorr_kernel_edsp(_x, _y+i, xcorr+i, len)
 420   ; maxcorr = max(maxcorr, sum0, sum1, sum2, sum3)
 421   CMP          r0, r6
 422   ; _y+=4
 423   ADD          r5, r5, #8
 424   MOVLT        r0, r6
 425   CMP          r0, r7
 426   MOVLT        r0, r7
 427   CMP          r0, r8
 428   MOVLT        r0, r8
 429   CMP          r0, r9
 430   MOVLT        r0, r9
 431   STMIA        r2!, {r6-r9}           ; xcorr[i...i+3] = sum0...sum3
 432   SUBS         r1, r1, #4             ; max_pitch -= 4
 433   BGE celt_pitch_xcorr_edsp_process4
       ; Fewer than four sums remain. Compute two of them together if possible.
 434 celt_pitch_xcorr_edsp_process2
 435   ADDS         r1, r1, #2             ; r1 = (sums left) - 2
 436   BLT celt_pitch_xcorr_edsp_process1a ; fewer than two sums left
 437   SUBS         r12, r3, #4            ; j = len-4
 438   ; {r10, r11} = {sum0, sum1} = {0, 0}
 439   MOV          r10, #0
 440   MOV          r11, #0
 441   LDR          r8, [r5], #4           ; r8 = y_1|y_0
 442   BLE celt_pitch_xcorr_edsp_process2_loop_done
 443   LDR          r6, [r4], #4           ; r6 = x_1|x_0
 444   LDR          r9, [r5], #4           ; r9 = y_3|y_2
 445 celt_pitch_xcorr_edsp_process2_loop4
 446   SMLABB       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_0, y_0)
 447   LDR          r7, [r4], #4           ; r7 = x_3|x_2
 448   SMLABT       r11, r6, r8, r11     ; sum1 = MAC16_16(sum1, x_0, y_1)
 449   SUBS         r12, r12, #4         ; j-=4
 450   SMLATT       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_1, y_1)
 451   LDR          r8, [r5], #4           ; r8 = y_5|y_4
 452   SMLATB       r11, r6, r9, r11     ; sum1 = MAC16_16(sum1, x_1, y_2)
 453   LDRGT        r6, [r4], #4           ; next x_1|x_0 if looping
 454   SMLABB       r10, r7, r9, r10     ; sum0 = MAC16_16(sum0, x_2, y_2)
 455   SMLABT       r11, r7, r9, r11     ; sum1 = MAC16_16(sum1, x_2, y_3)
 456   SMLATT       r10, r7, r9, r10     ; sum0 = MAC16_16(sum0, x_3, y_3)
 457   LDRGT        r9, [r5], #4           ; next y_3|y_2 if looping
 458   SMLATB       r11, r7, r8, r11     ; sum1 = MAC16_16(sum1, x_3, y_4)
 459   BGT celt_pitch_xcorr_edsp_process2_loop4
       ; Tail of the two-sum pass: r8 holds y_1|y_0 for the next x sample.
 460 celt_pitch_xcorr_edsp_process2_loop_done
 461   ADDS         r12, r12, #2           ; j = (samples left) - 2
 462   BLE  celt_pitch_xcorr_edsp_process2_1       ; 2 or fewer left
 463   LDR          r6, [r4], #4           ; r6 = x_1|x_0
 464   ; Stall
 465   SMLABB       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_0, y_0)
 466   LDR          r9, [r5], #4           ; r9 = y_3|y_2
 467   SMLABT       r11, r6, r8, r11     ; sum1 = MAC16_16(sum1, x_0, y_1)
 468   SUB          r12, r12, #2
 469   SMLATT       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_1, y_1)
 470   MOV          r8, r9                 ; y_3|y_2 becomes the next y_1|y_0
 471   SMLATB       r11, r6, r9, r11     ; sum1 = MAC16_16(sum1, x_1, y_2)
 472 celt_pitch_xcorr_edsp_process2_1
 473   LDRH         r6, [r4], #2           ; r6 = *x++
 474   ADDS         r12, r12, #1           ; j = (samples left) - 1
 475   ; Stall
 476   SMLABB       r10, r6, r8, r10     ; sum0 = MAC16_16(sum0, x_0, y_0)
 477   LDRHGT       r7, [r4], #2           ; r7 = *x++ (only if one more sample remains)
 478   SMLABT       r11, r6, r8, r11     ; sum1 = MAC16_16(sum1, x_0, y_1)
 479   BLE celt_pitch_xcorr_edsp_process2_done
 480   LDRH         r9, [r5], #2           ; r9 = y_2
 481   SMLABT       r10, r7, r8, r10     ; sum0 = MAC16_16(sum0, x_1, y_1)
 482   SMLABB       r11, r7, r9, r11     ; sum1 = MAC16_16(sum1, x_1, y_2)
 483 celt_pitch_xcorr_edsp_process2_done
       ; x has advanced by len samples and y by len+1 (one read ahead), so the
       ; SUB/ADD pair below leaves y two samples past where this pass started.
 484   ; Restore _x
 485   SUB          r4, r4, r3, LSL #1
 486   ; Restore and advance _y
 487   SUB          r5, r5, r3, LSL #1
 488   ; maxcorr = max(maxcorr, sum0)
 489   CMP          r0, r10
 490   ADD          r5, r5, #2
 491   MOVLT        r0, r10
 492   SUB          r1, r1, #2             ; two sums done; r1 = (sums left) - 2 again
 493   ; maxcorr = max(maxcorr, sum1)
 494   CMP          r0, r11
 495   ; xcorr[i] = sum
 496   STR          r10, [r2], #4
 497   MOVLT        r0, r11
 498   STR          r11, [r2], #4
       ; At most one sum remains; here y is 32-bit aligned, so word loads are used.
 499 celt_pitch_xcorr_edsp_process1a
 500   ADDS         r1, r1, #1             ; r1 = (sums left) - 1
 501   BLT celt_pitch_xcorr_edsp_done      ; nothing left
 502   SUBS         r12, r3, #4            ; j = len-4
 503   ; r14 = sum = 0
 504   MOV          r14, #0
 505   BLT celt_pitch_xcorr_edsp_process1a_loop_done
 506   LDR          r6, [r4], #4           ; r6 = x_1|x_0
 507   LDR          r8, [r5], #4           ; r8 = y_1|y_0
 508   LDR          r7, [r4], #4           ; r7 = x_3|x_2
 509   LDR          r9, [r5], #4           ; r9 = y_3|y_2
 510 celt_pitch_xcorr_edsp_process1a_loop4
 511   SMLABB       r14, r6, r8, r14     ; sum = MAC16_16(sum, x_0, y_0)
 512   SUBS         r12, r12, #4         ; j-=4
 513   SMLATT       r14, r6, r8, r14     ; sum = MAC16_16(sum, x_1, y_1)
 514   LDRGE        r6, [r4], #4
 515   SMLABB       r14, r7, r9, r14     ; sum = MAC16_16(sum, x_2, y_2)
 516   LDRGE        r8, [r5], #4
 517   SMLATT       r14, r7, r9, r14     ; sum = MAC16_16(sum, x_3, y_3)
 518   LDRGE        r7, [r4], #4
 519   LDRGE        r9, [r5], #4
 520   BGE celt_pitch_xcorr_edsp_process1a_loop4
 521 celt_pitch_xcorr_edsp_process1a_loop_done
 522   ADDS         r12, r12, #2           ; GE if at least two samples remain
 523   LDRGE        r6, [r4], #4
 524   LDRGE        r8, [r5], #4
 525   ; Stall
 526   SMLABBGE     r14, r6, r8, r14     ; sum = MAC16_16(sum, x_0, y_0)
 527   SUBGE        r12, r12, #2
 528   SMLATTGE     r14, r6, r8, r14     ; sum = MAC16_16(sum, x_1, y_1)
 529   ADDS         r12, r12, #1           ; GE if one last sample remains
 530   LDRHGE       r6, [r4], #2
 531   LDRHGE       r8, [r5], #2
 532   ; Stall
 533   SMLABBGE     r14, r6, r8, r14     ; sum = MAC16_16(sum, *x, *y)
 534   ; maxcorr = max(maxcorr, sum)
 535   CMP          r0, r14
 536   ; xcorr[i] = sum
 537   STR          r14, [r2], #4
 538   MOVLT        r0, r14
 539 celt_pitch_xcorr_edsp_done
 540   LDMFD        sp!, {r4-r11, pc}      ; restore r4-r11 and return maxcorr in r0
 541   ENDP
 542
 543 ENDIF
 544
 545 END