1 ; libFLAC - Free Lossless Audio Codec library
2 ; Copyright (C) 2004,2005,2006,2007 Josh Coalson
4 ; Redistribution and use in source and binary forms, with or without
5 ; modification, are permitted provided that the following conditions
8 ; - Redistributions of source code must retain the above copyright
9 ; notice, this list of conditions and the following disclaimer.
11 ; - Redistributions in binary form must reproduce the above copyright
12 ; notice, this list of conditions and the following disclaimer in the
13 ; documentation and/or other materials provided with the distribution.
15 ; - Neither the name of the Xiph.org Foundation nor the names of its
16 ; contributors may be used to endorse or promote products derived from
17 ; this software without specific prior written permission.
19 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 ; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
23 ; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24 ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25 ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
26 ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
27 ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
28 ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
29 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
; FLAC__lpc_restore_signal_asm_ppc_altivec_16 -- AltiVec LPC synthesis filter,
; the assembly counterpart of src/libFLAC/lpc.c:FLAC__lpc_restore_signal().
;
; NOTE(review): this listing is an excerpt. The number at the start of each
; line is the original file's line number, and several runs of instructions
; (function prologue/stack setup, the coefficient/history vector loads, the
; unrolled inner-loop bodies at L1300-L1307, the loop branches and the final
; store/blr) are not visible here. Do not assemble this fragment as-is.
;
; Register roles, as far as the visible code shows:
;   r3  = residual pointer (see the "*residual" lvewx load in the loop below)
;   r4  = data + data_len, the end-of-output sentinel for the cmplw tests
;   r8  = current output position, the "i" cursor compared against r4
;   r9  = quadword-aligned stack slot; lp_quantization is loaded from it
;   r0  = caller's vrsave, restored on exit
;   r5  = qlp_coeff pointer, r6 = order in bytes -- presumably; confirm
;         against the C prototype in lpc.c, the prologue is not shown here
33 .globl _FLAC__lpc_restore_signal_asm_ppc_altivec_16
35 .globl _FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8
37 _FLAC__lpc_restore_signal_asm_ppc_altivec_16:
45 ; see src/libFLAC/lpc.c:FLAC__lpc_restore_signal()
46 ; this is a PowerPC/Altivec assembly version which requires bps<=16 (or actual
47 ; bps<=15 for mid-side coding, since that uses an extra bit)
49 ; this should be fast; the inner loop is unrolled (it takes no more than
50 ; 3*(order%4) instructions, all of which are arithmetic), and all of the
51 ; coefficients and all relevant history stay in registers, so the outer loop
52 ; has only one load from memory (the residual)
54 ; I have not yet run this through simg4, so there may be some avoidable stalls,
55 ; and there may be a somewhat more clever way to do the outer loop
57 ; the branch mechanism may prevent dynamic loading; I still need to examine
58 ; this issue, and there may be a more elegant method
64 andc r9,r9,r31 ; for quadword-aligned stack data
66 slwi r6,r6,2 ; adjust for word size
68 add r4,r4,r8 ; r4 = data+data_len
70 mfspr r0,256 ; cache old vrsave
; declare v0-v21 live in VRSAVE so the OS preserves them across context
; switches (0xfffffc00 = bits for VRs 0..21, MSB-first)
71 addis r31,0,hi16(0xfffffc00)
72 ori r31,r31,lo16(0xfffffc00)
73 mtspr 256,r31 ; declare VRs in vrsave
; early i<data_len test; the branch that consumes cr0 is not in this excerpt
75 cmplw cr0,r8,r4 ; i<data_len
78 ; load coefficients into v0-v7 and initial history into v8-v15
; compute a bit count for vsro from the data pointer's word misalignment
80 and r31,r8,r31 ; r31: data%4
82 subf r31,r31,r11 ; r31: 4-(data%4)
83 slwi r31,r31,3 ; convert to bits for vsro
88 vsro v18,v18,v0 ; v18: mask vector
98 vsel v0,v1,v0,v2 ; v0: reversal permutation vector
; the coefficients are applied newest-history-first, so the alignment
; permute from lvsl is composed with the byte-reversal permute in v0
101 lvsl v17,0,r5 ; v17: coefficient alignment permutation vector
102 vperm v17,v17,v17,v0 ; v17: reversal coefficient alignment permutation vector
105 lvsl v16,0,r11 ; v16: history alignment permutation vector
; each addis/ori pair below materializes the address of one unrolled-order
; entry point (L1300..L1307); the targets and the indirect branch through
; them are outside this excerpt
118 addis r31,0,hi16(L1307)
119 ori r31,r31,lo16(L1307)
132 addis r31,0,hi16(L1306)
133 ori r31,r31,lo16(L1306)
; merge adjacent history quadwords through the alignment permute v16
142 vperm v10,v11,v10,v16
146 addis r31,0,hi16(L1305)
147 ori r31,r31,lo16(L1305)
156 vperm v11,v12,v11,v16
160 addis r31,0,hi16(L1304)
161 ori r31,r31,lo16(L1304)
170 vperm v12,v13,v12,v16
174 addis r31,0,hi16(L1303)
175 ori r31,r31,lo16(L1303)
184 vperm v13,v14,v13,v16
188 addis r31,0,hi16(L1302)
189 ori r31,r31,lo16(L1302)
198 vperm v14,v15,v14,v16
202 addis r31,0,hi16(L1301)
203 ori r31,r31,lo16(L1301)
212 vperm v15,v19,v15,v16
214 addis r31,0,hi16(L1300)
215 ori r31,r31,lo16(L1300)
220 ; set up invariant vectors
221 vspltish v16,0 ; v16: zero vector
224 lvsr v17,r10,r8 ; v17: result shift vector
225 lvsl v18,r10,r3 ; v18: residual shift back vector
229 lvewx v19,r10,r9 ; v19: lp_quantization vector
; ---- outer loop body (one decoded sample per iteration) ----
; start the dot product: multiply odd 16-bit coefficient/history lanes;
; the vmulesh/vmsum accumulation steps are not visible in this excerpt
232 vmulosh v20,v0,v8 ; v20: sum vector
237 vsldoi v15,v15,v14,4 ; increment history
271 vsumsws v20,v20,v16 ; v20[3]: sum
272 vsraw v20,v20,v19 ; v20[3]: sum >> lp_quantization
274 lvewx v21,0,r3 ; v21[n]: *residual
275 vperm v21,v21,v21,v18 ; v21[3]: *residual
276 vaddsws v20,v21,v20 ; v20[3]: *residual + (sum >> lp_quantization)
277 vsldoi v18,v18,v18,4 ; increment shift vector
279 vperm v21,v20,v20,v17 ; v21[n]: shift for storage
280 vsldoi v17,v17,v17,12 ; increment shift vector
; rotate the new sample so it lands in the lane inserted into v8 below
283 vsldoi v20,v20,v20,12
284 vsldoi v8,v8,v20,4 ; insert value onto history
; loop-back test; the conditional branch and the store of the decoded
; sample are outside this excerpt
288 cmplw cr0,r8,r4 ; i<data_len
292 mtspr 256,r0 ; restore old vrsave
; FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8 -- specialization of the
; routine above for order<=8: coefficients fit in v0-v1 and history in v2-v3,
; so only VRs 0..9 need to be claimed in VRSAVE.
;
; NOTE(review): like the routine above, this listing is an excerpt (original
; line numbers are embedded at the start of each line); the prologue, the
; coefficient/history loads, the L2300/L2301 unrolled entries, the loop
; branches and the final store/blr are not visible here.
296 _FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8:
301 ; r7: lp_quantization
304 ; see _FLAC__lpc_restore_signal_asm_ppc_altivec_16() above
305 ; this version assumes order<=8; it uses fewer vector registers, which should
306 ; save time in context switches, and has less code, which may improve
307 ; instruction caching
313 andc r9,r9,r31 ; for quadword-aligned stack data
315 slwi r6,r6,2 ; adjust for word size
317 add r4,r4,r8 ; r4 = data+data_len
319 mfspr r0,256 ; cache old vrsave
; declare v0-v9 live in VRSAVE (0xffc00000 = bits for VRs 0..9, MSB-first)
320 addis r31,0,hi16(0xffc00000)
321 ori r31,r31,lo16(0xffc00000)
322 mtspr 256,r31 ; declare VRs in vrsave
; early i<data_len test; the branch that consumes cr0 is not in this excerpt
324 cmplw cr0,r8,r4 ; i<data_len
327 ; load coefficients into v0-v1 and initial history into v2-v3
; compute a bit count for vsro from the data pointer's word misalignment
329 and r31,r8,r31 ; r31: data%4
331 subf r31,r31,r11 ; r31: 4-(data%4)
332 slwi r31,r31,3 ; convert to bits for vsro
337 vsro v6,v6,v0 ; v6: mask vector
347 vsel v0,v1,v0,v2 ; v0: reversal permutation vector
; compose the lvsl alignment permute with the byte-reversal permute so the
; coefficients are applied newest-history-first
350 lvsl v5,0,r5 ; v5: coefficient alignment permutation vector
351 vperm v5,v5,v5,v0 ; v5: reversal coefficient alignment permutation vector
354 lvsl v4,0,r11 ; v4: history alignment permutation vector
; materialize the addresses of the two unrolled-order entries (targets and
; the indirect branch through them are outside this excerpt)
367 addis r31,0,hi16(L2301)
368 ori r31,r31,lo16(L2301)
379 addis r31,0,hi16(L2300)
380 ori r31,r31,lo16(L2300)
385 ; set up invariant vectors
386 vspltish v4,0 ; v4: zero vector
389 lvsr v5,r10,r8 ; v5: result shift vector
390 lvsl v6,r10,r3 ; v6: residual shift back vector
394 lvewx v7,r10,r9 ; v7: lp_quantization vector
; ---- outer loop body (one decoded sample per iteration) ----
; start the dot product: odd 16-bit lanes of coeffs x history; the matching
; vmulesh/accumulate steps are not visible in this excerpt
397 vmulosh v8,v0,v2 ; v8: sum vector
406 vsumsws v8,v8,v4 ; v8[3]: sum
407 vsraw v8,v8,v7 ; v8[3]: sum >> lp_quantization
409 lvewx v9,0,r3 ; v9[n]: *residual
410 vperm v9,v9,v9,v6 ; v9[3]: *residual
411 vaddsws v8,v9,v8 ; v8[3]: *residual + (sum >> lp_quantization)
412 vsldoi v6,v6,v6,4 ; increment shift vector
414 vperm v9,v8,v8,v5 ; v9[n]: shift for storage
415 vsldoi v5,v5,v5,12 ; increment shift vector
419 vsldoi v2,v2,v8,4 ; insert value onto history
; loop-back test; the conditional branch and the store of the decoded
; sample are outside this excerpt
423 cmplw cr0,r8,r4 ; i<data_len
427 mtspr 256,r0 ; restore old vrsave