; vim:filetype=nasm ts=8 ; libFLAC - Free Lossless Audio Codec library ; Copyright (C) 2001,2002,2003,2004,2005,2006,2007,2008,2009 Josh Coalson ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions ; are met: ; ; - Redistributions of source code must retain the above copyright ; notice, this list of conditions and the following disclaimer. ; ; - Redistributions in binary form must reproduce the above copyright ; notice, this list of conditions and the following disclaimer in the ; documentation and/or other materials provided with the distribution. ; ; - Neither the name of the Xiph.org Foundation nor the names of its ; contributors may be used to endorse or promote products derived from ; this software without specific prior written permission. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR ; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
%include "nasm.h"

	data_section

cglobal precompute_partition_info_sums_32bit_asm_ia32_

	code_section

; **********************************************************************
;
; void precompute_partition_info_sums_32bit_(
;	const FLAC__int32 residual[],
;	FLAC__uint64 abs_residual_partition_sums[],
;	unsigned blocksize,
;	unsigned predictor_order,
;	unsigned min_partition_order,
;	unsigned max_partition_order
; )
;
; Syntax: NASM (Intel operand order), 32-bit x86.
; Args:   all six arguments are read from the stack (see offsets below).
; Saves:  ebp, ebx, esi, edi are pushed on entry and popped before ret.
; Uses:   eax, ecx, edx as scratch; 8 bytes of stack for two locals.
;
; Phase 1 sums abs(residual[i]) for each of the (1 << max_partition_order)
; partitions and stores each sum as a 64-bit entry whose high dword is
; written as zero.  The running sum is kept in a single 32-bit register,
; so a carry out of bit 31 is not detected here.
;
; Phase 2 walks partition_order from max_partition_order - 1 down to
; min_partition_order; for every order it writes (1 << partition_order)
; new 64-bit entries, each the add/adc sum of two consecutive earlier
; entries, directly after the entries produced so far.
;
; Both phase-1 loops test their condition at the bottom, so the body of
; each executes at least once regardless of the bounds.
;

; Stack offsets, valid after the prologue (4 pushes + 8 bytes of locals).
%define PPIS_PARTITIONS		0	; local: 1u << max_partition_order
%define PPIS_PART_SAMPLES	4	; local: blocksize >> max_partition_order
%define PPIS_RESIDUAL		28	; arg:   const FLAC__int32 residual[]
%define PPIS_SUMS		32	; arg:   FLAC__uint64 abs_residual_partition_sums[]
%define PPIS_BLOCKSIZE		36	; arg:   unsigned blocksize
%define PPIS_PRED_ORDER		40	; arg:   unsigned predictor_order
%define PPIS_MIN_ORDER		44	; arg:   unsigned min_partition_order
%define PPIS_MAX_ORDER		48	; arg:   unsigned max_partition_order

	ALIGN 16
cident precompute_partition_info_sums_32bit_asm_ia32_

	; ---- prologue: on entry the first argument is at [esp + 4] ----
	push	ebp
	push	ebx
	push	esi
	push	edi
	sub	esp, 8				; room for the two locals

	; ---- derive the two locals from max_partition_order ----
	mov	ecx, [esp + PPIS_MAX_ORDER]
	mov	eax, 1
	shl	eax, cl
	mov	[esp + PPIS_PARTITIONS], eax	; partitions = 1u << max_partition_order
	mov	eax, [esp + PPIS_BLOCKSIZE]
	shr	eax, cl
	mov	[esp + PPIS_PART_SAMPLES], eax	; default_partition_samples = blocksize >> max_partition_order

	;
	; Phase 1: per-partition sums at max_partition_order
	;
	;   ebp = residual[]            esi = residual_sample
	;   edi = end                   ecx = partition
	;   ebx = abs_residual_partition_sum
	;
	mov	edi, eax			; eax still holds default_partition_samples
	sub	edi, [esp + PPIS_PRED_ORDER]	; end = default_partition_samples - predictor_order
	mov	ebp, [esp + PPIS_RESIDUAL]
	xor	ebx, ebx			; abs_residual_partition_sum = 0
	xor	ecx, ecx			; partition = 0
	xor	esi, esi			; residual_sample = 0

	; 'end' and the sum are set up above and refreshed at the bottom of the
	; outer loop, so both loop heads can share this one aligned address.
	ALIGN 16
.partition_loop:				; do {
.sample_loop:					;   do {
	mov	eax, [ebp + esi * 4]
	cdq					;     edx = sign mask of eax
	xor	eax, edx
	sub	eax, edx			;     eax = abs(residual[residual_sample])
	add	ebx, eax			;     abs_residual_partition_sum += eax
	;@@@@@@ check overflow flag and abort here?
	add	esi, byte 1
	cmp	esi, edi
	jb	.sample_loop			;   } while(++residual_sample < end);

	mov	eax, [esp + PPIS_SUMS]
	mov	[eax + ecx * 8], ebx		;   low dword  = abs_residual_partition_sum
	mov	dword [eax + ecx * 8 + 4], 0	;   high dword = 0
	add	edi, [esp + PPIS_PART_SAMPLES]	;   end += default_partition_samples
	xor	ebx, ebx			;   abs_residual_partition_sum = 0
	add	ecx, byte 1
	cmp	ecx, [esp + PPIS_PARTITIONS]
	jb	.partition_loop			; } while(++partition < partitions);

	;
	; Phase 2: merge pairs of sums for the lower partition orders
	;
	;   esi = &sums[from_partition]   (starts at entry 0)
	;   edi = &sums[to_partition]     (starts at entry 'partitions')
	;   ecx = partition_order (signed)
	;
	mov	esi, [esp + PPIS_SUMS]
	mov	eax, [esp + PPIS_PARTITIONS]
	lea	edi, [esi + eax * 8]
	mov	ecx, [esp + PPIS_MAX_ORDER]
	sub	ecx, byte 1			; partition_order = (int)max_partition_order - 1

	ALIGN 16
.order_loop:					; while(partition_order >= (int)min_partition_order) {
	cmp	ecx, [esp + PPIS_MIN_ORDER]
	jl	.done				;   signed compare: partition_order may be -1
	mov	edx, 1
	shl	edx, cl				;   edx = 1u << partition_order (entries to write)

	ALIGN 16
.merge_loop:					;   do {
	mov	eax, [esi]
	mov	ebx, [esi + 4]
	add	eax, [esi + 8]
	adc	ebx, [esi + 12]			;     64-bit add of two adjacent entries
	mov	[edi], eax
	mov	[edi + 4], ebx			;     sums[to] = sums[from] + sums[from + 1]
	add	esi, byte 16			;     from_partition += 2
	add	edi, byte 8			;     to_partition   += 1
	sub	edx, byte 1
	jnz	.merge_loop			;   } while(--edx);

	sub	ecx, byte 1			;   partition_order--
	jmp	.order_loop			; }

.done:
	; ---- epilogue: drop locals, restore saved registers ----
	add	esp, 8
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret

end