1 ; vim:filetype=nasm ts=8
3 ; libFLAC - Free Lossless Audio Codec library
4 ; Copyright (C) 2001,2002,2003,2004,2005,2006,2007 Josh Coalson
6 ; Redistribution and use in source and binary forms, with or without
7 ; modification, are permitted provided that the following conditions
10 ; - Redistributions of source code must retain the above copyright
11 ; notice, this list of conditions and the following disclaimer.
13 ; - Redistributions in binary form must reproduce the above copyright
14 ; notice, this list of conditions and the following disclaimer in the
15 ; documentation and/or other materials provided with the distribution.
17 ; - Neither the name of the Xiph.org Foundation nor the names of its
18 ; contributors may be used to endorse or promote products derived from
19 ; this software without specific prior written permission.
21 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 ; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
25 ; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
26 ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
27 ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28 ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29 ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30 ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
; Declares the routine defined further down under the same name.
; NOTE(review): cglobal is a macro whose definition is not visible in this
; excerpt; presumably it exports the symbol with the platform's name
; decoration -- confirm against the include file that defines it.
37 cglobal FLAC__fixed_compute_best_predictor_asm_ia32_mmx_cmov
41 ; **********************************************************************
43 ; unsigned FLAC__fixed_compute_best_predictor(const FLAC__int32 *data, unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1])
45 ; FLAC__int32 last_error_0 = data[-1];
46 ; FLAC__int32 last_error_1 = data[-1] - data[-2];
47 ; FLAC__int32 last_error_2 = last_error_1 - (data[-2] - data[-3]);
48 ; FLAC__int32 last_error_3 = last_error_2 - (data[-2] - 2*data[-3] + data[-4]);
49 ; FLAC__int32 error, save;
50 ; FLAC__uint32 total_error_0 = 0, total_error_1 = 0, total_error_2 = 0, total_error_3 = 0, total_error_4 = 0;
53 ; for(i = 0; i < data_len; i++) {
54 ; error = data[i] ; total_error_0 += local_abs(error); save = error;
55 ; error -= last_error_0; total_error_1 += local_abs(error); last_error_0 = save; save = error;
56 ; error -= last_error_1; total_error_2 += local_abs(error); last_error_1 = save; save = error;
57 ; error -= last_error_2; total_error_3 += local_abs(error); last_error_2 = save; save = error;
58 ; error -= last_error_3; total_error_4 += local_abs(error); last_error_3 = save;
61 ; if(total_error_0 < min(min(min(total_error_1, total_error_2), total_error_3), total_error_4))
63 ; else if(total_error_1 < min(min(total_error_2, total_error_3), total_error_4))
65 ; else if(total_error_2 < min(total_error_3, total_error_4))
67 ; else if(total_error_3 < total_error_4)
72 ; residual_bits_per_sample[0] = (FLAC__float)((data_len > 0 && total_error_0 > 0) ? log(M_LN2 * (FLAC__double)total_error_0 / (FLAC__double)data_len) / M_LN2 : 0.0);
73 ; residual_bits_per_sample[1] = (FLAC__float)((data_len > 0 && total_error_1 > 0) ? log(M_LN2 * (FLAC__double)total_error_1 / (FLAC__double)data_len) / M_LN2 : 0.0);
74 ; residual_bits_per_sample[2] = (FLAC__float)((data_len > 0 && total_error_2 > 0) ? log(M_LN2 * (FLAC__double)total_error_2 / (FLAC__double)data_len) / M_LN2 : 0.0);
75 ; residual_bits_per_sample[3] = (FLAC__float)((data_len > 0 && total_error_3 > 0) ? log(M_LN2 * (FLAC__double)total_error_3 / (FLAC__double)data_len) / M_LN2 : 0.0);
76 ; residual_bits_per_sample[4] = (FLAC__float)((data_len > 0 && total_error_4 > 0) ? log(M_LN2 * (FLAC__double)total_error_4 / (FLAC__double)data_len) / M_LN2 : 0.0);
81 cident FLAC__fixed_compute_best_predictor_asm_ia32_mmx_cmov
; Phases visible in this listing:
;   1. seed last_error_0..3 from the four samples preceding data[0]
;   2. per sample: derive the order-0..4 errors and add their absolute
;      values to five running totals held in mm0..mm2
;   3. move the totals to general registers and take their unsigned minimum
;   4. convert each total to a bits-per-sample estimate on the x87 FPU
; Register comments use hi:lo notation for the two dword lanes of an MMX
; register; a blank half (e.g. "error_2:") is a lane whose value is not used.
; NOTE(review): this listing is not contiguous -- labels, the prologue and
; epilogue, the flag-setting test/cmp instructions and the loop control are
; not visible here. Comments below that depend on them are marked as such.
84 ; esp + 40 == data_len
85 ; esp + 44 == residual_bits_per_sample[]
92 ; qword [esp] == temp space for loading FLAC__uint64s to FPU regs
95 ; ecx == loop counter (i)
97 ; mm0 == total_error_1:total_error_0
98 ; mm1 == total_error_2:total_error_3
99 ; mm2 == :total_error_4
100 ; mm3 == last_error_1:last_error_0
101 ; mm4 == last_error_2:last_error_3
; --- phase 1: argument load and predictor seeding ---
103 mov ecx, [esp + 40] ; ecx = data_len
; NOTE(review): the instruction that sets ZF for this jz is not visible in
; this excerpt; presumably a zero test of ecx -- confirm.
105 jz near .data_len_is_0
; The data pointer is taken from [esp + 36]. The four loads below read the
; samples at data[-1]..data[-4], i.e. memory before the pointer passed in.
107 mov ebx, [esp + 36] ; ebx = data[]
108 movd mm3, [ebx - 4] ; mm3 = 0:last_error_0
109 movd mm2, [ebx - 8] ; mm2 = 0:data[-2]
110 movd mm1, [ebx - 12] ; mm1 = 0:data[-3]
111 movd mm0, [ebx - 16] ; mm0 = 0:data[-4]
112 movq mm5, mm3 ; mm5 = 0:last_error_0
113 psubd mm5, mm2 ; mm5 = 0:last_error_1
114 punpckldq mm3, mm5 ; mm3 = last_error_1:last_error_0
115 psubd mm2, mm1 ; mm2 = 0:data[-2] - data[-3]
116 psubd mm5, mm2 ; mm5 = 0:last_error_2
117 movq mm4, mm5 ; mm4 = 0:last_error_2
118 psubd mm4, mm2 ; mm4 = 0:last_error_2 - (data[-2] - data[-3])
119 paddd mm4, mm1 ; mm4 = 0:last_error_2 - (data[-2] - 2 * data[-3])
120 psubd mm4, mm0 ; mm4 = 0:last_error_3
121 punpckldq mm4, mm5 ; mm4 = last_error_2:last_error_3
; The raw samples in mm0..mm2 are no longer needed; clear the registers so
; they can serve as the running totals.
122 pxor mm0, mm0 ; mm0 = total_error_1:total_error_0
123 pxor mm1, mm1 ; mm1 = total_error_2:total_error_3
124 pxor mm2, mm2 ; mm2 = 0:total_error_4
; --- phase 2: per-sample error accumulation ---
; NOTE(review): the loop label, the advance of ebx and the loop branch are
; not visible in this excerpt.
128 movd mm7, [ebx] ; mm7 = 0:error_0
130 movq mm6, mm7 ; mm6 = 0:error_0
131 psubd mm7, mm3 ; mm7 = :error_1
132 punpckldq mm6, mm7 ; mm6 = error_1:error_0
133 movq mm5, mm6 ; mm5 = error_1:error_0
134 movq mm7, mm6 ; mm7 = error_1:error_0
; Subtracting last_error_1:last_error_0 turns the high lane into error_2; the
; low lane is discarded by the punpckhdq further down.
135 psubd mm5, mm3 ; mm5 = error_2:
; The current errors become last_error_1:last_error_0 for the next sample.
136 movq mm3, mm6 ; mm3 = error_1:error_0
; NOTE(review): as listed, mm6 still equals mm7 here, so the subtract alone
; would give zero. The instructions that turn mm6 into a sign mask and apply
; it to mm7 (abs(x) = (x ^ s) - s) are presumably among the lines missing
; from this excerpt -- confirm against the full source.
139 psubd mm7, mm6 ; mm7 = abs(error_1):abs(error_0)
140 paddd mm0, mm7 ; mm0 = total_error_1:total_error_0
141 movq mm6, mm5 ; mm6 = error_2:
142 psubd mm5, mm4 ; mm5 = error_3:
; Take the high lanes of mm5 (error_3) and mm6 (error_2), dropping both
; don't-care low lanes.
143 punpckhdq mm5, mm6 ; mm5 = error_2:error_3
144 movq mm7, mm5 ; mm7 = error_2:error_3
145 movq mm6, mm5 ; mm6 = error_2:error_3
; mm4 still holds the previous sample's last_error_2:last_error_3 here; it is
; only replaced on the following line.
146 psubd mm5, mm4 ; mm5 = :error_4
; The current errors become last_error_2:last_error_3 for the next sample.
147 movq mm4, mm6 ; mm4 = error_2:error_3
; NOTE(review): same abs() pattern as above; the sign-mask instructions are
; not visible in this excerpt.
150 psubd mm7, mm6 ; mm7 = abs(error_2):abs(error_3)
151 paddd mm1, mm7 ; mm1 = total_error_2:total_error_3
152 movq mm6, mm5 ; mm6 = :error_4
; NOTE(review): same abs() pattern; the sign-mask instructions are not
; visible in this excerpt.
155 psubd mm6, mm5 ; mm6 = :abs(error_4)
156 paddd mm2, mm6 ; mm2 = :total_error_4
; --- phase 3: choose the order with the smallest total ---
161 ; if(total_error_0 < min(min(min(total_error_1, total_error_2), total_error_3), total_error_4))
163 ; else if(total_error_1 < min(min(total_error_2, total_error_3), total_error_4))
165 ; else if(total_error_2 < min(total_error_3, total_error_4))
167 ; else if(total_error_3 < total_error_4)
; Move the five totals out of the MMX lanes into general registers. movd
; only transfers a low lane, so the high lanes (total_error_2 and
; total_error_1) are first duplicated downwards with punpckhdq.
171 movq mm3, mm0 ; mm3 = total_error_1:total_error_0
172 movd edi, mm2 ; edi = total_error_4
173 movd esi, mm1 ; esi = total_error_3
174 movd eax, mm0 ; eax = total_error_0
175 punpckhdq mm1, mm1 ; mm1 = total_error_2:total_error_2
176 punpckhdq mm3, mm3 ; mm3 = total_error_1:total_error_1
177 movd edx, mm1 ; edx = total_error_2
; ecx is reused from here on; it no longer holds the loop counter.
178 movd ecx, mm3 ; ecx = total_error_1
; cmovb moves on unsigned-below, matching the unsigned totals of the C
; reference.
; NOTE(review): the compares that set the flags for each cmovb, and the code
; that records the winning order, are not visible in this excerpt.
184 cmovb eax, ecx ; eax = min(total_error_0, total_error_1)
188 cmovb eax, edx ; eax = min(total_error_0, total_error_1, total_error_2)
192 cmovb eax, esi ; eax = min(total_error_0, total_error_1, total_error_2, total_error_3)
196 cmovb eax, edi ; eax = min(total_error_0, total_error_1, total_error_2, total_error_3, total_error_4)
198 movd ebx, mm0 ; ebx = total_error_0
; --- phase 4: bits-per-sample estimates on the x87 FPU ---
; NOTE(review): MMX and x87 share register state, so an emms is required
; between the last MMX instruction above and the first FPU instruction
; below; it is not visible in this excerpt -- confirm in the full source.
201 ; residual_bits_per_sample[0] = (FLAC__float)((data_len > 0 && total_error_0 > 0) ? log(M_LN2 * (FLAC__double)total_error_0 / (FLAC__double)data_len) / M_LN2 : 0.0);
202 ; residual_bits_per_sample[1] = (FLAC__float)((data_len > 0 && total_error_1 > 0) ? log(M_LN2 * (FLAC__double)total_error_1 / (FLAC__double)data_len) / M_LN2 : 0.0);
203 ; residual_bits_per_sample[2] = (FLAC__float)((data_len > 0 && total_error_2 > 0) ? log(M_LN2 * (FLAC__double)total_error_2 / (FLAC__double)data_len) / M_LN2 : 0.0);
204 ; residual_bits_per_sample[3] = (FLAC__float)((data_len > 0 && total_error_3 > 0) ? log(M_LN2 * (FLAC__double)total_error_3 / (FLAC__double)data_len) / M_LN2 : 0.0);
205 ; residual_bits_per_sample[4] = (FLAC__float)((data_len > 0 && total_error_4 > 0) ? log(M_LN2 * (FLAC__double)total_error_4 / (FLAC__double)data_len) / M_LN2 : 0.0);
; fild dword treats its operand as a signed 32-bit integer, which is the
; reason for the size restriction noted on the next line. data_len stays on
; the FPU stack as the divisor for all five estimates and is popped at the
; end of this phase.
207 fild dword [esp + 40] ; ST = data_len (NOTE: assumes data_len is <2gigs)
; Each of the five stanzas below follows one pattern:
;   - skip to the "store 0.0" path when the total is zero
;     (NOTE(review): the zero test that sets ZF for each jz is not visible)
;   - fld1 supplies the 1.0 multiplier that fyl2x (ST1 * log2(ST0)) expects
;   - the total is widened to 64 bits in the stack temp and loaded with
;     fild qword, so a 32-bit unsigned value is not misread as negative
;     (NOTE(review): only the store of the upper dword from eax is visible;
;     eax is presumably 0 and the low-dword store is among the missing
;     lines, as is the reload of ebx with the residual_bits_per_sample
;     pointer -- confirm)
;   - the "store 0.0" path writes the raw bits in eax as the float, which
;     yields 0.0 only if eax == 0
210 jz .total_error_0_is_0
211 fld1 ; ST = 1.0 data_len
213 mov [esp + 4], eax ; [esp] = (FLAC__uint64)total_error_0
215 fild qword [esp] ; ST = total_error_0 1.0 data_len
216 fdiv st2 ; ST = total_error_0/data_len 1.0 data_len
217 fldln2 ; ST = ln2 total_error_0/data_len 1.0 data_len
218 fmulp st1 ; ST = ln2*total_error_0/data_len 1.0 data_len
219 fyl2x ; ST = log2(ln2*total_error_0/data_len) data_len
220 fstp dword [ebx] ; residual_bits_per_sample[0] = log2(ln2*total_error_0/data_len) ST = data_len
224 mov [ebx], eax ; residual_bits_per_sample[0] = 0.0
; Order 1: same pattern, result stored at offset 4.
227 jz .total_error_1_is_0
228 fld1 ; ST = 1.0 data_len
230 mov [esp + 4], eax ; [esp] = (FLAC__uint64)total_error_1
231 fild qword [esp] ; ST = total_error_1 1.0 data_len
232 fdiv st2 ; ST = total_error_1/data_len 1.0 data_len
233 fldln2 ; ST = ln2 total_error_1/data_len 1.0 data_len
234 fmulp st1 ; ST = ln2*total_error_1/data_len 1.0 data_len
235 fyl2x ; ST = log2(ln2*total_error_1/data_len) data_len
236 fstp dword [ebx + 4] ; residual_bits_per_sample[1] = log2(ln2*total_error_1/data_len) ST = data_len
239 mov [ebx + 4], eax ; residual_bits_per_sample[1] = 0.0
; Order 2: same pattern, result stored at offset 8.
242 jz .total_error_2_is_0
243 fld1 ; ST = 1.0 data_len
245 mov [esp + 4], eax ; [esp] = (FLAC__uint64)total_error_2
246 fild qword [esp] ; ST = total_error_2 1.0 data_len
247 fdiv st2 ; ST = total_error_2/data_len 1.0 data_len
248 fldln2 ; ST = ln2 total_error_2/data_len 1.0 data_len
249 fmulp st1 ; ST = ln2*total_error_2/data_len 1.0 data_len
250 fyl2x ; ST = log2(ln2*total_error_2/data_len) data_len
251 fstp dword [ebx + 8] ; residual_bits_per_sample[2] = log2(ln2*total_error_2/data_len) ST = data_len
254 mov [ebx + 8], eax ; residual_bits_per_sample[2] = 0.0
; Order 3: same pattern, result stored at offset 12.
257 jz .total_error_3_is_0
258 fld1 ; ST = 1.0 data_len
260 mov [esp + 4], eax ; [esp] = (FLAC__uint64)total_error_3
261 fild qword [esp] ; ST = total_error_3 1.0 data_len
262 fdiv st2 ; ST = total_error_3/data_len 1.0 data_len
263 fldln2 ; ST = ln2 total_error_3/data_len 1.0 data_len
264 fmulp st1 ; ST = ln2*total_error_3/data_len 1.0 data_len
265 fyl2x ; ST = log2(ln2*total_error_3/data_len) data_len
266 fstp dword [ebx + 12] ; residual_bits_per_sample[3] = log2(ln2*total_error_3/data_len) ST = data_len
269 mov [ebx + 12], eax ; residual_bits_per_sample[3] = 0.0
; Order 4: same pattern, result stored at offset 16.
272 jz .total_error_4_is_0
273 fld1 ; ST = 1.0 data_len
275 mov [esp + 4], eax ; [esp] = (FLAC__uint64)total_error_4
276 fild qword [esp] ; ST = total_error_4 1.0 data_len
277 fdiv st2 ; ST = total_error_4/data_len 1.0 data_len
278 fldln2 ; ST = ln2 total_error_4/data_len 1.0 data_len
279 fmulp st1 ; ST = ln2*total_error_4/data_len 1.0 data_len
280 fyl2x ; ST = log2(ln2*total_error_4/data_len) data_len
281 fstp dword [ebx + 16] ; residual_bits_per_sample[4] = log2(ln2*total_error_4/data_len) ST = data_len
284 mov [ebx + 16], eax ; residual_bits_per_sample[4] = 0.0
; Pop data_len so the FPU stack is left empty.
286 fstp st0 ; ST = [empty]
; --- data_len == 0 path and return ---
; NOTE(review): the label for this path, the instructions that zero ebp and
; write the five 0.0 results, and the register restores / ret are not
; visible in this excerpt.
289 ; data_len == 0, so residual_bits_per_sample[*] = 0.0
297 add ebp, byte 4 ; order = 4
; The chosen order is carried in ebp and returned in eax.
300 mov eax, ebp ; return order
310 %ifdef OBJ_FORMAT_elf
311 section .note.GNU-stack noalloc