src/libFLAC/ia32/fixed_asm.nasm

   1 ;  vim:filetype=nasm ts=8
   2
   3 ;  libFLAC - Free Lossless Audio Codec library
   4 ;  Copyright (C) 2001,2002,2003,2004,2005,2006,2007,2008,2009  Josh Coalson
   5 ;
   6 ;  Redistribution and use in source and binary forms, with or without
   7 ;  modification, are permitted provided that the following conditions
   8 ;  are met:
   9 ;
  10 ;  - Redistributions of source code must retain the above copyright
  11 ;  notice, this list of conditions and the following disclaimer.
  12 ;
  13 ;  - Redistributions in binary form must reproduce the above copyright
  14 ;  notice, this list of conditions and the following disclaimer in the
  15 ;  documentation and/or other materials provided with the distribution.
  16 ;
  17 ;  - Neither the name of the Xiph.org Foundation nor the names of its
  18 ;  contributors may be used to endorse or promote products derived from
  19 ;  this software without specific prior written permission.
  20 ;
  21 ;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  22 ;  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  23 ;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  24 ;  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
  25 ;  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  26 ;  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  27 ;  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  28 ;  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  29 ;  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  30 ;  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  31 ;  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  32
  33 %include "nasm.h"
  34
  35         data_section                            ; macro from nasm.h (not visible here); presumably selects the data section - TODO confirm
  36
  37 cglobal FLAC__fixed_compute_best_predictor_asm_ia32_mmx_cmov   ; nasm.h macro; presumably exports the routine's symbol for the C side - verify in nasm.h
  38
  39         code_section                            ; nasm.h macro; presumably selects the code section for the routine that follows
  40
  41 ; **********************************************************************
  42 ;
  43 ; unsigned FLAC__fixed_compute_best_predictor(const FLAC__int32 *data, unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1])
  44 ; {
  45 ;       FLAC__int32 last_error_0 = data[-1];
  46 ;       FLAC__int32 last_error_1 = data[-1] - data[-2];
  47 ;       FLAC__int32 last_error_2 = last_error_1 - (data[-2] - data[-3]);
  48 ;       FLAC__int32 last_error_3 = last_error_2 - (data[-2] - 2*data[-3] + data[-4]);
  49 ;       FLAC__int32 error, save;
  50 ;       FLAC__uint32 total_error_0 = 0, total_error_1 = 0, total_error_2 = 0, total_error_3 = 0, total_error_4 = 0;
  51 ;       unsigned i, order;
  52 ;
  53 ;       for(i = 0; i < data_len; i++) {
  54 ;               error  = data[i]     ; total_error_0 += local_abs(error);                      save = error;
  55 ;               error -= last_error_0; total_error_1 += local_abs(error); last_error_0 = save; save = error;
  56 ;               error -= last_error_1; total_error_2 += local_abs(error); last_error_1 = save; save = error;
  57 ;               error -= last_error_2; total_error_3 += local_abs(error); last_error_2 = save; save = error;
  58 ;               error -= last_error_3; total_error_4 += local_abs(error); last_error_3 = save;
  59 ;       }
  60 ;
  61 ;       if(total_error_0 < min(min(min(total_error_1, total_error_2), total_error_3), total_error_4))
  62 ;               order = 0;
  63 ;       else if(total_error_1 < min(min(total_error_2, total_error_3), total_error_4))
  64 ;               order = 1;
  65 ;       else if(total_error_2 < min(total_error_3, total_error_4))
  66 ;               order = 2;
  67 ;       else if(total_error_3 < total_error_4)
  68 ;               order = 3;
  69 ;       else
  70 ;               order = 4;
  71 ;
  72 ;       residual_bits_per_sample[0] = (FLAC__float)((data_len > 0 && total_error_0 > 0) ? log(M_LN2 * (FLAC__double)total_error_0 / (FLAC__double)data_len) / M_LN2 : 0.0);
  73 ;       residual_bits_per_sample[1] = (FLAC__float)((data_len > 0 && total_error_1 > 0) ? log(M_LN2 * (FLAC__double)total_error_1 / (FLAC__double)data_len) / M_LN2 : 0.0);
  74 ;       residual_bits_per_sample[2] = (FLAC__float)((data_len > 0 && total_error_2 > 0) ? log(M_LN2 * (FLAC__double)total_error_2 / (FLAC__double)data_len) / M_LN2 : 0.0);
  75 ;       residual_bits_per_sample[3] = (FLAC__float)((data_len > 0 && total_error_3 > 0) ? log(M_LN2 * (FLAC__double)total_error_3 / (FLAC__double)data_len) / M_LN2 : 0.0);
  76 ;       residual_bits_per_sample[4] = (FLAC__float)((data_len > 0 && total_error_4 > 0) ? log(M_LN2 * (FLAC__double)total_error_4 / (FLAC__double)data_len) / M_LN2 : 0.0);
  77 ;
  78 ;       return order;
  79 ; }
  80         ALIGN 16
  81 cident FLAC__fixed_compute_best_predictor_asm_ia32_mmx_cmov
  82         ; Sums |error| for fixed-predictor orders 0..4 over data[], stores five bits-per-sample floats, and returns the best order in eax.
  83         ; esp + 36 == data[]
  84         ; esp + 40 == data_len
  85         ; esp + 44 == residual_bits_per_sample[]
  86         ; (these offsets hold after the prologue below: 4-byte return address + 16 bytes pushed + 16 bytes scratch = 36)
  87         push    ebp                             ; ebp, ebx, esi and edi are saved here and restored after .end
  88         push    ebx
  89         push    esi
  90         push    edi
  91         sub     esp, byte 16                    ; scratch space; only the first 8 bytes are written in this routine
  92         ; qword [esp] == temp space for loading FLAC__uint64s to FPU regs
  93
  94         ; ebx == &data[i]
  95         ; ecx == loop counter (counts down from data_len to 0)
  96         ; ebp == order
  97         ; mm0 == total_error_1:total_error_0   (notation is high dword:low dword)
  98         ; mm1 == total_error_2:total_error_3
  99         ; mm2 == :total_error_4
 100         ; mm3 == last_error_1:last_error_0
 101         ; mm4 == last_error_2:last_error_3
 102         ; (an empty half, as in ":total_error_4", is a don't-care dword)
 103         mov     ecx, [esp + 40]                 ; ecx = data_len
 104         test    ecx, ecx
 105         jz      near .data_len_is_0             ; no samples: zero all five outputs and return order 4
 106         ; warm-up: derive last_error_0..3 from data[-1]..data[-4], the four samples stored before data[0]
 107         mov     ebx, [esp + 36]                 ; ebx = data[]
 108         movd    mm3, [ebx - 4]                  ; mm3 = 0:last_error_0
 109         movd    mm2, [ebx - 8]                  ; mm2 = 0:data[-2]
 110         movd    mm1, [ebx - 12]                 ; mm1 = 0:data[-3]
 111         movd    mm0, [ebx - 16]                 ; mm0 = 0:data[-4]
 112         movq    mm5, mm3                        ; mm5 = 0:last_error_0
 113         psubd   mm5, mm2                        ; mm5 = 0:last_error_1
 114         punpckldq       mm3, mm5                ; mm3 = last_error_1:last_error_0
 115         psubd   mm2, mm1                        ; mm2 = 0:data[-2] - data[-3]
 116         psubd   mm5, mm2                        ; mm5 = 0:last_error_2
 117         movq    mm4, mm5                        ; mm4 = 0:last_error_2
 118         psubd   mm4, mm2                        ; mm4 = 0:last_error_2 - (data[-2] - data[-3])
 119         paddd   mm4, mm1                        ; mm4 = 0:last_error_2 - (data[-2] - 2 * data[-3])
 120         psubd   mm4, mm0                        ; mm4 = 0:last_error_3
 121         punpckldq       mm4, mm5                ; mm4 = last_error_2:last_error_3
 122         pxor    mm0, mm0                        ; mm0 = total_error_1:total_error_0
 123         pxor    mm1, mm1                        ; mm1 = total_error_2:total_error_3
 124         pxor    mm2, mm2                        ; mm2 = 0:total_error_4
 125
 126         ALIGN 16
 127 .loop:
 128         movd    mm7, [ebx]                      ; mm7 = 0:error_0
 129         add     ebx, byte 4                     ; advance to the next sample
 130         movq    mm6, mm7                        ; mm6 = 0:error_0
 131         psubd   mm7, mm3                        ; mm7 = :error_1
 132         punpckldq       mm6, mm7                ; mm6 = error_1:error_0
 133         movq    mm5, mm6                        ; mm5 = error_1:error_0
 134         movq    mm7, mm6                        ; mm7 = error_1:error_0
 135         psubd   mm5, mm3                        ; mm5 = error_2:
 136         movq    mm3, mm6                        ; mm3 = error_1:error_0 (becomes last_error_1:last_error_0 for the next sample)
 137         psrad   mm6, 31                         ; mm6 = per-dword sign mask (0 or -1)
 138         pxor    mm7, mm6                        ; with the psubd below: (x ^ mask) - mask == abs(x)
 139         psubd   mm7, mm6                        ; mm7 = abs(error_1):abs(error_0)
 140         paddd   mm0, mm7                        ; mm0 = total_error_1:total_error_0 (32-bit sums, wrap modulo 2^32)
 141         movq    mm6, mm5                        ; mm6 = error_2:
 142         psubd   mm5, mm4                        ; mm5 = error_3:
 143         punpckhdq       mm5, mm6                ; mm5 = error_2:error_3
 144         movq    mm7, mm5                        ; mm7 = error_2:error_3
 145         movq    mm6, mm5                        ; mm6 = error_2:error_3
 146         psubd   mm5, mm4                        ; mm5 = :error_4
 147         movq    mm4, mm6                        ; mm4 = error_2:error_3 (becomes last_error_2:last_error_3 for the next sample)
 148         psrad   mm6, 31                         ; same sign-mask abs() sequence as above
 149         pxor    mm7, mm6
 150         psubd   mm7, mm6                        ; mm7 = abs(error_2):abs(error_3)
 151         paddd   mm1, mm7                        ; mm1 = total_error_2:total_error_3
 152         movq    mm6, mm5                        ; mm6 = :error_4
 153         psrad   mm5, 31
 154         pxor    mm6, mm5
 155         psubd   mm6, mm5                        ; mm6 = :abs(error_4)
 156         paddd   mm2, mm6                        ; mm2 = :total_error_4
 157
 158         dec     ecx                             ; one sample consumed
 159         jnz     short .loop                     ; repeat until all data_len samples are processed
 160
 161 ;       if(total_error_0 < min(min(min(total_error_1, total_error_2), total_error_3), total_error_4))
 162 ;               order = 0;
 163 ;       else if(total_error_1 < min(min(total_error_2, total_error_3), total_error_4))
 164 ;               order = 1;
 165 ;       else if(total_error_2 < min(total_error_3, total_error_4))
 166 ;               order = 2;
 167 ;       else if(total_error_3 < total_error_4)
 168 ;               order = 3;
 169 ;       else
 170 ;               order = 4;
 171         movq    mm3, mm0                        ; mm3 = total_error_1:total_error_0
 172         movd    edi, mm2                        ; edi = total_error_4
 173         movd    esi, mm1                        ; esi = total_error_3
 174         movd    eax, mm0                        ; eax = total_error_0
 175         punpckhdq       mm1, mm1                ; mm1 = total_error_2:total_error_2
 176         punpckhdq       mm3, mm3                ; mm3 = total_error_1:total_error_1
 177         movd    edx, mm1                        ; edx = total_error_2
 178         movd    ecx, mm3                        ; ecx = total_error_1
 179         ; ebx = candidate order, ebp = best order so far; compares are unsigned and a tie selects the higher order
 180         xor     ebx, ebx
 181         xor     ebp, ebp                        ; order = 0
 182         inc     ebx                             ; ebx = 1
 183         cmp     ecx, eax
 184         cmovb   eax, ecx                        ; eax = min(total_error_0, total_error_1)
 185         cmovbe  ebp, ebx                        ; order = 1 if total_error_1 <= total_error_0 (flags still come from the cmp)
 186         inc     ebx                             ; ebx = 2
 187         cmp     edx, eax
 188         cmovb   eax, edx                        ; eax = min(total_error_0, total_error_1, total_error_2)
 189         cmovbe  ebp, ebx                        ; order = 2 if total_error_2 <= running minimum
 190         inc     ebx                             ; ebx = 3
 191         cmp     esi, eax
 192         cmovb   eax, esi                        ; eax = min(total_error_0, total_error_1, total_error_2, total_error_3)
 193         cmovbe  ebp, ebx                        ; order = 3 if total_error_3 <= running minimum
 194         inc     ebx                             ; ebx = 4
 195         cmp     edi, eax
 196         cmovb   eax, edi                        ; eax = min(total_error_0, total_error_1, total_error_2, total_error_3, total_error_4)
 197         cmovbe  ebp, ebx                        ; order = 4 if total_error_4 <= running minimum
 198         movd    ebx, mm0                        ; ebx = total_error_0 (reloaded: eax may have been overwritten by the min chain)
 199         emms                                    ; leave MMX state so the x87 instructions below can be used
 200
 201         ;       residual_bits_per_sample[0] = (FLAC__float)((data_len > 0 && total_error_0 > 0) ? log(M_LN2 * (FLAC__double)total_error_0 / (FLAC__double)data_len) / M_LN2 : 0.0);
 202         ;       residual_bits_per_sample[1] = (FLAC__float)((data_len > 0 && total_error_1 > 0) ? log(M_LN2 * (FLAC__double)total_error_1 / (FLAC__double)data_len) / M_LN2 : 0.0);
 203         ;       residual_bits_per_sample[2] = (FLAC__float)((data_len > 0 && total_error_2 > 0) ? log(M_LN2 * (FLAC__double)total_error_2 / (FLAC__double)data_len) / M_LN2 : 0.0);
 204         ;       residual_bits_per_sample[3] = (FLAC__float)((data_len > 0 && total_error_3 > 0) ? log(M_LN2 * (FLAC__double)total_error_3 / (FLAC__double)data_len) / M_LN2 : 0.0);
 205         ;       residual_bits_per_sample[4] = (FLAC__float)((data_len > 0 && total_error_4 > 0) ? log(M_LN2 * (FLAC__double)total_error_4 / (FLAC__double)data_len) / M_LN2 : 0.0);
 206         xor     eax, eax                        ; eax = 0: high dword of the 64-bit temp, and the bit pattern stored for 0.0
 207         fild    dword [esp + 40]                ; ST = data_len (NOTE: assumes data_len is <2gigs)
 208 .rbps_0:
 209         test    ebx, ebx
 210         jz      .total_error_0_is_0             ; a zero total stores 0.0 instead of taking a log
 211         fld1                                    ; ST = 1.0 data_len
 212         mov     [esp], ebx
 213         mov     [esp + 4], eax                  ; [esp] = (FLAC__uint64)total_error_0
 214         mov     ebx, [esp + 44]                 ; ebx = residual_bits_per_sample[] (total_error_0 is already in the temp)
 215         fild    qword [esp]                     ; ST = total_error_0 1.0 data_len
 216         fdiv    st2                             ; ST = total_error_0/data_len 1.0 data_len
 217         fldln2                                  ; ST = ln2 total_error_0/data_len 1.0 data_len
 218         fmulp   st1                             ; ST = ln2*total_error_0/data_len 1.0 data_len
 219         fyl2x                                   ; ST = log2(ln2*total_error_0/data_len) data_len   (fyl2x = ST1*log2(ST0), ST1 is the 1.0)
 220         fstp    dword [ebx]                     ; residual_bits_per_sample[0] = log2(ln2*total_error_0/data_len)   ST = data_len
 221         jmp     short .rbps_1
 222 .total_error_0_is_0:
 223         mov     ebx, [esp + 44]                 ; ebx = residual_bits_per_sample[]
 224         mov     [ebx], eax                      ; residual_bits_per_sample[0] = 0.0
 225 .rbps_1:
 226         test    ecx, ecx
 227         jz      .total_error_1_is_0
 228         fld1                                    ; ST = 1.0 data_len
 229         mov     [esp], ecx
 230         mov     [esp + 4], eax                  ; [esp] = (FLAC__uint64)total_error_1
 231         fild    qword [esp]                     ; ST = total_error_1 1.0 data_len
 232         fdiv    st2                             ; ST = total_error_1/data_len 1.0 data_len
 233         fldln2                                  ; ST = ln2 total_error_1/data_len 1.0 data_len
 234         fmulp   st1                             ; ST = ln2*total_error_1/data_len 1.0 data_len
 235         fyl2x                                   ; ST = log2(ln2*total_error_1/data_len) data_len
 236         fstp    dword [ebx + 4]                 ; residual_bits_per_sample[1] = log2(ln2*total_error_1/data_len)   ST = data_len
 237         jmp     short .rbps_2
 238 .total_error_1_is_0:
 239         mov     [ebx + 4], eax                  ; residual_bits_per_sample[1] = 0.0
 240 .rbps_2:
 241         test    edx, edx
 242         jz      .total_error_2_is_0
 243         fld1                                    ; ST = 1.0 data_len
 244         mov     [esp], edx
 245         mov     [esp + 4], eax                  ; [esp] = (FLAC__uint64)total_error_2
 246         fild    qword [esp]                     ; ST = total_error_2 1.0 data_len
 247         fdiv    st2                             ; ST = total_error_2/data_len 1.0 data_len
 248         fldln2                                  ; ST = ln2 total_error_2/data_len 1.0 data_len
 249         fmulp   st1                             ; ST = ln2*total_error_2/data_len 1.0 data_len
 250         fyl2x                                   ; ST = log2(ln2*total_error_2/data_len) data_len
 251         fstp    dword [ebx + 8]                 ; residual_bits_per_sample[2] = log2(ln2*total_error_2/data_len)   ST = data_len
 252         jmp     short .rbps_3
 253 .total_error_2_is_0:
 254         mov     [ebx + 8], eax                  ; residual_bits_per_sample[2] = 0.0
 255 .rbps_3:
 256         test    esi, esi
 257         jz      .total_error_3_is_0
 258         fld1                                    ; ST = 1.0 data_len
 259         mov     [esp], esi
 260         mov     [esp + 4], eax                  ; [esp] = (FLAC__uint64)total_error_3
 261         fild    qword [esp]                     ; ST = total_error_3 1.0 data_len
 262         fdiv    st2                             ; ST = total_error_3/data_len 1.0 data_len
 263         fldln2                                  ; ST = ln2 total_error_3/data_len 1.0 data_len
 264         fmulp   st1                             ; ST = ln2*total_error_3/data_len 1.0 data_len
 265         fyl2x                                   ; ST = log2(ln2*total_error_3/data_len) data_len
 266         fstp    dword [ebx + 12]                ; residual_bits_per_sample[3] = log2(ln2*total_error_3/data_len)   ST = data_len
 267         jmp     short .rbps_4
 268 .total_error_3_is_0:
 269         mov     [ebx + 12], eax                 ; residual_bits_per_sample[3] = 0.0
 270 .rbps_4:
 271         test    edi, edi
 272         jz      .total_error_4_is_0
 273         fld1                                    ; ST = 1.0 data_len
 274         mov     [esp], edi
 275         mov     [esp + 4], eax                  ; [esp] = (FLAC__uint64)total_error_4
 276         fild    qword [esp]                     ; ST = total_error_4 1.0 data_len
 277         fdiv    st2                             ; ST = total_error_4/data_len 1.0 data_len
 278         fldln2                                  ; ST = ln2 total_error_4/data_len 1.0 data_len
 279         fmulp   st1                             ; ST = ln2*total_error_4/data_len 1.0 data_len
 280         fyl2x                                   ; ST = log2(ln2*total_error_4/data_len) data_len
 281         fstp    dword [ebx + 16]                ; residual_bits_per_sample[4] = log2(ln2*total_error_4/data_len)   ST = data_len
 282         jmp     short .rbps_end
 283 .total_error_4_is_0:
 284         mov     [ebx + 16], eax                 ; residual_bits_per_sample[4] = 0.0
 285 .rbps_end:
 286         fstp    st0                             ; ST = [empty]
 287         jmp     short .end
 288 .data_len_is_0:
 289         ; data_len == 0, so residual_bits_per_sample[*] = 0.0
 290         xor     ebp, ebp                        ; ebp = 0, stored below as the 0.0 value
 291         mov     edi, [esp + 44]                 ; edi = residual_bits_per_sample[]
 292         mov     [edi], ebp
 293         mov     [edi + 4], ebp
 294         mov     [edi + 8], ebp
 295         mov     [edi + 12], ebp
 296         mov     [edi + 16], ebp
 297         add     ebp, byte 4                     ; order = 4
 298
 299 .end:
 300         mov     eax, ebp                        ; return order
 301         add     esp, byte 16                    ; release the scratch space
 302         pop     edi                             ; restore the registers saved in the prologue
 303         pop     esi
 304         pop     ebx
 305         pop     ebp
 306         ret
 307
 308 end