src/libFLAC/ia32/lpc_asm.nasm

   1 ;  libFLAC - Free Lossless Audio Codec library
   2 ;  Copyright (C) 2001,2002,2003,2004,2005,2006  Josh Coalson
   3 ;
   4 ;  Redistribution and use in source and binary forms, with or without
   5 ;  modification, are permitted provided that the following conditions
   6 ;  are met:
   7 ;
   8 ;  - Redistributions of source code must retain the above copyright
   9 ;  notice, this list of conditions and the following disclaimer.
  10 ;
  11 ;  - Redistributions in binary form must reproduce the above copyright
  12 ;  notice, this list of conditions and the following disclaimer in the
  13 ;  documentation and/or other materials provided with the distribution.
  14 ;
  15 ;  - Neither the name of the Xiph.org Foundation nor the names of its
  16 ;  contributors may be used to endorse or promote products derived from
  17 ;  this software without specific prior written permission.
  18 ;
  19 ;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  20 ;  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  21 ;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  22 ;  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
  23 ;  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  24 ;  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  25 ;  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  26 ;  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  27 ;  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  28 ;  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  29 ;  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  30
  31 %include "nasm.h"
  32
  33         data_section
  34         ; data_section / cglobal / code_section are presumably macros supplied by nasm.h -- TODO confirm
  35 cglobal FLAC__lpc_compute_autocorrelation_asm_ia32
  36 cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4
  37 cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8
  38 cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12
  39 cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow
  40 cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32
  41 cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
  42 cglobal FLAC__lpc_restore_signal_asm_ia32
  43 cglobal FLAC__lpc_restore_signal_asm_ia32_mmx
  44         ; each name listed above is expected to be defined further down with a matching cident line
  45         code_section
  46
  47 ; **********************************************************************
  48 ;
  49 ; void FLAC__lpc_compute_autocorrelation_asm(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
  50 ; {
  51 ;       FLAC__real d;
  52 ;       unsigned sample, coeff;
  53 ;       const unsigned limit = data_len - lag;
  54 ;
  55 ;       FLAC__ASSERT(lag > 0);
  56 ;       FLAC__ASSERT(lag <= data_len);
  57 ;
  58 ;       for(coeff = 0; coeff < lag; coeff++)
  59 ;               autoc[coeff] = 0.0;
  60 ;       for(sample = 0; sample <= limit; sample++) {
  61 ;               d = data[sample];
  62 ;               for(coeff = 0; coeff < lag; coeff++)
  63 ;                       autoc[coeff] += d * data[sample+coeff];
  64 ;       }
  65 ;       for(; sample < data_len; sample++) {
  66 ;               d = data[sample];
  67 ;               for(coeff = 0; coeff < data_len - sample; coeff++)
  68 ;                       autoc[coeff] += d * data[sample+coeff];
  69 ;       }
  70 ; }
  71 ;
  72         ALIGN 16
  73 cident FLAC__lpc_compute_autocorrelation_asm_ia32
  74         ;[esp + 28] == autoc[]
  75         ;[esp + 24] == lag
  76         ;[esp + 20] == data_len
  77         ;[esp + 16] == data[]
  78         ; NOTE: the offsets above are as seen after the three pushes in .begin (return address + 3 saved registers = 16 bytes)
  79         ;ASSERT(lag > 0)
  80         ;ASSERT(lag <= 33)
  81         ;ASSERT(lag <= data_len)
  82         ; x87 version: two fully unrolled multiply-accumulate ladders, each entered through a computed jump
  83 .begin:
  84         push    esi                             ; saved here, restored at .end
  85         push    edi                             ; saved here, restored at .end
  86         push    ebx                             ; saved here, restored at .end
  87
  88         ;       for(coeff = 0; coeff < lag; coeff++)
  89         ;               autoc[coeff] = 0.0;
  90         mov     edi, [esp + 28]                 ; edi == autoc
  91         mov     ecx, [esp + 24]                 ; ecx = # of dwords (=lag) of 0 to write
  92         xor     eax, eax                        ; eax = 0, the all-zero dword that is stored as 0.0
  93         rep     stosd                           ; write ecx zero dwords at [edi]; edi is moved by the store, so it is reloaded below
  94
  95         ;       const unsigned limit = data_len - lag;
  96         mov     eax, [esp + 24]                 ; eax == lag
  97         mov     ecx, [esp + 20]                 ; ecx == data_len
  98         sub     ecx, eax                        ; ecx == limit
  99
 100         mov     edi, [esp + 28]                 ; edi == autoc
 101         mov     esi, [esp + 16]                 ; esi == data
 102         inc     ecx                             ; we are looping <= limit so we add one to the counter
 103
 104         ;       for(sample = 0; sample <= limit; sample++) {
 105         ;               d = data[sample];
 106         ;               for(coeff = 0; coeff < lag; coeff++)
 107         ;                       autoc[coeff] += d * data[sample+coeff];
 108         ;       }
 109         fld     dword [esi]                     ; ST = d <- data[sample]
 110         ; each unrolled step below (fld/fmul/fadd/fstp) is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax)
 111         lea     edx, [eax + eax*2]              ; edx = 3*lag
 112         neg     edx                             ; edx = -3*lag
 113         lea     edx, [eax + edx*4 + .jumper1_0 - .get_eip1]     ; edx = -11*lag + (.jumper1_0 - .get_eip1)
 114         call    .get_eip1                       ; pushes the run-time address of .get_eip1 (no absolute address needed)
 115 .get_eip1:
 116         pop     ebx                             ; ebx = run-time address of .get_eip1
 117         add     edx, ebx                        ; edx = .jumper1_0 - 11*lag
 118         inc     edx                             ; compensate for the shorter opcode on the last iteration
 119         inc     edx                             ; compensate for the shorter opcode on the last iteration
 120         inc     edx                             ; compensate for the shorter opcode on the last iteration
 121         cmp     eax, 33                         ; only lag == 33 enters at the coeff 32 step, whose operands are encoded longer
 122         jne     .loop1_start
 123         sub     edx, byte 9                     ; compensate for the longer opcodes on the first iteration
 124 .loop1_start:
 125         jmp     edx                             ; enter the ladder at the step for coeff == lag-1
 126         ; unrolled inner loop, highest coefficient first; every step falls through to the next, ending at .jumper1_0
 127         fld     st0                             ; ST = d d
 128         fmul    dword [esi + (32*4)]            ; ST = d*data[sample+32] d              WATCHOUT: not a byte displacement here!
 129         fadd    dword [edi + (32*4)]            ; ST = autoc[32]+d*data[sample+32] d    WATCHOUT: not a byte displacement here!
 130         fstp    dword [edi + (32*4)]            ; autoc[32]+=d*data[sample+32]  ST = d  WATCHOUT: not a byte displacement here!
 131         fld     st0                             ; ST = d d
 132         fmul    dword [esi + (31*4)]            ; ST = d*data[sample+31] d
 133         fadd    dword [edi + (31*4)]            ; ST = autoc[31]+d*data[sample+31] d
 134         fstp    dword [edi + (31*4)]            ; autoc[31]+=d*data[sample+31]  ST = d
 135         fld     st0                             ; ST = d d
 136         fmul    dword [esi + (30*4)]            ; ST = d*data[sample+30] d
 137         fadd    dword [edi + (30*4)]            ; ST = autoc[30]+d*data[sample+30] d
 138         fstp    dword [edi + (30*4)]            ; autoc[30]+=d*data[sample+30]  ST = d
 139         fld     st0                             ; ST = d d
 140         fmul    dword [esi + (29*4)]            ; ST = d*data[sample+29] d
 141         fadd    dword [edi + (29*4)]            ; ST = autoc[29]+d*data[sample+29] d
 142         fstp    dword [edi + (29*4)]            ; autoc[29]+=d*data[sample+29]  ST = d
 143         fld     st0                             ; ST = d d
 144         fmul    dword [esi + (28*4)]            ; ST = d*data[sample+28] d
 145         fadd    dword [edi + (28*4)]            ; ST = autoc[28]+d*data[sample+28] d
 146         fstp    dword [edi + (28*4)]            ; autoc[28]+=d*data[sample+28]  ST = d
 147         fld     st0                             ; ST = d d
 148         fmul    dword [esi + (27*4)]            ; ST = d*data[sample+27] d
 149         fadd    dword [edi + (27*4)]            ; ST = autoc[27]+d*data[sample+27] d
 150         fstp    dword [edi + (27*4)]            ; autoc[27]+=d*data[sample+27]  ST = d
 151         fld     st0                             ; ST = d d
 152         fmul    dword [esi + (26*4)]            ; ST = d*data[sample+26] d
 153         fadd    dword [edi + (26*4)]            ; ST = autoc[26]+d*data[sample+26] d
 154         fstp    dword [edi + (26*4)]            ; autoc[26]+=d*data[sample+26]  ST = d
 155         fld     st0                             ; ST = d d
 156         fmul    dword [esi + (25*4)]            ; ST = d*data[sample+25] d
 157         fadd    dword [edi + (25*4)]            ; ST = autoc[25]+d*data[sample+25] d
 158         fstp    dword [edi + (25*4)]            ; autoc[25]+=d*data[sample+25]  ST = d
 159         fld     st0                             ; ST = d d
 160         fmul    dword [esi + (24*4)]            ; ST = d*data[sample+24] d
 161         fadd    dword [edi + (24*4)]            ; ST = autoc[24]+d*data[sample+24] d
 162         fstp    dword [edi + (24*4)]            ; autoc[24]+=d*data[sample+24]  ST = d
 163         fld     st0                             ; ST = d d
 164         fmul    dword [esi + (23*4)]            ; ST = d*data[sample+23] d
 165         fadd    dword [edi + (23*4)]            ; ST = autoc[23]+d*data[sample+23] d
 166         fstp    dword [edi + (23*4)]            ; autoc[23]+=d*data[sample+23]  ST = d
 167         fld     st0                             ; ST = d d
 168         fmul    dword [esi + (22*4)]            ; ST = d*data[sample+22] d
 169         fadd    dword [edi + (22*4)]            ; ST = autoc[22]+d*data[sample+22] d
 170         fstp    dword [edi + (22*4)]            ; autoc[22]+=d*data[sample+22]  ST = d
 171         fld     st0                             ; ST = d d
 172         fmul    dword [esi + (21*4)]            ; ST = d*data[sample+21] d
 173         fadd    dword [edi + (21*4)]            ; ST = autoc[21]+d*data[sample+21] d
 174         fstp    dword [edi + (21*4)]            ; autoc[21]+=d*data[sample+21]  ST = d
 175         fld     st0                             ; ST = d d
 176         fmul    dword [esi + (20*4)]            ; ST = d*data[sample+20] d
 177         fadd    dword [edi + (20*4)]            ; ST = autoc[20]+d*data[sample+20] d
 178         fstp    dword [edi + (20*4)]            ; autoc[20]+=d*data[sample+20]  ST = d
 179         fld     st0                             ; ST = d d
 180         fmul    dword [esi + (19*4)]            ; ST = d*data[sample+19] d
 181         fadd    dword [edi + (19*4)]            ; ST = autoc[19]+d*data[sample+19] d
 182         fstp    dword [edi + (19*4)]            ; autoc[19]+=d*data[sample+19]  ST = d
 183         fld     st0                             ; ST = d d
 184         fmul    dword [esi + (18*4)]            ; ST = d*data[sample+18] d
 185         fadd    dword [edi + (18*4)]            ; ST = autoc[18]+d*data[sample+18] d
 186         fstp    dword [edi + (18*4)]            ; autoc[18]+=d*data[sample+18]  ST = d
 187         fld     st0                             ; ST = d d
 188         fmul    dword [esi + (17*4)]            ; ST = d*data[sample+17] d
 189         fadd    dword [edi + (17*4)]            ; ST = autoc[17]+d*data[sample+17] d
 190         fstp    dword [edi + (17*4)]            ; autoc[17]+=d*data[sample+17]  ST = d
 191         fld     st0                             ; ST = d d
 192         fmul    dword [esi + (16*4)]            ; ST = d*data[sample+16] d
 193         fadd    dword [edi + (16*4)]            ; ST = autoc[16]+d*data[sample+16] d
 194         fstp    dword [edi + (16*4)]            ; autoc[16]+=d*data[sample+16]  ST = d
 195         fld     st0                             ; ST = d d
 196         fmul    dword [esi + (15*4)]            ; ST = d*data[sample+15] d
 197         fadd    dword [edi + (15*4)]            ; ST = autoc[15]+d*data[sample+15] d
 198         fstp    dword [edi + (15*4)]            ; autoc[15]+=d*data[sample+15]  ST = d
 199         fld     st0                             ; ST = d d
 200         fmul    dword [esi + (14*4)]            ; ST = d*data[sample+14] d
 201         fadd    dword [edi + (14*4)]            ; ST = autoc[14]+d*data[sample+14] d
 202         fstp    dword [edi + (14*4)]            ; autoc[14]+=d*data[sample+14]  ST = d
 203         fld     st0                             ; ST = d d
 204         fmul    dword [esi + (13*4)]            ; ST = d*data[sample+13] d
 205         fadd    dword [edi + (13*4)]            ; ST = autoc[13]+d*data[sample+13] d
 206         fstp    dword [edi + (13*4)]            ; autoc[13]+=d*data[sample+13]  ST = d
 207         fld     st0                             ; ST = d d
 208         fmul    dword [esi + (12*4)]            ; ST = d*data[sample+12] d
 209         fadd    dword [edi + (12*4)]            ; ST = autoc[12]+d*data[sample+12] d
 210         fstp    dword [edi + (12*4)]            ; autoc[12]+=d*data[sample+12]  ST = d
 211         fld     st0                             ; ST = d d
 212         fmul    dword [esi + (11*4)]            ; ST = d*data[sample+11] d
 213         fadd    dword [edi + (11*4)]            ; ST = autoc[11]+d*data[sample+11] d
 214         fstp    dword [edi + (11*4)]            ; autoc[11]+=d*data[sample+11]  ST = d
 215         fld     st0                             ; ST = d d
 216         fmul    dword [esi + (10*4)]            ; ST = d*data[sample+10] d
 217         fadd    dword [edi + (10*4)]            ; ST = autoc[10]+d*data[sample+10] d
 218         fstp    dword [edi + (10*4)]            ; autoc[10]+=d*data[sample+10]  ST = d
 219         fld     st0                             ; ST = d d
 220         fmul    dword [esi + ( 9*4)]            ; ST = d*data[sample+9] d
 221         fadd    dword [edi + ( 9*4)]            ; ST = autoc[9]+d*data[sample+9] d
 222         fstp    dword [edi + ( 9*4)]            ; autoc[9]+=d*data[sample+9]  ST = d
 223         fld     st0                             ; ST = d d
 224         fmul    dword [esi + ( 8*4)]            ; ST = d*data[sample+8] d
 225         fadd    dword [edi + ( 8*4)]            ; ST = autoc[8]+d*data[sample+8] d
 226         fstp    dword [edi + ( 8*4)]            ; autoc[8]+=d*data[sample+8]  ST = d
 227         fld     st0                             ; ST = d d
 228         fmul    dword [esi + ( 7*4)]            ; ST = d*data[sample+7] d
 229         fadd    dword [edi + ( 7*4)]            ; ST = autoc[7]+d*data[sample+7] d
 230         fstp    dword [edi + ( 7*4)]            ; autoc[7]+=d*data[sample+7]  ST = d
 231         fld     st0                             ; ST = d d
 232         fmul    dword [esi + ( 6*4)]            ; ST = d*data[sample+6] d
 233         fadd    dword [edi + ( 6*4)]            ; ST = autoc[6]+d*data[sample+6] d
 234         fstp    dword [edi + ( 6*4)]            ; autoc[6]+=d*data[sample+6]  ST = d
 235         fld     st0                             ; ST = d d
 236         fmul    dword [esi + ( 5*4)]            ; ST = d*data[sample+5] d
 237         fadd    dword [edi + ( 5*4)]            ; ST = autoc[5]+d*data[sample+5] d
 238         fstp    dword [edi + ( 5*4)]            ; autoc[5]+=d*data[sample+5]  ST = d
 239         fld     st0                             ; ST = d d
 240         fmul    dword [esi + ( 4*4)]            ; ST = d*data[sample+4] d
 241         fadd    dword [edi + ( 4*4)]            ; ST = autoc[4]+d*data[sample+4] d
 242         fstp    dword [edi + ( 4*4)]            ; autoc[4]+=d*data[sample+4]  ST = d
 243         fld     st0                             ; ST = d d
 244         fmul    dword [esi + ( 3*4)]            ; ST = d*data[sample+3] d
 245         fadd    dword [edi + ( 3*4)]            ; ST = autoc[3]+d*data[sample+3] d
 246         fstp    dword [edi + ( 3*4)]            ; autoc[3]+=d*data[sample+3]  ST = d
 247         fld     st0                             ; ST = d d
 248         fmul    dword [esi + ( 2*4)]            ; ST = d*data[sample+2] d
 249         fadd    dword [edi + ( 2*4)]            ; ST = autoc[2]+d*data[sample+2] d
 250         fstp    dword [edi + ( 2*4)]            ; autoc[2]+=d*data[sample+2]  ST = d
 251         fld     st0                             ; ST = d d
 252         fmul    dword [esi + ( 1*4)]            ; ST = d*data[sample+1] d
 253         fadd    dword [edi + ( 1*4)]            ; ST = autoc[1]+d*data[sample+1] d
 254         fstp    dword [edi + ( 1*4)]            ; autoc[1]+=d*data[sample+1]  ST = d
 255         fld     st0                             ; ST = d d
 256         fmul    dword [esi]                     ; ST = d*data[sample] d                 WATCHOUT: no displacement byte here!
 257         fadd    dword [edi]                     ; ST = autoc[0]+d*data[sample] d        WATCHOUT: no displacement byte here!
 258         fstp    dword [edi]                     ; autoc[0]+=d*data[sample]  ST = d      WATCHOUT: no displacement byte here!
 259 .jumper1_0:
 260
 261         fstp    st0                             ; pop d, ST = empty
 262         add     esi, byte 4                     ; sample++
 263         dec     ecx                             ; one fewer sample left in the first loop
 264         jz      .loop1_end
 265         fld     dword [esi]                     ; ST = d <- data[sample]
 266         jmp     edx                             ; edx is unchanged: every sample in this loop uses all lag coefficients
 267 .loop1_end:
 268
 269         ;       for(; sample < data_len; sample++) {
 270         ;               d = data[sample];
 271         ;               for(coeff = 0; coeff < data_len - sample; coeff++)
 272         ;                       autoc[coeff] += d * data[sample+coeff];
 273         ;       }
 274         mov     ecx, [esp + 24]                 ; ecx <- lag
 275         dec     ecx                             ; ecx <- lag - 1
 276         jz      near .end                       ; skip loop if 0 (i.e. lag == 1)
 277
 278         fld     dword [esi]                     ; ST = d <- data[sample]
 279         mov     eax, ecx                        ; eax <- lag - 1 == data_len - sample the first time through
 280         ; each unrolled step below (fld/fmul/fadd/fstp) is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax)
 281         lea     edx, [eax + eax*2]              ; edx = 3*(lag-1)
 282         neg     edx                             ; edx = -3*(lag-1)
 283         lea     edx, [eax + edx*4 + .jumper2_0 - .get_eip2]     ; edx = -11*(lag-1) + (.jumper2_0 - .get_eip2)
 284         call    .get_eip2                       ; pushes the run-time address of .get_eip2 (no absolute address needed)
 285 .get_eip2:
 286         pop     ebx                             ; ebx = run-time address of .get_eip2
 287         add     edx, ebx                        ; edx = .jumper2_0 - 11*(lag-1)
 288         inc     edx                             ; compensate for the shorter opcode on the last iteration
 289         inc     edx                             ; compensate for the shorter opcode on the last iteration
 290         inc     edx                             ; compensate for the shorter opcode on the last iteration
 291         jmp     edx                             ; enter the ladder at the step for coeff == lag-2
 292         ; unrolled inner loop for the tail samples; at most coeff 31 is needed here because lag-1 <= 32
 293         fld     st0                             ; ST = d d
 294         fmul    dword [esi + (31*4)]            ; ST = d*data[sample+31] d
 295         fadd    dword [edi + (31*4)]            ; ST = autoc[31]+d*data[sample+31] d
 296         fstp    dword [edi + (31*4)]            ; autoc[31]+=d*data[sample+31]  ST = d
 297         fld     st0                             ; ST = d d
 298         fmul    dword [esi + (30*4)]            ; ST = d*data[sample+30] d
 299         fadd    dword [edi + (30*4)]            ; ST = autoc[30]+d*data[sample+30] d
 300         fstp    dword [edi + (30*4)]            ; autoc[30]+=d*data[sample+30]  ST = d
 301         fld     st0                             ; ST = d d
 302         fmul    dword [esi + (29*4)]            ; ST = d*data[sample+29] d
 303         fadd    dword [edi + (29*4)]            ; ST = autoc[29]+d*data[sample+29] d
 304         fstp    dword [edi + (29*4)]            ; autoc[29]+=d*data[sample+29]  ST = d
 305         fld     st0                             ; ST = d d
 306         fmul    dword [esi + (28*4)]            ; ST = d*data[sample+28] d
 307         fadd    dword [edi + (28*4)]            ; ST = autoc[28]+d*data[sample+28] d
 308         fstp    dword [edi + (28*4)]            ; autoc[28]+=d*data[sample+28]  ST = d
 309         fld     st0                             ; ST = d d
 310         fmul    dword [esi + (27*4)]            ; ST = d*data[sample+27] d
 311         fadd    dword [edi + (27*4)]            ; ST = autoc[27]+d*data[sample+27] d
 312         fstp    dword [edi + (27*4)]            ; autoc[27]+=d*data[sample+27]  ST = d
 313         fld     st0                             ; ST = d d
 314         fmul    dword [esi + (26*4)]            ; ST = d*data[sample+26] d
 315         fadd    dword [edi + (26*4)]            ; ST = autoc[26]+d*data[sample+26] d
 316         fstp    dword [edi + (26*4)]            ; autoc[26]+=d*data[sample+26]  ST = d
 317         fld     st0                             ; ST = d d
 318         fmul    dword [esi + (25*4)]            ; ST = d*data[sample+25] d
 319         fadd    dword [edi + (25*4)]            ; ST = autoc[25]+d*data[sample+25] d
 320         fstp    dword [edi + (25*4)]            ; autoc[25]+=d*data[sample+25]  ST = d
 321         fld     st0                             ; ST = d d
 322         fmul    dword [esi + (24*4)]            ; ST = d*data[sample+24] d
 323         fadd    dword [edi + (24*4)]            ; ST = autoc[24]+d*data[sample+24] d
 324         fstp    dword [edi + (24*4)]            ; autoc[24]+=d*data[sample+24]  ST = d
 325         fld     st0                             ; ST = d d
 326         fmul    dword [esi + (23*4)]            ; ST = d*data[sample+23] d
 327         fadd    dword [edi + (23*4)]            ; ST = autoc[23]+d*data[sample+23] d
 328         fstp    dword [edi + (23*4)]            ; autoc[23]+=d*data[sample+23]  ST = d
 329         fld     st0                             ; ST = d d
 330         fmul    dword [esi + (22*4)]            ; ST = d*data[sample+22] d
 331         fadd    dword [edi + (22*4)]            ; ST = autoc[22]+d*data[sample+22] d
 332         fstp    dword [edi + (22*4)]            ; autoc[22]+=d*data[sample+22]  ST = d
 333         fld     st0                             ; ST = d d
 334         fmul    dword [esi + (21*4)]            ; ST = d*data[sample+21] d
 335         fadd    dword [edi + (21*4)]            ; ST = autoc[21]+d*data[sample+21] d
 336         fstp    dword [edi + (21*4)]            ; autoc[21]+=d*data[sample+21]  ST = d
 337         fld     st0                             ; ST = d d
 338         fmul    dword [esi + (20*4)]            ; ST = d*data[sample+20] d
 339         fadd    dword [edi + (20*4)]            ; ST = autoc[20]+d*data[sample+20] d
 340         fstp    dword [edi + (20*4)]            ; autoc[20]+=d*data[sample+20]  ST = d
 341         fld     st0                             ; ST = d d
 342         fmul    dword [esi + (19*4)]            ; ST = d*data[sample+19] d
 343         fadd    dword [edi + (19*4)]            ; ST = autoc[19]+d*data[sample+19] d
 344         fstp    dword [edi + (19*4)]            ; autoc[19]+=d*data[sample+19]  ST = d
 345         fld     st0                             ; ST = d d
 346         fmul    dword [esi + (18*4)]            ; ST = d*data[sample+18] d
 347         fadd    dword [edi + (18*4)]            ; ST = autoc[18]+d*data[sample+18] d
 348         fstp    dword [edi + (18*4)]            ; autoc[18]+=d*data[sample+18]  ST = d
 349         fld     st0                             ; ST = d d
 350         fmul    dword [esi + (17*4)]            ; ST = d*data[sample+17] d
 351         fadd    dword [edi + (17*4)]            ; ST = autoc[17]+d*data[sample+17] d
 352         fstp    dword [edi + (17*4)]            ; autoc[17]+=d*data[sample+17]  ST = d
 353         fld     st0                             ; ST = d d
 354         fmul    dword [esi + (16*4)]            ; ST = d*data[sample+16] d
 355         fadd    dword [edi + (16*4)]            ; ST = autoc[16]+d*data[sample+16] d
 356         fstp    dword [edi + (16*4)]            ; autoc[16]+=d*data[sample+16]  ST = d
 357         fld     st0                             ; ST = d d
 358         fmul    dword [esi + (15*4)]            ; ST = d*data[sample+15] d
 359         fadd    dword [edi + (15*4)]            ; ST = autoc[15]+d*data[sample+15] d
 360         fstp    dword [edi + (15*4)]            ; autoc[15]+=d*data[sample+15]  ST = d
 361         fld     st0                             ; ST = d d
 362         fmul    dword [esi + (14*4)]            ; ST = d*data[sample+14] d
 363         fadd    dword [edi + (14*4)]            ; ST = autoc[14]+d*data[sample+14] d
 364         fstp    dword [edi + (14*4)]            ; autoc[14]+=d*data[sample+14]  ST = d
 365         fld     st0                             ; ST = d d
 366         fmul    dword [esi + (13*4)]            ; ST = d*data[sample+13] d
 367         fadd    dword [edi + (13*4)]            ; ST = autoc[13]+d*data[sample+13] d
 368         fstp    dword [edi + (13*4)]            ; autoc[13]+=d*data[sample+13]  ST = d
 369         fld     st0                             ; ST = d d
 370         fmul    dword [esi + (12*4)]            ; ST = d*data[sample+12] d
 371         fadd    dword [edi + (12*4)]            ; ST = autoc[12]+d*data[sample+12] d
 372         fstp    dword [edi + (12*4)]            ; autoc[12]+=d*data[sample+12]  ST = d
 373         fld     st0                             ; ST = d d
 374         fmul    dword [esi + (11*4)]            ; ST = d*data[sample+11] d
 375         fadd    dword [edi + (11*4)]            ; ST = autoc[11]+d*data[sample+11] d
 376         fstp    dword [edi + (11*4)]            ; autoc[11]+=d*data[sample+11]  ST = d
 377         fld     st0                             ; ST = d d
 378         fmul    dword [esi + (10*4)]            ; ST = d*data[sample+10] d
 379         fadd    dword [edi + (10*4)]            ; ST = autoc[10]+d*data[sample+10] d
 380         fstp    dword [edi + (10*4)]            ; autoc[10]+=d*data[sample+10]  ST = d
 381         fld     st0                             ; ST = d d
 382         fmul    dword [esi + ( 9*4)]            ; ST = d*data[sample+9] d
 383         fadd    dword [edi + ( 9*4)]            ; ST = autoc[9]+d*data[sample+9] d
 384         fstp    dword [edi + ( 9*4)]            ; autoc[9]+=d*data[sample+9]  ST = d
 385         fld     st0                             ; ST = d d
 386         fmul    dword [esi + ( 8*4)]            ; ST = d*data[sample+8] d
 387         fadd    dword [edi + ( 8*4)]            ; ST = autoc[8]+d*data[sample+8] d
 388         fstp    dword [edi + ( 8*4)]            ; autoc[8]+=d*data[sample+8]  ST = d
 389         fld     st0                             ; ST = d d
 390         fmul    dword [esi + ( 7*4)]            ; ST = d*data[sample+7] d
 391         fadd    dword [edi + ( 7*4)]            ; ST = autoc[7]+d*data[sample+7] d
 392         fstp    dword [edi + ( 7*4)]            ; autoc[7]+=d*data[sample+7]  ST = d
 393         fld     st0                             ; ST = d d
 394         fmul    dword [esi + ( 6*4)]            ; ST = d*data[sample+6] d
 395         fadd    dword [edi + ( 6*4)]            ; ST = autoc[6]+d*data[sample+6] d
 396         fstp    dword [edi + ( 6*4)]            ; autoc[6]+=d*data[sample+6]  ST = d
 397         fld     st0                             ; ST = d d
 398         fmul    dword [esi + ( 5*4)]            ; ST = d*data[sample+5] d
 399         fadd    dword [edi + ( 5*4)]            ; ST = autoc[5]+d*data[sample+5] d
 400         fstp    dword [edi + ( 5*4)]            ; autoc[5]+=d*data[sample+5]  ST = d
 401         fld     st0                             ; ST = d d
 402         fmul    dword [esi + ( 4*4)]            ; ST = d*data[sample+4] d
 403         fadd    dword [edi + ( 4*4)]            ; ST = autoc[4]+d*data[sample+4] d
 404         fstp    dword [edi + ( 4*4)]            ; autoc[4]+=d*data[sample+4]  ST = d
 405         fld     st0                             ; ST = d d
 406         fmul    dword [esi + ( 3*4)]            ; ST = d*data[sample+3] d
 407         fadd    dword [edi + ( 3*4)]            ; ST = autoc[3]+d*data[sample+3] d
 408         fstp    dword [edi + ( 3*4)]            ; autoc[3]+=d*data[sample+3]  ST = d
 409         fld     st0                             ; ST = d d
 410         fmul    dword [esi + ( 2*4)]            ; ST = d*data[sample+2] d
 411         fadd    dword [edi + ( 2*4)]            ; ST = autoc[2]+d*data[sample+2] d
 412         fstp    dword [edi + ( 2*4)]            ; autoc[2]+=d*data[sample+2]  ST = d
 413         fld     st0                             ; ST = d d
 414         fmul    dword [esi + ( 1*4)]            ; ST = d*data[sample+1] d
 415         fadd    dword [edi + ( 1*4)]            ; ST = autoc[1]+d*data[sample+1] d
 416         fstp    dword [edi + ( 1*4)]            ; autoc[1]+=d*data[sample+1]  ST = d
 417         fld     st0                             ; ST = d d
 418         fmul    dword [esi]                     ; ST = d*data[sample] d                 WATCHOUT: no displacement byte here!
 419         fadd    dword [edi]                     ; ST = autoc[0]+d*data[sample] d        WATCHOUT: no displacement byte here!
 420         fstp    dword [edi]                     ; autoc[0]+=d*data[sample]  ST = d      WATCHOUT: no displacement byte here!
 421 .jumper2_0:
 422
 423         fstp    st0                             ; pop d, ST = empty
 424         add     esi, byte 4                     ; sample++
 425         dec     ecx                             ; one fewer tail sample left
 426         jz      .loop2_end
 427         add     edx, byte 11                    ; adjust our inner loop counter by adjusting the jump target
 428         fld     dword [esi]                     ; ST = d <- data[sample]
 429         jmp     edx                             ; re-enter one step later than before, i.e. one coefficient fewer
 430 .loop2_end:
 431
 432 .end:
 433         pop     ebx                             ; restore the registers pushed in .begin, in reverse order
 434         pop     edi
 435         pop     esi
 436         ret
 437
 438         ALIGN 16
 439 cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4
 440         ;[esp + 16] == autoc[]
 441         ;[esp + 12] == lag (never read by this routine)
 442         ;[esp + 8] == data_len
 443         ;[esp + 4] == data[]
 444         ; the offsets above are relative to esp at entry; this routine pushes nothing
 445         ;ASSERT(lag > 0)
 446         ;ASSERT(lag <= 4)
 447         ;ASSERT(lag <= data_len)
 448         ; NOTE: four floats are always written to autoc[] (see the movups at .loop_end), whatever lag is
 449         ;       for(coeff = 0; coeff < lag; coeff++)
 450         ;               autoc[coeff] = 0.0;
 451         xorps   xmm5, xmm5                      ; xmm5 = 0,0,0,0; float k of xmm5 accumulates autoc[k]
 452
 453         mov     edx, [esp + 8]                  ; edx == data_len
 454         mov     eax, [esp + 4]                  ; eax == &data[sample] <- &data[0]
 455
 456         movss   xmm0, [eax]                     ; xmm0 = 0,0,0,data[0]
 457         add     eax, 4                          ; eax = &data[1]
 458         movaps  xmm2, xmm0                      ; xmm2 = 0,0,0,data[0]
 459         shufps  xmm0, xmm0, 0                   ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
 460 .warmup:                                        ; xmm2 == data[sample-3],data[sample-2],data[sample-1],data[sample]
 461         mulps   xmm0, xmm2                      ; xmm0 = xmm0 * xmm2
 462         addps   xmm5, xmm0                      ; xmm5 += xmm0 * xmm2
 463         dec     edx                             ; sample 0 has been handled above
 464         jz      .loop_end                       ; data_len == 1: nothing left to accumulate
 465         ALIGN 16
 466 .loop_start:
 467         ; start by reading the next sample
 468         movss   xmm0, [eax]                     ; xmm0 = 0,0,0,data[sample]
 469         add     eax, 4                          ; advance to the following sample
 470         shufps  xmm0, xmm0, 0                   ; xmm0 = data[sample],data[sample],data[sample],data[sample]
 471         shufps  xmm2, xmm2, 93h                 ; 93h=2-1-0-3 => xmm2 gets rotated left by one float
 472         movss   xmm2, xmm0                      ; low float of xmm2 <- data[sample]; the upper three floats are kept
 473         mulps   xmm0, xmm2                      ; xmm0 = xmm0 * xmm2
 474         addps   xmm5, xmm0                      ; xmm5 += xmm0 * xmm2
 475         dec     edx                             ; one fewer sample remaining
 476         jnz     .loop_start
 477 .loop_end:
 478         ; store autoc
 479         mov     edx, [esp + 16]                 ; edx == autoc
 480         movups  [edx], xmm5                     ; unaligned 16-byte store: autoc[0..3] <- the four accumulated sums
 481
 482 .end:                                           ; no branch in this routine targets .end
 483         ret
 484
 485         ALIGN 16
 486 cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8
             ; Accumulates data[sample] * data[sample - k] for k = 0..7 over all
             ; data_len samples, using xmm6:xmm5 as eight float accumulators
             ; (history before data[0] is treated as 0.0), then stores the eight
             ; sums to autoc[0..7] with unaligned stores.
             ;
             ; The lag argument at [esp + 12] is never read: eight floats are
             ; always written, so autoc[] needs room for 8 entries even when
             ; lag < 8.  data[0] is loaded before data_len is tested, so
             ; data_len must be at least 1 (implied by the ASSERTs below).
             ;
             ; Registers written: eax, edx, xmm0-xmm3, xmm5, xmm6.
 487         ;[esp + 16] == autoc[]
 488         ;[esp + 12] == lag
 489         ;[esp + 8] == data_len
 490         ;[esp + 4] == data[]
 491
 492         ;ASSERT(lag > 0)
 493         ;ASSERT(lag <= 8)
 494         ;ASSERT(lag <= data_len)
 495
 496         ;       for(coeff = 0; coeff < lag; coeff++)
 497         ;               autoc[coeff] = 0.0;
 498         xorps   xmm5, xmm5                      ; xmm5 = running sums for lags 0..3
 499         xorps   xmm6, xmm6                      ; xmm6 = running sums for lags 4..7
 500
 501         mov     edx, [esp + 8]                  ; edx == data_len
 502         mov     eax, [esp + 4]                  ; eax == &data[sample] <- &data[0]
 503
 504         movss   xmm0, [eax]                     ; xmm0 = 0,0,0,data[0]
 505         add     eax, 4                          ; eax -> data[1]
 506         movaps  xmm2, xmm0                      ; xmm2 = 0,0,0,data[0]
 507         shufps  xmm0, xmm0, 0                   ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
 508         movaps  xmm1, xmm0                      ; xmm1 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
 509         xorps   xmm3, xmm3                      ; xmm3 = 0,0,0,0
 510 .warmup:                                        ; xmm3:xmm2 == data[sample-7],data[sample-6],...,data[sample]
 511         mulps   xmm0, xmm2                      ; products for lags 0..3 of the first sample
 512         mulps   xmm1, xmm3                      ; xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2
 513         addps   xmm5, xmm0
 514         addps   xmm6, xmm1                      ; xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2
 515         dec     edx                             ; one sample consumed
 516         jz      .loop_end                       ; data_len == 1: nothing left to accumulate
 517         ALIGN 16
 518 .loop_start:
 519         ; start by reading the next sample
 520         movss   xmm0, [eax]                     ; xmm0 = 0,0,0,data[sample]
 521         ; here we reorder the instructions; see the (#) indexes for a logical order
 522         shufps  xmm2, xmm2, 93h                 ; (3) 93h=2-1-0-3 => xmm2 gets rotated left by one float
 523         add     eax, 4                          ; (0)
 524         shufps  xmm3, xmm3, 93h                 ; (4) 93h=2-1-0-3 => xmm3 gets rotated left by one float
 525         shufps  xmm0, xmm0, 0                   ; (1) xmm0 = data[sample],data[sample],data[sample],data[sample]
 526         movss   xmm3, xmm2                      ; (5) carry the float rotated out of xmm2 into the low lane of xmm3
 527         movaps  xmm1, xmm0                      ; (2) xmm1 = data[sample],data[sample],data[sample],data[sample]
 528         movss   xmm2, xmm0                      ; (6) low lane of xmm2 = the new data[sample]
 529         mulps   xmm1, xmm3                      ; (8)
 530         mulps   xmm0, xmm2                      ; (7) xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2
 531         addps   xmm6, xmm1                      ; (10)
 532         addps   xmm5, xmm0                      ; (9) xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2
 533         dec     edx                             ; edx = samples still to process
 534         jnz     .loop_start
 535 .loop_end:
 536         ; store autoc
 537         mov     edx, [esp + 16]                 ; edx == autoc
 538         movups  [edx], xmm5                     ; autoc[0..3]
 539         movups  [edx + 16], xmm6                ; autoc[4..7]
 540
 541 .end:
 542         ret
 543
 544         ALIGN 16
 545 cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12
             ; Accumulates data[sample] * data[sample - k] for k = 0..11 over all
             ; data_len samples, using xmm7:xmm6:xmm5 as twelve float accumulators
             ; (history before data[0] is treated as 0.0), then stores the twelve
             ; sums to autoc[0..11] with unaligned stores.
             ;
             ; The lag argument at [esp + 12] is never read: twelve floats are
             ; always written, so autoc[] needs room for 12 entries even when
             ; lag < 12.  data[0] is loaded before data_len is tested, so
             ; data_len must be at least 1 (implied by the ASSERTs below).
             ;
             ; Registers written: eax, edx, xmm0-xmm7.
 546         ;[esp + 16] == autoc[]
 547         ;[esp + 12] == lag
 548         ;[esp + 8] == data_len
 549         ;[esp + 4] == data[]
 550
 551         ;ASSERT(lag > 0)
 552         ;ASSERT(lag <= 12)
 553         ;ASSERT(lag <= data_len)
 554
 555         ;       for(coeff = 0; coeff < lag; coeff++)
 556         ;               autoc[coeff] = 0.0;
 557         xorps   xmm5, xmm5                      ; xmm5 = running sums for lags 0..3
 558         xorps   xmm6, xmm6                      ; xmm6 = running sums for lags 4..7
 559         xorps   xmm7, xmm7                      ; xmm7 = running sums for lags 8..11
 560
 561         mov     edx, [esp + 8]                  ; edx == data_len
 562         mov     eax, [esp + 4]                  ; eax == &data[sample] <- &data[0]
 563
 564         movss   xmm0, [eax]                     ; xmm0 = 0,0,0,data[0]
 565         add     eax, 4                          ; eax -> data[1]
 566         movaps  xmm2, xmm0                      ; xmm2 = 0,0,0,data[0]
 567         shufps  xmm0, xmm0, 0                   ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
 568         xorps   xmm3, xmm3                      ; xmm3 = 0,0,0,0
 569         xorps   xmm4, xmm4                      ; xmm4 = 0,0,0,0
 570 .warmup:                                        ; xmm4:xmm3:xmm2 == data[sample-11],data[sample-10],...,data[sample]
 571         movaps  xmm1, xmm0                      ; xmm1 = scratch copy of the broadcast sample
 572         mulps   xmm1, xmm2
 573         addps   xmm5, xmm1                      ; lags 0..3
 574         movaps  xmm1, xmm0
 575         mulps   xmm1, xmm3
 576         addps   xmm6, xmm1                      ; lags 4..7
 577         mulps   xmm0, xmm4                      ; last use of xmm0, so multiply in place
 578         addps   xmm7, xmm0                      ; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2
 579         dec     edx                             ; one sample consumed
 580         jz      .loop_end                       ; data_len == 1: nothing left to accumulate
 581         ALIGN 16
 582 .loop_start:
 583         ; start by reading the next sample
 584         movss   xmm0, [eax]                     ; xmm0 = 0,0,0,data[sample]
 585         add     eax, 4
 586         shufps  xmm0, xmm0, 0                   ; xmm0 = data[sample],data[sample],data[sample],data[sample]
 587
 588         ; shift xmm4:xmm3:xmm2 left by one float
 589         shufps  xmm2, xmm2, 93h                 ; 93h=2-1-0-3 => xmm2 gets rotated left by one float
 590         shufps  xmm3, xmm3, 93h                 ; 93h=2-1-0-3 => xmm3 gets rotated left by one float
 591         shufps  xmm4, xmm4, 93h                 ; 93h=2-1-0-3 => xmm4 gets rotated left by one float
 592         movss   xmm4, xmm3                      ; carry the float rotated out of xmm3 into the low lane of xmm4
 593         movss   xmm3, xmm2                      ; carry the float rotated out of xmm2 into the low lane of xmm3
 594         movss   xmm2, xmm0                      ; low lane of xmm2 = the new data[sample]
 595
 596         ; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2
 597         movaps  xmm1, xmm0
 598         mulps   xmm1, xmm2
 599         addps   xmm5, xmm1                      ; lags 0..3
 600         movaps  xmm1, xmm0
 601         mulps   xmm1, xmm3
 602         addps   xmm6, xmm1                      ; lags 4..7
 603         mulps   xmm0, xmm4
 604         addps   xmm7, xmm0                      ; lags 8..11
 605
 606         dec     edx                             ; edx = samples still to process
 607         jnz     .loop_start
 608 .loop_end:
 609         ; store autoc
 610         mov     edx, [esp + 16]                 ; edx == autoc
 611         movups  [edx], xmm5                     ; autoc[0..3]
 612         movups  [edx + 16], xmm6                ; autoc[4..7]
 613         movups  [edx + 32], xmm7                ; autoc[8..11]
 614
 615 .end:
 616         ret
 617
 618         align 16
 619 cident FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow
             ; 3DNow! autocorrelation.  Builds the sums in a zeroed float array on
             ; the stack (lag rounded up to an even number of entries), then copies
             ; the first lag entries to autoc[].
             ;
             ;   .loop1_i/.loop1_j  four samples per outer step, two lags per inner step
             ;   .loop2_i/.loop2_j  remaining samples, one sample and one lag at a time
             ;   .loop3             copy the stack accumulators to autoc[]
             ;
             ; Several branches below test flags that were set a few instructions
             ; earlier; this works because the mov/movd/movq/pf*/punpck* instructions
             ; in between leave the integer flags alone.  Keep that in mind before
             ; reordering anything.
             ;
             ; Offsets are relative to ebp after the four pushes:
 620         ;[ebp + 32] autoc
 621         ;[ebp + 28] lag
 622         ;[ebp + 24] data_len
 623         ;[ebp + 20] data
 624
 625         push    ebp
 626         push    ebx
 627         push    esi
 628         push    edi
 629         mov     ebp, esp                ; ebp = argument base; also remembers esp for the epilogue
 630
 631         mov     esi, [ebp + 20]         ; esi = data
 632         mov     edi, [ebp + 24]         ; edi = data_len
 633         mov     edx, [ebp + 28]         ; edx = lag
 634         inc     edx
 635         and     edx, byte -2            ; edx = lag rounded up to an even count
 636         mov     eax, edx
 637         neg     eax
 638         and     esp, byte -8            ; round esp down to a multiple of 8 (accumulators are accessed with movq)
 639         lea     esp, [esp + 4 * eax]    ; reserve edx floats: the accumulator array lives at [esp]
 640         mov     ecx, edx
 641         xor     eax, eax
 642 .loop0:                                 ; zero the accumulators, highest index first
 643         dec     ecx
 644         mov     [esp + 4 * ecx], eax
 645         jnz     short .loop0            ; flags are still those of the dec
 646
 647         mov     eax, edi
 648         sub     eax, edx                ; eax = data_len - (even lag)
 649         mov     ebx, edx
 650         and     ebx, byte 1             ; NOTE(review): edx is even here, so ebx is always 0
 651         sub     eax, ebx                ;   ...and this subtraction changes nothing
 652         lea     ecx, [esi + 4 * eax - 12]       ; ecx = &data[data_len - (even lag) - 3], upper bound for .loop1_i
 653         cmp     esi, ecx
 654         mov     eax, esi                ; eax = pointer to the current group of samples
 655         ja      short .loop2_pre        ; too few samples for the 4-at-a-time loop (flags from the cmp)
 656         align   16              ;4 nops
 657 .loop1_i:
 658         movd    mm0, [eax]              ; load samples s0..s3 of this group...
 659         movd    mm2, [eax + 4]
 660         movd    mm4, [eax + 8]
 661         movd    mm6, [eax + 12]
 662         mov     ebx, edx                ; ebx = lag index, counts down from the even lag in steps of 2
 663         punpckldq       mm0, mm0        ; ...and duplicate each into both halves of its register
 664         punpckldq       mm2, mm2
 665         punpckldq       mm4, mm4
 666         punpckldq       mm6, mm6
 667         align   16              ;3 nops
 668 .loop1_j:
 669         sub     ebx, byte 2             ; sets the flags tested by the jg at the bottom of this loop
 670         movd    mm1, [eax + 4 * ebx]            ; s[j]
 671         movd    mm3, [eax + 4 * ebx + 4]        ; s[j+1]
 672         movd    mm5, [eax + 4 * ebx + 8]        ; s[j+2]
 673         movd    mm7, [eax + 4 * ebx + 12]       ; s[j+3]
 674         punpckldq       mm1, mm3        ; mm1 = s[j]  , s[j+1]
 675         punpckldq       mm3, mm5        ; mm3 = s[j+1], s[j+2]
 676         pfmul   mm1, mm0                ; * s0
 677         punpckldq       mm5, mm7        ; mm5 = s[j+2], s[j+3]
 678         pfmul   mm3, mm2                ; * s1
 679         punpckldq       mm7, [eax + 4 * ebx + 16]       ; mm7 = s[j+3], s[j+4]
 680         pfmul   mm5, mm4                ; * s2
 681         pfmul   mm7, mm6                ; * s3
 682         pfadd   mm1, mm3
 683         movq    mm3, [esp + 4 * ebx]    ; current accumulators for lags j and j+1
 684         pfadd   mm5, mm7
 685         pfadd   mm1, mm5                ; mm1 = this group's contribution to lags j, j+1
 686         pfadd   mm3, mm1
 687         movq    [esp + 4 * ebx], mm3
 688         jg      short .loop1_j          ; repeat while the index left by the sub above is > 0
 689
 690         add     eax, byte 16            ; advance four samples
 691         cmp     eax, ecx
 692         jb      short .loop1_i
 693
 694 .loop2_pre:
 695         mov     ebx, eax                ; ebx = pointer to the first sample not yet processed
 696         sub     eax, esi
 697         shr     eax, 2                  ; eax = index of that sample
 698         lea     ecx, [esi + 4 * edi]    ; ecx = &data[data_len], end marker
 699         mov     esi, ebx                ; esi = pointer to the current sample
             ; NOTE(review): the .loop2_i body runs before its end test and the test
             ; is for equality, so at least one sample must remain at this point.
 700 .loop2_i:
 701         movd    mm0, [esi]              ; mm0 = current sample
 702         mov     ebx, edi
 703         sub     ebx, eax                ; ebx = samples from here to the end of data
 704         cmp     ebx, edx
 705         jbe     short .loop2_j
 706         mov     ebx, edx                ; ebx = min(samples remaining, even lag)
 707 .loop2_j:
 708         dec     ebx                     ; sets the flags tested by the jnz below
 709         movd    mm1, [esi + 4 * ebx]
 710         pfmul   mm1, mm0
 711         movd    mm2, [esp + 4 * ebx]
 712         pfadd   mm1, mm2
 713         movd    [esp + 4 * ebx], mm1    ; accumulator[ebx] += sample * the sample ebx positions later
 714
 715         jnz     short .loop2_j
 716
 717         add     esi, byte 4
 718         inc     eax
 719         cmp     esi, ecx
 720         jnz     short .loop2_i
 721
 722         mov     edi, [ebp + 32]         ; edi = autoc
 723         mov     edx, [ebp + 28]         ; edx = lag (the original, unrounded value)
 724 .loop3:                                 ; copy the accumulators out as raw 32-bit words
 725         dec     edx
 726         mov     eax, [esp + 4 * edx]
 727         mov     [edi + 4 * edx], eax
 728         jnz     short .loop3            ; flags are still those of the dec
 729
 730         femms                           ; leave MMX/3DNow! state before returning
 731
 732         mov     esp, ebp                ; drop the accumulator array and the alignment padding
 733         pop     edi
 734         pop     esi
 735         pop     ebx
 736         pop     ebp
 737         ret
 738
 739 ;void FLAC__lpc_compute_residual_from_qlp_coefficients(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
 740 ;
 741 ;       for(i = 0; i < data_len; i++) {
 742 ;               sum = 0;
 743 ;               for(j = 0; j < order; j++)
 744 ;                       sum += qlp_coeff[j] * data[i-j-1];
 745 ;               residual[i] = data[i] - (sum >> lp_quantization);
 746 ;       }
 747 ;
     ; Three code paths, chosen by order:
     ;   order == 1      .i_1_loop_i      previous sample is carried in eax
     ;   order >  32     .i_32more_loop_* plain nested loops
     ;   otherwise       .i_32            32 unrolled taps, entered part-way
     ;                                    through by a computed jump
     ; The .begin label is also an entry point for the _mmx variant of this
     ; routine, which jumps here with esi, edi, ebx and eax already loaded and
     ; the same four registers pushed.
 748         ALIGN   16
 749 cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32
             ; offsets below are valid after the four pushes
 750         ;[esp + 40]     residual[]
 751         ;[esp + 36]     lp_quantization
 752         ;[esp + 32]     order
 753         ;[esp + 28]     qlp_coeff[]
 754         ;[esp + 24]     data_len
 755         ;[esp + 20]     data[]
 756
 757         ;ASSERT(order > 0)
 758
 759         push    ebp
 760         push    ebx
 761         push    esi
 762         push    edi
 763
 764         mov     esi, [esp + 20]                 ; esi = data[]
 765         mov     edi, [esp + 40]                 ; edi = residual[]
 766         mov     eax, [esp + 32]                 ; eax = order
 767         mov     ebx, [esp + 24]                 ; ebx = data_len
 768
 769         test    ebx, ebx
 770         jz      near .end                       ; do nothing if data_len == 0
 771 .begin:
 772         cmp     eax, byte 1
 773         jg      short .i_1more                  ; signed compare: order > 1
 774
 775         mov     ecx, [esp + 28]
 776         mov     edx, [ecx]                      ; edx = qlp_coeff[0]
 777         mov     eax, [esi - 4]                  ; eax = data[-1]
 778         mov     cl, [esp + 36]                  ; cl = lp_quantization
 779         ALIGN   16
 780 .i_1_loop_i:                                    ; on entry eax = data[i-1]
 781         imul    eax, edx                        ; eax = qlp_coeff[0] * data[i-1]
 782         sar     eax, cl
 783         neg     eax
 784         add     eax, [esi]                      ; eax = data[i] - (sum >> lp_quantization)
 785         mov     [edi], eax                      ; residual[i]
 786         mov     eax, [esi]                      ; carry data[i] into the next iteration
 787         add     edi, byte 4
 788         add     esi, byte 4
 789         dec     ebx
 790         jnz     .i_1_loop_i
 791
 792         jmp     .end
 793
 794 .i_1more:
 795         cmp     eax, byte 32                    ; for order <= 32 there is a faster routine
 796         jbe     short .i_32
 797
 798         ; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32
 799         ALIGN 16
 800 .i_32more_loop_i:
 801         xor     ebp, ebp                        ; ebp = sum = 0
 802         mov     ecx, [esp + 32]
 803         mov     edx, ecx
 804         shl     edx, 2
 805         add     edx, [esp + 28]                 ; edx = &qlp_coeff[order]
 806         neg     ecx                             ; ecx = -order, counts up to 0
 807         ALIGN   16
 808 .i_32more_loop_j:
 809         sub     edx, byte 4                     ; step back to the next lower coefficient
 810         mov     eax, [edx]
 811         imul    eax, [esi + 4 * ecx]            ; * data[i + ecx], oldest sample first
 812         add     ebp, eax
 813         inc     ecx
 814         jnz     short .i_32more_loop_j
 815
 816         mov     cl, [esp + 36]                  ; cl = lp_quantization
 817         sar     ebp, cl
 818         neg     ebp
 819         add     ebp, [esi]
 820         mov     [edi], ebp                      ; residual[i] = data[i] - (sum >> lp_quantization)
 821         add     esi, byte 4
 822         add     edi, byte 4
 823
 824         dec     ebx
 825         jnz     .i_32more_loop_i
 826
 827         jmp     .end
 828
 829 .i_32:
             ; Compute in edx the address of the first of the last `order` taps
             ; below.  The lea scales -order by 9, i.e. it assumes every tap
             ; (mov / imul / add) assembles to 9 bytes; call/pop fetches the
             ; run-time address of .get_eip0 so no absolute address is needed.
             ; Changing any instruction between the jmp and .jumper_0 requires
             ; rechecking this arithmetic.
 830         sub     edi, esi                        ; edi = residual - data, so [edi + esi] addresses residual[i]
 831         neg     eax
 832         lea     edx, [eax + eax * 8 + .jumper_0 - .get_eip0]
 833         call    .get_eip0
 834 .get_eip0:
 835         pop     eax                             ; eax = address of .get_eip0
 836         add     edx, eax                        ; edx = .jumper_0 - 9 * order
 837         inc     edx                             ; the last tap is one byte shorter (see the note on it below)
 838         mov     eax, [esp + 28]                 ; eax = qlp_coeff[]
 839         xor     ebp, ebp                        ; ebp = sum = 0
 840         jmp     edx
 841
 842         mov     ecx, [eax + 124]
 843         imul    ecx, [esi - 128]
 844         add     ebp, ecx                        ; sum += qlp_coeff[31] * data[i-32]
 845         mov     ecx, [eax + 120]
 846         imul    ecx, [esi - 124]
 847         add     ebp, ecx                        ; sum += qlp_coeff[30] * data[i-31]
 848         mov     ecx, [eax + 116]
 849         imul    ecx, [esi - 120]
 850         add     ebp, ecx                        ; sum += qlp_coeff[29] * data[i-30]
 851         mov     ecx, [eax + 112]
 852         imul    ecx, [esi - 116]
 853         add     ebp, ecx                        ; sum += qlp_coeff[28] * data[i-29]
 854         mov     ecx, [eax + 108]
 855         imul    ecx, [esi - 112]
 856         add     ebp, ecx                        ; sum += qlp_coeff[27] * data[i-28]
 857         mov     ecx, [eax + 104]
 858         imul    ecx, [esi - 108]
 859         add     ebp, ecx                        ; sum += qlp_coeff[26] * data[i-27]
 860         mov     ecx, [eax + 100]
 861         imul    ecx, [esi - 104]
 862         add     ebp, ecx                        ; sum += qlp_coeff[25] * data[i-26]
 863         mov     ecx, [eax + 96]
 864         imul    ecx, [esi - 100]
 865         add     ebp, ecx                        ; sum += qlp_coeff[24] * data[i-25]
 866         mov     ecx, [eax + 92]
 867         imul    ecx, [esi - 96]
 868         add     ebp, ecx                        ; sum += qlp_coeff[23] * data[i-24]
 869         mov     ecx, [eax + 88]
 870         imul    ecx, [esi - 92]
 871         add     ebp, ecx                        ; sum += qlp_coeff[22] * data[i-23]
 872         mov     ecx, [eax + 84]
 873         imul    ecx, [esi - 88]
 874         add     ebp, ecx                        ; sum += qlp_coeff[21] * data[i-22]
 875         mov     ecx, [eax + 80]
 876         imul    ecx, [esi - 84]
 877         add     ebp, ecx                        ; sum += qlp_coeff[20] * data[i-21]
 878         mov     ecx, [eax + 76]
 879         imul    ecx, [esi - 80]
 880         add     ebp, ecx                        ; sum += qlp_coeff[19] * data[i-20]
 881         mov     ecx, [eax + 72]
 882         imul    ecx, [esi - 76]
 883         add     ebp, ecx                        ; sum += qlp_coeff[18] * data[i-19]
 884         mov     ecx, [eax + 68]
 885         imul    ecx, [esi - 72]
 886         add     ebp, ecx                        ; sum += qlp_coeff[17] * data[i-18]
 887         mov     ecx, [eax + 64]
 888         imul    ecx, [esi - 68]
 889         add     ebp, ecx                        ; sum += qlp_coeff[16] * data[i-17]
 890         mov     ecx, [eax + 60]
 891         imul    ecx, [esi - 64]
 892         add     ebp, ecx                        ; sum += qlp_coeff[15] * data[i-16]
 893         mov     ecx, [eax + 56]
 894         imul    ecx, [esi - 60]
 895         add     ebp, ecx                        ; sum += qlp_coeff[14] * data[i-15]
 896         mov     ecx, [eax + 52]
 897         imul    ecx, [esi - 56]
 898         add     ebp, ecx                        ; sum += qlp_coeff[13] * data[i-14]
 899         mov     ecx, [eax + 48]
 900         imul    ecx, [esi - 52]
 901         add     ebp, ecx                        ; sum += qlp_coeff[12] * data[i-13]
 902         mov     ecx, [eax + 44]
 903         imul    ecx, [esi - 48]
 904         add     ebp, ecx                        ; sum += qlp_coeff[11] * data[i-12]
 905         mov     ecx, [eax + 40]
 906         imul    ecx, [esi - 44]
 907         add     ebp, ecx                        ; sum += qlp_coeff[10] * data[i-11]
 908         mov     ecx, [eax + 36]
 909         imul    ecx, [esi - 40]
 910         add     ebp, ecx                        ; sum += qlp_coeff[ 9] * data[i-10]
 911         mov     ecx, [eax + 32]
 912         imul    ecx, [esi - 36]
 913         add     ebp, ecx                        ; sum += qlp_coeff[ 8] * data[i- 9]
 914         mov     ecx, [eax + 28]
 915         imul    ecx, [esi - 32]
 916         add     ebp, ecx                        ; sum += qlp_coeff[ 7] * data[i- 8]
 917         mov     ecx, [eax + 24]
 918         imul    ecx, [esi - 28]
 919         add     ebp, ecx                        ; sum += qlp_coeff[ 6] * data[i- 7]
 920         mov     ecx, [eax + 20]
 921         imul    ecx, [esi - 24]
 922         add     ebp, ecx                        ; sum += qlp_coeff[ 5] * data[i- 6]
 923         mov     ecx, [eax + 16]
 924         imul    ecx, [esi - 20]
 925         add     ebp, ecx                        ; sum += qlp_coeff[ 4] * data[i- 5]
 926         mov     ecx, [eax + 12]
 927         imul    ecx, [esi - 16]
 928         add     ebp, ecx                        ; sum += qlp_coeff[ 3] * data[i- 4]
 929         mov     ecx, [eax + 8]
 930         imul    ecx, [esi - 12]
 931         add     ebp, ecx                        ; sum += qlp_coeff[ 2] * data[i- 3]
 932         mov     ecx, [eax + 4]
 933         imul    ecx, [esi - 8]
 934         add     ebp, ecx                        ; sum += qlp_coeff[ 1] * data[i- 2]
 935         mov     ecx, [eax]                      ; no displacement here, so this mov is one byte shorter than in the other taps
 936         imul    ecx, [esi - 4]
 937         add     ebp, ecx                        ; sum += qlp_coeff[ 0] * data[i- 1]
 938 .jumper_0:
 939
 940         mov     cl, [esp + 36]                  ; cl = lp_quantization
 941         sar     ebp, cl
 942         neg     ebp
 943         add     ebp, [esi]                      ; ebp = data[i] - (sum >> lp_quantization)
 944         mov     [edi + esi], ebp                ; residual[i] (edi holds residual - data)
 945         add     esi, byte 4
 946
 947         dec     ebx
 948         jz      short .end
 949         xor     ebp, ebp                        ; sum = 0 for the next sample
 950         jmp     edx                             ; re-enter the unrolled taps at the same point
 951
 952 .end:
 953         pop     edi
 954         pop     esi
 955         pop     ebx
 956         pop     ebp
 957         ret
 958
 959 ; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for
 960 ; the channel must be <= 16.  Especially note that this routine cannot be used
 961 ; for side-channel coded 16bps channels since the effective bps is 17.
     ;
     ; Takes the same arguments as
     ; FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32 and produces two
     ; residuals per loop iteration:
     ;   - the low 16 bits of every coefficient are pushed onto the stack, then
     ;     zero words are pushed until the count is a multiple of four;
     ;   - mm5 keeps the four words for qlp_coeff[0..3], mm4 the four most recent
     ;     samples as 16-bit words, mm6 the shift count;
     ;   - orders above 4 add the older taps in .mmx_4more_loop_j;
     ;   - a final odd sample (and data_len == 1) is handed to the .begin entry of
     ;     the plain ia32 routine.
     ; data[-4..-1] are read unconditionally, whatever the order.
 962         ALIGN   16
 963 cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
             ; offsets below are valid after the four pushes
 964         ;[esp + 40]     residual[]
 965         ;[esp + 36]     lp_quantization
 966         ;[esp + 32]     order
 967         ;[esp + 28]     qlp_coeff[]
 968         ;[esp + 24]     data_len
 969         ;[esp + 20]     data[]
 970
 971         ;ASSERT(order > 0)
 972
 973         push    ebp
 974         push    ebx
 975         push    esi
 976         push    edi
 977
 978         mov     esi, [esp + 20]                 ; esi = data[]
 979         mov     edi, [esp + 40]                 ; edi = residual[]
 980         mov     eax, [esp + 32]                 ; eax = order
 981         mov     ebx, [esp + 24]                 ; ebx = data_len
 982
 983         test    ebx, ebx
 984         jz      near .end                       ; do nothing if data_len == 0
 985         dec     ebx                             ; ebx = data_len - 1 (pair-loop counter)
 986         test    ebx, ebx
 987         jz      near .last_one                  ; a single sample: leave it to the ia32 routine
 988
 989         mov     edx, [esp + 28]                 ; edx = qlp_coeff[]
 990         movd    mm6, [esp + 36]                 ; mm6 = 0:lp_quantization
 991         mov     ebp, esp                        ; remember esp; restored at .mmx_end
 992
 993         and     esp, 0xfffffff8                 ; round esp down to a multiple of 8
 994
 995         xor     ecx, ecx
 996 .copy_qlp_loop:                                 ; push the low word of qlp_coeff[0..order-1]
 997         push    word [edx + 4 * ecx]
 998         inc     ecx
 999         cmp     ecx, eax
1000         jnz     short .copy_qlp_loop
1001
1002         and     ecx, 0x3                        ; ecx = order mod 4
1003         test    ecx, ecx
1004         je      short .za_end                   ; already a multiple of four
1005         sub     ecx, byte 4                     ; ecx = -(number of padding words needed)
1006 .za_loop:
1007         push    word 0
1008         inc     eax                             ; eax ends up as order rounded up to a multiple of 4
1009         inc     ecx
1010         jnz     short .za_loop
1011 .za_end:
1012
1013         movq    mm5, [esp + 2 * eax - 8]        ; mm5 = the four words pushed first (qlp_coeff[0..3])
1014         movd    mm4, [esi - 16]
1015         punpckldq       mm4, [esi - 12]         ; mm4 = data[-4], data[-3]
1016         movd    mm0, [esi - 8]
1017         punpckldq       mm0, [esi - 4]          ; mm0 = data[-2], data[-1]
1018         packssdw        mm4, mm0                ; mm4 = data[-4..-1] as four signed words
1019
1020         cmp     eax, byte 4
1021         jnbe    short .mmx_4more                ; more than four (padded) coefficients
1022
1023         align   16
1024 .mmx_4_loop_i:                                  ; on entry mm4 = data[i-4..i-1] as words
1025         movd    mm1, [esi]
1026         movq    mm3, mm4                        ; mm3 = history for sample i
1027         punpckldq       mm1, [esi + 4]          ; mm1 = data[i], data[i+1]
1028         psrlq   mm4, 16                         ; drop the oldest word
1029         movq    mm0, mm1
1030         psllq   mm0, 48                         ; low word of data[i] moved to the top word
1031         por     mm4, mm0
1032         movq    mm2, mm4                        ; mm2 = history for sample i+1
1033         psrlq   mm4, 16
1034         pxor    mm0, mm0
1035         punpckhdq       mm0, mm1                ; mm0 = 0, data[i+1]
1036         pmaddwd mm3, mm5                        ; two partial sums for sample i
1037         pmaddwd mm2, mm5                        ; two partial sums for sample i+1
1038         psllq   mm0, 16                         ; low word of data[i+1] moved to the top word
1039         por     mm4, mm0                        ; mm4 = history for the next iteration
1040         movq    mm0, mm3
1041         punpckldq       mm3, mm2
1042         punpckhdq       mm0, mm2
1043         paddd   mm3, mm0                        ; mm3 = sum for sample i, sum for sample i+1
1044         psrad   mm3, mm6                        ; >> lp_quantization
1045         psubd   mm1, mm3                        ; data - shifted sum
1046         movd    [edi], mm1                      ; residual[i]
1047         punpckhdq       mm1, mm1
1048         movd    [edi + 4], mm1                  ; residual[i+1]
1049
1050         add     edi, byte 8
1051         add     esi, byte 8
1052
1053         sub     ebx, 2
1054         jg      .mmx_4_loop_i                   ; ebx finishes at 0 (one sample left) or -1 (none)
1055         jmp     .mmx_end
1056
1057 .mmx_4more:
1058         shl     eax, 2
1059         neg     eax
1060         add     eax, byte 16                    ; eax = 16 - 4 * (padded order), a byte offset <= 0
1061
1062         align   16
1063 .mmx_4more_loop_i:                              ; same first-four-tap work as .mmx_4_loop_i
1064         movd    mm1, [esi]
1065         punpckldq       mm1, [esi + 4]          ; mm1 = data[i], data[i+1]
1066         movq    mm3, mm4                        ; mm3 = history for sample i
1067         psrlq   mm4, 16
1068         movq    mm0, mm1
1069         psllq   mm0, 48
1070         por     mm4, mm0
1071         movq    mm2, mm4                        ; mm2 = history for sample i+1
1072         psrlq   mm4, 16
1073         pxor    mm0, mm0
1074         punpckhdq       mm0, mm1
1075         pmaddwd mm3, mm5
1076         pmaddwd mm2, mm5
1077         psllq   mm0, 16
1078         por     mm4, mm0                        ; mm4 = history for the next iteration
1079
1080         mov     ecx, esi
1081         add     ecx, eax                        ; ecx walks forward to esi, 16 bytes per step
1082         mov     edx, esp                        ; edx = lowest-addressed group of four coefficient words
1083
1084         align   16
1085 .mmx_4more_loop_j:                              ; add one older group of four taps to both sums
1086         movd    mm0, [ecx - 16]
1087         movd    mm7, [ecx - 8]
1088         punpckldq       mm0, [ecx - 12]
1089         punpckldq       mm7, [ecx - 4]
1090         packssdw        mm0, mm7                ; four samples [ecx-16 .. ecx-4] as words
1091         pmaddwd mm0, [edx]
1092         punpckhdq       mm7, mm7                ; keep the sample from [ecx - 4] in the low half
1093         paddd   mm3, mm0                        ; into the sums for sample i
1094         movd    mm0, [ecx - 12]
1095         punpckldq       mm0, [ecx - 8]
1096         punpckldq       mm7, [ecx]
1097         packssdw        mm0, mm7                ; four samples [ecx-12 .. ecx] as words
1098         pmaddwd mm0, [edx]
1099         paddd   mm2, mm0                        ; into the sums for sample i+1
1100
1101         add     edx, byte 8
1102         add     ecx, byte 16
1103         cmp     ecx, esi
1104         jnz     .mmx_4more_loop_j
1105
1106         movq    mm0, mm3
1107         punpckldq       mm3, mm2
1108         punpckhdq       mm0, mm2
1109         paddd   mm3, mm0                        ; mm3 = sum for sample i, sum for sample i+1
1110         psrad   mm3, mm6                        ; >> lp_quantization
1111         psubd   mm1, mm3                        ; data - shifted sum
1112         movd    [edi], mm1                      ; residual[i]
1113         punpckhdq       mm1, mm1
1114         movd    [edi + 4], mm1                  ; residual[i+1]
1115
1116         add     edi, byte 8
1117         add     esi, byte 8
1118
1119         sub     ebx, 2
1120         jg      near .mmx_4more_loop_i
1121
1122 .mmx_end:
1123         emms                                    ; leave MMX state
1124         mov     esp, ebp                        ; discard the coefficient words and alignment padding
1125 .last_one:
1126         mov     eax, [esp + 32]                 ; eax = order again, as the ia32 routine's .begin expects
1127         inc     ebx                             ; ebx = samples still to do: 1 or 0
1128         jnz     near FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32.begin
1129
1130 .end:
1131         pop     edi
1132         pop     esi
1133         pop     ebx
1134         pop     ebp
1135         ret
1136
1137 ; **********************************************************************
1138 ;
1139 ; void FLAC__lpc_restore_signal(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
1140 ; {
1141 ;       unsigned i, j;
1142 ;       FLAC__int32 sum;
1143 ;
1144 ;       FLAC__ASSERT(order > 0);
1145 ;
1146 ;       for(i = 0; i < data_len; i++) {
1147 ;               sum = 0;
1148 ;               for(j = 0; j < order; j++)
1149 ;                       sum += qlp_coeff[j] * data[i-j-1];
1150 ;               data[i] = residual[i] + (sum >> lp_quantization);
1151 ;       }
1152 ; }
1153         ALIGN   16
1154 cident FLAC__lpc_restore_signal_asm_ia32
1155         ;[esp + 40]     data[]
1156         ;[esp + 36]     lp_quantization
1157         ;[esp + 32]     order
1158         ;[esp + 28]     qlp_coeff[]
1159         ;[esp + 24]     data_len
1160         ;[esp + 20]     residual[]
1161
1162         ;ASSERT(order > 0)
1163
1164         push    ebp
1165         push    ebx
1166         push    esi
1167         push    edi
1168
1169         mov     esi, [esp + 20]                 ; esi = residual[]
1170         mov     edi, [esp + 40]                 ; edi = data[]
1171         mov     eax, [esp + 32]                 ; eax = order
1172         mov     ebx, [esp + 24]                 ; ebx = data_len
1173
1174         test    ebx, ebx
1175         jz      near .end                       ; do nothing if data_len == 0
1176
1177 .begin:
1178         cmp     eax, byte 1
1179         jg      short .x87_1more
1180
1181         mov     ecx, [esp + 28]
1182         mov     edx, [ecx]
1183         mov     eax, [edi - 4]
1184         mov     cl, [esp + 36]
1185         ALIGN   16
1186 .x87_1_loop_i:
1187         imul    eax, edx
1188         sar     eax, cl
1189         add     eax, [esi]
1190         mov     [edi], eax
1191         add     esi, byte 4
1192         add     edi, byte 4
1193         dec     ebx
1194         jnz     .x87_1_loop_i
1195
1196         jmp     .end
1197
1198 .x87_1more:
1199         cmp     eax, byte 32                    ; for order <= 32 there is a faster routine
1200         jbe     short .x87_32
1201
1202         ; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32
1203         ALIGN 16
1204 .x87_32more_loop_i:
1205         xor     ebp, ebp
1206         mov     ecx, [esp + 32]
1207         mov     edx, ecx
1208         shl     edx, 2
1209         add     edx, [esp + 28]
1210         neg     ecx
1211         ALIGN   16
1212 .x87_32more_loop_j:
1213         sub     edx, byte 4
1214         mov     eax, [edx]
1215         imul    eax, [edi + 4 * ecx]
1216         add     ebp, eax
1217         inc     ecx
1218         jnz     short .x87_32more_loop_j
1219
1220         mov     cl, [esp + 36]
1221         sar     ebp, cl
1222         add     ebp, [esi]
1223         mov     [edi], ebp
1224         add     edi, byte 4
1225         add     esi, byte 4
1226
1227         dec     ebx
1228         jnz     .x87_32more_loop_i
1229
1230         jmp     .end
1231
1232 .x87_32:
1233         sub     esi, edi
1234         neg     eax
1235         lea     edx, [eax + eax * 8 + .jumper_0 - .get_eip0]
1236         call    .get_eip0
1237 .get_eip0:
1238         pop     eax
1239         add     edx, eax
1240         inc     edx                             ; compensate for the shorter opcode on the last iteration
1241         mov     eax, [esp + 28]                 ; eax = qlp_coeff[]
1242         xor     ebp, ebp
1243         jmp     edx
1244
1245         mov     ecx, [eax + 124]                ; ecx =  qlp_coeff[31]
1246         imul    ecx, [edi - 128]                ; ecx =  qlp_coeff[31] * data[i-32]
1247         add     ebp, ecx                        ; sum += qlp_coeff[31] * data[i-32]
1248         mov     ecx, [eax + 120]                ; ecx =  qlp_coeff[30]
1249         imul    ecx, [edi - 124]                ; ecx =  qlp_coeff[30] * data[i-31]
1250         add     ebp, ecx                        ; sum += qlp_coeff[30] * data[i-31]
1251         mov     ecx, [eax + 116]                ; ecx =  qlp_coeff[29]
1252         imul    ecx, [edi - 120]                ; ecx =  qlp_coeff[29] * data[i-30]
1253         add     ebp, ecx                        ; sum += qlp_coeff[29] * data[i-30]
1254         mov     ecx, [eax + 112]                ; ecx =  qlp_coeff[28]
1255         imul    ecx, [edi - 116]                ; ecx =  qlp_coeff[28] * data[i-29]
1256         add     ebp, ecx                        ; sum += qlp_coeff[28] * data[i-29]
1257         mov     ecx, [eax + 108]                ; ecx =  qlp_coeff[27]
1258         imul    ecx, [edi - 112]                ; ecx =  qlp_coeff[27] * data[i-28]
1259         add     ebp, ecx                        ; sum += qlp_coeff[27] * data[i-28]
1260         mov     ecx, [eax + 104]                ; ecx =  qlp_coeff[26]
1261         imul    ecx, [edi - 108]                ; ecx =  qlp_coeff[26] * data[i-27]
1262         add     ebp, ecx                        ; sum += qlp_coeff[26] * data[i-27]
1263         mov     ecx, [eax + 100]                ; ecx =  qlp_coeff[25]
1264         imul    ecx, [edi - 104]                ; ecx =  qlp_coeff[25] * data[i-26]
1265         add     ebp, ecx                        ; sum += qlp_coeff[25] * data[i-26]
1266         mov     ecx, [eax + 96]                 ; ecx =  qlp_coeff[24]
1267         imul    ecx, [edi - 100]                ; ecx =  qlp_coeff[24] * data[i-25]
1268         add     ebp, ecx                        ; sum += qlp_coeff[24] * data[i-25]
1269         mov     ecx, [eax + 92]                 ; ecx =  qlp_coeff[23]
1270         imul    ecx, [edi - 96]                 ; ecx =  qlp_coeff[23] * data[i-24]
1271         add     ebp, ecx                        ; sum += qlp_coeff[23] * data[i-24]
1272         mov     ecx, [eax + 88]                 ; ecx =  qlp_coeff[22]
1273         imul    ecx, [edi - 92]                 ; ecx =  qlp_coeff[22] * data[i-23]
1274         add     ebp, ecx                        ; sum += qlp_coeff[22] * data[i-23]
1275         mov     ecx, [eax + 84]                 ; ecx =  qlp_coeff[21]
1276         imul    ecx, [edi - 88]                 ; ecx =  qlp_coeff[21] * data[i-22]
1277         add     ebp, ecx                        ; sum += qlp_coeff[21] * data[i-22]
1278         mov     ecx, [eax + 80]                 ; ecx =  qlp_coeff[20]
1279         imul    ecx, [edi - 84]                 ; ecx =  qlp_coeff[20] * data[i-21]
1280         add     ebp, ecx                        ; sum += qlp_coeff[20] * data[i-21]
1281         mov     ecx, [eax + 76]                 ; ecx =  qlp_coeff[19]
1282         imul    ecx, [edi - 80]                 ; ecx =  qlp_coeff[19] * data[i-20]
1283         add     ebp, ecx                        ; sum += qlp_coeff[19] * data[i-20]
1284         mov     ecx, [eax + 72]                 ; ecx =  qlp_coeff[18]
1285         imul    ecx, [edi - 76]                 ; ecx =  qlp_coeff[18] * data[i-19]
1286         add     ebp, ecx                        ; sum += qlp_coeff[18] * data[i-19]
1287         mov     ecx, [eax + 68]                 ; ecx =  qlp_coeff[17]
1288         imul    ecx, [edi - 72]                 ; ecx =  qlp_coeff[17] * data[i-18]
1289         add     ebp, ecx                        ; sum += qlp_coeff[17] * data[i-18]
1290         mov     ecx, [eax + 64]                 ; ecx =  qlp_coeff[16]
1291         imul    ecx, [edi - 68]                 ; ecx =  qlp_coeff[16] * data[i-17]
1292         add     ebp, ecx                        ; sum += qlp_coeff[16] * data[i-17]
1293         mov     ecx, [eax + 60]                 ; ecx =  qlp_coeff[15]
1294         imul    ecx, [edi - 64]                 ; ecx =  qlp_coeff[15] * data[i-16]
1295         add     ebp, ecx                        ; sum += qlp_coeff[15] * data[i-16]
1296         mov     ecx, [eax + 56]                 ; ecx =  qlp_coeff[14]
1297         imul    ecx, [edi - 60]                 ; ecx =  qlp_coeff[14] * data[i-15]
1298         add     ebp, ecx                        ; sum += qlp_coeff[14] * data[i-15]
1299         mov     ecx, [eax + 52]                 ; ecx =  qlp_coeff[13]
1300         imul    ecx, [edi - 56]                 ; ecx =  qlp_coeff[13] * data[i-14]
1301         add     ebp, ecx                        ; sum += qlp_coeff[13] * data[i-14]
1302         mov     ecx, [eax + 48]                 ; ecx =  qlp_coeff[12]
1303         imul    ecx, [edi - 52]                 ; ecx =  qlp_coeff[12] * data[i-13]
1304         add     ebp, ecx                        ; sum += qlp_coeff[12] * data[i-13]
1305         mov     ecx, [eax + 44]                 ; ecx =  qlp_coeff[11]
1306         imul    ecx, [edi - 48]                 ; ecx =  qlp_coeff[11] * data[i-12]
1307         add     ebp, ecx                        ; sum += qlp_coeff[11] * data[i-12]
1308         mov     ecx, [eax + 40]                 ; ecx =  qlp_coeff[10]
1309         imul    ecx, [edi - 44]                 ; ecx =  qlp_coeff[10] * data[i-11]
1310         add     ebp, ecx                        ; sum += qlp_coeff[10] * data[i-11]
1311         mov     ecx, [eax + 36]                 ; ecx =  qlp_coeff[ 9]
1312         imul    ecx, [edi - 40]                 ; ecx =  qlp_coeff[ 9] * data[i-10]
1313         add     ebp, ecx                        ; sum += qlp_coeff[ 9] * data[i-10]
1314         mov     ecx, [eax + 32]                 ; ecx =  qlp_coeff[ 8]
1315         imul    ecx, [edi - 36]                 ; ecx =  qlp_coeff[ 8] * data[i- 9]
1316         add     ebp, ecx                        ; sum += qlp_coeff[ 8] * data[i- 9]
1317         mov     ecx, [eax + 28]                 ; ecx =  qlp_coeff[ 7]
1318         imul    ecx, [edi - 32]                 ; ecx =  qlp_coeff[ 7] * data[i- 8]
1319         add     ebp, ecx                        ; sum += qlp_coeff[ 7] * data[i- 8]
1320         mov     ecx, [eax + 24]                 ; ecx =  qlp_coeff[ 6]
1321         imul    ecx, [edi - 28]                 ; ecx =  qlp_coeff[ 6] * data[i- 7]
1322         add     ebp, ecx                        ; sum += qlp_coeff[ 6] * data[i- 7]
1323         mov     ecx, [eax + 20]                 ; ecx =  qlp_coeff[ 5]
1324         imul    ecx, [edi - 24]                 ; ecx =  qlp_coeff[ 5] * data[i- 6]
1325         add     ebp, ecx                        ; sum += qlp_coeff[ 5] * data[i- 6]
1326         mov     ecx, [eax + 16]                 ; ecx =  qlp_coeff[ 4]
1327         imul    ecx, [edi - 20]                 ; ecx =  qlp_coeff[ 4] * data[i- 5]
1328         add     ebp, ecx                        ; sum += qlp_coeff[ 4] * data[i- 5]
1329         mov     ecx, [eax + 12]                 ; ecx =  qlp_coeff[ 3]
1330         imul    ecx, [edi - 16]                 ; ecx =  qlp_coeff[ 3] * data[i- 4]
1331         add     ebp, ecx                        ; sum += qlp_coeff[ 3] * data[i- 4]
1332         mov     ecx, [eax + 8]                  ; ecx =  qlp_coeff[ 2]
1333         imul    ecx, [edi - 12]                 ; ecx =  qlp_coeff[ 2] * data[i- 3]
1334         add     ebp, ecx                        ; sum += qlp_coeff[ 2] * data[i- 3]
1335         mov     ecx, [eax + 4]                  ; ecx =  qlp_coeff[ 1]
1336         imul    ecx, [edi - 8]                  ; ecx =  qlp_coeff[ 1] * data[i- 2]
1337         add     ebp, ecx                        ; sum += qlp_coeff[ 1] * data[i- 2]
1338         mov     ecx, [eax]                      ; ecx =  qlp_coeff[ 0] (NOTE: one byte missing from instruction)
1339         imul    ecx, [edi - 4]                  ; ecx =  qlp_coeff[ 0] * data[i- 1]
1340         add     ebp, ecx                        ; sum += qlp_coeff[ 0] * data[i- 1]
1341 .jumper_0:
1342
1343         mov     cl, [esp + 36]
1344         sar     ebp, cl                         ; ebp = (sum >> lp_quantization)
1345         add     ebp, [esi + edi]                ; ebp = residual[i] + (sum >> lp_quantization)
1346         mov     [edi], ebp                      ; data[i] = residual[i] + (sum >> lp_quantization)
1347         add     edi, byte 4
1348
1349         dec     ebx
1350         jz      short .end
1351         xor     ebp, ebp
1352         jmp     edx
1353
1354 .end:
1355         pop     edi
1356         pop     esi
1357         pop     ebx
1358         pop     ebp
1359         ret
1360
1361 ; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for
1362 ; the channel must be <= 16.  Especially note that this routine cannot be used
1363 ; for side-channel coded 16bps channels since the effective bps is 17.
1364 ; WATCHOUT: this routine requires that each data array have a buffer of up to
1365 ; 3 zeroes in front (at negative indices) for alignment purposes, i.e. for each
1366 ; channel n, data[n][-1] through data[n][-3] should be accessible and zero.
1367         ALIGN   16
1368 cident FLAC__lpc_restore_signal_asm_ia32_mmx    ; MMX restore: data[i] = residual[i] + (sum_j(qlp_coeff[j] * data[i-1-j]) >> lp_quantization), products taken on 16-bit values
1369         ;[esp + 40]     data[]          (written at [0, data_len); earlier samples are read at negative indices)
1370         ;[esp + 36]     lp_quantization (arithmetic right-shift count applied to the sum)
1371         ;[esp + 32]     order           (number of coefficients)
1372         ;[esp + 28]     qlp_coeff[]     (32-bit entries; only the low 16 bits of each are used on the MMX path)
1373         ;[esp + 24]     data_len        (number of samples to produce)
1374         ;[esp + 20]     residual[]      (32-bit entries, one per output sample)
1375         ; (the offsets above are relative to esp after the four register pushes below)
1376         ;ASSERT(order > 0)
1377         ; save the registers that are popped again at .end
1378         push    ebp
1379         push    ebx
1380         push    esi
1381         push    edi
1382         ; load the arguments into registers
1383         mov     esi, [esp + 20]                 ; esi = residual
1384         mov     edi, [esp + 40]                 ; edi = data
1385         mov     eax, [esp + 32]                 ; eax = order
1386         mov     ebx, [esp + 24]                 ; ebx = data_len (counts down in the sample loops)
1387
1388         test    ebx, ebx
1389         jz      near .end                       ; do nothing if data_len == 0
1390         cmp     eax, byte 4
1391         jb      near FLAC__lpc_restore_signal_asm_ia32.begin  ; order < 4 (unsigned): continue in FLAC__lpc_restore_signal_asm_ia32 at its .begin label, which lies outside this routine
1392
1393         mov     edx, [esp + 28]                 ; edx = qlp_coeff
1394         movd    mm6, [esp + 36]                 ; mm6 = lp_quantization (shift count for psrad)
1395         mov     ebp, esp                        ; remember esp; it is restored at .mmx_end
1396
1397         and     esp, 0xfffffff8                 ; round esp down to a multiple of 8
1398         ; build a 16-bit copy of qlp_coeff[] on the stack; pushing in index order puts higher indices at lower addresses
1399         xor     ecx, ecx
1400 .copy_qlp_loop:
1401         push    word [edx + 4 * ecx]            ; push the low 16 bits of qlp_coeff[ecx]
1402         inc     ecx
1403         cmp     ecx, eax
1404         jnz     short .copy_qlp_loop
1405
1406         and     ecx, 0x3                        ; ecx = order mod 4
1407         test    ecx, ecx
1408         je      short .za_end                   ; order is a multiple of 4: no padding needed
1409         sub     ecx, byte 4                     ; ecx = -(number of zero words to push)
1410 .za_loop:
1411         push    word 0                          ; zero coefficient used as padding
1412         inc     eax                             ; eax ends up as order rounded up to a multiple of 4
1413         inc     ecx
1414         jnz     short .za_loop
1415 .za_end:
1416         ; mm5 = the four words pushed first (qlp_coeff[3..0], low word to high word); mm4 = data[-4..-1] packed to words
1417         movq    mm5, [esp + 2 * eax - 8]
1418         movd    mm4, [edi - 16]
1419         punpckldq       mm4, [edi - 12]         ; mm4 = data[-4] (low dword), data[-3] (high dword)
1420         movd    mm0, [edi - 8]
1421         punpckldq       mm0, [edi - 4]          ; mm0 = data[-2] (low dword), data[-1] (high dword)
1422         packssdw        mm4, mm0                ; four signed-saturated words, oldest sample in the low word
1423
1424         cmp     eax, byte 4
1425         jnbe    short .mmx_4more                ; padded order > 4: use the general loop
1426         ; padded order == 4: all four coefficients are already in mm5
1427         align   16
1428 .mmx_4_loop_i:
1429         movq    mm7, mm4
1430         pmaddwd mm7, mm5                        ; two dword partial sums of coefficient * sample
1431         movq    mm0, mm7
1432         punpckhdq       mm7, mm7
1433         paddd   mm7, mm0                        ; low dword = sum of both partial sums
1434         psrad   mm7, mm6                        ; sum >> lp_quantization (arithmetic shift)
1435         movd    mm1, [esi]                      ; mm1 = residual[i]
1436         paddd   mm7, mm1
1437         movd    [edi], mm7                      ; data[i] = residual[i] + (sum >> lp_quantization)
1438         psllq   mm7, 48                         ; keep only the low 16 bits of data[i], moved to the top word
1439         psrlq   mm4, 16                         ; drop the oldest sample from the history
1440         por     mm4, mm7                        ; history now holds data[i-3..i] as words
1441
1442         add     esi, byte 4
1443         add     edi, byte 4
1444
1445         dec     ebx
1446         jnz     .mmx_4_loop_i
1447         jmp     .mmx_end
1448 .mmx_4more:
1449         shl     eax, 2
1450         neg     eax
1451         add     eax, byte 16                    ; eax = 16 - 4 * (padded order), a negative byte offset
1452         align   16
1453 .mmx_4more_loop_i:
1454         mov     ecx, edi
1455         add     ecx, eax                        ; ecx = address (padded order - 4) samples before data[i]
1456         mov     edx, esp                        ; edx = lowest-addressed coefficient words on the stack
1457         ; start the sum with the four newest samples (mm4) times qlp_coeff[0..3] (mm5)
1458         movq    mm7, mm4
1459         pmaddwd mm7, mm5
1460         ; add the older samples, four per iteration, until ecx reaches edi
1461         align   16
1462 .mmx_4more_loop_j:
1463         movd    mm0, [ecx - 16]
1464         punpckldq       mm0, [ecx - 12]
1465         movd    mm1, [ecx - 8]
1466         punpckldq       mm1, [ecx - 4]
1467         packssdw        mm0, mm1                ; four samples starting at [ecx - 16], packed to saturated words
1468         pmaddwd mm0, [edx]                      ; multiply with the four coefficient words at edx
1469         paddd   mm7, mm0
1470
1471         add     edx, byte 8
1472         add     ecx, byte 16
1473         cmp     ecx, edi
1474         jnz     .mmx_4more_loop_j
1475         ; finish the sample: combine partial sums, shift, add residual, store, update the history in mm4
1476         movq    mm0, mm7
1477         punpckhdq       mm7, mm7
1478         paddd   mm7, mm0                        ; low dword = total sum
1479         psrad   mm7, mm6                        ; sum >> lp_quantization (arithmetic shift)
1480         movd    mm1, [esi]                      ; mm1 = residual[i]
1481         paddd   mm7, mm1
1482         movd    [edi], mm7                      ; data[i] = residual[i] + (sum >> lp_quantization)
1483         psllq   mm7, 48                         ; low 16 bits of data[i] moved to the top word
1484         psrlq   mm4, 16                         ; drop the oldest sample from the history
1485         por     mm4, mm7                        ; history now holds data[i-3..i] as words
1486
1487         add     esi, byte 4
1488         add     edi, byte 4
1489
1490         dec     ebx
1491         jnz     short .mmx_4more_loop_i
1492 .mmx_end:
1493         emms                                    ; empty the MMX state
1494         mov     esp, ebp                        ; discard the on-stack coefficient copy and undo the alignment
1495
1496 .end:
1497         pop     edi
1498         pop     esi
1499         pop     ebx
1500         pop     ebp
1501         ret
1502
1503 end                                             ; NOTE(review): nothing follows this bare word; presumably NASM takes it as a label marking the end of the file - confirm nasm.h does not define it as a macro