libavcodec/x86/alacdsp.asm

   1 ;******************************************************************************
   2 ;* ALAC DSP SIMD optimizations
   3 ;*
   4 ;* Copyright (C) 2015 James Almer
   5 ;*
   6 ;* This file is part of FFmpeg.
   7 ;*
   8 ;* FFmpeg is free software; you can redistribute it and/or
   9 ;* modify it under the terms of the GNU Lesser General Public
  10 ;* License as published by the Free Software Foundation; either
  11 ;* version 2.1 of the License, or (at your option) any later version.
  12 ;*
  13 ;* FFmpeg is distributed in the hope that it will be useful,
  14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16 ;* Lesser General Public License for more details.
  17 ;*
  18 ;* You should have received a copy of the GNU Lesser General Public
  19 ;* License along with FFmpeg; if not, write to the Free Software
  20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21 ;******************************************************************************
  22
  23 %include "libavutil/x86/x86util.asm"
  24
  25 SECTION .text
  26
  27 INIT_XMM sse4
  28 %if ARCH_X86_64
  29 cglobal alac_decorrelate_stereo, 2, 5, 8, buf0, len, shift, weight, buf1
  30 %else
  31 cglobal alac_decorrelate_stereo, 2, 3, 8, buf0, len, shift, weight
  32 %define  buf1q  r2q
  33 %endif
  34     movd    m6, shiftm
  35     movd    m7, weightm
  36     SPLATD  m7
  37     shl   lend, 2
  38     mov  buf1q, [buf0q + gprsize]
  39     mov  buf0q, [buf0q]
  40     add  buf1q, lenq
  41     add  buf0q, lenq
  42     neg  lenq
  43
  44 align 16
  45 .loop:
  46     mova    m0, [buf0q + lenq]
  47     mova    m1, [buf0q + lenq + mmsize]
  48     mova    m2, [buf1q + lenq]
  49     mova    m3, [buf1q + lenq + mmsize]
  50     pmulld  m4, m2, m7
  51     pmulld  m5, m3, m7
  52     psrad   m4, m6
  53     psrad   m5, m6
  54     psubd   m0, m4
  55     psubd   m1, m5
  56     paddd   m2, m0
  57     paddd   m3, m1
  58     mova [buf1q + lenq], m0
  59     mova [buf1q + lenq + mmsize], m1
  60     mova [buf0q + lenq], m2
  61     mova [buf0q + lenq + mmsize], m3
  62
  63     add   lenq, mmsize*2
  64     jl .loop
  65     RET
  66
  67 INIT_XMM sse2
  68 cglobal alac_append_extra_bits_stereo, 2, 5, 5, buf0, exbuf0, buf1, exbuf1, len
  69     movifnidn lend, lenm
  70     movd      m4, r2m ; exbits
  71     shl     lend, 2
  72     mov    buf1q, [buf0q + gprsize]
  73     mov    buf0q, [buf0q]
  74     mov  exbuf1q, [exbuf0q + gprsize]
  75     mov  exbuf0q, [exbuf0q]
  76     add    buf1q, lenq
  77     add    buf0q, lenq
  78     add  exbuf1q, lenq
  79     add  exbuf0q, lenq
  80     neg lenq
  81
  82 align 16
  83 .loop:
  84     mova      m0, [buf0q + lenq]
  85     mova      m1, [buf0q + lenq + mmsize]
  86     pslld     m0, m4
  87     pslld     m1, m4
  88     mova      m2, [buf1q + lenq]
  89     mova      m3, [buf1q + lenq + mmsize]
  90     pslld     m2, m4
  91     pslld     m3, m4
  92     por       m0, [exbuf0q + lenq]
  93     por       m1, [exbuf0q + lenq + mmsize]
  94     por       m2, [exbuf1q + lenq]
  95     por       m3, [exbuf1q + lenq + mmsize]
  96     mova [buf0q + lenq         ], m0
  97     mova [buf0q + lenq + mmsize], m1
  98     mova [buf1q + lenq         ], m2
  99     mova [buf1q + lenq + mmsize], m3
 100
 101     add     lenq, mmsize*2
 102     jl .loop
 103     RET
 104
 105 %if ARCH_X86_64
 106 cglobal alac_append_extra_bits_mono, 2, 5, 3, buf, exbuf, exbits, ch, len
 107 %else
 108 cglobal alac_append_extra_bits_mono, 2, 3, 3, buf, exbuf, len
 109 %define exbitsm r2m
 110 %endif
 111     movifnidn lend, r4m
 112     movd     m2, exbitsm
 113     shl    lend, 2
 114     mov    bufq, [bufq]
 115     mov  exbufq, [exbufq]
 116     add    bufq, lenq
 117     add  exbufq, lenq
 118     neg lenq
 119
 120 align 16
 121 .loop:
 122     mova      m0, [bufq + lenq]
 123     mova      m1, [bufq + lenq + mmsize]
 124     pslld     m0, m2
 125     pslld     m1, m2
 126     por       m0, [exbufq + lenq]
 127     por       m1, [exbufq + lenq + mmsize]
 128     mova [bufq + lenq], m0
 129     mova [bufq + lenq + mmsize], m1
 130
 131     add     lenq, mmsize*2
 132     jl .loop
 133     RET