tizen/distrib/libav/libavcodec/x86/fmtconvert_mmx.c

   1 /*
   2  * Format Conversion Utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * This file is part of Libav.
   7  *
   8  * Libav is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU Lesser General Public
  10  * License as published by the Free Software Foundation; either
  11  * version 2.1 of the License, or (at your option) any later version.
  12  *
  13  * Libav is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16  * Lesser General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU Lesser General Public
  19  * License along with Libav; if not, write to the Free Software
  20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21  *
  22  * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
  23  */
  24
  25 #include "libavutil/cpu.h"
  26 #include "libavutil/x86_cpu.h"
  27 #include "libavcodec/fmtconvert.h"
  28
  29 static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len)
  30 {
  31     x86_reg i = -4*len;
  32     __asm__ volatile(
  33         "movss  %3, %%xmm4 \n"
  34         "shufps $0, %%xmm4, %%xmm4 \n"
  35         "1: \n"
  36         "cvtpi2ps   (%2,%0), %%xmm0 \n"
  37         "cvtpi2ps  8(%2,%0), %%xmm1 \n"
  38         "cvtpi2ps 16(%2,%0), %%xmm2 \n"
  39         "cvtpi2ps 24(%2,%0), %%xmm3 \n"
  40         "movlhps  %%xmm1,    %%xmm0 \n"
  41         "movlhps  %%xmm3,    %%xmm2 \n"
  42         "mulps    %%xmm4,    %%xmm0 \n"
  43         "mulps    %%xmm4,    %%xmm2 \n"
  44         "movaps   %%xmm0,   (%1,%0) \n"
  45         "movaps   %%xmm2, 16(%1,%0) \n"
  46         "add $32, %0 \n"
  47         "jl 1b \n"
  48         :"+r"(i)
  49         :"r"(dst+len), "r"(src+len), "m"(mul)
  50     );
  51 }
  52
  53 static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len)
  54 {
  55     x86_reg i = -4*len;
  56     __asm__ volatile(
  57         "movss  %3, %%xmm4 \n"
  58         "shufps $0, %%xmm4, %%xmm4 \n"
  59         "1: \n"
  60         "cvtdq2ps   (%2,%0), %%xmm0 \n"
  61         "cvtdq2ps 16(%2,%0), %%xmm1 \n"
  62         "mulps    %%xmm4,    %%xmm0 \n"
  63         "mulps    %%xmm4,    %%xmm1 \n"
  64         "movaps   %%xmm0,   (%1,%0) \n"
  65         "movaps   %%xmm1, 16(%1,%0) \n"
  66         "add $32, %0 \n"
  67         "jl 1b \n"
  68         :"+r"(i)
  69         :"r"(dst+len), "r"(src+len), "m"(mul)
  70     );
  71 }
  72
  73 static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){
  74     x86_reg reglen = len;
  75     // not bit-exact: pf2id uses different rounding than C and SSE
  76     __asm__ volatile(
  77         "add        %0          , %0        \n\t"
  78         "lea         (%2,%0,2)  , %2        \n\t"
  79         "add        %0          , %1        \n\t"
  80         "neg        %0                      \n\t"
  81         "1:                                 \n\t"
  82         "pf2id       (%2,%0,2)  , %%mm0     \n\t"
  83         "pf2id      8(%2,%0,2)  , %%mm1     \n\t"
  84         "pf2id     16(%2,%0,2)  , %%mm2     \n\t"
  85         "pf2id     24(%2,%0,2)  , %%mm3     \n\t"
  86         "packssdw   %%mm1       , %%mm0     \n\t"
  87         "packssdw   %%mm3       , %%mm2     \n\t"
  88         "movq       %%mm0       ,  (%1,%0)  \n\t"
  89         "movq       %%mm2       , 8(%1,%0)  \n\t"
  90         "add        $16         , %0        \n\t"
  91         " js 1b                             \n\t"
  92         "femms                              \n\t"
  93         :"+r"(reglen), "+r"(dst), "+r"(src)
  94     );
  95 }
  96
  97 static void float_to_int16_sse(int16_t *dst, const float *src, long len){
  98     x86_reg reglen = len;
  99     __asm__ volatile(
 100         "add        %0          , %0        \n\t"
 101         "lea         (%2,%0,2)  , %2        \n\t"
 102         "add        %0          , %1        \n\t"
 103         "neg        %0                      \n\t"
 104         "1:                                 \n\t"
 105         "cvtps2pi    (%2,%0,2)  , %%mm0     \n\t"
 106         "cvtps2pi   8(%2,%0,2)  , %%mm1     \n\t"
 107         "cvtps2pi  16(%2,%0,2)  , %%mm2     \n\t"
 108         "cvtps2pi  24(%2,%0,2)  , %%mm3     \n\t"
 109         "packssdw   %%mm1       , %%mm0     \n\t"
 110         "packssdw   %%mm3       , %%mm2     \n\t"
 111         "movq       %%mm0       ,  (%1,%0)  \n\t"
 112         "movq       %%mm2       , 8(%1,%0)  \n\t"
 113         "add        $16         , %0        \n\t"
 114         " js 1b                             \n\t"
 115         "emms                               \n\t"
 116         :"+r"(reglen), "+r"(dst), "+r"(src)
 117     );
 118 }
 119
 120 static void float_to_int16_sse2(int16_t *dst, const float *src, long len){
 121     x86_reg reglen = len;
 122     __asm__ volatile(
 123         "add        %0          , %0        \n\t"
 124         "lea         (%2,%0,2)  , %2        \n\t"
 125         "add        %0          , %1        \n\t"
 126         "neg        %0                      \n\t"
 127         "1:                                 \n\t"
 128         "cvtps2dq    (%2,%0,2)  , %%xmm0    \n\t"
 129         "cvtps2dq  16(%2,%0,2)  , %%xmm1    \n\t"
 130         "packssdw   %%xmm1      , %%xmm0    \n\t"
 131         "movdqa     %%xmm0      ,  (%1,%0)  \n\t"
 132         "add        $16         , %0        \n\t"
 133         " js 1b                             \n\t"
 134         :"+r"(reglen), "+r"(dst), "+r"(src)
 135     );
 136 }
 137
/* Six-channel float -> int16 interleaving converters. They are only declared
 * in this file; presumably the definitions come from the yasm sources when
 * HAVE_YASM is set -- confirm against the build. */
void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);

#if !HAVE_YASM
/* Without yasm, route the six-channel entry points to the generic
 * float_to_int16_interleave_misc_* helpers with a fixed channel count of 6.
 * The 3dn2 variant falls back to the plain 3DNow! helper. */
#define ff_float_to_int16_interleave6_sse(a,b,c)   float_to_int16_interleave_misc_sse(a,b,c,6)
#define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
#define ff_float_to_int16_interleave6_3dn2(a,b,c)  float_to_int16_interleave_misc_3dnow(a,b,c,6)
#endif
/* The SSE2 six-channel case reuses the SSE routine. */
#define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse
 148
 149 #define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \
 150 /* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\
 151 static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\
 152     DECLARE_ALIGNED(16, int16_t, tmp)[len];\
 153     int i,j,c;\
 154     for(c=0; c<channels; c++){\
 155         float_to_int16_##cpu(tmp, src[c], len);\
 156         for(i=0, j=c; i<len; i++, j+=channels)\
 157             dst[j] = tmp[i];\
 158     }\
 159 }\
 160 \
 161 static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\
 162     if(channels==1)\
 163         float_to_int16_##cpu(dst, src[0], len);\
 164     else if(channels==2){\
 165         x86_reg reglen = len; \
 166         const float *src0 = src[0];\
 167         const float *src1 = src[1];\
 168         __asm__ volatile(\
 169             "shl $2, %0 \n"\
 170             "add %0, %1 \n"\
 171             "add %0, %2 \n"\
 172             "add %0, %3 \n"\
 173             "neg %0 \n"\
 174             body\
 175             :"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\
 176         );\
 177     }else if(channels==6){\
 178         ff_float_to_int16_interleave6_##cpu(dst, src, len);\
 179     }else\
 180         float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\
 181 }
 182
/*
 * 3DNow! instantiation. Each loop iteration consumes 16 bytes (four floats)
 * from each of the two channels and writes 16 bytes of output: the samples
 * are converted with pf2id, packed to 16 bit with signed saturation and then
 * word-interleaved (punpcklwd/punpckhwd) so that samples of the two channels
 * alternate. The loop ends once the offset in %0 is no longer negative;
 * femms leaves MMX state afterwards.
 */
FLOAT_TO_INT16_INTERLEAVE(3dnow,
    "1:                         \n"
    "pf2id     (%2,%0), %%mm0   \n"
    "pf2id    8(%2,%0), %%mm1   \n"
    "pf2id     (%3,%0), %%mm2   \n"
    "pf2id    8(%3,%0), %%mm3   \n"
    "packssdw    %%mm1, %%mm0   \n"
    "packssdw    %%mm3, %%mm2   \n"
    "movq        %%mm0, %%mm1   \n"
    "punpcklwd   %%mm2, %%mm0   \n"
    "punpckhwd   %%mm2, %%mm1   \n"
    "movq        %%mm0,  (%1,%0)\n"
    "movq        %%mm1, 8(%1,%0)\n"
    "add $16, %0                \n"
    "js 1b                      \n"
    "femms                      \n"
)
 200
/*
 * SSE instantiation. Each loop iteration consumes 16 bytes (four floats)
 * from each of the two channels and writes 16 bytes of output: the samples
 * are converted with cvtps2pi into MMX registers, packed to 16 bit with
 * signed saturation and then word-interleaved (punpcklwd/punpckhwd) so that
 * samples of the two channels alternate. The loop ends once the offset in %0
 * is no longer negative; emms leaves MMX state afterwards.
 */
FLOAT_TO_INT16_INTERLEAVE(sse,
    "1:                         \n"
    "cvtps2pi  (%2,%0), %%mm0   \n"
    "cvtps2pi 8(%2,%0), %%mm1   \n"
    "cvtps2pi  (%3,%0), %%mm2   \n"
    "cvtps2pi 8(%3,%0), %%mm3   \n"
    "packssdw    %%mm1, %%mm0   \n"
    "packssdw    %%mm3, %%mm2   \n"
    "movq        %%mm0, %%mm1   \n"
    "punpcklwd   %%mm2, %%mm0   \n"
    "punpckhwd   %%mm2, %%mm1   \n"
    "movq        %%mm0,  (%1,%0)\n"
    "movq        %%mm1, 8(%1,%0)\n"
    "add $16, %0                \n"
    "js 1b                      \n"
    "emms                       \n"
)
 218
/*
 * SSE2 instantiation. Each loop iteration consumes 16 bytes (four floats)
 * from each of the two channels and writes 16 bytes of output. After
 * packssdw, xmm0 holds the four samples of the first channel in its low half
 * and the four samples of the second channel in its high half; movhlps
 * copies the high half into xmm1 so that punpcklwd can interleave the two
 * channels word by word. The loop ends once the offset in %0 is no longer
 * negative.
 */
FLOAT_TO_INT16_INTERLEAVE(sse2,
    "1:                         \n"
    "cvtps2dq  (%2,%0), %%xmm0  \n"
    "cvtps2dq  (%3,%0), %%xmm1  \n"
    "packssdw   %%xmm1, %%xmm0  \n"
    "movhlps    %%xmm0, %%xmm1  \n"
    "punpcklwd  %%xmm1, %%xmm0  \n"
    "movdqa     %%xmm0, (%1,%0) \n"
    "add $16, %0                \n"
    "js 1b                      \n"
)
 230
 231 static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){
 232     if(channels==6)
 233         ff_float_to_int16_interleave6_3dn2(dst, src, len);
 234     else
 235         float_to_int16_interleave_3dnow(dst, src, len, channels);
 236 }
 237
 238 #if HAVE_YASM
/* Two- and six-channel float interleaving routines. They are only declared in
 * this file; given the HAVE_YASM guard the definitions presumably live in the
 * yasm sources -- confirm against the build. */
void ff_float_interleave2_mmx(float *dst, const float **src, unsigned int len);
void ff_float_interleave2_sse(float *dst, const float **src, unsigned int len);

void ff_float_interleave6_mmx(float *dst, const float **src, unsigned int len);
void ff_float_interleave6_sse(float *dst, const float **src, unsigned int len);
 244
 245 static void float_interleave_mmx(float *dst, const float **src,
 246                                  unsigned int len, int channels)
 247 {
 248     if (channels == 2) {
 249         ff_float_interleave2_mmx(dst, src, len);
 250     } else if (channels == 6)
 251         ff_float_interleave6_mmx(dst, src, len);
 252     else
 253         ff_float_interleave_c(dst, src, len, channels);
 254 }
 255
 256 static void float_interleave_sse(float *dst, const float **src,
 257                                  unsigned int len, int channels)
 258 {
 259     if (channels == 2) {
 260         ff_float_interleave2_sse(dst, src, len);
 261     } else if (channels == 6)
 262         ff_float_interleave6_sse(dst, src, len);
 263     else
 264         ff_float_interleave_c(dst, src, len, channels);
 265 }
 266 #endif
 267
 268 void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
 269 {
 270     int mm_flags = av_get_cpu_flags();
 271
 272     if (mm_flags & AV_CPU_FLAG_MMX) {
 273 #if HAVE_YASM
 274         c->float_interleave = float_interleave_mmx;
 275 #endif
 276
 277         if(mm_flags & AV_CPU_FLAG_3DNOW){
 278             if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
 279                 c->float_to_int16 = float_to_int16_3dnow;
 280                 c->float_to_int16_interleave = float_to_int16_interleave_3dnow;
 281             }
 282         }
 283         if(mm_flags & AV_CPU_FLAG_3DNOWEXT){
 284             if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
 285                 c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
 286             }
 287         }
 288         if(mm_flags & AV_CPU_FLAG_SSE){
 289             c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
 290             c->float_to_int16 = float_to_int16_sse;
 291             c->float_to_int16_interleave = float_to_int16_interleave_sse;
 292 #if HAVE_YASM
 293             c->float_interleave = float_interleave_sse;
 294 #endif
 295         }
 296         if(mm_flags & AV_CPU_FLAG_SSE2){
 297             c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
 298             c->float_to_int16 = float_to_int16_sse2;
 299             c->float_to_int16_interleave = float_to_int16_interleave_sse2;
 300         }
 301     }
 302 }