/*
 * MMX optimized motion estimation
 * Copyright (c) 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer
 *
 * mostly by Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/avassert.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/mpegvideo.h"
#include "dsputil_x86.h"

#if HAVE_INLINE_ASM

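/*
 * Rounding constants for the half-pel averaging below: one 16-bit word per
 * lane, added before the >>1 (two-point) and >>2 (four-point) shifts.
 */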
DECLARE_ASM_CONST(8, uint64_t, round_tab)[3] = {
    0x0000000000000000ULL,
    0x0001000100010001ULL,
    0x0002000200020002ULL,
};

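/* 0x01 in every byte; used to cancel the upward bias of two chained pavgb
 * roundings in sad8_4_mmxext(). */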
DECLARE_ASM_CONST(8, uint64_t, bone) = 0x0101010101010101LL;

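/*
 * SAD of one 8-pixel-wide block, plain MMX: per-byte |blk1 - blk2| via the
 * classic psubusb/psubusb/por trick, widened to words against the zero in
 * %mm7 and accumulated as word sums in %mm6. Two rows are handled per loop
 * iteration; the callers (from PIX_SAD) must have cleared %mm6 and %mm7.
 * The index register starts at -stride * h and counts up to zero, with both
 * block pointers pre-biased by -len.
 */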
static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    x86_reg len = -(x86_reg)stride * h;
    __asm__ volatile (
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        "movq (%2, %%"REG_a"), %%mm2    \n\t"
        "movq (%2, %%"REG_a"), %%mm4    \n\t"
        "add %3, %%"REG_a"              \n\t"
        "psubusb %%mm0, %%mm2           \n\t"
        "psubusb %%mm4, %%mm0           \n\t"
        "movq (%1, %%"REG_a"), %%mm1    \n\t"
        "movq (%2, %%"REG_a"), %%mm3    \n\t"
        "movq (%2, %%"REG_a"), %%mm5    \n\t"
        "psubusb %%mm1, %%mm3           \n\t"
        "psubusb %%mm5, %%mm1           \n\t"
        "por %%mm2, %%mm0               \n\t"
        "por %%mm1, %%mm3               \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm3, %%mm2              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm3         \n\t"
        "punpckhbw %%mm7, %%mm2         \n\t"
        "paddw %%mm1, %%mm0             \n\t"
        "paddw %%mm3, %%mm2             \n\t"
        "paddw %%mm2, %%mm0             \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "add %3, %%"REG_a"              \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk2 - len), "r" ((x86_reg) stride));
}

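/*
 * Same as sad8_1_mmx(), but letting psadbw compute each 8-byte row SAD in a
 * single instruction; partial sums accumulate in %mm6.
 */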
static inline void sad8_1_mmxext(uint8_t *blk1, uint8_t *blk2,
                                 int stride, int h)
{
    __asm__ volatile (
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%1), %%mm0               \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "psadbw (%2), %%mm0             \n\t"
        "psadbw (%2, %3), %%mm1         \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm1, %%mm6             \n\t"
        "lea (%1,%3,2), %1              \n\t"
        "lea (%2,%3,2), %2              \n\t"
        "sub $2, %0                     \n\t"
        " jg 1b                         \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2)
        : "r" ((x86_reg) stride));
}

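/*
 * Full 16-pixel-wide SAD using SSE2. blk1 is loaded unaligned (movdqu);
 * blk2 is used as a memory operand to psadbw and must therefore be 16-byte
 * aligned. movhlps/paddw folds the two qword partial sums before the result
 * is moved out.
 */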
static int sad16_sse2(MpegEncContext *v, uint8_t *blk2, uint8_t *blk1,
                      int stride, int h)
{
    int ret;
    __asm__ volatile (
        "pxor %%xmm2, %%xmm2            \n\t"
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movdqu (%1), %%xmm0            \n\t"
        "movdqu (%1, %4), %%xmm1        \n\t"
        "psadbw (%2), %%xmm0            \n\t"
        "psadbw (%2, %4), %%xmm1        \n\t"
        "paddw %%xmm0, %%xmm2           \n\t"
        "paddw %%xmm1, %%xmm2           \n\t"
        "lea (%1,%4,2), %1              \n\t"
        "lea (%2,%4,2), %2              \n\t"
        "sub $2, %0                     \n\t"
        " jg 1b                         \n\t"
        "movhlps %%xmm2, %%xmm0         \n\t"
        "paddw   %%xmm0, %%xmm2         \n\t"
        "movd    %%xmm2, %3             \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2), "=r" (ret)
        : "r" ((x86_reg) stride));
    return ret;
}

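/*
 * SAD against the horizontal (x) half-pel interpolation of blk1: each row is
 * averaged with its one-byte-shifted neighbour via pavgb before psadbw.
 */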
static inline void sad8_x2a_mmxext(uint8_t *blk1, uint8_t *blk2,
                                   int stride, int h)
{
    __asm__ volatile (
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%1), %%mm0               \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "pavgb 1(%1), %%mm0             \n\t"
        "pavgb 1(%1, %3), %%mm1         \n\t"
        "psadbw (%2), %%mm0             \n\t"
        "psadbw (%2, %3), %%mm1         \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm1, %%mm6             \n\t"
        "lea (%1,%3,2), %1              \n\t"
        "lea (%2,%3,2), %2              \n\t"
        "sub $2, %0                     \n\t"
        " jg 1b                         \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2)
        : "r" ((x86_reg) stride));
}

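/*
 * SAD against the vertical (y) half-pel interpolation of blk1: the previous
 * row is carried in %mm0 and averaged with the current one via pavgb.
 */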
static inline void sad8_y2a_mmxext(uint8_t *blk1, uint8_t *blk2,
                                   int stride, int h)
{
    __asm__ volatile (
        "movq (%1), %%mm0               \n\t"
        "add %3, %1                     \n\t"
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%1), %%mm1               \n\t"
        "movq (%1, %3), %%mm2           \n\t"
        "pavgb %%mm1, %%mm0             \n\t"
        "pavgb %%mm2, %%mm1             \n\t"
        "psadbw (%2), %%mm0             \n\t"
        "psadbw (%2, %3), %%mm1         \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm1, %%mm6             \n\t"
        "movq %%mm2, %%mm0              \n\t"
        "lea (%1,%3,2), %1              \n\t"
        "lea (%2,%3,2), %2              \n\t"
        "sub $2, %0                     \n\t"
        " jg 1b                         \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2)
        : "r" ((x86_reg) stride));
}

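/*
 * SAD against the diagonal (xy) half-pel interpolation of blk1. The 4-point
 * average is approximated by two chained pavgb steps (horizontal, then
 * vertical); subtracting `bone` from one intermediate compensates for the
 * double round-up that chaining introduces, which is why this variant is
 * not bit-exact.
 */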
static inline void sad8_4_mmxext(uint8_t *blk1, uint8_t *blk2,
                                 int stride, int h)
{
    __asm__ volatile (
        "movq "MANGLE(bone)", %%mm5     \n\t"
        "movq (%1), %%mm0               \n\t"
        "pavgb 1(%1), %%mm0             \n\t"
        "add %3, %1                     \n\t"
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%1), %%mm1               \n\t"
        "movq (%1,%3), %%mm2            \n\t"
        "pavgb 1(%1), %%mm1             \n\t"
        "pavgb 1(%1,%3), %%mm2          \n\t"
        "psubusb %%mm5, %%mm1           \n\t"
        "pavgb %%mm1, %%mm0             \n\t"
        "pavgb %%mm2, %%mm1             \n\t"
        "psadbw (%2), %%mm0             \n\t"
        "psadbw (%2,%3), %%mm1          \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm1, %%mm6             \n\t"
        "movq %%mm2, %%mm0              \n\t"
        "lea (%1,%3,2), %1              \n\t"
        "lea (%2,%3,2), %2              \n\t"
        "sub $2, %0                     \n\t"
        " jg 1b                         \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2)
        : "r" ((x86_reg) stride)
          NAMED_CONSTRAINTS_ADD(bone));
}

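/*
 * SAD against the rounded average of two source blocks; this is how the
 * x2/y2 half-pel cases are built without pavgb: unpack to words, add, add
 * the rounding constant the caller preloaded into %mm5 (round_tab[1]),
 * shift right by one and repack before taking the absolute difference.
 */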
static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2,
                              int stride, int h)
{
    x86_reg len = -(x86_reg)stride * h;
    __asm__ volatile (
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        "movq (%2, %%"REG_a"), %%mm1    \n\t"
        "movq (%1, %%"REG_a"), %%mm2    \n\t"
        "movq (%2, %%"REG_a"), %%mm3    \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpcklbw %%mm7, %%mm1         \n\t"
        "punpckhbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "paddw %%mm0, %%mm1             \n\t"
        "paddw %%mm2, %%mm3             \n\t"
        "movq (%3, %%"REG_a"), %%mm4    \n\t"
        "movq (%3, %%"REG_a"), %%mm2    \n\t"
        "paddw %%mm5, %%mm1             \n\t"
        "paddw %%mm5, %%mm3             \n\t"
        "psrlw $1, %%mm1                \n\t"
        "psrlw $1, %%mm3                \n\t"
        "packuswb %%mm3, %%mm1          \n\t"
        "psubusb %%mm1, %%mm4           \n\t"
        "psubusb %%mm2, %%mm1           \n\t"
        "por %%mm4, %%mm1               \n\t"
        "movq %%mm1, %%mm0              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "paddw %%mm1, %%mm0             \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "add %4, %%"REG_a"              \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1a - len), "r" (blk1b - len), "r" (blk2 - len),
          "r" ((x86_reg) stride));
}

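/*
 * Exact xy half-pel SAD for plain MMX: pairs of horizontally adjacent
 * pixels are summed per row in 16-bit precision, two consecutive rows are
 * added, round_tab[2] is added and the result shifted right by two, giving
 * the exact rounded four-point average before the absolute difference.
 */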
static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    x86_reg len = -(x86_reg)stride * h;
    __asm__ volatile (
        "movq  (%1, %%"REG_a"), %%mm0   \n\t"
        "movq 1(%1, %%"REG_a"), %%mm2   \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "paddw %%mm2, %%mm0             \n\t"
        "paddw %%mm3, %%mm1             \n\t"
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq  (%2, %%"REG_a"), %%mm2   \n\t"
        "movq 1(%2, %%"REG_a"), %%mm4   \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "movq %%mm4, %%mm5              \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "punpcklbw %%mm7, %%mm4         \n\t"
        "punpckhbw %%mm7, %%mm5         \n\t"
        "paddw %%mm4, %%mm2             \n\t"
        "paddw %%mm5, %%mm3             \n\t"
        "movq %5, %%mm5                 \n\t"
        "paddw %%mm2, %%mm0             \n\t"
        "paddw %%mm3, %%mm1             \n\t"
        "paddw %%mm5, %%mm0             \n\t"
        "paddw %%mm5, %%mm1             \n\t"
        "movq (%3, %%"REG_a"), %%mm4    \n\t"
        "movq (%3, %%"REG_a"), %%mm5    \n\t"
        "psrlw $2, %%mm0                \n\t"
        "psrlw $2, %%mm1                \n\t"
        "packuswb %%mm1, %%mm0          \n\t"
        "psubusb %%mm0, %%mm4           \n\t"
        "psubusb %%mm5, %%mm0           \n\t"
        "por %%mm4, %%mm0               \n\t"
        "movq %%mm0, %%mm4              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm4         \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm4, %%mm6             \n\t"
        "movq  %%mm2, %%mm0             \n\t"
        "movq  %%mm3, %%mm1             \n\t"
        "add %4, %%"REG_a"              \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len),
          "r" ((x86_reg) stride), "m" (round_tab[2]));
}

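/* Fold the four 16-bit partial sums in %mm6 into one scalar. Truncating to
 * 16 bits is safe: even a 16x16 SAD of 8-bit pixels is at most 255 * 256. */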
static inline int sum_mmx(void)
{
    int ret;
    __asm__ volatile (
        "movq %%mm6, %%mm0              \n\t"
        "psrlq $32, %%mm6               \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "movq %%mm6, %%mm0              \n\t"
        "psrlq $16, %%mm6               \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "movd %%mm6, %0                 \n\t"
        : "=r" (ret));
    return ret & 0xFFFF;
}

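/* psadbw already left a single scalar sum in %mm6; just read it out. */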
static inline int sum_mmxext(void)
{
    int ret;
    __asm__ volatile (
        "movd %%mm6, %0                 \n\t"
        : "=r" (ret));
    return ret;
}

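/* x2/y2 half-pel SAD for plain MMX, expressed as the rounded average of
 * blk1 with its right / lower neighbour via sad8_2_mmx(). */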
static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    sad8_2_mmx(blk1, blk1 + 1, blk2, stride, h);
}

static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    sad8_2_mmx(blk1, blk1 + stride, blk2, stride, h);
}

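/*
 * Instantiate the public 8x8 and 16x16 SAD functions for one instruction
 * set suffix. Each wrapper clears the %mm6 accumulator and the %mm7 zero
 * register, preloads %mm5 with round_tab[1] where the two-point average
 * needs it, runs the 8-pixel-wide kernels (twice, side by side, for the
 * 16-wide versions) and reads the result back via sum_<suffix>().
 */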
#define PIX_SAD(suf)                                                    \
static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2,               \
                        uint8_t *blk1, int stride, int h)               \
{                                                                       \
    av_assert2(h == 8);                                                 \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        :);                                                             \
                                                                        \
    sad8_1_ ## suf(blk1, blk2, stride, 8);                              \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2,            \
                           uint8_t *blk1, int stride, int h)            \
{                                                                       \
    av_assert2(h == 8);                                                 \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        "movq %0, %%mm5        \n\t"                                    \
        :: "m" (round_tab[1]));                                         \
                                                                        \
    sad8_x2a_ ## suf(blk1, blk2, stride, 8);                            \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2,            \
                           uint8_t *blk1, int stride, int h)            \
{                                                                       \
    av_assert2(h == 8);                                                 \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        "movq %0, %%mm5        \n\t"                                    \
        :: "m" (round_tab[1]));                                         \
                                                                        \
    sad8_y2a_ ## suf(blk1, blk2, stride, 8);                            \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad8_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
                            uint8_t *blk1, int stride, int h)           \
{                                                                       \
    av_assert2(h == 8);                                                 \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        ::);                                                            \
                                                                        \
    sad8_4_ ## suf(blk1, blk2, stride, 8);                              \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad16_ ## suf(MpegEncContext *v, uint8_t *blk2,              \
                         uint8_t *blk1, int stride, int h)              \
{                                                                       \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        :);                                                             \
                                                                        \
    sad8_1_ ## suf(blk1,     blk2,     stride, h);                      \
    sad8_1_ ## suf(blk1 + 8, blk2 + 8, stride, h);                      \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad16_x2_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
                            uint8_t *blk1, int stride, int h)           \
{                                                                       \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        "movq %0, %%mm5        \n\t"                                    \
        :: "m" (round_tab[1]));                                         \
                                                                        \
    sad8_x2a_ ## suf(blk1,     blk2,     stride, h);                    \
    sad8_x2a_ ## suf(blk1 + 8, blk2 + 8, stride, h);                    \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad16_y2_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
                            uint8_t *blk1, int stride, int h)           \
{                                                                       \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        "movq %0, %%mm5        \n\t"                                    \
        :: "m" (round_tab[1]));                                         \
                                                                        \
    sad8_y2a_ ## suf(blk1,     blk2,     stride, h);                    \
    sad8_y2a_ ## suf(blk1 + 8, blk2 + 8, stride, h);                    \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad16_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2,          \
                             uint8_t *blk1, int stride, int h)          \
{                                                                       \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        ::);                                                            \
                                                                        \
    sad8_4_ ## suf(blk1,     blk2,     stride, h);                      \
    sad8_4_ ## suf(blk1 + 8, blk2 + 8, stride, h);                      \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \

PIX_SAD(mmx)
PIX_SAD(mmxext)

#endif /* HAVE_INLINE_ASM */

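/*
 * Wire the SAD functions into the DSPContext. pix_abs[0][] handles 16x16
 * blocks and pix_abs[1][] 8x8 blocks; the second index selects full-pel,
 * x half-pel, y half-pel and xy half-pel. The pavgb-based MMXEXT half-pel
 * variants are only installed when bit-exact output is not requested.
 * sad16_sse2 requires a 16-byte-aligned blk2 (psadbw memory operand),
 * which is presumably why it is avoided for Snow; it is also skipped on
 * 3DNow-flagged CPUs.
 */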
av_cold void ff_dsputil_init_pix_mmx(DSPContext *c, AVCodecContext *avctx)
{
#if HAVE_INLINE_ASM
    int cpu_flags = av_get_cpu_flags();

    if (INLINE_MMX(cpu_flags)) {
        c->pix_abs[0][0] = sad16_mmx;
        c->pix_abs[0][1] = sad16_x2_mmx;
        c->pix_abs[0][2] = sad16_y2_mmx;
        c->pix_abs[0][3] = sad16_xy2_mmx;
        c->pix_abs[1][0] = sad8_mmx;
        c->pix_abs[1][1] = sad8_x2_mmx;
        c->pix_abs[1][2] = sad8_y2_mmx;
        c->pix_abs[1][3] = sad8_xy2_mmx;

        c->sad[0] = sad16_mmx;
        c->sad[1] = sad8_mmx;
    }
    if (INLINE_MMXEXT(cpu_flags)) {
        c->pix_abs[0][0] = sad16_mmxext;
        c->pix_abs[1][0] = sad8_mmxext;

        c->sad[0] = sad16_mmxext;
        c->sad[1] = sad8_mmxext;

        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
            c->pix_abs[0][1] = sad16_x2_mmxext;
            c->pix_abs[0][2] = sad16_y2_mmxext;
            c->pix_abs[0][3] = sad16_xy2_mmxext;
            c->pix_abs[1][1] = sad8_x2_mmxext;
            c->pix_abs[1][2] = sad8_y2_mmxext;
            c->pix_abs[1][3] = sad8_xy2_mmxext;
        }
    }
    if (INLINE_SSE2(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_3DNOW) &&
        avctx->codec_id != AV_CODEC_ID_SNOW) {
        c->sad[0] = sad16_sse2;
    }
#endif /* HAVE_INLINE_ASM */
}