src/third_party/ffmpeg/libpostproc/postprocess.c

   1 /*
   2  * Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at)
   3  *
   4  * AltiVec optimizations (C) 2004 Romain Dolbeau <romain@dolbeau.org>
   5  *
   6  * This file is part of FFmpeg.
   7  *
   8  * FFmpeg is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License as published by
  10  * the Free Software Foundation; either version 2 of the License, or
  11  * (at your option) any later version.
  12  *
  13  * FFmpeg is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License
  19  * along with FFmpeg; if not, write to the Free Software
  20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21  */
  22
  23 /**
  24  * @file
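 * Video postprocessing filters: deblocking, deringing, deinterlacing,
 * temporal noise reduction and automatic level correction.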
  26  */
  27
  28 /*
  29                         C       MMX     MMX2    3DNow   AltiVec
  30 isVertDC                Ec      Ec                      Ec
  31 isVertMinMaxOk          Ec      Ec                      Ec
  32 doVertLowPass           E               e       e       Ec
  33 doVertDefFilter         Ec      Ec      e       e       Ec
  34 isHorizDC               Ec      Ec                      Ec
  35 isHorizMinMaxOk         a       E                       Ec
  36 doHorizLowPass          E               e       e       Ec
  37 doHorizDefFilter        Ec      Ec      e       e       Ec
  38 do_a_deblock            Ec      E       Ec      E
  39 deRing                  E               e       e*      Ecp
  40 Vertical RKAlgo1        E               a       a
  41 Horizontal RKAlgo1                      a       a
  42 Vertical X1#            a               E       E
  43 Horizontal X1#          a               E       E
  44 LinIpolDeinterlace      e               E       E*
  45 CubicIpolDeinterlace    a               e       e*
  46 LinBlendDeinterlace     e               E       E*
  47 MedianDeinterlace#      E       Ec      Ec
  48 TempDeNoiser#           E               e       e       Ec
  49
  50 * I do not have a 3DNow! CPU -> it is untested, but no one said it does not work so it seems to work
  51 # more or less selfinvented filters so the exactness is not too meaningful
  52 E = Exact implementation
  53 e = almost exact implementation (slightly different rounding,...)
  54 a = alternative / approximate impl
  55 c = checked against the other implementations (-vo md5)
  56 p = partially optimized, still some work to do
  57 */
  58
  59 /*
  60 TODO:
  61 reduce the time wasted on the mem transfer
  62 unroll stuff if instructions depend too much on the prior one
  63 move YScale thing to the end instead of fixing QP
  64 write a faster and higher quality deblocking filter :)
make the mainloop more flexible (variable number of blocks at once);
        the if/else stuff per block is slowing things down
  67 compare the quality & speed of all filters
  68 split this huge file
  69 optimize c versions
  70 try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
  71 ...
  72 */
  73
  74 //Changelog: use git log
  75
  76 #include "config.h"
  77 #include "libavutil/avutil.h"
  78 #include "libavutil/avassert.h"
  79 #include <inttypes.h>
  80 #include <stdio.h>
  81 #include <stdlib.h>
  82 #include <string.h>
  83 //#undef HAVE_MMXEXT_INLINE
  84 //#define HAVE_AMD3DNOW_INLINE
  85 //#undef HAVE_MMX_INLINE
  86 //#undef ARCH_X86
  87 //#define DEBUG_BRIGHTNESS
  88 #include "postprocess.h"
  89 #include "postprocess_internal.h"
  90 #include "libavutil/avstring.h"
  91
  92 unsigned postproc_version(void)
  93 {
  94     av_assert0(LIBPOSTPROC_VERSION_MICRO >= 100);
  95     return LIBPOSTPROC_VERSION_INT;
  96 }
  97
  98 const char *postproc_configuration(void)
  99 {
 100     return FFMPEG_CONFIGURATION;
 101 }
 102
 103 const char *postproc_license(void)
 104 {
 105 #define LICENSE_PREFIX "libpostproc license: "
 106     return LICENSE_PREFIX FFMPEG_LICENSE + sizeof(LICENSE_PREFIX) - 1;
 107 }
 108
 109 #if HAVE_ALTIVEC_H
 110 #include <altivec.h>
 111 #endif
 112
/* Size of the scratch buffer into which pp_get_mode_by_name_and_quality()
 * copies the mode string before tokenizing it. */
#define GET_MODE_BUFFER_SIZE 500
/* Capacity of the per-filter array that collects the options which are not
 * one of the generic autoq / chrom / nochrom / noluma switches. */
#define OPTIONS_ARRAY_SIZE 10
/* Number of rows/columns the block filters in this file iterate over. */
#define BLOCK_SIZE 8
/* NOTE(review): not referenced in this part of the file; presumably the
 * stride of a temporary block buffer used by the templates -- confirm. */
#define TEMP_STRIDE 8
//#define NUM_BLOCKS_AT_ONCE 16 //not used yet
 118
#if ARCH_X86 && HAVE_INLINE_ASM
/* 64-bit constants holding one value replicated into every lane: the w*
 * constants repeat a 16-bit word four times, the b* constants repeat a byte
 * eight times.
 * NOTE(review): nothing in this part of the file reads them; presumably they
 * are operands of the inline-asm templates included further down -- confirm. */
DECLARE_ASM_CONST(8, uint64_t, w05)= 0x0005000500050005LL;
DECLARE_ASM_CONST(8, uint64_t, w04)= 0x0004000400040004LL;
DECLARE_ASM_CONST(8, uint64_t, w20)= 0x0020002000200020LL;
DECLARE_ASM_CONST(8, uint64_t, b00)= 0x0000000000000000LL;
DECLARE_ASM_CONST(8, uint64_t, b01)= 0x0101010101010101LL;
DECLARE_ASM_CONST(8, uint64_t, b02)= 0x0202020202020202LL;
DECLARE_ASM_CONST(8, uint64_t, b08)= 0x0808080808080808LL;
DECLARE_ASM_CONST(8, uint64_t, b80)= 0x8080808080808080LL;
#endif

/* NOTE(review): not read in this part of the file; presumably the threshold
 * used by the deringing filter in the templates -- confirm. */
DECLARE_ASM_CONST(8, int, deringThreshold)= 20;
 131
 132
/*
 * Table of the filters that can be named in a mode string.
 *
 * pp_get_mode_by_name_and_quality() compares each filter token against both
 * shortName and longName (case-sensitive strcmp), clears the entry's mask in
 * the luma and chroma modes, and sets it again depending on the requested
 * quality versus minLumQuality / minChromQuality and on chromDefault.
 *
 * NOTE(review): the initializer order appears to be
 *   { shortName, longName, chromDefault, minLumQuality, minChromQuality, mask }
 * -- confirm against the declaration of struct PPFilter.
 *
 * The list is terminated by an entry whose shortName is NULL.
 */
static const struct PPFilter filters[]=
{
    {"hb", "hdeblock",              1, 1, 3, H_DEBLOCK},
    {"vb", "vdeblock",              1, 2, 4, V_DEBLOCK},
/*  {"hr", "rkhdeblock",            1, 1, 3, H_RK1_FILTER},
    {"vr", "rkvdeblock",            1, 2, 4, V_RK1_FILTER},*/
    {"h1", "x1hdeblock",            1, 1, 3, H_X1_FILTER},
    {"v1", "x1vdeblock",            1, 2, 4, V_X1_FILTER},
    {"ha", "ahdeblock",             1, 1, 3, H_A_DEBLOCK},
    {"va", "avdeblock",             1, 2, 4, V_A_DEBLOCK},
    {"dr", "dering",                1, 5, 6, DERING},
    {"al", "autolevels",            0, 1, 2, LEVEL_FIX},
    {"lb", "linblenddeint",         1, 1, 4, LINEAR_BLEND_DEINT_FILTER},
    {"li", "linipoldeint",          1, 1, 4, LINEAR_IPOL_DEINT_FILTER},
    {"ci", "cubicipoldeint",        1, 1, 4, CUBIC_IPOL_DEINT_FILTER},
    {"md", "mediandeint",           1, 1, 4, MEDIAN_DEINT_FILTER},
    {"fd", "ffmpegdeint",           1, 1, 4, FFMPEG_DEINT_FILTER},
    {"l5", "lowpass5",              1, 1, 4, LOWPASS5_DEINT_FILTER},
    {"tn", "tmpnoise",              1, 7, 8, TEMP_NOISE_FILTER},
    {"fq", "forcequant",            1, 0, 0, FORCE_QUANT},
    {"be", "bitexact",              1, 0, 0, BITEXACT},
    {NULL, NULL,0,0,0,0} //End Marker
};
 156
/*
 * Alias table used while parsing a mode string.
 *
 * Entries come in pairs: replaceTable[2*i] is an alias name and
 * replaceTable[2*i + 1] is the filter list that is spliced into the mode
 * string when a filter token equals that alias.  The table ends with a
 * single NULL in an alias slot.
 */
static const char * const replaceTable[]=
{
    "default",      "hb:a,vb:a,dr:a",
    "de",           "hb:a,vb:a,dr:a",
    "fast",         "h1:a,v1:a,dr:a",
    "fa",           "h1:a,v1:a,dr:a",
    "ac",           "ha:a:128:7,va:a,dr:a",
    NULL //End Marker
};
 166
 167
#if ARCH_X86 && HAVE_INLINE_ASM
/*
 * Thin wrappers that each emit a single x86 prefetch instruction for the
 * address p.  The four variants differ only in the instruction used
 * (prefetchnta, prefetcht0, prefetcht1, prefetcht2).  None of them is called
 * in this part of the file.
 */

/* Emit "prefetchnta" for the address p. */
static inline void prefetchnta(void *p)
{
    __asm__ volatile(   "prefetchnta (%0)\n\t"
        : : "r" (p)
    );
}

/* Emit "prefetcht0" for the address p. */
static inline void prefetcht0(void *p)
{
    __asm__ volatile(   "prefetcht0 (%0)\n\t"
        : : "r" (p)
    );
}

/* Emit "prefetcht1" for the address p. */
static inline void prefetcht1(void *p)
{
    __asm__ volatile(   "prefetcht1 (%0)\n\t"
        : : "r" (p)
    );
}

/* Emit "prefetcht2" for the address p. */
static inline void prefetcht2(void *p)
{
    __asm__ volatile(   "prefetcht2 (%0)\n\t"
        : : "r" (p)
    );
}
#endif
 197
 198 /* The horizontal functions exist only in C because the MMX
 199  * code is faster with vertical filters and transposing. */
 200
 201 /**
 202  * Check if the given 8x8 Block is mostly "flat"
 203  */
 204 static inline int isHorizDC_C(const uint8_t src[], int stride, const PPContext *c)
 205 {
 206     int numEq= 0;
 207     int y;
 208     const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
 209     const int dcThreshold= dcOffset*2 + 1;
 210
 211     for(y=0; y<BLOCK_SIZE; y++){
 212         if(((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold) numEq++;
 213         if(((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold) numEq++;
 214         if(((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold) numEq++;
 215         if(((unsigned)(src[3] - src[4] + dcOffset)) < dcThreshold) numEq++;
 216         if(((unsigned)(src[4] - src[5] + dcOffset)) < dcThreshold) numEq++;
 217         if(((unsigned)(src[5] - src[6] + dcOffset)) < dcThreshold) numEq++;
 218         if(((unsigned)(src[6] - src[7] + dcOffset)) < dcThreshold) numEq++;
 219         src+= stride;
 220     }
 221     return numEq > c->ppMode.flatnessThreshold;
 222 }
 223
 224 /**
 225  * Check if the middle 8x8 Block in the given 8x16 block is flat
 226  */
 227 static inline int isVertDC_C(const uint8_t src[], int stride, const PPContext *c)
 228 {
 229     int numEq= 0;
 230     int y;
 231     const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
 232     const int dcThreshold= dcOffset*2 + 1;
 233
 234     src+= stride*4; // src points to begin of the 8x8 Block
 235     for(y=0; y<BLOCK_SIZE-1; y++){
 236         if(((unsigned)(src[0] - src[0+stride] + dcOffset)) < dcThreshold) numEq++;
 237         if(((unsigned)(src[1] - src[1+stride] + dcOffset)) < dcThreshold) numEq++;
 238         if(((unsigned)(src[2] - src[2+stride] + dcOffset)) < dcThreshold) numEq++;
 239         if(((unsigned)(src[3] - src[3+stride] + dcOffset)) < dcThreshold) numEq++;
 240         if(((unsigned)(src[4] - src[4+stride] + dcOffset)) < dcThreshold) numEq++;
 241         if(((unsigned)(src[5] - src[5+stride] + dcOffset)) < dcThreshold) numEq++;
 242         if(((unsigned)(src[6] - src[6+stride] + dcOffset)) < dcThreshold) numEq++;
 243         if(((unsigned)(src[7] - src[7+stride] + dcOffset)) < dcThreshold) numEq++;
 244         src+= stride;
 245     }
 246     return numEq > c->ppMode.flatnessThreshold;
 247 }
 248
 249 static inline int isHorizMinMaxOk_C(const uint8_t src[], int stride, int QP)
 250 {
 251     int i;
 252     for(i=0; i<2; i++){
 253         if((unsigned)(src[0] - src[5] + 2*QP) > 4*QP) return 0;
 254         src += stride;
 255         if((unsigned)(src[2] - src[7] + 2*QP) > 4*QP) return 0;
 256         src += stride;
 257         if((unsigned)(src[4] - src[1] + 2*QP) > 4*QP) return 0;
 258         src += stride;
 259         if((unsigned)(src[6] - src[3] + 2*QP) > 4*QP) return 0;
 260         src += stride;
 261     }
 262     return 1;
 263 }
 264
 265 static inline int isVertMinMaxOk_C(const uint8_t src[], int stride, int QP)
 266 {
 267     int x;
 268     src+= stride*4;
 269     for(x=0; x<BLOCK_SIZE; x+=4){
 270         if((unsigned)(src[  x + 0*stride] - src[  x + 5*stride] + 2*QP) > 4*QP) return 0;
 271         if((unsigned)(src[1+x + 2*stride] - src[1+x + 7*stride] + 2*QP) > 4*QP) return 0;
 272         if((unsigned)(src[2+x + 4*stride] - src[2+x + 1*stride] + 2*QP) > 4*QP) return 0;
 273         if((unsigned)(src[3+x + 6*stride] - src[3+x + 3*stride] + 2*QP) > 4*QP) return 0;
 274     }
 275     return 1;
 276 }
 277
 278 static inline int horizClassify_C(const uint8_t src[], int stride, const PPContext *c)
 279 {
 280     if( isHorizDC_C(src, stride, c) ){
 281         if( isHorizMinMaxOk_C(src, stride, c->QP) )
 282             return 1;
 283         else
 284             return 0;
 285     }else{
 286         return 2;
 287     }
 288 }
 289
 290 static inline int vertClassify_C(const uint8_t src[], int stride, const PPContext *c)
 291 {
 292     if( isVertDC_C(src, stride, c) ){
 293         if( isVertMinMaxOk_C(src, stride, c->QP) )
 294             return 1;
 295         else
 296             return 0;
 297     }else{
 298         return 2;
 299     }
 300 }
 301
 302 static inline void doHorizDefFilter_C(uint8_t dst[], int stride, const PPContext *c)
 303 {
 304     int y;
 305     for(y=0; y<BLOCK_SIZE; y++){
 306         const int middleEnergy= 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]);
 307
 308         if(FFABS(middleEnergy) < 8*c->QP){
 309             const int q=(dst[3] - dst[4])/2;
 310             const int leftEnergy=  5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
 311             const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
 312
 313             int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
 314             d= FFMAX(d, 0);
 315
 316             d= (5*d + 32) >> 6;
 317             d*= FFSIGN(-middleEnergy);
 318
 319             if(q>0)
 320             {
 321                 d = FFMAX(d, 0);
 322                 d = FFMIN(d, q);
 323             }
 324             else
 325             {
 326                 d = FFMIN(d, 0);
 327                 d = FFMAX(d, q);
 328             }
 329
 330             dst[3]-= d;
 331             dst[4]+= d;
 332         }
 333         dst+= stride;
 334     }
 335 }
 336
 337 /**
 338  * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
 339  * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
 340  */
 341 static inline void doHorizLowPass_C(uint8_t dst[], int stride, const PPContext *c)
 342 {
 343     int y;
 344     for(y=0; y<BLOCK_SIZE; y++){
 345         const int first= FFABS(dst[-1] - dst[0]) < c->QP ? dst[-1] : dst[0];
 346         const int last= FFABS(dst[8] - dst[7]) < c->QP ? dst[8] : dst[7];
 347
 348         int sums[10];
 349         sums[0] = 4*first + dst[0] + dst[1] + dst[2] + 4;
 350         sums[1] = sums[0] - first  + dst[3];
 351         sums[2] = sums[1] - first  + dst[4];
 352         sums[3] = sums[2] - first  + dst[5];
 353         sums[4] = sums[3] - first  + dst[6];
 354         sums[5] = sums[4] - dst[0] + dst[7];
 355         sums[6] = sums[5] - dst[1] + last;
 356         sums[7] = sums[6] - dst[2] + last;
 357         sums[8] = sums[7] - dst[3] + last;
 358         sums[9] = sums[8] - dst[4] + last;
 359
 360         dst[0]= (sums[0] + sums[2] + 2*dst[0])>>4;
 361         dst[1]= (sums[1] + sums[3] + 2*dst[1])>>4;
 362         dst[2]= (sums[2] + sums[4] + 2*dst[2])>>4;
 363         dst[3]= (sums[3] + sums[5] + 2*dst[3])>>4;
 364         dst[4]= (sums[4] + sums[6] + 2*dst[4])>>4;
 365         dst[5]= (sums[5] + sums[7] + 2*dst[5])>>4;
 366         dst[6]= (sums[6] + sums[8] + 2*dst[6])>>4;
 367         dst[7]= (sums[7] + sums[9] + 2*dst[7])>>4;
 368
 369         dst+= stride;
 370     }
 371 }
 372
 373 /**
 374  * Experimental Filter 1 (Horizontal)
 375  * will not damage linear gradients
 376  * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
 377  * can only smooth blocks at the expected locations (it cannot smooth them if they did move)
 378  * MMX2 version does correct clipping C version does not
 379  * not identical with the vertical one
 380  */
 381 static inline void horizX1Filter(uint8_t *src, int stride, int QP)
 382 {
 383     int y;
 384     static uint64_t lut[256];
 385     if(!lut[255])
 386     {
 387         int i;
 388         for(i=0; i<256; i++)
 389         {
 390             int v= i < 128 ? 2*i : 2*(i-256);
 391 /*
 392 //Simulate 112242211 9-Tap filter
 393             uint64_t a= (v/16)  & 0xFF;
 394             uint64_t b= (v/8)   & 0xFF;
 395             uint64_t c= (v/4)   & 0xFF;
 396             uint64_t d= (3*v/8) & 0xFF;
 397 */
 398 //Simulate piecewise linear interpolation
 399             uint64_t a= (v/16)   & 0xFF;
 400             uint64_t b= (v*3/16) & 0xFF;
 401             uint64_t c= (v*5/16) & 0xFF;
 402             uint64_t d= (7*v/16) & 0xFF;
 403             uint64_t A= (0x100 - a)&0xFF;
 404             uint64_t B= (0x100 - b)&0xFF;
 405             uint64_t C= (0x100 - c)&0xFF;
 406             uint64_t D= (0x100 - c)&0xFF;
 407
 408             lut[i]   = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
 409                        (D<<24) | (C<<16) | (B<<8)  | (A);
 410             //lut[i] = (v<<32) | (v<<24);
 411         }
 412     }
 413
 414     for(y=0; y<BLOCK_SIZE; y++){
 415         int a= src[1] - src[2];
 416         int b= src[3] - src[4];
 417         int c= src[5] - src[6];
 418
 419         int d= FFMAX(FFABS(b) - (FFABS(a) + FFABS(c))/2, 0);
 420
 421         if(d < QP){
 422             int v = d * FFSIGN(-b);
 423
 424             src[1] +=v/8;
 425             src[2] +=v/4;
 426             src[3] +=3*v/8;
 427             src[4] -=3*v/8;
 428             src[5] -=v/4;
 429             src[6] -=v/8;
 430         }
 431         src+=stride;
 432     }
 433 }
 434
 435 /**
 436  * accurate deblock filter
 437  */
 438 static av_always_inline void do_a_deblock_C(uint8_t *src, int step,
 439                                             int stride, const PPContext *c)
 440 {
 441     int y;
 442     const int QP= c->QP;
 443     const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
 444     const int dcThreshold= dcOffset*2 + 1;
 445 //START_TIMER
 446     src+= step*4; // src points to begin of the 8x8 Block
 447     for(y=0; y<8; y++){
 448         int numEq= 0;
 449
 450         if(((unsigned)(src[-1*step] - src[0*step] + dcOffset)) < dcThreshold) numEq++;
 451         if(((unsigned)(src[ 0*step] - src[1*step] + dcOffset)) < dcThreshold) numEq++;
 452         if(((unsigned)(src[ 1*step] - src[2*step] + dcOffset)) < dcThreshold) numEq++;
 453         if(((unsigned)(src[ 2*step] - src[3*step] + dcOffset)) < dcThreshold) numEq++;
 454         if(((unsigned)(src[ 3*step] - src[4*step] + dcOffset)) < dcThreshold) numEq++;
 455         if(((unsigned)(src[ 4*step] - src[5*step] + dcOffset)) < dcThreshold) numEq++;
 456         if(((unsigned)(src[ 5*step] - src[6*step] + dcOffset)) < dcThreshold) numEq++;
 457         if(((unsigned)(src[ 6*step] - src[7*step] + dcOffset)) < dcThreshold) numEq++;
 458         if(((unsigned)(src[ 7*step] - src[8*step] + dcOffset)) < dcThreshold) numEq++;
 459         if(numEq > c->ppMode.flatnessThreshold){
 460             int min, max, x;
 461
 462             if(src[0] > src[step]){
 463                 max= src[0];
 464                 min= src[step];
 465             }else{
 466                 max= src[step];
 467                 min= src[0];
 468             }
 469             for(x=2; x<8; x+=2){
 470                 if(src[x*step] > src[(x+1)*step]){
 471                         if(src[x    *step] > max) max= src[ x   *step];
 472                         if(src[(x+1)*step] < min) min= src[(x+1)*step];
 473                 }else{
 474                         if(src[(x+1)*step] > max) max= src[(x+1)*step];
 475                         if(src[ x   *step] < min) min= src[ x   *step];
 476                 }
 477             }
 478             if(max-min < 2*QP){
 479                 const int first= FFABS(src[-1*step] - src[0]) < QP ? src[-1*step] : src[0];
 480                 const int last= FFABS(src[8*step] - src[7*step]) < QP ? src[8*step] : src[7*step];
 481
 482                 int sums[10];
 483                 sums[0] = 4*first + src[0*step] + src[1*step] + src[2*step] + 4;
 484                 sums[1] = sums[0] - first       + src[3*step];
 485                 sums[2] = sums[1] - first       + src[4*step];
 486                 sums[3] = sums[2] - first       + src[5*step];
 487                 sums[4] = sums[3] - first       + src[6*step];
 488                 sums[5] = sums[4] - src[0*step] + src[7*step];
 489                 sums[6] = sums[5] - src[1*step] + last;
 490                 sums[7] = sums[6] - src[2*step] + last;
 491                 sums[8] = sums[7] - src[3*step] + last;
 492                 sums[9] = sums[8] - src[4*step] + last;
 493
 494                 src[0*step]= (sums[0] + sums[2] + 2*src[0*step])>>4;
 495                 src[1*step]= (sums[1] + sums[3] + 2*src[1*step])>>4;
 496                 src[2*step]= (sums[2] + sums[4] + 2*src[2*step])>>4;
 497                 src[3*step]= (sums[3] + sums[5] + 2*src[3*step])>>4;
 498                 src[4*step]= (sums[4] + sums[6] + 2*src[4*step])>>4;
 499                 src[5*step]= (sums[5] + sums[7] + 2*src[5*step])>>4;
 500                 src[6*step]= (sums[6] + sums[8] + 2*src[6*step])>>4;
 501                 src[7*step]= (sums[7] + sums[9] + 2*src[7*step])>>4;
 502             }
 503         }else{
 504             const int middleEnergy= 5*(src[4*step] - src[3*step]) + 2*(src[2*step] - src[5*step]);
 505
 506             if(FFABS(middleEnergy) < 8*QP){
 507                 const int q=(src[3*step] - src[4*step])/2;
 508                 const int leftEnergy=  5*(src[2*step] - src[1*step]) + 2*(src[0*step] - src[3*step]);
 509                 const int rightEnergy= 5*(src[6*step] - src[5*step]) + 2*(src[4*step] - src[7*step]);
 510
 511                 int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
 512                 d= FFMAX(d, 0);
 513
 514                 d= (5*d + 32) >> 6;
 515                 d*= FFSIGN(-middleEnergy);
 516
 517                 if(q>0){
 518                     d = FFMAX(d, 0);
 519                     d = FFMIN(d, q);
 520                 }else{
 521                     d = FFMIN(d, 0);
 522                     d = FFMAX(d, q);
 523                 }
 524
 525                 src[3*step]-= d;
 526                 src[4*step]+= d;
 527             }
 528         }
 529
 530         src += stride;
 531     }
 532 /*if(step==16){
 533     STOP_TIMER("step16")
 534 }else{
 535     STOP_TIMER("stepX")
 536 }*/
 537 }
 538
/*
 * Instantiate the filter templates once per instruction-set variant.
 *
 * Each variant is selected by defining one TEMPLATE_PP_* macro to 1 right
 * before including postprocess_template.c.
 * NOTE(review): the template itself is not visible here; presumably every
 * inclusion emits a postProcess_<variant>() entry point (postProcess() below
 * refers to postProcess_C, _MMX, _MMX2, _3DNow, _SSE2 and _altivec) and
 * undefines its TEMPLATE_PP_* macro again -- confirm in the template.
 */
//Note: we have C, MMX, MMX2 and 3DNOW versions; there is no 3DNOW+MMX2 one
//Plain C versions
//we always compile C for testing which needs bitexactness
#define TEMPLATE_PP_C 1
#include "postprocess_template.c"

#if HAVE_ALTIVEC
#   define TEMPLATE_PP_ALTIVEC 1
#   include "postprocess_altivec_template.c"
#   include "postprocess_template.c"
#endif

#if ARCH_X86 && HAVE_INLINE_ASM
    /* With runtime CPU detection every x86 variant is compiled in ... */
#    if CONFIG_RUNTIME_CPUDETECT
#        define TEMPLATE_PP_MMX 1
#        include "postprocess_template.c"
#        define TEMPLATE_PP_MMXEXT 1
#        include "postprocess_template.c"
#        define TEMPLATE_PP_3DNOW 1
#        include "postprocess_template.c"
#        define TEMPLATE_PP_SSE2 1
#        include "postprocess_template.c"
#    else
    /* ... otherwise only the first variant enabled at build time, tested in
     * the order SSE2, MMXEXT, 3DNow, MMX. */
#        if HAVE_SSE2_INLINE
#            define TEMPLATE_PP_SSE2 1
#            include "postprocess_template.c"
#        elif HAVE_MMXEXT_INLINE
#            define TEMPLATE_PP_MMXEXT 1
#            include "postprocess_template.c"
#        elif HAVE_AMD3DNOW_INLINE
#            define TEMPLATE_PP_3DNOW 1
#            include "postprocess_template.c"
#        elif HAVE_MMX_INLINE
#            define TEMPLATE_PP_MMX 1
#            include "postprocess_template.c"
#        endif
#    endif
#endif
 577
/* Signature shared by the per-variant postProcess_* implementations;
 * postProcess() stores the selected one in a variable of this type. */
typedef void (*pp_fn)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
                      const QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c2);
 580
/**
 * Run the postprocessing implementation that matches the build and CPU.
 *
 * Copies the given mode into the context, picks one of the postProcess_*
 * variants and forwards all plane arguments to it unchanged.  The plain C
 * variant is the default and is always used when the BITEXACT flag is set
 * in the mode's lumMode.
 *
 * @param vm mode handle; cast to PPMode and copied into the context
 * @param vc context handle; cast to PPContext
 * The remaining parameters are passed through to the selected variant.
 */
static inline void postProcess(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
        const QP_STORE_T QPs[], int QPStride, int isColor, pp_mode *vm, pp_context *vc)
{
    pp_fn pp = postProcess_C;
    PPContext *c= (PPContext *)vc;
    PPMode *ppMode= (PPMode *)vm;
    c->ppMode= *ppMode; //FIXME

    // an optimized variant is only considered when bit-exact output is not requested
    if (!(ppMode->lumMode & BITEXACT)) {
#if CONFIG_RUNTIME_CPUDETECT
        // runtime detection: choose by the capability flags stored in the context
#if ARCH_X86 && HAVE_INLINE_ASM
        // ordered per speed fastest first
        if      (c->cpuCaps & AV_CPU_FLAG_SSE2)     pp = postProcess_SSE2;
        else if (c->cpuCaps & AV_CPU_FLAG_MMXEXT)   pp = postProcess_MMX2;
        else if (c->cpuCaps & AV_CPU_FLAG_3DNOW)    pp = postProcess_3DNow;
        else if (c->cpuCaps & AV_CPU_FLAG_MMX)      pp = postProcess_MMX;
#elif HAVE_ALTIVEC
        if      (c->cpuCaps & AV_CPU_FLAG_ALTIVEC)  pp = postProcess_altivec;
#endif
#else /* CONFIG_RUNTIME_CPUDETECT */
        // no runtime detection: the choice is fixed by the build configuration
#if     HAVE_SSE2_INLINE
        pp = postProcess_SSE2;
#elif   HAVE_MMXEXT_INLINE
        pp = postProcess_MMX2;
#elif HAVE_AMD3DNOW_INLINE
        pp = postProcess_3DNow;
#elif HAVE_MMX_INLINE
        pp = postProcess_MMX;
#elif HAVE_ALTIVEC
        pp = postProcess_altivec;
#endif
#endif /* !CONFIG_RUNTIME_CPUDETECT */
    }

    pp(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
}
 617
 618 /* -pp Command line Help
 619 */
 620 const char pp_help[] =
 621 "Available postprocessing filters:\n"
 622 "Filters                        Options\n"
 623 "short  long name       short   long option     Description\n"
 624 "*      *               a       autoq           CPU power dependent enabler\n"
 625 "                       c       chrom           chrominance filtering enabled\n"
 626 "                       y       nochrom         chrominance filtering disabled\n"
 627 "                       n       noluma          luma filtering disabled\n"
 628 "hb     hdeblock        (2 threshold)           horizontal deblocking filter\n"
 629 "       1. difference factor: default=32, higher -> more deblocking\n"
 630 "       2. flatness threshold: default=39, lower -> more deblocking\n"
 631 "                       the h & v deblocking filters share these\n"
 632 "                       so you can't set different thresholds for h / v\n"
 633 "vb     vdeblock        (2 threshold)           vertical deblocking filter\n"
 634 "ha     hadeblock       (2 threshold)           horizontal deblocking filter\n"
 635 "va     vadeblock       (2 threshold)           vertical deblocking filter\n"
 636 "h1     x1hdeblock                              experimental h deblock filter 1\n"
 637 "v1     x1vdeblock                              experimental v deblock filter 1\n"
 638 "dr     dering                                  deringing filter\n"
 639 "al     autolevels                              automatic brightness / contrast\n"
 640 "                       f        fullyrange     stretch luminance to (0..255)\n"
 641 "lb     linblenddeint                           linear blend deinterlacer\n"
 642 "li     linipoldeint                            linear interpolating deinterlace\n"
 643 "ci     cubicipoldeint                          cubic interpolating deinterlacer\n"
 644 "md     mediandeint                             median deinterlacer\n"
 645 "fd     ffmpegdeint                             ffmpeg deinterlacer\n"
 646 "l5     lowpass5                                FIR lowpass deinterlacer\n"
 647 "de     default                                 hb:a,vb:a,dr:a\n"
 648 "fa     fast                                    h1:a,v1:a,dr:a\n"
 649 "ac                                             ha:a:128:7,va:a,dr:a\n"
 650 "tn     tmpnoise        (3 threshold)           temporal noise reducer\n"
 651 "                     1. <= 2. <= 3.            larger -> stronger filtering\n"
 652 "fq     forceQuant      <quantizer>             force quantizer\n"
 653 "Usage:\n"
 654 "<filterName>[:<option>[:<option>...]][[,|/][-]<filterName>[:<option>...]]...\n"
 655 "long form example:\n"
 656 "vdeblock:autoq/hdeblock:autoq/linblenddeint    default,-vdeblock\n"
 657 "short form example:\n"
 658 "vb:a/hb:a/lb                                   de,-vb\n"
 659 "more examples:\n"
 660 "tn:64:128:256\n"
 661 "\n"
 662 ;
 663
 664 pp_mode *pp_get_mode_by_name_and_quality(const char *name, int quality)
 665 {
 666     char temp[GET_MODE_BUFFER_SIZE];
 667     char *p= temp;
 668     static const char filterDelimiters[] = ",/";
 669     static const char optionDelimiters[] = ":|";
 670     struct PPMode *ppMode;
 671     char *filterToken;
 672
 673     if (!name)  {
 674         av_log(NULL, AV_LOG_ERROR, "pp: Missing argument\n");
 675         return NULL;
 676     }
 677
 678     if (!strcmp(name, "help")) {
 679         const char *p;
 680         for (p = pp_help; strchr(p, '\n'); p = strchr(p, '\n') + 1) {
 681             av_strlcpy(temp, p, FFMIN(sizeof(temp), strchr(p, '\n') - p + 2));
 682             av_log(NULL, AV_LOG_INFO, "%s", temp);
 683         }
 684         return NULL;
 685     }
 686
 687     ppMode= av_malloc(sizeof(PPMode));
 688
 689     ppMode->lumMode= 0;
 690     ppMode->chromMode= 0;
 691     ppMode->maxTmpNoise[0]= 700;
 692     ppMode->maxTmpNoise[1]= 1500;
 693     ppMode->maxTmpNoise[2]= 3000;
 694     ppMode->maxAllowedY= 234;
 695     ppMode->minAllowedY= 16;
 696     ppMode->baseDcDiff= 256/8;
 697     ppMode->flatnessThreshold= 56-16-1;
 698     ppMode->maxClippedThreshold= 0.01;
 699     ppMode->error=0;
 700
 701     memset(temp, 0, GET_MODE_BUFFER_SIZE);
 702     av_strlcpy(temp, name, GET_MODE_BUFFER_SIZE - 1);
 703
 704     av_log(NULL, AV_LOG_DEBUG, "pp: %s\n", name);
 705
 706     for(;;){
 707         char *filterName;
 708         int q= 1000000; //PP_QUALITY_MAX;
 709         int chrom=-1;
 710         int luma=-1;
 711         char *option;
 712         char *options[OPTIONS_ARRAY_SIZE];
 713         int i;
 714         int filterNameOk=0;
 715         int numOfUnknownOptions=0;
 716         int enable=1; //does the user want us to enabled or disabled the filter
 717
 718         filterToken= strtok(p, filterDelimiters);
 719         if(!filterToken) break;
 720         p+= strlen(filterToken) + 1; // p points to next filterToken
 721         filterName= strtok(filterToken, optionDelimiters);
 722         if (!filterName) {
 723             ppMode->error++;
 724             break;
 725         }
 726         av_log(NULL, AV_LOG_DEBUG, "pp: %s::%s\n", filterToken, filterName);
 727
 728         if(*filterName == '-'){
 729             enable=0;
 730             filterName++;
 731         }
 732
 733         for(;;){ //for all options
 734             option= strtok(NULL, optionDelimiters);
 735             if(!option) break;
 736
 737             av_log(NULL, AV_LOG_DEBUG, "pp: option: %s\n", option);
 738             if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
 739             else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
 740             else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
 741             else if(!strcmp("noluma", option) || !strcmp("n", option)) luma=0;
 742             else{
 743                 options[numOfUnknownOptions] = option;
 744                 numOfUnknownOptions++;
 745             }
 746             if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
 747         }
 748         options[numOfUnknownOptions] = NULL;
 749
 750         /* replace stuff from the replace Table */
 751         for(i=0; replaceTable[2*i]; i++){
 752             if(!strcmp(replaceTable[2*i], filterName)){
 753                 int newlen= strlen(replaceTable[2*i + 1]);
 754                 int plen;
 755                 int spaceLeft;
 756
 757                 p--, *p=',';
 758
 759                 plen= strlen(p);
 760                 spaceLeft= p - temp + plen;
 761                 if(spaceLeft + newlen  >= GET_MODE_BUFFER_SIZE - 1){
 762                     ppMode->error++;
 763                     break;
 764                 }
 765                 memmove(p + newlen, p, plen+1);
 766                 memcpy(p, replaceTable[2*i + 1], newlen);
 767                 filterNameOk=1;
 768             }
 769         }
 770
 771         for(i=0; filters[i].shortName; i++){
 772             if(   !strcmp(filters[i].longName, filterName)
 773                || !strcmp(filters[i].shortName, filterName)){
 774                 ppMode->lumMode &= ~filters[i].mask;
 775                 ppMode->chromMode &= ~filters[i].mask;
 776
 777                 filterNameOk=1;
 778                 if(!enable) break; // user wants to disable it
 779
 780                 if(q >= filters[i].minLumQuality && luma)
 781                     ppMode->lumMode|= filters[i].mask;
 782                 if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
 783                     if(q >= filters[i].minChromQuality)
 784                             ppMode->chromMode|= filters[i].mask;
 785
 786                 if(filters[i].mask == LEVEL_FIX){
 787                     int o;
 788                     ppMode->minAllowedY= 16;
 789                     ppMode->maxAllowedY= 234;
 790                     for(o=0; options[o]; o++){
 791                         if(  !strcmp(options[o],"fullyrange")
 792                            ||!strcmp(options[o],"f")){
 793                             ppMode->minAllowedY= 0;
 794                             ppMode->maxAllowedY= 255;
 795                             numOfUnknownOptions--;
 796                         }
 797                     }
 798                 }
 799                 else if(filters[i].mask == TEMP_NOISE_FILTER)
 800                 {
 801                     int o;
 802                     int numOfNoises=0;
 803
 804                     for(o=0; options[o]; o++){
 805                         char *tail;
 806                         ppMode->maxTmpNoise[numOfNoises]=
 807                             strtol(options[o], &tail, 0);
 808                         if(tail!=options[o]){
 809                             numOfNoises++;
 810                             numOfUnknownOptions--;
 811                             if(numOfNoises >= 3) break;
 812                         }
 813                     }
 814                 }
 815                 else if(filters[i].mask == V_DEBLOCK   || filters[i].mask == H_DEBLOCK
 816                      || filters[i].mask == V_A_DEBLOCK || filters[i].mask == H_A_DEBLOCK){
 817                     int o;
 818
 819                     for(o=0; options[o] && o<2; o++){
 820                         char *tail;
 821                         int val= strtol(options[o], &tail, 0);
 822                         if(tail==options[o]) break;
 823
 824                         numOfUnknownOptions--;
 825                         if(o==0) ppMode->baseDcDiff= val;
 826                         else ppMode->flatnessThreshold= val;
 827                     }
 828                 }
 829                 else if(filters[i].mask == FORCE_QUANT){
 830                     int o;
 831                     ppMode->forcedQuant= 15;
 832
 833                     for(o=0; options[o] && o<1; o++){
 834                         char *tail;
 835                         int val= strtol(options[o], &tail, 0);
 836                         if(tail==options[o]) break;
 837
 838                         numOfUnknownOptions--;
 839                         ppMode->forcedQuant= val;
 840                     }
 841                 }
 842             }
 843         }
 844         if(!filterNameOk) ppMode->error++;
 845         ppMode->error += numOfUnknownOptions;
 846     }
 847
 848     av_log(NULL, AV_LOG_DEBUG, "pp: lumMode=%X, chromMode=%X\n", ppMode->lumMode, ppMode->chromMode);
 849     if(ppMode->error){
 850         av_log(NULL, AV_LOG_ERROR, "%d errors in postprocess string \"%s\"\n", ppMode->error, name);
 851         av_free(ppMode);
 852         return NULL;
 853     }
 854     return ppMode;
 855 }
 856
 857 void pp_free_mode(pp_mode *mode){
 858     av_free(mode);
 859 }
 860
 861 static void reallocAlign(void **p, int alignment, int size){
 862     av_free(*p);
 863     *p= av_mallocz(size);
 864 }
 865
 866 static void reallocBuffers(PPContext *c, int width, int height, int stride, int qpStride){
 867     int mbWidth = (width+15)>>4;
 868     int mbHeight= (height+15)>>4;
 869     int i;
 870
 871     c->stride= stride;
 872     c->qpStride= qpStride;
 873
 874     reallocAlign((void **)&c->tempDst, 8, stride*24+32);
 875     reallocAlign((void **)&c->tempSrc, 8, stride*24);
 876     reallocAlign((void **)&c->tempBlocks, 8, 2*16*8);
 877     reallocAlign((void **)&c->yHistogram, 8, 256*sizeof(uint64_t));
 878     for(i=0; i<256; i++)
 879             c->yHistogram[i]= width*height/64*15/256;
 880
 881     for(i=0; i<3; i++){
 882         //Note: The +17*1024 is just there so I do not have to worry about r/w over the end.
 883         reallocAlign((void **)&c->tempBlurred[i], 8, stride*mbHeight*16 + 17*1024);
 884         reallocAlign((void **)&c->tempBlurredPast[i], 8, 256*((height+7)&(~7))/2 + 17*1024);//FIXME size
 885     }
 886
 887     reallocAlign((void **)&c->deintTemp, 8, 2*width+32);
 888     reallocAlign((void **)&c->nonBQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T));
 889     reallocAlign((void **)&c->stdQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T));
 890     reallocAlign((void **)&c->forcedQPTable, 8, mbWidth*sizeof(QP_STORE_T));
 891 }
 892
 893 static const char * context_to_name(void * ptr) {
 894     return "postproc";
 895 }
 896
 897 static const AVClass av_codec_context_class = { "Postproc", context_to_name, NULL };
 898
 899 pp_context *pp_get_context(int width, int height, int cpuCaps){
 900     PPContext *c= av_malloc(sizeof(PPContext));
 901     int stride= FFALIGN(width, 16);  //assumed / will realloc if needed
 902     int qpStride= (width+15)/16 + 2; //assumed / will realloc if needed
 903
 904     memset(c, 0, sizeof(PPContext));
 905     c->av_class = &av_codec_context_class;
 906     if(cpuCaps&PP_FORMAT){
 907         c->hChromaSubSample= cpuCaps&0x3;
 908         c->vChromaSubSample= (cpuCaps>>4)&0x3;
 909     }else{
 910         c->hChromaSubSample= 1;
 911         c->vChromaSubSample= 1;
 912     }
 913     if (cpuCaps & PP_CPU_CAPS_AUTO) {
 914         c->cpuCaps = av_get_cpu_flags();
 915     } else {
 916         c->cpuCaps = 0;
 917         if (cpuCaps & PP_CPU_CAPS_MMX)      c->cpuCaps |= AV_CPU_FLAG_MMX;
 918         if (cpuCaps & PP_CPU_CAPS_MMX2)     c->cpuCaps |= AV_CPU_FLAG_MMXEXT;
 919         if (cpuCaps & PP_CPU_CAPS_3DNOW)    c->cpuCaps |= AV_CPU_FLAG_3DNOW;
 920         if (cpuCaps & PP_CPU_CAPS_ALTIVEC)  c->cpuCaps |= AV_CPU_FLAG_ALTIVEC;
 921     }
 922
 923     reallocBuffers(c, width, height, stride, qpStride);
 924
 925     c->frameNum=-1;
 926
 927     return c;
 928 }
 929
 930 void pp_free_context(void *vc){
 931     PPContext *c = (PPContext*)vc;
 932     int i;
 933
 934     for(i=0; i<3; i++) av_free(c->tempBlurred[i]);
 935     for(i=0; i<3; i++) av_free(c->tempBlurredPast[i]);
 936
 937     av_free(c->tempBlocks);
 938     av_free(c->yHistogram);
 939     av_free(c->tempDst);
 940     av_free(c->tempSrc);
 941     av_free(c->deintTemp);
 942     av_free(c->stdQPTable);
 943     av_free(c->nonBQPTable);
 944     av_free(c->forcedQPTable);
 945
 946     memset(c, 0, sizeof(PPContext));
 947
 948     av_free(c);
 949 }
 950
 951 void  pp_postprocess(const uint8_t * src[3], const int srcStride[3],
 952                      uint8_t * dst[3], const int dstStride[3],
 953                      int width, int height,
 954                      const QP_STORE_T *QP_store,  int QPStride,
 955                      pp_mode *vm,  void *vc, int pict_type)
 956 {
 957     int mbWidth = (width+15)>>4;
 958     int mbHeight= (height+15)>>4;
 959     PPMode *mode = (PPMode*)vm;
 960     PPContext *c = (PPContext*)vc;
 961     int minStride= FFMAX(FFABS(srcStride[0]), FFABS(dstStride[0]));
 962     int absQPStride = FFABS(QPStride);
 963
 964     // c->stride and c->QPStride are always positive
 965     if(c->stride < minStride || c->qpStride < absQPStride)
 966         reallocBuffers(c, width, height,
 967                        FFMAX(minStride, c->stride),
 968                        FFMAX(c->qpStride, absQPStride));
 969
 970     if(!QP_store || (mode->lumMode & FORCE_QUANT)){
 971         int i;
 972         QP_store= c->forcedQPTable;
 973         absQPStride = QPStride = 0;
 974         if(mode->lumMode & FORCE_QUANT)
 975             for(i=0; i<mbWidth; i++) c->forcedQPTable[i]= mode->forcedQuant;
 976         else
 977             for(i=0; i<mbWidth; i++) c->forcedQPTable[i]= 1;
 978     }
 979
 980     if(pict_type & PP_PICT_TYPE_QP2){
 981         int i;
 982         const int count= mbHeight * absQPStride;
 983         for(i=0; i<(count>>2); i++){
 984             ((uint32_t*)c->stdQPTable)[i] = (((const uint32_t*)QP_store)[i]>>1) & 0x7F7F7F7F;
 985         }
 986         for(i<<=2; i<count; i++){
 987             c->stdQPTable[i] = QP_store[i]>>1;
 988         }
 989         QP_store= c->stdQPTable;
 990         QPStride= absQPStride;
 991     }
 992
 993     if(0){
 994         int x,y;
 995         for(y=0; y<mbHeight; y++){
 996             for(x=0; x<mbWidth; x++){
 997                 av_log(c, AV_LOG_INFO, "%2d ", QP_store[x + y*QPStride]);
 998             }
 999             av_log(c, AV_LOG_INFO, "\n");
1000         }
1001         av_log(c, AV_LOG_INFO, "\n");
1002     }
1003
1004     if((pict_type&7)!=3){
1005         if (QPStride >= 0){
1006             int i;
1007             const int count= mbHeight * QPStride;
1008             for(i=0; i<(count>>2); i++){
1009                 ((uint32_t*)c->nonBQPTable)[i] = ((const uint32_t*)QP_store)[i] & 0x3F3F3F3F;
1010             }
1011             for(i<<=2; i<count; i++){
1012                 c->nonBQPTable[i] = QP_store[i] & 0x3F;
1013             }
1014         } else {
1015             int i,j;
1016             for(i=0; i<mbHeight; i++) {
1017                 for(j=0; j<absQPStride; j++) {
1018                     c->nonBQPTable[i*absQPStride+j] = QP_store[i*QPStride+j] & 0x3F;
1019                 }
1020             }
1021         }
1022     }
1023
1024     av_log(c, AV_LOG_DEBUG, "using npp filters 0x%X/0x%X\n",
1025            mode->lumMode, mode->chromMode);
1026
1027     postProcess(src[0], srcStride[0], dst[0], dstStride[0],
1028                 width, height, QP_store, QPStride, 0, mode, c);
1029
1030     width  = (width )>>c->hChromaSubSample;
1031     height = (height)>>c->vChromaSubSample;
1032
1033     if(mode->chromMode){
1034         postProcess(src[1], srcStride[1], dst[1], dstStride[1],
1035                     width, height, QP_store, QPStride, 1, mode, c);
1036         postProcess(src[2], srcStride[2], dst[2], dstStride[2],
1037                     width, height, QP_store, QPStride, 2, mode, c);
1038     }
1039     else if(srcStride[1] == dstStride[1] && srcStride[2] == dstStride[2]){
1040         linecpy(dst[1], src[1], height, srcStride[1]);
1041         linecpy(dst[2], src[2], height, srcStride[2]);
1042     }else{
1043         int y;
1044         for(y=0; y<height; y++){
1045             memcpy(&(dst[1][y*dstStride[1]]), &(src[1][y*srcStride[1]]), width);
1046             memcpy(&(dst[2][y*dstStride[2]]), &(src[2][y*srcStride[2]]), width);
1047         }
1048     }
1049 }