rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
author Michael Niedermayer <michaelni@gmx.at>
Mon, 15 Oct 2001 03:01:08 +0000 (03:01 +0000)
committer Michael Niedermayer <michaelni@gmx.at>
Mon, 15 Oct 2001 03:01:08 +0000 (03:01 +0000)
added deinterlace filters (linear interpolate, linear blend, median)
minor cleanups (removed some commented-out code)

Originally committed as revision 2204 to svn://svn.mplayerhq.hu/mplayer/trunk/postproc

postproc/postprocess.c
postproc/postprocess.h
postproc/postprocess_template.c

index 04e73cc..7d6ac4a 100644 (file)
 */
 
 /*
-                       C       MMX     MMX2    3DNow*
+                       C       MMX     MMX2    3DNow
 isVertDC               Ec      Ec
 isVertMinMaxOk         Ec      Ec
-doVertLowPass          E               e       e*
+doVertLowPass          E               e       e
 doVertDefFilter                Ec      Ec      Ec
 isHorizDC              Ec      Ec
 isHorizMinMaxOk                a
-doHorizLowPass         E               a       a*
+doHorizLowPass         E               a       a
 doHorizDefFilter       E       ac      ac
 deRing
-Vertical RKAlgo1       E               a       a*
-Vertical X1            a               E       E*
-Horizontal X1          a               E       E*
+Vertical RKAlgo1       E               a       a
+Vertical X1            a               E       E
+Horizontal X1          a               E       E
+LinIpolDeinterlace     a               E       E*
+LinBlendDeinterlace    a               E       E*
+MedianDeinterlace      a               E
 
 
 * i dont have a 3dnow CPU -> its untested
@@ -55,6 +58,7 @@ make the mainloop more flexible (variable number of blocks at once
 compare the quality & speed of all filters
 implement a few simple deinterlacing filters
 split this huge file
+fix warnings (unused vars, ...)
 ...
 
 Notes:
@@ -63,6 +67,9 @@ Notes:
 
 /*
 Changelog: use the CVS log
+rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
+added deinterlace filters (linear interpolate, linear blend, median)
+minor cleanups (removed some commented-out code)
 0.1.3
        bugfixes: last 3 lines not brightness/contrast corrected
                brightness statistics messed up with initial black pic
@@ -194,13 +201,11 @@ static inline void prefetcht2(void *p)
  * Check if the middle 8x8 Block in the given 8x10 block is flat
  */
 static inline int isVertDC(uint8_t src[], int stride){
-//     return true;
        int numEq= 0;
        int y;
        src+= stride; // src points to begin of the 8x8 Block
 #ifdef HAVE_MMX
        asm volatile(
-//             "int $3 \n\t"
                "pushl %1\n\t"
                "movq b7E, %%mm7                                        \n\t" // mm7 = 0x7F
                "movq b7C, %%mm6                                        \n\t" // mm6 = 0x7D
@@ -1577,9 +1582,9 @@ static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP
 }
 
 /**
- * Do a horizontal low pass filter on the 8x8 block
+ * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
  * useing the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
- * useing approximately the 7-Tap Filter (1,2,3,4,3,2,1)/16 (MMX2/3DNOW version)
+ * useing the 7-Tap Filter   (2,2,2,4,2,2,2)/16 (MMX2/3DNOW version)
  */
 static inline void doHorizLowPassAndCopyBack(uint8_t dst[], int stride, int QP)
 {
@@ -1635,14 +1640,6 @@ static inline void doHorizLowPassAndCopyBack(uint8_t dst[], int stride, int QP)
 */
 // approximately a 7-Tap Filter with Vector (1,2,3,4,3,2,1)/16
 /*
- 31
- 121
-  121
-   121
-    121
-     121
-      121
-       13
 Implemented    Exact 7-Tap
  9421          A321
  36421         64321
@@ -1654,6 +1651,7 @@ Implemented       Exact 7-Tap
      1249         123A
 
 */
+
 #ifdef HAVE_MMX2
 #define HLP3(i)        "movq " #i "(%%eax), %%mm0                              \n\t"\
                "movq %%mm0, %%mm1                                      \n\t"\
@@ -1680,12 +1678,12 @@ Implemented     Exact 7-Tap
 #define HLP3(i)        "movq " #i "(%%eax), %%mm0                              \n\t"\
                "movq %%mm0, %%mm1                                      \n\t"\
                "movq %%mm0, %%mm2                                      \n\t"\
-               "movq %%mm0, %%mm3                                      \n\t"\
-               "movq %%mm0, %%mm4                                      \n\t"\
+               "movd -4(%0), %%mm3                                     \n\t" /*0001000*/\
+               "movd 8(%0), %%mm4                                      \n\t" /*0001000*/\
                "psllq $8, %%mm1                                        \n\t"\
                "psrlq $8, %%mm2                                        \n\t"\
-               "pand bm00000001, %%mm3                                 \n\t"\
-               "pand bm10000000, %%mm4                                 \n\t"\
+               "psrlq $24, %%mm3                                       \n\t"\
+               "psllq $56, %%mm4                                       \n\t"\
                "por %%mm3, %%mm1                                       \n\t"\
                "por %%mm4, %%mm2                                       \n\t"\
                PAVGB(%%mm2, %%mm1)\
@@ -1708,7 +1706,80 @@ Implemented      Exact 7-Tap
                "movd %%mm0, 4(%0)                                      \n\t"
 #endif
 
-#define HLP(i) HLP3(i)
+/* uses the 7-Tap Filter: 1112111 */
+#define NEW_HLP(i)\
+               "movq " #i "(%%eax), %%mm0                              \n\t"\
+               "movq %%mm0, %%mm1                                      \n\t"\
+               "movq %%mm0, %%mm2                                      \n\t"\
+               "movd -4(%0), %%mm3                                     \n\t" /*0001000*/\
+               "movd 8(%0), %%mm4                                      \n\t" /*0001000*/\
+               "psllq $8, %%mm1                                        \n\t"\
+               "psrlq $8, %%mm2                                        \n\t"\
+               "psrlq $24, %%mm3                                       \n\t"\
+               "psllq $56, %%mm4                                       \n\t"\
+               "por %%mm3, %%mm1                                       \n\t"\
+               "por %%mm4, %%mm2                                       \n\t"\
+               "movq %%mm1, %%mm5                                      \n\t"\
+               PAVGB(%%mm2, %%mm1)\
+               PAVGB(%%mm1, %%mm0)\
+               "psllq $8, %%mm5                                        \n\t"\
+               "psrlq $8, %%mm2                                        \n\t"\
+               "por %%mm3, %%mm5                                       \n\t"\
+               "por %%mm4, %%mm2                                       \n\t"\
+               "movq %%mm5, %%mm1                                      \n\t"\
+               PAVGB(%%mm2, %%mm5)\
+               "psllq $8, %%mm1                                        \n\t"\
+               "psrlq $8, %%mm2                                        \n\t"\
+               "por %%mm3, %%mm1                                       \n\t"\
+               "por %%mm4, %%mm2                                       \n\t"\
+               PAVGB(%%mm2, %%mm1)\
+               PAVGB(%%mm1, %%mm5)\
+               PAVGB(%%mm5, %%mm0)\
+               "movd %%mm0, (%0)                                       \n\t"\
+               "psrlq $32, %%mm0                                       \n\t"\
+               "movd %%mm0, 4(%0)                                      \n\t"
+
+/* uses the 9-Tap Filter: 112242211 */
+#define NEW_HLP2(i)\
+               "movq " #i "(%%eax), %%mm0                              \n\t" /*0001000*/\
+               "movq %%mm0, %%mm1                                      \n\t" /*0001000*/\
+               "movq %%mm0, %%mm2                                      \n\t" /*0001000*/\
+               "movd -4(%0), %%mm3                                     \n\t" /*0001000*/\
+               "movd 8(%0), %%mm4                                      \n\t" /*0001000*/\
+               "psllq $8, %%mm1                                        \n\t"\
+               "psrlq $8, %%mm2                                        \n\t"\
+               "psrlq $24, %%mm3                                       \n\t"\
+               "psllq $56, %%mm4                                       \n\t"\
+               "por %%mm3, %%mm1                                       \n\t" /*0010000*/\
+               "por %%mm4, %%mm2                                       \n\t" /*0000100*/\
+               "movq %%mm1, %%mm5                                      \n\t" /*0010000*/\
+               PAVGB(%%mm2, %%mm1)                                           /*0010100*/\
+               PAVGB(%%mm1, %%mm0)                                           /*0012100*/\
+               "psllq $8, %%mm5                                        \n\t"\
+               "psrlq $8, %%mm2                                        \n\t"\
+               "por %%mm3, %%mm5                                       \n\t" /*0100000*/\
+               "por %%mm4, %%mm2                                       \n\t" /*0000010*/\
+               "movq %%mm5, %%mm1                                      \n\t" /*0100000*/\
+               PAVGB(%%mm2, %%mm5)                                           /*0100010*/\
+               "psllq $8, %%mm1                                        \n\t"\
+               "psrlq $8, %%mm2                                        \n\t"\
+               "por %%mm3, %%mm1                                       \n\t" /*1000000*/\
+               "por %%mm4, %%mm2                                       \n\t" /*0000001*/\
+               "movq %%mm1, %%mm6                                      \n\t" /*1000000*/\
+               PAVGB(%%mm2, %%mm1)                                           /*1000001*/\
+               "psllq $8, %%mm6                                        \n\t"\
+               "psrlq $8, %%mm2                                        \n\t"\
+               "por %%mm3, %%mm6                                       \n\t"/*100000000*/\
+               "por %%mm4, %%mm2                                       \n\t"/*000000001*/\
+               PAVGB(%%mm2, %%mm6)                                          /*100000001*/\
+               PAVGB(%%mm6, %%mm1)                                          /*110000011*/\
+               PAVGB(%%mm1, %%mm5)                                          /*112000211*/\
+               PAVGB(%%mm5, %%mm0)                                          /*112242211*/\
+               "movd %%mm0, (%0)                                       \n\t"\
+               "psrlq $32, %%mm0                                       \n\t"\
+               "movd %%mm0, 4(%0)                                      \n\t"
+
+#define HLP(i) NEW_HLP(i)
 
                HLP(0)
                "addl %1, %0                                            \n\t"
@@ -1828,6 +1899,363 @@ FIND_MIN_MAX(%%ebx, %1, 2)
 #endif
 }
 
+/**
+ * Deinterlaces the given block by linear interpolation: lines 1,3,5,7 are overwritten with the average of the lines directly above and below, even lines are kept
+ * will be called for every 8x8 block, except the last row, and can read & write into an 8x16 block (src = line 0, stride = offset between lines, line 8 is read)
+ */
+static inline void deInterlaceInterpolateLinear(uint8_t src[], int stride)
+{
+#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
+       asm volatile( // PAVGB(a, b) is defined outside this block; presumably a packed byte average left in b -- confirm
+               "leal (%0, %1), %%eax                           \n\t" // eax = address of line 1
+               "leal (%%eax, %1, 4), %%ebx                     \n\t" // ebx = address of line 5
+//     0       1       2       3       4       5       6       7       8       9   <- line number
+//     %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1   <- its address

+               "movq (%0), %%mm0                               \n\t" // L0
+               "movq (%%eax, %1), %%mm1                        \n\t" // L2
+               PAVGB(%%mm1, %%mm0)\
+               "movq %%mm0, (%%eax)                            \n\t" // line 1 = avg(L0, L2)
+               "movq (%0, %1, 4), %%mm0                        \n\t" // L4
+               PAVGB(%%mm0, %%mm1)\
+               "movq %%mm1, (%%eax, %1, 2)                     \n\t" // line 3 = avg(L2, L4)
+               "movq (%%ebx, %1), %%mm1                        \n\t" // L6
+               PAVGB(%%mm1, %%mm0)\
+               "movq %%mm0, (%%ebx)                            \n\t" // line 5 = avg(L4, L6)
+               "movq (%0, %1, 8), %%mm0                        \n\t" // L8 (first line of the block below)
+               PAVGB(%%mm0, %%mm1)\
+               "movq %%mm1, (%%ebx, %1, 2)                     \n\t" // line 7 = avg(L6, L8)

+               : : "r" (src), "r" (stride) // %0 = src, %1 = stride
+               : "%eax", "%ebx" // scratch registers used as line pointers
+       );
+#else
+       int x; // column index inside the 8 pixel wide block
+       for(x=0; x<8; x++) // C fallback: one column per iteration, truncating average (>>1)
+       {
+               src[stride]   = (src[0]        + src[stride*2])>>1;
+               src[stride*3] = (src[stride*2] + src[stride*4])>>1;
+               src[stride*5] = (src[stride*4] + src[stride*6])>>1;
+               src[stride*7] = (src[stride*6] + src[stride*8])>>1;
+               src++; // next column
+       }
+#endif
+}
+
+/**
+ * Deinterlaces the given block by linear interpolation without touching anything below line 7: lines 1,3,5 are averaged from their neighbours, line 7 is a copy of line 6
+ * will be called for every 8x8 block, in the last row, and can read & write into an 8x8 block (src = line 0, stride = offset between lines)
+ */
+static inline void deInterlaceInterpolateLinearLastRow(uint8_t src[], int stride)
+{
+#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
+       asm volatile( // PAVGB(a, b) is defined outside this block; presumably a packed byte average left in b -- confirm
+               "leal (%0, %1), %%eax                           \n\t" // eax = address of line 1
+               "leal (%%eax, %1, 4), %%ebx                     \n\t" // ebx = address of line 5
+//     0       1       2       3       4       5       6       7       8       9   <- line number
+//     %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1   <- its address (8 and 9 are not accessed here)

+               "movq (%0), %%mm0                               \n\t" // L0
+               "movq (%%eax, %1), %%mm1                        \n\t" // L2
+               PAVGB(%%mm1, %%mm0)\
+               "movq %%mm0, (%%eax)                            \n\t" // line 1 = avg(L0, L2)
+               "movq (%0, %1, 4), %%mm0                        \n\t" // L4
+               PAVGB(%%mm0, %%mm1)\
+               "movq %%mm1, (%%eax, %1, 2)                     \n\t" // line 3 = avg(L2, L4)
+               "movq (%%ebx, %1), %%mm1                        \n\t" // L6
+               PAVGB(%%mm1, %%mm0)\
+               "movq %%mm0, (%%ebx)                            \n\t" // line 5 = avg(L4, L6)
+               "movq %%mm1, (%%ebx, %1, 2)                     \n\t" // line 7 = L6 (no line 8 to average with)


+               : : "r" (src), "r" (stride) // %0 = src, %1 = stride
+               : "%eax", "%ebx" // scratch registers used as line pointers
+       );
+#else
+       int x; // column index inside the 8 pixel wide block
+       for(x=0; x<8; x++) // C fallback: one column per iteration, truncating average (>>1)
+       {
+               src[stride]   = (src[0]        + src[stride*2])>>1;
+               src[stride*3] = (src[stride*2] + src[stride*4])>>1;
+               src[stride*5] = (src[stride*4] + src[stride*6])>>1;
+               src[stride*7] = src[stride*6]; // duplicate line 6 instead of reading below the block
+               src++; // next column
+       }
+#endif
+}
+
+/**
+ * Deinterlaces the given block with a vertical (1,2,1)/4 blend: output line n = (Ln + 2*L(n+1) + L(n+2))/4 for n = 0..7, computed in place from top to bottom
+ * will be called for every 8x8 block, except the last row, and can read & write into an 8x16 block (lines 8 and 9 are read)
+ * will shift the image up by 1 line, because the result centred on line n+1 is stored in line n (FIXME if this is a problem)
+ */
+static inline void deInterlaceBlendLinear(uint8_t src[], int stride)
+{
+#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
+       asm volatile( // PAVGB(a, b) is defined outside this block; presumably a packed byte average left in b -- confirm
+               "leal (%0, %1), %%eax                           \n\t" // eax = address of line 1
+               "leal (%%eax, %1, 4), %%ebx                     \n\t" // ebx = address of line 5
+//     0       1       2       3       4       5       6       7       8       9   <- line number
+//     %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1   <- its address

+               "movq (%0), %%mm0                               \n\t" // L0
+               "movq (%%eax, %1), %%mm1                        \n\t" // L2
+               PAVGB(%%mm1, %%mm0)                                   // L0+L2
+               "movq (%%eax), %%mm2                            \n\t" // L1
+               PAVGB(%%mm2, %%mm0)                                   // 2L1 + L0 + L2
+               "movq %%mm0, (%0)                               \n\t" // -> line 0
+               "movq (%%eax, %1, 2), %%mm0                     \n\t" // L3
+               PAVGB(%%mm0, %%mm2)                                   // L1+L3
+               PAVGB(%%mm1, %%mm2)                                   // 2L2 + L1 + L3
+               "movq %%mm2, (%%eax)                            \n\t" // -> line 1
+               "movq (%0, %1, 4), %%mm2                        \n\t" // L4
+               PAVGB(%%mm2, %%mm1)                                   // L2+L4
+               PAVGB(%%mm0, %%mm1)                                   // 2L3 + L2 + L4
+               "movq %%mm1, (%%eax, %1)                        \n\t" // -> line 2
+               "movq (%%ebx), %%mm1                            \n\t" // L5
+               PAVGB(%%mm1, %%mm0)                                   // L3+L5
+               PAVGB(%%mm2, %%mm0)                                   // 2L4 + L3 + L5
+               "movq %%mm0, (%%eax, %1, 2)                     \n\t" // -> line 3
+               "movq (%%ebx, %1), %%mm0                        \n\t" // L6
+               PAVGB(%%mm0, %%mm2)                                   // L4+L6
+               PAVGB(%%mm1, %%mm2)                                   // 2L5 + L4 + L6
+               "movq %%mm2, (%0, %1, 4)                        \n\t" // -> line 4
+               "movq (%%ebx, %1, 2), %%mm2                     \n\t" // L7
+               PAVGB(%%mm2, %%mm1)                                   // L5+L7
+               PAVGB(%%mm0, %%mm1)                                   // 2L6 + L5 + L7
+               "movq %%mm1, (%%ebx)                            \n\t" // -> line 5
+               "movq (%0, %1, 8), %%mm1                        \n\t" // L8
+               PAVGB(%%mm1, %%mm0)                                   // L6+L8
+               PAVGB(%%mm2, %%mm0)                                   // 2L7 + L6 + L8
+               "movq %%mm0, (%%ebx, %1)                        \n\t" // -> line 6
+               "movq (%%ebx, %1, 4), %%mm0                     \n\t" // L9
+               PAVGB(%%mm0, %%mm2)                                   // L7+L9
+               PAVGB(%%mm1, %%mm2)                                   // 2L8 + L7 + L9
+               "movq %%mm2, (%%ebx, %1, 2)                     \n\t" // -> line 7


+               : : "r" (src), "r" (stride) // %0 = src, %1 = stride
+               : "%eax", "%ebx" // scratch registers used as line pointers
+       );
+#else
+       int x; // column index inside the 8 pixel wide block
+       for(x=0; x<8; x++) // C fallback: each line is written before the lines it depends on are overwritten
+       {
+               src[0       ] = (src[0       ] + 2*src[stride  ] + src[stride*2])>>2;
+               src[stride  ] = (src[stride  ] + 2*src[stride*2] + src[stride*3])>>2;
+               src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2;
+               src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2;
+               src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2;
+               src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2;
+               src[stride*6] = (src[stride*6] + 2*src[stride*7] + src[stride*8])>>2;
+               src[stride*7] = (src[stride*7] + 2*src[stride*8] + src[stride*9])>>2;
+               src++; // next column
+       }
+#endif
+}
+
+/**
+ * Deinterlaces the given block with a vertical (1,2,1)/4 blend for lines 0..5; lines 6 and 7 both become the average of L6 and L7 so nothing below the block is read
+ * will be called for every 8x8 block, in the last row, and can read & write into an 8x8 block (src = line 0, stride = offset between lines)
+ * will shift the image up by 1 line, because the result centred on line n+1 is stored in line n (FIXME if this is a problem)
+ */
+static inline void deInterlaceBlendLinearLastRow(uint8_t src[], int stride)
+{
+#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
+       asm volatile( // PAVGB(a, b) is defined outside this block; presumably a packed byte average left in b -- confirm
+               "leal (%0, %1), %%eax                           \n\t" // eax = address of line 1
+               "leal (%%eax, %1, 4), %%ebx                     \n\t" // ebx = address of line 5
+//     0       1       2       3       4       5       6       7       8       9   <- line number
+//     %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1   <- its address (8 and 9 are not accessed here)

+               "movq (%0), %%mm0                               \n\t" // L0
+               "movq (%%eax, %1), %%mm1                        \n\t" // L2
+               PAVGB(%%mm1, %%mm0)                                   // L0+L2
+               "movq (%%eax), %%mm2                            \n\t" // L1
+               PAVGB(%%mm2, %%mm0)                                   // 2L1 + L0 + L2
+               "movq %%mm0, (%0)                               \n\t" // -> line 0
+               "movq (%%eax, %1, 2), %%mm0                     \n\t" // L3
+               PAVGB(%%mm0, %%mm2)                                   // L1+L3
+               PAVGB(%%mm1, %%mm2)                                   // 2L2 + L1 + L3
+               "movq %%mm2, (%%eax)                            \n\t" // -> line 1
+               "movq (%0, %1, 4), %%mm2                        \n\t" // L4
+               PAVGB(%%mm2, %%mm1)                                   // L2+L4
+               PAVGB(%%mm0, %%mm1)                                   // 2L3 + L2 + L4
+               "movq %%mm1, (%%eax, %1)                        \n\t" // -> line 2
+               "movq (%%ebx), %%mm1                            \n\t" // L5
+               PAVGB(%%mm1, %%mm0)                                   // L3+L5
+               PAVGB(%%mm2, %%mm0)                                   // 2L4 + L3 + L5
+               "movq %%mm0, (%%eax, %1, 2)                     \n\t" // -> line 3
+               "movq (%%ebx, %1), %%mm0                        \n\t" // L6
+               PAVGB(%%mm0, %%mm2)                                   // L4+L6
+               PAVGB(%%mm1, %%mm2)                                   // 2L5 + L4 + L6
+               "movq %%mm2, (%0, %1, 4)                        \n\t" // -> line 4
+               "movq (%%ebx, %1, 2), %%mm2                     \n\t" // L7
+               PAVGB(%%mm2, %%mm1)                                   // L5+L7
+               PAVGB(%%mm0, %%mm1)                                   // 2L6 + L5 + L7
+               "movq %%mm1, (%%ebx)                            \n\t" // -> line 5
+               PAVGB(%%mm2, %%mm0)                                   // L6 + L7
+               "movq %%mm0, (%%ebx, %1)                        \n\t" // -> line 6
+               "movq %%mm0, (%%ebx, %1, 2)                     \n\t" // -> line 7 (same value as line 6)

+               : : "r" (src), "r" (stride) // %0 = src, %1 = stride
+               : "%eax", "%ebx" // scratch registers used as line pointers
+       );
+#else
+       int x; // column index inside the 8 pixel wide block
+       for(x=0; x<8; x++) // C fallback: each line is written before the lines it depends on are overwritten
+       {
+               src[0       ] = (src[0       ] + 2*src[stride  ] + src[stride*2])>>2;
+               src[stride  ] = (src[stride  ] + 2*src[stride*2] + src[stride*3])>>2;
+               src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2;
+               src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2;
+               src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2;
+               src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2;
+               src[stride*6] = (src[stride*6] +   src[stride*7])>>1; // no line 8 available: plain average of the last two lines
+               src[stride*7] = src[stride*6]; // copies the already blended line 6
+               src++; // next column
+       }
+#endif
+}
+
+/**
+ * Deinterlaces the given block; HAVE_MMX2 path: lines 1,3,5,7 become the per-byte median of (line above, the line itself, line below), even lines are kept
+ * will be called for every 8x8 block, except the last row, and can read & write into an 8x16 block; the C fallback is NOT a median yet (see FIXME), it is a (1,2,1)/4 blend
+ */
+static inline void deInterlaceMedian(uint8_t src[], int stride)
+{
+#if defined (HAVE_MMX2)
+       asm volatile( // median(a,b,c) is computed as min(max(a,b), max(min(a,b), c)) with pmaxub/pminub
+               "leal (%0, %1), %%eax                           \n\t" // eax = address of line 1
+               "leal (%%eax, %1, 4), %%ebx                     \n\t" // ebx = address of line 5
+//     0       1       2       3       4       5       6       7       8       9   <- line number
+//     %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1   <- its address

+               "movq (%0), %%mm0                               \n\t" // L0
+               "movq (%%eax, %1), %%mm2                        \n\t" // L2
+               "movq (%%eax), %%mm1                            \n\t" // L1
+               "movq %%mm0, %%mm3                              \n\t" // copy of L0
+               "pmaxub %%mm1, %%mm0                            \n\t" // max(L0, L1)
+               "pminub %%mm3, %%mm1                            \n\t" // min(L0, L1)
+               "pmaxub %%mm2, %%mm1                            \n\t" // max(min(L0, L1), L2)
+               "pminub %%mm1, %%mm0                            \n\t" // median(L0, L1, L2)
+               "movq %%mm0, (%%eax)                            \n\t" // -> line 1

+               "movq (%0, %1, 4), %%mm0                        \n\t" // L4
+               "movq (%%eax, %1, 2), %%mm1                     \n\t" // L3
+               "movq %%mm2, %%mm3                              \n\t" // copy of L2
+               "pmaxub %%mm1, %%mm2                            \n\t" // max(L2, L3)
+               "pminub %%mm3, %%mm1                            \n\t" // min(L2, L3)
+               "pmaxub %%mm0, %%mm1                            \n\t" // max(min(L2, L3), L4)
+               "pminub %%mm1, %%mm2                            \n\t" // median(L2, L3, L4)
+               "movq %%mm2, (%%eax, %1, 2)                     \n\t" // -> line 3

+               "movq (%%ebx), %%mm2                            \n\t" // L5
+               "movq (%%ebx, %1), %%mm1                        \n\t" // L6
+               "movq %%mm2, %%mm3                              \n\t" // copy of L5
+               "pmaxub %%mm0, %%mm2                            \n\t" // max(L4, L5)
+               "pminub %%mm3, %%mm0                            \n\t" // min(L4, L5)
+               "pmaxub %%mm1, %%mm0                            \n\t" // max(min(L4, L5), L6)
+               "pminub %%mm0, %%mm2                            \n\t" // median(L4, L5, L6)
+               "movq %%mm2, (%%ebx)                            \n\t" // -> line 5

+               "movq (%%ebx, %1, 2), %%mm2                     \n\t" // L7
+               "movq (%0, %1, 8), %%mm0                        \n\t" // L8 (first line of the block below)
+               "movq %%mm2, %%mm3                              \n\t" // copy of L7
+               "pmaxub %%mm0, %%mm2                            \n\t" // max(L7, L8)
+               "pminub %%mm3, %%mm0                            \n\t" // min(L7, L8)
+               "pmaxub %%mm1, %%mm0                            \n\t" // max(min(L7, L8), L6)
+               "pminub %%mm0, %%mm2                            \n\t" // median(L6, L7, L8)
+               "movq %%mm2, (%%ebx, %1, 2)                     \n\t" // -> line 7


+               : : "r" (src), "r" (stride) // %0 = src, %1 = stride
+               : "%eax", "%ebx" // scratch registers used as line pointers
+       );
+#else
+       //FIXME this fallback is the (1,2,1)/4 blend (shifts the image up by 1 line, reads lines 8 and 9), not a median
+       int x; // column index inside the 8 pixel wide block
+       for(x=0; x<8; x++)
+       {
+               src[0       ] = (src[0       ] + 2*src[stride  ] + src[stride*2])>>2;
+               src[stride  ] = (src[stride  ] + 2*src[stride*2] + src[stride*3])>>2;
+               src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2;
+               src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2;
+               src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2;
+               src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2;
+               src[stride*6] = (src[stride*6] + 2*src[stride*7] + src[stride*8])>>2;
+               src[stride*7] = (src[stride*7] + 2*src[stride*8] + src[stride*9])>>2;
+               src++; // next column
+       }
+#endif
+}
+
+/**
+ * Deinterlaces the given block; HAVE_MMX2 path: lines 1,3,5 become the per-byte median of (line above, the line itself, line below), line 7 is a copy of line 6
+ * will be called for every 8x8 block, in the last row, and can read & write into an 8x8 block; the C fallback is NOT a median yet (see FIXME), it is a (1,2,1)/4 blend
+ * only the C fallback shifts the image up by 1 line, the MMX2 path writes nothing to the even lines (FIXME if this is a problem)
+ */
+static inline void deInterlaceMedianLastRow(uint8_t src[], int stride)
+{
+#if defined (HAVE_MMX2)
+       asm volatile( // median(a,b,c) is computed as min(max(a,b), max(min(a,b), c)) with pmaxub/pminub
+               "leal (%0, %1), %%eax                           \n\t" // eax = address of line 1
+               "leal (%%eax, %1, 4), %%ebx                     \n\t" // ebx = address of line 5
+//     0       1       2       3       4       5       6       7       8       9   <- line number
+//     %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1   <- its address (8 and 9 are not accessed here)

+               "movq (%0), %%mm0                               \n\t" // L0
+               "movq (%%eax, %1), %%mm2                        \n\t" // L2
+               "movq (%%eax), %%mm1                            \n\t" // L1
+               "movq %%mm0, %%mm3                              \n\t" // copy of L0
+               "pmaxub %%mm1, %%mm0                            \n\t" // max(L0, L1)
+               "pminub %%mm3, %%mm1                            \n\t" // min(L0, L1)
+               "pmaxub %%mm2, %%mm1                            \n\t" // max(min(L0, L1), L2)
+               "pminub %%mm1, %%mm0                            \n\t" // median(L0, L1, L2)
+               "movq %%mm0, (%%eax)                            \n\t" // -> line 1

+               "movq (%0, %1, 4), %%mm0                        \n\t" // L4
+               "movq (%%eax, %1, 2), %%mm1                     \n\t" // L3
+               "movq %%mm2, %%mm3                              \n\t" // copy of L2
+               "pmaxub %%mm1, %%mm2                            \n\t" // max(L2, L3)
+               "pminub %%mm3, %%mm1                            \n\t" // min(L2, L3)
+               "pmaxub %%mm0, %%mm1                            \n\t" // max(min(L2, L3), L4)
+               "pminub %%mm1, %%mm2                            \n\t" // median(L2, L3, L4)
+               "movq %%mm2, (%%eax, %1, 2)                     \n\t" // -> line 3

+               "movq (%%ebx), %%mm2                            \n\t" // L5
+               "movq (%%ebx, %1), %%mm1                        \n\t" // L6
+               "movq %%mm2, %%mm3                              \n\t" // copy of L5
+               "pmaxub %%mm0, %%mm2                            \n\t" // max(L4, L5)
+               "pminub %%mm3, %%mm0                            \n\t" // min(L4, L5)
+               "pmaxub %%mm1, %%mm0                            \n\t" // max(min(L4, L5), L6)
+               "pminub %%mm0, %%mm2                            \n\t" // median(L4, L5, L6)
+               "movq %%mm2, (%%ebx)                            \n\t" // -> line 5

+               "movq %%mm1, (%%ebx, %1, 2)                     \n\t" // line 7 = L6 (no line 8 for a median)

+               : : "r" (src), "r" (stride) // %0 = src, %1 = stride
+               : "%eax", "%ebx" // scratch registers used as line pointers
+       );
+#else
+       //FIXME this fallback is the (1,2,1)/4 blend of the last-row blend filter (shifts the image up by 1 line), not a median
+       int x; // column index inside the 8 pixel wide block
+       for(x=0; x<8; x++)
+       {
+               src[0       ] = (src[0       ] + 2*src[stride  ] + src[stride*2])>>2;
+               src[stride  ] = (src[stride  ] + 2*src[stride*2] + src[stride*3])>>2;
+               src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2;
+               src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2;
+               src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2;
+               src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2;
+               src[stride*6] = (src[stride*6] +   src[stride*7])>>1; // no line 8 available: plain average of the last two lines
+               src[stride*7] = src[stride*6]; // copies the already blended line 6
+               src++; // next column
+       }
+#endif
+}
+
+
 #ifdef HAVE_ODIVX_POSTPROCESS
 #include "../opendivx/postprocess.h"
 int use_old_pp=0;
@@ -1841,7 +2269,6 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
  * the mode value is interpreted as a quality value if its negative, its range is then (-1 ... -63)
  * -63 is best quality -1 is worst
  */
-//extern "C"{
 void  postprocess(unsigned char * src[], int src_stride,
                  unsigned char * dst[], int dst_stride,
                  int horizontal_size,   int vertical_size,
@@ -2196,6 +2623,17 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
                                blockCopy(vertBlock + dstStride*2, dstStride,
                                        vertSrcBlock + srcStride*2, srcStride, 8, mode & LEVEL_FIX);
 
+                               if(mode & LINEAR_IPOL_DEINT_FILTER)
+                                       deInterlaceInterpolateLinear(dstBlock, dstStride);
+                               else if(mode & LINEAR_BLEND_DEINT_FILTER)
+                                       deInterlaceBlendLinear(dstBlock, dstStride);
+                               else if(mode & MEDIAN_DEINT_FILTER)
+                                       deInterlaceMedian(dstBlock, dstStride);
+/*                             else if(mode & CUBIC_IPOL_DEINT_FILTER)
+                                       deInterlaceInterpolateCubic(dstBlock, dstStride);
+                               else if(mode & CUBIC_BLEND_DEINT_FILTER)
+                                       deInterlaceBlendCubic(dstBlock, dstStride);
+*/
 
 #ifdef MORE_TIMEING
                                T1= rdtsc();
@@ -2226,9 +2664,22 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
 #endif
                        }
                        else
+                       {
                                blockCopy(vertBlock + dstStride*1, dstStride,
                                        vertSrcBlock + srcStride*1, srcStride, 4, mode & LEVEL_FIX);
 
+                               if(mode & LINEAR_IPOL_DEINT_FILTER)
+                                       deInterlaceInterpolateLinearLastRow(dstBlock, dstStride);
+                               else if(mode & LINEAR_BLEND_DEINT_FILTER)
+                                       deInterlaceBlendLinearLastRow(dstBlock, dstStride);
+                               else if(mode & MEDIAN_DEINT_FILTER)
+                                       deInterlaceMedianLastRow(dstBlock, dstStride);
+/*                             else if(mode & CUBIC_IPOL_DEINT_FILTER)
+                                       deInterlaceInterpolateCubicLastRow(dstBlock, dstStride);
+                               else if(mode & CUBIC_BLEND_DEINT_FILTER)
+                                       deInterlaceBlendCubicLastRow(dstBlock, dstStride);
+*/
+                       }
 
                        if(x - 8 >= 0 && x<width)
                        {
index 81f5435..143ea57 100644 (file)
@@ -22,6 +22,7 @@
 
 #define BLOCK_SIZE 8
 #define TEMP_STRIDE 8
+//#define NUM_BLOCKS_AT_ONCE 16 //not used yet
 
 #define V_DEBLOCK      0x01
 #define H_DEBLOCK      0x02
 #define LUM_H_DEBLOCK  H_DEBLOCK               //   2
 #define CHROM_V_DEBLOCK        (V_DEBLOCK<<4)          //  16
 #define CHROM_H_DEBLOCK        (H_DEBLOCK<<4)          //  32
-#define LUM_DERING     DERING                  //   4
-#define CHROM_DERING   (DERING<<4)             //  64
+#define LUM_DERING     DERING                  //   4 (not implemented yet)
+#define CHROM_DERING   (DERING<<4)             //  64 (not implemented yet)
 #define LUM_LEVEL_FIX  LEVEL_FIX               //   8
-//not supported currently
-#define CHROM_LEVEL_FIX        (LEVEL_FIX<<4)          // 128
+#define CHROM_LEVEL_FIX        (LEVEL_FIX<<4)          // 128 (not implemented yet)
 
 // Experimental vertical filters
 #define V_RK1_FILTER   0x0100                  // 256
 #define V_X1_FILTER    0x0200                  // 512
 
 // Experimental horizontal filters
-#define H_RK1_FILTER   0x1000                  // 4096
+#define H_RK1_FILTER   0x1000                  // 4096 (not implemented yet)
 #define H_X1_FILTER    0x2000                  // 8192
 
+//Deinterlacing Filters (the filter id is stored in bits 16-19 of mode, see DEINTERLACE_FILTER_MASK)
+#define DEINTERLACE_FILTER_MASK                0xF0000 // 983040, covers every filter id below
+#define        LINEAR_IPOL_DEINT_FILTER        0x10000 // 65536
+#define        LINEAR_BLEND_DEINT_FILTER       0x20000 // 131072
+#define        CUBIC_BLEND_DEINT_FILTER        0x30000 // 196608 (not implemented yet) NOTE(review): equals LINEAR_IPOL|LINEAR_BLEND, so a plain (mode & LINEAR_IPOL_DEINT_FILTER) test is also true for this id; comparing (mode & DEINTERLACE_FILTER_MASK) against the id would be unambiguous
+#define        CUBIC_IPOL_DEINT_FILTER         0x40000 // 262144 (not implemented yet)
+#define        MEDIAN_DEINT_FILTER             0x80000 // 524288
+
+
 #define GET_PP_QUALITY_MAX 6
 
 //#define TIMEING
 
 #define QP_STORE_T int
 
-//#ifdef __cplusplus
-//#include <inttypes.h>
-
-//void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
-//     QP_STORE_T QPs[], int QPStride, int isColor, int mode);
-//#endif
-
-//#ifdef __cplusplus
-//extern "C"
-//{
-//#endif
-
 void postprocess(unsigned char * src[], int src_stride,
                  unsigned char * dst[], int dst_stride,
                  int horizontal_size,   int vertical_size,
@@ -72,8 +69,4 @@ void postprocess(unsigned char * src[], int src_stride,
 
 int getPpModeForQuality(int quality);
 
-//#ifdef __cplusplus
-//}
-//#endif
-
 #endif
index 04e73cc..7d6ac4a 100644 (file)
 */
 
 /*
-                       C       MMX     MMX2    3DNow*
+                       C       MMX     MMX2    3DNow
 isVertDC               Ec      Ec
 isVertMinMaxOk         Ec      Ec
-doVertLowPass          E               e       e*
+doVertLowPass          E               e       e
 doVertDefFilter                Ec      Ec      Ec
 isHorizDC              Ec      Ec
 isHorizMinMaxOk                a
-doHorizLowPass         E               a       a*
+doHorizLowPass         E               a       a
 doHorizDefFilter       E       ac      ac
 deRing
-Vertical RKAlgo1       E               a       a*
-Vertical X1            a               E       E*
-Horizontal X1          a               E       E*
+Vertical RKAlgo1       E               a       a
+Vertical X1            a               E       E
+Horizontal X1          a               E       E
+LinIpolDeinterlace     a               E       E*
+LinBlendDeinterlace    a               E       E*
+MedianDeinterlace      a               E
 
 
 * i dont have a 3dnow CPU -> its untested
@@ -55,6 +58,7 @@ make the mainloop more flexible (variable number of blocks at once
 compare the quality & speed of all filters
 implement a few simple deinterlacing filters
 split this huge file
+fix warnings (unused vars, ...)
 ...
 
 Notes:
@@ -63,6 +67,9 @@ Notes:
 
 /*
 Changelog: use the CVS log
+rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
+added deinterlace filters (linear interpolate, linear blend, median)
+minor cleanups (removed some commented-out code)
 0.1.3
        bugfixes: last 3 lines not brightness/contrast corrected
                brightness statistics messed up with initial black pic
@@ -194,13 +201,11 @@ static inline void prefetcht2(void *p)
  * Check if the middle 8x8 Block in the given 8x10 block is flat
  */
 static inline int isVertDC(uint8_t src[], int stride){
-//     return true;
        int numEq= 0;
        int y;
        src+= stride; // src points to begin of the 8x8 Block
 #ifdef HAVE_MMX
        asm volatile(
-//             "int $3 \n\t"
                "pushl %1\n\t"
                "movq b7E, %%mm7                                        \n\t" // mm7 = 0x7F
                "movq b7C, %%mm6                                        \n\t" // mm6 = 0x7D
@@ -1577,9 +1582,9 @@ static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP
 }
 
 /**
- * Do a horizontal low pass filter on the 8x8 block
+ * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
  * useing the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
- * useing approximately the 7-Tap Filter (1,2,3,4,3,2,1)/16 (MMX2/3DNOW version)
+ * useing the 7-Tap Filter   (2,2,2,4,2,2,2)/16 (MMX2/3DNOW version)
  */
 static inline void doHorizLowPassAndCopyBack(uint8_t dst[], int stride, int QP)
 {
@@ -1635,14 +1640,6 @@ static inline void doHorizLowPassAndCopyBack(uint8_t dst[], int stride, int QP)
 */
 // approximately a 7-Tap Filter with Vector (1,2,3,4,3,2,1)/16
 /*
- 31
- 121
-  121
-   121
-    121
-     121
-      121
-       13
 Implemented    Exact 7-Tap
  9421          A321
  36421         64321
@@ -1654,6 +1651,7 @@ Implemented       Exact 7-Tap
      1249         123A
 
 */
+
 #ifdef HAVE_MMX2
 #define HLP3(i)        "movq " #i "(%%eax), %%mm0                              \n\t"\
                "movq %%mm0, %%mm1                                      \n\t"\
@@ -1680,12 +1678,12 @@ Implemented     Exact 7-Tap
 #define HLP3(i)        "movq " #i "(%%eax), %%mm0                              \n\t"\
                "movq %%mm0, %%mm1                                      \n\t"\
                "movq %%mm0, %%mm2                                      \n\t"\
-               "movq %%mm0, %%mm3                                      \n\t"\
-               "movq %%mm0, %%mm4                                      \n\t"\
+               "movd -4(%0), %%mm3                                     \n\t" /*0001000*/\
+               "movd 8(%0), %%mm4                                      \n\t" /*0001000*/\
                "psllq $8, %%mm1                                        \n\t"\
                "psrlq $8, %%mm2                                        \n\t"\
-               "pand bm00000001, %%mm3                                 \n\t"\
-               "pand bm10000000, %%mm4                                 \n\t"\
+               "psrlq $24, %%mm3                                       \n\t"\
+               "psllq $56, %%mm4                                       \n\t"\
                "por %%mm3, %%mm1                                       \n\t"\
                "por %%mm4, %%mm2                                       \n\t"\
                PAVGB(%%mm2, %%mm1)\
@@ -1708,7 +1706,80 @@ Implemented      Exact 7-Tap
                "movd %%mm0, 4(%0)                                      \n\t"
 #endif
 
-#define HLP(i) HLP3(i)
+/* uses the 7-Tap Filter: 1112111 */
+#define NEW_HLP(i)\
+               "movq " #i "(%%eax), %%mm0                              \n\t"\
+               "movq %%mm0, %%mm1                                      \n\t"\
+               "movq %%mm0, %%mm2                                      \n\t"\
+               "movd -4(%0), %%mm3                                     \n\t" /*0001000*/\
+               "movd 8(%0), %%mm4                                      \n\t" /*0001000*/\
+               "psllq $8, %%mm1                                        \n\t"\
+               "psrlq $8, %%mm2                                        \n\t"\
+               "psrlq $24, %%mm3                                       \n\t"\
+               "psllq $56, %%mm4                                       \n\t"\
+               "por %%mm3, %%mm1                                       \n\t"\
+               "por %%mm4, %%mm2                                       \n\t"\
+               "movq %%mm1, %%mm5                                      \n\t"\
+               PAVGB(%%mm2, %%mm1)\
+               PAVGB(%%mm1, %%mm0)\
+               "psllq $8, %%mm5                                        \n\t"\
+               "psrlq $8, %%mm2                                        \n\t"\
+               "por %%mm3, %%mm5                                       \n\t"\
+               "por %%mm4, %%mm2                                       \n\t"\
+               "movq %%mm5, %%mm1                                      \n\t"\
+               PAVGB(%%mm2, %%mm5)\
+               "psllq $8, %%mm1                                        \n\t"\
+               "psrlq $8, %%mm2                                        \n\t"\
+               "por %%mm3, %%mm1                                       \n\t"\
+               "por %%mm4, %%mm2                                       \n\t"\
+               PAVGB(%%mm2, %%mm1)\
+               PAVGB(%%mm1, %%mm5)\
+               PAVGB(%%mm5, %%mm0)\
+               "movd %%mm0, (%0)                                       \n\t"\
+               "psrlq $32, %%mm0                                       \n\t"\
+               "movd %%mm0, 4(%0)                                      \n\t"
+
+/* uses the 9-Tap Filter: 112242211 */
+#define NEW_HLP2(i)\
+               "movq " #i "(%%eax), %%mm0                              \n\t" /*0001000*/\
+               "movq %%mm0, %%mm1                                      \n\t" /*0001000*/\
+               "movq %%mm0, %%mm2                                      \n\t" /*0001000*/\
+               "movd -4(%0), %%mm3                                     \n\t" /*0001000*/\
+               "movd 8(%0), %%mm4                                      \n\t" /*0001000*/\
+               "psllq $8, %%mm1                                        \n\t"\
+               "psrlq $8, %%mm2                                        \n\t"\
+               "psrlq $24, %%mm3                                       \n\t"\
+               "psllq $56, %%mm4                                       \n\t"\
+               "por %%mm3, %%mm1                                       \n\t" /*0010000*/\
+               "por %%mm4, %%mm2                                       \n\t" /*0000100*/\
+               "movq %%mm1, %%mm5                                      \n\t" /*0010000*/\
+               PAVGB(%%mm2, %%mm1)                                           /*0010100*/\
+               PAVGB(%%mm1, %%mm0)                                           /*0012100*/\
+               "psllq $8, %%mm5                                        \n\t"\
+               "psrlq $8, %%mm2                                        \n\t"\
+               "por %%mm3, %%mm5                                       \n\t" /*0100000*/\
+               "por %%mm4, %%mm2                                       \n\t" /*0000010*/\
+               "movq %%mm5, %%mm1                                      \n\t" /*0100000*/\
+               PAVGB(%%mm2, %%mm5)                                           /*0100010*/\
+               "psllq $8, %%mm1                                        \n\t"\
+               "psrlq $8, %%mm2                                        \n\t"\
+               "por %%mm3, %%mm1                                       \n\t" /*1000000*/\
+               "por %%mm4, %%mm2                                       \n\t" /*0000001*/\
+               "movq %%mm1, %%mm6                                      \n\t" /*1000000*/\
+               PAVGB(%%mm2, %%mm1)                                           /*1000001*/\
+               "psllq $8, %%mm6                                        \n\t"\
+               "psrlq $8, %%mm2                                        \n\t"\
+               "por %%mm3, %%mm6                                       \n\t"/*100000000*/\
+               "por %%mm4, %%mm2                                       \n\t"/*000000001*/\
+               PAVGB(%%mm2, %%mm6)                                          /*100000001*/\
+               PAVGB(%%mm6, %%mm1)                                          /*110000011*/\
+               PAVGB(%%mm1, %%mm5)                                          /*112000211*/\
+               PAVGB(%%mm5, %%mm0)                                          /*112242211*/\
+               "movd %%mm0, (%0)                                       \n\t"\
+               "psrlq $32, %%mm0                                       \n\t"\
+               "movd %%mm0, 4(%0)                                      \n\t"
+
+#define HLP(i) NEW_HLP(i)
 
                HLP(0)
                "addl %1, %0                                            \n\t"
@@ -1828,6 +1899,363 @@ FIND_MIN_MAX(%%ebx, %1, 2)
 #endif
 }
 
+/**
+ * Deinterlaces the given block by linear interpolation:
+ * lines 1, 3, 5 and 7 are overwritten with the average of the lines directly above and below them,
+ * the even lines are kept as they are and line 8 (first line of the block below) is only read
+ * will be called for every 8x8 block, except the last row, and can read & write into an 8x16 block
+ * src points to line 0 of the block, stride is the offset between 2 vertically adjacent pixels
+ * the asm path processes all 8 columns per instruction, the C path 1 column per loop iteration
+ * NOTE(review): PAVGB is a macro defined outside of this function, presumably a byte wise average
+ *               whose rounding may differ from the truncating >>1 of the C path - confirm
+ */
+static inline void deInterlaceInterpolateLinear(uint8_t src[], int stride)
+{
+#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
+       asm volatile(
+               "leal (%0, %1), %%eax                           \n\t" // eax = src + stride   (line 1)
+               "leal (%%eax, %1, 4), %%ebx                     \n\t" // ebx = src + 5*stride (line 5)
+//     0       1       2       3       4       5       6       7       8       9
+//     %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1
+
+               "movq (%0), %%mm0                               \n\t" // L0
+               "movq (%%eax, %1), %%mm1                        \n\t" // L2
+               PAVGB(%%mm1, %%mm0)\
+               "movq %%mm0, (%%eax)                            \n\t" // line 1 = avg(L0, L2)
+               "movq (%0, %1, 4), %%mm0                        \n\t" // L4
+               PAVGB(%%mm0, %%mm1)\
+               "movq %%mm1, (%%eax, %1, 2)                     \n\t" // line 3 = avg(L2, L4)
+               "movq (%%ebx, %1), %%mm1                        \n\t" // L6
+               PAVGB(%%mm1, %%mm0)\
+               "movq %%mm0, (%%ebx)                            \n\t" // line 5 = avg(L4, L6)
+               "movq (%0, %1, 8), %%mm0                        \n\t" // L8 (belongs to the block below)
+               PAVGB(%%mm0, %%mm1)\
+               "movq %%mm1, (%%ebx, %1, 2)                     \n\t" // line 7 = avg(L6, L8)
+
+               : : "r" (src), "r" (stride)
+               : "%eax", "%ebx" // NOTE(review): the asm stores through src but no "memory" clobber is listed - confirm this is safe
+       );
+#else
+       int x;
+       for(x=0; x<8; x++) // one column per iteration
+       {
+               src[stride]   = (src[0]        + src[stride*2])>>1;
+               src[stride*3] = (src[stride*2] + src[stride*4])>>1;
+               src[stride*5] = (src[stride*4] + src[stride*6])>>1;
+               src[stride*7] = (src[stride*6] + src[stride*8])>>1; // reads line 8, outside of the 8x8 block
+               src++;
+       }
+#endif
+}
+
+/**
+ * Deinterlaces the given block by linear interpolation, variant which never touches line 8:
+ * lines 1, 3 and 5 are overwritten with the average of the lines directly above and below them,
+ * line 7 is overwritten with a copy of line 6, the even lines are kept as they are
+ * will be called for every 8x8 block, in the last row, and can read & write into an 8x8 block
+ * src points to line 0 of the block, stride is the offset between 2 vertically adjacent pixels
+ * NOTE(review): PAVGB is a macro defined outside of this function, presumably a byte wise average
+ *               whose rounding may differ from the truncating >>1 of the C path - confirm
+ */
+static inline void deInterlaceInterpolateLinearLastRow(uint8_t src[], int stride)
+{
+#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
+       asm volatile(
+               "leal (%0, %1), %%eax                           \n\t" // eax = src + stride   (line 1)
+               "leal (%%eax, %1, 4), %%ebx                     \n\t" // ebx = src + 5*stride (line 5)
+//     0       1       2       3       4       5       6       7       8       9
+//     %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1
+
+               "movq (%0), %%mm0                               \n\t" // L0
+               "movq (%%eax, %1), %%mm1                        \n\t" // L2
+               PAVGB(%%mm1, %%mm0)\
+               "movq %%mm0, (%%eax)                            \n\t" // line 1 = avg(L0, L2)
+               "movq (%0, %1, 4), %%mm0                        \n\t" // L4
+               PAVGB(%%mm0, %%mm1)\
+               "movq %%mm1, (%%eax, %1, 2)                     \n\t" // line 3 = avg(L2, L4)
+               "movq (%%ebx, %1), %%mm1                        \n\t" // L6
+               PAVGB(%%mm1, %%mm0)\
+               "movq %%mm0, (%%ebx)                            \n\t" // line 5 = avg(L4, L6)
+               "movq %%mm1, (%%ebx, %1, 2)                     \n\t" // line 7 = L6, there is no line 8 to average with
+
+
+               : : "r" (src), "r" (stride)
+               : "%eax", "%ebx" // NOTE(review): the asm stores through src but no "memory" clobber is listed - confirm this is safe
+       );
+#else
+       int x;
+       for(x=0; x<8; x++) // one column per iteration
+       {
+               src[stride]   = (src[0]        + src[stride*2])>>1;
+               src[stride*3] = (src[stride*2] + src[stride*4])>>1;
+               src[stride*5] = (src[stride*4] + src[stride*6])>>1;
+               src[stride*7] = src[stride*6]; // no line below, duplicate line 6
+               src++;
+       }
+#endif
+}
+
+/**
+ * Deinterlaces the given block with a vertical (1,2,1)/4 blend filter:
+ * line n (n = 0 ... 7) is overwritten with (Ln + 2*L(n+1) + L(n+2))/4 of the old lines,
+ * so lines 8 and 9 of the block below are read but not written
+ * will be called for every 8x8 block, except the last row, and can read & write into an 8x16 block
+ * will shift the image up by 1 line (FIXME if this is a problem)
+ * (the filter is centered on line n+1 but its result is stored in line n, hence the shift)
+ * src points to line 0 of the block, stride is the offset between 2 vertically adjacent pixels
+ * the asm comments below name the unmodified input lines L0 ... L9 and omit the division
+ * NOTE(review): PAVGB is a macro defined outside of this function, presumably a byte wise average,
+ *               2 chained averages would then round differently than the single >>2 of the C path - confirm
+ */
+static inline void deInterlaceBlendLinear(uint8_t src[], int stride)
+{
+#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
+       asm volatile(
+               "leal (%0, %1), %%eax                           \n\t" // eax = src + stride   (line 1)
+               "leal (%%eax, %1, 4), %%ebx                     \n\t" // ebx = src + 5*stride (line 5)
+//     0       1       2       3       4       5       6       7       8       9
+//     %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1
+
+               "movq (%0), %%mm0                               \n\t" // L0
+               "movq (%%eax, %1), %%mm1                        \n\t" // L2
+               PAVGB(%%mm1, %%mm0)                                   // L0+L2
+               "movq (%%eax), %%mm2                            \n\t" // L1
+               PAVGB(%%mm2, %%mm0)                                   // 2L1 + L0 + L2
+               "movq %%mm0, (%0)                               \n\t" // new line 0
+               "movq (%%eax, %1, 2), %%mm0                     \n\t" // L3
+               PAVGB(%%mm0, %%mm2)                                   // L1+L3
+               PAVGB(%%mm1, %%mm2)                                   // 2L2 + L1 + L3
+               "movq %%mm2, (%%eax)                            \n\t" // new line 1
+               "movq (%0, %1, 4), %%mm2                        \n\t" // L4
+               PAVGB(%%mm2, %%mm1)                                   // L2+L4
+               PAVGB(%%mm0, %%mm1)                                   // 2L3 + L2 + L4
+               "movq %%mm1, (%%eax, %1)                        \n\t" // new line 2
+               "movq (%%ebx), %%mm1                            \n\t" // L5
+               PAVGB(%%mm1, %%mm0)                                   // L3+L5
+               PAVGB(%%mm2, %%mm0)                                   // 2L4 + L3 + L5
+               "movq %%mm0, (%%eax, %1, 2)                     \n\t" // new line 3
+               "movq (%%ebx, %1), %%mm0                        \n\t" // L6
+               PAVGB(%%mm0, %%mm2)                                   // L4+L6
+               PAVGB(%%mm1, %%mm2)                                   // 2L5 + L4 + L6
+               "movq %%mm2, (%0, %1, 4)                        \n\t" // new line 4
+               "movq (%%ebx, %1, 2), %%mm2                     \n\t" // L7
+               PAVGB(%%mm2, %%mm1)                                   // L5+L7
+               PAVGB(%%mm0, %%mm1)                                   // 2L6 + L5 + L7
+               "movq %%mm1, (%%ebx)                            \n\t" // new line 5
+               "movq (%0, %1, 8), %%mm1                        \n\t" // L8
+               PAVGB(%%mm1, %%mm0)                                   // L6+L8
+               PAVGB(%%mm2, %%mm0)                                   // 2L7 + L6 + L8
+               "movq %%mm0, (%%ebx, %1)                        \n\t" // new line 6
+               "movq (%%ebx, %1, 4), %%mm0                     \n\t" // L9
+               PAVGB(%%mm0, %%mm2)                                   // L7+L9
+               PAVGB(%%mm1, %%mm2)                                   // 2L8 + L7 + L9
+               "movq %%mm2, (%%ebx, %1, 2)                     \n\t" // new line 7
+
+
+               : : "r" (src), "r" (stride)
+               : "%eax", "%ebx" // NOTE(review): the asm stores through src but no "memory" clobber is listed - confirm this is safe
+       );
+#else
+       int x;
+       for(x=0; x<8; x++) // one column per iteration
+       {
+               // done top to bottom in place: every line only reads itself and the 2 not yet modified lines below
+               src[0       ] = (src[0       ] + 2*src[stride  ] + src[stride*2])>>2;
+               src[stride  ] = (src[stride  ] + 2*src[stride*2] + src[stride*3])>>2;
+               src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2;
+               src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2;
+               src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2;
+               src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2;
+               src[stride*6] = (src[stride*6] + 2*src[stride*7] + src[stride*8])>>2;
+               src[stride*7] = (src[stride*7] + 2*src[stride*8] + src[stride*9])>>2;
+               src++;
+       }
+#endif
+}
+
+/**
+ * Deinterlaces the given block with a vertical (1,2,1)/4 blend filter, variant which never
+ * touches anything below line 7:
+ * line n (n = 0 ... 5) is overwritten with (Ln + 2*L(n+1) + L(n+2))/4 of the old lines,
+ * lines 6 and 7 are both overwritten with the average of the old lines 6 and 7
+ * will be called for every 8x8 block, in the last row, and can read & write into an 8x8 block
+ * will shift the image up by 1 line (FIXME if this is a problem)
+ * src points to line 0 of the block, stride is the offset between 2 vertically adjacent pixels
+ * the asm comments below name the unmodified input lines L0 ... L7 and omit the division
+ */
+static inline void deInterlaceBlendLinearLastRow(uint8_t src[], int stride)
+{
+// this used to test HAVE_MMSX2, a typo which silently disabled the asm path on MMX2 only builds
+#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
+       asm volatile(
+               "leal (%0, %1), %%eax                           \n\t" // eax = src + stride   (line 1)
+               "leal (%%eax, %1, 4), %%ebx                     \n\t" // ebx = src + 5*stride (line 5)
+//     0       1       2       3       4       5       6       7       8       9
+//     %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1
+
+               "movq (%0), %%mm0                               \n\t" // L0
+               "movq (%%eax, %1), %%mm1                        \n\t" // L2
+               PAVGB(%%mm1, %%mm0)                                   // L0+L2
+               "movq (%%eax), %%mm2                            \n\t" // L1
+               PAVGB(%%mm2, %%mm0)                                   // 2L1 + L0 + L2
+               "movq %%mm0, (%0)                               \n\t" // new line 0
+               "movq (%%eax, %1, 2), %%mm0                     \n\t" // L3
+               PAVGB(%%mm0, %%mm2)                                   // L1+L3
+               PAVGB(%%mm1, %%mm2)                                   // 2L2 + L1 + L3
+               "movq %%mm2, (%%eax)                            \n\t" // new line 1
+               "movq (%0, %1, 4), %%mm2                        \n\t" // L4
+               PAVGB(%%mm2, %%mm1)                                   // L2+L4
+               PAVGB(%%mm0, %%mm1)                                   // 2L3 + L2 + L4
+               "movq %%mm1, (%%eax, %1)                        \n\t" // new line 2
+               "movq (%%ebx), %%mm1                            \n\t" // L5
+               PAVGB(%%mm1, %%mm0)                                   // L3+L5
+               PAVGB(%%mm2, %%mm0)                                   // 2L4 + L3 + L5
+               "movq %%mm0, (%%eax, %1, 2)                     \n\t" // new line 3
+               "movq (%%ebx, %1), %%mm0                        \n\t" // L6
+               PAVGB(%%mm0, %%mm2)                                   // L4+L6
+               PAVGB(%%mm1, %%mm2)                                   // 2L5 + L4 + L6
+               "movq %%mm2, (%0, %1, 4)                        \n\t" // new line 4
+               "movq (%%ebx, %1, 2), %%mm2                     \n\t" // L7
+               PAVGB(%%mm2, %%mm1)                                   // L5+L7
+               PAVGB(%%mm0, %%mm1)                                   // 2L6 + L5 + L7
+               "movq %%mm1, (%%ebx)                            \n\t" // new line 5
+               PAVGB(%%mm2, %%mm0)                                   // L6 + L7 (there is no L8 in the last row)
+               "movq %%mm0, (%%ebx, %1)                        \n\t" // new line 6
+               "movq %%mm0, (%%ebx, %1, 2)                     \n\t" // new line 7 = new line 6
+
+               : : "r" (src), "r" (stride)
+               : "%eax", "%ebx"
+       );
+#else
+       int x;
+       for(x=0; x<8; x++) // one column per iteration
+       {
+               // done top to bottom in place: every line only reads itself and the not yet modified lines below
+               src[0       ] = (src[0       ] + 2*src[stride  ] + src[stride*2])>>2;
+               src[stride  ] = (src[stride  ] + 2*src[stride*2] + src[stride*3])>>2;
+               src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2;
+               src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2;
+               src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2;
+               src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2;
+               src[stride*6] = (src[stride*6] +   src[stride*7])>>1;
+               src[stride*7] = src[stride*6]; // copies the line 6 value computed just above
+               src++;
+       }
+#endif
+}
+
+/**
+ * Deinterlaces the given block with a vertical 3 tap median filter:
+ * lines 1, 3, 5 and 7 are overwritten with the median of themselves and the lines directly
+ * above and below them, the even lines are kept as they are and line 8 (first line of the
+ * block below) is only read
+ * will be called for every 8x8 block, except the last row, and can read & write into an 8x16 block
+ * src points to line 0 of the block, stride is the offset between 2 vertically adjacent pixels
+ * the MMX2 and the C path compute the same result
+ */
+static inline void deInterlaceMedian(uint8_t src[], int stride)
+{
+#if defined (HAVE_MMX2)
+       asm volatile(
+               "leal (%0, %1), %%eax                           \n\t" // eax = src + stride   (line 1)
+               "leal (%%eax, %1, 4), %%ebx                     \n\t" // ebx = src + 5*stride (line 5)
+//     0       1       2       3       4       5       6       7       8       9
+//     %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1
+
+               "movq (%0), %%mm0                               \n\t" // L0
+               "movq (%%eax, %1), %%mm2                        \n\t" // L2
+               "movq (%%eax), %%mm1                            \n\t" // L1
+               "movq %%mm0, %%mm3                              \n\t"
+               "pmaxub %%mm1, %%mm0                            \n\t" // max(L0, L1)
+               "pminub %%mm3, %%mm1                            \n\t" // min(L0, L1)
+               "pmaxub %%mm2, %%mm1                            \n\t" // max(L2, min(L0, L1))
+               "pminub %%mm1, %%mm0                            \n\t" // median(L0, L1, L2)
+               "movq %%mm0, (%%eax)                            \n\t" // new line 1
+
+               "movq (%0, %1, 4), %%mm0                        \n\t" // L4
+               "movq (%%eax, %1, 2), %%mm1                     \n\t" // L3
+               "movq %%mm2, %%mm3                              \n\t"
+               "pmaxub %%mm1, %%mm2                            \n\t" // max(L2, L3)
+               "pminub %%mm3, %%mm1                            \n\t" // min(L2, L3)
+               "pmaxub %%mm0, %%mm1                            \n\t" // max(L4, min(L2, L3))
+               "pminub %%mm1, %%mm2                            \n\t" // median(L2, L3, L4)
+               "movq %%mm2, (%%eax, %1, 2)                     \n\t" // new line 3
+
+               "movq (%%ebx), %%mm2                            \n\t" // L5
+               "movq (%%ebx, %1), %%mm1                        \n\t" // L6
+               "movq %%mm2, %%mm3                              \n\t"
+               "pmaxub %%mm0, %%mm2                            \n\t" // max(L4, L5)
+               "pminub %%mm3, %%mm0                            \n\t" // min(L4, L5)
+               "pmaxub %%mm1, %%mm0                            \n\t" // max(L6, min(L4, L5))
+               "pminub %%mm0, %%mm2                            \n\t" // median(L4, L5, L6)
+               "movq %%mm2, (%%ebx)                            \n\t" // new line 5
+
+               "movq (%%ebx, %1, 2), %%mm2                     \n\t" // L7
+               "movq (%0, %1, 8), %%mm0                        \n\t" // L8 (belongs to the block below)
+               "movq %%mm2, %%mm3                              \n\t"
+               "pmaxub %%mm0, %%mm2                            \n\t" // max(L7, L8)
+               "pminub %%mm3, %%mm0                            \n\t" // min(L7, L8)
+               "pmaxub %%mm1, %%mm0                            \n\t" // max(L6, min(L7, L8))
+               "pminub %%mm0, %%mm2                            \n\t" // median(L6, L7, L8)
+               "movq %%mm2, (%%ebx, %1, 2)                     \n\t" // new line 7
+
+
+               : : "r" (src), "r" (stride)
+               : "%eax", "%ebx"
+       );
+#else
+       // the old fallback was a copy of the linear blend filter (it modified all 8 lines and
+       // shifted the image), this is the same median as the MMX2 code above
+       int x;
+       for(x=0; x<8; x++) // one column per iteration
+       {
+               int y;
+               for(y=1; y<8; y+=2) // only the odd lines are replaced
+               {
+                       const int a= src[stride*(y-1)];
+                       const int b= src[stride* y   ];
+                       const int c= src[stride*(y+1)];
+                       const int lo= a < b ? a : b;
+                       const int hi= a < b ? b : a;
+                       const int mid= lo > c ? lo : c; // max(c, min(a, b))
+                       // median(a, b, c) = min(max(a, b), max(c, min(a, b)))
+                       src[stride*y]= hi < mid ? hi : mid;
+               }
+               src++;
+       }
+#endif
+}
+
+/**
+ * Deinterlaces the given block with a vertical 3 tap median filter, variant which never
+ * touches anything below line 7:
+ * lines 1, 3 and 5 are overwritten with the median of themselves and the lines directly
+ * above and below them, line 7 is overwritten with a copy of line 6,
+ * the even lines are kept as they are (so unlike the blend filter this does not shift the image)
+ * will be called for every 8x8 block, in the last row, and can read & write into an 8x8 block
+ * src points to line 0 of the block, stride is the offset between 2 vertically adjacent pixels
+ * the MMX2 and the C path compute the same result
+ */
+static inline void deInterlaceMedianLastRow(uint8_t src[], int stride)
+{
+#if defined (HAVE_MMX2)
+       asm volatile(
+               "leal (%0, %1), %%eax                           \n\t" // eax = src + stride   (line 1)
+               "leal (%%eax, %1, 4), %%ebx                     \n\t" // ebx = src + 5*stride (line 5)
+//     0       1       2       3       4       5       6       7       8       9
+//     %0      eax     eax+%1  eax+2%1 %0+4%1  ebx     ebx+%1  ebx+2%1 %0+8%1  ebx+4%1
+
+               "movq (%0), %%mm0                               \n\t" // L0
+               "movq (%%eax, %1), %%mm2                        \n\t" // L2
+               "movq (%%eax), %%mm1                            \n\t" // L1
+               "movq %%mm0, %%mm3                              \n\t"
+               "pmaxub %%mm1, %%mm0                            \n\t" // max(L0, L1)
+               "pminub %%mm3, %%mm1                            \n\t" // min(L0, L1)
+               "pmaxub %%mm2, %%mm1                            \n\t" // max(L2, min(L0, L1))
+               "pminub %%mm1, %%mm0                            \n\t" // median(L0, L1, L2)
+               "movq %%mm0, (%%eax)                            \n\t" // new line 1
+
+               "movq (%0, %1, 4), %%mm0                        \n\t" // L4
+               "movq (%%eax, %1, 2), %%mm1                     \n\t" // L3
+               "movq %%mm2, %%mm3                              \n\t"
+               "pmaxub %%mm1, %%mm2                            \n\t" // max(L2, L3)
+               "pminub %%mm3, %%mm1                            \n\t" // min(L2, L3)
+               "pmaxub %%mm0, %%mm1                            \n\t" // max(L4, min(L2, L3))
+               "pminub %%mm1, %%mm2                            \n\t" // median(L2, L3, L4)
+               "movq %%mm2, (%%eax, %1, 2)                     \n\t" // new line 3
+
+               "movq (%%ebx), %%mm2                            \n\t" // L5
+               "movq (%%ebx, %1), %%mm1                        \n\t" // L6
+               "movq %%mm2, %%mm3                              \n\t"
+               "pmaxub %%mm0, %%mm2                            \n\t" // max(L4, L5)
+               "pminub %%mm3, %%mm0                            \n\t" // min(L4, L5)
+               "pmaxub %%mm1, %%mm0                            \n\t" // max(L6, min(L4, L5))
+               "pminub %%mm0, %%mm2                            \n\t" // median(L4, L5, L6)
+               "movq %%mm2, (%%ebx)                            \n\t" // new line 5
+
+               "movq %%mm1, (%%ebx, %1, 2)                     \n\t" // new line 7 = L6, there is no line 8
+
+               : : "r" (src), "r" (stride)
+               : "%eax", "%ebx"
+       );
+#else
+       // the old fallback was a copy of the linear blend filter (it modified all 8 lines and
+       // shifted the image), this is the same median as the MMX2 code above
+       int x;
+       for(x=0; x<8; x++) // one column per iteration
+       {
+               int y;
+               for(y=1; y<6; y+=2) // lines 1, 3 and 5 have a line above and below inside the block
+               {
+                       const int a= src[stride*(y-1)];
+                       const int b= src[stride* y   ];
+                       const int c= src[stride*(y+1)];
+                       const int lo= a < b ? a : b;
+                       const int hi= a < b ? b : a;
+                       const int mid= lo > c ? lo : c; // max(c, min(a, b))
+                       // median(a, b, c) = min(max(a, b), max(c, min(a, b)))
+                       src[stride*y]= hi < mid ? hi : mid;
+               }
+               src[stride*7] = src[stride*6]; // no line below, duplicate line 6
+               src++;
+       }
+#endif
+}
+
+
 #ifdef HAVE_ODIVX_POSTPROCESS
 #include "../opendivx/postprocess.h"
 int use_old_pp=0;
@@ -1841,7 +2269,6 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
  * the mode value is interpreted as a quality value if its negative, its range is then (-1 ... -63)
  * -63 is best quality -1 is worst
  */
-//extern "C"{
 void  postprocess(unsigned char * src[], int src_stride,
                  unsigned char * dst[], int dst_stride,
                  int horizontal_size,   int vertical_size,
@@ -2196,6 +2623,17 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
                                blockCopy(vertBlock + dstStride*2, dstStride,
                                        vertSrcBlock + srcStride*2, srcStride, 8, mode & LEVEL_FIX);
 
+                               if(mode & LINEAR_IPOL_DEINT_FILTER)
+                                       deInterlaceInterpolateLinear(dstBlock, dstStride);
+                               else if(mode & LINEAR_BLEND_DEINT_FILTER)
+                                       deInterlaceBlendLinear(dstBlock, dstStride);
+                               else if(mode & MEDIAN_DEINT_FILTER)
+                                       deInterlaceMedian(dstBlock, dstStride);
+/*                             else if(mode & CUBIC_IPOL_DEINT_FILTER)
+                                       deInterlaceInterpolateCubic(dstBlock, dstStride);
+                               else if(mode & CUBIC_BLEND_DEINT_FILTER)
+                                       deInterlaceBlendCubic(dstBlock, dstStride);
+*/
 
 #ifdef MORE_TIMEING
                                T1= rdtsc();
@@ -2226,9 +2664,22 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
 #endif
                        }
                        else
+                       {
                                blockCopy(vertBlock + dstStride*1, dstStride,
                                        vertSrcBlock + srcStride*1, srcStride, 4, mode & LEVEL_FIX);
 
+                               if(mode & LINEAR_IPOL_DEINT_FILTER)
+                                       deInterlaceInterpolateLinearLastRow(dstBlock, dstStride);
+                               else if(mode & LINEAR_BLEND_DEINT_FILTER)
+                                       deInterlaceBlendLinearLastRow(dstBlock, dstStride);
+                               else if(mode & MEDIAN_DEINT_FILTER)
+                                       deInterlaceMedianLastRow(dstBlock, dstStride);
+/*                             else if(mode & CUBIC_IPOL_DEINT_FILTER)
+                                       deInterlaceInterpolateCubicLastRow(dstBlock, dstStride);
+                               else if(mode & CUBIC_BLEND_DEINT_FILTER)
+                                       deInterlaceBlendCubicLastRow(dstBlock, dstStride);
+*/
+                       }
 
                        if(x - 8 >= 0 && x<width)
                        {