temp denoiser:
authorMichael Niedermayer <michaelni@gmx.at>
Wed, 14 Nov 2001 11:51:36 +0000 (11:51 +0000)
committerMichael Niedermayer <michaelni@gmx.at>
Wed, 14 Nov 2001 11:51:36 +0000 (11:51 +0000)
 bugfix?
 averageing noise over the surrounding blocks

Originally committed as revision 2900 to svn://svn.mplayerhq.hu/mplayer/trunk/postproc

postproc/postprocess.c
postproc/postprocess_template.c

index 8ad5e7b..0d20ad4 100644 (file)
@@ -153,6 +153,7 @@ static uint64_t __attribute__((aligned(8))) temp5=0;
 static uint64_t __attribute__((aligned(8))) pQPb=0;
 static uint64_t __attribute__((aligned(8))) pQPb2=0;
 static uint8_t __attribute__((aligned(8))) tempBlocks[8*16*2]; //used for the horizontal code
+static uint32_t __attribute__((aligned(4))) maxTmpNoise[4];
 #else
 static uint64_t packedYOffset= 0x0000000000000000LL;
 static uint64_t packedYScale=  0x0100010001000100LL;
@@ -2596,7 +2597,7 @@ static inline void transpose2(uint8_t *dst, int dstStride, uint8_t *src)
 //static int test=0;
 
 static void inline tempNoiseReducer(uint8_t *src, int stride,
-                                   uint8_t *tempBlured, int *maxNoise)
+                                   uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise)
 {
 #define FAST_L2_DIFF
 //#define L1_DIFF //u should change the thresholds too if u try that one
@@ -2694,12 +2695,23 @@ L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))
                "psrlq $32, %%mm0                               \n\t"
                "paddd %%mm0, %%mm4                             \n\t"
                "movd %%mm4, %%ecx                              \n\t"
+               "shll $2, %%ecx                                 \n\t"
+               "movl %3, %%ebx                                 \n\t"
+               "addl -4(%%ebx), %%ecx                          \n\t"
+               "addl 4(%%ebx), %%ecx                           \n\t"
+               "addl -1024(%%ebx), %%ecx                       \n\t"
+               "addl $4, %%ecx                                 \n\t"
+               "addl 1024(%%ebx), %%ecx                        \n\t"
+               "shrl $3, %%ecx                                 \n\t"
+               "movl %%ecx, (%%ebx)                            \n\t"
+               "leal (%%eax, %2, 2), %%ebx                     \n\t" // 5*stride
+
 //             "movl %3, %%ecx                         \n\t"
 //             "movl %%ecx, test                               \n\t"
 //             "jmp 4f \n\t"
-               "cmpl %4, %%ecx                         \n\t"
+               "cmpl 4+maxTmpNoise, %%ecx                      \n\t"
                " jb 2f                                         \n\t"
-               "cmpl %5, %%ecx                         \n\t"
+               "cmpl 8+maxTmpNoise, %%ecx                      \n\t"
                " jb 1f                                         \n\t"
 
                "leal (%%ebx, %2, 2), %%ecx                     \n\t" // 7*stride
@@ -2758,7 +2770,7 @@ L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))
                "jmp 4f                                         \n\t"
 
                "2:                                             \n\t"
-               "cmpl %3, %%ecx                                 \n\t"
+               "cmpl maxTmpNoise, %%ecx                        \n\t"
                " jb 3f                                         \n\t"
 
                "leal (%%ebx, %2, 2), %%ecx                     \n\t" // 7*stride
@@ -2875,8 +2887,7 @@ L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))
 
                "4:                                             \n\t"
 
-               :: "r" (src), "r" (tempBlured), "r"(stride),
-                  "m"(maxNoise[0]), "m"(maxNoise[1]), "m"(maxNoise[2])
+               :: "r" (src), "r" (tempBlured), "r"(stride), "m" (tempBluredPast)
                : "%eax", "%ebx", "%ecx", "memory"
                );
 //printf("%d\n", test);
@@ -2884,6 +2895,7 @@ L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))
        int y;
        int d=0;
        int sysd=0;
+       int i;
 
        for(y=0; y<8; y++)
        {
@@ -2900,6 +2912,16 @@ L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))
                        sysd+= d1;
                }
        }
+       i=d;
+       d=      (
+               4*d
+               +(*(tempBluredPast-256))
+               +(*(tempBluredPast-1))+ (*(tempBluredPast+1))
+               +(*(tempBluredPast+256))
+               +4)>>3;
+       *tempBluredPast=i;
+//     ((*tempBluredPast)*3 + d + 2)>>2;
+
 //printf("%d %d %d\n", maxNoise[0], maxNoise[1], maxNoise[2]);
 /*
 Switch between
@@ -3462,6 +3484,7 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
 
        /* Temporal noise reducing buffers */
        static uint8_t *tempBlured[3]= {NULL,NULL,NULL};
+       static uint32_t *tempBluredPast[3]= {NULL,NULL,NULL};
 
 #ifdef PP_FUNNY_STRIDE
        uint8_t *dstBlockPtrBackup;
@@ -3476,6 +3499,11 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
        sumTime= rdtsc();
 #endif
 //mode= 0x7F;
+#ifdef HAVE_MMX
+       maxTmpNoise[0]= ppMode->maxTmpNoise[0];
+       maxTmpNoise[1]= ppMode->maxTmpNoise[1];
+       maxTmpNoise[2]= ppMode->maxTmpNoise[2];
+#endif
 
        if(tempDst==NULL)
        {
@@ -3491,8 +3519,10 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
                //FIXME works only as long as the size doesnt increase
                //Note:the +17*1024 is just there so i dont have to worry about r/w over te end
                tempBlured[isColor]= (uint8_t*)memalign(8, dstStride*((height+7)&(~7)) + 17*1024);
+               tempBluredPast[isColor]= (uint32_t*)memalign(8, 256*((height+7)&(~7))/2 + 17*1024);
 
                memset(tempBlured[isColor], 0, dstStride*((height+7)&(~7)) + 17*1024);
+               memset(tempBluredPast[isColor], 0, 256*((height+7)&(~7))/2 + 17*1024);
        }
 
        if(!yHistogram)
@@ -3882,6 +3912,7 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
                                {
                                        tempNoiseReducer(dstBlock-8, stride,
                                                tempBlured[isColor] + y*dstStride + x,
+                                               tempBluredPast[isColor] + (y>>3)*256 + (x>>3),
                                                ppMode->maxTmpNoise);
                                }
                        }
@@ -3920,6 +3951,7 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
                {
                        tempNoiseReducer(dstBlock-8, dstStride,
                                tempBlured[isColor] + y*dstStride + x,
+                               tempBluredPast[isColor] + (y>>3)*256 + (x>>3),
                                ppMode->maxTmpNoise);
                }
 
index 8ad5e7b..0d20ad4 100644 (file)
@@ -153,6 +153,7 @@ static uint64_t __attribute__((aligned(8))) temp5=0;
 static uint64_t __attribute__((aligned(8))) pQPb=0;
 static uint64_t __attribute__((aligned(8))) pQPb2=0;
 static uint8_t __attribute__((aligned(8))) tempBlocks[8*16*2]; //used for the horizontal code
+static uint32_t __attribute__((aligned(4))) maxTmpNoise[4];
 #else
 static uint64_t packedYOffset= 0x0000000000000000LL;
 static uint64_t packedYScale=  0x0100010001000100LL;
@@ -2596,7 +2597,7 @@ static inline void transpose2(uint8_t *dst, int dstStride, uint8_t *src)
 //static int test=0;
 
 static void inline tempNoiseReducer(uint8_t *src, int stride,
-                                   uint8_t *tempBlured, int *maxNoise)
+                                   uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise)
 {
 #define FAST_L2_DIFF
 //#define L1_DIFF //u should change the thresholds too if u try that one
@@ -2694,12 +2695,23 @@ L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))
                "psrlq $32, %%mm0                               \n\t"
                "paddd %%mm0, %%mm4                             \n\t"
                "movd %%mm4, %%ecx                              \n\t"
+               "shll $2, %%ecx                                 \n\t"
+               "movl %3, %%ebx                                 \n\t"
+               "addl -4(%%ebx), %%ecx                          \n\t"
+               "addl 4(%%ebx), %%ecx                           \n\t"
+               "addl -1024(%%ebx), %%ecx                       \n\t"
+               "addl $4, %%ecx                                 \n\t"
+               "addl 1024(%%ebx), %%ecx                        \n\t"
+               "shrl $3, %%ecx                                 \n\t"
+               "movl %%ecx, (%%ebx)                            \n\t"
+               "leal (%%eax, %2, 2), %%ebx                     \n\t" // 5*stride
+
 //             "movl %3, %%ecx                         \n\t"
 //             "movl %%ecx, test                               \n\t"
 //             "jmp 4f \n\t"
-               "cmpl %4, %%ecx                         \n\t"
+               "cmpl 4+maxTmpNoise, %%ecx                      \n\t"
                " jb 2f                                         \n\t"
-               "cmpl %5, %%ecx                         \n\t"
+               "cmpl 8+maxTmpNoise, %%ecx                      \n\t"
                " jb 1f                                         \n\t"
 
                "leal (%%ebx, %2, 2), %%ecx                     \n\t" // 7*stride
@@ -2758,7 +2770,7 @@ L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))
                "jmp 4f                                         \n\t"
 
                "2:                                             \n\t"
-               "cmpl %3, %%ecx                                 \n\t"
+               "cmpl maxTmpNoise, %%ecx                        \n\t"
                " jb 3f                                         \n\t"
 
                "leal (%%ebx, %2, 2), %%ecx                     \n\t" // 7*stride
@@ -2875,8 +2887,7 @@ L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))
 
                "4:                                             \n\t"
 
-               :: "r" (src), "r" (tempBlured), "r"(stride),
-                  "m"(maxNoise[0]), "m"(maxNoise[1]), "m"(maxNoise[2])
+               :: "r" (src), "r" (tempBlured), "r"(stride), "m" (tempBluredPast)
                : "%eax", "%ebx", "%ecx", "memory"
                );
 //printf("%d\n", test);
@@ -2884,6 +2895,7 @@ L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))
        int y;
        int d=0;
        int sysd=0;
+       int i;
 
        for(y=0; y<8; y++)
        {
@@ -2900,6 +2912,16 @@ L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))
                        sysd+= d1;
                }
        }
+       i=d;
+       d=      (
+               4*d
+               +(*(tempBluredPast-256))
+               +(*(tempBluredPast-1))+ (*(tempBluredPast+1))
+               +(*(tempBluredPast+256))
+               +4)>>3;
+       *tempBluredPast=i;
+//     ((*tempBluredPast)*3 + d + 2)>>2;
+
 //printf("%d %d %d\n", maxNoise[0], maxNoise[1], maxNoise[2]);
 /*
 Switch between
@@ -3462,6 +3484,7 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
 
        /* Temporal noise reducing buffers */
        static uint8_t *tempBlured[3]= {NULL,NULL,NULL};
+       static uint32_t *tempBluredPast[3]= {NULL,NULL,NULL};
 
 #ifdef PP_FUNNY_STRIDE
        uint8_t *dstBlockPtrBackup;
@@ -3476,6 +3499,11 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
        sumTime= rdtsc();
 #endif
 //mode= 0x7F;
+#ifdef HAVE_MMX
+       maxTmpNoise[0]= ppMode->maxTmpNoise[0];
+       maxTmpNoise[1]= ppMode->maxTmpNoise[1];
+       maxTmpNoise[2]= ppMode->maxTmpNoise[2];
+#endif
 
        if(tempDst==NULL)
        {
@@ -3491,8 +3519,10 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
                //FIXME works only as long as the size doesnt increase
                //Note:the +17*1024 is just there so i dont have to worry about r/w over te end
                tempBlured[isColor]= (uint8_t*)memalign(8, dstStride*((height+7)&(~7)) + 17*1024);
+               tempBluredPast[isColor]= (uint32_t*)memalign(8, 256*((height+7)&(~7))/2 + 17*1024);
 
                memset(tempBlured[isColor], 0, dstStride*((height+7)&(~7)) + 17*1024);
+               memset(tempBluredPast[isColor], 0, 256*((height+7)&(~7))/2 + 17*1024);
        }
 
        if(!yHistogram)
@@ -3882,6 +3912,7 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
                                {
                                        tempNoiseReducer(dstBlock-8, stride,
                                                tempBlured[isColor] + y*dstStride + x,
+                                               tempBluredPast[isColor] + (y>>3)*256 + (x>>3),
                                                ppMode->maxTmpNoise);
                                }
                        }
@@ -3920,6 +3951,7 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri
                {
                        tempNoiseReducer(dstBlock-8, dstStride,
                                tempBlured[isColor] + y*dstStride + x,
+                               tempBluredPast[isColor] + (y>>3)*256 + (x>>3),
                                ppMode->maxTmpNoise);
                }