From 1916a3b075ce5a71e69c42982025e252407488d2 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Sebastian=20Dr=C3=B6ge?= <slomo@circular-chaos.org>
Date: Tue, 24 Jun 2008 09:10:46 +0000
Subject: [PATCH] [MOVED FROM BAD 09/56] gst/deinterlace2/tvtime/greedy.c: Fix
 the C implementation to produce correct results and optimize the MMXEXT
 implementation

Original commit message from CVS:
* gst/deinterlace2/tvtime/greedy.c:
(deinterlace_greedy_packed422_scanline_c),
(deinterlace_greedy_packed422_scanline_mmxext),
(deinterlace_greedy_packed422_scanline):
Fix the C implementation to produce correct results and optimize the
MMXEXT implementation.
Handle odd widths and don't read over array boundaries in the MMXEXT
implementation.
* gst/deinterlace2/tvtime/vfir.c: (deinterlace_line_c),
(deinterlace_line_mmx), (deinterlace_scanline_vfir):
Fix a small rounding bug in the MMX implementation. The MMX
implementation doesn't actually need MMXEXT instructions, so don't mark
it as such.
Handle odd widths in both implementations.
---
 gst/deinterlace2/tvtime/greedy.c | 158 ++++++++++++++++++---------------------
 gst/deinterlace2/tvtime/vfir.c   |  77 ++++++++++---------
 2 files changed, 112 insertions(+), 123 deletions(-)

diff --git a/gst/deinterlace2/tvtime/greedy.c b/gst/deinterlace2/tvtime/greedy.c
index c25af03..66b8799 100644
--- a/gst/deinterlace2/tvtime/greedy.c
+++ b/gst/deinterlace2/tvtime/greedy.c
@@ -60,26 +60,65 @@ copy_scanline (GstDeinterlace2 * object,
   blit_packed422_scanline (output, data->m1, object->frame_width);
 }
 
-static int GreedyMaxComb = 15;
+static const int GreedyMaxComb = 15;
 
-#ifdef HAVE_CPU_I386
-#include "mmx.h"
-#include "sse.h"
-static void
-deinterlace_greedy_packed422_scanline_sse (GstDeinterlace2 * object,
-    deinterlace_scanline_data_t * data, uint8_t * output)
+static inline void
+deinterlace_greedy_packed422_scanline_c (GstDeinterlace2 * object,
+    uint8_t * m0, uint8_t * t1, uint8_t * b1, uint8_t * m2, uint8_t * output,
+    int width)
 {
-  mmx_t MaxComb;
+  int avg, l2_diff, lp2_diff, max, min, best;
+
+  // L2 == m0
+  // L1 == t1
+  // L3 == b1
+  // LP2 == m2
+
+  while (width--) {
+    avg = (*t1 + *b1) / 2;
+
+    l2_diff = ABS (*m0 - avg);
+    lp2_diff = ABS (*m2 - avg);
+
+    if (l2_diff > lp2_diff)
+      best = *m2;
+    else
+      best = *m0;
+
+    max = MAX (*t1, *b1);
+    min = MIN (*t1, *b1);
+
+    if (max < 256 - GreedyMaxComb)
+      max += GreedyMaxComb;
+    else
+      max = 255;
 
-  uint8_t *m0 = data->m0;
+    if (min > GreedyMaxComb)
+      min -= GreedyMaxComb;
+    else
+      min = 0;
 
-  uint8_t *t1 = data->t1;
+    *output = CLAMP (best, min, max);
 
-  uint8_t *b1 = data->b1;
+    // Advance to the next set of pixels.
+    output += 1;
+    m0 += 1;
+    t1 += 1;
+    b1 += 1;
+    m2 += 1;
+  }
+}
 
-  uint8_t *m2 = data->m2;
+#ifdef HAVE_CPU_I386
+#include "mmx.h"
+#include "sse.h"
 
-  int width = object->frame_width;
+static void
+deinterlace_greedy_packed422_scanline_mmxext (GstDeinterlace2 * object,
+    uint8_t * m0, uint8_t * t1, uint8_t * b1, uint8_t * m2, uint8_t * output,
+    int width)
+{
+  mmx_t MaxComb;
 
   // How badly do we let it weave? 0-255
   MaxComb.ub[0] = GreedyMaxComb;
@@ -96,8 +135,7 @@ deinterlace_greedy_packed422_scanline_sse (GstDeinterlace2 * object,
   // L3 == b1
   // LP2 == m2
 
-  width /= 4;
-  while (width--) {
+  for (; width > 7; width -= 8) {
     movq_m2r (*t1, mm1);        // L1
     movq_m2r (*m0, mm2);        // L2
     movq_m2r (*b1, mm3);        // L3
@@ -107,15 +145,12 @@ deinterlace_greedy_packed422_scanline_sse (GstDeinterlace2 * object,
     movq_r2r (mm1, mm4);        // L1
     pavgb_r2r (mm3, mm4);       // (L1 + L3)/2
 
-
     // get abs value of possible L2 comb
     movq_r2r (mm2, mm7);        // L2
     psubusb_r2r (mm4, mm7);     // L2 - avg
     movq_r2r (mm4, mm5);        // avg
     psubusb_r2r (mm2, mm5);     // avg - L2
     por_r2r (mm7, mm5);         // abs(avg-L2)
-    movq_r2r (mm4, mm6);        // copy of avg for later
-
 
     // get abs value of possible LP2 comb
     movq_r2r (mm0, mm7);        // LP2
@@ -125,7 +160,7 @@ deinterlace_greedy_packed422_scanline_sse (GstDeinterlace2 * object,
 
     // use L2 or LP2 depending upon which makes smaller comb
     psubusb_r2r (mm5, mm4);     // see if it goes to zero
-    psubusb_r2r (mm5, mm5);     // 0
+    pxor_r2r (mm5, mm5);        // 0
     pcmpeqb_r2r (mm5, mm4);     // if (mm4=0) then FF else 0
     pcmpeqb_r2r (mm4, mm5);     // opposite of mm4
 
@@ -140,27 +175,19 @@ deinterlace_greedy_packed422_scanline_sse (GstDeinterlace2 * object,
     // detail than a boring oversmoothed clip.
 
     movq_r2r (mm1, mm2);        // copy L1
-    psubusb_r2r (mm3, mm2);     // - L3, with saturation
-    paddusb_r2r (mm3, mm2);     // now = Max(L1,L3)
+    pmaxub_r2r (mm3, mm2);      // now = Max(L1,L3)
 
-    pcmpeqb_r2r (mm7, mm7);     // all ffffffff
-    psubusb_r2r (mm1, mm7);     // - L1 
-    paddusb_r2r (mm7, mm3);     // add, may sat at fff..
-    psubusb_r2r (mm7, mm3);     // now = Min(L1,L3)
+    pminub_r2r (mm1, mm3);      // now = Min(L1,L3)
 
     // allow the value to be above the high or below the low by amt of MaxComb
     paddusb_m2r (MaxComb, mm2); // increase max by diff
     psubusb_m2r (MaxComb, mm3); // lower min by diff
 
-    psubusb_r2r (mm3, mm4);     // best - Min
-    paddusb_r2r (mm3, mm4);     // now = Max(best,Min(L1,L3)
 
-    pcmpeqb_r2r (mm7, mm7);     // all ffffffff
-    psubusb_r2r (mm4, mm7);     // - Max(best,Min(best,L3) 
-    paddusb_r2r (mm7, mm2);     // add may sat at FFF..
-    psubusb_r2r (mm7, mm2);     // now = Min( Max(best, Min(L1,L3), L2 )=L2 clipped
+    pmaxub_r2r (mm3, mm4);      // now = Max(best,Min(L1,L3)
+    pminub_r2r (mm4, mm2);      // now = Min( Max(best, Min(L1,L3)), L2 )=L2 clipped
 
-    movntq_r2m (mm2, *output);  // move in our clipped best
+    movq_r2m (mm2, *output);    // move in our clipped best
 
     // Advance to the next set of pixels.
     output += 8;
@@ -171,72 +198,29 @@ deinterlace_greedy_packed422_scanline_sse (GstDeinterlace2 * object,
   }
   sfence ();
   emms ();
-}
-#endif
-
-static void
-deinterlace_greedy_packed422_scanline_c (GstDeinterlace2 * object,
-    deinterlace_scanline_data_t * data, uint8_t * output)
-{
-  uint8_t *m0 = data->m0;
-
-  uint8_t *t1 = data->t1;
-
-  uint8_t *b1 = data->b1;
-
-  uint8_t *m2 = data->m2;
-
-  int width = 2 * object->frame_width;
-
-  uint16_t avg, l2_diff, lp2_diff, max, min, best;
-
-  // L2 == m0
-  // L1 == t1
-  // L3 == b1
-  // LP2 == m2
-
-  while (width--) {
-    avg = (*t1 + *b1) / 2;
-
-    l2_diff = ABS (*m0 - avg);
-    lp2_diff = ABS (*m2 - avg);
-
-    if (l2_diff > lp2_diff)
-      best = *m2;
-    else
-      best = *m0;
-
-    max = MAX (*t1, *b1);
-    min = MIN (*t1, *b1);
-
-    if (max < 256 - GreedyMaxComb)
-      max += GreedyMaxComb;
-    if (min > GreedyMaxComb)
-      min -= GreedyMaxComb;
-
-    *output = MIN (MAX (best, min), max);
 
-    // Advance to the next set of pixels.
-    output += 1;
-    m0 += 1;
-    t1 += 1;
-    b1 += 1;
-    m2 += 1;
-  }
+  if (width > 0)
+    deinterlace_greedy_packed422_scanline_c (object, m0, t1, b1, m2, output,
+        width);
 }
 
+#endif
+
 static void
 deinterlace_greedy_packed422_scanline (GstDeinterlace2 * object,
     deinterlace_scanline_data_t * data, uint8_t * output)
 {
 #ifdef HAVE_CPU_I386
-  if (object->cpu_feature_flags & OIL_IMPL_FLAG_SSE) {
-    deinterlace_greedy_packed422_scanline_sse (object, data, output);
+  if (object->cpu_feature_flags & OIL_IMPL_FLAG_MMXEXT) {
+    deinterlace_greedy_packed422_scanline_mmxext (object, data->m0, data->t1,
+        data->b1, data->m2, output, 2 * object->frame_width);
   } else {
-    deinterlace_greedy_packed422_scanline_c (object, data, output);
+    deinterlace_greedy_packed422_scanline_c (object, data->m0, data->t1,
+        data->b1, data->m2, output, 2 * object->frame_width);
   }
 #else
-  deinterlace_greedy_packed422_scanline_c (object, data, output);
+  deinterlace_greedy_packed422_scanline_c (object, data->m0, data->t1, data->b1,
+      data->m2, output, 2 * object->frame_width);
 #endif
 }
 
diff --git a/gst/deinterlace2/tvtime/vfir.c b/gst/deinterlace2/tvtime/vfir.c
index f32be65..479ee44 100644
--- a/gst/deinterlace2/tvtime/vfir.c
+++ b/gst/deinterlace2/tvtime/vfir.c
@@ -49,10 +49,36 @@
  * filter taps here are: [-1 4 2 4 -1].
  */
 
+/**
+  * C implementation.
+  */
+static inline void
+deinterlace_line_c (uint8_t * dst, uint8_t * lum_m4,
+    uint8_t * lum_m3, uint8_t * lum_m2,
+    uint8_t * lum_m1, uint8_t * lum, int size)
+{
+  int sum;
+
+  for (; size > 0; size--) {
+    sum = -lum_m4[0];
+    sum += lum_m3[0] << 2;
+    sum += lum_m2[0] << 1;
+    sum += lum_m1[0] << 2;
+    sum += -lum[0];
+    dst[0] = (sum + 4) >> 3;    // This needs to be clipped at 0 and 255: cm[(sum + 4) >> 3];
+    lum_m4++;
+    lum_m3++;
+    lum_m2++;
+    lum_m1++;
+    lum++;
+    dst++;
+  }
+}
+
 #ifdef HAVE_CPU_I386
 #include "mmx.h"
 static void
-deinterlace_line_mmxext (uint8_t * dst, uint8_t * lum_m4,
+deinterlace_line_mmx (uint8_t * dst, uint8_t * lum_m4,
     uint8_t * lum_m3, uint8_t * lum_m2,
     uint8_t * lum_m1, uint8_t * lum, int size)
 {
@@ -63,14 +89,15 @@ deinterlace_line_mmxext (uint8_t * dst, uint8_t * lum_m4,
   rounder.uw[2] = 4;
   rounder.uw[3] = 4;
   pxor_r2r (mm7, mm7);
-  movq_m2r (rounder, mm6);
+  movd_m2r (rounder, mm6);
+  punpcklbw_r2r (mm7, mm6);
 
   for (; size > 3; size -= 4) {
-    movd_m2r (lum_m4[0], mm0);
-    movd_m2r (lum_m3[0], mm1);
-    movd_m2r (lum_m2[0], mm2);
-    movd_m2r (lum_m1[0], mm3);
-    movd_m2r (lum[0], mm4);
+    movd_m2r (*lum_m4, mm0);
+    movd_m2r (*lum_m3, mm1);
+    movd_m2r (*lum_m2, mm2);
+    movd_m2r (*lum_m1, mm3);
+    movd_m2r (*lum, mm4);
     punpcklbw_r2r (mm7, mm0);
     punpcklbw_r2r (mm7, mm1);
     punpcklbw_r2r (mm7, mm2);
@@ -85,7 +112,7 @@ deinterlace_line_mmxext (uint8_t * dst, uint8_t * lum_m4,
     psubusw_r2r (mm0, mm1);
     psrlw_i2r (3, mm1);         // 3
     packuswb_r2r (mm7, mm1);
-    movd_r2m (mm1, dst[0]);
+    movd_r2m (mm1, *dst);
     lum_m4 += 4;
     lum_m3 += 4;
     lum_m2 += 4;
@@ -94,34 +121,12 @@ deinterlace_line_mmxext (uint8_t * dst, uint8_t * lum_m4,
     dst += 4;
   }
   emms ();
-}
-#endif
 
-/**
-  * C implementation.
-  */
-static void
-deinterlace_line_c (uint8_t * dst, uint8_t * lum_m4,
-    uint8_t * lum_m3, uint8_t * lum_m2,
-    uint8_t * lum_m1, uint8_t * lum, int size)
-{
-  int sum;
-
-  for (; size > 0; size--) {
-    sum = -lum_m4[0];
-    sum += lum_m3[0] << 2;
-    sum += lum_m2[0] << 1;
-    sum += lum_m1[0] << 2;
-    sum += -lum[0];
-    dst[0] = (sum + 4) >> 3;    // This needs to be clipped at 0 and 255: cm[(sum + 4) >> 3];
-    lum_m4++;
-    lum_m3++;
-    lum_m2++;
-    lum_m1++;
-    lum++;
-    dst++;
-  }
+  /* Handle odd widths */
+  if (size > 0)
+    deinterlace_line_c (dst, lum_m4, lum_m3, lum_m2, lum_m1, lum, size);
 }
+#endif
 
 /*
  * The commented-out method below that uses the bottom_field member is more
@@ -134,8 +139,8 @@ deinterlace_scanline_vfir (GstDeinterlace2 * object,
     deinterlace_scanline_data_t * data, uint8_t * output)
 {
 #ifdef HAVE_CPU_I386
-  if (object->cpu_feature_flags & OIL_IMPL_FLAG_MMXEXT) {
-    deinterlace_line_mmxext (output, data->tt1, data->t0, data->m1, data->b0,
+  if (object->cpu_feature_flags & OIL_IMPL_FLAG_MMX) {
+    deinterlace_line_mmx (output, data->tt1, data->t0, data->m1, data->b0,
         data->bb1, object->frame_width * 2);
   } else {
     deinterlace_line_c (output, data->tt1, data->t0, data->m1, data->b0,
-- 
2.7.4