[MOVED FROM BAD 09/56] gst/deinterlace2/tvtime/greedy.c: Fix the C implementation...

author Sebastian Dröge <slomo@circular-chaos.org>

Tue, 24 Jun 2008 09:10:46 +0000 (09:10 +0000)

committer Sebastian Dröge <sebastian.droege@collabora.co.uk>

Wed, 13 May 2009 08:33:56 +0000 (10:33 +0200)
author Sebastian Dröge <slomo@circular-chaos.org>
Tue, 24 Jun 2008 09:10:46 +0000 (09:10 +0000)
committer Sebastian Dröge <sebastian.droege@collabora.co.uk>
Wed, 13 May 2009 08:33:56 +0000 (10:33 +0200)
diff --git a/gst/deinterlace2/tvtime/greedy.c b/gst/deinterlace2/tvtime/greedy.c

index c25af036e22a5b8da01ce4bbb9b0ecd156ce9c2d..66b8799d9aba8d910de542d1214bd9f1abf95dab 100644 (file)
--- a/gst/deinterlace2/tvtime/greedy.c
+++ b/gst/deinterlace2/tvtime/greedy.c
@@ -60,26 +60,65 @@ copy_scanline (GstDeinterlace2 * object,
    blit_packed422_scanline (output, data->m1, object->frame_width);
  }
  
-static int GreedyMaxComb = 15;
+static const int GreedyMaxComb = 15;
  
-#ifdef HAVE_CPU_I386
-#include "mmx.h"
-#include "sse.h"
-static void
-deinterlace_greedy_packed422_scanline_sse (GstDeinterlace2 * object,
-    deinterlace_scanline_data_t * data, uint8_t * output)
+static inline void
+deinterlace_greedy_packed422_scanline_c (GstDeinterlace2 * object,
+    uint8_t * m0, uint8_t * t1, uint8_t * b1, uint8_t * m2, uint8_t * output,
+    int width)
  {
-  mmx_t MaxComb;
+  int avg, l2_diff, lp2_diff, max, min, best;
+
+  // L2 == m0
+  // L1 == t1
+  // L3 == b1
+  // LP2 == m2
+
+  while (width--) {
+    avg = (*t1 + *b1) / 2;
+
+    l2_diff = ABS (*m0 - avg);
+    lp2_diff = ABS (*m2 - avg);
+
+    if (l2_diff > lp2_diff)
+      best = *m2;
+    else
+      best = *m0;
+
+    max = MAX (*t1, *b1);
+    min = MIN (*t1, *b1);
+
+    if (max < 256 - GreedyMaxComb)
+      max += GreedyMaxComb;
+    else
+      max = 255;
  
-  uint8_t *m0 = data->m0;
+    if (min > GreedyMaxComb)
+      min -= GreedyMaxComb;
+    else
+      min = 0;
  
-  uint8_t *t1 = data->t1;
+    *output = CLAMP (best, min, max);
  
-  uint8_t *b1 = data->b1;
+    // Advance to the next set of pixels.
+    output += 1;
+    m0 += 1;
+    t1 += 1;
+    b1 += 1;
+    m2 += 1;
+  }
+}
  
-  uint8_t *m2 = data->m2;
+#ifdef HAVE_CPU_I386
+#include "mmx.h"
+#include "sse.h"
  
-  int width = object->frame_width;
+static void
+deinterlace_greedy_packed422_scanline_mmxext (GstDeinterlace2 * object,
+    uint8_t * m0, uint8_t * t1, uint8_t * b1, uint8_t * m2, uint8_t * output,
+    int width)
+{
+  mmx_t MaxComb;
  
    // How badly do we let it weave? 0-255
    MaxComb.ub[0] = GreedyMaxComb;
@@ -96,8 +135,7 @@ deinterlace_greedy_packed422_scanline_sse (GstDeinterlace2 * object,
    // L3 == b1
    // LP2 == m2
  
-  width /= 4;
-  while (width--) {
+  for (; width > 7; width -= 8) {
      movq_m2r (*t1, mm1);        // L1
      movq_m2r (*m0, mm2);        // L2
      movq_m2r (*b1, mm3);        // L3
@@ -107,15 +145,12 @@ deinterlace_greedy_packed422_scanline_sse (GstDeinterlace2 * object,
      movq_r2r (mm1, mm4);        // L1
      pavgb_r2r (mm3, mm4);       // (L1 + L3)/2
  
-
      // get abs value of possible L2 comb
      movq_r2r (mm2, mm7);        // L2
      psubusb_r2r (mm4, mm7);     // L2 - avg
      movq_r2r (mm4, mm5);        // avg
      psubusb_r2r (mm2, mm5);     // avg - L2
      por_r2r (mm7, mm5);         // abs(avg-L2)
-    movq_r2r (mm4, mm6);        // copy of avg for later
-
  
      // get abs value of possible LP2 comb
      movq_r2r (mm0, mm7);        // LP2
@@ -125,7 +160,7 @@ deinterlace_greedy_packed422_scanline_sse (GstDeinterlace2 * object,
  
      // use L2 or LP2 depending upon which makes smaller comb
      psubusb_r2r (mm5, mm4);     // see if it goes to zero
-    psubusb_r2r (mm5, mm5);     // 0
+    pxor_r2r (mm5, mm5);        // 0
      pcmpeqb_r2r (mm5, mm4);     // if (mm4=0) then FF else 0
      pcmpeqb_r2r (mm4, mm5);     // opposite of mm4
  
@@ -140,27 +175,19 @@ deinterlace_greedy_packed422_scanline_sse (GstDeinterlace2 * object,
      // detail than a boring oversmoothed clip.
  
      movq_r2r (mm1, mm2);        // copy L1
-    psubusb_r2r (mm3, mm2);     // - L3, with saturation
-    paddusb_r2r (mm3, mm2);     // now = Max(L1,L3)
+    pmaxub_r2r (mm3, mm2);      // now = Max(L1,L3)
  
-    pcmpeqb_r2r (mm7, mm7);     // all ffffffff
-    psubusb_r2r (mm1, mm7);     // - L1 
-    paddusb_r2r (mm7, mm3);     // add, may sat at fff..
-    psubusb_r2r (mm7, mm3);     // now = Min(L1,L3)
+    pminub_r2r (mm1, mm3);      // now = Min(L1,L3)
  
      // allow the value to be above the high or below the low by amt of MaxComb
      paddusb_m2r (MaxComb, mm2); // increase max by diff
      psubusb_m2r (MaxComb, mm3); // lower min by diff
  
-    psubusb_r2r (mm3, mm4);     // best - Min
-    paddusb_r2r (mm3, mm4);     // now = Max(best,Min(L1,L3)
  
-    pcmpeqb_r2r (mm7, mm7);     // all ffffffff
-    psubusb_r2r (mm4, mm7);     // - Max(best,Min(best,L3) 
-    paddusb_r2r (mm7, mm2);     // add may sat at FFF..
-    psubusb_r2r (mm7, mm2);     // now = Min( Max(best, Min(L1,L3), L2 )=L2 clipped
+    pmaxub_r2r (mm3, mm4);      // now = Max(best,Min(L1,L3)
+    pminub_r2r (mm4, mm2);      // now = Min( Max(best, Min(L1,L3)), L2 )=L2 clipped
  
-    movntq_r2m (mm2, *output);  // move in our clipped best
+    movq_r2m (mm2, *output);    // move in our clipped best
  
      // Advance to the next set of pixels.
      output += 8;
@@ -171,72 +198,29 @@ deinterlace_greedy_packed422_scanline_sse (GstDeinterlace2 * object,
    }
    sfence ();
    emms ();
-}
-#endif
-
-static void
-deinterlace_greedy_packed422_scanline_c (GstDeinterlace2 * object,
-    deinterlace_scanline_data_t * data, uint8_t * output)
-{
-  uint8_t *m0 = data->m0;
-
-  uint8_t *t1 = data->t1;
-
-  uint8_t *b1 = data->b1;
-
-  uint8_t *m2 = data->m2;
-
-  int width = 2 * object->frame_width;
-
-  uint16_t avg, l2_diff, lp2_diff, max, min, best;
-
-  // L2 == m0
-  // L1 == t1
-  // L3 == b1
-  // LP2 == m2
-
-  while (width--) {
-    avg = (*t1 + *b1) / 2;
-
-    l2_diff = ABS (*m0 - avg);
-    lp2_diff = ABS (*m2 - avg);
-
-    if (l2_diff > lp2_diff)
-      best = *m2;
-    else
-      best = *m0;
-
-    max = MAX (*t1, *b1);
-    min = MIN (*t1, *b1);
-
-    if (max < 256 - GreedyMaxComb)
-      max += GreedyMaxComb;
-    if (min > GreedyMaxComb)
-      min -= GreedyMaxComb;
-
-    *output = MIN (MAX (best, min), max);
  
-    // Advance to the next set of pixels.
-    output += 1;
-    m0 += 1;
-    t1 += 1;
-    b1 += 1;
-    m2 += 1;
-  }
+  if (width > 0)
+    deinterlace_greedy_packed422_scanline_c (object, m0, t1, b1, m2, output,
+        width);
  }
  
+#endif
+
  static void
  deinterlace_greedy_packed422_scanline (GstDeinterlace2 * object,
      deinterlace_scanline_data_t * data, uint8_t * output)
  {
  #ifdef HAVE_CPU_I386
-  if (object->cpu_feature_flags & OIL_IMPL_FLAG_SSE) {
-    deinterlace_greedy_packed422_scanline_sse (object, data, output);
+  if (object->cpu_feature_flags & OIL_IMPL_FLAG_MMXEXT) {
+    deinterlace_greedy_packed422_scanline_mmxext (object, data->m0, data->t1,
+        data->b1, data->m2, output, 2 * object->frame_width);
    } else {
-    deinterlace_greedy_packed422_scanline_c (object, data, output);
+    deinterlace_greedy_packed422_scanline_c (object, data->m0, data->t1,
+        data->b1, data->m2, output, 2 * object->frame_width);
    }
  #else
-  deinterlace_greedy_packed422_scanline_c (object, data, output);
+  deinterlace_greedy_packed422_scanline_c (object, data->m0, data->t1, data->b1,
+      data->m2, output, 2 * object->frame_width);
  #endif
  }
  
diff --git a/gst/deinterlace2/tvtime/vfir.c b/gst/deinterlace2/tvtime/vfir.c

index f32be65475880a447656e1ffdbc02a3e493b17dc..479ee440600b4a9de605e5759d7262936cb7522a 100644 (file)
--- a/gst/deinterlace2/tvtime/vfir.c
+++ b/gst/deinterlace2/tvtime/vfir.c
@@ -49,10 +49,36 @@
   * filter taps here are: [-1 4 2 4 -1].
   */
  
+/**
+  * C implementation.
+  */
+static inline void
+deinterlace_line_c (uint8_t * dst, uint8_t * lum_m4,
+    uint8_t * lum_m3, uint8_t * lum_m2,
+    uint8_t * lum_m1, uint8_t * lum, int size)
+{
+  int sum;
+
+  for (; size > 0; size--) {
+    sum = -lum_m4[0];
+    sum += lum_m3[0] << 2;
+    sum += lum_m2[0] << 1;
+    sum += lum_m1[0] << 2;
+    sum += -lum[0];
+    dst[0] = (sum + 4) >> 3;    // This needs to be clipped at 0 and 255: cm[(sum + 4) >> 3];
+    lum_m4++;
+    lum_m3++;
+    lum_m2++;
+    lum_m1++;
+    lum++;
+    dst++;
+  }
+}
+
  #ifdef HAVE_CPU_I386
  #include "mmx.h"
  static void
-deinterlace_line_mmxext (uint8_t * dst, uint8_t * lum_m4,
+deinterlace_line_mmx (uint8_t * dst, uint8_t * lum_m4,
      uint8_t * lum_m3, uint8_t * lum_m2,
      uint8_t * lum_m1, uint8_t * lum, int size)
  {
@@ -63,14 +89,15 @@ deinterlace_line_mmxext (uint8_t * dst, uint8_t * lum_m4,
    rounder.uw[2] = 4;
    rounder.uw[3] = 4;
    pxor_r2r (mm7, mm7);
-  movq_m2r (rounder, mm6);
+  movd_m2r (rounder, mm6);
+  punpcklbw_r2r (mm7, mm6);
  
    for (; size > 3; size -= 4) {
-    movd_m2r (lum_m4[0], mm0);
-    movd_m2r (lum_m3[0], mm1);
-    movd_m2r (lum_m2[0], mm2);
-    movd_m2r (lum_m1[0], mm3);
-    movd_m2r (lum[0], mm4);
+    movd_m2r (*lum_m4, mm0);
+    movd_m2r (*lum_m3, mm1);
+    movd_m2r (*lum_m2, mm2);
+    movd_m2r (*lum_m1, mm3);
+    movd_m2r (*lum, mm4);
      punpcklbw_r2r (mm7, mm0);
      punpcklbw_r2r (mm7, mm1);
      punpcklbw_r2r (mm7, mm2);
@@ -85,7 +112,7 @@ deinterlace_line_mmxext (uint8_t * dst, uint8_t * lum_m4,
      psubusw_r2r (mm0, mm1);
      psrlw_i2r (3, mm1);         // 3
      packuswb_r2r (mm7, mm1);
-    movd_r2m (mm1, dst[0]);
+    movd_r2m (mm1, *dst);
      lum_m4 += 4;
      lum_m3 += 4;
      lum_m2 += 4;
@@ -94,34 +121,12 @@ deinterlace_line_mmxext (uint8_t * dst, uint8_t * lum_m4,
      dst += 4;
    }
    emms ();
-}
-#endif
  
-/**
-  * C implementation.
-  */
-static void
-deinterlace_line_c (uint8_t * dst, uint8_t * lum_m4,
-    uint8_t * lum_m3, uint8_t * lum_m2,
-    uint8_t * lum_m1, uint8_t * lum, int size)
-{
-  int sum;
-
-  for (; size > 0; size--) {
-    sum = -lum_m4[0];
-    sum += lum_m3[0] << 2;
-    sum += lum_m2[0] << 1;
-    sum += lum_m1[0] << 2;
-    sum += -lum[0];
-    dst[0] = (sum + 4) >> 3;    // This needs to be clipped at 0 and 255: cm[(sum + 4) >> 3];
-    lum_m4++;
-    lum_m3++;
-    lum_m2++;
-    lum_m1++;
-    lum++;
-    dst++;
-  }
+  /* Handle odd widths */
+  if (size > 0)
+    deinterlace_line_c (dst, lum_m4, lum_m3, lum_m2, lum_m1, lum, size);
  }
+#endif
  
  /*
   * The commented-out method below that uses the bottom_field member is more
@@ -134,8 +139,8 @@ deinterlace_scanline_vfir (GstDeinterlace2 * object,
      deinterlace_scanline_data_t * data, uint8_t * output)
  {
  #ifdef HAVE_CPU_I386
-  if (object->cpu_feature_flags & OIL_IMPL_FLAG_MMXEXT) {
-    deinterlace_line_mmxext (output, data->tt1, data->t0, data->m1, data->b0,
+  if (object->cpu_feature_flags & OIL_IMPL_FLAG_MMX) {
+    deinterlace_line_mmx (output, data->tt1, data->t0, data->m1, data->b0,
          data->bb1, object->frame_width * 2);
    } else {
      deinterlace_line_c (output, data->tt1, data->t0, data->m1, data->b0,
author	Sebastian Dröge <slomo@circular-chaos.org>
	Tue, 24 Jun 2008 09:10:46 +0000 (09:10 +0000)
committer	Sebastian Dröge <sebastian.droege@collabora.co.uk>
	Wed, 13 May 2009 08:33:56 +0000 (10:33 +0200)
gst/deinterlace2/tvtime/greedy.c		patch | blob | history
gst/deinterlace2/tvtime/vfir.c		patch | blob | history