blit_packed422_scanline (output, data->m1, object->frame_width);
}
-static int GreedyMaxComb = 15;
+static const int GreedyMaxComb = 15; /* Weave tolerance, 0-255: how far the
+ * chosen byte may fall outside the MIN/MAX of the two neighbouring-line
+ * bytes before the greedy scanline code clamps it. */
-#ifdef HAVE_CPU_I386
-#include "mmx.h"
-#include "sse.h"
-static void
-deinterlace_greedy_packed422_scanline_sse (GstDeinterlace2 * object,
- deinterlace_scanline_data_t * data, uint8_t * output)
+static inline void
+deinterlace_greedy_packed422_scanline_c (GstDeinterlace2 * object,
+ uint8_t * m0, uint8_t * t1, uint8_t * b1, uint8_t * m2, uint8_t * output,
+ int width)
{
- mmx_t MaxComb;
+ int avg, l2_diff, lp2_diff, max, min, best;
+
+ /* Plain C greedy scanline.  For each of the `width` bytes, take whichever of
+  * m0 / m2 is closer to the average of t1 and b1, then clamp that value to
+  * [MIN(t1,b1) - GreedyMaxComb, MAX(t1,b1) + GreedyMaxComb], with both bounds
+  * saturated to 0..255.  `object` is not referenced in this body.
+  *
+  * Shorthand used by the comments:
+  * L2 == m0 */
+ // L1 == t1
+ // L3 == b1
+ // LP2 == m2
+
+ while (width--) { // one byte per iteration; expects width >= 0
+ avg = (*t1 + *b1) / 2; // truncating mean of the two reference bytes
+
+ l2_diff = ABS (*m0 - avg); // distance of candidate m0 from the mean
+ lp2_diff = ABS (*m2 - avg); // distance of candidate m2 from the mean
+
+ if (l2_diff > lp2_diff)
+ best = *m2;
+ else
+ best = *m0; // ties keep m0
+
+ max = MAX (*t1, *b1);
+ min = MIN (*t1, *b1);
+
+ if (max < 256 - GreedyMaxComb) // widen the upper bound, saturating at 255
+ max += GreedyMaxComb;
+ else
+ max = 255;
- uint8_t *m0 = data->m0;
+ if (min > GreedyMaxComb) // widen the lower bound, saturating at 0
+ min -= GreedyMaxComb;
+ else
+ min = 0;
- uint8_t *t1 = data->t1;
+ *output = CLAMP (best, min, max); // keep the pick inside the widened range
- uint8_t *b1 = data->b1;
+ // Advance every pointer to the next byte.
+ output += 1;
+ m0 += 1;
+ t1 += 1;
+ b1 += 1;
+ m2 += 1;
+ }
+}
- uint8_t *m2 = data->m2;
+#ifdef HAVE_CPU_I386
+#include "mmx.h"
+#include "sse.h"
- int width = object->frame_width;
+static void
+deinterlace_greedy_packed422_scanline_mmxext (GstDeinterlace2 * object,
+ uint8_t * m0, uint8_t * t1, uint8_t * b1, uint8_t * m2, uint8_t * output,
+ int width)
+{
+ mmx_t MaxComb;
// How badly do we let it weave? 0-255
MaxComb.ub[0] = GreedyMaxComb;
// L3 == b1
// LP2 == m2
- width /= 4;
- while (width--) {
+ for (; width > 7; width -= 8) {
movq_m2r (*t1, mm1); // L1
movq_m2r (*m0, mm2); // L2
movq_m2r (*b1, mm3); // L3
movq_r2r (mm1, mm4); // L1
pavgb_r2r (mm3, mm4); // (L1 + L3)/2
-
// get abs value of possible L2 comb
movq_r2r (mm2, mm7); // L2
psubusb_r2r (mm4, mm7); // L2 - avg
movq_r2r (mm4, mm5); // avg
psubusb_r2r (mm2, mm5); // avg - L2
por_r2r (mm7, mm5); // abs(avg-L2)
- movq_r2r (mm4, mm6); // copy of avg for later
-
// get abs value of possible LP2 comb
movq_r2r (mm0, mm7); // LP2
// use L2 or LP2 depending upon which makes smaller comb
psubusb_r2r (mm5, mm4); // see if it goes to zero
- psubusb_r2r (mm5, mm5); // 0
+ pxor_r2r (mm5, mm5); // 0
pcmpeqb_r2r (mm5, mm4); // if (mm4=0) then FF else 0
pcmpeqb_r2r (mm4, mm5); // opposite of mm4
// detail than a boring oversmoothed clip.
movq_r2r (mm1, mm2); // copy L1
- psubusb_r2r (mm3, mm2); // - L3, with saturation
- paddusb_r2r (mm3, mm2); // now = Max(L1,L3)
+ pmaxub_r2r (mm3, mm2); // now = Max(L1,L3)
- pcmpeqb_r2r (mm7, mm7); // all ffffffff
- psubusb_r2r (mm1, mm7); // - L1
- paddusb_r2r (mm7, mm3); // add, may sat at fff..
- psubusb_r2r (mm7, mm3); // now = Min(L1,L3)
+ pminub_r2r (mm1, mm3); // now = Min(L1,L3)
// allow the value to be above the high or below the low by amt of MaxComb
paddusb_m2r (MaxComb, mm2); // increase max by diff
psubusb_m2r (MaxComb, mm3); // lower min by diff
- psubusb_r2r (mm3, mm4); // best - Min
- paddusb_r2r (mm3, mm4); // now = Max(best,Min(L1,L3)
- pcmpeqb_r2r (mm7, mm7); // all ffffffff
- psubusb_r2r (mm4, mm7); // - Max(best,Min(best,L3)
- paddusb_r2r (mm7, mm2); // add may sat at FFF..
- psubusb_r2r (mm7, mm2); // now = Min( Max(best, Min(L1,L3), L2 )=L2 clipped
+ pmaxub_r2r (mm3, mm4); // now = Max(best,Min(L1,L3)
+ pminub_r2r (mm4, mm2); // now = Min( Max(best, Min(L1,L3)), L2 )=L2 clipped
- movntq_r2m (mm2, *output); // move in our clipped best
+ movq_r2m (mm2, *output); // move in our clipped best
// Advance to the next set of pixels.
output += 8;
}
sfence ();
emms ();
-}
-#endif
-
-static void
-deinterlace_greedy_packed422_scanline_c (GstDeinterlace2 * object,
- deinterlace_scanline_data_t * data, uint8_t * output)
-{
- uint8_t *m0 = data->m0;
-
- uint8_t *t1 = data->t1;
-
- uint8_t *b1 = data->b1;
-
- uint8_t *m2 = data->m2;
-
- int width = 2 * object->frame_width;
-
- uint16_t avg, l2_diff, lp2_diff, max, min, best;
-
- // L2 == m0
- // L1 == t1
- // L3 == b1
- // LP2 == m2
-
- while (width--) {
- avg = (*t1 + *b1) / 2;
-
- l2_diff = ABS (*m0 - avg);
- lp2_diff = ABS (*m2 - avg);
-
- if (l2_diff > lp2_diff)
- best = *m2;
- else
- best = *m0;
-
- max = MAX (*t1, *b1);
- min = MIN (*t1, *b1);
-
- if (max < 256 - GreedyMaxComb)
- max += GreedyMaxComb;
- if (min > GreedyMaxComb)
- min -= GreedyMaxComb;
-
- *output = MIN (MAX (best, min), max);
- // Advance to the next set of pixels.
- output += 1;
- m0 += 1;
- t1 += 1;
- b1 += 1;
- m2 += 1;
- }
+ if (width > 0)
+ deinterlace_greedy_packed422_scanline_c (object, m0, t1, b1, m2, output,
+ width);
}
+#endif
+
static void
deinterlace_greedy_packed422_scanline (GstDeinterlace2 * object,
deinterlace_scanline_data_t * data, uint8_t * output)
{
#ifdef HAVE_CPU_I386
- if (object->cpu_feature_flags & OIL_IMPL_FLAG_SSE) {
- deinterlace_greedy_packed422_scanline_sse (object, data, output);
+ if (object->cpu_feature_flags & OIL_IMPL_FLAG_MMXEXT) { // runtime check of the CPU feature flags
+ deinterlace_greedy_packed422_scanline_mmxext (object, data->m0, data->t1,
+ data->b1, data->m2, output, 2 * object->frame_width); // width is a byte count: 2 * frame_width (presumably 2 bytes per pixel -- confirm)
} else {
- deinterlace_greedy_packed422_scanline_c (object, data, output);
+ deinterlace_greedy_packed422_scanline_c (object, data->m0, data->t1,
+ data->b1, data->m2, output, 2 * object->frame_width); // flag not set: C routine, same arguments
}
#else
- deinterlace_greedy_packed422_scanline_c (object, data, output);
+ deinterlace_greedy_packed422_scanline_c (object, data->m0, data->t1, data->b1,
+ data->m2, output, 2 * object->frame_width); // HAVE_CPU_I386 undefined: only the C routine is used
#endif
}
* filter taps here are: [-1 4 2 4 -1].
*/
+/**
+ * C implementation.
+ *
+ * Writes size bytes to dst.  Output byte i is
+ *   (-lum_m4[i] + 4 * lum_m3[i] + 2 * lum_m2[i] + 4 * lum_m1[i] - lum[i] + 4) >> 3
+ * i.e. the [-1 4 2 4 -1] taps applied across the five input lines, with 4
+ * added for rounding before the shift by 3.
+ *
+ * NOTE(review): the shifted sum can fall outside 0..255 (all-255 centre taps
+ * with zero outer taps give 319; the opposite case is negative) and is stored
+ * into a uint8_t without clamping, so such values are truncated to 8 bits
+ * rather than saturated.
+ */
+static inline void
+deinterlace_line_c (uint8_t * dst, uint8_t * lum_m4,
+ uint8_t * lum_m3, uint8_t * lum_m2,
+ uint8_t * lum_m1, uint8_t * lum, int size)
+{
+ int sum;
+
+ for (; size > 0; size--) { // one output byte per iteration; no-op when size <= 0
+ sum = -lum_m4[0]; // tap -1
+ sum += lum_m3[0] << 2; // tap 4
+ sum += lum_m2[0] << 1; // tap 2
+ sum += lum_m1[0] << 2; // tap 4
+ sum += -lum[0]; // tap -1
+ dst[0] = (sum + 4) >> 3; // Rounded divide by 8. This needs to be clipped at 0 and 255: cm[(sum + 4) >> 3];
+ lum_m4++;
+ lum_m3++;
+ lum_m2++;
+ lum_m1++;
+ lum++;
+ dst++;
+ }
+}
+
#ifdef HAVE_CPU_I386
#include "mmx.h"
static void
-deinterlace_line_mmxext (uint8_t * dst, uint8_t * lum_m4,
+deinterlace_line_mmx (uint8_t * dst, uint8_t * lum_m4,
uint8_t * lum_m3, uint8_t * lum_m2,
uint8_t * lum_m1, uint8_t * lum, int size)
{
rounder.uw[2] = 4;
rounder.uw[3] = 4;
pxor_r2r (mm7, mm7);
- movq_m2r (rounder, mm6);
+ movd_m2r (rounder, mm6);
+ punpcklbw_r2r (mm7, mm6);
for (; size > 3; size -= 4) {
- movd_m2r (lum_m4[0], mm0);
- movd_m2r (lum_m3[0], mm1);
- movd_m2r (lum_m2[0], mm2);
- movd_m2r (lum_m1[0], mm3);
- movd_m2r (lum[0], mm4);
+ movd_m2r (*lum_m4, mm0);
+ movd_m2r (*lum_m3, mm1);
+ movd_m2r (*lum_m2, mm2);
+ movd_m2r (*lum_m1, mm3);
+ movd_m2r (*lum, mm4);
punpcklbw_r2r (mm7, mm0);
punpcklbw_r2r (mm7, mm1);
punpcklbw_r2r (mm7, mm2);
psubusw_r2r (mm0, mm1);
psrlw_i2r (3, mm1); // 3
packuswb_r2r (mm7, mm1);
- movd_r2m (mm1, dst[0]);
+ movd_r2m (mm1, *dst);
lum_m4 += 4;
lum_m3 += 4;
lum_m2 += 4;
dst += 4;
}
emms ();
-}
-#endif
-/**
- * C implementation.
- */
-static void
-deinterlace_line_c (uint8_t * dst, uint8_t * lum_m4,
- uint8_t * lum_m3, uint8_t * lum_m2,
- uint8_t * lum_m1, uint8_t * lum, int size)
-{
- int sum;
-
- for (; size > 0; size--) {
- sum = -lum_m4[0];
- sum += lum_m3[0] << 2;
- sum += lum_m2[0] << 1;
- sum += lum_m1[0] << 2;
- sum += -lum[0];
- dst[0] = (sum + 4) >> 3; // This needs to be clipped at 0 and 255: cm[(sum + 4) >> 3];
- lum_m4++;
- lum_m3++;
- lum_m2++;
- lum_m1++;
- lum++;
- dst++;
- }
+ /* Handle odd widths */
+ if (size > 0)
+ deinterlace_line_c (dst, lum_m4, lum_m3, lum_m2, lum_m1, lum, size);
}
+#endif
/*
* The commented-out method below that uses the bottom_field member is more
deinterlace_scanline_data_t * data, uint8_t * output)
{
#ifdef HAVE_CPU_I386
- if (object->cpu_feature_flags & OIL_IMPL_FLAG_MMXEXT) {
- deinterlace_line_mmxext (output, data->tt1, data->t0, data->m1, data->b0,
+ if (object->cpu_feature_flags & OIL_IMPL_FLAG_MMX) {
+ deinterlace_line_mmx (output, data->tt1, data->t0, data->m1, data->b0,
data->bb1, object->frame_width * 2);
} else {
deinterlace_line_c (output, data->tt1, data->t0, data->m1, data->b0,