From 6143a60bdb30b8be017e62d615caa2c5e52631a6 Mon Sep 17 00:00:00 2001 From: David Schleef Date: Sun, 5 Sep 2010 18:40:48 -0700 Subject: [PATCH] deinterlace: Fix greedyl Orc implementation To agree with the previous C/asm code. --- gst/deinterlace/tvtime-dist.c | 294 ++++++++++++++++++---------------------- gst/deinterlace/tvtime.orc | 32 +++-- gst/deinterlace/tvtime/greedy.c | 276 ------------------------------------- 3 files changed, 148 insertions(+), 454 deletions(-) diff --git a/gst/deinterlace/tvtime-dist.c b/gst/deinterlace/tvtime-dist.c index 1129d9a..172eea4 100644 --- a/gst/deinterlace/tvtime-dist.c +++ b/gst/deinterlace/tvtime-dist.c @@ -688,10 +688,6 @@ deinterlace_line_greedy (orc_uint8 * d1, const orc_uint8 * s1, const orc_int8 *ORC_RESTRICT ptr5; const orc_int8 *ORC_RESTRICT ptr6; const orc_int8 *ORC_RESTRICT ptr7; - orc_int8 var40; - orc_int8 var41; - orc_int8 var42; - orc_int8 var43; orc_int8 var44; orc_int8 var45; orc_int8 var46; @@ -719,10 +715,6 @@ deinterlace_line_greedy (orc_uint8 * d1, const orc_uint8 * s1, orc_int8 var68; orc_int8 var69; orc_int8 var70; - orc_int8 var71; - orc_int8 var72; - orc_int8 var73; - orc_int8 var74; ptr0 = (orc_int8 *) d1; ptr4 = (orc_int8 *) s1; @@ -730,80 +722,64 @@ deinterlace_line_greedy (orc_uint8 * d1, const orc_uint8 * s1, ptr6 = (orc_int8 *) s3; ptr7 = (orc_int8 *) s4; + /* 11: loadpb */ + var44 = 0x00000080; /* 128 or 6.32404e-322f */ /* 13: loadpb */ - var46 = 0x00000080; /* 128 or 6.32404e-322f */ - /* 15: loadpb */ - var47 = 0x00000080; /* 128 or 6.32404e-322f */ - /* 29: loadpb */ - var54 = p1; - /* 31: loadpb */ - var55 = p1; + var45 = 0x00000080; /* 128 or 6.32404e-322f */ + /* 21: loadpb */ + var46 = p1; + /* 23: loadpb */ + var47 = p1; for (i = 0; i < n; i++) { /* 0: loadb */ - var40 = ptr5[i]; + var49 = ptr4[i]; /* 1: loadb */ - var41 = ptr6[i]; - /* 2: avgub */ - var57 = ((orc_uint8) var40 + (orc_uint8) var41 + 1) >> 1; + var50 = ptr7[i]; + /* 2: loadb */ + var51 = ptr6[i]; /* 3: loadb */ - var42 = ptr4[i]; - /* 4: maxub */ - var58 = ORC_MAX ((orc_uint8) var42, (orc_uint8) var57); - /* 5: loadb */ - var43 = ptr4[i]; + var52 = ptr5[i]; + /* 4: avgub */ + var53 = ((orc_uint8) var52 + (orc_uint8) var51 + 1) >> 1; + /* 5: maxub */ + var54 = ORC_MAX ((orc_uint8) var49, (orc_uint8) var53); /* 6: minub */ - var59 = ORC_MIN ((orc_uint8) var43, (orc_uint8) var57); + var55 = ORC_MIN ((orc_uint8) var49, (orc_uint8) var53); /* 7: subb */ - var60 = var58 - var59; - /* 8: loadb */ - var44 = ptr7[i]; - /* 9: maxub */ - var61 = ORC_MAX ((orc_uint8) var44, (orc_uint8) var57); - /* 10: loadb */ - var45 = ptr7[i]; - /* 11: minub */ - var62 = ORC_MIN ((orc_uint8) var45, (orc_uint8) var57); - /* 12: subb */ - var63 = var61 - var62; + var56 = var54 - var55; + /* 8: maxub */ + var57 = ORC_MAX ((orc_uint8) var50, (orc_uint8) var53); + /* 9: minub */ + var58 = ORC_MIN ((orc_uint8) var50, (orc_uint8) var53); + /* 10: subb */ + var59 = var57 - var58; + /* 12: xorb */ + var60 = var56 ^ var44; /* 14: xorb */ - var64 = var60 ^ var46; - /* 16: xorb */ - var65 = var63 ^ var47; - /* 17: cmpgtsb */ - var66 = (var64 > var65) ? (~0) : 0; - /* 18: loadb */ - var48 = ptr4[i]; - /* 19: andb */ - var67 = var48 & var66; - /* 20: loadb */ - var49 = ptr7[i]; - /* 21: andnb */ - var68 = (~var49) & var66; - /* 22: orb */ - var69 = var67 | var68; - /* 23: loadb */ - var50 = ptr5[i]; - /* 24: loadb */ - var51 = ptr6[i]; - /* 25: maxub */ - var70 = ORC_MAX ((orc_uint8) var50, (orc_uint8) var51); - /* 26: loadb */ - var52 = ptr5[i]; - /* 27: loadb */ - var53 = ptr6[i]; - /* 28: minub */ - var71 = ORC_MIN ((orc_uint8) var52, (orc_uint8) var53); - /* 30: addusb */ - var72 = ORC_CLAMP_UB ((orc_uint8) var70 + (orc_uint8) var54); - /* 32: subusb */ - var73 = ORC_CLAMP_UB ((orc_uint8) var71 - (orc_uint8) var55); - /* 33: minub */ - var74 = ORC_MIN ((orc_uint8) var69, (orc_uint8) var72); - /* 34: maxub */ - var56 = ORC_MAX ((orc_uint8) var74, (orc_uint8) var73); - /* 35: storeb */ - ptr0[i] = var56; + var61 = var59 ^ var45; + /* 15: cmpgtsb */ + var62 = (var60 > var61) ? (~0) : 0; + /* 16: andb */ + var63 = var50 & var62; + /* 17: andnb */ + var64 = (~var62) & var49; + /* 18: orb */ + var65 = var63 | var64; + /* 19: maxub */ + var66 = ORC_MAX ((orc_uint8) var52, (orc_uint8) var51); + /* 20: minub */ + var67 = ORC_MIN ((orc_uint8) var52, (orc_uint8) var51); + /* 22: addusb */ + var68 = ORC_CLAMP_UB ((orc_uint8) var66 + (orc_uint8) var46); + /* 24: subusb */ + var69 = ORC_CLAMP_UB ((orc_uint8) var67 - (orc_uint8) var47); + /* 25: minub */ + var70 = ORC_MIN ((orc_uint8) var65, (orc_uint8) var68); + /* 26: maxub */ + var48 = ORC_MAX ((orc_uint8) var70, (orc_uint8) var69); + /* 27: storeb */ + ptr0[i] = var48; } } @@ -819,10 +795,6 @@ _backup_deinterlace_line_greedy (OrcExecutor * ORC_RESTRICT ex) const orc_int8 *ORC_RESTRICT ptr5; const orc_int8 *ORC_RESTRICT ptr6; const orc_int8 *ORC_RESTRICT ptr7; - orc_int8 var40; - orc_int8 var41; - orc_int8 var42; - orc_int8 var43; orc_int8 var44; orc_int8 var45; orc_int8 var46; @@ -850,10 +822,6 @@ _backup_deinterlace_line_greedy (OrcExecutor * ORC_RESTRICT ex) orc_int8 var68; orc_int8 var69; orc_int8 var70; - orc_int8 var71; - orc_int8 var72; - orc_int8 var73; - orc_int8 var74; ptr0 = (orc_int8 *) ex->arrays[0]; ptr4 = (orc_int8 *) ex->arrays[4]; @@ -861,80 +829,64 @@ _backup_deinterlace_line_greedy (OrcExecutor * ORC_RESTRICT ex) ptr6 = (orc_int8 *) ex->arrays[6]; ptr7 = (orc_int8 *) ex->arrays[7]; + /* 11: loadpb */ + var44 = 0x00000080; /* 128 or 6.32404e-322f */ /* 13: loadpb */ - var46 = 0x00000080; /* 128 or 6.32404e-322f */ - /* 15: loadpb */ - var47 = 0x00000080; /* 128 or 6.32404e-322f */ - /* 29: loadpb */ - var54 = ex->params[24]; - /* 31: loadpb */ - var55 = ex->params[24]; + var45 = 0x00000080; /* 128 or 6.32404e-322f */ + /* 21: loadpb */ + var46 = ex->params[24]; + /* 23: loadpb */ + var47 = ex->params[24]; for (i = 0; i < n; i++) { /* 0: loadb */ - var40 = ptr5[i]; + var49 = ptr4[i]; /* 1: loadb */ - var41 = ptr6[i]; - /* 2: avgub */ - var57 = ((orc_uint8) var40 + (orc_uint8) var41 + 1) >> 1; + var50 = ptr7[i]; + /* 2: loadb */ + var51 = ptr6[i]; /* 3: loadb */ - var42 = ptr4[i]; - /* 4: maxub */ - var58 = ORC_MAX ((orc_uint8) var42, (orc_uint8) var57); - /* 5: loadb */ - var43 = ptr4[i]; + var52 = ptr5[i]; + /* 4: avgub */ + var53 = ((orc_uint8) var52 + (orc_uint8) var51 + 1) >> 1; + /* 5: maxub */ + var54 = ORC_MAX ((orc_uint8) var49, (orc_uint8) var53); /* 6: minub */ - var59 = ORC_MIN ((orc_uint8) var43, (orc_uint8) var57); + var55 = ORC_MIN ((orc_uint8) var49, (orc_uint8) var53); /* 7: subb */ - var60 = var58 - var59; - /* 8: loadb */ - var44 = ptr7[i]; - /* 9: maxub */ - var61 = ORC_MAX ((orc_uint8) var44, (orc_uint8) var57); - /* 10: loadb */ - var45 = ptr7[i]; - /* 11: minub */ - var62 = ORC_MIN ((orc_uint8) var45, (orc_uint8) var57); - /* 12: subb */ - var63 = var61 - var62; + var56 = var54 - var55; + /* 8: maxub */ + var57 = ORC_MAX ((orc_uint8) var50, (orc_uint8) var53); + /* 9: minub */ + var58 = ORC_MIN ((orc_uint8) var50, (orc_uint8) var53); + /* 10: subb */ + var59 = var57 - var58; + /* 12: xorb */ + var60 = var56 ^ var44; /* 14: xorb */ - var64 = var60 ^ var46; - /* 16: xorb */ - var65 = var63 ^ var47; - /* 17: cmpgtsb */ - var66 = (var64 > var65) ? (~0) : 0; - /* 18: loadb */ - var48 = ptr4[i]; - /* 19: andb */ - var67 = var48 & var66; - /* 20: loadb */ - var49 = ptr7[i]; - /* 21: andnb */ - var68 = (~var49) & var66; - /* 22: orb */ - var69 = var67 | var68; - /* 23: loadb */ - var50 = ptr5[i]; - /* 24: loadb */ - var51 = ptr6[i]; - /* 25: maxub */ - var70 = ORC_MAX ((orc_uint8) var50, (orc_uint8) var51); - /* 26: loadb */ - var52 = ptr5[i]; - /* 27: loadb */ - var53 = ptr6[i]; - /* 28: minub */ - var71 = ORC_MIN ((orc_uint8) var52, (orc_uint8) var53); - /* 30: addusb */ - var72 = ORC_CLAMP_UB ((orc_uint8) var70 + (orc_uint8) var54); - /* 32: subusb */ - var73 = ORC_CLAMP_UB ((orc_uint8) var71 - (orc_uint8) var55); - /* 33: minub */ - var74 = ORC_MIN ((orc_uint8) var69, (orc_uint8) var72); - /* 34: maxub */ - var56 = ORC_MAX ((orc_uint8) var74, (orc_uint8) var73); - /* 35: storeb */ - ptr0[i] = var56; + var61 = var59 ^ var45; + /* 15: cmpgtsb */ + var62 = (var60 > var61) ? (~0) : 0; + /* 16: andb */ + var63 = var50 & var62; + /* 17: andnb */ + var64 = (~var62) & var49; + /* 18: orb */ + var65 = var63 | var64; + /* 19: maxub */ + var66 = ORC_MAX ((orc_uint8) var52, (orc_uint8) var51); + /* 20: minub */ + var67 = ORC_MIN ((orc_uint8) var52, (orc_uint8) var51); + /* 22: addusb */ + var68 = ORC_CLAMP_UB ((orc_uint8) var66 + (orc_uint8) var46); + /* 24: subusb */ + var69 = ORC_CLAMP_UB ((orc_uint8) var67 - (orc_uint8) var47); + /* 25: minub */ + var70 = ORC_MIN ((orc_uint8) var65, (orc_uint8) var68); + /* 26: maxub */ + var48 = ORC_MAX ((orc_uint8) var70, (orc_uint8) var69); + /* 27: storeb */ + ptr0[i] = var48; } } @@ -972,44 +924,56 @@ deinterlace_line_greedy (orc_uint8 * d1, const orc_uint8 * s1, orc_program_add_temporary (p, 1, "t6"); orc_program_add_temporary (p, 1, "t7"); orc_program_add_temporary (p, 1, "t8"); + orc_program_add_temporary (p, 1, "t9"); + orc_program_add_temporary (p, 1, "t10"); + orc_program_add_temporary (p, 1, "t11"); + orc_program_add_temporary (p, 1, "t12"); - orc_program_append_2 (p, "avgub", 0, ORC_VAR_T1, ORC_VAR_S2, ORC_VAR_S3, + orc_program_append_2 (p, "loadb", 0, ORC_VAR_T1, ORC_VAR_S1, ORC_VAR_D1, + ORC_VAR_D1); + orc_program_append_2 (p, "loadb", 0, ORC_VAR_T2, ORC_VAR_S4, ORC_VAR_D1, ORC_VAR_D1); - orc_program_append_2 (p, "maxub", 0, ORC_VAR_T4, ORC_VAR_S1, ORC_VAR_T1, + orc_program_append_2 (p, "loadb", 0, ORC_VAR_T3, ORC_VAR_S3, ORC_VAR_D1, ORC_VAR_D1); - orc_program_append_2 (p, "minub", 0, ORC_VAR_T5, ORC_VAR_S1, ORC_VAR_T1, + orc_program_append_2 (p, "loadb", 0, ORC_VAR_T4, ORC_VAR_S2, ORC_VAR_D1, ORC_VAR_D1); - orc_program_append_2 (p, "subb", 0, ORC_VAR_T2, ORC_VAR_T4, ORC_VAR_T5, + orc_program_append_2 (p, "avgub", 0, ORC_VAR_T5, ORC_VAR_T4, ORC_VAR_T3, ORC_VAR_D1); - orc_program_append_2 (p, "maxub", 0, ORC_VAR_T4, ORC_VAR_S4, ORC_VAR_T1, + orc_program_append_2 (p, "maxub", 0, ORC_VAR_T8, ORC_VAR_T1, ORC_VAR_T5, ORC_VAR_D1); - orc_program_append_2 (p, "minub", 0, ORC_VAR_T5, ORC_VAR_S4, ORC_VAR_T1, + orc_program_append_2 (p, "minub", 0, ORC_VAR_T9, ORC_VAR_T1, ORC_VAR_T5, ORC_VAR_D1); - orc_program_append_2 (p, "subb", 0, ORC_VAR_T3, ORC_VAR_T4, ORC_VAR_T5, + orc_program_append_2 (p, "subb", 0, ORC_VAR_T6, ORC_VAR_T8, ORC_VAR_T9, ORC_VAR_D1); - orc_program_append_2 (p, "xorb", 0, ORC_VAR_T2, ORC_VAR_T2, ORC_VAR_C1, + orc_program_append_2 (p, "maxub", 0, ORC_VAR_T8, ORC_VAR_T2, ORC_VAR_T5, ORC_VAR_D1); - orc_program_append_2 (p, "xorb", 0, ORC_VAR_T3, ORC_VAR_T3, ORC_VAR_C1, + orc_program_append_2 (p, "minub", 0, ORC_VAR_T9, ORC_VAR_T2, ORC_VAR_T5, ORC_VAR_D1); - orc_program_append_2 (p, "cmpgtsb", 0, ORC_VAR_T5, ORC_VAR_T2, ORC_VAR_T3, + orc_program_append_2 (p, "subb", 0, ORC_VAR_T7, ORC_VAR_T8, ORC_VAR_T9, ORC_VAR_D1); - orc_program_append_2 (p, "andb", 0, ORC_VAR_T4, ORC_VAR_S1, ORC_VAR_T5, + orc_program_append_2 (p, "xorb", 0, ORC_VAR_T6, ORC_VAR_T6, ORC_VAR_C1, ORC_VAR_D1); - orc_program_append_2 (p, "andnb", 0, ORC_VAR_T5, ORC_VAR_S4, ORC_VAR_T5, + orc_program_append_2 (p, "xorb", 0, ORC_VAR_T7, ORC_VAR_T7, ORC_VAR_C1, ORC_VAR_D1); - orc_program_append_2 (p, "orb", 0, ORC_VAR_T6, ORC_VAR_T4, ORC_VAR_T5, + orc_program_append_2 (p, "cmpgtsb", 0, ORC_VAR_T9, ORC_VAR_T6, ORC_VAR_T7, ORC_VAR_D1); - orc_program_append_2 (p, "maxub", 0, ORC_VAR_T8, ORC_VAR_S2, ORC_VAR_S3, + orc_program_append_2 (p, "andb", 0, ORC_VAR_T8, ORC_VAR_T2, ORC_VAR_T9, ORC_VAR_D1); - orc_program_append_2 (p, "minub", 0, ORC_VAR_T7, ORC_VAR_S2, ORC_VAR_S3, + orc_program_append_2 (p, "andnb", 0, ORC_VAR_T9, ORC_VAR_T9, ORC_VAR_T1, ORC_VAR_D1); - orc_program_append_2 (p, "addusb", 0, ORC_VAR_T8, ORC_VAR_T8, ORC_VAR_P1, + orc_program_append_2 (p, "orb", 0, ORC_VAR_T10, ORC_VAR_T8, ORC_VAR_T9, ORC_VAR_D1); - orc_program_append_2 (p, "subusb", 0, ORC_VAR_T7, ORC_VAR_T7, ORC_VAR_P1, + orc_program_append_2 (p, "maxub", 0, ORC_VAR_T12, ORC_VAR_T4, ORC_VAR_T3, ORC_VAR_D1); - orc_program_append_2 (p, "minub", 0, ORC_VAR_T6, ORC_VAR_T6, ORC_VAR_T8, + orc_program_append_2 (p, "minub", 0, ORC_VAR_T11, ORC_VAR_T4, ORC_VAR_T3, ORC_VAR_D1); - orc_program_append_2 (p, "maxub", 0, ORC_VAR_D1, ORC_VAR_T6, ORC_VAR_T7, + orc_program_append_2 (p, "addusb", 0, ORC_VAR_T12, ORC_VAR_T12, + ORC_VAR_P1, ORC_VAR_D1); + orc_program_append_2 (p, "subusb", 0, ORC_VAR_T11, ORC_VAR_T11, + ORC_VAR_P1, ORC_VAR_D1); + orc_program_append_2 (p, "minub", 0, ORC_VAR_T10, ORC_VAR_T10, + ORC_VAR_T12, ORC_VAR_D1); + orc_program_append_2 (p, "maxub", 0, ORC_VAR_D1, ORC_VAR_T10, ORC_VAR_T11, ORC_VAR_D1); result = orc_program_compile (p); diff --git a/gst/deinterlace/tvtime.orc b/gst/deinterlace/tvtime.orc index f09f539..44217e7 100644 --- a/gst/deinterlace/tvtime.orc +++ b/gst/deinterlace/tvtime.orc @@ -61,6 +61,10 @@ convsuswb d1, t1 .source 1 b1 .source 1 m2 .param 1 max_comb +.temp 1 tm0 +.temp 1 tm2 +.temp 1 tb1 +.temp 1 tt1 .temp 1 avg .temp 1 l2_diff .temp 1 lp2_diff @@ -71,29 +75,31 @@ convsuswb d1, t1 .temp 1 max -avgub avg, t1, b1 -#absdiffb l2_diff, m0, avg -maxub t2, m0, avg -minub t3, m0, avg +loadb tm0, m0 +loadb tm2, m2 + +loadb tb1, b1 +loadb tt1, t1 +avgub avg, tt1, tb1 + +maxub t2, tm0, avg +minub t3, tm0, avg subb l2_diff, t2, t3 -#absdiffb lp2_diff, m2, avg -maxub t2, m2, avg -minub t3, m2, avg +maxub t2, tm2, avg +minub t3, tm2, avg subb lp2_diff, t2, t3 -#cmpgtub t1, l2_diff, lp2_diff xorb l2_diff, l2_diff, 0x80 xorb lp2_diff, lp2_diff, 0x80 cmpgtsb t3, l2_diff, lp2_diff -#selectb best, m0, m2, t3 -andb t2, m0, t3 -andnb t3, m2, t3 +andb t2, tm2, t3 +andnb t3, t3, tm0 orb best, t2, t3 -maxub max, t1, b1 -minub min, t1, b1 +maxub max, tt1, tb1 +minub min, tt1, tb1 addusb max, max, max_comb subusb min, min, max_comb minub best, best, max diff --git a/gst/deinterlace/tvtime/greedy.c b/gst/deinterlace/tvtime/greedy.c index 7323045..383bc34 100644 --- a/gst/deinterlace/tvtime/greedy.c +++ b/gst/deinterlace/tvtime/greedy.c @@ -34,9 +34,6 @@ #include "gstdeinterlacemethod.h" #include -#ifdef HAVE_ORC -#include -#endif #include "tvtime.h" @@ -80,54 +77,6 @@ typedef struct // Blended Clip but this give too good results for the CPU to ignore here. static inline void -deinterlace_greedy_scanline_c (GstDeinterlaceMethodGreedyL * self, - const guint8 * m0, const guint8 * t1, - const guint8 * b1, const guint8 * m2, guint8 * output, gint width) -{ - gint avg, l2_diff, lp2_diff, max, min, best; - guint max_comb = self->max_comb; - - // L2 == m0 - // L1 == t1 - // L3 == b1 - // LP2 == m2 - - while (width--) { - avg = (*t1 + *b1) / 2; - - l2_diff = ABS (*m0 - avg); - lp2_diff = ABS (*m2 - avg); - - if (l2_diff > lp2_diff) - best = *m2; - else - best = *m0; - - max = MAX (*t1, *b1); - min = MIN (*t1, *b1); - - if (max < 256 - max_comb) - max += max_comb; - else - max = 255; - - if (min > max_comb) - min -= max_comb; - else - min = 0; - - *output = CLAMP (best, min, max); - - // Advance to the next set of pixels. - output += 1; - m0 += 1; - t1 += 1; - b1 += 1; - m2 += 1; - } -} - -static inline void deinterlace_greedy_scanline_orc (GstDeinterlaceMethodGreedyL * self, const guint8 * m0, const guint8 * t1, const guint8 * b1, const guint8 * m2, guint8 * output, gint width) @@ -135,216 +84,6 @@ deinterlace_greedy_scanline_orc (GstDeinterlaceMethodGreedyL * self, deinterlace_line_greedy (output, m0, t1, b1, m2, self->max_comb, width); } -#ifdef BUILD_X86_ASM -#include "mmx.h" -static void -deinterlace_greedy_scanline_mmx (GstDeinterlaceMethodGreedyL * self, - const guint8 * m0, const guint8 * t1, - const guint8 * b1, const guint8 * m2, guint8 * output, gint width) -{ - mmx_t MaxComb; - mmx_t ShiftMask; - - // How badly do we let it weave? 0-255 - MaxComb.ub[0] = self->max_comb; - MaxComb.ub[1] = self->max_comb; - MaxComb.ub[2] = self->max_comb; - MaxComb.ub[3] = self->max_comb; - MaxComb.ub[4] = self->max_comb; - MaxComb.ub[5] = self->max_comb; - MaxComb.ub[6] = self->max_comb; - MaxComb.ub[7] = self->max_comb; - - ShiftMask.ub[0] = 0x7f; - ShiftMask.ub[1] = 0x7f; - ShiftMask.ub[2] = 0x7f; - ShiftMask.ub[3] = 0x7f; - ShiftMask.ub[4] = 0x7f; - ShiftMask.ub[5] = 0x7f; - ShiftMask.ub[6] = 0x7f; - ShiftMask.ub[7] = 0x7f; - - // L2 == m0 - // L1 == t1 - // L3 == b1 - // LP2 == m2 - - movq_m2r (MaxComb, mm6); - - for (; width > 7; width -= 8) { - movq_m2r (*t1, mm1); // L1 - movq_m2r (*m0, mm2); // L2 - movq_m2r (*b1, mm3); // L3 - movq_m2r (*m2, mm0); // LP2 - - // average L1 and L3 leave result in mm4 - movq_r2r (mm1, mm4); // L1 - movq_r2r (mm3, mm5); // L3 - psrlw_i2r (1, mm4); // L1/2 - pand_m2r (ShiftMask, mm4); - psrlw_i2r (1, mm5); // L3/2 - pand_m2r (ShiftMask, mm5); - paddusb_r2r (mm5, mm4); // (L1 + L3) / 2 - - // get abs value of possible L2 comb - movq_r2r (mm2, mm7); // L2 - psubusb_r2r (mm4, mm7); // L2 - avg - movq_r2r (mm4, mm5); // avg - psubusb_r2r (mm2, mm5); // avg - L2 - por_r2r (mm7, mm5); // abs(avg-L2) - - // get abs value of possible LP2 comb - movq_r2r (mm0, mm7); // LP2 - psubusb_r2r (mm4, mm7); // LP2 - avg - psubusb_r2r (mm0, mm4); // avg - LP2 - por_r2r (mm7, mm4); // abs(avg-LP2) - - // use L2 or LP2 depending upon which makes smaller comb - psubusb_r2r (mm5, mm4); // see if it goes to zero - psubusb_r2r (mm5, mm5); // 0 - pcmpeqb_r2r (mm5, mm4); // if (mm4=0) then FF else 0 - pcmpeqb_r2r (mm4, mm5); // opposite of mm4 - - // if Comb(LP2) <= Comb(L2) then mm4=ff, mm5=0 else mm4=0, mm5 = 55 - pand_r2r (mm2, mm5); // use L2 if mm5 == ff, else 0 - pand_r2r (mm0, mm4); // use LP2 if mm4 = ff, else 0 - por_r2r (mm5, mm4); // may the best win - - // Now lets clip our chosen value to be not outside of the range - // of the high/low range L1-L3 by more than abs(L1-L3) - // This allows some comb but limits the damages and also allows more - // detail than a boring oversmoothed clip. - - movq_r2r (mm1, mm2); // copy L1 - psubusb_r2r (mm3, mm2); // - L3, with saturation - paddusb_r2r (mm3, mm2); // now = Max(L1,L3) - - pcmpeqb_r2r (mm7, mm7); // all ffffffff - psubusb_r2r (mm1, mm7); // - L1 - paddusb_r2r (mm7, mm3); // add, may sat at fff.. - psubusb_r2r (mm7, mm3); // now = Min(L1,L3) - - // allow the value to be above the high or below the low by amt of MaxComb - paddusb_r2r (mm6, mm2); // increase max by diff - psubusb_r2r (mm6, mm3); // lower min by diff - - psubusb_r2r (mm3, mm4); // best - Min - paddusb_r2r (mm3, mm4); // now = Max(best,Min(L1,L3) - - pcmpeqb_r2r (mm7, mm7); // all ffffffff - psubusb_r2r (mm4, mm7); // - Max(best,Min(best,L3) - paddusb_r2r (mm7, mm2); // add may sat at FFF.. - psubusb_r2r (mm7, mm2); // now = Min( Max(best, Min(L1,L3), L2 )=L2 clipped - - movq_r2m (mm2, *output); // move in our clipped best - - // Advance to the next set of pixels. - output += 8; - m0 += 8; - t1 += 8; - b1 += 8; - m2 += 8; - } - emms (); - if (width > 0) - deinterlace_greedy_scanline_c (self, m0, t1, b1, m2, output, width); -} - -#include "sse.h" - -static void -deinterlace_greedy_scanline_mmxext (GstDeinterlaceMethodGreedyL * - self, const guint8 * m0, const guint8 * t1, const guint8 * b1, - const guint8 * m2, guint8 * output, gint width) -{ - mmx_t MaxComb; - - // How badly do we let it weave? 0-255 - MaxComb.ub[0] = self->max_comb; - MaxComb.ub[1] = self->max_comb; - MaxComb.ub[2] = self->max_comb; - MaxComb.ub[3] = self->max_comb; - MaxComb.ub[4] = self->max_comb; - MaxComb.ub[5] = self->max_comb; - MaxComb.ub[6] = self->max_comb; - MaxComb.ub[7] = self->max_comb; - - // L2 == m0 - // L1 == t1 - // L3 == b1 - // LP2 == m2 - - movq_m2r (MaxComb, mm6); - - for (; width > 7; width -= 8) { - movq_m2r (*t1, mm1); // L1 - movq_m2r (*m0, mm2); // L2 - movq_m2r (*b1, mm3); // L3 - movq_m2r (*m2, mm0); // LP2 - - // average L1 and L3 leave result in mm4 - movq_r2r (mm1, mm4); // L1 - pavgb_r2r (mm3, mm4); // (L1 + L3)/2 - - // get abs value of possible L2 comb - movq_r2r (mm2, mm7); // L2 - psubusb_r2r (mm4, mm7); // L2 - avg - movq_r2r (mm4, mm5); // avg - psubusb_r2r (mm2, mm5); // avg - L2 - por_r2r (mm7, mm5); // abs(avg-L2) - - // get abs value of possible LP2 comb - movq_r2r (mm0, mm7); // LP2 - psubusb_r2r (mm4, mm7); // LP2 - avg - psubusb_r2r (mm0, mm4); // avg - LP2 - por_r2r (mm7, mm4); // abs(avg-LP2) - - // use L2 or LP2 depending upon which makes smaller comb - psubusb_r2r (mm5, mm4); // see if it goes to zero - pxor_r2r (mm5, mm5); // 0 - pcmpeqb_r2r (mm5, mm4); // if (mm4=0) then FF else 0 - pcmpeqb_r2r (mm4, mm5); // opposite of mm4 - - // if Comb(LP2) <= Comb(L2) then mm4=ff, mm5=0 else mm4=0, mm5 = 55 - pand_r2r (mm2, mm5); // use L2 if mm5 == ff, else 0 - pand_r2r (mm0, mm4); // use LP2 if mm4 = ff, else 0 - por_r2r (mm5, mm4); // may the best win - - // Now lets clip our chosen value to be not outside of the range - // of the high/low range L1-L3 by more than abs(L1-L3) - // This allows some comb but limits the damages and also allows more - // detail than a boring oversmoothed clip. - - movq_r2r (mm1, mm2); // copy L1 - pmaxub_r2r (mm3, mm2); // now = Max(L1,L3) - - pminub_r2r (mm1, mm3); // now = Min(L1,L3) - - // allow the value to be above the high or below the low by amt of MaxComb - paddusb_r2r (mm6, mm2); // increase max by diff - psubusb_r2r (mm6, mm3); // lower min by diff - - - pmaxub_r2r (mm3, mm4); // now = Max(best,Min(L1,L3) - pminub_r2r (mm4, mm2); // now = Min( Max(best, Min(L1,L3)), L2 )=L2 clipped - - movq_r2m (mm2, *output); // move in our clipped best - - // Advance to the next set of pixels. - output += 8; - m0 += 8; - t1 += 8; - b1 += 8; - m2 += 8; - } - emms (); - - if (width > 0) - deinterlace_greedy_scanline_c (self, m0, t1, b1, m2, output, width); -} - -#endif - static void deinterlace_frame_di_greedy_packed (GstDeinterlaceMethod * method, const GstDeinterlaceField * history, guint history_count, @@ -561,10 +300,6 @@ gst_deinterlace_method_greedy_l_class_init (GstDeinterlaceMethodGreedyLClass * { GstDeinterlaceMethodClass *dim_class = (GstDeinterlaceMethodClass *) klass; GObjectClass *gobject_class = (GObjectClass *) klass; -#ifdef BUILD_X86_ASM - guint cpu_flags = - orc_target_get_default_flags (orc_target_get_by_name ("mmx")); -#endif gobject_class->set_property = gst_deinterlace_method_greedy_l_set_property; gobject_class->get_property = gst_deinterlace_method_greedy_l_get_property; @@ -596,18 +331,7 @@ gst_deinterlace_method_greedy_l_class_init (GstDeinterlaceMethodGreedyLClass * dim_class->deinterlace_frame_rgb = deinterlace_frame_di_greedy_packed; dim_class->deinterlace_frame_bgr = deinterlace_frame_di_greedy_packed; -#ifdef BUILD_X86_ASM - if (cpu_flags & ORC_TARGET_MMX_MMXEXT) { - klass->scanline = deinterlace_greedy_scanline_mmxext; - } else if (cpu_flags & ORC_TARGET_MMX_MMX) { - klass->scanline = deinterlace_greedy_scanline_mmx; - } else { - klass->scanline = deinterlace_greedy_scanline_c; - } -#else - klass->scanline = deinterlace_greedy_scanline_c; klass->scanline = deinterlace_greedy_scanline_orc; -#endif } static void -- 2.7.4