From: Yunqing Wang
Date: Wed, 28 Nov 2012 03:16:32 +0000 (-0800)
Subject: Further improve macroblock loop filters
X-Git-Tag: v1.3.0~1217^2~34^2
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=d2021386214242d1553e686d704d9b4e127490c7;p=platform%2Fupstream%2Flibvpx.git

Further improve macroblock loop filters

This change includes:
1. Aligned reads in the vp9_mbloop_filter_vertical_edge function. Since
   we actually read 16 bytes, we can align the reads to start at (s - 8)
   instead of (s - 5).
2. Combined the u and v loop filters.
3. Added an 8x16 transpose.

This gave a 2% decoder performance gain (tulip clip).

Change-Id: Ib14c2f1645c4a3436df17fe2f24789506bf0bb58
---

diff --git a/vp9/common/x86/vp9_loopfilter_x86.c b/vp9/common/x86/vp9_loopfilter_x86.c
index af8bb69..54e6b9d 100644
--- a/vp9/common/x86/vp9_loopfilter_x86.c
+++ b/vp9/common/x86/vp9_loopfilter_x86.c
@@ -84,12 +84,11 @@ void vp9_loop_filter_bvs_mmx(unsigned char *y_ptr, int y_stride,
 #endif
 
 #if HAVE_SSE2
-void vp9_mbloop_filter_horizontal_edge_c_sse2(unsigned char *s,
-                                              int p,
-                                              const unsigned char *_blimit,
-                                              const unsigned char *_limit,
-                                              const unsigned char *_thresh,
-                                              int count) {
+void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
+                                            int p,
+                                            const unsigned char *_blimit,
+                                            const unsigned char *_limit,
+                                            const unsigned char *_thresh) {
   DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
   DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
   DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
@@ -230,7 +229,7 @@ void vp9_mbloop_filter_horizontal_edge_c_sse2(unsigned char *s,
                       _mm_packus_epi16(workp_shft, workp_shft));
 
       src += 8;
-    } while (++i < count);
+    } while (++i < 2);
   }
   // lp filter
   {
@@ -325,24 +324,154 @@ void vp9_mbloop_filter_horizontal_edge_c_sse2(unsigned char *s,
     p2 = _mm_and_si128(flat, p2);
     p2 = _mm_or_si128(work_a, p2);
 
-    if (count == 1) {
-      _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
-      _mm_storel_epi64((__m128i *)(s - 2 * p), p1);
-      _mm_storel_epi64((__m128i *)(s - 1 * p), p0);
-      _mm_storel_epi64((__m128i *)(s + 0 * p), q0);
-      _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
-      _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
-    } else {
-      _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
-      _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
-      _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
-      _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
-      _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
-      _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
-    }
+    _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
+    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+    _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
+    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+    _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
   }
 }
 
+void vp9_mbloop_filter_horizontal_edge_uv_sse2(unsigned char *u,
+                                               int p,
+                                               const unsigned char *_blimit,
+                                               const unsigned char *_limit,
+                                               const unsigned char *_thresh,
+                                               unsigned char *v) {
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, src, 160);
+
+  /* Read source */
+  const __m128i p4 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u - 5 * p)),
+                                        _mm_loadl_epi64((__m128i *)(v - 5 * p)));
+  const __m128i p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u - 4 * p)),
+                                        _mm_loadl_epi64((__m128i *)(v - 4 * p)));
+  const __m128i p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u - 3 * p)),
+                                        _mm_loadl_epi64((__m128i *)(v - 3 * p)));
+  const __m128i p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u - 2 * p)),
+                                        _mm_loadl_epi64((__m128i *)(v - 2 * p)));
+  const __m128i p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u - 1 * p)),
+                                        _mm_loadl_epi64((__m128i *)(v - 1 * p)));
+  const __m128i q0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u)),
+                                        _mm_loadl_epi64((__m128i *)(v)));
+  const __m128i q1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u + 1 * p)),
+                                        _mm_loadl_epi64((__m128i *)(v + 1 * p)));
+  const __m128i q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u + 2 * p)),
+                                        _mm_loadl_epi64((__m128i *)(v + 2 * p)));
+  const __m128i q3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u + 3 * p)),
+                                        _mm_loadl_epi64((__m128i *)(v + 3 * p)));
+  const __m128i q4 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u + 4 * p)),
+                                        _mm_loadl_epi64((__m128i *)(v + 4 * p)));
+
+  _mm_store_si128((__m128i *)(src), p4);
+  _mm_store_si128((__m128i *)(src + 16), p3);
+  _mm_store_si128((__m128i *)(src + 32), p2);
+  _mm_store_si128((__m128i *)(src + 48), p1);
+  _mm_store_si128((__m128i *)(src + 64), p0);
+  _mm_store_si128((__m128i *)(src + 80), q0);
+  _mm_store_si128((__m128i *)(src + 96), q1);
+  _mm_store_si128((__m128i *)(src + 112), q2);
+  _mm_store_si128((__m128i *)(src + 128), q3);
+  _mm_store_si128((__m128i *)(src + 144), q4);
+
+  /* Loop filtering */
+  vp9_mbloop_filter_horizontal_edge_sse2(src + 80, 16, _blimit, _limit,
+                                         _thresh);
+
+  /* Store result */
+  _mm_storel_epi64((__m128i *)(u - 3 * p),
+                   _mm_loadl_epi64((__m128i *)(src + 32)));
+  _mm_storel_epi64((__m128i *)(u - 2 * p),
+                   _mm_loadl_epi64((__m128i *)(src + 48)));
+  _mm_storel_epi64((__m128i *)(u - p),
+                   _mm_loadl_epi64((__m128i *)(src + 64)));
+  _mm_storel_epi64((__m128i *)u,
+                   _mm_loadl_epi64((__m128i *)(src + 80)));
+  _mm_storel_epi64((__m128i *)(u + p),
+                   _mm_loadl_epi64((__m128i *)(src + 96)));
+  _mm_storel_epi64((__m128i *)(u + 2 * p),
+                   _mm_loadl_epi64((__m128i *)(src + 112)));
+
+  _mm_storel_epi64((__m128i *)(v - 3 * p),
+                   _mm_loadl_epi64((__m128i *)(src + 40)));
+  _mm_storel_epi64((__m128i *)(v - 2 * p),
+                   _mm_loadl_epi64((__m128i *)(src + 56)));
+  _mm_storel_epi64((__m128i *)(v - p),
+                   _mm_loadl_epi64((__m128i *)(src + 72)));
+  _mm_storel_epi64((__m128i *)v,
+                   _mm_loadl_epi64((__m128i *)(src + 88)));
+  _mm_storel_epi64((__m128i *)(v + p),
+                   _mm_loadl_epi64((__m128i *)(src + 104)));
+  _mm_storel_epi64((__m128i *)(v + 2 * p),
+                   _mm_loadl_epi64((__m128i *)(src + 120)));
+}
+
+static __inline void transpose8x16(unsigned char *in0, unsigned char *in1,
+                                   int in_p, unsigned char *out, int out_p) {
+  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+  __m128i x8, x9, x10, x11, x12, x13, x14, x15;
+
+  /* Read in 16 lines */
+  x0 = _mm_loadl_epi64((__m128i *)in0);
+  x8 = _mm_loadl_epi64((__m128i *)in1);
+  x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p));
+  x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p));
+  x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p));
+  x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p));
+  x3 = _mm_loadl_epi64((__m128i *)(in0 + 3 * in_p));
+  x11 = _mm_loadl_epi64((__m128i *)(in1 + 3 * in_p));
+  x4 = _mm_loadl_epi64((__m128i *)(in0 + 4 * in_p));
+  x12 = _mm_loadl_epi64((__m128i *)(in1 + 4 * in_p));
+  x5 = _mm_loadl_epi64((__m128i *)(in0 + 5 * in_p));
+  x13 = _mm_loadl_epi64((__m128i *)(in1 + 5 * in_p));
+  x6 = _mm_loadl_epi64((__m128i *)(in0 + 6 * in_p));
+  x14 = _mm_loadl_epi64((__m128i *)(in1 + 6 * in_p));
+  x7 = _mm_loadl_epi64((__m128i *)(in0 + 7 * in_p));
+  x15 = _mm_loadl_epi64((__m128i *)(in1 + 7 * in_p));
+
+  x0 = _mm_unpacklo_epi8(x0, x1);
+  x1 = _mm_unpacklo_epi8(x2, x3);
+  x2 = _mm_unpacklo_epi8(x4, x5);
+  x3 = _mm_unpacklo_epi8(x6, x7);
+
+  x8 = _mm_unpacklo_epi8(x8, x9);
+  x9 = _mm_unpacklo_epi8(x10, x11);
+  x10 = _mm_unpacklo_epi8(x12, x13);
+  x11 = _mm_unpacklo_epi8(x14, x15);
+
+  x4 = _mm_unpacklo_epi16(x0, x1);
+  x5 = _mm_unpacklo_epi16(x2, x3);
+  x12 = _mm_unpacklo_epi16(x8, x9);
+  x13 = _mm_unpacklo_epi16(x10, x11);
+
+  x6 = _mm_unpacklo_epi32(x4, x5);
+  x7 = _mm_unpackhi_epi32(x4, x5);
+  x14 = _mm_unpacklo_epi32(x12, x13);
+  x15 = _mm_unpackhi_epi32(x12, x13);
+
+  /* Store first 4-line result */
+  _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14));
+  _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14));
+  _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15));
+  _mm_storeu_si128((__m128i *)(out + 3 * out_p), _mm_unpackhi_epi64(x7, x15));
+
+  x4 = _mm_unpackhi_epi16(x0, x1);
+  x5 = _mm_unpackhi_epi16(x2, x3);
+  x12 = _mm_unpackhi_epi16(x8, x9);
+  x13 = _mm_unpackhi_epi16(x10, x11);
+
+  x6 = _mm_unpacklo_epi32(x4, x5);
+  x7 = _mm_unpackhi_epi32(x4, x5);
+  x14 = _mm_unpacklo_epi32(x12, x13);
+  x15 = _mm_unpackhi_epi32(x12, x13);
+
+  /* Store second 4-line result */
+  _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14));
+  _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14));
+  _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15));
+  _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15));
+}
+
 static __inline void transpose(unsigned char *src[], int in_p,
                                unsigned char *dst[], int out_p,
                                int num_8x8_to_transpose) {
@@ -406,40 +535,58 @@ static __inline void transpose(unsigned char *src[], int in_p,
   } while (++idx8x8 < num_8x8_to_transpose);
 }
 
-void vp9_mbloop_filter_vertical_edge_c_sse2(unsigned char *s,
-                                            int p,
-                                            const unsigned char *blimit,
-                                            const unsigned char *limit,
-                                            const unsigned char *thresh,
-                                            int count) {
-  DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 16]);
-  unsigned char *src[4];
-  unsigned char *dst[4];
-
-  src[0] = s - 5;
-  src[1] = s - 5 + 8;
-  src[2] = s - 5 + p*8;
-  src[3] = s - 5 + p*8 + 8;
-
-  dst[0] = t_dst;
-  dst[1] = t_dst + 16*8;
-  dst[2] = t_dst + 8;
-  dst[3] = t_dst + 16*8 + 8;
-
-  // 16x16->16x16 or 16x8->8x16
-  transpose(src, p, dst, 16, (1 << count));
-
-  vp9_mbloop_filter_horizontal_edge_c_sse2(t_dst + 5*16, 16, blimit, limit,
-                                           thresh, count);
+void vp9_mbloop_filter_vertical_edge_sse2(unsigned char *s,
+                                          int p,
+                                          const unsigned char *blimit,
+                                          const unsigned char *limit,
+                                          const unsigned char *thresh) {
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);
+  unsigned char *src[2];
+  unsigned char *dst[2];
+
+  /* Transpose 16x16 */
+  transpose8x16(s - 8, s - 8 + p * 8, p, t_dst, 16);
+  transpose8x16(s, s + p * 8, p, t_dst + 16 * 8, 16);
+
+  /* Loop filtering */
+  vp9_mbloop_filter_horizontal_edge_sse2(t_dst + 8 * 16, 16, blimit, limit,
+                                         thresh);
 
+  src[0] = t_dst + 3 * 16;
+  src[1] = t_dst + 3 * 16 + 8;
   dst[0] = s - 5;
-  dst[1] = s - 5 + p*8;
+  dst[1] = s - 5 + p * 8;
+
+  /* Transpose 16x8 */
+  transpose(src, 16, dst, p, 2);
+}
 
-  src[0] = t_dst;
-  src[1] = t_dst + 8;
+void vp9_mbloop_filter_vertical_edge_uv_sse2(unsigned char *u,
+                                             int p,
+                                             const unsigned char *blimit,
+                                             const unsigned char *limit,
+                                             const unsigned char *thresh,
+                                             unsigned char *v) {
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);
+  unsigned char *src[2];
+  unsigned char *dst[2];
 
-  // 16x8->8x16 or 8x8->8x8
-  transpose(src, 16, dst, p, (1 << (count - 1)));
+  /* Transpose 16x16 */
+  transpose8x16(u - 8, v - 8, p, t_dst, 16);
+  transpose8x16(u, v, p, t_dst + 16 * 8, 16);
+
+  /* Loop filtering */
+  vp9_mbloop_filter_horizontal_edge_sse2(t_dst + 8 * 16, 16, blimit, limit,
+                                         thresh);
+
+  src[0] = t_dst + 3 * 16;
+  src[1] = t_dst + 3 * 16 + 8;
+
+  dst[0] = u - 5;
+  dst[1] = v - 5;
+
+  /* Transpose 16x8 */
+  transpose(src, 16, dst, p, 2);
 }
 
 /* Horizontal MB filtering */
@@ -447,48 +594,40 @@ void vp9_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
                               unsigned char *v_ptr, int y_stride,
                               int uv_stride,
                               struct loop_filter_info *lfi) {
-  vp9_mbloop_filter_horizontal_edge_c_sse2(y_ptr, y_stride, lfi->mblim,
-                                           lfi->lim, lfi->hev_thr, 2);
+  vp9_mbloop_filter_horizontal_edge_sse2(y_ptr, y_stride, lfi->mblim,
+                                         lfi->lim, lfi->hev_thr);
 
-  /* TODO: write sse2 version with u,v interleaved */
+  /* u,v */
   if (u_ptr)
-    vp9_mbloop_filter_horizontal_edge_c_sse2(u_ptr, uv_stride, lfi->mblim,
-                                             lfi->lim, lfi->hev_thr, 1);
-
-  if (v_ptr)
-    vp9_mbloop_filter_horizontal_edge_c_sse2(v_ptr, uv_stride, lfi->mblim,
-                                             lfi->lim, lfi->hev_thr, 1);
+    vp9_mbloop_filter_horizontal_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim,
+                                              lfi->lim, lfi->hev_thr, v_ptr);
 }
 
 void vp9_loop_filter_bh8x8_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
                                 unsigned char *v_ptr, int y_stride,
                                 int uv_stride,
                                 struct loop_filter_info *lfi) {
-  vp9_mbloop_filter_horizontal_edge_c_sse2(
-      y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+  vp9_mbloop_filter_horizontal_edge_sse2(
+      y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);
 }
 
 /* Vertical MB Filtering */
 void vp9_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
                               unsigned char *v_ptr, int y_stride,
                               int uv_stride,
                               struct loop_filter_info *lfi) {
-  vp9_mbloop_filter_vertical_edge_c_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim,
-                                         lfi->hev_thr, 2);
+  vp9_mbloop_filter_vertical_edge_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim,
+                                       lfi->hev_thr);
 
-  /* TODO: write sse2 version with u,v interleaved */
+  /* u,v */
   if (u_ptr)
-    vp9_mbloop_filter_vertical_edge_c_sse2(u_ptr, uv_stride, lfi->mblim,
-                                           lfi->lim, lfi->hev_thr, 1);
-
-  if (v_ptr)
-    vp9_mbloop_filter_vertical_edge_c_sse2(v_ptr, uv_stride, lfi->mblim,
-                                           lfi->lim, lfi->hev_thr, 1);
+    vp9_mbloop_filter_vertical_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim,
+                                            lfi->lim, lfi->hev_thr, v_ptr);
 }
 
 void vp9_loop_filter_bv8x8_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
                                 unsigned char *v_ptr, int y_stride,
                                 int uv_stride,
                                 struct loop_filter_info *lfi) {
-  vp9_mbloop_filter_vertical_edge_c_sse2(
-      y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+  vp9_mbloop_filter_vertical_edge_sse2(
+      y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);
 }
 
 /* Horizontal B Filtering */
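
Note on the combined u/v path above: chroma rows at a macroblock edge are only
8 pixels wide, so vp9_mbloop_filter_horizontal_edge_uv_sse2 packs one 8-byte
row of u and the matching 8-byte row of v into a single 16-byte XMM register
(u in the low half, v in the high half) and runs the existing 16-wide filter
once for both planes. The standalone sketch below demonstrates only that
packing step; the row data and array names are hypothetical, for illustration,
and are not part of the patch.

#include <emmintrin.h>  /* SSE2 intrinsics */
#include <stdio.h>

int main(void) {
  /* Two hypothetical 8-pixel chroma rows, one from the u plane, one from v. */
  unsigned char u_row[8] = {10, 11, 12, 13, 14, 15, 16, 17};
  unsigned char v_row[8] = {20, 21, 22, 23, 24, 25, 26, 27};
  unsigned char packed[16];

  /* Load 8 bytes from each plane into the low halves of two registers, then
   * combine them with _mm_unpacklo_epi64, the same pattern the patch uses for
   * each of p4..q4: u lands in bits 0..63, v in bits 64..127, so one 16-wide
   * filter pass covers both planes. */
  __m128i u8 = _mm_loadl_epi64((const __m128i *)u_row);
  __m128i v8 = _mm_loadl_epi64((const __m128i *)v_row);
  __m128i uv = _mm_unpacklo_epi64(u8, v8);

  _mm_storeu_si128((__m128i *)packed, uv);
  for (int i = 0; i < 16; ++i)
    printf("%d%c", packed[i], i == 15 ? '\n' : ' ');
  /* Prints: 10 11 12 13 14 15 16 17 20 21 22 23 24 25 26 27 */
  return 0;
}

Compiles with any SSE2-capable x86 compiler (e.g. gcc -msse2 -std=c99). After
filtering, the patch splits the two halves back out with _mm_storel_epi64
stores to the u rows and loads at an 8-byte offset for the v rows.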