From c43af9a8a3adc7bd3888e746ce7b7bd581c476ae Mon Sep 17 00:00:00 2001 From: Jingning Han Date: Thu, 13 Jun 2013 11:07:12 -0700 Subject: [PATCH] Enable sse2 version of sad8x4/4x8 The encoding time for bus at CIF goes from 661s to 625s. This commit also enables the unit tests for sad8x4/4x8 in sad_test.cc (C for both sizes, SSE2 for 8x4, SSE for 4x8). Change-Id: If3d10ebb56bda584bdb69bcf056599d580b12cb1 --- test/sad_test.cc | 10 ++++++- vp9/common/vp9_rtcd_defs.sh | 4 +-- vp9/encoder/x86/vp9_sad_sse2.asm | 57 ++++++++++++++++++++++++++-------------- 3 files changed, 48 insertions(+), 23 deletions(-) diff --git a/test/sad_test.cc b/test/sad_test.cc index 9555a9a..7698c1e 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -308,6 +308,8 @@ const sad_m_by_n_fn_t sad_16x16_c_vp9 = vp9_sad16x16_c; const sad_m_by_n_fn_t sad_8x16_c_vp9 = vp9_sad8x16_c; const sad_m_by_n_fn_t sad_16x8_c_vp9 = vp9_sad16x8_c; const sad_m_by_n_fn_t sad_8x8_c_vp9 = vp9_sad8x8_c; +const sad_m_by_n_fn_t sad_8x4_c_vp9 = vp9_sad8x4_c; +const sad_m_by_n_fn_t sad_4x8_c_vp9 = vp9_sad4x8_c; const sad_m_by_n_fn_t sad_4x4_c_vp9 = vp9_sad4x4_c; #endif const sad_m_by_n_test_param_t c_tests[] = { @@ -325,6 +327,8 @@ const sad_m_by_n_test_param_t c_tests[] = { make_tuple(8, 16, sad_8x16_c_vp9), make_tuple(16, 8, sad_16x8_c_vp9), make_tuple(8, 8, sad_8x8_c_vp9), + make_tuple(8, 4, sad_8x4_c_vp9), + make_tuple(4, 8, sad_4x8_c_vp9), make_tuple(4, 4, sad_4x4_c_vp9), #endif }; @@ -404,8 +408,10 @@ INSTANTIATE_TEST_CASE_P(MMX, SADTest, ::testing::ValuesIn(mmx_tests)); #if HAVE_SSE #if CONFIG_VP9_ENCODER const sad_m_by_n_fn_t sad_4x4_sse_vp9 = vp9_sad4x4_sse; +const sad_m_by_n_fn_t sad_4x8_sse_vp9 = vp9_sad4x8_sse; INSTANTIATE_TEST_CASE_P(SSE, SADTest, ::testing::Values( - make_tuple(4, 4, sad_4x4_sse_vp9))); + make_tuple(4, 4, sad_4x4_sse_vp9), + make_tuple(4, 8, sad_4x8_sse_vp9))); const sad_n_by_n_by_4_fn_t sad_4x4x4d_sse = vp9_sad4x4x4d_sse; INSTANTIATE_TEST_CASE_P(SSE, SADx4Test, ::testing::Values( @@ -428,6 +434,7 @@ const sad_m_by_n_fn_t sad_16x16_sse2_vp9 
= vp9_sad16x16_sse2; const sad_m_by_n_fn_t sad_8x16_sse2_vp9 = vp9_sad8x16_sse2; const sad_m_by_n_fn_t sad_16x8_sse2_vp9 = vp9_sad16x8_sse2; const sad_m_by_n_fn_t sad_8x8_sse2_vp9 = vp9_sad8x8_sse2; +const sad_m_by_n_fn_t sad_8x4_sse2_vp9 = vp9_sad8x4_sse2; #endif const sad_m_by_n_test_param_t sse2_tests[] = { #if CONFIG_VP8_ENCODER @@ -444,6 +451,7 @@ const sad_m_by_n_test_param_t sse2_tests[] = { make_tuple(8, 16, sad_8x16_sse2_vp9), make_tuple(16, 8, sad_16x8_sse2_vp9), make_tuple(8, 8, sad_8x8_sse2_vp9), + make_tuple(8, 4, sad_8x4_sse2_vp9), #endif }; INSTANTIATE_TEST_CASE_P(SSE2, SADTest, ::testing::ValuesIn(sse2_tests)); diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index f281e08..8c78b7b 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -387,10 +387,10 @@ specialize vp9_sad8x8 mmx sse2 # TODO(jingning): need to covert these functions into mmx/sse2 form prototype unsigned int vp9_sad8x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp9_sad8x4 +specialize vp9_sad8x4 sse2 prototype unsigned int vp9_sad4x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp9_sad4x8 +specialize vp9_sad4x8 sse prototype unsigned int vp9_sad4x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" specialize vp9_sad4x4 mmx sse diff --git a/vp9/encoder/x86/vp9_sad_sse2.asm b/vp9/encoder/x86/vp9_sad_sse2.asm index ea92377..8fb7d41 100644 --- a/vp9/encoder/x86/vp9_sad_sse2.asm +++ b/vp9/encoder/x86/vp9_sad_sse2.asm @@ -166,29 +166,46 @@ cglobal sad8x%1, 4, 7, 5, src, src_stride, ref, ref_stride, \ INIT_XMM sse2 SAD8XN 16 ; sad8x16_sse2 SAD8XN 8 ; sad8x8_sse2 +SAD8XN 4 ; sad8x4_sse2 -; unsigned int vp9_sad4x4_sse(uint8_t *src, int src_stride, -; uint8_t *ref, int ref_stride); -INIT_MMX sse -cglobal sad4x4, 4, 4, 8, src, src_stride, ref, 
ref_stride +; unsigned int vp9_sad4x{4, 8}_sse(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD4XN 1 +cglobal sad4x%1, 4, 7, 7, src, src_stride, ref, ref_stride, \ + src_stride3, ref_stride3, n_rows movsxdifnidn src_strideq, src_strided movsxdifnidn ref_strideq, ref_strided - movd m0, [refq] - movd m1, [refq+ref_strideq] + lea src_stride3q, [src_strideq*3] + lea ref_stride3q, [ref_strideq*3] + mov n_rowsd, %1/4 + pxor m0, m0 + +.loop: + movd m1, [refq] + movd m2, [refq+ref_strideq] + movd m3, [refq+ref_strideq*2] + movd m4, [refq+ref_stride3q] + punpckldq m1, m2 + punpckldq m3, m4 movd m2, [srcq] - movd m3, [srcq+src_strideq] - lea refq, [refq+ref_strideq*2] - lea srcq, [srcq+src_strideq*2] - movd m4, [refq] - movd m5, [refq+ref_strideq] - movd m6, [srcq] - movd m7, [srcq+src_strideq] - punpckldq m0, m1 - punpckldq m2, m3 - punpckldq m4, m5 - punpckldq m6, m7 - psadbw m0, m2 - psadbw m4, m6 - paddd m0, m4 + movd m5, [srcq+src_strideq] + movd m4, [srcq+src_strideq*2] + movd m6, [srcq+src_stride3q] + punpckldq m2, m5 + punpckldq m4, m6 + psadbw m1, m2 + psadbw m3, m4 + lea refq, [refq+ref_strideq*4] + paddd m0, m1 + lea srcq, [srcq+src_strideq*4] + paddd m0, m3 + dec n_rowsd + jg .loop + movd eax, m0 RET +%endmacro + +INIT_MMX sse +SAD4XN 8 ; sad4x8_sse +SAD4XN 4 ; sad4x4_sse -- 2.7.4