From: levytamar82 Date: Thu, 24 Jul 2014 07:54:59 +0000 (-0700) Subject: Fix bug 806 X-Git-Tag: v1.4.0~1070^2 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=af10457e02f608b79db1e2883b7780721cbf715d;p=platform%2Fupstream%2Flibvpx.git Fix bug 806 in the function sad32x32x4d and sad64x64x4d the source is aligned to 16 bytes and not to 32 bytes - the load is now unaligned. Change-Id: I922fdba56d0936b5cf72e4503519f185645a168c --- diff --git a/test/sad_test.cc b/test/sad_test.cc index f07a989..e63770b 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -640,19 +640,9 @@ INSTANTIATE_TEST_CASE_P(SSE3, SADTest, ::testing::Values( #if HAVE_AVX2 #if CONFIG_VP9_ENCODER -// TODO(jzern): these prototypes can be removed after the avx2 versions are -// reenabled in vp9_rtcd_defs.pl. -extern "C" { -void vp9_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_ptr[], int ref_stride, - unsigned int *sad_array); -void vp9_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_ptr[], int ref_stride, - unsigned int *sad_array); -} const SadMxNx4Func sad_64x64x4d_avx2 = vp9_sad64x64x4d_avx2; const SadMxNx4Func sad_32x32x4d_avx2 = vp9_sad32x32x4d_avx2; -INSTANTIATE_TEST_CASE_P(DISABLED_AVX2, SADx4Test, ::testing::Values( +INSTANTIATE_TEST_CASE_P(AVX2, SADx4Test, ::testing::Values( make_tuple(32, 32, sad_32x32x4d_avx2), make_tuple(64, 64, sad_64x64x4d_avx2))); #endif // CONFIG_VP9_ENCODER diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 8d91791..36bdc8e 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -653,7 +653,7 @@ add_proto qw/void vp9_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const specialize qw/vp9_sad4x4x8 sse4/; add_proto qw/void vp9_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; -specialize qw/vp9_sad64x64x4d sse2/; +specialize qw/vp9_sad64x64x4d sse2 avx2/; add_proto qw/void vp9_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; specialize qw/vp9_sad32x64x4d sse2/; @@ -668,7 +668,7 @@ add_proto qw/void vp9_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, co specialize qw/vp9_sad16x32x4d sse2/; add_proto qw/void vp9_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; -specialize qw/vp9_sad32x32x4d sse2/; +specialize qw/vp9_sad32x32x4d sse2 avx2/; add_proto qw/void vp9_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; specialize qw/vp9_sad16x16x4d sse2/; diff --git a/vp9/encoder/x86/vp9_sad4d_intrin_avx2.c b/vp9/encoder/x86/vp9_sad4d_intrin_avx2.c index f31b176..1feed62 100644 --- a/vp9/encoder/x86/vp9_sad4d_intrin_avx2.c +++ b/vp9/encoder/x86/vp9_sad4d_intrin_avx2.c @@ -31,7 +31,7 @@ void vp9_sad32x32x4d_avx2(uint8_t *src, sum_ref3 = _mm256_set1_epi16(0); for (i = 0; i < 32 ; i++) { // load src and all refs - src_reg = _mm256_load_si256((__m256i *)(src)); + src_reg = _mm256_loadu_si256((__m256i *)(src)); ref0_reg = _mm256_loadu_si256((__m256i *) (ref0)); ref1_reg = _mm256_loadu_si256((__m256i *) (ref1)); ref2_reg = _mm256_loadu_si256((__m256i *) (ref2)); @@ -103,8 +103,8 @@ void vp9_sad64x64x4d_avx2(uint8_t *src, sum_ref3 = _mm256_set1_epi16(0); for (i = 0; i < 64 ; i++) { // load 64 bytes from src and all refs - src_reg = _mm256_load_si256((__m256i *)(src)); - srcnext_reg = _mm256_load_si256((__m256i *)(src + 32)); + src_reg = _mm256_loadu_si256((__m256i *)(src)); + srcnext_reg = _mm256_loadu_si256((__m256i *)(src + 32)); ref0_reg = _mm256_loadu_si256((__m256i *) (ref0)); ref0next_reg = _mm256_loadu_si256((__m256i *) (ref0 + 32)); ref1_reg = _mm256_loadu_si256((__m256i *) (ref1));