From: Johann Date: Fri, 15 May 2015 18:52:03 +0000 (-0700) Subject: Move variance functions to vpx_dsp X-Git-Tag: v1.5.0~651^2 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=c3bdffb0a508ad08d5dfa613c029f368d4293d4c;p=platform%2Fupstream%2Flibvpx.git Move variance functions to vpx_dsp subpel functions will be moved in another patch. Change-Id: Idb2e049bad0b9b32ac42cc7731cd6903de2826ce --- diff --git a/test/variance_test.cc b/test/variance_test.cc index e4e27af..60ca815 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -21,17 +21,21 @@ #include "vpx/vpx_integer.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" -#if CONFIG_VP8_ENCODER -# include "./vp8_rtcd.h" -# include "vp8/common/variance.h" -#endif #if CONFIG_VP9_ENCODER # include "./vp9_rtcd.h" # include "vp9/encoder/vp9_variance.h" -#endif +#endif // CONFIG_VP9_ENCODER +#include "./vpx_dsp_rtcd.h" namespace { +typedef unsigned int (*VarianceMxNFunc)(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse); +typedef unsigned int (*Get4x4SSEFunc)(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride); + + using ::std::tr1::get; using ::std::tr1::make_tuple; using ::std::tr1::tuple; @@ -50,7 +54,6 @@ static unsigned int variance_ref(const uint8_t *src, const uint8_t *ref, int ref_stride_coeff, uint32_t *sse_ptr, bool use_high_bit_depth_, vpx_bit_depth_t bit_depth) { -#if CONFIG_VP9_HIGHBITDEPTH int64_t se = 0; uint64_t sse = 0; const int w = 1 << l2w; @@ -63,11 +66,13 @@ static unsigned int variance_ref(const uint8_t *src, const uint8_t *ref, src[w * y * src_stride_coeff + x]; se += diff; sse += diff * diff; +#if CONFIG_VP9_HIGHBITDEPTH } else { diff = CONVERT_TO_SHORTPTR(ref)[w * y * ref_stride_coeff + x] - CONVERT_TO_SHORTPTR(src)[w * y * src_stride_coeff + x]; se += diff; sse += diff * diff; +#endif // CONFIG_VP9_HIGHBITDEPTH } } } @@ -75,20 +80,6 @@ static unsigned int variance_ref(const uint8_t *src, const uint8_t *ref, sse = ROUND_POWER_OF_TWO(sse, 2 * (bit_depth - 8)); se = ROUND_POWER_OF_TWO(se, bit_depth - 8); } -#else - int se = 0; - unsigned int sse = 0; - const int w = 1 << l2w; - const int h = 1 << l2h; - for (int y = 0; y < h; y++) { - for (int x = 0; x < w; x++) { - int diff = ref[w * y * ref_stride_coeff + x] - - src[w * y * src_stride_coeff + x]; - se += diff; - sse += diff * diff; - } - } -#endif // CONFIG_VP9_HIGHBITDEPTH *sse_ptr = sse; return sse - (((int64_t) se * se) >> (l2w + l2h)); } @@ -98,7 +89,6 @@ static unsigned int subpel_variance_ref(const uint8_t *ref, const uint8_t *src, unsigned int *sse_ptr, bool use_high_bit_depth_, vpx_bit_depth_t bit_depth) { -#if CONFIG_VP9_HIGHBITDEPTH int64_t se = 0; uint64_t sse = 0; const int w = 1 << l2w; @@ -117,6 +107,7 @@ static unsigned int subpel_variance_ref(const uint8_t *ref, const uint8_t *src, const int diff = r - src[w * y + x]; se += diff; sse += diff * diff; +#if CONFIG_VP9_HIGHBITDEPTH } else { uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref); uint16_t *src16 = CONVERT_TO_SHORTPTR(src); @@ -130,6 +121,7 @@ static unsigned int subpel_variance_ref(const uint8_t *ref, const uint8_t *src, const int diff = r - src16[w * y + x]; se += diff; sse += diff * diff; +#endif // CONFIG_VP9_HIGHBITDEPTH } } } @@ -137,27 +129,6 @@ static unsigned int subpel_variance_ref(const uint8_t *ref, const uint8_t *src, sse = ROUND_POWER_OF_TWO(sse, 2 * (bit_depth - 8)); se = ROUND_POWER_OF_TWO(se, bit_depth - 8); } -#else - int se = 0; - unsigned int sse = 0; - const int w = 1 << l2w; - const int h = 1 << l2h; - for 
(int y = 0; y < h; y++) { - for (int x = 0; x < w; x++) { - // Bilinear interpolation at a 16th pel step. - const int a1 = ref[(w + 1) * (y + 0) + x + 0]; - const int a2 = ref[(w + 1) * (y + 0) + x + 1]; - const int b1 = ref[(w + 1) * (y + 1) + x + 0]; - const int b2 = ref[(w + 1) * (y + 1) + x + 1]; - const int a = a1 + (((a2 - a1) * xoff + 8) >> 4); - const int b = b1 + (((b2 - b1) * xoff + 8) >> 4); - const int r = a + (((b - a) * yoff + 8) >> 4); - const int diff = r - src[w * y + x]; - se += diff; - sse += diff * diff; - } - } -#endif // CONFIG_VP9_HIGHBITDEPTH *sse_ptr = sse; return sse - (((int64_t) se * se) >> (l2w + l2h)); } @@ -229,36 +200,30 @@ class VarianceTest rnd_.Reset(ACMRandom::DeterministicSeed()); block_size_ = width_ * height_; -#if CONFIG_VP9_HIGHBITDEPTH if (!use_high_bit_depth_) { src_ = reinterpret_cast(vpx_memalign(16, block_size_ * 2)); ref_ = new uint8_t[block_size_ * 2]; +#if CONFIG_VP9_HIGHBITDEPTH } else { src_ = CONVERT_TO_BYTEPTR(reinterpret_cast( vpx_memalign(16, block_size_ * 2 * sizeof(uint16_t)))); ref_ = CONVERT_TO_BYTEPTR(new uint16_t[block_size_ * 2]); +#endif // CONFIG_VP9_HIGHBITDEPTH } -#else - src_ = reinterpret_cast(vpx_memalign(16, block_size_ * 2)); - ref_ = new uint8_t[block_size_ * 2]; -#endif ASSERT_TRUE(src_ != NULL); ASSERT_TRUE(ref_ != NULL); } virtual void TearDown() { -#if CONFIG_VP9_HIGHBITDEPTH if (!use_high_bit_depth_) { vpx_free(src_); delete[] ref_; +#if CONFIG_VP9_HIGHBITDEPTH } else { vpx_free(CONVERT_TO_SHORTPTR(src_)); delete[] CONVERT_TO_SHORTPTR(ref_); +#endif // CONFIG_VP9_HIGHBITDEPTH } -#else - vpx_free(src_); - delete[] ref_; -#endif libvpx_test::ClearSystemState(); } @@ -283,27 +248,23 @@ class VarianceTest template void VarianceTest::ZeroTest() { for (int i = 0; i <= 255; ++i) { -#if CONFIG_VP9_HIGHBITDEPTH if (!use_high_bit_depth_) { memset(src_, i, block_size_); +#if CONFIG_VP9_HIGHBITDEPTH } else { vpx_memset16(CONVERT_TO_SHORTPTR(src_), i << (bit_depth_ - 8), block_size_); +#endif // CONFIG_VP9_HIGHBITDEPTH } -#else - memset(src_, i, block_size_); -#endif for (int j = 0; j <= 255; ++j) { -#if CONFIG_VP9_HIGHBITDEPTH if (!use_high_bit_depth_) { memset(ref_, j, block_size_); +#if CONFIG_VP9_HIGHBITDEPTH } else { vpx_memset16(CONVERT_TO_SHORTPTR(ref_), j << (bit_depth_ - 8), block_size_); +#endif // CONFIG_VP9_HIGHBITDEPTH } -#else - memset(ref_, j, block_size_); -#endif unsigned int sse; unsigned int var; ASM_REGISTER_STATE_CHECK( @@ -317,18 +278,15 @@ template void VarianceTest::RefTest() { for (int i = 0; i < 10; ++i) { for (int j = 0; j < block_size_; j++) { -#if CONFIG_VP9_HIGHBITDEPTH if (!use_high_bit_depth_) { src_[j] = rnd_.Rand8(); ref_[j] = rnd_.Rand8(); +#if CONFIG_VP9_HIGHBITDEPTH } else { CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() && mask_; CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() && mask_; +#endif // CONFIG_VP9_HIGHBITDEPTH } -#else - src_[j] = rnd_.Rand8(); - ref_[j] = rnd_.Rand8(); -#endif } unsigned int sse1, sse2; unsigned int var1; @@ -352,18 +310,15 @@ void VarianceTest::RefStrideTest() { for (int j = 0; j < block_size_; j++) { int ref_ind = (j / width_) * ref_stride_coeff * width_ + j % width_; int src_ind = (j / width_) * src_stride_coeff * width_ + j % width_; -#if CONFIG_VP9_HIGHBITDEPTH if (!use_high_bit_depth_) { src_[src_ind] = rnd_.Rand8(); ref_[ref_ind] = rnd_.Rand8(); +#if CONFIG_VP9_HIGHBITDEPTH } else { CONVERT_TO_SHORTPTR(src_)[src_ind] = rnd_.Rand16() && mask_; CONVERT_TO_SHORTPTR(ref_)[ref_ind] = rnd_.Rand16() && mask_; +#endif // CONFIG_VP9_HIGHBITDEPTH } -#else - 
src_[src_ind] = rnd_.Rand8(); - ref_[ref_ind] = rnd_.Rand8(); -#endif } unsigned int sse1, sse2; unsigned int var1; @@ -383,22 +338,18 @@ void VarianceTest::RefStrideTest() { template void VarianceTest::OneQuarterTest() { const int half = block_size_ / 2; -#if CONFIG_VP9_HIGHBITDEPTH if (!use_high_bit_depth_) { memset(src_, 255, block_size_); memset(ref_, 255, half); memset(ref_ + half, 0, half); +#if CONFIG_VP9_HIGHBITDEPTH } else { vpx_memset16(CONVERT_TO_SHORTPTR(src_), 255 << (bit_depth_ - 8), block_size_); vpx_memset16(CONVERT_TO_SHORTPTR(ref_), 255 << (bit_depth_ - 8), half); vpx_memset16(CONVERT_TO_SHORTPTR(ref_) + half, 0, half); +#endif // CONFIG_VP9_HIGHBITDEPTH } -#else - memset(src_, 255, block_size_); - memset(ref_, 255, half); - memset(ref_ + half, 0, half); -#endif unsigned int sse; unsigned int var; ASM_REGISTER_STATE_CHECK(var = variance_(src_, width_, ref_, width_, &sse)); @@ -406,7 +357,6 @@ void VarianceTest::OneQuarterTest() { EXPECT_EQ(expected, var); } -#if CONFIG_VP8_ENCODER template class MseTest : public ::testing::TestWithParam > { @@ -500,9 +450,7 @@ void MseTest::MaxTest_sse() { const unsigned int expected = block_size_ * 255 * 255; EXPECT_EQ(expected, var); } -#endif -#if CONFIG_VP9_ENCODER unsigned int subpel_avg_variance_ref(const uint8_t *ref, const uint8_t *src, const uint8_t *second_pred, @@ -511,7 +459,6 @@ unsigned int subpel_avg_variance_ref(const uint8_t *ref, unsigned int *sse_ptr, bool use_high_bit_depth, vpx_bit_depth_t bit_depth) { -#if CONFIG_VP9_HIGHBITDEPTH int64_t se = 0; uint64_t sse = 0; const int w = 1 << l2w; @@ -530,6 +477,7 @@ unsigned int subpel_avg_variance_ref(const uint8_t *ref, const int diff = ((r + second_pred[w * y + x] + 1) >> 1) - src[w * y + x]; se += diff; sse += diff * diff; +#if CONFIG_VP9_HIGHBITDEPTH } else { uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref); uint16_t *src16 = CONVERT_TO_SHORTPTR(src); @@ -544,6 +492,7 @@ unsigned int subpel_avg_variance_ref(const uint8_t *ref, const int diff = ((r + sec16[w * y + x] + 1) >> 1) - src16[w * y + x]; se += diff; sse += diff * diff; +#endif // CONFIG_VP9_HIGHBITDEPTH } } } @@ -551,27 +500,6 @@ unsigned int subpel_avg_variance_ref(const uint8_t *ref, sse = ROUND_POWER_OF_TWO(sse, 2*(bit_depth-8)); se = ROUND_POWER_OF_TWO(se, bit_depth-8); } -#else - int se = 0; - unsigned int sse = 0; - const int w = 1 << l2w; - const int h = 1 << l2h; - for (int y = 0; y < h; y++) { - for (int x = 0; x < w; x++) { - // bilinear interpolation at a 16th pel step - const int a1 = ref[(w + 1) * (y + 0) + x + 0]; - const int a2 = ref[(w + 1) * (y + 0) + x + 1]; - const int b1 = ref[(w + 1) * (y + 1) + x + 0]; - const int b2 = ref[(w + 1) * (y + 1) + x + 1]; - const int a = a1 + (((a2 - a1) * xoff + 8) >> 4); - const int b = b1 + (((b2 - b1) * xoff + 8) >> 4); - const int r = a + (((b - a) * yoff + 8) >> 4); - const int diff = ((r + second_pred[w * y + x] + 1) >> 1) - src[w * y + x]; - se += diff; - sse += diff * diff; - } - } -#endif // CONFIG_VP9_HIGHBITDEPTH *sse_ptr = sse; return sse - (((int64_t) se * se) >> (l2w + l2h)); } @@ -600,11 +528,11 @@ class SubpelVarianceTest rnd_.Reset(ACMRandom::DeterministicSeed()); block_size_ = width_ * height_; -#if CONFIG_VP9_HIGHBITDEPTH if (!use_high_bit_depth_) { src_ = reinterpret_cast(vpx_memalign(16, block_size_)); sec_ = reinterpret_cast(vpx_memalign(16, block_size_)); ref_ = new uint8_t[block_size_ + width_ + height_ + 1]; +#if CONFIG_VP9_HIGHBITDEPTH } else { src_ = CONVERT_TO_BYTEPTR( reinterpret_cast( @@ -614,33 +542,25 @@ class SubpelVarianceTest 
vpx_memalign(16, block_size_*sizeof(uint16_t)))); ref_ = CONVERT_TO_BYTEPTR( new uint16_t[block_size_ + width_ + height_ + 1]); - } -#else - src_ = reinterpret_cast(vpx_memalign(16, block_size_)); - sec_ = reinterpret_cast(vpx_memalign(16, block_size_)); - ref_ = new uint8_t[block_size_ + width_ + height_ + 1]; #endif // CONFIG_VP9_HIGHBITDEPTH + } ASSERT_TRUE(src_ != NULL); ASSERT_TRUE(sec_ != NULL); ASSERT_TRUE(ref_ != NULL); } virtual void TearDown() { -#if CONFIG_VP9_HIGHBITDEPTH if (!use_high_bit_depth_) { vpx_free(src_); delete[] ref_; vpx_free(sec_); +#if CONFIG_VP9_HIGHBITDEPTH } else { vpx_free(CONVERT_TO_SHORTPTR(src_)); delete[] CONVERT_TO_SHORTPTR(ref_); vpx_free(CONVERT_TO_SHORTPTR(sec_)); +#endif // CONFIG_VP9_HIGHBITDEPTH } -#else - vpx_free(src_); - delete[] ref_; - vpx_free(sec_); -#endif libvpx_test::ClearSystemState(); } @@ -664,7 +584,6 @@ template void SubpelVarianceTest::RefTest() { for (int x = 0; x < 16; ++x) { for (int y = 0; y < 16; ++y) { -#if CONFIG_VP9_HIGHBITDEPTH if (!use_high_bit_depth_) { for (int j = 0; j < block_size_; j++) { src_[j] = rnd_.Rand8(); @@ -672,6 +591,7 @@ void SubpelVarianceTest::RefTest() { for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) { ref_[j] = rnd_.Rand8(); } +#if CONFIG_VP9_HIGHBITDEPTH } else { for (int j = 0; j < block_size_; j++) { CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask_; @@ -679,15 +599,8 @@ void SubpelVarianceTest::RefTest() { for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) { CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask_; } - } -#else - for (int j = 0; j < block_size_; j++) { - src_[j] = rnd_.Rand8(); - } - for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) { - ref_[j] = rnd_.Rand8(); - } #endif // CONFIG_VP9_HIGHBITDEPTH + } unsigned int sse1, sse2; unsigned int var1; ASM_REGISTER_STATE_CHECK(var1 = subpel_variance_(ref_, width_ + 1, x, y, @@ -710,25 +623,20 @@ void SubpelVarianceTest::ExtremeRefTest() { for (int x = 0; x < 16; ++x) { for (int y = 0; y < 16; ++y) { const int half = block_size_ / 2; -#if CONFIG_VP9_HIGHBITDEPTH if (!use_high_bit_depth_) { memset(src_, 0, half); memset(src_ + half, 255, half); memset(ref_, 255, half); memset(ref_ + half, 0, half + width_ + height_ + 1); +#if CONFIG_VP9_HIGHBITDEPTH } else { vpx_memset16(CONVERT_TO_SHORTPTR(src_), mask_, half); vpx_memset16(CONVERT_TO_SHORTPTR(src_) + half, 0, half); vpx_memset16(CONVERT_TO_SHORTPTR(ref_), 0, half); vpx_memset16(CONVERT_TO_SHORTPTR(ref_) + half, mask_, half + width_ + height_ + 1); - } -#else - memset(src_, 0, half); - memset(src_ + half, 255, half); - memset(ref_, 255, half); - memset(ref_ + half, 0, half + width_ + height_ + 1); #endif // CONFIG_VP9_HIGHBITDEPTH + } unsigned int sse1, sse2; unsigned int var1; ASM_REGISTER_STATE_CHECK( @@ -742,11 +650,11 @@ void SubpelVarianceTest::ExtremeRefTest() { } } +#if CONFIG_VP9_ENCODER template<> void SubpelVarianceTest::RefTest() { for (int x = 0; x < 16; ++x) { for (int y = 0; y < 16; ++y) { -#if CONFIG_VP9_HIGHBITDEPTH if (!use_high_bit_depth_) { for (int j = 0; j < block_size_; j++) { src_[j] = rnd_.Rand8(); @@ -755,6 +663,7 @@ void SubpelVarianceTest::RefTest() { for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) { ref_[j] = rnd_.Rand8(); } +#if CONFIG_VP9_HIGHBITDEPTH } else { for (int j = 0; j < block_size_; j++) { CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask_; @@ -763,16 +672,8 @@ void SubpelVarianceTest::RefTest() { for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) { CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & 
mask_; } +#endif // CONFIG_VP9_HIGHBITDEPTH } -#else - for (int j = 0; j < block_size_; j++) { - src_[j] = rnd_.Rand8(); - sec_[j] = rnd_.Rand8(); - } - for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) { - ref_[j] = rnd_.Rand8(); - } -#endif unsigned int sse1, sse2; unsigned int var1; ASM_REGISTER_STATE_CHECK( @@ -788,272 +689,407 @@ void SubpelVarianceTest::RefTest() { } } } - #endif // CONFIG_VP9_ENCODER -// ----------------------------------------------------------------------------- -// VP8 test cases. - -namespace vp8 { - -#if CONFIG_VP8_ENCODER -typedef unsigned int (*vp8_sse_fn_t)(const unsigned char *src_ptr, - int source_stride, const unsigned char *ref_ptr, int ref_stride); - -typedef MseTest VP8SseTest; -typedef MseTest VP8MseTest; -typedef VarianceTest VP8VarianceTest; - -TEST_P(VP8SseTest, Ref_sse) { RefTest_sse(); } -TEST_P(VP8SseTest, Max_sse) { MaxTest_sse(); } -TEST_P(VP8MseTest, Ref_mse) { RefTest_mse(); } -TEST_P(VP8MseTest, Max_mse) { MaxTest_mse(); } -TEST_P(VP8VarianceTest, Zero) { ZeroTest(); } -TEST_P(VP8VarianceTest, Ref) { RefTest(); } -TEST_P(VP8VarianceTest, OneQuarter) { OneQuarterTest(); } +typedef MseTest VpxSseTest; +typedef MseTest VpxMseTest; +typedef VarianceTest VpxVarianceTest; + +TEST_P(VpxSseTest, Ref_sse) { RefTest_sse(); } +TEST_P(VpxSseTest, Max_sse) { MaxTest_sse(); } +TEST_P(VpxMseTest, Ref_mse) { RefTest_mse(); } +TEST_P(VpxMseTest, Max_mse) { MaxTest_mse(); } +TEST_P(VpxVarianceTest, Zero) { ZeroTest(); } +TEST_P(VpxVarianceTest, Ref) { RefTest(); } +TEST_P(VpxVarianceTest, RefStride) { RefStrideTest(); } +TEST_P(VpxVarianceTest, OneQuarter) { OneQuarterTest(); } +TEST_P(SumOfSquaresTest, Const) { ConstTest(); } +TEST_P(SumOfSquaresTest, Ref) { RefTest(); } -const vp8_sse_fn_t get4x4sse_cs_c = vp8_get4x4sse_cs_c; -INSTANTIATE_TEST_CASE_P( - C, VP8SseTest, - ::testing::Values(make_tuple(2, 2, get4x4sse_cs_c))); +INSTANTIATE_TEST_CASE_P(C, SumOfSquaresTest, + ::testing::Values(vpx_get_mb_ss_c)); + +const Get4x4SSEFunc get4x4sse_cs_c = vpx_get4x4sse_cs_c; +INSTANTIATE_TEST_CASE_P(C, VpxSseTest, + ::testing::Values(make_tuple(2, 2, get4x4sse_cs_c))); + +const VarianceMxNFunc mse16x16_c = vpx_mse16x16_c; +const VarianceMxNFunc mse16x8_c = vpx_mse16x8_c; +const VarianceMxNFunc mse8x16_c = vpx_mse8x16_c; +const VarianceMxNFunc mse8x8_c = vpx_mse8x8_c; +INSTANTIATE_TEST_CASE_P(C, VpxMseTest, + ::testing::Values(make_tuple(4, 4, mse16x16_c), + make_tuple(4, 3, mse16x8_c), + make_tuple(3, 4, mse8x16_c), + make_tuple(3, 3, mse8x8_c))); + +const VarianceMxNFunc variance64x64_c = vpx_variance64x64_c; +const VarianceMxNFunc variance64x32_c = vpx_variance64x32_c; +const VarianceMxNFunc variance32x64_c = vpx_variance32x64_c; +const VarianceMxNFunc variance32x32_c = vpx_variance32x32_c; +const VarianceMxNFunc variance32x16_c = vpx_variance32x16_c; +const VarianceMxNFunc variance16x32_c = vpx_variance16x32_c; +const VarianceMxNFunc variance16x16_c = vpx_variance16x16_c; +const VarianceMxNFunc variance16x8_c = vpx_variance16x8_c; +const VarianceMxNFunc variance8x16_c = vpx_variance8x16_c; +const VarianceMxNFunc variance8x8_c = vpx_variance8x8_c; +const VarianceMxNFunc variance8x4_c = vpx_variance8x4_c; +const VarianceMxNFunc variance4x8_c = vpx_variance4x8_c; +const VarianceMxNFunc variance4x4_c = vpx_variance4x4_c; -const vp8_variance_fn_t mse16x16_c = vp8_mse16x16_c; INSTANTIATE_TEST_CASE_P( - C, VP8MseTest, - ::testing::Values(make_tuple(4, 4, mse16x16_c))); - -const vp8_variance_fn_t variance4x4_c = vp8_variance4x4_c; -const vp8_variance_fn_t 
variance8x8_c = vp8_variance8x8_c; -const vp8_variance_fn_t variance8x16_c = vp8_variance8x16_c; -const vp8_variance_fn_t variance16x8_c = vp8_variance16x8_c; -const vp8_variance_fn_t variance16x16_c = vp8_variance16x16_c; -INSTANTIATE_TEST_CASE_P( - C, VP8VarianceTest, - ::testing::Values(make_tuple(2, 2, variance4x4_c, 0), - make_tuple(3, 3, variance8x8_c, 0), - make_tuple(3, 4, variance8x16_c, 0), + C, VpxVarianceTest, + ::testing::Values(make_tuple(6, 6, variance64x64_c, 0), + make_tuple(6, 5, variance64x32_c, 0), + make_tuple(5, 6, variance32x64_c, 0), + make_tuple(5, 5, variance32x32_c, 0), + make_tuple(5, 4, variance32x16_c, 0), + make_tuple(4, 5, variance16x32_c, 0), + make_tuple(4, 4, variance16x16_c, 0), make_tuple(4, 3, variance16x8_c, 0), - make_tuple(4, 4, variance16x16_c, 0))); + make_tuple(3, 4, variance8x16_c, 0), + make_tuple(3, 3, variance8x8_c, 0), + make_tuple(3, 2, variance8x4_c, 0), + make_tuple(2, 3, variance4x8_c, 0), + make_tuple(2, 2, variance4x4_c, 0))); -#if HAVE_NEON -const vp8_sse_fn_t get4x4sse_cs_neon = vp8_get4x4sse_cs_neon; -INSTANTIATE_TEST_CASE_P( - NEON, VP8SseTest, - ::testing::Values(make_tuple(2, 2, get4x4sse_cs_neon))); +#if CONFIG_VP9_HIGHBITDEPTH +typedef MseTest VpxHBDMseTest; +typedef VarianceTest VpxHBDVarianceTest; + +TEST_P(VpxHBDMseTest, Ref_mse) { RefTest_mse(); } +TEST_P(VpxHBDMseTest, Max_mse) { MaxTest_mse(); } +TEST_P(VpxHBDVarianceTest, Zero) { ZeroTest(); } +TEST_P(VpxHBDVarianceTest, Ref) { RefTest(); } +TEST_P(VpxHBDVarianceTest, RefStride) { RefStrideTest(); } +TEST_P(VpxHBDVarianceTest, OneQuarter) { OneQuarterTest(); } + +/* TODO(debargha): This test does not support the highbd version +const VarianceMxNFunc highbd_12_mse16x16_c = vpx_highbd_12_mse16x16_c; +const VarianceMxNFunc highbd_12_mse16x8_c = vpx_highbd_12_mse16x8_c; +const VarianceMxNFunc highbd_12_mse8x16_c = vpx_highbd_12_mse8x16_c; +const VarianceMxNFunc highbd_12_mse8x8_c = vpx_highbd_12_mse8x8_c; + +const VarianceMxNFunc highbd_10_mse16x16_c = vpx_highbd_10_mse16x16_c; +const VarianceMxNFunc highbd_10_mse16x8_c = vpx_highbd_10_mse16x8_c; +const VarianceMxNFunc highbd_10_mse8x16_c = vpx_highbd_10_mse8x16_c; +const VarianceMxNFunc highbd_10_mse8x8_c = vpx_highbd_10_mse8x8_c; + +const VarianceMxNFunc highbd_8_mse16x16_c = vpx_highbd_8_mse16x16_c; +const VarianceMxNFunc highbd_8_mse16x8_c = vpx_highbd_8_mse16x8_c; +const VarianceMxNFunc highbd_8_mse8x16_c = vpx_highbd_8_mse8x16_c; +const VarianceMxNFunc highbd_8_mse8x8_c = vpx_highbd_8_mse8x8_c; -const vp8_variance_fn_t mse16x16_neon = vp8_mse16x16_neon; INSTANTIATE_TEST_CASE_P( - NEON, VP8MseTest, - ::testing::Values(make_tuple(4, 4, mse16x16_neon))); - -const vp8_variance_fn_t variance8x8_neon = vp8_variance8x8_neon; -const vp8_variance_fn_t variance8x16_neon = vp8_variance8x16_neon; -const vp8_variance_fn_t variance16x8_neon = vp8_variance16x8_neon; -const vp8_variance_fn_t variance16x16_neon = vp8_variance16x16_neon; + C, VpxHBDMseTest, ::testing::Values(make_tuple(4, 4, highbd_12_mse16x16_c), + make_tuple(4, 4, highbd_12_mse16x8_c), + make_tuple(4, 4, highbd_12_mse8x16_c), + make_tuple(4, 4, highbd_12_mse8x8_c), + make_tuple(4, 4, highbd_10_mse16x16_c), + make_tuple(4, 4, highbd_10_mse16x8_c), + make_tuple(4, 4, highbd_10_mse8x16_c), + make_tuple(4, 4, highbd_10_mse8x8_c), + make_tuple(4, 4, highbd_8_mse16x16_c), + make_tuple(4, 4, highbd_8_mse16x8_c), + make_tuple(4, 4, highbd_8_mse8x16_c), + make_tuple(4, 4, highbd_8_mse8x8_c))); +*/ + + +const VarianceMxNFunc highbd_12_variance64x64_c = 
vpx_highbd_12_variance64x64_c; +const VarianceMxNFunc highbd_12_variance64x32_c = vpx_highbd_12_variance64x32_c; +const VarianceMxNFunc highbd_12_variance32x64_c = vpx_highbd_12_variance32x64_c; +const VarianceMxNFunc highbd_12_variance32x32_c = vpx_highbd_12_variance32x32_c; +const VarianceMxNFunc highbd_12_variance32x16_c = vpx_highbd_12_variance32x16_c; +const VarianceMxNFunc highbd_12_variance16x32_c = vpx_highbd_12_variance16x32_c; +const VarianceMxNFunc highbd_12_variance16x16_c = vpx_highbd_12_variance16x16_c; +const VarianceMxNFunc highbd_12_variance16x8_c = vpx_highbd_12_variance16x8_c; +const VarianceMxNFunc highbd_12_variance8x16_c = vpx_highbd_12_variance8x16_c; +const VarianceMxNFunc highbd_12_variance8x8_c = vpx_highbd_12_variance8x8_c; +const VarianceMxNFunc highbd_12_variance8x4_c = vpx_highbd_12_variance8x4_c; +const VarianceMxNFunc highbd_12_variance4x8_c = vpx_highbd_12_variance4x8_c; +const VarianceMxNFunc highbd_12_variance4x4_c = vpx_highbd_12_variance4x4_c; + +const VarianceMxNFunc highbd_10_variance64x64_c = vpx_highbd_10_variance64x64_c; +const VarianceMxNFunc highbd_10_variance64x32_c = vpx_highbd_10_variance64x32_c; +const VarianceMxNFunc highbd_10_variance32x64_c = vpx_highbd_10_variance32x64_c; +const VarianceMxNFunc highbd_10_variance32x32_c = vpx_highbd_10_variance32x32_c; +const VarianceMxNFunc highbd_10_variance32x16_c = vpx_highbd_10_variance32x16_c; +const VarianceMxNFunc highbd_10_variance16x32_c = vpx_highbd_10_variance16x32_c; +const VarianceMxNFunc highbd_10_variance16x16_c = vpx_highbd_10_variance16x16_c; +const VarianceMxNFunc highbd_10_variance16x8_c = vpx_highbd_10_variance16x8_c; +const VarianceMxNFunc highbd_10_variance8x16_c = vpx_highbd_10_variance8x16_c; +const VarianceMxNFunc highbd_10_variance8x8_c = vpx_highbd_10_variance8x8_c; +const VarianceMxNFunc highbd_10_variance8x4_c = vpx_highbd_10_variance8x4_c; +const VarianceMxNFunc highbd_10_variance4x8_c = vpx_highbd_10_variance4x8_c; +const VarianceMxNFunc highbd_10_variance4x4_c = vpx_highbd_10_variance4x4_c; + +const VarianceMxNFunc highbd_8_variance64x64_c = vpx_highbd_8_variance64x64_c; +const VarianceMxNFunc highbd_8_variance64x32_c = vpx_highbd_8_variance64x32_c; +const VarianceMxNFunc highbd_8_variance32x64_c = vpx_highbd_8_variance32x64_c; +const VarianceMxNFunc highbd_8_variance32x32_c = vpx_highbd_8_variance32x32_c; +const VarianceMxNFunc highbd_8_variance32x16_c = vpx_highbd_8_variance32x16_c; +const VarianceMxNFunc highbd_8_variance16x32_c = vpx_highbd_8_variance16x32_c; +const VarianceMxNFunc highbd_8_variance16x16_c = vpx_highbd_8_variance16x16_c; +const VarianceMxNFunc highbd_8_variance16x8_c = vpx_highbd_8_variance16x8_c; +const VarianceMxNFunc highbd_8_variance8x16_c = vpx_highbd_8_variance8x16_c; +const VarianceMxNFunc highbd_8_variance8x8_c = vpx_highbd_8_variance8x8_c; +const VarianceMxNFunc highbd_8_variance8x4_c = vpx_highbd_8_variance8x4_c; +const VarianceMxNFunc highbd_8_variance4x8_c = vpx_highbd_8_variance4x8_c; +const VarianceMxNFunc highbd_8_variance4x4_c = vpx_highbd_8_variance4x4_c; INSTANTIATE_TEST_CASE_P( - NEON, VP8VarianceTest, - ::testing::Values(make_tuple(3, 3, variance8x8_neon, 0), - make_tuple(3, 4, variance8x16_neon, 0), - make_tuple(4, 3, variance16x8_neon, 0), - make_tuple(4, 4, variance16x16_neon, 0))); -#endif + C, VpxHBDVarianceTest, + ::testing::Values(make_tuple(6, 6, highbd_12_variance64x64_c, 12), + make_tuple(6, 5, highbd_12_variance64x32_c, 12), + make_tuple(5, 6, highbd_12_variance32x64_c, 12), + make_tuple(5, 5, highbd_12_variance32x32_c, 
12), + make_tuple(5, 4, highbd_12_variance32x16_c, 12), + make_tuple(4, 5, highbd_12_variance16x32_c, 12), + make_tuple(4, 4, highbd_12_variance16x16_c, 12), + make_tuple(4, 3, highbd_12_variance16x8_c, 12), + make_tuple(3, 4, highbd_12_variance8x16_c, 12), + make_tuple(3, 3, highbd_12_variance8x8_c, 12), + make_tuple(3, 2, highbd_12_variance8x4_c, 12), + make_tuple(2, 3, highbd_12_variance4x8_c, 12), + make_tuple(2, 2, highbd_12_variance4x4_c, 12), + make_tuple(6, 6, highbd_10_variance64x64_c, 10), + make_tuple(6, 5, highbd_10_variance64x32_c, 10), + make_tuple(5, 6, highbd_10_variance32x64_c, 10), + make_tuple(5, 5, highbd_10_variance32x32_c, 10), + make_tuple(5, 4, highbd_10_variance32x16_c, 10), + make_tuple(4, 5, highbd_10_variance16x32_c, 10), + make_tuple(4, 4, highbd_10_variance16x16_c, 10), + make_tuple(4, 3, highbd_10_variance16x8_c, 10), + make_tuple(3, 4, highbd_10_variance8x16_c, 10), + make_tuple(3, 3, highbd_10_variance8x8_c, 10), + make_tuple(3, 2, highbd_10_variance8x4_c, 10), + make_tuple(2, 3, highbd_10_variance4x8_c, 10), + make_tuple(2, 2, highbd_10_variance4x4_c, 10), + make_tuple(6, 6, highbd_8_variance64x64_c, 8), + make_tuple(6, 5, highbd_8_variance64x32_c, 8), + make_tuple(5, 6, highbd_8_variance32x64_c, 8), + make_tuple(5, 5, highbd_8_variance32x32_c, 8), + make_tuple(5, 4, highbd_8_variance32x16_c, 8), + make_tuple(4, 5, highbd_8_variance16x32_c, 8), + make_tuple(4, 4, highbd_8_variance16x16_c, 8), + make_tuple(4, 3, highbd_8_variance16x8_c, 8), + make_tuple(3, 4, highbd_8_variance8x16_c, 8), + make_tuple(3, 3, highbd_8_variance8x8_c, 8), + make_tuple(3, 2, highbd_8_variance8x4_c, 8), + make_tuple(2, 3, highbd_8_variance4x8_c, 8), + make_tuple(2, 2, highbd_8_variance4x4_c, 8))); +#endif // CONFIG_VP9_HIGHBITDEPTH #if HAVE_MMX -const vp8_variance_fn_t variance4x4_mmx = vp8_variance4x4_mmx; -const vp8_variance_fn_t variance8x8_mmx = vp8_variance8x8_mmx; -const vp8_variance_fn_t variance8x16_mmx = vp8_variance8x16_mmx; -const vp8_variance_fn_t variance16x8_mmx = vp8_variance16x8_mmx; -const vp8_variance_fn_t variance16x16_mmx = vp8_variance16x16_mmx; +const VarianceMxNFunc mse16x16_mmx = vpx_mse16x16_mmx; +INSTANTIATE_TEST_CASE_P(MMX, VpxMseTest, + ::testing::Values(make_tuple(4, 4, mse16x16_mmx))); + +INSTANTIATE_TEST_CASE_P(MMX, SumOfSquaresTest, + ::testing::Values(vpx_get_mb_ss_mmx)); + +const VarianceMxNFunc variance16x16_mmx = vpx_variance16x16_mmx; +const VarianceMxNFunc variance16x8_mmx = vpx_variance16x8_mmx; +const VarianceMxNFunc variance8x16_mmx = vpx_variance8x16_mmx; +const VarianceMxNFunc variance8x8_mmx = vpx_variance8x8_mmx; +const VarianceMxNFunc variance4x4_mmx = vpx_variance4x4_mmx; INSTANTIATE_TEST_CASE_P( - MMX, VP8VarianceTest, - ::testing::Values(make_tuple(2, 2, variance4x4_mmx, 0), - make_tuple(3, 3, variance8x8_mmx, 0), - make_tuple(3, 4, variance8x16_mmx, 0), + MMX, VpxVarianceTest, + ::testing::Values(make_tuple(4, 4, variance16x16_mmx, 0), make_tuple(4, 3, variance16x8_mmx, 0), - make_tuple(4, 4, variance16x16_mmx, 0))); -#endif + make_tuple(3, 4, variance8x16_mmx, 0), + make_tuple(3, 3, variance8x8_mmx, 0), + make_tuple(2, 2, variance4x4_mmx, 0))); +#endif // HAVE_MMX #if HAVE_SSE2 -const vp8_variance_fn_t variance4x4_wmt = vp8_variance4x4_wmt; -const vp8_variance_fn_t variance8x8_wmt = vp8_variance8x8_wmt; -const vp8_variance_fn_t variance8x16_wmt = vp8_variance8x16_wmt; -const vp8_variance_fn_t variance16x8_wmt = vp8_variance16x8_wmt; -const vp8_variance_fn_t variance16x16_wmt = vp8_variance16x16_wmt; +INSTANTIATE_TEST_CASE_P(SSE2, 
SumOfSquaresTest, + ::testing::Values(vpx_get_mb_ss_sse2)); + +const VarianceMxNFunc mse16x16_sse2 = vpx_mse16x16_sse2; +const VarianceMxNFunc mse16x8_sse2 = vpx_mse16x8_sse2; +const VarianceMxNFunc mse8x16_sse2 = vpx_mse8x16_sse2; +const VarianceMxNFunc mse8x8_sse2 = vpx_mse8x8_sse2; +INSTANTIATE_TEST_CASE_P(SSE2, VpxMseTest, + ::testing::Values(make_tuple(4, 4, mse16x16_sse2), + make_tuple(4, 3, mse16x8_sse2), + make_tuple(3, 4, mse8x16_sse2), + make_tuple(3, 3, mse8x8_sse2))); + +const VarianceMxNFunc variance64x64_sse2 = vpx_variance64x64_sse2; +const VarianceMxNFunc variance64x32_sse2 = vpx_variance64x32_sse2; +const VarianceMxNFunc variance32x64_sse2 = vpx_variance32x64_sse2; +const VarianceMxNFunc variance32x32_sse2 = vpx_variance32x32_sse2; +const VarianceMxNFunc variance32x16_sse2 = vpx_variance32x16_sse2; +const VarianceMxNFunc variance16x32_sse2 = vpx_variance16x32_sse2; +const VarianceMxNFunc variance16x16_sse2 = vpx_variance16x16_sse2; +const VarianceMxNFunc variance16x8_sse2 = vpx_variance16x8_sse2; +const VarianceMxNFunc variance8x16_sse2 = vpx_variance8x16_sse2; +const VarianceMxNFunc variance8x8_sse2 = vpx_variance8x8_sse2; +const VarianceMxNFunc variance8x4_sse2 = vpx_variance8x4_sse2; +const VarianceMxNFunc variance4x8_sse2 = vpx_variance4x8_sse2; +const VarianceMxNFunc variance4x4_sse2 = vpx_variance4x4_sse2; INSTANTIATE_TEST_CASE_P( - SSE2, VP8VarianceTest, - ::testing::Values(make_tuple(2, 2, variance4x4_wmt, 0), - make_tuple(3, 3, variance8x8_wmt, 0), - make_tuple(3, 4, variance8x16_wmt, 0), - make_tuple(4, 3, variance16x8_wmt, 0), - make_tuple(4, 4, variance16x16_wmt, 0))); -#endif -#endif // CONFIG_VP8_ENCODER - -} // namespace vp8 + SSE2, VpxVarianceTest, + ::testing::Values(make_tuple(6, 6, variance64x64_sse2, 0), + make_tuple(6, 5, variance64x32_sse2, 0), + make_tuple(5, 6, variance32x64_sse2, 0), + make_tuple(5, 5, variance32x32_sse2, 0), + make_tuple(5, 4, variance32x16_sse2, 0), + make_tuple(4, 5, variance16x32_sse2, 0), + make_tuple(4, 4, variance16x16_sse2, 0), + make_tuple(4, 3, variance16x8_sse2, 0), + make_tuple(3, 4, variance8x16_sse2, 0), + make_tuple(3, 3, variance8x8_sse2, 0), + make_tuple(3, 2, variance8x4_sse2, 0), + make_tuple(2, 3, variance4x8_sse2, 0), + make_tuple(2, 2, variance4x4_sse2, 0))); +#if CONFIG_VP9_HIGHBITDEPTH +/* TODO(debargha): This test does not support the highbd version +const VarianceMxNFunc highbd_12_mse16x16_sse2 = vpx_highbd_12_mse16x16_sse2; +const VarianceMxNFunc highbd_12_mse16x8_sse2 = vpx_highbd_12_mse16x8_sse2; +const VarianceMxNFunc highbd_12_mse8x16_sse2 = vpx_highbd_12_mse8x16_sse2; +const VarianceMxNFunc highbd_12_mse8x8_sse2 = vpx_highbd_12_mse8x8_sse2; + +const VarianceMxNFunc highbd_10_mse16x16_sse2 = vpx_highbd_10_mse16x16_sse2; +const VarianceMxNFunc highbd_10_mse16x8_sse2 = vpx_highbd_10_mse16x8_sse2; +const VarianceMxNFunc highbd_10_mse8x16_sse2 = vpx_highbd_10_mse8x16_sse2; +const VarianceMxNFunc highbd_10_mse8x8_sse2 = vpx_highbd_10_mse8x8_sse2; + +const VarianceMxNFunc highbd_8_mse16x16_sse2 = vpx_highbd_8_mse16x16_sse2; +const VarianceMxNFunc highbd_8_mse16x8_sse2 = vpx_highbd_8_mse16x8_sse2; +const VarianceMxNFunc highbd_8_mse8x16_sse2 = vpx_highbd_8_mse8x16_sse2; +const VarianceMxNFunc highbd_8_mse8x8_sse2 = vpx_highbd_8_mse8x8_sse2; -// ----------------------------------------------------------------------------- -// VP9 test cases. 
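// Note on the parameter tuples used throughout these instantiations: they are
// (log2(width), log2(height), function[, bit_depth]). The tests derive the
// block size as 1 << log2 value, so make_tuple(4, 3, ...) exercises a 16x8
// block; where a fourth element is present, 0 selects the plain 8-bit path
// and 8/10/12 select the matching high-bit-depth variant.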
+INSTANTIATE_TEST_CASE_P( + SSE2, VpxHBDMseTest, ::testing::Values(make_tuple(4, 4, highbd_12_mse16x16_sse2), + make_tuple(4, 3, highbd_12_mse16x8_sse2), + make_tuple(3, 4, highbd_12_mse8x16_sse2), + make_tuple(3, 3, highbd_12_mse8x8_sse2), + make_tuple(4, 4, highbd_10_mse16x16_sse2), + make_tuple(4, 3, highbd_10_mse16x8_sse2), + make_tuple(3, 4, highbd_10_mse8x16_sse2), + make_tuple(3, 3, highbd_10_mse8x8_sse2), + make_tuple(4, 4, highbd_8_mse16x16_sse2), + make_tuple(4, 3, highbd_8_mse16x8_sse2), + make_tuple(3, 4, highbd_8_mse8x16_sse2), + make_tuple(3, 3, highbd_8_mse8x8_sse2))); +*/ + +const VarianceMxNFunc highbd_12_variance64x64_sse2 = + vpx_highbd_12_variance64x64_sse2; +const VarianceMxNFunc highbd_12_variance64x32_sse2 = + vpx_highbd_12_variance64x32_sse2; +const VarianceMxNFunc highbd_12_variance32x64_sse2 = + vpx_highbd_12_variance32x64_sse2; +const VarianceMxNFunc highbd_12_variance32x32_sse2 = + vpx_highbd_12_variance32x32_sse2; +const VarianceMxNFunc highbd_12_variance32x16_sse2 = + vpx_highbd_12_variance32x16_sse2; +const VarianceMxNFunc highbd_12_variance16x32_sse2 = + vpx_highbd_12_variance16x32_sse2; +const VarianceMxNFunc highbd_12_variance16x16_sse2 = + vpx_highbd_12_variance16x16_sse2; +const VarianceMxNFunc highbd_12_variance16x8_sse2 = + vpx_highbd_12_variance16x8_sse2; +const VarianceMxNFunc highbd_12_variance8x16_sse2 = + vpx_highbd_12_variance8x16_sse2; +const VarianceMxNFunc highbd_12_variance8x8_sse2 = + vpx_highbd_12_variance8x8_sse2; +const VarianceMxNFunc highbd_10_variance64x64_sse2 = + vpx_highbd_10_variance64x64_sse2; +const VarianceMxNFunc highbd_10_variance64x32_sse2 = + vpx_highbd_10_variance64x32_sse2; +const VarianceMxNFunc highbd_10_variance32x64_sse2 = + vpx_highbd_10_variance32x64_sse2; +const VarianceMxNFunc highbd_10_variance32x32_sse2 = + vpx_highbd_10_variance32x32_sse2; +const VarianceMxNFunc highbd_10_variance32x16_sse2 = + vpx_highbd_10_variance32x16_sse2; +const VarianceMxNFunc highbd_10_variance16x32_sse2 = + vpx_highbd_10_variance16x32_sse2; +const VarianceMxNFunc highbd_10_variance16x16_sse2 = + vpx_highbd_10_variance16x16_sse2; +const VarianceMxNFunc highbd_10_variance16x8_sse2 = + vpx_highbd_10_variance16x8_sse2; +const VarianceMxNFunc highbd_10_variance8x16_sse2 = + vpx_highbd_10_variance8x16_sse2; +const VarianceMxNFunc highbd_10_variance8x8_sse2 = + vpx_highbd_10_variance8x8_sse2; +const VarianceMxNFunc highbd_8_variance64x64_sse2 = + vpx_highbd_8_variance64x64_sse2; +const VarianceMxNFunc highbd_8_variance64x32_sse2 = + vpx_highbd_8_variance64x32_sse2; +const VarianceMxNFunc highbd_8_variance32x64_sse2 = + vpx_highbd_8_variance32x64_sse2; +const VarianceMxNFunc highbd_8_variance32x32_sse2 = + vpx_highbd_8_variance32x32_sse2; +const VarianceMxNFunc highbd_8_variance32x16_sse2 = + vpx_highbd_8_variance32x16_sse2; +const VarianceMxNFunc highbd_8_variance16x32_sse2 = + vpx_highbd_8_variance16x32_sse2; +const VarianceMxNFunc highbd_8_variance16x16_sse2 = + vpx_highbd_8_variance16x16_sse2; +const VarianceMxNFunc highbd_8_variance16x8_sse2 = + vpx_highbd_8_variance16x8_sse2; +const VarianceMxNFunc highbd_8_variance8x16_sse2 = + vpx_highbd_8_variance8x16_sse2; +const VarianceMxNFunc highbd_8_variance8x8_sse2 = + vpx_highbd_8_variance8x8_sse2; -namespace vp9 { +INSTANTIATE_TEST_CASE_P( + SSE2, VpxHBDVarianceTest, + ::testing::Values(make_tuple(6, 6, highbd_12_variance64x64_sse2, 12), + make_tuple(6, 5, highbd_12_variance64x32_sse2, 12), + make_tuple(5, 6, highbd_12_variance32x64_sse2, 12), + make_tuple(5, 5, highbd_12_variance32x32_sse2, 
12), + make_tuple(5, 4, highbd_12_variance32x16_sse2, 12), + make_tuple(4, 5, highbd_12_variance16x32_sse2, 12), + make_tuple(4, 4, highbd_12_variance16x16_sse2, 12), + make_tuple(4, 3, highbd_12_variance16x8_sse2, 12), + make_tuple(3, 4, highbd_12_variance8x16_sse2, 12), + make_tuple(3, 3, highbd_12_variance8x8_sse2, 12), + make_tuple(6, 6, highbd_10_variance64x64_sse2, 10), + make_tuple(6, 5, highbd_10_variance64x32_sse2, 10), + make_tuple(5, 6, highbd_10_variance32x64_sse2, 10), + make_tuple(5, 5, highbd_10_variance32x32_sse2, 10), + make_tuple(5, 4, highbd_10_variance32x16_sse2, 10), + make_tuple(4, 5, highbd_10_variance16x32_sse2, 10), + make_tuple(4, 4, highbd_10_variance16x16_sse2, 10), + make_tuple(4, 3, highbd_10_variance16x8_sse2, 10), + make_tuple(3, 4, highbd_10_variance8x16_sse2, 10), + make_tuple(3, 3, highbd_10_variance8x8_sse2, 10), + make_tuple(6, 6, highbd_8_variance64x64_sse2, 8), + make_tuple(6, 5, highbd_8_variance64x32_sse2, 8), + make_tuple(5, 6, highbd_8_variance32x64_sse2, 8), + make_tuple(5, 5, highbd_8_variance32x32_sse2, 8), + make_tuple(5, 4, highbd_8_variance32x16_sse2, 8), + make_tuple(4, 5, highbd_8_variance16x32_sse2, 8), + make_tuple(4, 4, highbd_8_variance16x16_sse2, 8), + make_tuple(4, 3, highbd_8_variance16x8_sse2, 8), + make_tuple(3, 4, highbd_8_variance8x16_sse2, 8), + make_tuple(3, 3, highbd_8_variance8x8_sse2, 8))); +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_SSE2 #if CONFIG_VP9_ENCODER -TEST_P(SumOfSquaresTest, Const) { ConstTest(); } -TEST_P(SumOfSquaresTest, Ref) { RefTest(); } - -INSTANTIATE_TEST_CASE_P(C, SumOfSquaresTest, - ::testing::Values(vp9_get_mb_ss_c)); - -typedef VarianceTest VP9VarianceTest; typedef SubpelVarianceTest VP9SubpelVarianceTest; typedef SubpelVarianceTest VP9SubpelAvgVarianceTest; -TEST_P(VP9VarianceTest, Zero) { ZeroTest(); } -TEST_P(VP9VarianceTest, Ref) { RefTest(); } -TEST_P(VP9VarianceTest, RefStride) { RefStrideTest(); } TEST_P(VP9SubpelVarianceTest, Ref) { RefTest(); } TEST_P(VP9SubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); } TEST_P(VP9SubpelAvgVarianceTest, Ref) { RefTest(); } -TEST_P(VP9VarianceTest, OneQuarter) { OneQuarterTest(); } #if CONFIG_VP9_HIGHBITDEPTH -typedef VarianceTest VP9VarianceHighTest; typedef SubpelVarianceTest VP9SubpelVarianceHighTest; typedef SubpelVarianceTest VP9SubpelAvgVarianceHighTest; -TEST_P(VP9VarianceHighTest, Zero) { ZeroTest(); } -TEST_P(VP9VarianceHighTest, Ref) { RefTest(); } -TEST_P(VP9VarianceHighTest, RefStride) { RefStrideTest(); } TEST_P(VP9SubpelVarianceHighTest, Ref) { RefTest(); } TEST_P(VP9SubpelVarianceHighTest, ExtremeRef) { ExtremeRefTest(); } TEST_P(VP9SubpelAvgVarianceHighTest, Ref) { RefTest(); } -TEST_P(VP9VarianceHighTest, OneQuarter) { OneQuarterTest(); } #endif // CONFIG_VP9_HIGHBITDEPTH -const vp9_variance_fn_t variance4x4_c = vp9_variance4x4_c; -const vp9_variance_fn_t variance4x8_c = vp9_variance4x8_c; -const vp9_variance_fn_t variance8x4_c = vp9_variance8x4_c; -const vp9_variance_fn_t variance8x8_c = vp9_variance8x8_c; -const vp9_variance_fn_t variance8x16_c = vp9_variance8x16_c; -const vp9_variance_fn_t variance16x8_c = vp9_variance16x8_c; -const vp9_variance_fn_t variance16x16_c = vp9_variance16x16_c; -const vp9_variance_fn_t variance16x32_c = vp9_variance16x32_c; -const vp9_variance_fn_t variance32x16_c = vp9_variance32x16_c; -const vp9_variance_fn_t variance32x32_c = vp9_variance32x32_c; -const vp9_variance_fn_t variance32x64_c = vp9_variance32x64_c; -const vp9_variance_fn_t variance64x32_c = vp9_variance64x32_c; -const vp9_variance_fn_t 
variance64x64_c = vp9_variance64x64_c; -INSTANTIATE_TEST_CASE_P( - C, VP9VarianceTest, - ::testing::Values(make_tuple(2, 2, variance4x4_c, 0), - make_tuple(2, 3, variance4x8_c, 0), - make_tuple(3, 2, variance8x4_c, 0), - make_tuple(3, 3, variance8x8_c, 0), - make_tuple(3, 4, variance8x16_c, 0), - make_tuple(4, 3, variance16x8_c, 0), - make_tuple(4, 4, variance16x16_c, 0), - make_tuple(4, 5, variance16x32_c, 0), - make_tuple(5, 4, variance32x16_c, 0), - make_tuple(5, 5, variance32x32_c, 0), - make_tuple(5, 6, variance32x64_c, 0), - make_tuple(6, 5, variance64x32_c, 0), - make_tuple(6, 6, variance64x64_c, 0))); -#if CONFIG_VP9_HIGHBITDEPTH -const vp9_variance_fn_t highbd_10_variance4x4_c = vp9_highbd_10_variance4x4_c; -const vp9_variance_fn_t highbd_10_variance4x8_c = vp9_highbd_10_variance4x8_c; -const vp9_variance_fn_t highbd_10_variance8x4_c = vp9_highbd_10_variance8x4_c; -const vp9_variance_fn_t highbd_10_variance8x8_c = vp9_highbd_10_variance8x8_c; -const vp9_variance_fn_t highbd_10_variance8x16_c = vp9_highbd_10_variance8x16_c; -const vp9_variance_fn_t highbd_10_variance16x8_c = vp9_highbd_10_variance16x8_c; -const vp9_variance_fn_t highbd_10_variance16x16_c = - vp9_highbd_10_variance16x16_c; -const vp9_variance_fn_t highbd_10_variance16x32_c = - vp9_highbd_10_variance16x32_c; -const vp9_variance_fn_t highbd_10_variance32x16_c = - vp9_highbd_10_variance32x16_c; -const vp9_variance_fn_t highbd_10_variance32x32_c = - vp9_highbd_10_variance32x32_c; -const vp9_variance_fn_t highbd_10_variance32x64_c = - vp9_highbd_10_variance32x64_c; -const vp9_variance_fn_t highbd_10_variance64x32_c = - vp9_highbd_10_variance64x32_c; -const vp9_variance_fn_t highbd_10_variance64x64_c = - vp9_highbd_10_variance64x64_c; -const vp9_variance_fn_t highbd_12_variance4x4_c = vp9_highbd_12_variance4x4_c; -const vp9_variance_fn_t highbd_12_variance4x8_c = vp9_highbd_12_variance4x8_c; -const vp9_variance_fn_t highbd_12_variance8x4_c = vp9_highbd_12_variance8x4_c; -const vp9_variance_fn_t highbd_12_variance8x8_c = vp9_highbd_12_variance8x8_c; -const vp9_variance_fn_t highbd_12_variance8x16_c = vp9_highbd_12_variance8x16_c; -const vp9_variance_fn_t highbd_12_variance16x8_c = vp9_highbd_12_variance16x8_c; -const vp9_variance_fn_t highbd_12_variance16x16_c = - vp9_highbd_12_variance16x16_c; -const vp9_variance_fn_t highbd_12_variance16x32_c = - vp9_highbd_12_variance16x32_c; -const vp9_variance_fn_t highbd_12_variance32x16_c = - vp9_highbd_12_variance32x16_c; -const vp9_variance_fn_t highbd_12_variance32x32_c = - vp9_highbd_12_variance32x32_c; -const vp9_variance_fn_t highbd_12_variance32x64_c = - vp9_highbd_12_variance32x64_c; -const vp9_variance_fn_t highbd_12_variance64x32_c = - vp9_highbd_12_variance64x32_c; -const vp9_variance_fn_t highbd_12_variance64x64_c = - vp9_highbd_12_variance64x64_c; -const vp9_variance_fn_t highbd_variance4x4_c = vp9_highbd_variance4x4_c; -const vp9_variance_fn_t highbd_variance4x8_c = vp9_highbd_variance4x8_c; -const vp9_variance_fn_t highbd_variance8x4_c = vp9_highbd_variance8x4_c; -const vp9_variance_fn_t highbd_variance8x8_c = vp9_highbd_variance8x8_c; -const vp9_variance_fn_t highbd_variance8x16_c = vp9_highbd_variance8x16_c; -const vp9_variance_fn_t highbd_variance16x8_c = vp9_highbd_variance16x8_c; -const vp9_variance_fn_t highbd_variance16x16_c = vp9_highbd_variance16x16_c; -const vp9_variance_fn_t highbd_variance16x32_c = vp9_highbd_variance16x32_c; -const vp9_variance_fn_t highbd_variance32x16_c = vp9_highbd_variance32x16_c; -const vp9_variance_fn_t highbd_variance32x32_c = 
vp9_highbd_variance32x32_c; -const vp9_variance_fn_t highbd_variance32x64_c = vp9_highbd_variance32x64_c; -const vp9_variance_fn_t highbd_variance64x32_c = vp9_highbd_variance64x32_c; -const vp9_variance_fn_t highbd_variance64x64_c = vp9_highbd_variance64x64_c; -INSTANTIATE_TEST_CASE_P( - C, VP9VarianceHighTest, - ::testing::Values(make_tuple(2, 2, highbd_10_variance4x4_c, 10), - make_tuple(2, 3, highbd_10_variance4x8_c, 10), - make_tuple(3, 2, highbd_10_variance8x4_c, 10), - make_tuple(3, 3, highbd_10_variance8x8_c, 10), - make_tuple(3, 4, highbd_10_variance8x16_c, 10), - make_tuple(4, 3, highbd_10_variance16x8_c, 10), - make_tuple(4, 4, highbd_10_variance16x16_c, 10), - make_tuple(4, 5, highbd_10_variance16x32_c, 10), - make_tuple(5, 4, highbd_10_variance32x16_c, 10), - make_tuple(5, 5, highbd_10_variance32x32_c, 10), - make_tuple(5, 6, highbd_10_variance32x64_c, 10), - make_tuple(6, 5, highbd_10_variance64x32_c, 10), - make_tuple(6, 6, highbd_10_variance64x64_c, 10), - make_tuple(2, 2, highbd_12_variance4x4_c, 12), - make_tuple(2, 3, highbd_12_variance4x8_c, 12), - make_tuple(3, 2, highbd_12_variance8x4_c, 12), - make_tuple(3, 3, highbd_12_variance8x8_c, 12), - make_tuple(3, 4, highbd_12_variance8x16_c, 12), - make_tuple(4, 3, highbd_12_variance16x8_c, 12), - make_tuple(4, 4, highbd_12_variance16x16_c, 12), - make_tuple(4, 5, highbd_12_variance16x32_c, 12), - make_tuple(5, 4, highbd_12_variance32x16_c, 12), - make_tuple(5, 5, highbd_12_variance32x32_c, 12), - make_tuple(5, 6, highbd_12_variance32x64_c, 12), - make_tuple(6, 5, highbd_12_variance64x32_c, 12), - make_tuple(6, 6, highbd_12_variance64x64_c, 12), - make_tuple(2, 2, highbd_variance4x4_c, 8), - make_tuple(2, 3, highbd_variance4x8_c, 8), - make_tuple(3, 2, highbd_variance8x4_c, 8), - make_tuple(3, 3, highbd_variance8x8_c, 8), - make_tuple(3, 4, highbd_variance8x16_c, 8), - make_tuple(4, 3, highbd_variance16x8_c, 8), - make_tuple(4, 4, highbd_variance16x16_c, 8), - make_tuple(4, 5, highbd_variance16x32_c, 8), - make_tuple(5, 4, highbd_variance32x16_c, 8), - make_tuple(5, 5, highbd_variance32x32_c, 8), - make_tuple(5, 6, highbd_variance32x64_c, 8), - make_tuple(6, 5, highbd_variance64x32_c, 8), - make_tuple(6, 6, highbd_variance64x64_c, 8))); -#endif // CONFIG_VP9_HIGHBITDEPTH const vp9_subpixvariance_fn_t subpel_variance4x4_c = vp9_sub_pixel_variance4x4_c; const vp9_subpixvariance_fn_t subpel_variance4x8_c = @@ -1377,40 +1413,11 @@ INSTANTIATE_TEST_CASE_P( make_tuple(6, 5, highbd_subpel_avg_variance64x32_c, 8), make_tuple(6, 6, highbd_subpel_avg_variance64x64_c, 8))); #endif // CONFIG_VP9_HIGHBITDEPTH +#endif // CONFIG_VP9_ENCODER +#if CONFIG_VP9_ENCODER #if HAVE_SSE2 #if CONFIG_USE_X86INC -INSTANTIATE_TEST_CASE_P(SSE2, SumOfSquaresTest, - ::testing::Values(vp9_get_mb_ss_sse2)); - -const vp9_variance_fn_t variance4x4_sse2 = vp9_variance4x4_sse2; -const vp9_variance_fn_t variance4x8_sse2 = vp9_variance4x8_sse2; -const vp9_variance_fn_t variance8x4_sse2 = vp9_variance8x4_sse2; -const vp9_variance_fn_t variance8x8_sse2 = vp9_variance8x8_sse2; -const vp9_variance_fn_t variance8x16_sse2 = vp9_variance8x16_sse2; -const vp9_variance_fn_t variance16x8_sse2 = vp9_variance16x8_sse2; -const vp9_variance_fn_t variance16x16_sse2 = vp9_variance16x16_sse2; -const vp9_variance_fn_t variance16x32_sse2 = vp9_variance16x32_sse2; -const vp9_variance_fn_t variance32x16_sse2 = vp9_variance32x16_sse2; -const vp9_variance_fn_t variance32x32_sse2 = vp9_variance32x32_sse2; -const vp9_variance_fn_t variance32x64_sse2 = vp9_variance32x64_sse2; -const 
vp9_variance_fn_t variance64x32_sse2 = vp9_variance64x32_sse2; -const vp9_variance_fn_t variance64x64_sse2 = vp9_variance64x64_sse2; -INSTANTIATE_TEST_CASE_P( - SSE2, VP9VarianceTest, - ::testing::Values(make_tuple(2, 2, variance4x4_sse2, 0), - make_tuple(2, 3, variance4x8_sse2, 0), - make_tuple(3, 2, variance8x4_sse2, 0), - make_tuple(3, 3, variance8x8_sse2, 0), - make_tuple(3, 4, variance8x16_sse2, 0), - make_tuple(4, 3, variance16x8_sse2, 0), - make_tuple(4, 4, variance16x16_sse2, 0), - make_tuple(4, 5, variance16x32_sse2, 0), - make_tuple(5, 4, variance32x16_sse2, 0), - make_tuple(5, 5, variance32x32_sse2, 0), - make_tuple(5, 6, variance32x64_sse2, 0), - make_tuple(6, 5, variance64x32_sse2, 0), - make_tuple(6, 6, variance64x64_sse2, 0))); const vp9_subpixvariance_fn_t subpel_variance4x4_sse = vp9_sub_pixel_variance4x4_sse; const vp9_subpixvariance_fn_t subpel_variance4x8_sse = @@ -1494,96 +1501,6 @@ INSTANTIATE_TEST_CASE_P( make_tuple(6, 5, subpel_avg_variance64x32_sse2, 0), make_tuple(6, 6, subpel_avg_variance64x64_sse2, 0))); #if CONFIG_VP9_HIGHBITDEPTH -const vp9_variance_fn_t highbd_variance8x8_sse2 = vp9_highbd_variance8x8_sse2; -const vp9_variance_fn_t highbd_10_variance8x8_sse2 = - vp9_highbd_10_variance8x8_sse2; -const vp9_variance_fn_t highbd_12_variance8x8_sse2 = - vp9_highbd_12_variance8x8_sse2; -const vp9_variance_fn_t highbd_variance8x16_sse2 = vp9_highbd_variance8x16_sse2; -const vp9_variance_fn_t highbd_10_variance8x16_sse2 = - vp9_highbd_10_variance8x16_sse2; -const vp9_variance_fn_t highbd_12_variance8x16_sse2 = - vp9_highbd_12_variance8x16_sse2; -const vp9_variance_fn_t highbd_variance16x8_sse2 = - vp9_highbd_variance16x8_sse2; -const vp9_variance_fn_t highbd_10_variance16x8_sse2 = - vp9_highbd_10_variance16x8_sse2; -const vp9_variance_fn_t highbd_12_variance16x8_sse2 = - vp9_highbd_12_variance16x8_sse2; -const vp9_variance_fn_t highbd_variance16x16_sse2 = - vp9_highbd_variance16x16_sse2; -const vp9_variance_fn_t highbd_10_variance16x16_sse2 = - vp9_highbd_10_variance16x16_sse2; -const vp9_variance_fn_t highbd_12_variance16x16_sse2 = - vp9_highbd_12_variance16x16_sse2; -const vp9_variance_fn_t highbd_variance16x32_sse2 = - vp9_highbd_variance16x32_sse2; -const vp9_variance_fn_t highbd_10_variance16x32_sse2 = - vp9_highbd_10_variance16x32_sse2; -const vp9_variance_fn_t highbd_12_variance16x32_sse2 = - vp9_highbd_12_variance16x32_sse2; -const vp9_variance_fn_t highbd_variance32x16_sse2 = - vp9_highbd_variance32x16_sse2; -const vp9_variance_fn_t highbd_10_variance32x16_sse2 = - vp9_highbd_10_variance32x16_sse2; -const vp9_variance_fn_t highbd_12_variance32x16_sse2 = - vp9_highbd_12_variance32x16_sse2; -const vp9_variance_fn_t highbd_variance32x32_sse2 = - vp9_highbd_variance32x32_sse2; -const vp9_variance_fn_t highbd_10_variance32x32_sse2 = - vp9_highbd_10_variance32x32_sse2; -const vp9_variance_fn_t highbd_12_variance32x32_sse2 = - vp9_highbd_12_variance32x32_sse2; -const vp9_variance_fn_t highbd_variance32x64_sse2 = - vp9_highbd_variance32x64_sse2; -const vp9_variance_fn_t highbd_10_variance32x64_sse2 = - vp9_highbd_10_variance32x64_sse2; -const vp9_variance_fn_t highbd_12_variance32x64_sse2 = - vp9_highbd_12_variance32x64_sse2; -const vp9_variance_fn_t highbd_variance64x32_sse2 = - vp9_highbd_variance64x32_sse2; -const vp9_variance_fn_t highbd_10_variance64x32_sse2 = - vp9_highbd_10_variance64x32_sse2; -const vp9_variance_fn_t highbd_12_variance64x32_sse2 = - vp9_highbd_12_variance64x32_sse2; -const vp9_variance_fn_t highbd_variance64x64_sse2 = - 
vp9_highbd_variance64x64_sse2; -const vp9_variance_fn_t highbd_10_variance64x64_sse2 = - vp9_highbd_10_variance64x64_sse2; -const vp9_variance_fn_t highbd_12_variance64x64_sse2 = - vp9_highbd_12_variance64x64_sse2; -INSTANTIATE_TEST_CASE_P( - SSE2, VP9VarianceHighTest, - ::testing::Values(make_tuple(3, 3, highbd_10_variance8x8_sse2, 10), - make_tuple(3, 4, highbd_10_variance8x16_sse2, 10), - make_tuple(4, 3, highbd_10_variance16x8_sse2, 10), - make_tuple(4, 4, highbd_10_variance16x16_sse2, 10), - make_tuple(4, 5, highbd_10_variance16x32_sse2, 10), - make_tuple(5, 4, highbd_10_variance32x16_sse2, 10), - make_tuple(5, 5, highbd_10_variance32x32_sse2, 10), - make_tuple(5, 6, highbd_10_variance32x64_sse2, 10), - make_tuple(6, 5, highbd_10_variance64x32_sse2, 10), - make_tuple(6, 6, highbd_10_variance64x64_sse2, 10), - make_tuple(3, 3, highbd_12_variance8x8_sse2, 12), - make_tuple(3, 4, highbd_12_variance8x16_sse2, 12), - make_tuple(4, 3, highbd_12_variance16x8_sse2, 12), - make_tuple(4, 4, highbd_12_variance16x16_sse2, 12), - make_tuple(4, 5, highbd_12_variance16x32_sse2, 12), - make_tuple(5, 4, highbd_12_variance32x16_sse2, 12), - make_tuple(5, 5, highbd_12_variance32x32_sse2, 12), - make_tuple(5, 6, highbd_12_variance32x64_sse2, 12), - make_tuple(6, 5, highbd_12_variance64x32_sse2, 12), - make_tuple(6, 6, highbd_12_variance64x64_sse2, 12), - make_tuple(3, 3, highbd_variance8x8_sse2, 8), - make_tuple(3, 4, highbd_variance8x16_sse2, 8), - make_tuple(4, 3, highbd_variance16x8_sse2, 8), - make_tuple(4, 4, highbd_variance16x16_sse2, 8), - make_tuple(4, 5, highbd_variance16x32_sse2, 8), - make_tuple(5, 4, highbd_variance32x16_sse2, 8), - make_tuple(5, 5, highbd_variance32x32_sse2, 8), - make_tuple(5, 6, highbd_variance32x64_sse2, 8), - make_tuple(6, 5, highbd_variance64x32_sse2, 8), - make_tuple(6, 6, highbd_variance64x64_sse2, 8))); const vp9_subpixvariance_fn_t highbd_subpel_variance8x4_sse2 = vp9_highbd_sub_pixel_variance8x4_sse2; const vp9_subpixvariance_fn_t highbd_subpel_variance8x8_sse2 = @@ -1790,6 +1707,9 @@ INSTANTIATE_TEST_CASE_P( #endif // CONFIG_VP9_HIGHBITDEPTH #endif // CONFIG_USE_X86INC #endif // HAVE_SSE2 +#endif // CONFIG_VP9_ENCODER + +#if CONFIG_VP9_ENCODER #if HAVE_SSSE3 #if CONFIG_USE_X86INC @@ -1877,22 +1797,27 @@ INSTANTIATE_TEST_CASE_P( make_tuple(6, 6, subpel_avg_variance64x64_ssse3, 0))); #endif // CONFIG_USE_X86INC #endif // HAVE_SSSE3 +#endif // CONFIG_VP9_ENCODER #if HAVE_AVX2 - -const vp9_variance_fn_t variance16x16_avx2 = vp9_variance16x16_avx2; -const vp9_variance_fn_t variance32x16_avx2 = vp9_variance32x16_avx2; -const vp9_variance_fn_t variance32x32_avx2 = vp9_variance32x32_avx2; -const vp9_variance_fn_t variance64x32_avx2 = vp9_variance64x32_avx2; -const vp9_variance_fn_t variance64x64_avx2 = vp9_variance64x64_avx2; +const VarianceMxNFunc mse16x16_avx2 = vpx_mse16x16_avx2; +INSTANTIATE_TEST_CASE_P(AVX2, VpxMseTest, + ::testing::Values(make_tuple(4, 4, mse16x16_avx2))); + +const VarianceMxNFunc variance64x64_avx2 = vpx_variance64x64_avx2; +const VarianceMxNFunc variance64x32_avx2 = vpx_variance64x32_avx2; +const VarianceMxNFunc variance32x32_avx2 = vpx_variance32x32_avx2; +const VarianceMxNFunc variance32x16_avx2 = vpx_variance32x16_avx2; +const VarianceMxNFunc variance16x16_avx2 = vpx_variance16x16_avx2; INSTANTIATE_TEST_CASE_P( - AVX2, VP9VarianceTest, - ::testing::Values(make_tuple(4, 4, variance16x16_avx2, 0), - make_tuple(5, 4, variance32x16_avx2, 0), - make_tuple(5, 5, variance32x32_avx2, 0), + AVX2, VpxVarianceTest, + ::testing::Values(make_tuple(6, 6, 
variance64x64_avx2, 0), make_tuple(6, 5, variance64x32_avx2, 0), - make_tuple(6, 6, variance64x64_avx2, 0))); + make_tuple(5, 5, variance32x32_avx2, 0), + make_tuple(5, 4, variance32x16_avx2, 0), + make_tuple(4, 4, variance16x16_avx2, 0))); +#if CONFIG_VP9_ENCODER const vp9_subpixvariance_fn_t subpel_variance32x32_avx2 = vp9_sub_pixel_variance32x32_avx2; const vp9_subpixvariance_fn_t subpel_variance64x64_avx2 = @@ -1910,23 +1835,38 @@ INSTANTIATE_TEST_CASE_P( AVX2, VP9SubpelAvgVarianceTest, ::testing::Values(make_tuple(5, 5, subpel_avg_variance32x32_avx2, 0), make_tuple(6, 6, subpel_avg_variance64x64_avx2, 0))); +#endif // CONFIG_VP9_ENCODER #endif // HAVE_AVX2 + #if HAVE_NEON -const vp9_variance_fn_t variance8x8_neon = vp9_variance8x8_neon; -const vp9_variance_fn_t variance16x16_neon = vp9_variance16x16_neon; -const vp9_variance_fn_t variance32x32_neon = vp9_variance32x32_neon; -const vp9_variance_fn_t variance32x64_neon = vp9_variance32x64_neon; -const vp9_variance_fn_t variance64x32_neon = vp9_variance64x32_neon; -const vp9_variance_fn_t variance64x64_neon = vp9_variance64x64_neon; +const Get4x4SSEFunc get4x4sse_cs_neon = vpx_get4x4sse_cs_neon; +INSTANTIATE_TEST_CASE_P(NEON, VpxSseTest, + ::testing::Values(make_tuple(2, 2, get4x4sse_cs_neon))); + +const VarianceMxNFunc mse16x16_neon = vpx_mse16x16_neon; +INSTANTIATE_TEST_CASE_P(NEON, VpxMseTest, + ::testing::Values(make_tuple(4, 4, mse16x16_neon))); + +const VarianceMxNFunc variance64x64_neon = vpx_variance64x64_neon; +const VarianceMxNFunc variance64x32_neon = vpx_variance64x32_neon; +const VarianceMxNFunc variance32x64_neon = vpx_variance32x64_neon; +const VarianceMxNFunc variance32x32_neon = vpx_variance32x32_neon; +const VarianceMxNFunc variance16x16_neon = vpx_variance16x16_neon; +const VarianceMxNFunc variance16x8_neon = vpx_variance16x8_neon; +const VarianceMxNFunc variance8x16_neon = vpx_variance8x16_neon; +const VarianceMxNFunc variance8x8_neon = vpx_variance8x8_neon; INSTANTIATE_TEST_CASE_P( - NEON, VP9VarianceTest, - ::testing::Values(make_tuple(3, 3, variance8x8_neon, 0), - make_tuple(4, 4, variance16x16_neon, 0), - make_tuple(5, 5, variance32x32_neon, 0), - make_tuple(5, 6, variance32x64_neon, 0), + NEON, VpxVarianceTest, + ::testing::Values(make_tuple(6, 6, variance64x64_neon, 0), make_tuple(6, 5, variance64x32_neon, 0), - make_tuple(6, 6, variance64x64_neon, 0))); + make_tuple(5, 6, variance32x64_neon, 0), + make_tuple(5, 5, variance32x32_neon, 0), + make_tuple(4, 4, variance16x16_neon, 0), + make_tuple(4, 4, variance16x8_neon, 0), + make_tuple(4, 4, variance8x16_neon, 0), + make_tuple(3, 3, variance8x8_neon, 0))); +#if CONFIG_VP9_ENCODER const vp9_subpixvariance_fn_t subpel_variance8x8_neon = vp9_sub_pixel_variance8x8_neon; const vp9_subpixvariance_fn_t subpel_variance16x16_neon = @@ -1941,8 +1881,19 @@ INSTANTIATE_TEST_CASE_P( make_tuple(4, 4, subpel_variance16x16_neon, 0), make_tuple(5, 5, subpel_variance32x32_neon, 0), make_tuple(6, 6, subpel_variance64x64_neon, 0))); -#endif // HAVE_NEON #endif // CONFIG_VP9_ENCODER +#endif // HAVE_NEON -} // namespace vp9 +#if HAVE_MEDIA +const VarianceMxNFunc mse16x16_media = vpx_mse16x16_media; +INSTANTIATE_TEST_CASE_P(MEDIA, VpxMseTest, + ::testing::Values(make_tuple(4, 4, mse16x16_media))); + +const VarianceMxNFunc variance16x16_media = vpx_variance16x16_media; +const VarianceMxNFunc variance8x8_media = vpx_variance8x8_media; +INSTANTIATE_TEST_CASE_P( + MEDIA, VpxVarianceTest, + ::testing::Values(make_tuple(4, 4, variance16x16_media, 0), + make_tuple(3, 3, variance8x8_media, 
0))); +#endif // HAVE_MEDIA } // namespace diff --git a/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm b/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm deleted file mode 100644 index 3991957..0000000 --- a/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm +++ /dev/null @@ -1,154 +0,0 @@ -; -; Copyright (c) 2011 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_variance16x16_armv6| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *src_ptr -; r1 int source_stride -; r2 unsigned char *ref_ptr -; r3 int recon_stride -; stack unsigned int *sse -|vp8_variance16x16_armv6| PROC - - stmfd sp!, {r4-r12, lr} - - pld [r0, r1, lsl #0] - pld [r2, r3, lsl #0] - - mov r8, #0 ; initialize sum = 0 - mov r11, #0 ; initialize sse = 0 - mov r12, #16 ; set loop counter to 16 (=block height) - -loop - ; 1st 4 pixels - ldr r4, [r0, #0] ; load 4 src pixels - ldr r5, [r2, #0] ; load 4 ref pixels - - mov lr, #0 ; constant zero - - usub8 r6, r4, r5 ; calculate difference - pld [r0, r1, lsl #1] - sel r7, r6, lr ; select bytes with positive difference - usub8 r9, r5, r4 ; calculate difference with reversed operands - pld [r2, r3, lsl #1] - sel r6, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - ; calculate total sum - adds r8, r8, r4 ; add positive differences to sum - subs r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r10, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 2nd 4 pixels - ldr r4, [r0, #4] ; load 4 src pixels - ldr r5, [r2, #4] ; load 4 ref pixels - smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) - - usub8 r6, r4, r5 ; calculate difference - sel r7, r6, lr ; select bytes with positive difference - usub8 r9, r5, r4 ; calculate difference with reversed operands - sel r6, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r10, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 3rd 4 pixels - ldr r4, [r0, #8] ; load 4 src pixels - ldr r5, [r2, #8] ; load 4 ref pixels - smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) - - usub8 r6, r4, r5 ; calculate difference - sel r7, r6, lr ; select bytes with positive difference - usub8 r9, r5, r4 ; calculate difference with reversed operands - sel r6, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of 
negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r10, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 4th 4 pixels - ldr r4, [r0, #12] ; load 4 src pixels - ldr r5, [r2, #12] ; load 4 ref pixels - smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) - - usub8 r6, r4, r5 ; calculate difference - add r0, r0, r1 ; set src_ptr to next row - sel r7, r6, lr ; select bytes with positive difference - usub8 r9, r5, r4 ; calculate difference with reversed operands - add r2, r2, r3 ; set dst_ptr to next row - sel r6, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r10, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) - - - subs r12, r12, #1 - - bne loop - - ; return stuff - ldr r6, [sp, #40] ; get address of sse - mul r0, r8, r8 ; sum * sum - str r11, [r6] ; store sse - sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8)) - - ldmfd sp!, {r4-r12, pc} - - ENDP - - END - diff --git a/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm b/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm deleted file mode 100644 index 915ee49..0000000 --- a/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm +++ /dev/null @@ -1,101 +0,0 @@ -; -; Copyright (c) 2011 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
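
For reference, the ARMv6 16x16 kernel deleted above splits each group of four byte differences into positive and negative parts with usub8/sel, sums them with usad8, and accumulates the squared differences with uxtb16/smlad; its epilogue then returns sse - ((sum * sum) >> 8). A minimal scalar sketch of the same computation (the function name and plain-C form are illustrative only, not libvpx API):

    #include <stdint.h>

    /* Scalar model of what the removed ARMv6 16x16 kernel computes:
     * accumulate the signed pixel difference (sum) and the squared
     * difference (sse) over a 16x16 block, then return
     * sse - sum*sum/256 -- the "lsr #8" seen in the epilogue above. */
    static unsigned int variance16x16_model(const uint8_t *src, int src_stride,
                                            const uint8_t *ref, int ref_stride,
                                            unsigned int *sse) {
      int64_t sum = 0;
      uint64_t sq = 0;
      int r, c;
      for (r = 0; r < 16; ++r) {
        for (c = 0; c < 16; ++c) {
          const int diff = src[c] - ref[c];
          sum += diff;
          sq += diff * diff;
        }
        src += src_stride;
        ref += ref_stride;
      }
      *sse = (unsigned int)sq;
      return (unsigned int)(sq - ((sum * sum) >> 8));  /* 16 * 16 = 256 pixels */
    }

The 8x8 kernel that follows is the same loop over 64 pixels, with the final shift reduced accordingly.
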
-; - - - EXPORT |vp8_variance8x8_armv6| - - ARM - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *src_ptr -; r1 int source_stride -; r2 unsigned char *ref_ptr -; r3 int recon_stride -; stack unsigned int *sse -|vp8_variance8x8_armv6| PROC - - push {r4-r10, lr} - - pld [r0, r1, lsl #0] - pld [r2, r3, lsl #0] - - mov r12, #8 ; set loop counter to 8 (=block height) - mov r4, #0 ; initialize sum = 0 - mov r5, #0 ; initialize sse = 0 - -loop - ; 1st 4 pixels - ldr r6, [r0, #0x0] ; load 4 src pixels - ldr r7, [r2, #0x0] ; load 4 ref pixels - - mov lr, #0 ; constant zero - - usub8 r8, r6, r7 ; calculate difference - pld [r0, r1, lsl #1] - sel r10, r8, lr ; select bytes with positive difference - usub8 r9, r7, r6 ; calculate difference with reversed operands - pld [r2, r3, lsl #1] - sel r8, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r6, r10, lr ; calculate sum of positive differences - usad8 r7, r8, lr ; calculate sum of negative differences - orr r8, r8, r10 ; differences of all 4 pixels - ; calculate total sum - add r4, r4, r6 ; add positive differences to sum - sub r4, r4, r7 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r7, r8 ; byte (two pixels) to halfwords - uxtb16 r10, r8, ror #8 ; another two pixels to halfwords - smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1) - - ; 2nd 4 pixels - ldr r6, [r0, #0x4] ; load 4 src pixels - ldr r7, [r2, #0x4] ; load 4 ref pixels - smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2) - - usub8 r8, r6, r7 ; calculate difference - add r0, r0, r1 ; set src_ptr to next row - sel r10, r8, lr ; select bytes with positive difference - usub8 r9, r7, r6 ; calculate difference with reversed operands - add r2, r2, r3 ; set dst_ptr to next row - sel r8, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r6, r10, lr ; calculate sum of positive differences - usad8 r7, r8, lr ; calculate sum of negative differences - orr r8, r8, r10 ; differences of all 4 pixels - - ; calculate total sum - add r4, r4, r6 ; add positive differences to sum - sub r4, r4, r7 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r7, r8 ; byte (two pixels) to halfwords - uxtb16 r10, r8, ror #8 ; another two pixels to halfwords - smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1) - subs r12, r12, #1 ; next row - smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2) - - bne loop - - ; return stuff - ldr r8, [sp, #32] ; get address of sse - mul r1, r4, r4 ; sum * sum - str r5, [r8] ; store sse - sub r0, r5, r1, ASR #6 ; return (sse - ((sum * sum) >> 6)) - - pop {r4-r10, pc} - - ENDP - - END diff --git a/vp8/common/arm/neon/variance_neon.c b/vp8/common/arm/neon/variance_neon.c deleted file mode 100644 index 1b19790..0000000 --- a/vp8/common/arm/neon/variance_neon.c +++ /dev/null @@ -1,320 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
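
Both ARMv6 epilogues divide sum * sum by the pixel count with a shift: lsr #8 for the 256-pixel 16x16 block above, asr #6 for the 64-pixel 8x8 block. The rule is simply log2(width) + log2(height), which is what the consolidated C code later in this patch derives with its ctz() helper. A sketch of that rule (variance_shift is an illustrative name, not a libvpx function):

    static int variance_shift(int w, int h) {
      int shift = 0;
      while (w > 1) { w >>= 1; ++shift; }  /* log2(w); w is a power of two */
      while (h > 1) { h >>= 1; ++shift; }  /* log2(h) */
      return shift;  /* variance = sse - ((sum * sum) >> shift) */
    }

So 16x8 and 8x16 use a shift of 7 and 4x4 uses 4, matching the shifts in the C and x86 versions removed further down.
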
- */ - -#include -#include "vpx_ports/mem.h" - -unsigned int vp8_variance16x16_neon( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - int i; - int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; - uint32x2_t d0u32, d10u32; - int64x1_t d0s64, d1s64; - uint8x16_t q0u8, q1u8, q2u8, q3u8; - uint16x8_t q11u16, q12u16, q13u16, q14u16; - int32x4_t q8s32, q9s32, q10s32; - int64x2_t q0s64, q1s64, q5s64; - - q8s32 = vdupq_n_s32(0); - q9s32 = vdupq_n_s32(0); - q10s32 = vdupq_n_s32(0); - - for (i = 0; i < 8; i++) { - q0u8 = vld1q_u8(src_ptr); - src_ptr += source_stride; - q1u8 = vld1q_u8(src_ptr); - src_ptr += source_stride; - __builtin_prefetch(src_ptr); - - q2u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; - q3u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; - __builtin_prefetch(ref_ptr); - - q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8)); - q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8)); - q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8)); - q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8)); - - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); - q9s32 = vmlal_s16(q9s32, d22s16, d22s16); - q10s32 = vmlal_s16(q10s32, d23s16, d23s16); - - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); - q9s32 = vmlal_s16(q9s32, d24s16, d24s16); - q10s32 = vmlal_s16(q10s32, d25s16, d25s16); - - d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); - d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16)); - q9s32 = vmlal_s16(q9s32, d26s16, d26s16); - q10s32 = vmlal_s16(q10s32, d27s16, d27s16); - - d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); - d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16)); - q9s32 = vmlal_s16(q9s32, d28s16, d28s16); - q10s32 = vmlal_s16(q10s32, d29s16, d29s16); - } - - q10s32 = vaddq_s32(q10s32, q9s32); - q0s64 = vpaddlq_s32(q8s32); - q1s64 = vpaddlq_s32(q10s32); - - d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); - d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); - - q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), - vreinterpret_s32_s64(d0s64)); - vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); - - d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8); - d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); - - return vget_lane_u32(d0u32, 0); -} - -unsigned int vp8_variance16x8_neon( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - int i; - int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; - uint32x2_t d0u32, d10u32; - int64x1_t d0s64, d1s64; - uint8x16_t q0u8, q1u8, q2u8, q3u8; - uint16x8_t q11u16, q12u16, q13u16, q14u16; - int32x4_t q8s32, q9s32, q10s32; - int64x2_t q0s64, q1s64, q5s64; - - q8s32 = vdupq_n_s32(0); - q9s32 = vdupq_n_s32(0); - q10s32 = vdupq_n_s32(0); - - for (i = 0; i < 4; i++) { // variance16x8_neon_loop - q0u8 = vld1q_u8(src_ptr); - src_ptr += source_stride; - q1u8 = vld1q_u8(src_ptr); - src_ptr += source_stride; - __builtin_prefetch(src_ptr); - - q2u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; - q3u8 = 
vld1q_u8(ref_ptr); - ref_ptr += recon_stride; - __builtin_prefetch(ref_ptr); - - q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8)); - q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8)); - q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8)); - q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8)); - - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); - q9s32 = vmlal_s16(q9s32, d22s16, d22s16); - q10s32 = vmlal_s16(q10s32, d23s16, d23s16); - - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); - q9s32 = vmlal_s16(q9s32, d24s16, d24s16); - q10s32 = vmlal_s16(q10s32, d25s16, d25s16); - - d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); - d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16)); - q9s32 = vmlal_s16(q9s32, d26s16, d26s16); - q10s32 = vmlal_s16(q10s32, d27s16, d27s16); - - d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); - d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16)); - q9s32 = vmlal_s16(q9s32, d28s16, d28s16); - q10s32 = vmlal_s16(q10s32, d29s16, d29s16); - } - - q10s32 = vaddq_s32(q10s32, q9s32); - q0s64 = vpaddlq_s32(q8s32); - q1s64 = vpaddlq_s32(q10s32); - - d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); - d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); - - q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), - vreinterpret_s32_s64(d0s64)); - vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); - - d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7); - d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); - - return vget_lane_u32(d0u32, 0); -} - -unsigned int vp8_variance8x16_neon( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - int i; - uint8x8_t d0u8, d2u8, d4u8, d6u8; - int16x4_t d22s16, d23s16, d24s16, d25s16; - uint32x2_t d0u32, d10u32; - int64x1_t d0s64, d1s64; - uint16x8_t q11u16, q12u16; - int32x4_t q8s32, q9s32, q10s32; - int64x2_t q0s64, q1s64, q5s64; - - q8s32 = vdupq_n_s32(0); - q9s32 = vdupq_n_s32(0); - q10s32 = vdupq_n_s32(0); - - for (i = 0; i < 8; i++) { // variance8x16_neon_loop - d0u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - d2u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - __builtin_prefetch(src_ptr); - - d4u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - d6u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - __builtin_prefetch(ref_ptr); - - q11u16 = vsubl_u8(d0u8, d4u8); - q12u16 = vsubl_u8(d2u8, d6u8); - - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); - q9s32 = vmlal_s16(q9s32, d22s16, d22s16); - q10s32 = vmlal_s16(q10s32, d23s16, d23s16); - - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); - q9s32 = vmlal_s16(q9s32, d24s16, d24s16); - q10s32 = vmlal_s16(q10s32, d25s16, d25s16); - } - - q10s32 = vaddq_s32(q10s32, q9s32); - q0s64 = vpaddlq_s32(q8s32); - q1s64 = vpaddlq_s32(q10s32); - - d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); - d1s64 = vadd_s64(vget_low_s64(q1s64), 
vget_high_s64(q1s64)); - - q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), - vreinterpret_s32_s64(d0s64)); - vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); - - d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7); - d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); - - return vget_lane_u32(d0u32, 0); -} - -unsigned int vp8_variance8x8_neon( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - int i; - uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; - int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; - uint32x2_t d0u32, d10u32; - int64x1_t d0s64, d1s64; - uint16x8_t q11u16, q12u16, q13u16, q14u16; - int32x4_t q8s32, q9s32, q10s32; - int64x2_t q0s64, q1s64, q5s64; - - q8s32 = vdupq_n_s32(0); - q9s32 = vdupq_n_s32(0); - q10s32 = vdupq_n_s32(0); - - for (i = 0; i < 2; i++) { // variance8x8_neon_loop - d0u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - d1u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - d2u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - d3u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - - d4u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - d5u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - d6u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - d7u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - - q11u16 = vsubl_u8(d0u8, d4u8); - q12u16 = vsubl_u8(d1u8, d5u8); - q13u16 = vsubl_u8(d2u8, d6u8); - q14u16 = vsubl_u8(d3u8, d7u8); - - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); - q9s32 = vmlal_s16(q9s32, d22s16, d22s16); - q10s32 = vmlal_s16(q10s32, d23s16, d23s16); - - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); - q9s32 = vmlal_s16(q9s32, d24s16, d24s16); - q10s32 = vmlal_s16(q10s32, d25s16, d25s16); - - d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); - d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16)); - q9s32 = vmlal_s16(q9s32, d26s16, d26s16); - q10s32 = vmlal_s16(q10s32, d27s16, d27s16); - - d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); - d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16)); - q9s32 = vmlal_s16(q9s32, d28s16, d28s16); - q10s32 = vmlal_s16(q10s32, d29s16, d29s16); - } - - q10s32 = vaddq_s32(q10s32, q9s32); - q0s64 = vpaddlq_s32(q8s32); - q1s64 = vpaddlq_s32(q10s32); - - d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); - d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); - - q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), - vreinterpret_s32_s64(d0s64)); - vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); - - d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 6); - d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); - - return vget_lane_u32(d0u32, 0); -} diff --git a/vp8/common/arm/variance_arm.c b/vp8/common/arm/variance_arm.c index 467a509..0f293f0 100644 --- a/vp8/common/arm/variance_arm.c +++ b/vp8/common/arm/variance_arm.c @@ -9,10 +9,14 @@ */ #include "vpx_config.h" -#include "vp8_rtcd.h" +#include "./vp8_rtcd.h" +#include "./vpx_dsp_rtcd.h" #include "vp8/common/variance.h" #include "vp8/common/filter.h" +// TODO(johannkoenig): Move this to vpx_dsp or vp8/encoder 
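
The NEON intrinsics removed above in vp8/common/arm/neon/variance_neon.c follow the same structure: each loop widens the byte differences to 16 bits with vsubl_u8, folds them into four running 32-bit sums with vpadalq_s16, and accumulates the squares with vmlal_s16; the epilogue then reduces the lanes with vpaddlq_s32/vadd_s64 and applies the usual correction with a shift of 8, 7, 7 or 6 depending on block size. A scalar model of that final reduction (the function and its arguments are illustrative only):

    #include <stdint.h>

    /* Fold the four 32-bit partial sums and the four partial squared sums
     * left by the NEON loops into scalars, then apply the
     * sse - sum*sum/pixel_count correction. */
    static unsigned int fold_variance(const int32_t sum_lanes[4],
                                      const int32_t sse_lanes[4],
                                      int shift, unsigned int *sse) {
      int64_t sum = 0;
      int64_t sq = 0;
      int i;
      for (i = 0; i < 4; ++i) {
        sum += sum_lanes[i];
        sq += sse_lanes[i];
      }
      *sse = (unsigned int)sq;
      return (unsigned int)(sq - ((sum * sum) >> shift));
    }
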
+#if CONFIG_VP8_ENCODER + #if HAVE_MEDIA #include "vp8/common/arm/bilinearfilter_arm.h" @@ -40,8 +44,8 @@ unsigned int vp8_sub_pixel_variance8x8_armv6 vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass, 8, 8, 8, VFilter); - return vp8_variance8x8_armv6(second_pass, 8, dst_ptr, - dst_pixels_per_line, sse); + return vpx_variance8x8_media(second_pass, 8, dst_ptr, + dst_pixels_per_line, sse); } unsigned int vp8_sub_pixel_variance16x16_armv6 @@ -86,13 +90,13 @@ unsigned int vp8_sub_pixel_variance16x16_armv6 vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass, 16, 16, 16, VFilter); - var = vp8_variance16x16_armv6(second_pass, 16, dst_ptr, - dst_pixels_per_line, sse); + var = vpx_variance16x16_media(second_pass, 16, dst_ptr, + dst_pixels_per_line, sse); } return var; } -#endif /* HAVE_MEDIA */ +#endif // HAVE_MEDIA #if HAVE_NEON @@ -129,4 +133,5 @@ unsigned int vp8_sub_pixel_variance16x16_neon return vp8_sub_pixel_variance16x16_neon_func(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse); } -#endif +#endif // HAVE_NEON +#endif // CONFIG_VP8_ENCODER diff --git a/vp8/common/mfqe.c b/vp8/common/mfqe.c index d12dea1..5c0680f 100644 --- a/vp8/common/mfqe.c +++ b/vp8/common/mfqe.c @@ -151,14 +151,14 @@ static void multiframe_quality_enhance_block if (blksize == 16) { - actd = (vp8_variance16x16(yd, yd_stride, VP8_ZEROS, 0, &sse)+128)>>8; - act = (vp8_variance16x16(y, y_stride, VP8_ZEROS, 0, &sse)+128)>>8; + actd = (vpx_variance16x16(yd, yd_stride, VP8_ZEROS, 0, &sse)+128)>>8; + act = (vpx_variance16x16(y, y_stride, VP8_ZEROS, 0, &sse)+128)>>8; #ifdef USE_SSD - vp8_variance16x16(y, y_stride, yd, yd_stride, &sse); + vpx_variance16x16(y, y_stride, yd, yd_stride, &sse); sad = (sse + 128)>>8; - vp8_variance8x8(u, uv_stride, ud, uvd_stride, &sse); + vpx_variance8x8(u, uv_stride, ud, uvd_stride, &sse); usad = (sse + 32)>>6; - vp8_variance8x8(v, uv_stride, vd, uvd_stride, &sse); + vpx_variance8x8(v, uv_stride, vd, uvd_stride, &sse); vsad = (sse + 32)>>6; #else sad = (vpx_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8; @@ -168,14 +168,14 @@ static void multiframe_quality_enhance_block } else /* if (blksize == 8) */ { - actd = (vp8_variance8x8(yd, yd_stride, VP8_ZEROS, 0, &sse)+32)>>6; - act = (vp8_variance8x8(y, y_stride, VP8_ZEROS, 0, &sse)+32)>>6; + actd = (vpx_variance8x8(yd, yd_stride, VP8_ZEROS, 0, &sse)+32)>>6; + act = (vpx_variance8x8(y, y_stride, VP8_ZEROS, 0, &sse)+32)>>6; #ifdef USE_SSD - vp8_variance8x8(y, y_stride, yd, yd_stride, &sse); + vpx_variance8x8(y, y_stride, yd, yd_stride, &sse); sad = (sse + 32)>>6; - vp8_variance4x4(u, uv_stride, ud, uvd_stride, &sse); + vpx_variance4x4(u, uv_stride, ud, uvd_stride, &sse); usad = (sse + 8)>>4; - vp8_variance4x4(v, uv_stride, vd, uvd_stride, &sse); + vpx_variance4x4(v, uv_stride, vd, uvd_stride, &sse); vsad = (sse + 8)>>4; #else sad = (vpx_sad8x8(y, y_stride, yd, yd_stride) + 32) >> 6; diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl index c9f14d5..4b82033 100644 --- a/vp8/common/rtcd_defs.pl +++ b/vp8/common/rtcd_defs.pl @@ -237,31 +237,6 @@ specialize qw/vp8_bilinear_predict4x4 mmx media neon/; $vp8_bilinear_predict4x4_media=vp8_bilinear_predict4x4_armv6; # -# Whole-pixel Variance -# -add_proto qw/unsigned int vp8_variance4x4/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp8_variance4x4 mmx sse2/; -$vp8_variance4x4_sse2=vp8_variance4x4_wmt; - -add_proto qw/unsigned int vp8_variance8x8/, "const 
unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp8_variance8x8 mmx sse2 media neon/; -$vp8_variance8x8_sse2=vp8_variance8x8_wmt; -$vp8_variance8x8_media=vp8_variance8x8_armv6; - -add_proto qw/unsigned int vp8_variance8x16/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp8_variance8x16 mmx sse2 neon/; -$vp8_variance8x16_sse2=vp8_variance8x16_wmt; - -add_proto qw/unsigned int vp8_variance16x8/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp8_variance16x8 mmx sse2 neon/; -$vp8_variance16x8_sse2=vp8_variance16x8_wmt; - -add_proto qw/unsigned int vp8_variance16x16/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp8_variance16x16 mmx sse2 media neon/; -$vp8_variance16x16_sse2=vp8_variance16x16_wmt; -$vp8_variance16x16_media=vp8_variance16x16_armv6; - -# # Sub-pixel Variance # add_proto qw/unsigned int vp8_sub_pixel_variance4x4/, "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"; @@ -309,26 +284,12 @@ $vp8_variance_halfpixvar16x16_hv_media=vp8_variance_halfpixvar16x16_hv_armv6; if (vpx_config("CONFIG_VP8_ENCODER") eq "yes") { # -# Sum of squares (vector) -# -add_proto qw/unsigned int vp8_get_mb_ss/, "const short *"; -specialize qw/vp8_get_mb_ss mmx sse2/; - -# # SSE (Sum Squared Error) # add_proto qw/unsigned int vp8_sub_pixel_mse16x16/, "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"; specialize qw/vp8_sub_pixel_mse16x16 mmx sse2/; $vp8_sub_pixel_mse16x16_sse2=vp8_sub_pixel_mse16x16_wmt; -add_proto qw/unsigned int vp8_mse16x16/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp8_mse16x16 mmx sse2 media neon/; -$vp8_mse16x16_sse2=vp8_mse16x16_wmt; -$vp8_mse16x16_media=vp8_mse16x16_armv6; - -add_proto qw/unsigned int vp8_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride"; -specialize qw/vp8_get4x4sse_cs mmx neon/; - # # Block copy # diff --git a/vp8/common/variance.h b/vp8/common/variance.h index b62cc61..c6c9f41 100644 --- a/vp8/common/variance.h +++ b/vp8/common/variance.h @@ -39,6 +39,7 @@ typedef void (*vpx_sad_multi_fn_t)( const unsigned char *ref_array, int ref_stride, unsigned int *sad_array); + typedef void (*vpx_sad_multi_d_fn_t) ( const unsigned char *src_ptr, @@ -48,7 +49,7 @@ typedef void (*vpx_sad_multi_d_fn_t) unsigned int *sad_array ); -typedef unsigned int (*vp8_variance_fn_t) +typedef unsigned int (*vpx_variance_fn_t) ( const unsigned char *src_ptr, int source_stride, @@ -68,37 +69,14 @@ typedef unsigned int (*vp8_subpixvariance_fn_t) unsigned int *sse ); -typedef void (*vp8_ssimpf_fn_t) - ( - unsigned char *s, - int sp, - unsigned char *r, - int rp, - unsigned long *sum_s, - unsigned long *sum_r, - unsigned long *sum_sq_s, - unsigned long *sum_sq_r, - unsigned long *sum_sxr - ); - -typedef unsigned int (*vp8_getmbss_fn_t)(const short *); - -typedef unsigned int (*vp8_get16x16prederror_fn_t) - ( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int ref_stride - ); - typedef struct variance_vtable 
{ vpx_sad_fn_t sdf; - vp8_variance_fn_t vf; + vpx_variance_fn_t vf; vp8_subpixvariance_fn_t svf; - vp8_variance_fn_t svf_halfpix_h; - vp8_variance_fn_t svf_halfpix_v; - vp8_variance_fn_t svf_halfpix_hv; + vpx_variance_fn_t svf_halfpix_h; + vpx_variance_fn_t svf_halfpix_v; + vpx_variance_fn_t svf_halfpix_hv; vpx_sad_multi_fn_t sdx3f; vpx_sad_multi_fn_t sdx8f; vpx_sad_multi_d_fn_t sdx4df; diff --git a/vp8/common/variance_c.c b/vp8/common/variance_c.c index dc95bfe..79d1ca0 100644 --- a/vp8/common/variance_c.c +++ b/vp8/common/variance_c.c @@ -8,44 +8,34 @@ * be found in the AUTHORS file in the root of the source tree. */ - #include "./vp8_rtcd.h" #include "filter.h" #include "variance.h" - -unsigned int vp8_get_mb_ss_c -( - const short *src_ptr -) -{ - unsigned int i = 0, sum = 0; - - do - { - sum += (src_ptr[i] * src_ptr[i]); - i++; - } - while (i < 256); - - return sum; +/* This is a bad idea. + * ctz = count trailing zeros */ +static int ctz(int a) { + int b = 0; + while (a != 1) { + a >>= 1; + b++; + } + return b; } - -static void variance( +static unsigned int variance( const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int recon_stride, int w, int h, - unsigned int *sse, - int *sum) + unsigned int *sse) { int i, j; - int diff; + int diff, sum; - *sum = 0; + sum = 0; *sse = 0; for (i = 0; i < h; i++) @@ -53,114 +43,17 @@ static void variance( for (j = 0; j < w; j++) { diff = src_ptr[j] - ref_ptr[j]; - *sum += diff; + sum += diff; *sse += diff * diff; } src_ptr += source_stride; ref_ptr += recon_stride; } -} - -unsigned int vp8_variance16x16_c( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int var; - int avg; - - - variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg); - *sse = var; - return (var - (((unsigned int)avg * avg) >> 8)); -} - -unsigned int vp8_variance8x16_c( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int var; - int avg; - - - variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16, &var, &avg); - *sse = var; - return (var - (((unsigned int)avg * avg) >> 7)); + return (*sse - (((unsigned int)sum * sum) >> (int)((ctz(w) + ctz(h))))); } -unsigned int vp8_variance16x8_c( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int var; - int avg; - - - variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8, &var, &avg); - *sse = var; - return (var - (((unsigned int)avg * avg) >> 7)); -} - - -unsigned int vp8_variance8x8_c( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int var; - int avg; - - - variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, &var, &avg); - *sse = var; - return (var - (((unsigned int)avg * avg) >> 6)); -} - -unsigned int vp8_variance4x4_c( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int var; - int avg; - - - variance(src_ptr, source_stride, ref_ptr, recon_stride, 4, 4, &var, &avg); - *sse = var; - return (var - (((unsigned int)avg * avg) >> 4)); -} - - -unsigned int vp8_mse16x16_c( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int var; - int 
avg; - - variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg); - *sse = var; - return var; -} - - /**************************************************************************** * * ROUTINE : filter_block2d_bil_first_pass @@ -304,7 +197,7 @@ unsigned int vp8_sub_pixel_variance4x4_c /* Now filter Verticaly */ var_filter_block2d_bil_second_pass(FData3, temp2, 4, 4, 4, 4, VFilter); - return vp8_variance4x4_c(temp2, 4, dst_ptr, dst_pixels_per_line, sse); + return variance(temp2, 4, dst_ptr, dst_pixels_per_line, 4, 4, sse); } @@ -329,7 +222,7 @@ unsigned int vp8_sub_pixel_variance8x8_c var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 8, HFilter); var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 8, 8, VFilter); - return vp8_variance8x8_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse); + return variance(temp2, 8, dst_ptr, dst_pixels_per_line, 8, 8, sse); } unsigned int vp8_sub_pixel_variance16x16_c @@ -353,7 +246,7 @@ unsigned int vp8_sub_pixel_variance16x16_c var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 16, HFilter); var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 16, 16, VFilter); - return vp8_variance16x16_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse); + return variance(temp2, 16, dst_ptr, dst_pixels_per_line, 16, 16, sse); } @@ -429,7 +322,7 @@ unsigned int vp8_sub_pixel_variance16x8_c var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 16, HFilter); var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 8, 16, VFilter); - return vp8_variance16x8_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse); + return variance(temp2, 16, dst_ptr, dst_pixels_per_line, 16, 8, sse); } unsigned int vp8_sub_pixel_variance8x16_c @@ -455,5 +348,5 @@ unsigned int vp8_sub_pixel_variance8x16_c var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 8, HFilter); var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 16, 8, VFilter); - return vp8_variance8x16_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse); + return variance(temp2, 8, dst_ptr, dst_pixels_per_line, 8, 16, sse); } diff --git a/vp8/common/x86/variance_impl_mmx.asm b/vp8/common/x86/variance_impl_mmx.asm index 7d5e681..97f2527 100644 --- a/vp8/common/x86/variance_impl_mmx.asm +++ b/vp8/common/x86/variance_impl_mmx.asm @@ -11,504 +11,6 @@ %include "vpx_ports/x86_abi_support.asm" -;unsigned int vp8_get_mb_ss_mmx( short *src_ptr ) -global sym(vp8_get_mb_ss_mmx) PRIVATE -sym(vp8_get_mb_ss_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - GET_GOT rbx - push rsi - push rdi - sub rsp, 8 - ; end prolog - - mov rax, arg(0) ;src_ptr - mov rcx, 16 - pxor mm4, mm4 - -.NEXTROW: - movq mm0, [rax] - movq mm1, [rax+8] - movq mm2, [rax+16] - movq mm3, [rax+24] - pmaddwd mm0, mm0 - pmaddwd mm1, mm1 - pmaddwd mm2, mm2 - pmaddwd mm3, mm3 - - paddd mm4, mm0 - paddd mm4, mm1 - paddd mm4, mm2 - paddd mm4, mm3 - - add rax, 32 - dec rcx - ja .NEXTROW - movq QWORD PTR [rsp], mm4 - - ;return sum[0]+sum[1]; - movsxd rax, dword ptr [rsp] - movsxd rcx, dword ptr [rsp+4] - add rax, rcx - - - ; begin epilog - add rsp, 8 - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;unsigned int vp8_get8x8var_mmx -;( -; unsigned char *src_ptr, -; int source_stride, -; unsigned char *ref_ptr, -; int recon_stride, -; unsigned int *SSE, -; int *Sum -;) -global sym(vp8_get8x8var_mmx) PRIVATE -sym(vp8_get8x8var_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - push rbx - sub rsp, 16 - ; end 
prolog - - - pxor mm5, mm5 ; Blank mmx6 - pxor mm6, mm6 ; Blank mmx7 - pxor mm7, mm7 ; Blank mmx7 - - mov rax, arg(0) ;[src_ptr] ; Load base addresses - mov rbx, arg(2) ;[ref_ptr] - movsxd rcx, dword ptr arg(1) ;[source_stride] - movsxd rdx, dword ptr arg(3) ;[recon_stride] - - ; Row 1 - movq mm0, [rax] ; Copy eight bytes to mm0 - movq mm1, [rbx] ; Copy eight bytes to mm1 - movq mm2, mm0 ; Take copies - movq mm3, mm1 ; Take copies - - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - punpckhbw mm2, mm6 ; unpack to higher prrcision - punpckhbw mm3, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - psubsw mm2, mm3 ; A-B (high order) to MM2 - - paddw mm5, mm0 ; accumulate differences in mm5 - paddw mm5, mm2 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - pmaddwd mm2, mm2 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movq mm1, [rbx] ; Copy eight bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - paddd mm7, mm2 ; accumulate in mm7 - - - ; Row 2 - movq mm0, [rax] ; Copy eight bytes to mm0 - movq mm2, mm0 ; Take copies - movq mm3, mm1 ; Take copies - - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - punpckhbw mm2, mm6 ; unpack to higher prrcision - punpckhbw mm3, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - psubsw mm2, mm3 ; A-B (high order) to MM2 - - paddw mm5, mm0 ; accumulate differences in mm5 - paddw mm5, mm2 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - pmaddwd mm2, mm2 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movq mm1, [rbx] ; Copy eight bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - paddd mm7, mm2 ; accumulate in mm7 - - ; Row 3 - movq mm0, [rax] ; Copy eight bytes to mm0 - movq mm2, mm0 ; Take copies - movq mm3, mm1 ; Take copies - - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - punpckhbw mm2, mm6 ; unpack to higher prrcision - punpckhbw mm3, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - psubsw mm2, mm3 ; A-B (high order) to MM2 - - paddw mm5, mm0 ; accumulate differences in mm5 - paddw mm5, mm2 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - pmaddwd mm2, mm2 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movq mm1, [rbx] ; Copy eight bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - paddd mm7, mm2 ; accumulate in mm7 - - ; Row 4 - movq mm0, [rax] ; Copy eight bytes to mm0 - movq mm2, mm0 ; Take copies - movq mm3, mm1 ; Take copies - - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - punpckhbw mm2, mm6 ; unpack to higher prrcision - punpckhbw mm3, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - psubsw mm2, mm3 ; A-B (high order) to MM2 - - paddw mm5, mm0 ; accumulate differences in mm5 - paddw mm5, mm2 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - pmaddwd mm2, mm2 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movq mm1, [rbx] ; Copy eight bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - paddd mm7, mm2 ; accumulate in mm7 - - ; Row 5 - movq mm0, [rax] ; Copy eight bytes to mm0 - movq mm2, mm0 ; Take copies - movq mm3, mm1 ; Take copies - - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - punpckhbw mm2, mm6 ; unpack to higher prrcision - punpckhbw mm3, mm6 - psubsw 
mm0, mm1 ; A-B (low order) to MM0 - psubsw mm2, mm3 ; A-B (high order) to MM2 - - paddw mm5, mm0 ; accumulate differences in mm5 - paddw mm5, mm2 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - pmaddwd mm2, mm2 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movq mm1, [rbx] ; Copy eight bytes to mm1 - ; movq mm4, [rbx + rdx] - paddd mm7, mm0 ; accumulate in mm7 - paddd mm7, mm2 ; accumulate in mm7 - - ; Row 6 - movq mm0, [rax] ; Copy eight bytes to mm0 - movq mm2, mm0 ; Take copies - movq mm3, mm1 ; Take copies - - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - punpckhbw mm2, mm6 ; unpack to higher prrcision - punpckhbw mm3, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - psubsw mm2, mm3 ; A-B (high order) to MM2 - - paddw mm5, mm0 ; accumulate differences in mm5 - paddw mm5, mm2 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - pmaddwd mm2, mm2 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movq mm1, [rbx] ; Copy eight bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - paddd mm7, mm2 ; accumulate in mm7 - - ; Row 7 - movq mm0, [rax] ; Copy eight bytes to mm0 - movq mm2, mm0 ; Take copies - movq mm3, mm1 ; Take copies - - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - punpckhbw mm2, mm6 ; unpack to higher prrcision - punpckhbw mm3, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - psubsw mm2, mm3 ; A-B (high order) to MM2 - - paddw mm5, mm0 ; accumulate differences in mm5 - paddw mm5, mm2 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - pmaddwd mm2, mm2 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movq mm1, [rbx] ; Copy eight bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - paddd mm7, mm2 ; accumulate in mm7 - - ; Row 8 - movq mm0, [rax] ; Copy eight bytes to mm0 - movq mm2, mm0 ; Take copies - movq mm3, mm1 ; Take copies - - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - punpckhbw mm2, mm6 ; unpack to higher prrcision - punpckhbw mm3, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - psubsw mm2, mm3 ; A-B (high order) to MM2 - - paddw mm5, mm0 ; accumulate differences in mm5 - paddw mm5, mm2 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - pmaddwd mm2, mm2 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - paddd mm7, mm0 ; accumulate in mm7 - paddd mm7, mm2 ; accumulate in mm7 - - ; Now accumulate the final results. 
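
The accumulation below unpacks mm5 (four packed signed 16-bit partial sums) and mm7 (two packed 32-bit partial squared sums) into the Sum and SSE out-parameters. The C wrappers removed further down in variance_mmx.c and variance_sse2.c then build the larger block sizes out of these 8x8 results; a sketch of that combination step, under illustrative names:

    #include <stdint.h>

    /* Model of how the deleted MMX/SSE2 wrappers assemble a variance from
     * the (SSE, Sum) pairs returned by the get8x8var helpers: add the
     * per-tile totals, then subtract sum*sum / pixel_count. */
    typedef struct { unsigned int sse; int sum; } TileStats;

    static unsigned int combine_tiles(const TileStats *tiles, int n_tiles,
                                      int shift, unsigned int *sse) {
      unsigned int total_sse = 0;
      int64_t total_sum = 0;
      int i;
      for (i = 0; i < n_tiles; ++i) {
        total_sse += tiles[i].sse;
        total_sum += tiles[i].sum;
      }
      *sse = total_sse;
      return (unsigned int)(total_sse - ((total_sum * total_sum) >> shift));
    }

vp8_variance16x16_mmx passes four such tiles and a shift of 8; the 16x8 and 8x16 wrappers pass two tiles and a shift of 7.
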
- movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory - movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory - movsx rdx, WORD PTR [rsp+8] - movsx rcx, WORD PTR [rsp+10] - movsx rbx, WORD PTR [rsp+12] - movsx rax, WORD PTR [rsp+14] - add rdx, rcx - add rbx, rax - add rdx, rbx ;XSum - movsxd rax, DWORD PTR [rsp] - movsxd rcx, DWORD PTR [rsp+4] - add rax, rcx ;XXSum - mov rsi, arg(4) ;SSE - mov rdi, arg(5) ;Sum - mov dword ptr [rsi], eax - mov dword ptr [rdi], edx - xor rax, rax ; return 0 - - - ; begin epilog - add rsp, 16 - pop rbx - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - - -;unsigned int -;vp8_get4x4var_mmx -;( -; unsigned char *src_ptr, -; int source_stride, -; unsigned char *ref_ptr, -; int recon_stride, -; unsigned int *SSE, -; int *Sum -;) -global sym(vp8_get4x4var_mmx) PRIVATE -sym(vp8_get4x4var_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - push rbx - sub rsp, 16 - ; end prolog - - - pxor mm5, mm5 ; Blank mmx6 - pxor mm6, mm6 ; Blank mmx7 - pxor mm7, mm7 ; Blank mmx7 - - mov rax, arg(0) ;[src_ptr] ; Load base addresses - mov rbx, arg(2) ;[ref_ptr] - movsxd rcx, dword ptr arg(1) ;[source_stride] - movsxd rdx, dword ptr arg(3) ;[recon_stride] - - ; Row 1 - movd mm0, [rax] ; Copy four bytes to mm0 - movd mm1, [rbx] ; Copy four bytes to mm1 - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - paddw mm5, mm0 ; accumulate differences in mm5 - pmaddwd mm0, mm0 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movd mm1, [rbx] ; Copy four bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - - - ; Row 2 - movd mm0, [rax] ; Copy four bytes to mm0 - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - paddw mm5, mm0 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movd mm1, [rbx] ; Copy four bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - - ; Row 3 - movd mm0, [rax] ; Copy four bytes to mm0 - punpcklbw mm0, mm6 ; unpack to higher precision - punpcklbw mm1, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - paddw mm5, mm0 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movd mm1, [rbx] ; Copy four bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - - ; Row 4 - movd mm0, [rax] ; Copy four bytes to mm0 - - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - - paddw mm5, mm0 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - paddd mm7, mm0 ; accumulate in mm7 - - - ; Now accumulate the final results. 
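
The 4x4 helper reduces its results the same way, with the caller applying a shift of 4 for the 16 pixels. The SSE-only routines in this family -- vp8_get4x4sse_cs_mmx just below, and the mse16x16 kernels removed later in the patch, whose ARMv6 source notes that "sum is never used" -- skip the sum term entirely and return the raw sum of squared differences, which is what the new VpxSseTest/VpxMseTest cases exercise. A scalar sketch (name and form are illustrative only):

    #include <stdint.h>

    /* SSE-only variant: same accumulation as the variance kernels but
     * without the sum term, so the return value is the raw sum of
     * squared differences over a w-by-h block. */
    static unsigned int sse_wxh_model(const uint8_t *src, int src_stride,
                                      const uint8_t *ref, int ref_stride,
                                      int w, int h) {
      unsigned int sq = 0;
      int r, c;
      for (r = 0; r < h; ++r) {
        for (c = 0; c < w; ++c) {
          const int diff = src[c] - ref[c];
          sq += (unsigned int)(diff * diff);
        }
        src += src_stride;
        ref += ref_stride;
      }
      return sq;
    }
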
- movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory - movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory - movsx rdx, WORD PTR [rsp+8] - movsx rcx, WORD PTR [rsp+10] - movsx rbx, WORD PTR [rsp+12] - movsx rax, WORD PTR [rsp+14] - add rdx, rcx - add rbx, rax - add rdx, rbx ;XSum - movsxd rax, DWORD PTR [rsp] - movsxd rcx, DWORD PTR [rsp+4] - add rax, rcx ;XXSum - mov rsi, arg(4) ;SSE - mov rdi, arg(5) ;Sum - mov dword ptr [rsi], eax - mov dword ptr [rdi], edx - xor rax, rax ; return 0 - - - ; begin epilog - add rsp, 16 - pop rbx - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - - -;unsigned int -;vp8_get4x4sse_cs_mmx -;( -; unsigned char *src_ptr, -; int source_stride, -; unsigned char *ref_ptr, -; int recon_stride -;) -global sym(vp8_get4x4sse_cs_mmx) PRIVATE -sym(vp8_get4x4sse_cs_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - push rsi - push rdi - push rbx - ; end prolog - - - pxor mm6, mm6 ; Blank mmx7 - pxor mm7, mm7 ; Blank mmx7 - - mov rax, arg(0) ;[src_ptr] ; Load base addresses - mov rbx, arg(2) ;[ref_ptr] - movsxd rcx, dword ptr arg(1) ;[source_stride] - movsxd rdx, dword ptr arg(3) ;[recon_stride] - ; Row 1 - movd mm0, [rax] ; Copy eight bytes to mm0 - movd mm1, [rbx] ; Copy eight bytes to mm1 - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - pmaddwd mm0, mm0 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movd mm1, [rbx] ; Copy eight bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - - ; Row 2 - movd mm0, [rax] ; Copy eight bytes to mm0 - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - pmaddwd mm0, mm0 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movd mm1, [rbx] ; Copy eight bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - - ; Row 3 - movd mm0, [rax] ; Copy eight bytes to mm0 - punpcklbw mm1, mm6 - punpcklbw mm0, mm6 ; unpack to higher prrcision - psubsw mm0, mm1 ; A-B (low order) to MM0 - - pmaddwd mm0, mm0 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movd mm1, [rbx] ; Copy eight bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - - ; Row 4 - movd mm0, [rax] ; Copy eight bytes to mm0 - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - pmaddwd mm0, mm0 ; square and accumulate - paddd mm7, mm0 ; accumulate in mm7 - - movq mm0, mm7 ; - psrlq mm7, 32 - - paddd mm0, mm7 - movq rax, mm0 - - - ; begin epilog - pop rbx - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - %define mmx_filter_shift 7 ;void vp8_filter_block2d_bil4x4_var_mmx diff --git a/vp8/common/x86/variance_impl_sse2.asm b/vp8/common/x86/variance_impl_sse2.asm index 761433c..26de5e8 100644 --- a/vp8/common/x86/variance_impl_sse2.asm +++ b/vp8/common/x86/variance_impl_sse2.asm @@ -13,393 +13,6 @@ %define xmm_filter_shift 7 -;unsigned int vp8_get_mb_ss_sse2 -;( -; short *src_ptr -;) -global sym(vp8_get_mb_ss_sse2) PRIVATE -sym(vp8_get_mb_ss_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 1 - GET_GOT rbx - push rsi - push rdi - sub rsp, 16 - ; end prolog - - - mov rax, arg(0) ;[src_ptr] - mov rcx, 8 - pxor xmm4, xmm4 - -.NEXTROW: - movdqa xmm0, [rax] - movdqa xmm1, [rax+16] - movdqa xmm2, [rax+32] - movdqa xmm3, [rax+48] - pmaddwd xmm0, xmm0 - 
pmaddwd xmm1, xmm1 - pmaddwd xmm2, xmm2 - pmaddwd xmm3, xmm3 - - paddd xmm0, xmm1 - paddd xmm2, xmm3 - paddd xmm4, xmm0 - paddd xmm4, xmm2 - - add rax, 0x40 - dec rcx - ja .NEXTROW - - movdqa xmm3,xmm4 - psrldq xmm4,8 - paddd xmm4,xmm3 - movdqa xmm3,xmm4 - psrldq xmm4,4 - paddd xmm4,xmm3 - movq rax,xmm4 - - - ; begin epilog - add rsp, 16 - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;unsigned int vp8_get16x16var_sse2 -;( -; unsigned char * src_ptr, -; int source_stride, -; unsigned char * ref_ptr, -; int recon_stride, -; unsigned int * SSE, -; int * Sum -;) -global sym(vp8_get16x16var_sse2) PRIVATE -sym(vp8_get16x16var_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rbx - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;[src_ptr] - mov rdi, arg(2) ;[ref_ptr] - - movsxd rax, DWORD PTR arg(1) ;[source_stride] - movsxd rdx, DWORD PTR arg(3) ;[recon_stride] - - ; Prefetch data - lea rcx, [rax+rax*2] - prefetcht0 [rsi] - prefetcht0 [rsi+rax] - prefetcht0 [rsi+rax*2] - prefetcht0 [rsi+rcx] - lea rbx, [rsi+rax*4] - prefetcht0 [rbx] - prefetcht0 [rbx+rax] - prefetcht0 [rbx+rax*2] - prefetcht0 [rbx+rcx] - - lea rcx, [rdx+rdx*2] - prefetcht0 [rdi] - prefetcht0 [rdi+rdx] - prefetcht0 [rdi+rdx*2] - prefetcht0 [rdi+rcx] - lea rbx, [rdi+rdx*4] - prefetcht0 [rbx] - prefetcht0 [rbx+rdx] - prefetcht0 [rbx+rdx*2] - prefetcht0 [rbx+rcx] - - pxor xmm0, xmm0 ; clear xmm0 for unpack - pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs - - pxor xmm6, xmm6 ; clear xmm6 for accumulating sse - mov rcx, 16 - -.var16loop: - movdqu xmm1, XMMWORD PTR [rsi] - movdqu xmm2, XMMWORD PTR [rdi] - - prefetcht0 [rsi+rax*8] - prefetcht0 [rdi+rdx*8] - - movdqa xmm3, xmm1 - movdqa xmm4, xmm2 - - - punpcklbw xmm1, xmm0 - punpckhbw xmm3, xmm0 - - punpcklbw xmm2, xmm0 - punpckhbw xmm4, xmm0 - - - psubw xmm1, xmm2 - psubw xmm3, xmm4 - - paddw xmm7, xmm1 - pmaddwd xmm1, xmm1 - - paddw xmm7, xmm3 - pmaddwd xmm3, xmm3 - - paddd xmm6, xmm1 - paddd xmm6, xmm3 - - add rsi, rax - add rdi, rdx - - sub rcx, 1 - jnz .var16loop - - - movdqa xmm1, xmm6 - pxor xmm6, xmm6 - - pxor xmm5, xmm5 - punpcklwd xmm6, xmm7 - - punpckhwd xmm5, xmm7 - psrad xmm5, 16 - - psrad xmm6, 16 - paddd xmm6, xmm5 - - movdqa xmm2, xmm1 - punpckldq xmm1, xmm0 - - punpckhdq xmm2, xmm0 - movdqa xmm7, xmm6 - - paddd xmm1, xmm2 - punpckldq xmm6, xmm0 - - punpckhdq xmm7, xmm0 - paddd xmm6, xmm7 - - movdqa xmm2, xmm1 - movdqa xmm7, xmm6 - - psrldq xmm1, 8 - psrldq xmm6, 8 - - paddd xmm7, xmm6 - paddd xmm1, xmm2 - - mov rax, arg(5) ;[Sum] - mov rdi, arg(4) ;[SSE] - - movd DWORD PTR [rax], xmm7 - movd DWORD PTR [rdi], xmm1 - - - ; begin epilog - pop rdi - pop rsi - pop rbx - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - - - -;unsigned int vp8_get8x8var_sse2 -;( -; unsigned char * src_ptr, -; int source_stride, -; unsigned char * ref_ptr, -; int recon_stride, -; unsigned int * SSE, -; int * Sum -;) -global sym(vp8_get8x8var_sse2) PRIVATE -sym(vp8_get8x8var_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - sub rsp, 16 - ; end prolog - - mov rsi, arg(0) ;[src_ptr] - mov rdi, arg(2) ;[ref_ptr] - - movsxd rax, DWORD PTR arg(1) ;[source_stride] - movsxd rdx, DWORD PTR arg(3) ;[recon_stride] - - pxor xmm0, xmm0 ; clear xmm0 for unpack - pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs - - movq xmm1, QWORD PTR [rsi] - movq xmm2, QWORD PTR [rdi] - - punpcklbw xmm1, xmm0 - punpcklbw xmm2, xmm0 - - psubsw xmm1, xmm2 - paddw xmm7, xmm1 - - pmaddwd xmm1, xmm1 - - 
movq xmm2, QWORD PTR[rsi + rax] - movq xmm3, QWORD PTR[rdi + rdx] - - punpcklbw xmm2, xmm0 - punpcklbw xmm3, xmm0 - - psubsw xmm2, xmm3 - paddw xmm7, xmm2 - - pmaddwd xmm2, xmm2 - paddd xmm1, xmm2 - - - movq xmm2, QWORD PTR[rsi + rax * 2] - movq xmm3, QWORD PTR[rdi + rdx * 2] - - punpcklbw xmm2, xmm0 - punpcklbw xmm3, xmm0 - - psubsw xmm2, xmm3 - paddw xmm7, xmm2 - - pmaddwd xmm2, xmm2 - paddd xmm1, xmm2 - - - lea rsi, [rsi + rax * 2] - lea rdi, [rdi + rdx * 2] - movq xmm2, QWORD PTR[rsi + rax] - movq xmm3, QWORD PTR[rdi + rdx] - - punpcklbw xmm2, xmm0 - punpcklbw xmm3, xmm0 - - psubsw xmm2, xmm3 - paddw xmm7, xmm2 - - pmaddwd xmm2, xmm2 - paddd xmm1, xmm2 - - movq xmm2, QWORD PTR[rsi + rax *2] - movq xmm3, QWORD PTR[rdi + rdx *2] - - punpcklbw xmm2, xmm0 - punpcklbw xmm3, xmm0 - - psubsw xmm2, xmm3 - paddw xmm7, xmm2 - - pmaddwd xmm2, xmm2 - paddd xmm1, xmm2 - - - lea rsi, [rsi + rax * 2] - lea rdi, [rdi + rdx * 2] - - - movq xmm2, QWORD PTR[rsi + rax] - movq xmm3, QWORD PTR[rdi + rdx] - - punpcklbw xmm2, xmm0 - punpcklbw xmm3, xmm0 - - psubsw xmm2, xmm3 - paddw xmm7, xmm2 - - pmaddwd xmm2, xmm2 - paddd xmm1, xmm2 - - movq xmm2, QWORD PTR[rsi + rax *2] - movq xmm3, QWORD PTR[rdi + rdx *2] - - punpcklbw xmm2, xmm0 - punpcklbw xmm3, xmm0 - - psubsw xmm2, xmm3 - paddw xmm7, xmm2 - - pmaddwd xmm2, xmm2 - paddd xmm1, xmm2 - - - lea rsi, [rsi + rax * 2] - lea rdi, [rdi + rdx * 2] - - movq xmm2, QWORD PTR[rsi + rax] - movq xmm3, QWORD PTR[rdi + rdx] - - punpcklbw xmm2, xmm0 - punpcklbw xmm3, xmm0 - - psubsw xmm2, xmm3 - paddw xmm7, xmm2 - - pmaddwd xmm2, xmm2 - paddd xmm1, xmm2 - - - movdqa xmm6, xmm7 - punpcklwd xmm6, xmm0 - - punpckhwd xmm7, xmm0 - movdqa xmm2, xmm1 - - paddw xmm6, xmm7 - punpckldq xmm1, xmm0 - - punpckhdq xmm2, xmm0 - movdqa xmm7, xmm6 - - paddd xmm1, xmm2 - punpckldq xmm6, xmm0 - - punpckhdq xmm7, xmm0 - paddw xmm6, xmm7 - - movdqa xmm2, xmm1 - movdqa xmm7, xmm6 - - psrldq xmm1, 8 - psrldq xmm6, 8 - - paddw xmm7, xmm6 - paddd xmm1, xmm2 - - mov rax, arg(5) ;[Sum] - mov rdi, arg(4) ;[SSE] - - movq rdx, xmm7 - movsx rcx, dx - - mov dword ptr [rax], ecx - movd DWORD PTR [rdi], xmm1 - - ; begin epilog - add rsp, 16 - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - ;void vp8_filter_block2d_bil_var_sse2 ;( ; unsigned char *ref_ptr, diff --git a/vp8/common/x86/variance_mmx.c b/vp8/common/x86/variance_mmx.c index 10a58b8..25ae576 100644 --- a/vp8/common/x86/variance_mmx.c +++ b/vp8/common/x86/variance_mmx.c @@ -35,25 +35,6 @@ extern void filter_block1d_v6_mmx short *filter ); -extern unsigned int vp8_get_mb_ss_mmx(const short *src_ptr); -extern unsigned int vp8_get8x8var_mmx -( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *SSE, - int *Sum -); -extern unsigned int vp8_get4x4var_mmx -( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *SSE, - int *Sum -); extern void vp8_filter_block2d_bil4x4_var_mmx ( const unsigned char *ref_ptr, @@ -78,127 +59,6 @@ extern void vp8_filter_block2d_bil_var_mmx unsigned int *sumsquared ); - -unsigned int vp8_variance4x4_mmx( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int var; - int avg; - - vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ; - *sse = var; - return (var - (((unsigned int)avg * avg) >> 4)); - -} - -unsigned int vp8_variance8x8_mmx( - const 
unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int var; - int avg; - - vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ; - *sse = var; - - return (var - (((unsigned int)avg * avg) >> 6)); - -} - -unsigned int vp8_mse16x16_mmx( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int sse0, sse1, sse2, sse3, var; - int sum0, sum1, sum2, sum3; - - - vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; - vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1); - vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2) ; - vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3); - - var = sse0 + sse1 + sse2 + sse3; - *sse = var; - return var; -} - - -unsigned int vp8_variance16x16_mmx( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int sse0, sse1, sse2, sse3, var; - int sum0, sum1, sum2, sum3, avg; - - - vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; - vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1); - vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2) ; - vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3); - - var = sse0 + sse1 + sse2 + sse3; - avg = sum0 + sum1 + sum2 + sum3; - *sse = var; - return (var - (((unsigned int)avg * avg) >> 8)); -} - -unsigned int vp8_variance16x8_mmx( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int sse0, sse1, var; - int sum0, sum1, avg; - - vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; - vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1); - - var = sse0 + sse1; - avg = sum0 + sum1; - *sse = var; - return (var - (((unsigned int)avg * avg) >> 7)); - -} - - -unsigned int vp8_variance8x16_mmx( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int sse0, sse1, var; - int sum0, sum1, avg; - - vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; - vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1) ; - - var = sse0 + sse1; - avg = sum0 + sum1; - *sse = var; - - return (var - (((unsigned int)avg * avg) >> 7)); - -} - - unsigned int vp8_sub_pixel_variance4x4_mmx ( const unsigned char *src_ptr, diff --git a/vp8/common/x86/variance_sse2.c b/vp8/common/x86/variance_sse2.c index 6c6539d..f6dfb27 100644 --- a/vp8/common/x86/variance_sse2.c +++ b/vp8/common/x86/variance_sse2.c @@ -31,38 +31,6 @@ extern void vp8_filter_block2d_bil4x4_var_mmx unsigned int *sumsquared ); -extern unsigned int vp8_get4x4var_mmx -( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *SSE, - int *Sum -); - -unsigned int vp8_get_mb_ss_sse2 -( - const short *src_ptr -); -unsigned int vp8_get16x16var_sse2 -( 
- const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *SSE, - int *Sum -); -unsigned int vp8_get8x8var_sse2 -( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *SSE, - int *Sum -); void vp8_filter_block2d_bil_var_sse2 ( const unsigned char *ref_ptr, @@ -136,115 +104,6 @@ void vp8_half_vert_variance16x_h_sse2 unsigned int *sumsquared ); -unsigned int vp8_variance4x4_wmt( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int var; - int avg; - - vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ; - *sse = var; - return (var - (((unsigned int)avg * avg) >> 4)); - -} - -unsigned int vp8_variance8x8_wmt -( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int var; - int avg; - - vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ; - *sse = var; - return (var - (((unsigned int)avg * avg) >> 6)); - -} - - -unsigned int vp8_variance16x16_wmt -( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int sse0; - int sum0; - - - vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; - *sse = sse0; - return (sse0 - (((unsigned int)sum0 * sum0) >> 8)); -} -unsigned int vp8_mse16x16_wmt( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - - unsigned int sse0; - int sum0; - vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; - *sse = sse0; - return sse0; - -} - - -unsigned int vp8_variance16x8_wmt -( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int sse0, sse1, var; - int sum0, sum1, avg; - - vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; - vp8_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1); - - var = sse0 + sse1; - avg = sum0 + sum1; - *sse = var; - return (var - (((unsigned int)avg * avg) >> 7)); - -} - -unsigned int vp8_variance8x16_wmt -( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - unsigned int sse0, sse1, var; - int sum0, sum1, avg; - - vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; - vp8_get8x8var_sse2(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1) ; - - var = sse0 + sse1; - avg = sum0 + sum1; - *sse = var; - return (var - (((unsigned int)avg * avg) >> 7)); - -} - unsigned int vp8_sub_pixel_variance4x4_wmt ( const unsigned char *src_ptr, diff --git a/vp8/common/x86/variance_ssse3.c b/vp8/common/x86/variance_ssse3.c index d8c8da5..2a0df64 100644 --- a/vp8/common/x86/variance_ssse3.c +++ b/vp8/common/x86/variance_ssse3.c @@ -13,15 +13,6 @@ #include "vp8/common/variance.h" #include "vpx_ports/mem.h" -extern unsigned int vp8_get16x16var_sse2 -( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *SSE, - int *Sum -); extern void vp8_half_horiz_vert_variance16x_h_sse2 ( const unsigned char *ref_ptr, diff --git 
a/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm b/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm deleted file mode 100644 index 000805d..0000000 --- a/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm +++ /dev/null @@ -1,138 +0,0 @@ -; -; Copyright (c) 2011 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_mse16x16_armv6| - - ARM - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *src_ptr -; r1 int source_stride -; r2 unsigned char *ref_ptr -; r3 int recon_stride -; stack unsigned int *sse -; -;note: Based on vp8_variance16x16_armv6. In this function, sum is never used. -; So, we can remove this part of calculation. - -|vp8_mse16x16_armv6| PROC - - push {r4-r9, lr} - - pld [r0, r1, lsl #0] - pld [r2, r3, lsl #0] - - mov r12, #16 ; set loop counter to 16 (=block height) - mov r4, #0 ; initialize sse = 0 - -loop - ; 1st 4 pixels - ldr r5, [r0, #0x0] ; load 4 src pixels - ldr r6, [r2, #0x0] ; load 4 ref pixels - - mov lr, #0 ; constant zero - - usub8 r8, r5, r6 ; calculate difference - pld [r0, r1, lsl #1] - sel r7, r8, lr ; select bytes with positive difference - usub8 r9, r6, r5 ; calculate difference with reversed operands - pld [r2, r3, lsl #1] - sel r8, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r5, r7, lr ; calculate sum of positive differences - usad8 r6, r8, lr ; calculate sum of negative differences - orr r8, r8, r7 ; differences of all 4 pixels - - ldr r5, [r0, #0x4] ; load 4 src pixels - - ; calculate sse - uxtb16 r6, r8 ; byte (two pixels) to halfwords - uxtb16 r7, r8, ror #8 ; another two pixels to halfwords - smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1) - - ; 2nd 4 pixels - ldr r6, [r2, #0x4] ; load 4 ref pixels - smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) - - usub8 r8, r5, r6 ; calculate difference - sel r7, r8, lr ; select bytes with positive difference - usub8 r9, r6, r5 ; calculate difference with reversed operands - sel r8, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r5, r7, lr ; calculate sum of positive differences - usad8 r6, r8, lr ; calculate sum of negative differences - orr r8, r8, r7 ; differences of all 4 pixels - ldr r5, [r0, #0x8] ; load 4 src pixels - ; calculate sse - uxtb16 r6, r8 ; byte (two pixels) to halfwords - uxtb16 r7, r8, ror #8 ; another two pixels to halfwords - smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1) - - ; 3rd 4 pixels - ldr r6, [r2, #0x8] ; load 4 ref pixels - smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) - - usub8 r8, r5, r6 ; calculate difference - sel r7, r8, lr ; select bytes with positive difference - usub8 r9, r6, r5 ; calculate difference with reversed operands - sel r8, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r5, r7, lr ; calculate sum of positive differences - usad8 r6, r8, lr ; calculate sum of negative differences - orr r8, r8, r7 ; differences of all 4 pixels - - ldr r5, [r0, #0xc] ; load 4 src pixels - - ; calculate sse - uxtb16 r6, r8 ; byte (two pixels) to halfwords - uxtb16 r7, r8, ror #8 ; another two pixels to halfwords - smlad r4, r6, r6, r4 ; dual signed multiply, 
add and accumulate (1) - - ; 4th 4 pixels - ldr r6, [r2, #0xc] ; load 4 ref pixels - smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) - - usub8 r8, r5, r6 ; calculate difference - add r0, r0, r1 ; set src_ptr to next row - sel r7, r8, lr ; select bytes with positive difference - usub8 r9, r6, r5 ; calculate difference with reversed operands - add r2, r2, r3 ; set dst_ptr to next row - sel r8, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r5, r7, lr ; calculate sum of positive differences - usad8 r6, r8, lr ; calculate sum of negative differences - orr r8, r8, r7 ; differences of all 4 pixels - - subs r12, r12, #1 ; next row - - ; calculate sse - uxtb16 r6, r8 ; byte (two pixels) to halfwords - uxtb16 r7, r8, ror #8 ; another two pixels to halfwords - smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1) - smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) - - bne loop - - ; return stuff - ldr r1, [sp, #28] ; get address of sse - mov r0, r4 ; return sse - str r4, [r1] ; store sse - - pop {r4-r9, pc} - - ENDP - - END diff --git a/vp8/encoder/arm/neon/vp8_mse16x16_neon.c b/vp8/encoder/arm/neon/vp8_mse16x16_neon.c deleted file mode 100644 index f806809..0000000 --- a/vp8/encoder/arm/neon/vp8_mse16x16_neon.c +++ /dev/null @@ -1,131 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include - -unsigned int vp8_mse16x16_neon( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - int i; - int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; - int64x1_t d0s64; - uint8x16_t q0u8, q1u8, q2u8, q3u8; - int32x4_t q7s32, q8s32, q9s32, q10s32; - uint16x8_t q11u16, q12u16, q13u16, q14u16; - int64x2_t q1s64; - - q7s32 = vdupq_n_s32(0); - q8s32 = vdupq_n_s32(0); - q9s32 = vdupq_n_s32(0); - q10s32 = vdupq_n_s32(0); - - for (i = 0; i < 8; i++) { // mse16x16_neon_loop - q0u8 = vld1q_u8(src_ptr); - src_ptr += source_stride; - q1u8 = vld1q_u8(src_ptr); - src_ptr += source_stride; - q2u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; - q3u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; - - q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8)); - q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8)); - q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8)); - q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8)); - - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); - q7s32 = vmlal_s16(q7s32, d22s16, d22s16); - q8s32 = vmlal_s16(q8s32, d23s16, d23s16); - - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - q9s32 = vmlal_s16(q9s32, d24s16, d24s16); - q10s32 = vmlal_s16(q10s32, d25s16, d25s16); - - d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); - d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); - q7s32 = vmlal_s16(q7s32, d26s16, d26s16); - q8s32 = vmlal_s16(q8s32, d27s16, d27s16); - - d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); - d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); - q9s32 = vmlal_s16(q9s32, d28s16, 
d28s16); - q10s32 = vmlal_s16(q10s32, d29s16, d29s16); - } - - q7s32 = vaddq_s32(q7s32, q8s32); - q9s32 = vaddq_s32(q9s32, q10s32); - q10s32 = vaddq_s32(q7s32, q9s32); - - q1s64 = vpaddlq_s32(q10s32); - d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); - - vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d0s64), 0); - return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0); -} - -unsigned int vp8_get4x4sse_cs_neon( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride) { - int16x4_t d22s16, d24s16, d26s16, d28s16; - int64x1_t d0s64; - uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; - int32x4_t q7s32, q8s32, q9s32, q10s32; - uint16x8_t q11u16, q12u16, q13u16, q14u16; - int64x2_t q1s64; - - d0u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - d4u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - d1u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - d5u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - d2u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - d6u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - d3u8 = vld1_u8(src_ptr); - src_ptr += source_stride; - d7u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; - - q11u16 = vsubl_u8(d0u8, d4u8); - q12u16 = vsubl_u8(d1u8, d5u8); - q13u16 = vsubl_u8(d2u8, d6u8); - q14u16 = vsubl_u8(d3u8, d7u8); - - d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16)); - d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16)); - d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16)); - d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16)); - - q7s32 = vmull_s16(d22s16, d22s16); - q8s32 = vmull_s16(d24s16, d24s16); - q9s32 = vmull_s16(d26s16, d26s16); - q10s32 = vmull_s16(d28s16, d28s16); - - q7s32 = vaddq_s32(q7s32, q8s32); - q9s32 = vaddq_s32(q9s32, q10s32); - q9s32 = vaddq_s32(q7s32, q9s32); - - q1s64 = vpaddlq_s32(q9s32); - d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); - - return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0); -} diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c index 378e902..d381d8d 100644 --- a/vp8/encoder/encodeframe.c +++ b/vp8/encoder/encodeframe.c @@ -11,6 +11,7 @@ #include "vpx_config.h" #include "vp8_rtcd.h" +#include "./vpx_dsp_rtcd.h" #include "encodemb.h" #include "encodemv.h" #include "vp8/common/common.h" @@ -90,7 +91,7 @@ static unsigned int tt_activity_measure( VP8_COMP *cpi, MACROBLOCK *x ) * lambda using a non-linear combination (e.g., the smallest, or second * smallest, etc.). 
*/ - act = vp8_variance16x16(x->src.y_buffer, + act = vpx_variance16x16(x->src.y_buffer, x->src.y_stride, VP8_VAR_OFFS, 0, &sse); act = act<<4; diff --git a/vp8/encoder/encodeintra.c b/vp8/encoder/encodeintra.c index cfa4cb9..e2de5ee 100644 --- a/vp8/encoder/encodeintra.c +++ b/vp8/encoder/encodeintra.c @@ -11,6 +11,7 @@ #include "vpx_config.h" #include "vp8_rtcd.h" +#include "./vpx_dsp_rtcd.h" #include "quantize.h" #include "vp8/common/reconintra4x4.h" #include "encodemb.h" @@ -44,7 +45,7 @@ int vp8_encode_intra(VP8_COMP *cpi, MACROBLOCK *x, int use_dc_pred) } } - intra_pred_var = vp8_get_mb_ss(x->src_diff); + intra_pred_var = vpx_get_mb_ss(x->src_diff); return intra_pred_var; } diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c index a6ff0e7..3deb4ab 100644 --- a/vp8/encoder/firstpass.c +++ b/vp8/encoder/firstpass.c @@ -12,6 +12,7 @@ #include #include +#include "./vpx_dsp_rtcd.h" #include "./vpx_scale_rtcd.h" #include "block.h" #include "onyx_int.h" @@ -422,14 +423,14 @@ static void zz_motion_search( VP8_COMP *cpi, MACROBLOCK * x, /* Set up pointers for this macro block raw buffer */ raw_ptr = (unsigned char *)(raw_buffer->y_buffer + recon_yoffset + d->offset); - vp8_mse16x16 ( src_ptr, src_stride, raw_ptr, raw_stride, - (unsigned int *)(raw_motion_err)); + vpx_mse16x16(src_ptr, src_stride, raw_ptr, raw_stride, + (unsigned int *)(raw_motion_err)); /* Set up pointers for this macro block recon buffer */ xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset; ref_ptr = (unsigned char *)(xd->pre.y_buffer + d->offset ); - vp8_mse16x16 ( src_ptr, src_stride, ref_ptr, ref_stride, - (unsigned int *)(best_motion_err)); + vpx_mse16x16(src_ptr, src_stride, ref_ptr, ref_stride, + (unsigned int *)(best_motion_err)); } static void first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x, @@ -453,7 +454,7 @@ static void first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x, int new_mv_mode_penalty = 256; /* override the default variance function to use MSE */ - v_fn_ptr.vf = vp8_mse16x16; + v_fn_ptr.vf = vpx_mse16x16; /* Set up pointers for this macro block recon buffer */ xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset; diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index c2bb232..40e29e1 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -2131,7 +2131,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf) #endif cpi->fn_ptr[BLOCK_16X16].sdf = vpx_sad16x16; - cpi->fn_ptr[BLOCK_16X16].vf = vp8_variance16x16; + cpi->fn_ptr[BLOCK_16X16].vf = vpx_variance16x16; cpi->fn_ptr[BLOCK_16X16].svf = vp8_sub_pixel_variance16x16; cpi->fn_ptr[BLOCK_16X16].svf_halfpix_h = vp8_variance_halfpixvar16x16_h; cpi->fn_ptr[BLOCK_16X16].svf_halfpix_v = vp8_variance_halfpixvar16x16_v; @@ -2141,7 +2141,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf) cpi->fn_ptr[BLOCK_16X16].sdx4df = vpx_sad16x16x4d; cpi->fn_ptr[BLOCK_16X8].sdf = vpx_sad16x8; - cpi->fn_ptr[BLOCK_16X8].vf = vp8_variance16x8; + cpi->fn_ptr[BLOCK_16X8].vf = vpx_variance16x8; cpi->fn_ptr[BLOCK_16X8].svf = vp8_sub_pixel_variance16x8; cpi->fn_ptr[BLOCK_16X8].svf_halfpix_h = NULL; cpi->fn_ptr[BLOCK_16X8].svf_halfpix_v = NULL; @@ -2151,7 +2151,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf) cpi->fn_ptr[BLOCK_16X8].sdx4df = vpx_sad16x8x4d; cpi->fn_ptr[BLOCK_8X16].sdf = vpx_sad8x16; - cpi->fn_ptr[BLOCK_8X16].vf = vp8_variance8x16; + cpi->fn_ptr[BLOCK_8X16].vf = vpx_variance8x16; cpi->fn_ptr[BLOCK_8X16].svf = vp8_sub_pixel_variance8x16; cpi->fn_ptr[BLOCK_8X16].svf_halfpix_h = NULL; 
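Note on the arithmetic shared by all of these kernels: the MMX/SSE2 implementations deleted earlier in this patch, the armv6/NEON versions removed above, and the vpx_dsp replacements being wired into cpi->fn_ptr[] here all compute the same quantity, variance = SSE - sum^2 / (w*h), with the division done as a shift by log2(w*h) (the ">> 6", ">> 7" and ">> 8" in the deleted code correspond to 8x8, 16x8/8x16 and 16x16 blocks), while the mse16x16 variants simply return the raw SSE. A minimal scalar sketch of that identity, for orientation only; it is not the vpx_dsp implementation:

    #include <stdint.h>

    /* Reference-style sketch: variance of a w x h block as SSE minus the
     * squared sum normalised by the block area.  Matches the
     * "var - (((unsigned int)avg * avg) >> log2(w*h))" expressions in the
     * vp8 code removed by this patch. */
    static unsigned int variance_sketch_c(const uint8_t *src, int src_stride,
                                          const uint8_t *ref, int ref_stride,
                                          int w, int h, unsigned int *sse) {
      int sum = 0;
      unsigned int sse_acc = 0;
      int r, c;
      for (r = 0; r < h; ++r) {
        for (c = 0; c < w; ++c) {
          const int diff = src[c] - ref[c];
          sum += diff;
          sse_acc += (unsigned int)(diff * diff);
        }
        src += src_stride;
        ref += ref_stride;
      }
      *sse = sse_acc;
      return sse_acc - (unsigned int)(((int64_t)sum * sum) / (w * h));
    }

For a 16x16 block the final line is exactly the "(var - (((unsigned int)avg * avg) >> 8))" expression in the deleted vp8 MMX code, and dropping the subtraction gives the mse16x16 behaviour.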
cpi->fn_ptr[BLOCK_8X16].svf_halfpix_v = NULL; @@ -2161,7 +2161,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf) cpi->fn_ptr[BLOCK_8X16].sdx4df = vpx_sad8x16x4d; cpi->fn_ptr[BLOCK_8X8].sdf = vpx_sad8x8; - cpi->fn_ptr[BLOCK_8X8].vf = vp8_variance8x8; + cpi->fn_ptr[BLOCK_8X8].vf = vpx_variance8x8; cpi->fn_ptr[BLOCK_8X8].svf = vp8_sub_pixel_variance8x8; cpi->fn_ptr[BLOCK_8X8].svf_halfpix_h = NULL; cpi->fn_ptr[BLOCK_8X8].svf_halfpix_v = NULL; @@ -2171,7 +2171,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf) cpi->fn_ptr[BLOCK_8X8].sdx4df = vpx_sad8x8x4d; cpi->fn_ptr[BLOCK_4X4].sdf = vpx_sad4x4; - cpi->fn_ptr[BLOCK_4X4].vf = vp8_variance4x4; + cpi->fn_ptr[BLOCK_4X4].vf = vpx_variance4x4; cpi->fn_ptr[BLOCK_4X4].svf = vp8_sub_pixel_variance4x4; cpi->fn_ptr[BLOCK_4X4].svf_halfpix_h = NULL; cpi->fn_ptr[BLOCK_4X4].svf_halfpix_v = NULL; @@ -2558,7 +2558,7 @@ static uint64_t calc_plane_error(unsigned char *orig, int orig_stride, { unsigned int sse; - vp8_mse16x16(orig + col, orig_stride, + vpx_mse16x16(orig + col, orig_stride, recon + col, recon_stride, &sse); total_sse += sse; @@ -3384,7 +3384,7 @@ static int measure_square_diff_partial(YV12_BUFFER_CONFIG *source, int index = block_index_row + (j >> 4); if (cpi->consec_zero_last[index] >= min_consec_zero_last) { unsigned int sse; - Total += vp8_mse16x16(src + j, + Total += vpx_mse16x16(src + j, source->y_stride, dst + j, dest->y_stride, &sse); @@ -3448,7 +3448,7 @@ static void process_denoiser_mode_change(VP8_COMP *cpi) { int index = block_index_row + (j >> 4); if (cpi->consec_zero_last[index] >= min_consec_zero_last) { unsigned int sse; - const unsigned int var = vp8_variance16x16(src + j, + const unsigned int var = vpx_variance16x16(src + j, ystride, dst + j, ystride, @@ -3458,7 +3458,7 @@ static void process_denoiser_mode_change(VP8_COMP *cpi) { // is small (to avoid effects from lighting change). 
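A brief aside on the "(sse - var) < 128" test that follows: by the identity sketched above, sse - var equals sum^2 / N with N = 256 for a 16x16 block, i.e. 256 times the squared mean difference between the two blocks being compared. The threshold of 128 is simply the value the existing code uses; numerically it admits a mean shift of roughly sqrt(128/256), about 0.7 of one intensity step, so the blocks may differ in texture but not noticeably in average brightness. In comment form, as it would read next to the check (illustrative only):

    /* sse - var == sum*sum / 256 for a 16x16 block, i.e. 256 * mean_diff^2.
     * (sse - var) < 128  =>  |mean_diff| < sqrt(128/256) ~= 0.7, so the
     * average brightness of the two blocks is essentially unchanged and the
     * difference is treated as noise/texture rather than a lighting change. */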
if ((sse - var) < 128) { unsigned int sse2; - const unsigned int act = vp8_variance16x16(src + j, + const unsigned int act = vpx_variance16x16(src + j, ystride, const_source, 0, @@ -5993,7 +5993,8 @@ int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest) for (j = 0; j < source->y_width; j += 16) { unsigned int sse; - Total += vp8_mse16x16(src + j, source->y_stride, dst + j, dest->y_stride, &sse); + Total += vpx_mse16x16(src + j, source->y_stride, + dst + j, dest->y_stride, &sse); } src += 16 * source->y_stride; diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c index 98ea5a0..053bf11 100644 --- a/vp8/encoder/pickinter.c +++ b/vp8/encoder/pickinter.c @@ -11,6 +11,7 @@ #include #include "vpx_config.h" +#include "./vpx_dsp_rtcd.h" #include "onyx_int.h" #include "modecosts.h" #include "encodeintra.h" @@ -215,33 +216,6 @@ int vp8_get_inter_mbpred_error(MACROBLOCK *mb, } - -unsigned int vp8_get4x4sse_cs_c -( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride -) -{ - int distortion = 0; - int r, c; - - for (r = 0; r < 4; r++) - { - for (c = 0; c < 4; c++) - { - int diff = src_ptr[c] - ref_ptr[c]; - distortion += diff * diff; - } - - src_ptr += source_stride; - ref_ptr += recon_stride; - } - - return distortion; -} - static int get_prediction_error(BLOCK *be, BLOCKD *b) { unsigned char *sptr; @@ -249,7 +223,7 @@ static int get_prediction_error(BLOCK *be, BLOCKD *b) sptr = (*(be->base_src) + be->src); dptr = b->predictor; - return vp8_get4x4sse_cs(sptr, be->src_stride, dptr, 16); + return vpx_get4x4sse_cs(sptr, be->src_stride, dptr, 16); } @@ -1037,7 +1011,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, else { rate2 += rate; - distortion2 = vp8_variance16x16( + distortion2 = vpx_variance16x16( *(b->base_src), b->src_stride, x->e_mbd.predictor, 16, &sse); this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); @@ -1066,7 +1040,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, xd->dst.y_stride, xd->predictor, 16); - distortion2 = vp8_variance16x16 + distortion2 = vpx_variance16x16 (*(b->base_src), b->src_stride, x->e_mbd.predictor, 16, &sse); rate2 += x->mbmode_cost[x->e_mbd.frame_type][x->e_mbd.mode_info_context->mbmi.mode]; @@ -1547,7 +1521,7 @@ void vp8_pick_intra_mode(MACROBLOCK *x, int *rate_) xd->dst.y_stride, xd->predictor, 16); - distortion = vp8_variance16x16 + distortion = vpx_variance16x16 (*(b->base_src), b->src_stride, xd->predictor, 16, &sse); rate = x->mbmode_cost[xd->frame_type][mode]; this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion); diff --git a/vp8/encoder/picklpf.c b/vp8/encoder/picklpf.c index 890053d..875b37f 100644 --- a/vp8/encoder/picklpf.c +++ b/vp8/encoder/picklpf.c @@ -9,6 +9,7 @@ */ +#include "./vpx_dsp_rtcd.h" #include "./vpx_scale_rtcd.h" #include "vp8/common/onyxc_int.h" #include "onyx_int.h" @@ -83,7 +84,7 @@ static int calc_partial_ssl_err(YV12_BUFFER_CONFIG *source, for (j = 0; j < source->y_width; j += 16) { unsigned int sse; - Total += vp8_mse16x16(src + j, source->y_stride, + Total += vpx_mse16x16(src + j, source->y_stride, dst + j, dest->y_stride, &sse); } diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c index 9ccd85e..17194f0 100644 --- a/vp8/encoder/rdopt.c +++ b/vp8/encoder/rdopt.c @@ -15,6 +15,7 @@ #include #include "vpx_config.h" #include "vp8_rtcd.h" +#include "./vpx_dsp_rtcd.h" #include "tokenize.h" #include "treewriter.h" #include "onyx_int.h" @@ -507,9 +508,9 @@ int VP8_UVSSE(MACROBLOCK *x) } 
else { - vp8_variance8x8(uptr, pre_stride, + vpx_variance8x8(uptr, pre_stride, upred_ptr, uv_stride, &sse2); - vp8_variance8x8(vptr, pre_stride, + vpx_variance8x8(vptr, pre_stride, vpred_ptr, uv_stride, &sse1); sse2 += sse1; } @@ -1783,7 +1784,7 @@ static int evaluate_inter_mode_rd(int mdcounts[4], if(threshold < x->encode_breakout) threshold = x->encode_breakout; - var = vp8_variance16x16 + var = vpx_variance16x16 (*(b->base_src), b->src_stride, x->e_mbd.predictor, 16, &sse); diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk index b4c8140..c71d592 100644 --- a/vp8/vp8_common.mk +++ b/vp8/vp8_common.mk @@ -145,8 +145,6 @@ VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/intra4x4_predict_v6$(ASM) VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dequant_idct_v6$(ASM) VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dequantize_v6$(ASM) VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/idct_blk_v6.c -VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance8x8_armv6$(ASM) -VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance16x16_armv6$(ASM) VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6$(ASM) VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6$(ASM) VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6$(ASM) @@ -168,7 +166,6 @@ VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/mbloopfilter_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/reconintra_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/shortidct4x4llm_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict_neon.c -VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/variance_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance_neon.c $(eval $(call rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.pl)) diff --git a/vp8/vp8cx_arm.mk b/vp8/vp8cx_arm.mk index 0500301..0b0f6a7 100644 --- a/vp8/vp8cx_arm.mk +++ b/vp8/vp8cx_arm.mk @@ -18,7 +18,6 @@ VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/dct_arm.c #File list for media # encoder VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/vp8_short_fdct4x4_armv6$(ASM) -VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/vp8_mse16x16_armv6$(ASM) VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/walsh_v6$(ASM) #File list for neon @@ -27,5 +26,4 @@ VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/denoising_neon.c VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/fastquantizeb_neon.c VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/shortfdct_neon.c VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/subtract_neon.c -VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp8_mse16x16_neon.c VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp8_shortwalsh4x4_neon.c diff --git a/vp9/common/vp9_mfqe.c b/vp9/common/vp9_mfqe.c index 57189df..bebb37e 100644 --- a/vp9/common/vp9_mfqe.c +++ b/vp9/common/vp9_mfqe.c @@ -171,13 +171,13 @@ static void mfqe_block(BLOCK_SIZE bs, const uint8_t *y, const uint8_t *u, get_thr(bs, qdiff, &sad_thr, &vdiff_thr); if (bs == BLOCK_16X16) { - vdiff = (vp9_variance16x16(y, y_stride, yd, yd_stride, &sse) + 128) >> 8; + vdiff = (vpx_variance16x16(y, y_stride, yd, yd_stride, &sse) + 128) >> 8; sad = (vpx_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8; } else if (bs == BLOCK_32X32) { - vdiff = (vp9_variance32x32(y, y_stride, yd, yd_stride, &sse) + 512) >> 10; + vdiff = (vpx_variance32x32(y, y_stride, yd, yd_stride, &sse) + 512) >> 10; sad = (vpx_sad32x32(y, y_stride, yd, yd_stride) + 512) >> 10; } else /* if (bs == BLOCK_64X64) */ { - vdiff = (vp9_variance64x64(y, 
y_stride, yd, yd_stride, &sse) + 2048) >> 12; + vdiff = (vpx_variance64x64(y, y_stride, yd, yd_stride, &sse) + 2048) >> 12; sad = (vpx_sad64x64(y, y_stride, yd, yd_stride) + 2048) >> 12; } diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 2a9736b..30710ba 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -797,51 +797,6 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { # variance -add_proto qw/unsigned int vp9_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance32x16 avx2/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance16x32/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance64x32 avx2 neon/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance32x64 neon/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance32x32 avx2 neon/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance64x64 avx2 neon/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance16x16 avx2 neon/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance16x8/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance8x16/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance8x8 neon/, "$sse2_x86inc"; - -add_proto qw/void vp9_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; -specialize qw/vp9_get8x8var neon/, "$sse2_x86inc"; - -add_proto qw/void vp9_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; -specialize qw/vp9_get16x16var avx2 neon/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance8x4/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance4x8/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance4x4/, "$sse2_x86inc"; - add_proto 
qw/unsigned int vp9_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vp9_sub_pixel_variance64x64 avx2 neon/, "$sse2_x86inc", "$ssse3_x86inc"; @@ -922,21 +877,6 @@ specialize qw/vp9_sub_pixel_variance4x4/, "$sse_x86inc", "$ssse3_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; specialize qw/vp9_sub_pixel_avg_variance4x4/, "$sse_x86inc", "$ssse3_x86inc"; -add_proto qw/unsigned int vp9_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; -specialize qw/vp9_mse16x16 avx2/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; -specialize qw/vp9_mse8x16/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; -specialize qw/vp9_mse16x8/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; -specialize qw/vp9_mse8x8/, "$sse2_x86inc"; - -add_proto qw/unsigned int vp9_get_mb_ss/, "const int16_t *"; -specialize qw/vp9_get_mb_ss/, "$sse2_x86inc"; - add_proto qw/unsigned int vp9_avg_8x8/, "const uint8_t *, int p"; specialize qw/vp9_avg_8x8 sse2 neon/; @@ -1141,142 +1081,6 @@ specialize qw/vp9_temporal_filter_apply sse2/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { - # variance - add_proto qw/unsigned int vp9_highbd_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_variance32x16/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_variance16x32/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_variance64x32/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_variance32x64/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_variance32x32/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_variance64x64/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_variance16x16/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_variance16x8/, "$sse2_x86inc"; - - 
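For readers unfamiliar with these Perl files: each add_proto line registers a C prototype with libvpx's run-time CPU detection (rtcd) system, and the matching specialize line lists the SIMD flavours that may implement it (the quoted "$sse2_x86inc" effectively selects the SSE2 assembly flavour when the x86inc path is usable). Deleting the variance, get*var and mse entries here and from the high-bit-depth block below hands those symbols over to vpx_dsp's own rtcd table, while the sub-pixel entries are left in place in this file. Roughly, and only as an illustration rather than the generated header verbatim, one such entry expands into per-flavour prototypes plus a dispatch symbol that the rtcd setup code points at the best variant the CPU supports:

    /* Illustrative sketch of the rtcd output for
     *   add_proto qw/unsigned int vp9_variance16x16/, "...";
     *   specialize qw/vp9_variance16x16 avx2 neon/, "$sse2_x86inc";
     * The real generated header also covers the single-flavour case with a
     * plain #define instead of a function pointer. */
    unsigned int vp9_variance16x16_c(const uint8_t *src_ptr, int source_stride,
                                     const uint8_t *ref_ptr, int ref_stride,
                                     unsigned int *sse);
    unsigned int vp9_variance16x16_sse2(const uint8_t *src_ptr, int source_stride,
                                        const uint8_t *ref_ptr, int ref_stride,
                                        unsigned int *sse);
    unsigned int vp9_variance16x16_avx2(const uint8_t *src_ptr, int source_stride,
                                        const uint8_t *ref_ptr, int ref_stride,
                                        unsigned int *sse);
    RTCD_EXTERN unsigned int (*vp9_variance16x16)(const uint8_t *src_ptr,
                                                  int source_stride,
                                                  const uint8_t *ref_ptr,
                                                  int ref_stride,
                                                  unsigned int *sse);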
add_proto qw/unsigned int vp9_highbd_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_variance8x16/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_variance8x8/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_variance8x4/; - - add_proto qw/unsigned int vp9_highbd_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_variance4x8/; - - add_proto qw/unsigned int vp9_highbd_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_variance4x4/; - - add_proto qw/void vp9_highbd_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vp9_highbd_get8x8var/, "$sse2_x86inc"; - - add_proto qw/void vp9_highbd_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vp9_highbd_get16x16var/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_variance32x16/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_variance16x32/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_variance64x32/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_variance32x64/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_variance32x32/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_variance64x64/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_variance16x16/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_variance16x8/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_variance8x16/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_variance8x8/, 
"const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_variance8x8/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_variance8x4/; - - add_proto qw/unsigned int vp9_highbd_10_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_variance4x8/; - - add_proto qw/unsigned int vp9_highbd_10_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_variance4x4/; - - add_proto qw/void vp9_highbd_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vp9_highbd_10_get8x8var/, "$sse2_x86inc"; - - add_proto qw/void vp9_highbd_10_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vp9_highbd_10_get16x16var/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_variance32x16/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_variance16x32/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_variance64x32/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_variance32x64/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_variance32x32/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_variance64x64/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_variance16x16/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_variance16x8/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_variance8x16/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_variance8x8/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_variance8x4/, "const uint8_t 
*src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_variance8x4/; - - add_proto qw/unsigned int vp9_highbd_12_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_variance4x8/; - - add_proto qw/unsigned int vp9_highbd_12_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_variance4x4/; - - add_proto qw/void vp9_highbd_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vp9_highbd_12_get8x8var/, "$sse2_x86inc"; - - add_proto qw/void vp9_highbd_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vp9_highbd_12_get16x16var/, "$sse2_x86inc"; - add_proto qw/unsigned int vp9_highbd_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vp9_highbd_sub_pixel_variance64x64/, "$sse2_x86inc"; @@ -1511,41 +1315,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; specialize qw/vp9_highbd_12_sub_pixel_avg_variance4x4/; - add_proto qw/unsigned int vp9_highbd_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vp9_highbd_mse16x16/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vp9_highbd_mse8x16/; - - add_proto qw/unsigned int vp9_highbd_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vp9_highbd_mse16x8/; - - add_proto qw/unsigned int vp9_highbd_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vp9_highbd_mse8x8/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_mse16x16/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_mse8x16/; - - add_proto qw/unsigned int vp9_highbd_10_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_mse16x8/; - - add_proto qw/unsigned int vp9_highbd_10_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_mse8x8/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_mse16x16/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_mse8x16/, "const uint8_t *src_ptr, int 
source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_mse8x16/; - - add_proto qw/unsigned int vp9_highbd_12_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_mse16x8/; - - add_proto qw/unsigned int vp9_highbd_12_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_mse8x8/, "$sse2_x86inc"; # ENCODEMB INVOKE diff --git a/vp9/encoder/arm/neon/vp9_variance_neon.c b/vp9/encoder/arm/neon/vp9_variance_neon.c index cf82dd7..166156a 100644 --- a/vp9/encoder/arm/neon/vp9_variance_neon.c +++ b/vp9/encoder/arm/neon/vp9_variance_neon.c @@ -10,6 +10,7 @@ #include #include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" #include "./vpx_config.h" #include "vpx_ports/mem.h" @@ -20,82 +21,6 @@ #include "vp9/encoder/vp9_variance.h" -static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) { - const int32x4_t a = vpaddlq_s16(v_16x8); - const int64x2_t b = vpaddlq_s32(a); - const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), - vreinterpret_s32_s64(vget_high_s64(b))); - return vget_lane_s32(c, 0); -} - -static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) { - const int64x2_t b = vpaddlq_s32(v_32x4); - const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), - vreinterpret_s32_s64(vget_high_s64(b))); - return vget_lane_s32(c, 0); -} - -// w * h must be less than 2048 or local variable v_sum may overflow. -static void variance_neon_w8(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - int w, int h, uint32_t *sse, int *sum) { - int i, j; - int16x8_t v_sum = vdupq_n_s16(0); - int32x4_t v_sse_lo = vdupq_n_s32(0); - int32x4_t v_sse_hi = vdupq_n_s32(0); - - for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 8) { - const uint8x8_t v_a = vld1_u8(&a[j]); - const uint8x8_t v_b = vld1_u8(&b[j]); - const uint16x8_t v_diff = vsubl_u8(v_a, v_b); - const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff); - v_sum = vaddq_s16(v_sum, sv_diff); - v_sse_lo = vmlal_s16(v_sse_lo, - vget_low_s16(sv_diff), - vget_low_s16(sv_diff)); - v_sse_hi = vmlal_s16(v_sse_hi, - vget_high_s16(sv_diff), - vget_high_s16(sv_diff)); - } - a += a_stride; - b += b_stride; - } - - *sum = horizontal_add_s16x8(v_sum); - *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi)); -} - -void vp9_get8x8var_neon(const uint8_t *src_ptr, int source_stride, - const uint8_t *ref_ptr, int ref_stride, - unsigned int *sse, int *sum) { - variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, 8, - 8, sse, sum); -} - -unsigned int vp9_variance8x8_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - unsigned int *sse) { - int sum; - variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum); - return *sse - (((int64_t)sum * sum) >> 6); // >> 6 = / 8 * 8 -} - -void vp9_get16x16var_neon(const uint8_t *src_ptr, int source_stride, - const uint8_t *ref_ptr, int ref_stride, - unsigned int *sse, int *sum) { - variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, 16, - 16, sse, sum); -} - -unsigned int vp9_variance16x16_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - unsigned int *sse) { - int sum; - variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, &sum); - return *sse - (((int64_t)sum * sum) >> 8); // >> 8 = / 16 * 16 -} - static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, uint8_t *output_ptr, 
unsigned int src_pixels_per_line, @@ -162,7 +87,7 @@ unsigned int vp9_sub_pixel_variance8x8_neon(const uint8_t *src, BILINEAR_FILTERS_2TAP(xoffset)); var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8, 8, BILINEAR_FILTERS_2TAP(yoffset)); - return vp9_variance8x8_neon(temp2, 8, dst, dst_stride, sse); + return vpx_variance8x8_neon(temp2, 8, dst, dst_stride, sse); } unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src, @@ -180,77 +105,7 @@ unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src, BILINEAR_FILTERS_2TAP(xoffset)); var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16, 16, BILINEAR_FILTERS_2TAP(yoffset)); - return vp9_variance16x16_neon(temp2, 16, dst, dst_stride, sse); -} - -void vp9_get32x32var_neon(const uint8_t *src_ptr, int source_stride, - const uint8_t *ref_ptr, int ref_stride, - unsigned int *sse, int *sum) { - variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, 32, - 32, sse, sum); -} - -unsigned int vp9_variance32x32_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - unsigned int *sse) { - int sum; - variance_neon_w8(a, a_stride, b, b_stride, 32, 32, sse, &sum); - return *sse - (((int64_t)sum * sum) >> 10); // >> 10 = / 32 * 32 -} - -unsigned int vp9_variance32x64_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - unsigned int *sse) { - int sum1, sum2; - uint32_t sse1, sse2; - variance_neon_w8(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1); - variance_neon_w8(a + (32 * a_stride), a_stride, - b + (32 * b_stride), b_stride, 32, 32, - &sse2, &sum2); - *sse = sse1 + sse2; - sum1 += sum2; - return *sse - (((int64_t)sum1 * sum1) >> 11); // >> 11 = / 32 * 64 -} - -unsigned int vp9_variance64x32_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - unsigned int *sse) { - int sum1, sum2; - uint32_t sse1, sse2; - variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1); - variance_neon_w8(a + (16 * a_stride), a_stride, - b + (16 * b_stride), b_stride, 64, 16, - &sse2, &sum2); - *sse = sse1 + sse2; - sum1 += sum2; - return *sse - (((int64_t)sum1 * sum1) >> 11); // >> 11 = / 32 * 64 -} - -unsigned int vp9_variance64x64_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - unsigned int *sse) { - int sum1, sum2; - uint32_t sse1, sse2; - - variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1); - variance_neon_w8(a + (16 * a_stride), a_stride, - b + (16 * b_stride), b_stride, 64, 16, - &sse2, &sum2); - sse1 += sse2; - sum1 += sum2; - - variance_neon_w8(a + (16 * 2 * a_stride), a_stride, - b + (16 * 2 * b_stride), b_stride, - 64, 16, &sse2, &sum2); - sse1 += sse2; - sum1 += sum2; - - variance_neon_w8(a + (16 * 3 * a_stride), a_stride, - b + (16 * 3 * b_stride), b_stride, - 64, 16, &sse2, &sum2); - *sse = sse1 + sse2; - sum1 += sum2; - return *sse - (((int64_t)sum1 * sum1) >> 12); // >> 12 = / 64 * 64 + return vpx_variance16x16_neon(temp2, 16, dst, dst_stride, sse); } unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src, @@ -268,7 +123,7 @@ unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src, BILINEAR_FILTERS_2TAP(xoffset)); var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32, 32, BILINEAR_FILTERS_2TAP(yoffset)); - return vp9_variance32x32_neon(temp2, 32, dst, dst_stride, sse); + return vpx_variance32x32_neon(temp2, 32, dst, dst_stride, sse); } unsigned int vp9_sub_pixel_variance64x64_neon(const uint8_t *src, @@ -286,5 +141,5 @@ unsigned int vp9_sub_pixel_variance64x64_neon(const uint8_t *src, BILINEAR_FILTERS_2TAP(xoffset)); 
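The sub-pixel wrappers kept in this file, including the 64x64 case whose body continues below, all have the same two-stage shape: run a 2-tap bilinear filter selected by the 1/16-pel x offset across the source, run the kernel for the y offset down the intermediate buffer, then hand the filtered block to the plain vpx_varianceNxN_neon call that this patch now points at vpx_dsp. A rough scalar sketch of that composition; bilinear_2tap() is a hypothetical stand-in for var_filter_block2d_bil_w8/w16, the "+ 8 >> 4" rounding is one common 1/16-pel formulation rather than the exact BILINEAR_FILTERS_2TAP scaling, and variance_sketch_c() is the reference sketch shown earlier in this patch:

    #include <stdint.h>

    /* Hypothetical 2-tap bilinear pass: pixel_step is 1 for the horizontal
     * pass and the intermediate row stride for the vertical pass.  Assumes
     * the source block has one extra row/column of valid pixels, as the
     * real filter input does. */
    static void bilinear_2tap(const uint8_t *src, int src_stride, uint8_t *dst,
                              int w, int h, int pixel_step, int offset) {
      int r, c;
      for (r = 0; r < h; ++r) {
        for (c = 0; c < w; ++c) {
          const int p0 = src[c];
          const int p1 = src[c + pixel_step];
          dst[c] = (uint8_t)(((16 - offset) * p0 + offset * p1 + 8) >> 4);
        }
        src += src_stride;
        dst += w;
      }
    }

    /* Filter horizontally, then vertically, then measure plain variance --
     * the same structure as the vp9_sub_pixel_varianceNxN_neon wrappers. */
    static unsigned int subpel_variance_sketch(const uint8_t *src, int src_stride,
                                               int xoffset, int yoffset,
                                               const uint8_t *dst, int dst_stride,
                                               int w, int h, unsigned int *sse) {
      uint8_t tmp1[64 * 65];  /* (h + 1) filtered rows, up to 64x64 blocks */
      uint8_t tmp2[64 * 64];
      bilinear_2tap(src, src_stride, tmp1, w, h + 1, 1, xoffset);
      bilinear_2tap(tmp1, w, tmp2, w, h, w, yoffset);
      return variance_sketch_c(tmp2, w, dst, dst_stride, w, h, sse);
    }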
var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64, 64, BILINEAR_FILTERS_2TAP(yoffset)); - return vp9_variance64x64_neon(temp2, 64, dst, dst_stride, sse); + return vpx_variance64x64_neon(temp2, 64, dst, dst_stride, sse); } diff --git a/vp9/encoder/vp9_aq_variance.c b/vp9/encoder/vp9_aq_variance.c index 55c9649..9e5d9ee 100644 --- a/vp9/encoder/vp9_aq_variance.c +++ b/vp9/encoder/vp9_aq_variance.c @@ -98,9 +98,9 @@ static unsigned int block_variance(VP9_COMP *cpi, MACROBLOCK *x, int avg; #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - highbd_variance(x->plane[0].src.buf, x->plane[0].src.stride, - CONVERT_TO_BYTEPTR(vp9_highbd_64_zeros), 0, bw, bh, - &sse, &avg); + highbd_8_variance(x->plane[0].src.buf, x->plane[0].src.stride, + CONVERT_TO_BYTEPTR(vp9_highbd_64_zeros), 0, bw, bh, + &sse, &avg); sse >>= 2 * (xd->bd - 8); avg >>= (xd->bd - 8); } else { diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 3d310f9..812adb0 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -13,6 +13,7 @@ #include #include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" #include "./vpx_config.h" #include "vpx_ports/mem.h" @@ -3672,15 +3673,15 @@ static int set_var_thresh_from_histogram(VP9_COMP *cpi) { if (cm->use_highbitdepth) { switch (cm->bit_depth) { case VPX_BITS_8: - vp9_highbd_get16x16var(src, src_stride, last_src, last_stride, + vpx_highbd_8_get16x16var(src, src_stride, last_src, last_stride, &var16->sse, &var16->sum); break; case VPX_BITS_10: - vp9_highbd_10_get16x16var(src, src_stride, last_src, last_stride, + vpx_highbd_10_get16x16var(src, src_stride, last_src, last_stride, &var16->sse, &var16->sum); break; case VPX_BITS_12: - vp9_highbd_12_get16x16var(src, src_stride, last_src, last_stride, + vpx_highbd_12_get16x16var(src, src_stride, last_src, last_stride, &var16->sse, &var16->sum); break; default: @@ -3689,11 +3690,11 @@ static int set_var_thresh_from_histogram(VP9_COMP *cpi) { return -1; } } else { - vp9_get16x16var(src, src_stride, last_src, last_stride, + vpx_get16x16var(src, src_stride, last_src, last_stride, &var16->sse, &var16->sum); } #else - vp9_get16x16var(src, src_stride, last_src, last_stride, + vpx_get16x16var(src, src_stride, last_src, last_stride, &var16->sse, &var16->sum); #endif // CONFIG_VP9_HIGHBITDEPTH var16->var = var16->sse - diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index e89ee5e..f0e3fb4 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -998,7 +998,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits8, vpx_highbd_sad32x16_avg_bits8, - vp9_highbd_variance32x16, + vpx_highbd_8_variance32x16, vp9_highbd_sub_pixel_variance32x16, vp9_highbd_sub_pixel_avg_variance32x16, NULL, @@ -1008,7 +1008,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_16X32, vpx_highbd_sad16x32_bits8, vpx_highbd_sad16x32_avg_bits8, - vp9_highbd_variance16x32, + vpx_highbd_8_variance16x32, vp9_highbd_sub_pixel_variance16x32, vp9_highbd_sub_pixel_avg_variance16x32, NULL, @@ -1018,7 +1018,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_64X32, vpx_highbd_sad64x32_bits8, vpx_highbd_sad64x32_avg_bits8, - vp9_highbd_variance64x32, + vpx_highbd_8_variance64x32, vp9_highbd_sub_pixel_variance64x32, vp9_highbd_sub_pixel_avg_variance64x32, NULL, @@ -1028,7 +1028,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_32X64, vpx_highbd_sad32x64_bits8, 
vpx_highbd_sad32x64_avg_bits8, - vp9_highbd_variance32x64, + vpx_highbd_8_variance32x64, vp9_highbd_sub_pixel_variance32x64, vp9_highbd_sub_pixel_avg_variance32x64, NULL, @@ -1038,7 +1038,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_32X32, vpx_highbd_sad32x32_bits8, vpx_highbd_sad32x32_avg_bits8, - vp9_highbd_variance32x32, + vpx_highbd_8_variance32x32, vp9_highbd_sub_pixel_variance32x32, vp9_highbd_sub_pixel_avg_variance32x32, vpx_highbd_sad32x32x3_bits8, @@ -1048,7 +1048,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_64X64, vpx_highbd_sad64x64_bits8, vpx_highbd_sad64x64_avg_bits8, - vp9_highbd_variance64x64, + vpx_highbd_8_variance64x64, vp9_highbd_sub_pixel_variance64x64, vp9_highbd_sub_pixel_avg_variance64x64, vpx_highbd_sad64x64x3_bits8, @@ -1058,7 +1058,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_16X16, vpx_highbd_sad16x16_bits8, vpx_highbd_sad16x16_avg_bits8, - vp9_highbd_variance16x16, + vpx_highbd_8_variance16x16, vp9_highbd_sub_pixel_variance16x16, vp9_highbd_sub_pixel_avg_variance16x16, vpx_highbd_sad16x16x3_bits8, @@ -1068,7 +1068,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_16X8, vpx_highbd_sad16x8_bits8, vpx_highbd_sad16x8_avg_bits8, - vp9_highbd_variance16x8, + vpx_highbd_8_variance16x8, vp9_highbd_sub_pixel_variance16x8, vp9_highbd_sub_pixel_avg_variance16x8, vpx_highbd_sad16x8x3_bits8, @@ -1078,7 +1078,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_8X16, vpx_highbd_sad8x16_bits8, vpx_highbd_sad8x16_avg_bits8, - vp9_highbd_variance8x16, + vpx_highbd_8_variance8x16, vp9_highbd_sub_pixel_variance8x16, vp9_highbd_sub_pixel_avg_variance8x16, vpx_highbd_sad8x16x3_bits8, @@ -1088,7 +1088,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_8X8, vpx_highbd_sad8x8_bits8, vpx_highbd_sad8x8_avg_bits8, - vp9_highbd_variance8x8, + vpx_highbd_8_variance8x8, vp9_highbd_sub_pixel_variance8x8, vp9_highbd_sub_pixel_avg_variance8x8, vpx_highbd_sad8x8x3_bits8, @@ -1098,7 +1098,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_8X4, vpx_highbd_sad8x4_bits8, vpx_highbd_sad8x4_avg_bits8, - vp9_highbd_variance8x4, + vpx_highbd_8_variance8x4, vp9_highbd_sub_pixel_variance8x4, vp9_highbd_sub_pixel_avg_variance8x4, NULL, @@ -1108,7 +1108,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_4X8, vpx_highbd_sad4x8_bits8, vpx_highbd_sad4x8_avg_bits8, - vp9_highbd_variance4x8, + vpx_highbd_8_variance4x8, vp9_highbd_sub_pixel_variance4x8, vp9_highbd_sub_pixel_avg_variance4x8, NULL, @@ -1118,7 +1118,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_4X4, vpx_highbd_sad4x4_bits8, vpx_highbd_sad4x4_avg_bits8, - vp9_highbd_variance4x4, + vpx_highbd_8_variance4x4, vp9_highbd_sub_pixel_variance4x4, vp9_highbd_sub_pixel_avg_variance4x4, vpx_highbd_sad4x4x3_bits8, @@ -1130,7 +1130,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits10, vpx_highbd_sad32x16_avg_bits10, - vp9_highbd_10_variance32x16, + vpx_highbd_10_variance32x16, vp9_highbd_10_sub_pixel_variance32x16, vp9_highbd_10_sub_pixel_avg_variance32x16, NULL, @@ -1140,7 +1140,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_16X32, vpx_highbd_sad16x32_bits10, vpx_highbd_sad16x32_avg_bits10, - vp9_highbd_10_variance16x32, + vpx_highbd_10_variance16x32, vp9_highbd_10_sub_pixel_variance16x32, vp9_highbd_10_sub_pixel_avg_variance16x32, 
NULL, @@ -1150,7 +1150,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_64X32, vpx_highbd_sad64x32_bits10, vpx_highbd_sad64x32_avg_bits10, - vp9_highbd_10_variance64x32, + vpx_highbd_10_variance64x32, vp9_highbd_10_sub_pixel_variance64x32, vp9_highbd_10_sub_pixel_avg_variance64x32, NULL, @@ -1160,7 +1160,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_32X64, vpx_highbd_sad32x64_bits10, vpx_highbd_sad32x64_avg_bits10, - vp9_highbd_10_variance32x64, + vpx_highbd_10_variance32x64, vp9_highbd_10_sub_pixel_variance32x64, vp9_highbd_10_sub_pixel_avg_variance32x64, NULL, @@ -1170,7 +1170,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_32X32, vpx_highbd_sad32x32_bits10, vpx_highbd_sad32x32_avg_bits10, - vp9_highbd_10_variance32x32, + vpx_highbd_10_variance32x32, vp9_highbd_10_sub_pixel_variance32x32, vp9_highbd_10_sub_pixel_avg_variance32x32, vpx_highbd_sad32x32x3_bits10, @@ -1180,7 +1180,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_64X64, vpx_highbd_sad64x64_bits10, vpx_highbd_sad64x64_avg_bits10, - vp9_highbd_10_variance64x64, + vpx_highbd_10_variance64x64, vp9_highbd_10_sub_pixel_variance64x64, vp9_highbd_10_sub_pixel_avg_variance64x64, vpx_highbd_sad64x64x3_bits10, @@ -1190,7 +1190,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_16X16, vpx_highbd_sad16x16_bits10, vpx_highbd_sad16x16_avg_bits10, - vp9_highbd_10_variance16x16, + vpx_highbd_10_variance16x16, vp9_highbd_10_sub_pixel_variance16x16, vp9_highbd_10_sub_pixel_avg_variance16x16, vpx_highbd_sad16x16x3_bits10, @@ -1200,7 +1200,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_16X8, vpx_highbd_sad16x8_bits10, vpx_highbd_sad16x8_avg_bits10, - vp9_highbd_10_variance16x8, + vpx_highbd_10_variance16x8, vp9_highbd_10_sub_pixel_variance16x8, vp9_highbd_10_sub_pixel_avg_variance16x8, vpx_highbd_sad16x8x3_bits10, @@ -1210,7 +1210,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_8X16, vpx_highbd_sad8x16_bits10, vpx_highbd_sad8x16_avg_bits10, - vp9_highbd_10_variance8x16, + vpx_highbd_10_variance8x16, vp9_highbd_10_sub_pixel_variance8x16, vp9_highbd_10_sub_pixel_avg_variance8x16, vpx_highbd_sad8x16x3_bits10, @@ -1220,7 +1220,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_8X8, vpx_highbd_sad8x8_bits10, vpx_highbd_sad8x8_avg_bits10, - vp9_highbd_10_variance8x8, + vpx_highbd_10_variance8x8, vp9_highbd_10_sub_pixel_variance8x8, vp9_highbd_10_sub_pixel_avg_variance8x8, vpx_highbd_sad8x8x3_bits10, @@ -1230,7 +1230,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_8X4, vpx_highbd_sad8x4_bits10, vpx_highbd_sad8x4_avg_bits10, - vp9_highbd_10_variance8x4, + vpx_highbd_10_variance8x4, vp9_highbd_10_sub_pixel_variance8x4, vp9_highbd_10_sub_pixel_avg_variance8x4, NULL, @@ -1240,7 +1240,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_4X8, vpx_highbd_sad4x8_bits10, vpx_highbd_sad4x8_avg_bits10, - vp9_highbd_10_variance4x8, + vpx_highbd_10_variance4x8, vp9_highbd_10_sub_pixel_variance4x8, vp9_highbd_10_sub_pixel_avg_variance4x8, NULL, @@ -1250,7 +1250,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_4X4, vpx_highbd_sad4x4_bits10, vpx_highbd_sad4x4_avg_bits10, - vp9_highbd_10_variance4x4, + vpx_highbd_10_variance4x4, vp9_highbd_10_sub_pixel_variance4x4, vp9_highbd_10_sub_pixel_avg_variance4x4, vpx_highbd_sad4x4x3_bits10, @@ -1262,7 +1262,7 @@ static void 
highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits12, vpx_highbd_sad32x16_avg_bits12, - vp9_highbd_12_variance32x16, + vpx_highbd_12_variance32x16, vp9_highbd_12_sub_pixel_variance32x16, vp9_highbd_12_sub_pixel_avg_variance32x16, NULL, @@ -1272,7 +1272,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_16X32, vpx_highbd_sad16x32_bits12, vpx_highbd_sad16x32_avg_bits12, - vp9_highbd_12_variance16x32, + vpx_highbd_12_variance16x32, vp9_highbd_12_sub_pixel_variance16x32, vp9_highbd_12_sub_pixel_avg_variance16x32, NULL, @@ -1282,7 +1282,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_64X32, vpx_highbd_sad64x32_bits12, vpx_highbd_sad64x32_avg_bits12, - vp9_highbd_12_variance64x32, + vpx_highbd_12_variance64x32, vp9_highbd_12_sub_pixel_variance64x32, vp9_highbd_12_sub_pixel_avg_variance64x32, NULL, @@ -1292,7 +1292,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_32X64, vpx_highbd_sad32x64_bits12, vpx_highbd_sad32x64_avg_bits12, - vp9_highbd_12_variance32x64, + vpx_highbd_12_variance32x64, vp9_highbd_12_sub_pixel_variance32x64, vp9_highbd_12_sub_pixel_avg_variance32x64, NULL, @@ -1302,7 +1302,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_32X32, vpx_highbd_sad32x32_bits12, vpx_highbd_sad32x32_avg_bits12, - vp9_highbd_12_variance32x32, + vpx_highbd_12_variance32x32, vp9_highbd_12_sub_pixel_variance32x32, vp9_highbd_12_sub_pixel_avg_variance32x32, vpx_highbd_sad32x32x3_bits12, @@ -1312,7 +1312,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_64X64, vpx_highbd_sad64x64_bits12, vpx_highbd_sad64x64_avg_bits12, - vp9_highbd_12_variance64x64, + vpx_highbd_12_variance64x64, vp9_highbd_12_sub_pixel_variance64x64, vp9_highbd_12_sub_pixel_avg_variance64x64, vpx_highbd_sad64x64x3_bits12, @@ -1322,7 +1322,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_16X16, vpx_highbd_sad16x16_bits12, vpx_highbd_sad16x16_avg_bits12, - vp9_highbd_12_variance16x16, + vpx_highbd_12_variance16x16, vp9_highbd_12_sub_pixel_variance16x16, vp9_highbd_12_sub_pixel_avg_variance16x16, vpx_highbd_sad16x16x3_bits12, @@ -1332,7 +1332,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_16X8, vpx_highbd_sad16x8_bits12, vpx_highbd_sad16x8_avg_bits12, - vp9_highbd_12_variance16x8, + vpx_highbd_12_variance16x8, vp9_highbd_12_sub_pixel_variance16x8, vp9_highbd_12_sub_pixel_avg_variance16x8, vpx_highbd_sad16x8x3_bits12, @@ -1342,7 +1342,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_8X16, vpx_highbd_sad8x16_bits12, vpx_highbd_sad8x16_avg_bits12, - vp9_highbd_12_variance8x16, + vpx_highbd_12_variance8x16, vp9_highbd_12_sub_pixel_variance8x16, vp9_highbd_12_sub_pixel_avg_variance8x16, vpx_highbd_sad8x16x3_bits12, @@ -1352,7 +1352,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_8X8, vpx_highbd_sad8x8_bits12, vpx_highbd_sad8x8_avg_bits12, - vp9_highbd_12_variance8x8, + vpx_highbd_12_variance8x8, vp9_highbd_12_sub_pixel_variance8x8, vp9_highbd_12_sub_pixel_avg_variance8x8, vpx_highbd_sad8x8x3_bits12, @@ -1362,7 +1362,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_8X4, vpx_highbd_sad8x4_bits12, vpx_highbd_sad8x4_avg_bits12, - vp9_highbd_12_variance8x4, + vpx_highbd_12_variance8x4, vp9_highbd_12_sub_pixel_variance8x4, vp9_highbd_12_sub_pixel_avg_variance8x4, NULL, @@ -1372,7 +1372,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { 
HIGHBD_BFP(BLOCK_4X8, vpx_highbd_sad4x8_bits12, vpx_highbd_sad4x8_avg_bits12, - vp9_highbd_12_variance4x8, + vpx_highbd_12_variance4x8, vp9_highbd_12_sub_pixel_variance4x8, vp9_highbd_12_sub_pixel_avg_variance4x8, NULL, @@ -1382,7 +1382,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_4X4, vpx_highbd_sad4x4_bits12, vpx_highbd_sad4x4_avg_bits12, - vp9_highbd_12_variance4x4, + vpx_highbd_12_variance4x4, vp9_highbd_12_sub_pixel_variance4x4, vp9_highbd_12_sub_pixel_avg_variance4x4, vpx_highbd_sad4x4x3_bits12, @@ -1805,61 +1805,61 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, cpi->fn_ptr[BT].sdx4df = SDX4DF; BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad32x16_avg, - vp9_variance32x16, vp9_sub_pixel_variance32x16, + vpx_variance32x16, vp9_sub_pixel_variance32x16, vp9_sub_pixel_avg_variance32x16, NULL, NULL, vpx_sad32x16x4d) BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad16x32_avg, - vp9_variance16x32, vp9_sub_pixel_variance16x32, + vpx_variance16x32, vp9_sub_pixel_variance16x32, vp9_sub_pixel_avg_variance16x32, NULL, NULL, vpx_sad16x32x4d) BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad64x32_avg, - vp9_variance64x32, vp9_sub_pixel_variance64x32, + vpx_variance64x32, vp9_sub_pixel_variance64x32, vp9_sub_pixel_avg_variance64x32, NULL, NULL, vpx_sad64x32x4d) BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad32x64_avg, - vp9_variance32x64, vp9_sub_pixel_variance32x64, + vpx_variance32x64, vp9_sub_pixel_variance32x64, vp9_sub_pixel_avg_variance32x64, NULL, NULL, vpx_sad32x64x4d) BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad32x32_avg, - vp9_variance32x32, vp9_sub_pixel_variance32x32, + vpx_variance32x32, vp9_sub_pixel_variance32x32, vp9_sub_pixel_avg_variance32x32, vpx_sad32x32x3, vpx_sad32x32x8, vpx_sad32x32x4d) BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad64x64_avg, - vp9_variance64x64, vp9_sub_pixel_variance64x64, + vpx_variance64x64, vp9_sub_pixel_variance64x64, vp9_sub_pixel_avg_variance64x64, vpx_sad64x64x3, vpx_sad64x64x8, vpx_sad64x64x4d) BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad16x16_avg, - vp9_variance16x16, vp9_sub_pixel_variance16x16, + vpx_variance16x16, vp9_sub_pixel_variance16x16, vp9_sub_pixel_avg_variance16x16, vpx_sad16x16x3, vpx_sad16x16x8, vpx_sad16x16x4d) BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad16x8_avg, - vp9_variance16x8, vp9_sub_pixel_variance16x8, + vpx_variance16x8, vp9_sub_pixel_variance16x8, vp9_sub_pixel_avg_variance16x8, vpx_sad16x8x3, vpx_sad16x8x8, vpx_sad16x8x4d) BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad8x16_avg, - vp9_variance8x16, vp9_sub_pixel_variance8x16, + vpx_variance8x16, vp9_sub_pixel_variance8x16, vp9_sub_pixel_avg_variance8x16, vpx_sad8x16x3, vpx_sad8x16x8, vpx_sad8x16x4d) BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad8x8_avg, - vp9_variance8x8, vp9_sub_pixel_variance8x8, + vpx_variance8x8, vp9_sub_pixel_variance8x8, vp9_sub_pixel_avg_variance8x8, vpx_sad8x8x3, vpx_sad8x8x8, vpx_sad8x8x4d) BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad8x4_avg, - vp9_variance8x4, vp9_sub_pixel_variance8x4, + vpx_variance8x4, vp9_sub_pixel_variance8x4, vp9_sub_pixel_avg_variance8x4, NULL, vpx_sad8x4x8, vpx_sad8x4x4d) BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad4x8_avg, - vp9_variance4x8, vp9_sub_pixel_variance4x8, + vpx_variance4x8, vp9_sub_pixel_variance4x8, vp9_sub_pixel_avg_variance4x8, NULL, vpx_sad4x8x8, vpx_sad4x8x4d) BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad4x4_avg, - vp9_variance4x4, vp9_sub_pixel_variance4x4, + vpx_variance4x4, vp9_sub_pixel_variance4x4, vp9_sub_pixel_avg_variance4x4, vpx_sad4x4x3, vpx_sad4x4x8, vpx_sad4x4x4d) @@ -2079,7 +2079,7 @@ static int64_t get_sse(const uint8_t *a, int a_stride, const uint8_t *pa = a; const 
uint8_t *pb = b; for (x = 0; x < width / 16; ++x) { - vp9_mse16x16(pa, a_stride, pb, b_stride, &sse); + vpx_mse16x16(pa, a_stride, pb, b_stride, &sse); total_sse += sse; pa += 16; @@ -2124,21 +2124,21 @@ static int64_t highbd_get_sse(const uint8_t *a, int a_stride, unsigned int sse = 0; int sum = 0; if (dw > 0) { - highbd_variance(&a[width - dw], a_stride, &b[width - dw], b_stride, - dw, height, &sse, &sum); + highbd_8_variance(&a[width - dw], a_stride, &b[width - dw], b_stride, + dw, height, &sse, &sum); total_sse += sse; } if (dh > 0) { - highbd_variance(&a[(height - dh) * a_stride], a_stride, - &b[(height - dh) * b_stride], b_stride, - width - dw, dh, &sse, &sum); + highbd_8_variance(&a[(height - dh) * a_stride], a_stride, + &b[(height - dh) * b_stride], b_stride, + width - dw, dh, &sse, &sum); total_sse += sse; } for (y = 0; y < height / 16; ++y) { const uint8_t *pa = a; const uint8_t *pb = b; for (x = 0; x < width / 16; ++x) { - vp9_highbd_mse16x16(pa, a_stride, pb, b_stride, &sse); + vpx_highbd_8_mse16x16(pa, a_stride, pb, b_stride, &sse); total_sse += sse; pa += 16; pb += 16; diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 88b1030..bfe568e 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -12,6 +12,7 @@ #include #include +#include "./vpx_dsp_rtcd.h" #include "./vpx_scale_rtcd.h" #include "vpx_mem/vpx_mem.h" @@ -267,13 +268,13 @@ void vp9_end_first_pass(VP9_COMP *cpi) { static vp9_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) { switch (bsize) { case BLOCK_8X8: - return vp9_mse8x8; + return vpx_mse8x8; case BLOCK_16X8: - return vp9_mse16x8; + return vpx_mse16x8; case BLOCK_8X16: - return vp9_mse8x16; + return vpx_mse8x16; default: - return vp9_mse16x16; + return vpx_mse16x16; } } @@ -293,37 +294,37 @@ static vp9_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize, default: switch (bsize) { case BLOCK_8X8: - return vp9_highbd_mse8x8; + return vpx_highbd_8_mse8x8; case BLOCK_16X8: - return vp9_highbd_mse16x8; + return vpx_highbd_8_mse16x8; case BLOCK_8X16: - return vp9_highbd_mse8x16; + return vpx_highbd_8_mse8x16; default: - return vp9_highbd_mse16x16; + return vpx_highbd_8_mse16x16; } break; case 10: switch (bsize) { case BLOCK_8X8: - return vp9_highbd_10_mse8x8; + return vpx_highbd_10_mse8x8; case BLOCK_16X8: - return vp9_highbd_10_mse16x8; + return vpx_highbd_10_mse16x8; case BLOCK_8X16: - return vp9_highbd_10_mse8x16; + return vpx_highbd_10_mse8x16; default: - return vp9_highbd_10_mse16x16; + return vpx_highbd_10_mse16x16; } break; case 12: switch (bsize) { case BLOCK_8X8: - return vp9_highbd_12_mse8x8; + return vpx_highbd_12_mse8x8; case BLOCK_16X8: - return vp9_highbd_12_mse16x8; + return vpx_highbd_12_mse16x8; case BLOCK_8X16: - return vp9_highbd_12_mse8x16; + return vpx_highbd_12_mse8x16; default: - return vp9_highbd_12_mse16x16; + return vpx_highbd_12_mse16x16; } break; } @@ -634,7 +635,7 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { xd->mi[0]->mbmi.tx_size = use_dc_pred ? (bsize >= BLOCK_16X16 ? 
TX_16X16 : TX_8X8) : TX_4X4; vp9_encode_intra_block_plane(x, bsize, 0); - this_error = vp9_get_mb_ss(x->plane[0].src_diff); + this_error = vpx_get_mb_ss(x->plane[0].src_diff); #if CONFIG_VP9_HIGHBITDEPTH if (cm->use_highbitdepth) { switch (cm->bit_depth) { diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c index 8bdd428..7cec7b1 100644 --- a/vp9/encoder/vp9_mcomp.c +++ b/vp9/encoder/vp9_mcomp.c @@ -13,6 +13,7 @@ #include #include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" @@ -303,13 +304,13 @@ static INLINE unsigned int setup_center_error(const MACROBLOCKD *xd, if (second_pred != NULL) { if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { DECLARE_ALIGNED(16, uint16_t, comp_pred16[64 * 64]); - vp9_highbd_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset, + vpx_highbd_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset, y_stride); besterr = vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src, src_stride, sse1); } else { DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]); - vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); + vpx_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); besterr = vfp->vf(comp_pred, w, src, src_stride, sse1); } } else { @@ -321,7 +322,7 @@ static INLINE unsigned int setup_center_error(const MACROBLOCKD *xd, (void) xd; if (second_pred != NULL) { DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]); - vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); + vpx_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); besterr = vfp->vf(comp_pred, w, src, src_stride, sse1); } else { besterr = vfp->vf(y + offset, y_stride, src, src_stride, sse1); diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index 1e91715..70aed2e 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -14,6 +14,7 @@ #include #include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" @@ -215,7 +216,7 @@ static void block_variance(const uint8_t *src, int src_stride, for (i = 0; i < h; i += block_size) { for (j = 0; j < w; j += block_size) { - vp9_get8x8var(src + src_stride * i + j, src_stride, + vpx_get8x8var(src + src_stride * i + j, src_stride, ref + ref_stride * i + j, ref_stride, &sse8x8[k], &sum8x8[k]); *sse += sse8x8[k]; diff --git a/vp9/encoder/vp9_variance.c b/vp9/encoder/vp9_variance.c index f38f96d..1f6b083 100644 --- a/vp9/encoder/vp9_variance.c +++ b/vp9/encoder/vp9_variance.c @@ -9,6 +9,7 @@ */ #include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" #include "vpx_ports/mem.h" #include "vpx/vpx_integer.h" @@ -18,26 +19,6 @@ #include "vp9/encoder/vp9_variance.h" -void variance(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - int w, int h, unsigned int *sse, int *sum) { - int i, j; - - *sum = 0; - *sse = 0; - - for (i = 0; i < h; i++) { - for (j = 0; j < w; j++) { - const int diff = a[j] - b[j]; - *sum += diff; - *sse += diff * diff; - } - - a += a_stride; - b += b_stride; - } -} - // Applies a 1-D 2-tap bi-linear filter to the source block in either horizontal // or vertical direction to produce the filtered output block. Used to implement // first-pass of 2-D separable filter. 
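
A note for readers following the vp9_mcomp.c hunk above: the vpx_comp_avg_pred() now called in setup_center_error() replaces the vp9_comp_avg_pred() that is deleted from vp9_variance.c further down in this patch (a rounded per-pixel average of the second predictor and the candidate block, followed by a variance call on the averaged block). The sketch below is only a model of that compound path; the names avg_pred_ref, center_error_sketch and variance_fn_t are illustrative and not part of the tree.

#include <stdint.h>

typedef unsigned int (*variance_fn_t)(const uint8_t *a, int a_stride,
                                      const uint8_t *b, int b_stride,
                                      unsigned int *sse);

/* Rounded average of the (contiguous) second predictor and the strided
 * candidate block, written to a contiguous width-wide buffer. */
static void avg_pred_ref(uint8_t *comp_pred, const uint8_t *pred,
                         int width, int height,
                         const uint8_t *ref, int ref_stride) {
  int i, j;
  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j)
      comp_pred[j] = (uint8_t)((pred[j] + ref[j] + 1) >> 1);
    comp_pred += width;
    pred += width;
    ref += ref_stride;
  }
}

/* Compound center error, modeled on setup_center_error(): average first,
 * then measure the variance of the averaged block against the source. */
static unsigned int center_error_sketch(const uint8_t *y, int y_stride,
                                        const uint8_t *second_pred,
                                        const uint8_t *src, int src_stride,
                                        int w, int h, variance_fn_t vf,
                                        unsigned int *sse) {
  uint8_t comp_pred[64 * 64];
  avg_pred_ref(comp_pred, second_pred, w, h, y, y_stride);
  return vf(comp_pred, w, src, src_stride, sse);
}
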
@@ -100,25 +81,6 @@ static void var_filter_block2d_bil_second_pass(const uint16_t *src_ptr, } } -unsigned int vp9_get_mb_ss_c(const int16_t *src_ptr) { - unsigned int i, sum = 0; - - for (i = 0; i < 256; ++i) { - sum += src_ptr[i] * src_ptr[i]; - } - - return sum; -} - -#define VAR(W, H) \ -unsigned int vp9_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - unsigned int *sse) { \ - int sum; \ - variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ - return *sse - (((int64_t)sum * sum) / (W * H)); \ -} - #define SUBPIX_VAR(W, H) \ unsigned int vp9_sub_pixel_variance##W##x##H##_c( \ const uint8_t *src, int src_stride, \ @@ -133,7 +95,7 @@ unsigned int vp9_sub_pixel_variance##W##x##H##_c( \ var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ BILINEAR_FILTERS_2TAP(yoffset)); \ \ - return vp9_variance##W##x##H##_c(temp2, W, dst, dst_stride, sse); \ + return vpx_variance##W##x##H##_c(temp2, W, dst, dst_stride, sse); \ } #define SUBPIX_AVG_VAR(W, H) \ @@ -152,178 +114,51 @@ unsigned int vp9_sub_pixel_avg_variance##W##x##H##_c( \ var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ BILINEAR_FILTERS_2TAP(yoffset)); \ \ - vp9_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \ + vpx_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \ \ - return vp9_variance##W##x##H##_c(temp3, W, dst, dst_stride, sse); \ -} - -void vp9_get16x16var_c(const uint8_t *src_ptr, int source_stride, - const uint8_t *ref_ptr, int ref_stride, - unsigned int *sse, int *sum) { - variance(src_ptr, source_stride, ref_ptr, ref_stride, 16, 16, sse, sum); + return vpx_variance##W##x##H##_c(temp3, W, dst, dst_stride, sse); \ } -void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride, - const uint8_t *ref_ptr, int ref_stride, - unsigned int *sse, int *sum) { - variance(src_ptr, source_stride, ref_ptr, ref_stride, 8, 8, sse, sum); -} - -unsigned int vp9_mse16x16_c(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance(src, src_stride, ref, ref_stride, 16, 16, sse, &sum); - return *sse; -} - -unsigned int vp9_mse16x8_c(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance(src, src_stride, ref, ref_stride, 16, 8, sse, &sum); - return *sse; -} - -unsigned int vp9_mse8x16_c(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance(src, src_stride, ref, ref_stride, 8, 16, sse, &sum); - return *sse; -} - -unsigned int vp9_mse8x8_c(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance(src, src_stride, ref, ref_stride, 8, 8, sse, &sum); - return *sse; -} - -VAR(4, 4) SUBPIX_VAR(4, 4) SUBPIX_AVG_VAR(4, 4) -VAR(4, 8) SUBPIX_VAR(4, 8) SUBPIX_AVG_VAR(4, 8) -VAR(8, 4) SUBPIX_VAR(8, 4) SUBPIX_AVG_VAR(8, 4) -VAR(8, 8) SUBPIX_VAR(8, 8) SUBPIX_AVG_VAR(8, 8) -VAR(8, 16) SUBPIX_VAR(8, 16) SUBPIX_AVG_VAR(8, 16) -VAR(16, 8) SUBPIX_VAR(16, 8) SUBPIX_AVG_VAR(16, 8) -VAR(16, 16) SUBPIX_VAR(16, 16) SUBPIX_AVG_VAR(16, 16) -VAR(16, 32) SUBPIX_VAR(16, 32) SUBPIX_AVG_VAR(16, 32) -VAR(32, 16) SUBPIX_VAR(32, 16) SUBPIX_AVG_VAR(32, 16) -VAR(32, 32) SUBPIX_VAR(32, 32) SUBPIX_AVG_VAR(32, 32) -VAR(32, 64) SUBPIX_VAR(32, 64) SUBPIX_AVG_VAR(32, 64) -VAR(64, 32) SUBPIX_VAR(64, 32) SUBPIX_AVG_VAR(64, 32) -VAR(64, 64) SUBPIX_VAR(64, 64) SUBPIX_AVG_VAR(64, 64) -void vp9_comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width, - int height, 
const uint8_t *ref, int ref_stride) { - int i, j; - - for (i = 0; i < height; i++) { - for (j = 0; j < width; j++) { - const int tmp = pred[j] + ref[j]; - comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1); - } - comp_pred += width; - pred += width; - ref += ref_stride; - } -} - #if CONFIG_VP9_HIGHBITDEPTH -void highbd_variance64(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, - int w, int h, uint64_t *sse, - uint64_t *sum) { - int i, j; - - uint16_t *a = CONVERT_TO_SHORTPTR(a8); - uint16_t *b = CONVERT_TO_SHORTPTR(b8); - *sum = 0; - *sse = 0; - - for (i = 0; i < h; i++) { - for (j = 0; j < w; j++) { - const int diff = a[j] - b[j]; - *sum += diff; - *sse += diff * diff; - } - a += a_stride; - b += b_stride; - } -} - -void highbd_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, - int w, int h, unsigned int *sse, - int *sum) { - uint64_t sse_long = 0; - uint64_t sum_long = 0; - highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); - *sse = (unsigned int)sse_long; - *sum = (int)sum_long; -} - -void highbd_10_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, - int w, int h, unsigned int *sse, - int *sum) { - uint64_t sse_long = 0; - uint64_t sum_long = 0; - highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); - *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); - *sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 4); -} - -void highbd_12_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, - int w, int h, unsigned int *sse, - int *sum) { - uint64_t sse_long = 0; - uint64_t sum_long = 0; - highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); - *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); - *sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 8); -} - static void highbd_var_filter_block2d_bil_first_pass( const uint8_t *src_ptr8, uint16_t *output_ptr, @@ -374,35 +209,6 @@ static void highbd_var_filter_block2d_bil_second_pass( } } -#define HIGHBD_VAR(W, H) \ -unsigned int vp9_highbd_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - unsigned int *sse) { \ - int sum; \ - highbd_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ - return *sse - (((int64_t)sum * sum) / (W * H)); \ -} \ -\ -unsigned int vp9_highbd_10_variance##W##x##H##_c(const uint8_t *a, \ - int a_stride, \ - const uint8_t *b, \ - int b_stride, \ - unsigned int *sse) { \ - int sum; \ - highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ - return *sse - (((int64_t)sum * sum) / (W * H)); \ -} \ -\ -unsigned int vp9_highbd_12_variance##W##x##H##_c(const uint8_t *a, \ - int a_stride, \ - const uint8_t *b, \ - int b_stride, \ - unsigned int *sse) { \ - int sum; \ - highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ - return *sse - (((int64_t)sum * sum) / (W * H)); \ -} - #define HIGHBD_SUBPIX_VAR(W, H) \ unsigned int vp9_highbd_sub_pixel_variance##W##x##H##_c( \ const uint8_t *src, int src_stride, \ @@ -417,7 +223,7 @@ unsigned int vp9_highbd_sub_pixel_variance##W##x##H##_c( \ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ BILINEAR_FILTERS_2TAP(yoffset)); \ \ - return vp9_highbd_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \ + return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \ dst_stride, sse); \ } \ \ @@ -434,7 +240,7 @@ unsigned int vp9_highbd_10_sub_pixel_variance##W##x##H##_c( \ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ 
BILINEAR_FILTERS_2TAP(yoffset)); \ \ - return vp9_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \ + return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \ W, dst, dst_stride, sse); \ } \ \ @@ -451,7 +257,7 @@ unsigned int vp9_highbd_12_sub_pixel_variance##W##x##H##_c( \ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ BILINEAR_FILTERS_2TAP(yoffset)); \ \ - return vp9_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \ + return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \ W, dst, dst_stride, sse); \ } @@ -471,10 +277,10 @@ unsigned int vp9_highbd_sub_pixel_avg_variance##W##x##H##_c( \ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ BILINEAR_FILTERS_2TAP(yoffset)); \ \ - vp9_highbd_comp_avg_pred(temp3, second_pred, W, H, \ + vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \ CONVERT_TO_BYTEPTR(temp2), W); \ \ - return vp9_highbd_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \ + return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \ dst_stride, sse); \ } \ \ @@ -493,10 +299,10 @@ unsigned int vp9_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ BILINEAR_FILTERS_2TAP(yoffset)); \ \ - vp9_highbd_comp_avg_pred(temp3, second_pred, W, H, \ + vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \ CONVERT_TO_BYTEPTR(temp2), W); \ \ - return vp9_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \ + return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \ W, dst, dst_stride, sse); \ } \ \ @@ -515,137 +321,49 @@ unsigned int vp9_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ BILINEAR_FILTERS_2TAP(yoffset)); \ \ - vp9_highbd_comp_avg_pred(temp3, second_pred, W, H, \ + vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \ CONVERT_TO_BYTEPTR(temp2), W); \ \ - return vp9_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \ + return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \ W, dst, dst_stride, sse); \ } -#define HIGHBD_GET_VAR(S) \ -void vp9_highbd_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - unsigned int *sse, int *sum) { \ - highbd_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \ -} \ -\ -void vp9_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - unsigned int *sse, int *sum) { \ - highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \ -} \ -\ -void vp9_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - unsigned int *sse, int *sum) { \ - highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \ -} - -#define HIGHBD_MSE(W, H) \ -unsigned int vp9_highbd_mse##W##x##H##_c(const uint8_t *src, \ - int src_stride, \ - const uint8_t *ref, \ - int ref_stride, \ - unsigned int *sse) { \ - int sum; \ - highbd_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ - return *sse; \ -} \ -\ -unsigned int vp9_highbd_10_mse##W##x##H##_c(const uint8_t *src, \ - int src_stride, \ - const uint8_t *ref, \ - int ref_stride, \ - unsigned int *sse) { \ - int sum; \ - highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ - return *sse; \ -} \ -\ -unsigned int vp9_highbd_12_mse##W##x##H##_c(const uint8_t *src, \ - int src_stride, \ - const uint8_t *ref, \ - 
int ref_stride, \ - unsigned int *sse) { \ - int sum; \ - highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ - return *sse; \ -} - -HIGHBD_GET_VAR(8) -HIGHBD_GET_VAR(16) - -HIGHBD_MSE(16, 16) -HIGHBD_MSE(16, 8) -HIGHBD_MSE(8, 16) -HIGHBD_MSE(8, 8) - -HIGHBD_VAR(4, 4) HIGHBD_SUBPIX_VAR(4, 4) HIGHBD_SUBPIX_AVG_VAR(4, 4) -HIGHBD_VAR(4, 8) HIGHBD_SUBPIX_VAR(4, 8) HIGHBD_SUBPIX_AVG_VAR(4, 8) -HIGHBD_VAR(8, 4) HIGHBD_SUBPIX_VAR(8, 4) HIGHBD_SUBPIX_AVG_VAR(8, 4) -HIGHBD_VAR(8, 8) HIGHBD_SUBPIX_VAR(8, 8) HIGHBD_SUBPIX_AVG_VAR(8, 8) -HIGHBD_VAR(8, 16) HIGHBD_SUBPIX_VAR(8, 16) HIGHBD_SUBPIX_AVG_VAR(8, 16) -HIGHBD_VAR(16, 8) HIGHBD_SUBPIX_VAR(16, 8) HIGHBD_SUBPIX_AVG_VAR(16, 8) -HIGHBD_VAR(16, 16) HIGHBD_SUBPIX_VAR(16, 16) HIGHBD_SUBPIX_AVG_VAR(16, 16) -HIGHBD_VAR(16, 32) HIGHBD_SUBPIX_VAR(16, 32) HIGHBD_SUBPIX_AVG_VAR(16, 32) -HIGHBD_VAR(32, 16) HIGHBD_SUBPIX_VAR(32, 16) HIGHBD_SUBPIX_AVG_VAR(32, 16) -HIGHBD_VAR(32, 32) HIGHBD_SUBPIX_VAR(32, 32) HIGHBD_SUBPIX_AVG_VAR(32, 32) -HIGHBD_VAR(32, 64) HIGHBD_SUBPIX_VAR(32, 64) HIGHBD_SUBPIX_AVG_VAR(32, 64) -HIGHBD_VAR(64, 32) HIGHBD_SUBPIX_VAR(64, 32) HIGHBD_SUBPIX_AVG_VAR(64, 32) -HIGHBD_VAR(64, 64) HIGHBD_SUBPIX_VAR(64, 64) HIGHBD_SUBPIX_AVG_VAR(64, 64) - -void vp9_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred8, - int width, int height, const uint8_t *ref8, - int ref_stride) { - int i, j; - uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); - for (i = 0; i < height; i++) { - for (j = 0; j < width; j++) { - const int tmp = pred[j] + ref[j]; - comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1); - } - comp_pred += width; - pred += width; - ref += ref_stride; - } -} #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vp9/encoder/vp9_variance.h b/vp9/encoder/vp9_variance.h index 53148f2..8fc47a8 100644 --- a/vp9/encoder/vp9_variance.h +++ b/vp9/encoder/vp9_variance.h @@ -12,31 +12,64 @@ #define VP9_ENCODER_VP9_VARIANCE_H_ #include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" #ifdef __cplusplus extern "C" { #endif -void variance(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - int w, int h, - unsigned int *sse, int *sum); +// TODO(johannkoenig): All functions which depend on +// [highbd_][8|10|12_]variance should be refactored or moved to vpx_dsp. 
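
The static highbd helpers added to this header just below keep the 64-bit accumulators of the code removed from vp9_variance.c (highbd_variance64). A standalone, illustrative check of why 32-bit sums of squares are not enough once samples are wider than 8 bits:

#include <stdint.h>
#include <stdio.h>

int main(void) {
  /* Worst case for a 64x64 block of 12-bit samples: every diff is 4095. */
  const uint64_t worst_sse_12bit = 64ULL * 64ULL * 4095ULL * 4095ULL;
  /* 8-bit worst case for comparison: every diff is 255. */
  const uint64_t worst_sse_8bit = 64ULL * 64ULL * 255ULL * 255ULL;
  printf("12-bit worst-case 64x64 sse: %llu\n",
         (unsigned long long)worst_sse_12bit);  /* 68685926400 */
  printf(" 8-bit worst-case 64x64 sse: %llu\n",
         (unsigned long long)worst_sse_8bit);   /* 266342400 */
  printf("32-bit accumulator limit:    %llu\n",
         (unsigned long long)UINT32_MAX);       /* 4294967295 */
  return 0;
}
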
+static void variance(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + int w, int h, unsigned int *sse, int *sum) { + int i, j; + + *sum = 0; + *sse = 0; + + for (i = 0; i < h; i++) { + for (j = 0; j < w; j++) { + const int diff = a[j] - b[j]; + *sum += diff; + *sse += diff * diff; + } + + a += a_stride; + b += b_stride; + } +} #if CONFIG_VP9_HIGHBITDEPTH -void highbd_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, - int w, int h, - unsigned int *sse, int *sum); - -void highbd_10_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, - int w, int h, - unsigned int *sse, int *sum); - -void highbd_12_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, - int w, int h, - unsigned int *sse, int *sum); +static void highbd_variance64(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + int w, int h, uint64_t *sse, uint64_t *sum) { + int i, j; + + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + *sum = 0; + *sse = 0; + + for (i = 0; i < h; i++) { + for (j = 0; j < w; j++) { + const int diff = a[j] - b[j]; + *sum += diff; + *sse += diff * diff; + } + a += a_stride; + b += b_stride; + } +} +static void highbd_8_variance(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + int w, int h, unsigned int *sse, int *sum) { + uint64_t sse_long = 0; + uint64_t sum_long = 0; + highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); + *sse = (unsigned int)sse_long; + *sum = (int)sum_long; +} #endif typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr, @@ -95,15 +128,6 @@ typedef struct vp9_variance_vtable { vp9_sad_multi_d_fn_t sdx4df; } vp9_variance_fn_ptr_t; -void vp9_comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width, - int height, const uint8_t *ref, int ref_stride); - -#if CONFIG_VP9_HIGHBITDEPTH -void vp9_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred, - int width, int height, - const uint8_t *ref, int ref_stride); -#endif - #ifdef __cplusplus } // extern "C" #endif diff --git a/vp9/encoder/x86/vp9_highbd_variance_sse2.c b/vp9/encoder/x86/vp9_highbd_variance_sse2.c index 4bc3e7e..29b7b27 100644 --- a/vp9/encoder/x86/vp9_highbd_variance_sse2.c +++ b/vp9/encoder/x86/vp9_highbd_variance_sse2.c @@ -13,237 +13,6 @@ #include "vp9/encoder/vp9_variance.h" #include "vpx_ports/mem.h" -typedef uint32_t (*high_variance_fn_t) (const uint16_t *src, int src_stride, - const uint16_t *ref, int ref_stride, - uint32_t *sse, int *sum); - -uint32_t vp9_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride, - const uint16_t *ref, int ref_stride, - uint32_t *sse, int *sum); - -uint32_t vp9_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride, - const uint16_t *ref, int ref_stride, - uint32_t *sse, int *sum); - -static void highbd_variance_sse2(const uint16_t *src, int src_stride, - const uint16_t *ref, int ref_stride, - int w, int h, uint32_t *sse, int *sum, - high_variance_fn_t var_fn, int block_size) { - int i, j; - - *sse = 0; - *sum = 0; - - for (i = 0; i < h; i += block_size) { - for (j = 0; j < w; j += block_size) { - unsigned int sse0; - int sum0; - var_fn(src + src_stride * i + j, src_stride, - ref + ref_stride * i + j, ref_stride, &sse0, &sum0); - *sse += sse0; - *sum += sum0; - } - } -} - -static void highbd_10_variance_sse2(const uint16_t *src, int src_stride, - const uint16_t *ref, int ref_stride, - int w, int h, uint32_t *sse, int *sum, - high_variance_fn_t var_fn, int block_size) { - int 
i, j; - uint64_t sse_long = 0; - int64_t sum_long = 0; - - for (i = 0; i < h; i += block_size) { - for (j = 0; j < w; j += block_size) { - unsigned int sse0; - int sum0; - var_fn(src + src_stride * i + j, src_stride, - ref + ref_stride * i + j, ref_stride, &sse0, &sum0); - sse_long += sse0; - sum_long += sum0; - } - } - *sum = ROUND_POWER_OF_TWO(sum_long, 2); - *sse = ROUND_POWER_OF_TWO(sse_long, 4); -} - -static void highbd_12_variance_sse2(const uint16_t *src, int src_stride, - const uint16_t *ref, int ref_stride, - int w, int h, uint32_t *sse, int *sum, - high_variance_fn_t var_fn, int block_size) { - int i, j; - uint64_t sse_long = 0; - int64_t sum_long = 0; - - for (i = 0; i < h; i += block_size) { - for (j = 0; j < w; j += block_size) { - unsigned int sse0; - int sum0; - var_fn(src + src_stride * i + j, src_stride, - ref + ref_stride * i + j, ref_stride, &sse0, &sum0); - sse_long += sse0; - sum_long += sum0; - } - } - *sum = ROUND_POWER_OF_TWO(sum_long, 4); - *sse = ROUND_POWER_OF_TWO(sse_long, 8); -} - - -#define HIGH_GET_VAR(S) \ -void vp9_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \ - const uint8_t *ref8, int ref_stride, \ - uint32_t *sse, int *sum) { \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ - vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \ - sse, sum); \ -} \ -\ -void vp9_highbd_10_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \ - const uint8_t *ref8, int ref_stride, \ - uint32_t *sse, int *sum) { \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ - vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \ - sse, sum); \ - *sum = ROUND_POWER_OF_TWO(*sum, 2); \ - *sse = ROUND_POWER_OF_TWO(*sse, 4); \ -} \ -\ -void vp9_highbd_12_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \ - const uint8_t *ref8, int ref_stride, \ - uint32_t *sse, int *sum) { \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ - vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \ - sse, sum); \ - *sum = ROUND_POWER_OF_TWO(*sum, 4); \ - *sse = ROUND_POWER_OF_TWO(*sse, 8); \ -} - -HIGH_GET_VAR(16); -HIGH_GET_VAR(8); - -#undef HIGH_GET_VAR - -#define VAR_FN(w, h, block_size, shift) \ -uint32_t vp9_highbd_variance##w##x##h##_sse2( \ - const uint8_t *src8, int src_stride, \ - const uint8_t *ref8, int ref_stride, uint32_t *sse) { \ - int sum; \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ - highbd_variance_sse2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \ - vp9_highbd_calc##block_size##x##block_size##var_sse2, \ - block_size); \ - return *sse - (((int64_t)sum * sum) >> shift); \ -} \ -\ -uint32_t vp9_highbd_10_variance##w##x##h##_sse2( \ - const uint8_t *src8, int src_stride, \ - const uint8_t *ref8, int ref_stride, uint32_t *sse) { \ - int sum; \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ - highbd_10_variance_sse2( \ - src, src_stride, ref, ref_stride, w, h, sse, &sum, \ - vp9_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ - return *sse - (((int64_t)sum * sum) >> shift); \ -} \ -\ -uint32_t vp9_highbd_12_variance##w##x##h##_sse2( \ - const uint8_t *src8, int src_stride, \ - const uint8_t *ref8, int ref_stride, uint32_t *sse) { \ - int sum; \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ - 
highbd_12_variance_sse2( \ - src, src_stride, ref, ref_stride, w, h, sse, &sum, \ - vp9_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ - return *sse - (((int64_t)sum * sum) >> shift); \ -} - -VAR_FN(64, 64, 16, 12); -VAR_FN(64, 32, 16, 11); -VAR_FN(32, 64, 16, 11); -VAR_FN(32, 32, 16, 10); -VAR_FN(32, 16, 16, 9); -VAR_FN(16, 32, 16, 9); -VAR_FN(16, 16, 16, 8); -VAR_FN(16, 8, 8, 7); -VAR_FN(8, 16, 8, 7); -VAR_FN(8, 8, 8, 6); - -#undef VAR_FN - -unsigned int vp9_highbd_mse16x16_sse2(const uint8_t *src8, int src_stride, - const uint8_t *ref8, int ref_stride, - unsigned int *sse) { - int sum; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); - highbd_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, - sse, &sum, vp9_highbd_calc16x16var_sse2, 16); - return *sse; -} - -unsigned int vp9_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride, - const uint8_t *ref8, int ref_stride, - unsigned int *sse) { - int sum; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); - highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, - sse, &sum, vp9_highbd_calc16x16var_sse2, 16); - return *sse; -} - -unsigned int vp9_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride, - const uint8_t *ref8, int ref_stride, - unsigned int *sse) { - int sum; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); - highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, - sse, &sum, vp9_highbd_calc16x16var_sse2, 16); - return *sse; -} - -unsigned int vp9_highbd_mse8x8_sse2(const uint8_t *src8, int src_stride, - const uint8_t *ref8, int ref_stride, - unsigned int *sse) { - int sum; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); - highbd_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, - sse, &sum, vp9_highbd_calc8x8var_sse2, 8); - return *sse; -} - -unsigned int vp9_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride, - const uint8_t *ref8, int ref_stride, - unsigned int *sse) { - int sum; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); - highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, - sse, &sum, vp9_highbd_calc8x8var_sse2, 8); - return *sse; -} - -unsigned int vp9_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride, - const uint8_t *ref8, int ref_stride, - unsigned int *sse) { - int sum; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); - highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, - sse, &sum, vp9_highbd_calc8x8var_sse2, 8); - return *sse; -} - #define DECL(w, opt) \ int vp9_highbd_sub_pixel_variance##w##xh_##opt(const uint16_t *src, \ ptrdiff_t src_stride, \ diff --git a/vp9/encoder/x86/vp9_variance_avx2.c b/vp9/encoder/x86/vp9_variance_avx2.c index ff9f7cc..8cd071d 100644 --- a/vp9/encoder/x86/vp9_variance_avx2.c +++ b/vp9/encoder/x86/vp9_variance_avx2.c @@ -13,18 +13,6 @@ #include "vp9/encoder/vp9_variance.h" #include "vpx_ports/mem.h" -typedef void (*get_var_avx2)(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse, int *sum); - -void vp9_get16x16var_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse, int *sum); - -void vp9_get32x32var_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse, int *sum); - unsigned int 
vp9_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, int x_offset, int y_offset, const uint8_t *dst, int dst_stride, @@ -42,81 +30,6 @@ unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src, int height, unsigned int *sseptr); -static void variance_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - int w, int h, unsigned int *sse, int *sum, - get_var_avx2 var_fn, int block_size) { - int i, j; - - *sse = 0; - *sum = 0; - - for (i = 0; i < h; i += 16) { - for (j = 0; j < w; j += block_size) { - unsigned int sse0; - int sum0; - var_fn(&src[src_stride * i + j], src_stride, - &ref[ref_stride * i + j], ref_stride, &sse0, &sum0); - *sse += sse0; - *sum += sum0; - } - } -} - - -unsigned int vp9_variance16x16_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_avx2(src, src_stride, ref, ref_stride, 16, 16, - sse, &sum, vp9_get16x16var_avx2, 16); - return *sse - (((unsigned int)sum * sum) >> 8); -} - -unsigned int vp9_mse16x16_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - vp9_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &sum); - return *sse; -} - -unsigned int vp9_variance32x16_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_avx2(src, src_stride, ref, ref_stride, 32, 16, - sse, &sum, vp9_get32x32var_avx2, 32); - return *sse - (((int64_t)sum * sum) >> 9); -} - -unsigned int vp9_variance32x32_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_avx2(src, src_stride, ref, ref_stride, 32, 32, - sse, &sum, vp9_get32x32var_avx2, 32); - return *sse - (((int64_t)sum * sum) >> 10); -} - -unsigned int vp9_variance64x64_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_avx2(src, src_stride, ref, ref_stride, 64, 64, - sse, &sum, vp9_get32x32var_avx2, 32); - return *sse - (((int64_t)sum * sum) >> 12); -} - -unsigned int vp9_variance64x32_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_avx2(src, src_stride, ref, ref_stride, 64, 32, - sse, &sum, vp9_get32x32var_avx2, 32); - return *sse - (((int64_t)sum * sum) >> 11); -} - unsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src, int src_stride, int x_offset, diff --git a/vp9/encoder/x86/vp9_variance_sse2.c b/vp9/encoder/x86/vp9_variance_sse2.c index cacee74..961efe3 100644 --- a/vp9/encoder/x86/vp9_variance_sse2.c +++ b/vp9/encoder/x86/vp9_variance_sse2.c @@ -16,299 +16,6 @@ #include "vp9/encoder/vp9_variance.h" #include "vpx_ports/mem.h" -typedef void (*variance_fn_t)(const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, - unsigned int *sse, int *sum); - -unsigned int vp9_get_mb_ss_sse2(const int16_t *src) { - __m128i vsum = _mm_setzero_si128(); - int i; - - for (i = 0; i < 32; ++i) { - const __m128i v = _mm_loadu_si128((const __m128i *)src); - vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v)); - src += 8; - } - - vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8)); - vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4)); - return _mm_cvtsi128_si32(vsum); -} - -#define READ64(p, stride, i) \ - _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const uint32_t *)(p + i * stride)), \ - _mm_cvtsi32_si128(*(const uint32_t *)(p + (i + 1) * 
stride))) - -static void get4x4var_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse, int *sum) { - const __m128i zero = _mm_setzero_si128(); - const __m128i src0 = _mm_unpacklo_epi8(READ64(src, src_stride, 0), zero); - const __m128i src1 = _mm_unpacklo_epi8(READ64(src, src_stride, 2), zero); - const __m128i ref0 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 0), zero); - const __m128i ref1 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 2), zero); - const __m128i diff0 = _mm_sub_epi16(src0, ref0); - const __m128i diff1 = _mm_sub_epi16(src1, ref1); - - // sum - __m128i vsum = _mm_add_epi16(diff0, diff1); - vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); - vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); - vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2)); - *sum = (int16_t)_mm_extract_epi16(vsum, 0); - - // sse - vsum = _mm_add_epi32(_mm_madd_epi16(diff0, diff0), - _mm_madd_epi16(diff1, diff1)); - vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8)); - vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4)); - *sse = _mm_cvtsi128_si32(vsum); -} - -void vp9_get8x8var_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse, int *sum) { - const __m128i zero = _mm_setzero_si128(); - __m128i vsum = _mm_setzero_si128(); - __m128i vsse = _mm_setzero_si128(); - int i; - - for (i = 0; i < 8; i += 2) { - const __m128i src0 = _mm_unpacklo_epi8(_mm_loadl_epi64( - (const __m128i *)(src + i * src_stride)), zero); - const __m128i ref0 = _mm_unpacklo_epi8(_mm_loadl_epi64( - (const __m128i *)(ref + i * ref_stride)), zero); - const __m128i diff0 = _mm_sub_epi16(src0, ref0); - - const __m128i src1 = _mm_unpacklo_epi8(_mm_loadl_epi64( - (const __m128i *)(src + (i + 1) * src_stride)), zero); - const __m128i ref1 = _mm_unpacklo_epi8(_mm_loadl_epi64( - (const __m128i *)(ref + (i + 1) * ref_stride)), zero); - const __m128i diff1 = _mm_sub_epi16(src1, ref1); - - vsum = _mm_add_epi16(vsum, diff0); - vsum = _mm_add_epi16(vsum, diff1); - vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0)); - vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1)); - } - - // sum - vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); - vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); - vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2)); - *sum = (int16_t)_mm_extract_epi16(vsum, 0); - - // sse - vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8)); - vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4)); - *sse = _mm_cvtsi128_si32(vsse); -} - -void vp9_get16x16var_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse, int *sum) { - const __m128i zero = _mm_setzero_si128(); - __m128i vsum = _mm_setzero_si128(); - __m128i vsse = _mm_setzero_si128(); - int i; - - for (i = 0; i < 16; ++i) { - const __m128i s = _mm_loadu_si128((const __m128i *)src); - const __m128i r = _mm_loadu_si128((const __m128i *)ref); - - const __m128i src0 = _mm_unpacklo_epi8(s, zero); - const __m128i ref0 = _mm_unpacklo_epi8(r, zero); - const __m128i diff0 = _mm_sub_epi16(src0, ref0); - - const __m128i src1 = _mm_unpackhi_epi8(s, zero); - const __m128i ref1 = _mm_unpackhi_epi8(r, zero); - const __m128i diff1 = _mm_sub_epi16(src1, ref1); - - vsum = _mm_add_epi16(vsum, diff0); - vsum = _mm_add_epi16(vsum, diff1); - vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0)); - vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1)); - - src += src_stride; - ref += ref_stride; - } - - // sum - vsum = _mm_add_epi16(vsum, 
_mm_srli_si128(vsum, 8)); - vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); - *sum = (int16_t)_mm_extract_epi16(vsum, 0) + - (int16_t)_mm_extract_epi16(vsum, 1); - - // sse - vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8)); - vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4)); - *sse = _mm_cvtsi128_si32(vsse); -} - - -static void variance_sse2(const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, - int w, int h, unsigned int *sse, int *sum, - variance_fn_t var_fn, int block_size) { - int i, j; - - *sse = 0; - *sum = 0; - - for (i = 0; i < h; i += block_size) { - for (j = 0; j < w; j += block_size) { - unsigned int sse0; - int sum0; - var_fn(src + src_stride * i + j, src_stride, - ref + ref_stride * i + j, ref_stride, &sse0, &sum0); - *sse += sse0; - *sum += sum0; - } - } -} - -unsigned int vp9_variance4x4_sse2(const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, - unsigned int *sse) { - int sum; - get4x4var_sse2(src, src_stride, ref, ref_stride, sse, &sum); - return *sse - (((unsigned int)sum * sum) >> 4); -} - -unsigned int vp9_variance8x4_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 8, 4, - sse, &sum, get4x4var_sse2, 4); - return *sse - (((unsigned int)sum * sum) >> 5); -} - -unsigned int vp9_variance4x8_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 4, 8, - sse, &sum, get4x4var_sse2, 4); - return *sse - (((unsigned int)sum * sum) >> 5); -} - -unsigned int vp9_variance8x8_sse2(const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, - unsigned int *sse) { - int sum; - vp9_get8x8var_sse2(src, src_stride, ref, ref_stride, sse, &sum); - return *sse - (((unsigned int)sum * sum) >> 6); -} - -unsigned int vp9_variance16x8_sse2(const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 16, 8, - sse, &sum, vp9_get8x8var_sse2, 8); - return *sse - (((unsigned int)sum * sum) >> 7); -} - -unsigned int vp9_variance8x16_sse2(const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 8, 16, - sse, &sum, vp9_get8x8var_sse2, 8); - return *sse - (((unsigned int)sum * sum) >> 7); -} - -unsigned int vp9_variance16x16_sse2(const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, - unsigned int *sse) { - int sum; - vp9_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum); - return *sse - (((unsigned int)sum * sum) >> 8); -} - -unsigned int vp9_variance32x32_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 32, 32, - sse, &sum, vp9_get16x16var_sse2, 16); - return *sse - (((int64_t)sum * sum) >> 10); -} - -unsigned int vp9_variance32x16_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 32, 16, - sse, &sum, vp9_get16x16var_sse2, 16); - return *sse - (((int64_t)sum * sum) >> 9); -} - -unsigned int vp9_variance16x32_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned 
int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 16, 32, - sse, &sum, vp9_get16x16var_sse2, 16); - return *sse - (((int64_t)sum * sum) >> 9); -} - -unsigned int vp9_variance64x64_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 64, 64, - sse, &sum, vp9_get16x16var_sse2, 16); - return *sse - (((int64_t)sum * sum) >> 12); -} - -unsigned int vp9_variance64x32_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 64, 32, - sse, &sum, vp9_get16x16var_sse2, 16); - return *sse - (((int64_t)sum * sum) >> 11); -} - -unsigned int vp9_variance32x64_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 32, 64, - sse, &sum, vp9_get16x16var_sse2, 16); - return *sse - (((int64_t)sum * sum) >> 11); -} - -unsigned int vp9_mse8x8_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - vp9_variance8x8_sse2(src, src_stride, ref, ref_stride, sse); - return *sse; -} - -unsigned int vp9_mse8x16_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - vp9_variance8x16_sse2(src, src_stride, ref, ref_stride, sse); - return *sse; -} - -unsigned int vp9_mse16x8_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - vp9_variance16x8_sse2(src, src_stride, ref, ref_stride, sse); - return *sse; -} - -unsigned int vp9_mse16x16_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - vp9_variance16x16_sse2(src, src_stride, ref, ref_stride, sse); - return *sse; -} - // The 2 unused parameters are place holders for PIC enabled build. #define DECL(w, opt) \ int vp9_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \ diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index bd0d18c..5415215 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -102,13 +102,11 @@ VP9_CX_SRCS-yes += encoder/vp9_temporal_filter.h VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h -VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_impl_intrin_avx2.c VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_avg_intrin_sse2.c VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) -VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_variance_impl_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_quantize_intrin_sse2.c VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c endif diff --git a/vpx_dsp/arm/variance_media.asm b/vpx_dsp/arm/variance_media.asm new file mode 100644 index 0000000..4372f53 --- /dev/null +++ b/vpx_dsp/arm/variance_media.asm @@ -0,0 +1,363 @@ +; +; Copyright (c) 2011 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
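
The x86 kernels removed above and the ARM media routine added below compute the same quantity. As a reading aid for the assembly, here is a plain-C model of what |vpx_variance16x16_media| returns (the name variance16x16_model is illustrative; the final lsr #8 in the routine is division by 16 * 16):

#include <stdint.h>

static unsigned int variance16x16_model(const uint8_t *src, int src_stride,
                                        const uint8_t *ref, int ref_stride,
                                        unsigned int *sse) {
  int sum = 0;
  unsigned int sse_acc = 0;
  int r, c;
  for (r = 0; r < 16; ++r) {
    for (c = 0; c < 16; ++c) {
      const int diff = src[c] - ref[c];
      sum += diff;
      sse_acc += (unsigned int)(diff * diff);
    }
    src += src_stride;
    ref += ref_stride;
  }
  *sse = sse_acc;
  /* >> 8 is division by 16 * 16, matching "sse - ((sum * sum) >> 8)". */
  return sse_acc - (unsigned int)(((int64_t)sum * sum) >> 8);
}
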
+; + + + EXPORT |vpx_variance16x16_media| + EXPORT |vpx_variance8x8_media| + EXPORT |vpx_mse16x16_media| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 unsigned char *src_ptr +; r1 int source_stride +; r2 unsigned char *ref_ptr +; r3 int recon_stride +; stack unsigned int *sse +|vpx_variance16x16_media| PROC + + stmfd sp!, {r4-r12, lr} + + pld [r0, r1, lsl #0] + pld [r2, r3, lsl #0] + + mov r8, #0 ; initialize sum = 0 + mov r11, #0 ; initialize sse = 0 + mov r12, #16 ; set loop counter to 16 (=block height) + +loop16x16 + ; 1st 4 pixels + ldr r4, [r0, #0] ; load 4 src pixels + ldr r5, [r2, #0] ; load 4 ref pixels + + mov lr, #0 ; constant zero + + usub8 r6, r4, r5 ; calculate difference + pld [r0, r1, lsl #1] + sel r7, r6, lr ; select bytes with positive difference + usub8 r9, r5, r4 ; calculate difference with reversed operands + pld [r2, r3, lsl #1] + sel r6, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + ; calculate total sum + adds r8, r8, r4 ; add positive differences to sum + subs r8, r8, r5 ; subtract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r10, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 2nd 4 pixels + ldr r4, [r0, #4] ; load 4 src pixels + ldr r5, [r2, #4] ; load 4 ref pixels + smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) + + usub8 r6, r4, r5 ; calculate difference + sel r7, r6, lr ; select bytes with positive difference + usub8 r9, r5, r4 ; calculate difference with reversed operands + sel r6, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; subtract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r10, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 3rd 4 pixels + ldr r4, [r0, #8] ; load 4 src pixels + ldr r5, [r2, #8] ; load 4 ref pixels + smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) + + usub8 r6, r4, r5 ; calculate difference + sel r7, r6, lr ; select bytes with positive difference + usub8 r9, r5, r4 ; calculate difference with reversed operands + sel r6, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; subtract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r10, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 4th 4 pixels + ldr r4, [r0, #12] ; load 4 src pixels + ldr r5, [r2, #12] ; load 4 ref pixels + smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) + + usub8 r6, r4, r5 ; calculate difference + add 
r0, r0, r1 ; set src_ptr to next row + sel r7, r6, lr ; select bytes with positive difference + usub8 r9, r5, r4 ; calculate difference with reversed operands + add r2, r2, r3 ; set dst_ptr to next row + sel r6, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; subtract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r10, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) + + + subs r12, r12, #1 + + bne loop16x16 + + ; return stuff + ldr r6, [sp, #40] ; get address of sse + mul r0, r8, r8 ; sum * sum + str r11, [r6] ; store sse + sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8)) + + ldmfd sp!, {r4-r12, pc} + + ENDP + + END + + +; r0 unsigned char *src_ptr +; r1 int source_stride +; r2 unsigned char *ref_ptr +; r3 int recon_stride +; stack unsigned int *sse +|vpx_variance8x8_media| PROC + + push {r4-r10, lr} + + pld [r0, r1, lsl #0] + pld [r2, r3, lsl #0] + + mov r12, #8 ; set loop counter to 8 (=block height) + mov r4, #0 ; initialize sum = 0 + mov r5, #0 ; initialize sse = 0 + +loop8x8 + ; 1st 4 pixels + ldr r6, [r0, #0x0] ; load 4 src pixels + ldr r7, [r2, #0x0] ; load 4 ref pixels + + mov lr, #0 ; constant zero + + usub8 r8, r6, r7 ; calculate difference + pld [r0, r1, lsl #1] + sel r10, r8, lr ; select bytes with positive difference + usub8 r9, r7, r6 ; calculate difference with reversed operands + pld [r2, r3, lsl #1] + sel r8, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r6, r10, lr ; calculate sum of positive differences + usad8 r7, r8, lr ; calculate sum of negative differences + orr r8, r8, r10 ; differences of all 4 pixels + ; calculate total sum + add r4, r4, r6 ; add positive differences to sum + sub r4, r4, r7 ; subtract negative differences from sum + + ; calculate sse + uxtb16 r7, r8 ; byte (two pixels) to halfwords + uxtb16 r10, r8, ror #8 ; another two pixels to halfwords + smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1) + + ; 2nd 4 pixels + ldr r6, [r0, #0x4] ; load 4 src pixels + ldr r7, [r2, #0x4] ; load 4 ref pixels + smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2) + + usub8 r8, r6, r7 ; calculate difference + add r0, r0, r1 ; set src_ptr to next row + sel r10, r8, lr ; select bytes with positive difference + usub8 r9, r7, r6 ; calculate difference with reversed operands + add r2, r2, r3 ; set dst_ptr to next row + sel r8, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r6, r10, lr ; calculate sum of positive differences + usad8 r7, r8, lr ; calculate sum of negative differences + orr r8, r8, r10 ; differences of all 4 pixels + + ; calculate total sum + add r4, r4, r6 ; add positive differences to sum + sub r4, r4, r7 ; subtract negative differences from sum + + ; calculate sse + uxtb16 r7, r8 ; byte (two pixels) to halfwords + uxtb16 r10, r8, ror #8 ; another two pixels to halfwords + smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1) + subs r12, r12, #1 ; next row + smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2) + + bne 
loop8x8 + + ; return stuff + ldr r8, [sp, #32] ; get address of sse + mul r1, r4, r4 ; sum * sum + str r5, [r8] ; store sse + sub r0, r5, r1, ASR #6 ; return (sse - ((sum * sum) >> 6)) + + pop {r4-r10, pc} + + ENDP + + END + +; r0 unsigned char *src_ptr +; r1 int source_stride +; r2 unsigned char *ref_ptr +; r3 int recon_stride +; stack unsigned int *sse +; +;note: Based on vpx_variance16x16_media. In this function, sum is never used. +; So, we can remove this part of calculation. + +|vpx_mse16x16_media| PROC + + push {r4-r9, lr} + + pld [r0, r1, lsl #0] + pld [r2, r3, lsl #0] + + mov r12, #16 ; set loop counter to 16 (=block height) + mov r4, #0 ; initialize sse = 0 + +loopmse + ; 1st 4 pixels + ldr r5, [r0, #0x0] ; load 4 src pixels + ldr r6, [r2, #0x0] ; load 4 ref pixels + + mov lr, #0 ; constant zero + + usub8 r8, r5, r6 ; calculate difference + pld [r0, r1, lsl #1] + sel r7, r8, lr ; select bytes with positive difference + usub8 r9, r6, r5 ; calculate difference with reversed operands + pld [r2, r3, lsl #1] + sel r8, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r5, r7, lr ; calculate sum of positive differences + usad8 r6, r8, lr ; calculate sum of negative differences + orr r8, r8, r7 ; differences of all 4 pixels + + ldr r5, [r0, #0x4] ; load 4 src pixels + + ; calculate sse + uxtb16 r6, r8 ; byte (two pixels) to halfwords + uxtb16 r7, r8, ror #8 ; another two pixels to halfwords + smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1) + + ; 2nd 4 pixels + ldr r6, [r2, #0x4] ; load 4 ref pixels + smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) + + usub8 r8, r5, r6 ; calculate difference + sel r7, r8, lr ; select bytes with positive difference + usub8 r9, r6, r5 ; calculate difference with reversed operands + sel r8, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r5, r7, lr ; calculate sum of positive differences + usad8 r6, r8, lr ; calculate sum of negative differences + orr r8, r8, r7 ; differences of all 4 pixels + ldr r5, [r0, #0x8] ; load 4 src pixels + ; calculate sse + uxtb16 r6, r8 ; byte (two pixels) to halfwords + uxtb16 r7, r8, ror #8 ; another two pixels to halfwords + smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1) + + ; 3rd 4 pixels + ldr r6, [r2, #0x8] ; load 4 ref pixels + smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) + + usub8 r8, r5, r6 ; calculate difference + sel r7, r8, lr ; select bytes with positive difference + usub8 r9, r6, r5 ; calculate difference with reversed operands + sel r8, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r5, r7, lr ; calculate sum of positive differences + usad8 r6, r8, lr ; calculate sum of negative differences + orr r8, r8, r7 ; differences of all 4 pixels + + ldr r5, [r0, #0xc] ; load 4 src pixels + + ; calculate sse + uxtb16 r6, r8 ; byte (two pixels) to halfwords + uxtb16 r7, r8, ror #8 ; another two pixels to halfwords + smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1) + + ; 4th 4 pixels + ldr r6, [r2, #0xc] ; load 4 ref pixels + smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) + + usub8 r8, r5, r6 ; calculate difference + add r0, r0, r1 ; set src_ptr to next row + sel r7, r8, lr ; select bytes with positive difference + usub8 r9, r6, r5 ; calculate difference with reversed operands + add r2, r2, r3 ; set dst_ptr to next row + sel r8, r9, lr ; select bytes with negative difference + + ; calculate partial 
sums + usad8 r5, r7, lr ; calculate sum of positive differences + usad8 r6, r8, lr ; calculate sum of negative differences + orr r8, r8, r7 ; differences of all 4 pixels + + subs r12, r12, #1 ; next row + + ; calculate sse + uxtb16 r6, r8 ; byte (two pixels) to halfwords + uxtb16 r7, r8, ror #8 ; another two pixels to halfwords + smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1) + smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) + + bne loopmse + + ; return stuff + ldr r1, [sp, #28] ; get address of sse + mov r0, r4 ; return sse + str r4, [r1] ; store sse + + pop {r4-r9, pc} + + ENDP + + END diff --git a/vpx_dsp/arm/variance_neon.c b/vpx_dsp/arm/variance_neon.c new file mode 100644 index 0000000..1a9792e --- /dev/null +++ b/vpx_dsp/arm/variance_neon.c @@ -0,0 +1,417 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_dsp_rtcd.h" +#include "./vpx_config.h" + +#include "vpx/vpx_integer.h" + +static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) { + const int32x4_t a = vpaddlq_s16(v_16x8); + const int64x2_t b = vpaddlq_s32(a); + const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), + vreinterpret_s32_s64(vget_high_s64(b))); + return vget_lane_s32(c, 0); +} + +static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) { + const int64x2_t b = vpaddlq_s32(v_32x4); + const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), + vreinterpret_s32_s64(vget_high_s64(b))); + return vget_lane_s32(c, 0); +} + +// w * h must be less than 2048 or local variable v_sum may overflow. 
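// To see why: each of the eight int16 lanes of v_sum accumulates
// (w * h) / 8 differences, each in [-255, 255].  For the largest direct
// uses below (32x32 blocks and 64x16 slices, i.e. w * h = 1024) the worst
// case per-lane magnitude is 255 * 128 = 32640, which still fits in an
// int16; at w * h = 2048 it could reach 255 * 256 = 65280 and wrap.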
+static void variance_neon_w8(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + int w, int h, uint32_t *sse, int *sum) { + int i, j; + int16x8_t v_sum = vdupq_n_s16(0); + int32x4_t v_sse_lo = vdupq_n_s32(0); + int32x4_t v_sse_hi = vdupq_n_s32(0); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + const uint8x8_t v_a = vld1_u8(&a[j]); + const uint8x8_t v_b = vld1_u8(&b[j]); + const uint16x8_t v_diff = vsubl_u8(v_a, v_b); + const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff); + v_sum = vaddq_s16(v_sum, sv_diff); + v_sse_lo = vmlal_s16(v_sse_lo, + vget_low_s16(sv_diff), + vget_low_s16(sv_diff)); + v_sse_hi = vmlal_s16(v_sse_hi, + vget_high_s16(sv_diff), + vget_high_s16(sv_diff)); + } + a += a_stride; + b += b_stride; + } + + *sum = horizontal_add_s16x8(v_sum); + *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi)); +} + +void vpx_get8x8var_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse, int *sum) { + variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, sum); +} + +void vpx_get16x16var_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse, int *sum) { + variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, sum); +} + +unsigned int vpx_variance8x8_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum; + variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum); + return *sse - (((int64_t)sum * sum) >> 6); // >> 6 = / 8 * 8 +} + +unsigned int vpx_variance16x16_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum; + variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, &sum); + return *sse - (((int64_t)sum * sum) >> 8); // >> 8 = / 16 * 16 +} + +unsigned int vpx_variance32x32_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum; + variance_neon_w8(a, a_stride, b, b_stride, 32, 32, sse, &sum); + return *sse - (((int64_t)sum * sum) >> 10); // >> 10 = / 32 * 32 +} + +unsigned int vpx_variance32x64_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum1, sum2; + uint32_t sse1, sse2; + variance_neon_w8(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1); + variance_neon_w8(a + (32 * a_stride), a_stride, + b + (32 * b_stride), b_stride, 32, 32, + &sse2, &sum2); + *sse = sse1 + sse2; + sum1 += sum2; + return *sse - (((int64_t)sum1 * sum1) >> 11); // >> 11 = / 32 * 64 +} + +unsigned int vpx_variance64x32_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum1, sum2; + uint32_t sse1, sse2; + variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1); + variance_neon_w8(a + (16 * a_stride), a_stride, + b + (16 * b_stride), b_stride, 64, 16, + &sse2, &sum2); + *sse = sse1 + sse2; + sum1 += sum2; + return *sse - (((int64_t)sum1 * sum1) >> 11); // >> 11 = / 32 * 64 +} + +unsigned int vpx_variance64x64_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum1, sum2; + uint32_t sse1, sse2; + + variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1); + variance_neon_w8(a + (16 * a_stride), a_stride, + b + (16 * b_stride), b_stride, 64, 16, + &sse2, &sum2); + sse1 += sse2; + sum1 += sum2; + + variance_neon_w8(a + (16 * 2 * a_stride), a_stride, + b + (16 * 2 * b_stride), b_stride, + 64, 16, &sse2, &sum2); + sse1 += sse2; + sum1 += sum2; + + variance_neon_w8(a + (16 * 3 * 
a_stride), a_stride, + b + (16 * 3 * b_stride), b_stride, + 64, 16, &sse2, &sum2); + *sse = sse1 + sse2; + sum1 += sum2; + return *sse - (((int64_t)sum1 * sum1) >> 12); // >> 12 = / 64 * 64 +} + +unsigned int vpx_variance16x8_neon( + const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) { + int i; + int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; + uint32x2_t d0u32, d10u32; + int64x1_t d0s64, d1s64; + uint8x16_t q0u8, q1u8, q2u8, q3u8; + uint16x8_t q11u16, q12u16, q13u16, q14u16; + int32x4_t q8s32, q9s32, q10s32; + int64x2_t q0s64, q1s64, q5s64; + + q8s32 = vdupq_n_s32(0); + q9s32 = vdupq_n_s32(0); + q10s32 = vdupq_n_s32(0); + + for (i = 0; i < 4; i++) { + q0u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + q1u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + __builtin_prefetch(src_ptr); + + q2u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + q3u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + __builtin_prefetch(ref_ptr); + + q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8)); + q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8)); + q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8)); + q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8)); + + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); + q9s32 = vmlal_s16(q9s32, d22s16, d22s16); + q10s32 = vmlal_s16(q10s32, d23s16, d23s16); + + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); + q9s32 = vmlal_s16(q9s32, d24s16, d24s16); + q10s32 = vmlal_s16(q10s32, d25s16, d25s16); + + d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); + d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16)); + q9s32 = vmlal_s16(q9s32, d26s16, d26s16); + q10s32 = vmlal_s16(q10s32, d27s16, d27s16); + + d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); + d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16)); + q9s32 = vmlal_s16(q9s32, d28s16, d28s16); + q10s32 = vmlal_s16(q10s32, d29s16, d29s16); + } + + q10s32 = vaddq_s32(q10s32, q9s32); + q0s64 = vpaddlq_s32(q8s32); + q1s64 = vpaddlq_s32(q10s32); + + d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); + d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); + + q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), + vreinterpret_s32_s64(d0s64)); + vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); + + d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7); + d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); + + return vget_lane_u32(d0u32, 0); +} + +unsigned int vpx_variance8x16_neon( + const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) { + int i; + uint8x8_t d0u8, d2u8, d4u8, d6u8; + int16x4_t d22s16, d23s16, d24s16, d25s16; + uint32x2_t d0u32, d10u32; + int64x1_t d0s64, d1s64; + uint16x8_t q11u16, q12u16; + int32x4_t q8s32, q9s32, q10s32; + int64x2_t q0s64, q1s64, q5s64; + + q8s32 = vdupq_n_s32(0); + q9s32 = vdupq_n_s32(0); + q10s32 = vdupq_n_s32(0); + + for (i = 0; i < 8; i++) { + d0u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + d2u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + __builtin_prefetch(src_ptr); + + 
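    // The matching reference rows are loaded next.  vsubl_u8 widens the
    // unsigned byte difference to 16 bits; because the true difference
    // lies in [-255, 255], reinterpreting the wrapped result as int16
    // recovers the signed value.  vpadalq_s16 then folds the differences
    // into the 32-bit sum accumulator (q8s32), while vmlal_s16 squares
    // them into the two 32-bit SSE accumulators (q9s32 and q10s32).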
d4u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + d6u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + __builtin_prefetch(ref_ptr); + + q11u16 = vsubl_u8(d0u8, d4u8); + q12u16 = vsubl_u8(d2u8, d6u8); + + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); + q9s32 = vmlal_s16(q9s32, d22s16, d22s16); + q10s32 = vmlal_s16(q10s32, d23s16, d23s16); + + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); + q9s32 = vmlal_s16(q9s32, d24s16, d24s16); + q10s32 = vmlal_s16(q10s32, d25s16, d25s16); + } + + q10s32 = vaddq_s32(q10s32, q9s32); + q0s64 = vpaddlq_s32(q8s32); + q1s64 = vpaddlq_s32(q10s32); + + d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); + d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); + + q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), + vreinterpret_s32_s64(d0s64)); + vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); + + d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7); + d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); + + return vget_lane_u32(d0u32, 0); +} + +unsigned int vpx_mse16x16_neon( + const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) { + int i; + int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; + int64x1_t d0s64; + uint8x16_t q0u8, q1u8, q2u8, q3u8; + int32x4_t q7s32, q8s32, q9s32, q10s32; + uint16x8_t q11u16, q12u16, q13u16, q14u16; + int64x2_t q1s64; + + q7s32 = vdupq_n_s32(0); + q8s32 = vdupq_n_s32(0); + q9s32 = vdupq_n_s32(0); + q10s32 = vdupq_n_s32(0); + + for (i = 0; i < 8; i++) { // mse16x16_neon_loop + q0u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + q1u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + q2u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + q3u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + + q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8)); + q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8)); + q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8)); + q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8)); + + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); + q7s32 = vmlal_s16(q7s32, d22s16, d22s16); + q8s32 = vmlal_s16(q8s32, d23s16, d23s16); + + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + q9s32 = vmlal_s16(q9s32, d24s16, d24s16); + q10s32 = vmlal_s16(q10s32, d25s16, d25s16); + + d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); + d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); + q7s32 = vmlal_s16(q7s32, d26s16, d26s16); + q8s32 = vmlal_s16(q8s32, d27s16, d27s16); + + d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); + d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); + q9s32 = vmlal_s16(q9s32, d28s16, d28s16); + q10s32 = vmlal_s16(q10s32, d29s16, d29s16); + } + + q7s32 = vaddq_s32(q7s32, q8s32); + q9s32 = vaddq_s32(q9s32, q10s32); + q10s32 = vaddq_s32(q7s32, q9s32); + + q1s64 = vpaddlq_s32(q10s32); + d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); + + vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d0s64), 0); + return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0); +} + +unsigned int vpx_get4x4sse_cs_neon( + const unsigned char *src_ptr, + int source_stride, + 
const unsigned char *ref_ptr, + int recon_stride) { + int16x4_t d22s16, d24s16, d26s16, d28s16; + int64x1_t d0s64; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; + int32x4_t q7s32, q8s32, q9s32, q10s32; + uint16x8_t q11u16, q12u16, q13u16, q14u16; + int64x2_t q1s64; + + d0u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + d4u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + d1u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + d5u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + d2u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + d6u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + d3u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + d7u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + + q11u16 = vsubl_u8(d0u8, d4u8); + q12u16 = vsubl_u8(d1u8, d5u8); + q13u16 = vsubl_u8(d2u8, d6u8); + q14u16 = vsubl_u8(d3u8, d7u8); + + d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16)); + d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16)); + d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16)); + d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16)); + + q7s32 = vmull_s16(d22s16, d22s16); + q8s32 = vmull_s16(d24s16, d24s16); + q9s32 = vmull_s16(d26s16, d26s16); + q10s32 = vmull_s16(d28s16, d28s16); + + q7s32 = vaddq_s32(q7s32, q8s32); + q9s32 = vaddq_s32(q9s32, q10s32); + q9s32 = vaddq_s32(q7s32, q9s32); + + q1s64 = vpaddlq_s32(q9s32); + d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); + + return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0); +} diff --git a/vpx_dsp/sad.c b/vpx_dsp/sad.c index 9783e43..c0c3ff9 100644 --- a/vpx_dsp/sad.c +++ b/vpx_dsp/sad.c @@ -33,6 +33,7 @@ static INLINE unsigned int sad(const uint8_t *a, int a_stride, return sad; } +// TODO(johannkoenig): this moved to vpx_dsp, should be able to clean this up. /* Remove dependency on vp9 variance function by duplicating vp9_comp_avg_pred. * The function averages every corresponding element of the buffers and stores * the value in a third buffer, comp_pred. diff --git a/vpx_dsp/variance.c b/vpx_dsp/variance.c new file mode 100644 index 0000000..084dd7b --- /dev/null +++ b/vpx_dsp/variance.c @@ -0,0 +1,306 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx_ports/mem.h"
+#include "vpx/vpx_integer.h"
+
+unsigned int vpx_get4x4sse_cs_c(const unsigned char *a, int a_stride,
+                                const unsigned char *b, int b_stride) {
+  int distortion = 0;
+  int r, c;
+
+  for (r = 0; r < 4; r++) {
+    for (c = 0; c < 4; c++) {
+      int diff = a[c] - b[c];
+      distortion += diff * diff;
+    }
+
+    a += a_stride;
+    b += b_stride;
+  }
+
+  return distortion;
+}
+
+unsigned int vpx_get_mb_ss_c(const int16_t *a) {
+  unsigned int i, sum = 0;
+
+  for (i = 0; i < 256; ++i) {
+    sum += a[i] * a[i];
+  }
+
+  return sum;
+}
+
+static void variance(const uint8_t *a, int a_stride,
+                     const uint8_t *b, int b_stride,
+                     int w, int h, unsigned int *sse, int *sum) {
+  int i, j;
+
+  *sum = 0;
+  *sse = 0;
+
+  for (i = 0; i < h; i++) {
+    for (j = 0; j < w; j++) {
+      const int diff = a[j] - b[j];
+      *sum += diff;
+      *sse += diff * diff;
+    }
+
+    a += a_stride;
+    b += b_stride;
+  }
+}
+
+#define VAR(W, H) \
+unsigned int vpx_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
+                                       const uint8_t *b, int b_stride, \
+                                       unsigned int *sse) { \
+  int sum; \
+  variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+  return *sse - (((int64_t)sum * sum) / (W * H)); \
+}
+
+/* Identical to the variance call except it takes an additional parameter, sum,
+ * and returns that value using pass-by-reference instead of returning
+ * sse - sum^2 / w*h
+ */
+#define GET_VAR(W, H) \
+void vpx_get##W##x##H##var_c(const uint8_t *a, int a_stride, \
+                             const uint8_t *b, int b_stride, \
+                             unsigned int *sse, int *sum) { \
+  variance(a, a_stride, b, b_stride, W, H, sse, sum); \
+}
+
+/* Identical to the variance call except it does not calculate the
+ * sse - sum^2 / w*h and returns sse in addition to modifying the passed-in
+ * variable.
+ */ +#define MSE(W, H) \ +unsigned int vpx_mse##W##x##H##_c(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + unsigned int *sse) { \ + int sum; \ + variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ + return *sse; \ +} + +VAR(64, 64) +VAR(64, 32) +VAR(32, 64) +VAR(32, 32) +VAR(32, 16) +VAR(16, 32) +VAR(16, 16) +VAR(16, 8) +VAR(8, 16) +VAR(8, 8) +VAR(8, 4) +VAR(4, 8) +VAR(4, 4) + +GET_VAR(16, 16) +GET_VAR(8, 8) + +MSE(16, 16) +MSE(16, 8) +MSE(8, 16) +MSE(8, 8) + +void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, + int height, const uint8_t *ref, int ref_stride) { + int i, j; + + for (i = 0; i < height; i++) { + for (j = 0; j < width; j++) { + const int tmp = pred[j] + ref[j]; + comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1); + } + comp_pred += width; + pred += width; + ref += ref_stride; + } +} + +#if CONFIG_VP9_HIGHBITDEPTH +static void highbd_variance64(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + int w, int h, uint64_t *sse, uint64_t *sum) { + int i, j; + + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + *sum = 0; + *sse = 0; + + for (i = 0; i < h; i++) { + for (j = 0; j < w; j++) { + const int diff = a[j] - b[j]; + *sum += diff; + *sse += diff * diff; + } + a += a_stride; + b += b_stride; + } +} + +static void highbd_8_variance(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + int w, int h, unsigned int *sse, int *sum) { + uint64_t sse_long = 0; + uint64_t sum_long = 0; + highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); + *sse = (unsigned int)sse_long; + *sum = (int)sum_long; +} + +static void highbd_10_variance(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + int w, int h, unsigned int *sse, int *sum) { + uint64_t sse_long = 0; + uint64_t sum_long = 0; + highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); + *sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 4); + *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); +} + +static void highbd_12_variance(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + int w, int h, unsigned int *sse, int *sum) { + uint64_t sse_long = 0; + uint64_t sum_long = 0; + highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); + *sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 8); + *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); +} + +#define HIGHBD_VAR(W, H) \ +unsigned int vpx_highbd_8_variance##W##x##H##_c(const uint8_t *a, \ + int a_stride, \ + const uint8_t *b, \ + int b_stride, \ + unsigned int *sse) { \ + int sum; \ + highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ + return *sse - (((int64_t)sum * sum) / (W * H)); \ +} \ +\ +unsigned int vpx_highbd_10_variance##W##x##H##_c(const uint8_t *a, \ + int a_stride, \ + const uint8_t *b, \ + int b_stride, \ + unsigned int *sse) { \ + int sum; \ + highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ + return *sse - (((int64_t)sum * sum) / (W * H)); \ +} \ +\ +unsigned int vpx_highbd_12_variance##W##x##H##_c(const uint8_t *a, \ + int a_stride, \ + const uint8_t *b, \ + int b_stride, \ + unsigned int *sse) { \ + int sum; \ + highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ + return *sse - (((int64_t)sum * sum) / (W * H)); \ +} + +#define HIGHBD_GET_VAR(S) \ +void vpx_highbd_8_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + unsigned int *sse, int *sum) { \ + highbd_8_variance(src, src_stride, ref, 
ref_stride, S, S, sse, sum); \ +} \ +\ +void vpx_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + unsigned int *sse, int *sum) { \ + highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \ +} \ +\ +void vpx_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + unsigned int *sse, int *sum) { \ + highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \ +} + +#define HIGHBD_MSE(W, H) \ +unsigned int vpx_highbd_8_mse##W##x##H##_c(const uint8_t *src, \ + int src_stride, \ + const uint8_t *ref, \ + int ref_stride, \ + unsigned int *sse) { \ + int sum; \ + highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ + return *sse; \ +} \ +\ +unsigned int vpx_highbd_10_mse##W##x##H##_c(const uint8_t *src, \ + int src_stride, \ + const uint8_t *ref, \ + int ref_stride, \ + unsigned int *sse) { \ + int sum; \ + highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ + return *sse; \ +} \ +\ +unsigned int vpx_highbd_12_mse##W##x##H##_c(const uint8_t *src, \ + int src_stride, \ + const uint8_t *ref, \ + int ref_stride, \ + unsigned int *sse) { \ + int sum; \ + highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ + return *sse; \ +} + +HIGHBD_GET_VAR(8) +HIGHBD_GET_VAR(16) + +HIGHBD_MSE(16, 16) +HIGHBD_MSE(16, 8) +HIGHBD_MSE(8, 16) +HIGHBD_MSE(8, 8) + +HIGHBD_VAR(64, 64) +HIGHBD_VAR(64, 32) +HIGHBD_VAR(32, 64) +HIGHBD_VAR(32, 32) +HIGHBD_VAR(32, 16) +HIGHBD_VAR(16, 32) +HIGHBD_VAR(16, 16) +HIGHBD_VAR(16, 8) +HIGHBD_VAR(8, 16) +HIGHBD_VAR(8, 8) +HIGHBD_VAR(8, 4) +HIGHBD_VAR(4, 8) +HIGHBD_VAR(4, 4) + +void vpx_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred8, + int width, int height, const uint8_t *ref8, + int ref_stride) { + int i, j; + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + for (i = 0; i < height; i++) { + for (j = 0; j < width; j++) { + const int tmp = pred[j] + ref[j]; + comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1); + } + comp_pred += width; + pred += width; + ref += ref_stride; + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 606515d..f23534a 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -17,6 +17,7 @@ DSP_SRCS-$(HAVE_MEDIA) += arm/sad_media$(ASM) DSP_SRCS-$(HAVE_NEON) += arm/sad4d_neon.c DSP_SRCS-$(HAVE_NEON) += arm/sad_neon.c + DSP_SRCS-$(HAVE_MMX) += x86/sad_mmx.asm DSP_SRCS-$(HAVE_SSE2) += x86/sad4d_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/sad_sse2.asm @@ -29,9 +30,28 @@ DSP_SRCS-$(HAVE_AVX2) += x86/sad_avx2.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm + endif # CONFIG_VP9_HIGHBITDEPTH endif # CONFIG_ENCODERS +ifneq ($(filter yes,$(CONFIG_ENCODERS) $(CONFIG_POSTPROC) $(CONFIG_VP9_POSTPROC)),) +DSP_SRCS-yes += variance.c + +DSP_SRCS-$(HAVE_MEDIA) += arm/variance_media$(ASM) +DSP_SRCS-$(HAVE_NEON) += arm/variance_neon.c + +DSP_SRCS-$(HAVE_MMX) += x86/variance_mmx.c +DSP_SRCS-$(HAVE_MMX) += x86/variance_impl_mmx.asm +DSP_SRCS-$(HAVE_SSE2) += x86/variance_sse2.c +DSP_SRCS-$(HAVE_AVX2) += x86/variance_avx2.c +DSP_SRCS-$(HAVE_AVX2) += x86/variance_impl_avx2.c + +ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_sse2.c +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_impl_sse2.asm +endif # CONFIG_VP9_HIGHBITDEPTH +endif # CONFIG_ENCODERS || CONFIG_POSTPROC || 
CONFIG_VP9_POSTPROC + DSP_SRCS-no += $(DSP_SRCS_REMOVE-yes) DSP_SRCS-yes += vpx_dsp_rtcd.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index ebec9ec..55271cf 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -392,4 +392,212 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { } # CONFIG_VP9_HIGHBITDEPTH } # CONFIG_ENCODERS +if (vpx_config("CONFIG_ENCODERS") eq "yes" || vpx_config("CONFIG_POSTPROC") eq "yes" || vpx_config("CONFIG_VP9_POSTPROC") eq "yes") { + +add_proto qw/unsigned int vpx_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_variance64x64 sse2 avx2 neon/; + +add_proto qw/unsigned int vpx_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_variance64x32 sse2 avx2 neon/; + +add_proto qw/unsigned int vpx_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_variance32x64 sse2 neon/; + +add_proto qw/unsigned int vpx_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_variance32x32 sse2 avx2 neon/; + +add_proto qw/unsigned int vpx_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_variance32x16 sse2 avx2/; + +add_proto qw/unsigned int vpx_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_variance16x32 sse2/; + +add_proto qw/unsigned int vpx_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_variance16x16 mmx sse2 avx2 media neon/; + +add_proto qw/unsigned int vpx_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_variance16x8 mmx sse2 neon/; + +add_proto qw/unsigned int vpx_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_variance8x16 mmx sse2 neon/; + +add_proto qw/unsigned int vpx_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_variance8x8 mmx sse2 media neon/; + +add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_variance8x4 sse2/; + +add_proto qw/unsigned int vpx_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_variance4x8 sse2/; + +add_proto qw/unsigned int vpx_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_variance4x4 mmx sse2/; + + +add_proto qw/void vpx_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + specialize qw/vpx_get16x16var sse2 avx2 neon/; + +add_proto qw/void vpx_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + specialize qw/vpx_get8x8var mmx sse2 neon/; + +add_proto qw/unsigned int 
vpx_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/vpx_mse16x16 mmx sse2 avx2 media neon/; + +add_proto qw/unsigned int vpx_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/vpx_mse16x8 sse2/; + +add_proto qw/unsigned int vpx_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/vpx_mse8x16 sse2/; + +add_proto qw/unsigned int vpx_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/vpx_mse8x8 sse2/; + +add_proto qw/unsigned int vpx_get_mb_ss/, "const int16_t *"; + specialize qw/vpx_get_mb_ss mmx sse2/; + +add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride"; + specialize qw/vpx_get4x4sse_cs neon/; + +add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride"; + +if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { + add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_12_variance64x64 sse2/; + + add_proto qw/unsigned int vpx_highbd_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_12_variance64x32 sse2/; + + add_proto qw/unsigned int vpx_highbd_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_12_variance32x64 sse2/; + + add_proto qw/unsigned int vpx_highbd_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_12_variance32x32 sse2/; + + add_proto qw/unsigned int vpx_highbd_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_12_variance32x16 sse2/; + + add_proto qw/unsigned int vpx_highbd_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_12_variance16x32 sse2/; + + add_proto qw/unsigned int vpx_highbd_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_12_variance16x16 sse2/; + + add_proto qw/unsigned int vpx_highbd_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_12_variance16x8 sse2/; + + add_proto qw/unsigned int vpx_highbd_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_12_variance8x16 sse2/; + + add_proto qw/unsigned int vpx_highbd_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_12_variance8x8 sse2/; + + add_proto qw/unsigned int vpx_highbd_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int 
vpx_highbd_12_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_12_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + + add_proto qw/unsigned int vpx_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_10_variance64x64 sse2/; + + add_proto qw/unsigned int vpx_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_10_variance64x32 sse2/; + + add_proto qw/unsigned int vpx_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_10_variance32x64 sse2/; + + add_proto qw/unsigned int vpx_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_10_variance32x32 sse2/; + + add_proto qw/unsigned int vpx_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_10_variance32x16 sse2/; + + add_proto qw/unsigned int vpx_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_10_variance16x32 sse2/; + + add_proto qw/unsigned int vpx_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_10_variance16x16 sse2/; + + add_proto qw/unsigned int vpx_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_10_variance16x8 sse2/; + + add_proto qw/unsigned int vpx_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_10_variance8x16 sse2/; + + add_proto qw/unsigned int vpx_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_10_variance8x8 sse2/; + + add_proto qw/unsigned int vpx_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_10_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_10_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + + add_proto qw/unsigned int vpx_highbd_8_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_8_variance64x64 sse2/; + + add_proto qw/unsigned int vpx_highbd_8_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_8_variance64x32 sse2/; + + add_proto qw/unsigned int vpx_highbd_8_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_8_variance32x64 sse2/; 
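For readers who have not seen the rtcd generator before: each add_proto/specialize
pair in this file becomes a run-time-dispatched entry point in the generated
vpx_dsp_rtcd.h.  The sketch below shows roughly what the x86 side of the 64x64
variance entry expands to; it is illustrative only, not the literal generator
output (in the real header the pointer is declared RTCD_EXTERN and the
assignments live in setup_rtcd_internal(); setup_variance64x64 is a made-up
name for this example).

#include "vpx/vpx_integer.h"
#include "vpx_ports/x86.h"  /* x86_simd_caps() and the HAS_* flags */

unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int source_stride,
                                 const uint8_t *ref_ptr, int ref_stride,
                                 unsigned int *sse);
unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int source_stride,
                                    const uint8_t *ref_ptr, int ref_stride,
                                    unsigned int *sse);
unsigned int vpx_variance64x64_avx2(const uint8_t *src_ptr, int source_stride,
                                    const uint8_t *ref_ptr, int ref_stride,
                                    unsigned int *sse);

/* Callers always go through this pointer; it defaults to the C reference
 * implementation and is bumped to the best version the CPU reports. */
unsigned int (*vpx_variance64x64)(const uint8_t *, int, const uint8_t *, int,
                                  unsigned int *) = vpx_variance64x64_c;

static void setup_variance64x64(void) {
  const int flags = x86_simd_caps();
  if (flags & HAS_SSE2) vpx_variance64x64 = vpx_variance64x64_sse2;
  if (flags & HAS_AVX2) vpx_variance64x64 = vpx_variance64x64_avx2;
}

The generated vpx_dsp_rtcd() entry point (built from vpx_dsp_rtcd.c in the
makefile hunk above) runs this setup once at initialization, so the per-call
overhead is a single indirect call.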
+ + add_proto qw/unsigned int vpx_highbd_8_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_8_variance32x32 sse2/; + + add_proto qw/unsigned int vpx_highbd_8_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_8_variance32x16 sse2/; + + add_proto qw/unsigned int vpx_highbd_8_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_8_variance16x32 sse2/; + + add_proto qw/unsigned int vpx_highbd_8_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_8_variance16x16 sse2/; + + add_proto qw/unsigned int vpx_highbd_8_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_8_variance16x8 sse2/; + + add_proto qw/unsigned int vpx_highbd_8_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_8_variance8x16 sse2/; + + add_proto qw/unsigned int vpx_highbd_8_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/vpx_highbd_8_variance8x8 sse2/; + + add_proto qw/unsigned int vpx_highbd_8_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + + add_proto qw/void vpx_highbd_8_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + add_proto qw/void vpx_highbd_8_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + + add_proto qw/void vpx_highbd_10_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + add_proto qw/void vpx_highbd_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + + add_proto qw/void vpx_highbd_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + add_proto qw/void vpx_highbd_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + + add_proto qw/unsigned int vpx_highbd_8_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/vpx_highbd_8_mse16x16 sse2/; + + add_proto qw/unsigned int vpx_highbd_8_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_mse8x8/, "const uint8_t *src_ptr, int source_stride, const 
uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/vpx_highbd_8_mse8x8 sse2/; + + add_proto qw/unsigned int vpx_highbd_10_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/vpx_highbd_10_mse16x16 sse2/; + + add_proto qw/unsigned int vpx_highbd_10_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_10_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_10_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/vpx_highbd_10_mse8x8 sse2/; + + add_proto qw/unsigned int vpx_highbd_12_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/vpx_highbd_12_mse16x16 sse2/; + + add_proto qw/unsigned int vpx_highbd_12_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_12_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_12_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/vpx_highbd_12_mse8x8 sse2/; + + add_proto qw/void vpx_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride"; +} # CONFIG_VP9_HIGHBITDEPTH +} # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC + 1; diff --git a/vp9/encoder/x86/vp9_highbd_variance_impl_sse2.asm b/vpx_dsp/x86/highbd_variance_impl_sse2.asm similarity index 97% rename from vp9/encoder/x86/vp9_highbd_variance_impl_sse2.asm rename to vpx_dsp/x86/highbd_variance_impl_sse2.asm index 821dd06..923418a 100644 --- a/vp9/encoder/x86/vp9_highbd_variance_impl_sse2.asm +++ b/vpx_dsp/x86/highbd_variance_impl_sse2.asm @@ -11,7 +11,7 @@ %include "vpx_ports/x86_abi_support.asm" -;unsigned int vp9_highbd_calc16x16var_sse2 +;unsigned int vpx_highbd_calc16x16var_sse2 ;( ; unsigned char * src_ptr, ; int source_stride, @@ -20,8 +20,8 @@ ; unsigned int * SSE, ; int * Sum ;) -global sym(vp9_highbd_calc16x16var_sse2) PRIVATE -sym(vp9_highbd_calc16x16var_sse2): +global sym(vpx_highbd_calc16x16var_sse2) PRIVATE +sym(vpx_highbd_calc16x16var_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 @@ -164,7 +164,7 @@ sym(vp9_highbd_calc16x16var_sse2): ret -;unsigned int vp9_highbd_calc8x8var_sse2 +;unsigned int vpx_highbd_calc8x8var_sse2 ;( ; unsigned char * src_ptr, ; int source_stride, @@ -173,8 +173,8 @@ sym(vp9_highbd_calc16x16var_sse2): ; unsigned int * SSE, ; int * Sum ;) -global sym(vp9_highbd_calc8x8var_sse2) PRIVATE -sym(vp9_highbd_calc8x8var_sse2): +global sym(vpx_highbd_calc8x8var_sse2) PRIVATE +sym(vpx_highbd_calc8x8var_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 diff --git a/vpx_dsp/x86/highbd_variance_sse2.c b/vpx_dsp/x86/highbd_variance_sse2.c new file mode 100644 index 0000000..343c047 --- /dev/null +++ b/vpx_dsp/x86/highbd_variance_sse2.c @@ -0,0 +1,245 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include "./vpx_config.h" +#include "vp9/common/vp9_common.h" + +#include "vp9/encoder/vp9_variance.h" +#include "vpx_ports/mem.h" + +typedef uint32_t (*high_variance_fn_t) (const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + uint32_t *sse, int *sum); + +uint32_t vpx_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + uint32_t *sse, int *sum); + +uint32_t vpx_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + uint32_t *sse, int *sum); + +static void highbd_8_variance_sse2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int w, int h, uint32_t *sse, int *sum, + high_variance_fn_t var_fn, int block_size) { + int i, j; + + *sse = 0; + *sum = 0; + + for (i = 0; i < h; i += block_size) { + for (j = 0; j < w; j += block_size) { + unsigned int sse0; + int sum0; + var_fn(src + src_stride * i + j, src_stride, + ref + ref_stride * i + j, ref_stride, &sse0, &sum0); + *sse += sse0; + *sum += sum0; + } + } +} + +static void highbd_10_variance_sse2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int w, int h, uint32_t *sse, int *sum, + high_variance_fn_t var_fn, int block_size) { + int i, j; + uint64_t sse_long = 0; + int64_t sum_long = 0; + + for (i = 0; i < h; i += block_size) { + for (j = 0; j < w; j += block_size) { + unsigned int sse0; + int sum0; + var_fn(src + src_stride * i + j, src_stride, + ref + ref_stride * i + j, ref_stride, &sse0, &sum0); + sse_long += sse0; + sum_long += sum0; + } + } + *sum = ROUND_POWER_OF_TWO(sum_long, 2); + *sse = ROUND_POWER_OF_TWO(sse_long, 4); +} + +static void highbd_12_variance_sse2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int w, int h, uint32_t *sse, int *sum, + high_variance_fn_t var_fn, int block_size) { + int i, j; + uint64_t sse_long = 0; + int64_t sum_long = 0; + + for (i = 0; i < h; i += block_size) { + for (j = 0; j < w; j += block_size) { + unsigned int sse0; + int sum0; + var_fn(src + src_stride * i + j, src_stride, + ref + ref_stride * i + j, ref_stride, &sse0, &sum0); + sse_long += sse0; + sum_long += sum0; + } + } + *sum = ROUND_POWER_OF_TWO(sum_long, 4); + *sse = ROUND_POWER_OF_TWO(sse_long, 8); +} + + +#define HIGH_GET_VAR(S) \ +void vpx_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \ + const uint8_t *ref8, int ref_stride, \ + uint32_t *sse, int *sum) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \ + sse, sum); \ +} \ +\ +void vpx_highbd_10_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \ + const uint8_t *ref8, int ref_stride, \ + uint32_t *sse, int *sum) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \ + sse, sum); \ + *sum = ROUND_POWER_OF_TWO(*sum, 2); \ + *sse = ROUND_POWER_OF_TWO(*sse, 4); \ +} \ +\ +void vpx_highbd_12_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \ + const uint8_t *ref8, int ref_stride, 
\ + uint32_t *sse, int *sum) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \ + sse, sum); \ + *sum = ROUND_POWER_OF_TWO(*sum, 4); \ + *sse = ROUND_POWER_OF_TWO(*sse, 8); \ +} + +HIGH_GET_VAR(16); +HIGH_GET_VAR(8); + +#undef HIGH_GET_VAR + +#define VAR_FN(w, h, block_size, shift) \ +uint32_t vpx_highbd_8_variance##w##x##h##_sse2( \ + const uint8_t *src8, int src_stride, \ + const uint8_t *ref8, int ref_stride, uint32_t *sse) { \ + int sum; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + highbd_8_variance_sse2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \ + vpx_highbd_calc##block_size##x##block_size##var_sse2, \ + block_size); \ + return *sse - (((int64_t)sum * sum) >> shift); \ +} \ +\ +uint32_t vpx_highbd_10_variance##w##x##h##_sse2( \ + const uint8_t *src8, int src_stride, \ + const uint8_t *ref8, int ref_stride, uint32_t *sse) { \ + int sum; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + highbd_10_variance_sse2( \ + src, src_stride, ref, ref_stride, w, h, sse, &sum, \ + vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ + return *sse - (((int64_t)sum * sum) >> shift); \ +} \ +\ +uint32_t vpx_highbd_12_variance##w##x##h##_sse2( \ + const uint8_t *src8, int src_stride, \ + const uint8_t *ref8, int ref_stride, uint32_t *sse) { \ + int sum; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + highbd_12_variance_sse2( \ + src, src_stride, ref, ref_stride, w, h, sse, &sum, \ + vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ + return *sse - (((int64_t)sum * sum) >> shift); \ +} + +VAR_FN(64, 64, 16, 12); +VAR_FN(64, 32, 16, 11); +VAR_FN(32, 64, 16, 11); +VAR_FN(32, 32, 16, 10); +VAR_FN(32, 16, 16, 9); +VAR_FN(16, 32, 16, 9); +VAR_FN(16, 16, 16, 8); +VAR_FN(16, 8, 8, 7); +VAR_FN(8, 16, 8, 7); +VAR_FN(8, 8, 8, 6); + +#undef VAR_FN + +unsigned int vpx_highbd_8_mse16x16_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, + sse, &sum, vpx_highbd_calc16x16var_sse2, 16); + return *sse; +} + +unsigned int vpx_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, + sse, &sum, vpx_highbd_calc16x16var_sse2, 16); + return *sse; +} + +unsigned int vpx_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, + sse, &sum, vpx_highbd_calc16x16var_sse2, 16); + return *sse; +} + +unsigned int vpx_highbd_8_mse8x8_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, + sse, &sum, vpx_highbd_calc8x8var_sse2, 8); + 
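  // As with the C MSE() macro earlier in the patch, the MSE wrappers return
  // the raw SSE; the block sum is computed by the shared helper but unused.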
return *sse; +} + +unsigned int vpx_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, + sse, &sum, vpx_highbd_calc8x8var_sse2, 8); + return *sse; +} + +unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, + sse, &sum, vpx_highbd_calc8x8var_sse2, 8); + return *sse; +} diff --git a/vpx_dsp/x86/variance_avx2.c b/vpx_dsp/x86/variance_avx2.c new file mode 100644 index 0000000..82cef4a --- /dev/null +++ b/vpx_dsp/x86/variance_avx2.c @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include "./vpx_dsp_rtcd.h" + +typedef void (*get_var_avx2)(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse, int *sum); + +void vpx_get32x32var_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse, int *sum); + +static void variance_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int w, int h, unsigned int *sse, int *sum, + get_var_avx2 var_fn, int block_size) { + int i, j; + + *sse = 0; + *sum = 0; + + for (i = 0; i < h; i += 16) { + for (j = 0; j < w; j += block_size) { + unsigned int sse0; + int sum0; + var_fn(&src[src_stride * i + j], src_stride, + &ref[ref_stride * i + j], ref_stride, &sse0, &sum0); + *sse += sse0; + *sum += sum0; + } + } +} + + +unsigned int vpx_variance16x16_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_avx2(src, src_stride, ref, ref_stride, 16, 16, + sse, &sum, vpx_get16x16var_avx2, 16); + return *sse - (((unsigned int)sum * sum) >> 8); +} + +unsigned int vpx_mse16x16_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + vpx_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &sum); + return *sse; +} + +unsigned int vpx_variance32x16_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_avx2(src, src_stride, ref, ref_stride, 32, 16, + sse, &sum, vpx_get32x32var_avx2, 32); + return *sse - (((int64_t)sum * sum) >> 9); +} + +unsigned int vpx_variance32x32_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_avx2(src, src_stride, ref, ref_stride, 32, 32, + sse, &sum, vpx_get32x32var_avx2, 32); + return *sse - (((int64_t)sum * sum) >> 10); +} + +unsigned int vpx_variance64x64_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_avx2(src, src_stride, ref, ref_stride, 64, 64, + sse, &sum, vpx_get32x32var_avx2, 32); + return *sse - (((int64_t)sum * sum) >> 12); +} + +unsigned 
int vpx_variance64x32_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_avx2(src, src_stride, ref, ref_stride, 64, 32, + sse, &sum, vpx_get32x32var_avx2, 32); + return *sse - (((int64_t)sum * sum) >> 11); +} diff --git a/vp9/encoder/x86/vp9_variance_impl_intrin_avx2.c b/vpx_dsp/x86/variance_impl_avx2.c similarity index 98% rename from vp9/encoder/x86/vp9_variance_impl_intrin_avx2.c rename to vpx_dsp/x86/variance_impl_avx2.c index ee76a31..0e40959 100644 --- a/vp9/encoder/x86/vp9_variance_impl_intrin_avx2.c +++ b/vpx_dsp/x86/variance_impl_avx2.c @@ -10,9 +10,9 @@ #include // AVX2 -#include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" -void vp9_get16x16var_avx2(const unsigned char *src_ptr, +void vpx_get16x16var_avx2(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int recon_stride, @@ -123,7 +123,7 @@ void vp9_get16x16var_avx2(const unsigned char *src_ptr, } } -void vp9_get32x32var_avx2(const unsigned char *src_ptr, +void vpx_get32x32var_avx2(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int recon_stride, diff --git a/vpx_dsp/x86/variance_impl_mmx.asm b/vpx_dsp/x86/variance_impl_mmx.asm new file mode 100644 index 0000000..a8d7d99 --- /dev/null +++ b/vpx_dsp/x86/variance_impl_mmx.asm @@ -0,0 +1,424 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +;unsigned int vpx_get_mb_ss_mmx( short *src_ptr ) +global sym(vpx_get_mb_ss_mmx) PRIVATE +sym(vpx_get_mb_ss_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + GET_GOT rbx + push rsi + push rdi + sub rsp, 8 + ; end prolog + + mov rax, arg(0) ;src_ptr + mov rcx, 16 + pxor mm4, mm4 + +.NEXTROW: + movq mm0, [rax] + movq mm1, [rax+8] + movq mm2, [rax+16] + movq mm3, [rax+24] + pmaddwd mm0, mm0 + pmaddwd mm1, mm1 + pmaddwd mm2, mm2 + pmaddwd mm3, mm3 + + paddd mm4, mm0 + paddd mm4, mm1 + paddd mm4, mm2 + paddd mm4, mm3 + + add rax, 32 + dec rcx + ja .NEXTROW + movq QWORD PTR [rsp], mm4 + + ;return sum[0]+sum[1]; + movsxd rax, dword ptr [rsp] + movsxd rcx, dword ptr [rsp+4] + add rax, rcx + + + ; begin epilog + add rsp, 8 + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void vpx_get8x8var_mmx +;( +; unsigned char *src_ptr, +; int source_stride, +; unsigned char *ref_ptr, +; int recon_stride, +; unsigned int *SSE, +; int *Sum +;) +global sym(vpx_get8x8var_mmx) PRIVATE +sym(vpx_get8x8var_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + push rbx + sub rsp, 16 + ; end prolog + + + pxor mm5, mm5 ; Blank mmx6 + pxor mm6, mm6 ; Blank mmx7 + pxor mm7, mm7 ; Blank mmx7 + + mov rax, arg(0) ;[src_ptr] ; Load base addresses + mov rbx, arg(2) ;[ref_ptr] + movsxd rcx, dword ptr arg(1) ;[source_stride] + movsxd rdx, dword ptr arg(3) ;[recon_stride] + + ; Row 1 + movq mm0, [rax] ; Copy eight bytes to mm0 + movq mm1, [rbx] ; Copy eight bytes to mm1 + movq mm2, mm0 ; Take copies + movq mm3, mm1 ; Take copies + + punpcklbw mm0, mm6 ; unpack to higher prrcision + punpcklbw mm1, mm6 + punpckhbw mm2, mm6 ; unpack to higher prrcision + punpckhbw mm3, mm6 + psubsw mm0, mm1 ; A-B (low 
order) to MM0 + psubsw mm2, mm3 ; A-B (high order) to MM2 + + paddw mm5, mm0 ; accumulate differences in mm5 + paddw mm5, mm2 ; accumulate differences in mm5 + + pmaddwd mm0, mm0 ; square and accumulate + pmaddwd mm2, mm2 ; square and accumulate + add rbx,rdx ; Inc pointer into ref data + add rax,rcx ; Inc pointer into the new data + movq mm1, [rbx] ; Copy eight bytes to mm1 + paddd mm7, mm0 ; accumulate in mm7 + paddd mm7, mm2 ; accumulate in mm7 + + + ; Row 2 + movq mm0, [rax] ; Copy eight bytes to mm0 + movq mm2, mm0 ; Take copies + movq mm3, mm1 ; Take copies + + punpcklbw mm0, mm6 ; unpack to higher prrcision + punpcklbw mm1, mm6 + punpckhbw mm2, mm6 ; unpack to higher prrcision + punpckhbw mm3, mm6 + psubsw mm0, mm1 ; A-B (low order) to MM0 + psubsw mm2, mm3 ; A-B (high order) to MM2 + + paddw mm5, mm0 ; accumulate differences in mm5 + paddw mm5, mm2 ; accumulate differences in mm5 + + pmaddwd mm0, mm0 ; square and accumulate + pmaddwd mm2, mm2 ; square and accumulate + add rbx,rdx ; Inc pointer into ref data + add rax,rcx ; Inc pointer into the new data + movq mm1, [rbx] ; Copy eight bytes to mm1 + paddd mm7, mm0 ; accumulate in mm7 + paddd mm7, mm2 ; accumulate in mm7 + + ; Row 3 + movq mm0, [rax] ; Copy eight bytes to mm0 + movq mm2, mm0 ; Take copies + movq mm3, mm1 ; Take copies + + punpcklbw mm0, mm6 ; unpack to higher prrcision + punpcklbw mm1, mm6 + punpckhbw mm2, mm6 ; unpack to higher prrcision + punpckhbw mm3, mm6 + psubsw mm0, mm1 ; A-B (low order) to MM0 + psubsw mm2, mm3 ; A-B (high order) to MM2 + + paddw mm5, mm0 ; accumulate differences in mm5 + paddw mm5, mm2 ; accumulate differences in mm5 + + pmaddwd mm0, mm0 ; square and accumulate + pmaddwd mm2, mm2 ; square and accumulate + add rbx,rdx ; Inc pointer into ref data + add rax,rcx ; Inc pointer into the new data + movq mm1, [rbx] ; Copy eight bytes to mm1 + paddd mm7, mm0 ; accumulate in mm7 + paddd mm7, mm2 ; accumulate in mm7 + + ; Row 4 + movq mm0, [rax] ; Copy eight bytes to mm0 + movq mm2, mm0 ; Take copies + movq mm3, mm1 ; Take copies + + punpcklbw mm0, mm6 ; unpack to higher prrcision + punpcklbw mm1, mm6 + punpckhbw mm2, mm6 ; unpack to higher prrcision + punpckhbw mm3, mm6 + psubsw mm0, mm1 ; A-B (low order) to MM0 + psubsw mm2, mm3 ; A-B (high order) to MM2 + + paddw mm5, mm0 ; accumulate differences in mm5 + paddw mm5, mm2 ; accumulate differences in mm5 + + pmaddwd mm0, mm0 ; square and accumulate + pmaddwd mm2, mm2 ; square and accumulate + add rbx,rdx ; Inc pointer into ref data + add rax,rcx ; Inc pointer into the new data + movq mm1, [rbx] ; Copy eight bytes to mm1 + paddd mm7, mm0 ; accumulate in mm7 + paddd mm7, mm2 ; accumulate in mm7 + + ; Row 5 + movq mm0, [rax] ; Copy eight bytes to mm0 + movq mm2, mm0 ; Take copies + movq mm3, mm1 ; Take copies + + punpcklbw mm0, mm6 ; unpack to higher prrcision + punpcklbw mm1, mm6 + punpckhbw mm2, mm6 ; unpack to higher prrcision + punpckhbw mm3, mm6 + psubsw mm0, mm1 ; A-B (low order) to MM0 + psubsw mm2, mm3 ; A-B (high order) to MM2 + + paddw mm5, mm0 ; accumulate differences in mm5 + paddw mm5, mm2 ; accumulate differences in mm5 + + pmaddwd mm0, mm0 ; square and accumulate + pmaddwd mm2, mm2 ; square and accumulate + add rbx,rdx ; Inc pointer into ref data + add rax,rcx ; Inc pointer into the new data + movq mm1, [rbx] ; Copy eight bytes to mm1 + ; movq mm4, [rbx + rdx] + paddd mm7, mm0 ; accumulate in mm7 + paddd mm7, mm2 ; accumulate in mm7 + + ; Row 6 + movq mm0, [rax] ; Copy eight bytes to mm0 + movq mm2, mm0 ; Take copies + movq mm3, mm1 ; Take 
copies + + punpcklbw mm0, mm6 ; unpack to higher prrcision + punpcklbw mm1, mm6 + punpckhbw mm2, mm6 ; unpack to higher prrcision + punpckhbw mm3, mm6 + psubsw mm0, mm1 ; A-B (low order) to MM0 + psubsw mm2, mm3 ; A-B (high order) to MM2 + + paddw mm5, mm0 ; accumulate differences in mm5 + paddw mm5, mm2 ; accumulate differences in mm5 + + pmaddwd mm0, mm0 ; square and accumulate + pmaddwd mm2, mm2 ; square and accumulate + add rbx,rdx ; Inc pointer into ref data + add rax,rcx ; Inc pointer into the new data + movq mm1, [rbx] ; Copy eight bytes to mm1 + paddd mm7, mm0 ; accumulate in mm7 + paddd mm7, mm2 ; accumulate in mm7 + + ; Row 7 + movq mm0, [rax] ; Copy eight bytes to mm0 + movq mm2, mm0 ; Take copies + movq mm3, mm1 ; Take copies + + punpcklbw mm0, mm6 ; unpack to higher prrcision + punpcklbw mm1, mm6 + punpckhbw mm2, mm6 ; unpack to higher prrcision + punpckhbw mm3, mm6 + psubsw mm0, mm1 ; A-B (low order) to MM0 + psubsw mm2, mm3 ; A-B (high order) to MM2 + + paddw mm5, mm0 ; accumulate differences in mm5 + paddw mm5, mm2 ; accumulate differences in mm5 + + pmaddwd mm0, mm0 ; square and accumulate + pmaddwd mm2, mm2 ; square and accumulate + add rbx,rdx ; Inc pointer into ref data + add rax,rcx ; Inc pointer into the new data + movq mm1, [rbx] ; Copy eight bytes to mm1 + paddd mm7, mm0 ; accumulate in mm7 + paddd mm7, mm2 ; accumulate in mm7 + + ; Row 8 + movq mm0, [rax] ; Copy eight bytes to mm0 + movq mm2, mm0 ; Take copies + movq mm3, mm1 ; Take copies + + punpcklbw mm0, mm6 ; unpack to higher prrcision + punpcklbw mm1, mm6 + punpckhbw mm2, mm6 ; unpack to higher prrcision + punpckhbw mm3, mm6 + psubsw mm0, mm1 ; A-B (low order) to MM0 + psubsw mm2, mm3 ; A-B (high order) to MM2 + + paddw mm5, mm0 ; accumulate differences in mm5 + paddw mm5, mm2 ; accumulate differences in mm5 + + pmaddwd mm0, mm0 ; square and accumulate + pmaddwd mm2, mm2 ; square and accumulate + add rbx,rdx ; Inc pointer into ref data + add rax,rcx ; Inc pointer into the new data + paddd mm7, mm0 ; accumulate in mm7 + paddd mm7, mm2 ; accumulate in mm7 + + ; Now accumulate the final results. 
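+        ; Added note: mm5 now holds four signed 16-bit partial sums of the
+        ; pixel differences and mm7 holds two 32-bit partial sums of the
+        ; squared differences; the stores and adds below fold them into
+        ; Sum and SSE.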
+ movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory + movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory + movsx rdx, WORD PTR [rsp+8] + movsx rcx, WORD PTR [rsp+10] + movsx rbx, WORD PTR [rsp+12] + movsx rax, WORD PTR [rsp+14] + add rdx, rcx + add rbx, rax + add rdx, rbx ;XSum + movsxd rax, DWORD PTR [rsp] + movsxd rcx, DWORD PTR [rsp+4] + add rax, rcx ;XXSum + mov rsi, arg(4) ;SSE + mov rdi, arg(5) ;Sum + mov dword ptr [rsi], eax + mov dword ptr [rdi], edx + xor rax, rax ; return 0 + + + ; begin epilog + add rsp, 16 + pop rbx + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + + +;void +;vpx_get4x4var_mmx +;( +; unsigned char *src_ptr, +; int source_stride, +; unsigned char *ref_ptr, +; int recon_stride, +; unsigned int *SSE, +; int *Sum +;) +global sym(vpx_get4x4var_mmx) PRIVATE +sym(vpx_get4x4var_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + push rbx + sub rsp, 16 + ; end prolog + + + pxor mm5, mm5 ; Blank mmx6 + pxor mm6, mm6 ; Blank mmx7 + pxor mm7, mm7 ; Blank mmx7 + + mov rax, arg(0) ;[src_ptr] ; Load base addresses + mov rbx, arg(2) ;[ref_ptr] + movsxd rcx, dword ptr arg(1) ;[source_stride] + movsxd rdx, dword ptr arg(3) ;[recon_stride] + + ; Row 1 + movd mm0, [rax] ; Copy four bytes to mm0 + movd mm1, [rbx] ; Copy four bytes to mm1 + punpcklbw mm0, mm6 ; unpack to higher prrcision + punpcklbw mm1, mm6 + psubsw mm0, mm1 ; A-B (low order) to MM0 + paddw mm5, mm0 ; accumulate differences in mm5 + pmaddwd mm0, mm0 ; square and accumulate + add rbx,rdx ; Inc pointer into ref data + add rax,rcx ; Inc pointer into the new data + movd mm1, [rbx] ; Copy four bytes to mm1 + paddd mm7, mm0 ; accumulate in mm7 + + + ; Row 2 + movd mm0, [rax] ; Copy four bytes to mm0 + punpcklbw mm0, mm6 ; unpack to higher prrcision + punpcklbw mm1, mm6 + psubsw mm0, mm1 ; A-B (low order) to MM0 + paddw mm5, mm0 ; accumulate differences in mm5 + + pmaddwd mm0, mm0 ; square and accumulate + add rbx,rdx ; Inc pointer into ref data + add rax,rcx ; Inc pointer into the new data + movd mm1, [rbx] ; Copy four bytes to mm1 + paddd mm7, mm0 ; accumulate in mm7 + + ; Row 3 + movd mm0, [rax] ; Copy four bytes to mm0 + punpcklbw mm0, mm6 ; unpack to higher precision + punpcklbw mm1, mm6 + psubsw mm0, mm1 ; A-B (low order) to MM0 + paddw mm5, mm0 ; accumulate differences in mm5 + + pmaddwd mm0, mm0 ; square and accumulate + add rbx,rdx ; Inc pointer into ref data + add rax,rcx ; Inc pointer into the new data + movd mm1, [rbx] ; Copy four bytes to mm1 + paddd mm7, mm0 ; accumulate in mm7 + + ; Row 4 + movd mm0, [rax] ; Copy four bytes to mm0 + + punpcklbw mm0, mm6 ; unpack to higher prrcision + punpcklbw mm1, mm6 + psubsw mm0, mm1 ; A-B (low order) to MM0 + + paddw mm5, mm0 ; accumulate differences in mm5 + + pmaddwd mm0, mm0 ; square and accumulate + paddd mm7, mm0 ; accumulate in mm7 + + + ; Now accumulate the final results. 
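+        ; Added note: as in the 8x8 kernel above, mm5 carries four 16-bit
+        ; difference sums and mm7 two 32-bit squared-difference sums, which
+        ; are folded into Sum and SSE below.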
+ movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory + movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory + movsx rdx, WORD PTR [rsp+8] + movsx rcx, WORD PTR [rsp+10] + movsx rbx, WORD PTR [rsp+12] + movsx rax, WORD PTR [rsp+14] + add rdx, rcx + add rbx, rax + add rdx, rbx ;XSum + movsxd rax, DWORD PTR [rsp] + movsxd rcx, DWORD PTR [rsp+4] + add rax, rcx ;XXSum + mov rsi, arg(4) ;SSE + mov rdi, arg(5) ;Sum + mov dword ptr [rsi], eax + mov dword ptr [rdi], edx + xor rax, rax ; return 0 + + + ; begin epilog + add rsp, 16 + pop rbx + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret diff --git a/vpx_dsp/x86/variance_mmx.c b/vpx_dsp/x86/variance_mmx.c new file mode 100644 index 0000000..99dd741 --- /dev/null +++ b/vpx_dsp/x86/variance_mmx.c @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" + +extern void vpx_get4x4var_mmx(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse, int *sum); + +unsigned int vpx_variance4x4_mmx(const unsigned char *a, int a_stride, + const unsigned char *b, int b_stride, + unsigned int *sse) { + unsigned int var; + int avg; + + vpx_get4x4var_mmx(a, a_stride, b, b_stride, &var, &avg); + *sse = var; + return (var - (((unsigned int)avg * avg) >> 4)); +} + +unsigned int vpx_variance8x8_mmx(const unsigned char *a, int a_stride, + const unsigned char *b, int b_stride, + unsigned int *sse) { + unsigned int var; + int avg; + + vpx_get8x8var_mmx(a, a_stride, b, b_stride, &var, &avg); + *sse = var; + + return (var - (((unsigned int)avg * avg) >> 6)); +} + +unsigned int vpx_mse16x16_mmx(const unsigned char *a, int a_stride, + const unsigned char *b, int b_stride, + unsigned int *sse) { + unsigned int sse0, sse1, sse2, sse3, var; + int sum0, sum1, sum2, sum3; + + vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0); + vpx_get8x8var_mmx(a + 8, a_stride, b + 8, b_stride, &sse1, &sum1); + vpx_get8x8var_mmx(a + 8 * a_stride, a_stride, + b + 8 * b_stride, b_stride, &sse2, &sum2); + vpx_get8x8var_mmx(a + 8 * a_stride + 8, a_stride, + b + 8 * b_stride + 8, b_stride, &sse3, &sum3); + + var = sse0 + sse1 + sse2 + sse3; + *sse = var; + return var; +} + +unsigned int vpx_variance16x16_mmx(const unsigned char *a, int a_stride, + const unsigned char *b, int b_stride, + unsigned int *sse) { + unsigned int sse0, sse1, sse2, sse3, var; + int sum0, sum1, sum2, sum3, avg; + + vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0); + vpx_get8x8var_mmx(a + 8, a_stride, b + 8, b_stride, &sse1, &sum1); + vpx_get8x8var_mmx(a + 8 * a_stride, a_stride, + b + 8 * b_stride, b_stride, &sse2, &sum2); + vpx_get8x8var_mmx(a + 8 * a_stride + 8, a_stride, + b + 8 * b_stride + 8, b_stride, &sse3, &sum3); + + var = sse0 + sse1 + sse2 + sse3; + avg = sum0 + sum1 + sum2 + sum3; + *sse = var; + return (var - (((unsigned int)avg * avg) >> 8)); +} + +unsigned int vpx_variance16x8_mmx(const unsigned char *a, int a_stride, + const unsigned char *b, int b_stride, + unsigned int *sse) { + unsigned int sse0, sse1, var; + int sum0, sum1, avg; + + vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0); + vpx_get8x8var_mmx(a 
+ 8, a_stride, b + 8, b_stride, &sse1, &sum1); + + var = sse0 + sse1; + avg = sum0 + sum1; + *sse = var; + return (var - (((unsigned int)avg * avg) >> 7)); +} + +unsigned int vpx_variance8x16_mmx(const unsigned char *a, int a_stride, + const unsigned char *b, int b_stride, + unsigned int *sse) { + unsigned int sse0, sse1, var; + int sum0, sum1, avg; + + vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0); + vpx_get8x8var_mmx(a + 8 * a_stride, a_stride, + b + 8 * b_stride, b_stride, &sse1, &sum1); + + var = sse0 + sse1; + avg = sum0 + sum1; + *sse = var; + + return (var - (((unsigned int)avg * avg) >> 7)); +} diff --git a/vpx_dsp/x86/variance_sse2.c b/vpx_dsp/x86/variance_sse2.c new file mode 100644 index 0000000..6256bc5 --- /dev/null +++ b/vpx_dsp/x86/variance_sse2.c @@ -0,0 +1,309 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include // SSE2 + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx_ports/mem.h" + +typedef void (*getNxMvar_fn_t) (const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + unsigned int *sse, int *sum); + +unsigned int vpx_get_mb_ss_sse2(const int16_t *src) { + __m128i vsum = _mm_setzero_si128(); + int i; + + for (i = 0; i < 32; ++i) { + const __m128i v = _mm_loadu_si128((const __m128i *)src); + vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v)); + src += 8; + } + + vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4)); + return _mm_cvtsi128_si32(vsum); +} + +#define READ64(p, stride, i) \ + _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const uint32_t *)(p + i * stride)), \ + _mm_cvtsi32_si128(*(const uint32_t *)(p + (i + 1) * stride))) + +static void get4x4var_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse, int *sum) { + const __m128i zero = _mm_setzero_si128(); + const __m128i src0 = _mm_unpacklo_epi8(READ64(src, src_stride, 0), zero); + const __m128i src1 = _mm_unpacklo_epi8(READ64(src, src_stride, 2), zero); + const __m128i ref0 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 0), zero); + const __m128i ref1 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 2), zero); + const __m128i diff0 = _mm_sub_epi16(src0, ref0); + const __m128i diff1 = _mm_sub_epi16(src1, ref1); + + // sum + __m128i vsum = _mm_add_epi16(diff0, diff1); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2)); + *sum = (int16_t)_mm_extract_epi16(vsum, 0); + + // sse + vsum = _mm_add_epi32(_mm_madd_epi16(diff0, diff0), + _mm_madd_epi16(diff1, diff1)); + vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4)); + *sse = _mm_cvtsi128_si32(vsum); +} + +void vpx_get8x8var_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse, int *sum) { + const __m128i zero = _mm_setzero_si128(); + __m128i vsum = _mm_setzero_si128(); + __m128i vsse = _mm_setzero_si128(); + int i; + + for (i = 0; i < 8; i += 2) { + const __m128i src0 = _mm_unpacklo_epi8(_mm_loadl_epi64( + (const __m128i *)(src + i * 
src_stride)), zero); + const __m128i ref0 = _mm_unpacklo_epi8(_mm_loadl_epi64( + (const __m128i *)(ref + i * ref_stride)), zero); + const __m128i diff0 = _mm_sub_epi16(src0, ref0); + + const __m128i src1 = _mm_unpacklo_epi8(_mm_loadl_epi64( + (const __m128i *)(src + (i + 1) * src_stride)), zero); + const __m128i ref1 = _mm_unpacklo_epi8(_mm_loadl_epi64( + (const __m128i *)(ref + (i + 1) * ref_stride)), zero); + const __m128i diff1 = _mm_sub_epi16(src1, ref1); + + vsum = _mm_add_epi16(vsum, diff0); + vsum = _mm_add_epi16(vsum, diff1); + vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0)); + vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1)); + } + + // sum + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2)); + *sum = (int16_t)_mm_extract_epi16(vsum, 0); + + // sse + vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8)); + vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4)); + *sse = _mm_cvtsi128_si32(vsse); +} + +void vpx_get16x16var_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse, int *sum) { + const __m128i zero = _mm_setzero_si128(); + __m128i vsum = _mm_setzero_si128(); + __m128i vsse = _mm_setzero_si128(); + int i; + + for (i = 0; i < 16; ++i) { + const __m128i s = _mm_loadu_si128((const __m128i *)src); + const __m128i r = _mm_loadu_si128((const __m128i *)ref); + + const __m128i src0 = _mm_unpacklo_epi8(s, zero); + const __m128i ref0 = _mm_unpacklo_epi8(r, zero); + const __m128i diff0 = _mm_sub_epi16(src0, ref0); + + const __m128i src1 = _mm_unpackhi_epi8(s, zero); + const __m128i ref1 = _mm_unpackhi_epi8(r, zero); + const __m128i diff1 = _mm_sub_epi16(src1, ref1); + + vsum = _mm_add_epi16(vsum, diff0); + vsum = _mm_add_epi16(vsum, diff1); + vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0)); + vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1)); + + src += src_stride; + ref += ref_stride; + } + + // sum + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); + *sum = (int16_t)_mm_extract_epi16(vsum, 0) + + (int16_t)_mm_extract_epi16(vsum, 1); + + // sse + vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8)); + vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4)); + *sse = _mm_cvtsi128_si32(vsse); +} + + +static void variance_sse2(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + int w, int h, unsigned int *sse, int *sum, + getNxMvar_fn_t var_fn, int block_size) { + int i, j; + + *sse = 0; + *sum = 0; + + for (i = 0; i < h; i += block_size) { + for (j = 0; j < w; j += block_size) { + unsigned int sse0; + int sum0; + var_fn(src + src_stride * i + j, src_stride, + ref + ref_stride * i + j, ref_stride, &sse0, &sum0); + *sse += sse0; + *sum += sum0; + } + } +} + +unsigned int vpx_variance4x4_sse2(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + unsigned int *sse) { + int sum; + get4x4var_sse2(src, src_stride, ref, ref_stride, sse, &sum); + return *sse - (((unsigned int)sum * sum) >> 4); +} + +unsigned int vpx_variance8x4_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 8, 4, + sse, &sum, get4x4var_sse2, 4); + return *sse - (((unsigned int)sum * sum) >> 5); +} + +unsigned int vpx_variance4x8_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + 
unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 4, 8, + sse, &sum, get4x4var_sse2, 4); + return *sse - (((unsigned int)sum * sum) >> 5); +} + +unsigned int vpx_variance8x8_sse2(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + unsigned int *sse) { + int sum; + vpx_get8x8var_sse2(src, src_stride, ref, ref_stride, sse, &sum); + return *sse - (((unsigned int)sum * sum) >> 6); +} + +unsigned int vpx_variance16x8_sse2(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 16, 8, + sse, &sum, vpx_get8x8var_sse2, 8); + return *sse - (((unsigned int)sum * sum) >> 7); +} + +unsigned int vpx_variance8x16_sse2(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 8, 16, + sse, &sum, vpx_get8x8var_sse2, 8); + return *sse - (((unsigned int)sum * sum) >> 7); +} + +unsigned int vpx_variance16x16_sse2(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + unsigned int *sse) { + int sum; + vpx_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum); + return *sse - (((unsigned int)sum * sum) >> 8); +} + +unsigned int vpx_variance32x32_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 32, 32, + sse, &sum, vpx_get16x16var_sse2, 16); + return *sse - (((int64_t)sum * sum) >> 10); +} + +unsigned int vpx_variance32x16_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 32, 16, + sse, &sum, vpx_get16x16var_sse2, 16); + return *sse - (((int64_t)sum * sum) >> 9); +} + +unsigned int vpx_variance16x32_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 16, 32, + sse, &sum, vpx_get16x16var_sse2, 16); + return *sse - (((int64_t)sum * sum) >> 9); +} + +unsigned int vpx_variance64x64_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 64, 64, + sse, &sum, vpx_get16x16var_sse2, 16); + return *sse - (((int64_t)sum * sum) >> 12); +} + +unsigned int vpx_variance64x32_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 64, 32, + sse, &sum, vpx_get16x16var_sse2, 16); + return *sse - (((int64_t)sum * sum) >> 11); +} + +unsigned int vpx_variance32x64_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 32, 64, + sse, &sum, vpx_get16x16var_sse2, 16); + return *sse - (((int64_t)sum * sum) >> 11); +} + +unsigned int vpx_mse8x8_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + vpx_variance8x8_sse2(src, src_stride, ref, ref_stride, sse); + return *sse; +} + +unsigned int vpx_mse8x16_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + vpx_variance8x16_sse2(src, src_stride, ref, ref_stride, sse); + return *sse; +} + +unsigned int 
vpx_mse16x8_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + vpx_variance16x8_sse2(src, src_stride, ref, ref_stride, sse); + return *sse; +} + +unsigned int vpx_mse16x16_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + vpx_variance16x16_sse2(src, src_stride, ref, ref_stride, sse); + return *sse; +}
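
For reference, every forward variance wrapper added above (MMX, SSE2 and AVX2 alike) follows the same identity: a kernel produces the block's SSE and Sum, and the wrapper returns SSE - Sum^2 / (w * h). Because w * h is a power of two for every supported block size, that division appears in the code as the right shifts (>> 4 for 4x4 up to >> 12 for 64x64, i.e. log2(w * h)). The scalar sketch below is illustrative only; it is not part of this patch, and the name scalar_variance is hypothetical.

#include <stdint.h>

/* Illustrative scalar model of the SIMD kernels above (hypothetical name,
 * not part of the patch). */
static unsigned int scalar_variance(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    int w, int h, unsigned int *sse) {
  int64_t sum = 0;     /* signed sum of pixel differences (src - ref) */
  uint64_t sse64 = 0;  /* sum of squared differences */
  int i, j;

  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int diff = src[i * src_stride + j] - ref[i * ref_stride + j];
      sum += diff;
      sse64 += (int64_t)diff * diff;
    }
  }

  *sse = (unsigned int)sse64;
  /* w * h is a power of two for all block sizes above, so this division
   * matches the ">> log2(w * h)" in the wrappers. */
  return *sse - (unsigned int)((uint64_t)(sum * sum) / (w * h));
}

For example, scalar_variance(src, 64, ref, 64, 16, 16, &sse) models what vpx_variance16x16_mmx/sse2/avx2 compute, with the >> 8 in those wrappers standing in for the division by 256.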