X-Git-Url: http://review.tizen.org/git/?a=blobdiff_plain;f=src%2Fthird_party%2Flibvpx%2Fsource%2Flibvpx%2Fvp9%2Fencoder%2Fx86%2Fvp9_variance_sse2.c;h=b4d2b0ac408d41fcfab53db5f045a6fb68cdd2b3;hb=3545e9f2671f595d2a2f3ee75ca0393b01e35ef6;hp=e935a233a896f967a96c5632a6d7cef59c2c801b;hpb=7d210d4c7e9ba36e635eabc5b5780495f8a63292;p=platform%2Fframework%2Fweb%2Fcrosswalk.git

diff --git a/src/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_sse2.c b/src/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_sse2.c
index e935a23..b4d2b0a 100644
--- a/src/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_sse2.c
+++ b/src/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_sse2.c
@@ -8,6 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <emmintrin.h>  // SSE2
+
 #include "./vpx_config.h"
 
 #include "vp9/encoder/vp9_variance.h"
@@ -17,18 +19,137 @@ typedef unsigned int (*variance_fn_t) (const unsigned char *src, int src_stride,
                                        const unsigned char *ref, int ref_stride,
                                        unsigned int *sse, int *sum);
 
-unsigned int vp9_get4x4var_mmx(const unsigned char *src, int src_stride,
-                               const unsigned char *ref, int ref_stride,
-                               unsigned int *sse, int *sum);
+unsigned int vp9_get_mb_ss_sse2(const int16_t *src) {
+  __m128i vsum = _mm_setzero_si128();
+  int i;
+
+  for (i = 0; i < 32; ++i) {
+    const __m128i v = _mm_loadu_si128((const __m128i *)src);
+    vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v));
+    src += 8;
+  }
+
+  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
+  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
+  return _mm_cvtsi128_si32(vsum);
+}
 
-unsigned int vp9_get8x8var_sse2(const unsigned char *src, int src_stride,
-                                const unsigned char *ref, int ref_stride,
-                                unsigned int *sse, int *sum);
+#define READ64(p, stride, i) \
+  _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const uint32_t *)(p + i * stride)), \
+                    _mm_cvtsi32_si128(*(const uint32_t *)(p + (i + 1) * stride)))
+
+unsigned int vp9_get4x4var_sse2(const uint8_t *src, int 
src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse, int *sum) { + const __m128i zero = _mm_setzero_si128(); + const __m128i src0 = _mm_unpacklo_epi8(READ64(src, src_stride, 0), zero); + const __m128i src1 = _mm_unpacklo_epi8(READ64(src, src_stride, 2), zero); + const __m128i ref0 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 0), zero); + const __m128i ref1 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 2), zero); + const __m128i diff0 = _mm_sub_epi16(src0, ref0); + const __m128i diff1 = _mm_sub_epi16(src1, ref1); + + // sum + __m128i vsum = _mm_add_epi16(diff0, diff1); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2)); + *sum = (int16_t)_mm_extract_epi16(vsum, 0); + + // sse + vsum = _mm_add_epi32(_mm_madd_epi16(diff0, diff0), + _mm_madd_epi16(diff1, diff1)); + vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4)); + *sse = _mm_cvtsi128_si32(vsum); + + return 0; +} + +unsigned int vp9_get8x8var_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse, int *sum) { + const __m128i zero = _mm_setzero_si128(); + __m128i vsum = _mm_setzero_si128(); + __m128i vsse = _mm_setzero_si128(); + int i; + + for (i = 0; i < 8; i += 2) { + const __m128i src0 = _mm_unpacklo_epi8(_mm_loadl_epi64( + (const __m128i *)(src + i * src_stride)), zero); + const __m128i ref0 = _mm_unpacklo_epi8(_mm_loadl_epi64( + (const __m128i *)(ref + i * ref_stride)), zero); + const __m128i diff0 = _mm_sub_epi16(src0, ref0); + + const __m128i src1 = _mm_unpacklo_epi8(_mm_loadl_epi64( + (const __m128i *)(src + (i + 1) * src_stride)), zero); + const __m128i ref1 = _mm_unpacklo_epi8(_mm_loadl_epi64( + (const __m128i *)(ref + (i + 1) * ref_stride)), zero); + const __m128i diff1 = _mm_sub_epi16(src1, ref1); + + vsum = _mm_add_epi16(vsum, diff0); + vsum = _mm_add_epi16(vsum, diff1); + vsse = 
_mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0)); + vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1)); + } + + // sum + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2)); + *sum = (int16_t)_mm_extract_epi16(vsum, 0); + + // sse + vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8)); + vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4)); + *sse = _mm_cvtsi128_si32(vsse); + + return 0; +} + +unsigned int vp9_get16x16var_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse, int *sum) { + const __m128i zero = _mm_setzero_si128(); + __m128i vsum = _mm_setzero_si128(); + __m128i vsse = _mm_setzero_si128(); + int i; + + for (i = 0; i < 16; ++i) { + const __m128i s = _mm_loadu_si128((const __m128i *)src); + const __m128i r = _mm_loadu_si128((const __m128i *)ref); + + const __m128i src0 = _mm_unpacklo_epi8(s, zero); + const __m128i ref0 = _mm_unpacklo_epi8(r, zero); + const __m128i diff0 = _mm_sub_epi16(src0, ref0); + + const __m128i src1 = _mm_unpackhi_epi8(s, zero); + const __m128i ref1 = _mm_unpackhi_epi8(r, zero); + const __m128i diff1 = _mm_sub_epi16(src1, ref1); + + vsum = _mm_add_epi16(vsum, diff0); + vsum = _mm_add_epi16(vsum, diff1); + vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0)); + vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1)); + + src += src_stride; + ref += ref_stride; + } + + // sum + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); + *sum = (int16_t)_mm_extract_epi16(vsum, 0) + + (int16_t)_mm_extract_epi16(vsum, 1); + + // sse + vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8)); + vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4)); + *sse = _mm_cvtsi128_si32(vsse); + + return 0; +} -unsigned int vp9_get16x16var_sse2(const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, - unsigned int 
*sse, int *sum); static void variance_sse2(const unsigned char *src, int src_stride, const unsigned char *ref, int ref_stride, @@ -55,8 +176,7 @@ unsigned int vp9_variance4x4_sse2(const unsigned char *src, int src_stride, const unsigned char *ref, int ref_stride, unsigned int *sse) { int sum; - variance_sse2(src, src_stride, ref, ref_stride, 4, 4, - sse, &sum, vp9_get4x4var_mmx, 4); + vp9_get4x4var_sse2(src, src_stride, ref, ref_stride, sse, &sum); return *sse - (((unsigned int)sum * sum) >> 4); } @@ -65,7 +185,7 @@ unsigned int vp9_variance8x4_sse2(const uint8_t *src, int src_stride, unsigned int *sse) { int sum; variance_sse2(src, src_stride, ref, ref_stride, 8, 4, - sse, &sum, vp9_get4x4var_mmx, 4); + sse, &sum, vp9_get4x4var_sse2, 4); return *sse - (((unsigned int)sum * sum) >> 5); } @@ -74,7 +194,7 @@ unsigned int vp9_variance4x8_sse2(const uint8_t *src, int src_stride, unsigned int *sse) { int sum; variance_sse2(src, src_stride, ref, ref_stride, 4, 8, - sse, &sum, vp9_get4x4var_mmx, 4); + sse, &sum, vp9_get4x4var_sse2, 4); return *sse - (((unsigned int)sum * sum) >> 5); } @@ -82,8 +202,7 @@ unsigned int vp9_variance8x8_sse2(const unsigned char *src, int src_stride, const unsigned char *ref, int ref_stride, unsigned int *sse) { int sum; - variance_sse2(src, src_stride, ref, ref_stride, 8, 8, - sse, &sum, vp9_get8x8var_sse2, 8); + vp9_get8x8var_sse2(src, src_stride, ref, ref_stride, sse, &sum); return *sse - (((unsigned int)sum * sum) >> 6); } @@ -109,17 +228,8 @@ unsigned int vp9_variance16x16_sse2(const unsigned char *src, int src_stride, const unsigned char *ref, int ref_stride, unsigned int *sse) { int sum; - variance_sse2(src, src_stride, ref, ref_stride, 16, 16, - sse, &sum, vp9_get16x16var_sse2, 16); - return *sse - (((unsigned int)sum * sum) >> 8); -} - -unsigned int vp9_mse16x16_sse2(const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, - unsigned int *sse) { - int sum; vp9_get16x16var_sse2(src, src_stride, ref, 
ref_stride, sse, &sum); - return *sse; + return *sse - (((unsigned int)sum * sum) >> 8); } unsigned int vp9_variance32x32_sse2(const uint8_t *src, int src_stride, @@ -176,6 +286,34 @@ unsigned int vp9_variance32x64_sse2(const uint8_t *src, int src_stride, return *sse - (((int64_t)sum * sum) >> 11); } +unsigned int vp9_mse8x8_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + vp9_variance8x8_sse2(src, src_stride, ref, ref_stride, sse); + return *sse; +} + +unsigned int vp9_mse8x16_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + vp9_variance8x16_sse2(src, src_stride, ref, ref_stride, sse); + return *sse; +} + +unsigned int vp9_mse16x8_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + vp9_variance16x8_sse2(src, src_stride, ref, ref_stride, sse); + return *sse; +} + +unsigned int vp9_mse16x16_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + vp9_variance16x16_sse2(src, src_stride, ref, ref_stride, sse); + return *sse; +} + #define DECL(w, opt) \ int vp9_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \ ptrdiff_t src_stride, \