From 4635b0fced6df4b371454b52cfd512c14eec1f76 Mon Sep 17 00:00:00 2001 From: Johann Date: Tue, 30 Oct 2018 14:43:36 -0700 Subject: [PATCH] clang-tidy: normalize variance functions Always use src/ref and _ptr/_stride suffixes. Normalize to [xy]_offset and second_pred. Drop some stray source/recon_strides. BUG=webm:1444 Change-Id: I32362a50988eb84464ab78686348610ea40e5c80 --- vpx_dsp/arm/subpel_variance_neon.c | 104 ++-- vpx_dsp/arm/variance_neon.c | 170 +++--- vpx_dsp/mips/sub_pixel_variance_msa.c | 54 +- vpx_dsp/mips/variance_mmi.c | 639 ++++++++++++----------- vpx_dsp/ppc/variance_vsx.c | 111 ++-- vpx_dsp/variance.c | 557 ++++++++++---------- vpx_dsp/variance.h | 39 +- vpx_dsp/vpx_dsp_rtcd_defs.pl | 362 ++++++------- vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm | 264 +++++----- vpx_dsp/x86/highbd_variance_impl_sse2.asm | 16 +- vpx_dsp/x86/highbd_variance_sse2.c | 78 +-- vpx_dsp/x86/subpel_variance_sse2.asm | 220 ++++---- vpx_dsp/x86/variance_avx2.c | 275 +++++----- vpx_dsp/x86/variance_sse2.c | 353 +++++++------ 14 files changed, 1648 insertions(+), 1594 deletions(-) diff --git a/vpx_dsp/arm/subpel_variance_neon.c b/vpx_dsp/arm/subpel_variance_neon.c index 4f58a78..37bfd1c 100644 --- a/vpx_dsp/arm/subpel_variance_neon.c +++ b/vpx_dsp/arm/subpel_variance_neon.c @@ -97,30 +97,30 @@ static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, // 4xM filter writes an extra row to fdata because it processes two rows at a // time. -#define sub_pixel_varianceNxM(n, m) \ - uint32_t vpx_sub_pixel_variance##n##x##m##_neon( \ - const uint8_t *a, int a_stride, int xoffset, int yoffset, \ - const uint8_t *b, int b_stride, uint32_t *sse) { \ - uint8_t temp0[n * (m + (n == 4 ? 2 : 1))]; \ - uint8_t temp1[n * m]; \ - \ - if (n == 4) { \ - var_filter_block2d_bil_w4(a, temp0, a_stride, 1, (m + 2), \ - bilinear_filters[xoffset]); \ - var_filter_block2d_bil_w4(temp0, temp1, n, n, m, \ - bilinear_filters[yoffset]); \ - } else if (n == 8) { \ - var_filter_block2d_bil_w8(a, temp0, a_stride, 1, (m + 1), \ - bilinear_filters[xoffset]); \ - var_filter_block2d_bil_w8(temp0, temp1, n, n, m, \ - bilinear_filters[yoffset]); \ - } else { \ - var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (m + 1), n, \ - bilinear_filters[xoffset]); \ - var_filter_block2d_bil_w16(temp0, temp1, n, n, m, n, \ - bilinear_filters[yoffset]); \ - } \ - return vpx_variance##n##x##m(temp1, n, b, b_stride, sse); \ +#define sub_pixel_varianceNxM(n, m) \ + uint32_t vpx_sub_pixel_variance##n##x##m##_neon( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ + uint8_t temp0[n * (m + (n == 4 ? 
2 : 1))]; \ + uint8_t temp1[n * m]; \ + \ + if (n == 4) { \ + var_filter_block2d_bil_w4(src_ptr, temp0, src_stride, 1, (m + 2), \ + bilinear_filters[x_offset]); \ + var_filter_block2d_bil_w4(temp0, temp1, n, n, m, \ + bilinear_filters[y_offset]); \ + } else if (n == 8) { \ + var_filter_block2d_bil_w8(src_ptr, temp0, src_stride, 1, (m + 1), \ + bilinear_filters[x_offset]); \ + var_filter_block2d_bil_w8(temp0, temp1, n, n, m, \ + bilinear_filters[y_offset]); \ + } else { \ + var_filter_block2d_bil_w16(src_ptr, temp0, src_stride, 1, (m + 1), n, \ + bilinear_filters[x_offset]); \ + var_filter_block2d_bil_w16(temp0, temp1, n, n, m, n, \ + bilinear_filters[y_offset]); \ + } \ + return vpx_variance##n##x##m(temp1, n, ref_ptr, ref_stride, sse); \ } sub_pixel_varianceNxM(4, 4); @@ -139,34 +139,34 @@ sub_pixel_varianceNxM(64, 64); // 4xM filter writes an extra row to fdata because it processes two rows at a // time. -#define sub_pixel_avg_varianceNxM(n, m) \ - uint32_t vpx_sub_pixel_avg_variance##n##x##m##_neon( \ - const uint8_t *a, int a_stride, int xoffset, int yoffset, \ - const uint8_t *b, int b_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint8_t temp0[n * (m + (n == 4 ? 2 : 1))]; \ - uint8_t temp1[n * m]; \ - \ - if (n == 4) { \ - var_filter_block2d_bil_w4(a, temp0, a_stride, 1, (m + 2), \ - bilinear_filters[xoffset]); \ - var_filter_block2d_bil_w4(temp0, temp1, n, n, m, \ - bilinear_filters[yoffset]); \ - } else if (n == 8) { \ - var_filter_block2d_bil_w8(a, temp0, a_stride, 1, (m + 1), \ - bilinear_filters[xoffset]); \ - var_filter_block2d_bil_w8(temp0, temp1, n, n, m, \ - bilinear_filters[yoffset]); \ - } else { \ - var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (m + 1), n, \ - bilinear_filters[xoffset]); \ - var_filter_block2d_bil_w16(temp0, temp1, n, n, m, n, \ - bilinear_filters[yoffset]); \ - } \ - \ - vpx_comp_avg_pred(temp0, second_pred, n, m, temp1, n); \ - \ - return vpx_variance##n##x##m(temp0, n, b, b_stride, sse); \ +#define sub_pixel_avg_varianceNxM(n, m) \ + uint32_t vpx_sub_pixel_avg_variance##n##x##m##_neon( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint8_t temp0[n * (m + (n == 4 ? 2 : 1))]; \ + uint8_t temp1[n * m]; \ + \ + if (n == 4) { \ + var_filter_block2d_bil_w4(src_ptr, temp0, src_stride, 1, (m + 2), \ + bilinear_filters[x_offset]); \ + var_filter_block2d_bil_w4(temp0, temp1, n, n, m, \ + bilinear_filters[y_offset]); \ + } else if (n == 8) { \ + var_filter_block2d_bil_w8(src_ptr, temp0, src_stride, 1, (m + 1), \ + bilinear_filters[x_offset]); \ + var_filter_block2d_bil_w8(temp0, temp1, n, n, m, \ + bilinear_filters[y_offset]); \ + } else { \ + var_filter_block2d_bil_w16(src_ptr, temp0, src_stride, 1, (m + 1), n, \ + bilinear_filters[x_offset]); \ + var_filter_block2d_bil_w16(temp0, temp1, n, n, m, n, \ + bilinear_filters[y_offset]); \ + } \ + \ + vpx_comp_avg_pred(temp0, second_pred, n, m, temp1, n); \ + \ + return vpx_variance##n##x##m(temp0, n, ref_ptr, ref_stride, sse); \ } sub_pixel_avg_varianceNxM(4, 4); diff --git a/vpx_dsp/arm/variance_neon.c b/vpx_dsp/arm/variance_neon.c index 61c2c16..77b1015 100644 --- a/vpx_dsp/arm/variance_neon.c +++ b/vpx_dsp/arm/variance_neon.c @@ -27,8 +27,9 @@ // this limit. // Process a block of width 4 four rows at a time. 
-static void variance_neon_w4x4(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, int h, uint32_t *sse, int *sum) { +static void variance_neon_w4x4(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, int h, + uint32_t *sse, int *sum) { int i; int16x8_t sum_s16 = vdupq_n_s16(0); int32x4_t sse_lo_s32 = vdupq_n_s32(0); @@ -38,8 +39,8 @@ static void variance_neon_w4x4(const uint8_t *a, int a_stride, const uint8_t *b, assert(h <= 256); for (i = 0; i < h; i += 4) { - const uint8x16_t a_u8 = load_unaligned_u8q(a, a_stride); - const uint8x16_t b_u8 = load_unaligned_u8q(b, b_stride); + const uint8x16_t a_u8 = load_unaligned_u8q(src_ptr, src_stride); + const uint8x16_t b_u8 = load_unaligned_u8q(ref_ptr, ref_stride); const uint16x8_t diff_lo_u16 = vsubl_u8(vget_low_u8(a_u8), vget_low_u8(b_u8)); const uint16x8_t diff_hi_u16 = @@ -61,8 +62,8 @@ static void variance_neon_w4x4(const uint8_t *a, int a_stride, const uint8_t *b, sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_hi_s16), vget_high_s16(diff_hi_s16)); - a += 4 * a_stride; - b += 4 * b_stride; + src_ptr += 4 * src_stride; + ref_ptr += 4 * ref_stride; } *sum = vget_lane_s32(horizontal_add_int16x8(sum_s16), 0); @@ -72,9 +73,9 @@ static void variance_neon_w4x4(const uint8_t *a, int a_stride, const uint8_t *b, } // Process a block of any size where the width is divisible by 16. -static void variance_neon_w16(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, int w, int h, uint32_t *sse, - int *sum) { +static void variance_neon_w16(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, int w, + int h, uint32_t *sse, int *sum) { int i, j; int16x8_t sum_s16 = vdupq_n_s16(0); int32x4_t sse_lo_s32 = vdupq_n_s32(0); @@ -86,8 +87,8 @@ static void variance_neon_w16(const uint8_t *a, int a_stride, const uint8_t *b, for (i = 0; i < h; ++i) { for (j = 0; j < w; j += 16) { - const uint8x16_t a_u8 = vld1q_u8(a + j); - const uint8x16_t b_u8 = vld1q_u8(b + j); + const uint8x16_t a_u8 = vld1q_u8(src_ptr + j); + const uint8x16_t b_u8 = vld1q_u8(ref_ptr + j); const uint16x8_t diff_lo_u16 = vsubl_u8(vget_low_u8(a_u8), vget_low_u8(b_u8)); @@ -110,8 +111,8 @@ static void variance_neon_w16(const uint8_t *a, int a_stride, const uint8_t *b, sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_hi_s16), vget_high_s16(diff_hi_s16)); } - a += a_stride; - b += b_stride; + src_ptr += src_stride; + ref_ptr += ref_stride; } *sum = vget_lane_s32(horizontal_add_int16x8(sum_s16), 0); @@ -121,8 +122,9 @@ static void variance_neon_w16(const uint8_t *a, int a_stride, const uint8_t *b, } // Process a block of width 8 two rows at a time. 
-static void variance_neon_w8x2(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, int h, uint32_t *sse, int *sum) { +static void variance_neon_w8x2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, int h, + uint32_t *sse, int *sum) { int i = 0; int16x8_t sum_s16 = vdupq_n_s16(0); int32x4_t sse_lo_s32 = vdupq_n_s32(0); @@ -132,10 +134,10 @@ static void variance_neon_w8x2(const uint8_t *a, int a_stride, const uint8_t *b, assert(h <= 128); do { - const uint8x8_t a_0_u8 = vld1_u8(a); - const uint8x8_t a_1_u8 = vld1_u8(a + a_stride); - const uint8x8_t b_0_u8 = vld1_u8(b); - const uint8x8_t b_1_u8 = vld1_u8(b + b_stride); + const uint8x8_t a_0_u8 = vld1_u8(src_ptr); + const uint8x8_t a_1_u8 = vld1_u8(src_ptr + src_stride); + const uint8x8_t b_0_u8 = vld1_u8(ref_ptr); + const uint8x8_t b_1_u8 = vld1_u8(ref_ptr + ref_stride); const uint16x8_t diff_0_u16 = vsubl_u8(a_0_u8, b_0_u8); const uint16x8_t diff_1_u16 = vsubl_u8(a_1_u8, b_1_u8); const int16x8_t diff_0_s16 = vreinterpretq_s16_u16(diff_0_u16); @@ -150,8 +152,8 @@ static void variance_neon_w8x2(const uint8_t *a, int a_stride, const uint8_t *b, vget_high_s16(diff_0_s16)); sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_1_s16), vget_high_s16(diff_1_s16)); - a += a_stride + a_stride; - b += b_stride + b_stride; + src_ptr += src_stride + src_stride; + ref_ptr += ref_stride + ref_stride; i += 2; } while (i < h); @@ -161,31 +163,36 @@ static void variance_neon_w8x2(const uint8_t *a, int a_stride, const uint8_t *b, 0); } -void vpx_get8x8var_neon(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, unsigned int *sse, int *sum) { - variance_neon_w8x2(a, a_stride, b, b_stride, 8, sse, sum); +void vpx_get8x8var_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { + variance_neon_w8x2(src_ptr, src_stride, ref_ptr, ref_stride, 8, sse, sum); } -void vpx_get16x16var_neon(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, unsigned int *sse, int *sum) { - variance_neon_w16(a, a_stride, b, b_stride, 16, 16, sse, sum); +void vpx_get16x16var_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { + variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, 16, 16, sse, sum); } -#define varianceNxM(n, m, shift) \ - unsigned int vpx_variance##n##x##m##_neon(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - unsigned int *sse) { \ - int sum; \ - if (n == 4) \ - variance_neon_w4x4(a, a_stride, b, b_stride, m, sse, &sum); \ - else if (n == 8) \ - variance_neon_w8x2(a, a_stride, b, b_stride, m, sse, &sum); \ - else \ - variance_neon_w16(a, a_stride, b, b_stride, n, m, sse, &sum); \ - if (n * m < 16 * 16) \ - return *sse - ((sum * sum) >> shift); \ - else \ - return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \ +#define varianceNxM(n, m, shift) \ + unsigned int vpx_variance##n##x##m##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, unsigned int *sse) { \ + int sum; \ + if (n == 4) \ + variance_neon_w4x4(src_ptr, src_stride, ref_ptr, ref_stride, m, sse, \ + &sum); \ + else if (n == 8) \ + variance_neon_w8x2(src_ptr, src_stride, ref_ptr, ref_stride, m, sse, \ + &sum); \ + else \ + variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, n, m, sse, \ + &sum); \ + if (n * m < 16 * 16) \ + return *sse - ((sum * sum) >> shift); \ + else \ + return *sse - (uint32_t)(((int64_t)sum * sum) >> 
shift); \ } varianceNxM(4, 4, 4); @@ -199,58 +206,66 @@ varianceNxM(16, 32, 9); varianceNxM(32, 16, 9); varianceNxM(32, 32, 10); -unsigned int vpx_variance32x64_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, +unsigned int vpx_variance32x64_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { int sum1, sum2; uint32_t sse1, sse2; - variance_neon_w16(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1); - variance_neon_w16(a + (32 * a_stride), a_stride, b + (32 * b_stride), - b_stride, 32, 32, &sse2, &sum2); + variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, 32, 32, &sse1, + &sum1); + variance_neon_w16(src_ptr + (32 * src_stride), src_stride, + ref_ptr + (32 * ref_stride), ref_stride, 32, 32, &sse2, + &sum2); *sse = sse1 + sse2; sum1 += sum2; return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11); } -unsigned int vpx_variance64x32_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, +unsigned int vpx_variance64x32_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { int sum1, sum2; uint32_t sse1, sse2; - variance_neon_w16(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1); - variance_neon_w16(a + (16 * a_stride), a_stride, b + (16 * b_stride), - b_stride, 64, 16, &sse2, &sum2); + variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, 64, 16, &sse1, + &sum1); + variance_neon_w16(src_ptr + (16 * src_stride), src_stride, + ref_ptr + (16 * ref_stride), ref_stride, 64, 16, &sse2, + &sum2); *sse = sse1 + sse2; sum1 += sum2; return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11); } -unsigned int vpx_variance64x64_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, +unsigned int vpx_variance64x64_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { int sum1, sum2; uint32_t sse1, sse2; - variance_neon_w16(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1); - variance_neon_w16(a + (16 * a_stride), a_stride, b + (16 * b_stride), - b_stride, 64, 16, &sse2, &sum2); + variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, 64, 16, &sse1, + &sum1); + variance_neon_w16(src_ptr + (16 * src_stride), src_stride, + ref_ptr + (16 * ref_stride), ref_stride, 64, 16, &sse2, + &sum2); sse1 += sse2; sum1 += sum2; - variance_neon_w16(a + (16 * 2 * a_stride), a_stride, b + (16 * 2 * b_stride), - b_stride, 64, 16, &sse2, &sum2); + variance_neon_w16(src_ptr + (16 * 2 * src_stride), src_stride, + ref_ptr + (16 * 2 * ref_stride), ref_stride, 64, 16, &sse2, + &sum2); sse1 += sse2; sum1 += sum2; - variance_neon_w16(a + (16 * 3 * a_stride), a_stride, b + (16 * 3 * b_stride), - b_stride, 64, 16, &sse2, &sum2); + variance_neon_w16(src_ptr + (16 * 3 * src_stride), src_stride, + ref_ptr + (16 * 3 * ref_stride), ref_stride, 64, 16, &sse2, + &sum2); *sse = sse1 + sse2; sum1 += sum2; return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 12); } -unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int source_stride, - const unsigned char *ref_ptr, int recon_stride, +unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int src_stride, + const unsigned char *ref_ptr, int ref_stride, unsigned int *sse) { int i; int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; @@ -267,13 +282,13 @@ unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int source_stride, for (i = 0; i < 8; i++) { // mse16x16_neon_loop q0u8 = vld1q_u8(src_ptr); - src_ptr += 
source_stride; + src_ptr += src_stride; q1u8 = vld1q_u8(src_ptr); - src_ptr += source_stride; + src_ptr += src_stride; q2u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; + ref_ptr += ref_stride; q3u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; + ref_ptr += ref_stride; q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8)); q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8)); @@ -312,10 +327,9 @@ unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int source_stride, return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0); } -unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, - int source_stride, +unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, - int recon_stride) { + int ref_stride) { int16x4_t d22s16, d24s16, d26s16, d28s16; int64x1_t d0s64; uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; @@ -324,21 +338,21 @@ unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int64x2_t q1s64; d0u8 = vld1_u8(src_ptr); - src_ptr += source_stride; + src_ptr += src_stride; d4u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; + ref_ptr += ref_stride; d1u8 = vld1_u8(src_ptr); - src_ptr += source_stride; + src_ptr += src_stride; d5u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; + ref_ptr += ref_stride; d2u8 = vld1_u8(src_ptr); - src_ptr += source_stride; + src_ptr += src_stride; d6u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; + ref_ptr += ref_stride; d3u8 = vld1_u8(src_ptr); - src_ptr += source_stride; + src_ptr += src_stride; d7u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; + ref_ptr += ref_stride; q11u16 = vsubl_u8(d0u8, d4u8); q12u16 = vsubl_u8(d1u8, d5u8); diff --git a/vpx_dsp/mips/sub_pixel_variance_msa.c b/vpx_dsp/mips/sub_pixel_variance_msa.c index 313e06f..6c2649d 100644 --- a/vpx_dsp/mips/sub_pixel_variance_msa.c +++ b/vpx_dsp/mips/sub_pixel_variance_msa.c @@ -1619,16 +1619,16 @@ static uint32_t sub_pixel_avg_sse_diff_64width_hv_msa( #define VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(wd, ht) \ uint32_t vpx_sub_pixel_variance##wd##x##ht##_msa( \ - const uint8_t *src, int32_t src_stride, int32_t xoffset, \ - int32_t yoffset, const uint8_t *ref, int32_t ref_stride, \ + const uint8_t *src, int32_t src_stride, int32_t x_offset, \ + int32_t y_offset, const uint8_t *ref, int32_t ref_stride, \ uint32_t *sse) { \ int32_t diff; \ uint32_t var; \ - const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \ - const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \ + const uint8_t *h_filter = bilinear_filters_msa[x_offset]; \ + const uint8_t *v_filter = bilinear_filters_msa[y_offset]; \ \ - if (yoffset) { \ - if (xoffset) { \ + if (y_offset) { \ + if (x_offset) { \ *sse = sub_pixel_sse_diff_##wd##width_hv_msa( \ src, src_stride, ref, ref_stride, h_filter, v_filter, ht, &diff); \ } else { \ @@ -1638,7 +1638,7 @@ static uint32_t sub_pixel_avg_sse_diff_64width_hv_msa( \ var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \ } else { \ - if (xoffset) { \ + if (x_offset) { \ *sse = sub_pixel_sse_diff_##wd##width_h_msa( \ src, src_stride, ref, ref_stride, h_filter, ht, &diff); \ \ @@ -1672,15 +1672,15 @@ VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64); #define VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(wd, ht) \ uint32_t vpx_sub_pixel_avg_variance##wd##x##ht##_msa( \ - const uint8_t *src_ptr, int32_t src_stride, int32_t xoffset, \ - int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride, \ + const uint8_t *src_ptr, int32_t src_stride, int32_t x_offset, \ + int32_t y_offset, const uint8_t *ref_ptr, int32_t ref_stride, \ 
uint32_t *sse, const uint8_t *sec_pred) { \ int32_t diff; \ - const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \ - const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \ + const uint8_t *h_filter = bilinear_filters_msa[x_offset]; \ + const uint8_t *v_filter = bilinear_filters_msa[y_offset]; \ \ - if (yoffset) { \ - if (xoffset) { \ + if (y_offset) { \ + if (x_offset) { \ *sse = sub_pixel_avg_sse_diff_##wd##width_hv_msa( \ src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, \ v_filter, ht, &diff); \ @@ -1690,7 +1690,7 @@ VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64); &diff); \ } \ } else { \ - if (xoffset) { \ + if (x_offset) { \ *sse = sub_pixel_avg_sse_diff_##wd##width_h_msa( \ src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \ &diff); \ @@ -1719,16 +1719,16 @@ VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 32); uint32_t vpx_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr, int32_t src_stride, - int32_t xoffset, int32_t yoffset, + int32_t x_offset, int32_t y_offset, const uint8_t *ref_ptr, int32_t ref_stride, uint32_t *sse, const uint8_t *sec_pred) { int32_t diff; - const uint8_t *h_filter = bilinear_filters_msa[xoffset]; - const uint8_t *v_filter = bilinear_filters_msa[yoffset]; + const uint8_t *h_filter = bilinear_filters_msa[x_offset]; + const uint8_t *v_filter = bilinear_filters_msa[y_offset]; - if (yoffset) { - if (xoffset) { + if (y_offset) { + if (x_offset) { *sse = sub_pixel_avg_sse_diff_32width_hv_msa( src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, v_filter, 64, &diff); @@ -1738,7 +1738,7 @@ uint32_t vpx_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr, v_filter, 64, &diff); } } else { - if (xoffset) { + if (x_offset) { *sse = sub_pixel_avg_sse_diff_32width_h_msa(src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, 64, &diff); @@ -1753,15 +1753,15 @@ uint32_t vpx_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr, #define VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(ht) \ uint32_t vpx_sub_pixel_avg_variance64x##ht##_msa( \ - const uint8_t *src_ptr, int32_t src_stride, int32_t xoffset, \ - int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride, \ + const uint8_t *src_ptr, int32_t src_stride, int32_t x_offset, \ + int32_t y_offset, const uint8_t *ref_ptr, int32_t ref_stride, \ uint32_t *sse, const uint8_t *sec_pred) { \ int32_t diff; \ - const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \ - const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \ + const uint8_t *h_filter = bilinear_filters_msa[x_offset]; \ + const uint8_t *v_filter = bilinear_filters_msa[y_offset]; \ \ - if (yoffset) { \ - if (xoffset) { \ + if (y_offset) { \ + if (x_offset) { \ *sse = sub_pixel_avg_sse_diff_64width_hv_msa( \ src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, \ v_filter, ht, &diff); \ @@ -1771,7 +1771,7 @@ uint32_t vpx_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr, &diff); \ } \ } else { \ - if (xoffset) { \ + if (x_offset) { \ *sse = sub_pixel_avg_sse_diff_64width_h_msa( \ src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \ &diff); \ diff --git a/vpx_dsp/mips/variance_mmi.c b/vpx_dsp/mips/variance_mmi.c index 4af60d3..88908e0 100644 --- a/vpx_dsp/mips/variance_mmi.c +++ b/vpx_dsp/mips/variance_mmi.c @@ -87,10 +87,10 @@ static const uint8_t bilinear_filters[8][2] = { "paddh %[ftmp12], %[ftmp12], %[ftmp6] \n\t" #define VARIANCE_SSE_8 \ - "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" \ - "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" \ - "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t" \ - "gsldrc1 %[ftmp2], 0x00(%[b]) 
\n\t" \ + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" \ + "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t" \ + "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t" \ "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \ "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \ "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \ @@ -101,10 +101,10 @@ static const uint8_t bilinear_filters[8][2] = { #define VARIANCE_SSE_16 \ VARIANCE_SSE_8 \ - "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" \ - "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" \ - "gsldlc1 %[ftmp2], 0x0f(%[b]) \n\t" \ - "gsldrc1 %[ftmp2], 0x08(%[b]) \n\t" \ + "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[ref_ptr]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[ref_ptr]) \n\t" \ "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \ "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \ "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \ @@ -115,11 +115,11 @@ static const uint8_t bilinear_filters[8][2] = { #define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A \ /* calculate fdata3[0]~fdata3[3], store at ftmp2*/ \ - "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" \ - "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" \ + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" \ "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ - "gsldlc1 %[ftmp1], 0x08(%[a]) \n\t" \ - "gsldrc1 %[ftmp1], 0x01(%[a]) \n\t" \ + "gsldlc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x01(%[src_ptr]) \n\t" \ "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ "pmullh %[ftmp2], %[ftmp2], %[filter_x0] \n\t" \ "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \ @@ -129,11 +129,11 @@ static const uint8_t bilinear_filters[8][2] = { #define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B \ /* calculate fdata3[0]~fdata3[3], store at ftmp4*/ \ - "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" \ - "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" \ + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" \ "punpcklbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \ - "gsldlc1 %[ftmp1], 0x08(%[a]) \n\t" \ - "gsldrc1 %[ftmp1], 0x01(%[a]) \n\t" \ + "gsldlc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x01(%[src_ptr]) \n\t" \ "punpcklbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \ "pmullh %[ftmp4], %[ftmp4], %[filter_x0] \n\t" \ "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \ @@ -169,12 +169,12 @@ static const uint8_t bilinear_filters[8][2] = { #define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A \ /* calculate fdata3[0]~fdata3[7], store at ftmp2 and ftmp3*/ \ - "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" \ - "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" \ + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" \ "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ - "gsldlc1 %[ftmp1], 0x08(%[a]) \n\t" \ - "gsldrc1 %[ftmp1], 0x01(%[a]) \n\t" \ + "gsldlc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x01(%[src_ptr]) \n\t" \ "punpcklbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \ "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \ "pmullh %[ftmp2], %[ftmp2], %[filter_x0] \n\t" \ @@ -190,12 +190,12 @@ static const uint8_t bilinear_filters[8][2] = { #define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B \ /* calculate fdata3[0]~fdata3[7], store at ftmp8 and ftmp9*/ \ - "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" \ - "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" \ + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" \ "punpcklbh %[ftmp8], %[ftmp1], %[ftmp0] \n\t" \ "punpckhbh %[ftmp9], %[ftmp1], %[ftmp0] 
\n\t" \ - "gsldlc1 %[ftmp1], 0x08(%[a]) \n\t" \ - "gsldrc1 %[ftmp1], 0x01(%[a]) \n\t" \ + "gsldlc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x01(%[src_ptr]) \n\t" \ "punpcklbh %[ftmp10], %[ftmp1], %[ftmp0] \n\t" \ "punpckhbh %[ftmp11], %[ftmp1], %[ftmp0] \n\t" \ "pmullh %[ftmp8], %[ftmp8], %[filter_x0] \n\t" \ @@ -258,12 +258,12 @@ static const uint8_t bilinear_filters[8][2] = { VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A \ \ /* calculate fdata3[8]~fdata3[15], store at ftmp4 and ftmp5*/ \ - "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" \ - "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" \ + "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \ "punpcklbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \ "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \ - "gsldlc1 %[ftmp1], 0x10(%[a]) \n\t" \ - "gsldrc1 %[ftmp1], 0x09(%[a]) \n\t" \ + "gsldlc1 %[ftmp1], 0x10(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x09(%[src_ptr]) \n\t" \ "punpcklbh %[ftmp6], %[ftmp1], %[ftmp0] \n\t" \ "punpckhbh %[ftmp7], %[ftmp1], %[ftmp0] \n\t" \ "pmullh %[ftmp4], %[ftmp4], %[filter_x0] \n\t" \ @@ -282,12 +282,12 @@ static const uint8_t bilinear_filters[8][2] = { VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B \ \ /* calculate fdata3[8]~fdata3[15], store at ftmp10 and ftmp11*/ \ - "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" \ - "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" \ + "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \ "punpcklbh %[ftmp10], %[ftmp1], %[ftmp0] \n\t" \ "punpckhbh %[ftmp11], %[ftmp1], %[ftmp0] \n\t" \ - "gsldlc1 %[ftmp1], 0x10(%[a]) \n\t" \ - "gsldrc1 %[ftmp1], 0x09(%[a]) \n\t" \ + "gsldlc1 %[ftmp1], 0x10(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x09(%[src_ptr]) \n\t" \ "punpcklbh %[ftmp12], %[ftmp1], %[ftmp0] \n\t" \ "punpckhbh %[ftmp13], %[ftmp1], %[ftmp0] \n\t" \ "pmullh %[ftmp10], %[ftmp10], %[filter_x0] \n\t" \ @@ -357,24 +357,23 @@ static const uint8_t bilinear_filters[8][2] = { // taps should sum to FILTER_WEIGHT. pixel_step defines whether the filter is // applied horizontally (pixel_step = 1) or vertically (pixel_step = stride). // It defines the offset required to move from one input to the next. -static void var_filter_block2d_bil_first_pass(const uint8_t *a, uint16_t *b, - unsigned int src_pixels_per_line, - int pixel_step, - unsigned int output_height, - unsigned int output_width, - const uint8_t *filter) { +static void var_filter_block2d_bil_first_pass( + const uint8_t *src_ptr, uint16_t *ref_ptr, unsigned int src_pixels_per_line, + int pixel_step, unsigned int output_height, unsigned int output_width, + const uint8_t *filter) { unsigned int i, j; for (i = 0; i < output_height; ++i) { for (j = 0; j < output_width; ++j) { - b[j] = ROUND_POWER_OF_TWO( - (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS); + ref_ptr[j] = ROUND_POWER_OF_TWO( + (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1], + FILTER_BITS); - ++a; + ++src_ptr; } - a += src_pixels_per_line - output_width; - b += output_width; + src_ptr += src_pixels_per_line - output_width; + ref_ptr += output_width; } } @@ -387,28 +386,27 @@ static void var_filter_block2d_bil_first_pass(const uint8_t *a, uint16_t *b, // filter is applied horizontally (pixel_step = 1) or vertically // (pixel_step = stride). It defines the offset required to move from one input // to the next. Output is 8-bit. 
-static void var_filter_block2d_bil_second_pass(const uint16_t *a, uint8_t *b, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const uint8_t *filter) { +static void var_filter_block2d_bil_second_pass( + const uint16_t *src_ptr, uint8_t *ref_ptr, unsigned int src_pixels_per_line, + unsigned int pixel_step, unsigned int output_height, + unsigned int output_width, const uint8_t *filter) { unsigned int i, j; for (i = 0; i < output_height; ++i) { for (j = 0; j < output_width; ++j) { - b[j] = ROUND_POWER_OF_TWO( - (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS); - ++a; + ref_ptr[j] = ROUND_POWER_OF_TWO( + (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1], + FILTER_BITS); + ++src_ptr; } - a += src_pixels_per_line - output_width; - b += output_width; + src_ptr += src_pixels_per_line - output_width; + ref_ptr += output_width; } } -static inline uint32_t vpx_variance64x(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, +static inline uint32_t vpx_variance64x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, int high) { int sum; double ftmp[12]; @@ -424,57 +422,57 @@ static inline uint32_t vpx_variance64x(const uint8_t *a, int a_stride, "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t" "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" "1: \n\t" - "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t" - "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_8_FOR_W64 - "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x0f(%[b]) \n\t" - "gsldrc1 %[ftmp2], 0x08(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_8_FOR_W64 - "gsldlc1 %[ftmp1], 0x17(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x10(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x17(%[b]) \n\t" - "gsldrc1 %[ftmp2], 0x10(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x17(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x10(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x17(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x10(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_8_FOR_W64 - "gsldlc1 %[ftmp1], 0x1f(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x18(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x1f(%[b]) \n\t" - "gsldrc1 %[ftmp2], 0x18(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x1f(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x18(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x1f(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x18(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_8_FOR_W64 - "gsldlc1 %[ftmp1], 0x27(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x20(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x27(%[b]) \n\t" - "gsldrc1 %[ftmp2], 0x20(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x27(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x20(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x27(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x20(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_8_FOR_W64 - "gsldlc1 %[ftmp1], 0x2f(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x28(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x2f(%[b]) \n\t" - "gsldrc1 %[ftmp2], 0x28(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x2f(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x28(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x2f(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x28(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_8_FOR_W64 - "gsldlc1 
%[ftmp1], 0x37(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x30(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x37(%[b]) \n\t" - "gsldrc1 %[ftmp2], 0x30(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x37(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x30(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x37(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x30(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_8_FOR_W64 - "gsldlc1 %[ftmp1], 0x3f(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x38(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x3f(%[b]) \n\t" - "gsldrc1 %[ftmp2], 0x38(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x3f(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x38(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x3f(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x38(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_8_FOR_W64 "addiu %[tmp0], %[tmp0], -0x01 \n\t" - MMI_ADDU(%[a], %[a], %[a_stride]) - MMI_ADDU(%[b], %[b], %[b_stride]) + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride]) "bnez %[tmp0], 1b \n\t" "mfc1 %[tmp1], %[ftmp9] \n\t" @@ -491,9 +489,10 @@ static inline uint32_t vpx_variance64x(const uint8_t *a, int a_stride, [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), [tmp2]"=&r"(tmp[2]), - [a]"+&r"(a), [b]"+&r"(b), + [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr), [sum]"=&r"(sum) - : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride), [high]"r"(&high), [sse]"r"(sse) : "memory" ); @@ -501,18 +500,19 @@ static inline uint32_t vpx_variance64x(const uint8_t *a, int a_stride, return *sse - (((int64_t)sum * sum) / (64 * high)); } -#define VPX_VARIANCE64XN(n) \ - uint32_t vpx_variance64x##n##_mmi(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - uint32_t *sse) { \ - return vpx_variance64x(a, a_stride, b, b_stride, sse, n); \ +#define VPX_VARIANCE64XN(n) \ + uint32_t vpx_variance64x##n##_mmi(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse) { \ + return vpx_variance64x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \ } VPX_VARIANCE64XN(64) VPX_VARIANCE64XN(32) -uint32_t vpx_variance32x64_mmi(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, uint32_t *sse) { +uint32_t vpx_variance32x64_mmi(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + uint32_t *sse) { int sum; double ftmp[12]; uint32_t tmp[3]; @@ -527,33 +527,33 @@ uint32_t vpx_variance32x64_mmi(const uint8_t *a, int a_stride, const uint8_t *b, "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t" "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" "1: \n\t" - "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t" - "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_8_FOR_W64 - "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x0f(%[b]) \n\t" - "gsldrc1 %[ftmp2], 0x08(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_8_FOR_W64 - "gsldlc1 %[ftmp1], 0x17(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x10(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x17(%[b]) \n\t" - "gsldrc1 %[ftmp2], 0x10(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x17(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 
0x10(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x17(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x10(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_8_FOR_W64 - "gsldlc1 %[ftmp1], 0x1f(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x18(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x1f(%[b]) \n\t" - "gsldrc1 %[ftmp2], 0x18(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x1f(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x18(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x1f(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x18(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_8_FOR_W64 "addiu %[tmp0], %[tmp0], -0x01 \n\t" - MMI_ADDU(%[a], %[a], %[a_stride]) - MMI_ADDU(%[b], %[b], %[b_stride]) + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride]) "bnez %[tmp0], 1b \n\t" "mfc1 %[tmp1], %[ftmp9] \n\t" @@ -570,9 +570,10 @@ uint32_t vpx_variance32x64_mmi(const uint8_t *a, int a_stride, const uint8_t *b, [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), [tmp2]"=&r"(tmp[2]), - [a]"+&r"(a), [b]"+&r"(b), + [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr), [sum]"=&r"(sum) - : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride), [sse]"r"(sse) : "memory" ); @@ -580,8 +581,8 @@ uint32_t vpx_variance32x64_mmi(const uint8_t *a, int a_stride, const uint8_t *b, return *sse - (((int64_t)sum * sum) / 2048); } -static inline uint32_t vpx_variance32x(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, +static inline uint32_t vpx_variance32x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, int high) { int sum; double ftmp[13]; @@ -598,30 +599,30 @@ static inline uint32_t vpx_variance32x(const uint8_t *a, int a_stride, "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" "xor %[ftmp12], %[ftmp12], %[ftmp12] \n\t" "1: \n\t" - "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t" - "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_8 - "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x0f(%[b]) \n\t" - "gsldrc1 %[ftmp2], 0x08(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_8 - "gsldlc1 %[ftmp1], 0x17(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x10(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x17(%[b]) \n\t" - "gsldrc1 %[ftmp2], 0x10(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x17(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x10(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x17(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x10(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_8 - "gsldlc1 %[ftmp1], 0x1f(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x18(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x1f(%[b]) \n\t" - "gsldrc1 %[ftmp2], 0x18(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x1f(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x18(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x1f(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x18(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_8 "addiu %[tmp0], %[tmp0], -0x01 \n\t" - MMI_ADDU(%[a], %[a], %[a_stride]) - MMI_ADDU(%[b], %[b], %[b_stride]) + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride]) "bnez %[tmp0], 1b \n\t" "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t" @@ -646,8 +647,9 @@ 
static inline uint32_t vpx_variance32x(const uint8_t *a, int a_stride, [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]), - [a]"+&r"(a), [b]"+&r"(b) - : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), + [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride), [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum) : "memory" ); @@ -655,18 +657,18 @@ static inline uint32_t vpx_variance32x(const uint8_t *a, int a_stride, return *sse - (((int64_t)sum * sum) / (32 * high)); } -#define VPX_VARIANCE32XN(n) \ - uint32_t vpx_variance32x##n##_mmi(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - uint32_t *sse) { \ - return vpx_variance32x(a, a_stride, b, b_stride, sse, n); \ +#define VPX_VARIANCE32XN(n) \ + uint32_t vpx_variance32x##n##_mmi(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse) { \ + return vpx_variance32x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \ } VPX_VARIANCE32XN(32) VPX_VARIANCE32XN(16) -static inline uint32_t vpx_variance16x(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, +static inline uint32_t vpx_variance16x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, int high) { int sum; double ftmp[13]; @@ -683,20 +685,20 @@ static inline uint32_t vpx_variance16x(const uint8_t *a, int a_stride, "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" "xor %[ftmp12], %[ftmp12], %[ftmp12] \n\t" "1: \n\t" - "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t" - "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_8 - "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x0f(%[b]) \n\t" - "gsldrc1 %[ftmp2], 0x08(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_8 "addiu %[tmp0], %[tmp0], -0x01 \n\t" - MMI_ADDU(%[a], %[a], %[a_stride]) - MMI_ADDU(%[b], %[b], %[b_stride]) + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride]) "bnez %[tmp0], 1b \n\t" "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t" @@ -721,8 +723,9 @@ static inline uint32_t vpx_variance16x(const uint8_t *a, int a_stride, [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]), - [a]"+&r"(a), [b]"+&r"(b) - : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), + [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride), [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum) : "memory" ); @@ -730,19 +733,19 @@ static inline uint32_t vpx_variance16x(const uint8_t *a, int a_stride, return *sse - (((int64_t)sum * sum) / (16 * high)); } -#define VPX_VARIANCE16XN(n) \ - uint32_t vpx_variance16x##n##_mmi(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - uint32_t *sse) { \ - return vpx_variance16x(a, a_stride, b, b_stride, sse, n); \ +#define VPX_VARIANCE16XN(n) \ + uint32_t 
vpx_variance16x##n##_mmi(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse) { \ + return vpx_variance16x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \ } VPX_VARIANCE16XN(32) VPX_VARIANCE16XN(16) VPX_VARIANCE16XN(8) -static inline uint32_t vpx_variance8x(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, +static inline uint32_t vpx_variance8x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, int high) { int sum; double ftmp[13]; @@ -759,15 +762,15 @@ static inline uint32_t vpx_variance8x(const uint8_t *a, int a_stride, "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" "xor %[ftmp12], %[ftmp12], %[ftmp12] \n\t" "1: \n\t" - "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t" - "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_8 "addiu %[tmp0], %[tmp0], -0x01 \n\t" - MMI_ADDU(%[a], %[a], %[a_stride]) - MMI_ADDU(%[b], %[b], %[b_stride]) + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride]) "bnez %[tmp0], 1b \n\t" "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t" @@ -792,8 +795,9 @@ static inline uint32_t vpx_variance8x(const uint8_t *a, int a_stride, [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]), - [a]"+&r"(a), [b]"+&r"(b) - : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), + [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride), [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum) : "memory" ); @@ -801,19 +805,19 @@ static inline uint32_t vpx_variance8x(const uint8_t *a, int a_stride, return *sse - (((int64_t)sum * sum) / (8 * high)); } -#define VPX_VARIANCE8XN(n) \ - uint32_t vpx_variance8x##n##_mmi(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - uint32_t *sse) { \ - return vpx_variance8x(a, a_stride, b, b_stride, sse, n); \ +#define VPX_VARIANCE8XN(n) \ + uint32_t vpx_variance8x##n##_mmi(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse) { \ + return vpx_variance8x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \ } VPX_VARIANCE8XN(16) VPX_VARIANCE8XN(8) VPX_VARIANCE8XN(4) -static inline uint32_t vpx_variance4x(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, +static inline uint32_t vpx_variance4x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, int high) { int sum; double ftmp[12]; @@ -830,15 +834,15 @@ static inline uint32_t vpx_variance4x(const uint8_t *a, int a_stride, "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" "1: \n\t" - "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t" - "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_4 "addiu %[tmp0], %[tmp0], -0x01 \n\t" - MMI_ADDU(%[a], %[a], %[a_stride]) - MMI_ADDU(%[b], %[b], %[b_stride]) + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + MMI_ADDU(%[ref_ptr], 
%[ref_ptr], %[ref_stride]) "bnez %[tmp0], 1b \n\t" "dsrl %[ftmp9], %[ftmp6], %[ftmp10] \n\t" @@ -862,8 +866,9 @@ static inline uint32_t vpx_variance4x(const uint8_t *a, int a_stride, [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]), [tmp0]"=&r"(tmp[0]), - [a]"+&r"(a), [b]"+&r"(b) - : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), + [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride), [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum) : "memory" ); @@ -871,19 +876,19 @@ static inline uint32_t vpx_variance4x(const uint8_t *a, int a_stride, return *sse - (((int64_t)sum * sum) / (4 * high)); } -#define VPX_VARIANCE4XN(n) \ - uint32_t vpx_variance4x##n##_mmi(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - uint32_t *sse) { \ - return vpx_variance4x(a, a_stride, b, b_stride, sse, n); \ +#define VPX_VARIANCE4XN(n) \ + uint32_t vpx_variance4x##n##_mmi(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse) { \ + return vpx_variance4x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \ } VPX_VARIANCE4XN(8) VPX_VARIANCE4XN(4) -static inline uint32_t vpx_mse16x(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, uint32_t *sse, - uint64_t high) { +static inline uint32_t vpx_mse16x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + uint32_t *sse, uint64_t high) { double ftmp[12]; uint32_t tmp[1]; @@ -900,8 +905,8 @@ static inline uint32_t vpx_mse16x(const uint8_t *a, int a_stride, VARIANCE_SSE_16 "addiu %[tmp0], %[tmp0], -0x01 \n\t" - MMI_ADDU(%[a], %[a], %[a_stride]) - MMI_ADDU(%[b], %[b], %[b_stride]) + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride]) "bnez %[tmp0], 1b \n\t" "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t" @@ -914,8 +919,9 @@ static inline uint32_t vpx_mse16x(const uint8_t *a, int a_stride, [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]), - [a]"+&r"(a), [b]"+&r"(b) - : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), + [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride), [high]"r"(&high), [sse]"r"(sse) : "memory" ); @@ -923,19 +929,19 @@ static inline uint32_t vpx_mse16x(const uint8_t *a, int a_stride, return *sse; } -#define vpx_mse16xN(n) \ - uint32_t vpx_mse16x##n##_mmi(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - uint32_t *sse) { \ - return vpx_mse16x(a, a_stride, b, b_stride, sse, n); \ +#define vpx_mse16xN(n) \ + uint32_t vpx_mse16x##n##_mmi(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse) { \ + return vpx_mse16x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \ } vpx_mse16xN(16); vpx_mse16xN(8); -static inline uint32_t vpx_mse8x(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, uint32_t *sse, - uint64_t high) { +static inline uint32_t vpx_mse8x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + uint32_t *sse, uint64_t high) { double ftmp[12]; uint32_t tmp[1]; @@ -952,8 +958,8 @@ static inline uint32_t vpx_mse8x(const uint8_t *a, int a_stride, VARIANCE_SSE_8 "addiu %[tmp0], %[tmp0], -0x01 \n\t" - MMI_ADDU(%[a], %[a], %[a_stride]) - MMI_ADDU(%[b], %[b], %[b_stride]) + MMI_ADDU(%[src_ptr], %[src_ptr], 
%[src_stride]) + MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride]) "bnez %[tmp0], 1b \n\t" "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t" @@ -966,8 +972,9 @@ static inline uint32_t vpx_mse8x(const uint8_t *a, int a_stride, [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]), - [a]"+&r"(a), [b]"+&r"(b) - : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), + [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride), [high]"r"(&high), [sse]"r"(sse) : "memory" ); @@ -975,28 +982,29 @@ static inline uint32_t vpx_mse8x(const uint8_t *a, int a_stride, return *sse; } -#define vpx_mse8xN(n) \ - uint32_t vpx_mse8x##n##_mmi(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, uint32_t *sse) { \ - return vpx_mse8x(a, a_stride, b, b_stride, sse, n); \ +#define vpx_mse8xN(n) \ + uint32_t vpx_mse8x##n##_mmi(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse) { \ + return vpx_mse8x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \ } vpx_mse8xN(16); vpx_mse8xN(8); -#define SUBPIX_VAR(W, H) \ - uint32_t vpx_sub_pixel_variance##W##x##H##_mmi( \ - const uint8_t *a, int a_stride, int xoffset, int yoffset, \ - const uint8_t *b, int b_stride, uint32_t *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint8_t temp2[H * W]; \ - \ - var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \ - bilinear_filters[xoffset]); \ - var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[yoffset]); \ - \ - return vpx_variance##W##x##H##_mmi(temp2, W, b, b_stride, sse); \ +#define SUBPIX_VAR(W, H) \ + uint32_t vpx_sub_pixel_variance##W##x##H##_mmi( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + \ + var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_stride, 1, H + 1, \ + W, bilinear_filters[x_offset]); \ + var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + return vpx_variance##W##x##H##_mmi(temp2, W, ref_ptr, ref_stride, sse); \ } SUBPIX_VAR(64, 64) @@ -1006,9 +1014,10 @@ SUBPIX_VAR(32, 32) SUBPIX_VAR(32, 16) SUBPIX_VAR(16, 32) -static inline void var_filter_block2d_bil_16x(const uint8_t *a, int a_stride, - int xoffset, int yoffset, - uint8_t *temp2, int counter) { +static inline void var_filter_block2d_bil_16x(const uint8_t *src_ptr, + int src_stride, int x_offset, + int y_offset, uint8_t *temp2, + int counter) { uint8_t *temp2_ptr = temp2; mips_reg l_counter = counter; double ftmp[15]; @@ -1016,8 +1025,8 @@ static inline void var_filter_block2d_bil_16x(const uint8_t *a, int a_stride, DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL }; DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL }; - const uint8_t *filter_x = bilinear_filters[xoffset]; - const uint8_t *filter_y = bilinear_filters[yoffset]; + const uint8_t *filter_x = bilinear_filters[x_offset]; + const uint8_t *filter_y = bilinear_filters[y_offset]; __asm__ volatile ( "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" @@ -1031,26 +1040,26 @@ static inline void var_filter_block2d_bil_16x(const uint8_t *a, int a_stride, // fdata3: fdata3[0] ~ fdata3[15] VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A - // fdata3 +a_stride*1: fdata3[0] ~ fdata3[15] - MMI_ADDU(%[a], %[a], %[a_stride]) + // fdata3 
+src_stride*1: fdata3[0] ~ fdata3[15] + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B // temp2: temp2[0] ~ temp2[15] VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A - // fdata3 +a_stride*2: fdata3[0] ~ fdata3[15] - MMI_ADDU(%[a], %[a], %[a_stride]) + // fdata3 +src_stride*2: fdata3[0] ~ fdata3[15] + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A // temp2+16*1: temp2[0] ~ temp2[15] MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10) VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B "1: \n\t" - MMI_ADDU(%[a], %[a], %[a_stride]) + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10) VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A - MMI_ADDU(%[a], %[a], %[a_stride]) + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10) VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B @@ -1062,43 +1071,44 @@ static inline void var_filter_block2d_bil_16x(const uint8_t *a, int a_stride, [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]), [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), [ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]), - [tmp0] "=&r"(tmp[0]), [a] "+&r"(a), [temp2_ptr] "+&r"(temp2_ptr), + [tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr), [temp2_ptr] "+&r"(temp2_ptr), [counter]"+&r"(l_counter) : [filter_x0] "f"((uint64_t)filter_x[0]), [filter_x1] "f"((uint64_t)filter_x[1]), [filter_y0] "f"((uint64_t)filter_y[0]), [filter_y1] "f"((uint64_t)filter_y[1]), - [a_stride] "r"((mips_reg)a_stride), [ff_ph_40] "f"(ff_ph_40), + [src_stride] "r"((mips_reg)src_stride), [ff_ph_40] "f"(ff_ph_40), [mask] "f"(mask) : "memory" ); } -#define SUBPIX_VAR16XN(H) \ - uint32_t vpx_sub_pixel_variance16x##H##_mmi( \ - const uint8_t *a, int a_stride, int xoffset, int yoffset, \ - const uint8_t *b, int b_stride, uint32_t *sse) { \ - uint8_t temp2[16 * H]; \ - var_filter_block2d_bil_16x(a, a_stride, xoffset, yoffset, temp2, \ - (H - 2) / 2); \ - \ - return vpx_variance16x##H##_mmi(temp2, 16, b, b_stride, sse); \ +#define SUBPIX_VAR16XN(H) \ + uint32_t vpx_sub_pixel_variance16x##H##_mmi( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ + uint8_t temp2[16 * H]; \ + var_filter_block2d_bil_16x(src_ptr, src_stride, x_offset, y_offset, temp2, \ + (H - 2) / 2); \ + \ + return vpx_variance16x##H##_mmi(temp2, 16, ref_ptr, ref_stride, sse); \ } SUBPIX_VAR16XN(16) SUBPIX_VAR16XN(8) -static inline void var_filter_block2d_bil_8x(const uint8_t *a, int a_stride, - int xoffset, int yoffset, - uint8_t *temp2, int counter) { +static inline void var_filter_block2d_bil_8x(const uint8_t *src_ptr, + int src_stride, int x_offset, + int y_offset, uint8_t *temp2, + int counter) { uint8_t *temp2_ptr = temp2; mips_reg l_counter = counter; double ftmp[15]; mips_reg tmp[2]; DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL }; DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL }; - const uint8_t *filter_x = bilinear_filters[xoffset]; - const uint8_t *filter_y = bilinear_filters[yoffset]; + const uint8_t *filter_x = bilinear_filters[x_offset]; + const uint8_t *filter_y = bilinear_filters[y_offset]; __asm__ volatile ( "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" @@ -1112,26 +1122,26 @@ static inline void var_filter_block2d_bil_8x(const uint8_t *a, int a_stride, // fdata3: fdata3[0] ~ fdata3[7] VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A - // fdata3 
+a_stride*1: fdata3[0] ~ fdata3[7] - MMI_ADDU(%[a], %[a], %[a_stride]) + // fdata3 +src_stride*1: fdata3[0] ~ fdata3[7] + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B // temp2: temp2[0] ~ temp2[7] VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A - // fdata3 +a_stride*2: fdata3[0] ~ fdata3[7] - MMI_ADDU(%[a], %[a], %[a_stride]) + // fdata3 +src_stride*2: fdata3[0] ~ fdata3[7] + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A // temp2+8*1: temp2[0] ~ temp2[7] MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08) VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B "1: \n\t" - MMI_ADDU(%[a], %[a], %[a_stride]) + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08) VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A - MMI_ADDU(%[a], %[a], %[a_stride]) + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08) VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B @@ -1143,44 +1153,45 @@ static inline void var_filter_block2d_bil_8x(const uint8_t *a, int a_stride, [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]), [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), [ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]), - [tmp0] "=&r"(tmp[0]), [a] "+&r"(a), [temp2_ptr] "+&r"(temp2_ptr), + [tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr), [temp2_ptr] "+&r"(temp2_ptr), [counter]"+&r"(l_counter) : [filter_x0] "f"((uint64_t)filter_x[0]), [filter_x1] "f"((uint64_t)filter_x[1]), [filter_y0] "f"((uint64_t)filter_y[0]), [filter_y1] "f"((uint64_t)filter_y[1]), - [a_stride] "r"((mips_reg)a_stride), [ff_ph_40] "f"(ff_ph_40), + [src_stride] "r"((mips_reg)src_stride), [ff_ph_40] "f"(ff_ph_40), [mask] "f"(mask) : "memory" ); } -#define SUBPIX_VAR8XN(H) \ - uint32_t vpx_sub_pixel_variance8x##H##_mmi( \ - const uint8_t *a, int a_stride, int xoffset, int yoffset, \ - const uint8_t *b, int b_stride, uint32_t *sse) { \ - uint8_t temp2[8 * H]; \ - var_filter_block2d_bil_8x(a, a_stride, xoffset, yoffset, temp2, \ - (H - 2) / 2); \ - \ - return vpx_variance8x##H##_mmi(temp2, 8, b, b_stride, sse); \ +#define SUBPIX_VAR8XN(H) \ + uint32_t vpx_sub_pixel_variance8x##H##_mmi( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ + uint8_t temp2[8 * H]; \ + var_filter_block2d_bil_8x(src_ptr, src_stride, x_offset, y_offset, temp2, \ + (H - 2) / 2); \ + \ + return vpx_variance8x##H##_mmi(temp2, 8, ref_ptr, ref_stride, sse); \ } SUBPIX_VAR8XN(16) SUBPIX_VAR8XN(8) SUBPIX_VAR8XN(4) -static inline void var_filter_block2d_bil_4x(const uint8_t *a, int a_stride, - int xoffset, int yoffset, - uint8_t *temp2, int counter) { +static inline void var_filter_block2d_bil_4x(const uint8_t *src_ptr, + int src_stride, int x_offset, + int y_offset, uint8_t *temp2, + int counter) { uint8_t *temp2_ptr = temp2; mips_reg l_counter = counter; double ftmp[7]; mips_reg tmp[2]; DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL }; DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL }; - const uint8_t *filter_x = bilinear_filters[xoffset]; - const uint8_t *filter_y = bilinear_filters[yoffset]; + const uint8_t *filter_x = bilinear_filters[x_offset]; + const uint8_t *filter_y = bilinear_filters[y_offset]; __asm__ volatile ( "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" @@ -1193,26 +1204,26 @@ static inline void var_filter_block2d_bil_4x(const uint8_t *a, int a_stride, // fdata3: fdata3[0] 
~ fdata3[3] VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A - // fdata3 +a_stride*1: fdata3[0] ~ fdata3[3] - MMI_ADDU(%[a], %[a], %[a_stride]) + // fdata3 +src_stride*1: fdata3[0] ~ fdata3[3] + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B // temp2: temp2[0] ~ temp2[7] VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A - // fdata3 +a_stride*2: fdata3[0] ~ fdata3[3] - MMI_ADDU(%[a], %[a], %[a_stride]) + // fdata3 +src_stride*2: fdata3[0] ~ fdata3[3] + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A // temp2+4*1: temp2[0] ~ temp2[7] MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04) VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B "1: \n\t" - MMI_ADDU(%[a], %[a], %[a_stride]) + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04) VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A - MMI_ADDU(%[a], %[a], %[a_stride]) + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04) VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B @@ -1220,49 +1231,49 @@ static inline void var_filter_block2d_bil_4x(const uint8_t *a, int a_stride, "bnez %[counter], 1b \n\t" : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), - [ftmp6] "=&f"(ftmp[6]), [tmp0] "=&r"(tmp[0]), [a] "+&r"(a), + [ftmp6] "=&f"(ftmp[6]), [tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr), [temp2_ptr] "+&r"(temp2_ptr), [counter]"+&r"(l_counter) : [filter_x0] "f"((uint64_t)filter_x[0]), [filter_x1] "f"((uint64_t)filter_x[1]), [filter_y0] "f"((uint64_t)filter_y[0]), [filter_y1] "f"((uint64_t)filter_y[1]), - [a_stride] "r"((mips_reg)a_stride), [ff_ph_40] "f"(ff_ph_40), + [src_stride] "r"((mips_reg)src_stride), [ff_ph_40] "f"(ff_ph_40), [mask] "f"(mask) : "memory" ); } -#define SUBPIX_VAR4XN(H) \ - uint32_t vpx_sub_pixel_variance4x##H##_mmi( \ - const uint8_t *a, int a_stride, int xoffset, int yoffset, \ - const uint8_t *b, int b_stride, uint32_t *sse) { \ - uint8_t temp2[4 * H]; \ - var_filter_block2d_bil_4x(a, a_stride, xoffset, yoffset, temp2, \ - (H - 2) / 2); \ - \ - return vpx_variance4x##H##_mmi(temp2, 4, b, b_stride, sse); \ +#define SUBPIX_VAR4XN(H) \ + uint32_t vpx_sub_pixel_variance4x##H##_mmi( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ + uint8_t temp2[4 * H]; \ + var_filter_block2d_bil_4x(src_ptr, src_stride, x_offset, y_offset, temp2, \ + (H - 2) / 2); \ + \ + return vpx_variance4x##H##_mmi(temp2, 4, ref_ptr, ref_stride, sse); \ } SUBPIX_VAR4XN(8) SUBPIX_VAR4XN(4) -#define SUBPIX_AVG_VAR(W, H) \ - uint32_t vpx_sub_pixel_avg_variance##W##x##H##_mmi( \ - const uint8_t *a, int a_stride, int xoffset, int yoffset, \ - const uint8_t *b, int b_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint8_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ - \ - var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \ - bilinear_filters[xoffset]); \ - var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[yoffset]); \ - \ - vpx_comp_avg_pred_c(temp3, second_pred, W, H, temp2, W); \ - \ - return vpx_variance##W##x##H##_mmi(temp3, W, b, b_stride, sse); \ +#define SUBPIX_AVG_VAR(W, H) \ + uint32_t vpx_sub_pixel_avg_variance##W##x##H##_mmi( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + 
const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ + \ + var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_stride, 1, H + 1, \ + W, bilinear_filters[x_offset]); \ + var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + vpx_comp_avg_pred_c(temp3, second_pred, W, H, temp2, W); \ + \ + return vpx_variance##W##x##H##_mmi(temp3, W, ref_ptr, ref_stride, sse); \ } SUBPIX_AVG_VAR(64, 64) diff --git a/vpx_dsp/ppc/variance_vsx.c b/vpx_dsp/ppc/variance_vsx.c index 50311d1..8926160 100644 --- a/vpx_dsp/ppc/variance_vsx.c +++ b/vpx_dsp/ppc/variance_vsx.c @@ -14,14 +14,16 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/ppc/types_vsx.h" -uint32_t vpx_get4x4sse_cs_vsx(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride) { +uint32_t vpx_get4x4sse_cs_vsx(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride) { int distortion; - const int16x8_t a0 = unpack_to_s16_h(read4x2(a, a_stride)); - const int16x8_t a1 = unpack_to_s16_h(read4x2(a + a_stride * 2, a_stride)); - const int16x8_t b0 = unpack_to_s16_h(read4x2(b, b_stride)); - const int16x8_t b1 = unpack_to_s16_h(read4x2(b + b_stride * 2, b_stride)); + const int16x8_t a0 = unpack_to_s16_h(read4x2(src_ptr, src_stride)); + const int16x8_t a1 = + unpack_to_s16_h(read4x2(src_ptr + src_stride * 2, src_stride)); + const int16x8_t b0 = unpack_to_s16_h(read4x2(ref_ptr, ref_stride)); + const int16x8_t b1 = + unpack_to_s16_h(read4x2(ref_ptr + ref_stride * 2, ref_stride)); const int16x8_t d0 = vec_sub(a0, b0); const int16x8_t d1 = vec_sub(a1, b1); const int32x4_t ds = vec_msum(d1, d1, vec_msum(d0, d0, vec_splat_s32(0))); @@ -33,12 +35,12 @@ uint32_t vpx_get4x4sse_cs_vsx(const uint8_t *a, int a_stride, const uint8_t *b, } // TODO(lu_zero): Unroll -uint32_t vpx_get_mb_ss_vsx(const int16_t *a) { +uint32_t vpx_get_mb_ss_vsx(const int16_t *src_ptr) { unsigned int i, sum = 0; int32x4_t s = vec_splat_s32(0); for (i = 0; i < 256; i += 8) { - const int16x8_t v = vec_vsx_ld(0, a + i); + const int16x8_t v = vec_vsx_ld(0, src_ptr + i); s = vec_msum(v, v, s); } @@ -96,15 +98,16 @@ void vpx_comp_avg_pred_vsx(uint8_t *comp_pred, const uint8_t *pred, int width, } } -static INLINE void variance_inner_32(const uint8_t *a, const uint8_t *b, +static INLINE void variance_inner_32(const uint8_t *src_ptr, + const uint8_t *ref_ptr, int32x4_t *sum_squared, int32x4_t *sum) { int32x4_t s = *sum; int32x4_t ss = *sum_squared; - const uint8x16_t va0 = vec_vsx_ld(0, a); - const uint8x16_t vb0 = vec_vsx_ld(0, b); - const uint8x16_t va1 = vec_vsx_ld(16, a); - const uint8x16_t vb1 = vec_vsx_ld(16, b); + const uint8x16_t va0 = vec_vsx_ld(0, src_ptr); + const uint8x16_t vb0 = vec_vsx_ld(0, ref_ptr); + const uint8x16_t va1 = vec_vsx_ld(16, src_ptr); + const uint8x16_t vb1 = vec_vsx_ld(16, ref_ptr); const int16x8_t a0 = unpack_to_s16_h(va0); const int16x8_t b0 = unpack_to_s16_h(vb0); @@ -131,9 +134,9 @@ static INLINE void variance_inner_32(const uint8_t *a, const uint8_t *b, *sum_squared = ss; } -static INLINE void variance(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, int w, int h, uint32_t *sse, - int *sum) { +static INLINE void variance(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, int w, + int h, uint32_t *sse, int *sum) { int i; int32x4_t s = vec_splat_s32(0); @@ -142,31 +145,31 @@ static INLINE void 
variance(const uint8_t *a, int a_stride, const uint8_t *b, switch (w) { case 4: for (i = 0; i < h / 2; ++i) { - const int16x8_t a0 = unpack_to_s16_h(read4x2(a, a_stride)); - const int16x8_t b0 = unpack_to_s16_h(read4x2(b, b_stride)); + const int16x8_t a0 = unpack_to_s16_h(read4x2(src_ptr, src_stride)); + const int16x8_t b0 = unpack_to_s16_h(read4x2(ref_ptr, ref_stride)); const int16x8_t d = vec_sub(a0, b0); s = vec_sum4s(d, s); ss = vec_msum(d, d, ss); - a += a_stride * 2; - b += b_stride * 2; + src_ptr += src_stride * 2; + ref_ptr += ref_stride * 2; } break; case 8: for (i = 0; i < h; ++i) { - const int16x8_t a0 = unpack_to_s16_h(vec_vsx_ld(0, a)); - const int16x8_t b0 = unpack_to_s16_h(vec_vsx_ld(0, b)); + const int16x8_t a0 = unpack_to_s16_h(vec_vsx_ld(0, src_ptr)); + const int16x8_t b0 = unpack_to_s16_h(vec_vsx_ld(0, ref_ptr)); const int16x8_t d = vec_sub(a0, b0); s = vec_sum4s(d, s); ss = vec_msum(d, d, ss); - a += a_stride; - b += b_stride; + src_ptr += src_stride; + ref_ptr += ref_stride; } break; case 16: for (i = 0; i < h; ++i) { - const uint8x16_t va = vec_vsx_ld(0, a); - const uint8x16_t vb = vec_vsx_ld(0, b); + const uint8x16_t va = vec_vsx_ld(0, src_ptr); + const uint8x16_t vb = vec_vsx_ld(0, ref_ptr); const int16x8_t a0 = unpack_to_s16_h(va); const int16x8_t b0 = unpack_to_s16_h(vb); const int16x8_t a1 = unpack_to_s16_l(va); @@ -179,24 +182,24 @@ static INLINE void variance(const uint8_t *a, int a_stride, const uint8_t *b, s = vec_sum4s(d1, s); ss = vec_msum(d1, d1, ss); - a += a_stride; - b += b_stride; + src_ptr += src_stride; + ref_ptr += ref_stride; } break; case 32: for (i = 0; i < h; ++i) { - variance_inner_32(a, b, &ss, &s); - a += a_stride; - b += b_stride; + variance_inner_32(src_ptr, ref_ptr, &ss, &s); + src_ptr += src_stride; + ref_ptr += ref_stride; } break; case 64: for (i = 0; i < h; ++i) { - variance_inner_32(a, b, &ss, &s); - variance_inner_32(a + 32, b + 32, &ss, &s); + variance_inner_32(src_ptr, ref_ptr, &ss, &s); + variance_inner_32(src_ptr + 32, ref_ptr + 32, &ss, &s); - a += a_stride; - b += b_stride; + src_ptr += src_stride; + ref_ptr += ref_stride; } break; } @@ -214,33 +217,33 @@ static INLINE void variance(const uint8_t *a, int a_stride, const uint8_t *b, * and returns that value using pass-by-reference instead of returning * sse - sum^2 / w*h */ -#define GET_VAR(W, H) \ - void vpx_get##W##x##H##var_vsx(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - uint32_t *sse, int *sum) { \ - variance(a, a_stride, b, b_stride, W, H, sse, sum); \ +#define GET_VAR(W, H) \ + void vpx_get##W##x##H##var_vsx(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse, int *sum) { \ + variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, sum); \ } /* Identical to the variance call except it does not calculate the * sse - sum^2 / w*h and returns sse in addtion to modifying the passed in * variable. 
*/ -#define MSE(W, H) \ - uint32_t vpx_mse##W##x##H##_vsx(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - uint32_t *sse) { \ - int sum; \ - variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ - return *sse; \ +#define MSE(W, H) \ + uint32_t vpx_mse##W##x##H##_vsx(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse) { \ + int sum; \ + variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, &sum); \ + return *sse; \ } -#define VAR(W, H) \ - uint32_t vpx_variance##W##x##H##_vsx(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - uint32_t *sse) { \ - int sum; \ - variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ - return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ +#define VAR(W, H) \ + uint32_t vpx_variance##W##x##H##_vsx(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse) { \ + int sum; \ + variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, &sum); \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ } #define VARIANCES(W, H) VAR(W, H) diff --git a/vpx_dsp/variance.c b/vpx_dsp/variance.c index a7eef0d..30b55dc 100644 --- a/vpx_dsp/variance.c +++ b/vpx_dsp/variance.c @@ -21,36 +21,37 @@ static const uint8_t bilinear_filters[8][2] = { { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }, }; -uint32_t vpx_get4x4sse_cs_c(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride) { +uint32_t vpx_get4x4sse_cs_c(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride) { int distortion = 0; int r, c; for (r = 0; r < 4; ++r) { for (c = 0; c < 4; ++c) { - int diff = a[c] - b[c]; + int diff = src_ptr[c] - ref_ptr[c]; distortion += diff * diff; } - a += a_stride; - b += b_stride; + src_ptr += src_stride; + ref_ptr += ref_stride; } return distortion; } -uint32_t vpx_get_mb_ss_c(const int16_t *a) { +uint32_t vpx_get_mb_ss_c(const int16_t *src_ptr) { unsigned int i, sum = 0; for (i = 0; i < 256; ++i) { - sum += a[i] * a[i]; + sum += src_ptr[i] * src_ptr[i]; } return sum; } -static void variance(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, int w, int h, uint32_t *sse, int *sum) { +static void variance(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, int w, int h, + uint32_t *sse, int *sum) { int i, j; *sum = 0; @@ -58,13 +59,13 @@ static void variance(const uint8_t *a, int a_stride, const uint8_t *b, for (i = 0; i < h; ++i) { for (j = 0; j < w; ++j) { - const int diff = a[j] - b[j]; + const int diff = src_ptr[j] - ref_ptr[j]; *sum += diff; *sse += diff * diff; } - a += a_stride; - b += b_stride; + src_ptr += src_stride; + ref_ptr += ref_stride; } } @@ -76,24 +77,23 @@ static void variance(const uint8_t *a, int a_stride, const uint8_t *b, // taps should sum to FILTER_WEIGHT. pixel_step defines whether the filter is // applied horizontally (pixel_step = 1) or vertically (pixel_step = stride). // It defines the offset required to move from one input to the next. 
-static void var_filter_block2d_bil_first_pass(const uint8_t *a, uint16_t *b, - unsigned int src_pixels_per_line, - int pixel_step, - unsigned int output_height, - unsigned int output_width, - const uint8_t *filter) { +static void var_filter_block2d_bil_first_pass( + const uint8_t *src_ptr, uint16_t *ref_ptr, unsigned int src_pixels_per_line, + int pixel_step, unsigned int output_height, unsigned int output_width, + const uint8_t *filter) { unsigned int i, j; for (i = 0; i < output_height; ++i) { for (j = 0; j < output_width; ++j) { - b[j] = ROUND_POWER_OF_TWO( - (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS); + ref_ptr[j] = ROUND_POWER_OF_TWO( + (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1], + FILTER_BITS); - ++a; + ++src_ptr; } - a += src_pixels_per_line - output_width; - b += output_width; + src_ptr += src_pixels_per_line - output_width; + ref_ptr += output_width; } } @@ -106,91 +106,90 @@ static void var_filter_block2d_bil_first_pass(const uint8_t *a, uint16_t *b, // filter is applied horizontally (pixel_step = 1) or vertically // (pixel_step = stride). It defines the offset required to move from one input // to the next. Output is 8-bit. -static void var_filter_block2d_bil_second_pass(const uint16_t *a, uint8_t *b, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const uint8_t *filter) { +static void var_filter_block2d_bil_second_pass( + const uint16_t *src_ptr, uint8_t *ref_ptr, unsigned int src_pixels_per_line, + unsigned int pixel_step, unsigned int output_height, + unsigned int output_width, const uint8_t *filter) { unsigned int i, j; for (i = 0; i < output_height; ++i) { for (j = 0; j < output_width; ++j) { - b[j] = ROUND_POWER_OF_TWO( - (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS); - ++a; + ref_ptr[j] = ROUND_POWER_OF_TWO( + (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1], + FILTER_BITS); + ++src_ptr; } - a += src_pixels_per_line - output_width; - b += output_width; + src_ptr += src_pixels_per_line - output_width; + ref_ptr += output_width; } } -#define VAR(W, H) \ - uint32_t vpx_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - uint32_t *sse) { \ - int sum; \ - variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ - return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ +#define VAR(W, H) \ + uint32_t vpx_variance##W##x##H##_c(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse) { \ + int sum; \ + variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, &sum); \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ } -#define SUBPIX_VAR(W, H) \ - uint32_t vpx_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *a, int a_stride, int xoffset, int yoffset, \ - const uint8_t *b, int b_stride, uint32_t *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint8_t temp2[H * W]; \ - \ - var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \ - bilinear_filters[xoffset]); \ - var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[yoffset]); \ - \ - return vpx_variance##W##x##H##_c(temp2, W, b, b_stride, sse); \ +#define SUBPIX_VAR(W, H) \ + uint32_t vpx_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + \ + 
var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_stride, 1, H + 1, \ + W, bilinear_filters[x_offset]); \ + var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + return vpx_variance##W##x##H##_c(temp2, W, ref_ptr, ref_stride, sse); \ } -#define SUBPIX_AVG_VAR(W, H) \ - uint32_t vpx_sub_pixel_avg_variance##W##x##H##_c( \ - const uint8_t *a, int a_stride, int xoffset, int yoffset, \ - const uint8_t *b, int b_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint8_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ - \ - var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \ - bilinear_filters[xoffset]); \ - var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[yoffset]); \ - \ - vpx_comp_avg_pred_c(temp3, second_pred, W, H, temp2, W); \ - \ - return vpx_variance##W##x##H##_c(temp3, W, b, b_stride, sse); \ +#define SUBPIX_AVG_VAR(W, H) \ + uint32_t vpx_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ + \ + var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_stride, 1, H + 1, \ + W, bilinear_filters[x_offset]); \ + var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + vpx_comp_avg_pred_c(temp3, second_pred, W, H, temp2, W); \ + \ + return vpx_variance##W##x##H##_c(temp3, W, ref_ptr, ref_stride, sse); \ } /* Identical to the variance call except it takes an additional parameter, sum, * and returns that value using pass-by-reference instead of returning * sse - sum^2 / w*h */ -#define GET_VAR(W, H) \ - void vpx_get##W##x##H##var_c(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, uint32_t *sse, \ - int *sum) { \ - variance(a, a_stride, b, b_stride, W, H, sse, sum); \ +#define GET_VAR(W, H) \ + void vpx_get##W##x##H##var_c(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse, int *sum) { \ + variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, sum); \ } /* Identical to the variance call except it does not calculate the * sse - sum^2 / w*h and returns sse in addtion to modifying the passed in * variable. */ -#define MSE(W, H) \ - uint32_t vpx_mse##W##x##H##_c(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - uint32_t *sse) { \ - int sum; \ - variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ - return *sse; \ +#define MSE(W, H) \ + uint32_t vpx_mse##W##x##H##_c(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse) { \ + int sum; \ + variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, &sum); \ + return *sse; \ } /* All three forms of the variance are available in the same sizes. 
*/ @@ -237,128 +236,140 @@ void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, } #if CONFIG_VP9_HIGHBITDEPTH -static void highbd_variance64(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, int w, int h, - uint64_t *sse, int64_t *sum) { +static void highbd_variance64(const uint8_t *src8_ptr, int src_stride, + const uint8_t *ref8_ptr, int ref_stride, int w, + int h, uint64_t *sse, int64_t *sum) { int i, j; - uint16_t *a = CONVERT_TO_SHORTPTR(a8); - uint16_t *b = CONVERT_TO_SHORTPTR(b8); + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8_ptr); + uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref8_ptr); *sum = 0; *sse = 0; for (i = 0; i < h; ++i) { for (j = 0; j < w; ++j) { - const int diff = a[j] - b[j]; + const int diff = src_ptr[j] - ref_ptr[j]; *sum += diff; *sse += diff * diff; } - a += a_stride; - b += b_stride; + src_ptr += src_stride; + ref_ptr += ref_stride; } } -static void highbd_8_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, int w, int h, - uint32_t *sse, int *sum) { +static void highbd_8_variance(const uint8_t *src8_ptr, int src_stride, + const uint8_t *ref8_ptr, int ref_stride, int w, + int h, uint32_t *sse, int *sum) { uint64_t sse_long = 0; int64_t sum_long = 0; - highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); + highbd_variance64(src8_ptr, src_stride, ref8_ptr, ref_stride, w, h, &sse_long, + &sum_long); *sse = (uint32_t)sse_long; *sum = (int)sum_long; } -static void highbd_10_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, int w, int h, - uint32_t *sse, int *sum) { +static void highbd_10_variance(const uint8_t *src8_ptr, int src_stride, + const uint8_t *ref8_ptr, int ref_stride, int w, + int h, uint32_t *sse, int *sum) { uint64_t sse_long = 0; int64_t sum_long = 0; - highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); + highbd_variance64(src8_ptr, src_stride, ref8_ptr, ref_stride, w, h, &sse_long, + &sum_long); *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); } -static void highbd_12_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, int w, int h, - uint32_t *sse, int *sum) { +static void highbd_12_variance(const uint8_t *src8_ptr, int src_stride, + const uint8_t *ref8_ptr, int ref_stride, int w, + int h, uint32_t *sse, int *sum) { uint64_t sse_long = 0; int64_t sum_long = 0; - highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); + highbd_variance64(src8_ptr, src_stride, ref8_ptr, ref_stride, w, h, &sse_long, + &sum_long); *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); } -#define HIGHBD_VAR(W, H) \ - uint32_t vpx_highbd_8_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - uint32_t *sse) { \ - int sum; \ - highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ - return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ - } \ - \ - uint32_t vpx_highbd_10_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - uint32_t *sse) { \ - int sum; \ - int64_t var; \ - highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ - var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ - return (var >= 0) ? 
(uint32_t)var : 0; \ - } \ - \ - uint32_t vpx_highbd_12_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - uint32_t *sse) { \ - int sum; \ - int64_t var; \ - highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ - var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ - return (var >= 0) ? (uint32_t)var : 0; \ +#define HIGHBD_VAR(W, H) \ + uint32_t vpx_highbd_8_variance##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + highbd_8_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ + &sum); \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ + } \ + \ + uint32_t vpx_highbd_10_variance##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + highbd_10_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ + &sum); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } \ + \ + uint32_t vpx_highbd_12_variance##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + highbd_12_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ + &sum); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? (uint32_t)var : 0; \ } -#define HIGHBD_GET_VAR(S) \ - void vpx_highbd_8_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - uint32_t *sse, int *sum) { \ - highbd_8_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \ - } \ - \ - void vpx_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - uint32_t *sse, int *sum) { \ - highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \ - } \ - \ - void vpx_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - uint32_t *sse, int *sum) { \ - highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \ +#define HIGHBD_GET_VAR(S) \ + void vpx_highbd_8_get##S##x##S##var_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse, int *sum) { \ + highbd_8_variance(src_ptr, src_stride, ref_ptr, ref_stride, S, S, sse, \ + sum); \ + } \ + \ + void vpx_highbd_10_get##S##x##S##var_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse, int *sum) { \ + highbd_10_variance(src_ptr, src_stride, ref_ptr, ref_stride, S, S, sse, \ + sum); \ + } \ + \ + void vpx_highbd_12_get##S##x##S##var_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse, int *sum) { \ + highbd_12_variance(src_ptr, src_stride, ref_ptr, ref_stride, S, S, sse, \ + sum); \ } -#define HIGHBD_MSE(W, H) \ - uint32_t vpx_highbd_8_mse##W##x##H##_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - uint32_t *sse) { \ - int sum; \ - highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ - return *sse; \ - } \ - \ - uint32_t vpx_highbd_10_mse##W##x##H##_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - uint32_t *sse) { \ - int sum; \ - highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ - return *sse; \ - } \ - \ - uint32_t 
vpx_highbd_12_mse##W##x##H##_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - uint32_t *sse) { \ - int sum; \ - highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ - return *sse; \ +#define HIGHBD_MSE(W, H) \ + uint32_t vpx_highbd_8_mse##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + highbd_8_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ + &sum); \ + return *sse; \ + } \ + \ + uint32_t vpx_highbd_10_mse##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + highbd_10_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ + &sum); \ + return *sse; \ + } \ + \ + uint32_t vpx_highbd_12_mse##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + highbd_12_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ + &sum); \ + return *sse; \ } static void highbd_var_filter_block2d_bil_first_pass( @@ -403,111 +414,111 @@ static void highbd_var_filter_block2d_bil_second_pass( } } -#define HIGHBD_SUBPIX_VAR(W, H) \ - uint32_t vpx_highbd_8_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, uint32_t *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - \ - highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters[xoffset]); \ - highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[yoffset]); \ - \ - return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \ - dst, dst_stride, sse); \ - } \ - \ - uint32_t vpx_highbd_10_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, uint32_t *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - \ - highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters[xoffset]); \ - highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[yoffset]); \ - \ - return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \ - dst, dst_stride, sse); \ - } \ - \ - uint32_t vpx_highbd_12_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, uint32_t *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - \ - highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters[xoffset]); \ - highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[yoffset]); \ - \ - return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \ - dst, dst_stride, sse); \ +#define HIGHBD_SUBPIX_VAR(W, H) \ + uint32_t vpx_highbd_8_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + highbd_var_filter_block2d_bil_first_pass( \ + src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + return 
vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \ + ref_ptr, ref_stride, sse); \ + } \ + \ + uint32_t vpx_highbd_10_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + highbd_var_filter_block2d_bil_first_pass( \ + src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \ + ref_ptr, ref_stride, sse); \ + } \ + \ + uint32_t vpx_highbd_12_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + highbd_var_filter_block2d_bil_first_pass( \ + src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \ + ref_ptr, ref_stride, sse); \ } -#define HIGHBD_SUBPIX_AVG_VAR(W, H) \ - uint32_t vpx_highbd_8_sub_pixel_avg_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ - \ - highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters[xoffset]); \ - highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[yoffset]); \ - \ - vpx_highbd_comp_avg_pred_c(temp3, CONVERT_TO_SHORTPTR(second_pred), W, H, \ - temp2, W); \ - \ - return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ - dst, dst_stride, sse); \ - } \ - \ - uint32_t vpx_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ - \ - highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters[xoffset]); \ - highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[yoffset]); \ - \ - vpx_highbd_comp_avg_pred_c(temp3, CONVERT_TO_SHORTPTR(second_pred), W, H, \ - temp2, W); \ - \ - return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ - dst, dst_stride, sse); \ - } \ - \ - uint32_t vpx_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ - \ - highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters[xoffset]); \ - highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[yoffset]); \ - \ - vpx_highbd_comp_avg_pred_c(temp3, CONVERT_TO_SHORTPTR(second_pred), W, H, \ - temp2, W); \ - \ - return 
vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ - dst, dst_stride, sse); \ +#define HIGHBD_SUBPIX_AVG_VAR(W, H) \ + uint32_t vpx_highbd_8_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + highbd_var_filter_block2d_bil_first_pass( \ + src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + vpx_highbd_comp_avg_pred_c(temp3, CONVERT_TO_SHORTPTR(second_pred), W, H, \ + temp2, W); \ + \ + return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ + ref_ptr, ref_stride, sse); \ + } \ + \ + uint32_t vpx_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + highbd_var_filter_block2d_bil_first_pass( \ + src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + vpx_highbd_comp_avg_pred_c(temp3, CONVERT_TO_SHORTPTR(second_pred), W, H, \ + temp2, W); \ + \ + return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ + ref_ptr, ref_stride, sse); \ + } \ + \ + uint32_t vpx_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + highbd_var_filter_block2d_bil_first_pass( \ + src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + vpx_highbd_comp_avg_pred_c(temp3, CONVERT_TO_SHORTPTR(second_pred), W, H, \ + temp2, W); \ + \ + return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ + ref_ptr, ref_stride, sse); \ } /* All three forms of the variance are available in the same sizes. 
*/ diff --git a/vpx_dsp/variance.h b/vpx_dsp/variance.h index b67e929..6d0e1b8 100644 --- a/vpx_dsp/variance.h +++ b/vpx_dsp/variance.h @@ -22,37 +22,38 @@ extern "C" { #define FILTER_BITS 7 #define FILTER_WEIGHT 128 -typedef unsigned int (*vpx_sad_fn_t)(const uint8_t *a, int a_stride, - const uint8_t *b_ptr, int b_stride); +typedef unsigned int (*vpx_sad_fn_t)(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride); -typedef unsigned int (*vpx_sad_avg_fn_t)(const uint8_t *a_ptr, int a_stride, - const uint8_t *b_ptr, int b_stride, +typedef unsigned int (*vpx_sad_avg_fn_t)(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -typedef void (*vp8_copy32xn_fn_t)(const uint8_t *a, int a_stride, uint8_t *b, - int b_stride, int n); +typedef void (*vp8_copy32xn_fn_t)(const uint8_t *src_ptr, int src_stride, + uint8_t *ref_ptr, int ref_stride, int n); -typedef void (*vpx_sad_multi_fn_t)(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, +typedef void (*vpx_sad_multi_fn_t)(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); -typedef void (*vpx_sad_multi_d_fn_t)(const uint8_t *a, int a_stride, +typedef void (*vpx_sad_multi_d_fn_t)(const uint8_t *src_ptr, int src_stride, const uint8_t *const b_array[], - int b_stride, unsigned int *sad_array); + int ref_stride, unsigned int *sad_array); -typedef unsigned int (*vpx_variance_fn_t)(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - unsigned int *sse); +typedef unsigned int (*vpx_variance_fn_t)(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, unsigned int *sse); -typedef unsigned int (*vpx_subpixvariance_fn_t)(const uint8_t *a, int a_stride, - int xoffset, int yoffset, - const uint8_t *b, int b_stride, - unsigned int *sse); +typedef unsigned int (*vpx_subpixvariance_fn_t)( + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); typedef unsigned int (*vpx_subp_avg_variance_fn_t)( - const uint8_t *a_ptr, int a_stride, int xoffset, int yoffset, - const uint8_t *b_ptr, int b_stride, unsigned int *sse, + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); + #if CONFIG_VP8 typedef struct variance_vtable { vpx_sad_fn_t sdf; diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 2350bc6..13d4b3d 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1091,70 +1091,70 @@ if (vpx_config("CONFIG_ENCODERS") eq "yes" || vpx_config("CONFIG_POSTPROC") eq " # # Variance # -add_proto qw/unsigned int vpx_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +add_proto qw/unsigned int vpx_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_variance64x64 sse2 avx2 neon msa mmi vsx/; -add_proto qw/unsigned int vpx_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +add_proto qw/unsigned int vpx_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_variance64x32 sse2 avx2 neon msa mmi vsx/; -add_proto qw/unsigned int vpx_variance32x64/, "const uint8_t 
*src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +add_proto qw/unsigned int vpx_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_variance32x64 sse2 avx2 neon msa mmi vsx/; -add_proto qw/unsigned int vpx_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +add_proto qw/unsigned int vpx_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_variance32x32 sse2 avx2 neon msa mmi vsx/; -add_proto qw/unsigned int vpx_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +add_proto qw/unsigned int vpx_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_variance32x16 sse2 avx2 neon msa mmi vsx/; -add_proto qw/unsigned int vpx_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +add_proto qw/unsigned int vpx_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_variance16x32 sse2 avx2 neon msa mmi vsx/; -add_proto qw/unsigned int vpx_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +add_proto qw/unsigned int vpx_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_variance16x16 sse2 avx2 neon msa mmi vsx/; -add_proto qw/unsigned int vpx_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +add_proto qw/unsigned int vpx_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_variance16x8 sse2 avx2 neon msa mmi vsx/; -add_proto qw/unsigned int vpx_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +add_proto qw/unsigned int vpx_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_variance8x16 sse2 neon msa mmi vsx/; -add_proto qw/unsigned int vpx_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +add_proto qw/unsigned int vpx_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_variance8x8 sse2 neon msa mmi vsx/; -add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_variance8x4 sse2 neon msa mmi vsx/; -add_proto qw/unsigned int vpx_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +add_proto qw/unsigned int vpx_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_variance4x8 sse2 neon msa mmi vsx/; -add_proto qw/unsigned int vpx_variance4x4/, "const uint8_t 
*src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; +add_proto qw/unsigned int vpx_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_variance4x4 sse2 neon msa mmi vsx/; # # Specialty Variance # -add_proto qw/void vpx_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; +add_proto qw/void vpx_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; specialize qw/vpx_get16x16var sse2 avx2 neon msa vsx/; -add_proto qw/void vpx_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; +add_proto qw/void vpx_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; specialize qw/vpx_get8x8var sse2 neon msa vsx/; -add_proto qw/unsigned int vpx_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; +add_proto qw/unsigned int vpx_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_mse16x16 sse2 avx2 neon msa mmi vsx/; -add_proto qw/unsigned int vpx_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; +add_proto qw/unsigned int vpx_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_mse16x8 sse2 avx2 msa mmi vsx/; -add_proto qw/unsigned int vpx_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; +add_proto qw/unsigned int vpx_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_mse8x16 sse2 msa mmi vsx/; -add_proto qw/unsigned int vpx_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; +add_proto qw/unsigned int vpx_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_mse8x8 sse2 msa mmi vsx/; add_proto qw/unsigned int vpx_get_mb_ss/, "const int16_t *"; specialize qw/vpx_get_mb_ss sse2 msa vsx/; -add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride"; +add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride"; specialize qw/vpx_get4x4sse_cs neon msa vsx/; add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride"; @@ -1163,218 +1163,218 @@ add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, # # Subpixel Variance # -add_proto qw/uint32_t vpx_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; +add_proto qw/uint32_t vpx_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_sub_pixel_variance64x64 avx2 neon msa mmi sse2 ssse3/; -add_proto qw/uint32_t vpx_sub_pixel_variance64x32/, "const 
uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; +add_proto qw/uint32_t vpx_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_sub_pixel_variance64x32 neon msa mmi sse2 ssse3/; -add_proto qw/uint32_t vpx_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; +add_proto qw/uint32_t vpx_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_sub_pixel_variance32x64 neon msa mmi sse2 ssse3/; -add_proto qw/uint32_t vpx_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; +add_proto qw/uint32_t vpx_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_sub_pixel_variance32x32 avx2 neon msa mmi sse2 ssse3/; -add_proto qw/uint32_t vpx_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; +add_proto qw/uint32_t vpx_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_sub_pixel_variance32x16 neon msa mmi sse2 ssse3/; -add_proto qw/uint32_t vpx_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; +add_proto qw/uint32_t vpx_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_sub_pixel_variance16x32 neon msa mmi sse2 ssse3/; -add_proto qw/uint32_t vpx_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; +add_proto qw/uint32_t vpx_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_sub_pixel_variance16x16 neon msa mmi sse2 ssse3/; -add_proto qw/uint32_t vpx_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; +add_proto qw/uint32_t vpx_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_sub_pixel_variance16x8 neon msa mmi sse2 ssse3/; -add_proto qw/uint32_t vpx_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; +add_proto qw/uint32_t vpx_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_sub_pixel_variance8x16 neon msa mmi sse2 ssse3/; -add_proto qw/uint32_t vpx_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; +add_proto qw/uint32_t 
vpx_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_sub_pixel_variance8x8 neon msa mmi sse2 ssse3/; -add_proto qw/uint32_t vpx_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; +add_proto qw/uint32_t vpx_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_sub_pixel_variance8x4 neon msa mmi sse2 ssse3/; -add_proto qw/uint32_t vpx_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; +add_proto qw/uint32_t vpx_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_sub_pixel_variance4x8 neon msa mmi sse2 ssse3/; -add_proto qw/uint32_t vpx_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; +add_proto qw/uint32_t vpx_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_sub_pixel_variance4x4 neon msa mmi sse2 ssse3/; -add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; +add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_sub_pixel_avg_variance64x64 neon avx2 msa mmi sse2 ssse3/; -add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; +add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_sub_pixel_avg_variance64x32 neon msa mmi sse2 ssse3/; -add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; +add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_sub_pixel_avg_variance32x64 neon msa mmi sse2 ssse3/; -add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; +add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_sub_pixel_avg_variance32x32 neon avx2 msa mmi sse2 ssse3/; -add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int 
source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; +add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_sub_pixel_avg_variance32x16 neon msa mmi sse2 ssse3/; -add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; +add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_sub_pixel_avg_variance16x32 neon msa mmi sse2 ssse3/; -add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; +add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_sub_pixel_avg_variance16x16 neon msa mmi sse2 ssse3/; -add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; +add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_sub_pixel_avg_variance16x8 neon msa mmi sse2 ssse3/; -add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; +add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_sub_pixel_avg_variance8x16 neon msa mmi sse2 ssse3/; -add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; +add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_sub_pixel_avg_variance8x8 neon msa mmi sse2 ssse3/; -add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; +add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_sub_pixel_avg_variance8x4 neon msa mmi sse2 ssse3/; -add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t 
*second_pred"; +add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_sub_pixel_avg_variance4x8 neon msa mmi sse2 ssse3/; -add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; +add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_sub_pixel_avg_variance4x4 neon msa mmi sse2 ssse3/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { - add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_12_variance64x64 sse2/; - add_proto qw/unsigned int vpx_highbd_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_12_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_12_variance64x32 sse2/; - add_proto qw/unsigned int vpx_highbd_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_12_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_12_variance32x64 sse2/; - add_proto qw/unsigned int vpx_highbd_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_12_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_12_variance32x32 sse2/; - add_proto qw/unsigned int vpx_highbd_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_12_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_12_variance32x16 sse2/; - add_proto qw/unsigned int vpx_highbd_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_12_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_12_variance16x32 sse2/; - add_proto qw/unsigned int vpx_highbd_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_12_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_12_variance16x16 sse2/; - add_proto qw/unsigned int vpx_highbd_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int 
ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_12_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_12_variance16x8 sse2/; - add_proto qw/unsigned int vpx_highbd_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_12_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_12_variance8x16 sse2/; - add_proto qw/unsigned int vpx_highbd_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_12_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_12_variance8x8 sse2/; - add_proto qw/unsigned int vpx_highbd_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - add_proto qw/unsigned int vpx_highbd_12_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - add_proto qw/unsigned int vpx_highbd_12_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_12_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_12_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_12_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - add_proto qw/unsigned int vpx_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_10_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_10_variance64x64 sse2/; - add_proto qw/unsigned int vpx_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_10_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_10_variance64x32 sse2/; - add_proto qw/unsigned int vpx_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_10_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_10_variance32x64 sse2/; - add_proto qw/unsigned int vpx_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_10_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_10_variance32x32 sse2/; - add_proto qw/unsigned int vpx_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int 
*sse"; + add_proto qw/unsigned int vpx_highbd_10_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_10_variance32x16 sse2/; - add_proto qw/unsigned int vpx_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_10_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_10_variance16x32 sse2/; - add_proto qw/unsigned int vpx_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_10_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_10_variance16x16 sse2/; - add_proto qw/unsigned int vpx_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_10_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_10_variance16x8 sse2/; - add_proto qw/unsigned int vpx_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_10_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_10_variance8x16 sse2/; - add_proto qw/unsigned int vpx_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_10_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_10_variance8x8 sse2/; - add_proto qw/unsigned int vpx_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - add_proto qw/unsigned int vpx_highbd_10_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - add_proto qw/unsigned int vpx_highbd_10_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_10_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_10_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_10_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - add_proto qw/unsigned int vpx_highbd_8_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_8_variance64x64 sse2/; - add_proto qw/unsigned int vpx_highbd_8_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto 
qw/unsigned int vpx_highbd_8_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_8_variance64x32 sse2/; - add_proto qw/unsigned int vpx_highbd_8_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_8_variance32x64 sse2/; - add_proto qw/unsigned int vpx_highbd_8_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_8_variance32x32 sse2/; - add_proto qw/unsigned int vpx_highbd_8_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_8_variance32x16 sse2/; - add_proto qw/unsigned int vpx_highbd_8_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_8_variance16x32 sse2/; - add_proto qw/unsigned int vpx_highbd_8_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_8_variance16x16 sse2/; - add_proto qw/unsigned int vpx_highbd_8_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_8_variance16x8 sse2/; - add_proto qw/unsigned int vpx_highbd_8_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_8_variance8x16 sse2/; - add_proto qw/unsigned int vpx_highbd_8_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_8_variance8x8 sse2/; - add_proto qw/unsigned int vpx_highbd_8_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - add_proto qw/unsigned int vpx_highbd_8_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - add_proto qw/unsigned int vpx_highbd_8_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t 
*ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - add_proto qw/void vpx_highbd_8_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - add_proto qw/void vpx_highbd_8_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + add_proto qw/void vpx_highbd_8_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + add_proto qw/void vpx_highbd_8_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - add_proto qw/void vpx_highbd_10_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - add_proto qw/void vpx_highbd_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + add_proto qw/void vpx_highbd_10_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + add_proto qw/void vpx_highbd_10_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - add_proto qw/void vpx_highbd_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - add_proto qw/void vpx_highbd_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + add_proto qw/void vpx_highbd_12_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + add_proto qw/void vpx_highbd_12_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - add_proto qw/unsigned int vpx_highbd_8_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_8_mse16x16 sse2/; - add_proto qw/unsigned int vpx_highbd_8_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - add_proto qw/unsigned int vpx_highbd_8_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - add_proto qw/unsigned int vpx_highbd_8_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + 
add_proto qw/unsigned int vpx_highbd_8_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_8_mse8x8 sse2/; - add_proto qw/unsigned int vpx_highbd_10_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_10_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_10_mse16x16 sse2/; - add_proto qw/unsigned int vpx_highbd_10_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - add_proto qw/unsigned int vpx_highbd_10_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - add_proto qw/unsigned int vpx_highbd_10_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_10_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_10_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_10_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_10_mse8x8 sse2/; - add_proto qw/unsigned int vpx_highbd_12_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_12_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_12_mse16x16 sse2/; - add_proto qw/unsigned int vpx_highbd_12_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - add_proto qw/unsigned int vpx_highbd_12_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - add_proto qw/unsigned int vpx_highbd_12_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_12_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_12_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_12_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_12_mse8x8 sse2/; add_proto qw/void vpx_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint16_t *pred, int width, int height, const uint16_t *ref, int ref_stride"; @@ -1382,221 +1382,221 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # # Subpixel Variance # - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_12_sub_pixel_variance64x64 sse2/; - add_proto 
qw/uint32_t vpx_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_12_sub_pixel_variance64x32 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_12_sub_pixel_variance32x64 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_12_sub_pixel_variance32x32 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_12_sub_pixel_variance32x16 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_12_sub_pixel_variance16x32 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_12_sub_pixel_variance16x16 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_12_sub_pixel_variance16x8 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_12_sub_pixel_variance8x16 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x8/, "const 
uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_12_sub_pixel_variance8x8 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_12_sub_pixel_variance8x4 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_10_sub_pixel_variance64x64 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_10_sub_pixel_variance64x32 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_10_sub_pixel_variance32x64 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_10_sub_pixel_variance32x32 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x16/, 
"const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_10_sub_pixel_variance32x16 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_10_sub_pixel_variance16x32 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_10_sub_pixel_variance16x16 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_10_sub_pixel_variance16x8 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_10_sub_pixel_variance8x16 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_10_sub_pixel_variance8x8 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_10_sub_pixel_variance8x4 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - 
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_8_sub_pixel_variance64x64 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_8_sub_pixel_variance64x32 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_8_sub_pixel_variance32x64 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_8_sub_pixel_variance32x32 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_8_sub_pixel_variance32x16 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_8_sub_pixel_variance16x32 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_8_sub_pixel_variance16x16 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_8_sub_pixel_variance16x8 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x16/, "const uint8_t 
*src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_8_sub_pixel_variance8x16 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_8_sub_pixel_variance8x8 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_8_sub_pixel_variance8x4 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x64 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x32 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x64 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32/, "const 
uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x32 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x16 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x32 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x16 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x8 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x16 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x8 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int 
source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x4 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x64 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x32 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x64 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x32 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t 
vpx_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x16 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x32 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x16 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x8 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x16 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x8 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x4 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - add_proto qw/uint32_t 
vpx_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x64 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x32 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x64 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x32 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x16 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t 
*sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x32 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x16 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x8 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x16 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x8 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x4 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; } # CONFIG_VP9_HIGHBITDEPTH diff --git 
a/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm b/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm index db4aaf4..cefde0f 100644 --- a/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm +++ b/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm @@ -32,12 +32,12 @@ SECTION .text ; int vpx_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride, ; int x_offset, int y_offset, -; const uint8_t *dst, ptrdiff_t dst_stride, +; const uint8_t *ref, ptrdiff_t ref_stride, ; int height, unsigned int *sse); ; ; This function returns the SE and stores SSE in the given pointer. -%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse +%macro SUM_SSE 6 ; src1, ref1, src2, ref2, sum, sse psubw %3, %4 psubw %1, %2 mova %4, %3 ; make copies to manipulate to calc sum @@ -95,13 +95,13 @@ SECTION .text %if %2 == 1 ; avg cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \ x_offset, y_offset, \ - dst, dst_stride, \ - sec, sec_stride, height, sse - %define sec_str sec_strideq + ref, ref_stride, \ + second_pred, second_stride, height, sse + %define second_str second_strideq %else cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \ x_offset, y_offset, \ - dst, dst_stride, height, sse + ref, ref_stride, height, sse %endif %define block_height heightd %define bilin_filter sseq @@ -110,14 +110,14 @@ SECTION .text %if %2 == 1 ; avg cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ x_offset, y_offset, \ - dst, dst_stride, \ - sec, sec_stride, height, sse + ref, ref_stride, \ + second_pred, second_stride, height, sse %define block_height dword heightm - %define sec_str sec_stridemp + %define second_str second_stridemp %else cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ x_offset, y_offset, \ - dst, dst_stride, height, sse + ref, ref_stride, height, sse %define block_height heightd %endif @@ -142,14 +142,14 @@ SECTION .text %if %2 == 1 ; avg cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ x_offset, y_offset, \ - dst, dst_stride, \ - sec, sec_stride, height, sse + ref, ref_stride, \ + second_pred, second_stride, height, sse %define block_height dword heightm - %define sec_str sec_stridemp + %define second_str second_stridemp %else cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ x_offset, y_offset, \ - dst, dst_stride, height, sse + ref, ref_stride, height, sse %define block_height heightd %endif @@ -165,7 +165,7 @@ SECTION .text sar block_height, 1 %endif %if %2 == 1 ; avg - shl sec_str, 1 + shl second_str, 1 %endif ; FIXME(rbultje) replace by jumptable? 
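Throughout the avg variants, the (possibly filtered) source row is averaged with the second predictor (the pavgw loads from second_predq below) before SUM_SSE takes the difference against the reference, so the variance is measured against the compound prediction. A minimal scalar sketch of that rounding average, with illustrative names not taken from the patch:

#include <stdint.h>

/* Rounding average of one row with the second predictor, the scalar
 * counterpart of pavgw: out = (a + b + 1) >> 1 per lane. */
static void avg_with_second_pred(uint16_t *pred, const uint16_t *second_pred,
                                 int width) {
  int i;
  for (i = 0; i < width; ++i)
    pred[i] = (uint16_t)((pred[i] + second_pred[i] + 1) >> 1);
}
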
@@ -180,35 +180,35 @@ SECTION .text %if %1 == 16 movu m0, [srcq] movu m2, [srcq + 16] - mova m1, [dstq] - mova m3, [dstq + 16] + mova m1, [refq] + mova m3, [refq + 16] %if %2 == 1 ; avg - pavgw m0, [secq] - pavgw m2, [secq+16] + pavgw m0, [second_predq] + pavgw m2, [second_predq+16] %endif SUM_SSE m0, m1, m2, m3, m6, m7 lea srcq, [srcq + src_strideq*2] - lea dstq, [dstq + dst_strideq*2] + lea refq, [refq + ref_strideq*2] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %else ; %1 < 16 movu m0, [srcq] movu m2, [srcq + src_strideq*2] - mova m1, [dstq] - mova m3, [dstq + dst_strideq*2] + mova m1, [refq] + mova m3, [refq + ref_strideq*2] %if %2 == 1 ; avg - pavgw m0, [secq] - add secq, sec_str - pavgw m2, [secq] + pavgw m0, [second_predq] + add second_predq, second_str + pavgw m2, [second_predq] %endif SUM_SSE m0, m1, m2, m3, m6, m7 lea srcq, [srcq + src_strideq*4] - lea dstq, [dstq + dst_strideq*4] + lea refq, [refq + ref_strideq*4] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %endif dec block_height @@ -226,40 +226,40 @@ SECTION .text movu m1, [srcq+16] movu m4, [srcq+src_strideq*2] movu m5, [srcq+src_strideq*2+16] - mova m2, [dstq] - mova m3, [dstq+16] + mova m2, [refq] + mova m3, [refq+16] pavgw m0, m4 pavgw m1, m5 %if %2 == 1 ; avg - pavgw m0, [secq] - pavgw m1, [secq+16] + pavgw m0, [second_predq] + pavgw m1, [second_predq+16] %endif SUM_SSE m0, m2, m1, m3, m6, m7 lea srcq, [srcq + src_strideq*2] - lea dstq, [dstq + dst_strideq*2] + lea refq, [refq + ref_strideq*2] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %else ; %1 < 16 movu m0, [srcq] movu m1, [srcq+src_strideq*2] movu m5, [srcq+src_strideq*4] - mova m2, [dstq] - mova m3, [dstq+dst_strideq*2] + mova m2, [refq] + mova m3, [refq+ref_strideq*2] pavgw m0, m1 pavgw m1, m5 %if %2 == 1 ; avg - pavgw m0, [secq] - add secq, sec_str - pavgw m1, [secq] + pavgw m0, [second_predq] + add second_predq, second_str + pavgw m1, [second_predq] %endif SUM_SSE m0, m2, m1, m3, m6, m7 lea srcq, [srcq + src_strideq*4] - lea dstq, [dstq + dst_strideq*4] + lea refq, [refq + ref_strideq*4] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %endif dec block_height @@ -302,8 +302,8 @@ SECTION .text movu m1, [srcq + 16] movu m4, [srcq+src_strideq*2] movu m5, [srcq+src_strideq*2+16] - mova m2, [dstq] - mova m3, [dstq+16] + mova m2, [refq] + mova m3, [refq+16] ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). 
Total number of ; instructions is the same (5), but it is 1 mul instead of 2, so might be @@ -320,23 +320,23 @@ SECTION .text psrlw m1, 4 psrlw m0, 4 %if %2 == 1 ; avg - pavgw m0, [secq] - pavgw m1, [secq+16] + pavgw m0, [second_predq] + pavgw m1, [second_predq+16] %endif SUM_SSE m0, m2, m1, m3, m6, m7 lea srcq, [srcq + src_strideq*2] - lea dstq, [dstq + dst_strideq*2] + lea refq, [refq + ref_strideq*2] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %else ; %1 < 16 movu m0, [srcq] movu m1, [srcq+src_strideq*2] movu m5, [srcq+src_strideq*4] mova m4, m1 - mova m2, [dstq] - mova m3, [dstq+dst_strideq*2] + mova m2, [refq] + mova m3, [refq+ref_strideq*2] pmullw m1, filter_y_a pmullw m5, filter_y_b paddw m1, filter_rnd @@ -348,16 +348,16 @@ SECTION .text psrlw m1, 4 psrlw m0, 4 %if %2 == 1 ; avg - pavgw m0, [secq] - add secq, sec_str - pavgw m1, [secq] + pavgw m0, [second_predq] + add second_predq, second_str + pavgw m1, [second_predq] %endif SUM_SSE m0, m2, m1, m3, m6, m7 lea srcq, [srcq + src_strideq*4] - lea dstq, [dstq + dst_strideq*4] + lea refq, [refq + ref_strideq*4] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %endif dec block_height @@ -381,41 +381,41 @@ SECTION .text movu m1, [srcq + 16] movu m4, [srcq + 2] movu m5, [srcq + 18] - mova m2, [dstq] - mova m3, [dstq + 16] + mova m2, [refq] + mova m3, [refq + 16] pavgw m0, m4 pavgw m1, m5 %if %2 == 1 ; avg - pavgw m0, [secq] - pavgw m1, [secq+16] + pavgw m0, [second_predq] + pavgw m1, [second_predq+16] %endif SUM_SSE m0, m2, m1, m3, m6, m7 lea srcq, [srcq + src_strideq*2] - lea dstq, [dstq + dst_strideq*2] + lea refq, [refq + ref_strideq*2] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %else ; %1 < 16 movu m0, [srcq] movu m1, [srcq + src_strideq*2] movu m4, [srcq + 2] movu m5, [srcq + src_strideq*2 + 2] - mova m2, [dstq] - mova m3, [dstq + dst_strideq*2] + mova m2, [refq] + mova m3, [refq + ref_strideq*2] pavgw m0, m4 pavgw m1, m5 %if %2 == 1 ; avg - pavgw m0, [secq] - add secq, sec_str - pavgw m1, [secq] + pavgw m0, [second_predq] + add second_predq, second_str + pavgw m1, [second_predq] %endif SUM_SSE m0, m2, m1, m3, m6, m7 lea srcq, [srcq + src_strideq*4] - lea dstq, [dstq + dst_strideq*4] + lea refq, [refq + ref_strideq*4] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %endif dec block_height @@ -444,20 +444,20 @@ SECTION .text pavgw m3, m5 pavgw m0, m2 pavgw m1, m3 - mova m4, [dstq] - mova m5, [dstq + 16] + mova m4, [refq] + mova m5, [refq + 16] %if %2 == 1 ; avg - pavgw m0, [secq] - pavgw m1, [secq+16] + pavgw m0, [second_predq] + pavgw m1, [second_predq+16] %endif SUM_SSE m0, m4, m1, m5, m6, m7 mova m0, m2 mova m1, m3 lea srcq, [srcq + src_strideq*2] - lea dstq, [dstq + dst_strideq*2] + lea refq, [refq + ref_strideq*2] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %else ; %1 < 16 movu m0, [srcq] @@ -473,20 +473,20 @@ SECTION .text pavgw m3, m5 pavgw m0, m2 pavgw m2, m3 - mova m4, [dstq] - mova m5, [dstq + dst_strideq*2] + mova m4, [refq] + mova m5, [refq + ref_strideq*2] %if %2 == 1 ; avg - pavgw m0, [secq] - add secq, sec_str - pavgw m2, [secq] + pavgw m0, [second_predq] + add second_predq, second_str + pavgw m2, [second_predq] %endif SUM_SSE m0, m4, m2, m5, m6, m7 mova m0, m3 lea srcq, [srcq + src_strideq*4] - lea dstq, [dstq + dst_strideq*4] + lea refq, [refq + ref_strideq*4] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %endif dec block_height @@ -549,21 
+549,21 @@ SECTION .text paddw m0, filter_rnd psrlw m1, 4 paddw m0, m2 - mova m2, [dstq] + mova m2, [refq] psrlw m0, 4 - mova m3, [dstq+16] + mova m3, [refq+16] %if %2 == 1 ; avg - pavgw m0, [secq] - pavgw m1, [secq+16] + pavgw m0, [second_predq] + pavgw m1, [second_predq+16] %endif SUM_SSE m0, m2, m1, m3, m6, m7 mova m0, m4 mova m1, m5 lea srcq, [srcq + src_strideq*2] - lea dstq, [dstq + dst_strideq*2] + lea refq, [refq + ref_strideq*2] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %else ; %1 < 16 movu m0, [srcq] @@ -588,21 +588,21 @@ SECTION .text paddw m0, filter_rnd psrlw m4, 4 paddw m0, m2 - mova m2, [dstq] + mova m2, [refq] psrlw m0, 4 - mova m3, [dstq+dst_strideq*2] + mova m3, [refq+ref_strideq*2] %if %2 == 1 ; avg - pavgw m0, [secq] - add secq, sec_str - pavgw m4, [secq] + pavgw m0, [second_predq] + add second_predq, second_str + pavgw m4, [second_predq] %endif SUM_SSE m0, m2, m4, m3, m6, m7 mova m0, m5 lea srcq, [srcq + src_strideq*4] - lea dstq, [dstq + dst_strideq*4] + lea refq, [refq + ref_strideq*4] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %endif dec block_height @@ -651,8 +651,8 @@ SECTION .text movu m1, [srcq+16] movu m2, [srcq+2] movu m3, [srcq+18] - mova m4, [dstq] - mova m5, [dstq+16] + mova m4, [refq] + mova m5, [refq+16] pmullw m1, filter_x_a pmullw m3, filter_x_b paddw m1, filter_rnd @@ -664,23 +664,23 @@ SECTION .text psrlw m1, 4 psrlw m0, 4 %if %2 == 1 ; avg - pavgw m0, [secq] - pavgw m1, [secq+16] + pavgw m0, [second_predq] + pavgw m1, [second_predq+16] %endif SUM_SSE m0, m4, m1, m5, m6, m7 lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] + lea refq, [refq+ref_strideq*2] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %else ; %1 < 16 movu m0, [srcq] movu m1, [srcq+src_strideq*2] movu m2, [srcq+2] movu m3, [srcq+src_strideq*2+2] - mova m4, [dstq] - mova m5, [dstq+dst_strideq*2] + mova m4, [refq] + mova m5, [refq+ref_strideq*2] pmullw m1, filter_x_a pmullw m3, filter_x_b paddw m1, filter_rnd @@ -692,16 +692,16 @@ SECTION .text psrlw m1, 4 psrlw m0, 4 %if %2 == 1 ; avg - pavgw m0, [secq] - add secq, sec_str - pavgw m1, [secq] + pavgw m0, [second_predq] + add second_predq, second_str + pavgw m1, [second_predq] %endif SUM_SSE m0, m4, m1, m5, m6, m7 lea srcq, [srcq+src_strideq*4] - lea dstq, [dstq+dst_strideq*4] + lea refq, [refq+ref_strideq*4] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %endif dec block_height @@ -773,24 +773,24 @@ SECTION .text paddw m3, filter_rnd paddw m2, m4 paddw m3, m5 - mova m4, [dstq] - mova m5, [dstq+16] + mova m4, [refq] + mova m5, [refq+16] psrlw m2, 4 psrlw m3, 4 pavgw m0, m2 pavgw m1, m3 %if %2 == 1 ; avg - pavgw m0, [secq] - pavgw m1, [secq+16] + pavgw m0, [second_predq] + pavgw m1, [second_predq+16] %endif SUM_SSE m0, m4, m1, m5, m6, m7 mova m0, m2 mova m1, m3 lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] + lea refq, [refq+ref_strideq*2] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %else ; %1 < 16 movu m0, [srcq] @@ -814,24 +814,24 @@ SECTION .text paddw m3, filter_rnd paddw m2, m4 paddw m3, m5 - mova m4, [dstq] - mova m5, [dstq+dst_strideq*2] + mova m4, [refq] + mova m5, [refq+ref_strideq*2] psrlw m2, 4 psrlw m3, 4 pavgw m0, m2 pavgw m2, m3 %if %2 == 1 ; avg - pavgw m0, [secq] - add secq, sec_str - pavgw m2, [secq] + pavgw m0, [second_predq] + add second_predq, second_str + pavgw m2, [second_predq] %endif SUM_SSE m0, m4, m2, m5, m6, m7 mova m0, m3 lea 
srcq, [srcq+src_strideq*4] - lea dstq, [dstq+dst_strideq*4] + lea refq, [refq+ref_strideq*4] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %endif dec block_height @@ -929,23 +929,23 @@ SECTION .text pmullw m3, filter_y_b paddw m0, m2 paddw m1, filter_rnd - mova m2, [dstq] + mova m2, [refq] paddw m1, m3 psrlw m0, 4 psrlw m1, 4 - mova m3, [dstq+16] + mova m3, [refq+16] %if %2 == 1 ; avg - pavgw m0, [secq] - pavgw m1, [secq+16] + pavgw m0, [second_predq] + pavgw m1, [second_predq+16] %endif SUM_SSE m0, m2, m1, m3, m6, m7 mova m0, m4 mova m1, m5 INC_SRC_BY_SRC_STRIDE - lea dstq, [dstq + dst_strideq * 2] + lea refq, [refq + ref_strideq * 2] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %else ; %1 < 16 movu m0, [srcq] @@ -983,23 +983,23 @@ SECTION .text pmullw m3, filter_y_b paddw m0, m2 paddw m4, filter_rnd - mova m2, [dstq] + mova m2, [refq] paddw m4, m3 psrlw m0, 4 psrlw m4, 4 - mova m3, [dstq+dst_strideq*2] + mova m3, [refq+ref_strideq*2] %if %2 == 1 ; avg - pavgw m0, [secq] - add secq, sec_str - pavgw m4, [secq] + pavgw m0, [second_predq] + add second_predq, second_str + pavgw m4, [second_predq] %endif SUM_SSE m0, m2, m4, m3, m6, m7 mova m0, m5 INC_SRC_BY_SRC_STRIDE - lea dstq, [dstq + dst_strideq * 4] + lea refq, [refq + ref_strideq * 4] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %endif dec block_height diff --git a/vpx_dsp/x86/highbd_variance_impl_sse2.asm b/vpx_dsp/x86/highbd_variance_impl_sse2.asm index e646767..a256a59 100644 --- a/vpx_dsp/x86/highbd_variance_impl_sse2.asm +++ b/vpx_dsp/x86/highbd_variance_impl_sse2.asm @@ -16,9 +16,9 @@ SECTION .text ;unsigned int vpx_highbd_calc16x16var_sse2 ;( ; unsigned char * src_ptr, -; int source_stride, +; int src_stride, ; unsigned char * ref_ptr, -; int recon_stride, +; int ref_stride, ; unsigned int * SSE, ; int * Sum ;) @@ -36,8 +36,8 @@ sym(vpx_highbd_calc16x16var_sse2): mov rsi, arg(0) ;[src_ptr] mov rdi, arg(2) ;[ref_ptr] - movsxd rax, DWORD PTR arg(1) ;[source_stride] - movsxd rdx, DWORD PTR arg(3) ;[recon_stride] + movsxd rax, DWORD PTR arg(1) ;[src_stride] + movsxd rdx, DWORD PTR arg(3) ;[ref_stride] add rax, rax ; source stride in bytes add rdx, rdx ; recon stride in bytes @@ -169,9 +169,9 @@ sym(vpx_highbd_calc16x16var_sse2): ;unsigned int vpx_highbd_calc8x8var_sse2 ;( ; unsigned char * src_ptr, -; int source_stride, +; int src_stride, ; unsigned char * ref_ptr, -; int recon_stride, +; int ref_stride, ; unsigned int * SSE, ; int * Sum ;) @@ -189,8 +189,8 @@ sym(vpx_highbd_calc8x8var_sse2): mov rsi, arg(0) ;[src_ptr] mov rdi, arg(2) ;[ref_ptr] - movsxd rax, DWORD PTR arg(1) ;[source_stride] - movsxd rdx, DWORD PTR arg(3) ;[recon_stride] + movsxd rax, DWORD PTR arg(1) ;[src_stride] + movsxd rdx, DWORD PTR arg(3) ;[ref_stride] add rax, rax ; source stride in bytes add rdx, rdx ; recon stride in bytes diff --git a/vpx_dsp/x86/highbd_variance_sse2.c b/vpx_dsp/x86/highbd_variance_sse2.c index a6f7c3d..d08da1c 100644 --- a/vpx_dsp/x86/highbd_variance_sse2.c +++ b/vpx_dsp/x86/highbd_variance_sse2.c @@ -251,7 +251,7 @@ unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride, #define DECL(w, opt) \ int vpx_highbd_sub_pixel_variance##w##xh_##opt( \ const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ - const uint16_t *dst, ptrdiff_t dst_stride, int height, \ + const uint16_t *ref, ptrdiff_t ref_stride, int height, \ unsigned int *sse, void *unused0, void *unused); #define DECLS(opt) \ DECL(8, opt); \ @@ -265,28 
+265,28 @@ DECLS(sse2); #define FN(w, h, wf, wlog2, hlog2, opt, cast) \ uint32_t vpx_highbd_8_sub_pixel_variance##w##x##h##_##opt( \ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \ + const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr) { \ uint32_t sse; \ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ - src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL, \ + src, src_stride, x_offset, y_offset, ref, ref_stride, h, &sse, NULL, \ NULL); \ if (w > wf) { \ unsigned int sse2; \ int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \ + src + 16, src_stride, x_offset, y_offset, ref + 16, ref_stride, h, \ &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ if (w > wf * 2) { \ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \ + src + 32, src_stride, x_offset, y_offset, ref + 32, ref_stride, h, \ &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \ + src + 48, src_stride, x_offset, y_offset, ref + 48, ref_stride, h, \ &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ @@ -298,29 +298,29 @@ DECLS(sse2); \ uint32_t vpx_highbd_10_sub_pixel_variance##w##x##h##_##opt( \ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \ + const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr) { \ int64_t var; \ uint32_t sse; \ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ - src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL, \ + src, src_stride, x_offset, y_offset, ref, ref_stride, h, &sse, NULL, \ NULL); \ if (w > wf) { \ uint32_t sse2; \ int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \ + src + 16, src_stride, x_offset, y_offset, ref + 16, ref_stride, h, \ &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ if (w > wf * 2) { \ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \ + src + 32, src_stride, x_offset, y_offset, ref + 32, ref_stride, h, \ &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \ + src + 48, src_stride, x_offset, y_offset, ref + 48, ref_stride, h, \ &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ @@ -335,40 +335,40 @@ DECLS(sse2); \ uint32_t vpx_highbd_12_sub_pixel_variance##w##x##h##_##opt( \ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \ + const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr) { \ int start_row; \ uint32_t sse; \ int se = 0; \ int64_t var; \ uint64_t long_sse = 0; \ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ for (start_row = 0; start_row < h; start_row += 16) { \ uint32_t sse2; \ int height = h - 
start_row < 16 ? h - start_row : 16; \ int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ src + (start_row * src_stride), src_stride, x_offset, y_offset, \ - dst + (start_row * dst_stride), dst_stride, height, &sse2, NULL, \ + ref + (start_row * ref_stride), ref_stride, height, &sse2, NULL, \ NULL); \ se += se2; \ long_sse += sse2; \ if (w > wf) { \ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ src + 16 + (start_row * src_stride), src_stride, x_offset, \ - y_offset, dst + 16 + (start_row * dst_stride), dst_stride, height, \ + y_offset, ref + 16 + (start_row * ref_stride), ref_stride, height, \ &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ if (w > wf * 2) { \ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ src + 32 + (start_row * src_stride), src_stride, x_offset, \ - y_offset, dst + 32 + (start_row * dst_stride), dst_stride, \ + y_offset, ref + 32 + (start_row * ref_stride), ref_stride, \ height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ src + 48 + (start_row * src_stride), src_stride, x_offset, \ - y_offset, dst + 48 + (start_row * dst_stride), dst_stride, \ + y_offset, ref + 48 + (start_row * ref_stride), ref_stride, \ height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ @@ -404,8 +404,8 @@ FNS(sse2); #define DECL(w, opt) \ int vpx_highbd_sub_pixel_avg_variance##w##xh_##opt( \ const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ - const uint16_t *dst, ptrdiff_t dst_stride, const uint16_t *sec, \ - ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0, \ + const uint16_t *ref, ptrdiff_t ref_stride, const uint16_t *second, \ + ptrdiff_t second_stride, int height, unsigned int *sse, void *unused0, \ void *unused); #define DECLS(opt1) \ DECL(16, opt1) \ @@ -418,30 +418,30 @@ DECLS(sse2); #define FN(w, h, wf, wlog2, hlog2, opt, cast) \ uint32_t vpx_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt( \ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \ + const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr, \ const uint8_t *sec8) { \ uint32_t sse; \ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \ + src, src_stride, x_offset, y_offset, ref, ref_stride, sec, w, h, &sse, \ NULL, NULL); \ if (w > wf) { \ uint32_t sse2; \ int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, \ + src + 16, src_stride, x_offset, y_offset, ref + 16, ref_stride, \ sec + 16, w, h, &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ if (w > wf * 2) { \ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, \ + src + 32, src_stride, x_offset, y_offset, ref + 32, ref_stride, \ sec + 32, w, h, &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, \ + src + 48, src_stride, x_offset, y_offset, ref + 48, ref_stride, \ sec + 48, w, h, &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ @@ -453,31 +453,31 @@ DECLS(sse2); \ uint32_t vpx_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \ const uint8_t 
*src8, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \ + const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr, \ const uint8_t *sec8) { \ int64_t var; \ uint32_t sse; \ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \ + src, src_stride, x_offset, y_offset, ref, ref_stride, sec, w, h, &sse, \ NULL, NULL); \ if (w > wf) { \ uint32_t sse2; \ int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, \ + src + 16, src_stride, x_offset, y_offset, ref + 16, ref_stride, \ sec + 16, w, h, &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ if (w > wf * 2) { \ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, \ + src + 32, src_stride, x_offset, y_offset, ref + 32, ref_stride, \ sec + 32, w, h, &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, \ + src + 48, src_stride, x_offset, y_offset, ref + 48, ref_stride, \ sec + 48, w, h, &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ @@ -492,7 +492,7 @@ DECLS(sse2); \ uint32_t vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \ + const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr, \ const uint8_t *sec8) { \ int start_row; \ int64_t var; \ @@ -500,34 +500,34 @@ DECLS(sse2); int se = 0; \ uint64_t long_sse = 0; \ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ for (start_row = 0; start_row < h; start_row += 16) { \ uint32_t sse2; \ int height = h - start_row < 16 ? 
h - start_row : 16; \ int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ src + (start_row * src_stride), src_stride, x_offset, y_offset, \ - dst + (start_row * dst_stride), dst_stride, sec + (start_row * w), \ + ref + (start_row * ref_stride), ref_stride, sec + (start_row * w), \ w, height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ if (w > wf) { \ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ src + 16 + (start_row * src_stride), src_stride, x_offset, \ - y_offset, dst + 16 + (start_row * dst_stride), dst_stride, \ + y_offset, ref + 16 + (start_row * ref_stride), ref_stride, \ sec + 16 + (start_row * w), w, height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ if (w > wf * 2) { \ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ src + 32 + (start_row * src_stride), src_stride, x_offset, \ - y_offset, dst + 32 + (start_row * dst_stride), dst_stride, \ + y_offset, ref + 32 + (start_row * ref_stride), ref_stride, \ sec + 32 + (start_row * w), w, height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ src + 48 + (start_row * src_stride), src_stride, x_offset, \ - y_offset, dst + 48 + (start_row * dst_stride), dst_stride, \ + y_offset, ref + 48 + (start_row * ref_stride), ref_stride, \ sec + 48 + (start_row * w), w, height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ diff --git a/vpx_dsp/x86/subpel_variance_sse2.asm b/vpx_dsp/x86/subpel_variance_sse2.asm index 1e5f49b..5adb9b8 100644 --- a/vpx_dsp/x86/subpel_variance_sse2.asm +++ b/vpx_dsp/x86/subpel_variance_sse2.asm @@ -41,12 +41,12 @@ SECTION .text ; int vpx_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride, ; int x_offset, int y_offset, -; const uint8_t *dst, ptrdiff_t dst_stride, +; const uint8_t *ref, ptrdiff_t ref_stride, ; int height, unsigned int *sse); ; ; This function returns the SE and stores SSE in the given pointer. 
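As the header comment notes, these kernels return the sum of differences (SE) and store the sum of squared differences (SSE) through the given pointer; SUM_SSE below accumulates both for a pair of rows at a time. A scalar sketch of that per-row accumulation, purely illustrative and not part of the patch:

#include <stdint.h>

/* Accumulate sum and sse over one row of `width` pixels, the scalar
 * counterpart of one SUM_SSE invocation. */
static void sum_sse_row(const uint8_t *src, const uint8_t *ref, int width,
                        int *sum, unsigned int *sse) {
  int i;
  for (i = 0; i < width; ++i) {
    const int diff = src[i] - ref[i];
    *sum += diff;
    *sse += (unsigned int)(diff * diff);
  }
}
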
-%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse +%macro SUM_SSE 6 ; src1, ref1, src2, ref2, sum, sse psubw %3, %4 psubw %1, %2 paddw %5, %3 @@ -117,12 +117,12 @@ SECTION .text %if ARCH_X86_64 %if %2 == 1 ; avg cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \ - x_offset, y_offset, dst, dst_stride, \ - sec, sec_stride, height, sse - %define sec_str sec_strideq + x_offset, y_offset, ref, ref_stride, \ + second_pred, second_stride, height, sse + %define second_str second_strideq %else cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \ - x_offset, y_offset, dst, dst_stride, \ + x_offset, y_offset, ref, ref_stride, \ height, sse %endif %define block_height heightd @@ -131,13 +131,13 @@ SECTION .text %if CONFIG_PIC=1 %if %2 == 1 ; avg cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ - x_offset, y_offset, dst, dst_stride, \ - sec, sec_stride, height, sse + x_offset, y_offset, ref, ref_stride, \ + second_pred, second_stride, height, sse %define block_height dword heightm - %define sec_str sec_stridemp + %define second_str second_stridemp %else cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ - x_offset, y_offset, dst, dst_stride, \ + x_offset, y_offset, ref, ref_stride, \ height, sse %define block_height heightd %endif @@ -163,13 +163,13 @@ SECTION .text %if %2 == 1 ; avg cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ x_offset, y_offset, \ - dst, dst_stride, sec, sec_stride, \ + ref, ref_stride, second_pred, second_stride, \ height, sse %define block_height dword heightm - %define sec_str sec_stridemp + %define second_str second_stridemp %else cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ - x_offset, y_offset, dst, dst_stride, \ + x_offset, y_offset, ref, ref_stride, \ height, sse %define block_height heightd %endif @@ -192,7 +192,7 @@ SECTION .text %if %1 < 16 sar block_height, 1 %if %2 == 1 ; avg - shl sec_str, 1 + shl second_str, 1 %endif %endif @@ -207,9 +207,9 @@ SECTION .text .x_zero_y_zero_loop: %if %1 == 16 movu m0, [srcq] - mova m1, [dstq] + mova m1, [refq] %if %2 == 1 ; avg - pavgb m0, [secq] + pavgb m0, [second_predq] punpckhbw m3, m1, m5 punpcklbw m1, m5 %endif @@ -223,7 +223,7 @@ SECTION .text SUM_SSE m0, m1, m2, m3, m6, m7 add srcq, src_strideq - add dstq, dst_strideq + add refq, ref_strideq %else ; %1 < 16 movx m0, [srcq] %if %2 == 1 ; avg @@ -237,14 +237,14 @@ SECTION .text movx m2, [srcq+src_strideq] %endif - movx m1, [dstq] - movx m3, [dstq+dst_strideq] + movx m1, [refq] + movx m3, [refq+ref_strideq] %if %2 == 1 ; avg %if %1 > 4 - pavgb m0, [secq] + pavgb m0, [second_predq] %else - movh m2, [secq] + movh m2, [second_predq] pavgb m0, m2 %endif punpcklbw m3, m5 @@ -265,10 +265,10 @@ SECTION .text SUM_SSE m0, m1, m2, m3, m6, m7 lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] + lea refq, [refq+ref_strideq*2] %endif %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif dec block_height jg .x_zero_y_zero_loop @@ -283,11 +283,11 @@ SECTION .text %if %1 == 16 movu m0, [srcq] movu m4, [srcq+src_strideq] - mova m1, [dstq] + mova m1, [refq] pavgb m0, m4 punpckhbw m3, m1, m5 %if %2 == 1 ; avg - pavgb m0, [secq] + pavgb m0, [second_predq] %endif punpcklbw m1, m5 punpckhbw m2, m0, m5 @@ -295,7 +295,7 @@ SECTION .text SUM_SSE m0, m1, m2, m3, m6, m7 add srcq, src_strideq - add dstq, dst_strideq + add refq, ref_strideq %else ; %1 < 16 movx m0, [srcq] movx m2, [srcq+src_strideq] @@ -306,22 +306,22 @@ SECTION .text movx m1, [srcq+src_strideq*2] punpckldq m2, m1 %endif - 
movx m1, [dstq] + movx m1, [refq] %if %1 > 4 movlhps m0, m2 %else ; 4xh punpckldq m0, m2 %endif - movx m3, [dstq+dst_strideq] + movx m3, [refq+ref_strideq] pavgb m0, m2 punpcklbw m1, m5 %if %1 > 4 - pavgb m0, [secq] + pavgb m0, [second_predq] punpcklbw m3, m5 punpckhbw m2, m0, m5 punpcklbw m0, m5 %else ; 4xh - movh m4, [secq] + movh m4, [second_predq] pavgb m0, m4 punpcklbw m3, m5 punpcklbw m0, m5 @@ -329,9 +329,9 @@ SECTION .text %endif %else ; !avg movx m4, [srcq+src_strideq*2] - movx m1, [dstq] + movx m1, [refq] pavgb m0, m2 - movx m3, [dstq+dst_strideq] + movx m3, [refq+ref_strideq] pavgb m2, m4 punpcklbw m0, m5 punpcklbw m2, m5 @@ -341,10 +341,10 @@ SECTION .text SUM_SSE m0, m1, m2, m3, m6, m7 lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] + lea refq, [refq+ref_strideq*2] %endif %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif dec block_height jg .x_zero_y_half_loop @@ -386,7 +386,7 @@ SECTION .text %if %1 == 16 movu m0, [srcq] movu m4, [srcq+src_strideq] - mova m1, [dstq] + mova m1, [refq] %if cpuflag(ssse3) punpckhbw m2, m0, m4 punpcklbw m0, m4 @@ -418,7 +418,7 @@ SECTION .text %if %2 == 1 ; avg ; FIXME(rbultje) pipeline packuswb m0, m2 - pavgb m0, [secq] + pavgb m0, [second_predq] punpckhbw m2, m0, m5 punpcklbw m0, m5 %endif @@ -427,14 +427,14 @@ SECTION .text SUM_SSE m0, m1, m2, m3, m6, m7 add srcq, src_strideq - add dstq, dst_strideq + add refq, ref_strideq %else ; %1 < 16 movx m0, [srcq] movx m2, [srcq+src_strideq] movx m4, [srcq+src_strideq*2] - movx m3, [dstq+dst_strideq] + movx m3, [refq+ref_strideq] %if cpuflag(ssse3) - movx m1, [dstq] + movx m1, [refq] punpcklbw m0, m2 punpcklbw m2, m4 pmaddubsw m0, filter_y_a @@ -454,7 +454,7 @@ SECTION .text pmullw m4, filter_y_b paddw m0, m1 paddw m2, filter_rnd - movx m1, [dstq] + movx m1, [refq] paddw m2, m4 %endif psraw m0, 4 @@ -466,11 +466,11 @@ SECTION .text %endif packuswb m0, m2 %if %1 > 4 - pavgb m0, [secq] + pavgb m0, [second_predq] punpckhbw m2, m0, m5 punpcklbw m0, m5 %else ; 4xh - movh m2, [secq] + movh m2, [second_predq] pavgb m0, m2 punpcklbw m0, m5 movhlps m2, m0 @@ -480,10 +480,10 @@ SECTION .text SUM_SSE m0, m1, m2, m3, m6, m7 lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] + lea refq, [refq+ref_strideq*2] %endif %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif dec block_height jg .x_zero_y_other_loop @@ -504,11 +504,11 @@ SECTION .text %if %1 == 16 movu m0, [srcq] movu m4, [srcq+1] - mova m1, [dstq] + mova m1, [refq] pavgb m0, m4 punpckhbw m3, m1, m5 %if %2 == 1 ; avg - pavgb m0, [secq] + pavgb m0, [second_predq] %endif punpcklbw m1, m5 punpckhbw m2, m0, m5 @@ -516,7 +516,7 @@ SECTION .text SUM_SSE m0, m1, m2, m3, m6, m7 add srcq, src_strideq - add dstq, dst_strideq + add refq, ref_strideq %else ; %1 < 16 movx m0, [srcq] movx m4, [srcq+1] @@ -530,17 +530,17 @@ SECTION .text movx m2, [srcq+src_strideq+1] punpckldq m4, m2 %endif - movx m1, [dstq] - movx m3, [dstq+dst_strideq] + movx m1, [refq] + movx m3, [refq+ref_strideq] pavgb m0, m4 punpcklbw m3, m5 %if %1 > 4 - pavgb m0, [secq] + pavgb m0, [second_predq] punpcklbw m1, m5 punpckhbw m2, m0, m5 punpcklbw m0, m5 %else ; 4xh - movh m2, [secq] + movh m2, [second_predq] pavgb m0, m2 punpcklbw m1, m5 punpcklbw m0, m5 @@ -548,10 +548,10 @@ SECTION .text %endif %else ; !avg movx m2, [srcq+src_strideq] - movx m1, [dstq] + movx m1, [refq] pavgb m0, m4 movx m4, [srcq+src_strideq+1] - movx m3, [dstq+dst_strideq] + movx m3, [refq+ref_strideq] pavgb m2, m4 punpcklbw m0, m5 punpcklbw m2, m5 @@ 
-561,10 +561,10 @@ SECTION .text SUM_SSE m0, m1, m2, m3, m6, m7 lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] + lea refq, [refq+ref_strideq*2] %endif %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif dec block_height jg .x_half_y_zero_loop @@ -583,13 +583,13 @@ SECTION .text .x_half_y_half_loop: movu m4, [srcq] movu m3, [srcq+1] - mova m1, [dstq] + mova m1, [refq] pavgb m4, m3 punpckhbw m3, m1, m5 pavgb m0, m4 %if %2 == 1 ; avg punpcklbw m1, m5 - pavgb m0, [secq] + pavgb m0, [second_predq] punpckhbw m2, m0, m5 punpcklbw m0, m5 %else @@ -601,7 +601,7 @@ SECTION .text mova m0, m4 add srcq, src_strideq - add dstq, dst_strideq + add refq, ref_strideq %else ; %1 < 16 movx m0, [srcq] movx m3, [srcq+1] @@ -628,13 +628,13 @@ SECTION .text punpckldq m0, m2 pshuflw m4, m2, 0xe %endif - movx m1, [dstq] + movx m1, [refq] pavgb m0, m2 - movx m3, [dstq+dst_strideq] + movx m3, [refq+ref_strideq] %if %1 > 4 - pavgb m0, [secq] + pavgb m0, [second_predq] %else - movh m2, [secq] + movh m2, [second_predq] pavgb m0, m2 %endif punpcklbw m3, m5 @@ -653,8 +653,8 @@ SECTION .text pavgb m4, m1 pavgb m0, m2 pavgb m2, m4 - movx m1, [dstq] - movx m3, [dstq+dst_strideq] + movx m1, [refq] + movx m3, [refq+ref_strideq] punpcklbw m0, m5 punpcklbw m2, m5 punpcklbw m3, m5 @@ -664,10 +664,10 @@ SECTION .text mova m0, m4 lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] + lea refq, [refq+ref_strideq*2] %endif %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif dec block_height jg .x_half_y_half_loop @@ -713,7 +713,7 @@ SECTION .text .x_half_y_other_loop: movu m4, [srcq] movu m2, [srcq+1] - mova m1, [dstq] + mova m1, [refq] pavgb m4, m2 %if cpuflag(ssse3) punpckhbw m2, m0, m4 @@ -743,7 +743,7 @@ SECTION .text %if %2 == 1 ; avg ; FIXME(rbultje) pipeline packuswb m0, m2 - pavgb m0, [secq] + pavgb m0, [second_predq] punpckhbw m2, m0, m5 punpcklbw m0, m5 %endif @@ -752,7 +752,7 @@ SECTION .text mova m0, m4 add srcq, src_strideq - add dstq, dst_strideq + add refq, ref_strideq %else ; %1 < 16 movx m0, [srcq] movx m3, [srcq+1] @@ -768,9 +768,9 @@ SECTION .text movx m3, [srcq+src_strideq+1] pavgb m2, m1 pavgb m4, m3 - movx m3, [dstq+dst_strideq] + movx m3, [refq+ref_strideq] %if cpuflag(ssse3) - movx m1, [dstq] + movx m1, [refq] punpcklbw m0, m2 punpcklbw m2, m4 pmaddubsw m0, filter_y_a @@ -790,7 +790,7 @@ SECTION .text pmullw m1, m4, filter_y_b paddw m2, filter_rnd paddw m2, m1 - movx m1, [dstq] + movx m1, [refq] %endif psraw m0, 4 psraw m2, 4 @@ -801,11 +801,11 @@ SECTION .text %endif packuswb m0, m2 %if %1 > 4 - pavgb m0, [secq] + pavgb m0, [second_predq] punpckhbw m2, m0, m5 punpcklbw m0, m5 %else - movh m2, [secq] + movh m2, [second_predq] pavgb m0, m2 punpcklbw m0, m5 movhlps m2, m0 @@ -816,10 +816,10 @@ SECTION .text mova m0, m4 lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] + lea refq, [refq+ref_strideq*2] %endif %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif dec block_height jg .x_half_y_other_loop @@ -867,7 +867,7 @@ SECTION .text %if %1 == 16 movu m0, [srcq] movu m4, [srcq+1] - mova m1, [dstq] + mova m1, [refq] %if cpuflag(ssse3) punpckhbw m2, m0, m4 punpcklbw m0, m4 @@ -894,7 +894,7 @@ SECTION .text %if %2 == 1 ; avg ; FIXME(rbultje) pipeline packuswb m0, m2 - pavgb m0, [secq] + pavgb m0, [second_predq] punpckhbw m2, m0, m5 punpcklbw m0, m5 %endif @@ -903,16 +903,16 @@ SECTION .text SUM_SSE m0, m1, m2, m3, m6, m7 add srcq, src_strideq - add dstq, dst_strideq + add refq, ref_strideq %else ; %1 < 16 
movx m0, [srcq] movx m1, [srcq+1] movx m2, [srcq+src_strideq] movx m4, [srcq+src_strideq+1] - movx m3, [dstq+dst_strideq] + movx m3, [refq+ref_strideq] %if cpuflag(ssse3) punpcklbw m0, m1 - movx m1, [dstq] + movx m1, [refq] punpcklbw m2, m4 pmaddubsw m0, filter_x_a pmaddubsw m2, filter_x_a @@ -932,7 +932,7 @@ SECTION .text pmullw m4, filter_x_b paddw m0, m1 paddw m2, filter_rnd - movx m1, [dstq] + movx m1, [refq] paddw m2, m4 %endif psraw m0, 4 @@ -944,11 +944,11 @@ SECTION .text %endif packuswb m0, m2 %if %1 > 4 - pavgb m0, [secq] + pavgb m0, [second_predq] punpckhbw m2, m0, m5 punpcklbw m0, m5 %else - movh m2, [secq] + movh m2, [second_predq] pavgb m0, m2 punpcklbw m0, m5 movhlps m2, m0 @@ -958,10 +958,10 @@ SECTION .text SUM_SSE m0, m1, m2, m3, m6, m7 lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] + lea refq, [refq+ref_strideq*2] %endif %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif dec block_height jg .x_other_y_zero_loop @@ -1037,7 +1037,7 @@ SECTION .text movu m4, [srcq] movu m3, [srcq+1] %if cpuflag(ssse3) - mova m1, [dstq] + mova m1, [refq] punpckhbw m2, m4, m3 punpcklbw m4, m3 pmaddubsw m2, filter_x_a @@ -1063,7 +1063,7 @@ SECTION .text paddw m2, filter_rnd paddw m4, m3 paddw m2, m1 - mova m1, [dstq] + mova m1, [refq] psraw m4, 4 psraw m2, 4 punpckhbw m3, m1, m5 @@ -1077,7 +1077,7 @@ SECTION .text %endif %if %2 == 1 ; avg ; FIXME(rbultje) pipeline - pavgb m0, [secq] + pavgb m0, [second_predq] %endif punpckhbw m2, m0, m5 punpcklbw m0, m5 @@ -1085,7 +1085,7 @@ SECTION .text mova m0, m4 add srcq, src_strideq - add dstq, dst_strideq + add refq, ref_strideq %else ; %1 < 16 movx m0, [srcq] movx m1, [srcq+1] @@ -1113,8 +1113,8 @@ SECTION .text punpcklbw m4, m3 pmaddubsw m2, filter_x_a pmaddubsw m4, filter_x_a - movx m1, [dstq] - movx m3, [dstq+dst_strideq] + movx m1, [refq] + movx m3, [refq+ref_strideq] paddw m2, filter_rnd paddw m4, filter_rnd %else @@ -1129,9 +1129,9 @@ SECTION .text pmullw m3, filter_x_b paddw m4, filter_rnd paddw m2, m1 - movx m1, [dstq] + movx m1, [refq] paddw m4, m3 - movx m3, [dstq+dst_strideq] + movx m3, [refq+ref_strideq] %endif psraw m2, 4 psraw m4, 4 @@ -1144,11 +1144,11 @@ SECTION .text %endif packuswb m0, m2 %if %1 > 4 - pavgb m0, [secq] + pavgb m0, [second_predq] punpckhbw m2, m0, m5 punpcklbw m0, m5 %else - movh m2, [secq] + movh m2, [second_predq] pavgb m0, m2 punpcklbw m0, m5 movhlps m2, m0 @@ -1160,10 +1160,10 @@ SECTION .text mova m0, m4 lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] + lea refq, [refq+ref_strideq*2] %endif %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif dec block_height jg .x_other_y_half_loop @@ -1254,7 +1254,7 @@ SECTION .text %if cpuflag(ssse3) movu m4, [srcq] movu m3, [srcq+1] - mova m1, [dstq] + mova m1, [refq] punpckhbw m2, m4, m3 punpcklbw m4, m3 pmaddubsw m2, filter_x_a @@ -1300,7 +1300,7 @@ SECTION .text pmullw m0, filter_y_a pmullw m3, filter_y_b paddw m2, m1 - mova m1, [dstq] + mova m1, [refq] paddw m0, filter_rnd psraw m2, 4 paddw m0, m3 @@ -1311,7 +1311,7 @@ SECTION .text %if %2 == 1 ; avg ; FIXME(rbultje) pipeline packuswb m0, m2 - pavgb m0, [secq] + pavgb m0, [second_predq] punpckhbw m2, m0, m5 punpcklbw m0, m5 %endif @@ -1319,7 +1319,7 @@ SECTION .text mova m0, m4 INC_SRC_BY_SRC_STRIDE - add dstq, dst_strideq + add refq, ref_strideq %else ; %1 < 16 movx m0, [srcq] movx m1, [srcq+1] @@ -1355,8 +1355,8 @@ SECTION .text punpcklbw m4, m3 pmaddubsw m2, filter_x_a pmaddubsw m4, filter_x_a - movx m3, [dstq+dst_strideq] - movx m1, [dstq] 
+ movx m3, [refq+ref_strideq] + movx m1, [refq] paddw m2, filter_rnd paddw m4, filter_rnd psraw m2, 4 @@ -1395,9 +1395,9 @@ SECTION .text pmullw m1, m4, filter_y_b paddw m2, filter_rnd paddw m0, m3 - movx m3, [dstq+dst_strideq] + movx m3, [refq+ref_strideq] paddw m2, m1 - movx m1, [dstq] + movx m1, [refq] psraw m0, 4 psraw m2, 4 punpcklbw m3, m5 @@ -1410,11 +1410,11 @@ SECTION .text %endif packuswb m0, m2 %if %1 > 4 - pavgb m0, [secq] + pavgb m0, [second_predq] punpckhbw m2, m0, m5 punpcklbw m0, m5 %else - movh m2, [secq] + movh m2, [second_predq] pavgb m0, m2 punpcklbw m0, m5 movhlps m2, m0 @@ -1424,10 +1424,10 @@ SECTION .text mova m0, m4 INC_SRC_BY_SRC_STRIDE - lea dstq, [dstq+dst_strideq*2] + lea refq, [refq+ref_strideq*2] %endif %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif dec block_height jg .x_other_y_other_loop diff --git a/vpx_dsp/x86/variance_avx2.c b/vpx_dsp/x86/variance_avx2.c index d938b81..9232acb 100644 --- a/vpx_dsp/x86/variance_avx2.c +++ b/vpx_dsp/x86/variance_avx2.c @@ -164,11 +164,11 @@ static INLINE void variance64_avx2(const uint8_t *src, const int src_stride, } } -void vpx_get16x16var_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, unsigned int *sse, - int *sum) { +void vpx_get16x16var_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { __m256i vsse, vsum; - variance16_avx2(src, src_stride, ref, ref_stride, 16, &vsse, &vsum); + variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum); variance_final_from_16bit_sum_avx2(vsse, vsum, sse, sum); } @@ -224,8 +224,9 @@ void vpx_get16x16var_avx2(const uint8_t *src, int src_stride, static INLINE void spv32_x0_y0(const uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride, - const uint8_t *sec, int sec_stride, int do_sec, - int height, __m256i *sum_reg, __m256i *sse_reg) { + const uint8_t *second_pred, int second_stride, + int do_sec, int height, __m256i *sum_reg, + __m256i *sse_reg) { const __m256i zero_reg = _mm256_setzero_si256(); __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; int i; @@ -233,11 +234,11 @@ static INLINE void spv32_x0_y0(const uint8_t *src, int src_stride, const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst); const __m256i src_reg = _mm256_loadu_si256((__m256i const *)src); if (do_sec) { - const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)sec); + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred); const __m256i avg_reg = _mm256_avg_epu8(src_reg, sec_reg); exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg); exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg); - sec += sec_stride; + second_pred += second_stride; } else { exp_src_lo = _mm256_unpacklo_epi8(src_reg, zero_reg); exp_src_hi = _mm256_unpackhi_epi8(src_reg, zero_reg); @@ -251,9 +252,10 @@ static INLINE void spv32_x0_y0(const uint8_t *src, int src_stride, // (x == 0, y == 4) or (x == 4, y == 0). sstep determines the direction. 
static INLINE void spv32_half_zero(const uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride, - const uint8_t *sec, int sec_stride, - int do_sec, int height, __m256i *sum_reg, - __m256i *sse_reg, int sstep) { + const uint8_t *second_pred, + int second_stride, int do_sec, int height, + __m256i *sum_reg, __m256i *sse_reg, + int sstep) { const __m256i zero_reg = _mm256_setzero_si256(); __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; int i; @@ -263,11 +265,11 @@ static INLINE void spv32_half_zero(const uint8_t *src, int src_stride, const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + sstep)); const __m256i src_avg = _mm256_avg_epu8(src_0, src_1); if (do_sec) { - const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)sec); + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred); const __m256i avg_reg = _mm256_avg_epu8(src_avg, sec_reg); exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg); exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg); - sec += sec_stride; + second_pred += second_stride; } else { exp_src_lo = _mm256_unpacklo_epi8(src_avg, zero_reg); exp_src_hi = _mm256_unpackhi_epi8(src_avg, zero_reg); @@ -280,24 +282,27 @@ static INLINE void spv32_half_zero(const uint8_t *src, int src_stride, static INLINE void spv32_x0_y4(const uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride, - const uint8_t *sec, int sec_stride, int do_sec, - int height, __m256i *sum_reg, __m256i *sse_reg) { - spv32_half_zero(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, - height, sum_reg, sse_reg, src_stride); + const uint8_t *second_pred, int second_stride, + int do_sec, int height, __m256i *sum_reg, + __m256i *sse_reg) { + spv32_half_zero(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, sum_reg, sse_reg, src_stride); } static INLINE void spv32_x4_y0(const uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride, - const uint8_t *sec, int sec_stride, int do_sec, - int height, __m256i *sum_reg, __m256i *sse_reg) { - spv32_half_zero(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, - height, sum_reg, sse_reg, 1); + const uint8_t *second_pred, int second_stride, + int do_sec, int height, __m256i *sum_reg, + __m256i *sse_reg) { + spv32_half_zero(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, sum_reg, sse_reg, 1); } static INLINE void spv32_x4_y4(const uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride, - const uint8_t *sec, int sec_stride, int do_sec, - int height, __m256i *sum_reg, __m256i *sse_reg) { + const uint8_t *second_pred, int second_stride, + int do_sec, int height, __m256i *sum_reg, + __m256i *sse_reg) { const __m256i zero_reg = _mm256_setzero_si256(); const __m256i src_a = _mm256_loadu_si256((__m256i const *)src); const __m256i src_b = _mm256_loadu_si256((__m256i const *)(src + 1)); @@ -314,11 +319,11 @@ static INLINE void spv32_x4_y4(const uint8_t *src, int src_stride, prev_src_avg = src_avg; if (do_sec) { - const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)sec); + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred); const __m256i avg_reg = _mm256_avg_epu8(current_avg, sec_reg); exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg); exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg); - sec += sec_stride; + second_pred += second_stride; } else { exp_src_lo = _mm256_unpacklo_epi8(current_avg, zero_reg); exp_src_hi = _mm256_unpackhi_epi8(current_avg, zero_reg); @@ -333,9 +338,10 @@ static INLINE void 
spv32_x4_y4(const uint8_t *src, int src_stride, // (x == 0, y == bil) or (x == 4, y == bil). sstep determines the direction. static INLINE void spv32_bilin_zero(const uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride, - const uint8_t *sec, int sec_stride, - int do_sec, int height, __m256i *sum_reg, - __m256i *sse_reg, int offset, int sstep) { + const uint8_t *second_pred, + int second_stride, int do_sec, int height, + __m256i *sum_reg, __m256i *sse_reg, + int offset, int sstep) { const __m256i zero_reg = _mm256_setzero_si256(); const __m256i pw8 = _mm256_set1_epi16(8); const __m256i filter = _mm256_load_si256( @@ -351,10 +357,10 @@ static INLINE void spv32_bilin_zero(const uint8_t *src, int src_stride, FILTER_SRC(filter) if (do_sec) { - const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)sec); + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred); const __m256i exp_src = _mm256_packus_epi16(exp_src_lo, exp_src_hi); const __m256i avg_reg = _mm256_avg_epu8(exp_src, sec_reg); - sec += sec_stride; + second_pred += second_stride; exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg); exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg); } @@ -366,27 +372,27 @@ static INLINE void spv32_bilin_zero(const uint8_t *src, int src_stride, static INLINE void spv32_x0_yb(const uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride, - const uint8_t *sec, int sec_stride, int do_sec, - int height, __m256i *sum_reg, __m256i *sse_reg, - int y_offset) { - spv32_bilin_zero(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, - height, sum_reg, sse_reg, y_offset, src_stride); + const uint8_t *second_pred, int second_stride, + int do_sec, int height, __m256i *sum_reg, + __m256i *sse_reg, int y_offset) { + spv32_bilin_zero(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, sum_reg, sse_reg, y_offset, src_stride); } static INLINE void spv32_xb_y0(const uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride, - const uint8_t *sec, int sec_stride, int do_sec, - int height, __m256i *sum_reg, __m256i *sse_reg, - int x_offset) { - spv32_bilin_zero(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, - height, sum_reg, sse_reg, x_offset, 1); + const uint8_t *second_pred, int second_stride, + int do_sec, int height, __m256i *sum_reg, + __m256i *sse_reg, int x_offset) { + spv32_bilin_zero(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, sum_reg, sse_reg, x_offset, 1); } static INLINE void spv32_x4_yb(const uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride, - const uint8_t *sec, int sec_stride, int do_sec, - int height, __m256i *sum_reg, __m256i *sse_reg, - int y_offset) { + const uint8_t *second_pred, int second_stride, + int do_sec, int height, __m256i *sum_reg, + __m256i *sse_reg, int y_offset) { const __m256i zero_reg = _mm256_setzero_si256(); const __m256i pw8 = _mm256_set1_epi16(8); const __m256i filter = _mm256_load_si256( @@ -408,12 +414,12 @@ static INLINE void spv32_x4_yb(const uint8_t *src, int src_stride, FILTER_SRC(filter) if (do_sec) { - const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)sec); + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred); const __m256i exp_src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); const __m256i avg_reg = _mm256_avg_epu8(exp_src_avg, sec_reg); exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg); exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg); - sec += sec_stride; + second_pred += 
second_stride; } CALC_SUM_SSE_INSIDE_LOOP dst += dst_stride; @@ -423,9 +429,9 @@ static INLINE void spv32_x4_yb(const uint8_t *src, int src_stride, static INLINE void spv32_xb_y4(const uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride, - const uint8_t *sec, int sec_stride, int do_sec, - int height, __m256i *sum_reg, __m256i *sse_reg, - int x_offset) { + const uint8_t *second_pred, int second_stride, + int do_sec, int height, __m256i *sum_reg, + __m256i *sse_reg, int x_offset) { const __m256i zero_reg = _mm256_setzero_si256(); const __m256i pw8 = _mm256_set1_epi16(8); const __m256i filter = _mm256_load_si256( @@ -456,11 +462,11 @@ static INLINE void spv32_xb_y4(const uint8_t *src, int src_stride, src_pack = _mm256_avg_epu8(src_pack, src_reg); if (do_sec) { - const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)sec); + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred); const __m256i avg_pack = _mm256_avg_epu8(src_pack, sec_reg); exp_src_lo = _mm256_unpacklo_epi8(avg_pack, zero_reg); exp_src_hi = _mm256_unpackhi_epi8(avg_pack, zero_reg); - sec += sec_stride; + second_pred += second_stride; } else { exp_src_lo = _mm256_unpacklo_epi8(src_pack, zero_reg); exp_src_hi = _mm256_unpackhi_epi8(src_pack, zero_reg); @@ -474,9 +480,9 @@ static INLINE void spv32_xb_y4(const uint8_t *src, int src_stride, static INLINE void spv32_xb_yb(const uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride, - const uint8_t *sec, int sec_stride, int do_sec, - int height, __m256i *sum_reg, __m256i *sse_reg, - int x_offset, int y_offset) { + const uint8_t *second_pred, int second_stride, + int do_sec, int height, __m256i *sum_reg, + __m256i *sse_reg, int x_offset, int y_offset) { const __m256i zero_reg = _mm256_setzero_si256(); const __m256i pw8 = _mm256_set1_epi16(8); const __m256i xfilter = _mm256_load_si256( @@ -511,12 +517,12 @@ static INLINE void spv32_xb_yb(const uint8_t *src, int src_stride, FILTER_SRC(yfilter) if (do_sec) { - const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)sec); + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred); const __m256i exp_src = _mm256_packus_epi16(exp_src_lo, exp_src_hi); const __m256i avg_reg = _mm256_avg_epu8(exp_src, sec_reg); exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg); exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg); - sec += sec_stride; + second_pred += second_stride; } prev_src_pack = src_pack; @@ -530,7 +536,7 @@ static INLINE void spv32_xb_yb(const uint8_t *src, int src_stride, static INLINE int sub_pix_var32xh(const uint8_t *src, int src_stride, int x_offset, int y_offset, const uint8_t *dst, int dst_stride, - const uint8_t *sec, int sec_stride, + const uint8_t *second_pred, int second_stride, int do_sec, int height, unsigned int *sse) { const __m256i zero_reg = _mm256_setzero_si256(); __m256i sum_reg = _mm256_setzero_si256(); @@ -540,44 +546,44 @@ static INLINE int sub_pix_var32xh(const uint8_t *src, int src_stride, // x_offset = 0 and y_offset = 0 if (x_offset == 0) { if (y_offset == 0) { - spv32_x0_y0(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, - height, &sum_reg, &sse_reg); + spv32_x0_y0(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, &sum_reg, &sse_reg); // x_offset = 0 and y_offset = 4 } else if (y_offset == 4) { - spv32_x0_y4(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, - height, &sum_reg, &sse_reg); + spv32_x0_y4(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, &sum_reg, 
&sse_reg); // x_offset = 0 and y_offset = bilin interpolation } else { - spv32_x0_yb(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, - height, &sum_reg, &sse_reg, y_offset); + spv32_x0_yb(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, &sum_reg, &sse_reg, y_offset); } // x_offset = 4 and y_offset = 0 } else if (x_offset == 4) { if (y_offset == 0) { - spv32_x4_y0(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, - height, &sum_reg, &sse_reg); + spv32_x4_y0(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, &sum_reg, &sse_reg); // x_offset = 4 and y_offset = 4 } else if (y_offset == 4) { - spv32_x4_y4(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, - height, &sum_reg, &sse_reg); + spv32_x4_y4(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, &sum_reg, &sse_reg); // x_offset = 4 and y_offset = bilin interpolation } else { - spv32_x4_yb(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, - height, &sum_reg, &sse_reg, y_offset); + spv32_x4_yb(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, &sum_reg, &sse_reg, y_offset); } // x_offset = bilin interpolation and y_offset = 0 } else { if (y_offset == 0) { - spv32_xb_y0(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, - height, &sum_reg, &sse_reg, x_offset); + spv32_xb_y0(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, &sum_reg, &sse_reg, x_offset); // x_offset = bilin interpolation and y_offset = 4 } else if (y_offset == 4) { - spv32_xb_y4(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, - height, &sum_reg, &sse_reg, x_offset); + spv32_xb_y4(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, &sum_reg, &sse_reg, x_offset); // x_offset = bilin interpolation and y_offset = bilin interpolation } else { - spv32_xb_yb(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, - height, &sum_reg, &sse_reg, x_offset, y_offset); + spv32_xb_yb(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, &sum_reg, &sse_reg, x_offset, y_offset); } } CALC_SUM_AND_SSE @@ -593,63 +599,63 @@ static unsigned int sub_pixel_variance32xh_avx2( static unsigned int sub_pixel_avg_variance32xh_avx2( const uint8_t *src, int src_stride, int x_offset, int y_offset, - const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride, - int height, unsigned int *sse) { + const uint8_t *dst, int dst_stride, const uint8_t *second_pred, + int second_stride, int height, unsigned int *sse) { return sub_pix_var32xh(src, src_stride, x_offset, y_offset, dst, dst_stride, - sec, sec_stride, 1, height, sse); + second_pred, second_stride, 1, height, sse); } -typedef void (*get_var_avx2)(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +typedef void (*get_var_avx2)(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -unsigned int vpx_variance16x8_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_variance16x8_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { int sum; __m256i vsse, vsum; - variance16_avx2(src, src_stride, ref, ref_stride, 8, &vsse, &vsum); + variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum); variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); return *sse - (uint32_t)(((int64_t)sum * sum) >> 7); } 
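The final subtraction implements variance = SSE - Sum^2 / N, with N the pixel count of the block; since N is a power of two, the division is a right shift by log2(w * h) (16x8 -> >> 7, 16x16 -> >> 8, 32x32 -> >> 10, 64x64 -> >> 12, as in the functions that follow). A small sketch of that shared tail computation, using an illustrative helper name:

#include <stdint.h>

/* variance = sse - sum*sum / 2^log2_pixels, where log2_pixels = log2(w * h). */
static uint32_t variance_from_sums(uint32_t sse, int sum, int log2_pixels) {
  return sse - (uint32_t)(((int64_t)sum * sum) >> log2_pixels);
}
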
-unsigned int vpx_variance16x16_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_variance16x16_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { int sum; __m256i vsse, vsum; - variance16_avx2(src, src_stride, ref, ref_stride, 16, &vsse, &vsum); + variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum); variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); return *sse - (uint32_t)(((int64_t)sum * sum) >> 8); } -unsigned int vpx_variance16x32_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_variance16x32_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { int sum; __m256i vsse, vsum; - variance16_avx2(src, src_stride, ref, ref_stride, 32, &vsse, &vsum); + variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 32, &vsse, &vsum); variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); return *sse - (uint32_t)(((int64_t)sum * sum) >> 9); } -unsigned int vpx_variance32x16_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_variance32x16_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { int sum; __m256i vsse, vsum; - variance32_avx2(src, src_stride, ref, ref_stride, 16, &vsse, &vsum); + variance32_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum); variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); return *sse - (uint32_t)(((int64_t)sum * sum) >> 9); } -unsigned int vpx_variance32x32_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { int sum; __m256i vsse, vsum; __m128i vsum_128; - variance32_avx2(src, src_stride, ref, ref_stride, 32, &vsse, &vsum); + variance32_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 32, &vsse, &vsum); vsum_128 = _mm_add_epi16(_mm256_castsi256_si128(vsum), _mm256_extractf128_si256(vsum, 1)); vsum_128 = _mm_add_epi32(_mm_cvtepi16_epi32(vsum_128), @@ -658,13 +664,13 @@ unsigned int vpx_variance32x32_avx2(const uint8_t *src, int src_stride, return *sse - (uint32_t)(((int64_t)sum * sum) >> 10); } -unsigned int vpx_variance32x64_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_variance32x64_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { int sum; __m256i vsse, vsum; __m128i vsum_128; - variance32_avx2(src, src_stride, ref, ref_stride, 64, &vsse, &vsum); + variance32_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64, &vsse, &vsum); vsum = sum_to_32bit_avx2(vsum); vsum_128 = _mm_add_epi32(_mm256_castsi256_si128(vsum), _mm256_extractf128_si256(vsum, 1)); @@ -672,14 +678,14 @@ unsigned int vpx_variance32x64_avx2(const uint8_t *src, int src_stride, return *sse - (uint32_t)(((int64_t)sum * sum) >> 11); } -unsigned int vpx_variance64x32_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_variance64x32_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { __m256i vsse = _mm256_setzero_si256(); __m256i vsum = _mm256_setzero_si256(); __m128i vsum_128; int sum; - variance64_avx2(src, src_stride, ref, ref_stride, 32, &vsse, &vsum); + variance64_avx2(src_ptr, 
src_stride, ref_ptr, ref_stride, 32, &vsse, &vsum); vsum = sum_to_32bit_avx2(vsum); vsum_128 = _mm_add_epi32(_mm256_castsi256_si128(vsum), _mm256_extractf128_si256(vsum, 1)); @@ -687,8 +693,8 @@ unsigned int vpx_variance64x32_avx2(const uint8_t *src, int src_stride, return *sse - (uint32_t)(((int64_t)sum * sum) >> 11); } -unsigned int vpx_variance64x64_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { __m256i vsse = _mm256_setzero_si256(); __m256i vsum = _mm256_setzero_si256(); @@ -698,8 +704,9 @@ unsigned int vpx_variance64x64_avx2(const uint8_t *src, int src_stride, for (i = 0; i < 2; i++) { __m256i vsum16; - variance64_avx2(src + 32 * i * src_stride, src_stride, - ref + 32 * i * ref_stride, ref_stride, 32, &vsse, &vsum16); + variance64_avx2(src_ptr + 32 * i * src_stride, src_stride, + ref_ptr + 32 * i * ref_stride, ref_stride, 32, &vsse, + &vsum16); vsum = _mm256_add_epi32(vsum, sum_to_32bit_avx2(vsum16)); } vsum_128 = _mm_add_epi32(_mm256_castsi256_si128(vsum), @@ -708,63 +715,61 @@ unsigned int vpx_variance64x64_avx2(const uint8_t *src, int src_stride, return *sse - (unsigned int)(((int64_t)sum * sum) >> 12); } -unsigned int vpx_mse16x8_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_mse16x8_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { int sum; __m256i vsse, vsum; - variance16_avx2(src, src_stride, ref, ref_stride, 8, &vsse, &vsum); + variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum); variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); return *sse; } -unsigned int vpx_mse16x16_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_mse16x16_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { int sum; __m256i vsse, vsum; - variance16_avx2(src, src_stride, ref, ref_stride, 16, &vsse, &vsum); + variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum); variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); return *sse; } -unsigned int vpx_sub_pixel_variance64x64_avx2(const uint8_t *src, - int src_stride, int x_offset, - int y_offset, const uint8_t *dst, - int dst_stride, - unsigned int *sse) { +unsigned int vpx_sub_pixel_variance64x64_avx2( + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { unsigned int sse1; const int se1 = sub_pixel_variance32xh_avx2( - src, src_stride, x_offset, y_offset, dst, dst_stride, 64, &sse1); + src_ptr, src_stride, x_offset, y_offset, ref_ptr, ref_stride, 64, &sse1); unsigned int sse2; const int se2 = - sub_pixel_variance32xh_avx2(src + 32, src_stride, x_offset, y_offset, - dst + 32, dst_stride, 64, &sse2); + sub_pixel_variance32xh_avx2(src_ptr + 32, src_stride, x_offset, y_offset, + ref_ptr + 32, ref_stride, 64, &sse2); const int se = se1 + se2; *sse = sse1 + sse2; return *sse - (uint32_t)(((int64_t)se * se) >> 12); } -unsigned int vpx_sub_pixel_variance32x32_avx2(const uint8_t *src, - int src_stride, int x_offset, - int y_offset, const uint8_t *dst, - int dst_stride, - unsigned int *sse) { +unsigned int vpx_sub_pixel_variance32x32_avx2( + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, + const uint8_t *ref_ptr, int ref_stride, 
unsigned int *sse) { const int se = sub_pixel_variance32xh_avx2( - src, src_stride, x_offset, y_offset, dst, dst_stride, 32, sse); + src_ptr, src_stride, x_offset, y_offset, ref_ptr, ref_stride, 32, sse); return *sse - (uint32_t)(((int64_t)se * se) >> 10); } unsigned int vpx_sub_pixel_avg_variance64x64_avx2( - const uint8_t *src, int src_stride, int x_offset, int y_offset, - const uint8_t *dst, int dst_stride, unsigned int *sse, const uint8_t *sec) { + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, + const uint8_t *second_pred) { unsigned int sse1; - const int se1 = sub_pixel_avg_variance32xh_avx2( - src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 64, 64, &sse1); + const int se1 = sub_pixel_avg_variance32xh_avx2(src_ptr, src_stride, x_offset, + y_offset, ref_ptr, ref_stride, + second_pred, 64, 64, &sse1); unsigned int sse2; const int se2 = sub_pixel_avg_variance32xh_avx2( - src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, sec + 32, - 64, 64, &sse2); + src_ptr + 32, src_stride, x_offset, y_offset, ref_ptr + 32, ref_stride, + second_pred + 32, 64, 64, &sse2); const int se = se1 + se2; *sse = sse1 + sse2; @@ -773,10 +778,12 @@ unsigned int vpx_sub_pixel_avg_variance64x64_avx2( } unsigned int vpx_sub_pixel_avg_variance32x32_avx2( - const uint8_t *src, int src_stride, int x_offset, int y_offset, - const uint8_t *dst, int dst_stride, unsigned int *sse, const uint8_t *sec) { + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, + const uint8_t *second_pred) { // Process 32 elements in parallel. - const int se = sub_pixel_avg_variance32xh_avx2( - src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 32, 32, sse); + const int se = sub_pixel_avg_variance32xh_avx2(src_ptr, src_stride, x_offset, + y_offset, ref_ptr, ref_stride, + second_pred, 32, 32, sse); return *sse - (uint32_t)(((int64_t)se * se) >> 10); } diff --git a/vpx_dsp/x86/variance_sse2.c b/vpx_dsp/x86/variance_sse2.c index 0279052..37ef64e 100644 --- a/vpx_dsp/x86/variance_sse2.c +++ b/vpx_dsp/x86/variance_sse2.c @@ -22,14 +22,14 @@ static INLINE unsigned int add32x4_sse2(__m128i val) { return _mm_cvtsi128_si32(val); } -unsigned int vpx_get_mb_ss_sse2(const int16_t *src) { +unsigned int vpx_get_mb_ss_sse2(const int16_t *src_ptr) { __m128i vsum = _mm_setzero_si128(); int i; for (i = 0; i < 32; ++i) { - const __m128i v = _mm_loadu_si128((const __m128i *)src); + const __m128i v = _mm_loadu_si128((const __m128i *)src_ptr); vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v)); - src += 8; + src_ptr += 8; } return add32x4_sse2(vsum); @@ -42,10 +42,11 @@ static INLINE __m128i load4x2_sse2(const uint8_t *const p, const int stride) { return _mm_unpacklo_epi8(p01, _mm_setzero_si128()); } -static INLINE void variance_kernel_sse2(const __m128i src, const __m128i ref, +static INLINE void variance_kernel_sse2(const __m128i src_ptr, + const __m128i ref_ptr, __m128i *const sse, __m128i *const sum) { - const __m128i diff = _mm_sub_epi16(src, ref); + const __m128i diff = _mm_sub_epi16(src_ptr, ref_ptr); *sse = _mm_add_epi32(*sse, _mm_madd_epi16(diff, diff)); *sum = _mm_add_epi16(*sum, diff); } @@ -99,8 +100,8 @@ static INLINE int sum_final_sse2(const __m128i sum) { return add32x4_sse2(t); } -static INLINE void variance4_sse2(const uint8_t *src, const int src_stride, - const uint8_t *ref, const int ref_stride, +static INLINE void variance4_sse2(const uint8_t *src_ptr, const 
int src_stride, + const uint8_t *ref_ptr, const int ref_stride, const int h, __m128i *const sse, __m128i *const sum) { int i; @@ -110,17 +111,17 @@ static INLINE void variance4_sse2(const uint8_t *src, const int src_stride, *sum = _mm_setzero_si128(); for (i = 0; i < h; i += 2) { - const __m128i s = load4x2_sse2(src, src_stride); - const __m128i r = load4x2_sse2(ref, ref_stride); + const __m128i s = load4x2_sse2(src_ptr, src_stride); + const __m128i r = load4x2_sse2(ref_ptr, ref_stride); variance_kernel_sse2(s, r, sse, sum); - src += 2 * src_stride; - ref += 2 * ref_stride; + src_ptr += 2 * src_stride; + ref_ptr += 2 * ref_stride; } } -static INLINE void variance8_sse2(const uint8_t *src, const int src_stride, - const uint8_t *ref, const int ref_stride, +static INLINE void variance8_sse2(const uint8_t *src_ptr, const int src_stride, + const uint8_t *ref_ptr, const int ref_stride, const int h, __m128i *const sse, __m128i *const sum) { const __m128i zero = _mm_setzero_si128(); @@ -132,23 +133,23 @@ static INLINE void variance8_sse2(const uint8_t *src, const int src_stride, for (i = 0; i < h; i++) { const __m128i s = - _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)src), zero); + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)src_ptr), zero); const __m128i r = - _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ref), zero); + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ref_ptr), zero); variance_kernel_sse2(s, r, sse, sum); - src += src_stride; - ref += ref_stride; + src_ptr += src_stride; + ref_ptr += ref_stride; } } -static INLINE void variance16_kernel_sse2(const uint8_t *const src, - const uint8_t *const ref, +static INLINE void variance16_kernel_sse2(const uint8_t *const src_ptr, + const uint8_t *const ref_ptr, __m128i *const sse, __m128i *const sum) { const __m128i zero = _mm_setzero_si128(); - const __m128i s = _mm_loadu_si128((const __m128i *)src); - const __m128i r = _mm_loadu_si128((const __m128i *)ref); + const __m128i s = _mm_loadu_si128((const __m128i *)src_ptr); + const __m128i r = _mm_loadu_si128((const __m128i *)ref_ptr); const __m128i src0 = _mm_unpacklo_epi8(s, zero); const __m128i ref0 = _mm_unpacklo_epi8(r, zero); const __m128i src1 = _mm_unpackhi_epi8(s, zero); @@ -158,8 +159,8 @@ static INLINE void variance16_kernel_sse2(const uint8_t *const src, variance_kernel_sse2(src1, ref1, sse, sum); } -static INLINE void variance16_sse2(const uint8_t *src, const int src_stride, - const uint8_t *ref, const int ref_stride, +static INLINE void variance16_sse2(const uint8_t *src_ptr, const int src_stride, + const uint8_t *ref_ptr, const int ref_stride, const int h, __m128i *const sse, __m128i *const sum) { int i; @@ -169,14 +170,14 @@ static INLINE void variance16_sse2(const uint8_t *src, const int src_stride, *sum = _mm_setzero_si128(); for (i = 0; i < h; ++i) { - variance16_kernel_sse2(src, ref, sse, sum); - src += src_stride; - ref += ref_stride; + variance16_kernel_sse2(src_ptr, ref_ptr, sse, sum); + src_ptr += src_stride; + ref_ptr += ref_stride; } } -static INLINE void variance32_sse2(const uint8_t *src, const int src_stride, - const uint8_t *ref, const int ref_stride, +static INLINE void variance32_sse2(const uint8_t *src_ptr, const int src_stride, + const uint8_t *ref_ptr, const int ref_stride, const int h, __m128i *const sse, __m128i *const sum) { int i; @@ -186,15 +187,15 @@ static INLINE void variance32_sse2(const uint8_t *src, const int src_stride, *sum = _mm_setzero_si128(); for (i = 0; i < h; ++i) { - variance16_kernel_sse2(src + 0, ref + 0, sse, sum); - 
variance16_kernel_sse2(src + 16, ref + 16, sse, sum); - src += src_stride; - ref += ref_stride; + variance16_kernel_sse2(src_ptr + 0, ref_ptr + 0, sse, sum); + variance16_kernel_sse2(src_ptr + 16, ref_ptr + 16, sse, sum); + src_ptr += src_stride; + ref_ptr += ref_stride; } } -static INLINE void variance64_sse2(const uint8_t *src, const int src_stride, - const uint8_t *ref, const int ref_stride, +static INLINE void variance64_sse2(const uint8_t *src_ptr, const int src_stride, + const uint8_t *ref_ptr, const int ref_stride, const int h, __m128i *const sse, __m128i *const sum) { int i; @@ -204,135 +205,136 @@ static INLINE void variance64_sse2(const uint8_t *src, const int src_stride, *sum = _mm_setzero_si128(); for (i = 0; i < h; ++i) { - variance16_kernel_sse2(src + 0, ref + 0, sse, sum); - variance16_kernel_sse2(src + 16, ref + 16, sse, sum); - variance16_kernel_sse2(src + 32, ref + 32, sse, sum); - variance16_kernel_sse2(src + 48, ref + 48, sse, sum); - src += src_stride; - ref += ref_stride; + variance16_kernel_sse2(src_ptr + 0, ref_ptr + 0, sse, sum); + variance16_kernel_sse2(src_ptr + 16, ref_ptr + 16, sse, sum); + variance16_kernel_sse2(src_ptr + 32, ref_ptr + 32, sse, sum); + variance16_kernel_sse2(src_ptr + 48, ref_ptr + 48, sse, sum); + src_ptr += src_stride; + ref_ptr += ref_stride; } } -void vpx_get8x8var_sse2(const uint8_t *src, int src_stride, const uint8_t *ref, - int ref_stride, unsigned int *sse, int *sum) { +void vpx_get8x8var_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { __m128i vsse, vsum; - variance8_sse2(src, src_stride, ref, ref_stride, 8, &vsse, &vsum); + variance8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum); variance_final_128_pel_sse2(vsse, vsum, sse, sum); } -void vpx_get16x16var_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, unsigned int *sse, - int *sum) { +void vpx_get16x16var_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { __m128i vsse, vsum; - variance16_sse2(src, src_stride, ref, ref_stride, 16, &vsse, &vsum); + variance16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum); variance_final_256_pel_sse2(vsse, vsum, sse, sum); } -unsigned int vpx_variance4x4_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { __m128i vsse, vsum; int sum; - variance4_sse2(src, src_stride, ref, ref_stride, 4, &vsse, &vsum); + variance4_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 4, &vsse, &vsum); variance_final_128_pel_sse2(vsse, vsum, sse, &sum); return *sse - ((sum * sum) >> 4); } -unsigned int vpx_variance4x8_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { __m128i vsse, vsum; int sum; - variance4_sse2(src, src_stride, ref, ref_stride, 8, &vsse, &vsum); + variance4_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum); variance_final_128_pel_sse2(vsse, vsum, sse, &sum); return *sse - ((sum * sum) >> 5); } -unsigned int vpx_variance8x4_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, 
unsigned int *sse) { __m128i vsse, vsum; int sum; - variance8_sse2(src, src_stride, ref, ref_stride, 4, &vsse, &vsum); + variance8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 4, &vsse, &vsum); variance_final_128_pel_sse2(vsse, vsum, sse, &sum); return *sse - ((sum * sum) >> 5); } -unsigned int vpx_variance8x8_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { __m128i vsse, vsum; int sum; - variance8_sse2(src, src_stride, ref, ref_stride, 8, &vsse, &vsum); + variance8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum); variance_final_128_pel_sse2(vsse, vsum, sse, &sum); return *sse - ((sum * sum) >> 6); } -unsigned int vpx_variance8x16_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { __m128i vsse, vsum; int sum; - variance8_sse2(src, src_stride, ref, ref_stride, 16, &vsse, &vsum); + variance8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum); variance_final_128_pel_sse2(vsse, vsum, sse, &sum); return *sse - ((sum * sum) >> 7); } -unsigned int vpx_variance16x8_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { __m128i vsse, vsum; int sum; - variance16_sse2(src, src_stride, ref, ref_stride, 8, &vsse, &vsum); + variance16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum); variance_final_128_pel_sse2(vsse, vsum, sse, &sum); return *sse - ((sum * sum) >> 7); } -unsigned int vpx_variance16x16_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { __m128i vsse, vsum; int sum; - variance16_sse2(src, src_stride, ref, ref_stride, 16, &vsse, &vsum); + variance16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum); variance_final_256_pel_sse2(vsse, vsum, sse, &sum); return *sse - (uint32_t)(((int64_t)sum * sum) >> 8); } -unsigned int vpx_variance16x32_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { __m128i vsse, vsum; int sum; - variance16_sse2(src, src_stride, ref, ref_stride, 32, &vsse, &vsum); + variance16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 32, &vsse, &vsum); variance_final_512_pel_sse2(vsse, vsum, sse, &sum); return *sse - (unsigned int)(((int64_t)sum * sum) >> 9); } -unsigned int vpx_variance32x16_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { __m128i vsse = _mm_setzero_si128(); __m128i vsum; int sum; - variance32_sse2(src, src_stride, ref, ref_stride, 16, &vsse, &vsum); + variance32_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum); variance_final_512_pel_sse2(vsse, vsum, sse, &sum); return *sse - (unsigned int)(((int64_t)sum * sum) >> 9); } -unsigned int vpx_variance32x32_sse2(const uint8_t *src, int src_stride, - const 
uint8_t *ref, int ref_stride, +unsigned int vpx_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { __m128i vsse = _mm_setzero_si128(); __m128i vsum; int sum; - variance32_sse2(src, src_stride, ref, ref_stride, 32, &vsse, &vsum); + variance32_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 32, &vsse, &vsum); *sse = add32x4_sse2(vsse); sum = sum_final_sse2(vsum); return *sse - (unsigned int)(((int64_t)sum * sum) >> 10); } -unsigned int vpx_variance32x64_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { __m128i vsse = _mm_setzero_si128(); __m128i vsum = _mm_setzero_si128(); @@ -341,8 +343,9 @@ unsigned int vpx_variance32x64_sse2(const uint8_t *src, int src_stride, for (i = 0; i < 2; i++) { __m128i vsum16; - variance32_sse2(src + 32 * i * src_stride, src_stride, - ref + 32 * i * ref_stride, ref_stride, 32, &vsse, &vsum16); + variance32_sse2(src_ptr + 32 * i * src_stride, src_stride, + ref_ptr + 32 * i * ref_stride, ref_stride, 32, &vsse, + &vsum16); vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16)); } *sse = add32x4_sse2(vsse); @@ -350,8 +353,8 @@ unsigned int vpx_variance32x64_sse2(const uint8_t *src, int src_stride, return *sse - (unsigned int)(((int64_t)sum * sum) >> 11); } -unsigned int vpx_variance64x32_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { __m128i vsse = _mm_setzero_si128(); __m128i vsum = _mm_setzero_si128(); @@ -360,8 +363,9 @@ unsigned int vpx_variance64x32_sse2(const uint8_t *src, int src_stride, for (i = 0; i < 2; i++) { __m128i vsum16; - variance64_sse2(src + 16 * i * src_stride, src_stride, - ref + 16 * i * ref_stride, ref_stride, 16, &vsse, &vsum16); + variance64_sse2(src_ptr + 16 * i * src_stride, src_stride, + ref_ptr + 16 * i * ref_stride, ref_stride, 16, &vsse, + &vsum16); vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16)); } *sse = add32x4_sse2(vsse); @@ -369,8 +373,8 @@ unsigned int vpx_variance64x32_sse2(const uint8_t *src, int src_stride, return *sse - (unsigned int)(((int64_t)sum * sum) >> 11); } -unsigned int vpx_variance64x64_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { __m128i vsse = _mm_setzero_si128(); __m128i vsum = _mm_setzero_si128(); @@ -379,8 +383,9 @@ unsigned int vpx_variance64x64_sse2(const uint8_t *src, int src_stride, for (i = 0; i < 4; i++) { __m128i vsum16; - variance64_sse2(src + 16 * i * src_stride, src_stride, - ref + 16 * i * ref_stride, ref_stride, 16, &vsse, &vsum16); + variance64_sse2(src_ptr + 16 * i * src_stride, src_stride, + ref_ptr + 16 * i * ref_stride, ref_stride, 16, &vsse, + &vsum16); vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16)); } *sse = add32x4_sse2(vsse); @@ -388,41 +393,41 @@ unsigned int vpx_variance64x64_sse2(const uint8_t *src, int src_stride, return *sse - (unsigned int)(((int64_t)sum * sum) >> 12); } -unsigned int vpx_mse8x8_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_mse8x8_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) 
{ - vpx_variance8x8_sse2(src, src_stride, ref, ref_stride, sse); + vpx_variance8x8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, sse); return *sse; } -unsigned int vpx_mse8x16_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_mse8x16_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { - vpx_variance8x16_sse2(src, src_stride, ref, ref_stride, sse); + vpx_variance8x16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, sse); return *sse; } -unsigned int vpx_mse16x8_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_mse16x8_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { - vpx_variance16x8_sse2(src, src_stride, ref, ref_stride, sse); + vpx_variance16x8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, sse); return *sse; } -unsigned int vpx_mse16x16_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_mse16x16_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { - vpx_variance16x16_sse2(src, src_stride, ref, ref_stride, sse); + vpx_variance16x16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, sse); return *sse; } // The 2 unused parameters are place holders for PIC enabled build. // These definitions are for functions defined in subpel_variance.asm -#define DECL(w, opt) \ - int vpx_sub_pixel_variance##w##xh_##opt( \ - const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ - const uint8_t *dst, ptrdiff_t dst_stride, int height, unsigned int *sse, \ - void *unused0, void *unused) +#define DECL(w, opt) \ + int vpx_sub_pixel_variance##w##xh_##opt( \ + const uint8_t *src_ptr, ptrdiff_t src_stride, int x_offset, \ + int y_offset, const uint8_t *ref_ptr, ptrdiff_t ref_stride, int height, \ + unsigned int *sse, void *unused0, void *unused) #define DECLS(opt1, opt2) \ DECL(4, opt1); \ DECL(8, opt1); \ @@ -433,36 +438,37 @@ DECLS(ssse3, ssse3); #undef DECLS #undef DECL -#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \ - unsigned int vpx_sub_pixel_variance##w##x##h##_##opt( \ - const uint8_t *src, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) { \ - unsigned int sse; \ - int se = vpx_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \ - y_offset, dst, dst_stride, \ - h, &sse, NULL, NULL); \ - if (w > wf) { \ - unsigned int sse2; \ - int se2 = vpx_sub_pixel_variance##wf##xh_##opt( \ - src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \ - &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - if (w > wf * 2) { \ - se2 = vpx_sub_pixel_variance##wf##xh_##opt( \ - src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \ - &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - se2 = vpx_sub_pixel_variance##wf##xh_##opt( \ - src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \ - &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - } \ - } \ - *sse_ptr = sse; \ - return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \ +#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \ + unsigned int vpx_sub_pixel_variance##w##x##h##_##opt( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { \ + unsigned int sse_tmp; \ + int se = vpx_sub_pixel_variance##wf##xh_##opt( \ + src_ptr, 
src_stride, x_offset, y_offset, ref_ptr, ref_stride, h, \ + &sse_tmp, NULL, NULL); \ + if (w > wf) { \ + unsigned int sse2; \ + int se2 = vpx_sub_pixel_variance##wf##xh_##opt( \ + src_ptr + 16, src_stride, x_offset, y_offset, ref_ptr + 16, \ + ref_stride, h, &sse2, NULL, NULL); \ + se += se2; \ + sse_tmp += sse2; \ + if (w > wf * 2) { \ + se2 = vpx_sub_pixel_variance##wf##xh_##opt( \ + src_ptr + 32, src_stride, x_offset, y_offset, ref_ptr + 32, \ + ref_stride, h, &sse2, NULL, NULL); \ + se += se2; \ + sse_tmp += sse2; \ + se2 = vpx_sub_pixel_variance##wf##xh_##opt( \ + src_ptr + 48, src_stride, x_offset, y_offset, ref_ptr + 48, \ + ref_stride, h, &sse2, NULL, NULL); \ + se += se2; \ + sse_tmp += sse2; \ + } \ + } \ + *sse = sse_tmp; \ + return sse_tmp - \ + (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \ } #define FNS(opt1, opt2) \ @@ -487,12 +493,12 @@ FNS(ssse3, ssse3); #undef FN // The 2 unused parameters are place holders for PIC enabled build. -#define DECL(w, opt) \ - int vpx_sub_pixel_avg_variance##w##xh_##opt( \ - const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ - const uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *sec, \ - ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0, \ - void *unused) +#define DECL(w, opt) \ + int vpx_sub_pixel_avg_variance##w##xh_##opt( \ + const uint8_t *src_ptr, ptrdiff_t src_stride, int x_offset, \ + int y_offset, const uint8_t *ref_ptr, ptrdiff_t ref_stride, \ + const uint8_t *second_pred, ptrdiff_t second_stride, int height, \ + unsigned int *sse, void *unused0, void *unused) #define DECLS(opt1, opt2) \ DECL(4, opt1); \ DECL(8, opt1); \ @@ -503,37 +509,38 @@ DECLS(ssse3, ssse3); #undef DECL #undef DECLS -#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \ - unsigned int vpx_sub_pixel_avg_variance##w##x##h##_##opt( \ - const uint8_t *src, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst, int dst_stride, unsigned int *sseptr, \ - const uint8_t *sec) { \ - unsigned int sse; \ - int se = vpx_sub_pixel_avg_variance##wf##xh_##opt( \ - src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \ - NULL, NULL); \ - if (w > wf) { \ - unsigned int sse2; \ - int se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, \ - sec + 16, w, h, &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - if (w > wf * 2) { \ - se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, \ - sec + 32, w, h, &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, \ - sec + 48, w, h, &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - } \ - } \ - *sseptr = sse; \ - return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \ +#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \ + unsigned int vpx_sub_pixel_avg_variance##w##x##h##_##opt( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, \ + const uint8_t *second_pred) { \ + unsigned int sse_tmp; \ + int se = vpx_sub_pixel_avg_variance##wf##xh_##opt( \ + src_ptr, src_stride, x_offset, y_offset, ref_ptr, ref_stride, \ + second_pred, w, h, &sse_tmp, NULL, NULL); \ + if (w > wf) { \ + unsigned int sse2; \ + int se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt( \ + src_ptr + 16, src_stride, x_offset, y_offset, 
ref_ptr + 16, \ + ref_stride, second_pred + 16, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse_tmp += sse2; \ + if (w > wf * 2) { \ + se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt( \ + src_ptr + 32, src_stride, x_offset, y_offset, ref_ptr + 32, \ + ref_stride, second_pred + 32, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse_tmp += sse2; \ + se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt( \ + src_ptr + 48, src_stride, x_offset, y_offset, ref_ptr + 48, \ + ref_stride, second_pred + 48, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse_tmp += sse2; \ + } \ + } \ + *sse = sse_tmp; \ + return sse_tmp - \ + (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \ } #define FNS(opt1, opt2) \ -- 2.7.4
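
The FN()/FNS() wrappers above build the wide sub-pixel variances out of the wf-wide assembly kernel: the kernel is run on 16-pixel columns at offsets 0, 16, 32 and 48, the partial se/sse values are pooled, and the same variance identity closes the computation. A plain-C sketch of that composition, assuming a 64-wide block and an illustrative kernel typedef (neither appears in this change):

#include <stddef.h>
#include <stdint.h>

// Illustrative 16-wide column-kernel signature, mirroring the DECL() macro.
typedef int (*subpel_var16xh_fn)(const uint8_t *src_ptr, ptrdiff_t src_stride,
                                 int x_offset, int y_offset,
                                 const uint8_t *ref_ptr, ptrdiff_t ref_stride,
                                 int height, unsigned int *sse,
                                 void *unused0, void *unused);

// Compose a 64-wide sub-pixel variance from four 16-wide column passes,
// the same shape the FN() wrapper expands to when w > wf.
static unsigned int subpel_variance64xh_sketch(
    subpel_var16xh_fn kernel, const uint8_t *src_ptr, int src_stride,
    int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride,
    int h, int hlog2, unsigned int *sse) {
  unsigned int sse_tmp = 0;
  int se = 0;
  int col;
  for (col = 0; col < 64; col += 16) {
    unsigned int sse_col;
    se += kernel(src_ptr + col, src_stride, x_offset, y_offset,
                 ref_ptr + col, ref_stride, h, &sse_col, NULL, NULL);
    sse_tmp += sse_col;
  }
  *sse = sse_tmp;
  // wlog2 = 6 for a 64-wide block; variance = SSE - sum^2 / (w * h).
  return sse_tmp - (unsigned int)(((int64_t)se * se) >> (6 + hlog2));
}
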