clang-tidy: fix vpx_dsp parameters
author    Johann <johann.koenig@duck.com>
          Tue, 30 Oct 2018 19:59:46 +0000 (12:59 -0700)
committer Johann <johann.koenig@duck.com>
          Thu, 1 Nov 2018 19:14:14 +0000 (12:14 -0700)
Rename function parameters so the definitions match their declarations,
quieting clang-tidy's readability-inconsistent-declaration-parameter-name
check.

BUG=webm:1444

Change-Id: Iee19be068afc6c81396c79218a89c469d2e66207

29 files changed:
vpx_dsp/arm/quantize_neon.c
vpx_dsp/arm/sad4d_neon.c
vpx_dsp/arm/sad_neon.c
vpx_dsp/bitwriter.h
vpx_dsp/deblock.c
vpx_dsp/fwd_txfm.c
vpx_dsp/inv_txfm.c
vpx_dsp/loopfilter.c
vpx_dsp/mips/deblock_msa.c
vpx_dsp/quantize.c
vpx_dsp/quantize.h
vpx_dsp/sad.c
vpx_dsp/subtract.c
vpx_dsp/vpx_dsp_rtcd_defs.pl
vpx_dsp/x86/avg_pred_sse2.c
vpx_dsp/x86/convolve.h
vpx_dsp/x86/highbd_convolve_avx2.c
vpx_dsp/x86/highbd_intrapred_sse2.asm
vpx_dsp/x86/highbd_loopfilter_sse2.c
vpx_dsp/x86/loopfilter_avx2.c
vpx_dsp/x86/loopfilter_sse2.c
vpx_dsp/x86/quantize_avx.c
vpx_dsp/x86/quantize_sse2.c
vpx_dsp/x86/quantize_ssse3.c
vpx_dsp/x86/quantize_x86.h
vpx_dsp/x86/sad4d_avx2.c
vpx_dsp/x86/sad4d_avx512.c
vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm
vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm

diff --git a/vpx_dsp/arm/quantize_neon.c b/vpx_dsp/arm/quantize_neon.c
index a0a1e6d..1e33851 100644
@@ -20,12 +20,12 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                          const int16_t *round_ptr, const int16_t *quant_ptr,
                          const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
                          tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-                         uint16_t *eob_ptr, const int16_t *scan_ptr,
-                         const int16_t *iscan_ptr) {
+                         uint16_t *eob_ptr, const int16_t *scan,
+                         const int16_t *iscan) {
   const int16x8_t one = vdupq_n_s16(1);
   const int16x8_t neg_one = vdupq_n_s16(-1);
   uint16x8_t eob_max;
-  (void)scan_ptr;
+  (void)scan;
   (void)skip_block;
   assert(!skip_block);
 
@@ -38,8 +38,8 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
     const int16x8_t quant_shift = vld1q_s16(quant_shift_ptr);
     const int16x8_t dequant = vld1q_s16(dequant_ptr);
     // Add one because the eob does not index from 0.
-    const uint16x8_t iscan =
-        vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one));
+    const uint16x8_t v_iscan =
+        vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one));
 
     const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
     const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
@@ -65,10 +65,10 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
     qcoeff = vandq_s16(qcoeff, zbin_mask);
 
     // Set non-zero elements to -1 and use that to extract values for eob.
-    eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), iscan);
+    eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan);
 
     coeff_ptr += 8;
-    iscan_ptr += 8;
+    iscan += 8;
 
     store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
     qcoeff_ptr += 8;
@@ -90,8 +90,8 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
 
     do {
       // Add one because the eob is not its index.
-      const uint16x8_t iscan =
-          vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one));
+      const uint16x8_t v_iscan =
+          vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one));
 
       const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
       const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
@@ -118,10 +118,10 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
 
       // Set non-zero elements to -1 and use that to extract values for eob.
       eob_max =
-          vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), iscan));
+          vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan));
 
       coeff_ptr += 8;
-      iscan_ptr += 8;
+      iscan += 8;
 
       store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
       qcoeff_ptr += 8;
@@ -150,17 +150,19 @@ static INLINE int32x4_t extract_sign_bit(int32x4_t a) {
 
 // Main difference is that zbin values are halved before comparison and dqcoeff
 // values are divided by 2. zbin is rounded but dqcoeff is not.
-void vpx_quantize_b_32x32_neon(
-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
-    const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
-    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
-    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
-    const int16_t *scan_ptr, const int16_t *iscan_ptr) {
+void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                               int skip_block, const int16_t *zbin_ptr,
+                               const int16_t *round_ptr,
+                               const int16_t *quant_ptr,
+                               const int16_t *quant_shift_ptr,
+                               tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                               const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                               const int16_t *scan, const int16_t *iscan) {
   const int16x8_t one = vdupq_n_s16(1);
   const int16x8_t neg_one = vdupq_n_s16(-1);
   uint16x8_t eob_max;
   int i;
-  (void)scan_ptr;
+  (void)scan;
   (void)n_coeffs;  // Because we will always calculate 32*32.
   (void)skip_block;
   assert(!skip_block);
@@ -174,8 +176,8 @@ void vpx_quantize_b_32x32_neon(
     const int16x8_t quant_shift = vld1q_s16(quant_shift_ptr);
     const int16x8_t dequant = vld1q_s16(dequant_ptr);
     // Add one because the eob does not index from 0.
-    const uint16x8_t iscan =
-        vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one));
+    const uint16x8_t v_iscan =
+        vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one));
 
     const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
     const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
@@ -203,10 +205,10 @@ void vpx_quantize_b_32x32_neon(
     qcoeff = vandq_s16(qcoeff, zbin_mask);
 
     // Set non-zero elements to -1 and use that to extract values for eob.
-    eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), iscan);
+    eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan);
 
     coeff_ptr += 8;
-    iscan_ptr += 8;
+    iscan += 8;
 
     store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
     qcoeff_ptr += 8;
@@ -234,8 +236,8 @@ void vpx_quantize_b_32x32_neon(
 
     for (i = 1; i < 32 * 32 / 8; ++i) {
       // Add one because the eob is not its index.
-      const uint16x8_t iscan =
-          vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one));
+      const uint16x8_t v_iscan =
+          vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one));
 
       const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
       const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
@@ -264,10 +266,10 @@ void vpx_quantize_b_32x32_neon(
 
       // Set non-zero elements to -1 and use that to extract values for eob.
       eob_max =
-          vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), iscan));
+          vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan));
 
       coeff_ptr += 8;
-      iscan_ptr += 8;
+      iscan += 8;
 
       store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
       qcoeff_ptr += 8;
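
The eob bookkeeping that both kernels above vectorize can be written in
scalar C as below. This is an illustrative sketch, not part of the patch:
the helper name is hypothetical and int16_t stands in for libvpx's
tran_low_t.

    #include <stdint.h>

    /* Scalar sketch of the eob logic: iscan gives each coefficient's
     * position in scan order, and the eob is 1-based, hence the "+ 1"
     * before taking the maximum over nonzero coefficients. */
    static uint16_t eob_from_iscan(const int16_t *qcoeff,
                                   const int16_t *iscan, intptr_t n_coeffs) {
      uint16_t eob = 0;
      intptr_t i;
      for (i = 0; i < n_coeffs; ++i) {
        if (qcoeff[i] != 0 && (uint16_t)(iscan[i] + 1) > eob) {
          eob = (uint16_t)(iscan[i] + 1);
        }
      }
      return eob;
    }
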
diff --git a/vpx_dsp/arm/sad4d_neon.c b/vpx_dsp/arm/sad4d_neon.c
index 535ec0f..06443c6 100644
@@ -28,24 +28,25 @@ static INLINE uint8x8_t load_unaligned_2_buffers(const void *const buf0,
   return vreinterpret_u8_u32(aa);
 }
 
-static INLINE void sad4x_4d(const uint8_t *const src, const int src_stride,
-                            const uint8_t *const ref[4], const int ref_stride,
-                            const int height, uint32_t *const res) {
+static INLINE void sad4x_4d(const uint8_t *const src_ptr, const int src_stride,
+                            const uint8_t *const ref_array[4],
+                            const int ref_stride, const int height,
+                            uint32_t *const res) {
   int i;
   uint16x8_t abs[2] = { vdupq_n_u16(0), vdupq_n_u16(0) };
   uint16x4_t a[2];
   uint32x4_t r;
 
-  assert(!((intptr_t)src % sizeof(uint32_t)));
+  assert(!((intptr_t)src_ptr % sizeof(uint32_t)));
   assert(!(src_stride % sizeof(uint32_t)));
 
   for (i = 0; i < height; ++i) {
     const uint8x8_t s = vreinterpret_u8_u32(
-        vld1_dup_u32((const uint32_t *)(src + i * src_stride)));
-    const uint8x8_t ref01 = load_unaligned_2_buffers(ref[0] + i * ref_stride,
-                                                     ref[1] + i * ref_stride);
-    const uint8x8_t ref23 = load_unaligned_2_buffers(ref[2] + i * ref_stride,
-                                                     ref[3] + i * ref_stride);
+        vld1_dup_u32((const uint32_t *)(src_ptr + i * src_stride)));
+    const uint8x8_t ref01 = load_unaligned_2_buffers(
+        ref_array[0] + i * ref_stride, ref_array[1] + i * ref_stride);
+    const uint8x8_t ref23 = load_unaligned_2_buffers(
+        ref_array[2] + i * ref_stride, ref_array[3] + i * ref_stride);
     abs[0] = vabal_u8(abs[0], s, ref01);
     abs[1] = vabal_u8(abs[1], s, ref23);
   }
@@ -56,16 +57,16 @@ static INLINE void sad4x_4d(const uint8_t *const src, const int src_stride,
   vst1q_u32(res, r);
 }
 
-void vpx_sad4x4x4d_neon(const uint8_t *src, int src_stride,
-                        const uint8_t *const ref[4], int ref_stride,
+void vpx_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride,
+                        const uint8_t *const ref_array[4], int ref_stride,
                         uint32_t *res) {
-  sad4x_4d(src, src_stride, ref, ref_stride, 4, res);
+  sad4x_4d(src_ptr, src_stride, ref_array, ref_stride, 4, res);
 }
 
-void vpx_sad4x8x4d_neon(const uint8_t *src, int src_stride,
-                        const uint8_t *const ref[4], int ref_stride,
+void vpx_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride,
+                        const uint8_t *const ref_array[4], int ref_stride,
                         uint32_t *res) {
-  sad4x_4d(src, src_stride, ref, ref_stride, 8, res);
+  sad4x_4d(src_ptr, src_stride, ref_array, ref_stride, 8, res);
 }
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -137,17 +138,18 @@ static INLINE void sad_4096_pel_final_neon(const uint16x8_t *sum /*[8]*/,
   vst1q_u32(res, vcombine_u32(d0, d1));
 }
 
-static INLINE void sad8x_4d(const uint8_t *src, int src_stride,
-                            const uint8_t *const ref[4], int ref_stride,
+static INLINE void sad8x_4d(const uint8_t *src_ptr, int src_stride,
+                            const uint8_t *const ref_array[4], int ref_stride,
                             uint32_t *res, const int height) {
   int i, j;
-  const uint8_t *ref_loop[4] = { ref[0], ref[1], ref[2], ref[3] };
+  const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
+                                 ref_array[3] };
   uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
                         vdupq_n_u16(0) };
 
   for (i = 0; i < height; ++i) {
-    const uint8x8_t s = vld1_u8(src);
-    src += src_stride;
+    const uint8x8_t s = vld1_u8(src_ptr);
+    src_ptr += src_stride;
     for (j = 0; j < 4; ++j) {
       const uint8x8_t b_u8 = vld1_u8(ref_loop[j]);
       ref_loop[j] += ref_stride;
@@ -158,44 +160,45 @@ static INLINE void sad8x_4d(const uint8_t *src, int src_stride,
   sad_512_pel_final_neon(sum, res);
 }
 
-void vpx_sad8x4x4d_neon(const uint8_t *src, int src_stride,
-                        const uint8_t *const ref[4], int ref_stride,
+void vpx_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride,
+                        const uint8_t *const ref_array[4], int ref_stride,
                         uint32_t *res) {
-  sad8x_4d(src, src_stride, ref, ref_stride, res, 4);
+  sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 4);
 }
 
-void vpx_sad8x8x4d_neon(const uint8_t *src, int src_stride,
-                        const uint8_t *const ref[4], int ref_stride,
+void vpx_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride,
+                        const uint8_t *const ref_array[4], int ref_stride,
                         uint32_t *res) {
-  sad8x_4d(src, src_stride, ref, ref_stride, res, 8);
+  sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 8);
 }
 
-void vpx_sad8x16x4d_neon(const uint8_t *src, int src_stride,
-                         const uint8_t *const ref[4], int ref_stride,
+void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride,
+                         const uint8_t *const ref_array[4], int ref_stride,
                          uint32_t *res) {
-  sad8x_4d(src, src_stride, ref, ref_stride, res, 16);
+  sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 16);
 }
 
 ////////////////////////////////////////////////////////////////////////////////
 
-static INLINE void sad16_neon(const uint8_t *ref, const uint8x16_t src,
+static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr,
                               uint16x8_t *const sum) {
-  const uint8x16_t r = vld1q_u8(ref);
-  *sum = vabal_u8(*sum, vget_low_u8(src), vget_low_u8(r));
-  *sum = vabal_u8(*sum, vget_high_u8(src), vget_high_u8(r));
+  const uint8x16_t r = vld1q_u8(ref_ptr);
+  *sum = vabal_u8(*sum, vget_low_u8(src_ptr), vget_low_u8(r));
+  *sum = vabal_u8(*sum, vget_high_u8(src_ptr), vget_high_u8(r));
 }
 
-static INLINE void sad16x_4d(const uint8_t *src, int src_stride,
-                             const uint8_t *const ref[4], int ref_stride,
+static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride,
+                             const uint8_t *const ref_array[4], int ref_stride,
                              uint32_t *res, const int height) {
   int i, j;
-  const uint8_t *ref_loop[4] = { ref[0], ref[1], ref[2], ref[3] };
+  const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
+                                 ref_array[3] };
   uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
                         vdupq_n_u16(0) };
 
   for (i = 0; i < height; ++i) {
-    const uint8x16_t s = vld1q_u8(src);
-    src += src_stride;
+    const uint8x16_t s = vld1q_u8(src_ptr);
+    src_ptr += src_stride;
     for (j = 0; j < 4; ++j) {
       sad16_neon(ref_loop[j], s, &sum[j]);
       ref_loop[j] += ref_stride;
@@ -205,50 +208,51 @@ static INLINE void sad16x_4d(const uint8_t *src, int src_stride,
   sad_512_pel_final_neon(sum, res);
 }
 
-void vpx_sad16x8x4d_neon(const uint8_t *src, int src_stride,
-                         const uint8_t *const ref[4], int ref_stride,
+void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride,
+                         const uint8_t *const ref_array[4], int ref_stride,
                          uint32_t *res) {
-  sad16x_4d(src, src_stride, ref, ref_stride, res, 8);
+  sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 8);
 }
 
-void vpx_sad16x16x4d_neon(const uint8_t *src, int src_stride,
-                          const uint8_t *const ref[4], int ref_stride,
+void vpx_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride,
+                          const uint8_t *const ref_array[4], int ref_stride,
                           uint32_t *res) {
-  sad16x_4d(src, src_stride, ref, ref_stride, res, 16);
+  sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 16);
 }
 
-void vpx_sad16x32x4d_neon(const uint8_t *src, int src_stride,
-                          const uint8_t *const ref[4], int ref_stride,
+void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride,
+                          const uint8_t *const ref_array[4], int ref_stride,
                           uint32_t *res) {
-  sad16x_4d(src, src_stride, ref, ref_stride, res, 32);
+  sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 32);
 }
 
 ////////////////////////////////////////////////////////////////////////////////
 
-static INLINE void sad32x_4d(const uint8_t *src, int src_stride,
-                             const uint8_t *const ref[4], int ref_stride,
+static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride,
+                             const uint8_t *const ref_array[4], int ref_stride,
                              const int height, uint16x8_t *const sum) {
   int i;
-  const uint8_t *ref_loop[4] = { ref[0], ref[1], ref[2], ref[3] };
+  const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
+                                 ref_array[3] };
 
   sum[0] = sum[1] = sum[2] = sum[3] = vdupq_n_u16(0);
 
   for (i = 0; i < height; ++i) {
     uint8x16_t s;
 
-    s = vld1q_u8(src + 0 * 16);
+    s = vld1q_u8(src_ptr + 0 * 16);
     sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]);
     sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]);
     sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]);
     sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]);
 
-    s = vld1q_u8(src + 1 * 16);
+    s = vld1q_u8(src_ptr + 1 * 16);
     sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]);
     sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]);
     sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]);
     sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]);
 
-    src += src_stride;
+    src_ptr += src_stride;
     ref_loop[0] += ref_stride;
     ref_loop[1] += ref_stride;
     ref_loop[2] += ref_stride;
@@ -256,68 +260,69 @@ static INLINE void sad32x_4d(const uint8_t *src, int src_stride,
   }
 }
 
-void vpx_sad32x16x4d_neon(const uint8_t *src, int src_stride,
-                          const uint8_t *const ref[4], int ref_stride,
+void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride,
+                          const uint8_t *const ref_array[4], int ref_stride,
                           uint32_t *res) {
   uint16x8_t sum[4];
-  sad32x_4d(src, src_stride, ref, ref_stride, 16, sum);
+  sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 16, sum);
   sad_512_pel_final_neon(sum, res);
 }
 
-void vpx_sad32x32x4d_neon(const uint8_t *src, int src_stride,
-                          const uint8_t *const ref[4], int ref_stride,
+void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride,
+                          const uint8_t *const ref_array[4], int ref_stride,
                           uint32_t *res) {
   uint16x8_t sum[4];
-  sad32x_4d(src, src_stride, ref, ref_stride, 32, sum);
+  sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 32, sum);
   sad_1024_pel_final_neon(sum, res);
 }
 
-void vpx_sad32x64x4d_neon(const uint8_t *src, int src_stride,
-                          const uint8_t *const ref[4], int ref_stride,
+void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride,
+                          const uint8_t *const ref_array[4], int ref_stride,
                           uint32_t *res) {
   uint16x8_t sum[4];
-  sad32x_4d(src, src_stride, ref, ref_stride, 64, sum);
+  sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 64, sum);
   sad_2048_pel_final_neon(sum, res);
 }
 
 ////////////////////////////////////////////////////////////////////////////////
 
-void vpx_sad64x32x4d_neon(const uint8_t *src, int src_stride,
-                          const uint8_t *const ref[4], int ref_stride,
+void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride,
+                          const uint8_t *const ref_array[4], int ref_stride,
                           uint32_t *res) {
   int i;
-  const uint8_t *ref_loop[4] = { ref[0], ref[1], ref[2], ref[3] };
+  const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
+                                 ref_array[3] };
   uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
                         vdupq_n_u16(0) };
 
   for (i = 0; i < 32; ++i) {
     uint8x16_t s;
 
-    s = vld1q_u8(src + 0 * 16);
+    s = vld1q_u8(src_ptr + 0 * 16);
     sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]);
     sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]);
     sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]);
     sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]);
 
-    s = vld1q_u8(src + 1 * 16);
+    s = vld1q_u8(src_ptr + 1 * 16);
     sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]);
     sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]);
     sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]);
     sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]);
 
-    s = vld1q_u8(src + 2 * 16);
+    s = vld1q_u8(src_ptr + 2 * 16);
     sad16_neon(ref_loop[0] + 2 * 16, s, &sum[0]);
     sad16_neon(ref_loop[1] + 2 * 16, s, &sum[1]);
     sad16_neon(ref_loop[2] + 2 * 16, s, &sum[2]);
     sad16_neon(ref_loop[3] + 2 * 16, s, &sum[3]);
 
-    s = vld1q_u8(src + 3 * 16);
+    s = vld1q_u8(src_ptr + 3 * 16);
     sad16_neon(ref_loop[0] + 3 * 16, s, &sum[0]);
     sad16_neon(ref_loop[1] + 3 * 16, s, &sum[1]);
     sad16_neon(ref_loop[2] + 3 * 16, s, &sum[2]);
     sad16_neon(ref_loop[3] + 3 * 16, s, &sum[3]);
 
-    src += src_stride;
+    src_ptr += src_stride;
     ref_loop[0] += ref_stride;
     ref_loop[1] += ref_stride;
     ref_loop[2] += ref_stride;
@@ -327,11 +332,12 @@ void vpx_sad64x32x4d_neon(const uint8_t *src, int src_stride,
   sad_2048_pel_final_neon(sum, res);
 }
 
-void vpx_sad64x64x4d_neon(const uint8_t *src, int src_stride,
-                          const uint8_t *const ref[4], int ref_stride,
+void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride,
+                          const uint8_t *const ref_array[4], int ref_stride,
                           uint32_t *res) {
   int i;
-  const uint8_t *ref_loop[4] = { ref[0], ref[1], ref[2], ref[3] };
+  const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
+                                 ref_array[3] };
   uint16x8_t sum[8] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
                         vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
                         vdupq_n_u16(0), vdupq_n_u16(0) };
@@ -339,31 +345,31 @@ void vpx_sad64x64x4d_neon(const uint8_t *src, int src_stride,
   for (i = 0; i < 64; ++i) {
     uint8x16_t s;
 
-    s = vld1q_u8(src + 0 * 16);
+    s = vld1q_u8(src_ptr + 0 * 16);
     sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]);
     sad16_neon(ref_loop[1] + 0 * 16, s, &sum[2]);
     sad16_neon(ref_loop[2] + 0 * 16, s, &sum[4]);
     sad16_neon(ref_loop[3] + 0 * 16, s, &sum[6]);
 
-    s = vld1q_u8(src + 1 * 16);
+    s = vld1q_u8(src_ptr + 1 * 16);
     sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]);
     sad16_neon(ref_loop[1] + 1 * 16, s, &sum[2]);
     sad16_neon(ref_loop[2] + 1 * 16, s, &sum[4]);
     sad16_neon(ref_loop[3] + 1 * 16, s, &sum[6]);
 
-    s = vld1q_u8(src + 2 * 16);
+    s = vld1q_u8(src_ptr + 2 * 16);
     sad16_neon(ref_loop[0] + 2 * 16, s, &sum[1]);
     sad16_neon(ref_loop[1] + 2 * 16, s, &sum[3]);
     sad16_neon(ref_loop[2] + 2 * 16, s, &sum[5]);
     sad16_neon(ref_loop[3] + 2 * 16, s, &sum[7]);
 
-    s = vld1q_u8(src + 3 * 16);
+    s = vld1q_u8(src_ptr + 3 * 16);
     sad16_neon(ref_loop[0] + 3 * 16, s, &sum[1]);
     sad16_neon(ref_loop[1] + 3 * 16, s, &sum[3]);
     sad16_neon(ref_loop[2] + 3 * 16, s, &sum[5]);
     sad16_neon(ref_loop[3] + 3 * 16, s, &sum[7]);
 
-    src += src_stride;
+    src_ptr += src_stride;
     ref_loop[0] += ref_stride;
     ref_loop[1] += ref_stride;
     ref_loop[2] += ref_stride;
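
For reference, the x4d kernels in this file compute four SADs of one
source block against four reference blocks in a single pass. A plain-C
model (hypothetical helper; width and height as parameters):

    #include <stdint.h>

    /* One source block is compared against four reference blocks; each
     * res[k] receives the sum of absolute differences for ref_array[k]. */
    static void sad_4d_model(const uint8_t *src_ptr, int src_stride,
                             const uint8_t *const ref_array[4],
                             int ref_stride, int width, int height,
                             uint32_t res[4]) {
      int k, y, x;
      for (k = 0; k < 4; ++k) {
        uint32_t sad = 0;
        for (y = 0; y < height; ++y) {
          for (x = 0; x < width; ++x) {
            const int diff = src_ptr[y * src_stride + x] -
                             ref_array[k][y * ref_stride + x];
            sad += (uint32_t)(diff < 0 ? -diff : diff);
          }
        }
        res[k] = sad;
      }
    }
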
diff --git a/vpx_dsp/arm/sad_neon.c b/vpx_dsp/arm/sad_neon.c
index 9518a16..1ce66d3 100644
@@ -73,128 +73,132 @@ uint32_t vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride,
   return vget_lane_u32(horizontal_add_uint16x8(abs), 0);
 }
 
-static INLINE uint16x8_t sad8x(const uint8_t *a, int a_stride, const uint8_t *b,
-                               int b_stride, const int height) {
+static INLINE uint16x8_t sad8x(const uint8_t *src_ptr, int src_stride,
+                               const uint8_t *ref_ptr, int ref_stride,
+                               const int height) {
   int i;
   uint16x8_t abs = vdupq_n_u16(0);
 
   for (i = 0; i < height; ++i) {
-    const uint8x8_t a_u8 = vld1_u8(a);
-    const uint8x8_t b_u8 = vld1_u8(b);
-    a += a_stride;
-    b += b_stride;
+    const uint8x8_t a_u8 = vld1_u8(src_ptr);
+    const uint8x8_t b_u8 = vld1_u8(ref_ptr);
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
     abs = vabal_u8(abs, a_u8, b_u8);
   }
   return abs;
 }
 
-static INLINE uint16x8_t sad8x_avg(const uint8_t *a, int a_stride,
-                                   const uint8_t *b, int b_stride,
-                                   const uint8_t *c, const int height) {
+static INLINE uint16x8_t sad8x_avg(const uint8_t *src_ptr, int src_stride,
+                                   const uint8_t *ref_ptr, int ref_stride,
+                                   const uint8_t *second_pred,
+                                   const int height) {
   int i;
   uint16x8_t abs = vdupq_n_u16(0);
 
   for (i = 0; i < height; ++i) {
-    const uint8x8_t a_u8 = vld1_u8(a);
-    const uint8x8_t b_u8 = vld1_u8(b);
-    const uint8x8_t c_u8 = vld1_u8(c);
+    const uint8x8_t a_u8 = vld1_u8(src_ptr);
+    const uint8x8_t b_u8 = vld1_u8(ref_ptr);
+    const uint8x8_t c_u8 = vld1_u8(second_pred);
     const uint8x8_t avg = vrhadd_u8(b_u8, c_u8);
-    a += a_stride;
-    b += b_stride;
-    c += 8;
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    second_pred += 8;
     abs = vabal_u8(abs, a_u8, avg);
   }
   return abs;
 }
 
-#define sad8xN(n)                                                      \
-  uint32_t vpx_sad8x##n##_neon(const uint8_t *src, int src_stride,     \
-                               const uint8_t *ref, int ref_stride) {   \
-    const uint16x8_t abs = sad8x(src, src_stride, ref, ref_stride, n); \
-    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);             \
-  }                                                                    \
-                                                                       \
-  uint32_t vpx_sad8x##n##_avg_neon(const uint8_t *src, int src_stride, \
-                                   const uint8_t *ref, int ref_stride, \
-                                   const uint8_t *second_pred) {       \
-    const uint16x8_t abs =                                             \
-        sad8x_avg(src, src_stride, ref, ref_stride, second_pred, n);   \
-    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);             \
+#define sad8xN(n)                                                              \
+  uint32_t vpx_sad8x##n##_neon(const uint8_t *src_ptr, int src_stride,         \
+                               const uint8_t *ref_ptr, int ref_stride) {       \
+    const uint16x8_t abs = sad8x(src_ptr, src_stride, ref_ptr, ref_stride, n); \
+    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);                     \
+  }                                                                            \
+                                                                               \
+  uint32_t vpx_sad8x##n##_avg_neon(const uint8_t *src_ptr, int src_stride,     \
+                                   const uint8_t *ref_ptr, int ref_stride,     \
+                                   const uint8_t *second_pred) {               \
+    const uint16x8_t abs =                                                     \
+        sad8x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n);   \
+    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);                     \
   }
 
 sad8xN(4);
 sad8xN(8);
 sad8xN(16);
 
-static INLINE uint16x8_t sad16x(const uint8_t *a, int a_stride,
-                                const uint8_t *b, int b_stride,
+static INLINE uint16x8_t sad16x(const uint8_t *src_ptr, int src_stride,
+                                const uint8_t *ref_ptr, int ref_stride,
                                 const int height) {
   int i;
   uint16x8_t abs = vdupq_n_u16(0);
 
   for (i = 0; i < height; ++i) {
-    const uint8x16_t a_u8 = vld1q_u8(a);
-    const uint8x16_t b_u8 = vld1q_u8(b);
-    a += a_stride;
-    b += b_stride;
+    const uint8x16_t a_u8 = vld1q_u8(src_ptr);
+    const uint8x16_t b_u8 = vld1q_u8(ref_ptr);
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
     abs = vabal_u8(abs, vget_low_u8(a_u8), vget_low_u8(b_u8));
     abs = vabal_u8(abs, vget_high_u8(a_u8), vget_high_u8(b_u8));
   }
   return abs;
 }
 
-static INLINE uint16x8_t sad16x_avg(const uint8_t *a, int a_stride,
-                                    const uint8_t *b, int b_stride,
-                                    const uint8_t *c, const int height) {
+static INLINE uint16x8_t sad16x_avg(const uint8_t *src_ptr, int src_stride,
+                                    const uint8_t *ref_ptr, int ref_stride,
+                                    const uint8_t *second_pred,
+                                    const int height) {
   int i;
   uint16x8_t abs = vdupq_n_u16(0);
 
   for (i = 0; i < height; ++i) {
-    const uint8x16_t a_u8 = vld1q_u8(a);
-    const uint8x16_t b_u8 = vld1q_u8(b);
-    const uint8x16_t c_u8 = vld1q_u8(c);
+    const uint8x16_t a_u8 = vld1q_u8(src_ptr);
+    const uint8x16_t b_u8 = vld1q_u8(ref_ptr);
+    const uint8x16_t c_u8 = vld1q_u8(second_pred);
     const uint8x16_t avg = vrhaddq_u8(b_u8, c_u8);
-    a += a_stride;
-    b += b_stride;
-    c += 16;
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    second_pred += 16;
     abs = vabal_u8(abs, vget_low_u8(a_u8), vget_low_u8(avg));
     abs = vabal_u8(abs, vget_high_u8(a_u8), vget_high_u8(avg));
   }
   return abs;
 }
 
-#define sad16xN(n)                                                      \
-  uint32_t vpx_sad16x##n##_neon(const uint8_t *src, int src_stride,     \
-                                const uint8_t *ref, int ref_stride) {   \
-    const uint16x8_t abs = sad16x(src, src_stride, ref, ref_stride, n); \
-    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);              \
-  }                                                                     \
-                                                                        \
-  uint32_t vpx_sad16x##n##_avg_neon(const uint8_t *src, int src_stride, \
-                                    const uint8_t *ref, int ref_stride, \
-                                    const uint8_t *second_pred) {       \
-    const uint16x8_t abs =                                              \
-        sad16x_avg(src, src_stride, ref, ref_stride, second_pred, n);   \
-    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);              \
+#define sad16xN(n)                                                            \
+  uint32_t vpx_sad16x##n##_neon(const uint8_t *src_ptr, int src_stride,       \
+                                const uint8_t *ref_ptr, int ref_stride) {     \
+    const uint16x8_t abs =                                                    \
+        sad16x(src_ptr, src_stride, ref_ptr, ref_stride, n);                  \
+    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);                    \
+  }                                                                           \
+                                                                              \
+  uint32_t vpx_sad16x##n##_avg_neon(const uint8_t *src_ptr, int src_stride,   \
+                                    const uint8_t *ref_ptr, int ref_stride,   \
+                                    const uint8_t *second_pred) {             \
+    const uint16x8_t abs =                                                    \
+        sad16x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \
+    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);                    \
   }
 
 sad16xN(8);
 sad16xN(16);
 sad16xN(32);
 
-static INLINE uint16x8_t sad32x(const uint8_t *a, int a_stride,
-                                const uint8_t *b, int b_stride,
+static INLINE uint16x8_t sad32x(const uint8_t *src_ptr, int src_stride,
+                                const uint8_t *ref_ptr, int ref_stride,
                                 const int height) {
   int i;
   uint16x8_t abs = vdupq_n_u16(0);
 
   for (i = 0; i < height; ++i) {
-    const uint8x16_t a_lo = vld1q_u8(a);
-    const uint8x16_t a_hi = vld1q_u8(a + 16);
-    const uint8x16_t b_lo = vld1q_u8(b);
-    const uint8x16_t b_hi = vld1q_u8(b + 16);
-    a += a_stride;
-    b += b_stride;
+    const uint8x16_t a_lo = vld1q_u8(src_ptr);
+    const uint8x16_t a_hi = vld1q_u8(src_ptr + 16);
+    const uint8x16_t b_lo = vld1q_u8(ref_ptr);
+    const uint8x16_t b_hi = vld1q_u8(ref_ptr + 16);
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
     abs = vabal_u8(abs, vget_low_u8(a_lo), vget_low_u8(b_lo));
     abs = vabal_u8(abs, vget_high_u8(a_lo), vget_high_u8(b_lo));
     abs = vabal_u8(abs, vget_low_u8(a_hi), vget_low_u8(b_hi));
@@ -203,24 +207,25 @@ static INLINE uint16x8_t sad32x(const uint8_t *a, int a_stride,
   return abs;
 }
 
-static INLINE uint16x8_t sad32x_avg(const uint8_t *a, int a_stride,
-                                    const uint8_t *b, int b_stride,
-                                    const uint8_t *c, const int height) {
+static INLINE uint16x8_t sad32x_avg(const uint8_t *src_ptr, int src_stride,
+                                    const uint8_t *ref_ptr, int ref_stride,
+                                    const uint8_t *second_pred,
+                                    const int height) {
   int i;
   uint16x8_t abs = vdupq_n_u16(0);
 
   for (i = 0; i < height; ++i) {
-    const uint8x16_t a_lo = vld1q_u8(a);
-    const uint8x16_t a_hi = vld1q_u8(a + 16);
-    const uint8x16_t b_lo = vld1q_u8(b);
-    const uint8x16_t b_hi = vld1q_u8(b + 16);
-    const uint8x16_t c_lo = vld1q_u8(c);
-    const uint8x16_t c_hi = vld1q_u8(c + 16);
+    const uint8x16_t a_lo = vld1q_u8(src_ptr);
+    const uint8x16_t a_hi = vld1q_u8(src_ptr + 16);
+    const uint8x16_t b_lo = vld1q_u8(ref_ptr);
+    const uint8x16_t b_hi = vld1q_u8(ref_ptr + 16);
+    const uint8x16_t c_lo = vld1q_u8(second_pred);
+    const uint8x16_t c_hi = vld1q_u8(second_pred + 16);
     const uint8x16_t avg_lo = vrhaddq_u8(b_lo, c_lo);
     const uint8x16_t avg_hi = vrhaddq_u8(b_hi, c_hi);
-    a += a_stride;
-    b += b_stride;
-    c += 32;
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    second_pred += 32;
     abs = vabal_u8(abs, vget_low_u8(a_lo), vget_low_u8(avg_lo));
     abs = vabal_u8(abs, vget_high_u8(a_lo), vget_high_u8(avg_lo));
     abs = vabal_u8(abs, vget_low_u8(a_hi), vget_low_u8(avg_hi));
@@ -229,43 +234,44 @@ static INLINE uint16x8_t sad32x_avg(const uint8_t *a, int a_stride,
   return abs;
 }
 
-#define sad32xN(n)                                                      \
-  uint32_t vpx_sad32x##n##_neon(const uint8_t *src, int src_stride,     \
-                                const uint8_t *ref, int ref_stride) {   \
-    const uint16x8_t abs = sad32x(src, src_stride, ref, ref_stride, n); \
-    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);              \
-  }                                                                     \
-                                                                        \
-  uint32_t vpx_sad32x##n##_avg_neon(const uint8_t *src, int src_stride, \
-                                    const uint8_t *ref, int ref_stride, \
-                                    const uint8_t *second_pred) {       \
-    const uint16x8_t abs =                                              \
-        sad32x_avg(src, src_stride, ref, ref_stride, second_pred, n);   \
-    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);              \
+#define sad32xN(n)                                                            \
+  uint32_t vpx_sad32x##n##_neon(const uint8_t *src_ptr, int src_stride,       \
+                                const uint8_t *ref_ptr, int ref_stride) {     \
+    const uint16x8_t abs =                                                    \
+        sad32x(src_ptr, src_stride, ref_ptr, ref_stride, n);                  \
+    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);                    \
+  }                                                                           \
+                                                                              \
+  uint32_t vpx_sad32x##n##_avg_neon(const uint8_t *src_ptr, int src_stride,   \
+                                    const uint8_t *ref_ptr, int ref_stride,   \
+                                    const uint8_t *second_pred) {             \
+    const uint16x8_t abs =                                                    \
+        sad32x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \
+    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);                    \
   }
 
 sad32xN(16);
 sad32xN(32);
 sad32xN(64);
 
-static INLINE uint32x4_t sad64x(const uint8_t *a, int a_stride,
-                                const uint8_t *b, int b_stride,
+static INLINE uint32x4_t sad64x(const uint8_t *src_ptr, int src_stride,
+                                const uint8_t *ref_ptr, int ref_stride,
                                 const int height) {
   int i;
   uint16x8_t abs_0 = vdupq_n_u16(0);
   uint16x8_t abs_1 = vdupq_n_u16(0);
 
   for (i = 0; i < height; ++i) {
-    const uint8x16_t a_0 = vld1q_u8(a);
-    const uint8x16_t a_1 = vld1q_u8(a + 16);
-    const uint8x16_t a_2 = vld1q_u8(a + 32);
-    const uint8x16_t a_3 = vld1q_u8(a + 48);
-    const uint8x16_t b_0 = vld1q_u8(b);
-    const uint8x16_t b_1 = vld1q_u8(b + 16);
-    const uint8x16_t b_2 = vld1q_u8(b + 32);
-    const uint8x16_t b_3 = vld1q_u8(b + 48);
-    a += a_stride;
-    b += b_stride;
+    const uint8x16_t a_0 = vld1q_u8(src_ptr);
+    const uint8x16_t a_1 = vld1q_u8(src_ptr + 16);
+    const uint8x16_t a_2 = vld1q_u8(src_ptr + 32);
+    const uint8x16_t a_3 = vld1q_u8(src_ptr + 48);
+    const uint8x16_t b_0 = vld1q_u8(ref_ptr);
+    const uint8x16_t b_1 = vld1q_u8(ref_ptr + 16);
+    const uint8x16_t b_2 = vld1q_u8(ref_ptr + 32);
+    const uint8x16_t b_3 = vld1q_u8(ref_ptr + 48);
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
     abs_0 = vabal_u8(abs_0, vget_low_u8(a_0), vget_low_u8(b_0));
     abs_0 = vabal_u8(abs_0, vget_high_u8(a_0), vget_high_u8(b_0));
     abs_0 = vabal_u8(abs_0, vget_low_u8(a_1), vget_low_u8(b_1));
@@ -282,33 +288,34 @@ static INLINE uint32x4_t sad64x(const uint8_t *a, int a_stride,
   }
 }
 
-static INLINE uint32x4_t sad64x_avg(const uint8_t *a, int a_stride,
-                                    const uint8_t *b, int b_stride,
-                                    const uint8_t *c, const int height) {
+static INLINE uint32x4_t sad64x_avg(const uint8_t *src_ptr, int src_stride,
+                                    const uint8_t *ref_ptr, int ref_stride,
+                                    const uint8_t *second_pred,
+                                    const int height) {
   int i;
   uint16x8_t abs_0 = vdupq_n_u16(0);
   uint16x8_t abs_1 = vdupq_n_u16(0);
 
   for (i = 0; i < height; ++i) {
-    const uint8x16_t a_0 = vld1q_u8(a);
-    const uint8x16_t a_1 = vld1q_u8(a + 16);
-    const uint8x16_t a_2 = vld1q_u8(a + 32);
-    const uint8x16_t a_3 = vld1q_u8(a + 48);
-    const uint8x16_t b_0 = vld1q_u8(b);
-    const uint8x16_t b_1 = vld1q_u8(b + 16);
-    const uint8x16_t b_2 = vld1q_u8(b + 32);
-    const uint8x16_t b_3 = vld1q_u8(b + 48);
-    const uint8x16_t c_0 = vld1q_u8(c);
-    const uint8x16_t c_1 = vld1q_u8(c + 16);
-    const uint8x16_t c_2 = vld1q_u8(c + 32);
-    const uint8x16_t c_3 = vld1q_u8(c + 48);
+    const uint8x16_t a_0 = vld1q_u8(src_ptr);
+    const uint8x16_t a_1 = vld1q_u8(src_ptr + 16);
+    const uint8x16_t a_2 = vld1q_u8(src_ptr + 32);
+    const uint8x16_t a_3 = vld1q_u8(src_ptr + 48);
+    const uint8x16_t b_0 = vld1q_u8(ref_ptr);
+    const uint8x16_t b_1 = vld1q_u8(ref_ptr + 16);
+    const uint8x16_t b_2 = vld1q_u8(ref_ptr + 32);
+    const uint8x16_t b_3 = vld1q_u8(ref_ptr + 48);
+    const uint8x16_t c_0 = vld1q_u8(second_pred);
+    const uint8x16_t c_1 = vld1q_u8(second_pred + 16);
+    const uint8x16_t c_2 = vld1q_u8(second_pred + 32);
+    const uint8x16_t c_3 = vld1q_u8(second_pred + 48);
     const uint8x16_t avg_0 = vrhaddq_u8(b_0, c_0);
     const uint8x16_t avg_1 = vrhaddq_u8(b_1, c_1);
     const uint8x16_t avg_2 = vrhaddq_u8(b_2, c_2);
     const uint8x16_t avg_3 = vrhaddq_u8(b_3, c_3);
-    a += a_stride;
-    b += b_stride;
-    c += 64;
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    second_pred += 64;
     abs_0 = vabal_u8(abs_0, vget_low_u8(a_0), vget_low_u8(avg_0));
     abs_0 = vabal_u8(abs_0, vget_high_u8(a_0), vget_high_u8(avg_0));
     abs_0 = vabal_u8(abs_0, vget_low_u8(a_1), vget_low_u8(avg_1));
@@ -325,19 +332,20 @@ static INLINE uint32x4_t sad64x_avg(const uint8_t *a, int a_stride,
   }
 }
 
-#define sad64xN(n)                                                      \
-  uint32_t vpx_sad64x##n##_neon(const uint8_t *src, int src_stride,     \
-                                const uint8_t *ref, int ref_stride) {   \
-    const uint32x4_t abs = sad64x(src, src_stride, ref, ref_stride, n); \
-    return vget_lane_u32(horizontal_add_uint32x4(abs), 0);              \
-  }                                                                     \
-                                                                        \
-  uint32_t vpx_sad64x##n##_avg_neon(const uint8_t *src, int src_stride, \
-                                    const uint8_t *ref, int ref_stride, \
-                                    const uint8_t *second_pred) {       \
-    const uint32x4_t abs =                                              \
-        sad64x_avg(src, src_stride, ref, ref_stride, second_pred, n);   \
-    return vget_lane_u32(horizontal_add_uint32x4(abs), 0);              \
+#define sad64xN(n)                                                            \
+  uint32_t vpx_sad64x##n##_neon(const uint8_t *src_ptr, int src_stride,       \
+                                const uint8_t *ref_ptr, int ref_stride) {     \
+    const uint32x4_t abs =                                                    \
+        sad64x(src_ptr, src_stride, ref_ptr, ref_stride, n);                  \
+    return vget_lane_u32(horizontal_add_uint32x4(abs), 0);                    \
+  }                                                                           \
+                                                                              \
+  uint32_t vpx_sad64x##n##_avg_neon(const uint8_t *src_ptr, int src_stride,   \
+                                    const uint8_t *ref_ptr, int ref_stride,   \
+                                    const uint8_t *second_pred) {             \
+    const uint32x4_t abs =                                                    \
+        sad64x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \
+    return vget_lane_u32(horizontal_add_uint32x4(abs), 0);                    \
   }
 
 sad64xN(32);
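
The _avg variants first average the reference with second_pred using
NEON's rounding halving add (vrhadd_u8/vrhaddq_u8), i.e. (b + c + 1) >> 1,
then accumulate the SAD; second_pred is packed contiguously at the block
width, hence the += 8/16/32/64 steps. A scalar model with a hypothetical
helper name:

    #include <stdint.h>

    /* Scalar model of the _avg kernels: each reference row is averaged
     * with the second predictor using the same rounding as vrhaddq_u8. */
    static uint32_t sad_avg_model(const uint8_t *src_ptr, int src_stride,
                                  const uint8_t *ref_ptr, int ref_stride,
                                  const uint8_t *second_pred, int width,
                                  int height) {
      uint32_t sad = 0;
      int y, x;
      for (y = 0; y < height; ++y) {
        for (x = 0; x < width; ++x) {
          const int avg = (ref_ptr[x] + second_pred[x] + 1) >> 1;
          const int diff = src_ptr[x] - avg;
          sad += (uint32_t)(diff < 0 ? -diff : diff);
        }
        src_ptr += src_stride;
        ref_ptr += ref_stride;
        second_pred += width; /* second_pred rows are width-contiguous */
      }
      return sad;
    }
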
diff --git a/vpx_dsp/bitwriter.h b/vpx_dsp/bitwriter.h
index ec3975e..11579c9 100644
@@ -27,8 +27,8 @@ typedef struct vpx_writer {
   uint8_t *buffer;
 } vpx_writer;
 
-void vpx_start_encode(vpx_writer *bc, uint8_t *buffer);
-void vpx_stop_encode(vpx_writer *bc);
+void vpx_start_encode(vpx_writer *br, uint8_t *source);
+void vpx_stop_encode(vpx_writer *br);
 
 static INLINE void vpx_write(vpx_writer *br, int bit, int probability) {
   unsigned int split;
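
A minimal usage sketch of the renamed writer API, assuming only the
declarations above (the include path, buffer size, and wrapper name are
illustrative):

    #include <stdint.h>
    #include "vpx_dsp/bitwriter.h"

    void write_one_bit(void) {
      uint8_t buf[64];
      vpx_writer bw;
      vpx_start_encode(&bw, buf);
      vpx_write(&bw, 1, 128); /* one bit at probability 128/256 */
      vpx_stop_encode(&bw);
      /* buf now holds the coded bits. */
    }
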
diff --git a/vpx_dsp/deblock.c b/vpx_dsp/deblock.c
index 94acbb3..455b73b 100644
@@ -39,11 +39,10 @@ const int16_t vpx_rv[] = {
   9,  10, 13,
 };
 
-void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr,
-                                            unsigned char *dst_ptr,
-                                            int src_pixels_per_line,
-                                            int dst_pixels_per_line, int cols,
-                                            unsigned char *f, int size) {
+void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src,
+                                            unsigned char *dst, int src_pitch,
+                                            int dst_pitch, int cols,
+                                            unsigned char *flimits, int size) {
   unsigned char *p_src, *p_dst;
   int row;
   int col;
@@ -55,19 +54,21 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr,
 
   for (row = 0; row < size; row++) {
     /* post_proc_down for one row */
-    p_src = src_ptr;
-    p_dst = dst_ptr;
+    p_src = src;
+    p_dst = dst;
 
     for (col = 0; col < cols; col++) {
-      unsigned char p_above2 = p_src[col - 2 * src_pixels_per_line];
-      unsigned char p_above1 = p_src[col - src_pixels_per_line];
-      unsigned char p_below1 = p_src[col + src_pixels_per_line];
-      unsigned char p_below2 = p_src[col + 2 * src_pixels_per_line];
+      unsigned char p_above2 = p_src[col - 2 * src_pitch];
+      unsigned char p_above1 = p_src[col - src_pitch];
+      unsigned char p_below1 = p_src[col + src_pitch];
+      unsigned char p_below2 = p_src[col + 2 * src_pitch];
 
       v = p_src[col];
 
-      if ((abs(v - p_above2) < f[col]) && (abs(v - p_above1) < f[col]) &&
-          (abs(v - p_below1) < f[col]) && (abs(v - p_below2) < f[col])) {
+      if ((abs(v - p_above2) < flimits[col]) &&
+          (abs(v - p_above1) < flimits[col]) &&
+          (abs(v - p_below1) < flimits[col]) &&
+          (abs(v - p_below2) < flimits[col])) {
         unsigned char k1, k2, k3;
         k1 = (p_above2 + p_above1 + 1) >> 1;
         k2 = (p_below2 + p_below1 + 1) >> 1;
@@ -79,8 +80,8 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr,
     }
 
     /* now post_proc_across */
-    p_src = dst_ptr;
-    p_dst = dst_ptr;
+    p_src = dst;
+    p_dst = dst;
 
     p_src[-2] = p_src[-1] = p_src[0];
     p_src[cols] = p_src[cols + 1] = p_src[cols - 1];
@@ -88,10 +89,10 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr,
     for (col = 0; col < cols; col++) {
       v = p_src[col];
 
-      if ((abs(v - p_src[col - 2]) < f[col]) &&
-          (abs(v - p_src[col - 1]) < f[col]) &&
-          (abs(v - p_src[col + 1]) < f[col]) &&
-          (abs(v - p_src[col + 2]) < f[col])) {
+      if ((abs(v - p_src[col - 2]) < flimits[col]) &&
+          (abs(v - p_src[col - 1]) < flimits[col]) &&
+          (abs(v - p_src[col + 1]) < flimits[col]) &&
+          (abs(v - p_src[col + 2]) < flimits[col])) {
         unsigned char k1, k2, k3;
         k1 = (p_src[col - 2] + p_src[col - 1] + 1) >> 1;
         k2 = (p_src[col + 2] + p_src[col + 1] + 1) >> 1;
@@ -109,8 +110,8 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr,
     p_dst[col - 1] = d[(col - 1) & 3];
 
     /* next row */
-    src_ptr += src_pixels_per_line;
-    dst_ptr += dst_pixels_per_line;
+    src += src_pitch;
+    dst += dst_pitch;
   }
 }
 
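Both passes above filter a pixel only when all four of its neighbors
(vertical for the down pass, horizontal for the across pass) are within
flimits[col] of it. The gate in isolation, as a hypothetical scalar
helper:

    #include <stdlib.h>

    /* Filter only when every neighbor differs from the center pixel by
     * less than the per-column filter limit. */
    static int within_flimit(int v, int n2, int n1, int m1, int m2,
                             int flimit) {
      return abs(v - n2) < flimit && abs(v - n1) < flimit &&
             abs(v - m1) < flimit && abs(v - m2) < flimit;
    }
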
diff --git a/vpx_dsp/fwd_txfm.c b/vpx_dsp/fwd_txfm.c
index 6dcb3ba..ef66de0 100644
@@ -87,11 +87,11 @@ void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) {
   output[0] = sum * 2;
 }
 
-void vpx_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
+void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride) {
   int i, j;
   tran_low_t intermediate[64];
   int pass;
-  tran_low_t *output = intermediate;
+  tran_low_t *out = intermediate;
   const tran_low_t *in = NULL;
 
   // Transform columns
@@ -133,10 +133,10 @@ void vpx_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
       t1 = (x0 - x1) * cospi_16_64;
       t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
       t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
-      output[0] = (tran_low_t)fdct_round_shift(t0);
-      output[2] = (tran_low_t)fdct_round_shift(t2);
-      output[4] = (tran_low_t)fdct_round_shift(t1);
-      output[6] = (tran_low_t)fdct_round_shift(t3);
+      out[0] = (tran_low_t)fdct_round_shift(t0);
+      out[2] = (tran_low_t)fdct_round_shift(t2);
+      out[4] = (tran_low_t)fdct_round_shift(t1);
+      out[6] = (tran_low_t)fdct_round_shift(t3);
 
       // Stage 2
       t0 = (s6 - s5) * cospi_16_64;
@@ -155,19 +155,19 @@ void vpx_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
       t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
       t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
       t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
-      output[1] = (tran_low_t)fdct_round_shift(t0);
-      output[3] = (tran_low_t)fdct_round_shift(t2);
-      output[5] = (tran_low_t)fdct_round_shift(t1);
-      output[7] = (tran_low_t)fdct_round_shift(t3);
-      output += 8;
+      out[1] = (tran_low_t)fdct_round_shift(t0);
+      out[3] = (tran_low_t)fdct_round_shift(t2);
+      out[5] = (tran_low_t)fdct_round_shift(t1);
+      out[7] = (tran_low_t)fdct_round_shift(t3);
+      out += 8;
     }
     in = intermediate;
-    output = final_output;
+    out = output;
   }
 
   // Rows
   for (i = 0; i < 8; ++i) {
-    for (j = 0; j < 8; ++j) final_output[j + i * 8] /= 2;
+    for (j = 0; j < 8; ++j) output[j + i * 8] /= 2;
   }
 }
 
@@ -705,9 +705,9 @@ void vpx_fdct32(const tran_high_t *input, tran_high_t *output, int round) {
   output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);
 }
 
-void vpx_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
+void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride) {
   int i, j;
-  tran_high_t output[32 * 32];
+  tran_high_t out[32 * 32];
 
   // Columns
   for (i = 0; i < 32; ++i) {
@@ -715,16 +715,16 @@ void vpx_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
     for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4;
     vpx_fdct32(temp_in, temp_out, 0);
     for (j = 0; j < 32; ++j)
-      output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+      out[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
   }
 
   // Rows
   for (i = 0; i < 32; ++i) {
     tran_high_t temp_in[32], temp_out[32];
-    for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32];
+    for (j = 0; j < 32; ++j) temp_in[j] = out[j + i * 32];
     vpx_fdct32(temp_in, temp_out, 0);
     for (j = 0; j < 32; ++j)
-      out[j + i * 32] =
+      output[j + i * 32] =
           (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
   }
 }
@@ -732,9 +732,9 @@ void vpx_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
 // Note that although we use dct_32_round in dct32 computation flow,
 // this 2d fdct32x32 for rate-distortion optimization loop is operating
 // within 16 bits precision.
-void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) {
+void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride) {
   int i, j;
-  tran_high_t output[32 * 32];
+  tran_high_t out[32 * 32];
 
   // Columns
   for (i = 0; i < 32; ++i) {
@@ -745,15 +745,15 @@ void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) {
       // TODO(cd): see quality impact of only doing
       //           output[j * 32 + i] = (temp_out[j] + 1) >> 2;
       //           PS: also change code in vpx_dsp/x86/vpx_dct_sse2.c
-      output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+      out[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
   }
 
   // Rows
   for (i = 0; i < 32; ++i) {
     tran_high_t temp_in[32], temp_out[32];
-    for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32];
+    for (j = 0; j < 32; ++j) temp_in[j] = out[j + i * 32];
     vpx_fdct32(temp_in, temp_out, 1);
-    for (j = 0; j < 32; ++j) out[j + i * 32] = (tran_low_t)temp_out[j];
+    for (j = 0; j < 32; ++j) output[j + i * 32] = (tran_low_t)temp_out[j];
   }
 }
 
@@ -772,14 +772,14 @@ void vpx_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output,
   vpx_fdct4x4_c(input, output, stride);
 }
 
-void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *final_output,
+void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *output,
                           int stride) {
-  vpx_fdct8x8_c(input, final_output, stride);
+  vpx_fdct8x8_c(input, output, stride);
 }
 
-void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *final_output,
+void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *output,
                             int stride) {
-  vpx_fdct8x8_1_c(input, final_output, stride);
+  vpx_fdct8x8_1_c(input, output, stride);
 }
 
 void vpx_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output,
@@ -792,17 +792,18 @@ void vpx_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output,
   vpx_fdct16x16_1_c(input, output, stride);
 }
 
-void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
-  vpx_fdct32x32_c(input, out, stride);
+void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *output,
+                            int stride) {
+  vpx_fdct32x32_c(input, output, stride);
 }
 
-void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *out,
+void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *output,
                                int stride) {
-  vpx_fdct32x32_rd_c(input, out, stride);
+  vpx_fdct32x32_rd_c(input, output, stride);
 }
 
-void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *out,
+void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *output,
                               int stride) {
-  vpx_fdct32x32_1_c(input, out, stride);
+  vpx_fdct32x32_1_c(input, output, stride);
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
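
The column and row passes of vpx_fdct32x32_c apply slightly different
rounding before the final >> 2; a worked check of both expressions
(sample values only):

    /* columns: (t + 1 + (t > 0)) >> 2
     *   t =  6  ->  (6 + 2) >> 2  =  2
     *   t = -6  ->  (-6 + 1) >> 2 = -5 >> 2 = -2  (arithmetic shift floors)
     * rows:    (t + 1 + (t < 0)) >> 2
     *   t =  6  ->  (6 + 1) >> 2  =  1
     *   t = -6  ->  (-6 + 2) >> 2 = -4 >> 2 = -1
     */
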
diff --git a/vpx_dsp/inv_txfm.c b/vpx_dsp/inv_txfm.c
index 0194aa1..69de05e 100644
@@ -67,11 +67,11 @@ void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
   }
 }
 
-void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int stride) {
+void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
   int i;
   tran_high_t a1, e1;
   tran_low_t tmp[4];
-  const tran_low_t *ip = in;
+  const tran_low_t *ip = input;
   tran_low_t *op = tmp;
 
   a1 = ip[0] >> UNIT_QUANT_SHIFT;
@@ -1346,12 +1346,12 @@ void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest,
   }
 }
 
-void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint16_t *dest,
+void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest,
                                 int stride, int bd) {
   int i;
   tran_high_t a1, e1;
   tran_low_t tmp[4];
-  const tran_low_t *ip = in;
+  const tran_low_t *ip = input;
   tran_low_t *op = tmp;
   (void)bd;
 
diff --git a/vpx_dsp/loopfilter.c b/vpx_dsp/loopfilter.c
index 9866ea3..47f30c9 100644
@@ -109,29 +109,30 @@ static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t *op1,
   *op1 = signed_char_clamp(ps1 + filter) ^ 0x80;
 }
 
-void vpx_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */,
-                            const uint8_t *blimit, const uint8_t *limit,
-                            const uint8_t *thresh) {
+void vpx_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit,
+                            const uint8_t *limit, const uint8_t *thresh) {
   int i;
 
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
   for (i = 0; i < 8; ++i) {
-    const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
-    const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
+    const uint8_t p3 = s[-4 * pitch], p2 = s[-3 * pitch], p1 = s[-2 * pitch],
+                  p0 = s[-pitch];
+    const uint8_t q0 = s[0 * pitch], q1 = s[1 * pitch], q2 = s[2 * pitch],
+                  q3 = s[3 * pitch];
     const int8_t mask =
         filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
-    filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p);
+    filter4(mask, *thresh, s - 2 * pitch, s - 1 * pitch, s, s + 1 * pitch);
     ++s;
   }
 }
 
-void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
+void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
                                  const uint8_t *limit0, const uint8_t *thresh0,
                                  const uint8_t *blimit1, const uint8_t *limit1,
                                  const uint8_t *thresh1) {
-  vpx_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0);
-  vpx_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1);
+  vpx_lpf_horizontal_4_c(s, pitch, blimit0, limit0, thresh0);
+  vpx_lpf_horizontal_4_c(s + 8, pitch, blimit1, limit1, thresh1);
 }
 
 void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit,
@@ -178,31 +179,33 @@ static INLINE void filter8(int8_t mask, uint8_t thresh, uint8_t flat,
   }
 }
 
-void vpx_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit,
+void vpx_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit,
                             const uint8_t *limit, const uint8_t *thresh) {
   int i;
 
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
   for (i = 0; i < 8; ++i) {
-    const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
-    const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
+    const uint8_t p3 = s[-4 * pitch], p2 = s[-3 * pitch], p1 = s[-2 * pitch],
+                  p0 = s[-pitch];
+    const uint8_t q0 = s[0 * pitch], q1 = s[1 * pitch], q2 = s[2 * pitch],
+                  q3 = s[3 * pitch];
 
     const int8_t mask =
         filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
     const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
-    filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s,
-            s + 1 * p, s + 2 * p, s + 3 * p);
+    filter8(mask, *thresh, flat, s - 4 * pitch, s - 3 * pitch, s - 2 * pitch,
+            s - 1 * pitch, s, s + 1 * pitch, s + 2 * pitch, s + 3 * pitch);
     ++s;
   }
 }
 
-void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
+void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
                                  const uint8_t *limit0, const uint8_t *thresh0,
                                  const uint8_t *blimit1, const uint8_t *limit1,
                                  const uint8_t *thresh1) {
-  vpx_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0);
-  vpx_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1);
+  vpx_lpf_horizontal_8_c(s, pitch, blimit0, limit0, thresh0);
+  vpx_lpf_horizontal_8_c(s + 8, pitch, blimit1, limit1, thresh1);
 }
 
 void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit,
@@ -283,7 +286,8 @@ static INLINE void filter16(int8_t mask, uint8_t thresh, uint8_t flat,
   }
 }
 
-static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit,
+static void mb_lpf_horizontal_edge_w(uint8_t *s, int pitch,
+                                     const uint8_t *blimit,
                                      const uint8_t *limit,
                                      const uint8_t *thresh, int count) {
   int i;
@@ -291,34 +295,37 @@ static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit,
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
   for (i = 0; i < 8 * count; ++i) {
-    const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
-    const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
+    const uint8_t p3 = s[-4 * pitch], p2 = s[-3 * pitch], p1 = s[-2 * pitch],
+                  p0 = s[-pitch];
+    const uint8_t q0 = s[0 * pitch], q1 = s[1 * pitch], q2 = s[2 * pitch],
+                  q3 = s[3 * pitch];
     const int8_t mask =
         filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
     const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
-    const int8_t flat2 =
-        flat_mask5(1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0, q0,
-                   s[4 * p], s[5 * p], s[6 * p], s[7 * p]);
-
-    filter16(mask, *thresh, flat, flat2, s - 8 * p, s - 7 * p, s - 6 * p,
-             s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s,
-             s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p,
-             s + 7 * p);
+    const int8_t flat2 = flat_mask5(
+        1, s[-8 * pitch], s[-7 * pitch], s[-6 * pitch], s[-5 * pitch], p0, q0,
+        s[4 * pitch], s[5 * pitch], s[6 * pitch], s[7 * pitch]);
+
+    filter16(mask, *thresh, flat, flat2, s - 8 * pitch, s - 7 * pitch,
+             s - 6 * pitch, s - 5 * pitch, s - 4 * pitch, s - 3 * pitch,
+             s - 2 * pitch, s - 1 * pitch, s, s + 1 * pitch, s + 2 * pitch,
+             s + 3 * pitch, s + 4 * pitch, s + 5 * pitch, s + 6 * pitch,
+             s + 7 * pitch);
     ++s;
   }
 }
 
-void vpx_lpf_horizontal_16_c(uint8_t *s, int p, const uint8_t *blimit,
+void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit,
                              const uint8_t *limit, const uint8_t *thresh) {
-  mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1);
+  mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, 1);
 }
 
-void vpx_lpf_horizontal_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
+void vpx_lpf_horizontal_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit,
                                   const uint8_t *limit, const uint8_t *thresh) {
-  mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2);
+  mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, 2);
 }
 
-static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit,
+static void mb_lpf_vertical_edge_w(uint8_t *s, int pitch, const uint8_t *blimit,
                                    const uint8_t *limit, const uint8_t *thresh,
                                    int count) {
   int i;
@@ -335,18 +342,18 @@ static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit,
     filter16(mask, *thresh, flat, flat2, s - 8, s - 7, s - 6, s - 5, s - 4,
              s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6,
              s + 7);
-    s += p;
+    s += pitch;
   }
 }
 
-void vpx_lpf_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit,
+void vpx_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit,
                            const uint8_t *limit, const uint8_t *thresh) {
-  mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8);
+  mb_lpf_vertical_edge_w(s, pitch, blimit, limit, thresh, 8);
 }
 
-void vpx_lpf_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
+void vpx_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit,
                                 const uint8_t *limit, const uint8_t *thresh) {
-  mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16);
+  mb_lpf_vertical_edge_w(s, pitch, blimit, limit, thresh, 16);
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -440,7 +447,7 @@ static INLINE void highbd_filter4(int8_t mask, uint8_t thresh, uint16_t *op1,
   *op1 = signed_char_clamp_high(ps1 + filter, bd) + (0x80 << shift);
 }
 
-void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */,
+void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch,
                                    const uint8_t *blimit, const uint8_t *limit,
                                    const uint8_t *thresh, int bd) {
   int i;
@@ -448,27 +455,28 @@ void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */,
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
   for (i = 0; i < 8; ++i) {
-    const uint16_t p3 = s[-4 * p];
-    const uint16_t p2 = s[-3 * p];
-    const uint16_t p1 = s[-2 * p];
-    const uint16_t p0 = s[-p];
-    const uint16_t q0 = s[0 * p];
-    const uint16_t q1 = s[1 * p];
-    const uint16_t q2 = s[2 * p];
-    const uint16_t q3 = s[3 * p];
+    const uint16_t p3 = s[-4 * pitch];
+    const uint16_t p2 = s[-3 * pitch];
+    const uint16_t p1 = s[-2 * pitch];
+    const uint16_t p0 = s[-pitch];
+    const uint16_t q0 = s[0 * pitch];
+    const uint16_t q1 = s[1 * pitch];
+    const uint16_t q2 = s[2 * pitch];
+    const uint16_t q3 = s[3 * pitch];
     const int8_t mask =
         highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
-    highbd_filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p, bd);
+    highbd_filter4(mask, *thresh, s - 2 * pitch, s - 1 * pitch, s,
+                   s + 1 * pitch, bd);
     ++s;
   }
 }
 
 void vpx_highbd_lpf_horizontal_4_dual_c(
-    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
     const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
     const uint8_t *thresh1, int bd) {
-  vpx_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, bd);
-  vpx_highbd_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, bd);
+  vpx_highbd_lpf_horizontal_4_c(s, pitch, blimit0, limit0, thresh0, bd);
+  vpx_highbd_lpf_horizontal_4_c(s + 8, pitch, blimit1, limit1, thresh1, bd);
 }
 
 void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit,
@@ -517,33 +525,36 @@ static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, uint8_t flat,
   }
 }
 
-void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit,
-                                   const uint8_t *limit, const uint8_t *thresh,
-                                   int bd) {
+void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int pitch,
+                                   const uint8_t *blimit, const uint8_t *limit,
+                                   const uint8_t *thresh, int bd) {
   int i;
 
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
   for (i = 0; i < 8; ++i) {
-    const uint16_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
-    const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
+    const uint16_t p3 = s[-4 * pitch], p2 = s[-3 * pitch], p1 = s[-2 * pitch],
+                   p0 = s[-pitch];
+    const uint16_t q0 = s[0 * pitch], q1 = s[1 * pitch], q2 = s[2 * pitch],
+                   q3 = s[3 * pitch];
 
     const int8_t mask =
         highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
     const int8_t flat =
         highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
-    highbd_filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p,
-                   s - 1 * p, s, s + 1 * p, s + 2 * p, s + 3 * p, bd);
+    highbd_filter8(mask, *thresh, flat, s - 4 * pitch, s - 3 * pitch,
+                   s - 2 * pitch, s - 1 * pitch, s, s + 1 * pitch,
+                   s + 2 * pitch, s + 3 * pitch, bd);
     ++s;
   }
 }
 
 void vpx_highbd_lpf_horizontal_8_dual_c(
-    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
     const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
     const uint8_t *thresh1, int bd) {
-  vpx_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, bd);
-  vpx_highbd_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, bd);
+  vpx_highbd_lpf_horizontal_8_c(s, pitch, blimit0, limit0, thresh0, bd);
+  vpx_highbd_lpf_horizontal_8_c(s + 8, pitch, blimit1, limit1, thresh1, bd);
 }
 
 void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit,
@@ -639,7 +650,7 @@ static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, uint8_t flat,
   }
 }
 
-static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p,
+static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int pitch,
                                             const uint8_t *blimit,
                                             const uint8_t *limit,
                                             const uint8_t *thresh, int count,
@@ -649,44 +660,45 @@ static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p,
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
   for (i = 0; i < 8 * count; ++i) {
-    const uint16_t p3 = s[-4 * p];
-    const uint16_t p2 = s[-3 * p];
-    const uint16_t p1 = s[-2 * p];
-    const uint16_t p0 = s[-p];
-    const uint16_t q0 = s[0 * p];
-    const uint16_t q1 = s[1 * p];
-    const uint16_t q2 = s[2 * p];
-    const uint16_t q3 = s[3 * p];
+    const uint16_t p3 = s[-4 * pitch];
+    const uint16_t p2 = s[-3 * pitch];
+    const uint16_t p1 = s[-2 * pitch];
+    const uint16_t p0 = s[-pitch];
+    const uint16_t q0 = s[0 * pitch];
+    const uint16_t q1 = s[1 * pitch];
+    const uint16_t q2 = s[2 * pitch];
+    const uint16_t q3 = s[3 * pitch];
     const int8_t mask =
         highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
     const int8_t flat =
         highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
-    const int8_t flat2 =
-        highbd_flat_mask5(1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0, q0,
-                          s[4 * p], s[5 * p], s[6 * p], s[7 * p], bd);
-
-    highbd_filter16(mask, *thresh, flat, flat2, s - 8 * p, s - 7 * p, s - 6 * p,
-                    s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s,
-                    s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p,
-                    s + 6 * p, s + 7 * p, bd);
+    const int8_t flat2 = highbd_flat_mask5(
+        1, s[-8 * pitch], s[-7 * pitch], s[-6 * pitch], s[-5 * pitch], p0, q0,
+        s[4 * pitch], s[5 * pitch], s[6 * pitch], s[7 * pitch], bd);
+
+    highbd_filter16(mask, *thresh, flat, flat2, s - 8 * pitch, s - 7 * pitch,
+                    s - 6 * pitch, s - 5 * pitch, s - 4 * pitch, s - 3 * pitch,
+                    s - 2 * pitch, s - 1 * pitch, s, s + 1 * pitch,
+                    s + 2 * pitch, s + 3 * pitch, s + 4 * pitch, s + 5 * pitch,
+                    s + 6 * pitch, s + 7 * pitch, bd);
     ++s;
   }
 }
 
-void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int p, const uint8_t *blimit,
-                                    const uint8_t *limit, const uint8_t *thresh,
-                                    int bd) {
-  highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1, bd);
+void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch,
+                                    const uint8_t *blimit, const uint8_t *limit,
+                                    const uint8_t *thresh, int bd) {
+  highbd_mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, 1, bd);
 }
 
-void vpx_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int p,
+void vpx_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int pitch,
                                          const uint8_t *blimit,
                                          const uint8_t *limit,
                                          const uint8_t *thresh, int bd) {
-  highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2, bd);
+  highbd_mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, 2, bd);
 }
 
-static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p,
+static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int pitch,
                                           const uint8_t *blimit,
                                           const uint8_t *limit,
                                           const uint8_t *thresh, int count,
@@ -712,20 +724,20 @@ static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p,
     highbd_filter16(mask, *thresh, flat, flat2, s - 8, s - 7, s - 6, s - 5,
                     s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4,
                     s + 5, s + 6, s + 7, bd);
-    s += p;
+    s += pitch;
   }
 }
 
-void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int p, const uint8_t *blimit,
+void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int pitch, const uint8_t *blimit,
                                   const uint8_t *limit, const uint8_t *thresh,
                                   int bd) {
-  highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8, bd);
+  highbd_mb_lpf_vertical_edge_w(s, pitch, blimit, limit, thresh, 8, bd);
 }
 
-void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int p,
+void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int pitch,
                                        const uint8_t *blimit,
                                        const uint8_t *limit,
                                        const uint8_t *thresh, int bd) {
-  highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16, bd);
+  highbd_mb_lpf_vertical_edge_w(s, pitch, blimit, limit, thresh, 16, bd);
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
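
Throughout this file the rename is `p` -> `pitch`: the parameter is the distance between vertically adjacent pixels, so for a horizontal edge the filter taps form a column around `s`, from `s[-4 * pitch]` (p3) down to `s[3 * pitch]` (q3). A small illustrative helper (not in libvpx), assuming nothing beyond that addressing:

#include <stdint.h>

/* Collect the eight pixels a horizontal loop filter reads in one column.
 * `s` points at q0, the first pixel below the edge, and `pitch` is the
 * distance between vertically adjacent pixels, i.e. one buffer row. */
static void gather_column(const uint8_t *s, int pitch, uint8_t out[8]) {
  int i;
  for (i = 0; i < 8; ++i) {
    out[i] = s[(i - 4) * pitch]; /* p3, p2, p1, p0, q0, q1, q2, q3 */
  }
}
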
diff --git a/vpx_dsp/mips/deblock_msa.c b/vpx_dsp/mips/deblock_msa.c
index 9ef0483..1707d32 100644
@@ -508,11 +508,11 @@ void vpx_post_proc_down_and_across_mb_row_msa(uint8_t *src, uint8_t *dst,
   }
 }
 
-void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch,
-                                   int32_t rows, int32_t cols, int32_t flimit) {
+void vpx_mbpost_proc_across_ip_msa(uint8_t *src, int32_t pitch, int32_t rows,
+                                   int32_t cols, int32_t flimit) {
   int32_t row, col, cnt;
-  uint8_t *src_dup = src_ptr;
-  v16u8 src0, src, tmp_orig;
+  uint8_t *src_dup = src;
+  v16u8 src0, src1, tmp_orig;
   v16u8 tmp = { 0 };
   v16i8 zero = { 0 };
   v8u16 sum_h, src_r_h, src_l_h;
@@ -531,13 +531,13 @@ void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch,
     src_dup[cols + 16] = src_dup[cols - 1];
     tmp_orig = (v16u8)__msa_ldi_b(0);
     tmp_orig[15] = tmp[15];
-    src = LD_UB(src_dup - 8);
-    src[15] = 0;
-    ILVRL_B2_UH(zero, src, src_r_h, src_l_h);
+    src1 = LD_UB(src_dup - 8);
+    src1[15] = 0;
+    ILVRL_B2_UH(zero, src1, src_r_h, src_l_h);
     src_r_w = __msa_dotp_u_w(src_r_h, src_r_h);
     src_r_w += __msa_dotp_u_w(src_l_h, src_l_h);
     sum_sq = HADD_SW_S32(src_r_w) + 16;
-    sum_h = __msa_hadd_u_h(src, src);
+    sum_h = __msa_hadd_u_h(src1, src1);
     sum = HADD_UH_U32(sum_h);
     {
       v16u8 src7, src8, src_r, src_l;
@@ -566,8 +566,8 @@ void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch,
           sum_l[cnt + 1] = sum_l[cnt] + sub_l[cnt + 1];
         }
         sum = sum_l[7];
-        src = LD_UB(src_dup + 16 * col);
-        ILVRL_B2_UH(zero, src, src_r_h, src_l_h);
+        src1 = LD_UB(src_dup + 16 * col);
+        ILVRL_B2_UH(zero, src1, src_r_h, src_l_h);
         src7 = (v16u8)((const8 + sum_r + (v8i16)src_r_h) >> 4);
         src8 = (v16u8)((const8 + sum_l + (v8i16)src_l_h) >> 4);
         tmp = (v16u8)__msa_pckev_b((v16i8)src8, (v16i8)src7);
@@ -613,7 +613,7 @@ void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch,
         total3 = (total3 < flimit_vec);
         PCKEV_H2_SH(total1, total0, total3, total2, mask0, mask1);
         mask = __msa_pckev_b((v16i8)mask1, (v16i8)mask0);
-        tmp = __msa_bmz_v(tmp, src, (v16u8)mask);
+        tmp = __msa_bmz_v(tmp, src1, (v16u8)mask);
 
         if (col == 0) {
           uint64_t src_d;
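
Here the parameter rename (`src_ptr` -> `src`) would have collided with the local vector `v16u8 src`, so the vector becomes `src1` in the same change. A hypothetical scalar example of the shadowing hazard being avoided:

#include <stdint.h>

int sum_bytes(const uint8_t *src, int n) {
  int total = 0;
  int i;
  for (i = 0; i < n; ++i) {
    /* A local also named `src` would shadow the parameter from here on
     * (-Wshadow territory); the diff sidesteps this by calling the
     * vector `src1` once the parameter takes the name `src`. */
    const uint8_t src1 = src[i];
    total += src1;
  }
  return total;
}
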
diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c
index e37ca92..82a6595 100644
@@ -17,7 +17,7 @@
 void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
                      const int16_t *round_ptr, const int16_t quant,
                      tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                     const int16_t dequant_ptr, uint16_t *eob_ptr) {
+                     const int16_t dequant, uint16_t *eob_ptr) {
   const int rc = 0;
   const int coeff = coeff_ptr[rc];
   const int coeff_sign = (coeff >> 31);
@@ -31,7 +31,7 @@ void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
     tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
     tmp = (tmp * quant) >> 16;
     qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
-    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr;
+    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant;
     if (tmp) eob = 0;
   }
   *eob_ptr = eob + 1;
@@ -41,7 +41,7 @@ void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
 void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
                             int skip_block, const int16_t *round_ptr,
                             const int16_t quant, tran_low_t *qcoeff_ptr,
-                            tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr,
+                            tran_low_t *dqcoeff_ptr, const int16_t dequant,
                             uint16_t *eob_ptr) {
   int eob = -1;
 
@@ -55,7 +55,7 @@ void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
     const int64_t tmp = abs_coeff + round_ptr[0];
     const int abs_qcoeff = (int)((tmp * quant) >> 16);
     qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
-    dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr;
+    dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant;
     if (abs_qcoeff) eob = 0;
   }
   *eob_ptr = eob + 1;
@@ -65,7 +65,7 @@ void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
 void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
                            const int16_t *round_ptr, const int16_t quant,
                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                           const int16_t dequant_ptr, uint16_t *eob_ptr) {
+                           const int16_t dequant, uint16_t *eob_ptr) {
   const int n_coeffs = 1024;
   const int rc = 0;
   const int coeff = coeff_ptr[rc];
@@ -81,7 +81,7 @@ void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
                 INT16_MIN, INT16_MAX);
     tmp = (tmp * quant) >> 15;
     qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
-    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 2;
+    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant / 2;
     if (tmp) eob = 0;
   }
   *eob_ptr = eob + 1;
@@ -92,8 +92,7 @@ void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
                                   const int16_t *round_ptr, const int16_t quant,
                                   tran_low_t *qcoeff_ptr,
                                   tran_low_t *dqcoeff_ptr,
-                                  const int16_t dequant_ptr,
-                                  uint16_t *eob_ptr) {
+                                  const int16_t dequant, uint16_t *eob_ptr) {
   const int n_coeffs = 1024;
   int eob = -1;
 
@@ -107,7 +106,7 @@ void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
     const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 1);
     const int abs_qcoeff = (int)((tmp * quant) >> 15);
     qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
-    dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr / 2;
+    dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant / 2;
     if (abs_qcoeff) eob = 0;
   }
   *eob_ptr = eob + 1;
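
In scalar form, the DC rule above is: clamp `|coeff| + round` to 16 bits, multiply by `quant` and shift right by 16 (15 for the 32x32 variant, which also halves the dequantized value), restore the sign, and multiply by `dequant`. A standalone sketch of the 4x4..16x16 path:

#include <stdint.h>
#include <stdlib.h>

static int32_t clamp16(int32_t v) {
  return v < INT16_MIN ? INT16_MIN : (v > INT16_MAX ? INT16_MAX : v);
}

/* Scalar restatement of the DC rule in vpx_quantize_dc() above: clamp
 * the rounded magnitude, scale by `quant` in Q16, restore the sign,
 * then dequantize. `sign * tmp` gives the same result as the
 * (tmp ^ coeff_sign) - coeff_sign trick in the original. */
static void quantize_dc_sketch(int32_t coeff, int16_t round, int16_t quant,
                               int16_t dequant, int32_t *qcoeff,
                               int32_t *dqcoeff) {
  const int sign = coeff < 0 ? -1 : 1;
  int32_t tmp = clamp16(abs(coeff) + round);
  tmp = (tmp * quant) >> 16;
  *qcoeff = sign * tmp;
  *dqcoeff = *qcoeff * dequant;
}
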
diff --git a/vpx_dsp/quantize.h b/vpx_dsp/quantize.h
index 94c8206..7cac140 100644
@@ -19,26 +19,25 @@ extern "C" {
 #endif
 
 void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
-                     const int16_t *round_ptr, const int16_t quant_ptr,
+                     const int16_t *round_ptr, const int16_t quant,
                      tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                     const int16_t dequant_ptr, uint16_t *eob_ptr);
+                     const int16_t dequant, uint16_t *eob_ptr);
 void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
-                           const int16_t *round_ptr, const int16_t quant_ptr,
+                           const int16_t *round_ptr, const int16_t quant,
                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                           const int16_t dequant_ptr, uint16_t *eob_ptr);
+                           const int16_t dequant, uint16_t *eob_ptr);
 
 #if CONFIG_VP9_HIGHBITDEPTH
 void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
                             int skip_block, const int16_t *round_ptr,
-                            const int16_t quant_ptr, tran_low_t *qcoeff_ptr,
-                            tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr,
+                            const int16_t quant, tran_low_t *qcoeff_ptr,
+                            tran_low_t *dqcoeff_ptr, const int16_t dequant,
                             uint16_t *eob_ptr);
 void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
-                                  const int16_t *round_ptr,
-                                  const int16_t quant_ptr,
+                                  const int16_t *round_ptr, const int16_t quant,
                                   tran_low_t *qcoeff_ptr,
                                   tran_low_t *dqcoeff_ptr,
-                                  const int16_t dequant_ptr, uint16_t *eob_ptr);
+                                  const int16_t dequant, uint16_t *eob_ptr);
 #endif
 
 #ifdef __cplusplus
diff --git a/vpx_dsp/sad.c b/vpx_dsp/sad.c
index d4a5329..873ddca 100644
 #include "vpx_ports/mem.h"
 
 /* Sum the difference between every corresponding element of the buffers. */
-static INLINE unsigned int sad(const uint8_t *a, int a_stride, const uint8_t *b,
-                               int b_stride, int width, int height) {
+static INLINE unsigned int sad(const uint8_t *src_ptr, int src_stride,
+                               const uint8_t *ref_ptr, int ref_stride,
+                               int width, int height) {
   int y, x;
   unsigned int sad = 0;
 
   for (y = 0; y < height; y++) {
-    for (x = 0; x < width; x++) sad += abs(a[x] - b[x]);
+    for (x = 0; x < width; x++) sad += abs(src_ptr[x] - ref_ptr[x]);
 
-    a += a_stride;
-    b += b_stride;
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
   }
   return sad;
 }
 
-#define sadMxN(m, n)                                                        \
-  unsigned int vpx_sad##m##x##n##_c(const uint8_t *src, int src_stride,     \
-                                    const uint8_t *ref, int ref_stride) {   \
-    return sad(src, src_stride, ref, ref_stride, m, n);                     \
-  }                                                                         \
-  unsigned int vpx_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \
-                                        const uint8_t *ref, int ref_stride, \
-                                        const uint8_t *second_pred) {       \
-    DECLARE_ALIGNED(16, uint8_t, comp_pred[m * n]);                         \
-    vpx_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride);     \
-    return sad(src, src_stride, comp_pred, m, m, n);                        \
+#define sadMxN(m, n)                                                          \
+  unsigned int vpx_sad##m##x##n##_c(const uint8_t *src_ptr, int src_stride,   \
+                                    const uint8_t *ref_ptr, int ref_stride) { \
+    return sad(src_ptr, src_stride, ref_ptr, ref_stride, m, n);               \
+  }                                                                           \
+  unsigned int vpx_sad##m##x##n##_avg_c(                                      \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
+      int ref_stride, const uint8_t *second_pred) {                           \
+    DECLARE_ALIGNED(16, uint8_t, comp_pred[m * n]);                           \
+    vpx_comp_avg_pred_c(comp_pred, second_pred, m, n, ref_ptr, ref_stride);   \
+    return sad(src_ptr, src_stride, comp_pred, m, m, n);                      \
   }
 
 // depending on call sites, pass **ref_array to avoid & in subsequent call and
 // de-dup with 4D below.
-#define sadMxNxK(m, n, k)                                                   \
-  void vpx_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride,       \
-                                  const uint8_t *ref_array, int ref_stride, \
-                                  uint32_t *sad_array) {                    \
-    int i;                                                                  \
-    for (i = 0; i < k; ++i)                                                 \
-      sad_array[i] =                                                        \
-          vpx_sad##m##x##n##_c(src, src_stride, &ref_array[i], ref_stride); \
+#define sadMxNxK(m, n, k)                                                     \
+  void vpx_sad##m##x##n##x##k##_c(const uint8_t *src_ptr, int src_stride,     \
+                                  const uint8_t *ref_ptr, int ref_stride,     \
+                                  uint32_t *sad_array) {                      \
+    int i;                                                                    \
+    for (i = 0; i < k; ++i)                                                   \
+      sad_array[i] =                                                          \
+          vpx_sad##m##x##n##_c(src_ptr, src_stride, &ref_ptr[i], ref_stride); \
   }
 
 // This appears to be equivalent to the above when k == 4 and refs is const
-#define sadMxNx4D(m, n)                                                    \
-  void vpx_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride,         \
-                               const uint8_t *const ref_array[],           \
-                               int ref_stride, uint32_t *sad_array) {      \
-    int i;                                                                 \
-    for (i = 0; i < 4; ++i)                                                \
-      sad_array[i] =                                                       \
-          vpx_sad##m##x##n##_c(src, src_stride, ref_array[i], ref_stride); \
+#define sadMxNx4D(m, n)                                                        \
+  void vpx_sad##m##x##n##x4d_c(const uint8_t *src_ptr, int src_stride,         \
+                               const uint8_t *const ref_array[],               \
+                               int ref_stride, uint32_t *sad_array) {          \
+    int i;                                                                     \
+    for (i = 0; i < 4; ++i)                                                    \
+      sad_array[i] =                                                           \
+          vpx_sad##m##x##n##_c(src_ptr, src_stride, ref_array[i], ref_stride); \
   }
 
 /* clang-format off */
@@ -133,60 +134,61 @@ sadMxNx4D(4, 4)
 
 #if CONFIG_VP9_HIGHBITDEPTH
         static INLINE
-    unsigned int highbd_sad(const uint8_t *a8, int a_stride, const uint8_t *b8,
-                            int b_stride, int width, int height) {
+    unsigned int highbd_sad(const uint8_t *src8_ptr, int src_stride,
+                            const uint8_t *ref8_ptr, int ref_stride, int width,
+                            int height) {
   int y, x;
   unsigned int sad = 0;
-  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
-  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr);
+  const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref8_ptr);
   for (y = 0; y < height; y++) {
-    for (x = 0; x < width; x++) sad += abs(a[x] - b[x]);
+    for (x = 0; x < width; x++) sad += abs(src[x] - ref_ptr[x]);
 
-    a += a_stride;
-    b += b_stride;
+    src += src_stride;
+    ref_ptr += ref_stride;
   }
   return sad;
 }
 
-static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride,
-                                       const uint16_t *b, int b_stride,
+static INLINE unsigned int highbd_sadb(const uint8_t *src8_ptr, int src_stride,
+                                       const uint16_t *ref_ptr, int ref_stride,
                                        int width, int height) {
   int y, x;
   unsigned int sad = 0;
-  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr);
   for (y = 0; y < height; y++) {
-    for (x = 0; x < width; x++) sad += abs(a[x] - b[x]);
+    for (x = 0; x < width; x++) sad += abs(src[x] - ref_ptr[x]);
 
-    a += a_stride;
-    b += b_stride;
+    src += src_stride;
+    ref_ptr += ref_stride;
   }
   return sad;
 }
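
The `src8_ptr`/`ref8_ptr` names reflect the high-bit-depth convention: 16-bit sample buffers travel through `uint8_t *` interfaces and are unpacked back to `uint16_t *` at the leaf functions. A sketch of that unpacking, under the assumption that CONVERT_TO_SHORTPTR (vpx_ports/mem.h) is a shift-based repacking of the pointer:

#include <stdint.h>

/* Assumption for illustration only; the real macro lives in
 * vpx_ports/mem.h and may differ. */
static uint16_t *convert_to_shortptr_sketch(uint8_t *p) {
  return (uint16_t *)((uintptr_t)p << 1);
}
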
 
 #define highbd_sadMxN(m, n)                                                    \
-  unsigned int vpx_highbd_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
-                                           const uint8_t *ref,                 \
-                                           int ref_stride) {                   \
-    return highbd_sad(src, src_stride, ref, ref_stride, m, n);                 \
+  unsigned int vpx_highbd_sad##m##x##n##_c(                                    \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,          \
+      int ref_stride) {                                                        \
+    return highbd_sad(src_ptr, src_stride, ref_ptr, ref_stride, m, n);         \
   }                                                                            \
   unsigned int vpx_highbd_sad##m##x##n##_avg_c(                                \
-      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,  \
-      const uint8_t *second_pred) {                                            \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,          \
+      int ref_stride, const uint8_t *second_pred) {                            \
     DECLARE_ALIGNED(16, uint16_t, comp_pred[m * n]);                           \
     vpx_highbd_comp_avg_pred_c(comp_pred, CONVERT_TO_SHORTPTR(second_pred), m, \
-                               n, CONVERT_TO_SHORTPTR(ref), ref_stride);       \
-    return highbd_sadb(src, src_stride, comp_pred, m, m, n);                   \
+                               n, CONVERT_TO_SHORTPTR(ref_ptr), ref_stride);   \
+    return highbd_sadb(src_ptr, src_stride, comp_pred, m, m, n);               \
   }
 
-#define highbd_sadMxNx4D(m, n)                                               \
-  void vpx_highbd_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride,    \
-                                      const uint8_t *const ref_array[],      \
-                                      int ref_stride, uint32_t *sad_array) { \
-    int i;                                                                   \
-    for (i = 0; i < 4; ++i) {                                                \
-      sad_array[i] = vpx_highbd_sad##m##x##n##_c(src, src_stride,            \
-                                                 ref_array[i], ref_stride);  \
-    }                                                                        \
+#define highbd_sadMxNx4D(m, n)                                                \
+  void vpx_highbd_sad##m##x##n##x4d_c(const uint8_t *src_ptr, int src_stride, \
+                                      const uint8_t *const ref_array[],       \
+                                      int ref_stride, uint32_t *sad_array) {  \
+    int i;                                                                    \
+    for (i = 0; i < 4; ++i) {                                                 \
+      sad_array[i] = vpx_highbd_sad##m##x##n##_c(src_ptr, src_stride,         \
+                                                 ref_array[i], ref_stride);   \
+    }                                                                         \
   }
 
 /* clang-format off */
diff --git a/vpx_dsp/subtract.c b/vpx_dsp/subtract.c
index 95e7071..45c819e 100644
 #include "vpx/vpx_integer.h"
 #include "vpx_ports/mem.h"
 
-void vpx_subtract_block_c(int rows, int cols, int16_t *diff,
-                          ptrdiff_t diff_stride, const uint8_t *src,
-                          ptrdiff_t src_stride, const uint8_t *pred,
+void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr,
+                          ptrdiff_t diff_stride, const uint8_t *src_ptr,
+                          ptrdiff_t src_stride, const uint8_t *pred_ptr,
                           ptrdiff_t pred_stride) {
   int r, c;
 
   for (r = 0; r < rows; r++) {
-    for (c = 0; c < cols; c++) diff[c] = src[c] - pred[c];
+    for (c = 0; c < cols; c++) diff_ptr[c] = src_ptr[c] - pred_ptr[c];
 
-    diff += diff_stride;
-    pred += pred_stride;
-    src += src_stride;
+    diff_ptr += diff_stride;
+    pred_ptr += pred_stride;
+    src_ptr += src_stride;
   }
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff,
-                                 ptrdiff_t diff_stride, const uint8_t *src8,
-                                 ptrdiff_t src_stride, const uint8_t *pred8,
+void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr,
+                                 ptrdiff_t diff_stride, const uint8_t *src8_ptr,
+                                 ptrdiff_t src_stride, const uint8_t *pred8_ptr,
                                  ptrdiff_t pred_stride, int bd) {
   int r, c;
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr);
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8_ptr);
   (void)bd;
 
   for (r = 0; r < rows; r++) {
     for (c = 0; c < cols; c++) {
-      diff[c] = src[c] - pred[c];
+      diff_ptr[c] = src[c] - pred[c];
     }
 
-    diff += diff_stride;
+    diff_ptr += diff_stride;
     pred += pred_stride;
     src += src_stride;
   }
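
vpx_subtract_block_c() computes the residual `diff = src - pred` row by row, advancing each buffer by its own stride. A hypothetical 4x4 usage with contiguous buffers, so every stride equals the block width:

#include <stddef.h>
#include <stdint.h>

void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr,
                          ptrdiff_t diff_stride, const uint8_t *src_ptr,
                          ptrdiff_t src_stride, const uint8_t *pred_ptr,
                          ptrdiff_t pred_stride);

/* After the call, diff[i] == i for every position. */
void residual_4x4_example(void) {
  uint8_t src[16], pred[16];
  int16_t diff[16];
  int i;
  for (i = 0; i < 16; ++i) {
    src[i] = (uint8_t)(100 + i);
    pred[i] = 100;
  }
  vpx_subtract_block_c(4, 4, diff, 4, src, 4, pred, 4);
}
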
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 13d4b3d..f4f89b9 100644
@@ -37,322 +37,322 @@ if ($opts{arch} eq "x86_64") {
 # Intra prediction
 #
 
-add_proto qw/void vpx_d207_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d207_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d207_predictor_4x4 sse2/;
 
-add_proto qw/void vpx_d45_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d45_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d45_predictor_4x4 neon sse2/;
 
-add_proto qw/void vpx_d45e_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d45e_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 
-add_proto qw/void vpx_d63_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d63_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d63_predictor_4x4 ssse3/;
 
-add_proto qw/void vpx_d63e_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d63e_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 
-add_proto qw/void vpx_h_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_h_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_h_predictor_4x4 neon dspr2 msa sse2 vsx/;
 
-add_proto qw/void vpx_he_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_he_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 
-add_proto qw/void vpx_d117_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d117_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 
-add_proto qw/void vpx_d135_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d135_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d135_predictor_4x4 neon/;
 
-add_proto qw/void vpx_d153_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d153_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d153_predictor_4x4 ssse3/;
 
-add_proto qw/void vpx_v_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_v_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_v_predictor_4x4 neon msa sse2/;
 
-add_proto qw/void vpx_ve_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_ve_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 
-add_proto qw/void vpx_tm_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_tm_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_tm_predictor_4x4 neon dspr2 msa sse2 vsx/;
 
-add_proto qw/void vpx_dc_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_predictor_4x4 dspr2 msa neon sse2/;
 
-add_proto qw/void vpx_dc_top_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_top_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_top_predictor_4x4 msa neon sse2/;
 
-add_proto qw/void vpx_dc_left_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_left_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_left_predictor_4x4 msa neon sse2/;
 
-add_proto qw/void vpx_dc_128_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_128_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_128_predictor_4x4 msa neon sse2/;
 
-add_proto qw/void vpx_d207_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d207_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d207_predictor_8x8 ssse3/;
 
-add_proto qw/void vpx_d45_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d45_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d45_predictor_8x8 neon sse2 vsx/;
 
-add_proto qw/void vpx_d63_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d63_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d63_predictor_8x8 ssse3 vsx/;
 
-add_proto qw/void vpx_h_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_h_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_h_predictor_8x8 neon dspr2 msa sse2 vsx/;
 
-add_proto qw/void vpx_d117_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d117_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 
-add_proto qw/void vpx_d135_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d135_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d135_predictor_8x8 neon/;
 
-add_proto qw/void vpx_d153_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d153_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d153_predictor_8x8 ssse3/;
 
-add_proto qw/void vpx_v_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_v_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_v_predictor_8x8 neon msa sse2/;
 
-add_proto qw/void vpx_tm_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_tm_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_tm_predictor_8x8 neon dspr2 msa sse2 vsx/;
 
-add_proto qw/void vpx_dc_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_predictor_8x8 dspr2 neon msa sse2 vsx/;
 
-add_proto qw/void vpx_dc_top_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_top_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_top_predictor_8x8 neon msa sse2/;
 
-add_proto qw/void vpx_dc_left_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_left_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_left_predictor_8x8 neon msa sse2/;
 
-add_proto qw/void vpx_dc_128_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_128_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_128_predictor_8x8 neon msa sse2/;
 
-add_proto qw/void vpx_d207_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d207_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d207_predictor_16x16 ssse3/;
 
-add_proto qw/void vpx_d45_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d45_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d45_predictor_16x16 neon ssse3 vsx/;
 
-add_proto qw/void vpx_d63_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d63_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d63_predictor_16x16 ssse3 vsx/;
 
-add_proto qw/void vpx_h_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_h_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_h_predictor_16x16 neon dspr2 msa sse2 vsx/;
 
-add_proto qw/void vpx_d117_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d117_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 
-add_proto qw/void vpx_d135_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d135_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d135_predictor_16x16 neon/;
 
-add_proto qw/void vpx_d153_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d153_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d153_predictor_16x16 ssse3/;
 
-add_proto qw/void vpx_v_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_v_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_v_predictor_16x16 neon msa sse2 vsx/;
 
-add_proto qw/void vpx_tm_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_tm_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_tm_predictor_16x16 neon msa sse2 vsx/;
 
-add_proto qw/void vpx_dc_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_predictor_16x16 dspr2 neon msa sse2 vsx/;
 
-add_proto qw/void vpx_dc_top_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_top_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_top_predictor_16x16 neon msa sse2 vsx/;
 
-add_proto qw/void vpx_dc_left_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_left_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_left_predictor_16x16 neon msa sse2 vsx/;
 
-add_proto qw/void vpx_dc_128_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_128_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_128_predictor_16x16 neon msa sse2 vsx/;
 
-add_proto qw/void vpx_d207_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d207_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d207_predictor_32x32 ssse3/;
 
-add_proto qw/void vpx_d45_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d45_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d45_predictor_32x32 neon ssse3 vsx/;
 
-add_proto qw/void vpx_d63_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d63_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d63_predictor_32x32 ssse3 vsx/;
 
-add_proto qw/void vpx_h_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_h_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_h_predictor_32x32 neon msa sse2 vsx/;
 
-add_proto qw/void vpx_d117_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d117_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 
-add_proto qw/void vpx_d135_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d135_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d135_predictor_32x32 neon/;
 
-add_proto qw/void vpx_d153_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d153_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d153_predictor_32x32 ssse3/;
 
-add_proto qw/void vpx_v_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_v_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_v_predictor_32x32 neon msa sse2 vsx/;
 
-add_proto qw/void vpx_tm_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_tm_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_tm_predictor_32x32 neon msa sse2 vsx/;
 
-add_proto qw/void vpx_dc_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_predictor_32x32 msa neon sse2 vsx/;
 
-add_proto qw/void vpx_dc_top_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_top_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_top_predictor_32x32 msa neon sse2 vsx/;
 
-add_proto qw/void vpx_dc_left_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_left_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_left_predictor_32x32 msa neon sse2 vsx/;
 
-add_proto qw/void vpx_dc_128_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_128_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_128_predictor_32x32 msa neon sse2 vsx/;
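
Each add_proto/specialize pair drives run-time CPU dispatch: the quoted parameter string is pasted verbatim into the generated prototypes, which is why the `y_stride` -> `stride` rename here has to track the C definitions. The generated vpx_dsp_rtcd.h entry for one predictor looks roughly like this (shape recalled from typical rtcd output, so treat it as approximate):

/* Approximate shape of the generated entry for vpx_dc_predictor_4x4;
 * the real header is produced at build time. */
void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
                            const uint8_t *above, const uint8_t *left);
void vpx_dc_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left);
RTCD_EXTERN void (*vpx_dc_predictor_4x4)(uint8_t *dst, ptrdiff_t stride,
                                         const uint8_t *above,
                                         const uint8_t *left);
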
 
 # High bitdepth functions
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
-  add_proto qw/void vpx_highbd_d207_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d207_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d207_predictor_4x4 sse2/;
 
-  add_proto qw/void vpx_highbd_d45_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d45_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d45_predictor_4x4 neon ssse3/;
 
-  add_proto qw/void vpx_highbd_d63_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d63_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d63_predictor_4x4 sse2/;
 
-  add_proto qw/void vpx_highbd_h_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_h_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_h_predictor_4x4 neon sse2/;
 
-  add_proto qw/void vpx_highbd_d117_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d117_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d117_predictor_4x4 sse2/;
 
-  add_proto qw/void vpx_highbd_d135_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d135_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d135_predictor_4x4 neon sse2/;
 
-  add_proto qw/void vpx_highbd_d153_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d153_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d153_predictor_4x4 sse2/;
 
-  add_proto qw/void vpx_highbd_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_v_predictor_4x4 neon sse2/;
 
-  add_proto qw/void vpx_highbd_tm_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_tm_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_tm_predictor_4x4 neon sse2/;
 
-  add_proto qw/void vpx_highbd_dc_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_dc_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_predictor_4x4 neon sse2/;
 
-  add_proto qw/void vpx_highbd_dc_top_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_dc_top_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_top_predictor_4x4 neon sse2/;
 
-  add_proto qw/void vpx_highbd_dc_left_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_dc_left_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_left_predictor_4x4 neon sse2/;
 
-  add_proto qw/void vpx_highbd_dc_128_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_dc_128_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_128_predictor_4x4 neon sse2/;
 
-  add_proto qw/void vpx_highbd_d207_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d207_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d207_predictor_8x8 ssse3/;
 
-  add_proto qw/void vpx_highbd_d45_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d45_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d45_predictor_8x8 neon ssse3/;
 
-  add_proto qw/void vpx_highbd_d63_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d63_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d63_predictor_8x8 ssse3/;
 
-  add_proto qw/void vpx_highbd_h_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_h_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_h_predictor_8x8 neon sse2/;
 
-  add_proto qw/void vpx_highbd_d117_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d117_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d117_predictor_8x8 ssse3/;
 
-  add_proto qw/void vpx_highbd_d135_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d135_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d135_predictor_8x8 neon ssse3/;
 
-  add_proto qw/void vpx_highbd_d153_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d153_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d153_predictor_8x8 ssse3/;
 
-  add_proto qw/void vpx_highbd_v_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_v_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_v_predictor_8x8 neon sse2/;
 
-  add_proto qw/void vpx_highbd_tm_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_tm_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_tm_predictor_8x8 neon sse2/;
 
-  add_proto qw/void vpx_highbd_dc_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_dc_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_predictor_8x8 neon sse2/;
 
-  add_proto qw/void vpx_highbd_dc_top_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_dc_top_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_top_predictor_8x8 neon sse2/;
 
-  add_proto qw/void vpx_highbd_dc_left_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_dc_left_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_left_predictor_8x8 neon sse2/;
 
-  add_proto qw/void vpx_highbd_dc_128_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_dc_128_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_128_predictor_8x8 neon sse2/;
 
-  add_proto qw/void vpx_highbd_d207_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d207_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d207_predictor_16x16 ssse3/;
 
-  add_proto qw/void vpx_highbd_d45_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d45_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d45_predictor_16x16 neon ssse3/;
 
-  add_proto qw/void vpx_highbd_d63_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d63_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d63_predictor_16x16 ssse3/;
 
-  add_proto qw/void vpx_highbd_h_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_h_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_h_predictor_16x16 neon sse2/;
 
-  add_proto qw/void vpx_highbd_d117_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d117_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d117_predictor_16x16 ssse3/;
 
-  add_proto qw/void vpx_highbd_d135_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d135_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d135_predictor_16x16 neon ssse3/;
 
-  add_proto qw/void vpx_highbd_d153_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d153_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d153_predictor_16x16 ssse3/;
 
-  add_proto qw/void vpx_highbd_v_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_v_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_v_predictor_16x16 neon sse2/;
 
-  add_proto qw/void vpx_highbd_tm_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_tm_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_tm_predictor_16x16 neon sse2/;
 
-  add_proto qw/void vpx_highbd_dc_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_dc_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_predictor_16x16 neon sse2/;
 
-  add_proto qw/void vpx_highbd_dc_top_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_dc_top_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_top_predictor_16x16 neon sse2/;
 
-  add_proto qw/void vpx_highbd_dc_left_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_dc_left_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_left_predictor_16x16 neon sse2/;
 
-  add_proto qw/void vpx_highbd_dc_128_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_dc_128_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_128_predictor_16x16 neon sse2/;
 
-  add_proto qw/void vpx_highbd_d207_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d207_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d207_predictor_32x32 ssse3/;
 
-  add_proto qw/void vpx_highbd_d45_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d45_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d45_predictor_32x32 neon ssse3/;
 
-  add_proto qw/void vpx_highbd_d63_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d63_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d63_predictor_32x32 ssse3/;
 
-  add_proto qw/void vpx_highbd_h_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_h_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_h_predictor_32x32 neon sse2/;
 
-  add_proto qw/void vpx_highbd_d117_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d117_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d117_predictor_32x32 ssse3/;
 
-  add_proto qw/void vpx_highbd_d135_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d135_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d135_predictor_32x32 neon ssse3/;
 
-  add_proto qw/void vpx_highbd_d153_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d153_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d153_predictor_32x32 ssse3/;
 
-  add_proto qw/void vpx_highbd_v_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_v_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_v_predictor_32x32 neon sse2/;
 
-  add_proto qw/void vpx_highbd_tm_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_tm_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_tm_predictor_32x32 neon sse2/;
 
-  add_proto qw/void vpx_highbd_dc_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_dc_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_predictor_32x32 neon sse2/;
 
-  add_proto qw/void vpx_highbd_dc_top_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_dc_top_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_top_predictor_32x32 neon sse2/;
 
-  add_proto qw/void vpx_highbd_dc_left_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_dc_left_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_left_predictor_32x32 neon sse2/;
 
-  add_proto qw/void vpx_highbd_dc_128_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_dc_128_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_128_predictor_32x32 neon sse2/;
 }  # CONFIG_VP9_HIGHBITDEPTH
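Note: each add_proto/specialize pair above drives the RTCD (run-time CPU
detection) generator, so the parameter names in this file become the ones the
generated header declares and that clang-tidy compares implementations
against. Roughly, one entry expands to declarations like the following (a
sketch, not the literal generated output):

    void vpx_highbd_dc_predictor_4x4_c(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd);
    void vpx_highbd_dc_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd);
    RTCD_EXTERN void (*vpx_highbd_dc_predictor_4x4)(uint16_t *dst,
                                                    ptrdiff_t stride,
                                                    const uint16_t *above,
                                                    const uint16_t *left,
                                                    int bd);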
 
@@ -400,28 +400,28 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   #
   # Sub Pixel Filters
   #
-  add_proto qw/void vpx_highbd_convolve_copy/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";
+  add_proto qw/void vpx_highbd_convolve_copy/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
   specialize qw/vpx_highbd_convolve_copy sse2 avx2 neon/;
 
-  add_proto qw/void vpx_highbd_convolve_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";
+  add_proto qw/void vpx_highbd_convolve_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
   specialize qw/vpx_highbd_convolve_avg sse2 avx2 neon/;
 
-  add_proto qw/void vpx_highbd_convolve8/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";
+  add_proto qw/void vpx_highbd_convolve8/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
   specialize qw/vpx_highbd_convolve8 avx2 neon/, "$sse2_x86_64";
 
-  add_proto qw/void vpx_highbd_convolve8_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";
+  add_proto qw/void vpx_highbd_convolve8_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
   specialize qw/vpx_highbd_convolve8_horiz avx2 neon/, "$sse2_x86_64";
 
-  add_proto qw/void vpx_highbd_convolve8_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";
+  add_proto qw/void vpx_highbd_convolve8_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
   specialize qw/vpx_highbd_convolve8_vert avx2 neon/, "$sse2_x86_64";
 
-  add_proto qw/void vpx_highbd_convolve8_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";
+  add_proto qw/void vpx_highbd_convolve8_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
   specialize qw/vpx_highbd_convolve8_avg avx2 neon/, "$sse2_x86_64";
 
-  add_proto qw/void vpx_highbd_convolve8_avg_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";
+  add_proto qw/void vpx_highbd_convolve8_avg_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
   specialize qw/vpx_highbd_convolve8_avg_horiz avx2 neon/, "$sse2_x86_64";
 
-  add_proto qw/void vpx_highbd_convolve8_avg_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";
+  add_proto qw/void vpx_highbd_convolve8_avg_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
   specialize qw/vpx_highbd_convolve8_avg_vert avx2 neon/, "$sse2_x86_64";
 }  # CONFIG_VP9_HIGHBITDEPTH
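Note: the final parameter of every highbd convolve prototype is now
consistently the bit depth (bd) rather than bits per sample (bps). A minimal
call sketch, assuming 10-bit src/dst buffers are already set up; the copy
variant ignores the filter and *_q4 arguments:

    vpx_highbd_convolve_copy(src, src_stride, dst, dst_stride,
                             NULL /* unused by copy */, 0, 16, 0, 16,
                             32, 32, /*bd=*/10);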
 
@@ -888,43 +888,43 @@ specialize qw/vpx_sad4x4x8 sse4_1 msa mmi/;
 #
 # Multi-block SAD, comparing a reference to N independent blocks
 #
-add_proto qw/void vpx_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
 specialize qw/vpx_sad64x64x4d avx512 avx2 neon msa sse2 vsx mmi/;
 
-add_proto qw/void vpx_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
 specialize qw/vpx_sad64x32x4d neon msa sse2 vsx mmi/;
 
-add_proto qw/void vpx_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
 specialize qw/vpx_sad32x64x4d neon msa sse2 vsx mmi/;
 
-add_proto qw/void vpx_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
 specialize qw/vpx_sad32x32x4d avx2 neon msa sse2 vsx mmi/;
 
-add_proto qw/void vpx_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
 specialize qw/vpx_sad32x16x4d neon msa sse2 vsx mmi/;
 
-add_proto qw/void vpx_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
 specialize qw/vpx_sad16x32x4d neon msa sse2 vsx mmi/;
 
-add_proto qw/void vpx_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
 specialize qw/vpx_sad16x16x4d neon msa sse2 vsx mmi/;
 
-add_proto qw/void vpx_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
 specialize qw/vpx_sad16x8x4d neon msa sse2 vsx mmi/;
 
-add_proto qw/void vpx_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
 specialize qw/vpx_sad8x16x4d neon msa sse2 mmi/;
 
-add_proto qw/void vpx_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
 specialize qw/vpx_sad8x8x4d neon msa sse2 mmi/;
 
-add_proto qw/void vpx_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
 specialize qw/vpx_sad8x4x4d neon msa sse2 mmi/;
 
-add_proto qw/void vpx_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
 specialize qw/vpx_sad4x8x4d neon msa sse2 mmi/;
 
-add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
 specialize qw/vpx_sad4x4x4d neon msa sse2 mmi/;
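Note: the x4d functions compute four SADs in one call, one source block
against four independent reference candidates, which is the hot path of
motion search. A usage sketch, assuming src_ptr and ref_frame point into
valid frames and off0..off3 are hypothetical candidate offsets:

    const uint8_t *const ref_array[4] = { ref_frame + off0, ref_frame + off1,
                                          ref_frame + off2, ref_frame + off3 };
    uint32_t sad_array[4];
    vpx_sad16x16x4d(src_ptr, src_stride, ref_array, ref_stride, sad_array);
    /* sad_array[i] now holds the SAD against candidate i. */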
 
 add_proto qw/uint64_t vpx_sum_squares_2d_i16/, "const int16_t *src, int stride, int size";
@@ -945,7 +945,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   #
   # Block subtraction
   #
-  add_proto qw/void vpx_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
+  add_proto qw/void vpx_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src8_ptr, ptrdiff_t src_stride, const uint8_t *pred8_ptr, ptrdiff_t pred_stride, int bd";
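Note: the new src8_ptr/pred8_ptr names follow the libvpx convention that an
8-suffixed uint8_t pointer actually addresses uint16_t samples in
high-bitdepth builds. Implementations recover the real pointer with the usual
conversion macro (a sketch of the pattern; CONVERT_TO_SHORTPTR comes from
vpx_ports/mem.h):

    uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr);
    uint16_t *pred = CONVERT_TO_SHORTPTR(pred8_ptr);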
 
   #
   # Single block SAD
@@ -990,13 +990,13 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   #
   # Avg
   #
-  add_proto qw/unsigned int vpx_highbd_avg_8x8/, "const uint8_t *, int p";
+  add_proto qw/unsigned int vpx_highbd_avg_8x8/, "const uint8_t *s8, int p";
   specialize qw/vpx_highbd_avg_8x8 sse2/;
 
-  add_proto qw/unsigned int vpx_highbd_avg_4x4/, "const uint8_t *, int p";
+  add_proto qw/unsigned int vpx_highbd_avg_4x4/, "const uint8_t *s8, int p";
   specialize qw/vpx_highbd_avg_4x4 sse2/;
 
-  add_proto qw/void vpx_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
+  add_proto qw/void vpx_highbd_minmax_8x8/, "const uint8_t *s8, int p, const uint8_t *d8, int dp, int *min, int *max";
 
   add_proto qw/unsigned int vpx_highbd_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
   specialize qw/vpx_highbd_sad64x64_avg sse2/;
@@ -1038,43 +1038,43 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   #
   # Multi-block SAD, comparing a reference to N independent blocks
   #
-  add_proto qw/void vpx_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  add_proto qw/void vpx_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
   specialize qw/vpx_highbd_sad64x64x4d sse2/;
 
-  add_proto qw/void vpx_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  add_proto qw/void vpx_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
   specialize qw/vpx_highbd_sad64x32x4d sse2/;
 
-  add_proto qw/void vpx_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  add_proto qw/void vpx_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
   specialize qw/vpx_highbd_sad32x64x4d sse2/;
 
-  add_proto qw/void vpx_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  add_proto qw/void vpx_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
   specialize qw/vpx_highbd_sad32x32x4d sse2/;
 
-  add_proto qw/void vpx_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  add_proto qw/void vpx_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
   specialize qw/vpx_highbd_sad32x16x4d sse2/;
 
-  add_proto qw/void vpx_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  add_proto qw/void vpx_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
   specialize qw/vpx_highbd_sad16x32x4d sse2/;
 
-  add_proto qw/void vpx_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  add_proto qw/void vpx_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
   specialize qw/vpx_highbd_sad16x16x4d sse2/;
 
-  add_proto qw/void vpx_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  add_proto qw/void vpx_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
   specialize qw/vpx_highbd_sad16x8x4d sse2/;
 
-  add_proto qw/void vpx_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  add_proto qw/void vpx_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
   specialize qw/vpx_highbd_sad8x16x4d sse2/;
 
-  add_proto qw/void vpx_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  add_proto qw/void vpx_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
   specialize qw/vpx_highbd_sad8x8x4d sse2/;
 
-  add_proto qw/void vpx_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  add_proto qw/void vpx_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
   specialize qw/vpx_highbd_sad8x4x4d sse2/;
 
-  add_proto qw/void vpx_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  add_proto qw/void vpx_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
   specialize qw/vpx_highbd_sad4x8x4d sse2/;
 
-  add_proto qw/void vpx_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  add_proto qw/void vpx_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
   specialize qw/vpx_highbd_sad4x4x4d sse2/;
 
   #
@@ -1610,7 +1610,7 @@ if (vpx_config("CONFIG_POSTPROC") eq "yes" || vpx_config("CONFIG_VP9_POSTPROC")
     add_proto qw/void vpx_mbpost_proc_down/, "unsigned char *dst, int pitch, int rows, int cols,int flimit";
     specialize qw/vpx_mbpost_proc_down sse2 neon msa vsx/;
 
-    add_proto qw/void vpx_mbpost_proc_across_ip/, "unsigned char *dst, int pitch, int rows, int cols,int flimit";
+    add_proto qw/void vpx_mbpost_proc_across_ip/, "unsigned char *src, int pitch, int rows, int cols,int flimit";
     specialize qw/vpx_mbpost_proc_across_ip sse2 neon msa vsx/;
 
     add_proto qw/void vpx_post_proc_down_and_across_mb_row/, "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size";
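Note: vpx_mbpost_proc_across_ip filters the frame in place, reading and
writing the same buffer, so its first parameter is now named src rather than
dst. Call sketch, with the buffer and its dimensions assumed:

    vpx_mbpost_proc_across_ip(frame_buf, pitch, rows, cols, flimit);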
vpx_dsp/x86/avg_pred_sse2.c
index e7db755..e4e1e0e 100644 (file)
 #include "vpx/vpx_integer.h"
 #include "vpx_dsp/x86/mem_sse2.h"
 
-void vpx_comp_avg_pred_sse2(uint8_t *comp, const uint8_t *pred, int width,
+void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width,
                             int height, const uint8_t *ref, int ref_stride) {
-  /* comp and pred must be 16 byte aligned. */
-  assert(((intptr_t)comp & 0xf) == 0);
+  /* comp_pred and pred must be 16 byte aligned. */
+  assert(((intptr_t)comp_pred & 0xf) == 0);
   assert(((intptr_t)pred & 0xf) == 0);
   if (width > 8) {
     int x, y;
@@ -27,17 +27,17 @@ void vpx_comp_avg_pred_sse2(uint8_t *comp, const uint8_t *pred, int width,
         const __m128i p = _mm_load_si128((const __m128i *)(pred + x));
         const __m128i r = _mm_loadu_si128((const __m128i *)(ref + x));
         const __m128i avg = _mm_avg_epu8(p, r);
-        _mm_store_si128((__m128i *)(comp + x), avg);
+        _mm_store_si128((__m128i *)(comp_pred + x), avg);
       }
-      comp += width;
+      comp_pred += width;
       pred += width;
       ref += ref_stride;
     }
   } else {  // width must be 4 or 8.
     int i;
-    // Process 16 elements at a time. comp and pred have width == stride and
-    // therefore live in contigious memory. 4*4, 4*8, 8*4, 8*8, and 8*16 are all
-    // divisible by 16 so just ref needs to be massaged when loading.
+    // Process 16 elements at a time. comp_pred and pred have width == stride
+    // and therefore live in contiguous memory. 4*4, 4*8, 8*4, 8*8, and 8*16 are
+    // all divisible by 16 so just ref needs to be massaged when loading.
     for (i = 0; i < width * height; i += 16) {
       const __m128i p = _mm_load_si128((const __m128i *)pred);
       __m128i r;
@@ -60,10 +60,10 @@ void vpx_comp_avg_pred_sse2(uint8_t *comp, const uint8_t *pred, int width,
         ref += 2 * ref_stride;
       }
       avg = _mm_avg_epu8(p, r);
-      _mm_store_si128((__m128i *)comp, avg);
+      _mm_store_si128((__m128i *)comp_pred, avg);
 
       pred += 16;
-      comp += 16;
+      comp_pred += 16;
     }
   }
 }
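Note: _mm_avg_epu8 computes a round-to-nearest average, (a + b + 1) >> 1 per
byte, so the scalar equivalent of the function above is (sketch; comp_pred
and pred use width as their stride, as the comments note):

    int x, y;
    for (y = 0; y < height; ++y) {
      for (x = 0; x < width; ++x) {
        comp_pred[y * width + x] =
            (pred[y * width + x] + ref[y * ref_stride + x] + 1) >> 1;
      }
    }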
vpx_dsp/x86/convolve.h
index aa60f44..8398ec3 100644 (file)
@@ -23,59 +23,59 @@ typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch,
 #define FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt)         \
   void vpx_convolve8_##name##_##opt(                                         \
       const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,                \
-      ptrdiff_t dst_stride, const InterpKernel *filter_kernel, int x0_q4,    \
+      ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4,           \
       int x_step_q4, int y0_q4, int y_step_q4, int w, int h) {               \
-    const int16_t *filter = filter_kernel[offset];                           \
+    const int16_t *filter_row = filter[offset];                              \
     (void)x0_q4;                                                             \
     (void)x_step_q4;                                                         \
     (void)y0_q4;                                                             \
     (void)y_step_q4;                                                         \
-    assert(filter[3] != 128);                                                \
+    assert(filter_row[3] != 128);                                            \
     assert(step_q4 == 16);                                                   \
-    if (filter[0] | filter[1] | filter[6] | filter[7]) {                     \
+    if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) {     \
       while (w >= 16) {                                                      \
         vpx_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \
-                                                 dst_stride, h, filter);     \
+                                                 dst_stride, h, filter_row); \
         src += 16;                                                           \
         dst += 16;                                                           \
         w -= 16;                                                             \
       }                                                                      \
       if (w == 8) {                                                          \
         vpx_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst,  \
-                                                dst_stride, h, filter);      \
+                                                dst_stride, h, filter_row);  \
       } else if (w == 4) {                                                   \
         vpx_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst,  \
-                                                dst_stride, h, filter);      \
+                                                dst_stride, h, filter_row);  \
       }                                                                      \
-    } else if (filter[2] | filter[5]) {                                      \
+    } else if (filter_row[2] | filter_row[5]) {                              \
       while (w >= 16) {                                                      \
         vpx_filter_block1d16_##dir##4_##avg##opt(src_start, src_stride, dst, \
-                                                 dst_stride, h, filter);     \
+                                                 dst_stride, h, filter_row); \
         src += 16;                                                           \
         dst += 16;                                                           \
         w -= 16;                                                             \
       }                                                                      \
       if (w == 8) {                                                          \
         vpx_filter_block1d8_##dir##4_##avg##opt(src_start, src_stride, dst,  \
-                                                dst_stride, h, filter);      \
+                                                dst_stride, h, filter_row);  \
       } else if (w == 4) {                                                   \
         vpx_filter_block1d4_##dir##4_##avg##opt(src_start, src_stride, dst,  \
-                                                dst_stride, h, filter);      \
+                                                dst_stride, h, filter_row);  \
       }                                                                      \
     } else {                                                                 \
       while (w >= 16) {                                                      \
         vpx_filter_block1d16_##dir##2_##avg##opt(src, src_stride, dst,       \
-                                                 dst_stride, h, filter);     \
+                                                 dst_stride, h, filter_row); \
         src += 16;                                                           \
         dst += 16;                                                           \
         w -= 16;                                                             \
       }                                                                      \
       if (w == 8) {                                                          \
         vpx_filter_block1d8_##dir##2_##avg##opt(src, src_stride, dst,        \
-                                                dst_stride, h, filter);      \
+                                                dst_stride, h, filter_row);  \
       } else if (w == 4) {                                                   \
         vpx_filter_block1d4_##dir##2_##avg##opt(src, src_stride, dst,        \
-                                                dst_stride, h, filter);      \
+                                                dst_stride, h, filter_row);  \
       }                                                                      \
     }                                                                        \
   }
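Note: the branch structure above selects the cheapest kernel for the filter
actually in use: all eight taps when the outer taps are nonzero, four taps
when only the inner quad is live, otherwise the two-tap bilinear path. The
same test written as a plain helper (illustrative only):

    static int effective_taps(const int16_t *filter_row) {
      if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7])
        return 8;
      if (filter_row[2] | filter_row[5]) return 4;
      return 2;  /* only filter_row[3] and filter_row[4] are nonzero */
    }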
@@ -121,86 +121,86 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr,
                                        unsigned int output_height,
                                        const int16_t *filter, int bd);
 
-#define HIGH_FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt)     \
-  void vpx_highbd_convolve8_##name##_##opt(                                   \
-      const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,               \
-      ptrdiff_t dst_stride, const InterpKernel *filter_kernel, int x0_q4,     \
-      int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) {        \
-    const int16_t *filter = filter_kernel[offset];                            \
-    if (step_q4 == 16 && filter[3] != 128) {                                  \
-      if (filter[0] | filter[1] | filter[6] | filter[7]) {                    \
-        while (w >= 16) {                                                     \
-          vpx_highbd_filter_block1d16_##dir##8_##avg##opt(                    \
-              src_start, src_stride, dst, dst_stride, h, filter, bd);         \
-          src += 16;                                                          \
-          dst += 16;                                                          \
-          w -= 16;                                                            \
-        }                                                                     \
-        while (w >= 8) {                                                      \
-          vpx_highbd_filter_block1d8_##dir##8_##avg##opt(                     \
-              src_start, src_stride, dst, dst_stride, h, filter, bd);         \
-          src += 8;                                                           \
-          dst += 8;                                                           \
-          w -= 8;                                                             \
-        }                                                                     \
-        while (w >= 4) {                                                      \
-          vpx_highbd_filter_block1d4_##dir##8_##avg##opt(                     \
-              src_start, src_stride, dst, dst_stride, h, filter, bd);         \
-          src += 4;                                                           \
-          dst += 4;                                                           \
-          w -= 4;                                                             \
-        }                                                                     \
-      } else if (filter[2] | filter[5]) {                                     \
-        while (w >= 16) {                                                     \
-          vpx_highbd_filter_block1d16_##dir##4_##avg##opt(                    \
-              src_start, src_stride, dst, dst_stride, h, filter, bd);         \
-          src += 16;                                                          \
-          dst += 16;                                                          \
-          w -= 16;                                                            \
-        }                                                                     \
-        while (w >= 8) {                                                      \
-          vpx_highbd_filter_block1d8_##dir##4_##avg##opt(                     \
-              src_start, src_stride, dst, dst_stride, h, filter, bd);         \
-          src += 8;                                                           \
-          dst += 8;                                                           \
-          w -= 8;                                                             \
-        }                                                                     \
-        while (w >= 4) {                                                      \
-          vpx_highbd_filter_block1d4_##dir##4_##avg##opt(                     \
-              src_start, src_stride, dst, dst_stride, h, filter, bd);         \
-          src += 4;                                                           \
-          dst += 4;                                                           \
-          w -= 4;                                                             \
-        }                                                                     \
-      } else {                                                                \
-        while (w >= 16) {                                                     \
-          vpx_highbd_filter_block1d16_##dir##2_##avg##opt(                    \
-              src, src_stride, dst, dst_stride, h, filter, bd);               \
-          src += 16;                                                          \
-          dst += 16;                                                          \
-          w -= 16;                                                            \
-        }                                                                     \
-        while (w >= 8) {                                                      \
-          vpx_highbd_filter_block1d8_##dir##2_##avg##opt(                     \
-              src, src_stride, dst, dst_stride, h, filter, bd);               \
-          src += 8;                                                           \
-          dst += 8;                                                           \
-          w -= 8;                                                             \
-        }                                                                     \
-        while (w >= 4) {                                                      \
-          vpx_highbd_filter_block1d4_##dir##2_##avg##opt(                     \
-              src, src_stride, dst, dst_stride, h, filter, bd);               \
-          src += 4;                                                           \
-          dst += 4;                                                           \
-          w -= 4;                                                             \
-        }                                                                     \
-      }                                                                       \
-    }                                                                         \
-    if (w) {                                                                  \
-      vpx_highbd_convolve8_##name##_c(src, src_stride, dst, dst_stride,       \
-                                      filter_kernel, x0_q4, x_step_q4, y0_q4, \
-                                      y_step_q4, w, h, bd);                   \
-    }                                                                         \
+#define HIGH_FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt)  \
+  void vpx_highbd_convolve8_##name##_##opt(                                \
+      const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,            \
+      ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4,         \
+      int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) {     \
+    const int16_t *filter_row = filter[offset];                            \
+    if (step_q4 == 16 && filter_row[3] != 128) {                           \
+      if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) { \
+        while (w >= 16) {                                                  \
+          vpx_highbd_filter_block1d16_##dir##8_##avg##opt(                 \
+              src_start, src_stride, dst, dst_stride, h, filter_row, bd);  \
+          src += 16;                                                       \
+          dst += 16;                                                       \
+          w -= 16;                                                         \
+        }                                                                  \
+        while (w >= 8) {                                                   \
+          vpx_highbd_filter_block1d8_##dir##8_##avg##opt(                  \
+              src_start, src_stride, dst, dst_stride, h, filter_row, bd);  \
+          src += 8;                                                        \
+          dst += 8;                                                        \
+          w -= 8;                                                          \
+        }                                                                  \
+        while (w >= 4) {                                                   \
+          vpx_highbd_filter_block1d4_##dir##8_##avg##opt(                  \
+              src_start, src_stride, dst, dst_stride, h, filter_row, bd);  \
+          src += 4;                                                        \
+          dst += 4;                                                        \
+          w -= 4;                                                          \
+        }                                                                  \
+      } else if (filter_row[2] | filter_row[5]) {                          \
+        while (w >= 16) {                                                  \
+          vpx_highbd_filter_block1d16_##dir##4_##avg##opt(                 \
+              src_start, src_stride, dst, dst_stride, h, filter_row, bd);  \
+          src += 16;                                                       \
+          dst += 16;                                                       \
+          w -= 16;                                                         \
+        }                                                                  \
+        while (w >= 8) {                                                   \
+          vpx_highbd_filter_block1d8_##dir##4_##avg##opt(                  \
+              src_start, src_stride, dst, dst_stride, h, filter_row, bd);  \
+          src += 8;                                                        \
+          dst += 8;                                                        \
+          w -= 8;                                                          \
+        }                                                                  \
+        while (w >= 4) {                                                   \
+          vpx_highbd_filter_block1d4_##dir##4_##avg##opt(                  \
+              src_start, src_stride, dst, dst_stride, h, filter_row, bd);  \
+          src += 4;                                                        \
+          dst += 4;                                                        \
+          w -= 4;                                                          \
+        }                                                                  \
+      } else {                                                             \
+        while (w >= 16) {                                                  \
+          vpx_highbd_filter_block1d16_##dir##2_##avg##opt(                 \
+              src, src_stride, dst, dst_stride, h, filter_row, bd);        \
+          src += 16;                                                       \
+          dst += 16;                                                       \
+          w -= 16;                                                         \
+        }                                                                  \
+        while (w >= 8) {                                                   \
+          vpx_highbd_filter_block1d8_##dir##2_##avg##opt(                  \
+              src, src_stride, dst, dst_stride, h, filter_row, bd);        \
+          src += 8;                                                        \
+          dst += 8;                                                        \
+          w -= 8;                                                          \
+        }                                                                  \
+        while (w >= 4) {                                                   \
+          vpx_highbd_filter_block1d4_##dir##2_##avg##opt(                  \
+              src, src_stride, dst, dst_stride, h, filter_row, bd);        \
+          src += 4;                                                        \
+          dst += 4;                                                        \
+          w -= 4;                                                          \
+        }                                                                  \
+      }                                                                    \
+    }                                                                      \
+    if (w) {                                                               \
+      vpx_highbd_convolve8_##name##_c(src, src_stride, dst, dst_stride,    \
+                                      filter, x0_q4, x_step_q4, y0_q4,     \
+                                      y_step_q4, w, h, bd);                \
+    }                                                                      \
   }
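Note: unlike FUN_CONV_1D, the high-bitdepth macro processes width in 16/8/4
chunks and leaves any remainder, or an unsupported step_q4/filter, to the C
fallback at the bottom. It is instantiated once per direction and
optimization, roughly like this (a sketch; the real call sites live in the
x86 highbd convolve sources):

    HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2)
    HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , sse2)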
 
 #define HIGH_FUN_CONV_2D(avg, opt)                                             \
vpx_dsp/x86/highbd_convolve_avx2.c
index ff5ef5f..0ffa7f2 100644 (file)
@@ -20,7 +20,7 @@ void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride,
                                    uint16_t *dst, ptrdiff_t dst_stride,
                                    const InterpKernel *filter, int x0_q4,
                                    int x_step_q4, int y0_q4, int y_step_q4,
-                                   int width, int h, int bd) {
+                                   int w, int h, int bd) {
   (void)filter;
   (void)x0_q4;
   (void)x_step_q4;
@@ -28,8 +28,8 @@ void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride,
   (void)y_step_q4;
   (void)bd;
 
-  assert(width % 4 == 0);
-  if (width > 32) {  // width = 64
+  assert(w % 4 == 0);
+  if (w > 32) {  // w = 64
     do {
       const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
       const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
@@ -43,7 +43,7 @@ void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride,
       dst += dst_stride;
       h--;
     } while (h > 0);
-  } else if (width > 16) {  // width = 32
+  } else if (w > 16) {  // w = 32
     do {
       const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
       const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
@@ -53,7 +53,7 @@ void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride,
       dst += dst_stride;
       h--;
     } while (h > 0);
-  } else if (width > 8) {  // width = 16
+  } else if (w > 8) {  // w = 16
     __m256i p0, p1;
     do {
       p0 = _mm256_loadu_si256((const __m256i *)src);
@@ -67,7 +67,7 @@ void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride,
       dst += dst_stride;
       h -= 2;
     } while (h > 0);
-  } else if (width > 4) {  // width = 8
+  } else if (w > 4) {  // w = 8
     __m128i p0, p1;
     do {
       p0 = _mm_loadu_si128((const __m128i *)src);
@@ -81,7 +81,7 @@ void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride,
       dst += dst_stride;
       h -= 2;
     } while (h > 0);
-  } else {  // width = 4
+  } else {  // w = 4
     __m128i p0, p1;
     do {
       p0 = _mm_loadl_epi64((const __m128i *)src);
@@ -102,7 +102,7 @@ void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride,
                                   uint16_t *dst, ptrdiff_t dst_stride,
                                   const InterpKernel *filter, int x0_q4,
                                   int x_step_q4, int y0_q4, int y_step_q4,
-                                  int width, int h, int bd) {
+                                  int w, int h, int bd) {
   (void)filter;
   (void)x0_q4;
   (void)x_step_q4;
@@ -110,8 +110,8 @@ void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride,
   (void)y_step_q4;
   (void)bd;
 
-  assert(width % 4 == 0);
-  if (width > 32) {  // width = 64
+  assert(w % 4 == 0);
+  if (w > 32) {  // w = 64
     __m256i p0, p1, p2, p3, u0, u1, u2, u3;
     do {
       p0 = _mm256_loadu_si256((const __m256i *)src);
@@ -130,7 +130,7 @@ void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride,
       dst += dst_stride;
       h--;
     } while (h > 0);
-  } else if (width > 16) {  // width = 32
+  } else if (w > 16) {  // w = 32
     __m256i p0, p1, u0, u1;
     do {
       p0 = _mm256_loadu_si256((const __m256i *)src);
@@ -143,7 +143,7 @@ void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride,
       dst += dst_stride;
       h--;
     } while (h > 0);
-  } else if (width > 8) {  // width = 16
+  } else if (w > 8) {  // w = 16
     __m256i p0, p1, u0, u1;
     do {
       p0 = _mm256_loadu_si256((const __m256i *)src);
@@ -158,7 +158,7 @@ void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride,
       dst += dst_stride << 1;
       h -= 2;
     } while (h > 0);
-  } else if (width > 4) {  // width = 8
+  } else if (w > 4) {  // w = 8
     __m128i p0, p1, u0, u1;
     do {
       p0 = _mm_loadu_si128((const __m128i *)src);
@@ -172,7 +172,7 @@ void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride,
       dst += dst_stride << 1;
       h -= 2;
     } while (h > 0);
-  } else {  // width = 4
+  } else {  // w = 4
     __m128i p0, p1, u0, u1;
     do {
       p0 = _mm_loadl_epi64((const __m128i *)src);
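Note: vpx_highbd_convolve_avg_avx2 follows the same descending width dispatch
as the copy variant, but combines each load pair with a rounding average
before the store. The core of one 16-pixel step looks roughly like this
(sketch; the real code interleaves two rows per iteration):

    const __m256i p = _mm256_loadu_si256((const __m256i *)src);
    const __m256i u = _mm256_loadu_si256((const __m256i *)dst);
    _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p, u));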
vpx_dsp/x86/highbd_intrapred_sse2.asm
index c61b621..caf506a 100644 (file)
@@ -256,7 +256,7 @@ cglobal highbd_v_predictor_32x32, 3, 4, 4, dst, stride, above
   REP_RET
 
 INIT_XMM sse2
-cglobal highbd_tm_predictor_4x4, 5, 5, 6, dst, stride, above, left, bps
+cglobal highbd_tm_predictor_4x4, 5, 5, 6, dst, stride, above, left, bd
   movd                  m1, [aboveq-2]
   movq                  m0, [aboveq]
   pshuflw               m1, m1, 0x0
@@ -264,7 +264,7 @@ cglobal highbd_tm_predictor_4x4, 5, 5, 6, dst, stride, above, left, bps
   movlhps               m1, m1         ; tl tl tl tl tl tl tl tl
   ; Get the values to compute the maximum value at this bit depth
   pcmpeqw               m3, m3
-  movd                  m4, bpsd
+  movd                  m4, bdd
   psubw                 m0, m1         ; t1-tl t2-tl t3-tl t4-tl
   psllw                 m3, m4
   pcmpeqw               m2, m2
@@ -295,7 +295,7 @@ cglobal highbd_tm_predictor_4x4, 5, 5, 6, dst, stride, above, left, bps
   RET
 
 INIT_XMM sse2
-cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one
+cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bd, one
   movd                  m1, [aboveq-2]
   mova                  m0, [aboveq]
   pshuflw               m1, m1, 0x0
@@ -304,7 +304,7 @@ cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one
   pxor                  m3, m3
   pxor                  m4, m4
   pinsrw                m3, oned, 0
-  pinsrw                m4, bpsd, 0
+  pinsrw                m4, bdd, 0
   pshuflw               m3, m3, 0x0
   DEFINE_ARGS dst, stride, line, left
   punpcklqdq            m3, m3
@@ -339,14 +339,14 @@ cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one
   REP_RET
 
 INIT_XMM sse2
-cglobal highbd_tm_predictor_16x16, 5, 5, 8, dst, stride, above, left, bps
+cglobal highbd_tm_predictor_16x16, 5, 5, 8, dst, stride, above, left, bd
   movd                  m2, [aboveq-2]
   mova                  m0, [aboveq]
   mova                  m1, [aboveq+16]
   pshuflw               m2, m2, 0x0
   ; Get the values to compute the maximum value at this bit depth
   pcmpeqw               m3, m3
-  movd                  m4, bpsd
+  movd                  m4, bdd
   punpcklqdq            m2, m2
   psllw                 m3, m4
   pcmpeqw               m5, m5
@@ -386,7 +386,7 @@ cglobal highbd_tm_predictor_16x16, 5, 5, 8, dst, stride, above, left, bps
   REP_RET
 
 INIT_XMM sse2
-cglobal highbd_tm_predictor_32x32, 5, 5, 8, dst, stride, above, left, bps
+cglobal highbd_tm_predictor_32x32, 5, 5, 8, dst, stride, above, left, bd
   movd                  m0, [aboveq-2]
   mova                  m1, [aboveq]
   mova                  m2, [aboveq+16]
@@ -395,7 +395,7 @@ cglobal highbd_tm_predictor_32x32, 5, 5, 8, dst, stride, above, left, bps
   pshuflw               m0, m0, 0x0
   ; Get the values to compute the maximum value at this bit depth
   pcmpeqw               m5, m5
-  movd                  m6, bpsd
+  movd                  m6, bdd
   psllw                 m5, m6
   pcmpeqw               m7, m7
   pxor                  m6, m6         ; min possible value
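
The bps -> bd rename makes it clearer what the register holds: the bit depth used to build the clamp bounds. A scalar sketch of the TM (TrueMotion) prediction these kernels implement, assuming the clamp range [0, (1 << bd) - 1]; highbd_tm_predictor_ref is a hypothetical name:

  static void highbd_tm_predictor_ref(uint16_t *dst, ptrdiff_t stride,
                                      int size, const uint16_t *above,
                                      const uint16_t *left, int bd) {
    // The asm builds this bound the same way: all-ones shifted left by bd,
    // then inverted.
    const int max = (1 << bd) - 1;
    const int top_left = above[-1];
    int r, c;
    for (r = 0; r < size; ++r) {
      for (c = 0; c < size; ++c) {
        const int v = left[r] + above[c] - top_left;
        dst[c] = (uint16_t)(v < 0 ? 0 : (v > max ? max : v));
      }
      dst += stride;
    }
  }
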
index ec22db9..f7fb40d 100644 (file)
@@ -47,13 +47,13 @@ static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) {
 
 // TODO(debargha, peter): Break up large functions into smaller ones
 // in this file.
-void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p,
-                                       const uint8_t *_blimit,
-                                       const uint8_t *_limit,
-                                       const uint8_t *_thresh, int bd) {
+void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int pitch,
+                                       const uint8_t *blimit,
+                                       const uint8_t *limit,
+                                       const uint8_t *thresh, int bd) {
   const __m128i zero = _mm_set1_epi16(0);
   const __m128i one = _mm_set1_epi16(1);
-  __m128i blimit, limit, thresh;
+  __m128i blimit_v, limit_v, thresh_v;
   __m128i q7, p7, q6, p6, q5, p5, q4, p4, q3, p3, q2, p2, q1, p1, q0, p0;
   __m128i mask, hev, flat, flat2, abs_p1p0, abs_q1q0;
   __m128i ps1, qs1, ps0, qs0;
@@ -70,35 +70,35 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p,
   __m128i eight, four;
 
   if (bd == 8) {
-    blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
-    limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
-    thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
+    blimit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero);
+    limit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero);
+    thresh_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero);
   } else if (bd == 10) {
-    blimit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
-    limit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
-    thresh = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
+    blimit_v = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 2);
+    limit_v = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 2);
+    thresh_v = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 2);
   } else {  // bd == 12
-    blimit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
-    limit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
-    thresh = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
+    blimit_v = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 4);
+    limit_v = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 4);
+    thresh_v = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 4);
   }
 
-  q4 = _mm_load_si128((__m128i *)(s + 4 * p));
-  p4 = _mm_load_si128((__m128i *)(s - 5 * p));
-  q3 = _mm_load_si128((__m128i *)(s + 3 * p));
-  p3 = _mm_load_si128((__m128i *)(s - 4 * p));
-  q2 = _mm_load_si128((__m128i *)(s + 2 * p));
-  p2 = _mm_load_si128((__m128i *)(s - 3 * p));
-  q1 = _mm_load_si128((__m128i *)(s + 1 * p));
-  p1 = _mm_load_si128((__m128i *)(s - 2 * p));
-  q0 = _mm_load_si128((__m128i *)(s + 0 * p));
-  p0 = _mm_load_si128((__m128i *)(s - 1 * p));
+  q4 = _mm_load_si128((__m128i *)(s + 4 * pitch));
+  p4 = _mm_load_si128((__m128i *)(s - 5 * pitch));
+  q3 = _mm_load_si128((__m128i *)(s + 3 * pitch));
+  p3 = _mm_load_si128((__m128i *)(s - 4 * pitch));
+  q2 = _mm_load_si128((__m128i *)(s + 2 * pitch));
+  p2 = _mm_load_si128((__m128i *)(s - 3 * pitch));
+  q1 = _mm_load_si128((__m128i *)(s + 1 * pitch));
+  p1 = _mm_load_si128((__m128i *)(s - 2 * pitch));
+  q0 = _mm_load_si128((__m128i *)(s + 0 * pitch));
+  p0 = _mm_load_si128((__m128i *)(s - 1 * pitch));
 
   //  highbd_filter_mask
   abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
@@ -111,14 +111,14 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p,
 
   //  highbd_hev_mask (in C code this is actually called from highbd_filter4)
   flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
-  hev = _mm_subs_epu16(flat, thresh);
+  hev = _mm_subs_epu16(flat, thresh_v);
   hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
 
   abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);  // abs(p0 - q0) * 2
   abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);         // abs(p1 - q1) / 2
-  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
+  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit_v);
   mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
-  mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
+  mask = _mm_and_si128(mask, _mm_adds_epu16(limit_v, one));
   work = _mm_max_epi16(
       _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)),
       _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1)));
@@ -132,7 +132,7 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p,
       _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3)));
   mask = _mm_max_epi16(work, mask);
 
-  mask = _mm_subs_epu16(mask, limit);
+  mask = _mm_subs_epu16(mask, limit_v);
   mask = _mm_cmpeq_epi16(mask, zero);  // return ~mask
 
   // lp filter
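
For orientation, the vector sequence above is the scalar filter mask done with saturating arithmetic: an unsigned "a > b" becomes _mm_subs_epu16(a, b) != 0, and the chain of |= becomes lane-wise maxima. The scalar form (cf. filter_mask in vpx_dsp/loopfilter.c; abs is from <stdlib.h>) is roughly:

  static int8_t filter_mask(uint8_t limit, uint8_t blimit, uint8_t p3,
                            uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0,
                            uint8_t q1, uint8_t q2, uint8_t q3) {
    int8_t mask = 0;
    mask |= (abs(p3 - p2) > limit) * -1;
    mask |= (abs(p2 - p1) > limit) * -1;
    mask |= (abs(p1 - p0) > limit) * -1;
    mask |= (abs(q1 - q0) > limit) * -1;
    mask |= (abs(q2 - q1) > limit) * -1;
    mask |= (abs(q3 - q2) > limit) * -1;
    mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
    return ~mask;  // the "return ~mask" noted in the SSE2 comment above
  }
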
@@ -207,12 +207,12 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p,
   // (because, in both vars, each block of 16 is either all 1s or all 0s)
   flat = _mm_and_si128(flat, mask);
 
-  p5 = _mm_load_si128((__m128i *)(s - 6 * p));
-  q5 = _mm_load_si128((__m128i *)(s + 5 * p));
-  p6 = _mm_load_si128((__m128i *)(s - 7 * p));
-  q6 = _mm_load_si128((__m128i *)(s + 6 * p));
-  p7 = _mm_load_si128((__m128i *)(s - 8 * p));
-  q7 = _mm_load_si128((__m128i *)(s + 7 * p));
+  p5 = _mm_load_si128((__m128i *)(s - 6 * pitch));
+  q5 = _mm_load_si128((__m128i *)(s + 5 * pitch));
+  p6 = _mm_load_si128((__m128i *)(s - 7 * pitch));
+  q6 = _mm_load_si128((__m128i *)(s + 6 * pitch));
+  p7 = _mm_load_si128((__m128i *)(s - 8 * pitch));
+  q7 = _mm_load_si128((__m128i *)(s + 7 * pitch));
 
   // highbd_flat_mask5 (the arguments passed in are p0, q0, p4-p7 and q4-q7,
   // but they are referred to as p0-p4 & q0-q4 inside the function)
@@ -389,8 +389,8 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p,
   flat2_q6 = _mm_and_si128(flat2, flat2_q6);
   //  get values for when (flat2 && flat && mask)
   q6 = _mm_or_si128(q6, flat2_q6);  // full list of q6 values
-  _mm_store_si128((__m128i *)(s - 7 * p), p6);
-  _mm_store_si128((__m128i *)(s + 6 * p), q6);
+  _mm_store_si128((__m128i *)(s - 7 * pitch), p6);
+  _mm_store_si128((__m128i *)(s + 6 * pitch), q6);
 
   p5 = _mm_andnot_si128(flat2, p5);
   //  p5 remains unchanged if !(flat2 && flat && mask)
@@ -404,8 +404,8 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p,
   //  get values for when (flat2 && flat && mask)
   q5 = _mm_or_si128(q5, flat2_q5);
   //  full list of q5 values
-  _mm_store_si128((__m128i *)(s - 6 * p), p5);
-  _mm_store_si128((__m128i *)(s + 5 * p), q5);
+  _mm_store_si128((__m128i *)(s - 6 * pitch), p5);
+  _mm_store_si128((__m128i *)(s + 5 * pitch), q5);
 
   p4 = _mm_andnot_si128(flat2, p4);
   //  p4 remains unchanged if !(flat2 && flat && mask)
@@ -417,8 +417,8 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p,
   flat2_q4 = _mm_and_si128(flat2, flat2_q4);
   //  get values for when (flat2 && flat && mask)
   q4 = _mm_or_si128(q4, flat2_q4);  // full list of q4 values
-  _mm_store_si128((__m128i *)(s - 5 * p), p4);
-  _mm_store_si128((__m128i *)(s + 4 * p), q4);
+  _mm_store_si128((__m128i *)(s - 5 * pitch), p4);
+  _mm_store_si128((__m128i *)(s + 4 * pitch), q4);
 
   p3 = _mm_andnot_si128(flat2, p3);
   //  p3 takes value from highbd_filter8 if !(flat2 && flat && mask)
@@ -430,8 +430,8 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p,
   flat2_q3 = _mm_and_si128(flat2, flat2_q3);
   //  get values for when (flat2 && flat && mask)
   q3 = _mm_or_si128(q3, flat2_q3);  // full list of q3 values
-  _mm_store_si128((__m128i *)(s - 4 * p), p3);
-  _mm_store_si128((__m128i *)(s + 3 * p), q3);
+  _mm_store_si128((__m128i *)(s - 4 * pitch), p3);
+  _mm_store_si128((__m128i *)(s + 3 * pitch), q3);
 
   p2 = _mm_andnot_si128(flat2, p2);
   //  p2 takes value from highbd_filter8 if !(flat2 && flat && mask)
@@ -444,8 +444,8 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p,
   flat2_q2 = _mm_and_si128(flat2, flat2_q2);
   //  get values for when (flat2 && flat && mask)
   q2 = _mm_or_si128(q2, flat2_q2);  // full list of q2 values
-  _mm_store_si128((__m128i *)(s - 3 * p), p2);
-  _mm_store_si128((__m128i *)(s + 2 * p), q2);
+  _mm_store_si128((__m128i *)(s - 3 * pitch), p2);
+  _mm_store_si128((__m128i *)(s + 2 * pitch), q2);
 
   p1 = _mm_andnot_si128(flat2, p1);
   //  p1 takes value from highbd_filter8 if !(flat2 && flat && mask)
@@ -457,8 +457,8 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p,
   flat2_q1 = _mm_and_si128(flat2, flat2_q1);
   //  get values for when (flat2 && flat && mask)
   q1 = _mm_or_si128(q1, flat2_q1);  // full list of q1 values
-  _mm_store_si128((__m128i *)(s - 2 * p), p1);
-  _mm_store_si128((__m128i *)(s + 1 * p), q1);
+  _mm_store_si128((__m128i *)(s - 2 * pitch), p1);
+  _mm_store_si128((__m128i *)(s + 1 * pitch), q1);
 
   p0 = _mm_andnot_si128(flat2, p0);
   //  p0 takes value from highbd_filter8 if !(flat2 && flat && mask)
@@ -470,22 +470,22 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p,
   flat2_q0 = _mm_and_si128(flat2, flat2_q0);
   //  get values for when (flat2 && flat && mask)
   q0 = _mm_or_si128(q0, flat2_q0);  // full list of q0 values
-  _mm_store_si128((__m128i *)(s - 1 * p), p0);
-  _mm_store_si128((__m128i *)(s - 0 * p), q0);
+  _mm_store_si128((__m128i *)(s - 1 * pitch), p0);
+  _mm_store_si128((__m128i *)(s - 0 * pitch), q0);
 }
 
-void vpx_highbd_lpf_horizontal_16_dual_sse2(uint16_t *s, int p,
-                                            const uint8_t *_blimit,
-                                            const uint8_t *_limit,
-                                            const uint8_t *_thresh, int bd) {
-  vpx_highbd_lpf_horizontal_16_sse2(s, p, _blimit, _limit, _thresh, bd);
-  vpx_highbd_lpf_horizontal_16_sse2(s + 8, p, _blimit, _limit, _thresh, bd);
+void vpx_highbd_lpf_horizontal_16_dual_sse2(uint16_t *s, int pitch,
+                                            const uint8_t *blimit,
+                                            const uint8_t *limit,
+                                            const uint8_t *thresh, int bd) {
+  vpx_highbd_lpf_horizontal_16_sse2(s, pitch, blimit, limit, thresh, bd);
+  vpx_highbd_lpf_horizontal_16_sse2(s + 8, pitch, blimit, limit, thresh, bd);
 }
 
-void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
-                                      const uint8_t *_blimit,
-                                      const uint8_t *_limit,
-                                      const uint8_t *_thresh, int bd) {
+void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int pitch,
+                                      const uint8_t *blimit,
+                                      const uint8_t *limit,
+                                      const uint8_t *thresh, int bd) {
   DECLARE_ALIGNED(16, uint16_t, flat_op2[16]);
   DECLARE_ALIGNED(16, uint16_t, flat_op1[16]);
   DECLARE_ALIGNED(16, uint16_t, flat_op0[16]);
@@ -493,16 +493,16 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
   DECLARE_ALIGNED(16, uint16_t, flat_oq1[16]);
   DECLARE_ALIGNED(16, uint16_t, flat_oq0[16]);
   const __m128i zero = _mm_set1_epi16(0);
-  __m128i blimit, limit, thresh;
+  __m128i blimit_v, limit_v, thresh_v;
   __m128i mask, hev, flat;
-  __m128i p3 = _mm_load_si128((__m128i *)(s - 4 * p));
-  __m128i q3 = _mm_load_si128((__m128i *)(s + 3 * p));
-  __m128i p2 = _mm_load_si128((__m128i *)(s - 3 * p));
-  __m128i q2 = _mm_load_si128((__m128i *)(s + 2 * p));
-  __m128i p1 = _mm_load_si128((__m128i *)(s - 2 * p));
-  __m128i q1 = _mm_load_si128((__m128i *)(s + 1 * p));
-  __m128i p0 = _mm_load_si128((__m128i *)(s - 1 * p));
-  __m128i q0 = _mm_load_si128((__m128i *)(s + 0 * p));
+  __m128i p3 = _mm_load_si128((__m128i *)(s - 4 * pitch));
+  __m128i q3 = _mm_load_si128((__m128i *)(s + 3 * pitch));
+  __m128i p2 = _mm_load_si128((__m128i *)(s - 3 * pitch));
+  __m128i q2 = _mm_load_si128((__m128i *)(s + 2 * pitch));
+  __m128i p1 = _mm_load_si128((__m128i *)(s - 2 * pitch));
+  __m128i q1 = _mm_load_si128((__m128i *)(s + 1 * pitch));
+  __m128i p0 = _mm_load_si128((__m128i *)(s - 1 * pitch));
+  __m128i q0 = _mm_load_si128((__m128i *)(s + 0 * pitch));
   const __m128i one = _mm_set1_epi16(1);
   const __m128i ffff = _mm_cmpeq_epi16(one, one);
   __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
@@ -519,25 +519,25 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
   __m128i filter1, filter2;
 
   if (bd == 8) {
-    blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
-    limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
-    thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
+    blimit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero);
+    limit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero);
+    thresh_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero);
     t80 = _mm_set1_epi16(0x80);
   } else if (bd == 10) {
-    blimit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
-    limit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
-    thresh = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
+    blimit_v = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 2);
+    limit_v = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 2);
+    thresh_v = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 2);
     t80 = _mm_set1_epi16(0x200);
   } else {  // bd == 12
-    blimit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
-    limit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
-    thresh = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
+    blimit_v = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 4);
+    limit_v = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 4);
+    thresh_v = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 4);
     t80 = _mm_set1_epi16(0x800);
   }
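
The bd == 8/10/12 branches above differ only in how far the 8-bit thresholds and the 0x80 sign bias are promoted into the bd-bit domain. A sketch of the scaling (scale_to_bd is a hypothetical helper, not library code):

  static INLINE int16_t scale_to_bd(uint8_t v, int bd) {
    return (int16_t)(v << (bd - 8));  // bd 8/10/12 -> v, 4*v, 16*v
  }
  // e.g. t80 = scale_to_bd(0x80, bd) gives 0x80, 0x200, 0x800 as above.
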
 
@@ -553,16 +553,16 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
   abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0));
   abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1));
   flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
-  hev = _mm_subs_epu16(flat, thresh);
+  hev = _mm_subs_epu16(flat, thresh_v);
   hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
 
   abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
   abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
-  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
+  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit_v);
   mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
   // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
   // So taking maximums continues to work:
-  mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
+  mask = _mm_and_si128(mask, _mm_adds_epu16(limit_v, one));
   mask = _mm_max_epi16(abs_p1p0, mask);
   // mask |= (abs(p1 - p0) > limit) * -1;
   mask = _mm_max_epi16(abs_q1q0, mask);
@@ -576,7 +576,7 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
       _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)),
       _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3)));
   mask = _mm_max_epi16(work, mask);
-  mask = _mm_subs_epu16(mask, limit);
+  mask = _mm_subs_epu16(mask, limit_v);
   mask = _mm_cmpeq_epi16(mask, zero);
 
   // flat_mask4
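
flat_mask4 marks lanes where p3..p1 and q1..q3 stay close enough to p0/q0 to take the wide filter8 path. A scalar sketch, assuming the usual vpx flatness threshold of 1 << (bd - 8); the name is hypothetical, not the exact library source:

  static int8_t highbd_flat_mask4_sketch(uint16_t p3, uint16_t p2,
                                         uint16_t p1, uint16_t p0,
                                         uint16_t q0, uint16_t q1,
                                         uint16_t q2, uint16_t q3, int bd) {
    const int16_t thresh = 1 << (bd - 8);
    int8_t mask = 0;
    mask |= (abs(p1 - p0) > thresh) * -1;
    mask |= (abs(q1 - q0) > thresh) * -1;
    mask |= (abs(p2 - p0) > thresh) * -1;
    mask |= (abs(q2 - q0) > thresh) * -1;
    mask |= (abs(p3 - p0) > thresh) * -1;
    mask |= (abs(q3 - q0) > thresh) * -1;
    return ~mask;
  }
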
@@ -674,7 +674,7 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
   q1 = _mm_and_si128(flat, q1);
   q1 = _mm_or_si128(work_a, q1);
 
-  work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
+  work_a = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
   q2 = _mm_load_si128((__m128i *)flat_oq2);
   work_a = _mm_andnot_si128(flat, work_a);
   q2 = _mm_and_si128(flat, q2);
@@ -694,43 +694,43 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
   p1 = _mm_and_si128(flat, p1);
   p1 = _mm_or_si128(work_a, p1);
 
-  work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
+  work_a = _mm_loadu_si128((__m128i *)(s - 3 * pitch));
   p2 = _mm_load_si128((__m128i *)flat_op2);
   work_a = _mm_andnot_si128(flat, work_a);
   p2 = _mm_and_si128(flat, p2);
   p2 = _mm_or_si128(work_a, p2);
 
-  _mm_store_si128((__m128i *)(s - 3 * p), p2);
-  _mm_store_si128((__m128i *)(s - 2 * p), p1);
-  _mm_store_si128((__m128i *)(s - 1 * p), p0);
-  _mm_store_si128((__m128i *)(s + 0 * p), q0);
-  _mm_store_si128((__m128i *)(s + 1 * p), q1);
-  _mm_store_si128((__m128i *)(s + 2 * p), q2);
+  _mm_store_si128((__m128i *)(s - 3 * pitch), p2);
+  _mm_store_si128((__m128i *)(s - 2 * pitch), p1);
+  _mm_store_si128((__m128i *)(s - 1 * pitch), p0);
+  _mm_store_si128((__m128i *)(s + 0 * pitch), q0);
+  _mm_store_si128((__m128i *)(s + 1 * pitch), q1);
+  _mm_store_si128((__m128i *)(s + 2 * pitch), q2);
 }
 
 void vpx_highbd_lpf_horizontal_8_dual_sse2(
-    uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
-    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
-    const uint8_t *_thresh1, int bd) {
-  vpx_highbd_lpf_horizontal_8_sse2(s, p, _blimit0, _limit0, _thresh0, bd);
-  vpx_highbd_lpf_horizontal_8_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd);
+    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1, int bd) {
+  vpx_highbd_lpf_horizontal_8_sse2(s, pitch, blimit0, limit0, thresh0, bd);
+  vpx_highbd_lpf_horizontal_8_sse2(s + 8, pitch, blimit1, limit1, thresh1, bd);
 }
 
-void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
-                                      const uint8_t *_blimit,
-                                      const uint8_t *_limit,
-                                      const uint8_t *_thresh, int bd) {
+void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int pitch,
+                                      const uint8_t *blimit,
+                                      const uint8_t *limit,
+                                      const uint8_t *thresh, int bd) {
   const __m128i zero = _mm_set1_epi16(0);
-  __m128i blimit, limit, thresh;
+  __m128i blimit_v, limit_v, thresh_v;
   __m128i mask, hev, flat;
-  __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
-  __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
-  __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
-  __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
-  __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
-  __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
-  __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
-  __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
+  __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch));
+  __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * pitch));
+  __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * pitch));
+  __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * pitch));
+  __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * pitch));
+  __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
+  __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
+  __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
   const __m128i abs_p1p0 =
       _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
   const __m128i abs_q1q0 =
@@ -760,33 +760,33 @@ void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
   __m128i filter1, filter2;
 
   if (bd == 8) {
-    blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
-    limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
-    thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
+    blimit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero);
+    limit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero);
+    thresh_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero);
     t80 = _mm_set1_epi16(0x80);
     tff80 = _mm_set1_epi16(0xff80);
     tffe0 = _mm_set1_epi16(0xffe0);
     t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 8);
     t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 8);
   } else if (bd == 10) {
-    blimit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
-    limit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
-    thresh = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
+    blimit_v = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 2);
+    limit_v = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 2);
+    thresh_v = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 2);
     t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 2);
     tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 2);
     tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 2);
     t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 6);
     t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 6);
   } else {  // bd == 12
-    blimit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
-    limit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
-    thresh = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
+    blimit_v = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 4);
+    limit_v = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 4);
+    thresh_v = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 4);
     t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 4);
     tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 4);
     tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 4);
@@ -794,23 +794,23 @@ void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
     t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 4);
   }
 
-  ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
-  ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
-  qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
-  qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);
+  ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * pitch)), t80);
+  ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * pitch)), t80);
+  qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * pitch)), t80);
+  qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * pitch)), t80);
 
   // filter_mask and hev_mask
   flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
-  hev = _mm_subs_epu16(flat, thresh);
+  hev = _mm_subs_epu16(flat, thresh_v);
   hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
 
   abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
   abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
-  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
+  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit_v);
   mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
   // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
   // So taking maximums continues to work:
-  mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
+  mask = _mm_and_si128(mask, _mm_adds_epu16(limit_v, one));
   mask = _mm_max_epi16(flat, mask);
   // mask |= (abs(p1 - p0) > limit) * -1;
   // mask |= (abs(q1 - q0) > limit) * -1;
@@ -822,7 +822,7 @@ void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
       _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)),
       _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3)));
   mask = _mm_max_epi16(work, mask);
-  mask = _mm_subs_epu16(mask, limit);
+  mask = _mm_subs_epu16(mask, limit_v);
   mask = _mm_cmpeq_epi16(mask, zero);
 
   // filter4
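
filter4 operates on sign-recentred samples: in 8-bit code the recentring is "x ^ 0x80", while here it is a subtraction/addition of t80 (0x80 scaled to the bit depth), as in the ps1/qs1 loads above. A sketch of just that step (to_signed_bd is a hypothetical helper):

  static INLINE int32_t to_signed_bd(uint16_t x, int bd) {
    return (int32_t)x - (0x80 << (bd - 8));  // matches _mm_subs_epi16(x, t80)
  }
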
@@ -872,18 +872,18 @@ void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
   p1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd),
                       t80);
 
-  _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
-  _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
-  _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
-  _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+  _mm_storeu_si128((__m128i *)(s - 2 * pitch), p1);
+  _mm_storeu_si128((__m128i *)(s - 1 * pitch), p0);
+  _mm_storeu_si128((__m128i *)(s + 0 * pitch), q0);
+  _mm_storeu_si128((__m128i *)(s + 1 * pitch), q1);
 }
 
 void vpx_highbd_lpf_horizontal_4_dual_sse2(
-    uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
-    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
-    const uint8_t *_thresh1, int bd) {
-  vpx_highbd_lpf_horizontal_4_sse2(s, p, _blimit0, _limit0, _thresh0, bd);
-  vpx_highbd_lpf_horizontal_4_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd);
+    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1, int bd) {
+  vpx_highbd_lpf_horizontal_4_sse2(s, pitch, blimit0, limit0, thresh0, bd);
+  vpx_highbd_lpf_horizontal_4_sse2(s + 8, pitch, blimit1, limit1, thresh1, bd);
 }
 
 static INLINE void highbd_transpose(uint16_t *src[], int in_p, uint16_t *dst[],
@@ -998,9 +998,9 @@ static INLINE void highbd_transpose8x16(uint16_t *in0, uint16_t *in1, int in_p,
   highbd_transpose(src1, in_p, dest1, out_p, 1);
 }
 
-void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit,
-                                    const uint8_t *limit, const uint8_t *thresh,
-                                    int bd) {
+void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int pitch,
+                                    const uint8_t *blimit, const uint8_t *limit,
+                                    const uint8_t *thresh, int bd) {
   DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]);
   uint16_t *src[1];
   uint16_t *dst[1];
@@ -1009,7 +1009,7 @@ void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit,
   src[0] = s - 4;
   dst[0] = t_dst;
 
-  highbd_transpose(src, p, dst, 8, 1);
+  highbd_transpose(src, pitch, dst, 8, 1);
 
   // Loop filtering
   vpx_highbd_lpf_horizontal_4_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd);
@@ -1018,11 +1018,11 @@ void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit,
   dst[0] = s - 4;
 
   // Transpose back
-  highbd_transpose(src, 8, dst, p, 1);
+  highbd_transpose(src, 8, dst, pitch, 1);
 }
 
 void vpx_highbd_lpf_vertical_4_dual_sse2(
-    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
     const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
     const uint8_t *thresh1, int bd) {
   DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
@@ -1030,7 +1030,7 @@ void vpx_highbd_lpf_vertical_4_dual_sse2(
   uint16_t *dst[2];
 
   // Transpose 8x16
-  highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
+  highbd_transpose8x16(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16);
 
   // Loop filtering
   vpx_highbd_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
@@ -1038,15 +1038,15 @@ void vpx_highbd_lpf_vertical_4_dual_sse2(
   src[0] = t_dst;
   src[1] = t_dst + 8;
   dst[0] = s - 4;
-  dst[1] = s - 4 + p * 8;
+  dst[1] = s - 4 + pitch * 8;
 
   // Transpose back
-  highbd_transpose(src, 16, dst, p, 2);
+  highbd_transpose(src, 16, dst, pitch, 2);
 }
 
-void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, const uint8_t *blimit,
-                                    const uint8_t *limit, const uint8_t *thresh,
-                                    int bd) {
+void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int pitch,
+                                    const uint8_t *blimit, const uint8_t *limit,
+                                    const uint8_t *thresh, int bd) {
   DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]);
   uint16_t *src[1];
   uint16_t *dst[1];
@@ -1055,7 +1055,7 @@ void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, const uint8_t *blimit,
   src[0] = s - 4;
   dst[0] = t_dst;
 
-  highbd_transpose(src, p, dst, 8, 1);
+  highbd_transpose(src, pitch, dst, 8, 1);
 
   // Loop filtering
   vpx_highbd_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd);
@@ -1064,11 +1064,11 @@ void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, const uint8_t *blimit,
   dst[0] = s - 4;
 
   // Transpose back
-  highbd_transpose(src, 8, dst, p, 1);
+  highbd_transpose(src, 8, dst, pitch, 1);
 }
 
 void vpx_highbd_lpf_vertical_8_dual_sse2(
-    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
     const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
     const uint8_t *thresh1, int bd) {
   DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
@@ -1076,7 +1076,7 @@ void vpx_highbd_lpf_vertical_8_dual_sse2(
   uint16_t *dst[2];
 
   // Transpose 8x16
-  highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
+  highbd_transpose8x16(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16);
 
   // Loop filtering
   vpx_highbd_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
@@ -1085,13 +1085,14 @@ void vpx_highbd_lpf_vertical_8_dual_sse2(
   src[1] = t_dst + 8;
 
   dst[0] = s - 4;
-  dst[1] = s - 4 + p * 8;
+  dst[1] = s - 4 + pitch * 8;
 
   // Transpose back
-  highbd_transpose(src, 16, dst, p, 2);
+  highbd_transpose(src, 16, dst, pitch, 2);
 }
 
-void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int p, const uint8_t *blimit,
+void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int pitch,
+                                     const uint8_t *blimit,
                                      const uint8_t *limit,
                                      const uint8_t *thresh, int bd) {
   DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 16]);
@@ -1104,7 +1105,7 @@ void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int p, const uint8_t *blimit,
   dst[1] = t_dst + 8 * 8;
 
   // Transpose 16x8
-  highbd_transpose(src, p, dst, 8, 2);
+  highbd_transpose(src, pitch, dst, 8, 2);
 
   // Loop filtering
   vpx_highbd_lpf_horizontal_16_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh,
@@ -1115,24 +1116,25 @@ void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int p, const uint8_t *blimit,
   dst[1] = s;
 
   // Transpose back
-  highbd_transpose(src, 8, dst, p, 2);
+  highbd_transpose(src, 8, dst, pitch, 2);
 }
 
-void vpx_highbd_lpf_vertical_16_dual_sse2(uint16_t *s, int p,
+void vpx_highbd_lpf_vertical_16_dual_sse2(uint16_t *s, int pitch,
                                           const uint8_t *blimit,
                                           const uint8_t *limit,
                                           const uint8_t *thresh, int bd) {
   DECLARE_ALIGNED(16, uint16_t, t_dst[256]);
 
   //  Transpose 16x16
-  highbd_transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
-  highbd_transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);
+  highbd_transpose8x16(s - 8, s - 8 + 8 * pitch, pitch, t_dst, 16);
+  highbd_transpose8x16(s, s + 8 * pitch, pitch, t_dst + 8 * 16, 16);
 
   //  Loop filtering
   vpx_highbd_lpf_horizontal_16_dual_sse2(t_dst + 8 * 16, 16, blimit, limit,
                                          thresh, bd);
 
   //  Transpose back
-  highbd_transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);
-  highbd_transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);
+  highbd_transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, pitch);
+  highbd_transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * pitch,
+                       pitch);
 }
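
Strategy note: the vertical variants never filter columns directly. They transpose the pixels around the edge into a small contiguous scratch buffer, run the matching horizontal kernel at the scratch pitch (8 or 16), then transpose back, so one SIMD kernel per filter size serves both orientations. Sketched in C comments (mirrors the calls above):

  /* 1. highbd_transpose(src, pitch, t_dst, 8 or 16, ...)  columns -> rows
   * 2. vpx_highbd_lpf_horizontal_*_sse2(t_dst + offset, 8 or 16, ...)
   * 3. highbd_transpose(t_dst, 8 or 16, dst, pitch, ...)  rows -> columns */
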
index 6652a62..85a7314 100644 (file)
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_ports/mem.h"
 
-void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p,
-                                const unsigned char *_blimit,
-                                const unsigned char *_limit,
-                                const unsigned char *_thresh) {
+void vpx_lpf_horizontal_16_avx2(unsigned char *s, int pitch,
+                                const unsigned char *blimit,
+                                const unsigned char *limit,
+                                const unsigned char *thresh) {
   __m128i mask, hev, flat, flat2;
   const __m128i zero = _mm_set1_epi16(0);
   const __m128i one = _mm_set1_epi8(1);
   __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
   __m128i abs_p1p0;
 
-  const __m128i thresh =
-      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0]));
-  const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0]));
-  const __m128i blimit =
-      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0]));
+  const __m128i thresh_v =
+      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)thresh[0]));
+  const __m128i limit_v = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)limit[0]));
+  const __m128i blimit_v =
+      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)blimit[0]));
 
-  q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
+  q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * pitch));
   q4p4 = _mm_castps_si128(
-      _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p)));
-  q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
+      _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * pitch)));
+  q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * pitch));
   q3p3 = _mm_castps_si128(
-      _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p)));
-  q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
+      _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * pitch)));
+  q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * pitch));
   q2p2 = _mm_castps_si128(
-      _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p)));
-  q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
+      _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * pitch)));
+  q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * pitch));
   q1p1 = _mm_castps_si128(
-      _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p)));
+      _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * pitch)));
   p1q1 = _mm_shuffle_epi32(q1p1, 78);
-  q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
+  q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * pitch));
   q0p0 = _mm_castps_si128(
-      _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p)));
+      _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * pitch)));
   p0q0 = _mm_shuffle_epi32(q0p0, 78);
 
   {
@@ -59,12 +59,12 @@ void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p,
     abs_p1q1 =
         _mm_or_si128(_mm_subs_epu8(q1p1, p1q1), _mm_subs_epu8(p1q1, q1p1));
     flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
-    hev = _mm_subs_epu8(flat, thresh);
+    hev = _mm_subs_epu8(flat, thresh_v);
     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
 
     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
-    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
     mask = _mm_max_epu8(abs_p1p0, mask);
@@ -76,7 +76,7 @@ void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p,
         _mm_or_si128(_mm_subs_epu8(q3p3, q2p2), _mm_subs_epu8(q2p2, q3p3)));
     mask = _mm_max_epu8(work, mask);
     mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
-    mask = _mm_subs_epu8(mask, limit);
+    mask = _mm_subs_epu8(mask, limit_v);
     mask = _mm_cmpeq_epi8(mask, zero);
   }
 
@@ -136,21 +136,21 @@ void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p,
       flat = _mm_cmpeq_epi8(flat, zero);
       flat = _mm_and_si128(flat, mask);
 
-      q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
+      q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * pitch));
       q5p5 = _mm_castps_si128(
-          _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p)));
+          _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * pitch)));
 
-      q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
+      q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * pitch));
       q6p6 = _mm_castps_si128(
-          _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p)));
+          _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * pitch)));
 
       flat2 = _mm_max_epu8(
           _mm_or_si128(_mm_subs_epu8(q4p4, q0p0), _mm_subs_epu8(q0p0, q4p4)),
           _mm_or_si128(_mm_subs_epu8(q5p5, q0p0), _mm_subs_epu8(q0p0, q5p5)));
 
-      q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
+      q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * pitch));
       q7p7 = _mm_castps_si128(
-          _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p)));
+          _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * pitch)));
 
       work = _mm_max_epu8(
           _mm_or_si128(_mm_subs_epu8(q6p6, q0p0), _mm_subs_epu8(q0p0, q6p6)),
@@ -321,44 +321,44 @@ void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p,
     q6p6 = _mm_andnot_si128(flat2, q6p6);
     flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
     q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
-    _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6);
-    _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6));
+    _mm_storel_epi64((__m128i *)(s - 7 * pitch), q6p6);
+    _mm_storeh_pi((__m64 *)(s + 6 * pitch), _mm_castsi128_ps(q6p6));
 
     q5p5 = _mm_andnot_si128(flat2, q5p5);
     flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
     q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
-    _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
-    _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5));
+    _mm_storel_epi64((__m128i *)(s - 6 * pitch), q5p5);
+    _mm_storeh_pi((__m64 *)(s + 5 * pitch), _mm_castsi128_ps(q5p5));
 
     q4p4 = _mm_andnot_si128(flat2, q4p4);
     flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
     q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
-    _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
-    _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4));
+    _mm_storel_epi64((__m128i *)(s - 5 * pitch), q4p4);
+    _mm_storeh_pi((__m64 *)(s + 4 * pitch), _mm_castsi128_ps(q4p4));
 
     q3p3 = _mm_andnot_si128(flat2, q3p3);
     flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
     q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
-    _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
-    _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3));
+    _mm_storel_epi64((__m128i *)(s - 4 * pitch), q3p3);
+    _mm_storeh_pi((__m64 *)(s + 3 * pitch), _mm_castsi128_ps(q3p3));
 
     q2p2 = _mm_andnot_si128(flat2, q2p2);
     flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
     q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
-    _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
-    _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2));
+    _mm_storel_epi64((__m128i *)(s - 3 * pitch), q2p2);
+    _mm_storeh_pi((__m64 *)(s + 2 * pitch), _mm_castsi128_ps(q2p2));
 
     q1p1 = _mm_andnot_si128(flat2, q1p1);
     flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
     q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
-    _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
-    _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1));
+    _mm_storel_epi64((__m128i *)(s - 2 * pitch), q1p1);
+    _mm_storeh_pi((__m64 *)(s + 1 * pitch), _mm_castsi128_ps(q1p1));
 
     q0p0 = _mm_andnot_si128(flat2, q0p0);
     flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
     q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
-    _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
-    _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0));
+    _mm_storel_epi64((__m128i *)(s - 1 * pitch), q0p0);
+    _mm_storeh_pi((__m64 *)(s - 0 * pitch), _mm_castsi128_ps(q0p0));
   }
 }
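
Layout note for the loads above (a sketch, not library code): each qXpX register packs the pX row in its low 64 bits and the matching qX row in its high 64 bits, so one 128-bit operation filters both sides of the edge at once.

  // q0p0 = [ p0 row (8 bytes) | q0 row (8 bytes) ]   via loadl + loadh
  // p0q0 = _mm_shuffle_epi32(q0p0, 78);  // 78 = _MM_SHUFFLE(1, 0, 3, 2),
  //                                      // i.e. swap the 64-bit halves
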
 
@@ -367,10 +367,10 @@ DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = {
   8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128
 };
 
-void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p,
-                                     const unsigned char *_blimit,
-                                     const unsigned char *_limit,
-                                     const unsigned char *_thresh) {
+void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int pitch,
+                                     const unsigned char *blimit,
+                                     const unsigned char *limit,
+                                     const unsigned char *thresh) {
   __m128i mask, hev, flat, flat2;
   const __m128i zero = _mm_set1_epi16(0);
   const __m128i one = _mm_set1_epi8(1);
@@ -380,32 +380,32 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p,
   __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4, q256_4,
       p256_3, q256_3, p256_2, q256_2, p256_1, q256_1, p256_0, q256_0;
 
-  const __m128i thresh =
-      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0]));
-  const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0]));
-  const __m128i blimit =
-      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0]));
-
-  p256_4 =
-      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 5 * p)));
-  p256_3 =
-      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 4 * p)));
-  p256_2 =
-      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 3 * p)));
-  p256_1 =
-      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 2 * p)));
-  p256_0 =
-      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 1 * p)));
-  q256_0 =
-      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 0 * p)));
-  q256_1 =
-      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 1 * p)));
-  q256_2 =
-      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 2 * p)));
-  q256_3 =
-      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 3 * p)));
-  q256_4 =
-      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 4 * p)));
+  const __m128i thresh_v =
+      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)thresh[0]));
+  const __m128i limit_v = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)limit[0]));
+  const __m128i blimit_v =
+      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)blimit[0]));
+
+  p256_4 = _mm256_castpd_si256(
+      _mm256_broadcast_pd((__m128d const *)(s - 5 * pitch)));
+  p256_3 = _mm256_castpd_si256(
+      _mm256_broadcast_pd((__m128d const *)(s - 4 * pitch)));
+  p256_2 = _mm256_castpd_si256(
+      _mm256_broadcast_pd((__m128d const *)(s - 3 * pitch)));
+  p256_1 = _mm256_castpd_si256(
+      _mm256_broadcast_pd((__m128d const *)(s - 2 * pitch)));
+  p256_0 = _mm256_castpd_si256(
+      _mm256_broadcast_pd((__m128d const *)(s - 1 * pitch)));
+  q256_0 = _mm256_castpd_si256(
+      _mm256_broadcast_pd((__m128d const *)(s - 0 * pitch)));
+  q256_1 = _mm256_castpd_si256(
+      _mm256_broadcast_pd((__m128d const *)(s + 1 * pitch)));
+  q256_2 = _mm256_castpd_si256(
+      _mm256_broadcast_pd((__m128d const *)(s + 2 * pitch)));
+  q256_3 = _mm256_castpd_si256(
+      _mm256_broadcast_pd((__m128d const *)(s + 3 * pitch)));
+  q256_4 = _mm256_castpd_si256(
+      _mm256_broadcast_pd((__m128d const *)(s + 4 * pitch)));
 
   p4 = _mm256_castsi256_si128(p256_4);
   p3 = _mm256_castsi256_si128(p256_3);
@@ -431,12 +431,12 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p,
         _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
     __m128i work;
     flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
-    hev = _mm_subs_epu8(flat, thresh);
+    hev = _mm_subs_epu8(flat, thresh_v);
     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
 
     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
-    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
     mask = _mm_max_epu8(flat, mask);
@@ -450,7 +450,7 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p,
         _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
         _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
     mask = _mm_max_epu8(work, mask);
-    mask = _mm_subs_epu8(mask, limit);
+    mask = _mm_subs_epu8(mask, limit_v);
     mask = _mm_cmpeq_epi8(mask, zero);
   }
 
@@ -532,9 +532,9 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p,
       flat = _mm_and_si128(flat, mask);
 
       p256_5 = _mm256_castpd_si256(
-          _mm256_broadcast_pd((__m128d const *)(s - 6 * p)));
+          _mm256_broadcast_pd((__m128d const *)(s - 6 * pitch)));
       q256_5 = _mm256_castpd_si256(
-          _mm256_broadcast_pd((__m128d const *)(s + 5 * p)));
+          _mm256_broadcast_pd((__m128d const *)(s + 5 * pitch)));
       p5 = _mm256_castsi256_si128(p256_5);
       q5 = _mm256_castsi256_si128(q256_5);
       flat2 = _mm_max_epu8(
@@ -543,9 +543,9 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p,
 
       flat2 = _mm_max_epu8(work, flat2);
       p256_6 = _mm256_castpd_si256(
-          _mm256_broadcast_pd((__m128d const *)(s - 7 * p)));
+          _mm256_broadcast_pd((__m128d const *)(s - 7 * pitch)));
       q256_6 = _mm256_castpd_si256(
-          _mm256_broadcast_pd((__m128d const *)(s + 6 * p)));
+          _mm256_broadcast_pd((__m128d const *)(s + 6 * pitch)));
       p6 = _mm256_castsi256_si128(p256_6);
       q6 = _mm256_castsi256_si128(q256_6);
       work = _mm_max_epu8(
@@ -555,9 +555,9 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p,
       flat2 = _mm_max_epu8(work, flat2);
 
       p256_7 = _mm256_castpd_si256(
-          _mm256_broadcast_pd((__m128d const *)(s - 8 * p)));
+          _mm256_broadcast_pd((__m128d const *)(s - 8 * pitch)));
       q256_7 = _mm256_castpd_si256(
-          _mm256_broadcast_pd((__m128d const *)(s + 7 * p)));
+          _mm256_broadcast_pd((__m128d const *)(s + 7 * pitch)));
       p7 = _mm256_castsi256_si128(p256_7);
       q7 = _mm256_castsi256_si128(q256_7);
       work = _mm_max_epu8(
@@ -843,71 +843,71 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p,
     p6 = _mm_andnot_si128(flat2, p6);
     flat2_p6 = _mm_and_si128(flat2, flat2_p6);
     p6 = _mm_or_si128(flat2_p6, p6);
-    _mm_storeu_si128((__m128i *)(s - 7 * p), p6);
+    _mm_storeu_si128((__m128i *)(s - 7 * pitch), p6);
 
     p5 = _mm_andnot_si128(flat2, p5);
     flat2_p5 = _mm_and_si128(flat2, flat2_p5);
     p5 = _mm_or_si128(flat2_p5, p5);
-    _mm_storeu_si128((__m128i *)(s - 6 * p), p5);
+    _mm_storeu_si128((__m128i *)(s - 6 * pitch), p5);
 
     p4 = _mm_andnot_si128(flat2, p4);
     flat2_p4 = _mm_and_si128(flat2, flat2_p4);
     p4 = _mm_or_si128(flat2_p4, p4);
-    _mm_storeu_si128((__m128i *)(s - 5 * p), p4);
+    _mm_storeu_si128((__m128i *)(s - 5 * pitch), p4);
 
     p3 = _mm_andnot_si128(flat2, p3);
     flat2_p3 = _mm_and_si128(flat2, flat2_p3);
     p3 = _mm_or_si128(flat2_p3, p3);
-    _mm_storeu_si128((__m128i *)(s - 4 * p), p3);
+    _mm_storeu_si128((__m128i *)(s - 4 * pitch), p3);
 
     p2 = _mm_andnot_si128(flat2, p2);
     flat2_p2 = _mm_and_si128(flat2, flat2_p2);
     p2 = _mm_or_si128(flat2_p2, p2);
-    _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
+    _mm_storeu_si128((__m128i *)(s - 3 * pitch), p2);
 
     p1 = _mm_andnot_si128(flat2, p1);
     flat2_p1 = _mm_and_si128(flat2, flat2_p1);
     p1 = _mm_or_si128(flat2_p1, p1);
-    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+    _mm_storeu_si128((__m128i *)(s - 2 * pitch), p1);
 
     p0 = _mm_andnot_si128(flat2, p0);
     flat2_p0 = _mm_and_si128(flat2, flat2_p0);
     p0 = _mm_or_si128(flat2_p0, p0);
-    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+    _mm_storeu_si128((__m128i *)(s - 1 * pitch), p0);
 
     q0 = _mm_andnot_si128(flat2, q0);
     flat2_q0 = _mm_and_si128(flat2, flat2_q0);
     q0 = _mm_or_si128(flat2_q0, q0);
-    _mm_storeu_si128((__m128i *)(s - 0 * p), q0);
+    _mm_storeu_si128((__m128i *)(s - 0 * pitch), q0);
 
     q1 = _mm_andnot_si128(flat2, q1);
     flat2_q1 = _mm_and_si128(flat2, flat2_q1);
     q1 = _mm_or_si128(flat2_q1, q1);
-    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+    _mm_storeu_si128((__m128i *)(s + 1 * pitch), q1);
 
     q2 = _mm_andnot_si128(flat2, q2);
     flat2_q2 = _mm_and_si128(flat2, flat2_q2);
     q2 = _mm_or_si128(flat2_q2, q2);
-    _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
+    _mm_storeu_si128((__m128i *)(s + 2 * pitch), q2);
 
     q3 = _mm_andnot_si128(flat2, q3);
     flat2_q3 = _mm_and_si128(flat2, flat2_q3);
     q3 = _mm_or_si128(flat2_q3, q3);
-    _mm_storeu_si128((__m128i *)(s + 3 * p), q3);
+    _mm_storeu_si128((__m128i *)(s + 3 * pitch), q3);
 
     q4 = _mm_andnot_si128(flat2, q4);
     flat2_q4 = _mm_and_si128(flat2, flat2_q4);
     q4 = _mm_or_si128(flat2_q4, q4);
-    _mm_storeu_si128((__m128i *)(s + 4 * p), q4);
+    _mm_storeu_si128((__m128i *)(s + 4 * pitch), q4);
 
     q5 = _mm_andnot_si128(flat2, q5);
     flat2_q5 = _mm_and_si128(flat2, flat2_q5);
     q5 = _mm_or_si128(flat2_q5, q5);
-    _mm_storeu_si128((__m128i *)(s + 5 * p), q5);
+    _mm_storeu_si128((__m128i *)(s + 5 * pitch), q5);
 
     q6 = _mm_andnot_si128(flat2, q6);
     flat2_q6 = _mm_and_si128(flat2, flat2_q6);
     q6 = _mm_or_si128(flat2_q6, q6);
-    _mm_storeu_si128((__m128i *)(s + 6 * p), q6);
+    _mm_storeu_si128((__m128i *)(s + 6 * pitch), q6);
   }
 }
index 853c4d2..20dcb0d 100644 (file)
@@ -31,7 +31,7 @@ static INLINE __m128i abs_diff(__m128i a, __m128i b) {
     /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */       \
     hev =                                                                     \
         _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \
-    hev = _mm_cmpgt_epi16(hev, thresh);                                       \
+    hev = _mm_cmpgt_epi16(hev, thresh_v);                                     \
     hev = _mm_packs_epi16(hev, hev);                                          \
                                                                               \
     /* const int8_t mask = filter_mask(*limit, *blimit, */                    \
@@ -52,7 +52,7 @@ static INLINE __m128i abs_diff(__m128i a, __m128i b) {
     flat = _mm_max_epu8(work, flat);                                          \
     flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));                       \
     mask = _mm_unpacklo_epi64(mask, flat);                                    \
-    mask = _mm_subs_epu8(mask, limit);                                        \
+    mask = _mm_subs_epu8(mask, limit_v);                                      \
     mask = _mm_cmpeq_epi8(mask, zero);                                        \
     mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8));                      \
   } while (0)
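
The macro widens to 16 bits and compares against thresh_v with a signed greater-than, which matches the scalar hev mask referenced in its opening comment (cf. hev_mask in vpx_dsp/loopfilter.c; roughly):

  static int8_t hev_mask(uint8_t thresh, uint8_t p1, uint8_t p0, uint8_t q0,
                         uint8_t q1) {
    int8_t hev = 0;
    hev |= (abs(p1 - p0) > thresh) * -1;
    hev |= (abs(q1 - q0) > thresh) * -1;
    return hev;
  }
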
@@ -104,27 +104,26 @@ static INLINE __m128i abs_diff(__m128i a, __m128i b) {
     ps1ps0 = _mm_xor_si128(ps1ps0, t80); /* ^ 0x80 */                       \
   } while (0)
 
-void vpx_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */,
-                               const uint8_t *_blimit, const uint8_t *_limit,
-                               const uint8_t *_thresh) {
+void vpx_lpf_horizontal_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit,
+                               const uint8_t *limit, const uint8_t *thresh) {
   const __m128i zero = _mm_set1_epi16(0);
-  const __m128i limit =
-      _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
-                         _mm_loadl_epi64((const __m128i *)_limit));
-  const __m128i thresh =
-      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
+  const __m128i limit_v =
+      _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)blimit),
+                         _mm_loadl_epi64((const __m128i *)limit));
+  const __m128i thresh_v =
+      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)thresh), zero);
   const __m128i ff = _mm_cmpeq_epi8(zero, zero);
   __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0;
   __m128i mask, hev;
 
-  p3p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
-                            _mm_loadl_epi64((__m128i *)(s - 4 * p)));
-  q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 1 * p)));
-  q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 0 * p)));
-  q3q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s + 2 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 3 * p)));
+  p3p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * pitch)),
+                            _mm_loadl_epi64((__m128i *)(s - 4 * pitch)));
+  q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * pitch)),
+                            _mm_loadl_epi64((__m128i *)(s + 1 * pitch)));
+  q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * pitch)),
+                            _mm_loadl_epi64((__m128i *)(s + 0 * pitch)));
+  q3q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s + 2 * pitch)),
+                            _mm_loadl_epi64((__m128i *)(s + 3 * pitch)));
   p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
   p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
   q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
@@ -133,41 +132,40 @@ void vpx_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */,
   FILTER_HEV_MASK;
   FILTER4;
 
-  _mm_storeh_pi((__m64 *)(s - 2 * p), _mm_castsi128_ps(ps1ps0));  // *op1
-  _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0);               // *op0
-  _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0);               // *oq0
-  _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(qs1qs0));  // *oq1
+  _mm_storeh_pi((__m64 *)(s - 2 * pitch), _mm_castsi128_ps(ps1ps0));  // *op1
+  _mm_storel_epi64((__m128i *)(s - 1 * pitch), ps1ps0);               // *op0
+  _mm_storel_epi64((__m128i *)(s + 0 * pitch), qs1qs0);               // *oq0
+  _mm_storeh_pi((__m64 *)(s + 1 * pitch), _mm_castsi128_ps(qs1qs0));  // *oq1
 }
 
-void vpx_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
-                             const uint8_t *_blimit, const uint8_t *_limit,
-                             const uint8_t *_thresh) {
+void vpx_lpf_vertical_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit,
+                             const uint8_t *limit, const uint8_t *thresh) {
   const __m128i zero = _mm_set1_epi16(0);
-  const __m128i limit =
-      _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
-                         _mm_loadl_epi64((const __m128i *)_limit));
-  const __m128i thresh =
-      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
+  const __m128i limit_v =
+      _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)blimit),
+                         _mm_loadl_epi64((const __m128i *)limit));
+  const __m128i thresh_v =
+      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)thresh), zero);
   const __m128i ff = _mm_cmpeq_epi8(zero, zero);
   __m128i x0, x1, x2, x3;
   __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0;
   __m128i mask, hev;
 
   // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
-  q1q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 0 * p - 4)),
-                           _mm_loadl_epi64((__m128i *)(s + 1 * p - 4)));
+  q1q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 0 * pitch - 4)),
+                           _mm_loadl_epi64((__m128i *)(s + 1 * pitch - 4)));
 
   // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
-  x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 2 * p - 4)),
-                         _mm_loadl_epi64((__m128i *)(s + 3 * p - 4)));
+  x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 2 * pitch - 4)),
+                         _mm_loadl_epi64((__m128i *)(s + 3 * pitch - 4)));
 
   // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
-  x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 4 * p - 4)),
-                         _mm_loadl_epi64((__m128i *)(s + 5 * p - 4)));
+  x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 4 * pitch - 4)),
+                         _mm_loadl_epi64((__m128i *)(s + 5 * pitch - 4)));
 
   // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
-  x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 6 * p - 4)),
-                         _mm_loadl_epi64((__m128i *)(s + 7 * p - 4)));
+  x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 6 * pitch - 4)),
+                         _mm_loadl_epi64((__m128i *)(s + 7 * pitch - 4)));
 
   // Transpose 8x8
   // 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
@@ -213,52 +211,52 @@ void vpx_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
   // 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
   ps1ps0 = _mm_unpacklo_epi8(ps1ps0, x0);
 
-  storeu_uint32(s + 0 * p - 2, _mm_cvtsi128_si32(ps1ps0));
+  storeu_uint32(s + 0 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
   ps1ps0 = _mm_srli_si128(ps1ps0, 4);
-  storeu_uint32(s + 1 * p - 2, _mm_cvtsi128_si32(ps1ps0));
+  storeu_uint32(s + 1 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
   ps1ps0 = _mm_srli_si128(ps1ps0, 4);
-  storeu_uint32(s + 2 * p - 2, _mm_cvtsi128_si32(ps1ps0));
+  storeu_uint32(s + 2 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
   ps1ps0 = _mm_srli_si128(ps1ps0, 4);
-  storeu_uint32(s + 3 * p - 2, _mm_cvtsi128_si32(ps1ps0));
+  storeu_uint32(s + 3 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
 
-  storeu_uint32(s + 4 * p - 2, _mm_cvtsi128_si32(qs1qs0));
+  storeu_uint32(s + 4 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
   qs1qs0 = _mm_srli_si128(qs1qs0, 4);
-  storeu_uint32(s + 5 * p - 2, _mm_cvtsi128_si32(qs1qs0));
+  storeu_uint32(s + 5 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
   qs1qs0 = _mm_srli_si128(qs1qs0, 4);
-  storeu_uint32(s + 6 * p - 2, _mm_cvtsi128_si32(qs1qs0));
+  storeu_uint32(s + 6 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
   qs1qs0 = _mm_srli_si128(qs1qs0, 4);
-  storeu_uint32(s + 7 * p - 2, _mm_cvtsi128_si32(qs1qs0));
+  storeu_uint32(s + 7 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
 }
 
-void vpx_lpf_horizontal_16_sse2(unsigned char *s, int p,
-                                const unsigned char *_blimit,
-                                const unsigned char *_limit,
-                                const unsigned char *_thresh) {
+void vpx_lpf_horizontal_16_sse2(unsigned char *s, int pitch,
+                                const unsigned char *blimit,
+                                const unsigned char *limit,
+                                const unsigned char *thresh) {
   const __m128i zero = _mm_set1_epi16(0);
   const __m128i one = _mm_set1_epi8(1);
-  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
-  const __m128i limit = _mm_load_si128((const __m128i *)_limit);
-  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
+  const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit);
+  const __m128i limit_v = _mm_load_si128((const __m128i *)limit);
+  const __m128i thresh_v = _mm_load_si128((const __m128i *)thresh);
   __m128i mask, hev, flat, flat2;
   __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
   __m128i abs_p1p0;
 
-  q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
+  q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * pitch));
   q4p4 = _mm_castps_si128(
-      _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p)));
-  q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
+      _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * pitch)));
+  q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * pitch));
   q3p3 = _mm_castps_si128(
-      _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p)));
-  q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
+      _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * pitch)));
+  q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * pitch));
   q2p2 = _mm_castps_si128(
-      _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p)));
-  q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
+      _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * pitch)));
+  q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * pitch));
   q1p1 = _mm_castps_si128(
-      _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p)));
+      _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * pitch)));
   p1q1 = _mm_shuffle_epi32(q1p1, 78);
-  q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
+  q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * pitch));
   q0p0 = _mm_castps_si128(
-      _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p)));
+      _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * pitch)));
   p0q0 = _mm_shuffle_epi32(q0p0, 78);
 
   {
@@ -270,12 +268,12 @@ void vpx_lpf_horizontal_16_sse2(unsigned char *s, int p,
     abs_p0q0 = abs_diff(q0p0, p0q0);
     abs_p1q1 = abs_diff(q1p1, p1q1);
     flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
-    hev = _mm_subs_epu8(flat, thresh);
+    hev = _mm_subs_epu8(flat, thresh_v);
     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
 
     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
-    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
     mask = _mm_max_epu8(abs_p1p0, mask);
@@ -285,7 +283,7 @@ void vpx_lpf_horizontal_16_sse2(unsigned char *s, int p,
     work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
     mask = _mm_max_epu8(work, mask);
     mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
-    mask = _mm_subs_epu8(mask, limit);
+    mask = _mm_subs_epu8(mask, limit_v);
     mask = _mm_cmpeq_epi8(mask, zero);
   }
 
@@ -343,18 +341,18 @@ void vpx_lpf_horizontal_16_sse2(unsigned char *s, int p,
       flat = _mm_cmpeq_epi8(flat, zero);
       flat = _mm_and_si128(flat, mask);
 
-      q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
+      q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * pitch));
       q5p5 = _mm_castps_si128(
-          _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p)));
+          _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * pitch)));
 
-      q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
+      q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * pitch));
       q6p6 = _mm_castps_si128(
-          _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p)));
+          _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * pitch)));
       flat2 = _mm_max_epu8(abs_diff(q4p4, q0p0), abs_diff(q5p5, q0p0));
 
-      q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
+      q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * pitch));
       q7p7 = _mm_castps_si128(
-          _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p)));
+          _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * pitch)));
       work = _mm_max_epu8(abs_diff(q6p6, q0p0), abs_diff(q7p7, q0p0));
       flat2 = _mm_max_epu8(work, flat2);
       flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
@@ -521,44 +519,44 @@ void vpx_lpf_horizontal_16_sse2(unsigned char *s, int p,
     q6p6 = _mm_andnot_si128(flat2, q6p6);
     flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
     q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
-    _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6);
-    _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6));
+    _mm_storel_epi64((__m128i *)(s - 7 * pitch), q6p6);
+    _mm_storeh_pi((__m64 *)(s + 6 * pitch), _mm_castsi128_ps(q6p6));
 
     q5p5 = _mm_andnot_si128(flat2, q5p5);
     flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
     q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
-    _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
-    _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5));
+    _mm_storel_epi64((__m128i *)(s - 6 * pitch), q5p5);
+    _mm_storeh_pi((__m64 *)(s + 5 * pitch), _mm_castsi128_ps(q5p5));
 
     q4p4 = _mm_andnot_si128(flat2, q4p4);
     flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
     q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
-    _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
-    _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4));
+    _mm_storel_epi64((__m128i *)(s - 5 * pitch), q4p4);
+    _mm_storeh_pi((__m64 *)(s + 4 * pitch), _mm_castsi128_ps(q4p4));
 
     q3p3 = _mm_andnot_si128(flat2, q3p3);
     flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
     q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
-    _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
-    _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3));
+    _mm_storel_epi64((__m128i *)(s - 4 * pitch), q3p3);
+    _mm_storeh_pi((__m64 *)(s + 3 * pitch), _mm_castsi128_ps(q3p3));
 
     q2p2 = _mm_andnot_si128(flat2, q2p2);
     flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
     q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
-    _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
-    _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2));
+    _mm_storel_epi64((__m128i *)(s - 3 * pitch), q2p2);
+    _mm_storeh_pi((__m64 *)(s + 2 * pitch), _mm_castsi128_ps(q2p2));
 
     q1p1 = _mm_andnot_si128(flat2, q1p1);
     flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
     q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
-    _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
-    _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1));
+    _mm_storel_epi64((__m128i *)(s - 2 * pitch), q1p1);
+    _mm_storeh_pi((__m64 *)(s + 1 * pitch), _mm_castsi128_ps(q1p1));
 
     q0p0 = _mm_andnot_si128(flat2, q0p0);
     flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
     q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
-    _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
-    _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0));
+    _mm_storel_epi64((__m128i *)(s - 1 * pitch), q0p0);
+    _mm_storeh_pi((__m64 *)(s - 0 * pitch), _mm_castsi128_ps(q0p0));
   }
 }
 
@@ -592,15 +590,15 @@ static INLINE __m128i filter16_mask(const __m128i *const flat,
   return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
 }
 
-void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int p,
-                                     const unsigned char *_blimit,
-                                     const unsigned char *_limit,
-                                     const unsigned char *_thresh) {
+void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int pitch,
+                                     const unsigned char *blimit,
+                                     const unsigned char *limit,
+                                     const unsigned char *thresh) {
   const __m128i zero = _mm_set1_epi16(0);
   const __m128i one = _mm_set1_epi8(1);
-  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
-  const __m128i limit = _mm_load_si128((const __m128i *)_limit);
-  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
+  const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit);
+  const __m128i limit_v = _mm_load_si128((const __m128i *)limit);
+  const __m128i thresh_v = _mm_load_si128((const __m128i *)thresh);
   __m128i mask, hev, flat, flat2;
   __m128i p7, p6, p5;
   __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
@@ -610,22 +608,22 @@ void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int p,
 
   __m128i max_abs_p1p0q1q0;
 
-  p7 = _mm_loadu_si128((__m128i *)(s - 8 * p));
-  p6 = _mm_loadu_si128((__m128i *)(s - 7 * p));
-  p5 = _mm_loadu_si128((__m128i *)(s - 6 * p));
-  p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
-  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
-  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
-  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
-  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
-  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
-  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
-  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
-  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
-  q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));
-  q5 = _mm_loadu_si128((__m128i *)(s + 5 * p));
-  q6 = _mm_loadu_si128((__m128i *)(s + 6 * p));
-  q7 = _mm_loadu_si128((__m128i *)(s + 7 * p));
+  p7 = _mm_loadu_si128((__m128i *)(s - 8 * pitch));
+  p6 = _mm_loadu_si128((__m128i *)(s - 7 * pitch));
+  p5 = _mm_loadu_si128((__m128i *)(s - 6 * pitch));
+  p4 = _mm_loadu_si128((__m128i *)(s - 5 * pitch));
+  p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch));
+  p2 = _mm_loadu_si128((__m128i *)(s - 3 * pitch));
+  p1 = _mm_loadu_si128((__m128i *)(s - 2 * pitch));
+  p0 = _mm_loadu_si128((__m128i *)(s - 1 * pitch));
+  q0 = _mm_loadu_si128((__m128i *)(s - 0 * pitch));
+  q1 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
+  q2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
+  q3 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
+  q4 = _mm_loadu_si128((__m128i *)(s + 4 * pitch));
+  q5 = _mm_loadu_si128((__m128i *)(s + 5 * pitch));
+  q6 = _mm_loadu_si128((__m128i *)(s + 6 * pitch));
+  q7 = _mm_loadu_si128((__m128i *)(s + 7 * pitch));
 
   {
     const __m128i abs_p1p0 = abs_diff(p1, p0);
@@ -639,7 +637,7 @@ void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int p,
 
     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
-    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
     mask = _mm_max_epu8(max_abs_p1p0q1q0, mask);
@@ -649,7 +647,7 @@ void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int p,
     mask = _mm_max_epu8(work, mask);
     work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2));
     mask = _mm_max_epu8(work, mask);
-    mask = _mm_subs_epu8(mask, limit);
+    mask = _mm_subs_epu8(mask, limit_v);
     mask = _mm_cmpeq_epi8(mask, zero);
   }
 
@@ -695,7 +693,7 @@ void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int p,
     oq0 = _mm_xor_si128(q0, t80);
     oq1 = _mm_xor_si128(q1, t80);
 
-    hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh);
+    hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh_v);
     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
     filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev);
 
@@ -852,82 +850,82 @@ void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int p,
       f_hi = _mm_add_epi16(_mm_add_epi16(p5_hi, eight), f_hi);
 
       p6 = filter16_mask(&flat2, &p6, &f_lo, &f_hi);
-      _mm_storeu_si128((__m128i *)(s - 7 * p), p6);
+      _mm_storeu_si128((__m128i *)(s - 7 * pitch), p6);
 
       f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p5_lo, &p6_lo, &p7_lo);
       f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p5_hi, &p6_hi, &p7_hi);
       p5 = filter16_mask(&flat2, &p5, &f_lo, &f_hi);
-      _mm_storeu_si128((__m128i *)(s - 6 * p), p5);
+      _mm_storeu_si128((__m128i *)(s - 6 * pitch), p5);
 
       f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p4_lo, &p5_lo, &p7_lo);
       f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p4_hi, &p5_hi, &p7_hi);
       p4 = filter16_mask(&flat2, &p4, &f_lo, &f_hi);
-      _mm_storeu_si128((__m128i *)(s - 5 * p), p4);
+      _mm_storeu_si128((__m128i *)(s - 5 * pitch), p4);
 
       f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p3_lo, &p4_lo, &p7_lo);
       f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p3_hi, &p4_hi, &p7_hi);
       p3 = filter16_mask(&flat2, &p3, &f_lo, &f_hi);
-      _mm_storeu_si128((__m128i *)(s - 4 * p), p3);
+      _mm_storeu_si128((__m128i *)(s - 4 * pitch), p3);
 
       f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p2_lo, &p3_lo, &p7_lo);
       f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p2_hi, &p3_hi, &p7_hi);
       op2 = filter16_mask(&flat2, &op2, &f_lo, &f_hi);
-      _mm_storeu_si128((__m128i *)(s - 3 * p), op2);
+      _mm_storeu_si128((__m128i *)(s - 3 * pitch), op2);
 
       f_lo = filter_add2_sub2(&f_lo, &q5_lo, &p1_lo, &p2_lo, &p7_lo);
       f_hi = filter_add2_sub2(&f_hi, &q5_hi, &p1_hi, &p2_hi, &p7_hi);
       op1 = filter16_mask(&flat2, &op1, &f_lo, &f_hi);
-      _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
+      _mm_storeu_si128((__m128i *)(s - 2 * pitch), op1);
 
       f_lo = filter_add2_sub2(&f_lo, &q6_lo, &p0_lo, &p1_lo, &p7_lo);
       f_hi = filter_add2_sub2(&f_hi, &q6_hi, &p0_hi, &p1_hi, &p7_hi);
       op0 = filter16_mask(&flat2, &op0, &f_lo, &f_hi);
-      _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
+      _mm_storeu_si128((__m128i *)(s - 1 * pitch), op0);
 
       f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q0_lo, &p0_lo, &p7_lo);
       f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q0_hi, &p0_hi, &p7_hi);
       oq0 = filter16_mask(&flat2, &oq0, &f_lo, &f_hi);
-      _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
+      _mm_storeu_si128((__m128i *)(s - 0 * pitch), oq0);
 
       f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q1_lo, &p6_lo, &q0_lo);
       f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q1_hi, &p6_hi, &q0_hi);
       oq1 = filter16_mask(&flat2, &oq1, &f_lo, &f_hi);
-      _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
+      _mm_storeu_si128((__m128i *)(s + 1 * pitch), oq1);
 
       f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q2_lo, &p5_lo, &q1_lo);
       f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q2_hi, &p5_hi, &q1_hi);
       oq2 = filter16_mask(&flat2, &oq2, &f_lo, &f_hi);
-      _mm_storeu_si128((__m128i *)(s + 2 * p), oq2);
+      _mm_storeu_si128((__m128i *)(s + 2 * pitch), oq2);
 
       f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q3_lo, &p4_lo, &q2_lo);
       f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q3_hi, &p4_hi, &q2_hi);
       q3 = filter16_mask(&flat2, &q3, &f_lo, &f_hi);
-      _mm_storeu_si128((__m128i *)(s + 3 * p), q3);
+      _mm_storeu_si128((__m128i *)(s + 3 * pitch), q3);
 
       f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q4_lo, &p3_lo, &q3_lo);
       f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q4_hi, &p3_hi, &q3_hi);
       q4 = filter16_mask(&flat2, &q4, &f_lo, &f_hi);
-      _mm_storeu_si128((__m128i *)(s + 4 * p), q4);
+      _mm_storeu_si128((__m128i *)(s + 4 * pitch), q4);
 
       f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q5_lo, &p2_lo, &q4_lo);
       f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q5_hi, &p2_hi, &q4_hi);
       q5 = filter16_mask(&flat2, &q5, &f_lo, &f_hi);
-      _mm_storeu_si128((__m128i *)(s + 5 * p), q5);
+      _mm_storeu_si128((__m128i *)(s + 5 * pitch), q5);
 
       f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q6_lo, &p1_lo, &q5_lo);
       f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q6_hi, &p1_hi, &q5_hi);
       q6 = filter16_mask(&flat2, &q6, &f_lo, &f_hi);
-      _mm_storeu_si128((__m128i *)(s + 6 * p), q6);
+      _mm_storeu_si128((__m128i *)(s + 6 * pitch), q6);
     }
     // wide flat
     // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   }
 }
 
-void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p,
-                               const unsigned char *_blimit,
-                               const unsigned char *_limit,
-                               const unsigned char *_thresh) {
+void vpx_lpf_horizontal_8_sse2(unsigned char *s, int pitch,
+                               const unsigned char *blimit,
+                               const unsigned char *limit,
+                               const unsigned char *thresh) {
   DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
   DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
   DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
@@ -935,21 +933,21 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p,
   DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
   DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
   const __m128i zero = _mm_set1_epi16(0);
-  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
-  const __m128i limit = _mm_load_si128((const __m128i *)_limit);
-  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
+  const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit);
+  const __m128i limit_v = _mm_load_si128((const __m128i *)limit);
+  const __m128i thresh_v = _mm_load_si128((const __m128i *)thresh);
   __m128i mask, hev, flat;
   __m128i p3, p2, p1, p0, q0, q1, q2, q3;
   __m128i q3p3, q2p2, q1p1, q0p0, p1q1, p0q0;
 
-  q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 3 * p)));
-  q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 2 * p)));
-  q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 1 * p)));
-  q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
-                            _mm_loadl_epi64((__m128i *)(s - 0 * p)));
+  q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * pitch)),
+                            _mm_loadl_epi64((__m128i *)(s + 3 * pitch)));
+  q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * pitch)),
+                            _mm_loadl_epi64((__m128i *)(s + 2 * pitch)));
+  q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * pitch)),
+                            _mm_loadl_epi64((__m128i *)(s + 1 * pitch)));
+  q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * pitch)),
+                            _mm_loadl_epi64((__m128i *)(s - 0 * pitch)));
   p1q1 = _mm_shuffle_epi32(q1p1, 78);
   p0q0 = _mm_shuffle_epi32(q0p0, 78);
 
@@ -965,12 +963,12 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p,
     abs_p0q0 = abs_diff(q0p0, p0q0);
     abs_p1q1 = abs_diff(q1p1, p1q1);
     flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
-    hev = _mm_subs_epu8(flat, thresh);
+    hev = _mm_subs_epu8(flat, thresh_v);
     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
 
     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
-    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
     mask = _mm_max_epu8(abs_p1p0, mask);
@@ -980,7 +978,7 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p,
     work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
     mask = _mm_max_epu8(work, mask);
     mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
-    mask = _mm_subs_epu8(mask, limit);
+    mask = _mm_subs_epu8(mask, limit_v);
     mask = _mm_cmpeq_epi8(mask, zero);
 
     // flat_mask4
@@ -998,14 +996,22 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p,
     unsigned char *src = s;
     {
       __m128i workp_a, workp_b, workp_shft;
-      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
-      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
-      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
-      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
-      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
-      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
-      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
-      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
+      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * pitch)),
+                             zero);
+      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * pitch)),
+                             zero);
+      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * pitch)),
+                             zero);
+      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * pitch)),
+                             zero);
+      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * pitch)),
+                             zero);
+      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * pitch)),
+                             zero);
+      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * pitch)),
+                             zero);
+      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * pitch)),
+                             zero);
 
       workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
       workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
@@ -1051,13 +1057,13 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p,
     const __m128i t80 = _mm_set1_epi8(0x80);
     const __m128i t1 = _mm_set1_epi8(0x1);
     const __m128i ps1 =
-        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)), t80);
+        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * pitch)), t80);
     const __m128i ps0 =
-        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)), t80);
+        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * pitch)), t80);
     const __m128i qs0 =
-        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)), t80);
+        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * pitch)), t80);
     const __m128i qs1 =
-        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)), t80);
+        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * pitch)), t80);
     __m128i filt;
     __m128i work_a;
     __m128i filter1, filter2;
@@ -1103,7 +1109,7 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p,
     q1 = _mm_and_si128(flat, q1);
     q1 = _mm_or_si128(work_a, q1);
 
-    work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
+    work_a = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
     q2 = _mm_loadl_epi64((__m128i *)flat_oq2);
     work_a = _mm_andnot_si128(flat, work_a);
     q2 = _mm_and_si128(flat, q2);
@@ -1121,27 +1127,25 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p,
     p1 = _mm_and_si128(flat, p1);
     p1 = _mm_or_si128(work_a, p1);
 
-    work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
+    work_a = _mm_loadu_si128((__m128i *)(s - 3 * pitch));
     p2 = _mm_loadl_epi64((__m128i *)flat_op2);
     work_a = _mm_andnot_si128(flat, work_a);
     p2 = _mm_and_si128(flat, p2);
     p2 = _mm_or_si128(work_a, p2);
 
-    _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
-    _mm_storel_epi64((__m128i *)(s - 2 * p), p1);
-    _mm_storel_epi64((__m128i *)(s - 1 * p), p0);
-    _mm_storel_epi64((__m128i *)(s + 0 * p), q0);
-    _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
-    _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
+    _mm_storel_epi64((__m128i *)(s - 3 * pitch), p2);
+    _mm_storel_epi64((__m128i *)(s - 2 * pitch), p1);
+    _mm_storel_epi64((__m128i *)(s - 1 * pitch), p0);
+    _mm_storel_epi64((__m128i *)(s + 0 * pitch), q0);
+    _mm_storel_epi64((__m128i *)(s + 1 * pitch), q1);
+    _mm_storel_epi64((__m128i *)(s + 2 * pitch), q2);
   }
 }
 
-void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
-                                    const uint8_t *_limit0,
-                                    const uint8_t *_thresh0,
-                                    const uint8_t *_blimit1,
-                                    const uint8_t *_limit1,
-                                    const uint8_t *_thresh1) {
+void vpx_lpf_horizontal_8_dual_sse2(
+    uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1) {
   DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
   DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
   DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
@@ -1150,26 +1154,26 @@ void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
   DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
   const __m128i zero = _mm_set1_epi16(0);
   const __m128i blimit =
-      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
-                         _mm_load_si128((const __m128i *)_blimit1));
+      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)blimit0),
+                         _mm_load_si128((const __m128i *)blimit1));
   const __m128i limit =
-      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0),
-                         _mm_load_si128((const __m128i *)_limit1));
+      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)limit0),
+                         _mm_load_si128((const __m128i *)limit1));
   const __m128i thresh =
-      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
-                         _mm_load_si128((const __m128i *)_thresh1));
+      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)thresh0),
+                         _mm_load_si128((const __m128i *)thresh1));
 
   __m128i mask, hev, flat;
   __m128i p3, p2, p1, p0, q0, q1, q2, q3;
 
-  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
-  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
-  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
-  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
-  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
-  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
-  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
-  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
+  p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch));
+  p2 = _mm_loadu_si128((__m128i *)(s - 3 * pitch));
+  p1 = _mm_loadu_si128((__m128i *)(s - 2 * pitch));
+  p0 = _mm_loadu_si128((__m128i *)(s - 1 * pitch));
+  q0 = _mm_loadu_si128((__m128i *)(s - 0 * pitch));
+  q1 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
+  q2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
+  q3 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
   {
     const __m128i abs_p1p0 =
         _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
@@ -1228,14 +1232,22 @@ void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
 
     do {
       __m128i workp_a, workp_b, workp_shft;
-      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
-      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
-      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
-      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
-      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
-      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
-      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
-      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
+      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * pitch)),
+                             zero);
+      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * pitch)),
+                             zero);
+      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * pitch)),
+                             zero);
+      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * pitch)),
+                             zero);
+      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * pitch)),
+                             zero);
+      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * pitch)),
+                             zero);
+      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * pitch)),
+                             zero);
+      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * pitch)),
+                             zero);
 
       workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
       workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
@@ -1287,13 +1299,13 @@ void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
     const __m128i t7f = _mm_set1_epi8(0x7f);
 
     const __m128i ps1 =
-        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
+        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * pitch)), t80);
     const __m128i ps0 =
-        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
+        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * pitch)), t80);
     const __m128i qs0 =
-        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
+        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * pitch)), t80);
     const __m128i qs1 =
-        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);
+        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * pitch)), t80);
     __m128i filt;
     __m128i work_a;
     __m128i filter1, filter2;
@@ -1345,7 +1357,7 @@ void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
     q1 = _mm_and_si128(flat, q1);
     q1 = _mm_or_si128(work_a, q1);
 
-    work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
+    work_a = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
     q2 = _mm_load_si128((__m128i *)flat_oq2);
     work_a = _mm_andnot_si128(flat, work_a);
     q2 = _mm_and_si128(flat, q2);
@@ -1363,49 +1375,49 @@ void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
     p1 = _mm_and_si128(flat, p1);
     p1 = _mm_or_si128(work_a, p1);
 
-    work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
+    work_a = _mm_loadu_si128((__m128i *)(s - 3 * pitch));
     p2 = _mm_load_si128((__m128i *)flat_op2);
     work_a = _mm_andnot_si128(flat, work_a);
     p2 = _mm_and_si128(flat, p2);
     p2 = _mm_or_si128(work_a, p2);
 
-    _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
-    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
-    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
-    _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
-    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
-    _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
+    _mm_storeu_si128((__m128i *)(s - 3 * pitch), p2);
+    _mm_storeu_si128((__m128i *)(s - 2 * pitch), p1);
+    _mm_storeu_si128((__m128i *)(s - 1 * pitch), p0);
+    _mm_storeu_si128((__m128i *)(s + 0 * pitch), q0);
+    _mm_storeu_si128((__m128i *)(s + 1 * pitch), q1);
+    _mm_storeu_si128((__m128i *)(s + 2 * pitch), q2);
   }
 }
 
-void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
-                                    const unsigned char *_blimit0,
-                                    const unsigned char *_limit0,
-                                    const unsigned char *_thresh0,
-                                    const unsigned char *_blimit1,
-                                    const unsigned char *_limit1,
-                                    const unsigned char *_thresh1) {
+void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int pitch,
+                                    const unsigned char *blimit0,
+                                    const unsigned char *limit0,
+                                    const unsigned char *thresh0,
+                                    const unsigned char *blimit1,
+                                    const unsigned char *limit1,
+                                    const unsigned char *thresh1) {
   const __m128i blimit =
-      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
-                         _mm_load_si128((const __m128i *)_blimit1));
+      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)blimit0),
+                         _mm_load_si128((const __m128i *)blimit1));
   const __m128i limit =
-      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0),
-                         _mm_load_si128((const __m128i *)_limit1));
+      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)limit0),
+                         _mm_load_si128((const __m128i *)limit1));
   const __m128i thresh =
-      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
-                         _mm_load_si128((const __m128i *)_thresh1));
+      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)thresh0),
+                         _mm_load_si128((const __m128i *)thresh1));
   const __m128i zero = _mm_set1_epi16(0);
   __m128i p3, p2, p1, p0, q0, q1, q2, q3;
   __m128i mask, hev, flat;
 
-  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
-  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
-  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
-  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
-  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
-  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
-  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
-  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
+  p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch));
+  p2 = _mm_loadu_si128((__m128i *)(s - 3 * pitch));
+  p1 = _mm_loadu_si128((__m128i *)(s - 2 * pitch));
+  p0 = _mm_loadu_si128((__m128i *)(s - 1 * pitch));
+  q0 = _mm_loadu_si128((__m128i *)(s - 0 * pitch));
+  q1 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
+  q2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
+  q3 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
 
   // filter_mask and hev_mask
   {
@@ -1456,13 +1468,13 @@ void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
     const __m128i t7f = _mm_set1_epi8(0x7f);
 
     const __m128i ps1 =
-        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
+        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * pitch)), t80);
     const __m128i ps0 =
-        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
+        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * pitch)), t80);
     const __m128i qs0 =
-        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
+        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * pitch)), t80);
     const __m128i qs1 =
-        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);
+        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * pitch)), t80);
     __m128i filt;
     __m128i work_a;
     __m128i filter1, filter2;
@@ -1507,10 +1519,10 @@ void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
     p0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
     p1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
 
-    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
-    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
-    _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
-    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+    _mm_storeu_si128((__m128i *)(s - 2 * pitch), p1);
+    _mm_storeu_si128((__m128i *)(s - 1 * pitch), p0);
+    _mm_storeu_si128((__m128i *)(s + 0 * pitch), q0);
+    _mm_storeu_si128((__m128i *)(s + 1 * pitch), q1);
   }
 }
 
@@ -1650,7 +1662,7 @@ static INLINE void transpose(unsigned char *src[], int in_p,
   } while (++idx8x8 < num_8x8_to_transpose);
 }
 
-void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
+void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0,
                                   const uint8_t *limit0, const uint8_t *thresh0,
                                   const uint8_t *blimit1, const uint8_t *limit1,
                                   const uint8_t *thresh1) {
@@ -1659,7 +1671,7 @@ void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
   unsigned char *dst[2];
 
   // Transpose 8x16
-  transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
+  transpose8x16(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16);
 
   // Loop filtering
   vpx_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
@@ -1667,13 +1679,13 @@ void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
   src[0] = t_dst;
   src[1] = t_dst + 8;
   dst[0] = s - 4;
-  dst[1] = s - 4 + p * 8;
+  dst[1] = s - 4 + pitch * 8;
 
   // Transpose back
-  transpose(src, 16, dst, p, 2);
+  transpose(src, 16, dst, pitch, 2);
 }
 
-void vpx_lpf_vertical_8_sse2(unsigned char *s, int p,
+void vpx_lpf_vertical_8_sse2(unsigned char *s, int pitch,
                              const unsigned char *blimit,
                              const unsigned char *limit,
                              const unsigned char *thresh) {
@@ -1685,7 +1697,7 @@ void vpx_lpf_vertical_8_sse2(unsigned char *s, int p,
   src[0] = s - 4;
   dst[0] = t_dst;
 
-  transpose(src, p, dst, 8, 1);
+  transpose(src, pitch, dst, 8, 1);
 
   // Loop filtering
   vpx_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh);
@@ -1694,10 +1706,10 @@ void vpx_lpf_vertical_8_sse2(unsigned char *s, int p,
   dst[0] = s - 4;
 
   // Transpose back
-  transpose(src, 8, dst, p, 1);
+  transpose(src, 8, dst, pitch, 1);
 }
 
-void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
+void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0,
                                   const uint8_t *limit0, const uint8_t *thresh0,
                                   const uint8_t *blimit1, const uint8_t *limit1,
                                   const uint8_t *thresh1) {
@@ -1706,7 +1718,7 @@ void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
   unsigned char *dst[2];
 
   // Transpose 8x16
-  transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
+  transpose8x16(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16);
 
   // Loop filtering
   vpx_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
@@ -1715,13 +1727,13 @@ void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
   src[1] = t_dst + 8;
 
   dst[0] = s - 4;
-  dst[1] = s - 4 + p * 8;
+  dst[1] = s - 4 + pitch * 8;
 
   // Transpose back
-  transpose(src, 16, dst, p, 2);
+  transpose(src, 16, dst, pitch, 2);
 }
 
-void vpx_lpf_vertical_16_sse2(unsigned char *s, int p,
+void vpx_lpf_vertical_16_sse2(unsigned char *s, int pitch,
                               const unsigned char *blimit,
                               const unsigned char *limit,
                               const unsigned char *thresh) {
@@ -1735,7 +1747,7 @@ void vpx_lpf_vertical_16_sse2(unsigned char *s, int p,
   dst[1] = t_dst + 8 * 8;
 
   // Transpose 16x8
-  transpose(src, p, dst, 8, 2);
+  transpose(src, pitch, dst, 8, 2);
 
   // Loop filtering
   vpx_lpf_horizontal_16_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh);
@@ -1746,22 +1758,22 @@ void vpx_lpf_vertical_16_sse2(unsigned char *s, int p,
   dst[1] = s;
 
   // Transpose back
-  transpose(src, 8, dst, p, 2);
+  transpose(src, 8, dst, pitch, 2);
 }
 
-void vpx_lpf_vertical_16_dual_sse2(unsigned char *s, int p,
+void vpx_lpf_vertical_16_dual_sse2(unsigned char *s, int pitch,
                                    const uint8_t *blimit, const uint8_t *limit,
                                    const uint8_t *thresh) {
   DECLARE_ALIGNED(16, unsigned char, t_dst[256]);
 
   // Transpose 16x16
-  transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
-  transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);
+  transpose8x16(s - 8, s - 8 + 8 * pitch, pitch, t_dst, 16);
+  transpose8x16(s, s + 8 * pitch, pitch, t_dst + 8 * 16, 16);
 
   // Loop filtering
   vpx_lpf_horizontal_16_dual_sse2(t_dst + 8 * 16, 16, blimit, limit, thresh);
 
   // Transpose back
-  transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);
-  transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);
+  transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, pitch);
+  transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * pitch, pitch);
 }
vpx_dsp/x86/quantize_avx.c
index 6f44890..692cf59 100644 (file)
@@ -24,8 +24,8 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                         const int16_t *round_ptr, const int16_t *quant_ptr,
                         const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
                         tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-                        uint16_t *eob_ptr, const int16_t *scan_ptr,
-                        const int16_t *iscan_ptr) {
+                        uint16_t *eob_ptr, const int16_t *scan,
+                        const int16_t *iscan) {
   const __m128i zero = _mm_setzero_si128();
   const __m256i big_zero = _mm256_setzero_si256();
   int index;
@@ -37,7 +37,7 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
   __m128i all_zero;
   __m128i eob = zero, eob0;
 
-  (void)scan_ptr;
+  (void)scan;
   (void)skip_block;
   assert(!skip_block);
 
@@ -97,8 +97,7 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
     store_tran_low(coeff0, dqcoeff_ptr);
     store_tran_low(coeff1, dqcoeff_ptr + 8);
 
-    eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0,
-                       zero);
+    eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
   }
 
   // AC only loop.
@@ -141,20 +140,22 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
     store_tran_low(coeff0, dqcoeff_ptr + index);
     store_tran_low(coeff1, dqcoeff_ptr + index + 8);
 
-    eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr,
-                        index, zero);
+    eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, index,
+                        zero);
     eob = _mm_max_epi16(eob, eob0);
   }
 
   *eob_ptr = accumulate_eob(eob);
 }
 
-void vpx_quantize_b_32x32_avx(
-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
-    const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
-    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
-    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
-    const int16_t *scan_ptr, const int16_t *iscan_ptr) {
+void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                              int skip_block, const int16_t *zbin_ptr,
+                              const int16_t *round_ptr,
+                              const int16_t *quant_ptr,
+                              const int16_t *quant_shift_ptr,
+                              tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                              const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                              const int16_t *scan, const int16_t *iscan) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i one = _mm_set1_epi16(1);
   const __m256i big_zero = _mm256_setzero_si256();
@@ -167,7 +168,7 @@ void vpx_quantize_b_32x32_avx(
   __m128i all_zero;
   __m128i eob = zero, eob0;
 
-  (void)scan_ptr;
+  (void)scan;
   (void)n_coeffs;
   (void)skip_block;
   assert(!skip_block);
@@ -253,8 +254,7 @@ void vpx_quantize_b_32x32_avx(
     store_tran_low(coeff0, dqcoeff_ptr);
     store_tran_low(coeff1, dqcoeff_ptr + 8);
 
-    eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0,
-                       zero);
+    eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
   }
 
   // AC only loop.
@@ -306,8 +306,8 @@ void vpx_quantize_b_32x32_avx(
     store_tran_low(coeff0, dqcoeff_ptr + index);
     store_tran_low(coeff1, dqcoeff_ptr + index + 8);
 
-    eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr,
-                        index, zero);
+    eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, index,
+                        zero);
     eob = _mm_max_epi16(eob, eob0);
   }
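In the quantize kernels only the renamed iscan is consumed; scan stays in the signature because the rtcd table dispatches every variant through one shared prototype, so the unused argument is voided rather than dropped. A small sketch of that idiom, with illustrative names (scan_fn and max_iscan_variant are not vpx API):

    #include <stddef.h>
    #include <stdint.h>

    /* Shared prototype forced on every variant by a dispatch table. */
    typedef int16_t (*scan_fn)(const int16_t *scan, const int16_t *iscan,
                               size_t n);

    static int16_t max_iscan_variant(const int16_t *scan,
                                     const int16_t *iscan, size_t n) {
      size_t i;
      int16_t max = 0;
      (void)scan; /* required by the shared signature, unused here */
      for (i = 0; i < n; i++) {
        if (iscan[i] > max) max = iscan[i];
      }
      return max;
    }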
 
vpx_dsp/x86/quantize_sse2.c
index c020b39..327fb05 100644 (file)
@@ -22,8 +22,8 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                          const int16_t *round_ptr, const int16_t *quant_ptr,
                          const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
                          tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-                         uint16_t *eob_ptr, const int16_t *scan_ptr,
-                         const int16_t *iscan_ptr) {
+                         uint16_t *eob_ptr, const int16_t *scan,
+                         const int16_t *iscan) {
   const __m128i zero = _mm_setzero_si128();
   int index = 16;
 
@@ -33,7 +33,7 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
   __m128i cmp_mask0, cmp_mask1;
   __m128i eob, eob0;
 
-  (void)scan_ptr;
+  (void)scan;
   (void)skip_block;
   assert(!skip_block);
 
@@ -81,8 +81,7 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
   store_tran_low(coeff0, dqcoeff_ptr);
   store_tran_low(coeff1, dqcoeff_ptr + 8);
 
-  eob =
-      scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, zero);
+  eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
 
   // AC only loop.
   while (index < n_coeffs) {
@@ -115,8 +114,8 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
     store_tran_low(coeff0, dqcoeff_ptr + index);
     store_tran_low(coeff1, dqcoeff_ptr + index + 8);
 
-    eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr,
-                        index, zero);
+    eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, index,
+                        zero);
     eob = _mm_max_epi16(eob, eob0);
 
     index += 16;
vpx_dsp/x86/quantize_ssse3.c
index 3f528e1..d8b6dc7 100644 (file)
@@ -22,7 +22,7 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                           const int16_t *quant_shift_ptr,
                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                           const int16_t *dequant_ptr, uint16_t *eob_ptr,
-                          const int16_t *scan_ptr, const int16_t *iscan_ptr) {
+                          const int16_t *scan, const int16_t *iscan) {
   const __m128i zero = _mm_setzero_si128();
   int index = 16;
 
@@ -32,7 +32,7 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
   __m128i cmp_mask0, cmp_mask1;
   __m128i eob, eob0;
 
-  (void)scan_ptr;
+  (void)scan;
   (void)skip_block;
   assert(!skip_block);
 
@@ -74,8 +74,7 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
   store_tran_low(coeff0, dqcoeff_ptr);
   store_tran_low(coeff1, dqcoeff_ptr + 8);
 
-  eob =
-      scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, zero);
+  eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
 
   // AC only loop.
   while (index < n_coeffs) {
@@ -106,8 +105,8 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
     store_tran_low(coeff0, dqcoeff_ptr + index);
     store_tran_low(coeff1, dqcoeff_ptr + index + 8);
 
-    eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr,
-                        index, zero);
+    eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, index,
+                        zero);
     eob = _mm_max_epi16(eob, eob0);
 
     index += 16;
@@ -116,12 +115,14 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
   *eob_ptr = accumulate_eob(eob);
 }
 
-void vpx_quantize_b_32x32_ssse3(
-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
-    const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
-    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
-    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
-    const int16_t *scan_ptr, const int16_t *iscan_ptr) {
+void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                                int skip_block, const int16_t *zbin_ptr,
+                                const int16_t *round_ptr,
+                                const int16_t *quant_ptr,
+                                const int16_t *quant_shift_ptr,
+                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                                const int16_t *scan, const int16_t *iscan) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i one = _mm_set1_epi16(1);
   int index;
@@ -133,7 +134,7 @@ void vpx_quantize_b_32x32_ssse3(
   __m128i all_zero;
   __m128i eob = zero, eob0;
 
-  (void)scan_ptr;
+  (void)scan;
   (void)n_coeffs;
   (void)skip_block;
   assert(!skip_block);
@@ -226,8 +227,7 @@ void vpx_quantize_b_32x32_ssse3(
     store_tran_low(coeff0, dqcoeff_ptr);
     store_tran_low(coeff1, dqcoeff_ptr + 8);
 
-    eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0,
-                       zero);
+    eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
   }
 
   // AC only loop.
@@ -283,8 +283,8 @@ void vpx_quantize_b_32x32_ssse3(
     store_tran_low(coeff0, dqcoeff_ptr + index);
     store_tran_low(coeff1, dqcoeff_ptr + index + 8);
 
-    eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr,
-                        index, zero);
+    eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, index,
+                        zero);
     eob = _mm_max_epi16(eob, eob0);
   }
 
vpx_dsp/x86/quantize_x86.h
index bb9e32f..a6e2f27 100644 (file)
@@ -48,17 +48,17 @@ static INLINE __m128i calculate_dqcoeff(__m128i qcoeff, __m128i dequant) {
   return _mm_mullo_epi16(qcoeff, dequant);
 }
 
-// Scan 16 values for eob reference in scan_ptr. Use masks (-1) from comparing
-// to zbin to add 1 to the index in 'scan'.
+// Scan 16 values for eob reference in scan. Use masks (-1) from comparing to
+// zbin to add 1 to the index in 'scan'.
 static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1,
                                    const __m128i zbin_mask0,
                                    const __m128i zbin_mask1,
-                                   const int16_t *scan_ptr, const int index,
+                                   const int16_t *scan, const int index,
                                    const __m128i zero) {
   const __m128i zero_coeff0 = _mm_cmpeq_epi16(*coeff0, zero);
   const __m128i zero_coeff1 = _mm_cmpeq_epi16(*coeff1, zero);
-  __m128i scan0 = _mm_load_si128((const __m128i *)(scan_ptr + index));
-  __m128i scan1 = _mm_load_si128((const __m128i *)(scan_ptr + index + 8));
+  __m128i scan0 = _mm_load_si128((const __m128i *)(scan + index));
+  __m128i scan1 = _mm_load_si128((const __m128i *)(scan + index + 8));
   __m128i eob0, eob1;
   // Add one to convert from indices to counts
   scan0 = _mm_sub_epi16(scan0, zbin_mask0);
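The comment above is the heart of the eob bookkeeping: iscan holds 0-based scan positions, so adding one turns an index into a count, and the per-lane maximum (later reduced by accumulate_eob) yields the end-of-block value. A scalar model of what scan_for_eob computes, offered as a sketch (the real code keeps sixteen lanes in __m128i registers):

    #include <stdint.h>

    /* eob is one past the highest inverse-scan position holding a nonzero
       quantized coefficient, or 0 when the block is all zero. */
    static uint16_t scalar_eob(const int16_t *qcoeff, const int16_t *iscan,
                               int n_coeffs) {
      int i;
      uint16_t eob = 0;
      for (i = 0; i < n_coeffs; i++) {
        if (qcoeff[i] != 0) {
          const uint16_t count = (uint16_t)(iscan[i] + 1); /* index->count */
          if (count > eob) eob = count;
        }
      }
      return eob;
    }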
vpx_dsp/x86/sad4d_avx2.c
index 2c6b36f..b18fecf 100644 (file)
 #include "vpx/vpx_integer.h"
 
 static INLINE void calc_final(const __m256i *const sums /*[4]*/,
-                              uint32_t res[4]) {
+                              uint32_t sad_array[4]) {
   const __m256i t0 = _mm256_hadd_epi32(sums[0], sums[1]);
   const __m256i t1 = _mm256_hadd_epi32(sums[2], sums[3]);
   const __m256i t2 = _mm256_hadd_epi32(t0, t1);
   const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t2),
                                     _mm256_extractf128_si256(t2, 1));
-  _mm_storeu_si128((__m128i *)res, sum);
+  _mm_storeu_si128((__m128i *)sad_array, sum);
 }
 
-void vpx_sad32x32x4d_avx2(const uint8_t *src, int src_stride,
-                          const uint8_t *const ref[4], int ref_stride,
-                          uint32_t res[4]) {
+void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride,
+                          const uint8_t *const ref_array[4], int ref_stride,
+                          uint32_t sad_array[4]) {
   int i;
   const uint8_t *refs[4];
   __m256i sums[4];
 
-  refs[0] = ref[0];
-  refs[1] = ref[1];
-  refs[2] = ref[2];
-  refs[3] = ref[3];
+  refs[0] = ref_array[0];
+  refs[1] = ref_array[1];
+  refs[2] = ref_array[2];
+  refs[3] = ref_array[3];
   sums[0] = _mm256_setzero_si256();
   sums[1] = _mm256_setzero_si256();
   sums[2] = _mm256_setzero_si256();
@@ -40,46 +40,46 @@ void vpx_sad32x32x4d_avx2(const uint8_t *src, int src_stride,
   for (i = 0; i < 32; i++) {
     __m256i r[4];
 
-    // load src and all refs
-    const __m256i s = _mm256_load_si256((const __m256i *)src);
+    // load src and all ref[]
+    const __m256i s = _mm256_load_si256((const __m256i *)src_ptr);
     r[0] = _mm256_loadu_si256((const __m256i *)refs[0]);
     r[1] = _mm256_loadu_si256((const __m256i *)refs[1]);
     r[2] = _mm256_loadu_si256((const __m256i *)refs[2]);
     r[3] = _mm256_loadu_si256((const __m256i *)refs[3]);
 
-    // sum of the absolute differences between every ref-i to src
+    // sum of the absolute differences between each ref[] and src
     r[0] = _mm256_sad_epu8(r[0], s);
     r[1] = _mm256_sad_epu8(r[1], s);
     r[2] = _mm256_sad_epu8(r[2], s);
     r[3] = _mm256_sad_epu8(r[3], s);
 
-    // sum every ref-i
+    // sum every ref[]
     sums[0] = _mm256_add_epi32(sums[0], r[0]);
     sums[1] = _mm256_add_epi32(sums[1], r[1]);
     sums[2] = _mm256_add_epi32(sums[2], r[2]);
     sums[3] = _mm256_add_epi32(sums[3], r[3]);
 
-    src += src_stride;
+    src_ptr += src_stride;
     refs[0] += ref_stride;
     refs[1] += ref_stride;
     refs[2] += ref_stride;
     refs[3] += ref_stride;
   }
 
-  calc_final(sums, res);
+  calc_final(sums, sad_array);
 }
 
-void vpx_sad64x64x4d_avx2(const uint8_t *src, int src_stride,
-                          const uint8_t *const ref[4], int ref_stride,
-                          uint32_t res[4]) {
+void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride,
+                          const uint8_t *const ref_array[4], int ref_stride,
+                          uint32_t sad_array[4]) {
   __m256i sums[4];
   int i;
   const uint8_t *refs[4];
 
-  refs[0] = ref[0];
-  refs[1] = ref[1];
-  refs[2] = ref[2];
-  refs[3] = ref[3];
+  refs[0] = ref_array[0];
+  refs[1] = ref_array[1];
+  refs[2] = ref_array[2];
+  refs[3] = ref_array[3];
   sums[0] = _mm256_setzero_si256();
   sums[1] = _mm256_setzero_si256();
   sums[2] = _mm256_setzero_si256();
@@ -87,9 +87,9 @@ void vpx_sad64x64x4d_avx2(const uint8_t *src, int src_stride,
 
   for (i = 0; i < 64; i++) {
     __m256i r_lo[4], r_hi[4];
-    // load 64 bytes from src and all refs
-    const __m256i s_lo = _mm256_load_si256((const __m256i *)src);
-    const __m256i s_hi = _mm256_load_si256((const __m256i *)(src + 32));
+    // load 64 bytes from src and all ref[]
+    const __m256i s_lo = _mm256_load_si256((const __m256i *)src_ptr);
+    const __m256i s_hi = _mm256_load_si256((const __m256i *)(src_ptr + 32));
     r_lo[0] = _mm256_loadu_si256((const __m256i *)refs[0]);
     r_hi[0] = _mm256_loadu_si256((const __m256i *)(refs[0] + 32));
     r_lo[1] = _mm256_loadu_si256((const __m256i *)refs[1]);
@@ -99,7 +99,7 @@ void vpx_sad64x64x4d_avx2(const uint8_t *src, int src_stride,
     r_lo[3] = _mm256_loadu_si256((const __m256i *)refs[3]);
     r_hi[3] = _mm256_loadu_si256((const __m256i *)(refs[3] + 32));
 
-    // sum of the absolute differences between every ref-i to src
+    // sum of the absolute differences between each ref[] and src
     r_lo[0] = _mm256_sad_epu8(r_lo[0], s_lo);
     r_lo[1] = _mm256_sad_epu8(r_lo[1], s_lo);
     r_lo[2] = _mm256_sad_epu8(r_lo[2], s_lo);
@@ -109,7 +109,7 @@ void vpx_sad64x64x4d_avx2(const uint8_t *src, int src_stride,
     r_hi[2] = _mm256_sad_epu8(r_hi[2], s_hi);
     r_hi[3] = _mm256_sad_epu8(r_hi[3], s_hi);
 
-    // sum every ref-i
+    // sum every ref[]
     sums[0] = _mm256_add_epi32(sums[0], r_lo[0]);
     sums[1] = _mm256_add_epi32(sums[1], r_lo[1]);
     sums[2] = _mm256_add_epi32(sums[2], r_lo[2]);
@@ -119,12 +119,12 @@ void vpx_sad64x64x4d_avx2(const uint8_t *src, int src_stride,
     sums[2] = _mm256_add_epi32(sums[2], r_hi[2]);
     sums[3] = _mm256_add_epi32(sums[3], r_hi[3]);
 
-    src += src_stride;
+    src_ptr += src_stride;
     refs[0] += ref_stride;
     refs[1] += ref_stride;
     refs[2] += ref_stride;
     refs[3] += ref_stride;
   }
 
-  calc_final(sums, res);
+  calc_final(sums, sad_array);
 }
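
Both functions above compute the same quantity and differ only in block size
and in splitting the 64-byte rows into two 32-byte halves before calc_final
reduces the four accumulators. A plain-C reference sketch of the 4-reference
SAD (width/height parameterized for illustration; not the library's C
fallback verbatim):

    #include <stdint.h>
    #include <stdlib.h> /* abs() */

    /* sad_array[i] = sum over the block of |src - ref_array[i]|. */
    static void sad_4d_sketch(const uint8_t *src_ptr, int src_stride,
                              const uint8_t *const ref_array[4],
                              int ref_stride, int width, int height,
                              uint32_t sad_array[4]) {
      int r, x, y;
      for (r = 0; r < 4; r++) {
        const uint8_t *src = src_ptr;
        const uint8_t *ref = ref_array[r];
        uint32_t sad = 0;
        for (y = 0; y < height; y++) {
          for (x = 0; x < width; x++) sad += abs(src[x] - ref[x]);
          src += src_stride;
          ref += ref_stride;
        }
        sad_array[r] = sad;
      }
    }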
index 5f2ab6e..4c5d704 100644 (file)
@@ -11,8 +11,8 @@
 #include "./vpx_dsp_rtcd.h"
 #include "vpx/vpx_integer.h"
 
-void vpx_sad64x64x4d_avx512(const uint8_t *src, int src_stride,
-                            const uint8_t *const ref[4], int ref_stride,
+void vpx_sad64x64x4d_avx512(const uint8_t *src_ptr, int src_stride,
+                            const uint8_t *const ref_array[4], int ref_stride,
                             uint32_t res[4]) {
   __m512i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg;
   __m512i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
@@ -20,33 +20,33 @@ void vpx_sad64x64x4d_avx512(const uint8_t *src, int src_stride,
   int i;
   const uint8_t *ref0, *ref1, *ref2, *ref3;
 
-  ref0 = ref[0];
-  ref1 = ref[1];
-  ref2 = ref[2];
-  ref3 = ref[3];
+  ref0 = ref_array[0];
+  ref1 = ref_array[1];
+  ref2 = ref_array[2];
+  ref3 = ref_array[3];
   sum_ref0 = _mm512_set1_epi16(0);
   sum_ref1 = _mm512_set1_epi16(0);
   sum_ref2 = _mm512_set1_epi16(0);
   sum_ref3 = _mm512_set1_epi16(0);
   for (i = 0; i < 64; i++) {
-    // load src and all refs
-    src_reg = _mm512_loadu_si512((const __m512i *)src);
+    // load src and all ref[]
+    src_reg = _mm512_loadu_si512((const __m512i *)src_ptr);
     ref0_reg = _mm512_loadu_si512((const __m512i *)ref0);
     ref1_reg = _mm512_loadu_si512((const __m512i *)ref1);
     ref2_reg = _mm512_loadu_si512((const __m512i *)ref2);
     ref3_reg = _mm512_loadu_si512((const __m512i *)ref3);
-    // sum of the absolute differences between every ref-i to src
+    // sum of the absolute differences between each ref[] and src
     ref0_reg = _mm512_sad_epu8(ref0_reg, src_reg);
     ref1_reg = _mm512_sad_epu8(ref1_reg, src_reg);
     ref2_reg = _mm512_sad_epu8(ref2_reg, src_reg);
     ref3_reg = _mm512_sad_epu8(ref3_reg, src_reg);
-    // sum every ref-i
+    // sum every ref[]
     sum_ref0 = _mm512_add_epi32(sum_ref0, ref0_reg);
     sum_ref1 = _mm512_add_epi32(sum_ref1, ref1_reg);
     sum_ref2 = _mm512_add_epi32(sum_ref2, ref2_reg);
     sum_ref3 = _mm512_add_epi32(sum_ref3, ref3_reg);
 
-    src += src_stride;
+    src_ptr += src_stride;
     ref0 += ref_stride;
     ref1 += ref_stride;
     ref2 += ref_stride;
@@ -55,7 +55,7 @@ void vpx_sad64x64x4d_avx512(const uint8_t *src, int src_stride,
   {
     __m256i sum256;
     __m128i sum128;
-    // in sum_ref-i the result is saved in the first 4 bytes
+    // in sum_ref[] the result is saved in the first 4 bytes
     // the other 4 bytes are zeroed.
     // the other 4 bytes are zeroed.
     // sum_ref1 and sum_ref3 are shifted left by 4 bytes
     sum_ref1 = _mm512_bslli_epi128(sum_ref1, 4);
@@ -65,7 +65,7 @@ void vpx_sad64x64x4d_avx512(const uint8_t *src, int src_stride,
     sum_ref0 = _mm512_or_si512(sum_ref0, sum_ref1);
     sum_ref2 = _mm512_or_si512(sum_ref2, sum_ref3);
 
-    // merge every 64 bit from each sum_ref-i
+    // merge every 64 bit from each sum_ref[]
     sum_mlow = _mm512_unpacklo_epi64(sum_ref0, sum_ref2);
     sum_mhigh = _mm512_unpackhi_epi64(sum_ref0, sum_ref2);
 
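The shift/or/unpack sequence above is lane plumbing: it gathers the 32-bit
partial from each 64-bit group of every accumulator so that one final add can
fill res[4]. Stripped of the SIMD layout, the per-reference computation is
simply (a hypothetical scalar view):

    #include <stdint.h>

    /* Each 512-bit accumulator holds eight 64-bit groups whose low 4 bytes
     * carry a partial SAD (the high 4 bytes are zero, per the comment
     * above); the final result is their sum. */
    static uint32_t reduce_sad_partials(const uint32_t partials[8]) {
      uint32_t total = 0;
      int i;
      for (i = 0; i < 8; i++) total += partials[i];
      return total;
    }
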
index d83507d..e6e72b8 100644 (file)
@@ -45,7 +45,7 @@
 
     ;Compute max and min values of a pixel
     mov         rdx, 0x00010001
-    movsxd      rcx, DWORD PTR arg(6)      ;bps
+    movsxd      rcx, DWORD PTR arg(6)      ;bd
     movq        xmm0, rdx
     movq        xmm1, rcx
     pshufd      xmm0, xmm0, 0b
 
     ;Compute max and min values of a pixel
     mov         rdx, 0x00010001
-    movsxd      rcx, DWORD PTR arg(6)       ;bps
+    movsxd      rcx, DWORD PTR arg(6)       ;bd
     movq        xmm0, rdx
     movq        xmm1, rcx
     pshufd      xmm0, xmm0, 0b
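
The renamed comment matters for readers: arg(6) is the bit depth (bd), not
"bps", and the broadcast 0x00010001 next to it is presumably shifted by bd to
build the clamping limits (the instruction sequence continues past the hunks
shown here). A rough C equivalent of the values being constructed, assuming
unsigned pixels; the same pattern recurs in the bilinear file below:

    /* Assumed intent: min/max representable pixel values at bit depth bd. */
    static void highbd_pixel_limits(int bd, int *min, int *max) {
      *min = 0;             /* pixels are unsigned */
      *max = (1 << bd) - 1; /* 255, 1023, or 4095 for bd = 8, 10, 12 */
    }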
index 9bffe50..87bf75e 100644 (file)
@@ -26,7 +26,7 @@
     pshufd      xmm3, xmm3, 0
 
     mov         rdx, 0x00010001
-    movsxd      rcx, DWORD PTR arg(6)       ;bps
+    movsxd      rcx, DWORD PTR arg(6)       ;bd
     movq        xmm5, rdx
     movq        xmm2, rcx
     pshufd      xmm5, xmm5, 0b
@@ -82,7 +82,7 @@
     pshufd      xmm4, xmm4, 0
 
     mov         rdx, 0x00010001
-    movsxd      rcx, DWORD PTR arg(6)       ;bps
+    movsxd      rcx, DWORD PTR arg(6)       ;bd
     movq        xmm8, rdx
     movq        xmm5, rcx
     pshufd      xmm8, xmm8, 0b