clang-tidy: fix vpx_dsp parameters
author    Johann <johann.koenig@duck.com>
          Tue, 30 Oct 2018 19:59:46 +0000 (12:59 -0700)
committer Johann <johann.koenig@duck.com>
          Thu, 1 Nov 2018 19:14:14 +0000 (12:14 -0700)
Rename function parameters so the definitions match their declarations,
quieting clang-tidy's readability-inconsistent-declaration-parameter-name
check.

BUG=webm:1444

Change-Id: Iee19be068afc6c81396c79218a89c469d2e66207

29 files changed:
vpx_dsp/arm/quantize_neon.c
vpx_dsp/arm/sad4d_neon.c
vpx_dsp/arm/sad_neon.c
vpx_dsp/bitwriter.h
vpx_dsp/deblock.c
vpx_dsp/fwd_txfm.c
vpx_dsp/inv_txfm.c
vpx_dsp/loopfilter.c
vpx_dsp/mips/deblock_msa.c
vpx_dsp/quantize.c
vpx_dsp/quantize.h
vpx_dsp/sad.c
vpx_dsp/subtract.c
vpx_dsp/vpx_dsp_rtcd_defs.pl
vpx_dsp/x86/avg_pred_sse2.c
vpx_dsp/x86/convolve.h
vpx_dsp/x86/highbd_convolve_avx2.c
vpx_dsp/x86/highbd_intrapred_sse2.asm
vpx_dsp/x86/highbd_loopfilter_sse2.c
vpx_dsp/x86/loopfilter_avx2.c
vpx_dsp/x86/loopfilter_sse2.c
vpx_dsp/x86/quantize_avx.c
vpx_dsp/x86/quantize_sse2.c
vpx_dsp/x86/quantize_ssse3.c
vpx_dsp/x86/quantize_x86.h
vpx_dsp/x86/sad4d_avx2.c
vpx_dsp/x86/sad4d_avx512.c
vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm
vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm

diff --git a/vpx_dsp/arm/quantize_neon.c b/vpx_dsp/arm/quantize_neon.c
index a0a1e6d..1e33851 100644
@@ -20,12 +20,12 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                          const int16_t *round_ptr, const int16_t *quant_ptr,
                          const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
                          tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-                         uint16_t *eob_ptr, const int16_t *scan_ptr,
-                         const int16_t *iscan_ptr) {
+                         uint16_t *eob_ptr, const int16_t *scan,
+                         const int16_t *iscan) {
   const int16x8_t one = vdupq_n_s16(1);
   const int16x8_t neg_one = vdupq_n_s16(-1);
   uint16x8_t eob_max;
-  (void)scan_ptr;
+  (void)scan;
   (void)skip_block;
   assert(!skip_block);
 
@@ -38,8 +38,8 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
     const int16x8_t quant_shift = vld1q_s16(quant_shift_ptr);
     const int16x8_t dequant = vld1q_s16(dequant_ptr);
     // Add one because the eob does not index from 0.
-    const uint16x8_t iscan =
-        vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one));
+    const uint16x8_t v_iscan =
+        vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one));
 
     const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
     const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
@@ -65,10 +65,10 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
     qcoeff = vandq_s16(qcoeff, zbin_mask);
 
     // Set non-zero elements to -1 and use that to extract values for eob.
-    eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), iscan);
+    eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan);
 
     coeff_ptr += 8;
-    iscan_ptr += 8;
+    iscan += 8;
 
     store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
     qcoeff_ptr += 8;
@@ -90,8 +90,8 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
 
     do {
       // Add one because the eob is not its index.
-      const uint16x8_t iscan =
-          vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one));
+      const uint16x8_t v_iscan =
+          vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one));
 
       const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
       const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
@@ -118,10 +118,10 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
 
       // Set non-zero elements to -1 and use that to extract values for eob.
       eob_max =
-          vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), iscan));
+          vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan));
 
       coeff_ptr += 8;
-      iscan_ptr += 8;
+      iscan += 8;
 
       store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
       qcoeff_ptr += 8;
@@ -150,17 +150,19 @@ static INLINE int32x4_t extract_sign_bit(int32x4_t a) {
 
 // Main difference is that zbin values are halved before comparison and dqcoeff
 // values are divided by 2. zbin is rounded but dqcoeff is not.
-void vpx_quantize_b_32x32_neon(
-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
-    const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
-    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
-    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
-    const int16_t *scan_ptr, const int16_t *iscan_ptr) {
+void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                               int skip_block, const int16_t *zbin_ptr,
+                               const int16_t *round_ptr,
+                               const int16_t *quant_ptr,
+                               const int16_t *quant_shift_ptr,
+                               tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                               const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                               const int16_t *scan, const int16_t *iscan) {
   const int16x8_t one = vdupq_n_s16(1);
   const int16x8_t neg_one = vdupq_n_s16(-1);
   uint16x8_t eob_max;
   int i;
-  (void)scan_ptr;
+  (void)scan;
   (void)n_coeffs;  // Because we will always calculate 32*32.
   (void)skip_block;
   assert(!skip_block);
@@ -174,8 +176,8 @@ void vpx_quantize_b_32x32_neon(
     const int16x8_t quant_shift = vld1q_s16(quant_shift_ptr);
     const int16x8_t dequant = vld1q_s16(dequant_ptr);
     // Add one because the eob does not index from 0.
-    const uint16x8_t iscan =
-        vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one));
+    const uint16x8_t v_iscan =
+        vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one));
 
     const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
     const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
@@ -203,10 +205,10 @@ void vpx_quantize_b_32x32_neon(
     qcoeff = vandq_s16(qcoeff, zbin_mask);
 
     // Set non-zero elements to -1 and use that to extract values for eob.
-    eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), iscan);
+    eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan);
 
     coeff_ptr += 8;
-    iscan_ptr += 8;
+    iscan += 8;
 
     store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
     qcoeff_ptr += 8;
@@ -234,8 +236,8 @@ void vpx_quantize_b_32x32_neon(
 
     for (i = 1; i < 32 * 32 / 8; ++i) {
       // Add one because the eob is not its index.
-      const uint16x8_t iscan =
-          vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one));
+      const uint16x8_t v_iscan =
+          vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one));
 
       const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
       const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
@@ -264,10 +266,10 @@ void vpx_quantize_b_32x32_neon(
 
       // Set non-zero elements to -1 and use that to extract values for eob.
       eob_max =
-          vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), iscan));
+          vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan));
 
       coeff_ptr += 8;
-      iscan_ptr += 8;
+      iscan += 8;
 
       store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
       qcoeff_ptr += 8;
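
The eob bookkeeping that both kernels above vectorize can be written in
scalar C as below. This is an illustrative sketch, not part of the patch:
the helper name is hypothetical and int16_t stands in for libvpx's
tran_low_t.

    #include <stdint.h>

    /* Scalar sketch of the eob logic: iscan gives each coefficient's
     * position in scan order, and the eob is 1-based, hence the "+ 1"
     * before taking the maximum over nonzero coefficients. */
    static uint16_t eob_from_iscan(const int16_t *qcoeff,
                                   const int16_t *iscan, intptr_t n_coeffs) {
      uint16_t eob = 0;
      intptr_t i;
      for (i = 0; i < n_coeffs; ++i) {
        if (qcoeff[i] != 0 && (uint16_t)(iscan[i] + 1) > eob) {
          eob = (uint16_t)(iscan[i] + 1);
        }
      }
      return eob;
    }
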
diff --git a/vpx_dsp/arm/sad4d_neon.c b/vpx_dsp/arm/sad4d_neon.c
index 535ec0f..06443c6 100644
@@ -28,24 +28,25 @@ static INLINE uint8x8_t load_unaligned_2_buffers(const void *const buf0,
   return vreinterpret_u8_u32(aa);
 }
 
-static INLINE void sad4x_4d(const uint8_t *const src, const int src_stride,
-                            const uint8_t *const ref[4], const int ref_stride,
-                            const int height, uint32_t *const res) {
+static INLINE void sad4x_4d(const uint8_t *const src_ptr, const int src_stride,
+                            const uint8_t *const ref_array[4],
+                            const int ref_stride, const int height,
+                            uint32_t *const res) {
   int i;
   uint16x8_t abs[2] = { vdupq_n_u16(0), vdupq_n_u16(0) };
   uint16x4_t a[2];
   uint32x4_t r;
 
-  assert(!((intptr_t)src % sizeof(uint32_t)));
+  assert(!((intptr_t)src_ptr % sizeof(uint32_t)));
   assert(!(src_stride % sizeof(uint32_t)));
 
   for (i = 0; i < height; ++i) {
     const uint8x8_t s = vreinterpret_u8_u32(
-        vld1_dup_u32((const uint32_t *)(src + i * src_stride)));
-    const uint8x8_t ref01 = load_unaligned_2_buffers(ref[0] + i * ref_stride,
-                                                     ref[1] + i * ref_stride);
-    const uint8x8_t ref23 = load_unaligned_2_buffers(ref[2] + i * ref_stride,
-                                                     ref[3] + i * ref_stride);
+        vld1_dup_u32((const uint32_t *)(src_ptr + i * src_stride)));
+    const uint8x8_t ref01 = load_unaligned_2_buffers(
+        ref_array[0] + i * ref_stride, ref_array[1] + i * ref_stride);
+    const uint8x8_t ref23 = load_unaligned_2_buffers(
+        ref_array[2] + i * ref_stride, ref_array[3] + i * ref_stride);
     abs[0] = vabal_u8(abs[0], s, ref01);
     abs[1] = vabal_u8(abs[1], s, ref23);
   }
@@ -56,16 +57,16 @@ static INLINE void sad4x_4d(const uint8_t *const src, const int src_stride,
   vst1q_u32(res, r);
 }
 
-void vpx_sad4x4x4d_neon(const uint8_t *src, int src_stride,
-                        const uint8_t *const ref[4], int ref_stride,
+void vpx_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride,
+                        const uint8_t *const ref_array[4], int ref_stride,
                         uint32_t *res) {
-  sad4x_4d(src, src_stride, ref, ref_stride, 4, res);
+  sad4x_4d(src_ptr, src_stride, ref_array, ref_stride, 4, res);
 }
 
-void vpx_sad4x8x4d_neon(const uint8_t *src, int src_stride,
-                        const uint8_t *const ref[4], int ref_stride,
+void vpx_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride,
+                        const uint8_t *const ref_array[4], int ref_stride,
                         uint32_t *res) {
-  sad4x_4d(src, src_stride, ref, ref_stride, 8, res);
+  sad4x_4d(src_ptr, src_stride, ref_array, ref_stride, 8, res);
 }
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -137,17 +138,18 @@ static INLINE void sad_4096_pel_final_neon(const uint16x8_t *sum /*[8]*/,
   vst1q_u32(res, vcombine_u32(d0, d1));
 }
 
-static INLINE void sad8x_4d(const uint8_t *src, int src_stride,
-                            const uint8_t *const ref[4], int ref_stride,
+static INLINE void sad8x_4d(const uint8_t *src_ptr, int src_stride,
+                            const uint8_t *const ref_array[4], int ref_stride,
                             uint32_t *res, const int height) {
   int i, j;
-  const uint8_t *ref_loop[4] = { ref[0], ref[1], ref[2], ref[3] };
+  const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
+                                 ref_array[3] };
   uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
                         vdupq_n_u16(0) };
 
   for (i = 0; i < height; ++i) {
-    const uint8x8_t s = vld1_u8(src);
-    src += src_stride;
+    const uint8x8_t s = vld1_u8(src_ptr);
+    src_ptr += src_stride;
     for (j = 0; j < 4; ++j) {
       const uint8x8_t b_u8 = vld1_u8(ref_loop[j]);
       ref_loop[j] += ref_stride;
@@ -158,44 +160,45 @@ static INLINE void sad8x_4d(const uint8_t *src, int src_stride,
   sad_512_pel_final_neon(sum, res);
 }
 
-void vpx_sad8x4x4d_neon(const uint8_t *src, int src_stride,
-                        const uint8_t *const ref[4], int ref_stride,
+void vpx_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride,
+                        const uint8_t *const ref_array[4], int ref_stride,
                         uint32_t *res) {
-  sad8x_4d(src, src_stride, ref, ref_stride, res, 4);
+  sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 4);
 }
 
-void vpx_sad8x8x4d_neon(const uint8_t *src, int src_stride,
-                        const uint8_t *const ref[4], int ref_stride,
+void vpx_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride,
+                        const uint8_t *const ref_array[4], int ref_stride,
                         uint32_t *res) {
-  sad8x_4d(src, src_stride, ref, ref_stride, res, 8);
+  sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 8);
 }
 
-void vpx_sad8x16x4d_neon(const uint8_t *src, int src_stride,
-                         const uint8_t *const ref[4], int ref_stride,
+void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride,
+                         const uint8_t *const ref_array[4], int ref_stride,
                          uint32_t *res) {
-  sad8x_4d(src, src_stride, ref, ref_stride, res, 16);
+  sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 16);
 }
 
 ////////////////////////////////////////////////////////////////////////////////
 
-static INLINE void sad16_neon(const uint8_t *ref, const uint8x16_t src,
+static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr,
                               uint16x8_t *const sum) {
-  const uint8x16_t r = vld1q_u8(ref);
-  *sum = vabal_u8(*sum, vget_low_u8(src), vget_low_u8(r));
-  *sum = vabal_u8(*sum, vget_high_u8(src), vget_high_u8(r));
+  const uint8x16_t r = vld1q_u8(ref_ptr);
+  *sum = vabal_u8(*sum, vget_low_u8(src_ptr), vget_low_u8(r));
+  *sum = vabal_u8(*sum, vget_high_u8(src_ptr), vget_high_u8(r));
 }
 
-static INLINE void sad16x_4d(const uint8_t *src, int src_stride,
-                             const uint8_t *const ref[4], int ref_stride,
+static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride,
+                             const uint8_t *const ref_array[4], int ref_stride,
                              uint32_t *res, const int height) {
   int i, j;
-  const uint8_t *ref_loop[4] = { ref[0], ref[1], ref[2], ref[3] };
+  const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
+                                 ref_array[3] };
   uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
                         vdupq_n_u16(0) };
 
   for (i = 0; i < height; ++i) {
-    const uint8x16_t s = vld1q_u8(src);
-    src += src_stride;
+    const uint8x16_t s = vld1q_u8(src_ptr);
+    src_ptr += src_stride;
     for (j = 0; j < 4; ++j) {
       sad16_neon(ref_loop[j], s, &sum[j]);
       ref_loop[j] += ref_stride;
@@ -205,50 +208,51 @@ static INLINE void sad16x_4d(const uint8_t *src, int src_stride,
   sad_512_pel_final_neon(sum, res);
 }
 
-void vpx_sad16x8x4d_neon(const uint8_t *src, int src_stride,
-                         const uint8_t *const ref[4], int ref_stride,
+void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride,
+                         const uint8_t *const ref_array[4], int ref_stride,
                          uint32_t *res) {
-  sad16x_4d(src, src_stride, ref, ref_stride, res, 8);
+  sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 8);
 }
 
-void vpx_sad16x16x4d_neon(const uint8_t *src, int src_stride,
-                          const uint8_t *const ref[4], int ref_stride,
+void vpx_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride,
+                          const uint8_t *const ref_array[4], int ref_stride,
                           uint32_t *res) {
-  sad16x_4d(src, src_stride, ref, ref_stride, res, 16);
+  sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 16);
 }
 
-void vpx_sad16x32x4d_neon(const uint8_t *src, int src_stride,
-                          const uint8_t *const ref[4], int ref_stride,
+void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride,
+                          const uint8_t *const ref_array[4], int ref_stride,
                           uint32_t *res) {
-  sad16x_4d(src, src_stride, ref, ref_stride, res, 32);
+  sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 32);
 }
 
 ////////////////////////////////////////////////////////////////////////////////
 
-static INLINE void sad32x_4d(const uint8_t *src, int src_stride,
-                             const uint8_t *const ref[4], int ref_stride,
+static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride,
+                             const uint8_t *const ref_array[4], int ref_stride,
                              const int height, uint16x8_t *const sum) {
   int i;
-  const uint8_t *ref_loop[4] = { ref[0], ref[1], ref[2], ref[3] };
+  const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
+                                 ref_array[3] };
 
   sum[0] = sum[1] = sum[2] = sum[3] = vdupq_n_u16(0);
 
   for (i = 0; i < height; ++i) {
     uint8x16_t s;
 
-    s = vld1q_u8(src + 0 * 16);
+    s = vld1q_u8(src_ptr + 0 * 16);
     sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]);
     sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]);
     sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]);
     sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]);
 
-    s = vld1q_u8(src + 1 * 16);
+    s = vld1q_u8(src_ptr + 1 * 16);
     sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]);
     sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]);
     sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]);
     sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]);
 
-    src += src_stride;
+    src_ptr += src_stride;
     ref_loop[0] += ref_stride;
     ref_loop[1] += ref_stride;
     ref_loop[2] += ref_stride;
@@ -256,68 +260,69 @@ static INLINE void sad32x_4d(const uint8_t *src, int src_stride,
   }
 }
 
-void vpx_sad32x16x4d_neon(const uint8_t *src, int src_stride,
-                          const uint8_t *const ref[4], int ref_stride,
+void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride,
+                          const uint8_t *const ref_array[4], int ref_stride,
                           uint32_t *res) {
   uint16x8_t sum[4];
-  sad32x_4d(src, src_stride, ref, ref_stride, 16, sum);
+  sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 16, sum);
   sad_512_pel_final_neon(sum, res);
 }
 
-void vpx_sad32x32x4d_neon(const uint8_t *src, int src_stride,
-                          const uint8_t *const ref[4], int ref_stride,
+void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride,
+                          const uint8_t *const ref_array[4], int ref_stride,
                           uint32_t *res) {
   uint16x8_t sum[4];
-  sad32x_4d(src, src_stride, ref, ref_stride, 32, sum);
+  sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 32, sum);
   sad_1024_pel_final_neon(sum, res);
 }
 
-void vpx_sad32x64x4d_neon(const uint8_t *src, int src_stride,
-                          const uint8_t *const ref[4], int ref_stride,
+void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride,
+                          const uint8_t *const ref_array[4], int ref_stride,
                           uint32_t *res) {
   uint16x8_t sum[4];
-  sad32x_4d(src, src_stride, ref, ref_stride, 64, sum);
+  sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 64, sum);
   sad_2048_pel_final_neon(sum, res);
 }
 
 ////////////////////////////////////////////////////////////////////////////////
 
-void vpx_sad64x32x4d_neon(const uint8_t *src, int src_stride,
-                          const uint8_t *const ref[4], int ref_stride,
+void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride,
+                          const uint8_t *const ref_array[4], int ref_stride,
                           uint32_t *res) {
   int i;
-  const uint8_t *ref_loop[4] = { ref[0], ref[1], ref[2], ref[3] };
+  const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
+                                 ref_array[3] };
   uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
                         vdupq_n_u16(0) };
 
   for (i = 0; i < 32; ++i) {
     uint8x16_t s;
 
-    s = vld1q_u8(src + 0 * 16);
+    s = vld1q_u8(src_ptr + 0 * 16);
     sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]);
     sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]);
     sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]);
     sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]);
 
-    s = vld1q_u8(src + 1 * 16);
+    s = vld1q_u8(src_ptr + 1 * 16);
     sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]);
     sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]);
     sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]);
     sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]);
 
-    s = vld1q_u8(src + 2 * 16);
+    s = vld1q_u8(src_ptr + 2 * 16);
     sad16_neon(ref_loop[0] + 2 * 16, s, &sum[0]);
     sad16_neon(ref_loop[1] + 2 * 16, s, &sum[1]);
     sad16_neon(ref_loop[2] + 2 * 16, s, &sum[2]);
     sad16_neon(ref_loop[3] + 2 * 16, s, &sum[3]);
 
-    s = vld1q_u8(src + 3 * 16);
+    s = vld1q_u8(src_ptr + 3 * 16);
     sad16_neon(ref_loop[0] + 3 * 16, s, &sum[0]);
     sad16_neon(ref_loop[1] + 3 * 16, s, &sum[1]);
     sad16_neon(ref_loop[2] + 3 * 16, s, &sum[2]);
     sad16_neon(ref_loop[3] + 3 * 16, s, &sum[3]);
 
-    src += src_stride;
+    src_ptr += src_stride;
     ref_loop[0] += ref_stride;
     ref_loop[1] += ref_stride;
     ref_loop[2] += ref_stride;
@@ -327,11 +332,12 @@ void vpx_sad64x32x4d_neon(const uint8_t *src, int src_stride,
   sad_2048_pel_final_neon(sum, res);
 }
 
-void vpx_sad64x64x4d_neon(const uint8_t *src, int src_stride,
-                          const uint8_t *const ref[4], int ref_stride,
+void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride,
+                          const uint8_t *const ref_array[4], int ref_stride,
                           uint32_t *res) {
   int i;
-  const uint8_t *ref_loop[4] = { ref[0], ref[1], ref[2], ref[3] };
+  const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
+                                 ref_array[3] };
   uint16x8_t sum[8] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
                         vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
                         vdupq_n_u16(0), vdupq_n_u16(0) };
@@ -339,31 +345,31 @@ void vpx_sad64x64x4d_neon(const uint8_t *src, int src_stride,
   for (i = 0; i < 64; ++i) {
     uint8x16_t s;
 
-    s = vld1q_u8(src + 0 * 16);
+    s = vld1q_u8(src_ptr + 0 * 16);
     sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]);
     sad16_neon(ref_loop[1] + 0 * 16, s, &sum[2]);
     sad16_neon(ref_loop[2] + 0 * 16, s, &sum[4]);
     sad16_neon(ref_loop[3] + 0 * 16, s, &sum[6]);
 
-    s = vld1q_u8(src + 1 * 16);
+    s = vld1q_u8(src_ptr + 1 * 16);
     sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]);
     sad16_neon(ref_loop[1] + 1 * 16, s, &sum[2]);
     sad16_neon(ref_loop[2] + 1 * 16, s, &sum[4]);
     sad16_neon(ref_loop[3] + 1 * 16, s, &sum[6]);
 
-    s = vld1q_u8(src + 2 * 16);
+    s = vld1q_u8(src_ptr + 2 * 16);
     sad16_neon(ref_loop[0] + 2 * 16, s, &sum[1]);
     sad16_neon(ref_loop[1] + 2 * 16, s, &sum[3]);
     sad16_neon(ref_loop[2] + 2 * 16, s, &sum[5]);
     sad16_neon(ref_loop[3] + 2 * 16, s, &sum[7]);
 
-    s = vld1q_u8(src + 3 * 16);
+    s = vld1q_u8(src_ptr + 3 * 16);
     sad16_neon(ref_loop[0] + 3 * 16, s, &sum[1]);
     sad16_neon(ref_loop[1] + 3 * 16, s, &sum[3]);
     sad16_neon(ref_loop[2] + 3 * 16, s, &sum[5]);
     sad16_neon(ref_loop[3] + 3 * 16, s, &sum[7]);
 
-    src += src_stride;
+    src_ptr += src_stride;
     ref_loop[0] += ref_stride;
     ref_loop[1] += ref_stride;
     ref_loop[2] += ref_stride;
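
For reference, the x4d kernels in this file compute four SADs of one
source block against four reference blocks in a single pass. A plain-C
model (hypothetical helper; width and height as parameters):

    #include <stdint.h>

    /* One source block is compared against four reference blocks; each
     * res[k] receives the sum of absolute differences for ref_array[k]. */
    static void sad_4d_model(const uint8_t *src_ptr, int src_stride,
                             const uint8_t *const ref_array[4],
                             int ref_stride, int width, int height,
                             uint32_t res[4]) {
      int k, y, x;
      for (k = 0; k < 4; ++k) {
        uint32_t sad = 0;
        for (y = 0; y < height; ++y) {
          for (x = 0; x < width; ++x) {
            const int diff = src_ptr[y * src_stride + x] -
                             ref_array[k][y * ref_stride + x];
            sad += (uint32_t)(diff < 0 ? -diff : diff);
          }
        }
        res[k] = sad;
      }
    }
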
diff --git a/vpx_dsp/arm/sad_neon.c b/vpx_dsp/arm/sad_neon.c
index 9518a16..1ce66d3 100644
@@ -73,128 +73,132 @@ uint32_t vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride,
   return vget_lane_u32(horizontal_add_uint16x8(abs), 0);
 }
 
-static INLINE uint16x8_t sad8x(const uint8_t *a, int a_stride, const uint8_t *b,
-                               int b_stride, const int height) {
+static INLINE uint16x8_t sad8x(const uint8_t *src_ptr, int src_stride,
+                               const uint8_t *ref_ptr, int ref_stride,
+                               const int height) {
   int i;
   uint16x8_t abs = vdupq_n_u16(0);
 
   for (i = 0; i < height; ++i) {
-    const uint8x8_t a_u8 = vld1_u8(a);
-    const uint8x8_t b_u8 = vld1_u8(b);
-    a += a_stride;
-    b += b_stride;
+    const uint8x8_t a_u8 = vld1_u8(src_ptr);
+    const uint8x8_t b_u8 = vld1_u8(ref_ptr);
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
     abs = vabal_u8(abs, a_u8, b_u8);
   }
   return abs;
 }
 
-static INLINE uint16x8_t sad8x_avg(const uint8_t *a, int a_stride,
-                                   const uint8_t *b, int b_stride,
-                                   const uint8_t *c, const int height) {
+static INLINE uint16x8_t sad8x_avg(const uint8_t *src_ptr, int src_stride,
+                                   const uint8_t *ref_ptr, int ref_stride,
+                                   const uint8_t *second_pred,
+                                   const int height) {
   int i;
   uint16x8_t abs = vdupq_n_u16(0);
 
   for (i = 0; i < height; ++i) {
-    const uint8x8_t a_u8 = vld1_u8(a);
-    const uint8x8_t b_u8 = vld1_u8(b);
-    const uint8x8_t c_u8 = vld1_u8(c);
+    const uint8x8_t a_u8 = vld1_u8(src_ptr);
+    const uint8x8_t b_u8 = vld1_u8(ref_ptr);
+    const uint8x8_t c_u8 = vld1_u8(second_pred);
     const uint8x8_t avg = vrhadd_u8(b_u8, c_u8);
-    a += a_stride;
-    b += b_stride;
-    c += 8;
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    second_pred += 8;
     abs = vabal_u8(abs, a_u8, avg);
   }
   return abs;
 }
 
-#define sad8xN(n)                                                      \
-  uint32_t vpx_sad8x##n##_neon(const uint8_t *src, int src_stride,     \
-                               const uint8_t *ref, int ref_stride) {   \
-    const uint16x8_t abs = sad8x(src, src_stride, ref, ref_stride, n); \
-    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);             \
-  }                                                                    \
-                                                                       \
-  uint32_t vpx_sad8x##n##_avg_neon(const uint8_t *src, int src_stride, \
-                                   const uint8_t *ref, int ref_stride, \
-                                   const uint8_t *second_pred) {       \
-    const uint16x8_t abs =                                             \
-        sad8x_avg(src, src_stride, ref, ref_stride, second_pred, n);   \
-    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);             \
+#define sad8xN(n)                                                              \
+  uint32_t vpx_sad8x##n##_neon(const uint8_t *src_ptr, int src_stride,         \
+                               const uint8_t *ref_ptr, int ref_stride) {       \
+    const uint16x8_t abs = sad8x(src_ptr, src_stride, ref_ptr, ref_stride, n); \
+    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);                     \
+  }                                                                            \
+                                                                               \
+  uint32_t vpx_sad8x##n##_avg_neon(const uint8_t *src_ptr, int src_stride,     \
+                                   const uint8_t *ref_ptr, int ref_stride,     \
+                                   const uint8_t *second_pred) {               \
+    const uint16x8_t abs =                                                     \
+        sad8x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n);   \
+    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);                     \
   }
 
 sad8xN(4);
 sad8xN(8);
 sad8xN(16);
 
-static INLINE uint16x8_t sad16x(const uint8_t *a, int a_stride,
-                                const uint8_t *b, int b_stride,
+static INLINE uint16x8_t sad16x(const uint8_t *src_ptr, int src_stride,
+                                const uint8_t *ref_ptr, int ref_stride,
                                 const int height) {
   int i;
   uint16x8_t abs = vdupq_n_u16(0);
 
   for (i = 0; i < height; ++i) {
-    const uint8x16_t a_u8 = vld1q_u8(a);
-    const uint8x16_t b_u8 = vld1q_u8(b);
-    a += a_stride;
-    b += b_stride;
+    const uint8x16_t a_u8 = vld1q_u8(src_ptr);
+    const uint8x16_t b_u8 = vld1q_u8(ref_ptr);
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
     abs = vabal_u8(abs, vget_low_u8(a_u8), vget_low_u8(b_u8));
     abs = vabal_u8(abs, vget_high_u8(a_u8), vget_high_u8(b_u8));
   }
   return abs;
 }
 
-static INLINE uint16x8_t sad16x_avg(const uint8_t *a, int a_stride,
-                                    const uint8_t *b, int b_stride,
-                                    const uint8_t *c, const int height) {
+static INLINE uint16x8_t sad16x_avg(const uint8_t *src_ptr, int src_stride,
+                                    const uint8_t *ref_ptr, int ref_stride,
+                                    const uint8_t *second_pred,
+                                    const int height) {
   int i;
   uint16x8_t abs = vdupq_n_u16(0);
 
   for (i = 0; i < height; ++i) {
-    const uint8x16_t a_u8 = vld1q_u8(a);
-    const uint8x16_t b_u8 = vld1q_u8(b);
-    const uint8x16_t c_u8 = vld1q_u8(c);
+    const uint8x16_t a_u8 = vld1q_u8(src_ptr);
+    const uint8x16_t b_u8 = vld1q_u8(ref_ptr);
+    const uint8x16_t c_u8 = vld1q_u8(second_pred);
     const uint8x16_t avg = vrhaddq_u8(b_u8, c_u8);
-    a += a_stride;
-    b += b_stride;
-    c += 16;
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    second_pred += 16;
     abs = vabal_u8(abs, vget_low_u8(a_u8), vget_low_u8(avg));
     abs = vabal_u8(abs, vget_high_u8(a_u8), vget_high_u8(avg));
   }
   return abs;
 }
 
-#define sad16xN(n)                                                      \
-  uint32_t vpx_sad16x##n##_neon(const uint8_t *src, int src_stride,     \
-                                const uint8_t *ref, int ref_stride) {   \
-    const uint16x8_t abs = sad16x(src, src_stride, ref, ref_stride, n); \
-    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);              \
-  }                                                                     \
-                                                                        \
-  uint32_t vpx_sad16x##n##_avg_neon(const uint8_t *src, int src_stride, \
-                                    const uint8_t *ref, int ref_stride, \
-                                    const uint8_t *second_pred) {       \
-    const uint16x8_t abs =                                              \
-        sad16x_avg(src, src_stride, ref, ref_stride, second_pred, n);   \
-    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);              \
+#define sad16xN(n)                                                            \
+  uint32_t vpx_sad16x##n##_neon(const uint8_t *src_ptr, int src_stride,       \
+                                const uint8_t *ref_ptr, int ref_stride) {     \
+    const uint16x8_t abs =                                                    \
+        sad16x(src_ptr, src_stride, ref_ptr, ref_stride, n);                  \
+    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);                    \
+  }                                                                           \
+                                                                              \
+  uint32_t vpx_sad16x##n##_avg_neon(const uint8_t *src_ptr, int src_stride,   \
+                                    const uint8_t *ref_ptr, int ref_stride,   \
+                                    const uint8_t *second_pred) {             \
+    const uint16x8_t abs =                                                    \
+        sad16x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \
+    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);                    \
   }
 
 sad16xN(8);
 sad16xN(16);
 sad16xN(32);
 
-static INLINE uint16x8_t sad32x(const uint8_t *a, int a_stride,
-                                const uint8_t *b, int b_stride,
+static INLINE uint16x8_t sad32x(const uint8_t *src_ptr, int src_stride,
+                                const uint8_t *ref_ptr, int ref_stride,
                                 const int height) {
   int i;
   uint16x8_t abs = vdupq_n_u16(0);
 
   for (i = 0; i < height; ++i) {
-    const uint8x16_t a_lo = vld1q_u8(a);
-    const uint8x16_t a_hi = vld1q_u8(a + 16);
-    const uint8x16_t b_lo = vld1q_u8(b);
-    const uint8x16_t b_hi = vld1q_u8(b + 16);
-    a += a_stride;
-    b += b_stride;
+    const uint8x16_t a_lo = vld1q_u8(src_ptr);
+    const uint8x16_t a_hi = vld1q_u8(src_ptr + 16);
+    const uint8x16_t b_lo = vld1q_u8(ref_ptr);
+    const uint8x16_t b_hi = vld1q_u8(ref_ptr + 16);
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
     abs = vabal_u8(abs, vget_low_u8(a_lo), vget_low_u8(b_lo));
     abs = vabal_u8(abs, vget_high_u8(a_lo), vget_high_u8(b_lo));
     abs = vabal_u8(abs, vget_low_u8(a_hi), vget_low_u8(b_hi));
@@ -203,24 +207,25 @@ static INLINE uint16x8_t sad32x(const uint8_t *a, int a_stride,
   return abs;
 }
 
-static INLINE uint16x8_t sad32x_avg(const uint8_t *a, int a_stride,
-                                    const uint8_t *b, int b_stride,
-                                    const uint8_t *c, const int height) {
+static INLINE uint16x8_t sad32x_avg(const uint8_t *src_ptr, int src_stride,
+                                    const uint8_t *ref_ptr, int ref_stride,
+                                    const uint8_t *second_pred,
+                                    const int height) {
   int i;
   uint16x8_t abs = vdupq_n_u16(0);
 
   for (i = 0; i < height; ++i) {
-    const uint8x16_t a_lo = vld1q_u8(a);
-    const uint8x16_t a_hi = vld1q_u8(a + 16);
-    const uint8x16_t b_lo = vld1q_u8(b);
-    const uint8x16_t b_hi = vld1q_u8(b + 16);
-    const uint8x16_t c_lo = vld1q_u8(c);
-    const uint8x16_t c_hi = vld1q_u8(c + 16);
+    const uint8x16_t a_lo = vld1q_u8(src_ptr);
+    const uint8x16_t a_hi = vld1q_u8(src_ptr + 16);
+    const uint8x16_t b_lo = vld1q_u8(ref_ptr);
+    const uint8x16_t b_hi = vld1q_u8(ref_ptr + 16);
+    const uint8x16_t c_lo = vld1q_u8(second_pred);
+    const uint8x16_t c_hi = vld1q_u8(second_pred + 16);
     const uint8x16_t avg_lo = vrhaddq_u8(b_lo, c_lo);
     const uint8x16_t avg_hi = vrhaddq_u8(b_hi, c_hi);
-    a += a_stride;
-    b += b_stride;
-    c += 32;
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    second_pred += 32;
     abs = vabal_u8(abs, vget_low_u8(a_lo), vget_low_u8(avg_lo));
     abs = vabal_u8(abs, vget_high_u8(a_lo), vget_high_u8(avg_lo));
     abs = vabal_u8(abs, vget_low_u8(a_hi), vget_low_u8(avg_hi));
@@ -229,43 +234,44 @@ static INLINE uint16x8_t sad32x_avg(const uint8_t *a, int a_stride,
   return abs;
 }
 
-#define sad32xN(n)                                                      \
-  uint32_t vpx_sad32x##n##_neon(const uint8_t *src, int src_stride,     \
-                                const uint8_t *ref, int ref_stride) {   \
-    const uint16x8_t abs = sad32x(src, src_stride, ref, ref_stride, n); \
-    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);              \
-  }                                                                     \
-                                                                        \
-  uint32_t vpx_sad32x##n##_avg_neon(const uint8_t *src, int src_stride, \
-                                    const uint8_t *ref, int ref_stride, \
-                                    const uint8_t *second_pred) {       \
-    const uint16x8_t abs =                                              \
-        sad32x_avg(src, src_stride, ref, ref_stride, second_pred, n);   \
-    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);              \
+#define sad32xN(n)                                                            \
+  uint32_t vpx_sad32x##n##_neon(const uint8_t *src_ptr, int src_stride,       \
+                                const uint8_t *ref_ptr, int ref_stride) {     \
+    const uint16x8_t abs =                                                    \
+        sad32x(src_ptr, src_stride, ref_ptr, ref_stride, n);                  \
+    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);                    \
+  }                                                                           \
+                                                                              \
+  uint32_t vpx_sad32x##n##_avg_neon(const uint8_t *src_ptr, int src_stride,   \
+                                    const uint8_t *ref_ptr, int ref_stride,   \
+                                    const uint8_t *second_pred) {             \
+    const uint16x8_t abs =                                                    \
+        sad32x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \
+    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);                    \
   }
 
 sad32xN(16);
 sad32xN(32);
 sad32xN(64);
 
-static INLINE uint32x4_t sad64x(const uint8_t *a, int a_stride,
-                                const uint8_t *b, int b_stride,
+static INLINE uint32x4_t sad64x(const uint8_t *src_ptr, int src_stride,
+                                const uint8_t *ref_ptr, int ref_stride,
                                 const int height) {
   int i;
   uint16x8_t abs_0 = vdupq_n_u16(0);
   uint16x8_t abs_1 = vdupq_n_u16(0);
 
   for (i = 0; i < height; ++i) {
-    const uint8x16_t a_0 = vld1q_u8(a);
-    const uint8x16_t a_1 = vld1q_u8(a + 16);
-    const uint8x16_t a_2 = vld1q_u8(a + 32);
-    const uint8x16_t a_3 = vld1q_u8(a + 48);
-    const uint8x16_t b_0 = vld1q_u8(b);
-    const uint8x16_t b_1 = vld1q_u8(b + 16);
-    const uint8x16_t b_2 = vld1q_u8(b + 32);
-    const uint8x16_t b_3 = vld1q_u8(b + 48);
-    a += a_stride;
-    b += b_stride;
+    const uint8x16_t a_0 = vld1q_u8(src_ptr);
+    const uint8x16_t a_1 = vld1q_u8(src_ptr + 16);
+    const uint8x16_t a_2 = vld1q_u8(src_ptr + 32);
+    const uint8x16_t a_3 = vld1q_u8(src_ptr + 48);
+    const uint8x16_t b_0 = vld1q_u8(ref_ptr);
+    const uint8x16_t b_1 = vld1q_u8(ref_ptr + 16);
+    const uint8x16_t b_2 = vld1q_u8(ref_ptr + 32);
+    const uint8x16_t b_3 = vld1q_u8(ref_ptr + 48);
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
     abs_0 = vabal_u8(abs_0, vget_low_u8(a_0), vget_low_u8(b_0));
     abs_0 = vabal_u8(abs_0, vget_high_u8(a_0), vget_high_u8(b_0));
     abs_0 = vabal_u8(abs_0, vget_low_u8(a_1), vget_low_u8(b_1));
@@ -282,33 +288,34 @@ static INLINE uint32x4_t sad64x(const uint8_t *a, int a_stride,
   }
 }
 
-static INLINE uint32x4_t sad64x_avg(const uint8_t *a, int a_stride,
-                                    const uint8_t *b, int b_stride,
-                                    const uint8_t *c, const int height) {
+static INLINE uint32x4_t sad64x_avg(const uint8_t *src_ptr, int src_stride,
+                                    const uint8_t *ref_ptr, int ref_stride,
+                                    const uint8_t *second_pred,
+                                    const int height) {
   int i;
   uint16x8_t abs_0 = vdupq_n_u16(0);
   uint16x8_t abs_1 = vdupq_n_u16(0);
 
   for (i = 0; i < height; ++i) {
-    const uint8x16_t a_0 = vld1q_u8(a);
-    const uint8x16_t a_1 = vld1q_u8(a + 16);
-    const uint8x16_t a_2 = vld1q_u8(a + 32);
-    const uint8x16_t a_3 = vld1q_u8(a + 48);
-    const uint8x16_t b_0 = vld1q_u8(b);
-    const uint8x16_t b_1 = vld1q_u8(b + 16);
-    const uint8x16_t b_2 = vld1q_u8(b + 32);
-    const uint8x16_t b_3 = vld1q_u8(b + 48);
-    const uint8x16_t c_0 = vld1q_u8(c);
-    const uint8x16_t c_1 = vld1q_u8(c + 16);
-    const uint8x16_t c_2 = vld1q_u8(c + 32);
-    const uint8x16_t c_3 = vld1q_u8(c + 48);
+    const uint8x16_t a_0 = vld1q_u8(src_ptr);
+    const uint8x16_t a_1 = vld1q_u8(src_ptr + 16);
+    const uint8x16_t a_2 = vld1q_u8(src_ptr + 32);
+    const uint8x16_t a_3 = vld1q_u8(src_ptr + 48);
+    const uint8x16_t b_0 = vld1q_u8(ref_ptr);
+    const uint8x16_t b_1 = vld1q_u8(ref_ptr + 16);
+    const uint8x16_t b_2 = vld1q_u8(ref_ptr + 32);
+    const uint8x16_t b_3 = vld1q_u8(ref_ptr + 48);
+    const uint8x16_t c_0 = vld1q_u8(second_pred);
+    const uint8x16_t c_1 = vld1q_u8(second_pred + 16);
+    const uint8x16_t c_2 = vld1q_u8(second_pred + 32);
+    const uint8x16_t c_3 = vld1q_u8(second_pred + 48);
     const uint8x16_t avg_0 = vrhaddq_u8(b_0, c_0);
     const uint8x16_t avg_1 = vrhaddq_u8(b_1, c_1);
     const uint8x16_t avg_2 = vrhaddq_u8(b_2, c_2);
     const uint8x16_t avg_3 = vrhaddq_u8(b_3, c_3);
-    a += a_stride;
-    b += b_stride;
-    c += 64;
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    second_pred += 64;
     abs_0 = vabal_u8(abs_0, vget_low_u8(a_0), vget_low_u8(avg_0));
     abs_0 = vabal_u8(abs_0, vget_high_u8(a_0), vget_high_u8(avg_0));
     abs_0 = vabal_u8(abs_0, vget_low_u8(a_1), vget_low_u8(avg_1));
@@ -325,19 +332,20 @@ static INLINE uint32x4_t sad64x_avg(const uint8_t *a, int a_stride,
   }
 }
 
-#define sad64xN(n)                                                      \
-  uint32_t vpx_sad64x##n##_neon(const uint8_t *src, int src_stride,     \
-                                const uint8_t *ref, int ref_stride) {   \
-    const uint32x4_t abs = sad64x(src, src_stride, ref, ref_stride, n); \
-    return vget_lane_u32(horizontal_add_uint32x4(abs), 0);              \
-  }                                                                     \
-                                                                        \
-  uint32_t vpx_sad64x##n##_avg_neon(const uint8_t *src, int src_stride, \
-                                    const uint8_t *ref, int ref_stride, \
-                                    const uint8_t *second_pred) {       \
-    const uint32x4_t abs =                                              \
-        sad64x_avg(src, src_stride, ref, ref_stride, second_pred, n);   \
-    return vget_lane_u32(horizontal_add_uint32x4(abs), 0);              \
+#define sad64xN(n)                                                            \
+  uint32_t vpx_sad64x##n##_neon(const uint8_t *src_ptr, int src_stride,       \
+                                const uint8_t *ref_ptr, int ref_stride) {     \
+    const uint32x4_t abs =                                                    \
+        sad64x(src_ptr, src_stride, ref_ptr, ref_stride, n);                  \
+    return vget_lane_u32(horizontal_add_uint32x4(abs), 0);                    \
+  }                                                                           \
+                                                                              \
+  uint32_t vpx_sad64x##n##_avg_neon(const uint8_t *src_ptr, int src_stride,   \
+                                    const uint8_t *ref_ptr, int ref_stride,   \
+                                    const uint8_t *second_pred) {             \
+    const uint32x4_t abs =                                                    \
+        sad64x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \
+    return vget_lane_u32(horizontal_add_uint32x4(abs), 0);                    \
   }
 
 sad64xN(32);
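
The _avg variants first average the reference with second_pred using
NEON's rounding halving add (vrhadd_u8/vrhaddq_u8), i.e. (b + c + 1) >> 1,
then accumulate the SAD; second_pred is packed contiguously at the block
width, hence the += 8/16/32/64 steps. A scalar model with a hypothetical
helper name:

    #include <stdint.h>

    /* Scalar model of the _avg kernels: each reference row is averaged
     * with the second predictor using the same rounding as vrhaddq_u8. */
    static uint32_t sad_avg_model(const uint8_t *src_ptr, int src_stride,
                                  const uint8_t *ref_ptr, int ref_stride,
                                  const uint8_t *second_pred, int width,
                                  int height) {
      uint32_t sad = 0;
      int y, x;
      for (y = 0; y < height; ++y) {
        for (x = 0; x < width; ++x) {
          const int avg = (ref_ptr[x] + second_pred[x] + 1) >> 1;
          const int diff = src_ptr[x] - avg;
          sad += (uint32_t)(diff < 0 ? -diff : diff);
        }
        src_ptr += src_stride;
        ref_ptr += ref_stride;
        second_pred += width; /* second_pred rows are width-contiguous */
      }
      return sad;
    }
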
diff --git a/vpx_dsp/bitwriter.h b/vpx_dsp/bitwriter.h
index ec3975e..11579c9 100644
@@ -27,8 +27,8 @@ typedef struct vpx_writer {
   uint8_t *buffer;
 } vpx_writer;
 
-void vpx_start_encode(vpx_writer *bc, uint8_t *buffer);
-void vpx_stop_encode(vpx_writer *bc);
+void vpx_start_encode(vpx_writer *br, uint8_t *source);
+void vpx_stop_encode(vpx_writer *br);
 
 static INLINE void vpx_write(vpx_writer *br, int bit, int probability) {
   unsigned int split;
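
A minimal usage sketch of the renamed writer API, assuming only the
declarations above (the include path, buffer size, and wrapper name are
illustrative):

    #include <stdint.h>
    #include "vpx_dsp/bitwriter.h"

    void write_one_bit(void) {
      uint8_t buf[64];
      vpx_writer bw;
      vpx_start_encode(&bw, buf);
      vpx_write(&bw, 1, 128); /* one bit at probability 128/256 */
      vpx_stop_encode(&bw);
      /* buf now holds the coded bits. */
    }
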
diff --git a/vpx_dsp/deblock.c b/vpx_dsp/deblock.c
index 94acbb3..455b73b 100644
@@ -39,11 +39,10 @@ const int16_t vpx_rv[] = {
   9,  10, 13,
 };
 
-void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr,
-                                            unsigned char *dst_ptr,
-                                            int src_pixels_per_line,
-                                            int dst_pixels_per_line, int cols,
-                                            unsigned char *f, int size) {
+void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src,
+                                            unsigned char *dst, int src_pitch,
+                                            int dst_pitch, int cols,
+                                            unsigned char *flimits, int size) {
   unsigned char *p_src, *p_dst;
   int row;
   int col;
@@ -55,19 +54,21 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr,
 
   for (row = 0; row < size; row++) {
     /* post_proc_down for one row */
-    p_src = src_ptr;
-    p_dst = dst_ptr;
+    p_src = src;
+    p_dst = dst;
 
     for (col = 0; col < cols; col++) {
-      unsigned char p_above2 = p_src[col - 2 * src_pixels_per_line];
-      unsigned char p_above1 = p_src[col - src_pixels_per_line];
-      unsigned char p_below1 = p_src[col + src_pixels_per_line];
-      unsigned char p_below2 = p_src[col + 2 * src_pixels_per_line];
+      unsigned char p_above2 = p_src[col - 2 * src_pitch];
+      unsigned char p_above1 = p_src[col - src_pitch];
+      unsigned char p_below1 = p_src[col + src_pitch];
+      unsigned char p_below2 = p_src[col + 2 * src_pitch];
 
       v = p_src[col];
 
-      if ((abs(v - p_above2) < f[col]) && (abs(v - p_above1) < f[col]) &&
-          (abs(v - p_below1) < f[col]) && (abs(v - p_below2) < f[col])) {
+      if ((abs(v - p_above2) < flimits[col]) &&
+          (abs(v - p_above1) < flimits[col]) &&
+          (abs(v - p_below1) < flimits[col]) &&
+          (abs(v - p_below2) < flimits[col])) {
         unsigned char k1, k2, k3;
         k1 = (p_above2 + p_above1 + 1) >> 1;
         k2 = (p_below2 + p_below1 + 1) >> 1;
@@ -79,8 +80,8 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr,
     }
 
     /* now post_proc_across */
-    p_src = dst_ptr;
-    p_dst = dst_ptr;
+    p_src = dst;
+    p_dst = dst;
 
     p_src[-2] = p_src[-1] = p_src[0];
     p_src[cols] = p_src[cols + 1] = p_src[cols - 1];
@@ -88,10 +89,10 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr,
     for (col = 0; col < cols; col++) {
       v = p_src[col];
 
-      if ((abs(v - p_src[col - 2]) < f[col]) &&
-          (abs(v - p_src[col - 1]) < f[col]) &&
-          (abs(v - p_src[col + 1]) < f[col]) &&
-          (abs(v - p_src[col + 2]) < f[col])) {
+      if ((abs(v - p_src[col - 2]) < flimits[col]) &&
+          (abs(v - p_src[col - 1]) < flimits[col]) &&
+          (abs(v - p_src[col + 1]) < flimits[col]) &&
+          (abs(v - p_src[col + 2]) < flimits[col])) {
         unsigned char k1, k2, k3;
         k1 = (p_src[col - 2] + p_src[col - 1] + 1) >> 1;
         k2 = (p_src[col + 2] + p_src[col + 1] + 1) >> 1;
@@ -109,8 +110,8 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr,
     p_dst[col - 1] = d[(col - 1) & 3];
 
     /* next row */
-    src_ptr += src_pixels_per_line;
-    dst_ptr += dst_pixels_per_line;
+    src += src_pitch;
+    dst += dst_pitch;
   }
 }
 
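Both passes above filter a pixel only when all four of its neighbors
(vertical for the down pass, horizontal for the across pass) are within
flimits[col] of it. The gate in isolation, as a hypothetical scalar
helper:

    #include <stdlib.h>

    /* Filter only when every neighbor differs from the center pixel by
     * less than the per-column filter limit. */
    static int within_flimit(int v, int n2, int n1, int m1, int m2,
                             int flimit) {
      return abs(v - n2) < flimit && abs(v - n1) < flimit &&
             abs(v - m1) < flimit && abs(v - m2) < flimit;
    }
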
diff --git a/vpx_dsp/fwd_txfm.c b/vpx_dsp/fwd_txfm.c
index 6dcb3ba..ef66de0 100644
@@ -87,11 +87,11 @@ void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) {
   output[0] = sum * 2;
 }
 
-void vpx_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
+void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride) {
   int i, j;
   tran_low_t intermediate[64];
   int pass;
-  tran_low_t *output = intermediate;
+  tran_low_t *out = intermediate;
   const tran_low_t *in = NULL;
 
   // Transform columns
@@ -133,10 +133,10 @@ void vpx_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
       t1 = (x0 - x1) * cospi_16_64;
       t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
       t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
-      output[0] = (tran_low_t)fdct_round_shift(t0);
-      output[2] = (tran_low_t)fdct_round_shift(t2);
-      output[4] = (tran_low_t)fdct_round_shift(t1);
-      output[6] = (tran_low_t)fdct_round_shift(t3);
+      out[0] = (tran_low_t)fdct_round_shift(t0);
+      out[2] = (tran_low_t)fdct_round_shift(t2);
+      out[4] = (tran_low_t)fdct_round_shift(t1);
+      out[6] = (tran_low_t)fdct_round_shift(t3);
 
       // Stage 2
       t0 = (s6 - s5) * cospi_16_64;
@@ -155,19 +155,19 @@ void vpx_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
       t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
       t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
       t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
-      output[1] = (tran_low_t)fdct_round_shift(t0);
-      output[3] = (tran_low_t)fdct_round_shift(t2);
-      output[5] = (tran_low_t)fdct_round_shift(t1);
-      output[7] = (tran_low_t)fdct_round_shift(t3);
-      output += 8;
+      out[1] = (tran_low_t)fdct_round_shift(t0);
+      out[3] = (tran_low_t)fdct_round_shift(t2);
+      out[5] = (tran_low_t)fdct_round_shift(t1);
+      out[7] = (tran_low_t)fdct_round_shift(t3);
+      out += 8;
     }
     in = intermediate;
-    output = final_output;
+    out = output;
   }
 
   // Rows
   for (i = 0; i < 8; ++i) {
-    for (j = 0; j < 8; ++j) final_output[j + i * 8] /= 2;
+    for (j = 0; j < 8; ++j) output[j + i * 8] /= 2;
   }
 }
 
@@ -705,9 +705,9 @@ void vpx_fdct32(const tran_high_t *input, tran_high_t *output, int round) {
   output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);
 }
 
-void vpx_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
+void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride) {
   int i, j;
-  tran_high_t output[32 * 32];
+  tran_high_t out[32 * 32];
 
   // Columns
   for (i = 0; i < 32; ++i) {
@@ -715,16 +715,16 @@ void vpx_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
     for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4;
     vpx_fdct32(temp_in, temp_out, 0);
     for (j = 0; j < 32; ++j)
-      output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+      out[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
   }
 
   // Rows
   for (i = 0; i < 32; ++i) {
     tran_high_t temp_in[32], temp_out[32];
-    for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32];
+    for (j = 0; j < 32; ++j) temp_in[j] = out[j + i * 32];
     vpx_fdct32(temp_in, temp_out, 0);
     for (j = 0; j < 32; ++j)
-      out[j + i * 32] =
+      output[j + i * 32] =
           (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
   }
 }
@@ -732,9 +732,9 @@ void vpx_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
 // Note that although we use dct_32_round in dct32 computation flow,
 // this 2d fdct32x32 for rate-distortion optimization loop is operating
 // within 16 bits precision.
-void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) {
+void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride) {
   int i, j;
-  tran_high_t output[32 * 32];
+  tran_high_t out[32 * 32];
 
   // Columns
   for (i = 0; i < 32; ++i) {
@@ -745,15 +745,15 @@ void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) {
       // TODO(cd): see quality impact of only doing
       //           output[j * 32 + i] = (temp_out[j] + 1) >> 2;
       //           PS: also change code in vpx_dsp/x86/vpx_dct_sse2.c
-      output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+      out[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
   }
 
   // Rows
   for (i = 0; i < 32; ++i) {
     tran_high_t temp_in[32], temp_out[32];
-    for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32];
+    for (j = 0; j < 32; ++j) temp_in[j] = out[j + i * 32];
     vpx_fdct32(temp_in, temp_out, 1);
-    for (j = 0; j < 32; ++j) out[j + i * 32] = (tran_low_t)temp_out[j];
+    for (j = 0; j < 32; ++j) output[j + i * 32] = (tran_low_t)temp_out[j];
   }
 }
 
@@ -772,14 +772,14 @@ void vpx_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output,
   vpx_fdct4x4_c(input, output, stride);
 }
 
-void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *final_output,
+void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *output,
                           int stride) {
-  vpx_fdct8x8_c(input, final_output, stride);
+  vpx_fdct8x8_c(input, output, stride);
 }
 
-void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *final_output,
+void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *output,
                             int stride) {
-  vpx_fdct8x8_1_c(input, final_output, stride);
+  vpx_fdct8x8_1_c(input, output, stride);
 }
 
 void vpx_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output,
@@ -792,17 +792,18 @@ void vpx_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output,
   vpx_fdct16x16_1_c(input, output, stride);
 }
 
-void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
-  vpx_fdct32x32_c(input, out, stride);
+void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *output,
+                            int stride) {
+  vpx_fdct32x32_c(input, output, stride);
 }
 
-void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *out,
+void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *output,
                                int stride) {
-  vpx_fdct32x32_rd_c(input, out, stride);
+  vpx_fdct32x32_rd_c(input, output, stride);
 }
 
-void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *out,
+void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *output,
                               int stride) {
-  vpx_fdct32x32_1_c(input, out, stride);
+  vpx_fdct32x32_1_c(input, output, stride);
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
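
The column and row passes of vpx_fdct32x32_c apply slightly different
rounding before the final >> 2; a worked check of both expressions
(sample values only):

    /* columns: (t + 1 + (t > 0)) >> 2
     *   t =  6  ->  (6 + 2) >> 2  =  2
     *   t = -6  ->  (-6 + 1) >> 2 = -5 >> 2 = -2  (arithmetic shift floors)
     * rows:    (t + 1 + (t < 0)) >> 2
     *   t =  6  ->  (6 + 1) >> 2  =  1
     *   t = -6  ->  (-6 + 2) >> 2 = -4 >> 2 = -1
     */
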
diff --git a/vpx_dsp/inv_txfm.c b/vpx_dsp/inv_txfm.c
index 0194aa1..69de05e 100644
@@ -67,11 +67,11 @@ void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
   }
 }
 
-void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int stride) {
+void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
   int i;
   tran_high_t a1, e1;
   tran_low_t tmp[4];
-  const tran_low_t *ip = in;
+  const tran_low_t *ip = input;
   tran_low_t *op = tmp;
 
   a1 = ip[0] >> UNIT_QUANT_SHIFT;
@@ -1346,12 +1346,12 @@ void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest,
   }
 }
 
-void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint16_t *dest,
+void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest,
                                 int stride, int bd) {
   int i;
   tran_high_t a1, e1;
   tran_low_t tmp[4];
-  const tran_low_t *ip = in;
+  const tran_low_t *ip = input;
   tran_low_t *op = tmp;
   (void)bd;
 
diff --git a/vpx_dsp/loopfilter.c b/vpx_dsp/loopfilter.c
index 9866ea3..47f30c9 100644
@@ -109,29 +109,30 @@ static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t *op1,
   *op1 = signed_char_clamp(ps1 + filter) ^ 0x80;
 }
 
-void vpx_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */,
-                            const uint8_t *blimit, const uint8_t *limit,
-                            const uint8_t *thresh) {
+void vpx_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit,
+                            const uint8_t *limit, const uint8_t *thresh) {
   int i;
 
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
   for (i = 0; i < 8; ++i) {
-    const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
-    const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
+    const uint8_t p3 = s[-4 * pitch], p2 = s[-3 * pitch], p1 = s[-2 * pitch],
+                  p0 = s[-pitch];
+    const uint8_t q0 = s[0 * pitch], q1 = s[1 * pitch], q2 = s[2 * pitch],
+                  q3 = s[3 * pitch];
     const int8_t mask =
         filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
-    filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p);
+    filter4(mask, *thresh, s - 2 * pitch, s - 1 * pitch, s, s + 1 * pitch);
     ++s;
   }
 }
 
-void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
+void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
                                  const uint8_t *limit0, const uint8_t *thresh0,
                                  const uint8_t *blimit1, const uint8_t *limit1,
                                  const uint8_t *thresh1) {
-  vpx_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0);
-  vpx_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1);
+  vpx_lpf_horizontal_4_c(s, pitch, blimit0, limit0, thresh0);
+  vpx_lpf_horizontal_4_c(s + 8, pitch, blimit1, limit1, thresh1);
 }
 
 void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit,
@@ -178,31 +179,33 @@ static INLINE void filter8(int8_t mask, uint8_t thresh, uint8_t flat,
   }
 }
 
-void vpx_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit,
+void vpx_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit,
                             const uint8_t *limit, const uint8_t *thresh) {
   int i;
 
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
   for (i = 0; i < 8; ++i) {
-    const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
-    const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
+    const uint8_t p3 = s[-4 * pitch], p2 = s[-3 * pitch], p1 = s[-2 * pitch],
+                  p0 = s[-pitch];
+    const uint8_t q0 = s[0 * pitch], q1 = s[1 * pitch], q2 = s[2 * pitch],
+                  q3 = s[3 * pitch];
 
     const int8_t mask =
         filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
     const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
-    filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s,
-            s + 1 * p, s + 2 * p, s + 3 * p);
+    filter8(mask, *thresh, flat, s - 4 * pitch, s - 3 * pitch, s - 2 * pitch,
+            s - 1 * pitch, s, s + 1 * pitch, s + 2 * pitch, s + 3 * pitch);
     ++s;
   }
 }
 
-void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
+void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
                                  const uint8_t *limit0, const uint8_t *thresh0,
                                  const uint8_t *blimit1, const uint8_t *limit1,
                                  const uint8_t *thresh1) {
-  vpx_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0);
-  vpx_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1);
+  vpx_lpf_horizontal_8_c(s, pitch, blimit0, limit0, thresh0);
+  vpx_lpf_horizontal_8_c(s + 8, pitch, blimit1, limit1, thresh1);
 }
 
 void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit,
@@ -283,7 +286,8 @@ static INLINE void filter16(int8_t mask, uint8_t thresh, uint8_t flat,
   }
 }
 
-static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit,
+static void mb_lpf_horizontal_edge_w(uint8_t *s, int pitch,
+                                     const uint8_t *blimit,
                                      const uint8_t *limit,
                                      const uint8_t *thresh, int count) {
   int i;
@@ -291,34 +295,37 @@ static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit,
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
   for (i = 0; i < 8 * count; ++i) {
-    const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
-    const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
+    const uint8_t p3 = s[-4 * pitch], p2 = s[-3 * pitch], p1 = s[-2 * pitch],
+                  p0 = s[-pitch];
+    const uint8_t q0 = s[0 * pitch], q1 = s[1 * pitch], q2 = s[2 * pitch],
+                  q3 = s[3 * pitch];
     const int8_t mask =
         filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
     const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
-    const int8_t flat2 =
-        flat_mask5(1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0, q0,
-                   s[4 * p], s[5 * p], s[6 * p], s[7 * p]);
-
-    filter16(mask, *thresh, flat, flat2, s - 8 * p, s - 7 * p, s - 6 * p,
-             s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s,
-             s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p,
-             s + 7 * p);
+    const int8_t flat2 = flat_mask5(
+        1, s[-8 * pitch], s[-7 * pitch], s[-6 * pitch], s[-5 * pitch], p0, q0,
+        s[4 * pitch], s[5 * pitch], s[6 * pitch], s[7 * pitch]);
+
+    filter16(mask, *thresh, flat, flat2, s - 8 * pitch, s - 7 * pitch,
+             s - 6 * pitch, s - 5 * pitch, s - 4 * pitch, s - 3 * pitch,
+             s - 2 * pitch, s - 1 * pitch, s, s + 1 * pitch, s + 2 * pitch,
+             s + 3 * pitch, s + 4 * pitch, s + 5 * pitch, s + 6 * pitch,
+             s + 7 * pitch);
     ++s;
   }
 }
 
-void vpx_lpf_horizontal_16_c(uint8_t *s, int p, const uint8_t *blimit,
+void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit,
                              const uint8_t *limit, const uint8_t *thresh) {
-  mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1);
+  mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, 1);
 }
 
-void vpx_lpf_horizontal_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
+void vpx_lpf_horizontal_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit,
                                   const uint8_t *limit, const uint8_t *thresh) {
-  mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2);
+  mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, 2);
 }
 
-static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit,
+static void mb_lpf_vertical_edge_w(uint8_t *s, int pitch, const uint8_t *blimit,
                                    const uint8_t *limit, const uint8_t *thresh,
                                    int count) {
   int i;
@@ -335,18 +342,18 @@ static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit,
     filter16(mask, *thresh, flat, flat2, s - 8, s - 7, s - 6, s - 5, s - 4,
              s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6,
              s + 7);
-    s += p;
+    s += pitch;
   }
 }
 
-void vpx_lpf_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit,
+void vpx_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit,
                            const uint8_t *limit, const uint8_t *thresh) {
-  mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8);
+  mb_lpf_vertical_edge_w(s, pitch, blimit, limit, thresh, 8);
 }
 
-void vpx_lpf_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
+void vpx_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit,
                                 const uint8_t *limit, const uint8_t *thresh) {
-  mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16);
+  mb_lpf_vertical_edge_w(s, pitch, blimit, limit, thresh, 16);
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -440,7 +447,7 @@ static INLINE void highbd_filter4(int8_t mask, uint8_t thresh, uint16_t *op1,
   *op1 = signed_char_clamp_high(ps1 + filter, bd) + (0x80 << shift);
 }
 
-void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */,
+void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch,
                                    const uint8_t *blimit, const uint8_t *limit,
                                    const uint8_t *thresh, int bd) {
   int i;
@@ -448,27 +455,28 @@ void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */,
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
   for (i = 0; i < 8; ++i) {
-    const uint16_t p3 = s[-4 * p];
-    const uint16_t p2 = s[-3 * p];
-    const uint16_t p1 = s[-2 * p];
-    const uint16_t p0 = s[-p];
-    const uint16_t q0 = s[0 * p];
-    const uint16_t q1 = s[1 * p];
-    const uint16_t q2 = s[2 * p];
-    const uint16_t q3 = s[3 * p];
+    const uint16_t p3 = s[-4 * pitch];
+    const uint16_t p2 = s[-3 * pitch];
+    const uint16_t p1 = s[-2 * pitch];
+    const uint16_t p0 = s[-pitch];
+    const uint16_t q0 = s[0 * pitch];
+    const uint16_t q1 = s[1 * pitch];
+    const uint16_t q2 = s[2 * pitch];
+    const uint16_t q3 = s[3 * pitch];
     const int8_t mask =
         highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
-    highbd_filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p, bd);
+    highbd_filter4(mask, *thresh, s - 2 * pitch, s - 1 * pitch, s,
+                   s + 1 * pitch, bd);
     ++s;
   }
 }
 
 void vpx_highbd_lpf_horizontal_4_dual_c(
-    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
     const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
     const uint8_t *thresh1, int bd) {
-  vpx_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, bd);
-  vpx_highbd_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, bd);
+  vpx_highbd_lpf_horizontal_4_c(s, pitch, blimit0, limit0, thresh0, bd);
+  vpx_highbd_lpf_horizontal_4_c(s + 8, pitch, blimit1, limit1, thresh1, bd);
 }
 
 void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit,
@@ -517,33 +525,36 @@ static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, uint8_t flat,
   }
 }
 
-void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit,
-                                   const uint8_t *limit, const uint8_t *thresh,
-                                   int bd) {
+void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int pitch,
+                                   const uint8_t *blimit, const uint8_t *limit,
+                                   const uint8_t *thresh, int bd) {
   int i;
 
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
   for (i = 0; i < 8; ++i) {
-    const uint16_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
-    const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
+    const uint16_t p3 = s[-4 * pitch], p2 = s[-3 * pitch], p1 = s[-2 * pitch],
+                   p0 = s[-pitch];
+    const uint16_t q0 = s[0 * pitch], q1 = s[1 * pitch], q2 = s[2 * pitch],
+                   q3 = s[3 * pitch];
 
     const int8_t mask =
         highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
     const int8_t flat =
         highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
-    highbd_filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p,
-                   s - 1 * p, s, s + 1 * p, s + 2 * p, s + 3 * p, bd);
+    highbd_filter8(mask, *thresh, flat, s - 4 * pitch, s - 3 * pitch,
+                   s - 2 * pitch, s - 1 * pitch, s, s + 1 * pitch,
+                   s + 2 * pitch, s + 3 * pitch, bd);
     ++s;
   }
 }
 
 void vpx_highbd_lpf_horizontal_8_dual_c(
-    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
     const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
     const uint8_t *thresh1, int bd) {
-  vpx_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, bd);
-  vpx_highbd_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, bd);
+  vpx_highbd_lpf_horizontal_8_c(s, pitch, blimit0, limit0, thresh0, bd);
+  vpx_highbd_lpf_horizontal_8_c(s + 8, pitch, blimit1, limit1, thresh1, bd);
 }
 
 void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit,
@@ -639,7 +650,7 @@ static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, uint8_t flat,
   }
 }
 
-static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p,
+static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int pitch,
                                             const uint8_t *blimit,
                                             const uint8_t *limit,
                                             const uint8_t *thresh, int count,
@@ -649,44 +660,45 @@ static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p,
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
   for (i = 0; i < 8 * count; ++i) {
-    const uint16_t p3 = s[-4 * p];
-    const uint16_t p2 = s[-3 * p];
-    const uint16_t p1 = s[-2 * p];
-    const uint16_t p0 = s[-p];
-    const uint16_t q0 = s[0 * p];
-    const uint16_t q1 = s[1 * p];
-    const uint16_t q2 = s[2 * p];
-    const uint16_t q3 = s[3 * p];
+    const uint16_t p3 = s[-4 * pitch];
+    const uint16_t p2 = s[-3 * pitch];
+    const uint16_t p1 = s[-2 * pitch];
+    const uint16_t p0 = s[-pitch];
+    const uint16_t q0 = s[0 * pitch];
+    const uint16_t q1 = s[1 * pitch];
+    const uint16_t q2 = s[2 * pitch];
+    const uint16_t q3 = s[3 * pitch];
     const int8_t mask =
         highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
     const int8_t flat =
         highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
-    const int8_t flat2 =
-        highbd_flat_mask5(1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0, q0,
-                          s[4 * p], s[5 * p], s[6 * p], s[7 * p], bd);
-
-    highbd_filter16(mask, *thresh, flat, flat2, s - 8 * p, s - 7 * p, s - 6 * p,
-                    s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s,
-                    s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p,
-                    s + 6 * p, s + 7 * p, bd);
+    const int8_t flat2 = highbd_flat_mask5(
+        1, s[-8 * pitch], s[-7 * pitch], s[-6 * pitch], s[-5 * pitch], p0, q0,
+        s[4 * pitch], s[5 * pitch], s[6 * pitch], s[7 * pitch], bd);
+
+    highbd_filter16(mask, *thresh, flat, flat2, s - 8 * pitch, s - 7 * pitch,
+                    s - 6 * pitch, s - 5 * pitch, s - 4 * pitch, s - 3 * pitch,
+                    s - 2 * pitch, s - 1 * pitch, s, s + 1 * pitch,
+                    s + 2 * pitch, s + 3 * pitch, s + 4 * pitch, s + 5 * pitch,
+                    s + 6 * pitch, s + 7 * pitch, bd);
     ++s;
   }
 }
 
-void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int p, const uint8_t *blimit,
-                                    const uint8_t *limit, const uint8_t *thresh,
-                                    int bd) {
-  highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1, bd);
+void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch,
+                                    const uint8_t *blimit, const uint8_t *limit,
+                                    const uint8_t *thresh, int bd) {
+  highbd_mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, 1, bd);
 }
 
-void vpx_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int p,
+void vpx_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int pitch,
                                          const uint8_t *blimit,
                                          const uint8_t *limit,
                                          const uint8_t *thresh, int bd) {
-  highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2, bd);
+  highbd_mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, 2, bd);
 }
 
-static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p,
+static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int pitch,
                                           const uint8_t *blimit,
                                           const uint8_t *limit,
                                           const uint8_t *thresh, int count,
@@ -712,20 +724,20 @@ static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p,
     highbd_filter16(mask, *thresh, flat, flat2, s - 8, s - 7, s - 6, s - 5,
                     s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4,
                     s + 5, s + 6, s + 7, bd);
-    s += p;
+    s += pitch;
   }
 }
 
-void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int p, const uint8_t *blimit,
+void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int pitch, const uint8_t *blimit,
                                   const uint8_t *limit, const uint8_t *thresh,
                                   int bd) {
-  highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8, bd);
+  highbd_mb_lpf_vertical_edge_w(s, pitch, blimit, limit, thresh, 8, bd);
 }
 
-void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int p,
+void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int pitch,
                                        const uint8_t *blimit,
                                        const uint8_t *limit,
                                        const uint8_t *thresh, int bd) {
-  highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16, bd);
+  highbd_mb_lpf_vertical_edge_w(s, pitch, blimit, limit, thresh, 16, bd);
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
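
Throughout this file the rename is `p` -> `pitch`: the parameter is the distance between vertically adjacent pixels, so for a horizontal edge the filter taps form a column around `s`, from `s[-4 * pitch]` (p3) down to `s[3 * pitch]` (q3). A small illustrative helper (not in libvpx), assuming nothing beyond that addressing:

#include <stdint.h>

/* Collect the eight pixels a horizontal loop filter reads in one column.
 * `s` points at q0, the first pixel below the edge, and `pitch` is the
 * distance between vertically adjacent pixels, i.e. one buffer row. */
static void gather_column(const uint8_t *s, int pitch, uint8_t out[8]) {
  int i;
  for (i = 0; i < 8; ++i) {
    out[i] = s[(i - 4) * pitch]; /* p3, p2, p1, p0, q0, q1, q2, q3 */
  }
}
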
diff --git a/vpx_dsp/mips/deblock_msa.c b/vpx_dsp/mips/deblock_msa.c
index 9ef0483..1707d32 100644
@@ -508,11 +508,11 @@ void vpx_post_proc_down_and_across_mb_row_msa(uint8_t *src, uint8_t *dst,
   }
 }
 
-void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch,
-                                   int32_t rows, int32_t cols, int32_t flimit) {
+void vpx_mbpost_proc_across_ip_msa(uint8_t *src, int32_t pitch, int32_t rows,
+                                   int32_t cols, int32_t flimit) {
   int32_t row, col, cnt;
-  uint8_t *src_dup = src_ptr;
-  v16u8 src0, src, tmp_orig;
+  uint8_t *src_dup = src;
+  v16u8 src0, src1, tmp_orig;
   v16u8 tmp = { 0 };
   v16i8 zero = { 0 };
   v8u16 sum_h, src_r_h, src_l_h;
@@ -531,13 +531,13 @@ void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch,
     src_dup[cols + 16] = src_dup[cols - 1];
     tmp_orig = (v16u8)__msa_ldi_b(0);
     tmp_orig[15] = tmp[15];
-    src = LD_UB(src_dup - 8);
-    src[15] = 0;
-    ILVRL_B2_UH(zero, src, src_r_h, src_l_h);
+    src1 = LD_UB(src_dup - 8);
+    src1[15] = 0;
+    ILVRL_B2_UH(zero, src1, src_r_h, src_l_h);
     src_r_w = __msa_dotp_u_w(src_r_h, src_r_h);
     src_r_w += __msa_dotp_u_w(src_l_h, src_l_h);
     sum_sq = HADD_SW_S32(src_r_w) + 16;
-    sum_h = __msa_hadd_u_h(src, src);
+    sum_h = __msa_hadd_u_h(src1, src1);
     sum = HADD_UH_U32(sum_h);
     {
       v16u8 src7, src8, src_r, src_l;
@@ -566,8 +566,8 @@ void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch,
           sum_l[cnt + 1] = sum_l[cnt] + sub_l[cnt + 1];
         }
         sum = sum_l[7];
-        src = LD_UB(src_dup + 16 * col);
-        ILVRL_B2_UH(zero, src, src_r_h, src_l_h);
+        src1 = LD_UB(src_dup + 16 * col);
+        ILVRL_B2_UH(zero, src1, src_r_h, src_l_h);
         src7 = (v16u8)((const8 + sum_r + (v8i16)src_r_h) >> 4);
         src8 = (v16u8)((const8 + sum_l + (v8i16)src_l_h) >> 4);
         tmp = (v16u8)__msa_pckev_b((v16i8)src8, (v16i8)src7);
@@ -613,7 +613,7 @@ void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch,
         total3 = (total3 < flimit_vec);
         PCKEV_H2_SH(total1, total0, total3, total2, mask0, mask1);
         mask = __msa_pckev_b((v16i8)mask1, (v16i8)mask0);
-        tmp = __msa_bmz_v(tmp, src, (v16u8)mask);
+        tmp = __msa_bmz_v(tmp, src1, (v16u8)mask);
 
         if (col == 0) {
           uint64_t src_d;
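
Here the parameter rename (`src_ptr` -> `src`) would have collided with the local vector `v16u8 src`, so the vector becomes `src1` in the same change. A hypothetical scalar example of the shadowing hazard being avoided:

#include <stdint.h>

int sum_bytes(const uint8_t *src, int n) {
  int total = 0;
  int i;
  for (i = 0; i < n; ++i) {
    /* A local also named `src` would shadow the parameter from here on
     * (-Wshadow territory); the diff sidesteps this by calling the
     * vector `src1` once the parameter takes the name `src`. */
    const uint8_t src1 = src[i];
    total += src1;
  }
  return total;
}
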
diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c
index e37ca92..82a6595 100644
@@ -17,7 +17,7 @@
 void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
                      const int16_t *round_ptr, const int16_t quant,
                      tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                     const int16_t dequant_ptr, uint16_t *eob_ptr) {
+                     const int16_t dequant, uint16_t *eob_ptr) {
   const int rc = 0;
   const int coeff = coeff_ptr[rc];
   const int coeff_sign = (coeff >> 31);
@@ -31,7 +31,7 @@ void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
     tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
     tmp = (tmp * quant) >> 16;
     qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
-    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr;
+    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant;
     if (tmp) eob = 0;
   }
   *eob_ptr = eob + 1;
@@ -41,7 +41,7 @@ void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
 void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
                             int skip_block, const int16_t *round_ptr,
                             const int16_t quant, tran_low_t *qcoeff_ptr,
-                            tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr,
+                            tran_low_t *dqcoeff_ptr, const int16_t dequant,
                             uint16_t *eob_ptr) {
   int eob = -1;
 
@@ -55,7 +55,7 @@ void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
     const int64_t tmp = abs_coeff + round_ptr[0];
     const int abs_qcoeff = (int)((tmp * quant) >> 16);
     qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
-    dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr;
+    dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant;
     if (abs_qcoeff) eob = 0;
   }
   *eob_ptr = eob + 1;
@@ -65,7 +65,7 @@ void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
 void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
                            const int16_t *round_ptr, const int16_t quant,
                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                           const int16_t dequant_ptr, uint16_t *eob_ptr) {
+                           const int16_t dequant, uint16_t *eob_ptr) {
   const int n_coeffs = 1024;
   const int rc = 0;
   const int coeff = coeff_ptr[rc];
@@ -81,7 +81,7 @@ void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
                 INT16_MIN, INT16_MAX);
     tmp = (tmp * quant) >> 15;
     qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
-    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 2;
+    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant / 2;
     if (tmp) eob = 0;
   }
   *eob_ptr = eob + 1;
@@ -92,8 +92,7 @@ void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
                                   const int16_t *round_ptr, const int16_t quant,
                                   tran_low_t *qcoeff_ptr,
                                   tran_low_t *dqcoeff_ptr,
-                                  const int16_t dequant_ptr,
-                                  uint16_t *eob_ptr) {
+                                  const int16_t dequant, uint16_t *eob_ptr) {
   const int n_coeffs = 1024;
   int eob = -1;
 
@@ -107,7 +106,7 @@ void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
     const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 1);
     const int abs_qcoeff = (int)((tmp * quant) >> 15);
     qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
-    dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr / 2;
+    dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant / 2;
     if (abs_qcoeff) eob = 0;
   }
   *eob_ptr = eob + 1;
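
In scalar form, the DC rule above is: clamp `|coeff| + round` to 16 bits, multiply by `quant` and shift right by 16 (15 for the 32x32 variant, which also halves the dequantized value), restore the sign, and multiply by `dequant`. A standalone sketch of the 4x4..16x16 path:

#include <stdint.h>
#include <stdlib.h>

static int32_t clamp16(int32_t v) {
  return v < INT16_MIN ? INT16_MIN : (v > INT16_MAX ? INT16_MAX : v);
}

/* Scalar restatement of the DC rule in vpx_quantize_dc() above: clamp
 * the rounded magnitude, scale by `quant` in Q16, restore the sign,
 * then dequantize. `sign * tmp` gives the same result as the
 * (tmp ^ coeff_sign) - coeff_sign trick in the original. */
static void quantize_dc_sketch(int32_t coeff, int16_t round, int16_t quant,
                               int16_t dequant, int32_t *qcoeff,
                               int32_t *dqcoeff) {
  const int sign = coeff < 0 ? -1 : 1;
  int32_t tmp = clamp16(abs(coeff) + round);
  tmp = (tmp * quant) >> 16;
  *qcoeff = sign * tmp;
  *dqcoeff = *qcoeff * dequant;
}
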
diff --git a/vpx_dsp/quantize.h b/vpx_dsp/quantize.h
index 94c8206..7cac140 100644
@@ -19,26 +19,25 @@ extern "C" {
 #endif
 
 void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
-                     const int16_t *round_ptr, const int16_t quant_ptr,
+                     const int16_t *round_ptr, const int16_t quant,
                      tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                     const int16_t dequant_ptr, uint16_t *eob_ptr);
+                     const int16_t dequant, uint16_t *eob_ptr);
 void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
-                           const int16_t *round_ptr, const int16_t quant_ptr,
+                           const int16_t *round_ptr, const int16_t quant,
                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                           const int16_t dequant_ptr, uint16_t *eob_ptr);
+                           const int16_t dequant, uint16_t *eob_ptr);
 
 #if CONFIG_VP9_HIGHBITDEPTH
 void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
                             int skip_block, const int16_t *round_ptr,
-                            const int16_t quant_ptr, tran_low_t *qcoeff_ptr,
-                            tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr,
+                            const int16_t quant, tran_low_t *qcoeff_ptr,
+                            tran_low_t *dqcoeff_ptr, const int16_t dequant,
                             uint16_t *eob_ptr);
 void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
-                                  const int16_t *round_ptr,
-                                  const int16_t quant_ptr,
+                                  const int16_t *round_ptr, const int16_t quant,
                                   tran_low_t *qcoeff_ptr,
                                   tran_low_t *dqcoeff_ptr,
-                                  const int16_t dequant_ptr, uint16_t *eob_ptr);
+                                  const int16_t dequant, uint16_t *eob_ptr);
 #endif
 
 #ifdef __cplusplus
diff --git a/vpx_dsp/sad.c b/vpx_dsp/sad.c
index d4a5329..873ddca 100644
 #include "vpx_ports/mem.h"
 
 /* Sum the difference between every corresponding element of the buffers. */
-static INLINE unsigned int sad(const uint8_t *a, int a_stride, const uint8_t *b,
-                               int b_stride, int width, int height) {
+static INLINE unsigned int sad(const uint8_t *src_ptr, int src_stride,
+                               const uint8_t *ref_ptr, int ref_stride,
+                               int width, int height) {
   int y, x;
   unsigned int sad = 0;
 
   for (y = 0; y < height; y++) {
-    for (x = 0; x < width; x++) sad += abs(a[x] - b[x]);
+    for (x = 0; x < width; x++) sad += abs(src_ptr[x] - ref_ptr[x]);
 
-    a += a_stride;
-    b += b_stride;
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
   }
   return sad;
 }
 
-#define sadMxN(m, n)                                                        \
-  unsigned int vpx_sad##m##x##n##_c(const uint8_t *src, int src_stride,     \
-                                    const uint8_t *ref, int ref_stride) {   \
-    return sad(src, src_stride, ref, ref_stride, m, n);                     \
-  }                                                                         \
-  unsigned int vpx_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \
-                                        const uint8_t *ref, int ref_stride, \
-                                        const uint8_t *second_pred) {       \
-    DECLARE_ALIGNED(16, uint8_t, comp_pred[m * n]);                         \
-    vpx_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride);     \
-    return sad(src, src_stride, comp_pred, m, m, n);                        \
+#define sadMxN(m, n)                                                          \
+  unsigned int vpx_sad##m##x##n##_c(const uint8_t *src_ptr, int src_stride,   \
+                                    const uint8_t *ref_ptr, int ref_stride) { \
+    return sad(src_ptr, src_stride, ref_ptr, ref_stride, m, n);               \
+  }                                                                           \
+  unsigned int vpx_sad##m##x##n##_avg_c(                                      \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
+      int ref_stride, const uint8_t *second_pred) {                           \
+    DECLARE_ALIGNED(16, uint8_t, comp_pred[m * n]);                           \
+    vpx_comp_avg_pred_c(comp_pred, second_pred, m, n, ref_ptr, ref_stride);   \
+    return sad(src_ptr, src_stride, comp_pred, m, m, n);                      \
   }
 
 // depending on call sites, pass **ref_array to avoid & in subsequent call and
 // de-dup with 4D below.
-#define sadMxNxK(m, n, k)                                                   \
-  void vpx_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride,       \
-                                  const uint8_t *ref_array, int ref_stride, \
-                                  uint32_t *sad_array) {                    \
-    int i;                                                                  \
-    for (i = 0; i < k; ++i)                                                 \
-      sad_array[i] =                                                        \
-          vpx_sad##m##x##n##_c(src, src_stride, &ref_array[i], ref_stride); \
+#define sadMxNxK(m, n, k)                                                     \
+  void vpx_sad##m##x##n##x##k##_c(const uint8_t *src_ptr, int src_stride,     \
+                                  const uint8_t *ref_ptr, int ref_stride,     \
+                                  uint32_t *sad_array) {                      \
+    int i;                                                                    \
+    for (i = 0; i < k; ++i)                                                   \
+      sad_array[i] =                                                          \
+          vpx_sad##m##x##n##_c(src_ptr, src_stride, &ref_ptr[i], ref_stride); \
   }
 
 // This appears to be equivalent to the above when k == 4 and refs is const
-#define sadMxNx4D(m, n)                                                    \
-  void vpx_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride,         \
-                               const uint8_t *const ref_array[],           \
-                               int ref_stride, uint32_t *sad_array) {      \
-    int i;                                                                 \
-    for (i = 0; i < 4; ++i)                                                \
-      sad_array[i] =                                                       \
-          vpx_sad##m##x##n##_c(src, src_stride, ref_array[i], ref_stride); \
+#define sadMxNx4D(m, n)                                                        \
+  void vpx_sad##m##x##n##x4d_c(const uint8_t *src_ptr, int src_stride,         \
+                               const uint8_t *const ref_array[],               \
+                               int ref_stride, uint32_t *sad_array) {          \
+    int i;                                                                     \
+    for (i = 0; i < 4; ++i)                                                    \
+      sad_array[i] =                                                           \
+          vpx_sad##m##x##n##_c(src_ptr, src_stride, ref_array[i], ref_stride); \
   }
 
 /* clang-format off */
@@ -133,60 +134,61 @@ sadMxNx4D(4, 4)
 
 #if CONFIG_VP9_HIGHBITDEPTH
         static INLINE
-    unsigned int highbd_sad(const uint8_t *a8, int a_stride, const uint8_t *b8,
-                            int b_stride, int width, int height) {
+    unsigned int highbd_sad(const uint8_t *src8_ptr, int src_stride,
+                            const uint8_t *ref8_ptr, int ref_stride, int width,
+                            int height) {
   int y, x;
   unsigned int sad = 0;
-  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
-  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr);
+  const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref8_ptr);
   for (y = 0; y < height; y++) {
-    for (x = 0; x < width; x++) sad += abs(a[x] - b[x]);
+    for (x = 0; x < width; x++) sad += abs(src[x] - ref_ptr[x]);
 
-    a += a_stride;
-    b += b_stride;
+    src += src_stride;
+    ref_ptr += ref_stride;
   }
   return sad;
 }
 
-static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride,
-                                       const uint16_t *b, int b_stride,
+static INLINE unsigned int highbd_sadb(const uint8_t *src8_ptr, int src_stride,
+                                       const uint16_t *ref_ptr, int ref_stride,
                                        int width, int height) {
   int y, x;
   unsigned int sad = 0;
-  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr);
   for (y = 0; y < height; y++) {
-    for (x = 0; x < width; x++) sad += abs(a[x] - b[x]);
+    for (x = 0; x < width; x++) sad += abs(src[x] - ref_ptr[x]);
 
-    a += a_stride;
-    b += b_stride;
+    src += src_stride;
+    ref_ptr += ref_stride;
   }
   return sad;
 }
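
The `src8_ptr`/`ref8_ptr` names reflect the high-bit-depth convention: 16-bit sample buffers travel through `uint8_t *` interfaces and are unpacked back to `uint16_t *` at the leaf functions. A sketch of that unpacking, under the assumption that CONVERT_TO_SHORTPTR (vpx_ports/mem.h) is a shift-based repacking of the pointer:

#include <stdint.h>

/* Assumption for illustration only; the real macro lives in
 * vpx_ports/mem.h and may differ. */
static uint16_t *convert_to_shortptr_sketch(uint8_t *p) {
  return (uint16_t *)((uintptr_t)p << 1);
}
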
 
 #define highbd_sadMxN(m, n)                                                    \
-  unsigned int vpx_highbd_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
-                                           const uint8_t *ref,                 \
-                                           int ref_stride) {                   \
-    return highbd_sad(src, src_stride, ref, ref_stride, m, n);                 \
+  unsigned int vpx_highbd_sad##m##x##n##_c(                                    \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,          \
+      int ref_stride) {                                                        \
+    return highbd_sad(src_ptr, src_stride, ref_ptr, ref_stride, m, n);         \
   }                                                                            \
   unsigned int vpx_highbd_sad##m##x##n##_avg_c(                                \
-      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,  \
-      const uint8_t *second_pred) {                                            \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,          \
+      int ref_stride, const uint8_t *second_pred) {                            \
     DECLARE_ALIGNED(16, uint16_t, comp_pred[m * n]);                           \
     vpx_highbd_comp_avg_pred_c(comp_pred, CONVERT_TO_SHORTPTR(second_pred), m, \
-                               n, CONVERT_TO_SHORTPTR(ref), ref_stride);       \
-    return highbd_sadb(src, src_stride, comp_pred, m, m, n);                   \
+                               n, CONVERT_TO_SHORTPTR(ref_ptr), ref_stride);   \
+    return highbd_sadb(src_ptr, src_stride, comp_pred, m, m, n);               \
   }
 
-#define highbd_sadMxNx4D(m, n)                                               \
-  void vpx_highbd_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride,    \
-                                      const uint8_t *const ref_array[],      \
-                                      int ref_stride, uint32_t *sad_array) { \
-    int i;                                                                   \
-    for (i = 0; i < 4; ++i) {                                                \
-      sad_array[i] = vpx_highbd_sad##m##x##n##_c(src, src_stride,            \
-                                                 ref_array[i], ref_stride);  \
-    }                                                                        \
+#define highbd_sadMxNx4D(m, n)                                                \
+  void vpx_highbd_sad##m##x##n##x4d_c(const uint8_t *src_ptr, int src_stride, \
+                                      const uint8_t *const ref_array[],       \
+                                      int ref_stride, uint32_t *sad_array) {  \
+    int i;                                                                    \
+    for (i = 0; i < 4; ++i) {                                                 \
+      sad_array[i] = vpx_highbd_sad##m##x##n##_c(src_ptr, src_stride,         \
+                                                 ref_array[i], ref_stride);   \
+    }                                                                         \
   }
 
 /* clang-format off */
diff --git a/vpx_dsp/subtract.c b/vpx_dsp/subtract.c
index 95e7071..45c819e 100644
 #include "vpx/vpx_integer.h"
 #include "vpx_ports/mem.h"
 
-void vpx_subtract_block_c(int rows, int cols, int16_t *diff,
-                          ptrdiff_t diff_stride, const uint8_t *src,
-                          ptrdiff_t src_stride, const uint8_t *pred,
+void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr,
+                          ptrdiff_t diff_stride, const uint8_t *src_ptr,
+                          ptrdiff_t src_stride, const uint8_t *pred_ptr,
                           ptrdiff_t pred_stride) {
   int r, c;
 
   for (r = 0; r < rows; r++) {
-    for (c = 0; c < cols; c++) diff[c] = src[c] - pred[c];
+    for (c = 0; c < cols; c++) diff_ptr[c] = src_ptr[c] - pred_ptr[c];
 
-    diff += diff_stride;
-    pred += pred_stride;
-    src += src_stride;
+    diff_ptr += diff_stride;
+    pred_ptr += pred_stride;
+    src_ptr += src_stride;
   }
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff,
-                                 ptrdiff_t diff_stride, const uint8_t *src8,
-                                 ptrdiff_t src_stride, const uint8_t *pred8,
+void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr,
+                                 ptrdiff_t diff_stride, const uint8_t *src8_ptr,
+                                 ptrdiff_t src_stride, const uint8_t *pred8_ptr,
                                  ptrdiff_t pred_stride, int bd) {
   int r, c;
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr);
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8_ptr);
   (void)bd;
 
   for (r = 0; r < rows; r++) {
     for (c = 0; c < cols; c++) {
-      diff[c] = src[c] - pred[c];
+      diff_ptr[c] = src[c] - pred[c];
     }
 
-    diff += diff_stride;
+    diff_ptr += diff_stride;
     pred += pred_stride;
     src += src_stride;
   }
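
vpx_subtract_block_c() computes the residual `diff = src - pred` row by row, advancing each buffer by its own stride. A hypothetical 4x4 usage with contiguous buffers, so every stride equals the block width:

#include <stddef.h>
#include <stdint.h>

void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr,
                          ptrdiff_t diff_stride, const uint8_t *src_ptr,
                          ptrdiff_t src_stride, const uint8_t *pred_ptr,
                          ptrdiff_t pred_stride);

/* After the call, diff[i] == i for every position. */
void residual_4x4_example(void) {
  uint8_t src[16], pred[16];
  int16_t diff[16];
  int i;
  for (i = 0; i < 16; ++i) {
    src[i] = (uint8_t)(100 + i);
    pred[i] = 100;
  }
  vpx_subtract_block_c(4, 4, diff, 4, src, 4, pred, 4);
}
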
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 13d4b3d..f4f89b9 100644
@@ -37,322 +37,322 @@ if ($opts{arch} eq "x86_64") {
 # Intra prediction
 #
 
-add_proto qw/void vpx_d207_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d207_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d207_predictor_4x4 sse2/;
 
-add_proto qw/void vpx_d45_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d45_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d45_predictor_4x4 neon sse2/;
 
-add_proto qw/void vpx_d45e_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d45e_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 
-add_proto qw/void vpx_d63_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d63_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d63_predictor_4x4 ssse3/;
 
-add_proto qw/void vpx_d63e_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d63e_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 
-add_proto qw/void vpx_h_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_h_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_h_predictor_4x4 neon dspr2 msa sse2 vsx/;
 
-add_proto qw/void vpx_he_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_he_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 
-add_proto qw/void vpx_d117_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d117_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 
-add_proto qw/void vpx_d135_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d135_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d135_predictor_4x4 neon/;
 
-add_proto qw/void vpx_d153_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d153_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d153_predictor_4x4 ssse3/;
 
-add_proto qw/void vpx_v_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_v_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_v_predictor_4x4 neon msa sse2/;
 
-add_proto qw/void vpx_ve_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_ve_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 
-add_proto qw/void vpx_tm_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_tm_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_tm_predictor_4x4 neon dspr2 msa sse2 vsx/;
 
-add_proto qw/void vpx_dc_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_predictor_4x4 dspr2 msa neon sse2/;
 
-add_proto qw/void vpx_dc_top_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_top_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_top_predictor_4x4 msa neon sse2/;
 
-add_proto qw/void vpx_dc_left_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_left_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_left_predictor_4x4 msa neon sse2/;
 
-add_proto qw/void vpx_dc_128_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_128_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_128_predictor_4x4 msa neon sse2/;
 
-add_proto qw/void vpx_d207_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d207_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d207_predictor_8x8 ssse3/;
 
-add_proto qw/void vpx_d45_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d45_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d45_predictor_8x8 neon sse2 vsx/;
 
-add_proto qw/void vpx_d63_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d63_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d63_predictor_8x8 ssse3 vsx/;
 
-add_proto qw/void vpx_h_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_h_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_h_predictor_8x8 neon dspr2 msa sse2 vsx/;
 
-add_proto qw/void vpx_d117_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d117_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 
-add_proto qw/void vpx_d135_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d135_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d135_predictor_8x8 neon/;
 
-add_proto qw/void vpx_d153_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d153_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d153_predictor_8x8 ssse3/;
 
-add_proto qw/void vpx_v_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_v_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_v_predictor_8x8 neon msa sse2/;
 
-add_proto qw/void vpx_tm_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_tm_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_tm_predictor_8x8 neon dspr2 msa sse2 vsx/;
 
-add_proto qw/void vpx_dc_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_predictor_8x8 dspr2 neon msa sse2 vsx/;
 
-add_proto qw/void vpx_dc_top_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_top_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_top_predictor_8x8 neon msa sse2/;
 
-add_proto qw/void vpx_dc_left_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_left_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_left_predictor_8x8 neon msa sse2/;
 
-add_proto qw/void vpx_dc_128_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_128_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_128_predictor_8x8 neon msa sse2/;
 
-add_proto qw/void vpx_d207_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d207_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d207_predictor_16x16 ssse3/;
 
-add_proto qw/void vpx_d45_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d45_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d45_predictor_16x16 neon ssse3 vsx/;
 
-add_proto qw/void vpx_d63_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d63_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d63_predictor_16x16 ssse3 vsx/;
 
-add_proto qw/void vpx_h_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_h_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_h_predictor_16x16 neon dspr2 msa sse2 vsx/;
 
-add_proto qw/void vpx_d117_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d117_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 
-add_proto qw/void vpx_d135_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d135_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d135_predictor_16x16 neon/;
 
-add_proto qw/void vpx_d153_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d153_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d153_predictor_16x16 ssse3/;
 
-add_proto qw/void vpx_v_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_v_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_v_predictor_16x16 neon msa sse2 vsx/;
 
-add_proto qw/void vpx_tm_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_tm_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_tm_predictor_16x16 neon msa sse2 vsx/;
 
-add_proto qw/void vpx_dc_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_predictor_16x16 dspr2 neon msa sse2 vsx/;
 
-add_proto qw/void vpx_dc_top_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_top_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_top_predictor_16x16 neon msa sse2 vsx/;
 
-add_proto qw/void vpx_dc_left_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_left_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_left_predictor_16x16 neon msa sse2 vsx/;
 
-add_proto qw/void vpx_dc_128_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_128_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_128_predictor_16x16 neon msa sse2 vsx/;
 
-add_proto qw/void vpx_d207_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d207_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d207_predictor_32x32 ssse3/;
 
-add_proto qw/void vpx_d45_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d45_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d45_predictor_32x32 neon ssse3 vsx/;
 
-add_proto qw/void vpx_d63_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d63_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d63_predictor_32x32 ssse3 vsx/;
 
-add_proto qw/void vpx_h_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_h_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_h_predictor_32x32 neon msa sse2 vsx/;
 
-add_proto qw/void vpx_d117_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d117_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 
-add_proto qw/void vpx_d135_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d135_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d135_predictor_32x32 neon/;
 
-add_proto qw/void vpx_d153_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d153_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d153_predictor_32x32 ssse3/;
 
-add_proto qw/void vpx_v_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_v_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_v_predictor_32x32 neon msa sse2 vsx/;
 
-add_proto qw/void vpx_tm_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_tm_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_tm_predictor_32x32 neon msa sse2 vsx/;
 
-add_proto qw/void vpx_dc_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_predictor_32x32 msa neon sse2 vsx/;
 
-add_proto qw/void vpx_dc_top_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_top_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_top_predictor_32x32 msa neon sse2 vsx/;
 
-add_proto qw/void vpx_dc_left_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_left_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_left_predictor_32x32 msa neon sse2 vsx/;
 
-add_proto qw/void vpx_dc_128_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_128_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_128_predictor_32x32 msa neon sse2 vsx/;
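
Each add_proto/specialize pair drives run-time CPU dispatch: the quoted parameter string is pasted verbatim into the generated prototypes, which is why the `y_stride` -> `stride` rename here has to track the C definitions. The generated vpx_dsp_rtcd.h entry for one predictor looks roughly like this (shape recalled from typical rtcd output, so treat it as approximate):

/* Approximate shape of the generated entry for vpx_dc_predictor_4x4;
 * the real header is produced at build time. */
void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
                            const uint8_t *above, const uint8_t *left);
void vpx_dc_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left);
RTCD_EXTERN void (*vpx_dc_predictor_4x4)(uint8_t *dst, ptrdiff_t stride,
                                         const uint8_t *above,
                                         const uint8_t *left);
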
 
 # High bitdepth functions
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
-  add_proto qw/void vpx_highbd_d207_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d207_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d207_predictor_4x4 sse2/;
 
-  add_proto qw/void vpx_highbd_d45_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d45_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d45_predictor_4x4 neon ssse3/;
 
-  add_proto qw/void vpx_highbd_d63_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d63_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d63_predictor_4x4 sse2/;
 
-  add_proto qw/void vpx_highbd_h_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_h_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_h_predictor_4x4 neon sse2/;
 
-  add_proto qw/void vpx_highbd_d117_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d117_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d117_predictor_4x4 sse2/;
 
-  add_proto qw/void vpx_highbd_d135_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d135_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d135_predictor_4x4 neon sse2/;
 
-  add_proto qw/void vpx_highbd_d153_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d153_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d153_predictor_4x4 sse2/;
 
-  add_proto qw/void vpx_highbd_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_v_predictor_4x4 neon sse2/;
 
-  add_proto qw/void vpx_highbd_tm_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_tm_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_tm_predictor_4x4 neon sse2/;
 
-  add_proto qw/void vpx_highbd_dc_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_dc_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_predictor_4x4 neon sse2/;
 
-  add_proto qw/void vpx_highbd_dc_top_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_dc_top_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_top_predictor_4x4 neon sse2/;
 
-  add_proto qw/void vpx_highbd_dc_left_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_dc_left_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_left_predictor_4x4 neon sse2/;
 
-  add_proto qw/void vpx_highbd_dc_128_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_dc_128_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_128_predictor_4x4 neon sse2/;
 
-  add_proto qw/void vpx_highbd_d207_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d207_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d207_predictor_8x8 ssse3/;
 
-  add_proto qw/void vpx_highbd_d45_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d45_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d45_predictor_8x8 neon ssse3/;
 
-  add_proto qw/void vpx_highbd_d63_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d63_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d63_predictor_8x8 ssse3/;
 
-  add_proto qw/void vpx_highbd_h_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_h_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_h_predictor_8x8 neon sse2/;
 
-  add_proto qw/void vpx_highbd_d117_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d117_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d117_predictor_8x8 ssse3/;
 
-  add_proto qw/void vpx_highbd_d135_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d135_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d135_predictor_8x8 neon ssse3/;
 
-  add_proto qw/void vpx_highbd_d153_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d153_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d153_predictor_8x8 ssse3/;
 
-  add_proto qw/void vpx_highbd_v_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_v_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_v_predictor_8x8 neon sse2/;
 
-  add_proto qw/void vpx_highbd_tm_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_tm_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_tm_predictor_8x8 neon sse2/;
 
-  add_proto qw/void vpx_highbd_dc_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_dc_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_predictor_8x8 neon sse2/;
 
-  add_proto qw/void vpx_highbd_dc_top_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_dc_top_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_top_predictor_8x8 neon sse2/;
 
-  add_proto qw/void vpx_highbd_dc_left_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_dc_left_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_left_predictor_8x8 neon sse2/;
 
-  add_proto qw/void vpx_highbd_dc_128_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_dc_128_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_128_predictor_8x8 neon sse2/;
 
-  add_proto qw/void vpx_highbd_d207_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d207_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d207_predictor_16x16 ssse3/;
 
-  add_proto qw/void vpx_highbd_d45_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d45_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d45_predictor_16x16 neon ssse3/;
 
-  add_proto qw/void vpx_highbd_d63_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d63_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d63_predictor_16x16 ssse3/;
 
-  add_proto qw/void vpx_highbd_h_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_h_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_h_predictor_16x16 neon sse2/;
 
-  add_proto qw/void vpx_highbd_d117_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d117_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d117_predictor_16x16 ssse3/;
 
-  add_proto qw/void vpx_highbd_d135_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d135_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d135_predictor_16x16 neon ssse3/;
 
-  add_proto qw/void vpx_highbd_d153_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d153_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d153_predictor_16x16 ssse3/;
 
-  add_proto qw/void vpx_highbd_v_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_v_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_v_predictor_16x16 neon sse2/;
 
-  add_proto qw/void vpx_highbd_tm_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_tm_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_tm_predictor_16x16 neon sse2/;
 
-  add_proto qw/void vpx_highbd_dc_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_dc_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_predictor_16x16 neon sse2/;
 
-  add_proto qw/void vpx_highbd_dc_top_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_dc_top_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_top_predictor_16x16 neon sse2/;
 
-  add_proto qw/void vpx_highbd_dc_left_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_dc_left_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_left_predictor_16x16 neon sse2/;
 
-  add_proto qw/void vpx_highbd_dc_128_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_dc_128_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_128_predictor_16x16 neon sse2/;
 
-  add_proto qw/void vpx_highbd_d207_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d207_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d207_predictor_32x32 ssse3/;
 
-  add_proto qw/void vpx_highbd_d45_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d45_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d45_predictor_32x32 neon ssse3/;
 
-  add_proto qw/void vpx_highbd_d63_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d63_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d63_predictor_32x32 ssse3/;
 
-  add_proto qw/void vpx_highbd_h_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_h_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_h_predictor_32x32 neon sse2/;
 
-  add_proto qw/void vpx_highbd_d117_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d117_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d117_predictor_32x32 ssse3/;
 
-  add_proto qw/void vpx_highbd_d135_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d135_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d135_predictor_32x32 neon ssse3/;
 
-  add_proto qw/void vpx_highbd_d153_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d153_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d153_predictor_32x32 ssse3/;
 
-  add_proto qw/void vpx_highbd_v_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_v_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_v_predictor_32x32 neon sse2/;
 
-  add_proto qw/void vpx_highbd_tm_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_tm_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_tm_predictor_32x32 neon sse2/;
 
-  add_proto qw/void vpx_highbd_dc_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_dc_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_predictor_32x32 neon sse2/;
 
-  add_proto qw/void vpx_highbd_dc_top_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_dc_top_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_top_predictor_32x32 neon sse2/;
 
-  add_proto qw/void vpx_highbd_dc_left_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_dc_left_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_left_predictor_32x32 neon sse2/;
 
-  add_proto qw/void vpx_highbd_dc_128_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_dc_128_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_128_predictor_32x32 neon sse2/;
 }  # CONFIG_VP9_HIGHBITDEPTH
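Note: each add_proto/specialize pair above drives the RTCD (run-time CPU
detection) generator, so the parameter names in this file become the ones the
generated header declares and that clang-tidy compares implementations
against. Roughly, one entry expands to declarations like the following (a
sketch, not the literal generated output):

    void vpx_highbd_dc_predictor_4x4_c(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd);
    void vpx_highbd_dc_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd);
    RTCD_EXTERN void (*vpx_highbd_dc_predictor_4x4)(uint16_t *dst,
                                                    ptrdiff_t stride,
                                                    const uint16_t *above,
                                                    const uint16_t *left,
                                                    int bd);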
 
@@ -400,28 +400,28 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   #
   # Sub Pixel Filters
   #
-  add_proto qw/void vpx_highbd_convolve_copy/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";
+  add_proto qw/void vpx_highbd_convolve_copy/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
   specialize qw/vpx_highbd_convolve_copy sse2 avx2 neon/;
 
-  add_proto qw/void vpx_highbd_convolve_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";
+  add_proto qw/void vpx_highbd_convolve_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
   specialize qw/vpx_highbd_convolve_avg sse2 avx2 neon/;
 
-  add_proto qw/void vpx_highbd_convolve8/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";
+  add_proto qw/void vpx_highbd_convolve8/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
   specialize qw/vpx_highbd_convolve8 avx2 neon/, "$sse2_x86_64";
 
-  add_proto qw/void vpx_highbd_convolve8_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";
+  add_proto qw/void vpx_highbd_convolve8_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
   specialize qw/vpx_highbd_convolve8_horiz avx2 neon/, "$sse2_x86_64";
 
-  add_proto qw/void vpx_highbd_convolve8_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";
+  add_proto qw/void vpx_highbd_convolve8_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
   specialize qw/vpx_highbd_convolve8_vert avx2 neon/, "$sse2_x86_64";
 
-  add_proto qw/void vpx_highbd_convolve8_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";
+  add_proto qw/void vpx_highbd_convolve8_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
   specialize qw/vpx_highbd_convolve8_avg avx2 neon/, "$sse2_x86_64";
 
-  add_proto qw/void vpx_highbd_convolve8_avg_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";
+  add_proto qw/void vpx_highbd_convolve8_avg_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
   specialize qw/vpx_highbd_convolve8_avg_horiz avx2 neon/, "$sse2_x86_64";
 
-  add_proto qw/void vpx_highbd_convolve8_avg_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";
+  add_proto qw/void vpx_highbd_convolve8_avg_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
   specialize qw/vpx_highbd_convolve8_avg_vert avx2 neon/, "$sse2_x86_64";
 }  # CONFIG_VP9_HIGHBITDEPTH
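Note: the final parameter of every highbd convolve prototype is now
consistently the bit depth (bd) rather than bits per sample (bps). A minimal
call sketch, assuming 10-bit src/dst buffers are already set up; the copy
variant ignores the filter and *_q4 arguments:

    vpx_highbd_convolve_copy(src, src_stride, dst, dst_stride,
                             NULL /* unused by copy */, 0, 16, 0, 16,
                             32, 32, /*bd=*/10);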
 
@@ -888,43 +888,43 @@ specialize qw/vpx_sad4x4x8 sse4_1 msa mmi/;
 #
 # Multi-block SAD, comparing a reference to N independent blocks
 #
-add_proto qw/void vpx_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
 specialize qw/vpx_sad64x64x4d avx512 avx2 neon msa sse2 vsx mmi/;
 
-add_proto qw/void vpx_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
 specialize qw/vpx_sad64x32x4d neon msa sse2 vsx mmi/;
 
-add_proto qw/void vpx_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
 specialize qw/vpx_sad32x64x4d neon msa sse2 vsx mmi/;
 
-add_proto qw/void vpx_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
 specialize qw/vpx_sad32x32x4d avx2 neon msa sse2 vsx mmi/;
 
-add_proto qw/void vpx_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
 specialize qw/vpx_sad32x16x4d neon msa sse2 vsx mmi/;
 
-add_proto qw/void vpx_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
 specialize qw/vpx_sad16x32x4d neon msa sse2 vsx mmi/;
 
-add_proto qw/void vpx_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
 specialize qw/vpx_sad16x16x4d neon msa sse2 vsx mmi/;
 
-add_proto qw/void vpx_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
 specialize qw/vpx_sad16x8x4d neon msa sse2 vsx mmi/;
 
-add_proto qw/void vpx_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
 specialize qw/vpx_sad8x16x4d neon msa sse2 mmi/;
 
-add_proto qw/void vpx_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
 specialize qw/vpx_sad8x8x4d neon msa sse2 mmi/;
 
-add_proto qw/void vpx_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
 specialize qw/vpx_sad8x4x4d neon msa sse2 mmi/;
 
-add_proto qw/void vpx_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
 specialize qw/vpx_sad4x8x4d neon msa sse2 mmi/;
 
-add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
 specialize qw/vpx_sad4x4x4d neon msa sse2 mmi/;
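Note: the x4d functions compute four SADs in one call, one source block
against four independent reference candidates, which is the hot path of
motion search. A usage sketch, assuming src_ptr and ref_frame point into
valid frames and off0..off3 are hypothetical candidate offsets:

    const uint8_t *const ref_array[4] = { ref_frame + off0, ref_frame + off1,
                                          ref_frame + off2, ref_frame + off3 };
    uint32_t sad_array[4];
    vpx_sad16x16x4d(src_ptr, src_stride, ref_array, ref_stride, sad_array);
    /* sad_array[i] now holds the SAD against candidate i. */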
 
 add_proto qw/uint64_t vpx_sum_squares_2d_i16/, "const int16_t *src, int stride, int size";
@@ -945,7 +945,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   #
   # Block subtraction
   #
-  add_proto qw/void vpx_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
+  add_proto qw/void vpx_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src8_ptr, ptrdiff_t src_stride, const uint8_t *pred8_ptr, ptrdiff_t pred_stride, int bd";
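Note: the new src8_ptr/pred8_ptr names follow the libvpx convention that an
8-suffixed uint8_t pointer actually addresses uint16_t samples in
high-bitdepth builds. Implementations recover the real pointer with the usual
conversion macro (a sketch of the pattern; CONVERT_TO_SHORTPTR comes from
vpx_ports/mem.h):

    uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr);
    uint16_t *pred = CONVERT_TO_SHORTPTR(pred8_ptr);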
 
   #
   # Single block SAD
@@ -990,13 +990,13 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   #
   # Avg
   #
-  add_proto qw/unsigned int vpx_highbd_avg_8x8/, "const uint8_t *, int p";
+  add_proto qw/unsigned int vpx_highbd_avg_8x8/, "const uint8_t *s8, int p";
   specialize qw/vpx_highbd_avg_8x8 sse2/;
 
-  add_proto qw/unsigned int vpx_highbd_avg_4x4/, "const uint8_t *, int p";
+  add_proto qw/unsigned int vpx_highbd_avg_4x4/, "const uint8_t *s8, int p";
   specialize qw/vpx_highbd_avg_4x4 sse2/;
 
-  add_proto qw/void vpx_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
+  add_proto qw/void vpx_highbd_minmax_8x8/, "const uint8_t *s8, int p, const uint8_t *d8, int dp, int *min, int *max";
 
   add_proto qw/unsigned int vpx_highbd_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
   specialize qw/vpx_highbd_sad64x64_avg sse2/;
@@ -1038,43 +1038,43 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   #
   # Multi-block SAD, comparing a reference to N independent blocks
   #
-  add_proto qw/void vpx_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  add_proto qw/void vpx_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
   specialize qw/vpx_highbd_sad64x64x4d sse2/;
 
-  add_proto qw/void vpx_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  add_proto qw/void vpx_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
   specialize qw/vpx_highbd_sad64x32x4d sse2/;
 
-  add_proto qw/void vpx_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  add_proto qw/void vpx_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
   specialize qw/vpx_highbd_sad32x64x4d sse2/;
 
-  add_proto qw/void vpx_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  add_proto qw/void vpx_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
   specialize qw/vpx_highbd_sad32x32x4d sse2/;
 
-  add_proto qw/void vpx_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  add_proto qw/void vpx_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
   specialize qw/vpx_highbd_sad32x16x4d sse2/;
 
-  add_proto qw/void vpx_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  add_proto qw/void vpx_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
   specialize qw/vpx_highbd_sad16x32x4d sse2/;
 
-  add_proto qw/void vpx_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  add_proto qw/void vpx_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
   specialize qw/vpx_highbd_sad16x16x4d sse2/;
 
-  add_proto qw/void vpx_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  add_proto qw/void vpx_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
   specialize qw/vpx_highbd_sad16x8x4d sse2/;
 
-  add_proto qw/void vpx_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  add_proto qw/void vpx_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
   specialize qw/vpx_highbd_sad8x16x4d sse2/;
 
-  add_proto qw/void vpx_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  add_proto qw/void vpx_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
   specialize qw/vpx_highbd_sad8x8x4d sse2/;
 
-  add_proto qw/void vpx_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  add_proto qw/void vpx_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
   specialize qw/vpx_highbd_sad8x4x4d sse2/;
 
-  add_proto qw/void vpx_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  add_proto qw/void vpx_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
   specialize qw/vpx_highbd_sad4x8x4d sse2/;
 
-  add_proto qw/void vpx_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  add_proto qw/void vpx_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
   specialize qw/vpx_highbd_sad4x4x4d sse2/;
 
   #
@@ -1610,7 +1610,7 @@ if (vpx_config("CONFIG_POSTPROC") eq "yes" || vpx_config("CONFIG_VP9_POSTPROC")
     add_proto qw/void vpx_mbpost_proc_down/, "unsigned char *dst, int pitch, int rows, int cols,int flimit";
     specialize qw/vpx_mbpost_proc_down sse2 neon msa vsx/;
 
-    add_proto qw/void vpx_mbpost_proc_across_ip/, "unsigned char *dst, int pitch, int rows, int cols,int flimit";
+    add_proto qw/void vpx_mbpost_proc_across_ip/, "unsigned char *src, int pitch, int rows, int cols,int flimit";
     specialize qw/vpx_mbpost_proc_across_ip sse2 neon msa vsx/;
 
     add_proto qw/void vpx_post_proc_down_and_across_mb_row/, "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size";
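Note: vpx_mbpost_proc_across_ip filters the frame in place, reading and
writing the same buffer, so its first parameter is now named src rather than
dst. Call sketch, with the buffer and its dimensions assumed:

    vpx_mbpost_proc_across_ip(frame_buf, pitch, rows, cols, flimit);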
vpx_dsp/x86/avg_pred_sse2.c
index e7db755..e4e1e0e 100644 (file)
 #include "vpx/vpx_integer.h"
 #include "vpx_dsp/x86/mem_sse2.h"
 
-void vpx_comp_avg_pred_sse2(uint8_t *comp, const uint8_t *pred, int width,
+void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width,
                             int height, const uint8_t *ref, int ref_stride) {
-  /* comp and pred must be 16 byte aligned. */
-  assert(((intptr_t)comp & 0xf) == 0);
+  /* comp_pred and pred must be 16 byte aligned. */
+  assert(((intptr_t)comp_pred & 0xf) == 0);
   assert(((intptr_t)pred & 0xf) == 0);
   if (width > 8) {
     int x, y;
@@ -27,17 +27,17 @@ void vpx_comp_avg_pred_sse2(uint8_t *comp, const uint8_t *pred, int width,
         const __m128i p = _mm_load_si128((const __m128i *)(pred + x));
         const __m128i r = _mm_loadu_si128((const __m128i *)(ref + x));
         const __m128i avg = _mm_avg_epu8(p, r);
-        _mm_store_si128((__m128i *)(comp + x), avg);
+        _mm_store_si128((__m128i *)(comp_pred + x), avg);
       }
-      comp += width;
+      comp_pred += width;
       pred += width;
       ref += ref_stride;
     }
   } else {  // width must be 4 or 8.
     int i;
-    // Process 16 elements at a time. comp and pred have width == stride and
-    // therefore live in contigious memory. 4*4, 4*8, 8*4, 8*8, and 8*16 are all
-    // divisible by 16 so just ref needs to be massaged when loading.
+    // Process 16 elements at a time. comp_pred and pred have width == stride
+    // and therefore live in contiguous memory. 4*4, 4*8, 8*4, 8*8, and 8*16 are
+    // all divisible by 16 so just ref needs to be massaged when loading.
     for (i = 0; i < width * height; i += 16) {
       const __m128i p = _mm_load_si128((const __m128i *)pred);
       __m128i r;
@@ -60,10 +60,10 @@ void vpx_comp_avg_pred_sse2(uint8_t *comp, const uint8_t *pred, int width,
         ref += 2 * ref_stride;
       }
       avg = _mm_avg_epu8(p, r);
-      _mm_store_si128((__m128i *)comp, avg);
+      _mm_store_si128((__m128i *)comp_pred, avg);
 
       pred += 16;
-      comp += 16;
+      comp_pred += 16;
     }
   }
 }
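Note: _mm_avg_epu8 computes a round-to-nearest average, (a + b + 1) >> 1 per
byte, so the scalar equivalent of the function above is (sketch; comp_pred
and pred use width as their stride, as the comments note):

    int x, y;
    for (y = 0; y < height; ++y) {
      for (x = 0; x < width; ++x) {
        comp_pred[y * width + x] =
            (pred[y * width + x] + ref[y * ref_stride + x] + 1) >> 1;
      }
    }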
vpx_dsp/x86/convolve.h
index aa60f44..8398ec3 100644 (file)
@@ -23,59 +23,59 @@ typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch,
 #define FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt)         \
   void vpx_convolve8_##name##_##opt(                                         \
       const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,                \
-      ptrdiff_t dst_stride, const InterpKernel *filter_kernel, int x0_q4,    \
+      ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4,           \
       int x_step_q4, int y0_q4, int y_step_q4, int w, int h) {               \
-    const int16_t *filter = filter_kernel[offset];                           \
+    const int16_t *filter_row = filter[offset];                              \
     (void)x0_q4;                                                             \
     (void)x_step_q4;                                                         \
     (void)y0_q4;                                                             \
     (void)y_step_q4;                                                         \
-    assert(filter[3] != 128);                                                \
+    assert(filter_row[3] != 128);                                            \
     assert(step_q4 == 16);                                                   \
-    if (filter[0] | filter[1] | filter[6] | filter[7]) {                     \
+    if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) {     \
       while (w >= 16) {                                                      \
         vpx_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \
-                                                 dst_stride, h, filter);     \
+                                                 dst_stride, h, filter_row); \
         src += 16;                                                           \
         dst += 16;                                                           \
         w -= 16;                                                             \
       }                                                                      \
       if (w == 8) {                                                          \
         vpx_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst,  \
-                                                dst_stride, h, filter);      \
+                                                dst_stride, h, filter_row);  \
       } else if (w == 4) {                                                   \
         vpx_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst,  \
-                                                dst_stride, h, filter);      \
+                                                dst_stride, h, filter_row);  \
       }                                                                      \
-    } else if (filter[2] | filter[5]) {                                      \
+    } else if (filter_row[2] | filter_row[5]) {                              \
       while (w >= 16) {                                                      \
         vpx_filter_block1d16_##dir##4_##avg##opt(src_start, src_stride, dst, \
-                                                 dst_stride, h, filter);     \
+                                                 dst_stride, h, filter_row); \
         src += 16;                                                           \
         dst += 16;                                                           \
         w -= 16;                                                             \
       }                                                                      \
       if (w == 8) {                                                          \
         vpx_filter_block1d8_##dir##4_##avg##opt(src_start, src_stride, dst,  \
-                                                dst_stride, h, filter);      \
+                                                dst_stride, h, filter_row);  \
       } else if (w == 4) {                                                   \
         vpx_filter_block1d4_##dir##4_##avg##opt(src_start, src_stride, dst,  \
-                                                dst_stride, h, filter);      \
+                                                dst_stride, h, filter_row);  \
       }                                                                      \
     } else {                                                                 \
       while (w >= 16) {                                                      \
         vpx_filter_block1d16_##dir##2_##avg##opt(src, src_stride, dst,       \
-                                                 dst_stride, h, filter);     \
+                                                 dst_stride, h, filter_row); \
         src += 16;                                                           \
         dst += 16;                                                           \
         w -= 16;                                                             \
       }                                                                      \
       if (w == 8) {                                                          \
         vpx_filter_block1d8_##dir##2_##avg##opt(src, src_stride, dst,        \
-                                                dst_stride, h, filter);      \
+                                                dst_stride, h, filter_row);  \
       } else if (w == 4) {                                                   \
         vpx_filter_block1d4_##dir##2_##avg##opt(src, src_stride, dst,        \
-                                                dst_stride, h, filter);      \
+                                                dst_stride, h, filter_row);  \
       }                                                                      \
     }                                                                        \
   }
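Note: the branch structure above selects the cheapest kernel for the filter
actually in use: all eight taps when the outer taps are nonzero, four taps
when only the inner quad is live, otherwise the two-tap bilinear path. The
same test written as a plain helper (illustrative only):

    static int effective_taps(const int16_t *filter_row) {
      if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7])
        return 8;
      if (filter_row[2] | filter_row[5]) return 4;
      return 2;  /* only filter_row[3] and filter_row[4] are nonzero */
    }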
@@ -121,86 +121,86 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr,
                                        unsigned int output_height,
                                        const int16_t *filter, int bd);
 
-#define HIGH_FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt)     \
-  void vpx_highbd_convolve8_##name##_##opt(                                   \
-      const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,               \
-      ptrdiff_t dst_stride, const InterpKernel *filter_kernel, int x0_q4,     \
-      int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) {        \
-    const int16_t *filter = filter_kernel[offset];                            \
-    if (step_q4 == 16 && filter[3] != 128) {                                  \
-      if (filter[0] | filter[1] | filter[6] | filter[7]) {                    \
-        while (w >= 16) {                                                     \
-          vpx_highbd_filter_block1d16_##dir##8_##avg##opt(                    \
-              src_start, src_stride, dst, dst_stride, h, filter, bd);         \
-          src += 16;                                                          \
-          dst += 16;                                                          \
-          w -= 16;                                                            \
-        }                                                                     \
-        while (w >= 8) {                                                      \
-          vpx_highbd_filter_block1d8_##dir##8_##avg##opt(                     \
-              src_start, src_stride, dst, dst_stride, h, filter, bd);         \
-          src += 8;                                                           \
-          dst += 8;                                                           \
-          w -= 8;                                                             \
-        }                                                                     \
-        while (w >= 4) {                                                      \
-          vpx_highbd_filter_block1d4_##dir##8_##avg##opt(                     \
-              src_start, src_stride, dst, dst_stride, h, filter, bd);         \
-          src += 4;                                                           \
-          dst += 4;                                                           \
-          w -= 4;                                                             \
-        }                                                                     \
-      } else if (filter[2] | filter[5]) {                                     \
-        while (w >= 16) {                                                     \
-          vpx_highbd_filter_block1d16_##dir##4_##avg##opt(                    \
-              src_start, src_stride, dst, dst_stride, h, filter, bd);         \
-          src += 16;                                                          \
-          dst += 16;                                                          \
-          w -= 16;                                                            \
-        }                                                                     \
-        while (w >= 8) {                                                      \
-          vpx_highbd_filter_block1d8_##dir##4_##avg##opt(                     \
-              src_start, src_stride, dst, dst_stride, h, filter, bd);         \
-          src += 8;                                                           \
-          dst += 8;                                                           \
-          w -= 8;                                                             \
-        }                                                                     \
-        while (w >= 4) {                                                      \
-          vpx_highbd_filter_block1d4_##dir##4_##avg##opt(                     \
-              src_start, src_stride, dst, dst_stride, h, filter, bd);         \
-          src += 4;                                                           \
-          dst += 4;                                                           \
-          w -= 4;                                                             \
-        }                                                                     \
-      } else {                                                                \
-        while (w >= 16) {                                                     \
-          vpx_highbd_filter_block1d16_##dir##2_##avg##opt(                    \
-              src, src_stride, dst, dst_stride, h, filter, bd);               \
-          src += 16;                                                          \
-          dst += 16;                                                          \
-          w -= 16;                                                            \
-        }                                                                     \
-        while (w >= 8) {                                                      \
-          vpx_highbd_filter_block1d8_##dir##2_##avg##opt(                     \
-              src, src_stride, dst, dst_stride, h, filter, bd);               \
-          src += 8;                                                           \
-          dst += 8;                                                           \
-          w -= 8;                                                             \
-        }                                                                     \
-        while (w >= 4) {                                                      \
-          vpx_highbd_filter_block1d4_##dir##2_##avg##opt(                     \
-              src, src_stride, dst, dst_stride, h, filter, bd);               \
-          src += 4;                                                           \
-          dst += 4;                                                           \
-          w -= 4;                                                             \
-        }                                                                     \
-      }                                                                       \
-    }                                                                         \
-    if (w) {                                                                  \
-      vpx_highbd_convolve8_##name##_c(src, src_stride, dst, dst_stride,       \
-                                      filter_kernel, x0_q4, x_step_q4, y0_q4, \
-                                      y_step_q4, w, h, bd);                   \
-    }                                                                         \
+#define HIGH_FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt)  \
+  void vpx_highbd_convolve8_##name##_##opt(                                \
+      const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,            \
+      ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4,         \
+      int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) {     \
+    const int16_t *filter_row = filter[offset];                            \
+    if (step_q4 == 16 && filter_row[3] != 128) {                           \
+      if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) { \
+        while (w >= 16) {                                                  \
+          vpx_highbd_filter_block1d16_##dir##8_##avg##opt(                 \
+              src_start, src_stride, dst, dst_stride, h, filter_row, bd);  \
+          src += 16;                                                       \
+          dst += 16;                                                       \
+          w -= 16;                                                         \
+        }                                                                  \
+        while (w >= 8) {                                                   \
+          vpx_highbd_filter_block1d8_##dir##8_##avg##opt(                  \
+              src_start, src_stride, dst, dst_stride, h, filter_row, bd);  \
+          src += 8;                                                        \
+          dst += 8;                                                        \
+          w -= 8;                                                          \
+        }                                                                  \
+        while (w >= 4) {                                                   \
+          vpx_highbd_filter_block1d4_##dir##8_##avg##opt(                  \
+              src_start, src_stride, dst, dst_stride, h, filter_row, bd);  \
+          src += 4;                                                        \
+          dst += 4;                                                        \
+          w -= 4;                                                          \
+        }                                                                  \
+      } else if (filter_row[2] | filter_row[5]) {                          \
+        while (w >= 16) {                                                  \
+          vpx_highbd_filter_block1d16_##dir##4_##avg##opt(                 \
+              src_start, src_stride, dst, dst_stride, h, filter_row, bd);  \
+          src += 16;                                                       \
+          dst += 16;                                                       \
+          w -= 16;                                                         \
+        }                                                                  \
+        while (w >= 8) {                                                   \
+          vpx_highbd_filter_block1d8_##dir##4_##avg##opt(                  \
+              src_start, src_stride, dst, dst_stride, h, filter_row, bd);  \
+          src += 8;                                                        \
+          dst += 8;                                                        \
+          w -= 8;                                                          \
+        }                                                                  \
+        while (w >= 4) {                                                   \
+          vpx_highbd_filter_block1d4_##dir##4_##avg##opt(                  \
+              src_start, src_stride, dst, dst_stride, h, filter_row, bd);  \
+          src += 4;                                                        \
+          dst += 4;                                                        \
+          w -= 4;                                                          \
+        }                                                                  \
+      } else {                                                             \
+        while (w >= 16) {                                                  \
+          vpx_highbd_filter_block1d16_##dir##2_##avg##opt(                 \
+              src, src_stride, dst, dst_stride, h, filter_row, bd);        \
+          src += 16;                                                       \
+          dst += 16;                                                       \
+          w -= 16;                                                         \
+        }                                                                  \
+        while (w >= 8) {                                                   \
+          vpx_highbd_filter_block1d8_##dir##2_##avg##opt(                  \
+              src, src_stride, dst, dst_stride, h, filter_row, bd);        \
+          src += 8;                                                        \
+          dst += 8;                                                        \
+          w -= 8;                                                          \
+        }                                                                  \
+        while (w >= 4) {                                                   \
+          vpx_highbd_filter_block1d4_##dir##2_##avg##opt(                  \
+              src, src_stride, dst, dst_stride, h, filter_row, bd);        \
+          src += 4;                                                        \
+          dst += 4;                                                        \
+          w -= 4;                                                          \
+        }                                                                  \
+      }                                                                    \
+    }                                                                      \
+    if (w) {                                                               \
+      vpx_highbd_convolve8_##name##_c(src, src_stride, dst, dst_stride,    \
+                                      filter, x0_q4, x_step_q4, y0_q4,     \
+                                      y_step_q4, w, h, bd);                \
+    }                                                                      \
   }
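Note: unlike FUN_CONV_1D, the high-bitdepth macro processes width in 16/8/4
chunks and leaves any remainder, or an unsupported step_q4/filter, to the C
fallback at the bottom. It is instantiated once per direction and
optimization, roughly like this (a sketch; the real call sites live in the
x86 highbd convolve sources):

    HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2)
    HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , sse2)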
 
 #define HIGH_FUN_CONV_2D(avg, opt)                                             \
vpx_dsp/x86/highbd_convolve_avx2.c
index ff5ef5f..0ffa7f2 100644 (file)
@@ -20,7 +20,7 @@ void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride,
                                    uint16_t *dst, ptrdiff_t dst_stride,
                                    const InterpKernel *filter, int x0_q4,
                                    int x_step_q4, int y0_q4, int y_step_q4,
-                                   int width, int h, int bd) {
+                                   int w, int h, int bd) {
   (void)filter;
   (void)x0_q4;
   (void)x_step_q4;
@@ -28,8 +28,8 @@ void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride,
   (void)y_step_q4;
   (void)bd;
 
-  assert(width % 4 == 0);
-  if (width > 32) {  // width = 64
+  assert(w % 4 == 0);
+  if (w > 32) {  // w = 64
     do {
       const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
       const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
@@ -43,7 +43,7 @@ void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride,
       dst += dst_stride;
       h--;
     } while (h > 0);
-  } else if (width > 16) {  // width = 32
+  } else if (w > 16) {  // w = 32
     do {
       const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
       const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
@@ -53,7 +53,7 @@ void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride,
       dst += dst_stride;
       h--;
     } while (h > 0);
-  } else if (width > 8) {  // width = 16
+  } else if (w > 8) {  // w = 16
     __m256i p0, p1;
     do {
       p0 = _mm256_loadu_si256((const __m256i *)src);
@@ -67,7 +67,7 @@ void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride,
       dst += dst_stride;
       h -= 2;
     } while (h > 0);
-  } else if (width > 4) {  // width = 8
+  } else if (w > 4) {  // w = 8
     __m128i p0, p1;
     do {
       p0 = _mm_loadu_si128((const __m128i *)src);
@@ -81,7 +81,7 @@ void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride,
       dst += dst_stride;
       h -= 2;
     } while (h > 0);
-  } else {  // width = 4
+  } else {  // w = 4
     __m128i p0, p1;
     do {
       p0 = _mm_loadl_epi64((const __m128i *)src);
@@ -102,7 +102,7 @@ void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride,
                                   uint16_t *dst, ptrdiff_t dst_stride,
                                   const InterpKernel *filter, int x0_q4,
                                   int x_step_q4, int y0_q4, int y_step_q4,
-                                  int width, int h, int bd) {
+                                  int w, int h, int bd) {
   (void)filter;
   (void)x0_q4;
   (void)x_step_q4;
@@ -110,8 +110,8 @@ void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride,
   (void)y_step_q4;
   (void)bd;
 
-  assert(width % 4 == 0);
-  if (width > 32) {  // width = 64
+  assert(w % 4 == 0);
+  if (w > 32) {  // w = 64
     __m256i p0, p1, p2, p3, u0, u1, u2, u3;
     do {
       p0 = _mm256_loadu_si256((const __m256i *)src);
@@ -130,7 +130,7 @@ void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride,
       dst += dst_stride;
       h--;
     } while (h > 0);
-  } else if (width > 16) {  // width = 32
+  } else if (w > 16) {  // w = 32
     __m256i p0, p1, u0, u1;
     do {
       p0 = _mm256_loadu_si256((const __m256i *)src);
@@ -143,7 +143,7 @@ void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride,
       dst += dst_stride;
       h--;
     } while (h > 0);
-  } else if (width > 8) {  // width = 16
+  } else if (w > 8) {  // w = 16
     __m256i p0, p1, u0, u1;
     do {
       p0 = _mm256_loadu_si256((const __m256i *)src);
@@ -158,7 +158,7 @@ void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride,
       dst += dst_stride << 1;
       h -= 2;
     } while (h > 0);
-  } else if (width > 4) {  // width = 8
+  } else if (w > 4) {  // w = 8
     __m128i p0, p1, u0, u1;
     do {
       p0 = _mm_loadu_si128((const __m128i *)src);
@@ -172,7 +172,7 @@ void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride,
       dst += dst_stride << 1;
       h -= 2;
     } while (h > 0);
-  } else {  // width = 4
+  } else {  // w = 4
     __m128i p0, p1, u0, u1;
     do {
       p0 = _mm_loadl_epi64((const __m128i *)src);
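Note: vpx_highbd_convolve_avg_avx2 follows the same descending width dispatch
as the copy variant, but combines each load pair with a rounding average
before the store. The core of one 16-pixel step looks roughly like this
(sketch; the real code interleaves two rows per iteration):

    const __m256i p = _mm256_loadu_si256((const __m256i *)src);
    const __m256i u = _mm256_loadu_si256((const __m256i *)dst);
    _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p, u));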
vpx_dsp/x86/highbd_intrapred_sse2.asm
index c61b621..caf506a 100644 (file)
@@ -256,7 +256,7 @@ cglobal highbd_v_predictor_32x32, 3, 4, 4, dst, stride, above
   REP_RET
 
 INIT_XMM sse2
-cglobal highbd_tm_predictor_4x4, 5, 5, 6, dst, stride, above, left, bps
+cglobal highbd_tm_predictor_4x4, 5, 5, 6, dst, stride, above, left, bd
   movd                  m1, [aboveq-2]
   movq                  m0, [aboveq]
   pshuflw               m1, m1, 0x0
@@ -264,7 +264,7 @@ cglobal highbd_tm_predictor_4x4, 5, 5, 6, dst, stride, above, left, bps
   movlhps               m1, m1         ; tl tl tl tl tl tl tl tl
   ; Get the values to compute the maximum value at this bit depth
   pcmpeqw               m3, m3
-  movd                  m4, bpsd
+  movd                  m4, bdd
   psubw                 m0, m1         ; t1-tl t2-tl t3-tl t4-tl
   psllw                 m3, m4
   pcmpeqw               m2, m2
@@ -295,7 +295,7 @@ cglobal highbd_tm_predictor_4x4, 5, 5, 6, dst, stride, above, left, bps
   RET
 
 INIT_XMM sse2
-cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one
+cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bd, one
   movd                  m1, [aboveq-2]
   mova                  m0, [aboveq]
   pshuflw               m1, m1, 0x0
@@ -304,7 +304,7 @@ cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one
   pxor                  m3, m3
   pxor                  m4, m4
   pinsrw                m3, oned, 0
-  pinsrw                m4, bpsd, 0
+  pinsrw                m4, bdd, 0
   pshuflw               m3, m3, 0x0
   DEFINE_ARGS dst, stride, line, left
   punpcklqdq            m3, m3
@@ -339,14 +339,14 @@ cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one
   REP_RET
 
 INIT_XMM sse2
-cglobal highbd_tm_predictor_16x16, 5, 5, 8, dst, stride, above, left, bps
+cglobal highbd_tm_predictor_16x16, 5, 5, 8, dst, stride, above, left, bd
   movd                  m2, [aboveq-2]
   mova                  m0, [aboveq]
   mova                  m1, [aboveq+16]
   pshuflw               m2, m2, 0x0
   ; Get the values to compute the maximum value at this bit depth
   pcmpeqw               m3, m3
-  movd                  m4, bpsd
+  movd                  m4, bdd
   punpcklqdq            m2, m2
   psllw                 m3, m4
   pcmpeqw               m5, m5
@@ -386,7 +386,7 @@ cglobal highbd_tm_predictor_16x16, 5, 5, 8, dst, stride, above, left, bps
   REP_RET
 
 INIT_XMM sse2
-cglobal highbd_tm_predictor_32x32, 5, 5, 8, dst, stride, above, left, bps
+cglobal highbd_tm_predictor_32x32, 5, 5, 8, dst, stride, above, left, bd
   movd                  m0, [aboveq-2]
   mova                  m1, [aboveq]
   mova                  m2, [aboveq+16]
@@ -395,7 +395,7 @@ cglobal highbd_tm_predictor_32x32, 5, 5, 8, dst, stride, above, left, bps
   pshuflw               m0, m0, 0x0
   ; Get the values to compute the maximum value at this bit depth
   pcmpeqw               m5, m5
-  movd                  m6, bpsd
+  movd                  m6, bdd
   psllw                 m5, m6
   pcmpeqw               m7, m7
   pxor                  m6, m6         ; min possible value
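
The bps -> bd rename makes it clearer what the register holds: the bit depth used to build the clamp bounds. A scalar sketch of the TM (TrueMotion) prediction these kernels implement, assuming the clamp range [0, (1 << bd) - 1]; highbd_tm_predictor_ref is a hypothetical name:

  static void highbd_tm_predictor_ref(uint16_t *dst, ptrdiff_t stride,
                                      int size, const uint16_t *above,
                                      const uint16_t *left, int bd) {
    // The asm builds this bound the same way: all-ones shifted left by bd,
    // then inverted.
    const int max = (1 << bd) - 1;
    const int top_left = above[-1];
    int r, c;
    for (r = 0; r < size; ++r) {
      for (c = 0; c < size; ++c) {
        const int v = left[r] + above[c] - top_left;
        dst[c] = (uint16_t)(v < 0 ? 0 : (v > max ? max : v));
      }
      dst += stride;
    }
  }
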
index ec22db9..f7fb40d 100644 (file)
@@ -47,13 +47,13 @@ static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) {
 
 // TODO(debargha, peter): Break up large functions into smaller ones
 // in this file.
-void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p,
-                                       const uint8_t *_blimit,
-                                       const uint8_t *_limit,
-                                       const uint8_t *_thresh, int bd) {
+void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int pitch,
+                                       const uint8_t *blimit,
+                                       const uint8_t *limit,
+                                       const uint8_t *thresh, int bd) {
   const __m128i zero = _mm_set1_epi16(0);
   const __m128i one = _mm_set1_epi16(1);
-  __m128i blimit, limit, thresh;
+  __m128i blimit_v, limit_v, thresh_v;
   __m128i q7, p7, q6, p6, q5, p5, q4, p4, q3, p3, q2, p2, q1, p1, q0, p0;
   __m128i mask, hev, flat, flat2, abs_p1p0, abs_q1q0;
   __m128i ps1, qs1, ps0, qs0;
@@ -70,35 +70,35 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p,
   __m128i eight, four;
 
   if (bd == 8) {
-    blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
-    limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
-    thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
+    blimit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero);
+    limit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero);
+    thresh_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero);
   } else if (bd == 10) {
-    blimit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
-    limit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
-    thresh = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
+    blimit_v = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 2);
+    limit_v = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 2);
+    thresh_v = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 2);
   } else {  // bd == 12
-    blimit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
-    limit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
-    thresh = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
+    blimit_v = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 4);
+    limit_v = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 4);
+    thresh_v = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 4);
   }
 
-  q4 = _mm_load_si128((__m128i *)(s + 4 * p));
-  p4 = _mm_load_si128((__m128i *)(s - 5 * p));
-  q3 = _mm_load_si128((__m128i *)(s + 3 * p));
-  p3 = _mm_load_si128((__m128i *)(s - 4 * p));
-  q2 = _mm_load_si128((__m128i *)(s + 2 * p));
-  p2 = _mm_load_si128((__m128i *)(s - 3 * p));
-  q1 = _mm_load_si128((__m128i *)(s + 1 * p));
-  p1 = _mm_load_si128((__m128i *)(s - 2 * p));
-  q0 = _mm_load_si128((__m128i *)(s + 0 * p));
-  p0 = _mm_load_si128((__m128i *)(s - 1 * p));
+  q4 = _mm_load_si128((__m128i *)(s + 4 * pitch));
+  p4 = _mm_load_si128((__m128i *)(s - 5 * pitch));
+  q3 = _mm_load_si128((__m128i *)(s + 3 * pitch));
+  p3 = _mm_load_si128((__m128i *)(s - 4 * pitch));
+  q2 = _mm_load_si128((__m128i *)(s + 2 * pitch));
+  p2 = _mm_load_si128((__m128i *)(s - 3 * pitch));
+  q1 = _mm_load_si128((__m128i *)(s + 1 * pitch));
+  p1 = _mm_load_si128((__m128i *)(s - 2 * pitch));
+  q0 = _mm_load_si128((__m128i *)(s + 0 * pitch));
+  p0 = _mm_load_si128((__m128i *)(s - 1 * pitch));
 
   //  highbd_filter_mask
   abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
@@ -111,14 +111,14 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p,
 
   //  highbd_hev_mask (in C code this is actually called from highbd_filter4)
   flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
-  hev = _mm_subs_epu16(flat, thresh);
+  hev = _mm_subs_epu16(flat, thresh_v);
   hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
 
   abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);  // abs(p0 - q0) * 2
   abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);         // abs(p1 - q1) / 2
-  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
+  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit_v);
   mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
-  mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
+  mask = _mm_and_si128(mask, _mm_adds_epu16(limit_v, one));
   work = _mm_max_epi16(
       _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)),
       _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1)));
@@ -132,7 +132,7 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p,
       _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3)));
   mask = _mm_max_epi16(work, mask);
 
-  mask = _mm_subs_epu16(mask, limit);
+  mask = _mm_subs_epu16(mask, limit_v);
   mask = _mm_cmpeq_epi16(mask, zero);  // return ~mask
 
   // lp filter
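
For orientation, the vector sequence above is the scalar filter mask done with saturating arithmetic: an unsigned "a > b" becomes _mm_subs_epu16(a, b) != 0, and the chain of |= becomes lane-wise maxima. The scalar form (cf. filter_mask in vpx_dsp/loopfilter.c; abs is from <stdlib.h>) is roughly:

  static int8_t filter_mask(uint8_t limit, uint8_t blimit, uint8_t p3,
                            uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0,
                            uint8_t q1, uint8_t q2, uint8_t q3) {
    int8_t mask = 0;
    mask |= (abs(p3 - p2) > limit) * -1;
    mask |= (abs(p2 - p1) > limit) * -1;
    mask |= (abs(p1 - p0) > limit) * -1;
    mask |= (abs(q1 - q0) > limit) * -1;
    mask |= (abs(q2 - q1) > limit) * -1;
    mask |= (abs(q3 - q2) > limit) * -1;
    mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
    return ~mask;  // the "return ~mask" noted in the SSE2 comment above
  }
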
@@ -207,12 +207,12 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p,
   // (because, in both vars, each block of 16 is either all 1s or all 0s)
   flat = _mm_and_si128(flat, mask);
 
-  p5 = _mm_load_si128((__m128i *)(s - 6 * p));
-  q5 = _mm_load_si128((__m128i *)(s + 5 * p));
-  p6 = _mm_load_si128((__m128i *)(s - 7 * p));
-  q6 = _mm_load_si128((__m128i *)(s + 6 * p));
-  p7 = _mm_load_si128((__m128i *)(s - 8 * p));
-  q7 = _mm_load_si128((__m128i *)(s + 7 * p));
+  p5 = _mm_load_si128((__m128i *)(s - 6 * pitch));
+  q5 = _mm_load_si128((__m128i *)(s + 5 * pitch));
+  p6 = _mm_load_si128((__m128i *)(s - 7 * pitch));
+  q6 = _mm_load_si128((__m128i *)(s + 6 * pitch));
+  p7 = _mm_load_si128((__m128i *)(s - 8 * pitch));
+  q7 = _mm_load_si128((__m128i *)(s + 7 * pitch));
 
   // highbd_flat_mask5 (the arguments passed in are p0, q0, p4-p7 and q4-q7,
   // but they are referred to as p0-p4 & q0-q4 inside the function)
@@ -389,8 +389,8 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p,
   flat2_q6 = _mm_and_si128(flat2, flat2_q6);
   //  get values for when (flat2 && flat && mask)
   q6 = _mm_or_si128(q6, flat2_q6);  // full list of q6 values
-  _mm_store_si128((__m128i *)(s - 7 * p), p6);
-  _mm_store_si128((__m128i *)(s + 6 * p), q6);
+  _mm_store_si128((__m128i *)(s - 7 * pitch), p6);
+  _mm_store_si128((__m128i *)(s + 6 * pitch), q6);
 
   p5 = _mm_andnot_si128(flat2, p5);
   //  p5 remains unchanged if !(flat2 && flat && mask)
@@ -404,8 +404,8 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p,
   //  get values for when (flat2 && flat && mask)
   q5 = _mm_or_si128(q5, flat2_q5);
   //  full list of q5 values
-  _mm_store_si128((__m128i *)(s - 6 * p), p5);
-  _mm_store_si128((__m128i *)(s + 5 * p), q5);
+  _mm_store_si128((__m128i *)(s - 6 * pitch), p5);
+  _mm_store_si128((__m128i *)(s + 5 * pitch), q5);
 
   p4 = _mm_andnot_si128(flat2, p4);
   //  p4 remains unchanged if !(flat2 && flat && mask)
@@ -417,8 +417,8 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p,
   flat2_q4 = _mm_and_si128(flat2, flat2_q4);
   //  get values for when (flat2 && flat && mask)
   q4 = _mm_or_si128(q4, flat2_q4);  // full list of q4 values
-  _mm_store_si128((__m128i *)(s - 5 * p), p4);
-  _mm_store_si128((__m128i *)(s + 4 * p), q4);
+  _mm_store_si128((__m128i *)(s - 5 * pitch), p4);
+  _mm_store_si128((__m128i *)(s + 4 * pitch), q4);
 
   p3 = _mm_andnot_si128(flat2, p3);
   //  p3 takes value from highbd_filter8 if !(flat2 && flat && mask)
@@ -430,8 +430,8 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p,
   flat2_q3 = _mm_and_si128(flat2, flat2_q3);
   //  get values for when (flat2 && flat && mask)
   q3 = _mm_or_si128(q3, flat2_q3);  // full list of q3 values
-  _mm_store_si128((__m128i *)(s - 4 * p), p3);
-  _mm_store_si128((__m128i *)(s + 3 * p), q3);
+  _mm_store_si128((__m128i *)(s - 4 * pitch), p3);
+  _mm_store_si128((__m128i *)(s + 3 * pitch), q3);
 
   p2 = _mm_andnot_si128(flat2, p2);
   //  p2 takes value from highbd_filter8 if !(flat2 && flat && mask)
@@ -444,8 +444,8 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p,
   flat2_q2 = _mm_and_si128(flat2, flat2_q2);
   //  get values for when (flat2 && flat && mask)
   q2 = _mm_or_si128(q2, flat2_q2);  // full list of q2 values
-  _mm_store_si128((__m128i *)(s - 3 * p), p2);
-  _mm_store_si128((__m128i *)(s + 2 * p), q2);
+  _mm_store_si128((__m128i *)(s - 3 * pitch), p2);
+  _mm_store_si128((__m128i *)(s + 2 * pitch), q2);
 
   p1 = _mm_andnot_si128(flat2, p1);
   //  p1 takes value from highbd_filter8 if !(flat2 && flat && mask)
@@ -457,8 +457,8 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p,
   flat2_q1 = _mm_and_si128(flat2, flat2_q1);
   //  get values for when (flat2 && flat && mask)
   q1 = _mm_or_si128(q1, flat2_q1);  // full list of q1 values
-  _mm_store_si128((__m128i *)(s - 2 * p), p1);
-  _mm_store_si128((__m128i *)(s + 1 * p), q1);
+  _mm_store_si128((__m128i *)(s - 2 * pitch), p1);
+  _mm_store_si128((__m128i *)(s + 1 * pitch), q1);
 
   p0 = _mm_andnot_si128(flat2, p0);
   //  p0 takes value from highbd_filter8 if !(flat2 && flat && mask)
@@ -470,22 +470,22 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p,
   flat2_q0 = _mm_and_si128(flat2, flat2_q0);
   //  get values for when (flat2 && flat && mask)
   q0 = _mm_or_si128(q0, flat2_q0);  // full list of q0 values
-  _mm_store_si128((__m128i *)(s - 1 * p), p0);
-  _mm_store_si128((__m128i *)(s - 0 * p), q0);
+  _mm_store_si128((__m128i *)(s - 1 * pitch), p0);
+  _mm_store_si128((__m128i *)(s - 0 * pitch), q0);
 }
 
-void vpx_highbd_lpf_horizontal_16_dual_sse2(uint16_t *s, int p,
-                                            const uint8_t *_blimit,
-                                            const uint8_t *_limit,
-                                            const uint8_t *_thresh, int bd) {
-  vpx_highbd_lpf_horizontal_16_sse2(s, p, _blimit, _limit, _thresh, bd);
-  vpx_highbd_lpf_horizontal_16_sse2(s + 8, p, _blimit, _limit, _thresh, bd);
+void vpx_highbd_lpf_horizontal_16_dual_sse2(uint16_t *s, int pitch,
+                                            const uint8_t *blimit,
+                                            const uint8_t *limit,
+                                            const uint8_t *thresh, int bd) {
+  vpx_highbd_lpf_horizontal_16_sse2(s, pitch, blimit, limit, thresh, bd);
+  vpx_highbd_lpf_horizontal_16_sse2(s + 8, pitch, blimit, limit, thresh, bd);
 }
 
-void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
-                                      const uint8_t *_blimit,
-                                      const uint8_t *_limit,
-                                      const uint8_t *_thresh, int bd) {
+void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int pitch,
+                                      const uint8_t *blimit,
+                                      const uint8_t *limit,
+                                      const uint8_t *thresh, int bd) {
   DECLARE_ALIGNED(16, uint16_t, flat_op2[16]);
   DECLARE_ALIGNED(16, uint16_t, flat_op1[16]);
   DECLARE_ALIGNED(16, uint16_t, flat_op0[16]);
@@ -493,16 +493,16 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
   DECLARE_ALIGNED(16, uint16_t, flat_oq1[16]);
   DECLARE_ALIGNED(16, uint16_t, flat_oq0[16]);
   const __m128i zero = _mm_set1_epi16(0);
-  __m128i blimit, limit, thresh;
+  __m128i blimit_v, limit_v, thresh_v;
   __m128i mask, hev, flat;
-  __m128i p3 = _mm_load_si128((__m128i *)(s - 4 * p));
-  __m128i q3 = _mm_load_si128((__m128i *)(s + 3 * p));
-  __m128i p2 = _mm_load_si128((__m128i *)(s - 3 * p));
-  __m128i q2 = _mm_load_si128((__m128i *)(s + 2 * p));
-  __m128i p1 = _mm_load_si128((__m128i *)(s - 2 * p));
-  __m128i q1 = _mm_load_si128((__m128i *)(s + 1 * p));
-  __m128i p0 = _mm_load_si128((__m128i *)(s - 1 * p));
-  __m128i q0 = _mm_load_si128((__m128i *)(s + 0 * p));
+  __m128i p3 = _mm_load_si128((__m128i *)(s - 4 * pitch));
+  __m128i q3 = _mm_load_si128((__m128i *)(s + 3 * pitch));
+  __m128i p2 = _mm_load_si128((__m128i *)(s - 3 * pitch));
+  __m128i q2 = _mm_load_si128((__m128i *)(s + 2 * pitch));
+  __m128i p1 = _mm_load_si128((__m128i *)(s - 2 * pitch));
+  __m128i q1 = _mm_load_si128((__m128i *)(s + 1 * pitch));
+  __m128i p0 = _mm_load_si128((__m128i *)(s - 1 * pitch));
+  __m128i q0 = _mm_load_si128((__m128i *)(s + 0 * pitch));
   const __m128i one = _mm_set1_epi16(1);
   const __m128i ffff = _mm_cmpeq_epi16(one, one);
   __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
@@ -519,25 +519,25 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
   __m128i filter1, filter2;
 
   if (bd == 8) {
-    blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
-    limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
-    thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
+    blimit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero);
+    limit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero);
+    thresh_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero);
     t80 = _mm_set1_epi16(0x80);
   } else if (bd == 10) {
-    blimit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
-    limit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
-    thresh = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
+    blimit_v = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 2);
+    limit_v = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 2);
+    thresh_v = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 2);
     t80 = _mm_set1_epi16(0x200);
   } else {  // bd == 12
-    blimit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
-    limit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
-    thresh = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
+    blimit_v = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 4);
+    limit_v = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 4);
+    thresh_v = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 4);
     t80 = _mm_set1_epi16(0x800);
   }
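
The bd == 8/10/12 branches above differ only in how far the 8-bit thresholds and the 0x80 sign bias are promoted into the bd-bit domain. A sketch of the scaling (scale_to_bd is a hypothetical helper, not library code):

  static INLINE int16_t scale_to_bd(uint8_t v, int bd) {
    return (int16_t)(v << (bd - 8));  // bd 8/10/12 -> v, 4*v, 16*v
  }
  // e.g. t80 = scale_to_bd(0x80, bd) gives 0x80, 0x200, 0x800 as above.
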
 
@@ -553,16 +553,16 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
   abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0));
   abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1));
   flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
-  hev = _mm_subs_epu16(flat, thresh);
+  hev = _mm_subs_epu16(flat, thresh_v);
   hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
 
   abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
   abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
-  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
+  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit_v);
   mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
   // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
   // So taking maximums continues to work:
-  mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
+  mask = _mm_and_si128(mask, _mm_adds_epu16(limit_v, one));
   mask = _mm_max_epi16(abs_p1p0, mask);
   // mask |= (abs(p1 - p0) > limit) * -1;
   mask = _mm_max_epi16(abs_q1q0, mask);
@@ -576,7 +576,7 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
       _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)),
       _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3)));
   mask = _mm_max_epi16(work, mask);
-  mask = _mm_subs_epu16(mask, limit);
+  mask = _mm_subs_epu16(mask, limit_v);
   mask = _mm_cmpeq_epi16(mask, zero);
 
   // flat_mask4
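
flat_mask4 marks lanes where p3..p1 and q1..q3 stay close enough to p0/q0 to take the wide filter8 path. A scalar sketch, assuming the usual vpx flatness threshold of 1 << (bd - 8); the name is hypothetical, not the exact library source:

  static int8_t highbd_flat_mask4_sketch(uint16_t p3, uint16_t p2,
                                         uint16_t p1, uint16_t p0,
                                         uint16_t q0, uint16_t q1,
                                         uint16_t q2, uint16_t q3, int bd) {
    const int16_t thresh = 1 << (bd - 8);
    int8_t mask = 0;
    mask |= (abs(p1 - p0) > thresh) * -1;
    mask |= (abs(q1 - q0) > thresh) * -1;
    mask |= (abs(p2 - p0) > thresh) * -1;
    mask |= (abs(q2 - q0) > thresh) * -1;
    mask |= (abs(p3 - p0) > thresh) * -1;
    mask |= (abs(q3 - q0) > thresh) * -1;
    return ~mask;
  }
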
@@ -674,7 +674,7 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
   q1 = _mm_and_si128(flat, q1);
   q1 = _mm_or_si128(work_a, q1);
 
-  work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
+  work_a = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
   q2 = _mm_load_si128((__m128i *)flat_oq2);
   work_a = _mm_andnot_si128(flat, work_a);
   q2 = _mm_and_si128(flat, q2);
@@ -694,43 +694,43 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
   p1 = _mm_and_si128(flat, p1);
   p1 = _mm_or_si128(work_a, p1);
 
-  work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
+  work_a = _mm_loadu_si128((__m128i *)(s - 3 * pitch));
   p2 = _mm_load_si128((__m128i *)flat_op2);
   work_a = _mm_andnot_si128(flat, work_a);
   p2 = _mm_and_si128(flat, p2);
   p2 = _mm_or_si128(work_a, p2);
 
-  _mm_store_si128((__m128i *)(s - 3 * p), p2);
-  _mm_store_si128((__m128i *)(s - 2 * p), p1);
-  _mm_store_si128((__m128i *)(s - 1 * p), p0);
-  _mm_store_si128((__m128i *)(s + 0 * p), q0);
-  _mm_store_si128((__m128i *)(s + 1 * p), q1);
-  _mm_store_si128((__m128i *)(s + 2 * p), q2);
+  _mm_store_si128((__m128i *)(s - 3 * pitch), p2);
+  _mm_store_si128((__m128i *)(s - 2 * pitch), p1);
+  _mm_store_si128((__m128i *)(s - 1 * pitch), p0);
+  _mm_store_si128((__m128i *)(s + 0 * pitch), q0);
+  _mm_store_si128((__m128i *)(s + 1 * pitch), q1);
+  _mm_store_si128((__m128i *)(s + 2 * pitch), q2);
 }
 
 void vpx_highbd_lpf_horizontal_8_dual_sse2(
-    uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
-    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
-    const uint8_t *_thresh1, int bd) {
-  vpx_highbd_lpf_horizontal_8_sse2(s, p, _blimit0, _limit0, _thresh0, bd);
-  vpx_highbd_lpf_horizontal_8_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd);
+    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1, int bd) {
+  vpx_highbd_lpf_horizontal_8_sse2(s, pitch, blimit0, limit0, thresh0, bd);
+  vpx_highbd_lpf_horizontal_8_sse2(s + 8, pitch, blimit1, limit1, thresh1, bd);
 }
 
-void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
-                                      const uint8_t *_blimit,
-                                      const uint8_t *_limit,
-                                      const uint8_t *_thresh, int bd) {
+void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int pitch,
+                                      const uint8_t *blimit,
+                                      const uint8_t *limit,
+                                      const uint8_t *thresh, int bd) {
   const __m128i zero = _mm_set1_epi16(0);
-  __m128i blimit, limit, thresh;
+  __m128i blimit_v, limit_v, thresh_v;
   __m128i mask, hev, flat;
-  __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
-  __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
-  __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
-  __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
-  __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
-  __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
-  __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
-  __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
+  __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch));
+  __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * pitch));
+  __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * pitch));
+  __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * pitch));
+  __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * pitch));
+  __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
+  __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
+  __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
   const __m128i abs_p1p0 =
       _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
   const __m128i abs_q1q0 =
@@ -760,33 +760,33 @@ void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
   __m128i filter1, filter2;
 
   if (bd == 8) {
-    blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
-    limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
-    thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
+    blimit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero);
+    limit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero);
+    thresh_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero);
     t80 = _mm_set1_epi16(0x80);
     tff80 = _mm_set1_epi16(0xff80);
     tffe0 = _mm_set1_epi16(0xffe0);
     t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 8);
     t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 8);
   } else if (bd == 10) {
-    blimit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
-    limit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
-    thresh = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
+    blimit_v = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 2);
+    limit_v = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 2);
+    thresh_v = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 2);
     t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 2);
     tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 2);
     tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 2);
     t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 6);
     t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 6);
   } else {  // bd == 12
-    blimit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
-    limit = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
-    thresh = _mm_slli_epi16(
-        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
+    blimit_v = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 4);
+    limit_v = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 4);
+    thresh_v = _mm_slli_epi16(
+        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 4);
     t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 4);
     tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 4);
     tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 4);
@@ -794,23 +794,23 @@ void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
     t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 4);
   }
 
-  ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
-  ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
-  qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
-  qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);
+  ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * pitch)), t80);
+  ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * pitch)), t80);
+  qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * pitch)), t80);
+  qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * pitch)), t80);
 
   // filter_mask and hev_mask
   flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
-  hev = _mm_subs_epu16(flat, thresh);
+  hev = _mm_subs_epu16(flat, thresh_v);
   hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
 
   abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
   abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
-  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
+  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit_v);
   mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
   // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
   // So taking maximums continues to work:
-  mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
+  mask = _mm_and_si128(mask, _mm_adds_epu16(limit_v, one));
   mask = _mm_max_epi16(flat, mask);
   // mask |= (abs(p1 - p0) > limit) * -1;
   // mask |= (abs(q1 - q0) > limit) * -1;
@@ -822,7 +822,7 @@ void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
       _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)),
       _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3)));
   mask = _mm_max_epi16(work, mask);
-  mask = _mm_subs_epu16(mask, limit);
+  mask = _mm_subs_epu16(mask, limit_v);
   mask = _mm_cmpeq_epi16(mask, zero);
 
   // filter4
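
filter4 operates on sign-recentred samples: in 8-bit code the recentring is "x ^ 0x80", while here it is a subtraction/addition of t80 (0x80 scaled to the bit depth), as in the ps1/qs1 loads above. A sketch of just that step (to_signed_bd is a hypothetical helper):

  static INLINE int32_t to_signed_bd(uint16_t x, int bd) {
    return (int32_t)x - (0x80 << (bd - 8));  // matches _mm_subs_epi16(x, t80)
  }
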
@@ -872,18 +872,18 @@ void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
   p1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd),
                       t80);
 
-  _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
-  _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
-  _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
-  _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+  _mm_storeu_si128((__m128i *)(s - 2 * pitch), p1);
+  _mm_storeu_si128((__m128i *)(s - 1 * pitch), p0);
+  _mm_storeu_si128((__m128i *)(s + 0 * pitch), q0);
+  _mm_storeu_si128((__m128i *)(s + 1 * pitch), q1);
 }
 
 void vpx_highbd_lpf_horizontal_4_dual_sse2(
-    uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
-    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
-    const uint8_t *_thresh1, int bd) {
-  vpx_highbd_lpf_horizontal_4_sse2(s, p, _blimit0, _limit0, _thresh0, bd);
-  vpx_highbd_lpf_horizontal_4_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd);
+    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1, int bd) {
+  vpx_highbd_lpf_horizontal_4_sse2(s, pitch, blimit0, limit0, thresh0, bd);
+  vpx_highbd_lpf_horizontal_4_sse2(s + 8, pitch, blimit1, limit1, thresh1, bd);
 }
 
 static INLINE void highbd_transpose(uint16_t *src[], int in_p, uint16_t *dst[],
@@ -998,9 +998,9 @@ static INLINE void highbd_transpose8x16(uint16_t *in0, uint16_t *in1, int in_p,
   highbd_transpose(src1, in_p, dest1, out_p, 1);
 }
 
-void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit,
-                                    const uint8_t *limit, const uint8_t *thresh,
-                                    int bd) {
+void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int pitch,
+                                    const uint8_t *blimit, const uint8_t *limit,
+                                    const uint8_t *thresh, int bd) {
   DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]);
   uint16_t *src[1];
   uint16_t *dst[1];
@@ -1009,7 +1009,7 @@ void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit,
   src[0] = s - 4;
   dst[0] = t_dst;
 
-  highbd_transpose(src, p, dst, 8, 1);
+  highbd_transpose(src, pitch, dst, 8, 1);
 
   // Loop filtering
   vpx_highbd_lpf_horizontal_4_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd);
@@ -1018,11 +1018,11 @@ void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit,
   dst[0] = s - 4;
 
   // Transpose back
-  highbd_transpose(src, 8, dst, p, 1);
+  highbd_transpose(src, 8, dst, pitch, 1);
 }
 
 void vpx_highbd_lpf_vertical_4_dual_sse2(
-    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
     const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
     const uint8_t *thresh1, int bd) {
   DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
@@ -1030,7 +1030,7 @@ void vpx_highbd_lpf_vertical_4_dual_sse2(
   uint16_t *dst[2];
 
   // Transpose 8x16
-  highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
+  highbd_transpose8x16(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16);
 
   // Loop filtering
   vpx_highbd_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
@@ -1038,15 +1038,15 @@ void vpx_highbd_lpf_vertical_4_dual_sse2(
   src[0] = t_dst;
   src[1] = t_dst + 8;
   dst[0] = s - 4;
-  dst[1] = s - 4 + p * 8;
+  dst[1] = s - 4 + pitch * 8;
 
   // Transpose back
-  highbd_transpose(src, 16, dst, p, 2);
+  highbd_transpose(src, 16, dst, pitch, 2);
 }
 
-void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, const uint8_t *blimit,
-                                    const uint8_t *limit, const uint8_t *thresh,
-                                    int bd) {
+void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int pitch,
+                                    const uint8_t *blimit, const uint8_t *limit,
+                                    const uint8_t *thresh, int bd) {
   DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]);
   uint16_t *src[1];
   uint16_t *dst[1];
@@ -1055,7 +1055,7 @@ void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, const uint8_t *blimit,
   src[0] = s - 4;
   dst[0] = t_dst;
 
-  highbd_transpose(src, p, dst, 8, 1);
+  highbd_transpose(src, pitch, dst, 8, 1);
 
   // Loop filtering
   vpx_highbd_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd);
@@ -1064,11 +1064,11 @@ void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, const uint8_t *blimit,
   dst[0] = s - 4;
 
   // Transpose back
-  highbd_transpose(src, 8, dst, p, 1);
+  highbd_transpose(src, 8, dst, pitch, 1);
 }
 
 void vpx_highbd_lpf_vertical_8_dual_sse2(
-    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
     const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
     const uint8_t *thresh1, int bd) {
   DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
@@ -1076,7 +1076,7 @@ void vpx_highbd_lpf_vertical_8_dual_sse2(
   uint16_t *dst[2];
 
   // Transpose 8x16
-  highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
+  highbd_transpose8x16(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16);
 
   // Loop filtering
   vpx_highbd_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
@@ -1085,13 +1085,14 @@ void vpx_highbd_lpf_vertical_8_dual_sse2(
   src[1] = t_dst + 8;
 
   dst[0] = s - 4;
-  dst[1] = s - 4 + p * 8;
+  dst[1] = s - 4 + pitch * 8;
 
   // Transpose back
-  highbd_transpose(src, 16, dst, p, 2);
+  highbd_transpose(src, 16, dst, pitch, 2);
 }
 
-void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int p, const uint8_t *blimit,
+void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int pitch,
+                                     const uint8_t *blimit,
                                      const uint8_t *limit,
                                      const uint8_t *thresh, int bd) {
   DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 16]);
@@ -1104,7 +1105,7 @@ void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int p, const uint8_t *blimit,
   dst[1] = t_dst + 8 * 8;
 
   // Transpose 16x8
-  highbd_transpose(src, p, dst, 8, 2);
+  highbd_transpose(src, pitch, dst, 8, 2);
 
   // Loop filtering
   vpx_highbd_lpf_horizontal_16_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh,
@@ -1115,24 +1116,25 @@ void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int p, const uint8_t *blimit,
   dst[1] = s;
 
   // Transpose back
-  highbd_transpose(src, 8, dst, p, 2);
+  highbd_transpose(src, 8, dst, pitch, 2);
 }
 
-void vpx_highbd_lpf_vertical_16_dual_sse2(uint16_t *s, int p,
+void vpx_highbd_lpf_vertical_16_dual_sse2(uint16_t *s, int pitch,
                                           const uint8_t *blimit,
                                           const uint8_t *limit,
                                           const uint8_t *thresh, int bd) {
   DECLARE_ALIGNED(16, uint16_t, t_dst[256]);
 
   //  Transpose 16x16
-  highbd_transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
-  highbd_transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);
+  highbd_transpose8x16(s - 8, s - 8 + 8 * pitch, pitch, t_dst, 16);
+  highbd_transpose8x16(s, s + 8 * pitch, pitch, t_dst + 8 * 16, 16);
 
   //  Loop filtering
   vpx_highbd_lpf_horizontal_16_dual_sse2(t_dst + 8 * 16, 16, blimit, limit,
                                          thresh, bd);
 
   //  Transpose back
-  highbd_transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);
-  highbd_transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);
+  highbd_transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, pitch);
+  highbd_transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * pitch,
+                       pitch);
 }
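
Strategy note: the vertical variants never filter columns directly. They transpose the pixels around the edge into a small contiguous scratch buffer, run the matching horizontal kernel at the scratch pitch (8 or 16), then transpose back, so one SIMD kernel per filter size serves both orientations. Sketched in C comments (mirrors the calls above):

  /* 1. highbd_transpose(src, pitch, t_dst, 8 or 16, ...)  columns -> rows
   * 2. vpx_highbd_lpf_horizontal_*_sse2(t_dst + offset, 8 or 16, ...)
   * 3. highbd_transpose(t_dst, 8 or 16, dst, pitch, ...)  rows -> columns */
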
index 6652a62..85a7314 100644 (file)
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_ports/mem.h"
 
-void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p,
-                                const unsigned char *_blimit,
-                                const unsigned char *_limit,
-                                const unsigned char *_thresh) {
+void vpx_lpf_horizontal_16_avx2(unsigned char *s, int pitch,
+                                const unsigned char *blimit,
+                                const unsigned char *limit,
+                                const unsigned char *thresh) {
   __m128i mask, hev, flat, flat2;
   const __m128i zero = _mm_set1_epi16(0);
   const __m128i one = _mm_set1_epi8(1);
   __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
   __m128i abs_p1p0;
 
-  const __m128i thresh =
-      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0]));
-  const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0]));
-  const __m128i blimit =
-      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0]));
+  const __m128i thresh_v =
+      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)thresh[0]));
+  const __m128i limit_v = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)limit[0]));
+  const __m128i blimit_v =
+      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)blimit[0]));
 
-  q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
+  q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * pitch));
   q4p4 = _mm_castps_si128(
-      _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p)));
-  q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
+      _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * pitch)));
+  q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * pitch));
   q3p3 = _mm_castps_si128(
-      _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p)));
-  q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
+      _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * pitch)));
+  q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * pitch));
   q2p2 = _mm_castps_si128(
-      _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p)));
-  q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
+      _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * pitch)));
+  q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * pitch));
   q1p1 = _mm_castps_si128(
-      _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p)));
+      _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * pitch)));
   p1q1 = _mm_shuffle_epi32(q1p1, 78);
-  q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
+  q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * pitch));
   q0p0 = _mm_castps_si128(
-      _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p)));
+      _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * pitch)));
   p0q0 = _mm_shuffle_epi32(q0p0, 78);
 
   {
@@ -59,12 +59,12 @@ void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p,
     abs_p1q1 =
         _mm_or_si128(_mm_subs_epu8(q1p1, p1q1), _mm_subs_epu8(p1q1, q1p1));
     flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
-    hev = _mm_subs_epu8(flat, thresh);
+    hev = _mm_subs_epu8(flat, thresh_v);
     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
 
     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
-    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
     mask = _mm_max_epu8(abs_p1p0, mask);
@@ -76,7 +76,7 @@ void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p,
         _mm_or_si128(_mm_subs_epu8(q3p3, q2p2), _mm_subs_epu8(q2p2, q3p3)));
     mask = _mm_max_epu8(work, mask);
     mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
-    mask = _mm_subs_epu8(mask, limit);
+    mask = _mm_subs_epu8(mask, limit_v);
     mask = _mm_cmpeq_epi8(mask, zero);
   }
 
@@ -136,21 +136,21 @@ void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p,
       flat = _mm_cmpeq_epi8(flat, zero);
       flat = _mm_and_si128(flat, mask);
 
-      q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
+      q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * pitch));
       q5p5 = _mm_castps_si128(
-          _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p)));
+          _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * pitch)));
 
-      q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
+      q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * pitch));
       q6p6 = _mm_castps_si128(
-          _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p)));
+          _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * pitch)));
 
       flat2 = _mm_max_epu8(
           _mm_or_si128(_mm_subs_epu8(q4p4, q0p0), _mm_subs_epu8(q0p0, q4p4)),
           _mm_or_si128(_mm_subs_epu8(q5p5, q0p0), _mm_subs_epu8(q0p0, q5p5)));
 
-      q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
+      q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * pitch));
       q7p7 = _mm_castps_si128(
-          _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p)));
+          _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * pitch)));
 
       work = _mm_max_epu8(
           _mm_or_si128(_mm_subs_epu8(q6p6, q0p0), _mm_subs_epu8(q0p0, q6p6)),
@@ -321,44 +321,44 @@ void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p,
     q6p6 = _mm_andnot_si128(flat2, q6p6);
     flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
     q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
-    _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6);
-    _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6));
+    _mm_storel_epi64((__m128i *)(s - 7 * pitch), q6p6);
+    _mm_storeh_pi((__m64 *)(s + 6 * pitch), _mm_castsi128_ps(q6p6));
 
     q5p5 = _mm_andnot_si128(flat2, q5p5);
     flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
     q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
-    _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
-    _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5));
+    _mm_storel_epi64((__m128i *)(s - 6 * pitch), q5p5);
+    _mm_storeh_pi((__m64 *)(s + 5 * pitch), _mm_castsi128_ps(q5p5));
 
     q4p4 = _mm_andnot_si128(flat2, q4p4);
     flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
     q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
-    _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
-    _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4));
+    _mm_storel_epi64((__m128i *)(s - 5 * pitch), q4p4);
+    _mm_storeh_pi((__m64 *)(s + 4 * pitch), _mm_castsi128_ps(q4p4));
 
     q3p3 = _mm_andnot_si128(flat2, q3p3);
     flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
     q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
-    _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
-    _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3));
+    _mm_storel_epi64((__m128i *)(s - 4 * pitch), q3p3);
+    _mm_storeh_pi((__m64 *)(s + 3 * pitch), _mm_castsi128_ps(q3p3));
 
     q2p2 = _mm_andnot_si128(flat2, q2p2);
     flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
     q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
-    _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
-    _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2));
+    _mm_storel_epi64((__m128i *)(s - 3 * pitch), q2p2);
+    _mm_storeh_pi((__m64 *)(s + 2 * pitch), _mm_castsi128_ps(q2p2));
 
     q1p1 = _mm_andnot_si128(flat2, q1p1);
     flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
     q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
-    _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
-    _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1));
+    _mm_storel_epi64((__m128i *)(s - 2 * pitch), q1p1);
+    _mm_storeh_pi((__m64 *)(s + 1 * pitch), _mm_castsi128_ps(q1p1));
 
     q0p0 = _mm_andnot_si128(flat2, q0p0);
     flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
     q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
-    _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
-    _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0));
+    _mm_storel_epi64((__m128i *)(s - 1 * pitch), q0p0);
+    _mm_storeh_pi((__m64 *)(s - 0 * pitch), _mm_castsi128_ps(q0p0));
   }
 }
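
Layout note for the loads above (a sketch, not library code): each qXpX register packs the pX row in its low 64 bits and the matching qX row in its high 64 bits, so one 128-bit operation filters both sides of the edge at once.

  // q0p0 = [ p0 row (8 bytes) | q0 row (8 bytes) ]   via loadl + loadh
  // p0q0 = _mm_shuffle_epi32(q0p0, 78);  // 78 = _MM_SHUFFLE(1, 0, 3, 2),
  //                                      // i.e. swap the 64-bit halves
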
 
@@ -367,10 +367,10 @@ DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = {
   8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128
 };
 
-void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p,
-                                     const unsigned char *_blimit,
-                                     const unsigned char *_limit,
-                                     const unsigned char *_thresh) {
+void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int pitch,
+                                     const unsigned char *blimit,
+                                     const unsigned char *limit,
+                                     const unsigned char *thresh) {
   __m128i mask, hev, flat, flat2;
   const __m128i zero = _mm_set1_epi16(0);
   const __m128i one = _mm_set1_epi8(1);
@@ -380,32 +380,32 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p,
   __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4, q256_4,
       p256_3, q256_3, p256_2, q256_2, p256_1, q256_1, p256_0, q256_0;
 
-  const __m128i thresh =
-      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0]));
-  const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0]));
-  const __m128i blimit =
-      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0]));
-
-  p256_4 =
-      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 5 * p)));
-  p256_3 =
-      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 4 * p)));
-  p256_2 =
-      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 3 * p)));
-  p256_1 =
-      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 2 * p)));
-  p256_0 =
-      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 1 * p)));
-  q256_0 =
-      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 0 * p)));
-  q256_1 =
-      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 1 * p)));
-  q256_2 =
-      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 2 * p)));
-  q256_3 =
-      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 3 * p)));
-  q256_4 =
-      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 4 * p)));
+  const __m128i thresh_v =
+      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)thresh[0]));
+  const __m128i limit_v = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)limit[0]));
+  const __m128i blimit_v =
+      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)blimit[0]));
+
+  p256_4 = _mm256_castpd_si256(
+      _mm256_broadcast_pd((__m128d const *)(s - 5 * pitch)));
+  p256_3 = _mm256_castpd_si256(
+      _mm256_broadcast_pd((__m128d const *)(s - 4 * pitch)));
+  p256_2 = _mm256_castpd_si256(
+      _mm256_broadcast_pd((__m128d const *)(s - 3 * pitch)));
+  p256_1 = _mm256_castpd_si256(
+      _mm256_broadcast_pd((__m128d const *)(s - 2 * pitch)));
+  p256_0 = _mm256_castpd_si256(
+      _mm256_broadcast_pd((__m128d const *)(s - 1 * pitch)));
+  q256_0 = _mm256_castpd_si256(
+      _mm256_broadcast_pd((__m128d const *)(s - 0 * pitch)));
+  q256_1 = _mm256_castpd_si256(
+      _mm256_broadcast_pd((__m128d const *)(s + 1 * pitch)));
+  q256_2 = _mm256_castpd_si256(
+      _mm256_broadcast_pd((__m128d const *)(s + 2 * pitch)));
+  q256_3 = _mm256_castpd_si256(
+      _mm256_broadcast_pd((__m128d const *)(s + 3 * pitch)));
+  q256_4 = _mm256_castpd_si256(
+      _mm256_broadcast_pd((__m128d const *)(s + 4 * pitch)));
 
   p4 = _mm256_castsi256_si128(p256_4);
   p3 = _mm256_castsi256_si128(p256_3);
@@ -431,12 +431,12 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p,
         _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
     __m128i work;
     flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
-    hev = _mm_subs_epu8(flat, thresh);
+    hev = _mm_subs_epu8(flat, thresh_v);
     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
 
     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
-    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
     mask = _mm_max_epu8(flat, mask);
@@ -450,7 +450,7 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p,
         _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
         _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
     mask = _mm_max_epu8(work, mask);
-    mask = _mm_subs_epu8(mask, limit);
+    mask = _mm_subs_epu8(mask, limit_v);
     mask = _mm_cmpeq_epi8(mask, zero);
   }
 
@@ -532,9 +532,9 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p,
       flat = _mm_and_si128(flat, mask);
 
       p256_5 = _mm256_castpd_si256(
-          _mm256_broadcast_pd((__m128d const *)(s - 6 * p)));
+          _mm256_broadcast_pd((__m128d const *)(s - 6 * pitch)));
       q256_5 = _mm256_castpd_si256(
-          _mm256_broadcast_pd((__m128d const *)(s + 5 * p)));
+          _mm256_broadcast_pd((__m128d const *)(s + 5 * pitch)));
       p5 = _mm256_castsi256_si128(p256_5);
       q5 = _mm256_castsi256_si128(q256_5);
       flat2 = _mm_max_epu8(
@@ -543,9 +543,9 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p,
 
       flat2 = _mm_max_epu8(work, flat2);
       p256_6 = _mm256_castpd_si256(
-          _mm256_broadcast_pd((__m128d const *)(s - 7 * p)));
+          _mm256_broadcast_pd((__m128d const *)(s - 7 * pitch)));
       q256_6 = _mm256_castpd_si256(
-          _mm256_broadcast_pd((__m128d const *)(s + 6 * p)));
+          _mm256_broadcast_pd((__m128d const *)(s + 6 * pitch)));
       p6 = _mm256_castsi256_si128(p256_6);
       q6 = _mm256_castsi256_si128(q256_6);
       work = _mm_max_epu8(
@@ -555,9 +555,9 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p,
       flat2 = _mm_max_epu8(work, flat2);
 
       p256_7 = _mm256_castpd_si256(
-          _mm256_broadcast_pd((__m128d const *)(s - 8 * p)));
+          _mm256_broadcast_pd((__m128d const *)(s - 8 * pitch)));
       q256_7 = _mm256_castpd_si256(
-          _mm256_broadcast_pd((__m128d const *)(s + 7 * p)));
+          _mm256_broadcast_pd((__m128d const *)(s + 7 * pitch)));
       p7 = _mm256_castsi256_si128(p256_7);
       q7 = _mm256_castsi256_si128(q256_7);
       work = _mm_max_epu8(
@@ -843,71 +843,71 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p,
     p6 = _mm_andnot_si128(flat2, p6);
     flat2_p6 = _mm_and_si128(flat2, flat2_p6);
     p6 = _mm_or_si128(flat2_p6, p6);
-    _mm_storeu_si128((__m128i *)(s - 7 * p), p6);
+    _mm_storeu_si128((__m128i *)(s - 7 * pitch), p6);
 
     p5 = _mm_andnot_si128(flat2, p5);
     flat2_p5 = _mm_and_si128(flat2, flat2_p5);
     p5 = _mm_or_si128(flat2_p5, p5);
-    _mm_storeu_si128((__m128i *)(s - 6 * p), p5);
+    _mm_storeu_si128((__m128i *)(s - 6 * pitch), p5);
 
     p4 = _mm_andnot_si128(flat2, p4);
     flat2_p4 = _mm_and_si128(flat2, flat2_p4);
     p4 = _mm_or_si128(flat2_p4, p4);
-    _mm_storeu_si128((__m128i *)(s - 5 * p), p4);
+    _mm_storeu_si128((__m128i *)(s - 5 * pitch), p4);
 
     p3 = _mm_andnot_si128(flat2, p3);
     flat2_p3 = _mm_and_si128(flat2, flat2_p3);
     p3 = _mm_or_si128(flat2_p3, p3);
-    _mm_storeu_si128((__m128i *)(s - 4 * p), p3);
+    _mm_storeu_si128((__m128i *)(s - 4 * pitch), p3);
 
     p2 = _mm_andnot_si128(flat2, p2);
     flat2_p2 = _mm_and_si128(flat2, flat2_p2);
     p2 = _mm_or_si128(flat2_p2, p2);
-    _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
+    _mm_storeu_si128((__m128i *)(s - 3 * pitch), p2);
 
     p1 = _mm_andnot_si128(flat2, p1);
     flat2_p1 = _mm_and_si128(flat2, flat2_p1);
     p1 = _mm_or_si128(flat2_p1, p1);
-    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+    _mm_storeu_si128((__m128i *)(s - 2 * pitch), p1);
 
     p0 = _mm_andnot_si128(flat2, p0);
     flat2_p0 = _mm_and_si128(flat2, flat2_p0);
     p0 = _mm_or_si128(flat2_p0, p0);
-    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+    _mm_storeu_si128((__m128i *)(s - 1 * pitch), p0);
 
     q0 = _mm_andnot_si128(flat2, q0);
     flat2_q0 = _mm_and_si128(flat2, flat2_q0);
     q0 = _mm_or_si128(flat2_q0, q0);
-    _mm_storeu_si128((__m128i *)(s - 0 * p), q0);
+    _mm_storeu_si128((__m128i *)(s - 0 * pitch), q0);
 
     q1 = _mm_andnot_si128(flat2, q1);
     flat2_q1 = _mm_and_si128(flat2, flat2_q1);
     q1 = _mm_or_si128(flat2_q1, q1);
-    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+    _mm_storeu_si128((__m128i *)(s + 1 * pitch), q1);
 
     q2 = _mm_andnot_si128(flat2, q2);
     flat2_q2 = _mm_and_si128(flat2, flat2_q2);
     q2 = _mm_or_si128(flat2_q2, q2);
-    _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
+    _mm_storeu_si128((__m128i *)(s + 2 * pitch), q2);
 
     q3 = _mm_andnot_si128(flat2, q3);
     flat2_q3 = _mm_and_si128(flat2, flat2_q3);
     q3 = _mm_or_si128(flat2_q3, q3);
-    _mm_storeu_si128((__m128i *)(s + 3 * p), q3);
+    _mm_storeu_si128((__m128i *)(s + 3 * pitch), q3);
 
     q4 = _mm_andnot_si128(flat2, q4);
     flat2_q4 = _mm_and_si128(flat2, flat2_q4);
     q4 = _mm_or_si128(flat2_q4, q4);
-    _mm_storeu_si128((__m128i *)(s + 4 * p), q4);
+    _mm_storeu_si128((__m128i *)(s + 4 * pitch), q4);
 
     q5 = _mm_andnot_si128(flat2, q5);
     flat2_q5 = _mm_and_si128(flat2, flat2_q5);
     q5 = _mm_or_si128(flat2_q5, q5);
-    _mm_storeu_si128((__m128i *)(s + 5 * p), q5);
+    _mm_storeu_si128((__m128i *)(s + 5 * pitch), q5);
 
     q6 = _mm_andnot_si128(flat2, q6);
     flat2_q6 = _mm_and_si128(flat2, flat2_q6);
     q6 = _mm_or_si128(flat2_q6, q6);
-    _mm_storeu_si128((__m128i *)(s + 6 * p), q6);
+    _mm_storeu_si128((__m128i *)(s + 6 * pitch), q6);
   }
 }
index 853c4d2..20dcb0d 100644 (file)
@@ -31,7 +31,7 @@ static INLINE __m128i abs_diff(__m128i a, __m128i b) {
     /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */       \
     hev =                                                                     \
         _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \
-    hev = _mm_cmpgt_epi16(hev, thresh);                                       \
+    hev = _mm_cmpgt_epi16(hev, thresh_v);                                     \
     hev = _mm_packs_epi16(hev, hev);                                          \
                                                                               \
     /* const int8_t mask = filter_mask(*limit, *blimit, */                    \
@@ -52,7 +52,7 @@ static INLINE __m128i abs_diff(__m128i a, __m128i b) {
     flat = _mm_max_epu8(work, flat);                                          \
     flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));                       \
     mask = _mm_unpacklo_epi64(mask, flat);                                    \
-    mask = _mm_subs_epu8(mask, limit);                                        \
+    mask = _mm_subs_epu8(mask, limit_v);                                      \
     mask = _mm_cmpeq_epi8(mask, zero);                                        \
     mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8));                      \
   } while (0)
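
The macro widens to 16 bits and compares against thresh_v with a signed greater-than, which matches the scalar hev mask referenced in its opening comment (cf. hev_mask in vpx_dsp/loopfilter.c; roughly):

  static int8_t hev_mask(uint8_t thresh, uint8_t p1, uint8_t p0, uint8_t q0,
                         uint8_t q1) {
    int8_t hev = 0;
    hev |= (abs(p1 - p0) > thresh) * -1;
    hev |= (abs(q1 - q0) > thresh) * -1;
    return hev;
  }
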
@@ -104,27 +104,26 @@ static INLINE __m128i abs_diff(__m128i a, __m128i b) {
     ps1ps0 = _mm_xor_si128(ps1ps0, t80); /* ^ 0x80 */                       \
   } while (0)
 
-void vpx_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */,
-                               const uint8_t *_blimit, const uint8_t *_limit,
-                               const uint8_t *_thresh) {
+void vpx_lpf_horizontal_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit,
+                               const uint8_t *limit, const uint8_t *thresh) {
   const __m128i zero = _mm_set1_epi16(0);
-  const __m128i limit =
-      _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
-                         _mm_loadl_epi64((const __m128i *)_limit));
-  const __m128i thresh =
-      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
+  const __m128i limit_v =
+      _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)blimit),
+                         _mm_loadl_epi64((const __m128i *)limit));
+  const __m128i thresh_v =
+      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)thresh), zero);
   const __m128i ff = _mm_cmpeq_epi8(zero, zero);
   __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0;
   __m128i mask, hev;
 
-  p3p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
-                            _mm_loadl_epi64((__m128i *)(s - 4 * p)));
-  q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 1 * p)));
-  q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 0 * p)));
-  q3q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s + 2 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 3 * p)));
+  p3p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * pitch)),
+                            _mm_loadl_epi64((__m128i *)(s - 4 * pitch)));
+  q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * pitch)),
+                            _mm_loadl_epi64((__m128i *)(s + 1 * pitch)));
+  q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * pitch)),
+                            _mm_loadl_epi64((__m128i *)(s + 0 * pitch)));
+  q3q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s + 2 * pitch)),
+                            _mm_loadl_epi64((__m128i *)(s + 3 * pitch)));
   p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
   p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
   q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
@@ -133,41 +132,40 @@ void vpx_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */,
   FILTER_HEV_MASK;
   FILTER4;
 
-  _mm_storeh_pi((__m64 *)(s - 2 * p), _mm_castsi128_ps(ps1ps0));  // *op1
-  _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0);               // *op0
-  _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0);               // *oq0
-  _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(qs1qs0));  // *oq1
+  _mm_storeh_pi((__m64 *)(s - 2 * pitch), _mm_castsi128_ps(ps1ps0));  // *op1
+  _mm_storel_epi64((__m128i *)(s - 1 * pitch), ps1ps0);               // *op0
+  _mm_storel_epi64((__m128i *)(s + 0 * pitch), qs1qs0);               // *oq0
+  _mm_storeh_pi((__m64 *)(s + 1 * pitch), _mm_castsi128_ps(qs1qs0));  // *oq1
 }
 
-void vpx_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
-                             const uint8_t *_blimit, const uint8_t *_limit,
-                             const uint8_t *_thresh) {
+void vpx_lpf_vertical_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit,
+                             const uint8_t *limit, const uint8_t *thresh) {
   const __m128i zero = _mm_set1_epi16(0);
-  const __m128i limit =
-      _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
-                         _mm_loadl_epi64((const __m128i *)_limit));
-  const __m128i thresh =
-      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
+  const __m128i limit_v =
+      _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)blimit),
+                         _mm_loadl_epi64((const __m128i *)limit));
+  const __m128i thresh_v =
+      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)thresh), zero);
   const __m128i ff = _mm_cmpeq_epi8(zero, zero);
   __m128i x0, x1, x2, x3;
   __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0;
   __m128i mask, hev;
 
   // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
-  q1q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 0 * p - 4)),
-                           _mm_loadl_epi64((__m128i *)(s + 1 * p - 4)));
+  q1q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 0 * pitch - 4)),
+                           _mm_loadl_epi64((__m128i *)(s + 1 * pitch - 4)));
 
   // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
-  x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 2 * p - 4)),
-                         _mm_loadl_epi64((__m128i *)(s + 3 * p - 4)));
+  x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 2 * pitch - 4)),
+                         _mm_loadl_epi64((__m128i *)(s + 3 * pitch - 4)));
 
   // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
-  x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 4 * p - 4)),
-                         _mm_loadl_epi64((__m128i *)(s + 5 * p - 4)));
+  x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 4 * pitch - 4)),
+                         _mm_loadl_epi64((__m128i *)(s + 5 * pitch - 4)));
 
   // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
-  x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 6 * p - 4)),
-                         _mm_loadl_epi64((__m128i *)(s + 7 * p - 4)));
+  x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 6 * pitch - 4)),
+                         _mm_loadl_epi64((__m128i *)(s + 7 * pitch - 4)));
 
   // Transpose 8x8
   // 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
@@ -213,52 +211,52 @@ void vpx_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
   // 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
   ps1ps0 = _mm_unpacklo_epi8(ps1ps0, x0);
 
-  storeu_uint32(s + 0 * p - 2, _mm_cvtsi128_si32(ps1ps0));
+  storeu_uint32(s + 0 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
   ps1ps0 = _mm_srli_si128(ps1ps0, 4);
-  storeu_uint32(s + 1 * p - 2, _mm_cvtsi128_si32(ps1ps0));
+  storeu_uint32(s + 1 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
   ps1ps0 = _mm_srli_si128(ps1ps0, 4);
-  storeu_uint32(s + 2 * p - 2, _mm_cvtsi128_si32(ps1ps0));
+  storeu_uint32(s + 2 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
   ps1ps0 = _mm_srli_si128(ps1ps0, 4);
-  storeu_uint32(s + 3 * p - 2, _mm_cvtsi128_si32(ps1ps0));
+  storeu_uint32(s + 3 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
 
-  storeu_uint32(s + 4 * p - 2, _mm_cvtsi128_si32(qs1qs0));
+  storeu_uint32(s + 4 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
   qs1qs0 = _mm_srli_si128(qs1qs0, 4);
-  storeu_uint32(s + 5 * p - 2, _mm_cvtsi128_si32(qs1qs0));
+  storeu_uint32(s + 5 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
   qs1qs0 = _mm_srli_si128(qs1qs0, 4);
-  storeu_uint32(s + 6 * p - 2, _mm_cvtsi128_si32(qs1qs0));
+  storeu_uint32(s + 6 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
   qs1qs0 = _mm_srli_si128(qs1qs0, 4);
-  storeu_uint32(s + 7 * p - 2, _mm_cvtsi128_si32(qs1qs0));
+  storeu_uint32(s + 7 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
 }
 
-void vpx_lpf_horizontal_16_sse2(unsigned char *s, int p,
-                                const unsigned char *_blimit,
-                                const unsigned char *_limit,
-                                const unsigned char *_thresh) {
+void vpx_lpf_horizontal_16_sse2(unsigned char *s, int pitch,
+                                const unsigned char *blimit,
+                                const unsigned char *limit,
+                                const unsigned char *thresh) {
   const __m128i zero = _mm_set1_epi16(0);
   const __m128i one = _mm_set1_epi8(1);
-  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
-  const __m128i limit = _mm_load_si128((const __m128i *)_limit);
-  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
+  const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit);
+  const __m128i limit_v = _mm_load_si128((const __m128i *)limit);
+  const __m128i thresh_v = _mm_load_si128((const __m128i *)thresh);
   __m128i mask, hev, flat, flat2;
   __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
   __m128i abs_p1p0;
 
-  q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
+  q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * pitch));
   q4p4 = _mm_castps_si128(
-      _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p)));
-  q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
+      _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * pitch)));
+  q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * pitch));
   q3p3 = _mm_castps_si128(
-      _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p)));
-  q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
+      _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * pitch)));
+  q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * pitch));
   q2p2 = _mm_castps_si128(
-      _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p)));
-  q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
+      _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * pitch)));
+  q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * pitch));
   q1p1 = _mm_castps_si128(
-      _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p)));
+      _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * pitch)));
   p1q1 = _mm_shuffle_epi32(q1p1, 78);
-  q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
+  q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * pitch));
   q0p0 = _mm_castps_si128(
-      _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p)));
+      _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * pitch)));
   p0q0 = _mm_shuffle_epi32(q0p0, 78);
 
   {
@@ -270,12 +268,12 @@ void vpx_lpf_horizontal_16_sse2(unsigned char *s, int p,
     abs_p0q0 = abs_diff(q0p0, p0q0);
     abs_p1q1 = abs_diff(q1p1, p1q1);
     flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
-    hev = _mm_subs_epu8(flat, thresh);
+    hev = _mm_subs_epu8(flat, thresh_v);
     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
 
     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
-    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
     mask = _mm_max_epu8(abs_p1p0, mask);
@@ -285,7 +283,7 @@ void vpx_lpf_horizontal_16_sse2(unsigned char *s, int p,
     work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
     mask = _mm_max_epu8(work, mask);
     mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
-    mask = _mm_subs_epu8(mask, limit);
+    mask = _mm_subs_epu8(mask, limit_v);
     mask = _mm_cmpeq_epi8(mask, zero);
   }
 
@@ -343,18 +341,18 @@ void vpx_lpf_horizontal_16_sse2(unsigned char *s, int p,
       flat = _mm_cmpeq_epi8(flat, zero);
       flat = _mm_and_si128(flat, mask);
 
-      q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
+      q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * pitch));
       q5p5 = _mm_castps_si128(
-          _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p)));
+          _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * pitch)));
 
-      q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
+      q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * pitch));
       q6p6 = _mm_castps_si128(
-          _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p)));
+          _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * pitch)));
       flat2 = _mm_max_epu8(abs_diff(q4p4, q0p0), abs_diff(q5p5, q0p0));
 
-      q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
+      q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * pitch));
       q7p7 = _mm_castps_si128(
-          _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p)));
+          _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * pitch)));
       work = _mm_max_epu8(abs_diff(q6p6, q0p0), abs_diff(q7p7, q0p0));
       flat2 = _mm_max_epu8(work, flat2);
       flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
@@ -521,44 +519,44 @@ void vpx_lpf_horizontal_16_sse2(unsigned char *s, int p,
     q6p6 = _mm_andnot_si128(flat2, q6p6);
     flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
     q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
-    _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6);
-    _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6));
+    _mm_storel_epi64((__m128i *)(s - 7 * pitch), q6p6);
+    _mm_storeh_pi((__m64 *)(s + 6 * pitch), _mm_castsi128_ps(q6p6));
 
     q5p5 = _mm_andnot_si128(flat2, q5p5);
     flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
     q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
-    _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
-    _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5));
+    _mm_storel_epi64((__m128i *)(s - 6 * pitch), q5p5);
+    _mm_storeh_pi((__m64 *)(s + 5 * pitch), _mm_castsi128_ps(q5p5));
 
     q4p4 = _mm_andnot_si128(flat2, q4p4);
     flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
     q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
-    _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
-    _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4));
+    _mm_storel_epi64((__m128i *)(s - 5 * pitch), q4p4);
+    _mm_storeh_pi((__m64 *)(s + 4 * pitch), _mm_castsi128_ps(q4p4));
 
     q3p3 = _mm_andnot_si128(flat2, q3p3);
     flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
     q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
-    _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
-    _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3));
+    _mm_storel_epi64((__m128i *)(s - 4 * pitch), q3p3);
+    _mm_storeh_pi((__m64 *)(s + 3 * pitch), _mm_castsi128_ps(q3p3));
 
     q2p2 = _mm_andnot_si128(flat2, q2p2);
     flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
     q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
-    _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
-    _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2));
+    _mm_storel_epi64((__m128i *)(s - 3 * pitch), q2p2);
+    _mm_storeh_pi((__m64 *)(s + 2 * pitch), _mm_castsi128_ps(q2p2));
 
     q1p1 = _mm_andnot_si128(flat2, q1p1);
     flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
     q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
-    _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
-    _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1));
+    _mm_storel_epi64((__m128i *)(s - 2 * pitch), q1p1);
+    _mm_storeh_pi((__m64 *)(s + 1 * pitch), _mm_castsi128_ps(q1p1));
 
     q0p0 = _mm_andnot_si128(flat2, q0p0);
     flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
     q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
-    _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
-    _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0));
+    _mm_storel_epi64((__m128i *)(s - 1 * pitch), q0p0);
+    _mm_storeh_pi((__m64 *)(s - 0 * pitch), _mm_castsi128_ps(q0p0));
   }
 }
 
@@ -592,15 +590,15 @@ static INLINE __m128i filter16_mask(const __m128i *const flat,
   return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
 }
 
-void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int p,
-                                     const unsigned char *_blimit,
-                                     const unsigned char *_limit,
-                                     const unsigned char *_thresh) {
+void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int pitch,
+                                     const unsigned char *blimit,
+                                     const unsigned char *limit,
+                                     const unsigned char *thresh) {
   const __m128i zero = _mm_set1_epi16(0);
   const __m128i one = _mm_set1_epi8(1);
-  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
-  const __m128i limit = _mm_load_si128((const __m128i *)_limit);
-  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
+  const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit);
+  const __m128i limit_v = _mm_load_si128((const __m128i *)limit);
+  const __m128i thresh_v = _mm_load_si128((const __m128i *)thresh);
   __m128i mask, hev, flat, flat2;
   __m128i p7, p6, p5;
   __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
@@ -610,22 +608,22 @@ void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int p,
 
   __m128i max_abs_p1p0q1q0;
 
-  p7 = _mm_loadu_si128((__m128i *)(s - 8 * p));
-  p6 = _mm_loadu_si128((__m128i *)(s - 7 * p));
-  p5 = _mm_loadu_si128((__m128i *)(s - 6 * p));
-  p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
-  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
-  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
-  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
-  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
-  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
-  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
-  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
-  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
-  q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));
-  q5 = _mm_loadu_si128((__m128i *)(s + 5 * p));
-  q6 = _mm_loadu_si128((__m128i *)(s + 6 * p));
-  q7 = _mm_loadu_si128((__m128i *)(s + 7 * p));
+  p7 = _mm_loadu_si128((__m128i *)(s - 8 * pitch));
+  p6 = _mm_loadu_si128((__m128i *)(s - 7 * pitch));
+  p5 = _mm_loadu_si128((__m128i *)(s - 6 * pitch));
+  p4 = _mm_loadu_si128((__m128i *)(s - 5 * pitch));
+  p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch));
+  p2 = _mm_loadu_si128((__m128i *)(s - 3 * pitch));
+  p1 = _mm_loadu_si128((__m128i *)(s - 2 * pitch));
+  p0 = _mm_loadu_si128((__m128i *)(s - 1 * pitch));
+  q0 = _mm_loadu_si128((__m128i *)(s - 0 * pitch));
+  q1 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
+  q2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
+  q3 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
+  q4 = _mm_loadu_si128((__m128i *)(s + 4 * pitch));
+  q5 = _mm_loadu_si128((__m128i *)(s + 5 * pitch));
+  q6 = _mm_loadu_si128((__m128i *)(s + 6 * pitch));
+  q7 = _mm_loadu_si128((__m128i *)(s + 7 * pitch));
 
   {
     const __m128i abs_p1p0 = abs_diff(p1, p0);
@@ -639,7 +637,7 @@ void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int p,
 
     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
-    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
     mask = _mm_max_epu8(max_abs_p1p0q1q0, mask);
@@ -649,7 +647,7 @@ void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int p,
     mask = _mm_max_epu8(work, mask);
     work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2));
     mask = _mm_max_epu8(work, mask);
-    mask = _mm_subs_epu8(mask, limit);
+    mask = _mm_subs_epu8(mask, limit_v);
     mask = _mm_cmpeq_epi8(mask, zero);
   }
 
@@ -695,7 +693,7 @@ void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int p,
     oq0 = _mm_xor_si128(q0, t80);
     oq1 = _mm_xor_si128(q1, t80);
 
-    hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh);
+    hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh_v);
     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
     filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev);
 
@@ -852,82 +850,82 @@ void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int p,
       f_hi = _mm_add_epi16(_mm_add_epi16(p5_hi, eight), f_hi);
 
       p6 = filter16_mask(&flat2, &p6, &f_lo, &f_hi);
-      _mm_storeu_si128((__m128i *)(s - 7 * p), p6);
+      _mm_storeu_si128((__m128i *)(s - 7 * pitch), p6);
 
       f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p5_lo, &p6_lo, &p7_lo);
       f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p5_hi, &p6_hi, &p7_hi);
       p5 = filter16_mask(&flat2, &p5, &f_lo, &f_hi);
-      _mm_storeu_si128((__m128i *)(s - 6 * p), p5);
+      _mm_storeu_si128((__m128i *)(s - 6 * pitch), p5);
 
       f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p4_lo, &p5_lo, &p7_lo);
       f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p4_hi, &p5_hi, &p7_hi);
       p4 = filter16_mask(&flat2, &p4, &f_lo, &f_hi);
-      _mm_storeu_si128((__m128i *)(s - 5 * p), p4);
+      _mm_storeu_si128((__m128i *)(s - 5 * pitch), p4);
 
       f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p3_lo, &p4_lo, &p7_lo);
       f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p3_hi, &p4_hi, &p7_hi);
       p3 = filter16_mask(&flat2, &p3, &f_lo, &f_hi);
-      _mm_storeu_si128((__m128i *)(s - 4 * p), p3);
+      _mm_storeu_si128((__m128i *)(s - 4 * pitch), p3);
 
       f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p2_lo, &p3_lo, &p7_lo);
       f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p2_hi, &p3_hi, &p7_hi);
       op2 = filter16_mask(&flat2, &op2, &f_lo, &f_hi);
-      _mm_storeu_si128((__m128i *)(s - 3 * p), op2);
+      _mm_storeu_si128((__m128i *)(s - 3 * pitch), op2);
 
       f_lo = filter_add2_sub2(&f_lo, &q5_lo, &p1_lo, &p2_lo, &p7_lo);
       f_hi = filter_add2_sub2(&f_hi, &q5_hi, &p1_hi, &p2_hi, &p7_hi);
       op1 = filter16_mask(&flat2, &op1, &f_lo, &f_hi);
-      _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
+      _mm_storeu_si128((__m128i *)(s - 2 * pitch), op1);
 
       f_lo = filter_add2_sub2(&f_lo, &q6_lo, &p0_lo, &p1_lo, &p7_lo);
       f_hi = filter_add2_sub2(&f_hi, &q6_hi, &p0_hi, &p1_hi, &p7_hi);
       op0 = filter16_mask(&flat2, &op0, &f_lo, &f_hi);
-      _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
+      _mm_storeu_si128((__m128i *)(s - 1 * pitch), op0);
 
       f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q0_lo, &p0_lo, &p7_lo);
       f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q0_hi, &p0_hi, &p7_hi);
       oq0 = filter16_mask(&flat2, &oq0, &f_lo, &f_hi);
-      _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
+      _mm_storeu_si128((__m128i *)(s - 0 * pitch), oq0);
 
       f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q1_lo, &p6_lo, &q0_lo);
       f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q1_hi, &p6_hi, &q0_hi);
       oq1 = filter16_mask(&flat2, &oq1, &f_lo, &f_hi);
-      _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
+      _mm_storeu_si128((__m128i *)(s + 1 * pitch), oq1);
 
       f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q2_lo, &p5_lo, &q1_lo);
       f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q2_hi, &p5_hi, &q1_hi);
       oq2 = filter16_mask(&flat2, &oq2, &f_lo, &f_hi);
-      _mm_storeu_si128((__m128i *)(s + 2 * p), oq2);
+      _mm_storeu_si128((__m128i *)(s + 2 * pitch), oq2);
 
       f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q3_lo, &p4_lo, &q2_lo);
       f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q3_hi, &p4_hi, &q2_hi);
       q3 = filter16_mask(&flat2, &q3, &f_lo, &f_hi);
-      _mm_storeu_si128((__m128i *)(s + 3 * p), q3);
+      _mm_storeu_si128((__m128i *)(s + 3 * pitch), q3);
 
       f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q4_lo, &p3_lo, &q3_lo);
       f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q4_hi, &p3_hi, &q3_hi);
       q4 = filter16_mask(&flat2, &q4, &f_lo, &f_hi);
-      _mm_storeu_si128((__m128i *)(s + 4 * p), q4);
+      _mm_storeu_si128((__m128i *)(s + 4 * pitch), q4);
 
       f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q5_lo, &p2_lo, &q4_lo);
       f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q5_hi, &p2_hi, &q4_hi);
       q5 = filter16_mask(&flat2, &q5, &f_lo, &f_hi);
-      _mm_storeu_si128((__m128i *)(s + 5 * p), q5);
+      _mm_storeu_si128((__m128i *)(s + 5 * pitch), q5);
 
       f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q6_lo, &p1_lo, &q5_lo);
       f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q6_hi, &p1_hi, &q5_hi);
       q6 = filter16_mask(&flat2, &q6, &f_lo, &f_hi);
-      _mm_storeu_si128((__m128i *)(s + 6 * p), q6);
+      _mm_storeu_si128((__m128i *)(s + 6 * pitch), q6);
     }
     // wide flat
     // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   }
 }
 
-void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p,
-                               const unsigned char *_blimit,
-                               const unsigned char *_limit,
-                               const unsigned char *_thresh) {
+void vpx_lpf_horizontal_8_sse2(unsigned char *s, int pitch,
+                               const unsigned char *blimit,
+                               const unsigned char *limit,
+                               const unsigned char *thresh) {
   DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
   DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
   DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
@@ -935,21 +933,21 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p,
   DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
   DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
   const __m128i zero = _mm_set1_epi16(0);
-  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
-  const __m128i limit = _mm_load_si128((const __m128i *)_limit);
-  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
+  const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit);
+  const __m128i limit_v = _mm_load_si128((const __m128i *)limit);
+  const __m128i thresh_v = _mm_load_si128((const __m128i *)thresh);
   __m128i mask, hev, flat;
   __m128i p3, p2, p1, p0, q0, q1, q2, q3;
   __m128i q3p3, q2p2, q1p1, q0p0, p1q1, p0q0;
 
-  q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 3 * p)));
-  q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 2 * p)));
-  q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 1 * p)));
-  q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
-                            _mm_loadl_epi64((__m128i *)(s - 0 * p)));
+  q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * pitch)),
+                            _mm_loadl_epi64((__m128i *)(s + 3 * pitch)));
+  q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * pitch)),
+                            _mm_loadl_epi64((__m128i *)(s + 2 * pitch)));
+  q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * pitch)),
+                            _mm_loadl_epi64((__m128i *)(s + 1 * pitch)));
+  q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * pitch)),
+                            _mm_loadl_epi64((__m128i *)(s - 0 * pitch)));
   p1q1 = _mm_shuffle_epi32(q1p1, 78);
   p0q0 = _mm_shuffle_epi32(q0p0, 78);
 
@@ -965,12 +963,12 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p,
     abs_p0q0 = abs_diff(q0p0, p0q0);
     abs_p1q1 = abs_diff(q1p1, p1q1);
     flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
-    hev = _mm_subs_epu8(flat, thresh);
+    hev = _mm_subs_epu8(flat, thresh_v);
     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
 
     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
-    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
     mask = _mm_max_epu8(abs_p1p0, mask);
@@ -980,7 +978,7 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p,
     work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
     mask = _mm_max_epu8(work, mask);
     mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
-    mask = _mm_subs_epu8(mask, limit);
+    mask = _mm_subs_epu8(mask, limit_v);
     mask = _mm_cmpeq_epi8(mask, zero);
 
     // flat_mask4
@@ -998,14 +996,22 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p,
     unsigned char *src = s;
     {
       __m128i workp_a, workp_b, workp_shft;
-      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
-      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
-      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
-      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
-      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
-      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
-      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
-      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
+      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * pitch)),
+                             zero);
+      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * pitch)),
+                             zero);
+      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * pitch)),
+                             zero);
+      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * pitch)),
+                             zero);
+      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * pitch)),
+                             zero);
+      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * pitch)),
+                             zero);
+      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * pitch)),
+                             zero);
+      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * pitch)),
+                             zero);
 
       workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
       workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
@@ -1051,13 +1057,13 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p,
     const __m128i t80 = _mm_set1_epi8(0x80);
     const __m128i t1 = _mm_set1_epi8(0x1);
     const __m128i ps1 =
-        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)), t80);
+        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * pitch)), t80);
     const __m128i ps0 =
-        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)), t80);
+        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * pitch)), t80);
     const __m128i qs0 =
-        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)), t80);
+        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * pitch)), t80);
     const __m128i qs1 =
-        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)), t80);
+        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * pitch)), t80);
     __m128i filt;
     __m128i work_a;
     __m128i filter1, filter2;
@@ -1103,7 +1109,7 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p,
     q1 = _mm_and_si128(flat, q1);
     q1 = _mm_or_si128(work_a, q1);
 
-    work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
+    work_a = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
     q2 = _mm_loadl_epi64((__m128i *)flat_oq2);
     work_a = _mm_andnot_si128(flat, work_a);
     q2 = _mm_and_si128(flat, q2);
@@ -1121,27 +1127,25 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p,
     p1 = _mm_and_si128(flat, p1);
     p1 = _mm_or_si128(work_a, p1);
 
-    work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
+    work_a = _mm_loadu_si128((__m128i *)(s - 3 * pitch));
     p2 = _mm_loadl_epi64((__m128i *)flat_op2);
     work_a = _mm_andnot_si128(flat, work_a);
     p2 = _mm_and_si128(flat, p2);
     p2 = _mm_or_si128(work_a, p2);
 
-    _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
-    _mm_storel_epi64((__m128i *)(s - 2 * p), p1);
-    _mm_storel_epi64((__m128i *)(s - 1 * p), p0);
-    _mm_storel_epi64((__m128i *)(s + 0 * p), q0);
-    _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
-    _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
+    _mm_storel_epi64((__m128i *)(s - 3 * pitch), p2);
+    _mm_storel_epi64((__m128i *)(s - 2 * pitch), p1);
+    _mm_storel_epi64((__m128i *)(s - 1 * pitch), p0);
+    _mm_storel_epi64((__m128i *)(s + 0 * pitch), q0);
+    _mm_storel_epi64((__m128i *)(s + 1 * pitch), q1);
+    _mm_storel_epi64((__m128i *)(s + 2 * pitch), q2);
   }
 }
 
-void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
-                                    const uint8_t *_limit0,
-                                    const uint8_t *_thresh0,
-                                    const uint8_t *_blimit1,
-                                    const uint8_t *_limit1,
-                                    const uint8_t *_thresh1) {
+void vpx_lpf_horizontal_8_dual_sse2(
+    uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+    const uint8_t *thresh1) {
   DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
   DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
   DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
@@ -1150,26 +1154,26 @@ void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
   DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
   const __m128i zero = _mm_set1_epi16(0);
   const __m128i blimit =
-      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
-                         _mm_load_si128((const __m128i *)_blimit1));
+      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)blimit0),
+                         _mm_load_si128((const __m128i *)blimit1));
   const __m128i limit =
-      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0),
-                         _mm_load_si128((const __m128i *)_limit1));
+      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)limit0),
+                         _mm_load_si128((const __m128i *)limit1));
   const __m128i thresh =
-      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
-                         _mm_load_si128((const __m128i *)_thresh1));
+      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)thresh0),
+                         _mm_load_si128((const __m128i *)thresh1));
 
   __m128i mask, hev, flat;
   __m128i p3, p2, p1, p0, q0, q1, q2, q3;
 
-  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
-  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
-  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
-  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
-  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
-  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
-  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
-  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
+  p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch));
+  p2 = _mm_loadu_si128((__m128i *)(s - 3 * pitch));
+  p1 = _mm_loadu_si128((__m128i *)(s - 2 * pitch));
+  p0 = _mm_loadu_si128((__m128i *)(s - 1 * pitch));
+  q0 = _mm_loadu_si128((__m128i *)(s - 0 * pitch));
+  q1 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
+  q2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
+  q3 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
   {
     const __m128i abs_p1p0 =
         _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
@@ -1228,14 +1232,22 @@ void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
 
     do {
       __m128i workp_a, workp_b, workp_shft;
-      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
-      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
-      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
-      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
-      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
-      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
-      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
-      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
+      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * pitch)),
+                             zero);
+      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * pitch)),
+                             zero);
+      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * pitch)),
+                             zero);
+      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * pitch)),
+                             zero);
+      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * pitch)),
+                             zero);
+      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * pitch)),
+                             zero);
+      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * pitch)),
+                             zero);
+      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * pitch)),
+                             zero);
 
       workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
       workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
@@ -1287,13 +1299,13 @@ void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
     const __m128i t7f = _mm_set1_epi8(0x7f);
 
     const __m128i ps1 =
-        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
+        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * pitch)), t80);
     const __m128i ps0 =
-        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
+        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * pitch)), t80);
     const __m128i qs0 =
-        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
+        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * pitch)), t80);
     const __m128i qs1 =
-        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);
+        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * pitch)), t80);
     __m128i filt;
     __m128i work_a;
     __m128i filter1, filter2;
@@ -1345,7 +1357,7 @@ void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
     q1 = _mm_and_si128(flat, q1);
     q1 = _mm_or_si128(work_a, q1);
 
-    work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
+    work_a = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
     q2 = _mm_load_si128((__m128i *)flat_oq2);
     work_a = _mm_andnot_si128(flat, work_a);
     q2 = _mm_and_si128(flat, q2);
@@ -1363,49 +1375,49 @@ void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
     p1 = _mm_and_si128(flat, p1);
     p1 = _mm_or_si128(work_a, p1);
 
-    work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
+    work_a = _mm_loadu_si128((__m128i *)(s - 3 * pitch));
     p2 = _mm_load_si128((__m128i *)flat_op2);
     work_a = _mm_andnot_si128(flat, work_a);
     p2 = _mm_and_si128(flat, p2);
     p2 = _mm_or_si128(work_a, p2);
 
-    _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
-    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
-    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
-    _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
-    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
-    _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
+    _mm_storeu_si128((__m128i *)(s - 3 * pitch), p2);
+    _mm_storeu_si128((__m128i *)(s - 2 * pitch), p1);
+    _mm_storeu_si128((__m128i *)(s - 1 * pitch), p0);
+    _mm_storeu_si128((__m128i *)(s + 0 * pitch), q0);
+    _mm_storeu_si128((__m128i *)(s + 1 * pitch), q1);
+    _mm_storeu_si128((__m128i *)(s + 2 * pitch), q2);
   }
 }
 
-void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
-                                    const unsigned char *_blimit0,
-                                    const unsigned char *_limit0,
-                                    const unsigned char *_thresh0,
-                                    const unsigned char *_blimit1,
-                                    const unsigned char *_limit1,
-                                    const unsigned char *_thresh1) {
+void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int pitch,
+                                    const unsigned char *blimit0,
+                                    const unsigned char *limit0,
+                                    const unsigned char *thresh0,
+                                    const unsigned char *blimit1,
+                                    const unsigned char *limit1,
+                                    const unsigned char *thresh1) {
   const __m128i blimit =
-      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
-                         _mm_load_si128((const __m128i *)_blimit1));
+      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)blimit0),
+                         _mm_load_si128((const __m128i *)blimit1));
   const __m128i limit =
-      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0),
-                         _mm_load_si128((const __m128i *)_limit1));
+      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)limit0),
+                         _mm_load_si128((const __m128i *)limit1));
   const __m128i thresh =
-      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
-                         _mm_load_si128((const __m128i *)_thresh1));
+      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)thresh0),
+                         _mm_load_si128((const __m128i *)thresh1));
   const __m128i zero = _mm_set1_epi16(0);
   __m128i p3, p2, p1, p0, q0, q1, q2, q3;
   __m128i mask, hev, flat;
 
-  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
-  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
-  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
-  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
-  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
-  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
-  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
-  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
+  p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch));
+  p2 = _mm_loadu_si128((__m128i *)(s - 3 * pitch));
+  p1 = _mm_loadu_si128((__m128i *)(s - 2 * pitch));
+  p0 = _mm_loadu_si128((__m128i *)(s - 1 * pitch));
+  q0 = _mm_loadu_si128((__m128i *)(s - 0 * pitch));
+  q1 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
+  q2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
+  q3 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
 
   // filter_mask and hev_mask
   {
@@ -1456,13 +1468,13 @@ void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
     const __m128i t7f = _mm_set1_epi8(0x7f);
 
     const __m128i ps1 =
-        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
+        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * pitch)), t80);
     const __m128i ps0 =
-        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
+        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * pitch)), t80);
     const __m128i qs0 =
-        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
+        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * pitch)), t80);
     const __m128i qs1 =
-        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);
+        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * pitch)), t80);
     __m128i filt;
     __m128i work_a;
     __m128i filter1, filter2;
@@ -1507,10 +1519,10 @@ void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
     p0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
     p1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
 
-    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
-    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
-    _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
-    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+    _mm_storeu_si128((__m128i *)(s - 2 * pitch), p1);
+    _mm_storeu_si128((__m128i *)(s - 1 * pitch), p0);
+    _mm_storeu_si128((__m128i *)(s + 0 * pitch), q0);
+    _mm_storeu_si128((__m128i *)(s + 1 * pitch), q1);
   }
 }
 
@@ -1650,7 +1662,7 @@ static INLINE void transpose(unsigned char *src[], int in_p,
   } while (++idx8x8 < num_8x8_to_transpose);
 }
 
-void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
+void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0,
                                   const uint8_t *limit0, const uint8_t *thresh0,
                                   const uint8_t *blimit1, const uint8_t *limit1,
                                   const uint8_t *thresh1) {
@@ -1659,7 +1671,7 @@ void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
   unsigned char *dst[2];
 
   // Transpose 8x16
-  transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
+  transpose8x16(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16);
 
   // Loop filtering
   vpx_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
@@ -1667,13 +1679,13 @@ void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
   src[0] = t_dst;
   src[1] = t_dst + 8;
   dst[0] = s - 4;
-  dst[1] = s - 4 + p * 8;
+  dst[1] = s - 4 + pitch * 8;
 
   // Transpose back
-  transpose(src, 16, dst, p, 2);
+  transpose(src, 16, dst, pitch, 2);
 }
 
-void vpx_lpf_vertical_8_sse2(unsigned char *s, int p,
+void vpx_lpf_vertical_8_sse2(unsigned char *s, int pitch,
                              const unsigned char *blimit,
                              const unsigned char *limit,
                              const unsigned char *thresh) {
@@ -1685,7 +1697,7 @@ void vpx_lpf_vertical_8_sse2(unsigned char *s, int p,
   src[0] = s - 4;
   dst[0] = t_dst;
 
-  transpose(src, p, dst, 8, 1);
+  transpose(src, pitch, dst, 8, 1);
 
   // Loop filtering
   vpx_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh);
@@ -1694,10 +1706,10 @@ void vpx_lpf_vertical_8_sse2(unsigned char *s, int p,
   dst[0] = s - 4;
 
   // Transpose back
-  transpose(src, 8, dst, p, 1);
+  transpose(src, 8, dst, pitch, 1);
 }
 
-void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
+void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0,
                                   const uint8_t *limit0, const uint8_t *thresh0,
                                   const uint8_t *blimit1, const uint8_t *limit1,
                                   const uint8_t *thresh1) {
@@ -1706,7 +1718,7 @@ void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
   unsigned char *dst[2];
 
   // Transpose 8x16
-  transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
+  transpose8x16(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16);
 
   // Loop filtering
   vpx_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
@@ -1715,13 +1727,13 @@ void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
   src[1] = t_dst + 8;
 
   dst[0] = s - 4;
-  dst[1] = s - 4 + p * 8;
+  dst[1] = s - 4 + pitch * 8;
 
   // Transpose back
-  transpose(src, 16, dst, p, 2);
+  transpose(src, 16, dst, pitch, 2);
 }
 
-void vpx_lpf_vertical_16_sse2(unsigned char *s, int p,
+void vpx_lpf_vertical_16_sse2(unsigned char *s, int pitch,
                               const unsigned char *blimit,
                               const unsigned char *limit,
                               const unsigned char *thresh) {
@@ -1735,7 +1747,7 @@ void vpx_lpf_vertical_16_sse2(unsigned char *s, int p,
   dst[1] = t_dst + 8 * 8;
 
   // Transpose 16x8
-  transpose(src, p, dst, 8, 2);
+  transpose(src, pitch, dst, 8, 2);
 
   // Loop filtering
   vpx_lpf_horizontal_16_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh);
@@ -1746,22 +1758,22 @@ void vpx_lpf_vertical_16_sse2(unsigned char *s, int p,
   dst[1] = s;
 
   // Transpose back
-  transpose(src, 8, dst, p, 2);
+  transpose(src, 8, dst, pitch, 2);
 }
 
-void vpx_lpf_vertical_16_dual_sse2(unsigned char *s, int p,
+void vpx_lpf_vertical_16_dual_sse2(unsigned char *s, int pitch,
                                    const uint8_t *blimit, const uint8_t *limit,
                                    const uint8_t *thresh) {
   DECLARE_ALIGNED(16, unsigned char, t_dst[256]);
 
   // Transpose 16x16
-  transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
-  transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);
+  transpose8x16(s - 8, s - 8 + 8 * pitch, pitch, t_dst, 16);
+  transpose8x16(s, s + 8 * pitch, pitch, t_dst + 8 * 16, 16);
 
   // Loop filtering
   vpx_lpf_horizontal_16_dual_sse2(t_dst + 8 * 16, 16, blimit, limit, thresh);
 
   // Transpose back
-  transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);
-  transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);
+  transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, pitch);
+  transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * pitch, pitch);
 }
vpx_dsp/x86/quantize_avx.c
index 6f44890..692cf59 100644 (file)
@@ -24,8 +24,8 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                         const int16_t *round_ptr, const int16_t *quant_ptr,
                         const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
                         tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-                        uint16_t *eob_ptr, const int16_t *scan_ptr,
-                        const int16_t *iscan_ptr) {
+                        uint16_t *eob_ptr, const int16_t *scan,
+                        const int16_t *iscan) {
   const __m128i zero = _mm_setzero_si128();
   const __m256i big_zero = _mm256_setzero_si256();
   int index;
@@ -37,7 +37,7 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
   __m128i all_zero;
   __m128i eob = zero, eob0;
 
-  (void)scan_ptr;
+  (void)scan;
   (void)skip_block;
   assert(!skip_block);
 
@@ -97,8 +97,7 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
     store_tran_low(coeff0, dqcoeff_ptr);
     store_tran_low(coeff1, dqcoeff_ptr + 8);
 
-    eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0,
-                       zero);
+    eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
   }
 
   // AC only loop.
@@ -141,20 +140,22 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
     store_tran_low(coeff0, dqcoeff_ptr + index);
     store_tran_low(coeff1, dqcoeff_ptr + index + 8);
 
-    eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr,
-                        index, zero);
+    eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, index,
+                        zero);
     eob = _mm_max_epi16(eob, eob0);
   }
 
   *eob_ptr = accumulate_eob(eob);
 }
 
-void vpx_quantize_b_32x32_avx(
-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
-    const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
-    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
-    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
-    const int16_t *scan_ptr, const int16_t *iscan_ptr) {
+void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                              int skip_block, const int16_t *zbin_ptr,
+                              const int16_t *round_ptr,
+                              const int16_t *quant_ptr,
+                              const int16_t *quant_shift_ptr,
+                              tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                              const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                              const int16_t *scan, const int16_t *iscan) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i one = _mm_set1_epi16(1);
   const __m256i big_zero = _mm256_setzero_si256();
@@ -167,7 +168,7 @@ void vpx_quantize_b_32x32_avx(
   __m128i all_zero;
   __m128i eob = zero, eob0;
 
-  (void)scan_ptr;
+  (void)scan;
   (void)n_coeffs;
   (void)skip_block;
   assert(!skip_block);
@@ -253,8 +254,7 @@ void vpx_quantize_b_32x32_avx(
     store_tran_low(coeff0, dqcoeff_ptr);
     store_tran_low(coeff1, dqcoeff_ptr + 8);
 
-    eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0,
-                       zero);
+    eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
   }
 
   // AC only loop.
@@ -306,8 +306,8 @@ void vpx_quantize_b_32x32_avx(
     store_tran_low(coeff0, dqcoeff_ptr + index);
     store_tran_low(coeff1, dqcoeff_ptr + index + 8);
 
-    eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr,
-                        index, zero);
+    eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, index,
+                        zero);
     eob = _mm_max_epi16(eob, eob0);
   }
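In the quantize kernels only the renamed iscan is consumed; scan stays in the signature because the rtcd table dispatches every variant through one shared prototype, so the unused argument is voided rather than dropped. A small sketch of that idiom, with illustrative names (scan_fn and max_iscan_variant are not vpx API):

    #include <stddef.h>
    #include <stdint.h>

    /* Shared prototype forced on every variant by a dispatch table. */
    typedef int16_t (*scan_fn)(const int16_t *scan, const int16_t *iscan,
                               size_t n);

    static int16_t max_iscan_variant(const int16_t *scan,
                                     const int16_t *iscan, size_t n) {
      size_t i;
      int16_t max = 0;
      (void)scan; /* required by the shared signature, unused here */
      for (i = 0; i < n; i++) {
        if (iscan[i] > max) max = iscan[i];
      }
      return max;
    }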
 
vpx_dsp/x86/quantize_sse2.c
index c020b39..327fb05 100644 (file)
@@ -22,8 +22,8 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                          const int16_t *round_ptr, const int16_t *quant_ptr,
                          const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
                          tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-                         uint16_t *eob_ptr, const int16_t *scan_ptr,
-                         const int16_t *iscan_ptr) {
+                         uint16_t *eob_ptr, const int16_t *scan,
+                         const int16_t *iscan) {
   const __m128i zero = _mm_setzero_si128();
   int index = 16;
 
@@ -33,7 +33,7 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
   __m128i cmp_mask0, cmp_mask1;
   __m128i eob, eob0;
 
-  (void)scan_ptr;
+  (void)scan;
   (void)skip_block;
   assert(!skip_block);
 
@@ -81,8 +81,7 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
   store_tran_low(coeff0, dqcoeff_ptr);
   store_tran_low(coeff1, dqcoeff_ptr + 8);
 
-  eob =
-      scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, zero);
+  eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
 
   // AC only loop.
   while (index < n_coeffs) {
@@ -115,8 +114,8 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
     store_tran_low(coeff0, dqcoeff_ptr + index);
     store_tran_low(coeff1, dqcoeff_ptr + index + 8);
 
-    eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr,
-                        index, zero);
+    eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, index,
+                        zero);
     eob = _mm_max_epi16(eob, eob0);
 
     index += 16;
vpx_dsp/x86/quantize_ssse3.c
index 3f528e1..d8b6dc7 100644 (file)
@@ -22,7 +22,7 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                           const int16_t *quant_shift_ptr,
                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                           const int16_t *dequant_ptr, uint16_t *eob_ptr,
-                          const int16_t *scan_ptr, const int16_t *iscan_ptr) {
+                          const int16_t *scan, const int16_t *iscan) {
   const __m128i zero = _mm_setzero_si128();
   int index = 16;
 
@@ -32,7 +32,7 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
   __m128i cmp_mask0, cmp_mask1;
   __m128i eob, eob0;
 
-  (void)scan_ptr;
+  (void)scan;
   (void)skip_block;
   assert(!skip_block);
 
@@ -74,8 +74,7 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
   store_tran_low(coeff0, dqcoeff_ptr);
   store_tran_low(coeff1, dqcoeff_ptr + 8);
 
-  eob =
-      scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, zero);
+  eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
 
   // AC only loop.
   while (index < n_coeffs) {
@@ -106,8 +105,8 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
     store_tran_low(coeff0, dqcoeff_ptr + index);
     store_tran_low(coeff1, dqcoeff_ptr + index + 8);
 
-    eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr,
-                        index, zero);
+    eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, index,
+                        zero);
     eob = _mm_max_epi16(eob, eob0);
 
     index += 16;
@@ -116,12 +115,14 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
   *eob_ptr = accumulate_eob(eob);
 }
 
-void vpx_quantize_b_32x32_ssse3(
-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
-    const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
-    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
-    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
-    const int16_t *scan_ptr, const int16_t *iscan_ptr) {
+void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                                int skip_block, const int16_t *zbin_ptr,
+                                const int16_t *round_ptr,
+                                const int16_t *quant_ptr,
+                                const int16_t *quant_shift_ptr,
+                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                                const int16_t *scan, const int16_t *iscan) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i one = _mm_set1_epi16(1);
   int index;
@@ -133,7 +134,7 @@ void vpx_quantize_b_32x32_ssse3(
   __m128i all_zero;
   __m128i eob = zero, eob0;
 
-  (void)scan_ptr;
+  (void)scan;
   (void)n_coeffs;
   (void)skip_block;
   assert(!skip_block);
@@ -226,8 +227,7 @@ void vpx_quantize_b_32x32_ssse3(
     store_tran_low(coeff0, dqcoeff_ptr);
     store_tran_low(coeff1, dqcoeff_ptr + 8);
 
-    eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0,
-                       zero);
+    eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
   }
 
   // AC only loop.
@@ -283,8 +283,8 @@ void vpx_quantize_b_32x32_ssse3(
     store_tran_low(coeff0, dqcoeff_ptr + index);
     store_tran_low(coeff1, dqcoeff_ptr + index + 8);
 
-    eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr,
-                        index, zero);
+    eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, index,
+                        zero);
     eob = _mm_max_epi16(eob, eob0);
   }
 
vpx_dsp/x86/quantize_x86.h
index bb9e32f..a6e2f27 100644 (file)
@@ -48,17 +48,17 @@ static INLINE __m128i calculate_dqcoeff(__m128i qcoeff, __m128i dequant) {
   return _mm_mullo_epi16(qcoeff, dequant);
 }
 
-// Scan 16 values for eob reference in scan_ptr. Use masks (-1) from comparing
-// to zbin to add 1 to the index in 'scan'.
+// Scan 16 values for eob reference in scan. Use masks (-1) from comparing to
+// zbin to add 1 to the index in 'scan'.
 static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1,
                                    const __m128i zbin_mask0,
                                    const __m128i zbin_mask1,
-                                   const int16_t *scan_ptr, const int index,
+                                   const int16_t *scan, const int index,
                                    const __m128i zero) {
   const __m128i zero_coeff0 = _mm_cmpeq_epi16(*coeff0, zero);
   const __m128i zero_coeff1 = _mm_cmpeq_epi16(*coeff1, zero);
-  __m128i scan0 = _mm_load_si128((const __m128i *)(scan_ptr + index));
-  __m128i scan1 = _mm_load_si128((const __m128i *)(scan_ptr + index + 8));
+  __m128i scan0 = _mm_load_si128((const __m128i *)(scan + index));
+  __m128i scan1 = _mm_load_si128((const __m128i *)(scan + index + 8));
   __m128i eob0, eob1;
   // Add one to convert from indices to counts
   scan0 = _mm_sub_epi16(scan0, zbin_mask0);
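The comment above is the heart of the eob bookkeeping: iscan holds 0-based scan positions, so adding one turns an index into a count, and the per-lane maximum (later reduced by accumulate_eob) yields the end-of-block value. A scalar model of what scan_for_eob computes, offered as a sketch (the real code keeps sixteen lanes in __m128i registers):

    #include <stdint.h>

    /* eob is one past the highest inverse-scan position holding a nonzero
       quantized coefficient, or 0 when the block is all zero. */
    static uint16_t scalar_eob(const int16_t *qcoeff, const int16_t *iscan,
                               int n_coeffs) {
      int i;
      uint16_t eob = 0;
      for (i = 0; i < n_coeffs; i++) {
        if (qcoeff[i] != 0) {
          const uint16_t count = (uint16_t)(iscan[i] + 1); /* index->count */
          if (count > eob) eob = count;
        }
      }
      return eob;
    }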
vpx_dsp/x86/sad4d_avx2.c
index 2c6b36f..b18fecf 100644 (file)
 #include "vpx/vpx_integer.h"
 
 static INLINE void calc_final(const __m256i *const sums /*[4]*/,
-                              uint32_t res[4]) {
+                              uint32_t sad_array[4]) {
   const __m256i t0 = _mm256_hadd_epi32(sums[0], sums[1]);
   const __m256i t1 = _mm256_hadd_epi32(sums[2], sums[3]);
   const __m256i t2 = _mm256_hadd_epi32(t0, t1);
   const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t2),
                                     _mm256_extractf128_si256(t2, 1));
-  _mm_storeu_si128((__m128i *)res, sum);
+  _mm_storeu_si128((__m128i *)sad_array, sum);
 }
 
-void vpx_sad32x32x4d_avx2(const uint8_t *src, int src_stride,
-                          const uint8_t *const ref[4], int ref_stride,
-                          uint32_t res[4]) {
+void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride,
+                          const uint8_t *const ref_array[4], int ref_stride,
+                          uint32_t sad_array[4]) {
   int i;
   const uint8_t *refs[4];
   __m256i sums[4];
 
-  refs[0] = ref[0];
-  refs[1] = ref[1];
-  refs[2] = ref[2];
-  refs[3] = ref[3];
+  refs[0] = ref_array[0];
+  refs[1] = ref_array[1];
+  refs[2] = ref_array[2];
+  refs[3] = ref_array[3];
   sums[0] = _mm256_setzero_si256();
   sums[1] = _mm256_setzero_si256();
   sums[2] = _mm256_setzero_si256();
@@ -40,46 +40,46 @@ void vpx_sad32x32x4d_avx2(const uint8_t *src, int src_stride,
   for (i = 0; i < 32; i++) {
     __m256i r[4];
 
-    // load src and all refs
-    const __m256i s = _mm256_load_si256((const __m256i *)src);
+    // load src and all ref[]
+    const __m256i s = _mm256_load_si256((const __m256i *)src_ptr);
     r[0] = _mm256_loadu_si256((const __m256i *)refs[0]);
     r[1] = _mm256_loadu_si256((const __m256i *)refs[1]);
     r[2] = _mm256_loadu_si256((const __m256i *)refs[2]);
     r[3] = _mm256_loadu_si256((const __m256i *)refs[3]);
 
-    // sum of the absolute differences between every ref-i to src
+    // sum of the absolute differences between each ref[] and src
     r[0] = _mm256_sad_epu8(r[0], s);
     r[1] = _mm256_sad_epu8(r[1], s);
     r[2] = _mm256_sad_epu8(r[2], s);
     r[3] = _mm256_sad_epu8(r[3], s);
 
-    // sum every ref-i
+    // sum every ref[]
     sums[0] = _mm256_add_epi32(sums[0], r[0]);
     sums[1] = _mm256_add_epi32(sums[1], r[1]);
     sums[2] = _mm256_add_epi32(sums[2], r[2]);
     sums[3] = _mm256_add_epi32(sums[3], r[3]);
 
-    src += src_stride;
+    src_ptr += src_stride;
     refs[0] += ref_stride;
     refs[1] += ref_stride;
     refs[2] += ref_stride;
     refs[3] += ref_stride;
   }
 
-  calc_final(sums, res);
+  calc_final(sums, sad_array);
 }
 
-void vpx_sad64x64x4d_avx2(const uint8_t *src, int src_stride,
-                          const uint8_t *const ref[4], int ref_stride,
-                          uint32_t res[4]) {
+void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride,
+                          const uint8_t *const ref_array[4], int ref_stride,
+                          uint32_t sad_array[4]) {
   __m256i sums[4];
   int i;
   const uint8_t *refs[4];
 
-  refs[0] = ref[0];
-  refs[1] = ref[1];
-  refs[2] = ref[2];
-  refs[3] = ref[3];
+  refs[0] = ref_array[0];
+  refs[1] = ref_array[1];
+  refs[2] = ref_array[2];
+  refs[3] = ref_array[3];
   sums[0] = _mm256_setzero_si256();
   sums[1] = _mm256_setzero_si256();
   sums[2] = _mm256_setzero_si256();
@@ -87,9 +87,9 @@ void vpx_sad64x64x4d_avx2(const uint8_t *src, int src_stride,
 
   for (i = 0; i < 64; i++) {
     __m256i r_lo[4], r_hi[4];
-    // load 64 bytes from src and all refs
-    const __m256i s_lo = _mm256_load_si256((const __m256i *)src);
-    const __m256i s_hi = _mm256_load_si256((const __m256i *)(src + 32));
+    // load 64 bytes from src and all ref[]
+    const __m256i s_lo = _mm256_load_si256((const __m256i *)src_ptr);
+    const __m256i s_hi = _mm256_load_si256((const __m256i *)(src_ptr + 32));
     r_lo[0] = _mm256_loadu_si256((const __m256i *)refs[0]);
     r_hi[0] = _mm256_loadu_si256((const __m256i *)(refs[0] + 32));
     r_lo[1] = _mm256_loadu_si256((const __m256i *)refs[1]);
@@ -99,7 +99,7 @@ void vpx_sad64x64x4d_avx2(const uint8_t *src, int src_stride,
     r_lo[3] = _mm256_loadu_si256((const __m256i *)refs[3]);
     r_hi[3] = _mm256_loadu_si256((const __m256i *)(refs[3] + 32));
 
-    // sum of the absolute differences between every ref-i to src
+    // sum of the absolute differences between each ref[] and src
     r_lo[0] = _mm256_sad_epu8(r_lo[0], s_lo);
     r_lo[1] = _mm256_sad_epu8(r_lo[1], s_lo);
     r_lo[2] = _mm256_sad_epu8(r_lo[2], s_lo);
@@ -109,7 +109,7 @@ void vpx_sad64x64x4d_avx2(const uint8_t *src, int src_stride,
     r_hi[2] = _mm256_sad_epu8(r_hi[2], s_hi);
     r_hi[3] = _mm256_sad_epu8(r_hi[3], s_hi);
 
-    // sum every ref-i
+    // sum every ref[]
     sums[0] = _mm256_add_epi32(sums[0], r_lo[0]);
     sums[1] = _mm256_add_epi32(sums[1], r_lo[1]);
     sums[2] = _mm256_add_epi32(sums[2], r_lo[2]);
@@ -119,12 +119,12 @@ void vpx_sad64x64x4d_avx2(const uint8_t *src, int src_stride,
     sums[2] = _mm256_add_epi32(sums[2], r_hi[2]);
     sums[3] = _mm256_add_epi32(sums[3], r_hi[3]);
 
-    src += src_stride;
+    src_ptr += src_stride;
     refs[0] += ref_stride;
     refs[1] += ref_stride;
     refs[2] += ref_stride;
     refs[3] += ref_stride;
   }
 
-  calc_final(sums, res);
+  calc_final(sums, sad_array);
 }
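
Both functions above compute the same quantity and differ only in block size
and in splitting the 64-byte rows into two 32-byte halves before calc_final
reduces the four accumulators. A plain-C reference sketch of the 4-reference
SAD (width/height parameterized for illustration; not the library's C
fallback verbatim):

    #include <stdint.h>
    #include <stdlib.h> /* abs() */

    /* sad_array[i] = sum over the block of |src - ref_array[i]|. */
    static void sad_4d_sketch(const uint8_t *src_ptr, int src_stride,
                              const uint8_t *const ref_array[4],
                              int ref_stride, int width, int height,
                              uint32_t sad_array[4]) {
      int r, x, y;
      for (r = 0; r < 4; r++) {
        const uint8_t *src = src_ptr;
        const uint8_t *ref = ref_array[r];
        uint32_t sad = 0;
        for (y = 0; y < height; y++) {
          for (x = 0; x < width; x++) sad += abs(src[x] - ref[x]);
          src += src_stride;
          ref += ref_stride;
        }
        sad_array[r] = sad;
      }
    }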
index 5f2ab6e..4c5d704 100644 (file)
@@ -11,8 +11,8 @@
 #include "./vpx_dsp_rtcd.h"
 #include "vpx/vpx_integer.h"
 
-void vpx_sad64x64x4d_avx512(const uint8_t *src, int src_stride,
-                            const uint8_t *const ref[4], int ref_stride,
+void vpx_sad64x64x4d_avx512(const uint8_t *src_ptr, int src_stride,
+                            const uint8_t *const ref_array[4], int ref_stride,
                             uint32_t res[4]) {
   __m512i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg;
   __m512i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
@@ -20,33 +20,33 @@ void vpx_sad64x64x4d_avx512(const uint8_t *src, int src_stride,
   int i;
   const uint8_t *ref0, *ref1, *ref2, *ref3;
 
-  ref0 = ref[0];
-  ref1 = ref[1];
-  ref2 = ref[2];
-  ref3 = ref[3];
+  ref0 = ref_array[0];
+  ref1 = ref_array[1];
+  ref2 = ref_array[2];
+  ref3 = ref_array[3];
   sum_ref0 = _mm512_set1_epi16(0);
   sum_ref1 = _mm512_set1_epi16(0);
   sum_ref2 = _mm512_set1_epi16(0);
   sum_ref3 = _mm512_set1_epi16(0);
   for (i = 0; i < 64; i++) {
-    // load src and all refs
-    src_reg = _mm512_loadu_si512((const __m512i *)src);
+    // load src and all ref[]
+    src_reg = _mm512_loadu_si512((const __m512i *)src_ptr);
     ref0_reg = _mm512_loadu_si512((const __m512i *)ref0);
     ref1_reg = _mm512_loadu_si512((const __m512i *)ref1);
     ref2_reg = _mm512_loadu_si512((const __m512i *)ref2);
     ref3_reg = _mm512_loadu_si512((const __m512i *)ref3);
-    // sum of the absolute differences between every ref-i to src
+    // sum of the absolute differences between each ref[] and src
     ref0_reg = _mm512_sad_epu8(ref0_reg, src_reg);
     ref1_reg = _mm512_sad_epu8(ref1_reg, src_reg);
     ref2_reg = _mm512_sad_epu8(ref2_reg, src_reg);
     ref3_reg = _mm512_sad_epu8(ref3_reg, src_reg);
-    // sum every ref-i
+    // sum every ref[]
     sum_ref0 = _mm512_add_epi32(sum_ref0, ref0_reg);
     sum_ref1 = _mm512_add_epi32(sum_ref1, ref1_reg);
     sum_ref2 = _mm512_add_epi32(sum_ref2, ref2_reg);
     sum_ref3 = _mm512_add_epi32(sum_ref3, ref3_reg);
 
-    src += src_stride;
+    src_ptr += src_stride;
     ref0 += ref_stride;
     ref1 += ref_stride;
     ref2 += ref_stride;
@@ -55,7 +55,7 @@ void vpx_sad64x64x4d_avx512(const uint8_t *src, int src_stride,
   {
     __m256i sum256;
     __m128i sum128;
-    // in sum_ref-i the result is saved in the first 4 bytes
+    // in sum_ref[] the result is saved in the first 4 bytes
     // the other 4 bytes are zeroed.
     // the other 4 bytes are zeroed.
     // sum_ref1 and sum_ref3 are shifted left by 4 bytes
     sum_ref1 = _mm512_bslli_epi128(sum_ref1, 4);
@@ -65,7 +65,7 @@ void vpx_sad64x64x4d_avx512(const uint8_t *src, int src_stride,
     sum_ref0 = _mm512_or_si512(sum_ref0, sum_ref1);
     sum_ref2 = _mm512_or_si512(sum_ref2, sum_ref3);
 
-    // merge every 64 bit from each sum_ref-i
+    // merge every 64 bit from each sum_ref[]
     sum_mlow = _mm512_unpacklo_epi64(sum_ref0, sum_ref2);
     sum_mhigh = _mm512_unpackhi_epi64(sum_ref0, sum_ref2);
 
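The shift/or/unpack sequence above is lane plumbing: it gathers the 32-bit
partial from each 64-bit group of every accumulator so that one final add can
fill res[4]. Stripped of the SIMD layout, the per-reference computation is
simply (a hypothetical scalar view):

    #include <stdint.h>

    /* Each 512-bit accumulator holds eight 64-bit groups whose low 4 bytes
     * carry a partial SAD (the high 4 bytes are zero, per the comment
     * above); the final result is their sum. */
    static uint32_t reduce_sad_partials(const uint32_t partials[8]) {
      uint32_t total = 0;
      int i;
      for (i = 0; i < 8; i++) total += partials[i];
      return total;
    }
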
index d83507d..e6e72b8 100644 (file)
@@ -45,7 +45,7 @@
 
     ;Compute max and min values of a pixel
     mov         rdx, 0x00010001
-    movsxd      rcx, DWORD PTR arg(6)      ;bps
+    movsxd      rcx, DWORD PTR arg(6)      ;bd
     movq        xmm0, rdx
     movq        xmm1, rcx
     pshufd      xmm0, xmm0, 0b
 
     ;Compute max and min values of a pixel
     mov         rdx, 0x00010001
-    movsxd      rcx, DWORD PTR arg(6)       ;bps
+    movsxd      rcx, DWORD PTR arg(6)       ;bd
     movq        xmm0, rdx
     movq        xmm1, rcx
     pshufd      xmm0, xmm0, 0b
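
The renamed comment matters for readers: arg(6) is the bit depth (bd), not
"bps", and the broadcast 0x00010001 next to it is presumably shifted by bd to
build the clamping limits (the instruction sequence continues past the hunks
shown here). A rough C equivalent of the values being constructed, assuming
unsigned pixels; the same pattern recurs in the bilinear file below:

    /* Assumed intent: min/max representable pixel values at bit depth bd. */
    static void highbd_pixel_limits(int bd, int *min, int *max) {
      *min = 0;             /* pixels are unsigned */
      *max = (1 << bd) - 1; /* 255, 1023, or 4095 for bd = 8, 10, 12 */
    }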
index 9bffe50..87bf75e 100644 (file)
@@ -26,7 +26,7 @@
     pshufd      xmm3, xmm3, 0
 
     mov         rdx, 0x00010001
-    movsxd      rcx, DWORD PTR arg(6)       ;bps
+    movsxd      rcx, DWORD PTR arg(6)       ;bd
     movq        xmm5, rdx
     movq        xmm2, rcx
     pshufd      xmm5, xmm5, 0b
@@ -82,7 +82,7 @@
     pshufd      xmm4, xmm4, 0
 
     mov         rdx, 0x00010001
-    movsxd      rcx, DWORD PTR arg(6)       ;bps
+    movsxd      rcx, DWORD PTR arg(6)       ;bd
     movq        xmm8, rdx
     movq        xmm5, rcx
     pshufd      xmm8, xmm8, 0b