#include "./vpx_config.h"
#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
#include "test/acm_random.h"
#include "test/clear_system_state.h"
#include "test/register_state_check.h"
#include "test/util.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_filter.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/mem.h"
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_horiz_sse2(src, src_stride, dst, dst_stride, filter_x,
+ vpx_highbd_convolve8_horiz_sse2(src, src_stride, dst, dst_stride, filter_x,
filter_x_stride, filter_y, filter_y_stride,
w, h, 8);
}
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_avg_horiz_sse2(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_avg_horiz_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 8);
}
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_vert_sse2(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_vert_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 8);
}
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_avg_vert_sse2(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_avg_vert_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 8);
}
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_sse2(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 8);
}
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_avg_sse2(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_avg_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 8);
}
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_horiz_sse2(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_horiz_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 10);
}
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_avg_horiz_sse2(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_avg_horiz_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 10);
}
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_vert_sse2(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_vert_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 10);
}
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_avg_vert_sse2(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_avg_vert_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 10);
}
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_sse2(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 10);
}
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_avg_sse2(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_avg_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 10);
}
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_horiz_sse2(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_horiz_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 12);
}
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_avg_horiz_sse2(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_avg_horiz_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 12);
}
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_vert_sse2(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_vert_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 12);
}
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_avg_vert_sse2(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_avg_vert_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 12);
}
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_sse2(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 12);
}
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_avg_sse2(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_avg_sse2(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 12);
}
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve_copy_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve_copy_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 8);
}
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve_avg_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve_avg_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 8);
}
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 8);
}
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 8);
}
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 8);
}
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 8);
}
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 8);
}
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_avg_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_avg_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 8);
}
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve_copy_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve_copy_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 10);
}
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve_avg_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve_avg_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 10);
}
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 10);
}
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 10);
}
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 10);
}
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 10);
}
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 10);
}
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_avg_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_avg_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 10);
}
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve_copy_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve_copy_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 12);
}
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve_avg_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve_avg_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 12);
}
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 12);
}
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 12);
}
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 12);
}
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 12);
}
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 12);
}
const int16_t *filter_y,
int filter_y_stride,
int w, int h) {
- vp9_highbd_convolve8_avg_c(src, src_stride, dst, dst_stride,
+ vpx_highbd_convolve8_avg_c(src, src_stride, dst, dst_stride,
filter_x, filter_x_stride,
filter_y, filter_y_stride, w, h, 12);
}
#else
const ConvolveFunctions convolve8_c(
- vp9_convolve_copy_c, vp9_convolve_avg_c,
- vp9_convolve8_horiz_c, vp9_convolve8_avg_horiz_c,
- vp9_convolve8_vert_c, vp9_convolve8_avg_vert_c,
- vp9_convolve8_c, vp9_convolve8_avg_c, 0);
+ vpx_convolve_copy_c, vpx_convolve_avg_c,
+ vpx_convolve8_horiz_c, vpx_convolve8_avg_horiz_c,
+ vpx_convolve8_vert_c, vpx_convolve8_avg_vert_c,
+ vpx_convolve8_c, vpx_convolve8_avg_c, 0);
INSTANTIATE_TEST_CASE_P(C, ConvolveTest, ::testing::Values(
make_tuple(4, 4, &convolve8_c),
#else
const ConvolveFunctions convolve8_sse2(
#if CONFIG_USE_X86INC
- vp9_convolve_copy_sse2, vp9_convolve_avg_sse2,
+ vpx_convolve_copy_sse2, vpx_convolve_avg_sse2,
#else
- vp9_convolve_copy_c, vp9_convolve_avg_c,
+ vpx_convolve_copy_c, vpx_convolve_avg_c,
#endif // CONFIG_USE_X86INC
- vp9_convolve8_horiz_sse2, vp9_convolve8_avg_horiz_sse2,
- vp9_convolve8_vert_sse2, vp9_convolve8_avg_vert_sse2,
- vp9_convolve8_sse2, vp9_convolve8_avg_sse2, 0);
+ vpx_convolve8_horiz_sse2, vpx_convolve8_avg_horiz_sse2,
+ vpx_convolve8_vert_sse2, vpx_convolve8_avg_vert_sse2,
+ vpx_convolve8_sse2, vpx_convolve8_avg_sse2, 0);
INSTANTIATE_TEST_CASE_P(SSE2, ConvolveTest, ::testing::Values(
make_tuple(4, 4, &convolve8_sse2),
#if HAVE_SSSE3
const ConvolveFunctions convolve8_ssse3(
- vp9_convolve_copy_c, vp9_convolve_avg_c,
- vp9_convolve8_horiz_ssse3, vp9_convolve8_avg_horiz_ssse3,
- vp9_convolve8_vert_ssse3, vp9_convolve8_avg_vert_ssse3,
- vp9_convolve8_ssse3, vp9_convolve8_avg_ssse3, 0);
+ vpx_convolve_copy_c, vpx_convolve_avg_c,
+ vpx_convolve8_horiz_ssse3, vpx_convolve8_avg_horiz_ssse3,
+ vpx_convolve8_vert_ssse3, vpx_convolve8_avg_vert_ssse3,
+ vpx_convolve8_ssse3, vpx_convolve8_avg_ssse3, 0);
INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest, ::testing::Values(
make_tuple(4, 4, &convolve8_ssse3),
#if HAVE_AVX2 && HAVE_SSSE3
const ConvolveFunctions convolve8_avx2(
- vp9_convolve_copy_c, vp9_convolve_avg_c,
- vp9_convolve8_horiz_avx2, vp9_convolve8_avg_horiz_ssse3,
- vp9_convolve8_vert_avx2, vp9_convolve8_avg_vert_ssse3,
- vp9_convolve8_avx2, vp9_convolve8_avg_ssse3, 0);
+ vpx_convolve_copy_c, vpx_convolve_avg_c,
+ vpx_convolve8_horiz_avx2, vpx_convolve8_avg_horiz_ssse3,
+ vpx_convolve8_vert_avx2, vpx_convolve8_avg_vert_ssse3,
+ vpx_convolve8_avx2, vpx_convolve8_avg_ssse3, 0);
INSTANTIATE_TEST_CASE_P(AVX2, ConvolveTest, ::testing::Values(
make_tuple(4, 4, &convolve8_avx2),
#if HAVE_NEON
#if HAVE_NEON_ASM
const ConvolveFunctions convolve8_neon(
- vp9_convolve_copy_neon, vp9_convolve_avg_neon,
- vp9_convolve8_horiz_neon, vp9_convolve8_avg_horiz_neon,
- vp9_convolve8_vert_neon, vp9_convolve8_avg_vert_neon,
- vp9_convolve8_neon, vp9_convolve8_avg_neon, 0);
+ vpx_convolve_copy_neon, vpx_convolve_avg_neon,
+ vpx_convolve8_horiz_neon, vpx_convolve8_avg_horiz_neon,
+ vpx_convolve8_vert_neon, vpx_convolve8_avg_vert_neon,
+ vpx_convolve8_neon, vpx_convolve8_avg_neon, 0);
#else // HAVE_NEON
const ConvolveFunctions convolve8_neon(
- vp9_convolve_copy_neon, vp9_convolve_avg_neon,
- vp9_convolve8_horiz_neon, vp9_convolve8_avg_horiz_neon,
- vp9_convolve8_vert_neon, vp9_convolve8_avg_vert_neon,
- vp9_convolve8_neon, vp9_convolve8_avg_neon, 0);
+ vpx_convolve_copy_neon, vpx_convolve_avg_neon,
+ vpx_convolve8_horiz_neon, vpx_convolve8_avg_horiz_neon,
+ vpx_convolve8_vert_neon, vpx_convolve8_avg_vert_neon,
+ vpx_convolve8_neon, vpx_convolve8_avg_neon, 0);
#endif // HAVE_NEON_ASM
INSTANTIATE_TEST_CASE_P(NEON, ConvolveTest, ::testing::Values(
#if HAVE_DSPR2
const ConvolveFunctions convolve8_dspr2(
- vp9_convolve_copy_dspr2, vp9_convolve_avg_dspr2,
- vp9_convolve8_horiz_dspr2, vp9_convolve8_avg_horiz_dspr2,
- vp9_convolve8_vert_dspr2, vp9_convolve8_avg_vert_dspr2,
- vp9_convolve8_dspr2, vp9_convolve8_avg_dspr2, 0);
+ vpx_convolve_copy_dspr2, vpx_convolve_avg_dspr2,
+ vpx_convolve8_horiz_dspr2, vpx_convolve8_avg_horiz_dspr2,
+ vpx_convolve8_vert_dspr2, vpx_convolve8_avg_vert_dspr2,
+ vpx_convolve8_dspr2, vpx_convolve8_avg_dspr2, 0);
INSTANTIATE_TEST_CASE_P(DSPR2, ConvolveTest, ::testing::Values(
make_tuple(4, 4, &convolve8_dspr2),
#if HAVE_MSA
const ConvolveFunctions convolve8_msa(
- vp9_convolve_copy_msa, vp9_convolve_avg_msa,
- vp9_convolve8_horiz_msa, vp9_convolve8_avg_horiz_msa,
- vp9_convolve8_vert_msa, vp9_convolve8_avg_vert_msa,
- vp9_convolve8_msa, vp9_convolve8_avg_msa, 0);
+ vpx_convolve_copy_msa, vpx_convolve_avg_msa,
+ vpx_convolve8_horiz_msa, vpx_convolve8_avg_horiz_msa,
+ vpx_convolve8_vert_msa, vpx_convolve8_avg_vert_msa,
+ vpx_convolve8_msa, vpx_convolve8_avg_msa, 0);
INSTANTIATE_TEST_CASE_P(MSA, ConvolveTest, ::testing::Values(
make_tuple(4, 4, &convolve8_msa),
+++ /dev/null
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <stddef.h>
-#include <arm_neon.h>
-
-#include "./vpx_config.h"
-#include "vpx_ports/mem.h"
-
-void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h);
-void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h);
-
-static INLINE int32x4_t MULTIPLY_BY_Q0( // 8-term multiply-accumulate: returns sum over k of dsrck * (lane k of q0s16), per 32-bit lane
- int16x4_t dsrc0, // multiplied by lane 0 of q0s16
- int16x4_t dsrc1, // multiplied by lane 1 of q0s16
- int16x4_t dsrc2, // multiplied by lane 2 of q0s16
- int16x4_t dsrc3, // multiplied by lane 3 of q0s16
- int16x4_t dsrc4, // multiplied by lane 4 of q0s16
- int16x4_t dsrc5, // multiplied by lane 5 of q0s16
- int16x4_t dsrc6, // multiplied by lane 6 of q0s16
- int16x4_t dsrc7, // multiplied by lane 7 of q0s16
- int16x8_t q0s16) { // the eight 16-bit coefficients
- int32x4_t qdst;
- int16x4_t d0s16, d1s16;
- // Split the coefficients into two 4-lane halves so they can be addressed by lane index.
- d0s16 = vget_low_s16(q0s16); // coefficients 0..3
- d1s16 = vget_high_s16(q0s16); // coefficients 4..7
- // Widening multiply for the first term, then widening multiply-accumulate for the other seven.
- qdst = vmull_lane_s16(dsrc0, d0s16, 0);
- qdst = vmlal_lane_s16(qdst, dsrc1, d0s16, 1);
- qdst = vmlal_lane_s16(qdst, dsrc2, d0s16, 2);
- qdst = vmlal_lane_s16(qdst, dsrc3, d0s16, 3);
- qdst = vmlal_lane_s16(qdst, dsrc4, d1s16, 0);
- qdst = vmlal_lane_s16(qdst, dsrc5, d1s16, 1);
- qdst = vmlal_lane_s16(qdst, dsrc6, d1s16, 2);
- qdst = vmlal_lane_s16(qdst, dsrc7, d1s16, 3);
- return qdst; // four 32-bit sums, not yet rounded or shifted
-}
-
-void vp9_convolve8_avg_horiz_neon( // 8-tap horizontal filter of src; the filtered bytes are averaged (rounding up) with the bytes already in dst
- uint8_t *src, // source pixels; read starting 3 bytes to the left of this pointer
- ptrdiff_t src_stride, // byte distance between consecutive source rows
- uint8_t *dst, // destination; read and then overwritten
- ptrdiff_t dst_stride, // byte distance between consecutive destination rows
- const int16_t *filter_x, // eight 16-bit horizontal taps
- int x_step_q4, // the NEON path runs only when this equals 16
- const int16_t *filter_y, // not used by the NEON path; forwarded to the C fallback
- int y_step_q4, // not used by the NEON path; forwarded to the C fallback
- int w, // width; consumed 4 columns at a time
- int h) { // height; consumed 4 rows at a time
- int width;
- uint8_t *s, *d;
- uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8;
- uint32x2_t d2u32, d3u32, d6u32, d7u32, d28u32, d29u32, d30u32, d31u32;
- uint8x16_t q1u8, q3u8, q12u8, q13u8, q14u8, q15u8;
- int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16;
- int16x4_t d24s16, d25s16, d26s16, d27s16;
- uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16;
- int16x8_t q0s16;
- uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
- int32x4_t q1s32, q2s32, q14s32, q15s32;
- uint16x8x2_t q0x2u16;
- uint8x8x2_t d0x2u8, d1x2u8;
- uint32x2x2_t d0x2u32;
- uint16x4x2_t d0x2u16, d1x2u16;
- uint32x4x2_t q0x2u32;
- // Any step other than 16 is delegated unchanged to the C implementation.
- if (x_step_q4 != 16) {
- vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4,
- filter_y, y_step_q4, w, h);
- return;
- }
-
- q0s16 = vld1q_s16(filter_x); // all eight taps in one vector
-
- src -= 3; // adjust for taps
- for (; h > 0; h -= 4) { // loop_horiz_v: one pass per group of 4 rows
- s = src;
- d24u8 = vld1_u8(s); // 8 bytes from each of the 4 rows
- s += src_stride;
- d25u8 = vld1_u8(s);
- s += src_stride;
- d26u8 = vld1_u8(s);
- s += src_stride;
- d27u8 = vld1_u8(s);
-
- q12u8 = vcombine_u8(d24u8, d25u8);
- q13u8 = vcombine_u8(d26u8, d27u8);
- // Rearrange the 4x8 block with 16-bit and then 8-bit transposes.
- q0x2u16 = vtrnq_u16(vreinterpretq_u16_u8(q12u8),
- vreinterpretq_u16_u8(q13u8));
- d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
- d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
- d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
- d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1]));
- d0x2u8 = vtrn_u8(d24u8, d25u8);
- d1x2u8 = vtrn_u8(d26u8, d27u8);
-
- __builtin_prefetch(src + src_stride * 4);
- __builtin_prefetch(src + src_stride * 5);
- // Widen the bytes to 16 bits for the multiply-accumulate.
- q8u16 = vmovl_u8(d0x2u8.val[0]);
- q9u16 = vmovl_u8(d0x2u8.val[1]);
- q10u16 = vmovl_u8(d1x2u8.val[0]);
- q11u16 = vmovl_u8(d1x2u8.val[1]);
-
- src += 7; // inner-loop loads begin 7 bytes after the start of the 8-byte loads above
- d16u16 = vget_low_u16(q8u16);
- d17u16 = vget_high_u16(q8u16);
- d18u16 = vget_low_u16(q9u16);
- d19u16 = vget_high_u16(q9u16);
- q8u16 = vcombine_u16(d16u16, d18u16); // vswp 17 18
- q9u16 = vcombine_u16(d17u16, d19u16);
-
- d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
- d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); // vmov 23 21
- for (width = w;
- width > 0;
- width -= 4, src += 4, dst += 4) { // loop_horiz: 4 output columns per pass
- s = src;
- d28u32 = vld1_dup_u32((const uint32_t *)s); // 4 new bytes per row, duplicated into both lanes
- s += src_stride;
- d29u32 = vld1_dup_u32((const uint32_t *)s);
- s += src_stride;
- d31u32 = vld1_dup_u32((const uint32_t *)s); // third row goes to d31, fourth to d30
- s += src_stride;
- d30u32 = vld1_dup_u32((const uint32_t *)s);
-
- __builtin_prefetch(src + 64);
-
- d0x2u16 = vtrn_u16(vreinterpret_u16_u32(d28u32),
- vreinterpret_u16_u32(d31u32));
- d1x2u16 = vtrn_u16(vreinterpret_u16_u32(d29u32),
- vreinterpret_u16_u32(d30u32));
- d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]), // d28
- vreinterpret_u8_u16(d1x2u16.val[0])); // d29
- d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]), // d31
- vreinterpret_u8_u16(d1x2u16.val[1])); // d30
-
- __builtin_prefetch(src + 64 + src_stride);
-
- q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
- q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);
- q0x2u32 = vtrnq_u32(vreinterpretq_u32_u8(q14u8),
- vreinterpretq_u32_u8(q15u8));
-
- d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));
- d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));
- q12u16 = vmovl_u8(d28u8);
- q13u16 = vmovl_u8(d29u8);
-
- __builtin_prefetch(src + 64 + src_stride * 2);
- // Fetch the 4 bytes currently in each of the 4 dst rows; they are averaged with the filter output below.
- d = dst;
- d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0); // NOTE(review): d6u32/d7u32 have no initializer before this first lane insert; both lanes are written before use
- d += dst_stride;
- d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0);
- d += dst_stride;
- d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1);
- d += dst_stride;
- d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1);
-
- d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
- d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
- d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
- d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
- d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
- d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
- d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
- d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
- d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
- // Four 8-tap sums; each call's input window is shifted by one vector relative to the previous call.
- q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16,
- d18s16, d19s16, d23s16, d24s16, q0s16);
- q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16,
- d19s16, d23s16, d24s16, d26s16, q0s16);
- q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16,
- d23s16, d24s16, d26s16, d27s16, q0s16);
- q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16,
- d24s16, d26s16, d27s16, d25s16, q0s16);
-
- __builtin_prefetch(src + 64 + src_stride * 3);
- // Rounding shift right by 7 with saturation to unsigned 16 bits.
- d2u16 = vqrshrun_n_s32(q1s32, 7);
- d3u16 = vqrshrun_n_s32(q2s32, 7);
- d4u16 = vqrshrun_n_s32(q14s32, 7);
- d5u16 = vqrshrun_n_s32(q15s32, 7);
-
- q1u16 = vcombine_u16(d2u16, d3u16);
- q2u16 = vcombine_u16(d4u16, d5u16);
- // Saturating narrow to bytes.
- d2u8 = vqmovn_u16(q1u16);
- d3u8 = vqmovn_u16(q2u16);
- // Rearrange the results so that each 32-bit lane can be stored to one dst row.
- d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8),
- vreinterpret_u16_u8(d3u8));
- d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
- vreinterpret_u32_u16(d0x2u16.val[1]));
- d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
- vreinterpret_u8_u32(d0x2u32.val[1]));
-
- q1u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
- q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32));
-
- q1u8 = vrhaddq_u8(q1u8, q3u8); // rounding halving add: average of filter output and previous dst bytes
-
- d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8));
- d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8));
- // Store 4 bytes to each of the 4 rows, using the same lane order as the dst loads above.
- d = dst;
- vst1_lane_u32((uint32_t *)d, d2u32, 0);
- d += dst_stride;
- vst1_lane_u32((uint32_t *)d, d3u32, 0);
- d += dst_stride;
- vst1_lane_u32((uint32_t *)d, d2u32, 1);
- d += dst_stride;
- vst1_lane_u32((uint32_t *)d, d3u32, 1);
- // Carry the most recent vectors over as the leading inputs of the next pass.
- q8u16 = q9u16;
- d20s16 = d23s16;
- q11u16 = q12u16;
- q9u16 = q13u16;
- d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
- }
- src += src_stride * 4 - w - 7; // down 4 rows, minus the +7 above and minus w for the inner loop's advance (exactly w only when w is a multiple of 4)
- dst += dst_stride * 4 - w; // down 4 rows, minus the inner loop's horizontal advance
- }
- return;
-}
-
-void vp9_convolve8_avg_vert_neon( // 8-tap vertical filter of src; the filtered bytes are averaged (rounding up) with the bytes already in dst
- uint8_t *src, // source pixels; read starting 3 rows above this pointer
- ptrdiff_t src_stride, // byte distance between consecutive source rows
- uint8_t *dst, // destination; read and then overwritten
- ptrdiff_t dst_stride, // byte distance between consecutive destination rows
- const int16_t *filter_x, // not used by the NEON path; forwarded to the C fallback
- int x_step_q4, // not used by the NEON path; forwarded to the C fallback
- const int16_t *filter_y, // eight 16-bit vertical taps
- int y_step_q4, // the NEON path runs only when this equals 16
- int w, // width; consumed 4 columns at a time
- int h) { // height; consumed 4 rows at a time
- int height;
- uint8_t *s, *d;
- uint8x8_t d2u8, d3u8;
- uint32x2_t d2u32, d3u32, d6u32, d7u32;
- uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32;
- uint8x16_t q1u8, q3u8;
- int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16;
- int16x4_t d24s16, d25s16, d26s16, d27s16;
- uint16x4_t d2u16, d3u16, d4u16, d5u16;
- int16x8_t q0s16;
- uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
- int32x4_t q1s32, q2s32, q14s32, q15s32;
- // Any step other than 16 is delegated unchanged to the C implementation.
- if (y_step_q4 != 16) {
- vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4,
- filter_y, y_step_q4, w, h);
- return;
- }
-
- src -= src_stride * 3; // start 3 rows above the first output row
- q0s16 = vld1q_s16(filter_y); // all eight taps in one vector
- for (; w > 0; w -= 4, src += 4, dst += 4) { // loop_vert_h: one pass per strip of 4 columns
- s = src;
- d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0); // first 7 rows, 4 bytes each, two rows per vector; NOTE(review): these vectors have no initializer before their first lane insert
- s += src_stride;
- d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1);
- s += src_stride;
- d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0);
- s += src_stride;
- d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1);
- s += src_stride;
- d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0);
- s += src_stride;
- d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1);
- s += src_stride;
- d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0); // lane 1 of d22u32 is never loaded; only the low half of q11u16 is read below
- s += src_stride;
- // Widen the bytes to 16 bits for the multiply-accumulate.
- q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32));
- q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32));
- q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32));
- q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32));
-
- d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
- d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
- d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
- d = dst;
- for (height = h; height > 0; height -= 4) { // loop_vert: 4 output rows per pass
- d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0); // next 4 source rows; first and fourth go to d24, second and third to d26
- s += src_stride;
- d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0);
- s += src_stride;
- d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1);
- s += src_stride;
- d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1);
- s += src_stride;
-
- q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32));
- q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32));
- // Fetch the 4 bytes currently in each of the 4 dst rows; they are averaged with the filter output below.
- d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0);
- d += dst_stride;
- d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1);
- d += dst_stride;
- d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0);
- d += dst_stride;
- d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1);
- d -= dst_stride * 3; // back to the first of the 4 rows for the stores below
-
- d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
- d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
- d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
- d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
- d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
- d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
- d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
- d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
- // Four 8-tap sums, one per output row; each call's window starts one source row lower.
- __builtin_prefetch(s);
- __builtin_prefetch(s + src_stride);
- q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16,
- d20s16, d21s16, d22s16, d24s16, q0s16);
- __builtin_prefetch(s + src_stride * 2);
- __builtin_prefetch(s + src_stride * 3);
- q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16,
- d21s16, d22s16, d24s16, d26s16, q0s16);
- __builtin_prefetch(d);
- __builtin_prefetch(d + dst_stride);
- q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16,
- d22s16, d24s16, d26s16, d27s16, q0s16);
- __builtin_prefetch(d + dst_stride * 2);
- __builtin_prefetch(d + dst_stride * 3);
- q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16,
- d24s16, d26s16, d27s16, d25s16, q0s16);
- // Rounding shift right by 7 with saturation to unsigned 16 bits.
- d2u16 = vqrshrun_n_s32(q1s32, 7);
- d3u16 = vqrshrun_n_s32(q2s32, 7);
- d4u16 = vqrshrun_n_s32(q14s32, 7);
- d5u16 = vqrshrun_n_s32(q15s32, 7);
-
- q1u16 = vcombine_u16(d2u16, d3u16);
- q2u16 = vcombine_u16(d4u16, d5u16);
- // Saturating narrow to bytes.
- d2u8 = vqmovn_u16(q1u16);
- d3u8 = vqmovn_u16(q2u16);
-
- q1u8 = vcombine_u8(d2u8, d3u8);
- q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32));
-
- q1u8 = vrhaddq_u8(q1u8, q3u8); // rounding halving add: average of filter output and previous dst bytes
-
- d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8));
- d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8));
- // Store 4 bytes to each of the 4 rows; d ends up on the next group of rows.
- vst1_lane_u32((uint32_t *)d, d2u32, 0);
- d += dst_stride;
- vst1_lane_u32((uint32_t *)d, d2u32, 1);
- d += dst_stride;
- vst1_lane_u32((uint32_t *)d, d3u32, 0);
- d += dst_stride;
- vst1_lane_u32((uint32_t *)d, d3u32, 1);
- d += dst_stride;
- // Slide the window down 4 rows: the last 7 source rows become the first 7 of the next pass.
- q8u16 = q10u16;
- d18s16 = d22s16;
- d19s16 = d24s16;
- q10u16 = q13u16;
- d22s16 = d25s16;
- }
- }
- return;
-}
+++ /dev/null
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <stddef.h>
-#include <arm_neon.h>
-
-#include "./vpx_config.h"
-#include "vpx_ports/mem.h"
-
-void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h);
-void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h);
-
-static INLINE int32x4_t MULTIPLY_BY_Q0( // 8-term multiply-accumulate: returns sum over k of dsrck * (lane k of q0s16), per 32-bit lane
- int16x4_t dsrc0, // multiplied by lane 0 of q0s16
- int16x4_t dsrc1, // multiplied by lane 1 of q0s16
- int16x4_t dsrc2, // multiplied by lane 2 of q0s16
- int16x4_t dsrc3, // multiplied by lane 3 of q0s16
- int16x4_t dsrc4, // multiplied by lane 4 of q0s16
- int16x4_t dsrc5, // multiplied by lane 5 of q0s16
- int16x4_t dsrc6, // multiplied by lane 6 of q0s16
- int16x4_t dsrc7, // multiplied by lane 7 of q0s16
- int16x8_t q0s16) { // the eight 16-bit coefficients
- int32x4_t qdst;
- int16x4_t d0s16, d1s16;
- // Split the coefficients into two 4-lane halves so they can be addressed by lane index.
- d0s16 = vget_low_s16(q0s16); // coefficients 0..3
- d1s16 = vget_high_s16(q0s16); // coefficients 4..7
- // Widening multiply for the first term, then widening multiply-accumulate for the other seven.
- qdst = vmull_lane_s16(dsrc0, d0s16, 0);
- qdst = vmlal_lane_s16(qdst, dsrc1, d0s16, 1);
- qdst = vmlal_lane_s16(qdst, dsrc2, d0s16, 2);
- qdst = vmlal_lane_s16(qdst, dsrc3, d0s16, 3);
- qdst = vmlal_lane_s16(qdst, dsrc4, d1s16, 0);
- qdst = vmlal_lane_s16(qdst, dsrc5, d1s16, 1);
- qdst = vmlal_lane_s16(qdst, dsrc6, d1s16, 2);
- qdst = vmlal_lane_s16(qdst, dsrc7, d1s16, 3);
- return qdst; // four 32-bit sums, not yet rounded or shifted
-}
-/*
- * Horizontal 8-tap convolution written with NEON intrinsics.
- *
- * When x_step_q4 != 16 the call is forwarded unchanged to
- * vp9_convolve8_horiz_c(). Otherwise the block is handled in 4x4 tiles:
- * the outer loop advances four rows at a time and the inner loop four
- * columns at a time. Source bytes are rearranged with vtrn transposes and
- * widened to 16 bits, combined with the eight taps loaded from filter_x
- * through MULTIPLY_BY_Q0, rounded with a right shift by 7, saturated to
- * 8 bits, and written to dst four bytes per row.
- *
- * filter_y and y_step_q4 are only passed on to the C fallback.
- *
- * NOTE(review): both loops step by 4 with no tail handling, so w and h are
- * presumably always multiples of 4 -- confirm against the callers.
- */
-void vp9_convolve8_horiz_neon(
- uint8_t *src,
- ptrdiff_t src_stride,
- uint8_t *dst,
- ptrdiff_t dst_stride,
- const int16_t *filter_x,
- int x_step_q4,
- const int16_t *filter_y, // only forwarded to the C fallback
- int y_step_q4, // only forwarded to the C fallback
- int w,
- int h) {
- int width;
- uint8_t *s, *d, *psrc, *pdst;
- uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8;
- uint32x2_t d2u32, d3u32, d28u32, d29u32, d30u32, d31u32;
- uint8x16_t q12u8, q13u8, q14u8, q15u8;
- int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16;
- int16x4_t d24s16, d25s16, d26s16, d27s16;
- uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16;
- int16x8_t q0s16;
- uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
- int32x4_t q1s32, q2s32, q14s32, q15s32;
- uint16x8x2_t q0x2u16;
- uint8x8x2_t d0x2u8, d1x2u8;
- uint32x2x2_t d0x2u32;
- uint16x4x2_t d0x2u16, d1x2u16;
- uint32x4x2_t q0x2u32;
-
- if (x_step_q4 != 16) { // any step other than 16 is left to the C version
- vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4,
- filter_y, y_step_q4, w, h);
- return;
- }
-
- q0s16 = vld1q_s16(filter_x); // all eight filter taps
-
- src -= 3; // adjust for taps
- for (; h > 0; h -= 4,
- src += src_stride * 4,
- dst += dst_stride * 4) { // loop_horiz_v
- s = src; // load 8 bytes from each of the next four rows
- d24u8 = vld1_u8(s);
- s += src_stride;
- d25u8 = vld1_u8(s);
- s += src_stride;
- d26u8 = vld1_u8(s);
- s += src_stride;
- d27u8 = vld1_u8(s);
-
- q12u8 = vcombine_u8(d24u8, d25u8);
- q13u8 = vcombine_u8(d26u8, d27u8);
-
- q0x2u16 = vtrnq_u16(vreinterpretq_u16_u8(q12u8),
- vreinterpretq_u16_u8(q13u8));
- d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
- d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
- d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
- d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1]));
- d0x2u8 = vtrn_u8(d24u8, d25u8);
- d1x2u8 = vtrn_u8(d26u8, d27u8);
-
- __builtin_prefetch(src + src_stride * 4);
- __builtin_prefetch(src + src_stride * 5);
- __builtin_prefetch(src + src_stride * 6);
- // Widen the rearranged bytes to 16 bits.
- q8u16 = vmovl_u8(d0x2u8.val[0]);
- q9u16 = vmovl_u8(d0x2u8.val[1]);
- q10u16 = vmovl_u8(d1x2u8.val[0]);
- q11u16 = vmovl_u8(d1x2u8.val[1]);
-
- d16u16 = vget_low_u16(q8u16);
- d17u16 = vget_high_u16(q8u16);
- d18u16 = vget_low_u16(q9u16);
- d19u16 = vget_high_u16(q9u16);
- q8u16 = vcombine_u16(d16u16, d18u16); // vswp 17 18
- q9u16 = vcombine_u16(d17u16, d19u16);
-
- d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
- d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); // vmov 23 21
- for (width = w, psrc = src + 7, pdst = dst;
- width > 0;
- width -= 4, psrc += 4, pdst += 4) { // loop_horiz
- s = psrc; // next 4 bytes of each of the four rows
- d28u32 = vld1_dup_u32((const uint32_t *)s);
- s += src_stride;
- d29u32 = vld1_dup_u32((const uint32_t *)s);
- s += src_stride;
- d31u32 = vld1_dup_u32((const uint32_t *)s);
- s += src_stride;
- d30u32 = vld1_dup_u32((const uint32_t *)s);
-
- __builtin_prefetch(psrc + 64);
-
- d0x2u16 = vtrn_u16(vreinterpret_u16_u32(d28u32),
- vreinterpret_u16_u32(d31u32));
- d1x2u16 = vtrn_u16(vreinterpret_u16_u32(d29u32),
- vreinterpret_u16_u32(d30u32));
- d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]), // d28
- vreinterpret_u8_u16(d1x2u16.val[0])); // d29
- d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]), // d31
- vreinterpret_u8_u16(d1x2u16.val[1])); // d30
-
- __builtin_prefetch(psrc + 64 + src_stride);
-
- q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
- q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);
- q0x2u32 = vtrnq_u32(vreinterpretq_u32_u8(q14u8),
- vreinterpretq_u32_u8(q15u8));
-
- d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));
- d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));
- q12u16 = vmovl_u8(d28u8);
- q13u16 = vmovl_u8(d29u8);
-
- __builtin_prefetch(psrc + 64 + src_stride * 2);
-
- d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
- d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
- d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
- d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
- d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
- d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
- d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
- d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
- d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
- // One MULTIPLY_BY_Q0 per output vector; each call's argument list starts one vector later.
- q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16,
- d18s16, d19s16, d23s16, d24s16, q0s16);
- q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16,
- d19s16, d23s16, d24s16, d26s16, q0s16);
- q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16,
- d23s16, d24s16, d26s16, d27s16, q0s16);
- q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16,
- d24s16, d26s16, d27s16, d25s16, q0s16);
-
- __builtin_prefetch(psrc + 60 + src_stride * 3); // NOTE(review): 60 here, 64 in the prefetches above -- confirm
- // Rounding shift right by 7 with saturating narrow 32 -> 16 bits; 16 -> 8 bits follows below.
- d2u16 = vqrshrun_n_s32(q1s32, 7);
- d3u16 = vqrshrun_n_s32(q2s32, 7);
- d4u16 = vqrshrun_n_s32(q14s32, 7);
- d5u16 = vqrshrun_n_s32(q15s32, 7);
-
- q1u16 = vcombine_u16(d2u16, d3u16);
- q2u16 = vcombine_u16(d4u16, d5u16);
-
- d2u8 = vqmovn_u16(q1u16);
- d3u8 = vqmovn_u16(q2u16);
- // Rearrange so that each 32-bit lane carries the four bytes stored to one dst row.
- d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8),
- vreinterpret_u16_u8(d3u8));
- d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
- vreinterpret_u32_u16(d0x2u16.val[1]));
- d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
- vreinterpret_u8_u32(d0x2u32.val[1]));
-
- d2u32 = vreinterpret_u32_u8(d0x2u8.val[0]);
- d3u32 = vreinterpret_u32_u8(d0x2u8.val[1]);
-
- d = pdst;
- vst1_lane_u32((uint32_t *)d, d2u32, 0);
- d += dst_stride;
- vst1_lane_u32((uint32_t *)d, d3u32, 0);
- d += dst_stride;
- vst1_lane_u32((uint32_t *)d, d2u32, 1);
- d += dst_stride;
- vst1_lane_u32((uint32_t *)d, d3u32, 1);
- // Carry the vectors still needed over to the next four columns.
- q8u16 = q9u16;
- d20s16 = d23s16;
- q11u16 = q12u16;
- q9u16 = q13u16;
- d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
- }
- }
- return;
-}
-
-/*
- * Vertical 8-tap convolution written with NEON intrinsics.
- *
- * When y_step_q4 != 16 the call is forwarded unchanged to
- * vp9_convolve8_vert_c(). Otherwise the block is processed in strips of
- * four columns (outer loop) and four output rows per pass (inner loop).
- * Seven source rows, starting three rows above src, are loaded before the
- * inner loop; every pass loads four more rows, combines them with the eight
- * taps from filter_y through MULTIPLY_BY_Q0, rounds with a right shift by
- * 7, saturates to 8 bits and stores four bytes to each of four dst rows.
- *
- * filter_x and x_step_q4 are only passed on to the C fallback.
- *
- * NOTE(review): both loops step by 4 with no tail handling, so w and h are
- * presumably always multiples of 4 -- confirm against the callers.
- */
-void vp9_convolve8_vert_neon(
-    uint8_t *src,
-    ptrdiff_t src_stride,
-    uint8_t *dst,
-    ptrdiff_t dst_stride,
-    const int16_t *filter_x,  // only forwarded to the C fallback
-    int x_step_q4,            // only forwarded to the C fallback
-    const int16_t *filter_y,
-    int y_step_q4,
-    int w,
-    int h) {
-  int height;
-  uint8_t *s, *d;
-  uint32x2_t d2u32, d3u32;
-  // vld1_lane_u32() replaces a single lane and takes the other one from its
-  // vector argument, so vectors that are filled lane by lane must start from
-  // a defined value instead of an uninitialised one.
-  uint32x2_t d16u32 = vdup_n_u32(0);
-  uint32x2_t d18u32 = vdup_n_u32(0);
-  uint32x2_t d20u32 = vdup_n_u32(0);
-  uint32x2_t d22u32 = vdup_n_u32(0);
-  uint32x2_t d24u32 = vdup_n_u32(0);
-  uint32x2_t d26u32 = vdup_n_u32(0);
-  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16;
-  int16x4_t d24s16, d25s16, d26s16, d27s16;
-  uint16x4_t d2u16, d3u16, d4u16, d5u16;
-  int16x8_t q0s16;
-  uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
-  int32x4_t q1s32, q2s32, q14s32, q15s32;
-
-  if (y_step_q4 != 16) {
-    vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
-                         filter_x, x_step_q4,
-                         filter_y, y_step_q4, w, h);
-    return;
-  }
-
-  src -= src_stride * 3;  // start three rows above the first output row
-  q0s16 = vld1q_s16(filter_y);  // all eight filter taps
-  for (; w > 0; w -= 4, src += 4, dst += 4) {  // loop_vert_h
-    // Preload seven rows of four bytes each, two rows per vector.
-    s = src;
-    d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0);
-    s += src_stride;
-    d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1);
-    s += src_stride;
-    d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0);
-    s += src_stride;
-    d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1);
-    s += src_stride;
-    d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0);
-    s += src_stride;
-    d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1);
-    s += src_stride;
-    d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0);
-    s += src_stride;
-
-    q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32));
-    q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32));
-    q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32));
-    q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32));
-
-    d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
-    d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
-    d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
-    d = dst;
-    for (height = h; height > 0; height -= 4) {  // loop_vert
-      // Four more rows; note the lane order 24[0], 26[0], 26[1], 24[1].
-      d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0);
-      s += src_stride;
-      d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0);
-      s += src_stride;
-      d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1);
-      s += src_stride;
-      d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1);
-      s += src_stride;
-
-      q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32));
-      q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32));
-
-      d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
-      d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
-      d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
-      d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
-      d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
-      d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
-      d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
-      d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
-
-      // One MULTIPLY_BY_Q0 per output row, interleaved with prefetches of
-      // the destination rows and of the next source rows.
-      __builtin_prefetch(d);
-      __builtin_prefetch(d + dst_stride);
-      q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16,
-                             d20s16, d21s16, d22s16, d24s16, q0s16);
-      __builtin_prefetch(d + dst_stride * 2);
-      __builtin_prefetch(d + dst_stride * 3);
-      q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16,
-                             d21s16, d22s16, d24s16, d26s16, q0s16);
-      __builtin_prefetch(s);
-      __builtin_prefetch(s + src_stride);
-      q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16,
-                              d22s16, d24s16, d26s16, d27s16, q0s16);
-      __builtin_prefetch(s + src_stride * 2);
-      __builtin_prefetch(s + src_stride * 3);
-      q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16,
-                              d24s16, d26s16, d27s16, d25s16, q0s16);
-
-      // Rounding shift right by 7 with saturating narrow to 16 bits.
-      d2u16 = vqrshrun_n_s32(q1s32, 7);
-      d3u16 = vqrshrun_n_s32(q2s32, 7);
-      d4u16 = vqrshrun_n_s32(q14s32, 7);
-      d5u16 = vqrshrun_n_s32(q15s32, 7);
-
-      q1u16 = vcombine_u16(d2u16, d3u16);
-      q2u16 = vcombine_u16(d4u16, d5u16);
-
-      // Saturating narrow to 8 bits; each 32-bit lane is one output row.
-      d2u32 = vreinterpret_u32_u8(vqmovn_u16(q1u16));
-      d3u32 = vreinterpret_u32_u8(vqmovn_u16(q2u16));
-
-      vst1_lane_u32((uint32_t *)d, d2u32, 0);
-      d += dst_stride;
-      vst1_lane_u32((uint32_t *)d, d2u32, 1);
-      d += dst_stride;
-      vst1_lane_u32((uint32_t *)d, d3u32, 0);
-      d += dst_stride;
-      vst1_lane_u32((uint32_t *)d, d3u32, 1);
-      d += dst_stride;
-
-      // Carry the rows still needed over to the next four output rows.
-      q8u16 = q10u16;
-      d18s16 = d22s16;
-      d19s16 = d24s16;
-      q10u16 = q13u16;
-      d22s16 = d25s16;
-    }
-  }
-  return;
-}
+++ /dev/null
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <stddef.h>
-#include <arm_neon.h>
-
-/*
- * Averages a source block into the destination block with NEON:
- * every destination byte becomes the rounding halving add (vrhadd) of the
- * source byte and the byte already in dst.
- *
- * The row width handled is chosen from w: w > 32 uses 64 bytes per row,
- * w == 32 uses 32, any other w > 8 uses 16, w == 8 uses 8 and every
- * smaller w uses 4. The 64-wide path handles one row per pass, all other
- * paths two rows per pass.
- *
- * The filter arguments are ignored.
- *
- * NOTE(review): the two-rows-per-pass paths have no tail handling, so h is
- * presumably even for w <= 32 -- confirm against the callers.
- */
-void vp9_convolve_avg_neon(
-    const uint8_t *src,    // r0
-    ptrdiff_t src_stride,  // r1
-    uint8_t *dst,          // r2
-    ptrdiff_t dst_stride,  // r3
-    const int16_t *filter_x,
-    int filter_x_stride,
-    const int16_t *filter_y,
-    int filter_y_stride,
-    int w,
-    int h) {
-  uint8_t *d;
-  uint8x8_t d0u8, d1u8, d2u8, d3u8;
-  // vld1_lane_u32() replaces a single lane and takes the other one from its
-  // vector argument, so these must start from a defined value.
-  uint32x2_t d0u32 = vdup_n_u32(0);
-  uint32x2_t d2u32 = vdup_n_u32(0);
-  uint8x16_t q0u8, q1u8, q2u8, q3u8, q8u8, q9u8, q10u8, q11u8;
-  (void)filter_x; (void)filter_x_stride;
-  (void)filter_y; (void)filter_y_stride;
-
-  d = dst;  // read cursor for dst; dst itself is the write cursor
-  if (w > 32) {  // avg64
-    for (; h > 0; h -= 1) {
-      q0u8 = vld1q_u8(src);
-      q1u8 = vld1q_u8(src + 16);
-      q2u8 = vld1q_u8(src + 32);
-      q3u8 = vld1q_u8(src + 48);
-      src += src_stride;
-      q8u8 = vld1q_u8(d);
-      q9u8 = vld1q_u8(d + 16);
-      q10u8 = vld1q_u8(d + 32);
-      q11u8 = vld1q_u8(d + 48);
-      d += dst_stride;
-
-      q0u8 = vrhaddq_u8(q0u8, q8u8);
-      q1u8 = vrhaddq_u8(q1u8, q9u8);
-      q2u8 = vrhaddq_u8(q2u8, q10u8);
-      q3u8 = vrhaddq_u8(q3u8, q11u8);
-
-      vst1q_u8(dst, q0u8);
-      vst1q_u8(dst + 16, q1u8);
-      vst1q_u8(dst + 32, q2u8);
-      vst1q_u8(dst + 48, q3u8);
-      dst += dst_stride;
-    }
-  } else if (w == 32) {  // avg32
-    for (; h > 0; h -= 2) {
-      q0u8 = vld1q_u8(src);
-      q1u8 = vld1q_u8(src + 16);
-      src += src_stride;
-      q2u8 = vld1q_u8(src);
-      q3u8 = vld1q_u8(src + 16);
-      src += src_stride;
-      q8u8 = vld1q_u8(d);
-      q9u8 = vld1q_u8(d + 16);
-      d += dst_stride;
-      q10u8 = vld1q_u8(d);
-      q11u8 = vld1q_u8(d + 16);
-      d += dst_stride;
-
-      q0u8 = vrhaddq_u8(q0u8, q8u8);
-      q1u8 = vrhaddq_u8(q1u8, q9u8);
-      q2u8 = vrhaddq_u8(q2u8, q10u8);
-      q3u8 = vrhaddq_u8(q3u8, q11u8);
-
-      vst1q_u8(dst, q0u8);
-      vst1q_u8(dst + 16, q1u8);
-      dst += dst_stride;
-      vst1q_u8(dst, q2u8);
-      vst1q_u8(dst + 16, q3u8);
-      dst += dst_stride;
-    }
-  } else if (w > 8) {  // avg16
-    for (; h > 0; h -= 2) {
-      q0u8 = vld1q_u8(src);
-      src += src_stride;
-      q1u8 = vld1q_u8(src);
-      src += src_stride;
-      q2u8 = vld1q_u8(d);
-      d += dst_stride;
-      q3u8 = vld1q_u8(d);
-      d += dst_stride;
-
-      q0u8 = vrhaddq_u8(q0u8, q2u8);
-      q1u8 = vrhaddq_u8(q1u8, q3u8);
-
-      vst1q_u8(dst, q0u8);
-      dst += dst_stride;
-      vst1q_u8(dst, q1u8);
-      dst += dst_stride;
-    }
-  } else if (w == 8) {  // avg8
-    for (; h > 0; h -= 2) {
-      d0u8 = vld1_u8(src);
-      src += src_stride;
-      d1u8 = vld1_u8(src);
-      src += src_stride;
-      d2u8 = vld1_u8(d);
-      d += dst_stride;
-      d3u8 = vld1_u8(d);
-      d += dst_stride;
-
-      // Pack both rows into one 128-bit vector so one vrhadd covers them.
-      q0u8 = vcombine_u8(d0u8, d1u8);
-      q1u8 = vcombine_u8(d2u8, d3u8);
-      q0u8 = vrhaddq_u8(q0u8, q1u8);
-
-      vst1_u8(dst, vget_low_u8(q0u8));
-      dst += dst_stride;
-      vst1_u8(dst, vget_high_u8(q0u8));
-      dst += dst_stride;
-    }
-  } else {  // avg4
-    for (; h > 0; h -= 2) {
-      // Two 4-byte rows share one 64-bit vector: row 0 in lane 0, row 1 in
-      // lane 1.
-      d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 0);
-      src += src_stride;
-      d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 1);
-      src += src_stride;
-      d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 0);
-      d += dst_stride;
-      d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 1);
-      d += dst_stride;
-
-      d0u8 = vrhadd_u8(vreinterpret_u8_u32(d0u32),
-                       vreinterpret_u8_u32(d2u32));
-
-      d0u32 = vreinterpret_u32_u8(d0u8);
-      vst1_lane_u32((uint32_t *)dst, d0u32, 0);
-      dst += dst_stride;
-      vst1_lane_u32((uint32_t *)dst, d0u32, 1);
-      dst += dst_stride;
-    }
-  }
-  return;
-}
+++ /dev/null
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <stddef.h>
-#include <arm_neon.h>
-
-/*
- * Copies a block from src to dst row by row using NEON loads and stores.
- *
- * The number of bytes moved per row is picked from w: 64 when w > 32,
- * 32 when w == 32, 16 for any other w > 8, 8 when w == 8 and 4 otherwise.
- * The 64-byte and 4-byte paths move one row per pass; the others move two.
- * The filter arguments are ignored.
- */
-void vp9_convolve_copy_neon(
-    const uint8_t *src,
-    ptrdiff_t src_stride,
-    uint8_t *dst,
-    ptrdiff_t dst_stride,
-    const int16_t *filter_x,
-    int filter_x_stride,
-    const int16_t *filter_y,
-    int filter_y_stride,
-    int w,
-    int h) {
-  (void)filter_x;
-  (void)filter_x_stride;
-  (void)filter_y;
-  (void)filter_y_stride;
-
-  if (w > 32) {
-    // 64 bytes per row, one row per pass.
-    while (h > 0) {
-      const uint8x16_t part0 = vld1q_u8(src);
-      const uint8x16_t part1 = vld1q_u8(src + 16);
-      const uint8x16_t part2 = vld1q_u8(src + 32);
-      const uint8x16_t part3 = vld1q_u8(src + 48);
-      src += src_stride;
-
-      vst1q_u8(dst, part0);
-      vst1q_u8(dst + 16, part1);
-      vst1q_u8(dst + 32, part2);
-      vst1q_u8(dst + 48, part3);
-      dst += dst_stride;
-      h--;
-    }
-  } else if (w == 32) {
-    // 32 bytes per row, two rows per pass.
-    while (h > 0) {
-      uint8x16_t top_lo, top_hi, bottom_lo, bottom_hi;
-
-      top_lo = vld1q_u8(src);
-      top_hi = vld1q_u8(src + 16);
-      src += src_stride;
-      bottom_lo = vld1q_u8(src);
-      bottom_hi = vld1q_u8(src + 16);
-      src += src_stride;
-
-      vst1q_u8(dst, top_lo);
-      vst1q_u8(dst + 16, top_hi);
-      dst += dst_stride;
-      vst1q_u8(dst, bottom_lo);
-      vst1q_u8(dst + 16, bottom_hi);
-      dst += dst_stride;
-      h -= 2;
-    }
-  } else if (w > 8) {
-    // 16 bytes per row, two rows per pass.
-    while (h > 0) {
-      uint8x16_t top, bottom;
-
-      top = vld1q_u8(src);
-      src += src_stride;
-      bottom = vld1q_u8(src);
-      src += src_stride;
-
-      vst1q_u8(dst, top);
-      dst += dst_stride;
-      vst1q_u8(dst, bottom);
-      dst += dst_stride;
-      h -= 2;
-    }
-  } else if (w == 8) {
-    // 8 bytes per row, two rows per pass.
-    while (h > 0) {
-      uint8x8_t top, bottom;
-
-      top = vld1_u8(src);
-      src += src_stride;
-      bottom = vld1_u8(src);
-      src += src_stride;
-
-      vst1_u8(dst, top);
-      dst += dst_stride;
-      vst1_u8(dst, bottom);
-      dst += dst_stride;
-      h -= 2;
-    }
-  } else {
-    // 4 bytes per row, moved as one 32-bit word, one row per pass.
-    while (h > 0) {
-      *(uint32_t *)dst = *(const uint32_t *)src;
-      src += src_stride;
-      dst += dst_stride;
-      h--;
-    }
-  }
-  return;
-}
#ifndef VP9_COMMON_VP9_ENTROPYMODE_H_
#define VP9_COMMON_VP9_ENTROPYMODE_H_
-#include "vp9/common/vp9_filter.h"
#include "vp9/common/vp9_entropy.h"
#include "vp9/common/vp9_entropymv.h"
+#include "vp9/common/vp9_filter.h"
+#include "vpx_dsp/vpx_filter.h"
#ifdef __cplusplus
extern "C" {
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_filter.h"
#include "vpx_ports/mem.h"
extern "C" {
#endif
-#define FILTER_BITS 7
-
-#define SUBPEL_BITS 4
-#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
-#define SUBPEL_SHIFTS (1 << SUBPEL_BITS)
-#define SUBPEL_TAPS 8
-
#define EIGHTTAP 0
#define EIGHTTAP_SMOOTH 1
#define EIGHTTAP_SHARP 2
// 8-tap, 8-tap-smooth, 8-tap-sharp, and switching between the three.
#define SWITCHABLE_FILTER_CONTEXTS (SWITCHABLE_FILTERS + 1)
#define SWITCHABLE 4 /* should be the last one */
-typedef uint8_t INTERP_FILTER;
-typedef int16_t InterpKernel[SUBPEL_TAPS];
+typedef uint8_t INTERP_FILTER;
extern const InterpKernel *vp9_filter_kernels[4];
#include "./vpx_config.h"
#include "vpx_dsp/txfm_common.h"
+#if CONFIG_VP9_HIGHBITDEPTH
+#include "vpx_dsp/vpx_dsp_common.h"
+#endif // CONFIG_VP9_HIGHBITDEPTH
#include "vpx_ports/mem.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_enums.h"
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_blockd.h"
-#include "vp9/common/vp9_filter.h"
#include "vp9/common/vp9_reconinter.h"
#include "vp9/common/vp9_reconintra.h"
#ifndef VP9_COMMON_VP9_RECONINTER_H_
#define VP9_COMMON_VP9_RECONINTER_H_
-#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_filter.h"
#include "vp9/common/vp9_onyxc_int.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_filter.h"
#ifdef __cplusplus
extern "C" {
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
+#if CONFIG_VP9_HIGHBITDEPTH
+#include "vpx_dsp/vpx_dsp_common.h"
+#endif // CONFIG_VP9_HIGHBITDEPTH
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/mem.h"
#include "vpx_ports/vpx_once.h"
$avx2_x86_64 = 'avx2';
}
-# optimizations which depend on multiple features
-$avx2_ssse3 = '';
-if ((vpx_config("HAVE_AVX2") eq "yes") && (vpx_config("HAVE_SSSE3") eq "yes")) {
- $avx2_ssse3 = 'avx2';
-}
-
#
# post proc
#
}
#
-# Sub Pixel Filters
-#
-add_proto qw/void vp9_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve_copy neon dspr2 msa/, "$sse2_x86inc";
-
-add_proto qw/void vp9_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve_avg neon dspr2 msa/, "$sse2_x86inc";
-
-add_proto qw/void vp9_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8 sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3";
-
-add_proto qw/void vp9_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8_horiz sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3";
-
-add_proto qw/void vp9_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8_vert sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3";
-
-add_proto qw/void vp9_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8_avg sse2 ssse3 neon dspr2 msa/;
-
-add_proto qw/void vp9_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8_avg_horiz sse2 ssse3 neon dspr2 msa/;
-
-add_proto qw/void vp9_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8_avg_vert sse2 ssse3 neon dspr2 msa/;
-
-#
# dct
#
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
#include "vp9/common/vp9_filter.h"
#include "vp9/common/vp9_scale.h"
+#include "vpx_dsp/vpx_filter.h"
static INLINE int scaled_x(int val, const struct scale_factors *sf) {
return (int)((int64_t)val * sf->x_scale_fp >> REF_SCALE_SHIFT);
if (sf->x_step_q4 == 16) {
if (sf->y_step_q4 == 16) {
// No scaling in either direction.
- sf->predict[0][0][0] = vp9_convolve_copy;
- sf->predict[0][0][1] = vp9_convolve_avg;
- sf->predict[0][1][0] = vp9_convolve8_vert;
- sf->predict[0][1][1] = vp9_convolve8_avg_vert;
- sf->predict[1][0][0] = vp9_convolve8_horiz;
- sf->predict[1][0][1] = vp9_convolve8_avg_horiz;
+ sf->predict[0][0][0] = vpx_convolve_copy;
+ sf->predict[0][0][1] = vpx_convolve_avg;
+ sf->predict[0][1][0] = vpx_convolve8_vert;
+ sf->predict[0][1][1] = vpx_convolve8_avg_vert;
+ sf->predict[1][0][0] = vpx_convolve8_horiz;
+ sf->predict[1][0][1] = vpx_convolve8_avg_horiz;
} else {
// No scaling in x direction. Must always scale in the y direction.
- sf->predict[0][0][0] = vp9_convolve8_vert;
- sf->predict[0][0][1] = vp9_convolve8_avg_vert;
- sf->predict[0][1][0] = vp9_convolve8_vert;
- sf->predict[0][1][1] = vp9_convolve8_avg_vert;
- sf->predict[1][0][0] = vp9_convolve8;
- sf->predict[1][0][1] = vp9_convolve8_avg;
+ sf->predict[0][0][0] = vpx_convolve8_vert;
+ sf->predict[0][0][1] = vpx_convolve8_avg_vert;
+ sf->predict[0][1][0] = vpx_convolve8_vert;
+ sf->predict[0][1][1] = vpx_convolve8_avg_vert;
+ sf->predict[1][0][0] = vpx_convolve8;
+ sf->predict[1][0][1] = vpx_convolve8_avg;
}
} else {
if (sf->y_step_q4 == 16) {
// No scaling in the y direction. Must always scale in the x direction.
- sf->predict[0][0][0] = vp9_convolve8_horiz;
- sf->predict[0][0][1] = vp9_convolve8_avg_horiz;
- sf->predict[0][1][0] = vp9_convolve8;
- sf->predict[0][1][1] = vp9_convolve8_avg;
- sf->predict[1][0][0] = vp9_convolve8_horiz;
- sf->predict[1][0][1] = vp9_convolve8_avg_horiz;
+ sf->predict[0][0][0] = vpx_convolve8_horiz;
+ sf->predict[0][0][1] = vpx_convolve8_avg_horiz;
+ sf->predict[0][1][0] = vpx_convolve8;
+ sf->predict[0][1][1] = vpx_convolve8_avg;
+ sf->predict[1][0][0] = vpx_convolve8_horiz;
+ sf->predict[1][0][1] = vpx_convolve8_avg_horiz;
} else {
// Must always scale in both directions.
- sf->predict[0][0][0] = vp9_convolve8;
- sf->predict[0][0][1] = vp9_convolve8_avg;
- sf->predict[0][1][0] = vp9_convolve8;
- sf->predict[0][1][1] = vp9_convolve8_avg;
- sf->predict[1][0][0] = vp9_convolve8;
- sf->predict[1][0][1] = vp9_convolve8_avg;
+ sf->predict[0][0][0] = vpx_convolve8;
+ sf->predict[0][0][1] = vpx_convolve8_avg;
+ sf->predict[0][1][0] = vpx_convolve8;
+ sf->predict[0][1][1] = vpx_convolve8_avg;
+ sf->predict[1][0][0] = vpx_convolve8;
+ sf->predict[1][0][1] = vpx_convolve8_avg;
}
}
// 2D subpel motion always gets filtered in both directions
- sf->predict[1][1][0] = vp9_convolve8;
- sf->predict[1][1][1] = vp9_convolve8_avg;
+ sf->predict[1][1][0] = vpx_convolve8;
+ sf->predict[1][1][1] = vpx_convolve8_avg;
#if CONFIG_VP9_HIGHBITDEPTH
if (use_highbd) {
if (sf->x_step_q4 == 16) {
if (sf->y_step_q4 == 16) {
// No scaling in either direction.
- sf->highbd_predict[0][0][0] = vp9_highbd_convolve_copy;
- sf->highbd_predict[0][0][1] = vp9_highbd_convolve_avg;
- sf->highbd_predict[0][1][0] = vp9_highbd_convolve8_vert;
- sf->highbd_predict[0][1][1] = vp9_highbd_convolve8_avg_vert;
- sf->highbd_predict[1][0][0] = vp9_highbd_convolve8_horiz;
- sf->highbd_predict[1][0][1] = vp9_highbd_convolve8_avg_horiz;
+ sf->highbd_predict[0][0][0] = vpx_highbd_convolve_copy;
+ sf->highbd_predict[0][0][1] = vpx_highbd_convolve_avg;
+ sf->highbd_predict[0][1][0] = vpx_highbd_convolve8_vert;
+ sf->highbd_predict[0][1][1] = vpx_highbd_convolve8_avg_vert;
+ sf->highbd_predict[1][0][0] = vpx_highbd_convolve8_horiz;
+ sf->highbd_predict[1][0][1] = vpx_highbd_convolve8_avg_horiz;
} else {
// No scaling in x direction. Must always scale in the y direction.
- sf->highbd_predict[0][0][0] = vp9_highbd_convolve8_vert;
- sf->highbd_predict[0][0][1] = vp9_highbd_convolve8_avg_vert;
- sf->highbd_predict[0][1][0] = vp9_highbd_convolve8_vert;
- sf->highbd_predict[0][1][1] = vp9_highbd_convolve8_avg_vert;
- sf->highbd_predict[1][0][0] = vp9_highbd_convolve8;
- sf->highbd_predict[1][0][1] = vp9_highbd_convolve8_avg;
+ sf->highbd_predict[0][0][0] = vpx_highbd_convolve8_vert;
+ sf->highbd_predict[0][0][1] = vpx_highbd_convolve8_avg_vert;
+ sf->highbd_predict[0][1][0] = vpx_highbd_convolve8_vert;
+ sf->highbd_predict[0][1][1] = vpx_highbd_convolve8_avg_vert;
+ sf->highbd_predict[1][0][0] = vpx_highbd_convolve8;
+ sf->highbd_predict[1][0][1] = vpx_highbd_convolve8_avg;
}
} else {
if (sf->y_step_q4 == 16) {
// No scaling in the y direction. Must always scale in the x direction.
- sf->highbd_predict[0][0][0] = vp9_highbd_convolve8_horiz;
- sf->highbd_predict[0][0][1] = vp9_highbd_convolve8_avg_horiz;
- sf->highbd_predict[0][1][0] = vp9_highbd_convolve8;
- sf->highbd_predict[0][1][1] = vp9_highbd_convolve8_avg;
- sf->highbd_predict[1][0][0] = vp9_highbd_convolve8_horiz;
- sf->highbd_predict[1][0][1] = vp9_highbd_convolve8_avg_horiz;
+ sf->highbd_predict[0][0][0] = vpx_highbd_convolve8_horiz;
+ sf->highbd_predict[0][0][1] = vpx_highbd_convolve8_avg_horiz;
+ sf->highbd_predict[0][1][0] = vpx_highbd_convolve8;
+ sf->highbd_predict[0][1][1] = vpx_highbd_convolve8_avg;
+ sf->highbd_predict[1][0][0] = vpx_highbd_convolve8_horiz;
+ sf->highbd_predict[1][0][1] = vpx_highbd_convolve8_avg_horiz;
} else {
// Must always scale in both directions.
- sf->highbd_predict[0][0][0] = vp9_highbd_convolve8;
- sf->highbd_predict[0][0][1] = vp9_highbd_convolve8_avg;
- sf->highbd_predict[0][1][0] = vp9_highbd_convolve8;
- sf->highbd_predict[0][1][1] = vp9_highbd_convolve8_avg;
- sf->highbd_predict[1][0][0] = vp9_highbd_convolve8;
- sf->highbd_predict[1][0][1] = vp9_highbd_convolve8_avg;
+ sf->highbd_predict[0][0][0] = vpx_highbd_convolve8;
+ sf->highbd_predict[0][0][1] = vpx_highbd_convolve8_avg;
+ sf->highbd_predict[0][1][0] = vpx_highbd_convolve8;
+ sf->highbd_predict[0][1][1] = vpx_highbd_convolve8_avg;
+ sf->highbd_predict[1][0][0] = vpx_highbd_convolve8;
+ sf->highbd_predict[1][0][1] = vpx_highbd_convolve8_avg;
}
}
// 2D subpel motion always gets filtered in both directions.
- sf->highbd_predict[1][1][0] = vp9_highbd_convolve8;
- sf->highbd_predict[1][1][1] = vp9_highbd_convolve8_avg;
+ sf->highbd_predict[1][1][0] = vpx_highbd_convolve8;
+ sf->highbd_predict[1][1][1] = vpx_highbd_convolve8_avg;
}
#endif
}
#define VP9_COMMON_VP9_SCALE_H_
#include "vp9/common/vp9_mv.h"
-#include "vp9/common/vp9_convolve.h"
+#include "vpx_dsp/vpx_convolve.h"
#ifdef __cplusplus
extern "C" {
#include <stdlib.h> // qsort()
#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
#include "./vpx_scale_rtcd.h"
#include "vpx_dsp/bitreader_buffer.h"
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "./vpx_config.h"
#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_convolve.h"
#include "vp9/common/vp9_filter.h"
#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_convolve.h"
+#include "vpx_dsp/vpx_filter.h"
#include "vpx_ports/mem.h"
static int horizontal_filter(const uint8_t *s) {
#include <assert.h>
#include <limits.h>
+#include "./vpx_dsp_rtcd.h"
#include "vpx_scale/yv12config.h"
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_reconinter.h"
}
if (decision == FILTER_BLOCK) {
- vp9_convolve_copy(avg_start, avg.y_stride, src.buf, src.stride,
+ vpx_convolve_copy(avg_start, avg.y_stride, src.buf, src.stride,
NULL, 0, NULL, 0,
num_4x4_blocks_wide_lookup[bs] << 2,
num_4x4_blocks_high_lookup[bs] << 2);
} else { // COPY_BLOCK
- vp9_convolve_copy(src.buf, src.stride, avg_start, avg.y_stride,
+ vpx_convolve_copy(src.buf, src.stride, avg_start, avg.y_stride,
NULL, 0, NULL, 0,
num_4x4_blocks_wide_lookup[bs] << 2,
num_4x4_blocks_high_lookup[bs] << 2);
#include <stdio.h>
#include <limits.h>
-#include "./vpx_config.h"
#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "./vpx_scale_rtcd.h"
#include "vpx/internal/vpx_psnr.h"
+#include "vpx_dsp/vpx_filter.h"
#include "vpx_ports/mem.h"
#include "vpx_ports/vpx_timer.h"
#include "vpx_scale/vpx_scale.h"
#if CONFIG_VP9_HIGHBITDEPTH
if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
- vp9_highbd_convolve8(src_ptr, src_stride, dst_ptr, dst_stride,
+ vpx_highbd_convolve8(src_ptr, src_stride, dst_ptr, dst_stride,
kernel[x_q4 & 0xf], 16 * src_w / dst_w,
kernel[y_q4 & 0xf], 16 * src_h / dst_h,
16 / factor, 16 / factor, bd);
} else {
- vp9_convolve8(src_ptr, src_stride, dst_ptr, dst_stride,
+ vpx_convolve8(src_ptr, src_stride, dst_ptr, dst_stride,
kernel[x_q4 & 0xf], 16 * src_w / dst_w,
kernel[y_q4 & 0xf], 16 * src_h / dst_h,
16 / factor, 16 / factor);
}
#else
- vp9_convolve8(src_ptr, src_stride, dst_ptr, dst_stride,
+ vpx_convolve8(src_ptr, src_stride, dst_ptr, dst_stride,
kernel[x_q4 & 0xf], 16 * src_w / dst_w,
kernel[y_q4 & 0xf], 16 * src_h / dst_h,
16 / factor, 16 / factor);
this_mode_pred = &tmp[get_pred_buffer(tmp, 3)];
#if CONFIG_VP9_HIGHBITDEPTH
if (cm->use_highbitdepth)
- vp9_highbd_convolve_copy(best_pred->data, best_pred->stride,
+ vpx_highbd_convolve_copy(best_pred->data, best_pred->stride,
this_mode_pred->data, this_mode_pred->stride,
NULL, 0, NULL, 0, bw, bh, xd->bd);
else
- vp9_convolve_copy(best_pred->data, best_pred->stride,
+ vpx_convolve_copy(best_pred->data, best_pred->stride,
this_mode_pred->data, this_mode_pred->stride,
NULL, 0, NULL, 0, bw, bh);
#else
- vp9_convolve_copy(best_pred->data, best_pred->stride,
+ vpx_convolve_copy(best_pred->data, best_pred->stride,
this_mode_pred->data, this_mode_pred->stride,
NULL, 0, NULL, 0, bw, bh);
#endif // CONFIG_VP9_HIGHBITDEPTH
if (best_pred->data != orig_dst.buf && is_inter_mode(mbmi->mode)) {
#if CONFIG_VP9_HIGHBITDEPTH
if (cm->use_highbitdepth)
- vp9_highbd_convolve_copy(best_pred->data, best_pred->stride,
+ vpx_highbd_convolve_copy(best_pred->data, best_pred->stride,
pd->dst.buf, pd->dst.stride, NULL, 0,
NULL, 0, bw, bh, xd->bd);
else
- vp9_convolve_copy(best_pred->data, best_pred->stride,
+ vpx_convolve_copy(best_pred->data, best_pred->stride,
pd->dst.buf, pd->dst.stride, NULL, 0,
NULL, 0, bw, bh);
#else
- vp9_convolve_copy(best_pred->data, best_pred->stride,
+ vpx_convolve_copy(best_pred->data, best_pred->stride,
pd->dst.buf, pd->dst.stride, NULL, 0,
NULL, 0, bw, bh);
#endif // CONFIG_VP9_HIGHBITDEPTH
#include <stdlib.h>
#include <string.h>
+#if CONFIG_VP9_HIGHBITDEPTH
+#include "vpx_dsp/vpx_dsp_common.h"
+#endif // CONFIG_VP9_HIGHBITDEPTH
#include "vpx_ports/mem.h"
#include "vp9/common/vp9_common.h"
#include "vp9/encoder/vp9_resize.h"
VP9_COMMON_SRCS-yes += common/vp9_ppflags.h
VP9_COMMON_SRCS-yes += common/vp9_alloccommon.c
VP9_COMMON_SRCS-yes += common/vp9_blockd.c
-VP9_COMMON_SRCS-yes += common/vp9_convolve.c
-VP9_COMMON_SRCS-yes += common/vp9_convolve.h
VP9_COMMON_SRCS-yes += common/vp9_debugmodes.c
VP9_COMMON_SRCS-yes += common/vp9_entropy.c
VP9_COMMON_SRCS-yes += common/vp9_entropymode.c
VP9_COMMON_SRCS-yes += common/vp9_entropymv.c
-VP9_COMMON_SRCS-yes += common/vp9_filter.c
-VP9_COMMON_SRCS-yes += common/vp9_filter.h
VP9_COMMON_SRCS-yes += common/vp9_frame_buffers.c
VP9_COMMON_SRCS-yes += common/vp9_frame_buffers.h
VP9_COMMON_SRCS-yes += common/vp9_idct.c
VP9_COMMON_SRCS-yes += common/vp9_entropymode.h
VP9_COMMON_SRCS-yes += common/vp9_entropymv.h
VP9_COMMON_SRCS-yes += common/vp9_enums.h
+VP9_COMMON_SRCS-yes += common/vp9_filter.h
+VP9_COMMON_SRCS-yes += common/vp9_filter.c
VP9_COMMON_SRCS-yes += common/vp9_idct.h
VP9_COMMON_SRCS-yes += common/vp9_loopfilter.h
VP9_COMMON_SRCS-yes += common/vp9_thread_common.h
VP9_COMMON_SRCS-yes += common/vp9_scan.c
VP9_COMMON_SRCS-yes += common/vp9_scan.h
-VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/convolve.h
-VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_asm_stubs.c
VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.h
VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c
VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.h
VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.c
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_sse2.asm
-VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_8t_sse2.asm
-VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_bilinear_sse2.asm
-VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm
-VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_bilinear_ssse3.asm
-VP9_COMMON_SRCS-$(HAVE_AVX2) += common/x86/vp9_subpixel_8t_intrin_avx2.c
-VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_intrin_ssse3.c
ifeq ($(CONFIG_VP9_POSTPROC),yes)
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_mfqe_sse2.asm
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm
endif
-ifeq ($(CONFIG_USE_X86INC),yes)
-VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_copy_sse2.asm
-endif
-
-ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
-VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_high_subpixel_8t_sse2.asm
-VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_high_subpixel_bilinear_sse2.asm
-endif
-
# common (c)
VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_common_dspr2.h
VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve2_avg_dspr2.c
endif
# common (msa)
-VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_avg_horiz_msa.c
-VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_avg_msa.c
-VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_avg_vert_msa.c
-VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_horiz_msa.c
-VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_msa.c
-VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_vert_msa.c
-VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve_avg_msa.c
-VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve_copy_msa.c
-VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve_msa.h
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct4x4_msa.c
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct8x8_msa.c
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct16x16_msa.c
# neon with assembly and intrinsics implementations. If both are available
# prefer assembly.
ifeq ($(HAVE_NEON_ASM), yes)
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve8_avg_neon_asm$(ASM)
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve8_neon_asm$(ASM)
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve_avg_neon_asm$(ASM)
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve_neon.c
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_copy_neon_asm$(ASM)
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_1_add_neon_asm$(ASM)
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_add_neon_asm$(ASM)
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_neon.c
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_add_neon_asm$(ASM)
else
ifeq ($(HAVE_NEON), yes)
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve8_avg_neon.c
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve8_neon.c
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve_avg_neon.c
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve_neon.c
-VP9_COMMON_SRCS-yes += common/arm/neon/vp9_copy_neon.c
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_1_add_neon.c
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_add_neon.c
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_neon.c
--- /dev/null
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);  // C version; the NEON horizontal routine in this file calls it when x_step_q4 != 16
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);  // C version; the NEON vertical routine in this file calls it when y_step_q4 != 16
+
+// Eight-tap multiply-accumulate for four results at once: lane n of the
+// returned vector is dsrc0[n] * q0s16[0] + dsrc1[n] * q0s16[1] + ... +
+// dsrc7[n] * q0s16[7], accumulated in 32 bits.
+static INLINE int32x4_t MULTIPLY_BY_Q0(
+    int16x4_t dsrc0, int16x4_t dsrc1, int16x4_t dsrc2, int16x4_t dsrc3,
+    int16x4_t dsrc4, int16x4_t dsrc5, int16x4_t dsrc6, int16x4_t dsrc7,
+    int16x8_t q0s16) {
+  // The *_lane intrinsics select a coefficient from a 64-bit vector, so the
+  // eight taps are split into a low and a high half first.
+  const int16x4_t taps_lo = vget_low_s16(q0s16);
+  const int16x4_t taps_hi = vget_high_s16(q0s16);
+  int32x4_t acc = vmull_lane_s16(dsrc0, taps_lo, 0);
+
+  acc = vmlal_lane_s16(acc, dsrc1, taps_lo, 1);
+  acc = vmlal_lane_s16(acc, dsrc2, taps_lo, 2);
+  acc = vmlal_lane_s16(acc, dsrc3, taps_lo, 3);
+  acc = vmlal_lane_s16(acc, dsrc4, taps_hi, 0);
+  acc = vmlal_lane_s16(acc, dsrc5, taps_hi, 1);
+  acc = vmlal_lane_s16(acc, dsrc6, taps_hi, 2);
+  acc = vmlal_lane_s16(acc, dsrc7, taps_hi, 3);
+  return acc;
+}
+
+// Horizontal 8-tap convolution whose result is averaged, with rounding, into
+// the pixels already stored in dst.
+//
+// src, src_stride: source plane; reads start three pixels left of src.
+// dst, dst_stride: destination plane; read and then overwritten.
+// filter_x:        eight 16-bit taps, loaded with a single vld1q_s16().
+// x_step_q4:       only 16 is handled here; any other value is forwarded to
+//                  vpx_convolve8_avg_horiz_c().
+// filter_y, y_step_q4: not used by the NEON path, only passed to the C
+//                  fallback.
+// w, h:            block size. Work is done in 4x4 tiles that are always
+//                  loaded and stored in full, so both are presumably
+//                  multiples of 4 (TODO confirm against callers).
+void vpx_convolve8_avg_horiz_neon(
+    const uint8_t *src,
+    ptrdiff_t src_stride,
+    uint8_t *dst,
+    ptrdiff_t dst_stride,
+    const int16_t *filter_x,
+    int x_step_q4,
+    const int16_t *filter_y,  // only forwarded to the C fallback
+    int y_step_q4,            // only forwarded to the C fallback
+    int w,
+    int h) {
+  int width;
+  const uint8_t *s;
+  uint8_t *d;
+  uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8;
+  uint32x2_t d2u32, d3u32, d28u32, d29u32, d30u32, d31u32;
+  // vld1_lane_u32() takes the previous vector value as an input, so the
+  // vectors that are filled lane by lane must start out with a defined value.
+  uint32x2_t d6u32 = vdup_n_u32(0);
+  uint32x2_t d7u32 = vdup_n_u32(0);
+  uint8x16_t q1u8, q3u8, q12u8, q13u8, q14u8, q15u8;
+  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16;
+  int16x4_t d24s16, d25s16, d26s16, d27s16;
+  uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16;
+  int16x8_t q0s16;
+  uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
+  int32x4_t q1s32, q2s32, q14s32, q15s32;
+  uint16x8x2_t q0x2u16;
+  uint8x8x2_t d0x2u8, d1x2u8;
+  uint32x2x2_t d0x2u32;
+  uint16x4x2_t d0x2u16, d1x2u16;
+  uint32x4x2_t q0x2u32;
+
+  if (x_step_q4 != 16) {
+    vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
+                              filter_x, x_step_q4,
+                              filter_y, y_step_q4, w, h);
+    return;
+  }
+
+  q0s16 = vld1q_s16(filter_x);
+
+  src -= 3;  // adjust for taps
+  for (; h > 0; h -= 4) {  // loop_horiz_v
+    // Load the first eight pixels of four rows and transpose them so that
+    // each 4-lane vector holds one source column across the four rows.
+    s = src;
+    d24u8 = vld1_u8(s);
+    s += src_stride;
+    d25u8 = vld1_u8(s);
+    s += src_stride;
+    d26u8 = vld1_u8(s);
+    s += src_stride;
+    d27u8 = vld1_u8(s);
+
+    q12u8 = vcombine_u8(d24u8, d25u8);
+    q13u8 = vcombine_u8(d26u8, d27u8);
+
+    q0x2u16 = vtrnq_u16(vreinterpretq_u16_u8(q12u8),
+                        vreinterpretq_u16_u8(q13u8));
+    d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
+    d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
+    d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
+    d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1]));
+    d0x2u8 = vtrn_u8(d24u8, d25u8);
+    d1x2u8 = vtrn_u8(d26u8, d27u8);
+
+    __builtin_prefetch(src + src_stride * 4);
+    __builtin_prefetch(src + src_stride * 5);
+
+    q8u16 = vmovl_u8(d0x2u8.val[0]);
+    q9u16 = vmovl_u8(d0x2u8.val[1]);
+    q10u16 = vmovl_u8(d1x2u8.val[0]);
+    q11u16 = vmovl_u8(d1x2u8.val[1]);
+
+    src += 7;
+    d16u16 = vget_low_u16(q8u16);
+    d17u16 = vget_high_u16(q8u16);
+    d18u16 = vget_low_u16(q9u16);
+    d19u16 = vget_high_u16(q9u16);
+    q8u16 = vcombine_u16(d16u16, d18u16);  // vswp 17 18
+    q9u16 = vcombine_u16(d17u16, d19u16);
+
+    d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
+    d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));  // vmov 23 21
+    for (width = w;
+         width > 0;
+         width -= 4, src += 4, dst += 4) {  // loop_horiz
+      // Fetch the next four pixels of each of the four rows.
+      s = src;
+      d28u32 = vld1_dup_u32((const uint32_t *)s);
+      s += src_stride;
+      d29u32 = vld1_dup_u32((const uint32_t *)s);
+      s += src_stride;
+      d31u32 = vld1_dup_u32((const uint32_t *)s);
+      s += src_stride;
+      d30u32 = vld1_dup_u32((const uint32_t *)s);
+
+      __builtin_prefetch(src + 64);
+
+      d0x2u16 = vtrn_u16(vreinterpret_u16_u32(d28u32),
+                         vreinterpret_u16_u32(d31u32));
+      d1x2u16 = vtrn_u16(vreinterpret_u16_u32(d29u32),
+                         vreinterpret_u16_u32(d30u32));
+      d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]),   // d28
+                       vreinterpret_u8_u16(d1x2u16.val[0]));  // d29
+      d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]),   // d31
+                       vreinterpret_u8_u16(d1x2u16.val[1]));  // d30
+
+      __builtin_prefetch(src + 64 + src_stride);
+
+      q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
+      q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);
+      q0x2u32 = vtrnq_u32(vreinterpretq_u32_u8(q14u8),
+                          vreinterpretq_u32_u8(q15u8));
+
+      d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));
+      d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));
+      q12u16 = vmovl_u8(d28u8);
+      q13u16 = vmovl_u8(d29u8);
+
+      __builtin_prefetch(src + 64 + src_stride * 2);
+
+      // Current destination tile, needed for the final average. Rows 0 and 2
+      // go to d6u32, rows 1 and 3 to d7u32, matching the store order below.
+      d = dst;
+      d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0);
+      d += dst_stride;
+      d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0);
+      d += dst_stride;
+      d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1);
+      d += dst_stride;
+      d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1);
+
+      d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
+      d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
+      d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
+      d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
+      d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+      d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+      d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+      d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+      d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+
+      // One call per output column; each call shifts the 8-column window by
+      // one source column.
+      q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16,
+                             d18s16, d19s16, d23s16, d24s16, q0s16);
+      q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16,
+                             d19s16, d23s16, d24s16, d26s16, q0s16);
+      q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16,
+                              d23s16, d24s16, d26s16, d27s16, q0s16);
+      q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16,
+                              d24s16, d26s16, d27s16, d25s16, q0s16);
+
+      __builtin_prefetch(src + 64 + src_stride * 3);
+
+      // Rounding shift right by 7 with unsigned saturation, then narrow to
+      // 8 bits (again saturating).
+      d2u16 = vqrshrun_n_s32(q1s32, 7);
+      d3u16 = vqrshrun_n_s32(q2s32, 7);
+      d4u16 = vqrshrun_n_s32(q14s32, 7);
+      d5u16 = vqrshrun_n_s32(q15s32, 7);
+
+      q1u16 = vcombine_u16(d2u16, d3u16);
+      q2u16 = vcombine_u16(d4u16, d5u16);
+
+      d2u8 = vqmovn_u16(q1u16);
+      d3u8 = vqmovn_u16(q2u16);
+
+      // Transpose the 4x4 result back so each 32-bit lane is one output row.
+      d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8),
+                         vreinterpret_u16_u8(d3u8));
+      d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
+                         vreinterpret_u32_u16(d0x2u16.val[1]));
+      d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
+                       vreinterpret_u8_u32(d0x2u32.val[1]));
+
+      q1u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
+      q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32));
+
+      // Rounding average of the filtered pixels and the existing dst pixels.
+      q1u8 = vrhaddq_u8(q1u8, q3u8);
+
+      d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8));
+      d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8));
+
+      d = dst;
+      vst1_lane_u32((uint32_t *)d, d2u32, 0);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d3u32, 0);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d2u32, 1);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d3u32, 1);
+
+      // Slide the window of widened source columns four pixels to the right.
+      q8u16 = q9u16;
+      d20s16 = d23s16;
+      q11u16 = q12u16;
+      q9u16 = q13u16;
+      d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+    }
+    // Undo the horizontal advance (w plus the 7 added above) and step down
+    // four rows.
+    src += src_stride * 4 - w - 7;
+    dst += dst_stride * 4 - w;
+  }
+  return;
+}
+
+// Vertical 8-tap convolution whose result is averaged, with rounding, into
+// the pixels already stored in dst.
+//
+// src, src_stride: source plane; reads start three rows above src.
+// dst, dst_stride: destination plane; read and then overwritten.
+// filter_y:        eight 16-bit taps, loaded with a single vld1q_s16().
+// y_step_q4:       only 16 is handled here; any other value is forwarded to
+//                  vpx_convolve8_avg_vert_c().
+// filter_x, x_step_q4: not used by the NEON path, only passed to the C
+//                  fallback.
+// w, h:            block size. Work is done in 4x4 tiles that are always
+//                  loaded and stored in full, so both are presumably
+//                  multiples of 4 (TODO confirm against callers).
+void vpx_convolve8_avg_vert_neon(
+    const uint8_t *src,
+    ptrdiff_t src_stride,
+    uint8_t *dst,
+    ptrdiff_t dst_stride,
+    const int16_t *filter_x,  // only forwarded to the C fallback
+    int x_step_q4,            // only forwarded to the C fallback
+    const int16_t *filter_y,
+    int y_step_q4,
+    int w,
+    int h) {
+  int height;
+  const uint8_t *s;
+  uint8_t *d;
+  uint8x8_t d2u8, d3u8;
+  uint32x2_t d2u32, d3u32;
+  // vld1_lane_u32() takes the previous vector value as an input, so every
+  // vector that is filled lane by lane must start out with a defined value.
+  uint32x2_t d6u32 = vdup_n_u32(0);
+  uint32x2_t d7u32 = vdup_n_u32(0);
+  uint32x2_t d16u32 = vdup_n_u32(0);
+  uint32x2_t d18u32 = vdup_n_u32(0);
+  uint32x2_t d20u32 = vdup_n_u32(0);
+  uint32x2_t d22u32 = vdup_n_u32(0);
+  uint32x2_t d24u32 = vdup_n_u32(0);
+  uint32x2_t d26u32 = vdup_n_u32(0);
+  uint8x16_t q1u8, q3u8;
+  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16;
+  int16x4_t d24s16, d25s16, d26s16, d27s16;
+  uint16x4_t d2u16, d3u16, d4u16, d5u16;
+  int16x8_t q0s16;
+  uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
+  int32x4_t q1s32, q2s32, q14s32, q15s32;
+
+  if (y_step_q4 != 16) {
+    vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
+                             filter_x, x_step_q4,
+                             filter_y, y_step_q4, w, h);
+    return;
+  }
+
+  src -= src_stride * 3;
+  q0s16 = vld1q_s16(filter_y);
+  for (; w > 0; w -= 4, src += 4, dst += 4) {  // loop_vert_h
+    // Prime the window with the first seven rows of this 4-pixel-wide strip
+    // (two rows per 64-bit vector; only lane 0 of d22u32 is used).
+    s = src;
+    d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0);
+    s += src_stride;
+    d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1);
+    s += src_stride;
+    d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0);
+    s += src_stride;
+    d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1);
+    s += src_stride;
+    d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0);
+    s += src_stride;
+    d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1);
+    s += src_stride;
+    d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0);
+    s += src_stride;
+
+    q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32));
+    q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32));
+    q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32));
+    q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32));
+
+    d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
+    d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
+    d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+    d = dst;
+    for (height = h; height > 0; height -= 4) {  // loop_vert
+      // Four new source rows. The lane order (24[0], 26[0], 26[1], 24[1])
+      // puts them in d24s16, d26s16, d27s16, d25s16 respectively.
+      d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0);
+      s += src_stride;
+      d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0);
+      s += src_stride;
+      d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1);
+      s += src_stride;
+      d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1);
+      s += src_stride;
+
+      q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32));
+      q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32));
+
+      // Current destination tile, needed for the final average; d is
+      // rewound afterwards so the stores below hit the same rows.
+      d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0);
+      d += dst_stride;
+      d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1);
+      d += dst_stride;
+      d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0);
+      d += dst_stride;
+      d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1);
+      d -= dst_stride * 3;
+
+      d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
+      d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
+      d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
+      d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
+      d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+      d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+      d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+      d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+
+      // One call per output row; each call shifts the 8-row window down by
+      // one source row.
+      __builtin_prefetch(s);
+      __builtin_prefetch(s + src_stride);
+      q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16,
+                             d20s16, d21s16, d22s16, d24s16, q0s16);
+      __builtin_prefetch(s + src_stride * 2);
+      __builtin_prefetch(s + src_stride * 3);
+      q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16,
+                             d21s16, d22s16, d24s16, d26s16, q0s16);
+      __builtin_prefetch(d);
+      __builtin_prefetch(d + dst_stride);
+      q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16,
+                              d22s16, d24s16, d26s16, d27s16, q0s16);
+      __builtin_prefetch(d + dst_stride * 2);
+      __builtin_prefetch(d + dst_stride * 3);
+      q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16,
+                              d24s16, d26s16, d27s16, d25s16, q0s16);
+
+      // Rounding shift right by 7 with unsigned saturation, then narrow to
+      // 8 bits (again saturating).
+      d2u16 = vqrshrun_n_s32(q1s32, 7);
+      d3u16 = vqrshrun_n_s32(q2s32, 7);
+      d4u16 = vqrshrun_n_s32(q14s32, 7);
+      d5u16 = vqrshrun_n_s32(q15s32, 7);
+
+      q1u16 = vcombine_u16(d2u16, d3u16);
+      q2u16 = vcombine_u16(d4u16, d5u16);
+
+      d2u8 = vqmovn_u16(q1u16);
+      d3u8 = vqmovn_u16(q2u16);
+
+      q1u8 = vcombine_u8(d2u8, d3u8);
+      q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32));
+
+      // Rounding average of the filtered pixels and the existing dst pixels.
+      q1u8 = vrhaddq_u8(q1u8, q3u8);
+
+      d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8));
+      d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8));
+
+      vst1_lane_u32((uint32_t *)d, d2u32, 0);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d2u32, 1);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d3u32, 0);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d3u32, 1);
+      d += dst_stride;
+
+      // Slide the window of widened source rows down by four.
+      q8u16 = q10u16;
+      d18s16 = d22s16;
+      d19s16 = d24s16;
+      q10u16 = q13u16;
+      d22s16 = d25s16;
+    }
+  }
+  return;
+}
; VP9_FILTER_WEIGHT == 128
; VP9_FILTER_SHIFT == 7
- EXPORT |vp9_convolve8_avg_horiz_neon|
- EXPORT |vp9_convolve8_avg_vert_neon|
- IMPORT |vp9_convolve8_avg_horiz_c|
- IMPORT |vp9_convolve8_avg_vert_c|
+ EXPORT |vpx_convolve8_avg_horiz_neon|
+ EXPORT |vpx_convolve8_avg_vert_neon|
+ IMPORT |vpx_convolve8_avg_horiz_c|
+ IMPORT |vpx_convolve8_avg_vert_c|
ARM
REQUIRE8
PRESERVE8
; sp[]int w
; sp[]int h
-|vp9_convolve8_avg_horiz_neon| PROC
+|vpx_convolve8_avg_horiz_neon| PROC
ldr r12, [sp, #4] ; x_step_q4
cmp r12, #16
- bne vp9_convolve8_avg_horiz_c
+ bne vpx_convolve8_avg_horiz_c
push {r4-r10, lr}
mov r10, r6 ; w loop counter
-vp9_convolve8_avg_loop_horiz_v
+vpx_convolve8_avg_loop_horiz_v
vld1.8 {d24}, [r0], r1
vld1.8 {d25}, [r0], r1
vld1.8 {d26}, [r0], r1
add r0, r0, #3
-vp9_convolve8_avg_loop_horiz
+vpx_convolve8_avg_loop_horiz
add r5, r0, #64
vld1.32 {d28[]}, [r0], r1
vmov q9, q13
subs r6, r6, #4 ; w -= 4
- bgt vp9_convolve8_avg_loop_horiz
+ bgt vpx_convolve8_avg_loop_horiz
; outer loop
mov r6, r10 ; restore w counter
add r0, r0, r9 ; src += src_stride * 4 - w
add r2, r2, r12 ; dst += dst_stride * 4 - w
subs r7, r7, #4 ; h -= 4
- bgt vp9_convolve8_avg_loop_horiz_v
+ bgt vpx_convolve8_avg_loop_horiz_v
pop {r4-r10, pc}
ENDP
-|vp9_convolve8_avg_vert_neon| PROC
+|vpx_convolve8_avg_vert_neon| PROC
ldr r12, [sp, #12]
cmp r12, #16
- bne vp9_convolve8_avg_vert_c
+ bne vpx_convolve8_avg_vert_c
push {r4-r8, lr}
lsl r1, r1, #1
lsl r3, r3, #1
-vp9_convolve8_avg_loop_vert_h
+vpx_convolve8_avg_loop_vert_h
mov r4, r0
add r7, r0, r1, asr #1
mov r5, r2
vmovl.u8 q10, d20
vmovl.u8 q11, d22
-vp9_convolve8_avg_loop_vert
+vpx_convolve8_avg_loop_vert
; always process a 4x4 block at a time
vld1.u32 {d24[0]}, [r7], r1
vld1.u32 {d26[0]}, [r4], r1
vmov d22, d25
subs r12, r12, #4 ; h -= 4
- bgt vp9_convolve8_avg_loop_vert
+ bgt vpx_convolve8_avg_loop_vert
; outer loop
add r0, r0, #4
add r2, r2, #4
subs r6, r6, #4 ; w -= 4
- bgt vp9_convolve8_avg_loop_vert_h
+ bgt vpx_convolve8_avg_loop_vert_h
pop {r4-r8, pc}
--- /dev/null
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);  // C version; the NEON horizontal routine in this file calls it when x_step_q4 != 16
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);  // C version; the NEON vertical routine in this file calls it when y_step_q4 != 16
+
+// Eight-tap multiply-accumulate for four results at once: lane n of the
+// returned vector is dsrc0[n] * q0s16[0] + dsrc1[n] * q0s16[1] + ... +
+// dsrc7[n] * q0s16[7], accumulated in 32 bits.
+static INLINE int32x4_t MULTIPLY_BY_Q0(
+    int16x4_t dsrc0, int16x4_t dsrc1, int16x4_t dsrc2, int16x4_t dsrc3,
+    int16x4_t dsrc4, int16x4_t dsrc5, int16x4_t dsrc6, int16x4_t dsrc7,
+    int16x8_t q0s16) {
+  // The *_lane intrinsics select a coefficient from a 64-bit vector, so the
+  // eight taps are split into a low and a high half first.
+  const int16x4_t coeff_lo = vget_low_s16(q0s16);
+  const int16x4_t coeff_hi = vget_high_s16(q0s16);
+  int32x4_t sum = vmull_lane_s16(dsrc0, coeff_lo, 0);
+
+  sum = vmlal_lane_s16(sum, dsrc1, coeff_lo, 1);
+  sum = vmlal_lane_s16(sum, dsrc2, coeff_lo, 2);
+  sum = vmlal_lane_s16(sum, dsrc3, coeff_lo, 3);
+  sum = vmlal_lane_s16(sum, dsrc4, coeff_hi, 0);
+  sum = vmlal_lane_s16(sum, dsrc5, coeff_hi, 1);
+  sum = vmlal_lane_s16(sum, dsrc6, coeff_hi, 2);
+  sum = vmlal_lane_s16(sum, dsrc7, coeff_hi, 3);
+  return sum;
+}
+
+void vpx_convolve8_horiz_neon(  // horizontal 8-tap convolution, result stored straight to dst
+ const uint8_t *src,  // source plane; reads start three pixels left of src
+ ptrdiff_t src_stride,
+ uint8_t *dst,  // destination plane, written in 4x4 tiles
+ ptrdiff_t dst_stride,
+ const int16_t *filter_x,  // eight 16-bit taps, loaded with one vld1q_s16()
+ int x_step_q4,  // only 16 is handled here; other values go to the C version
+ const int16_t *filter_y, // only forwarded to the C fallback
+ int y_step_q4, // only forwarded to the C fallback
+ int w,  // presumably a multiple of 4 (tiles are always 4 wide) - TODO confirm
+ int h) {  // presumably a multiple of 4 (tiles are always 4 tall) - TODO confirm
+ int width;
+ const uint8_t *s, *psrc;
+ uint8_t *d, *pdst;
+ uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8;
+ uint32x2_t d2u32, d3u32, d28u32, d29u32, d30u32, d31u32;
+ uint8x16_t q12u8, q13u8, q14u8, q15u8;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16;
+ uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16;
+ int16x8_t q0s16;
+ uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
+ int32x4_t q1s32, q2s32, q14s32, q15s32;
+ uint16x8x2_t q0x2u16;
+ uint8x8x2_t d0x2u8, d1x2u8;
+ uint32x2x2_t d0x2u32;
+ uint16x4x2_t d0x2u16, d1x2u16;
+ uint32x4x2_t q0x2u32;
+ // Scaled (non-unit) horizontal steps are not implemented with NEON here.
+ if (x_step_q4 != 16) {
+ vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4, w, h);
+ return;
+ }
+
+ q0s16 = vld1q_s16(filter_x);
+
+ src -= 3; // adjust for taps
+ for (; h > 0; h -= 4,
+ src += src_stride * 4,
+ dst += dst_stride * 4) { // loop_horiz_v
+ s = src;  // first eight pixels of each of the four rows
+ d24u8 = vld1_u8(s);
+ s += src_stride;
+ d25u8 = vld1_u8(s);
+ s += src_stride;
+ d26u8 = vld1_u8(s);
+ s += src_stride;
+ d27u8 = vld1_u8(s);
+ // Transpose so each 4-lane vector holds one source column across the four rows.
+ q12u8 = vcombine_u8(d24u8, d25u8);
+ q13u8 = vcombine_u8(d26u8, d27u8);
+
+ q0x2u16 = vtrnq_u16(vreinterpretq_u16_u8(q12u8),
+ vreinterpretq_u16_u8(q13u8));
+ d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
+ d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
+ d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
+ d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1]));
+ d0x2u8 = vtrn_u8(d24u8, d25u8);
+ d1x2u8 = vtrn_u8(d26u8, d27u8);
+
+ __builtin_prefetch(src + src_stride * 4);
+ __builtin_prefetch(src + src_stride * 5);
+ __builtin_prefetch(src + src_stride * 6);
+ // Widen the bytes to 16 bits for the signed multiply-accumulate.
+ q8u16 = vmovl_u8(d0x2u8.val[0]);
+ q9u16 = vmovl_u8(d0x2u8.val[1]);
+ q10u16 = vmovl_u8(d1x2u8.val[0]);
+ q11u16 = vmovl_u8(d1x2u8.val[1]);
+
+ d16u16 = vget_low_u16(q8u16);
+ d17u16 = vget_high_u16(q8u16);
+ d18u16 = vget_low_u16(q9u16);
+ d19u16 = vget_high_u16(q9u16);
+ q8u16 = vcombine_u16(d16u16, d18u16); // vswp 17 18
+ q9u16 = vcombine_u16(d17u16, d19u16);
+
+ d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
+ d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); // vmov 23 21
+ for (width = w, psrc = src + 7, pdst = dst;
+ width > 0;
+ width -= 4, psrc += 4, pdst += 4) { // loop_horiz
+ s = psrc;  // next four pixels of each of the four rows
+ d28u32 = vld1_dup_u32((const uint32_t *)s);
+ s += src_stride;
+ d29u32 = vld1_dup_u32((const uint32_t *)s);
+ s += src_stride;
+ d31u32 = vld1_dup_u32((const uint32_t *)s);
+ s += src_stride;
+ d30u32 = vld1_dup_u32((const uint32_t *)s);
+
+ __builtin_prefetch(psrc + 64);
+
+ d0x2u16 = vtrn_u16(vreinterpret_u16_u32(d28u32),
+ vreinterpret_u16_u32(d31u32));
+ d1x2u16 = vtrn_u16(vreinterpret_u16_u32(d29u32),
+ vreinterpret_u16_u32(d30u32));
+ d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]), // d28
+ vreinterpret_u8_u16(d1x2u16.val[0])); // d29
+ d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]), // d31
+ vreinterpret_u8_u16(d1x2u16.val[1])); // d30
+
+ __builtin_prefetch(psrc + 64 + src_stride);
+
+ q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
+ q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);
+ q0x2u32 = vtrnq_u32(vreinterpretq_u32_u8(q14u8),
+ vreinterpretq_u32_u8(q15u8));
+
+ d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));
+ d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));
+ q12u16 = vmovl_u8(d28u8);
+ q13u16 = vmovl_u8(d29u8);
+
+ __builtin_prefetch(psrc + 64 + src_stride * 2);
+
+ d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
+ d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
+ d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
+ d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
+ d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+ d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+ d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+ d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+ d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+ // One call per output column; each call shifts the 8-column window by one source column.
+ q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16,
+ d18s16, d19s16, d23s16, d24s16, q0s16);
+ q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16,
+ d19s16, d23s16, d24s16, d26s16, q0s16);
+ q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16,
+ d23s16, d24s16, d26s16, d27s16, q0s16);
+ q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16,
+ d24s16, d26s16, d27s16, d25s16, q0s16);
+
+ __builtin_prefetch(psrc + 60 + src_stride * 3);  // NOTE(review): offset is 60 here but 64 in the other prefetches - confirm intent
+ // Rounding shift right by 7 with unsigned saturation, then narrow to 8 bits.
+ d2u16 = vqrshrun_n_s32(q1s32, 7);
+ d3u16 = vqrshrun_n_s32(q2s32, 7);
+ d4u16 = vqrshrun_n_s32(q14s32, 7);
+ d5u16 = vqrshrun_n_s32(q15s32, 7);
+
+ q1u16 = vcombine_u16(d2u16, d3u16);
+ q2u16 = vcombine_u16(d4u16, d5u16);
+
+ d2u8 = vqmovn_u16(q1u16);
+ d3u8 = vqmovn_u16(q2u16);
+ // Transpose the 4x4 result back so each 32-bit lane is one output row.
+ d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8),
+ vreinterpret_u16_u8(d3u8));
+ d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
+ vreinterpret_u32_u16(d0x2u16.val[1]));
+ d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
+ vreinterpret_u8_u32(d0x2u32.val[1]));
+
+ d2u32 = vreinterpret_u32_u8(d0x2u8.val[0]);
+ d3u32 = vreinterpret_u32_u8(d0x2u8.val[1]);
+
+ d = pdst;  // rows 0..3 come from d2[0], d3[0], d2[1], d3[1]
+ vst1_lane_u32((uint32_t *)d, d2u32, 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d3u32, 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d2u32, 1);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d3u32, 1);
+ // Slide the window of widened source columns four pixels to the right.
+ q8u16 = q9u16;
+ d20s16 = d23s16;
+ q11u16 = q12u16;
+ q9u16 = q13u16;
+ d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+ }
+ }
+ return;
+}
+
+// Vertical 8-tap convolution; the filtered pixels are stored straight to dst
+// (no averaging with the previous dst contents).
+//
+// src, src_stride: source plane; reads start three rows above src.
+// dst, dst_stride: destination plane, written in 4x4 tiles.
+// filter_y:        eight 16-bit taps, loaded with a single vld1q_s16().
+// y_step_q4:       only 16 is handled here; any other value is forwarded to
+//                  vpx_convolve8_vert_c().
+// filter_x, x_step_q4: not used by the NEON path, only passed to the C
+//                  fallback.
+// w, h:            block size. Tiles are always loaded and stored in full,
+//                  so both are presumably multiples of 4 (TODO confirm
+//                  against callers).
+void vpx_convolve8_vert_neon(
+    const uint8_t *src,
+    ptrdiff_t src_stride,
+    uint8_t *dst,
+    ptrdiff_t dst_stride,
+    const int16_t *filter_x,  // only forwarded to the C fallback
+    int x_step_q4,            // only forwarded to the C fallback
+    const int16_t *filter_y,
+    int y_step_q4,
+    int w,
+    int h) {
+  int height;
+  const uint8_t *s;
+  uint8_t *d;
+  uint32x2_t d2u32, d3u32;
+  // vld1_lane_u32() takes the previous vector value as an input, so every
+  // vector that is filled lane by lane must start out with a defined value.
+  uint32x2_t d16u32 = vdup_n_u32(0);
+  uint32x2_t d18u32 = vdup_n_u32(0);
+  uint32x2_t d20u32 = vdup_n_u32(0);
+  uint32x2_t d22u32 = vdup_n_u32(0);
+  uint32x2_t d24u32 = vdup_n_u32(0);
+  uint32x2_t d26u32 = vdup_n_u32(0);
+  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16;
+  int16x4_t d24s16, d25s16, d26s16, d27s16;
+  uint16x4_t d2u16, d3u16, d4u16, d5u16;
+  int16x8_t q0s16;
+  uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
+  int32x4_t q1s32, q2s32, q14s32, q15s32;
+
+  if (y_step_q4 != 16) {
+    vpx_convolve8_vert_c(src, src_stride, dst, dst_stride,
+                         filter_x, x_step_q4,
+                         filter_y, y_step_q4, w, h);
+    return;
+  }
+
+  src -= src_stride * 3;
+  q0s16 = vld1q_s16(filter_y);
+  for (; w > 0; w -= 4, src += 4, dst += 4) {  // loop_vert_h
+    // Prime the window with the first seven rows of this 4-pixel-wide strip
+    // (two rows per 64-bit vector; only lane 0 of d22u32 is used).
+    s = src;
+    d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0);
+    s += src_stride;
+    d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1);
+    s += src_stride;
+    d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0);
+    s += src_stride;
+    d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1);
+    s += src_stride;
+    d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0);
+    s += src_stride;
+    d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1);
+    s += src_stride;
+    d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0);
+    s += src_stride;
+
+    q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32));
+    q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32));
+    q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32));
+    q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32));
+
+    d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
+    d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
+    d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+    d = dst;
+    for (height = h; height > 0; height -= 4) {  // loop_vert
+      // Four new source rows. The lane order (24[0], 26[0], 26[1], 24[1])
+      // puts them in d24s16, d26s16, d27s16, d25s16 respectively.
+      d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0);
+      s += src_stride;
+      d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0);
+      s += src_stride;
+      d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1);
+      s += src_stride;
+      d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1);
+      s += src_stride;
+
+      q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32));
+      q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32));
+
+      d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
+      d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
+      d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
+      d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
+      d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+      d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+      d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+      d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+
+      // One call per output row; each call shifts the 8-row window down by
+      // one source row.
+      __builtin_prefetch(d);
+      __builtin_prefetch(d + dst_stride);
+      q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16,
+                             d20s16, d21s16, d22s16, d24s16, q0s16);
+      __builtin_prefetch(d + dst_stride * 2);
+      __builtin_prefetch(d + dst_stride * 3);
+      q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16,
+                             d21s16, d22s16, d24s16, d26s16, q0s16);
+      __builtin_prefetch(s);
+      __builtin_prefetch(s + src_stride);
+      q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16,
+                              d22s16, d24s16, d26s16, d27s16, q0s16);
+      __builtin_prefetch(s + src_stride * 2);
+      __builtin_prefetch(s + src_stride * 3);
+      q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16,
+                              d24s16, d26s16, d27s16, d25s16, q0s16);
+
+      // Rounding shift right by 7 with unsigned saturation, then narrow to
+      // 8 bits (again saturating).
+      d2u16 = vqrshrun_n_s32(q1s32, 7);
+      d3u16 = vqrshrun_n_s32(q2s32, 7);
+      d4u16 = vqrshrun_n_s32(q14s32, 7);
+      d5u16 = vqrshrun_n_s32(q15s32, 7);
+
+      q1u16 = vcombine_u16(d2u16, d3u16);
+      q2u16 = vcombine_u16(d4u16, d5u16);
+
+      d2u32 = vreinterpret_u32_u8(vqmovn_u16(q1u16));
+      d3u32 = vreinterpret_u32_u8(vqmovn_u16(q2u16));
+
+      vst1_lane_u32((uint32_t *)d, d2u32, 0);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d2u32, 1);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d3u32, 0);
+      d += dst_stride;
+      vst1_lane_u32((uint32_t *)d, d3u32, 1);
+      d += dst_stride;
+
+      // Slide the window of widened source rows down by four.
+      q8u16 = q10u16;
+      d18s16 = d22s16;
+      d19s16 = d24s16;
+      q10u16 = q13u16;
+      d22s16 = d25s16;
+    }
+  }
+  return;
+}
; VP9_FILTER_WEIGHT == 128
; VP9_FILTER_SHIFT == 7
- EXPORT |vp9_convolve8_horiz_neon|
- EXPORT |vp9_convolve8_vert_neon|
- IMPORT |vp9_convolve8_horiz_c|
- IMPORT |vp9_convolve8_vert_c|
+ EXPORT |vpx_convolve8_horiz_neon|
+ EXPORT |vpx_convolve8_vert_neon|
+ IMPORT |vpx_convolve8_horiz_c|
+ IMPORT |vpx_convolve8_vert_c|
ARM
REQUIRE8
PRESERVE8
; sp[]int w
; sp[]int h
-|vp9_convolve8_horiz_neon| PROC
+|vpx_convolve8_horiz_neon| PROC
ldr r12, [sp, #4] ; x_step_q4
cmp r12, #16
- bne vp9_convolve8_horiz_c
+ bne vpx_convolve8_horiz_c
push {r4-r10, lr}
mov r10, r6 ; w loop counter
-vp9_convolve8_loop_horiz_v
+vpx_convolve8_loop_horiz_v
vld1.8 {d24}, [r0], r1
vld1.8 {d25}, [r0], r1
vld1.8 {d26}, [r0], r1
add r0, r0, #3
-vp9_convolve8_loop_horiz
+vpx_convolve8_loop_horiz
add r5, r0, #64
vld1.32 {d28[]}, [r0], r1
vmov q9, q13
subs r6, r6, #4 ; w -= 4
- bgt vp9_convolve8_loop_horiz
+ bgt vpx_convolve8_loop_horiz
; outer loop
mov r6, r10 ; restore w counter
add r0, r0, r9 ; src += src_stride * 4 - w
add r2, r2, r12 ; dst += dst_stride * 4 - w
subs r7, r7, #4 ; h -= 4
- bgt vp9_convolve8_loop_horiz_v
+ bgt vpx_convolve8_loop_horiz_v
pop {r4-r10, pc}
ENDP
-|vp9_convolve8_vert_neon| PROC
+|vpx_convolve8_vert_neon| PROC
ldr r12, [sp, #12]
cmp r12, #16
- bne vp9_convolve8_vert_c
+ bne vpx_convolve8_vert_c
push {r4-r8, lr}
lsl r1, r1, #1
lsl r3, r3, #1
-vp9_convolve8_loop_vert_h
+vpx_convolve8_loop_vert_h
mov r4, r0
add r7, r0, r1, asr #1
mov r5, r2
vmovl.u8 q10, d20
vmovl.u8 q11, d22
-vp9_convolve8_loop_vert
+vpx_convolve8_loop_vert
; always process a 4x4 block at a time
vld1.u32 {d24[0]}, [r7], r1
vld1.u32 {d26[0]}, [r4], r1
vmov d22, d25
subs r12, r12, #4 ; h -= 4
- bgt vp9_convolve8_loop_vert
+ bgt vpx_convolve8_loop_vert
; outer loop
add r0, r0, #4
add r2, r2, #4
subs r6, r6, #4 ; w -= 4
- bgt vp9_convolve8_loop_vert_h
+ bgt vpx_convolve8_loop_vert_h
pop {r4-r8, pc}
--- /dev/null
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+/* Averages a w x h block of src into dst, updating dst in place.
+ *
+ * Every output byte is the NEON rounding halving add (vrhadd) of the source
+ * byte and the byte already stored in dst, i.e. (src + dst + 1) >> 1.
+ * filter_x, filter_x_stride, filter_y and filter_y_stride are ignored.
+ *
+ * The number of bytes processed per row is chosen from w:
+ *   w > 32      -> 64 bytes
+ *   w == 32     -> 32 bytes
+ *   8 < w < 32  -> 16 bytes
+ *   w == 8      ->  8 bytes
+ *   otherwise   ->  4 bytes
+ * Every branch except the 64-byte one consumes two rows per iteration, so h
+ * is presumably even for those widths -- TODO confirm against callers.
+ */
+void vpx_convolve_avg_neon(
+    const uint8_t *src,    // r0
+    ptrdiff_t src_stride,  // r1
+    uint8_t *dst,          // r2
+    ptrdiff_t dst_stride,  // r3
+    const int16_t *filter_x,
+    int filter_x_stride,
+    const int16_t *filter_y,
+    int filter_y_stride,
+    int w,
+    int h) {
+  uint8_t *d;
+  uint8x8_t d0u8, d1u8, d2u8, d3u8;
+  // vld1_lane_u32() takes the vector being updated as an input operand, so
+  // these must hold a defined value before the first lane load in avg4.
+  uint32x2_t d0u32 = vdup_n_u32(0);
+  uint32x2_t d2u32 = vdup_n_u32(0);
+  uint8x16_t q0u8, q1u8, q2u8, q3u8, q8u8, q9u8, q10u8, q11u8;
+  (void)filter_x; (void)filter_x_stride;
+  (void)filter_y; (void)filter_y_stride;
+
+  // d walks dst for the loads; dst itself is advanced by the stores.
+  d = dst;
+  if (w > 32) {  // avg64
+    for (; h > 0; h -= 1) {
+      q0u8 = vld1q_u8(src);
+      q1u8 = vld1q_u8(src + 16);
+      q2u8 = vld1q_u8(src + 32);
+      q3u8 = vld1q_u8(src + 48);
+      src += src_stride;
+      q8u8 = vld1q_u8(d);
+      q9u8 = vld1q_u8(d + 16);
+      q10u8 = vld1q_u8(d + 32);
+      q11u8 = vld1q_u8(d + 48);
+      d += dst_stride;
+
+      q0u8 = vrhaddq_u8(q0u8, q8u8);
+      q1u8 = vrhaddq_u8(q1u8, q9u8);
+      q2u8 = vrhaddq_u8(q2u8, q10u8);
+      q3u8 = vrhaddq_u8(q3u8, q11u8);
+
+      vst1q_u8(dst, q0u8);
+      vst1q_u8(dst + 16, q1u8);
+      vst1q_u8(dst + 32, q2u8);
+      vst1q_u8(dst + 48, q3u8);
+      dst += dst_stride;
+    }
+  } else if (w == 32) {  // avg32
+    for (; h > 0; h -= 2) {
+      q0u8 = vld1q_u8(src);
+      q1u8 = vld1q_u8(src + 16);
+      src += src_stride;
+      q2u8 = vld1q_u8(src);
+      q3u8 = vld1q_u8(src + 16);
+      src += src_stride;
+      q8u8 = vld1q_u8(d);
+      q9u8 = vld1q_u8(d + 16);
+      d += dst_stride;
+      q10u8 = vld1q_u8(d);
+      q11u8 = vld1q_u8(d + 16);
+      d += dst_stride;
+
+      q0u8 = vrhaddq_u8(q0u8, q8u8);
+      q1u8 = vrhaddq_u8(q1u8, q9u8);
+      q2u8 = vrhaddq_u8(q2u8, q10u8);
+      q3u8 = vrhaddq_u8(q3u8, q11u8);
+
+      vst1q_u8(dst, q0u8);
+      vst1q_u8(dst + 16, q1u8);
+      dst += dst_stride;
+      vst1q_u8(dst, q2u8);
+      vst1q_u8(dst + 16, q3u8);
+      dst += dst_stride;
+    }
+  } else if (w > 8) {  // avg16
+    for (; h > 0; h -= 2) {
+      q0u8 = vld1q_u8(src);
+      src += src_stride;
+      q1u8 = vld1q_u8(src);
+      src += src_stride;
+      q2u8 = vld1q_u8(d);
+      d += dst_stride;
+      q3u8 = vld1q_u8(d);
+      d += dst_stride;
+
+      q0u8 = vrhaddq_u8(q0u8, q2u8);
+      q1u8 = vrhaddq_u8(q1u8, q3u8);
+
+      vst1q_u8(dst, q0u8);
+      dst += dst_stride;
+      vst1q_u8(dst, q1u8);
+      dst += dst_stride;
+    }
+  } else if (w == 8) {  // avg8
+    for (; h > 0; h -= 2) {
+      d0u8 = vld1_u8(src);
+      src += src_stride;
+      d1u8 = vld1_u8(src);
+      src += src_stride;
+      d2u8 = vld1_u8(d);
+      d += dst_stride;
+      d3u8 = vld1_u8(d);
+      d += dst_stride;
+
+      // Pack two 8-byte rows into one 16-byte vector to average them at once.
+      q0u8 = vcombine_u8(d0u8, d1u8);
+      q1u8 = vcombine_u8(d2u8, d3u8);
+      q0u8 = vrhaddq_u8(q0u8, q1u8);
+
+      vst1_u8(dst, vget_low_u8(q0u8));
+      dst += dst_stride;
+      vst1_u8(dst, vget_high_u8(q0u8));
+      dst += dst_stride;
+    }
+  } else {  // avg4
+    for (; h > 0; h -= 2) {
+      // Lane 0 holds the 4 bytes of the first row, lane 1 those of the second.
+      d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 0);
+      src += src_stride;
+      d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 1);
+      src += src_stride;
+      d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 0);
+      d += dst_stride;
+      d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 1);
+      d += dst_stride;
+
+      d0u8 = vrhadd_u8(vreinterpret_u8_u32(d0u32),
+                       vreinterpret_u8_u32(d2u32));
+
+      d0u32 = vreinterpret_u32_u8(d0u8);
+      vst1_lane_u32((uint32_t *)dst, d0u32, 0);
+      dst += dst_stride;
+      vst1_lane_u32((uint32_t *)dst, d0u32, 1);
+      dst += dst_stride;
+    }
+  }
+  return;
+}
; be found in the AUTHORS file in the root of the source tree.
;
- EXPORT |vp9_convolve_avg_neon|
+ EXPORT |vpx_convolve_avg_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
-|vp9_convolve_avg_neon| PROC
+|vpx_convolve_avg_neon| PROC
push {r4-r6, lr}
ldrd r4, r5, [sp, #32]
mov r6, r2
--- /dev/null
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <string.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+/* Copies a w x h block of bytes from src to dst.
+ *
+ * filter_x, filter_x_stride, filter_y and filter_y_stride are ignored.
+ *
+ * The number of bytes copied per row is chosen from w:
+ *   w > 32      -> 64 bytes
+ *   w == 32     -> 32 bytes
+ *   8 < w < 32  -> 16 bytes
+ *   w == 8      ->  8 bytes
+ *   otherwise   ->  4 bytes
+ * The 32-, 16- and 8-byte branches copy two rows per iteration, so h is
+ * presumably even for those widths -- TODO confirm against callers.
+ */
+void vpx_convolve_copy_neon(
+    const uint8_t *src,    // r0
+    ptrdiff_t src_stride,  // r1
+    uint8_t *dst,          // r2
+    ptrdiff_t dst_stride,  // r3
+    const int16_t *filter_x,
+    int filter_x_stride,
+    const int16_t *filter_y,
+    int filter_y_stride,
+    int w,
+    int h) {
+  uint8x8_t d0u8, d2u8;
+  uint8x16_t q0u8, q1u8, q2u8, q3u8;
+  (void)filter_x; (void)filter_x_stride;
+  (void)filter_y; (void)filter_y_stride;
+
+  if (w > 32) {  // copy64
+    for (; h > 0; h--) {
+      q0u8 = vld1q_u8(src);
+      q1u8 = vld1q_u8(src + 16);
+      q2u8 = vld1q_u8(src + 32);
+      q3u8 = vld1q_u8(src + 48);
+      src += src_stride;
+
+      vst1q_u8(dst, q0u8);
+      vst1q_u8(dst + 16, q1u8);
+      vst1q_u8(dst + 32, q2u8);
+      vst1q_u8(dst + 48, q3u8);
+      dst += dst_stride;
+    }
+  } else if (w == 32) {  // copy32
+    for (; h > 0; h -= 2) {
+      q0u8 = vld1q_u8(src);
+      q1u8 = vld1q_u8(src + 16);
+      src += src_stride;
+      q2u8 = vld1q_u8(src);
+      q3u8 = vld1q_u8(src + 16);
+      src += src_stride;
+
+      vst1q_u8(dst, q0u8);
+      vst1q_u8(dst + 16, q1u8);
+      dst += dst_stride;
+      vst1q_u8(dst, q2u8);
+      vst1q_u8(dst + 16, q3u8);
+      dst += dst_stride;
+    }
+  } else if (w > 8) {  // copy16
+    for (; h > 0; h -= 2) {
+      q0u8 = vld1q_u8(src);
+      src += src_stride;
+      q1u8 = vld1q_u8(src);
+      src += src_stride;
+
+      vst1q_u8(dst, q0u8);
+      dst += dst_stride;
+      vst1q_u8(dst, q1u8);
+      dst += dst_stride;
+    }
+  } else if (w == 8) {  // copy8
+    for (; h > 0; h -= 2) {
+      d0u8 = vld1_u8(src);
+      src += src_stride;
+      d2u8 = vld1_u8(src);
+      src += src_stride;
+
+      vst1_u8(dst, d0u8);
+      dst += dst_stride;
+      vst1_u8(dst, d2u8);
+      dst += dst_stride;
+    }
+  } else {  // copy4
+    for (; h > 0; h--) {
+      // memcpy rather than a uint32_t-cast load/store: the buffers are byte
+      // arrays, so a 32-bit lvalue access would break strict aliasing and
+      // could be misaligned.
+      memcpy(dst, src, 4);
+      src += src_stride;
+      dst += dst_stride;
+    }
+  }
+  return;
+}
; be found in the AUTHORS file in the root of the source tree.
;
- EXPORT |vp9_convolve_copy_neon|
+ EXPORT |vpx_convolve_copy_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
-|vp9_convolve_copy_neon| PROC
+|vpx_convolve_copy_neon| PROC
push {r4-r5, lr}
ldrd r4, r5, [sp, #28]
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_common.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_ports/mem.h"
-void vp9_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int intermediate_height = h + 7;
if (x_step_q4 != 16 || y_step_q4 != 16) {
- vp9_convolve8_c(src, src_stride,
+ vpx_convolve8_c(src, src_stride,
dst, dst_stride,
filter_x, x_step_q4,
filter_y, y_step_q4,
* the temp buffer which has lots of extra room and is subsequently discarded
* this is safe if somewhat less than ideal.
*/
- vp9_convolve8_horiz_neon(src - src_stride * 3, src_stride,
+ vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride,
temp, 64,
filter_x, x_step_q4, filter_y, y_step_q4,
w, intermediate_height);
/* Step into the temp buffer 3 lines to get the actual frame data */
- vp9_convolve8_vert_neon(temp + 64 * 3, 64,
+ vpx_convolve8_vert_neon(temp + 64 * 3, 64,
dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
}
-void vp9_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int intermediate_height = h + 7;
if (x_step_q4 != 16 || y_step_q4 != 16) {
- vp9_convolve8_avg_c(src, src_stride,
+ vpx_convolve8_avg_c(src, src_stride,
dst, dst_stride,
filter_x, x_step_q4,
filter_y, y_step_q4,
/* This implementation has the same issues as above. In addition, we only want
* to average the values after both passes.
*/
- vp9_convolve8_horiz_neon(src - src_stride * 3, src_stride,
+ vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride,
temp, 64,
filter_x, x_step_q4, filter_y, y_step_q4,
w, intermediate_height);
- vp9_convolve8_avg_vert_neon(temp + 64 * 3,
+ vpx_convolve8_avg_vert_neon(temp + 64 * 3,
64, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <stdlib.h>
+
#include "./vpx_config.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_ports/mem.h"
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "./vp9_rtcd.h"
-#include "vp9/common/mips/msa/vp9_convolve_msa.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/vpx_convolve_msa.h"
static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src,
int32_t src_stride,
}
}
-void vp9_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int8_t cnt, filt_hor[8];
if (16 != x_step_q4) {
- vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
+ vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
return;
}
if (((const int32_t *)filter_x)[1] == 0x800000) {
- vp9_convolve_avg(src, src_stride, dst, dst_stride,
+ vpx_convolve_avg(src, src_stride, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
return;
&filt_hor[3], h);
break;
default:
- vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
+ vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
break;
filt_hor, h);
break;
default:
- vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
+ vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
break;
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "./vp9_rtcd.h"
-#include "vp9/common/mips/msa/vp9_convolve_msa.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/vpx_convolve_msa.h"
static void common_hv_8ht_8vt_and_aver_dst_4w_msa(const uint8_t *src,
int32_t src_stride,
}
}
-void vp9_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int8_t cnt, filt_hor[8], filt_ver[8];
if (16 != x_step_q4 || 16 != y_step_q4) {
- vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,
+ vpx_convolve8_avg_c(src, src_stride, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
return;
if (((const int32_t *)filter_x)[1] == 0x800000 &&
((const int32_t *)filter_y)[1] == 0x800000) {
- vp9_convolve_avg(src, src_stride, dst, dst_stride,
+ vpx_convolve_avg(src, src_stride, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
return;
&filt_hor[3], &filt_ver[3], h);
break;
default:
- vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,
+ vpx_convolve8_avg_c(src, src_stride, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
break;
}
} else if (((const int32_t *)filter_x)[0] == 0 ||
((const int32_t *)filter_y)[0] == 0) {
- vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,
+ vpx_convolve8_avg_c(src, src_stride, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
} else {
filt_hor, filt_ver, h);
break;
default:
- vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,
+ vpx_convolve8_avg_c(src, src_stride, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
break;
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "./vp9_rtcd.h"
-#include "vp9/common/mips/msa/vp9_convolve_msa.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/vpx_convolve_msa.h"
static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src,
int32_t src_stride,
}
}
-void vp9_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int8_t cnt, filt_ver[8];
if (16 != y_step_q4) {
- vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
+ vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
return;
}
if (((const int32_t *)filter_y)[1] == 0x800000) {
- vp9_convolve_avg(src, src_stride, dst, dst_stride,
+ vpx_convolve_avg(src, src_stride, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
return;
&filt_ver[3], h);
break;
default:
- vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
+ vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
break;
filt_ver, h);
break;
default:
- vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
+ vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
break;
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "./vp9_rtcd.h"
-#include "vp9/common/mips/msa/vp9_convolve_msa.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/vpx_convolve_msa.h"
static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
}
}
-void vp9_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int8_t cnt, filt_hor[8];
if (16 != x_step_q4) {
- vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
+ vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
return;
}
if (((const int32_t *)filter_x)[1] == 0x800000) {
- vp9_convolve_copy(src, src_stride, dst, dst_stride,
+ vpx_convolve_copy(src, src_stride, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
return;
&filt_hor[3], h);
break;
default:
- vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
+ vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
break;
filt_hor, h);
break;
default:
- vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
+ vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
break;
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "./vp9_rtcd.h"
-#include "vp9/common/mips/msa/vp9_convolve_msa.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/vpx_convolve_msa.h"
const uint8_t mc_filt_mask_arr[16 * 3] = {
/* 8 width cases */
}
}
-void vp9_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int32_t x_step_q4,
const int16_t *filter_y, int32_t y_step_q4,
int8_t cnt, filt_hor[8], filt_ver[8];
if (16 != x_step_q4 || 16 != y_step_q4) {
- vp9_convolve8_c(src, src_stride, dst, dst_stride,
+ vpx_convolve8_c(src, src_stride, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
return;
if (((const int32_t *)filter_x)[1] == 0x800000 &&
((const int32_t *)filter_y)[1] == 0x800000) {
- vp9_convolve_copy(src, src_stride, dst, dst_stride,
+ vpx_convolve_copy(src, src_stride, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
return;
&filt_hor[3], &filt_ver[3], (int32_t)h);
break;
default:
- vp9_convolve8_c(src, src_stride, dst, dst_stride,
+ vpx_convolve8_c(src, src_stride, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
break;
}
} else if (((const int32_t *)filter_x)[0] == 0 ||
((const int32_t *)filter_y)[0] == 0) {
- vp9_convolve8_c(src, src_stride, dst, dst_stride,
+ vpx_convolve8_c(src, src_stride, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
} else {
filt_hor, filt_ver, (int32_t)h);
break;
default:
- vp9_convolve8_c(src, src_stride, dst, dst_stride,
+ vpx_convolve8_c(src, src_stride, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
break;
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "./vp9_rtcd.h"
-#include "vp9/common/mips/msa/vp9_convolve_msa.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/vpx_convolve_msa.h"
static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride,
}
}
-void vp9_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int8_t cnt, filt_ver[8];
if (16 != y_step_q4) {
- vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
+ vpx_convolve8_vert_c(src, src_stride, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
return;
}
if (((const int32_t *)filter_y)[1] == 0x800000) {
- vp9_convolve_copy(src, src_stride, dst, dst_stride,
+ vpx_convolve_copy(src, src_stride, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
return;
&filt_ver[3], h);
break;
default:
- vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
+ vpx_convolve8_vert_c(src, src_stride, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
break;
filt_ver, h);
break;
default:
- vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
+ vpx_convolve8_vert_c(src, src_stride, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h);
break;
}
}
-void vp9_convolve_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_convolve_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int32_t filter_x_stride,
const int16_t *filter_y, int32_t filter_y_stride,
copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 64);
}
-void vp9_convolve_copy_msa(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_convolve_copy_msa(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int32_t filter_x_stride,
const int16_t *filter_y, int32_t filter_y_stride,
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef VP9_COMMON_MIPS_MSA_VP9_CONVOLVE_MSA_H_
-#define VP9_COMMON_MIPS_MSA_VP9_CONVOLVE_MSA_H_
+#ifndef VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_
+#define VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_
-#include "vp9/common/vp9_filter.h"
#include "vpx_dsp/mips/macros_msa.h"
+#include "vpx_dsp/vpx_filter.h"
extern const uint8_t mc_filt_mask_arr[16 * 3];
AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \
ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \
}
-#endif /* VP9_COMMON_MIPS_MSA_VP9_CONVOLVE_MSA_H_ */
+#endif /* VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_ */
*/
#include <assert.h>
+#include <string.h>
#include "./vpx_config.h"
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_convolve.h"
-#include "vp9/common/vp9_filter.h"
+#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_convolve.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
#include "vpx_ports/mem.h"
static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
return (int)((const InterpKernel *)(intptr_t)f - base);
}
-void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
x0_q4, x_step_q4, w, h);
}
-void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
x0_q4, x_step_q4, w, h);
}
-void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
y0_q4, y_step_q4, w, h);
}
-void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
y0_q4, y_step_q4, w, h);
}
-void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
filters_y, y0_q4, y_step_q4, w, h);
}
-void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
assert(w <= 64);
assert(h <= 64);
- vp9_convolve8_c(src, src_stride, temp, 64,
+ vpx_convolve8_c(src, src_stride, temp, 64,
filter_x, x_step_q4, filter_y, y_step_q4, w, h);
- vp9_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h);
+ vpx_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h);
}
-void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int filter_x_stride,
const int16_t *filter_y, int filter_y_stride,
}
}
-void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int filter_x_stride,
const int16_t *filter_y, int filter_y_stride,
}
-void vp9_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
x0_q4, x_step_q4, w, h, bd);
}
-void vp9_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
x0_q4, x_step_q4, w, h, bd);
}
-void vp9_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
y0_q4, y_step_q4, w, h, bd);
}
-void vp9_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
y0_q4, y_step_q4, w, h, bd);
}
-void vp9_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
filters_y, y0_q4, y_step_q4, w, h, bd);
}
-void vp9_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
+void vpx_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
assert(w <= 64);
assert(h <= 64);
- vp9_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), 64,
+ vpx_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), 64,
filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd);
- vp9_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), 64, dst, dst_stride,
+ vpx_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), 64, dst, dst_stride,
NULL, 0, NULL, 0, w, h, bd);
}
-void vp9_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride,
+void vpx_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride,
uint8_t *dst8, ptrdiff_t dst_stride,
const int16_t *filter_x, int filter_x_stride,
const int16_t *filter_y, int filter_y_stride,
}
}
-void vp9_highbd_convolve_avg_c(const uint8_t *src8, ptrdiff_t src_stride,
+void vpx_highbd_convolve_avg_c(const uint8_t *src8, ptrdiff_t src_stride,
uint8_t *dst8, ptrdiff_t dst_stride,
const int16_t *filter_x, int filter_x_stride,
const int16_t *filter_y, int filter_y_stride,
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef VP9_COMMON_VP9_CONVOLVE_H_
-#define VP9_COMMON_VP9_CONVOLVE_H_
+#ifndef VPX_DSP_VPX_CONVOLVE_H_
+#define VPX_DSP_VPX_CONVOLVE_H_
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
} // extern "C"
#endif
-#endif // VP9_COMMON_VP9_CONVOLVE_H_
+#endif // VPX_DSP_VPX_CONVOLVE_H_
DSP_SRCS-$(HAVE_DSPR2) += mips/intrapred16_dspr2.c
endif # CONFIG_VP9
+# interpolation filters
+DSP_SRCS-yes += vpx_convolve.c
+DSP_SRCS-yes += vpx_convolve.h
+DSP_SRCS-yes += vpx_filter.h
+
+DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/convolve.h
+DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/vpx_asm_stubs.c
+DSP_SRCS-$(HAVE_SSE2) += x86/vpx_subpixel_8t_sse2.asm
+DSP_SRCS-$(HAVE_SSE2) += x86/vpx_subpixel_bilinear_sse2.asm
+DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_ssse3.asm
+DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_bilinear_ssse3.asm
+DSP_SRCS-$(HAVE_AVX2) += x86/vpx_subpixel_8t_intrin_avx2.c
+DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_intrin_ssse3.c
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_SSE2) += x86/vpx_high_subpixel_8t_sse2.asm
+DSP_SRCS-$(HAVE_SSE2) += x86/vpx_high_subpixel_bilinear_sse2.asm
+endif
+ifeq ($(CONFIG_USE_X86INC),yes)
+DSP_SRCS-$(HAVE_SSE2) += x86/vpx_convolve_copy_sse2.asm
+endif
+
+ifeq ($(HAVE_NEON_ASM),yes)
+DSP_SRCS-yes += arm/vpx_convolve_copy_neon_asm$(ASM)
+DSP_SRCS-yes += arm/vpx_convolve8_avg_neon_asm$(ASM)
+DSP_SRCS-yes += arm/vpx_convolve8_neon_asm$(ASM)
+DSP_SRCS-yes += arm/vpx_convolve_avg_neon_asm$(ASM)
+DSP_SRCS-yes += arm/vpx_convolve_neon.c
+else
+ifeq ($(HAVE_NEON),yes)
+DSP_SRCS-yes += arm/vpx_convolve_copy_neon.c
+DSP_SRCS-yes += arm/vpx_convolve8_avg_neon.c
+DSP_SRCS-yes += arm/vpx_convolve8_neon.c
+DSP_SRCS-yes += arm/vpx_convolve_avg_neon.c
+DSP_SRCS-yes += arm/vpx_convolve_neon.c
+endif # HAVE_NEON
+endif # HAVE_NEON_ASM
+
+# common (msa)
+DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_avg_horiz_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_avg_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_avg_vert_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_horiz_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_vert_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve_avg_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve_copy_msa.c
+DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve_msa.h
+
# loop filters
DSP_SRCS-yes += loopfilter.c
#ifndef VPX_DSP_COMMON_H_
#define VPX_DSP_COMMON_H_
-#include <stdlib.h>
-
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"
}
}
+# optimizations which depend on multiple features
+$avx2_ssse3 = '';
+if ((vpx_config("HAVE_AVX2") eq "yes") && (vpx_config("HAVE_SSSE3") eq "yes")) {
+ $avx2_ssse3 = 'avx2';
+}
+
# functions that are 64 bit only.
$mmx_x86_64 = $sse2_x86_64 = $ssse3_x86_64 = $avx_x86_64 = $avx2_x86_64 = '';
if ($opts{arch} eq "x86_64") {
} # CONFIG_VP9
#
+# Sub Pixel Filters
+#
+add_proto qw/void vpx_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vpx_convolve_copy neon msa/, "$sse2_x86inc";
+
+add_proto qw/void vpx_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vpx_convolve_avg neon msa/, "$sse2_x86inc";
+
+add_proto qw/void vpx_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vpx_convolve8 sse2 ssse3 neon msa/, "$avx2_ssse3";
+
+add_proto qw/void vpx_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vpx_convolve8_horiz sse2 ssse3 neon msa/, "$avx2_ssse3";
+
+add_proto qw/void vpx_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vpx_convolve8_vert sse2 ssse3 neon msa/, "$avx2_ssse3";
+
+add_proto qw/void vpx_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vpx_convolve8_avg sse2 ssse3 neon msa/;
+
+add_proto qw/void vpx_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 neon msa/;
+
+add_proto qw/void vpx_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vpx_convolve8_avg_vert sse2 ssse3 neon msa/;
+
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+ #
+ # Sub Pixel Filters
+ #
+ add_proto qw/void vpx_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ specialize qw/vpx_highbd_convolve_copy/;
+
+ add_proto qw/void vpx_highbd_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ specialize qw/vpx_highbd_convolve_avg/;
+
+ add_proto qw/void vpx_highbd_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ specialize qw/vpx_highbd_convolve8/, "$sse2_x86_64";
+
+ add_proto qw/void vpx_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ specialize qw/vpx_highbd_convolve8_horiz/, "$sse2_x86_64";
+
+ add_proto qw/void vpx_highbd_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ specialize qw/vpx_highbd_convolve8_vert/, "$sse2_x86_64";
+
+ add_proto qw/void vpx_highbd_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ specialize qw/vpx_highbd_convolve8_avg/, "$sse2_x86_64";
+
+ add_proto qw/void vpx_highbd_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ specialize qw/vpx_highbd_convolve8_avg_horiz/, "$sse2_x86_64";
+
+ add_proto qw/void vpx_highbd_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+ specialize qw/vpx_highbd_convolve8_avg_vert/, "$sse2_x86_64";
+} # CONFIG_VP9_HIGHBITDEPTH
+
+#
# Loopfilter
#
add_proto qw/void vpx_lpf_vertical_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
--- /dev/null
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_VPX_FILTER_H_
+#define VPX_DSP_VPX_FILTER_H_
+
+#include "vpx/vpx_integer.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define FILTER_BITS 7  // NOTE(review): presumably the fixed-point precision of filter taps (1 << 7 == 128) - confirm against users
+
+#define SUBPEL_BITS 4  // bit width from which SUBPEL_MASK and SUBPEL_SHIFTS are derived
+#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)  // evaluates to 15 (low SUBPEL_BITS bits set)
+#define SUBPEL_SHIFTS (1 << SUBPEL_BITS)  // evaluates to 16
+#define SUBPEL_TAPS 8  // number of coefficients in one InterpKernel
+
+typedef int16_t InterpKernel[SUBPEL_TAPS];  // one kernel: SUBPEL_TAPS (8) int16_t coefficients
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_DSP_VPX_FILTER_H_
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef VP9_COMMON_X86_CONVOLVE_H_
-#define VP9_COMMON_X86_CONVOLVE_H_
+#ifndef VPX_DSP_X86_CONVOLVE_H_
+#define VPX_DSP_X86_CONVOLVE_H_
#include <assert.h>
);
#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
- void vp9_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \
+ void vpx_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \
uint8_t *dst, ptrdiff_t dst_stride, \
const int16_t *filter_x, int x_step_q4, \
const int16_t *filter_y, int y_step_q4, \
if (step_q4 == 16 && filter[3] != 128) { \
if (filter[0] || filter[1] || filter[2]) { \
while (w >= 16) { \
- vp9_filter_block1d16_##dir##8_##avg##opt(src_start, \
+ vpx_filter_block1d16_##dir##8_##avg##opt(src_start, \
src_stride, \
dst, \
dst_stride, \
w -= 16; \
} \
while (w >= 8) { \
- vp9_filter_block1d8_##dir##8_##avg##opt(src_start, \
+ vpx_filter_block1d8_##dir##8_##avg##opt(src_start, \
src_stride, \
dst, \
dst_stride, \
w -= 8; \
} \
while (w >= 4) { \
- vp9_filter_block1d4_##dir##8_##avg##opt(src_start, \
+ vpx_filter_block1d4_##dir##8_##avg##opt(src_start, \
src_stride, \
dst, \
dst_stride, \
} \
} else { \
while (w >= 16) { \
- vp9_filter_block1d16_##dir##2_##avg##opt(src, \
+ vpx_filter_block1d16_##dir##2_##avg##opt(src, \
src_stride, \
dst, \
dst_stride, \
w -= 16; \
} \
while (w >= 8) { \
- vp9_filter_block1d8_##dir##2_##avg##opt(src, \
+ vpx_filter_block1d8_##dir##2_##avg##opt(src, \
src_stride, \
dst, \
dst_stride, \
w -= 8; \
} \
while (w >= 4) { \
- vp9_filter_block1d4_##dir##2_##avg##opt(src, \
+ vpx_filter_block1d4_##dir##2_##avg##opt(src, \
src_stride, \
dst, \
dst_stride, \
} \
} \
if (w) { \
- vp9_convolve8_##name##_c(src, src_stride, dst, dst_stride, \
+ vpx_convolve8_##name##_c(src, src_stride, dst, dst_stride, \
filter_x, x_step_q4, filter_y, y_step_q4, \
w, h); \
} \
}
#define FUN_CONV_2D(avg, opt) \
-void vp9_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
+void vpx_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
uint8_t *dst, ptrdiff_t dst_stride, \
const int16_t *filter_x, int x_step_q4, \
const int16_t *filter_y, int y_step_q4, \
if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \
filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \
DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]); \
- vp9_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \
+ vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \
filter_x, x_step_q4, filter_y, y_step_q4, \
w, h + 7); \
- vp9_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \
+ vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \
filter_x, x_step_q4, filter_y, \
y_step_q4, w, h); \
} else { \
DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65]); \
- vp9_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \
+ vpx_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \
filter_x, x_step_q4, filter_y, y_step_q4, \
w, h + 1); \
- vp9_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \
+ vpx_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \
filter_x, x_step_q4, filter_y, \
y_step_q4, w, h); \
} \
} else { \
- vp9_convolve8_##avg##c(src, src_stride, dst, dst_stride, \
+ vpx_convolve8_##avg##c(src, src_stride, dst, dst_stride, \
filter_x, x_step_q4, filter_y, y_step_q4, w, h); \
} \
}
);
#define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
- void vp9_highbd_convolve8_##name##_##opt(const uint8_t *src8, \
+ void vpx_highbd_convolve8_##name##_##opt(const uint8_t *src8, \
ptrdiff_t src_stride, \
uint8_t *dst8, \
ptrdiff_t dst_stride, \
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
if (filter[0] || filter[1] || filter[2]) { \
while (w >= 16) { \
- vp9_highbd_filter_block1d16_##dir##8_##avg##opt(src_start, \
+ vpx_highbd_filter_block1d16_##dir##8_##avg##opt(src_start, \
src_stride, \
dst, \
dst_stride, \
w -= 16; \
} \
while (w >= 8) { \
- vp9_highbd_filter_block1d8_##dir##8_##avg##opt(src_start, \
+ vpx_highbd_filter_block1d8_##dir##8_##avg##opt(src_start, \
src_stride, \
dst, \
dst_stride, \
w -= 8; \
} \
while (w >= 4) { \
- vp9_highbd_filter_block1d4_##dir##8_##avg##opt(src_start, \
+ vpx_highbd_filter_block1d4_##dir##8_##avg##opt(src_start, \
src_stride, \
dst, \
dst_stride, \
} \
} else { \
while (w >= 16) { \
- vp9_highbd_filter_block1d16_##dir##2_##avg##opt(src, \
+ vpx_highbd_filter_block1d16_##dir##2_##avg##opt(src, \
src_stride, \
dst, \
dst_stride, \
w -= 16; \
} \
while (w >= 8) { \
- vp9_highbd_filter_block1d8_##dir##2_##avg##opt(src, \
+ vpx_highbd_filter_block1d8_##dir##2_##avg##opt(src, \
src_stride, \
dst, \
dst_stride, \
w -= 8; \
} \
while (w >= 4) { \
- vp9_highbd_filter_block1d4_##dir##2_##avg##opt(src, \
+ vpx_highbd_filter_block1d4_##dir##2_##avg##opt(src, \
src_stride, \
dst, \
dst_stride, \
} \
} \
if (w) { \
- vp9_highbd_convolve8_##name##_c(src8, src_stride, dst8, dst_stride, \
+ vpx_highbd_convolve8_##name##_c(src8, src_stride, dst8, dst_stride, \
filter_x, x_step_q4, filter_y, y_step_q4, \
w, h, bd); \
} \
}
#define HIGH_FUN_CONV_2D(avg, opt) \
-void vp9_highbd_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
+void vpx_highbd_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
uint8_t *dst, ptrdiff_t dst_stride, \
const int16_t *filter_x, int x_step_q4, \
const int16_t *filter_y, int y_step_q4, \
if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \
filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \
DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \
- vp9_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \
+ vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \
CONVERT_TO_BYTEPTR(fdata2), 64, \
filter_x, x_step_q4, \
filter_y, y_step_q4, \
w, h + 7, bd); \
- vp9_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2) + 192, \
+ vpx_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2) + 192, \
64, dst, dst_stride, \
filter_x, x_step_q4, \
filter_y, y_step_q4, \
w, h, bd); \
} else { \
DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \
- vp9_highbd_convolve8_horiz_##opt(src, src_stride, \
+ vpx_highbd_convolve8_horiz_##opt(src, src_stride, \
CONVERT_TO_BYTEPTR(fdata2), 64, \
filter_x, x_step_q4, \
filter_y, y_step_q4, \
w, h + 1, bd); \
- vp9_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2), 64, \
+ vpx_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2), 64, \
dst, dst_stride, \
filter_x, x_step_q4, \
filter_y, y_step_q4, \
w, h, bd); \
} \
} else { \
- vp9_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \
+ vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \
filter_x, x_step_q4, filter_y, y_step_q4, w, \
h, bd); \
} \
}
#endif // CONFIG_VP9_HIGHBITDEPTH
-#endif // VP9_COMMON_X86_CONVOLVE_H_
+#endif // VPX_DSP_X86_CONVOLVE_H_
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "./vp9_rtcd.h"
#include "./vpx_config.h"
-#include "vp9/common/x86/convolve.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/convolve.h"
#if HAVE_SSE2
-filter8_1dfunction vp9_filter_block1d16_v8_sse2;
-filter8_1dfunction vp9_filter_block1d16_h8_sse2;
-filter8_1dfunction vp9_filter_block1d8_v8_sse2;
-filter8_1dfunction vp9_filter_block1d8_h8_sse2;
-filter8_1dfunction vp9_filter_block1d4_v8_sse2;
-filter8_1dfunction vp9_filter_block1d4_h8_sse2;
-filter8_1dfunction vp9_filter_block1d16_v8_avg_sse2;
-filter8_1dfunction vp9_filter_block1d16_h8_avg_sse2;
-filter8_1dfunction vp9_filter_block1d8_v8_avg_sse2;
-filter8_1dfunction vp9_filter_block1d8_h8_avg_sse2;
-filter8_1dfunction vp9_filter_block1d4_v8_avg_sse2;
-filter8_1dfunction vp9_filter_block1d4_h8_avg_sse2;
+filter8_1dfunction vpx_filter_block1d16_v8_sse2;
+filter8_1dfunction vpx_filter_block1d16_h8_sse2;
+filter8_1dfunction vpx_filter_block1d8_v8_sse2;
+filter8_1dfunction vpx_filter_block1d8_h8_sse2;
+filter8_1dfunction vpx_filter_block1d4_v8_sse2;
+filter8_1dfunction vpx_filter_block1d4_h8_sse2;
+filter8_1dfunction vpx_filter_block1d16_v8_avg_sse2;
+filter8_1dfunction vpx_filter_block1d16_h8_avg_sse2;
+filter8_1dfunction vpx_filter_block1d8_v8_avg_sse2;
+filter8_1dfunction vpx_filter_block1d8_h8_avg_sse2;
+filter8_1dfunction vpx_filter_block1d4_v8_avg_sse2;
+filter8_1dfunction vpx_filter_block1d4_h8_avg_sse2;
-filter8_1dfunction vp9_filter_block1d16_v2_sse2;
-filter8_1dfunction vp9_filter_block1d16_h2_sse2;
-filter8_1dfunction vp9_filter_block1d8_v2_sse2;
-filter8_1dfunction vp9_filter_block1d8_h2_sse2;
-filter8_1dfunction vp9_filter_block1d4_v2_sse2;
-filter8_1dfunction vp9_filter_block1d4_h2_sse2;
-filter8_1dfunction vp9_filter_block1d16_v2_avg_sse2;
-filter8_1dfunction vp9_filter_block1d16_h2_avg_sse2;
-filter8_1dfunction vp9_filter_block1d8_v2_avg_sse2;
-filter8_1dfunction vp9_filter_block1d8_h2_avg_sse2;
-filter8_1dfunction vp9_filter_block1d4_v2_avg_sse2;
-filter8_1dfunction vp9_filter_block1d4_h2_avg_sse2;
+filter8_1dfunction vpx_filter_block1d16_v2_sse2;
+filter8_1dfunction vpx_filter_block1d16_h2_sse2;
+filter8_1dfunction vpx_filter_block1d8_v2_sse2;
+filter8_1dfunction vpx_filter_block1d8_h2_sse2;
+filter8_1dfunction vpx_filter_block1d4_v2_sse2;
+filter8_1dfunction vpx_filter_block1d4_h2_sse2;
+filter8_1dfunction vpx_filter_block1d16_v2_avg_sse2;
+filter8_1dfunction vpx_filter_block1d16_h2_avg_sse2;
+filter8_1dfunction vpx_filter_block1d8_v2_avg_sse2;
+filter8_1dfunction vpx_filter_block1d8_h2_avg_sse2;
+filter8_1dfunction vpx_filter_block1d4_v2_avg_sse2;
+filter8_1dfunction vpx_filter_block1d4_h2_avg_sse2;
-// void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
// const int16_t *filter_x, int x_step_q4,
// const int16_t *filter_y, int y_step_q4,
// int w, int h);
-// void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
// const int16_t *filter_x, int x_step_q4,
// const int16_t *filter_y, int y_step_q4,
// int w, int h);
-// void vp9_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
// const int16_t *filter_x, int x_step_q4,
// const int16_t *filter_y, int y_step_q4,
// int w, int h);
-// void vp9_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
// const int16_t *filter_x, int x_step_q4,
// const int16_t *filter_y, int y_step_q4,
FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2);
FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2);
-// void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
// const int16_t *filter_x, int x_step_q4,
// const int16_t *filter_y, int y_step_q4,
// int w, int h);
-// void vp9_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
// const int16_t *filter_x, int x_step_q4,
// const int16_t *filter_y, int y_step_q4,
FUN_CONV_2D(avg_ , sse2);
#if CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64
-highbd_filter8_1dfunction vp9_highbd_filter_block1d16_v8_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d16_h8_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d8_v8_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d8_h8_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d4_v8_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d4_h8_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d16_v8_avg_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d16_h8_avg_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d8_v8_avg_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d8_h8_avg_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d4_v8_avg_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d4_h8_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v8_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v8_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d16_v2_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d16_h2_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d8_v2_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d8_h2_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d4_v2_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d4_h2_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d16_v2_avg_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d16_h2_avg_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d8_v2_avg_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d8_h2_avg_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d4_v2_avg_sse2;
-highbd_filter8_1dfunction vp9_highbd_filter_block1d4_h2_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2;
-// void vp9_highbd_convolve8_horiz_sse2(const uint8_t *src,
+// void vpx_highbd_convolve8_horiz_sse2(const uint8_t *src,
// ptrdiff_t src_stride,
// uint8_t *dst,
// ptrdiff_t dst_stride,
// const int16_t *filter_y,
// int y_step_q4,
// int w, int h, int bd);
-// void vp9_highbd_convolve8_vert_sse2(const uint8_t *src,
+// void vpx_highbd_convolve8_vert_sse2(const uint8_t *src,
// ptrdiff_t src_stride,
// uint8_t *dst,
// ptrdiff_t dst_stride,
// const int16_t *filter_y,
// int y_step_q4,
// int w, int h, int bd);
-// void vp9_highbd_convolve8_avg_horiz_sse2(const uint8_t *src,
+// void vpx_highbd_convolve8_avg_horiz_sse2(const uint8_t *src,
// ptrdiff_t src_stride,
// uint8_t *dst,
// ptrdiff_t dst_stride,
// const int16_t *filter_y,
// int y_step_q4,
// int w, int h, int bd);
-// void vp9_highbd_convolve8_avg_vert_sse2(const uint8_t *src,
+// void vpx_highbd_convolve8_avg_vert_sse2(const uint8_t *src,
// ptrdiff_t src_stride,
// uint8_t *dst,
// ptrdiff_t dst_stride,
HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
sse2);
-// void vp9_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// void vpx_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
// const int16_t *filter_x, int x_step_q4,
// const int16_t *filter_y, int y_step_q4,
// int w, int h, int bd);
-// void vp9_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// void vpx_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
// const int16_t *filter_x, int x_step_q4,
// const int16_t *filter_y, int y_step_q4,
; be found in the AUTHORS file in the root of the source tree.
;
+%define program_name vpx
+
%include "third_party/x86inc/x86inc.asm"
SECTION .text
movdqu [rdi + %2], xmm0
%endm
-;void vp9_filter_block1d4_v8_sse2
+;void vpx_highbd_filter_block1d4_v8_sse2
;(
; unsigned char *src_ptr,
; unsigned int src_pitch,
; unsigned int output_height,
; short *filter
;)
-global sym(vp9_highbd_filter_block1d4_v8_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d4_v8_sse2):
+global sym(vpx_highbd_filter_block1d4_v8_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d4_v8_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
pop rbp
ret
-;void vp9_filter_block1d8_v8_sse2
+;void vpx_highbd_filter_block1d8_v8_sse2
;(
; unsigned char *src_ptr,
; unsigned int src_pitch,
; unsigned int output_height,
; short *filter
;)
-global sym(vp9_highbd_filter_block1d8_v8_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d8_v8_sse2):
+global sym(vpx_highbd_filter_block1d8_v8_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d8_v8_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
pop rbp
ret
-;void vp9_filter_block1d16_v8_sse2
+;void vpx_highbd_filter_block1d16_v8_sse2
;(
; unsigned char *src_ptr,
; unsigned int src_pitch,
; unsigned int output_height,
; short *filter
;)
-global sym(vp9_highbd_filter_block1d16_v8_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d16_v8_sse2):
+global sym(vpx_highbd_filter_block1d16_v8_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d16_v8_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
pop rbp
ret
-global sym(vp9_highbd_filter_block1d4_v8_avg_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d4_v8_avg_sse2):
+global sym(vpx_highbd_filter_block1d4_v8_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d4_v8_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
pop rbp
ret
-global sym(vp9_highbd_filter_block1d8_v8_avg_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d8_v8_avg_sse2):
+global sym(vpx_highbd_filter_block1d8_v8_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d8_v8_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
pop rbp
ret
-global sym(vp9_highbd_filter_block1d16_v8_avg_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d16_v8_avg_sse2):
+global sym(vpx_highbd_filter_block1d16_v8_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d16_v8_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
pop rbp
ret
-;void vp9_filter_block1d4_h8_sse2
+;void vpx_highbd_filter_block1d4_h8_sse2
;(
; unsigned char *src_ptr,
; unsigned int src_pixels_per_line,
; unsigned int output_height,
; short *filter
;)
-global sym(vp9_highbd_filter_block1d4_h8_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d4_h8_sse2):
+global sym(vpx_highbd_filter_block1d4_h8_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d4_h8_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
pop rbp
ret
-;void vp9_filter_block1d8_h8_sse2
+;void vpx_highbd_filter_block1d8_h8_sse2
;(
; unsigned char *src_ptr,
; unsigned int src_pixels_per_line,
; unsigned int output_height,
; short *filter
;)
-global sym(vp9_highbd_filter_block1d8_h8_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d8_h8_sse2):
+global sym(vpx_highbd_filter_block1d8_h8_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d8_h8_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
pop rbp
ret
-;void vp9_filter_block1d16_h8_sse2
+;void vpx_highbd_filter_block1d16_h8_sse2
;(
; unsigned char *src_ptr,
; unsigned int src_pixels_per_line,
; unsigned int output_height,
; short *filter
;)
-global sym(vp9_highbd_filter_block1d16_h8_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d16_h8_sse2):
+global sym(vpx_highbd_filter_block1d16_h8_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d16_h8_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
pop rbp
ret
-global sym(vp9_highbd_filter_block1d4_h8_avg_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d4_h8_avg_sse2):
+global sym(vpx_highbd_filter_block1d4_h8_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d4_h8_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
pop rbp
ret
-global sym(vp9_highbd_filter_block1d8_h8_avg_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d8_h8_avg_sse2):
+global sym(vpx_highbd_filter_block1d8_h8_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d8_h8_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
pop rbp
ret
-global sym(vp9_highbd_filter_block1d16_h8_avg_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d16_h8_avg_sse2):
+global sym(vpx_highbd_filter_block1d16_h8_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d16_h8_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
%endm
%endif
-global sym(vp9_highbd_filter_block1d4_v2_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d4_v2_sse2):
+global sym(vpx_highbd_filter_block1d4_v2_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d4_v2_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
ret
%if ARCH_X86_64
-global sym(vp9_highbd_filter_block1d8_v2_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d8_v2_sse2):
+global sym(vpx_highbd_filter_block1d8_v2_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d8_v2_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
pop rbp
ret
-global sym(vp9_highbd_filter_block1d16_v2_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d16_v2_sse2):
+global sym(vpx_highbd_filter_block1d16_v2_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d16_v2_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
ret
%endif
-global sym(vp9_highbd_filter_block1d4_v2_avg_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d4_v2_avg_sse2):
+global sym(vpx_highbd_filter_block1d4_v2_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d4_v2_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
ret
%if ARCH_X86_64
-global sym(vp9_highbd_filter_block1d8_v2_avg_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d8_v2_avg_sse2):
+global sym(vpx_highbd_filter_block1d8_v2_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d8_v2_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
pop rbp
ret
-global sym(vp9_highbd_filter_block1d16_v2_avg_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d16_v2_avg_sse2):
+global sym(vpx_highbd_filter_block1d16_v2_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d16_v2_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
ret
%endif
-global sym(vp9_highbd_filter_block1d4_h2_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d4_h2_sse2):
+global sym(vpx_highbd_filter_block1d4_h2_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d4_h2_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
ret
%if ARCH_X86_64
-global sym(vp9_highbd_filter_block1d8_h2_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d8_h2_sse2):
+global sym(vpx_highbd_filter_block1d8_h2_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d8_h2_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
pop rbp
ret
-global sym(vp9_highbd_filter_block1d16_h2_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d16_h2_sse2):
+global sym(vpx_highbd_filter_block1d16_h2_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d16_h2_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
ret
%endif
-global sym(vp9_highbd_filter_block1d4_h2_avg_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d4_h2_avg_sse2):
+global sym(vpx_highbd_filter_block1d4_h2_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d4_h2_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
ret
%if ARCH_X86_64
-global sym(vp9_highbd_filter_block1d8_h2_avg_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d8_h2_avg_sse2):
+global sym(vpx_highbd_filter_block1d8_h2_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d8_h2_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
pop rbp
ret
-global sym(vp9_highbd_filter_block1d16_h2_avg_sse2) PRIVATE
-sym(vp9_highbd_filter_block1d16_h2_avg_sse2):
+global sym(vpx_highbd_filter_block1d16_h2_avg_sse2) PRIVATE
+sym(vpx_highbd_filter_block1d16_h2_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
// Due to a header conflict between math.h and intrinsics includes with ceil()
// in certain configurations under vs9 this include needs to precede
// immintrin.h.
-#include "./vp9_rtcd.h"
#include <immintrin.h>
-#include "vp9/common/x86/convolve.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/convolve.h"
#include "vpx_ports/mem.h"
// filters for 16_h8 and 16_v8
# define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
#endif // __clang__
-static void vp9_filter_block1d16_h8_avx2(const uint8_t *src_ptr,
+static void vpx_filter_block1d16_h8_avx2(const uint8_t *src_ptr,
ptrdiff_t src_pixels_per_line,
uint8_t *output_ptr,
ptrdiff_t output_pitch,
}
}
-static void vp9_filter_block1d16_v8_avx2(const uint8_t *src_ptr,
+static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr,
ptrdiff_t src_pitch,
uint8_t *output_ptr,
ptrdiff_t out_pitch,
}
#if HAVE_AVX2 && HAVE_SSSE3
-filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
+filter8_1dfunction vpx_filter_block1d4_v8_ssse3;
#if ARCH_X86_64
-filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
-#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_intrin_ssse3
-#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_intrin_ssse3
-#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_intrin_ssse3
+filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3;
+#define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_intrin_ssse3
+#define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_intrin_ssse3
+#define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_intrin_ssse3
#else // ARCH_X86
-filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
-#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3
-#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3
-#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3
+filter8_1dfunction vpx_filter_block1d8_v8_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h8_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h8_ssse3;
+#define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_ssse3
+#define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_ssse3
+#define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_ssse3
#endif // ARCH_X86_64
-filter8_1dfunction vp9_filter_block1d16_v2_ssse3;
-filter8_1dfunction vp9_filter_block1d16_h2_ssse3;
-filter8_1dfunction vp9_filter_block1d8_v2_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h2_ssse3;
-filter8_1dfunction vp9_filter_block1d4_v2_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h2_ssse3;
-#define vp9_filter_block1d4_v8_avx2 vp9_filter_block1d4_v8_ssse3
-#define vp9_filter_block1d16_v2_avx2 vp9_filter_block1d16_v2_ssse3
-#define vp9_filter_block1d16_h2_avx2 vp9_filter_block1d16_h2_ssse3
-#define vp9_filter_block1d8_v2_avx2 vp9_filter_block1d8_v2_ssse3
-#define vp9_filter_block1d8_h2_avx2 vp9_filter_block1d8_h2_ssse3
-#define vp9_filter_block1d4_v2_avx2 vp9_filter_block1d4_v2_ssse3
-#define vp9_filter_block1d4_h2_avx2 vp9_filter_block1d4_h2_ssse3
-// void vp9_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
+filter8_1dfunction vpx_filter_block1d16_v2_ssse3;
+filter8_1dfunction vpx_filter_block1d16_h2_ssse3;
+filter8_1dfunction vpx_filter_block1d8_v2_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h2_ssse3;
+filter8_1dfunction vpx_filter_block1d4_v2_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h2_ssse3;
+#define vpx_filter_block1d4_v8_avx2 vpx_filter_block1d4_v8_ssse3
+#define vpx_filter_block1d16_v2_avx2 vpx_filter_block1d16_v2_ssse3
+#define vpx_filter_block1d16_h2_avx2 vpx_filter_block1d16_h2_ssse3
+#define vpx_filter_block1d8_v2_avx2 vpx_filter_block1d8_v2_ssse3
+#define vpx_filter_block1d8_h2_avx2 vpx_filter_block1d8_h2_ssse3
+#define vpx_filter_block1d4_v2_avx2 vpx_filter_block1d4_v2_ssse3
+#define vpx_filter_block1d4_h2_avx2 vpx_filter_block1d4_h2_ssse3
+// void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
// const int16_t *filter_x, int x_step_q4,
// const int16_t *filter_y, int y_step_q4,
// int w, int h);
-// void vp9_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
+// void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
// const int16_t *filter_x, int x_step_q4,
// const int16_t *filter_y, int y_step_q4,
FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);
-// void vp9_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
+// void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
// const int16_t *filter_x, int x_step_q4,
// const int16_t *filter_y, int y_step_q4,
// Due to a header conflict between math.h and intrinsics includes with ceil()
// in certain configurations under vs9 this include needs to precede
// tmmintrin.h.
-#include "./vp9_rtcd.h"
#include <tmmintrin.h>
-#include "vp9/common/x86/convolve.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/convolve.h"
#include "vpx_ports/mem.h"
#include "vpx_ports/emmintrin_compat.h"
};
// These are reused by the avx2 intrinsics.
-filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
+filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3;
-void vp9_filter_block1d4_h8_intrin_ssse3(const uint8_t *src_ptr,
+void vpx_filter_block1d4_h8_intrin_ssse3(const uint8_t *src_ptr,
ptrdiff_t src_pixels_per_line,
uint8_t *output_ptr,
ptrdiff_t output_pitch,
}
}
-void vp9_filter_block1d8_h8_intrin_ssse3(const uint8_t *src_ptr,
+void vpx_filter_block1d8_h8_intrin_ssse3(const uint8_t *src_ptr,
ptrdiff_t src_pixels_per_line,
uint8_t *output_ptr,
ptrdiff_t output_pitch,
}
}
-static void vp9_filter_block1d16_h8_intrin_ssse3(const uint8_t *src_ptr,
+static void vpx_filter_block1d16_h8_intrin_ssse3(const uint8_t *src_ptr,
ptrdiff_t src_pixels_per_line,
uint8_t *output_ptr,
ptrdiff_t output_pitch,
}
}
-void vp9_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr,
+void vpx_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr,
ptrdiff_t src_pitch,
uint8_t *output_ptr,
ptrdiff_t out_pitch,
}
}
-static void vp9_filter_block1d16_v8_intrin_ssse3(const uint8_t *src_ptr,
+static void vpx_filter_block1d16_v8_intrin_ssse3(const uint8_t *src_ptr,
ptrdiff_t src_pitch,
uint8_t *output_ptr,
ptrdiff_t out_pitch,
}
#if ARCH_X86_64
-filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
-#define vp9_filter_block1d16_v8_ssse3 vp9_filter_block1d16_v8_intrin_ssse3
-#define vp9_filter_block1d16_h8_ssse3 vp9_filter_block1d16_h8_intrin_ssse3
-#define vp9_filter_block1d8_v8_ssse3 vp9_filter_block1d8_v8_intrin_ssse3
-#define vp9_filter_block1d8_h8_ssse3 vp9_filter_block1d8_h8_intrin_ssse3
-#define vp9_filter_block1d4_h8_ssse3 vp9_filter_block1d4_h8_intrin_ssse3
+filter8_1dfunction vpx_filter_block1d16_v8_intrin_ssse3;
+filter8_1dfunction vpx_filter_block1d16_h8_intrin_ssse3;
+filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3;
+filter8_1dfunction vpx_filter_block1d4_v8_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3;
+#define vpx_filter_block1d16_v8_ssse3 vpx_filter_block1d16_v8_intrin_ssse3
+#define vpx_filter_block1d16_h8_ssse3 vpx_filter_block1d16_h8_intrin_ssse3
+#define vpx_filter_block1d8_v8_ssse3 vpx_filter_block1d8_v8_intrin_ssse3
+#define vpx_filter_block1d8_h8_ssse3 vpx_filter_block1d8_h8_intrin_ssse3
+#define vpx_filter_block1d4_h8_ssse3 vpx_filter_block1d4_h8_intrin_ssse3
#else // ARCH_X86
-filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
-filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
-filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
-filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
+filter8_1dfunction vpx_filter_block1d16_v8_ssse3;
+filter8_1dfunction vpx_filter_block1d16_h8_ssse3;
+filter8_1dfunction vpx_filter_block1d8_v8_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h8_ssse3;
+filter8_1dfunction vpx_filter_block1d4_v8_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h8_ssse3;
#endif // ARCH_X86_64
-filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3;
-
-filter8_1dfunction vp9_filter_block1d16_v2_ssse3;
-filter8_1dfunction vp9_filter_block1d16_h2_ssse3;
-filter8_1dfunction vp9_filter_block1d8_v2_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h2_ssse3;
-filter8_1dfunction vp9_filter_block1d4_v2_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h2_ssse3;
-filter8_1dfunction vp9_filter_block1d16_v2_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d16_h2_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d8_v2_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h2_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d4_v2_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h2_avg_ssse3;
-
-// void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+filter8_1dfunction vpx_filter_block1d16_v8_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d16_h8_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h8_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d4_v8_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h8_avg_ssse3;
+
+filter8_1dfunction vpx_filter_block1d16_v2_ssse3;
+filter8_1dfunction vpx_filter_block1d16_h2_ssse3;
+filter8_1dfunction vpx_filter_block1d8_v2_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h2_ssse3;
+filter8_1dfunction vpx_filter_block1d4_v2_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h2_ssse3;
+filter8_1dfunction vpx_filter_block1d16_v2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d16_h2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d8_v2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d4_v2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3;
+
+// void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
// const int16_t *filter_x, int x_step_q4,
// const int16_t *filter_y, int y_step_q4,
// int w, int h);
-// void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
// const int16_t *filter_x, int x_step_q4,
// const int16_t *filter_y, int y_step_q4,
// int w, int h);
-// void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
// const int16_t *filter_x, int x_step_q4,
// const int16_t *filter_y, int y_step_q4,
// int w, int h);
-// void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
// const int16_t *filter_x, int x_step_q4,
// const int16_t *filter_y, int y_step_q4,
FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
ssse3);
-// void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
// const int16_t *filter_x, int x_step_q4,
// const int16_t *filter_y, int y_step_q4,
// int w, int h);
-// void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
// const int16_t *filter_x, int x_step_q4,
// const int16_t *filter_y, int y_step_q4,
movq [rdi + %2], xmm0
%endm
-;void vp9_filter_block1d4_v8_sse2
+;void vpx_filter_block1d4_v8_sse2
;(
; unsigned char *src_ptr,
; unsigned int src_pitch,
; unsigned int output_height,
; short *filter
;)
-global sym(vp9_filter_block1d4_v8_sse2) PRIVATE
-sym(vp9_filter_block1d4_v8_sse2):
+global sym(vpx_filter_block1d4_v8_sse2) PRIVATE
+sym(vpx_filter_block1d4_v8_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
pop rbp
ret
-;void vp9_filter_block1d8_v8_sse2
+;void vpx_filter_block1d8_v8_sse2
;(
; unsigned char *src_ptr,
; unsigned int src_pitch,
; unsigned int output_height,
; short *filter
;)
-global sym(vp9_filter_block1d8_v8_sse2) PRIVATE
-sym(vp9_filter_block1d8_v8_sse2):
+global sym(vpx_filter_block1d8_v8_sse2) PRIVATE
+sym(vpx_filter_block1d8_v8_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
pop rbp
ret
-;void vp9_filter_block1d16_v8_sse2
+;void vpx_filter_block1d16_v8_sse2
;(
; unsigned char *src_ptr,
; unsigned int src_pitch,
; unsigned int output_height,
; short *filter
;)
-global sym(vp9_filter_block1d16_v8_sse2) PRIVATE
-sym(vp9_filter_block1d16_v8_sse2):
+global sym(vpx_filter_block1d16_v8_sse2) PRIVATE
+sym(vpx_filter_block1d16_v8_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
pop rbp
ret
-global sym(vp9_filter_block1d4_v8_avg_sse2) PRIVATE
-sym(vp9_filter_block1d4_v8_avg_sse2):
+global sym(vpx_filter_block1d4_v8_avg_sse2) PRIVATE
+sym(vpx_filter_block1d4_v8_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
pop rbp
ret
-global sym(vp9_filter_block1d8_v8_avg_sse2) PRIVATE
-sym(vp9_filter_block1d8_v8_avg_sse2):
+global sym(vpx_filter_block1d8_v8_avg_sse2) PRIVATE
+sym(vpx_filter_block1d8_v8_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
pop rbp
ret
-global sym(vp9_filter_block1d16_v8_avg_sse2) PRIVATE
-sym(vp9_filter_block1d16_v8_avg_sse2):
+global sym(vpx_filter_block1d16_v8_avg_sse2) PRIVATE
+sym(vpx_filter_block1d16_v8_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
pop rbp
ret
-;void vp9_filter_block1d4_h8_sse2
+;void vpx_filter_block1d4_h8_sse2
;(
; unsigned char *src_ptr,
; unsigned int src_pixels_per_line,
; unsigned int output_height,
; short *filter
;)
-global sym(vp9_filter_block1d4_h8_sse2) PRIVATE
-sym(vp9_filter_block1d4_h8_sse2):
+global sym(vpx_filter_block1d4_h8_sse2) PRIVATE
+sym(vpx_filter_block1d4_h8_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
pop rbp
ret
-;void vp9_filter_block1d8_h8_sse2
+;void vpx_filter_block1d8_h8_sse2
;(
; unsigned char *src_ptr,
; unsigned int src_pixels_per_line,
; unsigned int output_height,
; short *filter
;)
-global sym(vp9_filter_block1d8_h8_sse2) PRIVATE
-sym(vp9_filter_block1d8_h8_sse2):
+global sym(vpx_filter_block1d8_h8_sse2) PRIVATE
+sym(vpx_filter_block1d8_h8_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
pop rbp
ret
-;void vp9_filter_block1d16_h8_sse2
+;void vpx_filter_block1d16_h8_sse2
;(
; unsigned char *src_ptr,
; unsigned int src_pixels_per_line,
; unsigned int output_height,
; short *filter
;)
-global sym(vp9_filter_block1d16_h8_sse2) PRIVATE
-sym(vp9_filter_block1d16_h8_sse2):
+global sym(vpx_filter_block1d16_h8_sse2) PRIVATE
+sym(vpx_filter_block1d16_h8_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
pop rbp
ret
-global sym(vp9_filter_block1d4_h8_avg_sse2) PRIVATE
-sym(vp9_filter_block1d4_h8_avg_sse2):
+global sym(vpx_filter_block1d4_h8_avg_sse2) PRIVATE
+sym(vpx_filter_block1d4_h8_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
pop rbp
ret
-global sym(vp9_filter_block1d8_h8_avg_sse2) PRIVATE
-sym(vp9_filter_block1d8_h8_avg_sse2):
+global sym(vpx_filter_block1d8_h8_avg_sse2) PRIVATE
+sym(vpx_filter_block1d8_h8_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
pop rbp
ret
-global sym(vp9_filter_block1d16_h8_avg_sse2) PRIVATE
-sym(vp9_filter_block1d16_h8_avg_sse2):
+global sym(vpx_filter_block1d16_h8_avg_sse2) PRIVATE
+sym(vpx_filter_block1d16_h8_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
jnz .loop
%endm
-;void vp9_filter_block1d8_v8_ssse3
+;void vpx_filter_block1d4_v8_ssse3
;(
; unsigned char *src_ptr,
; unsigned int src_pitch,
; unsigned int output_height,
; short *filter
;)
-global sym(vp9_filter_block1d4_v8_ssse3) PRIVATE
-sym(vp9_filter_block1d4_v8_ssse3):
+global sym(vpx_filter_block1d4_v8_ssse3) PRIVATE
+sym(vpx_filter_block1d4_v8_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
pop rbp
ret
-;void vp9_filter_block1d8_v8_ssse3
+;void vpx_filter_block1d8_v8_ssse3
;(
; unsigned char *src_ptr,
; unsigned int src_pitch,
; unsigned int output_height,
; short *filter
;)
-global sym(vp9_filter_block1d8_v8_ssse3) PRIVATE
-sym(vp9_filter_block1d8_v8_ssse3):
+global sym(vpx_filter_block1d8_v8_ssse3) PRIVATE
+sym(vpx_filter_block1d8_v8_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
pop rbp
ret
-;void vp9_filter_block1d16_v8_ssse3
+;void vpx_filter_block1d16_v8_ssse3
;(
; unsigned char *src_ptr,
; unsigned int src_pitch,
; unsigned int output_height,
; short *filter
;)
-global sym(vp9_filter_block1d16_v8_ssse3) PRIVATE
-sym(vp9_filter_block1d16_v8_ssse3):
+global sym(vpx_filter_block1d16_v8_ssse3) PRIVATE
+sym(vpx_filter_block1d16_v8_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-global sym(vp9_filter_block1d4_v8_avg_ssse3) PRIVATE
-sym(vp9_filter_block1d4_v8_avg_ssse3):
+global sym(vpx_filter_block1d4_v8_avg_ssse3) PRIVATE
+sym(vpx_filter_block1d4_v8_avg_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
pop rbp
ret
-global sym(vp9_filter_block1d8_v8_avg_ssse3) PRIVATE
-sym(vp9_filter_block1d8_v8_avg_ssse3):
+global sym(vpx_filter_block1d8_v8_avg_ssse3) PRIVATE
+sym(vpx_filter_block1d8_v8_avg_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
pop rbp
ret
-global sym(vp9_filter_block1d16_v8_avg_ssse3) PRIVATE
-sym(vp9_filter_block1d16_v8_avg_ssse3):
+global sym(vpx_filter_block1d16_v8_avg_ssse3) PRIVATE
+sym(vpx_filter_block1d16_v8_avg_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
jnz .loop
%endm
-;void vp9_filter_block1d4_h8_ssse3
+;void vpx_filter_block1d4_h8_ssse3
;(
; unsigned char *src_ptr,
; unsigned int src_pixels_per_line,
; unsigned int output_height,
; short *filter
;)
-global sym(vp9_filter_block1d4_h8_ssse3) PRIVATE
-sym(vp9_filter_block1d4_h8_ssse3):
+global sym(vpx_filter_block1d4_h8_ssse3) PRIVATE
+sym(vpx_filter_block1d4_h8_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
pop rbp
ret
-;void vp9_filter_block1d8_h8_ssse3
+;void vpx_filter_block1d8_h8_ssse3
;(
; unsigned char *src_ptr,
; unsigned int src_pixels_per_line,
; unsigned int output_height,
; short *filter
;)
-global sym(vp9_filter_block1d8_h8_ssse3) PRIVATE
-sym(vp9_filter_block1d8_h8_ssse3):
+global sym(vpx_filter_block1d8_h8_ssse3) PRIVATE
+sym(vpx_filter_block1d8_h8_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
pop rbp
ret
-;void vp9_filter_block1d16_h8_ssse3
+;void vpx_filter_block1d16_h8_ssse3
;(
; unsigned char *src_ptr,
; unsigned int src_pixels_per_line,
; unsigned int output_height,
; short *filter
;)
-global sym(vp9_filter_block1d16_h8_ssse3) PRIVATE
-sym(vp9_filter_block1d16_h8_ssse3):
+global sym(vpx_filter_block1d16_h8_ssse3) PRIVATE
+sym(vpx_filter_block1d16_h8_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
pop rbp
ret
-global sym(vp9_filter_block1d4_h8_avg_ssse3) PRIVATE
-sym(vp9_filter_block1d4_h8_avg_ssse3):
+global sym(vpx_filter_block1d4_h8_avg_ssse3) PRIVATE
+sym(vpx_filter_block1d4_h8_avg_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
pop rbp
ret
-global sym(vp9_filter_block1d8_h8_avg_ssse3) PRIVATE
-sym(vp9_filter_block1d8_h8_avg_ssse3):
+global sym(vpx_filter_block1d8_h8_avg_ssse3) PRIVATE
+sym(vpx_filter_block1d8_h8_avg_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
pop rbp
ret
-global sym(vp9_filter_block1d16_h8_avg_ssse3) PRIVATE
-sym(vp9_filter_block1d16_h8_avg_ssse3):
+global sym(vpx_filter_block1d16_h8_avg_ssse3) PRIVATE
+sym(vpx_filter_block1d16_h8_avg_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
dec rcx
%endm
-global sym(vp9_filter_block1d4_v2_sse2) PRIVATE
-sym(vp9_filter_block1d4_v2_sse2):
+global sym(vpx_filter_block1d4_v2_sse2) PRIVATE
+sym(vpx_filter_block1d4_v2_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
pop rbp
ret
-global sym(vp9_filter_block1d8_v2_sse2) PRIVATE
-sym(vp9_filter_block1d8_v2_sse2):
+global sym(vpx_filter_block1d8_v2_sse2) PRIVATE
+sym(vpx_filter_block1d8_v2_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
pop rbp
ret
-global sym(vp9_filter_block1d16_v2_sse2) PRIVATE
-sym(vp9_filter_block1d16_v2_sse2):
+global sym(vpx_filter_block1d16_v2_sse2) PRIVATE
+sym(vpx_filter_block1d16_v2_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
pop rbp
ret
-global sym(vp9_filter_block1d4_v2_avg_sse2) PRIVATE
-sym(vp9_filter_block1d4_v2_avg_sse2):
+global sym(vpx_filter_block1d4_v2_avg_sse2) PRIVATE
+sym(vpx_filter_block1d4_v2_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
pop rbp
ret
-global sym(vp9_filter_block1d8_v2_avg_sse2) PRIVATE
-sym(vp9_filter_block1d8_v2_avg_sse2):
+global sym(vpx_filter_block1d8_v2_avg_sse2) PRIVATE
+sym(vpx_filter_block1d8_v2_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
pop rbp
ret
-global sym(vp9_filter_block1d16_v2_avg_sse2) PRIVATE
-sym(vp9_filter_block1d16_v2_avg_sse2):
+global sym(vpx_filter_block1d16_v2_avg_sse2) PRIVATE
+sym(vpx_filter_block1d16_v2_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
pop rbp
ret
-global sym(vp9_filter_block1d4_h2_sse2) PRIVATE
-sym(vp9_filter_block1d4_h2_sse2):
+global sym(vpx_filter_block1d4_h2_sse2) PRIVATE
+sym(vpx_filter_block1d4_h2_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
pop rbp
ret
-global sym(vp9_filter_block1d8_h2_sse2) PRIVATE
-sym(vp9_filter_block1d8_h2_sse2):
+global sym(vpx_filter_block1d8_h2_sse2) PRIVATE
+sym(vpx_filter_block1d8_h2_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
pop rbp
ret
-global sym(vp9_filter_block1d16_h2_sse2) PRIVATE
-sym(vp9_filter_block1d16_h2_sse2):
+global sym(vpx_filter_block1d16_h2_sse2) PRIVATE
+sym(vpx_filter_block1d16_h2_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
pop rbp
ret
-global sym(vp9_filter_block1d4_h2_avg_sse2) PRIVATE
-sym(vp9_filter_block1d4_h2_avg_sse2):
+global sym(vpx_filter_block1d4_h2_avg_sse2) PRIVATE
+sym(vpx_filter_block1d4_h2_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
pop rbp
ret
-global sym(vp9_filter_block1d8_h2_avg_sse2) PRIVATE
-sym(vp9_filter_block1d8_h2_avg_sse2):
+global sym(vpx_filter_block1d8_h2_avg_sse2) PRIVATE
+sym(vpx_filter_block1d8_h2_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
pop rbp
ret
-global sym(vp9_filter_block1d16_h2_avg_sse2) PRIVATE
-sym(vp9_filter_block1d16_h2_avg_sse2):
+global sym(vpx_filter_block1d16_h2_avg_sse2) PRIVATE
+sym(vpx_filter_block1d16_h2_avg_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
dec rcx
%endm
-global sym(vp9_filter_block1d4_v2_ssse3) PRIVATE
-sym(vp9_filter_block1d4_v2_ssse3):
+global sym(vpx_filter_block1d4_v2_ssse3) PRIVATE
+sym(vpx_filter_block1d4_v2_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
pop rbp
ret
-global sym(vp9_filter_block1d8_v2_ssse3) PRIVATE
-sym(vp9_filter_block1d8_v2_ssse3):
+global sym(vpx_filter_block1d8_v2_ssse3) PRIVATE
+sym(vpx_filter_block1d8_v2_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
pop rbp
ret
-global sym(vp9_filter_block1d16_v2_ssse3) PRIVATE
-sym(vp9_filter_block1d16_v2_ssse3):
+global sym(vpx_filter_block1d16_v2_ssse3) PRIVATE
+sym(vpx_filter_block1d16_v2_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
pop rbp
ret
-global sym(vp9_filter_block1d4_v2_avg_ssse3) PRIVATE
-sym(vp9_filter_block1d4_v2_avg_ssse3):
+global sym(vpx_filter_block1d4_v2_avg_ssse3) PRIVATE
+sym(vpx_filter_block1d4_v2_avg_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
pop rbp
ret
-global sym(vp9_filter_block1d8_v2_avg_ssse3) PRIVATE
-sym(vp9_filter_block1d8_v2_avg_ssse3):
+global sym(vpx_filter_block1d8_v2_avg_ssse3) PRIVATE
+sym(vpx_filter_block1d8_v2_avg_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
pop rbp
ret
-global sym(vp9_filter_block1d16_v2_avg_ssse3) PRIVATE
-sym(vp9_filter_block1d16_v2_avg_ssse3):
+global sym(vpx_filter_block1d16_v2_avg_ssse3) PRIVATE
+sym(vpx_filter_block1d16_v2_avg_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
pop rbp
ret
-global sym(vp9_filter_block1d4_h2_ssse3) PRIVATE
-sym(vp9_filter_block1d4_h2_ssse3):
+global sym(vpx_filter_block1d4_h2_ssse3) PRIVATE
+sym(vpx_filter_block1d4_h2_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
pop rbp
ret
-global sym(vp9_filter_block1d8_h2_ssse3) PRIVATE
-sym(vp9_filter_block1d8_h2_ssse3):
+global sym(vpx_filter_block1d8_h2_ssse3) PRIVATE
+sym(vpx_filter_block1d8_h2_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
pop rbp
ret
-global sym(vp9_filter_block1d16_h2_ssse3) PRIVATE
-sym(vp9_filter_block1d16_h2_ssse3):
+global sym(vpx_filter_block1d16_h2_ssse3) PRIVATE
+sym(vpx_filter_block1d16_h2_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
pop rbp
ret
-global sym(vp9_filter_block1d4_h2_avg_ssse3) PRIVATE
-sym(vp9_filter_block1d4_h2_avg_ssse3):
+global sym(vpx_filter_block1d4_h2_avg_ssse3) PRIVATE
+sym(vpx_filter_block1d4_h2_avg_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
pop rbp
ret
-global sym(vp9_filter_block1d8_h2_avg_ssse3) PRIVATE
-sym(vp9_filter_block1d8_h2_avg_ssse3):
+global sym(vpx_filter_block1d8_h2_avg_ssse3) PRIVATE
+sym(vpx_filter_block1d8_h2_avg_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
pop rbp
ret
-global sym(vp9_filter_block1d16_h2_avg_ssse3) PRIVATE
-sym(vp9_filter_block1d16_h2_avg_ssse3):
+global sym(vpx_filter_block1d16_h2_avg_ssse3) PRIVATE
+sym(vpx_filter_block1d16_h2_avg_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6