From 21044546074fa2c8a00ada0391c33309518f0a4d Mon Sep 17 00:00:00 2001 From: Johann Date: Thu, 16 Feb 2017 11:12:31 -0800 Subject: [PATCH] block error avx2: use tran_low_t Change-Id: Ic5f3a1f569d6f82afeaf4fcd7235374bb460db3c --- vp9/common/vp9_rtcd_defs.pl | 1 + vp9/encoder/x86/vp9_error_intrin_avx2.c | 8 +++++--- vpx_dsp/vpx_dsp.mk | 1 + vpx_dsp/x86/bitdepth_conversion_avx2.h | 30 ++++++++++++++++++++++++++++++ 4 files changed, 37 insertions(+), 3 deletions(-) create mode 100644 vpx_dsp/x86/bitdepth_conversion_avx2.h diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 720e171..bf6de44 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -125,6 +125,7 @@ if (vpx_config("CONFIG_VP9_TEMPORAL_DENOISING") eq "yes") { if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz"; + specialize qw/vp9_block_error avx2/; add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd"; specialize qw/vp9_highbd_block_error sse2/; diff --git a/vp9/encoder/x86/vp9_error_intrin_avx2.c b/vp9/encoder/x86/vp9_error_intrin_avx2.c index 453af2a..e39027f 100644 --- a/vp9/encoder/x86/vp9_error_intrin_avx2.c +++ b/vp9/encoder/x86/vp9_error_intrin_avx2.c @@ -12,8 +12,10 @@ #include "./vp9_rtcd.h" #include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/x86/bitdepth_conversion_avx2.h" -int64_t vp9_block_error_avx2(const int16_t *coeff, const int16_t *dqcoeff, +int64_t vp9_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz) { __m256i sse_reg, ssz_reg, coeff_reg, dqcoeff_reg; __m256i exp_dqcoeff_lo, exp_dqcoeff_hi, exp_coeff_lo, exp_coeff_hi; @@ -29,8 +31,8 @@ int64_t vp9_block_error_avx2(const int16_t *coeff, const int16_t *dqcoeff, for (i = 0; i < block_size; i += 
16) { // load 32 bytes from coeff and dqcoeff - coeff_reg = _mm256_loadu_si256((const __m256i *)(coeff + i)); - dqcoeff_reg = _mm256_loadu_si256((const __m256i *)(dqcoeff + i)); + coeff_reg = load_tran_low(coeff + i); + dqcoeff_reg = load_tran_low(dqcoeff + i); // dqcoeff - coeff dqcoeff_reg = _mm256_sub_epi16(dqcoeff_reg, coeff_reg); // madd (dqcoeff - coeff) diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 33c9e51..ae4f7d8 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -13,6 +13,7 @@ DSP_SRCS-yes += vpx_dsp_common.h DSP_SRCS-$(HAVE_MSA) += mips/macros_msa.h +DSP_SRCS-$(HAVE_AVX2) += x86/bitdepth_conversion_avx2.h DSP_SRCS-$(HAVE_SSE2) += x86/bitdepth_conversion_sse2.h # This file is included in libs.mk. Including it here would cause it to be # compiled into an object. Even as an empty file, this would create an diff --git a/vpx_dsp/x86/bitdepth_conversion_avx2.h b/vpx_dsp/x86/bitdepth_conversion_avx2.h new file mode 100644 index 0000000..b9116f0 --- /dev/null +++ b/vpx_dsp/x86/bitdepth_conversion_avx2.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#ifndef VPX_DSP_X86_BITDEPTH_CONVERSION_AVX2_H_ +#define VPX_DSP_X86_BITDEPTH_CONVERSION_AVX2_H_ + +#include <immintrin.h> + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" + +// Load 16 16 bit values. If the source is 32 bits then pack down with +// saturation. 
+static INLINE __m256i load_tran_low(const tran_low_t *a) { +#if CONFIG_VP9_HIGHBITDEPTH + const __m256i a_low = _mm256_loadu_si256((const __m256i *)a); + return _mm256_packs_epi32(a_low, *(const __m256i *)(a + 8)); +#else + return _mm256_loadu_si256((const __m256i *)a); +#endif +} + +#endif // VPX_DSP_X86_BITDEPTH_CONVERSION_AVX2_H_ -- 2.7.4