make_tuple(256, &vpx_satd_sse2),
make_tuple(1024, &vpx_satd_sse2)));
-// TODO(jianj): Remove the highbitdepth flag once the SIMD functions are
-// in place.
-#if !CONFIG_VP9_HIGHBITDEPTH
// Register the SSE2 variant of BlockErrorTest for every block size passed
// below (16, 64, 256 and 1024 coefficients), all driven through
// vp9_block_error_fp_sse2.
INSTANTIATE_TEST_CASE_P(
SSE2, BlockErrorTest,
::testing::Values(make_tuple(16, &vp9_block_error_fp_sse2),
make_tuple(64, &vp9_block_error_fp_sse2),
make_tuple(256, &vp9_block_error_fp_sse2),
make_tuple(1024, &vp9_block_error_fp_sse2)));
-#endif // !CONFIG_VP9_HIGHBITDEPTH
#endif // HAVE_SSE2
#if HAVE_NEON
specialize qw/vp9_highbd_block_error_8bit sse2 avx/;
add_proto qw/int64_t vp9_block_error_fp/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size";
- specialize qw/vp9_block_error_fp/;
+ specialize qw/vp9_block_error_fp sse2/;
add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
%define private_prefix vp9
%include "third_party/x86inc/x86inc.asm"
+%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"
SECTION .text
+%if CONFIG_VP9_HIGHBITDEPTH
+%else
; int64_t vp9_block_error(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size,
; int64_t *ssz)
movd edx, m5
%endif
RET
+%endif ; CONFIG_VP9_HIGHBITDEPTH
-; Compute the sum of squared difference between two int16_t vectors.
-; int64_t vp9_block_error_fp(int16_t *coeff, int16_t *dqcoeff,
+; Compute the sum of squared differences between two tran_low_t vectors.
+; Vectors are converted (if necessary) to int16_t for calculations.
+; int64_t vp9_block_error_fp(tran_low_t *coeff, tran_low_t *dqcoeff,
; intptr_t block_size)
INIT_XMM sse2
cglobal block_error_fp, 3, 3, 6, uqc, dqc, size
pxor m4, m4 ; sse accumulator
pxor m5, m5 ; dedicated zero register
- lea uqcq, [uqcq+sizeq*2]
- lea dqcq, [dqcq+sizeq*2]
- neg sizeq
.loop:
- mova m2, [uqcq+sizeq*2]
- mova m0, [dqcq+sizeq*2]
- mova m3, [uqcq+sizeq*2+mmsize]
- mova m1, [dqcq+sizeq*2+mmsize]
+ LOAD_TRAN_LOW 2, uqcq, 0
+ LOAD_TRAN_LOW 0, dqcq, 0
+ LOAD_TRAN_LOW 3, uqcq, 1
+ LOAD_TRAN_LOW 1, dqcq, 1
+ INCREMENT_ELEMENTS_TRAN_LOW uqcq, 16
+ INCREMENT_ELEMENTS_TRAN_LOW dqcq, 16
+ sub sizeq, 16
psubw m0, m2
psubw m1, m3
; individual errors are max. 15bit+sign, so squares are 30bit, and
punpckhdq m1, m5
paddq m4, m3
paddq m4, m1
- add sizeq, mmsize
- jl .loop
+ jnz .loop
; accumulate horizontally and store in return value
movhlps m5, m4
endif
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.asm
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_error_sse2.asm
VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_highbd_error_avx.asm
-else
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm
endif
ifeq ($(ARCH_X86_64),yes)