vp8 quantize: use native abs/sign implementations
author Johann <johannkoenig@google.com>
Mon, 29 Apr 2019 20:05:30 +0000 (13:05 -0700)
committer Johann <johannkoenig@google.com>
Mon, 29 Apr 2019 21:00:35 +0000 (14:00 -0700)
~4% improvement with a very rudimentary speed test

Change-Id: Iad8868327e3276dbead783a79849295b0e4b135c

test/quantize_test.cc
vp8/encoder/x86/quantize_sse4.c

index 1415ce1..a749774 100644 (file)
 
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
-#include "./vpx_config.h"
 #include "./vp8_rtcd.h"
+#include "./vpx_config.h"
 #include "test/acm_random.h"
+#include "test/bench.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
@@ -117,7 +118,8 @@ class QuantizeTestBase {
 };
 
 class QuantizeTest : public QuantizeTestBase,
-                     public ::testing::TestWithParam<VP8QuantizeParam> {
+                     public ::testing::TestWithParam<VP8QuantizeParam>,
+                     public AbstractBench {
  protected:
   virtual void SetUp() {
     SetupCompressor();
@@ -125,6 +127,10 @@ class QuantizeTest : public QuantizeTestBase,
     c_quant_ = GET_PARAM(1);
   }
 
+  virtual void Run() {
+    asm_quant_(&vp8_comp_->mb.block[0], &macroblockd_dst_->block[0]);
+  }
+
   void RunComparison() {
     for (int i = 0; i < kNumBlocks; ++i) {
       ASM_REGISTER_STATE_CHECK(
@@ -167,6 +173,13 @@ TEST_P(QuantizeTest, TestMultipleQ) {
   }
 }
 
+TEST_P(QuantizeTest, DISABLED_Speed) {
+  FillCoeffRandom();
+
+  RunNTimes(10000000);
+  PrintMedian("vp8 quantize");
+}
+
 #if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(
     SSE2, QuantizeTest,
index 6f2c163..13dd1ab 100644 (file)
@@ -11,8 +11,8 @@
 #include <smmintrin.h> /* SSE4.1 */
 
 #include "./vp8_rtcd.h"
-#include "vp8/encoder/block.h"
 #include "vp8/common/entropy.h" /* vp8_default_inv_zig_zag */
+#include "vp8/encoder/block.h"
 
 #define SELECT_EOB(i, z, x, y, q)         \
   do {                                    \
@@ -31,8 +31,7 @@ void vp8_regular_quantize_b_sse4_1(BLOCK *b, BLOCKD *d) {
   char eob = 0;
   short *zbin_boost_ptr = b->zrun_zbin_boost;
 
-  __m128i sz0, x0, sz1, x1, y0, y1, x_minus_zbin0, x_minus_zbin1, dqcoeff0,
-      dqcoeff1;
+  __m128i x0, x1, y0, y1, x_minus_zbin0, x_minus_zbin1, dqcoeff0, dqcoeff1;
   __m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift));
   __m128i quant_shift1 = _mm_load_si128((__m128i *)(b->quant_shift + 8));
   __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));
@@ -53,15 +52,9 @@ void vp8_regular_quantize_b_sse4_1(BLOCK *b, BLOCKD *d) {
   zbin_extra = _mm_shufflelo_epi16(zbin_extra, 0);
   zbin_extra = _mm_unpacklo_epi16(zbin_extra, zbin_extra);
 
-  /* Sign of z: z >> 15 */
-  sz0 = _mm_srai_epi16(z0, 15);
-  sz1 = _mm_srai_epi16(z1, 15);
-
-  /* x = abs(z): (z ^ sz) - sz */
-  x0 = _mm_xor_si128(z0, sz0);
-  x1 = _mm_xor_si128(z1, sz1);
-  x0 = _mm_sub_epi16(x0, sz0);
-  x1 = _mm_sub_epi16(x1, sz1);
+  /* x = abs(z) */
+  x0 = _mm_abs_epi16(z0);
+  x1 = _mm_abs_epi16(z1);
 
   /* zbin[] + zbin_extra */
   zbin0 = _mm_add_epi16(zbin0, zbin_extra);
@@ -89,11 +82,9 @@ void vp8_regular_quantize_b_sse4_1(BLOCK *b, BLOCKD *d) {
   y0 = _mm_mulhi_epi16(y0, quant_shift0);
   y1 = _mm_mulhi_epi16(y1, quant_shift1);
 
-  /* Return the sign: (y ^ sz) - sz */
-  y0 = _mm_xor_si128(y0, sz0);
-  y1 = _mm_xor_si128(y1, sz1);
-  y0 = _mm_sub_epi16(y0, sz0);
-  y1 = _mm_sub_epi16(y1, sz1);
+  /* Restore the sign. */
+  y0 = _mm_sign_epi16(y0, z0);
+  y1 = _mm_sign_epi16(y1, z1);
 
   /* The loop gets unrolled anyway. Avoid the vp8_default_zig_zag1d lookup. */
   SELECT_EOB(1, 0, x_minus_zbin0, y0, qcoeff0);