From 405b94c661563d3ebcf57751da3b83ca943d2bcf Mon Sep 17 00:00:00 2001
From: Rafael de Lucena Valle
Date: Wed, 19 Oct 2016 22:21:09 -0200
Subject: [PATCH] Add Hadamard for Power8

Change-Id: I3b4b043c1402b4100653ace4869847e030861b18
Signed-off-by: Rafael de Lucena Valle
---
 test/hadamard_test.cc                 |  53 +++++++++++++++
 vpx_dsp/ppc/bitdepth_conversion_vsx.h |  47 ++++++++++++++
 vpx_dsp/ppc/hadamard_vsx.c            | 119 ++++++++++++++++++++++++++++++++++
 vpx_dsp/ppc/transpose_vsx.h           | 101 +++++++++++++++++++++++++++++
 vpx_dsp/ppc/types_vsx.h               |  20 ++++++
 vpx_dsp/vpx_dsp.mk                    |   8 ++-
 vpx_dsp/vpx_dsp_rtcd_defs.pl          |   9 ++-
 7 files changed, 351 insertions(+), 6 deletions(-)
 create mode 100644 vpx_dsp/ppc/bitdepth_conversion_vsx.h
 create mode 100644 vpx_dsp/ppc/hadamard_vsx.c
 create mode 100644 vpx_dsp/ppc/transpose_vsx.h
 create mode 100644 vpx_dsp/ppc/types_vsx.h

diff --git a/test/hadamard_test.cc b/test/hadamard_test.cc
index 7e43c69..a55b15a 100644
--- a/test/hadamard_test.cc
+++ b/test/hadamard_test.cc
@@ -13,6 +13,7 @@
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
 #include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/vpx_timer.h"
 
 #include "test/acm_random.h"
 #include "test/register_state_check.h"
@@ -99,8 +100,31 @@ class HadamardTestBase : public ::testing::TestWithParam<HadamardFunc> {
   ACMRandom rnd_;
 };
 
+void HadamardSpeedTest(const char *name, HadamardFunc const func,
+                       const int16_t *input, int stride, tran_low_t *output,
+                       int times) {
+  int i;
+  vpx_usec_timer timer;
+
+  vpx_usec_timer_start(&timer);
+  for (i = 0; i < times; ++i) {
+    func(input, stride, output);
+  }
+  vpx_usec_timer_mark(&timer);
+
+  const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+  printf("%s[%12d runs]: %d us\n", name, times, elapsed_time);
+}
+
 class Hadamard8x8Test : public HadamardTestBase {};
 
+void HadamardSpeedTest8x8(HadamardFunc const func, int times) {
+  DECLARE_ALIGNED(16, int16_t, input[64]);
+  DECLARE_ALIGNED(16, tran_low_t, output[64]);
+  memset(input, 1, sizeof(input));
+  HadamardSpeedTest("Hadamard8x8", func, input, 8, output, times);
+}
+
 TEST_P(Hadamard8x8Test, CompareReferenceRandom) {
   DECLARE_ALIGNED(16, int16_t, a[64]);
   DECLARE_ALIGNED(16, tran_low_t, b[64]);
@@ -142,6 +166,12 @@ TEST_P(Hadamard8x8Test, VaryStride) {
   }
 }
 
+TEST_P(Hadamard8x8Test, DISABLED_Speed) {
+  HadamardSpeedTest8x8(h_func_, 10);
+  HadamardSpeedTest8x8(h_func_, 10000);
+  HadamardSpeedTest8x8(h_func_, 10000000);
+}
+
 INSTANTIATE_TEST_CASE_P(C, Hadamard8x8Test,
                         ::testing::Values(&vpx_hadamard_8x8_c));
 
@@ -169,8 +199,20 @@ INSTANTIATE_TEST_CASE_P(MSA, Hadamard8x8Test,
 #endif  // HAVE_MSA
 #endif  // !CONFIG_VP9_HIGHBITDEPTH
 
+#if HAVE_VSX
+INSTANTIATE_TEST_CASE_P(VSX, Hadamard8x8Test,
+                        ::testing::Values(&vpx_hadamard_8x8_vsx));
+#endif  // HAVE_VSX
+
 class Hadamard16x16Test : public HadamardTestBase {};
 
+void HadamardSpeedTest16x16(HadamardFunc const func, int times) {
+  DECLARE_ALIGNED(16, int16_t, input[256]);
+  DECLARE_ALIGNED(16, tran_low_t, output[256]);
+  memset(input, 1, sizeof(input));
+  HadamardSpeedTest("Hadamard16x16", func, input, 16, output, times);
+}
+
 TEST_P(Hadamard16x16Test, CompareReferenceRandom) {
   DECLARE_ALIGNED(16, int16_t, a[16 * 16]);
   DECLARE_ALIGNED(16, tran_low_t, b[16 * 16]);
@@ -212,6 +254,12 @@ TEST_P(Hadamard16x16Test, VaryStride) {
   }
 }
 
+TEST_P(Hadamard16x16Test, DISABLED_Speed) {
+  HadamardSpeedTest16x16(h_func_, 10);
+  HadamardSpeedTest16x16(h_func_, 10000);
+  HadamardSpeedTest16x16(h_func_, 10000000);
+}
+
 INSTANTIATE_TEST_CASE_P(C, Hadamard16x16Test,
                         ::testing::Values(&vpx_hadamard_16x16_c));
 
@@ -220,6 +268,11 @@ INSTANTIATE_TEST_CASE_P(SSE2, Hadamard16x16Test,
                         ::testing::Values(&vpx_hadamard_16x16_sse2));
 #endif  // HAVE_SSE2
 
+#if HAVE_VSX
+INSTANTIATE_TEST_CASE_P(VSX, Hadamard16x16Test,
+                        ::testing::Values(&vpx_hadamard_16x16_vsx));
+#endif  // HAVE_VSX
+
 #if HAVE_NEON
 INSTANTIATE_TEST_CASE_P(NEON, Hadamard16x16Test,
                         ::testing::Values(&vpx_hadamard_16x16_neon));
diff --git a/vpx_dsp/ppc/bitdepth_conversion_vsx.h b/vpx_dsp/ppc/bitdepth_conversion_vsx.h
new file mode 100644
index 0000000..2c5d9a4
--- /dev/null
+++ b/vpx_dsp/ppc/bitdepth_conversion_vsx.h
@@ -0,0 +1,47 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_PPC_BITDEPTH_CONVERSION_VSX_H_
+#define VPX_DSP_PPC_BITDEPTH_CONVERSION_VSX_H_
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/ppc/types_vsx.h"
+
+// Load 8 16 bit values. If the source is 32 bits then pack down with
+// saturation.
+static INLINE int16x8_t load_tran_low(int32_t c, const tran_low_t *s) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  int32x4_t u = vec_vsx_ld(c, s);
+  int32x4_t v = vec_vsx_ld(c, s + 4);
+  return vec_packs(u, v);
+#else
+  return vec_vsx_ld(c, s);
+#endif
+}
+
+// Store 8 16 bit values. If the destination is 32 bits then sign extend the
+// values by multiplying by 1.
+static INLINE void store_tran_low(int16x8_t v, int32_t c, tran_low_t *s) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int16x8_t one = vec_splat_s16(1);
+  const int32x4_t even = vec_mule(v, one);
+  const int32x4_t odd = vec_mulo(v, one);
+  const int32x4_t high = vec_mergeh(even, odd);
+  const int32x4_t low = vec_mergel(even, odd);
+  vec_vsx_st(high, c, s);
+  vec_vsx_st(low, c, s + 4);
+#else
+  vec_vsx_st(v, c, s);
+#endif
+}
+
+#endif  // VPX_DSP_PPC_BITDEPTH_CONVERSION_VSX_H_
diff --git a/vpx_dsp/ppc/hadamard_vsx.c b/vpx_dsp/ppc/hadamard_vsx.c
new file mode 100644
index 0000000..435e3eb
--- /dev/null
+++ b/vpx_dsp/ppc/hadamard_vsx.c
@@ -0,0 +1,119 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/ppc/types_vsx.h"
+#include "vpx_dsp/ppc/transpose_vsx.h"
+#include "vpx_dsp/ppc/bitdepth_conversion_vsx.h"
+
+static void vpx_hadamard_s16_8x8_one_pass(int16x8_t v[8]) {
+  const int16x8_t b0 = vec_add(v[0], v[1]);
+  const int16x8_t b1 = vec_sub(v[0], v[1]);
+  const int16x8_t b2 = vec_add(v[2], v[3]);
+  const int16x8_t b3 = vec_sub(v[2], v[3]);
+  const int16x8_t b4 = vec_add(v[4], v[5]);
+  const int16x8_t b5 = vec_sub(v[4], v[5]);
+  const int16x8_t b6 = vec_add(v[6], v[7]);
+  const int16x8_t b7 = vec_sub(v[6], v[7]);
+
+  const int16x8_t c0 = vec_add(b0, b2);
+  const int16x8_t c1 = vec_add(b1, b3);
+  const int16x8_t c2 = vec_sub(b0, b2);
+  const int16x8_t c3 = vec_sub(b1, b3);
+  const int16x8_t c4 = vec_add(b4, b6);
+  const int16x8_t c5 = vec_add(b5, b7);
+  const int16x8_t c6 = vec_sub(b4, b6);
+  const int16x8_t c7 = vec_sub(b5, b7);
+
+  v[0] = vec_add(c0, c4);
+  v[1] = vec_sub(c2, c6);
+  v[2] = vec_sub(c0, c4);
+  v[3] = vec_add(c2, c6);
+  v[4] = vec_add(c3, c7);
+  v[5] = vec_sub(c3, c7);
+  v[6] = vec_sub(c1, c5);
+  v[7] = vec_add(c1, c5);
+}
+
+void vpx_hadamard_8x8_vsx(const int16_t *src_diff, int src_stride,
+                          tran_low_t *coeff) {
+  int16x8_t v[8];
+
+  v[0] = vec_vsx_ld(0, src_diff);
+  v[1] = vec_vsx_ld(0, src_diff + src_stride);
+  v[2] = vec_vsx_ld(0, src_diff + (2 * src_stride));
+  v[3] = vec_vsx_ld(0, src_diff + (3 * src_stride));
+  v[4] = vec_vsx_ld(0, src_diff + (4 * src_stride));
+  v[5] = vec_vsx_ld(0, src_diff + (5 * src_stride));
+  v[6] = vec_vsx_ld(0, src_diff + (6 * src_stride));
+  v[7] = vec_vsx_ld(0, src_diff + (7 * src_stride));
+
+  vpx_hadamard_s16_8x8_one_pass(v);
+
+  vpx_transpose_s16_8x8(v);
+
+  vpx_hadamard_s16_8x8_one_pass(v);
+
+  store_tran_low(v[0], 0, coeff);
+  store_tran_low(v[1], 0, coeff + 8);
+  store_tran_low(v[2], 0, coeff + 16);
+  store_tran_low(v[3], 0, coeff + 24);
+  store_tran_low(v[4], 0, coeff + 32);
+  store_tran_low(v[5], 0, coeff + 40);
+  store_tran_low(v[6], 0, coeff + 48);
+  store_tran_low(v[7], 0, coeff + 56);
+}
+
+void vpx_hadamard_16x16_vsx(const int16_t *src_diff, int src_stride,
+                            tran_low_t *coeff) {
+  int i;
+  const uint16x8_t ones = vec_splat_u16(1);
+
+  /* Rearrange 16x16 to 8x32 and remove stride.
+   * Top left first. */
+  vpx_hadamard_8x8_vsx(src_diff, src_stride, coeff);
+  /* Top right. */
+  vpx_hadamard_8x8_vsx(src_diff + 8 + 0 * src_stride, src_stride, coeff + 64);
+  /* Bottom left. */
+  vpx_hadamard_8x8_vsx(src_diff + 0 + 8 * src_stride, src_stride, coeff + 128);
+  /* Bottom right. */
+  vpx_hadamard_8x8_vsx(src_diff + 8 + 8 * src_stride, src_stride, coeff + 192);
+
+  /* Overlay the 8x8 blocks and combine. */
+  for (i = 0; i < 64; i += 8) {
+    const int16x8_t a0 = load_tran_low(0, coeff);
+    const int16x8_t a1 = load_tran_low(0, coeff + 64);
+    const int16x8_t a2 = load_tran_low(0, coeff + 128);
+    const int16x8_t a3 = load_tran_low(0, coeff + 192);
+
+    /* Prevent the result from escaping int16_t. */
+    const int16x8_t b0 = vec_sra(a0, ones);
+    const int16x8_t b1 = vec_sra(a1, ones);
+    const int16x8_t b2 = vec_sra(a2, ones);
+    const int16x8_t b3 = vec_sra(a3, ones);
+
+    const int16x8_t c0 = vec_add(b0, b1);
+    const int16x8_t c2 = vec_add(b2, b3);
+    const int16x8_t c1 = vec_sub(b0, b1);
+    const int16x8_t c3 = vec_sub(b2, b3);
+
+    const int16x8_t d0 = vec_add(c0, c2);
+    const int16x8_t d1 = vec_add(c1, c3);
+    const int16x8_t d2 = vec_sub(c0, c2);
+    const int16x8_t d3 = vec_sub(c1, c3);
+
+    store_tran_low(d0, 0, coeff);
+    store_tran_low(d1, 0, coeff + 64);
+    store_tran_low(d2, 0, coeff + 128);
+    store_tran_low(d3, 0, coeff + 192);
+
+    coeff += 8;
+  }
+}
diff --git a/vpx_dsp/ppc/transpose_vsx.h b/vpx_dsp/ppc/transpose_vsx.h
new file mode 100644
index 0000000..f02556d
--- /dev/null
+++ b/vpx_dsp/ppc/transpose_vsx.h
@@ -0,0 +1,101 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_PPC_TRANSPOSE_VSX_H_
+#define VPX_DSP_PPC_TRANSPOSE_VSX_H_
+
+#include "./vpx_config.h"
+#include "vpx_dsp/ppc/types_vsx.h"
+
+static INLINE void vpx_transpose_s16_8x8(int16x8_t v[8]) {
+  // d = vec_mergeh(a,b):
+  //   The even elements of the result are obtained left-to-right,
+  //   from the high elements of a.
+  //   The odd elements of the result are obtained left-to-right,
+  //   from the high elements of b.
+  //
+  // d = vec_mergel(a,b):
+  //   The even elements of the result are obtained left-to-right,
+  //   from the low elements of a.
+  //   The odd elements of the result are obtained left-to-right,
+  //   from the low elements of b.
+
+  // Example, starting with:
+  //   v[0]: 00 01 02 03 04 05 06 07
+  //   v[1]: 10 11 12 13 14 15 16 17
+  //   v[2]: 20 21 22 23 24 25 26 27
+  //   v[3]: 30 31 32 33 34 35 36 37
+  //   v[4]: 40 41 42 43 44 45 46 47
+  //   v[5]: 50 51 52 53 54 55 56 57
+  //   v[6]: 60 61 62 63 64 65 66 67
+  //   v[7]: 70 71 72 73 74 75 76 77
+
+  int16x8_t b0, b1, b2, b3, b4, b5, b6, b7;
+  int16x8_t c0, c1, c2, c3, c4, c5, c6, c7;
+
+  b0 = vec_mergeh(v[0], v[4]);
+  b1 = vec_mergel(v[0], v[4]);
+  b2 = vec_mergeh(v[1], v[5]);
+  b3 = vec_mergel(v[1], v[5]);
+  b4 = vec_mergeh(v[2], v[6]);
+  b5 = vec_mergel(v[2], v[6]);
+  b6 = vec_mergeh(v[3], v[7]);
+  b7 = vec_mergel(v[3], v[7]);
+
+  // After first merge operation
+  //   b0: 00 40 01 41 02 42 03 43
+  //   b1: 04 44 05 45 06 46 07 47
+  //   b2: 10 50 11 51 12 52 13 53
+  //   b3: 14 54 15 55 16 56 17 57
+  //   b4: 20 60 21 61 22 62 23 63
+  //   b5: 24 64 25 65 26 66 27 67
+  //   b6: 30 70 31 71 32 72 33 73
+  //   b7: 34 74 35 75 36 76 37 77
+
+  c0 = vec_mergeh(b0, b4);
+  c1 = vec_mergel(b0, b4);
+  c2 = vec_mergeh(b1, b5);
+  c3 = vec_mergel(b1, b5);
+  c4 = vec_mergeh(b2, b6);
+  c5 = vec_mergel(b2, b6);
+  c6 = vec_mergeh(b3, b7);
+  c7 = vec_mergel(b3, b7);
+
+  // After second merge operation
+  //   c0: 00 20 40 60 01 21 41 61
+  //   c1: 02 22 42 62 03 23 43 63
+  //   c2: 04 24 44 64 05 25 45 65
+  //   c3: 06 26 46 66 07 27 47 67
+  //   c4: 10 30 50 70 11 31 51 71
+  //   c5: 12 32 52 72 13 33 53 73
+  //   c6: 14 34 54 74 15 35 55 75
+  //   c7: 16 36 56 76 17 37 57 77
+
+  v[0] = vec_mergeh(c0, c4);
+  v[1] = vec_mergel(c0, c4);
+  v[2] = vec_mergeh(c1, c5);
+  v[3] = vec_mergel(c1, c5);
+  v[4] = vec_mergeh(c2, c6);
+  v[5] = vec_mergel(c2, c6);
+  v[6] = vec_mergeh(c3, c7);
+  v[7] = vec_mergel(c3, c7);
+
+  // After last merge operation
+  //   v[0]: 00 10 20 30 40 50 60 70
+  //   v[1]: 01 11 21 31 41 51 61 71
+  //   v[2]: 02 12 22 32 42 52 62 72
+  //   v[3]: 03 13 23 33 43 53 63 73
+  //   v[4]: 04 14 24 34 44 54 64 74
+  //   v[5]: 05 15 25 35 45 55 65 75
+  //   v[6]: 06 16 26 36 46 56 66 76
+  //   v[7]: 07 17 27 37 47 57 67 77
+}
+
+#endif  // VPX_DSP_PPC_TRANSPOSE_VSX_H_
diff --git a/vpx_dsp/ppc/types_vsx.h b/vpx_dsp/ppc/types_vsx.h
new file mode 100644
index 0000000..2f3aa20
--- /dev/null
+++ b/vpx_dsp/ppc/types_vsx.h
@@ -0,0 +1,20 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_PPC_TYPES_VSX_H_
+#define VPX_DSP_PPC_TYPES_VSX_H_
+
+#include <altivec.h>
+
+typedef vector signed short int16x8_t;
+typedef vector unsigned short uint16x8_t;
+typedef vector signed int int32x4_t;
+
+#endif  // VPX_DSP_PPC_TYPES_VSX_H_
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index ca6e5ca..4e57bef 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -264,11 +264,12 @@ endif
 DSP_SRCS-yes += avg.c
 DSP_SRCS-$(HAVE_SSE2) += x86/avg_intrin_sse2.c
 DSP_SRCS-$(HAVE_NEON) += arm/avg_neon.c
-DSP_SRCS-$(HAVE_MSA) += mips/avg_msa.c
 DSP_SRCS-$(HAVE_NEON) += arm/hadamard_neon.c
+DSP_SRCS-$(HAVE_MSA) += mips/avg_msa.c
 ifeq ($(ARCH_X86_64),yes)
 DSP_SRCS-$(HAVE_SSSE3) += x86/avg_ssse3_x86_64.asm
 endif
+DSP_SRCS-$(HAVE_VSX) += ppc/hadamard_vsx.c
 
 endif # CONFIG_VP9_ENCODER
 
@@ -337,6 +338,11 @@ endif # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
 # Neon utilities
 DSP_SRCS-$(HAVE_NEON) += arm/transpose_neon.h
 
+# PPC VSX utilities
+DSP_SRCS-$(HAVE_VSX) += ppc/types_vsx.h
+DSP_SRCS-$(HAVE_VSX) += ppc/transpose_vsx.h
+DSP_SRCS-$(HAVE_VSX) += ppc/bitdepth_conversion_vsx.h
+
 DSP_SRCS-no += $(DSP_SRCS_REMOVE-yes)
 
 DSP_SRCS-yes += vpx_dsp_rtcd.c
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 536269e..2404e22 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -908,22 +908,21 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
   add_proto qw/void vpx_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
   specialize qw/vpx_minmax_8x8 sse2 neon msa/;
 
-
   if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
     add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, int src_stride, tran_low_t *coeff";
-    specialize qw/vpx_hadamard_8x8 sse2 neon/, "$ssse3_x86_64";
+    specialize qw/vpx_hadamard_8x8 sse2 neon vsx/, "$ssse3_x86_64";
 
     add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, int src_stride, tran_low_t *coeff";
-    specialize qw/vpx_hadamard_16x16 sse2 neon/;
+    specialize qw/vpx_hadamard_16x16 sse2 neon vsx/;
 
     add_proto qw/int vpx_satd/, "const tran_low_t *coeff, int length";
     specialize qw/vpx_satd sse2 neon/;
   } else {
     add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, int src_stride, int16_t *coeff";
-    specialize qw/vpx_hadamard_8x8 sse2 neon msa/, "$ssse3_x86_64";
+    specialize qw/vpx_hadamard_8x8 sse2 neon msa vsx/, "$ssse3_x86_64";
 
     add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, int src_stride, int16_t *coeff";
-    specialize qw/vpx_hadamard_16x16 sse2 neon msa/;
+    specialize qw/vpx_hadamard_16x16 sse2 neon msa vsx/;
 
     add_proto qw/int vpx_satd/, "const int16_t *coeff, int length";
     specialize qw/vpx_satd sse2 neon msa/;
-- 
2.7.4
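
Illustrative sketch (not part of the patch): a standalone smoke test comparing the new VSX kernel against the portable C reference, which the gtest harness above requires to match exactly. It assumes a libvpx tree configured with --enable-vsx on a POWER8 target and is compiled from inside that tree so the generated ./vpx_dsp_rtcd.h is visible; the file name and input values are made up for illustration.

/* vsx_hadamard_smoke.c -- hypothetical example, not added by this change. */
#include <stdio.h>

#include "./vpx_dsp_rtcd.h"         /* vpx_hadamard_8x8_c / _vsx prototypes */
#include "vpx_dsp/vpx_dsp_common.h" /* tran_low_t */
#include "vpx_ports/mem.h"          /* DECLARE_ALIGNED */

int main(void) {
  DECLARE_ALIGNED(16, int16_t, src_diff[64]);
  DECLARE_ALIGNED(16, tran_low_t, coeff_c[64]);
  DECLARE_ALIGNED(16, tran_low_t, coeff_vsx[64]);
  int i, mismatches = 0;

  /* Arbitrary residual-like input, stride of 8 for a packed 8x8 block. */
  for (i = 0; i < 64; ++i) src_diff[i] = (int16_t)((i * 7) - 128);

  vpx_hadamard_8x8_c(src_diff, 8, coeff_c);
  vpx_hadamard_8x8_vsx(src_diff, 8, coeff_vsx);

  for (i = 0; i < 64; ++i) mismatches += (coeff_c[i] != coeff_vsx[i]);
  printf("vpx_hadamard_8x8_vsx vs C: %d mismatching coefficients\n",
         mismatches);
  return mismatches != 0;
}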