#endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
#if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH
-static const FuncInfo dct_lsx_func_info[2] = {
+static const FuncInfo dct_lsx_func_info[4] = {
+ { &fdct_wrapper<vpx_fdct4x4_lsx>, &idct_wrapper<vpx_idct4x4_16_add_c>, 4, 1 },
+ { &fdct_wrapper<vpx_fdct8x8_lsx>, &idct_wrapper<vpx_idct8x8_64_add_c>, 8, 1 },
{ &fdct_wrapper<vpx_fdct16x16_lsx>, &idct_wrapper<vpx_idct16x16_256_add_c>,
16, 1 },
{ &fdct_wrapper<vpx_fdct32x32_lsx>, &idct_wrapper<vpx_idct32x32_1024_add_lsx>,
  32, 1 },
};
INSTANTIATE_TEST_SUITE_P(
LSX, TransDCT,
- ::testing::Combine(::testing::Range(0, 2),
+ ::testing::Combine(::testing::Range(0, 4),
::testing::Values(dct_lsx_func_info),
::testing::Values(0), ::testing::Values(VPX_BITS_8)));
#endif // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH
&vpx_idct8x8_64_add_vsx,
0, VPX_BITS_8)));
#endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_SUITE_P(LSX, FwdTrans8x8DCT,
+ ::testing::Values(make_tuple(&vpx_fdct8x8_lsx,
+ &vpx_idct8x8_64_add_c, 0,
+ VPX_BITS_8)));
+#endif // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
} // namespace
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/loongarch/fwd_txfm_lsx.h"
+#define LSX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+ { \
+ __m128i _s0, _s1, _s2, _s3, _t0, _t1, _t2, _t3; \
+ \
+ DUP2_ARG2(__lsx_vilvl_h, _in2, _in0, _in3, _in1, _s0, _s1); \
+ DUP2_ARG2(__lsx_vilvh_h, _in2, _in0, _in3, _in1, _s2, _s3); \
+ _t0 = __lsx_vilvl_h(_s1, _s0); \
+ _t1 = __lsx_vilvh_h(_s1, _s0); \
+ _t2 = __lsx_vilvl_h(_s3, _s2); \
+ _t3 = __lsx_vilvh_h(_s3, _s2); \
+ DUP2_ARG2(__lsx_vpickev_d, _t2, _t0, _t3, _t1, _out0, _out2); \
+ DUP2_ARG2(__lsx_vpickod_d, _t2, _t0, _t3, _t1, _out1, _out3); \
+ }
+
#if !CONFIG_VP9_HIGHBITDEPTH
void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr,
int32_t src_stride) {
__lsx_vst(in7, output, 240);
}
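+
+/* 4x4 forward transform: scale inputs by 16 (with the DC fix-up), run two
+   fdct4/transpose passes, then round with (x + 1) >> 2 as vpx_fdct4x4_c
+   does. */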
+void vpx_fdct4x4_lsx(const int16_t *input, int16_t *output,
+ int32_t src_stride) {
+ __m128i in0, in1, in2, in3;
+
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride4 = src_stride2 << 1;
+ int32_t src_stride6 = src_stride4 + src_stride2;
+
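+ /* __lsx_vldx offsets are in bytes: src_stride2/4/6 select input rows 1, 2
+    and 3 (src_stride counts int16 elements). */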
+ in0 = __lsx_vld(input, 0);
+ DUP2_ARG2(__lsx_vldx, input, src_stride2, input, src_stride4, in1, in2);
+ in3 = __lsx_vldx(input, src_stride6);
+
+ /* fdct4 pre-process: inputs are scaled by 16 and the first (DC) sample
+    gets +1 when it is nonzero, mirroring vpx_fdct4x4_c. */
+ {
+ __m128i vec, mask;
+ __m128i zero = __lsx_vldi(0);
+
+ mask = __lsx_vinsgr2vr_b(zero, 1, 0);
+ DUP4_ARG2(__lsx_vslli_h, in0, 4, in1, 4, in2, 4, in3, 4, in0, in1, in2,
+ in3);
+ vec = __lsx_vseqi_h(in0, 0);
+ vec = __lsx_vxori_b(vec, 255);
+ vec = __lsx_vand_v(mask, vec);
+ in0 = __lsx_vadd_h(in0, vec);
+ }
+
+ VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
+ LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
+ VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
+ LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
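+ /* Round with (x + 1) >> 2 as in vpx_fdct4x4_c, then pack rows 0|1 and 2|3
+    into two vectors before storing. */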
+ DUP4_ARG2(__lsx_vaddi_hu, in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3);
+ DUP4_ARG2(__lsx_vsrai_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3);
+ DUP2_ARG2(__lsx_vpickev_d, in1, in0, in3, in2, in0, in2);
+ __lsx_vst(in0, output, 0);
+ __lsx_vst(in2, output, 16);
+}
+
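+/* 8x8 forward transform: scale inputs by 4, run two fdct8/transpose passes,
+   then apply the (x + (x < 0)) >> 1 rounding of vpx_fdct8x8_c via
+   SRLI_AVE_S_4V_H. */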
+void vpx_fdct8x8_lsx(const int16_t *input, int16_t *output,
+ int32_t src_stride) {
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ int32_t src_stride2 = src_stride << 1;
+ int32_t src_stride4 = src_stride2 << 1;
+ int32_t src_stride6 = src_stride4 + src_stride2;
+ int16_t *input_tmp = (int16_t *)input;
+
+ in0 = __lsx_vld(input_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, in1,
+ in2);
+ in3 = __lsx_vldx(input_tmp, src_stride6);
+ input_tmp += src_stride4;
+ in4 = __lsx_vld(input_tmp, 0);
+ DUP2_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, in5,
+ in6);
+ in7 = __lsx_vldx(input_tmp, src_stride6);
+
+ DUP4_ARG2(__lsx_vslli_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3);
+ DUP4_ARG2(__lsx_vslli_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7);
+
+ VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
+ in5, in6, in7);
+ LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
+ in5, in6, in7);
+ LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7);
+
+ __lsx_vst(in0, output, 0);
+ __lsx_vst(in1, output, 16);
+ __lsx_vst(in2, output, 32);
+ __lsx_vst(in3, output, 48);
+ __lsx_vst(in4, output, 64);
+ __lsx_vst(in5, output, 80);
+ __lsx_vst(in6, output, 96);
+ __lsx_vst(in7, output, 112);
+}
+
void vpx_fdct16x16_lsx(const int16_t *input, int16_t *output,
int32_t src_stride) {
int32_t i;
#include "vpx_dsp/loongarch/txfm_macros_lsx.h"
#include "vpx_dsp/txfm_common.h"
+#define VP9_FDCT4(in0, in1, in2, in3, out0, out1, out2, out3) \
+ { \
+ __m128i cnst0_m, cnst1_m, cnst2_m, cnst3_m; \
+ __m128i vec0_m, vec1_m, vec2_m, vec3_m; \
+ __m128i vec4_m, vec5_m, vec6_m, vec7_m; \
+ __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x000000000000c4df }; \
+ \
+ LSX_BUTTERFLY_4_H(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m); \
+ DUP2_ARG2(__lsx_vilvl_h, vec1_m, vec0_m, vec3_m, vec2_m, vec0_m, vec2_m); \
+ DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 0, coeff_m, 1, cnst0_m, cnst1_m); \
+ cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m); \
+ vec5_m = __lsx_vdp2_w_h(vec0_m, cnst1_m); \
+ DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 4, coeff_m, 3, cnst2_m, cnst3_m); \
+ cnst2_m = __lsx_vpackev_h(cnst3_m, cnst2_m); \
+ vec7_m = __lsx_vdp2_w_h(vec2_m, cnst2_m); \
+ \
+ vec4_m = __lsx_vdp2_w_h(vec0_m, cnst0_m); \
+ cnst2_m = __lsx_vreplvei_h(coeff_m, 2); \
+ cnst2_m = __lsx_vpackev_h(cnst2_m, cnst3_m); \
+ vec6_m = __lsx_vdp2_w_h(vec2_m, cnst2_m); \
+ \
+ DUP4_ARG3(__lsx_vssrarni_h_w, vec4_m, vec4_m, DCT_CONST_BITS, vec5_m, \
+ vec5_m, DCT_CONST_BITS, vec6_m, vec6_m, DCT_CONST_BITS, vec7_m, \
+ vec7_m, DCT_CONST_BITS, out0, out2, out1, out3); \
+ }
+
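+/* 1-D 8-point forward DCT, one transform per lane across in0..in7. coeff_m
+   additionally packs cospi_4_64, cospi_28_64, cospi_12_64 and cospi_20_64
+   into half-word elements 4..7. */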
+#define VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
+ out3, out4, out5, out6, out7) \
+ { \
+ __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m; \
+ __m128i s7_m, x0_m, x1_m, x2_m, x3_m; \
+ __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x238e35370c7c3ec5 }; \
+ \
+ /* FDCT stage1 */ \
+ LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, \
+ s2_m, s3_m, s4_m, s5_m, s6_m, s7_m); \
+ LSX_BUTTERFLY_4_H(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \
+ DUP2_ARG2(__lsx_vilvh_h, x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \
+ DUP2_ARG2(__lsx_vilvl_h, x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \
+ DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 0, coeff_m, 1, x0_m, x1_m); \
+ x1_m = __lsx_vpackev_h(x1_m, x0_m); \
+ DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, out4); \
+ \
+ DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 2, coeff_m, 3, x2_m, x3_m); \
+ x2_m = __lsx_vneg_h(x2_m); \
+ x2_m = __lsx_vpackev_h(x3_m, x2_m); \
+ DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out6); \
+ \
+ DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, out0); \
+ x2_m = __lsx_vreplvei_h(coeff_m, 2); \
+ x2_m = __lsx_vpackev_h(x2_m, x3_m); \
+ DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out2); \
+ \
+ /* stage2 */ \
+ s1_m = __lsx_vilvl_h(s5_m, s6_m); \
+ s0_m = __lsx_vilvh_h(s5_m, s6_m); \
+ \
+ DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, s6_m); \
+ DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, s5_m); \
+ \
+ /* stage3 */ \
+ LSX_BUTTERFLY_4_H(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m); \
+ \
+ /* stage4 */ \
+ DUP2_ARG2(__lsx_vilvh_h, x3_m, x0_m, x2_m, x1_m, s4_m, s6_m); \
+ DUP2_ARG2(__lsx_vilvl_h, x3_m, x0_m, x2_m, x1_m, s5_m, s7_m); \
+ \
+ DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 4, coeff_m, 5, x0_m, x1_m); \
+ x1_m = __lsx_vpackev_h(x0_m, x1_m); \
+ DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m, out1); \
+ \
+ DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 6, coeff_m, 7, x2_m, x3_m); \
+ x2_m = __lsx_vpackev_h(x3_m, x2_m); \
+ DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out5); \
+ \
+ x1_m = __lsx_vreplvei_h(coeff_m, 5); \
+ x0_m = __lsx_vneg_h(x0_m); \
+ x0_m = __lsx_vpackev_h(x1_m, x0_m); \
+ DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m, out7); \
+ x2_m = __lsx_vreplvei_h(coeff_m, 6); \
+ x3_m = __lsx_vneg_h(x3_m); \
+ x2_m = __lsx_vpackev_h(x2_m, x3_m); \
+ DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out3); \
+ }
+
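+/* Final fdct8x8 rounding, (x + (x < 0)) >> 1: __lsx_vsrli_h extracts each
+   sign bit and __lsx_vavg_h folds it back into the value. */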
+#define SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7) \
+ { \
+ __m128i vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
+ \
+ DUP4_ARG2(__lsx_vsrli_h, in0, 15, in1, 15, in2, 15, in3, 15, vec0_m, \
+ vec1_m, vec2_m, vec3_m); \
+ DUP4_ARG2(__lsx_vsrli_h, in4, 15, in5, 15, in6, 15, in7, 15, vec4_m, \
+ vec5_m, vec6_m, vec7_m); \
+ DUP4_ARG2(__lsx_vavg_h, vec0_m, in0, vec1_m, in1, vec2_m, in2, vec3_m, \
+ in3, in0, in1, in2, in3); \
+ DUP4_ARG2(__lsx_vavg_h, vec4_m, in4, vec5_m, in5, vec6_m, in6, vec7_m, \
+ in7, in4, in5, in6, in7); \
+ }
+
#define FDCT32_POSTPROC_2V_POS_H(vec0, vec1) \
{ \
__m128i tp0_m, tp1_m; \
add_proto qw/void vpx_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
} else {
add_proto qw/void vpx_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vpx_fdct4x4 neon sse2 msa/;
+ specialize qw/vpx_fdct4x4 neon sse2 msa lsx/;
add_proto qw/void vpx_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vpx_fdct4x4_1 sse2 neon/;
add_proto qw/void vpx_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vpx_fdct8x8 sse2 neon msa/, "$ssse3_x86_64";
+ specialize qw/vpx_fdct8x8 sse2 neon msa lsx/, "$ssse3_x86_64";
add_proto qw/void vpx_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vpx_fdct8x8_1 sse2 neon msa/;