make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 3, VPX_BITS_8)));
#endif // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
-#if CONFIG_USE_X86INC && HAVE_MMX && !CONFIG_VP9_HIGHBITDEPTH && \
- !CONFIG_EMULATE_HARDWARE
-INSTANTIATE_TEST_CASE_P(
- MMX, Trans4x4WHT,
- ::testing::Values(
- make_tuple(&vp9_fwht4x4_mmx, &vpx_iwht4x4_16_add_c, 0, VPX_BITS_8)));
-#endif
-
-#if CONFIG_USE_X86INC && HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && \
- !CONFIG_EMULATE_HARDWARE
+#if CONFIG_USE_X86INC && HAVE_SSE2 && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(
SSE2, Trans4x4WHT,
::testing::Values(
+ make_tuple(&vp9_fwht4x4_sse2, &vpx_iwht4x4_16_add_c, 0, VPX_BITS_8),  // SSE2 forward WHT paired with the C inverse
make_tuple(&vp9_fwht4x4_c, &vpx_iwht4x4_16_add_sse2, 0, VPX_BITS_8)));  // C forward WHT paired with the SSE2 inverse
#endif
specialize qw/vp10_fht16x16 sse2/;
add_proto qw/void vp10_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp10_fwht4x4/, "$mmx_x86inc";
+ specialize qw/vp10_fwht4x4/, "$sse2_x86inc";
} else {
add_proto qw/void vp10_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/vp10_fht4x4 sse2 msa/;
specialize qw/vp10_fht16x16 sse2 msa/;
add_proto qw/void vp10_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp10_fwht4x4 msa/, "$mmx_x86inc";
+ specialize qw/vp10_fwht4x4 msa/, "$sse2_x86inc";
}
# Inverse transform
+++ /dev/null
-;
-; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-%define private_prefix vp10
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-
-%macro TRANSFORM_COLS 0
- paddw m0, m1
- movq m4, m0
- psubw m3, m2
- psubw m4, m3
- psraw m4, 1
- movq m5, m4
- psubw m5, m1 ;b1
- psubw m4, m2 ;c1
- psubw m0, m4
- paddw m3, m5
- ; m0 a0
- SWAP 1, 4 ; m1 c1
- SWAP 2, 3 ; m2 d1
- SWAP 3, 5 ; m3 b1
-%endmacro
-
-%macro TRANSPOSE_4X4 0
- movq m4, m0
- movq m5, m2
- punpcklwd m4, m1
- punpckhwd m0, m1
- punpcklwd m5, m3
- punpckhwd m2, m3
- movq m1, m4
- movq m3, m0
- punpckldq m1, m5
- punpckhdq m4, m5
- punpckldq m3, m2
- punpckhdq m0, m2
- SWAP 2, 3, 0, 1, 4
-%endmacro
-
-INIT_MMX mmx
-cglobal fwht4x4, 3, 4, 8, input, output, stride
- lea r3q, [inputq + strideq*4]
- movq m0, [inputq] ;a1
- movq m1, [inputq + strideq*2] ;b1
- movq m2, [r3q] ;c1
- movq m3, [r3q + strideq*2] ;d1
-
- TRANSFORM_COLS
- TRANSPOSE_4X4
- TRANSFORM_COLS
- TRANSPOSE_4X4
-
- psllw m0, 2
- psllw m1, 2
- psllw m2, 2
- psllw m3, 2
-
-%if CONFIG_VP9_HIGHBITDEPTH
- pxor m4, m4
- pxor m5, m5
- pcmpgtw m4, m0
- pcmpgtw m5, m1
- movq m6, m0
- movq m7, m1
- punpcklwd m0, m4
- punpcklwd m1, m5
- punpckhwd m6, m4
- punpckhwd m7, m5
- movq [outputq], m0
- movq [outputq + 8], m6
- movq [outputq + 16], m1
- movq [outputq + 24], m7
- pxor m4, m4
- pxor m5, m5
- pcmpgtw m4, m2
- pcmpgtw m5, m3
- movq m6, m2
- movq m7, m3
- punpcklwd m2, m4
- punpcklwd m3, m5
- punpckhwd m6, m4
- punpckhwd m7, m5
- movq [outputq + 32], m2
- movq [outputq + 40], m6
- movq [outputq + 48], m3
- movq [outputq + 56], m7
-%else
- movq [outputq], m0
- movq [outputq + 8], m1
- movq [outputq + 16], m2
- movq [outputq + 24], m3
-%endif
-
- RET
--- /dev/null
+;
+; Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%define private_prefix vp10
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; TRANSFORM_COLS: one 1-D pass of the 4-point Walsh-Hadamard butterfly.
+;   in:  m0 = a1, m1 = b1, m2 = c1, m3 = d1
+;   out: m0 = a1, m1 = c1, m2 = d1, m3 = b1 (via the SWAP renames at the end)
+;   m4 and m5 are used as temporaries and hold stale data afterwards.
+;   The register copies use movq (low 64 bits only), so only the low four
+;   16-bit lanes of the results are meaningful.
+%macro TRANSFORM_COLS 0
+ paddw m0, m1 ; a1 += b1
+ movq m4, m0
+ psubw m3, m2 ; d1 -= c1
+ psubw m4, m3 ; a1 - d1
+ psraw m4, 1 ; e1 = (a1 - d1) >> 1 (arithmetic shift)
+ movq m5, m4
+ psubw m5, m1 ; b1 = e1 - b1
+ psubw m4, m2 ; c1 = e1 - c1
+ psubw m0, m4 ; a1 -= c1
+ paddw m3, m5 ; d1 += b1
+ ; m0 a1
+ SWAP 1, 4 ; m1 c1
+ SWAP 2, 3 ; m2 d1
+ SWAP 3, 5 ; m3 b1
+%endmacro
+
+; TRANSPOSE_4X4: transpose a 4x4 block of 16-bit values.
+;   in:  the low four words of m0..m3 hold rows 0..3 ("rc" below means the
+;        element at row r, column c)
+;   out: m0 = column 0 followed by column 1, m1 = column 2 followed by
+;        column 3, i.e. two rows of the transposed block per register
+;   m2 is overwritten with the interleaved rows 2/3; m3 is only read.
+%macro TRANSPOSE_4X4 0
+ ; 00 01 02 03
+ ; 10 11 12 13
+ ; 20 21 22 23
+ ; 30 31 32 33
+ punpcklwd m0, m1 ; 00 10 01 11 02 12 03 13
+ punpcklwd m2, m3 ; 20 30 21 31 22 32 23 33
+ mova m1, m0 ; keep a copy for the high-dword interleave
+ punpckldq m0, m2 ; 00 10 20 30 01 11 21 31
+ punpckhdq m1, m2 ; 02 12 22 32 03 13 23 33
+%endmacro
+
+INIT_XMM sse2
+; void vp10_fwht4x4_sse2(const int16_t *input, tran_low_t *output, int stride)
+;
+; Forward 4x4 Walsh-Hadamard transform: runs TRANSFORM_COLS over the four
+; input rows, transposes, runs it again, transposes again, multiplies every
+; coefficient by 4 and stores the 16 results contiguously at output.
+;   input  - four rows of four int16_t values, rows `stride` elements apart
+;   output - 16 coefficients; written with aligned stores (mova), so the
+;            buffer must be 16-byte aligned
+;   stride - row pitch of input in int16_t units
+; Only m0-m5 are touched, so six XMM registers are declared; declaring more
+; would make x86inc save/restore xmm6/xmm7 on Win64 for nothing.
+cglobal fwht4x4, 3, 4, 6, input, output, stride
+ ; stride is a C int: on x86-64 the upper half of its register is not
+ ; guaranteed to be valid, so sign-extend before using it in addresses
+ ; (expands to nothing on x86-32).
+ movsxdifnidn strideq, strided
+ lea r3q, [inputq + strideq*4] ; r3 = address of row 2 (2 bytes per element)
+ movq m0, [inputq] ;a1
+ movq m1, [inputq + strideq*2] ;b1
+ movq m2, [r3q] ;c1
+ movq m3, [r3q + strideq*2] ;d1
+
+ ; first pass: registers are rows, so each word lane is one column
+ TRANSFORM_COLS
+ TRANSPOSE_4X4 ; m0 = transposed rows 0|1, m1 = transposed rows 2|3
+ SWAP 1, 2 ; m2 = transposed rows 2|3, m1 is free
+ psrldq m1, m0, 8 ; m1 = transposed row 1 in the low half
+ psrldq m3, m2, 8 ; m3 = transposed row 3 in the low half
+ ; second pass over the transposed data
+ TRANSFORM_COLS
+ TRANSPOSE_4X4 ; m0 = coefficients 0-7, m1 = coefficients 8-15
+
+ ; multiply every coefficient by 4
+ psllw m0, 2
+ psllw m1, 2
+
+%if CONFIG_VP9_HIGHBITDEPTH
+ ; sign extension: duplicate each word into both halves of a dword, then an
+ ; arithmetic right shift by 16 leaves the sign-extended 32-bit value
+ mova m2, m0
+ mova m3, m1
+ punpcklwd m0, m0
+ punpcklwd m1, m1
+ punpckhwd m2, m2
+ punpckhwd m3, m3
+ psrad m0, 16
+ psrad m1, 16
+ psrad m2, 16
+ psrad m3, 16
+ ; 16 coefficients as 32-bit values (64 bytes)
+ mova [outputq], m0
+ mova [outputq + 16], m2
+ mova [outputq + 32], m1
+ mova [outputq + 48], m3
+%else
+ ; 16 coefficients as 16-bit values (32 bytes)
+ mova [outputq], m0
+ mova [outputq + 16], m1
+%endif
+
+ RET
endif
ifeq ($(CONFIG_USE_X86INC),yes)
-VP10_CX_SRCS-$(HAVE_MMX) += encoder/x86/dct_mmx.asm
+VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/error_sse2.asm
endif
endif
endif
-VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.c
+VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_intrin_sse2.c
VP10_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/dct_ssse3.c
ifeq ($(CONFIG_VP9_TEMPORAL_DENOISING),yes)
specialize qw/vp9_fht16x16 sse2/;
add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp9_fwht4x4/, "$mmx_x86inc";
+ specialize qw/vp9_fwht4x4/, "$sse2_x86inc";
} else {
add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/vp9_fht4x4 sse2 msa/;
specialize qw/vp9_fht16x16 sse2 msa/;
add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp9_fwht4x4 msa/, "$mmx_x86inc";
+ specialize qw/vp9_fwht4x4 msa/, "$sse2_x86inc";
}
#
+++ /dev/null
-;
-; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-%define private_prefix vp9
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-
-%macro TRANSFORM_COLS 0
- paddw m0, m1
- movq m4, m0
- psubw m3, m2
- psubw m4, m3
- psraw m4, 1
- movq m5, m4
- psubw m5, m1 ;b1
- psubw m4, m2 ;c1
- psubw m0, m4
- paddw m3, m5
- ; m0 a0
- SWAP 1, 4 ; m1 c1
- SWAP 2, 3 ; m2 d1
- SWAP 3, 5 ; m3 b1
-%endmacro
-
-%macro TRANSPOSE_4X4 0
- movq m4, m0
- movq m5, m2
- punpcklwd m4, m1
- punpckhwd m0, m1
- punpcklwd m5, m3
- punpckhwd m2, m3
- movq m1, m4
- movq m3, m0
- punpckldq m1, m5
- punpckhdq m4, m5
- punpckldq m3, m2
- punpckhdq m0, m2
- SWAP 2, 3, 0, 1, 4
-%endmacro
-
-INIT_MMX mmx
-cglobal fwht4x4, 3, 4, 8, input, output, stride
- lea r3q, [inputq + strideq*4]
- movq m0, [inputq] ;a1
- movq m1, [inputq + strideq*2] ;b1
- movq m2, [r3q] ;c1
- movq m3, [r3q + strideq*2] ;d1
-
- TRANSFORM_COLS
- TRANSPOSE_4X4
- TRANSFORM_COLS
- TRANSPOSE_4X4
-
- psllw m0, 2
- psllw m1, 2
- psllw m2, 2
- psllw m3, 2
-
-%if CONFIG_VP9_HIGHBITDEPTH
- pxor m4, m4
- pxor m5, m5
- pcmpgtw m4, m0
- pcmpgtw m5, m1
- movq m6, m0
- movq m7, m1
- punpcklwd m0, m4
- punpcklwd m1, m5
- punpckhwd m6, m4
- punpckhwd m7, m5
- movq [outputq], m0
- movq [outputq + 8], m6
- movq [outputq + 16], m1
- movq [outputq + 24], m7
- pxor m4, m4
- pxor m5, m5
- pcmpgtw m4, m2
- pcmpgtw m5, m3
- movq m6, m2
- movq m7, m3
- punpcklwd m2, m4
- punpcklwd m3, m5
- punpckhwd m6, m4
- punpckhwd m7, m5
- movq [outputq + 32], m2
- movq [outputq + 40], m6
- movq [outputq + 48], m3
- movq [outputq + 56], m7
-%else
- movq [outputq], m0
- movq [outputq + 8], m1
- movq [outputq + 16], m2
- movq [outputq + 24], m3
-%endif
-
- RET
--- /dev/null
+;
+; Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%define private_prefix vp9
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; TRANSFORM_COLS: one 1-D pass of the 4-point Walsh-Hadamard butterfly.
+;   in:  m0 = a1, m1 = b1, m2 = c1, m3 = d1
+;   out: m0 = a1, m1 = c1, m2 = d1, m3 = b1 (via the SWAP renames at the end)
+;   m4 and m5 are used as temporaries and hold stale data afterwards.
+;   The register copies use movq (low 64 bits only), so only the low four
+;   16-bit lanes of the results are meaningful.
+%macro TRANSFORM_COLS 0
+ paddw m0, m1 ; a1 += b1
+ movq m4, m0
+ psubw m3, m2 ; d1 -= c1
+ psubw m4, m3 ; a1 - d1
+ psraw m4, 1 ; e1 = (a1 - d1) >> 1 (arithmetic shift)
+ movq m5, m4
+ psubw m5, m1 ; b1 = e1 - b1
+ psubw m4, m2 ; c1 = e1 - c1
+ psubw m0, m4 ; a1 -= c1
+ paddw m3, m5 ; d1 += b1
+ ; m0 a1
+ SWAP 1, 4 ; m1 c1
+ SWAP 2, 3 ; m2 d1
+ SWAP 3, 5 ; m3 b1
+%endmacro
+
+; TRANSPOSE_4X4: transpose a 4x4 block of 16-bit values.
+;   in:  the low four words of m0..m3 hold rows 0..3 ("rc" below means the
+;        element at row r, column c)
+;   out: m0 = column 0 followed by column 1, m1 = column 2 followed by
+;        column 3, i.e. two rows of the transposed block per register
+;   m2 is overwritten with the interleaved rows 2/3; m3 is only read.
+%macro TRANSPOSE_4X4 0
+ ; 00 01 02 03
+ ; 10 11 12 13
+ ; 20 21 22 23
+ ; 30 31 32 33
+ punpcklwd m0, m1 ; 00 10 01 11 02 12 03 13
+ punpcklwd m2, m3 ; 20 30 21 31 22 32 23 33
+ mova m1, m0 ; keep a copy for the high-dword interleave
+ punpckldq m0, m2 ; 00 10 20 30 01 11 21 31
+ punpckhdq m1, m2 ; 02 12 22 32 03 13 23 33
+%endmacro
+
+INIT_XMM sse2
+; void vp9_fwht4x4_sse2(const int16_t *input, tran_low_t *output, int stride)
+;
+; Forward 4x4 Walsh-Hadamard transform: runs TRANSFORM_COLS over the four
+; input rows, transposes, runs it again, transposes again, multiplies every
+; coefficient by 4 and stores the 16 results contiguously at output.
+;   input  - four rows of four int16_t values, rows `stride` elements apart
+;   output - 16 coefficients; written with aligned stores (mova), so the
+;            buffer must be 16-byte aligned
+;   stride - row pitch of input in int16_t units
+; Only m0-m5 are touched, so six XMM registers are declared; declaring more
+; would make x86inc save/restore xmm6/xmm7 on Win64 for nothing.
+cglobal fwht4x4, 3, 4, 6, input, output, stride
+; TODO(linfeng): The duplication with vp10 should be resolved.
+ ; stride is a C int: on x86-64 the upper half of its register is not
+ ; guaranteed to be valid, so sign-extend before using it in addresses
+ ; (expands to nothing on x86-32).
+ movsxdifnidn strideq, strided
+ lea r3q, [inputq + strideq*4] ; r3 = address of row 2 (2 bytes per element)
+ movq m0, [inputq] ;a1
+ movq m1, [inputq + strideq*2] ;b1
+ movq m2, [r3q] ;c1
+ movq m3, [r3q + strideq*2] ;d1
+
+ ; first pass: registers are rows, so each word lane is one column
+ TRANSFORM_COLS
+ TRANSPOSE_4X4 ; m0 = transposed rows 0|1, m1 = transposed rows 2|3
+ SWAP 1, 2 ; m2 = transposed rows 2|3, m1 is free
+ psrldq m1, m0, 8 ; m1 = transposed row 1 in the low half
+ psrldq m3, m2, 8 ; m3 = transposed row 3 in the low half
+ ; second pass over the transposed data
+ TRANSFORM_COLS
+ TRANSPOSE_4X4 ; m0 = coefficients 0-7, m1 = coefficients 8-15
+
+ ; multiply every coefficient by 4
+ psllw m0, 2
+ psllw m1, 2
+
+%if CONFIG_VP9_HIGHBITDEPTH
+ ; sign extension: duplicate each word into both halves of a dword, then an
+ ; arithmetic right shift by 16 leaves the sign-extended 32-bit value
+ mova m2, m0
+ mova m3, m1
+ punpcklwd m0, m0
+ punpcklwd m1, m1
+ punpckhwd m2, m2
+ punpckhwd m3, m3
+ psrad m0, 16
+ psrad m1, 16
+ psrad m2, 16
+ psrad m3, 16
+ ; 16 coefficients as 32-bit values (64 bytes)
+ mova [outputq], m0
+ mova [outputq + 16], m2
+ mova [outputq + 32], m1
+ mova [outputq + 48], m3
+%else
+ ; 16 coefficients as 16-bit values (32 bytes)
+ mova [outputq], m0
+ mova [outputq + 16], m1
+%endif
+
+ RET
endif
ifeq ($(CONFIG_USE_X86INC),yes)
-VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.asm
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_error_sse2.asm
VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_highbd_error_avx.asm
endif
endif
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.c
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_intrin_sse2.c
VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_dct_ssse3.c
ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_frame_scale_ssse3.c