Code clean of highbd_tm_predictor_4x4
authorJian Zhou <zhoujian@google.com>
Sat, 19 Dec 2015 02:43:41 +0000 (18:43 -0800)
committerJian Zhou <zhoujian@google.com>
Sat, 19 Dec 2015 02:43:41 +0000 (18:43 -0800)
Replace MMX with SSE2, reduce mem access to left neighbor,
loop unrolled.

Change-Id: I941be915af809025f121ecc6c6443f73c9903e70

test/vp9_intrapred_test.cc
vpx_dsp/vpx_dsp_rtcd_defs.pl
vpx_dsp/x86/highbd_intrapred_sse2.asm

index ad3327e..66a23cb 100644 (file)
@@ -155,7 +155,7 @@ INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, VP9IntraPredTest,
                                        &vpx_highbd_v_predictor_16x16_c, 16, 8),
                             make_tuple(&vpx_highbd_v_predictor_32x32_sse2,
                                        &vpx_highbd_v_predictor_32x32_c, 32, 8),
-                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse2,
                                        &vpx_highbd_tm_predictor_4x4_c, 4, 8),
                             make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
                                        &vpx_highbd_tm_predictor_8x8_c, 8, 8)));
@@ -176,7 +176,7 @@ INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, VP9IntraPredTest,
                                        &vpx_highbd_v_predictor_16x16_c, 16, 8),
                             make_tuple(&vpx_highbd_v_predictor_32x32_sse2,
                                        &vpx_highbd_v_predictor_32x32_c, 32, 8),
-                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse2,
                                        &vpx_highbd_tm_predictor_4x4_c, 4, 8),
                             make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
                                        &vpx_highbd_tm_predictor_8x8_c, 8, 8)));
@@ -211,7 +211,7 @@ INSTANTIATE_TEST_CASE_P(SSE2_TO_C_10, VP9IntraPredTest,
                             make_tuple(&vpx_highbd_v_predictor_32x32_sse2,
                                        &vpx_highbd_v_predictor_32x32_c, 32,
                                        10),
-                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse2,
                                        &vpx_highbd_tm_predictor_4x4_c, 4, 10),
                             make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
                                        &vpx_highbd_tm_predictor_8x8_c, 8, 10)));
@@ -233,7 +233,7 @@ INSTANTIATE_TEST_CASE_P(SSE2_TO_C_10, VP9IntraPredTest,
                                        &vpx_highbd_v_predictor_16x16_c, 16, 10),
                             make_tuple(&vpx_highbd_v_predictor_32x32_sse2,
                                        &vpx_highbd_v_predictor_32x32_c, 32, 10),
-                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse2,
                                        &vpx_highbd_tm_predictor_4x4_c, 4, 10),
                             make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
                                        &vpx_highbd_tm_predictor_8x8_c, 8, 10)));
@@ -268,7 +268,7 @@ INSTANTIATE_TEST_CASE_P(SSE2_TO_C_12, VP9IntraPredTest,
                             make_tuple(&vpx_highbd_v_predictor_32x32_sse2,
                                        &vpx_highbd_v_predictor_32x32_c, 32,
                                        12),
-                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse2,
                                        &vpx_highbd_tm_predictor_4x4_c, 4, 12),
                             make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
                                        &vpx_highbd_tm_predictor_8x8_c, 8, 12)));
@@ -290,7 +290,7 @@ INSTANTIATE_TEST_CASE_P(SSE2_TO_C_12, VP9IntraPredTest,
                                        &vpx_highbd_v_predictor_16x16_c, 16, 12),
                             make_tuple(&vpx_highbd_v_predictor_32x32_sse2,
                                        &vpx_highbd_v_predictor_32x32_c, 32, 12),
-                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse,
+                            make_tuple(&vpx_highbd_tm_predictor_4x4_sse2,
                                        &vpx_highbd_tm_predictor_4x4_c, 4, 12),
                             make_tuple(&vpx_highbd_tm_predictor_8x8_sse2,
                                        &vpx_highbd_tm_predictor_8x8_c, 8, 12)));
index 8979135..04c4b73 100644 (file)
@@ -291,7 +291,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   specialize qw/vpx_highbd_v_predictor_4x4/, "$sse_x86inc";
 
   add_proto qw/void vpx_highbd_tm_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  specialize qw/vpx_highbd_tm_predictor_4x4/, "$sse_x86inc";
+  specialize qw/vpx_highbd_tm_predictor_4x4/, "$sse2_x86inc";
 
   add_proto qw/void vpx_highbd_dc_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_predictor_4x4/, "$sse_x86inc";
index b12d29c..712ab49 100644 (file)
@@ -261,43 +261,44 @@ cglobal highbd_v_predictor_32x32, 3, 4, 4, dst, stride, above
   jnz .loop
   REP_RET
 
-INIT_MMX sse
-cglobal highbd_tm_predictor_4x4, 5, 6, 5, dst, stride, above, left, bps, one
+INIT_XMM sse2
+cglobal highbd_tm_predictor_4x4, 5, 5, 6, dst, stride, above, left, bps
   movd                  m1, [aboveq-2]
   movq                  m0, [aboveq]
-  pshufw                m1, m1, 0x0
+  pshuflw               m1, m1, 0x0
+  movlhps               m0, m0         ; t1 t2 t3 t4 t1 t2 t3 t4
+  movlhps               m1, m1         ; tl tl tl tl tl tl tl tl
   ; Get the values to compute the maximum value at this bit depth
-  mov                 oned, 1
-  movd                  m3, oned
+  pcmpeqw               m3, m3
   movd                  m4, bpsd
-  pshufw                m3, m3, 0x0
-  DEFINE_ARGS dst, stride, line, left
-  mov                lineq, -2
-  mova                  m2, m3
+  psubw                 m0, m1         ; t1-tl t2-tl t3-tl t4-tl
   psllw                 m3, m4
-  add                leftq, 8
-  psubw                 m3, m2 ; max possible value
-  pxor                  m4, m4 ; min possible value
-  psubw                 m0, m1
-.loop:
-  movq                  m1, [leftq+lineq*4]
-  movq                  m2, [leftq+lineq*4+2]
-  pshufw                m1, m1, 0x0
-  pshufw                m2, m2, 0x0
-  paddw                 m1, m0
+  pcmpeqw               m2, m2
+  pxor                  m4, m4         ; min possible value
+  pxor                  m3, m2         ; max possible value
+  mova                  m1, [leftq]
+  pshuflw               m2, m1, 0x0
+  pshuflw               m5, m1, 0x55
+  movlhps               m2, m5         ; l1 l1 l1 l1 l2 l2 l2 l2
   paddw                 m2, m0
   ;Clamp to the bit-depth
-  pminsw                m1, m3
   pminsw                m2, m3
-  pmaxsw                m1, m4
   pmaxsw                m2, m4
   ;Store the values
-  movq    [dstq          ], m1
-  movq    [dstq+strideq*2], m2
+  movq    [dstq          ], m2
+  movhpd  [dstq+strideq*2], m2
   lea                 dstq, [dstq+strideq*4]
-  inc                lineq
-  jnz .loop
-  REP_RET
+  pshuflw               m2, m1, 0xaa
+  pshuflw               m5, m1, 0xff
+  movlhps               m2, m5
+  paddw                 m2, m0
+  ;Clamp to the bit-depth
+  pminsw                m2, m3
+  pmaxsw                m2, m4
+  ;Store the values
+  movq    [dstq          ], m2
+  movhpd  [dstq+strideq*2], m2
+  RET
 
 INIT_XMM sse2
 cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one