Use pmovmskb to skip quantize loops over empty coefficients.
authorRonald S. Bultje <rbultje@google.com>
Mon, 1 Jul 2013 19:03:20 +0000 (12:03 -0700)
committerRonald S. Bultje <rbultje@google.com>
Tue, 2 Jul 2013 23:34:24 +0000 (16:34 -0700)
If none of the 16 coefficients that we quantize per loop iteration
are larger than the zbin, directly skip to the next round of coeffs,
rather than doing a full quantize loop that will eventually result
in 16 zeroes. This incurs a jump cost, but saves a lot of other work.
32x32 quant goes from 1349 -> 1184 cycles. The same approach yielded
no significantly positive results for smaller transforms, so is not
used there (8x8: 103 -> 101 cycles; 16x16: 302 -> 306 cycles).

Change-Id: I8fca17dc2543fc8eed1dbcd5100145e3c3a9b647

vp9/encoder/x86/vp9_quantize_ssse3.asm

index b666abb..60f7991 100644 (file)
@@ -15,10 +15,10 @@ pw_1: times 8 dw 1
 
 SECTION .text
 
-%macro QUANTIZE_FN 1
-cglobal quantize_%1, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \
-                               shift, qcoeff, dqcoeff, dequant, zbin_oq, \
-                               eob, scan, iscan
+%macro QUANTIZE_FN 2
+cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
+                                shift, qcoeff, dqcoeff, dequant, zbin_oq, \
+                                eob, scan, iscan
   cmp                    dword skipm, 0
   jne .blank
 
@@ -43,9 +43,8 @@ cglobal quantize_%1, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \
   mova                            m4, [r2]                 ; m4 = shift
   mov                             r4, dqcoeffmp
   mov                             r5, iscanmp
-  mov                             r2, eobmp
   pxor                            m5, m5                   ; m5 = dedicated zero
-  DEFINE_ARGS coeff, ncoeff, eob, qcoeff, dqcoeff, iscan
+  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob
   lea                         coeffq, [  coeffq+ncoeffq*2]
   lea                         iscanq, [  iscanq+ncoeffq*2]
   lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
@@ -119,6 +118,12 @@ cglobal quantize_%1, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \
 %endif
   pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
   pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
+%ifidn %1, b_32x32
+  pmovmskb                        r6, m7
+  pmovmskb                        r2, m12
+  or                              r6, r2
+  jz .skip_iter
+%endif
   paddw                           m6, m1                   ; m6 += round
   paddw                          m11, m1                   ; m11 += round
   pmulhw                         m14, m6, m2               ; m14 = m6*q>>16
@@ -159,16 +164,27 @@ cglobal quantize_%1, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \
   pmaxsw                          m8, m13
   add                        ncoeffq, mmsize
   jl .ac_only_loop
+%ifidn %1, b_32x32
+  jmp .accumulate_eob
+.skip_iter:
+  mova        [qcoeffq+ncoeffq*2+ 0], m5
+  mova        [qcoeffq+ncoeffq*2+16], m5
+  mova       [dqcoeffq+ncoeffq*2+ 0], m5
+  mova       [dqcoeffq+ncoeffq*2+16], m5
+  add                        ncoeffq, mmsize
+  jl .ac_only_loop
+%endif
 
 .accumulate_eob:
   ; horizontally accumulate/max eobs and write into [eob] memory pointer
+  mov                             r2, eobmp
   pshufd                          m7, m8, 0xe
   pmaxsw                          m8, m7
   pshuflw                         m7, m8, 0xe
   pmaxsw                          m8, m7
   pshuflw                         m7, m8, 0x1
   pmaxsw                          m8, m7
-  pextrw                      [eobq], m8, 0
+  pextrw                        [r2], m8, 0
   RET
 
   ; skip-block, i.e. just write all zeroes
@@ -194,5 +210,5 @@ cglobal quantize_%1, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \
 %endmacro
 
 INIT_XMM ssse3
-QUANTIZE_FN b
-QUANTIZE_FN b_32x32
+QUANTIZE_FN b, 6
+QUANTIZE_FN b_32x32, 7