From: Fritz Koenig <frkoenig@google.com>
Date: Thu, 21 Oct 2010 17:53:15 +0000 (-0700)
Subject: FDCT optimizations.
X-Git-Tag: 1.0_branch~800
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=5f0e0617bad80d06b7231ec11814e52d6f3edba8;p=profile%2Fivi%2Flibvpx.git

FDCT optimizations.

Fixed up the fdct for mmx and 8x4 sse2 to match the
most recent changes.

Change-Id: Ibee2d6c536fe14dcf75cd6eb1c73f4848a56d719
---

diff --git a/vp8/encoder/x86/dct_mmx.asm b/vp8/encoder/x86/dct_mmx.asm
index 5acaca8..f07b030 100644
--- a/vp8/encoder/x86/dct_mmx.asm
+++ b/vp8/encoder/x86/dct_mmx.asm
@@ -11,511 +11,231 @@
 
 %include "vpx_ports/x86_abi_support.asm"
 
-section .text
-    global sym(vp8_short_fdct4x4_mmx)
-    global sym(vp8_short_fdct8x4_wmt)
-
-
-%define         DCTCONSTANTSBITS         (16)
-%define         DCTROUNDINGVALUE         (1<< (DCTCONSTANTSBITS-1))
-%define         x_c1                      (60547)          ; cos(pi  /8) * (1<<15)
-%define         x_c2                      (46341)          ; cos(pi*2/8) * (1<<15)
-%define         x_c3                      (25080)          ; cos(pi*3/8) * (1<<15)
-
-
 ;void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch)
+global sym(vp8_short_fdct4x4_mmx)
 sym(vp8_short_fdct4x4_mmx):
     push        rbp
-    mov         rbp, rsp
+    mov         rbp,        rsp
     SHADOW_ARGS_TO_STACK 3
     GET_GOT     rbx
-    push rsi
-    push rdi
+    push        rsi
+    push        rdi
     ; end prolog
-        mov     rsi,    arg(0) ;input
-        mov     rdi,    arg(1) ;output
-
-        lea     rdx,    [GLOBAL(dct_const_mmx)]
-        movsxd  rax,    dword ptr arg(2) ;pitch
-
-        lea     rcx,    [rsi + rax*2]
-        ; read the input data
-        movq    mm0,    [rsi]
-        movq    mm1,    [rsi + rax    ]
-
-        movq    mm2,    [rcx]
-        movq    mm3,    [rcx + rax]
-        ; get the constants
-        ;shift to left by 1 for prescision
-        psllw   mm0,    3
-        psllw   mm1,    3
-
-        psllw   mm2,    3
-        psllw   mm3,    3
-
-        ; transpose for the second stage
-        movq    mm4,    mm0         ; 00 01 02 03
-        movq    mm5,    mm2         ; 10 11 12 03
-
-        punpcklwd   mm0,    mm1     ; 00 10 01 11
-        punpckhwd   mm4,    mm1     ; 02 12 03 13
-
-        punpcklwd   mm2,    mm3     ; 20 30 21 31
-        punpckhwd   mm5,    mm3     ; 22 32 23 33
-
-
-        movq        mm1,    mm0     ; 00 10 01 11
-        punpckldq   mm0,    mm2     ; 00 10 20 30
-
-        punpckhdq   mm1,    mm2     ; 01 11 21 31
-
-        movq        mm2,    mm4     ; 02 12 03 13
-        punpckldq   mm2,    mm5     ; 02 12 22 32
-
-        punpckhdq   mm4,    mm5     ; 03 13 23 33
-        movq        mm3,    mm4
-
-
-        ; first stage
-        movq    mm5,    mm0
-        movq    mm4,    mm1
-
-        paddw   mm0,    mm3         ; a = 0 + 3
-        paddw   mm1,    mm2         ; b = 1 + 2
-
-        psubw   mm4,    mm2         ; c = 1 - 2
-        psubw   mm5,    mm3         ; d = 0 - 3
-
-
-        ; output 0 and 2
-        movq    mm6,    [rdx +  16] ; c2
-        movq    mm2,    mm0         ; a
-
-        paddw   mm0,    mm1         ; a + b
-        psubw   mm2,    mm1         ; a - b
-
-        movq    mm1,    mm0         ; a + b
-        pmulhw  mm0,    mm6         ; 00 01 02 03
-
-        paddw   mm0,    mm1         ; output 00 01 02 03
-        pmulhw  mm6,    mm2         ; 20 21 22 23
-
-        paddw   mm2,    mm6         ; output 20 21 22 23
-
-        ; output 1 and 3
-        movq    mm6,    [rdx +  8]  ; c1
-        movq    mm7,    [rdx + 24]  ; c3
-
-        movq    mm1,    mm4         ; c
-        movq    mm3,    mm5         ; d
-
-        pmulhw  mm1,    mm7         ; c * c3
-        pmulhw  mm3,    mm6         ; d * c1
-
-        paddw   mm3,    mm5         ; d * c1 rounded
-        paddw   mm1,    mm3         ; output 10 11 12 13
-
-        movq    mm3,    mm4         ; c
-        pmulhw  mm5,    mm7         ; d * c3
-
-        pmulhw  mm4,    mm6         ; c * c1
-        paddw   mm3,    mm4         ; round c* c1
-
-        psubw   mm5,    mm3         ; output 30 31 32 33
-        movq    mm3,    mm5
-
-
-        ; done with vertical
-        ; transpose for the second stage
-        movq    mm4,    mm0         ; 00 01 02 03
-        movq    mm5,    mm2         ; 10 11 12 03
-
-        punpcklwd   mm0,    mm1     ; 00 10 01 11
-        punpckhwd   mm4,    mm1     ; 02 12 03 13
-
-        punpcklwd   mm2,    mm3     ; 20 30 21 31
-        punpckhwd   mm5,    mm3     ; 22 32 23 33
-
-
-        movq        mm1,    mm0     ; 00 10 01 11
-        punpckldq   mm0,    mm2     ; 00 10 20 30
-
-        punpckhdq   mm1,    mm2     ; 01 11 21 31
-
-        movq        mm2,    mm4     ; 02 12 03 13
-        punpckldq   mm2,    mm5     ; 02 12 22 32
-
-        punpckhdq   mm4,    mm5     ; 03 13 23 33
-        movq        mm3,    mm4
-
-
-        ; first stage
-        movq    mm5,    mm0
-        movq    mm4,    mm1
 
-        paddw   mm0,    mm3         ; a = 0 + 3
-        paddw   mm1,    mm2         ; b = 1 + 2
+        mov         rsi,        arg(0)      ; input
+        mov         rdi,        arg(1)      ; output
 
-        psubw   mm4,    mm2         ; c = 1 - 2
-        psubw   mm5,    mm3         ; d = 0 - 3
+        movsxd      rax,        dword ptr arg(2) ;pitch
 
-
-        ; output 0 and 2
-        movq    mm6,    [rdx +  16] ; c2
-        movq    mm2,    mm0         ; a
-        paddw   mm0,    mm1         ; a + b
-
-        psubw   mm2,    mm1         ; a - b
-
-        movq    mm1,    mm0         ; a + b
-        pmulhw  mm0,    mm6         ; 00 01 02 03
-
-        paddw   mm0,    mm1         ; output 00 01 02 03
-        pmulhw  mm6,    mm2         ; 20 21 22 23
-
-        paddw   mm2,    mm6         ; output 20 21 22 23
-
-
-        ; output 1 and 3
-        movq    mm6,    [rdx +  8]  ; c1
-        movq    mm7,    [rdx + 24]  ; c3
-
-        movq    mm1,    mm4         ; c
-        movq    mm3,    mm5         ; d
-
-        pmulhw  mm1,    mm7         ; c * c3
-        pmulhw  mm3,    mm6         ; d * c1
-
-        paddw   mm3,    mm5         ; d * c1 rounded
-        paddw   mm1,    mm3         ; output 10 11 12 13
-
-        movq    mm3,    mm4         ; c
-        pmulhw  mm5,    mm7         ; d * c3
-
-        pmulhw  mm4,    mm6         ; c * c1
-        paddw   mm3,    mm4         ; round c* c1
-
-        psubw   mm5,    mm3         ; output 30 31 32 33
-        movq    mm3,    mm5
-        ; done with vertical
-
-        pcmpeqw mm4,    mm4
-        pcmpeqw mm5,    mm5
-        psrlw   mm4,    15
-        psrlw   mm5,    15
-
-        psllw   mm4,    2
-        psllw   mm5,    2
-
-        paddw   mm0,    mm4
-        paddw   mm1,    mm5
-        paddw   mm2,    mm4
-        paddw   mm3,    mm5
-
-        psraw   mm0, 3
-        psraw   mm1, 3
-        psraw   mm2, 3
-        psraw   mm3, 3
-
-        movq        [rdi   ],   mm0
-        movq        [rdi+ 8],   mm1
-        movq        [rdi+16],   mm2
-        movq        [rdi+24],   mm3
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp8_short_fdct8x4_wmt(short *input, short *output, int pitch)
-sym(vp8_short_fdct8x4_wmt):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 3
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-        mov         rsi,    arg(0) ;input
-        mov         rdi,    arg(1) ;output
-
-        lea         rdx,    [GLOBAL(dct_const_xmm)]
-        movsxd      rax,    dword ptr arg(2) ;pitch
-
-        lea         rcx,    [rsi + rax*2]
+        lea         rcx,        [rsi + rax*2]
         ; read the input data
-        movdqa      xmm0,       [rsi]
-        movdqa      xmm2,       [rsi + rax]
-
-        movdqa      xmm4,       [rcx]
-        movdqa      xmm3,       [rcx + rax]
-        ; get the constants
-        ;shift to left by 1 for prescision
-        psllw       xmm0,        3
-        psllw       xmm2,        3
-
-        psllw       xmm4,        3
-        psllw       xmm3,        3
-
-        ; transpose for the second stage
-        movdqa      xmm1,       xmm0         ; 00 01 02 03 04 05 06 07
-        movdqa      xmm5,       xmm4         ; 20 21 22 23 24 25 26 27
-
-        punpcklwd   xmm0,       xmm2         ; 00 10 01 11 02 12 03 13
-        punpckhwd   xmm1,       xmm2         ; 04 14 05 15 06 16 07 17
+        movq        mm0,        [rsi]
+        movq        mm1,        [rsi + rax]
 
-        punpcklwd   xmm4,       xmm3         ; 20 30 21 31 22 32 23 33
-        punpckhwd   xmm5,       xmm3         ; 24 34 25 35 26 36 27 37
+        movq        mm2,        [rcx]
+        movq        mm4,        [rcx + rax]
 
-        movdqa      xmm2,       xmm0         ; 00 10 01 11 02 12 03 13
-        punpckldq   xmm0,       xmm4         ; 00 10 20 30 01 11 21 31
+        ; transpose for the first stage
+        movq        mm3,        mm0         ; 00 01 02 03
+        movq        mm5,        mm2         ; 20 21 22 23
 
-        punpckhdq   xmm2,       xmm4         ; 02 12 22 32 03 13 23 33
+        punpcklwd   mm0,        mm1         ; 00 10 01 11
+        punpckhwd   mm3,        mm1         ; 02 12 03 13
 
+        punpcklwd   mm2,        mm4         ; 20 30 21 31
+        punpckhwd   mm5,        mm4         ; 22 32 23 33
 
-        movdqa      xmm4,       xmm1         ; 04 14 05 15 06 16 07 17
-        punpckldq   xmm4,       xmm5         ; 04 14 24 34 05 15 25 35
+        movq        mm1,        mm0         ; 00 10 01 11
+        punpckldq   mm0,        mm2         ; 00 10 20 30
 
-        punpckhdq   xmm1,       xmm5         ; 06 16 26 36 07 17 27 37
-        movdqa      xmm3,       xmm2         ; 02 12 22 32 03 13 23 33
+        punpckhdq   mm1,        mm2         ; 01 11 21 31
 
-        punpckhqdq  xmm3,       xmm1         ; 03 13 23 33 07 17 27 37
-        punpcklqdq  xmm2,       xmm1         ; 02 12 22 32 06 16 26 36
+        movq        mm2,        mm3         ; 02 12 03 13
+        punpckldq   mm2,        mm5         ; 02 12 22 32
 
-        movdqa      xmm1,       xmm0         ; 00 10 20 30 01 11 21 31
-        punpcklqdq  xmm0,       xmm4         ; 00 10 20 30 04 14 24 34
+        punpckhdq   mm3,        mm5         ; 03 13 23 33
 
-        punpckhqdq  xmm1,       xmm4         ; 01 11 21 32 05 15 25 35
-
-        ; xmm0 0
-        ; xmm1 1
-        ; xmm2 2
-        ; xmm3 3
+        ; mm0 0
+        ; mm1 1
+        ; mm2 2
+        ; mm3 3
 
         ; first stage
-        movdqa      xmm5,       xmm0
-        movdqa      xmm4,       xmm1
-
-        paddw       xmm0,       xmm3         ; a = 0 + 3
-        paddw       xmm1,       xmm2         ; b = 1 + 2
-
-        psubw       xmm4,       xmm2         ; c = 1 - 2
-        psubw       xmm5,       xmm3         ; d = 0 - 3
+        movq        mm5,        mm0
+        movq        mm4,        mm1
 
+        paddw       mm0,        mm3         ; a1 = 0 + 3
+        paddw       mm1,        mm2         ; b1 = 1 + 2
 
-        ; output 0 and 2
-        movdqa      xmm6,       [rdx +  32] ; c2
-        movdqa      xmm2,       xmm0         ; a
+        psubw       mm4,        mm2         ; c1 = 1 - 2
+        psubw       mm5,        mm3         ; d1 = 0 - 3
 
-        paddw       xmm0,       xmm1         ; a + b
-        psubw       xmm2,       xmm1         ; a - b
+        psllw       mm5,        3
+        psllw       mm4,        3
 
-        movdqa      xmm1,       xmm0         ; a + b
-        pmulhw      xmm0,       xmm6         ; 00 01 02 03
+        psllw       mm0,        3
+        psllw       mm1,        3
 
-        paddw       xmm0,       xmm1         ; output 00 01 02 03
-        pmulhw      xmm6,       xmm2         ; 20 21 22 23
+        ; output 0 and 2
+        movq        mm2,        mm0         ; a1
 
-        paddw       xmm2,       xmm6         ; output 20 21 22 23
+        paddw       mm0,        mm1         ; op[0] = a1 + b1
+        psubw       mm2,        mm1         ; op[2] = a1 - b1
 
         ; output 1 and 3
-        movdqa      xmm6,       [rdx + 16]  ; c1
-        movdqa      xmm7,       [rdx + 48]  ; c3
-
-        movdqa      xmm1,       xmm4         ; c
-        movdqa      xmm3,       xmm5         ; d
+        ; interleave c1, d1
+        movq        mm1,        mm5         ; d1
+        punpcklwd   mm1,        mm4         ; c1 d1
+        punpckhwd   mm5,        mm4         ; c1 d1
 
-        pmulhw      xmm1,       xmm7         ; c * c3
-        pmulhw      xmm3,       xmm6         ; d * c1
+        movq        mm3,        mm1
+        movq        mm4,        mm5
 
-        paddw       xmm3,       xmm5         ; d * c1 rounded
-        paddw       xmm1,       xmm3         ; output 10 11 12 13
+        pmaddwd     mm1,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
+        pmaddwd     mm4,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
 
-        movdqa      xmm3,       xmm4         ; c
-        pmulhw      xmm5,       xmm7         ; d * c3
+        pmaddwd     mm3,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
+        pmaddwd     mm5,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
 
-        pmulhw      xmm4,       xmm6         ; c * c1
-        paddw       xmm3,       xmm4         ; round c* c1
+        paddd       mm1,        MMWORD PTR[GLOBAL(_14500)]
+        paddd       mm4,        MMWORD PTR[GLOBAL(_14500)]
+        paddd       mm3,        MMWORD PTR[GLOBAL(_7500)]
+        paddd       mm5,        MMWORD PTR[GLOBAL(_7500)]
 
-        psubw       xmm5,       xmm3         ; output 30 31 32 33
-        movdqa      xmm3,       xmm5
+        psrad       mm1,        12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
+        psrad       mm4,        12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
+        psrad       mm3,        12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
+        psrad       mm5,        12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
 
+        packssdw    mm1,        mm4         ; op[1]
+        packssdw    mm3,        mm5         ; op[3]
 
         ; done with vertical
         ; transpose for the second stage
-        movdqa      xmm4,       xmm2         ; 02 12 22 32 06 16 26 36
-        movdqa      xmm2,       xmm1         ; 01 11 21 31 05 15 25 35
+        movq        mm4,        mm0         ; 00 10 20 30
+        movq        mm5,        mm2         ; 02 12 22 32
 
-        movdqa      xmm1,       xmm0         ; 00 10 20 30 04 14 24 34
-        movdqa      xmm5,       xmm4         ; 02 12 22 32 06 16 26 36
+        punpcklwd   mm0,        mm1         ; 00 01 10 11
+        punpckhwd   mm4,        mm1         ; 20 21 30 31
 
-        punpcklwd   xmm0,       xmm2         ; 00 01 10 11 20 21 30 31
-        punpckhwd   xmm1,       xmm2         ; 04 05 14 15 24 25 34 35
+        punpcklwd   mm2,        mm3         ; 02 03 12 13
+        punpckhwd   mm5,        mm3         ; 22 23 32 33
 
-        punpcklwd   xmm4,       xmm3         ; 02 03 12 13 22 23 32 33
-        punpckhwd   xmm5,       xmm3         ; 06 07 16 17 26 27 36 37
+        movq        mm1,        mm0         ; 00 01 10 11
+        punpckldq   mm0,        mm2         ; 00 01 02 03
 
-        movdqa      xmm2,       xmm0         ; 00 01 10 11 20 21 30 31
-        punpckldq   xmm0,       xmm4         ; 00 01 02 03 10 11 12 13
+        punpckhdq   mm1,        mm2         ; 10 11 12 13
 
-        punpckhdq   xmm2,       xmm4         ; 20 21 22 23 30 31 32 33
+        movq        mm2,        mm4         ; 20 21 30 31
+        punpckldq   mm2,        mm5         ; 20 21 22 23
 
+        punpckhdq   mm4,        mm5         ; 30 31 32 33
 
-        movdqa      xmm4,       xmm1         ; 04 05 14 15 24 25 34 35
-        punpckldq   xmm4,       xmm5         ; 04 05 06 07 14 15 16 17
+        ; mm0 0
+        ; mm1 1
+        ; mm2 2
+        ; mm4 3
 
-        punpckhdq   xmm1,       xmm5         ; 24 25 26 27 34 35 36 37
-        movdqa      xmm3,       xmm2         ; 20 21 22 23 30 31 32 33
+        movq        mm5,        mm0
+        movq        mm3,        mm1
 
-        punpckhqdq  xmm3,       xmm1         ; 30 31 32 33 34 35 36 37
-        punpcklqdq  xmm2,       xmm1         ; 20 21 22 23 24 25 26 27
+        paddw       mm0,        mm4         ; a1 = 0 + 3
+        paddw       mm1,        mm2         ; b1 = 1 + 2
 
-        movdqa      xmm1,       xmm0         ; 00 01 02 03 10 11 12 13
-        punpcklqdq  xmm0,       xmm4         ; 00 01 02 03 04 05 06 07
+        psubw       mm3,        mm2         ; c1 = 1 - 2
+        psubw       mm5,        mm4         ; d1 = 0 - 3
 
-        punpckhqdq  xmm1,       xmm4         ; 10 11 12 13 14 15 16 17
+        pxor        mm6,        mm6         ; zero out for compare
 
-        ; first stage
-        movdqa      xmm5,       xmm0
-        movdqa      xmm4,       xmm1
-
-        paddw       xmm0,       xmm3         ; a = 0 + 3
-        paddw       xmm1,       xmm2         ; b = 1 + 2
-
-        psubw       xmm4,       xmm2         ; c = 1 - 2
-        psubw       xmm5,       xmm3         ; d = 0 - 3
+        pcmpeqw     mm6,        mm5         ; d1 != 0
 
+        pandn       mm6,        MMWORD PTR[GLOBAL(_cmp_mask)]   ; mm6 = ~(d1 == 0) & 1,
+                                                                ; i.e. 1 in each word where d1 != 0
 
         ; output 0 and 2
-        movdqa      xmm6,       [rdx +  32] ; c2
-        movdqa      xmm2,       xmm0         ; a
+        movq        mm2,        mm0         ; a1
 
-        paddw       xmm0,       xmm1         ; a + b
-        psubw       xmm2,       xmm1         ; a - b
+        paddw       mm0,        mm1         ; a1 + b1
+        psubw       mm2,        mm1         ; a1 - b1
 
-        movdqa      xmm1,       xmm0         ; a + b
-        pmulhw      xmm0,       xmm6         ; 00 01 02 03
+        paddw       mm0,        MMWORD PTR[GLOBAL(_7w)]
+        paddw       mm2,        MMWORD PTR[GLOBAL(_7w)]
 
-        paddw       xmm0,       xmm1         ; output 00 01 02 03
-        pmulhw      xmm6,       xmm2         ; 20 21 22 23
+        psraw       mm0,        4           ; op[0] = (a1 + b1 + 7)>>4
+        psraw       mm2,        4           ; op[8] = (a1 - b1 + 7)>>4
 
-        paddw       xmm2,       xmm6         ; output 20 21 22 23
+        movq        MMWORD PTR[rdi + 0 ],  mm0
+        movq        MMWORD PTR[rdi + 16],  mm2
 
         ; output 1 and 3
-        movdqa      xmm6,       [rdx + 16]  ; c1
-        movdqa      xmm7,       [rdx + 48]  ; c3
+        ; interleave c1, d1
+        movq        mm1,        mm5         ; d1
+        punpcklwd   mm1,        mm3         ; c1 d1
+        punpckhwd   mm5,        mm3         ; c1 d1
 
-        movdqa      xmm1,       xmm4         ; c
-        movdqa      xmm3,       xmm5         ; d
+        movq        mm3,        mm1
+        movq        mm4,        mm5
 
-        pmulhw      xmm1,       xmm7         ; c * c3
-        pmulhw      xmm3,       xmm6         ; d * c1
+        pmaddwd     mm1,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
+        pmaddwd     mm4,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
 
-        paddw       xmm3,       xmm5         ; d * c1 rounded
-        paddw       xmm1,       xmm3         ; output 10 11 12 13
+        pmaddwd     mm3,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
+        pmaddwd     mm5,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
 
-        movdqa      xmm3,       xmm4         ; c
-        pmulhw      xmm5,       xmm7         ; d * c3
+        paddd       mm1,        MMWORD PTR[GLOBAL(_12000)]
+        paddd       mm4,        MMWORD PTR[GLOBAL(_12000)]
+        paddd       mm3,        MMWORD PTR[GLOBAL(_51000)]
+        paddd       mm5,        MMWORD PTR[GLOBAL(_51000)]
 
-        pmulhw      xmm4,       xmm6         ; c * c1
-        paddw       xmm3,       xmm4         ; round c* c1
+        psrad       mm1,        16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
+        psrad       mm4,        16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
+        psrad       mm3,        16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16
+        psrad       mm5,        16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16
 
-        psubw       xmm5,       xmm3         ; output 30 31 32 33
-        movdqa      xmm3,       xmm5
-        ; done with vertical
+        packssdw    mm1,        mm4         ; op[4]
+        packssdw    mm3,        mm5         ; op[12]
+
+        paddw       mm1,        mm6         ; op[4] += (d1!=0)
 
+        movq        MMWORD PTR[rdi + 8 ],  mm1
+        movq        MMWORD PTR[rdi + 24],  mm3
 
-        pcmpeqw     xmm4,       xmm4
-        pcmpeqw     xmm5,       xmm5;
-        psrlw       xmm4,       15
-        psrlw       xmm5,       15
-
-        psllw       xmm4,       2
-        psllw       xmm5,       2
-
-        paddw       xmm0,       xmm4
-        paddw       xmm1,       xmm5
-        paddw       xmm2,       xmm4
-        paddw       xmm3,       xmm5
-
-        psraw       xmm0,       3
-        psraw       xmm1,       3
-        psraw       xmm2,       3
-        psraw       xmm3,       3
-
-        movq        QWORD PTR[rdi   ],   xmm0
-        movq        QWORD PTR[rdi+ 8],   xmm1
-        movq        QWORD PTR[rdi+16],   xmm2
-        movq        QWORD PTR[rdi+24],   xmm3
-
-        psrldq      xmm0,       8
-        psrldq      xmm1,       8
-        psrldq      xmm2,       8
-        psrldq      xmm3,       8
-
-        movq        QWORD PTR[rdi+32],   xmm0
-        movq        QWORD PTR[rdi+40],   xmm1
-        movq        QWORD PTR[rdi+48],   xmm2
-        movq        QWORD PTR[rdi+56],   xmm3
-    ; begin epilog
-    pop rdi
-    pop rsi
+     ; begin epilog
+    pop         rdi
+    pop         rsi
     RESTORE_GOT
     UNSHADOW_ARGS
     pop         rbp
     ret
 
-
 SECTION_RODATA
-;static const unsigned int dct1st_stage_rounding_mmx[2] =
-align 16
-dct1st_stage_rounding_mmx:
-    times 2 dd 8192
-
-
-;static const unsigned int dct2nd_stage_rounding_mmx[2] =
-align 16
-dct2nd_stage_rounding_mmx:
-    times 2 dd 32768
-
-
-;static const short dct_matrix[4][4]=
-align 16
-dct_matrix:
-    times 4 dw 23170
-
-    dw  30274
-    dw  12540
-    dw -12540
-    dw -30274
-
-    dw 23170
-    times 2 dw -23170
-    dw 23170
-
-    dw  12540
-    dw -30274
-    dw  30274
-    dw -12540
-
-
-;static const unsigned short dct_const_mmx[4 * 4]=
-align 16
-dct_const_mmx:
-    times 4 dw 0
-    times 4 dw 60547
-    times 4 dw 46341
-    times 4 dw 25080
-
-
-;static const unsigned short dct_const_xmm[8 * 4]=
-align 16
-dct_const_xmm:
-    times 8 dw 0
-    times 8 dw 60547
-    times 8 dw 46341
-    times 8 dw 25080
+align 8
+_5352_2217:
+    dw 5352
+    dw 2217
+    dw 5352
+    dw 2217
+align 8
+_2217_neg5352:
+    dw 2217
+    dw -5352
+    dw 2217
+    dw -5352
+align 8
+_cmp_mask:
+    times 4 dw 1
+align 8
+_7w:
+    times 4 dw 7
+align 8
+_14500:
+    times 2 dd 14500
+align 8
+_7500:
+    times 2 dd 7500
+align 8
+_12000:
+    times 2 dd 12000
+align 8
+_51000:
+    times 2 dd 51000
diff --git a/vp8/encoder/x86/dct_sse2.asm b/vp8/encoder/x86/dct_sse2.asm
index 723a78d..652dd98 100644
--- a/vp8/encoder/x86/dct_sse2.asm
+++ b/vp8/encoder/x86/dct_sse2.asm
@@ -11,32 +11,68 @@
 
 %include "vpx_ports/x86_abi_support.asm"
 
-;void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch)
-global sym(vp8_short_fdct4x4_sse2)
-sym(vp8_short_fdct4x4_sse2):
+%macro STACK_FRAME_CREATE 0
+%if ABI_IS_32BIT
+  %define       input       rsi
+  %define       output      rdi
+  %define       pitch       rax
     push        rbp
     mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 3
-;;    SAVE_XMM
     GET_GOT     rbx
     push        rsi
     push        rdi
     ; end prolog
 
     mov         rsi, arg(0)
-    movsxd      rax, DWORD PTR arg(2)
-    lea         rdi, [rsi + rax*2]
+    mov         rdi, arg(1)
+
+    movsxd      rax, dword ptr arg(2)
+    lea         rcx, [rsi + rax*2]
+%else
+  %ifidn __OUTPUT_FORMAT__,x64
+    %define     input       rcx
+    %define     output      rdx
+    %define     pitch       r8
+  %else
+    %define     input       rdi
+    %define     output      rsi
+    %define     pitch       rdx
+  %endif
+%endif
+%endmacro
+
+%macro STACK_FRAME_DESTROY 0
+  %define     input
+  %define     output
+  %define     pitch
+
+%if ABI_IS_32BIT
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    pop         rbp
+%else
+  %ifidn __OUTPUT_FORMAT__,x64
+  %endif
+%endif
+    ret
+%endmacro
+
+;void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch)
+global sym(vp8_short_fdct4x4_sse2)
+sym(vp8_short_fdct4x4_sse2):
 
-    movq        xmm0, MMWORD PTR[rsi   ]        ;03 02 01 00
-    movq        xmm2, MMWORD PTR[rsi + rax]     ;13 12 11 10
-    movq        xmm1, MMWORD PTR[rsi + rax*2]   ;23 22 21 20
-    movq        xmm3, MMWORD PTR[rdi + rax]     ;33 32 31 30
+    STACK_FRAME_CREATE
+
+    movq        xmm0, MMWORD PTR[input        ] ;03 02 01 00
+    movq        xmm2, MMWORD PTR[input+  pitch] ;13 12 11 10
+    lea         input,          [input+2*pitch]
+    movq        xmm1, MMWORD PTR[input        ] ;23 22 21 20
+    movq        xmm3, MMWORD PTR[input+  pitch] ;33 32 31 30
 
     punpcklqdq  xmm0, xmm2                      ;13 12 11 10 03 02 01 00
     punpcklqdq  xmm1, xmm3                      ;33 32 31 30 23 22 21 20
 
-    mov         rdi, arg(1)
-
     movdqa      xmm2, xmm0
     punpckldq   xmm0, xmm1                      ;23 22 03 02 21 20 01 00
     punpckhdq   xmm2, xmm1                      ;33 32 13 12 31 30 11 10
@@ -51,6 +87,7 @@ sym(vp8_short_fdct4x4_sse2):
     psubw       xmm3, xmm1                      ;c1 d1 c1 d1 c1 d1 c1 d1
     psllw       xmm0, 3                         ;b1 <<= 3 a1 <<= 3
     psllw       xmm3, 3                         ;c1 <<= 3 d1 <<= 3
+
     movdqa      xmm1, xmm0
     pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)]    ;a1 + b1
     pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)]    ;a1 - b1
@@ -121,17 +158,216 @@ sym(vp8_short_fdct4x4_sse2):
     punpcklqdq  xmm0, xmm3                      ;op[4] op[0]
     punpckhqdq  xmm1, xmm3                      ;op[12] op[8]
 
-    movdqa      XMMWORD PTR[rdi + 0], xmm0
-    movdqa      XMMWORD PTR[rdi + 16], xmm1
+    movdqa      XMMWORD PTR[output +  0], xmm0
+    movdqa      XMMWORD PTR[output + 16], xmm1
 
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-;;    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
+    STACK_FRAME_DESTROY
+
+;void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch)
+global sym(vp8_short_fdct8x4_sse2)
+sym(vp8_short_fdct8x4_sse2):
+
+    STACK_FRAME_CREATE
+
+        ; read the input data
+        movdqa      xmm0,       [input        ]
+        movdqa      xmm2,       [input+  pitch]
+        lea         input,      [input+2*pitch]
+        movdqa      xmm4,       [input        ]
+        movdqa      xmm3,       [input+  pitch]
+
+        ; transpose for the first stage
+        movdqa      xmm1,       xmm0        ; 00 01 02 03 04 05 06 07
+        movdqa      xmm5,       xmm4        ; 20 21 22 23 24 25 26 27
+
+        punpcklwd   xmm0,       xmm2        ; 00 10 01 11 02 12 03 13
+        punpckhwd   xmm1,       xmm2        ; 04 14 05 15 06 16 07 17
+
+        punpcklwd   xmm4,       xmm3        ; 20 30 21 31 22 32 23 33
+        punpckhwd   xmm5,       xmm3        ; 24 34 25 35 26 36 27 37
+
+        movdqa      xmm2,       xmm0        ; 00 10 01 11 02 12 03 13
+        punpckldq   xmm0,       xmm4        ; 00 10 20 30 01 11 21 31
+
+        punpckhdq   xmm2,       xmm4        ; 02 12 22 32 03 13 23 33
+
+        movdqa      xmm4,       xmm1        ; 04 14 05 15 06 16 07 17
+        punpckldq   xmm4,       xmm5        ; 04 14 24 34 05 15 25 35
+
+        punpckhdq   xmm1,       xmm5        ; 06 16 26 36 07 17 27 37
+        movdqa      xmm3,       xmm2        ; 02 12 22 32 03 13 23 33
+
+        punpckhqdq  xmm3,       xmm1        ; 03 13 23 33 07 17 27 37
+        punpcklqdq  xmm2,       xmm1        ; 02 12 22 32 06 16 26 36
+
+        movdqa      xmm1,       xmm0        ; 00 10 20 30 01 11 21 31
+        punpcklqdq  xmm0,       xmm4        ; 00 10 20 30 04 14 24 34
+
+        punpckhqdq  xmm1,       xmm4        ; 01 11 21 31 05 15 25 35
+
+        ; xmm0 0
+        ; xmm1 1
+        ; xmm2 2
+        ; xmm3 3
+
+        ; first stage
+        movdqa      xmm5,       xmm0
+        movdqa      xmm4,       xmm1
+
+        paddw       xmm0,       xmm3        ; a1 = 0 + 3
+        paddw       xmm1,       xmm2        ; b1 = 1 + 2
+
+        psubw       xmm4,       xmm2        ; c1 = 1 - 2
+        psubw       xmm5,       xmm3        ; d1 = 0 - 3
+
+        psllw       xmm5,        3
+        psllw       xmm4,        3
+
+        psllw       xmm0,        3
+        psllw       xmm1,        3
+
+        ; output 0 and 2
+        movdqa      xmm2,       xmm0        ; a1
+
+        paddw       xmm0,       xmm1        ; op[0] = a1 + b1
+        psubw       xmm2,       xmm1        ; op[2] = a1 - b1
+
+        ; output 1 and 3
+        ; interleave c1, d1
+        movdqa      xmm1,       xmm5        ; d1
+        punpcklwd   xmm1,       xmm4        ; c1 d1
+        punpckhwd   xmm5,       xmm4        ; c1 d1
+
+        movdqa      xmm3,       xmm1
+        movdqa      xmm4,       xmm5
+
+        pmaddwd     xmm1,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
+        pmaddwd     xmm4,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
+
+        pmaddwd     xmm3,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
+        pmaddwd     xmm5,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
+
+        paddd       xmm1,       XMMWORD PTR[GLOBAL(_14500)]
+        paddd       xmm4,       XMMWORD PTR[GLOBAL(_14500)]
+        paddd       xmm3,       XMMWORD PTR[GLOBAL(_7500)]
+        paddd       xmm5,       XMMWORD PTR[GLOBAL(_7500)]
+
+        psrad       xmm1,       12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
+        psrad       xmm4,       12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
+        psrad       xmm3,       12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
+        psrad       xmm5,       12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
+
+        packssdw    xmm1,       xmm4        ; op[1]
+        packssdw    xmm3,       xmm5        ; op[3]
+
+        ; done with vertical
+        ; transpose for the second stage
+        movdqa      xmm4,       xmm0         ; 00 10 20 30 04 14 24 34
+        movdqa      xmm5,       xmm2         ; 02 12 22 32 06 16 26 36
+
+        punpcklwd   xmm0,       xmm1         ; 00 01 10 11 20 21 30 31
+        punpckhwd   xmm4,       xmm1         ; 04 05 14 15 24 25 34 35
+
+        punpcklwd   xmm2,       xmm3         ; 02 03 12 13 22 23 32 33
+        punpckhwd   xmm5,       xmm3         ; 06 07 16 17 26 27 36 37
+
+        movdqa      xmm1,       xmm0         ; 00 01 10 11 20 21 30 31
+        punpckldq   xmm0,       xmm2         ; 00 01 02 03 10 11 12 13
+
+        punpckhdq   xmm1,       xmm2         ; 20 21 22 23 30 31 32 33
+
+        movdqa      xmm2,       xmm4         ; 04 05 14 15 24 25 34 35
+        punpckldq   xmm2,       xmm5         ; 04 05 06 07 14 15 16 17
+
+        punpckhdq   xmm4,       xmm5         ; 24 25 26 27 34 35 36 37
+        movdqa      xmm3,       xmm1         ; 20 21 22 23 30 31 32 33
+
+        punpckhqdq  xmm3,       xmm4         ; 30 31 32 33 34 35 36 37
+        punpcklqdq  xmm1,       xmm4         ; 20 21 22 23 24 25 26 27
+
+        movdqa      xmm4,       xmm0         ; 00 01 02 03 10 11 12 13
+        punpcklqdq  xmm0,       xmm2         ; 00 01 02 03 04 05 06 07
+
+        punpckhqdq  xmm4,       xmm2         ; 10 11 12 13 14 15 16 17
+
+        ; xmm0 0
+        ; xmm4 1
+        ; xmm1 2
+        ; xmm3 3
+
+        movdqa      xmm5,       xmm0
+        movdqa      xmm2,       xmm1
+
+        paddw       xmm0,       xmm3        ; a1 = 0 + 3
+        paddw       xmm1,       xmm4        ; b1 = 1 + 2
+
+        psubw       xmm4,       xmm2        ; c1 = 1 - 2
+        psubw       xmm5,       xmm3        ; d1 = 0 - 3
+
+        pxor        xmm6,       xmm6        ; zero out for compare
+
+        pcmpeqw     xmm6,       xmm5        ; d1 != 0
+
+        pandn       xmm6,       XMMWORD PTR[GLOBAL(_cmp_mask8x4)]   ; xmm6 = ~(d1 == 0) & 1,
+                                                                    ; i.e. 1 in each word where d1 != 0
+
+        ; output 0 and 2
+        movdqa      xmm2,       xmm0        ; a1
+
+        paddw       xmm0,       xmm1        ; a1 + b1
+        psubw       xmm2,       xmm1        ; a1 - b1
+
+        paddw       xmm0,       XMMWORD PTR[GLOBAL(_7w)]
+        paddw       xmm2,       XMMWORD PTR[GLOBAL(_7w)]
+
+        psraw       xmm0,       4           ; op[0] = (a1 + b1 + 7)>>4
+        psraw       xmm2,       4           ; op[8] = (a1 - b1 + 7)>>4
+
+        ; output 1 and 3
+        ; interleave c1, d1
+        movdqa      xmm1,       xmm5        ; d1
+        punpcklwd   xmm1,       xmm4        ; c1 d1
+        punpckhwd   xmm5,       xmm4        ; c1 d1
+
+        movdqa      xmm3,       xmm1
+        movdqa      xmm4,       xmm5
+
+        pmaddwd     xmm1,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
+        pmaddwd     xmm4,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
+
+        pmaddwd     xmm3,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
+        pmaddwd     xmm5,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
+
+        paddd       xmm1,       XMMWORD PTR[GLOBAL(_12000)]
+        paddd       xmm4,       XMMWORD PTR[GLOBAL(_12000)]
+        paddd       xmm3,       XMMWORD PTR[GLOBAL(_51000)]
+        paddd       xmm5,       XMMWORD PTR[GLOBAL(_51000)]
+
+        psrad       xmm1,       16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
+        psrad       xmm4,       16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
+        psrad       xmm3,       16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16
+        psrad       xmm5,       16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16
+
+        packssdw    xmm1,       xmm4        ; op[4]
+        packssdw    xmm3,       xmm5        ; op[12]
+
+        paddw       xmm1,       xmm6        ; op[4] += (d1!=0)
+
+        movdqa      xmm4,       xmm0
+        movdqa      xmm5,       xmm2
+
+        punpcklqdq  xmm0,       xmm1
+        punpckhqdq  xmm4,       xmm1
+
+        punpcklqdq  xmm2,       xmm3
+        punpckhqdq  xmm5,       xmm3
+
+        movdqa      XMMWORD PTR[output + 0 ],  xmm0
+        movdqa      XMMWORD PTR[output + 16],  xmm2
+        movdqa      XMMWORD PTR[output + 32],  xmm4
+        movdqa      XMMWORD PTR[output + 48],  xmm5
+
+    STACK_FRAME_DESTROY
 
 SECTION_RODATA
 align 16
@@ -161,7 +397,9 @@ align 16
 _cmp_mask:
     times 4 dw 1
     times 4 dw 0
-
+align 16
+_cmp_mask8x4:
+    times 8 dw 1
 align 16
 _mult_sub:
     dw 1
@@ -176,6 +414,9 @@ align 16
 _7:
     times 4 dd 7
 align 16
+_7w:
+    times 8 dw 7
+align 16
 _14500:
     times 4 dd 14500
 align 16
diff --git a/vp8/encoder/x86/dct_x86.h b/vp8/encoder/x86/dct_x86.h
index 05824c6..59a5cb1 100644
--- a/vp8/encoder/x86/dct_x86.h
+++ b/vp8/encoder/x86/dct_x86.h
@@ -24,33 +24,31 @@ extern prototype_fdct(vp8_short_fdct4x4_mmx);
 extern prototype_fdct(vp8_short_fdct8x4_mmx);
 
 #if !CONFIG_RUNTIME_CPU_DETECT
-#if 0
+
 #undef  vp8_fdct_short4x4
 #define vp8_fdct_short4x4 vp8_short_fdct4x4_mmx
 
 #undef  vp8_fdct_short8x4
 #define vp8_fdct_short8x4 vp8_short_fdct8x4_mmx
-#endif
 
 #endif
+
 #endif
 
 
 #if HAVE_SSE2
-extern prototype_fdct(vp8_short_fdct8x4_wmt);
+extern prototype_fdct(vp8_short_fdct8x4_sse2);
 extern prototype_fdct(vp8_short_walsh4x4_sse2);
 
 extern prototype_fdct(vp8_short_fdct4x4_sse2);
 
 #if !CONFIG_RUNTIME_CPU_DETECT
-#if 1
-/* short SSE2 DCT currently disabled, does not match the MMX version */
+
 #undef  vp8_fdct_short4x4
 #define vp8_fdct_short4x4 vp8_short_fdct4x4_sse2
 
 #undef  vp8_fdct_short8x4
 #define vp8_fdct_short8x4 vp8_short_fdct8x4_sse2
-#endif
 
 #undef  vp8_fdct_fast4x4
 #define vp8_fdct_fast4x4 vp8_short_fdct4x4_sse2
@@ -58,7 +56,7 @@ extern prototype_fdct(vp8_short_fdct4x4_sse2);
 #undef  vp8_fdct_fast8x4
 #define vp8_fdct_fast8x4 vp8_short_fdct8x4_sse2
 
-#undef vp8_fdct_walsh_short4x4
+#undef  vp8_fdct_walsh_short4x4
 #define vp8_fdct_walsh_short4x4  vp8_short_walsh4x4_sse2
 
 #endif
diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c
index fb1b37c..7810798 100644
--- a/vp8/encoder/x86/x86_csystemdependent.c
+++ b/vp8/encoder/x86/x86_csystemdependent.c
@@ -18,11 +18,10 @@
 #if HAVE_MMX
 void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch)
 {
-    vp8_short_fdct4x4_c(input,   output,    pitch);
-    vp8_short_fdct4x4_c(input + 4, output + 16, pitch);
+    vp8_short_fdct4x4_mmx(input,   output,    pitch);       /* first 4x4 block */
+    vp8_short_fdct4x4_mmx(input + 4, output + 16, pitch);   /* second 4x4 block: input advanced by 4 shorts, output by 16 shorts */
 }
 
-
 int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
                                  short *qcoeff_ptr, short *dequant_ptr,
                                  short *scan_mask, short *round_ptr,
@@ -82,12 +81,6 @@ void vp8_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch)
 #endif
 
 #if HAVE_SSE2
-void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch)
-{
-    vp8_short_fdct4x4_sse2(input,   output,    pitch);
-    vp8_short_fdct4x4_sse2(input + 4, output + 16, pitch);
-}
-
 int vp8_fast_quantize_b_impl_sse2(short *coeff_ptr,
                                  short *qcoeff_ptr, short *dequant_ptr,
                                  short *scan_mask, short *round_ptr,
@@ -249,18 +242,11 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
         cpi->rtcd.variance.get8x8var             = vp8_get8x8var_mmx;
         cpi->rtcd.variance.get16x16var           = vp8_get16x16var_mmx;
         cpi->rtcd.variance.get4x4sse_cs          = vp8_get4x4sse_cs_mmx;
-#if 0 // new fdct
+
         cpi->rtcd.fdct.short4x4                  = vp8_short_fdct4x4_mmx;
         cpi->rtcd.fdct.short8x4                  = vp8_short_fdct8x4_mmx;
         cpi->rtcd.fdct.fast4x4                   = vp8_short_fdct4x4_mmx;
         cpi->rtcd.fdct.fast8x4                   = vp8_short_fdct8x4_mmx;
-#else
-        cpi->rtcd.fdct.short4x4                  = vp8_short_fdct4x4_c;
-        cpi->rtcd.fdct.short8x4                  = vp8_short_fdct8x4_c;
-        cpi->rtcd.fdct.fast4x4                   = vp8_short_fdct4x4_c;
-        cpi->rtcd.fdct.fast8x4                   = vp8_short_fdct8x4_c;
-
-#endif
 
         cpi->rtcd.fdct.walsh_short4x4            = vp8_short_walsh4x4_c;