%macro INNER_LOOPFILTER 4
cglobal vp8_%2_loop_filter16_inner_%1, 5, %3, %4
+%define dst_reg r0
+%define mstride_reg r1
+%define stride_reg r2
+%define E_reg r2
+%define I_reg r3
+%define hev_thr_reg r4
+%ifdef m8 ; x86-64, sse2
+%define dst8_reg r4
+%elif mmsize == 16 ; x86-32, sse2
+%define dst8_reg r5
+%else ; x86-32, mmx/mmxext
+%define cnt_reg r5
+%endif
+%define stride_reg E_reg
+%define dst2_reg I_reg
+%ifndef m8
+%define stack_reg hev_thr_reg
+%endif
+
%ifndef m8 ; mmx/mmxext or sse2 on x86-32
; splat function arguments
- SPLATB_REG m0, r2, %1 ; E
- SPLATB_REG m1, r3, %1 ; I
- SPLATB_REG m2, r4, %1 ; hev_thresh
+ SPLATB_REG m0, E_reg, %1 ; E
+ SPLATB_REG m1, I_reg, %1 ; I
+ SPLATB_REG m2, hev_thr_reg, %1 ; hev_thresh
; align stack
- mov r4, rsp ; backup stack pointer
+ mov stack_reg, rsp ; backup stack pointer
and rsp, ~(mmsize-1) ; align stack
%ifidn %2, v
sub rsp, mmsize * 4 ; stack layout: [0]=E, [1]=I, [2]=hev_thr
%define mask_res m12
; splat function arguments
- SPLATB_REG flim_E, r2, %1 ; E
- SPLATB_REG flim_I, r3, %1 ; I
- SPLATB_REG hev_thr, r4, %1 ; hev_thresh
+ SPLATB_REG flim_E, E_reg, %1 ; E
+ SPLATB_REG flim_I, I_reg, %1 ; I
+ SPLATB_REG hev_thr, hev_thr_reg, %1 ; hev_thresh
%endif
%if mmsize == 8 ; mmx/mmxext
- mov r5, 2
+ mov cnt_reg, 2
%endif
- mov r2, r1
- neg r1
+ mov stride_reg, mstride_reg
+ neg mstride_reg
%ifidn %2, h
- lea r0, [r0+4*r2-4]
+ lea dst_reg, [dst_reg + stride_reg*4-4]
%endif
%if mmsize == 8
.next8px
%endif
; read
- lea r3, [r0+r2]
+ lea dst2_reg, [dst_reg + stride_reg]
%ifidn %2, v
- mova m0, [r0+r1*4] ; p3
- mova m1, [r3+r1*4] ; p2
- mova m2, [r0+r1*2] ; p1
- mova m5, [r3] ; q1
- mova m6, [r3+r2] ; q2
- mova m7, [r3+r2*2] ; q3
+ mova m0, [dst_reg +mstride_reg*4] ; p3
+ mova m1, [dst2_reg+mstride_reg*4] ; p2
+ mova m2, [dst_reg +mstride_reg*2] ; p1
+ mova m5, [dst2_reg] ; q1
+ mova m6, [dst2_reg+ stride_reg] ; q2
+ mova m7, [dst2_reg+ stride_reg*2] ; q3
%elif mmsize == 8 ; mmx/mmxext (h)
; read 8 rows of 8px each
- movu m0, [r0+r1*4]
- movu m1, [r3+r1*4]
- movu m2, [r0+r1*2]
- movu m3, [r0+r1]
- movu m4, [r0]
- movu m5, [r3]
- movu m6, [r3+r2]
+ movu m0, [dst_reg +mstride_reg*4]
+ movu m1, [dst2_reg+mstride_reg*4]
+ movu m2, [dst_reg +mstride_reg*2]
+ movu m3, [dst_reg +mstride_reg]
+ movu m4, [dst_reg]
+ movu m5, [dst2_reg]
+ movu m6, [dst2_reg+ stride_reg]
; 8x8 transpose
TRANSPOSE4x4B 0, 1, 2, 3, 7
%else
mova [rsp+mmsize*4], m1
%endif
- movu m7, [r3+r2*2]
+ movu m7, [dst2_reg+ stride_reg*2]
TRANSPOSE4x4B 4, 5, 6, 7, 1
SBUTTERFLY dq, 0, 4, 1 ; p3/p2
SBUTTERFLY dq, 2, 6, 1 ; q0/q1
SWAP 6, 3
SWAP 5, 3
%else ; sse2 (h)
- lea r5, [r0+r2*8]
+ lea dst8_reg, [dst_reg + stride_reg*8]
; read 16 rows of 8px each, interleave
- movh m0, [r0+r1*4]
- movh m1, [r5+r1*4]
- movh m2, [r0+r1*2]
- movh m5, [r5+r1*2]
- movh m3, [r0+r1]
- movh m6, [r5+r1]
- movh m4, [r0]
- movh m7, [r5]
+ movh m0, [dst_reg +mstride_reg*4]
+ movh m1, [dst8_reg+mstride_reg*4]
+ movh m2, [dst_reg +mstride_reg*2]
+ movh m5, [dst8_reg+mstride_reg*2]
+ movh m3, [dst_reg +mstride_reg]
+ movh m6, [dst8_reg+mstride_reg]
+ movh m4, [dst_reg]
+ movh m7, [dst8_reg]
punpcklbw m0, m1 ; A/I
punpcklbw m2, m5 ; C/K
punpcklbw m3, m6 ; D/L
punpcklbw m4, m7 ; E/M
- add r5, r2
- movh m1, [r3+r1*4]
- movh m6, [r5+r1*4]
- movh m5, [r3]
- movh m7, [r5]
+ add dst8_reg, stride_reg
+ movh m1, [dst2_reg+mstride_reg*4]
+ movh m6, [dst8_reg+mstride_reg*4]
+ movh m5, [dst2_reg]
+ movh m7, [dst8_reg]
punpcklbw m1, m6 ; B/J
punpcklbw m5, m7 ; F/N
- movh m6, [r3+r2]
- movh m7, [r5+r2]
+ movh m6, [dst2_reg+ stride_reg]
+ movh m7, [dst8_reg+ stride_reg]
punpcklbw m6, m7 ; G/O
; 8x16 transpose
%else
mova [rsp+mmsize*4], m1
%endif
- movh m7, [r3+r2*2]
- movh m1, [r5+r2*2]
+ movh m7, [dst2_reg+ stride_reg*2]
+ movh m1, [dst8_reg+ stride_reg*2]
punpcklbw m7, m1 ; H/P
TRANSPOSE4x4B 4, 5, 6, 7, 1
SBUTTERFLY dq, 0, 4, 1 ; p3/p2
; normal_limit and high_edge_variance for p1-p0, q1-q0
SWAP 7, 3 ; now m7 is zero
%ifidn %2, v
- mova m3, [r0+r1] ; p0
+ mova m3, [dst_reg +mstride_reg] ; p0
%elifdef m14
SWAP 3, 14
%else
SWAP 6, 4 ; now m6 is I
%ifidn %2, v
- mova m4, [r0] ; q0
+ mova m4, [dst_reg] ; q0
%elifdef m13
SWAP 4, 13
%else
; store
%ifidn %2, v
- mova [r0+r1*2], m2
- mova [r0+r1], m3
- mova [r0], m4
- mova [r0+r2], m5
+ mova [dst_reg+mstride_reg*2], m2
+ mova [dst_reg+mstride_reg ], m3
+ mova [dst_reg], m4
+ mova [dst_reg+ stride_reg ], m5
%else ; h
- add r0, 2
- add r3, 2
+ add dst_reg, 2
+ add dst2_reg, 2
; 4x8/16 transpose
TRANSPOSE4x4B 2, 3, 4, 5, 6
%if mmsize == 8 ; mmx/mmxext (h)
- WRITE_4x2D 2, 3, 4, 5, r0, r3, r1, r2
+ WRITE_4x2D 2, 3, 4, 5, dst_reg, dst2_reg, mstride_reg, stride_reg
%else ; sse2 (h)
lea r5, [r5+r1+2]
- WRITE_4x4D 2, 3, 4, 5, r0, r3, r5, r1, r2
+ WRITE_4x4D 2, 3, 4, 5, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg
%endif
%endif
%if mmsize == 8
%ifidn %2, h
- lea r0, [r0+8*r2-2]
+ lea dst_reg, [dst_reg + stride_reg*8-2]
%else ; v
- add r0, 8
+ add dst_reg, 8
%endif
- dec r5
+ dec cnt_reg
jg .next8px
%endif
%ifndef m8 ; sse2 on x86-32 or mmx/mmxext
- mov rsp, r4 ; restore stack pointer
+ mov rsp, stack_reg ; restore stack pointer
%endif
RET
%endmacro
INNER_LOOPFILTER mmxext, h, 6, 8
INIT_XMM
INNER_LOOPFILTER sse2, v, 5, 13
+%ifdef m8
+INNER_LOOPFILTER sse2, h, 5, 15
+%else
INNER_LOOPFILTER sse2, h, 6, 15
+%endif