QPEL_TABLE 10, 4, w, sse4
QPEL_TABLE 12, 4, w, sse4
+%define MAX_PB_SIZE 64
+
%define hevc_qpel_filters_sse4_14 hevc_qpel_filters_sse4_10
%if ARCH_X86_64
movdqa [%1], %2
%endmacro
-%macro LOOP_END 4
- lea %1q, [%1q+2*%2q] ; dst += dststride
- add %3q, %4q ; src += srcstride
+%macro LOOP_END 3
+ add %1q, 2*MAX_PB_SIZE ; dst += 2*MAX_PB_SIZE bytes (constant step; the macro no longer takes a dststride operand)
+ add %2q, %3q ; src += srcstride
dec heightd ; cmp height
jnz .loop ; height loop
%endmacro
; ******************************
%macro HEVC_PUT_HEVC_PEL_PIXELS 2
-cglobal hevc_put_hevc_pel_pixels%1_%2, 5, 5, 3, dst, dststride, src, srcstride,height
+cglobal hevc_put_hevc_pel_pixels%1_%2, 4, 4, 3, dst, src, srcstride,height
pxor m2, m2
.loop
SIMPLE_LOAD %1, %2, srcq, m0
MC_PIXEL_COMPUTE %1, %2
PEL_10STORE%1 dstq, m0, m1
- LOOP_END dst, dststride, src, srcstride
+ LOOP_END dst, src, srcstride
RET
-cglobal hevc_put_hevc_uni_pel_pixels%1_%2, 5, 5, 3, dst, dststride, src, srcstride,height
- pxor m2, m2
+cglobal hevc_put_hevc_uni_pel_pixels%1_%2, 5, 5, 2, dst, dststride, src, srcstride,height
.loop
SIMPLE_LOAD %1, %2, srcq, m0
PEL_%2STORE%1 dstq, m0, m1
jnz .loop ; height loop
RET
-cglobal hevc_put_hevc_bi_pel_pixels%1_%2, 7, 7, 6, dst, dststride, src, srcstride, src2, src2stride,height
+cglobal hevc_put_hevc_bi_pel_pixels%1_%2, 6, 6, 6, dst, dststride, src, srcstride, src2, height
pxor m2, m2
movdqa m5, [pw_bi_%2]
.loop
PEL_%2STORE%1 dstq, m0, m1
add dstq, dststrideq ; dst += dststride
add srcq, srcstrideq ; src += srcstride
- lea src2q, [src2q+2*src2strideq] ; src += srcstride
+ add src2q, 2*MAX_PB_SIZE ; src2 += 2*MAX_PB_SIZE bytes (constant step; src2stride argument removed)
dec heightd ; cmp height
jnz .loop ; height loop
RET
%macro HEVC_PUT_HEVC_EPEL 2
-cglobal hevc_put_hevc_epel_h%1_%2, 6, 7, 6, dst, dststride, src, srcstride, height, mx, rfilter
+cglobal hevc_put_hevc_epel_h%1_%2, 5, 6, 6, dst, src, srcstride, height, mx, rfilter
%assign %%stride ((%2 + 7)/8)
EPEL_FILTER %2, mx, m4, m5
.loop
EPEL_LOAD %2, srcq-%%stride, %%stride, %1
EPEL_COMPUTE %2, %1, m4, m5
PEL_10STORE%1 dstq, m0, m1
- LOOP_END dst, dststride, src, srcstride
+ LOOP_END dst, src, srcstride
RET
cglobal hevc_put_hevc_uni_epel_h%1_%2, 6, 7, 7, dst, dststride, src, srcstride, height, mx, rfilter
jnz .loop ; height loop
RET
-cglobal hevc_put_hevc_bi_epel_h%1_%2, 8, 9, 7, dst, dststride, src, srcstride, src2, src2stride,height, mx, rfilter
+cglobal hevc_put_hevc_bi_epel_h%1_%2, 7, 8, 7, dst, dststride, src, srcstride, src2, height, mx, rfilter
movdqa m6, [pw_bi_%2]
EPEL_FILTER %2, mx, m4, m5
.loop
PEL_%2STORE%1 dstq, m0, m1
add dstq, dststrideq ; dst += dststride
add srcq, srcstrideq ; src += srcstride
- lea src2q, [src2q+2*src2strideq] ; src += srcstride
+ add src2q, 2*MAX_PB_SIZE ; src2 += 2*MAX_PB_SIZE bytes (constant step; src2stride argument removed)
dec heightd ; cmp height
jnz .loop ; height loop
RET
; int16_t* mcbuffer)
; ******************************
-cglobal hevc_put_hevc_epel_v%1_%2, 7, 8, 6, dst, dststride, src, srcstride, height, r3src, my, rfilter
+cglobal hevc_put_hevc_epel_v%1_%2, 6, 7, 6, dst, src, srcstride, height, r3src, my, rfilter
lea r3srcq, [srcstrideq*3]
sub srcq, srcstrideq
EPEL_FILTER %2, my, m4, m5
EPEL_LOAD %2, srcq, srcstride, %1
EPEL_COMPUTE %2, %1, m4, m5
PEL_10STORE%1 dstq, m0, m1
- LOOP_END dst, dststride, src, srcstride
+ LOOP_END dst, src, srcstride
RET
cglobal hevc_put_hevc_uni_epel_v%1_%2, 7, 8, 7, dst, dststride, src, srcstride, height, r3src, my, rfilter
RET
-cglobal hevc_put_hevc_bi_epel_v%1_%2, 9, 10, 7, dst, dststride, src, srcstride, src2, src2stride,height, r3src, my, rfilter
+cglobal hevc_put_hevc_bi_epel_v%1_%2, 8, 9, 7, dst, dststride, src, srcstride, src2, height, r3src, my, rfilter
lea r3srcq, [srcstrideq*3]
movdqa m6, [pw_bi_%2]
sub srcq, srcstrideq
PEL_%2STORE%1 dstq, m0, m1
add dstq, dststrideq ; dst += dststride
add srcq, srcstrideq ; src += srcstride
- lea src2q, [src2q+2*src2strideq] ; src += srcstride
+ add src2q, 2*MAX_PB_SIZE ; src2 += 2*MAX_PB_SIZE bytes (constant step; src2stride argument removed)
dec heightd ; cmp height
jnz .loop ; height loop
RET
; ******************************
%macro HEVC_PUT_HEVC_EPEL_HV 2
-cglobal hevc_put_hevc_epel_hv%1_%2, 7, 9, 12 , dst, dststride, src, srcstride, height, mx, my, r3src, rfilter
+cglobal hevc_put_hevc_epel_hv%1_%2, 6, 8, 12 , dst, src, srcstride, height, mx, my, r3src, rfilter
%assign %%stride ((%2 + 7)/8)
sub srcq, srcstrideq
EPEL_HV_FILTER %2
movdqa m4, m5
movdqa m5, m6
movdqa m6, m7
- LOOP_END dst, dststride, src, srcstride
+ LOOP_END dst, src, srcstride
RET
cglobal hevc_put_hevc_uni_epel_hv%1_%2, 7, 9, 12 , dst, dststride, src, srcstride, height, mx, my, r3src, rfilter
RET
-cglobal hevc_put_hevc_bi_epel_hv%1_%2, 9, 11, 16, dst, dststride, src, srcstride, src2, src2stride, height, mx, my, r3src, rfilter
+cglobal hevc_put_hevc_bi_epel_hv%1_%2, 8, 10, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src, rfilter
%assign %%stride ((%2 + 7)/8)
sub srcq, srcstrideq
EPEL_HV_FILTER %2
movdqa m6, m7
add dstq, dststrideq ; dst += dststride
add srcq, srcstrideq ; src += srcstride
- lea src2q, [src2q+2*src2strideq] ; src += srcstride
+ add src2q, 2*MAX_PB_SIZE ; src2 += 2*MAX_PB_SIZE bytes (constant step; src2stride argument removed)
dec heightd ; cmp height
jnz .loop ; height loop
RET
; ******************************
%macro HEVC_PUT_HEVC_QPEL 2
-cglobal hevc_put_hevc_qpel_h%1_%2, 6, 7, 15 , dst, dststride, src, srcstride, height, mx, rfilter
+cglobal hevc_put_hevc_qpel_h%1_%2, 5, 6, 15, dst, src, srcstride, height, mx, rfilter
QPEL_FILTER %2, mx
.loop
QPEL_H_LOAD %2, srcq, %1, 10
packssdw m0, m1
%endif
PEL_10STORE%1 dstq, m0, m1
- LOOP_END dst, dststride, src, srcstride
+ LOOP_END dst, src, srcstride
RET
cglobal hevc_put_hevc_uni_qpel_h%1_%2, 6, 7, 15 , dst, dststride, src, srcstride, height, mx, rfilter
jnz .loop ; height loop
RET
-cglobal hevc_put_hevc_bi_qpel_h%1_%2, 8, 9, 16 , dst, dststride, src, srcstride, src2, src2stride, height, mx, rfilter
+cglobal hevc_put_hevc_bi_qpel_h%1_%2, 7, 8, 16 , dst, dststride, src, srcstride, src2, height, mx, rfilter
movdqa m9, [pw_bi_%2]
QPEL_FILTER %2, mx
.loop
PEL_%2STORE%1 dstq, m0, m1
add dstq, dststrideq ; dst += dststride
add srcq, srcstrideq ; src += srcstride
- lea src2q, [src2q+2*src2strideq] ; src += srcstride
+ add src2q, 2*MAX_PB_SIZE ; src2 += 2*MAX_PB_SIZE bytes (constant step; src2stride argument removed)
dec heightd ; cmp height
jnz .loop ; height loop
RET
; int width, int height, int mx, int my)
; ******************************
-cglobal hevc_put_hevc_qpel_v%1_%2, 7, 9, 15, dst, dststride, src, srcstride, height, r3src, my, rfilter
+cglobal hevc_put_hevc_qpel_v%1_%2, 6, 8, 15, dst, src, srcstride, height, r3src, my, rfilter
lea r3srcq, [srcstrideq*3]
QPEL_FILTER %2, my
.loop
- QPEL_V_LOAD %2, srcq, srcstride, %1, r8
+ QPEL_V_LOAD %2, srcq, srcstride, %1, r7
QPEL_COMPUTE %1, %2
%if %2 > 8
packssdw m0, m1
%endif
PEL_10STORE%1 dstq, m0, m1
- LOOP_END dst, dststride, src, srcstride
+ LOOP_END dst, src, srcstride
RET
cglobal hevc_put_hevc_uni_qpel_v%1_%2, 7, 9, 15, dst, dststride, src, srcstride, height, r3src, my, rfilter
QPEL_V_LOAD %2, srcq, srcstride, %1, r8
QPEL_COMPUTE %1, %2
%if %2 > 8
- packusdw m0, m1
+ packssdw m0, m1
%endif
UNI_COMPUTE %1, %2, m0, m1, m9
PEL_%2STORE%1 dstq, m0, m1
jnz .loop ; height loop
RET
-cglobal hevc_put_hevc_bi_qpel_v%1_%2, 9, 11, 16, dst, dststride, src, srcstride, src2, src2stride, height, r3src, my, rfilter
+cglobal hevc_put_hevc_bi_qpel_v%1_%2, 8, 10, 16, dst, dststride, src, srcstride, src2, height, r3src, my, rfilter
movdqa m9, [pw_bi_%2]
lea r3srcq, [srcstrideq*3]
QPEL_FILTER %2, my
.loop
SIMPLE_BILOAD %1, src2q, m10, m11
- QPEL_V_LOAD %2, srcq, srcstride, %1, r10
+ QPEL_V_LOAD %2, srcq, srcstride, %1, r9
QPEL_COMPUTE %1, %2
%if %2 > 8
packssdw m0, m1
PEL_%2STORE%1 dstq, m0, m1
add dstq, dststrideq ; dst += dststride
add srcq, srcstrideq ; src += srcstride
- lea src2q, [src2q+2*src2strideq] ; src += srcstride
+ add src2q, 2*MAX_PB_SIZE ; src2 += 2*MAX_PB_SIZE bytes (constant step; src2stride argument removed)
dec heightd ; cmp height
jnz .loop ; height loop
RET
; int height, int mx, int my)
; ******************************
%macro HEVC_PUT_HEVC_QPEL_HV 2
-cglobal hevc_put_hevc_qpel_hv%1_%2, 7, 9, 12 , dst, dststride, src, srcstride, height, mx, my, r3src, rfilter
+cglobal hevc_put_hevc_qpel_hv%1_%2, 6, 8, 12, dst, src, srcstride, height, mx, my, r3src, rfilter
lea mxq, [mxq*8-8]
lea myq, [myq*8-8]
lea r3srcq, [srcstrideq*3]
movdqa m13, m14
movdqa m14, m15
%endif
- LOOP_END dst, dststride, src, srcstride
+ LOOP_END dst, src, srcstride
RET
cglobal hevc_put_hevc_uni_qpel_hv%1_%2, 7, 9, 12 , dst, dststride, src, srcstride, height, mx, my, r3src, rfilter
jnz .loop ; height loop
RET
-cglobal hevc_put_hevc_bi_qpel_hv%1_%2, 9, 11, 16, dst, dststride, src, srcstride, src2, src2stride, height, mx, my, r3src, rfilter
+cglobal hevc_put_hevc_bi_qpel_hv%1_%2, 8, 10, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src, rfilter
lea mxq, [mxq*8-8]
lea myq, [myq*8-8]
lea r3srcq, [srcstrideq*3]
%endif
add dstq, dststrideq ; dst += dststride
add srcq, srcstrideq ; src += srcstride
- lea src2q, [src2q+2*src2strideq] ; src += srcstride
+ add src2q, 2*MAX_PB_SIZE ; src2 += 2*MAX_PB_SIZE bytes (constant step; src2stride argument removed)
dec heightd ; cmp height
jnz .loop ; height loop
RET
%define SHIFT denomd
%endif
lea SHIFT, [SHIFT+14-%2] ; shift = 14 - bitd + denom
+%if %1 <= 4
+ pxor m1, m1 ; m1 = 0; supplies the zero words for the punpcklwd interleaves in the %1 <= 4 path
+%endif
movd m2, wxm ; WX
movd m4, SHIFT ; shift
+%if %1 <= 4
+ punpcklwd m2, m1 ; interleave the low words of m2 (WX) with zero words from m1
+%else
punpcklwd m2, m2
+%endif
dec SHIFT
movdqu m5, [one_per_32]
movd m6, SHIFT
%endif
.loop
SIMPLE_LOAD %1, 10, srcq, m0
+%if %1 <= 4
+ punpcklwd m0, m1 ; interleave the low words of m0 with zero words from m1
+ pmaddwd m0, m2
+ paddd m0, m5
+ psrad m0, m4
+ paddd m0, m3
+%else
pmulhw m6, m0, m2
pmullw m0, m2
punpckhwd m1, m0, m6
psrad m1, m4
paddd m0, m3
paddd m1, m3
+%endif
packusdw m0, m1
%if %2 == 8
packuswb m0, m0
%endif
PEL_%2STORE%1 dstq, m0, m1
add dstq, dststrideq ; dst += dststride
- lea srcq, [srcq+2*srcstrideq] ; src += srcstride
+ add srcq, 2*MAX_PB_SIZE ; src += 2*MAX_PB_SIZE bytes (constant step replaces 2*srcstride)
dec heightd ; cmp height
jnz .loop ; height loop
RET
-cglobal hevc_put_hevc_bi_w%1_%2, 6, 7, 10, dst, dststride, src, srcstride, src2, src2stride, height, denom, wx0, wx1, ox0, ox1
+cglobal hevc_put_hevc_bi_w%1_%2, 5, 7, 10, dst, dststride, src, srcstride, src2, height, denom, wx0, wx1, ox0, ox1
mov r6d, denomm
+%if %1 <= 4
+ pxor m1, m1 ; m1 = 0; supplies the zero words for the punpcklwd interleaves in the %1 <= 4 path
+%endif
movd m2, wx0m ; WX0
lea r6d, [r6d+14-%2] ; shift = 14 - bitd + denom
movd m3, wx1m ; WX1
movd m0, r6d ; shift
+%if %1 <= 4
+ punpcklwd m2, m1 ; interleave the low words of m2 (WX0) with zero words from m1
+ punpcklwd m3, m1 ; interleave the low words of m3 (WX1) with zero words from m1
+%else
punpcklwd m2, m2
- inc r6d
punpcklwd m3, m3
+%endif
+ inc r6d ; r6d = shift + 1; now placed after the %endif so both width paths execute it
movd m5, r6d ; shift+1
pshufd m2, m2, 0
mov r6d, ox0m
.loop
SIMPLE_LOAD %1, 10, srcq, m0
SIMPLE_LOAD %1, 10, src2q, m8
+%if %1 <= 4
+ punpcklwd m0, m1 ; interleave the low words of m0 with zero words from m1
+ punpcklwd m8, m1 ; interleave the low words of m8 with zero words from m1
+ pmaddwd m0, m3
+ pmaddwd m8, m2
+ paddd m0, m4
+ paddd m0, m8
+ psrad m0, m5
+%else
pmulhw m6, m0, m3
pmullw m0, m3
pmulhw m7, m8, m2
paddd m1, m4
psrad m0, m5
psrad m1, m5
+%endif
packusdw m0, m1
%if %2 == 8
packuswb m0, m0
%endif
PEL_%2STORE%1 dstq, m0, m1
add dstq, dststrideq ; dst += dststride
- lea srcq, [srcq+2*srcstrideq] ; src += srcstride
- lea src2q, [src2q+2*src2strideq] ; src2 += srcstride
+ add srcq, 2*MAX_PB_SIZE ; src += 2*MAX_PB_SIZE bytes (constant step replaces 2*srcstride)
+ add src2q, 2*MAX_PB_SIZE ; src2 += 2*MAX_PB_SIZE bytes (constant step; src2stride argument removed)
dec r6d ; cmp height
jnz .loop ; height loop
RET