From effaa3263dcfaa21f1d13cb5f62f587ca8ba2d00 Mon Sep 17 00:00:00 2001 From: Dmitry Kovalev Date: Fri, 10 May 2013 11:04:43 -0700 Subject: [PATCH] Removing unused simple loopfilter code. Change-Id: Ic11dc052fb641687c015e1bbc37181b9babcd43e --- vp9/common/vp9_loopfilter_filters.c | 34 --- vp9/common/vp9_rtcd_defs.sh | 24 -- vp9/common/x86/vp9_loopfilter_intrin_mmx.c | 16 -- vp9/common/x86/vp9_loopfilter_intrin_sse2.c | 16 -- vp9/common/x86/vp9_loopfilter_mmx.asm | 343 -------------------------- vp9/common/x86/vp9_loopfilter_sse2.asm | 366 ---------------------------- vp9/common/x86/vp9_loopfilter_x86.h | 8 - 7 files changed, 807 deletions(-) diff --git a/vp9/common/vp9_loopfilter_filters.c b/vp9/common/vp9_loopfilter_filters.c index 15785f5..bf97589 100644 --- a/vp9/common/vp9_loopfilter_filters.c +++ b/vp9/common/vp9_loopfilter_filters.c @@ -282,29 +282,6 @@ static INLINE void simple_filter(int8_t mask, *op0 = signed_char_clamp(p0 + filter2) ^ 0x80; } -void vp9_loop_filter_simple_horizontal_edge_c(uint8_t *s, int p, - const uint8_t *blimit) { - int i = 0; - - do { - const int8_t mask = simple_filter_mask(blimit[0], s[-2 * p], s[-1 * p], - s[0 * p], s[1 * p]); - simple_filter(mask, s - 2 * p, s - 1 * p, s, s + 1 * p); - ++s; - } while (++i < 16); -} - -void vp9_loop_filter_simple_vertical_edge_c(uint8_t *s, int p, - const uint8_t *blimit) { - int i = 0; - - do { - const int8_t mask = simple_filter_mask(blimit[0], s[-2], s[-1], s[0], s[1]); - simple_filter(mask, s - 2, s - 1, s, s + 1); - s += p; - } while (++i < 16); -} - /* Vertical MB Filtering */ void vp9_loop_filter_mbv_c(uint8_t *y_ptr, uint8_t *u_ptr, uint8_t *v_ptr, int y_stride, int uv_stride, @@ -392,11 +369,6 @@ void vp9_loop_filter_bh8x8_c(uint8_t *y, uint8_t *u, uint8_t *v, lfi->blim, lfi->lim, lfi->hev_thr, 1); } -void vp9_loop_filter_bhs_c(uint8_t *y, int y_stride, const uint8_t *blimit) { - vp9_loop_filter_simple_horizontal_edge_c(y + 4 * y_stride, y_stride, blimit); - vp9_loop_filter_simple_horizontal_edge_c(y + 8 * y_stride, y_stride, blimit); - vp9_loop_filter_simple_horizontal_edge_c(y + 12 * y_stride, y_stride, blimit); -} void vp9_loop_filter_bv8x8_c(uint8_t *y, uint8_t *u, uint8_t *v, int y_stride, int uv_stride, @@ -413,12 +385,6 @@ void vp9_loop_filter_bv8x8_c(uint8_t *y, uint8_t *u, uint8_t *v, lfi->blim, lfi->lim, lfi->hev_thr, 1); } -void vp9_loop_filter_bvs_c(uint8_t *y, int y_stride, const uint8_t *blimit) { - vp9_loop_filter_simple_vertical_edge_c(y + 4, y_stride, blimit); - vp9_loop_filter_simple_vertical_edge_c(y + 8, y_stride, blimit); - vp9_loop_filter_simple_vertical_edge_c(y + 12, y_stride, blimit); -} - static INLINE void wide_mbfilter(int8_t mask, uint8_t hev, uint8_t flat, uint8_t flat2, uint8_t *op7, uint8_t *op6, uint8_t *op5, diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index 75e3604..45ae4fe 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -128,30 +128,6 @@ specialize vp9_loop_filter_bh sse2 prototype void vp9_loop_filter_bh8x8 "uint8_t *y, uint8_t *u, uint8_t *v, int ystride, int uv_stride, struct loop_filter_info *lfi" specialize vp9_loop_filter_bh8x8 sse2 -prototype void vp9_loop_filter_simple_mbv "uint8_t *y, int ystride, const uint8_t *blimit" -specialize vp9_loop_filter_simple_mbv mmx sse2 -vp9_loop_filter_simple_mbv_c=vp9_loop_filter_simple_vertical_edge_c -vp9_loop_filter_simple_mbv_mmx=vp9_loop_filter_simple_vertical_edge_mmx -vp9_loop_filter_simple_mbv_sse2=vp9_loop_filter_simple_vertical_edge_sse2 - -prototype void vp9_loop_filter_simple_mbh "uint8_t *y, int ystride, const uint8_t *blimit" -specialize vp9_loop_filter_simple_mbh mmx sse2 -vp9_loop_filter_simple_mbh_c=vp9_loop_filter_simple_horizontal_edge_c -vp9_loop_filter_simple_mbh_mmx=vp9_loop_filter_simple_horizontal_edge_mmx -vp9_loop_filter_simple_mbh_sse2=vp9_loop_filter_simple_horizontal_edge_sse2 - -prototype void vp9_loop_filter_simple_bv "uint8_t *y, int ystride, const uint8_t *blimit" -specialize vp9_loop_filter_simple_bv mmx sse2 -vp9_loop_filter_simple_bv_c=vp9_loop_filter_bvs_c -vp9_loop_filter_simple_bv_mmx=vp9_loop_filter_bvs_mmx -vp9_loop_filter_simple_bv_sse2=vp9_loop_filter_bvs_sse2 - -prototype void vp9_loop_filter_simple_bh "uint8_t *y, int ystride, const uint8_t *blimit" -specialize vp9_loop_filter_simple_bh mmx sse2 -vp9_loop_filter_simple_bh_c=vp9_loop_filter_bhs_c -vp9_loop_filter_simple_bh_mmx=vp9_loop_filter_bhs_mmx -vp9_loop_filter_simple_bh_sse2=vp9_loop_filter_bhs_sse2 - prototype void vp9_lpf_mbh_w "unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi" specialize vp9_lpf_mbh_w sse2 diff --git a/vp9/common/x86/vp9_loopfilter_intrin_mmx.c b/vp9/common/x86/vp9_loopfilter_intrin_mmx.c index 2be9e31..7e6c4be 100644 --- a/vp9/common/x86/vp9_loopfilter_intrin_mmx.c +++ b/vp9/common/x86/vp9_loopfilter_intrin_mmx.c @@ -35,16 +35,6 @@ void vp9_loop_filter_bh_mmx(unsigned char *y_ptr, } -void vp9_loop_filter_bhs_mmx(unsigned char *y_ptr, int y_stride, - const unsigned char *blimit) { - vp9_loop_filter_simple_horizontal_edge_mmx(y_ptr + 4 * y_stride, - y_stride, blimit); - vp9_loop_filter_simple_horizontal_edge_mmx(y_ptr + 8 * y_stride, - y_stride, blimit); - vp9_loop_filter_simple_horizontal_edge_mmx(y_ptr + 12 * y_stride, - y_stride, blimit); -} - /* Vertical B Filtering */ void vp9_loop_filter_bv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, @@ -66,9 +56,3 @@ void vp9_loop_filter_bv_mmx(unsigned char *y_ptr, lfi->blim, lfi->lim, lfi->hev_thr, 1); } -void vp9_loop_filter_bvs_mmx(unsigned char *y_ptr, int y_stride, - const unsigned char *blimit) { - vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 4, y_stride, blimit); - vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 8, y_stride, blimit); - vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 12, y_stride, blimit); -} diff --git a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c b/vp9/common/x86/vp9_loopfilter_intrin_sse2.c index 08447a6..7982ca6 100644 --- a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c +++ b/vp9/common/x86/vp9_loopfilter_intrin_sse2.c @@ -1115,16 +1115,6 @@ void vp9_loop_filter_bh_sse2(unsigned char *y_ptr, v_ptr + 4 * uv_stride); } -void vp9_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride, - const unsigned char *blimit) { - vp9_loop_filter_simple_horizontal_edge_sse2(y_ptr + 4 * y_stride, - y_stride, blimit); - vp9_loop_filter_simple_horizontal_edge_sse2(y_ptr + 8 * y_stride, - y_stride, blimit); - vp9_loop_filter_simple_horizontal_edge_sse2(y_ptr + 12 * y_stride, - y_stride, blimit); -} - /* Vertical B Filtering */ void vp9_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, @@ -1143,9 +1133,3 @@ void vp9_loop_filter_bv_sse2(unsigned char *y_ptr, v_ptr + 4); } -void vp9_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride, - const unsigned char *blimit) { - vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 4, y_stride, blimit); - vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 8, y_stride, blimit); - vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 12, y_stride, blimit); -} diff --git a/vp9/common/x86/vp9_loopfilter_mmx.asm b/vp9/common/x86/vp9_loopfilter_mmx.asm index ceffdf5..4ebb51b 100644 --- a/vp9/common/x86/vp9_loopfilter_mmx.asm +++ b/vp9/common/x86/vp9_loopfilter_mmx.asm @@ -593,349 +593,6 @@ sym(vp9_loop_filter_vertical_edge_mmx): pop rbp ret - -;void vp9_loop_filter_simple_horizontal_edge_mmx -;( -; unsigned char *src_ptr, -; int src_pixel_step, -; const char *blimit -;) -global sym(vp9_loop_filter_simple_horizontal_edge_mmx) PRIVATE -sym(vp9_loop_filter_simple_horizontal_edge_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 3 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? - - mov rcx, 2 ; count -.nexts8_h: - mov rdx, arg(2) ;blimit ; get blimit - movq mm3, [rdx] ; - - mov rdi, rsi ; rdi points to row +1 for indirect addressing - add rdi, rax - neg rax - - ; calculate mask - movq mm1, [rsi+2*rax] ; p1 - movq mm0, [rdi] ; q1 - movq mm2, mm1 - movq mm7, mm0 - movq mm4, mm0 - psubusb mm0, mm1 ; q1-=p1 - psubusb mm1, mm4 ; p1-=q1 - por mm1, mm0 ; abs(p1-q1) - pand mm1, [GLOBAL(tfe)] ; set lsb of each byte to zero - psrlw mm1, 1 ; abs(p1-q1)/2 - - movq mm5, [rsi+rax] ; p0 - movq mm4, [rsi] ; q0 - movq mm0, mm4 ; q0 - movq mm6, mm5 ; p0 - psubusb mm5, mm4 ; p0-=q0 - psubusb mm4, mm6 ; q0-=p0 - por mm5, mm4 ; abs(p0 - q0) - paddusb mm5, mm5 ; abs(p0-q0)*2 - paddusb mm5, mm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - - psubusb mm5, mm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit - pxor mm3, mm3 - pcmpeqb mm5, mm3 - - ; start work on filters - pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values - pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values - psubsb mm2, mm7 ; p1 - q1 - - pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values - pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values - movq mm3, mm0 ; q0 - psubsb mm0, mm6 ; q0 - p0 - paddsb mm2, mm0 ; p1 - q1 + 1 * (q0 - p0) - paddsb mm2, mm0 ; p1 - q1 + 2 * (q0 - p0) - paddsb mm2, mm0 ; p1 - q1 + 3 * (q0 - p0) - pand mm5, mm2 ; mask filter values we don't care about - - ; do + 4 side - paddsb mm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4 - - movq mm0, mm5 ; get a copy of filters - psllw mm0, 8 ; shift left 8 - psraw mm0, 3 ; arithmetic shift right 11 - psrlw mm0, 8 - movq mm1, mm5 ; get a copy of filters - psraw mm1, 11 ; arithmetic shift right 11 - psllw mm1, 8 ; shift left 8 to put it back - - por mm0, mm1 ; put the two together to get result - - psubsb mm3, mm0 ; q0-= q0 add - pxor mm3, [GLOBAL(t80)] ; unoffset - movq [rsi], mm3 ; write back - - - ; now do +3 side - psubsb mm5, [GLOBAL(t1s)] ; +3 instead of +4 - - movq mm0, mm5 ; get a copy of filters - psllw mm0, 8 ; shift left 8 - psraw mm0, 3 ; arithmetic shift right 11 - psrlw mm0, 8 - psraw mm5, 11 ; arithmetic shift right 11 - psllw mm5, 8 ; shift left 8 to put it back - por mm0, mm5 ; put the two together to get result - - - paddsb mm6, mm0 ; p0+= p0 add - pxor mm6, [GLOBAL(t80)] ; unoffset - movq [rsi+rax], mm6 ; write back - - add rsi,8 - neg rax - dec rcx - jnz .nexts8_h - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_loop_filter_simple_vertical_edge_mmx -;( -; unsigned char *src_ptr, -; int src_pixel_step, -; const char *blimit -;) -global sym(vp9_loop_filter_simple_vertical_edge_mmx) PRIVATE -sym(vp9_loop_filter_simple_vertical_edge_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 3 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 32 ; reserve 32 bytes - %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; - %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; - - mov rsi, arg(0) ;src_ptr - movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? - - lea rsi, [rsi + rax*4- 2]; ; - mov rcx, 2 ; count -.nexts8_v: - - lea rdi, [rsi + rax]; - movd mm0, [rdi + rax * 2] ; xx xx xx xx 73 72 71 70 - - movd mm6, [rsi + rax * 2] ; xx xx xx xx 63 62 61 60 - punpcklbw mm6, mm0 ; 73 63 72 62 71 61 70 60 - - movd mm0, [rsi + rax] ; xx xx xx xx 53 52 51 50 - movd mm4, [rsi] ; xx xx xx xx 43 42 41 40 - - punpcklbw mm4, mm0 ; 53 43 52 42 51 41 50 40 - movq mm5, mm4 ; 53 43 52 42 51 41 50 40 - - punpcklwd mm4, mm6 ; 71 61 51 41 70 60 50 40 - punpckhwd mm5, mm6 ; 73 63 53 43 72 62 52 42 - - neg rax - - movd mm7, [rsi + rax] ; xx xx xx xx 33 32 31 30 - movd mm6, [rsi + rax * 2] ; xx xx xx xx 23 22 21 20 - - punpcklbw mm6, mm7 ; 33 23 32 22 31 21 30 20 - movd mm1, [rdi + rax * 4] ; xx xx xx xx 13 12 11 10 - - movd mm0, [rsi + rax * 4] ; xx xx xx xx 03 02 01 00 - punpcklbw mm0, mm1 ; 13 03 12 02 11 01 10 00 - - movq mm2, mm0 ; 13 03 12 02 11 01 10 00 - punpcklwd mm0, mm6 ; 31 21 11 01 30 20 10 00 - - punpckhwd mm2, mm6 ; 33 23 13 03 32 22 12 02 - movq mm1, mm0 ; 13 03 12 02 11 01 10 00 - - punpckldq mm0, mm4 ; 70 60 50 40 30 20 10 00 = p1 - movq mm3, mm2 ; 33 23 13 03 32 22 12 02 - - punpckhdq mm1, mm4 ; 71 61 51 41 31 21 11 01 = p0 - punpckldq mm2, mm5 ; 72 62 52 42 32 22 12 02 = q0 - - punpckhdq mm3, mm5 ; 73 63 53 43 33 23 13 03 = q1 - - - ; calculate mask - movq mm6, mm0 ; p1 - movq mm7, mm3 ; q1 - psubusb mm7, mm6 ; q1-=p1 - psubusb mm6, mm3 ; p1-=q1 - por mm6, mm7 ; abs(p1-q1) - pand mm6, [GLOBAL(tfe)] ; set lsb of each byte to zero - psrlw mm6, 1 ; abs(p1-q1)/2 - - movq mm5, mm1 ; p0 - movq mm4, mm2 ; q0 - - psubusb mm5, mm2 ; p0-=q0 - psubusb mm4, mm1 ; q0-=p0 - - por mm5, mm4 ; abs(p0 - q0) - paddusb mm5, mm5 ; abs(p0-q0)*2 - paddusb mm5, mm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - - mov rdx, arg(2) ;blimit ; get blimit - movq mm7, [rdx] - - psubusb mm5, mm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit - pxor mm7, mm7 - pcmpeqb mm5, mm7 ; mm5 = mask - - ; start work on filters - movq t0, mm0 - movq t1, mm3 - - pxor mm0, [GLOBAL(t80)] ; p1 offset to convert to signed values - pxor mm3, [GLOBAL(t80)] ; q1 offset to convert to signed values - - psubsb mm0, mm3 ; p1 - q1 - movq mm6, mm1 ; p0 - - movq mm7, mm2 ; q0 - pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values - - pxor mm7, [GLOBAL(t80)] ; offset to convert to signed values - movq mm3, mm7 ; offseted ; q0 - - psubsb mm7, mm6 ; q0 - p0 - paddsb mm0, mm7 ; p1 - q1 + 1 * (q0 - p0) - - paddsb mm0, mm7 ; p1 - q1 + 2 * (q0 - p0) - paddsb mm0, mm7 ; p1 - q1 + 3 * (q0 - p0) - - pand mm5, mm0 ; mask filter values we don't care about - - paddsb mm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4 - - movq mm0, mm5 ; get a copy of filters - psllw mm0, 8 ; shift left 8 - psraw mm0, 3 ; arithmetic shift right 11 - psrlw mm0, 8 - - movq mm7, mm5 ; get a copy of filters - psraw mm7, 11 ; arithmetic shift right 11 - psllw mm7, 8 ; shift left 8 to put it back - - por mm0, mm7 ; put the two together to get result - - psubsb mm3, mm0 ; q0-= q0sz add - pxor mm3, [GLOBAL(t80)] ; unoffset - - ; now do +3 side - psubsb mm5, [GLOBAL(t1s)] ; +3 instead of +4 - - movq mm0, mm5 ; get a copy of filters - psllw mm0, 8 ; shift left 8 - psraw mm0, 3 ; arithmetic shift right 11 - psrlw mm0, 8 - - psraw mm5, 11 ; arithmetic shift right 11 - psllw mm5, 8 ; shift left 8 to put it back - por mm0, mm5 ; put the two together to get result - - paddsb mm6, mm0 ; p0+= p0 add - pxor mm6, [GLOBAL(t80)] ; unoffset - - - movq mm0, t0 - movq mm4, t1 - - ; mm0 = 70 60 50 40 30 20 10 00 - ; mm6 = 71 61 51 41 31 21 11 01 - ; mm3 = 72 62 52 42 32 22 12 02 - ; mm4 = 73 63 53 43 33 23 13 03 - ; transpose back to write out - - movq mm1, mm0 ; - punpcklbw mm0, mm6 ; 31 30 21 20 11 10 01 00 - - punpckhbw mm1, mm6 ; 71 70 61 60 51 50 41 40 - movq mm2, mm3 ; - - punpcklbw mm2, mm4 ; 33 32 23 22 13 12 03 02 - movq mm5, mm1 ; 71 70 61 60 51 50 41 40 - - punpckhbw mm3, mm4 ; 73 72 63 62 53 52 43 42 - movq mm6, mm0 ; 31 30 21 20 11 10 01 00 - - punpcklwd mm0, mm2 ; 13 12 11 10 03 02 01 00 - punpckhwd mm6, mm2 ; 33 32 31 30 23 22 21 20 - - movd [rsi+rax*4], mm0 ; write 03 02 01 00 - punpcklwd mm1, mm3 ; 53 52 51 50 43 42 41 40 - - psrlq mm0, 32 ; xx xx xx xx 13 12 11 10 - punpckhwd mm5, mm3 ; 73 72 71 70 63 62 61 60 - - movd [rdi+rax*4], mm0 ; write 13 12 11 10 - movd [rsi+rax*2], mm6 ; write 23 22 21 20 - - psrlq mm6, 32 ; 33 32 31 30 - movd [rsi], mm1 ; write 43 42 41 40 - - movd [rsi + rax], mm6 ; write 33 32 31 30 - neg rax - - movd [rsi + rax*2], mm5 ; write 63 62 61 60 - psrlq mm1, 32 ; 53 52 51 50 - - movd [rdi], mm1 ; write out 53 52 51 50 - psrlq mm5, 32 ; 73 72 71 70 - - movd [rdi + rax*2], mm5 ; write 73 72 71 70 - - lea rsi, [rsi+rax*8] ; next 8 - - dec rcx - jnz .nexts8_v - - add rsp, 32 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - - -;void fast_loop_filter_vertical_edges_mmx(unsigned char *y_ptr, -; int y_stride, -; loop_filter_info *lfi) -;{ -; -; -; vp9_loop_filter_simple_vertical_edge_mmx(y_ptr+4, y_stride, lfi->flim,lfi->lim,lfi->thr,2); -; vp9_loop_filter_simple_vertical_edge_mmx(y_ptr+8, y_stride, lfi->flim,lfi->lim,lfi->thr,2); -; vp9_loop_filter_simple_vertical_edge_mmx(y_ptr+12, y_stride, lfi->flim,lfi->lim,lfi->thr,2); -;} - SECTION_RODATA align 16 tfe: diff --git a/vp9/common/x86/vp9_loopfilter_sse2.asm b/vp9/common/x86/vp9_loopfilter_sse2.asm index ae4c60f..74236cf 100644 --- a/vp9/common/x86/vp9_loopfilter_sse2.asm +++ b/vp9/common/x86/vp9_loopfilter_sse2.asm @@ -845,372 +845,6 @@ sym(vp9_loop_filter_vertical_edge_uv_sse2): pop rbp ret -;void vp9_loop_filter_simple_horizontal_edge_sse2 -;( -; unsigned char *src_ptr, -; int src_pixel_step, -; const char *blimit, -;) -global sym(vp9_loop_filter_simple_horizontal_edge_sse2) PRIVATE -sym(vp9_loop_filter_simple_horizontal_edge_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 3 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? - mov rdx, arg(2) ;blimit - movdqa xmm3, XMMWORD PTR [rdx] - - mov rdi, rsi ; rdi points to row +1 for indirect addressing - add rdi, rax - neg rax - - ; calculate mask - movdqa xmm1, [rsi+2*rax] ; p1 - movdqa xmm0, [rdi] ; q1 - movdqa xmm2, xmm1 - movdqa xmm7, xmm0 - movdqa xmm4, xmm0 - psubusb xmm0, xmm1 ; q1-=p1 - psubusb xmm1, xmm4 ; p1-=q1 - por xmm1, xmm0 ; abs(p1-q1) - pand xmm1, [GLOBAL(tfe)] ; set lsb of each byte to zero - psrlw xmm1, 1 ; abs(p1-q1)/2 - - movdqa xmm5, [rsi+rax] ; p0 - movdqa xmm4, [rsi] ; q0 - movdqa xmm0, xmm4 ; q0 - movdqa xmm6, xmm5 ; p0 - psubusb xmm5, xmm4 ; p0-=q0 - psubusb xmm4, xmm6 ; q0-=p0 - por xmm5, xmm4 ; abs(p0 - q0) - paddusb xmm5, xmm5 ; abs(p0-q0)*2 - paddusb xmm5, xmm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - - psubusb xmm5, xmm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit - pxor xmm3, xmm3 - pcmpeqb xmm5, xmm3 - - ; start work on filters - pxor xmm2, [GLOBAL(t80)] ; p1 offset to convert to signed values - pxor xmm7, [GLOBAL(t80)] ; q1 offset to convert to signed values - psubsb xmm2, xmm7 ; p1 - q1 - - pxor xmm6, [GLOBAL(t80)] ; offset to convert to signed values - pxor xmm0, [GLOBAL(t80)] ; offset to convert to signed values - movdqa xmm3, xmm0 ; q0 - psubsb xmm0, xmm6 ; q0 - p0 - paddsb xmm2, xmm0 ; p1 - q1 + 1 * (q0 - p0) - paddsb xmm2, xmm0 ; p1 - q1 + 2 * (q0 - p0) - paddsb xmm2, xmm0 ; p1 - q1 + 3 * (q0 - p0) - pand xmm5, xmm2 ; mask filter values we don't care about - - ; do + 4 side - paddsb xmm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4 - - movdqa xmm0, xmm5 ; get a copy of filters - psllw xmm0, 8 ; shift left 8 - psraw xmm0, 3 ; arithmetic shift right 11 - psrlw xmm0, 8 - movdqa xmm1, xmm5 ; get a copy of filters - psraw xmm1, 11 ; arithmetic shift right 11 - psllw xmm1, 8 ; shift left 8 to put it back - - por xmm0, xmm1 ; put the two together to get result - - psubsb xmm3, xmm0 ; q0-= q0 add - pxor xmm3, [GLOBAL(t80)] ; unoffset - movdqa [rsi], xmm3 ; write back - - ; now do +3 side - psubsb xmm5, [GLOBAL(t1s)] ; +3 instead of +4 - - movdqa xmm0, xmm5 ; get a copy of filters - psllw xmm0, 8 ; shift left 8 - psraw xmm0, 3 ; arithmetic shift right 11 - psrlw xmm0, 8 - psraw xmm5, 11 ; arithmetic shift right 11 - psllw xmm5, 8 ; shift left 8 to put it back - por xmm0, xmm5 ; put the two together to get result - - - paddsb xmm6, xmm0 ; p0+= p0 add - pxor xmm6, [GLOBAL(t80)] ; unoffset - movdqa [rsi+rax], xmm6 ; write back - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_loop_filter_simple_vertical_edge_sse2 -;( -; unsigned char *src_ptr, -; int src_pixel_step, -; const char *blimit, -;) -global sym(vp9_loop_filter_simple_vertical_edge_sse2) PRIVATE -sym(vp9_loop_filter_simple_vertical_edge_sse2): - push rbp ; save old base pointer value. - mov rbp, rsp ; set new base pointer value. - SHADOW_ARGS_TO_STACK 3 - SAVE_XMM 7 - GET_GOT rbx ; save callee-saved reg - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 32 ; reserve 32 bytes - %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16]; - %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16]; - - mov rsi, arg(0) ;src_ptr - movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? - - lea rsi, [rsi - 2 ] - lea rdi, [rsi + rax] - lea rdx, [rsi + rax*4] - lea rcx, [rdx + rax] - - movd xmm0, [rsi] ; (high 96 bits unused) 03 02 01 00 - movd xmm1, [rdx] ; (high 96 bits unused) 43 42 41 40 - movd xmm2, [rdi] ; 13 12 11 10 - movd xmm3, [rcx] ; 53 52 51 50 - punpckldq xmm0, xmm1 ; (high 64 bits unused) 43 42 41 40 03 02 01 00 - punpckldq xmm2, xmm3 ; 53 52 51 50 13 12 11 10 - - movd xmm4, [rsi + rax*2] ; 23 22 21 20 - movd xmm5, [rdx + rax*2] ; 63 62 61 60 - movd xmm6, [rdi + rax*2] ; 33 32 31 30 - movd xmm7, [rcx + rax*2] ; 73 72 71 70 - punpckldq xmm4, xmm5 ; 63 62 61 60 23 22 21 20 - punpckldq xmm6, xmm7 ; 73 72 71 70 33 32 31 30 - - punpcklbw xmm0, xmm2 ; 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00 - punpcklbw xmm4, xmm6 ; 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20 - - movdqa xmm1, xmm0 - punpcklwd xmm0, xmm4 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00 - punpckhwd xmm1, xmm4 ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40 - - movdqa xmm2, xmm0 - punpckldq xmm0, xmm1 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00 - punpckhdq xmm2, xmm1 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 - - movdqa t0, xmm0 ; save to t0 - movdqa t1, xmm2 ; save to t1 - - lea rsi, [rsi + rax*8] - lea rdi, [rsi + rax] - lea rdx, [rsi + rax*4] - lea rcx, [rdx + rax] - - movd xmm4, [rsi] ; 83 82 81 80 - movd xmm1, [rdx] ; c3 c2 c1 c0 - movd xmm6, [rdi] ; 93 92 91 90 - movd xmm3, [rcx] ; d3 d2 d1 d0 - punpckldq xmm4, xmm1 ; c3 c2 c1 c0 83 82 81 80 - punpckldq xmm6, xmm3 ; d3 d2 d1 d0 93 92 91 90 - - movd xmm0, [rsi + rax*2] ; a3 a2 a1 a0 - movd xmm5, [rdx + rax*2] ; e3 e2 e1 e0 - movd xmm2, [rdi + rax*2] ; b3 b2 b1 b0 - movd xmm7, [rcx + rax*2] ; f3 f2 f1 f0 - punpckldq xmm0, xmm5 ; e3 e2 e1 e0 a3 a2 a1 a0 - punpckldq xmm2, xmm7 ; f3 f2 f1 f0 b3 b2 b1 b0 - - punpcklbw xmm4, xmm6 ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80 - punpcklbw xmm0, xmm2 ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0 - - movdqa xmm1, xmm4 - punpcklwd xmm4, xmm0 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80 - punpckhwd xmm1, xmm0 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0 - - movdqa xmm6, xmm4 - punpckldq xmm4, xmm1 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80 - punpckhdq xmm6, xmm1 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82 - - movdqa xmm0, t0 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00 - movdqa xmm2, t1 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 - movdqa xmm1, xmm0 - movdqa xmm3, xmm2 - - punpcklqdq xmm0, xmm4 ; p1 f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 - punpckhqdq xmm1, xmm4 ; p0 f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 - punpcklqdq xmm2, xmm6 ; q0 f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 - punpckhqdq xmm3, xmm6 ; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 - - ; calculate mask - movdqa xmm6, xmm0 ; p1 - movdqa xmm7, xmm3 ; q1 - psubusb xmm7, xmm0 ; q1-=p1 - psubusb xmm6, xmm3 ; p1-=q1 - por xmm6, xmm7 ; abs(p1-q1) - pand xmm6, [GLOBAL(tfe)] ; set lsb of each byte to zero - psrlw xmm6, 1 ; abs(p1-q1)/2 - - movdqa xmm5, xmm1 ; p0 - movdqa xmm4, xmm2 ; q0 - psubusb xmm5, xmm2 ; p0-=q0 - psubusb xmm4, xmm1 ; q0-=p0 - por xmm5, xmm4 ; abs(p0 - q0) - paddusb xmm5, xmm5 ; abs(p0-q0)*2 - paddusb xmm5, xmm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - - mov rdx, arg(2) ;blimit - movdqa xmm7, XMMWORD PTR [rdx] - - psubusb xmm5, xmm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit - pxor xmm7, xmm7 - pcmpeqb xmm5, xmm7 ; mm5 = mask - - ; start work on filters - movdqa t0, xmm0 - movdqa t1, xmm3 - - pxor xmm0, [GLOBAL(t80)] ; p1 offset to convert to signed values - pxor xmm3, [GLOBAL(t80)] ; q1 offset to convert to signed values - - psubsb xmm0, xmm3 ; p1 - q1 - movdqa xmm6, xmm1 ; p0 - - movdqa xmm7, xmm2 ; q0 - pxor xmm6, [GLOBAL(t80)] ; offset to convert to signed values - - pxor xmm7, [GLOBAL(t80)] ; offset to convert to signed values - movdqa xmm3, xmm7 ; offseted ; q0 - - psubsb xmm7, xmm6 ; q0 - p0 - paddsb xmm0, xmm7 ; p1 - q1 + 1 * (q0 - p0) - - paddsb xmm0, xmm7 ; p1 - q1 + 2 * (q0 - p0) - paddsb xmm0, xmm7 ; p1 - q1 + 3 * (q0 - p0) - - pand xmm5, xmm0 ; mask filter values we don't care about - - - paddsb xmm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4 - - movdqa xmm0, xmm5 ; get a copy of filters - psllw xmm0, 8 ; shift left 8 - - psraw xmm0, 3 ; arithmetic shift right 11 - psrlw xmm0, 8 - - movdqa xmm7, xmm5 ; get a copy of filters - psraw xmm7, 11 ; arithmetic shift right 11 - - psllw xmm7, 8 ; shift left 8 to put it back - por xmm0, xmm7 ; put the two together to get result - - psubsb xmm3, xmm0 ; q0-= q0sz add - pxor xmm3, [GLOBAL(t80)] ; unoffset q0 - - ; now do +3 side - psubsb xmm5, [GLOBAL(t1s)] ; +3 instead of +4 - movdqa xmm0, xmm5 ; get a copy of filters - - psllw xmm0, 8 ; shift left 8 - psraw xmm0, 3 ; arithmetic shift right 11 - - psrlw xmm0, 8 - psraw xmm5, 11 ; arithmetic shift right 11 - - psllw xmm5, 8 ; shift left 8 to put it back - por xmm0, xmm5 ; put the two together to get result - - paddsb xmm6, xmm0 ; p0+= p0 add - pxor xmm6, [GLOBAL(t80)] ; unoffset p0 - - movdqa xmm0, t0 ; p1 - movdqa xmm4, t1 ; q1 - - ; transpose back to write out - ; p1 f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 - ; p0 f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 - ; q0 f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 - ; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm6 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00 - punpckhbw xmm1, xmm6 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80 - - movdqa xmm5, xmm3 - punpcklbw xmm3, xmm4 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 - punpckhbw xmm5, xmm4 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 - - movdqa xmm2, xmm0 - punpcklwd xmm0, xmm3 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00 - punpckhwd xmm2, xmm3 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40 - - movdqa xmm3, xmm1 - punpcklwd xmm1, xmm5 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80 - punpckhwd xmm3, xmm5 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0 - - ; write out order: xmm0 xmm2 xmm1 xmm3 - lea rdx, [rsi + rax*4] - - movd [rsi], xmm1 ; write the second 8-line result - psrldq xmm1, 4 - movd [rdi], xmm1 - psrldq xmm1, 4 - movd [rsi + rax*2], xmm1 - psrldq xmm1, 4 - movd [rdi + rax*2], xmm1 - - movd [rdx], xmm3 - psrldq xmm3, 4 - movd [rcx], xmm3 - psrldq xmm3, 4 - movd [rdx + rax*2], xmm3 - psrldq xmm3, 4 - movd [rcx + rax*2], xmm3 - - neg rax - lea rsi, [rsi + rax*8] - neg rax - lea rdi, [rsi + rax] - lea rdx, [rsi + rax*4] - lea rcx, [rdx + rax] - - movd [rsi], xmm0 ; write the first 8-line result - psrldq xmm0, 4 - movd [rdi], xmm0 - psrldq xmm0, 4 - movd [rsi + rax*2], xmm0 - psrldq xmm0, 4 - movd [rdi + rax*2], xmm0 - - movd [rdx], xmm2 - psrldq xmm2, 4 - movd [rcx], xmm2 - psrldq xmm2, 4 - movd [rdx + rax*2], xmm2 - psrldq xmm2, 4 - movd [rcx + rax*2], xmm2 - - add rsp, 32 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - SECTION_RODATA align 16 tfe: diff --git a/vp9/common/x86/vp9_loopfilter_x86.h b/vp9/common/x86/vp9_loopfilter_x86.h index 46a6202..fb5af05 100644 --- a/vp9/common/x86/vp9_loopfilter_x86.h +++ b/vp9/common/x86/vp9_loopfilter_x86.h @@ -23,10 +23,6 @@ extern prototype_loopfilter_block(vp9_loop_filter_mbv_mmx); extern prototype_loopfilter_block(vp9_loop_filter_bv_mmx); extern prototype_loopfilter_block(vp9_loop_filter_mbh_mmx); extern prototype_loopfilter_block(vp9_loop_filter_bh_mmx); -extern prototype_simple_loopfilter(vp9_loop_filter_simple_vertical_edge_mmx); -extern prototype_simple_loopfilter(vp9_loop_filter_bvs_mmx); -extern prototype_simple_loopfilter(vp9_loop_filter_simple_horizontal_edge_mmx); -extern prototype_simple_loopfilter(vp9_loop_filter_bhs_mmx); #endif #if HAVE_SSE2 @@ -34,10 +30,6 @@ extern prototype_loopfilter_block(vp9_loop_filter_mbv_sse2); extern prototype_loopfilter_block(vp9_loop_filter_bv_sse2); extern prototype_loopfilter_block(vp9_loop_filter_mbh_sse2); extern prototype_loopfilter_block(vp9_loop_filter_bh_sse2); -extern prototype_simple_loopfilter(vp9_loop_filter_simple_vertical_edge_sse2); -extern prototype_simple_loopfilter(vp9_loop_filter_bvs_sse2); -extern prototype_simple_loopfilter(vp9_loop_filter_simple_horizontal_edge_sse2); -extern prototype_simple_loopfilter(vp9_loop_filter_bhs_sse2); #endif #endif // LOOPFILTER_X86_H -- 2.7.4