call to this set of functions are replaced by var16x16.
Change-Id: I5ff1effc6c1358ea06cda1517b88ec28ef551b0d
cpi->rtcd.variance.mse16x16 = vp8_mse16x16_armv6;
/*cpi->rtcd.variance.getmbss = vp8_get_mb_ss_c;*/
- /*cpi->rtcd.variance.get16x16prederror = vp8_get16x16pred_error_c;
- cpi->rtcd.variance.get4x4sse_cs = vp8_get4x4sse_cs_c;*/
+ /*cpi->rtcd.variance.get4x4sse_cs = vp8_get4x4sse_cs_c;*/
/*cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_c;
cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_c;*/
cpi->rtcd.variance.mse16x16 = vp8_mse16x16_neon;
/*cpi->rtcd.variance.getmbss = vp8_get_mb_ss_c;*/
- cpi->rtcd.variance.get16x16prederror = vp8_get16x16pred_error_neon;
cpi->rtcd.variance.get4x4sse_cs = vp8_get4x4sse_cs_neon;
cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_neon;
EXPORT |vp8_mse16x16_neon|
- EXPORT |vp8_get16x16pred_error_neon|
EXPORT |vp8_get4x4sse_cs_neon|
ARM
ENDP
-;============================
-; r0 unsigned char *src_ptr
-; r1 int src_stride
-; r2 unsigned char *ref_ptr
-; r3 int ref_stride
-|vp8_get16x16pred_error_neon| PROC
- vmov.i8 q8, #0 ;q8 - sum
- vmov.i8 q9, #0 ;q9, q10 - pred_error
- vmov.i8 q10, #0
-
- mov r12, #8
-
-get16x16pred_error_neon_loop
- vld1.8 {q0}, [r0], r1 ;Load up source and reference
- vld1.8 {q2}, [r2], r3
- vld1.8 {q1}, [r0], r1
- vld1.8 {q3}, [r2], r3
-
- vsubl.u8 q11, d0, d4
- vsubl.u8 q12, d1, d5
- vsubl.u8 q13, d2, d6
- vsubl.u8 q14, d3, d7
-
- vpadal.s16 q8, q11
- vmlal.s16 q9, d22, d22
- vmlal.s16 q10, d23, d23
-
- subs r12, r12, #1
-
- vpadal.s16 q8, q12
- vmlal.s16 q9, d24, d24
- vmlal.s16 q10, d25, d25
- vpadal.s16 q8, q13
- vmlal.s16 q9, d26, d26
- vmlal.s16 q10, d27, d27
- vpadal.s16 q8, q14
- vmlal.s16 q9, d28, d28
- vmlal.s16 q10, d29, d29
-
- bne get16x16pred_error_neon_loop
-
- vadd.u32 q10, q9, q10
- vpaddl.s32 q0, q8
-
- vpaddl.u32 q1, q10
- vadd.s64 d0, d0, d1
- vadd.u64 d1, d2, d3
-
- vmull.s32 q5, d0, d0
- vshr.s32 d10, d10, #8
- vsub.s32 d0, d1, d10
-
- vmov.32 r0, d0[0]
- bx lr
-
- ENDP
;=============================
; r0 unsigned char *src_ptr,
//extern prototype_getmbss(vp8_get_mb_ss_c);
extern prototype_variance(vp8_mse16x16_neon);
-extern prototype_get16x16prederror(vp8_get16x16pred_error_neon);
extern prototype_get16x16prederror(vp8_get4x4sse_cs_neon);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_variance_mse16x16
#define vp8_variance_mse16x16 vp8_mse16x16_neon
-#undef vp8_variance_get16x16prederror
-#define vp8_variance_get16x16prederror vp8_get16x16pred_error_neon
-
#undef vp8_variance_get4x4sse_cs
#define vp8_variance_get4x4sse_cs vp8_get4x4sse_cs_neon
#endif
cpi->rtcd.variance.mse16x16 = vp8_mse16x16_c;
cpi->rtcd.variance.getmbss = vp8_get_mb_ss_c;
- cpi->rtcd.variance.get16x16prederror = vp8_get16x16pred_error_c;
cpi->rtcd.variance.get4x4sse_cs = vp8_get4x4sse_cs_c;
cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_c;
extern const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES];
-extern unsigned int (*vp8_get16x16pred_error)(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride);
extern unsigned int (*vp8_get4x4sse_cs)(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride);
extern int vp8_rd_pick_best_mbsegmentation(VP8_COMP *cpi, MACROBLOCK *x, MV *best_ref_mv, int best_rd, int *, int *, int *, int, int *mvcost[2], int, int fullpixel);
extern int vp8_cost_mv_ref(MB_PREDICTION_MODE m, const int near_mv_ref_ct[4]);
}
-unsigned int vp8_get16x16pred_error_c
-(
- const unsigned char *src_ptr,
- int src_stride,
- const unsigned char *ref_ptr,
- int ref_stride
-)
-{
- unsigned pred_error = 0;
- int i, j;
- int sum = 0;
-
- for (i = 0; i < 16; i++)
- {
- int diff;
-
- for (j = 0; j < 16; j++)
- {
- diff = src_ptr[j] - ref_ptr[j];
- sum += diff;
- pred_error += diff * diff;
- }
-
- src_ptr += src_stride;
- ref_ptr += ref_stride;
- }
-
- pred_error -= sum * sum / 256;
- return pred_error;
-}
-
unsigned int vp8_get4x4sse_cs_c
(
{
rate2 += rate;
distortion2 = VARIANCE_INVOKE
- (&cpi->rtcd.variance, get16x16prederror)(
+ (&cpi->rtcd.variance, var16x16)(
x->src.y_buffer, x->src.y_stride,
- x->e_mbd.predictor, 16);
+ x->e_mbd.predictor, 16, &sse);
this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
if (this_rd < best_intra_rd)
case TM_PRED:
RECON_INVOKE(&cpi->common.rtcd.recon, build_intra_predictors_mby)
(&x->e_mbd);
- distortion2 = VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16prederror)(x->src.y_buffer, x->src.y_stride, x->e_mbd.predictor, 16);
+ distortion2 = VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16)
+ (x->src.y_buffer, x->src.y_stride,
+ x->e_mbd.predictor, 16, &sse);
rate2 += x->mbmode_cost[x->e_mbd.frame_type][x->e_mbd.mode_info_context->mbmi.mode];
this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
int rate, best_rate = 0, distortion, best_distortion;
MB_PREDICTION_MODE mode, best_mode = DC_PRED;
int this_rd;
+ unsigned int sse;
x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;
x->e_mbd.mode_info_context->mbmi.mode = mode;
RECON_INVOKE(&cpi->common.rtcd.recon, build_intra_predictors_mby)
(&x->e_mbd);
- distortion = VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16prederror)
- (x->src.y_buffer, x->src.y_stride, x->e_mbd.predictor, 16);
+ distortion = VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16)
+ (x->src.y_buffer, x->src.y_stride, x->e_mbd.predictor, 16, &sse);
rate = x->mbmode_cost[x->e_mbd.frame_type][mode];
this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
void (*vp8_subtract_mbuv)(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
void (*vp8_fast_quantize_b)(BLOCK *b, BLOCKD *d);
-unsigned int (*vp8_get16x16pred_error)(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride);
unsigned int (*vp8_get4x4sse_cs)(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride);
// c imports
extern sub_pixel_variance_function sub_pixel_variance16x16_c;
extern unsigned int vp8_get_mb_ss_c(short *);
-extern unsigned int vp8_get16x16pred_error_c(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride);
extern unsigned int vp8_get4x4sse_cs_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride);
// ppc
vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_ppc;
vp8_get_mb_ss = vp8_get_mb_ss_c;
- vp8_get16x16pred_error = vp8_get16x16pred_error_c;
vp8_get4x4sse_cs = vp8_get4x4sse_cs_c;
vp8_sad16x16 = vp8_sad16x16_ppc;
#endif
extern prototype_variance(vp8_variance_mse16x16);
-#ifndef vp8_variance_get16x16prederror
-#define vp8_variance_get16x16prederror vp8_get16x16pred_error_c
-#endif
-extern prototype_get16x16prederror(vp8_variance_get16x16prederror);
-
#ifndef vp8_variance_get4x4sse_cs
#define vp8_variance_get4x4sse_cs vp8_get4x4sse_cs_c
#endif
vp8_getmbss_fn_t getmbss;
vp8_variance_fn_t mse16x16;
- vp8_get16x16prederror_fn_t get16x16prederror;
vp8_get16x16prederror_fn_t get4x4sse_cs;
vp8_sad_multi_fn_t sad16x16x3;
pop rbp
ret
-;unsigned int vp8_get16x16pred_error_mmx
-;(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride
-;)
-global sym(vp8_get16x16pred_error_mmx)
-sym(vp8_get16x16pred_error_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- GET_GOT rbx
- push rsi
- push rdi
- sub rsp, 16
- ; end prolog
-
- mov rsi, arg(0) ;DWORD PTR [src_ptr]
- mov rdi, arg(2) ;DWORD PTR [ref_ptr]
-
- movsxd rax, DWORD PTR arg(1) ;[src_stride]
- movsxd rdx, DWORD PTR arg(3) ;[ref_stride]
-
- pxor mm0, mm0 ; clear xmm0 for unpack
- pxor mm7, mm7 ; clear xmm7 for accumulating diffs
-
- pxor mm6, mm6 ; clear xmm6 for accumulating sse
- mov rcx, 16
-
-var16loop:
-
- movq mm1, [rsi]
- movq mm2, [rdi]
-
- movq mm3, mm1
- movq mm4, mm2
-
- punpcklbw mm1, mm0
- punpckhbw mm3, mm0
-
- punpcklbw mm2, mm0
- punpckhbw mm4, mm0
-
- psubw mm1, mm2
- psubw mm3, mm4
-
- paddw mm7, mm1
- pmaddwd mm1, mm1
-
- paddw mm7, mm3
- pmaddwd mm3, mm3
-
- paddd mm6, mm1
- paddd mm6, mm3
-
-
- movq mm1, [rsi+8]
- movq mm2, [rdi+8]
-
- movq mm3, mm1
- movq mm4, mm2
-
- punpcklbw mm1, mm0
- punpckhbw mm3, mm0
-
- punpcklbw mm2, mm0
- punpckhbw mm4, mm0
-
- psubw mm1, mm2
- psubw mm3, mm4
-
- paddw mm7, mm1
- pmaddwd mm1, mm1
-
- paddw mm7, mm3
- pmaddwd mm3, mm3
-
- paddd mm6, mm1
- paddd mm6, mm3
-
- add rsi, rax
- add rdi, rdx
-
- sub rcx, 1
- jnz var16loop
-
-
- movq mm1, mm6
- pxor mm6, mm6
-
- pxor mm5, mm5
- punpcklwd mm6, mm7
-
- punpckhwd mm5, mm7
- psrad mm5, 16
-
- psrad mm6, 16
- paddd mm6, mm5
-
- movq mm2, mm1
- psrlq mm1, 32
-
- paddd mm2, mm1
- movq mm7, mm6
-
- psrlq mm6, 32
- paddd mm6, mm7
-
- movd DWORD PTR [rsp], mm6 ;Sum
- movd DWORD PTR [rsp+4], mm2 ;SSE
-
- ; return (SSE-((Sum*Sum)>>8));
- movsxd rdx, dword ptr [rsp]
- imul rdx, rdx
- sar rdx, 8
- movsxd rax, dword ptr [rsp + 4]
- sub rax, rdx
-
-
- ; begin epilog
- add rsp, 16
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
SECTION_RODATA
;short mmx_bi_rd[4] = { 64, 64, 64, 64};
ret
-;unsigned int vp8_get16x16pred_error_sse2
-;(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride
-;)
-global sym(vp8_get16x16pred_error_sse2)
-sym(vp8_get16x16pred_error_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- sub rsp, 16
- ; end prolog
-
- mov rsi, arg(0) ;[src_ptr]
- mov rdi, arg(2) ;[ref_ptr]
-
- movsxd rax, DWORD PTR arg(1) ;[src_stride]
- movsxd rdx, DWORD PTR arg(3) ;[ref_stride]
-
- pxor xmm0, xmm0 ; clear xmm0 for unpack
- pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
-
- pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
- mov rcx, 16
-
-var16peloop:
- movdqu xmm1, XMMWORD PTR [rsi]
- movdqu xmm2, XMMWORD PTR [rdi]
-
- movdqa xmm3, xmm1
- movdqa xmm4, xmm2
-
- punpcklbw xmm1, xmm0
- punpckhbw xmm3, xmm0
-
- punpcklbw xmm2, xmm0
- punpckhbw xmm4, xmm0
-
- psubw xmm1, xmm2
- psubw xmm3, xmm4
-
- paddw xmm7, xmm1
- pmaddwd xmm1, xmm1
-
- paddw xmm7, xmm3
- pmaddwd xmm3, xmm3
-
- paddd xmm6, xmm1
- paddd xmm6, xmm3
-
- add rsi, rax
- add rdi, rdx
-
- sub rcx, 1
- jnz var16peloop
-
-
- movdqa xmm1, xmm6
- pxor xmm6, xmm6
-
- pxor xmm5, xmm5
- punpcklwd xmm6, xmm7
-
- punpckhwd xmm5, xmm7
- psrad xmm5, 16
-
- psrad xmm6, 16
- paddd xmm6, xmm5
-
- movdqa xmm2, xmm1
- punpckldq xmm1, xmm0
-
- punpckhdq xmm2, xmm0
- movdqa xmm7, xmm6
-
- paddd xmm1, xmm2
- punpckldq xmm6, xmm0
-
- punpckhdq xmm7, xmm0
- paddd xmm6, xmm7
-
- movdqa xmm2, xmm1
- movdqa xmm7, xmm6
-
- psrldq xmm1, 8
- psrldq xmm6, 8
-
- paddd xmm7, xmm6
- paddd xmm1, xmm2
-
- movd DWORD PTR [rsp], xmm7 ;Sum
- movd DWORD PTR [rsp+4], xmm1 ;SSE
-
- ; return (SSE-((Sum*Sum)>>8));
- movsxd rdx, dword ptr [rsp]
- imul rdx, rdx
- sar rdx, 8
- movsxd rax, dword ptr [rsp + 4]
- sub rax, rdx
-
- ; begin epilog
- add rsp, 16
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
;unsigned int vp8_get8x8var_sse2
int *sum,
unsigned int *sumsquared
);
-extern unsigned int vp8_get16x16pred_error_mmx
-(
- const unsigned char *src_ptr,
- int src_stride,
- const unsigned char *ref_ptr,
- int ref_stride
-);
unsigned int vp8_variance4x4_mmx(
unsigned int *SSE,
int *Sum
);
-unsigned int vp8_get16x16pred_error_sse2
-(
- const unsigned char *src_ptr,
- int src_stride,
- const unsigned char *ref_ptr,
- int ref_stride
-);
unsigned int vp8_get8x8var_sse2
(
const unsigned char *src_ptr,
extern prototype_subpixvariance(vp8_sub_pixel_mse16x16_mmx);
extern prototype_getmbss(vp8_get_mb_ss_mmx);
extern prototype_variance(vp8_mse16x16_mmx);
-extern prototype_get16x16prederror(vp8_get16x16pred_error_mmx);
extern prototype_variance2(vp8_get8x8var_mmx);
extern prototype_get16x16prederror(vp8_get4x4sse_cs_mmx);
#undef vp8_variance_mse16x16
#define vp8_variance_mse16x16 vp8_mse16x16_mmx
-#undef vp8_variance_get16x16prederror
-#define vp8_variance_get16x16prederror vp8_get16x16pred_error_mmx
-
#undef vp8_variance_get4x4sse_cs
#define vp8_variance_get4x4sse_cs vp8_get4x4sse_cs_mmx
extern prototype_subpixvariance(vp8_sub_pixel_mse16x16_wmt);
extern prototype_getmbss(vp8_get_mb_ss_sse2);
extern prototype_variance(vp8_mse16x16_wmt);
-extern prototype_get16x16prederror(vp8_get16x16pred_error_sse2);
extern prototype_variance2(vp8_get8x8var_sse2);
extern prototype_variance2(vp8_get16x16var_sse2);
#undef vp8_variance_mse16x16
#define vp8_variance_mse16x16 vp8_mse16x16_wmt
-#undef vp8_variance_get16x16prederror
-#define vp8_variance_get16x16prederror vp8_get16x16pred_error_sse2
-
#endif
#endif
cpi->rtcd.variance.mse16x16 = vp8_mse16x16_mmx;
cpi->rtcd.variance.getmbss = vp8_get_mb_ss_mmx;
- cpi->rtcd.variance.get16x16prederror = vp8_get16x16pred_error_mmx;
cpi->rtcd.variance.get4x4sse_cs = vp8_get4x4sse_cs_mmx;
cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_mmx;
cpi->rtcd.variance.mse16x16 = vp8_mse16x16_wmt;
cpi->rtcd.variance.getmbss = vp8_get_mb_ss_sse2;
- cpi->rtcd.variance.get16x16prederror = vp8_get16x16pred_error_sse2;
-
/* cpi->rtcd.variance.get4x4sse_cs not implemented for wmt */;
cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_sse2;