From 664d9921b7feb77c4a2021e08fccf7d3fd13c40d Mon Sep 17 00:00:00 2001 From: Attila Nagy Date: Fri, 14 Oct 2011 14:17:24 +0300 Subject: [PATCH] Fix: NEON copy/extend frame for small sizes The NEON versions of copyframeyonly, extendframeborders, and copy_frame_func did not work for plane stride < 128 and/or y_width < 128. Change-Id: Id6c2e6c795274da0c90134b15c0d5f62d1b17a6c --- vpx_scale/arm/neon/vp8_vpxyv12_copyframe_func_neon.asm | 11 ++++++++--- vpx_scale/arm/neon/vp8_vpxyv12_copyframeyonly_neon.asm | 8 ++++++-- vpx_scale/arm/neon/vp8_vpxyv12_extendframeborders_neon.asm | 9 ++++++--- vpx_scale/arm/neon/yv12extend_arm.c | 10 +++++----- 4 files changed, 25 insertions(+), 13 deletions(-) diff --git a/vpx_scale/arm/neon/vp8_vpxyv12_copyframe_func_neon.asm b/vpx_scale/arm/neon/vp8_vpxyv12_copyframe_func_neon.asm index e6bb486..e55d076 100644 --- a/vpx_scale/arm/neon/vp8_vpxyv12_copyframe_func_neon.asm +++ b/vpx_scale/arm/neon/vp8_vpxyv12_copyframe_func_neon.asm @@ -18,7 +18,8 @@ AREA ||.text||, CODE, READONLY, ALIGN=2 -;void vp8_yv12_copy_frame_func_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc); +;void vp8_yv12_copy_frame_func_neon(YV12_BUFFER_CONFIG *src_ybc, +; YV12_BUFFER_CONFIG *dst_ybc); |vp8_yv12_copy_frame_func_neon| PROC push {r4 - r11, lr} @@ -52,7 +53,8 @@ cp_src_to_dst_height_loop mov r9, r3 add r10, r2, r6 add r11, r3, r7 - mov r12, r5, lsr #7 + movs r12, r5, lsr #7 + ble extra_cp_needed ; y_width < 128 cp_src_to_dst_width_loop vld1.8 {q0, q1}, [r8]! @@ -83,6 +85,7 @@ cp_src_to_dst_width_loop bne cp_src_to_dst_height_loop +extra_cp_needed ands r10, r5, #0x7f ;check to see if extra copy is needed sub r11, r5, r10 ldr r2, [r0, #yv12_buffer_config_y_buffer] ;srcptr1 @@ -110,7 +113,8 @@ cp_src_to_dst_height_uv_loop mov r9, r3 add r10, r2, r6 add r11, r3, r7 - mov r12, r5, lsr #6 + movs r12, r5, lsr #6 + ble extra_uv_cp_needed cp_src_to_dst_width_uv_loop vld1.8 {q0, q1}, [r8]! 
@@ -133,6 +137,7 @@ cp_src_to_dst_width_uv_loop bne cp_src_to_dst_height_uv_loop +extra_uv_cp_needed ands r10, r5, #0x3f ;check to see if extra copy is needed sub r11, r5, r10 ldr r2, [sp] ;srcptr1 diff --git a/vpx_scale/arm/neon/vp8_vpxyv12_copyframeyonly_neon.asm b/vpx_scale/arm/neon/vp8_vpxyv12_copyframeyonly_neon.asm index febccc2..1f8b4a6 100644 --- a/vpx_scale/arm/neon/vp8_vpxyv12_copyframeyonly_neon.asm +++ b/vpx_scale/arm/neon/vp8_vpxyv12_copyframeyonly_neon.asm @@ -42,7 +42,8 @@ cp_src_to_dst_height_loop mov r9, r3 add r10, r2, r6 add r11, r3, r7 - mov r12, r5, lsr #7 + movs r12, r5, lsr #7 + ble extra_cp_needed ; y_width < 128 cp_src_to_dst_width_loop vld1.8 {q0, q1}, [r8]! @@ -73,6 +74,7 @@ cp_src_to_dst_width_loop bne cp_src_to_dst_height_loop +extra_cp_needed ands r10, r5, #0x7f ;check to see if extra copy is needed sub r11, r5, r10 ldr r2, [r0, #yv12_buffer_config_y_buffer] ;srcptr1 @@ -419,7 +421,8 @@ cp_src_to_dst_height_loop1 mov r9, r3 add r10, r2, r6 add r11, r3, r7 - mov r12, r5, lsr #7 + movs r12, r5, lsr #7 + ble extra_copy_needed ; y_width < 128 cp_src_to_dst_width_loop1 vld1.8 {q0, q1}, [r8]! 
@@ -450,6 +453,7 @@ cp_src_to_dst_width_loop1 bne cp_src_to_dst_height_loop1 +extra_copy_needed ands r10, r5, #0x7f ;check to see if extra copy is needed sub r11, r5, r10 ldr r2, [r0, #yv12_buffer_config_y_buffer] ;srcptr1 diff --git a/vpx_scale/arm/neon/vp8_vpxyv12_extendframeborders_neon.asm b/vpx_scale/arm/neon/vp8_vpxyv12_extendframeborders_neon.asm index 8444b8e..ebc4242 100644 --- a/vpx_scale/arm/neon/vp8_vpxyv12_extendframeborders_neon.asm +++ b/vpx_scale/arm/neon/vp8_vpxyv12_extendframeborders_neon.asm @@ -75,12 +75,13 @@ copy_left_right_y mul r8, r4, lr ; plane_height * plane_stride ; copy width is plane_stride - mov r12, lr, lsr #7 ; plane_stride / 128 + movs r12, lr, lsr #7 ; plane_stride / 128 sub r1, r1, #32 ; src_ptr1 = y_buffer - Border add r6, r1, r8 ; dest_ptr2 = src_ptr2 - plane_stride (src_ptr1 + (plane_height * plane_stride)) sub r2, r6, lr ; src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride sub r5, r1, lr, asl #5 ; dest_ptr1 = src_ptr1 - (Border * plane_stride) + ble extra_y_copy_needed ; plane stride < 128 copy_top_bottom_y vld1.8 {q0, q1}, [r1]! @@ -119,6 +120,7 @@ top_bottom_32 subs r12, r12, #1 bne copy_top_bottom_y +extra_y_copy_needed mov r7, lr, lsr #4 ; check to see if extra copy is needed ands r7, r7, #0x7 bne extra_top_bottom_y @@ -184,12 +186,13 @@ copy_left_right_uv ;Now copy the top and bottom source lines into each line of the respective borders mov r1, r7 mul r8, r4, lr ; plane_height * plane_stride - mov r12, lr, lsr #6 ; plane_stride / 64 + movs r12, lr, lsr #6 ; plane_stride / 64 sub r1, r1, #16 ; src_ptr1 = u_buffer - Border add r6, r1, r8 ; dest_ptr2 = src_ptr2 + plane_stride (src_ptr1 + (plane_height * plane_stride) sub r2, r6, lr ; src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride sub r5, r1, lr, asl #4 ; dest_ptr1 = src_ptr1 - (Border * plane_stride) + ble extra_uv_copy_needed ; plane_stride < 64 copy_top_bottom_uv vld1.8 {q0, q1}, [r1]! 
@@ -219,7 +222,7 @@ top_bottom_16 subs r12, r12, #1 bne copy_top_bottom_uv - +extra_uv_copy_needed mov r7, lr, lsr #3 ; check to see if extra copy is needed ands r7, r7, #0x7 bne extra_top_bottom_uv diff --git a/vpx_scale/arm/neon/yv12extend_arm.c b/vpx_scale/arm/neon/yv12extend_arm.c index d7a8289..7529fc6 100644 --- a/vpx_scale/arm/neon/yv12extend_arm.c +++ b/vpx_scale/arm/neon/yv12extend_arm.c @@ -13,13 +13,13 @@ #include "vpx_mem/vpx_mem.h" #include "vpx_scale/vpxscale.h" -void vp8_yv12_copy_frame_func_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc); +extern void vp8_yv12_copy_frame_func_neon(YV12_BUFFER_CONFIG *src_ybc, + YV12_BUFFER_CONFIG *dst_ybc); -void -vp8_yv12_copy_frame_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc) +void vp8_yv12_copy_frame_neon(YV12_BUFFER_CONFIG *src_ybc, + YV12_BUFFER_CONFIG *dst_ybc) { vp8_yv12_copy_frame_func_neon(src_ybc, dst_ybc); - //printf("Border:%d; plane_stride:%d; plane_height:%d; plane_width:%d\n",dst_ybc->border,dst_ybc->y_stride,dst_ybc->y_height,dst_ybc->y_width); - vp8_yv12_extend_frame_borders_ptr(dst_ybc); + vp8_yv12_extend_frame_borders_neon(dst_ybc); } -- 2.7.4