Fix: NEON copy/extend frame for small sizes
author Attila Nagy <attilanagy@google.com>
Fri, 14 Oct 2011 11:17:24 +0000 (14:17 +0300)
committer Attila Nagy <attilanagy@google.com>
Mon, 17 Oct 2011 11:42:37 +0000 (14:42 +0300)
The NEON versions of copyframeyonly, extendframeborders and copy_frame_func
were not working for plane stride < 128 and/or y_width < 128.

Change-Id: Id6c2e6c795274da0c90134b15c0d5f62d1b17a6c

vpx_scale/arm/neon/vp8_vpxyv12_copyframe_func_neon.asm
vpx_scale/arm/neon/vp8_vpxyv12_copyframeyonly_neon.asm
vpx_scale/arm/neon/vp8_vpxyv12_extendframeborders_neon.asm
vpx_scale/arm/neon/yv12extend_arm.c

index e6bb486..e55d076 100644 (file)
@@ -18,7 +18,8 @@
 
     AREA ||.text||, CODE, READONLY, ALIGN=2
 
-;void vp8_yv12_copy_frame_func_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
+;void vp8_yv12_copy_frame_func_neon(YV12_BUFFER_CONFIG *src_ybc,
+;                                   YV12_BUFFER_CONFIG *dst_ybc);
 
 |vp8_yv12_copy_frame_func_neon| PROC
     push            {r4 - r11, lr}
@@ -52,7 +53,8 @@ cp_src_to_dst_height_loop
     mov             r9, r3
     add             r10, r2, r6
     add             r11, r3, r7
-    mov             r12, r5, lsr #7
+    movs            r12, r5, lsr #7
+    ble             extra_cp_needed   ; y_width < 128
 
 cp_src_to_dst_width_loop
     vld1.8          {q0, q1}, [r8]!
@@ -83,6 +85,7 @@ cp_src_to_dst_width_loop
 
     bne             cp_src_to_dst_height_loop
 
+extra_cp_needed
     ands            r10, r5, #0x7f                  ;check to see if extra copy is needed
     sub             r11, r5, r10
     ldr             r2, [r0, #yv12_buffer_config_y_buffer]       ;srcptr1
@@ -110,7 +113,8 @@ cp_src_to_dst_height_uv_loop
     mov             r9, r3
     add             r10, r2, r6
     add             r11, r3, r7
-    mov             r12, r5, lsr #6
+    movs            r12, r5, lsr #6
+    ble             extra_uv_cp_needed
 
 cp_src_to_dst_width_uv_loop
     vld1.8          {q0, q1}, [r8]!
@@ -133,6 +137,7 @@ cp_src_to_dst_width_uv_loop
 
     bne             cp_src_to_dst_height_uv_loop
 
+extra_uv_cp_needed
     ands            r10, r5, #0x3f                  ;check to see if extra copy is needed
     sub             r11, r5, r10
     ldr             r2, [sp]        ;srcptr1
index febccc2..1f8b4a6 100644 (file)
@@ -42,7 +42,8 @@ cp_src_to_dst_height_loop
     mov             r9, r3
     add             r10, r2, r6
     add             r11, r3, r7
-    mov             r12, r5, lsr #7
+    movs            r12, r5, lsr #7
+    ble             extra_cp_needed   ; y_width < 128
 
 cp_src_to_dst_width_loop
     vld1.8          {q0, q1}, [r8]!
@@ -73,6 +74,7 @@ cp_src_to_dst_width_loop
 
     bne             cp_src_to_dst_height_loop
 
+extra_cp_needed
     ands            r10, r5, #0x7f                  ;check to see if extra copy is needed
     sub             r11, r5, r10
     ldr             r2, [r0, #yv12_buffer_config_y_buffer]       ;srcptr1
@@ -419,7 +421,8 @@ cp_src_to_dst_height_loop1
     mov             r9, r3
     add             r10, r2, r6
     add             r11, r3, r7
-    mov             r12, r5, lsr #7
+    movs            r12, r5, lsr #7
+    ble             extra_copy_needed   ; y_width < 128
 
 cp_src_to_dst_width_loop1
     vld1.8          {q0, q1}, [r8]!
@@ -450,6 +453,7 @@ cp_src_to_dst_width_loop1
 
     bne             cp_src_to_dst_height_loop1
 
+extra_copy_needed
     ands            r10, r5, #0x7f                  ;check to see if extra copy is needed
     sub             r11, r5, r10
     ldr             r2, [r0, #yv12_buffer_config_y_buffer]       ;srcptr1
index 8444b8e..ebc4242 100644 (file)
@@ -75,12 +75,13 @@ copy_left_right_y
     mul             r8, r4, lr              ; plane_height * plane_stride
 
     ; copy width is plane_stride
-    mov             r12, lr, lsr #7         ; plane_stride / 128
+    movs            r12, lr, lsr #7         ; plane_stride / 128
 
     sub             r1, r1, #32             ; src_ptr1 = y_buffer - Border
     add             r6, r1, r8              ; dest_ptr2 = src_ptr2 - plane_stride (src_ptr1 + (plane_height * plane_stride))
     sub             r2, r6, lr              ; src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride
     sub             r5, r1, lr, asl #5      ; dest_ptr1 = src_ptr1 - (Border * plane_stride)
+    ble             extra_y_copy_needed     ; plane stride < 128
 
 copy_top_bottom_y
     vld1.8          {q0, q1}, [r1]!
@@ -119,6 +120,7 @@ top_bottom_32
     subs            r12, r12, #1
     bne             copy_top_bottom_y
 
+extra_y_copy_needed
     mov             r7, lr, lsr #4          ; check to see if extra copy is needed
     ands            r7, r7, #0x7
     bne             extra_top_bottom_y
@@ -184,12 +186,13 @@ copy_left_right_uv
 ;Now copy the top and bottom source lines into each line of the respective borders
     mov             r1, r7
     mul             r8, r4, lr              ; plane_height * plane_stride
-    mov             r12, lr, lsr #6         ; plane_stride / 64
+    movs            r12, lr, lsr #6         ; plane_stride / 64
 
     sub             r1, r1, #16             ; src_ptr1 = u_buffer - Border
     add             r6, r1, r8              ; dest_ptr2 = src_ptr2 + plane_stride (src_ptr1 + (plane_height * plane_stride)
     sub             r2, r6, lr              ; src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride
     sub             r5, r1, lr, asl #4      ; dest_ptr1 = src_ptr1 - (Border * plane_stride)
+    ble             extra_uv_copy_needed    ; plane_stride < 64
 
 copy_top_bottom_uv
     vld1.8          {q0, q1}, [r1]!
@@ -219,7 +222,7 @@ top_bottom_16
 
     subs            r12, r12, #1
     bne             copy_top_bottom_uv
-
+extra_uv_copy_needed
     mov             r7, lr, lsr #3          ; check to see if extra copy is needed
     ands            r7, r7, #0x7
     bne             extra_top_bottom_uv
index d7a8289..7529fc6 100644 (file)
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_scale/vpxscale.h"
 
-void vp8_yv12_copy_frame_func_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
+extern void vp8_yv12_copy_frame_func_neon(YV12_BUFFER_CONFIG *src_ybc,
+                                          YV12_BUFFER_CONFIG *dst_ybc);
 
-void
-vp8_yv12_copy_frame_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc)
+void vp8_yv12_copy_frame_neon(YV12_BUFFER_CONFIG *src_ybc,
+                              YV12_BUFFER_CONFIG *dst_ybc)
 {
     vp8_yv12_copy_frame_func_neon(src_ybc, dst_ybc);
-    //printf("Border:%d; plane_stride:%d; plane_height:%d; plane_width:%d\n",dst_ybc->border,dst_ybc->y_stride,dst_ybc->y_height,dst_ybc->y_width);
 
-    vp8_yv12_extend_frame_borders_ptr(dst_ybc);
+    vp8_yv12_extend_frame_borders_neon(dst_ybc);
 }