Update armv6 loopfilter to new interface
authorAttila Nagy <attilanagy@google.com>
Mon, 11 Jul 2011 09:39:18 +0000 (12:39 +0300)
committerAttila Nagy <attilanagy@google.com>
Tue, 12 Jul 2011 09:14:51 +0000 (12:14 +0300)
Change-Id: I5fe581d797571a7a9432fbd17fc557591d0c1afa

vp8/common/arm/arm_systemdependent.c
vp8/common/arm/armv6/loopfilter_v6.asm
vp8/common/arm/armv6/simpleloopfilter_v6.asm
vp8/common/arm/loopfilter_arm.c
vp8/common/arm/loopfilter_arm.h

index bd5c075..8896cf0 100644 (file)
@@ -51,9 +51,11 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx)
         rtcd->loopfilter.normal_b_v  = vp8_loop_filter_bv_armv6;
         rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_armv6;
         rtcd->loopfilter.normal_b_h  = vp8_loop_filter_bh_armv6;
-        rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_armv6;
+        rtcd->loopfilter.simple_mb_v =
+                vp8_loop_filter_simple_vertical_edge_armv6;
         rtcd->loopfilter.simple_b_v  = vp8_loop_filter_bvs_armv6;
-        rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_armv6;
+        rtcd->loopfilter.simple_mb_h =
+                vp8_loop_filter_simple_horizontal_edge_armv6;
         rtcd->loopfilter.simple_b_h  = vp8_loop_filter_bhs_armv6;
 
         rtcd->recon.copy16x16   = vp8_copy_mem16x16_v6;
index c7441b0..1cbbbcd 100644 (file)
@@ -53,14 +53,11 @@ count       RN  r5
 
 ;r0     unsigned char *src_ptr,
 ;r1     int src_pixel_step,
-;r2     const char *flimit,
+;r2     const char *blimit,
 ;r3     const char *limit,
 ;stack  const char *thresh,
 ;stack  int  count
 
-;Note: All 16 elements in flimit are equal. So, in the code, only one load is needed
-;for flimit. Same way applies to limit and thresh.
-
 ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
 |vp8_loop_filter_horizontal_edge_armv6| PROC
 ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
@@ -72,14 +69,18 @@ count       RN  r5
     sub         sp, sp, #16                 ; create temp buffer
 
     ldr         r9, [src], pstep            ; p3
-    ldr         r4, [r2], #4                ; flimit
+    ldrb        r4, [r2]                    ; blimit
     ldr         r10, [src], pstep           ; p2
-    ldr         r2, [r3], #4                ; limit
+    ldrb        r2, [r3]                    ; limit
     ldr         r11, [src], pstep           ; p1
-    uadd8       r4, r4, r4                  ; flimit * 2
-    ldr         r3, [r6], #4                ; thresh
+    orr         r4, r4, r4, lsl #8
+    ldrb        r3, [r6]                    ; thresh
+    orr         r2, r2, r2, lsl #8
     mov         count, count, lsl #1        ; 4-in-parallel
-    uadd8       r4, r4, r2                  ; flimit * 2 + limit
+    orr         r4, r4, r4, lsl #16
+    orr         r3, r3, r3, lsl #8
+    orr         r2, r2, r2, lsl #16
+    orr         r3, r3, r3, lsl #16
 
 |Hnext8|
     ; vp8_filter_mask() function
@@ -275,14 +276,18 @@ count       RN  r5
     sub         sp, sp, #16                 ; create temp buffer
 
     ldr         r9, [src], pstep            ; p3
-    ldr         r4, [r2], #4                ; flimit
+    ldrb        r4, [r2]                    ; blimit
     ldr         r10, [src], pstep           ; p2
-    ldr         r2, [r3], #4                ; limit
+    ldrb        r2, [r3]                    ; limit
     ldr         r11, [src], pstep           ; p1
-    uadd8       r4, r4, r4                  ; flimit * 2
-    ldr         r3, [r6], #4                ; thresh
+    orr         r4, r4, r4, lsl #8
+    ldrb        r3, [r6]                    ; thresh
+    orr         r2, r2, r2, lsl #8
     mov         count, count, lsl #1        ; 4-in-parallel
-    uadd8       r4, r4, r2                  ; flimit * 2 + limit
+    orr         r4, r4, r4, lsl #16
+    orr         r3, r3, r3, lsl #8
+    orr         r2, r2, r2, lsl #16
+    orr         r3, r3, r3, lsl #16
 
 |MBHnext8|
 
@@ -584,15 +589,19 @@ count       RN  r5
     sub         sp, sp, #16                 ; create temp buffer
 
     ldr         r6, [src], pstep            ; load source data
-    ldr         r4, [r2], #4                ; flimit
+    ldrb        r4, [r2]                    ; blimit
     ldr         r7, [src], pstep
-    ldr         r2, [r3], #4                ; limit
+    ldrb        r2, [r3]                    ; limit
     ldr         r8, [src], pstep
-    uadd8       r4, r4, r4                  ; flimit * 2
-    ldr         r3, [r12], #4               ; thresh
+    orr         r4, r4, r4, lsl #8
+    ldrb        r3, [r12]                   ; thresh
+    orr         r2, r2, r2, lsl #8
     ldr         lr, [src], pstep
     mov         count, count, lsl #1        ; 4-in-parallel
-    uadd8       r4, r4, r2                  ; flimit * 2 + limit
+    orr         r4, r4, r4, lsl #16
+    orr         r3, r3, r3, lsl #8
+    orr         r2, r2, r2, lsl #16
+    orr         r3, r3, r3, lsl #16
 
 |Vnext8|
 
@@ -855,18 +864,22 @@ count       RN  r5
     sub         sp, sp, #16                 ; create temp buffer
 
     ldr         r6, [src], pstep            ; load source data
-    ldr         r4, [r2], #4                ; flimit
+    ldrb        r4, [r2]                    ; blimit
     pld         [src, #23]
     ldr         r7, [src], pstep
-    ldr         r2, [r3], #4                ; limit
+    ldrb        r2, [r3]                    ; limit
     pld         [src, #23]
     ldr         r8, [src], pstep
-    uadd8       r4, r4, r4                  ; flimit * 2
-    ldr         r3, [r12], #4               ; thresh
+    orr         r4, r4, r4, lsl #8
+    ldrb        r3, [r12]                   ; thresh
+    orr         r2, r2, r2, lsl #8
     pld         [src, #23]
     ldr         lr, [src], pstep
     mov         count, count, lsl #1        ; 4-in-parallel
-    uadd8       r4, r4, r2                  ; flimit * 2 + limit
+    orr         r4, r4, r4, lsl #16
+    orr         r3, r3, r3, lsl #8
+    orr         r2, r2, r2, lsl #16
+    orr         r3, r3, r3, lsl #16
 
 |MBVnext8|
     ; vp8_filter_mask() function
@@ -906,6 +919,7 @@ count       RN  r5
     str         lr, [sp, #8]
     ldr         lr, [src], pstep
 
+
     TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
 
     ldr         lr, [sp, #8]                ; load back (f)limit accumulator
@@ -954,6 +968,7 @@ count       RN  r5
     beq         mbvskip_filter               ; skip filtering
 
 
+
     ;vp8_hevmask() function
     ;calculate high edge variance
 
@@ -1121,6 +1136,7 @@ count       RN  r5
     smlabb      r8, r6, lr, r7
     smlatb      r6, r6, lr, r7
     smlabb      r9, r10, lr, r7
+
     smlatb      r10, r10, lr, r7
     ssat        r8, #8, r8, asr #7
     ssat        r6, #8, r6, asr #7
index 40a71f4..5e00cf0 100644 (file)
     MEND
 
 
+
 src         RN  r0
 pstep       RN  r1
 
 ;r0     unsigned char *src_ptr,
 ;r1     int src_pixel_step,
-;r2     const char *flimit,
-;r3     const char *limit,
-;stack  const char *thresh,
-;stack  int  count
-
-; All 16 elements in flimit are equal. So, in the code, only one load is needed
-; for flimit. Same applies to limit. thresh is not used in simple looopfilter
+;r2     const char *blimit
 
 ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
 |vp8_loop_filter_simple_horizontal_edge_armv6| PROC
 ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
     stmdb       sp!, {r4 - r11, lr}
 
-    ldr         r12, [r3]                   ; limit
+    ldrb        r12, [r2]                   ; blimit
     ldr         r3, [src, -pstep, lsl #1]   ; p1
     ldr         r4, [src, -pstep]           ; p0
     ldr         r5, [src]                   ; q0
     ldr         r6, [src, pstep]            ; q1
-    ldr         r7, [r2]                    ; flimit
+    orr         r12, r12, r12, lsl #8       ; blimit
     ldr         r2, c0x80808080
-    ldr         r9, [sp, #40]               ; count for 8-in-parallel
-    uadd8       r7, r7, r7                  ; flimit * 2
-    mov         r9, r9, lsl #1              ; double the count. we're doing 4 at a time
-    uadd8       r12, r7, r12                ; flimit * 2 + limit
+    orr         r12, r12, r12, lsl #16      ; blimit
+    mov         r9, #4                      ; double the count. we're doing 4 at a time
     mov         lr, #0                      ; need 0 in a couple places
 
 |simple_hnext8|
@@ -148,34 +141,32 @@ pstep       RN  r1
 ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
     stmdb       sp!, {r4 - r11, lr}
 
-    ldr         r12, [r2]                   ; r12: flimit
+    ldrb        r12, [r2]                   ; r12: blimit
     ldr         r2, c0x80808080
-    ldr         r7, [r3]                    ; limit
+    orr         r12, r12, r12, lsl #8
 
     ; load soure data to r7, r8, r9, r10
     ldrh        r3, [src, #-2]
     pld         [src, #23]                  ; preload for next block
     ldrh        r4, [src], pstep
-    uadd8       r12, r12, r12               ; flimit * 2
+    orr         r12, r12, r12, lsl #16
 
     ldrh        r5, [src, #-2]
     pld         [src, #23]
     ldrh        r6, [src], pstep
-    uadd8       r12, r12, r7                ; flimit * 2 + limit
 
     pkhbt       r7, r3, r4, lsl #16
 
     ldrh        r3, [src, #-2]
     pld         [src, #23]
     ldrh        r4, [src], pstep
-    ldr         r11, [sp, #40]              ; count (r11) for 8-in-parallel
 
     pkhbt       r8, r5, r6, lsl #16
 
     ldrh        r5, [src, #-2]
     pld         [src, #23]
     ldrh        r6, [src], pstep
-    mov         r11, r11, lsl #1            ; 4-in-parallel
+    mov         r11, #4                     ; double the count. we're doing 4 at a time
 
 |simple_vnext8|
     ; vp8_simple_filter_mask() function
index 1ec2b74..c841d45 100644 (file)
@@ -18,8 +18,6 @@ extern prototype_loopfilter(vp8_loop_filter_horizontal_edge_armv6);
 extern prototype_loopfilter(vp8_loop_filter_vertical_edge_armv6);
 extern prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_armv6);
 extern prototype_loopfilter(vp8_mbloop_filter_vertical_edge_armv6);
-extern prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_armv6);
-extern prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_armv6);
 #endif
 
 #if HAVE_ARMV7
@@ -55,15 +53,6 @@ void vp8_loop_filter_mbh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsig
         vp8_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
 }
 
-void vp8_loop_filter_mbhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                                int y_stride, int uv_stride, loop_filter_info *lfi)
-{
-    (void) u_ptr;
-    (void) v_ptr;
-    (void) uv_stride;
-    vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
-}
-
 /* Vertical MB Filtering */
 void vp8_loop_filter_mbv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
                                int y_stride, int uv_stride, loop_filter_info *lfi)
@@ -77,15 +66,6 @@ void vp8_loop_filter_mbv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsig
         vp8_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
 }
 
-void vp8_loop_filter_mbvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                                int y_stride, int uv_stride, loop_filter_info *lfi)
-{
-    (void) u_ptr;
-    (void) v_ptr;
-    (void) uv_stride;
-    vp8_loop_filter_simple_vertical_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
-}
-
 /* Horizontal B Filtering */
 void vp8_loop_filter_bh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
                               int y_stride, int uv_stride, loop_filter_info *lfi)
@@ -101,15 +81,12 @@ void vp8_loop_filter_bh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsign
         vp8_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
 }
 
-void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                               int y_stride, int uv_stride, loop_filter_info *lfi)
+void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, int y_stride,
+                               const unsigned char *blimit)
 {
-    (void) u_ptr;
-    (void) v_ptr;
-    (void) uv_stride;
-    vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
-    vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
-    vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, blimit);
+    vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, blimit);
+    vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, blimit);
 }
 
 /* Vertical B Filtering */
@@ -127,15 +104,12 @@ void vp8_loop_filter_bv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsign
         vp8_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
 }
 
-void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
-                               int y_stride, int uv_stride, loop_filter_info *lfi)
+void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, int y_stride,
+                               const unsigned char *blimit)
 {
-    (void) u_ptr;
-    (void) v_ptr;
-    (void) uv_stride;
-    vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
-    vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
-    vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+    vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, blimit);
+    vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, blimit);
+    vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, blimit);
 }
 #endif
 
index 27159b5..390a547 100644 (file)
@@ -19,10 +19,10 @@ extern prototype_loopfilter_block(vp8_loop_filter_mbv_armv6);
 extern prototype_loopfilter_block(vp8_loop_filter_bv_armv6);
 extern prototype_loopfilter_block(vp8_loop_filter_mbh_armv6);
 extern prototype_loopfilter_block(vp8_loop_filter_bh_armv6);
-extern prototype_loopfilter_block(vp8_loop_filter_mbvs_armv6);
-extern prototype_loopfilter_block(vp8_loop_filter_bvs_armv6);
-extern prototype_loopfilter_block(vp8_loop_filter_mbhs_armv6);
-extern prototype_loopfilter_block(vp8_loop_filter_bhs_armv6);
+extern prototype_simple_loopfilter(vp8_loop_filter_bvs_armv6);
+extern prototype_simple_loopfilter(vp8_loop_filter_bhs_armv6);
+extern prototype_simple_loopfilter(vp8_loop_filter_simple_horizontal_edge_armv6);
+extern prototype_simple_loopfilter(vp8_loop_filter_simple_vertical_edge_armv6);
 
 #if !CONFIG_RUNTIME_CPU_DETECT
 #undef  vp8_lf_normal_mb_v
@@ -38,13 +38,13 @@ extern prototype_loopfilter_block(vp8_loop_filter_bhs_armv6);
 #define vp8_lf_normal_b_h vp8_loop_filter_bh_armv6
 
 #undef  vp8_lf_simple_mb_v
-#define vp8_lf_simple_mb_v vp8_loop_filter_mbvs_armv6
+#define vp8_lf_simple_mb_v vp8_loop_filter_simple_vertical_edge_armv6
 
 #undef  vp8_lf_simple_b_v
 #define vp8_lf_simple_b_v vp8_loop_filter_bvs_armv6
 
 #undef  vp8_lf_simple_mb_h
-#define vp8_lf_simple_mb_h vp8_loop_filter_mbhs_armv6
+#define vp8_lf_simple_mb_h vp8_loop_filter_simple_horizontal_edge_armv6
 
 #undef  vp8_lf_simple_b_h
 #define vp8_lf_simple_b_h vp8_loop_filter_bhs_armv6