Add vp8_variance8x8_armv6 and vp8_sub_pixel_variance8x8_armv6 functions
author Attila Nagy <attilanagy@google.com>
Wed, 9 Mar 2011 12:26:24 +0000 (14:26 +0200)
committer Attila Nagy <attilanagy@google.com>
Tue, 15 Mar 2011 13:50:44 +0000 (15:50 +0200)
Change-Id: I08edaffc62514907fa5e90e1689269e467c857f5

vp8/encoder/arm/arm_csystemdependent.c
vp8/encoder/arm/armv6/vp8_variance8x8_armv6.asm [new file with mode: 0644]
vp8/encoder/arm/variance_arm.c
vp8/encoder/arm/variance_arm.h
vp8/vp8cx_arm.mk

diff --git a/vp8/encoder/arm/arm_csystemdependent.c b/vp8/encoder/arm/arm_csystemdependent.c
index 5ba14f3..a661a89 100644
@@ -35,15 +35,15 @@ void vp8_arch_arm_encoder_init(VP8_COMP *cpi)
         cpi->rtcd.variance.sad8x8                = vp8_sad8x8_c;
         cpi->rtcd.variance.sad4x4                = vp8_sad4x4_c;*/
 
-        /*cpi->rtcd.variance.var4x4                = vp8_variance4x4_c;
-        cpi->rtcd.variance.var8x8                = vp8_variance8x8_c;
-        cpi->rtcd.variance.var8x16               = vp8_variance8x16_c;
+        /*cpi->rtcd.variance.var4x4                = vp8_variance4x4_c;*/
+        cpi->rtcd.variance.var8x8                = vp8_variance8x8_armv6;
+        /*cpi->rtcd.variance.var8x16               = vp8_variance8x16_c;
         cpi->rtcd.variance.var16x8               = vp8_variance16x8_c;*/
         cpi->rtcd.variance.var16x16              = vp8_variance16x16_armv6;
 
-        /*cpi->rtcd.variance.subpixvar4x4          = vp8_sub_pixel_variance4x4_c;
-        cpi->rtcd.variance.subpixvar8x8          = vp8_sub_pixel_variance8x8_c;
-        cpi->rtcd.variance.subpixvar8x16         = vp8_sub_pixel_variance8x16_c;
+        /*cpi->rtcd.variance.subpixvar4x4          = vp8_sub_pixel_variance4x4_c;*/
+        cpi->rtcd.variance.subpixvar8x8          = vp8_sub_pixel_variance8x8_armv6;
+        /*cpi->rtcd.variance.subpixvar8x16         = vp8_sub_pixel_variance8x16_c;
         cpi->rtcd.variance.subpixvar16x8         = vp8_sub_pixel_variance16x8_c;*/
         cpi->rtcd.variance.subpixvar16x16        = vp8_sub_pixel_variance16x16_armv6;
         cpi->rtcd.variance.halfpixvar16x16_h     = vp8_variance_halfpixvar16x16_h_armv6;
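(For context: with runtime CPU detection enabled, the encoder reaches these kernels through the rtcd table rather than by name, so repointing the var8x8 and subpixvar8x8 slots above is all that is needed to activate the new ARMv6 code. Below is a minimal sketch of a call site; the VARIANCE_INVOKE macro and field names are recalled from vp8/encoder/variance.h in the surrounding tree, not part of this change, so treat the exact spelling as an assumption.)

    /* Sketch of an RTCD call site: VARIANCE_INVOKE(ctx, fn) expands to
       (ctx)->fn under CONFIG_RUNTIME_CPU_DETECT, so after the assignment
       above this call lands in vp8_variance8x8_armv6 on ARMv6 targets. */
    unsigned int sse;
    unsigned int var = VARIANCE_INVOKE(&cpi->rtcd.variance, var8x8)(
                           src_ptr, src_stride, ref_ptr, ref_stride, &sse);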
diff --git a/vp8/encoder/arm/armv6/vp8_variance8x8_armv6.asm b/vp8/encoder/arm/armv6/vp8_variance8x8_armv6.asm
new file mode 100644
index 0000000..7daecb9
--- /dev/null
@@ -0,0 +1,95 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_variance8x8_armv6|
+
+    ARM
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+|vp8_variance8x8_armv6| PROC
+
+    push    {r4-r10, lr}
+    mov     r12, #8             ; set loop counter to 8 (=block height)
+    mov     r4, #0              ; initialize sum = 0
+    mov     r5, #0              ; initialize sse = 0
+
+loop
+    ; 1st 4 pixels
+    ldr     r6, [r0, #0x0]      ; load 4 src pixels
+    ldr     r7, [r2, #0x0]      ; load 4 ref pixels
+
+    mov     lr, #0              ; constant zero
+
+    usub8   r8, r6, r7          ; calculate difference
+    sel     r10, r8, lr         ; select bytes with positive difference
+    usub8   r9, r7, r6          ; calculate difference with reversed operands
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r6, r10, lr         ; calculate sum of positive differences
+    usad8   r7, r8, lr          ; calculate sum of negative differences
+    orr     r8, r8, r10         ; differences of all 4 pixels
+    ; calculate total sum
+    add     r4, r4, r6          ; add positive differences to sum
+    sub     r4, r4, r7          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r7, r8              ; bytes (two pixels) to halfwords
+    uxtb16  r10, r8, ror #8     ; another two pixels to halfwords
+    smlad   r5, r7, r7, r5      ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r6, [r0, #0x4]      ; load 4 src pixels
+    ldr     r7, [r2, #0x4]      ; load 4 ref pixels
+    smlad   r5, r10, r10, r5    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r8, r6, r7          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r10, r8, lr         ; select bytes with positive difference
+    usub8   r9, r7, r6          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set ref_ptr to next row
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r6, r10, lr         ; calculate sum of positive differences
+    usad8   r7, r8, lr          ; calculate sum of negative differences
+    orr     r8, r8, r10         ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r4, r4, r6          ; add positive differences to sum
+    sub     r4, r4, r7          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r7, r8              ; bytes (two pixels) to halfwords
+    uxtb16  r10, r8, ror #8     ; another two pixels to halfwords
+    smlad   r5, r7, r7, r5      ; dual signed multiply, add and accumulate (1)
+    subs    r12, r12, #1        ; next row
+    smlad   r5, r10, r10, r5    ; dual signed multiply, add and accumulate (2)
+
+    bne     loop
+
+    ; return variance
+    ldr     r8, [sp, #32]       ; get address of sse
+    mul     r1, r4, r4          ; sum * sum
+    str     r5, [r8]            ; store sse
+    sub     r0, r5, r1, ASR #6  ; return (sse - ((sum * sum) >> 6))
+
+    pop     {r4-r10, pc}
+
+    ENDP
+
+    END
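(The kernel computes the standard one-pass variance decomposition: it accumulates the signed sum of pixel differences in r4 and the sum of squared differences in r5, then returns sse - sum*sum/64, where 64 is the pixel count of an 8x8 block, hence the ASR #6. A C model of the same computation, patterned on the generic vp8_variance8x8_c path; illustrative only, not part of this change:)

    /* C model of vp8_variance8x8_armv6: one pass over an 8x8 block,
       accumulating the sum and the sum of squares of src - ref. */
    static unsigned int variance8x8_model(const unsigned char *src, int src_stride,
                                          const unsigned char *ref, int ref_stride,
                                          unsigned int *sse)
    {
        int sum = 0;
        unsigned int sq = 0;
        int i, j;

        for (i = 0; i < 8; i++, src += src_stride, ref += ref_stride)
            for (j = 0; j < 8; j++)
            {
                int d = src[j] - ref[j];
                sum += d;
                sq  += (unsigned int)(d * d);
            }

        *sse = sq;
        return sq - ((unsigned int)(sum * sum) >> 6);  /* >> 6 == / 64 pixels */
    }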
diff --git a/vp8/encoder/arm/variance_arm.c b/vp8/encoder/arm/variance_arm.c
index 64d76bc..ed1fb16 100644
 
 #if HAVE_ARMV6
 
+unsigned int vp8_sub_pixel_variance8x8_armv6
+(
+    const unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    const unsigned char *dst_ptr,
+    int dst_pixels_per_line,
+    unsigned int *sse
+)
+{
+    unsigned short first_pass[10*8];
+    unsigned char  second_pass[8*8];
+    const short *HFilter, *VFilter;
+
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];
+
+    vp8_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass,
+                                            src_pixels_per_line,
+                                            9, 8, HFilter);
+    vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
+                                             8, 8, 8, VFilter);
+
+    return vp8_variance8x8_armv6(second_pass, 8, dst_ptr,
+                                 dst_pixels_per_line, sse);
+}
+
 unsigned int vp8_sub_pixel_variance16x16_armv6
 (
     const unsigned char  *src_ptr,
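(The new vp8_sub_pixel_variance8x8_armv6 wrapper follows the usual two-pass bilinear scheme: the first pass filters horizontally into 16-bit intermediates over 9 rows, one extra row so the vertical pass has its second tap, the second pass filters vertically back down to an 8x8 byte block, and the result is fed to the plain vp8_variance8x8_armv6 against the reference. The _armv6 filter helpers themselves are assembly; the C model below shows the first pass under the VP8 bilinear convention of 7-bit filter taps summing to 128, and is an illustrative sketch only:)

    /* Model of the horizontal first pass: each output is a 2-tap bilinear
       blend of a pixel and its right neighbour (so width + 1 source columns
       are read), rounded, and kept at 16-bit precision for the vertical pass. */
    static void bil_first_pass_model(const unsigned char *src, unsigned short *dst,
                                     int src_stride, int height, int width,
                                     const short *filter)
    {
        int i, j;

        for (i = 0; i < height; i++, src += src_stride, dst += width)
            for (j = 0; j < width; j++)
                dst[j] = (unsigned short)
                         ((src[j] * filter[0] + src[j + 1] * filter[1] + 64) >> 7);
    }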
diff --git a/vp8/encoder/arm/variance_arm.h b/vp8/encoder/arm/variance_arm.h
index 7ad7c76..86de274 100644
@@ -16,7 +16,9 @@
 
 extern prototype_sad(vp8_sad16x16_armv6);
 extern prototype_variance(vp8_variance16x16_armv6);
+extern prototype_variance(vp8_variance8x8_armv6);
 extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_armv6);
+extern prototype_subpixvariance(vp8_sub_pixel_variance8x8_armv6);
 extern prototype_variance(vp8_variance_halfpixvar16x16_h_armv6);
 extern prototype_variance(vp8_variance_halfpixvar16x16_v_armv6);
 extern prototype_variance(vp8_variance_halfpixvar16x16_hv_armv6);
@@ -30,12 +32,18 @@ extern prototype_variance(vp8_mse16x16_armv6);
 #undef  vp8_variance_subpixvar16x16
 #define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_armv6
 
+#undef  vp8_variance_subpixvar8x8
+#define vp8_variance_subpixvar8x8 vp8_sub_pixel_variance8x8_armv6
+
 #undef  vp8_variance_var16x16
 #define vp8_variance_var16x16 vp8_variance16x16_armv6
 
 #undef  vp8_variance_mse16x16
 #define vp8_variance_mse16x16 vp8_mse16x16_armv6
 
+#undef  vp8_variance_var8x8
+#define vp8_variance_var8x8 vp8_variance8x8_armv6
+
 #undef  vp8_variance_halfpixvar16x16_h
 #define vp8_variance_halfpixvar16x16_h vp8_variance_halfpixvar16x16_h_armv6
 
diff --git a/vp8/vp8cx_arm.mk b/vp8/vp8cx_arm.mk
index b07ee8f..a11e1ca 100644
@@ -38,6 +38,7 @@ VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_fast_quantize_b_armv6$(ASM)
 VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_sad16x16_armv6$(ASM)
 VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_variance16x16_armv6$(ASM)
 VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_mse16x16_armv6$(ASM)
+VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_variance8x8_armv6$(ASM)
 VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/walsh_v6$(ASM)
 
 #File list for neon