From 5a2fd63a2af84b65f17c2c50fd9a100a653f7e76 Mon Sep 17 00:00:00 2001 From: Tero Rintaluoma Date: Mon, 7 Nov 2011 13:40:01 +0200 Subject: [PATCH] ARMv6 optimized Intra4x4 prediction Added ARM optimized intra 4x4 prediction - 2x faster on Profiler compared to C-code compiled with -O3 - Function interface changed a little to improve BLOCKD structure access Change-Id: I9bc2b723155943fe0cf03dd9ca5f1760f7a81f54 --- vp8/common/arm/arm_systemdependent.c | 1 + vp8/common/arm/armv6/intra4x4_predict_v6.asm | 606 +++++++++++++++++++++++++++ vp8/common/arm/recon_arm.h | 4 + vp8/common/asm_com_offsets.c | 15 + vp8/common/generic/systemdependent.c | 2 +- vp8/common/recon.h | 5 +- vp8/common/reconintra4x4.c | 229 +++++----- vp8/decoder/decodframe.c | 3 +- vp8/encoder/encodeintra.c | 3 +- vp8/encoder/pickinter.c | 3 +- vp8/encoder/rdopt.c | 3 +- vp8/vp8_common.mk | 1 + 12 files changed, 751 insertions(+), 124 deletions(-) create mode 100644 vp8/common/arm/armv6/intra4x4_predict_v6.asm diff --git a/vp8/common/arm/arm_systemdependent.c b/vp8/common/arm/arm_systemdependent.c index 2928851..b5f194d 100644 --- a/vp8/common/arm/arm_systemdependent.c +++ b/vp8/common/arm/arm_systemdependent.c @@ -63,6 +63,7 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx) rtcd->recon.copy16x16 = vp8_copy_mem16x16_v6; rtcd->recon.copy8x8 = vp8_copy_mem8x8_v6; rtcd->recon.copy8x4 = vp8_copy_mem8x4_v6; + rtcd->recon.intra4x4_predict = vp8_intra4x4_predict_armv6; } #endif diff --git a/vp8/common/arm/armv6/intra4x4_predict_v6.asm b/vp8/common/arm/armv6/intra4x4_predict_v6.asm new file mode 100644 index 0000000..a974cd1 --- /dev/null +++ b/vp8/common/arm/armv6/intra4x4_predict_v6.asm @@ -0,0 +1,606 @@ +; +; Copyright (c) 2011 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8_intra4x4_predict_armv6| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + + +;void vp8_intra4x4_predict(unsigned char *src, int src_stride, int b_mode, +; unsigned char *dst, int dst_stride) + +|vp8_intra4x4_predict_armv6| PROC + push {r4-r12, lr} + + + cmp r2, #10 + addlt pc, pc, r2, lsl #2 ; position independent switch + pop {r4-r12, pc} ; default + b b_dc_pred + b b_tm_pred + b b_ve_pred + b b_he_pred + b b_ld_pred + b b_rd_pred + b b_vr_pred + b b_vl_pred + b b_hd_pred + b b_hu_pred + +b_dc_pred + ; load values + ldr r8, [r0, -r1] ; Above + ldrb r4, [r0, #-1]! ; Left[0] + mov r9, #0 + ldrb r5, [r0, r1] ; Left[1] + ldrb r6, [r0, r1, lsl #1]! ; Left[2] + usad8 r12, r8, r9 + ldrb r7, [r0, r1] ; Left[3] + + ; calculate dc + add r4, r4, r5 + add r4, r4, r6 + add r4, r4, r7 + add r4, r4, r12 + add r4, r4, #4 + ldr r0, [sp, #40] ; load stride + mov r12, r4, asr #3 ; (expected_dc + 4) >> 3 + + add r12, r12, r12, lsl #8 + add r3, r3, r0 + add r12, r12, r12, lsl #16 + + ; store values + str r12, [r3, -r0] + str r12, [r3] + str r12, [r3, r0] + str r12, [r3, r0, lsl #1] + + pop {r4-r12, pc} + +b_tm_pred + sub r10, r0, #1 ; Left + ldr r8, [r0, -r1] ; Above + ldrb r9, [r10, -r1] ; top_left + ldrb r4, [r0, #-1]! ; Left[0] + ldrb r5, [r10, r1]! 
; Left[1] + ldrb r6, [r0, r1, lsl #1] ; Left[2] + ldrb r7, [r10, r1, lsl #1] ; Left[3] + ldr r0, [sp, #40] ; load stride + + + add r9, r9, r9, lsl #16 ; [tl|tl] + uxtb16 r10, r8 ; a[2|0] + uxtb16 r11, r8, ror #8 ; a[3|1] + ssub16 r10, r10, r9 ; a[2|0] - [tl|tl] + ssub16 r11, r11, r9 ; a[3|1] - [tl|tl] + + add r4, r4, r4, lsl #16 ; l[0|0] + add r5, r5, r5, lsl #16 ; l[1|1] + add r6, r6, r6, lsl #16 ; l[2|2] + add r7, r7, r7, lsl #16 ; l[3|3] + + sadd16 r1, r4, r10 ; l[0|0] + a[2|0] - [tl|tl] + sadd16 r2, r4, r11 ; l[0|0] + a[3|1] - [tl|tl] + usat16 r1, #8, r1 + usat16 r2, #8, r2 + + sadd16 r4, r5, r10 ; l[1|1] + a[2|0] - [tl|tl] + sadd16 r5, r5, r11 ; l[1|1] + a[3|1] - [tl|tl] + + add r12, r1, r2, lsl #8 ; [3|2|1|0] + str r12, [r3], r0 + + usat16 r4, #8, r4 + usat16 r5, #8, r5 + + sadd16 r1, r6, r10 ; l[2|2] + a[2|0] - [tl|tl] + sadd16 r2, r6, r11 ; l[2|2] + a[3|1] - [tl|tl] + + add r12, r4, r5, lsl #8 ; [3|2|1|0] + str r12, [r3], r0 + + usat16 r1, #8, r1 + usat16 r2, #8, r2 + + sadd16 r4, r7, r10 ; l[3|3] + a[2|0] - [tl|tl] + sadd16 r5, r7, r11 ; l[3|3] + a[3|1] - [tl|tl] + + add r12, r1, r2, lsl #8 ; [3|2|1|0] + + usat16 r4, #8, r4 + usat16 r5, #8, r5 + + str r12, [r3], r0 + + add r12, r4, r5, lsl #8 ; [3|2|1|0] + str r12, [r3], r0 + + pop {r4-r12, pc} + +b_ve_pred + ldr r8, [r0, -r1]! ; a[3|2|1|0] + ldr r11, c00FF00FF + ldrb r9, [r0, #-1] ; top_left + ldrb r10, [r0, #4] ; a[4] + + ldr r0, c00020002 + + uxtb16 r4, r8 ; a[2|0] + uxtb16 r5, r8, ror #8 ; a[3|1] + ldr r2, [sp, #40] ; stride + pkhbt r9, r9, r5, lsl #16 ; a[1|-1] + + add r9, r9, r4, lsl #1 ;[a[1]+2*a[2] | tl+2*a[0] ] + uxtab16 r9, r9, r5 ;[a[1]+2*a[2]+a[3] | tl+2*a[0]+a[1] ] + uxtab16 r9, r9, r0 ;[a[1]+2*a[2]+a[3]+2| tl+2*a[0]+a[1]+2] + + add r0, r0, r10, lsl #16 ;[a[4]+2 | 2] + add r0, r0, r4, asr #16 ;[a[4]+2 | a[2]+2] + add r0, r0, r5, lsl #1 ;[a[4]+2*a[3]+2 | a[2]+2*a[1]+2] + uadd16 r4, r4, r0 ;[a[4]+2*a[3]+a[2]+2|a[2]+2*a[1]+a[0]+2] + + and r9, r11, r9, asr #2 + and r4, r11, r4, asr #2 + add r3, r3, r2 ; dst + dst_stride + add r9, r9, r4, lsl #8 + + ; store values + str r9, [r3, -r2] + str r9, [r3] + str r9, [r3, r2] + str r9, [r3, r2, lsl #1] + + pop {r4-r12, pc} + + +b_he_pred + sub r10, r0, #1 ; Left + ldrb r4, [r0, #-1]! ; Left[0] + ldrb r8, [r10, -r1] ; top_left + ldrb r5, [r10, r1]! ; Left[1] + ldrb r6, [r0, r1, lsl #1] ; Left[2] + ldrb r7, [r10, r1, lsl #1] ; Left[3] + + add r8, r8, r4 ; tl + l[0] + add r9, r4, r5 ; l[0] + l[1] + add r10, r5, r6 ; l[1] + l[2] + add r11, r6, r7 ; l[2] + l[3] + + mov r0, #2<<14 + + add r8, r8, r9 ; tl + 2*l[0] + l[1] + add r4, r9, r10 ; l[0] + 2*l[1] + l[2] + add r5, r10, r11 ; l[1] + 2*l[2] + l[3] + add r6, r11, r7, lsl #1 ; l[2] + 2*l[3] + l[3] + + + add r8, r0, r8, lsl #14 ; (tl + 2*l[0] + l[1])>>2 in top half + add r9, r0, r4, lsl #14 ; (l[0] + 2*l[1] + l[2])>>2 in top half + add r10,r0, r5, lsl #14 ; (l[1] + 2*l[2] + l[3])>>2 in top half + add r11,r0, r6, lsl #14 ; (l[2] + 2*l[3] + l[3])>>2 in top half + + pkhtb r8, r8, r8, asr #16 ; l[-|0|-|0] + pkhtb r9, r9, r9, asr #16 ; l[-|1|-|1] + pkhtb r10, r10, r10, asr #16 ; l[-|2|-|2] + pkhtb r11, r11, r11, asr #16 ; l[-|3|-|3] + + ldr r0, [sp, #40] ; stride + + add r8, r8, r8, lsl #8 ; l[0|0|0|0] + add r9, r9, r9, lsl #8 ; l[1|1|1|1] + add r10, r10, r10, lsl #8 ; l[2|2|2|2] + add r11, r11, r11, lsl #8 ; l[3|3|3|3] + + ; store values + str r8, [r3], r0 + str r9, [r3] + str r10, [r3, r0] + str r11, [r3, r0, lsl #1] + + pop {r4-r12, pc} + +b_ld_pred + ldr r4, [r0, -r1]! 
; Above + ldr r12, c00020002 + ldr r5, [r0, #4] + ldr lr, c00FF00FF + + uxtb16 r6, r4 ; a[2|0] + uxtb16 r7, r4, ror #8 ; a[3|1] + uxtb16 r8, r5 ; a[6|4] + uxtb16 r9, r5, ror #8 ; a[7|5] + pkhtb r10, r6, r8 ; a[2|4] + pkhtb r11, r7, r9 ; a[3|5] + + + add r4, r6, r7, lsl #1 ; [a2+2*a3 | a0+2*a1] + add r4, r4, r10, ror #16 ; [a2+2*a3+a4 | a0+2*a1+a2] + uxtab16 r4, r4, r12 ; [a2+2*a3+a4+2 | a0+2*a1+a2+2] + + add r5, r7, r10, ror #15 ; [a3+2*a4 | a1+2*a2] + add r5, r5, r11, ror #16 ; [a3+2*a4+a5 | a1+2*a2+a3] + uxtab16 r5, r5, r12 ; [a3+2*a4+a5+2 | a1+2*a2+a3+2] + + pkhtb r7, r9, r8, asr #16 + add r6, r8, r9, lsl #1 ; [a6+2*a7 | a4+2*a5] + uadd16 r6, r6, r7 ; [a6+2*a7+a7 | a4+2*a5+a6] + uxtab16 r6, r6, r12 ; [a6+2*a7+a7+2 | a4+2*a5+a6+2] + + uxth r7, r9 ; [ a5] + add r7, r7, r8, asr #15 ; [ a5+2*a6] + add r7, r7, r9, asr #16 ; [ a5+2*a6+a7] + uxtah r7, r7, r12 ; [ a5+2*a6+a7+2] + + ldr r0, [sp, #40] ; stride + + ; scale down + and r4, lr, r4, asr #2 + and r5, lr, r5, asr #2 + and r6, lr, r6, asr #2 + mov r7, r7, asr #2 + + add r8, r4, r5, lsl #8 ; [3|2|1|0] + str r8, [r3], r0 + + mov r9, r8, lsr #8 + add r9, r9, r6, lsl #24 ; [4|3|2|1] + str r9, [r3], r0 + + mov r10, r9, lsr #8 + add r10, r10, r7, lsl #24 ; [5|4|3|2] + str r10, [r3], r0 + + mov r6, r6, lsr #16 + mov r11, r10, lsr #8 + add r11, r11, r6, lsl #24 ; [6|5|4|3] + str r11, [r3], r0 + + pop {r4-r12, pc} + +b_rd_pred + sub r12, r0, r1 ; Above = src - src_stride + ldrb r7, [r0, #-1]! ; l[0] = pp[3] + ldr lr, [r12] ; Above = pp[8|7|6|5] + ldrb r8, [r12, #-1]! ; tl = pp[4] + ldrb r6, [r12, r1, lsl #1] ; l[1] = pp[2] + ldrb r5, [r0, r1, lsl #1] ; l[2] = pp[1] + ldrb r4, [r12, r1, lsl #2] ; l[3] = pp[0] + + + uxtb16 r9, lr ; p[7|5] + uxtb16 r10, lr, ror #8 ; p[8|6] + add r4, r4, r6, lsl #16 ; p[2|0] + add r5, r5, r7, lsl #16 ; p[3|1] + add r6, r6, r8, lsl #16 ; p[4|2] + pkhbt r7, r7, r9, lsl #16 ; p[5|3] + pkhbt r8, r8, r10, lsl #16 ; p[6|4] + + ldr r12, c00020002 + ldr lr, c00FF00FF + + add r4, r4, r5, lsl #1 ; [p2+2*p3 | p0+2*p1] + add r4, r4, r6 ; [p2+2*p3+p4 | p0+2*p1+p2] + uxtab16 r4, r4, r12 ; [p2+2*p3+p4+2 | p0+2*p1+p2+2] + + add r5, r5, r6, lsl #1 ; [p3+2*p4 | p1+2*p2] + add r5, r5, r7 ; [p3+2*p4+p5 | p1+2*p2+p3] + uxtab16 r5, r5, r12 ; [p3+2*p4+p5+2 | p1+2*p2+p3+2] + + add r6, r7, r8, lsl #1 ; [p5+2*p6 | p3+2*p4] + add r6, r6, r9 ; [p5+2*p6+p7 | p3+2*p4+p5] + uxtab16 r6, r6, r12 ; [p5+2*p6+p7+2 | p3+2*p4+p5+2] + + add r7, r8, r9, lsl #1 ; [p6+2*p7 | p4+2*p5] + add r7, r7, r10 ; [p6+2*p7+p8 | p4+2*p5+p6] + uxtab16 r7, r7, r12 ; [p6+2*p7+p8+2 | p4+2*p5+p6+2] + + ldr r0, [sp, #40] ; stride + + ; scale down + and r7, lr, r7, asr #2 + and r6, lr, r6, asr #2 + and r5, lr, r5, asr #2 + and r4, lr, r4, asr #2 + + add r8, r6, r7, lsl #8 ; [6|5|4|3] + str r8, [r3], r0 + + mov r9, r8, lsl #8 ; [5|4|3|-] + uxtab r9, r9, r4, ror #16 ; [5|4|3|2] + str r9, [r3], r0 + + mov r10, r9, lsl #8 ; [4|3|2|-] + uxtab r10, r10, r5 ; [4|3|2|1] + str r10, [r3], r0 + + mov r11, r10, lsl #8 ; [3|2|1|-] + uxtab r11, r11, r4 ; [3|2|1|0] + str r11, [r3], r0 + + pop {r4-r12, pc} + +b_vr_pred + sub r12, r0, r1 ; Above = src - src_stride + ldrb r7, [r0, #-1]! ; l[0] = pp[3] + ldr lr, [r12] ; Above = pp[8|7|6|5] + ldrb r8, [r12, #-1]! 
; tl = pp[4] + ldrb r6, [r12, r1, lsl #1] ; l[1] = pp[2] + ldrb r5, [r0, r1, lsl #1] ; l[2] = pp[1] + ldrb r4, [r12, r1, lsl #2] ; l[3] = pp[0] + + add r5, r5, r7, lsl #16 ; p[3|1] + add r6, r6, r8, lsl #16 ; p[4|2] + uxtb16 r9, lr ; p[7|5] + uxtb16 r10, lr, ror #8 ; p[8|6] + pkhbt r7, r7, r9, lsl #16 ; p[5|3] + pkhbt r8, r8, r10, lsl #16 ; p[6|4] + + ldr r4, c00010001 + ldr r12, c00020002 + ldr lr, c00FF00FF + + add r5, r5, r6, lsl #1 ; [p3+2*p4 | p1+2*p2] + add r5, r5, r7 ; [p3+2*p4+p5 | p1+2*p2+p3] + uxtab16 r5, r5, r12 ; [p3+2*p4+p5+2 | p1+2*p2+p3+2] + + add r6, r6, r7, lsl #1 ; [p4+2*p5 | p2+2*p3] + add r6, r6, r8 ; [p4+2*p5+p6 | p2+2*p3+p4] + uxtab16 r6, r6, r12 ; [p4+2*p5+p6+2 | p2+2*p3+p4+2] + + uadd16 r11, r8, r9 ; [p6+p7 | p4+p5] + uhadd16 r11, r11, r4 ; [(p6+p7+1)>>1 | (p4+p5+1)>>1] + ; [F|E] + + add r7, r7, r8, lsl #1 ; [p5+2*p6 | p3+2*p4] + add r7, r7, r9 ; [p5+2*p6+p7 | p3+2*p4+p5] + uxtab16 r7, r7, r12 ; [p5+2*p6+p7+2 | p3+2*p4+p5+2] + + uadd16 r2, r9, r10 ; [p7+p8 | p5+p6] + uhadd16 r2, r2, r4 ; [(p7+p8+1)>>1 | (p5+p6+1)>>1] + ; [J|I] + + add r8, r8, r9, lsl #1 ; [p6+2*p7 | p4+2*p5] + add r8, r8, r10 ; [p6+2*p7+p8 | p4+2*p5+p6] + uxtab16 r8, r8, r12 ; [p6+2*p7+p8+2 | p4+2*p5+p6+2] + + ldr r0, [sp, #40] ; stride + + ; scale down + and r5, lr, r5, asr #2 ; [B|A] + and r6, lr, r6, asr #2 ; [D|C] + and r7, lr, r7, asr #2 ; [H|G] + and r8, lr, r8, asr #2 ; [L|K] + + add r12, r11, r2, lsl #8 ; [J|F|I|E] + str r12, [r3], r0 + + add r12, r7, r8, lsl #8 ; [L|H|K|G] + str r12, [r3], r0 + + pkhbt r2, r6, r2, lsl #16 ; [-|I|-|C] + add r2, r2, r11, lsl #8 ; [F|I|E|C] + + pkhtb r12, r6, r5 ; [-|D|-|A] + pkhtb r10, r7, r5, asr #16 ; [-|H|-|B] + str r2, [r3], r0 + add r12, r12, r10, lsl #8 ; [H|D|B|A] + str r12, [r3], r0 + + pop {r4-r12, pc} + +b_vl_pred + ldr r4, [r0, -r1]! ; [3|2|1|0] + ldr r12, c00020002 + ldr r5, [r0, #4] ; [7|6|5|4] + ldr lr, c00FF00FF + ldr r2, c00010001 + + mov r0, r4, lsr #16 ; [-|-|3|2] + add r0, r0, r5, lsl #16 ; [5|4|3|2] + uxtb16 r6, r4 ; [2|0] + uxtb16 r7, r4, ror #8 ; [3|1] + uxtb16 r8, r0 ; [4|2] + uxtb16 r9, r0, ror #8 ; [5|3] + uxtb16 r10, r5 ; [6|4] + uxtb16 r11, r5, ror #8 ; [7|5] + + uadd16 r4, r6, r7 ; [p2+p3 | p0+p1] + uhadd16 r4, r4, r2 ; [(p2+p3+1)>>1 | (p0+p1+1)>>1] + ; [B|A] + + add r5, r6, r7, lsl #1 ; [p2+2*p3 | p0+2*p1] + add r5, r5, r8 ; [p2+2*p3+p4 | p0+2*p1+p2] + uxtab16 r5, r5, r12 ; [p2+2*p3+p4+2 | p0+2*p1+p2+2] + + uadd16 r6, r7, r8 ; [p3+p4 | p1+p2] + uhadd16 r6, r6, r2 ; [(p3+p4+1)>>1 | (p1+p2+1)>>1] + ; [F|E] + + add r7, r7, r8, lsl #1 ; [p3+2*p4 | p1+2*p2] + add r7, r7, r9 ; [p3+2*p4+p5 | p1+2*p2+p3] + uxtab16 r7, r7, r12 ; [p3+2*p4+p5+2 | p1+2*p2+p3+2] + + add r8, r8, r9, lsl #1 ; [p4+2*p5 | p2+2*p3] + add r8, r8, r10 ; [p4+2*p5+p6 | p2+2*p3+p4] + uxtab16 r8, r8, r12 ; [p4+2*p5+p6+2 | p2+2*p3+p4+2] + + add r9, r9, r10, lsl #1 ; [p5+2*p6 | p3+2*p4] + add r9, r9, r11 ; [p5+2*p6+p7 | p3+2*p4+p5] + uxtab16 r9, r9, r12 ; [p5+2*p6+p7+2 | p3+2*p4+p5+2] + + ldr r0, [sp, #40] ; stride + + ; scale down + and r5, lr, r5, asr #2 ; [D|C] + and r7, lr, r7, asr #2 ; [H|G] + and r8, lr, r8, asr #2 ; [I|D] + and r9, lr, r9, asr #2 ; [J|H] + + + add r10, r4, r6, lsl #8 ; [F|B|E|A] + str r10, [r3], r0 + + add r5, r5, r7, lsl #8 ; [H|C|G|D] + str r5, [r3], r0 + + pkhtb r12, r8, r4, asr #16 ; [-|I|-|B] + pkhtb r10, r9, r8 ; [-|J|-|D] + + add r12, r6, r12, lsl #8 ; [I|F|B|E] + str r12, [r3], r0 + + add r10, r7, r10, lsl #8 ; [J|H|D|G] + str r10, [r3], r0 + + pop {r4-r12, pc} + +b_hd_pred + sub r12, r0, r1 ; Above = src - src_stride + ldrb r7, [r0, #-1]! 
; l[0] = pp[3] + ldr lr, [r12] ; Above = pp[8|7|6|5] + ldrb r8, [r12, #-1]! ; tl = pp[4] + ldrb r6, [r0, r1] ; l[1] = pp[2] + ldrb r5, [r0, r1, lsl #1] ; l[2] = pp[1] + ldrb r4, [r12, r1, lsl #2] ; l[3] = pp[0] + + uxtb16 r9, lr ; p[7|5] + uxtb16 r10, lr, ror #8 ; p[8|6] + + add r4, r4, r5, lsl #16 ; p[1|0] + add r5, r5, r6, lsl #16 ; p[2|1] + add r6, r6, r7, lsl #16 ; p[3|2] + add r7, r7, r8, lsl #16 ; p[4|3] + + ldr r12, c00020002 + ldr lr, c00FF00FF + ldr r2, c00010001 + + pkhtb r8, r7, r9 ; p[4|5] + pkhtb r1, r9, r10 ; p[7|6] + pkhbt r10, r8, r10, lsl #16 ; p[6|5] + + + uadd16 r11, r4, r5 ; [p1+p2 | p0+p1] + uhadd16 r11, r11, r2 ; [(p1+p2+1)>>1 | (p0+p1+1)>>1] + ; [B|A] + + add r4, r4, r5, lsl #1 ; [p1+2*p2 | p0+2*p1] + add r4, r4, r6 ; [p1+2*p2+p3 | p0+2*p1+p2] + uxtab16 r4, r4, r12 ; [p1+2*p2+p3+2 | p0+2*p1+p2+2] + + uadd16 r0, r6, r7 ; [p3+p4 | p2+p3] + uhadd16 r0, r0, r2 ; [(p3+p4+1)>>1 | (p2+p3+1)>>1] + ; [F|E] + + add r5, r6, r7, lsl #1 ; [p3+2*p4 | p2+2*p3] + add r5, r5, r8, ror #16 ; [p3+2*p4+p5 | p2+2*p3+p4] + uxtab16 r5, r5, r12 ; [p3+2*p4+p5+2 | p2+2*p3+p4+2] + + add r6, r12, r8, ror #16 ; [p5+2 | p4+2] + add r6, r6, r10, lsl #1 ; [p5+2+2*p6 | p4+2+2*p5] + uxtab16 r6, r6, r1 ; [p5+2+2*p6+p7 | p4+2+2*p5+p6] + + ; scale down + and r4, lr, r4, asr #2 ; [D|C] + and r5, lr, r5, asr #2 ; [H|G] + and r6, lr, r6, asr #2 ; [J|I] + + ldr lr, [sp, #40] ; stride + + pkhtb r2, r0, r6 ; [-|F|-|I] + pkhtb r12, r6, r5, asr #16 ; [-|J|-|H] + add r12, r12, r2, lsl #8 ; [F|J|I|H] + add r2, r0, r5, lsl #8 ; [H|F|G|E] + mov r12, r12, ror #24 ; [J|I|H|F] + str r12, [r3], lr + + + mov r7, r11, asr #16 ; [-|-|-|B] + str r2, [r3], lr + add r7, r7, r0, lsl #16 ; [-|E|-|B] + add r7, r7, r4, asr #8 ; [-|E|D|B] + add r7, r7, r5, lsl #24 ; [G|E|D|B] + str r7, [r3], lr + + add r5, r11, r4, lsl #8 ; [D|B|C|A] + str r5, [r3], lr + + pop {r4-r12, pc} + + + +b_hu_pred + ldrb r4, [r0, #-1]! ; Left[0] + ldr r12, c00020002 + ldrb r5, [r0, r1]! ; Left[1] + ldr lr, c00FF00FF + ldrb r6, [r0, r1]! 
; Left[2] + ldr r2, c00010001 + ldrb r7, [r0, r1] ; Left[3] + + + add r4, r4, r5, lsl #16 ; [1|0] + add r5, r5, r6, lsl #16 ; [2|1] + add r9, r6, r7, lsl #16 ; [3|2] + + uadd16 r8, r4, r5 ; [p1+p2 | p0+p1] + uhadd16 r8, r8, r2 ; [(p1+p2+1)>>1 | (p0+p1+1)>>1] + ; [B|A] + + add r4, r4, r5, lsl #1 ; [p1+2*p2 | p0+2*p1] + add r4, r4, r9 ; [p1+2*p2+p3 | p0+2*p1+p2] + uxtab16 r4, r4, r12 ; [p1+2*p2+p3+2 | p0+2*p1+p2+2] + ldr r2, [sp, #40] ; stride + and r4, lr, r4, asr #2 ; [D|C] + + add r10, r6, r7 ; [p2+p3] + add r11, r10, r7, lsl #1 ; [p2+3*p3] + add r10, r10, #1 + add r11, r11, #2 + mov r10, r10, asr #1 ; [E] + mov r11, r11, asr #2 ; [F] + + add r9, r7, r9, asr #8 ; [-|-|G|G] + add r0, r8, r4, lsl #8 ; [D|B|C|A] + add r7, r9, r9, lsl #16 ; [G|G|G|G] + + str r0, [r3], r2 + + mov r1, r8, asr #16 ; [-|-|-|B] + add r1, r1, r4, asr #8 ; [-|-|D|B] + add r1, r1, r10, lsl #16 ; [-|E|D|B] + add r1, r1, r11, lsl #24 ; [F|E|D|B] + str r1, [r3], r2 + + add r10, r11, lsl #8 ; [-|-|F|E] + add r10, r10, r9, lsl #16 ; [G|G|F|E] + str r10, [r3] + + str r7, [r3, r2] + + pop {r4-r12, pc} + + ENDP + +; constants +c00010001 + DCD 0x00010001 +c00020002 + DCD 0x00020002 +c00FF00FF + DCD 0x00FF00FF + + END diff --git a/vp8/common/arm/recon_arm.h b/vp8/common/arm/recon_arm.h index dec7fc4..f8ef56a 100644 --- a/vp8/common/arm/recon_arm.h +++ b/vp8/common/arm/recon_arm.h @@ -17,6 +17,7 @@ extern prototype_copy_block(vp8_copy_mem8x8_v6); extern prototype_copy_block(vp8_copy_mem8x4_v6); extern prototype_copy_block(vp8_copy_mem16x16_v6); +extern prototype_intra4x4_predict(vp8_intra4x4_predict_armv6); #if !CONFIG_RUNTIME_CPU_DETECT #undef vp8_recon_copy8x8 @@ -27,6 +28,9 @@ extern prototype_copy_block(vp8_copy_mem16x16_v6); #undef vp8_recon_copy16x16 #define vp8_recon_copy16x16 vp8_copy_mem16x16_v6 + +#undef vp8_recon_intra4x4_predict +#define vp8_recon_intra4x4_predict vp8_intra4x4_predict_armv6 #endif #endif diff --git a/vp8/common/asm_com_offsets.c b/vp8/common/asm_com_offsets.c index e135a4d..704a3f0 100644 --- a/vp8/common/asm_com_offsets.c +++ b/vp8/common/asm_com_offsets.c @@ -13,6 +13,7 @@ #include "vpx/vpx_codec.h" #include "vpx_ports/asm_offsets.h" #include "vpx_scale/yv12config.h" +#include "vp8/common/blockd.h" BEGIN @@ -34,6 +35,20 @@ END /* add asserts for any offset that is not supported by assembly code */ /* add asserts for any size that is not supported by assembly code */ +#if HAVE_ARMV6 +/* switch case in vp8_intra4x4_predict_armv6 is based on these enumerated values */ +ct_assert(B_DC_PRED, B_DC_PRED == 0); +ct_assert(B_TM_PRED, B_TM_PRED == 1); +ct_assert(B_VE_PRED, B_VE_PRED == 2); +ct_assert(B_HE_PRED, B_HE_PRED == 3); +ct_assert(B_LD_PRED, B_LD_PRED == 4); +ct_assert(B_RD_PRED, B_RD_PRED == 5); +ct_assert(B_VR_PRED, B_VR_PRED == 6); +ct_assert(B_VL_PRED, B_VL_PRED == 7); +ct_assert(B_HD_PRED, B_HD_PRED == 8); +ct_assert(B_HU_PRED, B_HU_PRED == 9); +#endif + #if HAVE_ARMV7 /* vp8_yv12_extend_frame_borders_neon makes several assumptions based on this */ ct_assert(VP8BORDERINPIXELS_VAL, VP8BORDERINPIXELS == 32) diff --git a/vp8/common/generic/systemdependent.c b/vp8/common/generic/systemdependent.c index 5c4fbb1..9641d8c 100644 --- a/vp8/common/generic/systemdependent.c +++ b/vp8/common/generic/systemdependent.c @@ -88,7 +88,7 @@ void vp8_machine_specific_config(VP8_COMMON *ctx) rtcd->recon.build_intra_predictors_mbuv_s = vp8_build_intra_predictors_mbuv_s; rtcd->recon.intra4x4_predict = - vp8_intra4x4_predict; + vp8_intra4x4_predict_c; rtcd->subpix.sixtap16x16 = vp8_sixtap_predict16x16_c; 
rtcd->subpix.sixtap8x8 = vp8_sixtap_predict8x8_c; diff --git a/vp8/common/recon.h b/vp8/common/recon.h index 62bd71a..98c5f63 100644 --- a/vp8/common/recon.h +++ b/vp8/common/recon.h @@ -27,7 +27,8 @@ void sym(MACROBLOCKD *x) #define prototype_intra4x4_predict(sym) \ - void sym(BLOCKD *x, int b_mode, unsigned char *predictor, int stride) + void sym(unsigned char *src, int src_stride, int b_mode, \ + unsigned char *dst, int dst_stride) struct vp8_recon_rtcd_vtable; @@ -79,7 +80,7 @@ extern prototype_build_intra_predictors\ (vp8_recon_build_intra_predictors_mbuv_s); #ifndef vp8_recon_intra4x4_predict -#define vp8_recon_intra4x4_predict vp8_intra4x4_predict +#define vp8_recon_intra4x4_predict vp8_intra4x4_predict_c #endif extern prototype_intra4x4_predict\ (vp8_recon_intra4x4_predict); diff --git a/vp8/common/reconintra4x4.c b/vp8/common/reconintra4x4.c index 12430da..64dac5d 100644 --- a/vp8/common/reconintra4x4.c +++ b/vp8/common/reconintra4x4.c @@ -9,25 +9,22 @@ */ -#include "vpx_config.h" #include "recon.h" -#include "vpx_mem/vpx_mem.h" -#include "reconintra.h" -void vp8_intra4x4_predict(BLOCKD *x, - int b_mode, - unsigned char *predictor, int stride) +void vp8_intra4x4_predict_c(unsigned char *src, int src_stride, + int b_mode, + unsigned char *dst, int dst_stride) { int i, r, c; - unsigned char *Above = *(x->base_dst) + x->dst - x->dst_stride; + unsigned char *Above = src - src_stride; unsigned char Left[4]; unsigned char top_left = Above[-1]; - Left[0] = (*(x->base_dst))[x->dst - 1]; - Left[1] = (*(x->base_dst))[x->dst - 1 + x->dst_stride]; - Left[2] = (*(x->base_dst))[x->dst - 1 + 2 * x->dst_stride]; - Left[3] = (*(x->base_dst))[x->dst - 1 + 3 * x->dst_stride]; + Left[0] = src[-1]; + Left[1] = src[-1 + src_stride]; + Left[2] = src[-1 + 2 * src_stride]; + Left[3] = src[-1 + 3 * src_stride]; switch (b_mode) { @@ -47,10 +44,10 @@ void vp8_intra4x4_predict(BLOCKD *x, { for (c = 0; c < 4; c++) { - predictor[c] = expected_dc; + dst[c] = expected_dc; } - predictor += stride; + dst += dst_stride; } } break; @@ -69,10 +66,10 @@ void vp8_intra4x4_predict(BLOCKD *x, if (pred > 255) pred = 255; - predictor[c] = pred; + dst[c] = pred; } - predictor += stride; + dst += dst_stride; } } break; @@ -91,10 +88,10 @@ void vp8_intra4x4_predict(BLOCKD *x, for (c = 0; c < 4; c++) { - predictor[c] = ap[c]; + dst[c] = ap[c]; } - predictor += stride; + dst += dst_stride; } } @@ -114,32 +111,32 @@ void vp8_intra4x4_predict(BLOCKD *x, { for (c = 0; c < 4; c++) { - predictor[c] = lp[r]; + dst[c] = lp[r]; } - predictor += stride; + dst += dst_stride; } } break; case B_LD_PRED: { unsigned char *ptr = Above; - predictor[0 * stride + 0] = (ptr[0] + ptr[1] * 2 + ptr[2] + 2) >> 2; - predictor[0 * stride + 1] = - predictor[1 * stride + 0] = (ptr[1] + ptr[2] * 2 + ptr[3] + 2) >> 2; - predictor[0 * stride + 2] = - predictor[1 * stride + 1] = - predictor[2 * stride + 0] = (ptr[2] + ptr[3] * 2 + ptr[4] + 2) >> 2; - predictor[0 * stride + 3] = - predictor[1 * stride + 2] = - predictor[2 * stride + 1] = - predictor[3 * stride + 0] = (ptr[3] + ptr[4] * 2 + ptr[5] + 2) >> 2; - predictor[1 * stride + 3] = - predictor[2 * stride + 2] = - predictor[3 * stride + 1] = (ptr[4] + ptr[5] * 2 + ptr[6] + 2) >> 2; - predictor[2 * stride + 3] = - predictor[3 * stride + 2] = (ptr[5] + ptr[6] * 2 + ptr[7] + 2) >> 2; - predictor[3 * stride + 3] = (ptr[6] + ptr[7] * 2 + ptr[7] + 2) >> 2; + dst[0 * dst_stride + 0] = (ptr[0] + ptr[1] * 2 + ptr[2] + 2) >> 2; + dst[0 * dst_stride + 1] = + dst[1 * dst_stride + 0] = (ptr[1] + ptr[2] * 2 + ptr[3] + 
2) >> 2; + dst[0 * dst_stride + 2] = + dst[1 * dst_stride + 1] = + dst[2 * dst_stride + 0] = (ptr[2] + ptr[3] * 2 + ptr[4] + 2) >> 2; + dst[0 * dst_stride + 3] = + dst[1 * dst_stride + 2] = + dst[2 * dst_stride + 1] = + dst[3 * dst_stride + 0] = (ptr[3] + ptr[4] * 2 + ptr[5] + 2) >> 2; + dst[1 * dst_stride + 3] = + dst[2 * dst_stride + 2] = + dst[3 * dst_stride + 1] = (ptr[4] + ptr[5] * 2 + ptr[6] + 2) >> 2; + dst[2 * dst_stride + 3] = + dst[3 * dst_stride + 2] = (ptr[5] + ptr[6] * 2 + ptr[7] + 2) >> 2; + dst[3 * dst_stride + 3] = (ptr[6] + ptr[7] * 2 + ptr[7] + 2) >> 2; } break; @@ -158,22 +155,22 @@ void vp8_intra4x4_predict(BLOCKD *x, pp[7] = Above[2]; pp[8] = Above[3]; - predictor[3 * stride + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2; - predictor[3 * stride + 1] = - predictor[2 * stride + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2; - predictor[3 * stride + 2] = - predictor[2 * stride + 1] = - predictor[1 * stride + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2; - predictor[3 * stride + 3] = - predictor[2 * stride + 2] = - predictor[1 * stride + 1] = - predictor[0 * stride + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2; - predictor[2 * stride + 3] = - predictor[1 * stride + 2] = - predictor[0 * stride + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2; - predictor[1 * stride + 3] = - predictor[0 * stride + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2; - predictor[0 * stride + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2; + dst[3 * dst_stride + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2; + dst[3 * dst_stride + 1] = + dst[2 * dst_stride + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2; + dst[3 * dst_stride + 2] = + dst[2 * dst_stride + 1] = + dst[1 * dst_stride + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2; + dst[3 * dst_stride + 3] = + dst[2 * dst_stride + 2] = + dst[1 * dst_stride + 1] = + dst[0 * dst_stride + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2; + dst[2 * dst_stride + 3] = + dst[1 * dst_stride + 2] = + dst[0 * dst_stride + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2; + dst[1 * dst_stride + 3] = + dst[0 * dst_stride + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2; + dst[0 * dst_stride + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2; } break; @@ -193,22 +190,22 @@ void vp8_intra4x4_predict(BLOCKD *x, pp[8] = Above[3]; - predictor[3 * stride + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2; - predictor[2 * stride + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2; - predictor[3 * stride + 1] = - predictor[1 * stride + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2; - predictor[2 * stride + 1] = - predictor[0 * stride + 0] = (pp[4] + pp[5] + 1) >> 1; - predictor[3 * stride + 2] = - predictor[1 * stride + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2; - predictor[2 * stride + 2] = - predictor[0 * stride + 1] = (pp[5] + pp[6] + 1) >> 1; - predictor[3 * stride + 3] = - predictor[1 * stride + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2; - predictor[2 * stride + 3] = - predictor[0 * stride + 2] = (pp[6] + pp[7] + 1) >> 1; - predictor[1 * stride + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2; - predictor[0 * stride + 3] = (pp[7] + pp[8] + 1) >> 1; + dst[3 * dst_stride + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2; + dst[2 * dst_stride + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2; + dst[3 * dst_stride + 1] = + dst[1 * dst_stride + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2; + dst[2 * dst_stride + 1] = + dst[0 * dst_stride + 0] = (pp[4] + pp[5] + 1) >> 1; + dst[3 * dst_stride + 2] = + dst[1 * dst_stride + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2; + dst[2 * dst_stride + 2] = + dst[0 * dst_stride + 1] = (pp[5] + pp[6] + 1) >> 1; + dst[3 * dst_stride + 3] = + 
dst[1 * dst_stride + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2; + dst[2 * dst_stride + 3] = + dst[0 * dst_stride + 2] = (pp[6] + pp[7] + 1) >> 1; + dst[1 * dst_stride + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2; + dst[0 * dst_stride + 3] = (pp[7] + pp[8] + 1) >> 1; } break; @@ -217,22 +214,22 @@ void vp8_intra4x4_predict(BLOCKD *x, unsigned char *pp = Above; - predictor[0 * stride + 0] = (pp[0] + pp[1] + 1) >> 1; - predictor[1 * stride + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2; - predictor[2 * stride + 0] = - predictor[0 * stride + 1] = (pp[1] + pp[2] + 1) >> 1; - predictor[1 * stride + 1] = - predictor[3 * stride + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2; - predictor[2 * stride + 1] = - predictor[0 * stride + 2] = (pp[2] + pp[3] + 1) >> 1; - predictor[3 * stride + 1] = - predictor[1 * stride + 2] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2; - predictor[0 * stride + 3] = - predictor[2 * stride + 2] = (pp[3] + pp[4] + 1) >> 1; - predictor[1 * stride + 3] = - predictor[3 * stride + 2] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2; - predictor[2 * stride + 3] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2; - predictor[3 * stride + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2; + dst[0 * dst_stride + 0] = (pp[0] + pp[1] + 1) >> 1; + dst[1 * dst_stride + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2; + dst[2 * dst_stride + 0] = + dst[0 * dst_stride + 1] = (pp[1] + pp[2] + 1) >> 1; + dst[1 * dst_stride + 1] = + dst[3 * dst_stride + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2; + dst[2 * dst_stride + 1] = + dst[0 * dst_stride + 2] = (pp[2] + pp[3] + 1) >> 1; + dst[3 * dst_stride + 1] = + dst[1 * dst_stride + 2] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2; + dst[0 * dst_stride + 3] = + dst[2 * dst_stride + 2] = (pp[3] + pp[4] + 1) >> 1; + dst[1 * dst_stride + 3] = + dst[3 * dst_stride + 2] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2; + dst[2 * dst_stride + 3] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2; + dst[3 * dst_stride + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2; } break; @@ -250,22 +247,22 @@ void vp8_intra4x4_predict(BLOCKD *x, pp[8] = Above[3]; - predictor[3 * stride + 0] = (pp[0] + pp[1] + 1) >> 1; - predictor[3 * stride + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2; - predictor[2 * stride + 0] = - predictor[3 * stride + 2] = (pp[1] + pp[2] + 1) >> 1; - predictor[2 * stride + 1] = - predictor[3 * stride + 3] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2; - predictor[2 * stride + 2] = - predictor[1 * stride + 0] = (pp[2] + pp[3] + 1) >> 1; - predictor[2 * stride + 3] = - predictor[1 * stride + 1] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2; - predictor[1 * stride + 2] = - predictor[0 * stride + 0] = (pp[3] + pp[4] + 1) >> 1; - predictor[1 * stride + 3] = - predictor[0 * stride + 1] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2; - predictor[0 * stride + 2] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2; - predictor[0 * stride + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2; + dst[3 * dst_stride + 0] = (pp[0] + pp[1] + 1) >> 1; + dst[3 * dst_stride + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2; + dst[2 * dst_stride + 0] = + dst[3 * dst_stride + 2] = (pp[1] + pp[2] + 1) >> 1; + dst[2 * dst_stride + 1] = + dst[3 * dst_stride + 3] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2; + dst[2 * dst_stride + 2] = + dst[1 * dst_stride + 0] = (pp[2] + pp[3] + 1) >> 1; + dst[2 * dst_stride + 3] = + dst[1 * dst_stride + 1] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2; + dst[1 * dst_stride + 2] = + dst[0 * dst_stride + 0] = (pp[3] + pp[4] + 1) >> 1; + dst[1 * dst_stride + 3] = + dst[0 * dst_stride + 1] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2; + dst[0 * dst_stride + 2] = (pp[4] + pp[5] * 2 + pp[6] 
+ 2) >> 2; + dst[0 * dst_stride + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2; } break; @@ -273,22 +270,22 @@ void vp8_intra4x4_predict(BLOCKD *x, case B_HU_PRED: { unsigned char *pp = Left; - predictor[0 * stride + 0] = (pp[0] + pp[1] + 1) >> 1; - predictor[0 * stride + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2; - predictor[0 * stride + 2] = - predictor[1 * stride + 0] = (pp[1] + pp[2] + 1) >> 1; - predictor[0 * stride + 3] = - predictor[1 * stride + 1] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2; - predictor[1 * stride + 2] = - predictor[2 * stride + 0] = (pp[2] + pp[3] + 1) >> 1; - predictor[1 * stride + 3] = - predictor[2 * stride + 1] = (pp[2] + pp[3] * 2 + pp[3] + 2) >> 2; - predictor[2 * stride + 2] = - predictor[2 * stride + 3] = - predictor[3 * stride + 0] = - predictor[3 * stride + 1] = - predictor[3 * stride + 2] = - predictor[3 * stride + 3] = pp[3]; + dst[0 * dst_stride + 0] = (pp[0] + pp[1] + 1) >> 1; + dst[0 * dst_stride + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2; + dst[0 * dst_stride + 2] = + dst[1 * dst_stride + 0] = (pp[1] + pp[2] + 1) >> 1; + dst[0 * dst_stride + 3] = + dst[1 * dst_stride + 1] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2; + dst[1 * dst_stride + 2] = + dst[2 * dst_stride + 0] = (pp[2] + pp[3] + 1) >> 1; + dst[1 * dst_stride + 3] = + dst[2 * dst_stride + 1] = (pp[2] + pp[3] * 2 + pp[3] + 2) >> 2; + dst[2 * dst_stride + 2] = + dst[2 * dst_stride + 3] = + dst[3 * dst_stride + 0] = + dst[3 * dst_stride + 1] = + dst[3 * dst_stride + 2] = + dst[3 * dst_stride + 3] = pp[3]; } break; @@ -316,5 +313,3 @@ void vp8_intra_prediction_down_copy(MACROBLOCKD *x) *dst_ptr1 = *src_ptr; *dst_ptr2 = *src_ptr; } - - diff --git a/vp8/decoder/decodframe.c b/vp8/decoder/decodframe.c index f44c7d3..e501b9e 100644 --- a/vp8/decoder/decodframe.c +++ b/vp8/decoder/decodframe.c @@ -211,7 +211,8 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int b_mode = xd->mode_info_context->bmi[i].as_mode; RECON_INVOKE(RTCD_VTABLE(recon), intra4x4_predict) - (b, b_mode, *(b->base_dst) + b->dst, b->dst_stride); + ( *(b->base_dst) + b->dst, b->dst_stride, b_mode, + *(b->base_dst) + b->dst, b->dst_stride ); if (xd->eobs[i] ) { diff --git a/vp8/encoder/encodeintra.c b/vp8/encoder/encodeintra.c index 1c07cbd..46bf650 100644 --- a/vp8/encoder/encodeintra.c +++ b/vp8/encoder/encodeintra.c @@ -64,7 +64,8 @@ void vp8_encode_intra4x4block(const VP8_ENCODER_RTCD *rtcd, BLOCK *be = &x->block[ib]; RECON_INVOKE(&rtcd->common->recon, intra4x4_predict) - (b, b->bmi.as_mode, b->predictor, 16); + (*(b->base_dst) + b->dst, b->dst_stride, + b->bmi.as_mode, b->predictor, 16); ENCODEMB_INVOKE(&rtcd->encodemb, subb)(be, b, 16); diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c index 62e644d..4d87341 100644 --- a/vp8/encoder/pickinter.c +++ b/vp8/encoder/pickinter.c @@ -157,7 +157,8 @@ static int pick_intra4x4block( rate = mode_costs[mode]; RECON_INVOKE(&rtcd->common->recon, intra4x4_predict) - (b, mode, b->predictor, 16); + (*(b->base_dst) + b->dst, b->dst_stride, + mode, b->predictor, 16); distortion = get_prediction_error(be, b, &rtcd->variance); this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion); diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c index 7950960..f966b06 100644 --- a/vp8/encoder/rdopt.c +++ b/vp8/encoder/rdopt.c @@ -631,7 +631,8 @@ static int rd_pick_intra4x4block( rate = bmode_costs[mode]; RECON_INVOKE(&cpi->rtcd.common->recon, intra4x4_predict) - (b, mode, b->predictor, 16); + (*(b->base_dst) + b->dst, b->dst_stride, + mode, b->predictor, 16); 
ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), subb)(be, b, 16); x->vp8_short_fdct4x4(be->src_diff, be->coeff, 32); x->quantize_b(be, b); diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk index 85b8113..5c15a3e 100644 --- a/vp8/vp8_common.mk +++ b/vp8/vp8_common.mk @@ -126,6 +126,7 @@ VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/idct_v6$(ASM) VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/loopfilter_v6$(ASM) VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/simpleloopfilter_v6$(ASM) VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/sixtappredict8x4_v6$(ASM) +VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/intra4x4_predict_v6$(ASM) # common (neon) VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/bilinearpredict4x4_neon$(ASM) -- 2.7.4
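
Notes for reviewers on the interface change and the asm techniques used:

1) New prototype. The old prototype took a BLOCKD * and re-derived the
neighbour pointers from base_dst/dst/dst_stride internally; the new one
takes the source (reconstruction) pointer and stride directly, which is
what lets the asm avoid knowing the BLOCKD layout. A minimal sketch of
the two calling conventions visible in the hunks above (b, b_mode and
mode stand for the locals in decodframe.c and the encoder files):

    unsigned char *recon = *(b->base_dst) + b->dst;

    /* decoder: neighbours are read from, and the prediction written
       back to, the same reconstruction buffer */
    vp8_intra4x4_predict_c(recon, b->dst_stride, b_mode,
                           recon, b->dst_stride);

    /* encoder: neighbours come from the reconstruction, the output
       goes to the 16-byte-stride predictor scratch buffer */
    vp8_intra4x4_predict_c(recon, b->dst_stride, mode,
                           b->predictor, 16);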
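
2) Mode dispatch. The asm dispatches on b_mode with
"addlt pc, pc, r2, lsl #2": in ARM state a pc read returns the address
of the current instruction plus 8, so the add lands on the branch table
that follows the default pop, and each of the ten b entries is 4 bytes,
hence the lsl #2. The ct_assert()s added to asm_com_offsets.c pin
B_DC_PRED..B_HU_PRED to 0..9 so this indexing stays valid if the enum
ever changes. A C analogue of the same dispatch, with hypothetical
per-mode function names:

    typedef void (*b_predict_fn)(unsigned char *src, int src_stride,
                                 unsigned char *dst, int dst_stride);

    /* one entry per B_*_PRED value, in enum order 0..9 */
    static const b_predict_fn b_predict[10] =
    {
        b_dc, b_tm, b_ve, b_he, b_ld, b_rd, b_vr, b_vl, b_hd, b_hu
    };

    if (b_mode < 10)                           /* cmp   r2, #10 */
        b_predict[b_mode](src, src_stride, dst, dst_stride);
    /* else fall through: the asm's default pop {r4-r12, pc} */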
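
3) Two pixels per ALU op. Most of the directional modes reduce to the
3-tap smoothing filter (a + 2*b + c + 2) >> 2. The asm unpacks bytes
into 16-bit lanes with uxtb16 so that one 32-bit add filters two pixels,
the c00020002 constant supplies the +2 rounding in both lanes (via
uxtab16), and the final "and ..., asr #2" performs the per-lane shift.
A self-contained C model of that two-lane arithmetic, for illustration
only:

    /* a, b and c each hold two unpacked pixels, 0x00qq00pp, as
       produced by uxtb16.  The per-lane worst case is
       255 + 2*255 + 255 + 2 = 1022 < 65536, so the lanes cannot
       carry into each other. */
    static unsigned int filter_pair(unsigned int a, unsigned int b,
                                    unsigned int c)
    {
        unsigned int sum = a + 2 * b + c + 0x00020002; /* +2 per lane */
        return (sum >> 2) & 0x00FF00FF;  /* per-lane >> 2 (asr #2)   */
    }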
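
4) DC prediction. usad8 against a zero register sums the four Above
bytes in one instruction, and replicating the resulting DC byte across
a word turns the 4x4 fill into four word stores. A C sketch of the same
computation; like the asm, it assumes unaligned 32-bit stores are legal
on the target:

    static void b_dc_sketch(const unsigned char *above,
                            const unsigned char left[4],
                            unsigned char *dst, int dst_stride)
    {
        int r;
        unsigned int dc = above[0] + above[1] + above[2] + above[3];

        dc += left[0] + left[1] + left[2] + left[3] + 4;
        dc >>= 3;              /* (expected_dc + 4) >> 3            */
        dc += dc << 8;         /* add r12, r12, r12, lsl #8         */
        dc += dc << 16;        /* add r12, r12, r12, lsl #16        */
        for (r = 0; r < 4; r++)                     /* four str's   */
            *(unsigned int *)(dst + r * dst_stride) = dc;
    }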
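
5) TM prediction. b_tm_pred evaluates clamp(Left[r] + Above[c] -
top_left) two pixels at a time: ssub16 forms the signed Above[c] -
top_left differences, sadd16 adds the replicated left pixel, and
usat16 #8 saturates both lanes to 0..255, replacing the two explicit
range checks of the C version. The per-pixel scalar form, condensed
from the B_TM_PRED case above:

    /* what ssub16/sadd16/usat16 compute for each 16-bit lane */
    static unsigned char tm_pixel(int left, int above, int top_left)
    {
        int pred = left + above - top_left;

        if (pred < 0)
            pred = 0;          /* usat16 lower bound */
        else if (pred > 255)
            pred = 255;        /* usat16 upper bound */
        return (unsigned char)pred;
    }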