Merge "NEON FDCT updated to match current C code"
author Johann <johannkoenig@google.com>
Tue, 20 Sep 2011 16:51:05 +0000 (09:51 -0700)
committer Gerrit Code Review <gerrit@gerrit.golo.chromium.org>
Tue, 20 Sep 2011 16:51:05 +0000 (09:51 -0700)
14 files changed:
vp8/decoder/decodframe.c
vp8/decoder/onyxd_if.c
vp8/decoder/threading.c
vp8/encoder/arm/arm_csystemdependent.c
vp8/encoder/arm/armv5te/boolhuff_armv5te.asm
vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm
vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm
vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm
vp8/encoder/arm/armv6/vp8_short_fdct4x4_armv6.asm [moved from vp8/encoder/arm/armv6/vp8_fast_fdct4x4_armv6.asm with 99% similarity]
vp8/encoder/arm/armv6/walsh_v6.asm
vp8/encoder/arm/dct_arm.c
vp8/encoder/arm/dct_arm.h
vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm
vp8/vp8cx_arm.mk

index 1c11b0b..4cd370a 100644 (file)
@@ -191,7 +191,7 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
     {
         vp8_reset_mb_tokens_context(xd);
     }
-    else
+    else if (!vp8dx_bool_error(xd->current_bc))
     {
         eobtotal = vp8_decode_mb_tokens(pbi, xd);
     }
@@ -236,7 +236,6 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
     {
         vp8_build_inter_predictors_mb(xd);
     }
-
     /* When we have independent partitions we can apply residual even
      * though other partitions within the frame are corrupt.
      */
@@ -471,9 +470,16 @@ static void setup_token_decoder_partition_input(VP8D_COMP *pbi)
 {
     vp8_reader *bool_decoder = &pbi->bc2;
     int part_idx = 1;
+    int num_token_partitions;
 
     TOKEN_PARTITION multi_token_partition =
             (TOKEN_PARTITION)vp8_read_literal(&pbi->bc, 2);
+    if (!vp8dx_bool_error(&pbi->bc))
+        pbi->common.multi_token_partition = multi_token_partition;
+    num_token_partitions = 1 << pbi->common.multi_token_partition;
+    if (num_token_partitions + 1 > pbi->num_partitions)
+        vpx_internal_error(&pbi->common.error, VPX_CODEC_CORRUPT_FRAME,
+                           "Partitions missing");
     assert(vp8dx_bool_error(&pbi->bc) ||
            multi_token_partition == pbi->common.multi_token_partition);
     if (pbi->num_partitions > 2)
@@ -734,12 +740,14 @@ int vp8_decode_frame(VP8D_COMP *pbi)
         pc->show_frame = (data[0] >> 4) & 1;
         first_partition_length_in_bytes =
             (data[0] | (data[1] << 8) | (data[2] << 16)) >> 5;
-        data += 3;
 
         if (!pbi->ec_active && (data + first_partition_length_in_bytes > data_end
             || data + first_partition_length_in_bytes < data))
             vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
                                "Truncated packet or corrupt partition 0 length");
+
+        data += 3;
+
         vp8_setup_version(pc);
 
         if (pc->frame_type == KEY_FRAME)
@@ -812,7 +820,8 @@ int vp8_decode_frame(VP8D_COMP *pbi)
         }
     }
 
-    if (pc->Width == 0 || pc->Height == 0)
+    if ((!pbi->decoded_key_frame && pc->frame_type != KEY_FRAME) ||
+        pc->Width == 0 || pc->Height == 0)
     {
         return -1;
     }
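
The check added to setup_token_decoder_partition_input() above ties the two-bit multi_token_partition field from the frame header to the number of partition buffers the application has supplied. A minimal sketch of that relation, using the names from the hunk (expected_partitions is an illustrative helper, not a libvpx function):

    /* A frame carries partition 0 plus 2^multi_token_partition token
     * partitions, so at least this many buffers must have been received
     * before setup_token_decoder_partition_input() runs. */
    static int expected_partitions(int multi_token_partition)
    {
        return (1 << multi_token_partition) + 1;   /* 2, 3, 5 or 9 */
    }

With fewer buffers than this the new code raises VPX_CODEC_CORRUPT_FRAME ("Partitions missing") instead of indexing past the supplied partitions.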
index db6528c..357684a 100644 (file)
@@ -324,16 +324,16 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign
         /* Store a pointer to this partition and return. We haven't
          * received the complete frame yet, so we will wait with decoding.
          */
+        assert(pbi->num_partitions < MAX_PARTITIONS);
         pbi->partitions[pbi->num_partitions] = source;
         pbi->partition_sizes[pbi->num_partitions] = size;
         pbi->source_sz += size;
         pbi->num_partitions++;
-        if (pbi->num_partitions > (1<<pbi->common.multi_token_partition) + 1)
-            pbi->common.multi_token_partition++;
-        if (pbi->common.multi_token_partition > EIGHT_PARTITION)
+        if (pbi->num_partitions > (1 << EIGHT_PARTITION) + 1)
         {
             pbi->common.error.error_code = VPX_CODEC_UNSUP_BITSTREAM;
             pbi->common.error.setjmp = 0;
+            pbi->num_partitions = 0;
             return -1;
         }
         return 0;
@@ -345,6 +345,25 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign
             pbi->Source = source;
             pbi->source_sz = size;
         }
+        else
+        {
+            assert(pbi->common.multi_token_partition <= EIGHT_PARTITION);
+            if (pbi->num_partitions == 0)
+            {
+                pbi->num_partitions = 1;
+                pbi->partitions[0] = NULL;
+                pbi->partition_sizes[0] = 0;
+            }
+            while (pbi->num_partitions < (1 << pbi->common.multi_token_partition) + 1)
+            {
+                // Reset all missing partitions
+                pbi->partitions[pbi->num_partitions] =
+                    pbi->partitions[pbi->num_partitions - 1] +
+                    pbi->partition_sizes[pbi->num_partitions - 1];
+                pbi->partition_sizes[pbi->num_partitions] = 0;
+                pbi->num_partitions++;
+            }
+        }
 
         if (pbi->source_sz == 0)
         {
@@ -364,8 +383,6 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign
                 cm->show_frame = 0;
 
                 pbi->num_partitions = 0;
-                if (pbi->input_partition)
-                    pbi->common.multi_token_partition = 0;
 
                 /* Nothing more to do. */
                 return 0;
@@ -396,8 +413,6 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign
             pbi->common.error.setjmp = 0;
 
             pbi->num_partitions = 0;
-            if (pbi->input_partition)
-                pbi->common.multi_token_partition = 0;
 
            /* We do not know if the missing frame(s) was supposed to update
             * any of the reference buffers, but we act conservative and
@@ -427,6 +442,7 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign
 #endif
         pbi->common.error.error_code = VPX_CODEC_ERROR;
         pbi->common.error.setjmp = 0;
+        pbi->num_partitions = 0;
         if (cm->fb_idx_ref_cnt[cm->new_fb_idx] > 0)
           cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
         return retcode;
@@ -447,6 +463,7 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign
 #endif
             pbi->common.error.error_code = VPX_CODEC_ERROR;
             pbi->common.error.setjmp = 0;
+            pbi->num_partitions = 0;
             return -1;
         }
     } else
@@ -464,6 +481,7 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign
 #endif
             pbi->common.error.error_code = VPX_CODEC_ERROR;
             pbi->common.error.setjmp = 0;
+            pbi->num_partitions = 0;
             return -1;
         }
 
@@ -508,8 +526,6 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign
     pbi->ready_for_new_data = 0;
     pbi->last_time_stamp = time_stamp;
     pbi->num_partitions = 0;
-    if (pbi->input_partition)
-        pbi->common.multi_token_partition = 0;
     pbi->source_sz = 0;
 
 #if 0
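
The new else branch in vp8dx_receive_compressed_data() back-fills any token partitions that were never received, presumably so the per-partition boolean decoders report an error via vp8dx_bool_error() instead of reading stale data. A standalone sketch of the back-fill, with illustrative names rather than the pbi fields:

    /* Missing partitions become zero-length partitions that start where
     * the previous one ended. */
    static void backfill_partitions(const unsigned char *parts[],
                                    unsigned int sizes[],
                                    int *num_parts, int expected_parts)
    {
        while (*num_parts < expected_parts)
        {
            parts[*num_parts] = parts[*num_parts - 1] + sizes[*num_parts - 1];
            sizes[*num_parts] = 0;
            (*num_parts)++;
        }
    }

Here expected_parts corresponds to (1 << pbi->common.multi_token_partition) + 1 in the hunk above.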
index a8bd087..bfc6007 100644 (file)
@@ -104,7 +104,7 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m
     {
         vp8_reset_mb_tokens_context(xd);
     }
-    else
+    else if (!vp8dx_bool_error(xd->current_bc))
     {
         eobtotal = vp8_decode_mb_tokens(pbi, xd);
     }
@@ -169,7 +169,7 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m
 #if CONFIG_ERROR_CONCEALMENT
     if (pbi->ec_active &&
         (mb_row * pbi->common.mb_cols + mb_col >= pbi->mvs_corrupt_from_mb ||
-        throw_residual))
+         throw_residual))
     {
         /* MB with corrupt residuals or corrupt mode/motion vectors.
          * Better to use the predictor as reconstruction.
index 36791e4..081775b 100644 (file)
@@ -58,10 +58,10 @@ void vp8_arch_arm_encoder_init(VP8_COMP *cpi)
 
         /*cpi->rtcd.variance.get4x4sse_cs          = vp8_get4x4sse_cs_c;*/
 
-        /*cpi->rtcd.fdct.short4x4                  = vp8_short_fdct4x4_c;
-        cpi->rtcd.fdct.short8x4                  = vp8_short_fdct8x4_c;*/
-        cpi->rtcd.fdct.fast4x4                   = vp8_fast_fdct4x4_armv6;
-        cpi->rtcd.fdct.fast8x4                   = vp8_fast_fdct8x4_armv6;
+        cpi->rtcd.fdct.short4x4                  = vp8_short_fdct4x4_armv6;
+        cpi->rtcd.fdct.short8x4                  = vp8_short_fdct8x4_armv6;
+        cpi->rtcd.fdct.fast4x4                   = vp8_short_fdct4x4_armv6;
+        cpi->rtcd.fdct.fast8x4                   = vp8_short_fdct8x4_armv6;
         cpi->rtcd.fdct.walsh_short4x4            = vp8_short_walsh4x4_armv6;
 
         /*cpi->rtcd.encodemb.berr                  = vp8_block_error_c;
index 3c05f57..138ed46 100644 (file)
     sub     r7, r5, #1                  ; range-1
 
     cmp     r1, #0
-    mul     r4, r4, r7                  ; ((range-1) * probability)
+    mul     r6, r4, r7                  ; ((range-1) * probability)
 
     mov     r7, #1
-    add     r4, r7, r4, lsr #8          ; 1 + (((range-1) * probability) >> 8)
+    add     r4, r7, r6, lsr #8          ; 1 + (((range-1) * probability) >> 8)
 
     addne   r2, r2, r4                  ; if  (bit) lowvalue += split
     subne   r4, r5, r4                  ; if  (bit) range = range-split
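
These ARMv5TE hunks only change which register receives the product: on pre-ARMv6 cores, MUL with the same destination and first source register (mul r4, r4, r7) is architecturally unpredictable, so the product now lands in r6. The quantity being computed is the boolean encoder's split point, sketched here in C using the names from the inline comments (bool_split is an illustrative helper, not the boolhuff API):

    /* split = 1 + (((range - 1) * probability) >> 8), which stays in
     * [1, range-1] for VP8's 8-bit probabilities. */
    static unsigned int bool_split(unsigned int range, unsigned int prob)
    {
        return 1 + (((range - 1) * prob) >> 8);
    }

As the conditional addne/subne lines show, a set bit then advances lowvalue by split and shrinks range by split, while for a clear bit the new range is split itself.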
index d939287..933717c 100644 (file)
@@ -71,7 +71,7 @@ token_loop
     ; off of v, so set a flag here based on this.
     ; This value is refered to as "bb"
     lsls    r12, r12, #1                ; bb = v >> n
-    mul     r4, r4, r7                  ; ((range-1) * pp[i>>1]))
+    mul     r6, r4, r7                  ; ((range-1) * pp[i>>1]))
 
     ; bb can only be 0 or 1.  So only execute this statement
     ; if bb == 1, otherwise it will act like i + 0
@@ -79,7 +79,7 @@ token_loop
 
     mov     r7, #1
     ldrsb   lr, [r10, lr]               ; i = vp8_coef_tree[i+bb]
-    add     r4, r7, r4, lsr #8          ; 1 + (((range-1) * pp[i>>1]) >> 8)
+    add     r4, r7, r6, lsr #8          ; 1 + (((range-1) * pp[i>>1]) >> 8)
 
     addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
     subcs   r4, r5, r4                  ; if  (bb) range = range-split
@@ -172,12 +172,12 @@ extra_bits_loop
     ldrb    r4, [r9, lr, asr #1]            ; pp[i>>1]
     sub     r7, r5, #1                  ; range-1
     lsls    r12, r12, #1                ; v >> n
-    mul     r4, r4, r7                  ; (range-1) * pp[i>>1]
+    mul     r6, r4, r7                  ; (range-1) * pp[i>>1]
     addcs   lr, lr, #1                  ; i + bb
 
     mov     r7, #1
     ldrsb   lr, [r10, lr]               ; i = b->tree[i+bb]
-    add     r4, r7, r4, lsr #8          ; split = 1 +  (((range-1) * pp[i>>1]) >> 8)
+    add     r4, r7, r6, lsr #8          ; split = 1 +  (((range-1) * pp[i>>1]) >> 8)
 
     addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
     subcs   r4, r5, r4                  ; if  (bb) range = range-split
index ac2bba6..82bf71f 100644 (file)
@@ -93,7 +93,7 @@ token_loop
     ; off of v, so set a flag here based on this.
     ; This value is refered to as "bb"
     lsls    r12, r12, #1                ; bb = v >> n
-    mul     r4, r4, r7                  ; ((range-1) * pp[i>>1]))
+    mul     r6, r4, r7                  ; ((range-1) * pp[i>>1]))
 
     ; bb can only be 0 or 1.  So only execute this statement
     ; if bb == 1, otherwise it will act like i + 0
@@ -101,7 +101,7 @@ token_loop
 
     mov     r7, #1
     ldrsb   lr, [r10, lr]               ; i = vp8_coef_tree[i+bb]
-    add     r4, r7, r4, lsr #8          ; 1 + (((range-1) * pp[i>>1]) >> 8)
+    add     r4, r7, r6, lsr #8          ; 1 + (((range-1) * pp[i>>1]) >> 8)
 
     addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
     subcs   r4, r5, r4                  ; if  (bb) range = range-split
@@ -194,12 +194,12 @@ extra_bits_loop
     ldrb    r4, [r9, lr, asr #1]            ; pp[i>>1]
     sub     r7, r5, #1                  ; range-1
     lsls    r12, r12, #1                ; v >> n
-    mul     r4, r4, r7                  ; (range-1) * pp[i>>1]
+    mul     r6, r4, r7                  ; (range-1) * pp[i>>1]
     addcs   lr, lr, #1                  ; i + bb
 
     mov     r7, #1
     ldrsb   lr, [r10, lr]               ; i = b->tree[i+bb]
-    add     r4, r7, r4, lsr #8          ; split = 1 +  (((range-1) * pp[i>>1]) >> 8)
+    add     r4, r7, r6, lsr #8          ; split = 1 +  (((range-1) * pp[i>>1]) >> 8)
 
     addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
     subcs   r4, r5, r4                  ; if  (bb) range = range-split
index c2eccdb..c00375e 100644 (file)
@@ -123,7 +123,7 @@ token_loop
     ; off of v, so set a flag here based on this.
     ; This value is refered to as "bb"
     lsls    r12, r12, #1                ; bb = v >> n
-    mul     r4, r4, r7                  ; ((range-1) * pp[i>>1]))
+    mul     r6, r4, r7                  ; ((range-1) * pp[i>>1]))
 
     ; bb can only be 0 or 1.  So only execute this statement
     ; if bb == 1, otherwise it will act like i + 0
@@ -131,7 +131,7 @@ token_loop
 
     mov     r7, #1
     ldrsb   lr, [r10, lr]               ; i = vp8_coef_tree[i+bb]
-    add     r4, r7, r4, lsr #8          ; 1 + (((range-1) * pp[i>>1]) >> 8)
+    add     r4, r7, r6, lsr #8          ; 1 + (((range-1) * pp[i>>1]) >> 8)
 
     addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
     subcs   r4, r5, r4                  ; if  (bb) range = range-split
@@ -224,12 +224,12 @@ extra_bits_loop
     ldrb    r4, [r9, lr, asr #1]        ; pp[i>>1]
     sub     r7, r5, #1                  ; range-1
     lsls    r12, r12, #1                ; v >> n
-    mul     r4, r4, r7                  ; (range-1) * pp[i>>1]
+    mul     r6, r4, r7                  ; (range-1) * pp[i>>1]
     addcs   lr, lr, #1                  ; i + bb
 
     mov     r7, #1
     ldrsb   lr, [r10, lr]               ; i = b->tree[i+bb]
-    add     r4, r7, r4, lsr #8          ; split = 1 +  (((range-1) * pp[i>>1]) >> 8)
+    add     r4, r7, r6, lsr #8          ; split = 1 +  (((range-1) * pp[i>>1]) >> 8)
 
     addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
     subcs   r4, r5, r4                  ; if  (bb) range = range-split
@@ -8,7 +8,7 @@
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 
-    EXPORT |vp8_fast_fdct4x4_armv6|
+    EXPORT |vp8_short_fdct4x4_armv6|
 
     ARM
     REQUIRE8
@@ -16,7 +16,7 @@
 
     AREA    |.text|, CODE, READONLY
 ; void vp8_short_fdct4x4_c(short *input, short *output, int pitch)
-|vp8_fast_fdct4x4_armv6| PROC
+|vp8_short_fdct4x4_armv6| PROC
 
     stmfd       sp!, {r4 - r12, lr}
 
index 61ffdb3..5eaf3f2 100644 (file)
     AREA    |.text|, CODE, READONLY  ; name this block of code
 
 ;short vp8_short_walsh4x4_armv6(short *input, short *output, int pitch)
+; r0    short *input,
+; r1    short *output,
+; r2    int pitch
 |vp8_short_walsh4x4_armv6| PROC
 
     stmdb       sp!, {r4 - r11, lr}
 
-    mov         r12, r2              ; ugh. not clean
-    ldr         r2, [r0]             ; [1  |  0]
-    ldr         r3, [r0, #4]         ; [3  |  2]
-    ldr         r4, [r0, r12]!       ; [5  |  4]
-    ldr         r5, [r0, #4]         ; [7  |  6]
-    ldr         r6, [r0, r12]!       ; [9  |  8]
-    ldr         r7, [r0, #4]         ; [11 | 10]
-    ldr         r8, [r0, r12]!       ; [13 | 12]
-    ldr         r9, [r0, #4]         ; [15 | 14]
-
-    qsubaddx    r10, r2, r3          ; [c1|a1] [1-2   |   0+3]
-    qaddsubx    r11, r2, r3          ; [b1|d1] [1+2   |   0-3]
-    qsubaddx    r12, r4, r5          ; [c1|a1] [5-6   |   4+7]
-    qaddsubx    lr, r4, r5           ; [b1|d1] [5+6   |   4-7]
-
-    qaddsubx    r2, r10, r11         ; [1 | 2] [c1+d1 | a1-b1]
-    qaddsubx    r3, r11, r10         ; [0 | 3] [b1+a1 | d1-c1]
-    qaddsubx    r4, r12, lr          ; [5 | 6] [c1+d1 | a1-b1]
-    qaddsubx    r5, lr, r12          ; [4 | 7] [b1+a1 | d1-c1]
-
-    qsubaddx    r10, r6, r7          ; [c1|a1] [9-10  |  8+11]
-    qaddsubx    r11, r6, r7          ; [b1|d1] [9+10  |  8-11]
-    qsubaddx    r12, r8, r9          ; [c1|a1] [13-14 | 12+15]
-    qaddsubx    lr, r8, r9           ; [b1|d1] [13+14 | 12-15]
-
-    qaddsubx    r6, r10, r11         ; [9 |10] [c1+d1 | a1-b1]
-    qaddsubx    r7, r11, r10         ; [8 |11] [b1+a1 | d1-c1]
-    qaddsubx    r8, r12, lr          ; [13|14] [c1+d1 | a1-b1]
-    qaddsubx    r9, lr, r12          ; [12|15] [b1+a1 | d1-c1]
-
-    ; first transform complete
-
-    qadd16      r10, r3, r9          ; a1 [0+12  |  3+15]
-    qadd16      r11, r5, r7          ; b1 [4+8   |  7+11]
-    qsub16      r12, r5, r7          ; c1 [4-8   |  7-11]
-    qsub16      lr, r3, r9           ; d1 [0-12  |  3-15]
-
-    qadd16      r3, r10, r11         ; a2 [a1+b1] [0 | 3]
-    qadd16      r5, r12, lr          ; b2 [c1+d1] [4 | 7]
-    qsub16      r7, r10, r11         ; c2 [a1-b1] [8 |11]
-    qsub16      r9, lr, r12          ; d2 [d1-c1] [12|15]
-
-    qadd16      r10, r2, r8          ; a1 [1+13  |  2+14]
-    qadd16      r11, r4, r6          ; b1 [5+9   |  6+10]
-    qsub16      r12, r4, r6          ; c1 [5-9   |  6-10]
-    qsub16      lr, r2, r8           ; d1 [1-13  |  2-14]
-
-    qadd16      r2, r10, r11         ; a2 [a1+b1] [1 | 2]
-    qadd16      r4, r12, lr          ; b2 [c1+d1] [5 | 6]
-    qsub16      r6, r10, r11         ; c2 [a1-b1] [9 |10]
-    qsub16      r8, lr, r12          ; d2 [d1-c1] [13|14]
-
-    ; [a-d]2 += ([a-d]2 > 0)
-
-    asrs        r10, r3, #16
-    addpl       r10, r10, #1         ; [~0]
-    asrs        r11, r2, #16
-    addpl       r11, r11, #1         ; [~1]
-    lsl         r11, r11, #15        ; [1  |  x]
-    pkhtb       r10, r11, r10, asr #1; [1  |  0]
-    str         r10, [r1], #4
-
-    lsls        r11, r2, #16
-    addpl       r11, r11, #0x10000   ; [~2]
-    lsls        r12, r3, #16
-    addpl       r12, r12, #0x10000   ; [~3]
-    asr         r12, r12, #1         ; [3  |  x]
-    pkhtb       r11, r12, r11, asr #17; [3  |  2]
-    str         r11, [r1], #4
-
-    asrs        r2, r5, #16
-    addpl       r2, r2, #1           ; [~4]
-    asrs        r3, r4, #16
-    addpl       r3, r3, #1           ; [~5]
-    lsl         r3, r3, #15          ; [5  |  x]
-    pkhtb       r2, r3, r2, asr #1   ; [5  |  4]
-    str         r2, [r1], #4
-
-    lsls        r2, r4, #16
-    addpl       r2, r2, #0x10000     ; [~6]
-    lsls        r3, r5, #16
-    addpl       r3, r3, #0x10000     ; [~7]
-    asr         r3, r3, #1           ; [7  |  x]
-    pkhtb       r2, r3, r2, asr #17  ; [7  |  6]
-    str         r2, [r1], #4
-
-    asrs        r2, r7, #16
-    addpl       r2, r2, #1           ; [~8]
-    asrs        r3, r6, #16
-    addpl       r3, r3, #1           ; [~9]
-    lsl         r3, r3, #15          ; [9  |  x]
-    pkhtb       r2, r3, r2, asr #1   ; [9  |  8]
-    str         r2, [r1], #4
-
-    lsls        r2, r6, #16
-    addpl       r2, r2, #0x10000     ; [~10]
-    lsls        r3, r7, #16
-    addpl       r3, r3, #0x10000     ; [~11]
-    asr         r3, r3, #1           ; [11 |  x]
-    pkhtb       r2, r3, r2, asr #17  ; [11 | 10]
-    str         r2, [r1], #4
-
-    asrs        r2, r9, #16
-    addpl       r2, r2, #1           ; [~12]
-    asrs        r3, r8, #16
-    addpl       r3, r3, #1           ; [~13]
-    lsl         r3, r3, #15          ; [13 |  x]
-    pkhtb       r2, r3, r2, asr #1   ; [13 | 12]
-    str         r2, [r1], #4
-
-    lsls        r2, r8, #16
-    addpl       r2, r2, #0x10000     ; [~14]
-    lsls        r3, r9, #16
-    addpl       r3, r3, #0x10000     ; [~15]
-    asr         r3, r3, #1           ; [15 |  x]
-    pkhtb       r2, r3, r2, asr #17  ; [15 | 14]
-    str         r2, [r1]
+    ldrd        r4, r5, [r0], r2
+    ldr         lr, c00040004
+    ldrd        r6, r7, [r0], r2
+
+    ; 0-3
+    qadd16      r3, r4, r5          ; [d1|a1] [1+3   |   0+2]
+    qsub16      r4, r4, r5          ; [c1|b1] [1-3   |   0-2]
+
+    ldrd        r8, r9, [r0], r2
+    ; 4-7
+    qadd16      r5, r6, r7          ; [d1|a1] [5+7   |   4+6]
+    qsub16      r6, r6, r7          ; [c1|b1] [5-7   |   4-6]
+
+    ldrd        r10, r11, [r0]
+    ; 8-11
+    qadd16      r7, r8, r9          ; [d1|a1] [9+11  |  8+10]
+    qsub16      r8, r8, r9          ; [c1|b1] [9-11  |  8-10]
+
+    ; 12-15
+    qadd16      r9, r10, r11        ; [d1|a1] [13+15 | 12+14]
+    qsub16      r10, r10, r11       ; [c1|b1] [13-15 | 12-14]
+
+
+    lsls        r2, r3, #16
+    smuad       r11, r3, lr         ; A0 = a1<<2 + d1<<2
+    addne       r11, r11, #1        ; A0 += (a1!=0)
+
+    lsls        r2, r7, #16
+    smuad       r12, r7, lr         ; C0 = a1<<2 + d1<<2
+    addne       r12, r12, #1        ; C0 += (a1!=0)
+
+    add         r0, r11, r12        ; a1_0 = A0 + C0
+    sub         r11, r11, r12       ; b1_0 = A0 - C0
+
+    lsls        r2, r5, #16
+    smuad       r12, r5, lr         ; B0 = a1<<2 + d1<<2
+    addne       r12, r12, #1        ; B0 += (a1!=0)
+
+    lsls        r2, r9, #16
+    smuad       r2, r9, lr          ; D0 = a1<<2 + d1<<2
+    addne       r2, r2, #1          ; D0 += (a1!=0)
+
+    add         lr, r12, r2         ; d1_0 = B0 + D0
+    sub         r12, r12, r2        ; c1_0 = B0 - D0
+
+    ; op[0,4,8,12]
+    adds        r2, r0, lr          ; a2 = a1_0 + d1_0
+    addmi       r2, r2, #1          ; += a2 < 0
+    add         r2, r2, #3          ; += 3
+    subs        r0, r0, lr          ; d2 = a1_0 - d1_0
+    mov         r2, r2, asr #3      ; >> 3
+    strh        r2, [r1]            ; op[0]
+
+    addmi       r0, r0, #1          ; += a2 < 0
+    add         r0, r0, #3          ; += 3
+    ldr         lr, c00040004
+    mov         r0, r0, asr #3      ; >> 3
+    strh        r0, [r1, #24]       ; op[12]
+
+    adds        r2, r11, r12        ; b2 = b1_0 + c1_0
+    addmi       r2, r2, #1          ; += a2 < 0
+    add         r2, r2, #3          ; += 3
+    subs        r0, r11, r12        ; c2 = b1_0 - c1_0
+    mov         r2, r2, asr #3      ; >> 3
+    strh        r2, [r1, #8]        ; op[4]
+
+    addmi       r0, r0, #1          ; += a2 < 0
+    add         r0, r0, #3          ; += 3
+    smusd       r3, r3, lr          ; A3 = a1<<2 - d1<<2
+    smusd       r7, r7, lr          ; C3 = a1<<2 - d1<<2
+    mov         r0, r0, asr #3      ; >> 3
+    strh        r0, [r1, #16]       ; op[8]
+
+
+    ; op[3,7,11,15]
+    add         r0, r3, r7          ; a1_3 = A3 + C3
+    sub         r3, r3, r7          ; b1_3 = A3 - C3
+
+    smusd       r5, r5, lr          ; B3 = a1<<2 - d1<<2
+    smusd       r9, r9, lr          ; D3 = a1<<2 - d1<<2
+    add         r7, r5, r9          ; d1_3 = B3 + D3
+    sub         r5, r5, r9          ; c1_3 = B3 - D3
+
+    adds        r2, r0, r7          ; a2 = a1_3 + d1_3
+    addmi       r2, r2, #1          ; += a2 < 0
+    add         r2, r2, #3          ; += 3
+    adds        r9, r3, r5          ; b2 = b1_3 + c1_3
+    mov         r2, r2, asr #3      ; >> 3
+    strh        r2, [r1, #6]        ; op[3]
+
+    addmi       r9, r9, #1          ; += a2 < 0
+    add         r9, r9, #3          ; += 3
+    subs        r2, r3, r5          ; c2 = b1_3 - c1_3
+    mov         r9, r9, asr #3      ; >> 3
+    strh        r9, [r1, #14]       ; op[7]
+
+    addmi       r2, r2, #1          ; += a2 < 0
+    add         r2, r2, #3          ; += 3
+    subs        r9, r0, r7          ; d2 = a1_3 - d1_3
+    mov         r2, r2, asr #3      ; >> 3
+    strh        r2, [r1, #22]       ; op[11]
+
+    addmi       r9, r9, #1          ; += a2 < 0
+    add         r9, r9, #3          ; += 3
+    smuad       r3, r4, lr          ; A1 = b1<<2 + c1<<2
+    smuad       r5, r8, lr          ; C1 = b1<<2 + c1<<2
+    mov         r9, r9, asr #3      ; >> 3
+    strh        r9, [r1, #30]       ; op[15]
+
+    ; op[1,5,9,13]
+    add         r0, r3, r5          ; a1_1 = A1 + C1
+    sub         r3, r3, r5          ; b1_1 = A1 - C1
+
+    smuad       r7, r6, lr          ; B1 = b1<<2 + c1<<2
+    smuad       r9, r10, lr         ; D1 = b1<<2 + c1<<2
+    add         r5, r7, r9          ; d1_1 = B1 + D1
+    sub         r7, r7, r9          ; c1_1 = B1 - D1
+
+    adds        r2, r0, r5          ; a2 = a1_1 + d1_1
+    addmi       r2, r2, #1          ; += a2 < 0
+    add         r2, r2, #3          ; += 3
+    adds        r9, r3, r7          ; b2 = b1_1 + c1_1
+    mov         r2, r2, asr #3      ; >> 3
+    strh        r2, [r1, #2]        ; op[1]
+
+    addmi       r9, r9, #1          ; += a2 < 0
+    add         r9, r9, #3          ; += 3
+    subs        r2, r3, r7          ; c2 = b1_1 - c1_1
+    mov         r9, r9, asr #3      ; >> 3
+    strh        r9, [r1, #10]       ; op[5]
+
+    addmi       r2, r2, #1          ; += a2 < 0
+    add         r2, r2, #3          ; += 3
+    subs        r9, r0, r5          ; d2 = a1_1 - d1_1
+    mov         r2, r2, asr #3      ; >> 3
+    strh        r2, [r1, #18]       ; op[9]
+
+    addmi       r9, r9, #1          ; += a2 < 0
+    add         r9, r9, #3          ; += 3
+    smusd       r4, r4, lr          ; A2 = b1<<2 - c1<<2
+    smusd       r8, r8, lr          ; C2 = b1<<2 - c1<<2
+    mov         r9, r9, asr #3      ; >> 3
+    strh        r9, [r1, #26]       ; op[13]
+
+
+    ; op[2,6,10,14]
+    add         r11, r4, r8         ; a1_2 = A2 + C2
+    sub         r12, r4, r8         ; b1_2 = A2 - C2
+
+    smusd       r6, r6, lr          ; B2 = b1<<2 - c1<<2
+    smusd       r10, r10, lr        ; D2 = b1<<2 - c1<<2
+    add         r4, r6, r10         ; d1_2 = B2 + D2
+    sub         r8, r6, r10         ; c1_2 = B2 - D2
+
+    adds        r2, r11, r4         ; a2 = a1_2 + d1_2
+    addmi       r2, r2, #1          ; += a2 < 0
+    add         r2, r2, #3          ; += 3
+    adds        r9, r12, r8         ; b2 = b1_2 + c1_2
+    mov         r2, r2, asr #3      ; >> 3
+    strh        r2, [r1, #4]        ; op[2]
+
+    addmi       r9, r9, #1          ; += a2 < 0
+    add         r9, r9, #3          ; += 3
+    subs        r2, r12, r8         ; c2 = b1_2 - c1_2
+    mov         r9, r9, asr #3      ; >> 3
+    strh        r9, [r1, #12]       ; op[6]
+
+    addmi       r2, r2, #1          ; += a2 < 0
+    add         r2, r2, #3          ; += 3
+    subs        r9, r11, r4         ; d2 = a1_2 - d1_2
+    mov         r2, r2, asr #3      ; >> 3
+    strh        r2, [r1, #20]       ; op[10]
+
+    addmi       r9, r9, #1          ; += a2 < 0
+    add         r9, r9, #3          ; += 3
+    mov         r9, r9, asr #3      ; >> 3
+    strh        r9, [r1, #28]       ; op[14]
+
 
     ldmia       sp!, {r4 - r11, pc}
     ENDP        ; |vp8_short_walsh4x4_armv6|
 
+c00040004
+    DCD         0x00040004
+
     END
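
The rewritten ARMv6 kernel follows the current C Walsh-Hadamard transform rather than the older "fast" variant. The reference routine, reconstructed from the inline comments above (a sketch, not the verbatim libvpx C source; short_walsh4x4_ref is an illustrative name, and pitch is in bytes as in the assembly, so rows are pitch/2 shorts apart):

    void short_walsh4x4_ref(short *input, short *output, int pitch)
    {
        int i;
        short *ip = input;
        short *op = output;

        for (i = 0; i < 4; i++)              /* first pass: rows */
        {
            int a1 = (ip[0] + ip[2]) << 2;
            int d1 = (ip[1] + ip[3]) << 2;
            int c1 = (ip[1] - ip[3]) << 2;
            int b1 = (ip[0] - ip[2]) << 2;

            op[0] = a1 + d1 + (a1 != 0);
            op[1] = b1 + c1;
            op[2] = b1 - c1;
            op[3] = a1 - d1;

            ip += pitch / 2;
            op += 4;
        }

        op = output;
        for (i = 0; i < 4; i++)              /* second pass: columns */
        {
            int a1 = op[0] + op[8];
            int d1 = op[4] + op[12];
            int c1 = op[4] - op[12];
            int b1 = op[0] - op[8];

            int a2 = a1 + d1;
            int b2 = b1 + c1;
            int c2 = b1 - c1;
            int d2 = a1 - d1;

            a2 += a2 < 0;                    /* bias negatives before rounding */
            b2 += b2 < 0;
            c2 += c2 < 0;
            d2 += d2 < 0;

            op[0]  = (a2 + 3) >> 3;
            op[4]  = (b2 + 3) >> 3;
            op[8]  = (c2 + 3) >> 3;
            op[12] = (d2 + 3) >> 3;

            op++;
        }
    }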
index 60d649d..2692acb 100644 (file)
 
 #if HAVE_ARMV6
 
-void vp8_fast_fdct8x4_armv6(short *input, short *output, int pitch)
+void vp8_short_fdct8x4_armv6(short *input, short *output, int pitch)
 {
-    vp8_fast_fdct4x4_armv6(input,   output,    pitch);
-    vp8_fast_fdct4x4_armv6(input + 4, output + 16, pitch);
+    vp8_short_fdct4x4_armv6(input,   output,    pitch);
+    vp8_short_fdct4x4_armv6(input + 4, output + 16, pitch);
 }
 
 #endif /* HAVE_ARMV6 */
-
-
index 05b807d..db553c4 100644 (file)
 
 #if HAVE_ARMV6
 extern prototype_fdct(vp8_short_walsh4x4_armv6);
-extern prototype_fdct(vp8_fast_fdct4x4_armv6);
-extern prototype_fdct(vp8_fast_fdct8x4_armv6);
+extern prototype_fdct(vp8_short_fdct4x4_armv6);
+extern prototype_fdct(vp8_short_fdct8x4_armv6);
 
 #if !CONFIG_RUNTIME_CPU_DETECT
 #undef  vp8_fdct_walsh_short4x4
 #define vp8_fdct_walsh_short4x4 vp8_short_walsh4x4_armv6
 
+#undef  vp8_fdct_short4x4
+#define vp8_fdct_short4x4 vp8_short_fdct4x4_armv6
+
+#undef  vp8_fdct_short8x4
+#define vp8_fdct_short8x4 vp8_short_fdct8x4_armv6
+
 #undef  vp8_fdct_fast4x4
-#define vp8_fdct_fast4x4 vp8_fast_fdct4x4_armv6
+#define vp8_fdct_fast4x4 vp8_short_fdct4x4_armv6
 
 #undef  vp8_fdct_fast8x4
-#define vp8_fdct_fast8x4 vp8_fast_fdct8x4_armv6
+#define vp8_fdct_fast8x4 vp8_short_fdct8x4_armv6
 #endif
 
 #endif /* HAVE_ARMV6 */
index ba3decf..2226629 100644 (file)
     PRESERVE8
 
     AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vp8_short_walsh4x4_c(short *input, short *output, int pitch)
-
+;void vp8_short_walsh4x4_neon(short *input, short *output, int pitch)
+; r0   short *input,
+; r1   short *output,
+; r2   int pitch
 |vp8_short_walsh4x4_neon| PROC
-    vld1.16         {d2}, [r0], r2              ;load input
-    vld1.16         {d3}, [r0], r2
-    vld1.16         {d4}, [r0], r2
-    vld1.16         {d5}, [r0], r2
 
-    ;First for-loop
-    ;transpose d2, d3, d4, d5. Then, d2=ip[0], d3=ip[1], d4=ip[2], d5=ip[3]
-    vtrn.32         d2, d4
-    vtrn.32         d3, d5
-    vtrn.16         d2, d3
-    vtrn.16         d4, d5
+    vld1.16         {d0}, [r0@64], r2   ; load input
+    vld1.16         {d1}, [r0@64], r2
+    vld1.16         {d2}, [r0@64], r2
+    vld1.16         {d3}, [r0@64]
 
-    vadd.s16        d6, d2, d5              ;a1 = ip[0]+ip[3]
-    vadd.s16        d7, d3, d4              ;b1 = ip[1]+ip[2]
-    vsub.s16        d8, d3, d4              ;c1 = ip[1]-ip[2]
-    vsub.s16        d9, d2, d5              ;d1 = ip[0]-ip[3]
+    ;First for-loop
+    ;transpose d0, d1, d2, d3. Then, d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3]
+    vtrn.32         d0, d2
+    vtrn.32         d1, d3
 
-    vadd.s16        d2, d6, d7             ;op[0] = a1 + b1
-    vsub.s16        d4, d6, d7             ;op[2] = a1 - b1
-    vadd.s16        d3, d8, d9             ;op[1] = c1 + d1
-    vsub.s16        d5, d9, d8             ;op[3] = d1 - c1
+    vmov.s32        q15, #3             ; add 3 to all values
 
-    ;Second for-loop
-    ;transpose d2, d3, d4, d5. Then, d2=ip[0], d3=ip[4], d4=ip[8], d5=ip[12]
-    vtrn.32         d2, d4
-    vtrn.32         d3, d5
+    vtrn.16         d0, d1
     vtrn.16         d2, d3
-    vtrn.16         d4, d5
 
-    vadd.s16        d6, d2, d5              ;a1 = ip[0]+ip[12]
-    vadd.s16        d7, d3, d4              ;b1 = ip[4]+ip[8]
-    vsub.s16        d8, d3, d4              ;c1 = ip[4]-ip[8]
-    vsub.s16        d9, d2, d5              ;d1 = ip[0]-ip[12]
+    vadd.s16        d4, d0, d2          ; ip[0] + ip[2]
+    vadd.s16        d5, d1, d3          ; ip[1] + ip[3]
+    vsub.s16        d6, d1, d3          ; ip[1] - ip[3]
+    vsub.s16        d7, d0, d2          ; ip[0] - ip[2]
 
-    vadd.s16        d2, d6, d7              ;a2 = a1 + b1;
-    vsub.s16        d4, d6, d7              ;c2 = a1 - b1;
-    vadd.s16        d3, d8, d9              ;b2 = c1 + d1;
-    vsub.s16        d5, d9, d8              ;d2 = d1 - c1;
+    vshl.s16        d4, d4, #2          ; a1 = (ip[0] + ip[2]) << 2
+    vshl.s16        d5, d5, #2          ; d1 = (ip[1] + ip[3]) << 2
+    vshl.s16        d6, d6, #2          ; c1 = (ip[1] - ip[3]) << 2
+    vceq.s16        d16, d4, #0         ; a1 == 0
+    vshl.s16        d7, d7, #2          ; b1 = (ip[0] - ip[2]) << 2
 
-    vcgt.s16        q3, q1, #0
-    vcgt.s16        q4, q2, #0
+    vadd.s16        d0, d4, d5          ; a1 + d1
+    vmvn            d16, d16            ; a1 != 0
+    vsub.s16        d3, d4, d5          ; op[3] = a1 - d1
+    vadd.s16        d1, d7, d6          ; op[1] = b1 + c1
+    vsub.s16        d2, d7, d6          ; op[2] = b1 - c1
+    vsub.s16        d0, d0, d16         ; op[0] = a1 + d1 + (a1 != 0)
 
-    vsub.s16        q1, q1, q3
-    vsub.s16        q2, q2, q4
-
-    vshr.s16        q1, q1, #1
-    vshr.s16        q2, q2, #1
-
-    vst1.16         {q1, q2}, [r1]
+    ;Second for-loop
+    ;transpose d0, d1, d2, d3, Then, d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
+    vtrn.32         d1, d3
+    vtrn.32         d0, d2
+    vtrn.16         d2, d3
+    vtrn.16         d0, d1
+
+    vaddl.s16       q8, d0, d2          ; a1 = ip[0]+ip[8]
+    vaddl.s16       q9, d1, d3          ; d1 = ip[4]+ip[12]
+    vsubl.s16       q10, d1, d3         ; c1 = ip[4]-ip[12]
+    vsubl.s16       q11, d0, d2         ; b1 = ip[0]-ip[8]
+
+    vadd.s32        q0, q8, q9          ; a2 = a1 + d1
+    vadd.s32        q1, q11, q10        ; b2 = b1 + c1
+    vsub.s32        q2, q11, q10        ; c2 = b1 - c1
+    vsub.s32        q3, q8, q9          ; d2 = a1 - d1
+
+    vclt.s32        q8, q0, #0
+    vclt.s32        q9, q1, #0
+    vclt.s32        q10, q2, #0
+    vclt.s32        q11, q3, #0
+
+    ; subtract -1 (or 0)
+    vsub.s32        q0, q0, q8          ; a2 += a2 < 0
+    vsub.s32        q1, q1, q9          ; b2 += b2 < 0
+    vsub.s32        q2, q2, q10         ; c2 += c2 < 0
+    vsub.s32        q3, q3, q11         ; d2 += d2 < 0
+
+    vadd.s32        q8, q0, q15         ; a2 + 3
+    vadd.s32        q9, q1, q15         ; b2 + 3
+    vadd.s32        q10, q2, q15        ; c2 + 3
+    vadd.s32        q11, q3, q15        ; d2 + 3
+
+    ; vrshrn? would add 1 << 3-1 = 2
+    vshrn.s32       d0, q8, #3
+    vshrn.s32       d1, q9, #3
+    vshrn.s32       d2, q10, #3
+    vshrn.s32       d3, q11, #3
+
+    vst1.16         {q0, q1}, [r1@128]
 
     bx              lr
 
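The tail of the NEON version implements the same rounding as the C second pass: vclt.s32 yields all-ones (-1) for negative lanes, so the following vsub adds 1 to them before the +3 bias and the arithmetic shift by 3 done with vshrn. A scalar sketch of that rounding step (round_shift3 is an illustrative name):

    /* x += (x < 0); (x + 3) >> 3 -- matches the vclt/vsub/vadd/vshrn
     * sequence above, applied per 32-bit lane. */
    static short round_shift3(int x)
    {
        x += (x < 0);
        return (short)((x + 3) >> 3);
    }
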
index ae8b38c..99b2688 100644 (file)
@@ -36,7 +36,7 @@ VP8_CX_SRCS-$(HAVE_ARMV5TE)  += encoder/arm/armv5te/vp8_packtokens_partitions_ar
 #File list for armv6
 # encoder
 VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_subtract_armv6$(ASM)
-VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_fast_fdct4x4_armv6$(ASM)
+VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_short_fdct4x4_armv6$(ASM)
 VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_fast_quantize_b_armv6$(ASM)
 VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_sad16x16_armv6$(ASM)
 VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_variance16x16_armv6$(ASM)