From: John Koleszar
Date: Thu, 11 Apr 2013 20:29:37 +0000 (-0700)
Subject: Remove unused vp9 ppc files
X-Git-Tag: v1.3.0~1106^2~276^2
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=633d9e7b4f3b8dfe67054a27db33bec3e9699ab0;p=platform%2Fupstream%2Flibvpx.git

Remove unused vp9 ppc files

Change-Id: I3fe8c529ddec658cfa2376cfc05d9c8a5366e978
---

diff --git a/vp9/common/ppc/vp9_copy_altivec.asm b/vp9/common/ppc/vp9_copy_altivec.asm
deleted file mode 100644
index a4ce915..0000000
--- a/vp9/common/ppc/vp9_copy_altivec.asm
+++ /dev/null
@@ -1,47 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl copy_mem16x16_ppc
-
-;# r3 unsigned char *src
-;# r4 int src_stride
-;# r5 unsigned char *dst
-;# r6 int dst_stride
-
-;# Make the assumption that input will not be aligned,
-;#  but the output will be.  So two reads and a perm
-;#  for the input, but only one store for the output.
-copy_mem16x16_ppc:
-    mfspr   r11, 256            ;# get old VRSAVE
-    oris    r12, r11, 0xe000
-    mtspr   256, r12            ;# set VRSAVE
-
-    li      r10, 16
-    mtctr   r10
-
-cp_16x16_loop:
-    lvsl    v0, 0, r3           ;# permutate value for alignment
-
-    lvx     v1, 0, r3
-    lvx     v2, r10, r3
-
-    vperm   v1, v1, v2, v0
-
-    stvx    v1, 0, r5
-
-    add     r3, r3, r4          ;# increment source pointer
-    add     r5, r5, r6          ;# increment destination pointer
-
-    bdnz    cp_16x16_loop
-
-    mtspr   256, r11            ;# reset old VRSAVE
-
-    blr

diff --git a/vp9/common/ppc/vp9_filter_altivec.asm b/vp9/common/ppc/vp9_filter_altivec.asm
deleted file mode 100644
index 4da2e94..0000000
--- a/vp9/common/ppc/vp9_filter_altivec.asm
+++ /dev/null
@@ -1,1013 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-; - - - .globl sixtap_predict_ppc - .globl sixtap_predict8x4_ppc - .globl sixtap_predict8x8_ppc - .globl sixtap_predict16x16_ppc - -.macro load_c V, LABEL, OFF, R0, R1 - lis \R0, \LABEL@ha - la \R1, \LABEL@l(\R0) - lvx \V, \OFF, \R1 -.endm - -.macro load_hfilter V0, V1 - load_c \V0, HFilter, r5, r9, r10 - - addi r5, r5, 16 - lvx \V1, r5, r10 -.endm - -;# Vertical filtering -.macro Vprolog - load_c v0, VFilter, r6, r3, r10 - - vspltish v5, 8 - vspltish v6, 3 - vslh v6, v5, v6 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 - - vspltb v1, v0, 1 - vspltb v2, v0, 2 - vspltb v3, v0, 3 - vspltb v4, v0, 4 - vspltb v5, v0, 5 - vspltb v0, v0, 0 -.endm - -.macro vpre_load - Vprolog - li r10, 16 - lvx v10, 0, r9 ;# v10..v14 = first 5 rows - lvx v11, r10, r9 - addi r9, r9, 32 - lvx v12, 0, r9 - lvx v13, r10, r9 - addi r9, r9, 32 - lvx v14, 0, r9 -.endm - -.macro Msum Re, Ro, V, T, TMP - ;# (Re,Ro) += (V*T) - vmuleub \TMP, \V, \T ;# trashes v8 - vadduhm \Re, \Re, \TMP ;# Re = evens, saturation unnecessary - vmuloub \TMP, \V, \T - vadduhm \Ro, \Ro, \TMP ;# Ro = odds -.endm - -.macro vinterp_no_store P0 P1 P2 P3 P4 P5 - vmuleub v8, \P0, v0 ;# 64 + 4 positive taps - vadduhm v16, v6, v8 - vmuloub v8, \P0, v0 - vadduhm v17, v6, v8 - Msum v16, v17, \P2, v2, v8 - Msum v16, v17, \P3, v3, v8 - Msum v16, v17, \P5, v5, v8 - - vmuleub v18, \P1, v1 ;# 2 negative taps - vmuloub v19, \P1, v1 - Msum v18, v19, \P4, v4, v8 - - vsubuhs v16, v16, v18 ;# subtract neg from pos - vsubuhs v17, v17, v19 - vsrh v16, v16, v7 ;# divide by 128 - vsrh v17, v17, v7 ;# v16 v17 = evens, odds - vmrghh v18, v16, v17 ;# v18 v19 = 16-bit result in order - vmrglh v19, v16, v17 - vpkuhus \P0, v18, v19 ;# P0 = 8-bit result -.endm - -.macro vinterp_no_store_8x8 P0 P1 P2 P3 P4 P5 - vmuleub v24, \P0, v13 ;# 64 + 4 positive taps - vadduhm v21, v20, v24 - vmuloub v24, \P0, v13 - vadduhm v22, v20, v24 - Msum v21, v22, \P2, v15, v25 - Msum v21, v22, \P3, v16, v25 - Msum v21, v22, \P5, v18, v25 - - vmuleub v23, \P1, v14 ;# 2 negative taps - vmuloub v24, \P1, v14 - Msum v23, v24, \P4, v17, v25 - - vsubuhs v21, v21, v23 ;# subtract neg from pos - vsubuhs v22, v22, v24 - vsrh v21, v21, v19 ;# divide by 128 - vsrh v22, v22, v19 ;# v16 v17 = evens, odds - vmrghh v23, v21, v22 ;# v18 v19 = 16-bit result in order - vmrglh v24, v21, v22 - vpkuhus \P0, v23, v24 ;# P0 = 8-bit result -.endm - - -.macro Vinterp P0 P1 P2 P3 P4 P5 - vinterp_no_store \P0, \P1, \P2, \P3, \P4, \P5 - stvx \P0, 0, r7 - add r7, r7, r8 ;# 33 ops per 16 pels -.endm - - -.macro luma_v P0, P1, P2, P3, P4, P5 - addi r9, r9, 16 ;# P5 = newest input row - lvx \P5, 0, r9 - Vinterp \P0, \P1, \P2, \P3, \P4, \P5 -.endm - -.macro luma_vtwo - luma_v v10, v11, v12, v13, v14, v15 - luma_v v11, v12, v13, v14, v15, v10 -.endm - -.macro luma_vfour - luma_vtwo - luma_v v12, v13, v14, v15, v10, v11 - luma_v v13, v14, v15, v10, v11, v12 -.endm - -.macro luma_vsix - luma_vfour - luma_v v14, v15, v10, v11, v12, v13 - luma_v v15, v10, v11, v12, v13, v14 -.endm - -.macro Interp4 R I I4 - vmsummbm \R, v13, \I, v15 - vmsummbm \R, v14, \I4, \R -.endm - -.macro Read8x8 VD, RS, RP, increment_counter - lvsl v21, 0, \RS ;# permutate value for alignment - - ;# input to filter is 21 bytes wide, output is 16 bytes. - ;# input will can span three vectors if not aligned correctly. 
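The Read8x8 macro above is the standard AltiVec unaligned read: lvsl converts the low four address bits into a byte-select vector, two aligned lvx loads fetch the quadwords straddling the target, and vperm splices out the 16 bytes actually wanted. A minimal scalar model of that idiom in C, with invented names and an assumed in-bounds buffer:

    #include <stdio.h>
    #include <string.h>

    /* Model of lvsl + two lvx + vperm: read 16 bytes from an arbitrary
     * offset while touching memory only at 16-byte-aligned addresses. */
    static void load_unaligned16(const unsigned char *mem, size_t pos,
                                 unsigned char out[16]) {
        size_t base = pos & ~(size_t)15;        /* align down; lvx ignores low bits */
        unsigned shift = (unsigned)(pos & 15);  /* the value lvsl encodes           */
        unsigned char lo[16], hi[16];
        memcpy(lo, mem + base, 16);             /* lvx v1, 0, r3                    */
        memcpy(hi, mem + base + 16, 16);        /* lvx v2, r10, r3                  */
        for (int i = 0; i < 16; i++)            /* vperm v1, v1, v2, v0             */
            out[i] = (shift + i < 16) ? lo[shift + i] : hi[shift + i - 16];
    }

    int main(void) {
        unsigned char buf[64], out[16];
        for (int i = 0; i < 64; i++) buf[i] = (unsigned char)i;
        load_unaligned16(buf, 5, out);
        printf("%u %u\n", out[0], out[15]);     /* prints "5 20" */
        return 0;
    }

With pos = 5 the sketch delivers bytes 5..20 while reading only offsets 0 and 16, which is exactly why the asm needs the second lvx whenever the source is unaligned.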
- lvx \VD, 0, \RS - lvx v20, r10, \RS - -.if \increment_counter - add \RS, \RS, \RP -.endif - - vperm \VD, \VD, v20, v21 -.endm - -.macro interp_8x8 R - vperm v20, \R, \R, v16 ;# v20 = 0123 1234 2345 3456 - vperm v21, \R, \R, v17 ;# v21 = 4567 5678 6789 789A - Interp4 v20, v20, v21 ;# v20 = result 0 1 2 3 - vperm \R, \R, \R, v18 ;# R = 89AB 9ABC ABCx BCxx - Interp4 v21, v21, \R ;# v21 = result 4 5 6 7 - - vpkswus \R, v20, v21 ;# R = 0 1 2 3 4 5 6 7 - vsrh \R, \R, v19 - - vpkuhus \R, \R, \R ;# saturate and pack - -.endm - -.macro Read4x4 VD, RS, RP, increment_counter - lvsl v21, 0, \RS ;# permutate value for alignment - - ;# input to filter is 21 bytes wide, output is 16 bytes. - ;# input will can span three vectors if not aligned correctly. - lvx v20, 0, \RS - -.if \increment_counter - add \RS, \RS, \RP -.endif - - vperm \VD, v20, v20, v21 -.endm - .text - - .align 2 -;# r3 unsigned char * src -;# r4 int src_pitch -;# r5 int x_offset -;# r6 int y_offset -;# r7 unsigned char * dst -;# r8 int dst_pitch -sixtap_predict_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xff87 - ori r12, r12, 0xffc0 - mtspr 256, r12 ;# set VRSAVE - - stwu r1,-32(r1) ;# create space on the stack - - slwi. r5, r5, 5 ;# index into horizontal filter array - - vspltish v19, 7 - - ;# If there isn't any filtering to be done for the horizontal, then - ;# just skip to the second pass. - beq- vertical_only_4x4 - - ;# load up horizontal filter - load_hfilter v13, v14 - - ;# rounding added in on the multiply - vspltisw v16, 8 - vspltisw v15, 3 - vslw v15, v16, v15 ;# 0x00000040000000400000004000000040 - - ;# Load up permutation constants - load_c v16, B_0123, 0, r9, r10 - load_c v17, B_4567, 0, r9, r10 - load_c v18, B_89AB, 0, r9, r10 - - ;# Back off input buffer by 2 bytes. Need 2 before and 3 after - addi r3, r3, -2 - - addi r9, r3, 0 - li r10, 16 - Read8x8 v2, r3, r4, 1 - Read8x8 v3, r3, r4, 1 - Read8x8 v4, r3, r4, 1 - Read8x8 v5, r3, r4, 1 - - slwi. r6, r6, 4 ;# index into vertical filter array - - ;# filter a line - interp_8x8 v2 - interp_8x8 v3 - interp_8x8 v4 - interp_8x8 v5 - - ;# Finished filtering main horizontal block. If there is no - ;# vertical filtering, jump to storing the data. Otherwise - ;# load up and filter the additional 5 lines that are needed - ;# for the vertical filter. - beq- store_4x4 - - ;# only needed if there is a vertical filter present - ;# if the second filter is not null then need to back off by 2*pitch - sub r9, r9, r4 - sub r9, r9, r4 - - Read8x8 v0, r9, r4, 1 - Read8x8 v1, r9, r4, 0 - Read8x8 v6, r3, r4, 1 - Read8x8 v7, r3, r4, 1 - Read8x8 v8, r3, r4, 0 - - interp_8x8 v0 - interp_8x8 v1 - interp_8x8 v6 - interp_8x8 v7 - interp_8x8 v8 - - b second_pass_4x4 - -vertical_only_4x4: - ;# only needed if there is a vertical filter present - ;# if the second filter is not null then need to back off by 2*pitch - sub r3, r3, r4 - sub r3, r3, r4 - li r10, 16 - - Read8x8 v0, r3, r4, 1 - Read8x8 v1, r3, r4, 1 - Read8x8 v2, r3, r4, 1 - Read8x8 v3, r3, r4, 1 - Read8x8 v4, r3, r4, 1 - Read8x8 v5, r3, r4, 1 - Read8x8 v6, r3, r4, 1 - Read8x8 v7, r3, r4, 1 - Read8x8 v8, r3, r4, 0 - - slwi r6, r6, 4 ;# index into vertical filter array - -second_pass_4x4: - load_c v20, b_hilo_4x4, 0, r9, r10 - load_c v21, b_hilo, 0, r9, r10 - - ;# reposition input so that it can go through the - ;# filtering phase with one pass. 
- vperm v0, v0, v1, v20 ;# 0 1 x x - vperm v2, v2, v3, v20 ;# 2 3 x x - vperm v4, v4, v5, v20 ;# 4 5 x x - vperm v6, v6, v7, v20 ;# 6 7 x x - - vperm v0, v0, v2, v21 ;# 0 1 2 3 - vperm v4, v4, v6, v21 ;# 4 5 6 7 - - vsldoi v1, v0, v4, 4 - vsldoi v2, v0, v4, 8 - vsldoi v3, v0, v4, 12 - - vsldoi v5, v4, v8, 4 - - load_c v13, VFilter, r6, r9, r10 - - vspltish v15, 8 - vspltish v20, 3 - vslh v20, v15, v20 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 - - vspltb v14, v13, 1 - vspltb v15, v13, 2 - vspltb v16, v13, 3 - vspltb v17, v13, 4 - vspltb v18, v13, 5 - vspltb v13, v13, 0 - - vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5 - - stvx v0, 0, r1 - - lwz r0, 0(r1) - stw r0, 0(r7) - add r7, r7, r8 - - lwz r0, 4(r1) - stw r0, 0(r7) - add r7, r7, r8 - - lwz r0, 8(r1) - stw r0, 0(r7) - add r7, r7, r8 - - lwz r0, 12(r1) - stw r0, 0(r7) - - b exit_4x4 - -store_4x4: - - stvx v2, 0, r1 - lwz r0, 0(r1) - stw r0, 0(r7) - add r7, r7, r8 - - stvx v3, 0, r1 - lwz r0, 0(r1) - stw r0, 0(r7) - add r7, r7, r8 - - stvx v4, 0, r1 - lwz r0, 0(r1) - stw r0, 0(r7) - add r7, r7, r8 - - stvx v5, 0, r1 - lwz r0, 0(r1) - stw r0, 0(r7) - -exit_4x4: - - addi r1, r1, 32 ;# recover stack - - mtspr 256, r11 ;# reset old VRSAVE - - blr - -.macro w_8x8 V, D, R, P - stvx \V, 0, r1 - lwz \R, 0(r1) - stw \R, 0(r7) - lwz \R, 4(r1) - stw \R, 4(r7) - add \D, \D, \P -.endm - - .align 2 -;# r3 unsigned char * src -;# r4 int src_pitch -;# r5 int x_offset -;# r6 int y_offset -;# r7 unsigned char * dst -;# r8 int dst_pitch - -sixtap_predict8x4_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xffff - ori r12, r12, 0xffc0 - mtspr 256, r12 ;# set VRSAVE - - stwu r1,-32(r1) ;# create space on the stack - - slwi. r5, r5, 5 ;# index into horizontal filter array - - vspltish v19, 7 - - ;# If there isn't any filtering to be done for the horizontal, then - ;# just skip to the second pass. - beq- second_pass_pre_copy_8x4 - - load_hfilter v13, v14 - - ;# rounding added in on the multiply - vspltisw v16, 8 - vspltisw v15, 3 - vslw v15, v16, v15 ;# 0x00000040000000400000004000000040 - - ;# Load up permutation constants - load_c v16, B_0123, 0, r9, r10 - load_c v17, B_4567, 0, r9, r10 - load_c v18, B_89AB, 0, r9, r10 - - ;# Back off input buffer by 2 bytes. Need 2 before and 3 after - addi r3, r3, -2 - - addi r9, r3, 0 - li r10, 16 - Read8x8 v2, r3, r4, 1 - Read8x8 v3, r3, r4, 1 - Read8x8 v4, r3, r4, 1 - Read8x8 v5, r3, r4, 1 - - slwi. r6, r6, 4 ;# index into vertical filter array - - ;# filter a line - interp_8x8 v2 - interp_8x8 v3 - interp_8x8 v4 - interp_8x8 v5 - - ;# Finished filtering main horizontal block. If there is no - ;# vertical filtering, jump to storing the data. Otherwise - ;# load up and filter the additional 5 lines that are needed - ;# for the vertical filter. 
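The interp_8x8/vinterp machinery above implements the VP8-style six-tap subpel filter: taps scaled by 128, a rounding constant of 64 folded in, and a final shift right by 7 with unsigned saturation on output. One output pixel of that arithmetic in scalar C, using a tap row that appears in the HFilter table later in this file (the source samples are invented for illustration):

    #include <stdio.h>

    int main(void) {
        /* one row of the HFilter table below: taps for a 1/8-pel offset */
        const int taps[6] = { 0, -6, 123, 12, -1, 0 };   /* sums to 128  */
        const unsigned char src[6] = { 90, 100, 110, 120, 130, 140 };
        int sum = 64;                       /* rounding, the 0x0040 splat */
        for (int i = 0; i < 6; i++)
            sum += taps[i] * src[i];
        int pel = sum >> 7;                 /* divide by 128 (vsrh)       */
        if (pel < 0)   pel = 0;             /* vpkuhus saturates to 0..255 */
        if (pel > 255) pel = 255;
        printf("%d\n", pel);                /* prints 111 */
        return 0;
    }

Because the taps sum to 128, a flat input reproduces itself; the asm gets the same effect by accumulating the positive and negative taps separately and subtracting with saturation.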
- beq- store_8x4 - - ;# only needed if there is a vertical filter present - ;# if the second filter is not null then need to back off by 2*pitch - sub r9, r9, r4 - sub r9, r9, r4 - - Read8x8 v0, r9, r4, 1 - Read8x8 v1, r9, r4, 0 - Read8x8 v6, r3, r4, 1 - Read8x8 v7, r3, r4, 1 - Read8x8 v8, r3, r4, 0 - - interp_8x8 v0 - interp_8x8 v1 - interp_8x8 v6 - interp_8x8 v7 - interp_8x8 v8 - - b second_pass_8x4 - -second_pass_pre_copy_8x4: - ;# only needed if there is a vertical filter present - ;# if the second filter is not null then need to back off by 2*pitch - sub r3, r3, r4 - sub r3, r3, r4 - li r10, 16 - - Read8x8 v0, r3, r4, 1 - Read8x8 v1, r3, r4, 1 - Read8x8 v2, r3, r4, 1 - Read8x8 v3, r3, r4, 1 - Read8x8 v4, r3, r4, 1 - Read8x8 v5, r3, r4, 1 - Read8x8 v6, r3, r4, 1 - Read8x8 v7, r3, r4, 1 - Read8x8 v8, r3, r4, 1 - - slwi r6, r6, 4 ;# index into vertical filter array - -second_pass_8x4: - load_c v13, VFilter, r6, r9, r10 - - vspltish v15, 8 - vspltish v20, 3 - vslh v20, v15, v20 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 - - vspltb v14, v13, 1 - vspltb v15, v13, 2 - vspltb v16, v13, 3 - vspltb v17, v13, 4 - vspltb v18, v13, 5 - vspltb v13, v13, 0 - - vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5 - vinterp_no_store_8x8 v1, v2, v3, v4, v5, v6 - vinterp_no_store_8x8 v2, v3, v4, v5, v6, v7 - vinterp_no_store_8x8 v3, v4, v5, v6, v7, v8 - - cmpi cr0, r8, 8 - beq cr0, store_aligned_8x4 - - w_8x8 v0, r7, r0, r8 - w_8x8 v1, r7, r0, r8 - w_8x8 v2, r7, r0, r8 - w_8x8 v3, r7, r0, r8 - - b exit_8x4 - -store_aligned_8x4: - - load_c v10, b_hilo, 0, r9, r10 - - vperm v0, v0, v1, v10 - vperm v2, v2, v3, v10 - - stvx v0, 0, r7 - addi r7, r7, 16 - stvx v2, 0, r7 - - b exit_8x4 - -store_8x4: - cmpi cr0, r8, 8 - beq cr0, store_aligned2_8x4 - - w_8x8 v2, r7, r0, r8 - w_8x8 v3, r7, r0, r8 - w_8x8 v4, r7, r0, r8 - w_8x8 v5, r7, r0, r8 - - b exit_8x4 - -store_aligned2_8x4: - load_c v10, b_hilo, 0, r9, r10 - - vperm v2, v2, v3, v10 - vperm v4, v4, v5, v10 - - stvx v2, 0, r7 - addi r7, r7, 16 - stvx v4, 0, r7 - -exit_8x4: - - addi r1, r1, 32 ;# recover stack - - mtspr 256, r11 ;# reset old VRSAVE - - - blr - - .align 2 -;# r3 unsigned char * src -;# r4 int src_pitch -;# r5 int x_offset -;# r6 int y_offset -;# r7 unsigned char * dst -;# r8 int dst_pitch - -;# Because the width that needs to be filtered will fit in a single altivec -;# register there is no need to loop. Everything can stay in registers. -sixtap_predict8x8_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xffff - ori r12, r12, 0xffc0 - mtspr 256, r12 ;# set VRSAVE - - stwu r1,-32(r1) ;# create space on the stack - - slwi. r5, r5, 5 ;# index into horizontal filter array - - vspltish v19, 7 - - ;# If there isn't any filtering to be done for the horizontal, then - ;# just skip to the second pass. - beq- second_pass_pre_copy_8x8 - - load_hfilter v13, v14 - - ;# rounding added in on the multiply - vspltisw v16, 8 - vspltisw v15, 3 - vslw v15, v16, v15 ;# 0x00000040000000400000004000000040 - - ;# Load up permutation constants - load_c v16, B_0123, 0, r9, r10 - load_c v17, B_4567, 0, r9, r10 - load_c v18, B_89AB, 0, r9, r10 - - ;# Back off input buffer by 2 bytes. Need 2 before and 3 after - addi r3, r3, -2 - - addi r9, r3, 0 - li r10, 16 - Read8x8 v2, r3, r4, 1 - Read8x8 v3, r3, r4, 1 - Read8x8 v4, r3, r4, 1 - Read8x8 v5, r3, r4, 1 - Read8x8 v6, r3, r4, 1 - Read8x8 v7, r3, r4, 1 - Read8x8 v8, r3, r4, 1 - Read8x8 v9, r3, r4, 1 - - slwi. 
r6, r6, 4 ;# index into vertical filter array - - ;# filter a line - interp_8x8 v2 - interp_8x8 v3 - interp_8x8 v4 - interp_8x8 v5 - interp_8x8 v6 - interp_8x8 v7 - interp_8x8 v8 - interp_8x8 v9 - - ;# Finished filtering main horizontal block. If there is no - ;# vertical filtering, jump to storing the data. Otherwise - ;# load up and filter the additional 5 lines that are needed - ;# for the vertical filter. - beq- store_8x8 - - ;# only needed if there is a vertical filter present - ;# if the second filter is not null then need to back off by 2*pitch - sub r9, r9, r4 - sub r9, r9, r4 - - Read8x8 v0, r9, r4, 1 - Read8x8 v1, r9, r4, 0 - Read8x8 v10, r3, r4, 1 - Read8x8 v11, r3, r4, 1 - Read8x8 v12, r3, r4, 0 - - interp_8x8 v0 - interp_8x8 v1 - interp_8x8 v10 - interp_8x8 v11 - interp_8x8 v12 - - b second_pass_8x8 - -second_pass_pre_copy_8x8: - ;# only needed if there is a vertical filter present - ;# if the second filter is not null then need to back off by 2*pitch - sub r3, r3, r4 - sub r3, r3, r4 - li r10, 16 - - Read8x8 v0, r3, r4, 1 - Read8x8 v1, r3, r4, 1 - Read8x8 v2, r3, r4, 1 - Read8x8 v3, r3, r4, 1 - Read8x8 v4, r3, r4, 1 - Read8x8 v5, r3, r4, 1 - Read8x8 v6, r3, r4, 1 - Read8x8 v7, r3, r4, 1 - Read8x8 v8, r3, r4, 1 - Read8x8 v9, r3, r4, 1 - Read8x8 v10, r3, r4, 1 - Read8x8 v11, r3, r4, 1 - Read8x8 v12, r3, r4, 0 - - slwi r6, r6, 4 ;# index into vertical filter array - -second_pass_8x8: - load_c v13, VFilter, r6, r9, r10 - - vspltish v15, 8 - vspltish v20, 3 - vslh v20, v15, v20 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 - - vspltb v14, v13, 1 - vspltb v15, v13, 2 - vspltb v16, v13, 3 - vspltb v17, v13, 4 - vspltb v18, v13, 5 - vspltb v13, v13, 0 - - vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5 - vinterp_no_store_8x8 v1, v2, v3, v4, v5, v6 - vinterp_no_store_8x8 v2, v3, v4, v5, v6, v7 - vinterp_no_store_8x8 v3, v4, v5, v6, v7, v8 - vinterp_no_store_8x8 v4, v5, v6, v7, v8, v9 - vinterp_no_store_8x8 v5, v6, v7, v8, v9, v10 - vinterp_no_store_8x8 v6, v7, v8, v9, v10, v11 - vinterp_no_store_8x8 v7, v8, v9, v10, v11, v12 - - cmpi cr0, r8, 8 - beq cr0, store_aligned_8x8 - - w_8x8 v0, r7, r0, r8 - w_8x8 v1, r7, r0, r8 - w_8x8 v2, r7, r0, r8 - w_8x8 v3, r7, r0, r8 - w_8x8 v4, r7, r0, r8 - w_8x8 v5, r7, r0, r8 - w_8x8 v6, r7, r0, r8 - w_8x8 v7, r7, r0, r8 - - b exit_8x8 - -store_aligned_8x8: - - load_c v10, b_hilo, 0, r9, r10 - - vperm v0, v0, v1, v10 - vperm v2, v2, v3, v10 - vperm v4, v4, v5, v10 - vperm v6, v6, v7, v10 - - stvx v0, 0, r7 - addi r7, r7, 16 - stvx v2, 0, r7 - addi r7, r7, 16 - stvx v4, 0, r7 - addi r7, r7, 16 - stvx v6, 0, r7 - - b exit_8x8 - -store_8x8: - cmpi cr0, r8, 8 - beq cr0, store_aligned2_8x8 - - w_8x8 v2, r7, r0, r8 - w_8x8 v3, r7, r0, r8 - w_8x8 v4, r7, r0, r8 - w_8x8 v5, r7, r0, r8 - w_8x8 v6, r7, r0, r8 - w_8x8 v7, r7, r0, r8 - w_8x8 v8, r7, r0, r8 - w_8x8 v9, r7, r0, r8 - - b exit_8x8 - -store_aligned2_8x8: - load_c v10, b_hilo, 0, r9, r10 - - vperm v2, v2, v3, v10 - vperm v4, v4, v5, v10 - vperm v6, v6, v7, v10 - vperm v8, v8, v9, v10 - - stvx v2, 0, r7 - addi r7, r7, 16 - stvx v4, 0, r7 - addi r7, r7, 16 - stvx v6, 0, r7 - addi r7, r7, 16 - stvx v8, 0, r7 - -exit_8x8: - - addi r1, r1, 32 ;# recover stack - - mtspr 256, r11 ;# reset old VRSAVE - - blr - - .align 2 -;# r3 unsigned char * src -;# r4 int src_pitch -;# r5 int x_offset -;# r6 int y_offset -;# r7 unsigned char * dst -;# r8 int dst_pitch - -;# Two pass filtering. First pass is Horizontal edges, second pass is vertical -;# edges. One of the filters can be null, but both won't be. 
Needs to use a -;# temporary buffer because the source buffer can't be modified and the buffer -;# for the destination is not large enough to hold the temporary data. -sixtap_predict16x16_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xffff - ori r12, r12, 0xf000 - mtspr 256, r12 ;# set VRSAVE - - stwu r1,-416(r1) ;# create space on the stack - - ;# Three possiblities - ;# 1. First filter is null. Don't use a temp buffer. - ;# 2. Second filter is null. Don't use a temp buffer. - ;# 3. Neither are null, use temp buffer. - - ;# First Pass (horizontal edge) - ;# setup pointers for src - ;# if possiblity (1) then setup the src pointer to be the orginal and jump - ;# to second pass. this is based on if x_offset is 0. - - ;# load up horizontal filter - slwi. r5, r5, 5 ;# index into horizontal filter array - - load_hfilter v4, v5 - - beq- copy_horizontal_16x21 - - ;# Back off input buffer by 2 bytes. Need 2 before and 3 after - addi r3, r3, -2 - - slwi. r6, r6, 4 ;# index into vertical filter array - - ;# setup constants - ;# v14 permutation value for alignment - load_c v14, b_hperm, 0, r9, r10 - - ;# These statements are guessing that there won't be a second pass, - ;# but if there is then inside the bypass they need to be set - li r0, 16 ;# prepare for no vertical filter - - ;# Change the output pointer and pitch to be the actual - ;# desination instead of a temporary buffer. - addi r9, r7, 0 - addi r5, r8, 0 - - ;# no vertical filter, so write the output from the first pass - ;# directly into the output buffer. - beq- no_vertical_filter_bypass - - ;# if the second filter is not null then need to back off by 2*pitch - sub r3, r3, r4 - sub r3, r3, r4 - - ;# setup counter for the number of lines that are going to be filtered - li r0, 21 - - ;# use the stack as temporary storage - la r9, 48(r1) - li r5, 16 - -no_vertical_filter_bypass: - - mtctr r0 - - ;# rounding added in on the multiply - vspltisw v10, 8 - vspltisw v12, 3 - vslw v12, v10, v12 ;# 0x00000040000000400000004000000040 - - ;# downshift by 7 ( divide by 128 ) at the end - vspltish v13, 7 - - ;# index to the next set of vectors in the row. - li r10, 16 - li r12, 32 - -horizontal_loop_16x16: - - lvsl v15, 0, r3 ;# permutate value for alignment - - ;# input to filter is 21 bytes wide, output is 16 bytes. - ;# input will can span three vectors if not aligned correctly. - lvx v1, 0, r3 - lvx v2, r10, r3 - lvx v3, r12, r3 - - vperm v8, v1, v2, v15 - vperm v9, v2, v3, v15 ;# v8 v9 = 21 input pixels left-justified - - vsldoi v11, v8, v9, 4 - - ;# set 0 - vmsummbm v6, v4, v8, v12 ;# taps times elements - vmsummbm v0, v5, v11, v6 - - ;# set 1 - vsldoi v10, v8, v9, 1 - vsldoi v11, v8, v9, 5 - - vmsummbm v6, v4, v10, v12 - vmsummbm v1, v5, v11, v6 - - ;# set 2 - vsldoi v10, v8, v9, 2 - vsldoi v11, v8, v9, 6 - - vmsummbm v6, v4, v10, v12 - vmsummbm v2, v5, v11, v6 - - ;# set 3 - vsldoi v10, v8, v9, 3 - vsldoi v11, v8, v9, 7 - - vmsummbm v6, v4, v10, v12 - vmsummbm v3, v5, v11, v6 - - vpkswus v0, v0, v1 ;# v0 = 0 4 8 C 1 5 9 D (16-bit) - vpkswus v1, v2, v3 ;# v1 = 2 6 A E 3 7 B F - - vsrh v0, v0, v13 ;# divide v0, v1 by 128 - vsrh v1, v1, v13 - - vpkuhus v0, v0, v1 ;# v0 = scrambled 8-bit result - vperm v0, v0, v0, v14 ;# v0 = correctly-ordered result - - stvx v0, 0, r9 - add r9, r9, r5 - - add r3, r3, r4 - - bdnz horizontal_loop_16x16 - - ;# check again to see if vertical filter needs to be done. 
- cmpi cr0, r6, 0 - beq cr0, end_16x16 - - ;# yes there is, so go to the second pass - b second_pass_16x16 - -copy_horizontal_16x21: - li r10, 21 - mtctr r10 - - li r10, 16 - - sub r3, r3, r4 - sub r3, r3, r4 - - ;# this is done above if there is a horizontal filter, - ;# if not it needs to be done down here. - slwi r6, r6, 4 ;# index into vertical filter array - - ;# always write to the stack when doing a horizontal copy - la r9, 48(r1) - -copy_horizontal_loop_16x21: - lvsl v15, 0, r3 ;# permutate value for alignment - - lvx v1, 0, r3 - lvx v2, r10, r3 - - vperm v8, v1, v2, v15 - - stvx v8, 0, r9 - addi r9, r9, 16 - - add r3, r3, r4 - - bdnz copy_horizontal_loop_16x21 - -second_pass_16x16: - - ;# always read from the stack when doing a vertical filter - la r9, 48(r1) - - ;# downshift by 7 ( divide by 128 ) at the end - vspltish v7, 7 - - vpre_load - - luma_vsix - luma_vsix - luma_vfour - -end_16x16: - - addi r1, r1, 416 ;# recover stack - - mtspr 256, r11 ;# reset old VRSAVE - - blr - - .data - - .align 4 -HFilter: - .byte 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0 - .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 - .byte 0, -6,123, 12, 0, -6,123, 12, 0, -6,123, 12, 0, -6,123, 12 - .byte -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0 - .byte 2,-11,108, 36, 2,-11,108, 36, 2,-11,108, 36, 2,-11,108, 36 - .byte -8, 1, 0, 0, -8, 1, 0, 0, -8, 1, 0, 0, -8, 1, 0, 0 - .byte 0, -9, 93, 50, 0, -9, 93, 50, 0, -9, 93, 50, 0, -9, 93, 50 - .byte -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0 - .byte 3,-16, 77, 77, 3,-16, 77, 77, 3,-16, 77, 77, 3,-16, 77, 77 - .byte -16, 3, 0, 0,-16, 3, 0, 0,-16, 3, 0, 0,-16, 3, 0, 0 - .byte 0, -6, 50, 93, 0, -6, 50, 93, 0, -6, 50, 93, 0, -6, 50, 93 - .byte -9, 0, 0, 0, -9, 0, 0, 0, -9, 0, 0, 0, -9, 0, 0, 0 - .byte 1, -8, 36,108, 1, -8, 36,108, 1, -8, 36,108, 1, -8, 36,108 - .byte -11, 2, 0, 0,-11, 2, 0, 0,-11, 2, 0, 0,-11, 2, 0, 0 - .byte 0, -1, 12,123, 0, -1, 12,123, 0, -1, 12,123, 0, -1, 12,123 - .byte -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0 - - .align 4 -VFilter: - .byte 0, 0,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 - .byte 0, 6,123, 12, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 - .byte 2, 11,108, 36, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 - .byte 0, 9, 93, 50, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 - .byte 3, 16, 77, 77, 16, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 - .byte 0, 6, 50, 93, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 - .byte 1, 8, 36,108, 11, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 - .byte 0, 1, 12,123, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 - - .align 4 -b_hperm: - .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 - - .align 4 -B_0123: - .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 - - .align 4 -B_4567: - .byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 - - .align 4 -B_89AB: - .byte 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 - - .align 4 -b_hilo: - .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 - - .align 4 -b_hilo_4x4: - .byte 0, 1, 2, 3, 16, 17, 18, 19, 0, 0, 0, 0, 0, 0, 0, 0 diff --git a/vp9/common/ppc/vp9_filter_bilinear_altivec.asm b/vp9/common/ppc/vp9_filter_bilinear_altivec.asm deleted file mode 100644 index fd8aa66..0000000 --- a/vp9/common/ppc/vp9_filter_bilinear_altivec.asm +++ /dev/null @@ -1,677 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. 
All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - .globl bilinear_predict4x4_ppc - .globl bilinear_predict8x4_ppc - .globl bilinear_predict8x8_ppc - .globl bilinear_predict16x16_ppc - -.macro load_c V, LABEL, OFF, R0, R1 - lis \R0, \LABEL@ha - la \R1, \LABEL@l(\R0) - lvx \V, \OFF, \R1 -.endm - -.macro load_vfilter V0, V1 - load_c \V0, vfilter_b, r6, r9, r10 - - addi r6, r6, 16 - lvx \V1, r6, r10 -.endm - -.macro HProlog jump_label - ;# load up horizontal filter - slwi. r5, r5, 4 ;# index into horizontal filter array - - ;# index to the next set of vectors in the row. - li r10, 16 - li r12, 32 - - ;# downshift by 7 ( divide by 128 ) at the end - vspltish v19, 7 - - ;# If there isn't any filtering to be done for the horizontal, then - ;# just skip to the second pass. - beq \jump_label - - load_c v20, hfilter_b, r5, r9, r0 - - ;# setup constants - ;# v14 permutation value for alignment - load_c v28, b_hperm_b, 0, r9, r0 - - ;# rounding added in on the multiply - vspltisw v21, 8 - vspltisw v18, 3 - vslw v18, v21, v18 ;# 0x00000040000000400000004000000040 - - slwi. r6, r6, 5 ;# index into vertical filter array -.endm - -;# Filters a horizontal line -;# expects: -;# r3 src_ptr -;# r4 pitch -;# r10 16 -;# r12 32 -;# v17 perm intput -;# v18 rounding -;# v19 shift -;# v20 filter taps -;# v21 tmp -;# v22 tmp -;# v23 tmp -;# v24 tmp -;# v25 tmp -;# v26 tmp -;# v27 tmp -;# v28 perm output -;# -.macro HFilter V - vperm v24, v21, v21, v10 ;# v20 = 0123 1234 2345 3456 - vperm v25, v21, v21, v11 ;# v21 = 4567 5678 6789 789A - - vmsummbm v24, v20, v24, v18 - vmsummbm v25, v20, v25, v18 - - vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit) - - vsrh v24, v24, v19 ;# divide v0, v1 by 128 - - vpkuhus \V, v24, v24 ;# \V = scrambled 8-bit result -.endm - -.macro hfilter_8 V, increment_counter - lvsl v17, 0, r3 ;# permutate value for alignment - - ;# input to filter is 9 bytes wide, output is 8 bytes. - lvx v21, 0, r3 - lvx v22, r10, r3 - -.if \increment_counter - add r3, r3, r4 -.endif - vperm v21, v21, v22, v17 - - HFilter \V -.endm - - -.macro load_and_align_8 V, increment_counter - lvsl v17, 0, r3 ;# permutate value for alignment - - ;# input to filter is 21 bytes wide, output is 16 bytes. - ;# input will can span three vectors if not aligned correctly. 
- lvx v21, 0, r3 - lvx v22, r10, r3 - -.if \increment_counter - add r3, r3, r4 -.endif - - vperm \V, v21, v22, v17 -.endm - -.macro write_aligned_8 V, increment_counter - stvx \V, 0, r7 - -.if \increment_counter - add r7, r7, r8 -.endif -.endm - -.macro vfilter_16 P0 P1 - vmuleub v22, \P0, v20 ;# 64 + 4 positive taps - vadduhm v22, v18, v22 - vmuloub v23, \P0, v20 - vadduhm v23, v18, v23 - - vmuleub v24, \P1, v21 - vadduhm v22, v22, v24 ;# Re = evens, saturation unnecessary - vmuloub v25, \P1, v21 - vadduhm v23, v23, v25 ;# Ro = odds - - vsrh v22, v22, v19 ;# divide by 128 - vsrh v23, v23, v19 ;# v16 v17 = evens, odds - vmrghh \P0, v22, v23 ;# v18 v19 = 16-bit result in order - vmrglh v23, v22, v23 - vpkuhus \P0, \P0, v23 ;# P0 = 8-bit result -.endm - - -.macro w_8x8 V, D, R, P - stvx \V, 0, r1 - lwz \R, 0(r1) - stw \R, 0(r7) - lwz \R, 4(r1) - stw \R, 4(r7) - add \D, \D, \P -.endm - - - .align 2 -;# r3 unsigned char * src -;# r4 int src_pitch -;# r5 int x_offset -;# r6 int y_offset -;# r7 unsigned char * dst -;# r8 int dst_pitch -bilinear_predict4x4_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xf830 - ori r12, r12, 0xfff8 - mtspr 256, r12 ;# set VRSAVE - - stwu r1,-32(r1) ;# create space on the stack - - HProlog second_pass_4x4_pre_copy_b - - ;# Load up permutation constants - load_c v10, b_0123_b, 0, r9, r12 - load_c v11, b_4567_b, 0, r9, r12 - - hfilter_8 v0, 1 - hfilter_8 v1, 1 - hfilter_8 v2, 1 - hfilter_8 v3, 1 - - ;# Finished filtering main horizontal block. If there is no - ;# vertical filtering, jump to storing the data. Otherwise - ;# load up and filter the additional line that is needed - ;# for the vertical filter. - beq store_out_4x4_b - - hfilter_8 v4, 0 - - b second_pass_4x4_b - -second_pass_4x4_pre_copy_b: - slwi r6, r6, 5 ;# index into vertical filter array - - load_and_align_8 v0, 1 - load_and_align_8 v1, 1 - load_and_align_8 v2, 1 - load_and_align_8 v3, 1 - load_and_align_8 v4, 1 - -second_pass_4x4_b: - vspltish v20, 8 - vspltish v18, 3 - vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 - - load_vfilter v20, v21 - - vfilter_16 v0, v1 - vfilter_16 v1, v2 - vfilter_16 v2, v3 - vfilter_16 v3, v4 - -store_out_4x4_b: - - stvx v0, 0, r1 - lwz r0, 0(r1) - stw r0, 0(r7) - add r7, r7, r8 - - stvx v1, 0, r1 - lwz r0, 0(r1) - stw r0, 0(r7) - add r7, r7, r8 - - stvx v2, 0, r1 - lwz r0, 0(r1) - stw r0, 0(r7) - add r7, r7, r8 - - stvx v3, 0, r1 - lwz r0, 0(r1) - stw r0, 0(r7) - -exit_4x4: - - addi r1, r1, 32 ;# recover stack - mtspr 256, r11 ;# reset old VRSAVE - - blr - - .align 2 -;# r3 unsigned char * src -;# r4 int src_pitch -;# r5 int x_offset -;# r6 int y_offset -;# r7 unsigned char * dst -;# r8 int dst_pitch -bilinear_predict8x4_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xf830 - ori r12, r12, 0xfff8 - mtspr 256, r12 ;# set VRSAVE - - stwu r1,-32(r1) ;# create space on the stack - - HProlog second_pass_8x4_pre_copy_b - - ;# Load up permutation constants - load_c v10, b_0123_b, 0, r9, r12 - load_c v11, b_4567_b, 0, r9, r12 - - hfilter_8 v0, 1 - hfilter_8 v1, 1 - hfilter_8 v2, 1 - hfilter_8 v3, 1 - - ;# Finished filtering main horizontal block. If there is no - ;# vertical filtering, jump to storing the data. Otherwise - ;# load up and filter the additional line that is needed - ;# for the vertical filter. 
- beq store_out_8x4_b - - hfilter_8 v4, 0 - - b second_pass_8x4_b - -second_pass_8x4_pre_copy_b: - slwi r6, r6, 5 ;# index into vertical filter array - - load_and_align_8 v0, 1 - load_and_align_8 v1, 1 - load_and_align_8 v2, 1 - load_and_align_8 v3, 1 - load_and_align_8 v4, 1 - -second_pass_8x4_b: - vspltish v20, 8 - vspltish v18, 3 - vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 - - load_vfilter v20, v21 - - vfilter_16 v0, v1 - vfilter_16 v1, v2 - vfilter_16 v2, v3 - vfilter_16 v3, v4 - -store_out_8x4_b: - - cmpi cr0, r8, 8 - beq cr0, store_aligned_8x4_b - - w_8x8 v0, r7, r0, r8 - w_8x8 v1, r7, r0, r8 - w_8x8 v2, r7, r0, r8 - w_8x8 v3, r7, r0, r8 - - b exit_8x4 - -store_aligned_8x4_b: - load_c v10, b_hilo_b, 0, r9, r10 - - vperm v0, v0, v1, v10 - vperm v2, v2, v3, v10 - - stvx v0, 0, r7 - addi r7, r7, 16 - stvx v2, 0, r7 - -exit_8x4: - - addi r1, r1, 32 ;# recover stack - mtspr 256, r11 ;# reset old VRSAVE - - blr - - .align 2 -;# r3 unsigned char * src -;# r4 int src_pitch -;# r5 int x_offset -;# r6 int y_offset -;# r7 unsigned char * dst -;# r8 int dst_pitch -bilinear_predict8x8_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xfff0 - ori r12, r12, 0xffff - mtspr 256, r12 ;# set VRSAVE - - stwu r1,-32(r1) ;# create space on the stack - - HProlog second_pass_8x8_pre_copy_b - - ;# Load up permutation constants - load_c v10, b_0123_b, 0, r9, r12 - load_c v11, b_4567_b, 0, r9, r12 - - hfilter_8 v0, 1 - hfilter_8 v1, 1 - hfilter_8 v2, 1 - hfilter_8 v3, 1 - hfilter_8 v4, 1 - hfilter_8 v5, 1 - hfilter_8 v6, 1 - hfilter_8 v7, 1 - - ;# Finished filtering main horizontal block. If there is no - ;# vertical filtering, jump to storing the data. Otherwise - ;# load up and filter the additional line that is needed - ;# for the vertical filter. - beq store_out_8x8_b - - hfilter_8 v8, 0 - - b second_pass_8x8_b - -second_pass_8x8_pre_copy_b: - slwi r6, r6, 5 ;# index into vertical filter array - - load_and_align_8 v0, 1 - load_and_align_8 v1, 1 - load_and_align_8 v2, 1 - load_and_align_8 v3, 1 - load_and_align_8 v4, 1 - load_and_align_8 v5, 1 - load_and_align_8 v6, 1 - load_and_align_8 v7, 1 - load_and_align_8 v8, 0 - -second_pass_8x8_b: - vspltish v20, 8 - vspltish v18, 3 - vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 - - load_vfilter v20, v21 - - vfilter_16 v0, v1 - vfilter_16 v1, v2 - vfilter_16 v2, v3 - vfilter_16 v3, v4 - vfilter_16 v4, v5 - vfilter_16 v5, v6 - vfilter_16 v6, v7 - vfilter_16 v7, v8 - -store_out_8x8_b: - - cmpi cr0, r8, 8 - beq cr0, store_aligned_8x8_b - - w_8x8 v0, r7, r0, r8 - w_8x8 v1, r7, r0, r8 - w_8x8 v2, r7, r0, r8 - w_8x8 v3, r7, r0, r8 - w_8x8 v4, r7, r0, r8 - w_8x8 v5, r7, r0, r8 - w_8x8 v6, r7, r0, r8 - w_8x8 v7, r7, r0, r8 - - b exit_8x8 - -store_aligned_8x8_b: - load_c v10, b_hilo_b, 0, r9, r10 - - vperm v0, v0, v1, v10 - vperm v2, v2, v3, v10 - vperm v4, v4, v5, v10 - vperm v6, v6, v7, v10 - - stvx v0, 0, r7 - addi r7, r7, 16 - stvx v2, 0, r7 - addi r7, r7, 16 - stvx v4, 0, r7 - addi r7, r7, 16 - stvx v6, 0, r7 - -exit_8x8: - - addi r1, r1, 32 ;# recover stack - mtspr 256, r11 ;# reset old VRSAVE - - blr - -;# Filters a horizontal line -;# expects: -;# r3 src_ptr -;# r4 pitch -;# r10 16 -;# r12 32 -;# v17 perm intput -;# v18 rounding -;# v19 shift -;# v20 filter taps -;# v21 tmp -;# v22 tmp -;# v23 tmp -;# v24 tmp -;# v25 tmp -;# v26 tmp -;# v27 tmp -;# v28 perm output -;# -.macro hfilter_16 V, increment_counter - - lvsl v17, 0, r3 ;# permutate value for alignment - - ;# input to filter is 21 bytes wide, output is 16 bytes. 
- ;# input will can span three vectors if not aligned correctly. - lvx v21, 0, r3 - lvx v22, r10, r3 - lvx v23, r12, r3 - -.if \increment_counter - add r3, r3, r4 -.endif - vperm v21, v21, v22, v17 - vperm v22, v22, v23, v17 ;# v8 v9 = 21 input pixels left-justified - - ;# set 0 - vmsummbm v24, v20, v21, v18 ;# taps times elements - - ;# set 1 - vsldoi v23, v21, v22, 1 - vmsummbm v25, v20, v23, v18 - - ;# set 2 - vsldoi v23, v21, v22, 2 - vmsummbm v26, v20, v23, v18 - - ;# set 3 - vsldoi v23, v21, v22, 3 - vmsummbm v27, v20, v23, v18 - - vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit) - vpkswus v25, v26, v27 ;# v25 = 2 6 A E 3 7 B F - - vsrh v24, v24, v19 ;# divide v0, v1 by 128 - vsrh v25, v25, v19 - - vpkuhus \V, v24, v25 ;# \V = scrambled 8-bit result - vperm \V, \V, v0, v28 ;# \V = correctly-ordered result -.endm - -.macro load_and_align_16 V, increment_counter - lvsl v17, 0, r3 ;# permutate value for alignment - - ;# input to filter is 21 bytes wide, output is 16 bytes. - ;# input will can span three vectors if not aligned correctly. - lvx v21, 0, r3 - lvx v22, r10, r3 - -.if \increment_counter - add r3, r3, r4 -.endif - - vperm \V, v21, v22, v17 -.endm - -.macro write_16 V, increment_counter - stvx \V, 0, r7 - -.if \increment_counter - add r7, r7, r8 -.endif -.endm - - .align 2 -;# r3 unsigned char * src -;# r4 int src_pitch -;# r5 int x_offset -;# r6 int y_offset -;# r7 unsigned char * dst -;# r8 int dst_pitch -bilinear_predict16x16_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xffff - ori r12, r12, 0xfff8 - mtspr 256, r12 ;# set VRSAVE - - HProlog second_pass_16x16_pre_copy_b - - hfilter_16 v0, 1 - hfilter_16 v1, 1 - hfilter_16 v2, 1 - hfilter_16 v3, 1 - hfilter_16 v4, 1 - hfilter_16 v5, 1 - hfilter_16 v6, 1 - hfilter_16 v7, 1 - hfilter_16 v8, 1 - hfilter_16 v9, 1 - hfilter_16 v10, 1 - hfilter_16 v11, 1 - hfilter_16 v12, 1 - hfilter_16 v13, 1 - hfilter_16 v14, 1 - hfilter_16 v15, 1 - - ;# Finished filtering main horizontal block. If there is no - ;# vertical filtering, jump to storing the data. Otherwise - ;# load up and filter the additional line that is needed - ;# for the vertical filter. 
- beq store_out_16x16_b - - hfilter_16 v16, 0 - - b second_pass_16x16_b - -second_pass_16x16_pre_copy_b: - slwi r6, r6, 5 ;# index into vertical filter array - - load_and_align_16 v0, 1 - load_and_align_16 v1, 1 - load_and_align_16 v2, 1 - load_and_align_16 v3, 1 - load_and_align_16 v4, 1 - load_and_align_16 v5, 1 - load_and_align_16 v6, 1 - load_and_align_16 v7, 1 - load_and_align_16 v8, 1 - load_and_align_16 v9, 1 - load_and_align_16 v10, 1 - load_and_align_16 v11, 1 - load_and_align_16 v12, 1 - load_and_align_16 v13, 1 - load_and_align_16 v14, 1 - load_and_align_16 v15, 1 - load_and_align_16 v16, 0 - -second_pass_16x16_b: - vspltish v20, 8 - vspltish v18, 3 - vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 - - load_vfilter v20, v21 - - vfilter_16 v0, v1 - vfilter_16 v1, v2 - vfilter_16 v2, v3 - vfilter_16 v3, v4 - vfilter_16 v4, v5 - vfilter_16 v5, v6 - vfilter_16 v6, v7 - vfilter_16 v7, v8 - vfilter_16 v8, v9 - vfilter_16 v9, v10 - vfilter_16 v10, v11 - vfilter_16 v11, v12 - vfilter_16 v12, v13 - vfilter_16 v13, v14 - vfilter_16 v14, v15 - vfilter_16 v15, v16 - -store_out_16x16_b: - - write_16 v0, 1 - write_16 v1, 1 - write_16 v2, 1 - write_16 v3, 1 - write_16 v4, 1 - write_16 v5, 1 - write_16 v6, 1 - write_16 v7, 1 - write_16 v8, 1 - write_16 v9, 1 - write_16 v10, 1 - write_16 v11, 1 - write_16 v12, 1 - write_16 v13, 1 - write_16 v14, 1 - write_16 v15, 0 - - mtspr 256, r11 ;# reset old VRSAVE - - blr - - .data - - .align 4 -hfilter_b: - .byte 128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0 - .byte 112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0 - .byte 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0 - .byte 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0 - .byte 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0 - .byte 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0 - .byte 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0 - .byte 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0 - - .align 4 -vfilter_b: - .byte 128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128 - .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 - .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112 - .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 - .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96 - .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32 - .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80 - .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48 - .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 - .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 - .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48 - .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80 - .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32 - .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96 - .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 - .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112 - - .align 4 -b_hperm_b: - .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 - - .align 4 -b_0123_b: - .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 - - .align 4 -b_4567_b: - .byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 - -b_hilo_b: - .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 diff --git a/vp9/common/ppc/vp9_idct_altivec.asm b/vp9/common/ppc/vp9_idct_altivec.asm deleted file 
mode 100644 index b87aa42..0000000 --- a/vp9/common/ppc/vp9_idct_altivec.asm +++ /dev/null @@ -1,189 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - .globl short_idct4x4_ppc - -.macro load_c V, LABEL, OFF, R0, R1 - lis \R0, \LABEL@ha - la \R1, \LABEL@l(\R0) - lvx \V, \OFF, \R1 -.endm - -;# r3 short *input -;# r4 short *output -;# r5 int pitch - .align 2 -short_idct4x4_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xfff8 - mtspr 256, r12 ;# set VRSAVE - - load_c v8, sinpi8sqrt2, 0, r9, r10 - load_c v9, cospi8sqrt2minus1, 0, r9, r10 - load_c v10, hi_hi, 0, r9, r10 - load_c v11, lo_lo, 0, r9, r10 - load_c v12, shift_16, 0, r9, r10 - - li r10, 16 - lvx v0, 0, r3 ;# input ip[0], ip[ 4] - lvx v1, r10, r3 ;# input ip[8], ip[12] - - ;# first pass - vupkhsh v2, v0 - vupkhsh v3, v1 - vaddsws v6, v2, v3 ;# a1 = ip[0]+ip[8] - vsubsws v7, v2, v3 ;# b1 = ip[0]-ip[8] - - vupklsh v0, v0 - vmulosh v4, v0, v8 - vsraw v4, v4, v12 - vaddsws v4, v4, v0 ;# ip[ 4] * sin(pi/8) * sqrt(2) - - vupklsh v1, v1 - vmulosh v5, v1, v9 - vsraw v5, v5, v12 ;# ip[12] * cos(pi/8) * sqrt(2) - vaddsws v5, v5, v1 - - vsubsws v4, v4, v5 ;# c1 - - vmulosh v3, v1, v8 - vsraw v3, v3, v12 - vaddsws v3, v3, v1 ;# ip[12] * sin(pi/8) * sqrt(2) - - vmulosh v5, v0, v9 - vsraw v5, v5, v12 ;# ip[ 4] * cos(pi/8) * sqrt(2) - vaddsws v5, v5, v0 - - vaddsws v3, v3, v5 ;# d1 - - vaddsws v0, v6, v3 ;# a1 + d1 - vsubsws v3, v6, v3 ;# a1 - d1 - - vaddsws v1, v7, v4 ;# b1 + c1 - vsubsws v2, v7, v4 ;# b1 - c1 - - ;# transpose input - vmrghw v4, v0, v1 ;# a0 b0 a1 b1 - vmrghw v5, v2, v3 ;# c0 d0 c1 d1 - - vmrglw v6, v0, v1 ;# a2 b2 a3 b3 - vmrglw v7, v2, v3 ;# c2 d2 c3 d3 - - vperm v0, v4, v5, v10 ;# a0 b0 c0 d0 - vperm v1, v4, v5, v11 ;# a1 b1 c1 d1 - - vperm v2, v6, v7, v10 ;# a2 b2 c2 d2 - vperm v3, v6, v7, v11 ;# a3 b3 c3 d3 - - ;# second pass - vaddsws v6, v0, v2 ;# a1 = ip[0]+ip[8] - vsubsws v7, v0, v2 ;# b1 = ip[0]-ip[8] - - vmulosh v4, v1, v8 - vsraw v4, v4, v12 - vaddsws v4, v4, v1 ;# ip[ 4] * sin(pi/8) * sqrt(2) - - vmulosh v5, v3, v9 - vsraw v5, v5, v12 ;# ip[12] * cos(pi/8) * sqrt(2) - vaddsws v5, v5, v3 - - vsubsws v4, v4, v5 ;# c1 - - vmulosh v2, v3, v8 - vsraw v2, v2, v12 - vaddsws v2, v2, v3 ;# ip[12] * sin(pi/8) * sqrt(2) - - vmulosh v5, v1, v9 - vsraw v5, v5, v12 ;# ip[ 4] * cos(pi/8) * sqrt(2) - vaddsws v5, v5, v1 - - vaddsws v3, v2, v5 ;# d1 - - vaddsws v0, v6, v3 ;# a1 + d1 - vsubsws v3, v6, v3 ;# a1 - d1 - - vaddsws v1, v7, v4 ;# b1 + c1 - vsubsws v2, v7, v4 ;# b1 - c1 - - vspltish v6, 4 - vspltish v7, 3 - - vpkswss v0, v0, v1 - vpkswss v1, v2, v3 - - vaddshs v0, v0, v6 - vaddshs v1, v1, v6 - - vsrah v0, v0, v7 - vsrah v1, v1, v7 - - ;# transpose output - vmrghh v2, v0, v1 ;# a0 c0 a1 c1 a2 c2 a3 c3 - vmrglh v3, v0, v1 ;# b0 d0 b1 d1 b2 d2 b3 d3 - - vmrghh v0, v2, v3 ;# a0 b0 c0 d0 a1 b1 c1 d1 - vmrglh v1, v2, v3 ;# a2 b2 c2 d2 a3 b3 c3 d3 - - stwu r1,-416(r1) ;# create space on the stack - - stvx v0, 0, r1 - lwz r6, 0(r1) - stw r6, 0(r4) - lwz r6, 4(r1) - stw r6, 4(r4) - - add r4, r4, r5 - - lwz r6, 8(r1) - stw r6, 0(r4) - lwz r6, 12(r1) - stw r6, 4(r4) - - add r4, r4, r5 - - stvx v1, 0, r1 - lwz r6, 0(r1) - stw r6, 0(r4) - lwz r6, 4(r1) - stw r6, 4(r4) - 
- add r4, r4, r5 - - lwz r6, 8(r1) - stw r6, 0(r4) - lwz r6, 12(r1) - stw r6, 4(r4) - - addi r1, r1, 416 ;# recover stack - - mtspr 256, r11 ;# reset old VRSAVE - - blr - - .align 4 -sinpi8sqrt2: - .short 35468, 35468, 35468, 35468, 35468, 35468, 35468, 35468 - - .align 4 -cospi8sqrt2minus1: - .short 20091, 20091, 20091, 20091, 20091, 20091, 20091, 20091 - - .align 4 -shift_16: - .long 16, 16, 16, 16 - - .align 4 -hi_hi: - .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 - - .align 4 -lo_lo: - .byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31 diff --git a/vp9/common/ppc/vp9_loopfilter_altivec.c b/vp9/common/ppc/vp9_loopfilter_altivec.c deleted file mode 100644 index 599070a..0000000 --- a/vp9/common/ppc/vp9_loopfilter_altivec.c +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include "vp9/common/vp9_loopfilter.h" -#include "vp9/common/vp9_onyxc_int.h" - -typedef void loop_filter_function_y_ppc -( - unsigned char *s, // source pointer - int p, // pitch - const signed char *flimit, - const signed char *limit, - const signed char *thresh -); - -typedef void loop_filter_function_uv_ppc -( - unsigned char *u, // source pointer - unsigned char *v, // source pointer - int p, // pitch - const signed char *flimit, - const signed char *limit, - const signed char *thresh -); - -typedef void loop_filter_function_s_ppc -( - unsigned char *s, // source pointer - int p, // pitch - const signed char *flimit -); - -loop_filter_function_y_ppc mbloop_filter_horizontal_edge_y_ppc; -loop_filter_function_y_ppc mbloop_filter_vertical_edge_y_ppc; -loop_filter_function_y_ppc loop_filter_horizontal_edge_y_ppc; -loop_filter_function_y_ppc loop_filter_vertical_edge_y_ppc; - -loop_filter_function_uv_ppc mbloop_filter_horizontal_edge_uv_ppc; -loop_filter_function_uv_ppc mbloop_filter_vertical_edge_uv_ppc; -loop_filter_function_uv_ppc loop_filter_horizontal_edge_uv_ppc; -loop_filter_function_uv_ppc loop_filter_vertical_edge_uv_ppc; - -loop_filter_function_s_ppc loop_filter_simple_horizontal_edge_ppc; -loop_filter_function_s_ppc loop_filter_simple_vertical_edge_ppc; - -// Horizontal MB filtering -void loop_filter_mbh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) { - mbloop_filter_horizontal_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr); - - if (u_ptr) - mbloop_filter_horizontal_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr); -} - -void loop_filter_mbhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) { - (void)u_ptr; - (void)v_ptr; - (void)uv_stride; - loop_filter_simple_horizontal_edge_ppc(y_ptr, y_stride, lfi->mbflim); -} - -// Vertical MB Filtering -void loop_filter_mbv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) { - mbloop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr); - - if (u_ptr) - mbloop_filter_vertical_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr); -} - -void 
loop_filter_mbvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) { - (void)u_ptr; - (void)v_ptr; - (void)uv_stride; - loop_filter_simple_vertical_edge_ppc(y_ptr, y_stride, lfi->mbflim); -} - -// Horizontal B Filtering -void loop_filter_bh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) { - // These should all be done at once with one call, instead of 3 - loop_filter_horizontal_edge_y_ppc(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr); - loop_filter_horizontal_edge_y_ppc(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr); - loop_filter_horizontal_edge_y_ppc(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr); - - if (u_ptr) - loop_filter_horizontal_edge_uv_ppc(u_ptr + 4 * uv_stride, v_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr); -} - -void loop_filter_bhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) { - (void)u_ptr; - (void)v_ptr; - (void)uv_stride; - loop_filter_simple_horizontal_edge_ppc(y_ptr + 4 * y_stride, y_stride, lfi->flim); - loop_filter_simple_horizontal_edge_ppc(y_ptr + 8 * y_stride, y_stride, lfi->flim); - loop_filter_simple_horizontal_edge_ppc(y_ptr + 12 * y_stride, y_stride, lfi->flim); -} - -// Vertical B Filtering -void loop_filter_bv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) { - loop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->flim, lfi->lim, lfi->thr); - - if (u_ptr) - loop_filter_vertical_edge_uv_ppc(u_ptr + 4, v_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr); -} - -void loop_filter_bvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) { - (void)u_ptr; - (void)v_ptr; - (void)uv_stride; - loop_filter_simple_vertical_edge_ppc(y_ptr + 4, y_stride, lfi->flim); - loop_filter_simple_vertical_edge_ppc(y_ptr + 8, y_stride, lfi->flim); - loop_filter_simple_vertical_edge_ppc(y_ptr + 12, y_stride, lfi->flim); -} diff --git a/vp9/common/ppc/vp9_loopfilter_filters_altivec.asm b/vp9/common/ppc/vp9_loopfilter_filters_altivec.asm deleted file mode 100644 index 61df4e9..0000000 --- a/vp9/common/ppc/vp9_loopfilter_filters_altivec.asm +++ /dev/null @@ -1,1253 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - .globl mbloop_filter_horizontal_edge_y_ppc - .globl loop_filter_horizontal_edge_y_ppc - .globl mbloop_filter_vertical_edge_y_ppc - .globl loop_filter_vertical_edge_y_ppc - - .globl mbloop_filter_horizontal_edge_uv_ppc - .globl loop_filter_horizontal_edge_uv_ppc - .globl mbloop_filter_vertical_edge_uv_ppc - .globl loop_filter_vertical_edge_uv_ppc - - .globl loop_filter_simple_horizontal_edge_ppc - .globl loop_filter_simple_vertical_edge_ppc - - .text -;# We often need to perform transposes (and other transpose-like operations) -;# on matrices of data. 
This is simplified by the fact that we usually
-;# operate on hunks of data whose dimensions are powers of 2, or at least
-;# divisible by highish powers of 2.
-;#
-;# These operations can be very confusing.  They become more straightforward
-;# when we think of them as permutations of address bits: Concatenate a
-;# group of vector registers and think of it as occupying a block of
-;# memory beginning at address zero.  The low four bits 0...3 of the
-;# address then correspond to position within a register, the higher-order
-;# address bits select the register.
-;#
-;# Although register selection, at the code level, is arbitrary, things
-;# are simpler if we use contiguous ranges of register numbers, simpler
-;# still if the low-order bits of the register number correspond to
-;# conceptual address bits.  We do this whenever reasonable.
-;#
-;# A 16x16 transpose can then be thought of as an operation on
-;# a 256-element block of memory.  It takes 8 bits 0...7 to address this
-;# memory and the effect of a transpose is to interchange address bit
-;# 0 with 4, 1 with 5, 2 with 6, and 3 with 7.  Bits 0...3 index the
-;# column, which is interchanged with the row addressed by bits 4..7.
-;#
-;# The altivec merge instructions provide a rapid means of effecting
-;# many of these transforms.  They operate at three widths (8,16,32).
-;# Writing V(x) for vector register #x, paired merges permute address
-;# indices as follows.
-;#
-;# 0->1  1->2  2->3  3->(4+d)  (4+s)->0:
-;#
-;#   vmrghb  V( x),          V( y), V( y + (1<<s))
-;#   vmrglb  V( x + (1<<d)), V( y), V( y + (1<<s))
-;#
-;# =0=  1->2  2->3  3->(4+d)  (4+s)->1:
-;#
-;#   vmrghh  V( x),          V( y), V( y + (1<<s))
-;#   vmrglh  V( x + (1<<d)), V( y), V( y + (1<<s))
-;#
-;# =0=  =1=  2->3  3->(4+d)  (4+s)->2:
-;#
-;#   vmrghw  V( x),          V( y), V( y + (1<<s))
-;#   vmrglw  V( x + (1<<d)), V( y), V( y + (1<<s))
-;#
-;# There is no doubleword merge instruction; using vperm with the constant
-;# selectors b_hihi and b_lolo, we can effect
-;# =0=  =1=  =2=  3->(4+d)  (4+s)->3 by the sequence:
-;#
-;#   vperm   V( x),          V( y), V( y + (1<<s)), b_hihi
-;#   vperm   V( x + (1<<d)), V( y), V( y + (1<<s)), b_lolo
-;#
-;# The full 16x16 transpose interchanges address bits
-;# 0<->4  1<->5  2<->6  3<->7, which we accomplish by
-;# 4 iterations of the cyclic transform 0->1->2->3->4->5->6->7->0.
-;#
-;# Except for the fact that the destination registers get written
-;# before we are done referencing the old contents, the cyclic transform
-;# is effected by
-;#
-;#   x = 0;  do {
-;#     vmrghb V(2x),   V(x), V(x+8);
-;#     vmrglb V(2x+1), V(x), V(x+8);
-;#   } while( ++x < 8);
-;#
-;# For clarity, and because we can afford it, we do this transpose
-;# using all 32 registers, alternating the banks 0..15 and 16 .. 31,
-;# leaving the final result in 16 .. 31, as the lower registers are
-;# used in the filtering itself.
-;#
-.macro Tpair A, B, X, Y
-    vmrghb  \A, \X, \Y
-    vmrglb  \B, \X, \Y
-.endm
-
-;# Each step takes 8*2 = 16 instructions
-
-.macro t16_even
-    Tpair v16,v17,  v0,v8
-    Tpair v18,v19,  v1,v9
-    Tpair v20,v21,  v2,v10
-    Tpair v22,v23,  v3,v11
-    Tpair v24,v25,  v4,v12
-    Tpair v26,v27,  v5,v13
-    Tpair v28,v29,  v6,v14
-    Tpair v30,v31,  v7,v15
-.endm
-
-.macro t16_odd
-    Tpair v0,v1,    v16,v24
-    Tpair v2,v3,    v17,v25
-    Tpair v4,v5,    v18,v26
-    Tpair v6,v7,    v19,v27
-    Tpair v8,v9,    v20,v28
-    Tpair v10,v11,  v21,v29
-    Tpair v12,v13,  v22,v30
-    Tpair v14,v15,  v23,v31
-.endm
-
-;# Whole transpose takes 4*16 = 64 instructions
-
-.macro t16_full
-    t16_odd
-    t16_even
-    t16_odd
-    t16_even
-.endm
-
-;# Vertical edge filtering requires transposes.  For the simple filter,
-;# we need to convert 16 rows of 4 pels each into 4 registers of 16 pels
-;# each.  Writing 0 ... 63 for the pixel indices, the desired result is:
-;#
-;#    v0 =  0  1 ... 14 15
-;#    v1 = 16 17 ... 30 31
-;#    v2 = 32 33 ... 46 47
-;#    v3 = 48 49 ... 62 63
-;#
-;# In frame-buffer memory, the layout is:
-;#
-;#     0 16 32 48
-;#     1 17 33 49
-;#     ...
-;#    15 31 47 63.
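The address-bit picture above is easy to verify in scalar code: index a 16x16 byte matrix with an 8-bit address whose low nibble selects the column and high nibble the row; swapping the two nibbles is exactly a transpose. A small self-contained check, not taken from the patch:

    #include <stdio.h>

    int main(void) {
        unsigned char m[256], t[256];
        for (int i = 0; i < 256; i++)
            m[i] = (unsigned char)i;        /* m[row*16 + col] = address  */
        for (int i = 0; i < 256; i++) {
            int row = (i >> 4) & 15;        /* address bits 4..7          */
            int col = i & 15;               /* address bits 0..3          */
            t[(col << 4) | row] = m[i];     /* interchange the bit groups */
        }
        /* t[r*16 + c] == m[c*16 + r] for every r, c: a 16x16 transpose  */
        printf("%u %u\n", t[0x12], m[0x21]);    /* prints "33 33" */
        return 0;
    }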
-;# -;# We begin by reading the data 32 bits at a time (using scalar operations) -;# into a temporary array, reading the rows of the array into vector registers, -;# with the following layout: -;# -;# v0 = 0 16 32 48 4 20 36 52 8 24 40 56 12 28 44 60 -;# v1 = 1 17 33 49 5 21 ... 45 61 -;# v2 = 2 18 ... 46 62 -;# v3 = 3 19 ... 47 63 -;# -;# From the "address-bit" perspective discussed above, we simply need to -;# interchange bits 0 <-> 4 and 1 <-> 5, leaving bits 2 and 3 alone. -;# In other words, we transpose each of the four 4x4 submatrices. -;# -;# This transformation is its own inverse, and we need to perform it -;# again before writing the pixels back into the frame buffer. -;# -;# It acts in place on registers v0...v3, uses v4...v7 as temporaries, -;# and assumes that v14/v15 contain the b_hihi/b_lolo selectors -;# defined above. We think of both groups of 4 registers as having -;# "addresses" {0,1,2,3} * 16. -;# -.macro Transpose4times4x4 Vlo, Vhi - - ;# d=s=0 0->1 1->2 2->3 3->4 4->0 =5= - - vmrghb v4, v0, v1 - vmrglb v5, v0, v1 - vmrghb v6, v2, v3 - vmrglb v7, v2, v3 - - ;# d=0 s=1 =0= 1->2 2->3 3->4 4->5 5->1 - - vmrghh v0, v4, v6 - vmrglh v1, v4, v6 - vmrghh v2, v5, v7 - vmrglh v3, v5, v7 - - ;# d=s=0 =0= =1= 2->3 3->4 4->2 =5= - - vmrghw v4, v0, v1 - vmrglw v5, v0, v1 - vmrghw v6, v2, v3 - vmrglw v7, v2, v3 - - ;# d=0 s=1 =0= =1= =2= 3->4 4->5 5->3 - - vperm v0, v4, v6, \Vlo - vperm v1, v4, v6, \Vhi - vperm v2, v5, v7, \Vlo - vperm v3, v5, v7, \Vhi -.endm -;# end Transpose4times4x4 - - -;# Normal mb vertical edge filter transpose. -;# -;# We read 8 columns of data, initially in the following pattern: -;# -;# (0,0) (1,0) ... (7,0) (0,1) (1,1) ... (7,1) -;# (0,2) (1,2) ... (7,2) (0,3) (1,3) ... (7,3) -;# ... -;# (0,14) (1,14) .. (7,14) (0,15) (1,15) .. (7,15) -;# -;# and wish to convert to: -;# -;# (0,0) ... (0,15) -;# (1,0) ... (1,15) -;# ... -;# (7,0) ... (7,15). -;# -;# In "address bit" language, we wish to map -;# -;# 0->4 1->5 2->6 3->0 4->1 5->2 6->3, i.e., I -> (I+4) mod 7. -;# -;# This can be accomplished by 4 iterations of the cyclic transform -;# -;# I -> (I+1) mod 7; -;# -;# each iteration can be realized by (d=0, s=2): -;# -;# x = 0; do Tpair( V(2x),V(2x+1), V(x),V(x+4)) while( ++x < 4); -;# -;# The input/output is in registers v0...v7. We use v10...v17 as mirrors; -;# preserving v8 = sign converter. 
-;# -;# Inverse transpose is similar, except here I -> (I+3) mod 7 and the -;# result lands in the "mirror" registers v10...v17 -;# -.macro t8x16_odd - Tpair v10, v11, v0, v4 - Tpair v12, v13, v1, v5 - Tpair v14, v15, v2, v6 - Tpair v16, v17, v3, v7 -.endm - -.macro t8x16_even - Tpair v0, v1, v10, v14 - Tpair v2, v3, v11, v15 - Tpair v4, v5, v12, v16 - Tpair v6, v7, v13, v17 -.endm - -.macro transpose8x16_fwd - t8x16_odd - t8x16_even - t8x16_odd - t8x16_even -.endm - -.macro transpose8x16_inv - t8x16_odd - t8x16_even - t8x16_odd -.endm - -.macro Transpose16x16 - vmrghb v0, v16, v24 - vmrglb v1, v16, v24 - vmrghb v2, v17, v25 - vmrglb v3, v17, v25 - vmrghb v4, v18, v26 - vmrglb v5, v18, v26 - vmrghb v6, v19, v27 - vmrglb v7, v19, v27 - vmrghb v8, v20, v28 - vmrglb v9, v20, v28 - vmrghb v10, v21, v29 - vmrglb v11, v21, v29 - vmrghb v12, v22, v30 - vmrglb v13, v22, v30 - vmrghb v14, v23, v31 - vmrglb v15, v23, v31 - vmrghb v16, v0, v8 - vmrglb v17, v0, v8 - vmrghb v18, v1, v9 - vmrglb v19, v1, v9 - vmrghb v20, v2, v10 - vmrglb v21, v2, v10 - vmrghb v22, v3, v11 - vmrglb v23, v3, v11 - vmrghb v24, v4, v12 - vmrglb v25, v4, v12 - vmrghb v26, v5, v13 - vmrglb v27, v5, v13 - vmrghb v28, v6, v14 - vmrglb v29, v6, v14 - vmrghb v30, v7, v15 - vmrglb v31, v7, v15 - vmrghb v0, v16, v24 - vmrglb v1, v16, v24 - vmrghb v2, v17, v25 - vmrglb v3, v17, v25 - vmrghb v4, v18, v26 - vmrglb v5, v18, v26 - vmrghb v6, v19, v27 - vmrglb v7, v19, v27 - vmrghb v8, v20, v28 - vmrglb v9, v20, v28 - vmrghb v10, v21, v29 - vmrglb v11, v21, v29 - vmrghb v12, v22, v30 - vmrglb v13, v22, v30 - vmrghb v14, v23, v31 - vmrglb v15, v23, v31 - vmrghb v16, v0, v8 - vmrglb v17, v0, v8 - vmrghb v18, v1, v9 - vmrglb v19, v1, v9 - vmrghb v20, v2, v10 - vmrglb v21, v2, v10 - vmrghb v22, v3, v11 - vmrglb v23, v3, v11 - vmrghb v24, v4, v12 - vmrglb v25, v4, v12 - vmrghb v26, v5, v13 - vmrglb v27, v5, v13 - vmrghb v28, v6, v14 - vmrglb v29, v6, v14 - vmrghb v30, v7, v15 - vmrglb v31, v7, v15 -.endm - -;# load_g loads a global vector (whose address is in the local variable Gptr) -;# into vector register Vreg. Trashes r0 -.macro load_g Vreg, Gptr - lwz r0, \Gptr - lvx \Vreg, 0, r0 -.endm - -;# exploit the saturation here. if the answer is negative -;# it will be clamped to 0. orring 0 with a positive -;# number will be the positive number (abs) -;# RES = abs( A-B), trashes TMP -.macro Abs RES, TMP, A, B - vsububs \RES, \A, \B - vsububs \TMP, \B, \A - vor \RES, \RES, \TMP -.endm - -;# RES = Max( RES, abs( A-B)), trashes TMP -.macro max_abs RES, TMP, A, B - vsububs \TMP, \A, \B - vmaxub \RES, \RES, \TMP - vsububs \TMP, \B, \A - vmaxub \RES, \RES, \TMP -.endm - -.macro Masks - ;# build masks - ;# input is all 8 bit unsigned (0-255). need to - ;# do abs(vala-valb) > limit. but no need to compare each - ;# value to the limit. find the max of the absolute differences - ;# and compare that to the limit. 
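The Abs/max_abs macros defined above exploit unsigned saturation: one of the two saturating differences is always zero, so OR-ing them yields |a - b| without widening to 16 bits. In scalar form (hypothetical names):

    #include <stdint.h>

    static uint8_t usub_sat(uint8_t a, uint8_t b) {   /* vsububs */
        return a > b ? (uint8_t)(a - b) : 0;
    }

    static uint8_t abs_diff(uint8_t a, uint8_t b) {   /* Abs macro */
        return usub_sat(a, b) | usub_sat(b, a);       /* one side is 0 */
    }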
- ;# First hev - Abs v14, v13, v2, v3 ;# |P1 - P0| - max_abs v14, v13, v5, v4 ;# |Q1 - Q0| - - vcmpgtub v10, v14, v10 ;# HEV = true if thresh exceeded - - ;# Next limit - max_abs v14, v13, v0, v1 ;# |P3 - P2| - max_abs v14, v13, v1, v2 ;# |P2 - P1| - max_abs v14, v13, v6, v5 ;# |Q2 - Q1| - max_abs v14, v13, v7, v6 ;# |Q3 - Q2| - - vcmpgtub v9, v14, v9 ;# R = true if limit exceeded - - ;# flimit - Abs v14, v13, v3, v4 ;# |P0 - Q0| - - vcmpgtub v8, v14, v8 ;# X = true if flimit exceeded - - vor v8, v8, v9 ;# R = true if flimit or limit exceeded - ;# done building masks -.endm - -.macro build_constants RFL, RLI, RTH, FL, LI, TH - ;# build constants - lvx \FL, 0, \RFL ;# flimit - lvx \LI, 0, \RLI ;# limit - lvx \TH, 0, \RTH ;# thresh - - vspltisb v11, 8 - vspltisb v12, 4 - vslb v11, v11, v12 ;# 0x80808080808080808080808080808080 -.endm - -.macro load_data_y - ;# setup strides/pointers to be able to access - ;# all of the data - add r5, r4, r4 ;# r5 = 2 * stride - sub r6, r3, r5 ;# r6 -> 2 rows back - neg r7, r4 ;# r7 = -stride - - ;# load 16 pixels worth of data to work on - sub r0, r6, r5 ;# r0 -> 4 rows back (temp) - lvx v0, 0, r0 ;# P3 (read only) - lvx v1, r7, r6 ;# P2 - lvx v2, 0, r6 ;# P1 - lvx v3, r7, r3 ;# P0 - lvx v4, 0, r3 ;# Q0 - lvx v5, r4, r3 ;# Q1 - lvx v6, r5, r3 ;# Q2 - add r0, r3, r5 ;# r0 -> 2 rows fwd (temp) - lvx v7, r4, r0 ;# Q3 (read only) -.endm - -;# Expects -;# v10 == HEV -;# v13 == tmp -;# v14 == tmp -.macro common_adjust P0, Q0, P1, Q1, HEV_PRESENT - vxor \P1, \P1, v11 ;# SP1 - vxor \P0, \P0, v11 ;# SP0 - vxor \Q0, \Q0, v11 ;# SQ0 - vxor \Q1, \Q1, v11 ;# SQ1 - - vsubsbs v13, \P1, \Q1 ;# f = c (P1 - Q1) -.if \HEV_PRESENT - vand v13, v13, v10 ;# f &= hev -.endif - vsubsbs v14, \Q0, \P0 ;# -126 <= X = Q0-P0 <= +126 - vaddsbs v13, v13, v14 - vaddsbs v13, v13, v14 - vaddsbs v13, v13, v14 ;# A = c( c(P1-Q1) + 3*(Q0-P0)) - - vandc v13, v13, v8 ;# f &= mask - - vspltisb v8, 3 - vspltisb v9, 4 - - vaddsbs v14, v13, v9 ;# f1 = c (f+4) - vaddsbs v15, v13, v8 ;# f2 = c (f+3) - - vsrab v13, v14, v8 ;# f1 >>= 3 - vsrab v15, v15, v8 ;# f2 >>= 3 - - vsubsbs \Q0, \Q0, v13 ;# u1 = c (SQ0 - f1) - vaddsbs \P0, \P0, v15 ;# u2 = c (SP0 + f2) -.endm - -.macro vp8_mbfilter - Masks - - ;# start the fitering here - vxor v1, v1, v11 ;# SP2 - vxor v2, v2, v11 ;# SP1 - vxor v3, v3, v11 ;# SP0 - vxor v4, v4, v11 ;# SQ0 - vxor v5, v5, v11 ;# SQ1 - vxor v6, v6, v11 ;# SQ2 - - ;# add outer taps if we have high edge variance - vsubsbs v13, v2, v5 ;# f = c (SP1-SQ1) - - vsubsbs v14, v4, v3 ;# SQ0-SP0 - vaddsbs v13, v13, v14 - vaddsbs v13, v13, v14 - vaddsbs v13, v13, v14 ;# f = c( c(SP1-SQ1) + 3*(SQ0-SP0)) - - vandc v13, v13, v8 ;# f &= mask - vand v15, v13, v10 ;# f2 = f & hev - - ;# save bottom 3 bits so that we round one side +4 and the other +3 - vspltisb v8, 3 - vspltisb v9, 4 - - vaddsbs v14, v15, v9 ;# f1 = c (f+4) - vaddsbs v15, v15, v8 ;# f2 = c (f+3) - - vsrab v14, v14, v8 ;# f1 >>= 3 - vsrab v15, v15, v8 ;# f2 >>= 3 - - vsubsbs v4, v4, v14 ;# u1 = c (SQ0 - f1) - vaddsbs v3, v3, v15 ;# u2 = c (SP0 + f2) - - ;# only apply wider filter if not high edge variance - vandc v13, v13, v10 ;# f &= ~hev - - vspltisb v9, 2 - vnor v8, v8, v8 - vsrb v9, v8, v9 ;# 0x3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f - vupkhsb v9, v9 ;# 0x003f003f003f003f003f003f003f003f - vspltisb v8, 9 - - ;# roughly 1/7th difference across boundary - vspltish v10, 7 - vmulosb v14, v8, v13 ;# A = c( c(P1-Q1) + 3*(Q0-P0)) - vmulesb v15, v8, v13 - vaddshs v14, v14, v9 ;# += 63 - vaddshs v15, v15, v9 - vsrah v14, v14, v10 ;# >>= 7 - vsrah 
v15, v15, v10 - vmrglh v10, v15, v14 - vmrghh v15, v15, v14 - - vpkshss v10, v15, v10 ;# X = saturated down to bytes - - vsubsbs v6, v6, v10 ;# subtract from Q and add to P - vaddsbs v1, v1, v10 - - vxor v6, v6, v11 - vxor v1, v1, v11 - - ;# roughly 2/7th difference across boundary - vspltish v10, 7 - vaddubm v12, v8, v8 - vmulosb v14, v12, v13 ;# A = c( c(P1-Q1) + 3*(Q0-P0)) - vmulesb v15, v12, v13 - vaddshs v14, v14, v9 - vaddshs v15, v15, v9 - vsrah v14, v14, v10 ;# >>= 7 - vsrah v15, v15, v10 - vmrglh v10, v15, v14 - vmrghh v15, v15, v14 - - vpkshss v10, v15, v10 ;# X = saturated down to bytes - - vsubsbs v5, v5, v10 ;# subtract from Q and add to P - vaddsbs v2, v2, v10 - - vxor v5, v5, v11 - vxor v2, v2, v11 - - ;# roughly 3/7th difference across boundary - vspltish v10, 7 - vaddubm v12, v12, v8 - vmulosb v14, v12, v13 ;# A = c( c(P1-Q1) + 3*(Q0-P0)) - vmulesb v15, v12, v13 - vaddshs v14, v14, v9 - vaddshs v15, v15, v9 - vsrah v14, v14, v10 ;# >>= 7 - vsrah v15, v15, v10 - vmrglh v10, v15, v14 - vmrghh v15, v15, v14 - - vpkshss v10, v15, v10 ;# X = saturated down to bytes - - vsubsbs v4, v4, v10 ;# subtract from Q and add to P - vaddsbs v3, v3, v10 - - vxor v4, v4, v11 - vxor v3, v3, v11 -.endm - -.macro SBFilter - Masks - - common_adjust v3, v4, v2, v5, 1 - - ;# outer tap adjustments - vspltisb v8, 1 - - vaddubm v13, v13, v8 ;# f += 1 - vsrab v13, v13, v8 ;# f >>= 1 - - vandc v13, v13, v10 ;# f &= ~hev - - vsubsbs v5, v5, v13 ;# u1 = c (SQ1 - f) - vaddsbs v2, v2, v13 ;# u2 = c (SP1 + f) - - vxor v2, v2, v11 - vxor v3, v3, v11 - vxor v4, v4, v11 - vxor v5, v5, v11 -.endm - - .align 2 -mbloop_filter_horizontal_edge_y_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xffff - mtspr 256, r12 ;# set VRSAVE - - build_constants r5, r6, r7, v8, v9, v10 - - load_data_y - - vp8_mbfilter - - stvx v1, r7, r6 ;# P2 - stvx v2, 0, r6 ;# P1 - stvx v3, r7, r3 ;# P0 - stvx v4, 0, r3 ;# Q0 - stvx v5, r4, r3 ;# Q1 - stvx v6, r5, r3 ;# Q2 - - mtspr 256, r11 ;# reset old VRSAVE - - blr - - .align 2 -;# r3 unsigned char *s -;# r4 int p -;# r5 const signed char *flimit -;# r6 const signed char *limit -;# r7 const signed char *thresh -loop_filter_horizontal_edge_y_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xffff - mtspr 256, r12 ;# set VRSAVE - - build_constants r5, r6, r7, v8, v9, v10 - - load_data_y - - SBFilter - - stvx v2, 0, r6 ;# P1 - stvx v3, r7, r3 ;# P0 - stvx v4, 0, r3 ;# Q0 - stvx v5, r4, r3 ;# Q1 - - mtspr 256, r11 ;# reset old VRSAVE - - blr - -;# Filtering a vertical mb. Each mb is aligned on a 16 byte boundary. -;# So we can read in an entire mb aligned. However if we want to filter the mb -;# edge we run into problems. For the loopfilter we require 4 bytes before the mb -;# and 4 after for a total of 8 bytes. Reading 16 bytes inorder to get 4 is a bit -;# of a waste. So this is an even uglier way to get around that. -;# Using the regular register file words are read in and then saved back out to -;# memory to align and order them up. Then they are read in using the -;# vector register file. 
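In scalar terms, the staging trick just described copies each pair of misaligned 8-byte reads through an aligned 16-byte buffer so a single lvx can pick them up. A minimal sketch (read_row_pair is a hypothetical name; the asm pre-increments the row pointer where this sketch post-increments):

    #include <stdint.h>
    #include <string.h>

    /* Copy 4 pels either side of the edge, for two rows, into an aligned
     * 16-byte buffer (the lwz/stw pairs of RLVmb below), then the buffer
     * can be fetched as one vector. */
    static void read_row_pair(const uint8_t **s, int pitch, uint8_t buf[16]) {
        memcpy(buf + 0, *s - 4, 8);   /* row n:   8 pels straddling the edge */
        *s += pitch;
        memcpy(buf + 8, *s - 4, 8);   /* row n+1: next 8 straddling pels     */
        *s += pitch;
    }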
-.macro RLVmb V, R - lwzux r0, r3, r4 - stw r0, 4(\R) - lwz r0,-4(r3) - stw r0, 0(\R) - lwzux r0, r3, r4 - stw r0,12(\R) - lwz r0,-4(r3) - stw r0, 8(\R) - lvx \V, 0, \R -.endm - -.macro WLVmb V, R - stvx \V, 0, \R - lwz r0,12(\R) - stwux r0, r3, r4 - lwz r0, 8(\R) - stw r0,-4(r3) - lwz r0, 4(\R) - stwux r0, r3, r4 - lwz r0, 0(\R) - stw r0,-4(r3) -.endm - - .align 2 -;# r3 unsigned char *s -;# r4 int p -;# r5 const signed char *flimit -;# r6 const signed char *limit -;# r7 const signed char *thresh -mbloop_filter_vertical_edge_y_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xffff - ori r12, r12, 0xc000 - mtspr 256, r12 ;# set VRSAVE - - la r9, -48(r1) ;# temporary space for reading in vectors - sub r3, r3, r4 - - RLVmb v0, r9 - RLVmb v1, r9 - RLVmb v2, r9 - RLVmb v3, r9 - RLVmb v4, r9 - RLVmb v5, r9 - RLVmb v6, r9 - RLVmb v7, r9 - - transpose8x16_fwd - - build_constants r5, r6, r7, v8, v9, v10 - - vp8_mbfilter - - transpose8x16_inv - - add r3, r3, r4 - neg r4, r4 - - WLVmb v17, r9 - WLVmb v16, r9 - WLVmb v15, r9 - WLVmb v14, r9 - WLVmb v13, r9 - WLVmb v12, r9 - WLVmb v11, r9 - WLVmb v10, r9 - - mtspr 256, r11 ;# reset old VRSAVE - - blr - -.macro RL V, R, P - lvx \V, 0, \R - add \R, \R, \P -.endm - -.macro WL V, R, P - stvx \V, 0, \R - add \R, \R, \P -.endm - -.macro Fil P3, P2, P1, P0, Q0, Q1, Q2, Q3 - ;# K = |P0-P1| already - Abs v14, v13, \Q0, \Q1 ;# M = |Q0-Q1| - vmaxub v14, v14, v4 ;# M = max( |P0-P1|, |Q0-Q1|) - vcmpgtub v10, v14, v0 - - Abs v4, v5, \Q2, \Q3 ;# K = |Q2-Q3| = next |P0-P1] - - max_abs v14, v13, \Q1, \Q2 ;# M = max( M, |Q1-Q2|) - max_abs v14, v13, \P1, \P2 ;# M = max( M, |P1-P2|) - max_abs v14, v13, \P2, \P3 ;# M = max( M, |P2-P3|) - - vmaxub v14, v14, v4 ;# M = max interior abs diff - vcmpgtub v9, v14, v2 ;# M = true if int_l exceeded - - Abs v14, v13, \P0, \Q0 ;# X = Abs( P0-Q0) - vcmpgtub v8, v14, v3 ;# X = true if edge_l exceeded - vor v8, v8, v9 ;# M = true if edge_l or int_l exceeded - - ;# replace P1,Q1 w/signed versions - common_adjust \P0, \Q0, \P1, \Q1, 1 - - vaddubm v13, v13, v1 ;# -16 <= M <= 15, saturation irrelevant - vsrab v13, v13, v1 - vandc v13, v13, v10 ;# adjust P1,Q1 by (M+1)>>1 if ! 
hev - vsubsbs \Q1, \Q1, v13 - vaddsbs \P1, \P1, v13 - - vxor \P1, \P1, v11 ;# P1 - vxor \P0, \P0, v11 ;# P0 - vxor \Q0, \Q0, v11 ;# Q0 - vxor \Q1, \Q1, v11 ;# Q1 -.endm - - - .align 2 -;# r3 unsigned char *s -;# r4 int p -;# r5 const signed char *flimit -;# r6 const signed char *limit -;# r7 const signed char *thresh -loop_filter_vertical_edge_y_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xffff - ori r12, r12, 0xffff - mtspr 256, r12 ;# set VRSAVE - - addi r9, r3, 0 - RL v16, r9, r4 - RL v17, r9, r4 - RL v18, r9, r4 - RL v19, r9, r4 - RL v20, r9, r4 - RL v21, r9, r4 - RL v22, r9, r4 - RL v23, r9, r4 - RL v24, r9, r4 - RL v25, r9, r4 - RL v26, r9, r4 - RL v27, r9, r4 - RL v28, r9, r4 - RL v29, r9, r4 - RL v30, r9, r4 - lvx v31, 0, r9 - - Transpose16x16 - - vspltisb v1, 1 - - build_constants r5, r6, r7, v3, v2, v0 - - Abs v4, v5, v19, v18 ;# K(v14) = first |P0-P1| - - Fil v16, v17, v18, v19, v20, v21, v22, v23 - Fil v20, v21, v22, v23, v24, v25, v26, v27 - Fil v24, v25, v26, v27, v28, v29, v30, v31 - - Transpose16x16 - - addi r9, r3, 0 - WL v16, r9, r4 - WL v17, r9, r4 - WL v18, r9, r4 - WL v19, r9, r4 - WL v20, r9, r4 - WL v21, r9, r4 - WL v22, r9, r4 - WL v23, r9, r4 - WL v24, r9, r4 - WL v25, r9, r4 - WL v26, r9, r4 - WL v27, r9, r4 - WL v28, r9, r4 - WL v29, r9, r4 - WL v30, r9, r4 - stvx v31, 0, r9 - - mtspr 256, r11 ;# reset old VRSAVE - - blr - -;# -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- UV FILTERING -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- -.macro active_chroma_sel V - andi. r7, r3, 8 ;# row origin modulo 16 - add r7, r7, r7 ;# selects selectors - lis r12, _chromaSelectors@ha - la r0, _chromaSelectors@l(r12) - lwzux r0, r7, r0 ;# leave selector addr in r7 - - lvx \V, 0, r0 ;# mask to concatenate active U,V pels -.endm - -.macro hread_uv Dest, U, V, Offs, VMask - lvx \U, \Offs, r3 - lvx \V, \Offs, r4 - vperm \Dest, \U, \V, \VMask ;# Dest = active part of U then V -.endm - -.macro hwrite_uv New, U, V, Offs, Umask, Vmask - vperm \U, \New, \U, \Umask ;# Combine new pels with siblings - vperm \V, \New, \V, \Vmask - stvx \U, \Offs, r3 ;# Write to frame buffer - stvx \V, \Offs, r4 -.endm - -;# Process U,V in parallel. 
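hread_uv/hwrite_uv above use the _B_hihi/_B_lolo selector tables from the data section to splice the active halves of a U row and a V row into one 16-byte vector, so the luma filter macros run on both chroma planes at once. A scalar sketch of the read side (hypothetical name; off is the row origin mod 16, either 0 or 8, as chosen by active_chroma_sel):

    #include <stdint.h>
    #include <string.h>

    /* Fuse 8 active U pels and 8 active V pels into one 16-byte lane,
     * mirroring vperm with _B_hihi (off = 0) or _B_lolo (off = 8);
     * u_block/v_block are the aligned 16-byte blocks that lvx fetches. */
    static void read_uv(uint8_t dest[16], const uint8_t *u_block,
                        const uint8_t *v_block, int off) {
        memcpy(dest + 0, u_block + off, 8);
        memcpy(dest + 8, v_block + off, 8);
    }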
-.macro load_chroma_h - neg r9, r5 ;# r9 = -1 * stride - add r8, r9, r9 ;# r8 = -2 * stride - add r10, r5, r5 ;# r10 = 2 * stride - - active_chroma_sel v12 - - ;# P3, Q3 are read-only; need not save addresses or sibling pels - add r6, r8, r8 ;# r6 = -4 * stride - hread_uv v0, v14, v15, r6, v12 - add r6, r10, r5 ;# r6 = 3 * stride - hread_uv v7, v14, v15, r6, v12 - - ;# Others are read/write; save addresses and sibling pels - - add r6, r8, r9 ;# r6 = -3 * stride - hread_uv v1, v16, v17, r6, v12 - hread_uv v2, v18, v19, r8, v12 - hread_uv v3, v20, v21, r9, v12 - hread_uv v4, v22, v23, 0, v12 - hread_uv v5, v24, v25, r5, v12 - hread_uv v6, v26, v27, r10, v12 -.endm - -.macro uresult_sel V - load_g \V, 4(r7) -.endm - -.macro vresult_sel V - load_g \V, 8(r7) -.endm - -;# always write P1,P0,Q0,Q1 -.macro store_chroma_h - uresult_sel v11 - vresult_sel v12 - hwrite_uv v2, v18, v19, r8, v11, v12 - hwrite_uv v3, v20, v21, r9, v11, v12 - hwrite_uv v4, v22, v23, 0, v11, v12 - hwrite_uv v5, v24, v25, r5, v11, v12 -.endm - - .align 2 -;# r3 unsigned char *u -;# r4 unsigned char *v -;# r5 int p -;# r6 const signed char *flimit -;# r7 const signed char *limit -;# r8 const signed char *thresh -mbloop_filter_horizontal_edge_uv_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xffff - ori r12, r12, 0xffff - mtspr 256, r12 ;# set VRSAVE - - build_constants r6, r7, r8, v8, v9, v10 - - load_chroma_h - - vp8_mbfilter - - store_chroma_h - - hwrite_uv v1, v16, v17, r6, v11, v12 ;# v1 == P2 - hwrite_uv v6, v26, v27, r10, v11, v12 ;# v6 == Q2 - - mtspr 256, r11 ;# reset old VRSAVE - - blr - - .align 2 -;# r3 unsigned char *u -;# r4 unsigned char *v -;# r5 int p -;# r6 const signed char *flimit -;# r7 const signed char *limit -;# r8 const signed char *thresh -loop_filter_horizontal_edge_uv_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xffff - ori r12, r12, 0xffff - mtspr 256, r12 ;# set VRSAVE - - build_constants r6, r7, r8, v8, v9, v10 - - load_chroma_h - - SBFilter - - store_chroma_h - - mtspr 256, r11 ;# reset old VRSAVE - - blr - -.macro R V, R - lwzux r0, r3, r5 - stw r0, 4(\R) - lwz r0,-4(r3) - stw r0, 0(\R) - lwzux r0, r4, r5 - stw r0,12(\R) - lwz r0,-4(r4) - stw r0, 8(\R) - lvx \V, 0, \R -.endm - - -.macro W V, R - stvx \V, 0, \R - lwz r0,12(\R) - stwux r0, r4, r5 - lwz r0, 8(\R) - stw r0,-4(r4) - lwz r0, 4(\R) - stwux r0, r3, r5 - lwz r0, 0(\R) - stw r0,-4(r3) -.endm - -.macro chroma_vread R - sub r3, r3, r5 ;# back up one line for simplicity - sub r4, r4, r5 - - R v0, \R - R v1, \R - R v2, \R - R v3, \R - R v4, \R - R v5, \R - R v6, \R - R v7, \R - - transpose8x16_fwd -.endm - -.macro chroma_vwrite R - - transpose8x16_inv - - add r3, r3, r5 - add r4, r4, r5 - neg r5, r5 ;# Write rows back in reverse order - - W v17, \R - W v16, \R - W v15, \R - W v14, \R - W v13, \R - W v12, \R - W v11, \R - W v10, \R -.endm - - .align 2 -;# r3 unsigned char *u -;# r4 unsigned char *v -;# r5 int p -;# r6 const signed char *flimit -;# r7 const signed char *limit -;# r8 const signed char *thresh -mbloop_filter_vertical_edge_uv_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xffff - ori r12, r12, 0xc000 - mtspr 256, r12 ;# set VRSAVE - - la r9, -48(r1) ;# temporary space for reading in vectors - - chroma_vread r9 - - build_constants r6, r7, r8, v8, v9, v10 - - vp8_mbfilter - - chroma_vwrite r9 - - mtspr 256, r11 ;# reset old VRSAVE - - blr - - .align 2 -;# r3 unsigned char *u -;# r4 unsigned char *v -;# r5 int p -;# r6 const signed char *flimit -;# r7 const signed char *limit -;# r8 const signed 
char *thresh -loop_filter_vertical_edge_uv_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xffff - ori r12, r12, 0xc000 - mtspr 256, r12 ;# set VRSAVE - - la r9, -48(r1) ;# temporary space for reading in vectors - - chroma_vread r9 - - build_constants r6, r7, r8, v8, v9, v10 - - SBFilter - - chroma_vwrite r9 - - mtspr 256, r11 ;# reset old VRSAVE - - blr - -;# -=-=-=-=-=-=-=-=-=-=-=-=-=-= SIMPLE LOOP FILTER =-=-=-=-=-=-=-=-=-=-=-=-=-=- - -.macro vp8_simple_filter - Abs v14, v13, v1, v2 ;# M = abs( P0 - Q0) - vcmpgtub v8, v14, v8 ;# v5 = true if _over_ limit - - ;# preserve unsigned v0 and v3 - common_adjust v1, v2, v0, v3, 0 - - vxor v1, v1, v11 - vxor v2, v2, v11 ;# cvt Q0, P0 back to pels -.endm - -.macro simple_vertical - addi r8, 0, 16 - addi r7, r5, 32 - - lvx v0, 0, r5 - lvx v1, r8, r5 - lvx v2, 0, r7 - lvx v3, r8, r7 - - lis r12, _B_hihi@ha - la r0, _B_hihi@l(r12) - lvx v16, 0, r0 - - lis r12, _B_lolo@ha - la r0, _B_lolo@l(r12) - lvx v17, 0, r0 - - Transpose4times4x4 v16, v17 - vp8_simple_filter - - vxor v0, v0, v11 - vxor v3, v3, v11 ;# cvt Q0, P0 back to pels - - Transpose4times4x4 v16, v17 - - stvx v0, 0, r5 - stvx v1, r8, r5 - stvx v2, 0, r7 - stvx v3, r8, r7 -.endm - - .align 2 -;# r3 unsigned char *s -;# r4 int p -;# r5 const signed char *flimit -loop_filter_simple_horizontal_edge_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xffff - mtspr 256, r12 ;# set VRSAVE - - ;# build constants - lvx v8, 0, r5 ;# flimit - - vspltisb v11, 8 - vspltisb v12, 4 - vslb v11, v11, v12 ;# 0x80808080808080808080808080808080 - - neg r5, r4 ;# r5 = -1 * stride - add r6, r5, r5 ;# r6 = -2 * stride - - lvx v0, r6, r3 ;# v0 = P1 = 16 pels two rows above edge - lvx v1, r5, r3 ;# v1 = P0 = 16 pels one row above edge - lvx v2, 0, r3 ;# v2 = Q0 = 16 pels one row below edge - lvx v3, r4, r3 ;# v3 = Q1 = 16 pels two rows below edge - - vp8_simple_filter - - stvx v1, r5, r3 ;# store P0 - stvx v2, 0, r3 ;# store Q0 - - mtspr 256, r11 ;# reset old VRSAVE - - blr - -.macro RLV Offs - stw r0, (\Offs*4)(r5) - lwzux r0, r7, r4 -.endm - -.macro WLV Offs - lwz r0, (\Offs*4)(r5) - stwux r0, r7, r4 -.endm - - .align 2 -;# r3 unsigned char *s -;# r4 int p -;# r5 const signed char *flimit -loop_filter_simple_vertical_edge_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xffff - ori r12, r12, 0xc000 - mtspr 256, r12 ;# set VRSAVE - - ;# build constants - lvx v8, 0, r5 ;# flimit - - vspltisb v11, 8 - vspltisb v12, 4 - vslb v11, v11, v12 ;# 0x80808080808080808080808080808080 - - la r5, -96(r1) ;# temporary space for reading in vectors - - ;# Store 4 pels at word "Offs" in temp array, then advance r7 - ;# to next row and read another 4 pels from the frame buffer. - - subi r7, r3, 2 ;# r7 -> 2 pels before start - lwzx r0, 0, r7 ;# read first 4 pels - - ;# 16 unaligned word accesses - RLV 0 - RLV 4 - RLV 8 - RLV 12 - RLV 1 - RLV 5 - RLV 9 - RLV 13 - RLV 2 - RLV 6 - RLV 10 - RLV 14 - RLV 3 - RLV 7 - RLV 11 - - stw r0, (15*4)(r5) ;# write last 4 pels - - simple_vertical - - ;# Read temp array, write frame buffer. 
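The RLV offsets above scatter row r's four pels to word (r % 4) * 4 + r / 4 of the temp array; reading the array back as four vectors then yields exactly the v0..v3 layout described before Transpose4times4x4, which finishes the transpose. A scalar sketch of the gather (hypothetical name):

    #include <stdint.h>
    #include <string.h>

    /* Gather 16 rows of 4 pels (starting 2 pels left of the edge) into an
     * aligned temp array, scattering row r to word (r % 4) * 4 + r / 4 as
     * the RLV sequence does; lvx can then read it 16 bytes at a time. */
    static void gather_4x16(const uint8_t *s, int pitch, uint32_t temp[16]) {
        for (int r = 0; r < 16; r++) {
            uint32_t w;
            memcpy(&w, s - 2, 4);              /* lwzx of 4 pels      */
            temp[(r % 4) * 4 + r / 4] = w;     /* stw at RLV offset   */
            s += pitch;
        }
    }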
-    subi    r7, r3, 2               ;# r7 -> 2 pels before start
-    lwzx    r0, 0, r5               ;# read/write first 4 pels
-    stwx    r0, 0, r7
-
-    WLV  4
-    WLV  8
-    WLV 12
-    WLV  1
-    WLV  5
-    WLV  9
-    WLV 13
-    WLV  2
-    WLV  6
-    WLV 10
-    WLV 14
-    WLV  3
-    WLV  7
-    WLV 11
-    WLV 15
-
-    mtspr   256, r11                ;# reset old VRSAVE
-
-    blr
-
-    .data
-
-_chromaSelectors:
-    .long _B_hihi
-    .long _B_Ures0
-    .long _B_Vres0
-    .long 0
-    .long _B_lolo
-    .long _B_Ures8
-    .long _B_Vres8
-    .long 0
-
-    .align 4
-_B_Vres8:
-    .byte 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15
-
-    .align 4
-_B_Ures8:
-    .byte 16, 17, 18, 19, 20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7
-
-    .align 4
-_B_lolo:
-    .byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
-
-    .align 4
-_B_Vres0:
-    .byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
-    .align 4
-_B_Ures0:
-    .byte 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-
-    .align 4
-_B_hihi:
-    .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
diff --git a/vp9/common/ppc/vp9_platform_altivec.asm b/vp9/common/ppc/vp9_platform_altivec.asm
deleted file mode 100644
index f81d86f..0000000
--- a/vp9/common/ppc/vp9_platform_altivec.asm
+++ /dev/null
@@ -1,59 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl save_platform_context
-    .globl restore_platform_context
-
-.macro W V P
-    stvx    \V, 0, \P
-    addi    \P, \P, 16
-.endm
-
-.macro R V P
-    lvx     \V, 0, \P
-    addi    \P, \P, 16
-.endm
-
-;# r3 context_ptr
-    .align 2
-save_platform_context:
-    W v20, r3
-    W v21, r3
-    W v22, r3
-    W v23, r3
-    W v24, r3
-    W v25, r3
-    W v26, r3
-    W v27, r3
-    W v28, r3
-    W v29, r3
-    W v30, r3
-    W v31, r3
-
-    blr
-
-;# r3 context_ptr
-    .align 2
-restore_platform_context:
-    R v20, r3
-    R v21, r3
-    R v22, r3
-    R v23, r3
-    R v24, r3
-    R v25, r3
-    R v26, r3
-    R v27, r3
-    R v28, r3
-    R v29, r3
-    R v30, r3
-    R v31, r3
-
-    blr
diff --git a/vp9/common/ppc/vp9_recon_altivec.asm b/vp9/common/ppc/vp9_recon_altivec.asm
deleted file mode 100644
index dd39e05..0000000
--- a/vp9/common/ppc/vp9_recon_altivec.asm
+++ /dev/null
@@ -1,175 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-; - - - .globl recon4b_ppc - .globl recon2b_ppc - .globl recon_b_ppc - -.macro row_of16 Diff Pred Dst Stride - lvx v1, 0, \Pred ;# v1 = pred = p0..p15 - addi \Pred, \Pred, 16 ;# next pred - vmrghb v2, v0, v1 ;# v2 = 16-bit p0..p7 - lvx v3, 0, \Diff ;# v3 = d0..d7 - vaddshs v2, v2, v3 ;# v2 = r0..r7 - vmrglb v1, v0, v1 ;# v1 = 16-bit p8..p15 - lvx v3, r8, \Diff ;# v3 = d8..d15 - addi \Diff, \Diff, 32 ;# next diff - vaddshs v3, v3, v1 ;# v3 = r8..r15 - vpkshus v2, v2, v3 ;# v2 = 8-bit r0..r15 - stvx v2, 0, \Dst ;# to dst - add \Dst, \Dst, \Stride ;# next dst -.endm - - .text - .align 2 -;# r3 = short *diff_ptr, -;# r4 = unsigned char *pred_ptr, -;# r5 = unsigned char *dst_ptr, -;# r6 = int stride -recon4b_ppc: - mfspr r0, 256 ;# get old VRSAVE - stw r0, -8(r1) ;# save old VRSAVE to stack - oris r0, r0, 0xf000 - mtspr 256,r0 ;# set VRSAVE - - vxor v0, v0, v0 - li r8, 16 - - row_of16 r3, r4, r5, r6 - row_of16 r3, r4, r5, r6 - row_of16 r3, r4, r5, r6 - row_of16 r3, r4, r5, r6 - - lwz r12, -8(r1) ;# restore old VRSAVE from stack - mtspr 256, r12 ;# reset old VRSAVE - - blr - -.macro two_rows_of8 Diff Pred Dst Stride write_first_four_pels - lvx v1, 0, \Pred ;# v1 = pred = p0..p15 - vmrghb v2, v0, v1 ;# v2 = 16-bit p0..p7 - lvx v3, 0, \Diff ;# v3 = d0..d7 - vaddshs v2, v2, v3 ;# v2 = r0..r7 - vmrglb v1, v0, v1 ;# v1 = 16-bit p8..p15 - lvx v3, r8, \Diff ;# v2 = d8..d15 - vaddshs v3, v3, v1 ;# v3 = r8..r15 - vpkshus v2, v2, v3 ;# v3 = 8-bit r0..r15 - stvx v2, 0, r10 ;# 2 rows to dst from buf - lwz r0, 0(r10) -.if \write_first_four_pels - stw r0, 0(\Dst) - .else - stwux r0, \Dst, \Stride -.endif - lwz r0, 4(r10) - stw r0, 4(\Dst) - lwz r0, 8(r10) - stwux r0, \Dst, \Stride ;# advance dst to next row - lwz r0, 12(r10) - stw r0, 4(\Dst) -.endm - - .align 2 -;# r3 = short *diff_ptr, -;# r4 = unsigned char *pred_ptr, -;# r5 = unsigned char *dst_ptr, -;# r6 = int stride - -recon2b_ppc: - mfspr r0, 256 ;# get old VRSAVE - stw r0, -8(r1) ;# save old VRSAVE to stack - oris r0, r0, 0xf000 - mtspr 256,r0 ;# set VRSAVE - - vxor v0, v0, v0 - li r8, 16 - - la r10, -48(r1) ;# buf - - two_rows_of8 r3, r4, r5, r6, 1 - - addi r4, r4, 16; ;# next pred - addi r3, r3, 32; ;# next diff - - two_rows_of8 r3, r4, r5, r6, 0 - - lwz r12, -8(r1) ;# restore old VRSAVE from stack - mtspr 256, r12 ;# reset old VRSAVE - - blr - -.macro get_two_diff_rows - stw r0, 0(r10) - lwz r0, 4(r3) - stw r0, 4(r10) - lwzu r0, 32(r3) - stw r0, 8(r10) - lwz r0, 4(r3) - stw r0, 12(r10) - lvx v3, 0, r10 -.endm - - .align 2 -;# r3 = short *diff_ptr, -;# r4 = unsigned char *pred_ptr, -;# r5 = unsigned char *dst_ptr, -;# r6 = int stride -recon_b_ppc: - mfspr r0, 256 ;# get old VRSAVE - stw r0, -8(r1) ;# save old VRSAVE to stack - oris r0, r0, 0xf000 - mtspr 256,r0 ;# set VRSAVE - - vxor v0, v0, v0 - - la r10, -48(r1) ;# buf - - lwz r0, 0(r4) - stw r0, 0(r10) - lwz r0, 16(r4) - stw r0, 4(r10) - lwz r0, 32(r4) - stw r0, 8(r10) - lwz r0, 48(r4) - stw r0, 12(r10) - - lvx v1, 0, r10; ;# v1 = pred = p0..p15 - - lwz r0, 0(r3) ;# v3 = d0..d7 - - get_two_diff_rows - - vmrghb v2, v0, v1; ;# v2 = 16-bit p0..p7 - vaddshs v2, v2, v3; ;# v2 = r0..r7 - - lwzu r0, 32(r3) ;# v3 = d8..d15 - - get_two_diff_rows - - vmrglb v1, v0, v1; ;# v1 = 16-bit p8..p15 - vaddshs v3, v3, v1; ;# v3 = r8..r15 - - vpkshus v2, v2, v3; ;# v2 = 8-bit r0..r15 - stvx v2, 0, r10; ;# 16 pels to dst from buf - - lwz r0, 0(r10) - stw r0, 0(r5) - lwz r0, 4(r10) - stwux r0, r5, r6 - lwz r0, 8(r10) - stwux r0, r5, r6 - lwz r0, 12(r10) - stwx r0, r5, r6 - - lwz r12, -8(r1) ;# restore old VRSAVE 
from stack - mtspr 256, r12 ;# reset old VRSAVE - - blr diff --git a/vp9/common/ppc/vp9_systemdependent.c b/vp9/common/ppc/vp9_systemdependent.c deleted file mode 100644 index a6be550..0000000 --- a/vp9/common/ppc/vp9_systemdependent.c +++ /dev/null @@ -1,167 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "vp9/common/vp9_loopfilter.h" -#include "recon.h" -#include "vp9/common/vp9_onyxc_int.h" - -void (*vp8_short_idct4x4)(short *input, short *output, int pitch); -void (*vp8_short_idct4x4_1)(short *input, short *output, int pitch); -void (*vp8_dc_only_idct)(short input_dc, short *output, int pitch); - -extern void (*vp9_post_proc_down_and_across)(unsigned char *src_ptr, - unsigned char *dst_ptr, - int src_pixels_per_line, - int dst_pixels_per_line, - int rows, int cols, int flimit); - -extern void (*vp9_mbpost_proc_down)(unsigned char *dst, int pitch, - int rows, int cols, int flimit); -extern void vp9_mbpost_proc_down_c(unsigned char *dst, int pitch, - int rows, int cols, int flimit); -extern void (*vp9_mbpost_proc_across_ip)(unsigned char *src, int pitch, - int rows, int cols, int flimit); -extern void vp9_mbpost_proc_across_ip_c(unsigned char *src, int pitch, - int rows, int cols, int flimit); -extern void vp9_post_proc_down_and_across_c(unsigned char *src_ptr, - unsigned char *dst_ptr, - int src_pixels_per_line, - int dst_pixels_per_line, - int rows, int cols, int flimit); -void vp9_plane_add_noise_c(unsigned char *start, - unsigned int width, unsigned int height, - int pitch, int q, int a); - -extern copy_mem_block_function *vp9_copy_mem16x16; -extern copy_mem_block_function *vp9_copy_mem8x8; -extern copy_mem_block_function *vp9_copy_mem8x4; - -// PPC -extern subpixel_predict_function sixtap_predict_ppc; -extern subpixel_predict_function sixtap_predict8x4_ppc; -extern subpixel_predict_function sixtap_predict8x8_ppc; -extern subpixel_predict_function sixtap_predict16x16_ppc; -extern subpixel_predict_function bilinear_predict4x4_ppc; -extern subpixel_predict_function bilinear_predict8x4_ppc; -extern subpixel_predict_function bilinear_predict8x8_ppc; -extern subpixel_predict_function bilinear_predict16x16_ppc; - -extern copy_mem_block_function copy_mem16x16_ppc; - -void recon_b_ppc(short *diff_ptr, unsigned char *pred_ptr, - unsigned char *dst_ptr, int stride); -void recon2b_ppc(short *diff_ptr, unsigned char *pred_ptr, - unsigned char *dst_ptr, int stride); -void recon4b_ppc(short *diff_ptr, unsigned char *pred_ptr, - unsigned char *dst_ptr, int stride); - -extern void short_idct4x4_ppc(short *input, short *output, int pitch); - -// Generic C -extern subpixel_predict_function vp9_sixtap_predict_c; -extern subpixel_predict_function vp9_sixtap_predict8x4_c; -extern subpixel_predict_function vp9_sixtap_predict8x8_c; -extern subpixel_predict_function vp9_sixtap_predict16x16_c; -extern subpixel_predict_function vp9_bilinear_predict4x4_c; -extern subpixel_predict_function vp9_bilinear_predict8x4_c; -extern subpixel_predict_function vp9_bilinear_predict8x8_c; -extern subpixel_predict_function vp9_bilinear_predict16x16_c; - -extern copy_mem_block_function vp9_copy_mem16x16_c; -extern copy_mem_block_function 
vp9_copy_mem8x8_c; -extern copy_mem_block_function vp9_copy_mem8x4_c; - -void vp9_recon_b_c(short *diff_ptr, unsigned char *pred_ptr, - unsigned char *dst_ptr, int stride); -void vp9_recon2b_c(short *diff_ptr, unsigned char *pred_ptr, - unsigned char *dst_ptr, int stride); -void vp9_recon4b_c(short *diff_ptr, unsigned char *pred_ptr, - unsigned char *dst_ptr, int stride); - -extern void vp9_short_idct4x4_1_c(short *input, short *output, int pitch); -extern void vp9_short_idct4x4_c(short *input, short *output, int pitch); -extern void vp8_dc_only_idct_c(short input_dc, short *output, int pitch); - -// PPC -extern loop_filter_block_function loop_filter_mbv_ppc; -extern loop_filter_block_function loop_filter_bv_ppc; -extern loop_filter_block_function loop_filter_mbh_ppc; -extern loop_filter_block_function loop_filter_bh_ppc; - -extern loop_filter_block_function loop_filter_mbvs_ppc; -extern loop_filter_block_function loop_filter_bvs_ppc; -extern loop_filter_block_function loop_filter_mbhs_ppc; -extern loop_filter_block_function loop_filter_bhs_ppc; - -// Generic C -extern loop_filter_block_function vp9_loop_filter_mbv_c; -extern loop_filter_block_function vp9_loop_filter_bv_c; -extern loop_filter_block_function vp9_loop_filter_mbh_c; -extern loop_filter_block_function vp9_loop_filter_bh_c; - -extern loop_filter_block_function vp9_loop_filter_mbvs_c; -extern loop_filter_block_function vp9_loop_filter_bvs_c; -extern loop_filter_block_function vp9_loop_filter_mbhs_c; -extern loop_filter_block_function vp9_loop_filter_bhs_c; - -extern loop_filter_block_function *vp8_lf_mbvfull; -extern loop_filter_block_function *vp8_lf_mbhfull; -extern loop_filter_block_function *vp8_lf_bvfull; -extern loop_filter_block_function *vp8_lf_bhfull; - -extern loop_filter_block_function *vp8_lf_mbvsimple; -extern loop_filter_block_function *vp8_lf_mbhsimple; -extern loop_filter_block_function *vp8_lf_bvsimple; -extern loop_filter_block_function *vp8_lf_bhsimple; - -void vp9_clear_c(void) { -} - -void vp9_machine_specific_config(void) { - // Pure C: - vp9_clear_system_state = vp9_clear_c; - vp9_recon_b = vp9_recon_b_c; - vp9_recon4b = vp9_recon4b_c; - vp9_recon2b = vp9_recon2b_c; - - vp9_bilinear_predict16x16 = bilinear_predict16x16_ppc; - vp9_bilinear_predict8x8 = bilinear_predict8x8_ppc; - vp9_bilinear_predict8x4 = bilinear_predict8x4_ppc; - vp8_bilinear_predict = bilinear_predict4x4_ppc; - - vp9_sixtap_predict16x16 = sixtap_predict16x16_ppc; - vp9_sixtap_predict8x8 = sixtap_predict8x8_ppc; - vp9_sixtap_predict8x4 = sixtap_predict8x4_ppc; - vp9_sixtap_predict = sixtap_predict_ppc; - - vp8_short_idct4x4_1 = vp9_short_idct4x4_1_c; - vp8_short_idct4x4 = short_idct4x4_ppc; - vp8_dc_only_idct = vp8_dc_only_idct_c; - - vp8_lf_mbvfull = loop_filter_mbv_ppc; - vp8_lf_bvfull = loop_filter_bv_ppc; - vp8_lf_mbhfull = loop_filter_mbh_ppc; - vp8_lf_bhfull = loop_filter_bh_ppc; - - vp8_lf_mbvsimple = loop_filter_mbvs_ppc; - vp8_lf_bvsimple = loop_filter_bvs_ppc; - vp8_lf_mbhsimple = loop_filter_mbhs_ppc; - vp8_lf_bhsimple = loop_filter_bhs_ppc; - - vp9_post_proc_down_and_across = vp9_post_proc_down_and_across_c; - vp9_mbpost_proc_down = vp9_mbpost_proc_down_c; - vp9_mbpost_proc_across_ip = vp9_mbpost_proc_across_ip_c; - vp9_plane_add_noise = vp9_plane_add_noise_c; - - vp9_copy_mem16x16 = copy_mem16x16_ppc; - vp9_copy_mem8x8 = vp9_copy_mem8x8_c; - vp9_copy_mem8x4 = vp9_copy_mem8x4_c; - -} diff --git a/vp9/encoder/ppc/vp9_csystemdependent.c b/vp9/encoder/ppc/vp9_csystemdependent.c deleted file mode 100644 index cc67625..0000000 --- 
a/vp9/encoder/ppc/vp9_csystemdependent.c +++ /dev/null @@ -1,155 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include "vp9/encoder/vp9_variance.h" -#include "vp9/encoder/vp9_onyx_int.h" - -SADFunction *vp9_sad16x16; -SADFunction *vp9_sad16x8; -SADFunction *vp9_sad8x16; -SADFunction *vp9_sad8x8; -SADFunction *vp9_sad4x4; - -variance_function *vp9_variance4x4; -variance_function *vp9_variance8x8; -variance_function *vp9_variance8x16; -variance_function *vp9_variance16x8; -variance_function *vp9_variance16x16; - -variance_function *vp9_mse16x16; - -sub_pixel_variance_function *vp9_sub_pixel_variance4x4; -sub_pixel_variance_function *vp9_sub_pixel_variance8x8; -sub_pixel_variance_function *vp9_sub_pixel_variance8x16; -sub_pixel_variance_function *vp9_sub_pixel_variance16x8; -sub_pixel_variance_function *vp9_sub_pixel_variance16x16; - -int (*vp9_block_error)(short *coeff, short *dqcoeff); -int (*vp9_mbblock_error)(MACROBLOCK *mb, int dc); - -int (*vp9_mbuverror)(MACROBLOCK *mb); -unsigned int (*vp9_get_mb_ss)(short *); -void (*vp9_short_fdct4x4)(short *input, short *output, int pitch); -void (*vp9_short_fdct8x4)(short *input, short *output, int pitch); -void (*vp8_fast_fdct4x4)(short *input, short *output, int pitch); -void (*vp8_fast_fdct8x4)(short *input, short *output, int pitch); -void (*short_walsh4x4)(short *input, short *output, int pitch); - -void (*vp9_subtract_b)(BLOCK *be, BLOCKD *bd, int pitch); -void (*vp9_subtract_mby)(short *diff, unsigned char *src, unsigned char *pred, int stride); -void (*vp9_subtract_mbuv)(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride); -void (*vp8_fast_quantize_b)(BLOCK *b, BLOCKD *d); - -// c imports -extern int block_error_c(short *coeff, short *dqcoeff); -extern int vp9_mbblock_error_c(MACROBLOCK *mb, int dc); - -extern int vp9_mbuverror_c(MACROBLOCK *mb); -extern unsigned int vp8_get8x8var_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum); -extern void short_fdct4x4_c(short *input, short *output, int pitch); -extern void short_fdct8x4_c(short *input, short *output, int pitch); -extern void vp9_short_walsh4x4_c(short *input, short *output, int pitch); - -extern void vp9_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch); -extern void subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, int stride); -extern void subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride); -extern void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d); - -extern SADFunction sad16x16_c; -extern SADFunction sad16x8_c; -extern SADFunction sad8x16_c; -extern SADFunction sad8x8_c; -extern SADFunction sad4x4_c; - -extern variance_function variance16x16_c; -extern variance_function variance8x16_c; -extern variance_function variance16x8_c; -extern variance_function variance8x8_c; -extern variance_function variance4x4_c; -extern variance_function mse16x16_c; - -extern sub_pixel_variance_function sub_pixel_variance4x4_c; -extern sub_pixel_variance_function sub_pixel_variance8x8_c; -extern sub_pixel_variance_function sub_pixel_variance8x16_c; -extern 
sub_pixel_variance_function sub_pixel_variance16x8_c; -extern sub_pixel_variance_function sub_pixel_variance16x16_c; - -extern unsigned int vp9_get_mb_ss_c(short *); - -// ppc -extern int vp9_block_error_ppc(short *coeff, short *dqcoeff); - -extern void vp9_short_fdct4x4_ppc(short *input, short *output, int pitch); -extern void vp9_short_fdct8x4_ppc(short *input, short *output, int pitch); - -extern void vp9_subtract_mby_ppc(short *diff, unsigned char *src, unsigned char *pred, int stride); -extern void vp9_subtract_mbuv_ppc(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride); - -extern SADFunction vp9_sad16x16_ppc; -extern SADFunction vp9_sad16x8_ppc; -extern SADFunction vp9_sad8x16_ppc; -extern SADFunction vp9_sad8x8_ppc; -extern SADFunction vp9_sad4x4_ppc; - -extern variance_function vp9_variance16x16_ppc; -extern variance_function vp9_variance8x16_ppc; -extern variance_function vp9_variance16x8_ppc; -extern variance_function vp9_variance8x8_ppc; -extern variance_function vp9_variance4x4_ppc; -extern variance_function vp9_mse16x16_ppc; - -extern sub_pixel_variance_function vp9_sub_pixel_variance4x4_ppc; -extern sub_pixel_variance_function vp9_sub_pixel_variance8x8_ppc; -extern sub_pixel_variance_function vp9_sub_pixel_variance8x16_ppc; -extern sub_pixel_variance_function vp9_sub_pixel_variance16x8_ppc; -extern sub_pixel_variance_function vp9_sub_pixel_variance16x16_ppc; - -extern unsigned int vp8_get8x8var_ppc(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum); -extern unsigned int vp8_get16x16var_ppc(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum); - -void vp9_cmachine_specific_config(void) { - // Pure C: - vp9_mbuverror = vp9_mbuverror_c; - vp8_fast_quantize_b = vp8_fast_quantize_b_c; - vp9_short_fdct4x4 = vp9_short_fdct4x4_ppc; - vp9_short_fdct8x4 = vp9_short_fdct8x4_ppc; - vp8_fast_fdct4x4 = vp9_short_fdct4x4_ppc; - vp8_fast_fdct8x4 = vp9_short_fdct8x4_ppc; - short_walsh4x4 = vp9_short_walsh4x4_c; - - vp9_variance4x4 = vp9_variance4x4_ppc; - vp9_variance8x8 = vp9_variance8x8_ppc; - vp9_variance8x16 = vp9_variance8x16_ppc; - vp9_variance16x8 = vp9_variance16x8_ppc; - vp9_variance16x16 = vp9_variance16x16_ppc; - vp9_mse16x16 = vp9_mse16x16_ppc; - - vp9_sub_pixel_variance4x4 = vp9_sub_pixel_variance4x4_ppc; - vp9_sub_pixel_variance8x8 = vp9_sub_pixel_variance8x8_ppc; - vp9_sub_pixel_variance8x16 = vp9_sub_pixel_variance8x16_ppc; - vp9_sub_pixel_variance16x8 = vp9_sub_pixel_variance16x8_ppc; - vp9_sub_pixel_variance16x16 = vp9_sub_pixel_variance16x16_ppc; - - vp9_get_mb_ss = vp9_get_mb_ss_c; - - vp9_sad16x16 = vp9_sad16x16_ppc; - vp9_sad16x8 = vp9_sad16x8_ppc; - vp9_sad8x16 = vp9_sad8x16_ppc; - vp9_sad8x8 = vp9_sad8x8_ppc; - vp9_sad4x4 = vp9_sad4x4_ppc; - - vp9_block_error = vp9_block_error_ppc; - vp9_mbblock_error = vp9_mbblock_error_c; - - vp9_subtract_b = vp9_subtract_b_c; - vp9_subtract_mby = vp9_subtract_mby_ppc; - vp9_subtract_mbuv = vp9_subtract_mbuv_ppc; -} diff --git a/vp9/encoder/ppc/vp9_encodemb_altivec.asm b/vp9/encoder/ppc/vp9_encodemb_altivec.asm deleted file mode 100644 index 6e0099d..0000000 --- a/vp9/encoder/ppc/vp9_encodemb_altivec.asm +++ /dev/null @@ -1,153 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. 
An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - .globl vp8_subtract_mbuv_ppc - .globl vp8_subtract_mby_ppc - -;# r3 short *diff -;# r4 unsigned char *usrc -;# r5 unsigned char *vsrc -;# r6 unsigned char *pred -;# r7 int stride -vp8_subtract_mbuv_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xf000 - mtspr 256, r12 ;# set VRSAVE - - li r9, 256 - add r3, r3, r9 - add r3, r3, r9 - add r6, r6, r9 - - li r10, 16 - li r9, 4 - mtctr r9 - - vspltisw v0, 0 - -mbu_loop: - lvsl v5, 0, r4 ;# permutate value for alignment - lvx v1, 0, r4 ;# src - lvx v2, 0, r6 ;# pred - - add r4, r4, r7 - addi r6, r6, 16 - - vperm v1, v1, v0, v5 - - vmrghb v3, v0, v1 ;# unpack high src to short - vmrghb v4, v0, v2 ;# unpack high pred to short - - lvsl v5, 0, r4 ;# permutate value for alignment - lvx v1, 0, r4 ;# src - - add r4, r4, r7 - - vsubshs v3, v3, v4 - - stvx v3, 0, r3 ;# store out diff - - vperm v1, v1, v0, v5 - - vmrghb v3, v0, v1 ;# unpack high src to short - vmrglb v4, v0, v2 ;# unpack high pred to short - - vsubshs v3, v3, v4 - - stvx v3, r10, r3 ;# store out diff - - addi r3, r3, 32 - - bdnz mbu_loop - - mtctr r9 - -mbv_loop: - lvsl v5, 0, r5 ;# permutate value for alignment - lvx v1, 0, r5 ;# src - lvx v2, 0, r6 ;# pred - - add r5, r5, r7 - addi r6, r6, 16 - - vperm v1, v1, v0, v5 - - vmrghb v3, v0, v1 ;# unpack high src to short - vmrghb v4, v0, v2 ;# unpack high pred to short - - lvsl v5, 0, r5 ;# permutate value for alignment - lvx v1, 0, r5 ;# src - - add r5, r5, r7 - - vsubshs v3, v3, v4 - - stvx v3, 0, r3 ;# store out diff - - vperm v1, v1, v0, v5 - - vmrghb v3, v0, v1 ;# unpack high src to short - vmrglb v4, v0, v2 ;# unpack high pred to short - - vsubshs v3, v3, v4 - - stvx v3, r10, r3 ;# store out diff - - addi r3, r3, 32 - - bdnz mbv_loop - - mtspr 256, r11 ;# reset old VRSAVE - - blr - -;# r3 short *diff -;# r4 unsigned char *src -;# r5 unsigned char *pred -;# r6 int stride -vp8_subtract_mby_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xf800 - mtspr 256, r12 ;# set VRSAVE - - li r10, 16 - mtctr r10 - - vspltisw v0, 0 - -mby_loop: - lvx v1, 0, r4 ;# src - lvx v2, 0, r5 ;# pred - - add r4, r4, r6 - addi r5, r5, 16 - - vmrghb v3, v0, v1 ;# unpack high src to short - vmrghb v4, v0, v2 ;# unpack high pred to short - - vsubshs v3, v3, v4 - - stvx v3, 0, r3 ;# store out diff - - vmrglb v3, v0, v1 ;# unpack low src to short - vmrglb v4, v0, v2 ;# unpack low pred to short - - vsubshs v3, v3, v4 - - stvx v3, r10, r3 ;# store out diff - - addi r3, r3, 32 - - bdnz mby_loop - - mtspr 256, r11 ;# reset old VRSAVE - - blr diff --git a/vp9/encoder/ppc/vp9_fdct_altivec.asm b/vp9/encoder/ppc/vp9_fdct_altivec.asm deleted file mode 100644 index 935d0cb..0000000 --- a/vp9/encoder/ppc/vp9_fdct_altivec.asm +++ /dev/null @@ -1,205 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - - .globl vp8_short_fdct4x4_ppc - .globl vp8_short_fdct8x4_ppc - -.macro load_c V, LABEL, OFF, R0, R1 - lis \R0, \LABEL@ha - la \R1, \LABEL@l(\R0) - lvx \V, \OFF, \R1 -.endm - -;# Forward and inverse DCTs are nearly identical; only differences are -;# in normalization (fwd is twice unitary, inv is half unitary) -;# and that they are of course transposes of each other. -;# -;# The following three accomplish most of implementation and -;# are used only by ppc_idct.c and ppc_fdct.c. -.macro prologue - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xfffc - mtspr 256, r12 ;# set VRSAVE - - stwu r1,-32(r1) ;# create space on the stack - - li r6, 16 - - load_c v0, dct_tab, 0, r9, r10 - lvx v1, r6, r10 - addi r10, r10, 32 - lvx v2, 0, r10 - lvx v3, r6, r10 - - load_c v4, ppc_dctperm_tab, 0, r9, r10 - load_c v5, ppc_dctperm_tab, r6, r9, r10 - - load_c v6, round_tab, 0, r10, r9 -.endm - -.macro epilogue - addi r1, r1, 32 ;# recover stack - - mtspr 256, r11 ;# reset old VRSAVE -.endm - -;# Do horiz xf on two rows of coeffs v8 = a0 a1 a2 a3 b0 b1 b2 b3. -;# a/A are the even rows 0,2 b/B are the odd rows 1,3 -;# For fwd transform, indices are horizontal positions, then frequencies. -;# For inverse transform, frequencies then positions. -;# The two resulting A0..A3 B0..B3 are later combined -;# and vertically transformed. - -.macro two_rows_horiz Dst - vperm v9, v8, v8, v4 ;# v9 = a2 a3 a0 a1 b2 b3 b0 b1 - - vmsumshm v10, v0, v8, v6 - vmsumshm v10, v1, v9, v10 - vsraw v10, v10, v7 ;# v10 = A0 A1 B0 B1 - - vmsumshm v11, v2, v8, v6 - vmsumshm v11, v3, v9, v11 - vsraw v11, v11, v7 ;# v11 = A2 A3 B2 B3 - - vpkuwum v10, v10, v11 ;# v10 = A0 A1 B0 B1 A2 A3 B2 B3 - vperm \Dst, v10, v10, v5 ;# Dest = A0 B0 A1 B1 A2 B2 A3 B3 -.endm - -;# Vertical xf on two rows. DCT values in comments are for inverse transform; -;# forward transform uses transpose. 
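Each vmsumshm/vsraw pair in two_rows_horiz above computes, per 32-bit lane, a dot product of 16-bit coefficients against 16-bit samples, pre-biased by a round_tab constant from the data section and then arithmetically shifted. A scalar model (dct_lane is a hypothetical name; the chained pair of vmsumshm supplies four product terms per lane):

    #include <stdint.h>

    /* One output lane: acc = round + sum(c[i] * x[i]), then >> shift.
     * shift is 14 for the horizontal pass and 16 for the vertical pass,
     * matching round_tab's 1 << (14-1) and 1 << (16-1) entries. */
    static int32_t dct_lane(const int16_t c[4], const int16_t x[4], int shift) {
        int32_t acc = (int32_t)1 << (shift - 1);   /* round_tab, in v6      */
        for (int i = 0; i < 4; i++)
            acc += (int32_t)c[i] * x[i];           /* chained vmsumshm      */
        return acc >> shift;                       /* vsraw, shift in v7    */
    }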
- -.macro two_rows_vert Ceven, Codd - vspltw v8, \Ceven, 0 ;# v8 = c00 c10 or c02 c12 four times - vspltw v9, \Codd, 0 ;# v9 = c20 c30 or c22 c32 "" - vmsumshm v8, v8, v12, v6 - vmsumshm v8, v9, v13, v8 - vsraw v10, v8, v7 - - vspltw v8, \Codd, 1 ;# v8 = c01 c11 or c03 c13 - vspltw v9, \Ceven, 1 ;# v9 = c21 c31 or c23 c33 - vmsumshm v8, v8, v12, v6 - vmsumshm v8, v9, v13, v8 - vsraw v8, v8, v7 - - vpkuwum v8, v10, v8 ;# v8 = rows 0,1 or 2,3 -.endm - -.macro two_rows_h Dest - stw r0, 0(r8) - lwz r0, 4(r3) - stw r0, 4(r8) - lwzux r0, r3,r5 - stw r0, 8(r8) - lwz r0, 4(r3) - stw r0, 12(r8) - lvx v8, 0,r8 - two_rows_horiz \Dest -.endm - - .align 2 -;# r3 short *input -;# r4 short *output -;# r5 int pitch -vp8_short_fdct4x4_ppc: - - prologue - - vspltisw v7, 14 ;# == 14, fits in 5 signed bits - addi r8, r1, 0 - - - lwz r0, 0(r3) - two_rows_h v12 ;# v12 = H00 H10 H01 H11 H02 H12 H03 H13 - - lwzux r0, r3, r5 - two_rows_h v13 ;# v13 = H20 H30 H21 H31 H22 H32 H23 H33 - - lvx v6, r6, r9 ;# v6 = Vround - vspltisw v7, -16 ;# == 16 == -16, only low 5 bits matter - - two_rows_vert v0, v1 - stvx v8, 0, r4 - two_rows_vert v2, v3 - stvx v8, r6, r4 - - epilogue - - blr - - .align 2 -;# r3 short *input -;# r4 short *output -;# r5 int pitch -vp8_short_fdct8x4_ppc: - prologue - - vspltisw v7, 14 ;# == 14, fits in 5 signed bits - addi r8, r1, 0 - addi r10, r3, 0 - - lwz r0, 0(r3) - two_rows_h v12 ;# v12 = H00 H10 H01 H11 H02 H12 H03 H13 - - lwzux r0, r3, r5 - two_rows_h v13 ;# v13 = H20 H30 H21 H31 H22 H32 H23 H33 - - lvx v6, r6, r9 ;# v6 = Vround - vspltisw v7, -16 ;# == 16 == -16, only low 5 bits matter - - two_rows_vert v0, v1 - stvx v8, 0, r4 - two_rows_vert v2, v3 - stvx v8, r6, r4 - - ;# Next block - addi r3, r10, 8 - addi r4, r4, 32 - lvx v6, 0, r9 ;# v6 = Hround - - vspltisw v7, 14 ;# == 14, fits in 5 signed bits - addi r8, r1, 0 - - lwz r0, 0(r3) - two_rows_h v12 ;# v12 = H00 H10 H01 H11 H02 H12 H03 H13 - - lwzux r0, r3, r5 - two_rows_h v13 ;# v13 = H20 H30 H21 H31 H22 H32 H23 H33 - - lvx v6, r6, r9 ;# v6 = Vround - vspltisw v7, -16 ;# == 16 == -16, only low 5 bits matter - - two_rows_vert v0, v1 - stvx v8, 0, r4 - two_rows_vert v2, v3 - stvx v8, r6, r4 - - epilogue - - blr - - .data - .align 4 -ppc_dctperm_tab: - .byte 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 - .byte 0,1,4,5, 2,3,6,7, 8,9,12,13, 10,11,14,15 - - .align 4 -dct_tab: - .short 23170, 23170,-12540,-30274, 23170, 23170,-12540,-30274 - .short 23170, 23170, 30274, 12540, 23170, 23170, 30274, 12540 - - .short 23170,-23170, 30274,-12540, 23170,-23170, 30274,-12540 - .short -23170, 23170, 12540,-30274,-23170, 23170, 12540,-30274 - - .align 4 -round_tab: - .long (1 << (14-1)), (1 << (14-1)), (1 << (14-1)), (1 << (14-1)) - .long (1 << (16-1)), (1 << (16-1)), (1 << (16-1)), (1 << (16-1)) diff --git a/vp9/encoder/ppc/vp9_rdopt_altivec.asm b/vp9/encoder/ppc/vp9_rdopt_altivec.asm deleted file mode 100644 index ba48230..0000000 --- a/vp9/encoder/ppc/vp9_rdopt_altivec.asm +++ /dev/null @@ -1,51 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - - .globl vp8_block_error_ppc - - .align 2 -;# r3 short *Coeff -;# r4 short *dqcoeff -vp8_block_error_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xf800 - mtspr 256, r12 ;# set VRSAVE - - stwu r1,-32(r1) ;# create space on the stack - - stw r5, 12(r1) ;# tranfer dc to vector register - - lvx v0, 0, r3 ;# Coeff - lvx v1, 0, r4 ;# dqcoeff - - li r10, 16 - - vspltisw v3, 0 - - vsubshs v0, v0, v1 - - vmsumshm v2, v0, v0, v3 ;# multiply differences - - lvx v0, r10, r3 ;# Coeff - lvx v1, r10, r4 ;# dqcoeff - - vsubshs v0, v0, v1 - - vmsumshm v1, v0, v0, v2 ;# multiply differences - vsumsws v1, v1, v3 ;# sum up - - stvx v1, 0, r1 - lwz r3, 12(r1) ;# return value - - addi r1, r1, 32 ;# recover stack - mtspr 256, r11 ;# reset old VRSAVE - - blr diff --git a/vp9/encoder/ppc/vp9_sad_altivec.asm b/vp9/encoder/ppc/vp9_sad_altivec.asm deleted file mode 100644 index e5f2638..0000000 --- a/vp9/encoder/ppc/vp9_sad_altivec.asm +++ /dev/null @@ -1,277 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - .globl vp8_sad16x16_ppc - .globl vp8_sad16x8_ppc - .globl vp8_sad8x16_ppc - .globl vp8_sad8x8_ppc - .globl vp8_sad4x4_ppc - -.macro load_aligned_16 V R O - lvsl v3, 0, \R ;# permutate value for alignment - - lvx v1, 0, \R - lvx v2, \O, \R - - vperm \V, v1, v2, v3 -.endm - -.macro prologue - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xffc0 - mtspr 256, r12 ;# set VRSAVE - - stwu r1, -32(r1) ;# create space on the stack - - li r10, 16 ;# load offset and loop counter - - vspltisw v8, 0 ;# zero out total to start -.endm - -.macro epilogue - addi r1, r1, 32 ;# recover stack - - mtspr 256, r11 ;# reset old VRSAVE -.endm - -.macro SAD_16 - ;# v6 = abs (v4 - v5) - vsububs v6, v4, v5 - vsububs v7, v5, v4 - vor v6, v6, v7 - - ;# v8 += abs (v4 - v5) - vsum4ubs v8, v6, v8 -.endm - -.macro sad_16_loop loop_label - lvsl v3, 0, r5 ;# only needs to be done once per block - - ;# preload a line of data before getting into the loop - lvx v4, 0, r3 - lvx v1, 0, r5 - lvx v2, r10, r5 - - add r5, r5, r6 - add r3, r3, r4 - - vperm v5, v1, v2, v3 - - .align 4 -\loop_label: - ;# compute difference on first row - vsububs v6, v4, v5 - vsububs v7, v5, v4 - - ;# load up next set of data - lvx v9, 0, r3 - lvx v1, 0, r5 - lvx v2, r10, r5 - - ;# perform abs() of difference - vor v6, v6, v7 - add r3, r3, r4 - - ;# add to the running tally - vsum4ubs v8, v6, v8 - - ;# now onto the next line - vperm v5, v1, v2, v3 - add r5, r5, r6 - lvx v4, 0, r3 - - ;# compute difference on second row - vsububs v6, v9, v5 - lvx v1, 0, r5 - vsububs v7, v5, v9 - lvx v2, r10, r5 - vor v6, v6, v7 - add r3, r3, r4 - vsum4ubs v8, v6, v8 - vperm v5, v1, v2, v3 - add r5, r5, r6 - - bdnz \loop_label - - vspltisw v7, 0 - - vsumsws v8, v8, v7 - - stvx v8, 0, r1 - lwz r3, 12(r1) -.endm - -.macro sad_8_loop loop_label - .align 4 -\loop_label: - ;# only one of the inputs should need to be aligned. - load_aligned_16 v4, r3, r10 - load_aligned_16 v5, r5, r10 - - ;# move onto the next line - add r3, r3, r4 - add r5, r5, r6 - - ;# only one of the inputs should need to be aligned. 
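The SAD_16 macro above combines the saturating absolute-difference trick with vsum4ubs accumulation; the whole loop reduces to this scalar form (sad is a hypothetical name):

    #include <stdint.h>
    #include <stdlib.h>

    /* Sum of absolute differences over a w x h block; SAD_16 accumulates
     * the same |src - ref| terms 16 at a time. */
    static unsigned sad(const uint8_t *src, int src_stride,
                        const uint8_t *ref, int ref_stride, int w, int h) {
        unsigned total = 0;
        for (int i = 0; i < h; i++) {
            for (int j = 0; j < w; j++)
                total += (unsigned)abs(src[j] - ref[j]);
            src += src_stride;
            ref += ref_stride;
        }
        return total;
    }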
- load_aligned_16 v6, r3, r10 - load_aligned_16 v7, r5, r10 - - ;# move onto the next line - add r3, r3, r4 - add r5, r5, r6 - - vmrghb v4, v4, v6 - vmrghb v5, v5, v7 - - SAD_16 - - bdnz \loop_label - - vspltisw v7, 0 - - vsumsws v8, v8, v7 - - stvx v8, 0, r1 - lwz r3, 12(r1) -.endm - - .align 2 -;# r3 unsigned char *src_ptr -;# r4 int src_stride -;# r5 unsigned char *ref_ptr -;# r6 int ref_stride -;# -;# r3 return value -vp8_sad16x16_ppc: - - prologue - - li r9, 8 - mtctr r9 - - sad_16_loop sad16x16_loop - - epilogue - - blr - - .align 2 -;# r3 unsigned char *src_ptr -;# r4 int src_stride -;# r5 unsigned char *ref_ptr -;# r6 int ref_stride -;# -;# r3 return value -vp8_sad16x8_ppc: - - prologue - - li r9, 4 - mtctr r9 - - sad_16_loop sad16x8_loop - - epilogue - - blr - - .align 2 -;# r3 unsigned char *src_ptr -;# r4 int src_stride -;# r5 unsigned char *ref_ptr -;# r6 int ref_stride -;# -;# r3 return value -vp8_sad8x16_ppc: - - prologue - - li r9, 8 - mtctr r9 - - sad_8_loop sad8x16_loop - - epilogue - - blr - - .align 2 -;# r3 unsigned char *src_ptr -;# r4 int src_stride -;# r5 unsigned char *ref_ptr -;# r6 int ref_stride -;# -;# r3 return value -vp8_sad8x8_ppc: - - prologue - - li r9, 4 - mtctr r9 - - sad_8_loop sad8x8_loop - - epilogue - - blr - -.macro transfer_4x4 I P - lwz r0, 0(\I) - add \I, \I, \P - - lwz r7, 0(\I) - add \I, \I, \P - - lwz r8, 0(\I) - add \I, \I, \P - - lwz r9, 0(\I) - - stw r0, 0(r1) - stw r7, 4(r1) - stw r8, 8(r1) - stw r9, 12(r1) -.endm - - .align 2 -;# r3 unsigned char *src_ptr -;# r4 int src_stride -;# r5 unsigned char *ref_ptr -;# r6 int ref_stride -;# -;# r3 return value -vp8_sad4x4_ppc: - - prologue - - transfer_4x4 r3, r4 - lvx v4, 0, r1 - - transfer_4x4 r5, r6 - lvx v5, 0, r1 - - vspltisw v8, 0 ;# zero out total to start - - ;# v6 = abs (v4 - v5) - vsububs v6, v4, v5 - vsububs v7, v5, v4 - vor v6, v6, v7 - - ;# v8 += abs (v4 - v5) - vsum4ubs v7, v6, v8 - vsumsws v7, v7, v8 - - stvx v7, 0, r1 - lwz r3, 12(r1) - - epilogue - - blr diff --git a/vp9/encoder/ppc/vp9_variance_altivec.asm b/vp9/encoder/ppc/vp9_variance_altivec.asm deleted file mode 100644 index ad26641..0000000 --- a/vp9/encoder/ppc/vp9_variance_altivec.asm +++ /dev/null @@ -1,375 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - .globl vp8_get8x8var_ppc - .globl vp8_get16x16var_ppc - .globl vp8_mse16x16_ppc - .globl vp9_variance16x16_ppc - .globl vp9_variance16x8_ppc - .globl vp9_variance8x16_ppc - .globl vp9_variance8x8_ppc - .globl vp9_variance4x4_ppc - -.macro load_aligned_16 V R O - lvsl v3, 0, \R ;# permutate value for alignment - - lvx v1, 0, \R - lvx v2, \O, \R - - vperm \V, v1, v2, v3 -.endm - -.macro prologue - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xffc0 - mtspr 256, r12 ;# set VRSAVE - - stwu r1, -32(r1) ;# create space on the stack - - li r10, 16 ;# load offset and loop counter - - vspltisw v7, 0 ;# zero for merging - vspltisw v8, 0 ;# zero out total to start - vspltisw v9, 0 ;# zero out total for dif^2 -.endm - -.macro epilogue - addi r1, r1, 32 ;# recover stack - - mtspr 256, r11 ;# reset old VRSAVE -.endm - -.macro compute_sum_sse - ;# Compute sum first. Unpack to so signed subract - ;# can be used. 
Only have a half word signed - ;# subract. Do high, then low. - vmrghb v2, v7, v4 - vmrghb v3, v7, v5 - vsubshs v2, v2, v3 - vsum4shs v8, v2, v8 - - vmrglb v2, v7, v4 - vmrglb v3, v7, v5 - vsubshs v2, v2, v3 - vsum4shs v8, v2, v8 - - ;# Now compute sse. - vsububs v2, v4, v5 - vsububs v3, v5, v4 - vor v2, v2, v3 - - vmsumubm v9, v2, v2, v9 -.endm - -.macro variance_16 DS loop_label store_sum -\loop_label: - ;# only one of the inputs should need to be aligned. - load_aligned_16 v4, r3, r10 - load_aligned_16 v5, r5, r10 - - ;# move onto the next line - add r3, r3, r4 - add r5, r5, r6 - - compute_sum_sse - - bdnz \loop_label - - vsumsws v8, v8, v7 - vsumsws v9, v9, v7 - - stvx v8, 0, r1 - lwz r3, 12(r1) - - stvx v9, 0, r1 - lwz r4, 12(r1) - -.if \store_sum - stw r3, 0(r8) ;# sum -.endif - stw r4, 0(r7) ;# sse - - mullw r3, r3, r3 ;# sum*sum - srawi r3, r3, \DS ;# (sum*sum) >> DS - subf r3, r3, r4 ;# sse - ((sum*sum) >> DS) -.endm - -.macro variance_8 DS loop_label store_sum -\loop_label: - ;# only one of the inputs should need to be aligned. - load_aligned_16 v4, r3, r10 - load_aligned_16 v5, r5, r10 - - ;# move onto the next line - add r3, r3, r4 - add r5, r5, r6 - - ;# only one of the inputs should need to be aligned. - load_aligned_16 v6, r3, r10 - load_aligned_16 v0, r5, r10 - - ;# move onto the next line - add r3, r3, r4 - add r5, r5, r6 - - vmrghb v4, v4, v6 - vmrghb v5, v5, v0 - - compute_sum_sse - - bdnz \loop_label - - vsumsws v8, v8, v7 - vsumsws v9, v9, v7 - - stvx v8, 0, r1 - lwz r3, 12(r1) - - stvx v9, 0, r1 - lwz r4, 12(r1) - -.if \store_sum - stw r3, 0(r8) ;# sum -.endif - stw r4, 0(r7) ;# sse - - mullw r3, r3, r3 ;# sum*sum - srawi r3, r3, \DS ;# (sum*sum) >> 8 - subf r3, r3, r4 ;# sse - ((sum*sum) >> 8) -.endm - - .align 2 -;# r3 unsigned char *src_ptr -;# r4 int source_stride -;# r5 unsigned char *ref_ptr -;# r6 int recon_stride -;# r7 unsigned int *SSE -;# r8 int *Sum -;# -;# r3 return value -vp8_get8x8var_ppc: - - prologue - - li r9, 4 - mtctr r9 - - variance_8 6, get8x8var_loop, 1 - - epilogue - - blr - - .align 2 -;# r3 unsigned char *src_ptr -;# r4 int source_stride -;# r5 unsigned char *ref_ptr -;# r6 int recon_stride -;# r7 unsigned int *SSE -;# r8 int *Sum -;# -;# r3 return value -vp8_get16x16var_ppc: - - prologue - - mtctr r10 - - variance_16 8, get16x16var_loop, 1 - - epilogue - - blr - - .align 2 -;# r3 unsigned char *src_ptr -;# r4 int source_stride -;# r5 unsigned char *ref_ptr -;# r6 int recon_stride -;# r7 unsigned int *sse -;# -;# r 3 return value -vp8_mse16x16_ppc: - prologue - - mtctr r10 - -mse16x16_loop: - ;# only one of the inputs should need to be aligned. - load_aligned_16 v4, r3, r10 - load_aligned_16 v5, r5, r10 - - ;# move onto the next line - add r3, r3, r4 - add r5, r5, r6 - - ;# Now compute sse. 
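In scalar terms, the variance_16/variance_8 macros above accumulate the signed sum and the sum of squared differences, then subtract the squared mean. A minimal sketch (variance is a hypothetical name; the asm does the final multiply, shift, and subtract in 32 bits with mullw/srawi/subf):

    #include <stdint.h>

    /* variance = sse - sum*sum / N, with the divide done as >> ds
     * (ds = log2(N): 8 for 16x16, 7 for 16x8/8x16, 6 for 8x8, 4 for 4x4). */
    static uint32_t variance(const uint8_t *src, int sstride,
                             const uint8_t *ref, int rstride,
                             int w, int h, int ds, uint32_t *sse_out) {
        int32_t sum = 0;
        uint32_t sse = 0;
        for (int i = 0; i < h; i++) {
            for (int j = 0; j < w; j++) {
                int d = src[j] - ref[j];   /* unpack with zero + vsubshs */
                sum += d;                  /* vsum4shs running total     */
                sse += (uint32_t)(d * d);  /* vmsumubm on |d|            */
            }
            src += sstride;
            ref += rstride;
        }
        *sse_out = sse;
        return sse - (uint32_t)(((int64_t)sum * sum) >> ds);
    }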
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp8_mse16x16_ppc:
-    prologue
-
-    mtctr r10
-
-mse16x16_loop:
-    ;# only one of the inputs should need to be aligned.
-    load_aligned_16 v4, r3, r10
-    load_aligned_16 v5, r5, r10
-
-    ;# move onto the next line
-    add r3, r3, r4
-    add r5, r5, r6
-
-    ;# Now compute sse.
-    vsububs v2, v4, v5
-    vsububs v3, v5, v4
-    vor v2, v2, v3
-
-    vmsumubm v9, v2, v2, v9
-
-    bdnz mse16x16_loop
-
-    vsumsws v9, v9, v7
-
-    stvx v9, 0, r1
-    lwz r3, 12(r1)
-
-    stw r3, 0(r7) ;# sse
-
-    epilogue
-
-    blr
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp9_variance16x16_ppc:
-
-    prologue
-
-    mtctr r10
-
-    variance_16 8, variance16x16_loop, 0
-
-    epilogue
-
-    blr
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp9_variance16x8_ppc:
-
-    prologue
-
-    li r9, 8
-    mtctr r9
-
-    variance_16 7, variance16x8_loop, 0
-
-    epilogue
-
-    blr
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp9_variance8x16_ppc:
-
-    prologue
-
-    li r9, 8
-    mtctr r9
-
-    variance_8 7, variance8x16_loop, 0
-
-    epilogue
-
-    blr
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp9_variance8x8_ppc:
-
-    prologue
-
-    li r9, 4
-    mtctr r9
-
-    variance_8 6, variance8x8_loop, 0
-
-    epilogue
-
-    blr
-
-.macro transfer_4x4 I P
-    lwz r0, 0(\I)
-    add \I, \I, \P
-
-    lwz r10, 0(\I)
-    add \I, \I, \P
-
-    lwz r8, 0(\I)
-    add \I, \I, \P
-
-    lwz r9, 0(\I)
-
-    stw r0, 0(r1)
-    stw r10, 4(r1)
-    stw r8, 8(r1)
-    stw r9, 12(r1)
-.endm
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp9_variance4x4_ppc:
-
-    prologue
-
-    transfer_4x4 r3, r4
-    lvx v4, 0, r1
-
-    transfer_4x4 r5, r6
-    lvx v5, 0, r1
-
-    compute_sum_sse
-
-    vsumsws v8, v8, v7
-    vsumsws v9, v9, v7
-
-    stvx v8, 0, r1
-    lwz r3, 12(r1)
-
-    stvx v9, 0, r1
-    lwz r4, 12(r1)
-
-    stw r4, 0(r7) ;# sse
-
-    mullw r3, r3, r3 ;# sum*sum
-    srawi r3, r3, 4 ;# (sum*sum) >> 4
-    subf r3, r3, r4 ;# sse - ((sum*sum) >> 4)
-
-    epilogue
-
-    blr
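vp8_mse16x16_ppc above stores and returns only the sum of squared errors, with no sum*sum correction; a C sketch of its semantics (names illustrative):

    /* MSE over a 16x16 block: SSE only, unlike the variance routines. */
    static unsigned int mse16x16_c(const unsigned char *src, int src_stride,
                                   const unsigned char *ref, int ref_stride,
                                   unsigned int *sse) {
        unsigned int total = 0;
        for (int r = 0; r < 16; r++) {
            for (int c = 0; c < 16; c++) {
                int d = src[c] - ref[c];
                total += d * d;
            }
            src += src_stride;
            ref += ref_stride;
        }
        *sse = total;
        return total;
    }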
diff --git a/vp9/encoder/ppc/vp9_variance_subpixel_altivec.asm b/vp9/encoder/ppc/vp9_variance_subpixel_altivec.asm
deleted file mode 100644
index 26cc76f..0000000
--- a/vp9/encoder/ppc/vp9_variance_subpixel_altivec.asm
+++ /dev/null
@@ -1,865 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    .globl vp9_sub_pixel_variance4x4_ppc
-    .globl vp9_sub_pixel_variance8x8_ppc
-    .globl vp9_sub_pixel_variance8x16_ppc
-    .globl vp9_sub_pixel_variance16x8_ppc
-    .globl vp9_sub_pixel_variance16x16_ppc
-
-.macro load_c V, LABEL, OFF, R0, R1
-    lis \R0, \LABEL@ha
-    la \R1, \LABEL@l(\R0)
-    lvx \V, \OFF, \R1
-.endm
-
-.macro load_vfilter V0, V1
-    load_c \V0, vfilter_b, r6, r12, r10
-
-    addi r6, r6, 16
-    lvx \V1, r6, r10
-.endm
-
-.macro HProlog jump_label
-    ;# load up horizontal filter
-    slwi. r5, r5, 4 ;# index into horizontal filter array
-
-    ;# index to the next set of vectors in the row.
-    li r10, 16
-
-    ;# downshift by 7 ( divide by 128 ) at the end
-    vspltish v19, 7
-
-    ;# If there isn't any filtering to be done for the horizontal, then
-    ;# just skip to the second pass.
-    beq \jump_label
-
-    load_c v20, hfilter_b, r5, r12, r0
-
-    ;# setup constants
-    ;# v28 permutation value for alignment
-    load_c v28, b_hperm_b, 0, r12, r0
-
-    ;# index to the next set of vectors in the row.
-    li r12, 32
-
-    ;# rounding added in on the multiply
-    vspltisw v21, 8
-    vspltisw v18, 3
-    vslw v18, v21, v18 ;# 0x00000040000000400000004000000040
-
-    slwi. r6, r6, 5 ;# index into vertical filter array
-.endm
-
-;# Filters a horizontal line
-;# expects:
-;#  r3  src_ptr
-;#  r4  pitch
-;#  r10 16
-;#  r12 32
-;#  v17 perm input
-;#  v18 rounding
-;#  v19 shift
-;#  v20 filter taps
-;#  v21 tmp
-;#  v22 tmp
-;#  v23 tmp
-;#  v24 tmp
-;#  v25 tmp
-;#  v26 tmp
-;#  v27 tmp
-;#  v28 perm output
-;#
-
-.macro hfilter_8 V, hp, lp, increment_counter
-    lvsl v17, 0, r3 ;# permute value for alignment
-
-    ;# input to filter is 9 bytes wide, output is 8 bytes.
-    lvx v21, 0, r3
-    lvx v22, r10, r3
-
-.if \increment_counter
-    add r3, r3, r4
-.endif
-    vperm v21, v21, v22, v17
-
-    vperm v24, v21, v21, \hp ;# v24 = 0123 1234 2345 3456
-    vperm v25, v21, v21, \lp ;# v25 = 4567 5678 6789 789A
-
-    vmsummbm v24, v20, v24, v18
-    vmsummbm v25, v20, v25, v18
-
-    vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
-
-    vsrh v24, v24, v19 ;# divide by 128
-
-    vpkuhus \V, v24, v24 ;# \V = scrambled 8-bit result
-.endm
-
-.macro vfilter_16 P0 P1
-    vmuleub v22, \P0, v20 ;# apply first tap
-    vadduhm v22, v18, v22
-    vmuloub v23, \P0, v20
-    vadduhm v23, v18, v23
-
-    vmuleub v24, \P1, v21
-    vadduhm v22, v22, v24 ;# Re = evens, saturation unnecessary
-    vmuloub v25, \P1, v21
-    vadduhm v23, v23, v25 ;# Ro = odds
-
-    vsrh v22, v22, v19 ;# divide by 128
-    vsrh v23, v23, v19 ;# v22 v23 = evens, odds
-    vmrghh \P0, v22, v23 ;# 16-bit result in order
-    vmrglh v23, v22, v23
-    vpkuhus \P0, \P0, v23 ;# P0 = 8-bit result
-.endm
-
-.macro compute_sum_sse src, ref, sum, sse, t1, t2, z0
-    ;# Compute sum first. Unpack so that a signed subtract
-    ;# can be used. Only a halfword signed subtract is
-    ;# available, so do high, then low.
-    vmrghb \t1, \z0, \src
-    vmrghb \t2, \z0, \ref
-    vsubshs \t1, \t1, \t2
-    vsum4shs \sum, \t1, \sum
-
-    vmrglb \t1, \z0, \src
-    vmrglb \t2, \z0, \ref
-    vsubshs \t1, \t1, \t2
-    vsum4shs \sum, \t1, \sum
-
-    ;# Now compute sse.
-    vsububs \t1, \src, \ref
-    vsububs \t2, \ref, \src
-    vor \t1, \t1, \t2
-
-    vmsumubm \sse, \t1, \t1, \sse
-.endm
-
-.macro variance_final sum, sse, z0, DS
-    vsumsws \sum, \sum, \z0
-    vsumsws \sse, \sse, \z0
-
-    stvx \sum, 0, r1
-    lwz r3, 12(r1)
-
-    stvx \sse, 0, r1
-    lwz r4, 12(r1)
-
-    stw r4, 0(r9) ;# sse
-
-    mullw r3, r3, r3 ;# sum*sum
-    srawi r3, r3, \DS ;# (sum*sum) >> DS
-    subf r3, r3, r4 ;# sse - ((sum*sum) >> DS)
-.endm
-
-.macro compute_sum_sse_16 V, increment_counter
-    load_and_align_16 v16, r7, r8, \increment_counter
-    compute_sum_sse \V, v16, v18, v19, v20, v21, v23
-.endm
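Both passes apply a two-tap filter whose taps sum to 128, with +64 rounding folded into the multiply (the 0x40 splats above) and a final shift by 7. A C sketch of one pass, with tap values taken from the hfilter_b/vfilter_b tables at the end of this file (loop and parameter names illustrative):

    /* Two-tap sub-pixel filter: for offset k in 0..7 the taps are
       {128 - 16*k, 16*k}; step is 1 for horizontal, the stride for
       vertical filtering. */
    static void bilinear_pass_c(const unsigned char *src, unsigned char *dst,
                                int len, int step, int f0, int f1) {
        for (int i = 0; i < len; i++)
            dst[i] = (f0 * src[i] + f1 * src[i + step] + 64) >> 7;
    }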
-.macro load_and_align_16 V, R, P, increment_counter
-    lvsl v17, 0, \R ;# permute value for alignment
-
-    ;# input to filter is 21 bytes wide, output is 16 bytes.
-    ;# input can span three vectors if not aligned correctly.
-    lvx v21, 0, \R
-    lvx v22, r10, \R
-
-.if \increment_counter
-    add \R, \R, \P
-.endif
-
-    vperm \V, v21, v22, v17
-.endm
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int src_pixels_per_line
-;# r5 int xoffset
-;# r6 int yoffset
-;# r7 unsigned char *dst_ptr
-;# r8 int dst_pixels_per_line
-;# r9 unsigned int *sse
-;#
-;# r3 return value
-vp9_sub_pixel_variance4x4_ppc:
-    mfspr r11, 256 ;# get old VRSAVE
-    oris r12, r11, 0xf830
-    ori r12, r12, 0xfff8
-    mtspr 256, r12 ;# set VRSAVE
-
-    stwu r1, -32(r1) ;# create space on the stack
-
-    HProlog second_pass_4x4_pre_copy_b
-
-    ;# Load up permutation constants
-    load_c v10, b_0123_b, 0, r12, r0
-    load_c v11, b_4567_b, 0, r12, r0
-
-    hfilter_8 v0, v10, v11, 1
-    hfilter_8 v1, v10, v11, 1
-    hfilter_8 v2, v10, v11, 1
-    hfilter_8 v3, v10, v11, 1
-
-    ;# Finished filtering main horizontal block. If there is no
-    ;# vertical filtering, jump to storing the data. Otherwise
-    ;# load up and filter the additional line that is needed
-    ;# for the vertical filter.
-    beq compute_sum_sse_4x4_b
-
-    hfilter_8 v4, v10, v11, 0
-
-    b second_pass_4x4_b
-
-second_pass_4x4_pre_copy_b:
-    slwi r6, r6, 5 ;# index into vertical filter array
-
-    load_and_align_16 v0, r3, r4, 1
-    load_and_align_16 v1, r3, r4, 1
-    load_and_align_16 v2, r3, r4, 1
-    load_and_align_16 v3, r3, r4, 1
-    load_and_align_16 v4, r3, r4, 0
-
-second_pass_4x4_b:
-    vspltish v20, 8
-    vspltish v18, 3
-    vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    load_vfilter v20, v21
-
-    vfilter_16 v0, v1
-    vfilter_16 v1, v2
-    vfilter_16 v2, v3
-    vfilter_16 v3, v4
-
-compute_sum_sse_4x4_b:
-    vspltish v18, 0 ;# sum
-    vspltish v19, 0 ;# sse
-    vspltish v23, 0 ;# unpack
-    li r10, 16
-
-    load_and_align_16 v4, r7, r8, 1
-    load_and_align_16 v5, r7, r8, 1
-    load_and_align_16 v6, r7, r8, 1
-    load_and_align_16 v7, r7, r8, 1
-
-    vmrghb v0, v0, v1
-    vmrghb v1, v2, v3
-
-    vmrghb v2, v4, v5
-    vmrghb v3, v6, v7
-
-    load_c v10, b_hilo_b, 0, r12, r0
-
-    vperm v0, v0, v1, v10
-    vperm v1, v2, v3, v10
-
-    compute_sum_sse v0, v1, v18, v19, v20, v21, v23
-
-    variance_final v18, v19, v23, 4
-
-    addi r1, r1, 32 ;# recover stack
-    mtspr 256, r11 ;# reset old VRSAVE
-
-    blr
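Each sub-pixel variance routine dispatches on the two offsets: xoffset == 0 skips the horizontal pass and yoffset == 0 skips the vertical pass (the `slwi.`/`beq` pairs above set and test the condition register). A C sketch of that control flow, reusing the bilinear_pass_c and variance_c sketches above (all names illustrative):

    static unsigned int subpel_variance_c(const unsigned char *src, int src_stride,
                                          int xoffset, int yoffset,
                                          const unsigned char *dst, int dst_stride,
                                          int w, int h, int ds, unsigned int *sse) {
        unsigned char tmp1[17 * 16], tmp2[16 * 16];
        const unsigned char *pred = src;
        int stride = src_stride;
        if (xoffset) { /* first pass: h+1 rows so the second pass has data */
            for (int r = 0; r < h + 1; r++)
                bilinear_pass_c(src + r * src_stride, tmp1 + r * w, w, 1,
                                128 - 16 * xoffset, 16 * xoffset);
            pred = tmp1;
            stride = w;
        }
        if (yoffset) { /* second pass: filter between vertically adjacent rows */
            for (int r = 0; r < h; r++)
                bilinear_pass_c(pred + r * stride, tmp2 + r * w, w, stride,
                                128 - 16 * yoffset, 16 * yoffset);
            pred = tmp2;
            stride = w;
        }
        return variance_c(pred, stride, dst, dst_stride, w, h, ds, sse);
    }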
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int src_pixels_per_line
-;# r5 int xoffset
-;# r6 int yoffset
-;# r7 unsigned char *dst_ptr
-;# r8 int dst_pixels_per_line
-;# r9 unsigned int *sse
-;#
-;# r3 return value
vp9_sub_pixel_variance8x8_ppc:
-    mfspr r11, 256 ;# get old VRSAVE
-    oris r12, r11, 0xfff0
-    ori r12, r12, 0xffff
-    mtspr 256, r12 ;# set VRSAVE
-
-    stwu r1, -32(r1) ;# create space on the stack
-
-    HProlog second_pass_8x8_pre_copy_b
-
-    ;# Load up permutation constants
-    load_c v10, b_0123_b, 0, r12, r0
-    load_c v11, b_4567_b, 0, r12, r0
-
-    hfilter_8 v0, v10, v11, 1
-    hfilter_8 v1, v10, v11, 1
-    hfilter_8 v2, v10, v11, 1
-    hfilter_8 v3, v10, v11, 1
-    hfilter_8 v4, v10, v11, 1
-    hfilter_8 v5, v10, v11, 1
-    hfilter_8 v6, v10, v11, 1
-    hfilter_8 v7, v10, v11, 1
-
-    ;# Finished filtering main horizontal block. If there is no
-    ;# vertical filtering, jump to storing the data. Otherwise
-    ;# load up and filter the additional line that is needed
-    ;# for the vertical filter.
-    beq compute_sum_sse_8x8_b
-
-    hfilter_8 v8, v10, v11, 0
-
-    b second_pass_8x8_b
-
-second_pass_8x8_pre_copy_b:
-    slwi. r6, r6, 5 ;# index into vertical filter array
-
-    load_and_align_16 v0, r3, r4, 1
-    load_and_align_16 v1, r3, r4, 1
-    load_and_align_16 v2, r3, r4, 1
-    load_and_align_16 v3, r3, r4, 1
-    load_and_align_16 v4, r3, r4, 1
-    load_and_align_16 v5, r3, r4, 1
-    load_and_align_16 v6, r3, r4, 1
-    load_and_align_16 v7, r3, r4, 1
-    load_and_align_16 v8, r3, r4, 0
-
-    beq compute_sum_sse_8x8_b
-
-second_pass_8x8_b:
-    vspltish v20, 8
-    vspltish v18, 3
-    vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    load_vfilter v20, v21
-
-    vfilter_16 v0, v1
-    vfilter_16 v1, v2
-    vfilter_16 v2, v3
-    vfilter_16 v3, v4
-    vfilter_16 v4, v5
-    vfilter_16 v5, v6
-    vfilter_16 v6, v7
-    vfilter_16 v7, v8
-
-compute_sum_sse_8x8_b:
-    vspltish v18, 0 ;# sum
-    vspltish v19, 0 ;# sse
-    vspltish v23, 0 ;# unpack
-    li r10, 16
-
-    vmrghb v0, v0, v1
-    vmrghb v1, v2, v3
-    vmrghb v2, v4, v5
-    vmrghb v3, v6, v7
-
-    load_and_align_16 v4, r7, r8, 1
-    load_and_align_16 v5, r7, r8, 1
-    load_and_align_16 v6, r7, r8, 1
-    load_and_align_16 v7, r7, r8, 1
-    load_and_align_16 v8, r7, r8, 1
-    load_and_align_16 v9, r7, r8, 1
-    load_and_align_16 v10, r7, r8, 1
-    load_and_align_16 v11, r7, r8, 0
-
-    vmrghb v4, v4, v5
-    vmrghb v5, v6, v7
-    vmrghb v6, v8, v9
-    vmrghb v7, v10, v11
-
-    compute_sum_sse v0, v4, v18, v19, v20, v21, v23
-    compute_sum_sse v1, v5, v18, v19, v20, v21, v23
-    compute_sum_sse v2, v6, v18, v19, v20, v21, v23
-    compute_sum_sse v3, v7, v18, v19, v20, v21, v23
-
-    variance_final v18, v19, v23, 6
-
-    addi r1, r1, 32 ;# recover stack
-    mtspr 256, r11 ;# reset old VRSAVE
-    blr
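The 8-wide paths merge two 8-pixel rows into one 16-byte vector (the vmrghb pairs above) so the 16-wide sum/SSE machinery can be reused. A C sketch of that packing (illustrative; the byte order produced by the interleave is irrelevant to sum/SSE accumulation):

    /* vmrghb interleaves the high halves of two vectors; merging two
       8-pixel rows this way yields one 16-byte block per iteration. */
    static void pair_rows_c(const unsigned char *row0, const unsigned char *row1,
                            unsigned char *out16) {
        for (int i = 0; i < 8; i++) {
            out16[2 * i]     = row0[i];
            out16[2 * i + 1] = row1[i];
        }
    }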
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int src_pixels_per_line
-;# r5 int xoffset
-;# r6 int yoffset
-;# r7 unsigned char *dst_ptr
-;# r8 int dst_pixels_per_line
-;# r9 unsigned int *sse
-;#
-;# r3 return value
vp9_sub_pixel_variance8x16_ppc:
-    mfspr r11, 256 ;# get old VRSAVE
-    oris r12, r11, 0xffff
-    ori r12, r12, 0xfffc
-    mtspr 256, r12 ;# set VRSAVE
-
-    stwu r1, -32(r1) ;# create space on the stack
-
-    HProlog second_pass_8x16_pre_copy_b
-
-    ;# Load up permutation constants
-    load_c v29, b_0123_b, 0, r12, r0
-    load_c v30, b_4567_b, 0, r12, r0
-
-    hfilter_8 v0, v29, v30, 1
-    hfilter_8 v1, v29, v30, 1
-    hfilter_8 v2, v29, v30, 1
-    hfilter_8 v3, v29, v30, 1
-    hfilter_8 v4, v29, v30, 1
-    hfilter_8 v5, v29, v30, 1
-    hfilter_8 v6, v29, v30, 1
-    hfilter_8 v7, v29, v30, 1
-    hfilter_8 v8, v29, v30, 1
-    hfilter_8 v9, v29, v30, 1
-    hfilter_8 v10, v29, v30, 1
-    hfilter_8 v11, v29, v30, 1
-    hfilter_8 v12, v29, v30, 1
-    hfilter_8 v13, v29, v30, 1
-    hfilter_8 v14, v29, v30, 1
-    hfilter_8 v15, v29, v30, 1
-
-    ;# Finished filtering main horizontal block. If there is no
-    ;# vertical filtering, jump to storing the data. Otherwise
-    ;# load up and filter the additional line that is needed
-    ;# for the vertical filter.
-    beq compute_sum_sse_8x16_b
-
-    hfilter_8 v16, v29, v30, 0
-
-    b second_pass_8x16_b
-
-second_pass_8x16_pre_copy_b:
-    slwi. r6, r6, 5 ;# index into vertical filter array
-
-    load_and_align_16 v0, r3, r4, 1
-    load_and_align_16 v1, r3, r4, 1
-    load_and_align_16 v2, r3, r4, 1
-    load_and_align_16 v3, r3, r4, 1
-    load_and_align_16 v4, r3, r4, 1
-    load_and_align_16 v5, r3, r4, 1
-    load_and_align_16 v6, r3, r4, 1
-    load_and_align_16 v7, r3, r4, 1
-    load_and_align_16 v8, r3, r4, 1
-    load_and_align_16 v9, r3, r4, 1
-    load_and_align_16 v10, r3, r4, 1
-    load_and_align_16 v11, r3, r4, 1
-    load_and_align_16 v12, r3, r4, 1
-    load_and_align_16 v13, r3, r4, 1
-    load_and_align_16 v14, r3, r4, 1
-    load_and_align_16 v15, r3, r4, 1
-    load_and_align_16 v16, r3, r4, 0
-
-    beq compute_sum_sse_8x16_b
-
-second_pass_8x16_b:
-    vspltish v20, 8
-    vspltish v18, 3
-    vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    load_vfilter v20, v21
-
-    vfilter_16 v0, v1
-    vfilter_16 v1, v2
-    vfilter_16 v2, v3
-    vfilter_16 v3, v4
-    vfilter_16 v4, v5
-    vfilter_16 v5, v6
-    vfilter_16 v6, v7
-    vfilter_16 v7, v8
-    vfilter_16 v8, v9
-    vfilter_16 v9, v10
-    vfilter_16 v10, v11
-    vfilter_16 v11, v12
-    vfilter_16 v12, v13
-    vfilter_16 v13, v14
-    vfilter_16 v14, v15
-    vfilter_16 v15, v16
-
-compute_sum_sse_8x16_b:
-    vspltish v18, 0 ;# sum
-    vspltish v19, 0 ;# sse
-    vspltish v23, 0 ;# unpack
-    li r10, 16
-
-    vmrghb v0, v0, v1
-    vmrghb v1, v2, v3
-    vmrghb v2, v4, v5
-    vmrghb v3, v6, v7
-    vmrghb v4, v8, v9
-    vmrghb v5, v10, v11
-    vmrghb v6, v12, v13
-    vmrghb v7, v14, v15
-
-    load_and_align_16 v8, r7, r8, 1
-    load_and_align_16 v9, r7, r8, 1
-    load_and_align_16 v10, r7, r8, 1
-    load_and_align_16 v11, r7, r8, 1
-    load_and_align_16 v12, r7, r8, 1
-    load_and_align_16 v13, r7, r8, 1
-    load_and_align_16 v14, r7, r8, 1
-    load_and_align_16 v15, r7, r8, 1
-
-    vmrghb v8, v8, v9
-    vmrghb v9, v10, v11
-    vmrghb v10, v12, v13
-    vmrghb v11, v14, v15
-
-    compute_sum_sse v0, v8, v18, v19, v20, v21, v23
-    compute_sum_sse v1, v9, v18, v19, v20, v21, v23
-    compute_sum_sse v2, v10, v18, v19, v20, v21, v23
-    compute_sum_sse v3, v11, v18, v19, v20, v21, v23
-
-    load_and_align_16 v8, r7, r8, 1
-    load_and_align_16 v9, r7, r8, 1
-    load_and_align_16 v10, r7, r8, 1
-    load_and_align_16 v11, r7, r8, 1
-    load_and_align_16 v12, r7, r8, 1
-    load_and_align_16 v13, r7, r8, 1
-    load_and_align_16 v14, r7, r8, 1
-    load_and_align_16 v15, r7, r8, 0
-
-    vmrghb v8, v8, v9
-    vmrghb v9, v10, v11
-    vmrghb v10, v12, v13
-    vmrghb v11, v14, v15
-
-    compute_sum_sse v4, v8, v18, v19, v20, v21, v23
-    compute_sum_sse v5, v9, v18, v19, v20, v21, v23
-    compute_sum_sse v6, v10, v18, v19, v20, v21, v23
-    compute_sum_sse v7, v11, v18, v19, v20, v21, v23
-
-    variance_final v18, v19, v23, 7
-
-    addi r1, r1, 32 ;# recover stack
-    mtspr 256, r11 ;# reset old VRSAVE
-    blr
-;# Filters a horizontal line
-;# expects:
-;#  r3  src_ptr
-;#  r4  pitch
-;#  r10 16
-;#  r12 32
-;#  v17 perm input
-;#  v18 rounding
-;#  v19 shift
-;#  v20 filter taps
-;#  v21 tmp
-;#  v22 tmp
-;#  v23 tmp
-;#  v24 tmp
-;#  v25 tmp
-;#  v26 tmp
-;#  v27 tmp
-;#  v28 perm output
-;#
-.macro hfilter_16 V, increment_counter
-
-    lvsl v17, 0, r3 ;# permute value for alignment
-
-    ;# input to filter is 21 bytes wide, output is 16 bytes.
-    ;# input can span three vectors if not aligned correctly.
-    lvx v21, 0, r3
-    lvx v22, r10, r3
-    lvx v23, r12, r3
-
-.if \increment_counter
-    add r3, r3, r4
-.endif
-    vperm v21, v21, v22, v17
-    vperm v22, v22, v23, v17 ;# v21 v22 = 21 input pixels left-justified
-
-    ;# set 0
-    vmsummbm v24, v20, v21, v18 ;# taps times elements
-
-    ;# set 1
-    vsldoi v23, v21, v22, 1
-    vmsummbm v25, v20, v23, v18
-
-    ;# set 2
-    vsldoi v23, v21, v22, 2
-    vmsummbm v26, v20, v23, v18
-
-    ;# set 3
-    vsldoi v23, v21, v22, 3
-    vmsummbm v27, v20, v23, v18
-
-    vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
-    vpkswus v25, v26, v27 ;# v25 = 2 6 A E 3 7 B F
-
-    vsrh v24, v24, v19 ;# divide by 128
-    vsrh v25, v25, v19
-
-    vpkuhus \V, v24, v25 ;# \V = scrambled 8-bit result
-    vperm \V, \V, v0, v28 ;# \V = correctly-ordered result
-.endm
-
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int src_pixels_per_line
-;# r5 int xoffset
-;# r6 int yoffset
-;# r7 unsigned char *dst_ptr
-;# r8 int dst_pixels_per_line
-;# r9 unsigned int *sse
-;#
-;# r3 return value
-vp9_sub_pixel_variance16x8_ppc:
-    mfspr r11, 256 ;# get old VRSAVE
-    oris r12, r11, 0xffff
-    ori r12, r12, 0xfff8
-    mtspr 256, r12 ;# set VRSAVE
-
-    stwu r1, -32(r1) ;# create space on the stack
-
-    HProlog second_pass_16x8_pre_copy_b
-
-    hfilter_16 v0, 1
-    hfilter_16 v1, 1
-    hfilter_16 v2, 1
-    hfilter_16 v3, 1
-    hfilter_16 v4, 1
-    hfilter_16 v5, 1
-    hfilter_16 v6, 1
-    hfilter_16 v7, 1
-
-    ;# Finished filtering main horizontal block. If there is no
-    ;# vertical filtering, jump to storing the data. Otherwise
-    ;# load up and filter the additional line that is needed
-    ;# for the vertical filter.
-    beq compute_sum_sse_16x8_b
-
-    hfilter_16 v8, 0
-
-    b second_pass_16x8_b
-
-second_pass_16x8_pre_copy_b:
-    slwi. r6, r6, 5 ;# index into vertical filter array
-
-    load_and_align_16 v0, r3, r4, 1
-    load_and_align_16 v1, r3, r4, 1
-    load_and_align_16 v2, r3, r4, 1
-    load_and_align_16 v3, r3, r4, 1
-    load_and_align_16 v4, r3, r4, 1
-    load_and_align_16 v5, r3, r4, 1
-    load_and_align_16 v6, r3, r4, 1
-    load_and_align_16 v7, r3, r4, 1
-    load_and_align_16 v8, r3, r4, 1
-
-    beq compute_sum_sse_16x8_b
-
-second_pass_16x8_b:
-    vspltish v20, 8
-    vspltish v18, 3
-    vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    load_vfilter v20, v21
-
-    vfilter_16 v0, v1
-    vfilter_16 v1, v2
-    vfilter_16 v2, v3
-    vfilter_16 v3, v4
-    vfilter_16 v4, v5
-    vfilter_16 v5, v6
-    vfilter_16 v6, v7
-    vfilter_16 v7, v8
-
-compute_sum_sse_16x8_b:
-    vspltish v18, 0 ;# sum
-    vspltish v19, 0 ;# sse
-    vspltish v23, 0 ;# unpack
-    li r10, 16
-
-    compute_sum_sse_16 v0, 1
-    compute_sum_sse_16 v1, 1
-    compute_sum_sse_16 v2, 1
-    compute_sum_sse_16 v3, 1
-    compute_sum_sse_16 v4, 1
-    compute_sum_sse_16 v5, 1
-    compute_sum_sse_16 v6, 1
-    compute_sum_sse_16 v7, 0
-
-    variance_final v18, v19, v23, 7
-
-    addi r1, r1, 32 ;# recover stack
-
-    mtspr 256, r11 ;# reset old VRSAVE
-
-    blr
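hfilter_16 computes its 16 outputs in four interleaved 4-pixel groups and then uses the b_hperm_b permute to restore pixel order. A C sketch of that final reordering, using the table defined at the end of this file (function name illustrative):

    /* b_hperm_b gathers bytes 0,4,8,12,1,5,9,13,... to undo the
       four-way interleave left by the packed multiply-sums. */
    static const unsigned char b_hperm[16] =
        { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };

    static void unscramble16_c(const unsigned char *in, unsigned char *out) {
        for (int i = 0; i < 16; i++)
            out[i] = in[b_hperm[i]];
    }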
-    .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int src_pixels_per_line
-;# r5 int xoffset
-;# r6 int yoffset
-;# r7 unsigned char *dst_ptr
-;# r8 int dst_pixels_per_line
-;# r9 unsigned int *sse
-;#
-;# r3 return value
-vp9_sub_pixel_variance16x16_ppc:
-    mfspr r11, 256 ;# get old VRSAVE
-    oris r12, r11, 0xffff
-    ori r12, r12, 0xfff8
-    mtspr 256, r12 ;# set VRSAVE
-
-    stwu r1, -32(r1) ;# create space on the stack
-
-    HProlog second_pass_16x16_pre_copy_b
-
-    hfilter_16 v0, 1
-    hfilter_16 v1, 1
-    hfilter_16 v2, 1
-    hfilter_16 v3, 1
-    hfilter_16 v4, 1
-    hfilter_16 v5, 1
-    hfilter_16 v6, 1
-    hfilter_16 v7, 1
-    hfilter_16 v8, 1
-    hfilter_16 v9, 1
-    hfilter_16 v10, 1
-    hfilter_16 v11, 1
-    hfilter_16 v12, 1
-    hfilter_16 v13, 1
-    hfilter_16 v14, 1
-    hfilter_16 v15, 1
-
-    ;# Finished filtering main horizontal block. If there is no
-    ;# vertical filtering, jump to storing the data. Otherwise
-    ;# load up and filter the additional line that is needed
-    ;# for the vertical filter.
-    beq compute_sum_sse_16x16_b
-
-    hfilter_16 v16, 0
-
-    b second_pass_16x16_b
-
-second_pass_16x16_pre_copy_b:
-    slwi. r6, r6, 5 ;# index into vertical filter array
-
-    load_and_align_16 v0, r3, r4, 1
-    load_and_align_16 v1, r3, r4, 1
-    load_and_align_16 v2, r3, r4, 1
-    load_and_align_16 v3, r3, r4, 1
-    load_and_align_16 v4, r3, r4, 1
-    load_and_align_16 v5, r3, r4, 1
-    load_and_align_16 v6, r3, r4, 1
-    load_and_align_16 v7, r3, r4, 1
-    load_and_align_16 v8, r3, r4, 1
-    load_and_align_16 v9, r3, r4, 1
-    load_and_align_16 v10, r3, r4, 1
-    load_and_align_16 v11, r3, r4, 1
-    load_and_align_16 v12, r3, r4, 1
-    load_and_align_16 v13, r3, r4, 1
-    load_and_align_16 v14, r3, r4, 1
-    load_and_align_16 v15, r3, r4, 1
-    load_and_align_16 v16, r3, r4, 0
-
-    beq compute_sum_sse_16x16_b
-
-second_pass_16x16_b:
-    vspltish v20, 8
-    vspltish v18, 3
-    vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
-    load_vfilter v20, v21
-
-    vfilter_16 v0, v1
-    vfilter_16 v1, v2
-    vfilter_16 v2, v3
-    vfilter_16 v3, v4
-    vfilter_16 v4, v5
-    vfilter_16 v5, v6
-    vfilter_16 v6, v7
-    vfilter_16 v7, v8
-    vfilter_16 v8, v9
-    vfilter_16 v9, v10
-    vfilter_16 v10, v11
-    vfilter_16 v11, v12
-    vfilter_16 v12, v13
-    vfilter_16 v13, v14
-    vfilter_16 v14, v15
-    vfilter_16 v15, v16
-
-compute_sum_sse_16x16_b:
-    vspltish v18, 0 ;# sum
-    vspltish v19, 0 ;# sse
-    vspltish v23, 0 ;# unpack
-    li r10, 16
-
-    compute_sum_sse_16 v0, 1
-    compute_sum_sse_16 v1, 1
-    compute_sum_sse_16 v2, 1
-    compute_sum_sse_16 v3, 1
-    compute_sum_sse_16 v4, 1
-    compute_sum_sse_16 v5, 1
-    compute_sum_sse_16 v6, 1
-    compute_sum_sse_16 v7, 1
-    compute_sum_sse_16 v8, 1
-    compute_sum_sse_16 v9, 1
-    compute_sum_sse_16 v10, 1
-    compute_sum_sse_16 v11, 1
-    compute_sum_sse_16 v12, 1
-    compute_sum_sse_16 v13, 1
-    compute_sum_sse_16 v14, 1
-    compute_sum_sse_16 v15, 0
-
-    variance_final v18, v19, v23, 8
-
-    addi r1, r1, 32 ;# recover stack
-
-    mtspr 256, r11 ;# reset old VRSAVE
-
-    blr
-
-    .data
-
-    .align 4
-hfilter_b:
-    .byte   128,  0,  0,  0,128,  0,  0,  0,128,  0,  0,  0,128,  0,  0,  0
-    .byte   112, 16,  0,  0,112, 16,  0,  0,112, 16,  0,  0,112, 16,  0,  0
-    .byte    96, 32,  0,  0, 96, 32,  0,  0, 96, 32,  0,  0, 96, 32,  0,  0
-    .byte    80, 48,  0,  0, 80, 48,  0,  0, 80, 48,  0,  0, 80, 48,  0,  0
-    .byte    64, 64,  0,  0, 64, 64,  0,  0, 64, 64,  0,  0, 64, 64,  0,  0
-    .byte    48, 80,  0,  0, 48, 80,  0,  0, 48, 80,  0,  0, 48, 80,  0,  0
-    .byte    32, 96,  0,  0, 32, 96,  0,  0, 32, 96,  0,  0, 32, 96,  0,  0
-    .byte    16,112,  0,  0, 16,112,  0,  0, 16,112,  0,  0, 16,112,  0,  0
-
-    .align 4
-vfilter_b:
-    .byte   128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128
-    .byte     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
-    .byte   112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
-    .byte    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-    .byte    96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
-    .byte    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
-    .byte    80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
-    .byte    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
-    .byte    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
-    .byte    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
-    .byte    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
-    .byte    80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
-    .byte    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
-    .byte    96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
-    .byte    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-    .byte   112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
-
-    .align 4
-b_hperm_b:
-    .byte     0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15
-
-    .align 4
-b_0123_b:
-    .byte     0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
-
-    .align 4
-b_4567_b:
-    .byte     4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10
-
-b_hilo_b:
-    .byte     0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23
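For reference, the eight sub-pixel tap pairs encoded by hfilter_b and vfilter_b above, written as a C table (name illustrative):

    /* Tap pairs for the eight sub-pixel positions; each pair sums to
       128, matching the +64 rounding and >>7 shift in the kernels. */
    static const short bilinear_taps[8][2] = {
        { 128,   0 }, { 112,  16 }, { 96, 32 }, { 80, 48 },
        {  64,  64 }, {  48,  80 }, { 32, 96 }, { 16, 112 }
    };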