Merge "Remove unused vp9 ppc files" into experimental
Commit 4ba74ae81a
@@ -1,47 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;


    .globl copy_mem16x16_ppc

;# r3 unsigned char *src
;# r4 int src_stride
;# r5 unsigned char *dst
;# r6 int dst_stride

;# Make the assumption that input will not be aligned,
;# but the output will be. So two reads and a perm
;# for the input, but only one store for the output.
copy_mem16x16_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xe000
    mtspr   256, r12            ;# set VRSAVE

    li      r10, 16
    mtctr   r10

cp_16x16_loop:
    lvsl    v0, 0, r3           ;# permutate value for alignment

    lvx     v1, 0, r3
    lvx     v2, r10, r3

    vperm   v1, v1, v2, v0

    stvx    v1, 0, r5

    add     r3, r3, r4          ;# increment source pointer
    add     r5, r5, r6          ;# increment destination pointer

    bdnz    cp_16x16_loop

    mtspr   256, r11            ;# reset old VRSAVE

    blr
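The three-instruction lvsl/lvx/vperm sequence above is the standard AltiVec idiom for reading from a possibly misaligned source, and the mfspr/mtspr 256 pair around it saves and restores VRSAVE to mark which vector registers the routine clobbers. A minimal C sketch of the same copy using GCC/Clang AltiVec intrinsics follows; the function name is illustrative, and it loads the second vector at offset 15 rather than the assembly's fixed offset 16, so it models the idea rather than reproducing the removed routine.

#include <altivec.h>

/* Sketch: copy a 16x16 block whose source may be unaligned while the
   destination is 16-byte aligned.  vec_lvsl builds the shuffle control,
   two aligned loads straddle the misalignment, vec_perm picks out the
   16 wanted bytes, and a single aligned store writes them. */
static void copy_mem16x16_sketch(const unsigned char *src, int src_stride,
                                 unsigned char *dst, int dst_stride) {
  for (int i = 0; i < 16; ++i) {
    vector unsigned char perm = vec_lvsl(0, src);   /* alignment permute   */
    vector unsigned char lo   = vec_ld(0, src);     /* first aligned load  */
    vector unsigned char hi   = vec_ld(15, src);    /* second aligned load */
    vec_st(vec_perm(lo, hi, perm), 0, dst);         /* one aligned store   */
    src += src_stride;
    dst += dst_stride;
  }
}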
The diff for this file is not shown because it is too large.
@@ -1,677 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
.globl bilinear_predict4x4_ppc
|
||||
.globl bilinear_predict8x4_ppc
|
||||
.globl bilinear_predict8x8_ppc
|
||||
.globl bilinear_predict16x16_ppc
|
||||
|
||||
.macro load_c V, LABEL, OFF, R0, R1
|
||||
lis \R0, \LABEL@ha
|
||||
la \R1, \LABEL@l(\R0)
|
||||
lvx \V, \OFF, \R1
|
||||
.endm
|
||||
|
||||
.macro load_vfilter V0, V1
|
||||
load_c \V0, vfilter_b, r6, r9, r10
|
||||
|
||||
addi r6, r6, 16
|
||||
lvx \V1, r6, r10
|
||||
.endm
|
||||
|
||||
.macro HProlog jump_label
|
||||
;# load up horizontal filter
|
||||
slwi. r5, r5, 4 ;# index into horizontal filter array
|
||||
|
||||
;# index to the next set of vectors in the row.
|
||||
li r10, 16
|
||||
li r12, 32
|
||||
|
||||
;# downshift by 7 ( divide by 128 ) at the end
|
||||
vspltish v19, 7
|
||||
|
||||
;# If there isn't any filtering to be done for the horizontal, then
|
||||
;# just skip to the second pass.
|
||||
beq \jump_label
|
||||
|
||||
load_c v20, hfilter_b, r5, r9, r0
|
||||
|
||||
;# setup constants
|
||||
;# v14 permutation value for alignment
|
||||
load_c v28, b_hperm_b, 0, r9, r0
|
||||
|
||||
;# rounding added in on the multiply
|
||||
vspltisw v21, 8
|
||||
vspltisw v18, 3
|
||||
vslw v18, v21, v18 ;# 0x00000040000000400000004000000040
|
||||
|
||||
slwi. r6, r6, 5 ;# index into vertical filter array
|
||||
.endm
|
||||
|
||||
;# Filters a horizontal line
|
||||
;# expects:
|
||||
;# r3 src_ptr
|
||||
;# r4 pitch
|
||||
;# r10 16
|
||||
;# r12 32
|
||||
;# v17 perm input
|
||||
;# v18 rounding
|
||||
;# v19 shift
|
||||
;# v20 filter taps
|
||||
;# v21 tmp
|
||||
;# v22 tmp
|
||||
;# v23 tmp
|
||||
;# v24 tmp
|
||||
;# v25 tmp
|
||||
;# v26 tmp
|
||||
;# v27 tmp
|
||||
;# v28 perm output
|
||||
;#
|
||||
.macro HFilter V
|
||||
vperm v24, v21, v21, v10 ;# v20 = 0123 1234 2345 3456
|
||||
vperm v25, v21, v21, v11 ;# v21 = 4567 5678 6789 789A
|
||||
|
||||
vmsummbm v24, v20, v24, v18
|
||||
vmsummbm v25, v20, v25, v18
|
||||
|
||||
vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
|
||||
|
||||
vsrh v24, v24, v19 ;# divide v0, v1 by 128
|
||||
|
||||
vpkuhus \V, v24, v24 ;# \V = scrambled 8-bit result
|
||||
.endm
|
||||
|
||||
.macro hfilter_8 V, increment_counter
|
||||
lvsl v17, 0, r3 ;# permutate value for alignment
|
||||
|
||||
;# input to filter is 9 bytes wide, output is 8 bytes.
|
||||
lvx v21, 0, r3
|
||||
lvx v22, r10, r3
|
||||
|
||||
.if \increment_counter
|
||||
add r3, r3, r4
|
||||
.endif
|
||||
vperm v21, v21, v22, v17
|
||||
|
||||
HFilter \V
|
||||
.endm
|
||||
|
||||
|
||||
.macro load_and_align_8 V, increment_counter
|
||||
lvsl v17, 0, r3 ;# permutate value for alignment
|
||||
|
||||
;# input to filter is 21 bytes wide, output is 16 bytes.
|
||||
;# input can span three vectors if not aligned correctly.
|
||||
lvx v21, 0, r3
|
||||
lvx v22, r10, r3
|
||||
|
||||
.if \increment_counter
|
||||
add r3, r3, r4
|
||||
.endif
|
||||
|
||||
vperm \V, v21, v22, v17
|
||||
.endm
|
||||
|
||||
.macro write_aligned_8 V, increment_counter
|
||||
stvx \V, 0, r7
|
||||
|
||||
.if \increment_counter
|
||||
add r7, r7, r8
|
||||
.endif
|
||||
.endm
|
||||
|
||||
.macro vfilter_16 P0 P1
|
||||
vmuleub v22, \P0, v20 ;# 64 + 4 positive taps
|
||||
vadduhm v22, v18, v22
|
||||
vmuloub v23, \P0, v20
|
||||
vadduhm v23, v18, v23
|
||||
|
||||
vmuleub v24, \P1, v21
|
||||
vadduhm v22, v22, v24 ;# Re = evens, saturation unnecessary
|
||||
vmuloub v25, \P1, v21
|
||||
vadduhm v23, v23, v25 ;# Ro = odds
|
||||
|
||||
vsrh v22, v22, v19 ;# divide by 128
|
||||
vsrh v23, v23, v19 ;# v16 v17 = evens, odds
|
||||
vmrghh \P0, v22, v23 ;# v18 v19 = 16-bit result in order
|
||||
vmrglh v23, v22, v23
|
||||
vpkuhus \P0, \P0, v23 ;# P0 = 8-bit result
|
||||
.endm
|
||||
|
||||
|
||||
.macro w_8x8 V, D, R, P
|
||||
stvx \V, 0, r1
|
||||
lwz \R, 0(r1)
|
||||
stw \R, 0(r7)
|
||||
lwz \R, 4(r1)
|
||||
stw \R, 4(r7)
|
||||
add \D, \D, \P
|
||||
.endm
|
||||
|
||||
|
||||
.align 2
|
||||
;# r3 unsigned char * src
|
||||
;# r4 int src_pitch
|
||||
;# r5 int x_offset
|
||||
;# r6 int y_offset
|
||||
;# r7 unsigned char * dst
|
||||
;# r8 int dst_pitch
|
||||
bilinear_predict4x4_ppc:
|
||||
mfspr r11, 256 ;# get old VRSAVE
|
||||
oris r12, r11, 0xf830
|
||||
ori r12, r12, 0xfff8
|
||||
mtspr 256, r12 ;# set VRSAVE
|
||||
|
||||
stwu r1,-32(r1) ;# create space on the stack
|
||||
|
||||
HProlog second_pass_4x4_pre_copy_b
|
||||
|
||||
;# Load up permutation constants
|
||||
load_c v10, b_0123_b, 0, r9, r12
|
||||
load_c v11, b_4567_b, 0, r9, r12
|
||||
|
||||
hfilter_8 v0, 1
|
||||
hfilter_8 v1, 1
|
||||
hfilter_8 v2, 1
|
||||
hfilter_8 v3, 1
|
||||
|
||||
;# Finished filtering main horizontal block. If there is no
|
||||
;# vertical filtering, jump to storing the data. Otherwise
|
||||
;# load up and filter the additional line that is needed
|
||||
;# for the vertical filter.
|
||||
beq store_out_4x4_b
|
||||
|
||||
hfilter_8 v4, 0
|
||||
|
||||
b second_pass_4x4_b
|
||||
|
||||
second_pass_4x4_pre_copy_b:
|
||||
slwi r6, r6, 5 ;# index into vertical filter array
|
||||
|
||||
load_and_align_8 v0, 1
|
||||
load_and_align_8 v1, 1
|
||||
load_and_align_8 v2, 1
|
||||
load_and_align_8 v3, 1
|
||||
load_and_align_8 v4, 1
|
||||
|
||||
second_pass_4x4_b:
|
||||
vspltish v20, 8
|
||||
vspltish v18, 3
|
||||
vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
|
||||
|
||||
load_vfilter v20, v21
|
||||
|
||||
vfilter_16 v0, v1
|
||||
vfilter_16 v1, v2
|
||||
vfilter_16 v2, v3
|
||||
vfilter_16 v3, v4
|
||||
|
||||
store_out_4x4_b:
|
||||
|
||||
stvx v0, 0, r1
|
||||
lwz r0, 0(r1)
|
||||
stw r0, 0(r7)
|
||||
add r7, r7, r8
|
||||
|
||||
stvx v1, 0, r1
|
||||
lwz r0, 0(r1)
|
||||
stw r0, 0(r7)
|
||||
add r7, r7, r8
|
||||
|
||||
stvx v2, 0, r1
|
||||
lwz r0, 0(r1)
|
||||
stw r0, 0(r7)
|
||||
add r7, r7, r8
|
||||
|
||||
stvx v3, 0, r1
|
||||
lwz r0, 0(r1)
|
||||
stw r0, 0(r7)
|
||||
|
||||
exit_4x4:
|
||||
|
||||
addi r1, r1, 32 ;# recover stack
|
||||
mtspr 256, r11 ;# reset old VRSAVE
|
||||
|
||||
blr
|
||||
|
||||
.align 2
|
||||
;# r3 unsigned char * src
|
||||
;# r4 int src_pitch
|
||||
;# r5 int x_offset
|
||||
;# r6 int y_offset
|
||||
;# r7 unsigned char * dst
|
||||
;# r8 int dst_pitch
|
||||
bilinear_predict8x4_ppc:
|
||||
mfspr r11, 256 ;# get old VRSAVE
|
||||
oris r12, r11, 0xf830
|
||||
ori r12, r12, 0xfff8
|
||||
mtspr 256, r12 ;# set VRSAVE
|
||||
|
||||
stwu r1,-32(r1) ;# create space on the stack
|
||||
|
||||
HProlog second_pass_8x4_pre_copy_b
|
||||
|
||||
;# Load up permutation constants
|
||||
load_c v10, b_0123_b, 0, r9, r12
|
||||
load_c v11, b_4567_b, 0, r9, r12
|
||||
|
||||
hfilter_8 v0, 1
|
||||
hfilter_8 v1, 1
|
||||
hfilter_8 v2, 1
|
||||
hfilter_8 v3, 1
|
||||
|
||||
;# Finished filtering main horizontal block. If there is no
|
||||
;# vertical filtering, jump to storing the data. Otherwise
|
||||
;# load up and filter the additional line that is needed
|
||||
;# for the vertical filter.
|
||||
beq store_out_8x4_b
|
||||
|
||||
hfilter_8 v4, 0
|
||||
|
||||
b second_pass_8x4_b
|
||||
|
||||
second_pass_8x4_pre_copy_b:
|
||||
slwi r6, r6, 5 ;# index into vertical filter array
|
||||
|
||||
load_and_align_8 v0, 1
|
||||
load_and_align_8 v1, 1
|
||||
load_and_align_8 v2, 1
|
||||
load_and_align_8 v3, 1
|
||||
load_and_align_8 v4, 1
|
||||
|
||||
second_pass_8x4_b:
|
||||
vspltish v20, 8
|
||||
vspltish v18, 3
|
||||
vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
|
||||
|
||||
load_vfilter v20, v21
|
||||
|
||||
vfilter_16 v0, v1
|
||||
vfilter_16 v1, v2
|
||||
vfilter_16 v2, v3
|
||||
vfilter_16 v3, v4
|
||||
|
||||
store_out_8x4_b:
|
||||
|
||||
cmpi cr0, r8, 8
|
||||
beq cr0, store_aligned_8x4_b
|
||||
|
||||
w_8x8 v0, r7, r0, r8
|
||||
w_8x8 v1, r7, r0, r8
|
||||
w_8x8 v2, r7, r0, r8
|
||||
w_8x8 v3, r7, r0, r8
|
||||
|
||||
b exit_8x4
|
||||
|
||||
store_aligned_8x4_b:
|
||||
load_c v10, b_hilo_b, 0, r9, r10
|
||||
|
||||
vperm v0, v0, v1, v10
|
||||
vperm v2, v2, v3, v10
|
||||
|
||||
stvx v0, 0, r7
|
||||
addi r7, r7, 16
|
||||
stvx v2, 0, r7
|
||||
|
||||
exit_8x4:
|
||||
|
||||
addi r1, r1, 32 ;# recover stack
|
||||
mtspr 256, r11 ;# reset old VRSAVE
|
||||
|
||||
blr
|
||||
|
||||
.align 2
|
||||
;# r3 unsigned char * src
|
||||
;# r4 int src_pitch
|
||||
;# r5 int x_offset
|
||||
;# r6 int y_offset
|
||||
;# r7 unsigned char * dst
|
||||
;# r8 int dst_pitch
|
||||
bilinear_predict8x8_ppc:
|
||||
mfspr r11, 256 ;# get old VRSAVE
|
||||
oris r12, r11, 0xfff0
|
||||
ori r12, r12, 0xffff
|
||||
mtspr 256, r12 ;# set VRSAVE
|
||||
|
||||
stwu r1,-32(r1) ;# create space on the stack
|
||||
|
||||
HProlog second_pass_8x8_pre_copy_b
|
||||
|
||||
;# Load up permutation constants
|
||||
load_c v10, b_0123_b, 0, r9, r12
|
||||
load_c v11, b_4567_b, 0, r9, r12
|
||||
|
||||
hfilter_8 v0, 1
|
||||
hfilter_8 v1, 1
|
||||
hfilter_8 v2, 1
|
||||
hfilter_8 v3, 1
|
||||
hfilter_8 v4, 1
|
||||
hfilter_8 v5, 1
|
||||
hfilter_8 v6, 1
|
||||
hfilter_8 v7, 1
|
||||
|
||||
;# Finished filtering main horizontal block. If there is no
|
||||
;# vertical filtering, jump to storing the data. Otherwise
|
||||
;# load up and filter the additional line that is needed
|
||||
;# for the vertical filter.
|
||||
beq store_out_8x8_b
|
||||
|
||||
hfilter_8 v8, 0
|
||||
|
||||
b second_pass_8x8_b
|
||||
|
||||
second_pass_8x8_pre_copy_b:
|
||||
slwi r6, r6, 5 ;# index into vertical filter array
|
||||
|
||||
load_and_align_8 v0, 1
|
||||
load_and_align_8 v1, 1
|
||||
load_and_align_8 v2, 1
|
||||
load_and_align_8 v3, 1
|
||||
load_and_align_8 v4, 1
|
||||
load_and_align_8 v5, 1
|
||||
load_and_align_8 v6, 1
|
||||
load_and_align_8 v7, 1
|
||||
load_and_align_8 v8, 0
|
||||
|
||||
second_pass_8x8_b:
|
||||
vspltish v20, 8
|
||||
vspltish v18, 3
|
||||
vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
|
||||
|
||||
load_vfilter v20, v21
|
||||
|
||||
vfilter_16 v0, v1
|
||||
vfilter_16 v1, v2
|
||||
vfilter_16 v2, v3
|
||||
vfilter_16 v3, v4
|
||||
vfilter_16 v4, v5
|
||||
vfilter_16 v5, v6
|
||||
vfilter_16 v6, v7
|
||||
vfilter_16 v7, v8
|
||||
|
||||
store_out_8x8_b:
|
||||
|
||||
cmpi cr0, r8, 8
|
||||
beq cr0, store_aligned_8x8_b
|
||||
|
||||
w_8x8 v0, r7, r0, r8
|
||||
w_8x8 v1, r7, r0, r8
|
||||
w_8x8 v2, r7, r0, r8
|
||||
w_8x8 v3, r7, r0, r8
|
||||
w_8x8 v4, r7, r0, r8
|
||||
w_8x8 v5, r7, r0, r8
|
||||
w_8x8 v6, r7, r0, r8
|
||||
w_8x8 v7, r7, r0, r8
|
||||
|
||||
b exit_8x8
|
||||
|
||||
store_aligned_8x8_b:
|
||||
load_c v10, b_hilo_b, 0, r9, r10
|
||||
|
||||
vperm v0, v0, v1, v10
|
||||
vperm v2, v2, v3, v10
|
||||
vperm v4, v4, v5, v10
|
||||
vperm v6, v6, v7, v10
|
||||
|
||||
stvx v0, 0, r7
|
||||
addi r7, r7, 16
|
||||
stvx v2, 0, r7
|
||||
addi r7, r7, 16
|
||||
stvx v4, 0, r7
|
||||
addi r7, r7, 16
|
||||
stvx v6, 0, r7
|
||||
|
||||
exit_8x8:
|
||||
|
||||
addi r1, r1, 32 ;# recover stack
|
||||
mtspr 256, r11 ;# reset old VRSAVE
|
||||
|
||||
blr
|
||||
|
||||
;# Filters a horizontal line
|
||||
;# expects:
|
||||
;# r3 src_ptr
|
||||
;# r4 pitch
|
||||
;# r10 16
|
||||
;# r12 32
|
||||
;# v17 perm input
|
||||
;# v18 rounding
|
||||
;# v19 shift
|
||||
;# v20 filter taps
|
||||
;# v21 tmp
|
||||
;# v22 tmp
|
||||
;# v23 tmp
|
||||
;# v24 tmp
|
||||
;# v25 tmp
|
||||
;# v26 tmp
|
||||
;# v27 tmp
|
||||
;# v28 perm output
|
||||
;#
|
||||
.macro hfilter_16 V, increment_counter
|
||||
|
||||
lvsl v17, 0, r3 ;# permutate value for alignment
|
||||
|
||||
;# input to filter is 21 bytes wide, output is 16 bytes.
|
||||
;# input can span three vectors if not aligned correctly.
|
||||
lvx v21, 0, r3
|
||||
lvx v22, r10, r3
|
||||
lvx v23, r12, r3
|
||||
|
||||
.if \increment_counter
|
||||
add r3, r3, r4
|
||||
.endif
|
||||
vperm v21, v21, v22, v17
|
||||
vperm v22, v22, v23, v17 ;# v8 v9 = 21 input pixels left-justified
|
||||
|
||||
;# set 0
|
||||
vmsummbm v24, v20, v21, v18 ;# taps times elements
|
||||
|
||||
;# set 1
|
||||
vsldoi v23, v21, v22, 1
|
||||
vmsummbm v25, v20, v23, v18
|
||||
|
||||
;# set 2
|
||||
vsldoi v23, v21, v22, 2
|
||||
vmsummbm v26, v20, v23, v18
|
||||
|
||||
;# set 3
|
||||
vsldoi v23, v21, v22, 3
|
||||
vmsummbm v27, v20, v23, v18
|
||||
|
||||
vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
|
||||
vpkswus v25, v26, v27 ;# v25 = 2 6 A E 3 7 B F
|
||||
|
||||
vsrh v24, v24, v19 ;# divide v0, v1 by 128
|
||||
vsrh v25, v25, v19
|
||||
|
||||
vpkuhus \V, v24, v25 ;# \V = scrambled 8-bit result
|
||||
vperm \V, \V, v0, v28 ;# \V = correctly-ordered result
|
||||
.endm
|
||||
|
||||
.macro load_and_align_16 V, increment_counter
|
||||
lvsl v17, 0, r3 ;# permutate value for alignment
|
||||
|
||||
;# input to filter is 21 bytes wide, output is 16 bytes.
|
||||
;# input can span three vectors if not aligned correctly.
|
||||
lvx v21, 0, r3
|
||||
lvx v22, r10, r3
|
||||
|
||||
.if \increment_counter
|
||||
add r3, r3, r4
|
||||
.endif
|
||||
|
||||
vperm \V, v21, v22, v17
|
||||
.endm
|
||||
|
||||
.macro write_16 V, increment_counter
|
||||
stvx \V, 0, r7
|
||||
|
||||
.if \increment_counter
|
||||
add r7, r7, r8
|
||||
.endif
|
||||
.endm
|
||||
|
||||
.align 2
|
||||
;# r3 unsigned char * src
|
||||
;# r4 int src_pitch
|
||||
;# r5 int x_offset
|
||||
;# r6 int y_offset
|
||||
;# r7 unsigned char * dst
|
||||
;# r8 int dst_pitch
|
||||
bilinear_predict16x16_ppc:
|
||||
mfspr r11, 256 ;# get old VRSAVE
|
||||
oris r12, r11, 0xffff
|
||||
ori r12, r12, 0xfff8
|
||||
mtspr 256, r12 ;# set VRSAVE
|
||||
|
||||
HProlog second_pass_16x16_pre_copy_b
|
||||
|
||||
hfilter_16 v0, 1
|
||||
hfilter_16 v1, 1
|
||||
hfilter_16 v2, 1
|
||||
hfilter_16 v3, 1
|
||||
hfilter_16 v4, 1
|
||||
hfilter_16 v5, 1
|
||||
hfilter_16 v6, 1
|
||||
hfilter_16 v7, 1
|
||||
hfilter_16 v8, 1
|
||||
hfilter_16 v9, 1
|
||||
hfilter_16 v10, 1
|
||||
hfilter_16 v11, 1
|
||||
hfilter_16 v12, 1
|
||||
hfilter_16 v13, 1
|
||||
hfilter_16 v14, 1
|
||||
hfilter_16 v15, 1
|
||||
|
||||
;# Finished filtering main horizontal block. If there is no
|
||||
;# vertical filtering, jump to storing the data. Otherwise
|
||||
;# load up and filter the additional line that is needed
|
||||
;# for the vertical filter.
|
||||
beq store_out_16x16_b
|
||||
|
||||
hfilter_16 v16, 0
|
||||
|
||||
b second_pass_16x16_b
|
||||
|
||||
second_pass_16x16_pre_copy_b:
|
||||
slwi r6, r6, 5 ;# index into vertical filter array
|
||||
|
||||
load_and_align_16 v0, 1
|
||||
load_and_align_16 v1, 1
|
||||
load_and_align_16 v2, 1
|
||||
load_and_align_16 v3, 1
|
||||
load_and_align_16 v4, 1
|
||||
load_and_align_16 v5, 1
|
||||
load_and_align_16 v6, 1
|
||||
load_and_align_16 v7, 1
|
||||
load_and_align_16 v8, 1
|
||||
load_and_align_16 v9, 1
|
||||
load_and_align_16 v10, 1
|
||||
load_and_align_16 v11, 1
|
||||
load_and_align_16 v12, 1
|
||||
load_and_align_16 v13, 1
|
||||
load_and_align_16 v14, 1
|
||||
load_and_align_16 v15, 1
|
||||
load_and_align_16 v16, 0
|
||||
|
||||
second_pass_16x16_b:
|
||||
vspltish v20, 8
|
||||
vspltish v18, 3
|
||||
vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
|
||||
|
||||
load_vfilter v20, v21
|
||||
|
||||
vfilter_16 v0, v1
|
||||
vfilter_16 v1, v2
|
||||
vfilter_16 v2, v3
|
||||
vfilter_16 v3, v4
|
||||
vfilter_16 v4, v5
|
||||
vfilter_16 v5, v6
|
||||
vfilter_16 v6, v7
|
||||
vfilter_16 v7, v8
|
||||
vfilter_16 v8, v9
|
||||
vfilter_16 v9, v10
|
||||
vfilter_16 v10, v11
|
||||
vfilter_16 v11, v12
|
||||
vfilter_16 v12, v13
|
||||
vfilter_16 v13, v14
|
||||
vfilter_16 v14, v15
|
||||
vfilter_16 v15, v16
|
||||
|
||||
store_out_16x16_b:
|
||||
|
||||
write_16 v0, 1
|
||||
write_16 v1, 1
|
||||
write_16 v2, 1
|
||||
write_16 v3, 1
|
||||
write_16 v4, 1
|
||||
write_16 v5, 1
|
||||
write_16 v6, 1
|
||||
write_16 v7, 1
|
||||
write_16 v8, 1
|
||||
write_16 v9, 1
|
||||
write_16 v10, 1
|
||||
write_16 v11, 1
|
||||
write_16 v12, 1
|
||||
write_16 v13, 1
|
||||
write_16 v14, 1
|
||||
write_16 v15, 0
|
||||
|
||||
mtspr 256, r11 ;# reset old VRSAVE
|
||||
|
||||
blr
|
||||
|
||||
.data
|
||||
|
||||
.align 4
|
||||
hfilter_b:
|
||||
.byte 128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0
|
||||
.byte 112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0
|
||||
.byte 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0
|
||||
.byte 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0
|
||||
.byte 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0
|
||||
.byte 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0
|
||||
.byte 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0
|
||||
.byte 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0
|
||||
|
||||
.align 4
|
||||
vfilter_b:
|
||||
.byte 128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128
|
||||
.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
|
||||
.byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
|
||||
.byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
|
||||
.byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
|
||||
.byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
|
||||
.byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
|
||||
.byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
|
||||
.byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
|
||||
.byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
|
||||
.byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
|
||||
.byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
|
||||
.byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
|
||||
.byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
|
||||
.byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
|
||||
.byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
|
||||
|
||||
.align 4
|
||||
b_hperm_b:
|
||||
.byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
|
||||
|
||||
.align 4
|
||||
b_0123_b:
|
||||
.byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
|
||||
|
||||
.align 4
|
||||
b_4567_b:
|
||||
.byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
|
||||
|
||||
b_hilo_b:
|
||||
.byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
|
|
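For reference, the arithmetic encoded by the hfilter_b/vfilter_b tables above is a two-tap bilinear filter: each tap pair sums to 128 (for example 112/16 at subpixel offset 1), the products are rounded with the +64 constant built in HProlog and the second_pass_* blocks, and the result is shifted right by 7. The scalar model below assumes "offset" is the 0..7 subpixel position used to index the tables; the full-pel case (offset 0) is the one the assembly skips entirely with its beq branches.

/* Scalar model of the two-tap step the AltiVec code vectorizes:
   taps (128 - 16*offset, 16*offset) sum to 128; +64 rounds; >>7 rescales. */
static unsigned char bilinear_tap(unsigned char a, unsigned char b, int offset) {
  int tap0 = 128 - (offset << 4);            /* e.g. offset 1 -> 112 */
  int tap1 = offset << 4;                    /* e.g. offset 1 ->  16 */
  return (unsigned char)((a * tap0 + b * tap1 + 64) >> 7);
}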
@@ -1,189 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
.globl short_idct4x4_ppc
|
||||
|
||||
.macro load_c V, LABEL, OFF, R0, R1
|
||||
lis \R0, \LABEL@ha
|
||||
la \R1, \LABEL@l(\R0)
|
||||
lvx \V, \OFF, \R1
|
||||
.endm
|
||||
|
||||
;# r3 short *input
|
||||
;# r4 short *output
|
||||
;# r5 int pitch
|
||||
.align 2
|
||||
short_idct4x4_ppc:
|
||||
mfspr r11, 256 ;# get old VRSAVE
|
||||
oris r12, r11, 0xfff8
|
||||
mtspr 256, r12 ;# set VRSAVE
|
||||
|
||||
load_c v8, sinpi8sqrt2, 0, r9, r10
|
||||
load_c v9, cospi8sqrt2minus1, 0, r9, r10
|
||||
load_c v10, hi_hi, 0, r9, r10
|
||||
load_c v11, lo_lo, 0, r9, r10
|
||||
load_c v12, shift_16, 0, r9, r10
|
||||
|
||||
li r10, 16
|
||||
lvx v0, 0, r3 ;# input ip[0], ip[ 4]
|
||||
lvx v1, r10, r3 ;# input ip[8], ip[12]
|
||||
|
||||
;# first pass
|
||||
vupkhsh v2, v0
|
||||
vupkhsh v3, v1
|
||||
vaddsws v6, v2, v3 ;# a1 = ip[0]+ip[8]
|
||||
vsubsws v7, v2, v3 ;# b1 = ip[0]-ip[8]
|
||||
|
||||
vupklsh v0, v0
|
||||
vmulosh v4, v0, v8
|
||||
vsraw v4, v4, v12
|
||||
vaddsws v4, v4, v0 ;# ip[ 4] * sin(pi/8) * sqrt(2)
|
||||
|
||||
vupklsh v1, v1
|
||||
vmulosh v5, v1, v9
|
||||
vsraw v5, v5, v12 ;# ip[12] * cos(pi/8) * sqrt(2)
|
||||
vaddsws v5, v5, v1
|
||||
|
||||
vsubsws v4, v4, v5 ;# c1
|
||||
|
||||
vmulosh v3, v1, v8
|
||||
vsraw v3, v3, v12
|
||||
vaddsws v3, v3, v1 ;# ip[12] * sin(pi/8) * sqrt(2)
|
||||
|
||||
vmulosh v5, v0, v9
|
||||
vsraw v5, v5, v12 ;# ip[ 4] * cos(pi/8) * sqrt(2)
|
||||
vaddsws v5, v5, v0
|
||||
|
||||
vaddsws v3, v3, v5 ;# d1
|
||||
|
||||
vaddsws v0, v6, v3 ;# a1 + d1
|
||||
vsubsws v3, v6, v3 ;# a1 - d1
|
||||
|
||||
vaddsws v1, v7, v4 ;# b1 + c1
|
||||
vsubsws v2, v7, v4 ;# b1 - c1
|
||||
|
||||
;# transpose input
|
||||
vmrghw v4, v0, v1 ;# a0 b0 a1 b1
|
||||
vmrghw v5, v2, v3 ;# c0 d0 c1 d1
|
||||
|
||||
vmrglw v6, v0, v1 ;# a2 b2 a3 b3
|
||||
vmrglw v7, v2, v3 ;# c2 d2 c3 d3
|
||||
|
||||
vperm v0, v4, v5, v10 ;# a0 b0 c0 d0
|
||||
vperm v1, v4, v5, v11 ;# a1 b1 c1 d1
|
||||
|
||||
vperm v2, v6, v7, v10 ;# a2 b2 c2 d2
|
||||
vperm v3, v6, v7, v11 ;# a3 b3 c3 d3
|
||||
|
||||
;# second pass
|
||||
vaddsws v6, v0, v2 ;# a1 = ip[0]+ip[8]
|
||||
vsubsws v7, v0, v2 ;# b1 = ip[0]-ip[8]
|
||||
|
||||
vmulosh v4, v1, v8
|
||||
vsraw v4, v4, v12
|
||||
vaddsws v4, v4, v1 ;# ip[ 4] * sin(pi/8) * sqrt(2)
|
||||
|
||||
vmulosh v5, v3, v9
|
||||
vsraw v5, v5, v12 ;# ip[12] * cos(pi/8) * sqrt(2)
|
||||
vaddsws v5, v5, v3
|
||||
|
||||
vsubsws v4, v4, v5 ;# c1
|
||||
|
||||
vmulosh v2, v3, v8
|
||||
vsraw v2, v2, v12
|
||||
vaddsws v2, v2, v3 ;# ip[12] * sin(pi/8) * sqrt(2)
|
||||
|
||||
vmulosh v5, v1, v9
|
||||
vsraw v5, v5, v12 ;# ip[ 4] * cos(pi/8) * sqrt(2)
|
||||
vaddsws v5, v5, v1
|
||||
|
||||
vaddsws v3, v2, v5 ;# d1
|
||||
|
||||
vaddsws v0, v6, v3 ;# a1 + d1
|
||||
vsubsws v3, v6, v3 ;# a1 - d1
|
||||
|
||||
vaddsws v1, v7, v4 ;# b1 + c1
|
||||
vsubsws v2, v7, v4 ;# b1 - c1
|
||||
|
||||
vspltish v6, 4
|
||||
vspltish v7, 3
|
||||
|
||||
vpkswss v0, v0, v1
|
||||
vpkswss v1, v2, v3
|
||||
|
||||
vaddshs v0, v0, v6
|
||||
vaddshs v1, v1, v6
|
||||
|
||||
vsrah v0, v0, v7
|
||||
vsrah v1, v1, v7
|
||||
|
||||
;# transpose output
|
||||
vmrghh v2, v0, v1 ;# a0 c0 a1 c1 a2 c2 a3 c3
|
||||
vmrglh v3, v0, v1 ;# b0 d0 b1 d1 b2 d2 b3 d3
|
||||
|
||||
vmrghh v0, v2, v3 ;# a0 b0 c0 d0 a1 b1 c1 d1
|
||||
vmrglh v1, v2, v3 ;# a2 b2 c2 d2 a3 b3 c3 d3
|
||||
|
||||
stwu r1,-416(r1) ;# create space on the stack
|
||||
|
||||
stvx v0, 0, r1
|
||||
lwz r6, 0(r1)
|
||||
stw r6, 0(r4)
|
||||
lwz r6, 4(r1)
|
||||
stw r6, 4(r4)
|
||||
|
||||
add r4, r4, r5
|
||||
|
||||
lwz r6, 8(r1)
|
||||
stw r6, 0(r4)
|
||||
lwz r6, 12(r1)
|
||||
stw r6, 4(r4)
|
||||
|
||||
add r4, r4, r5
|
||||
|
||||
stvx v1, 0, r1
|
||||
lwz r6, 0(r1)
|
||||
stw r6, 0(r4)
|
||||
lwz r6, 4(r1)
|
||||
stw r6, 4(r4)
|
||||
|
||||
add r4, r4, r5
|
||||
|
||||
lwz r6, 8(r1)
|
||||
stw r6, 0(r4)
|
||||
lwz r6, 12(r1)
|
||||
stw r6, 4(r4)
|
||||
|
||||
addi r1, r1, 416 ;# recover stack
|
||||
|
||||
mtspr 256, r11 ;# reset old VRSAVE
|
||||
|
||||
blr
|
||||
|
||||
.align 4
|
||||
sinpi8sqrt2:
|
||||
.short 35468, 35468, 35468, 35468, 35468, 35468, 35468, 35468
|
||||
|
||||
.align 4
|
||||
cospi8sqrt2minus1:
|
||||
.short 20091, 20091, 20091, 20091, 20091, 20091, 20091, 20091
|
||||
|
||||
.align 4
|
||||
shift_16:
|
||||
.long 16, 16, 16, 16
|
||||
|
||||
.align 4
|
||||
hi_hi:
|
||||
.byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
|
||||
|
||||
.align 4
|
||||
lo_lo:
|
||||
.byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
|
|
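The .short tables above are Q16 fixed-point forms of the usual VP8-style IDCT constants: 35468/65536 ≈ √2·sin(π/8) ≈ 0.5412 and 20091/65536 ≈ √2·cos(π/8) − 1 ≈ 0.3066. Because 35468 does not fit in a signed halfword, vmulosh effectively multiplies by 35468 − 65536 = −30068, i.e. by (c − 1); that is why both constant multiplies are followed by vsraw with shift_16 and then an add of the original value, giving x·(c − 1) + x = x·c. A scalar sketch of that pattern (not code from the tree):

/* Multiply by sqrt(2)*sin(pi/8): the 16-bit constant wraps to -30068,
   so the product is x*(c-1) and the trailing add restores x*c.
   An arithmetic right shift is assumed, matching vsraw. */
static int mul_sinpi8sqrt2(short x) {
  return ((x * (short)35468) >> 16) + x;
}

/* Multiply by sqrt(2)*cos(pi/8): the table already stores c-1 (20091). */
static int mul_cospi8sqrt2(short x) {
  return ((x * 20091) >> 16) + x;
}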
@@ -1,127 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
|
||||
#include "vp9/common/vp9_loopfilter.h"
|
||||
#include "vp9/common/vp9_onyxc_int.h"
|
||||
|
||||
typedef void loop_filter_function_y_ppc
|
||||
(
|
||||
unsigned char *s, // source pointer
|
||||
int p, // pitch
|
||||
const signed char *flimit,
|
||||
const signed char *limit,
|
||||
const signed char *thresh
|
||||
);
|
||||
|
||||
typedef void loop_filter_function_uv_ppc
|
||||
(
|
||||
unsigned char *u, // source pointer
|
||||
unsigned char *v, // source pointer
|
||||
int p, // pitch
|
||||
const signed char *flimit,
|
||||
const signed char *limit,
|
||||
const signed char *thresh
|
||||
);
|
||||
|
||||
typedef void loop_filter_function_s_ppc
|
||||
(
|
||||
unsigned char *s, // source pointer
|
||||
int p, // pitch
|
||||
const signed char *flimit
|
||||
);
|
||||
|
||||
loop_filter_function_y_ppc mbloop_filter_horizontal_edge_y_ppc;
|
||||
loop_filter_function_y_ppc mbloop_filter_vertical_edge_y_ppc;
|
||||
loop_filter_function_y_ppc loop_filter_horizontal_edge_y_ppc;
|
||||
loop_filter_function_y_ppc loop_filter_vertical_edge_y_ppc;
|
||||
|
||||
loop_filter_function_uv_ppc mbloop_filter_horizontal_edge_uv_ppc;
|
||||
loop_filter_function_uv_ppc mbloop_filter_vertical_edge_uv_ppc;
|
||||
loop_filter_function_uv_ppc loop_filter_horizontal_edge_uv_ppc;
|
||||
loop_filter_function_uv_ppc loop_filter_vertical_edge_uv_ppc;
|
||||
|
||||
loop_filter_function_s_ppc loop_filter_simple_horizontal_edge_ppc;
|
||||
loop_filter_function_s_ppc loop_filter_simple_vertical_edge_ppc;
|
||||
|
||||
// Horizontal MB filtering
|
||||
void loop_filter_mbh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
|
||||
int y_stride, int uv_stride, loop_filter_info *lfi) {
|
||||
mbloop_filter_horizontal_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr);
|
||||
|
||||
if (u_ptr)
|
||||
mbloop_filter_horizontal_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr);
|
||||
}
|
||||
|
||||
void loop_filter_mbhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
|
||||
int y_stride, int uv_stride, loop_filter_info *lfi) {
|
||||
(void)u_ptr;
|
||||
(void)v_ptr;
|
||||
(void)uv_stride;
|
||||
loop_filter_simple_horizontal_edge_ppc(y_ptr, y_stride, lfi->mbflim);
|
||||
}
|
||||
|
||||
// Vertical MB Filtering
|
||||
void loop_filter_mbv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
|
||||
int y_stride, int uv_stride, loop_filter_info *lfi) {
|
||||
mbloop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr);
|
||||
|
||||
if (u_ptr)
|
||||
mbloop_filter_vertical_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr);
|
||||
}
|
||||
|
||||
void loop_filter_mbvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
|
||||
int y_stride, int uv_stride, loop_filter_info *lfi) {
|
||||
(void)u_ptr;
|
||||
(void)v_ptr;
|
||||
(void)uv_stride;
|
||||
loop_filter_simple_vertical_edge_ppc(y_ptr, y_stride, lfi->mbflim);
|
||||
}
|
||||
|
||||
// Horizontal B Filtering
|
||||
void loop_filter_bh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
|
||||
int y_stride, int uv_stride, loop_filter_info *lfi) {
|
||||
// These should all be done at once with one call, instead of 3
|
||||
loop_filter_horizontal_edge_y_ppc(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
|
||||
loop_filter_horizontal_edge_y_ppc(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
|
||||
loop_filter_horizontal_edge_y_ppc(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
|
||||
|
||||
if (u_ptr)
|
||||
loop_filter_horizontal_edge_uv_ppc(u_ptr + 4 * uv_stride, v_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr);
|
||||
}
|
||||
|
||||
void loop_filter_bhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
|
||||
int y_stride, int uv_stride, loop_filter_info *lfi) {
|
||||
(void)u_ptr;
|
||||
(void)v_ptr;
|
||||
(void)uv_stride;
|
||||
loop_filter_simple_horizontal_edge_ppc(y_ptr + 4 * y_stride, y_stride, lfi->flim);
|
||||
loop_filter_simple_horizontal_edge_ppc(y_ptr + 8 * y_stride, y_stride, lfi->flim);
|
||||
loop_filter_simple_horizontal_edge_ppc(y_ptr + 12 * y_stride, y_stride, lfi->flim);
|
||||
}
|
||||
|
||||
// Vertical B Filtering
|
||||
void loop_filter_bv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
|
||||
int y_stride, int uv_stride, loop_filter_info *lfi) {
|
||||
loop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->flim, lfi->lim, lfi->thr);
|
||||
|
||||
if (u_ptr)
|
||||
loop_filter_vertical_edge_uv_ppc(u_ptr + 4, v_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr);
|
||||
}
|
||||
|
||||
void loop_filter_bvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
|
||||
int y_stride, int uv_stride, loop_filter_info *lfi) {
|
||||
(void)u_ptr;
|
||||
(void)v_ptr;
|
||||
(void)uv_stride;
|
||||
loop_filter_simple_vertical_edge_ppc(y_ptr + 4, y_stride, lfi->flim);
|
||||
loop_filter_simple_vertical_edge_ppc(y_ptr + 8, y_stride, lfi->flim);
|
||||
loop_filter_simple_vertical_edge_ppc(y_ptr + 12, y_stride, lfi->flim);
|
||||
}
|
The diff for this file is not shown because it is too large.
@@ -1,59 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;


    .globl save_platform_context
    .globl restore_platform_context

.macro W V P
    stvx    \V, 0, \P
    addi    \P, \P, 16
.endm

.macro R V P
    lvx     \V, 0, \P
    addi    \P, \P, 16
.endm

;# r3 context_ptr
    .align 2
save_platform_context:
    W       v20, r3
    W       v21, r3
    W       v22, r3
    W       v23, r3
    W       v24, r3
    W       v25, r3
    W       v26, r3
    W       v27, r3
    W       v28, r3
    W       v29, r3
    W       v30, r3
    W       v31, r3

    blr

;# r3 context_ptr
    .align 2
restore_platform_context:
    R       v20, r3
    R       v21, r3
    R       v22, r3
    R       v23, r3
    R       v24, r3
    R       v25, r3
    R       v26, r3
    R       v27, r3
    R       v28, r3
    R       v29, r3
    R       v30, r3
    R       v31, r3

    blr
@@ -1,175 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
.globl recon4b_ppc
|
||||
.globl recon2b_ppc
|
||||
.globl recon_b_ppc
|
||||
|
||||
.macro row_of16 Diff Pred Dst Stride
|
||||
lvx v1, 0, \Pred ;# v1 = pred = p0..p15
|
||||
addi \Pred, \Pred, 16 ;# next pred
|
||||
vmrghb v2, v0, v1 ;# v2 = 16-bit p0..p7
|
||||
lvx v3, 0, \Diff ;# v3 = d0..d7
|
||||
vaddshs v2, v2, v3 ;# v2 = r0..r7
|
||||
vmrglb v1, v0, v1 ;# v1 = 16-bit p8..p15
|
||||
lvx v3, r8, \Diff ;# v3 = d8..d15
|
||||
addi \Diff, \Diff, 32 ;# next diff
|
||||
vaddshs v3, v3, v1 ;# v3 = r8..r15
|
||||
vpkshus v2, v2, v3 ;# v2 = 8-bit r0..r15
|
||||
stvx v2, 0, \Dst ;# to dst
|
||||
add \Dst, \Dst, \Stride ;# next dst
|
||||
.endm
|
||||
|
||||
.text
|
||||
.align 2
|
||||
;# r3 = short *diff_ptr,
|
||||
;# r4 = unsigned char *pred_ptr,
|
||||
;# r5 = unsigned char *dst_ptr,
|
||||
;# r6 = int stride
|
||||
recon4b_ppc:
|
||||
mfspr r0, 256 ;# get old VRSAVE
|
||||
stw r0, -8(r1) ;# save old VRSAVE to stack
|
||||
oris r0, r0, 0xf000
|
||||
mtspr 256,r0 ;# set VRSAVE
|
||||
|
||||
vxor v0, v0, v0
|
||||
li r8, 16
|
||||
|
||||
row_of16 r3, r4, r5, r6
|
||||
row_of16 r3, r4, r5, r6
|
||||
row_of16 r3, r4, r5, r6
|
||||
row_of16 r3, r4, r5, r6
|
||||
|
||||
lwz r12, -8(r1) ;# restore old VRSAVE from stack
|
||||
mtspr 256, r12 ;# reset old VRSAVE
|
||||
|
||||
blr
|
||||
|
||||
.macro two_rows_of8 Diff Pred Dst Stride write_first_four_pels
|
||||
lvx v1, 0, \Pred ;# v1 = pred = p0..p15
|
||||
vmrghb v2, v0, v1 ;# v2 = 16-bit p0..p7
|
||||
lvx v3, 0, \Diff ;# v3 = d0..d7
|
||||
vaddshs v2, v2, v3 ;# v2 = r0..r7
|
||||
vmrglb v1, v0, v1 ;# v1 = 16-bit p8..p15
|
||||
lvx v3, r8, \Diff ;# v2 = d8..d15
|
||||
vaddshs v3, v3, v1 ;# v3 = r8..r15
|
||||
vpkshus v2, v2, v3 ;# v3 = 8-bit r0..r15
|
||||
stvx v2, 0, r10 ;# 2 rows to dst from buf
|
||||
lwz r0, 0(r10)
|
||||
.if \write_first_four_pels
|
||||
stw r0, 0(\Dst)
|
||||
.else
|
||||
stwux r0, \Dst, \Stride
|
||||
.endif
|
||||
lwz r0, 4(r10)
|
||||
stw r0, 4(\Dst)
|
||||
lwz r0, 8(r10)
|
||||
stwux r0, \Dst, \Stride ;# advance dst to next row
|
||||
lwz r0, 12(r10)
|
||||
stw r0, 4(\Dst)
|
||||
.endm
|
||||
|
||||
.align 2
|
||||
;# r3 = short *diff_ptr,
|
||||
;# r4 = unsigned char *pred_ptr,
|
||||
;# r5 = unsigned char *dst_ptr,
|
||||
;# r6 = int stride
|
||||
|
||||
recon2b_ppc:
|
||||
mfspr r0, 256 ;# get old VRSAVE
|
||||
stw r0, -8(r1) ;# save old VRSAVE to stack
|
||||
oris r0, r0, 0xf000
|
||||
mtspr 256,r0 ;# set VRSAVE
|
||||
|
||||
vxor v0, v0, v0
|
||||
li r8, 16
|
||||
|
||||
la r10, -48(r1) ;# buf
|
||||
|
||||
two_rows_of8 r3, r4, r5, r6, 1
|
||||
|
||||
addi r4, r4, 16; ;# next pred
|
||||
addi r3, r3, 32; ;# next diff
|
||||
|
||||
two_rows_of8 r3, r4, r5, r6, 0
|
||||
|
||||
lwz r12, -8(r1) ;# restore old VRSAVE from stack
|
||||
mtspr 256, r12 ;# reset old VRSAVE
|
||||
|
||||
blr
|
||||
|
||||
.macro get_two_diff_rows
|
||||
stw r0, 0(r10)
|
||||
lwz r0, 4(r3)
|
||||
stw r0, 4(r10)
|
||||
lwzu r0, 32(r3)
|
||||
stw r0, 8(r10)
|
||||
lwz r0, 4(r3)
|
||||
stw r0, 12(r10)
|
||||
lvx v3, 0, r10
|
||||
.endm
|
||||
|
||||
.align 2
|
||||
;# r3 = short *diff_ptr,
|
||||
;# r4 = unsigned char *pred_ptr,
|
||||
;# r5 = unsigned char *dst_ptr,
|
||||
;# r6 = int stride
|
||||
recon_b_ppc:
|
||||
mfspr r0, 256 ;# get old VRSAVE
|
||||
stw r0, -8(r1) ;# save old VRSAVE to stack
|
||||
oris r0, r0, 0xf000
|
||||
mtspr 256,r0 ;# set VRSAVE
|
||||
|
||||
vxor v0, v0, v0
|
||||
|
||||
la r10, -48(r1) ;# buf
|
||||
|
||||
lwz r0, 0(r4)
|
||||
stw r0, 0(r10)
|
||||
lwz r0, 16(r4)
|
||||
stw r0, 4(r10)
|
||||
lwz r0, 32(r4)
|
||||
stw r0, 8(r10)
|
||||
lwz r0, 48(r4)
|
||||
stw r0, 12(r10)
|
||||
|
||||
lvx v1, 0, r10; ;# v1 = pred = p0..p15
|
||||
|
||||
lwz r0, 0(r3) ;# v3 = d0..d7
|
||||
|
||||
get_two_diff_rows
|
||||
|
||||
vmrghb v2, v0, v1; ;# v2 = 16-bit p0..p7
|
||||
vaddshs v2, v2, v3; ;# v2 = r0..r7
|
||||
|
||||
lwzu r0, 32(r3) ;# v3 = d8..d15
|
||||
|
||||
get_two_diff_rows
|
||||
|
||||
vmrglb v1, v0, v1; ;# v1 = 16-bit p8..p15
|
||||
vaddshs v3, v3, v1; ;# v3 = r8..r15
|
||||
|
||||
vpkshus v2, v2, v3; ;# v2 = 8-bit r0..r15
|
||||
stvx v2, 0, r10; ;# 16 pels to dst from buf
|
||||
|
||||
lwz r0, 0(r10)
|
||||
stw r0, 0(r5)
|
||||
lwz r0, 4(r10)
|
||||
stwux r0, r5, r6
|
||||
lwz r0, 8(r10)
|
||||
stwux r0, r5, r6
|
||||
lwz r0, 12(r10)
|
||||
stwx r0, r5, r6
|
||||
|
||||
lwz r12, -8(r1) ;# restore old VRSAVE from stack
|
||||
mtspr 256, r12 ;# reset old VRSAVE
|
||||
|
||||
blr
|
|
@@ -1,167 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "vp9/common/vp9_loopfilter.h"
|
||||
#include "recon.h"
|
||||
#include "vp9/common/vp9_onyxc_int.h"
|
||||
|
||||
void (*vp8_short_idct4x4)(short *input, short *output, int pitch);
|
||||
void (*vp8_short_idct4x4_1)(short *input, short *output, int pitch);
|
||||
void (*vp8_dc_only_idct)(short input_dc, short *output, int pitch);
|
||||
|
||||
extern void (*vp9_post_proc_down_and_across)(unsigned char *src_ptr,
|
||||
unsigned char *dst_ptr,
|
||||
int src_pixels_per_line,
|
||||
int dst_pixels_per_line,
|
||||
int rows, int cols, int flimit);
|
||||
|
||||
extern void (*vp9_mbpost_proc_down)(unsigned char *dst, int pitch,
|
||||
int rows, int cols, int flimit);
|
||||
extern void vp9_mbpost_proc_down_c(unsigned char *dst, int pitch,
|
||||
int rows, int cols, int flimit);
|
||||
extern void (*vp9_mbpost_proc_across_ip)(unsigned char *src, int pitch,
|
||||
int rows, int cols, int flimit);
|
||||
extern void vp9_mbpost_proc_across_ip_c(unsigned char *src, int pitch,
|
||||
int rows, int cols, int flimit);
|
||||
extern void vp9_post_proc_down_and_across_c(unsigned char *src_ptr,
|
||||
unsigned char *dst_ptr,
|
||||
int src_pixels_per_line,
|
||||
int dst_pixels_per_line,
|
||||
int rows, int cols, int flimit);
|
||||
void vp9_plane_add_noise_c(unsigned char *start,
|
||||
unsigned int width, unsigned int height,
|
||||
int pitch, int q, int a);
|
||||
|
||||
extern copy_mem_block_function *vp9_copy_mem16x16;
|
||||
extern copy_mem_block_function *vp9_copy_mem8x8;
|
||||
extern copy_mem_block_function *vp9_copy_mem8x4;
|
||||
|
||||
// PPC
|
||||
extern subpixel_predict_function sixtap_predict_ppc;
|
||||
extern subpixel_predict_function sixtap_predict8x4_ppc;
|
||||
extern subpixel_predict_function sixtap_predict8x8_ppc;
|
||||
extern subpixel_predict_function sixtap_predict16x16_ppc;
|
||||
extern subpixel_predict_function bilinear_predict4x4_ppc;
|
||||
extern subpixel_predict_function bilinear_predict8x4_ppc;
|
||||
extern subpixel_predict_function bilinear_predict8x8_ppc;
|
||||
extern subpixel_predict_function bilinear_predict16x16_ppc;
|
||||
|
||||
extern copy_mem_block_function copy_mem16x16_ppc;
|
||||
|
||||
void recon_b_ppc(short *diff_ptr, unsigned char *pred_ptr,
|
||||
unsigned char *dst_ptr, int stride);
|
||||
void recon2b_ppc(short *diff_ptr, unsigned char *pred_ptr,
|
||||
unsigned char *dst_ptr, int stride);
|
||||
void recon4b_ppc(short *diff_ptr, unsigned char *pred_ptr,
|
||||
unsigned char *dst_ptr, int stride);
|
||||
|
||||
extern void short_idct4x4_ppc(short *input, short *output, int pitch);
|
||||
|
||||
// Generic C
|
||||
extern subpixel_predict_function vp9_sixtap_predict_c;
|
||||
extern subpixel_predict_function vp9_sixtap_predict8x4_c;
|
||||
extern subpixel_predict_function vp9_sixtap_predict8x8_c;
|
||||
extern subpixel_predict_function vp9_sixtap_predict16x16_c;
|
||||
extern subpixel_predict_function vp9_bilinear_predict4x4_c;
|
||||
extern subpixel_predict_function vp9_bilinear_predict8x4_c;
|
||||
extern subpixel_predict_function vp9_bilinear_predict8x8_c;
|
||||
extern subpixel_predict_function vp9_bilinear_predict16x16_c;
|
||||
|
||||
extern copy_mem_block_function vp9_copy_mem16x16_c;
|
||||
extern copy_mem_block_function vp9_copy_mem8x8_c;
|
||||
extern copy_mem_block_function vp9_copy_mem8x4_c;
|
||||
|
||||
void vp9_recon_b_c(short *diff_ptr, unsigned char *pred_ptr,
|
||||
unsigned char *dst_ptr, int stride);
|
||||
void vp9_recon2b_c(short *diff_ptr, unsigned char *pred_ptr,
|
||||
unsigned char *dst_ptr, int stride);
|
||||
void vp9_recon4b_c(short *diff_ptr, unsigned char *pred_ptr,
|
||||
unsigned char *dst_ptr, int stride);
|
||||
|
||||
extern void vp9_short_idct4x4_1_c(short *input, short *output, int pitch);
|
||||
extern void vp9_short_idct4x4_c(short *input, short *output, int pitch);
|
||||
extern void vp8_dc_only_idct_c(short input_dc, short *output, int pitch);
|
||||
|
||||
// PPC
|
||||
extern loop_filter_block_function loop_filter_mbv_ppc;
|
||||
extern loop_filter_block_function loop_filter_bv_ppc;
|
||||
extern loop_filter_block_function loop_filter_mbh_ppc;
|
||||
extern loop_filter_block_function loop_filter_bh_ppc;
|
||||
|
||||
extern loop_filter_block_function loop_filter_mbvs_ppc;
|
||||
extern loop_filter_block_function loop_filter_bvs_ppc;
|
||||
extern loop_filter_block_function loop_filter_mbhs_ppc;
|
||||
extern loop_filter_block_function loop_filter_bhs_ppc;
|
||||
|
||||
// Generic C
|
||||
extern loop_filter_block_function vp9_loop_filter_mbv_c;
|
||||
extern loop_filter_block_function vp9_loop_filter_bv_c;
|
||||
extern loop_filter_block_function vp9_loop_filter_mbh_c;
|
||||
extern loop_filter_block_function vp9_loop_filter_bh_c;
|
||||
|
||||
extern loop_filter_block_function vp9_loop_filter_mbvs_c;
|
||||
extern loop_filter_block_function vp9_loop_filter_bvs_c;
|
||||
extern loop_filter_block_function vp9_loop_filter_mbhs_c;
|
||||
extern loop_filter_block_function vp9_loop_filter_bhs_c;
|
||||
|
||||
extern loop_filter_block_function *vp8_lf_mbvfull;
|
||||
extern loop_filter_block_function *vp8_lf_mbhfull;
|
||||
extern loop_filter_block_function *vp8_lf_bvfull;
|
||||
extern loop_filter_block_function *vp8_lf_bhfull;
|
||||
|
||||
extern loop_filter_block_function *vp8_lf_mbvsimple;
|
||||
extern loop_filter_block_function *vp8_lf_mbhsimple;
|
||||
extern loop_filter_block_function *vp8_lf_bvsimple;
|
||||
extern loop_filter_block_function *vp8_lf_bhsimple;
|
||||
|
||||
void vp9_clear_c(void) {
|
||||
}
|
||||
|
||||
void vp9_machine_specific_config(void) {
|
||||
// Pure C:
|
||||
vp9_clear_system_state = vp9_clear_c;
|
||||
vp9_recon_b = vp9_recon_b_c;
|
||||
vp9_recon4b = vp9_recon4b_c;
|
||||
vp9_recon2b = vp9_recon2b_c;
|
||||
|
||||
vp9_bilinear_predict16x16 = bilinear_predict16x16_ppc;
|
||||
vp9_bilinear_predict8x8 = bilinear_predict8x8_ppc;
|
||||
vp9_bilinear_predict8x4 = bilinear_predict8x4_ppc;
|
||||
vp8_bilinear_predict = bilinear_predict4x4_ppc;
|
||||
|
||||
vp9_sixtap_predict16x16 = sixtap_predict16x16_ppc;
|
||||
vp9_sixtap_predict8x8 = sixtap_predict8x8_ppc;
|
||||
vp9_sixtap_predict8x4 = sixtap_predict8x4_ppc;
|
||||
vp9_sixtap_predict = sixtap_predict_ppc;
|
||||
|
||||
vp8_short_idct4x4_1 = vp9_short_idct4x4_1_c;
|
||||
vp8_short_idct4x4 = short_idct4x4_ppc;
|
||||
vp8_dc_only_idct = vp8_dc_only_idct_c;
|
||||
|
||||
vp8_lf_mbvfull = loop_filter_mbv_ppc;
|
||||
vp8_lf_bvfull = loop_filter_bv_ppc;
|
||||
vp8_lf_mbhfull = loop_filter_mbh_ppc;
|
||||
vp8_lf_bhfull = loop_filter_bh_ppc;
|
||||
|
||||
vp8_lf_mbvsimple = loop_filter_mbvs_ppc;
|
||||
vp8_lf_bvsimple = loop_filter_bvs_ppc;
|
||||
vp8_lf_mbhsimple = loop_filter_mbhs_ppc;
|
||||
vp8_lf_bhsimple = loop_filter_bhs_ppc;
|
||||
|
||||
vp9_post_proc_down_and_across = vp9_post_proc_down_and_across_c;
|
||||
vp9_mbpost_proc_down = vp9_mbpost_proc_down_c;
|
||||
vp9_mbpost_proc_across_ip = vp9_mbpost_proc_across_ip_c;
|
||||
vp9_plane_add_noise = vp9_plane_add_noise_c;
|
||||
|
||||
vp9_copy_mem16x16 = copy_mem16x16_ppc;
|
||||
vp9_copy_mem8x8 = vp9_copy_mem8x8_c;
|
||||
vp9_copy_mem8x4 = vp9_copy_mem8x4_c;
|
||||
|
||||
}
|
|
@@ -1,155 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
|
||||
#include "vp9/encoder/vp9_variance.h"
|
||||
#include "vp9/encoder/vp9_onyx_int.h"
|
||||
|
||||
SADFunction *vp9_sad16x16;
|
||||
SADFunction *vp9_sad16x8;
|
||||
SADFunction *vp9_sad8x16;
|
||||
SADFunction *vp9_sad8x8;
|
||||
SADFunction *vp9_sad4x4;
|
||||
|
||||
variance_function *vp9_variance4x4;
|
||||
variance_function *vp9_variance8x8;
|
||||
variance_function *vp9_variance8x16;
|
||||
variance_function *vp9_variance16x8;
|
||||
variance_function *vp9_variance16x16;
|
||||
|
||||
variance_function *vp9_mse16x16;
|
||||
|
||||
sub_pixel_variance_function *vp9_sub_pixel_variance4x4;
|
||||
sub_pixel_variance_function *vp9_sub_pixel_variance8x8;
|
||||
sub_pixel_variance_function *vp9_sub_pixel_variance8x16;
|
||||
sub_pixel_variance_function *vp9_sub_pixel_variance16x8;
|
||||
sub_pixel_variance_function *vp9_sub_pixel_variance16x16;
|
||||
|
||||
int (*vp9_block_error)(short *coeff, short *dqcoeff);
|
||||
int (*vp9_mbblock_error)(MACROBLOCK *mb, int dc);
|
||||
|
||||
int (*vp9_mbuverror)(MACROBLOCK *mb);
|
||||
unsigned int (*vp9_get_mb_ss)(short *);
|
||||
void (*vp9_short_fdct4x4)(short *input, short *output, int pitch);
|
||||
void (*vp9_short_fdct8x4)(short *input, short *output, int pitch);
|
||||
void (*vp8_fast_fdct4x4)(short *input, short *output, int pitch);
|
||||
void (*vp8_fast_fdct8x4)(short *input, short *output, int pitch);
|
||||
void (*short_walsh4x4)(short *input, short *output, int pitch);
|
||||
|
||||
void (*vp9_subtract_b)(BLOCK *be, BLOCKD *bd, int pitch);
|
||||
void (*vp9_subtract_mby)(short *diff, unsigned char *src, unsigned char *pred, int stride);
|
||||
void (*vp9_subtract_mbuv)(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
|
||||
void (*vp8_fast_quantize_b)(BLOCK *b, BLOCKD *d);
|
||||
|
||||
// c imports
|
||||
extern int block_error_c(short *coeff, short *dqcoeff);
|
||||
extern int vp9_mbblock_error_c(MACROBLOCK *mb, int dc);
|
||||
|
||||
extern int vp9_mbuverror_c(MACROBLOCK *mb);
|
||||
extern unsigned int vp8_get8x8var_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
|
||||
extern void short_fdct4x4_c(short *input, short *output, int pitch);
|
||||
extern void short_fdct8x4_c(short *input, short *output, int pitch);
|
||||
extern void vp9_short_walsh4x4_c(short *input, short *output, int pitch);
|
||||
|
||||
extern void vp9_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch);
|
||||
extern void subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, int stride);
|
||||
extern void subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
|
||||
extern void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d);
|
||||
|
||||
extern SADFunction sad16x16_c;
|
||||
extern SADFunction sad16x8_c;
|
||||
extern SADFunction sad8x16_c;
|
||||
extern SADFunction sad8x8_c;
|
||||
extern SADFunction sad4x4_c;
|
||||
|
||||
extern variance_function variance16x16_c;
|
||||
extern variance_function variance8x16_c;
|
||||
extern variance_function variance16x8_c;
|
||||
extern variance_function variance8x8_c;
|
||||
extern variance_function variance4x4_c;
|
||||
extern variance_function mse16x16_c;
|
||||
|
||||
extern sub_pixel_variance_function sub_pixel_variance4x4_c;
|
||||
extern sub_pixel_variance_function sub_pixel_variance8x8_c;
|
||||
extern sub_pixel_variance_function sub_pixel_variance8x16_c;
|
||||
extern sub_pixel_variance_function sub_pixel_variance16x8_c;
|
||||
extern sub_pixel_variance_function sub_pixel_variance16x16_c;
|
||||
|
||||
extern unsigned int vp9_get_mb_ss_c(short *);
|
||||
|
||||
// ppc
|
||||
extern int vp9_block_error_ppc(short *coeff, short *dqcoeff);
|
||||
|
||||
extern void vp9_short_fdct4x4_ppc(short *input, short *output, int pitch);
|
||||
extern void vp9_short_fdct8x4_ppc(short *input, short *output, int pitch);
|
||||
|
||||
extern void vp9_subtract_mby_ppc(short *diff, unsigned char *src, unsigned char *pred, int stride);
|
||||
extern void vp9_subtract_mbuv_ppc(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
|
||||
|
||||
extern SADFunction vp9_sad16x16_ppc;
|
||||
extern SADFunction vp9_sad16x8_ppc;
|
||||
extern SADFunction vp9_sad8x16_ppc;
|
||||
extern SADFunction vp9_sad8x8_ppc;
|
||||
extern SADFunction vp9_sad4x4_ppc;
|
||||
|
||||
extern variance_function vp9_variance16x16_ppc;
|
||||
extern variance_function vp9_variance8x16_ppc;
|
||||
extern variance_function vp9_variance16x8_ppc;
|
||||
extern variance_function vp9_variance8x8_ppc;
|
||||
extern variance_function vp9_variance4x4_ppc;
|
||||
extern variance_function vp9_mse16x16_ppc;
|
||||
|
||||
extern sub_pixel_variance_function vp9_sub_pixel_variance4x4_ppc;
|
||||
extern sub_pixel_variance_function vp9_sub_pixel_variance8x8_ppc;
|
||||
extern sub_pixel_variance_function vp9_sub_pixel_variance8x16_ppc;
|
||||
extern sub_pixel_variance_function vp9_sub_pixel_variance16x8_ppc;
|
||||
extern sub_pixel_variance_function vp9_sub_pixel_variance16x16_ppc;
|
||||
|
||||
extern unsigned int vp8_get8x8var_ppc(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
|
||||
extern unsigned int vp8_get16x16var_ppc(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
|
||||
|
||||
void vp9_cmachine_specific_config(void) {
|
||||
// Pure C:
|
||||
vp9_mbuverror = vp9_mbuverror_c;
|
||||
vp8_fast_quantize_b = vp8_fast_quantize_b_c;
|
||||
vp9_short_fdct4x4 = vp9_short_fdct4x4_ppc;
|
||||
vp9_short_fdct8x4 = vp9_short_fdct8x4_ppc;
|
||||
vp8_fast_fdct4x4 = vp9_short_fdct4x4_ppc;
|
||||
vp8_fast_fdct8x4 = vp9_short_fdct8x4_ppc;
|
||||
short_walsh4x4 = vp9_short_walsh4x4_c;
|
||||
|
||||
vp9_variance4x4 = vp9_variance4x4_ppc;
|
||||
vp9_variance8x8 = vp9_variance8x8_ppc;
|
||||
vp9_variance8x16 = vp9_variance8x16_ppc;
|
||||
vp9_variance16x8 = vp9_variance16x8_ppc;
|
||||
vp9_variance16x16 = vp9_variance16x16_ppc;
|
||||
vp9_mse16x16 = vp9_mse16x16_ppc;
|
||||
|
||||
vp9_sub_pixel_variance4x4 = vp9_sub_pixel_variance4x4_ppc;
|
||||
vp9_sub_pixel_variance8x8 = vp9_sub_pixel_variance8x8_ppc;
|
||||
vp9_sub_pixel_variance8x16 = vp9_sub_pixel_variance8x16_ppc;
|
||||
vp9_sub_pixel_variance16x8 = vp9_sub_pixel_variance16x8_ppc;
|
||||
vp9_sub_pixel_variance16x16 = vp9_sub_pixel_variance16x16_ppc;
|
||||
|
||||
vp9_get_mb_ss = vp9_get_mb_ss_c;
|
||||
|
||||
vp9_sad16x16 = vp9_sad16x16_ppc;
|
||||
vp9_sad16x8 = vp9_sad16x8_ppc;
|
||||
vp9_sad8x16 = vp9_sad8x16_ppc;
|
||||
vp9_sad8x8 = vp9_sad8x8_ppc;
|
||||
vp9_sad4x4 = vp9_sad4x4_ppc;
|
||||
|
||||
vp9_block_error = vp9_block_error_ppc;
|
||||
vp9_mbblock_error = vp9_mbblock_error_c;
|
||||
|
||||
vp9_subtract_b = vp9_subtract_b_c;
|
||||
vp9_subtract_mby = vp9_subtract_mby_ppc;
|
||||
vp9_subtract_mbuv = vp9_subtract_mbuv_ppc;
|
||||
}
|
|
@@ -1,153 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
.globl vp8_subtract_mbuv_ppc
|
||||
.globl vp8_subtract_mby_ppc
|
||||
|
||||
;# r3 short *diff
|
||||
;# r4 unsigned char *usrc
|
||||
;# r5 unsigned char *vsrc
|
||||
;# r6 unsigned char *pred
|
||||
;# r7 int stride
|
||||
vp8_subtract_mbuv_ppc:
|
||||
mfspr r11, 256 ;# get old VRSAVE
|
||||
oris r12, r11, 0xf000
|
||||
mtspr 256, r12 ;# set VRSAVE
|
||||
|
||||
li r9, 256
|
||||
add r3, r3, r9
|
||||
add r3, r3, r9
|
||||
add r6, r6, r9
|
||||
|
||||
li r10, 16
|
||||
li r9, 4
|
||||
mtctr r9
|
||||
|
||||
vspltisw v0, 0
|
||||
|
||||
mbu_loop:
|
||||
lvsl v5, 0, r4 ;# permutate value for alignment
|
||||
lvx v1, 0, r4 ;# src
|
||||
lvx v2, 0, r6 ;# pred
|
||||
|
||||
add r4, r4, r7
|
||||
addi r6, r6, 16
|
||||
|
||||
vperm v1, v1, v0, v5
|
||||
|
||||
vmrghb v3, v0, v1 ;# unpack high src to short
|
||||
vmrghb v4, v0, v2 ;# unpack high pred to short
|
||||
|
||||
lvsl v5, 0, r4 ;# permutate value for alignment
|
||||
lvx v1, 0, r4 ;# src
|
||||
|
||||
add r4, r4, r7
|
||||
|
||||
vsubshs v3, v3, v4
|
||||
|
||||
stvx v3, 0, r3 ;# store out diff
|
||||
|
||||
vperm v1, v1, v0, v5
|
||||
|
||||
vmrghb v3, v0, v1 ;# unpack high src to short
|
||||
vmrglb v4, v0, v2 ;# unpack high pred to short
|
||||
|
||||
vsubshs v3, v3, v4
|
||||
|
||||
stvx v3, r10, r3 ;# store out diff
|
||||
|
||||
addi r3, r3, 32
|
||||
|
||||
bdnz mbu_loop
|
||||
|
||||
mtctr r9
|
||||
|
||||
mbv_loop:
|
||||
lvsl v5, 0, r5 ;# permutate value for alignment
|
||||
lvx v1, 0, r5 ;# src
|
||||
lvx v2, 0, r6 ;# pred
|
||||
|
||||
add r5, r5, r7
|
||||
addi r6, r6, 16
|
||||
|
||||
vperm v1, v1, v0, v5
|
||||
|
||||
vmrghb v3, v0, v1 ;# unpack high src to short
|
||||
vmrghb v4, v0, v2 ;# unpack high pred to short
|
||||
|
||||
lvsl v5, 0, r5 ;# permutate value for alignment
|
||||
lvx v1, 0, r5 ;# src
|
||||
|
||||
add r5, r5, r7
|
||||
|
||||
vsubshs v3, v3, v4
|
||||
|
||||
stvx v3, 0, r3 ;# store out diff
|
||||
|
||||
vperm v1, v1, v0, v5
|
||||
|
||||
vmrghb v3, v0, v1 ;# unpack high src to short
|
||||
vmrglb v4, v0, v2 ;# unpack high pred to short
|
||||
|
||||
vsubshs v3, v3, v4
|
||||
|
||||
stvx v3, r10, r3 ;# store out diff
|
||||
|
||||
addi r3, r3, 32
|
||||
|
||||
bdnz mbv_loop
|
||||
|
||||
mtspr 256, r11 ;# reset old VRSAVE
|
||||
|
||||
blr
|
||||
|
||||
;# r3 short *diff
|
||||
;# r4 unsigned char *src
|
||||
;# r5 unsigned char *pred
|
||||
;# r6 int stride
|
||||
vp8_subtract_mby_ppc:
|
||||
mfspr r11, 256 ;# get old VRSAVE
|
||||
oris r12, r11, 0xf800
|
||||
mtspr 256, r12 ;# set VRSAVE
|
||||
|
||||
li r10, 16
|
||||
mtctr r10
|
||||
|
||||
vspltisw v0, 0
|
||||
|
||||
mby_loop:
|
||||
lvx v1, 0, r4 ;# src
|
||||
lvx v2, 0, r5 ;# pred
|
||||
|
||||
add r4, r4, r6
|
||||
addi r5, r5, 16
|
||||
|
||||
vmrghb v3, v0, v1 ;# unpack high src to short
|
||||
vmrghb v4, v0, v2 ;# unpack high pred to short
|
||||
|
||||
vsubshs v3, v3, v4
|
||||
|
||||
stvx v3, 0, r3 ;# store out diff
|
||||
|
||||
vmrglb v3, v0, v1 ;# unpack low src to short
|
||||
vmrglb v4, v0, v2 ;# unpack low pred to short
|
||||
|
||||
vsubshs v3, v3, v4
|
||||
|
||||
stvx v3, r10, r3 ;# store out diff
|
||||
|
||||
addi r3, r3, 32
|
||||
|
||||
bdnz mby_loop
|
||||
|
||||
mtspr 256, r11 ;# reset old VRSAVE
|
||||
|
||||
blr
|
|
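Both routines in the file above compute a widened residual, diff = src − pred: the bytes are zero-extended to shorts against a zero vector (vmrghb/vmrglb with v0) and then subtracted with vsubshs. A plain-C model of the luma case is sketched below for reference; it mirrors the register usage (prediction rows packed 16 bytes apart, source advanced by the stride) but is illustrative, not the project's C fallback.

/* Scalar model of vp8_subtract_mby_ppc: residual for a 16x16 luma block.
   The prediction buffer is contiguous (16 bytes per row); the source is
   strided.  Byte operands always fit in a short, so no saturation occurs. */
static void subtract_mby_sketch(short *diff, const unsigned char *src,
                                const unsigned char *pred, int stride) {
  for (int r = 0; r < 16; ++r)
    for (int c = 0; c < 16; ++c)
      diff[r * 16 + c] = (short)(src[r * stride + c] - pred[r * 16 + c]);
}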
@@ -1,205 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
.globl vp8_short_fdct4x4_ppc
|
||||
.globl vp8_short_fdct8x4_ppc
|
||||
|
||||
.macro load_c V, LABEL, OFF, R0, R1
|
||||
lis \R0, \LABEL@ha
|
||||
la \R1, \LABEL@l(\R0)
|
||||
lvx \V, \OFF, \R1
|
||||
.endm
|
||||
|
||||
;# Forward and inverse DCTs are nearly identical; only differences are
|
||||
;# in normalization (fwd is twice unitary, inv is half unitary)
|
||||
;# and that they are of course transposes of each other.
|
||||
;#
|
||||
;# The following three accomplish most of implementation and
|
||||
;# are used only by ppc_idct.c and ppc_fdct.c.
|
||||
.macro prologue
|
||||
mfspr r11, 256 ;# get old VRSAVE
|
||||
oris r12, r11, 0xfffc
|
||||
mtspr 256, r12 ;# set VRSAVE
|
||||
|
||||
stwu r1,-32(r1) ;# create space on the stack
|
||||
|
||||
li r6, 16
|
||||
|
||||
load_c v0, dct_tab, 0, r9, r10
|
||||
lvx v1, r6, r10
|
||||
addi r10, r10, 32
|
||||
lvx v2, 0, r10
|
||||
lvx v3, r6, r10
|
||||
|
||||
load_c v4, ppc_dctperm_tab, 0, r9, r10
|
||||
load_c v5, ppc_dctperm_tab, r6, r9, r10
|
||||
|
||||
load_c v6, round_tab, 0, r10, r9
|
||||
.endm
|
||||
|
||||
.macro epilogue
|
||||
addi r1, r1, 32 ;# recover stack
|
||||
|
||||
mtspr 256, r11 ;# reset old VRSAVE
|
||||
.endm
|
||||
|
||||
;# Do horiz xf on two rows of coeffs v8 = a0 a1 a2 a3 b0 b1 b2 b3.
|
||||
;# a/A are the even rows 0,2 b/B are the odd rows 1,3
|
||||
;# For fwd transform, indices are horizontal positions, then frequencies.
|
||||
;# For inverse transform, frequencies then positions.
|
||||
;# The two resulting A0..A3 B0..B3 are later combined
|
||||
;# and vertically transformed.
|
||||
|
||||
.macro two_rows_horiz Dst
|
||||
vperm v9, v8, v8, v4 ;# v9 = a2 a3 a0 a1 b2 b3 b0 b1
|
||||
|
||||
vmsumshm v10, v0, v8, v6
|
||||
vmsumshm v10, v1, v9, v10
|
||||
vsraw v10, v10, v7 ;# v10 = A0 A1 B0 B1
|
||||
|
||||
vmsumshm v11, v2, v8, v6
|
||||
vmsumshm v11, v3, v9, v11
|
||||
vsraw v11, v11, v7 ;# v11 = A2 A3 B2 B3
|
||||
|
||||
vpkuwum v10, v10, v11 ;# v10 = A0 A1 B0 B1 A2 A3 B2 B3
|
||||
vperm \Dst, v10, v10, v5 ;# Dest = A0 B0 A1 B1 A2 B2 A3 B3
|
||||
.endm
|
||||
|
||||
;# Vertical xf on two rows. DCT values in comments are for inverse transform;
|
||||
;# forward transform uses transpose.
|
||||
|
||||
.macro two_rows_vert Ceven, Codd
|
||||
vspltw v8, \Ceven, 0 ;# v8 = c00 c10 or c02 c12 four times
|
||||
vspltw v9, \Codd, 0 ;# v9 = c20 c30 or c22 c32 ""
|
||||
vmsumshm v8, v8, v12, v6
|
||||
vmsumshm v8, v9, v13, v8
|
||||
vsraw v10, v8, v7
|
||||
|
||||
vspltw v8, \Codd, 1 ;# v8 = c01 c11 or c03 c13
|
||||
vspltw v9, \Ceven, 1 ;# v9 = c21 c31 or c23 c33
|
||||
vmsumshm v8, v8, v12, v6
|
||||
vmsumshm v8, v9, v13, v8
|
||||
vsraw v8, v8, v7
|
||||
|
||||
vpkuwum v8, v10, v8 ;# v8 = rows 0,1 or 2,3
|
||||
.endm
|
||||
|
||||
.macro two_rows_h Dest
|
||||
stw r0, 0(r8)
|
||||
lwz r0, 4(r3)
|
||||
stw r0, 4(r8)
|
||||
lwzux r0, r3,r5
|
||||
stw r0, 8(r8)
|
||||
lwz r0, 4(r3)
|
||||
stw r0, 12(r8)
|
||||
lvx v8, 0,r8
|
||||
two_rows_horiz \Dest
|
||||
.endm
|
||||
|
||||
.align 2
|
||||
;# r3 short *input
|
||||
;# r4 short *output
|
||||
;# r5 int pitch
|
||||
vp8_short_fdct4x4_ppc:
|
||||
|
||||
prologue
|
||||
|
||||
vspltisw v7, 14 ;# == 14, fits in 5 signed bits
|
||||
addi r8, r1, 0
|
||||
|
||||
|
||||
lwz r0, 0(r3)
|
||||
two_rows_h v12 ;# v12 = H00 H10 H01 H11 H02 H12 H03 H13
|
||||
|
||||
lwzux r0, r3, r5
|
||||
two_rows_h v13 ;# v13 = H20 H30 H21 H31 H22 H32 H23 H33
|
||||
|
||||
lvx v6, r6, r9 ;# v6 = Vround
|
||||
vspltisw v7, -16 ;# == 16 == -16, only low 5 bits matter
|
||||
|
||||
two_rows_vert v0, v1
|
||||
stvx v8, 0, r4
|
||||
two_rows_vert v2, v3
|
||||
stvx v8, r6, r4
|
||||
|
||||
epilogue
|
||||
|
||||
blr
|
||||
|
||||
.align 2
|
||||
;# r3 short *input
|
||||
;# r4 short *output
|
||||
;# r5 int pitch
|
||||
vp8_short_fdct8x4_ppc:
|
||||
prologue
|
||||
|
||||
vspltisw v7, 14 ;# == 14, fits in 5 signed bits
|
||||
addi r8, r1, 0
|
||||
addi r10, r3, 0
|
||||
|
||||
lwz r0, 0(r3)
|
||||
two_rows_h v12 ;# v12 = H00 H10 H01 H11 H02 H12 H03 H13
|
||||
|
||||
lwzux r0, r3, r5
|
||||
two_rows_h v13 ;# v13 = H20 H30 H21 H31 H22 H32 H23 H33
|
||||
|
||||
lvx v6, r6, r9 ;# v6 = Vround
|
||||
vspltisw v7, -16 ;# == 16 == -16, only low 5 bits matter
|
||||
|
||||
two_rows_vert v0, v1
|
||||
stvx v8, 0, r4
|
||||
two_rows_vert v2, v3
|
||||
stvx v8, r6, r4
|
||||
|
||||
;# Next block
|
||||
addi r3, r10, 8
|
||||
addi r4, r4, 32
|
||||
lvx v6, 0, r9 ;# v6 = Hround
|
||||
|
||||
vspltisw v7, 14 ;# == 14, fits in 5 signed bits
|
||||
addi r8, r1, 0
|
||||
|
||||
lwz r0, 0(r3)
|
||||
two_rows_h v12 ;# v12 = H00 H10 H01 H11 H02 H12 H03 H13
|
||||
|
||||
lwzux r0, r3, r5
|
||||
two_rows_h v13 ;# v13 = H20 H30 H21 H31 H22 H32 H23 H33
|
||||
|
||||
lvx v6, r6, r9 ;# v6 = Vround
|
||||
vspltisw v7, -16 ;# == 16 == -16, only low 5 bits matter
|
||||
|
||||
two_rows_vert v0, v1
|
||||
stvx v8, 0, r4
|
||||
two_rows_vert v2, v3
|
||||
stvx v8, r6, r4
|
||||
|
||||
epilogue
|
||||
|
||||
blr
|
||||
|
||||
.data
|
||||
.align 4
|
||||
ppc_dctperm_tab:
|
||||
.byte 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11
|
||||
.byte 0,1,4,5, 2,3,6,7, 8,9,12,13, 10,11,14,15
|
||||
|
||||
.align 4
|
||||
dct_tab:
|
||||
.short 23170, 23170,-12540,-30274, 23170, 23170,-12540,-30274
|
||||
.short 23170, 23170, 30274, 12540, 23170, 23170, 30274, 12540
|
||||
|
||||
.short 23170,-23170, 30274,-12540, 23170,-23170, 30274,-12540
|
||||
.short -23170, 23170, 12540,-30274,-23170, 23170, 12540,-30274
|
||||
|
||||
.align 4
|
||||
round_tab:
|
||||
.long (1 << (14-1)), (1 << (14-1)), (1 << (14-1)), (1 << (14-1))
|
||||
.long (1 << (16-1)), (1 << (16-1)), (1 << (16-1)), (1 << (16-1))
|
|
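For reference, the round_tab entries above are the usual round-to-nearest bias for the two arithmetic right shifts used by the transform (by 14 in the horizontal pass, by 16 in the vertical pass); the bias is folded into the multiply-accumulate and the shift is applied afterwards. A minimal sketch with an illustrative helper name:

#include <stdint.h>

/* Round-to-nearest before an arithmetic right shift: add 1 << (bits - 1),
 * then shift by 'bits' (14 or 16 in the passes above). */
static int32_t round_shift(int32_t v, int bits) {
    return (v + (1 << (bits - 1))) >> bits;
}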
@ -1,51 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;

.globl vp8_block_error_ppc

.align 2
;# r3 short *Coeff
;# r4 short *dqcoeff
vp8_block_error_ppc:
mfspr r11, 256 ;# get old VRSAVE
oris r12, r11, 0xf800
mtspr 256, r12 ;# set VRSAVE

stwu r1,-32(r1) ;# create space on the stack

stw r5, 12(r1) ;# transfer dc to vector register

lvx v0, 0, r3 ;# Coeff
lvx v1, 0, r4 ;# dqcoeff

li r10, 16

vspltisw v3, 0

vsubshs v0, v0, v1

vmsumshm v2, v0, v0, v3 ;# multiply differences

lvx v0, r10, r3 ;# Coeff
lvx v1, r10, r4 ;# dqcoeff

vsubshs v0, v0, v1

vmsumshm v1, v0, v0, v2 ;# multiply differences
vsumsws v1, v1, v3 ;# sum up

stvx v1, 0, r1
lwz r3, 12(r1) ;# return value

addi r1, r1, 32 ;# recover stack
mtspr 256, r11 ;# reset old VRSAVE

blr
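For reference, a scalar sketch of the removed block-error routine: it returns the sum of squared differences between the 16 coefficients of one 4x4 block and their dequantized values (the vector code saturates the sum to 32 bits; the function name below is illustrative).

#include <stdint.h>

/* Sum of squared coefficient errors for one 4x4 block (16 shorts),
 * matching the two lvx/vsubshs/vmsumshm pairs above. */
static int block_error_c(const int16_t *coeff, const int16_t *dqcoeff) {
    int err = 0;
    for (int i = 0; i < 16; ++i) {
        const int d = coeff[i] - dqcoeff[i];
        err += d * d;
    }
    return err;
}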
@ -1,277 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
.globl vp8_sad16x16_ppc
|
||||
.globl vp8_sad16x8_ppc
|
||||
.globl vp8_sad8x16_ppc
|
||||
.globl vp8_sad8x8_ppc
|
||||
.globl vp8_sad4x4_ppc
|
||||
|
||||
.macro load_aligned_16 V R O
|
||||
lvsl v3, 0, \R ;# permutate value for alignment
|
||||
|
||||
lvx v1, 0, \R
|
||||
lvx v2, \O, \R
|
||||
|
||||
vperm \V, v1, v2, v3
|
||||
.endm
|
||||
|
||||
.macro prologue
|
||||
mfspr r11, 256 ;# get old VRSAVE
|
||||
oris r12, r11, 0xffc0
|
||||
mtspr 256, r12 ;# set VRSAVE
|
||||
|
||||
stwu r1, -32(r1) ;# create space on the stack
|
||||
|
||||
li r10, 16 ;# load offset and loop counter
|
||||
|
||||
vspltisw v8, 0 ;# zero out total to start
|
||||
.endm
|
||||
|
||||
.macro epilogue
|
||||
addi r1, r1, 32 ;# recover stack
|
||||
|
||||
mtspr 256, r11 ;# reset old VRSAVE
|
||||
.endm
|
||||
|
||||
.macro SAD_16
|
||||
;# v6 = abs (v4 - v5)
|
||||
vsububs v6, v4, v5
|
||||
vsububs v7, v5, v4
|
||||
vor v6, v6, v7
|
||||
|
||||
;# v8 += abs (v4 - v5)
|
||||
vsum4ubs v8, v6, v8
|
||||
.endm
|
||||
|
||||
.macro sad_16_loop loop_label
|
||||
lvsl v3, 0, r5 ;# only needs to be done once per block
|
||||
|
||||
;# preload a line of data before getting into the loop
|
||||
lvx v4, 0, r3
|
||||
lvx v1, 0, r5
|
||||
lvx v2, r10, r5
|
||||
|
||||
add r5, r5, r6
|
||||
add r3, r3, r4
|
||||
|
||||
vperm v5, v1, v2, v3
|
||||
|
||||
.align 4
|
||||
\loop_label:
|
||||
;# compute difference on first row
|
||||
vsububs v6, v4, v5
|
||||
vsububs v7, v5, v4
|
||||
|
||||
;# load up next set of data
|
||||
lvx v9, 0, r3
|
||||
lvx v1, 0, r5
|
||||
lvx v2, r10, r5
|
||||
|
||||
;# perform abs() of difference
|
||||
vor v6, v6, v7
|
||||
add r3, r3, r4
|
||||
|
||||
;# add to the running tally
|
||||
vsum4ubs v8, v6, v8
|
||||
|
||||
;# now onto the next line
|
||||
vperm v5, v1, v2, v3
|
||||
add r5, r5, r6
|
||||
lvx v4, 0, r3
|
||||
|
||||
;# compute difference on second row
|
||||
vsububs v6, v9, v5
|
||||
lvx v1, 0, r5
|
||||
vsububs v7, v5, v9
|
||||
lvx v2, r10, r5
|
||||
vor v6, v6, v7
|
||||
add r3, r3, r4
|
||||
vsum4ubs v8, v6, v8
|
||||
vperm v5, v1, v2, v3
|
||||
add r5, r5, r6
|
||||
|
||||
bdnz \loop_label
|
||||
|
||||
vspltisw v7, 0
|
||||
|
||||
vsumsws v8, v8, v7
|
||||
|
||||
stvx v8, 0, r1
|
||||
lwz r3, 12(r1)
|
||||
.endm
|
||||
|
||||
.macro sad_8_loop loop_label
|
||||
.align 4
|
||||
\loop_label:
|
||||
;# only one of the inputs should need to be aligned.
|
||||
load_aligned_16 v4, r3, r10
|
||||
load_aligned_16 v5, r5, r10
|
||||
|
||||
;# move onto the next line
|
||||
add r3, r3, r4
|
||||
add r5, r5, r6
|
||||
|
||||
;# only one of the inputs should need to be aligned.
|
||||
load_aligned_16 v6, r3, r10
|
||||
load_aligned_16 v7, r5, r10
|
||||
|
||||
;# move onto the next line
|
||||
add r3, r3, r4
|
||||
add r5, r5, r6
|
||||
|
||||
vmrghb v4, v4, v6
|
||||
vmrghb v5, v5, v7
|
||||
|
||||
SAD_16
|
||||
|
||||
bdnz \loop_label
|
||||
|
||||
vspltisw v7, 0
|
||||
|
||||
vsumsws v8, v8, v7
|
||||
|
||||
stvx v8, 0, r1
|
||||
lwz r3, 12(r1)
|
||||
.endm
|
||||
|
||||
.align 2
|
||||
;# r3 unsigned char *src_ptr
|
||||
;# r4 int src_stride
|
||||
;# r5 unsigned char *ref_ptr
|
||||
;# r6 int ref_stride
|
||||
;#
|
||||
;# r3 return value
|
||||
vp8_sad16x16_ppc:
|
||||
|
||||
prologue
|
||||
|
||||
li r9, 8
|
||||
mtctr r9
|
||||
|
||||
sad_16_loop sad16x16_loop
|
||||
|
||||
epilogue
|
||||
|
||||
blr
|
||||
|
||||
.align 2
|
||||
;# r3 unsigned char *src_ptr
|
||||
;# r4 int src_stride
|
||||
;# r5 unsigned char *ref_ptr
|
||||
;# r6 int ref_stride
|
||||
;#
|
||||
;# r3 return value
|
||||
vp8_sad16x8_ppc:
|
||||
|
||||
prologue
|
||||
|
||||
li r9, 4
|
||||
mtctr r9
|
||||
|
||||
sad_16_loop sad16x8_loop
|
||||
|
||||
epilogue
|
||||
|
||||
blr
|
||||
|
||||
.align 2
|
||||
;# r3 unsigned char *src_ptr
|
||||
;# r4 int src_stride
|
||||
;# r5 unsigned char *ref_ptr
|
||||
;# r6 int ref_stride
|
||||
;#
|
||||
;# r3 return value
|
||||
vp8_sad8x16_ppc:
|
||||
|
||||
prologue
|
||||
|
||||
li r9, 8
|
||||
mtctr r9
|
||||
|
||||
sad_8_loop sad8x16_loop
|
||||
|
||||
epilogue
|
||||
|
||||
blr
|
||||
|
||||
.align 2
|
||||
;# r3 unsigned char *src_ptr
|
||||
;# r4 int src_stride
|
||||
;# r5 unsigned char *ref_ptr
|
||||
;# r6 int ref_stride
|
||||
;#
|
||||
;# r3 return value
|
||||
vp8_sad8x8_ppc:
|
||||
|
||||
prologue
|
||||
|
||||
li r9, 4
|
||||
mtctr r9
|
||||
|
||||
sad_8_loop sad8x8_loop
|
||||
|
||||
epilogue
|
||||
|
||||
blr
|
||||
|
||||
.macro transfer_4x4 I P
|
||||
lwz r0, 0(\I)
|
||||
add \I, \I, \P
|
||||
|
||||
lwz r7, 0(\I)
|
||||
add \I, \I, \P
|
||||
|
||||
lwz r8, 0(\I)
|
||||
add \I, \I, \P
|
||||
|
||||
lwz r9, 0(\I)
|
||||
|
||||
stw r0, 0(r1)
|
||||
stw r7, 4(r1)
|
||||
stw r8, 8(r1)
|
||||
stw r9, 12(r1)
|
||||
.endm
|
||||
|
||||
.align 2
|
||||
;# r3 unsigned char *src_ptr
|
||||
;# r4 int src_stride
|
||||
;# r5 unsigned char *ref_ptr
|
||||
;# r6 int ref_stride
|
||||
;#
|
||||
;# r3 return value
|
||||
vp8_sad4x4_ppc:
|
||||
|
||||
prologue
|
||||
|
||||
transfer_4x4 r3, r4
|
||||
lvx v4, 0, r1
|
||||
|
||||
transfer_4x4 r5, r6
|
||||
lvx v5, 0, r1
|
||||
|
||||
vspltisw v8, 0 ;# zero out total to start
|
||||
|
||||
;# v6 = abs (v4 - v5)
|
||||
vsububs v6, v4, v5
|
||||
vsububs v7, v5, v4
|
||||
vor v6, v6, v7
|
||||
|
||||
;# v8 += abs (v4 - v5)
|
||||
vsum4ubs v7, v6, v8
|
||||
vsumsws v7, v7, v8
|
||||
|
||||
stvx v7, 0, r1
|
||||
lwz r3, 12(r1)
|
||||
|
||||
epilogue
|
||||
|
||||
blr
|
|
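For reference, a scalar sketch of what the removed SAD kernels compute. The AltiVec code forms the absolute difference without an abs instruction by OR-ing two saturating subtracts (vsububs a,b with vsububs b,a) and accumulates with vsum4ubs; that is equivalent to the plain loop below (illustrative name and signature).

#include <stdint.h>
#include <stdlib.h>

/* Sum of absolute differences over a 16x16 block. */
static unsigned int sad16x16_c(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride) {
    unsigned int sad = 0;
    for (int r = 0; r < 16; ++r) {
        for (int c = 0; c < 16; ++c)
            sad += (unsigned int)abs(src[c] - ref[c]);
        src += src_stride;
        ref += ref_stride;
    }
    return sad;
}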
@ -1,375 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
.globl vp8_get8x8var_ppc
|
||||
.globl vp8_get16x16var_ppc
|
||||
.globl vp8_mse16x16_ppc
|
||||
.globl vp9_variance16x16_ppc
|
||||
.globl vp9_variance16x8_ppc
|
||||
.globl vp9_variance8x16_ppc
|
||||
.globl vp9_variance8x8_ppc
|
||||
.globl vp9_variance4x4_ppc
|
||||
|
||||
.macro load_aligned_16 V R O
|
||||
lvsl v3, 0, \R ;# permutate value for alignment
|
||||
|
||||
lvx v1, 0, \R
|
||||
lvx v2, \O, \R
|
||||
|
||||
vperm \V, v1, v2, v3
|
||||
.endm
|
||||
|
||||
.macro prologue
|
||||
mfspr r11, 256 ;# get old VRSAVE
|
||||
oris r12, r11, 0xffc0
|
||||
mtspr 256, r12 ;# set VRSAVE
|
||||
|
||||
stwu r1, -32(r1) ;# create space on the stack
|
||||
|
||||
li r10, 16 ;# load offset and loop counter
|
||||
|
||||
vspltisw v7, 0 ;# zero for merging
|
||||
vspltisw v8, 0 ;# zero out total to start
|
||||
vspltisw v9, 0 ;# zero out total for dif^2
|
||||
.endm
|
||||
|
||||
.macro epilogue
|
||||
addi r1, r1, 32 ;# recover stack
|
||||
|
||||
mtspr 256, r11 ;# reset old VRSAVE
|
||||
.endm
|
||||
|
||||
.macro compute_sum_sse
|
||||
;# Compute sum first. Unpack so that a signed subtract
;# can be used; only a halfword signed subtract is
;# available. Do high, then low.
|
||||
vmrghb v2, v7, v4
|
||||
vmrghb v3, v7, v5
|
||||
vsubshs v2, v2, v3
|
||||
vsum4shs v8, v2, v8
|
||||
|
||||
vmrglb v2, v7, v4
|
||||
vmrglb v3, v7, v5
|
||||
vsubshs v2, v2, v3
|
||||
vsum4shs v8, v2, v8
|
||||
|
||||
;# Now compute sse.
|
||||
vsububs v2, v4, v5
|
||||
vsububs v3, v5, v4
|
||||
vor v2, v2, v3
|
||||
|
||||
vmsumubm v9, v2, v2, v9
|
||||
.endm
|
||||
|
||||
.macro variance_16 DS loop_label store_sum
|
||||
\loop_label:
|
||||
;# only one of the inputs should need to be aligned.
|
||||
load_aligned_16 v4, r3, r10
|
||||
load_aligned_16 v5, r5, r10
|
||||
|
||||
;# move onto the next line
|
||||
add r3, r3, r4
|
||||
add r5, r5, r6
|
||||
|
||||
compute_sum_sse
|
||||
|
||||
bdnz \loop_label
|
||||
|
||||
vsumsws v8, v8, v7
|
||||
vsumsws v9, v9, v7
|
||||
|
||||
stvx v8, 0, r1
|
||||
lwz r3, 12(r1)
|
||||
|
||||
stvx v9, 0, r1
|
||||
lwz r4, 12(r1)
|
||||
|
||||
.if \store_sum
|
||||
stw r3, 0(r8) ;# sum
|
||||
.endif
|
||||
stw r4, 0(r7) ;# sse
|
||||
|
||||
mullw r3, r3, r3 ;# sum*sum
|
||||
srawi r3, r3, \DS ;# (sum*sum) >> DS
|
||||
subf r3, r3, r4 ;# sse - ((sum*sum) >> DS)
|
||||
.endm
|
||||
|
||||
.macro variance_8 DS loop_label store_sum
|
||||
\loop_label:
|
||||
;# only one of the inputs should need to be aligned.
|
||||
load_aligned_16 v4, r3, r10
|
||||
load_aligned_16 v5, r5, r10
|
||||
|
||||
;# move onto the next line
|
||||
add r3, r3, r4
|
||||
add r5, r5, r6
|
||||
|
||||
;# only one of the inputs should need to be aligned.
|
||||
load_aligned_16 v6, r3, r10
|
||||
load_aligned_16 v0, r5, r10
|
||||
|
||||
;# move onto the next line
|
||||
add r3, r3, r4
|
||||
add r5, r5, r6
|
||||
|
||||
vmrghb v4, v4, v6
|
||||
vmrghb v5, v5, v0
|
||||
|
||||
compute_sum_sse
|
||||
|
||||
bdnz \loop_label
|
||||
|
||||
vsumsws v8, v8, v7
|
||||
vsumsws v9, v9, v7
|
||||
|
||||
stvx v8, 0, r1
|
||||
lwz r3, 12(r1)
|
||||
|
||||
stvx v9, 0, r1
|
||||
lwz r4, 12(r1)
|
||||
|
||||
.if \store_sum
|
||||
stw r3, 0(r8) ;# sum
|
||||
.endif
|
||||
stw r4, 0(r7) ;# sse
|
||||
|
||||
mullw r3, r3, r3 ;# sum*sum
|
||||
srawi r3, r3, \DS ;# (sum*sum) >> DS
subf r3, r3, r4 ;# sse - ((sum*sum) >> DS)
|
||||
.endm
|
||||
|
||||
.align 2
|
||||
;# r3 unsigned char *src_ptr
|
||||
;# r4 int source_stride
|
||||
;# r5 unsigned char *ref_ptr
|
||||
;# r6 int recon_stride
|
||||
;# r7 unsigned int *SSE
|
||||
;# r8 int *Sum
|
||||
;#
|
||||
;# r3 return value
|
||||
vp8_get8x8var_ppc:
|
||||
|
||||
prologue
|
||||
|
||||
li r9, 4
|
||||
mtctr r9
|
||||
|
||||
variance_8 6, get8x8var_loop, 1
|
||||
|
||||
epilogue
|
||||
|
||||
blr
|
||||
|
||||
.align 2
|
||||
;# r3 unsigned char *src_ptr
|
||||
;# r4 int source_stride
|
||||
;# r5 unsigned char *ref_ptr
|
||||
;# r6 int recon_stride
|
||||
;# r7 unsigned int *SSE
|
||||
;# r8 int *Sum
|
||||
;#
|
||||
;# r3 return value
|
||||
vp8_get16x16var_ppc:
|
||||
|
||||
prologue
|
||||
|
||||
mtctr r10
|
||||
|
||||
variance_16 8, get16x16var_loop, 1
|
||||
|
||||
epilogue
|
||||
|
||||
blr
|
||||
|
||||
.align 2
|
||||
;# r3 unsigned char *src_ptr
|
||||
;# r4 int source_stride
|
||||
;# r5 unsigned char *ref_ptr
|
||||
;# r6 int recon_stride
|
||||
;# r7 unsigned int *sse
|
||||
;#
|
||||
;# r3 return value
|
||||
vp8_mse16x16_ppc:
|
||||
prologue
|
||||
|
||||
mtctr r10
|
||||
|
||||
mse16x16_loop:
|
||||
;# only one of the inputs should need to be aligned.
|
||||
load_aligned_16 v4, r3, r10
|
||||
load_aligned_16 v5, r5, r10
|
||||
|
||||
;# move onto the next line
|
||||
add r3, r3, r4
|
||||
add r5, r5, r6
|
||||
|
||||
;# Now compute sse.
|
||||
vsububs v2, v4, v5
|
||||
vsububs v3, v5, v4
|
||||
vor v2, v2, v3
|
||||
|
||||
vmsumubm v9, v2, v2, v9
|
||||
|
||||
bdnz mse16x16_loop
|
||||
|
||||
vsumsws v9, v9, v7
|
||||
|
||||
stvx v9, 0, r1
|
||||
lwz r3, 12(r1)
|
||||
|
||||
stvx v9, 0, r1
|
||||
lwz r3, 12(r1)
|
||||
|
||||
stw r3, 0(r7) ;# sse
|
||||
|
||||
epilogue
|
||||
|
||||
blr
|
||||
|
||||
.align 2
|
||||
;# r3 unsigned char *src_ptr
|
||||
;# r4 int source_stride
|
||||
;# r5 unsigned char *ref_ptr
|
||||
;# r6 int recon_stride
|
||||
;# r7 unsigned int *sse
|
||||
;#
|
||||
;# r3 return value
|
||||
vp9_variance16x16_ppc:
|
||||
|
||||
prologue
|
||||
|
||||
mtctr r10
|
||||
|
||||
variance_16 8, variance16x16_loop, 0
|
||||
|
||||
epilogue
|
||||
|
||||
blr
|
||||
|
||||
.align 2
|
||||
;# r3 unsigned char *src_ptr
|
||||
;# r4 int source_stride
|
||||
;# r5 unsigned char *ref_ptr
|
||||
;# r6 int recon_stride
|
||||
;# r7 unsigned int *sse
|
||||
;#
|
||||
;# r3 return value
|
||||
vp9_variance16x8_ppc:
|
||||
|
||||
prologue
|
||||
|
||||
li r9, 8
|
||||
mtctr r9
|
||||
|
||||
variance_16 7, variance16x8_loop, 0
|
||||
|
||||
epilogue
|
||||
|
||||
blr
|
||||
|
||||
.align 2
|
||||
;# r3 unsigned char *src_ptr
|
||||
;# r4 int source_stride
|
||||
;# r5 unsigned char *ref_ptr
|
||||
;# r6 int recon_stride
|
||||
;# r7 unsigned int *sse
|
||||
;#
|
||||
;# r3 return value
|
||||
vp9_variance8x16_ppc:
|
||||
|
||||
prologue
|
||||
|
||||
li r9, 8
|
||||
mtctr r9
|
||||
|
||||
variance_8 7, variance8x16_loop, 0
|
||||
|
||||
epilogue
|
||||
|
||||
blr
|
||||
|
||||
.align 2
|
||||
;# r3 unsigned char *src_ptr
|
||||
;# r4 int source_stride
|
||||
;# r5 unsigned char *ref_ptr
|
||||
;# r6 int recon_stride
|
||||
;# r7 unsigned int *sse
|
||||
;#
|
||||
;# r3 return value
|
||||
vp9_variance8x8_ppc:
|
||||
|
||||
prologue
|
||||
|
||||
li r9, 4
|
||||
mtctr r9
|
||||
|
||||
variance_8 6, variance8x8_loop, 0
|
||||
|
||||
epilogue
|
||||
|
||||
blr
|
||||
|
||||
.macro transfer_4x4 I P
|
||||
lwz r0, 0(\I)
|
||||
add \I, \I, \P
|
||||
|
||||
lwz r10,0(\I)
|
||||
add \I, \I, \P
|
||||
|
||||
lwz r8, 0(\I)
|
||||
add \I, \I, \P
|
||||
|
||||
lwz r9, 0(\I)
|
||||
|
||||
stw r0, 0(r1)
|
||||
stw r10, 4(r1)
|
||||
stw r8, 8(r1)
|
||||
stw r9, 12(r1)
|
||||
.endm
|
||||
|
||||
.align 2
|
||||
;# r3 unsigned char *src_ptr
|
||||
;# r4 int source_stride
|
||||
;# r5 unsigned char *ref_ptr
|
||||
;# r6 int recon_stride
|
||||
;# r7 unsigned int *sse
|
||||
;#
|
||||
;# r3 return value
|
||||
vp9_variance4x4_ppc:
|
||||
|
||||
prologue
|
||||
|
||||
transfer_4x4 r3, r4
|
||||
lvx v4, 0, r1
|
||||
|
||||
transfer_4x4 r5, r6
|
||||
lvx v5, 0, r1
|
||||
|
||||
compute_sum_sse
|
||||
|
||||
vsumsws v8, v8, v7
|
||||
vsumsws v9, v9, v7
|
||||
|
||||
stvx v8, 0, r1
|
||||
lwz r3, 12(r1)
|
||||
|
||||
stvx v9, 0, r1
|
||||
lwz r4, 12(r1)
|
||||
|
||||
stw r4, 0(r7) ;# sse
|
||||
|
||||
mullw r3, r3, r3 ;# sum*sum
|
||||
srawi r3, r3, 4 ;# (sum*sum) >> 4
|
||||
subf r3, r3, r4 ;# sse - ((sum*sum) >> 4)
|
||||
|
||||
epilogue
|
||||
|
||||
blr
|
|
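For reference, a scalar sketch of the variance computed by the removed kernels: both the signed sum and the sum of squared errors are accumulated, and the return value is sse - ((sum * sum) >> log2(block size)), matching the mullw/srawi/subf tails above (names and signature are illustrative; the assembly uses a truncating 32-bit multiply where this sketch widens to 64 bits).

#include <stdint.h>

/* Block variance: accumulate sum and sse, write sse out, and return
 * sse - ((sum * sum) >> log2_size), where log2_size = log2(w * h). */
static unsigned int variance_c(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride,
                               int w, int h, int log2_size,
                               unsigned int *sse) {
    int sum = 0;
    unsigned int sse_acc = 0;
    for (int r = 0; r < h; ++r) {
        for (int c = 0; c < w; ++c) {
            const int d = src[c] - ref[c];
            sum += d;
            sse_acc += (unsigned int)(d * d);
        }
        src += src_stride;
        ref += ref_stride;
    }
    *sse = sse_acc;
    return sse_acc - (unsigned int)(((int64_t)sum * sum) >> log2_size);
}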
@ -1,865 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
.globl vp9_sub_pixel_variance4x4_ppc
|
||||
.globl vp9_sub_pixel_variance8x8_ppc
|
||||
.globl vp9_sub_pixel_variance8x16_ppc
|
||||
.globl vp9_sub_pixel_variance16x8_ppc
|
||||
.globl vp9_sub_pixel_variance16x16_ppc
|
||||
|
||||
.macro load_c V, LABEL, OFF, R0, R1
|
||||
lis \R0, \LABEL@ha
|
||||
la \R1, \LABEL@l(\R0)
|
||||
lvx \V, \OFF, \R1
|
||||
.endm
|
||||
|
||||
.macro load_vfilter V0, V1
|
||||
load_c \V0, vfilter_b, r6, r12, r10
|
||||
|
||||
addi r6, r6, 16
|
||||
lvx \V1, r6, r10
|
||||
.endm
|
||||
|
||||
.macro HProlog jump_label
|
||||
;# load up horizontal filter
|
||||
slwi. r5, r5, 4 ;# index into horizontal filter array
|
||||
|
||||
;# index to the next set of vectors in the row.
|
||||
li r10, 16
|
||||
|
||||
;# downshift by 7 ( divide by 128 ) at the end
|
||||
vspltish v19, 7
|
||||
|
||||
;# If there isn't any filtering to be done for the horizontal, then
|
||||
;# just skip to the second pass.
|
||||
beq \jump_label
|
||||
|
||||
load_c v20, hfilter_b, r5, r12, r0
|
||||
|
||||
;# setup constants
|
||||
;# v14 permutation value for alignment
|
||||
load_c v28, b_hperm_b, 0, r12, r0
|
||||
|
||||
;# index to the next set of vectors in the row.
|
||||
li r12, 32
|
||||
|
||||
;# rounding added in on the multiply
|
||||
vspltisw v21, 8
|
||||
vspltisw v18, 3
|
||||
vslw v18, v21, v18 ;# 0x00000040000000400000004000000040
|
||||
|
||||
slwi. r6, r6, 5 ;# index into vertical filter array
|
||||
.endm
|
||||
|
||||
;# Filters a horizontal line
|
||||
;# expects:
|
||||
;# r3 src_ptr
|
||||
;# r4 pitch
|
||||
;# r10 16
|
||||
;# r12 32
|
||||
;# v17 perm input
|
||||
;# v18 rounding
|
||||
;# v19 shift
|
||||
;# v20 filter taps
|
||||
;# v21 tmp
|
||||
;# v22 tmp
|
||||
;# v23 tmp
|
||||
;# v24 tmp
|
||||
;# v25 tmp
|
||||
;# v26 tmp
|
||||
;# v27 tmp
|
||||
;# v28 perm output
|
||||
;#
|
||||
|
||||
.macro hfilter_8 V, hp, lp, increment_counter
|
||||
lvsl v17, 0, r3 ;# permutate value for alignment
|
||||
|
||||
;# input to filter is 9 bytes wide, output is 8 bytes.
|
||||
lvx v21, 0, r3
|
||||
lvx v22, r10, r3
|
||||
|
||||
.if \increment_counter
|
||||
add r3, r3, r4
|
||||
.endif
|
||||
vperm v21, v21, v22, v17
|
||||
|
||||
vperm v24, v21, v21, \hp ;# v20 = 0123 1234 2345 3456
|
||||
vperm v25, v21, v21, \lp ;# v21 = 4567 5678 6789 789A
|
||||
|
||||
vmsummbm v24, v20, v24, v18
|
||||
vmsummbm v25, v20, v25, v18
|
||||
|
||||
vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
|
||||
|
||||
vsrh v24, v24, v19 ;# divide v0, v1 by 128
|
||||
|
||||
vpkuhus \V, v24, v24 ;# \V = scrambled 8-bit result
|
||||
.endm
|
||||
|
||||
.macro vfilter_16 P0 P1
|
||||
vmuleub v22, \P0, v20 ;# 64 + 4 positive taps
|
||||
vadduhm v22, v18, v22
|
||||
vmuloub v23, \P0, v20
|
||||
vadduhm v23, v18, v23
|
||||
|
||||
vmuleub v24, \P1, v21
|
||||
vadduhm v22, v22, v24 ;# Re = evens, saturation unnecessary
|
||||
vmuloub v25, \P1, v21
|
||||
vadduhm v23, v23, v25 ;# Ro = odds
|
||||
|
||||
vsrh v22, v22, v19 ;# divide by 128
|
||||
vsrh v23, v23, v19 ;# v16 v17 = evens, odds
|
||||
vmrghh \P0, v22, v23 ;# v18 v19 = 16-bit result in order
|
||||
vmrglh v23, v22, v23
|
||||
vpkuhus \P0, \P0, v23 ;# P0 = 8-bit result
|
||||
.endm
|
||||
|
||||
.macro compute_sum_sse src, ref, sum, sse, t1, t2, z0
|
||||
;# Compute sum first. Unpack so that a signed subtract
;# can be used; only a halfword signed subtract is
;# available. Do high, then low.
|
||||
vmrghb \t1, \z0, \src
|
||||
vmrghb \t2, \z0, \ref
|
||||
vsubshs \t1, \t1, \t2
|
||||
vsum4shs \sum, \t1, \sum
|
||||
|
||||
vmrglb \t1, \z0, \src
|
||||
vmrglb \t2, \z0, \ref
|
||||
vsubshs \t1, \t1, \t2
|
||||
vsum4shs \sum, \t1, \sum
|
||||
|
||||
;# Now compute sse.
|
||||
vsububs \t1, \src, \ref
|
||||
vsububs \t2, \ref, \src
|
||||
vor \t1, \t1, \t2
|
||||
|
||||
vmsumubm \sse, \t1, \t1, \sse
|
||||
.endm
|
||||
|
||||
.macro variance_final sum, sse, z0, DS
|
||||
vsumsws \sum, \sum, \z0
|
||||
vsumsws \sse, \sse, \z0
|
||||
|
||||
stvx \sum, 0, r1
|
||||
lwz r3, 12(r1)
|
||||
|
||||
stvx \sse, 0, r1
|
||||
lwz r4, 12(r1)
|
||||
|
||||
stw r4, 0(r9) ;# sse
|
||||
|
||||
mullw r3, r3, r3 ;# sum*sum
|
||||
srawi r3, r3, \DS ;# (sum*sum) >> DS
subf r3, r3, r4 ;# sse - ((sum*sum) >> DS)
|
||||
.endm
|
||||
|
||||
.macro compute_sum_sse_16 V, increment_counter
|
||||
load_and_align_16 v16, r7, r8, \increment_counter
|
||||
compute_sum_sse \V, v16, v18, v19, v20, v21, v23
|
||||
.endm
|
||||
|
||||
.macro load_and_align_16 V, R, P, increment_counter
|
||||
lvsl v17, 0, \R ;# permutate value for alignment
|
||||
|
||||
;# input to filter is 21 bytes wide, output is 16 bytes.
|
||||
;# input can span three vectors if not aligned correctly.
|
||||
lvx v21, 0, \R
|
||||
lvx v22, r10, \R
|
||||
|
||||
.if \increment_counter
|
||||
add \R, \R, \P
|
||||
.endif
|
||||
|
||||
vperm \V, v21, v22, v17
|
||||
.endm
|
||||
|
||||
.align 2
|
||||
;# r3 unsigned char *src_ptr
|
||||
;# r4 int src_pixels_per_line
|
||||
;# r5 int xoffset
|
||||
;# r6 int yoffset
|
||||
;# r7 unsigned char *dst_ptr
|
||||
;# r8 int dst_pixels_per_line
|
||||
;# r9 unsigned int *sse
|
||||
;#
|
||||
;# r3 return value
|
||||
vp9_sub_pixel_variance4x4_ppc:
|
||||
mfspr r11, 256 ;# get old VRSAVE
|
||||
oris r12, r11, 0xf830
|
||||
ori r12, r12, 0xfff8
|
||||
mtspr 256, r12 ;# set VRSAVE
|
||||
|
||||
stwu r1,-32(r1) ;# create space on the stack
|
||||
|
||||
HProlog second_pass_4x4_pre_copy_b
|
||||
|
||||
;# Load up permutation constants
|
||||
load_c v10, b_0123_b, 0, r12, r0
|
||||
load_c v11, b_4567_b, 0, r12, r0
|
||||
|
||||
hfilter_8 v0, v10, v11, 1
|
||||
hfilter_8 v1, v10, v11, 1
|
||||
hfilter_8 v2, v10, v11, 1
|
||||
hfilter_8 v3, v10, v11, 1
|
||||
|
||||
;# Finished filtering main horizontal block. If there is no
|
||||
;# vertical filtering, jump to storing the data. Otherwise
|
||||
;# load up and filter the additional line that is needed
|
||||
;# for the vertical filter.
|
||||
beq compute_sum_sse_4x4_b
|
||||
|
||||
hfilter_8 v4, v10, v11, 0
|
||||
|
||||
b second_pass_4x4_b
|
||||
|
||||
second_pass_4x4_pre_copy_b:
|
||||
slwi r6, r6, 5 ;# index into vertical filter array
|
||||
|
||||
load_and_align_16 v0, r3, r4, 1
|
||||
load_and_align_16 v1, r3, r4, 1
|
||||
load_and_align_16 v2, r3, r4, 1
|
||||
load_and_align_16 v3, r3, r4, 1
|
||||
load_and_align_16 v4, r3, r4, 0
|
||||
|
||||
second_pass_4x4_b:
|
||||
vspltish v20, 8
|
||||
vspltish v18, 3
|
||||
vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
|
||||
|
||||
load_vfilter v20, v21
|
||||
|
||||
vfilter_16 v0, v1
|
||||
vfilter_16 v1, v2
|
||||
vfilter_16 v2, v3
|
||||
vfilter_16 v3, v4
|
||||
|
||||
compute_sum_sse_4x4_b:
|
||||
vspltish v18, 0 ;# sum
|
||||
vspltish v19, 0 ;# sse
|
||||
vspltish v23, 0 ;# unpack
|
||||
li r10, 16
|
||||
|
||||
load_and_align_16 v4, r7, r8, 1
|
||||
load_and_align_16 v5, r7, r8, 1
|
||||
load_and_align_16 v6, r7, r8, 1
|
||||
load_and_align_16 v7, r7, r8, 1
|
||||
|
||||
vmrghb v0, v0, v1
|
||||
vmrghb v1, v2, v3
|
||||
|
||||
vmrghb v2, v4, v5
|
||||
vmrghb v3, v6, v7
|
||||
|
||||
load_c v10, b_hilo_b, 0, r12, r0
|
||||
|
||||
vperm v0, v0, v1, v10
|
||||
vperm v1, v2, v3, v10
|
||||
|
||||
compute_sum_sse v0, v1, v18, v19, v20, v21, v23
|
||||
|
||||
variance_final v18, v19, v23, 4
|
||||
|
||||
addi r1, r1, 32 ;# recover stack
|
||||
mtspr 256, r11 ;# reset old VRSAVE
|
||||
|
||||
blr
|
||||
|
||||
.align 2
|
||||
;# r3 unsigned char *src_ptr
|
||||
;# r4 int src_pixels_per_line
|
||||
;# r5 int xoffset
|
||||
;# r6 int yoffset
|
||||
;# r7 unsigned char *dst_ptr
|
||||
;# r8 int dst_pixels_per_line
|
||||
;# r9 unsigned int *sse
|
||||
;#
|
||||
;# r3 return value
|
||||
vp9_sub_pixel_variance8x8_ppc:
|
||||
mfspr r11, 256 ;# get old VRSAVE
|
||||
oris r12, r11, 0xfff0
|
||||
ori r12, r12, 0xffff
|
||||
mtspr 256, r12 ;# set VRSAVE
|
||||
|
||||
stwu r1,-32(r1) ;# create space on the stack
|
||||
|
||||
HProlog second_pass_8x8_pre_copy_b
|
||||
|
||||
;# Load up permutation constants
|
||||
load_c v10, b_0123_b, 0, r12, r0
|
||||
load_c v11, b_4567_b, 0, r12, r0
|
||||
|
||||
hfilter_8 v0, v10, v11, 1
|
||||
hfilter_8 v1, v10, v11, 1
|
||||
hfilter_8 v2, v10, v11, 1
|
||||
hfilter_8 v3, v10, v11, 1
|
||||
hfilter_8 v4, v10, v11, 1
|
||||
hfilter_8 v5, v10, v11, 1
|
||||
hfilter_8 v6, v10, v11, 1
|
||||
hfilter_8 v7, v10, v11, 1
|
||||
|
||||
;# Finished filtering main horizontal block. If there is no
|
||||
;# vertical filtering, jump to storing the data. Otherwise
|
||||
;# load up and filter the additional line that is needed
|
||||
;# for the vertical filter.
|
||||
beq compute_sum_sse_8x8_b
|
||||
|
||||
hfilter_8 v8, v10, v11, 0
|
||||
|
||||
b second_pass_8x8_b
|
||||
|
||||
second_pass_8x8_pre_copy_b:
|
||||
slwi. r6, r6, 5 ;# index into vertical filter array
|
||||
|
||||
load_and_align_16 v0, r3, r4, 1
|
||||
load_and_align_16 v1, r3, r4, 1
|
||||
load_and_align_16 v2, r3, r4, 1
|
||||
load_and_align_16 v3, r3, r4, 1
|
||||
load_and_align_16 v4, r3, r4, 1
|
||||
load_and_align_16 v5, r3, r4, 1
|
||||
load_and_align_16 v6, r3, r4, 1
|
||||
load_and_align_16 v7, r3, r4, 1
|
||||
load_and_align_16 v8, r3, r4, 0
|
||||
|
||||
beq compute_sum_sse_8x8_b
|
||||
|
||||
second_pass_8x8_b:
|
||||
vspltish v20, 8
|
||||
vspltish v18, 3
|
||||
vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
|
||||
|
||||
load_vfilter v20, v21
|
||||
|
||||
vfilter_16 v0, v1
|
||||
vfilter_16 v1, v2
|
||||
vfilter_16 v2, v3
|
||||
vfilter_16 v3, v4
|
||||
vfilter_16 v4, v5
|
||||
vfilter_16 v5, v6
|
||||
vfilter_16 v6, v7
|
||||
vfilter_16 v7, v8
|
||||
|
||||
compute_sum_sse_8x8_b:
|
||||
vspltish v18, 0 ;# sum
|
||||
vspltish v19, 0 ;# sse
|
||||
vspltish v23, 0 ;# unpack
|
||||
li r10, 16
|
||||
|
||||
vmrghb v0, v0, v1
|
||||
vmrghb v1, v2, v3
|
||||
vmrghb v2, v4, v5
|
||||
vmrghb v3, v6, v7
|
||||
|
||||
load_and_align_16 v4, r7, r8, 1
|
||||
load_and_align_16 v5, r7, r8, 1
|
||||
load_and_align_16 v6, r7, r8, 1
|
||||
load_and_align_16 v7, r7, r8, 1
|
||||
load_and_align_16 v8, r7, r8, 1
|
||||
load_and_align_16 v9, r7, r8, 1
|
||||
load_and_align_16 v10, r7, r8, 1
|
||||
load_and_align_16 v11, r7, r8, 0
|
||||
|
||||
vmrghb v4, v4, v5
|
||||
vmrghb v5, v6, v7
|
||||
vmrghb v6, v8, v9
|
||||
vmrghb v7, v10, v11
|
||||
|
||||
compute_sum_sse v0, v4, v18, v19, v20, v21, v23
|
||||
compute_sum_sse v1, v5, v18, v19, v20, v21, v23
|
||||
compute_sum_sse v2, v6, v18, v19, v20, v21, v23
|
||||
compute_sum_sse v3, v7, v18, v19, v20, v21, v23
|
||||
|
||||
variance_final v18, v19, v23, 6
|
||||
|
||||
addi r1, r1, 32 ;# recover stack
|
||||
mtspr 256, r11 ;# reset old VRSAVE
|
||||
blr
|
||||
|
||||
.align 2
|
||||
;# r3 unsigned char *src_ptr
|
||||
;# r4 int src_pixels_per_line
|
||||
;# r5 int xoffset
|
||||
;# r6 int yoffset
|
||||
;# r7 unsigned char *dst_ptr
|
||||
;# r8 int dst_pixels_per_line
|
||||
;# r9 unsigned int *sse
|
||||
;#
|
||||
;# r3 return value
|
||||
vp9_sub_pixel_variance8x16_ppc:
|
||||
mfspr r11, 256 ;# get old VRSAVE
|
||||
oris r12, r11, 0xffff
|
||||
ori r12, r12, 0xfffc
|
||||
mtspr 256, r12 ;# set VRSAVE
|
||||
|
||||
stwu r1,-32(r1) ;# create space on the stack
|
||||
|
||||
HProlog second_pass_8x16_pre_copy_b
|
||||
|
||||
;# Load up permutation constants
|
||||
load_c v29, b_0123_b, 0, r12, r0
|
||||
load_c v30, b_4567_b, 0, r12, r0
|
||||
|
||||
hfilter_8 v0, v29, v30, 1
|
||||
hfilter_8 v1, v29, v30, 1
|
||||
hfilter_8 v2, v29, v30, 1
|
||||
hfilter_8 v3, v29, v30, 1
|
||||
hfilter_8 v4, v29, v30, 1
|
||||
hfilter_8 v5, v29, v30, 1
|
||||
hfilter_8 v6, v29, v30, 1
|
||||
hfilter_8 v7, v29, v30, 1
|
||||
hfilter_8 v8, v29, v30, 1
|
||||
hfilter_8 v9, v29, v30, 1
|
||||
hfilter_8 v10, v29, v30, 1
|
||||
hfilter_8 v11, v29, v30, 1
|
||||
hfilter_8 v12, v29, v30, 1
|
||||
hfilter_8 v13, v29, v30, 1
|
||||
hfilter_8 v14, v29, v30, 1
|
||||
hfilter_8 v15, v29, v30, 1
|
||||
|
||||
;# Finished filtering main horizontal block. If there is no
|
||||
;# vertical filtering, jump to storing the data. Otherwise
|
||||
;# load up and filter the additional line that is needed
|
||||
;# for the vertical filter.
|
||||
beq compute_sum_sse_8x16_b
|
||||
|
||||
hfilter_8 v16, v29, v30, 0
|
||||
|
||||
b second_pass_8x16_b
|
||||
|
||||
second_pass_8x16_pre_copy_b:
|
||||
slwi. r6, r6, 5 ;# index into vertical filter array
|
||||
|
||||
load_and_align_16 v0, r3, r4, 1
|
||||
load_and_align_16 v1, r3, r4, 1
|
||||
load_and_align_16 v2, r3, r4, 1
|
||||
load_and_align_16 v3, r3, r4, 1
|
||||
load_and_align_16 v4, r3, r4, 1
|
||||
load_and_align_16 v5, r3, r4, 1
|
||||
load_and_align_16 v6, r3, r4, 1
|
||||
load_and_align_16 v7, r3, r4, 1
|
||||
load_and_align_16 v8, r3, r4, 1
|
||||
load_and_align_16 v9, r3, r4, 1
|
||||
load_and_align_16 v10, r3, r4, 1
|
||||
load_and_align_16 v11, r3, r4, 1
|
||||
load_and_align_16 v12, r3, r4, 1
|
||||
load_and_align_16 v13, r3, r4, 1
|
||||
load_and_align_16 v14, r3, r4, 1
|
||||
load_and_align_16 v15, r3, r4, 1
|
||||
load_and_align_16 v16, r3, r4, 0
|
||||
|
||||
beq compute_sum_sse_8x16_b
|
||||
|
||||
second_pass_8x16_b:
|
||||
vspltish v20, 8
|
||||
vspltish v18, 3
|
||||
vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
|
||||
|
||||
load_vfilter v20, v21
|
||||
|
||||
vfilter_16 v0, v1
|
||||
vfilter_16 v1, v2
|
||||
vfilter_16 v2, v3
|
||||
vfilter_16 v3, v4
|
||||
vfilter_16 v4, v5
|
||||
vfilter_16 v5, v6
|
||||
vfilter_16 v6, v7
|
||||
vfilter_16 v7, v8
|
||||
vfilter_16 v8, v9
|
||||
vfilter_16 v9, v10
|
||||
vfilter_16 v10, v11
|
||||
vfilter_16 v11, v12
|
||||
vfilter_16 v12, v13
|
||||
vfilter_16 v13, v14
|
||||
vfilter_16 v14, v15
|
||||
vfilter_16 v15, v16
|
||||
|
||||
compute_sum_sse_8x16_b:
|
||||
vspltish v18, 0 ;# sum
|
||||
vspltish v19, 0 ;# sse
|
||||
vspltish v23, 0 ;# unpack
|
||||
li r10, 16
|
||||
|
||||
vmrghb v0, v0, v1
|
||||
vmrghb v1, v2, v3
|
||||
vmrghb v2, v4, v5
|
||||
vmrghb v3, v6, v7
|
||||
vmrghb v4, v8, v9
|
||||
vmrghb v5, v10, v11
|
||||
vmrghb v6, v12, v13
|
||||
vmrghb v7, v14, v15
|
||||
|
||||
load_and_align_16 v8, r7, r8, 1
|
||||
load_and_align_16 v9, r7, r8, 1
|
||||
load_and_align_16 v10, r7, r8, 1
|
||||
load_and_align_16 v11, r7, r8, 1
|
||||
load_and_align_16 v12, r7, r8, 1
|
||||
load_and_align_16 v13, r7, r8, 1
|
||||
load_and_align_16 v14, r7, r8, 1
|
||||
load_and_align_16 v15, r7, r8, 1
|
||||
|
||||
vmrghb v8, v8, v9
|
||||
vmrghb v9, v10, v11
|
||||
vmrghb v10, v12, v13
|
||||
vmrghb v11, v14, v15
|
||||
|
||||
compute_sum_sse v0, v8, v18, v19, v20, v21, v23
|
||||
compute_sum_sse v1, v9, v18, v19, v20, v21, v23
|
||||
compute_sum_sse v2, v10, v18, v19, v20, v21, v23
|
||||
compute_sum_sse v3, v11, v18, v19, v20, v21, v23
|
||||
|
||||
load_and_align_16 v8, r7, r8, 1
|
||||
load_and_align_16 v9, r7, r8, 1
|
||||
load_and_align_16 v10, r7, r8, 1
|
||||
load_and_align_16 v11, r7, r8, 1
|
||||
load_and_align_16 v12, r7, r8, 1
|
||||
load_and_align_16 v13, r7, r8, 1
|
||||
load_and_align_16 v14, r7, r8, 1
|
||||
load_and_align_16 v15, r7, r8, 0
|
||||
|
||||
vmrghb v8, v8, v9
|
||||
vmrghb v9, v10, v11
|
||||
vmrghb v10, v12, v13
|
||||
vmrghb v11, v14, v15
|
||||
|
||||
compute_sum_sse v4, v8, v18, v19, v20, v21, v23
|
||||
compute_sum_sse v5, v9, v18, v19, v20, v21, v23
|
||||
compute_sum_sse v6, v10, v18, v19, v20, v21, v23
|
||||
compute_sum_sse v7, v11, v18, v19, v20, v21, v23
|
||||
|
||||
variance_final v18, v19, v23, 7
|
||||
|
||||
addi r1, r1, 32 ;# recover stack
|
||||
mtspr 256, r11 ;# reset old VRSAVE
|
||||
blr
|
||||
|
||||
;# Filters a horizontal line
|
||||
;# expects:
|
||||
;# r3 src_ptr
|
||||
;# r4 pitch
|
||||
;# r10 16
|
||||
;# r12 32
|
||||
;# v17 perm input
|
||||
;# v18 rounding
|
||||
;# v19 shift
|
||||
;# v20 filter taps
|
||||
;# v21 tmp
|
||||
;# v22 tmp
|
||||
;# v23 tmp
|
||||
;# v24 tmp
|
||||
;# v25 tmp
|
||||
;# v26 tmp
|
||||
;# v27 tmp
|
||||
;# v28 perm output
|
||||
;#
|
||||
.macro hfilter_16 V, increment_counter
|
||||
|
||||
lvsl v17, 0, r3 ;# permutate value for alignment
|
||||
|
||||
;# input to filter is 21 bytes wide, output is 16 bytes.
|
||||
;# input can span three vectors if not aligned correctly.
|
||||
lvx v21, 0, r3
|
||||
lvx v22, r10, r3
|
||||
lvx v23, r12, r3
|
||||
|
||||
.if \increment_counter
|
||||
add r3, r3, r4
|
||||
.endif
|
||||
vperm v21, v21, v22, v17
|
||||
vperm v22, v22, v23, v17 ;# v8 v9 = 21 input pixels left-justified
|
||||
|
||||
;# set 0
|
||||
vmsummbm v24, v20, v21, v18 ;# taps times elements
|
||||
|
||||
;# set 1
|
||||
vsldoi v23, v21, v22, 1
|
||||
vmsummbm v25, v20, v23, v18
|
||||
|
||||
;# set 2
|
||||
vsldoi v23, v21, v22, 2
|
||||
vmsummbm v26, v20, v23, v18
|
||||
|
||||
;# set 3
|
||||
vsldoi v23, v21, v22, 3
|
||||
vmsummbm v27, v20, v23, v18
|
||||
|
||||
vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
|
||||
vpkswus v25, v26, v27 ;# v25 = 2 6 A E 3 7 B F
|
||||
|
||||
vsrh v24, v24, v19 ;# divide v0, v1 by 128
|
||||
vsrh v25, v25, v19
|
||||
|
||||
vpkuhus \V, v24, v25 ;# \V = scrambled 8-bit result
|
||||
vperm \V, \V, v0, v28 ;# \V = correctly-ordered result
|
||||
.endm
|
||||
|
||||
.align 2
|
||||
;# r3 unsigned char *src_ptr
|
||||
;# r4 int src_pixels_per_line
|
||||
;# r5 int xoffset
|
||||
;# r6 int yoffset
|
||||
;# r7 unsigned char *dst_ptr
|
||||
;# r8 int dst_pixels_per_line
|
||||
;# r9 unsigned int *sse
|
||||
;#
|
||||
;# r3 return value
|
||||
vp9_sub_pixel_variance16x8_ppc:
|
||||
mfspr r11, 256 ;# get old VRSAVE
|
||||
oris r12, r11, 0xffff
|
||||
ori r12, r12, 0xfff8
|
||||
mtspr 256, r12 ;# set VRSAVE
|
||||
|
||||
stwu r1, -32(r1) ;# create space on the stack
|
||||
|
||||
HProlog second_pass_16x8_pre_copy_b
|
||||
|
||||
hfilter_16 v0, 1
|
||||
hfilter_16 v1, 1
|
||||
hfilter_16 v2, 1
|
||||
hfilter_16 v3, 1
|
||||
hfilter_16 v4, 1
|
||||
hfilter_16 v5, 1
|
||||
hfilter_16 v6, 1
|
||||
hfilter_16 v7, 1
|
||||
|
||||
;# Finished filtering main horizontal block. If there is no
|
||||
;# vertical filtering, jump to storing the data. Otherwise
|
||||
;# load up and filter the additional line that is needed
|
||||
;# for the vertical filter.
|
||||
beq compute_sum_sse_16x8_b
|
||||
|
||||
hfilter_16 v8, 0
|
||||
|
||||
b second_pass_16x8_b
|
||||
|
||||
second_pass_16x8_pre_copy_b:
|
||||
slwi. r6, r6, 5 ;# index into vertical filter array
|
||||
|
||||
load_and_align_16 v0, r3, r4, 1
|
||||
load_and_align_16 v1, r3, r4, 1
|
||||
load_and_align_16 v2, r3, r4, 1
|
||||
load_and_align_16 v3, r3, r4, 1
|
||||
load_and_align_16 v4, r3, r4, 1
|
||||
load_and_align_16 v5, r3, r4, 1
|
||||
load_and_align_16 v6, r3, r4, 1
|
||||
load_and_align_16 v7, r3, r4, 1
|
||||
load_and_align_16 v8, r3, r4, 1
|
||||
|
||||
beq compute_sum_sse_16x8_b
|
||||
|
||||
second_pass_16x8_b:
|
||||
vspltish v20, 8
|
||||
vspltish v18, 3
|
||||
vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
|
||||
|
||||
load_vfilter v20, v21
|
||||
|
||||
vfilter_16 v0, v1
|
||||
vfilter_16 v1, v2
|
||||
vfilter_16 v2, v3
|
||||
vfilter_16 v3, v4
|
||||
vfilter_16 v4, v5
|
||||
vfilter_16 v5, v6
|
||||
vfilter_16 v6, v7
|
||||
vfilter_16 v7, v8
|
||||
|
||||
compute_sum_sse_16x8_b:
|
||||
vspltish v18, 0 ;# sum
|
||||
vspltish v19, 0 ;# sse
|
||||
vspltish v23, 0 ;# unpack
|
||||
li r10, 16
|
||||
|
||||
compute_sum_sse_16 v0, 1
|
||||
compute_sum_sse_16 v1, 1
|
||||
compute_sum_sse_16 v2, 1
|
||||
compute_sum_sse_16 v3, 1
|
||||
compute_sum_sse_16 v4, 1
|
||||
compute_sum_sse_16 v5, 1
|
||||
compute_sum_sse_16 v6, 1
|
||||
compute_sum_sse_16 v7, 0
|
||||
|
||||
variance_final v18, v19, v23, 7
|
||||
|
||||
addi r1, r1, 32 ;# recover stack
|
||||
|
||||
mtspr 256, r11 ;# reset old VRSAVE
|
||||
|
||||
blr
|
||||
|
||||
.align 2
|
||||
;# r3 unsigned char *src_ptr
|
||||
;# r4 int src_pixels_per_line
|
||||
;# r5 int xoffset
|
||||
;# r6 int yoffset
|
||||
;# r7 unsigned char *dst_ptr
|
||||
;# r8 int dst_pixels_per_line
|
||||
;# r9 unsigned int *sse
|
||||
;#
|
||||
;# r3 return value
|
||||
vp9_sub_pixel_variance16x16_ppc:
|
||||
mfspr r11, 256 ;# get old VRSAVE
|
||||
oris r12, r11, 0xffff
|
||||
ori r12, r12, 0xfff8
|
||||
mtspr 256, r12 ;# set VRSAVE
|
||||
|
||||
stwu r1, -32(r1) ;# create space on the stack
|
||||
|
||||
HProlog second_pass_16x16_pre_copy_b
|
||||
|
||||
hfilter_16 v0, 1
|
||||
hfilter_16 v1, 1
|
||||
hfilter_16 v2, 1
|
||||
hfilter_16 v3, 1
|
||||
hfilter_16 v4, 1
|
||||
hfilter_16 v5, 1
|
||||
hfilter_16 v6, 1
|
||||
hfilter_16 v7, 1
|
||||
hfilter_16 v8, 1
|
||||
hfilter_16 v9, 1
|
||||
hfilter_16 v10, 1
|
||||
hfilter_16 v11, 1
|
||||
hfilter_16 v12, 1
|
||||
hfilter_16 v13, 1
|
||||
hfilter_16 v14, 1
|
||||
hfilter_16 v15, 1
|
||||
|
||||
;# Finished filtering main horizontal block. If there is no
|
||||
;# vertical filtering, jump to storing the data. Otherwise
|
||||
;# load up and filter the additional line that is needed
|
||||
;# for the vertical filter.
|
||||
beq compute_sum_sse_16x16_b
|
||||
|
||||
hfilter_16 v16, 0
|
||||
|
||||
b second_pass_16x16_b
|
||||
|
||||
second_pass_16x16_pre_copy_b:
|
||||
slwi. r6, r6, 5 ;# index into vertical filter array
|
||||
|
||||
load_and_align_16 v0, r3, r4, 1
|
||||
load_and_align_16 v1, r3, r4, 1
|
||||
load_and_align_16 v2, r3, r4, 1
|
||||
load_and_align_16 v3, r3, r4, 1
|
||||
load_and_align_16 v4, r3, r4, 1
|
||||
load_and_align_16 v5, r3, r4, 1
|
||||
load_and_align_16 v6, r3, r4, 1
|
||||
load_and_align_16 v7, r3, r4, 1
|
||||
load_and_align_16 v8, r3, r4, 1
|
||||
load_and_align_16 v9, r3, r4, 1
|
||||
load_and_align_16 v10, r3, r4, 1
|
||||
load_and_align_16 v11, r3, r4, 1
|
||||
load_and_align_16 v12, r3, r4, 1
|
||||
load_and_align_16 v13, r3, r4, 1
|
||||
load_and_align_16 v14, r3, r4, 1
|
||||
load_and_align_16 v15, r3, r4, 1
|
||||
load_and_align_16 v16, r3, r4, 0
|
||||
|
||||
beq compute_sum_sse_16x16_b
|
||||
|
||||
second_pass_16x16_b:
|
||||
vspltish v20, 8
|
||||
vspltish v18, 3
|
||||
vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
|
||||
|
||||
load_vfilter v20, v21
|
||||
|
||||
vfilter_16 v0, v1
|
||||
vfilter_16 v1, v2
|
||||
vfilter_16 v2, v3
|
||||
vfilter_16 v3, v4
|
||||
vfilter_16 v4, v5
|
||||
vfilter_16 v5, v6
|
||||
vfilter_16 v6, v7
|
||||
vfilter_16 v7, v8
|
||||
vfilter_16 v8, v9
|
||||
vfilter_16 v9, v10
|
||||
vfilter_16 v10, v11
|
||||
vfilter_16 v11, v12
|
||||
vfilter_16 v12, v13
|
||||
vfilter_16 v13, v14
|
||||
vfilter_16 v14, v15
|
||||
vfilter_16 v15, v16
|
||||
|
||||
compute_sum_sse_16x16_b:
|
||||
vspltish v18, 0 ;# sum
|
||||
vspltish v19, 0 ;# sse
|
||||
vspltish v23, 0 ;# unpack
|
||||
li r10, 16
|
||||
|
||||
compute_sum_sse_16 v0, 1
|
||||
compute_sum_sse_16 v1, 1
|
||||
compute_sum_sse_16 v2, 1
|
||||
compute_sum_sse_16 v3, 1
|
||||
compute_sum_sse_16 v4, 1
|
||||
compute_sum_sse_16 v5, 1
|
||||
compute_sum_sse_16 v6, 1
|
||||
compute_sum_sse_16 v7, 1
|
||||
compute_sum_sse_16 v8, 1
|
||||
compute_sum_sse_16 v9, 1
|
||||
compute_sum_sse_16 v10, 1
|
||||
compute_sum_sse_16 v11, 1
|
||||
compute_sum_sse_16 v12, 1
|
||||
compute_sum_sse_16 v13, 1
|
||||
compute_sum_sse_16 v14, 1
|
||||
compute_sum_sse_16 v15, 0
|
||||
|
||||
variance_final v18, v19, v23, 8
|
||||
|
||||
addi r1, r1, 32 ;# recover stack
|
||||
|
||||
mtspr 256, r11 ;# reset old VRSAVE
|
||||
|
||||
blr
|
||||
|
||||
.data
|
||||
|
||||
.align 4
|
||||
hfilter_b:
|
||||
.byte 128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0
|
||||
.byte 112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0
|
||||
.byte 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0
|
||||
.byte 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0
|
||||
.byte 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0
|
||||
.byte 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0
|
||||
.byte 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0
|
||||
.byte 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0
|
||||
|
||||
.align 4
|
||||
vfilter_b:
|
||||
.byte 128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128
|
||||
.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
|
||||
.byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
|
||||
.byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
|
||||
.byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
|
||||
.byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
|
||||
.byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
|
||||
.byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
|
||||
.byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
|
||||
.byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
|
||||
.byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
|
||||
.byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
|
||||
.byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
|
||||
.byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
|
||||
.byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
|
||||
.byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
|
||||
|
||||
.align 4
|
||||
b_hperm_b:
|
||||
.byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
|
||||
|
||||
.align 4
|
||||
b_0123_b:
|
||||
.byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
|
||||
|
||||
.align 4
|
||||
b_4567_b:
|
||||
.byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
|
||||
|
||||
b_hilo_b:
|
||||
.byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
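For reference, the hfilter_b and vfilter_b tables above encode a two-tap bilinear filter whose taps always sum to 128; each output pixel is formed roughly as in the sketch below, with the rounding bias of 64 (the 0x40 constant built with vspltisw/vslw) and the downshift by 7 noted in the comments (the helper name is illustrative).

#include <stdint.h>

/* Two-tap bilinear interpolation between pixels a and b, where
 * tap0 + tap1 == 128 (one row of hfilter_b / one pair from vfilter_b). */
static uint8_t bilinear_tap(uint8_t a, uint8_t b, int tap0, int tap1) {
    return (uint8_t)((a * tap0 + b * tap1 + 64) >> 7);
}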