Merge "Remove unused vp9 ppc files" into experimental

John Koleszar 2013-04-11 14:39:18 -07:00 committed by Gerrit Code Review
Parents 2dc6acc0fc 633d9e7b4f
Commit 4ba74ae81a
16 changed files with 0 additions and 5788 deletions

View File

@@ -1,47 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
.globl copy_mem16x16_ppc
;# r3 unsigned char *src
;# r4 int src_stride
;# r5 unsigned char *dst
;# r6 int dst_stride
;# Make the assumption that input will not be aligned,
;# but the output will be. So two reads and a perm
;# for the input, but only one store for the output.
copy_mem16x16_ppc:
mfspr r11, 256 ;# get old VRSAVE
oris r12, r11, 0xe000
mtspr 256, r12 ;# set VRSAVE
li r10, 16
mtctr r10
cp_16x16_loop:
lvsl v0, 0, r3 ;# permutate value for alignment
lvx v1, 0, r3
lvx v2, r10, r3
vperm v1, v1, v2, v0
stvx v1, 0, r5
add r3, r3, r4 ;# increment source pointer
add r5, r5, r6 ;# increment destination pointer
bdnz cp_16x16_loop
mtspr 256, r11 ;# reset old VRSAVE
blr
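For reference, a plain-C sketch of what this deleted routine computes. The function name and signature follow the register comments above; the C fallback in the tree (vp9_copy_mem16x16_c) may differ in minor details.

/* Copy a 16x16 block of bytes; src may be unaligned, dst is assumed aligned
 * by the AltiVec version (hence the two loads plus vperm per row above). */
void copy_mem16x16(const unsigned char *src, int src_stride,
                   unsigned char *dst, int dst_stride) {
  int r, c;
  for (r = 0; r < 16; r++) {
    for (c = 0; c < 16; c++)
      dst[c] = src[c];
    src += src_stride;
    dst += dst_stride;
  }
}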

File diff not shown because it is too large. Load Diff

View File

@@ -1,677 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
.globl bilinear_predict4x4_ppc
.globl bilinear_predict8x4_ppc
.globl bilinear_predict8x8_ppc
.globl bilinear_predict16x16_ppc
.macro load_c V, LABEL, OFF, R0, R1
lis \R0, \LABEL@ha
la \R1, \LABEL@l(\R0)
lvx \V, \OFF, \R1
.endm
.macro load_vfilter V0, V1
load_c \V0, vfilter_b, r6, r9, r10
addi r6, r6, 16
lvx \V1, r6, r10
.endm
.macro HProlog jump_label
;# load up horizontal filter
slwi. r5, r5, 4 ;# index into horizontal filter array
;# index to the next set of vectors in the row.
li r10, 16
li r12, 32
;# downshift by 7 ( divide by 128 ) at the end
vspltish v19, 7
;# If there isn't any filtering to be done for the horizontal, then
;# just skip to the second pass.
beq \jump_label
load_c v20, hfilter_b, r5, r9, r0
;# setup constants
;# v14 permutation value for alignment
load_c v28, b_hperm_b, 0, r9, r0
;# rounding added in on the multiply
vspltisw v21, 8
vspltisw v18, 3
vslw v18, v21, v18 ;# 0x00000040000000400000004000000040
slwi. r6, r6, 5 ;# index into vertical filter array
.endm
;# Filters a horizontal line
;# expects:
;# r3 src_ptr
;# r4 pitch
;# r10 16
;# r12 32
;# v17 perm input
;# v18 rounding
;# v19 shift
;# v20 filter taps
;# v21 tmp
;# v22 tmp
;# v23 tmp
;# v24 tmp
;# v25 tmp
;# v26 tmp
;# v27 tmp
;# v28 perm output
;#
.macro HFilter V
vperm v24, v21, v21, v10 ;# v24 = 0123 1234 2345 3456
vperm v25, v21, v21, v11 ;# v25 = 4567 5678 6789 789A
vmsummbm v24, v20, v24, v18
vmsummbm v25, v20, v25, v18
vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
vsrh v24, v24, v19 ;# divide v0, v1 by 128
vpkuhus \V, v24, v24 ;# \V = scrambled 8-bit result
.endm
.macro hfilter_8 V, increment_counter
lvsl v17, 0, r3 ;# permutate value for alignment
;# input to filter is 9 bytes wide, output is 8 bytes.
lvx v21, 0, r3
lvx v22, r10, r3
.if \increment_counter
add r3, r3, r4
.endif
vperm v21, v21, v22, v17
HFilter \V
.endm
.macro load_and_align_8 V, increment_counter
lvsl v17, 0, r3 ;# permutate value for alignment
;# input to filter is 21 bytes wide, output is 16 bytes.
;# input can span three vectors if not aligned correctly.
lvx v21, 0, r3
lvx v22, r10, r3
.if \increment_counter
add r3, r3, r4
.endif
vperm \V, v21, v22, v17
.endm
.macro write_aligned_8 V, increment_counter
stvx \V, 0, r7
.if \increment_counter
add r7, r7, r8
.endif
.endm
.macro vfilter_16 P0 P1
vmuleub v22, \P0, v20 ;# 64 + 4 positive taps
vadduhm v22, v18, v22
vmuloub v23, \P0, v20
vadduhm v23, v18, v23
vmuleub v24, \P1, v21
vadduhm v22, v22, v24 ;# Re = evens, saturation unnecessary
vmuloub v25, \P1, v21
vadduhm v23, v23, v25 ;# Ro = odds
vsrh v22, v22, v19 ;# divide by 128
vsrh v23, v23, v19 ;# v16 v17 = evens, odds
vmrghh \P0, v22, v23 ;# v18 v19 = 16-bit result in order
vmrglh v23, v22, v23
vpkuhus \P0, \P0, v23 ;# P0 = 8-bit result
.endm
.macro w_8x8 V, D, R, P
stvx \V, 0, r1
lwz \R, 0(r1)
stw \R, 0(r7)
lwz \R, 4(r1)
stw \R, 4(r7)
add \D, \D, \P
.endm
.align 2
;# r3 unsigned char * src
;# r4 int src_pitch
;# r5 int x_offset
;# r6 int y_offset
;# r7 unsigned char * dst
;# r8 int dst_pitch
bilinear_predict4x4_ppc:
mfspr r11, 256 ;# get old VRSAVE
oris r12, r11, 0xf830
ori r12, r12, 0xfff8
mtspr 256, r12 ;# set VRSAVE
stwu r1,-32(r1) ;# create space on the stack
HProlog second_pass_4x4_pre_copy_b
;# Load up permutation constants
load_c v10, b_0123_b, 0, r9, r12
load_c v11, b_4567_b, 0, r9, r12
hfilter_8 v0, 1
hfilter_8 v1, 1
hfilter_8 v2, 1
hfilter_8 v3, 1
;# Finished filtering main horizontal block. If there is no
;# vertical filtering, jump to storing the data. Otherwise
;# load up and filter the additional line that is needed
;# for the vertical filter.
beq store_out_4x4_b
hfilter_8 v4, 0
b second_pass_4x4_b
second_pass_4x4_pre_copy_b:
slwi r6, r6, 5 ;# index into vertical filter array
load_and_align_8 v0, 1
load_and_align_8 v1, 1
load_and_align_8 v2, 1
load_and_align_8 v3, 1
load_and_align_8 v4, 1
second_pass_4x4_b:
vspltish v20, 8
vspltish v18, 3
vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
load_vfilter v20, v21
vfilter_16 v0, v1
vfilter_16 v1, v2
vfilter_16 v2, v3
vfilter_16 v3, v4
store_out_4x4_b:
stvx v0, 0, r1
lwz r0, 0(r1)
stw r0, 0(r7)
add r7, r7, r8
stvx v1, 0, r1
lwz r0, 0(r1)
stw r0, 0(r7)
add r7, r7, r8
stvx v2, 0, r1
lwz r0, 0(r1)
stw r0, 0(r7)
add r7, r7, r8
stvx v3, 0, r1
lwz r0, 0(r1)
stw r0, 0(r7)
exit_4x4:
addi r1, r1, 32 ;# recover stack
mtspr 256, r11 ;# reset old VRSAVE
blr
.align 2
;# r3 unsigned char * src
;# r4 int src_pitch
;# r5 int x_offset
;# r6 int y_offset
;# r7 unsigned char * dst
;# r8 int dst_pitch
bilinear_predict8x4_ppc:
mfspr r11, 256 ;# get old VRSAVE
oris r12, r11, 0xf830
ori r12, r12, 0xfff8
mtspr 256, r12 ;# set VRSAVE
stwu r1,-32(r1) ;# create space on the stack
HProlog second_pass_8x4_pre_copy_b
;# Load up permutation constants
load_c v10, b_0123_b, 0, r9, r12
load_c v11, b_4567_b, 0, r9, r12
hfilter_8 v0, 1
hfilter_8 v1, 1
hfilter_8 v2, 1
hfilter_8 v3, 1
;# Finished filtering main horizontal block. If there is no
;# vertical filtering, jump to storing the data. Otherwise
;# load up and filter the additional line that is needed
;# for the vertical filter.
beq store_out_8x4_b
hfilter_8 v4, 0
b second_pass_8x4_b
second_pass_8x4_pre_copy_b:
slwi r6, r6, 5 ;# index into vertical filter array
load_and_align_8 v0, 1
load_and_align_8 v1, 1
load_and_align_8 v2, 1
load_and_align_8 v3, 1
load_and_align_8 v4, 1
second_pass_8x4_b:
vspltish v20, 8
vspltish v18, 3
vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
load_vfilter v20, v21
vfilter_16 v0, v1
vfilter_16 v1, v2
vfilter_16 v2, v3
vfilter_16 v3, v4
store_out_8x4_b:
cmpi cr0, r8, 8
beq cr0, store_aligned_8x4_b
w_8x8 v0, r7, r0, r8
w_8x8 v1, r7, r0, r8
w_8x8 v2, r7, r0, r8
w_8x8 v3, r7, r0, r8
b exit_8x4
store_aligned_8x4_b:
load_c v10, b_hilo_b, 0, r9, r10
vperm v0, v0, v1, v10
vperm v2, v2, v3, v10
stvx v0, 0, r7
addi r7, r7, 16
stvx v2, 0, r7
exit_8x4:
addi r1, r1, 32 ;# recover stack
mtspr 256, r11 ;# reset old VRSAVE
blr
.align 2
;# r3 unsigned char * src
;# r4 int src_pitch
;# r5 int x_offset
;# r6 int y_offset
;# r7 unsigned char * dst
;# r8 int dst_pitch
bilinear_predict8x8_ppc:
mfspr r11, 256 ;# get old VRSAVE
oris r12, r11, 0xfff0
ori r12, r12, 0xffff
mtspr 256, r12 ;# set VRSAVE
stwu r1,-32(r1) ;# create space on the stack
HProlog second_pass_8x8_pre_copy_b
;# Load up permutation constants
load_c v10, b_0123_b, 0, r9, r12
load_c v11, b_4567_b, 0, r9, r12
hfilter_8 v0, 1
hfilter_8 v1, 1
hfilter_8 v2, 1
hfilter_8 v3, 1
hfilter_8 v4, 1
hfilter_8 v5, 1
hfilter_8 v6, 1
hfilter_8 v7, 1
;# Finished filtering main horizontal block. If there is no
;# vertical filtering, jump to storing the data. Otherwise
;# load up and filter the additional line that is needed
;# for the vertical filter.
beq store_out_8x8_b
hfilter_8 v8, 0
b second_pass_8x8_b
second_pass_8x8_pre_copy_b:
slwi r6, r6, 5 ;# index into vertical filter array
load_and_align_8 v0, 1
load_and_align_8 v1, 1
load_and_align_8 v2, 1
load_and_align_8 v3, 1
load_and_align_8 v4, 1
load_and_align_8 v5, 1
load_and_align_8 v6, 1
load_and_align_8 v7, 1
load_and_align_8 v8, 0
second_pass_8x8_b:
vspltish v20, 8
vspltish v18, 3
vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
load_vfilter v20, v21
vfilter_16 v0, v1
vfilter_16 v1, v2
vfilter_16 v2, v3
vfilter_16 v3, v4
vfilter_16 v4, v5
vfilter_16 v5, v6
vfilter_16 v6, v7
vfilter_16 v7, v8
store_out_8x8_b:
cmpi cr0, r8, 8
beq cr0, store_aligned_8x8_b
w_8x8 v0, r7, r0, r8
w_8x8 v1, r7, r0, r8
w_8x8 v2, r7, r0, r8
w_8x8 v3, r7, r0, r8
w_8x8 v4, r7, r0, r8
w_8x8 v5, r7, r0, r8
w_8x8 v6, r7, r0, r8
w_8x8 v7, r7, r0, r8
b exit_8x8
store_aligned_8x8_b:
load_c v10, b_hilo_b, 0, r9, r10
vperm v0, v0, v1, v10
vperm v2, v2, v3, v10
vperm v4, v4, v5, v10
vperm v6, v6, v7, v10
stvx v0, 0, r7
addi r7, r7, 16
stvx v2, 0, r7
addi r7, r7, 16
stvx v4, 0, r7
addi r7, r7, 16
stvx v6, 0, r7
exit_8x8:
addi r1, r1, 32 ;# recover stack
mtspr 256, r11 ;# reset old VRSAVE
blr
;# Filters a horizontal line
;# expects:
;# r3 src_ptr
;# r4 pitch
;# r10 16
;# r12 32
;# v17 perm input
;# v18 rounding
;# v19 shift
;# v20 filter taps
;# v21 tmp
;# v22 tmp
;# v23 tmp
;# v24 tmp
;# v25 tmp
;# v26 tmp
;# v27 tmp
;# v28 perm output
;#
.macro hfilter_16 V, increment_counter
lvsl v17, 0, r3 ;# permutate value for alignment
;# input to filter is 21 bytes wide, output is 16 bytes.
;# input can span three vectors if not aligned correctly.
lvx v21, 0, r3
lvx v22, r10, r3
lvx v23, r12, r3
.if \increment_counter
add r3, r3, r4
.endif
vperm v21, v21, v22, v17
vperm v22, v22, v23, v17 ;# v8 v9 = 21 input pixels left-justified
;# set 0
vmsummbm v24, v20, v21, v18 ;# taps times elements
;# set 1
vsldoi v23, v21, v22, 1
vmsummbm v25, v20, v23, v18
;# set 2
vsldoi v23, v21, v22, 2
vmsummbm v26, v20, v23, v18
;# set 3
vsldoi v23, v21, v22, 3
vmsummbm v27, v20, v23, v18
vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
vpkswus v25, v26, v27 ;# v25 = 2 6 A E 3 7 B F
vsrh v24, v24, v19 ;# divide v0, v1 by 128
vsrh v25, v25, v19
vpkuhus \V, v24, v25 ;# \V = scrambled 8-bit result
vperm \V, \V, v0, v28 ;# \V = correctly-ordered result
.endm
.macro load_and_align_16 V, increment_counter
lvsl v17, 0, r3 ;# permutate value for alignment
;# input to filter is 21 bytes wide, output is 16 bytes.
;# input can span three vectors if not aligned correctly.
lvx v21, 0, r3
lvx v22, r10, r3
.if \increment_counter
add r3, r3, r4
.endif
vperm \V, v21, v22, v17
.endm
.macro write_16 V, increment_counter
stvx \V, 0, r7
.if \increment_counter
add r7, r7, r8
.endif
.endm
.align 2
;# r3 unsigned char * src
;# r4 int src_pitch
;# r5 int x_offset
;# r6 int y_offset
;# r7 unsigned char * dst
;# r8 int dst_pitch
bilinear_predict16x16_ppc:
mfspr r11, 256 ;# get old VRSAVE
oris r12, r11, 0xffff
ori r12, r12, 0xfff8
mtspr 256, r12 ;# set VRSAVE
HProlog second_pass_16x16_pre_copy_b
hfilter_16 v0, 1
hfilter_16 v1, 1
hfilter_16 v2, 1
hfilter_16 v3, 1
hfilter_16 v4, 1
hfilter_16 v5, 1
hfilter_16 v6, 1
hfilter_16 v7, 1
hfilter_16 v8, 1
hfilter_16 v9, 1
hfilter_16 v10, 1
hfilter_16 v11, 1
hfilter_16 v12, 1
hfilter_16 v13, 1
hfilter_16 v14, 1
hfilter_16 v15, 1
;# Finished filtering main horizontal block. If there is no
;# vertical filtering, jump to storing the data. Otherwise
;# load up and filter the additional line that is needed
;# for the vertical filter.
beq store_out_16x16_b
hfilter_16 v16, 0
b second_pass_16x16_b
second_pass_16x16_pre_copy_b:
slwi r6, r6, 5 ;# index into vertical filter array
load_and_align_16 v0, 1
load_and_align_16 v1, 1
load_and_align_16 v2, 1
load_and_align_16 v3, 1
load_and_align_16 v4, 1
load_and_align_16 v5, 1
load_and_align_16 v6, 1
load_and_align_16 v7, 1
load_and_align_16 v8, 1
load_and_align_16 v9, 1
load_and_align_16 v10, 1
load_and_align_16 v11, 1
load_and_align_16 v12, 1
load_and_align_16 v13, 1
load_and_align_16 v14, 1
load_and_align_16 v15, 1
load_and_align_16 v16, 0
second_pass_16x16_b:
vspltish v20, 8
vspltish v18, 3
vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
load_vfilter v20, v21
vfilter_16 v0, v1
vfilter_16 v1, v2
vfilter_16 v2, v3
vfilter_16 v3, v4
vfilter_16 v4, v5
vfilter_16 v5, v6
vfilter_16 v6, v7
vfilter_16 v7, v8
vfilter_16 v8, v9
vfilter_16 v9, v10
vfilter_16 v10, v11
vfilter_16 v11, v12
vfilter_16 v12, v13
vfilter_16 v13, v14
vfilter_16 v14, v15
vfilter_16 v15, v16
store_out_16x16_b:
write_16 v0, 1
write_16 v1, 1
write_16 v2, 1
write_16 v3, 1
write_16 v4, 1
write_16 v5, 1
write_16 v6, 1
write_16 v7, 1
write_16 v8, 1
write_16 v9, 1
write_16 v10, 1
write_16 v11, 1
write_16 v12, 1
write_16 v13, 1
write_16 v14, 1
write_16 v15, 0
mtspr 256, r11 ;# reset old VRSAVE
blr
.data
.align 4
hfilter_b:
.byte 128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0
.byte 112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0
.byte 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0
.byte 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0
.byte 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0
.byte 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0
.byte 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0
.byte 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0
.align 4
vfilter_b:
.byte 128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128
.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
.byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
.byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
.byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
.byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
.byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
.byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
.byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
.byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
.byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
.byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
.byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
.byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
.byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
.byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
.align 4
b_hperm_b:
.byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
.align 4
b_0123_b:
.byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
.align 4
b_4567_b:
.byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
b_hilo_b:
.byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
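A plain-C model of the deleted bilinear predictors may help when reading the vector code above. It is a sketch of the scheme encoded in hfilter_b/vfilter_b (taps {128 - 16*offset, 16*offset}, +64 rounding, shift right by 7); W and H are 4, 8 or 16 depending on the entry point, and the helper name is hypothetical.

/* Two-pass 1/8-pel bilinear prediction: horizontal filter into a temp buffer,
 * then vertical filter into dst. With offset 0 the taps are {128, 0}, so a
 * pass degenerates to a copy, which is what the beq skips above optimize. */
static void bilinear_predict(const unsigned char *src, int src_pitch,
                             int x_offset, int y_offset,
                             unsigned char *dst, int dst_pitch,
                             int W, int H) {
  unsigned char tmp[17 * 16];              /* up to (H+1) rows of W pixels */
  const int hf0 = 128 - 16 * x_offset, hf1 = 16 * x_offset;
  const int vf0 = 128 - 16 * y_offset, vf1 = 16 * y_offset;
  int r, c;
  for (r = 0; r < H + 1; r++) {            /* first pass: horizontal */
    for (c = 0; c < W; c++)
      tmp[r * W + c] =
          (unsigned char)((src[c] * hf0 + src[c + 1] * hf1 + 64) >> 7);
    src += src_pitch;
  }
  for (r = 0; r < H; r++) {                /* second pass: vertical */
    for (c = 0; c < W; c++)
      dst[c] = (unsigned char)((tmp[r * W + c] * vf0 +
                                tmp[(r + 1) * W + c] * vf1 + 64) >> 7);
    dst += dst_pitch;
  }
}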

View File

@@ -1,189 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
.globl short_idct4x4_ppc
.macro load_c V, LABEL, OFF, R0, R1
lis \R0, \LABEL@ha
la \R1, \LABEL@l(\R0)
lvx \V, \OFF, \R1
.endm
;# r3 short *input
;# r4 short *output
;# r5 int pitch
.align 2
short_idct4x4_ppc:
mfspr r11, 256 ;# get old VRSAVE
oris r12, r11, 0xfff8
mtspr 256, r12 ;# set VRSAVE
load_c v8, sinpi8sqrt2, 0, r9, r10
load_c v9, cospi8sqrt2minus1, 0, r9, r10
load_c v10, hi_hi, 0, r9, r10
load_c v11, lo_lo, 0, r9, r10
load_c v12, shift_16, 0, r9, r10
li r10, 16
lvx v0, 0, r3 ;# input ip[0], ip[ 4]
lvx v1, r10, r3 ;# input ip[8], ip[12]
;# first pass
vupkhsh v2, v0
vupkhsh v3, v1
vaddsws v6, v2, v3 ;# a1 = ip[0]+ip[8]
vsubsws v7, v2, v3 ;# b1 = ip[0]-ip[8]
vupklsh v0, v0
vmulosh v4, v0, v8
vsraw v4, v4, v12
vaddsws v4, v4, v0 ;# ip[ 4] * sin(pi/8) * sqrt(2)
vupklsh v1, v1
vmulosh v5, v1, v9
vsraw v5, v5, v12 ;# ip[12] * cos(pi/8) * sqrt(2)
vaddsws v5, v5, v1
vsubsws v4, v4, v5 ;# c1
vmulosh v3, v1, v8
vsraw v3, v3, v12
vaddsws v3, v3, v1 ;# ip[12] * sin(pi/8) * sqrt(2)
vmulosh v5, v0, v9
vsraw v5, v5, v12 ;# ip[ 4] * cos(pi/8) * sqrt(2)
vaddsws v5, v5, v0
vaddsws v3, v3, v5 ;# d1
vaddsws v0, v6, v3 ;# a1 + d1
vsubsws v3, v6, v3 ;# a1 - d1
vaddsws v1, v7, v4 ;# b1 + c1
vsubsws v2, v7, v4 ;# b1 - c1
;# transpose input
vmrghw v4, v0, v1 ;# a0 b0 a1 b1
vmrghw v5, v2, v3 ;# c0 d0 c1 d1
vmrglw v6, v0, v1 ;# a2 b2 a3 b3
vmrglw v7, v2, v3 ;# c2 d2 c3 d3
vperm v0, v4, v5, v10 ;# a0 b0 c0 d0
vperm v1, v4, v5, v11 ;# a1 b1 c1 d1
vperm v2, v6, v7, v10 ;# a2 b2 c2 d2
vperm v3, v6, v7, v11 ;# a3 b3 c3 d3
;# second pass
vaddsws v6, v0, v2 ;# a1 = ip[0]+ip[8]
vsubsws v7, v0, v2 ;# b1 = ip[0]-ip[8]
vmulosh v4, v1, v8
vsraw v4, v4, v12
vaddsws v4, v4, v1 ;# ip[ 4] * sin(pi/8) * sqrt(2)
vmulosh v5, v3, v9
vsraw v5, v5, v12 ;# ip[12] * cos(pi/8) * sqrt(2)
vaddsws v5, v5, v3
vsubsws v4, v4, v5 ;# c1
vmulosh v2, v3, v8
vsraw v2, v2, v12
vaddsws v2, v2, v3 ;# ip[12] * sin(pi/8) * sqrt(2)
vmulosh v5, v1, v9
vsraw v5, v5, v12 ;# ip[ 4] * cos(pi/8) * sqrt(2)
vaddsws v5, v5, v1
vaddsws v3, v2, v5 ;# d1
vaddsws v0, v6, v3 ;# a1 + d1
vsubsws v3, v6, v3 ;# a1 - d1
vaddsws v1, v7, v4 ;# b1 + c1
vsubsws v2, v7, v4 ;# b1 - c1
vspltish v6, 4
vspltish v7, 3
vpkswss v0, v0, v1
vpkswss v1, v2, v3
vaddshs v0, v0, v6
vaddshs v1, v1, v6
vsrah v0, v0, v7
vsrah v1, v1, v7
;# transpose output
vmrghh v2, v0, v1 ;# a0 c0 a1 c1 a2 c2 a3 c3
vmrglh v3, v0, v1 ;# b0 d0 b1 d1 b2 d2 b3 d3
vmrghh v0, v2, v3 ;# a0 b0 c0 d0 a1 b1 c1 d1
vmrglh v1, v2, v3 ;# a2 b2 c2 d2 a3 b3 c3 d3
stwu r1,-416(r1) ;# create space on the stack
stvx v0, 0, r1
lwz r6, 0(r1)
stw r6, 0(r4)
lwz r6, 4(r1)
stw r6, 4(r4)
add r4, r4, r5
lwz r6, 8(r1)
stw r6, 0(r4)
lwz r6, 12(r1)
stw r6, 4(r4)
add r4, r4, r5
stvx v1, 0, r1
lwz r6, 0(r1)
stw r6, 0(r4)
lwz r6, 4(r1)
stw r6, 4(r4)
add r4, r4, r5
lwz r6, 8(r1)
stw r6, 0(r4)
lwz r6, 12(r1)
stw r6, 4(r4)
addi r1, r1, 416 ;# recover stack
mtspr 256, r11 ;# reset old VRSAVE
blr
.align 4
sinpi8sqrt2:
.short 35468, 35468, 35468, 35468, 35468, 35468, 35468, 35468
.align 4
cospi8sqrt2minus1:
.short 20091, 20091, 20091, 20091, 20091, 20091, 20091, 20091
.align 4
shift_16:
.long 16, 16, 16, 16
.align 4
hi_hi:
.byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
.align 4
lo_lo:
.byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
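The constants above are the usual VP8-style 4x4 inverse-transform factors in Q16 fixed point. As a reading aid, here is a C sketch of the 1-D butterfly each pass performs on one row or column; the second pass additionally adds 4 and shifts right by 3 (the vspltish v6,4 / v7,3 sequence). The helper name is hypothetical and this is not the tree's C implementation.

/* 4-point inverse transform, one dimension; in[]/out[] hold one row/column. */
static void idct4_1d(const int in[4], int out[4]) {
  const int sinpi8sqrt2 = 35468;        /* sin(pi/8)*sqrt(2) in Q16 */
  const int cospi8sqrt2minus1 = 20091;  /* cos(pi/8)*sqrt(2)-1 in Q16 */
  int a1 = in[0] + in[2];
  int b1 = in[0] - in[2];
  int c1 = ((in[1] * sinpi8sqrt2) >> 16) -
           (in[3] + ((in[3] * cospi8sqrt2minus1) >> 16));
  int d1 = (in[1] + ((in[1] * cospi8sqrt2minus1) >> 16)) +
           ((in[3] * sinpi8sqrt2) >> 16);
  out[0] = a1 + d1;
  out[1] = b1 + c1;
  out[2] = b1 - c1;
  out[3] = a1 - d1;
}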

View File

@@ -1,127 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vp9/common/vp9_loopfilter.h"
#include "vp9/common/vp9_onyxc_int.h"
typedef void loop_filter_function_y_ppc
(
unsigned char *s, // source pointer
int p, // pitch
const signed char *flimit,
const signed char *limit,
const signed char *thresh
);
typedef void loop_filter_function_uv_ppc
(
unsigned char *u, // source pointer
unsigned char *v, // source pointer
int p, // pitch
const signed char *flimit,
const signed char *limit,
const signed char *thresh
);
typedef void loop_filter_function_s_ppc
(
unsigned char *s, // source pointer
int p, // pitch
const signed char *flimit
);
loop_filter_function_y_ppc mbloop_filter_horizontal_edge_y_ppc;
loop_filter_function_y_ppc mbloop_filter_vertical_edge_y_ppc;
loop_filter_function_y_ppc loop_filter_horizontal_edge_y_ppc;
loop_filter_function_y_ppc loop_filter_vertical_edge_y_ppc;
loop_filter_function_uv_ppc mbloop_filter_horizontal_edge_uv_ppc;
loop_filter_function_uv_ppc mbloop_filter_vertical_edge_uv_ppc;
loop_filter_function_uv_ppc loop_filter_horizontal_edge_uv_ppc;
loop_filter_function_uv_ppc loop_filter_vertical_edge_uv_ppc;
loop_filter_function_s_ppc loop_filter_simple_horizontal_edge_ppc;
loop_filter_function_s_ppc loop_filter_simple_vertical_edge_ppc;
// Horizontal MB filtering
void loop_filter_mbh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi) {
mbloop_filter_horizontal_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr);
if (u_ptr)
mbloop_filter_horizontal_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr);
}
void loop_filter_mbhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi) {
(void)u_ptr;
(void)v_ptr;
(void)uv_stride;
loop_filter_simple_horizontal_edge_ppc(y_ptr, y_stride, lfi->mbflim);
}
// Vertical MB Filtering
void loop_filter_mbv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi) {
mbloop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr);
if (u_ptr)
mbloop_filter_vertical_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr);
}
void loop_filter_mbvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi) {
(void)u_ptr;
(void)v_ptr;
(void)uv_stride;
loop_filter_simple_vertical_edge_ppc(y_ptr, y_stride, lfi->mbflim);
}
// Horizontal B Filtering
void loop_filter_bh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi) {
// These should all be done at once with one call, instead of 3
loop_filter_horizontal_edge_y_ppc(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
loop_filter_horizontal_edge_y_ppc(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
loop_filter_horizontal_edge_y_ppc(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
if (u_ptr)
loop_filter_horizontal_edge_uv_ppc(u_ptr + 4 * uv_stride, v_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr);
}
void loop_filter_bhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi) {
(void)u_ptr;
(void)v_ptr;
(void)uv_stride;
loop_filter_simple_horizontal_edge_ppc(y_ptr + 4 * y_stride, y_stride, lfi->flim);
loop_filter_simple_horizontal_edge_ppc(y_ptr + 8 * y_stride, y_stride, lfi->flim);
loop_filter_simple_horizontal_edge_ppc(y_ptr + 12 * y_stride, y_stride, lfi->flim);
}
// Vertical B Filtering
void loop_filter_bv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi) {
loop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->flim, lfi->lim, lfi->thr);
if (u_ptr)
loop_filter_vertical_edge_uv_ppc(u_ptr + 4, v_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr);
}
void loop_filter_bvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi) {
(void)u_ptr;
(void)v_ptr;
(void)uv_stride;
loop_filter_simple_vertical_edge_ppc(y_ptr + 4, y_stride, lfi->flim);
loop_filter_simple_vertical_edge_ppc(y_ptr + 8, y_stride, lfi->flim);
loop_filter_simple_vertical_edge_ppc(y_ptr + 12, y_stride, lfi->flim);
}

File diff not shown because it is too large. Load Diff

View File

@@ -1,59 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
.globl save_platform_context
.globl restore_platform_context
.macro W V P
stvx \V, 0, \P
addi \P, \P, 16
.endm
.macro R V P
lvx \V, 0, \P
addi \P, \P, 16
.endm
;# r3 context_ptr
.align 2
save_platform_context:
W v20, r3
W v21, r3
W v22, r3
W v23, r3
W v24, r3
W v25, r3
W v26, r3
W v27, r3
W v28, r3
W v29, r3
W v30, r3
W v31, r3
blr
;# r3 context_ptr
.align 2
restore_platform_context:
R v20, r3
R v21, r3
R v22, r3
R v23, r3
R v24, r3
R v25, r3
R v26, r3
R v27, r3
R v28, r3
R v29, r3
R v30, r3
R v31, r3
blr

View File

@@ -1,175 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
.globl recon4b_ppc
.globl recon2b_ppc
.globl recon_b_ppc
.macro row_of16 Diff Pred Dst Stride
lvx v1, 0, \Pred ;# v1 = pred = p0..p15
addi \Pred, \Pred, 16 ;# next pred
vmrghb v2, v0, v1 ;# v2 = 16-bit p0..p7
lvx v3, 0, \Diff ;# v3 = d0..d7
vaddshs v2, v2, v3 ;# v2 = r0..r7
vmrglb v1, v0, v1 ;# v1 = 16-bit p8..p15
lvx v3, r8, \Diff ;# v3 = d8..d15
addi \Diff, \Diff, 32 ;# next diff
vaddshs v3, v3, v1 ;# v3 = r8..r15
vpkshus v2, v2, v3 ;# v2 = 8-bit r0..r15
stvx v2, 0, \Dst ;# to dst
add \Dst, \Dst, \Stride ;# next dst
.endm
.text
.align 2
;# r3 = short *diff_ptr,
;# r4 = unsigned char *pred_ptr,
;# r5 = unsigned char *dst_ptr,
;# r6 = int stride
recon4b_ppc:
mfspr r0, 256 ;# get old VRSAVE
stw r0, -8(r1) ;# save old VRSAVE to stack
oris r0, r0, 0xf000
mtspr 256,r0 ;# set VRSAVE
vxor v0, v0, v0
li r8, 16
row_of16 r3, r4, r5, r6
row_of16 r3, r4, r5, r6
row_of16 r3, r4, r5, r6
row_of16 r3, r4, r5, r6
lwz r12, -8(r1) ;# restore old VRSAVE from stack
mtspr 256, r12 ;# reset old VRSAVE
blr
.macro two_rows_of8 Diff Pred Dst Stride write_first_four_pels
lvx v1, 0, \Pred ;# v1 = pred = p0..p15
vmrghb v2, v0, v1 ;# v2 = 16-bit p0..p7
lvx v3, 0, \Diff ;# v3 = d0..d7
vaddshs v2, v2, v3 ;# v2 = r0..r7
vmrglb v1, v0, v1 ;# v1 = 16-bit p8..p15
lvx v3, r8, \Diff ;# v3 = d8..d15
vaddshs v3, v3, v1 ;# v3 = r8..r15
vpkshus v2, v2, v3 ;# v2 = 8-bit r0..r15
stvx v2, 0, r10 ;# 2 rows to dst from buf
lwz r0, 0(r10)
.if \write_first_four_pels
stw r0, 0(\Dst)
.else
stwux r0, \Dst, \Stride
.endif
lwz r0, 4(r10)
stw r0, 4(\Dst)
lwz r0, 8(r10)
stwux r0, \Dst, \Stride ;# advance dst to next row
lwz r0, 12(r10)
stw r0, 4(\Dst)
.endm
.align 2
;# r3 = short *diff_ptr,
;# r4 = unsigned char *pred_ptr,
;# r5 = unsigned char *dst_ptr,
;# r6 = int stride
recon2b_ppc:
mfspr r0, 256 ;# get old VRSAVE
stw r0, -8(r1) ;# save old VRSAVE to stack
oris r0, r0, 0xf000
mtspr 256,r0 ;# set VRSAVE
vxor v0, v0, v0
li r8, 16
la r10, -48(r1) ;# buf
two_rows_of8 r3, r4, r5, r6, 1
addi r4, r4, 16; ;# next pred
addi r3, r3, 32; ;# next diff
two_rows_of8 r3, r4, r5, r6, 0
lwz r12, -8(r1) ;# restore old VRSAVE from stack
mtspr 256, r12 ;# reset old VRSAVE
blr
.macro get_two_diff_rows
stw r0, 0(r10)
lwz r0, 4(r3)
stw r0, 4(r10)
lwzu r0, 32(r3)
stw r0, 8(r10)
lwz r0, 4(r3)
stw r0, 12(r10)
lvx v3, 0, r10
.endm
.align 2
;# r3 = short *diff_ptr,
;# r4 = unsigned char *pred_ptr,
;# r5 = unsigned char *dst_ptr,
;# r6 = int stride
recon_b_ppc:
mfspr r0, 256 ;# get old VRSAVE
stw r0, -8(r1) ;# save old VRSAVE to stack
oris r0, r0, 0xf000
mtspr 256,r0 ;# set VRSAVE
vxor v0, v0, v0
la r10, -48(r1) ;# buf
lwz r0, 0(r4)
stw r0, 0(r10)
lwz r0, 16(r4)
stw r0, 4(r10)
lwz r0, 32(r4)
stw r0, 8(r10)
lwz r0, 48(r4)
stw r0, 12(r10)
lvx v1, 0, r10; ;# v1 = pred = p0..p15
lwz r0, 0(r3) ;# v3 = d0..d7
get_two_diff_rows
vmrghb v2, v0, v1; ;# v2 = 16-bit p0..p7
vaddshs v2, v2, v3; ;# v2 = r0..r7
lwzu r0, 32(r3) ;# v3 = d8..d15
get_two_diff_rows
vmrglb v1, v0, v1; ;# v1 = 16-bit p8..p15
vaddshs v3, v3, v1; ;# v3 = r8..r15
vpkshus v2, v2, v3; ;# v2 = 8-bit r0..r15
stvx v2, 0, r10; ;# 16 pels to dst from buf
lwz r0, 0(r10)
stw r0, 0(r5)
lwz r0, 4(r10)
stwux r0, r5, r6
lwz r0, 8(r10)
stwux r0, r5, r6
lwz r0, 12(r10)
stwx r0, r5, r6
lwz r12, -8(r1) ;# restore old VRSAVE from stack
mtspr 256, r12 ;# reset old VRSAVE
blr
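A plain-C sketch of recon_b_ppc for reference: add the 4x4 residual to the prediction and clamp to 8 bits. The strides for diff (16 shorts per row) and pred (16 bytes per row) are the ones hard-coded in the assembly above; the function name is hypothetical. recon2b and recon4b apply the same operation across 4 rows of 8 and 16 pixels respectively, which is what the wider vector loads above amount to.

static void recon_b(const short *diff, const unsigned char *pred,
                    unsigned char *dst, int stride) {
  int r, c;
  for (r = 0; r < 4; r++) {
    for (c = 0; c < 4; c++) {
      int v = pred[c] + diff[c];                 /* reconstruct one pixel */
      dst[c] = (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
    diff += 16;                                  /* diff rows are 16 shorts apart */
    pred += 16;                                  /* pred rows are 16 bytes apart */
    dst += stride;
  }
}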

View File

@@ -1,167 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vp9/common/vp9_loopfilter.h"
#include "recon.h"
#include "vp9/common/vp9_onyxc_int.h"
void (*vp8_short_idct4x4)(short *input, short *output, int pitch);
void (*vp8_short_idct4x4_1)(short *input, short *output, int pitch);
void (*vp8_dc_only_idct)(short input_dc, short *output, int pitch);
extern void (*vp9_post_proc_down_and_across)(unsigned char *src_ptr,
unsigned char *dst_ptr,
int src_pixels_per_line,
int dst_pixels_per_line,
int rows, int cols, int flimit);
extern void (*vp9_mbpost_proc_down)(unsigned char *dst, int pitch,
int rows, int cols, int flimit);
extern void vp9_mbpost_proc_down_c(unsigned char *dst, int pitch,
int rows, int cols, int flimit);
extern void (*vp9_mbpost_proc_across_ip)(unsigned char *src, int pitch,
int rows, int cols, int flimit);
extern void vp9_mbpost_proc_across_ip_c(unsigned char *src, int pitch,
int rows, int cols, int flimit);
extern void vp9_post_proc_down_and_across_c(unsigned char *src_ptr,
unsigned char *dst_ptr,
int src_pixels_per_line,
int dst_pixels_per_line,
int rows, int cols, int flimit);
void vp9_plane_add_noise_c(unsigned char *start,
unsigned int width, unsigned int height,
int pitch, int q, int a);
extern copy_mem_block_function *vp9_copy_mem16x16;
extern copy_mem_block_function *vp9_copy_mem8x8;
extern copy_mem_block_function *vp9_copy_mem8x4;
// PPC
extern subpixel_predict_function sixtap_predict_ppc;
extern subpixel_predict_function sixtap_predict8x4_ppc;
extern subpixel_predict_function sixtap_predict8x8_ppc;
extern subpixel_predict_function sixtap_predict16x16_ppc;
extern subpixel_predict_function bilinear_predict4x4_ppc;
extern subpixel_predict_function bilinear_predict8x4_ppc;
extern subpixel_predict_function bilinear_predict8x8_ppc;
extern subpixel_predict_function bilinear_predict16x16_ppc;
extern copy_mem_block_function copy_mem16x16_ppc;
void recon_b_ppc(short *diff_ptr, unsigned char *pred_ptr,
unsigned char *dst_ptr, int stride);
void recon2b_ppc(short *diff_ptr, unsigned char *pred_ptr,
unsigned char *dst_ptr, int stride);
void recon4b_ppc(short *diff_ptr, unsigned char *pred_ptr,
unsigned char *dst_ptr, int stride);
extern void short_idct4x4_ppc(short *input, short *output, int pitch);
// Generic C
extern subpixel_predict_function vp9_sixtap_predict_c;
extern subpixel_predict_function vp9_sixtap_predict8x4_c;
extern subpixel_predict_function vp9_sixtap_predict8x8_c;
extern subpixel_predict_function vp9_sixtap_predict16x16_c;
extern subpixel_predict_function vp9_bilinear_predict4x4_c;
extern subpixel_predict_function vp9_bilinear_predict8x4_c;
extern subpixel_predict_function vp9_bilinear_predict8x8_c;
extern subpixel_predict_function vp9_bilinear_predict16x16_c;
extern copy_mem_block_function vp9_copy_mem16x16_c;
extern copy_mem_block_function vp9_copy_mem8x8_c;
extern copy_mem_block_function vp9_copy_mem8x4_c;
void vp9_recon_b_c(short *diff_ptr, unsigned char *pred_ptr,
unsigned char *dst_ptr, int stride);
void vp9_recon2b_c(short *diff_ptr, unsigned char *pred_ptr,
unsigned char *dst_ptr, int stride);
void vp9_recon4b_c(short *diff_ptr, unsigned char *pred_ptr,
unsigned char *dst_ptr, int stride);
extern void vp9_short_idct4x4_1_c(short *input, short *output, int pitch);
extern void vp9_short_idct4x4_c(short *input, short *output, int pitch);
extern void vp8_dc_only_idct_c(short input_dc, short *output, int pitch);
// PPC
extern loop_filter_block_function loop_filter_mbv_ppc;
extern loop_filter_block_function loop_filter_bv_ppc;
extern loop_filter_block_function loop_filter_mbh_ppc;
extern loop_filter_block_function loop_filter_bh_ppc;
extern loop_filter_block_function loop_filter_mbvs_ppc;
extern loop_filter_block_function loop_filter_bvs_ppc;
extern loop_filter_block_function loop_filter_mbhs_ppc;
extern loop_filter_block_function loop_filter_bhs_ppc;
// Generic C
extern loop_filter_block_function vp9_loop_filter_mbv_c;
extern loop_filter_block_function vp9_loop_filter_bv_c;
extern loop_filter_block_function vp9_loop_filter_mbh_c;
extern loop_filter_block_function vp9_loop_filter_bh_c;
extern loop_filter_block_function vp9_loop_filter_mbvs_c;
extern loop_filter_block_function vp9_loop_filter_bvs_c;
extern loop_filter_block_function vp9_loop_filter_mbhs_c;
extern loop_filter_block_function vp9_loop_filter_bhs_c;
extern loop_filter_block_function *vp8_lf_mbvfull;
extern loop_filter_block_function *vp8_lf_mbhfull;
extern loop_filter_block_function *vp8_lf_bvfull;
extern loop_filter_block_function *vp8_lf_bhfull;
extern loop_filter_block_function *vp8_lf_mbvsimple;
extern loop_filter_block_function *vp8_lf_mbhsimple;
extern loop_filter_block_function *vp8_lf_bvsimple;
extern loop_filter_block_function *vp8_lf_bhsimple;
void vp9_clear_c(void) {
}
void vp9_machine_specific_config(void) {
// Pure C:
vp9_clear_system_state = vp9_clear_c;
vp9_recon_b = vp9_recon_b_c;
vp9_recon4b = vp9_recon4b_c;
vp9_recon2b = vp9_recon2b_c;
vp9_bilinear_predict16x16 = bilinear_predict16x16_ppc;
vp9_bilinear_predict8x8 = bilinear_predict8x8_ppc;
vp9_bilinear_predict8x4 = bilinear_predict8x4_ppc;
vp8_bilinear_predict = bilinear_predict4x4_ppc;
vp9_sixtap_predict16x16 = sixtap_predict16x16_ppc;
vp9_sixtap_predict8x8 = sixtap_predict8x8_ppc;
vp9_sixtap_predict8x4 = sixtap_predict8x4_ppc;
vp9_sixtap_predict = sixtap_predict_ppc;
vp8_short_idct4x4_1 = vp9_short_idct4x4_1_c;
vp8_short_idct4x4 = short_idct4x4_ppc;
vp8_dc_only_idct = vp8_dc_only_idct_c;
vp8_lf_mbvfull = loop_filter_mbv_ppc;
vp8_lf_bvfull = loop_filter_bv_ppc;
vp8_lf_mbhfull = loop_filter_mbh_ppc;
vp8_lf_bhfull = loop_filter_bh_ppc;
vp8_lf_mbvsimple = loop_filter_mbvs_ppc;
vp8_lf_bvsimple = loop_filter_bvs_ppc;
vp8_lf_mbhsimple = loop_filter_mbhs_ppc;
vp8_lf_bhsimple = loop_filter_bhs_ppc;
vp9_post_proc_down_and_across = vp9_post_proc_down_and_across_c;
vp9_mbpost_proc_down = vp9_mbpost_proc_down_c;
vp9_mbpost_proc_across_ip = vp9_mbpost_proc_across_ip_c;
vp9_plane_add_noise = vp9_plane_add_noise_c;
vp9_copy_mem16x16 = copy_mem16x16_ppc;
vp9_copy_mem8x8 = vp9_copy_mem8x8_c;
vp9_copy_mem8x4 = vp9_copy_mem8x4_c;
}

View File

@@ -1,155 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vp9/encoder/vp9_variance.h"
#include "vp9/encoder/vp9_onyx_int.h"
SADFunction *vp9_sad16x16;
SADFunction *vp9_sad16x8;
SADFunction *vp9_sad8x16;
SADFunction *vp9_sad8x8;
SADFunction *vp9_sad4x4;
variance_function *vp9_variance4x4;
variance_function *vp9_variance8x8;
variance_function *vp9_variance8x16;
variance_function *vp9_variance16x8;
variance_function *vp9_variance16x16;
variance_function *vp9_mse16x16;
sub_pixel_variance_function *vp9_sub_pixel_variance4x4;
sub_pixel_variance_function *vp9_sub_pixel_variance8x8;
sub_pixel_variance_function *vp9_sub_pixel_variance8x16;
sub_pixel_variance_function *vp9_sub_pixel_variance16x8;
sub_pixel_variance_function *vp9_sub_pixel_variance16x16;
int (*vp9_block_error)(short *coeff, short *dqcoeff);
int (*vp9_mbblock_error)(MACROBLOCK *mb, int dc);
int (*vp9_mbuverror)(MACROBLOCK *mb);
unsigned int (*vp9_get_mb_ss)(short *);
void (*vp9_short_fdct4x4)(short *input, short *output, int pitch);
void (*vp9_short_fdct8x4)(short *input, short *output, int pitch);
void (*vp8_fast_fdct4x4)(short *input, short *output, int pitch);
void (*vp8_fast_fdct8x4)(short *input, short *output, int pitch);
void (*short_walsh4x4)(short *input, short *output, int pitch);
void (*vp9_subtract_b)(BLOCK *be, BLOCKD *bd, int pitch);
void (*vp9_subtract_mby)(short *diff, unsigned char *src, unsigned char *pred, int stride);
void (*vp9_subtract_mbuv)(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
void (*vp8_fast_quantize_b)(BLOCK *b, BLOCKD *d);
// c imports
extern int block_error_c(short *coeff, short *dqcoeff);
extern int vp9_mbblock_error_c(MACROBLOCK *mb, int dc);
extern int vp9_mbuverror_c(MACROBLOCK *mb);
extern unsigned int vp8_get8x8var_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
extern void short_fdct4x4_c(short *input, short *output, int pitch);
extern void short_fdct8x4_c(short *input, short *output, int pitch);
extern void vp9_short_walsh4x4_c(short *input, short *output, int pitch);
extern void vp9_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch);
extern void subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, int stride);
extern void subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
extern void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d);
extern SADFunction sad16x16_c;
extern SADFunction sad16x8_c;
extern SADFunction sad8x16_c;
extern SADFunction sad8x8_c;
extern SADFunction sad4x4_c;
extern variance_function variance16x16_c;
extern variance_function variance8x16_c;
extern variance_function variance16x8_c;
extern variance_function variance8x8_c;
extern variance_function variance4x4_c;
extern variance_function mse16x16_c;
extern sub_pixel_variance_function sub_pixel_variance4x4_c;
extern sub_pixel_variance_function sub_pixel_variance8x8_c;
extern sub_pixel_variance_function sub_pixel_variance8x16_c;
extern sub_pixel_variance_function sub_pixel_variance16x8_c;
extern sub_pixel_variance_function sub_pixel_variance16x16_c;
extern unsigned int vp9_get_mb_ss_c(short *);
// ppc
extern int vp9_block_error_ppc(short *coeff, short *dqcoeff);
extern void vp9_short_fdct4x4_ppc(short *input, short *output, int pitch);
extern void vp9_short_fdct8x4_ppc(short *input, short *output, int pitch);
extern void vp9_subtract_mby_ppc(short *diff, unsigned char *src, unsigned char *pred, int stride);
extern void vp9_subtract_mbuv_ppc(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
extern SADFunction vp9_sad16x16_ppc;
extern SADFunction vp9_sad16x8_ppc;
extern SADFunction vp9_sad8x16_ppc;
extern SADFunction vp9_sad8x8_ppc;
extern SADFunction vp9_sad4x4_ppc;
extern variance_function vp9_variance16x16_ppc;
extern variance_function vp9_variance8x16_ppc;
extern variance_function vp9_variance16x8_ppc;
extern variance_function vp9_variance8x8_ppc;
extern variance_function vp9_variance4x4_ppc;
extern variance_function vp9_mse16x16_ppc;
extern sub_pixel_variance_function vp9_sub_pixel_variance4x4_ppc;
extern sub_pixel_variance_function vp9_sub_pixel_variance8x8_ppc;
extern sub_pixel_variance_function vp9_sub_pixel_variance8x16_ppc;
extern sub_pixel_variance_function vp9_sub_pixel_variance16x8_ppc;
extern sub_pixel_variance_function vp9_sub_pixel_variance16x16_ppc;
extern unsigned int vp8_get8x8var_ppc(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
extern unsigned int vp8_get16x16var_ppc(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
void vp9_cmachine_specific_config(void) {
// Pure C:
vp9_mbuverror = vp9_mbuverror_c;
vp8_fast_quantize_b = vp8_fast_quantize_b_c;
vp9_short_fdct4x4 = vp9_short_fdct4x4_ppc;
vp9_short_fdct8x4 = vp9_short_fdct8x4_ppc;
vp8_fast_fdct4x4 = vp9_short_fdct4x4_ppc;
vp8_fast_fdct8x4 = vp9_short_fdct8x4_ppc;
short_walsh4x4 = vp9_short_walsh4x4_c;
vp9_variance4x4 = vp9_variance4x4_ppc;
vp9_variance8x8 = vp9_variance8x8_ppc;
vp9_variance8x16 = vp9_variance8x16_ppc;
vp9_variance16x8 = vp9_variance16x8_ppc;
vp9_variance16x16 = vp9_variance16x16_ppc;
vp9_mse16x16 = vp9_mse16x16_ppc;
vp9_sub_pixel_variance4x4 = vp9_sub_pixel_variance4x4_ppc;
vp9_sub_pixel_variance8x8 = vp9_sub_pixel_variance8x8_ppc;
vp9_sub_pixel_variance8x16 = vp9_sub_pixel_variance8x16_ppc;
vp9_sub_pixel_variance16x8 = vp9_sub_pixel_variance16x8_ppc;
vp9_sub_pixel_variance16x16 = vp9_sub_pixel_variance16x16_ppc;
vp9_get_mb_ss = vp9_get_mb_ss_c;
vp9_sad16x16 = vp9_sad16x16_ppc;
vp9_sad16x8 = vp9_sad16x8_ppc;
vp9_sad8x16 = vp9_sad8x16_ppc;
vp9_sad8x8 = vp9_sad8x8_ppc;
vp9_sad4x4 = vp9_sad4x4_ppc;
vp9_block_error = vp9_block_error_ppc;
vp9_mbblock_error = vp9_mbblock_error_c;
vp9_subtract_b = vp9_subtract_b_c;
vp9_subtract_mby = vp9_subtract_mby_ppc;
vp9_subtract_mbuv = vp9_subtract_mbuv_ppc;
}

View File

@@ -1,153 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
.globl vp8_subtract_mbuv_ppc
.globl vp8_subtract_mby_ppc
;# r3 short *diff
;# r4 unsigned char *usrc
;# r5 unsigned char *vsrc
;# r6 unsigned char *pred
;# r7 int stride
vp8_subtract_mbuv_ppc:
mfspr r11, 256 ;# get old VRSAVE
oris r12, r11, 0xf000
mtspr 256, r12 ;# set VRSAVE
li r9, 256
add r3, r3, r9
add r3, r3, r9
add r6, r6, r9
li r10, 16
li r9, 4
mtctr r9
vspltisw v0, 0
mbu_loop:
lvsl v5, 0, r4 ;# permutate value for alignment
lvx v1, 0, r4 ;# src
lvx v2, 0, r6 ;# pred
add r4, r4, r7
addi r6, r6, 16
vperm v1, v1, v0, v5
vmrghb v3, v0, v1 ;# unpack high src to short
vmrghb v4, v0, v2 ;# unpack high pred to short
lvsl v5, 0, r4 ;# permutate value for alignment
lvx v1, 0, r4 ;# src
add r4, r4, r7
vsubshs v3, v3, v4
stvx v3, 0, r3 ;# store out diff
vperm v1, v1, v0, v5
vmrghb v3, v0, v1 ;# unpack high src to short
vmrglb v4, v0, v2 ;# unpack high pred to short
vsubshs v3, v3, v4
stvx v3, r10, r3 ;# store out diff
addi r3, r3, 32
bdnz mbu_loop
mtctr r9
mbv_loop:
lvsl v5, 0, r5 ;# permutate value for alignment
lvx v1, 0, r5 ;# src
lvx v2, 0, r6 ;# pred
add r5, r5, r7
addi r6, r6, 16
vperm v1, v1, v0, v5
vmrghb v3, v0, v1 ;# unpack high src to short
vmrghb v4, v0, v2 ;# unpack high pred to short
lvsl v5, 0, r5 ;# permutate value for alignment
lvx v1, 0, r5 ;# src
add r5, r5, r7
vsubshs v3, v3, v4
stvx v3, 0, r3 ;# store out diff
vperm v1, v1, v0, v5
vmrghb v3, v0, v1 ;# unpack high src to short
vmrglb v4, v0, v2 ;# unpack high pred to short
vsubshs v3, v3, v4
stvx v3, r10, r3 ;# store out diff
addi r3, r3, 32
bdnz mbv_loop
mtspr 256, r11 ;# reset old VRSAVE
blr
;# r3 short *diff
;# r4 unsigned char *src
;# r5 unsigned char *pred
;# r6 int stride
vp8_subtract_mby_ppc:
mfspr r11, 256 ;# get old VRSAVE
oris r12, r11, 0xf800
mtspr 256, r12 ;# set VRSAVE
li r10, 16
mtctr r10
vspltisw v0, 0
mby_loop:
lvx v1, 0, r4 ;# src
lvx v2, 0, r5 ;# pred
add r4, r4, r6
addi r5, r5, 16
vmrghb v3, v0, v1 ;# unpack high src to short
vmrghb v4, v0, v2 ;# unpack high pred to short
vsubshs v3, v3, v4
stvx v3, 0, r3 ;# store out diff
vmrglb v3, v0, v1 ;# unpack low src to short
vmrglb v4, v0, v2 ;# unpack low pred to short
vsubshs v3, v3, v4
stvx v3, r10, r3 ;# store out diff
addi r3, r3, 32
bdnz mby_loop
mtspr 256, r11 ;# reset old VRSAVE
blr
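For reference, a plain-C sketch of vp8_subtract_mby_ppc: compute the 16x16 luma residual with the diff buffer laid out 16 shorts per row. The UV routine above does the same for two 8x8 planes, starting 512 bytes (256 shorts) into the diff buffer, which is what the two add r3, r3, r9 instructions set up. The helper name is hypothetical.

static void subtract_mby(short *diff, const unsigned char *src,
                         const unsigned char *pred, int stride) {
  int r, c;
  for (r = 0; r < 16; r++) {
    for (c = 0; c < 16; c++)
      diff[c] = (short)(src[c] - pred[c]);   /* residual = source - prediction */
    diff += 16;
    src += stride;
    pred += 16;                              /* prediction rows are 16 bytes apart */
  }
}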

View File

@@ -1,205 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
.globl vp8_short_fdct4x4_ppc
.globl vp8_short_fdct8x4_ppc
.macro load_c V, LABEL, OFF, R0, R1
lis \R0, \LABEL@ha
la \R1, \LABEL@l(\R0)
lvx \V, \OFF, \R1
.endm
;# Forward and inverse DCTs are nearly identical; the only differences are
;# in normalization (fwd is twice unitary, inv is half unitary)
;# and that they are of course transposes of each other.
;#
;# The following three macros accomplish most of the implementation and
;# are used only by ppc_idct.c and ppc_fdct.c.
.macro prologue
mfspr r11, 256 ;# get old VRSAVE
oris r12, r11, 0xfffc
mtspr 256, r12 ;# set VRSAVE
stwu r1,-32(r1) ;# create space on the stack
li r6, 16
load_c v0, dct_tab, 0, r9, r10
lvx v1, r6, r10
addi r10, r10, 32
lvx v2, 0, r10
lvx v3, r6, r10
load_c v4, ppc_dctperm_tab, 0, r9, r10
load_c v5, ppc_dctperm_tab, r6, r9, r10
load_c v6, round_tab, 0, r10, r9
.endm
.macro epilogue
addi r1, r1, 32 ;# recover stack
mtspr 256, r11 ;# reset old VRSAVE
.endm
;# Do horiz xf on two rows of coeffs v8 = a0 a1 a2 a3 b0 b1 b2 b3.
;# a/A are the even rows 0,2 b/B are the odd rows 1,3
;# For fwd transform, indices are horizontal positions, then frequencies.
;# For inverse transform, frequencies then positions.
;# The two resulting A0..A3 B0..B3 are later combined
;# and vertically transformed.
.macro two_rows_horiz Dst
vperm v9, v8, v8, v4 ;# v9 = a2 a3 a0 a1 b2 b3 b0 b1
vmsumshm v10, v0, v8, v6
vmsumshm v10, v1, v9, v10
vsraw v10, v10, v7 ;# v10 = A0 A1 B0 B1
vmsumshm v11, v2, v8, v6
vmsumshm v11, v3, v9, v11
vsraw v11, v11, v7 ;# v11 = A2 A3 B2 B3
vpkuwum v10, v10, v11 ;# v10 = A0 A1 B0 B1 A2 A3 B2 B3
vperm \Dst, v10, v10, v5 ;# Dest = A0 B0 A1 B1 A2 B2 A3 B3
.endm
;# Vertical xf on two rows. DCT values in comments are for inverse transform;
;# forward transform uses transpose.
.macro two_rows_vert Ceven, Codd
vspltw v8, \Ceven, 0 ;# v8 = c00 c10 or c02 c12 four times
vspltw v9, \Codd, 0 ;# v9 = c20 c30 or c22 c32 ""
vmsumshm v8, v8, v12, v6
vmsumshm v8, v9, v13, v8
vsraw v10, v8, v7
vspltw v8, \Codd, 1 ;# v8 = c01 c11 or c03 c13
vspltw v9, \Ceven, 1 ;# v9 = c21 c31 or c23 c33
vmsumshm v8, v8, v12, v6
vmsumshm v8, v9, v13, v8
vsraw v8, v8, v7
vpkuwum v8, v10, v8 ;# v8 = rows 0,1 or 2,3
.endm
.macro two_rows_h Dest
stw r0, 0(r8)
lwz r0, 4(r3)
stw r0, 4(r8)
lwzux r0, r3,r5
stw r0, 8(r8)
lwz r0, 4(r3)
stw r0, 12(r8)
lvx v8, 0,r8
two_rows_horiz \Dest
.endm
.align 2
;# r3 short *input
;# r4 short *output
;# r5 int pitch
vp8_short_fdct4x4_ppc:
prologue
vspltisw v7, 14 ;# == 14, fits in 5 signed bits
addi r8, r1, 0
lwz r0, 0(r3)
two_rows_h v12 ;# v12 = H00 H10 H01 H11 H02 H12 H03 H13
lwzux r0, r3, r5
two_rows_h v13 ;# v13 = H20 H30 H21 H31 H22 H32 H23 H33
lvx v6, r6, r9 ;# v6 = Vround
vspltisw v7, -16 ;# == 16 == -16, only low 5 bits matter
two_rows_vert v0, v1
stvx v8, 0, r4
two_rows_vert v2, v3
stvx v8, r6, r4
epilogue
blr
.align 2
;# r3 short *input
;# r4 short *output
;# r5 int pitch
vp8_short_fdct8x4_ppc:
prologue
vspltisw v7, 14 ;# == 14, fits in 5 signed bits
addi r8, r1, 0
addi r10, r3, 0
lwz r0, 0(r3)
two_rows_h v12 ;# v12 = H00 H10 H01 H11 H02 H12 H03 H13
lwzux r0, r3, r5
two_rows_h v13 ;# v13 = H20 H30 H21 H31 H22 H32 H23 H33
lvx v6, r6, r9 ;# v6 = Vround
vspltisw v7, -16 ;# == 16 == -16, only low 5 bits matter
two_rows_vert v0, v1
stvx v8, 0, r4
two_rows_vert v2, v3
stvx v8, r6, r4
;# Next block
addi r3, r10, 8
addi r4, r4, 32
lvx v6, 0, r9 ;# v6 = Hround
vspltisw v7, 14 ;# == 14, fits in 5 signed bits
addi r8, r1, 0
lwz r0, 0(r3)
two_rows_h v12 ;# v12 = H00 H10 H01 H11 H02 H12 H03 H13
lwzux r0, r3, r5
two_rows_h v13 ;# v13 = H20 H30 H21 H31 H22 H32 H23 H33
lvx v6, r6, r9 ;# v6 = Vround
vspltisw v7, -16 ;# == 16 == -16, only low 5 bits matter
two_rows_vert v0, v1
stvx v8, 0, r4
two_rows_vert v2, v3
stvx v8, r6, r4
epilogue
blr
.data
.align 4
ppc_dctperm_tab:
.byte 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11
.byte 0,1,4,5, 2,3,6,7, 8,9,12,13, 10,11,14,15
.align 4
dct_tab:
.short 23170, 23170,-12540,-30274, 23170, 23170,-12540,-30274
.short 23170, 23170, 30274, 12540, 23170, 23170, 30274, 12540
.short 23170,-23170, 30274,-12540, 23170,-23170, 30274,-12540
.short -23170, 23170, 12540,-30274,-23170, 23170, 12540,-30274
.align 4
round_tab:
.long (1 << (14-1)), (1 << (14-1)), (1 << (14-1)), (1 << (14-1))
.long (1 << (16-1)), (1 << (16-1)), (1 << (16-1)), (1 << (16-1))
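The dct_tab/round_tab constants encode a plain 4-point DCT-II in Q15 (23170 ~ cos(pi/4)*2^15, 30274 ~ cos(pi/8)*2^15, 12540 ~ sin(pi/8)*2^15). As a reading aid, here is a C sketch of the horizontal 1-D pass that the vmsumshm sequences implement; the vertical pass is the same butterfly but rounds with 1<<15 and shifts by 16. This is a sketch derived from the tables above, not the tree's C code, and the helper name is hypothetical.

static void fdct4_1d_row(const short a[4], short out[4]) {
  const int c2 = 23170, c1 = 30274, c3 = 12540;   /* Q15 cosine factors */
  const int rnd = 1 << 13;                        /* Hround from round_tab */
  out[0] = (short)((c2 * (a[0] + a[1] + a[2] + a[3]) + rnd) >> 14);
  out[1] = (short)((c1 * a[0] + c3 * a[1] - c3 * a[2] - c1 * a[3] + rnd) >> 14);
  out[2] = (short)((c2 * (a[0] - a[1] - a[2] + a[3]) + rnd) >> 14);
  out[3] = (short)((c3 * a[0] - c1 * a[1] + c1 * a[2] - c3 * a[3] + rnd) >> 14);
}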

View File

@@ -1,51 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
.globl vp8_block_error_ppc
.align 2
;# r3 short *Coeff
;# r4 short *dqcoeff
vp8_block_error_ppc:
mfspr r11, 256 ;# get old VRSAVE
oris r12, r11, 0xf800
mtspr 256, r12 ;# set VRSAVE
stwu r1,-32(r1) ;# create space on the stack
stw r5, 12(r1) ;# transfer dc to vector register
lvx v0, 0, r3 ;# Coeff
lvx v1, 0, r4 ;# dqcoeff
li r10, 16
vspltisw v3, 0
vsubshs v0, v0, v1
vmsumshm v2, v0, v0, v3 ;# multiply differences
lvx v0, r10, r3 ;# Coeff
lvx v1, r10, r4 ;# dqcoeff
vsubshs v0, v0, v1
vmsumshm v1, v0, v0, v2 ;# multiply differences
vsumsws v1, v1, v3 ;# sum up
stvx v1, 0, r1
lwz r3, 12(r1) ;# return value
addi r1, r1, 32 ;# recover stack
mtspr 256, r11 ;# reset old VRSAVE
blr
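A plain-C sketch of vp8_block_error_ppc for reference: the sum of squared differences between the 16 coefficients and their dequantized values, which is what the two vmsumshm accumulations above compute. The helper name is hypothetical.

static int block_error(const short *coeff, const short *dqcoeff) {
  int i, error = 0;
  for (i = 0; i < 16; i++) {
    int d = coeff[i] - dqcoeff[i];
    error += d * d;                    /* accumulate squared difference */
  }
  return error;
}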

View File

@@ -1,277 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
.globl vp8_sad16x16_ppc
.globl vp8_sad16x8_ppc
.globl vp8_sad8x16_ppc
.globl vp8_sad8x8_ppc
.globl vp8_sad4x4_ppc
.macro load_aligned_16 V R O
lvsl v3, 0, \R ;# permutate value for alignment
lvx v1, 0, \R
lvx v2, \O, \R
vperm \V, v1, v2, v3
.endm
.macro prologue
mfspr r11, 256 ;# get old VRSAVE
oris r12, r11, 0xffc0
mtspr 256, r12 ;# set VRSAVE
stwu r1, -32(r1) ;# create space on the stack
li r10, 16 ;# load offset and loop counter
vspltisw v8, 0 ;# zero out total to start
.endm
.macro epilogue
addi r1, r1, 32 ;# recover stack
mtspr 256, r11 ;# reset old VRSAVE
.endm
.macro SAD_16
;# v6 = abs (v4 - v5)
vsububs v6, v4, v5
vsububs v7, v5, v4
vor v6, v6, v7
;# v8 += abs (v4 - v5)
vsum4ubs v8, v6, v8
.endm
.macro sad_16_loop loop_label
lvsl v3, 0, r5 ;# only needs to be done once per block
;# preload a line of data before getting into the loop
lvx v4, 0, r3
lvx v1, 0, r5
lvx v2, r10, r5
add r5, r5, r6
add r3, r3, r4
vperm v5, v1, v2, v3
.align 4
\loop_label:
;# compute difference on first row
vsububs v6, v4, v5
vsububs v7, v5, v4
;# load up next set of data
lvx v9, 0, r3
lvx v1, 0, r5
lvx v2, r10, r5
;# perform abs() of difference
vor v6, v6, v7
add r3, r3, r4
;# add to the running tally
vsum4ubs v8, v6, v8
;# now onto the next line
vperm v5, v1, v2, v3
add r5, r5, r6
lvx v4, 0, r3
;# compute difference on second row
vsububs v6, v9, v5
lvx v1, 0, r5
vsububs v7, v5, v9
lvx v2, r10, r5
vor v6, v6, v7
add r3, r3, r4
vsum4ubs v8, v6, v8
vperm v5, v1, v2, v3
add r5, r5, r6
bdnz \loop_label
vspltisw v7, 0
vsumsws v8, v8, v7
stvx v8, 0, r1
lwz r3, 12(r1)
.endm
.macro sad_8_loop loop_label
.align 4
\loop_label:
;# only one of the inputs should need to be aligned.
load_aligned_16 v4, r3, r10
load_aligned_16 v5, r5, r10
;# move onto the next line
add r3, r3, r4
add r5, r5, r6
;# only one of the inputs should need to be aligned.
load_aligned_16 v6, r3, r10
load_aligned_16 v7, r5, r10
;# move onto the next line
add r3, r3, r4
add r5, r5, r6
vmrghb v4, v4, v6
vmrghb v5, v5, v7
SAD_16
bdnz \loop_label
vspltisw v7, 0
vsumsws v8, v8, v7
stvx v8, 0, r1
lwz r3, 12(r1)
.endm
.align 2
;# r3 unsigned char *src_ptr
;# r4 int src_stride
;# r5 unsigned char *ref_ptr
;# r6 int ref_stride
;#
;# r3 return value
vp8_sad16x16_ppc:
prologue
li r9, 8
mtctr r9
sad_16_loop sad16x16_loop
epilogue
blr
.align 2
;# r3 unsigned char *src_ptr
;# r4 int src_stride
;# r5 unsigned char *ref_ptr
;# r6 int ref_stride
;#
;# r3 return value
vp8_sad16x8_ppc:
prologue
li r9, 4
mtctr r9
sad_16_loop sad16x8_loop
epilogue
blr
.align 2
;# r3 unsigned char *src_ptr
;# r4 int src_stride
;# r5 unsigned char *ref_ptr
;# r6 int ref_stride
;#
;# r3 return value
vp8_sad8x16_ppc:
prologue
li r9, 8
mtctr r9
sad_8_loop sad8x16_loop
epilogue
blr
.align 2
;# r3 unsigned char *src_ptr
;# r4 int src_stride
;# r5 unsigned char *ref_ptr
;# r6 int ref_stride
;#
;# r3 return value
vp8_sad8x8_ppc:
prologue
li r9, 4
mtctr r9
sad_8_loop sad8x8_loop
epilogue
blr
.macro transfer_4x4 I P
lwz r0, 0(\I)
add \I, \I, \P
lwz r7, 0(\I)
add \I, \I, \P
lwz r8, 0(\I)
add \I, \I, \P
lwz r9, 0(\I)
stw r0, 0(r1)
stw r7, 4(r1)
stw r8, 8(r1)
stw r9, 12(r1)
.endm
.align 2
;# r3 unsigned char *src_ptr
;# r4 int src_stride
;# r5 unsigned char *ref_ptr
;# r6 int ref_stride
;#
;# r3 return value
vp8_sad4x4_ppc:
prologue
transfer_4x4 r3, r4
lvx v4, 0, r1
transfer_4x4 r5, r6
lvx v5, 0, r1
vspltisw v8, 0 ;# zero out total to start
;# v6 = abs (v4 - v5)
vsububs v6, v4, v5
vsububs v7, v5, v4
vor v6, v6, v7
;# v8 += abs (v4 - v5)
vsum4ubs v7, v6, v8
vsumsws v7, v7, v8
stvx v7, 0, r1
lwz r3, 12(r1)
epilogue
blr
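A plain-C sketch of the SAD routines above: the sum of absolute differences over a W x H block, with W and H taken from the entry-point name (4, 8 or 16). The helper name is hypothetical.

static unsigned int sad(const unsigned char *src, int src_stride,
                        const unsigned char *ref, int ref_stride,
                        int W, int H) {
  unsigned int total = 0;
  int r, c;
  for (r = 0; r < H; r++) {
    for (c = 0; c < W; c++) {
      int d = src[c] - ref[c];
      total += (unsigned int)(d < 0 ? -d : d);   /* |src - ref| */
    }
    src += src_stride;
    ref += ref_stride;
  }
  return total;
}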

View File

@@ -1,375 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
.globl vp8_get8x8var_ppc
.globl vp8_get16x16var_ppc
.globl vp8_mse16x16_ppc
.globl vp9_variance16x16_ppc
.globl vp9_variance16x8_ppc
.globl vp9_variance8x16_ppc
.globl vp9_variance8x8_ppc
.globl vp9_variance4x4_ppc
.macro load_aligned_16 V R O
lvsl v3, 0, \R ;# permutate value for alignment
lvx v1, 0, \R
lvx v2, \O, \R
vperm \V, v1, v2, v3
.endm
.macro prologue
mfspr r11, 256 ;# get old VRSAVE
oris r12, r11, 0xffc0
mtspr 256, r12 ;# set VRSAVE
stwu r1, -32(r1) ;# create space on the stack
li r10, 16 ;# load offset and loop counter
vspltisw v7, 0 ;# zero for merging
vspltisw v8, 0 ;# zero out total to start
vspltisw v9, 0 ;# zero out total for dif^2
.endm
.macro epilogue
addi r1, r1, 32 ;# recover stack
mtspr 256, r11 ;# reset old VRSAVE
.endm
.macro compute_sum_sse
;# Compute sum first. Unpack so that signed subtract
;# can be used. Only have a halfword signed
;# subtract. Do high, then low.
vmrghb v2, v7, v4
vmrghb v3, v7, v5
vsubshs v2, v2, v3
vsum4shs v8, v2, v8
vmrglb v2, v7, v4
vmrglb v3, v7, v5
vsubshs v2, v2, v3
vsum4shs v8, v2, v8
;# Now compute sse.
vsububs v2, v4, v5
vsububs v3, v5, v4
vor v2, v2, v3
vmsumubm v9, v2, v2, v9
.endm
.macro variance_16 DS loop_label store_sum
\loop_label:
;# only one of the inputs should need to be aligned.
load_aligned_16 v4, r3, r10
load_aligned_16 v5, r5, r10
;# move onto the next line
add r3, r3, r4
add r5, r5, r6
compute_sum_sse
bdnz \loop_label
vsumsws v8, v8, v7
vsumsws v9, v9, v7
stvx v8, 0, r1
lwz r3, 12(r1)
stvx v9, 0, r1
lwz r4, 12(r1)
.if \store_sum
stw r3, 0(r8) ;# sum
.endif
stw r4, 0(r7) ;# sse
mullw r3, r3, r3 ;# sum*sum
srawi r3, r3, \DS ;# (sum*sum) >> DS
subf r3, r3, r4 ;# sse - ((sum*sum) >> DS)
.endm
.macro variance_8 DS loop_label store_sum
\loop_label:
;# only one of the inputs should need to be aligned.
load_aligned_16 v4, r3, r10
load_aligned_16 v5, r5, r10
;# move onto the next line
add r3, r3, r4
add r5, r5, r6
;# only one of the inputs should need to be aligned.
load_aligned_16 v6, r3, r10
load_aligned_16 v0, r5, r10
;# move onto the next line
add r3, r3, r4
add r5, r5, r6
vmrghb v4, v4, v6
vmrghb v5, v5, v0
compute_sum_sse
bdnz \loop_label
vsumsws v8, v8, v7
vsumsws v9, v9, v7
stvx v8, 0, r1
lwz r3, 12(r1)
stvx v9, 0, r1
lwz r4, 12(r1)
.if \store_sum
stw r3, 0(r8) ;# sum
.endif
stw r4, 0(r7) ;# sse
mullw r3, r3, r3 ;# sum*sum
srawi r3, r3, \DS ;# (sum*sum) >> DS
subf r3, r3, r4 ;# sse - ((sum*sum) >> DS)
.endm
.align 2
;# r3 unsigned char *src_ptr
;# r4 int source_stride
;# r5 unsigned char *ref_ptr
;# r6 int recon_stride
;# r7 unsigned int *SSE
;# r8 int *Sum
;#
;# r3 return value
vp8_get8x8var_ppc:
prologue
li r9, 4
mtctr r9
variance_8 6, get8x8var_loop, 1
epilogue
blr
.align 2
;# r3 unsigned char *src_ptr
;# r4 int source_stride
;# r5 unsigned char *ref_ptr
;# r6 int recon_stride
;# r7 unsigned int *SSE
;# r8 int *Sum
;#
;# r3 return value
vp8_get16x16var_ppc:
prologue
mtctr r10
variance_16 8, get16x16var_loop, 1
epilogue
blr
.align 2
;# r3 unsigned char *src_ptr
;# r4 int source_stride
;# r5 unsigned char *ref_ptr
;# r6 int recon_stride
;# r7 unsigned int *sse
;#
;# r3 return value
vp8_mse16x16_ppc:
prologue
mtctr r10
mse16x16_loop:
;# only one of the inputs should need to be aligned.
load_aligned_16 v4, r3, r10
load_aligned_16 v5, r5, r10
;# move onto the next line
add r3, r3, r4
add r5, r5, r6
;# Now compute sse.
vsububs v2, v4, v5
vsububs v3, v5, v4
vor v2, v2, v3
vmsumubm v9, v2, v2, v9
bdnz mse16x16_loop
vsumsws v9, v9, v7
stvx v9, 0, r1
lwz r3, 12(r1)
stw r3, 0(r7) ;# sse
epilogue
blr
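A hedged C sketch of the 16x16 mean squared error computed above (a plain sum of squared differences, with no mean removal); the name and signature are illustrative:

#include <stdint.h>

static unsigned int mse16x16_c(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride,
                               unsigned int *sse) {
  unsigned int total = 0;
  int r, c;
  for (r = 0; r < 16; ++r) {
    for (c = 0; c < 16; ++c) {
      const int diff = src[c] - ref[c];
      total += (unsigned int)(diff * diff);
    }
    src += src_stride;
    ref += ref_stride;
  }
  *sse = total;  /* also returned (r3) by the assembly */
  return total;
}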
.align 2
;# r3 unsigned char *src_ptr
;# r4 int source_stride
;# r5 unsigned char *ref_ptr
;# r6 int recon_stride
;# r7 unsigned int *sse
;#
;# r3 return value
vp9_variance16x16_ppc:
prologue
mtctr r10
variance_16 8, variance16x16_loop, 0
epilogue
blr
.align 2
;# r3 unsigned char *src_ptr
;# r4 int source_stride
;# r5 unsigned char *ref_ptr
;# r6 int recon_stride
;# r7 unsigned int *sse
;#
;# r3 return value
vp9_variance16x8_ppc:
prologue
li r9, 8
mtctr r9
variance_16 7, variance16x8_loop, 0
epilogue
blr
.align 2
;# r3 unsigned char *src_ptr
;# r4 int source_stride
;# r5 unsigned char *ref_ptr
;# r6 int recon_stride
;# r7 unsigned int *sse
;#
;# r3 return value
vp9_variance8x16_ppc:
prologue
li r9, 8
mtctr r9
variance_8 7, variance8x16_loop, 0
epilogue
blr
.align 2
;# r3 unsigned char *src_ptr
;# r4 int source_stride
;# r5 unsigned char *ref_ptr
;# r6 int recon_stride
;# r7 unsigned int *sse
;#
;# r3 return value
vp9_variance8x8_ppc:
prologue
li r9, 4
mtctr r9
variance_8 6, variance8x8_loop, 0
epilogue
blr
.macro transfer_4x4 I P
lwz r0, 0(\I)
add \I, \I, \P
lwz r10,0(\I)
add \I, \I, \P
lwz r8, 0(\I)
add \I, \I, \P
lwz r9, 0(\I)
stw r0, 0(r1)
stw r10, 4(r1)
stw r8, 8(r1)
stw r9, 12(r1)
.endm
.align 2
;# r3 unsigned char *src_ptr
;# r4 int source_stride
;# r5 unsigned char *ref_ptr
;# r6 int recon_stride
;# r7 unsigned int *sse
;#
;# r3 return value
vp9_variance4x4_ppc:
prologue
transfer_4x4 r3, r4
lvx v4, 0, r1
transfer_4x4 r5, r6
lvx v5, 0, r1
compute_sum_sse
vsumsws v8, v8, v7
vsumsws v9, v9, v7
stvx v8, 0, r1
lwz r3, 12(r1)
stvx v9, 0, r1
lwz r4, 12(r1)
stw r4, 0(r7) ;# sse
mullw r3, r3, r3 ;# sum*sum
srawi r3, r3, 4 ;# (sum*sum) >> 4
subf r3, r3, r4 ;# sse - ((sum*sum) >> 4)
epilogue
blr

View file

@ -1,865 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
.globl vp9_sub_pixel_variance4x4_ppc
.globl vp9_sub_pixel_variance8x8_ppc
.globl vp9_sub_pixel_variance8x16_ppc
.globl vp9_sub_pixel_variance16x8_ppc
.globl vp9_sub_pixel_variance16x16_ppc
.macro load_c V, LABEL, OFF, R0, R1
lis \R0, \LABEL@ha
la \R1, \LABEL@l(\R0)
lvx \V, \OFF, \R1
.endm
.macro load_vfilter V0, V1
load_c \V0, vfilter_b, r6, r12, r10
addi r6, r6, 16
lvx \V1, r6, r10
.endm
.macro HProlog jump_label
;# load up horizontal filter
slwi. r5, r5, 4 ;# index into horizontal filter array
;# index to the next set of vectors in the row.
li r10, 16
;# downshift by 7 ( divide by 128 ) at the end
vspltish v19, 7
;# If there isn't any filtering to be done for the horizontal, then
;# just skip to the second pass.
beq \jump_label
load_c v20, hfilter_b, r5, r12, r0
;# setup constants
;# v14 permutation value for alignment
load_c v28, b_hperm_b, 0, r12, r0
;# index to the next set of vectors in the row.
li r12, 32
;# rounding added in on the multiply
vspltisw v21, 8
vspltisw v18, 3
vslw v18, v21, v18 ;# 0x00000040000000400000004000000040
slwi. r6, r6, 5 ;# index into vertical filter array
.endm
;# Filters a horizontal line
;# expects:
;# r3 src_ptr
;# r4 pitch
;# r10 16
;# r12 32
;# v17 perm input
;# v18 rounding
;# v19 shift
;# v20 filter taps
;# v21 tmp
;# v22 tmp
;# v23 tmp
;# v24 tmp
;# v25 tmp
;# v26 tmp
;# v27 tmp
;# v28 perm output
;#
.macro hfilter_8 V, hp, lp, increment_counter
lvsl v17, 0, r3 ;# permutate value for alignment
;# input to filter is 9 bytes wide, output is 8 bytes.
lvx v21, 0, r3
lvx v22, r10, r3
.if \increment_counter
add r3, r3, r4
.endif
vperm v21, v21, v22, v17
vperm v24, v21, v21, \hp ;# v24 = 0123 1234 2345 3456
vperm v25, v21, v21, \lp ;# v25 = 4567 5678 6789 789A
vmsummbm v24, v20, v24, v18
vmsummbm v25, v20, v25, v18
vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
vsrh v24, v24, v19 ;# divide v24 by 128
vpkuhus \V, v24, v24 ;# \V = scrambled 8-bit result
.endm
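Arithmetically, hfilter_8 applies a 2-tap bilinear kernel along each row: nine input bytes produce eight outputs, each rounded by 64 and shifted right by 7 (the taps, taken from hfilter_b at the end of the file, always sum to 128). A minimal C sketch with illustrative names:

#include <stdint.h>

/* One 8-wide horizontal bilinear pass:
 * dst[i] = (src[i]*tap0 + src[i+1]*tap1 + 64) >> 7 */
static void hfilter8_row_c(const uint8_t *src, uint8_t *dst,
                           int tap0, int tap1) {
  int i;
  for (i = 0; i < 8; ++i)
    dst[i] = (uint8_t)((src[i] * tap0 + src[i + 1] * tap1 + 64) >> 7);
}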
.macro vfilter_16 P0 P1
vmuleub v22, \P0, v20 ;# 64 + 4 positive taps
vadduhm v22, v18, v22
vmuloub v23, \P0, v20
vadduhm v23, v18, v23
vmuleub v24, \P1, v21
vadduhm v22, v22, v24 ;# Re = evens, saturation unnecessary
vmuloub v25, \P1, v21
vadduhm v23, v23, v25 ;# Ro = odds
vsrh v22, v22, v19 ;# divide by 128
vsrh v23, v23, v19 ;# v22 v23 = evens, odds
vmrghh \P0, v22, v23 ;# merge evens and odds back into order
vmrglh v23, v22, v23
vpkuhus \P0, \P0, v23 ;# P0 = 8-bit result
.endm
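vfilter_16 is the matching second pass: the same 2-tap blend, but taken down a column between two successive rows, again with the +64 rounding and >> 7. A hedged C sketch, names illustrative:

#include <stdint.h>

/* One 16-wide vertical bilinear pass between row 'above' and row 'below'. */
static void vfilter16_row_c(const uint8_t *above, const uint8_t *below,
                            uint8_t *dst, int tap0, int tap1) {
  int i;
  for (i = 0; i < 16; ++i)
    dst[i] = (uint8_t)((above[i] * tap0 + below[i] * tap1 + 64) >> 7);
}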
.macro compute_sum_sse src, ref, sum, sse, t1, t2, z0
;# Compute sum first. Unpack so that a signed subtract
;# can be used. Only a halfword signed subtract is
;# available. Do high, then low.
vmrghb \t1, \z0, \src
vmrghb \t2, \z0, \ref
vsubshs \t1, \t1, \t2
vsum4shs \sum, \t1, \sum
vmrglb \t1, \z0, \src
vmrglb \t2, \z0, \ref
vsubshs \t1, \t1, \t2
vsum4shs \sum, \t1, \sum
;# Now compute sse.
vsububs \t1, \src, \ref
vsububs \t2, \ref, \src
vor \t1, \t1, \t2
vmsumubm \sse, \t1, \t1, \sse
.endm
.macro variance_final sum, sse, z0, DS
vsumsws \sum, \sum, \z0
vsumsws \sse, \sse, \z0
stvx \sum, 0, r1
lwz r3, 12(r1)
stvx \sse, 0, r1
lwz r4, 12(r1)
stw r4, 0(r9) ;# sse
mullw r3, r3, r3 ;# sum*sum
srawi r3, r3, \DS ;# (sum*sum) >> DS
subf r3, r3, r4 ;# sse - ((sum*sum) >> DS)
.endm
.macro compute_sum_sse_16 V, increment_counter
load_and_align_16 v16, r7, r8, \increment_counter
compute_sum_sse \V, v16, v18, v19, v20, v21, v23
.endm
.macro load_and_align_16 V, R, P, increment_counter
lvsl v17, 0, \R ;# permutate value for alignment
;# input is 16 bytes wide and can span two
;# vectors if not aligned correctly.
lvx v21, 0, \R
lvx v22, r10, \R
.if \increment_counter
add \R, \R, \P
.endif
vperm \V, v21, v22, v17
.endm
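The lvsl/lvx/lvx/vperm idiom in load_and_align_16 is AltiVec's way of doing an unaligned 16-byte load. In portable C it collapses to a plain copy; this sketch exists only to make that equivalence explicit:

#include <stdint.h>
#include <string.h>

static void load_unaligned_16(const uint8_t *src, uint8_t out[16]) {
  memcpy(out, src, 16);  /* conceptually: two aligned loads plus a permute */
}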
.align 2
;# r3 unsigned char *src_ptr
;# r4 int src_pixels_per_line
;# r5 int xoffset
;# r6 int yoffset
;# r7 unsigned char *dst_ptr
;# r8 int dst_pixels_per_line
;# r9 unsigned int *sse
;#
;# r3 return value
vp9_sub_pixel_variance4x4_ppc:
mfspr r11, 256 ;# get old VRSAVE
oris r12, r11, 0xf830
ori r12, r12, 0xfff8
mtspr 256, r12 ;# set VRSAVE
stwu r1,-32(r1) ;# create space on the stack
HProlog second_pass_4x4_pre_copy_b
;# Load up permutation constants
load_c v10, b_0123_b, 0, r12, r0
load_c v11, b_4567_b, 0, r12, r0
hfilter_8 v0, v10, v11, 1
hfilter_8 v1, v10, v11, 1
hfilter_8 v2, v10, v11, 1
hfilter_8 v3, v10, v11, 1
;# Finished filtering main horizontal block. If there is no
;# vertical filtering, jump to computing the sum and sse. Otherwise
;# load up and filter the additional line that is needed
;# for the vertical filter.
beq compute_sum_sse_4x4_b
hfilter_8 v4, v10, v11, 0
b second_pass_4x4_b
second_pass_4x4_pre_copy_b:
slwi r6, r6, 5 ;# index into vertical filter array
load_and_align_16 v0, r3, r4, 1
load_and_align_16 v1, r3, r4, 1
load_and_align_16 v2, r3, r4, 1
load_and_align_16 v3, r3, r4, 1
load_and_align_16 v4, r3, r4, 0
second_pass_4x4_b:
vspltish v20, 8
vspltish v18, 3
vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
load_vfilter v20, v21
vfilter_16 v0, v1
vfilter_16 v1, v2
vfilter_16 v2, v3
vfilter_16 v3, v4
compute_sum_sse_4x4_b:
vspltish v18, 0 ;# sum
vspltish v19, 0 ;# sse
vspltish v23, 0 ;# unpack
li r10, 16
load_and_align_16 v4, r7, r8, 1
load_and_align_16 v5, r7, r8, 1
load_and_align_16 v6, r7, r8, 1
load_and_align_16 v7, r7, r8, 1
vmrghb v0, v0, v1
vmrghb v1, v2, v3
vmrghb v2, v4, v5
vmrghb v3, v6, v7
load_c v10, b_hilo_b, 0, r12, r0
vperm v0, v0, v1, v10
vperm v1, v2, v3, v10
compute_sum_sse v0, v1, v18, v19, v20, v21, v23
variance_final v18, v19, v23, 4
addi r1, r1, 32 ;# recover stack
mtspr 256, r11 ;# reset old VRSAVE
blr
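Putting the pieces together, the 4x4 routine above runs an optional horizontal bilinear pass into a temporary block, an optional vertical pass, and then the usual sum/sse reduction against the reference. The C sketch below mirrors only that arithmetic; the function name, temporary layout and tap derivation (128 - 16*offset and 16*offset, matching the tables at the end of the file) are illustrative assumptions, not the library's implementation:

#include <stdint.h>

static unsigned int subpel_variance4x4_sketch(
    const uint8_t *src, int src_stride, int xoffset, int yoffset,
    const uint8_t *ref, int ref_stride, unsigned int *sse) {
  uint8_t tmp[5 * 4];               /* one extra row for the vertical pass */
  const int hx0 = 128 - 16 * xoffset, hx1 = 16 * xoffset;
  const int vy0 = 128 - 16 * yoffset, vy1 = 16 * yoffset;
  const int rows = yoffset ? 5 : 4;
  int r, c, sum = 0;
  unsigned int ssq = 0;

  /* First pass: horizontal filter (or straight copy when xoffset == 0). */
  for (r = 0; r < rows; ++r)
    for (c = 0; c < 4; ++c)
      tmp[r * 4 + c] = xoffset
          ? (uint8_t)((src[r * src_stride + c] * hx0 +
                       src[r * src_stride + c + 1] * hx1 + 64) >> 7)
          : src[r * src_stride + c];

  /* Second pass: vertical filter, then accumulate sum and sse vs. ref. */
  for (r = 0; r < 4; ++r)
    for (c = 0; c < 4; ++c) {
      const int p = yoffset
          ? (tmp[r * 4 + c] * vy0 + tmp[(r + 1) * 4 + c] * vy1 + 64) >> 7
          : tmp[r * 4 + c];
      const int diff = p - ref[r * ref_stride + c];
      sum += diff;
      ssq += (unsigned int)(diff * diff);
    }

  *sse = ssq;
  return ssq - (unsigned int)((sum * sum) >> 4);  /* 4x4 block: shift by 4 */
}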
.align 2
;# r3 unsigned char *src_ptr
;# r4 int src_pixels_per_line
;# r5 int xoffset
;# r6 int yoffset
;# r7 unsigned char *dst_ptr
;# r8 int dst_pixels_per_line
;# r9 unsigned int *sse
;#
;# r3 return value
vp9_sub_pixel_variance8x8_ppc:
mfspr r11, 256 ;# get old VRSAVE
oris r12, r11, 0xfff0
ori r12, r12, 0xffff
mtspr 256, r12 ;# set VRSAVE
stwu r1,-32(r1) ;# create space on the stack
HProlog second_pass_8x8_pre_copy_b
;# Load up permutation constants
load_c v10, b_0123_b, 0, r12, r0
load_c v11, b_4567_b, 0, r12, r0
hfilter_8 v0, v10, v11, 1
hfilter_8 v1, v10, v11, 1
hfilter_8 v2, v10, v11, 1
hfilter_8 v3, v10, v11, 1
hfilter_8 v4, v10, v11, 1
hfilter_8 v5, v10, v11, 1
hfilter_8 v6, v10, v11, 1
hfilter_8 v7, v10, v11, 1
;# Finished filtering main horizontal block. If there is no
;# vertical filtering, jump to computing the sum and sse. Otherwise
;# load up and filter the additional line that is needed
;# for the vertical filter.
beq compute_sum_sse_8x8_b
hfilter_8 v8, v10, v11, 0
b second_pass_8x8_b
second_pass_8x8_pre_copy_b:
slwi. r6, r6, 5 ;# index into vertical filter array
load_and_align_16 v0, r3, r4, 1
load_and_align_16 v1, r3, r4, 1
load_and_align_16 v2, r3, r4, 1
load_and_align_16 v3, r3, r4, 1
load_and_align_16 v4, r3, r4, 1
load_and_align_16 v5, r3, r4, 1
load_and_align_16 v6, r3, r4, 1
load_and_align_16 v7, r3, r4, 1
load_and_align_16 v8, r3, r4, 0
beq compute_sum_sse_8x8_b
second_pass_8x8_b:
vspltish v20, 8
vspltish v18, 3
vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
load_vfilter v20, v21
vfilter_16 v0, v1
vfilter_16 v1, v2
vfilter_16 v2, v3
vfilter_16 v3, v4
vfilter_16 v4, v5
vfilter_16 v5, v6
vfilter_16 v6, v7
vfilter_16 v7, v8
compute_sum_sse_8x8_b:
vspltish v18, 0 ;# sum
vspltish v19, 0 ;# sse
vspltish v23, 0 ;# unpack
li r10, 16
vmrghb v0, v0, v1
vmrghb v1, v2, v3
vmrghb v2, v4, v5
vmrghb v3, v6, v7
load_and_align_16 v4, r7, r8, 1
load_and_align_16 v5, r7, r8, 1
load_and_align_16 v6, r7, r8, 1
load_and_align_16 v7, r7, r8, 1
load_and_align_16 v8, r7, r8, 1
load_and_align_16 v9, r7, r8, 1
load_and_align_16 v10, r7, r8, 1
load_and_align_16 v11, r7, r8, 0
vmrghb v4, v4, v5
vmrghb v5, v6, v7
vmrghb v6, v8, v9
vmrghb v7, v10, v11
compute_sum_sse v0, v4, v18, v19, v20, v21, v23
compute_sum_sse v1, v5, v18, v19, v20, v21, v23
compute_sum_sse v2, v6, v18, v19, v20, v21, v23
compute_sum_sse v3, v7, v18, v19, v20, v21, v23
variance_final v18, v19, v23, 6
addi r1, r1, 32 ;# recover stack
mtspr 256, r11 ;# reset old VRSAVE
blr
.align 2
;# r3 unsigned char *src_ptr
;# r4 int src_pixels_per_line
;# r5 int xoffset
;# r6 int yoffset
;# r7 unsigned char *dst_ptr
;# r8 int dst_pixels_per_line
;# r9 unsigned int *sse
;#
;# r3 return value
vp9_sub_pixel_variance8x16_ppc:
mfspr r11, 256 ;# get old VRSAVE
oris r12, r11, 0xffff
ori r12, r12, 0xfffc
mtspr 256, r12 ;# set VRSAVE
stwu r1,-32(r1) ;# create space on the stack
HProlog second_pass_8x16_pre_copy_b
;# Load up permutation constants
load_c v29, b_0123_b, 0, r12, r0
load_c v30, b_4567_b, 0, r12, r0
hfilter_8 v0, v29, v30, 1
hfilter_8 v1, v29, v30, 1
hfilter_8 v2, v29, v30, 1
hfilter_8 v3, v29, v30, 1
hfilter_8 v4, v29, v30, 1
hfilter_8 v5, v29, v30, 1
hfilter_8 v6, v29, v30, 1
hfilter_8 v7, v29, v30, 1
hfilter_8 v8, v29, v30, 1
hfilter_8 v9, v29, v30, 1
hfilter_8 v10, v29, v30, 1
hfilter_8 v11, v29, v30, 1
hfilter_8 v12, v29, v30, 1
hfilter_8 v13, v29, v30, 1
hfilter_8 v14, v29, v30, 1
hfilter_8 v15, v29, v30, 1
;# Finished filtering main horizontal block. If there is no
;# vertical filtering, jump to computing the sum and sse. Otherwise
;# load up and filter the additional line that is needed
;# for the vertical filter.
beq compute_sum_sse_8x16_b
hfilter_8 v16, v29, v30, 0
b second_pass_8x16_b
second_pass_8x16_pre_copy_b:
slwi. r6, r6, 5 ;# index into vertical filter array
load_and_align_16 v0, r3, r4, 1
load_and_align_16 v1, r3, r4, 1
load_and_align_16 v2, r3, r4, 1
load_and_align_16 v3, r3, r4, 1
load_and_align_16 v4, r3, r4, 1
load_and_align_16 v5, r3, r4, 1
load_and_align_16 v6, r3, r4, 1
load_and_align_16 v7, r3, r4, 1
load_and_align_16 v8, r3, r4, 1
load_and_align_16 v9, r3, r4, 1
load_and_align_16 v10, r3, r4, 1
load_and_align_16 v11, r3, r4, 1
load_and_align_16 v12, r3, r4, 1
load_and_align_16 v13, r3, r4, 1
load_and_align_16 v14, r3, r4, 1
load_and_align_16 v15, r3, r4, 1
load_and_align_16 v16, r3, r4, 0
beq compute_sum_sse_8x16_b
second_pass_8x16_b:
vspltish v20, 8
vspltish v18, 3
vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
load_vfilter v20, v21
vfilter_16 v0, v1
vfilter_16 v1, v2
vfilter_16 v2, v3
vfilter_16 v3, v4
vfilter_16 v4, v5
vfilter_16 v5, v6
vfilter_16 v6, v7
vfilter_16 v7, v8
vfilter_16 v8, v9
vfilter_16 v9, v10
vfilter_16 v10, v11
vfilter_16 v11, v12
vfilter_16 v12, v13
vfilter_16 v13, v14
vfilter_16 v14, v15
vfilter_16 v15, v16
compute_sum_sse_8x16_b:
vspltish v18, 0 ;# sum
vspltish v19, 0 ;# sse
vspltish v23, 0 ;# unpack
li r10, 16
vmrghb v0, v0, v1
vmrghb v1, v2, v3
vmrghb v2, v4, v5
vmrghb v3, v6, v7
vmrghb v4, v8, v9
vmrghb v5, v10, v11
vmrghb v6, v12, v13
vmrghb v7, v14, v15
load_and_align_16 v8, r7, r8, 1
load_and_align_16 v9, r7, r8, 1
load_and_align_16 v10, r7, r8, 1
load_and_align_16 v11, r7, r8, 1
load_and_align_16 v12, r7, r8, 1
load_and_align_16 v13, r7, r8, 1
load_and_align_16 v14, r7, r8, 1
load_and_align_16 v15, r7, r8, 1
vmrghb v8, v8, v9
vmrghb v9, v10, v11
vmrghb v10, v12, v13
vmrghb v11, v14, v15
compute_sum_sse v0, v8, v18, v19, v20, v21, v23
compute_sum_sse v1, v9, v18, v19, v20, v21, v23
compute_sum_sse v2, v10, v18, v19, v20, v21, v23
compute_sum_sse v3, v11, v18, v19, v20, v21, v23
load_and_align_16 v8, r7, r8, 1
load_and_align_16 v9, r7, r8, 1
load_and_align_16 v10, r7, r8, 1
load_and_align_16 v11, r7, r8, 1
load_and_align_16 v12, r7, r8, 1
load_and_align_16 v13, r7, r8, 1
load_and_align_16 v14, r7, r8, 1
load_and_align_16 v15, r7, r8, 0
vmrghb v8, v8, v9
vmrghb v9, v10, v11
vmrghb v10, v12, v13
vmrghb v11, v14, v15
compute_sum_sse v4, v8, v18, v19, v20, v21, v23
compute_sum_sse v5, v9, v18, v19, v20, v21, v23
compute_sum_sse v6, v10, v18, v19, v20, v21, v23
compute_sum_sse v7, v11, v18, v19, v20, v21, v23
variance_final v18, v19, v23, 7
addi r1, r1, 32 ;# recover stack
mtspr 256, r11 ;# reset old VRSAVE
blr
;# Filters a horizontal line
;# expects:
;# r3 src_ptr
;# r4 pitch
;# r10 16
;# r12 32
;# v17 perm input
;# v18 rounding
;# v19 shift
;# v20 filter taps
;# v21 tmp
;# v22 tmp
;# v23 tmp
;# v24 tmp
;# v25 tmp
;# v26 tmp
;# v27 tmp
;# v28 perm output
;#
.macro hfilter_16 V, increment_counter
lvsl v17, 0, r3 ;# permutate value for alignment
;# input to filter is 21 bytes wide, output is 16 bytes.
;# input can span three vectors if not aligned correctly.
lvx v21, 0, r3
lvx v22, r10, r3
lvx v23, r12, r3
.if \increment_counter
add r3, r3, r4
.endif
vperm v21, v21, v22, v17
vperm v22, v22, v23, v17 ;# v21 v22 = 21 input pixels left-justified
;# set 0
vmsummbm v24, v20, v21, v18 ;# taps times elements
;# set 1
vsldoi v23, v21, v22, 1
vmsummbm v25, v20, v23, v18
;# set 2
vsldoi v23, v21, v22, 2
vmsummbm v26, v20, v23, v18
;# set 3
vsldoi v23, v21, v22, 3
vmsummbm v27, v20, v23, v18
vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
vpkswus v25, v26, v27 ;# v25 = 2 6 A E 3 7 B F
vsrh v24, v24, v19 ;# divide v24, v25 by 128
vsrh v25, v25, v19
vpkuhus \V, v24, v25 ;# \V = scrambled 8-bit result
vperm \V, \V, v0, v28 ;# \V = correctly-ordered result
.endm
.align 2
;# r3 unsigned char *src_ptr
;# r4 int src_pixels_per_line
;# r5 int xoffset
;# r6 int yoffset
;# r7 unsigned char *dst_ptr
;# r8 int dst_pixels_per_line
;# r9 unsigned int *sse
;#
;# r3 return value
vp9_sub_pixel_variance16x8_ppc:
mfspr r11, 256 ;# get old VRSAVE
oris r12, r11, 0xffff
ori r12, r12, 0xfff8
mtspr 256, r12 ;# set VRSAVE
stwu r1, -32(r1) ;# create space on the stack
HProlog second_pass_16x8_pre_copy_b
hfilter_16 v0, 1
hfilter_16 v1, 1
hfilter_16 v2, 1
hfilter_16 v3, 1
hfilter_16 v4, 1
hfilter_16 v5, 1
hfilter_16 v6, 1
hfilter_16 v7, 1
;# Finished filtering main horizontal block. If there is no
;# vertical filtering, jump to computing the sum and sse. Otherwise
;# load up and filter the additional line that is needed
;# for the vertical filter.
beq compute_sum_sse_16x8_b
hfilter_16 v8, 0
b second_pass_16x8_b
second_pass_16x8_pre_copy_b:
slwi. r6, r6, 5 ;# index into vertical filter array
load_and_align_16 v0, r3, r4, 1
load_and_align_16 v1, r3, r4, 1
load_and_align_16 v2, r3, r4, 1
load_and_align_16 v3, r3, r4, 1
load_and_align_16 v4, r3, r4, 1
load_and_align_16 v5, r3, r4, 1
load_and_align_16 v6, r3, r4, 1
load_and_align_16 v7, r3, r4, 1
load_and_align_16 v8, r3, r4, 1
beq compute_sum_sse_16x8_b
second_pass_16x8_b:
vspltish v20, 8
vspltish v18, 3
vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
load_vfilter v20, v21
vfilter_16 v0, v1
vfilter_16 v1, v2
vfilter_16 v2, v3
vfilter_16 v3, v4
vfilter_16 v4, v5
vfilter_16 v5, v6
vfilter_16 v6, v7
vfilter_16 v7, v8
compute_sum_sse_16x8_b:
vspltish v18, 0 ;# sum
vspltish v19, 0 ;# sse
vspltish v23, 0 ;# unpack
li r10, 16
compute_sum_sse_16 v0, 1
compute_sum_sse_16 v1, 1
compute_sum_sse_16 v2, 1
compute_sum_sse_16 v3, 1
compute_sum_sse_16 v4, 1
compute_sum_sse_16 v5, 1
compute_sum_sse_16 v6, 1
compute_sum_sse_16 v7, 0
variance_final v18, v19, v23, 7
addi r1, r1, 32 ;# recover stack
mtspr 256, r11 ;# reset old VRSAVE
blr
.align 2
;# r3 unsigned char *src_ptr
;# r4 int src_pixels_per_line
;# r5 int xoffset
;# r6 int yoffset
;# r7 unsigned char *dst_ptr
;# r8 int dst_pixels_per_line
;# r9 unsigned int *sse
;#
;# r3 return value
vp9_sub_pixel_variance16x16_ppc:
mfspr r11, 256 ;# get old VRSAVE
oris r12, r11, 0xffff
ori r12, r12, 0xfff8
mtspr 256, r12 ;# set VRSAVE
stwu r1, -32(r1) ;# create space on the stack
HProlog second_pass_16x16_pre_copy_b
hfilter_16 v0, 1
hfilter_16 v1, 1
hfilter_16 v2, 1
hfilter_16 v3, 1
hfilter_16 v4, 1
hfilter_16 v5, 1
hfilter_16 v6, 1
hfilter_16 v7, 1
hfilter_16 v8, 1
hfilter_16 v9, 1
hfilter_16 v10, 1
hfilter_16 v11, 1
hfilter_16 v12, 1
hfilter_16 v13, 1
hfilter_16 v14, 1
hfilter_16 v15, 1
;# Finished filtering main horizontal block. If there is no
;# vertical filtering, jump to computing the sum and sse. Otherwise
;# load up and filter the additional line that is needed
;# for the vertical filter.
beq compute_sum_sse_16x16_b
hfilter_16 v16, 0
b second_pass_16x16_b
second_pass_16x16_pre_copy_b:
slwi. r6, r6, 5 ;# index into vertical filter array
load_and_align_16 v0, r3, r4, 1
load_and_align_16 v1, r3, r4, 1
load_and_align_16 v2, r3, r4, 1
load_and_align_16 v3, r3, r4, 1
load_and_align_16 v4, r3, r4, 1
load_and_align_16 v5, r3, r4, 1
load_and_align_16 v6, r3, r4, 1
load_and_align_16 v7, r3, r4, 1
load_and_align_16 v8, r3, r4, 1
load_and_align_16 v9, r3, r4, 1
load_and_align_16 v10, r3, r4, 1
load_and_align_16 v11, r3, r4, 1
load_and_align_16 v12, r3, r4, 1
load_and_align_16 v13, r3, r4, 1
load_and_align_16 v14, r3, r4, 1
load_and_align_16 v15, r3, r4, 1
load_and_align_16 v16, r3, r4, 0
beq compute_sum_sse_16x16_b
second_pass_16x16_b:
vspltish v20, 8
vspltish v18, 3
vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
load_vfilter v20, v21
vfilter_16 v0, v1
vfilter_16 v1, v2
vfilter_16 v2, v3
vfilter_16 v3, v4
vfilter_16 v4, v5
vfilter_16 v5, v6
vfilter_16 v6, v7
vfilter_16 v7, v8
vfilter_16 v8, v9
vfilter_16 v9, v10
vfilter_16 v10, v11
vfilter_16 v11, v12
vfilter_16 v12, v13
vfilter_16 v13, v14
vfilter_16 v14, v15
vfilter_16 v15, v16
compute_sum_sse_16x16_b:
vspltish v18, 0 ;# sum
vspltish v19, 0 ;# sse
vspltish v23, 0 ;# unpack
li r10, 16
compute_sum_sse_16 v0, 1
compute_sum_sse_16 v1, 1
compute_sum_sse_16 v2, 1
compute_sum_sse_16 v3, 1
compute_sum_sse_16 v4, 1
compute_sum_sse_16 v5, 1
compute_sum_sse_16 v6, 1
compute_sum_sse_16 v7, 1
compute_sum_sse_16 v8, 1
compute_sum_sse_16 v9, 1
compute_sum_sse_16 v10, 1
compute_sum_sse_16 v11, 1
compute_sum_sse_16 v12, 1
compute_sum_sse_16 v13, 1
compute_sum_sse_16 v14, 1
compute_sum_sse_16 v15, 0
variance_final v18, v19, v23, 8
addi r1, r1, 32 ;# recover stack
mtspr 256, r11 ;# reset old VRSAVE
blr
.data
.align 4
hfilter_b:
.byte 128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0
.byte 112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0
.byte 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0
.byte 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0
.byte 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0
.byte 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0
.byte 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0
.byte 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0
.align 4
vfilter_b:
.byte 128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128
.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
.byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
.byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
.byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
.byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
.byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
.byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
.byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
.byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
.byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
.byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
.byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
.byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
.byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
.byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
.align 4
b_hperm_b:
.byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
.align 4
b_0123_b:
.byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
.align 4
b_4567_b:
.byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
b_hilo_b:
.byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
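For orientation, the hfilter_b and vfilter_b tables above hold one 2-tap kernel per eighth-pel offset o = 0..7, namely {128 - 16*o, 16*o}; the taps always sum to 128, which is why every filter pass ends with a shift right by 7. A tiny, self-contained C sketch that just prints that mapping:

#include <stdio.h>

int main(void) {
  int o;
  for (o = 0; o < 8; ++o)
    printf("offset %d: taps %3d %3d\n", o, 128 - 16 * o, 16 * o);
  return 0;
}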