aom/vp8/common/ppc/idctllm_altivec.asm

190 строки
4.7 KiB
NASM

;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
.globl short_idct4x4llm_ppc
.macro load_c V, LABEL, OFF, R0, R1
lis \R0, \LABEL@ha
la \R1, \LABEL@l(\R0)
lvx \V, \OFF, \R1
.endm
;# r3 short *input
;# r4 short *output
;# r5 int pitch
.align 2
short_idct4x4llm_ppc:
mfspr r11, 256 ;# get old VRSAVE
oris r12, r11, 0xfff8
mtspr 256, r12 ;# set VRSAVE
load_c v8, sinpi8sqrt2, 0, r9, r10
load_c v9, cospi8sqrt2minus1, 0, r9, r10
load_c v10, hi_hi, 0, r9, r10
load_c v11, lo_lo, 0, r9, r10
load_c v12, shift_16, 0, r9, r10
li r10, 16
lvx v0, 0, r3 ;# input ip[0], ip[ 4]
lvx v1, r10, r3 ;# input ip[8], ip[12]
;# first pass
vupkhsh v2, v0
vupkhsh v3, v1
vaddsws v6, v2, v3 ;# a1 = ip[0]+ip[8]
vsubsws v7, v2, v3 ;# b1 = ip[0]-ip[8]
vupklsh v0, v0
vmulosh v4, v0, v8
vsraw v4, v4, v12
vaddsws v4, v4, v0 ;# ip[ 4] * sin(pi/8) * sqrt(2)
vupklsh v1, v1
vmulosh v5, v1, v9
vsraw v5, v5, v12 ;# ip[12] * cos(pi/8) * sqrt(2)
vaddsws v5, v5, v1
vsubsws v4, v4, v5 ;# c1
vmulosh v3, v1, v8
vsraw v3, v3, v12
vaddsws v3, v3, v1 ;# ip[12] * sin(pi/8) * sqrt(2)
vmulosh v5, v0, v9
vsraw v5, v5, v12 ;# ip[ 4] * cos(pi/8) * sqrt(2)
vaddsws v5, v5, v0
vaddsws v3, v3, v5 ;# d1
vaddsws v0, v6, v3 ;# a1 + d1
vsubsws v3, v6, v3 ;# a1 - d1
vaddsws v1, v7, v4 ;# b1 + c1
vsubsws v2, v7, v4 ;# b1 - c1
;# transpose input
vmrghw v4, v0, v1 ;# a0 b0 a1 b1
vmrghw v5, v2, v3 ;# c0 d0 c1 d1
vmrglw v6, v0, v1 ;# a2 b2 a3 b3
vmrglw v7, v2, v3 ;# c2 d2 c3 d3
vperm v0, v4, v5, v10 ;# a0 b0 c0 d0
vperm v1, v4, v5, v11 ;# a1 b1 c1 d1
vperm v2, v6, v7, v10 ;# a2 b2 c2 d2
vperm v3, v6, v7, v11 ;# a3 b3 c3 d3
;# second pass
vaddsws v6, v0, v2 ;# a1 = ip[0]+ip[8]
vsubsws v7, v0, v2 ;# b1 = ip[0]-ip[8]
vmulosh v4, v1, v8
vsraw v4, v4, v12
vaddsws v4, v4, v1 ;# ip[ 4] * sin(pi/8) * sqrt(2)
vmulosh v5, v3, v9
vsraw v5, v5, v12 ;# ip[12] * cos(pi/8) * sqrt(2)
vaddsws v5, v5, v3
vsubsws v4, v4, v5 ;# c1
vmulosh v2, v3, v8
vsraw v2, v2, v12
vaddsws v2, v2, v3 ;# ip[12] * sin(pi/8) * sqrt(2)
vmulosh v5, v1, v9
vsraw v5, v5, v12 ;# ip[ 4] * cos(pi/8) * sqrt(2)
vaddsws v5, v5, v1
vaddsws v3, v2, v5 ;# d1
vaddsws v0, v6, v3 ;# a1 + d1
vsubsws v3, v6, v3 ;# a1 - d1
vaddsws v1, v7, v4 ;# b1 + c1
vsubsws v2, v7, v4 ;# b1 - c1
vspltish v6, 4
vspltish v7, 3
vpkswss v0, v0, v1
vpkswss v1, v2, v3
vaddshs v0, v0, v6
vaddshs v1, v1, v6
vsrah v0, v0, v7
vsrah v1, v1, v7
;# transpose output
vmrghh v2, v0, v1 ;# a0 c0 a1 c1 a2 c2 a3 c3
vmrglh v3, v0, v1 ;# b0 d0 b1 d1 b2 d2 b3 d3
vmrghh v0, v2, v3 ;# a0 b0 c0 d0 a1 b1 c1 d1
vmrglh v1, v2, v3 ;# a2 b2 c2 d2 a3 b3 c3 d3
stwu r1,-416(r1) ;# create space on the stack
stvx v0, 0, r1
lwz r6, 0(r1)
stw r6, 0(r4)
lwz r6, 4(r1)
stw r6, 4(r4)
add r4, r4, r5
lwz r6, 8(r1)
stw r6, 0(r4)
lwz r6, 12(r1)
stw r6, 4(r4)
add r4, r4, r5
stvx v1, 0, r1
lwz r6, 0(r1)
stw r6, 0(r4)
lwz r6, 4(r1)
stw r6, 4(r4)
add r4, r4, r5
lwz r6, 8(r1)
stw r6, 0(r4)
lwz r6, 12(r1)
stw r6, 4(r4)
addi r1, r1, 416 ;# recover stack
mtspr 256, r11 ;# reset old VRSAVE
blr
.align 4
sinpi8sqrt2:
.short 35468, 35468, 35468, 35468, 35468, 35468, 35468, 35468
.align 4
cospi8sqrt2minus1:
.short 20091, 20091, 20091, 20091, 20091, 20091, 20091, 20091
.align 4
shift_16:
.long 16, 16, 16, 16
.align 4
hi_hi:
.byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
.align 4
lo_lo:
.byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31